diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,276733 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 3952, + "global_step": 39517, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 2.530556469367614e-05, + "grad_norm": 179.49444580078125, + "learning_rate": 2.5252525252525256e-08, + "loss": 1.5323, + "step": 1 + }, + { + "epoch": 5.061112938735228e-05, + "grad_norm": 154.5267791748047, + "learning_rate": 5.050505050505051e-08, + "loss": 1.768, + "step": 2 + }, + { + "epoch": 7.591669408102842e-05, + "grad_norm": 358.39703369140625, + "learning_rate": 7.575757575757576e-08, + "loss": 1.8825, + "step": 3 + }, + { + "epoch": 0.00010122225877470456, + "grad_norm": 170.96112060546875, + "learning_rate": 1.0101010101010103e-07, + "loss": 1.3279, + "step": 4 + }, + { + "epoch": 0.0001265278234683807, + "grad_norm": 264.0881042480469, + "learning_rate": 1.2626262626262626e-07, + "loss": 1.6274, + "step": 5 + }, + { + "epoch": 0.00015183338816205684, + "grad_norm": 92.10224151611328, + "learning_rate": 1.5151515151515152e-07, + "loss": 0.9215, + "step": 6 + }, + { + "epoch": 0.00017713895285573298, + "grad_norm": 115.16714477539062, + "learning_rate": 1.767676767676768e-07, + "loss": 1.0808, + "step": 7 + }, + { + "epoch": 0.00020244451754940912, + "grad_norm": 171.17623901367188, + "learning_rate": 2.0202020202020205e-07, + "loss": 1.2457, + "step": 8 + }, + { + "epoch": 0.00022775008224308526, + "grad_norm": 175.01385498046875, + "learning_rate": 2.2727272727272729e-07, + "loss": 1.1205, + "step": 9 + }, + { + "epoch": 0.0002530556469367614, + "grad_norm": 291.12744140625, + "learning_rate": 2.525252525252525e-07, + "loss": 1.5855, + "step": 10 + }, + { + "epoch": 0.00027836121163043754, + "grad_norm": 312.9635925292969, + "learning_rate": 2.7777777777777776e-07, + "loss": 1.8248, + "step": 11 + }, + { + "epoch": 0.0003036667763241137, + "grad_norm": 136.85816955566406, + "learning_rate": 3.0303030303030305e-07, + "loss": 0.9453, + "step": 12 + }, + { + "epoch": 0.0003289723410177898, + "grad_norm": 279.9758605957031, + "learning_rate": 3.2828282828282834e-07, + "loss": 1.1653, + "step": 13 + }, + { + "epoch": 0.00035427790571146596, + "grad_norm": 117.47865295410156, + "learning_rate": 3.535353535353536e-07, + "loss": 0.9945, + "step": 14 + }, + { + "epoch": 0.0003795834704051421, + "grad_norm": 199.69854736328125, + "learning_rate": 3.787878787878788e-07, + "loss": 1.7868, + "step": 15 + }, + { + "epoch": 0.00040488903509881824, + "grad_norm": 156.56851196289062, + "learning_rate": 4.040404040404041e-07, + "loss": 1.3944, + "step": 16 + }, + { + "epoch": 0.0004301945997924944, + "grad_norm": 259.8590393066406, + "learning_rate": 4.2929292929292934e-07, + "loss": 0.9742, + "step": 17 + }, + { + "epoch": 0.0004555001644861705, + "grad_norm": 88.6673812866211, + "learning_rate": 4.5454545454545457e-07, + "loss": 1.1201, + "step": 18 + }, + { + "epoch": 0.00048080572917984666, + "grad_norm": 198.1321258544922, + "learning_rate": 4.797979797979798e-07, + "loss": 1.0763, + "step": 19 + }, + { + "epoch": 0.0005061112938735228, + "grad_norm": 191.3653564453125, + "learning_rate": 5.05050505050505e-07, + "loss": 0.8722, + "step": 20 + }, + { + "epoch": 0.0005314168585671989, + "grad_norm": 265.2419128417969, + "learning_rate": 5.303030303030304e-07, + "loss": 1.4165, + "step": 21 + }, + { + "epoch": 0.0005567224232608751, + "grad_norm": 108.7779312133789, + "learning_rate": 5.555555555555555e-07, + "loss": 0.7764, + "step": 22 + }, + { + "epoch": 0.0005820279879545512, + "grad_norm": 130.71176147460938, + "learning_rate": 5.808080808080809e-07, + "loss": 1.1229, + "step": 23 + }, + { + "epoch": 0.0006073335526482274, + "grad_norm": 235.51052856445312, + "learning_rate": 6.060606060606061e-07, + "loss": 1.7838, + "step": 24 + }, + { + "epoch": 0.0006326391173419035, + "grad_norm": 110.71112823486328, + "learning_rate": 6.313131313131314e-07, + "loss": 0.8939, + "step": 25 + }, + { + "epoch": 0.0006579446820355796, + "grad_norm": 208.12510681152344, + "learning_rate": 6.565656565656567e-07, + "loss": 0.9993, + "step": 26 + }, + { + "epoch": 0.0006832502467292558, + "grad_norm": 129.99411010742188, + "learning_rate": 6.818181818181818e-07, + "loss": 0.7653, + "step": 27 + }, + { + "epoch": 0.0007085558114229319, + "grad_norm": 133.2783660888672, + "learning_rate": 7.070707070707071e-07, + "loss": 0.7045, + "step": 28 + }, + { + "epoch": 0.0007338613761166081, + "grad_norm": 62.0495719909668, + "learning_rate": 7.323232323232324e-07, + "loss": 1.2186, + "step": 29 + }, + { + "epoch": 0.0007591669408102842, + "grad_norm": 128.35855102539062, + "learning_rate": 7.575757575757576e-07, + "loss": 0.7665, + "step": 30 + }, + { + "epoch": 0.0007844725055039603, + "grad_norm": 53.19578552246094, + "learning_rate": 7.82828282828283e-07, + "loss": 0.5665, + "step": 31 + }, + { + "epoch": 0.0008097780701976365, + "grad_norm": 58.82590103149414, + "learning_rate": 8.080808080808082e-07, + "loss": 0.4742, + "step": 32 + }, + { + "epoch": 0.0008350836348913126, + "grad_norm": 47.3167724609375, + "learning_rate": 8.333333333333333e-07, + "loss": 0.5627, + "step": 33 + }, + { + "epoch": 0.0008603891995849888, + "grad_norm": 36.287513732910156, + "learning_rate": 8.585858585858587e-07, + "loss": 0.504, + "step": 34 + }, + { + "epoch": 0.0008856947642786649, + "grad_norm": 35.821815490722656, + "learning_rate": 8.838383838383839e-07, + "loss": 0.5181, + "step": 35 + }, + { + "epoch": 0.000911000328972341, + "grad_norm": 35.51008987426758, + "learning_rate": 9.090909090909091e-07, + "loss": 0.5855, + "step": 36 + }, + { + "epoch": 0.0009363058936660172, + "grad_norm": 33.47294616699219, + "learning_rate": 9.343434343434345e-07, + "loss": 0.5269, + "step": 37 + }, + { + "epoch": 0.0009616114583596933, + "grad_norm": 22.786113739013672, + "learning_rate": 9.595959595959596e-07, + "loss": 0.5473, + "step": 38 + }, + { + "epoch": 0.0009869170230533695, + "grad_norm": 34.1691780090332, + "learning_rate": 9.84848484848485e-07, + "loss": 0.4724, + "step": 39 + }, + { + "epoch": 0.0010122225877470456, + "grad_norm": 37.78269958496094, + "learning_rate": 1.01010101010101e-06, + "loss": 0.4402, + "step": 40 + }, + { + "epoch": 0.0010375281524407217, + "grad_norm": 32.63754653930664, + "learning_rate": 1.0353535353535354e-06, + "loss": 0.449, + "step": 41 + }, + { + "epoch": 0.0010628337171343979, + "grad_norm": 32.41181564331055, + "learning_rate": 1.0606060606060608e-06, + "loss": 0.4246, + "step": 42 + }, + { + "epoch": 0.001088139281828074, + "grad_norm": 39.743492126464844, + "learning_rate": 1.085858585858586e-06, + "loss": 0.484, + "step": 43 + }, + { + "epoch": 0.0011134448465217502, + "grad_norm": 30.635425567626953, + "learning_rate": 1.111111111111111e-06, + "loss": 0.4516, + "step": 44 + }, + { + "epoch": 0.0011387504112154263, + "grad_norm": 66.0496597290039, + "learning_rate": 1.1363636363636364e-06, + "loss": 0.5497, + "step": 45 + }, + { + "epoch": 0.0011640559759091024, + "grad_norm": 24.928890228271484, + "learning_rate": 1.1616161616161617e-06, + "loss": 0.4687, + "step": 46 + }, + { + "epoch": 0.0011893615406027786, + "grad_norm": 52.15830612182617, + "learning_rate": 1.186868686868687e-06, + "loss": 0.5852, + "step": 47 + }, + { + "epoch": 0.0012146671052964547, + "grad_norm": 27.920639038085938, + "learning_rate": 1.2121212121212122e-06, + "loss": 0.5029, + "step": 48 + }, + { + "epoch": 0.0012399726699901309, + "grad_norm": 27.135238647460938, + "learning_rate": 1.2373737373737375e-06, + "loss": 0.3113, + "step": 49 + }, + { + "epoch": 0.001265278234683807, + "grad_norm": 32.15848159790039, + "learning_rate": 1.2626262626262629e-06, + "loss": 0.4729, + "step": 50 + }, + { + "epoch": 0.0012905837993774831, + "grad_norm": 47.45156478881836, + "learning_rate": 1.287878787878788e-06, + "loss": 0.4272, + "step": 51 + }, + { + "epoch": 0.0013158893640711593, + "grad_norm": 44.82997512817383, + "learning_rate": 1.3131313131313134e-06, + "loss": 0.3945, + "step": 52 + }, + { + "epoch": 0.0013411949287648354, + "grad_norm": 42.4538459777832, + "learning_rate": 1.3383838383838385e-06, + "loss": 0.5141, + "step": 53 + }, + { + "epoch": 0.0013665004934585116, + "grad_norm": 23.92703628540039, + "learning_rate": 1.3636363636363636e-06, + "loss": 0.4549, + "step": 54 + }, + { + "epoch": 0.0013918060581521877, + "grad_norm": 28.87810516357422, + "learning_rate": 1.3888888888888892e-06, + "loss": 0.3873, + "step": 55 + }, + { + "epoch": 0.0014171116228458638, + "grad_norm": 30.38422393798828, + "learning_rate": 1.4141414141414143e-06, + "loss": 0.5042, + "step": 56 + }, + { + "epoch": 0.00144241718753954, + "grad_norm": 29.833969116210938, + "learning_rate": 1.4393939393939396e-06, + "loss": 0.3689, + "step": 57 + }, + { + "epoch": 0.0014677227522332161, + "grad_norm": 70.57917785644531, + "learning_rate": 1.4646464646464648e-06, + "loss": 0.4796, + "step": 58 + }, + { + "epoch": 0.0014930283169268923, + "grad_norm": 30.36263084411621, + "learning_rate": 1.48989898989899e-06, + "loss": 0.477, + "step": 59 + }, + { + "epoch": 0.0015183338816205684, + "grad_norm": 31.261978149414062, + "learning_rate": 1.5151515151515152e-06, + "loss": 0.3799, + "step": 60 + }, + { + "epoch": 0.0015436394463142445, + "grad_norm": 44.55524444580078, + "learning_rate": 1.5404040404040404e-06, + "loss": 0.4622, + "step": 61 + }, + { + "epoch": 0.0015689450110079207, + "grad_norm": 30.118009567260742, + "learning_rate": 1.565656565656566e-06, + "loss": 0.3692, + "step": 62 + }, + { + "epoch": 0.0015942505757015968, + "grad_norm": 17.373249053955078, + "learning_rate": 1.590909090909091e-06, + "loss": 0.3714, + "step": 63 + }, + { + "epoch": 0.001619556140395273, + "grad_norm": 26.644670486450195, + "learning_rate": 1.6161616161616164e-06, + "loss": 0.4099, + "step": 64 + }, + { + "epoch": 0.001644861705088949, + "grad_norm": 16.556800842285156, + "learning_rate": 1.6414141414141415e-06, + "loss": 0.3747, + "step": 65 + }, + { + "epoch": 0.0016701672697826252, + "grad_norm": 31.08197021484375, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.2961, + "step": 66 + }, + { + "epoch": 0.0016954728344763014, + "grad_norm": 28.699644088745117, + "learning_rate": 1.6919191919191922e-06, + "loss": 0.3631, + "step": 67 + }, + { + "epoch": 0.0017207783991699775, + "grad_norm": 21.053564071655273, + "learning_rate": 1.7171717171717173e-06, + "loss": 0.3325, + "step": 68 + }, + { + "epoch": 0.0017460839638636537, + "grad_norm": 51.05125045776367, + "learning_rate": 1.7424242424242427e-06, + "loss": 0.4615, + "step": 69 + }, + { + "epoch": 0.0017713895285573298, + "grad_norm": 53.44393539428711, + "learning_rate": 1.7676767676767678e-06, + "loss": 0.6496, + "step": 70 + }, + { + "epoch": 0.001796695093251006, + "grad_norm": 36.8405876159668, + "learning_rate": 1.792929292929293e-06, + "loss": 0.4555, + "step": 71 + }, + { + "epoch": 0.001822000657944682, + "grad_norm": 30.761402130126953, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.3208, + "step": 72 + }, + { + "epoch": 0.0018473062226383582, + "grad_norm": 40.0262336730957, + "learning_rate": 1.8434343434343434e-06, + "loss": 0.4512, + "step": 73 + }, + { + "epoch": 0.0018726117873320344, + "grad_norm": 26.53936004638672, + "learning_rate": 1.868686868686869e-06, + "loss": 0.3178, + "step": 74 + }, + { + "epoch": 0.0018979173520257105, + "grad_norm": 42.11692428588867, + "learning_rate": 1.8939393939393941e-06, + "loss": 0.4489, + "step": 75 + }, + { + "epoch": 0.0019232229167193866, + "grad_norm": 41.153499603271484, + "learning_rate": 1.9191919191919192e-06, + "loss": 0.4088, + "step": 76 + }, + { + "epoch": 0.0019485284814130628, + "grad_norm": 21.870851516723633, + "learning_rate": 1.944444444444445e-06, + "loss": 0.318, + "step": 77 + }, + { + "epoch": 0.001973834046106739, + "grad_norm": 35.21098327636719, + "learning_rate": 1.96969696969697e-06, + "loss": 0.4595, + "step": 78 + }, + { + "epoch": 0.001999139610800415, + "grad_norm": 23.711137771606445, + "learning_rate": 1.994949494949495e-06, + "loss": 0.4019, + "step": 79 + }, + { + "epoch": 0.002024445175494091, + "grad_norm": 21.313568115234375, + "learning_rate": 2.02020202020202e-06, + "loss": 0.3255, + "step": 80 + }, + { + "epoch": 0.0020497507401877673, + "grad_norm": 26.293363571166992, + "learning_rate": 2.0454545454545457e-06, + "loss": 0.3492, + "step": 81 + }, + { + "epoch": 0.0020750563048814435, + "grad_norm": 13.968480110168457, + "learning_rate": 2.070707070707071e-06, + "loss": 0.3306, + "step": 82 + }, + { + "epoch": 0.0021003618695751196, + "grad_norm": 22.862659454345703, + "learning_rate": 2.095959595959596e-06, + "loss": 0.288, + "step": 83 + }, + { + "epoch": 0.0021256674342687958, + "grad_norm": 24.332727432250977, + "learning_rate": 2.1212121212121216e-06, + "loss": 0.2025, + "step": 84 + }, + { + "epoch": 0.002150972998962472, + "grad_norm": 49.715267181396484, + "learning_rate": 2.1464646464646467e-06, + "loss": 0.5275, + "step": 85 + }, + { + "epoch": 0.002176278563656148, + "grad_norm": 10.834492683410645, + "learning_rate": 2.171717171717172e-06, + "loss": 0.1976, + "step": 86 + }, + { + "epoch": 0.002201584128349824, + "grad_norm": 26.35019302368164, + "learning_rate": 2.196969696969697e-06, + "loss": 0.4303, + "step": 87 + }, + { + "epoch": 0.0022268896930435003, + "grad_norm": 20.765703201293945, + "learning_rate": 2.222222222222222e-06, + "loss": 0.2672, + "step": 88 + }, + { + "epoch": 0.0022521952577371765, + "grad_norm": 43.78834533691406, + "learning_rate": 2.2474747474747476e-06, + "loss": 0.2709, + "step": 89 + }, + { + "epoch": 0.0022775008224308526, + "grad_norm": 32.2617301940918, + "learning_rate": 2.2727272727272728e-06, + "loss": 0.295, + "step": 90 + }, + { + "epoch": 0.0023028063871245287, + "grad_norm": 38.738895416259766, + "learning_rate": 2.2979797979797983e-06, + "loss": 0.4604, + "step": 91 + }, + { + "epoch": 0.002328111951818205, + "grad_norm": 19.809682846069336, + "learning_rate": 2.3232323232323234e-06, + "loss": 0.2371, + "step": 92 + }, + { + "epoch": 0.002353417516511881, + "grad_norm": 43.77133560180664, + "learning_rate": 2.348484848484849e-06, + "loss": 0.3518, + "step": 93 + }, + { + "epoch": 0.002378723081205557, + "grad_norm": 21.454639434814453, + "learning_rate": 2.373737373737374e-06, + "loss": 0.2469, + "step": 94 + }, + { + "epoch": 0.0024040286458992333, + "grad_norm": 27.47309684753418, + "learning_rate": 2.3989898989898993e-06, + "loss": 0.3036, + "step": 95 + }, + { + "epoch": 0.0024293342105929094, + "grad_norm": 26.74431610107422, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.385, + "step": 96 + }, + { + "epoch": 0.0024546397752865856, + "grad_norm": 29.568269729614258, + "learning_rate": 2.4494949494949495e-06, + "loss": 0.2671, + "step": 97 + }, + { + "epoch": 0.0024799453399802617, + "grad_norm": 29.627544403076172, + "learning_rate": 2.474747474747475e-06, + "loss": 0.3559, + "step": 98 + }, + { + "epoch": 0.002505250904673938, + "grad_norm": 75.17283630371094, + "learning_rate": 2.5e-06, + "loss": 0.3257, + "step": 99 + }, + { + "epoch": 0.002530556469367614, + "grad_norm": 31.252037048339844, + "learning_rate": 2.5252525252525258e-06, + "loss": 0.3051, + "step": 100 + }, + { + "epoch": 0.00255586203406129, + "grad_norm": 35.394439697265625, + "learning_rate": 2.5505050505050505e-06, + "loss": 0.3159, + "step": 101 + }, + { + "epoch": 0.0025811675987549663, + "grad_norm": 45.79186248779297, + "learning_rate": 2.575757575757576e-06, + "loss": 0.3485, + "step": 102 + }, + { + "epoch": 0.0026064731634486424, + "grad_norm": 18.149568557739258, + "learning_rate": 2.601010101010101e-06, + "loss": 0.2222, + "step": 103 + }, + { + "epoch": 0.0026317787281423186, + "grad_norm": 21.10387420654297, + "learning_rate": 2.6262626262626267e-06, + "loss": 0.3157, + "step": 104 + }, + { + "epoch": 0.0026570842928359947, + "grad_norm": 26.94928741455078, + "learning_rate": 2.6515151515151514e-06, + "loss": 0.2089, + "step": 105 + }, + { + "epoch": 0.002682389857529671, + "grad_norm": 23.429574966430664, + "learning_rate": 2.676767676767677e-06, + "loss": 0.3579, + "step": 106 + }, + { + "epoch": 0.002707695422223347, + "grad_norm": 54.64403533935547, + "learning_rate": 2.7020202020202025e-06, + "loss": 0.5333, + "step": 107 + }, + { + "epoch": 0.002733000986917023, + "grad_norm": 22.837114334106445, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.365, + "step": 108 + }, + { + "epoch": 0.0027583065516106993, + "grad_norm": 25.040437698364258, + "learning_rate": 2.7525252525252528e-06, + "loss": 0.3676, + "step": 109 + }, + { + "epoch": 0.0027836121163043754, + "grad_norm": 30.833831787109375, + "learning_rate": 2.7777777777777783e-06, + "loss": 0.4126, + "step": 110 + }, + { + "epoch": 0.0028089176809980515, + "grad_norm": 15.017295837402344, + "learning_rate": 2.803030303030303e-06, + "loss": 0.3328, + "step": 111 + }, + { + "epoch": 0.0028342232456917277, + "grad_norm": 13.994912147521973, + "learning_rate": 2.8282828282828286e-06, + "loss": 0.3267, + "step": 112 + }, + { + "epoch": 0.002859528810385404, + "grad_norm": 15.540634155273438, + "learning_rate": 2.8535353535353537e-06, + "loss": 0.3774, + "step": 113 + }, + { + "epoch": 0.00288483437507908, + "grad_norm": 23.533435821533203, + "learning_rate": 2.8787878787878793e-06, + "loss": 0.3445, + "step": 114 + }, + { + "epoch": 0.002910139939772756, + "grad_norm": 25.780658721923828, + "learning_rate": 2.904040404040404e-06, + "loss": 0.2737, + "step": 115 + }, + { + "epoch": 0.0029354455044664322, + "grad_norm": 22.94389533996582, + "learning_rate": 2.9292929292929295e-06, + "loss": 0.4203, + "step": 116 + }, + { + "epoch": 0.0029607510691601084, + "grad_norm": 20.76779556274414, + "learning_rate": 2.954545454545455e-06, + "loss": 0.3801, + "step": 117 + }, + { + "epoch": 0.0029860566338537845, + "grad_norm": 15.19019889831543, + "learning_rate": 2.97979797979798e-06, + "loss": 0.1799, + "step": 118 + }, + { + "epoch": 0.0030113621985474607, + "grad_norm": 13.83127212524414, + "learning_rate": 3.0050505050505054e-06, + "loss": 0.2957, + "step": 119 + }, + { + "epoch": 0.003036667763241137, + "grad_norm": 17.3653621673584, + "learning_rate": 3.0303030303030305e-06, + "loss": 0.2511, + "step": 120 + }, + { + "epoch": 0.003061973327934813, + "grad_norm": 21.749292373657227, + "learning_rate": 3.055555555555556e-06, + "loss": 0.4176, + "step": 121 + }, + { + "epoch": 0.003087278892628489, + "grad_norm": 16.690170288085938, + "learning_rate": 3.0808080808080807e-06, + "loss": 0.319, + "step": 122 + }, + { + "epoch": 0.0031125844573221652, + "grad_norm": 27.041065216064453, + "learning_rate": 3.1060606060606063e-06, + "loss": 0.3077, + "step": 123 + }, + { + "epoch": 0.0031378900220158414, + "grad_norm": 13.456790924072266, + "learning_rate": 3.131313131313132e-06, + "loss": 0.227, + "step": 124 + }, + { + "epoch": 0.0031631955867095175, + "grad_norm": 19.5488338470459, + "learning_rate": 3.1565656565656566e-06, + "loss": 0.1787, + "step": 125 + }, + { + "epoch": 0.0031885011514031936, + "grad_norm": 17.279130935668945, + "learning_rate": 3.181818181818182e-06, + "loss": 0.302, + "step": 126 + }, + { + "epoch": 0.0032138067160968698, + "grad_norm": 9.261479377746582, + "learning_rate": 3.2070707070707072e-06, + "loss": 0.1781, + "step": 127 + }, + { + "epoch": 0.003239112280790546, + "grad_norm": 26.946781158447266, + "learning_rate": 3.232323232323233e-06, + "loss": 0.3458, + "step": 128 + }, + { + "epoch": 0.003264417845484222, + "grad_norm": 21.74663543701172, + "learning_rate": 3.257575757575758e-06, + "loss": 0.3044, + "step": 129 + }, + { + "epoch": 0.003289723410177898, + "grad_norm": 26.209186553955078, + "learning_rate": 3.282828282828283e-06, + "loss": 0.362, + "step": 130 + }, + { + "epoch": 0.0033150289748715743, + "grad_norm": 17.79072380065918, + "learning_rate": 3.3080808080808086e-06, + "loss": 0.2554, + "step": 131 + }, + { + "epoch": 0.0033403345395652505, + "grad_norm": 19.10797882080078, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.2553, + "step": 132 + }, + { + "epoch": 0.0033656401042589266, + "grad_norm": 32.98556900024414, + "learning_rate": 3.358585858585859e-06, + "loss": 0.2873, + "step": 133 + }, + { + "epoch": 0.0033909456689526028, + "grad_norm": 17.488018035888672, + "learning_rate": 3.3838383838383844e-06, + "loss": 0.2469, + "step": 134 + }, + { + "epoch": 0.003416251233646279, + "grad_norm": 24.87834358215332, + "learning_rate": 3.409090909090909e-06, + "loss": 0.3726, + "step": 135 + }, + { + "epoch": 0.003441556798339955, + "grad_norm": 17.670969009399414, + "learning_rate": 3.4343434343434347e-06, + "loss": 0.2454, + "step": 136 + }, + { + "epoch": 0.003466862363033631, + "grad_norm": 20.845136642456055, + "learning_rate": 3.45959595959596e-06, + "loss": 0.3404, + "step": 137 + }, + { + "epoch": 0.0034921679277273073, + "grad_norm": 15.883112907409668, + "learning_rate": 3.4848484848484854e-06, + "loss": 0.3301, + "step": 138 + }, + { + "epoch": 0.0035174734924209835, + "grad_norm": 14.870492935180664, + "learning_rate": 3.51010101010101e-06, + "loss": 0.2582, + "step": 139 + }, + { + "epoch": 0.0035427790571146596, + "grad_norm": 30.698083877563477, + "learning_rate": 3.5353535353535356e-06, + "loss": 0.3352, + "step": 140 + }, + { + "epoch": 0.0035680846218083357, + "grad_norm": 9.771971702575684, + "learning_rate": 3.560606060606061e-06, + "loss": 0.2145, + "step": 141 + }, + { + "epoch": 0.003593390186502012, + "grad_norm": 35.729042053222656, + "learning_rate": 3.585858585858586e-06, + "loss": 0.4594, + "step": 142 + }, + { + "epoch": 0.003618695751195688, + "grad_norm": 22.244529724121094, + "learning_rate": 3.6111111111111115e-06, + "loss": 0.4277, + "step": 143 + }, + { + "epoch": 0.003644001315889364, + "grad_norm": 9.823049545288086, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.2241, + "step": 144 + }, + { + "epoch": 0.0036693068805830403, + "grad_norm": 14.548376083374023, + "learning_rate": 3.661616161616162e-06, + "loss": 0.2537, + "step": 145 + }, + { + "epoch": 0.0036946124452767164, + "grad_norm": 21.155439376831055, + "learning_rate": 3.686868686868687e-06, + "loss": 0.2168, + "step": 146 + }, + { + "epoch": 0.0037199180099703926, + "grad_norm": 10.948342323303223, + "learning_rate": 3.7121212121212124e-06, + "loss": 0.2855, + "step": 147 + }, + { + "epoch": 0.0037452235746640687, + "grad_norm": 10.017020225524902, + "learning_rate": 3.737373737373738e-06, + "loss": 0.2091, + "step": 148 + }, + { + "epoch": 0.003770529139357745, + "grad_norm": 16.846982955932617, + "learning_rate": 3.7626262626262627e-06, + "loss": 0.3501, + "step": 149 + }, + { + "epoch": 0.003795834704051421, + "grad_norm": 14.619205474853516, + "learning_rate": 3.7878787878787882e-06, + "loss": 0.2917, + "step": 150 + }, + { + "epoch": 0.003821140268745097, + "grad_norm": 16.028650283813477, + "learning_rate": 3.8131313131313138e-06, + "loss": 0.2933, + "step": 151 + }, + { + "epoch": 0.0038464458334387733, + "grad_norm": 24.08274269104004, + "learning_rate": 3.8383838383838385e-06, + "loss": 0.2732, + "step": 152 + }, + { + "epoch": 0.0038717513981324494, + "grad_norm": 21.10347557067871, + "learning_rate": 3.863636363636364e-06, + "loss": 0.334, + "step": 153 + }, + { + "epoch": 0.0038970569628261256, + "grad_norm": 23.890649795532227, + "learning_rate": 3.88888888888889e-06, + "loss": 0.3272, + "step": 154 + }, + { + "epoch": 0.003922362527519802, + "grad_norm": 14.023303031921387, + "learning_rate": 3.914141414141415e-06, + "loss": 0.2903, + "step": 155 + }, + { + "epoch": 0.003947668092213478, + "grad_norm": 18.876007080078125, + "learning_rate": 3.93939393939394e-06, + "loss": 0.2845, + "step": 156 + }, + { + "epoch": 0.003972973656907154, + "grad_norm": 21.883371353149414, + "learning_rate": 3.964646464646465e-06, + "loss": 0.4179, + "step": 157 + }, + { + "epoch": 0.00399827922160083, + "grad_norm": 23.60780906677246, + "learning_rate": 3.98989898989899e-06, + "loss": 0.3134, + "step": 158 + }, + { + "epoch": 0.004023584786294506, + "grad_norm": 17.919008255004883, + "learning_rate": 4.015151515151515e-06, + "loss": 0.2987, + "step": 159 + }, + { + "epoch": 0.004048890350988182, + "grad_norm": 12.944025039672852, + "learning_rate": 4.04040404040404e-06, + "loss": 0.3127, + "step": 160 + }, + { + "epoch": 0.0040741959156818585, + "grad_norm": 17.73430061340332, + "learning_rate": 4.065656565656566e-06, + "loss": 0.2235, + "step": 161 + }, + { + "epoch": 0.004099501480375535, + "grad_norm": 28.24292755126953, + "learning_rate": 4.0909090909090915e-06, + "loss": 0.258, + "step": 162 + }, + { + "epoch": 0.004124807045069211, + "grad_norm": 14.118022918701172, + "learning_rate": 4.116161616161617e-06, + "loss": 0.2606, + "step": 163 + }, + { + "epoch": 0.004150112609762887, + "grad_norm": 27.313337326049805, + "learning_rate": 4.141414141414142e-06, + "loss": 0.2918, + "step": 164 + }, + { + "epoch": 0.004175418174456563, + "grad_norm": 27.831928253173828, + "learning_rate": 4.166666666666667e-06, + "loss": 0.3454, + "step": 165 + }, + { + "epoch": 0.004200723739150239, + "grad_norm": 26.754310607910156, + "learning_rate": 4.191919191919192e-06, + "loss": 0.3964, + "step": 166 + }, + { + "epoch": 0.004226029303843915, + "grad_norm": 48.81879806518555, + "learning_rate": 4.217171717171717e-06, + "loss": 0.6186, + "step": 167 + }, + { + "epoch": 0.0042513348685375915, + "grad_norm": 27.48755645751953, + "learning_rate": 4.242424242424243e-06, + "loss": 0.2815, + "step": 168 + }, + { + "epoch": 0.004276640433231268, + "grad_norm": 12.522850036621094, + "learning_rate": 4.267676767676767e-06, + "loss": 0.2308, + "step": 169 + }, + { + "epoch": 0.004301945997924944, + "grad_norm": 20.082483291625977, + "learning_rate": 4.292929292929293e-06, + "loss": 0.2278, + "step": 170 + }, + { + "epoch": 0.00432725156261862, + "grad_norm": 9.676962852478027, + "learning_rate": 4.3181818181818185e-06, + "loss": 0.1918, + "step": 171 + }, + { + "epoch": 0.004352557127312296, + "grad_norm": 17.118436813354492, + "learning_rate": 4.343434343434344e-06, + "loss": 0.2101, + "step": 172 + }, + { + "epoch": 0.004377862692005972, + "grad_norm": 12.080307006835938, + "learning_rate": 4.368686868686869e-06, + "loss": 0.2214, + "step": 173 + }, + { + "epoch": 0.004403168256699648, + "grad_norm": 22.983869552612305, + "learning_rate": 4.393939393939394e-06, + "loss": 0.3182, + "step": 174 + }, + { + "epoch": 0.0044284738213933245, + "grad_norm": 11.69090747833252, + "learning_rate": 4.41919191919192e-06, + "loss": 0.331, + "step": 175 + }, + { + "epoch": 0.004453779386087001, + "grad_norm": 14.995444297790527, + "learning_rate": 4.444444444444444e-06, + "loss": 0.2745, + "step": 176 + }, + { + "epoch": 0.004479084950780677, + "grad_norm": 29.96625518798828, + "learning_rate": 4.46969696969697e-06, + "loss": 0.4596, + "step": 177 + }, + { + "epoch": 0.004504390515474353, + "grad_norm": 20.663211822509766, + "learning_rate": 4.494949494949495e-06, + "loss": 0.2658, + "step": 178 + }, + { + "epoch": 0.004529696080168029, + "grad_norm": 22.315587997436523, + "learning_rate": 4.520202020202021e-06, + "loss": 0.3298, + "step": 179 + }, + { + "epoch": 0.004555001644861705, + "grad_norm": 13.187911987304688, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.1601, + "step": 180 + }, + { + "epoch": 0.004580307209555381, + "grad_norm": 27.021902084350586, + "learning_rate": 4.5707070707070715e-06, + "loss": 0.2995, + "step": 181 + }, + { + "epoch": 0.0046056127742490575, + "grad_norm": 26.516023635864258, + "learning_rate": 4.595959595959597e-06, + "loss": 0.2551, + "step": 182 + }, + { + "epoch": 0.004630918338942734, + "grad_norm": 22.908355712890625, + "learning_rate": 4.621212121212122e-06, + "loss": 0.2317, + "step": 183 + }, + { + "epoch": 0.00465622390363641, + "grad_norm": 8.863795280456543, + "learning_rate": 4.646464646464647e-06, + "loss": 0.2137, + "step": 184 + }, + { + "epoch": 0.004681529468330086, + "grad_norm": 15.437592506408691, + "learning_rate": 4.671717171717172e-06, + "loss": 0.3483, + "step": 185 + }, + { + "epoch": 0.004706835033023762, + "grad_norm": 31.647829055786133, + "learning_rate": 4.696969696969698e-06, + "loss": 0.4715, + "step": 186 + }, + { + "epoch": 0.004732140597717438, + "grad_norm": 25.072528839111328, + "learning_rate": 4.722222222222222e-06, + "loss": 0.3273, + "step": 187 + }, + { + "epoch": 0.004757446162411114, + "grad_norm": 17.979806900024414, + "learning_rate": 4.747474747474748e-06, + "loss": 0.298, + "step": 188 + }, + { + "epoch": 0.0047827517271047905, + "grad_norm": 19.213199615478516, + "learning_rate": 4.772727272727273e-06, + "loss": 0.2396, + "step": 189 + }, + { + "epoch": 0.004808057291798467, + "grad_norm": 24.691316604614258, + "learning_rate": 4.7979797979797985e-06, + "loss": 0.2821, + "step": 190 + }, + { + "epoch": 0.004833362856492143, + "grad_norm": 14.605237007141113, + "learning_rate": 4.823232323232324e-06, + "loss": 0.2591, + "step": 191 + }, + { + "epoch": 0.004858668421185819, + "grad_norm": 12.570551872253418, + "learning_rate": 4.848484848484849e-06, + "loss": 0.182, + "step": 192 + }, + { + "epoch": 0.004883973985879495, + "grad_norm": 12.331137657165527, + "learning_rate": 4.873737373737374e-06, + "loss": 0.3085, + "step": 193 + }, + { + "epoch": 0.004909279550573171, + "grad_norm": 26.27674102783203, + "learning_rate": 4.898989898989899e-06, + "loss": 0.3921, + "step": 194 + }, + { + "epoch": 0.004934585115266847, + "grad_norm": 33.7291374206543, + "learning_rate": 4.924242424242425e-06, + "loss": 0.3356, + "step": 195 + }, + { + "epoch": 0.0049598906799605234, + "grad_norm": 19.309654235839844, + "learning_rate": 4.94949494949495e-06, + "loss": 0.1701, + "step": 196 + }, + { + "epoch": 0.0049851962446542, + "grad_norm": 32.302215576171875, + "learning_rate": 4.974747474747475e-06, + "loss": 0.3719, + "step": 197 + }, + { + "epoch": 0.005010501809347876, + "grad_norm": 25.73348617553711, + "learning_rate": 5e-06, + "loss": 0.253, + "step": 198 + }, + { + "epoch": 0.005035807374041552, + "grad_norm": 21.775341033935547, + "learning_rate": 5.0252525252525255e-06, + "loss": 0.3531, + "step": 199 + }, + { + "epoch": 0.005061112938735228, + "grad_norm": 13.844507217407227, + "learning_rate": 5.0505050505050515e-06, + "loss": 0.2446, + "step": 200 + }, + { + "epoch": 0.005086418503428904, + "grad_norm": 19.96951675415039, + "learning_rate": 5.075757575757576e-06, + "loss": 0.2383, + "step": 201 + }, + { + "epoch": 0.00511172406812258, + "grad_norm": 29.02865219116211, + "learning_rate": 5.101010101010101e-06, + "loss": 0.3308, + "step": 202 + }, + { + "epoch": 0.005137029632816256, + "grad_norm": 17.31614112854004, + "learning_rate": 5.126262626262627e-06, + "loss": 0.3037, + "step": 203 + }, + { + "epoch": 0.0051623351975099326, + "grad_norm": 18.784631729125977, + "learning_rate": 5.151515151515152e-06, + "loss": 0.3084, + "step": 204 + }, + { + "epoch": 0.005187640762203609, + "grad_norm": 16.182270050048828, + "learning_rate": 5.176767676767676e-06, + "loss": 0.2703, + "step": 205 + }, + { + "epoch": 0.005212946326897285, + "grad_norm": 12.465658187866211, + "learning_rate": 5.202020202020202e-06, + "loss": 0.2725, + "step": 206 + }, + { + "epoch": 0.005238251891590961, + "grad_norm": 10.553352355957031, + "learning_rate": 5.2272727272727274e-06, + "loss": 0.2383, + "step": 207 + }, + { + "epoch": 0.005263557456284637, + "grad_norm": 16.007064819335938, + "learning_rate": 5.252525252525253e-06, + "loss": 0.1971, + "step": 208 + }, + { + "epoch": 0.005288863020978313, + "grad_norm": 41.55887222290039, + "learning_rate": 5.2777777777777785e-06, + "loss": 0.4624, + "step": 209 + }, + { + "epoch": 0.005314168585671989, + "grad_norm": 22.486234664916992, + "learning_rate": 5.303030303030303e-06, + "loss": 0.4453, + "step": 210 + }, + { + "epoch": 0.0053394741503656655, + "grad_norm": 25.388614654541016, + "learning_rate": 5.328282828282829e-06, + "loss": 0.2661, + "step": 211 + }, + { + "epoch": 0.005364779715059342, + "grad_norm": 7.9968695640563965, + "learning_rate": 5.353535353535354e-06, + "loss": 0.2592, + "step": 212 + }, + { + "epoch": 0.005390085279753018, + "grad_norm": 25.331857681274414, + "learning_rate": 5.378787878787879e-06, + "loss": 0.3516, + "step": 213 + }, + { + "epoch": 0.005415390844446694, + "grad_norm": 8.861308097839355, + "learning_rate": 5.404040404040405e-06, + "loss": 0.1456, + "step": 214 + }, + { + "epoch": 0.00544069640914037, + "grad_norm": 16.501996994018555, + "learning_rate": 5.429292929292929e-06, + "loss": 0.2045, + "step": 215 + }, + { + "epoch": 0.005466001973834046, + "grad_norm": 8.673340797424316, + "learning_rate": 5.4545454545454545e-06, + "loss": 0.2943, + "step": 216 + }, + { + "epoch": 0.005491307538527722, + "grad_norm": 11.859415054321289, + "learning_rate": 5.4797979797979804e-06, + "loss": 0.2923, + "step": 217 + }, + { + "epoch": 0.0055166131032213985, + "grad_norm": 25.312538146972656, + "learning_rate": 5.5050505050505056e-06, + "loss": 0.3086, + "step": 218 + }, + { + "epoch": 0.005541918667915075, + "grad_norm": 21.96121597290039, + "learning_rate": 5.530303030303031e-06, + "loss": 0.3679, + "step": 219 + }, + { + "epoch": 0.005567224232608751, + "grad_norm": 11.999296188354492, + "learning_rate": 5.555555555555557e-06, + "loss": 0.2063, + "step": 220 + }, + { + "epoch": 0.005592529797302427, + "grad_norm": 15.100467681884766, + "learning_rate": 5.580808080808081e-06, + "loss": 0.345, + "step": 221 + }, + { + "epoch": 0.005617835361996103, + "grad_norm": 13.650830268859863, + "learning_rate": 5.606060606060606e-06, + "loss": 0.2028, + "step": 222 + }, + { + "epoch": 0.005643140926689779, + "grad_norm": 13.424610137939453, + "learning_rate": 5.631313131313132e-06, + "loss": 0.1625, + "step": 223 + }, + { + "epoch": 0.005668446491383455, + "grad_norm": 12.475699424743652, + "learning_rate": 5.656565656565657e-06, + "loss": 0.2536, + "step": 224 + }, + { + "epoch": 0.0056937520560771315, + "grad_norm": 13.454928398132324, + "learning_rate": 5.681818181818183e-06, + "loss": 0.3349, + "step": 225 + }, + { + "epoch": 0.005719057620770808, + "grad_norm": 15.454419136047363, + "learning_rate": 5.7070707070707075e-06, + "loss": 0.352, + "step": 226 + }, + { + "epoch": 0.005744363185464484, + "grad_norm": 11.795758247375488, + "learning_rate": 5.732323232323233e-06, + "loss": 0.2471, + "step": 227 + }, + { + "epoch": 0.00576966875015816, + "grad_norm": 21.920442581176758, + "learning_rate": 5.7575757575757586e-06, + "loss": 0.2092, + "step": 228 + }, + { + "epoch": 0.005794974314851836, + "grad_norm": 12.84473705291748, + "learning_rate": 5.782828282828284e-06, + "loss": 0.2885, + "step": 229 + }, + { + "epoch": 0.005820279879545512, + "grad_norm": 21.48814582824707, + "learning_rate": 5.808080808080808e-06, + "loss": 0.2306, + "step": 230 + }, + { + "epoch": 0.005845585444239188, + "grad_norm": 10.999265670776367, + "learning_rate": 5.833333333333334e-06, + "loss": 0.2335, + "step": 231 + }, + { + "epoch": 0.0058708910089328645, + "grad_norm": 18.341114044189453, + "learning_rate": 5.858585858585859e-06, + "loss": 0.3149, + "step": 232 + }, + { + "epoch": 0.005896196573626541, + "grad_norm": 21.395906448364258, + "learning_rate": 5.883838383838384e-06, + "loss": 0.2948, + "step": 233 + }, + { + "epoch": 0.005921502138320217, + "grad_norm": 28.631505966186523, + "learning_rate": 5.90909090909091e-06, + "loss": 0.5116, + "step": 234 + }, + { + "epoch": 0.005946807703013893, + "grad_norm": 28.650293350219727, + "learning_rate": 5.9343434343434345e-06, + "loss": 0.2523, + "step": 235 + }, + { + "epoch": 0.005972113267707569, + "grad_norm": 23.223047256469727, + "learning_rate": 5.95959595959596e-06, + "loss": 0.3226, + "step": 236 + }, + { + "epoch": 0.005997418832401245, + "grad_norm": 26.0902156829834, + "learning_rate": 5.984848484848486e-06, + "loss": 0.3912, + "step": 237 + }, + { + "epoch": 0.006022724397094921, + "grad_norm": 11.763318061828613, + "learning_rate": 6.010101010101011e-06, + "loss": 0.2871, + "step": 238 + }, + { + "epoch": 0.0060480299617885975, + "grad_norm": 18.078754425048828, + "learning_rate": 6.035353535353535e-06, + "loss": 0.3097, + "step": 239 + }, + { + "epoch": 0.006073335526482274, + "grad_norm": 15.363717079162598, + "learning_rate": 6.060606060606061e-06, + "loss": 0.246, + "step": 240 + }, + { + "epoch": 0.00609864109117595, + "grad_norm": 11.624951362609863, + "learning_rate": 6.085858585858586e-06, + "loss": 0.2981, + "step": 241 + }, + { + "epoch": 0.006123946655869626, + "grad_norm": 17.451763153076172, + "learning_rate": 6.111111111111112e-06, + "loss": 0.3496, + "step": 242 + }, + { + "epoch": 0.006149252220563302, + "grad_norm": 13.940397262573242, + "learning_rate": 6.136363636363637e-06, + "loss": 0.2579, + "step": 243 + }, + { + "epoch": 0.006174557785256978, + "grad_norm": 8.73307991027832, + "learning_rate": 6.1616161616161615e-06, + "loss": 0.1994, + "step": 244 + }, + { + "epoch": 0.006199863349950654, + "grad_norm": 21.891632080078125, + "learning_rate": 6.1868686868686875e-06, + "loss": 0.2645, + "step": 245 + }, + { + "epoch": 0.0062251689146443304, + "grad_norm": 14.322588920593262, + "learning_rate": 6.212121212121213e-06, + "loss": 0.1845, + "step": 246 + }, + { + "epoch": 0.006250474479338007, + "grad_norm": 11.515097618103027, + "learning_rate": 6.237373737373738e-06, + "loss": 0.2577, + "step": 247 + }, + { + "epoch": 0.006275780044031683, + "grad_norm": 12.857513427734375, + "learning_rate": 6.262626262626264e-06, + "loss": 0.2247, + "step": 248 + }, + { + "epoch": 0.006301085608725359, + "grad_norm": 18.906444549560547, + "learning_rate": 6.287878787878788e-06, + "loss": 0.217, + "step": 249 + }, + { + "epoch": 0.006326391173419035, + "grad_norm": 13.68765640258789, + "learning_rate": 6.313131313131313e-06, + "loss": 0.3378, + "step": 250 + }, + { + "epoch": 0.006351696738112711, + "grad_norm": 32.254905700683594, + "learning_rate": 6.338383838383839e-06, + "loss": 0.4177, + "step": 251 + }, + { + "epoch": 0.006377002302806387, + "grad_norm": 18.701406478881836, + "learning_rate": 6.363636363636364e-06, + "loss": 0.3687, + "step": 252 + }, + { + "epoch": 0.006402307867500063, + "grad_norm": 18.37067413330078, + "learning_rate": 6.3888888888888885e-06, + "loss": 0.2277, + "step": 253 + }, + { + "epoch": 0.0064276134321937396, + "grad_norm": 10.328590393066406, + "learning_rate": 6.4141414141414145e-06, + "loss": 0.3158, + "step": 254 + }, + { + "epoch": 0.006452918996887416, + "grad_norm": 15.86231803894043, + "learning_rate": 6.43939393939394e-06, + "loss": 0.2402, + "step": 255 + }, + { + "epoch": 0.006478224561581092, + "grad_norm": 22.704130172729492, + "learning_rate": 6.464646464646466e-06, + "loss": 0.3634, + "step": 256 + }, + { + "epoch": 0.006503530126274768, + "grad_norm": 39.17011642456055, + "learning_rate": 6.489898989898991e-06, + "loss": 0.2276, + "step": 257 + }, + { + "epoch": 0.006528835690968444, + "grad_norm": 26.73366928100586, + "learning_rate": 6.515151515151516e-06, + "loss": 0.3124, + "step": 258 + }, + { + "epoch": 0.00655414125566212, + "grad_norm": 22.43861961364746, + "learning_rate": 6.540404040404042e-06, + "loss": 0.2888, + "step": 259 + }, + { + "epoch": 0.006579446820355796, + "grad_norm": 17.236446380615234, + "learning_rate": 6.565656565656566e-06, + "loss": 0.2792, + "step": 260 + }, + { + "epoch": 0.0066047523850494725, + "grad_norm": 20.527462005615234, + "learning_rate": 6.590909090909091e-06, + "loss": 0.3534, + "step": 261 + }, + { + "epoch": 0.006630057949743149, + "grad_norm": 18.213298797607422, + "learning_rate": 6.616161616161617e-06, + "loss": 0.3491, + "step": 262 + }, + { + "epoch": 0.006655363514436825, + "grad_norm": 12.748693466186523, + "learning_rate": 6.641414141414142e-06, + "loss": 0.2556, + "step": 263 + }, + { + "epoch": 0.006680669079130501, + "grad_norm": 17.894304275512695, + "learning_rate": 6.666666666666667e-06, + "loss": 0.3036, + "step": 264 + }, + { + "epoch": 0.006705974643824177, + "grad_norm": 27.601490020751953, + "learning_rate": 6.691919191919193e-06, + "loss": 0.2305, + "step": 265 + }, + { + "epoch": 0.006731280208517853, + "grad_norm": 15.651050567626953, + "learning_rate": 6.717171717171718e-06, + "loss": 0.2724, + "step": 266 + }, + { + "epoch": 0.006756585773211529, + "grad_norm": 16.600597381591797, + "learning_rate": 6.742424242424243e-06, + "loss": 0.2957, + "step": 267 + }, + { + "epoch": 0.0067818913379052055, + "grad_norm": 24.02947425842285, + "learning_rate": 6.767676767676769e-06, + "loss": 0.3663, + "step": 268 + }, + { + "epoch": 0.006807196902598882, + "grad_norm": 24.75956153869629, + "learning_rate": 6.792929292929293e-06, + "loss": 0.4167, + "step": 269 + }, + { + "epoch": 0.006832502467292558, + "grad_norm": 14.583556175231934, + "learning_rate": 6.818181818181818e-06, + "loss": 0.1631, + "step": 270 + }, + { + "epoch": 0.006857808031986234, + "grad_norm": 11.27689266204834, + "learning_rate": 6.843434343434344e-06, + "loss": 0.2295, + "step": 271 + }, + { + "epoch": 0.00688311359667991, + "grad_norm": 24.851375579833984, + "learning_rate": 6.868686868686869e-06, + "loss": 0.3701, + "step": 272 + }, + { + "epoch": 0.006908419161373586, + "grad_norm": 17.54237937927246, + "learning_rate": 6.893939393939395e-06, + "loss": 0.3215, + "step": 273 + }, + { + "epoch": 0.006933724726067262, + "grad_norm": 17.973167419433594, + "learning_rate": 6.91919191919192e-06, + "loss": 0.2867, + "step": 274 + }, + { + "epoch": 0.0069590302907609385, + "grad_norm": 14.18320083618164, + "learning_rate": 6.944444444444445e-06, + "loss": 0.2923, + "step": 275 + }, + { + "epoch": 0.006984335855454615, + "grad_norm": 29.407896041870117, + "learning_rate": 6.969696969696971e-06, + "loss": 0.5324, + "step": 276 + }, + { + "epoch": 0.007009641420148291, + "grad_norm": 22.359174728393555, + "learning_rate": 6.994949494949496e-06, + "loss": 0.3684, + "step": 277 + }, + { + "epoch": 0.007034946984841967, + "grad_norm": 12.541617393493652, + "learning_rate": 7.02020202020202e-06, + "loss": 0.2751, + "step": 278 + }, + { + "epoch": 0.007060252549535643, + "grad_norm": 25.570138931274414, + "learning_rate": 7.045454545454546e-06, + "loss": 0.3033, + "step": 279 + }, + { + "epoch": 0.007085558114229319, + "grad_norm": 23.462854385375977, + "learning_rate": 7.070707070707071e-06, + "loss": 0.4287, + "step": 280 + }, + { + "epoch": 0.007110863678922995, + "grad_norm": 14.463920593261719, + "learning_rate": 7.095959595959596e-06, + "loss": 0.3688, + "step": 281 + }, + { + "epoch": 0.0071361692436166715, + "grad_norm": 17.433801651000977, + "learning_rate": 7.121212121212122e-06, + "loss": 0.2654, + "step": 282 + }, + { + "epoch": 0.007161474808310348, + "grad_norm": 19.65548324584961, + "learning_rate": 7.146464646464647e-06, + "loss": 0.2743, + "step": 283 + }, + { + "epoch": 0.007186780373004024, + "grad_norm": 14.93813705444336, + "learning_rate": 7.171717171717172e-06, + "loss": 0.306, + "step": 284 + }, + { + "epoch": 0.0072120859376977, + "grad_norm": 13.89380931854248, + "learning_rate": 7.196969696969698e-06, + "loss": 0.303, + "step": 285 + }, + { + "epoch": 0.007237391502391376, + "grad_norm": 14.5941743850708, + "learning_rate": 7.222222222222223e-06, + "loss": 0.3186, + "step": 286 + }, + { + "epoch": 0.007262697067085052, + "grad_norm": 17.90104866027832, + "learning_rate": 7.247474747474747e-06, + "loss": 0.2851, + "step": 287 + }, + { + "epoch": 0.007288002631778728, + "grad_norm": 13.054996490478516, + "learning_rate": 7.272727272727273e-06, + "loss": 0.2284, + "step": 288 + }, + { + "epoch": 0.0073133081964724045, + "grad_norm": 17.046144485473633, + "learning_rate": 7.297979797979798e-06, + "loss": 0.2617, + "step": 289 + }, + { + "epoch": 0.007338613761166081, + "grad_norm": 17.59104347229004, + "learning_rate": 7.323232323232324e-06, + "loss": 0.3663, + "step": 290 + }, + { + "epoch": 0.007363919325859757, + "grad_norm": 20.8853816986084, + "learning_rate": 7.348484848484849e-06, + "loss": 0.2329, + "step": 291 + }, + { + "epoch": 0.007389224890553433, + "grad_norm": 16.82635498046875, + "learning_rate": 7.373737373737374e-06, + "loss": 0.1557, + "step": 292 + }, + { + "epoch": 0.007414530455247109, + "grad_norm": 20.13576316833496, + "learning_rate": 7.3989898989899e-06, + "loss": 0.2707, + "step": 293 + }, + { + "epoch": 0.007439836019940785, + "grad_norm": 18.889863967895508, + "learning_rate": 7.424242424242425e-06, + "loss": 0.2483, + "step": 294 + }, + { + "epoch": 0.007465141584634461, + "grad_norm": 10.225619316101074, + "learning_rate": 7.44949494949495e-06, + "loss": 0.243, + "step": 295 + }, + { + "epoch": 0.007490447149328137, + "grad_norm": 6.650562763214111, + "learning_rate": 7.474747474747476e-06, + "loss": 0.1941, + "step": 296 + }, + { + "epoch": 0.007515752714021814, + "grad_norm": 19.348190307617188, + "learning_rate": 7.500000000000001e-06, + "loss": 0.2722, + "step": 297 + }, + { + "epoch": 0.00754105827871549, + "grad_norm": 27.294872283935547, + "learning_rate": 7.525252525252525e-06, + "loss": 0.3614, + "step": 298 + }, + { + "epoch": 0.007566363843409166, + "grad_norm": 14.496710777282715, + "learning_rate": 7.550505050505051e-06, + "loss": 0.2609, + "step": 299 + }, + { + "epoch": 0.007591669408102842, + "grad_norm": 14.334298133850098, + "learning_rate": 7.5757575757575764e-06, + "loss": 0.2959, + "step": 300 + }, + { + "epoch": 0.007616974972796518, + "grad_norm": 11.849609375, + "learning_rate": 7.6010101010101016e-06, + "loss": 0.3219, + "step": 301 + }, + { + "epoch": 0.007642280537490194, + "grad_norm": 8.692543983459473, + "learning_rate": 7.6262626262626275e-06, + "loss": 0.361, + "step": 302 + }, + { + "epoch": 0.00766758610218387, + "grad_norm": 15.481111526489258, + "learning_rate": 7.651515151515152e-06, + "loss": 0.2505, + "step": 303 + }, + { + "epoch": 0.0076928916668775466, + "grad_norm": 20.831937789916992, + "learning_rate": 7.676767676767677e-06, + "loss": 0.3459, + "step": 304 + }, + { + "epoch": 0.007718197231571223, + "grad_norm": 13.902071952819824, + "learning_rate": 7.702020202020202e-06, + "loss": 0.2285, + "step": 305 + }, + { + "epoch": 0.007743502796264899, + "grad_norm": 15.110657691955566, + "learning_rate": 7.727272727272727e-06, + "loss": 0.2993, + "step": 306 + }, + { + "epoch": 0.007768808360958575, + "grad_norm": 21.14511489868164, + "learning_rate": 7.752525252525254e-06, + "loss": 0.3314, + "step": 307 + }, + { + "epoch": 0.007794113925652251, + "grad_norm": 6.599723815917969, + "learning_rate": 7.77777777777778e-06, + "loss": 0.1677, + "step": 308 + }, + { + "epoch": 0.007819419490345926, + "grad_norm": 28.98724937438965, + "learning_rate": 7.803030303030303e-06, + "loss": 0.4191, + "step": 309 + }, + { + "epoch": 0.007844725055039603, + "grad_norm": 11.443516731262207, + "learning_rate": 7.82828282828283e-06, + "loss": 0.1758, + "step": 310 + }, + { + "epoch": 0.007870030619733279, + "grad_norm": 14.329314231872559, + "learning_rate": 7.853535353535355e-06, + "loss": 0.2277, + "step": 311 + }, + { + "epoch": 0.007895336184426956, + "grad_norm": 28.090497970581055, + "learning_rate": 7.87878787878788e-06, + "loss": 0.5247, + "step": 312 + }, + { + "epoch": 0.007920641749120631, + "grad_norm": 26.161563873291016, + "learning_rate": 7.904040404040405e-06, + "loss": 0.3619, + "step": 313 + }, + { + "epoch": 0.007945947313814308, + "grad_norm": 9.246990203857422, + "learning_rate": 7.92929292929293e-06, + "loss": 0.2154, + "step": 314 + }, + { + "epoch": 0.007971252878507983, + "grad_norm": 12.676644325256348, + "learning_rate": 7.954545454545455e-06, + "loss": 0.2248, + "step": 315 + }, + { + "epoch": 0.00799655844320166, + "grad_norm": 7.328291416168213, + "learning_rate": 7.97979797979798e-06, + "loss": 0.1701, + "step": 316 + }, + { + "epoch": 0.008021864007895336, + "grad_norm": 12.516305923461914, + "learning_rate": 8.005050505050505e-06, + "loss": 0.2694, + "step": 317 + }, + { + "epoch": 0.008047169572589013, + "grad_norm": 7.472118377685547, + "learning_rate": 8.03030303030303e-06, + "loss": 0.2746, + "step": 318 + }, + { + "epoch": 0.008072475137282688, + "grad_norm": 13.431432723999023, + "learning_rate": 8.055555555555557e-06, + "loss": 0.3095, + "step": 319 + }, + { + "epoch": 0.008097780701976365, + "grad_norm": 14.849778175354004, + "learning_rate": 8.08080808080808e-06, + "loss": 0.3169, + "step": 320 + }, + { + "epoch": 0.00812308626667004, + "grad_norm": 17.217525482177734, + "learning_rate": 8.106060606060606e-06, + "loss": 0.2436, + "step": 321 + }, + { + "epoch": 0.008148391831363717, + "grad_norm": 8.827483177185059, + "learning_rate": 8.131313131313133e-06, + "loss": 0.2999, + "step": 322 + }, + { + "epoch": 0.008173697396057392, + "grad_norm": 16.039011001586914, + "learning_rate": 8.156565656565658e-06, + "loss": 0.2754, + "step": 323 + }, + { + "epoch": 0.00819900296075107, + "grad_norm": 9.040390014648438, + "learning_rate": 8.181818181818183e-06, + "loss": 0.273, + "step": 324 + }, + { + "epoch": 0.008224308525444745, + "grad_norm": 23.94828987121582, + "learning_rate": 8.207070707070708e-06, + "loss": 0.2957, + "step": 325 + }, + { + "epoch": 0.008249614090138422, + "grad_norm": 17.350988388061523, + "learning_rate": 8.232323232323233e-06, + "loss": 0.3433, + "step": 326 + }, + { + "epoch": 0.008274919654832097, + "grad_norm": 11.978271484375, + "learning_rate": 8.257575757575758e-06, + "loss": 0.2442, + "step": 327 + }, + { + "epoch": 0.008300225219525774, + "grad_norm": 12.13689136505127, + "learning_rate": 8.282828282828283e-06, + "loss": 0.2976, + "step": 328 + }, + { + "epoch": 0.00832553078421945, + "grad_norm": 23.780353546142578, + "learning_rate": 8.308080808080809e-06, + "loss": 0.3207, + "step": 329 + }, + { + "epoch": 0.008350836348913126, + "grad_norm": 11.537596702575684, + "learning_rate": 8.333333333333334e-06, + "loss": 0.1784, + "step": 330 + }, + { + "epoch": 0.008376141913606801, + "grad_norm": 15.173540115356445, + "learning_rate": 8.358585858585859e-06, + "loss": 0.3065, + "step": 331 + }, + { + "epoch": 0.008401447478300478, + "grad_norm": 8.82516098022461, + "learning_rate": 8.383838383838384e-06, + "loss": 0.2918, + "step": 332 + }, + { + "epoch": 0.008426753042994154, + "grad_norm": 14.685188293457031, + "learning_rate": 8.40909090909091e-06, + "loss": 0.3156, + "step": 333 + }, + { + "epoch": 0.00845205860768783, + "grad_norm": 13.559962272644043, + "learning_rate": 8.434343434343434e-06, + "loss": 0.264, + "step": 334 + }, + { + "epoch": 0.008477364172381506, + "grad_norm": 13.121143341064453, + "learning_rate": 8.45959595959596e-06, + "loss": 0.2419, + "step": 335 + }, + { + "epoch": 0.008502669737075183, + "grad_norm": 10.970030784606934, + "learning_rate": 8.484848484848486e-06, + "loss": 0.2357, + "step": 336 + }, + { + "epoch": 0.008527975301768858, + "grad_norm": 19.643356323242188, + "learning_rate": 8.510101010101011e-06, + "loss": 0.2527, + "step": 337 + }, + { + "epoch": 0.008553280866462535, + "grad_norm": 11.423808097839355, + "learning_rate": 8.535353535353535e-06, + "loss": 0.2371, + "step": 338 + }, + { + "epoch": 0.00857858643115621, + "grad_norm": 18.009702682495117, + "learning_rate": 8.560606060606062e-06, + "loss": 0.2689, + "step": 339 + }, + { + "epoch": 0.008603891995849888, + "grad_norm": 12.82961368560791, + "learning_rate": 8.585858585858587e-06, + "loss": 0.2886, + "step": 340 + }, + { + "epoch": 0.008629197560543563, + "grad_norm": 12.637436866760254, + "learning_rate": 8.611111111111112e-06, + "loss": 0.2347, + "step": 341 + }, + { + "epoch": 0.00865450312523724, + "grad_norm": 18.537168502807617, + "learning_rate": 8.636363636363637e-06, + "loss": 0.3131, + "step": 342 + }, + { + "epoch": 0.008679808689930915, + "grad_norm": 12.805980682373047, + "learning_rate": 8.661616161616162e-06, + "loss": 0.2929, + "step": 343 + }, + { + "epoch": 0.008705114254624592, + "grad_norm": 16.150205612182617, + "learning_rate": 8.686868686868687e-06, + "loss": 0.3544, + "step": 344 + }, + { + "epoch": 0.008730419819318267, + "grad_norm": 8.961577415466309, + "learning_rate": 8.712121212121212e-06, + "loss": 0.2016, + "step": 345 + }, + { + "epoch": 0.008755725384011944, + "grad_norm": 25.138078689575195, + "learning_rate": 8.737373737373738e-06, + "loss": 0.266, + "step": 346 + }, + { + "epoch": 0.00878103094870562, + "grad_norm": 13.642184257507324, + "learning_rate": 8.762626262626264e-06, + "loss": 0.2572, + "step": 347 + }, + { + "epoch": 0.008806336513399297, + "grad_norm": 14.403043746948242, + "learning_rate": 8.787878787878788e-06, + "loss": 0.2154, + "step": 348 + }, + { + "epoch": 0.008831642078092972, + "grad_norm": 25.114225387573242, + "learning_rate": 8.813131313131313e-06, + "loss": 0.3587, + "step": 349 + }, + { + "epoch": 0.008856947642786649, + "grad_norm": 14.66457462310791, + "learning_rate": 8.83838383838384e-06, + "loss": 0.2179, + "step": 350 + }, + { + "epoch": 0.008882253207480324, + "grad_norm": 15.660161972045898, + "learning_rate": 8.863636363636365e-06, + "loss": 0.3361, + "step": 351 + }, + { + "epoch": 0.008907558772174001, + "grad_norm": 13.207562446594238, + "learning_rate": 8.888888888888888e-06, + "loss": 0.2213, + "step": 352 + }, + { + "epoch": 0.008932864336867677, + "grad_norm": 18.104259490966797, + "learning_rate": 8.914141414141415e-06, + "loss": 0.3962, + "step": 353 + }, + { + "epoch": 0.008958169901561354, + "grad_norm": 10.4760160446167, + "learning_rate": 8.93939393939394e-06, + "loss": 0.2053, + "step": 354 + }, + { + "epoch": 0.008983475466255029, + "grad_norm": 11.249814987182617, + "learning_rate": 8.964646464646465e-06, + "loss": 0.3122, + "step": 355 + }, + { + "epoch": 0.009008781030948706, + "grad_norm": 11.62142562866211, + "learning_rate": 8.98989898989899e-06, + "loss": 0.186, + "step": 356 + }, + { + "epoch": 0.009034086595642381, + "grad_norm": 10.19299030303955, + "learning_rate": 9.015151515151516e-06, + "loss": 0.3617, + "step": 357 + }, + { + "epoch": 0.009059392160336058, + "grad_norm": 10.764678955078125, + "learning_rate": 9.040404040404042e-06, + "loss": 0.1581, + "step": 358 + }, + { + "epoch": 0.009084697725029733, + "grad_norm": 16.48432159423828, + "learning_rate": 9.065656565656566e-06, + "loss": 0.3153, + "step": 359 + }, + { + "epoch": 0.00911000328972341, + "grad_norm": 14.046751022338867, + "learning_rate": 9.090909090909091e-06, + "loss": 0.2216, + "step": 360 + }, + { + "epoch": 0.009135308854417086, + "grad_norm": 12.794523239135742, + "learning_rate": 9.116161616161618e-06, + "loss": 0.3246, + "step": 361 + }, + { + "epoch": 0.009160614419110763, + "grad_norm": 6.911907196044922, + "learning_rate": 9.141414141414143e-06, + "loss": 0.2181, + "step": 362 + }, + { + "epoch": 0.009185919983804438, + "grad_norm": 7.209628105163574, + "learning_rate": 9.166666666666666e-06, + "loss": 0.1857, + "step": 363 + }, + { + "epoch": 0.009211225548498115, + "grad_norm": 11.14281940460205, + "learning_rate": 9.191919191919193e-06, + "loss": 0.2366, + "step": 364 + }, + { + "epoch": 0.00923653111319179, + "grad_norm": 8.965890884399414, + "learning_rate": 9.217171717171718e-06, + "loss": 0.1781, + "step": 365 + }, + { + "epoch": 0.009261836677885467, + "grad_norm": 18.022722244262695, + "learning_rate": 9.242424242424244e-06, + "loss": 0.313, + "step": 366 + }, + { + "epoch": 0.009287142242579143, + "grad_norm": 17.599794387817383, + "learning_rate": 9.267676767676769e-06, + "loss": 0.3379, + "step": 367 + }, + { + "epoch": 0.00931244780727282, + "grad_norm": 13.962872505187988, + "learning_rate": 9.292929292929294e-06, + "loss": 0.1921, + "step": 368 + }, + { + "epoch": 0.009337753371966495, + "grad_norm": 9.991558074951172, + "learning_rate": 9.318181818181819e-06, + "loss": 0.2486, + "step": 369 + }, + { + "epoch": 0.009363058936660172, + "grad_norm": 24.045095443725586, + "learning_rate": 9.343434343434344e-06, + "loss": 0.3232, + "step": 370 + }, + { + "epoch": 0.009388364501353847, + "grad_norm": 19.216787338256836, + "learning_rate": 9.36868686868687e-06, + "loss": 0.3663, + "step": 371 + }, + { + "epoch": 0.009413670066047524, + "grad_norm": 34.785648345947266, + "learning_rate": 9.393939393939396e-06, + "loss": 0.5359, + "step": 372 + }, + { + "epoch": 0.0094389756307412, + "grad_norm": 9.461493492126465, + "learning_rate": 9.41919191919192e-06, + "loss": 0.3306, + "step": 373 + }, + { + "epoch": 0.009464281195434876, + "grad_norm": 15.35291862487793, + "learning_rate": 9.444444444444445e-06, + "loss": 0.3727, + "step": 374 + }, + { + "epoch": 0.009489586760128552, + "grad_norm": 8.651911735534668, + "learning_rate": 9.469696969696971e-06, + "loss": 0.2466, + "step": 375 + }, + { + "epoch": 0.009514892324822229, + "grad_norm": 17.779752731323242, + "learning_rate": 9.494949494949497e-06, + "loss": 0.3042, + "step": 376 + }, + { + "epoch": 0.009540197889515904, + "grad_norm": 18.91037368774414, + "learning_rate": 9.52020202020202e-06, + "loss": 0.4591, + "step": 377 + }, + { + "epoch": 0.009565503454209581, + "grad_norm": 16.10509490966797, + "learning_rate": 9.545454545454547e-06, + "loss": 0.3625, + "step": 378 + }, + { + "epoch": 0.009590809018903256, + "grad_norm": 15.18287181854248, + "learning_rate": 9.570707070707072e-06, + "loss": 0.2948, + "step": 379 + }, + { + "epoch": 0.009616114583596933, + "grad_norm": 13.465246200561523, + "learning_rate": 9.595959595959597e-06, + "loss": 0.2479, + "step": 380 + }, + { + "epoch": 0.009641420148290608, + "grad_norm": 8.559194564819336, + "learning_rate": 9.621212121212122e-06, + "loss": 0.2415, + "step": 381 + }, + { + "epoch": 0.009666725712984285, + "grad_norm": 17.35698699951172, + "learning_rate": 9.646464646464647e-06, + "loss": 0.2831, + "step": 382 + }, + { + "epoch": 0.00969203127767796, + "grad_norm": 18.902782440185547, + "learning_rate": 9.671717171717172e-06, + "loss": 0.2656, + "step": 383 + }, + { + "epoch": 0.009717336842371638, + "grad_norm": 25.19538688659668, + "learning_rate": 9.696969696969698e-06, + "loss": 0.3274, + "step": 384 + }, + { + "epoch": 0.009742642407065313, + "grad_norm": 10.177167892456055, + "learning_rate": 9.722222222222223e-06, + "loss": 0.2776, + "step": 385 + }, + { + "epoch": 0.00976794797175899, + "grad_norm": 14.54052734375, + "learning_rate": 9.747474747474748e-06, + "loss": 0.1965, + "step": 386 + }, + { + "epoch": 0.009793253536452665, + "grad_norm": 22.34730339050293, + "learning_rate": 9.772727272727273e-06, + "loss": 0.3665, + "step": 387 + }, + { + "epoch": 0.009818559101146342, + "grad_norm": 21.72518539428711, + "learning_rate": 9.797979797979798e-06, + "loss": 0.3107, + "step": 388 + }, + { + "epoch": 0.009843864665840018, + "grad_norm": 23.31667137145996, + "learning_rate": 9.823232323232325e-06, + "loss": 0.3617, + "step": 389 + }, + { + "epoch": 0.009869170230533695, + "grad_norm": 20.831878662109375, + "learning_rate": 9.84848484848485e-06, + "loss": 0.3601, + "step": 390 + }, + { + "epoch": 0.00989447579522737, + "grad_norm": 11.22758674621582, + "learning_rate": 9.873737373737373e-06, + "loss": 0.2595, + "step": 391 + }, + { + "epoch": 0.009919781359921047, + "grad_norm": 14.004515647888184, + "learning_rate": 9.8989898989899e-06, + "loss": 0.3328, + "step": 392 + }, + { + "epoch": 0.009945086924614722, + "grad_norm": 11.214605331420898, + "learning_rate": 9.924242424242425e-06, + "loss": 0.2517, + "step": 393 + }, + { + "epoch": 0.0099703924893084, + "grad_norm": 8.892863273620605, + "learning_rate": 9.94949494949495e-06, + "loss": 0.2055, + "step": 394 + }, + { + "epoch": 0.009995698054002074, + "grad_norm": 13.775158882141113, + "learning_rate": 9.974747474747476e-06, + "loss": 0.1963, + "step": 395 + }, + { + "epoch": 0.010021003618695751, + "grad_norm": 10.459094047546387, + "learning_rate": 1e-05, + "loss": 0.2788, + "step": 396 + }, + { + "epoch": 0.010046309183389427, + "grad_norm": 11.152180671691895, + "learning_rate": 9.999999983877965e-06, + "loss": 0.2349, + "step": 397 + }, + { + "epoch": 0.010071614748083104, + "grad_norm": 8.08830738067627, + "learning_rate": 9.999999935511862e-06, + "loss": 0.216, + "step": 398 + }, + { + "epoch": 0.010096920312776779, + "grad_norm": 9.970813751220703, + "learning_rate": 9.999999854901688e-06, + "loss": 0.3231, + "step": 399 + }, + { + "epoch": 0.010122225877470456, + "grad_norm": 27.47215461730957, + "learning_rate": 9.999999742047443e-06, + "loss": 0.2311, + "step": 400 + }, + { + "epoch": 0.010147531442164131, + "grad_norm": 14.799821853637695, + "learning_rate": 9.999999596949132e-06, + "loss": 0.2364, + "step": 401 + }, + { + "epoch": 0.010172837006857808, + "grad_norm": 8.349687576293945, + "learning_rate": 9.999999419606752e-06, + "loss": 0.1334, + "step": 402 + }, + { + "epoch": 0.010198142571551484, + "grad_norm": 11.553247451782227, + "learning_rate": 9.999999210020308e-06, + "loss": 0.2936, + "step": 403 + }, + { + "epoch": 0.01022344813624516, + "grad_norm": 9.755717277526855, + "learning_rate": 9.999998968189798e-06, + "loss": 0.1745, + "step": 404 + }, + { + "epoch": 0.010248753700938836, + "grad_norm": 14.551294326782227, + "learning_rate": 9.999998694115225e-06, + "loss": 0.2902, + "step": 405 + }, + { + "epoch": 0.010274059265632513, + "grad_norm": 9.28193187713623, + "learning_rate": 9.99999838779659e-06, + "loss": 0.1985, + "step": 406 + }, + { + "epoch": 0.010299364830326188, + "grad_norm": 13.224087715148926, + "learning_rate": 9.999998049233897e-06, + "loss": 0.1745, + "step": 407 + }, + { + "epoch": 0.010324670395019865, + "grad_norm": 16.790788650512695, + "learning_rate": 9.999997678427145e-06, + "loss": 0.352, + "step": 408 + }, + { + "epoch": 0.01034997595971354, + "grad_norm": 19.77552604675293, + "learning_rate": 9.999997275376338e-06, + "loss": 0.387, + "step": 409 + }, + { + "epoch": 0.010375281524407217, + "grad_norm": 13.441563606262207, + "learning_rate": 9.999996840081478e-06, + "loss": 0.2642, + "step": 410 + }, + { + "epoch": 0.010400587089100893, + "grad_norm": 10.079488754272461, + "learning_rate": 9.999996372542571e-06, + "loss": 0.317, + "step": 411 + }, + { + "epoch": 0.01042589265379457, + "grad_norm": 24.60816764831543, + "learning_rate": 9.999995872759615e-06, + "loss": 0.44, + "step": 412 + }, + { + "epoch": 0.010451198218488245, + "grad_norm": 9.184035301208496, + "learning_rate": 9.999995340732617e-06, + "loss": 0.2524, + "step": 413 + }, + { + "epoch": 0.010476503783181922, + "grad_norm": 7.706802845001221, + "learning_rate": 9.999994776461578e-06, + "loss": 0.2404, + "step": 414 + }, + { + "epoch": 0.010501809347875597, + "grad_norm": 14.695344924926758, + "learning_rate": 9.999994179946504e-06, + "loss": 0.3156, + "step": 415 + }, + { + "epoch": 0.010527114912569274, + "grad_norm": 14.627485275268555, + "learning_rate": 9.999993551187398e-06, + "loss": 0.232, + "step": 416 + }, + { + "epoch": 0.01055242047726295, + "grad_norm": 10.701492309570312, + "learning_rate": 9.999992890184262e-06, + "loss": 0.2016, + "step": 417 + }, + { + "epoch": 0.010577726041956627, + "grad_norm": 14.391084671020508, + "learning_rate": 9.999992196937102e-06, + "loss": 0.3717, + "step": 418 + }, + { + "epoch": 0.010603031606650302, + "grad_norm": 12.872722625732422, + "learning_rate": 9.999991471445924e-06, + "loss": 0.3269, + "step": 419 + }, + { + "epoch": 0.010628337171343979, + "grad_norm": 14.682100296020508, + "learning_rate": 9.99999071371073e-06, + "loss": 0.3132, + "step": 420 + }, + { + "epoch": 0.010653642736037654, + "grad_norm": 9.952560424804688, + "learning_rate": 9.999989923731527e-06, + "loss": 0.2655, + "step": 421 + }, + { + "epoch": 0.010678948300731331, + "grad_norm": 17.40646743774414, + "learning_rate": 9.999989101508319e-06, + "loss": 0.2736, + "step": 422 + }, + { + "epoch": 0.010704253865425006, + "grad_norm": 15.019909858703613, + "learning_rate": 9.999988247041111e-06, + "loss": 0.3101, + "step": 423 + }, + { + "epoch": 0.010729559430118683, + "grad_norm": 15.893204689025879, + "learning_rate": 9.999987360329908e-06, + "loss": 0.3657, + "step": 424 + }, + { + "epoch": 0.010754864994812359, + "grad_norm": 12.060003280639648, + "learning_rate": 9.999986441374716e-06, + "loss": 0.2522, + "step": 425 + }, + { + "epoch": 0.010780170559506036, + "grad_norm": 7.006519317626953, + "learning_rate": 9.999985490175544e-06, + "loss": 0.254, + "step": 426 + }, + { + "epoch": 0.010805476124199711, + "grad_norm": 7.347377777099609, + "learning_rate": 9.999984506732392e-06, + "loss": 0.2627, + "step": 427 + }, + { + "epoch": 0.010830781688893388, + "grad_norm": 24.58062744140625, + "learning_rate": 9.999983491045273e-06, + "loss": 0.3037, + "step": 428 + }, + { + "epoch": 0.010856087253587063, + "grad_norm": 6.056812286376953, + "learning_rate": 9.99998244311419e-06, + "loss": 0.1679, + "step": 429 + }, + { + "epoch": 0.01088139281828074, + "grad_norm": 20.03372573852539, + "learning_rate": 9.999981362939151e-06, + "loss": 0.3009, + "step": 430 + }, + { + "epoch": 0.010906698382974415, + "grad_norm": 5.467203140258789, + "learning_rate": 9.99998025052016e-06, + "loss": 0.1694, + "step": 431 + }, + { + "epoch": 0.010932003947668092, + "grad_norm": 6.159570217132568, + "learning_rate": 9.999979105857228e-06, + "loss": 0.1667, + "step": 432 + }, + { + "epoch": 0.010957309512361768, + "grad_norm": 5.757872581481934, + "learning_rate": 9.99997792895036e-06, + "loss": 0.2012, + "step": 433 + }, + { + "epoch": 0.010982615077055445, + "grad_norm": 18.216941833496094, + "learning_rate": 9.999976719799566e-06, + "loss": 0.2954, + "step": 434 + }, + { + "epoch": 0.01100792064174912, + "grad_norm": 8.303834915161133, + "learning_rate": 9.999975478404851e-06, + "loss": 0.1895, + "step": 435 + }, + { + "epoch": 0.011033226206442797, + "grad_norm": 15.785534858703613, + "learning_rate": 9.999974204766223e-06, + "loss": 0.3774, + "step": 436 + }, + { + "epoch": 0.011058531771136472, + "grad_norm": 20.030170440673828, + "learning_rate": 9.999972898883694e-06, + "loss": 0.4181, + "step": 437 + }, + { + "epoch": 0.01108383733583015, + "grad_norm": 8.659469604492188, + "learning_rate": 9.99997156075727e-06, + "loss": 0.2397, + "step": 438 + }, + { + "epoch": 0.011109142900523825, + "grad_norm": 23.672090530395508, + "learning_rate": 9.999970190386956e-06, + "loss": 0.2513, + "step": 439 + }, + { + "epoch": 0.011134448465217502, + "grad_norm": 9.302375793457031, + "learning_rate": 9.999968787772767e-06, + "loss": 0.2442, + "step": 440 + }, + { + "epoch": 0.011159754029911177, + "grad_norm": 16.18445587158203, + "learning_rate": 9.999967352914708e-06, + "loss": 0.2248, + "step": 441 + }, + { + "epoch": 0.011185059594604854, + "grad_norm": 13.849456787109375, + "learning_rate": 9.99996588581279e-06, + "loss": 0.2898, + "step": 442 + }, + { + "epoch": 0.01121036515929853, + "grad_norm": 18.03218650817871, + "learning_rate": 9.999964386467023e-06, + "loss": 0.292, + "step": 443 + }, + { + "epoch": 0.011235670723992206, + "grad_norm": 7.980679512023926, + "learning_rate": 9.999962854877414e-06, + "loss": 0.2515, + "step": 444 + }, + { + "epoch": 0.011260976288685881, + "grad_norm": 15.884448051452637, + "learning_rate": 9.999961291043977e-06, + "loss": 0.2319, + "step": 445 + }, + { + "epoch": 0.011286281853379558, + "grad_norm": 7.158949375152588, + "learning_rate": 9.999959694966718e-06, + "loss": 0.1769, + "step": 446 + }, + { + "epoch": 0.011311587418073234, + "grad_norm": 10.45537281036377, + "learning_rate": 9.99995806664565e-06, + "loss": 0.208, + "step": 447 + }, + { + "epoch": 0.01133689298276691, + "grad_norm": 12.326287269592285, + "learning_rate": 9.999956406080782e-06, + "loss": 0.2158, + "step": 448 + }, + { + "epoch": 0.011362198547460586, + "grad_norm": 8.302717208862305, + "learning_rate": 9.999954713272125e-06, + "loss": 0.1799, + "step": 449 + }, + { + "epoch": 0.011387504112154263, + "grad_norm": 7.21712064743042, + "learning_rate": 9.99995298821969e-06, + "loss": 0.2507, + "step": 450 + }, + { + "epoch": 0.011412809676847938, + "grad_norm": 8.927454948425293, + "learning_rate": 9.999951230923487e-06, + "loss": 0.2435, + "step": 451 + }, + { + "epoch": 0.011438115241541615, + "grad_norm": 72.2356948852539, + "learning_rate": 9.999949441383532e-06, + "loss": 0.397, + "step": 452 + }, + { + "epoch": 0.01146342080623529, + "grad_norm": 38.14704132080078, + "learning_rate": 9.99994761959983e-06, + "loss": 0.2589, + "step": 453 + }, + { + "epoch": 0.011488726370928968, + "grad_norm": 19.625463485717773, + "learning_rate": 9.999945765572398e-06, + "loss": 0.2439, + "step": 454 + }, + { + "epoch": 0.011514031935622643, + "grad_norm": 9.873150825500488, + "learning_rate": 9.999943879301243e-06, + "loss": 0.2845, + "step": 455 + }, + { + "epoch": 0.01153933750031632, + "grad_norm": 20.159770965576172, + "learning_rate": 9.999941960786383e-06, + "loss": 0.3001, + "step": 456 + }, + { + "epoch": 0.011564643065009995, + "grad_norm": 12.162663459777832, + "learning_rate": 9.999940010027827e-06, + "loss": 0.2384, + "step": 457 + }, + { + "epoch": 0.011589948629703672, + "grad_norm": 11.708102226257324, + "learning_rate": 9.999938027025587e-06, + "loss": 0.235, + "step": 458 + }, + { + "epoch": 0.011615254194397347, + "grad_norm": 18.51699447631836, + "learning_rate": 9.999936011779676e-06, + "loss": 0.241, + "step": 459 + }, + { + "epoch": 0.011640559759091024, + "grad_norm": 11.972716331481934, + "learning_rate": 9.99993396429011e-06, + "loss": 0.2623, + "step": 460 + }, + { + "epoch": 0.0116658653237847, + "grad_norm": 11.309175491333008, + "learning_rate": 9.999931884556897e-06, + "loss": 0.2547, + "step": 461 + }, + { + "epoch": 0.011691170888478377, + "grad_norm": 23.150686264038086, + "learning_rate": 9.999929772580056e-06, + "loss": 0.3271, + "step": 462 + }, + { + "epoch": 0.011716476453172052, + "grad_norm": 6.642787456512451, + "learning_rate": 9.999927628359596e-06, + "loss": 0.22, + "step": 463 + }, + { + "epoch": 0.011741782017865729, + "grad_norm": 9.81338882446289, + "learning_rate": 9.999925451895534e-06, + "loss": 0.2033, + "step": 464 + }, + { + "epoch": 0.011767087582559404, + "grad_norm": 8.620713233947754, + "learning_rate": 9.999923243187882e-06, + "loss": 0.2285, + "step": 465 + }, + { + "epoch": 0.011792393147253081, + "grad_norm": 18.600465774536133, + "learning_rate": 9.999921002236656e-06, + "loss": 0.2843, + "step": 466 + }, + { + "epoch": 0.011817698711946757, + "grad_norm": 14.76247787475586, + "learning_rate": 9.999918729041869e-06, + "loss": 0.3136, + "step": 467 + }, + { + "epoch": 0.011843004276640434, + "grad_norm": 9.436158180236816, + "learning_rate": 9.999916423603534e-06, + "loss": 0.2894, + "step": 468 + }, + { + "epoch": 0.011868309841334109, + "grad_norm": 14.496892929077148, + "learning_rate": 9.999914085921671e-06, + "loss": 0.259, + "step": 469 + }, + { + "epoch": 0.011893615406027786, + "grad_norm": 10.937593460083008, + "learning_rate": 9.999911715996292e-06, + "loss": 0.2532, + "step": 470 + }, + { + "epoch": 0.011918920970721461, + "grad_norm": 8.709402084350586, + "learning_rate": 9.999909313827411e-06, + "loss": 0.2115, + "step": 471 + }, + { + "epoch": 0.011944226535415138, + "grad_norm": 10.69911003112793, + "learning_rate": 9.999906879415046e-06, + "loss": 0.3363, + "step": 472 + }, + { + "epoch": 0.011969532100108813, + "grad_norm": 18.15225601196289, + "learning_rate": 9.999904412759211e-06, + "loss": 0.2261, + "step": 473 + }, + { + "epoch": 0.01199483766480249, + "grad_norm": 12.419918060302734, + "learning_rate": 9.999901913859923e-06, + "loss": 0.3613, + "step": 474 + }, + { + "epoch": 0.012020143229496166, + "grad_norm": 12.827534675598145, + "learning_rate": 9.999899382717198e-06, + "loss": 0.2521, + "step": 475 + }, + { + "epoch": 0.012045448794189843, + "grad_norm": 15.00920581817627, + "learning_rate": 9.999896819331051e-06, + "loss": 0.2469, + "step": 476 + }, + { + "epoch": 0.012070754358883518, + "grad_norm": 11.261877059936523, + "learning_rate": 9.999894223701499e-06, + "loss": 0.201, + "step": 477 + }, + { + "epoch": 0.012096059923577195, + "grad_norm": 16.034940719604492, + "learning_rate": 9.99989159582856e-06, + "loss": 0.356, + "step": 478 + }, + { + "epoch": 0.01212136548827087, + "grad_norm": 16.33084487915039, + "learning_rate": 9.99988893571225e-06, + "loss": 0.2874, + "step": 479 + }, + { + "epoch": 0.012146671052964547, + "grad_norm": 26.733793258666992, + "learning_rate": 9.999886243352585e-06, + "loss": 0.4873, + "step": 480 + }, + { + "epoch": 0.012171976617658222, + "grad_norm": 27.73163604736328, + "learning_rate": 9.999883518749586e-06, + "loss": 0.2784, + "step": 481 + }, + { + "epoch": 0.0121972821823519, + "grad_norm": 8.892342567443848, + "learning_rate": 9.999880761903267e-06, + "loss": 0.3324, + "step": 482 + }, + { + "epoch": 0.012222587747045575, + "grad_norm": 27.96196937561035, + "learning_rate": 9.999877972813648e-06, + "loss": 0.3494, + "step": 483 + }, + { + "epoch": 0.012247893311739252, + "grad_norm": 8.173158645629883, + "learning_rate": 9.999875151480745e-06, + "loss": 0.2023, + "step": 484 + }, + { + "epoch": 0.012273198876432927, + "grad_norm": 8.495295524597168, + "learning_rate": 9.999872297904578e-06, + "loss": 0.2131, + "step": 485 + }, + { + "epoch": 0.012298504441126604, + "grad_norm": 23.894744873046875, + "learning_rate": 9.999869412085163e-06, + "loss": 0.4448, + "step": 486 + }, + { + "epoch": 0.01232381000582028, + "grad_norm": 15.2752685546875, + "learning_rate": 9.999866494022521e-06, + "loss": 0.3199, + "step": 487 + }, + { + "epoch": 0.012349115570513956, + "grad_norm": 14.75439167022705, + "learning_rate": 9.99986354371667e-06, + "loss": 0.2644, + "step": 488 + }, + { + "epoch": 0.012374421135207632, + "grad_norm": 12.731706619262695, + "learning_rate": 9.99986056116763e-06, + "loss": 0.3016, + "step": 489 + }, + { + "epoch": 0.012399726699901309, + "grad_norm": 18.3344783782959, + "learning_rate": 9.999857546375419e-06, + "loss": 0.2443, + "step": 490 + }, + { + "epoch": 0.012425032264594984, + "grad_norm": 21.1259822845459, + "learning_rate": 9.999854499340056e-06, + "loss": 0.2891, + "step": 491 + }, + { + "epoch": 0.012450337829288661, + "grad_norm": 14.061224937438965, + "learning_rate": 9.999851420061562e-06, + "loss": 0.3166, + "step": 492 + }, + { + "epoch": 0.012475643393982336, + "grad_norm": 13.369732856750488, + "learning_rate": 9.999848308539957e-06, + "loss": 0.306, + "step": 493 + }, + { + "epoch": 0.012500948958676013, + "grad_norm": 12.669580459594727, + "learning_rate": 9.999845164775257e-06, + "loss": 0.2447, + "step": 494 + }, + { + "epoch": 0.012526254523369688, + "grad_norm": 31.403186798095703, + "learning_rate": 9.99984198876749e-06, + "loss": 0.4047, + "step": 495 + }, + { + "epoch": 0.012551560088063365, + "grad_norm": 20.777751922607422, + "learning_rate": 9.999838780516669e-06, + "loss": 0.3087, + "step": 496 + }, + { + "epoch": 0.01257686565275704, + "grad_norm": 10.1168212890625, + "learning_rate": 9.99983554002282e-06, + "loss": 0.3159, + "step": 497 + }, + { + "epoch": 0.012602171217450718, + "grad_norm": 7.17617654800415, + "learning_rate": 9.999832267285959e-06, + "loss": 0.1959, + "step": 498 + }, + { + "epoch": 0.012627476782144393, + "grad_norm": 13.193840026855469, + "learning_rate": 9.99982896230611e-06, + "loss": 0.2735, + "step": 499 + }, + { + "epoch": 0.01265278234683807, + "grad_norm": 13.920795440673828, + "learning_rate": 9.999825625083295e-06, + "loss": 0.1821, + "step": 500 + }, + { + "epoch": 0.012678087911531745, + "grad_norm": 7.4185075759887695, + "learning_rate": 9.999822255617536e-06, + "loss": 0.2188, + "step": 501 + }, + { + "epoch": 0.012703393476225422, + "grad_norm": 16.61885643005371, + "learning_rate": 9.999818853908852e-06, + "loss": 0.3187, + "step": 502 + }, + { + "epoch": 0.012728699040919098, + "grad_norm": 11.362663269042969, + "learning_rate": 9.999815419957266e-06, + "loss": 0.286, + "step": 503 + }, + { + "epoch": 0.012754004605612775, + "grad_norm": 11.890412330627441, + "learning_rate": 9.9998119537628e-06, + "loss": 0.264, + "step": 504 + }, + { + "epoch": 0.01277931017030645, + "grad_norm": 11.109018325805664, + "learning_rate": 9.999808455325477e-06, + "loss": 0.1771, + "step": 505 + }, + { + "epoch": 0.012804615735000127, + "grad_norm": 7.140732288360596, + "learning_rate": 9.99980492464532e-06, + "loss": 0.205, + "step": 506 + }, + { + "epoch": 0.012829921299693802, + "grad_norm": 9.055607795715332, + "learning_rate": 9.99980136172235e-06, + "loss": 0.3, + "step": 507 + }, + { + "epoch": 0.012855226864387479, + "grad_norm": 9.934600830078125, + "learning_rate": 9.999797766556593e-06, + "loss": 0.2405, + "step": 508 + }, + { + "epoch": 0.012880532429081154, + "grad_norm": 5.672958850860596, + "learning_rate": 9.99979413914807e-06, + "loss": 0.1465, + "step": 509 + }, + { + "epoch": 0.012905837993774831, + "grad_norm": 9.253748893737793, + "learning_rate": 9.999790479496803e-06, + "loss": 0.1872, + "step": 510 + }, + { + "epoch": 0.012931143558468507, + "grad_norm": 19.370182037353516, + "learning_rate": 9.999786787602817e-06, + "loss": 0.3389, + "step": 511 + }, + { + "epoch": 0.012956449123162184, + "grad_norm": 13.482460975646973, + "learning_rate": 9.999783063466138e-06, + "loss": 0.3318, + "step": 512 + }, + { + "epoch": 0.012981754687855859, + "grad_norm": 18.584033966064453, + "learning_rate": 9.999779307086787e-06, + "loss": 0.2794, + "step": 513 + }, + { + "epoch": 0.013007060252549536, + "grad_norm": 17.123767852783203, + "learning_rate": 9.99977551846479e-06, + "loss": 0.2765, + "step": 514 + }, + { + "epoch": 0.013032365817243211, + "grad_norm": 12.808832168579102, + "learning_rate": 9.999771697600168e-06, + "loss": 0.2839, + "step": 515 + }, + { + "epoch": 0.013057671381936888, + "grad_norm": 12.506714820861816, + "learning_rate": 9.999767844492951e-06, + "loss": 0.2602, + "step": 516 + }, + { + "epoch": 0.013082976946630564, + "grad_norm": 21.310365676879883, + "learning_rate": 9.999763959143161e-06, + "loss": 0.2994, + "step": 517 + }, + { + "epoch": 0.01310828251132424, + "grad_norm": 11.845715522766113, + "learning_rate": 9.999760041550823e-06, + "loss": 0.2384, + "step": 518 + }, + { + "epoch": 0.013133588076017916, + "grad_norm": 9.37285041809082, + "learning_rate": 9.999756091715962e-06, + "loss": 0.2463, + "step": 519 + }, + { + "epoch": 0.013158893640711593, + "grad_norm": 5.958103656768799, + "learning_rate": 9.999752109638604e-06, + "loss": 0.202, + "step": 520 + }, + { + "epoch": 0.013184199205405268, + "grad_norm": 5.418430328369141, + "learning_rate": 9.999748095318776e-06, + "loss": 0.1919, + "step": 521 + }, + { + "epoch": 0.013209504770098945, + "grad_norm": 6.859382152557373, + "learning_rate": 9.999744048756501e-06, + "loss": 0.1211, + "step": 522 + }, + { + "epoch": 0.01323481033479262, + "grad_norm": 10.983718872070312, + "learning_rate": 9.999739969951807e-06, + "loss": 0.2105, + "step": 523 + }, + { + "epoch": 0.013260115899486297, + "grad_norm": 10.988341331481934, + "learning_rate": 9.99973585890472e-06, + "loss": 0.2184, + "step": 524 + }, + { + "epoch": 0.013285421464179973, + "grad_norm": 21.182886123657227, + "learning_rate": 9.999731715615267e-06, + "loss": 0.4367, + "step": 525 + }, + { + "epoch": 0.01331072702887365, + "grad_norm": 9.01203727722168, + "learning_rate": 9.999727540083473e-06, + "loss": 0.218, + "step": 526 + }, + { + "epoch": 0.013336032593567325, + "grad_norm": 26.998628616333008, + "learning_rate": 9.999723332309368e-06, + "loss": 0.3845, + "step": 527 + }, + { + "epoch": 0.013361338158261002, + "grad_norm": 75.14594268798828, + "learning_rate": 9.999719092292975e-06, + "loss": 0.6556, + "step": 528 + }, + { + "epoch": 0.013386643722954677, + "grad_norm": 15.695701599121094, + "learning_rate": 9.999714820034324e-06, + "loss": 0.3245, + "step": 529 + }, + { + "epoch": 0.013411949287648354, + "grad_norm": 28.235563278198242, + "learning_rate": 9.999710515533445e-06, + "loss": 0.354, + "step": 530 + }, + { + "epoch": 0.01343725485234203, + "grad_norm": 24.989450454711914, + "learning_rate": 9.999706178790361e-06, + "loss": 0.2815, + "step": 531 + }, + { + "epoch": 0.013462560417035706, + "grad_norm": 8.225149154663086, + "learning_rate": 9.9997018098051e-06, + "loss": 0.2335, + "step": 532 + }, + { + "epoch": 0.013487865981729382, + "grad_norm": 8.389829635620117, + "learning_rate": 9.999697408577694e-06, + "loss": 0.2452, + "step": 533 + }, + { + "epoch": 0.013513171546423059, + "grad_norm": 11.671229362487793, + "learning_rate": 9.99969297510817e-06, + "loss": 0.2432, + "step": 534 + }, + { + "epoch": 0.013538477111116734, + "grad_norm": 16.035764694213867, + "learning_rate": 9.999688509396555e-06, + "loss": 0.3073, + "step": 535 + }, + { + "epoch": 0.013563782675810411, + "grad_norm": 14.32793140411377, + "learning_rate": 9.999684011442878e-06, + "loss": 0.3198, + "step": 536 + }, + { + "epoch": 0.013589088240504086, + "grad_norm": 7.5000505447387695, + "learning_rate": 9.99967948124717e-06, + "loss": 0.1849, + "step": 537 + }, + { + "epoch": 0.013614393805197763, + "grad_norm": 13.764687538146973, + "learning_rate": 9.99967491880946e-06, + "loss": 0.2291, + "step": 538 + }, + { + "epoch": 0.013639699369891439, + "grad_norm": 5.820556163787842, + "learning_rate": 9.999670324129774e-06, + "loss": 0.2345, + "step": 539 + }, + { + "epoch": 0.013665004934585116, + "grad_norm": 8.511269569396973, + "learning_rate": 9.999665697208145e-06, + "loss": 0.2285, + "step": 540 + }, + { + "epoch": 0.01369031049927879, + "grad_norm": 18.084518432617188, + "learning_rate": 9.999661038044604e-06, + "loss": 0.2898, + "step": 541 + }, + { + "epoch": 0.013715616063972468, + "grad_norm": 11.506072998046875, + "learning_rate": 9.999656346639175e-06, + "loss": 0.2965, + "step": 542 + }, + { + "epoch": 0.013740921628666143, + "grad_norm": 9.283574104309082, + "learning_rate": 9.999651622991895e-06, + "loss": 0.1937, + "step": 543 + }, + { + "epoch": 0.01376622719335982, + "grad_norm": 9.649149894714355, + "learning_rate": 9.999646867102792e-06, + "loss": 0.2172, + "step": 544 + }, + { + "epoch": 0.013791532758053495, + "grad_norm": 13.719090461730957, + "learning_rate": 9.999642078971897e-06, + "loss": 0.4198, + "step": 545 + }, + { + "epoch": 0.013816838322747172, + "grad_norm": 10.099129676818848, + "learning_rate": 9.999637258599238e-06, + "loss": 0.2321, + "step": 546 + }, + { + "epoch": 0.013842143887440848, + "grad_norm": 11.015088081359863, + "learning_rate": 9.999632405984847e-06, + "loss": 0.2563, + "step": 547 + }, + { + "epoch": 0.013867449452134525, + "grad_norm": 7.402622699737549, + "learning_rate": 9.999627521128761e-06, + "loss": 0.2883, + "step": 548 + }, + { + "epoch": 0.0138927550168282, + "grad_norm": 11.384745597839355, + "learning_rate": 9.999622604031004e-06, + "loss": 0.3164, + "step": 549 + }, + { + "epoch": 0.013918060581521877, + "grad_norm": 11.665185928344727, + "learning_rate": 9.99961765469161e-06, + "loss": 0.2077, + "step": 550 + }, + { + "epoch": 0.013943366146215552, + "grad_norm": 13.650479316711426, + "learning_rate": 9.999612673110615e-06, + "loss": 0.2883, + "step": 551 + }, + { + "epoch": 0.01396867171090923, + "grad_norm": 11.896817207336426, + "learning_rate": 9.999607659288046e-06, + "loss": 0.2926, + "step": 552 + }, + { + "epoch": 0.013993977275602905, + "grad_norm": 10.127938270568848, + "learning_rate": 9.999602613223938e-06, + "loss": 0.3153, + "step": 553 + }, + { + "epoch": 0.014019282840296582, + "grad_norm": 11.786499977111816, + "learning_rate": 9.999597534918321e-06, + "loss": 0.2596, + "step": 554 + }, + { + "epoch": 0.014044588404990257, + "grad_norm": 10.8492431640625, + "learning_rate": 9.999592424371231e-06, + "loss": 0.2622, + "step": 555 + }, + { + "epoch": 0.014069893969683934, + "grad_norm": 27.8925838470459, + "learning_rate": 9.9995872815827e-06, + "loss": 0.4412, + "step": 556 + }, + { + "epoch": 0.014095199534377609, + "grad_norm": 15.50412654876709, + "learning_rate": 9.99958210655276e-06, + "loss": 0.3402, + "step": 557 + }, + { + "epoch": 0.014120505099071286, + "grad_norm": 15.672581672668457, + "learning_rate": 9.999576899281445e-06, + "loss": 0.323, + "step": 558 + }, + { + "epoch": 0.014145810663764961, + "grad_norm": 14.210780143737793, + "learning_rate": 9.999571659768788e-06, + "loss": 0.1821, + "step": 559 + }, + { + "epoch": 0.014171116228458638, + "grad_norm": 8.025430679321289, + "learning_rate": 9.999566388014823e-06, + "loss": 0.2176, + "step": 560 + }, + { + "epoch": 0.014196421793152314, + "grad_norm": 14.464191436767578, + "learning_rate": 9.999561084019585e-06, + "loss": 0.2633, + "step": 561 + }, + { + "epoch": 0.01422172735784599, + "grad_norm": 15.244545936584473, + "learning_rate": 9.999555747783109e-06, + "loss": 0.1927, + "step": 562 + }, + { + "epoch": 0.014247032922539666, + "grad_norm": 13.579668045043945, + "learning_rate": 9.999550379305426e-06, + "loss": 0.2876, + "step": 563 + }, + { + "epoch": 0.014272338487233343, + "grad_norm": 9.18748950958252, + "learning_rate": 9.999544978586573e-06, + "loss": 0.2929, + "step": 564 + }, + { + "epoch": 0.014297644051927018, + "grad_norm": 48.25861740112305, + "learning_rate": 9.999539545626584e-06, + "loss": 0.2799, + "step": 565 + }, + { + "epoch": 0.014322949616620695, + "grad_norm": 7.1483473777771, + "learning_rate": 9.999534080425497e-06, + "loss": 0.2764, + "step": 566 + }, + { + "epoch": 0.01434825518131437, + "grad_norm": 13.638626098632812, + "learning_rate": 9.999528582983343e-06, + "loss": 0.2468, + "step": 567 + }, + { + "epoch": 0.014373560746008048, + "grad_norm": 9.812019348144531, + "learning_rate": 9.999523053300157e-06, + "loss": 0.2365, + "step": 568 + }, + { + "epoch": 0.014398866310701723, + "grad_norm": 18.268230438232422, + "learning_rate": 9.999517491375979e-06, + "loss": 0.2696, + "step": 569 + }, + { + "epoch": 0.0144241718753954, + "grad_norm": 14.988903045654297, + "learning_rate": 9.999511897210843e-06, + "loss": 0.3178, + "step": 570 + }, + { + "epoch": 0.014449477440089075, + "grad_norm": 14.20508098602295, + "learning_rate": 9.999506270804784e-06, + "loss": 0.3066, + "step": 571 + }, + { + "epoch": 0.014474783004782752, + "grad_norm": 6.6660475730896, + "learning_rate": 9.99950061215784e-06, + "loss": 0.1783, + "step": 572 + }, + { + "epoch": 0.014500088569476427, + "grad_norm": 14.871021270751953, + "learning_rate": 9.999494921270045e-06, + "loss": 0.3077, + "step": 573 + }, + { + "epoch": 0.014525394134170104, + "grad_norm": 12.910694122314453, + "learning_rate": 9.999489198141438e-06, + "loss": 0.2494, + "step": 574 + }, + { + "epoch": 0.01455069969886378, + "grad_norm": 10.818997383117676, + "learning_rate": 9.999483442772055e-06, + "loss": 0.2141, + "step": 575 + }, + { + "epoch": 0.014576005263557457, + "grad_norm": 22.417423248291016, + "learning_rate": 9.999477655161933e-06, + "loss": 0.381, + "step": 576 + }, + { + "epoch": 0.014601310828251132, + "grad_norm": 23.456918716430664, + "learning_rate": 9.99947183531111e-06, + "loss": 0.3311, + "step": 577 + }, + { + "epoch": 0.014626616392944809, + "grad_norm": 6.3352179527282715, + "learning_rate": 9.999465983219623e-06, + "loss": 0.1616, + "step": 578 + }, + { + "epoch": 0.014651921957638484, + "grad_norm": 10.076543807983398, + "learning_rate": 9.99946009888751e-06, + "loss": 0.1312, + "step": 579 + }, + { + "epoch": 0.014677227522332161, + "grad_norm": 17.280059814453125, + "learning_rate": 9.99945418231481e-06, + "loss": 0.3399, + "step": 580 + }, + { + "epoch": 0.014702533087025836, + "grad_norm": 45.85381317138672, + "learning_rate": 9.999448233501558e-06, + "loss": 0.2648, + "step": 581 + }, + { + "epoch": 0.014727838651719513, + "grad_norm": 19.10299301147461, + "learning_rate": 9.999442252447795e-06, + "loss": 0.2223, + "step": 582 + }, + { + "epoch": 0.014753144216413189, + "grad_norm": 37.96903610229492, + "learning_rate": 9.999436239153559e-06, + "loss": 0.3399, + "step": 583 + }, + { + "epoch": 0.014778449781106866, + "grad_norm": 11.848058700561523, + "learning_rate": 9.999430193618887e-06, + "loss": 0.2489, + "step": 584 + }, + { + "epoch": 0.014803755345800541, + "grad_norm": 8.5596923828125, + "learning_rate": 9.999424115843823e-06, + "loss": 0.2503, + "step": 585 + }, + { + "epoch": 0.014829060910494218, + "grad_norm": 12.81474494934082, + "learning_rate": 9.9994180058284e-06, + "loss": 0.2422, + "step": 586 + }, + { + "epoch": 0.014854366475187893, + "grad_norm": 10.498761177062988, + "learning_rate": 9.999411863572661e-06, + "loss": 0.1683, + "step": 587 + }, + { + "epoch": 0.01487967203988157, + "grad_norm": 21.97634506225586, + "learning_rate": 9.999405689076644e-06, + "loss": 0.2378, + "step": 588 + }, + { + "epoch": 0.014904977604575246, + "grad_norm": 27.411827087402344, + "learning_rate": 9.999399482340391e-06, + "loss": 0.2381, + "step": 589 + }, + { + "epoch": 0.014930283169268923, + "grad_norm": 12.636817932128906, + "learning_rate": 9.999393243363942e-06, + "loss": 0.2496, + "step": 590 + }, + { + "epoch": 0.014955588733962598, + "grad_norm": 16.083024978637695, + "learning_rate": 9.999386972147333e-06, + "loss": 0.2799, + "step": 591 + }, + { + "epoch": 0.014980894298656275, + "grad_norm": 20.806198120117188, + "learning_rate": 9.999380668690609e-06, + "loss": 0.3095, + "step": 592 + }, + { + "epoch": 0.01500619986334995, + "grad_norm": 21.63608169555664, + "learning_rate": 9.99937433299381e-06, + "loss": 0.3568, + "step": 593 + }, + { + "epoch": 0.015031505428043627, + "grad_norm": 7.125497341156006, + "learning_rate": 9.999367965056974e-06, + "loss": 0.2697, + "step": 594 + }, + { + "epoch": 0.015056810992737302, + "grad_norm": 10.04990291595459, + "learning_rate": 9.999361564880145e-06, + "loss": 0.1932, + "step": 595 + }, + { + "epoch": 0.01508211655743098, + "grad_norm": 8.426280975341797, + "learning_rate": 9.999355132463362e-06, + "loss": 0.2254, + "step": 596 + }, + { + "epoch": 0.015107422122124655, + "grad_norm": 33.38533020019531, + "learning_rate": 9.999348667806668e-06, + "loss": 0.2924, + "step": 597 + }, + { + "epoch": 0.015132727686818332, + "grad_norm": 11.025497436523438, + "learning_rate": 9.999342170910106e-06, + "loss": 0.2664, + "step": 598 + }, + { + "epoch": 0.015158033251512007, + "grad_norm": 12.665035247802734, + "learning_rate": 9.999335641773715e-06, + "loss": 0.3393, + "step": 599 + }, + { + "epoch": 0.015183338816205684, + "grad_norm": 14.998407363891602, + "learning_rate": 9.999329080397539e-06, + "loss": 0.2606, + "step": 600 + }, + { + "epoch": 0.01520864438089936, + "grad_norm": 35.03995132446289, + "learning_rate": 9.999322486781618e-06, + "loss": 0.4444, + "step": 601 + }, + { + "epoch": 0.015233949945593036, + "grad_norm": 4.711733341217041, + "learning_rate": 9.999315860925997e-06, + "loss": 0.2223, + "step": 602 + }, + { + "epoch": 0.015259255510286712, + "grad_norm": 14.41649055480957, + "learning_rate": 9.99930920283072e-06, + "loss": 0.2651, + "step": 603 + }, + { + "epoch": 0.015284561074980389, + "grad_norm": 11.280531883239746, + "learning_rate": 9.999302512495826e-06, + "loss": 0.1672, + "step": 604 + }, + { + "epoch": 0.015309866639674064, + "grad_norm": 9.033862113952637, + "learning_rate": 9.99929578992136e-06, + "loss": 0.2028, + "step": 605 + }, + { + "epoch": 0.01533517220436774, + "grad_norm": 8.215812683105469, + "learning_rate": 9.999289035107367e-06, + "loss": 0.1842, + "step": 606 + }, + { + "epoch": 0.015360477769061416, + "grad_norm": 7.330166816711426, + "learning_rate": 9.999282248053888e-06, + "loss": 0.1849, + "step": 607 + }, + { + "epoch": 0.015385783333755093, + "grad_norm": 4.017598628997803, + "learning_rate": 9.999275428760965e-06, + "loss": 0.1649, + "step": 608 + }, + { + "epoch": 0.015411088898448768, + "grad_norm": 10.190245628356934, + "learning_rate": 9.999268577228649e-06, + "loss": 0.259, + "step": 609 + }, + { + "epoch": 0.015436394463142445, + "grad_norm": 9.746973991394043, + "learning_rate": 9.99926169345698e-06, + "loss": 0.2112, + "step": 610 + }, + { + "epoch": 0.01546170002783612, + "grad_norm": 19.704975128173828, + "learning_rate": 9.999254777445997e-06, + "loss": 0.3226, + "step": 611 + }, + { + "epoch": 0.015487005592529798, + "grad_norm": 12.568802833557129, + "learning_rate": 9.999247829195756e-06, + "loss": 0.2296, + "step": 612 + }, + { + "epoch": 0.015512311157223473, + "grad_norm": 33.377376556396484, + "learning_rate": 9.999240848706292e-06, + "loss": 0.5105, + "step": 613 + }, + { + "epoch": 0.01553761672191715, + "grad_norm": 11.516166687011719, + "learning_rate": 9.999233835977654e-06, + "loss": 0.2886, + "step": 614 + }, + { + "epoch": 0.015562922286610825, + "grad_norm": 4.302454471588135, + "learning_rate": 9.999226791009887e-06, + "loss": 0.2248, + "step": 615 + }, + { + "epoch": 0.015588227851304502, + "grad_norm": 11.387481689453125, + "learning_rate": 9.999219713803036e-06, + "loss": 0.312, + "step": 616 + }, + { + "epoch": 0.015613533415998177, + "grad_norm": 9.274151802062988, + "learning_rate": 9.99921260435715e-06, + "loss": 0.1842, + "step": 617 + }, + { + "epoch": 0.015638838980691853, + "grad_norm": 7.119625091552734, + "learning_rate": 9.999205462672268e-06, + "loss": 0.1553, + "step": 618 + }, + { + "epoch": 0.01566414454538553, + "grad_norm": 6.819424629211426, + "learning_rate": 9.999198288748441e-06, + "loss": 0.2346, + "step": 619 + }, + { + "epoch": 0.015689450110079207, + "grad_norm": 12.051433563232422, + "learning_rate": 9.999191082585715e-06, + "loss": 0.3046, + "step": 620 + }, + { + "epoch": 0.015714755674772884, + "grad_norm": 8.476211547851562, + "learning_rate": 9.999183844184138e-06, + "loss": 0.233, + "step": 621 + }, + { + "epoch": 0.015740061239466557, + "grad_norm": 6.455191135406494, + "learning_rate": 9.999176573543749e-06, + "loss": 0.2383, + "step": 622 + }, + { + "epoch": 0.015765366804160234, + "grad_norm": 13.891990661621094, + "learning_rate": 9.999169270664604e-06, + "loss": 0.3436, + "step": 623 + }, + { + "epoch": 0.01579067236885391, + "grad_norm": 11.125611305236816, + "learning_rate": 9.999161935546746e-06, + "loss": 0.2035, + "step": 624 + }, + { + "epoch": 0.01581597793354759, + "grad_norm": 9.429571151733398, + "learning_rate": 9.999154568190221e-06, + "loss": 0.2359, + "step": 625 + }, + { + "epoch": 0.015841283498241262, + "grad_norm": 12.891199111938477, + "learning_rate": 9.999147168595078e-06, + "loss": 0.2374, + "step": 626 + }, + { + "epoch": 0.01586658906293494, + "grad_norm": 32.670494079589844, + "learning_rate": 9.999139736761366e-06, + "loss": 0.4197, + "step": 627 + }, + { + "epoch": 0.015891894627628616, + "grad_norm": 20.59671974182129, + "learning_rate": 9.999132272689131e-06, + "loss": 0.3516, + "step": 628 + }, + { + "epoch": 0.015917200192322293, + "grad_norm": 10.533143043518066, + "learning_rate": 9.999124776378424e-06, + "loss": 0.2218, + "step": 629 + }, + { + "epoch": 0.015942505757015966, + "grad_norm": 12.705223083496094, + "learning_rate": 9.99911724782929e-06, + "loss": 0.2021, + "step": 630 + }, + { + "epoch": 0.015967811321709643, + "grad_norm": 10.764822006225586, + "learning_rate": 9.999109687041776e-06, + "loss": 0.2665, + "step": 631 + }, + { + "epoch": 0.01599311688640332, + "grad_norm": 10.491960525512695, + "learning_rate": 9.999102094015937e-06, + "loss": 0.3383, + "step": 632 + }, + { + "epoch": 0.016018422451096997, + "grad_norm": 5.8606390953063965, + "learning_rate": 9.999094468751818e-06, + "loss": 0.2044, + "step": 633 + }, + { + "epoch": 0.01604372801579067, + "grad_norm": 9.085097312927246, + "learning_rate": 9.999086811249468e-06, + "loss": 0.2603, + "step": 634 + }, + { + "epoch": 0.016069033580484348, + "grad_norm": 6.86657190322876, + "learning_rate": 9.999079121508936e-06, + "loss": 0.1832, + "step": 635 + }, + { + "epoch": 0.016094339145178025, + "grad_norm": 6.042163848876953, + "learning_rate": 9.999071399530274e-06, + "loss": 0.2278, + "step": 636 + }, + { + "epoch": 0.016119644709871702, + "grad_norm": 19.37464141845703, + "learning_rate": 9.99906364531353e-06, + "loss": 0.2922, + "step": 637 + }, + { + "epoch": 0.016144950274565376, + "grad_norm": 9.725465774536133, + "learning_rate": 9.999055858858754e-06, + "loss": 0.2771, + "step": 638 + }, + { + "epoch": 0.016170255839259053, + "grad_norm": 10.00796127319336, + "learning_rate": 9.999048040165997e-06, + "loss": 0.1606, + "step": 639 + }, + { + "epoch": 0.01619556140395273, + "grad_norm": 13.173089027404785, + "learning_rate": 9.99904018923531e-06, + "loss": 0.3373, + "step": 640 + }, + { + "epoch": 0.016220866968646407, + "grad_norm": 17.284143447875977, + "learning_rate": 9.999032306066742e-06, + "loss": 0.3848, + "step": 641 + }, + { + "epoch": 0.01624617253334008, + "grad_norm": 9.661308288574219, + "learning_rate": 9.999024390660346e-06, + "loss": 0.1751, + "step": 642 + }, + { + "epoch": 0.016271478098033757, + "grad_norm": 5.807017803192139, + "learning_rate": 9.999016443016169e-06, + "loss": 0.1917, + "step": 643 + }, + { + "epoch": 0.016296783662727434, + "grad_norm": 15.066106796264648, + "learning_rate": 9.999008463134267e-06, + "loss": 0.3496, + "step": 644 + }, + { + "epoch": 0.01632208922742111, + "grad_norm": 8.334376335144043, + "learning_rate": 9.999000451014687e-06, + "loss": 0.2378, + "step": 645 + }, + { + "epoch": 0.016347394792114785, + "grad_norm": 6.186484336853027, + "learning_rate": 9.998992406657486e-06, + "loss": 0.256, + "step": 646 + }, + { + "epoch": 0.01637270035680846, + "grad_norm": 9.615535736083984, + "learning_rate": 9.998984330062712e-06, + "loss": 0.2614, + "step": 647 + }, + { + "epoch": 0.01639800592150214, + "grad_norm": 20.20922088623047, + "learning_rate": 9.998976221230414e-06, + "loss": 0.2888, + "step": 648 + }, + { + "epoch": 0.016423311486195816, + "grad_norm": 8.285361289978027, + "learning_rate": 9.998968080160653e-06, + "loss": 0.3478, + "step": 649 + }, + { + "epoch": 0.01644861705088949, + "grad_norm": 13.195954322814941, + "learning_rate": 9.998959906853475e-06, + "loss": 0.307, + "step": 650 + }, + { + "epoch": 0.016473922615583166, + "grad_norm": 13.771105766296387, + "learning_rate": 9.998951701308934e-06, + "loss": 0.2695, + "step": 651 + }, + { + "epoch": 0.016499228180276843, + "grad_norm": 13.897220611572266, + "learning_rate": 9.998943463527084e-06, + "loss": 0.2655, + "step": 652 + }, + { + "epoch": 0.01652453374497052, + "grad_norm": 10.530561447143555, + "learning_rate": 9.998935193507978e-06, + "loss": 0.1757, + "step": 653 + }, + { + "epoch": 0.016549839309664194, + "grad_norm": 14.663229942321777, + "learning_rate": 9.998926891251668e-06, + "loss": 0.3561, + "step": 654 + }, + { + "epoch": 0.01657514487435787, + "grad_norm": 20.345203399658203, + "learning_rate": 9.998918556758208e-06, + "loss": 0.2593, + "step": 655 + }, + { + "epoch": 0.016600450439051548, + "grad_norm": 10.0175142288208, + "learning_rate": 9.998910190027653e-06, + "loss": 0.2095, + "step": 656 + }, + { + "epoch": 0.016625756003745225, + "grad_norm": 14.627350807189941, + "learning_rate": 9.998901791060056e-06, + "loss": 0.3336, + "step": 657 + }, + { + "epoch": 0.0166510615684389, + "grad_norm": 11.071358680725098, + "learning_rate": 9.99889335985547e-06, + "loss": 0.2787, + "step": 658 + }, + { + "epoch": 0.016676367133132575, + "grad_norm": 12.95853042602539, + "learning_rate": 9.998884896413951e-06, + "loss": 0.3518, + "step": 659 + }, + { + "epoch": 0.016701672697826252, + "grad_norm": 13.477054595947266, + "learning_rate": 9.998876400735556e-06, + "loss": 0.3204, + "step": 660 + }, + { + "epoch": 0.01672697826251993, + "grad_norm": 8.538702964782715, + "learning_rate": 9.998867872820333e-06, + "loss": 0.1917, + "step": 661 + }, + { + "epoch": 0.016752283827213603, + "grad_norm": 8.432108879089355, + "learning_rate": 9.998859312668343e-06, + "loss": 0.2681, + "step": 662 + }, + { + "epoch": 0.01677758939190728, + "grad_norm": 23.435651779174805, + "learning_rate": 9.998850720279639e-06, + "loss": 0.3284, + "step": 663 + }, + { + "epoch": 0.016802894956600957, + "grad_norm": 7.258855819702148, + "learning_rate": 9.998842095654275e-06, + "loss": 0.2004, + "step": 664 + }, + { + "epoch": 0.016828200521294634, + "grad_norm": 11.000269889831543, + "learning_rate": 9.99883343879231e-06, + "loss": 0.2793, + "step": 665 + }, + { + "epoch": 0.016853506085988307, + "grad_norm": 9.252107620239258, + "learning_rate": 9.998824749693799e-06, + "loss": 0.2338, + "step": 666 + }, + { + "epoch": 0.016878811650681984, + "grad_norm": 10.218987464904785, + "learning_rate": 9.998816028358794e-06, + "loss": 0.144, + "step": 667 + }, + { + "epoch": 0.01690411721537566, + "grad_norm": 11.391278266906738, + "learning_rate": 9.998807274787357e-06, + "loss": 0.2633, + "step": 668 + }, + { + "epoch": 0.01692942278006934, + "grad_norm": 5.84691047668457, + "learning_rate": 9.99879848897954e-06, + "loss": 0.1893, + "step": 669 + }, + { + "epoch": 0.016954728344763012, + "grad_norm": 7.464654922485352, + "learning_rate": 9.998789670935402e-06, + "loss": 0.2142, + "step": 670 + }, + { + "epoch": 0.01698003390945669, + "grad_norm": 10.25328540802002, + "learning_rate": 9.998780820654998e-06, + "loss": 0.2599, + "step": 671 + }, + { + "epoch": 0.017005339474150366, + "grad_norm": 20.668210983276367, + "learning_rate": 9.998771938138387e-06, + "loss": 0.2591, + "step": 672 + }, + { + "epoch": 0.017030645038844043, + "grad_norm": 10.14700984954834, + "learning_rate": 9.998763023385626e-06, + "loss": 0.2615, + "step": 673 + }, + { + "epoch": 0.017055950603537717, + "grad_norm": 14.726421356201172, + "learning_rate": 9.998754076396772e-06, + "loss": 0.311, + "step": 674 + }, + { + "epoch": 0.017081256168231394, + "grad_norm": 18.45738410949707, + "learning_rate": 9.998745097171885e-06, + "loss": 0.3725, + "step": 675 + }, + { + "epoch": 0.01710656173292507, + "grad_norm": 7.587445259094238, + "learning_rate": 9.998736085711018e-06, + "loss": 0.2455, + "step": 676 + }, + { + "epoch": 0.017131867297618748, + "grad_norm": 16.368209838867188, + "learning_rate": 9.998727042014234e-06, + "loss": 0.2382, + "step": 677 + }, + { + "epoch": 0.01715717286231242, + "grad_norm": 10.183869361877441, + "learning_rate": 9.998717966081586e-06, + "loss": 0.1982, + "step": 678 + }, + { + "epoch": 0.017182478427006098, + "grad_norm": 7.4235687255859375, + "learning_rate": 9.998708857913138e-06, + "loss": 0.2802, + "step": 679 + }, + { + "epoch": 0.017207783991699775, + "grad_norm": 12.378100395202637, + "learning_rate": 9.998699717508947e-06, + "loss": 0.2185, + "step": 680 + }, + { + "epoch": 0.017233089556393452, + "grad_norm": 9.898721694946289, + "learning_rate": 9.998690544869069e-06, + "loss": 0.2372, + "step": 681 + }, + { + "epoch": 0.017258395121087126, + "grad_norm": 11.423439025878906, + "learning_rate": 9.998681339993566e-06, + "loss": 0.269, + "step": 682 + }, + { + "epoch": 0.017283700685780803, + "grad_norm": 10.505799293518066, + "learning_rate": 9.998672102882499e-06, + "loss": 0.3016, + "step": 683 + }, + { + "epoch": 0.01730900625047448, + "grad_norm": 15.818330764770508, + "learning_rate": 9.998662833535922e-06, + "loss": 0.1741, + "step": 684 + }, + { + "epoch": 0.017334311815168157, + "grad_norm": 10.968268394470215, + "learning_rate": 9.998653531953902e-06, + "loss": 0.2618, + "step": 685 + }, + { + "epoch": 0.01735961737986183, + "grad_norm": 26.14155387878418, + "learning_rate": 9.998644198136492e-06, + "loss": 0.3172, + "step": 686 + }, + { + "epoch": 0.017384922944555507, + "grad_norm": 12.481574058532715, + "learning_rate": 9.998634832083757e-06, + "loss": 0.3093, + "step": 687 + }, + { + "epoch": 0.017410228509249184, + "grad_norm": 13.509492874145508, + "learning_rate": 9.998625433795755e-06, + "loss": 0.1949, + "step": 688 + }, + { + "epoch": 0.01743553407394286, + "grad_norm": 6.943411350250244, + "learning_rate": 9.998616003272547e-06, + "loss": 0.1827, + "step": 689 + }, + { + "epoch": 0.017460839638636535, + "grad_norm": 28.239948272705078, + "learning_rate": 9.998606540514196e-06, + "loss": 0.3839, + "step": 690 + }, + { + "epoch": 0.017486145203330212, + "grad_norm": 18.901504516601562, + "learning_rate": 9.998597045520761e-06, + "loss": 0.2466, + "step": 691 + }, + { + "epoch": 0.01751145076802389, + "grad_norm": 11.309270858764648, + "learning_rate": 9.998587518292302e-06, + "loss": 0.2518, + "step": 692 + }, + { + "epoch": 0.017536756332717566, + "grad_norm": 9.033074378967285, + "learning_rate": 9.998577958828884e-06, + "loss": 0.2235, + "step": 693 + }, + { + "epoch": 0.01756206189741124, + "grad_norm": 18.371183395385742, + "learning_rate": 9.998568367130565e-06, + "loss": 0.3892, + "step": 694 + }, + { + "epoch": 0.017587367462104916, + "grad_norm": 5.469046115875244, + "learning_rate": 9.998558743197409e-06, + "loss": 0.1956, + "step": 695 + }, + { + "epoch": 0.017612673026798593, + "grad_norm": 11.432561874389648, + "learning_rate": 9.998549087029476e-06, + "loss": 0.3134, + "step": 696 + }, + { + "epoch": 0.01763797859149227, + "grad_norm": 9.003416061401367, + "learning_rate": 9.998539398626832e-06, + "loss": 0.1998, + "step": 697 + }, + { + "epoch": 0.017663284156185944, + "grad_norm": 4.666547775268555, + "learning_rate": 9.998529677989535e-06, + "loss": 0.193, + "step": 698 + }, + { + "epoch": 0.01768858972087962, + "grad_norm": 7.190059661865234, + "learning_rate": 9.998519925117651e-06, + "loss": 0.2015, + "step": 699 + }, + { + "epoch": 0.017713895285573298, + "grad_norm": 8.88148307800293, + "learning_rate": 9.998510140011241e-06, + "loss": 0.3158, + "step": 700 + }, + { + "epoch": 0.017739200850266975, + "grad_norm": 7.8512420654296875, + "learning_rate": 9.99850032267037e-06, + "loss": 0.2307, + "step": 701 + }, + { + "epoch": 0.01776450641496065, + "grad_norm": 8.97547721862793, + "learning_rate": 9.998490473095101e-06, + "loss": 0.2525, + "step": 702 + }, + { + "epoch": 0.017789811979654326, + "grad_norm": 10.336207389831543, + "learning_rate": 9.998480591285495e-06, + "loss": 0.2379, + "step": 703 + }, + { + "epoch": 0.017815117544348003, + "grad_norm": 9.165116310119629, + "learning_rate": 9.998470677241619e-06, + "loss": 0.1846, + "step": 704 + }, + { + "epoch": 0.01784042310904168, + "grad_norm": 13.351237297058105, + "learning_rate": 9.998460730963533e-06, + "loss": 0.3154, + "step": 705 + }, + { + "epoch": 0.017865728673735353, + "grad_norm": 6.106309413909912, + "learning_rate": 9.998450752451306e-06, + "loss": 0.1513, + "step": 706 + }, + { + "epoch": 0.01789103423842903, + "grad_norm": 9.23594856262207, + "learning_rate": 9.998440741704998e-06, + "loss": 0.1543, + "step": 707 + }, + { + "epoch": 0.017916339803122707, + "grad_norm": 12.439859390258789, + "learning_rate": 9.998430698724677e-06, + "loss": 0.2814, + "step": 708 + }, + { + "epoch": 0.017941645367816384, + "grad_norm": 10.378833770751953, + "learning_rate": 9.998420623510406e-06, + "loss": 0.1754, + "step": 709 + }, + { + "epoch": 0.017966950932510058, + "grad_norm": 9.411355018615723, + "learning_rate": 9.99841051606225e-06, + "loss": 0.296, + "step": 710 + }, + { + "epoch": 0.017992256497203735, + "grad_norm": 19.79286003112793, + "learning_rate": 9.998400376380273e-06, + "loss": 0.4239, + "step": 711 + }, + { + "epoch": 0.01801756206189741, + "grad_norm": 10.073020935058594, + "learning_rate": 9.998390204464543e-06, + "loss": 0.2609, + "step": 712 + }, + { + "epoch": 0.01804286762659109, + "grad_norm": 9.436474800109863, + "learning_rate": 9.998380000315124e-06, + "loss": 0.2463, + "step": 713 + }, + { + "epoch": 0.018068173191284762, + "grad_norm": 14.839162826538086, + "learning_rate": 9.998369763932082e-06, + "loss": 0.3275, + "step": 714 + }, + { + "epoch": 0.01809347875597844, + "grad_norm": 21.447471618652344, + "learning_rate": 9.998359495315484e-06, + "loss": 0.3286, + "step": 715 + }, + { + "epoch": 0.018118784320672116, + "grad_norm": 13.724267959594727, + "learning_rate": 9.998349194465395e-06, + "loss": 0.2678, + "step": 716 + }, + { + "epoch": 0.018144089885365793, + "grad_norm": 13.145249366760254, + "learning_rate": 9.998338861381881e-06, + "loss": 0.2055, + "step": 717 + }, + { + "epoch": 0.018169395450059467, + "grad_norm": 9.915070533752441, + "learning_rate": 9.99832849606501e-06, + "loss": 0.3372, + "step": 718 + }, + { + "epoch": 0.018194701014753144, + "grad_norm": 6.967733860015869, + "learning_rate": 9.99831809851485e-06, + "loss": 0.2136, + "step": 719 + }, + { + "epoch": 0.01822000657944682, + "grad_norm": 11.771602630615234, + "learning_rate": 9.998307668731465e-06, + "loss": 0.3021, + "step": 720 + }, + { + "epoch": 0.018245312144140498, + "grad_norm": 12.381881713867188, + "learning_rate": 9.998297206714922e-06, + "loss": 0.2156, + "step": 721 + }, + { + "epoch": 0.01827061770883417, + "grad_norm": 14.484254837036133, + "learning_rate": 9.998286712465291e-06, + "loss": 0.2798, + "step": 722 + }, + { + "epoch": 0.01829592327352785, + "grad_norm": 12.762661933898926, + "learning_rate": 9.99827618598264e-06, + "loss": 0.2648, + "step": 723 + }, + { + "epoch": 0.018321228838221525, + "grad_norm": 8.205214500427246, + "learning_rate": 9.998265627267033e-06, + "loss": 0.271, + "step": 724 + }, + { + "epoch": 0.018346534402915202, + "grad_norm": 20.074298858642578, + "learning_rate": 9.998255036318542e-06, + "loss": 0.4226, + "step": 725 + }, + { + "epoch": 0.018371839967608876, + "grad_norm": 9.579397201538086, + "learning_rate": 9.998244413137236e-06, + "loss": 0.2911, + "step": 726 + }, + { + "epoch": 0.018397145532302553, + "grad_norm": 14.904553413391113, + "learning_rate": 9.99823375772318e-06, + "loss": 0.2626, + "step": 727 + }, + { + "epoch": 0.01842245109699623, + "grad_norm": 17.96747398376465, + "learning_rate": 9.998223070076445e-06, + "loss": 0.2549, + "step": 728 + }, + { + "epoch": 0.018447756661689907, + "grad_norm": 9.56375789642334, + "learning_rate": 9.998212350197098e-06, + "loss": 0.2485, + "step": 729 + }, + { + "epoch": 0.01847306222638358, + "grad_norm": 9.595809936523438, + "learning_rate": 9.998201598085208e-06, + "loss": 0.2772, + "step": 730 + }, + { + "epoch": 0.018498367791077257, + "grad_norm": 18.86570930480957, + "learning_rate": 9.998190813740849e-06, + "loss": 0.3224, + "step": 731 + }, + { + "epoch": 0.018523673355770934, + "grad_norm": 16.8681583404541, + "learning_rate": 9.998179997164085e-06, + "loss": 0.3161, + "step": 732 + }, + { + "epoch": 0.01854897892046461, + "grad_norm": 20.586658477783203, + "learning_rate": 9.998169148354989e-06, + "loss": 0.2619, + "step": 733 + }, + { + "epoch": 0.018574284485158285, + "grad_norm": 9.71986198425293, + "learning_rate": 9.998158267313629e-06, + "loss": 0.3484, + "step": 734 + }, + { + "epoch": 0.018599590049851962, + "grad_norm": 7.036600112915039, + "learning_rate": 9.998147354040077e-06, + "loss": 0.3223, + "step": 735 + }, + { + "epoch": 0.01862489561454564, + "grad_norm": 7.490869998931885, + "learning_rate": 9.9981364085344e-06, + "loss": 0.2749, + "step": 736 + }, + { + "epoch": 0.018650201179239316, + "grad_norm": 14.538434982299805, + "learning_rate": 9.998125430796674e-06, + "loss": 0.3344, + "step": 737 + }, + { + "epoch": 0.01867550674393299, + "grad_norm": 7.396387100219727, + "learning_rate": 9.998114420826966e-06, + "loss": 0.217, + "step": 738 + }, + { + "epoch": 0.018700812308626667, + "grad_norm": 8.388227462768555, + "learning_rate": 9.998103378625349e-06, + "loss": 0.1869, + "step": 739 + }, + { + "epoch": 0.018726117873320344, + "grad_norm": 8.329005241394043, + "learning_rate": 9.99809230419189e-06, + "loss": 0.2552, + "step": 740 + }, + { + "epoch": 0.01875142343801402, + "grad_norm": 6.441090106964111, + "learning_rate": 9.998081197526665e-06, + "loss": 0.3037, + "step": 741 + }, + { + "epoch": 0.018776729002707694, + "grad_norm": 19.16530990600586, + "learning_rate": 9.998070058629743e-06, + "loss": 0.3838, + "step": 742 + }, + { + "epoch": 0.01880203456740137, + "grad_norm": 7.639657974243164, + "learning_rate": 9.998058887501198e-06, + "loss": 0.2249, + "step": 743 + }, + { + "epoch": 0.018827340132095048, + "grad_norm": 11.156661987304688, + "learning_rate": 9.998047684141102e-06, + "loss": 0.3167, + "step": 744 + }, + { + "epoch": 0.018852645696788725, + "grad_norm": 12.204507827758789, + "learning_rate": 9.998036448549524e-06, + "loss": 0.2752, + "step": 745 + }, + { + "epoch": 0.0188779512614824, + "grad_norm": 9.069611549377441, + "learning_rate": 9.998025180726541e-06, + "loss": 0.2654, + "step": 746 + }, + { + "epoch": 0.018903256826176076, + "grad_norm": 9.918753623962402, + "learning_rate": 9.99801388067222e-06, + "loss": 0.3007, + "step": 747 + }, + { + "epoch": 0.018928562390869753, + "grad_norm": 4.74451208114624, + "learning_rate": 9.99800254838664e-06, + "loss": 0.1464, + "step": 748 + }, + { + "epoch": 0.01895386795556343, + "grad_norm": 27.366458892822266, + "learning_rate": 9.99799118386987e-06, + "loss": 0.3392, + "step": 749 + }, + { + "epoch": 0.018979173520257103, + "grad_norm": 12.706466674804688, + "learning_rate": 9.997979787121986e-06, + "loss": 0.3082, + "step": 750 + }, + { + "epoch": 0.01900447908495078, + "grad_norm": 14.478541374206543, + "learning_rate": 9.99796835814306e-06, + "loss": 0.2315, + "step": 751 + }, + { + "epoch": 0.019029784649644457, + "grad_norm": 25.950071334838867, + "learning_rate": 9.997956896933164e-06, + "loss": 0.2894, + "step": 752 + }, + { + "epoch": 0.019055090214338134, + "grad_norm": 8.39651870727539, + "learning_rate": 9.997945403492375e-06, + "loss": 0.1689, + "step": 753 + }, + { + "epoch": 0.019080395779031808, + "grad_norm": 16.760883331298828, + "learning_rate": 9.997933877820765e-06, + "loss": 0.2974, + "step": 754 + }, + { + "epoch": 0.019105701343725485, + "grad_norm": 13.254465103149414, + "learning_rate": 9.99792231991841e-06, + "loss": 0.2436, + "step": 755 + }, + { + "epoch": 0.019131006908419162, + "grad_norm": 10.105440139770508, + "learning_rate": 9.997910729785383e-06, + "loss": 0.2797, + "step": 756 + }, + { + "epoch": 0.01915631247311284, + "grad_norm": 12.206216812133789, + "learning_rate": 9.997899107421759e-06, + "loss": 0.2361, + "step": 757 + }, + { + "epoch": 0.019181618037806512, + "grad_norm": 15.579054832458496, + "learning_rate": 9.997887452827614e-06, + "loss": 0.3381, + "step": 758 + }, + { + "epoch": 0.01920692360250019, + "grad_norm": 18.896127700805664, + "learning_rate": 9.997875766003022e-06, + "loss": 0.262, + "step": 759 + }, + { + "epoch": 0.019232229167193866, + "grad_norm": 7.291704177856445, + "learning_rate": 9.99786404694806e-06, + "loss": 0.1737, + "step": 760 + }, + { + "epoch": 0.019257534731887543, + "grad_norm": 8.592514038085938, + "learning_rate": 9.997852295662801e-06, + "loss": 0.2054, + "step": 761 + }, + { + "epoch": 0.019282840296581217, + "grad_norm": 14.05815601348877, + "learning_rate": 9.997840512147323e-06, + "loss": 0.339, + "step": 762 + }, + { + "epoch": 0.019308145861274894, + "grad_norm": 16.512475967407227, + "learning_rate": 9.997828696401701e-06, + "loss": 0.24, + "step": 763 + }, + { + "epoch": 0.01933345142596857, + "grad_norm": 8.966811180114746, + "learning_rate": 9.997816848426011e-06, + "loss": 0.1351, + "step": 764 + }, + { + "epoch": 0.019358756990662248, + "grad_norm": 6.739043235778809, + "learning_rate": 9.99780496822033e-06, + "loss": 0.1166, + "step": 765 + }, + { + "epoch": 0.01938406255535592, + "grad_norm": 26.92143440246582, + "learning_rate": 9.997793055784735e-06, + "loss": 0.1961, + "step": 766 + }, + { + "epoch": 0.0194093681200496, + "grad_norm": 16.187477111816406, + "learning_rate": 9.997781111119302e-06, + "loss": 0.3115, + "step": 767 + }, + { + "epoch": 0.019434673684743276, + "grad_norm": 24.574373245239258, + "learning_rate": 9.997769134224108e-06, + "loss": 0.3313, + "step": 768 + }, + { + "epoch": 0.019459979249436953, + "grad_norm": 13.864501953125, + "learning_rate": 9.99775712509923e-06, + "loss": 0.2848, + "step": 769 + }, + { + "epoch": 0.019485284814130626, + "grad_norm": 16.57316780090332, + "learning_rate": 9.997745083744747e-06, + "loss": 0.3389, + "step": 770 + }, + { + "epoch": 0.019510590378824303, + "grad_norm": 11.271954536437988, + "learning_rate": 9.997733010160736e-06, + "loss": 0.3253, + "step": 771 + }, + { + "epoch": 0.01953589594351798, + "grad_norm": 9.100506782531738, + "learning_rate": 9.997720904347273e-06, + "loss": 0.1893, + "step": 772 + }, + { + "epoch": 0.019561201508211657, + "grad_norm": 16.63031578063965, + "learning_rate": 9.997708766304439e-06, + "loss": 0.3894, + "step": 773 + }, + { + "epoch": 0.01958650707290533, + "grad_norm": 7.851315021514893, + "learning_rate": 9.99769659603231e-06, + "loss": 0.262, + "step": 774 + }, + { + "epoch": 0.019611812637599008, + "grad_norm": 12.74656867980957, + "learning_rate": 9.997684393530964e-06, + "loss": 0.1833, + "step": 775 + }, + { + "epoch": 0.019637118202292685, + "grad_norm": 22.1888370513916, + "learning_rate": 9.997672158800483e-06, + "loss": 0.3558, + "step": 776 + }, + { + "epoch": 0.01966242376698636, + "grad_norm": 14.012761116027832, + "learning_rate": 9.997659891840943e-06, + "loss": 0.2835, + "step": 777 + }, + { + "epoch": 0.019687729331680035, + "grad_norm": 9.389825820922852, + "learning_rate": 9.997647592652423e-06, + "loss": 0.2129, + "step": 778 + }, + { + "epoch": 0.019713034896373712, + "grad_norm": 7.135106086730957, + "learning_rate": 9.997635261235006e-06, + "loss": 0.2363, + "step": 779 + }, + { + "epoch": 0.01973834046106739, + "grad_norm": 12.371942520141602, + "learning_rate": 9.997622897588769e-06, + "loss": 0.243, + "step": 780 + }, + { + "epoch": 0.019763646025761066, + "grad_norm": 12.542003631591797, + "learning_rate": 9.997610501713788e-06, + "loss": 0.2732, + "step": 781 + }, + { + "epoch": 0.01978895159045474, + "grad_norm": 14.184002876281738, + "learning_rate": 9.997598073610148e-06, + "loss": 0.2967, + "step": 782 + }, + { + "epoch": 0.019814257155148417, + "grad_norm": 16.825380325317383, + "learning_rate": 9.997585613277928e-06, + "loss": 0.3203, + "step": 783 + }, + { + "epoch": 0.019839562719842094, + "grad_norm": 7.560961723327637, + "learning_rate": 9.99757312071721e-06, + "loss": 0.247, + "step": 784 + }, + { + "epoch": 0.01986486828453577, + "grad_norm": 16.67943572998047, + "learning_rate": 9.997560595928068e-06, + "loss": 0.3722, + "step": 785 + }, + { + "epoch": 0.019890173849229444, + "grad_norm": 8.460172653198242, + "learning_rate": 9.997548038910591e-06, + "loss": 0.2561, + "step": 786 + }, + { + "epoch": 0.01991547941392312, + "grad_norm": 16.87831687927246, + "learning_rate": 9.997535449664856e-06, + "loss": 0.2834, + "step": 787 + }, + { + "epoch": 0.0199407849786168, + "grad_norm": 5.501188278198242, + "learning_rate": 9.997522828190944e-06, + "loss": 0.2336, + "step": 788 + }, + { + "epoch": 0.019966090543310475, + "grad_norm": 19.47857666015625, + "learning_rate": 9.997510174488936e-06, + "loss": 0.2967, + "step": 789 + }, + { + "epoch": 0.01999139610800415, + "grad_norm": 11.450820922851562, + "learning_rate": 9.997497488558916e-06, + "loss": 0.2531, + "step": 790 + }, + { + "epoch": 0.020016701672697826, + "grad_norm": 15.69693374633789, + "learning_rate": 9.997484770400963e-06, + "loss": 0.365, + "step": 791 + }, + { + "epoch": 0.020042007237391503, + "grad_norm": 8.604055404663086, + "learning_rate": 9.99747202001516e-06, + "loss": 0.275, + "step": 792 + }, + { + "epoch": 0.02006731280208518, + "grad_norm": 9.662964820861816, + "learning_rate": 9.99745923740159e-06, + "loss": 0.2034, + "step": 793 + }, + { + "epoch": 0.020092618366778853, + "grad_norm": 12.467280387878418, + "learning_rate": 9.997446422560337e-06, + "loss": 0.1852, + "step": 794 + }, + { + "epoch": 0.02011792393147253, + "grad_norm": 8.71025562286377, + "learning_rate": 9.99743357549148e-06, + "loss": 0.2327, + "step": 795 + }, + { + "epoch": 0.020143229496166207, + "grad_norm": 9.724043846130371, + "learning_rate": 9.997420696195102e-06, + "loss": 0.2375, + "step": 796 + }, + { + "epoch": 0.020168535060859884, + "grad_norm": 11.261017799377441, + "learning_rate": 9.99740778467129e-06, + "loss": 0.2242, + "step": 797 + }, + { + "epoch": 0.020193840625553558, + "grad_norm": 7.4740166664123535, + "learning_rate": 9.997394840920123e-06, + "loss": 0.2124, + "step": 798 + }, + { + "epoch": 0.020219146190247235, + "grad_norm": 9.746769905090332, + "learning_rate": 9.997381864941687e-06, + "loss": 0.2623, + "step": 799 + }, + { + "epoch": 0.020244451754940912, + "grad_norm": 19.54143524169922, + "learning_rate": 9.997368856736065e-06, + "loss": 0.3394, + "step": 800 + }, + { + "epoch": 0.02026975731963459, + "grad_norm": 13.892446517944336, + "learning_rate": 9.99735581630334e-06, + "loss": 0.2804, + "step": 801 + }, + { + "epoch": 0.020295062884328263, + "grad_norm": 10.271944999694824, + "learning_rate": 9.997342743643599e-06, + "loss": 0.2139, + "step": 802 + }, + { + "epoch": 0.02032036844902194, + "grad_norm": 8.539541244506836, + "learning_rate": 9.997329638756921e-06, + "loss": 0.2135, + "step": 803 + }, + { + "epoch": 0.020345674013715617, + "grad_norm": 6.662068843841553, + "learning_rate": 9.997316501643397e-06, + "loss": 0.1887, + "step": 804 + }, + { + "epoch": 0.020370979578409294, + "grad_norm": 13.718437194824219, + "learning_rate": 9.997303332303107e-06, + "loss": 0.2786, + "step": 805 + }, + { + "epoch": 0.020396285143102967, + "grad_norm": 9.176424026489258, + "learning_rate": 9.997290130736136e-06, + "loss": 0.2265, + "step": 806 + }, + { + "epoch": 0.020421590707796644, + "grad_norm": 14.129704475402832, + "learning_rate": 9.997276896942572e-06, + "loss": 0.2819, + "step": 807 + }, + { + "epoch": 0.02044689627249032, + "grad_norm": 6.361371040344238, + "learning_rate": 9.9972636309225e-06, + "loss": 0.1666, + "step": 808 + }, + { + "epoch": 0.020472201837183998, + "grad_norm": 4.638801574707031, + "learning_rate": 9.997250332676003e-06, + "loss": 0.1137, + "step": 809 + }, + { + "epoch": 0.02049750740187767, + "grad_norm": 9.363175392150879, + "learning_rate": 9.997237002203168e-06, + "loss": 0.2264, + "step": 810 + }, + { + "epoch": 0.02052281296657135, + "grad_norm": 30.70516014099121, + "learning_rate": 9.99722363950408e-06, + "loss": 0.3162, + "step": 811 + }, + { + "epoch": 0.020548118531265026, + "grad_norm": 12.118646621704102, + "learning_rate": 9.99721024457883e-06, + "loss": 0.257, + "step": 812 + }, + { + "epoch": 0.020573424095958703, + "grad_norm": 11.594852447509766, + "learning_rate": 9.997196817427498e-06, + "loss": 0.3933, + "step": 813 + }, + { + "epoch": 0.020598729660652376, + "grad_norm": 10.157822608947754, + "learning_rate": 9.997183358050173e-06, + "loss": 0.1899, + "step": 814 + }, + { + "epoch": 0.020624035225346053, + "grad_norm": 7.520358562469482, + "learning_rate": 9.997169866446942e-06, + "loss": 0.2397, + "step": 815 + }, + { + "epoch": 0.02064934079003973, + "grad_norm": 3.592925548553467, + "learning_rate": 9.997156342617894e-06, + "loss": 0.1736, + "step": 816 + }, + { + "epoch": 0.020674646354733407, + "grad_norm": 12.227616310119629, + "learning_rate": 9.997142786563114e-06, + "loss": 0.3091, + "step": 817 + }, + { + "epoch": 0.02069995191942708, + "grad_norm": 7.777493476867676, + "learning_rate": 9.997129198282689e-06, + "loss": 0.2551, + "step": 818 + }, + { + "epoch": 0.020725257484120758, + "grad_norm": 10.122897148132324, + "learning_rate": 9.997115577776705e-06, + "loss": 0.2719, + "step": 819 + }, + { + "epoch": 0.020750563048814435, + "grad_norm": 30.420955657958984, + "learning_rate": 9.997101925045257e-06, + "loss": 0.2797, + "step": 820 + }, + { + "epoch": 0.020775868613508112, + "grad_norm": 13.233255386352539, + "learning_rate": 9.997088240088424e-06, + "loss": 0.237, + "step": 821 + }, + { + "epoch": 0.020801174178201785, + "grad_norm": 22.185625076293945, + "learning_rate": 9.9970745229063e-06, + "loss": 0.248, + "step": 822 + }, + { + "epoch": 0.020826479742895462, + "grad_norm": 8.183877944946289, + "learning_rate": 9.997060773498973e-06, + "loss": 0.1596, + "step": 823 + }, + { + "epoch": 0.02085178530758914, + "grad_norm": 11.455021858215332, + "learning_rate": 9.997046991866528e-06, + "loss": 0.2098, + "step": 824 + }, + { + "epoch": 0.020877090872282816, + "grad_norm": 11.532597541809082, + "learning_rate": 9.997033178009059e-06, + "loss": 0.2186, + "step": 825 + }, + { + "epoch": 0.02090239643697649, + "grad_norm": 14.063714981079102, + "learning_rate": 9.997019331926652e-06, + "loss": 0.3889, + "step": 826 + }, + { + "epoch": 0.020927702001670167, + "grad_norm": 5.828180313110352, + "learning_rate": 9.997005453619397e-06, + "loss": 0.2362, + "step": 827 + }, + { + "epoch": 0.020953007566363844, + "grad_norm": 11.039033889770508, + "learning_rate": 9.996991543087381e-06, + "loss": 0.3438, + "step": 828 + }, + { + "epoch": 0.02097831313105752, + "grad_norm": 6.450070381164551, + "learning_rate": 9.9969776003307e-06, + "loss": 0.2368, + "step": 829 + }, + { + "epoch": 0.021003618695751194, + "grad_norm": 9.442981719970703, + "learning_rate": 9.996963625349437e-06, + "loss": 0.292, + "step": 830 + }, + { + "epoch": 0.02102892426044487, + "grad_norm": 9.570246696472168, + "learning_rate": 9.996949618143686e-06, + "loss": 0.3012, + "step": 831 + }, + { + "epoch": 0.02105422982513855, + "grad_norm": 10.422574996948242, + "learning_rate": 9.996935578713536e-06, + "loss": 0.2101, + "step": 832 + }, + { + "epoch": 0.021079535389832225, + "grad_norm": 11.957613945007324, + "learning_rate": 9.996921507059079e-06, + "loss": 0.1803, + "step": 833 + }, + { + "epoch": 0.0211048409545259, + "grad_norm": 23.304903030395508, + "learning_rate": 9.996907403180403e-06, + "loss": 0.3861, + "step": 834 + }, + { + "epoch": 0.021130146519219576, + "grad_norm": 15.894869804382324, + "learning_rate": 9.996893267077603e-06, + "loss": 0.2603, + "step": 835 + }, + { + "epoch": 0.021155452083913253, + "grad_norm": 8.179947853088379, + "learning_rate": 9.996879098750765e-06, + "loss": 0.2409, + "step": 836 + }, + { + "epoch": 0.02118075764860693, + "grad_norm": 8.66854476928711, + "learning_rate": 9.996864898199986e-06, + "loss": 0.1578, + "step": 837 + }, + { + "epoch": 0.021206063213300604, + "grad_norm": 10.376187324523926, + "learning_rate": 9.996850665425352e-06, + "loss": 0.2311, + "step": 838 + }, + { + "epoch": 0.02123136877799428, + "grad_norm": 10.617366790771484, + "learning_rate": 9.99683640042696e-06, + "loss": 0.2229, + "step": 839 + }, + { + "epoch": 0.021256674342687958, + "grad_norm": 7.663614749908447, + "learning_rate": 9.996822103204899e-06, + "loss": 0.2799, + "step": 840 + }, + { + "epoch": 0.021281979907381635, + "grad_norm": 17.898630142211914, + "learning_rate": 9.99680777375926e-06, + "loss": 0.2913, + "step": 841 + }, + { + "epoch": 0.021307285472075308, + "grad_norm": 5.2919697761535645, + "learning_rate": 9.99679341209014e-06, + "loss": 0.1403, + "step": 842 + }, + { + "epoch": 0.021332591036768985, + "grad_norm": 9.151687622070312, + "learning_rate": 9.996779018197627e-06, + "loss": 0.2569, + "step": 843 + }, + { + "epoch": 0.021357896601462662, + "grad_norm": 5.331577301025391, + "learning_rate": 9.996764592081814e-06, + "loss": 0.1699, + "step": 844 + }, + { + "epoch": 0.02138320216615634, + "grad_norm": 14.238883972167969, + "learning_rate": 9.996750133742798e-06, + "loss": 0.1844, + "step": 845 + }, + { + "epoch": 0.021408507730850013, + "grad_norm": 14.812729835510254, + "learning_rate": 9.99673564318067e-06, + "loss": 0.2797, + "step": 846 + }, + { + "epoch": 0.02143381329554369, + "grad_norm": 26.558576583862305, + "learning_rate": 9.996721120395521e-06, + "loss": 0.2875, + "step": 847 + }, + { + "epoch": 0.021459118860237367, + "grad_norm": 9.666671752929688, + "learning_rate": 9.99670656538745e-06, + "loss": 0.2766, + "step": 848 + }, + { + "epoch": 0.021484424424931044, + "grad_norm": 9.386077880859375, + "learning_rate": 9.996691978156545e-06, + "loss": 0.2441, + "step": 849 + }, + { + "epoch": 0.021509729989624717, + "grad_norm": 34.703277587890625, + "learning_rate": 9.996677358702904e-06, + "loss": 0.2881, + "step": 850 + }, + { + "epoch": 0.021535035554318394, + "grad_norm": 12.388720512390137, + "learning_rate": 9.99666270702662e-06, + "loss": 0.334, + "step": 851 + }, + { + "epoch": 0.02156034111901207, + "grad_norm": 6.350498676300049, + "learning_rate": 9.996648023127789e-06, + "loss": 0.178, + "step": 852 + }, + { + "epoch": 0.02158564668370575, + "grad_norm": 21.1513671875, + "learning_rate": 9.996633307006504e-06, + "loss": 0.3011, + "step": 853 + }, + { + "epoch": 0.021610952248399422, + "grad_norm": 5.35745906829834, + "learning_rate": 9.996618558662858e-06, + "loss": 0.1878, + "step": 854 + }, + { + "epoch": 0.0216362578130931, + "grad_norm": 12.627599716186523, + "learning_rate": 9.996603778096952e-06, + "loss": 0.2546, + "step": 855 + }, + { + "epoch": 0.021661563377786776, + "grad_norm": 9.377495765686035, + "learning_rate": 9.996588965308875e-06, + "loss": 0.3049, + "step": 856 + }, + { + "epoch": 0.021686868942480453, + "grad_norm": 6.033194065093994, + "learning_rate": 9.996574120298725e-06, + "loss": 0.1293, + "step": 857 + }, + { + "epoch": 0.021712174507174126, + "grad_norm": 27.211563110351562, + "learning_rate": 9.996559243066599e-06, + "loss": 0.4557, + "step": 858 + }, + { + "epoch": 0.021737480071867803, + "grad_norm": 13.064432144165039, + "learning_rate": 9.996544333612592e-06, + "loss": 0.2744, + "step": 859 + }, + { + "epoch": 0.02176278563656148, + "grad_norm": 6.135220527648926, + "learning_rate": 9.9965293919368e-06, + "loss": 0.2281, + "step": 860 + }, + { + "epoch": 0.021788091201255157, + "grad_norm": 17.298301696777344, + "learning_rate": 9.996514418039318e-06, + "loss": 0.2563, + "step": 861 + }, + { + "epoch": 0.02181339676594883, + "grad_norm": 9.367166519165039, + "learning_rate": 9.996499411920246e-06, + "loss": 0.3041, + "step": 862 + }, + { + "epoch": 0.021838702330642508, + "grad_norm": 12.635944366455078, + "learning_rate": 9.996484373579676e-06, + "loss": 0.224, + "step": 863 + }, + { + "epoch": 0.021864007895336185, + "grad_norm": 8.604538917541504, + "learning_rate": 9.99646930301771e-06, + "loss": 0.1823, + "step": 864 + }, + { + "epoch": 0.021889313460029862, + "grad_norm": 8.371360778808594, + "learning_rate": 9.996454200234442e-06, + "loss": 0.2381, + "step": 865 + }, + { + "epoch": 0.021914619024723535, + "grad_norm": 16.25839614868164, + "learning_rate": 9.99643906522997e-06, + "loss": 0.3245, + "step": 866 + }, + { + "epoch": 0.021939924589417212, + "grad_norm": 7.3915934562683105, + "learning_rate": 9.996423898004392e-06, + "loss": 0.227, + "step": 867 + }, + { + "epoch": 0.02196523015411089, + "grad_norm": 9.1781587600708, + "learning_rate": 9.996408698557807e-06, + "loss": 0.1827, + "step": 868 + }, + { + "epoch": 0.021990535718804567, + "grad_norm": 9.764066696166992, + "learning_rate": 9.99639346689031e-06, + "loss": 0.2285, + "step": 869 + }, + { + "epoch": 0.02201584128349824, + "grad_norm": 7.804599285125732, + "learning_rate": 9.996378203002003e-06, + "loss": 0.2709, + "step": 870 + }, + { + "epoch": 0.022041146848191917, + "grad_norm": 12.21908950805664, + "learning_rate": 9.99636290689298e-06, + "loss": 0.3471, + "step": 871 + }, + { + "epoch": 0.022066452412885594, + "grad_norm": 8.770467758178711, + "learning_rate": 9.996347578563344e-06, + "loss": 0.2535, + "step": 872 + }, + { + "epoch": 0.02209175797757927, + "grad_norm": 11.477250099182129, + "learning_rate": 9.996332218013191e-06, + "loss": 0.2848, + "step": 873 + }, + { + "epoch": 0.022117063542272945, + "grad_norm": 8.723024368286133, + "learning_rate": 9.99631682524262e-06, + "loss": 0.2689, + "step": 874 + }, + { + "epoch": 0.02214236910696662, + "grad_norm": 7.417282581329346, + "learning_rate": 9.996301400251733e-06, + "loss": 0.197, + "step": 875 + }, + { + "epoch": 0.0221676746716603, + "grad_norm": 19.077795028686523, + "learning_rate": 9.996285943040627e-06, + "loss": 0.2792, + "step": 876 + }, + { + "epoch": 0.022192980236353976, + "grad_norm": 5.360636234283447, + "learning_rate": 9.996270453609402e-06, + "loss": 0.2066, + "step": 877 + }, + { + "epoch": 0.02221828580104765, + "grad_norm": 6.87188196182251, + "learning_rate": 9.996254931958158e-06, + "loss": 0.174, + "step": 878 + }, + { + "epoch": 0.022243591365741326, + "grad_norm": 6.613792419433594, + "learning_rate": 9.996239378086996e-06, + "loss": 0.1837, + "step": 879 + }, + { + "epoch": 0.022268896930435003, + "grad_norm": 9.009709358215332, + "learning_rate": 9.996223791996013e-06, + "loss": 0.2328, + "step": 880 + }, + { + "epoch": 0.02229420249512868, + "grad_norm": 9.480873107910156, + "learning_rate": 9.996208173685315e-06, + "loss": 0.2722, + "step": 881 + }, + { + "epoch": 0.022319508059822354, + "grad_norm": 7.032013416290283, + "learning_rate": 9.996192523155e-06, + "loss": 0.1152, + "step": 882 + }, + { + "epoch": 0.02234481362451603, + "grad_norm": 6.560922145843506, + "learning_rate": 9.996176840405167e-06, + "loss": 0.2015, + "step": 883 + }, + { + "epoch": 0.022370119189209708, + "grad_norm": 14.573695182800293, + "learning_rate": 9.996161125435921e-06, + "loss": 0.2506, + "step": 884 + }, + { + "epoch": 0.022395424753903385, + "grad_norm": 8.365387916564941, + "learning_rate": 9.996145378247359e-06, + "loss": 0.2031, + "step": 885 + }, + { + "epoch": 0.02242073031859706, + "grad_norm": 9.108831405639648, + "learning_rate": 9.996129598839587e-06, + "loss": 0.2597, + "step": 886 + }, + { + "epoch": 0.022446035883290735, + "grad_norm": 15.705202102661133, + "learning_rate": 9.996113787212704e-06, + "loss": 0.3591, + "step": 887 + }, + { + "epoch": 0.022471341447984412, + "grad_norm": 7.247493743896484, + "learning_rate": 9.996097943366812e-06, + "loss": 0.1591, + "step": 888 + }, + { + "epoch": 0.02249664701267809, + "grad_norm": 8.224400520324707, + "learning_rate": 9.996082067302012e-06, + "loss": 0.2778, + "step": 889 + }, + { + "epoch": 0.022521952577371763, + "grad_norm": 10.268498420715332, + "learning_rate": 9.996066159018412e-06, + "loss": 0.272, + "step": 890 + }, + { + "epoch": 0.02254725814206544, + "grad_norm": 17.807138442993164, + "learning_rate": 9.996050218516107e-06, + "loss": 0.2297, + "step": 891 + }, + { + "epoch": 0.022572563706759117, + "grad_norm": 9.828855514526367, + "learning_rate": 9.996034245795206e-06, + "loss": 0.2253, + "step": 892 + }, + { + "epoch": 0.022597869271452794, + "grad_norm": 26.0128173828125, + "learning_rate": 9.996018240855809e-06, + "loss": 0.2799, + "step": 893 + }, + { + "epoch": 0.022623174836146467, + "grad_norm": 7.771652698516846, + "learning_rate": 9.996002203698018e-06, + "loss": 0.1486, + "step": 894 + }, + { + "epoch": 0.022648480400840144, + "grad_norm": 10.415810585021973, + "learning_rate": 9.99598613432194e-06, + "loss": 0.3166, + "step": 895 + }, + { + "epoch": 0.02267378596553382, + "grad_norm": 6.629223823547363, + "learning_rate": 9.995970032727676e-06, + "loss": 0.1942, + "step": 896 + }, + { + "epoch": 0.0226990915302275, + "grad_norm": 15.585407257080078, + "learning_rate": 9.99595389891533e-06, + "loss": 0.3639, + "step": 897 + }, + { + "epoch": 0.022724397094921172, + "grad_norm": 11.602970123291016, + "learning_rate": 9.995937732885007e-06, + "loss": 0.2152, + "step": 898 + }, + { + "epoch": 0.02274970265961485, + "grad_norm": 9.358983039855957, + "learning_rate": 9.995921534636813e-06, + "loss": 0.1928, + "step": 899 + }, + { + "epoch": 0.022775008224308526, + "grad_norm": 10.014068603515625, + "learning_rate": 9.995905304170848e-06, + "loss": 0.2923, + "step": 900 + }, + { + "epoch": 0.022800313789002203, + "grad_norm": 16.461702346801758, + "learning_rate": 9.995889041487218e-06, + "loss": 0.3587, + "step": 901 + }, + { + "epoch": 0.022825619353695877, + "grad_norm": 8.677318572998047, + "learning_rate": 9.995872746586031e-06, + "loss": 0.191, + "step": 902 + }, + { + "epoch": 0.022850924918389554, + "grad_norm": 7.792047500610352, + "learning_rate": 9.995856419467388e-06, + "loss": 0.2173, + "step": 903 + }, + { + "epoch": 0.02287623048308323, + "grad_norm": 12.069467544555664, + "learning_rate": 9.995840060131399e-06, + "loss": 0.27, + "step": 904 + }, + { + "epoch": 0.022901536047776908, + "grad_norm": 10.297518730163574, + "learning_rate": 9.995823668578165e-06, + "loss": 0.2715, + "step": 905 + }, + { + "epoch": 0.02292684161247058, + "grad_norm": 6.93367338180542, + "learning_rate": 9.995807244807793e-06, + "loss": 0.2709, + "step": 906 + }, + { + "epoch": 0.022952147177164258, + "grad_norm": 14.969608306884766, + "learning_rate": 9.99579078882039e-06, + "loss": 0.3165, + "step": 907 + }, + { + "epoch": 0.022977452741857935, + "grad_norm": 8.21203899383545, + "learning_rate": 9.995774300616063e-06, + "loss": 0.2629, + "step": 908 + }, + { + "epoch": 0.023002758306551612, + "grad_norm": 9.461505889892578, + "learning_rate": 9.995757780194915e-06, + "loss": 0.1921, + "step": 909 + }, + { + "epoch": 0.023028063871245286, + "grad_norm": 21.067569732666016, + "learning_rate": 9.995741227557055e-06, + "loss": 0.2478, + "step": 910 + }, + { + "epoch": 0.023053369435938963, + "grad_norm": 18.36295509338379, + "learning_rate": 9.995724642702589e-06, + "loss": 0.3376, + "step": 911 + }, + { + "epoch": 0.02307867500063264, + "grad_norm": 9.574444770812988, + "learning_rate": 9.995708025631624e-06, + "loss": 0.2049, + "step": 912 + }, + { + "epoch": 0.023103980565326317, + "grad_norm": 11.688669204711914, + "learning_rate": 9.995691376344267e-06, + "loss": 0.3176, + "step": 913 + }, + { + "epoch": 0.02312928613001999, + "grad_norm": 9.937437057495117, + "learning_rate": 9.995674694840624e-06, + "loss": 0.2176, + "step": 914 + }, + { + "epoch": 0.023154591694713667, + "grad_norm": 9.097447395324707, + "learning_rate": 9.995657981120808e-06, + "loss": 0.1948, + "step": 915 + }, + { + "epoch": 0.023179897259407344, + "grad_norm": 9.167795181274414, + "learning_rate": 9.99564123518492e-06, + "loss": 0.1768, + "step": 916 + }, + { + "epoch": 0.02320520282410102, + "grad_norm": 7.5612993240356445, + "learning_rate": 9.995624457033071e-06, + "loss": 0.1931, + "step": 917 + }, + { + "epoch": 0.023230508388794695, + "grad_norm": 7.326131820678711, + "learning_rate": 9.995607646665371e-06, + "loss": 0.2302, + "step": 918 + }, + { + "epoch": 0.023255813953488372, + "grad_norm": 5.843041896820068, + "learning_rate": 9.995590804081925e-06, + "loss": 0.1857, + "step": 919 + }, + { + "epoch": 0.02328111951818205, + "grad_norm": 15.00075626373291, + "learning_rate": 9.995573929282844e-06, + "loss": 0.251, + "step": 920 + }, + { + "epoch": 0.023306425082875726, + "grad_norm": 11.61009693145752, + "learning_rate": 9.995557022268236e-06, + "loss": 0.2257, + "step": 921 + }, + { + "epoch": 0.0233317306475694, + "grad_norm": 9.757715225219727, + "learning_rate": 9.995540083038208e-06, + "loss": 0.2392, + "step": 922 + }, + { + "epoch": 0.023357036212263076, + "grad_norm": 9.656588554382324, + "learning_rate": 9.995523111592875e-06, + "loss": 0.2376, + "step": 923 + }, + { + "epoch": 0.023382341776956753, + "grad_norm": 7.691436290740967, + "learning_rate": 9.99550610793234e-06, + "loss": 0.129, + "step": 924 + }, + { + "epoch": 0.02340764734165043, + "grad_norm": 10.099190711975098, + "learning_rate": 9.995489072056716e-06, + "loss": 0.2756, + "step": 925 + }, + { + "epoch": 0.023432952906344104, + "grad_norm": 10.448710441589355, + "learning_rate": 9.995472003966111e-06, + "loss": 0.2066, + "step": 926 + }, + { + "epoch": 0.02345825847103778, + "grad_norm": 8.503867149353027, + "learning_rate": 9.995454903660637e-06, + "loss": 0.2616, + "step": 927 + }, + { + "epoch": 0.023483564035731458, + "grad_norm": 3.8400068283081055, + "learning_rate": 9.995437771140405e-06, + "loss": 0.1784, + "step": 928 + }, + { + "epoch": 0.023508869600425135, + "grad_norm": 7.614108085632324, + "learning_rate": 9.995420606405523e-06, + "loss": 0.2476, + "step": 929 + }, + { + "epoch": 0.02353417516511881, + "grad_norm": 5.5324249267578125, + "learning_rate": 9.995403409456102e-06, + "loss": 0.2144, + "step": 930 + }, + { + "epoch": 0.023559480729812485, + "grad_norm": 8.383930206298828, + "learning_rate": 9.995386180292255e-06, + "loss": 0.1837, + "step": 931 + }, + { + "epoch": 0.023584786294506162, + "grad_norm": 5.990270137786865, + "learning_rate": 9.99536891891409e-06, + "loss": 0.2007, + "step": 932 + }, + { + "epoch": 0.02361009185919984, + "grad_norm": 8.492243766784668, + "learning_rate": 9.995351625321723e-06, + "loss": 0.2098, + "step": 933 + }, + { + "epoch": 0.023635397423893513, + "grad_norm": 12.480201721191406, + "learning_rate": 9.99533429951526e-06, + "loss": 0.3767, + "step": 934 + }, + { + "epoch": 0.02366070298858719, + "grad_norm": 12.312365531921387, + "learning_rate": 9.995316941494815e-06, + "loss": 0.2374, + "step": 935 + }, + { + "epoch": 0.023686008553280867, + "grad_norm": 5.3644585609436035, + "learning_rate": 9.995299551260502e-06, + "loss": 0.2011, + "step": 936 + }, + { + "epoch": 0.023711314117974544, + "grad_norm": 9.867116928100586, + "learning_rate": 9.99528212881243e-06, + "loss": 0.2444, + "step": 937 + }, + { + "epoch": 0.023736619682668218, + "grad_norm": 6.467618465423584, + "learning_rate": 9.995264674150713e-06, + "loss": 0.146, + "step": 938 + }, + { + "epoch": 0.023761925247361895, + "grad_norm": 6.782244682312012, + "learning_rate": 9.995247187275463e-06, + "loss": 0.2445, + "step": 939 + }, + { + "epoch": 0.02378723081205557, + "grad_norm": 9.38813304901123, + "learning_rate": 9.995229668186794e-06, + "loss": 0.2245, + "step": 940 + }, + { + "epoch": 0.02381253637674925, + "grad_norm": 13.854695320129395, + "learning_rate": 9.995212116884819e-06, + "loss": 0.3435, + "step": 941 + }, + { + "epoch": 0.023837841941442922, + "grad_norm": 15.037954330444336, + "learning_rate": 9.99519453336965e-06, + "loss": 0.3899, + "step": 942 + }, + { + "epoch": 0.0238631475061366, + "grad_norm": 6.963712215423584, + "learning_rate": 9.995176917641398e-06, + "loss": 0.1852, + "step": 943 + }, + { + "epoch": 0.023888453070830276, + "grad_norm": 12.99940013885498, + "learning_rate": 9.995159269700181e-06, + "loss": 0.2267, + "step": 944 + }, + { + "epoch": 0.023913758635523953, + "grad_norm": 13.810532569885254, + "learning_rate": 9.995141589546111e-06, + "loss": 0.2322, + "step": 945 + }, + { + "epoch": 0.023939064200217627, + "grad_norm": 7.326478481292725, + "learning_rate": 9.995123877179302e-06, + "loss": 0.2091, + "step": 946 + }, + { + "epoch": 0.023964369764911304, + "grad_norm": 7.378447532653809, + "learning_rate": 9.995106132599869e-06, + "loss": 0.2019, + "step": 947 + }, + { + "epoch": 0.02398967532960498, + "grad_norm": 5.591911792755127, + "learning_rate": 9.995088355807925e-06, + "loss": 0.1885, + "step": 948 + }, + { + "epoch": 0.024014980894298658, + "grad_norm": 7.3681321144104, + "learning_rate": 9.995070546803584e-06, + "loss": 0.1789, + "step": 949 + }, + { + "epoch": 0.02404028645899233, + "grad_norm": 19.19222640991211, + "learning_rate": 9.995052705586965e-06, + "loss": 0.2992, + "step": 950 + }, + { + "epoch": 0.024065592023686008, + "grad_norm": 8.97121524810791, + "learning_rate": 9.99503483215818e-06, + "loss": 0.2587, + "step": 951 + }, + { + "epoch": 0.024090897588379685, + "grad_norm": 16.02210235595703, + "learning_rate": 9.995016926517342e-06, + "loss": 0.2249, + "step": 952 + }, + { + "epoch": 0.024116203153073362, + "grad_norm": 5.976956844329834, + "learning_rate": 9.99499898866457e-06, + "loss": 0.1411, + "step": 953 + }, + { + "epoch": 0.024141508717767036, + "grad_norm": 33.50779342651367, + "learning_rate": 9.99498101859998e-06, + "loss": 0.3159, + "step": 954 + }, + { + "epoch": 0.024166814282460713, + "grad_norm": 6.713763236999512, + "learning_rate": 9.994963016323684e-06, + "loss": 0.2061, + "step": 955 + }, + { + "epoch": 0.02419211984715439, + "grad_norm": 21.54592514038086, + "learning_rate": 9.994944981835803e-06, + "loss": 0.338, + "step": 956 + }, + { + "epoch": 0.024217425411848067, + "grad_norm": 12.260071754455566, + "learning_rate": 9.994926915136449e-06, + "loss": 0.2854, + "step": 957 + }, + { + "epoch": 0.02424273097654174, + "grad_norm": 6.539503574371338, + "learning_rate": 9.994908816225742e-06, + "loss": 0.233, + "step": 958 + }, + { + "epoch": 0.024268036541235417, + "grad_norm": 6.857904434204102, + "learning_rate": 9.994890685103798e-06, + "loss": 0.1557, + "step": 959 + }, + { + "epoch": 0.024293342105929094, + "grad_norm": 6.3602294921875, + "learning_rate": 9.994872521770732e-06, + "loss": 0.2714, + "step": 960 + }, + { + "epoch": 0.02431864767062277, + "grad_norm": 5.710127353668213, + "learning_rate": 9.994854326226661e-06, + "loss": 0.1895, + "step": 961 + }, + { + "epoch": 0.024343953235316445, + "grad_norm": 7.221642971038818, + "learning_rate": 9.994836098471705e-06, + "loss": 0.2295, + "step": 962 + }, + { + "epoch": 0.024369258800010122, + "grad_norm": 5.849930763244629, + "learning_rate": 9.99481783850598e-06, + "loss": 0.2506, + "step": 963 + }, + { + "epoch": 0.0243945643647038, + "grad_norm": 7.7995829582214355, + "learning_rate": 9.994799546329604e-06, + "loss": 0.1966, + "step": 964 + }, + { + "epoch": 0.024419869929397476, + "grad_norm": 15.180273056030273, + "learning_rate": 9.994781221942693e-06, + "loss": 0.4148, + "step": 965 + }, + { + "epoch": 0.02444517549409115, + "grad_norm": 9.953888893127441, + "learning_rate": 9.994762865345368e-06, + "loss": 0.3043, + "step": 966 + }, + { + "epoch": 0.024470481058784826, + "grad_norm": 7.796439170837402, + "learning_rate": 9.994744476537748e-06, + "loss": 0.1813, + "step": 967 + }, + { + "epoch": 0.024495786623478503, + "grad_norm": 10.422286987304688, + "learning_rate": 9.994726055519947e-06, + "loss": 0.2553, + "step": 968 + }, + { + "epoch": 0.02452109218817218, + "grad_norm": 8.243529319763184, + "learning_rate": 9.99470760229209e-06, + "loss": 0.2244, + "step": 969 + }, + { + "epoch": 0.024546397752865854, + "grad_norm": 7.795766830444336, + "learning_rate": 9.99468911685429e-06, + "loss": 0.2143, + "step": 970 + }, + { + "epoch": 0.02457170331755953, + "grad_norm": 10.939414024353027, + "learning_rate": 9.994670599206671e-06, + "loss": 0.2293, + "step": 971 + }, + { + "epoch": 0.024597008882253208, + "grad_norm": 19.023122787475586, + "learning_rate": 9.994652049349349e-06, + "loss": 0.2362, + "step": 972 + }, + { + "epoch": 0.024622314446946885, + "grad_norm": 18.315818786621094, + "learning_rate": 9.994633467282447e-06, + "loss": 0.2328, + "step": 973 + }, + { + "epoch": 0.02464762001164056, + "grad_norm": 18.675630569458008, + "learning_rate": 9.994614853006082e-06, + "loss": 0.3137, + "step": 974 + }, + { + "epoch": 0.024672925576334236, + "grad_norm": 8.98159122467041, + "learning_rate": 9.994596206520373e-06, + "loss": 0.2029, + "step": 975 + }, + { + "epoch": 0.024698231141027913, + "grad_norm": 6.699990749359131, + "learning_rate": 9.994577527825445e-06, + "loss": 0.1787, + "step": 976 + }, + { + "epoch": 0.02472353670572159, + "grad_norm": 12.93747615814209, + "learning_rate": 9.994558816921413e-06, + "loss": 0.2852, + "step": 977 + }, + { + "epoch": 0.024748842270415263, + "grad_norm": 11.331523895263672, + "learning_rate": 9.994540073808401e-06, + "loss": 0.2585, + "step": 978 + }, + { + "epoch": 0.02477414783510894, + "grad_norm": 9.085844993591309, + "learning_rate": 9.99452129848653e-06, + "loss": 0.2993, + "step": 979 + }, + { + "epoch": 0.024799453399802617, + "grad_norm": 15.182241439819336, + "learning_rate": 9.99450249095592e-06, + "loss": 0.2576, + "step": 980 + }, + { + "epoch": 0.024824758964496294, + "grad_norm": 7.300711631774902, + "learning_rate": 9.994483651216691e-06, + "loss": 0.2296, + "step": 981 + }, + { + "epoch": 0.024850064529189968, + "grad_norm": 6.368177890777588, + "learning_rate": 9.994464779268967e-06, + "loss": 0.1999, + "step": 982 + }, + { + "epoch": 0.024875370093883645, + "grad_norm": 11.584558486938477, + "learning_rate": 9.994445875112869e-06, + "loss": 0.2346, + "step": 983 + }, + { + "epoch": 0.024900675658577322, + "grad_norm": 7.4994306564331055, + "learning_rate": 9.994426938748518e-06, + "loss": 0.2512, + "step": 984 + }, + { + "epoch": 0.024925981223271, + "grad_norm": 14.162617683410645, + "learning_rate": 9.994407970176038e-06, + "loss": 0.3684, + "step": 985 + }, + { + "epoch": 0.024951286787964672, + "grad_norm": 21.528432846069336, + "learning_rate": 9.994388969395548e-06, + "loss": 0.294, + "step": 986 + }, + { + "epoch": 0.02497659235265835, + "grad_norm": 6.5773186683654785, + "learning_rate": 9.994369936407175e-06, + "loss": 0.2092, + "step": 987 + }, + { + "epoch": 0.025001897917352026, + "grad_norm": 14.822239875793457, + "learning_rate": 9.994350871211039e-06, + "loss": 0.2423, + "step": 988 + }, + { + "epoch": 0.025027203482045703, + "grad_norm": 12.33026123046875, + "learning_rate": 9.99433177380726e-06, + "loss": 0.2611, + "step": 989 + }, + { + "epoch": 0.025052509046739377, + "grad_norm": 18.30289077758789, + "learning_rate": 9.994312644195968e-06, + "loss": 0.2327, + "step": 990 + }, + { + "epoch": 0.025077814611433054, + "grad_norm": 14.220196723937988, + "learning_rate": 9.99429348237728e-06, + "loss": 0.2085, + "step": 991 + }, + { + "epoch": 0.02510312017612673, + "grad_norm": 6.442544460296631, + "learning_rate": 9.994274288351324e-06, + "loss": 0.2205, + "step": 992 + }, + { + "epoch": 0.025128425740820408, + "grad_norm": 11.821319580078125, + "learning_rate": 9.994255062118222e-06, + "loss": 0.2041, + "step": 993 + }, + { + "epoch": 0.02515373130551408, + "grad_norm": 6.466485023498535, + "learning_rate": 9.994235803678098e-06, + "loss": 0.204, + "step": 994 + }, + { + "epoch": 0.02517903687020776, + "grad_norm": 10.12718391418457, + "learning_rate": 9.994216513031077e-06, + "loss": 0.2688, + "step": 995 + }, + { + "epoch": 0.025204342434901435, + "grad_norm": 8.57748031616211, + "learning_rate": 9.994197190177282e-06, + "loss": 0.2467, + "step": 996 + }, + { + "epoch": 0.025229647999595112, + "grad_norm": 11.432987213134766, + "learning_rate": 9.994177835116839e-06, + "loss": 0.2662, + "step": 997 + }, + { + "epoch": 0.025254953564288786, + "grad_norm": 9.241778373718262, + "learning_rate": 9.99415844784987e-06, + "loss": 0.3206, + "step": 998 + }, + { + "epoch": 0.025280259128982463, + "grad_norm": 17.48302459716797, + "learning_rate": 9.994139028376503e-06, + "loss": 0.3969, + "step": 999 + }, + { + "epoch": 0.02530556469367614, + "grad_norm": 21.70526885986328, + "learning_rate": 9.994119576696863e-06, + "loss": 0.3039, + "step": 1000 + }, + { + "epoch": 0.025330870258369817, + "grad_norm": 8.891977310180664, + "learning_rate": 9.994100092811075e-06, + "loss": 0.2237, + "step": 1001 + }, + { + "epoch": 0.02535617582306349, + "grad_norm": 13.811738014221191, + "learning_rate": 9.994080576719265e-06, + "loss": 0.2951, + "step": 1002 + }, + { + "epoch": 0.025381481387757168, + "grad_norm": 12.216206550598145, + "learning_rate": 9.994061028421558e-06, + "loss": 0.2563, + "step": 1003 + }, + { + "epoch": 0.025406786952450845, + "grad_norm": 7.113925457000732, + "learning_rate": 9.99404144791808e-06, + "loss": 0.2223, + "step": 1004 + }, + { + "epoch": 0.02543209251714452, + "grad_norm": 5.88624906539917, + "learning_rate": 9.994021835208957e-06, + "loss": 0.2093, + "step": 1005 + }, + { + "epoch": 0.025457398081838195, + "grad_norm": 12.371611595153809, + "learning_rate": 9.994002190294317e-06, + "loss": 0.2714, + "step": 1006 + }, + { + "epoch": 0.025482703646531872, + "grad_norm": 10.132512092590332, + "learning_rate": 9.993982513174285e-06, + "loss": 0.2202, + "step": 1007 + }, + { + "epoch": 0.02550800921122555, + "grad_norm": 8.302763938903809, + "learning_rate": 9.99396280384899e-06, + "loss": 0.2678, + "step": 1008 + }, + { + "epoch": 0.025533314775919226, + "grad_norm": 10.583211898803711, + "learning_rate": 9.993943062318557e-06, + "loss": 0.164, + "step": 1009 + }, + { + "epoch": 0.0255586203406129, + "grad_norm": 8.599952697753906, + "learning_rate": 9.993923288583114e-06, + "loss": 0.2663, + "step": 1010 + }, + { + "epoch": 0.025583925905306577, + "grad_norm": 10.962617874145508, + "learning_rate": 9.99390348264279e-06, + "loss": 0.2159, + "step": 1011 + }, + { + "epoch": 0.025609231470000254, + "grad_norm": 8.57041072845459, + "learning_rate": 9.993883644497711e-06, + "loss": 0.2515, + "step": 1012 + }, + { + "epoch": 0.02563453703469393, + "grad_norm": 13.380753517150879, + "learning_rate": 9.993863774148003e-06, + "loss": 0.3821, + "step": 1013 + }, + { + "epoch": 0.025659842599387604, + "grad_norm": 10.175156593322754, + "learning_rate": 9.993843871593798e-06, + "loss": 0.1813, + "step": 1014 + }, + { + "epoch": 0.02568514816408128, + "grad_norm": 8.561018943786621, + "learning_rate": 9.993823936835223e-06, + "loss": 0.1948, + "step": 1015 + }, + { + "epoch": 0.025710453728774958, + "grad_norm": 11.975056648254395, + "learning_rate": 9.993803969872407e-06, + "loss": 0.296, + "step": 1016 + }, + { + "epoch": 0.025735759293468635, + "grad_norm": 8.785384178161621, + "learning_rate": 9.993783970705478e-06, + "loss": 0.1384, + "step": 1017 + }, + { + "epoch": 0.02576106485816231, + "grad_norm": 5.999468803405762, + "learning_rate": 9.993763939334564e-06, + "loss": 0.1792, + "step": 1018 + }, + { + "epoch": 0.025786370422855986, + "grad_norm": 10.231145858764648, + "learning_rate": 9.993743875759797e-06, + "loss": 0.2486, + "step": 1019 + }, + { + "epoch": 0.025811675987549663, + "grad_norm": 6.483461856842041, + "learning_rate": 9.993723779981301e-06, + "loss": 0.2032, + "step": 1020 + }, + { + "epoch": 0.02583698155224334, + "grad_norm": 23.417701721191406, + "learning_rate": 9.993703651999212e-06, + "loss": 0.3601, + "step": 1021 + }, + { + "epoch": 0.025862287116937013, + "grad_norm": 15.234698295593262, + "learning_rate": 9.993683491813657e-06, + "loss": 0.2672, + "step": 1022 + }, + { + "epoch": 0.02588759268163069, + "grad_norm": 9.926774978637695, + "learning_rate": 9.993663299424766e-06, + "loss": 0.2522, + "step": 1023 + }, + { + "epoch": 0.025912898246324367, + "grad_norm": 9.640121459960938, + "learning_rate": 9.993643074832669e-06, + "loss": 0.2491, + "step": 1024 + }, + { + "epoch": 0.025938203811018044, + "grad_norm": 10.272279739379883, + "learning_rate": 9.993622818037495e-06, + "loss": 0.2393, + "step": 1025 + }, + { + "epoch": 0.025963509375711718, + "grad_norm": 14.266480445861816, + "learning_rate": 9.993602529039378e-06, + "loss": 0.2182, + "step": 1026 + }, + { + "epoch": 0.025988814940405395, + "grad_norm": 8.321634292602539, + "learning_rate": 9.993582207838449e-06, + "loss": 0.1716, + "step": 1027 + }, + { + "epoch": 0.026014120505099072, + "grad_norm": 7.306949138641357, + "learning_rate": 9.993561854434835e-06, + "loss": 0.259, + "step": 1028 + }, + { + "epoch": 0.02603942606979275, + "grad_norm": 8.337875366210938, + "learning_rate": 9.99354146882867e-06, + "loss": 0.2068, + "step": 1029 + }, + { + "epoch": 0.026064731634486422, + "grad_norm": 10.189177513122559, + "learning_rate": 9.993521051020085e-06, + "loss": 0.2704, + "step": 1030 + }, + { + "epoch": 0.0260900371991801, + "grad_norm": 6.115238189697266, + "learning_rate": 9.993500601009212e-06, + "loss": 0.2405, + "step": 1031 + }, + { + "epoch": 0.026115342763873776, + "grad_norm": 7.494271278381348, + "learning_rate": 9.993480118796182e-06, + "loss": 0.1908, + "step": 1032 + }, + { + "epoch": 0.026140648328567453, + "grad_norm": 9.324810981750488, + "learning_rate": 9.993459604381126e-06, + "loss": 0.198, + "step": 1033 + }, + { + "epoch": 0.026165953893261127, + "grad_norm": 4.633622646331787, + "learning_rate": 9.993439057764181e-06, + "loss": 0.1485, + "step": 1034 + }, + { + "epoch": 0.026191259457954804, + "grad_norm": 4.877185821533203, + "learning_rate": 9.993418478945474e-06, + "loss": 0.1616, + "step": 1035 + }, + { + "epoch": 0.02621656502264848, + "grad_norm": 14.54901123046875, + "learning_rate": 9.99339786792514e-06, + "loss": 0.1876, + "step": 1036 + }, + { + "epoch": 0.026241870587342158, + "grad_norm": 7.066275596618652, + "learning_rate": 9.993377224703313e-06, + "loss": 0.2642, + "step": 1037 + }, + { + "epoch": 0.02626717615203583, + "grad_norm": 7.792837142944336, + "learning_rate": 9.993356549280125e-06, + "loss": 0.2311, + "step": 1038 + }, + { + "epoch": 0.02629248171672951, + "grad_norm": 10.295673370361328, + "learning_rate": 9.99333584165571e-06, + "loss": 0.238, + "step": 1039 + }, + { + "epoch": 0.026317787281423186, + "grad_norm": 8.77371597290039, + "learning_rate": 9.993315101830202e-06, + "loss": 0.276, + "step": 1040 + }, + { + "epoch": 0.026343092846116863, + "grad_norm": 9.137700080871582, + "learning_rate": 9.993294329803732e-06, + "loss": 0.251, + "step": 1041 + }, + { + "epoch": 0.026368398410810536, + "grad_norm": 15.865553855895996, + "learning_rate": 9.993273525576435e-06, + "loss": 0.3888, + "step": 1042 + }, + { + "epoch": 0.026393703975504213, + "grad_norm": 4.582302570343018, + "learning_rate": 9.993252689148445e-06, + "loss": 0.1658, + "step": 1043 + }, + { + "epoch": 0.02641900954019789, + "grad_norm": 5.388115406036377, + "learning_rate": 9.9932318205199e-06, + "loss": 0.2232, + "step": 1044 + }, + { + "epoch": 0.026444315104891567, + "grad_norm": 8.397674560546875, + "learning_rate": 9.99321091969093e-06, + "loss": 0.2238, + "step": 1045 + }, + { + "epoch": 0.02646962066958524, + "grad_norm": 17.742664337158203, + "learning_rate": 9.993189986661673e-06, + "loss": 0.3922, + "step": 1046 + }, + { + "epoch": 0.026494926234278918, + "grad_norm": 5.313774585723877, + "learning_rate": 9.993169021432263e-06, + "loss": 0.1643, + "step": 1047 + }, + { + "epoch": 0.026520231798972595, + "grad_norm": 14.735795021057129, + "learning_rate": 9.993148024002832e-06, + "loss": 0.2823, + "step": 1048 + }, + { + "epoch": 0.02654553736366627, + "grad_norm": 5.859001159667969, + "learning_rate": 9.993126994373519e-06, + "loss": 0.1755, + "step": 1049 + }, + { + "epoch": 0.026570842928359945, + "grad_norm": 4.939141750335693, + "learning_rate": 9.99310593254446e-06, + "loss": 0.1537, + "step": 1050 + }, + { + "epoch": 0.026596148493053622, + "grad_norm": 7.502933502197266, + "learning_rate": 9.99308483851579e-06, + "loss": 0.2071, + "step": 1051 + }, + { + "epoch": 0.0266214540577473, + "grad_norm": 13.71558952331543, + "learning_rate": 9.993063712287644e-06, + "loss": 0.2382, + "step": 1052 + }, + { + "epoch": 0.026646759622440976, + "grad_norm": 9.55109691619873, + "learning_rate": 9.993042553860159e-06, + "loss": 0.2445, + "step": 1053 + }, + { + "epoch": 0.02667206518713465, + "grad_norm": 11.365985870361328, + "learning_rate": 9.99302136323347e-06, + "loss": 0.2602, + "step": 1054 + }, + { + "epoch": 0.026697370751828327, + "grad_norm": 10.290191650390625, + "learning_rate": 9.993000140407718e-06, + "loss": 0.2483, + "step": 1055 + }, + { + "epoch": 0.026722676316522004, + "grad_norm": 5.144367218017578, + "learning_rate": 9.992978885383034e-06, + "loss": 0.1442, + "step": 1056 + }, + { + "epoch": 0.02674798188121568, + "grad_norm": 31.929555892944336, + "learning_rate": 9.992957598159558e-06, + "loss": 0.4429, + "step": 1057 + }, + { + "epoch": 0.026773287445909354, + "grad_norm": 16.54120635986328, + "learning_rate": 9.992936278737429e-06, + "loss": 0.337, + "step": 1058 + }, + { + "epoch": 0.02679859301060303, + "grad_norm": 22.254518508911133, + "learning_rate": 9.99291492711678e-06, + "loss": 0.2997, + "step": 1059 + }, + { + "epoch": 0.02682389857529671, + "grad_norm": 9.888360977172852, + "learning_rate": 9.992893543297755e-06, + "loss": 0.289, + "step": 1060 + }, + { + "epoch": 0.026849204139990385, + "grad_norm": 11.987272262573242, + "learning_rate": 9.992872127280486e-06, + "loss": 0.3104, + "step": 1061 + }, + { + "epoch": 0.02687450970468406, + "grad_norm": 21.664403915405273, + "learning_rate": 9.992850679065115e-06, + "loss": 0.3717, + "step": 1062 + }, + { + "epoch": 0.026899815269377736, + "grad_norm": 5.521574020385742, + "learning_rate": 9.992829198651777e-06, + "loss": 0.2095, + "step": 1063 + }, + { + "epoch": 0.026925120834071413, + "grad_norm": 8.22581672668457, + "learning_rate": 9.992807686040613e-06, + "loss": 0.2184, + "step": 1064 + }, + { + "epoch": 0.02695042639876509, + "grad_norm": 7.4022603034973145, + "learning_rate": 9.99278614123176e-06, + "loss": 0.2086, + "step": 1065 + }, + { + "epoch": 0.026975731963458763, + "grad_norm": 23.724979400634766, + "learning_rate": 9.99276456422536e-06, + "loss": 0.3585, + "step": 1066 + }, + { + "epoch": 0.02700103752815244, + "grad_norm": 12.813375473022461, + "learning_rate": 9.992742955021549e-06, + "loss": 0.2581, + "step": 1067 + }, + { + "epoch": 0.027026343092846117, + "grad_norm": 9.493589401245117, + "learning_rate": 9.992721313620467e-06, + "loss": 0.2259, + "step": 1068 + }, + { + "epoch": 0.027051648657539795, + "grad_norm": 5.811013221740723, + "learning_rate": 9.992699640022255e-06, + "loss": 0.1987, + "step": 1069 + }, + { + "epoch": 0.027076954222233468, + "grad_norm": 11.511190414428711, + "learning_rate": 9.992677934227051e-06, + "loss": 0.2761, + "step": 1070 + }, + { + "epoch": 0.027102259786927145, + "grad_norm": 4.3158721923828125, + "learning_rate": 9.992656196234996e-06, + "loss": 0.1815, + "step": 1071 + }, + { + "epoch": 0.027127565351620822, + "grad_norm": 16.763490676879883, + "learning_rate": 9.992634426046229e-06, + "loss": 0.284, + "step": 1072 + }, + { + "epoch": 0.0271528709163145, + "grad_norm": 12.127169609069824, + "learning_rate": 9.992612623660892e-06, + "loss": 0.3131, + "step": 1073 + }, + { + "epoch": 0.027178176481008173, + "grad_norm": 7.423439025878906, + "learning_rate": 9.992590789079123e-06, + "loss": 0.1742, + "step": 1074 + }, + { + "epoch": 0.02720348204570185, + "grad_norm": 8.270417213439941, + "learning_rate": 9.992568922301066e-06, + "loss": 0.2194, + "step": 1075 + }, + { + "epoch": 0.027228787610395527, + "grad_norm": 28.532487869262695, + "learning_rate": 9.992547023326862e-06, + "loss": 0.2649, + "step": 1076 + }, + { + "epoch": 0.027254093175089204, + "grad_norm": 9.43613052368164, + "learning_rate": 9.99252509215665e-06, + "loss": 0.2308, + "step": 1077 + }, + { + "epoch": 0.027279398739782877, + "grad_norm": 6.632373809814453, + "learning_rate": 9.992503128790573e-06, + "loss": 0.1444, + "step": 1078 + }, + { + "epoch": 0.027304704304476554, + "grad_norm": 8.507515907287598, + "learning_rate": 9.992481133228771e-06, + "loss": 0.2926, + "step": 1079 + }, + { + "epoch": 0.02733000986917023, + "grad_norm": 12.732542037963867, + "learning_rate": 9.992459105471388e-06, + "loss": 0.2699, + "step": 1080 + }, + { + "epoch": 0.027355315433863908, + "grad_norm": 7.937012195587158, + "learning_rate": 9.992437045518564e-06, + "loss": 0.2385, + "step": 1081 + }, + { + "epoch": 0.02738062099855758, + "grad_norm": 12.093399047851562, + "learning_rate": 9.992414953370442e-06, + "loss": 0.3231, + "step": 1082 + }, + { + "epoch": 0.02740592656325126, + "grad_norm": 7.248775959014893, + "learning_rate": 9.992392829027164e-06, + "loss": 0.1897, + "step": 1083 + }, + { + "epoch": 0.027431232127944936, + "grad_norm": 10.18093490600586, + "learning_rate": 9.992370672488877e-06, + "loss": 0.1748, + "step": 1084 + }, + { + "epoch": 0.027456537692638613, + "grad_norm": 11.659477233886719, + "learning_rate": 9.992348483755716e-06, + "loss": 0.2867, + "step": 1085 + }, + { + "epoch": 0.027481843257332286, + "grad_norm": 8.28809642791748, + "learning_rate": 9.992326262827829e-06, + "loss": 0.2653, + "step": 1086 + }, + { + "epoch": 0.027507148822025963, + "grad_norm": 9.663434028625488, + "learning_rate": 9.99230400970536e-06, + "loss": 0.247, + "step": 1087 + }, + { + "epoch": 0.02753245438671964, + "grad_norm": 6.849148750305176, + "learning_rate": 9.99228172438845e-06, + "loss": 0.214, + "step": 1088 + }, + { + "epoch": 0.027557759951413317, + "grad_norm": 12.179064750671387, + "learning_rate": 9.992259406877244e-06, + "loss": 0.2481, + "step": 1089 + }, + { + "epoch": 0.02758306551610699, + "grad_norm": 9.555424690246582, + "learning_rate": 9.992237057171885e-06, + "loss": 0.2613, + "step": 1090 + }, + { + "epoch": 0.027608371080800668, + "grad_norm": 7.850297927856445, + "learning_rate": 9.99221467527252e-06, + "loss": 0.2963, + "step": 1091 + }, + { + "epoch": 0.027633676645494345, + "grad_norm": 12.513320922851562, + "learning_rate": 9.99219226117929e-06, + "loss": 0.2605, + "step": 1092 + }, + { + "epoch": 0.027658982210188022, + "grad_norm": 5.939188003540039, + "learning_rate": 9.99216981489234e-06, + "loss": 0.1927, + "step": 1093 + }, + { + "epoch": 0.027684287774881695, + "grad_norm": 11.099597930908203, + "learning_rate": 9.992147336411816e-06, + "loss": 0.2679, + "step": 1094 + }, + { + "epoch": 0.027709593339575372, + "grad_norm": 12.53689956665039, + "learning_rate": 9.992124825737862e-06, + "loss": 0.3121, + "step": 1095 + }, + { + "epoch": 0.02773489890426905, + "grad_norm": 9.618605613708496, + "learning_rate": 9.992102282870625e-06, + "loss": 0.2317, + "step": 1096 + }, + { + "epoch": 0.027760204468962726, + "grad_norm": 11.95199966430664, + "learning_rate": 9.992079707810248e-06, + "loss": 0.2176, + "step": 1097 + }, + { + "epoch": 0.0277855100336564, + "grad_norm": 8.979182243347168, + "learning_rate": 9.992057100556878e-06, + "loss": 0.2727, + "step": 1098 + }, + { + "epoch": 0.027810815598350077, + "grad_norm": 6.396799087524414, + "learning_rate": 9.99203446111066e-06, + "loss": 0.2152, + "step": 1099 + }, + { + "epoch": 0.027836121163043754, + "grad_norm": 8.057876586914062, + "learning_rate": 9.992011789471741e-06, + "loss": 0.268, + "step": 1100 + }, + { + "epoch": 0.02786142672773743, + "grad_norm": 8.789265632629395, + "learning_rate": 9.991989085640264e-06, + "loss": 0.2547, + "step": 1101 + }, + { + "epoch": 0.027886732292431105, + "grad_norm": 13.180317878723145, + "learning_rate": 9.99196634961638e-06, + "loss": 0.2456, + "step": 1102 + }, + { + "epoch": 0.02791203785712478, + "grad_norm": 7.951696872711182, + "learning_rate": 9.991943581400235e-06, + "loss": 0.1745, + "step": 1103 + }, + { + "epoch": 0.02793734342181846, + "grad_norm": 9.288726806640625, + "learning_rate": 9.991920780991973e-06, + "loss": 0.2632, + "step": 1104 + }, + { + "epoch": 0.027962648986512136, + "grad_norm": 15.917115211486816, + "learning_rate": 9.991897948391742e-06, + "loss": 0.2914, + "step": 1105 + }, + { + "epoch": 0.02798795455120581, + "grad_norm": 7.853461742401123, + "learning_rate": 9.991875083599689e-06, + "loss": 0.2435, + "step": 1106 + }, + { + "epoch": 0.028013260115899486, + "grad_norm": 5.986690998077393, + "learning_rate": 9.991852186615963e-06, + "loss": 0.243, + "step": 1107 + }, + { + "epoch": 0.028038565680593163, + "grad_norm": 34.33723831176758, + "learning_rate": 9.991829257440712e-06, + "loss": 0.4466, + "step": 1108 + }, + { + "epoch": 0.02806387124528684, + "grad_norm": 27.88035774230957, + "learning_rate": 9.991806296074081e-06, + "loss": 0.4364, + "step": 1109 + }, + { + "epoch": 0.028089176809980514, + "grad_norm": 10.516851425170898, + "learning_rate": 9.99178330251622e-06, + "loss": 0.2233, + "step": 1110 + }, + { + "epoch": 0.02811448237467419, + "grad_norm": 10.738503456115723, + "learning_rate": 9.991760276767279e-06, + "loss": 0.2493, + "step": 1111 + }, + { + "epoch": 0.028139787939367868, + "grad_norm": 11.32003402709961, + "learning_rate": 9.991737218827402e-06, + "loss": 0.2779, + "step": 1112 + }, + { + "epoch": 0.028165093504061545, + "grad_norm": 25.972919464111328, + "learning_rate": 9.99171412869674e-06, + "loss": 0.3609, + "step": 1113 + }, + { + "epoch": 0.028190399068755218, + "grad_norm": 13.61380386352539, + "learning_rate": 9.991691006375444e-06, + "loss": 0.2613, + "step": 1114 + }, + { + "epoch": 0.028215704633448895, + "grad_norm": 9.327658653259277, + "learning_rate": 9.991667851863662e-06, + "loss": 0.2535, + "step": 1115 + }, + { + "epoch": 0.028241010198142572, + "grad_norm": 10.65168571472168, + "learning_rate": 9.99164466516154e-06, + "loss": 0.2947, + "step": 1116 + }, + { + "epoch": 0.02826631576283625, + "grad_norm": 18.45180320739746, + "learning_rate": 9.99162144626923e-06, + "loss": 0.1856, + "step": 1117 + }, + { + "epoch": 0.028291621327529923, + "grad_norm": 5.951102256774902, + "learning_rate": 9.991598195186885e-06, + "loss": 0.1806, + "step": 1118 + }, + { + "epoch": 0.0283169268922236, + "grad_norm": 6.606086254119873, + "learning_rate": 9.99157491191465e-06, + "loss": 0.1904, + "step": 1119 + }, + { + "epoch": 0.028342232456917277, + "grad_norm": 11.31915283203125, + "learning_rate": 9.991551596452675e-06, + "loss": 0.2585, + "step": 1120 + }, + { + "epoch": 0.028367538021610954, + "grad_norm": 7.125246524810791, + "learning_rate": 9.991528248801114e-06, + "loss": 0.1718, + "step": 1121 + }, + { + "epoch": 0.028392843586304627, + "grad_norm": 8.835041046142578, + "learning_rate": 9.991504868960116e-06, + "loss": 0.2456, + "step": 1122 + }, + { + "epoch": 0.028418149150998304, + "grad_norm": 10.373453140258789, + "learning_rate": 9.991481456929832e-06, + "loss": 0.2641, + "step": 1123 + }, + { + "epoch": 0.02844345471569198, + "grad_norm": 6.015039443969727, + "learning_rate": 9.991458012710411e-06, + "loss": 0.147, + "step": 1124 + }, + { + "epoch": 0.02846876028038566, + "grad_norm": 5.972106456756592, + "learning_rate": 9.991434536302008e-06, + "loss": 0.181, + "step": 1125 + }, + { + "epoch": 0.028494065845079332, + "grad_norm": 6.470486164093018, + "learning_rate": 9.99141102770477e-06, + "loss": 0.1807, + "step": 1126 + }, + { + "epoch": 0.02851937140977301, + "grad_norm": 9.092905044555664, + "learning_rate": 9.991387486918851e-06, + "loss": 0.1861, + "step": 1127 + }, + { + "epoch": 0.028544676974466686, + "grad_norm": 4.1856889724731445, + "learning_rate": 9.991363913944402e-06, + "loss": 0.1787, + "step": 1128 + }, + { + "epoch": 0.028569982539160363, + "grad_norm": 24.51045036315918, + "learning_rate": 9.991340308781578e-06, + "loss": 0.2416, + "step": 1129 + }, + { + "epoch": 0.028595288103854036, + "grad_norm": 15.598127365112305, + "learning_rate": 9.991316671430526e-06, + "loss": 0.2218, + "step": 1130 + }, + { + "epoch": 0.028620593668547713, + "grad_norm": 11.60816478729248, + "learning_rate": 9.991293001891404e-06, + "loss": 0.2368, + "step": 1131 + }, + { + "epoch": 0.02864589923324139, + "grad_norm": 8.696708679199219, + "learning_rate": 9.99126930016436e-06, + "loss": 0.2919, + "step": 1132 + }, + { + "epoch": 0.028671204797935067, + "grad_norm": 6.690793037414551, + "learning_rate": 9.99124556624955e-06, + "loss": 0.2538, + "step": 1133 + }, + { + "epoch": 0.02869651036262874, + "grad_norm": 6.8039445877075195, + "learning_rate": 9.991221800147123e-06, + "loss": 0.1679, + "step": 1134 + }, + { + "epoch": 0.028721815927322418, + "grad_norm": 12.858781814575195, + "learning_rate": 9.991198001857238e-06, + "loss": 0.186, + "step": 1135 + }, + { + "epoch": 0.028747121492016095, + "grad_norm": 15.230918884277344, + "learning_rate": 9.991174171380045e-06, + "loss": 0.3151, + "step": 1136 + }, + { + "epoch": 0.028772427056709772, + "grad_norm": 5.568515300750732, + "learning_rate": 9.991150308715698e-06, + "loss": 0.2484, + "step": 1137 + }, + { + "epoch": 0.028797732621403446, + "grad_norm": 5.076595783233643, + "learning_rate": 9.991126413864351e-06, + "loss": 0.2267, + "step": 1138 + }, + { + "epoch": 0.028823038186097123, + "grad_norm": 14.065799713134766, + "learning_rate": 9.99110248682616e-06, + "loss": 0.3073, + "step": 1139 + }, + { + "epoch": 0.0288483437507908, + "grad_norm": 15.60949993133545, + "learning_rate": 9.991078527601273e-06, + "loss": 0.2816, + "step": 1140 + }, + { + "epoch": 0.028873649315484477, + "grad_norm": 9.020654678344727, + "learning_rate": 9.991054536189852e-06, + "loss": 0.151, + "step": 1141 + }, + { + "epoch": 0.02889895488017815, + "grad_norm": 7.022347450256348, + "learning_rate": 9.991030512592048e-06, + "loss": 0.2179, + "step": 1142 + }, + { + "epoch": 0.028924260444871827, + "grad_norm": 6.268130779266357, + "learning_rate": 9.991006456808017e-06, + "loss": 0.254, + "step": 1143 + }, + { + "epoch": 0.028949566009565504, + "grad_norm": 19.92980194091797, + "learning_rate": 9.990982368837914e-06, + "loss": 0.3192, + "step": 1144 + }, + { + "epoch": 0.02897487157425918, + "grad_norm": 16.48398208618164, + "learning_rate": 9.990958248681894e-06, + "loss": 0.2417, + "step": 1145 + }, + { + "epoch": 0.029000177138952855, + "grad_norm": 16.6312198638916, + "learning_rate": 9.990934096340112e-06, + "loss": 0.2874, + "step": 1146 + }, + { + "epoch": 0.02902548270364653, + "grad_norm": 8.071006774902344, + "learning_rate": 9.990909911812725e-06, + "loss": 0.2262, + "step": 1147 + }, + { + "epoch": 0.02905078826834021, + "grad_norm": 6.459487438201904, + "learning_rate": 9.990885695099887e-06, + "loss": 0.181, + "step": 1148 + }, + { + "epoch": 0.029076093833033886, + "grad_norm": 9.887219429016113, + "learning_rate": 9.990861446201759e-06, + "loss": 0.2965, + "step": 1149 + }, + { + "epoch": 0.02910139939772756, + "grad_norm": 8.616032600402832, + "learning_rate": 9.990837165118489e-06, + "loss": 0.1336, + "step": 1150 + }, + { + "epoch": 0.029126704962421236, + "grad_norm": 10.099907875061035, + "learning_rate": 9.990812851850243e-06, + "loss": 0.2458, + "step": 1151 + }, + { + "epoch": 0.029152010527114913, + "grad_norm": 11.16508674621582, + "learning_rate": 9.990788506397172e-06, + "loss": 0.3135, + "step": 1152 + }, + { + "epoch": 0.02917731609180859, + "grad_norm": 10.284266471862793, + "learning_rate": 9.990764128759432e-06, + "loss": 0.175, + "step": 1153 + }, + { + "epoch": 0.029202621656502264, + "grad_norm": 5.579133987426758, + "learning_rate": 9.990739718937185e-06, + "loss": 0.2188, + "step": 1154 + }, + { + "epoch": 0.02922792722119594, + "grad_norm": 5.399722576141357, + "learning_rate": 9.990715276930585e-06, + "loss": 0.2038, + "step": 1155 + }, + { + "epoch": 0.029253232785889618, + "grad_norm": 17.397600173950195, + "learning_rate": 9.99069080273979e-06, + "loss": 0.249, + "step": 1156 + }, + { + "epoch": 0.029278538350583295, + "grad_norm": 5.927721977233887, + "learning_rate": 9.990666296364959e-06, + "loss": 0.1517, + "step": 1157 + }, + { + "epoch": 0.02930384391527697, + "grad_norm": 7.210341453552246, + "learning_rate": 9.99064175780625e-06, + "loss": 0.1836, + "step": 1158 + }, + { + "epoch": 0.029329149479970645, + "grad_norm": 47.364280700683594, + "learning_rate": 9.990617187063818e-06, + "loss": 0.2848, + "step": 1159 + }, + { + "epoch": 0.029354455044664322, + "grad_norm": 21.47719955444336, + "learning_rate": 9.990592584137827e-06, + "loss": 0.3045, + "step": 1160 + }, + { + "epoch": 0.029379760609358, + "grad_norm": 10.219146728515625, + "learning_rate": 9.990567949028432e-06, + "loss": 0.2424, + "step": 1161 + }, + { + "epoch": 0.029405066174051673, + "grad_norm": 12.525958061218262, + "learning_rate": 9.990543281735792e-06, + "loss": 0.2684, + "step": 1162 + }, + { + "epoch": 0.02943037173874535, + "grad_norm": 6.306692123413086, + "learning_rate": 9.990518582260066e-06, + "loss": 0.1901, + "step": 1163 + }, + { + "epoch": 0.029455677303439027, + "grad_norm": 9.311384201049805, + "learning_rate": 9.990493850601417e-06, + "loss": 0.1951, + "step": 1164 + }, + { + "epoch": 0.029480982868132704, + "grad_norm": 14.69596004486084, + "learning_rate": 9.99046908676e-06, + "loss": 0.1565, + "step": 1165 + }, + { + "epoch": 0.029506288432826377, + "grad_norm": 15.855645179748535, + "learning_rate": 9.990444290735974e-06, + "loss": 0.3416, + "step": 1166 + }, + { + "epoch": 0.029531593997520054, + "grad_norm": 8.424561500549316, + "learning_rate": 9.990419462529504e-06, + "loss": 0.2308, + "step": 1167 + }, + { + "epoch": 0.02955689956221373, + "grad_norm": 8.177172660827637, + "learning_rate": 9.990394602140745e-06, + "loss": 0.1605, + "step": 1168 + }, + { + "epoch": 0.02958220512690741, + "grad_norm": 6.786746501922607, + "learning_rate": 9.990369709569861e-06, + "loss": 0.2528, + "step": 1169 + }, + { + "epoch": 0.029607510691601082, + "grad_norm": 19.23757553100586, + "learning_rate": 9.99034478481701e-06, + "loss": 0.3586, + "step": 1170 + }, + { + "epoch": 0.02963281625629476, + "grad_norm": 5.3654303550720215, + "learning_rate": 9.990319827882354e-06, + "loss": 0.2275, + "step": 1171 + }, + { + "epoch": 0.029658121820988436, + "grad_norm": 7.167639255523682, + "learning_rate": 9.990294838766055e-06, + "loss": 0.2062, + "step": 1172 + }, + { + "epoch": 0.029683427385682113, + "grad_norm": 4.08692741394043, + "learning_rate": 9.99026981746827e-06, + "loss": 0.1902, + "step": 1173 + }, + { + "epoch": 0.029708732950375787, + "grad_norm": 11.021031379699707, + "learning_rate": 9.990244763989165e-06, + "loss": 0.1922, + "step": 1174 + }, + { + "epoch": 0.029734038515069464, + "grad_norm": 4.93212366104126, + "learning_rate": 9.9902196783289e-06, + "loss": 0.1471, + "step": 1175 + }, + { + "epoch": 0.02975934407976314, + "grad_norm": 11.776172637939453, + "learning_rate": 9.990194560487635e-06, + "loss": 0.2784, + "step": 1176 + }, + { + "epoch": 0.029784649644456818, + "grad_norm": 6.868964195251465, + "learning_rate": 9.990169410465537e-06, + "loss": 0.1306, + "step": 1177 + }, + { + "epoch": 0.02980995520915049, + "grad_norm": 12.267664909362793, + "learning_rate": 9.990144228262762e-06, + "loss": 0.2226, + "step": 1178 + }, + { + "epoch": 0.029835260773844168, + "grad_norm": 9.844961166381836, + "learning_rate": 9.990119013879475e-06, + "loss": 0.1853, + "step": 1179 + }, + { + "epoch": 0.029860566338537845, + "grad_norm": 14.313916206359863, + "learning_rate": 9.990093767315841e-06, + "loss": 0.2586, + "step": 1180 + }, + { + "epoch": 0.029885871903231522, + "grad_norm": 8.852276802062988, + "learning_rate": 9.990068488572018e-06, + "loss": 0.2375, + "step": 1181 + }, + { + "epoch": 0.029911177467925196, + "grad_norm": 20.04242706298828, + "learning_rate": 9.990043177648173e-06, + "loss": 0.369, + "step": 1182 + }, + { + "epoch": 0.029936483032618873, + "grad_norm": 13.441553115844727, + "learning_rate": 9.990017834544465e-06, + "loss": 0.1761, + "step": 1183 + }, + { + "epoch": 0.02996178859731255, + "grad_norm": 8.524361610412598, + "learning_rate": 9.989992459261065e-06, + "loss": 0.2851, + "step": 1184 + }, + { + "epoch": 0.029987094162006227, + "grad_norm": 14.868454933166504, + "learning_rate": 9.989967051798129e-06, + "loss": 0.3501, + "step": 1185 + }, + { + "epoch": 0.0300123997266999, + "grad_norm": 9.424944877624512, + "learning_rate": 9.989941612155823e-06, + "loss": 0.2444, + "step": 1186 + }, + { + "epoch": 0.030037705291393577, + "grad_norm": 17.53418731689453, + "learning_rate": 9.989916140334312e-06, + "loss": 0.5146, + "step": 1187 + }, + { + "epoch": 0.030063010856087254, + "grad_norm": 9.56047248840332, + "learning_rate": 9.98989063633376e-06, + "loss": 0.2909, + "step": 1188 + }, + { + "epoch": 0.03008831642078093, + "grad_norm": 11.486077308654785, + "learning_rate": 9.989865100154333e-06, + "loss": 0.341, + "step": 1189 + }, + { + "epoch": 0.030113621985474605, + "grad_norm": 11.636173248291016, + "learning_rate": 9.989839531796192e-06, + "loss": 0.2047, + "step": 1190 + }, + { + "epoch": 0.030138927550168282, + "grad_norm": 9.658227920532227, + "learning_rate": 9.989813931259506e-06, + "loss": 0.2892, + "step": 1191 + }, + { + "epoch": 0.03016423311486196, + "grad_norm": 7.299750328063965, + "learning_rate": 9.989788298544439e-06, + "loss": 0.2143, + "step": 1192 + }, + { + "epoch": 0.030189538679555636, + "grad_norm": 7.897207736968994, + "learning_rate": 9.989762633651151e-06, + "loss": 0.1852, + "step": 1193 + }, + { + "epoch": 0.03021484424424931, + "grad_norm": 7.398946762084961, + "learning_rate": 9.989736936579817e-06, + "loss": 0.2027, + "step": 1194 + }, + { + "epoch": 0.030240149808942986, + "grad_norm": 6.477351188659668, + "learning_rate": 9.989711207330594e-06, + "loss": 0.2634, + "step": 1195 + }, + { + "epoch": 0.030265455373636663, + "grad_norm": 4.936367988586426, + "learning_rate": 9.989685445903654e-06, + "loss": 0.248, + "step": 1196 + }, + { + "epoch": 0.03029076093833034, + "grad_norm": 7.694951057434082, + "learning_rate": 9.98965965229916e-06, + "loss": 0.2091, + "step": 1197 + }, + { + "epoch": 0.030316066503024014, + "grad_norm": 21.395530700683594, + "learning_rate": 9.98963382651728e-06, + "loss": 0.4135, + "step": 1198 + }, + { + "epoch": 0.03034137206771769, + "grad_norm": 3.750629186630249, + "learning_rate": 9.989607968558176e-06, + "loss": 0.1829, + "step": 1199 + }, + { + "epoch": 0.030366677632411368, + "grad_norm": 6.4741973876953125, + "learning_rate": 9.989582078422023e-06, + "loss": 0.1936, + "step": 1200 + }, + { + "epoch": 0.030391983197105045, + "grad_norm": 4.439853191375732, + "learning_rate": 9.98955615610898e-06, + "loss": 0.1608, + "step": 1201 + }, + { + "epoch": 0.03041728876179872, + "grad_norm": 22.709091186523438, + "learning_rate": 9.989530201619219e-06, + "loss": 0.1856, + "step": 1202 + }, + { + "epoch": 0.030442594326492396, + "grad_norm": 11.605212211608887, + "learning_rate": 9.989504214952905e-06, + "loss": 0.3531, + "step": 1203 + }, + { + "epoch": 0.030467899891186073, + "grad_norm": 11.50270938873291, + "learning_rate": 9.989478196110207e-06, + "loss": 0.2797, + "step": 1204 + }, + { + "epoch": 0.03049320545587975, + "grad_norm": 14.029168128967285, + "learning_rate": 9.989452145091293e-06, + "loss": 0.3451, + "step": 1205 + }, + { + "epoch": 0.030518511020573423, + "grad_norm": 9.868197441101074, + "learning_rate": 9.989426061896329e-06, + "loss": 0.2116, + "step": 1206 + }, + { + "epoch": 0.0305438165852671, + "grad_norm": 8.994102478027344, + "learning_rate": 9.989399946525485e-06, + "loss": 0.1937, + "step": 1207 + }, + { + "epoch": 0.030569122149960777, + "grad_norm": 9.866095542907715, + "learning_rate": 9.989373798978927e-06, + "loss": 0.2854, + "step": 1208 + }, + { + "epoch": 0.030594427714654454, + "grad_norm": 7.472931385040283, + "learning_rate": 9.989347619256827e-06, + "loss": 0.255, + "step": 1209 + }, + { + "epoch": 0.030619733279348128, + "grad_norm": 5.980502128601074, + "learning_rate": 9.989321407359353e-06, + "loss": 0.1695, + "step": 1210 + }, + { + "epoch": 0.030645038844041805, + "grad_norm": 8.031277656555176, + "learning_rate": 9.989295163286672e-06, + "loss": 0.2253, + "step": 1211 + }, + { + "epoch": 0.03067034440873548, + "grad_norm": 7.306782245635986, + "learning_rate": 9.989268887038956e-06, + "loss": 0.2702, + "step": 1212 + }, + { + "epoch": 0.03069564997342916, + "grad_norm": 23.86313819885254, + "learning_rate": 9.989242578616372e-06, + "loss": 0.2865, + "step": 1213 + }, + { + "epoch": 0.030720955538122832, + "grad_norm": 5.9522600173950195, + "learning_rate": 9.98921623801909e-06, + "loss": 0.2322, + "step": 1214 + }, + { + "epoch": 0.03074626110281651, + "grad_norm": 7.473076820373535, + "learning_rate": 9.989189865247282e-06, + "loss": 0.1957, + "step": 1215 + }, + { + "epoch": 0.030771566667510186, + "grad_norm": 7.107340335845947, + "learning_rate": 9.989163460301117e-06, + "loss": 0.2558, + "step": 1216 + }, + { + "epoch": 0.030796872232203863, + "grad_norm": 5.313264846801758, + "learning_rate": 9.989137023180764e-06, + "loss": 0.2225, + "step": 1217 + }, + { + "epoch": 0.030822177796897537, + "grad_norm": 10.440020561218262, + "learning_rate": 9.989110553886394e-06, + "loss": 0.2045, + "step": 1218 + }, + { + "epoch": 0.030847483361591214, + "grad_norm": 6.837976455688477, + "learning_rate": 9.989084052418178e-06, + "loss": 0.134, + "step": 1219 + }, + { + "epoch": 0.03087278892628489, + "grad_norm": 24.822547912597656, + "learning_rate": 9.989057518776287e-06, + "loss": 0.4207, + "step": 1220 + }, + { + "epoch": 0.030898094490978568, + "grad_norm": 6.037609577178955, + "learning_rate": 9.989030952960893e-06, + "loss": 0.2222, + "step": 1221 + }, + { + "epoch": 0.03092340005567224, + "grad_norm": 14.609480857849121, + "learning_rate": 9.989004354972167e-06, + "loss": 0.3271, + "step": 1222 + }, + { + "epoch": 0.03094870562036592, + "grad_norm": 6.95751953125, + "learning_rate": 9.988977724810278e-06, + "loss": 0.263, + "step": 1223 + }, + { + "epoch": 0.030974011185059595, + "grad_norm": 8.421700477600098, + "learning_rate": 9.9889510624754e-06, + "loss": 0.1739, + "step": 1224 + }, + { + "epoch": 0.030999316749753272, + "grad_norm": 13.001262664794922, + "learning_rate": 9.988924367967705e-06, + "loss": 0.2643, + "step": 1225 + }, + { + "epoch": 0.031024622314446946, + "grad_norm": 18.846376419067383, + "learning_rate": 9.988897641287366e-06, + "loss": 0.3401, + "step": 1226 + }, + { + "epoch": 0.031049927879140623, + "grad_norm": 5.498992919921875, + "learning_rate": 9.988870882434552e-06, + "loss": 0.1385, + "step": 1227 + }, + { + "epoch": 0.0310752334438343, + "grad_norm": 13.780608177185059, + "learning_rate": 9.98884409140944e-06, + "loss": 0.2813, + "step": 1228 + }, + { + "epoch": 0.031100539008527977, + "grad_norm": 6.040072441101074, + "learning_rate": 9.988817268212198e-06, + "loss": 0.2373, + "step": 1229 + }, + { + "epoch": 0.03112584457322165, + "grad_norm": 7.5933518409729, + "learning_rate": 9.988790412843004e-06, + "loss": 0.2751, + "step": 1230 + }, + { + "epoch": 0.031151150137915327, + "grad_norm": 7.514369964599609, + "learning_rate": 9.988763525302026e-06, + "loss": 0.2125, + "step": 1231 + }, + { + "epoch": 0.031176455702609004, + "grad_norm": 20.091888427734375, + "learning_rate": 9.988736605589441e-06, + "loss": 0.3395, + "step": 1232 + }, + { + "epoch": 0.03120176126730268, + "grad_norm": 13.354718208312988, + "learning_rate": 9.988709653705422e-06, + "loss": 0.3517, + "step": 1233 + }, + { + "epoch": 0.031227066831996355, + "grad_norm": 7.858821868896484, + "learning_rate": 9.988682669650142e-06, + "loss": 0.201, + "step": 1234 + }, + { + "epoch": 0.031252372396690035, + "grad_norm": 10.0390625, + "learning_rate": 9.988655653423777e-06, + "loss": 0.2384, + "step": 1235 + }, + { + "epoch": 0.031277677961383706, + "grad_norm": 5.575423717498779, + "learning_rate": 9.988628605026498e-06, + "loss": 0.1529, + "step": 1236 + }, + { + "epoch": 0.03130298352607738, + "grad_norm": 4.569855213165283, + "learning_rate": 9.988601524458482e-06, + "loss": 0.0837, + "step": 1237 + }, + { + "epoch": 0.03132828909077106, + "grad_norm": 6.192471981048584, + "learning_rate": 9.988574411719902e-06, + "loss": 0.2281, + "step": 1238 + }, + { + "epoch": 0.03135359465546474, + "grad_norm": 15.639558792114258, + "learning_rate": 9.988547266810933e-06, + "loss": 0.3065, + "step": 1239 + }, + { + "epoch": 0.031378900220158414, + "grad_norm": 6.032498836517334, + "learning_rate": 9.98852008973175e-06, + "loss": 0.1698, + "step": 1240 + }, + { + "epoch": 0.03140420578485209, + "grad_norm": 7.883952617645264, + "learning_rate": 9.988492880482532e-06, + "loss": 0.1586, + "step": 1241 + }, + { + "epoch": 0.03142951134954577, + "grad_norm": 31.0389404296875, + "learning_rate": 9.98846563906345e-06, + "loss": 0.2935, + "step": 1242 + }, + { + "epoch": 0.031454816914239445, + "grad_norm": 21.123031616210938, + "learning_rate": 9.988438365474679e-06, + "loss": 0.4235, + "step": 1243 + }, + { + "epoch": 0.031480122478933115, + "grad_norm": 10.262578010559082, + "learning_rate": 9.988411059716398e-06, + "loss": 0.2989, + "step": 1244 + }, + { + "epoch": 0.03150542804362679, + "grad_norm": 16.28196907043457, + "learning_rate": 9.988383721788783e-06, + "loss": 0.3659, + "step": 1245 + }, + { + "epoch": 0.03153073360832047, + "grad_norm": 6.517911911010742, + "learning_rate": 9.98835635169201e-06, + "loss": 0.246, + "step": 1246 + }, + { + "epoch": 0.031556039173014146, + "grad_norm": 13.157851219177246, + "learning_rate": 9.988328949426253e-06, + "loss": 0.21, + "step": 1247 + }, + { + "epoch": 0.03158134473770782, + "grad_norm": 8.217767715454102, + "learning_rate": 9.988301514991692e-06, + "loss": 0.2859, + "step": 1248 + }, + { + "epoch": 0.0316066503024015, + "grad_norm": 7.394360542297363, + "learning_rate": 9.9882740483885e-06, + "loss": 0.1565, + "step": 1249 + }, + { + "epoch": 0.03163195586709518, + "grad_norm": 10.878229141235352, + "learning_rate": 9.988246549616858e-06, + "loss": 0.2027, + "step": 1250 + }, + { + "epoch": 0.031657261431788854, + "grad_norm": 8.4940824508667, + "learning_rate": 9.988219018676942e-06, + "loss": 0.2089, + "step": 1251 + }, + { + "epoch": 0.031682566996482524, + "grad_norm": 5.844663619995117, + "learning_rate": 9.98819145556893e-06, + "loss": 0.2422, + "step": 1252 + }, + { + "epoch": 0.0317078725611762, + "grad_norm": 10.269329071044922, + "learning_rate": 9.988163860292998e-06, + "loss": 0.2553, + "step": 1253 + }, + { + "epoch": 0.03173317812586988, + "grad_norm": 12.672540664672852, + "learning_rate": 9.988136232849325e-06, + "loss": 0.2839, + "step": 1254 + }, + { + "epoch": 0.031758483690563555, + "grad_norm": 6.547438144683838, + "learning_rate": 9.98810857323809e-06, + "loss": 0.2176, + "step": 1255 + }, + { + "epoch": 0.03178378925525723, + "grad_norm": 8.117012023925781, + "learning_rate": 9.988080881459472e-06, + "loss": 0.1966, + "step": 1256 + }, + { + "epoch": 0.03180909481995091, + "grad_norm": 10.225984573364258, + "learning_rate": 9.988053157513647e-06, + "loss": 0.1843, + "step": 1257 + }, + { + "epoch": 0.031834400384644586, + "grad_norm": 7.535458087921143, + "learning_rate": 9.988025401400794e-06, + "loss": 0.2075, + "step": 1258 + }, + { + "epoch": 0.03185970594933826, + "grad_norm": 7.630176544189453, + "learning_rate": 9.987997613121094e-06, + "loss": 0.2008, + "step": 1259 + }, + { + "epoch": 0.03188501151403193, + "grad_norm": 9.85926628112793, + "learning_rate": 9.987969792674725e-06, + "loss": 0.2721, + "step": 1260 + }, + { + "epoch": 0.03191031707872561, + "grad_norm": 6.298895835876465, + "learning_rate": 9.987941940061868e-06, + "loss": 0.2459, + "step": 1261 + }, + { + "epoch": 0.03193562264341929, + "grad_norm": 9.272953987121582, + "learning_rate": 9.9879140552827e-06, + "loss": 0.2303, + "step": 1262 + }, + { + "epoch": 0.031960928208112964, + "grad_norm": 6.692534446716309, + "learning_rate": 9.987886138337402e-06, + "loss": 0.1501, + "step": 1263 + }, + { + "epoch": 0.03198623377280664, + "grad_norm": 7.93616247177124, + "learning_rate": 9.987858189226154e-06, + "loss": 0.2437, + "step": 1264 + }, + { + "epoch": 0.03201153933750032, + "grad_norm": 14.158751487731934, + "learning_rate": 9.987830207949136e-06, + "loss": 0.3267, + "step": 1265 + }, + { + "epoch": 0.032036844902193995, + "grad_norm": 10.417723655700684, + "learning_rate": 9.98780219450653e-06, + "loss": 0.2882, + "step": 1266 + }, + { + "epoch": 0.03206215046688767, + "grad_norm": 29.236480712890625, + "learning_rate": 9.987774148898514e-06, + "loss": 0.4615, + "step": 1267 + }, + { + "epoch": 0.03208745603158134, + "grad_norm": 6.092208385467529, + "learning_rate": 9.987746071125272e-06, + "loss": 0.2156, + "step": 1268 + }, + { + "epoch": 0.03211276159627502, + "grad_norm": 7.366957187652588, + "learning_rate": 9.987717961186982e-06, + "loss": 0.1555, + "step": 1269 + }, + { + "epoch": 0.032138067160968696, + "grad_norm": 10.71274185180664, + "learning_rate": 9.987689819083827e-06, + "loss": 0.1612, + "step": 1270 + }, + { + "epoch": 0.03216337272566237, + "grad_norm": 10.310040473937988, + "learning_rate": 9.987661644815989e-06, + "loss": 0.225, + "step": 1271 + }, + { + "epoch": 0.03218867829035605, + "grad_norm": 18.56273651123047, + "learning_rate": 9.987633438383648e-06, + "loss": 0.299, + "step": 1272 + }, + { + "epoch": 0.03221398385504973, + "grad_norm": 13.157346725463867, + "learning_rate": 9.987605199786987e-06, + "loss": 0.3586, + "step": 1273 + }, + { + "epoch": 0.032239289419743404, + "grad_norm": 9.640522956848145, + "learning_rate": 9.987576929026189e-06, + "loss": 0.2533, + "step": 1274 + }, + { + "epoch": 0.03226459498443708, + "grad_norm": 4.506862163543701, + "learning_rate": 9.987548626101434e-06, + "loss": 0.197, + "step": 1275 + }, + { + "epoch": 0.03228990054913075, + "grad_norm": 9.070507049560547, + "learning_rate": 9.987520291012904e-06, + "loss": 0.1726, + "step": 1276 + }, + { + "epoch": 0.03231520611382443, + "grad_norm": 6.770977020263672, + "learning_rate": 9.987491923760786e-06, + "loss": 0.2788, + "step": 1277 + }, + { + "epoch": 0.032340511678518105, + "grad_norm": 8.1525239944458, + "learning_rate": 9.987463524345257e-06, + "loss": 0.2547, + "step": 1278 + }, + { + "epoch": 0.03236581724321178, + "grad_norm": 6.893962383270264, + "learning_rate": 9.987435092766506e-06, + "loss": 0.1962, + "step": 1279 + }, + { + "epoch": 0.03239112280790546, + "grad_norm": 7.285104751586914, + "learning_rate": 9.987406629024714e-06, + "loss": 0.2586, + "step": 1280 + }, + { + "epoch": 0.032416428372599136, + "grad_norm": 17.881696701049805, + "learning_rate": 9.987378133120064e-06, + "loss": 0.2754, + "step": 1281 + }, + { + "epoch": 0.03244173393729281, + "grad_norm": 6.155219554901123, + "learning_rate": 9.98734960505274e-06, + "loss": 0.1871, + "step": 1282 + }, + { + "epoch": 0.03246703950198649, + "grad_norm": 5.110387325286865, + "learning_rate": 9.987321044822926e-06, + "loss": 0.1788, + "step": 1283 + }, + { + "epoch": 0.03249234506668016, + "grad_norm": 15.275229454040527, + "learning_rate": 9.987292452430807e-06, + "loss": 0.3533, + "step": 1284 + }, + { + "epoch": 0.03251765063137384, + "grad_norm": 15.525352478027344, + "learning_rate": 9.987263827876566e-06, + "loss": 0.4018, + "step": 1285 + }, + { + "epoch": 0.032542956196067514, + "grad_norm": 13.6869478225708, + "learning_rate": 9.987235171160389e-06, + "loss": 0.2764, + "step": 1286 + }, + { + "epoch": 0.03256826176076119, + "grad_norm": 8.165386199951172, + "learning_rate": 9.987206482282459e-06, + "loss": 0.2235, + "step": 1287 + }, + { + "epoch": 0.03259356732545487, + "grad_norm": 7.906418800354004, + "learning_rate": 9.987177761242964e-06, + "loss": 0.1814, + "step": 1288 + }, + { + "epoch": 0.032618872890148545, + "grad_norm": 9.657983779907227, + "learning_rate": 9.987149008042085e-06, + "loss": 0.2145, + "step": 1289 + }, + { + "epoch": 0.03264417845484222, + "grad_norm": 7.721264839172363, + "learning_rate": 9.987120222680011e-06, + "loss": 0.2923, + "step": 1290 + }, + { + "epoch": 0.0326694840195359, + "grad_norm": 6.837065696716309, + "learning_rate": 9.987091405156926e-06, + "loss": 0.3102, + "step": 1291 + }, + { + "epoch": 0.03269478958422957, + "grad_norm": 7.47155237197876, + "learning_rate": 9.987062555473017e-06, + "loss": 0.173, + "step": 1292 + }, + { + "epoch": 0.032720095148923246, + "grad_norm": 18.669198989868164, + "learning_rate": 9.987033673628469e-06, + "loss": 0.3559, + "step": 1293 + }, + { + "epoch": 0.03274540071361692, + "grad_norm": 7.468612194061279, + "learning_rate": 9.987004759623467e-06, + "loss": 0.1517, + "step": 1294 + }, + { + "epoch": 0.0327707062783106, + "grad_norm": 10.674951553344727, + "learning_rate": 9.986975813458199e-06, + "loss": 0.2574, + "step": 1295 + }, + { + "epoch": 0.03279601184300428, + "grad_norm": 9.749489784240723, + "learning_rate": 9.986946835132853e-06, + "loss": 0.2862, + "step": 1296 + }, + { + "epoch": 0.032821317407697954, + "grad_norm": 6.991147994995117, + "learning_rate": 9.986917824647614e-06, + "loss": 0.2115, + "step": 1297 + }, + { + "epoch": 0.03284662297239163, + "grad_norm": 8.445427894592285, + "learning_rate": 9.98688878200267e-06, + "loss": 0.2155, + "step": 1298 + }, + { + "epoch": 0.03287192853708531, + "grad_norm": 7.80535888671875, + "learning_rate": 9.986859707198206e-06, + "loss": 0.2438, + "step": 1299 + }, + { + "epoch": 0.03289723410177898, + "grad_norm": 9.2847318649292, + "learning_rate": 9.986830600234414e-06, + "loss": 0.2515, + "step": 1300 + }, + { + "epoch": 0.032922539666472656, + "grad_norm": 10.855546951293945, + "learning_rate": 9.986801461111477e-06, + "loss": 0.2442, + "step": 1301 + }, + { + "epoch": 0.03294784523116633, + "grad_norm": 7.705623626708984, + "learning_rate": 9.986772289829588e-06, + "loss": 0.2505, + "step": 1302 + }, + { + "epoch": 0.03297315079586001, + "grad_norm": 7.8979082107543945, + "learning_rate": 9.986743086388927e-06, + "loss": 0.1942, + "step": 1303 + }, + { + "epoch": 0.032998456360553687, + "grad_norm": 6.657605171203613, + "learning_rate": 9.986713850789692e-06, + "loss": 0.1418, + "step": 1304 + }, + { + "epoch": 0.033023761925247364, + "grad_norm": 7.169976711273193, + "learning_rate": 9.986684583032066e-06, + "loss": 0.2614, + "step": 1305 + }, + { + "epoch": 0.03304906748994104, + "grad_norm": 17.306175231933594, + "learning_rate": 9.986655283116238e-06, + "loss": 0.243, + "step": 1306 + }, + { + "epoch": 0.03307437305463472, + "grad_norm": 6.467677593231201, + "learning_rate": 9.986625951042398e-06, + "loss": 0.19, + "step": 1307 + }, + { + "epoch": 0.03309967861932839, + "grad_norm": 12.194293975830078, + "learning_rate": 9.986596586810735e-06, + "loss": 0.3422, + "step": 1308 + }, + { + "epoch": 0.033124984184022065, + "grad_norm": 11.134478569030762, + "learning_rate": 9.986567190421436e-06, + "loss": 0.2102, + "step": 1309 + }, + { + "epoch": 0.03315028974871574, + "grad_norm": 9.906388282775879, + "learning_rate": 9.986537761874695e-06, + "loss": 0.1344, + "step": 1310 + }, + { + "epoch": 0.03317559531340942, + "grad_norm": 12.372749328613281, + "learning_rate": 9.986508301170701e-06, + "loss": 0.2449, + "step": 1311 + }, + { + "epoch": 0.033200900878103096, + "grad_norm": 13.783278465270996, + "learning_rate": 9.98647880830964e-06, + "loss": 0.261, + "step": 1312 + }, + { + "epoch": 0.03322620644279677, + "grad_norm": 7.908647060394287, + "learning_rate": 9.986449283291706e-06, + "loss": 0.2949, + "step": 1313 + }, + { + "epoch": 0.03325151200749045, + "grad_norm": 17.142656326293945, + "learning_rate": 9.986419726117087e-06, + "loss": 0.3657, + "step": 1314 + }, + { + "epoch": 0.03327681757218413, + "grad_norm": 10.843513488769531, + "learning_rate": 9.986390136785976e-06, + "loss": 0.2573, + "step": 1315 + }, + { + "epoch": 0.0333021231368778, + "grad_norm": 6.3763322830200195, + "learning_rate": 9.98636051529856e-06, + "loss": 0.2151, + "step": 1316 + }, + { + "epoch": 0.033327428701571474, + "grad_norm": 12.444320678710938, + "learning_rate": 9.986330861655036e-06, + "loss": 0.2478, + "step": 1317 + }, + { + "epoch": 0.03335273426626515, + "grad_norm": 13.14673900604248, + "learning_rate": 9.986301175855589e-06, + "loss": 0.2464, + "step": 1318 + }, + { + "epoch": 0.03337803983095883, + "grad_norm": 7.331721782684326, + "learning_rate": 9.986271457900414e-06, + "loss": 0.222, + "step": 1319 + }, + { + "epoch": 0.033403345395652505, + "grad_norm": 9.920171737670898, + "learning_rate": 9.986241707789704e-06, + "loss": 0.3117, + "step": 1320 + }, + { + "epoch": 0.03342865096034618, + "grad_norm": 12.816062927246094, + "learning_rate": 9.986211925523646e-06, + "loss": 0.2872, + "step": 1321 + }, + { + "epoch": 0.03345395652503986, + "grad_norm": 9.204841613769531, + "learning_rate": 9.986182111102435e-06, + "loss": 0.2083, + "step": 1322 + }, + { + "epoch": 0.033479262089733536, + "grad_norm": 8.765844345092773, + "learning_rate": 9.986152264526265e-06, + "loss": 0.3205, + "step": 1323 + }, + { + "epoch": 0.033504567654427206, + "grad_norm": 16.368629455566406, + "learning_rate": 9.986122385795324e-06, + "loss": 0.2594, + "step": 1324 + }, + { + "epoch": 0.03352987321912088, + "grad_norm": 9.205881118774414, + "learning_rate": 9.98609247490981e-06, + "loss": 0.2424, + "step": 1325 + }, + { + "epoch": 0.03355517878381456, + "grad_norm": 8.067310333251953, + "learning_rate": 9.986062531869913e-06, + "loss": 0.2538, + "step": 1326 + }, + { + "epoch": 0.03358048434850824, + "grad_norm": 5.866289138793945, + "learning_rate": 9.986032556675824e-06, + "loss": 0.1013, + "step": 1327 + }, + { + "epoch": 0.033605789913201914, + "grad_norm": 5.355849742889404, + "learning_rate": 9.98600254932774e-06, + "loss": 0.1805, + "step": 1328 + }, + { + "epoch": 0.03363109547789559, + "grad_norm": 10.503355026245117, + "learning_rate": 9.985972509825852e-06, + "loss": 0.1981, + "step": 1329 + }, + { + "epoch": 0.03365640104258927, + "grad_norm": 7.131343841552734, + "learning_rate": 9.985942438170356e-06, + "loss": 0.2271, + "step": 1330 + }, + { + "epoch": 0.033681706607282945, + "grad_norm": 10.514835357666016, + "learning_rate": 9.985912334361445e-06, + "loss": 0.2107, + "step": 1331 + }, + { + "epoch": 0.033707012171976615, + "grad_norm": 10.620448112487793, + "learning_rate": 9.98588219839931e-06, + "loss": 0.1646, + "step": 1332 + }, + { + "epoch": 0.03373231773667029, + "grad_norm": 9.777322769165039, + "learning_rate": 9.985852030284151e-06, + "loss": 0.2149, + "step": 1333 + }, + { + "epoch": 0.03375762330136397, + "grad_norm": 11.605814933776855, + "learning_rate": 9.985821830016158e-06, + "loss": 0.3253, + "step": 1334 + }, + { + "epoch": 0.033782928866057646, + "grad_norm": 16.043642044067383, + "learning_rate": 9.98579159759553e-06, + "loss": 0.3976, + "step": 1335 + }, + { + "epoch": 0.03380823443075132, + "grad_norm": 7.655010223388672, + "learning_rate": 9.985761333022455e-06, + "loss": 0.1793, + "step": 1336 + }, + { + "epoch": 0.033833539995445, + "grad_norm": 7.499018669128418, + "learning_rate": 9.985731036297135e-06, + "loss": 0.2235, + "step": 1337 + }, + { + "epoch": 0.03385884556013868, + "grad_norm": 12.229300498962402, + "learning_rate": 9.985700707419763e-06, + "loss": 0.1799, + "step": 1338 + }, + { + "epoch": 0.033884151124832354, + "grad_norm": 8.786460876464844, + "learning_rate": 9.985670346390535e-06, + "loss": 0.2535, + "step": 1339 + }, + { + "epoch": 0.033909456689526024, + "grad_norm": 8.464448928833008, + "learning_rate": 9.985639953209647e-06, + "loss": 0.2011, + "step": 1340 + }, + { + "epoch": 0.0339347622542197, + "grad_norm": 10.678413391113281, + "learning_rate": 9.985609527877292e-06, + "loss": 0.161, + "step": 1341 + }, + { + "epoch": 0.03396006781891338, + "grad_norm": 14.335681915283203, + "learning_rate": 9.98557907039367e-06, + "loss": 0.21, + "step": 1342 + }, + { + "epoch": 0.033985373383607055, + "grad_norm": 11.102860450744629, + "learning_rate": 9.985548580758976e-06, + "loss": 0.338, + "step": 1343 + }, + { + "epoch": 0.03401067894830073, + "grad_norm": 4.0775041580200195, + "learning_rate": 9.985518058973407e-06, + "loss": 0.1376, + "step": 1344 + }, + { + "epoch": 0.03403598451299441, + "grad_norm": 9.528867721557617, + "learning_rate": 9.98548750503716e-06, + "loss": 0.3013, + "step": 1345 + }, + { + "epoch": 0.034061290077688086, + "grad_norm": 5.981616973876953, + "learning_rate": 9.98545691895043e-06, + "loss": 0.1742, + "step": 1346 + }, + { + "epoch": 0.03408659564238176, + "grad_norm": 5.592037200927734, + "learning_rate": 9.985426300713414e-06, + "loss": 0.2162, + "step": 1347 + }, + { + "epoch": 0.03411190120707543, + "grad_norm": 9.026970863342285, + "learning_rate": 9.985395650326314e-06, + "loss": 0.2284, + "step": 1348 + }, + { + "epoch": 0.03413720677176911, + "grad_norm": 14.252121925354004, + "learning_rate": 9.985364967789324e-06, + "loss": 0.2389, + "step": 1349 + }, + { + "epoch": 0.03416251233646279, + "grad_norm": 16.50203514099121, + "learning_rate": 9.985334253102642e-06, + "loss": 0.4295, + "step": 1350 + }, + { + "epoch": 0.034187817901156464, + "grad_norm": 7.258693695068359, + "learning_rate": 9.985303506266469e-06, + "loss": 0.1736, + "step": 1351 + }, + { + "epoch": 0.03421312346585014, + "grad_norm": 6.151707649230957, + "learning_rate": 9.985272727280999e-06, + "loss": 0.1046, + "step": 1352 + }, + { + "epoch": 0.03423842903054382, + "grad_norm": 6.1153459548950195, + "learning_rate": 9.985241916146432e-06, + "loss": 0.1876, + "step": 1353 + }, + { + "epoch": 0.034263734595237495, + "grad_norm": 5.94398307800293, + "learning_rate": 9.985211072862967e-06, + "loss": 0.2186, + "step": 1354 + }, + { + "epoch": 0.03428904015993117, + "grad_norm": 8.019396781921387, + "learning_rate": 9.985180197430804e-06, + "loss": 0.3087, + "step": 1355 + }, + { + "epoch": 0.03431434572462484, + "grad_norm": 9.223150253295898, + "learning_rate": 9.985149289850142e-06, + "loss": 0.2588, + "step": 1356 + }, + { + "epoch": 0.03433965128931852, + "grad_norm": 7.781163215637207, + "learning_rate": 9.985118350121178e-06, + "loss": 0.2255, + "step": 1357 + }, + { + "epoch": 0.034364956854012196, + "grad_norm": 10.828813552856445, + "learning_rate": 9.985087378244116e-06, + "loss": 0.2737, + "step": 1358 + }, + { + "epoch": 0.03439026241870587, + "grad_norm": 3.0584287643432617, + "learning_rate": 9.985056374219151e-06, + "loss": 0.1817, + "step": 1359 + }, + { + "epoch": 0.03441556798339955, + "grad_norm": 5.679318904876709, + "learning_rate": 9.985025338046484e-06, + "loss": 0.2745, + "step": 1360 + }, + { + "epoch": 0.03444087354809323, + "grad_norm": 8.366219520568848, + "learning_rate": 9.984994269726316e-06, + "loss": 0.2756, + "step": 1361 + }, + { + "epoch": 0.034466179112786904, + "grad_norm": 7.790300369262695, + "learning_rate": 9.984963169258847e-06, + "loss": 0.2215, + "step": 1362 + }, + { + "epoch": 0.03449148467748058, + "grad_norm": 13.407808303833008, + "learning_rate": 9.98493203664428e-06, + "loss": 0.3098, + "step": 1363 + }, + { + "epoch": 0.03451679024217425, + "grad_norm": 11.461901664733887, + "learning_rate": 9.984900871882812e-06, + "loss": 0.201, + "step": 1364 + }, + { + "epoch": 0.03454209580686793, + "grad_norm": 6.900326251983643, + "learning_rate": 9.984869674974645e-06, + "loss": 0.1914, + "step": 1365 + }, + { + "epoch": 0.034567401371561605, + "grad_norm": 7.479091167449951, + "learning_rate": 9.98483844591998e-06, + "loss": 0.2352, + "step": 1366 + }, + { + "epoch": 0.03459270693625528, + "grad_norm": 6.57506799697876, + "learning_rate": 9.984807184719021e-06, + "loss": 0.2176, + "step": 1367 + }, + { + "epoch": 0.03461801250094896, + "grad_norm": 10.710339546203613, + "learning_rate": 9.984775891371967e-06, + "loss": 0.3246, + "step": 1368 + }, + { + "epoch": 0.034643318065642636, + "grad_norm": 7.168102741241455, + "learning_rate": 9.98474456587902e-06, + "loss": 0.2768, + "step": 1369 + }, + { + "epoch": 0.034668623630336314, + "grad_norm": 19.745561599731445, + "learning_rate": 9.984713208240384e-06, + "loss": 0.2671, + "step": 1370 + }, + { + "epoch": 0.03469392919502999, + "grad_norm": 7.820814609527588, + "learning_rate": 9.984681818456259e-06, + "loss": 0.2304, + "step": 1371 + }, + { + "epoch": 0.03471923475972366, + "grad_norm": 5.477067470550537, + "learning_rate": 9.984650396526849e-06, + "loss": 0.1391, + "step": 1372 + }, + { + "epoch": 0.03474454032441734, + "grad_norm": 14.816845893859863, + "learning_rate": 9.984618942452354e-06, + "loss": 0.312, + "step": 1373 + }, + { + "epoch": 0.034769845889111015, + "grad_norm": 9.69422435760498, + "learning_rate": 9.98458745623298e-06, + "loss": 0.1853, + "step": 1374 + }, + { + "epoch": 0.03479515145380469, + "grad_norm": 8.563726425170898, + "learning_rate": 9.984555937868928e-06, + "loss": 0.2638, + "step": 1375 + }, + { + "epoch": 0.03482045701849837, + "grad_norm": 8.693670272827148, + "learning_rate": 9.984524387360401e-06, + "loss": 0.2682, + "step": 1376 + }, + { + "epoch": 0.034845762583192046, + "grad_norm": 6.279068946838379, + "learning_rate": 9.984492804707606e-06, + "loss": 0.2358, + "step": 1377 + }, + { + "epoch": 0.03487106814788572, + "grad_norm": 6.774575710296631, + "learning_rate": 9.984461189910742e-06, + "loss": 0.199, + "step": 1378 + }, + { + "epoch": 0.0348963737125794, + "grad_norm": 13.999991416931152, + "learning_rate": 9.984429542970015e-06, + "loss": 0.2611, + "step": 1379 + }, + { + "epoch": 0.03492167927727307, + "grad_norm": 4.841914653778076, + "learning_rate": 9.98439786388563e-06, + "loss": 0.1867, + "step": 1380 + }, + { + "epoch": 0.03494698484196675, + "grad_norm": 7.289920806884766, + "learning_rate": 9.984366152657791e-06, + "loss": 0.2498, + "step": 1381 + }, + { + "epoch": 0.034972290406660424, + "grad_norm": 6.420675754547119, + "learning_rate": 9.9843344092867e-06, + "loss": 0.2526, + "step": 1382 + }, + { + "epoch": 0.0349975959713541, + "grad_norm": 4.539547443389893, + "learning_rate": 9.984302633772564e-06, + "loss": 0.1944, + "step": 1383 + }, + { + "epoch": 0.03502290153604778, + "grad_norm": 11.391812324523926, + "learning_rate": 9.98427082611559e-06, + "loss": 0.2395, + "step": 1384 + }, + { + "epoch": 0.035048207100741455, + "grad_norm": 11.446914672851562, + "learning_rate": 9.984238986315978e-06, + "loss": 0.2724, + "step": 1385 + }, + { + "epoch": 0.03507351266543513, + "grad_norm": 8.231719017028809, + "learning_rate": 9.984207114373936e-06, + "loss": 0.2194, + "step": 1386 + }, + { + "epoch": 0.03509881823012881, + "grad_norm": 8.880653381347656, + "learning_rate": 9.98417521028967e-06, + "loss": 0.3122, + "step": 1387 + }, + { + "epoch": 0.03512412379482248, + "grad_norm": 5.817066669464111, + "learning_rate": 9.984143274063384e-06, + "loss": 0.2069, + "step": 1388 + }, + { + "epoch": 0.035149429359516156, + "grad_norm": 6.3374104499816895, + "learning_rate": 9.984111305695288e-06, + "loss": 0.2141, + "step": 1389 + }, + { + "epoch": 0.03517473492420983, + "grad_norm": 10.57607364654541, + "learning_rate": 9.984079305185582e-06, + "loss": 0.3426, + "step": 1390 + }, + { + "epoch": 0.03520004048890351, + "grad_norm": 11.294158935546875, + "learning_rate": 9.984047272534479e-06, + "loss": 0.3623, + "step": 1391 + }, + { + "epoch": 0.03522534605359719, + "grad_norm": 12.933181762695312, + "learning_rate": 9.98401520774218e-06, + "loss": 0.3384, + "step": 1392 + }, + { + "epoch": 0.035250651618290864, + "grad_norm": 4.726290702819824, + "learning_rate": 9.983983110808894e-06, + "loss": 0.191, + "step": 1393 + }, + { + "epoch": 0.03527595718298454, + "grad_norm": 5.548561096191406, + "learning_rate": 9.983950981734828e-06, + "loss": 0.2126, + "step": 1394 + }, + { + "epoch": 0.03530126274767822, + "grad_norm": 20.473228454589844, + "learning_rate": 9.98391882052019e-06, + "loss": 0.2527, + "step": 1395 + }, + { + "epoch": 0.03532656831237189, + "grad_norm": 8.600914001464844, + "learning_rate": 9.983886627165186e-06, + "loss": 0.2661, + "step": 1396 + }, + { + "epoch": 0.035351873877065565, + "grad_norm": 13.887836456298828, + "learning_rate": 9.983854401670024e-06, + "loss": 0.2602, + "step": 1397 + }, + { + "epoch": 0.03537717944175924, + "grad_norm": 15.038057327270508, + "learning_rate": 9.983822144034913e-06, + "loss": 0.276, + "step": 1398 + }, + { + "epoch": 0.03540248500645292, + "grad_norm": 7.520573616027832, + "learning_rate": 9.98378985426006e-06, + "loss": 0.2947, + "step": 1399 + }, + { + "epoch": 0.035427790571146596, + "grad_norm": 4.65944766998291, + "learning_rate": 9.983757532345672e-06, + "loss": 0.1925, + "step": 1400 + }, + { + "epoch": 0.03545309613584027, + "grad_norm": 6.436762809753418, + "learning_rate": 9.983725178291959e-06, + "loss": 0.1927, + "step": 1401 + }, + { + "epoch": 0.03547840170053395, + "grad_norm": 7.1109299659729, + "learning_rate": 9.98369279209913e-06, + "loss": 0.2932, + "step": 1402 + }, + { + "epoch": 0.03550370726522763, + "grad_norm": 7.5809712409973145, + "learning_rate": 9.983660373767394e-06, + "loss": 0.2751, + "step": 1403 + }, + { + "epoch": 0.0355290128299213, + "grad_norm": 5.093123435974121, + "learning_rate": 9.983627923296959e-06, + "loss": 0.1528, + "step": 1404 + }, + { + "epoch": 0.035554318394614974, + "grad_norm": 5.092523574829102, + "learning_rate": 9.983595440688034e-06, + "loss": 0.217, + "step": 1405 + }, + { + "epoch": 0.03557962395930865, + "grad_norm": 6.6091742515563965, + "learning_rate": 9.983562925940829e-06, + "loss": 0.2349, + "step": 1406 + }, + { + "epoch": 0.03560492952400233, + "grad_norm": 14.968379020690918, + "learning_rate": 9.983530379055552e-06, + "loss": 0.3476, + "step": 1407 + }, + { + "epoch": 0.035630235088696005, + "grad_norm": 7.573129653930664, + "learning_rate": 9.983497800032418e-06, + "loss": 0.3239, + "step": 1408 + }, + { + "epoch": 0.03565554065338968, + "grad_norm": 6.345759868621826, + "learning_rate": 9.983465188871631e-06, + "loss": 0.143, + "step": 1409 + }, + { + "epoch": 0.03568084621808336, + "grad_norm": 10.323060035705566, + "learning_rate": 9.983432545573406e-06, + "loss": 0.386, + "step": 1410 + }, + { + "epoch": 0.035706151782777036, + "grad_norm": 9.249117851257324, + "learning_rate": 9.98339987013795e-06, + "loss": 0.2998, + "step": 1411 + }, + { + "epoch": 0.035731457347470706, + "grad_norm": 7.800357818603516, + "learning_rate": 9.983367162565477e-06, + "loss": 0.1921, + "step": 1412 + }, + { + "epoch": 0.03575676291216438, + "grad_norm": 13.242866516113281, + "learning_rate": 9.983334422856194e-06, + "loss": 0.2724, + "step": 1413 + }, + { + "epoch": 0.03578206847685806, + "grad_norm": 5.864246368408203, + "learning_rate": 9.983301651010315e-06, + "loss": 0.2465, + "step": 1414 + }, + { + "epoch": 0.03580737404155174, + "grad_norm": 6.210402965545654, + "learning_rate": 9.983268847028052e-06, + "loss": 0.194, + "step": 1415 + }, + { + "epoch": 0.035832679606245414, + "grad_norm": 7.467453479766846, + "learning_rate": 9.983236010909613e-06, + "loss": 0.2643, + "step": 1416 + }, + { + "epoch": 0.03585798517093909, + "grad_norm": 16.30581283569336, + "learning_rate": 9.983203142655213e-06, + "loss": 0.3851, + "step": 1417 + }, + { + "epoch": 0.03588329073563277, + "grad_norm": 8.607760429382324, + "learning_rate": 9.983170242265063e-06, + "loss": 0.2609, + "step": 1418 + }, + { + "epoch": 0.035908596300326445, + "grad_norm": 8.095380783081055, + "learning_rate": 9.983137309739375e-06, + "loss": 0.2124, + "step": 1419 + }, + { + "epoch": 0.035933901865020115, + "grad_norm": 9.681829452514648, + "learning_rate": 9.98310434507836e-06, + "loss": 0.2263, + "step": 1420 + }, + { + "epoch": 0.03595920742971379, + "grad_norm": 9.085349082946777, + "learning_rate": 9.983071348282233e-06, + "loss": 0.2426, + "step": 1421 + }, + { + "epoch": 0.03598451299440747, + "grad_norm": 9.880650520324707, + "learning_rate": 9.983038319351206e-06, + "loss": 0.2447, + "step": 1422 + }, + { + "epoch": 0.036009818559101146, + "grad_norm": 10.085219383239746, + "learning_rate": 9.983005258285492e-06, + "loss": 0.2686, + "step": 1423 + }, + { + "epoch": 0.03603512412379482, + "grad_norm": 11.323134422302246, + "learning_rate": 9.982972165085304e-06, + "loss": 0.3224, + "step": 1424 + }, + { + "epoch": 0.0360604296884885, + "grad_norm": 33.5469970703125, + "learning_rate": 9.982939039750855e-06, + "loss": 0.2182, + "step": 1425 + }, + { + "epoch": 0.03608573525318218, + "grad_norm": 16.38542938232422, + "learning_rate": 9.98290588228236e-06, + "loss": 0.2202, + "step": 1426 + }, + { + "epoch": 0.036111040817875854, + "grad_norm": 11.013993263244629, + "learning_rate": 9.98287269268003e-06, + "loss": 0.3126, + "step": 1427 + }, + { + "epoch": 0.036136346382569524, + "grad_norm": 9.264931678771973, + "learning_rate": 9.982839470944083e-06, + "loss": 0.2923, + "step": 1428 + }, + { + "epoch": 0.0361616519472632, + "grad_norm": 11.834539413452148, + "learning_rate": 9.982806217074729e-06, + "loss": 0.2595, + "step": 1429 + }, + { + "epoch": 0.03618695751195688, + "grad_norm": 14.345660209655762, + "learning_rate": 9.982772931072187e-06, + "loss": 0.2146, + "step": 1430 + }, + { + "epoch": 0.036212263076650555, + "grad_norm": 14.798079490661621, + "learning_rate": 9.982739612936668e-06, + "loss": 0.1642, + "step": 1431 + }, + { + "epoch": 0.03623756864134423, + "grad_norm": 11.369696617126465, + "learning_rate": 9.982706262668386e-06, + "loss": 0.2298, + "step": 1432 + }, + { + "epoch": 0.03626287420603791, + "grad_norm": 17.280183792114258, + "learning_rate": 9.982672880267562e-06, + "loss": 0.1909, + "step": 1433 + }, + { + "epoch": 0.036288179770731586, + "grad_norm": 7.70050048828125, + "learning_rate": 9.982639465734405e-06, + "loss": 0.2839, + "step": 1434 + }, + { + "epoch": 0.03631348533542526, + "grad_norm": 9.528090476989746, + "learning_rate": 9.982606019069134e-06, + "loss": 0.2437, + "step": 1435 + }, + { + "epoch": 0.036338790900118934, + "grad_norm": 8.571993827819824, + "learning_rate": 9.982572540271963e-06, + "loss": 0.2426, + "step": 1436 + }, + { + "epoch": 0.03636409646481261, + "grad_norm": 6.621396064758301, + "learning_rate": 9.982539029343108e-06, + "loss": 0.2141, + "step": 1437 + }, + { + "epoch": 0.03638940202950629, + "grad_norm": 10.474912643432617, + "learning_rate": 9.982505486282787e-06, + "loss": 0.2629, + "step": 1438 + }, + { + "epoch": 0.036414707594199965, + "grad_norm": 17.943283081054688, + "learning_rate": 9.982471911091214e-06, + "loss": 0.2382, + "step": 1439 + }, + { + "epoch": 0.03644001315889364, + "grad_norm": 22.299266815185547, + "learning_rate": 9.982438303768607e-06, + "loss": 0.3148, + "step": 1440 + }, + { + "epoch": 0.03646531872358732, + "grad_norm": 8.746076583862305, + "learning_rate": 9.982404664315181e-06, + "loss": 0.1407, + "step": 1441 + }, + { + "epoch": 0.036490624288280996, + "grad_norm": 10.509153366088867, + "learning_rate": 9.982370992731156e-06, + "loss": 0.2131, + "step": 1442 + }, + { + "epoch": 0.03651592985297467, + "grad_norm": 12.557414054870605, + "learning_rate": 9.982337289016746e-06, + "loss": 0.2566, + "step": 1443 + }, + { + "epoch": 0.03654123541766834, + "grad_norm": 13.671586036682129, + "learning_rate": 9.98230355317217e-06, + "loss": 0.3053, + "step": 1444 + }, + { + "epoch": 0.03656654098236202, + "grad_norm": 7.819859981536865, + "learning_rate": 9.982269785197646e-06, + "loss": 0.245, + "step": 1445 + }, + { + "epoch": 0.0365918465470557, + "grad_norm": 7.950913906097412, + "learning_rate": 9.98223598509339e-06, + "loss": 0.2521, + "step": 1446 + }, + { + "epoch": 0.036617152111749374, + "grad_norm": 8.667475700378418, + "learning_rate": 9.98220215285962e-06, + "loss": 0.1289, + "step": 1447 + }, + { + "epoch": 0.03664245767644305, + "grad_norm": 9.899477005004883, + "learning_rate": 9.982168288496557e-06, + "loss": 0.2538, + "step": 1448 + }, + { + "epoch": 0.03666776324113673, + "grad_norm": 7.310357570648193, + "learning_rate": 9.982134392004416e-06, + "loss": 0.1476, + "step": 1449 + }, + { + "epoch": 0.036693068805830405, + "grad_norm": 4.699000835418701, + "learning_rate": 9.982100463383418e-06, + "loss": 0.1726, + "step": 1450 + }, + { + "epoch": 0.03671837437052408, + "grad_norm": 7.109930992126465, + "learning_rate": 9.98206650263378e-06, + "loss": 0.1888, + "step": 1451 + }, + { + "epoch": 0.03674367993521775, + "grad_norm": 7.805633068084717, + "learning_rate": 9.982032509755721e-06, + "loss": 0.158, + "step": 1452 + }, + { + "epoch": 0.03676898549991143, + "grad_norm": 10.933465003967285, + "learning_rate": 9.981998484749463e-06, + "loss": 0.2038, + "step": 1453 + }, + { + "epoch": 0.036794291064605106, + "grad_norm": 10.289450645446777, + "learning_rate": 9.981964427615222e-06, + "loss": 0.2, + "step": 1454 + }, + { + "epoch": 0.03681959662929878, + "grad_norm": 11.568800926208496, + "learning_rate": 9.98193033835322e-06, + "loss": 0.2551, + "step": 1455 + }, + { + "epoch": 0.03684490219399246, + "grad_norm": 5.507721900939941, + "learning_rate": 9.981896216963677e-06, + "loss": 0.1592, + "step": 1456 + }, + { + "epoch": 0.03687020775868614, + "grad_norm": 4.956212997436523, + "learning_rate": 9.98186206344681e-06, + "loss": 0.1905, + "step": 1457 + }, + { + "epoch": 0.036895513323379814, + "grad_norm": 21.947080612182617, + "learning_rate": 9.981827877802842e-06, + "loss": 0.3705, + "step": 1458 + }, + { + "epoch": 0.03692081888807349, + "grad_norm": 10.00041675567627, + "learning_rate": 9.981793660031993e-06, + "loss": 0.2979, + "step": 1459 + }, + { + "epoch": 0.03694612445276716, + "grad_norm": 13.086312294006348, + "learning_rate": 9.981759410134484e-06, + "loss": 0.2644, + "step": 1460 + }, + { + "epoch": 0.03697143001746084, + "grad_norm": 16.98478126525879, + "learning_rate": 9.981725128110533e-06, + "loss": 0.2218, + "step": 1461 + }, + { + "epoch": 0.036996735582154515, + "grad_norm": 9.084173202514648, + "learning_rate": 9.981690813960365e-06, + "loss": 0.2838, + "step": 1462 + }, + { + "epoch": 0.03702204114684819, + "grad_norm": 6.290726661682129, + "learning_rate": 9.9816564676842e-06, + "loss": 0.2466, + "step": 1463 + }, + { + "epoch": 0.03704734671154187, + "grad_norm": 18.283462524414062, + "learning_rate": 9.98162208928226e-06, + "loss": 0.3363, + "step": 1464 + }, + { + "epoch": 0.037072652276235546, + "grad_norm": 14.434618949890137, + "learning_rate": 9.981587678754762e-06, + "loss": 0.4344, + "step": 1465 + }, + { + "epoch": 0.03709795784092922, + "grad_norm": 7.8474650382995605, + "learning_rate": 9.981553236101936e-06, + "loss": 0.2167, + "step": 1466 + }, + { + "epoch": 0.03712326340562289, + "grad_norm": 8.387113571166992, + "learning_rate": 9.981518761323996e-06, + "loss": 0.2322, + "step": 1467 + }, + { + "epoch": 0.03714856897031657, + "grad_norm": 12.61189079284668, + "learning_rate": 9.98148425442117e-06, + "loss": 0.3241, + "step": 1468 + }, + { + "epoch": 0.03717387453501025, + "grad_norm": 18.012910842895508, + "learning_rate": 9.981449715393678e-06, + "loss": 0.3232, + "step": 1469 + }, + { + "epoch": 0.037199180099703924, + "grad_norm": 9.99484920501709, + "learning_rate": 9.981415144241745e-06, + "loss": 0.3148, + "step": 1470 + }, + { + "epoch": 0.0372244856643976, + "grad_norm": 8.402323722839355, + "learning_rate": 9.98138054096559e-06, + "loss": 0.2606, + "step": 1471 + }, + { + "epoch": 0.03724979122909128, + "grad_norm": 7.657789707183838, + "learning_rate": 9.98134590556544e-06, + "loss": 0.2111, + "step": 1472 + }, + { + "epoch": 0.037275096793784955, + "grad_norm": 12.026458740234375, + "learning_rate": 9.981311238041517e-06, + "loss": 0.2401, + "step": 1473 + }, + { + "epoch": 0.03730040235847863, + "grad_norm": 5.687396049499512, + "learning_rate": 9.981276538394045e-06, + "loss": 0.2832, + "step": 1474 + }, + { + "epoch": 0.0373257079231723, + "grad_norm": 10.512983322143555, + "learning_rate": 9.981241806623246e-06, + "loss": 0.19, + "step": 1475 + }, + { + "epoch": 0.03735101348786598, + "grad_norm": 5.938201427459717, + "learning_rate": 9.981207042729346e-06, + "loss": 0.2742, + "step": 1476 + }, + { + "epoch": 0.037376319052559656, + "grad_norm": 17.545766830444336, + "learning_rate": 9.981172246712568e-06, + "loss": 0.1874, + "step": 1477 + }, + { + "epoch": 0.03740162461725333, + "grad_norm": 7.3691935539245605, + "learning_rate": 9.981137418573136e-06, + "loss": 0.1719, + "step": 1478 + }, + { + "epoch": 0.03742693018194701, + "grad_norm": 6.342506408691406, + "learning_rate": 9.981102558311275e-06, + "loss": 0.1886, + "step": 1479 + }, + { + "epoch": 0.03745223574664069, + "grad_norm": 9.620046615600586, + "learning_rate": 9.981067665927211e-06, + "loss": 0.2718, + "step": 1480 + }, + { + "epoch": 0.037477541311334364, + "grad_norm": 7.154120445251465, + "learning_rate": 9.98103274142117e-06, + "loss": 0.17, + "step": 1481 + }, + { + "epoch": 0.03750284687602804, + "grad_norm": 12.345234870910645, + "learning_rate": 9.980997784793375e-06, + "loss": 0.2079, + "step": 1482 + }, + { + "epoch": 0.03752815244072171, + "grad_norm": 6.290846824645996, + "learning_rate": 9.98096279604405e-06, + "loss": 0.2524, + "step": 1483 + }, + { + "epoch": 0.03755345800541539, + "grad_norm": 10.577013969421387, + "learning_rate": 9.980927775173422e-06, + "loss": 0.3494, + "step": 1484 + }, + { + "epoch": 0.037578763570109065, + "grad_norm": 5.039271831512451, + "learning_rate": 9.980892722181718e-06, + "loss": 0.1359, + "step": 1485 + }, + { + "epoch": 0.03760406913480274, + "grad_norm": 9.542911529541016, + "learning_rate": 9.980857637069164e-06, + "loss": 0.3138, + "step": 1486 + }, + { + "epoch": 0.03762937469949642, + "grad_norm": 9.966923713684082, + "learning_rate": 9.980822519835985e-06, + "loss": 0.2328, + "step": 1487 + }, + { + "epoch": 0.037654680264190096, + "grad_norm": 6.998325824737549, + "learning_rate": 9.980787370482408e-06, + "loss": 0.2914, + "step": 1488 + }, + { + "epoch": 0.03767998582888377, + "grad_norm": 15.759881973266602, + "learning_rate": 9.98075218900866e-06, + "loss": 0.3427, + "step": 1489 + }, + { + "epoch": 0.03770529139357745, + "grad_norm": 4.941316604614258, + "learning_rate": 9.980716975414968e-06, + "loss": 0.1704, + "step": 1490 + }, + { + "epoch": 0.03773059695827112, + "grad_norm": 10.124712944030762, + "learning_rate": 9.980681729701558e-06, + "loss": 0.2582, + "step": 1491 + }, + { + "epoch": 0.0377559025229648, + "grad_norm": 4.293117046356201, + "learning_rate": 9.980646451868658e-06, + "loss": 0.1242, + "step": 1492 + }, + { + "epoch": 0.037781208087658474, + "grad_norm": 12.862405776977539, + "learning_rate": 9.980611141916497e-06, + "loss": 0.1421, + "step": 1493 + }, + { + "epoch": 0.03780651365235215, + "grad_norm": 13.72451400756836, + "learning_rate": 9.9805757998453e-06, + "loss": 0.2341, + "step": 1494 + }, + { + "epoch": 0.03783181921704583, + "grad_norm": 11.781709671020508, + "learning_rate": 9.980540425655294e-06, + "loss": 0.2354, + "step": 1495 + }, + { + "epoch": 0.037857124781739505, + "grad_norm": 6.756852149963379, + "learning_rate": 9.980505019346713e-06, + "loss": 0.1447, + "step": 1496 + }, + { + "epoch": 0.03788243034643318, + "grad_norm": 4.05155086517334, + "learning_rate": 9.98046958091978e-06, + "loss": 0.1859, + "step": 1497 + }, + { + "epoch": 0.03790773591112686, + "grad_norm": 10.376532554626465, + "learning_rate": 9.980434110374725e-06, + "loss": 0.2396, + "step": 1498 + }, + { + "epoch": 0.03793304147582053, + "grad_norm": 14.664619445800781, + "learning_rate": 9.980398607711777e-06, + "loss": 0.2648, + "step": 1499 + }, + { + "epoch": 0.037958347040514206, + "grad_norm": 15.638175964355469, + "learning_rate": 9.980363072931165e-06, + "loss": 0.4306, + "step": 1500 + }, + { + "epoch": 0.037983652605207884, + "grad_norm": 14.179643630981445, + "learning_rate": 9.980327506033118e-06, + "loss": 0.2873, + "step": 1501 + }, + { + "epoch": 0.03800895816990156, + "grad_norm": 6.808987617492676, + "learning_rate": 9.980291907017864e-06, + "loss": 0.1777, + "step": 1502 + }, + { + "epoch": 0.03803426373459524, + "grad_norm": 22.165433883666992, + "learning_rate": 9.980256275885635e-06, + "loss": 0.4295, + "step": 1503 + }, + { + "epoch": 0.038059569299288915, + "grad_norm": 8.48691177368164, + "learning_rate": 9.98022061263666e-06, + "loss": 0.2356, + "step": 1504 + }, + { + "epoch": 0.03808487486398259, + "grad_norm": 5.015347003936768, + "learning_rate": 9.980184917271168e-06, + "loss": 0.2772, + "step": 1505 + }, + { + "epoch": 0.03811018042867627, + "grad_norm": 3.298283815383911, + "learning_rate": 9.98014918978939e-06, + "loss": 0.1616, + "step": 1506 + }, + { + "epoch": 0.03813548599336994, + "grad_norm": 4.436966896057129, + "learning_rate": 9.980113430191558e-06, + "loss": 0.1716, + "step": 1507 + }, + { + "epoch": 0.038160791558063616, + "grad_norm": 12.400177955627441, + "learning_rate": 9.9800776384779e-06, + "loss": 0.207, + "step": 1508 + }, + { + "epoch": 0.03818609712275729, + "grad_norm": 10.320786476135254, + "learning_rate": 9.980041814648645e-06, + "loss": 0.1768, + "step": 1509 + }, + { + "epoch": 0.03821140268745097, + "grad_norm": 11.399639129638672, + "learning_rate": 9.98000595870403e-06, + "loss": 0.324, + "step": 1510 + }, + { + "epoch": 0.03823670825214465, + "grad_norm": 9.830689430236816, + "learning_rate": 9.97997007064428e-06, + "loss": 0.3263, + "step": 1511 + }, + { + "epoch": 0.038262013816838324, + "grad_norm": 5.288264751434326, + "learning_rate": 9.979934150469632e-06, + "loss": 0.1689, + "step": 1512 + }, + { + "epoch": 0.038287319381532, + "grad_norm": 8.51948356628418, + "learning_rate": 9.979898198180313e-06, + "loss": 0.2391, + "step": 1513 + }, + { + "epoch": 0.03831262494622568, + "grad_norm": 10.315779685974121, + "learning_rate": 9.979862213776559e-06, + "loss": 0.2751, + "step": 1514 + }, + { + "epoch": 0.03833793051091935, + "grad_norm": 6.26671028137207, + "learning_rate": 9.979826197258598e-06, + "loss": 0.1554, + "step": 1515 + }, + { + "epoch": 0.038363236075613025, + "grad_norm": 7.3345136642456055, + "learning_rate": 9.979790148626665e-06, + "loss": 0.3073, + "step": 1516 + }, + { + "epoch": 0.0383885416403067, + "grad_norm": 8.975384712219238, + "learning_rate": 9.979754067880991e-06, + "loss": 0.1796, + "step": 1517 + }, + { + "epoch": 0.03841384720500038, + "grad_norm": 6.432920932769775, + "learning_rate": 9.97971795502181e-06, + "loss": 0.2477, + "step": 1518 + }, + { + "epoch": 0.038439152769694056, + "grad_norm": 8.248119354248047, + "learning_rate": 9.979681810049354e-06, + "loss": 0.2537, + "step": 1519 + }, + { + "epoch": 0.03846445833438773, + "grad_norm": 6.289349555969238, + "learning_rate": 9.979645632963857e-06, + "loss": 0.2094, + "step": 1520 + }, + { + "epoch": 0.03848976389908141, + "grad_norm": 5.790979862213135, + "learning_rate": 9.97960942376555e-06, + "loss": 0.1812, + "step": 1521 + }, + { + "epoch": 0.03851506946377509, + "grad_norm": 13.271751403808594, + "learning_rate": 9.979573182454669e-06, + "loss": 0.2096, + "step": 1522 + }, + { + "epoch": 0.03854037502846876, + "grad_norm": 14.491007804870605, + "learning_rate": 9.979536909031448e-06, + "loss": 0.1946, + "step": 1523 + }, + { + "epoch": 0.038565680593162434, + "grad_norm": 4.728292942047119, + "learning_rate": 9.979500603496118e-06, + "loss": 0.1787, + "step": 1524 + }, + { + "epoch": 0.03859098615785611, + "grad_norm": 10.653759956359863, + "learning_rate": 9.979464265848915e-06, + "loss": 0.2145, + "step": 1525 + }, + { + "epoch": 0.03861629172254979, + "grad_norm": 8.483741760253906, + "learning_rate": 9.979427896090075e-06, + "loss": 0.2293, + "step": 1526 + }, + { + "epoch": 0.038641597287243465, + "grad_norm": 5.964181423187256, + "learning_rate": 9.979391494219828e-06, + "loss": 0.1917, + "step": 1527 + }, + { + "epoch": 0.03866690285193714, + "grad_norm": 10.564431190490723, + "learning_rate": 9.979355060238413e-06, + "loss": 0.224, + "step": 1528 + }, + { + "epoch": 0.03869220841663082, + "grad_norm": 12.693527221679688, + "learning_rate": 9.979318594146064e-06, + "loss": 0.264, + "step": 1529 + }, + { + "epoch": 0.038717513981324496, + "grad_norm": 7.453309535980225, + "learning_rate": 9.979282095943016e-06, + "loss": 0.2479, + "step": 1530 + }, + { + "epoch": 0.038742819546018166, + "grad_norm": 12.941195487976074, + "learning_rate": 9.979245565629502e-06, + "loss": 0.2724, + "step": 1531 + }, + { + "epoch": 0.03876812511071184, + "grad_norm": 4.914851188659668, + "learning_rate": 9.979209003205761e-06, + "loss": 0.1996, + "step": 1532 + }, + { + "epoch": 0.03879343067540552, + "grad_norm": 4.4235100746154785, + "learning_rate": 9.979172408672026e-06, + "loss": 0.1528, + "step": 1533 + }, + { + "epoch": 0.0388187362400992, + "grad_norm": 8.038887977600098, + "learning_rate": 9.979135782028535e-06, + "loss": 0.3137, + "step": 1534 + }, + { + "epoch": 0.038844041804792874, + "grad_norm": 12.758983612060547, + "learning_rate": 9.979099123275525e-06, + "loss": 0.2571, + "step": 1535 + }, + { + "epoch": 0.03886934736948655, + "grad_norm": 6.1572465896606445, + "learning_rate": 9.979062432413228e-06, + "loss": 0.2046, + "step": 1536 + }, + { + "epoch": 0.03889465293418023, + "grad_norm": 12.240955352783203, + "learning_rate": 9.979025709441885e-06, + "loss": 0.1919, + "step": 1537 + }, + { + "epoch": 0.038919958498873905, + "grad_norm": 9.008999824523926, + "learning_rate": 9.978988954361733e-06, + "loss": 0.1746, + "step": 1538 + }, + { + "epoch": 0.038945264063567575, + "grad_norm": 14.364463806152344, + "learning_rate": 9.978952167173005e-06, + "loss": 0.2794, + "step": 1539 + }, + { + "epoch": 0.03897056962826125, + "grad_norm": 16.417043685913086, + "learning_rate": 9.978915347875941e-06, + "loss": 0.4824, + "step": 1540 + }, + { + "epoch": 0.03899587519295493, + "grad_norm": 6.805043697357178, + "learning_rate": 9.97887849647078e-06, + "loss": 0.2024, + "step": 1541 + }, + { + "epoch": 0.039021180757648606, + "grad_norm": 20.161636352539062, + "learning_rate": 9.978841612957756e-06, + "loss": 0.3841, + "step": 1542 + }, + { + "epoch": 0.03904648632234228, + "grad_norm": 12.646028518676758, + "learning_rate": 9.978804697337109e-06, + "loss": 0.2507, + "step": 1543 + }, + { + "epoch": 0.03907179188703596, + "grad_norm": 7.632071495056152, + "learning_rate": 9.978767749609076e-06, + "loss": 0.1581, + "step": 1544 + }, + { + "epoch": 0.03909709745172964, + "grad_norm": 4.961726665496826, + "learning_rate": 9.978730769773897e-06, + "loss": 0.1874, + "step": 1545 + }, + { + "epoch": 0.039122403016423314, + "grad_norm": 19.98894691467285, + "learning_rate": 9.978693757831808e-06, + "loss": 0.3082, + "step": 1546 + }, + { + "epoch": 0.039147708581116984, + "grad_norm": 3.400451183319092, + "learning_rate": 9.97865671378305e-06, + "loss": 0.129, + "step": 1547 + }, + { + "epoch": 0.03917301414581066, + "grad_norm": 15.321749687194824, + "learning_rate": 9.978619637627863e-06, + "loss": 0.2987, + "step": 1548 + }, + { + "epoch": 0.03919831971050434, + "grad_norm": 7.005988121032715, + "learning_rate": 9.97858252936648e-06, + "loss": 0.2628, + "step": 1549 + }, + { + "epoch": 0.039223625275198015, + "grad_norm": 5.618368148803711, + "learning_rate": 9.978545388999147e-06, + "loss": 0.2018, + "step": 1550 + }, + { + "epoch": 0.03924893083989169, + "grad_norm": 9.380524635314941, + "learning_rate": 9.978508216526099e-06, + "loss": 0.2611, + "step": 1551 + }, + { + "epoch": 0.03927423640458537, + "grad_norm": 8.440570831298828, + "learning_rate": 9.97847101194758e-06, + "loss": 0.2301, + "step": 1552 + }, + { + "epoch": 0.039299541969279046, + "grad_norm": 5.737386226654053, + "learning_rate": 9.978433775263826e-06, + "loss": 0.1864, + "step": 1553 + }, + { + "epoch": 0.03932484753397272, + "grad_norm": 14.955470085144043, + "learning_rate": 9.978396506475079e-06, + "loss": 0.3135, + "step": 1554 + }, + { + "epoch": 0.03935015309866639, + "grad_norm": 7.56571102142334, + "learning_rate": 9.978359205581578e-06, + "loss": 0.2609, + "step": 1555 + }, + { + "epoch": 0.03937545866336007, + "grad_norm": 8.322997093200684, + "learning_rate": 9.978321872583565e-06, + "loss": 0.2181, + "step": 1556 + }, + { + "epoch": 0.03940076422805375, + "grad_norm": 7.674066066741943, + "learning_rate": 9.978284507481281e-06, + "loss": 0.2567, + "step": 1557 + }, + { + "epoch": 0.039426069792747424, + "grad_norm": 5.595551013946533, + "learning_rate": 9.978247110274966e-06, + "loss": 0.149, + "step": 1558 + }, + { + "epoch": 0.0394513753574411, + "grad_norm": 11.560720443725586, + "learning_rate": 9.978209680964862e-06, + "loss": 0.2585, + "step": 1559 + }, + { + "epoch": 0.03947668092213478, + "grad_norm": 6.719056606292725, + "learning_rate": 9.97817221955121e-06, + "loss": 0.2593, + "step": 1560 + }, + { + "epoch": 0.039501986486828455, + "grad_norm": 11.012550354003906, + "learning_rate": 9.97813472603425e-06, + "loss": 0.2691, + "step": 1561 + }, + { + "epoch": 0.03952729205152213, + "grad_norm": 25.503036499023438, + "learning_rate": 9.978097200414225e-06, + "loss": 0.2683, + "step": 1562 + }, + { + "epoch": 0.0395525976162158, + "grad_norm": 7.5319671630859375, + "learning_rate": 9.978059642691378e-06, + "loss": 0.2454, + "step": 1563 + }, + { + "epoch": 0.03957790318090948, + "grad_norm": 8.579593658447266, + "learning_rate": 9.97802205286595e-06, + "loss": 0.3061, + "step": 1564 + }, + { + "epoch": 0.039603208745603156, + "grad_norm": 29.42296028137207, + "learning_rate": 9.977984430938184e-06, + "loss": 0.3442, + "step": 1565 + }, + { + "epoch": 0.03962851431029683, + "grad_norm": 10.794814109802246, + "learning_rate": 9.977946776908323e-06, + "loss": 0.3614, + "step": 1566 + }, + { + "epoch": 0.03965381987499051, + "grad_norm": 9.424519538879395, + "learning_rate": 9.977909090776607e-06, + "loss": 0.3079, + "step": 1567 + }, + { + "epoch": 0.03967912543968419, + "grad_norm": 9.170622825622559, + "learning_rate": 9.977871372543283e-06, + "loss": 0.332, + "step": 1568 + }, + { + "epoch": 0.039704431004377864, + "grad_norm": 7.8675947189331055, + "learning_rate": 9.977833622208592e-06, + "loss": 0.2675, + "step": 1569 + }, + { + "epoch": 0.03972973656907154, + "grad_norm": 8.830558776855469, + "learning_rate": 9.97779583977278e-06, + "loss": 0.2905, + "step": 1570 + }, + { + "epoch": 0.03975504213376521, + "grad_norm": 9.27371883392334, + "learning_rate": 9.977758025236086e-06, + "loss": 0.175, + "step": 1571 + }, + { + "epoch": 0.03978034769845889, + "grad_norm": 9.795316696166992, + "learning_rate": 9.977720178598758e-06, + "loss": 0.3234, + "step": 1572 + }, + { + "epoch": 0.039805653263152566, + "grad_norm": 6.1951093673706055, + "learning_rate": 9.977682299861037e-06, + "loss": 0.1601, + "step": 1573 + }, + { + "epoch": 0.03983095882784624, + "grad_norm": 5.372370719909668, + "learning_rate": 9.977644389023169e-06, + "loss": 0.1762, + "step": 1574 + }, + { + "epoch": 0.03985626439253992, + "grad_norm": 6.870186805725098, + "learning_rate": 9.9776064460854e-06, + "loss": 0.2033, + "step": 1575 + }, + { + "epoch": 0.0398815699572336, + "grad_norm": 6.266991138458252, + "learning_rate": 9.97756847104797e-06, + "loss": 0.2862, + "step": 1576 + }, + { + "epoch": 0.039906875521927274, + "grad_norm": 6.777806758880615, + "learning_rate": 9.977530463911129e-06, + "loss": 0.2225, + "step": 1577 + }, + { + "epoch": 0.03993218108662095, + "grad_norm": 8.87906551361084, + "learning_rate": 9.977492424675119e-06, + "loss": 0.2602, + "step": 1578 + }, + { + "epoch": 0.03995748665131462, + "grad_norm": 10.955697059631348, + "learning_rate": 9.977454353340187e-06, + "loss": 0.3098, + "step": 1579 + }, + { + "epoch": 0.0399827922160083, + "grad_norm": 4.380762100219727, + "learning_rate": 9.977416249906577e-06, + "loss": 0.2494, + "step": 1580 + }, + { + "epoch": 0.040008097780701975, + "grad_norm": 13.658143997192383, + "learning_rate": 9.977378114374536e-06, + "loss": 0.3352, + "step": 1581 + }, + { + "epoch": 0.04003340334539565, + "grad_norm": 12.29921817779541, + "learning_rate": 9.977339946744309e-06, + "loss": 0.3116, + "step": 1582 + }, + { + "epoch": 0.04005870891008933, + "grad_norm": 4.585203647613525, + "learning_rate": 9.977301747016143e-06, + "loss": 0.2226, + "step": 1583 + }, + { + "epoch": 0.040084014474783006, + "grad_norm": 8.188350677490234, + "learning_rate": 9.977263515190283e-06, + "loss": 0.2687, + "step": 1584 + }, + { + "epoch": 0.04010932003947668, + "grad_norm": 14.271995544433594, + "learning_rate": 9.977225251266977e-06, + "loss": 0.3992, + "step": 1585 + }, + { + "epoch": 0.04013462560417036, + "grad_norm": 6.364639759063721, + "learning_rate": 9.977186955246472e-06, + "loss": 0.2315, + "step": 1586 + }, + { + "epoch": 0.04015993116886403, + "grad_norm": 8.676740646362305, + "learning_rate": 9.977148627129014e-06, + "loss": 0.1533, + "step": 1587 + }, + { + "epoch": 0.04018523673355771, + "grad_norm": 8.811923027038574, + "learning_rate": 9.97711026691485e-06, + "loss": 0.1993, + "step": 1588 + }, + { + "epoch": 0.040210542298251384, + "grad_norm": 7.274052619934082, + "learning_rate": 9.977071874604228e-06, + "loss": 0.2726, + "step": 1589 + }, + { + "epoch": 0.04023584786294506, + "grad_norm": 5.399681091308594, + "learning_rate": 9.977033450197393e-06, + "loss": 0.1938, + "step": 1590 + }, + { + "epoch": 0.04026115342763874, + "grad_norm": 11.837246894836426, + "learning_rate": 9.976994993694598e-06, + "loss": 0.235, + "step": 1591 + }, + { + "epoch": 0.040286458992332415, + "grad_norm": 5.465115070343018, + "learning_rate": 9.976956505096086e-06, + "loss": 0.162, + "step": 1592 + }, + { + "epoch": 0.04031176455702609, + "grad_norm": 10.98816967010498, + "learning_rate": 9.976917984402109e-06, + "loss": 0.3333, + "step": 1593 + }, + { + "epoch": 0.04033707012171977, + "grad_norm": 14.96257209777832, + "learning_rate": 9.976879431612911e-06, + "loss": 0.2475, + "step": 1594 + }, + { + "epoch": 0.04036237568641344, + "grad_norm": 9.940006256103516, + "learning_rate": 9.976840846728746e-06, + "loss": 0.283, + "step": 1595 + }, + { + "epoch": 0.040387681251107116, + "grad_norm": 6.664530277252197, + "learning_rate": 9.97680222974986e-06, + "loss": 0.2208, + "step": 1596 + }, + { + "epoch": 0.04041298681580079, + "grad_norm": 5.416327953338623, + "learning_rate": 9.976763580676501e-06, + "loss": 0.1681, + "step": 1597 + }, + { + "epoch": 0.04043829238049447, + "grad_norm": 6.187167644500732, + "learning_rate": 9.97672489950892e-06, + "loss": 0.1918, + "step": 1598 + }, + { + "epoch": 0.04046359794518815, + "grad_norm": 5.109025955200195, + "learning_rate": 9.976686186247367e-06, + "loss": 0.1588, + "step": 1599 + }, + { + "epoch": 0.040488903509881824, + "grad_norm": 6.556674957275391, + "learning_rate": 9.976647440892088e-06, + "loss": 0.203, + "step": 1600 + }, + { + "epoch": 0.0405142090745755, + "grad_norm": 5.97578239440918, + "learning_rate": 9.976608663443337e-06, + "loss": 0.2387, + "step": 1601 + }, + { + "epoch": 0.04053951463926918, + "grad_norm": 5.944215297698975, + "learning_rate": 9.976569853901362e-06, + "loss": 0.1654, + "step": 1602 + }, + { + "epoch": 0.04056482020396285, + "grad_norm": 9.807177543640137, + "learning_rate": 9.976531012266414e-06, + "loss": 0.3111, + "step": 1603 + }, + { + "epoch": 0.040590125768656525, + "grad_norm": 11.019767761230469, + "learning_rate": 9.976492138538742e-06, + "loss": 0.2487, + "step": 1604 + }, + { + "epoch": 0.0406154313333502, + "grad_norm": 9.263914108276367, + "learning_rate": 9.976453232718599e-06, + "loss": 0.2157, + "step": 1605 + }, + { + "epoch": 0.04064073689804388, + "grad_norm": 11.736894607543945, + "learning_rate": 9.976414294806236e-06, + "loss": 0.3063, + "step": 1606 + }, + { + "epoch": 0.040666042462737556, + "grad_norm": 16.56871795654297, + "learning_rate": 9.976375324801902e-06, + "loss": 0.3995, + "step": 1607 + }, + { + "epoch": 0.04069134802743123, + "grad_norm": 9.224567413330078, + "learning_rate": 9.976336322705848e-06, + "loss": 0.2257, + "step": 1608 + }, + { + "epoch": 0.04071665359212491, + "grad_norm": 19.424869537353516, + "learning_rate": 9.976297288518327e-06, + "loss": 0.2835, + "step": 1609 + }, + { + "epoch": 0.04074195915681859, + "grad_norm": 6.467703342437744, + "learning_rate": 9.976258222239592e-06, + "loss": 0.1753, + "step": 1610 + }, + { + "epoch": 0.04076726472151226, + "grad_norm": 7.671004772186279, + "learning_rate": 9.976219123869893e-06, + "loss": 0.2399, + "step": 1611 + }, + { + "epoch": 0.040792570286205934, + "grad_norm": 8.74223518371582, + "learning_rate": 9.97617999340948e-06, + "loss": 0.3243, + "step": 1612 + }, + { + "epoch": 0.04081787585089961, + "grad_norm": 7.906851768493652, + "learning_rate": 9.976140830858611e-06, + "loss": 0.1836, + "step": 1613 + }, + { + "epoch": 0.04084318141559329, + "grad_norm": 7.3517680168151855, + "learning_rate": 9.976101636217535e-06, + "loss": 0.2352, + "step": 1614 + }, + { + "epoch": 0.040868486980286965, + "grad_norm": 14.778658866882324, + "learning_rate": 9.976062409486503e-06, + "loss": 0.2268, + "step": 1615 + }, + { + "epoch": 0.04089379254498064, + "grad_norm": 12.782198905944824, + "learning_rate": 9.976023150665772e-06, + "loss": 0.2295, + "step": 1616 + }, + { + "epoch": 0.04091909810967432, + "grad_norm": 8.518946647644043, + "learning_rate": 9.975983859755593e-06, + "loss": 0.2758, + "step": 1617 + }, + { + "epoch": 0.040944403674367996, + "grad_norm": 6.902440547943115, + "learning_rate": 9.97594453675622e-06, + "loss": 0.2484, + "step": 1618 + }, + { + "epoch": 0.040969709239061666, + "grad_norm": 15.380069732666016, + "learning_rate": 9.975905181667904e-06, + "loss": 0.4153, + "step": 1619 + }, + { + "epoch": 0.04099501480375534, + "grad_norm": 8.797369956970215, + "learning_rate": 9.975865794490904e-06, + "loss": 0.278, + "step": 1620 + }, + { + "epoch": 0.04102032036844902, + "grad_norm": 5.501868724822998, + "learning_rate": 9.97582637522547e-06, + "loss": 0.1912, + "step": 1621 + }, + { + "epoch": 0.0410456259331427, + "grad_norm": 8.700504302978516, + "learning_rate": 9.975786923871858e-06, + "loss": 0.2311, + "step": 1622 + }, + { + "epoch": 0.041070931497836374, + "grad_norm": 8.038318634033203, + "learning_rate": 9.975747440430321e-06, + "loss": 0.2737, + "step": 1623 + }, + { + "epoch": 0.04109623706253005, + "grad_norm": 7.728898525238037, + "learning_rate": 9.975707924901112e-06, + "loss": 0.1988, + "step": 1624 + }, + { + "epoch": 0.04112154262722373, + "grad_norm": 3.2996766567230225, + "learning_rate": 9.975668377284491e-06, + "loss": 0.2118, + "step": 1625 + }, + { + "epoch": 0.041146848191917405, + "grad_norm": 11.188117980957031, + "learning_rate": 9.97562879758071e-06, + "loss": 0.3828, + "step": 1626 + }, + { + "epoch": 0.041172153756611075, + "grad_norm": 3.2589077949523926, + "learning_rate": 9.975589185790024e-06, + "loss": 0.1656, + "step": 1627 + }, + { + "epoch": 0.04119745932130475, + "grad_norm": 5.032970428466797, + "learning_rate": 9.97554954191269e-06, + "loss": 0.2272, + "step": 1628 + }, + { + "epoch": 0.04122276488599843, + "grad_norm": 5.482730388641357, + "learning_rate": 9.97550986594896e-06, + "loss": 0.1903, + "step": 1629 + }, + { + "epoch": 0.041248070450692106, + "grad_norm": 8.766019821166992, + "learning_rate": 9.975470157899094e-06, + "loss": 0.1528, + "step": 1630 + }, + { + "epoch": 0.04127337601538578, + "grad_norm": 9.537262916564941, + "learning_rate": 9.975430417763346e-06, + "loss": 0.246, + "step": 1631 + }, + { + "epoch": 0.04129868158007946, + "grad_norm": 5.688976764678955, + "learning_rate": 9.975390645541973e-06, + "loss": 0.1984, + "step": 1632 + }, + { + "epoch": 0.04132398714477314, + "grad_norm": 11.828789710998535, + "learning_rate": 9.975350841235231e-06, + "loss": 0.2851, + "step": 1633 + }, + { + "epoch": 0.041349292709466814, + "grad_norm": 5.34282112121582, + "learning_rate": 9.975311004843378e-06, + "loss": 0.2608, + "step": 1634 + }, + { + "epoch": 0.041374598274160485, + "grad_norm": 13.104724884033203, + "learning_rate": 9.975271136366668e-06, + "loss": 0.3087, + "step": 1635 + }, + { + "epoch": 0.04139990383885416, + "grad_norm": 5.823247909545898, + "learning_rate": 9.975231235805362e-06, + "loss": 0.2495, + "step": 1636 + }, + { + "epoch": 0.04142520940354784, + "grad_norm": 9.818582534790039, + "learning_rate": 9.975191303159713e-06, + "loss": 0.2473, + "step": 1637 + }, + { + "epoch": 0.041450514968241516, + "grad_norm": 6.02924108505249, + "learning_rate": 9.975151338429983e-06, + "loss": 0.2168, + "step": 1638 + }, + { + "epoch": 0.04147582053293519, + "grad_norm": 5.0621724128723145, + "learning_rate": 9.975111341616426e-06, + "loss": 0.1713, + "step": 1639 + }, + { + "epoch": 0.04150112609762887, + "grad_norm": 12.898845672607422, + "learning_rate": 9.975071312719302e-06, + "loss": 0.2465, + "step": 1640 + }, + { + "epoch": 0.04152643166232255, + "grad_norm": 10.441394805908203, + "learning_rate": 9.975031251738866e-06, + "loss": 0.1749, + "step": 1641 + }, + { + "epoch": 0.041551737227016224, + "grad_norm": 8.851018905639648, + "learning_rate": 9.974991158675382e-06, + "loss": 0.2923, + "step": 1642 + }, + { + "epoch": 0.041577042791709894, + "grad_norm": 5.422561168670654, + "learning_rate": 9.974951033529106e-06, + "loss": 0.1683, + "step": 1643 + }, + { + "epoch": 0.04160234835640357, + "grad_norm": 4.942183017730713, + "learning_rate": 9.974910876300294e-06, + "loss": 0.2122, + "step": 1644 + }, + { + "epoch": 0.04162765392109725, + "grad_norm": 14.28628921508789, + "learning_rate": 9.974870686989207e-06, + "loss": 0.2345, + "step": 1645 + }, + { + "epoch": 0.041652959485790925, + "grad_norm": 9.79485034942627, + "learning_rate": 9.974830465596107e-06, + "loss": 0.1946, + "step": 1646 + }, + { + "epoch": 0.0416782650504846, + "grad_norm": 15.503375053405762, + "learning_rate": 9.974790212121249e-06, + "loss": 0.2531, + "step": 1647 + }, + { + "epoch": 0.04170357061517828, + "grad_norm": 4.070854663848877, + "learning_rate": 9.974749926564893e-06, + "loss": 0.1727, + "step": 1648 + }, + { + "epoch": 0.041728876179871956, + "grad_norm": 8.608760833740234, + "learning_rate": 9.974709608927303e-06, + "loss": 0.2336, + "step": 1649 + }, + { + "epoch": 0.04175418174456563, + "grad_norm": 8.00377082824707, + "learning_rate": 9.974669259208734e-06, + "loss": 0.1459, + "step": 1650 + }, + { + "epoch": 0.0417794873092593, + "grad_norm": 5.382720470428467, + "learning_rate": 9.974628877409447e-06, + "loss": 0.1784, + "step": 1651 + }, + { + "epoch": 0.04180479287395298, + "grad_norm": 4.8192572593688965, + "learning_rate": 9.974588463529707e-06, + "loss": 0.1836, + "step": 1652 + }, + { + "epoch": 0.04183009843864666, + "grad_norm": 8.557161331176758, + "learning_rate": 9.97454801756977e-06, + "loss": 0.191, + "step": 1653 + }, + { + "epoch": 0.041855404003340334, + "grad_norm": 6.653805732727051, + "learning_rate": 9.974507539529896e-06, + "loss": 0.2232, + "step": 1654 + }, + { + "epoch": 0.04188070956803401, + "grad_norm": 7.044993877410889, + "learning_rate": 9.97446702941035e-06, + "loss": 0.288, + "step": 1655 + }, + { + "epoch": 0.04190601513272769, + "grad_norm": 5.370026111602783, + "learning_rate": 9.974426487211393e-06, + "loss": 0.2099, + "step": 1656 + }, + { + "epoch": 0.041931320697421365, + "grad_norm": 9.71114444732666, + "learning_rate": 9.974385912933282e-06, + "loss": 0.2077, + "step": 1657 + }, + { + "epoch": 0.04195662626211504, + "grad_norm": 5.396797180175781, + "learning_rate": 9.974345306576284e-06, + "loss": 0.2044, + "step": 1658 + }, + { + "epoch": 0.04198193182680871, + "grad_norm": 9.259424209594727, + "learning_rate": 9.974304668140657e-06, + "loss": 0.2218, + "step": 1659 + }, + { + "epoch": 0.04200723739150239, + "grad_norm": 18.02800941467285, + "learning_rate": 9.974263997626665e-06, + "loss": 0.272, + "step": 1660 + }, + { + "epoch": 0.042032542956196066, + "grad_norm": 13.745356559753418, + "learning_rate": 9.97422329503457e-06, + "loss": 0.2308, + "step": 1661 + }, + { + "epoch": 0.04205784852088974, + "grad_norm": 7.345616817474365, + "learning_rate": 9.974182560364635e-06, + "loss": 0.182, + "step": 1662 + }, + { + "epoch": 0.04208315408558342, + "grad_norm": 6.719465732574463, + "learning_rate": 9.97414179361712e-06, + "loss": 0.0585, + "step": 1663 + }, + { + "epoch": 0.0421084596502771, + "grad_norm": 6.54770040512085, + "learning_rate": 9.97410099479229e-06, + "loss": 0.1433, + "step": 1664 + }, + { + "epoch": 0.042133765214970774, + "grad_norm": 5.233901500701904, + "learning_rate": 9.97406016389041e-06, + "loss": 0.2154, + "step": 1665 + }, + { + "epoch": 0.04215907077966445, + "grad_norm": 11.609484672546387, + "learning_rate": 9.97401930091174e-06, + "loss": 0.26, + "step": 1666 + }, + { + "epoch": 0.04218437634435812, + "grad_norm": 12.282440185546875, + "learning_rate": 9.973978405856544e-06, + "loss": 0.3183, + "step": 1667 + }, + { + "epoch": 0.0422096819090518, + "grad_norm": 12.744296073913574, + "learning_rate": 9.973937478725089e-06, + "loss": 0.2732, + "step": 1668 + }, + { + "epoch": 0.042234987473745475, + "grad_norm": 13.365348815917969, + "learning_rate": 9.973896519517635e-06, + "loss": 0.1892, + "step": 1669 + }, + { + "epoch": 0.04226029303843915, + "grad_norm": 6.662704944610596, + "learning_rate": 9.973855528234447e-06, + "loss": 0.1447, + "step": 1670 + }, + { + "epoch": 0.04228559860313283, + "grad_norm": 7.152637481689453, + "learning_rate": 9.97381450487579e-06, + "loss": 0.1169, + "step": 1671 + }, + { + "epoch": 0.042310904167826506, + "grad_norm": 14.635217666625977, + "learning_rate": 9.973773449441928e-06, + "loss": 0.3046, + "step": 1672 + }, + { + "epoch": 0.04233620973252018, + "grad_norm": 7.436445236206055, + "learning_rate": 9.973732361933126e-06, + "loss": 0.2545, + "step": 1673 + }, + { + "epoch": 0.04236151529721386, + "grad_norm": 10.005112648010254, + "learning_rate": 9.97369124234965e-06, + "loss": 0.3111, + "step": 1674 + }, + { + "epoch": 0.04238682086190753, + "grad_norm": 20.072654724121094, + "learning_rate": 9.973650090691764e-06, + "loss": 0.2928, + "step": 1675 + }, + { + "epoch": 0.04241212642660121, + "grad_norm": 7.868010520935059, + "learning_rate": 9.973608906959735e-06, + "loss": 0.2463, + "step": 1676 + }, + { + "epoch": 0.042437431991294884, + "grad_norm": 7.84945821762085, + "learning_rate": 9.973567691153827e-06, + "loss": 0.2603, + "step": 1677 + }, + { + "epoch": 0.04246273755598856, + "grad_norm": 4.927811622619629, + "learning_rate": 9.973526443274305e-06, + "loss": 0.202, + "step": 1678 + }, + { + "epoch": 0.04248804312068224, + "grad_norm": 9.710408210754395, + "learning_rate": 9.973485163321436e-06, + "loss": 0.1861, + "step": 1679 + }, + { + "epoch": 0.042513348685375915, + "grad_norm": 6.4449615478515625, + "learning_rate": 9.973443851295487e-06, + "loss": 0.2112, + "step": 1680 + }, + { + "epoch": 0.04253865425006959, + "grad_norm": 15.310823440551758, + "learning_rate": 9.973402507196723e-06, + "loss": 0.3036, + "step": 1681 + }, + { + "epoch": 0.04256395981476327, + "grad_norm": 4.998112201690674, + "learning_rate": 9.973361131025412e-06, + "loss": 0.1692, + "step": 1682 + }, + { + "epoch": 0.04258926537945694, + "grad_norm": 19.682846069335938, + "learning_rate": 9.97331972278182e-06, + "loss": 0.4935, + "step": 1683 + }, + { + "epoch": 0.042614570944150616, + "grad_norm": 10.353571891784668, + "learning_rate": 9.973278282466216e-06, + "loss": 0.2299, + "step": 1684 + }, + { + "epoch": 0.04263987650884429, + "grad_norm": 11.469210624694824, + "learning_rate": 9.973236810078863e-06, + "loss": 0.185, + "step": 1685 + }, + { + "epoch": 0.04266518207353797, + "grad_norm": 5.298785209655762, + "learning_rate": 9.973195305620032e-06, + "loss": 0.1667, + "step": 1686 + }, + { + "epoch": 0.04269048763823165, + "grad_norm": 9.666783332824707, + "learning_rate": 9.97315376908999e-06, + "loss": 0.2635, + "step": 1687 + }, + { + "epoch": 0.042715793202925324, + "grad_norm": 7.589277267456055, + "learning_rate": 9.973112200489003e-06, + "loss": 0.2512, + "step": 1688 + }, + { + "epoch": 0.042741098767619, + "grad_norm": 12.840662956237793, + "learning_rate": 9.973070599817341e-06, + "loss": 0.3264, + "step": 1689 + }, + { + "epoch": 0.04276640433231268, + "grad_norm": 6.440617561340332, + "learning_rate": 9.973028967075272e-06, + "loss": 0.2019, + "step": 1690 + }, + { + "epoch": 0.04279170989700635, + "grad_norm": 5.559985637664795, + "learning_rate": 9.972987302263064e-06, + "loss": 0.2699, + "step": 1691 + }, + { + "epoch": 0.042817015461700025, + "grad_norm": 7.626654624938965, + "learning_rate": 9.972945605380987e-06, + "loss": 0.1739, + "step": 1692 + }, + { + "epoch": 0.0428423210263937, + "grad_norm": 16.3814754486084, + "learning_rate": 9.972903876429308e-06, + "loss": 0.2632, + "step": 1693 + }, + { + "epoch": 0.04286762659108738, + "grad_norm": 12.812599182128906, + "learning_rate": 9.972862115408297e-06, + "loss": 0.1708, + "step": 1694 + }, + { + "epoch": 0.042892932155781056, + "grad_norm": 5.827377796173096, + "learning_rate": 9.972820322318224e-06, + "loss": 0.2679, + "step": 1695 + }, + { + "epoch": 0.04291823772047473, + "grad_norm": 6.778584957122803, + "learning_rate": 9.972778497159356e-06, + "loss": 0.1759, + "step": 1696 + }, + { + "epoch": 0.04294354328516841, + "grad_norm": 7.949367046356201, + "learning_rate": 9.972736639931966e-06, + "loss": 0.2864, + "step": 1697 + }, + { + "epoch": 0.04296884884986209, + "grad_norm": 4.321409225463867, + "learning_rate": 9.972694750636321e-06, + "loss": 0.2182, + "step": 1698 + }, + { + "epoch": 0.04299415441455576, + "grad_norm": 5.101837635040283, + "learning_rate": 9.972652829272693e-06, + "loss": 0.1425, + "step": 1699 + }, + { + "epoch": 0.043019459979249434, + "grad_norm": 5.187796115875244, + "learning_rate": 9.972610875841353e-06, + "loss": 0.1655, + "step": 1700 + }, + { + "epoch": 0.04304476554394311, + "grad_norm": 5.6820244789123535, + "learning_rate": 9.97256889034257e-06, + "loss": 0.2132, + "step": 1701 + }, + { + "epoch": 0.04307007110863679, + "grad_norm": 5.467798233032227, + "learning_rate": 9.972526872776614e-06, + "loss": 0.2442, + "step": 1702 + }, + { + "epoch": 0.043095376673330466, + "grad_norm": 8.271110534667969, + "learning_rate": 9.972484823143758e-06, + "loss": 0.2434, + "step": 1703 + }, + { + "epoch": 0.04312068223802414, + "grad_norm": 6.637341022491455, + "learning_rate": 9.972442741444273e-06, + "loss": 0.1996, + "step": 1704 + }, + { + "epoch": 0.04314598780271782, + "grad_norm": 6.988156318664551, + "learning_rate": 9.972400627678429e-06, + "loss": 0.2195, + "step": 1705 + }, + { + "epoch": 0.0431712933674115, + "grad_norm": 14.541023254394531, + "learning_rate": 9.972358481846498e-06, + "loss": 0.3546, + "step": 1706 + }, + { + "epoch": 0.04319659893210517, + "grad_norm": 4.510189533233643, + "learning_rate": 9.972316303948752e-06, + "loss": 0.185, + "step": 1707 + }, + { + "epoch": 0.043221904496798844, + "grad_norm": 8.65124225616455, + "learning_rate": 9.972274093985465e-06, + "loss": 0.2366, + "step": 1708 + }, + { + "epoch": 0.04324721006149252, + "grad_norm": 11.190027236938477, + "learning_rate": 9.972231851956906e-06, + "loss": 0.2428, + "step": 1709 + }, + { + "epoch": 0.0432725156261862, + "grad_norm": 7.274660110473633, + "learning_rate": 9.972189577863348e-06, + "loss": 0.2872, + "step": 1710 + }, + { + "epoch": 0.043297821190879875, + "grad_norm": 15.79271411895752, + "learning_rate": 9.972147271705065e-06, + "loss": 0.2219, + "step": 1711 + }, + { + "epoch": 0.04332312675557355, + "grad_norm": 6.217609882354736, + "learning_rate": 9.972104933482329e-06, + "loss": 0.1576, + "step": 1712 + }, + { + "epoch": 0.04334843232026723, + "grad_norm": 8.03512191772461, + "learning_rate": 9.972062563195413e-06, + "loss": 0.2508, + "step": 1713 + }, + { + "epoch": 0.043373737884960906, + "grad_norm": 38.19951248168945, + "learning_rate": 9.97202016084459e-06, + "loss": 0.3769, + "step": 1714 + }, + { + "epoch": 0.043399043449654576, + "grad_norm": 11.740537643432617, + "learning_rate": 9.971977726430135e-06, + "loss": 0.3365, + "step": 1715 + }, + { + "epoch": 0.04342434901434825, + "grad_norm": 4.88108491897583, + "learning_rate": 9.971935259952319e-06, + "loss": 0.1734, + "step": 1716 + }, + { + "epoch": 0.04344965457904193, + "grad_norm": 9.984344482421875, + "learning_rate": 9.97189276141142e-06, + "loss": 0.3094, + "step": 1717 + }, + { + "epoch": 0.04347496014373561, + "grad_norm": 17.798912048339844, + "learning_rate": 9.971850230807706e-06, + "loss": 0.2519, + "step": 1718 + }, + { + "epoch": 0.043500265708429284, + "grad_norm": 6.795989513397217, + "learning_rate": 9.971807668141458e-06, + "loss": 0.1938, + "step": 1719 + }, + { + "epoch": 0.04352557127312296, + "grad_norm": 6.137596130371094, + "learning_rate": 9.971765073412944e-06, + "loss": 0.2323, + "step": 1720 + }, + { + "epoch": 0.04355087683781664, + "grad_norm": 6.24748420715332, + "learning_rate": 9.971722446622443e-06, + "loss": 0.2194, + "step": 1721 + }, + { + "epoch": 0.043576182402510315, + "grad_norm": 11.960137367248535, + "learning_rate": 9.97167978777023e-06, + "loss": 0.2405, + "step": 1722 + }, + { + "epoch": 0.043601487967203985, + "grad_norm": 14.59883975982666, + "learning_rate": 9.971637096856578e-06, + "loss": 0.3966, + "step": 1723 + }, + { + "epoch": 0.04362679353189766, + "grad_norm": 9.463623046875, + "learning_rate": 9.971594373881764e-06, + "loss": 0.2303, + "step": 1724 + }, + { + "epoch": 0.04365209909659134, + "grad_norm": 3.9765331745147705, + "learning_rate": 9.97155161884606e-06, + "loss": 0.1508, + "step": 1725 + }, + { + "epoch": 0.043677404661285016, + "grad_norm": 15.174936294555664, + "learning_rate": 9.971508831749748e-06, + "loss": 0.4058, + "step": 1726 + }, + { + "epoch": 0.04370271022597869, + "grad_norm": 5.945741653442383, + "learning_rate": 9.971466012593098e-06, + "loss": 0.2107, + "step": 1727 + }, + { + "epoch": 0.04372801579067237, + "grad_norm": 5.422307968139648, + "learning_rate": 9.971423161376389e-06, + "loss": 0.1989, + "step": 1728 + }, + { + "epoch": 0.04375332135536605, + "grad_norm": 10.834919929504395, + "learning_rate": 9.971380278099896e-06, + "loss": 0.2495, + "step": 1729 + }, + { + "epoch": 0.043778626920059724, + "grad_norm": 8.21037769317627, + "learning_rate": 9.971337362763898e-06, + "loss": 0.2268, + "step": 1730 + }, + { + "epoch": 0.043803932484753394, + "grad_norm": 7.959634780883789, + "learning_rate": 9.971294415368668e-06, + "loss": 0.2318, + "step": 1731 + }, + { + "epoch": 0.04382923804944707, + "grad_norm": 5.033545017242432, + "learning_rate": 9.971251435914488e-06, + "loss": 0.2291, + "step": 1732 + }, + { + "epoch": 0.04385454361414075, + "grad_norm": 6.098812580108643, + "learning_rate": 9.97120842440163e-06, + "loss": 0.242, + "step": 1733 + }, + { + "epoch": 0.043879849178834425, + "grad_norm": 8.807971954345703, + "learning_rate": 9.971165380830375e-06, + "loss": 0.2447, + "step": 1734 + }, + { + "epoch": 0.0439051547435281, + "grad_norm": 14.765472412109375, + "learning_rate": 9.971122305200998e-06, + "loss": 0.2826, + "step": 1735 + }, + { + "epoch": 0.04393046030822178, + "grad_norm": 7.119283676147461, + "learning_rate": 9.971079197513778e-06, + "loss": 0.2542, + "step": 1736 + }, + { + "epoch": 0.043955765872915456, + "grad_norm": 6.833852767944336, + "learning_rate": 9.971036057768992e-06, + "loss": 0.2561, + "step": 1737 + }, + { + "epoch": 0.04398107143760913, + "grad_norm": 12.242719650268555, + "learning_rate": 9.970992885966921e-06, + "loss": 0.2032, + "step": 1738 + }, + { + "epoch": 0.0440063770023028, + "grad_norm": 5.0576934814453125, + "learning_rate": 9.97094968210784e-06, + "loss": 0.2453, + "step": 1739 + }, + { + "epoch": 0.04403168256699648, + "grad_norm": 6.149620056152344, + "learning_rate": 9.97090644619203e-06, + "loss": 0.2311, + "step": 1740 + }, + { + "epoch": 0.04405698813169016, + "grad_norm": 7.9945173263549805, + "learning_rate": 9.97086317821977e-06, + "loss": 0.1996, + "step": 1741 + }, + { + "epoch": 0.044082293696383834, + "grad_norm": 5.981322765350342, + "learning_rate": 9.970819878191335e-06, + "loss": 0.2903, + "step": 1742 + }, + { + "epoch": 0.04410759926107751, + "grad_norm": 9.04451847076416, + "learning_rate": 9.97077654610701e-06, + "loss": 0.1767, + "step": 1743 + }, + { + "epoch": 0.04413290482577119, + "grad_norm": 9.325383186340332, + "learning_rate": 9.970733181967071e-06, + "loss": 0.2595, + "step": 1744 + }, + { + "epoch": 0.044158210390464865, + "grad_norm": 6.1068854331970215, + "learning_rate": 9.970689785771798e-06, + "loss": 0.2051, + "step": 1745 + }, + { + "epoch": 0.04418351595515854, + "grad_norm": 12.70165729522705, + "learning_rate": 9.970646357521472e-06, + "loss": 0.3178, + "step": 1746 + }, + { + "epoch": 0.04420882151985221, + "grad_norm": 6.418624401092529, + "learning_rate": 9.970602897216373e-06, + "loss": 0.185, + "step": 1747 + }, + { + "epoch": 0.04423412708454589, + "grad_norm": 7.588964939117432, + "learning_rate": 9.970559404856778e-06, + "loss": 0.2586, + "step": 1748 + }, + { + "epoch": 0.044259432649239566, + "grad_norm": 6.96023416519165, + "learning_rate": 9.97051588044297e-06, + "loss": 0.16, + "step": 1749 + }, + { + "epoch": 0.04428473821393324, + "grad_norm": 18.775226593017578, + "learning_rate": 9.970472323975232e-06, + "loss": 0.2803, + "step": 1750 + }, + { + "epoch": 0.04431004377862692, + "grad_norm": 4.20867395401001, + "learning_rate": 9.970428735453843e-06, + "loss": 0.1448, + "step": 1751 + }, + { + "epoch": 0.0443353493433206, + "grad_norm": 10.118420600891113, + "learning_rate": 9.970385114879082e-06, + "loss": 0.2274, + "step": 1752 + }, + { + "epoch": 0.044360654908014274, + "grad_norm": 11.520347595214844, + "learning_rate": 9.970341462251232e-06, + "loss": 0.225, + "step": 1753 + }, + { + "epoch": 0.04438596047270795, + "grad_norm": 7.502699375152588, + "learning_rate": 9.970297777570574e-06, + "loss": 0.2413, + "step": 1754 + }, + { + "epoch": 0.04441126603740162, + "grad_norm": 8.604540824890137, + "learning_rate": 9.97025406083739e-06, + "loss": 0.185, + "step": 1755 + }, + { + "epoch": 0.0444365716020953, + "grad_norm": 10.874455451965332, + "learning_rate": 9.970210312051963e-06, + "loss": 0.2687, + "step": 1756 + }, + { + "epoch": 0.044461877166788975, + "grad_norm": 9.149238586425781, + "learning_rate": 9.970166531214573e-06, + "loss": 0.2305, + "step": 1757 + }, + { + "epoch": 0.04448718273148265, + "grad_norm": 9.711522102355957, + "learning_rate": 9.970122718325506e-06, + "loss": 0.235, + "step": 1758 + }, + { + "epoch": 0.04451248829617633, + "grad_norm": 5.830216407775879, + "learning_rate": 9.970078873385041e-06, + "loss": 0.2943, + "step": 1759 + }, + { + "epoch": 0.044537793860870006, + "grad_norm": 13.617576599121094, + "learning_rate": 9.970034996393461e-06, + "loss": 0.2247, + "step": 1760 + }, + { + "epoch": 0.04456309942556368, + "grad_norm": 4.103499412536621, + "learning_rate": 9.96999108735105e-06, + "loss": 0.1414, + "step": 1761 + }, + { + "epoch": 0.04458840499025736, + "grad_norm": 4.896123886108398, + "learning_rate": 9.969947146258092e-06, + "loss": 0.2294, + "step": 1762 + }, + { + "epoch": 0.04461371055495103, + "grad_norm": 8.243021011352539, + "learning_rate": 9.969903173114868e-06, + "loss": 0.2018, + "step": 1763 + }, + { + "epoch": 0.04463901611964471, + "grad_norm": 8.445853233337402, + "learning_rate": 9.969859167921663e-06, + "loss": 0.2773, + "step": 1764 + }, + { + "epoch": 0.044664321684338384, + "grad_norm": 7.705677509307861, + "learning_rate": 9.96981513067876e-06, + "loss": 0.1961, + "step": 1765 + }, + { + "epoch": 0.04468962724903206, + "grad_norm": 11.696249961853027, + "learning_rate": 9.969771061386448e-06, + "loss": 0.2866, + "step": 1766 + }, + { + "epoch": 0.04471493281372574, + "grad_norm": 4.223360538482666, + "learning_rate": 9.969726960045003e-06, + "loss": 0.1936, + "step": 1767 + }, + { + "epoch": 0.044740238378419415, + "grad_norm": 6.626424312591553, + "learning_rate": 9.969682826654715e-06, + "loss": 0.2298, + "step": 1768 + }, + { + "epoch": 0.04476554394311309, + "grad_norm": 6.786004066467285, + "learning_rate": 9.969638661215866e-06, + "loss": 0.2753, + "step": 1769 + }, + { + "epoch": 0.04479084950780677, + "grad_norm": 15.457003593444824, + "learning_rate": 9.96959446372874e-06, + "loss": 0.2662, + "step": 1770 + }, + { + "epoch": 0.04481615507250044, + "grad_norm": 6.688142776489258, + "learning_rate": 9.969550234193626e-06, + "loss": 0.201, + "step": 1771 + }, + { + "epoch": 0.04484146063719412, + "grad_norm": 4.575770378112793, + "learning_rate": 9.969505972610807e-06, + "loss": 0.1887, + "step": 1772 + }, + { + "epoch": 0.044866766201887794, + "grad_norm": 15.284431457519531, + "learning_rate": 9.969461678980567e-06, + "loss": 0.3006, + "step": 1773 + }, + { + "epoch": 0.04489207176658147, + "grad_norm": 8.8455228805542, + "learning_rate": 9.969417353303195e-06, + "loss": 0.2716, + "step": 1774 + }, + { + "epoch": 0.04491737733127515, + "grad_norm": 14.231979370117188, + "learning_rate": 9.969372995578972e-06, + "loss": 0.2252, + "step": 1775 + }, + { + "epoch": 0.044942682895968825, + "grad_norm": 4.652178764343262, + "learning_rate": 9.969328605808189e-06, + "loss": 0.2219, + "step": 1776 + }, + { + "epoch": 0.0449679884606625, + "grad_norm": 13.973824501037598, + "learning_rate": 9.96928418399113e-06, + "loss": 0.2234, + "step": 1777 + }, + { + "epoch": 0.04499329402535618, + "grad_norm": 37.3187370300293, + "learning_rate": 9.969239730128082e-06, + "loss": 0.1498, + "step": 1778 + }, + { + "epoch": 0.04501859959004985, + "grad_norm": 8.941641807556152, + "learning_rate": 9.96919524421933e-06, + "loss": 0.1424, + "step": 1779 + }, + { + "epoch": 0.045043905154743526, + "grad_norm": 6.446828842163086, + "learning_rate": 9.969150726265162e-06, + "loss": 0.2386, + "step": 1780 + }, + { + "epoch": 0.0450692107194372, + "grad_norm": 7.789473533630371, + "learning_rate": 9.969106176265866e-06, + "loss": 0.2723, + "step": 1781 + }, + { + "epoch": 0.04509451628413088, + "grad_norm": 13.948160171508789, + "learning_rate": 9.96906159422173e-06, + "loss": 0.3015, + "step": 1782 + }, + { + "epoch": 0.04511982184882456, + "grad_norm": 4.845565319061279, + "learning_rate": 9.96901698013304e-06, + "loss": 0.211, + "step": 1783 + }, + { + "epoch": 0.045145127413518234, + "grad_norm": 12.451457977294922, + "learning_rate": 9.968972334000083e-06, + "loss": 0.3537, + "step": 1784 + }, + { + "epoch": 0.04517043297821191, + "grad_norm": 5.318933963775635, + "learning_rate": 9.968927655823146e-06, + "loss": 0.2572, + "step": 1785 + }, + { + "epoch": 0.04519573854290559, + "grad_norm": 5.915700912475586, + "learning_rate": 9.96888294560252e-06, + "loss": 0.1723, + "step": 1786 + }, + { + "epoch": 0.04522104410759926, + "grad_norm": 11.631078720092773, + "learning_rate": 9.968838203338493e-06, + "loss": 0.2958, + "step": 1787 + }, + { + "epoch": 0.045246349672292935, + "grad_norm": 12.423503875732422, + "learning_rate": 9.968793429031352e-06, + "loss": 0.2486, + "step": 1788 + }, + { + "epoch": 0.04527165523698661, + "grad_norm": 10.025568962097168, + "learning_rate": 9.968748622681385e-06, + "loss": 0.2036, + "step": 1789 + }, + { + "epoch": 0.04529696080168029, + "grad_norm": 5.238072872161865, + "learning_rate": 9.968703784288885e-06, + "loss": 0.2466, + "step": 1790 + }, + { + "epoch": 0.045322266366373966, + "grad_norm": 6.395547389984131, + "learning_rate": 9.968658913854138e-06, + "loss": 0.23, + "step": 1791 + }, + { + "epoch": 0.04534757193106764, + "grad_norm": 6.304800987243652, + "learning_rate": 9.968614011377433e-06, + "loss": 0.241, + "step": 1792 + }, + { + "epoch": 0.04537287749576132, + "grad_norm": 14.347447395324707, + "learning_rate": 9.96856907685906e-06, + "loss": 0.2105, + "step": 1793 + }, + { + "epoch": 0.045398183060455, + "grad_norm": 7.5379958152771, + "learning_rate": 9.968524110299311e-06, + "loss": 0.2336, + "step": 1794 + }, + { + "epoch": 0.04542348862514867, + "grad_norm": 10.33797836303711, + "learning_rate": 9.968479111698474e-06, + "loss": 0.2979, + "step": 1795 + }, + { + "epoch": 0.045448794189842344, + "grad_norm": 8.380107879638672, + "learning_rate": 9.968434081056839e-06, + "loss": 0.2431, + "step": 1796 + }, + { + "epoch": 0.04547409975453602, + "grad_norm": 5.336024284362793, + "learning_rate": 9.968389018374695e-06, + "loss": 0.25, + "step": 1797 + }, + { + "epoch": 0.0454994053192297, + "grad_norm": 19.156204223632812, + "learning_rate": 9.968343923652337e-06, + "loss": 0.3102, + "step": 1798 + }, + { + "epoch": 0.045524710883923375, + "grad_norm": 8.275544166564941, + "learning_rate": 9.968298796890052e-06, + "loss": 0.2471, + "step": 1799 + }, + { + "epoch": 0.04555001644861705, + "grad_norm": 11.224287033081055, + "learning_rate": 9.968253638088133e-06, + "loss": 0.3372, + "step": 1800 + }, + { + "epoch": 0.04557532201331073, + "grad_norm": 13.208270072937012, + "learning_rate": 9.96820844724687e-06, + "loss": 0.3399, + "step": 1801 + }, + { + "epoch": 0.045600627578004406, + "grad_norm": 9.425146102905273, + "learning_rate": 9.968163224366554e-06, + "loss": 0.2349, + "step": 1802 + }, + { + "epoch": 0.045625933142698076, + "grad_norm": 9.374542236328125, + "learning_rate": 9.968117969447479e-06, + "loss": 0.1889, + "step": 1803 + }, + { + "epoch": 0.04565123870739175, + "grad_norm": 9.883291244506836, + "learning_rate": 9.968072682489935e-06, + "loss": 0.2543, + "step": 1804 + }, + { + "epoch": 0.04567654427208543, + "grad_norm": 4.94498872756958, + "learning_rate": 9.968027363494212e-06, + "loss": 0.2225, + "step": 1805 + }, + { + "epoch": 0.04570184983677911, + "grad_norm": 5.445494651794434, + "learning_rate": 9.96798201246061e-06, + "loss": 0.2142, + "step": 1806 + }, + { + "epoch": 0.045727155401472784, + "grad_norm": 5.056578159332275, + "learning_rate": 9.96793662938941e-06, + "loss": 0.2298, + "step": 1807 + }, + { + "epoch": 0.04575246096616646, + "grad_norm": 8.567256927490234, + "learning_rate": 9.967891214280915e-06, + "loss": 0.2492, + "step": 1808 + }, + { + "epoch": 0.04577776653086014, + "grad_norm": 4.466290473937988, + "learning_rate": 9.967845767135412e-06, + "loss": 0.2019, + "step": 1809 + }, + { + "epoch": 0.045803072095553815, + "grad_norm": 7.030611038208008, + "learning_rate": 9.967800287953195e-06, + "loss": 0.1984, + "step": 1810 + }, + { + "epoch": 0.045828377660247485, + "grad_norm": 7.891604423522949, + "learning_rate": 9.96775477673456e-06, + "loss": 0.2043, + "step": 1811 + }, + { + "epoch": 0.04585368322494116, + "grad_norm": 9.397258758544922, + "learning_rate": 9.967709233479798e-06, + "loss": 0.2744, + "step": 1812 + }, + { + "epoch": 0.04587898878963484, + "grad_norm": 5.726401329040527, + "learning_rate": 9.967663658189201e-06, + "loss": 0.1345, + "step": 1813 + }, + { + "epoch": 0.045904294354328516, + "grad_norm": 8.776479721069336, + "learning_rate": 9.967618050863068e-06, + "loss": 0.2769, + "step": 1814 + }, + { + "epoch": 0.04592959991902219, + "grad_norm": 13.690624237060547, + "learning_rate": 9.967572411501688e-06, + "loss": 0.2223, + "step": 1815 + }, + { + "epoch": 0.04595490548371587, + "grad_norm": 8.366368293762207, + "learning_rate": 9.96752674010536e-06, + "loss": 0.214, + "step": 1816 + }, + { + "epoch": 0.04598021104840955, + "grad_norm": 5.341800689697266, + "learning_rate": 9.967481036674376e-06, + "loss": 0.2225, + "step": 1817 + }, + { + "epoch": 0.046005516613103224, + "grad_norm": 9.250658988952637, + "learning_rate": 9.967435301209029e-06, + "loss": 0.2938, + "step": 1818 + }, + { + "epoch": 0.046030822177796894, + "grad_norm": 6.482370853424072, + "learning_rate": 9.967389533709616e-06, + "loss": 0.2315, + "step": 1819 + }, + { + "epoch": 0.04605612774249057, + "grad_norm": 15.977272033691406, + "learning_rate": 9.967343734176432e-06, + "loss": 0.2725, + "step": 1820 + }, + { + "epoch": 0.04608143330718425, + "grad_norm": 7.633048057556152, + "learning_rate": 9.967297902609773e-06, + "loss": 0.157, + "step": 1821 + }, + { + "epoch": 0.046106738871877925, + "grad_norm": 7.222787380218506, + "learning_rate": 9.967252039009934e-06, + "loss": 0.2153, + "step": 1822 + }, + { + "epoch": 0.0461320444365716, + "grad_norm": 7.231213092803955, + "learning_rate": 9.967206143377207e-06, + "loss": 0.2167, + "step": 1823 + }, + { + "epoch": 0.04615735000126528, + "grad_norm": 9.384899139404297, + "learning_rate": 9.967160215711896e-06, + "loss": 0.3574, + "step": 1824 + }, + { + "epoch": 0.046182655565958956, + "grad_norm": 19.475934982299805, + "learning_rate": 9.967114256014292e-06, + "loss": 0.2784, + "step": 1825 + }, + { + "epoch": 0.04620796113065263, + "grad_norm": 8.440403938293457, + "learning_rate": 9.967068264284692e-06, + "loss": 0.2382, + "step": 1826 + }, + { + "epoch": 0.0462332666953463, + "grad_norm": 6.868809700012207, + "learning_rate": 9.967022240523392e-06, + "loss": 0.2999, + "step": 1827 + }, + { + "epoch": 0.04625857226003998, + "grad_norm": 8.318065643310547, + "learning_rate": 9.966976184730688e-06, + "loss": 0.2417, + "step": 1828 + }, + { + "epoch": 0.04628387782473366, + "grad_norm": 8.605517387390137, + "learning_rate": 9.966930096906881e-06, + "loss": 0.2446, + "step": 1829 + }, + { + "epoch": 0.046309183389427334, + "grad_norm": 8.355656623840332, + "learning_rate": 9.966883977052264e-06, + "loss": 0.2948, + "step": 1830 + }, + { + "epoch": 0.04633448895412101, + "grad_norm": 6.695159912109375, + "learning_rate": 9.966837825167138e-06, + "loss": 0.1617, + "step": 1831 + }, + { + "epoch": 0.04635979451881469, + "grad_norm": 10.284083366394043, + "learning_rate": 9.966791641251796e-06, + "loss": 0.2363, + "step": 1832 + }, + { + "epoch": 0.046385100083508365, + "grad_norm": 10.797038078308105, + "learning_rate": 9.966745425306541e-06, + "loss": 0.2683, + "step": 1833 + }, + { + "epoch": 0.04641040564820204, + "grad_norm": 8.776719093322754, + "learning_rate": 9.96669917733167e-06, + "loss": 0.2116, + "step": 1834 + }, + { + "epoch": 0.04643571121289571, + "grad_norm": 7.179711818695068, + "learning_rate": 9.966652897327476e-06, + "loss": 0.234, + "step": 1835 + }, + { + "epoch": 0.04646101677758939, + "grad_norm": 9.873480796813965, + "learning_rate": 9.966606585294263e-06, + "loss": 0.2628, + "step": 1836 + }, + { + "epoch": 0.04648632234228307, + "grad_norm": 5.40507173538208, + "learning_rate": 9.966560241232328e-06, + "loss": 0.17, + "step": 1837 + }, + { + "epoch": 0.046511627906976744, + "grad_norm": 5.672785758972168, + "learning_rate": 9.96651386514197e-06, + "loss": 0.1845, + "step": 1838 + }, + { + "epoch": 0.04653693347167042, + "grad_norm": 7.4603962898254395, + "learning_rate": 9.966467457023488e-06, + "loss": 0.2048, + "step": 1839 + }, + { + "epoch": 0.0465622390363641, + "grad_norm": 9.85168170928955, + "learning_rate": 9.966421016877181e-06, + "loss": 0.2872, + "step": 1840 + }, + { + "epoch": 0.046587544601057775, + "grad_norm": 6.2638421058654785, + "learning_rate": 9.966374544703348e-06, + "loss": 0.2467, + "step": 1841 + }, + { + "epoch": 0.04661285016575145, + "grad_norm": 6.744561672210693, + "learning_rate": 9.966328040502291e-06, + "loss": 0.2051, + "step": 1842 + }, + { + "epoch": 0.04663815573044512, + "grad_norm": 8.772834777832031, + "learning_rate": 9.966281504274307e-06, + "loss": 0.2206, + "step": 1843 + }, + { + "epoch": 0.0466634612951388, + "grad_norm": 7.723798751831055, + "learning_rate": 9.966234936019699e-06, + "loss": 0.2459, + "step": 1844 + }, + { + "epoch": 0.046688766859832476, + "grad_norm": 10.413102149963379, + "learning_rate": 9.966188335738764e-06, + "loss": 0.2449, + "step": 1845 + }, + { + "epoch": 0.04671407242452615, + "grad_norm": 9.87121295928955, + "learning_rate": 9.966141703431804e-06, + "loss": 0.2003, + "step": 1846 + }, + { + "epoch": 0.04673937798921983, + "grad_norm": 9.698378562927246, + "learning_rate": 9.966095039099121e-06, + "loss": 0.2985, + "step": 1847 + }, + { + "epoch": 0.04676468355391351, + "grad_norm": 8.013836860656738, + "learning_rate": 9.966048342741015e-06, + "loss": 0.2272, + "step": 1848 + }, + { + "epoch": 0.046789989118607184, + "grad_norm": 3.8054003715515137, + "learning_rate": 9.966001614357786e-06, + "loss": 0.1243, + "step": 1849 + }, + { + "epoch": 0.04681529468330086, + "grad_norm": 10.963750839233398, + "learning_rate": 9.965954853949737e-06, + "loss": 0.2103, + "step": 1850 + }, + { + "epoch": 0.04684060024799453, + "grad_norm": 7.837224006652832, + "learning_rate": 9.96590806151717e-06, + "loss": 0.3122, + "step": 1851 + }, + { + "epoch": 0.04686590581268821, + "grad_norm": 9.290478706359863, + "learning_rate": 9.965861237060384e-06, + "loss": 0.1299, + "step": 1852 + }, + { + "epoch": 0.046891211377381885, + "grad_norm": 9.32878589630127, + "learning_rate": 9.965814380579683e-06, + "loss": 0.2288, + "step": 1853 + }, + { + "epoch": 0.04691651694207556, + "grad_norm": 5.499343395233154, + "learning_rate": 9.96576749207537e-06, + "loss": 0.1602, + "step": 1854 + }, + { + "epoch": 0.04694182250676924, + "grad_norm": 6.795897006988525, + "learning_rate": 9.965720571547743e-06, + "loss": 0.2462, + "step": 1855 + }, + { + "epoch": 0.046967128071462916, + "grad_norm": 6.287297248840332, + "learning_rate": 9.965673618997109e-06, + "loss": 0.2391, + "step": 1856 + }, + { + "epoch": 0.04699243363615659, + "grad_norm": 10.001503944396973, + "learning_rate": 9.96562663442377e-06, + "loss": 0.1395, + "step": 1857 + }, + { + "epoch": 0.04701773920085027, + "grad_norm": 6.824020862579346, + "learning_rate": 9.96557961782803e-06, + "loss": 0.1887, + "step": 1858 + }, + { + "epoch": 0.04704304476554394, + "grad_norm": 7.461430549621582, + "learning_rate": 9.965532569210188e-06, + "loss": 0.267, + "step": 1859 + }, + { + "epoch": 0.04706835033023762, + "grad_norm": 8.49498176574707, + "learning_rate": 9.965485488570553e-06, + "loss": 0.2306, + "step": 1860 + }, + { + "epoch": 0.047093655894931294, + "grad_norm": 5.735085964202881, + "learning_rate": 9.965438375909424e-06, + "loss": 0.2122, + "step": 1861 + }, + { + "epoch": 0.04711896145962497, + "grad_norm": 6.187440395355225, + "learning_rate": 9.965391231227106e-06, + "loss": 0.2085, + "step": 1862 + }, + { + "epoch": 0.04714426702431865, + "grad_norm": 13.919921875, + "learning_rate": 9.965344054523905e-06, + "loss": 0.2821, + "step": 1863 + }, + { + "epoch": 0.047169572589012325, + "grad_norm": 9.918747901916504, + "learning_rate": 9.965296845800123e-06, + "loss": 0.321, + "step": 1864 + }, + { + "epoch": 0.047194878153706, + "grad_norm": 6.482818603515625, + "learning_rate": 9.965249605056066e-06, + "loss": 0.1638, + "step": 1865 + }, + { + "epoch": 0.04722018371839968, + "grad_norm": 18.08170509338379, + "learning_rate": 9.965202332292036e-06, + "loss": 0.2772, + "step": 1866 + }, + { + "epoch": 0.04724548928309335, + "grad_norm": 5.283572196960449, + "learning_rate": 9.965155027508341e-06, + "loss": 0.1597, + "step": 1867 + }, + { + "epoch": 0.047270794847787026, + "grad_norm": 18.31578826904297, + "learning_rate": 9.965107690705286e-06, + "loss": 0.2467, + "step": 1868 + }, + { + "epoch": 0.0472961004124807, + "grad_norm": 9.10570240020752, + "learning_rate": 9.965060321883173e-06, + "loss": 0.3209, + "step": 1869 + }, + { + "epoch": 0.04732140597717438, + "grad_norm": 7.9514970779418945, + "learning_rate": 9.965012921042312e-06, + "loss": 0.2322, + "step": 1870 + }, + { + "epoch": 0.04734671154186806, + "grad_norm": 7.730062007904053, + "learning_rate": 9.964965488183004e-06, + "loss": 0.1365, + "step": 1871 + }, + { + "epoch": 0.047372017106561734, + "grad_norm": 7.677950382232666, + "learning_rate": 9.964918023305557e-06, + "loss": 0.2473, + "step": 1872 + }, + { + "epoch": 0.04739732267125541, + "grad_norm": 5.626697063446045, + "learning_rate": 9.96487052641028e-06, + "loss": 0.1783, + "step": 1873 + }, + { + "epoch": 0.04742262823594909, + "grad_norm": 12.698324203491211, + "learning_rate": 9.964822997497474e-06, + "loss": 0.2593, + "step": 1874 + }, + { + "epoch": 0.04744793380064276, + "grad_norm": 5.677745342254639, + "learning_rate": 9.964775436567449e-06, + "loss": 0.1807, + "step": 1875 + }, + { + "epoch": 0.047473239365336435, + "grad_norm": 10.534659385681152, + "learning_rate": 9.96472784362051e-06, + "loss": 0.2804, + "step": 1876 + }, + { + "epoch": 0.04749854493003011, + "grad_norm": 7.14988899230957, + "learning_rate": 9.964680218656964e-06, + "loss": 0.2138, + "step": 1877 + }, + { + "epoch": 0.04752385049472379, + "grad_norm": 11.030766487121582, + "learning_rate": 9.96463256167712e-06, + "loss": 0.2364, + "step": 1878 + }, + { + "epoch": 0.047549156059417466, + "grad_norm": 6.377966403961182, + "learning_rate": 9.964584872681285e-06, + "loss": 0.2094, + "step": 1879 + }, + { + "epoch": 0.04757446162411114, + "grad_norm": 6.059832572937012, + "learning_rate": 9.964537151669764e-06, + "loss": 0.1918, + "step": 1880 + }, + { + "epoch": 0.04759976718880482, + "grad_norm": 7.973600387573242, + "learning_rate": 9.964489398642867e-06, + "loss": 0.2397, + "step": 1881 + }, + { + "epoch": 0.0476250727534985, + "grad_norm": 6.47101354598999, + "learning_rate": 9.964441613600902e-06, + "loss": 0.2591, + "step": 1882 + }, + { + "epoch": 0.04765037831819217, + "grad_norm": 5.526650428771973, + "learning_rate": 9.964393796544174e-06, + "loss": 0.1767, + "step": 1883 + }, + { + "epoch": 0.047675683882885844, + "grad_norm": 8.56629467010498, + "learning_rate": 9.964345947472996e-06, + "loss": 0.1769, + "step": 1884 + }, + { + "epoch": 0.04770098944757952, + "grad_norm": 12.852615356445312, + "learning_rate": 9.964298066387674e-06, + "loss": 0.265, + "step": 1885 + }, + { + "epoch": 0.0477262950122732, + "grad_norm": 10.520793914794922, + "learning_rate": 9.964250153288517e-06, + "loss": 0.2753, + "step": 1886 + }, + { + "epoch": 0.047751600576966875, + "grad_norm": 14.783217430114746, + "learning_rate": 9.964202208175835e-06, + "loss": 0.2116, + "step": 1887 + }, + { + "epoch": 0.04777690614166055, + "grad_norm": 6.694015026092529, + "learning_rate": 9.964154231049937e-06, + "loss": 0.2391, + "step": 1888 + }, + { + "epoch": 0.04780221170635423, + "grad_norm": 6.623727321624756, + "learning_rate": 9.96410622191113e-06, + "loss": 0.2444, + "step": 1889 + }, + { + "epoch": 0.047827517271047906, + "grad_norm": 14.907434463500977, + "learning_rate": 9.964058180759726e-06, + "loss": 0.2991, + "step": 1890 + }, + { + "epoch": 0.047852822835741576, + "grad_norm": 10.303367614746094, + "learning_rate": 9.964010107596034e-06, + "loss": 0.2735, + "step": 1891 + }, + { + "epoch": 0.04787812840043525, + "grad_norm": 5.800581455230713, + "learning_rate": 9.963962002420364e-06, + "loss": 0.2015, + "step": 1892 + }, + { + "epoch": 0.04790343396512893, + "grad_norm": 4.895249366760254, + "learning_rate": 9.963913865233026e-06, + "loss": 0.2039, + "step": 1893 + }, + { + "epoch": 0.04792873952982261, + "grad_norm": 6.152859687805176, + "learning_rate": 9.963865696034332e-06, + "loss": 0.2459, + "step": 1894 + }, + { + "epoch": 0.047954045094516284, + "grad_norm": 10.064774513244629, + "learning_rate": 9.96381749482459e-06, + "loss": 0.3444, + "step": 1895 + }, + { + "epoch": 0.04797935065920996, + "grad_norm": 4.234400272369385, + "learning_rate": 9.963769261604114e-06, + "loss": 0.1554, + "step": 1896 + }, + { + "epoch": 0.04800465622390364, + "grad_norm": 10.53221321105957, + "learning_rate": 9.963720996373213e-06, + "loss": 0.1415, + "step": 1897 + }, + { + "epoch": 0.048029961788597315, + "grad_norm": 9.397747993469238, + "learning_rate": 9.963672699132198e-06, + "loss": 0.1979, + "step": 1898 + }, + { + "epoch": 0.048055267353290985, + "grad_norm": 8.859753608703613, + "learning_rate": 9.963624369881381e-06, + "loss": 0.2876, + "step": 1899 + }, + { + "epoch": 0.04808057291798466, + "grad_norm": 4.102493762969971, + "learning_rate": 9.963576008621074e-06, + "loss": 0.2295, + "step": 1900 + }, + { + "epoch": 0.04810587848267834, + "grad_norm": 13.812660217285156, + "learning_rate": 9.963527615351588e-06, + "loss": 0.2119, + "step": 1901 + }, + { + "epoch": 0.048131184047372016, + "grad_norm": 4.237075328826904, + "learning_rate": 9.963479190073238e-06, + "loss": 0.1541, + "step": 1902 + }, + { + "epoch": 0.048156489612065694, + "grad_norm": 3.4895904064178467, + "learning_rate": 9.963430732786331e-06, + "loss": 0.1836, + "step": 1903 + }, + { + "epoch": 0.04818179517675937, + "grad_norm": 4.3292412757873535, + "learning_rate": 9.963382243491185e-06, + "loss": 0.1204, + "step": 1904 + }, + { + "epoch": 0.04820710074145305, + "grad_norm": 9.652873992919922, + "learning_rate": 9.96333372218811e-06, + "loss": 0.2465, + "step": 1905 + }, + { + "epoch": 0.048232406306146725, + "grad_norm": 10.376275062561035, + "learning_rate": 9.963285168877416e-06, + "loss": 0.3464, + "step": 1906 + }, + { + "epoch": 0.048257711870840395, + "grad_norm": 18.788436889648438, + "learning_rate": 9.963236583559424e-06, + "loss": 0.3313, + "step": 1907 + }, + { + "epoch": 0.04828301743553407, + "grad_norm": 7.224172592163086, + "learning_rate": 9.96318796623444e-06, + "loss": 0.2473, + "step": 1908 + }, + { + "epoch": 0.04830832300022775, + "grad_norm": 9.062091827392578, + "learning_rate": 9.963139316902779e-06, + "loss": 0.1626, + "step": 1909 + }, + { + "epoch": 0.048333628564921426, + "grad_norm": 13.596329689025879, + "learning_rate": 9.963090635564755e-06, + "loss": 0.3893, + "step": 1910 + }, + { + "epoch": 0.0483589341296151, + "grad_norm": 12.309565544128418, + "learning_rate": 9.963041922220686e-06, + "loss": 0.2694, + "step": 1911 + }, + { + "epoch": 0.04838423969430878, + "grad_norm": 8.840025901794434, + "learning_rate": 9.962993176870881e-06, + "loss": 0.2056, + "step": 1912 + }, + { + "epoch": 0.04840954525900246, + "grad_norm": 7.692770004272461, + "learning_rate": 9.962944399515656e-06, + "loss": 0.1948, + "step": 1913 + }, + { + "epoch": 0.048434850823696134, + "grad_norm": 4.6357340812683105, + "learning_rate": 9.962895590155326e-06, + "loss": 0.242, + "step": 1914 + }, + { + "epoch": 0.048460156388389804, + "grad_norm": 17.876935958862305, + "learning_rate": 9.962846748790207e-06, + "loss": 0.41, + "step": 1915 + }, + { + "epoch": 0.04848546195308348, + "grad_norm": 10.056160926818848, + "learning_rate": 9.96279787542061e-06, + "loss": 0.2296, + "step": 1916 + }, + { + "epoch": 0.04851076751777716, + "grad_norm": 7.046332359313965, + "learning_rate": 9.962748970046854e-06, + "loss": 0.2582, + "step": 1917 + }, + { + "epoch": 0.048536073082470835, + "grad_norm": 8.337257385253906, + "learning_rate": 9.962700032669253e-06, + "loss": 0.2295, + "step": 1918 + }, + { + "epoch": 0.04856137864716451, + "grad_norm": 13.795743942260742, + "learning_rate": 9.962651063288123e-06, + "loss": 0.3393, + "step": 1919 + }, + { + "epoch": 0.04858668421185819, + "grad_norm": 10.06857967376709, + "learning_rate": 9.962602061903777e-06, + "loss": 0.2335, + "step": 1920 + }, + { + "epoch": 0.048611989776551866, + "grad_norm": 3.327035427093506, + "learning_rate": 9.962553028516535e-06, + "loss": 0.1446, + "step": 1921 + }, + { + "epoch": 0.04863729534124554, + "grad_norm": 8.256908416748047, + "learning_rate": 9.962503963126714e-06, + "loss": 0.2138, + "step": 1922 + }, + { + "epoch": 0.04866260090593921, + "grad_norm": 4.944957733154297, + "learning_rate": 9.962454865734626e-06, + "loss": 0.2291, + "step": 1923 + }, + { + "epoch": 0.04868790647063289, + "grad_norm": 4.992069721221924, + "learning_rate": 9.962405736340588e-06, + "loss": 0.1973, + "step": 1924 + }, + { + "epoch": 0.04871321203532657, + "grad_norm": 8.139432907104492, + "learning_rate": 9.96235657494492e-06, + "loss": 0.2941, + "step": 1925 + }, + { + "epoch": 0.048738517600020244, + "grad_norm": 7.406540870666504, + "learning_rate": 9.962307381547938e-06, + "loss": 0.2281, + "step": 1926 + }, + { + "epoch": 0.04876382316471392, + "grad_norm": 5.3399739265441895, + "learning_rate": 9.962258156149958e-06, + "loss": 0.2078, + "step": 1927 + }, + { + "epoch": 0.0487891287294076, + "grad_norm": 5.202992916107178, + "learning_rate": 9.9622088987513e-06, + "loss": 0.1008, + "step": 1928 + }, + { + "epoch": 0.048814434294101275, + "grad_norm": 10.02236557006836, + "learning_rate": 9.962159609352277e-06, + "loss": 0.2777, + "step": 1929 + }, + { + "epoch": 0.04883973985879495, + "grad_norm": 11.688759803771973, + "learning_rate": 9.96211028795321e-06, + "loss": 0.2366, + "step": 1930 + }, + { + "epoch": 0.04886504542348862, + "grad_norm": 5.783825874328613, + "learning_rate": 9.962060934554418e-06, + "loss": 0.1864, + "step": 1931 + }, + { + "epoch": 0.0488903509881823, + "grad_norm": 9.654927253723145, + "learning_rate": 9.962011549156219e-06, + "loss": 0.286, + "step": 1932 + }, + { + "epoch": 0.048915656552875976, + "grad_norm": 21.028703689575195, + "learning_rate": 9.96196213175893e-06, + "loss": 0.2925, + "step": 1933 + }, + { + "epoch": 0.04894096211756965, + "grad_norm": 19.535032272338867, + "learning_rate": 9.96191268236287e-06, + "loss": 0.2926, + "step": 1934 + }, + { + "epoch": 0.04896626768226333, + "grad_norm": 7.2619476318359375, + "learning_rate": 9.961863200968357e-06, + "loss": 0.1908, + "step": 1935 + }, + { + "epoch": 0.04899157324695701, + "grad_norm": 7.909819602966309, + "learning_rate": 9.961813687575712e-06, + "loss": 0.1801, + "step": 1936 + }, + { + "epoch": 0.049016878811650684, + "grad_norm": 10.635193824768066, + "learning_rate": 9.961764142185253e-06, + "loss": 0.2621, + "step": 1937 + }, + { + "epoch": 0.04904218437634436, + "grad_norm": 4.304841041564941, + "learning_rate": 9.9617145647973e-06, + "loss": 0.1786, + "step": 1938 + }, + { + "epoch": 0.04906748994103803, + "grad_norm": 8.347023010253906, + "learning_rate": 9.961664955412173e-06, + "loss": 0.2198, + "step": 1939 + }, + { + "epoch": 0.04909279550573171, + "grad_norm": 35.8165283203125, + "learning_rate": 9.961615314030193e-06, + "loss": 0.478, + "step": 1940 + }, + { + "epoch": 0.049118101070425385, + "grad_norm": 7.377870559692383, + "learning_rate": 9.961565640651676e-06, + "loss": 0.238, + "step": 1941 + }, + { + "epoch": 0.04914340663511906, + "grad_norm": 8.764490127563477, + "learning_rate": 9.961515935276947e-06, + "loss": 0.2505, + "step": 1942 + }, + { + "epoch": 0.04916871219981274, + "grad_norm": 12.238152503967285, + "learning_rate": 9.961466197906324e-06, + "loss": 0.2757, + "step": 1943 + }, + { + "epoch": 0.049194017764506416, + "grad_norm": 10.529440879821777, + "learning_rate": 9.961416428540129e-06, + "loss": 0.2966, + "step": 1944 + }, + { + "epoch": 0.04921932332920009, + "grad_norm": 9.886028289794922, + "learning_rate": 9.961366627178684e-06, + "loss": 0.2277, + "step": 1945 + }, + { + "epoch": 0.04924462889389377, + "grad_norm": 10.270379066467285, + "learning_rate": 9.961316793822305e-06, + "loss": 0.2481, + "step": 1946 + }, + { + "epoch": 0.04926993445858744, + "grad_norm": 5.677229881286621, + "learning_rate": 9.961266928471319e-06, + "loss": 0.1957, + "step": 1947 + }, + { + "epoch": 0.04929524002328112, + "grad_norm": 7.023438930511475, + "learning_rate": 9.961217031126045e-06, + "loss": 0.2307, + "step": 1948 + }, + { + "epoch": 0.049320545587974794, + "grad_norm": 9.418893814086914, + "learning_rate": 9.961167101786804e-06, + "loss": 0.2491, + "step": 1949 + }, + { + "epoch": 0.04934585115266847, + "grad_norm": 7.578453540802002, + "learning_rate": 9.961117140453921e-06, + "loss": 0.2273, + "step": 1950 + }, + { + "epoch": 0.04937115671736215, + "grad_norm": 10.920439720153809, + "learning_rate": 9.961067147127717e-06, + "loss": 0.2899, + "step": 1951 + }, + { + "epoch": 0.049396462282055825, + "grad_norm": 14.373958587646484, + "learning_rate": 9.961017121808511e-06, + "loss": 0.2692, + "step": 1952 + }, + { + "epoch": 0.0494217678467495, + "grad_norm": 8.047755241394043, + "learning_rate": 9.960967064496629e-06, + "loss": 0.3036, + "step": 1953 + }, + { + "epoch": 0.04944707341144318, + "grad_norm": 5.1758623123168945, + "learning_rate": 9.960916975192395e-06, + "loss": 0.2358, + "step": 1954 + }, + { + "epoch": 0.04947237897613685, + "grad_norm": 15.096342086791992, + "learning_rate": 9.96086685389613e-06, + "loss": 0.2661, + "step": 1955 + }, + { + "epoch": 0.049497684540830526, + "grad_norm": 18.939403533935547, + "learning_rate": 9.960816700608156e-06, + "loss": 0.235, + "step": 1956 + }, + { + "epoch": 0.0495229901055242, + "grad_norm": 7.526584625244141, + "learning_rate": 9.960766515328799e-06, + "loss": 0.1614, + "step": 1957 + }, + { + "epoch": 0.04954829567021788, + "grad_norm": 6.130624294281006, + "learning_rate": 9.960716298058382e-06, + "loss": 0.2528, + "step": 1958 + }, + { + "epoch": 0.04957360123491156, + "grad_norm": 5.911852836608887, + "learning_rate": 9.960666048797227e-06, + "loss": 0.2269, + "step": 1959 + }, + { + "epoch": 0.049598906799605234, + "grad_norm": 6.409494876861572, + "learning_rate": 9.960615767545661e-06, + "loss": 0.1931, + "step": 1960 + }, + { + "epoch": 0.04962421236429891, + "grad_norm": 6.647186756134033, + "learning_rate": 9.960565454304005e-06, + "loss": 0.2466, + "step": 1961 + }, + { + "epoch": 0.04964951792899259, + "grad_norm": 10.706624031066895, + "learning_rate": 9.960515109072587e-06, + "loss": 0.2599, + "step": 1962 + }, + { + "epoch": 0.04967482349368626, + "grad_norm": 10.198858261108398, + "learning_rate": 9.960464731851727e-06, + "loss": 0.3266, + "step": 1963 + }, + { + "epoch": 0.049700129058379935, + "grad_norm": 10.89801025390625, + "learning_rate": 9.960414322641753e-06, + "loss": 0.1644, + "step": 1964 + }, + { + "epoch": 0.04972543462307361, + "grad_norm": 7.361929416656494, + "learning_rate": 9.96036388144299e-06, + "loss": 0.2355, + "step": 1965 + }, + { + "epoch": 0.04975074018776729, + "grad_norm": 14.296187400817871, + "learning_rate": 9.960313408255766e-06, + "loss": 0.3138, + "step": 1966 + }, + { + "epoch": 0.049776045752460966, + "grad_norm": 9.882479667663574, + "learning_rate": 9.960262903080402e-06, + "loss": 0.2221, + "step": 1967 + }, + { + "epoch": 0.049801351317154643, + "grad_norm": 17.344770431518555, + "learning_rate": 9.960212365917223e-06, + "loss": 0.1378, + "step": 1968 + }, + { + "epoch": 0.04982665688184832, + "grad_norm": 8.812728881835938, + "learning_rate": 9.96016179676656e-06, + "loss": 0.1321, + "step": 1969 + }, + { + "epoch": 0.049851962446542, + "grad_norm": 6.434032917022705, + "learning_rate": 9.960111195628734e-06, + "loss": 0.2303, + "step": 1970 + }, + { + "epoch": 0.04987726801123567, + "grad_norm": 5.2372026443481445, + "learning_rate": 9.960060562504076e-06, + "loss": 0.1723, + "step": 1971 + }, + { + "epoch": 0.049902573575929345, + "grad_norm": 5.712100505828857, + "learning_rate": 9.960009897392909e-06, + "loss": 0.1632, + "step": 1972 + }, + { + "epoch": 0.04992787914062302, + "grad_norm": 6.055217742919922, + "learning_rate": 9.95995920029556e-06, + "loss": 0.1996, + "step": 1973 + }, + { + "epoch": 0.0499531847053167, + "grad_norm": 5.543403625488281, + "learning_rate": 9.959908471212358e-06, + "loss": 0.159, + "step": 1974 + }, + { + "epoch": 0.049978490270010376, + "grad_norm": 8.127638816833496, + "learning_rate": 9.959857710143627e-06, + "loss": 0.2411, + "step": 1975 + }, + { + "epoch": 0.05000379583470405, + "grad_norm": 12.3136625289917, + "learning_rate": 9.959806917089698e-06, + "loss": 0.3061, + "step": 1976 + }, + { + "epoch": 0.05002910139939773, + "grad_norm": 8.43520736694336, + "learning_rate": 9.959756092050896e-06, + "loss": 0.19, + "step": 1977 + }, + { + "epoch": 0.05005440696409141, + "grad_norm": 9.932762145996094, + "learning_rate": 9.959705235027549e-06, + "loss": 0.2327, + "step": 1978 + }, + { + "epoch": 0.05007971252878508, + "grad_norm": 9.56675910949707, + "learning_rate": 9.959654346019987e-06, + "loss": 0.2978, + "step": 1979 + }, + { + "epoch": 0.050105018093478754, + "grad_norm": 10.25030517578125, + "learning_rate": 9.959603425028536e-06, + "loss": 0.1793, + "step": 1980 + }, + { + "epoch": 0.05013032365817243, + "grad_norm": 7.110988616943359, + "learning_rate": 9.959552472053524e-06, + "loss": 0.2323, + "step": 1981 + }, + { + "epoch": 0.05015562922286611, + "grad_norm": 7.843000411987305, + "learning_rate": 9.959501487095283e-06, + "loss": 0.2231, + "step": 1982 + }, + { + "epoch": 0.050180934787559785, + "grad_norm": 8.441217422485352, + "learning_rate": 9.959450470154136e-06, + "loss": 0.2869, + "step": 1983 + }, + { + "epoch": 0.05020624035225346, + "grad_norm": 9.584348678588867, + "learning_rate": 9.959399421230419e-06, + "loss": 0.2237, + "step": 1984 + }, + { + "epoch": 0.05023154591694714, + "grad_norm": 8.911300659179688, + "learning_rate": 9.959348340324454e-06, + "loss": 0.2177, + "step": 1985 + }, + { + "epoch": 0.050256851481640816, + "grad_norm": 7.377748012542725, + "learning_rate": 9.959297227436578e-06, + "loss": 0.1539, + "step": 1986 + }, + { + "epoch": 0.050282157046334486, + "grad_norm": 6.769806385040283, + "learning_rate": 9.959246082567115e-06, + "loss": 0.2604, + "step": 1987 + }, + { + "epoch": 0.05030746261102816, + "grad_norm": 5.512221336364746, + "learning_rate": 9.959194905716396e-06, + "loss": 0.223, + "step": 1988 + }, + { + "epoch": 0.05033276817572184, + "grad_norm": 6.432788848876953, + "learning_rate": 9.959143696884752e-06, + "loss": 0.2442, + "step": 1989 + }, + { + "epoch": 0.05035807374041552, + "grad_norm": 7.210688591003418, + "learning_rate": 9.959092456072513e-06, + "loss": 0.2231, + "step": 1990 + }, + { + "epoch": 0.050383379305109194, + "grad_norm": 9.156584739685059, + "learning_rate": 9.95904118328001e-06, + "loss": 0.2313, + "step": 1991 + }, + { + "epoch": 0.05040868486980287, + "grad_norm": 11.488678932189941, + "learning_rate": 9.95898987850757e-06, + "loss": 0.2679, + "step": 1992 + }, + { + "epoch": 0.05043399043449655, + "grad_norm": 10.671357154846191, + "learning_rate": 9.95893854175553e-06, + "loss": 0.247, + "step": 1993 + }, + { + "epoch": 0.050459295999190225, + "grad_norm": 8.162171363830566, + "learning_rate": 9.958887173024215e-06, + "loss": 0.2772, + "step": 1994 + }, + { + "epoch": 0.050484601563883895, + "grad_norm": 6.2803826332092285, + "learning_rate": 9.958835772313961e-06, + "loss": 0.2346, + "step": 1995 + }, + { + "epoch": 0.05050990712857757, + "grad_norm": 3.84771728515625, + "learning_rate": 9.958784339625095e-06, + "loss": 0.1851, + "step": 1996 + }, + { + "epoch": 0.05053521269327125, + "grad_norm": 5.49039888381958, + "learning_rate": 9.958732874957954e-06, + "loss": 0.1661, + "step": 1997 + }, + { + "epoch": 0.050560518257964926, + "grad_norm": 12.93824291229248, + "learning_rate": 9.958681378312866e-06, + "loss": 0.365, + "step": 1998 + }, + { + "epoch": 0.0505858238226586, + "grad_norm": 6.502162933349609, + "learning_rate": 9.958629849690165e-06, + "loss": 0.3053, + "step": 1999 + }, + { + "epoch": 0.05061112938735228, + "grad_norm": 12.052800178527832, + "learning_rate": 9.958578289090181e-06, + "loss": 0.3057, + "step": 2000 + }, + { + "epoch": 0.05063643495204596, + "grad_norm": 5.0840067863464355, + "learning_rate": 9.958526696513248e-06, + "loss": 0.1977, + "step": 2001 + }, + { + "epoch": 0.050661740516739634, + "grad_norm": 10.932539939880371, + "learning_rate": 9.9584750719597e-06, + "loss": 0.3271, + "step": 2002 + }, + { + "epoch": 0.050687046081433304, + "grad_norm": 8.271261215209961, + "learning_rate": 9.95842341542987e-06, + "loss": 0.2095, + "step": 2003 + }, + { + "epoch": 0.05071235164612698, + "grad_norm": 8.979180335998535, + "learning_rate": 9.958371726924087e-06, + "loss": 0.2297, + "step": 2004 + }, + { + "epoch": 0.05073765721082066, + "grad_norm": 9.395736694335938, + "learning_rate": 9.958320006442687e-06, + "loss": 0.2951, + "step": 2005 + }, + { + "epoch": 0.050762962775514335, + "grad_norm": 4.204593658447266, + "learning_rate": 9.958268253986006e-06, + "loss": 0.1803, + "step": 2006 + }, + { + "epoch": 0.05078826834020801, + "grad_norm": 15.59800910949707, + "learning_rate": 9.958216469554372e-06, + "loss": 0.291, + "step": 2007 + }, + { + "epoch": 0.05081357390490169, + "grad_norm": 6.241776943206787, + "learning_rate": 9.958164653148124e-06, + "loss": 0.2336, + "step": 2008 + }, + { + "epoch": 0.050838879469595366, + "grad_norm": 7.068363666534424, + "learning_rate": 9.958112804767594e-06, + "loss": 0.1683, + "step": 2009 + }, + { + "epoch": 0.05086418503428904, + "grad_norm": 10.681844711303711, + "learning_rate": 9.958060924413118e-06, + "loss": 0.1844, + "step": 2010 + }, + { + "epoch": 0.05088949059898271, + "grad_norm": 13.075315475463867, + "learning_rate": 9.958009012085029e-06, + "loss": 0.3486, + "step": 2011 + }, + { + "epoch": 0.05091479616367639, + "grad_norm": 5.744171142578125, + "learning_rate": 9.957957067783662e-06, + "loss": 0.1867, + "step": 2012 + }, + { + "epoch": 0.05094010172837007, + "grad_norm": 20.04264259338379, + "learning_rate": 9.957905091509352e-06, + "loss": 0.2757, + "step": 2013 + }, + { + "epoch": 0.050965407293063744, + "grad_norm": 8.241971015930176, + "learning_rate": 9.957853083262434e-06, + "loss": 0.1928, + "step": 2014 + }, + { + "epoch": 0.05099071285775742, + "grad_norm": 8.651132583618164, + "learning_rate": 9.957801043043246e-06, + "loss": 0.2601, + "step": 2015 + }, + { + "epoch": 0.0510160184224511, + "grad_norm": 3.6790027618408203, + "learning_rate": 9.957748970852118e-06, + "loss": 0.1622, + "step": 2016 + }, + { + "epoch": 0.051041323987144775, + "grad_norm": 6.911820411682129, + "learning_rate": 9.95769686668939e-06, + "loss": 0.1944, + "step": 2017 + }, + { + "epoch": 0.05106662955183845, + "grad_norm": 6.352593898773193, + "learning_rate": 9.957644730555397e-06, + "loss": 0.2172, + "step": 2018 + }, + { + "epoch": 0.05109193511653212, + "grad_norm": 6.2344794273376465, + "learning_rate": 9.957592562450476e-06, + "loss": 0.2364, + "step": 2019 + }, + { + "epoch": 0.0511172406812258, + "grad_norm": 5.816836357116699, + "learning_rate": 9.957540362374962e-06, + "loss": 0.1539, + "step": 2020 + }, + { + "epoch": 0.051142546245919476, + "grad_norm": 7.287336826324463, + "learning_rate": 9.957488130329193e-06, + "loss": 0.248, + "step": 2021 + }, + { + "epoch": 0.05116785181061315, + "grad_norm": 9.690594673156738, + "learning_rate": 9.957435866313506e-06, + "loss": 0.2028, + "step": 2022 + }, + { + "epoch": 0.05119315737530683, + "grad_norm": 5.844912528991699, + "learning_rate": 9.957383570328236e-06, + "loss": 0.1828, + "step": 2023 + }, + { + "epoch": 0.05121846294000051, + "grad_norm": 5.3601861000061035, + "learning_rate": 9.957331242373723e-06, + "loss": 0.2104, + "step": 2024 + }, + { + "epoch": 0.051243768504694184, + "grad_norm": 7.958264350891113, + "learning_rate": 9.9572788824503e-06, + "loss": 0.1663, + "step": 2025 + }, + { + "epoch": 0.05126907406938786, + "grad_norm": 7.065930366516113, + "learning_rate": 9.95722649055831e-06, + "loss": 0.1376, + "step": 2026 + }, + { + "epoch": 0.05129437963408153, + "grad_norm": 5.534247875213623, + "learning_rate": 9.957174066698087e-06, + "loss": 0.2237, + "step": 2027 + }, + { + "epoch": 0.05131968519877521, + "grad_norm": 4.7722697257995605, + "learning_rate": 9.957121610869972e-06, + "loss": 0.2413, + "step": 2028 + }, + { + "epoch": 0.051344990763468885, + "grad_norm": 9.375845909118652, + "learning_rate": 9.9570691230743e-06, + "loss": 0.3248, + "step": 2029 + }, + { + "epoch": 0.05137029632816256, + "grad_norm": 7.391695499420166, + "learning_rate": 9.957016603311414e-06, + "loss": 0.2033, + "step": 2030 + }, + { + "epoch": 0.05139560189285624, + "grad_norm": 8.781757354736328, + "learning_rate": 9.956964051581648e-06, + "loss": 0.2676, + "step": 2031 + }, + { + "epoch": 0.051420907457549916, + "grad_norm": 6.413323402404785, + "learning_rate": 9.956911467885345e-06, + "loss": 0.2076, + "step": 2032 + }, + { + "epoch": 0.05144621302224359, + "grad_norm": 14.036200523376465, + "learning_rate": 9.95685885222284e-06, + "loss": 0.2692, + "step": 2033 + }, + { + "epoch": 0.05147151858693727, + "grad_norm": 6.571110248565674, + "learning_rate": 9.956806204594473e-06, + "loss": 0.163, + "step": 2034 + }, + { + "epoch": 0.05149682415163094, + "grad_norm": 10.50146770477295, + "learning_rate": 9.95675352500059e-06, + "loss": 0.2159, + "step": 2035 + }, + { + "epoch": 0.05152212971632462, + "grad_norm": 13.615203857421875, + "learning_rate": 9.95670081344152e-06, + "loss": 0.2913, + "step": 2036 + }, + { + "epoch": 0.051547435281018295, + "grad_norm": 6.907329559326172, + "learning_rate": 9.956648069917611e-06, + "loss": 0.2642, + "step": 2037 + }, + { + "epoch": 0.05157274084571197, + "grad_norm": 5.8049235343933105, + "learning_rate": 9.9565952944292e-06, + "loss": 0.2571, + "step": 2038 + }, + { + "epoch": 0.05159804641040565, + "grad_norm": 8.69064712524414, + "learning_rate": 9.956542486976628e-06, + "loss": 0.1898, + "step": 2039 + }, + { + "epoch": 0.051623351975099326, + "grad_norm": 6.213898181915283, + "learning_rate": 9.956489647560236e-06, + "loss": 0.2074, + "step": 2040 + }, + { + "epoch": 0.051648657539793, + "grad_norm": 7.493526458740234, + "learning_rate": 9.956436776180364e-06, + "loss": 0.1772, + "step": 2041 + }, + { + "epoch": 0.05167396310448668, + "grad_norm": 7.694423675537109, + "learning_rate": 9.956383872837352e-06, + "loss": 0.202, + "step": 2042 + }, + { + "epoch": 0.05169926866918035, + "grad_norm": 6.133342266082764, + "learning_rate": 9.956330937531543e-06, + "loss": 0.2349, + "step": 2043 + }, + { + "epoch": 0.05172457423387403, + "grad_norm": 3.8175718784332275, + "learning_rate": 9.95627797026328e-06, + "loss": 0.1759, + "step": 2044 + }, + { + "epoch": 0.051749879798567704, + "grad_norm": 8.080842018127441, + "learning_rate": 9.9562249710329e-06, + "loss": 0.1474, + "step": 2045 + }, + { + "epoch": 0.05177518536326138, + "grad_norm": 12.765069007873535, + "learning_rate": 9.956171939840747e-06, + "loss": 0.16, + "step": 2046 + }, + { + "epoch": 0.05180049092795506, + "grad_norm": 15.033445358276367, + "learning_rate": 9.956118876687165e-06, + "loss": 0.3765, + "step": 2047 + }, + { + "epoch": 0.051825796492648735, + "grad_norm": 6.505276679992676, + "learning_rate": 9.956065781572493e-06, + "loss": 0.2023, + "step": 2048 + }, + { + "epoch": 0.05185110205734241, + "grad_norm": 6.5911102294921875, + "learning_rate": 9.956012654497073e-06, + "loss": 0.1729, + "step": 2049 + }, + { + "epoch": 0.05187640762203609, + "grad_norm": 12.865067481994629, + "learning_rate": 9.955959495461252e-06, + "loss": 0.2721, + "step": 2050 + }, + { + "epoch": 0.05190171318672976, + "grad_norm": 10.843307495117188, + "learning_rate": 9.955906304465368e-06, + "loss": 0.2595, + "step": 2051 + }, + { + "epoch": 0.051927018751423436, + "grad_norm": 9.668099403381348, + "learning_rate": 9.955853081509768e-06, + "loss": 0.2287, + "step": 2052 + }, + { + "epoch": 0.05195232431611711, + "grad_norm": 9.808182716369629, + "learning_rate": 9.955799826594792e-06, + "loss": 0.1771, + "step": 2053 + }, + { + "epoch": 0.05197762988081079, + "grad_norm": 10.37990951538086, + "learning_rate": 9.955746539720786e-06, + "loss": 0.2777, + "step": 2054 + }, + { + "epoch": 0.05200293544550447, + "grad_norm": 9.897391319274902, + "learning_rate": 9.955693220888093e-06, + "loss": 0.2481, + "step": 2055 + }, + { + "epoch": 0.052028241010198144, + "grad_norm": 8.076681137084961, + "learning_rate": 9.955639870097053e-06, + "loss": 0.2671, + "step": 2056 + }, + { + "epoch": 0.05205354657489182, + "grad_norm": 5.760256290435791, + "learning_rate": 9.955586487348014e-06, + "loss": 0.1689, + "step": 2057 + }, + { + "epoch": 0.0520788521395855, + "grad_norm": 6.3540730476379395, + "learning_rate": 9.955533072641322e-06, + "loss": 0.1635, + "step": 2058 + }, + { + "epoch": 0.05210415770427917, + "grad_norm": 7.2069478034973145, + "learning_rate": 9.955479625977318e-06, + "loss": 0.2463, + "step": 2059 + }, + { + "epoch": 0.052129463268972845, + "grad_norm": 5.120695114135742, + "learning_rate": 9.955426147356346e-06, + "loss": 0.1647, + "step": 2060 + }, + { + "epoch": 0.05215476883366652, + "grad_norm": 10.630193710327148, + "learning_rate": 9.955372636778754e-06, + "loss": 0.2636, + "step": 2061 + }, + { + "epoch": 0.0521800743983602, + "grad_norm": 13.679975509643555, + "learning_rate": 9.955319094244887e-06, + "loss": 0.2435, + "step": 2062 + }, + { + "epoch": 0.052205379963053876, + "grad_norm": 9.680870056152344, + "learning_rate": 9.955265519755087e-06, + "loss": 0.2503, + "step": 2063 + }, + { + "epoch": 0.05223068552774755, + "grad_norm": 5.143460750579834, + "learning_rate": 9.955211913309702e-06, + "loss": 0.1467, + "step": 2064 + }, + { + "epoch": 0.05225599109244123, + "grad_norm": 9.615275382995605, + "learning_rate": 9.955158274909077e-06, + "loss": 0.2471, + "step": 2065 + }, + { + "epoch": 0.05228129665713491, + "grad_norm": 9.531944274902344, + "learning_rate": 9.955104604553559e-06, + "loss": 0.3161, + "step": 2066 + }, + { + "epoch": 0.05230660222182858, + "grad_norm": 6.753750324249268, + "learning_rate": 9.955050902243492e-06, + "loss": 0.2277, + "step": 2067 + }, + { + "epoch": 0.052331907786522254, + "grad_norm": 18.493040084838867, + "learning_rate": 9.954997167979224e-06, + "loss": 0.1903, + "step": 2068 + }, + { + "epoch": 0.05235721335121593, + "grad_norm": 5.557914733886719, + "learning_rate": 9.9549434017611e-06, + "loss": 0.2434, + "step": 2069 + }, + { + "epoch": 0.05238251891590961, + "grad_norm": 4.7508544921875, + "learning_rate": 9.954889603589468e-06, + "loss": 0.1607, + "step": 2070 + }, + { + "epoch": 0.052407824480603285, + "grad_norm": 14.710649490356445, + "learning_rate": 9.954835773464676e-06, + "loss": 0.3248, + "step": 2071 + }, + { + "epoch": 0.05243313004529696, + "grad_norm": 6.784769535064697, + "learning_rate": 9.954781911387068e-06, + "loss": 0.1933, + "step": 2072 + }, + { + "epoch": 0.05245843560999064, + "grad_norm": 4.171751976013184, + "learning_rate": 9.954728017356994e-06, + "loss": 0.1857, + "step": 2073 + }, + { + "epoch": 0.052483741174684316, + "grad_norm": 6.17579984664917, + "learning_rate": 9.9546740913748e-06, + "loss": 0.2146, + "step": 2074 + }, + { + "epoch": 0.052509046739377986, + "grad_norm": 5.64797306060791, + "learning_rate": 9.954620133440835e-06, + "loss": 0.2237, + "step": 2075 + }, + { + "epoch": 0.05253435230407166, + "grad_norm": 7.401173114776611, + "learning_rate": 9.954566143555447e-06, + "loss": 0.1506, + "step": 2076 + }, + { + "epoch": 0.05255965786876534, + "grad_norm": 5.115462303161621, + "learning_rate": 9.954512121718982e-06, + "loss": 0.1704, + "step": 2077 + }, + { + "epoch": 0.05258496343345902, + "grad_norm": 6.502842426300049, + "learning_rate": 9.954458067931791e-06, + "loss": 0.2069, + "step": 2078 + }, + { + "epoch": 0.052610268998152694, + "grad_norm": 3.3444676399230957, + "learning_rate": 9.95440398219422e-06, + "loss": 0.1608, + "step": 2079 + }, + { + "epoch": 0.05263557456284637, + "grad_norm": 12.01882266998291, + "learning_rate": 9.95434986450662e-06, + "loss": 0.3032, + "step": 2080 + }, + { + "epoch": 0.05266088012754005, + "grad_norm": 9.200342178344727, + "learning_rate": 9.954295714869341e-06, + "loss": 0.2322, + "step": 2081 + }, + { + "epoch": 0.052686185692233725, + "grad_norm": 13.387864112854004, + "learning_rate": 9.954241533282729e-06, + "loss": 0.2157, + "step": 2082 + }, + { + "epoch": 0.052711491256927395, + "grad_norm": 10.3300142288208, + "learning_rate": 9.954187319747136e-06, + "loss": 0.1984, + "step": 2083 + }, + { + "epoch": 0.05273679682162107, + "grad_norm": 10.562846183776855, + "learning_rate": 9.954133074262911e-06, + "loss": 0.3139, + "step": 2084 + }, + { + "epoch": 0.05276210238631475, + "grad_norm": 7.063003063201904, + "learning_rate": 9.9540787968304e-06, + "loss": 0.1772, + "step": 2085 + }, + { + "epoch": 0.052787407951008426, + "grad_norm": 7.339081764221191, + "learning_rate": 9.95402448744996e-06, + "loss": 0.1724, + "step": 2086 + }, + { + "epoch": 0.0528127135157021, + "grad_norm": 7.001015663146973, + "learning_rate": 9.953970146121936e-06, + "loss": 0.2808, + "step": 2087 + }, + { + "epoch": 0.05283801908039578, + "grad_norm": 4.975307941436768, + "learning_rate": 9.95391577284668e-06, + "loss": 0.2364, + "step": 2088 + }, + { + "epoch": 0.05286332464508946, + "grad_norm": 6.507004261016846, + "learning_rate": 9.953861367624542e-06, + "loss": 0.1762, + "step": 2089 + }, + { + "epoch": 0.052888630209783134, + "grad_norm": 6.620232582092285, + "learning_rate": 9.953806930455876e-06, + "loss": 0.2151, + "step": 2090 + }, + { + "epoch": 0.052913935774476804, + "grad_norm": 9.083650588989258, + "learning_rate": 9.953752461341028e-06, + "loss": 0.3362, + "step": 2091 + }, + { + "epoch": 0.05293924133917048, + "grad_norm": 6.145086765289307, + "learning_rate": 9.953697960280352e-06, + "loss": 0.1609, + "step": 2092 + }, + { + "epoch": 0.05296454690386416, + "grad_norm": 7.397143840789795, + "learning_rate": 9.953643427274201e-06, + "loss": 0.2331, + "step": 2093 + }, + { + "epoch": 0.052989852468557835, + "grad_norm": 8.719290733337402, + "learning_rate": 9.953588862322925e-06, + "loss": 0.2769, + "step": 2094 + }, + { + "epoch": 0.05301515803325151, + "grad_norm": 9.066914558410645, + "learning_rate": 9.953534265426875e-06, + "loss": 0.2469, + "step": 2095 + }, + { + "epoch": 0.05304046359794519, + "grad_norm": 6.486575603485107, + "learning_rate": 9.953479636586403e-06, + "loss": 0.2428, + "step": 2096 + }, + { + "epoch": 0.053065769162638866, + "grad_norm": 5.552257061004639, + "learning_rate": 9.953424975801865e-06, + "loss": 0.1257, + "step": 2097 + }, + { + "epoch": 0.05309107472733254, + "grad_norm": 12.977689743041992, + "learning_rate": 9.953370283073606e-06, + "loss": 0.387, + "step": 2098 + }, + { + "epoch": 0.053116380292026213, + "grad_norm": 11.820744514465332, + "learning_rate": 9.953315558401987e-06, + "loss": 0.2783, + "step": 2099 + }, + { + "epoch": 0.05314168585671989, + "grad_norm": 7.507589817047119, + "learning_rate": 9.953260801787357e-06, + "loss": 0.2908, + "step": 2100 + }, + { + "epoch": 0.05316699142141357, + "grad_norm": 7.673902988433838, + "learning_rate": 9.95320601323007e-06, + "loss": 0.1758, + "step": 2101 + }, + { + "epoch": 0.053192296986107244, + "grad_norm": 16.939983367919922, + "learning_rate": 9.953151192730478e-06, + "loss": 0.3551, + "step": 2102 + }, + { + "epoch": 0.05321760255080092, + "grad_norm": 10.2523775100708, + "learning_rate": 9.953096340288934e-06, + "loss": 0.2816, + "step": 2103 + }, + { + "epoch": 0.0532429081154946, + "grad_norm": 8.458643913269043, + "learning_rate": 9.953041455905795e-06, + "loss": 0.1989, + "step": 2104 + }, + { + "epoch": 0.053268213680188276, + "grad_norm": 11.490281105041504, + "learning_rate": 9.952986539581413e-06, + "loss": 0.2588, + "step": 2105 + }, + { + "epoch": 0.05329351924488195, + "grad_norm": 12.30412483215332, + "learning_rate": 9.952931591316142e-06, + "loss": 0.1748, + "step": 2106 + }, + { + "epoch": 0.05331882480957562, + "grad_norm": 8.949138641357422, + "learning_rate": 9.952876611110335e-06, + "loss": 0.1975, + "step": 2107 + }, + { + "epoch": 0.0533441303742693, + "grad_norm": 8.8027982711792, + "learning_rate": 9.952821598964349e-06, + "loss": 0.2312, + "step": 2108 + }, + { + "epoch": 0.05336943593896298, + "grad_norm": 5.548918724060059, + "learning_rate": 9.952766554878537e-06, + "loss": 0.1345, + "step": 2109 + }, + { + "epoch": 0.053394741503656654, + "grad_norm": 6.172746658325195, + "learning_rate": 9.952711478853257e-06, + "loss": 0.2822, + "step": 2110 + }, + { + "epoch": 0.05342004706835033, + "grad_norm": 4.828795433044434, + "learning_rate": 9.95265637088886e-06, + "loss": 0.2199, + "step": 2111 + }, + { + "epoch": 0.05344535263304401, + "grad_norm": 5.002433776855469, + "learning_rate": 9.952601230985703e-06, + "loss": 0.2222, + "step": 2112 + }, + { + "epoch": 0.053470658197737685, + "grad_norm": 8.394224166870117, + "learning_rate": 9.952546059144144e-06, + "loss": 0.1352, + "step": 2113 + }, + { + "epoch": 0.05349596376243136, + "grad_norm": 10.859803199768066, + "learning_rate": 9.952490855364535e-06, + "loss": 0.2426, + "step": 2114 + }, + { + "epoch": 0.05352126932712503, + "grad_norm": 12.402482986450195, + "learning_rate": 9.952435619647234e-06, + "loss": 0.1875, + "step": 2115 + }, + { + "epoch": 0.05354657489181871, + "grad_norm": 17.907066345214844, + "learning_rate": 9.952380351992597e-06, + "loss": 0.3236, + "step": 2116 + }, + { + "epoch": 0.053571880456512386, + "grad_norm": 4.449965953826904, + "learning_rate": 9.95232505240098e-06, + "loss": 0.2129, + "step": 2117 + }, + { + "epoch": 0.05359718602120606, + "grad_norm": 10.32491397857666, + "learning_rate": 9.952269720872741e-06, + "loss": 0.2811, + "step": 2118 + }, + { + "epoch": 0.05362249158589974, + "grad_norm": 7.280035018920898, + "learning_rate": 9.952214357408235e-06, + "loss": 0.2525, + "step": 2119 + }, + { + "epoch": 0.05364779715059342, + "grad_norm": 6.905939102172852, + "learning_rate": 9.95215896200782e-06, + "loss": 0.2542, + "step": 2120 + }, + { + "epoch": 0.053673102715287094, + "grad_norm": 7.518164157867432, + "learning_rate": 9.952103534671852e-06, + "loss": 0.2365, + "step": 2121 + }, + { + "epoch": 0.05369840827998077, + "grad_norm": 12.659868240356445, + "learning_rate": 9.952048075400691e-06, + "loss": 0.2292, + "step": 2122 + }, + { + "epoch": 0.05372371384467444, + "grad_norm": 10.245792388916016, + "learning_rate": 9.951992584194692e-06, + "loss": 0.1957, + "step": 2123 + }, + { + "epoch": 0.05374901940936812, + "grad_norm": 6.008357048034668, + "learning_rate": 9.951937061054216e-06, + "loss": 0.2134, + "step": 2124 + }, + { + "epoch": 0.053774324974061795, + "grad_norm": 6.791352272033691, + "learning_rate": 9.951881505979617e-06, + "loss": 0.1822, + "step": 2125 + }, + { + "epoch": 0.05379963053875547, + "grad_norm": 5.9900031089782715, + "learning_rate": 9.951825918971256e-06, + "loss": 0.1603, + "step": 2126 + }, + { + "epoch": 0.05382493610344915, + "grad_norm": 4.72391939163208, + "learning_rate": 9.95177030002949e-06, + "loss": 0.1371, + "step": 2127 + }, + { + "epoch": 0.053850241668142826, + "grad_norm": 35.786094665527344, + "learning_rate": 9.951714649154678e-06, + "loss": 0.261, + "step": 2128 + }, + { + "epoch": 0.0538755472328365, + "grad_norm": 13.19115161895752, + "learning_rate": 9.95165896634718e-06, + "loss": 0.2581, + "step": 2129 + }, + { + "epoch": 0.05390085279753018, + "grad_norm": 9.744097709655762, + "learning_rate": 9.951603251607355e-06, + "loss": 0.1897, + "step": 2130 + }, + { + "epoch": 0.05392615836222385, + "grad_norm": 9.105267524719238, + "learning_rate": 9.951547504935562e-06, + "loss": 0.2144, + "step": 2131 + }, + { + "epoch": 0.05395146392691753, + "grad_norm": 7.102557182312012, + "learning_rate": 9.95149172633216e-06, + "loss": 0.1988, + "step": 2132 + }, + { + "epoch": 0.053976769491611204, + "grad_norm": 4.052002906799316, + "learning_rate": 9.951435915797508e-06, + "loss": 0.1928, + "step": 2133 + }, + { + "epoch": 0.05400207505630488, + "grad_norm": 7.7304768562316895, + "learning_rate": 9.951380073331968e-06, + "loss": 0.2082, + "step": 2134 + }, + { + "epoch": 0.05402738062099856, + "grad_norm": 17.55780601501465, + "learning_rate": 9.951324198935898e-06, + "loss": 0.2316, + "step": 2135 + }, + { + "epoch": 0.054052686185692235, + "grad_norm": 14.156675338745117, + "learning_rate": 9.95126829260966e-06, + "loss": 0.2627, + "step": 2136 + }, + { + "epoch": 0.05407799175038591, + "grad_norm": 9.239119529724121, + "learning_rate": 9.951212354353613e-06, + "loss": 0.1825, + "step": 2137 + }, + { + "epoch": 0.05410329731507959, + "grad_norm": 13.598347663879395, + "learning_rate": 9.951156384168119e-06, + "loss": 0.2562, + "step": 2138 + }, + { + "epoch": 0.05412860287977326, + "grad_norm": 8.782367706298828, + "learning_rate": 9.951100382053539e-06, + "loss": 0.2663, + "step": 2139 + }, + { + "epoch": 0.054153908444466936, + "grad_norm": 7.305850505828857, + "learning_rate": 9.951044348010231e-06, + "loss": 0.2287, + "step": 2140 + }, + { + "epoch": 0.05417921400916061, + "grad_norm": 5.830178737640381, + "learning_rate": 9.95098828203856e-06, + "loss": 0.207, + "step": 2141 + }, + { + "epoch": 0.05420451957385429, + "grad_norm": 5.597790718078613, + "learning_rate": 9.950932184138889e-06, + "loss": 0.2256, + "step": 2142 + }, + { + "epoch": 0.05422982513854797, + "grad_norm": 4.926078796386719, + "learning_rate": 9.950876054311576e-06, + "loss": 0.1084, + "step": 2143 + }, + { + "epoch": 0.054255130703241644, + "grad_norm": 6.7094831466674805, + "learning_rate": 9.950819892556984e-06, + "loss": 0.2244, + "step": 2144 + }, + { + "epoch": 0.05428043626793532, + "grad_norm": 4.175011157989502, + "learning_rate": 9.950763698875474e-06, + "loss": 0.2025, + "step": 2145 + }, + { + "epoch": 0.054305741832629, + "grad_norm": 25.777772903442383, + "learning_rate": 9.950707473267411e-06, + "loss": 0.2371, + "step": 2146 + }, + { + "epoch": 0.05433104739732267, + "grad_norm": 20.225343704223633, + "learning_rate": 9.950651215733158e-06, + "loss": 0.252, + "step": 2147 + }, + { + "epoch": 0.054356352962016345, + "grad_norm": 4.904486656188965, + "learning_rate": 9.950594926273074e-06, + "loss": 0.1433, + "step": 2148 + }, + { + "epoch": 0.05438165852671002, + "grad_norm": 13.991622924804688, + "learning_rate": 9.950538604887525e-06, + "loss": 0.4124, + "step": 2149 + }, + { + "epoch": 0.0544069640914037, + "grad_norm": 5.890870571136475, + "learning_rate": 9.950482251576874e-06, + "loss": 0.1389, + "step": 2150 + }, + { + "epoch": 0.054432269656097376, + "grad_norm": 9.075501441955566, + "learning_rate": 9.950425866341484e-06, + "loss": 0.339, + "step": 2151 + }, + { + "epoch": 0.05445757522079105, + "grad_norm": 5.918026447296143, + "learning_rate": 9.950369449181717e-06, + "loss": 0.1693, + "step": 2152 + }, + { + "epoch": 0.05448288078548473, + "grad_norm": 7.199707984924316, + "learning_rate": 9.950313000097938e-06, + "loss": 0.2171, + "step": 2153 + }, + { + "epoch": 0.05450818635017841, + "grad_norm": 13.742623329162598, + "learning_rate": 9.950256519090513e-06, + "loss": 0.2881, + "step": 2154 + }, + { + "epoch": 0.05453349191487208, + "grad_norm": 12.53487491607666, + "learning_rate": 9.950200006159803e-06, + "loss": 0.365, + "step": 2155 + }, + { + "epoch": 0.054558797479565754, + "grad_norm": 4.740996837615967, + "learning_rate": 9.950143461306175e-06, + "loss": 0.2217, + "step": 2156 + }, + { + "epoch": 0.05458410304425943, + "grad_norm": 9.578332901000977, + "learning_rate": 9.950086884529993e-06, + "loss": 0.3241, + "step": 2157 + }, + { + "epoch": 0.05460940860895311, + "grad_norm": 7.263790130615234, + "learning_rate": 9.95003027583162e-06, + "loss": 0.2806, + "step": 2158 + }, + { + "epoch": 0.054634714173646785, + "grad_norm": 7.427844047546387, + "learning_rate": 9.949973635211423e-06, + "loss": 0.2261, + "step": 2159 + }, + { + "epoch": 0.05466001973834046, + "grad_norm": 11.471307754516602, + "learning_rate": 9.949916962669768e-06, + "loss": 0.3205, + "step": 2160 + }, + { + "epoch": 0.05468532530303414, + "grad_norm": 4.408820152282715, + "learning_rate": 9.949860258207018e-06, + "loss": 0.2299, + "step": 2161 + }, + { + "epoch": 0.054710630867727816, + "grad_norm": 8.530367851257324, + "learning_rate": 9.94980352182354e-06, + "loss": 0.2084, + "step": 2162 + }, + { + "epoch": 0.054735936432421486, + "grad_norm": 18.100509643554688, + "learning_rate": 9.9497467535197e-06, + "loss": 0.2572, + "step": 2163 + }, + { + "epoch": 0.05476124199711516, + "grad_norm": 6.680471420288086, + "learning_rate": 9.949689953295865e-06, + "loss": 0.2428, + "step": 2164 + }, + { + "epoch": 0.05478654756180884, + "grad_norm": 6.464618682861328, + "learning_rate": 9.949633121152399e-06, + "loss": 0.1905, + "step": 2165 + }, + { + "epoch": 0.05481185312650252, + "grad_norm": 10.261007308959961, + "learning_rate": 9.94957625708967e-06, + "loss": 0.2985, + "step": 2166 + }, + { + "epoch": 0.054837158691196194, + "grad_norm": 11.16152286529541, + "learning_rate": 9.949519361108044e-06, + "loss": 0.3223, + "step": 2167 + }, + { + "epoch": 0.05486246425588987, + "grad_norm": 13.987852096557617, + "learning_rate": 9.949462433207889e-06, + "loss": 0.3571, + "step": 2168 + }, + { + "epoch": 0.05488776982058355, + "grad_norm": 7.316878318786621, + "learning_rate": 9.94940547338957e-06, + "loss": 0.1701, + "step": 2169 + }, + { + "epoch": 0.054913075385277225, + "grad_norm": 7.335807800292969, + "learning_rate": 9.949348481653459e-06, + "loss": 0.2825, + "step": 2170 + }, + { + "epoch": 0.054938380949970896, + "grad_norm": 16.0322265625, + "learning_rate": 9.949291457999917e-06, + "loss": 0.3161, + "step": 2171 + }, + { + "epoch": 0.05496368651466457, + "grad_norm": 5.770012855529785, + "learning_rate": 9.949234402429317e-06, + "loss": 0.2319, + "step": 2172 + }, + { + "epoch": 0.05498899207935825, + "grad_norm": 8.241865158081055, + "learning_rate": 9.949177314942024e-06, + "loss": 0.267, + "step": 2173 + }, + { + "epoch": 0.05501429764405193, + "grad_norm": 5.307694911956787, + "learning_rate": 9.949120195538407e-06, + "loss": 0.1962, + "step": 2174 + }, + { + "epoch": 0.055039603208745604, + "grad_norm": 7.653372764587402, + "learning_rate": 9.949063044218835e-06, + "loss": 0.2288, + "step": 2175 + }, + { + "epoch": 0.05506490877343928, + "grad_norm": 16.679725646972656, + "learning_rate": 9.949005860983676e-06, + "loss": 0.3213, + "step": 2176 + }, + { + "epoch": 0.05509021433813296, + "grad_norm": 8.420411109924316, + "learning_rate": 9.948948645833298e-06, + "loss": 0.196, + "step": 2177 + }, + { + "epoch": 0.055115519902826635, + "grad_norm": 9.547136306762695, + "learning_rate": 9.948891398768071e-06, + "loss": 0.2373, + "step": 2178 + }, + { + "epoch": 0.055140825467520305, + "grad_norm": 7.197350978851318, + "learning_rate": 9.948834119788365e-06, + "loss": 0.2126, + "step": 2179 + }, + { + "epoch": 0.05516613103221398, + "grad_norm": 9.89563274383545, + "learning_rate": 9.948776808894547e-06, + "loss": 0.2075, + "step": 2180 + }, + { + "epoch": 0.05519143659690766, + "grad_norm": 7.118024826049805, + "learning_rate": 9.94871946608699e-06, + "loss": 0.2252, + "step": 2181 + }, + { + "epoch": 0.055216742161601336, + "grad_norm": 17.359495162963867, + "learning_rate": 9.948662091366059e-06, + "loss": 0.2629, + "step": 2182 + }, + { + "epoch": 0.05524204772629501, + "grad_norm": 11.248611450195312, + "learning_rate": 9.948604684732128e-06, + "loss": 0.3053, + "step": 2183 + }, + { + "epoch": 0.05526735329098869, + "grad_norm": 6.546998023986816, + "learning_rate": 9.948547246185564e-06, + "loss": 0.1666, + "step": 2184 + }, + { + "epoch": 0.05529265885568237, + "grad_norm": 6.779186725616455, + "learning_rate": 9.948489775726742e-06, + "loss": 0.1657, + "step": 2185 + }, + { + "epoch": 0.055317964420376044, + "grad_norm": 6.244149684906006, + "learning_rate": 9.948432273356028e-06, + "loss": 0.2078, + "step": 2186 + }, + { + "epoch": 0.055343269985069714, + "grad_norm": 4.935951232910156, + "learning_rate": 9.948374739073797e-06, + "loss": 0.1252, + "step": 2187 + }, + { + "epoch": 0.05536857554976339, + "grad_norm": 7.053433418273926, + "learning_rate": 9.948317172880418e-06, + "loss": 0.219, + "step": 2188 + }, + { + "epoch": 0.05539388111445707, + "grad_norm": 7.933578014373779, + "learning_rate": 9.948259574776258e-06, + "loss": 0.2132, + "step": 2189 + }, + { + "epoch": 0.055419186679150745, + "grad_norm": 5.8940229415893555, + "learning_rate": 9.948201944761696e-06, + "loss": 0.1868, + "step": 2190 + }, + { + "epoch": 0.05544449224384442, + "grad_norm": 8.507752418518066, + "learning_rate": 9.948144282837099e-06, + "loss": 0.2538, + "step": 2191 + }, + { + "epoch": 0.0554697978085381, + "grad_norm": 5.562230587005615, + "learning_rate": 9.94808658900284e-06, + "loss": 0.1976, + "step": 2192 + }, + { + "epoch": 0.055495103373231776, + "grad_norm": 6.468676567077637, + "learning_rate": 9.948028863259292e-06, + "loss": 0.2519, + "step": 2193 + }, + { + "epoch": 0.05552040893792545, + "grad_norm": 8.371431350708008, + "learning_rate": 9.947971105606824e-06, + "loss": 0.2068, + "step": 2194 + }, + { + "epoch": 0.05554571450261912, + "grad_norm": 8.053062438964844, + "learning_rate": 9.947913316045811e-06, + "loss": 0.2081, + "step": 2195 + }, + { + "epoch": 0.0555710200673128, + "grad_norm": 5.427274703979492, + "learning_rate": 9.947855494576628e-06, + "loss": 0.1512, + "step": 2196 + }, + { + "epoch": 0.05559632563200648, + "grad_norm": 5.12043571472168, + "learning_rate": 9.947797641199643e-06, + "loss": 0.1898, + "step": 2197 + }, + { + "epoch": 0.055621631196700154, + "grad_norm": 12.480367660522461, + "learning_rate": 9.947739755915233e-06, + "loss": 0.1612, + "step": 2198 + }, + { + "epoch": 0.05564693676139383, + "grad_norm": 10.524066925048828, + "learning_rate": 9.947681838723769e-06, + "loss": 0.271, + "step": 2199 + }, + { + "epoch": 0.05567224232608751, + "grad_norm": 3.8627965450286865, + "learning_rate": 9.947623889625624e-06, + "loss": 0.2102, + "step": 2200 + }, + { + "epoch": 0.055697547890781185, + "grad_norm": 9.059045791625977, + "learning_rate": 9.947565908621174e-06, + "loss": 0.247, + "step": 2201 + }, + { + "epoch": 0.05572285345547486, + "grad_norm": 7.620603561401367, + "learning_rate": 9.947507895710792e-06, + "loss": 0.285, + "step": 2202 + }, + { + "epoch": 0.05574815902016853, + "grad_norm": 7.435662746429443, + "learning_rate": 9.947449850894851e-06, + "loss": 0.27, + "step": 2203 + }, + { + "epoch": 0.05577346458486221, + "grad_norm": 8.000380516052246, + "learning_rate": 9.947391774173727e-06, + "loss": 0.2281, + "step": 2204 + }, + { + "epoch": 0.055798770149555886, + "grad_norm": 5.6945881843566895, + "learning_rate": 9.947333665547794e-06, + "loss": 0.3014, + "step": 2205 + }, + { + "epoch": 0.05582407571424956, + "grad_norm": 6.936382293701172, + "learning_rate": 9.947275525017427e-06, + "loss": 0.2072, + "step": 2206 + }, + { + "epoch": 0.05584938127894324, + "grad_norm": 5.034168243408203, + "learning_rate": 9.947217352582997e-06, + "loss": 0.1318, + "step": 2207 + }, + { + "epoch": 0.05587468684363692, + "grad_norm": 10.588862419128418, + "learning_rate": 9.947159148244886e-06, + "loss": 0.4528, + "step": 2208 + }, + { + "epoch": 0.055899992408330594, + "grad_norm": 8.211835861206055, + "learning_rate": 9.947100912003464e-06, + "loss": 0.2585, + "step": 2209 + }, + { + "epoch": 0.05592529797302427, + "grad_norm": 10.556849479675293, + "learning_rate": 9.947042643859108e-06, + "loss": 0.3003, + "step": 2210 + }, + { + "epoch": 0.05595060353771794, + "grad_norm": 10.421022415161133, + "learning_rate": 9.946984343812196e-06, + "loss": 0.2308, + "step": 2211 + }, + { + "epoch": 0.05597590910241162, + "grad_norm": 6.763217449188232, + "learning_rate": 9.946926011863102e-06, + "loss": 0.1789, + "step": 2212 + }, + { + "epoch": 0.056001214667105295, + "grad_norm": 5.313122749328613, + "learning_rate": 9.946867648012201e-06, + "loss": 0.2302, + "step": 2213 + }, + { + "epoch": 0.05602652023179897, + "grad_norm": 8.8001708984375, + "learning_rate": 9.946809252259872e-06, + "loss": 0.2472, + "step": 2214 + }, + { + "epoch": 0.05605182579649265, + "grad_norm": 11.724607467651367, + "learning_rate": 9.94675082460649e-06, + "loss": 0.2997, + "step": 2215 + }, + { + "epoch": 0.056077131361186326, + "grad_norm": 9.337080001831055, + "learning_rate": 9.946692365052432e-06, + "loss": 0.2816, + "step": 2216 + }, + { + "epoch": 0.05610243692588, + "grad_norm": 6.021917343139648, + "learning_rate": 9.946633873598075e-06, + "loss": 0.1982, + "step": 2217 + }, + { + "epoch": 0.05612774249057368, + "grad_norm": 5.3098931312561035, + "learning_rate": 9.946575350243796e-06, + "loss": 0.1558, + "step": 2218 + }, + { + "epoch": 0.05615304805526735, + "grad_norm": 8.08497142791748, + "learning_rate": 9.946516794989973e-06, + "loss": 0.1319, + "step": 2219 + }, + { + "epoch": 0.05617835361996103, + "grad_norm": 8.426389694213867, + "learning_rate": 9.946458207836985e-06, + "loss": 0.2093, + "step": 2220 + }, + { + "epoch": 0.056203659184654704, + "grad_norm": 7.11768102645874, + "learning_rate": 9.946399588785205e-06, + "loss": 0.2262, + "step": 2221 + }, + { + "epoch": 0.05622896474934838, + "grad_norm": 9.962178230285645, + "learning_rate": 9.946340937835016e-06, + "loss": 0.2716, + "step": 2222 + }, + { + "epoch": 0.05625427031404206, + "grad_norm": 7.373167037963867, + "learning_rate": 9.946282254986794e-06, + "loss": 0.1948, + "step": 2223 + }, + { + "epoch": 0.056279575878735735, + "grad_norm": 6.416654586791992, + "learning_rate": 9.946223540240918e-06, + "loss": 0.3291, + "step": 2224 + }, + { + "epoch": 0.05630488144342941, + "grad_norm": 8.170580863952637, + "learning_rate": 9.946164793597767e-06, + "loss": 0.238, + "step": 2225 + }, + { + "epoch": 0.05633018700812309, + "grad_norm": 6.678152561187744, + "learning_rate": 9.946106015057718e-06, + "loss": 0.1732, + "step": 2226 + }, + { + "epoch": 0.05635549257281676, + "grad_norm": 10.236824035644531, + "learning_rate": 9.946047204621152e-06, + "loss": 0.255, + "step": 2227 + }, + { + "epoch": 0.056380798137510436, + "grad_norm": 6.554372787475586, + "learning_rate": 9.945988362288448e-06, + "loss": 0.287, + "step": 2228 + }, + { + "epoch": 0.05640610370220411, + "grad_norm": 7.468888759613037, + "learning_rate": 9.945929488059985e-06, + "loss": 0.2216, + "step": 2229 + }, + { + "epoch": 0.05643140926689779, + "grad_norm": 4.075716495513916, + "learning_rate": 9.945870581936141e-06, + "loss": 0.1602, + "step": 2230 + }, + { + "epoch": 0.05645671483159147, + "grad_norm": 10.16793441772461, + "learning_rate": 9.9458116439173e-06, + "loss": 0.1845, + "step": 2231 + }, + { + "epoch": 0.056482020396285144, + "grad_norm": 6.367650985717773, + "learning_rate": 9.94575267400384e-06, + "loss": 0.1927, + "step": 2232 + }, + { + "epoch": 0.05650732596097882, + "grad_norm": 7.427491664886475, + "learning_rate": 9.945693672196138e-06, + "loss": 0.2926, + "step": 2233 + }, + { + "epoch": 0.0565326315256725, + "grad_norm": 7.680475234985352, + "learning_rate": 9.945634638494579e-06, + "loss": 0.2047, + "step": 2234 + }, + { + "epoch": 0.05655793709036617, + "grad_norm": 4.54924201965332, + "learning_rate": 9.945575572899545e-06, + "loss": 0.1505, + "step": 2235 + }, + { + "epoch": 0.056583242655059846, + "grad_norm": 11.418436050415039, + "learning_rate": 9.94551647541141e-06, + "loss": 0.1368, + "step": 2236 + }, + { + "epoch": 0.05660854821975352, + "grad_norm": 6.974081039428711, + "learning_rate": 9.945457346030561e-06, + "loss": 0.1933, + "step": 2237 + }, + { + "epoch": 0.0566338537844472, + "grad_norm": 6.973704814910889, + "learning_rate": 9.945398184757377e-06, + "loss": 0.1859, + "step": 2238 + }, + { + "epoch": 0.05665915934914088, + "grad_norm": 14.299307823181152, + "learning_rate": 9.945338991592239e-06, + "loss": 0.2052, + "step": 2239 + }, + { + "epoch": 0.056684464913834554, + "grad_norm": 15.245627403259277, + "learning_rate": 9.945279766535532e-06, + "loss": 0.3599, + "step": 2240 + }, + { + "epoch": 0.05670977047852823, + "grad_norm": 8.545418739318848, + "learning_rate": 9.945220509587634e-06, + "loss": 0.2542, + "step": 2241 + }, + { + "epoch": 0.05673507604322191, + "grad_norm": 3.435776948928833, + "learning_rate": 9.945161220748928e-06, + "loss": 0.0906, + "step": 2242 + }, + { + "epoch": 0.05676038160791558, + "grad_norm": 21.244422912597656, + "learning_rate": 9.9451019000198e-06, + "loss": 0.225, + "step": 2243 + }, + { + "epoch": 0.056785687172609255, + "grad_norm": 27.557767868041992, + "learning_rate": 9.945042547400628e-06, + "loss": 0.387, + "step": 2244 + }, + { + "epoch": 0.05681099273730293, + "grad_norm": 12.60043716430664, + "learning_rate": 9.944983162891797e-06, + "loss": 0.2099, + "step": 2245 + }, + { + "epoch": 0.05683629830199661, + "grad_norm": 15.62846851348877, + "learning_rate": 9.944923746493689e-06, + "loss": 0.3081, + "step": 2246 + }, + { + "epoch": 0.056861603866690286, + "grad_norm": 15.5197172164917, + "learning_rate": 9.944864298206689e-06, + "loss": 0.4644, + "step": 2247 + }, + { + "epoch": 0.05688690943138396, + "grad_norm": 8.97188949584961, + "learning_rate": 9.944804818031177e-06, + "loss": 0.2713, + "step": 2248 + }, + { + "epoch": 0.05691221499607764, + "grad_norm": 7.716209888458252, + "learning_rate": 9.94474530596754e-06, + "loss": 0.2463, + "step": 2249 + }, + { + "epoch": 0.05693752056077132, + "grad_norm": 10.315032005310059, + "learning_rate": 9.94468576201616e-06, + "loss": 0.2524, + "step": 2250 + }, + { + "epoch": 0.05696282612546499, + "grad_norm": 9.485343933105469, + "learning_rate": 9.944626186177419e-06, + "loss": 0.2361, + "step": 2251 + }, + { + "epoch": 0.056988131690158664, + "grad_norm": 7.905725002288818, + "learning_rate": 9.944566578451707e-06, + "loss": 0.2277, + "step": 2252 + }, + { + "epoch": 0.05701343725485234, + "grad_norm": 8.337803840637207, + "learning_rate": 9.944506938839402e-06, + "loss": 0.2476, + "step": 2253 + }, + { + "epoch": 0.05703874281954602, + "grad_norm": 10.980154037475586, + "learning_rate": 9.944447267340894e-06, + "loss": 0.3517, + "step": 2254 + }, + { + "epoch": 0.057064048384239695, + "grad_norm": 5.347143650054932, + "learning_rate": 9.944387563956563e-06, + "loss": 0.2441, + "step": 2255 + }, + { + "epoch": 0.05708935394893337, + "grad_norm": 8.004653930664062, + "learning_rate": 9.944327828686798e-06, + "loss": 0.2563, + "step": 2256 + }, + { + "epoch": 0.05711465951362705, + "grad_norm": 7.762207984924316, + "learning_rate": 9.944268061531981e-06, + "loss": 0.2498, + "step": 2257 + }, + { + "epoch": 0.057139965078320726, + "grad_norm": 7.431103229522705, + "learning_rate": 9.9442082624925e-06, + "loss": 0.2725, + "step": 2258 + }, + { + "epoch": 0.057165270643014396, + "grad_norm": 6.632376670837402, + "learning_rate": 9.94414843156874e-06, + "loss": 0.2034, + "step": 2259 + }, + { + "epoch": 0.05719057620770807, + "grad_norm": 15.380049705505371, + "learning_rate": 9.944088568761085e-06, + "loss": 0.2476, + "step": 2260 + }, + { + "epoch": 0.05721588177240175, + "grad_norm": 5.059668064117432, + "learning_rate": 9.944028674069924e-06, + "loss": 0.165, + "step": 2261 + }, + { + "epoch": 0.05724118733709543, + "grad_norm": 10.149492263793945, + "learning_rate": 9.943968747495642e-06, + "loss": 0.2783, + "step": 2262 + }, + { + "epoch": 0.057266492901789104, + "grad_norm": 5.640282154083252, + "learning_rate": 9.943908789038622e-06, + "loss": 0.2423, + "step": 2263 + }, + { + "epoch": 0.05729179846648278, + "grad_norm": 8.196479797363281, + "learning_rate": 9.943848798699258e-06, + "loss": 0.2793, + "step": 2264 + }, + { + "epoch": 0.05731710403117646, + "grad_norm": 10.187972068786621, + "learning_rate": 9.943788776477932e-06, + "loss": 0.2393, + "step": 2265 + }, + { + "epoch": 0.057342409595870135, + "grad_norm": 4.685329437255859, + "learning_rate": 9.94372872237503e-06, + "loss": 0.2363, + "step": 2266 + }, + { + "epoch": 0.057367715160563805, + "grad_norm": 6.998672962188721, + "learning_rate": 9.943668636390942e-06, + "loss": 0.1838, + "step": 2267 + }, + { + "epoch": 0.05739302072525748, + "grad_norm": 6.83812952041626, + "learning_rate": 9.943608518526053e-06, + "loss": 0.152, + "step": 2268 + }, + { + "epoch": 0.05741832628995116, + "grad_norm": 4.570624828338623, + "learning_rate": 9.943548368780754e-06, + "loss": 0.163, + "step": 2269 + }, + { + "epoch": 0.057443631854644836, + "grad_norm": 17.718473434448242, + "learning_rate": 9.94348818715543e-06, + "loss": 0.3868, + "step": 2270 + }, + { + "epoch": 0.05746893741933851, + "grad_norm": 8.482808113098145, + "learning_rate": 9.943427973650472e-06, + "loss": 0.2376, + "step": 2271 + }, + { + "epoch": 0.05749424298403219, + "grad_norm": 5.738348007202148, + "learning_rate": 9.943367728266267e-06, + "loss": 0.1869, + "step": 2272 + }, + { + "epoch": 0.05751954854872587, + "grad_norm": 4.8374409675598145, + "learning_rate": 9.943307451003202e-06, + "loss": 0.2115, + "step": 2273 + }, + { + "epoch": 0.057544854113419544, + "grad_norm": 3.9024906158447266, + "learning_rate": 9.943247141861664e-06, + "loss": 0.2149, + "step": 2274 + }, + { + "epoch": 0.057570159678113214, + "grad_norm": 13.374670028686523, + "learning_rate": 9.943186800842048e-06, + "loss": 0.2296, + "step": 2275 + }, + { + "epoch": 0.05759546524280689, + "grad_norm": 5.163764953613281, + "learning_rate": 9.943126427944739e-06, + "loss": 0.2639, + "step": 2276 + }, + { + "epoch": 0.05762077080750057, + "grad_norm": 4.470413684844971, + "learning_rate": 9.943066023170127e-06, + "loss": 0.192, + "step": 2277 + }, + { + "epoch": 0.057646076372194245, + "grad_norm": 10.232083320617676, + "learning_rate": 9.943005586518601e-06, + "loss": 0.2263, + "step": 2278 + }, + { + "epoch": 0.05767138193688792, + "grad_norm": 10.102434158325195, + "learning_rate": 9.942945117990554e-06, + "loss": 0.2565, + "step": 2279 + }, + { + "epoch": 0.0576966875015816, + "grad_norm": 4.700654029846191, + "learning_rate": 9.942884617586371e-06, + "loss": 0.1178, + "step": 2280 + }, + { + "epoch": 0.057721993066275276, + "grad_norm": 12.438298225402832, + "learning_rate": 9.942824085306445e-06, + "loss": 0.2876, + "step": 2281 + }, + { + "epoch": 0.05774729863096895, + "grad_norm": 16.328283309936523, + "learning_rate": 9.942763521151166e-06, + "loss": 0.3377, + "step": 2282 + }, + { + "epoch": 0.05777260419566262, + "grad_norm": 10.133891105651855, + "learning_rate": 9.942702925120924e-06, + "loss": 0.1132, + "step": 2283 + }, + { + "epoch": 0.0577979097603563, + "grad_norm": 5.177793979644775, + "learning_rate": 9.942642297216111e-06, + "loss": 0.1352, + "step": 2284 + }, + { + "epoch": 0.05782321532504998, + "grad_norm": 9.822596549987793, + "learning_rate": 9.942581637437118e-06, + "loss": 0.2071, + "step": 2285 + }, + { + "epoch": 0.057848520889743654, + "grad_norm": 5.581164836883545, + "learning_rate": 9.942520945784332e-06, + "loss": 0.1881, + "step": 2286 + }, + { + "epoch": 0.05787382645443733, + "grad_norm": 7.442304611206055, + "learning_rate": 9.942460222258153e-06, + "loss": 0.2244, + "step": 2287 + }, + { + "epoch": 0.05789913201913101, + "grad_norm": 8.624571800231934, + "learning_rate": 9.942399466858964e-06, + "loss": 0.2067, + "step": 2288 + }, + { + "epoch": 0.057924437583824685, + "grad_norm": 15.297745704650879, + "learning_rate": 9.94233867958716e-06, + "loss": 0.2955, + "step": 2289 + }, + { + "epoch": 0.05794974314851836, + "grad_norm": 6.046626567840576, + "learning_rate": 9.942277860443134e-06, + "loss": 0.2136, + "step": 2290 + }, + { + "epoch": 0.05797504871321203, + "grad_norm": 10.81985092163086, + "learning_rate": 9.942217009427278e-06, + "loss": 0.2964, + "step": 2291 + }, + { + "epoch": 0.05800035427790571, + "grad_norm": 6.765377998352051, + "learning_rate": 9.942156126539984e-06, + "loss": 0.227, + "step": 2292 + }, + { + "epoch": 0.058025659842599386, + "grad_norm": 9.81373405456543, + "learning_rate": 9.942095211781643e-06, + "loss": 0.2225, + "step": 2293 + }, + { + "epoch": 0.05805096540729306, + "grad_norm": 9.286498069763184, + "learning_rate": 9.94203426515265e-06, + "loss": 0.2483, + "step": 2294 + }, + { + "epoch": 0.05807627097198674, + "grad_norm": 10.655824661254883, + "learning_rate": 9.941973286653397e-06, + "loss": 0.2909, + "step": 2295 + }, + { + "epoch": 0.05810157653668042, + "grad_norm": 7.865334987640381, + "learning_rate": 9.941912276284278e-06, + "loss": 0.1982, + "step": 2296 + }, + { + "epoch": 0.058126882101374094, + "grad_norm": 6.100888252258301, + "learning_rate": 9.941851234045686e-06, + "loss": 0.1306, + "step": 2297 + }, + { + "epoch": 0.05815218766606777, + "grad_norm": 14.91405963897705, + "learning_rate": 9.941790159938014e-06, + "loss": 0.3244, + "step": 2298 + }, + { + "epoch": 0.05817749323076144, + "grad_norm": 10.278356552124023, + "learning_rate": 9.941729053961658e-06, + "loss": 0.248, + "step": 2299 + }, + { + "epoch": 0.05820279879545512, + "grad_norm": 5.453709602355957, + "learning_rate": 9.94166791611701e-06, + "loss": 0.2622, + "step": 2300 + }, + { + "epoch": 0.058228104360148795, + "grad_norm": 5.308578014373779, + "learning_rate": 9.941606746404464e-06, + "loss": 0.2448, + "step": 2301 + }, + { + "epoch": 0.05825340992484247, + "grad_norm": 5.970634937286377, + "learning_rate": 9.941545544824415e-06, + "loss": 0.1628, + "step": 2302 + }, + { + "epoch": 0.05827871548953615, + "grad_norm": 14.253464698791504, + "learning_rate": 9.941484311377259e-06, + "loss": 0.2781, + "step": 2303 + }, + { + "epoch": 0.058304021054229827, + "grad_norm": 4.63014030456543, + "learning_rate": 9.94142304606339e-06, + "loss": 0.2067, + "step": 2304 + }, + { + "epoch": 0.058329326618923504, + "grad_norm": 8.228860855102539, + "learning_rate": 9.941361748883202e-06, + "loss": 0.286, + "step": 2305 + }, + { + "epoch": 0.05835463218361718, + "grad_norm": 3.4636473655700684, + "learning_rate": 9.941300419837092e-06, + "loss": 0.1781, + "step": 2306 + }, + { + "epoch": 0.05837993774831085, + "grad_norm": 9.675287246704102, + "learning_rate": 9.941239058925454e-06, + "loss": 0.3317, + "step": 2307 + }, + { + "epoch": 0.05840524331300453, + "grad_norm": 9.424491882324219, + "learning_rate": 9.941177666148685e-06, + "loss": 0.3114, + "step": 2308 + }, + { + "epoch": 0.058430548877698205, + "grad_norm": 10.15584659576416, + "learning_rate": 9.94111624150718e-06, + "loss": 0.2572, + "step": 2309 + }, + { + "epoch": 0.05845585444239188, + "grad_norm": 8.841038703918457, + "learning_rate": 9.941054785001336e-06, + "loss": 0.3077, + "step": 2310 + }, + { + "epoch": 0.05848116000708556, + "grad_norm": 8.063253402709961, + "learning_rate": 9.940993296631549e-06, + "loss": 0.2516, + "step": 2311 + }, + { + "epoch": 0.058506465571779236, + "grad_norm": 7.107074737548828, + "learning_rate": 9.940931776398216e-06, + "loss": 0.2775, + "step": 2312 + }, + { + "epoch": 0.05853177113647291, + "grad_norm": 8.661887168884277, + "learning_rate": 9.94087022430173e-06, + "loss": 0.3044, + "step": 2313 + }, + { + "epoch": 0.05855707670116659, + "grad_norm": 7.251862525939941, + "learning_rate": 9.940808640342495e-06, + "loss": 0.2037, + "step": 2314 + }, + { + "epoch": 0.05858238226586026, + "grad_norm": 4.04741096496582, + "learning_rate": 9.9407470245209e-06, + "loss": 0.182, + "step": 2315 + }, + { + "epoch": 0.05860768783055394, + "grad_norm": 3.669374942779541, + "learning_rate": 9.940685376837349e-06, + "loss": 0.1864, + "step": 2316 + }, + { + "epoch": 0.058632993395247614, + "grad_norm": 8.611734390258789, + "learning_rate": 9.940623697292237e-06, + "loss": 0.2573, + "step": 2317 + }, + { + "epoch": 0.05865829895994129, + "grad_norm": 6.8846435546875, + "learning_rate": 9.940561985885962e-06, + "loss": 0.2989, + "step": 2318 + }, + { + "epoch": 0.05868360452463497, + "grad_norm": 4.57615852355957, + "learning_rate": 9.94050024261892e-06, + "loss": 0.1773, + "step": 2319 + }, + { + "epoch": 0.058708910089328645, + "grad_norm": 7.847965717315674, + "learning_rate": 9.940438467491513e-06, + "loss": 0.2002, + "step": 2320 + }, + { + "epoch": 0.05873421565402232, + "grad_norm": 5.408724784851074, + "learning_rate": 9.940376660504136e-06, + "loss": 0.1752, + "step": 2321 + }, + { + "epoch": 0.058759521218716, + "grad_norm": 5.482754707336426, + "learning_rate": 9.940314821657189e-06, + "loss": 0.2026, + "step": 2322 + }, + { + "epoch": 0.05878482678340967, + "grad_norm": 4.3258466720581055, + "learning_rate": 9.940252950951072e-06, + "loss": 0.1647, + "step": 2323 + }, + { + "epoch": 0.058810132348103346, + "grad_norm": 10.292285919189453, + "learning_rate": 9.94019104838618e-06, + "loss": 0.1586, + "step": 2324 + }, + { + "epoch": 0.05883543791279702, + "grad_norm": 6.984470367431641, + "learning_rate": 9.940129113962917e-06, + "loss": 0.1866, + "step": 2325 + }, + { + "epoch": 0.0588607434774907, + "grad_norm": 6.396296977996826, + "learning_rate": 9.94006714768168e-06, + "loss": 0.2203, + "step": 2326 + }, + { + "epoch": 0.05888604904218438, + "grad_norm": 14.062958717346191, + "learning_rate": 9.940005149542869e-06, + "loss": 0.2653, + "step": 2327 + }, + { + "epoch": 0.058911354606878054, + "grad_norm": 6.231757640838623, + "learning_rate": 9.939943119546885e-06, + "loss": 0.1577, + "step": 2328 + }, + { + "epoch": 0.05893666017157173, + "grad_norm": 12.660615921020508, + "learning_rate": 9.939881057694124e-06, + "loss": 0.2604, + "step": 2329 + }, + { + "epoch": 0.05896196573626541, + "grad_norm": 5.384793758392334, + "learning_rate": 9.93981896398499e-06, + "loss": 0.1813, + "step": 2330 + }, + { + "epoch": 0.05898727130095908, + "grad_norm": 7.309342861175537, + "learning_rate": 9.939756838419883e-06, + "loss": 0.2329, + "step": 2331 + }, + { + "epoch": 0.059012576865652755, + "grad_norm": 6.225468635559082, + "learning_rate": 9.939694680999202e-06, + "loss": 0.2202, + "step": 2332 + }, + { + "epoch": 0.05903788243034643, + "grad_norm": 7.619234561920166, + "learning_rate": 9.93963249172335e-06, + "loss": 0.1462, + "step": 2333 + }, + { + "epoch": 0.05906318799504011, + "grad_norm": 12.900358200073242, + "learning_rate": 9.939570270592725e-06, + "loss": 0.184, + "step": 2334 + }, + { + "epoch": 0.059088493559733786, + "grad_norm": 11.094266891479492, + "learning_rate": 9.939508017607732e-06, + "loss": 0.2712, + "step": 2335 + }, + { + "epoch": 0.05911379912442746, + "grad_norm": 14.313629150390625, + "learning_rate": 9.93944573276877e-06, + "loss": 0.315, + "step": 2336 + }, + { + "epoch": 0.05913910468912114, + "grad_norm": 8.6257963180542, + "learning_rate": 9.939383416076241e-06, + "loss": 0.2146, + "step": 2337 + }, + { + "epoch": 0.05916441025381482, + "grad_norm": 8.488743782043457, + "learning_rate": 9.939321067530548e-06, + "loss": 0.2149, + "step": 2338 + }, + { + "epoch": 0.05918971581850849, + "grad_norm": 6.7063093185424805, + "learning_rate": 9.939258687132092e-06, + "loss": 0.2415, + "step": 2339 + }, + { + "epoch": 0.059215021383202164, + "grad_norm": 6.903604984283447, + "learning_rate": 9.939196274881275e-06, + "loss": 0.2507, + "step": 2340 + }, + { + "epoch": 0.05924032694789584, + "grad_norm": 16.65725326538086, + "learning_rate": 9.9391338307785e-06, + "loss": 0.4649, + "step": 2341 + }, + { + "epoch": 0.05926563251258952, + "grad_norm": 6.404348850250244, + "learning_rate": 9.93907135482417e-06, + "loss": 0.2085, + "step": 2342 + }, + { + "epoch": 0.059290938077283195, + "grad_norm": 5.591773509979248, + "learning_rate": 9.939008847018688e-06, + "loss": 0.2068, + "step": 2343 + }, + { + "epoch": 0.05931624364197687, + "grad_norm": 8.623209953308105, + "learning_rate": 9.938946307362456e-06, + "loss": 0.2111, + "step": 2344 + }, + { + "epoch": 0.05934154920667055, + "grad_norm": 3.9211246967315674, + "learning_rate": 9.938883735855877e-06, + "loss": 0.1316, + "step": 2345 + }, + { + "epoch": 0.059366854771364226, + "grad_norm": 13.185630798339844, + "learning_rate": 9.938821132499356e-06, + "loss": 0.3501, + "step": 2346 + }, + { + "epoch": 0.059392160336057896, + "grad_norm": 7.218297481536865, + "learning_rate": 9.938758497293296e-06, + "loss": 0.2687, + "step": 2347 + }, + { + "epoch": 0.05941746590075157, + "grad_norm": 4.928699493408203, + "learning_rate": 9.938695830238103e-06, + "loss": 0.1924, + "step": 2348 + }, + { + "epoch": 0.05944277146544525, + "grad_norm": 6.5506134033203125, + "learning_rate": 9.938633131334178e-06, + "loss": 0.1796, + "step": 2349 + }, + { + "epoch": 0.05946807703013893, + "grad_norm": 6.08005952835083, + "learning_rate": 9.938570400581928e-06, + "loss": 0.2596, + "step": 2350 + }, + { + "epoch": 0.059493382594832604, + "grad_norm": 23.400745391845703, + "learning_rate": 9.938507637981754e-06, + "loss": 0.2101, + "step": 2351 + }, + { + "epoch": 0.05951868815952628, + "grad_norm": 9.713645935058594, + "learning_rate": 9.938444843534064e-06, + "loss": 0.2777, + "step": 2352 + }, + { + "epoch": 0.05954399372421996, + "grad_norm": 8.181090354919434, + "learning_rate": 9.93838201723926e-06, + "loss": 0.207, + "step": 2353 + }, + { + "epoch": 0.059569299288913635, + "grad_norm": 10.339616775512695, + "learning_rate": 9.938319159097753e-06, + "loss": 0.2737, + "step": 2354 + }, + { + "epoch": 0.059594604853607305, + "grad_norm": 12.728743553161621, + "learning_rate": 9.93825626910994e-06, + "loss": 0.2371, + "step": 2355 + }, + { + "epoch": 0.05961991041830098, + "grad_norm": 5.513946533203125, + "learning_rate": 9.938193347276234e-06, + "loss": 0.2244, + "step": 2356 + }, + { + "epoch": 0.05964521598299466, + "grad_norm": 6.388964653015137, + "learning_rate": 9.938130393597036e-06, + "loss": 0.2124, + "step": 2357 + }, + { + "epoch": 0.059670521547688336, + "grad_norm": 6.041751384735107, + "learning_rate": 9.938067408072754e-06, + "loss": 0.1662, + "step": 2358 + }, + { + "epoch": 0.05969582711238201, + "grad_norm": 7.345395088195801, + "learning_rate": 9.938004390703794e-06, + "loss": 0.2039, + "step": 2359 + }, + { + "epoch": 0.05972113267707569, + "grad_norm": 10.91663932800293, + "learning_rate": 9.937941341490563e-06, + "loss": 0.1771, + "step": 2360 + }, + { + "epoch": 0.05974643824176937, + "grad_norm": 9.331997871398926, + "learning_rate": 9.937878260433466e-06, + "loss": 0.2371, + "step": 2361 + }, + { + "epoch": 0.059771743806463044, + "grad_norm": 6.376991271972656, + "learning_rate": 9.93781514753291e-06, + "loss": 0.1712, + "step": 2362 + }, + { + "epoch": 0.059797049371156714, + "grad_norm": 23.106586456298828, + "learning_rate": 9.937752002789302e-06, + "loss": 0.3509, + "step": 2363 + }, + { + "epoch": 0.05982235493585039, + "grad_norm": 9.661709785461426, + "learning_rate": 9.937688826203052e-06, + "loss": 0.3131, + "step": 2364 + }, + { + "epoch": 0.05984766050054407, + "grad_norm": 17.737884521484375, + "learning_rate": 9.937625617774564e-06, + "loss": 0.3568, + "step": 2365 + }, + { + "epoch": 0.059872966065237745, + "grad_norm": 6.514814376831055, + "learning_rate": 9.937562377504249e-06, + "loss": 0.2229, + "step": 2366 + }, + { + "epoch": 0.05989827162993142, + "grad_norm": 7.033853530883789, + "learning_rate": 9.93749910539251e-06, + "loss": 0.279, + "step": 2367 + }, + { + "epoch": 0.0599235771946251, + "grad_norm": 8.003997802734375, + "learning_rate": 9.937435801439759e-06, + "loss": 0.2354, + "step": 2368 + }, + { + "epoch": 0.059948882759318776, + "grad_norm": 8.54647445678711, + "learning_rate": 9.937372465646402e-06, + "loss": 0.1861, + "step": 2369 + }, + { + "epoch": 0.059974188324012453, + "grad_norm": 8.612569808959961, + "learning_rate": 9.937309098012849e-06, + "loss": 0.3141, + "step": 2370 + }, + { + "epoch": 0.059999493888706124, + "grad_norm": 20.146177291870117, + "learning_rate": 9.937245698539508e-06, + "loss": 0.3527, + "step": 2371 + }, + { + "epoch": 0.0600247994533998, + "grad_norm": 6.170963764190674, + "learning_rate": 9.937182267226788e-06, + "loss": 0.2063, + "step": 2372 + }, + { + "epoch": 0.06005010501809348, + "grad_norm": 6.7019500732421875, + "learning_rate": 9.937118804075096e-06, + "loss": 0.2241, + "step": 2373 + }, + { + "epoch": 0.060075410582787155, + "grad_norm": 6.40119743347168, + "learning_rate": 9.937055309084847e-06, + "loss": 0.2711, + "step": 2374 + }, + { + "epoch": 0.06010071614748083, + "grad_norm": 8.772682189941406, + "learning_rate": 9.936991782256445e-06, + "loss": 0.264, + "step": 2375 + }, + { + "epoch": 0.06012602171217451, + "grad_norm": 5.264277935028076, + "learning_rate": 9.936928223590299e-06, + "loss": 0.2137, + "step": 2376 + }, + { + "epoch": 0.060151327276868186, + "grad_norm": 6.823627471923828, + "learning_rate": 9.936864633086824e-06, + "loss": 0.2331, + "step": 2377 + }, + { + "epoch": 0.06017663284156186, + "grad_norm": 3.9697372913360596, + "learning_rate": 9.936801010746426e-06, + "loss": 0.1764, + "step": 2378 + }, + { + "epoch": 0.06020193840625553, + "grad_norm": 6.2183027267456055, + "learning_rate": 9.936737356569517e-06, + "loss": 0.2236, + "step": 2379 + }, + { + "epoch": 0.06022724397094921, + "grad_norm": 8.982417106628418, + "learning_rate": 9.936673670556508e-06, + "loss": 0.2642, + "step": 2380 + }, + { + "epoch": 0.06025254953564289, + "grad_norm": 5.139775276184082, + "learning_rate": 9.936609952707807e-06, + "loss": 0.2068, + "step": 2381 + }, + { + "epoch": 0.060277855100336564, + "grad_norm": 12.342982292175293, + "learning_rate": 9.93654620302383e-06, + "loss": 0.2329, + "step": 2382 + }, + { + "epoch": 0.06030316066503024, + "grad_norm": 6.521796226501465, + "learning_rate": 9.936482421504982e-06, + "loss": 0.2145, + "step": 2383 + }, + { + "epoch": 0.06032846622972392, + "grad_norm": 6.367041110992432, + "learning_rate": 9.936418608151677e-06, + "loss": 0.2178, + "step": 2384 + }, + { + "epoch": 0.060353771794417595, + "grad_norm": 10.269803047180176, + "learning_rate": 9.936354762964327e-06, + "loss": 0.3053, + "step": 2385 + }, + { + "epoch": 0.06037907735911127, + "grad_norm": 7.838802814483643, + "learning_rate": 9.936290885943343e-06, + "loss": 0.2208, + "step": 2386 + }, + { + "epoch": 0.06040438292380494, + "grad_norm": 7.6585283279418945, + "learning_rate": 9.936226977089138e-06, + "loss": 0.2443, + "step": 2387 + }, + { + "epoch": 0.06042968848849862, + "grad_norm": 4.126601696014404, + "learning_rate": 9.936163036402124e-06, + "loss": 0.2077, + "step": 2388 + }, + { + "epoch": 0.060454994053192296, + "grad_norm": 6.23936128616333, + "learning_rate": 9.936099063882712e-06, + "loss": 0.1843, + "step": 2389 + }, + { + "epoch": 0.06048029961788597, + "grad_norm": 13.680896759033203, + "learning_rate": 9.936035059531315e-06, + "loss": 0.3011, + "step": 2390 + }, + { + "epoch": 0.06050560518257965, + "grad_norm": 5.143655300140381, + "learning_rate": 9.935971023348348e-06, + "loss": 0.2267, + "step": 2391 + }, + { + "epoch": 0.06053091074727333, + "grad_norm": 3.2633626461029053, + "learning_rate": 9.93590695533422e-06, + "loss": 0.1536, + "step": 2392 + }, + { + "epoch": 0.060556216311967004, + "grad_norm": 14.547119140625, + "learning_rate": 9.935842855489345e-06, + "loss": 0.329, + "step": 2393 + }, + { + "epoch": 0.06058152187666068, + "grad_norm": 16.794235229492188, + "learning_rate": 9.935778723814142e-06, + "loss": 0.3374, + "step": 2394 + }, + { + "epoch": 0.06060682744135435, + "grad_norm": 5.270005702972412, + "learning_rate": 9.935714560309017e-06, + "loss": 0.1646, + "step": 2395 + }, + { + "epoch": 0.06063213300604803, + "grad_norm": 4.7424845695495605, + "learning_rate": 9.935650364974388e-06, + "loss": 0.152, + "step": 2396 + }, + { + "epoch": 0.060657438570741705, + "grad_norm": 3.2040328979492188, + "learning_rate": 9.935586137810667e-06, + "loss": 0.2014, + "step": 2397 + }, + { + "epoch": 0.06068274413543538, + "grad_norm": 14.766387939453125, + "learning_rate": 9.935521878818268e-06, + "loss": 0.3038, + "step": 2398 + }, + { + "epoch": 0.06070804970012906, + "grad_norm": 9.790237426757812, + "learning_rate": 9.935457587997608e-06, + "loss": 0.1609, + "step": 2399 + }, + { + "epoch": 0.060733355264822736, + "grad_norm": 4.469689846038818, + "learning_rate": 9.9353932653491e-06, + "loss": 0.118, + "step": 2400 + }, + { + "epoch": 0.06075866082951641, + "grad_norm": 3.7694332599639893, + "learning_rate": 9.93532891087316e-06, + "loss": 0.2017, + "step": 2401 + }, + { + "epoch": 0.06078396639421009, + "grad_norm": 8.70443058013916, + "learning_rate": 9.9352645245702e-06, + "loss": 0.3009, + "step": 2402 + }, + { + "epoch": 0.06080927195890376, + "grad_norm": 6.801049709320068, + "learning_rate": 9.935200106440638e-06, + "loss": 0.1823, + "step": 2403 + }, + { + "epoch": 0.06083457752359744, + "grad_norm": 9.299168586730957, + "learning_rate": 9.935135656484888e-06, + "loss": 0.1897, + "step": 2404 + }, + { + "epoch": 0.060859883088291114, + "grad_norm": 4.054830551147461, + "learning_rate": 9.935071174703366e-06, + "loss": 0.1939, + "step": 2405 + }, + { + "epoch": 0.06088518865298479, + "grad_norm": 18.192657470703125, + "learning_rate": 9.93500666109649e-06, + "loss": 0.4417, + "step": 2406 + }, + { + "epoch": 0.06091049421767847, + "grad_norm": 6.636457443237305, + "learning_rate": 9.934942115664672e-06, + "loss": 0.1658, + "step": 2407 + }, + { + "epoch": 0.060935799782372145, + "grad_norm": 5.90308952331543, + "learning_rate": 9.93487753840833e-06, + "loss": 0.1997, + "step": 2408 + }, + { + "epoch": 0.06096110534706582, + "grad_norm": 7.72686767578125, + "learning_rate": 9.934812929327881e-06, + "loss": 0.2593, + "step": 2409 + }, + { + "epoch": 0.0609864109117595, + "grad_norm": 7.173154354095459, + "learning_rate": 9.934748288423744e-06, + "loss": 0.2483, + "step": 2410 + }, + { + "epoch": 0.06101171647645317, + "grad_norm": 6.427846908569336, + "learning_rate": 9.93468361569633e-06, + "loss": 0.2201, + "step": 2411 + }, + { + "epoch": 0.061037022041146846, + "grad_norm": 10.472811698913574, + "learning_rate": 9.93461891114606e-06, + "loss": 0.2739, + "step": 2412 + }, + { + "epoch": 0.06106232760584052, + "grad_norm": 4.4444756507873535, + "learning_rate": 9.93455417477335e-06, + "loss": 0.1287, + "step": 2413 + }, + { + "epoch": 0.0610876331705342, + "grad_norm": 7.572386741638184, + "learning_rate": 9.93448940657862e-06, + "loss": 0.2495, + "step": 2414 + }, + { + "epoch": 0.06111293873522788, + "grad_norm": 4.734987258911133, + "learning_rate": 9.934424606562286e-06, + "loss": 0.1713, + "step": 2415 + }, + { + "epoch": 0.061138244299921554, + "grad_norm": 25.301971435546875, + "learning_rate": 9.934359774724762e-06, + "loss": 0.3786, + "step": 2416 + }, + { + "epoch": 0.06116354986461523, + "grad_norm": 9.200904846191406, + "learning_rate": 9.934294911066474e-06, + "loss": 0.1244, + "step": 2417 + }, + { + "epoch": 0.06118885542930891, + "grad_norm": 7.671876430511475, + "learning_rate": 9.934230015587833e-06, + "loss": 0.2609, + "step": 2418 + }, + { + "epoch": 0.06121416099400258, + "grad_norm": 6.6788330078125, + "learning_rate": 9.934165088289261e-06, + "loss": 0.2522, + "step": 2419 + }, + { + "epoch": 0.061239466558696255, + "grad_norm": 5.637416362762451, + "learning_rate": 9.934100129171176e-06, + "loss": 0.2436, + "step": 2420 + }, + { + "epoch": 0.06126477212338993, + "grad_norm": 5.169113636016846, + "learning_rate": 9.934035138233997e-06, + "loss": 0.1774, + "step": 2421 + }, + { + "epoch": 0.06129007768808361, + "grad_norm": 7.391045570373535, + "learning_rate": 9.933970115478144e-06, + "loss": 0.2604, + "step": 2422 + }, + { + "epoch": 0.061315383252777286, + "grad_norm": 8.819815635681152, + "learning_rate": 9.933905060904037e-06, + "loss": 0.1996, + "step": 2423 + }, + { + "epoch": 0.06134068881747096, + "grad_norm": 4.011885643005371, + "learning_rate": 9.933839974512092e-06, + "loss": 0.1324, + "step": 2424 + }, + { + "epoch": 0.06136599438216464, + "grad_norm": 10.384468078613281, + "learning_rate": 9.93377485630273e-06, + "loss": 0.2685, + "step": 2425 + }, + { + "epoch": 0.06139129994685832, + "grad_norm": 6.24183988571167, + "learning_rate": 9.933709706276374e-06, + "loss": 0.2059, + "step": 2426 + }, + { + "epoch": 0.06141660551155199, + "grad_norm": 13.780216217041016, + "learning_rate": 9.93364452443344e-06, + "loss": 0.2701, + "step": 2427 + }, + { + "epoch": 0.061441911076245664, + "grad_norm": 3.7650442123413086, + "learning_rate": 9.93357931077435e-06, + "loss": 0.212, + "step": 2428 + }, + { + "epoch": 0.06146721664093934, + "grad_norm": 5.068334579467773, + "learning_rate": 9.933514065299527e-06, + "loss": 0.2206, + "step": 2429 + }, + { + "epoch": 0.06149252220563302, + "grad_norm": 8.801453590393066, + "learning_rate": 9.933448788009388e-06, + "loss": 0.2613, + "step": 2430 + }, + { + "epoch": 0.061517827770326695, + "grad_norm": 9.278834342956543, + "learning_rate": 9.933383478904355e-06, + "loss": 0.2548, + "step": 2431 + }, + { + "epoch": 0.06154313333502037, + "grad_norm": 6.710339069366455, + "learning_rate": 9.933318137984851e-06, + "loss": 0.2068, + "step": 2432 + }, + { + "epoch": 0.06156843889971405, + "grad_norm": 5.726295471191406, + "learning_rate": 9.933252765251295e-06, + "loss": 0.2202, + "step": 2433 + }, + { + "epoch": 0.061593744464407726, + "grad_norm": 9.150347709655762, + "learning_rate": 9.933187360704108e-06, + "loss": 0.1705, + "step": 2434 + }, + { + "epoch": 0.061619050029101397, + "grad_norm": 8.657722473144531, + "learning_rate": 9.933121924343717e-06, + "loss": 0.2799, + "step": 2435 + }, + { + "epoch": 0.061644355593795074, + "grad_norm": 7.503466606140137, + "learning_rate": 9.933056456170538e-06, + "loss": 0.2021, + "step": 2436 + }, + { + "epoch": 0.06166966115848875, + "grad_norm": 5.150439739227295, + "learning_rate": 9.932990956184996e-06, + "loss": 0.2187, + "step": 2437 + }, + { + "epoch": 0.06169496672318243, + "grad_norm": 5.541157245635986, + "learning_rate": 9.932925424387511e-06, + "loss": 0.2594, + "step": 2438 + }, + { + "epoch": 0.061720272287876105, + "grad_norm": 11.49275016784668, + "learning_rate": 9.93285986077851e-06, + "loss": 0.2738, + "step": 2439 + }, + { + "epoch": 0.06174557785256978, + "grad_norm": 8.485278129577637, + "learning_rate": 9.932794265358413e-06, + "loss": 0.2662, + "step": 2440 + }, + { + "epoch": 0.06177088341726346, + "grad_norm": 7.6844916343688965, + "learning_rate": 9.932728638127645e-06, + "loss": 0.2833, + "step": 2441 + }, + { + "epoch": 0.061796188981957136, + "grad_norm": 6.737525939941406, + "learning_rate": 9.932662979086625e-06, + "loss": 0.2224, + "step": 2442 + }, + { + "epoch": 0.061821494546650806, + "grad_norm": 3.5974576473236084, + "learning_rate": 9.932597288235778e-06, + "loss": 0.1323, + "step": 2443 + }, + { + "epoch": 0.06184680011134448, + "grad_norm": 4.918073654174805, + "learning_rate": 9.932531565575532e-06, + "loss": 0.237, + "step": 2444 + }, + { + "epoch": 0.06187210567603816, + "grad_norm": 6.363321781158447, + "learning_rate": 9.932465811106305e-06, + "loss": 0.1922, + "step": 2445 + }, + { + "epoch": 0.06189741124073184, + "grad_norm": 9.701301574707031, + "learning_rate": 9.932400024828524e-06, + "loss": 0.3283, + "step": 2446 + }, + { + "epoch": 0.061922716805425514, + "grad_norm": 14.799116134643555, + "learning_rate": 9.932334206742614e-06, + "loss": 0.1779, + "step": 2447 + }, + { + "epoch": 0.06194802237011919, + "grad_norm": 5.0107293128967285, + "learning_rate": 9.932268356848996e-06, + "loss": 0.2387, + "step": 2448 + }, + { + "epoch": 0.06197332793481287, + "grad_norm": 5.789350986480713, + "learning_rate": 9.932202475148099e-06, + "loss": 0.1982, + "step": 2449 + }, + { + "epoch": 0.061998633499506545, + "grad_norm": 17.982669830322266, + "learning_rate": 9.932136561640343e-06, + "loss": 0.3351, + "step": 2450 + }, + { + "epoch": 0.062023939064200215, + "grad_norm": 6.840789794921875, + "learning_rate": 9.932070616326159e-06, + "loss": 0.2316, + "step": 2451 + }, + { + "epoch": 0.06204924462889389, + "grad_norm": 5.090085029602051, + "learning_rate": 9.932004639205968e-06, + "loss": 0.1694, + "step": 2452 + }, + { + "epoch": 0.06207455019358757, + "grad_norm": 6.495306968688965, + "learning_rate": 9.931938630280195e-06, + "loss": 0.1298, + "step": 2453 + }, + { + "epoch": 0.062099855758281246, + "grad_norm": 8.11221694946289, + "learning_rate": 9.931872589549269e-06, + "loss": 0.2903, + "step": 2454 + }, + { + "epoch": 0.06212516132297492, + "grad_norm": 5.708250999450684, + "learning_rate": 9.931806517013612e-06, + "loss": 0.2708, + "step": 2455 + }, + { + "epoch": 0.0621504668876686, + "grad_norm": 5.995260715484619, + "learning_rate": 9.931740412673654e-06, + "loss": 0.1723, + "step": 2456 + }, + { + "epoch": 0.06217577245236228, + "grad_norm": 8.873872756958008, + "learning_rate": 9.931674276529819e-06, + "loss": 0.2777, + "step": 2457 + }, + { + "epoch": 0.062201078017055954, + "grad_norm": 6.790765762329102, + "learning_rate": 9.931608108582533e-06, + "loss": 0.2162, + "step": 2458 + }, + { + "epoch": 0.062226383581749624, + "grad_norm": 9.12885570526123, + "learning_rate": 9.931541908832226e-06, + "loss": 0.226, + "step": 2459 + }, + { + "epoch": 0.0622516891464433, + "grad_norm": 5.194972991943359, + "learning_rate": 9.931475677279321e-06, + "loss": 0.2006, + "step": 2460 + }, + { + "epoch": 0.06227699471113698, + "grad_norm": 12.14786148071289, + "learning_rate": 9.931409413924246e-06, + "loss": 0.2026, + "step": 2461 + }, + { + "epoch": 0.062302300275830655, + "grad_norm": 6.657279968261719, + "learning_rate": 9.93134311876743e-06, + "loss": 0.093, + "step": 2462 + }, + { + "epoch": 0.06232760584052433, + "grad_norm": 7.270766258239746, + "learning_rate": 9.9312767918093e-06, + "loss": 0.2628, + "step": 2463 + }, + { + "epoch": 0.06235291140521801, + "grad_norm": 5.133802890777588, + "learning_rate": 9.931210433050282e-06, + "loss": 0.1982, + "step": 2464 + }, + { + "epoch": 0.062378216969911686, + "grad_norm": 8.381193161010742, + "learning_rate": 9.931144042490806e-06, + "loss": 0.2451, + "step": 2465 + }, + { + "epoch": 0.06240352253460536, + "grad_norm": 7.010033130645752, + "learning_rate": 9.931077620131297e-06, + "loss": 0.1559, + "step": 2466 + }, + { + "epoch": 0.06242882809929903, + "grad_norm": 21.54863739013672, + "learning_rate": 9.931011165972188e-06, + "loss": 0.1861, + "step": 2467 + }, + { + "epoch": 0.06245413366399271, + "grad_norm": 10.446121215820312, + "learning_rate": 9.930944680013905e-06, + "loss": 0.2397, + "step": 2468 + }, + { + "epoch": 0.06247943922868639, + "grad_norm": 9.255364418029785, + "learning_rate": 9.930878162256879e-06, + "loss": 0.1728, + "step": 2469 + }, + { + "epoch": 0.06250474479338007, + "grad_norm": 6.26198673248291, + "learning_rate": 9.930811612701535e-06, + "loss": 0.1867, + "step": 2470 + }, + { + "epoch": 0.06253005035807374, + "grad_norm": 11.667094230651855, + "learning_rate": 9.930745031348303e-06, + "loss": 0.2925, + "step": 2471 + }, + { + "epoch": 0.06255535592276741, + "grad_norm": 11.589701652526855, + "learning_rate": 9.930678418197614e-06, + "loss": 0.3275, + "step": 2472 + }, + { + "epoch": 0.0625806614874611, + "grad_norm": 13.07981014251709, + "learning_rate": 9.930611773249899e-06, + "loss": 0.3368, + "step": 2473 + }, + { + "epoch": 0.06260596705215477, + "grad_norm": 21.616300582885742, + "learning_rate": 9.930545096505583e-06, + "loss": 0.2641, + "step": 2474 + }, + { + "epoch": 0.06263127261684845, + "grad_norm": 5.299165725708008, + "learning_rate": 9.9304783879651e-06, + "loss": 0.1852, + "step": 2475 + }, + { + "epoch": 0.06265657818154212, + "grad_norm": 7.805286884307861, + "learning_rate": 9.93041164762888e-06, + "loss": 0.2118, + "step": 2476 + }, + { + "epoch": 0.0626818837462358, + "grad_norm": 8.61569595336914, + "learning_rate": 9.930344875497351e-06, + "loss": 0.2996, + "step": 2477 + }, + { + "epoch": 0.06270718931092947, + "grad_norm": 6.3313984870910645, + "learning_rate": 9.930278071570945e-06, + "loss": 0.265, + "step": 2478 + }, + { + "epoch": 0.06273249487562314, + "grad_norm": 9.066610336303711, + "learning_rate": 9.930211235850094e-06, + "loss": 0.2379, + "step": 2479 + }, + { + "epoch": 0.06275780044031683, + "grad_norm": 4.9967041015625, + "learning_rate": 9.930144368335227e-06, + "loss": 0.214, + "step": 2480 + }, + { + "epoch": 0.0627831060050105, + "grad_norm": 7.269866466522217, + "learning_rate": 9.930077469026778e-06, + "loss": 0.2309, + "step": 2481 + }, + { + "epoch": 0.06280841156970418, + "grad_norm": 15.060328483581543, + "learning_rate": 9.930010537925174e-06, + "loss": 0.2789, + "step": 2482 + }, + { + "epoch": 0.06283371713439785, + "grad_norm": 6.630094051361084, + "learning_rate": 9.92994357503085e-06, + "loss": 0.1731, + "step": 2483 + }, + { + "epoch": 0.06285902269909154, + "grad_norm": 7.42113733291626, + "learning_rate": 9.929876580344236e-06, + "loss": 0.1507, + "step": 2484 + }, + { + "epoch": 0.0628843282637852, + "grad_norm": 8.63486385345459, + "learning_rate": 9.929809553865766e-06, + "loss": 0.2003, + "step": 2485 + }, + { + "epoch": 0.06290963382847889, + "grad_norm": 7.09483003616333, + "learning_rate": 9.929742495595871e-06, + "loss": 0.2624, + "step": 2486 + }, + { + "epoch": 0.06293493939317256, + "grad_norm": 10.569793701171875, + "learning_rate": 9.929675405534985e-06, + "loss": 0.3272, + "step": 2487 + }, + { + "epoch": 0.06296024495786623, + "grad_norm": 6.0569939613342285, + "learning_rate": 9.929608283683536e-06, + "loss": 0.2286, + "step": 2488 + }, + { + "epoch": 0.06298555052255991, + "grad_norm": 11.522741317749023, + "learning_rate": 9.929541130041962e-06, + "loss": 0.3162, + "step": 2489 + }, + { + "epoch": 0.06301085608725358, + "grad_norm": 7.961105823516846, + "learning_rate": 9.929473944610695e-06, + "loss": 0.1708, + "step": 2490 + }, + { + "epoch": 0.06303616165194727, + "grad_norm": 5.815308094024658, + "learning_rate": 9.929406727390167e-06, + "loss": 0.2056, + "step": 2491 + }, + { + "epoch": 0.06306146721664094, + "grad_norm": 8.258007049560547, + "learning_rate": 9.929339478380812e-06, + "loss": 0.2614, + "step": 2492 + }, + { + "epoch": 0.06308677278133462, + "grad_norm": 11.024388313293457, + "learning_rate": 9.929272197583063e-06, + "loss": 0.2703, + "step": 2493 + }, + { + "epoch": 0.06311207834602829, + "grad_norm": 9.392477035522461, + "learning_rate": 9.929204884997354e-06, + "loss": 0.2618, + "step": 2494 + }, + { + "epoch": 0.06313738391072196, + "grad_norm": 8.39858341217041, + "learning_rate": 9.92913754062412e-06, + "loss": 0.2804, + "step": 2495 + }, + { + "epoch": 0.06316268947541565, + "grad_norm": 6.188216686248779, + "learning_rate": 9.929070164463797e-06, + "loss": 0.1972, + "step": 2496 + }, + { + "epoch": 0.06318799504010932, + "grad_norm": 17.82537078857422, + "learning_rate": 9.929002756516814e-06, + "loss": 0.3472, + "step": 2497 + }, + { + "epoch": 0.063213300604803, + "grad_norm": 7.005037307739258, + "learning_rate": 9.928935316783612e-06, + "loss": 0.2042, + "step": 2498 + }, + { + "epoch": 0.06323860616949667, + "grad_norm": 8.084304809570312, + "learning_rate": 9.928867845264622e-06, + "loss": 0.2427, + "step": 2499 + }, + { + "epoch": 0.06326391173419035, + "grad_norm": 4.690130710601807, + "learning_rate": 9.92880034196028e-06, + "loss": 0.1963, + "step": 2500 + }, + { + "epoch": 0.06328921729888402, + "grad_norm": 8.677437782287598, + "learning_rate": 9.928732806871022e-06, + "loss": 0.2752, + "step": 2501 + }, + { + "epoch": 0.06331452286357771, + "grad_norm": 7.066489219665527, + "learning_rate": 9.928665239997281e-06, + "loss": 0.2199, + "step": 2502 + }, + { + "epoch": 0.06333982842827138, + "grad_norm": 8.470564842224121, + "learning_rate": 9.928597641339497e-06, + "loss": 0.2129, + "step": 2503 + }, + { + "epoch": 0.06336513399296505, + "grad_norm": 7.849959373474121, + "learning_rate": 9.928530010898102e-06, + "loss": 0.2276, + "step": 2504 + }, + { + "epoch": 0.06339043955765873, + "grad_norm": 7.956232070922852, + "learning_rate": 9.928462348673534e-06, + "loss": 0.2702, + "step": 2505 + }, + { + "epoch": 0.0634157451223524, + "grad_norm": 5.051803112030029, + "learning_rate": 9.928394654666229e-06, + "loss": 0.1693, + "step": 2506 + }, + { + "epoch": 0.06344105068704609, + "grad_norm": 5.046599388122559, + "learning_rate": 9.928326928876625e-06, + "loss": 0.2301, + "step": 2507 + }, + { + "epoch": 0.06346635625173976, + "grad_norm": 7.848165035247803, + "learning_rate": 9.928259171305157e-06, + "loss": 0.2359, + "step": 2508 + }, + { + "epoch": 0.06349166181643344, + "grad_norm": 9.427996635437012, + "learning_rate": 9.928191381952262e-06, + "loss": 0.2554, + "step": 2509 + }, + { + "epoch": 0.06351696738112711, + "grad_norm": 5.912941932678223, + "learning_rate": 9.928123560818377e-06, + "loss": 0.1369, + "step": 2510 + }, + { + "epoch": 0.06354227294582078, + "grad_norm": 6.911438941955566, + "learning_rate": 9.92805570790394e-06, + "loss": 0.1825, + "step": 2511 + }, + { + "epoch": 0.06356757851051446, + "grad_norm": 2.8764519691467285, + "learning_rate": 9.927987823209388e-06, + "loss": 0.1536, + "step": 2512 + }, + { + "epoch": 0.06359288407520813, + "grad_norm": 4.879718780517578, + "learning_rate": 9.927919906735159e-06, + "loss": 0.2129, + "step": 2513 + }, + { + "epoch": 0.06361818963990182, + "grad_norm": 4.587505340576172, + "learning_rate": 9.927851958481693e-06, + "loss": 0.1565, + "step": 2514 + }, + { + "epoch": 0.06364349520459549, + "grad_norm": 4.593412399291992, + "learning_rate": 9.927783978449425e-06, + "loss": 0.1633, + "step": 2515 + }, + { + "epoch": 0.06366880076928917, + "grad_norm": 5.687400817871094, + "learning_rate": 9.927715966638794e-06, + "loss": 0.1817, + "step": 2516 + }, + { + "epoch": 0.06369410633398284, + "grad_norm": 14.469887733459473, + "learning_rate": 9.92764792305024e-06, + "loss": 0.2633, + "step": 2517 + }, + { + "epoch": 0.06371941189867653, + "grad_norm": 8.563151359558105, + "learning_rate": 9.927579847684203e-06, + "loss": 0.2444, + "step": 2518 + }, + { + "epoch": 0.0637447174633702, + "grad_norm": 4.348944187164307, + "learning_rate": 9.927511740541119e-06, + "loss": 0.172, + "step": 2519 + }, + { + "epoch": 0.06377002302806387, + "grad_norm": 11.147071838378906, + "learning_rate": 9.927443601621428e-06, + "loss": 0.3271, + "step": 2520 + }, + { + "epoch": 0.06379532859275755, + "grad_norm": 16.540983200073242, + "learning_rate": 9.92737543092557e-06, + "loss": 0.347, + "step": 2521 + }, + { + "epoch": 0.06382063415745122, + "grad_norm": 9.07381820678711, + "learning_rate": 9.927307228453984e-06, + "loss": 0.2347, + "step": 2522 + }, + { + "epoch": 0.0638459397221449, + "grad_norm": 5.477041244506836, + "learning_rate": 9.927238994207108e-06, + "loss": 0.2206, + "step": 2523 + }, + { + "epoch": 0.06387124528683857, + "grad_norm": 21.725906372070312, + "learning_rate": 9.927170728185389e-06, + "loss": 0.3235, + "step": 2524 + }, + { + "epoch": 0.06389655085153226, + "grad_norm": 5.274666786193848, + "learning_rate": 9.927102430389259e-06, + "loss": 0.2278, + "step": 2525 + }, + { + "epoch": 0.06392185641622593, + "grad_norm": 6.43250036239624, + "learning_rate": 9.927034100819163e-06, + "loss": 0.2347, + "step": 2526 + }, + { + "epoch": 0.0639471619809196, + "grad_norm": 5.830924034118652, + "learning_rate": 9.926965739475542e-06, + "loss": 0.1878, + "step": 2527 + }, + { + "epoch": 0.06397246754561328, + "grad_norm": 12.625624656677246, + "learning_rate": 9.926897346358835e-06, + "loss": 0.3067, + "step": 2528 + }, + { + "epoch": 0.06399777311030695, + "grad_norm": 10.65054988861084, + "learning_rate": 9.926828921469482e-06, + "loss": 0.4006, + "step": 2529 + }, + { + "epoch": 0.06402307867500064, + "grad_norm": 7.635730743408203, + "learning_rate": 9.926760464807926e-06, + "loss": 0.2202, + "step": 2530 + }, + { + "epoch": 0.0640483842396943, + "grad_norm": 14.14200496673584, + "learning_rate": 9.926691976374609e-06, + "loss": 0.2013, + "step": 2531 + }, + { + "epoch": 0.06407368980438799, + "grad_norm": 6.078211784362793, + "learning_rate": 9.926623456169972e-06, + "loss": 0.2291, + "step": 2532 + }, + { + "epoch": 0.06409899536908166, + "grad_norm": 6.497138023376465, + "learning_rate": 9.926554904194456e-06, + "loss": 0.2007, + "step": 2533 + }, + { + "epoch": 0.06412430093377534, + "grad_norm": 4.953855514526367, + "learning_rate": 9.926486320448502e-06, + "loss": 0.2168, + "step": 2534 + }, + { + "epoch": 0.06414960649846901, + "grad_norm": 6.008237838745117, + "learning_rate": 9.926417704932559e-06, + "loss": 0.1553, + "step": 2535 + }, + { + "epoch": 0.06417491206316268, + "grad_norm": 6.272756576538086, + "learning_rate": 9.926349057647061e-06, + "loss": 0.1493, + "step": 2536 + }, + { + "epoch": 0.06420021762785637, + "grad_norm": 11.31812572479248, + "learning_rate": 9.926280378592454e-06, + "loss": 0.199, + "step": 2537 + }, + { + "epoch": 0.06422552319255004, + "grad_norm": 7.46641206741333, + "learning_rate": 9.926211667769183e-06, + "loss": 0.2624, + "step": 2538 + }, + { + "epoch": 0.06425082875724372, + "grad_norm": 4.460893630981445, + "learning_rate": 9.92614292517769e-06, + "loss": 0.0749, + "step": 2539 + }, + { + "epoch": 0.06427613432193739, + "grad_norm": 8.037023544311523, + "learning_rate": 9.926074150818414e-06, + "loss": 0.1783, + "step": 2540 + }, + { + "epoch": 0.06430143988663108, + "grad_norm": 6.027623176574707, + "learning_rate": 9.926005344691804e-06, + "loss": 0.2595, + "step": 2541 + }, + { + "epoch": 0.06432674545132475, + "grad_norm": 5.370815277099609, + "learning_rate": 9.925936506798304e-06, + "loss": 0.2332, + "step": 2542 + }, + { + "epoch": 0.06435205101601842, + "grad_norm": 4.066298484802246, + "learning_rate": 9.925867637138353e-06, + "loss": 0.1371, + "step": 2543 + }, + { + "epoch": 0.0643773565807121, + "grad_norm": 4.213862419128418, + "learning_rate": 9.9257987357124e-06, + "loss": 0.1852, + "step": 2544 + }, + { + "epoch": 0.06440266214540577, + "grad_norm": 8.363616943359375, + "learning_rate": 9.925729802520885e-06, + "loss": 0.301, + "step": 2545 + }, + { + "epoch": 0.06442796771009945, + "grad_norm": 9.452573776245117, + "learning_rate": 9.925660837564257e-06, + "loss": 0.2434, + "step": 2546 + }, + { + "epoch": 0.06445327327479312, + "grad_norm": 7.6615166664123535, + "learning_rate": 9.925591840842957e-06, + "loss": 0.2289, + "step": 2547 + }, + { + "epoch": 0.06447857883948681, + "grad_norm": 7.876495838165283, + "learning_rate": 9.925522812357431e-06, + "loss": 0.2982, + "step": 2548 + }, + { + "epoch": 0.06450388440418048, + "grad_norm": 6.525918960571289, + "learning_rate": 9.925453752108125e-06, + "loss": 0.1347, + "step": 2549 + }, + { + "epoch": 0.06452918996887416, + "grad_norm": 7.266229629516602, + "learning_rate": 9.925384660095486e-06, + "loss": 0.1917, + "step": 2550 + }, + { + "epoch": 0.06455449553356783, + "grad_norm": 10.850529670715332, + "learning_rate": 9.925315536319955e-06, + "loss": 0.2021, + "step": 2551 + }, + { + "epoch": 0.0645798010982615, + "grad_norm": 8.448720932006836, + "learning_rate": 9.925246380781981e-06, + "loss": 0.2521, + "step": 2552 + }, + { + "epoch": 0.06460510666295519, + "grad_norm": 3.8011133670806885, + "learning_rate": 9.92517719348201e-06, + "loss": 0.0891, + "step": 2553 + }, + { + "epoch": 0.06463041222764886, + "grad_norm": 8.109079360961914, + "learning_rate": 9.925107974420487e-06, + "loss": 0.3257, + "step": 2554 + }, + { + "epoch": 0.06465571779234254, + "grad_norm": 5.518883228302002, + "learning_rate": 9.925038723597858e-06, + "loss": 0.2809, + "step": 2555 + }, + { + "epoch": 0.06468102335703621, + "grad_norm": 8.20553207397461, + "learning_rate": 9.924969441014571e-06, + "loss": 0.2921, + "step": 2556 + }, + { + "epoch": 0.0647063289217299, + "grad_norm": 5.965091705322266, + "learning_rate": 9.924900126671074e-06, + "loss": 0.1431, + "step": 2557 + }, + { + "epoch": 0.06473163448642356, + "grad_norm": 3.9324138164520264, + "learning_rate": 9.924830780567809e-06, + "loss": 0.1827, + "step": 2558 + }, + { + "epoch": 0.06475694005111723, + "grad_norm": 7.024853706359863, + "learning_rate": 9.924761402705227e-06, + "loss": 0.2932, + "step": 2559 + }, + { + "epoch": 0.06478224561581092, + "grad_norm": 8.561430931091309, + "learning_rate": 9.924691993083778e-06, + "loss": 0.2511, + "step": 2560 + }, + { + "epoch": 0.06480755118050459, + "grad_norm": 4.721031188964844, + "learning_rate": 9.924622551703902e-06, + "loss": 0.1953, + "step": 2561 + }, + { + "epoch": 0.06483285674519827, + "grad_norm": 7.884391784667969, + "learning_rate": 9.924553078566053e-06, + "loss": 0.2369, + "step": 2562 + }, + { + "epoch": 0.06485816230989194, + "grad_norm": 5.456563949584961, + "learning_rate": 9.924483573670678e-06, + "loss": 0.2411, + "step": 2563 + }, + { + "epoch": 0.06488346787458563, + "grad_norm": 5.74809455871582, + "learning_rate": 9.924414037018224e-06, + "loss": 0.1722, + "step": 2564 + }, + { + "epoch": 0.0649087734392793, + "grad_norm": 4.329058647155762, + "learning_rate": 9.92434446860914e-06, + "loss": 0.118, + "step": 2565 + }, + { + "epoch": 0.06493407900397298, + "grad_norm": 7.376564979553223, + "learning_rate": 9.924274868443873e-06, + "loss": 0.2407, + "step": 2566 + }, + { + "epoch": 0.06495938456866665, + "grad_norm": 9.013745307922363, + "learning_rate": 9.924205236522876e-06, + "loss": 0.1817, + "step": 2567 + }, + { + "epoch": 0.06498469013336032, + "grad_norm": 4.393987655639648, + "learning_rate": 9.924135572846594e-06, + "loss": 0.2182, + "step": 2568 + }, + { + "epoch": 0.065009995698054, + "grad_norm": 6.212281227111816, + "learning_rate": 9.924065877415477e-06, + "loss": 0.1977, + "step": 2569 + }, + { + "epoch": 0.06503530126274767, + "grad_norm": 8.153939247131348, + "learning_rate": 9.923996150229976e-06, + "loss": 0.274, + "step": 2570 + }, + { + "epoch": 0.06506060682744136, + "grad_norm": 5.65022611618042, + "learning_rate": 9.923926391290539e-06, + "loss": 0.2226, + "step": 2571 + }, + { + "epoch": 0.06508591239213503, + "grad_norm": 13.298787117004395, + "learning_rate": 9.923856600597618e-06, + "loss": 0.2553, + "step": 2572 + }, + { + "epoch": 0.06511121795682871, + "grad_norm": 10.379897117614746, + "learning_rate": 9.92378677815166e-06, + "loss": 0.2091, + "step": 2573 + }, + { + "epoch": 0.06513652352152238, + "grad_norm": 5.848214149475098, + "learning_rate": 9.923716923953119e-06, + "loss": 0.2042, + "step": 2574 + }, + { + "epoch": 0.06516182908621605, + "grad_norm": 25.75859832763672, + "learning_rate": 9.923647038002441e-06, + "loss": 0.3635, + "step": 2575 + }, + { + "epoch": 0.06518713465090974, + "grad_norm": 6.6865363121032715, + "learning_rate": 9.923577120300081e-06, + "loss": 0.2851, + "step": 2576 + }, + { + "epoch": 0.0652124402156034, + "grad_norm": 16.439708709716797, + "learning_rate": 9.923507170846488e-06, + "loss": 0.1263, + "step": 2577 + }, + { + "epoch": 0.06523774578029709, + "grad_norm": 3.747251510620117, + "learning_rate": 9.923437189642112e-06, + "loss": 0.1499, + "step": 2578 + }, + { + "epoch": 0.06526305134499076, + "grad_norm": 12.815113067626953, + "learning_rate": 9.923367176687405e-06, + "loss": 0.2121, + "step": 2579 + }, + { + "epoch": 0.06528835690968444, + "grad_norm": 6.507975101470947, + "learning_rate": 9.923297131982821e-06, + "loss": 0.2331, + "step": 2580 + }, + { + "epoch": 0.06531366247437811, + "grad_norm": 9.538745880126953, + "learning_rate": 9.923227055528807e-06, + "loss": 0.2979, + "step": 2581 + }, + { + "epoch": 0.0653389680390718, + "grad_norm": 15.56159782409668, + "learning_rate": 9.92315694732582e-06, + "loss": 0.2648, + "step": 2582 + }, + { + "epoch": 0.06536427360376547, + "grad_norm": 13.280177116394043, + "learning_rate": 9.923086807374309e-06, + "loss": 0.2262, + "step": 2583 + }, + { + "epoch": 0.06538957916845914, + "grad_norm": 3.699235439300537, + "learning_rate": 9.923016635674727e-06, + "loss": 0.1859, + "step": 2584 + }, + { + "epoch": 0.06541488473315282, + "grad_norm": 8.936229705810547, + "learning_rate": 9.922946432227525e-06, + "loss": 0.2206, + "step": 2585 + }, + { + "epoch": 0.06544019029784649, + "grad_norm": 5.262148857116699, + "learning_rate": 9.922876197033157e-06, + "loss": 0.2146, + "step": 2586 + }, + { + "epoch": 0.06546549586254018, + "grad_norm": 4.160294055938721, + "learning_rate": 9.92280593009208e-06, + "loss": 0.279, + "step": 2587 + }, + { + "epoch": 0.06549080142723385, + "grad_norm": 6.920108795166016, + "learning_rate": 9.92273563140474e-06, + "loss": 0.1506, + "step": 2588 + }, + { + "epoch": 0.06551610699192753, + "grad_norm": 5.410256862640381, + "learning_rate": 9.922665300971594e-06, + "loss": 0.199, + "step": 2589 + }, + { + "epoch": 0.0655414125566212, + "grad_norm": 5.883775234222412, + "learning_rate": 9.922594938793096e-06, + "loss": 0.2204, + "step": 2590 + }, + { + "epoch": 0.06556671812131487, + "grad_norm": 6.059230804443359, + "learning_rate": 9.922524544869697e-06, + "loss": 0.2172, + "step": 2591 + }, + { + "epoch": 0.06559202368600855, + "grad_norm": 11.25751781463623, + "learning_rate": 9.922454119201853e-06, + "loss": 0.3268, + "step": 2592 + }, + { + "epoch": 0.06561732925070222, + "grad_norm": 7.416118144989014, + "learning_rate": 9.92238366179002e-06, + "loss": 0.2411, + "step": 2593 + }, + { + "epoch": 0.06564263481539591, + "grad_norm": 5.9028754234313965, + "learning_rate": 9.92231317263465e-06, + "loss": 0.2321, + "step": 2594 + }, + { + "epoch": 0.06566794038008958, + "grad_norm": 6.266108512878418, + "learning_rate": 9.922242651736197e-06, + "loss": 0.2327, + "step": 2595 + }, + { + "epoch": 0.06569324594478326, + "grad_norm": 8.6432466506958, + "learning_rate": 9.922172099095118e-06, + "loss": 0.1015, + "step": 2596 + }, + { + "epoch": 0.06571855150947693, + "grad_norm": 11.684826850891113, + "learning_rate": 9.922101514711866e-06, + "loss": 0.1665, + "step": 2597 + }, + { + "epoch": 0.06574385707417062, + "grad_norm": 10.582125663757324, + "learning_rate": 9.922030898586896e-06, + "loss": 0.2702, + "step": 2598 + }, + { + "epoch": 0.06576916263886429, + "grad_norm": 6.124980926513672, + "learning_rate": 9.921960250720666e-06, + "loss": 0.2093, + "step": 2599 + }, + { + "epoch": 0.06579446820355796, + "grad_norm": 4.8549041748046875, + "learning_rate": 9.921889571113629e-06, + "loss": 0.1867, + "step": 2600 + }, + { + "epoch": 0.06581977376825164, + "grad_norm": 10.02391242980957, + "learning_rate": 9.92181885976624e-06, + "loss": 0.378, + "step": 2601 + }, + { + "epoch": 0.06584507933294531, + "grad_norm": 9.646856307983398, + "learning_rate": 9.92174811667896e-06, + "loss": 0.1857, + "step": 2602 + }, + { + "epoch": 0.065870384897639, + "grad_norm": 7.286579132080078, + "learning_rate": 9.92167734185224e-06, + "loss": 0.1744, + "step": 2603 + }, + { + "epoch": 0.06589569046233267, + "grad_norm": 5.388571739196777, + "learning_rate": 9.921606535286538e-06, + "loss": 0.1319, + "step": 2604 + }, + { + "epoch": 0.06592099602702635, + "grad_norm": 10.721634864807129, + "learning_rate": 9.921535696982313e-06, + "loss": 0.2127, + "step": 2605 + }, + { + "epoch": 0.06594630159172002, + "grad_norm": 12.242243766784668, + "learning_rate": 9.921464826940018e-06, + "loss": 0.2134, + "step": 2606 + }, + { + "epoch": 0.06597160715641369, + "grad_norm": 9.144187927246094, + "learning_rate": 9.921393925160112e-06, + "loss": 0.2545, + "step": 2607 + }, + { + "epoch": 0.06599691272110737, + "grad_norm": 6.351029396057129, + "learning_rate": 9.921322991643053e-06, + "loss": 0.1401, + "step": 2608 + }, + { + "epoch": 0.06602221828580104, + "grad_norm": 16.51276206970215, + "learning_rate": 9.921252026389296e-06, + "loss": 0.2693, + "step": 2609 + }, + { + "epoch": 0.06604752385049473, + "grad_norm": 13.699933052062988, + "learning_rate": 9.921181029399301e-06, + "loss": 0.3646, + "step": 2610 + }, + { + "epoch": 0.0660728294151884, + "grad_norm": 6.416121959686279, + "learning_rate": 9.921110000673525e-06, + "loss": 0.2505, + "step": 2611 + }, + { + "epoch": 0.06609813497988208, + "grad_norm": 17.019977569580078, + "learning_rate": 9.921038940212425e-06, + "loss": 0.2801, + "step": 2612 + }, + { + "epoch": 0.06612344054457575, + "grad_norm": 5.2826337814331055, + "learning_rate": 9.920967848016461e-06, + "loss": 0.1134, + "step": 2613 + }, + { + "epoch": 0.06614874610926944, + "grad_norm": 8.272168159484863, + "learning_rate": 9.92089672408609e-06, + "loss": 0.2276, + "step": 2614 + }, + { + "epoch": 0.0661740516739631, + "grad_norm": 8.459026336669922, + "learning_rate": 9.920825568421774e-06, + "loss": 0.24, + "step": 2615 + }, + { + "epoch": 0.06619935723865678, + "grad_norm": 5.069420337677002, + "learning_rate": 9.920754381023967e-06, + "loss": 0.2297, + "step": 2616 + }, + { + "epoch": 0.06622466280335046, + "grad_norm": 6.0676045417785645, + "learning_rate": 9.920683161893131e-06, + "loss": 0.1753, + "step": 2617 + }, + { + "epoch": 0.06624996836804413, + "grad_norm": 3.9529409408569336, + "learning_rate": 9.920611911029725e-06, + "loss": 0.1661, + "step": 2618 + }, + { + "epoch": 0.06627527393273781, + "grad_norm": 10.179482460021973, + "learning_rate": 9.920540628434207e-06, + "loss": 0.3041, + "step": 2619 + }, + { + "epoch": 0.06630057949743148, + "grad_norm": 7.291133403778076, + "learning_rate": 9.920469314107037e-06, + "loss": 0.1423, + "step": 2620 + }, + { + "epoch": 0.06632588506212517, + "grad_norm": 7.881807804107666, + "learning_rate": 9.92039796804868e-06, + "loss": 0.1606, + "step": 2621 + }, + { + "epoch": 0.06635119062681884, + "grad_norm": 5.869313716888428, + "learning_rate": 9.920326590259587e-06, + "loss": 0.1388, + "step": 2622 + }, + { + "epoch": 0.06637649619151251, + "grad_norm": 5.745005130767822, + "learning_rate": 9.920255180740223e-06, + "loss": 0.168, + "step": 2623 + }, + { + "epoch": 0.06640180175620619, + "grad_norm": 8.891243934631348, + "learning_rate": 9.920183739491051e-06, + "loss": 0.254, + "step": 2624 + }, + { + "epoch": 0.06642710732089986, + "grad_norm": 7.715859413146973, + "learning_rate": 9.92011226651253e-06, + "loss": 0.1892, + "step": 2625 + }, + { + "epoch": 0.06645241288559355, + "grad_norm": 9.471552848815918, + "learning_rate": 9.920040761805118e-06, + "loss": 0.2614, + "step": 2626 + }, + { + "epoch": 0.06647771845028722, + "grad_norm": 12.242490768432617, + "learning_rate": 9.919969225369279e-06, + "loss": 0.4191, + "step": 2627 + }, + { + "epoch": 0.0665030240149809, + "grad_norm": 10.10022258758545, + "learning_rate": 9.919897657205472e-06, + "loss": 0.2013, + "step": 2628 + }, + { + "epoch": 0.06652832957967457, + "grad_norm": 8.141284942626953, + "learning_rate": 9.919826057314163e-06, + "loss": 0.2099, + "step": 2629 + }, + { + "epoch": 0.06655363514436825, + "grad_norm": 8.569403648376465, + "learning_rate": 9.919754425695812e-06, + "loss": 0.2248, + "step": 2630 + }, + { + "epoch": 0.06657894070906192, + "grad_norm": 4.611416816711426, + "learning_rate": 9.919682762350876e-06, + "loss": 0.1412, + "step": 2631 + }, + { + "epoch": 0.0666042462737556, + "grad_norm": 4.748319149017334, + "learning_rate": 9.919611067279823e-06, + "loss": 0.2291, + "step": 2632 + }, + { + "epoch": 0.06662955183844928, + "grad_norm": 5.935461521148682, + "learning_rate": 9.919539340483113e-06, + "loss": 0.184, + "step": 2633 + }, + { + "epoch": 0.06665485740314295, + "grad_norm": 5.104527473449707, + "learning_rate": 9.919467581961208e-06, + "loss": 0.2302, + "step": 2634 + }, + { + "epoch": 0.06668016296783663, + "grad_norm": 6.582694053649902, + "learning_rate": 9.919395791714574e-06, + "loss": 0.2628, + "step": 2635 + }, + { + "epoch": 0.0667054685325303, + "grad_norm": 8.868546485900879, + "learning_rate": 9.919323969743669e-06, + "loss": 0.2892, + "step": 2636 + }, + { + "epoch": 0.06673077409722399, + "grad_norm": 6.769328594207764, + "learning_rate": 9.91925211604896e-06, + "loss": 0.2339, + "step": 2637 + }, + { + "epoch": 0.06675607966191766, + "grad_norm": 6.033790111541748, + "learning_rate": 9.919180230630909e-06, + "loss": 0.1879, + "step": 2638 + }, + { + "epoch": 0.06678138522661133, + "grad_norm": 18.0506534576416, + "learning_rate": 9.91910831348998e-06, + "loss": 0.2975, + "step": 2639 + }, + { + "epoch": 0.06680669079130501, + "grad_norm": 12.708052635192871, + "learning_rate": 9.919036364626638e-06, + "loss": 0.3466, + "step": 2640 + }, + { + "epoch": 0.06683199635599868, + "grad_norm": 7.014477252960205, + "learning_rate": 9.918964384041343e-06, + "loss": 0.2281, + "step": 2641 + }, + { + "epoch": 0.06685730192069236, + "grad_norm": 5.974145412445068, + "learning_rate": 9.918892371734563e-06, + "loss": 0.1075, + "step": 2642 + }, + { + "epoch": 0.06688260748538603, + "grad_norm": 6.193750381469727, + "learning_rate": 9.91882032770676e-06, + "loss": 0.1589, + "step": 2643 + }, + { + "epoch": 0.06690791305007972, + "grad_norm": 22.416927337646484, + "learning_rate": 9.918748251958402e-06, + "loss": 0.2209, + "step": 2644 + }, + { + "epoch": 0.06693321861477339, + "grad_norm": 8.54400634765625, + "learning_rate": 9.918676144489948e-06, + "loss": 0.2723, + "step": 2645 + }, + { + "epoch": 0.06695852417946707, + "grad_norm": 34.47336959838867, + "learning_rate": 9.918604005301868e-06, + "loss": 0.2283, + "step": 2646 + }, + { + "epoch": 0.06698382974416074, + "grad_norm": 7.44660758972168, + "learning_rate": 9.918531834394628e-06, + "loss": 0.2779, + "step": 2647 + }, + { + "epoch": 0.06700913530885441, + "grad_norm": 3.8785626888275146, + "learning_rate": 9.918459631768688e-06, + "loss": 0.1784, + "step": 2648 + }, + { + "epoch": 0.0670344408735481, + "grad_norm": 7.309852123260498, + "learning_rate": 9.918387397424517e-06, + "loss": 0.3167, + "step": 2649 + }, + { + "epoch": 0.06705974643824177, + "grad_norm": 6.326377868652344, + "learning_rate": 9.918315131362581e-06, + "loss": 0.2299, + "step": 2650 + }, + { + "epoch": 0.06708505200293545, + "grad_norm": 9.39284896850586, + "learning_rate": 9.918242833583347e-06, + "loss": 0.2392, + "step": 2651 + }, + { + "epoch": 0.06711035756762912, + "grad_norm": 3.140108823776245, + "learning_rate": 9.918170504087278e-06, + "loss": 0.1203, + "step": 2652 + }, + { + "epoch": 0.0671356631323228, + "grad_norm": 5.872972011566162, + "learning_rate": 9.918098142874843e-06, + "loss": 0.2296, + "step": 2653 + }, + { + "epoch": 0.06716096869701647, + "grad_norm": 26.89016342163086, + "learning_rate": 9.91802574994651e-06, + "loss": 0.4149, + "step": 2654 + }, + { + "epoch": 0.06718627426171014, + "grad_norm": 5.011120796203613, + "learning_rate": 9.917953325302741e-06, + "loss": 0.2125, + "step": 2655 + }, + { + "epoch": 0.06721157982640383, + "grad_norm": 5.539502143859863, + "learning_rate": 9.917880868944005e-06, + "loss": 0.1726, + "step": 2656 + }, + { + "epoch": 0.0672368853910975, + "grad_norm": 6.6063313484191895, + "learning_rate": 9.917808380870772e-06, + "loss": 0.2624, + "step": 2657 + }, + { + "epoch": 0.06726219095579118, + "grad_norm": 6.251620769500732, + "learning_rate": 9.917735861083508e-06, + "loss": 0.2079, + "step": 2658 + }, + { + "epoch": 0.06728749652048485, + "grad_norm": 9.851106643676758, + "learning_rate": 9.917663309582679e-06, + "loss": 0.1672, + "step": 2659 + }, + { + "epoch": 0.06731280208517854, + "grad_norm": 14.557476043701172, + "learning_rate": 9.917590726368756e-06, + "loss": 0.2875, + "step": 2660 + }, + { + "epoch": 0.0673381076498722, + "grad_norm": 9.24103832244873, + "learning_rate": 9.917518111442203e-06, + "loss": 0.2963, + "step": 2661 + }, + { + "epoch": 0.06736341321456589, + "grad_norm": 12.723952293395996, + "learning_rate": 9.91744546480349e-06, + "loss": 0.2357, + "step": 2662 + }, + { + "epoch": 0.06738871877925956, + "grad_norm": 8.055302619934082, + "learning_rate": 9.917372786453088e-06, + "loss": 0.3255, + "step": 2663 + }, + { + "epoch": 0.06741402434395323, + "grad_norm": 4.982765197753906, + "learning_rate": 9.917300076391461e-06, + "loss": 0.2855, + "step": 2664 + }, + { + "epoch": 0.06743932990864691, + "grad_norm": 13.250364303588867, + "learning_rate": 9.917227334619083e-06, + "loss": 0.2549, + "step": 2665 + }, + { + "epoch": 0.06746463547334058, + "grad_norm": 6.78289270401001, + "learning_rate": 9.91715456113642e-06, + "loss": 0.2123, + "step": 2666 + }, + { + "epoch": 0.06748994103803427, + "grad_norm": 12.269546508789062, + "learning_rate": 9.91708175594394e-06, + "loss": 0.1895, + "step": 2667 + }, + { + "epoch": 0.06751524660272794, + "grad_norm": 13.262333869934082, + "learning_rate": 9.917008919042117e-06, + "loss": 0.2955, + "step": 2668 + }, + { + "epoch": 0.06754055216742162, + "grad_norm": 7.783548355102539, + "learning_rate": 9.916936050431417e-06, + "loss": 0.2914, + "step": 2669 + }, + { + "epoch": 0.06756585773211529, + "grad_norm": 4.68500280380249, + "learning_rate": 9.916863150112311e-06, + "loss": 0.1929, + "step": 2670 + }, + { + "epoch": 0.06759116329680896, + "grad_norm": 10.524398803710938, + "learning_rate": 9.91679021808527e-06, + "loss": 0.2248, + "step": 2671 + }, + { + "epoch": 0.06761646886150265, + "grad_norm": 15.743545532226562, + "learning_rate": 9.916717254350763e-06, + "loss": 0.4208, + "step": 2672 + }, + { + "epoch": 0.06764177442619632, + "grad_norm": 6.418128490447998, + "learning_rate": 9.91664425890926e-06, + "loss": 0.2358, + "step": 2673 + }, + { + "epoch": 0.06766707999089, + "grad_norm": 2.8532164096832275, + "learning_rate": 9.916571231761233e-06, + "loss": 0.1432, + "step": 2674 + }, + { + "epoch": 0.06769238555558367, + "grad_norm": 14.98817253112793, + "learning_rate": 9.916498172907155e-06, + "loss": 0.1467, + "step": 2675 + }, + { + "epoch": 0.06771769112027735, + "grad_norm": 7.730781078338623, + "learning_rate": 9.916425082347492e-06, + "loss": 0.2663, + "step": 2676 + }, + { + "epoch": 0.06774299668497102, + "grad_norm": 7.308042526245117, + "learning_rate": 9.91635196008272e-06, + "loss": 0.2889, + "step": 2677 + }, + { + "epoch": 0.06776830224966471, + "grad_norm": 4.37701940536499, + "learning_rate": 9.916278806113306e-06, + "loss": 0.1623, + "step": 2678 + }, + { + "epoch": 0.06779360781435838, + "grad_norm": 6.680368423461914, + "learning_rate": 9.916205620439727e-06, + "loss": 0.2347, + "step": 2679 + }, + { + "epoch": 0.06781891337905205, + "grad_norm": 5.540055274963379, + "learning_rate": 9.916132403062451e-06, + "loss": 0.2332, + "step": 2680 + }, + { + "epoch": 0.06784421894374573, + "grad_norm": 9.745691299438477, + "learning_rate": 9.916059153981954e-06, + "loss": 0.2286, + "step": 2681 + }, + { + "epoch": 0.0678695245084394, + "grad_norm": 5.1743292808532715, + "learning_rate": 9.915985873198703e-06, + "loss": 0.2145, + "step": 2682 + }, + { + "epoch": 0.06789483007313309, + "grad_norm": 4.087460517883301, + "learning_rate": 9.915912560713176e-06, + "loss": 0.1728, + "step": 2683 + }, + { + "epoch": 0.06792013563782676, + "grad_norm": 6.608890056610107, + "learning_rate": 9.915839216525842e-06, + "loss": 0.2686, + "step": 2684 + }, + { + "epoch": 0.06794544120252044, + "grad_norm": 5.961402893066406, + "learning_rate": 9.915765840637174e-06, + "loss": 0.1429, + "step": 2685 + }, + { + "epoch": 0.06797074676721411, + "grad_norm": 9.176290512084961, + "learning_rate": 9.91569243304765e-06, + "loss": 0.2203, + "step": 2686 + }, + { + "epoch": 0.06799605233190778, + "grad_norm": 3.0490565299987793, + "learning_rate": 9.915618993757737e-06, + "loss": 0.1543, + "step": 2687 + }, + { + "epoch": 0.06802135789660146, + "grad_norm": 6.367225170135498, + "learning_rate": 9.915545522767913e-06, + "loss": 0.2674, + "step": 2688 + }, + { + "epoch": 0.06804666346129513, + "grad_norm": 8.097295761108398, + "learning_rate": 9.915472020078649e-06, + "loss": 0.3097, + "step": 2689 + }, + { + "epoch": 0.06807196902598882, + "grad_norm": 10.59713363647461, + "learning_rate": 9.915398485690421e-06, + "loss": 0.2525, + "step": 2690 + }, + { + "epoch": 0.06809727459068249, + "grad_norm": 6.169872283935547, + "learning_rate": 9.915324919603703e-06, + "loss": 0.2401, + "step": 2691 + }, + { + "epoch": 0.06812258015537617, + "grad_norm": 5.113864421844482, + "learning_rate": 9.915251321818968e-06, + "loss": 0.1889, + "step": 2692 + }, + { + "epoch": 0.06814788572006984, + "grad_norm": 6.3364105224609375, + "learning_rate": 9.915177692336693e-06, + "loss": 0.1942, + "step": 2693 + }, + { + "epoch": 0.06817319128476353, + "grad_norm": 10.502400398254395, + "learning_rate": 9.91510403115735e-06, + "loss": 0.3192, + "step": 2694 + }, + { + "epoch": 0.0681984968494572, + "grad_norm": 9.795792579650879, + "learning_rate": 9.915030338281415e-06, + "loss": 0.2897, + "step": 2695 + }, + { + "epoch": 0.06822380241415087, + "grad_norm": 12.33945369720459, + "learning_rate": 9.914956613709364e-06, + "loss": 0.3157, + "step": 2696 + }, + { + "epoch": 0.06824910797884455, + "grad_norm": 4.46164083480835, + "learning_rate": 9.914882857441672e-06, + "loss": 0.2197, + "step": 2697 + }, + { + "epoch": 0.06827441354353822, + "grad_norm": 5.23024320602417, + "learning_rate": 9.914809069478816e-06, + "loss": 0.2342, + "step": 2698 + }, + { + "epoch": 0.0682997191082319, + "grad_norm": 4.4876790046691895, + "learning_rate": 9.91473524982127e-06, + "loss": 0.1999, + "step": 2699 + }, + { + "epoch": 0.06832502467292557, + "grad_norm": 15.741647720336914, + "learning_rate": 9.91466139846951e-06, + "loss": 0.3727, + "step": 2700 + }, + { + "epoch": 0.06835033023761926, + "grad_norm": 13.526158332824707, + "learning_rate": 9.914587515424012e-06, + "loss": 0.3216, + "step": 2701 + }, + { + "epoch": 0.06837563580231293, + "grad_norm": 5.8487725257873535, + "learning_rate": 9.914513600685255e-06, + "loss": 0.2414, + "step": 2702 + }, + { + "epoch": 0.0684009413670066, + "grad_norm": 6.444308280944824, + "learning_rate": 9.914439654253713e-06, + "loss": 0.2327, + "step": 2703 + }, + { + "epoch": 0.06842624693170028, + "grad_norm": 6.979428768157959, + "learning_rate": 9.914365676129864e-06, + "loss": 0.2711, + "step": 2704 + }, + { + "epoch": 0.06845155249639395, + "grad_norm": 9.91465950012207, + "learning_rate": 9.914291666314185e-06, + "loss": 0.2765, + "step": 2705 + }, + { + "epoch": 0.06847685806108764, + "grad_norm": 5.367353916168213, + "learning_rate": 9.914217624807152e-06, + "loss": 0.2052, + "step": 2706 + }, + { + "epoch": 0.0685021636257813, + "grad_norm": 8.8436279296875, + "learning_rate": 9.914143551609246e-06, + "loss": 0.2059, + "step": 2707 + }, + { + "epoch": 0.06852746919047499, + "grad_norm": 7.681924819946289, + "learning_rate": 9.91406944672094e-06, + "loss": 0.2761, + "step": 2708 + }, + { + "epoch": 0.06855277475516866, + "grad_norm": 4.870830535888672, + "learning_rate": 9.913995310142716e-06, + "loss": 0.2205, + "step": 2709 + }, + { + "epoch": 0.06857808031986234, + "grad_norm": 4.258144378662109, + "learning_rate": 9.91392114187505e-06, + "loss": 0.2477, + "step": 2710 + }, + { + "epoch": 0.06860338588455601, + "grad_norm": 6.8509440422058105, + "learning_rate": 9.91384694191842e-06, + "loss": 0.2237, + "step": 2711 + }, + { + "epoch": 0.06862869144924968, + "grad_norm": 4.209290981292725, + "learning_rate": 9.913772710273306e-06, + "loss": 0.2007, + "step": 2712 + }, + { + "epoch": 0.06865399701394337, + "grad_norm": 7.280081748962402, + "learning_rate": 9.913698446940185e-06, + "loss": 0.2806, + "step": 2713 + }, + { + "epoch": 0.06867930257863704, + "grad_norm": 10.571649551391602, + "learning_rate": 9.913624151919537e-06, + "loss": 0.1545, + "step": 2714 + }, + { + "epoch": 0.06870460814333072, + "grad_norm": 10.152403831481934, + "learning_rate": 9.913549825211842e-06, + "loss": 0.3452, + "step": 2715 + }, + { + "epoch": 0.06872991370802439, + "grad_norm": 7.746426582336426, + "learning_rate": 9.913475466817575e-06, + "loss": 0.1388, + "step": 2716 + }, + { + "epoch": 0.06875521927271808, + "grad_norm": 10.955488204956055, + "learning_rate": 9.913401076737221e-06, + "loss": 0.2213, + "step": 2717 + }, + { + "epoch": 0.06878052483741175, + "grad_norm": 7.569055557250977, + "learning_rate": 9.913326654971257e-06, + "loss": 0.1752, + "step": 2718 + }, + { + "epoch": 0.06880583040210542, + "grad_norm": 7.639313220977783, + "learning_rate": 9.913252201520164e-06, + "loss": 0.1649, + "step": 2719 + }, + { + "epoch": 0.0688311359667991, + "grad_norm": 8.647318840026855, + "learning_rate": 9.913177716384421e-06, + "loss": 0.2179, + "step": 2720 + }, + { + "epoch": 0.06885644153149277, + "grad_norm": 7.0726447105407715, + "learning_rate": 9.91310319956451e-06, + "loss": 0.165, + "step": 2721 + }, + { + "epoch": 0.06888174709618645, + "grad_norm": 5.432473182678223, + "learning_rate": 9.913028651060907e-06, + "loss": 0.2253, + "step": 2722 + }, + { + "epoch": 0.06890705266088012, + "grad_norm": 8.769659996032715, + "learning_rate": 9.912954070874098e-06, + "loss": 0.3724, + "step": 2723 + }, + { + "epoch": 0.06893235822557381, + "grad_norm": 16.50470542907715, + "learning_rate": 9.912879459004563e-06, + "loss": 0.3589, + "step": 2724 + }, + { + "epoch": 0.06895766379026748, + "grad_norm": 10.447502136230469, + "learning_rate": 9.912804815452781e-06, + "loss": 0.2888, + "step": 2725 + }, + { + "epoch": 0.06898296935496116, + "grad_norm": 9.944916725158691, + "learning_rate": 9.912730140219236e-06, + "loss": 0.2782, + "step": 2726 + }, + { + "epoch": 0.06900827491965483, + "grad_norm": 14.756790161132812, + "learning_rate": 9.912655433304406e-06, + "loss": 0.1783, + "step": 2727 + }, + { + "epoch": 0.0690335804843485, + "grad_norm": 12.732749938964844, + "learning_rate": 9.912580694708777e-06, + "loss": 0.3736, + "step": 2728 + }, + { + "epoch": 0.06905888604904219, + "grad_norm": 10.536060333251953, + "learning_rate": 9.912505924432828e-06, + "loss": 0.2289, + "step": 2729 + }, + { + "epoch": 0.06908419161373586, + "grad_norm": 3.830152750015259, + "learning_rate": 9.912431122477042e-06, + "loss": 0.2011, + "step": 2730 + }, + { + "epoch": 0.06910949717842954, + "grad_norm": 7.934570789337158, + "learning_rate": 9.912356288841902e-06, + "loss": 0.2147, + "step": 2731 + }, + { + "epoch": 0.06913480274312321, + "grad_norm": 6.206683158874512, + "learning_rate": 9.91228142352789e-06, + "loss": 0.2063, + "step": 2732 + }, + { + "epoch": 0.0691601083078169, + "grad_norm": 6.305654525756836, + "learning_rate": 9.912206526535488e-06, + "loss": 0.1466, + "step": 2733 + }, + { + "epoch": 0.06918541387251056, + "grad_norm": 6.035426139831543, + "learning_rate": 9.912131597865181e-06, + "loss": 0.2117, + "step": 2734 + }, + { + "epoch": 0.06921071943720424, + "grad_norm": 7.8976640701293945, + "learning_rate": 9.912056637517451e-06, + "loss": 0.1762, + "step": 2735 + }, + { + "epoch": 0.06923602500189792, + "grad_norm": 19.221508026123047, + "learning_rate": 9.911981645492781e-06, + "loss": 0.2897, + "step": 2736 + }, + { + "epoch": 0.06926133056659159, + "grad_norm": 9.408977508544922, + "learning_rate": 9.911906621791654e-06, + "loss": 0.2062, + "step": 2737 + }, + { + "epoch": 0.06928663613128527, + "grad_norm": 6.411707401275635, + "learning_rate": 9.911831566414556e-06, + "loss": 0.2183, + "step": 2738 + }, + { + "epoch": 0.06931194169597894, + "grad_norm": 5.108758449554443, + "learning_rate": 9.911756479361971e-06, + "loss": 0.2, + "step": 2739 + }, + { + "epoch": 0.06933724726067263, + "grad_norm": 7.502475261688232, + "learning_rate": 9.911681360634379e-06, + "loss": 0.1734, + "step": 2740 + }, + { + "epoch": 0.0693625528253663, + "grad_norm": 10.956083297729492, + "learning_rate": 9.911606210232271e-06, + "loss": 0.3323, + "step": 2741 + }, + { + "epoch": 0.06938785839005998, + "grad_norm": 28.611173629760742, + "learning_rate": 9.911531028156127e-06, + "loss": 0.27, + "step": 2742 + }, + { + "epoch": 0.06941316395475365, + "grad_norm": 13.501500129699707, + "learning_rate": 9.911455814406433e-06, + "loss": 0.3405, + "step": 2743 + }, + { + "epoch": 0.06943846951944732, + "grad_norm": 11.258886337280273, + "learning_rate": 9.911380568983672e-06, + "loss": 0.2911, + "step": 2744 + }, + { + "epoch": 0.069463775084141, + "grad_norm": 6.263479232788086, + "learning_rate": 9.911305291888333e-06, + "loss": 0.2624, + "step": 2745 + }, + { + "epoch": 0.06948908064883468, + "grad_norm": 5.781370639801025, + "learning_rate": 9.9112299831209e-06, + "loss": 0.1879, + "step": 2746 + }, + { + "epoch": 0.06951438621352836, + "grad_norm": 10.041987419128418, + "learning_rate": 9.91115464268186e-06, + "loss": 0.1641, + "step": 2747 + }, + { + "epoch": 0.06953969177822203, + "grad_norm": 6.318222999572754, + "learning_rate": 9.911079270571695e-06, + "loss": 0.2163, + "step": 2748 + }, + { + "epoch": 0.06956499734291571, + "grad_norm": 12.0723237991333, + "learning_rate": 9.911003866790894e-06, + "loss": 0.2154, + "step": 2749 + }, + { + "epoch": 0.06959030290760938, + "grad_norm": 6.07412576675415, + "learning_rate": 9.910928431339941e-06, + "loss": 0.1934, + "step": 2750 + }, + { + "epoch": 0.06961560847230305, + "grad_norm": 10.009506225585938, + "learning_rate": 9.910852964219326e-06, + "loss": 0.2908, + "step": 2751 + }, + { + "epoch": 0.06964091403699674, + "grad_norm": 5.801324844360352, + "learning_rate": 9.910777465429533e-06, + "loss": 0.208, + "step": 2752 + }, + { + "epoch": 0.06966621960169041, + "grad_norm": 7.403270244598389, + "learning_rate": 9.91070193497105e-06, + "loss": 0.2284, + "step": 2753 + }, + { + "epoch": 0.06969152516638409, + "grad_norm": 13.322779655456543, + "learning_rate": 9.910626372844363e-06, + "loss": 0.2127, + "step": 2754 + }, + { + "epoch": 0.06971683073107776, + "grad_norm": 8.395007133483887, + "learning_rate": 9.91055077904996e-06, + "loss": 0.1827, + "step": 2755 + }, + { + "epoch": 0.06974213629577145, + "grad_norm": 7.321232795715332, + "learning_rate": 9.91047515358833e-06, + "loss": 0.2033, + "step": 2756 + }, + { + "epoch": 0.06976744186046512, + "grad_norm": 6.443145751953125, + "learning_rate": 9.910399496459957e-06, + "loss": 0.2104, + "step": 2757 + }, + { + "epoch": 0.0697927474251588, + "grad_norm": 5.2731475830078125, + "learning_rate": 9.910323807665331e-06, + "loss": 0.2465, + "step": 2758 + }, + { + "epoch": 0.06981805298985247, + "grad_norm": 6.226791858673096, + "learning_rate": 9.910248087204944e-06, + "loss": 0.2495, + "step": 2759 + }, + { + "epoch": 0.06984335855454614, + "grad_norm": 9.60467529296875, + "learning_rate": 9.910172335079275e-06, + "loss": 0.2241, + "step": 2760 + }, + { + "epoch": 0.06986866411923982, + "grad_norm": 5.311227798461914, + "learning_rate": 9.910096551288821e-06, + "loss": 0.1749, + "step": 2761 + }, + { + "epoch": 0.0698939696839335, + "grad_norm": 4.593298435211182, + "learning_rate": 9.910020735834069e-06, + "loss": 0.2045, + "step": 2762 + }, + { + "epoch": 0.06991927524862718, + "grad_norm": 15.15086841583252, + "learning_rate": 9.909944888715503e-06, + "loss": 0.393, + "step": 2763 + }, + { + "epoch": 0.06994458081332085, + "grad_norm": 14.024909019470215, + "learning_rate": 9.90986900993362e-06, + "loss": 0.3706, + "step": 2764 + }, + { + "epoch": 0.06996988637801453, + "grad_norm": 5.402571678161621, + "learning_rate": 9.9097930994889e-06, + "loss": 0.218, + "step": 2765 + }, + { + "epoch": 0.0699951919427082, + "grad_norm": 5.597771644592285, + "learning_rate": 9.909717157381841e-06, + "loss": 0.2464, + "step": 2766 + }, + { + "epoch": 0.07002049750740187, + "grad_norm": 14.11719036102295, + "learning_rate": 9.90964118361293e-06, + "loss": 0.4479, + "step": 2767 + }, + { + "epoch": 0.07004580307209556, + "grad_norm": 5.599606513977051, + "learning_rate": 9.909565178182654e-06, + "loss": 0.2076, + "step": 2768 + }, + { + "epoch": 0.07007110863678923, + "grad_norm": 6.283289432525635, + "learning_rate": 9.909489141091507e-06, + "loss": 0.2413, + "step": 2769 + }, + { + "epoch": 0.07009641420148291, + "grad_norm": 6.709274768829346, + "learning_rate": 9.909413072339977e-06, + "loss": 0.2542, + "step": 2770 + }, + { + "epoch": 0.07012171976617658, + "grad_norm": 6.072334289550781, + "learning_rate": 9.909336971928555e-06, + "loss": 0.2334, + "step": 2771 + }, + { + "epoch": 0.07014702533087026, + "grad_norm": 5.834033012390137, + "learning_rate": 9.909260839857734e-06, + "loss": 0.157, + "step": 2772 + }, + { + "epoch": 0.07017233089556393, + "grad_norm": 8.975558280944824, + "learning_rate": 9.909184676128e-06, + "loss": 0.2277, + "step": 2773 + }, + { + "epoch": 0.07019763646025762, + "grad_norm": 5.350551605224609, + "learning_rate": 9.90910848073985e-06, + "loss": 0.2361, + "step": 2774 + }, + { + "epoch": 0.07022294202495129, + "grad_norm": 5.224635601043701, + "learning_rate": 9.90903225369377e-06, + "loss": 0.2567, + "step": 2775 + }, + { + "epoch": 0.07024824758964496, + "grad_norm": 7.654327869415283, + "learning_rate": 9.908955994990257e-06, + "loss": 0.2441, + "step": 2776 + }, + { + "epoch": 0.07027355315433864, + "grad_norm": 6.616550922393799, + "learning_rate": 9.908879704629797e-06, + "loss": 0.2149, + "step": 2777 + }, + { + "epoch": 0.07029885871903231, + "grad_norm": 3.292060613632202, + "learning_rate": 9.908803382612886e-06, + "loss": 0.0767, + "step": 2778 + }, + { + "epoch": 0.070324164283726, + "grad_norm": 6.723134994506836, + "learning_rate": 9.908727028940015e-06, + "loss": 0.2462, + "step": 2779 + }, + { + "epoch": 0.07034946984841967, + "grad_norm": 7.975804328918457, + "learning_rate": 9.908650643611677e-06, + "loss": 0.152, + "step": 2780 + }, + { + "epoch": 0.07037477541311335, + "grad_norm": 5.193274974822998, + "learning_rate": 9.908574226628365e-06, + "loss": 0.1465, + "step": 2781 + }, + { + "epoch": 0.07040008097780702, + "grad_norm": 11.75051212310791, + "learning_rate": 9.908497777990569e-06, + "loss": 0.3583, + "step": 2782 + }, + { + "epoch": 0.07042538654250069, + "grad_norm": 9.1771821975708, + "learning_rate": 9.908421297698785e-06, + "loss": 0.2848, + "step": 2783 + }, + { + "epoch": 0.07045069210719437, + "grad_norm": 13.801336288452148, + "learning_rate": 9.908344785753505e-06, + "loss": 0.2659, + "step": 2784 + }, + { + "epoch": 0.07047599767188804, + "grad_norm": 13.001602172851562, + "learning_rate": 9.90826824215522e-06, + "loss": 0.2874, + "step": 2785 + }, + { + "epoch": 0.07050130323658173, + "grad_norm": 8.560189247131348, + "learning_rate": 9.90819166690443e-06, + "loss": 0.2494, + "step": 2786 + }, + { + "epoch": 0.0705266088012754, + "grad_norm": 11.787034034729004, + "learning_rate": 9.908115060001624e-06, + "loss": 0.2961, + "step": 2787 + }, + { + "epoch": 0.07055191436596908, + "grad_norm": 5.746702671051025, + "learning_rate": 9.908038421447295e-06, + "loss": 0.203, + "step": 2788 + }, + { + "epoch": 0.07057721993066275, + "grad_norm": 20.642127990722656, + "learning_rate": 9.907961751241941e-06, + "loss": 0.4023, + "step": 2789 + }, + { + "epoch": 0.07060252549535644, + "grad_norm": 5.977545738220215, + "learning_rate": 9.907885049386054e-06, + "loss": 0.2651, + "step": 2790 + }, + { + "epoch": 0.0706278310600501, + "grad_norm": 6.322824478149414, + "learning_rate": 9.90780831588013e-06, + "loss": 0.2536, + "step": 2791 + }, + { + "epoch": 0.07065313662474378, + "grad_norm": 6.989267349243164, + "learning_rate": 9.907731550724662e-06, + "loss": 0.2564, + "step": 2792 + }, + { + "epoch": 0.07067844218943746, + "grad_norm": 38.822471618652344, + "learning_rate": 9.907654753920146e-06, + "loss": 0.1961, + "step": 2793 + }, + { + "epoch": 0.07070374775413113, + "grad_norm": 9.193778038024902, + "learning_rate": 9.907577925467079e-06, + "loss": 0.2649, + "step": 2794 + }, + { + "epoch": 0.07072905331882481, + "grad_norm": 7.9223527908325195, + "learning_rate": 9.907501065365953e-06, + "loss": 0.2398, + "step": 2795 + }, + { + "epoch": 0.07075435888351848, + "grad_norm": 7.00169563293457, + "learning_rate": 9.907424173617266e-06, + "loss": 0.1873, + "step": 2796 + }, + { + "epoch": 0.07077966444821217, + "grad_norm": 10.761435508728027, + "learning_rate": 9.907347250221514e-06, + "loss": 0.3338, + "step": 2797 + }, + { + "epoch": 0.07080497001290584, + "grad_norm": 10.743718147277832, + "learning_rate": 9.907270295179192e-06, + "loss": 0.2088, + "step": 2798 + }, + { + "epoch": 0.07083027557759951, + "grad_norm": 7.136923789978027, + "learning_rate": 9.907193308490797e-06, + "loss": 0.2165, + "step": 2799 + }, + { + "epoch": 0.07085558114229319, + "grad_norm": 5.345058917999268, + "learning_rate": 9.907116290156824e-06, + "loss": 0.214, + "step": 2800 + }, + { + "epoch": 0.07088088670698686, + "grad_norm": 10.639159202575684, + "learning_rate": 9.907039240177773e-06, + "loss": 0.2351, + "step": 2801 + }, + { + "epoch": 0.07090619227168055, + "grad_norm": 17.89259147644043, + "learning_rate": 9.906962158554137e-06, + "loss": 0.3572, + "step": 2802 + }, + { + "epoch": 0.07093149783637422, + "grad_norm": 7.305509090423584, + "learning_rate": 9.906885045286416e-06, + "loss": 0.1975, + "step": 2803 + }, + { + "epoch": 0.0709568034010679, + "grad_norm": 11.869732856750488, + "learning_rate": 9.906807900375103e-06, + "loss": 0.2272, + "step": 2804 + }, + { + "epoch": 0.07098210896576157, + "grad_norm": 4.388360500335693, + "learning_rate": 9.906730723820704e-06, + "loss": 0.167, + "step": 2805 + }, + { + "epoch": 0.07100741453045525, + "grad_norm": 4.628928184509277, + "learning_rate": 9.906653515623708e-06, + "loss": 0.1012, + "step": 2806 + }, + { + "epoch": 0.07103272009514892, + "grad_norm": 5.127703666687012, + "learning_rate": 9.906576275784616e-06, + "loss": 0.1986, + "step": 2807 + }, + { + "epoch": 0.0710580256598426, + "grad_norm": 10.224513053894043, + "learning_rate": 9.906499004303927e-06, + "loss": 0.1775, + "step": 2808 + }, + { + "epoch": 0.07108333122453628, + "grad_norm": 5.874336242675781, + "learning_rate": 9.906421701182137e-06, + "loss": 0.203, + "step": 2809 + }, + { + "epoch": 0.07110863678922995, + "grad_norm": 6.5731377601623535, + "learning_rate": 9.906344366419749e-06, + "loss": 0.2044, + "step": 2810 + }, + { + "epoch": 0.07113394235392363, + "grad_norm": 6.662053108215332, + "learning_rate": 9.906267000017256e-06, + "loss": 0.2273, + "step": 2811 + }, + { + "epoch": 0.0711592479186173, + "grad_norm": 4.209244251251221, + "learning_rate": 9.90618960197516e-06, + "loss": 0.1451, + "step": 2812 + }, + { + "epoch": 0.07118455348331099, + "grad_norm": 14.838464736938477, + "learning_rate": 9.906112172293963e-06, + "loss": 0.3568, + "step": 2813 + }, + { + "epoch": 0.07120985904800466, + "grad_norm": 5.037487983703613, + "learning_rate": 9.906034710974158e-06, + "loss": 0.1197, + "step": 2814 + }, + { + "epoch": 0.07123516461269833, + "grad_norm": 15.861095428466797, + "learning_rate": 9.905957218016249e-06, + "loss": 0.2511, + "step": 2815 + }, + { + "epoch": 0.07126047017739201, + "grad_norm": 4.974163055419922, + "learning_rate": 9.905879693420734e-06, + "loss": 0.1184, + "step": 2816 + }, + { + "epoch": 0.07128577574208568, + "grad_norm": 9.834843635559082, + "learning_rate": 9.905802137188114e-06, + "loss": 0.2841, + "step": 2817 + }, + { + "epoch": 0.07131108130677936, + "grad_norm": 6.928305149078369, + "learning_rate": 9.905724549318887e-06, + "loss": 0.1406, + "step": 2818 + }, + { + "epoch": 0.07133638687147303, + "grad_norm": 9.922453880310059, + "learning_rate": 9.905646929813556e-06, + "loss": 0.306, + "step": 2819 + }, + { + "epoch": 0.07136169243616672, + "grad_norm": 15.267735481262207, + "learning_rate": 9.905569278672622e-06, + "loss": 0.3028, + "step": 2820 + }, + { + "epoch": 0.07138699800086039, + "grad_norm": 9.138221740722656, + "learning_rate": 9.905491595896583e-06, + "loss": 0.1311, + "step": 2821 + }, + { + "epoch": 0.07141230356555407, + "grad_norm": 10.224769592285156, + "learning_rate": 9.905413881485942e-06, + "loss": 0.194, + "step": 2822 + }, + { + "epoch": 0.07143760913024774, + "grad_norm": 4.483449459075928, + "learning_rate": 9.905336135441197e-06, + "loss": 0.207, + "step": 2823 + }, + { + "epoch": 0.07146291469494141, + "grad_norm": 10.906035423278809, + "learning_rate": 9.905258357762854e-06, + "loss": 0.3633, + "step": 2824 + }, + { + "epoch": 0.0714882202596351, + "grad_norm": 13.0048246383667, + "learning_rate": 9.90518054845141e-06, + "loss": 0.3549, + "step": 2825 + }, + { + "epoch": 0.07151352582432877, + "grad_norm": 4.656225204467773, + "learning_rate": 9.905102707507371e-06, + "loss": 0.1403, + "step": 2826 + }, + { + "epoch": 0.07153883138902245, + "grad_norm": 6.873802661895752, + "learning_rate": 9.905024834931237e-06, + "loss": 0.1771, + "step": 2827 + }, + { + "epoch": 0.07156413695371612, + "grad_norm": 6.33624267578125, + "learning_rate": 9.90494693072351e-06, + "loss": 0.1125, + "step": 2828 + }, + { + "epoch": 0.0715894425184098, + "grad_norm": 8.640796661376953, + "learning_rate": 9.904868994884693e-06, + "loss": 0.1631, + "step": 2829 + }, + { + "epoch": 0.07161474808310347, + "grad_norm": 10.336572647094727, + "learning_rate": 9.904791027415288e-06, + "loss": 0.2057, + "step": 2830 + }, + { + "epoch": 0.07164005364779714, + "grad_norm": 5.839776039123535, + "learning_rate": 9.904713028315797e-06, + "loss": 0.2189, + "step": 2831 + }, + { + "epoch": 0.07166535921249083, + "grad_norm": 16.122743606567383, + "learning_rate": 9.904634997586724e-06, + "loss": 0.3061, + "step": 2832 + }, + { + "epoch": 0.0716906647771845, + "grad_norm": 10.926850318908691, + "learning_rate": 9.904556935228574e-06, + "loss": 0.1997, + "step": 2833 + }, + { + "epoch": 0.07171597034187818, + "grad_norm": 8.095747947692871, + "learning_rate": 9.904478841241847e-06, + "loss": 0.2464, + "step": 2834 + }, + { + "epoch": 0.07174127590657185, + "grad_norm": 9.656647682189941, + "learning_rate": 9.904400715627049e-06, + "loss": 0.2626, + "step": 2835 + }, + { + "epoch": 0.07176658147126554, + "grad_norm": 4.829094409942627, + "learning_rate": 9.904322558384683e-06, + "loss": 0.1938, + "step": 2836 + }, + { + "epoch": 0.0717918870359592, + "grad_norm": 9.140020370483398, + "learning_rate": 9.904244369515253e-06, + "loss": 0.2502, + "step": 2837 + }, + { + "epoch": 0.07181719260065289, + "grad_norm": 7.083395481109619, + "learning_rate": 9.904166149019263e-06, + "loss": 0.2991, + "step": 2838 + }, + { + "epoch": 0.07184249816534656, + "grad_norm": 5.132907390594482, + "learning_rate": 9.90408789689722e-06, + "loss": 0.1929, + "step": 2839 + }, + { + "epoch": 0.07186780373004023, + "grad_norm": 12.451945304870605, + "learning_rate": 9.904009613149624e-06, + "loss": 0.3393, + "step": 2840 + }, + { + "epoch": 0.07189310929473391, + "grad_norm": 6.788115978240967, + "learning_rate": 9.903931297776982e-06, + "loss": 0.1351, + "step": 2841 + }, + { + "epoch": 0.07191841485942758, + "grad_norm": 11.328506469726562, + "learning_rate": 9.9038529507798e-06, + "loss": 0.2739, + "step": 2842 + }, + { + "epoch": 0.07194372042412127, + "grad_norm": 9.389852523803711, + "learning_rate": 9.903774572158583e-06, + "loss": 0.1897, + "step": 2843 + }, + { + "epoch": 0.07196902598881494, + "grad_norm": 5.459094524383545, + "learning_rate": 9.903696161913835e-06, + "loss": 0.2134, + "step": 2844 + }, + { + "epoch": 0.07199433155350862, + "grad_norm": 8.497432708740234, + "learning_rate": 9.903617720046064e-06, + "loss": 0.333, + "step": 2845 + }, + { + "epoch": 0.07201963711820229, + "grad_norm": 7.190563678741455, + "learning_rate": 9.903539246555773e-06, + "loss": 0.1709, + "step": 2846 + }, + { + "epoch": 0.07204494268289596, + "grad_norm": 3.91770601272583, + "learning_rate": 9.90346074144347e-06, + "loss": 0.2023, + "step": 2847 + }, + { + "epoch": 0.07207024824758965, + "grad_norm": 6.774989128112793, + "learning_rate": 9.90338220470966e-06, + "loss": 0.1868, + "step": 2848 + }, + { + "epoch": 0.07209555381228332, + "grad_norm": 8.120539665222168, + "learning_rate": 9.903303636354852e-06, + "loss": 0.2018, + "step": 2849 + }, + { + "epoch": 0.072120859376977, + "grad_norm": 5.194084644317627, + "learning_rate": 9.903225036379549e-06, + "loss": 0.2027, + "step": 2850 + }, + { + "epoch": 0.07214616494167067, + "grad_norm": 13.183478355407715, + "learning_rate": 9.903146404784261e-06, + "loss": 0.3189, + "step": 2851 + }, + { + "epoch": 0.07217147050636435, + "grad_norm": 4.748705863952637, + "learning_rate": 9.903067741569495e-06, + "loss": 0.1829, + "step": 2852 + }, + { + "epoch": 0.07219677607105802, + "grad_norm": 7.2000322341918945, + "learning_rate": 9.902989046735756e-06, + "loss": 0.2051, + "step": 2853 + }, + { + "epoch": 0.07222208163575171, + "grad_norm": 8.199015617370605, + "learning_rate": 9.902910320283553e-06, + "loss": 0.2082, + "step": 2854 + }, + { + "epoch": 0.07224738720044538, + "grad_norm": 9.596230506896973, + "learning_rate": 9.902831562213392e-06, + "loss": 0.2678, + "step": 2855 + }, + { + "epoch": 0.07227269276513905, + "grad_norm": 8.16234302520752, + "learning_rate": 9.902752772525783e-06, + "loss": 0.2634, + "step": 2856 + }, + { + "epoch": 0.07229799832983273, + "grad_norm": 6.523270130157471, + "learning_rate": 9.902673951221234e-06, + "loss": 0.1704, + "step": 2857 + }, + { + "epoch": 0.0723233038945264, + "grad_norm": 6.5158185958862305, + "learning_rate": 9.902595098300252e-06, + "loss": 0.1988, + "step": 2858 + }, + { + "epoch": 0.07234860945922009, + "grad_norm": 10.658824920654297, + "learning_rate": 9.902516213763346e-06, + "loss": 0.2536, + "step": 2859 + }, + { + "epoch": 0.07237391502391376, + "grad_norm": 16.592952728271484, + "learning_rate": 9.902437297611025e-06, + "loss": 0.3256, + "step": 2860 + }, + { + "epoch": 0.07239922058860744, + "grad_norm": 5.621699810028076, + "learning_rate": 9.902358349843798e-06, + "loss": 0.2186, + "step": 2861 + }, + { + "epoch": 0.07242452615330111, + "grad_norm": 5.207841396331787, + "learning_rate": 9.902279370462175e-06, + "loss": 0.1607, + "step": 2862 + }, + { + "epoch": 0.07244983171799478, + "grad_norm": 4.4752516746521, + "learning_rate": 9.902200359466663e-06, + "loss": 0.2283, + "step": 2863 + }, + { + "epoch": 0.07247513728268846, + "grad_norm": 7.932072639465332, + "learning_rate": 9.902121316857772e-06, + "loss": 0.2391, + "step": 2864 + }, + { + "epoch": 0.07250044284738213, + "grad_norm": 8.042838096618652, + "learning_rate": 9.902042242636014e-06, + "loss": 0.2572, + "step": 2865 + }, + { + "epoch": 0.07252574841207582, + "grad_norm": 10.372261047363281, + "learning_rate": 9.901963136801896e-06, + "loss": 0.2393, + "step": 2866 + }, + { + "epoch": 0.07255105397676949, + "grad_norm": 5.629664421081543, + "learning_rate": 9.901883999355932e-06, + "loss": 0.2031, + "step": 2867 + }, + { + "epoch": 0.07257635954146317, + "grad_norm": 6.217041015625, + "learning_rate": 9.901804830298627e-06, + "loss": 0.2327, + "step": 2868 + }, + { + "epoch": 0.07260166510615684, + "grad_norm": 5.24517297744751, + "learning_rate": 9.901725629630497e-06, + "loss": 0.1463, + "step": 2869 + }, + { + "epoch": 0.07262697067085053, + "grad_norm": 5.555685043334961, + "learning_rate": 9.901646397352049e-06, + "loss": 0.1497, + "step": 2870 + }, + { + "epoch": 0.0726522762355442, + "grad_norm": 6.334310531616211, + "learning_rate": 9.901567133463795e-06, + "loss": 0.1716, + "step": 2871 + }, + { + "epoch": 0.07267758180023787, + "grad_norm": 4.164958477020264, + "learning_rate": 9.901487837966247e-06, + "loss": 0.14, + "step": 2872 + }, + { + "epoch": 0.07270288736493155, + "grad_norm": 6.072218894958496, + "learning_rate": 9.901408510859914e-06, + "loss": 0.2461, + "step": 2873 + }, + { + "epoch": 0.07272819292962522, + "grad_norm": 13.818914413452148, + "learning_rate": 9.90132915214531e-06, + "loss": 0.2519, + "step": 2874 + }, + { + "epoch": 0.0727534984943189, + "grad_norm": 12.111641883850098, + "learning_rate": 9.901249761822946e-06, + "loss": 0.2482, + "step": 2875 + }, + { + "epoch": 0.07277880405901258, + "grad_norm": 5.142913341522217, + "learning_rate": 9.901170339893336e-06, + "loss": 0.2243, + "step": 2876 + }, + { + "epoch": 0.07280410962370626, + "grad_norm": 6.277614593505859, + "learning_rate": 9.901090886356987e-06, + "loss": 0.2208, + "step": 2877 + }, + { + "epoch": 0.07282941518839993, + "grad_norm": 2.577514410018921, + "learning_rate": 9.901011401214417e-06, + "loss": 0.1375, + "step": 2878 + }, + { + "epoch": 0.0728547207530936, + "grad_norm": 5.2333784103393555, + "learning_rate": 9.900931884466135e-06, + "loss": 0.18, + "step": 2879 + }, + { + "epoch": 0.07288002631778728, + "grad_norm": 6.564517498016357, + "learning_rate": 9.900852336112656e-06, + "loss": 0.2235, + "step": 2880 + }, + { + "epoch": 0.07290533188248095, + "grad_norm": 7.864182949066162, + "learning_rate": 9.900772756154491e-06, + "loss": 0.1734, + "step": 2881 + }, + { + "epoch": 0.07293063744717464, + "grad_norm": 8.836481094360352, + "learning_rate": 9.900693144592154e-06, + "loss": 0.2712, + "step": 2882 + }, + { + "epoch": 0.07295594301186831, + "grad_norm": 5.610143184661865, + "learning_rate": 9.900613501426158e-06, + "loss": 0.2171, + "step": 2883 + }, + { + "epoch": 0.07298124857656199, + "grad_norm": 5.16574764251709, + "learning_rate": 9.900533826657017e-06, + "loss": 0.1238, + "step": 2884 + }, + { + "epoch": 0.07300655414125566, + "grad_norm": 5.536013126373291, + "learning_rate": 9.900454120285247e-06, + "loss": 0.1886, + "step": 2885 + }, + { + "epoch": 0.07303185970594935, + "grad_norm": 7.164122104644775, + "learning_rate": 9.900374382311358e-06, + "loss": 0.1926, + "step": 2886 + }, + { + "epoch": 0.07305716527064302, + "grad_norm": 5.439543724060059, + "learning_rate": 9.900294612735868e-06, + "loss": 0.2695, + "step": 2887 + }, + { + "epoch": 0.07308247083533669, + "grad_norm": 4.140669822692871, + "learning_rate": 9.900214811559288e-06, + "loss": 0.1442, + "step": 2888 + }, + { + "epoch": 0.07310777640003037, + "grad_norm": 4.0905632972717285, + "learning_rate": 9.900134978782134e-06, + "loss": 0.2018, + "step": 2889 + }, + { + "epoch": 0.07313308196472404, + "grad_norm": 7.719757080078125, + "learning_rate": 9.900055114404922e-06, + "loss": 0.2316, + "step": 2890 + }, + { + "epoch": 0.07315838752941772, + "grad_norm": 7.141812801361084, + "learning_rate": 9.899975218428166e-06, + "loss": 0.1713, + "step": 2891 + }, + { + "epoch": 0.0731836930941114, + "grad_norm": 9.003525733947754, + "learning_rate": 9.89989529085238e-06, + "loss": 0.1913, + "step": 2892 + }, + { + "epoch": 0.07320899865880508, + "grad_norm": 6.8526930809021, + "learning_rate": 9.899815331678082e-06, + "loss": 0.2541, + "step": 2893 + }, + { + "epoch": 0.07323430422349875, + "grad_norm": 5.252927780151367, + "learning_rate": 9.899735340905786e-06, + "loss": 0.2053, + "step": 2894 + }, + { + "epoch": 0.07325960978819242, + "grad_norm": 10.802002906799316, + "learning_rate": 9.899655318536008e-06, + "loss": 0.2418, + "step": 2895 + }, + { + "epoch": 0.0732849153528861, + "grad_norm": 7.04095458984375, + "learning_rate": 9.899575264569265e-06, + "loss": 0.1695, + "step": 2896 + }, + { + "epoch": 0.07331022091757977, + "grad_norm": 5.044936180114746, + "learning_rate": 9.899495179006072e-06, + "loss": 0.1867, + "step": 2897 + }, + { + "epoch": 0.07333552648227346, + "grad_norm": 5.275539875030518, + "learning_rate": 9.899415061846946e-06, + "loss": 0.2316, + "step": 2898 + }, + { + "epoch": 0.07336083204696713, + "grad_norm": 13.136314392089844, + "learning_rate": 9.899334913092403e-06, + "loss": 0.3405, + "step": 2899 + }, + { + "epoch": 0.07338613761166081, + "grad_norm": 11.004773139953613, + "learning_rate": 9.89925473274296e-06, + "loss": 0.2376, + "step": 2900 + }, + { + "epoch": 0.07341144317635448, + "grad_norm": 9.623425483703613, + "learning_rate": 9.899174520799138e-06, + "loss": 0.2555, + "step": 2901 + }, + { + "epoch": 0.07343674874104816, + "grad_norm": 5.047729015350342, + "learning_rate": 9.899094277261448e-06, + "loss": 0.118, + "step": 2902 + }, + { + "epoch": 0.07346205430574183, + "grad_norm": 4.484912872314453, + "learning_rate": 9.89901400213041e-06, + "loss": 0.1411, + "step": 2903 + }, + { + "epoch": 0.0734873598704355, + "grad_norm": 6.821268081665039, + "learning_rate": 9.898933695406543e-06, + "loss": 0.277, + "step": 2904 + }, + { + "epoch": 0.07351266543512919, + "grad_norm": 9.429730415344238, + "learning_rate": 9.898853357090363e-06, + "loss": 0.2247, + "step": 2905 + }, + { + "epoch": 0.07353797099982286, + "grad_norm": 3.829108953475952, + "learning_rate": 9.89877298718239e-06, + "loss": 0.1567, + "step": 2906 + }, + { + "epoch": 0.07356327656451654, + "grad_norm": 10.401264190673828, + "learning_rate": 9.89869258568314e-06, + "loss": 0.1928, + "step": 2907 + }, + { + "epoch": 0.07358858212921021, + "grad_norm": 5.360667705535889, + "learning_rate": 9.898612152593134e-06, + "loss": 0.1028, + "step": 2908 + }, + { + "epoch": 0.0736138876939039, + "grad_norm": 10.011363983154297, + "learning_rate": 9.898531687912889e-06, + "loss": 0.1938, + "step": 2909 + }, + { + "epoch": 0.07363919325859757, + "grad_norm": 4.928233623504639, + "learning_rate": 9.898451191642923e-06, + "loss": 0.2131, + "step": 2910 + }, + { + "epoch": 0.07366449882329124, + "grad_norm": 34.98011779785156, + "learning_rate": 9.89837066378376e-06, + "loss": 0.3169, + "step": 2911 + }, + { + "epoch": 0.07368980438798492, + "grad_norm": 9.299060821533203, + "learning_rate": 9.898290104335913e-06, + "loss": 0.1263, + "step": 2912 + }, + { + "epoch": 0.07371510995267859, + "grad_norm": 7.6049275398254395, + "learning_rate": 9.898209513299905e-06, + "loss": 0.2487, + "step": 2913 + }, + { + "epoch": 0.07374041551737227, + "grad_norm": 10.300877571105957, + "learning_rate": 9.898128890676255e-06, + "loss": 0.315, + "step": 2914 + }, + { + "epoch": 0.07376572108206594, + "grad_norm": 29.25831413269043, + "learning_rate": 9.898048236465484e-06, + "loss": 0.1777, + "step": 2915 + }, + { + "epoch": 0.07379102664675963, + "grad_norm": 13.230199813842773, + "learning_rate": 9.897967550668108e-06, + "loss": 0.2722, + "step": 2916 + }, + { + "epoch": 0.0738163322114533, + "grad_norm": 11.429743766784668, + "learning_rate": 9.897886833284653e-06, + "loss": 0.2997, + "step": 2917 + }, + { + "epoch": 0.07384163777614698, + "grad_norm": 5.2368950843811035, + "learning_rate": 9.897806084315635e-06, + "loss": 0.2006, + "step": 2918 + }, + { + "epoch": 0.07386694334084065, + "grad_norm": 10.146770477294922, + "learning_rate": 9.897725303761578e-06, + "loss": 0.3462, + "step": 2919 + }, + { + "epoch": 0.07389224890553432, + "grad_norm": 8.792738914489746, + "learning_rate": 9.897644491623e-06, + "loss": 0.1719, + "step": 2920 + }, + { + "epoch": 0.073917554470228, + "grad_norm": 12.699460983276367, + "learning_rate": 9.897563647900426e-06, + "loss": 0.2932, + "step": 2921 + }, + { + "epoch": 0.07394286003492168, + "grad_norm": 3.4916794300079346, + "learning_rate": 9.897482772594373e-06, + "loss": 0.1746, + "step": 2922 + }, + { + "epoch": 0.07396816559961536, + "grad_norm": 10.265900611877441, + "learning_rate": 9.897401865705366e-06, + "loss": 0.2802, + "step": 2923 + }, + { + "epoch": 0.07399347116430903, + "grad_norm": 7.3585920333862305, + "learning_rate": 9.897320927233924e-06, + "loss": 0.187, + "step": 2924 + }, + { + "epoch": 0.07401877672900271, + "grad_norm": 6.571939945220947, + "learning_rate": 9.897239957180572e-06, + "loss": 0.3235, + "step": 2925 + }, + { + "epoch": 0.07404408229369638, + "grad_norm": 20.46332550048828, + "learning_rate": 9.897158955545829e-06, + "loss": 0.2962, + "step": 2926 + }, + { + "epoch": 0.07406938785839005, + "grad_norm": 7.721602439880371, + "learning_rate": 9.89707792233022e-06, + "loss": 0.2398, + "step": 2927 + }, + { + "epoch": 0.07409469342308374, + "grad_norm": 9.167250633239746, + "learning_rate": 9.896996857534266e-06, + "loss": 0.2034, + "step": 2928 + }, + { + "epoch": 0.07411999898777741, + "grad_norm": 5.348546504974365, + "learning_rate": 9.89691576115849e-06, + "loss": 0.1945, + "step": 2929 + }, + { + "epoch": 0.07414530455247109, + "grad_norm": 7.184691429138184, + "learning_rate": 9.896834633203416e-06, + "loss": 0.21, + "step": 2930 + }, + { + "epoch": 0.07417061011716476, + "grad_norm": 8.974209785461426, + "learning_rate": 9.896753473669565e-06, + "loss": 0.2656, + "step": 2931 + }, + { + "epoch": 0.07419591568185845, + "grad_norm": 7.764801025390625, + "learning_rate": 9.896672282557462e-06, + "loss": 0.2742, + "step": 2932 + }, + { + "epoch": 0.07422122124655212, + "grad_norm": 19.666301727294922, + "learning_rate": 9.896591059867631e-06, + "loss": 0.3737, + "step": 2933 + }, + { + "epoch": 0.07424652681124579, + "grad_norm": 9.252907752990723, + "learning_rate": 9.896509805600595e-06, + "loss": 0.2391, + "step": 2934 + }, + { + "epoch": 0.07427183237593947, + "grad_norm": 6.640079021453857, + "learning_rate": 9.896428519756878e-06, + "loss": 0.2477, + "step": 2935 + }, + { + "epoch": 0.07429713794063314, + "grad_norm": 11.279173851013184, + "learning_rate": 9.896347202337003e-06, + "loss": 0.3029, + "step": 2936 + }, + { + "epoch": 0.07432244350532682, + "grad_norm": 6.402436256408691, + "learning_rate": 9.896265853341498e-06, + "loss": 0.235, + "step": 2937 + }, + { + "epoch": 0.0743477490700205, + "grad_norm": 12.377549171447754, + "learning_rate": 9.896184472770884e-06, + "loss": 0.245, + "step": 2938 + }, + { + "epoch": 0.07437305463471418, + "grad_norm": 11.182665824890137, + "learning_rate": 9.896103060625685e-06, + "loss": 0.2131, + "step": 2939 + }, + { + "epoch": 0.07439836019940785, + "grad_norm": 5.245672702789307, + "learning_rate": 9.896021616906432e-06, + "loss": 0.1853, + "step": 2940 + }, + { + "epoch": 0.07442366576410153, + "grad_norm": 4.746204853057861, + "learning_rate": 9.895940141613645e-06, + "loss": 0.2046, + "step": 2941 + }, + { + "epoch": 0.0744489713287952, + "grad_norm": 5.61624002456665, + "learning_rate": 9.895858634747849e-06, + "loss": 0.2163, + "step": 2942 + }, + { + "epoch": 0.07447427689348887, + "grad_norm": 8.300844192504883, + "learning_rate": 9.895777096309574e-06, + "loss": 0.3204, + "step": 2943 + }, + { + "epoch": 0.07449958245818256, + "grad_norm": 8.041197776794434, + "learning_rate": 9.89569552629934e-06, + "loss": 0.2385, + "step": 2944 + }, + { + "epoch": 0.07452488802287623, + "grad_norm": 18.71017074584961, + "learning_rate": 9.895613924717679e-06, + "loss": 0.2819, + "step": 2945 + }, + { + "epoch": 0.07455019358756991, + "grad_norm": 10.220779418945312, + "learning_rate": 9.895532291565113e-06, + "loss": 0.1755, + "step": 2946 + }, + { + "epoch": 0.07457549915226358, + "grad_norm": 10.660602569580078, + "learning_rate": 9.89545062684217e-06, + "loss": 0.3205, + "step": 2947 + }, + { + "epoch": 0.07460080471695726, + "grad_norm": 5.258834362030029, + "learning_rate": 9.895368930549379e-06, + "loss": 0.1784, + "step": 2948 + }, + { + "epoch": 0.07462611028165093, + "grad_norm": 15.562322616577148, + "learning_rate": 9.89528720268726e-06, + "loss": 0.3182, + "step": 2949 + }, + { + "epoch": 0.0746514158463446, + "grad_norm": 11.340535163879395, + "learning_rate": 9.895205443256348e-06, + "loss": 0.2739, + "step": 2950 + }, + { + "epoch": 0.07467672141103829, + "grad_norm": 8.129693031311035, + "learning_rate": 9.895123652257164e-06, + "loss": 0.248, + "step": 2951 + }, + { + "epoch": 0.07470202697573196, + "grad_norm": 16.8837833404541, + "learning_rate": 9.89504182969024e-06, + "loss": 0.3228, + "step": 2952 + }, + { + "epoch": 0.07472733254042564, + "grad_norm": 8.354168891906738, + "learning_rate": 9.894959975556103e-06, + "loss": 0.3281, + "step": 2953 + }, + { + "epoch": 0.07475263810511931, + "grad_norm": 6.756259918212891, + "learning_rate": 9.894878089855277e-06, + "loss": 0.2979, + "step": 2954 + }, + { + "epoch": 0.074777943669813, + "grad_norm": 8.874363899230957, + "learning_rate": 9.894796172588294e-06, + "loss": 0.2585, + "step": 2955 + }, + { + "epoch": 0.07480324923450667, + "grad_norm": 6.40907096862793, + "learning_rate": 9.89471422375568e-06, + "loss": 0.1917, + "step": 2956 + }, + { + "epoch": 0.07482855479920035, + "grad_norm": 3.870990514755249, + "learning_rate": 9.894632243357964e-06, + "loss": 0.1556, + "step": 2957 + }, + { + "epoch": 0.07485386036389402, + "grad_norm": 8.018892288208008, + "learning_rate": 9.894550231395678e-06, + "loss": 0.2704, + "step": 2958 + }, + { + "epoch": 0.07487916592858769, + "grad_norm": 7.284106731414795, + "learning_rate": 9.894468187869345e-06, + "loss": 0.2396, + "step": 2959 + }, + { + "epoch": 0.07490447149328137, + "grad_norm": 6.37661075592041, + "learning_rate": 9.894386112779499e-06, + "loss": 0.1948, + "step": 2960 + }, + { + "epoch": 0.07492977705797504, + "grad_norm": 11.627445220947266, + "learning_rate": 9.894304006126668e-06, + "loss": 0.2462, + "step": 2961 + }, + { + "epoch": 0.07495508262266873, + "grad_norm": 6.3010478019714355, + "learning_rate": 9.894221867911378e-06, + "loss": 0.1955, + "step": 2962 + }, + { + "epoch": 0.0749803881873624, + "grad_norm": 4.179787635803223, + "learning_rate": 9.894139698134165e-06, + "loss": 0.1652, + "step": 2963 + }, + { + "epoch": 0.07500569375205608, + "grad_norm": 4.7733941078186035, + "learning_rate": 9.894057496795553e-06, + "loss": 0.19, + "step": 2964 + }, + { + "epoch": 0.07503099931674975, + "grad_norm": 6.897562026977539, + "learning_rate": 9.893975263896075e-06, + "loss": 0.1655, + "step": 2965 + }, + { + "epoch": 0.07505630488144342, + "grad_norm": 4.602845191955566, + "learning_rate": 9.893892999436262e-06, + "loss": 0.1598, + "step": 2966 + }, + { + "epoch": 0.0750816104461371, + "grad_norm": 10.380417823791504, + "learning_rate": 9.893810703416642e-06, + "loss": 0.2242, + "step": 2967 + }, + { + "epoch": 0.07510691601083078, + "grad_norm": 5.332421779632568, + "learning_rate": 9.893728375837746e-06, + "loss": 0.1079, + "step": 2968 + }, + { + "epoch": 0.07513222157552446, + "grad_norm": 7.276487827301025, + "learning_rate": 9.893646016700109e-06, + "loss": 0.179, + "step": 2969 + }, + { + "epoch": 0.07515752714021813, + "grad_norm": 9.180831909179688, + "learning_rate": 9.893563626004257e-06, + "loss": 0.2642, + "step": 2970 + }, + { + "epoch": 0.07518283270491181, + "grad_norm": 8.521173477172852, + "learning_rate": 9.893481203750725e-06, + "loss": 0.2488, + "step": 2971 + }, + { + "epoch": 0.07520813826960548, + "grad_norm": 21.259380340576172, + "learning_rate": 9.893398749940043e-06, + "loss": 0.2243, + "step": 2972 + }, + { + "epoch": 0.07523344383429917, + "grad_norm": 11.44697380065918, + "learning_rate": 9.89331626457274e-06, + "loss": 0.3581, + "step": 2973 + }, + { + "epoch": 0.07525874939899284, + "grad_norm": 11.638978004455566, + "learning_rate": 9.893233747649354e-06, + "loss": 0.3058, + "step": 2974 + }, + { + "epoch": 0.07528405496368651, + "grad_norm": 6.074079513549805, + "learning_rate": 9.89315119917041e-06, + "loss": 0.268, + "step": 2975 + }, + { + "epoch": 0.07530936052838019, + "grad_norm": 6.228337287902832, + "learning_rate": 9.893068619136449e-06, + "loss": 0.2164, + "step": 2976 + }, + { + "epoch": 0.07533466609307386, + "grad_norm": 4.875720977783203, + "learning_rate": 9.892986007547994e-06, + "loss": 0.2001, + "step": 2977 + }, + { + "epoch": 0.07535997165776755, + "grad_norm": 8.746748924255371, + "learning_rate": 9.892903364405584e-06, + "loss": 0.2441, + "step": 2978 + }, + { + "epoch": 0.07538527722246122, + "grad_norm": 8.241585731506348, + "learning_rate": 9.892820689709751e-06, + "loss": 0.2743, + "step": 2979 + }, + { + "epoch": 0.0754105827871549, + "grad_norm": 7.161627292633057, + "learning_rate": 9.892737983461028e-06, + "loss": 0.218, + "step": 2980 + }, + { + "epoch": 0.07543588835184857, + "grad_norm": 6.7078986167907715, + "learning_rate": 9.892655245659947e-06, + "loss": 0.1688, + "step": 2981 + }, + { + "epoch": 0.07546119391654224, + "grad_norm": 6.0439772605896, + "learning_rate": 9.892572476307042e-06, + "loss": 0.2719, + "step": 2982 + }, + { + "epoch": 0.07548649948123592, + "grad_norm": 7.467667102813721, + "learning_rate": 9.892489675402848e-06, + "loss": 0.3256, + "step": 2983 + }, + { + "epoch": 0.0755118050459296, + "grad_norm": 9.46004581451416, + "learning_rate": 9.892406842947899e-06, + "loss": 0.2399, + "step": 2984 + }, + { + "epoch": 0.07553711061062328, + "grad_norm": 4.983580589294434, + "learning_rate": 9.892323978942726e-06, + "loss": 0.1125, + "step": 2985 + }, + { + "epoch": 0.07556241617531695, + "grad_norm": 4.704578399658203, + "learning_rate": 9.892241083387868e-06, + "loss": 0.1771, + "step": 2986 + }, + { + "epoch": 0.07558772174001063, + "grad_norm": 14.025799751281738, + "learning_rate": 9.892158156283855e-06, + "loss": 0.3669, + "step": 2987 + }, + { + "epoch": 0.0756130273047043, + "grad_norm": 8.49237060546875, + "learning_rate": 9.892075197631226e-06, + "loss": 0.2907, + "step": 2988 + }, + { + "epoch": 0.07563833286939799, + "grad_norm": 3.8875386714935303, + "learning_rate": 9.891992207430513e-06, + "loss": 0.1785, + "step": 2989 + }, + { + "epoch": 0.07566363843409166, + "grad_norm": 3.6055612564086914, + "learning_rate": 9.891909185682252e-06, + "loss": 0.112, + "step": 2990 + }, + { + "epoch": 0.07568894399878533, + "grad_norm": 7.311610698699951, + "learning_rate": 9.891826132386979e-06, + "loss": 0.2776, + "step": 2991 + }, + { + "epoch": 0.07571424956347901, + "grad_norm": 10.395149230957031, + "learning_rate": 9.89174304754523e-06, + "loss": 0.1875, + "step": 2992 + }, + { + "epoch": 0.07573955512817268, + "grad_norm": 6.998022079467773, + "learning_rate": 9.89165993115754e-06, + "loss": 0.1706, + "step": 2993 + }, + { + "epoch": 0.07576486069286636, + "grad_norm": 9.168519973754883, + "learning_rate": 9.891576783224444e-06, + "loss": 0.2339, + "step": 2994 + }, + { + "epoch": 0.07579016625756003, + "grad_norm": 10.032303810119629, + "learning_rate": 9.89149360374648e-06, + "loss": 0.2171, + "step": 2995 + }, + { + "epoch": 0.07581547182225372, + "grad_norm": 5.497076988220215, + "learning_rate": 9.891410392724184e-06, + "loss": 0.127, + "step": 2996 + }, + { + "epoch": 0.07584077738694739, + "grad_norm": 9.083003044128418, + "learning_rate": 9.89132715015809e-06, + "loss": 0.3169, + "step": 2997 + }, + { + "epoch": 0.07586608295164106, + "grad_norm": 6.8418803215026855, + "learning_rate": 9.891243876048739e-06, + "loss": 0.1607, + "step": 2998 + }, + { + "epoch": 0.07589138851633474, + "grad_norm": 11.767110824584961, + "learning_rate": 9.891160570396666e-06, + "loss": 0.2016, + "step": 2999 + }, + { + "epoch": 0.07591669408102841, + "grad_norm": 7.648541450500488, + "learning_rate": 9.891077233202409e-06, + "loss": 0.2072, + "step": 3000 + }, + { + "epoch": 0.0759419996457221, + "grad_norm": 14.765372276306152, + "learning_rate": 9.890993864466503e-06, + "loss": 0.1961, + "step": 3001 + }, + { + "epoch": 0.07596730521041577, + "grad_norm": 6.222733020782471, + "learning_rate": 9.890910464189487e-06, + "loss": 0.228, + "step": 3002 + }, + { + "epoch": 0.07599261077510945, + "grad_norm": 15.95337963104248, + "learning_rate": 9.890827032371901e-06, + "loss": 0.1911, + "step": 3003 + }, + { + "epoch": 0.07601791633980312, + "grad_norm": 4.568948268890381, + "learning_rate": 9.89074356901428e-06, + "loss": 0.1804, + "step": 3004 + }, + { + "epoch": 0.0760432219044968, + "grad_norm": 7.815079689025879, + "learning_rate": 9.890660074117164e-06, + "loss": 0.2691, + "step": 3005 + }, + { + "epoch": 0.07606852746919048, + "grad_norm": 6.66142463684082, + "learning_rate": 9.890576547681093e-06, + "loss": 0.2335, + "step": 3006 + }, + { + "epoch": 0.07609383303388415, + "grad_norm": 6.946269989013672, + "learning_rate": 9.890492989706599e-06, + "loss": 0.25, + "step": 3007 + }, + { + "epoch": 0.07611913859857783, + "grad_norm": 14.123048782348633, + "learning_rate": 9.89040940019423e-06, + "loss": 0.3188, + "step": 3008 + }, + { + "epoch": 0.0761444441632715, + "grad_norm": 11.562487602233887, + "learning_rate": 9.890325779144518e-06, + "loss": 0.3435, + "step": 3009 + }, + { + "epoch": 0.07616974972796518, + "grad_norm": 7.995794773101807, + "learning_rate": 9.890242126558005e-06, + "loss": 0.1934, + "step": 3010 + }, + { + "epoch": 0.07619505529265885, + "grad_norm": 12.345986366271973, + "learning_rate": 9.89015844243523e-06, + "loss": 0.2573, + "step": 3011 + }, + { + "epoch": 0.07622036085735254, + "grad_norm": 6.39669942855835, + "learning_rate": 9.890074726776733e-06, + "loss": 0.182, + "step": 3012 + }, + { + "epoch": 0.07624566642204621, + "grad_norm": 3.8696606159210205, + "learning_rate": 9.889990979583052e-06, + "loss": 0.1847, + "step": 3013 + }, + { + "epoch": 0.07627097198673988, + "grad_norm": 4.7928147315979, + "learning_rate": 9.889907200854731e-06, + "loss": 0.164, + "step": 3014 + }, + { + "epoch": 0.07629627755143356, + "grad_norm": 3.506957769393921, + "learning_rate": 9.889823390592309e-06, + "loss": 0.1723, + "step": 3015 + }, + { + "epoch": 0.07632158311612723, + "grad_norm": 8.618371963500977, + "learning_rate": 9.889739548796324e-06, + "loss": 0.2268, + "step": 3016 + }, + { + "epoch": 0.07634688868082092, + "grad_norm": 39.88480758666992, + "learning_rate": 9.889655675467318e-06, + "loss": 0.2176, + "step": 3017 + }, + { + "epoch": 0.07637219424551459, + "grad_norm": 13.680523872375488, + "learning_rate": 9.889571770605832e-06, + "loss": 0.4182, + "step": 3018 + }, + { + "epoch": 0.07639749981020827, + "grad_norm": 4.876103401184082, + "learning_rate": 9.889487834212408e-06, + "loss": 0.2818, + "step": 3019 + }, + { + "epoch": 0.07642280537490194, + "grad_norm": 4.482608795166016, + "learning_rate": 9.889403866287587e-06, + "loss": 0.201, + "step": 3020 + }, + { + "epoch": 0.07644811093959562, + "grad_norm": 6.832361698150635, + "learning_rate": 9.88931986683191e-06, + "loss": 0.2472, + "step": 3021 + }, + { + "epoch": 0.0764734165042893, + "grad_norm": 8.74272346496582, + "learning_rate": 9.889235835845916e-06, + "loss": 0.3718, + "step": 3022 + }, + { + "epoch": 0.07649872206898296, + "grad_norm": 6.5669097900390625, + "learning_rate": 9.889151773330152e-06, + "loss": 0.2211, + "step": 3023 + }, + { + "epoch": 0.07652402763367665, + "grad_norm": 3.8580377101898193, + "learning_rate": 9.889067679285156e-06, + "loss": 0.2348, + "step": 3024 + }, + { + "epoch": 0.07654933319837032, + "grad_norm": 6.912929058074951, + "learning_rate": 9.888983553711473e-06, + "loss": 0.2127, + "step": 3025 + }, + { + "epoch": 0.076574638763064, + "grad_norm": 4.167567729949951, + "learning_rate": 9.888899396609645e-06, + "loss": 0.2116, + "step": 3026 + }, + { + "epoch": 0.07659994432775767, + "grad_norm": 5.143871784210205, + "learning_rate": 9.888815207980214e-06, + "loss": 0.222, + "step": 3027 + }, + { + "epoch": 0.07662524989245136, + "grad_norm": 7.403280735015869, + "learning_rate": 9.888730987823722e-06, + "loss": 0.2277, + "step": 3028 + }, + { + "epoch": 0.07665055545714503, + "grad_norm": 8.274787902832031, + "learning_rate": 9.888646736140715e-06, + "loss": 0.2625, + "step": 3029 + }, + { + "epoch": 0.0766758610218387, + "grad_norm": 4.902839183807373, + "learning_rate": 9.888562452931734e-06, + "loss": 0.2269, + "step": 3030 + }, + { + "epoch": 0.07670116658653238, + "grad_norm": 7.849579334259033, + "learning_rate": 9.888478138197322e-06, + "loss": 0.2742, + "step": 3031 + }, + { + "epoch": 0.07672647215122605, + "grad_norm": 8.96796989440918, + "learning_rate": 9.888393791938024e-06, + "loss": 0.2309, + "step": 3032 + }, + { + "epoch": 0.07675177771591973, + "grad_norm": 7.217860221862793, + "learning_rate": 9.888309414154384e-06, + "loss": 0.2397, + "step": 3033 + }, + { + "epoch": 0.0767770832806134, + "grad_norm": 4.5038604736328125, + "learning_rate": 9.888225004846948e-06, + "loss": 0.1794, + "step": 3034 + }, + { + "epoch": 0.07680238884530709, + "grad_norm": 5.128208637237549, + "learning_rate": 9.888140564016255e-06, + "loss": 0.1975, + "step": 3035 + }, + { + "epoch": 0.07682769441000076, + "grad_norm": 4.804108619689941, + "learning_rate": 9.888056091662854e-06, + "loss": 0.1966, + "step": 3036 + }, + { + "epoch": 0.07685299997469444, + "grad_norm": 5.101033687591553, + "learning_rate": 9.887971587787289e-06, + "loss": 0.2472, + "step": 3037 + }, + { + "epoch": 0.07687830553938811, + "grad_norm": 14.931565284729004, + "learning_rate": 9.887887052390104e-06, + "loss": 0.2855, + "step": 3038 + }, + { + "epoch": 0.07690361110408178, + "grad_norm": 6.251331806182861, + "learning_rate": 9.887802485471844e-06, + "loss": 0.3054, + "step": 3039 + }, + { + "epoch": 0.07692891666877547, + "grad_norm": 4.5967278480529785, + "learning_rate": 9.887717887033057e-06, + "loss": 0.186, + "step": 3040 + }, + { + "epoch": 0.07695422223346914, + "grad_norm": 4.741849422454834, + "learning_rate": 9.887633257074285e-06, + "loss": 0.2277, + "step": 3041 + }, + { + "epoch": 0.07697952779816282, + "grad_norm": 5.686197757720947, + "learning_rate": 9.887548595596075e-06, + "loss": 0.2678, + "step": 3042 + }, + { + "epoch": 0.07700483336285649, + "grad_norm": 8.095924377441406, + "learning_rate": 9.887463902598973e-06, + "loss": 0.2186, + "step": 3043 + }, + { + "epoch": 0.07703013892755017, + "grad_norm": 8.531063079833984, + "learning_rate": 9.887379178083528e-06, + "loss": 0.2875, + "step": 3044 + }, + { + "epoch": 0.07705544449224384, + "grad_norm": 6.542211055755615, + "learning_rate": 9.88729442205028e-06, + "loss": 0.2338, + "step": 3045 + }, + { + "epoch": 0.07708075005693751, + "grad_norm": 10.64199447631836, + "learning_rate": 9.88720963449978e-06, + "loss": 0.2078, + "step": 3046 + }, + { + "epoch": 0.0771060556216312, + "grad_norm": 9.70663070678711, + "learning_rate": 9.887124815432577e-06, + "loss": 0.227, + "step": 3047 + }, + { + "epoch": 0.07713136118632487, + "grad_norm": 8.438136100769043, + "learning_rate": 9.887039964849215e-06, + "loss": 0.2465, + "step": 3048 + }, + { + "epoch": 0.07715666675101855, + "grad_norm": 5.562194347381592, + "learning_rate": 9.886955082750237e-06, + "loss": 0.2155, + "step": 3049 + }, + { + "epoch": 0.07718197231571222, + "grad_norm": 5.401731491088867, + "learning_rate": 9.8868701691362e-06, + "loss": 0.1483, + "step": 3050 + }, + { + "epoch": 0.0772072778804059, + "grad_norm": 10.993303298950195, + "learning_rate": 9.886785224007643e-06, + "loss": 0.2833, + "step": 3051 + }, + { + "epoch": 0.07723258344509958, + "grad_norm": 4.5628228187561035, + "learning_rate": 9.886700247365118e-06, + "loss": 0.19, + "step": 3052 + }, + { + "epoch": 0.07725788900979326, + "grad_norm": 8.566446304321289, + "learning_rate": 9.886615239209172e-06, + "loss": 0.2684, + "step": 3053 + }, + { + "epoch": 0.07728319457448693, + "grad_norm": 5.0665106773376465, + "learning_rate": 9.886530199540354e-06, + "loss": 0.2356, + "step": 3054 + }, + { + "epoch": 0.0773085001391806, + "grad_norm": 7.028911113739014, + "learning_rate": 9.886445128359212e-06, + "loss": 0.18, + "step": 3055 + }, + { + "epoch": 0.07733380570387428, + "grad_norm": 3.2286746501922607, + "learning_rate": 9.886360025666292e-06, + "loss": 0.1213, + "step": 3056 + }, + { + "epoch": 0.07735911126856795, + "grad_norm": 8.442419052124023, + "learning_rate": 9.886274891462147e-06, + "loss": 0.2769, + "step": 3057 + }, + { + "epoch": 0.07738441683326164, + "grad_norm": 5.553886890411377, + "learning_rate": 9.886189725747325e-06, + "loss": 0.2036, + "step": 3058 + }, + { + "epoch": 0.07740972239795531, + "grad_norm": 9.997652053833008, + "learning_rate": 9.886104528522372e-06, + "loss": 0.211, + "step": 3059 + }, + { + "epoch": 0.07743502796264899, + "grad_norm": 19.687414169311523, + "learning_rate": 9.886019299787842e-06, + "loss": 0.1884, + "step": 3060 + }, + { + "epoch": 0.07746033352734266, + "grad_norm": 7.01360559463501, + "learning_rate": 9.88593403954428e-06, + "loss": 0.2438, + "step": 3061 + }, + { + "epoch": 0.07748563909203633, + "grad_norm": 4.585926055908203, + "learning_rate": 9.88584874779224e-06, + "loss": 0.1245, + "step": 3062 + }, + { + "epoch": 0.07751094465673002, + "grad_norm": 8.706226348876953, + "learning_rate": 9.885763424532271e-06, + "loss": 0.1546, + "step": 3063 + }, + { + "epoch": 0.07753625022142369, + "grad_norm": 5.0758514404296875, + "learning_rate": 9.88567806976492e-06, + "loss": 0.1898, + "step": 3064 + }, + { + "epoch": 0.07756155578611737, + "grad_norm": 6.781303882598877, + "learning_rate": 9.885592683490742e-06, + "loss": 0.3231, + "step": 3065 + }, + { + "epoch": 0.07758686135081104, + "grad_norm": 5.531062602996826, + "learning_rate": 9.885507265710285e-06, + "loss": 0.185, + "step": 3066 + }, + { + "epoch": 0.07761216691550472, + "grad_norm": 8.502762794494629, + "learning_rate": 9.885421816424098e-06, + "loss": 0.2515, + "step": 3067 + }, + { + "epoch": 0.0776374724801984, + "grad_norm": 6.572807312011719, + "learning_rate": 9.885336335632738e-06, + "loss": 0.2239, + "step": 3068 + }, + { + "epoch": 0.07766277804489208, + "grad_norm": 8.753673553466797, + "learning_rate": 9.885250823336752e-06, + "loss": 0.2478, + "step": 3069 + }, + { + "epoch": 0.07768808360958575, + "grad_norm": 12.205924034118652, + "learning_rate": 9.88516527953669e-06, + "loss": 0.3515, + "step": 3070 + }, + { + "epoch": 0.07771338917427942, + "grad_norm": 7.478150844573975, + "learning_rate": 9.88507970423311e-06, + "loss": 0.1308, + "step": 3071 + }, + { + "epoch": 0.0777386947389731, + "grad_norm": 4.968457221984863, + "learning_rate": 9.884994097426556e-06, + "loss": 0.2325, + "step": 3072 + }, + { + "epoch": 0.07776400030366677, + "grad_norm": 5.814393520355225, + "learning_rate": 9.884908459117585e-06, + "loss": 0.2067, + "step": 3073 + }, + { + "epoch": 0.07778930586836046, + "grad_norm": 8.674863815307617, + "learning_rate": 9.884822789306746e-06, + "loss": 0.3389, + "step": 3074 + }, + { + "epoch": 0.07781461143305413, + "grad_norm": 5.28587007522583, + "learning_rate": 9.884737087994597e-06, + "loss": 0.2359, + "step": 3075 + }, + { + "epoch": 0.07783991699774781, + "grad_norm": 9.142730712890625, + "learning_rate": 9.884651355181685e-06, + "loss": 0.2626, + "step": 3076 + }, + { + "epoch": 0.07786522256244148, + "grad_norm": 5.11535120010376, + "learning_rate": 9.884565590868567e-06, + "loss": 0.1246, + "step": 3077 + }, + { + "epoch": 0.07789052812713515, + "grad_norm": 4.632297039031982, + "learning_rate": 9.884479795055791e-06, + "loss": 0.2374, + "step": 3078 + }, + { + "epoch": 0.07791583369182883, + "grad_norm": 6.553367614746094, + "learning_rate": 9.884393967743915e-06, + "loss": 0.1693, + "step": 3079 + }, + { + "epoch": 0.0779411392565225, + "grad_norm": 7.247071743011475, + "learning_rate": 9.884308108933492e-06, + "loss": 0.2299, + "step": 3080 + }, + { + "epoch": 0.07796644482121619, + "grad_norm": 6.4025726318359375, + "learning_rate": 9.884222218625073e-06, + "loss": 0.2316, + "step": 3081 + }, + { + "epoch": 0.07799175038590986, + "grad_norm": 12.36750316619873, + "learning_rate": 9.884136296819214e-06, + "loss": 0.1634, + "step": 3082 + }, + { + "epoch": 0.07801705595060354, + "grad_norm": 3.766954183578491, + "learning_rate": 9.884050343516467e-06, + "loss": 0.1908, + "step": 3083 + }, + { + "epoch": 0.07804236151529721, + "grad_norm": 6.355688571929932, + "learning_rate": 9.883964358717391e-06, + "loss": 0.2073, + "step": 3084 + }, + { + "epoch": 0.0780676670799909, + "grad_norm": 8.241601943969727, + "learning_rate": 9.883878342422536e-06, + "loss": 0.2292, + "step": 3085 + }, + { + "epoch": 0.07809297264468457, + "grad_norm": 4.357940196990967, + "learning_rate": 9.883792294632457e-06, + "loss": 0.1665, + "step": 3086 + }, + { + "epoch": 0.07811827820937824, + "grad_norm": 8.578805923461914, + "learning_rate": 9.88370621534771e-06, + "loss": 0.2234, + "step": 3087 + }, + { + "epoch": 0.07814358377407192, + "grad_norm": 5.561417579650879, + "learning_rate": 9.883620104568851e-06, + "loss": 0.201, + "step": 3088 + }, + { + "epoch": 0.07816888933876559, + "grad_norm": 6.8220062255859375, + "learning_rate": 9.883533962296435e-06, + "loss": 0.2877, + "step": 3089 + }, + { + "epoch": 0.07819419490345927, + "grad_norm": 7.161080837249756, + "learning_rate": 9.883447788531014e-06, + "loss": 0.1758, + "step": 3090 + }, + { + "epoch": 0.07821950046815294, + "grad_norm": 3.519731044769287, + "learning_rate": 9.88336158327315e-06, + "loss": 0.1935, + "step": 3091 + }, + { + "epoch": 0.07824480603284663, + "grad_norm": 11.29885196685791, + "learning_rate": 9.883275346523394e-06, + "loss": 0.2752, + "step": 3092 + }, + { + "epoch": 0.0782701115975403, + "grad_norm": 3.525277853012085, + "learning_rate": 9.883189078282303e-06, + "loss": 0.1567, + "step": 3093 + }, + { + "epoch": 0.07829541716223397, + "grad_norm": 7.2579779624938965, + "learning_rate": 9.883102778550434e-06, + "loss": 0.2252, + "step": 3094 + }, + { + "epoch": 0.07832072272692765, + "grad_norm": 5.427606582641602, + "learning_rate": 9.883016447328344e-06, + "loss": 0.1992, + "step": 3095 + }, + { + "epoch": 0.07834602829162132, + "grad_norm": 4.4001688957214355, + "learning_rate": 9.88293008461659e-06, + "loss": 0.2486, + "step": 3096 + }, + { + "epoch": 0.078371333856315, + "grad_norm": 7.401119232177734, + "learning_rate": 9.882843690415726e-06, + "loss": 0.295, + "step": 3097 + }, + { + "epoch": 0.07839663942100868, + "grad_norm": 3.8359415531158447, + "learning_rate": 9.882757264726314e-06, + "loss": 0.1255, + "step": 3098 + }, + { + "epoch": 0.07842194498570236, + "grad_norm": 5.44973611831665, + "learning_rate": 9.882670807548907e-06, + "loss": 0.205, + "step": 3099 + }, + { + "epoch": 0.07844725055039603, + "grad_norm": 6.414082050323486, + "learning_rate": 9.882584318884065e-06, + "loss": 0.2689, + "step": 3100 + }, + { + "epoch": 0.07847255611508971, + "grad_norm": 8.027883529663086, + "learning_rate": 9.882497798732344e-06, + "loss": 0.2029, + "step": 3101 + }, + { + "epoch": 0.07849786167978338, + "grad_norm": 23.820110321044922, + "learning_rate": 9.882411247094304e-06, + "loss": 0.3211, + "step": 3102 + }, + { + "epoch": 0.07852316724447705, + "grad_norm": 8.404760360717773, + "learning_rate": 9.882324663970502e-06, + "loss": 0.3051, + "step": 3103 + }, + { + "epoch": 0.07854847280917074, + "grad_norm": 5.804541110992432, + "learning_rate": 9.882238049361496e-06, + "loss": 0.2496, + "step": 3104 + }, + { + "epoch": 0.07857377837386441, + "grad_norm": 6.9371209144592285, + "learning_rate": 9.882151403267845e-06, + "loss": 0.2707, + "step": 3105 + }, + { + "epoch": 0.07859908393855809, + "grad_norm": 10.685888290405273, + "learning_rate": 9.882064725690109e-06, + "loss": 0.275, + "step": 3106 + }, + { + "epoch": 0.07862438950325176, + "grad_norm": 6.194056987762451, + "learning_rate": 9.881978016628843e-06, + "loss": 0.1345, + "step": 3107 + }, + { + "epoch": 0.07864969506794545, + "grad_norm": 6.378520488739014, + "learning_rate": 9.881891276084612e-06, + "loss": 0.229, + "step": 3108 + }, + { + "epoch": 0.07867500063263912, + "grad_norm": 4.268868923187256, + "learning_rate": 9.88180450405797e-06, + "loss": 0.1318, + "step": 3109 + }, + { + "epoch": 0.07870030619733279, + "grad_norm": 4.132165908813477, + "learning_rate": 9.88171770054948e-06, + "loss": 0.172, + "step": 3110 + }, + { + "epoch": 0.07872561176202647, + "grad_norm": 5.301781177520752, + "learning_rate": 9.8816308655597e-06, + "loss": 0.218, + "step": 3111 + }, + { + "epoch": 0.07875091732672014, + "grad_norm": 6.356647968292236, + "learning_rate": 9.881543999089191e-06, + "loss": 0.219, + "step": 3112 + }, + { + "epoch": 0.07877622289141382, + "grad_norm": 16.000747680664062, + "learning_rate": 9.881457101138511e-06, + "loss": 0.3122, + "step": 3113 + }, + { + "epoch": 0.0788015284561075, + "grad_norm": 6.774069309234619, + "learning_rate": 9.881370171708225e-06, + "loss": 0.1643, + "step": 3114 + }, + { + "epoch": 0.07882683402080118, + "grad_norm": 7.918529987335205, + "learning_rate": 9.881283210798889e-06, + "loss": 0.1723, + "step": 3115 + }, + { + "epoch": 0.07885213958549485, + "grad_norm": 5.73056173324585, + "learning_rate": 9.881196218411066e-06, + "loss": 0.1888, + "step": 3116 + }, + { + "epoch": 0.07887744515018853, + "grad_norm": 21.238515853881836, + "learning_rate": 9.881109194545317e-06, + "loss": 0.3671, + "step": 3117 + }, + { + "epoch": 0.0789027507148822, + "grad_norm": 5.74838924407959, + "learning_rate": 9.881022139202204e-06, + "loss": 0.2145, + "step": 3118 + }, + { + "epoch": 0.07892805627957587, + "grad_norm": 13.051509857177734, + "learning_rate": 9.880935052382284e-06, + "loss": 0.1728, + "step": 3119 + }, + { + "epoch": 0.07895336184426956, + "grad_norm": 8.415494918823242, + "learning_rate": 9.880847934086124e-06, + "loss": 0.2226, + "step": 3120 + }, + { + "epoch": 0.07897866740896323, + "grad_norm": 3.9122040271759033, + "learning_rate": 9.880760784314283e-06, + "loss": 0.1281, + "step": 3121 + }, + { + "epoch": 0.07900397297365691, + "grad_norm": 7.000075817108154, + "learning_rate": 9.880673603067324e-06, + "loss": 0.2445, + "step": 3122 + }, + { + "epoch": 0.07902927853835058, + "grad_norm": 4.829111576080322, + "learning_rate": 9.880586390345808e-06, + "loss": 0.1827, + "step": 3123 + }, + { + "epoch": 0.07905458410304426, + "grad_norm": 3.582007884979248, + "learning_rate": 9.880499146150297e-06, + "loss": 0.144, + "step": 3124 + }, + { + "epoch": 0.07907988966773793, + "grad_norm": 7.004733562469482, + "learning_rate": 9.880411870481356e-06, + "loss": 0.2286, + "step": 3125 + }, + { + "epoch": 0.0791051952324316, + "grad_norm": 6.240042209625244, + "learning_rate": 9.880324563339548e-06, + "loss": 0.1728, + "step": 3126 + }, + { + "epoch": 0.07913050079712529, + "grad_norm": 5.499801158905029, + "learning_rate": 9.880237224725432e-06, + "loss": 0.1987, + "step": 3127 + }, + { + "epoch": 0.07915580636181896, + "grad_norm": 3.9002084732055664, + "learning_rate": 9.880149854639579e-06, + "loss": 0.179, + "step": 3128 + }, + { + "epoch": 0.07918111192651264, + "grad_norm": 3.8161256313323975, + "learning_rate": 9.880062453082541e-06, + "loss": 0.1672, + "step": 3129 + }, + { + "epoch": 0.07920641749120631, + "grad_norm": 2.5385780334472656, + "learning_rate": 9.879975020054893e-06, + "loss": 0.1278, + "step": 3130 + }, + { + "epoch": 0.0792317230559, + "grad_norm": 3.9533255100250244, + "learning_rate": 9.87988755555719e-06, + "loss": 0.1951, + "step": 3131 + }, + { + "epoch": 0.07925702862059367, + "grad_norm": 12.671923637390137, + "learning_rate": 9.879800059590003e-06, + "loss": 0.2672, + "step": 3132 + }, + { + "epoch": 0.07928233418528735, + "grad_norm": 3.7918472290039062, + "learning_rate": 9.879712532153891e-06, + "loss": 0.1633, + "step": 3133 + }, + { + "epoch": 0.07930763974998102, + "grad_norm": 5.727550506591797, + "learning_rate": 9.879624973249422e-06, + "loss": 0.0934, + "step": 3134 + }, + { + "epoch": 0.07933294531467469, + "grad_norm": 7.543715000152588, + "learning_rate": 9.87953738287716e-06, + "loss": 0.279, + "step": 3135 + }, + { + "epoch": 0.07935825087936837, + "grad_norm": 4.959568023681641, + "learning_rate": 9.879449761037668e-06, + "loss": 0.2161, + "step": 3136 + }, + { + "epoch": 0.07938355644406205, + "grad_norm": 4.1787333488464355, + "learning_rate": 9.87936210773151e-06, + "loss": 0.1501, + "step": 3137 + }, + { + "epoch": 0.07940886200875573, + "grad_norm": 29.755714416503906, + "learning_rate": 9.879274422959256e-06, + "loss": 0.2815, + "step": 3138 + }, + { + "epoch": 0.0794341675734494, + "grad_norm": 4.091279029846191, + "learning_rate": 9.879186706721469e-06, + "loss": 0.1393, + "step": 3139 + }, + { + "epoch": 0.07945947313814308, + "grad_norm": 9.828557968139648, + "learning_rate": 9.879098959018714e-06, + "loss": 0.2692, + "step": 3140 + }, + { + "epoch": 0.07948477870283675, + "grad_norm": 9.539474487304688, + "learning_rate": 9.879011179851556e-06, + "loss": 0.2553, + "step": 3141 + }, + { + "epoch": 0.07951008426753042, + "grad_norm": 8.781137466430664, + "learning_rate": 9.878923369220563e-06, + "loss": 0.2492, + "step": 3142 + }, + { + "epoch": 0.07953538983222411, + "grad_norm": 16.990745544433594, + "learning_rate": 9.878835527126303e-06, + "loss": 0.2874, + "step": 3143 + }, + { + "epoch": 0.07956069539691778, + "grad_norm": 6.416784763336182, + "learning_rate": 9.878747653569337e-06, + "loss": 0.2009, + "step": 3144 + }, + { + "epoch": 0.07958600096161146, + "grad_norm": 7.1389946937561035, + "learning_rate": 9.878659748550236e-06, + "loss": 0.2, + "step": 3145 + }, + { + "epoch": 0.07961130652630513, + "grad_norm": 5.737908363342285, + "learning_rate": 9.878571812069566e-06, + "loss": 0.2524, + "step": 3146 + }, + { + "epoch": 0.07963661209099882, + "grad_norm": 22.163497924804688, + "learning_rate": 9.878483844127896e-06, + "loss": 0.2602, + "step": 3147 + }, + { + "epoch": 0.07966191765569249, + "grad_norm": 4.485475540161133, + "learning_rate": 9.878395844725789e-06, + "loss": 0.0981, + "step": 3148 + }, + { + "epoch": 0.07968722322038617, + "grad_norm": 5.392672538757324, + "learning_rate": 9.878307813863814e-06, + "loss": 0.1924, + "step": 3149 + }, + { + "epoch": 0.07971252878507984, + "grad_norm": 5.888943195343018, + "learning_rate": 9.87821975154254e-06, + "loss": 0.1447, + "step": 3150 + }, + { + "epoch": 0.07973783434977351, + "grad_norm": 8.671792984008789, + "learning_rate": 9.878131657762535e-06, + "loss": 0.2961, + "step": 3151 + }, + { + "epoch": 0.0797631399144672, + "grad_norm": 9.179877281188965, + "learning_rate": 9.878043532524366e-06, + "loss": 0.1267, + "step": 3152 + }, + { + "epoch": 0.07978844547916086, + "grad_norm": 10.792728424072266, + "learning_rate": 9.877955375828601e-06, + "loss": 0.233, + "step": 3153 + }, + { + "epoch": 0.07981375104385455, + "grad_norm": 3.325364112854004, + "learning_rate": 9.87786718767581e-06, + "loss": 0.1342, + "step": 3154 + }, + { + "epoch": 0.07983905660854822, + "grad_norm": 6.466375350952148, + "learning_rate": 9.877778968066561e-06, + "loss": 0.1781, + "step": 3155 + }, + { + "epoch": 0.0798643621732419, + "grad_norm": 12.580768585205078, + "learning_rate": 9.877690717001421e-06, + "loss": 0.252, + "step": 3156 + }, + { + "epoch": 0.07988966773793557, + "grad_norm": 16.935890197753906, + "learning_rate": 9.877602434480962e-06, + "loss": 0.3376, + "step": 3157 + }, + { + "epoch": 0.07991497330262924, + "grad_norm": 19.61089515686035, + "learning_rate": 9.877514120505753e-06, + "loss": 0.4054, + "step": 3158 + }, + { + "epoch": 0.07994027886732293, + "grad_norm": 18.140933990478516, + "learning_rate": 9.877425775076361e-06, + "loss": 0.3565, + "step": 3159 + }, + { + "epoch": 0.0799655844320166, + "grad_norm": 6.024295330047607, + "learning_rate": 9.877337398193359e-06, + "loss": 0.1536, + "step": 3160 + }, + { + "epoch": 0.07999088999671028, + "grad_norm": 7.338869094848633, + "learning_rate": 9.877248989857316e-06, + "loss": 0.2588, + "step": 3161 + }, + { + "epoch": 0.08001619556140395, + "grad_norm": 3.6835920810699463, + "learning_rate": 9.8771605500688e-06, + "loss": 0.2457, + "step": 3162 + }, + { + "epoch": 0.08004150112609763, + "grad_norm": 5.148019313812256, + "learning_rate": 9.877072078828382e-06, + "loss": 0.2139, + "step": 3163 + }, + { + "epoch": 0.0800668066907913, + "grad_norm": 17.537288665771484, + "learning_rate": 9.876983576136634e-06, + "loss": 0.1672, + "step": 3164 + }, + { + "epoch": 0.08009211225548499, + "grad_norm": 13.3223876953125, + "learning_rate": 9.876895041994128e-06, + "loss": 0.3534, + "step": 3165 + }, + { + "epoch": 0.08011741782017866, + "grad_norm": 7.476049423217773, + "learning_rate": 9.876806476401431e-06, + "loss": 0.3162, + "step": 3166 + }, + { + "epoch": 0.08014272338487233, + "grad_norm": 9.896561622619629, + "learning_rate": 9.876717879359117e-06, + "loss": 0.2711, + "step": 3167 + }, + { + "epoch": 0.08016802894956601, + "grad_norm": 9.46435832977295, + "learning_rate": 9.876629250867758e-06, + "loss": 0.2661, + "step": 3168 + }, + { + "epoch": 0.08019333451425968, + "grad_norm": 8.205981254577637, + "learning_rate": 9.876540590927921e-06, + "loss": 0.1904, + "step": 3169 + }, + { + "epoch": 0.08021864007895337, + "grad_norm": 6.264850616455078, + "learning_rate": 9.876451899540183e-06, + "loss": 0.2117, + "step": 3170 + }, + { + "epoch": 0.08024394564364704, + "grad_norm": 4.839395999908447, + "learning_rate": 9.876363176705113e-06, + "loss": 0.193, + "step": 3171 + }, + { + "epoch": 0.08026925120834072, + "grad_norm": 3.6126747131347656, + "learning_rate": 9.876274422423284e-06, + "loss": 0.1992, + "step": 3172 + }, + { + "epoch": 0.08029455677303439, + "grad_norm": 7.815122127532959, + "learning_rate": 9.876185636695269e-06, + "loss": 0.2609, + "step": 3173 + }, + { + "epoch": 0.08031986233772806, + "grad_norm": 4.3482513427734375, + "learning_rate": 9.876096819521638e-06, + "loss": 0.192, + "step": 3174 + }, + { + "epoch": 0.08034516790242174, + "grad_norm": 5.929497241973877, + "learning_rate": 9.876007970902968e-06, + "loss": 0.2295, + "step": 3175 + }, + { + "epoch": 0.08037047346711541, + "grad_norm": 3.812697172164917, + "learning_rate": 9.875919090839828e-06, + "loss": 0.1977, + "step": 3176 + }, + { + "epoch": 0.0803957790318091, + "grad_norm": 3.890108585357666, + "learning_rate": 9.875830179332794e-06, + "loss": 0.1935, + "step": 3177 + }, + { + "epoch": 0.08042108459650277, + "grad_norm": 7.637642860412598, + "learning_rate": 9.875741236382436e-06, + "loss": 0.2255, + "step": 3178 + }, + { + "epoch": 0.08044639016119645, + "grad_norm": 7.250385284423828, + "learning_rate": 9.875652261989333e-06, + "loss": 0.2886, + "step": 3179 + }, + { + "epoch": 0.08047169572589012, + "grad_norm": 16.30548095703125, + "learning_rate": 9.875563256154052e-06, + "loss": 0.2215, + "step": 3180 + }, + { + "epoch": 0.0804970012905838, + "grad_norm": 4.406983375549316, + "learning_rate": 9.875474218877172e-06, + "loss": 0.1698, + "step": 3181 + }, + { + "epoch": 0.08052230685527748, + "grad_norm": 10.782155990600586, + "learning_rate": 9.875385150159266e-06, + "loss": 0.2751, + "step": 3182 + }, + { + "epoch": 0.08054761241997115, + "grad_norm": 5.020319938659668, + "learning_rate": 9.875296050000908e-06, + "loss": 0.2198, + "step": 3183 + }, + { + "epoch": 0.08057291798466483, + "grad_norm": 7.236185073852539, + "learning_rate": 9.875206918402672e-06, + "loss": 0.1811, + "step": 3184 + }, + { + "epoch": 0.0805982235493585, + "grad_norm": 4.914517402648926, + "learning_rate": 9.875117755365135e-06, + "loss": 0.1297, + "step": 3185 + }, + { + "epoch": 0.08062352911405218, + "grad_norm": 7.2234392166137695, + "learning_rate": 9.87502856088887e-06, + "loss": 0.1915, + "step": 3186 + }, + { + "epoch": 0.08064883467874585, + "grad_norm": 9.723134994506836, + "learning_rate": 9.874939334974452e-06, + "loss": 0.2777, + "step": 3187 + }, + { + "epoch": 0.08067414024343954, + "grad_norm": 5.182560443878174, + "learning_rate": 9.874850077622456e-06, + "loss": 0.228, + "step": 3188 + }, + { + "epoch": 0.08069944580813321, + "grad_norm": 7.9057464599609375, + "learning_rate": 9.874760788833459e-06, + "loss": 0.209, + "step": 3189 + }, + { + "epoch": 0.08072475137282688, + "grad_norm": 13.287479400634766, + "learning_rate": 9.87467146860804e-06, + "loss": 0.3243, + "step": 3190 + }, + { + "epoch": 0.08075005693752056, + "grad_norm": 7.528152942657471, + "learning_rate": 9.874582116946767e-06, + "loss": 0.2169, + "step": 3191 + }, + { + "epoch": 0.08077536250221423, + "grad_norm": 9.618653297424316, + "learning_rate": 9.874492733850223e-06, + "loss": 0.2508, + "step": 3192 + }, + { + "epoch": 0.08080066806690792, + "grad_norm": 5.510907173156738, + "learning_rate": 9.874403319318981e-06, + "loss": 0.22, + "step": 3193 + }, + { + "epoch": 0.08082597363160159, + "grad_norm": 9.118961334228516, + "learning_rate": 9.874313873353619e-06, + "loss": 0.2745, + "step": 3194 + }, + { + "epoch": 0.08085127919629527, + "grad_norm": 11.186748504638672, + "learning_rate": 9.874224395954713e-06, + "loss": 0.1553, + "step": 3195 + }, + { + "epoch": 0.08087658476098894, + "grad_norm": 7.756422519683838, + "learning_rate": 9.874134887122841e-06, + "loss": 0.2767, + "step": 3196 + }, + { + "epoch": 0.08090189032568262, + "grad_norm": 5.757676601409912, + "learning_rate": 9.874045346858581e-06, + "loss": 0.2755, + "step": 3197 + }, + { + "epoch": 0.0809271958903763, + "grad_norm": 4.927589416503906, + "learning_rate": 9.873955775162508e-06, + "loss": 0.1916, + "step": 3198 + }, + { + "epoch": 0.08095250145506996, + "grad_norm": 11.73919677734375, + "learning_rate": 9.8738661720352e-06, + "loss": 0.3172, + "step": 3199 + }, + { + "epoch": 0.08097780701976365, + "grad_norm": 6.3134050369262695, + "learning_rate": 9.873776537477238e-06, + "loss": 0.1902, + "step": 3200 + }, + { + "epoch": 0.08100311258445732, + "grad_norm": 4.040049076080322, + "learning_rate": 9.873686871489196e-06, + "loss": 0.202, + "step": 3201 + }, + { + "epoch": 0.081028418149151, + "grad_norm": 9.400702476501465, + "learning_rate": 9.873597174071654e-06, + "loss": 0.2371, + "step": 3202 + }, + { + "epoch": 0.08105372371384467, + "grad_norm": 2.9168179035186768, + "learning_rate": 9.87350744522519e-06, + "loss": 0.1775, + "step": 3203 + }, + { + "epoch": 0.08107902927853836, + "grad_norm": 24.5081844329834, + "learning_rate": 9.873417684950383e-06, + "loss": 0.2253, + "step": 3204 + }, + { + "epoch": 0.08110433484323203, + "grad_norm": 4.053922176361084, + "learning_rate": 9.873327893247813e-06, + "loss": 0.2001, + "step": 3205 + }, + { + "epoch": 0.0811296404079257, + "grad_norm": 6.690453052520752, + "learning_rate": 9.873238070118058e-06, + "loss": 0.1812, + "step": 3206 + }, + { + "epoch": 0.08115494597261938, + "grad_norm": 16.07404136657715, + "learning_rate": 9.873148215561696e-06, + "loss": 0.2693, + "step": 3207 + }, + { + "epoch": 0.08118025153731305, + "grad_norm": 5.3717875480651855, + "learning_rate": 9.873058329579309e-06, + "loss": 0.3217, + "step": 3208 + }, + { + "epoch": 0.08120555710200673, + "grad_norm": 9.812849998474121, + "learning_rate": 9.872968412171475e-06, + "loss": 0.2973, + "step": 3209 + }, + { + "epoch": 0.0812308626667004, + "grad_norm": 5.973982810974121, + "learning_rate": 9.872878463338774e-06, + "loss": 0.187, + "step": 3210 + }, + { + "epoch": 0.08125616823139409, + "grad_norm": 5.456314563751221, + "learning_rate": 9.872788483081786e-06, + "loss": 0.2142, + "step": 3211 + }, + { + "epoch": 0.08128147379608776, + "grad_norm": 6.597762584686279, + "learning_rate": 9.87269847140109e-06, + "loss": 0.2323, + "step": 3212 + }, + { + "epoch": 0.08130677936078144, + "grad_norm": 8.336347579956055, + "learning_rate": 9.87260842829727e-06, + "loss": 0.2284, + "step": 3213 + }, + { + "epoch": 0.08133208492547511, + "grad_norm": 5.245907783508301, + "learning_rate": 9.872518353770903e-06, + "loss": 0.1902, + "step": 3214 + }, + { + "epoch": 0.08135739049016878, + "grad_norm": 12.385832786560059, + "learning_rate": 9.872428247822574e-06, + "loss": 0.2532, + "step": 3215 + }, + { + "epoch": 0.08138269605486247, + "grad_norm": 4.223989963531494, + "learning_rate": 9.87233811045286e-06, + "loss": 0.1943, + "step": 3216 + }, + { + "epoch": 0.08140800161955614, + "grad_norm": 5.223764896392822, + "learning_rate": 9.872247941662344e-06, + "loss": 0.1651, + "step": 3217 + }, + { + "epoch": 0.08143330718424982, + "grad_norm": 5.5471110343933105, + "learning_rate": 9.872157741451607e-06, + "loss": 0.2498, + "step": 3218 + }, + { + "epoch": 0.08145861274894349, + "grad_norm": 6.120841979980469, + "learning_rate": 9.872067509821231e-06, + "loss": 0.287, + "step": 3219 + }, + { + "epoch": 0.08148391831363717, + "grad_norm": 4.922421932220459, + "learning_rate": 9.871977246771797e-06, + "loss": 0.1826, + "step": 3220 + }, + { + "epoch": 0.08150922387833084, + "grad_norm": 8.41064453125, + "learning_rate": 9.87188695230389e-06, + "loss": 0.258, + "step": 3221 + }, + { + "epoch": 0.08153452944302451, + "grad_norm": 6.905182838439941, + "learning_rate": 9.87179662641809e-06, + "loss": 0.1601, + "step": 3222 + }, + { + "epoch": 0.0815598350077182, + "grad_norm": 14.25143814086914, + "learning_rate": 9.871706269114978e-06, + "loss": 0.2333, + "step": 3223 + }, + { + "epoch": 0.08158514057241187, + "grad_norm": 3.5538947582244873, + "learning_rate": 9.87161588039514e-06, + "loss": 0.2044, + "step": 3224 + }, + { + "epoch": 0.08161044613710555, + "grad_norm": 5.832971572875977, + "learning_rate": 9.871525460259157e-06, + "loss": 0.2658, + "step": 3225 + }, + { + "epoch": 0.08163575170179922, + "grad_norm": 5.779323101043701, + "learning_rate": 9.871435008707613e-06, + "loss": 0.2218, + "step": 3226 + }, + { + "epoch": 0.0816610572664929, + "grad_norm": 4.466123104095459, + "learning_rate": 9.87134452574109e-06, + "loss": 0.2106, + "step": 3227 + }, + { + "epoch": 0.08168636283118658, + "grad_norm": 6.282923698425293, + "learning_rate": 9.871254011360173e-06, + "loss": 0.2212, + "step": 3228 + }, + { + "epoch": 0.08171166839588026, + "grad_norm": 5.1665358543396, + "learning_rate": 9.871163465565446e-06, + "loss": 0.283, + "step": 3229 + }, + { + "epoch": 0.08173697396057393, + "grad_norm": 4.6008195877075195, + "learning_rate": 9.871072888357489e-06, + "loss": 0.1881, + "step": 3230 + }, + { + "epoch": 0.0817622795252676, + "grad_norm": 9.64161491394043, + "learning_rate": 9.870982279736891e-06, + "loss": 0.2365, + "step": 3231 + }, + { + "epoch": 0.08178758508996128, + "grad_norm": 6.339385986328125, + "learning_rate": 9.870891639704235e-06, + "loss": 0.1917, + "step": 3232 + }, + { + "epoch": 0.08181289065465495, + "grad_norm": 4.879279613494873, + "learning_rate": 9.870800968260103e-06, + "loss": 0.1425, + "step": 3233 + }, + { + "epoch": 0.08183819621934864, + "grad_norm": 4.878537654876709, + "learning_rate": 9.870710265405081e-06, + "loss": 0.19, + "step": 3234 + }, + { + "epoch": 0.08186350178404231, + "grad_norm": 4.987146854400635, + "learning_rate": 9.870619531139757e-06, + "loss": 0.1101, + "step": 3235 + }, + { + "epoch": 0.08188880734873599, + "grad_norm": 5.131012916564941, + "learning_rate": 9.870528765464713e-06, + "loss": 0.1966, + "step": 3236 + }, + { + "epoch": 0.08191411291342966, + "grad_norm": 5.6055145263671875, + "learning_rate": 9.870437968380535e-06, + "loss": 0.1985, + "step": 3237 + }, + { + "epoch": 0.08193941847812333, + "grad_norm": 6.619409084320068, + "learning_rate": 9.870347139887808e-06, + "loss": 0.2475, + "step": 3238 + }, + { + "epoch": 0.08196472404281702, + "grad_norm": 4.432377815246582, + "learning_rate": 9.870256279987117e-06, + "loss": 0.1197, + "step": 3239 + }, + { + "epoch": 0.08199002960751069, + "grad_norm": 8.342178344726562, + "learning_rate": 9.87016538867905e-06, + "loss": 0.2768, + "step": 3240 + }, + { + "epoch": 0.08201533517220437, + "grad_norm": 7.417104721069336, + "learning_rate": 9.870074465964193e-06, + "loss": 0.2199, + "step": 3241 + }, + { + "epoch": 0.08204064073689804, + "grad_norm": 12.918088912963867, + "learning_rate": 9.869983511843132e-06, + "loss": 0.1961, + "step": 3242 + }, + { + "epoch": 0.08206594630159172, + "grad_norm": 4.567843914031982, + "learning_rate": 9.869892526316453e-06, + "loss": 0.1664, + "step": 3243 + }, + { + "epoch": 0.0820912518662854, + "grad_norm": 5.913455009460449, + "learning_rate": 9.869801509384743e-06, + "loss": 0.1876, + "step": 3244 + }, + { + "epoch": 0.08211655743097908, + "grad_norm": 12.123668670654297, + "learning_rate": 9.869710461048587e-06, + "loss": 0.3808, + "step": 3245 + }, + { + "epoch": 0.08214186299567275, + "grad_norm": 8.313104629516602, + "learning_rate": 9.869619381308578e-06, + "loss": 0.2256, + "step": 3246 + }, + { + "epoch": 0.08216716856036642, + "grad_norm": 7.462380409240723, + "learning_rate": 9.869528270165296e-06, + "loss": 0.2081, + "step": 3247 + }, + { + "epoch": 0.0821924741250601, + "grad_norm": 8.440766334533691, + "learning_rate": 9.869437127619332e-06, + "loss": 0.2206, + "step": 3248 + }, + { + "epoch": 0.08221777968975377, + "grad_norm": 5.42858362197876, + "learning_rate": 9.869345953671275e-06, + "loss": 0.1223, + "step": 3249 + }, + { + "epoch": 0.08224308525444746, + "grad_norm": 15.948587417602539, + "learning_rate": 9.869254748321713e-06, + "loss": 0.2157, + "step": 3250 + }, + { + "epoch": 0.08226839081914113, + "grad_norm": 8.83531379699707, + "learning_rate": 9.869163511571231e-06, + "loss": 0.1911, + "step": 3251 + }, + { + "epoch": 0.08229369638383481, + "grad_norm": 9.175728797912598, + "learning_rate": 9.869072243420418e-06, + "loss": 0.3112, + "step": 3252 + }, + { + "epoch": 0.08231900194852848, + "grad_norm": 5.9076080322265625, + "learning_rate": 9.868980943869865e-06, + "loss": 0.1511, + "step": 3253 + }, + { + "epoch": 0.08234430751322215, + "grad_norm": 6.728817939758301, + "learning_rate": 9.868889612920162e-06, + "loss": 0.218, + "step": 3254 + }, + { + "epoch": 0.08236961307791583, + "grad_norm": 17.596927642822266, + "learning_rate": 9.868798250571894e-06, + "loss": 0.3278, + "step": 3255 + }, + { + "epoch": 0.0823949186426095, + "grad_norm": 7.831550121307373, + "learning_rate": 9.86870685682565e-06, + "loss": 0.1157, + "step": 3256 + }, + { + "epoch": 0.08242022420730319, + "grad_norm": 3.2322962284088135, + "learning_rate": 9.868615431682022e-06, + "loss": 0.1376, + "step": 3257 + }, + { + "epoch": 0.08244552977199686, + "grad_norm": 5.088609218597412, + "learning_rate": 9.8685239751416e-06, + "loss": 0.2394, + "step": 3258 + }, + { + "epoch": 0.08247083533669054, + "grad_norm": 6.921474933624268, + "learning_rate": 9.868432487204974e-06, + "loss": 0.2231, + "step": 3259 + }, + { + "epoch": 0.08249614090138421, + "grad_norm": 3.7568199634552, + "learning_rate": 9.868340967872729e-06, + "loss": 0.1378, + "step": 3260 + }, + { + "epoch": 0.0825214464660779, + "grad_norm": 7.120798110961914, + "learning_rate": 9.868249417145458e-06, + "loss": 0.2618, + "step": 3261 + }, + { + "epoch": 0.08254675203077157, + "grad_norm": 8.776001930236816, + "learning_rate": 9.868157835023755e-06, + "loss": 0.2401, + "step": 3262 + }, + { + "epoch": 0.08257205759546524, + "grad_norm": 8.676793098449707, + "learning_rate": 9.868066221508206e-06, + "loss": 0.2773, + "step": 3263 + }, + { + "epoch": 0.08259736316015892, + "grad_norm": 6.263089179992676, + "learning_rate": 9.867974576599405e-06, + "loss": 0.104, + "step": 3264 + }, + { + "epoch": 0.08262266872485259, + "grad_norm": 6.885971546173096, + "learning_rate": 9.86788290029794e-06, + "loss": 0.2719, + "step": 3265 + }, + { + "epoch": 0.08264797428954627, + "grad_norm": 7.895334243774414, + "learning_rate": 9.867791192604405e-06, + "loss": 0.3446, + "step": 3266 + }, + { + "epoch": 0.08267327985423994, + "grad_norm": 9.040156364440918, + "learning_rate": 9.86769945351939e-06, + "loss": 0.2489, + "step": 3267 + }, + { + "epoch": 0.08269858541893363, + "grad_norm": 5.831761360168457, + "learning_rate": 9.867607683043485e-06, + "loss": 0.2169, + "step": 3268 + }, + { + "epoch": 0.0827238909836273, + "grad_norm": 4.605239391326904, + "learning_rate": 9.867515881177285e-06, + "loss": 0.1767, + "step": 3269 + }, + { + "epoch": 0.08274919654832097, + "grad_norm": 20.689193725585938, + "learning_rate": 9.86742404792138e-06, + "loss": 0.4299, + "step": 3270 + }, + { + "epoch": 0.08277450211301465, + "grad_norm": 9.958290100097656, + "learning_rate": 9.867332183276363e-06, + "loss": 0.2092, + "step": 3271 + }, + { + "epoch": 0.08279980767770832, + "grad_norm": 9.633960723876953, + "learning_rate": 9.867240287242825e-06, + "loss": 0.2214, + "step": 3272 + }, + { + "epoch": 0.08282511324240201, + "grad_norm": 6.014046669006348, + "learning_rate": 9.867148359821362e-06, + "loss": 0.1656, + "step": 3273 + }, + { + "epoch": 0.08285041880709568, + "grad_norm": 8.514486312866211, + "learning_rate": 9.867056401012562e-06, + "loss": 0.1446, + "step": 3274 + }, + { + "epoch": 0.08287572437178936, + "grad_norm": 6.177371025085449, + "learning_rate": 9.866964410817021e-06, + "loss": 0.1782, + "step": 3275 + }, + { + "epoch": 0.08290102993648303, + "grad_norm": 7.5461297035217285, + "learning_rate": 9.866872389235333e-06, + "loss": 0.2062, + "step": 3276 + }, + { + "epoch": 0.08292633550117672, + "grad_norm": 4.482951641082764, + "learning_rate": 9.866780336268091e-06, + "loss": 0.2045, + "step": 3277 + }, + { + "epoch": 0.08295164106587039, + "grad_norm": 5.456926345825195, + "learning_rate": 9.866688251915886e-06, + "loss": 0.1889, + "step": 3278 + }, + { + "epoch": 0.08297694663056406, + "grad_norm": 4.453009128570557, + "learning_rate": 9.866596136179317e-06, + "loss": 0.1896, + "step": 3279 + }, + { + "epoch": 0.08300225219525774, + "grad_norm": 3.7301061153411865, + "learning_rate": 9.86650398905897e-06, + "loss": 0.1907, + "step": 3280 + }, + { + "epoch": 0.08302755775995141, + "grad_norm": 2.6492342948913574, + "learning_rate": 9.866411810555447e-06, + "loss": 0.0688, + "step": 3281 + }, + { + "epoch": 0.0830528633246451, + "grad_norm": 13.09228801727295, + "learning_rate": 9.866319600669338e-06, + "loss": 0.2852, + "step": 3282 + }, + { + "epoch": 0.08307816888933876, + "grad_norm": 10.640545845031738, + "learning_rate": 9.86622735940124e-06, + "loss": 0.2455, + "step": 3283 + }, + { + "epoch": 0.08310347445403245, + "grad_norm": 7.149264812469482, + "learning_rate": 9.866135086751747e-06, + "loss": 0.1912, + "step": 3284 + }, + { + "epoch": 0.08312878001872612, + "grad_norm": 6.748371601104736, + "learning_rate": 9.866042782721454e-06, + "loss": 0.2422, + "step": 3285 + }, + { + "epoch": 0.08315408558341979, + "grad_norm": 6.184902191162109, + "learning_rate": 9.865950447310957e-06, + "loss": 0.2095, + "step": 3286 + }, + { + "epoch": 0.08317939114811347, + "grad_norm": 8.4502534866333, + "learning_rate": 9.86585808052085e-06, + "loss": 0.2453, + "step": 3287 + }, + { + "epoch": 0.08320469671280714, + "grad_norm": 6.977379322052002, + "learning_rate": 9.865765682351728e-06, + "loss": 0.251, + "step": 3288 + }, + { + "epoch": 0.08323000227750083, + "grad_norm": 15.507476806640625, + "learning_rate": 9.86567325280419e-06, + "loss": 0.3205, + "step": 3289 + }, + { + "epoch": 0.0832553078421945, + "grad_norm": 14.086116790771484, + "learning_rate": 9.86558079187883e-06, + "loss": 0.2352, + "step": 3290 + }, + { + "epoch": 0.08328061340688818, + "grad_norm": 4.009586811065674, + "learning_rate": 9.865488299576244e-06, + "loss": 0.1395, + "step": 3291 + }, + { + "epoch": 0.08330591897158185, + "grad_norm": 9.527813911437988, + "learning_rate": 9.865395775897028e-06, + "loss": 0.25, + "step": 3292 + }, + { + "epoch": 0.08333122453627553, + "grad_norm": 13.751606941223145, + "learning_rate": 9.865303220841783e-06, + "loss": 0.2397, + "step": 3293 + }, + { + "epoch": 0.0833565301009692, + "grad_norm": 11.056577682495117, + "learning_rate": 9.8652106344111e-06, + "loss": 0.2645, + "step": 3294 + }, + { + "epoch": 0.08338183566566287, + "grad_norm": 4.484631061553955, + "learning_rate": 9.865118016605579e-06, + "loss": 0.133, + "step": 3295 + }, + { + "epoch": 0.08340714123035656, + "grad_norm": 5.544898986816406, + "learning_rate": 9.865025367425816e-06, + "loss": 0.1621, + "step": 3296 + }, + { + "epoch": 0.08343244679505023, + "grad_norm": 3.270879030227661, + "learning_rate": 9.86493268687241e-06, + "loss": 0.1, + "step": 3297 + }, + { + "epoch": 0.08345775235974391, + "grad_norm": 5.501993179321289, + "learning_rate": 9.864839974945958e-06, + "loss": 0.2191, + "step": 3298 + }, + { + "epoch": 0.08348305792443758, + "grad_norm": 8.058006286621094, + "learning_rate": 9.86474723164706e-06, + "loss": 0.2176, + "step": 3299 + }, + { + "epoch": 0.08350836348913127, + "grad_norm": 10.179594993591309, + "learning_rate": 9.864654456976311e-06, + "loss": 0.1847, + "step": 3300 + }, + { + "epoch": 0.08353366905382494, + "grad_norm": 4.871796607971191, + "learning_rate": 9.864561650934309e-06, + "loss": 0.1647, + "step": 3301 + }, + { + "epoch": 0.0835589746185186, + "grad_norm": 5.5158371925354, + "learning_rate": 9.864468813521655e-06, + "loss": 0.2258, + "step": 3302 + }, + { + "epoch": 0.08358428018321229, + "grad_norm": 6.5467915534973145, + "learning_rate": 9.864375944738945e-06, + "loss": 0.2001, + "step": 3303 + }, + { + "epoch": 0.08360958574790596, + "grad_norm": 5.818336486816406, + "learning_rate": 9.86428304458678e-06, + "loss": 0.2013, + "step": 3304 + }, + { + "epoch": 0.08363489131259964, + "grad_norm": 6.071565628051758, + "learning_rate": 9.86419011306576e-06, + "loss": 0.1012, + "step": 3305 + }, + { + "epoch": 0.08366019687729331, + "grad_norm": 12.378326416015625, + "learning_rate": 9.864097150176481e-06, + "loss": 0.2316, + "step": 3306 + }, + { + "epoch": 0.083685502441987, + "grad_norm": 7.304730415344238, + "learning_rate": 9.864004155919545e-06, + "loss": 0.2727, + "step": 3307 + }, + { + "epoch": 0.08371080800668067, + "grad_norm": 4.585891246795654, + "learning_rate": 9.86391113029555e-06, + "loss": 0.1921, + "step": 3308 + }, + { + "epoch": 0.08373611357137435, + "grad_norm": 10.63342571258545, + "learning_rate": 9.863818073305099e-06, + "loss": 0.2242, + "step": 3309 + }, + { + "epoch": 0.08376141913606802, + "grad_norm": 6.070986270904541, + "learning_rate": 9.863724984948789e-06, + "loss": 0.1753, + "step": 3310 + }, + { + "epoch": 0.08378672470076169, + "grad_norm": 6.559286594390869, + "learning_rate": 9.86363186522722e-06, + "loss": 0.2164, + "step": 3311 + }, + { + "epoch": 0.08381203026545538, + "grad_norm": 5.76591682434082, + "learning_rate": 9.863538714140995e-06, + "loss": 0.1466, + "step": 3312 + }, + { + "epoch": 0.08383733583014905, + "grad_norm": 10.441040992736816, + "learning_rate": 9.863445531690712e-06, + "loss": 0.2753, + "step": 3313 + }, + { + "epoch": 0.08386264139484273, + "grad_norm": 8.9215726852417, + "learning_rate": 9.863352317876976e-06, + "loss": 0.2226, + "step": 3314 + }, + { + "epoch": 0.0838879469595364, + "grad_norm": 3.5770838260650635, + "learning_rate": 9.863259072700383e-06, + "loss": 0.1602, + "step": 3315 + }, + { + "epoch": 0.08391325252423008, + "grad_norm": 7.319199562072754, + "learning_rate": 9.863165796161538e-06, + "loss": 0.2302, + "step": 3316 + }, + { + "epoch": 0.08393855808892375, + "grad_norm": 17.654747009277344, + "learning_rate": 9.863072488261042e-06, + "loss": 0.3278, + "step": 3317 + }, + { + "epoch": 0.08396386365361742, + "grad_norm": 8.036520957946777, + "learning_rate": 9.862979148999495e-06, + "loss": 0.3, + "step": 3318 + }, + { + "epoch": 0.08398916921831111, + "grad_norm": 7.561262607574463, + "learning_rate": 9.8628857783775e-06, + "loss": 0.2345, + "step": 3319 + }, + { + "epoch": 0.08401447478300478, + "grad_norm": 6.740570545196533, + "learning_rate": 9.862792376395661e-06, + "loss": 0.1937, + "step": 3320 + }, + { + "epoch": 0.08403978034769846, + "grad_norm": 5.657907485961914, + "learning_rate": 9.862698943054577e-06, + "loss": 0.1668, + "step": 3321 + }, + { + "epoch": 0.08406508591239213, + "grad_norm": 4.365798473358154, + "learning_rate": 9.862605478354853e-06, + "loss": 0.1372, + "step": 3322 + }, + { + "epoch": 0.08409039147708582, + "grad_norm": 9.960073471069336, + "learning_rate": 9.86251198229709e-06, + "loss": 0.1551, + "step": 3323 + }, + { + "epoch": 0.08411569704177949, + "grad_norm": 5.303121089935303, + "learning_rate": 9.862418454881892e-06, + "loss": 0.151, + "step": 3324 + }, + { + "epoch": 0.08414100260647317, + "grad_norm": 4.894349098205566, + "learning_rate": 9.862324896109861e-06, + "loss": 0.1705, + "step": 3325 + }, + { + "epoch": 0.08416630817116684, + "grad_norm": 7.38053560256958, + "learning_rate": 9.862231305981602e-06, + "loss": 0.1585, + "step": 3326 + }, + { + "epoch": 0.08419161373586051, + "grad_norm": 11.334428787231445, + "learning_rate": 9.862137684497719e-06, + "loss": 0.3045, + "step": 3327 + }, + { + "epoch": 0.0842169193005542, + "grad_norm": 5.708703517913818, + "learning_rate": 9.862044031658814e-06, + "loss": 0.1997, + "step": 3328 + }, + { + "epoch": 0.08424222486524786, + "grad_norm": 11.077413558959961, + "learning_rate": 9.86195034746549e-06, + "loss": 0.2188, + "step": 3329 + }, + { + "epoch": 0.08426753042994155, + "grad_norm": 4.6550374031066895, + "learning_rate": 9.861856631918353e-06, + "loss": 0.2066, + "step": 3330 + }, + { + "epoch": 0.08429283599463522, + "grad_norm": 4.892940521240234, + "learning_rate": 9.861762885018006e-06, + "loss": 0.2472, + "step": 3331 + }, + { + "epoch": 0.0843181415593289, + "grad_norm": 6.177387714385986, + "learning_rate": 9.861669106765057e-06, + "loss": 0.2375, + "step": 3332 + }, + { + "epoch": 0.08434344712402257, + "grad_norm": 3.576141834259033, + "learning_rate": 9.861575297160107e-06, + "loss": 0.1312, + "step": 3333 + }, + { + "epoch": 0.08436875268871624, + "grad_norm": 21.906856536865234, + "learning_rate": 9.861481456203761e-06, + "loss": 0.3096, + "step": 3334 + }, + { + "epoch": 0.08439405825340993, + "grad_norm": 4.8718485832214355, + "learning_rate": 9.861387583896627e-06, + "loss": 0.1676, + "step": 3335 + }, + { + "epoch": 0.0844193638181036, + "grad_norm": 4.553953647613525, + "learning_rate": 9.86129368023931e-06, + "loss": 0.2125, + "step": 3336 + }, + { + "epoch": 0.08444466938279728, + "grad_norm": 4.890908241271973, + "learning_rate": 9.861199745232413e-06, + "loss": 0.1956, + "step": 3337 + }, + { + "epoch": 0.08446997494749095, + "grad_norm": 4.622907638549805, + "learning_rate": 9.861105778876543e-06, + "loss": 0.1384, + "step": 3338 + }, + { + "epoch": 0.08449528051218463, + "grad_norm": 13.759110450744629, + "learning_rate": 9.861011781172305e-06, + "loss": 0.329, + "step": 3339 + }, + { + "epoch": 0.0845205860768783, + "grad_norm": 5.754587650299072, + "learning_rate": 9.860917752120307e-06, + "loss": 0.2144, + "step": 3340 + }, + { + "epoch": 0.08454589164157199, + "grad_norm": 5.662996292114258, + "learning_rate": 9.860823691721156e-06, + "loss": 0.2727, + "step": 3341 + }, + { + "epoch": 0.08457119720626566, + "grad_norm": 9.923436164855957, + "learning_rate": 9.860729599975456e-06, + "loss": 0.2244, + "step": 3342 + }, + { + "epoch": 0.08459650277095933, + "grad_norm": 4.830928802490234, + "learning_rate": 9.860635476883812e-06, + "loss": 0.1108, + "step": 3343 + }, + { + "epoch": 0.08462180833565301, + "grad_norm": 11.026968955993652, + "learning_rate": 9.860541322446839e-06, + "loss": 0.2409, + "step": 3344 + }, + { + "epoch": 0.08464711390034668, + "grad_norm": 16.557157516479492, + "learning_rate": 9.860447136665137e-06, + "loss": 0.187, + "step": 3345 + }, + { + "epoch": 0.08467241946504037, + "grad_norm": 6.490370273590088, + "learning_rate": 9.860352919539315e-06, + "loss": 0.227, + "step": 3346 + }, + { + "epoch": 0.08469772502973404, + "grad_norm": 5.457633972167969, + "learning_rate": 9.860258671069982e-06, + "loss": 0.2105, + "step": 3347 + }, + { + "epoch": 0.08472303059442772, + "grad_norm": 7.137743949890137, + "learning_rate": 9.860164391257745e-06, + "loss": 0.2396, + "step": 3348 + }, + { + "epoch": 0.08474833615912139, + "grad_norm": 19.323638916015625, + "learning_rate": 9.860070080103212e-06, + "loss": 0.2478, + "step": 3349 + }, + { + "epoch": 0.08477364172381506, + "grad_norm": 4.938972473144531, + "learning_rate": 9.859975737606989e-06, + "loss": 0.2336, + "step": 3350 + }, + { + "epoch": 0.08479894728850874, + "grad_norm": 5.2844014167785645, + "learning_rate": 9.859881363769688e-06, + "loss": 0.2292, + "step": 3351 + }, + { + "epoch": 0.08482425285320241, + "grad_norm": 11.598631858825684, + "learning_rate": 9.859786958591916e-06, + "loss": 0.2321, + "step": 3352 + }, + { + "epoch": 0.0848495584178961, + "grad_norm": 7.78490686416626, + "learning_rate": 9.859692522074283e-06, + "loss": 0.2128, + "step": 3353 + }, + { + "epoch": 0.08487486398258977, + "grad_norm": 7.695350646972656, + "learning_rate": 9.859598054217396e-06, + "loss": 0.1914, + "step": 3354 + }, + { + "epoch": 0.08490016954728345, + "grad_norm": 8.665884017944336, + "learning_rate": 9.859503555021865e-06, + "loss": 0.2525, + "step": 3355 + }, + { + "epoch": 0.08492547511197712, + "grad_norm": 11.960018157958984, + "learning_rate": 9.8594090244883e-06, + "loss": 0.2574, + "step": 3356 + }, + { + "epoch": 0.0849507806766708, + "grad_norm": 5.148463249206543, + "learning_rate": 9.859314462617309e-06, + "loss": 0.2072, + "step": 3357 + }, + { + "epoch": 0.08497608624136448, + "grad_norm": 5.576006889343262, + "learning_rate": 9.859219869409504e-06, + "loss": 0.1601, + "step": 3358 + }, + { + "epoch": 0.08500139180605815, + "grad_norm": 6.456022262573242, + "learning_rate": 9.859125244865492e-06, + "loss": 0.157, + "step": 3359 + }, + { + "epoch": 0.08502669737075183, + "grad_norm": 5.220463275909424, + "learning_rate": 9.859030588985888e-06, + "loss": 0.1786, + "step": 3360 + }, + { + "epoch": 0.0850520029354455, + "grad_norm": 5.974833011627197, + "learning_rate": 9.858935901771298e-06, + "loss": 0.2204, + "step": 3361 + }, + { + "epoch": 0.08507730850013918, + "grad_norm": 6.356268405914307, + "learning_rate": 9.858841183222336e-06, + "loss": 0.1505, + "step": 3362 + }, + { + "epoch": 0.08510261406483285, + "grad_norm": 20.663169860839844, + "learning_rate": 9.858746433339608e-06, + "loss": 0.2847, + "step": 3363 + }, + { + "epoch": 0.08512791962952654, + "grad_norm": 11.545694351196289, + "learning_rate": 9.85865165212373e-06, + "loss": 0.3693, + "step": 3364 + }, + { + "epoch": 0.08515322519422021, + "grad_norm": 8.058822631835938, + "learning_rate": 9.85855683957531e-06, + "loss": 0.205, + "step": 3365 + }, + { + "epoch": 0.08517853075891388, + "grad_norm": 8.106781959533691, + "learning_rate": 9.858461995694963e-06, + "loss": 0.2829, + "step": 3366 + }, + { + "epoch": 0.08520383632360756, + "grad_norm": 6.165156841278076, + "learning_rate": 9.858367120483296e-06, + "loss": 0.1837, + "step": 3367 + }, + { + "epoch": 0.08522914188830123, + "grad_norm": 9.798114776611328, + "learning_rate": 9.858272213940924e-06, + "loss": 0.3338, + "step": 3368 + }, + { + "epoch": 0.08525444745299492, + "grad_norm": 7.611477375030518, + "learning_rate": 9.85817727606846e-06, + "loss": 0.2379, + "step": 3369 + }, + { + "epoch": 0.08527975301768859, + "grad_norm": 5.768631458282471, + "learning_rate": 9.858082306866512e-06, + "loss": 0.2103, + "step": 3370 + }, + { + "epoch": 0.08530505858238227, + "grad_norm": 7.737504959106445, + "learning_rate": 9.857987306335695e-06, + "loss": 0.2562, + "step": 3371 + }, + { + "epoch": 0.08533036414707594, + "grad_norm": 11.61441421508789, + "learning_rate": 9.857892274476623e-06, + "loss": 0.2256, + "step": 3372 + }, + { + "epoch": 0.08535566971176962, + "grad_norm": 6.721486568450928, + "learning_rate": 9.857797211289906e-06, + "loss": 0.3682, + "step": 3373 + }, + { + "epoch": 0.0853809752764633, + "grad_norm": 9.637764930725098, + "learning_rate": 9.85770211677616e-06, + "loss": 0.2616, + "step": 3374 + }, + { + "epoch": 0.08540628084115696, + "grad_norm": 3.8452701568603516, + "learning_rate": 9.857606990935995e-06, + "loss": 0.1773, + "step": 3375 + }, + { + "epoch": 0.08543158640585065, + "grad_norm": 7.245283126831055, + "learning_rate": 9.857511833770026e-06, + "loss": 0.2087, + "step": 3376 + }, + { + "epoch": 0.08545689197054432, + "grad_norm": 10.729137420654297, + "learning_rate": 9.857416645278868e-06, + "loss": 0.2985, + "step": 3377 + }, + { + "epoch": 0.085482197535238, + "grad_norm": 4.40087366104126, + "learning_rate": 9.857321425463132e-06, + "loss": 0.1303, + "step": 3378 + }, + { + "epoch": 0.08550750309993167, + "grad_norm": 7.465539455413818, + "learning_rate": 9.857226174323435e-06, + "loss": 0.2047, + "step": 3379 + }, + { + "epoch": 0.08553280866462536, + "grad_norm": 7.049859523773193, + "learning_rate": 9.857130891860389e-06, + "loss": 0.1865, + "step": 3380 + }, + { + "epoch": 0.08555811422931903, + "grad_norm": 5.653465270996094, + "learning_rate": 9.857035578074609e-06, + "loss": 0.2069, + "step": 3381 + }, + { + "epoch": 0.0855834197940127, + "grad_norm": 4.206399917602539, + "learning_rate": 9.856940232966712e-06, + "loss": 0.1818, + "step": 3382 + }, + { + "epoch": 0.08560872535870638, + "grad_norm": 6.032512664794922, + "learning_rate": 9.85684485653731e-06, + "loss": 0.1434, + "step": 3383 + }, + { + "epoch": 0.08563403092340005, + "grad_norm": 12.538249015808105, + "learning_rate": 9.856749448787018e-06, + "loss": 0.2474, + "step": 3384 + }, + { + "epoch": 0.08565933648809373, + "grad_norm": 11.804909706115723, + "learning_rate": 9.856654009716453e-06, + "loss": 0.2084, + "step": 3385 + }, + { + "epoch": 0.0856846420527874, + "grad_norm": 3.532592535018921, + "learning_rate": 9.856558539326232e-06, + "loss": 0.1764, + "step": 3386 + }, + { + "epoch": 0.08570994761748109, + "grad_norm": 4.371918201446533, + "learning_rate": 9.856463037616965e-06, + "loss": 0.1963, + "step": 3387 + }, + { + "epoch": 0.08573525318217476, + "grad_norm": 8.833501815795898, + "learning_rate": 9.856367504589272e-06, + "loss": 0.2516, + "step": 3388 + }, + { + "epoch": 0.08576055874686844, + "grad_norm": 13.52296257019043, + "learning_rate": 9.85627194024377e-06, + "loss": 0.3039, + "step": 3389 + }, + { + "epoch": 0.08578586431156211, + "grad_norm": 9.888568878173828, + "learning_rate": 9.856176344581072e-06, + "loss": 0.22, + "step": 3390 + }, + { + "epoch": 0.08581116987625578, + "grad_norm": 6.391454219818115, + "learning_rate": 9.856080717601796e-06, + "loss": 0.155, + "step": 3391 + }, + { + "epoch": 0.08583647544094947, + "grad_norm": 10.335212707519531, + "learning_rate": 9.85598505930656e-06, + "loss": 0.2889, + "step": 3392 + }, + { + "epoch": 0.08586178100564314, + "grad_norm": 10.164977073669434, + "learning_rate": 9.855889369695976e-06, + "loss": 0.2121, + "step": 3393 + }, + { + "epoch": 0.08588708657033682, + "grad_norm": 8.723788261413574, + "learning_rate": 9.855793648770668e-06, + "loss": 0.3062, + "step": 3394 + }, + { + "epoch": 0.08591239213503049, + "grad_norm": 6.581558704376221, + "learning_rate": 9.85569789653125e-06, + "loss": 0.2611, + "step": 3395 + }, + { + "epoch": 0.08593769769972417, + "grad_norm": 3.9548990726470947, + "learning_rate": 9.85560211297834e-06, + "loss": 0.1754, + "step": 3396 + }, + { + "epoch": 0.08596300326441784, + "grad_norm": 11.358447074890137, + "learning_rate": 9.855506298112553e-06, + "loss": 0.2477, + "step": 3397 + }, + { + "epoch": 0.08598830882911151, + "grad_norm": 5.410521984100342, + "learning_rate": 9.85541045193451e-06, + "loss": 0.2074, + "step": 3398 + }, + { + "epoch": 0.0860136143938052, + "grad_norm": 7.494045257568359, + "learning_rate": 9.855314574444828e-06, + "loss": 0.1933, + "step": 3399 + }, + { + "epoch": 0.08603891995849887, + "grad_norm": 19.008949279785156, + "learning_rate": 9.855218665644126e-06, + "loss": 0.4476, + "step": 3400 + }, + { + "epoch": 0.08606422552319255, + "grad_norm": 8.289480209350586, + "learning_rate": 9.855122725533022e-06, + "loss": 0.3027, + "step": 3401 + }, + { + "epoch": 0.08608953108788622, + "grad_norm": 8.485555648803711, + "learning_rate": 9.855026754112134e-06, + "loss": 0.181, + "step": 3402 + }, + { + "epoch": 0.08611483665257991, + "grad_norm": 4.766201019287109, + "learning_rate": 9.854930751382084e-06, + "loss": 0.2408, + "step": 3403 + }, + { + "epoch": 0.08614014221727358, + "grad_norm": 5.588291645050049, + "learning_rate": 9.854834717343486e-06, + "loss": 0.2478, + "step": 3404 + }, + { + "epoch": 0.08616544778196726, + "grad_norm": 8.825355529785156, + "learning_rate": 9.854738651996962e-06, + "loss": 0.3353, + "step": 3405 + }, + { + "epoch": 0.08619075334666093, + "grad_norm": 5.52730131149292, + "learning_rate": 9.854642555343132e-06, + "loss": 0.1732, + "step": 3406 + }, + { + "epoch": 0.0862160589113546, + "grad_norm": 4.851006984710693, + "learning_rate": 9.854546427382616e-06, + "loss": 0.1816, + "step": 3407 + }, + { + "epoch": 0.08624136447604829, + "grad_norm": 7.014946460723877, + "learning_rate": 9.854450268116032e-06, + "loss": 0.2027, + "step": 3408 + }, + { + "epoch": 0.08626667004074196, + "grad_norm": 7.258975505828857, + "learning_rate": 9.854354077544002e-06, + "loss": 0.2221, + "step": 3409 + }, + { + "epoch": 0.08629197560543564, + "grad_norm": 21.463699340820312, + "learning_rate": 9.854257855667144e-06, + "loss": 0.3493, + "step": 3410 + }, + { + "epoch": 0.08631728117012931, + "grad_norm": 10.023748397827148, + "learning_rate": 9.854161602486082e-06, + "loss": 0.3067, + "step": 3411 + }, + { + "epoch": 0.086342586734823, + "grad_norm": 4.185462951660156, + "learning_rate": 9.854065318001434e-06, + "loss": 0.2151, + "step": 3412 + }, + { + "epoch": 0.08636789229951666, + "grad_norm": 6.526323318481445, + "learning_rate": 9.853969002213822e-06, + "loss": 0.156, + "step": 3413 + }, + { + "epoch": 0.08639319786421033, + "grad_norm": 9.73681640625, + "learning_rate": 9.853872655123866e-06, + "loss": 0.2406, + "step": 3414 + }, + { + "epoch": 0.08641850342890402, + "grad_norm": 4.725617408752441, + "learning_rate": 9.85377627673219e-06, + "loss": 0.167, + "step": 3415 + }, + { + "epoch": 0.08644380899359769, + "grad_norm": 6.457870006561279, + "learning_rate": 9.853679867039413e-06, + "loss": 0.1694, + "step": 3416 + }, + { + "epoch": 0.08646911455829137, + "grad_norm": 5.027888298034668, + "learning_rate": 9.853583426046156e-06, + "loss": 0.1672, + "step": 3417 + }, + { + "epoch": 0.08649442012298504, + "grad_norm": 12.963699340820312, + "learning_rate": 9.853486953753042e-06, + "loss": 0.2775, + "step": 3418 + }, + { + "epoch": 0.08651972568767873, + "grad_norm": 7.229549884796143, + "learning_rate": 9.853390450160695e-06, + "loss": 0.3037, + "step": 3419 + }, + { + "epoch": 0.0865450312523724, + "grad_norm": 4.557139873504639, + "learning_rate": 9.853293915269735e-06, + "loss": 0.2022, + "step": 3420 + }, + { + "epoch": 0.08657033681706608, + "grad_norm": 9.4192476272583, + "learning_rate": 9.853197349080784e-06, + "loss": 0.2055, + "step": 3421 + }, + { + "epoch": 0.08659564238175975, + "grad_norm": 8.675745964050293, + "learning_rate": 9.85310075159447e-06, + "loss": 0.2238, + "step": 3422 + }, + { + "epoch": 0.08662094794645342, + "grad_norm": 4.502333641052246, + "learning_rate": 9.85300412281141e-06, + "loss": 0.1741, + "step": 3423 + }, + { + "epoch": 0.0866462535111471, + "grad_norm": 7.667716026306152, + "learning_rate": 9.852907462732228e-06, + "loss": 0.1706, + "step": 3424 + }, + { + "epoch": 0.08667155907584077, + "grad_norm": 8.892560958862305, + "learning_rate": 9.852810771357549e-06, + "loss": 0.2724, + "step": 3425 + }, + { + "epoch": 0.08669686464053446, + "grad_norm": 6.805902004241943, + "learning_rate": 9.852714048687997e-06, + "loss": 0.3684, + "step": 3426 + }, + { + "epoch": 0.08672217020522813, + "grad_norm": 8.387343406677246, + "learning_rate": 9.852617294724193e-06, + "loss": 0.3075, + "step": 3427 + }, + { + "epoch": 0.08674747576992181, + "grad_norm": 2.5188543796539307, + "learning_rate": 9.852520509466764e-06, + "loss": 0.1101, + "step": 3428 + }, + { + "epoch": 0.08677278133461548, + "grad_norm": 3.576539993286133, + "learning_rate": 9.852423692916333e-06, + "loss": 0.1244, + "step": 3429 + }, + { + "epoch": 0.08679808689930915, + "grad_norm": 8.575522422790527, + "learning_rate": 9.852326845073523e-06, + "loss": 0.2284, + "step": 3430 + }, + { + "epoch": 0.08682339246400284, + "grad_norm": 5.809195518493652, + "learning_rate": 9.852229965938963e-06, + "loss": 0.2593, + "step": 3431 + }, + { + "epoch": 0.0868486980286965, + "grad_norm": 9.670915603637695, + "learning_rate": 9.852133055513271e-06, + "loss": 0.2358, + "step": 3432 + }, + { + "epoch": 0.08687400359339019, + "grad_norm": 17.412458419799805, + "learning_rate": 9.852036113797077e-06, + "loss": 0.1927, + "step": 3433 + }, + { + "epoch": 0.08689930915808386, + "grad_norm": 8.029240608215332, + "learning_rate": 9.851939140791004e-06, + "loss": 0.2553, + "step": 3434 + }, + { + "epoch": 0.08692461472277754, + "grad_norm": 5.204721450805664, + "learning_rate": 9.85184213649568e-06, + "loss": 0.1835, + "step": 3435 + }, + { + "epoch": 0.08694992028747121, + "grad_norm": 8.810046195983887, + "learning_rate": 9.851745100911725e-06, + "loss": 0.2008, + "step": 3436 + }, + { + "epoch": 0.0869752258521649, + "grad_norm": 8.966660499572754, + "learning_rate": 9.851648034039771e-06, + "loss": 0.2593, + "step": 3437 + }, + { + "epoch": 0.08700053141685857, + "grad_norm": 7.183274269104004, + "learning_rate": 9.851550935880441e-06, + "loss": 0.1342, + "step": 3438 + }, + { + "epoch": 0.08702583698155224, + "grad_norm": 4.091548442840576, + "learning_rate": 9.85145380643436e-06, + "loss": 0.0973, + "step": 3439 + }, + { + "epoch": 0.08705114254624592, + "grad_norm": 7.254127502441406, + "learning_rate": 9.851356645702157e-06, + "loss": 0.2358, + "step": 3440 + }, + { + "epoch": 0.08707644811093959, + "grad_norm": 4.165354251861572, + "learning_rate": 9.851259453684458e-06, + "loss": 0.1891, + "step": 3441 + }, + { + "epoch": 0.08710175367563328, + "grad_norm": 7.827985763549805, + "learning_rate": 9.851162230381888e-06, + "loss": 0.1865, + "step": 3442 + }, + { + "epoch": 0.08712705924032695, + "grad_norm": 6.074217319488525, + "learning_rate": 9.851064975795075e-06, + "loss": 0.2338, + "step": 3443 + }, + { + "epoch": 0.08715236480502063, + "grad_norm": 5.641777038574219, + "learning_rate": 9.850967689924647e-06, + "loss": 0.1756, + "step": 3444 + }, + { + "epoch": 0.0871776703697143, + "grad_norm": 16.65809440612793, + "learning_rate": 9.85087037277123e-06, + "loss": 0.4349, + "step": 3445 + }, + { + "epoch": 0.08720297593440797, + "grad_norm": 3.832662582397461, + "learning_rate": 9.850773024335453e-06, + "loss": 0.1593, + "step": 3446 + }, + { + "epoch": 0.08722828149910165, + "grad_norm": 14.638395309448242, + "learning_rate": 9.850675644617943e-06, + "loss": 0.3711, + "step": 3447 + }, + { + "epoch": 0.08725358706379532, + "grad_norm": 11.0472412109375, + "learning_rate": 9.85057823361933e-06, + "loss": 0.2439, + "step": 3448 + }, + { + "epoch": 0.08727889262848901, + "grad_norm": 6.246283054351807, + "learning_rate": 9.850480791340238e-06, + "loss": 0.2553, + "step": 3449 + }, + { + "epoch": 0.08730419819318268, + "grad_norm": 4.632671356201172, + "learning_rate": 9.850383317781298e-06, + "loss": 0.1565, + "step": 3450 + }, + { + "epoch": 0.08732950375787636, + "grad_norm": 7.032812118530273, + "learning_rate": 9.850285812943139e-06, + "loss": 0.1301, + "step": 3451 + }, + { + "epoch": 0.08735480932257003, + "grad_norm": 8.227299690246582, + "learning_rate": 9.850188276826388e-06, + "loss": 0.2126, + "step": 3452 + }, + { + "epoch": 0.08738011488726372, + "grad_norm": 6.219354152679443, + "learning_rate": 9.850090709431674e-06, + "loss": 0.1532, + "step": 3453 + }, + { + "epoch": 0.08740542045195739, + "grad_norm": 5.669376373291016, + "learning_rate": 9.84999311075963e-06, + "loss": 0.2093, + "step": 3454 + }, + { + "epoch": 0.08743072601665106, + "grad_norm": 6.939638137817383, + "learning_rate": 9.84989548081088e-06, + "loss": 0.2515, + "step": 3455 + }, + { + "epoch": 0.08745603158134474, + "grad_norm": 8.495100975036621, + "learning_rate": 9.849797819586058e-06, + "loss": 0.2185, + "step": 3456 + }, + { + "epoch": 0.08748133714603841, + "grad_norm": 5.567621231079102, + "learning_rate": 9.849700127085791e-06, + "loss": 0.2143, + "step": 3457 + }, + { + "epoch": 0.0875066427107321, + "grad_norm": 11.500421524047852, + "learning_rate": 9.849602403310711e-06, + "loss": 0.2961, + "step": 3458 + }, + { + "epoch": 0.08753194827542576, + "grad_norm": 7.976471900939941, + "learning_rate": 9.849504648261445e-06, + "loss": 0.2024, + "step": 3459 + }, + { + "epoch": 0.08755725384011945, + "grad_norm": 10.416044235229492, + "learning_rate": 9.849406861938629e-06, + "loss": 0.2136, + "step": 3460 + }, + { + "epoch": 0.08758255940481312, + "grad_norm": 4.818925857543945, + "learning_rate": 9.849309044342887e-06, + "loss": 0.1924, + "step": 3461 + }, + { + "epoch": 0.08760786496950679, + "grad_norm": 3.2221624851226807, + "learning_rate": 9.849211195474854e-06, + "loss": 0.118, + "step": 3462 + }, + { + "epoch": 0.08763317053420047, + "grad_norm": 22.392253875732422, + "learning_rate": 9.849113315335161e-06, + "loss": 0.1852, + "step": 3463 + }, + { + "epoch": 0.08765847609889414, + "grad_norm": 11.490374565124512, + "learning_rate": 9.849015403924438e-06, + "loss": 0.2613, + "step": 3464 + }, + { + "epoch": 0.08768378166358783, + "grad_norm": 12.981417655944824, + "learning_rate": 9.848917461243316e-06, + "loss": 0.4162, + "step": 3465 + }, + { + "epoch": 0.0877090872282815, + "grad_norm": 24.94282341003418, + "learning_rate": 9.848819487292427e-06, + "loss": 0.3702, + "step": 3466 + }, + { + "epoch": 0.08773439279297518, + "grad_norm": 4.599992275238037, + "learning_rate": 9.848721482072404e-06, + "loss": 0.131, + "step": 3467 + }, + { + "epoch": 0.08775969835766885, + "grad_norm": 5.416105270385742, + "learning_rate": 9.848623445583876e-06, + "loss": 0.1693, + "step": 3468 + }, + { + "epoch": 0.08778500392236253, + "grad_norm": 6.6548871994018555, + "learning_rate": 9.84852537782748e-06, + "loss": 0.2108, + "step": 3469 + }, + { + "epoch": 0.0878103094870562, + "grad_norm": 7.22734260559082, + "learning_rate": 9.848427278803845e-06, + "loss": 0.28, + "step": 3470 + }, + { + "epoch": 0.08783561505174987, + "grad_norm": 3.822683811187744, + "learning_rate": 9.848329148513605e-06, + "loss": 0.1634, + "step": 3471 + }, + { + "epoch": 0.08786092061644356, + "grad_norm": 31.098722457885742, + "learning_rate": 9.84823098695739e-06, + "loss": 0.253, + "step": 3472 + }, + { + "epoch": 0.08788622618113723, + "grad_norm": 4.953007221221924, + "learning_rate": 9.848132794135837e-06, + "loss": 0.1626, + "step": 3473 + }, + { + "epoch": 0.08791153174583091, + "grad_norm": 6.066255569458008, + "learning_rate": 9.848034570049576e-06, + "loss": 0.1965, + "step": 3474 + }, + { + "epoch": 0.08793683731052458, + "grad_norm": 4.30680513381958, + "learning_rate": 9.847936314699243e-06, + "loss": 0.1532, + "step": 3475 + }, + { + "epoch": 0.08796214287521827, + "grad_norm": 4.6663665771484375, + "learning_rate": 9.847838028085471e-06, + "loss": 0.1695, + "step": 3476 + }, + { + "epoch": 0.08798744843991194, + "grad_norm": 4.56466817855835, + "learning_rate": 9.847739710208892e-06, + "loss": 0.1628, + "step": 3477 + }, + { + "epoch": 0.0880127540046056, + "grad_norm": 8.550079345703125, + "learning_rate": 9.847641361070141e-06, + "loss": 0.2794, + "step": 3478 + }, + { + "epoch": 0.08803805956929929, + "grad_norm": 6.2595391273498535, + "learning_rate": 9.847542980669854e-06, + "loss": 0.1862, + "step": 3479 + }, + { + "epoch": 0.08806336513399296, + "grad_norm": 7.9180169105529785, + "learning_rate": 9.847444569008664e-06, + "loss": 0.2335, + "step": 3480 + }, + { + "epoch": 0.08808867069868664, + "grad_norm": 11.781734466552734, + "learning_rate": 9.847346126087204e-06, + "loss": 0.2433, + "step": 3481 + }, + { + "epoch": 0.08811397626338031, + "grad_norm": 6.272899150848389, + "learning_rate": 9.847247651906112e-06, + "loss": 0.2585, + "step": 3482 + }, + { + "epoch": 0.088139281828074, + "grad_norm": 9.115726470947266, + "learning_rate": 9.847149146466022e-06, + "loss": 0.1769, + "step": 3483 + }, + { + "epoch": 0.08816458739276767, + "grad_norm": 8.532458305358887, + "learning_rate": 9.847050609767568e-06, + "loss": 0.2059, + "step": 3484 + }, + { + "epoch": 0.08818989295746135, + "grad_norm": 5.524348258972168, + "learning_rate": 9.846952041811385e-06, + "loss": 0.2698, + "step": 3485 + }, + { + "epoch": 0.08821519852215502, + "grad_norm": 8.54317855834961, + "learning_rate": 9.846853442598112e-06, + "loss": 0.3061, + "step": 3486 + }, + { + "epoch": 0.08824050408684869, + "grad_norm": 7.568243980407715, + "learning_rate": 9.846754812128383e-06, + "loss": 0.155, + "step": 3487 + }, + { + "epoch": 0.08826580965154238, + "grad_norm": 7.433887481689453, + "learning_rate": 9.84665615040283e-06, + "loss": 0.2018, + "step": 3488 + }, + { + "epoch": 0.08829111521623605, + "grad_norm": 4.112532138824463, + "learning_rate": 9.846557457422097e-06, + "loss": 0.149, + "step": 3489 + }, + { + "epoch": 0.08831642078092973, + "grad_norm": 8.811431884765625, + "learning_rate": 9.846458733186816e-06, + "loss": 0.2487, + "step": 3490 + }, + { + "epoch": 0.0883417263456234, + "grad_norm": 50.38457489013672, + "learning_rate": 9.846359977697623e-06, + "loss": 0.2848, + "step": 3491 + }, + { + "epoch": 0.08836703191031708, + "grad_norm": 4.681051254272461, + "learning_rate": 9.846261190955156e-06, + "loss": 0.2206, + "step": 3492 + }, + { + "epoch": 0.08839233747501075, + "grad_norm": 5.848786354064941, + "learning_rate": 9.846162372960054e-06, + "loss": 0.2152, + "step": 3493 + }, + { + "epoch": 0.08841764303970442, + "grad_norm": 10.273333549499512, + "learning_rate": 9.846063523712949e-06, + "loss": 0.2846, + "step": 3494 + }, + { + "epoch": 0.08844294860439811, + "grad_norm": 9.060894012451172, + "learning_rate": 9.845964643214482e-06, + "loss": 0.2186, + "step": 3495 + }, + { + "epoch": 0.08846825416909178, + "grad_norm": 4.9107584953308105, + "learning_rate": 9.845865731465294e-06, + "loss": 0.1906, + "step": 3496 + }, + { + "epoch": 0.08849355973378546, + "grad_norm": 12.273651123046875, + "learning_rate": 9.845766788466016e-06, + "loss": 0.2812, + "step": 3497 + }, + { + "epoch": 0.08851886529847913, + "grad_norm": 16.91582489013672, + "learning_rate": 9.845667814217291e-06, + "loss": 0.3328, + "step": 3498 + }, + { + "epoch": 0.08854417086317282, + "grad_norm": 4.728429794311523, + "learning_rate": 9.845568808719755e-06, + "loss": 0.174, + "step": 3499 + }, + { + "epoch": 0.08856947642786649, + "grad_norm": 6.789236068725586, + "learning_rate": 9.845469771974047e-06, + "loss": 0.1519, + "step": 3500 + }, + { + "epoch": 0.08859478199256017, + "grad_norm": 9.799893379211426, + "learning_rate": 9.845370703980807e-06, + "loss": 0.3544, + "step": 3501 + }, + { + "epoch": 0.08862008755725384, + "grad_norm": 4.664016246795654, + "learning_rate": 9.845271604740671e-06, + "loss": 0.1618, + "step": 3502 + }, + { + "epoch": 0.08864539312194751, + "grad_norm": 10.858827590942383, + "learning_rate": 9.845172474254281e-06, + "loss": 0.2996, + "step": 3503 + }, + { + "epoch": 0.0886706986866412, + "grad_norm": 8.188462257385254, + "learning_rate": 9.845073312522273e-06, + "loss": 0.1727, + "step": 3504 + }, + { + "epoch": 0.08869600425133486, + "grad_norm": 5.308128356933594, + "learning_rate": 9.84497411954529e-06, + "loss": 0.2189, + "step": 3505 + }, + { + "epoch": 0.08872130981602855, + "grad_norm": 6.35048246383667, + "learning_rate": 9.844874895323971e-06, + "loss": 0.2335, + "step": 3506 + }, + { + "epoch": 0.08874661538072222, + "grad_norm": 15.559103965759277, + "learning_rate": 9.844775639858954e-06, + "loss": 0.2395, + "step": 3507 + }, + { + "epoch": 0.0887719209454159, + "grad_norm": 21.249542236328125, + "learning_rate": 9.844676353150879e-06, + "loss": 0.6454, + "step": 3508 + }, + { + "epoch": 0.08879722651010957, + "grad_norm": 7.202197551727295, + "learning_rate": 9.844577035200388e-06, + "loss": 0.2095, + "step": 3509 + }, + { + "epoch": 0.08882253207480324, + "grad_norm": 5.830288410186768, + "learning_rate": 9.844477686008122e-06, + "loss": 0.1801, + "step": 3510 + }, + { + "epoch": 0.08884783763949693, + "grad_norm": 7.851809978485107, + "learning_rate": 9.84437830557472e-06, + "loss": 0.2995, + "step": 3511 + }, + { + "epoch": 0.0888731432041906, + "grad_norm": 4.895080089569092, + "learning_rate": 9.844278893900823e-06, + "loss": 0.2147, + "step": 3512 + }, + { + "epoch": 0.08889844876888428, + "grad_norm": 6.4717912673950195, + "learning_rate": 9.844179450987074e-06, + "loss": 0.1444, + "step": 3513 + }, + { + "epoch": 0.08892375433357795, + "grad_norm": 65.52674865722656, + "learning_rate": 9.844079976834111e-06, + "loss": 0.438, + "step": 3514 + }, + { + "epoch": 0.08894905989827163, + "grad_norm": 6.484764099121094, + "learning_rate": 9.843980471442578e-06, + "loss": 0.1307, + "step": 3515 + }, + { + "epoch": 0.0889743654629653, + "grad_norm": 22.730863571166992, + "learning_rate": 9.843880934813116e-06, + "loss": 0.2289, + "step": 3516 + }, + { + "epoch": 0.08899967102765899, + "grad_norm": 6.528325080871582, + "learning_rate": 9.843781366946367e-06, + "loss": 0.1919, + "step": 3517 + }, + { + "epoch": 0.08902497659235266, + "grad_norm": 5.681602478027344, + "learning_rate": 9.843681767842971e-06, + "loss": 0.2376, + "step": 3518 + }, + { + "epoch": 0.08905028215704633, + "grad_norm": 7.414168357849121, + "learning_rate": 9.843582137503575e-06, + "loss": 0.2086, + "step": 3519 + }, + { + "epoch": 0.08907558772174001, + "grad_norm": 6.832705974578857, + "learning_rate": 9.843482475928818e-06, + "loss": 0.2035, + "step": 3520 + }, + { + "epoch": 0.08910089328643368, + "grad_norm": 6.951013088226318, + "learning_rate": 9.843382783119345e-06, + "loss": 0.2455, + "step": 3521 + }, + { + "epoch": 0.08912619885112737, + "grad_norm": 5.095296859741211, + "learning_rate": 9.843283059075794e-06, + "loss": 0.1794, + "step": 3522 + }, + { + "epoch": 0.08915150441582104, + "grad_norm": 10.24114990234375, + "learning_rate": 9.843183303798813e-06, + "loss": 0.2752, + "step": 3523 + }, + { + "epoch": 0.08917680998051472, + "grad_norm": 7.8980913162231445, + "learning_rate": 9.843083517289044e-06, + "loss": 0.2604, + "step": 3524 + }, + { + "epoch": 0.08920211554520839, + "grad_norm": 5.466288089752197, + "learning_rate": 9.84298369954713e-06, + "loss": 0.2498, + "step": 3525 + }, + { + "epoch": 0.08922742110990206, + "grad_norm": 6.866766929626465, + "learning_rate": 9.842883850573716e-06, + "loss": 0.2366, + "step": 3526 + }, + { + "epoch": 0.08925272667459574, + "grad_norm": 9.055486679077148, + "learning_rate": 9.842783970369444e-06, + "loss": 0.2818, + "step": 3527 + }, + { + "epoch": 0.08927803223928941, + "grad_norm": 4.61727237701416, + "learning_rate": 9.84268405893496e-06, + "loss": 0.2196, + "step": 3528 + }, + { + "epoch": 0.0893033378039831, + "grad_norm": 5.137567520141602, + "learning_rate": 9.842584116270906e-06, + "loss": 0.1879, + "step": 3529 + }, + { + "epoch": 0.08932864336867677, + "grad_norm": 16.36842918395996, + "learning_rate": 9.842484142377928e-06, + "loss": 0.3144, + "step": 3530 + }, + { + "epoch": 0.08935394893337045, + "grad_norm": 5.499597072601318, + "learning_rate": 9.84238413725667e-06, + "loss": 0.2044, + "step": 3531 + }, + { + "epoch": 0.08937925449806412, + "grad_norm": 6.198857307434082, + "learning_rate": 9.842284100907777e-06, + "loss": 0.2705, + "step": 3532 + }, + { + "epoch": 0.0894045600627578, + "grad_norm": 11.14847183227539, + "learning_rate": 9.842184033331897e-06, + "loss": 0.1505, + "step": 3533 + }, + { + "epoch": 0.08942986562745148, + "grad_norm": 4.304506301879883, + "learning_rate": 9.842083934529672e-06, + "loss": 0.1695, + "step": 3534 + }, + { + "epoch": 0.08945517119214515, + "grad_norm": 7.0646538734436035, + "learning_rate": 9.841983804501748e-06, + "loss": 0.2446, + "step": 3535 + }, + { + "epoch": 0.08948047675683883, + "grad_norm": 15.983831405639648, + "learning_rate": 9.84188364324877e-06, + "loss": 0.3112, + "step": 3536 + }, + { + "epoch": 0.0895057823215325, + "grad_norm": 5.531864643096924, + "learning_rate": 9.841783450771386e-06, + "loss": 0.2019, + "step": 3537 + }, + { + "epoch": 0.08953108788622618, + "grad_norm": 4.790957450866699, + "learning_rate": 9.841683227070242e-06, + "loss": 0.1945, + "step": 3538 + }, + { + "epoch": 0.08955639345091986, + "grad_norm": 3.8905811309814453, + "learning_rate": 9.841582972145983e-06, + "loss": 0.2247, + "step": 3539 + }, + { + "epoch": 0.08958169901561354, + "grad_norm": 5.655556678771973, + "learning_rate": 9.841482685999256e-06, + "loss": 0.2392, + "step": 3540 + }, + { + "epoch": 0.08960700458030721, + "grad_norm": 15.850563049316406, + "learning_rate": 9.841382368630708e-06, + "loss": 0.1745, + "step": 3541 + }, + { + "epoch": 0.08963231014500088, + "grad_norm": 15.548408508300781, + "learning_rate": 9.841282020040985e-06, + "loss": 0.3032, + "step": 3542 + }, + { + "epoch": 0.08965761570969456, + "grad_norm": 12.599991798400879, + "learning_rate": 9.841181640230735e-06, + "loss": 0.2752, + "step": 3543 + }, + { + "epoch": 0.08968292127438823, + "grad_norm": 5.1710100173950195, + "learning_rate": 9.841081229200605e-06, + "loss": 0.1769, + "step": 3544 + }, + { + "epoch": 0.08970822683908192, + "grad_norm": 8.71914005279541, + "learning_rate": 9.840980786951243e-06, + "loss": 0.2817, + "step": 3545 + }, + { + "epoch": 0.08973353240377559, + "grad_norm": 4.33672571182251, + "learning_rate": 9.840880313483297e-06, + "loss": 0.1707, + "step": 3546 + }, + { + "epoch": 0.08975883796846927, + "grad_norm": 4.770745277404785, + "learning_rate": 9.840779808797412e-06, + "loss": 0.1948, + "step": 3547 + }, + { + "epoch": 0.08978414353316294, + "grad_norm": 12.928403854370117, + "learning_rate": 9.84067927289424e-06, + "loss": 0.3179, + "step": 3548 + }, + { + "epoch": 0.08980944909785663, + "grad_norm": 6.137838363647461, + "learning_rate": 9.840578705774428e-06, + "loss": 0.1858, + "step": 3549 + }, + { + "epoch": 0.0898347546625503, + "grad_norm": 14.446003913879395, + "learning_rate": 9.840478107438625e-06, + "loss": 0.3278, + "step": 3550 + }, + { + "epoch": 0.08986006022724397, + "grad_norm": 5.994344234466553, + "learning_rate": 9.840377477887478e-06, + "loss": 0.2305, + "step": 3551 + }, + { + "epoch": 0.08988536579193765, + "grad_norm": 5.550107479095459, + "learning_rate": 9.840276817121638e-06, + "loss": 0.1468, + "step": 3552 + }, + { + "epoch": 0.08991067135663132, + "grad_norm": 4.342374324798584, + "learning_rate": 9.840176125141751e-06, + "loss": 0.1656, + "step": 3553 + }, + { + "epoch": 0.089935976921325, + "grad_norm": 7.998355388641357, + "learning_rate": 9.840075401948471e-06, + "loss": 0.2577, + "step": 3554 + }, + { + "epoch": 0.08996128248601867, + "grad_norm": 5.581055164337158, + "learning_rate": 9.839974647542446e-06, + "loss": 0.1362, + "step": 3555 + }, + { + "epoch": 0.08998658805071236, + "grad_norm": 11.257041931152344, + "learning_rate": 9.83987386192432e-06, + "loss": 0.2386, + "step": 3556 + }, + { + "epoch": 0.09001189361540603, + "grad_norm": 11.218677520751953, + "learning_rate": 9.839773045094751e-06, + "loss": 0.2994, + "step": 3557 + }, + { + "epoch": 0.0900371991800997, + "grad_norm": 5.660687446594238, + "learning_rate": 9.839672197054387e-06, + "loss": 0.1628, + "step": 3558 + }, + { + "epoch": 0.09006250474479338, + "grad_norm": 5.548017978668213, + "learning_rate": 9.839571317803876e-06, + "loss": 0.2647, + "step": 3559 + }, + { + "epoch": 0.09008781030948705, + "grad_norm": 8.58993148803711, + "learning_rate": 9.839470407343868e-06, + "loss": 0.2, + "step": 3560 + }, + { + "epoch": 0.09011311587418074, + "grad_norm": 8.227189064025879, + "learning_rate": 9.839369465675017e-06, + "loss": 0.1551, + "step": 3561 + }, + { + "epoch": 0.0901384214388744, + "grad_norm": 5.10697078704834, + "learning_rate": 9.839268492797973e-06, + "loss": 0.171, + "step": 3562 + }, + { + "epoch": 0.09016372700356809, + "grad_norm": 3.790140390396118, + "learning_rate": 9.839167488713388e-06, + "loss": 0.1646, + "step": 3563 + }, + { + "epoch": 0.09018903256826176, + "grad_norm": 4.371312141418457, + "learning_rate": 9.839066453421908e-06, + "loss": 0.151, + "step": 3564 + }, + { + "epoch": 0.09021433813295544, + "grad_norm": 7.191296100616455, + "learning_rate": 9.838965386924192e-06, + "loss": 0.2434, + "step": 3565 + }, + { + "epoch": 0.09023964369764911, + "grad_norm": 4.596219062805176, + "learning_rate": 9.838864289220887e-06, + "loss": 0.1276, + "step": 3566 + }, + { + "epoch": 0.09026494926234278, + "grad_norm": 5.856167793273926, + "learning_rate": 9.838763160312647e-06, + "loss": 0.1752, + "step": 3567 + }, + { + "epoch": 0.09029025482703647, + "grad_norm": 5.64872407913208, + "learning_rate": 9.838662000200124e-06, + "loss": 0.1996, + "step": 3568 + }, + { + "epoch": 0.09031556039173014, + "grad_norm": 10.160511016845703, + "learning_rate": 9.838560808883967e-06, + "loss": 0.1876, + "step": 3569 + }, + { + "epoch": 0.09034086595642382, + "grad_norm": 5.823670387268066, + "learning_rate": 9.838459586364833e-06, + "loss": 0.1346, + "step": 3570 + }, + { + "epoch": 0.09036617152111749, + "grad_norm": 6.43755578994751, + "learning_rate": 9.838358332643375e-06, + "loss": 0.2291, + "step": 3571 + }, + { + "epoch": 0.09039147708581118, + "grad_norm": 9.097649574279785, + "learning_rate": 9.838257047720241e-06, + "loss": 0.2513, + "step": 3572 + }, + { + "epoch": 0.09041678265050485, + "grad_norm": 5.899560451507568, + "learning_rate": 9.838155731596089e-06, + "loss": 0.1518, + "step": 3573 + }, + { + "epoch": 0.09044208821519852, + "grad_norm": 14.265214920043945, + "learning_rate": 9.838054384271571e-06, + "loss": 0.3138, + "step": 3574 + }, + { + "epoch": 0.0904673937798922, + "grad_norm": 9.625226020812988, + "learning_rate": 9.83795300574734e-06, + "loss": 0.2348, + "step": 3575 + }, + { + "epoch": 0.09049269934458587, + "grad_norm": 5.363582611083984, + "learning_rate": 9.837851596024048e-06, + "loss": 0.1643, + "step": 3576 + }, + { + "epoch": 0.09051800490927955, + "grad_norm": 12.352063179016113, + "learning_rate": 9.837750155102354e-06, + "loss": 0.3323, + "step": 3577 + }, + { + "epoch": 0.09054331047397322, + "grad_norm": 5.048472881317139, + "learning_rate": 9.837648682982908e-06, + "loss": 0.184, + "step": 3578 + }, + { + "epoch": 0.09056861603866691, + "grad_norm": 12.993204116821289, + "learning_rate": 9.837547179666365e-06, + "loss": 0.1518, + "step": 3579 + }, + { + "epoch": 0.09059392160336058, + "grad_norm": 8.753366470336914, + "learning_rate": 9.83744564515338e-06, + "loss": 0.1585, + "step": 3580 + }, + { + "epoch": 0.09061922716805426, + "grad_norm": 10.128610610961914, + "learning_rate": 9.837344079444608e-06, + "loss": 0.1933, + "step": 3581 + }, + { + "epoch": 0.09064453273274793, + "grad_norm": 6.892246246337891, + "learning_rate": 9.837242482540704e-06, + "loss": 0.234, + "step": 3582 + }, + { + "epoch": 0.0906698382974416, + "grad_norm": 5.253361701965332, + "learning_rate": 9.837140854442323e-06, + "loss": 0.1704, + "step": 3583 + }, + { + "epoch": 0.09069514386213529, + "grad_norm": 8.059123992919922, + "learning_rate": 9.83703919515012e-06, + "loss": 0.2148, + "step": 3584 + }, + { + "epoch": 0.09072044942682896, + "grad_norm": 14.951807022094727, + "learning_rate": 9.836937504664753e-06, + "loss": 0.2931, + "step": 3585 + }, + { + "epoch": 0.09074575499152264, + "grad_norm": 9.142621040344238, + "learning_rate": 9.836835782986873e-06, + "loss": 0.2692, + "step": 3586 + }, + { + "epoch": 0.09077106055621631, + "grad_norm": 3.9698309898376465, + "learning_rate": 9.83673403011714e-06, + "loss": 0.1649, + "step": 3587 + }, + { + "epoch": 0.09079636612091, + "grad_norm": 8.022318840026855, + "learning_rate": 9.83663224605621e-06, + "loss": 0.231, + "step": 3588 + }, + { + "epoch": 0.09082167168560366, + "grad_norm": 5.606048583984375, + "learning_rate": 9.836530430804737e-06, + "loss": 0.1524, + "step": 3589 + }, + { + "epoch": 0.09084697725029733, + "grad_norm": 3.066012144088745, + "learning_rate": 9.836428584363378e-06, + "loss": 0.0941, + "step": 3590 + }, + { + "epoch": 0.09087228281499102, + "grad_norm": 10.419405937194824, + "learning_rate": 9.836326706732792e-06, + "loss": 0.2381, + "step": 3591 + }, + { + "epoch": 0.09089758837968469, + "grad_norm": 12.40335464477539, + "learning_rate": 9.836224797913635e-06, + "loss": 0.1038, + "step": 3592 + }, + { + "epoch": 0.09092289394437837, + "grad_norm": 18.0, + "learning_rate": 9.836122857906562e-06, + "loss": 0.3405, + "step": 3593 + }, + { + "epoch": 0.09094819950907204, + "grad_norm": 5.198750972747803, + "learning_rate": 9.836020886712235e-06, + "loss": 0.186, + "step": 3594 + }, + { + "epoch": 0.09097350507376573, + "grad_norm": 14.488317489624023, + "learning_rate": 9.835918884331307e-06, + "loss": 0.2967, + "step": 3595 + }, + { + "epoch": 0.0909988106384594, + "grad_norm": 10.76805591583252, + "learning_rate": 9.83581685076444e-06, + "loss": 0.2475, + "step": 3596 + }, + { + "epoch": 0.09102411620315308, + "grad_norm": 5.08927059173584, + "learning_rate": 9.835714786012287e-06, + "loss": 0.2069, + "step": 3597 + }, + { + "epoch": 0.09104942176784675, + "grad_norm": 6.4681291580200195, + "learning_rate": 9.835612690075508e-06, + "loss": 0.2253, + "step": 3598 + }, + { + "epoch": 0.09107472733254042, + "grad_norm": 5.744680404663086, + "learning_rate": 9.835510562954765e-06, + "loss": 0.2176, + "step": 3599 + }, + { + "epoch": 0.0911000328972341, + "grad_norm": 8.359039306640625, + "learning_rate": 9.835408404650713e-06, + "loss": 0.2221, + "step": 3600 + }, + { + "epoch": 0.09112533846192777, + "grad_norm": 12.798986434936523, + "learning_rate": 9.835306215164013e-06, + "loss": 0.3134, + "step": 3601 + }, + { + "epoch": 0.09115064402662146, + "grad_norm": 10.31217098236084, + "learning_rate": 9.83520399449532e-06, + "loss": 0.1837, + "step": 3602 + }, + { + "epoch": 0.09117594959131513, + "grad_norm": 6.932554244995117, + "learning_rate": 9.835101742645296e-06, + "loss": 0.2802, + "step": 3603 + }, + { + "epoch": 0.09120125515600881, + "grad_norm": 8.750962257385254, + "learning_rate": 9.834999459614603e-06, + "loss": 0.2345, + "step": 3604 + }, + { + "epoch": 0.09122656072070248, + "grad_norm": 5.7711181640625, + "learning_rate": 9.834897145403895e-06, + "loss": 0.1782, + "step": 3605 + }, + { + "epoch": 0.09125186628539615, + "grad_norm": 38.001365661621094, + "learning_rate": 9.834794800013836e-06, + "loss": 0.2717, + "step": 3606 + }, + { + "epoch": 0.09127717185008984, + "grad_norm": 7.948178768157959, + "learning_rate": 9.834692423445085e-06, + "loss": 0.3555, + "step": 3607 + }, + { + "epoch": 0.0913024774147835, + "grad_norm": 8.003990173339844, + "learning_rate": 9.834590015698302e-06, + "loss": 0.2419, + "step": 3608 + }, + { + "epoch": 0.09132778297947719, + "grad_norm": 8.92911148071289, + "learning_rate": 9.834487576774146e-06, + "loss": 0.1911, + "step": 3609 + }, + { + "epoch": 0.09135308854417086, + "grad_norm": 4.481069087982178, + "learning_rate": 9.83438510667328e-06, + "loss": 0.1554, + "step": 3610 + }, + { + "epoch": 0.09137839410886454, + "grad_norm": 4.6610283851623535, + "learning_rate": 9.834282605396364e-06, + "loss": 0.1412, + "step": 3611 + }, + { + "epoch": 0.09140369967355821, + "grad_norm": 7.9553656578063965, + "learning_rate": 9.834180072944057e-06, + "loss": 0.245, + "step": 3612 + }, + { + "epoch": 0.0914290052382519, + "grad_norm": 9.313277244567871, + "learning_rate": 9.834077509317023e-06, + "loss": 0.1898, + "step": 3613 + }, + { + "epoch": 0.09145431080294557, + "grad_norm": 7.091104030609131, + "learning_rate": 9.833974914515923e-06, + "loss": 0.1737, + "step": 3614 + }, + { + "epoch": 0.09147961636763924, + "grad_norm": 6.63979959487915, + "learning_rate": 9.833872288541417e-06, + "loss": 0.1361, + "step": 3615 + }, + { + "epoch": 0.09150492193233292, + "grad_norm": 6.320712089538574, + "learning_rate": 9.833769631394167e-06, + "loss": 0.2653, + "step": 3616 + }, + { + "epoch": 0.09153022749702659, + "grad_norm": 10.441217422485352, + "learning_rate": 9.833666943074837e-06, + "loss": 0.1832, + "step": 3617 + }, + { + "epoch": 0.09155553306172028, + "grad_norm": 6.598591327667236, + "learning_rate": 9.833564223584089e-06, + "loss": 0.1524, + "step": 3618 + }, + { + "epoch": 0.09158083862641395, + "grad_norm": 5.716564178466797, + "learning_rate": 9.833461472922582e-06, + "loss": 0.2339, + "step": 3619 + }, + { + "epoch": 0.09160614419110763, + "grad_norm": 4.940080642700195, + "learning_rate": 9.833358691090983e-06, + "loss": 0.1792, + "step": 3620 + }, + { + "epoch": 0.0916314497558013, + "grad_norm": 6.380446434020996, + "learning_rate": 9.833255878089952e-06, + "loss": 0.2366, + "step": 3621 + }, + { + "epoch": 0.09165675532049497, + "grad_norm": 9.262063026428223, + "learning_rate": 9.833153033920153e-06, + "loss": 0.307, + "step": 3622 + }, + { + "epoch": 0.09168206088518865, + "grad_norm": 4.386521816253662, + "learning_rate": 9.83305015858225e-06, + "loss": 0.1931, + "step": 3623 + }, + { + "epoch": 0.09170736644988232, + "grad_norm": 6.126816272735596, + "learning_rate": 9.832947252076906e-06, + "loss": 0.1637, + "step": 3624 + }, + { + "epoch": 0.09173267201457601, + "grad_norm": 13.484124183654785, + "learning_rate": 9.832844314404782e-06, + "loss": 0.2921, + "step": 3625 + }, + { + "epoch": 0.09175797757926968, + "grad_norm": 17.815488815307617, + "learning_rate": 9.832741345566544e-06, + "loss": 0.2367, + "step": 3626 + }, + { + "epoch": 0.09178328314396336, + "grad_norm": 7.142099857330322, + "learning_rate": 9.832638345562858e-06, + "loss": 0.2137, + "step": 3627 + }, + { + "epoch": 0.09180858870865703, + "grad_norm": 3.8723413944244385, + "learning_rate": 9.832535314394385e-06, + "loss": 0.1664, + "step": 3628 + }, + { + "epoch": 0.09183389427335072, + "grad_norm": 6.9196648597717285, + "learning_rate": 9.832432252061789e-06, + "loss": 0.2056, + "step": 3629 + }, + { + "epoch": 0.09185919983804439, + "grad_norm": 6.3898749351501465, + "learning_rate": 9.83232915856574e-06, + "loss": 0.2137, + "step": 3630 + }, + { + "epoch": 0.09188450540273806, + "grad_norm": 16.43877601623535, + "learning_rate": 9.832226033906898e-06, + "loss": 0.2155, + "step": 3631 + }, + { + "epoch": 0.09190981096743174, + "grad_norm": 3.4994235038757324, + "learning_rate": 9.832122878085928e-06, + "loss": 0.2338, + "step": 3632 + }, + { + "epoch": 0.09193511653212541, + "grad_norm": 13.465428352355957, + "learning_rate": 9.832019691103496e-06, + "loss": 0.2563, + "step": 3633 + }, + { + "epoch": 0.0919604220968191, + "grad_norm": 6.320196151733398, + "learning_rate": 9.83191647296027e-06, + "loss": 0.1589, + "step": 3634 + }, + { + "epoch": 0.09198572766151276, + "grad_norm": 7.677237033843994, + "learning_rate": 9.831813223656912e-06, + "loss": 0.2975, + "step": 3635 + }, + { + "epoch": 0.09201103322620645, + "grad_norm": 7.325241565704346, + "learning_rate": 9.83170994319409e-06, + "loss": 0.2528, + "step": 3636 + }, + { + "epoch": 0.09203633879090012, + "grad_norm": 6.555890083312988, + "learning_rate": 9.831606631572467e-06, + "loss": 0.1895, + "step": 3637 + }, + { + "epoch": 0.09206164435559379, + "grad_norm": 11.187371253967285, + "learning_rate": 9.831503288792714e-06, + "loss": 0.2955, + "step": 3638 + }, + { + "epoch": 0.09208694992028747, + "grad_norm": 9.556686401367188, + "learning_rate": 9.831399914855495e-06, + "loss": 0.2978, + "step": 3639 + }, + { + "epoch": 0.09211225548498114, + "grad_norm": 5.8685688972473145, + "learning_rate": 9.831296509761476e-06, + "loss": 0.288, + "step": 3640 + }, + { + "epoch": 0.09213756104967483, + "grad_norm": 5.447117805480957, + "learning_rate": 9.831193073511326e-06, + "loss": 0.1871, + "step": 3641 + }, + { + "epoch": 0.0921628666143685, + "grad_norm": 7.287755966186523, + "learning_rate": 9.831089606105708e-06, + "loss": 0.2236, + "step": 3642 + }, + { + "epoch": 0.09218817217906218, + "grad_norm": 5.188628673553467, + "learning_rate": 9.830986107545293e-06, + "loss": 0.168, + "step": 3643 + }, + { + "epoch": 0.09221347774375585, + "grad_norm": 4.268647193908691, + "learning_rate": 9.830882577830747e-06, + "loss": 0.1853, + "step": 3644 + }, + { + "epoch": 0.09223878330844953, + "grad_norm": 6.309906005859375, + "learning_rate": 9.830779016962738e-06, + "loss": 0.1598, + "step": 3645 + }, + { + "epoch": 0.0922640888731432, + "grad_norm": 11.090214729309082, + "learning_rate": 9.830675424941934e-06, + "loss": 0.2581, + "step": 3646 + }, + { + "epoch": 0.09228939443783687, + "grad_norm": 5.007235050201416, + "learning_rate": 9.830571801769002e-06, + "loss": 0.2286, + "step": 3647 + }, + { + "epoch": 0.09231470000253056, + "grad_norm": 4.0128397941589355, + "learning_rate": 9.830468147444612e-06, + "loss": 0.2168, + "step": 3648 + }, + { + "epoch": 0.09234000556722423, + "grad_norm": 6.8898515701293945, + "learning_rate": 9.83036446196943e-06, + "loss": 0.2191, + "step": 3649 + }, + { + "epoch": 0.09236531113191791, + "grad_norm": 15.053409576416016, + "learning_rate": 9.830260745344128e-06, + "loss": 0.1992, + "step": 3650 + }, + { + "epoch": 0.09239061669661158, + "grad_norm": 9.335506439208984, + "learning_rate": 9.830156997569371e-06, + "loss": 0.2267, + "step": 3651 + }, + { + "epoch": 0.09241592226130527, + "grad_norm": 9.657772064208984, + "learning_rate": 9.830053218645832e-06, + "loss": 0.2822, + "step": 3652 + }, + { + "epoch": 0.09244122782599894, + "grad_norm": 5.143727779388428, + "learning_rate": 9.829949408574177e-06, + "loss": 0.2071, + "step": 3653 + }, + { + "epoch": 0.0924665333906926, + "grad_norm": 4.240163326263428, + "learning_rate": 9.829845567355077e-06, + "loss": 0.142, + "step": 3654 + }, + { + "epoch": 0.09249183895538629, + "grad_norm": 4.4107465744018555, + "learning_rate": 9.8297416949892e-06, + "loss": 0.1746, + "step": 3655 + }, + { + "epoch": 0.09251714452007996, + "grad_norm": 8.228323936462402, + "learning_rate": 9.82963779147722e-06, + "loss": 0.2119, + "step": 3656 + }, + { + "epoch": 0.09254245008477364, + "grad_norm": 12.668785095214844, + "learning_rate": 9.829533856819803e-06, + "loss": 0.2879, + "step": 3657 + }, + { + "epoch": 0.09256775564946731, + "grad_norm": 4.263275623321533, + "learning_rate": 9.82942989101762e-06, + "loss": 0.2343, + "step": 3658 + }, + { + "epoch": 0.092593061214161, + "grad_norm": 6.235024929046631, + "learning_rate": 9.829325894071342e-06, + "loss": 0.1975, + "step": 3659 + }, + { + "epoch": 0.09261836677885467, + "grad_norm": 7.853206634521484, + "learning_rate": 9.829221865981642e-06, + "loss": 0.295, + "step": 3660 + }, + { + "epoch": 0.09264367234354835, + "grad_norm": 3.7035205364227295, + "learning_rate": 9.829117806749185e-06, + "loss": 0.1062, + "step": 3661 + }, + { + "epoch": 0.09266897790824202, + "grad_norm": 5.276095867156982, + "learning_rate": 9.829013716374647e-06, + "loss": 0.2678, + "step": 3662 + }, + { + "epoch": 0.09269428347293569, + "grad_norm": 9.104588508605957, + "learning_rate": 9.828909594858697e-06, + "loss": 0.2961, + "step": 3663 + }, + { + "epoch": 0.09271958903762938, + "grad_norm": 9.024900436401367, + "learning_rate": 9.82880544220201e-06, + "loss": 0.2747, + "step": 3664 + }, + { + "epoch": 0.09274489460232305, + "grad_norm": 6.795555591583252, + "learning_rate": 9.828701258405252e-06, + "loss": 0.2746, + "step": 3665 + }, + { + "epoch": 0.09277020016701673, + "grad_norm": 7.8778767585754395, + "learning_rate": 9.828597043469098e-06, + "loss": 0.2254, + "step": 3666 + }, + { + "epoch": 0.0927955057317104, + "grad_norm": 7.850258827209473, + "learning_rate": 9.828492797394222e-06, + "loss": 0.1885, + "step": 3667 + }, + { + "epoch": 0.09282081129640408, + "grad_norm": 5.703213691711426, + "learning_rate": 9.828388520181293e-06, + "loss": 0.2335, + "step": 3668 + }, + { + "epoch": 0.09284611686109775, + "grad_norm": 5.002734184265137, + "learning_rate": 9.828284211830983e-06, + "loss": 0.2225, + "step": 3669 + }, + { + "epoch": 0.09287142242579143, + "grad_norm": 6.116297721862793, + "learning_rate": 9.828179872343968e-06, + "loss": 0.2325, + "step": 3670 + }, + { + "epoch": 0.09289672799048511, + "grad_norm": 9.621167182922363, + "learning_rate": 9.82807550172092e-06, + "loss": 0.391, + "step": 3671 + }, + { + "epoch": 0.09292203355517878, + "grad_norm": 10.073203086853027, + "learning_rate": 9.827971099962509e-06, + "loss": 0.2924, + "step": 3672 + }, + { + "epoch": 0.09294733911987246, + "grad_norm": 5.964564800262451, + "learning_rate": 9.827866667069412e-06, + "loss": 0.2317, + "step": 3673 + }, + { + "epoch": 0.09297264468456613, + "grad_norm": 5.242971420288086, + "learning_rate": 9.827762203042299e-06, + "loss": 0.1927, + "step": 3674 + }, + { + "epoch": 0.09299795024925982, + "grad_norm": 5.50260591506958, + "learning_rate": 9.827657707881848e-06, + "loss": 0.21, + "step": 3675 + }, + { + "epoch": 0.09302325581395349, + "grad_norm": 7.057523727416992, + "learning_rate": 9.82755318158873e-06, + "loss": 0.2102, + "step": 3676 + }, + { + "epoch": 0.09304856137864717, + "grad_norm": 7.154678821563721, + "learning_rate": 9.827448624163618e-06, + "loss": 0.2779, + "step": 3677 + }, + { + "epoch": 0.09307386694334084, + "grad_norm": 3.719278335571289, + "learning_rate": 9.827344035607188e-06, + "loss": 0.1493, + "step": 3678 + }, + { + "epoch": 0.09309917250803451, + "grad_norm": 4.675348281860352, + "learning_rate": 9.827239415920116e-06, + "loss": 0.1109, + "step": 3679 + }, + { + "epoch": 0.0931244780727282, + "grad_norm": 5.279188632965088, + "learning_rate": 9.827134765103073e-06, + "loss": 0.2071, + "step": 3680 + }, + { + "epoch": 0.09314978363742187, + "grad_norm": 7.764355659484863, + "learning_rate": 9.827030083156736e-06, + "loss": 0.1533, + "step": 3681 + }, + { + "epoch": 0.09317508920211555, + "grad_norm": 6.872541904449463, + "learning_rate": 9.826925370081782e-06, + "loss": 0.1798, + "step": 3682 + }, + { + "epoch": 0.09320039476680922, + "grad_norm": 4.534554481506348, + "learning_rate": 9.82682062587888e-06, + "loss": 0.164, + "step": 3683 + }, + { + "epoch": 0.0932257003315029, + "grad_norm": 9.536673545837402, + "learning_rate": 9.826715850548713e-06, + "loss": 0.2366, + "step": 3684 + }, + { + "epoch": 0.09325100589619657, + "grad_norm": 5.895775318145752, + "learning_rate": 9.826611044091953e-06, + "loss": 0.2137, + "step": 3685 + }, + { + "epoch": 0.09327631146089024, + "grad_norm": 4.2042765617370605, + "learning_rate": 9.826506206509277e-06, + "loss": 0.132, + "step": 3686 + }, + { + "epoch": 0.09330161702558393, + "grad_norm": 4.165465354919434, + "learning_rate": 9.82640133780136e-06, + "loss": 0.1459, + "step": 3687 + }, + { + "epoch": 0.0933269225902776, + "grad_norm": 6.0015363693237305, + "learning_rate": 9.826296437968877e-06, + "loss": 0.1959, + "step": 3688 + }, + { + "epoch": 0.09335222815497128, + "grad_norm": 15.178537368774414, + "learning_rate": 9.826191507012505e-06, + "loss": 0.1403, + "step": 3689 + }, + { + "epoch": 0.09337753371966495, + "grad_norm": 8.685749053955078, + "learning_rate": 9.826086544932925e-06, + "loss": 0.192, + "step": 3690 + }, + { + "epoch": 0.09340283928435864, + "grad_norm": 7.194304466247559, + "learning_rate": 9.825981551730807e-06, + "loss": 0.3129, + "step": 3691 + }, + { + "epoch": 0.0934281448490523, + "grad_norm": 10.110260963439941, + "learning_rate": 9.825876527406834e-06, + "loss": 0.2773, + "step": 3692 + }, + { + "epoch": 0.09345345041374599, + "grad_norm": 3.4085006713867188, + "learning_rate": 9.825771471961682e-06, + "loss": 0.1316, + "step": 3693 + }, + { + "epoch": 0.09347875597843966, + "grad_norm": 11.260897636413574, + "learning_rate": 9.825666385396027e-06, + "loss": 0.2532, + "step": 3694 + }, + { + "epoch": 0.09350406154313333, + "grad_norm": 17.247547149658203, + "learning_rate": 9.825561267710544e-06, + "loss": 0.2632, + "step": 3695 + }, + { + "epoch": 0.09352936710782701, + "grad_norm": 4.803816795349121, + "learning_rate": 9.825456118905917e-06, + "loss": 0.1982, + "step": 3696 + }, + { + "epoch": 0.09355467267252068, + "grad_norm": 5.810074329376221, + "learning_rate": 9.825350938982821e-06, + "loss": 0.1756, + "step": 3697 + }, + { + "epoch": 0.09357997823721437, + "grad_norm": 5.091504096984863, + "learning_rate": 9.825245727941933e-06, + "loss": 0.2276, + "step": 3698 + }, + { + "epoch": 0.09360528380190804, + "grad_norm": 12.498852729797363, + "learning_rate": 9.825140485783934e-06, + "loss": 0.1585, + "step": 3699 + }, + { + "epoch": 0.09363058936660172, + "grad_norm": 6.397273540496826, + "learning_rate": 9.825035212509502e-06, + "loss": 0.2128, + "step": 3700 + }, + { + "epoch": 0.09365589493129539, + "grad_norm": 12.472820281982422, + "learning_rate": 9.824929908119314e-06, + "loss": 0.2367, + "step": 3701 + }, + { + "epoch": 0.09368120049598906, + "grad_norm": 3.89528226852417, + "learning_rate": 9.82482457261405e-06, + "loss": 0.1567, + "step": 3702 + }, + { + "epoch": 0.09370650606068275, + "grad_norm": 7.650807857513428, + "learning_rate": 9.82471920599439e-06, + "loss": 0.2625, + "step": 3703 + }, + { + "epoch": 0.09373181162537642, + "grad_norm": 4.525721073150635, + "learning_rate": 9.824613808261015e-06, + "loss": 0.1413, + "step": 3704 + }, + { + "epoch": 0.0937571171900701, + "grad_norm": 16.547515869140625, + "learning_rate": 9.824508379414603e-06, + "loss": 0.2374, + "step": 3705 + }, + { + "epoch": 0.09378242275476377, + "grad_norm": 5.249958515167236, + "learning_rate": 9.824402919455834e-06, + "loss": 0.2528, + "step": 3706 + }, + { + "epoch": 0.09380772831945745, + "grad_norm": 3.9673542976379395, + "learning_rate": 9.824297428385385e-06, + "loss": 0.1678, + "step": 3707 + }, + { + "epoch": 0.09383303388415112, + "grad_norm": 7.86146354675293, + "learning_rate": 9.824191906203942e-06, + "loss": 0.3533, + "step": 3708 + }, + { + "epoch": 0.09385833944884481, + "grad_norm": 9.00036334991455, + "learning_rate": 9.824086352912183e-06, + "loss": 0.2376, + "step": 3709 + }, + { + "epoch": 0.09388364501353848, + "grad_norm": 12.194886207580566, + "learning_rate": 9.823980768510787e-06, + "loss": 0.3771, + "step": 3710 + }, + { + "epoch": 0.09390895057823215, + "grad_norm": 6.601235866546631, + "learning_rate": 9.823875153000438e-06, + "loss": 0.2189, + "step": 3711 + }, + { + "epoch": 0.09393425614292583, + "grad_norm": 7.414129734039307, + "learning_rate": 9.823769506381813e-06, + "loss": 0.1239, + "step": 3712 + }, + { + "epoch": 0.0939595617076195, + "grad_norm": 8.48491382598877, + "learning_rate": 9.823663828655598e-06, + "loss": 0.301, + "step": 3713 + }, + { + "epoch": 0.09398486727231319, + "grad_norm": 4.22186803817749, + "learning_rate": 9.82355811982247e-06, + "loss": 0.165, + "step": 3714 + }, + { + "epoch": 0.09401017283700686, + "grad_norm": 4.295551300048828, + "learning_rate": 9.823452379883115e-06, + "loss": 0.1695, + "step": 3715 + }, + { + "epoch": 0.09403547840170054, + "grad_norm": 7.132841110229492, + "learning_rate": 9.823346608838213e-06, + "loss": 0.199, + "step": 3716 + }, + { + "epoch": 0.09406078396639421, + "grad_norm": 7.777706623077393, + "learning_rate": 9.823240806688445e-06, + "loss": 0.2006, + "step": 3717 + }, + { + "epoch": 0.09408608953108788, + "grad_norm": 8.581802368164062, + "learning_rate": 9.823134973434494e-06, + "loss": 0.2843, + "step": 3718 + }, + { + "epoch": 0.09411139509578156, + "grad_norm": 4.1896162033081055, + "learning_rate": 9.823029109077042e-06, + "loss": 0.1617, + "step": 3719 + }, + { + "epoch": 0.09413670066047523, + "grad_norm": 10.52326488494873, + "learning_rate": 9.822923213616774e-06, + "loss": 0.2842, + "step": 3720 + }, + { + "epoch": 0.09416200622516892, + "grad_norm": 5.307373046875, + "learning_rate": 9.82281728705437e-06, + "loss": 0.2074, + "step": 3721 + }, + { + "epoch": 0.09418731178986259, + "grad_norm": 3.948312282562256, + "learning_rate": 9.822711329390515e-06, + "loss": 0.146, + "step": 3722 + }, + { + "epoch": 0.09421261735455627, + "grad_norm": 5.6223626136779785, + "learning_rate": 9.822605340625892e-06, + "loss": 0.1977, + "step": 3723 + }, + { + "epoch": 0.09423792291924994, + "grad_norm": 2.8554344177246094, + "learning_rate": 9.822499320761186e-06, + "loss": 0.1701, + "step": 3724 + }, + { + "epoch": 0.09426322848394363, + "grad_norm": 8.006601333618164, + "learning_rate": 9.822393269797075e-06, + "loss": 0.145, + "step": 3725 + }, + { + "epoch": 0.0942885340486373, + "grad_norm": 8.546895980834961, + "learning_rate": 9.82228718773425e-06, + "loss": 0.2466, + "step": 3726 + }, + { + "epoch": 0.09431383961333097, + "grad_norm": 7.874481201171875, + "learning_rate": 9.82218107457339e-06, + "loss": 0.2804, + "step": 3727 + }, + { + "epoch": 0.09433914517802465, + "grad_norm": 10.834612846374512, + "learning_rate": 9.822074930315183e-06, + "loss": 0.2086, + "step": 3728 + }, + { + "epoch": 0.09436445074271832, + "grad_norm": 8.4298734664917, + "learning_rate": 9.821968754960311e-06, + "loss": 0.1517, + "step": 3729 + }, + { + "epoch": 0.094389756307412, + "grad_norm": 7.269416332244873, + "learning_rate": 9.821862548509459e-06, + "loss": 0.2101, + "step": 3730 + }, + { + "epoch": 0.09441506187210567, + "grad_norm": 3.2244317531585693, + "learning_rate": 9.821756310963313e-06, + "loss": 0.1275, + "step": 3731 + }, + { + "epoch": 0.09444036743679936, + "grad_norm": 5.868415832519531, + "learning_rate": 9.821650042322556e-06, + "loss": 0.176, + "step": 3732 + }, + { + "epoch": 0.09446567300149303, + "grad_norm": 9.427703857421875, + "learning_rate": 9.821543742587876e-06, + "loss": 0.3266, + "step": 3733 + }, + { + "epoch": 0.0944909785661867, + "grad_norm": 6.681222915649414, + "learning_rate": 9.821437411759958e-06, + "loss": 0.2781, + "step": 3734 + }, + { + "epoch": 0.09451628413088038, + "grad_norm": 6.194453239440918, + "learning_rate": 9.821331049839486e-06, + "loss": 0.1516, + "step": 3735 + }, + { + "epoch": 0.09454158969557405, + "grad_norm": 6.509840488433838, + "learning_rate": 9.821224656827146e-06, + "loss": 0.2105, + "step": 3736 + }, + { + "epoch": 0.09456689526026774, + "grad_norm": 10.319588661193848, + "learning_rate": 9.821118232723628e-06, + "loss": 0.248, + "step": 3737 + }, + { + "epoch": 0.0945922008249614, + "grad_norm": 6.907766342163086, + "learning_rate": 9.821011777529613e-06, + "loss": 0.2945, + "step": 3738 + }, + { + "epoch": 0.09461750638965509, + "grad_norm": 3.716604471206665, + "learning_rate": 9.82090529124579e-06, + "loss": 0.0957, + "step": 3739 + }, + { + "epoch": 0.09464281195434876, + "grad_norm": 13.558648109436035, + "learning_rate": 9.820798773872847e-06, + "loss": 0.2477, + "step": 3740 + }, + { + "epoch": 0.09466811751904244, + "grad_norm": 12.226629257202148, + "learning_rate": 9.820692225411466e-06, + "loss": 0.1792, + "step": 3741 + }, + { + "epoch": 0.09469342308373611, + "grad_norm": 3.925023078918457, + "learning_rate": 9.82058564586234e-06, + "loss": 0.1027, + "step": 3742 + }, + { + "epoch": 0.09471872864842978, + "grad_norm": 4.199975490570068, + "learning_rate": 9.820479035226153e-06, + "loss": 0.1876, + "step": 3743 + }, + { + "epoch": 0.09474403421312347, + "grad_norm": 6.541990756988525, + "learning_rate": 9.820372393503594e-06, + "loss": 0.2145, + "step": 3744 + }, + { + "epoch": 0.09476933977781714, + "grad_norm": 6.874530792236328, + "learning_rate": 9.820265720695351e-06, + "loss": 0.184, + "step": 3745 + }, + { + "epoch": 0.09479464534251082, + "grad_norm": 8.945901870727539, + "learning_rate": 9.82015901680211e-06, + "loss": 0.2454, + "step": 3746 + }, + { + "epoch": 0.09481995090720449, + "grad_norm": 4.493734836578369, + "learning_rate": 9.82005228182456e-06, + "loss": 0.2171, + "step": 3747 + }, + { + "epoch": 0.09484525647189818, + "grad_norm": 12.707803726196289, + "learning_rate": 9.81994551576339e-06, + "loss": 0.2514, + "step": 3748 + }, + { + "epoch": 0.09487056203659185, + "grad_norm": 7.991660118103027, + "learning_rate": 9.819838718619287e-06, + "loss": 0.2789, + "step": 3749 + }, + { + "epoch": 0.09489586760128552, + "grad_norm": 7.4843668937683105, + "learning_rate": 9.819731890392942e-06, + "loss": 0.159, + "step": 3750 + }, + { + "epoch": 0.0949211731659792, + "grad_norm": 10.575857162475586, + "learning_rate": 9.819625031085042e-06, + "loss": 0.2243, + "step": 3751 + }, + { + "epoch": 0.09494647873067287, + "grad_norm": 9.406279563903809, + "learning_rate": 9.819518140696276e-06, + "loss": 0.256, + "step": 3752 + }, + { + "epoch": 0.09497178429536655, + "grad_norm": 9.235018730163574, + "learning_rate": 9.819411219227336e-06, + "loss": 0.2889, + "step": 3753 + }, + { + "epoch": 0.09499708986006022, + "grad_norm": 5.1013360023498535, + "learning_rate": 9.819304266678906e-06, + "loss": 0.1747, + "step": 3754 + }, + { + "epoch": 0.09502239542475391, + "grad_norm": 34.742645263671875, + "learning_rate": 9.819197283051683e-06, + "loss": 0.2062, + "step": 3755 + }, + { + "epoch": 0.09504770098944758, + "grad_norm": 15.442800521850586, + "learning_rate": 9.819090268346351e-06, + "loss": 0.188, + "step": 3756 + }, + { + "epoch": 0.09507300655414126, + "grad_norm": 13.731197357177734, + "learning_rate": 9.818983222563603e-06, + "loss": 0.2858, + "step": 3757 + }, + { + "epoch": 0.09509831211883493, + "grad_norm": 5.071784973144531, + "learning_rate": 9.818876145704129e-06, + "loss": 0.1756, + "step": 3758 + }, + { + "epoch": 0.0951236176835286, + "grad_norm": 7.800654888153076, + "learning_rate": 9.818769037768618e-06, + "loss": 0.1503, + "step": 3759 + }, + { + "epoch": 0.09514892324822229, + "grad_norm": 6.88004732131958, + "learning_rate": 9.818661898757763e-06, + "loss": 0.2553, + "step": 3760 + }, + { + "epoch": 0.09517422881291596, + "grad_norm": 23.822378158569336, + "learning_rate": 9.818554728672252e-06, + "loss": 0.3608, + "step": 3761 + }, + { + "epoch": 0.09519953437760964, + "grad_norm": 8.157581329345703, + "learning_rate": 9.818447527512778e-06, + "loss": 0.2194, + "step": 3762 + }, + { + "epoch": 0.09522483994230331, + "grad_norm": 12.43401050567627, + "learning_rate": 9.818340295280034e-06, + "loss": 0.2542, + "step": 3763 + }, + { + "epoch": 0.095250145506997, + "grad_norm": 10.9425630569458, + "learning_rate": 9.818233031974708e-06, + "loss": 0.2347, + "step": 3764 + }, + { + "epoch": 0.09527545107169066, + "grad_norm": 11.24193000793457, + "learning_rate": 9.818125737597495e-06, + "loss": 0.2715, + "step": 3765 + }, + { + "epoch": 0.09530075663638433, + "grad_norm": 4.659763336181641, + "learning_rate": 9.818018412149084e-06, + "loss": 0.207, + "step": 3766 + }, + { + "epoch": 0.09532606220107802, + "grad_norm": 4.155172824859619, + "learning_rate": 9.81791105563017e-06, + "loss": 0.1303, + "step": 3767 + }, + { + "epoch": 0.09535136776577169, + "grad_norm": 5.171994209289551, + "learning_rate": 9.817803668041442e-06, + "loss": 0.2319, + "step": 3768 + }, + { + "epoch": 0.09537667333046537, + "grad_norm": 5.031429767608643, + "learning_rate": 9.817696249383594e-06, + "loss": 0.1843, + "step": 3769 + }, + { + "epoch": 0.09540197889515904, + "grad_norm": 8.2289457321167, + "learning_rate": 9.817588799657322e-06, + "loss": 0.2767, + "step": 3770 + }, + { + "epoch": 0.09542728445985273, + "grad_norm": 3.9631166458129883, + "learning_rate": 9.817481318863314e-06, + "loss": 0.2022, + "step": 3771 + }, + { + "epoch": 0.0954525900245464, + "grad_norm": 18.978866577148438, + "learning_rate": 9.817373807002266e-06, + "loss": 0.3343, + "step": 3772 + }, + { + "epoch": 0.09547789558924008, + "grad_norm": 5.444255828857422, + "learning_rate": 9.81726626407487e-06, + "loss": 0.2296, + "step": 3773 + }, + { + "epoch": 0.09550320115393375, + "grad_norm": 5.988038539886475, + "learning_rate": 9.817158690081818e-06, + "loss": 0.2303, + "step": 3774 + }, + { + "epoch": 0.09552850671862742, + "grad_norm": 17.652780532836914, + "learning_rate": 9.817051085023807e-06, + "loss": 0.2481, + "step": 3775 + }, + { + "epoch": 0.0955538122833211, + "grad_norm": 8.576828956604004, + "learning_rate": 9.816943448901531e-06, + "loss": 0.2465, + "step": 3776 + }, + { + "epoch": 0.09557911784801477, + "grad_norm": 3.625973701477051, + "learning_rate": 9.816835781715682e-06, + "loss": 0.0931, + "step": 3777 + }, + { + "epoch": 0.09560442341270846, + "grad_norm": 10.671696662902832, + "learning_rate": 9.816728083466956e-06, + "loss": 0.3506, + "step": 3778 + }, + { + "epoch": 0.09562972897740213, + "grad_norm": 6.3285980224609375, + "learning_rate": 9.816620354156044e-06, + "loss": 0.2375, + "step": 3779 + }, + { + "epoch": 0.09565503454209581, + "grad_norm": 3.79876971244812, + "learning_rate": 9.816512593783645e-06, + "loss": 0.0887, + "step": 3780 + }, + { + "epoch": 0.09568034010678948, + "grad_norm": 6.146468639373779, + "learning_rate": 9.816404802350452e-06, + "loss": 0.1912, + "step": 3781 + }, + { + "epoch": 0.09570564567148315, + "grad_norm": 6.3397603034973145, + "learning_rate": 9.81629697985716e-06, + "loss": 0.1902, + "step": 3782 + }, + { + "epoch": 0.09573095123617684, + "grad_norm": 5.006350040435791, + "learning_rate": 9.816189126304465e-06, + "loss": 0.1891, + "step": 3783 + }, + { + "epoch": 0.0957562568008705, + "grad_norm": 7.745593547821045, + "learning_rate": 9.816081241693061e-06, + "loss": 0.1638, + "step": 3784 + }, + { + "epoch": 0.09578156236556419, + "grad_norm": 13.261859893798828, + "learning_rate": 9.815973326023647e-06, + "loss": 0.1872, + "step": 3785 + }, + { + "epoch": 0.09580686793025786, + "grad_norm": 4.346017360687256, + "learning_rate": 9.815865379296917e-06, + "loss": 0.178, + "step": 3786 + }, + { + "epoch": 0.09583217349495154, + "grad_norm": 11.192493438720703, + "learning_rate": 9.815757401513564e-06, + "loss": 0.1739, + "step": 3787 + }, + { + "epoch": 0.09585747905964521, + "grad_norm": 6.665482044219971, + "learning_rate": 9.815649392674288e-06, + "loss": 0.218, + "step": 3788 + }, + { + "epoch": 0.0958827846243389, + "grad_norm": 10.454625129699707, + "learning_rate": 9.815541352779787e-06, + "loss": 0.3001, + "step": 3789 + }, + { + "epoch": 0.09590809018903257, + "grad_norm": 7.819087505340576, + "learning_rate": 9.815433281830755e-06, + "loss": 0.2142, + "step": 3790 + }, + { + "epoch": 0.09593339575372624, + "grad_norm": 9.08287525177002, + "learning_rate": 9.815325179827889e-06, + "loss": 0.2637, + "step": 3791 + }, + { + "epoch": 0.09595870131841992, + "grad_norm": 5.094300270080566, + "learning_rate": 9.815217046771887e-06, + "loss": 0.2059, + "step": 3792 + }, + { + "epoch": 0.09598400688311359, + "grad_norm": 4.755411624908447, + "learning_rate": 9.815108882663446e-06, + "loss": 0.2197, + "step": 3793 + }, + { + "epoch": 0.09600931244780728, + "grad_norm": 5.334249019622803, + "learning_rate": 9.815000687503262e-06, + "loss": 0.2736, + "step": 3794 + }, + { + "epoch": 0.09603461801250095, + "grad_norm": 8.5060453414917, + "learning_rate": 9.814892461292035e-06, + "loss": 0.1626, + "step": 3795 + }, + { + "epoch": 0.09605992357719463, + "grad_norm": 5.925238132476807, + "learning_rate": 9.814784204030463e-06, + "loss": 0.2204, + "step": 3796 + }, + { + "epoch": 0.0960852291418883, + "grad_norm": 7.295643329620361, + "learning_rate": 9.814675915719243e-06, + "loss": 0.1859, + "step": 3797 + }, + { + "epoch": 0.09611053470658197, + "grad_norm": 9.363204002380371, + "learning_rate": 9.814567596359073e-06, + "loss": 0.2344, + "step": 3798 + }, + { + "epoch": 0.09613584027127565, + "grad_norm": 4.283543109893799, + "learning_rate": 9.814459245950653e-06, + "loss": 0.1869, + "step": 3799 + }, + { + "epoch": 0.09616114583596932, + "grad_norm": 7.5950188636779785, + "learning_rate": 9.814350864494681e-06, + "loss": 0.2508, + "step": 3800 + }, + { + "epoch": 0.09618645140066301, + "grad_norm": 5.970797538757324, + "learning_rate": 9.814242451991856e-06, + "loss": 0.0862, + "step": 3801 + }, + { + "epoch": 0.09621175696535668, + "grad_norm": 11.90527057647705, + "learning_rate": 9.814134008442875e-06, + "loss": 0.2767, + "step": 3802 + }, + { + "epoch": 0.09623706253005036, + "grad_norm": 4.936020374298096, + "learning_rate": 9.814025533848442e-06, + "loss": 0.1875, + "step": 3803 + }, + { + "epoch": 0.09626236809474403, + "grad_norm": 20.711990356445312, + "learning_rate": 9.81391702820925e-06, + "loss": 0.2447, + "step": 3804 + }, + { + "epoch": 0.09628767365943772, + "grad_norm": 10.781024932861328, + "learning_rate": 9.813808491526007e-06, + "loss": 0.2749, + "step": 3805 + }, + { + "epoch": 0.09631297922413139, + "grad_norm": 5.601130962371826, + "learning_rate": 9.813699923799406e-06, + "loss": 0.259, + "step": 3806 + }, + { + "epoch": 0.09633828478882506, + "grad_norm": 14.470513343811035, + "learning_rate": 9.81359132503015e-06, + "loss": 0.195, + "step": 3807 + }, + { + "epoch": 0.09636359035351874, + "grad_norm": 6.629192352294922, + "learning_rate": 9.813482695218938e-06, + "loss": 0.1611, + "step": 3808 + }, + { + "epoch": 0.09638889591821241, + "grad_norm": 4.958868026733398, + "learning_rate": 9.813374034366473e-06, + "loss": 0.251, + "step": 3809 + }, + { + "epoch": 0.0964142014829061, + "grad_norm": 5.462852954864502, + "learning_rate": 9.813265342473453e-06, + "loss": 0.1415, + "step": 3810 + }, + { + "epoch": 0.09643950704759977, + "grad_norm": 3.3173186779022217, + "learning_rate": 9.81315661954058e-06, + "loss": 0.1946, + "step": 3811 + }, + { + "epoch": 0.09646481261229345, + "grad_norm": 8.433377265930176, + "learning_rate": 9.813047865568557e-06, + "loss": 0.2754, + "step": 3812 + }, + { + "epoch": 0.09649011817698712, + "grad_norm": 4.575791835784912, + "learning_rate": 9.812939080558082e-06, + "loss": 0.184, + "step": 3813 + }, + { + "epoch": 0.09651542374168079, + "grad_norm": 6.338545322418213, + "learning_rate": 9.812830264509859e-06, + "loss": 0.1945, + "step": 3814 + }, + { + "epoch": 0.09654072930637447, + "grad_norm": 5.668262004852295, + "learning_rate": 9.812721417424588e-06, + "loss": 0.1553, + "step": 3815 + }, + { + "epoch": 0.09656603487106814, + "grad_norm": 6.2694091796875, + "learning_rate": 9.812612539302973e-06, + "loss": 0.2569, + "step": 3816 + }, + { + "epoch": 0.09659134043576183, + "grad_norm": 6.883702754974365, + "learning_rate": 9.812503630145715e-06, + "loss": 0.1812, + "step": 3817 + }, + { + "epoch": 0.0966166460004555, + "grad_norm": 4.631391525268555, + "learning_rate": 9.812394689953514e-06, + "loss": 0.2018, + "step": 3818 + }, + { + "epoch": 0.09664195156514918, + "grad_norm": 8.565771102905273, + "learning_rate": 9.812285718727078e-06, + "loss": 0.1655, + "step": 3819 + }, + { + "epoch": 0.09666725712984285, + "grad_norm": 6.524888038635254, + "learning_rate": 9.812176716467106e-06, + "loss": 0.1309, + "step": 3820 + }, + { + "epoch": 0.09669256269453654, + "grad_norm": 10.932195663452148, + "learning_rate": 9.8120676831743e-06, + "loss": 0.2076, + "step": 3821 + }, + { + "epoch": 0.0967178682592302, + "grad_norm": 17.2669620513916, + "learning_rate": 9.811958618849365e-06, + "loss": 0.2815, + "step": 3822 + }, + { + "epoch": 0.09674317382392388, + "grad_norm": 6.4232707023620605, + "learning_rate": 9.811849523493005e-06, + "loss": 0.1792, + "step": 3823 + }, + { + "epoch": 0.09676847938861756, + "grad_norm": 4.737259864807129, + "learning_rate": 9.81174039710592e-06, + "loss": 0.1293, + "step": 3824 + }, + { + "epoch": 0.09679378495331123, + "grad_norm": 5.340822696685791, + "learning_rate": 9.811631239688819e-06, + "loss": 0.1706, + "step": 3825 + }, + { + "epoch": 0.09681909051800491, + "grad_norm": 7.67525053024292, + "learning_rate": 9.811522051242401e-06, + "loss": 0.2338, + "step": 3826 + }, + { + "epoch": 0.09684439608269858, + "grad_norm": 11.36612606048584, + "learning_rate": 9.811412831767374e-06, + "loss": 0.2373, + "step": 3827 + }, + { + "epoch": 0.09686970164739227, + "grad_norm": 6.109791278839111, + "learning_rate": 9.81130358126444e-06, + "loss": 0.1308, + "step": 3828 + }, + { + "epoch": 0.09689500721208594, + "grad_norm": 5.553955078125, + "learning_rate": 9.811194299734303e-06, + "loss": 0.2149, + "step": 3829 + }, + { + "epoch": 0.09692031277677961, + "grad_norm": 3.7057459354400635, + "learning_rate": 9.81108498717767e-06, + "loss": 0.1434, + "step": 3830 + }, + { + "epoch": 0.09694561834147329, + "grad_norm": 8.48552131652832, + "learning_rate": 9.810975643595247e-06, + "loss": 0.2072, + "step": 3831 + }, + { + "epoch": 0.09697092390616696, + "grad_norm": 5.409631252288818, + "learning_rate": 9.810866268987733e-06, + "loss": 0.251, + "step": 3832 + }, + { + "epoch": 0.09699622947086065, + "grad_norm": 7.971614837646484, + "learning_rate": 9.810756863355839e-06, + "loss": 0.1992, + "step": 3833 + }, + { + "epoch": 0.09702153503555432, + "grad_norm": 9.968094825744629, + "learning_rate": 9.81064742670027e-06, + "loss": 0.2076, + "step": 3834 + }, + { + "epoch": 0.097046840600248, + "grad_norm": 16.04445457458496, + "learning_rate": 9.81053795902173e-06, + "loss": 0.2363, + "step": 3835 + }, + { + "epoch": 0.09707214616494167, + "grad_norm": 12.898112297058105, + "learning_rate": 9.810428460320924e-06, + "loss": 0.2714, + "step": 3836 + }, + { + "epoch": 0.09709745172963535, + "grad_norm": 4.399422645568848, + "learning_rate": 9.81031893059856e-06, + "loss": 0.1747, + "step": 3837 + }, + { + "epoch": 0.09712275729432902, + "grad_norm": 10.156179428100586, + "learning_rate": 9.810209369855347e-06, + "loss": 0.2306, + "step": 3838 + }, + { + "epoch": 0.0971480628590227, + "grad_norm": 4.844841003417969, + "learning_rate": 9.810099778091985e-06, + "loss": 0.1981, + "step": 3839 + }, + { + "epoch": 0.09717336842371638, + "grad_norm": 7.319516658782959, + "learning_rate": 9.809990155309187e-06, + "loss": 0.2138, + "step": 3840 + }, + { + "epoch": 0.09719867398841005, + "grad_norm": 4.046015739440918, + "learning_rate": 9.809880501507657e-06, + "loss": 0.1577, + "step": 3841 + }, + { + "epoch": 0.09722397955310373, + "grad_norm": 4.637729167938232, + "learning_rate": 9.8097708166881e-06, + "loss": 0.2028, + "step": 3842 + }, + { + "epoch": 0.0972492851177974, + "grad_norm": 3.6745686531066895, + "learning_rate": 9.809661100851226e-06, + "loss": 0.1699, + "step": 3843 + }, + { + "epoch": 0.09727459068249109, + "grad_norm": 4.550467491149902, + "learning_rate": 9.809551353997743e-06, + "loss": 0.1997, + "step": 3844 + }, + { + "epoch": 0.09729989624718476, + "grad_norm": 9.118945121765137, + "learning_rate": 9.809441576128358e-06, + "loss": 0.2078, + "step": 3845 + }, + { + "epoch": 0.09732520181187843, + "grad_norm": 4.413234233856201, + "learning_rate": 9.80933176724378e-06, + "loss": 0.2028, + "step": 3846 + }, + { + "epoch": 0.09735050737657211, + "grad_norm": 2.3630800247192383, + "learning_rate": 9.809221927344715e-06, + "loss": 0.1536, + "step": 3847 + }, + { + "epoch": 0.09737581294126578, + "grad_norm": 4.700382232666016, + "learning_rate": 9.809112056431873e-06, + "loss": 0.1148, + "step": 3848 + }, + { + "epoch": 0.09740111850595946, + "grad_norm": 9.293257713317871, + "learning_rate": 9.809002154505961e-06, + "loss": 0.3106, + "step": 3849 + }, + { + "epoch": 0.09742642407065313, + "grad_norm": 8.788673400878906, + "learning_rate": 9.80889222156769e-06, + "loss": 0.2667, + "step": 3850 + }, + { + "epoch": 0.09745172963534682, + "grad_norm": 5.805838584899902, + "learning_rate": 9.808782257617767e-06, + "loss": 0.2491, + "step": 3851 + }, + { + "epoch": 0.09747703520004049, + "grad_norm": 9.994400024414062, + "learning_rate": 9.8086722626569e-06, + "loss": 0.2557, + "step": 3852 + }, + { + "epoch": 0.09750234076473417, + "grad_norm": 6.263886451721191, + "learning_rate": 9.808562236685802e-06, + "loss": 0.1882, + "step": 3853 + }, + { + "epoch": 0.09752764632942784, + "grad_norm": 8.795832633972168, + "learning_rate": 9.808452179705178e-06, + "loss": 0.2174, + "step": 3854 + }, + { + "epoch": 0.09755295189412151, + "grad_norm": 4.926050186157227, + "learning_rate": 9.808342091715744e-06, + "loss": 0.1817, + "step": 3855 + }, + { + "epoch": 0.0975782574588152, + "grad_norm": 7.104277610778809, + "learning_rate": 9.808231972718203e-06, + "loss": 0.2324, + "step": 3856 + }, + { + "epoch": 0.09760356302350887, + "grad_norm": 15.859301567077637, + "learning_rate": 9.808121822713272e-06, + "loss": 0.2302, + "step": 3857 + }, + { + "epoch": 0.09762886858820255, + "grad_norm": 8.576516151428223, + "learning_rate": 9.808011641701654e-06, + "loss": 0.3566, + "step": 3858 + }, + { + "epoch": 0.09765417415289622, + "grad_norm": 5.7743239402771, + "learning_rate": 9.807901429684065e-06, + "loss": 0.1647, + "step": 3859 + }, + { + "epoch": 0.0976794797175899, + "grad_norm": 4.824992656707764, + "learning_rate": 9.807791186661215e-06, + "loss": 0.1892, + "step": 3860 + }, + { + "epoch": 0.09770478528228357, + "grad_norm": 5.092818260192871, + "learning_rate": 9.807680912633812e-06, + "loss": 0.1447, + "step": 3861 + }, + { + "epoch": 0.09773009084697724, + "grad_norm": 8.570161819458008, + "learning_rate": 9.80757060760257e-06, + "loss": 0.2173, + "step": 3862 + }, + { + "epoch": 0.09775539641167093, + "grad_norm": 6.825918674468994, + "learning_rate": 9.807460271568202e-06, + "loss": 0.1765, + "step": 3863 + }, + { + "epoch": 0.0977807019763646, + "grad_norm": 12.484423637390137, + "learning_rate": 9.807349904531413e-06, + "loss": 0.2314, + "step": 3864 + }, + { + "epoch": 0.09780600754105828, + "grad_norm": 7.841568470001221, + "learning_rate": 9.80723950649292e-06, + "loss": 0.1956, + "step": 3865 + }, + { + "epoch": 0.09783131310575195, + "grad_norm": 7.571649551391602, + "learning_rate": 9.807129077453434e-06, + "loss": 0.2293, + "step": 3866 + }, + { + "epoch": 0.09785661867044564, + "grad_norm": 6.5583086013793945, + "learning_rate": 9.807018617413666e-06, + "loss": 0.1684, + "step": 3867 + }, + { + "epoch": 0.0978819242351393, + "grad_norm": 6.102351665496826, + "learning_rate": 9.80690812637433e-06, + "loss": 0.2401, + "step": 3868 + }, + { + "epoch": 0.09790722979983299, + "grad_norm": 7.6996870040893555, + "learning_rate": 9.806797604336138e-06, + "loss": 0.1942, + "step": 3869 + }, + { + "epoch": 0.09793253536452666, + "grad_norm": 8.035664558410645, + "learning_rate": 9.806687051299802e-06, + "loss": 0.2379, + "step": 3870 + }, + { + "epoch": 0.09795784092922033, + "grad_norm": 19.68206024169922, + "learning_rate": 9.806576467266033e-06, + "loss": 0.2036, + "step": 3871 + }, + { + "epoch": 0.09798314649391401, + "grad_norm": 5.642932891845703, + "learning_rate": 9.80646585223555e-06, + "loss": 0.1924, + "step": 3872 + }, + { + "epoch": 0.09800845205860768, + "grad_norm": 5.012259006500244, + "learning_rate": 9.806355206209061e-06, + "loss": 0.2097, + "step": 3873 + }, + { + "epoch": 0.09803375762330137, + "grad_norm": 3.735189914703369, + "learning_rate": 9.806244529187281e-06, + "loss": 0.1963, + "step": 3874 + }, + { + "epoch": 0.09805906318799504, + "grad_norm": 5.28084135055542, + "learning_rate": 9.806133821170925e-06, + "loss": 0.1895, + "step": 3875 + }, + { + "epoch": 0.09808436875268872, + "grad_norm": 4.623587131500244, + "learning_rate": 9.806023082160705e-06, + "loss": 0.1907, + "step": 3876 + }, + { + "epoch": 0.09810967431738239, + "grad_norm": 4.8579325675964355, + "learning_rate": 9.805912312157338e-06, + "loss": 0.2364, + "step": 3877 + }, + { + "epoch": 0.09813497988207606, + "grad_norm": 4.376948356628418, + "learning_rate": 9.805801511161535e-06, + "loss": 0.196, + "step": 3878 + }, + { + "epoch": 0.09816028544676975, + "grad_norm": 7.782332897186279, + "learning_rate": 9.805690679174012e-06, + "loss": 0.195, + "step": 3879 + }, + { + "epoch": 0.09818559101146342, + "grad_norm": 8.019570350646973, + "learning_rate": 9.805579816195484e-06, + "loss": 0.1954, + "step": 3880 + }, + { + "epoch": 0.0982108965761571, + "grad_norm": 7.7288641929626465, + "learning_rate": 9.805468922226666e-06, + "loss": 0.1902, + "step": 3881 + }, + { + "epoch": 0.09823620214085077, + "grad_norm": 2.9488472938537598, + "learning_rate": 9.805357997268272e-06, + "loss": 0.1681, + "step": 3882 + }, + { + "epoch": 0.09826150770554445, + "grad_norm": 6.080204010009766, + "learning_rate": 9.805247041321019e-06, + "loss": 0.2286, + "step": 3883 + }, + { + "epoch": 0.09828681327023812, + "grad_norm": 33.49230194091797, + "learning_rate": 9.805136054385621e-06, + "loss": 0.4789, + "step": 3884 + }, + { + "epoch": 0.09831211883493181, + "grad_norm": 12.053814888000488, + "learning_rate": 9.805025036462793e-06, + "loss": 0.1567, + "step": 3885 + }, + { + "epoch": 0.09833742439962548, + "grad_norm": 13.019577980041504, + "learning_rate": 9.804913987553256e-06, + "loss": 0.2131, + "step": 3886 + }, + { + "epoch": 0.09836272996431915, + "grad_norm": 4.485830307006836, + "learning_rate": 9.80480290765772e-06, + "loss": 0.13, + "step": 3887 + }, + { + "epoch": 0.09838803552901283, + "grad_norm": 6.820305347442627, + "learning_rate": 9.804691796776903e-06, + "loss": 0.2855, + "step": 3888 + }, + { + "epoch": 0.0984133410937065, + "grad_norm": 6.451397895812988, + "learning_rate": 9.804580654911526e-06, + "loss": 0.2008, + "step": 3889 + }, + { + "epoch": 0.09843864665840019, + "grad_norm": 9.578474998474121, + "learning_rate": 9.8044694820623e-06, + "loss": 0.2569, + "step": 3890 + }, + { + "epoch": 0.09846395222309386, + "grad_norm": 3.882826328277588, + "learning_rate": 9.804358278229942e-06, + "loss": 0.1014, + "step": 3891 + }, + { + "epoch": 0.09848925778778754, + "grad_norm": 9.307594299316406, + "learning_rate": 9.804247043415174e-06, + "loss": 0.3096, + "step": 3892 + }, + { + "epoch": 0.09851456335248121, + "grad_norm": 4.6428022384643555, + "learning_rate": 9.804135777618708e-06, + "loss": 0.1222, + "step": 3893 + }, + { + "epoch": 0.09853986891717488, + "grad_norm": 6.764581680297852, + "learning_rate": 9.804024480841267e-06, + "loss": 0.212, + "step": 3894 + }, + { + "epoch": 0.09856517448186856, + "grad_norm": 4.840673446655273, + "learning_rate": 9.803913153083565e-06, + "loss": 0.2306, + "step": 3895 + }, + { + "epoch": 0.09859048004656223, + "grad_norm": 5.255924701690674, + "learning_rate": 9.80380179434632e-06, + "loss": 0.1892, + "step": 3896 + }, + { + "epoch": 0.09861578561125592, + "grad_norm": 7.224907875061035, + "learning_rate": 9.803690404630248e-06, + "loss": 0.1357, + "step": 3897 + }, + { + "epoch": 0.09864109117594959, + "grad_norm": 4.837213039398193, + "learning_rate": 9.803578983936074e-06, + "loss": 0.1653, + "step": 3898 + }, + { + "epoch": 0.09866639674064327, + "grad_norm": 7.634382724761963, + "learning_rate": 9.803467532264511e-06, + "loss": 0.2535, + "step": 3899 + }, + { + "epoch": 0.09869170230533694, + "grad_norm": 4.897393226623535, + "learning_rate": 9.80335604961628e-06, + "loss": 0.2106, + "step": 3900 + }, + { + "epoch": 0.09871700787003063, + "grad_norm": 8.842384338378906, + "learning_rate": 9.8032445359921e-06, + "loss": 0.1882, + "step": 3901 + }, + { + "epoch": 0.0987423134347243, + "grad_norm": 6.279628753662109, + "learning_rate": 9.803132991392688e-06, + "loss": 0.1668, + "step": 3902 + }, + { + "epoch": 0.09876761899941797, + "grad_norm": 3.9172630310058594, + "learning_rate": 9.803021415818765e-06, + "loss": 0.1417, + "step": 3903 + }, + { + "epoch": 0.09879292456411165, + "grad_norm": 10.21536636352539, + "learning_rate": 9.80290980927105e-06, + "loss": 0.3132, + "step": 3904 + }, + { + "epoch": 0.09881823012880532, + "grad_norm": 9.097460746765137, + "learning_rate": 9.802798171750265e-06, + "loss": 0.2725, + "step": 3905 + }, + { + "epoch": 0.098843535693499, + "grad_norm": 8.65365219116211, + "learning_rate": 9.802686503257126e-06, + "loss": 0.2358, + "step": 3906 + }, + { + "epoch": 0.09886884125819267, + "grad_norm": 4.757839202880859, + "learning_rate": 9.802574803792356e-06, + "loss": 0.1174, + "step": 3907 + }, + { + "epoch": 0.09889414682288636, + "grad_norm": 7.605111598968506, + "learning_rate": 9.802463073356674e-06, + "loss": 0.2694, + "step": 3908 + }, + { + "epoch": 0.09891945238758003, + "grad_norm": 6.883908748626709, + "learning_rate": 9.802351311950802e-06, + "loss": 0.2336, + "step": 3909 + }, + { + "epoch": 0.0989447579522737, + "grad_norm": 3.57947039604187, + "learning_rate": 9.802239519575459e-06, + "loss": 0.1788, + "step": 3910 + }, + { + "epoch": 0.09897006351696738, + "grad_norm": 7.303210258483887, + "learning_rate": 9.802127696231366e-06, + "loss": 0.1731, + "step": 3911 + }, + { + "epoch": 0.09899536908166105, + "grad_norm": 3.7458138465881348, + "learning_rate": 9.802015841919246e-06, + "loss": 0.1713, + "step": 3912 + }, + { + "epoch": 0.09902067464635474, + "grad_norm": 5.268394947052002, + "learning_rate": 9.801903956639817e-06, + "loss": 0.2722, + "step": 3913 + }, + { + "epoch": 0.0990459802110484, + "grad_norm": 15.972658157348633, + "learning_rate": 9.801792040393805e-06, + "loss": 0.2631, + "step": 3914 + }, + { + "epoch": 0.09907128577574209, + "grad_norm": 8.82864761352539, + "learning_rate": 9.801680093181928e-06, + "loss": 0.2074, + "step": 3915 + }, + { + "epoch": 0.09909659134043576, + "grad_norm": 5.0320892333984375, + "learning_rate": 9.801568115004911e-06, + "loss": 0.1706, + "step": 3916 + }, + { + "epoch": 0.09912189690512944, + "grad_norm": 6.453171730041504, + "learning_rate": 9.801456105863471e-06, + "loss": 0.1836, + "step": 3917 + }, + { + "epoch": 0.09914720246982311, + "grad_norm": 7.854148864746094, + "learning_rate": 9.801344065758336e-06, + "loss": 0.2987, + "step": 3918 + }, + { + "epoch": 0.09917250803451678, + "grad_norm": 7.5855512619018555, + "learning_rate": 9.801231994690226e-06, + "loss": 0.2076, + "step": 3919 + }, + { + "epoch": 0.09919781359921047, + "grad_norm": 12.909771919250488, + "learning_rate": 9.801119892659865e-06, + "loss": 0.1904, + "step": 3920 + }, + { + "epoch": 0.09922311916390414, + "grad_norm": 7.911561012268066, + "learning_rate": 9.801007759667972e-06, + "loss": 0.2605, + "step": 3921 + }, + { + "epoch": 0.09924842472859782, + "grad_norm": 4.721296787261963, + "learning_rate": 9.800895595715276e-06, + "loss": 0.2308, + "step": 3922 + }, + { + "epoch": 0.09927373029329149, + "grad_norm": 10.327676773071289, + "learning_rate": 9.800783400802495e-06, + "loss": 0.2249, + "step": 3923 + }, + { + "epoch": 0.09929903585798518, + "grad_norm": 5.212604522705078, + "learning_rate": 9.800671174930357e-06, + "loss": 0.2075, + "step": 3924 + }, + { + "epoch": 0.09932434142267885, + "grad_norm": 3.7671186923980713, + "learning_rate": 9.800558918099583e-06, + "loss": 0.1229, + "step": 3925 + }, + { + "epoch": 0.09934964698737252, + "grad_norm": 3.5520999431610107, + "learning_rate": 9.800446630310897e-06, + "loss": 0.1895, + "step": 3926 + }, + { + "epoch": 0.0993749525520662, + "grad_norm": 6.101281642913818, + "learning_rate": 9.800334311565023e-06, + "loss": 0.1198, + "step": 3927 + }, + { + "epoch": 0.09940025811675987, + "grad_norm": 3.5101473331451416, + "learning_rate": 9.800221961862687e-06, + "loss": 0.1332, + "step": 3928 + }, + { + "epoch": 0.09942556368145355, + "grad_norm": 3.585767984390259, + "learning_rate": 9.800109581204614e-06, + "loss": 0.1845, + "step": 3929 + }, + { + "epoch": 0.09945086924614722, + "grad_norm": 7.186892986297607, + "learning_rate": 9.799997169591523e-06, + "loss": 0.2415, + "step": 3930 + }, + { + "epoch": 0.09947617481084091, + "grad_norm": 4.3713908195495605, + "learning_rate": 9.799884727024147e-06, + "loss": 0.2117, + "step": 3931 + }, + { + "epoch": 0.09950148037553458, + "grad_norm": 7.331996440887451, + "learning_rate": 9.799772253503206e-06, + "loss": 0.1981, + "step": 3932 + }, + { + "epoch": 0.09952678594022826, + "grad_norm": 9.379281044006348, + "learning_rate": 9.799659749029426e-06, + "loss": 0.2027, + "step": 3933 + }, + { + "epoch": 0.09955209150492193, + "grad_norm": 5.692437171936035, + "learning_rate": 9.799547213603534e-06, + "loss": 0.2056, + "step": 3934 + }, + { + "epoch": 0.0995773970696156, + "grad_norm": 6.4234514236450195, + "learning_rate": 9.799434647226253e-06, + "loss": 0.2016, + "step": 3935 + }, + { + "epoch": 0.09960270263430929, + "grad_norm": 8.380946159362793, + "learning_rate": 9.799322049898313e-06, + "loss": 0.2162, + "step": 3936 + }, + { + "epoch": 0.09962800819900296, + "grad_norm": 10.399324417114258, + "learning_rate": 9.799209421620435e-06, + "loss": 0.3748, + "step": 3937 + }, + { + "epoch": 0.09965331376369664, + "grad_norm": 5.575129508972168, + "learning_rate": 9.79909676239335e-06, + "loss": 0.1573, + "step": 3938 + }, + { + "epoch": 0.09967861932839031, + "grad_norm": 6.682286262512207, + "learning_rate": 9.798984072217784e-06, + "loss": 0.1265, + "step": 3939 + }, + { + "epoch": 0.099703924893084, + "grad_norm": 4.704087257385254, + "learning_rate": 9.79887135109446e-06, + "loss": 0.2686, + "step": 3940 + }, + { + "epoch": 0.09972923045777767, + "grad_norm": 8.548129081726074, + "learning_rate": 9.798758599024108e-06, + "loss": 0.1269, + "step": 3941 + }, + { + "epoch": 0.09975453602247134, + "grad_norm": 14.04658317565918, + "learning_rate": 9.798645816007455e-06, + "loss": 0.2596, + "step": 3942 + }, + { + "epoch": 0.09977984158716502, + "grad_norm": 5.094700336456299, + "learning_rate": 9.798533002045227e-06, + "loss": 0.2701, + "step": 3943 + }, + { + "epoch": 0.09980514715185869, + "grad_norm": 12.54626750946045, + "learning_rate": 9.798420157138153e-06, + "loss": 0.2768, + "step": 3944 + }, + { + "epoch": 0.09983045271655237, + "grad_norm": 8.909289360046387, + "learning_rate": 9.79830728128696e-06, + "loss": 0.1371, + "step": 3945 + }, + { + "epoch": 0.09985575828124604, + "grad_norm": 6.663421154022217, + "learning_rate": 9.798194374492376e-06, + "loss": 0.1885, + "step": 3946 + }, + { + "epoch": 0.09988106384593973, + "grad_norm": 10.29789924621582, + "learning_rate": 9.798081436755128e-06, + "loss": 0.2445, + "step": 3947 + }, + { + "epoch": 0.0999063694106334, + "grad_norm": 7.600649833679199, + "learning_rate": 9.797968468075947e-06, + "loss": 0.175, + "step": 3948 + }, + { + "epoch": 0.09993167497532708, + "grad_norm": 7.967552661895752, + "learning_rate": 9.797855468455558e-06, + "loss": 0.2733, + "step": 3949 + }, + { + "epoch": 0.09995698054002075, + "grad_norm": 4.243651866912842, + "learning_rate": 9.797742437894693e-06, + "loss": 0.2082, + "step": 3950 + }, + { + "epoch": 0.09998228610471442, + "grad_norm": 4.634889125823975, + "learning_rate": 9.797629376394078e-06, + "loss": 0.1443, + "step": 3951 + }, + { + "epoch": 0.1000075916694081, + "grad_norm": 5.698996067047119, + "learning_rate": 9.797516283954444e-06, + "loss": 0.209, + "step": 3952 + }, + { + "epoch": 0.1000075916694081, + "eval_loss": 0.22069652378559113, + "eval_runtime": 69.8141, + "eval_samples_per_second": 45.75, + "eval_steps_per_second": 5.73, + "step": 3952 + }, + { + "epoch": 0.10003289723410178, + "grad_norm": 7.168352127075195, + "learning_rate": 9.79740316057652e-06, + "loss": 0.1371, + "step": 3953 + }, + { + "epoch": 0.10005820279879546, + "grad_norm": 6.732322692871094, + "learning_rate": 9.797290006261037e-06, + "loss": 0.2171, + "step": 3954 + }, + { + "epoch": 0.10008350836348913, + "grad_norm": 5.714839458465576, + "learning_rate": 9.79717682100872e-06, + "loss": 0.2284, + "step": 3955 + }, + { + "epoch": 0.10010881392818281, + "grad_norm": 11.51867961883545, + "learning_rate": 9.797063604820305e-06, + "loss": 0.2948, + "step": 3956 + }, + { + "epoch": 0.10013411949287648, + "grad_norm": 4.269284248352051, + "learning_rate": 9.796950357696518e-06, + "loss": 0.1385, + "step": 3957 + }, + { + "epoch": 0.10015942505757015, + "grad_norm": 4.89054012298584, + "learning_rate": 9.79683707963809e-06, + "loss": 0.1797, + "step": 3958 + }, + { + "epoch": 0.10018473062226384, + "grad_norm": 6.878995418548584, + "learning_rate": 9.796723770645753e-06, + "loss": 0.2248, + "step": 3959 + }, + { + "epoch": 0.10021003618695751, + "grad_norm": 9.430288314819336, + "learning_rate": 9.796610430720236e-06, + "loss": 0.1792, + "step": 3960 + }, + { + "epoch": 0.10023534175165119, + "grad_norm": 6.384383678436279, + "learning_rate": 9.796497059862272e-06, + "loss": 0.1016, + "step": 3961 + }, + { + "epoch": 0.10026064731634486, + "grad_norm": 8.42361068725586, + "learning_rate": 9.796383658072589e-06, + "loss": 0.2246, + "step": 3962 + }, + { + "epoch": 0.10028595288103855, + "grad_norm": 9.46086597442627, + "learning_rate": 9.796270225351919e-06, + "loss": 0.1322, + "step": 3963 + }, + { + "epoch": 0.10031125844573222, + "grad_norm": 6.878809928894043, + "learning_rate": 9.796156761700996e-06, + "loss": 0.2279, + "step": 3964 + }, + { + "epoch": 0.1003365640104259, + "grad_norm": 6.012547969818115, + "learning_rate": 9.796043267120549e-06, + "loss": 0.2557, + "step": 3965 + }, + { + "epoch": 0.10036186957511957, + "grad_norm": 6.571288585662842, + "learning_rate": 9.795929741611311e-06, + "loss": 0.1914, + "step": 3966 + }, + { + "epoch": 0.10038717513981324, + "grad_norm": 16.335506439208984, + "learning_rate": 9.795816185174017e-06, + "loss": 0.4035, + "step": 3967 + }, + { + "epoch": 0.10041248070450692, + "grad_norm": 9.649615287780762, + "learning_rate": 9.795702597809395e-06, + "loss": 0.2679, + "step": 3968 + }, + { + "epoch": 0.1004377862692006, + "grad_norm": 7.8227009773254395, + "learning_rate": 9.795588979518177e-06, + "loss": 0.2757, + "step": 3969 + }, + { + "epoch": 0.10046309183389428, + "grad_norm": 5.956107139587402, + "learning_rate": 9.7954753303011e-06, + "loss": 0.1873, + "step": 3970 + }, + { + "epoch": 0.10048839739858795, + "grad_norm": 4.04993200302124, + "learning_rate": 9.795361650158895e-06, + "loss": 0.2142, + "step": 3971 + }, + { + "epoch": 0.10051370296328163, + "grad_norm": 5.693306922912598, + "learning_rate": 9.795247939092294e-06, + "loss": 0.2216, + "step": 3972 + }, + { + "epoch": 0.1005390085279753, + "grad_norm": 15.049245834350586, + "learning_rate": 9.795134197102031e-06, + "loss": 0.3412, + "step": 3973 + }, + { + "epoch": 0.10056431409266897, + "grad_norm": 8.099739074707031, + "learning_rate": 9.79502042418884e-06, + "loss": 0.2349, + "step": 3974 + }, + { + "epoch": 0.10058961965736266, + "grad_norm": 5.526676654815674, + "learning_rate": 9.794906620353452e-06, + "loss": 0.2565, + "step": 3975 + }, + { + "epoch": 0.10061492522205633, + "grad_norm": 5.026691913604736, + "learning_rate": 9.794792785596605e-06, + "loss": 0.2369, + "step": 3976 + }, + { + "epoch": 0.10064023078675001, + "grad_norm": 5.199010372161865, + "learning_rate": 9.794678919919032e-06, + "loss": 0.1638, + "step": 3977 + }, + { + "epoch": 0.10066553635144368, + "grad_norm": 6.537684440612793, + "learning_rate": 9.794565023321465e-06, + "loss": 0.246, + "step": 3978 + }, + { + "epoch": 0.10069084191613736, + "grad_norm": 8.95829963684082, + "learning_rate": 9.79445109580464e-06, + "loss": 0.4032, + "step": 3979 + }, + { + "epoch": 0.10071614748083103, + "grad_norm": 7.817378520965576, + "learning_rate": 9.794337137369293e-06, + "loss": 0.2755, + "step": 3980 + }, + { + "epoch": 0.10074145304552472, + "grad_norm": 8.380510330200195, + "learning_rate": 9.794223148016158e-06, + "loss": 0.2006, + "step": 3981 + }, + { + "epoch": 0.10076675861021839, + "grad_norm": 8.71999740600586, + "learning_rate": 9.794109127745968e-06, + "loss": 0.2433, + "step": 3982 + }, + { + "epoch": 0.10079206417491206, + "grad_norm": 7.148333549499512, + "learning_rate": 9.793995076559459e-06, + "loss": 0.236, + "step": 3983 + }, + { + "epoch": 0.10081736973960574, + "grad_norm": 6.804225444793701, + "learning_rate": 9.79388099445737e-06, + "loss": 0.2379, + "step": 3984 + }, + { + "epoch": 0.10084267530429941, + "grad_norm": 5.069950580596924, + "learning_rate": 9.793766881440433e-06, + "loss": 0.1799, + "step": 3985 + }, + { + "epoch": 0.1008679808689931, + "grad_norm": 7.682594299316406, + "learning_rate": 9.793652737509384e-06, + "loss": 0.3054, + "step": 3986 + }, + { + "epoch": 0.10089328643368677, + "grad_norm": 4.149829387664795, + "learning_rate": 9.793538562664963e-06, + "loss": 0.1683, + "step": 3987 + }, + { + "epoch": 0.10091859199838045, + "grad_norm": 6.262515068054199, + "learning_rate": 9.793424356907902e-06, + "loss": 0.0989, + "step": 3988 + }, + { + "epoch": 0.10094389756307412, + "grad_norm": 5.545864105224609, + "learning_rate": 9.79331012023894e-06, + "loss": 0.1762, + "step": 3989 + }, + { + "epoch": 0.10096920312776779, + "grad_norm": 11.386395454406738, + "learning_rate": 9.79319585265881e-06, + "loss": 0.3171, + "step": 3990 + }, + { + "epoch": 0.10099450869246147, + "grad_norm": 11.100509643554688, + "learning_rate": 9.793081554168254e-06, + "loss": 0.267, + "step": 3991 + }, + { + "epoch": 0.10101981425715514, + "grad_norm": 7.163724422454834, + "learning_rate": 9.792967224768006e-06, + "loss": 0.2556, + "step": 3992 + }, + { + "epoch": 0.10104511982184883, + "grad_norm": 5.791297912597656, + "learning_rate": 9.792852864458805e-06, + "loss": 0.1985, + "step": 3993 + }, + { + "epoch": 0.1010704253865425, + "grad_norm": 12.189212799072266, + "learning_rate": 9.792738473241386e-06, + "loss": 0.2189, + "step": 3994 + }, + { + "epoch": 0.10109573095123618, + "grad_norm": 11.32535457611084, + "learning_rate": 9.792624051116488e-06, + "loss": 0.4434, + "step": 3995 + }, + { + "epoch": 0.10112103651592985, + "grad_norm": 4.767388343811035, + "learning_rate": 9.792509598084849e-06, + "loss": 0.2056, + "step": 3996 + }, + { + "epoch": 0.10114634208062354, + "grad_norm": 6.826062202453613, + "learning_rate": 9.792395114147206e-06, + "loss": 0.1866, + "step": 3997 + }, + { + "epoch": 0.1011716476453172, + "grad_norm": 4.007806777954102, + "learning_rate": 9.7922805993043e-06, + "loss": 0.168, + "step": 3998 + }, + { + "epoch": 0.10119695321001088, + "grad_norm": 14.475528717041016, + "learning_rate": 9.792166053556868e-06, + "loss": 0.3246, + "step": 3999 + }, + { + "epoch": 0.10122225877470456, + "grad_norm": 5.525607585906982, + "learning_rate": 9.792051476905648e-06, + "loss": 0.2368, + "step": 4000 + }, + { + "epoch": 0.10124756433939823, + "grad_norm": 31.29747200012207, + "learning_rate": 9.79193686935138e-06, + "loss": 0.3567, + "step": 4001 + }, + { + "epoch": 0.10127286990409191, + "grad_norm": 6.200589179992676, + "learning_rate": 9.791822230894802e-06, + "loss": 0.197, + "step": 4002 + }, + { + "epoch": 0.10129817546878558, + "grad_norm": 3.3836066722869873, + "learning_rate": 9.791707561536654e-06, + "loss": 0.1643, + "step": 4003 + }, + { + "epoch": 0.10132348103347927, + "grad_norm": 7.557086944580078, + "learning_rate": 9.791592861277674e-06, + "loss": 0.312, + "step": 4004 + }, + { + "epoch": 0.10134878659817294, + "grad_norm": 4.656862735748291, + "learning_rate": 9.791478130118605e-06, + "loss": 0.1979, + "step": 4005 + }, + { + "epoch": 0.10137409216286661, + "grad_norm": 3.3016295433044434, + "learning_rate": 9.791363368060183e-06, + "loss": 0.1889, + "step": 4006 + }, + { + "epoch": 0.10139939772756029, + "grad_norm": 7.889111042022705, + "learning_rate": 9.791248575103152e-06, + "loss": 0.1675, + "step": 4007 + }, + { + "epoch": 0.10142470329225396, + "grad_norm": 3.7117393016815186, + "learning_rate": 9.791133751248248e-06, + "loss": 0.154, + "step": 4008 + }, + { + "epoch": 0.10145000885694765, + "grad_norm": 5.438164234161377, + "learning_rate": 9.791018896496215e-06, + "loss": 0.1797, + "step": 4009 + }, + { + "epoch": 0.10147531442164132, + "grad_norm": 4.581814289093018, + "learning_rate": 9.79090401084779e-06, + "loss": 0.2084, + "step": 4010 + }, + { + "epoch": 0.101500619986335, + "grad_norm": 5.412738800048828, + "learning_rate": 9.79078909430372e-06, + "loss": 0.2387, + "step": 4011 + }, + { + "epoch": 0.10152592555102867, + "grad_norm": 4.647515296936035, + "learning_rate": 9.79067414686474e-06, + "loss": 0.2103, + "step": 4012 + }, + { + "epoch": 0.10155123111572235, + "grad_norm": 7.353252410888672, + "learning_rate": 9.790559168531595e-06, + "loss": 0.2403, + "step": 4013 + }, + { + "epoch": 0.10157653668041602, + "grad_norm": 4.3865861892700195, + "learning_rate": 9.790444159305024e-06, + "loss": 0.1832, + "step": 4014 + }, + { + "epoch": 0.1016018422451097, + "grad_norm": 5.751595497131348, + "learning_rate": 9.790329119185768e-06, + "loss": 0.1259, + "step": 4015 + }, + { + "epoch": 0.10162714780980338, + "grad_norm": 7.256200790405273, + "learning_rate": 9.790214048174574e-06, + "loss": 0.2411, + "step": 4016 + }, + { + "epoch": 0.10165245337449705, + "grad_norm": 6.673491954803467, + "learning_rate": 9.790098946272177e-06, + "loss": 0.2507, + "step": 4017 + }, + { + "epoch": 0.10167775893919073, + "grad_norm": 6.234791278839111, + "learning_rate": 9.789983813479326e-06, + "loss": 0.1833, + "step": 4018 + }, + { + "epoch": 0.1017030645038844, + "grad_norm": 9.097342491149902, + "learning_rate": 9.789868649796758e-06, + "loss": 0.2003, + "step": 4019 + }, + { + "epoch": 0.10172837006857809, + "grad_norm": 4.082630157470703, + "learning_rate": 9.789753455225219e-06, + "loss": 0.1898, + "step": 4020 + }, + { + "epoch": 0.10175367563327176, + "grad_norm": 6.740521430969238, + "learning_rate": 9.789638229765452e-06, + "loss": 0.2405, + "step": 4021 + }, + { + "epoch": 0.10177898119796543, + "grad_norm": 8.040037155151367, + "learning_rate": 9.789522973418198e-06, + "loss": 0.2739, + "step": 4022 + }, + { + "epoch": 0.10180428676265911, + "grad_norm": 6.519466400146484, + "learning_rate": 9.789407686184202e-06, + "loss": 0.2061, + "step": 4023 + }, + { + "epoch": 0.10182959232735278, + "grad_norm": 5.898673057556152, + "learning_rate": 9.789292368064208e-06, + "loss": 0.118, + "step": 4024 + }, + { + "epoch": 0.10185489789204646, + "grad_norm": 14.315768241882324, + "learning_rate": 9.789177019058956e-06, + "loss": 0.2261, + "step": 4025 + }, + { + "epoch": 0.10188020345674013, + "grad_norm": 12.089866638183594, + "learning_rate": 9.789061639169191e-06, + "loss": 0.3207, + "step": 4026 + }, + { + "epoch": 0.10190550902143382, + "grad_norm": 7.691464424133301, + "learning_rate": 9.788946228395661e-06, + "loss": 0.1887, + "step": 4027 + }, + { + "epoch": 0.10193081458612749, + "grad_norm": 5.915299415588379, + "learning_rate": 9.788830786739108e-06, + "loss": 0.155, + "step": 4028 + }, + { + "epoch": 0.10195612015082117, + "grad_norm": 9.55601692199707, + "learning_rate": 9.788715314200274e-06, + "loss": 0.2937, + "step": 4029 + }, + { + "epoch": 0.10198142571551484, + "grad_norm": 3.956552505493164, + "learning_rate": 9.788599810779907e-06, + "loss": 0.1302, + "step": 4030 + }, + { + "epoch": 0.10200673128020851, + "grad_norm": 10.12968635559082, + "learning_rate": 9.78848427647875e-06, + "loss": 0.3967, + "step": 4031 + }, + { + "epoch": 0.1020320368449022, + "grad_norm": 5.2640461921691895, + "learning_rate": 9.788368711297547e-06, + "loss": 0.2027, + "step": 4032 + }, + { + "epoch": 0.10205734240959587, + "grad_norm": 9.76281452178955, + "learning_rate": 9.788253115237047e-06, + "loss": 0.213, + "step": 4033 + }, + { + "epoch": 0.10208264797428955, + "grad_norm": 3.9972891807556152, + "learning_rate": 9.788137488297992e-06, + "loss": 0.1573, + "step": 4034 + }, + { + "epoch": 0.10210795353898322, + "grad_norm": 4.4231719970703125, + "learning_rate": 9.788021830481132e-06, + "loss": 0.1723, + "step": 4035 + }, + { + "epoch": 0.1021332591036769, + "grad_norm": 24.262718200683594, + "learning_rate": 9.787906141787206e-06, + "loss": 0.2864, + "step": 4036 + }, + { + "epoch": 0.10215856466837057, + "grad_norm": 3.2067575454711914, + "learning_rate": 9.787790422216967e-06, + "loss": 0.1757, + "step": 4037 + }, + { + "epoch": 0.10218387023306424, + "grad_norm": 10.701187133789062, + "learning_rate": 9.787674671771156e-06, + "loss": 0.372, + "step": 4038 + }, + { + "epoch": 0.10220917579775793, + "grad_norm": 7.034514904022217, + "learning_rate": 9.787558890450524e-06, + "loss": 0.1873, + "step": 4039 + }, + { + "epoch": 0.1022344813624516, + "grad_norm": 3.1536450386047363, + "learning_rate": 9.787443078255814e-06, + "loss": 0.1844, + "step": 4040 + }, + { + "epoch": 0.10225978692714528, + "grad_norm": 8.495804786682129, + "learning_rate": 9.787327235187774e-06, + "loss": 0.2343, + "step": 4041 + }, + { + "epoch": 0.10228509249183895, + "grad_norm": 20.411012649536133, + "learning_rate": 9.787211361247152e-06, + "loss": 0.1793, + "step": 4042 + }, + { + "epoch": 0.10231039805653264, + "grad_norm": 7.963510036468506, + "learning_rate": 9.787095456434693e-06, + "loss": 0.3077, + "step": 4043 + }, + { + "epoch": 0.1023357036212263, + "grad_norm": 7.99733829498291, + "learning_rate": 9.786979520751145e-06, + "loss": 0.2203, + "step": 4044 + }, + { + "epoch": 0.10236100918591999, + "grad_norm": 11.291923522949219, + "learning_rate": 9.786863554197259e-06, + "loss": 0.237, + "step": 4045 + }, + { + "epoch": 0.10238631475061366, + "grad_norm": 2.749274730682373, + "learning_rate": 9.78674755677378e-06, + "loss": 0.1627, + "step": 4046 + }, + { + "epoch": 0.10241162031530733, + "grad_norm": 4.16049861907959, + "learning_rate": 9.786631528481454e-06, + "loss": 0.2197, + "step": 4047 + }, + { + "epoch": 0.10243692588000101, + "grad_norm": 3.8079652786254883, + "learning_rate": 9.786515469321033e-06, + "loss": 0.1908, + "step": 4048 + }, + { + "epoch": 0.10246223144469468, + "grad_norm": 4.137901782989502, + "learning_rate": 9.786399379293265e-06, + "loss": 0.1233, + "step": 4049 + }, + { + "epoch": 0.10248753700938837, + "grad_norm": 2.706752061843872, + "learning_rate": 9.786283258398896e-06, + "loss": 0.1453, + "step": 4050 + }, + { + "epoch": 0.10251284257408204, + "grad_norm": 6.3875885009765625, + "learning_rate": 9.78616710663868e-06, + "loss": 0.2097, + "step": 4051 + }, + { + "epoch": 0.10253814813877572, + "grad_norm": 6.3582539558410645, + "learning_rate": 9.786050924013359e-06, + "loss": 0.2487, + "step": 4052 + }, + { + "epoch": 0.10256345370346939, + "grad_norm": 9.04422378540039, + "learning_rate": 9.785934710523686e-06, + "loss": 0.3517, + "step": 4053 + }, + { + "epoch": 0.10258875926816306, + "grad_norm": 3.994938373565674, + "learning_rate": 9.785818466170413e-06, + "loss": 0.1415, + "step": 4054 + }, + { + "epoch": 0.10261406483285675, + "grad_norm": 4.140641689300537, + "learning_rate": 9.785702190954285e-06, + "loss": 0.1261, + "step": 4055 + }, + { + "epoch": 0.10263937039755042, + "grad_norm": 11.584182739257812, + "learning_rate": 9.785585884876054e-06, + "loss": 0.3379, + "step": 4056 + }, + { + "epoch": 0.1026646759622441, + "grad_norm": 7.853322982788086, + "learning_rate": 9.78546954793647e-06, + "loss": 0.2235, + "step": 4057 + }, + { + "epoch": 0.10268998152693777, + "grad_norm": 15.173897743225098, + "learning_rate": 9.785353180136284e-06, + "loss": 0.2532, + "step": 4058 + }, + { + "epoch": 0.10271528709163145, + "grad_norm": 6.492870330810547, + "learning_rate": 9.785236781476244e-06, + "loss": 0.2346, + "step": 4059 + }, + { + "epoch": 0.10274059265632512, + "grad_norm": 5.514697074890137, + "learning_rate": 9.785120351957103e-06, + "loss": 0.2105, + "step": 4060 + }, + { + "epoch": 0.10276589822101881, + "grad_norm": 5.927398204803467, + "learning_rate": 9.78500389157961e-06, + "loss": 0.1736, + "step": 4061 + }, + { + "epoch": 0.10279120378571248, + "grad_norm": 9.10953426361084, + "learning_rate": 9.784887400344518e-06, + "loss": 0.2718, + "step": 4062 + }, + { + "epoch": 0.10281650935040615, + "grad_norm": 11.763191223144531, + "learning_rate": 9.784770878252577e-06, + "loss": 0.2136, + "step": 4063 + }, + { + "epoch": 0.10284181491509983, + "grad_norm": 8.335429191589355, + "learning_rate": 9.784654325304539e-06, + "loss": 0.2656, + "step": 4064 + }, + { + "epoch": 0.1028671204797935, + "grad_norm": 7.626679420471191, + "learning_rate": 9.784537741501155e-06, + "loss": 0.2194, + "step": 4065 + }, + { + "epoch": 0.10289242604448719, + "grad_norm": 7.031536102294922, + "learning_rate": 9.784421126843175e-06, + "loss": 0.1563, + "step": 4066 + }, + { + "epoch": 0.10291773160918086, + "grad_norm": 22.16071128845215, + "learning_rate": 9.784304481331356e-06, + "loss": 0.2396, + "step": 4067 + }, + { + "epoch": 0.10294303717387454, + "grad_norm": 13.729410171508789, + "learning_rate": 9.784187804966448e-06, + "loss": 0.2947, + "step": 4068 + }, + { + "epoch": 0.10296834273856821, + "grad_norm": 4.337582588195801, + "learning_rate": 9.784071097749199e-06, + "loss": 0.1966, + "step": 4069 + }, + { + "epoch": 0.10299364830326188, + "grad_norm": 5.886125087738037, + "learning_rate": 9.783954359680368e-06, + "loss": 0.2312, + "step": 4070 + }, + { + "epoch": 0.10301895386795556, + "grad_norm": 5.720602989196777, + "learning_rate": 9.783837590760705e-06, + "loss": 0.1749, + "step": 4071 + }, + { + "epoch": 0.10304425943264924, + "grad_norm": 3.812494993209839, + "learning_rate": 9.783720790990963e-06, + "loss": 0.2127, + "step": 4072 + }, + { + "epoch": 0.10306956499734292, + "grad_norm": 5.890902042388916, + "learning_rate": 9.783603960371894e-06, + "loss": 0.1547, + "step": 4073 + }, + { + "epoch": 0.10309487056203659, + "grad_norm": 4.686444282531738, + "learning_rate": 9.783487098904256e-06, + "loss": 0.1541, + "step": 4074 + }, + { + "epoch": 0.10312017612673027, + "grad_norm": 6.298539638519287, + "learning_rate": 9.783370206588798e-06, + "loss": 0.1887, + "step": 4075 + }, + { + "epoch": 0.10314548169142394, + "grad_norm": 20.98985481262207, + "learning_rate": 9.783253283426275e-06, + "loss": 0.2684, + "step": 4076 + }, + { + "epoch": 0.10317078725611763, + "grad_norm": 7.037590503692627, + "learning_rate": 9.78313632941744e-06, + "loss": 0.222, + "step": 4077 + }, + { + "epoch": 0.1031960928208113, + "grad_norm": 3.570857048034668, + "learning_rate": 9.783019344563052e-06, + "loss": 0.1868, + "step": 4078 + }, + { + "epoch": 0.10322139838550497, + "grad_norm": 7.5116777420043945, + "learning_rate": 9.782902328863858e-06, + "loss": 0.2504, + "step": 4079 + }, + { + "epoch": 0.10324670395019865, + "grad_norm": 25.73281478881836, + "learning_rate": 9.78278528232062e-06, + "loss": 0.3925, + "step": 4080 + }, + { + "epoch": 0.10327200951489232, + "grad_norm": 5.919952392578125, + "learning_rate": 9.782668204934087e-06, + "loss": 0.2932, + "step": 4081 + }, + { + "epoch": 0.103297315079586, + "grad_norm": 11.408084869384766, + "learning_rate": 9.782551096705017e-06, + "loss": 0.2732, + "step": 4082 + }, + { + "epoch": 0.10332262064427968, + "grad_norm": 4.514637470245361, + "learning_rate": 9.782433957634166e-06, + "loss": 0.1337, + "step": 4083 + }, + { + "epoch": 0.10334792620897336, + "grad_norm": 11.517480850219727, + "learning_rate": 9.782316787722286e-06, + "loss": 0.2598, + "step": 4084 + }, + { + "epoch": 0.10337323177366703, + "grad_norm": 9.19699478149414, + "learning_rate": 9.782199586970136e-06, + "loss": 0.233, + "step": 4085 + }, + { + "epoch": 0.1033985373383607, + "grad_norm": 8.514875411987305, + "learning_rate": 9.78208235537847e-06, + "loss": 0.4151, + "step": 4086 + }, + { + "epoch": 0.10342384290305438, + "grad_norm": 11.89619255065918, + "learning_rate": 9.781965092948046e-06, + "loss": 0.3287, + "step": 4087 + }, + { + "epoch": 0.10344914846774805, + "grad_norm": 8.527093887329102, + "learning_rate": 9.781847799679616e-06, + "loss": 0.1149, + "step": 4088 + }, + { + "epoch": 0.10347445403244174, + "grad_norm": 8.045331001281738, + "learning_rate": 9.781730475573938e-06, + "loss": 0.2359, + "step": 4089 + }, + { + "epoch": 0.10349975959713541, + "grad_norm": 7.252604007720947, + "learning_rate": 9.781613120631773e-06, + "loss": 0.2166, + "step": 4090 + }, + { + "epoch": 0.10352506516182909, + "grad_norm": 10.65107250213623, + "learning_rate": 9.781495734853873e-06, + "loss": 0.1975, + "step": 4091 + }, + { + "epoch": 0.10355037072652276, + "grad_norm": 7.950842380523682, + "learning_rate": 9.781378318240997e-06, + "loss": 0.2562, + "step": 4092 + }, + { + "epoch": 0.10357567629121645, + "grad_norm": 8.472436904907227, + "learning_rate": 9.7812608707939e-06, + "loss": 0.2095, + "step": 4093 + }, + { + "epoch": 0.10360098185591012, + "grad_norm": 8.50566577911377, + "learning_rate": 9.781143392513343e-06, + "loss": 0.2216, + "step": 4094 + }, + { + "epoch": 0.10362628742060379, + "grad_norm": 4.4668731689453125, + "learning_rate": 9.78102588340008e-06, + "loss": 0.2484, + "step": 4095 + }, + { + "epoch": 0.10365159298529747, + "grad_norm": 4.1588454246521, + "learning_rate": 9.780908343454872e-06, + "loss": 0.186, + "step": 4096 + }, + { + "epoch": 0.10367689854999114, + "grad_norm": 5.5731329917907715, + "learning_rate": 9.780790772678475e-06, + "loss": 0.2063, + "step": 4097 + }, + { + "epoch": 0.10370220411468482, + "grad_norm": 4.796866416931152, + "learning_rate": 9.780673171071647e-06, + "loss": 0.1758, + "step": 4098 + }, + { + "epoch": 0.1037275096793785, + "grad_norm": 5.2141923904418945, + "learning_rate": 9.780555538635147e-06, + "loss": 0.1683, + "step": 4099 + }, + { + "epoch": 0.10375281524407218, + "grad_norm": 4.8142409324646, + "learning_rate": 9.780437875369733e-06, + "loss": 0.2004, + "step": 4100 + }, + { + "epoch": 0.10377812080876585, + "grad_norm": 4.631553649902344, + "learning_rate": 9.780320181276165e-06, + "loss": 0.153, + "step": 4101 + }, + { + "epoch": 0.10380342637345952, + "grad_norm": 7.626914978027344, + "learning_rate": 9.780202456355203e-06, + "loss": 0.2138, + "step": 4102 + }, + { + "epoch": 0.1038287319381532, + "grad_norm": 17.9088077545166, + "learning_rate": 9.780084700607603e-06, + "loss": 0.1561, + "step": 4103 + }, + { + "epoch": 0.10385403750284687, + "grad_norm": 9.947066307067871, + "learning_rate": 9.779966914034124e-06, + "loss": 0.2715, + "step": 4104 + }, + { + "epoch": 0.10387934306754056, + "grad_norm": 5.454441070556641, + "learning_rate": 9.77984909663553e-06, + "loss": 0.1771, + "step": 4105 + }, + { + "epoch": 0.10390464863223423, + "grad_norm": 7.454544544219971, + "learning_rate": 9.779731248412578e-06, + "loss": 0.2101, + "step": 4106 + }, + { + "epoch": 0.10392995419692791, + "grad_norm": 4.955561637878418, + "learning_rate": 9.77961336936603e-06, + "loss": 0.1835, + "step": 4107 + }, + { + "epoch": 0.10395525976162158, + "grad_norm": 10.10281753540039, + "learning_rate": 9.779495459496642e-06, + "loss": 0.3228, + "step": 4108 + }, + { + "epoch": 0.10398056532631526, + "grad_norm": 4.511797904968262, + "learning_rate": 9.779377518805178e-06, + "loss": 0.1698, + "step": 4109 + }, + { + "epoch": 0.10400587089100893, + "grad_norm": 4.403419494628906, + "learning_rate": 9.779259547292396e-06, + "loss": 0.1777, + "step": 4110 + }, + { + "epoch": 0.1040311764557026, + "grad_norm": 7.16106653213501, + "learning_rate": 9.779141544959058e-06, + "loss": 0.2416, + "step": 4111 + }, + { + "epoch": 0.10405648202039629, + "grad_norm": 6.770826816558838, + "learning_rate": 9.779023511805927e-06, + "loss": 0.188, + "step": 4112 + }, + { + "epoch": 0.10408178758508996, + "grad_norm": 7.016541004180908, + "learning_rate": 9.778905447833762e-06, + "loss": 0.2301, + "step": 4113 + }, + { + "epoch": 0.10410709314978364, + "grad_norm": 7.771493434906006, + "learning_rate": 9.778787353043324e-06, + "loss": 0.1819, + "step": 4114 + }, + { + "epoch": 0.10413239871447731, + "grad_norm": 6.073580741882324, + "learning_rate": 9.778669227435375e-06, + "loss": 0.1523, + "step": 4115 + }, + { + "epoch": 0.104157704279171, + "grad_norm": 6.918230056762695, + "learning_rate": 9.778551071010678e-06, + "loss": 0.2535, + "step": 4116 + }, + { + "epoch": 0.10418300984386467, + "grad_norm": 11.909960746765137, + "learning_rate": 9.778432883769993e-06, + "loss": 0.2678, + "step": 4117 + }, + { + "epoch": 0.10420831540855834, + "grad_norm": 6.916815757751465, + "learning_rate": 9.778314665714084e-06, + "loss": 0.2285, + "step": 4118 + }, + { + "epoch": 0.10423362097325202, + "grad_norm": 14.143866539001465, + "learning_rate": 9.778196416843712e-06, + "loss": 0.1978, + "step": 4119 + }, + { + "epoch": 0.10425892653794569, + "grad_norm": 7.505026817321777, + "learning_rate": 9.77807813715964e-06, + "loss": 0.2755, + "step": 4120 + }, + { + "epoch": 0.10428423210263937, + "grad_norm": 13.105923652648926, + "learning_rate": 9.777959826662631e-06, + "loss": 0.3318, + "step": 4121 + }, + { + "epoch": 0.10430953766733304, + "grad_norm": 5.218010425567627, + "learning_rate": 9.77784148535345e-06, + "loss": 0.1677, + "step": 4122 + }, + { + "epoch": 0.10433484323202673, + "grad_norm": 9.419866561889648, + "learning_rate": 9.777723113232856e-06, + "loss": 0.2252, + "step": 4123 + }, + { + "epoch": 0.1043601487967204, + "grad_norm": 4.171421051025391, + "learning_rate": 9.777604710301613e-06, + "loss": 0.2388, + "step": 4124 + }, + { + "epoch": 0.10438545436141408, + "grad_norm": 13.38797378540039, + "learning_rate": 9.777486276560488e-06, + "loss": 0.246, + "step": 4125 + }, + { + "epoch": 0.10441075992610775, + "grad_norm": 8.567349433898926, + "learning_rate": 9.777367812010243e-06, + "loss": 0.1479, + "step": 4126 + }, + { + "epoch": 0.10443606549080142, + "grad_norm": 9.090409278869629, + "learning_rate": 9.77724931665164e-06, + "loss": 0.3141, + "step": 4127 + }, + { + "epoch": 0.1044613710554951, + "grad_norm": 6.720006942749023, + "learning_rate": 9.777130790485445e-06, + "loss": 0.2112, + "step": 4128 + }, + { + "epoch": 0.10448667662018878, + "grad_norm": 8.509537696838379, + "learning_rate": 9.777012233512424e-06, + "loss": 0.2078, + "step": 4129 + }, + { + "epoch": 0.10451198218488246, + "grad_norm": 5.27195930480957, + "learning_rate": 9.776893645733338e-06, + "loss": 0.203, + "step": 4130 + }, + { + "epoch": 0.10453728774957613, + "grad_norm": 4.6011199951171875, + "learning_rate": 9.776775027148954e-06, + "loss": 0.1632, + "step": 4131 + }, + { + "epoch": 0.10456259331426981, + "grad_norm": 8.835604667663574, + "learning_rate": 9.776656377760037e-06, + "loss": 0.2222, + "step": 4132 + }, + { + "epoch": 0.10458789887896348, + "grad_norm": 9.730175971984863, + "learning_rate": 9.77653769756735e-06, + "loss": 0.2718, + "step": 4133 + }, + { + "epoch": 0.10461320444365715, + "grad_norm": 7.421168327331543, + "learning_rate": 9.77641898657166e-06, + "loss": 0.1956, + "step": 4134 + }, + { + "epoch": 0.10463851000835084, + "grad_norm": 3.744446039199829, + "learning_rate": 9.776300244773733e-06, + "loss": 0.1943, + "step": 4135 + }, + { + "epoch": 0.10466381557304451, + "grad_norm": 4.318467617034912, + "learning_rate": 9.776181472174335e-06, + "loss": 0.1617, + "step": 4136 + }, + { + "epoch": 0.10468912113773819, + "grad_norm": 9.729085922241211, + "learning_rate": 9.77606266877423e-06, + "loss": 0.1963, + "step": 4137 + }, + { + "epoch": 0.10471442670243186, + "grad_norm": 6.404327392578125, + "learning_rate": 9.775943834574187e-06, + "loss": 0.1919, + "step": 4138 + }, + { + "epoch": 0.10473973226712555, + "grad_norm": 7.132415771484375, + "learning_rate": 9.775824969574968e-06, + "loss": 0.224, + "step": 4139 + }, + { + "epoch": 0.10476503783181922, + "grad_norm": 4.001979351043701, + "learning_rate": 9.775706073777344e-06, + "loss": 0.1388, + "step": 4140 + }, + { + "epoch": 0.1047903433965129, + "grad_norm": 13.017075538635254, + "learning_rate": 9.77558714718208e-06, + "loss": 0.1743, + "step": 4141 + }, + { + "epoch": 0.10481564896120657, + "grad_norm": 6.134490966796875, + "learning_rate": 9.775468189789942e-06, + "loss": 0.2105, + "step": 4142 + }, + { + "epoch": 0.10484095452590024, + "grad_norm": 6.479538440704346, + "learning_rate": 9.775349201601699e-06, + "loss": 0.2316, + "step": 4143 + }, + { + "epoch": 0.10486626009059392, + "grad_norm": 6.750748157501221, + "learning_rate": 9.775230182618114e-06, + "loss": 0.2645, + "step": 4144 + }, + { + "epoch": 0.1048915656552876, + "grad_norm": 10.353535652160645, + "learning_rate": 9.775111132839961e-06, + "loss": 0.2016, + "step": 4145 + }, + { + "epoch": 0.10491687121998128, + "grad_norm": 12.363527297973633, + "learning_rate": 9.774992052268005e-06, + "loss": 0.3166, + "step": 4146 + }, + { + "epoch": 0.10494217678467495, + "grad_norm": 6.155625820159912, + "learning_rate": 9.77487294090301e-06, + "loss": 0.1857, + "step": 4147 + }, + { + "epoch": 0.10496748234936863, + "grad_norm": 7.7672624588012695, + "learning_rate": 9.77475379874575e-06, + "loss": 0.1747, + "step": 4148 + }, + { + "epoch": 0.1049927879140623, + "grad_norm": 7.231545448303223, + "learning_rate": 9.774634625796992e-06, + "loss": 0.2453, + "step": 4149 + }, + { + "epoch": 0.10501809347875597, + "grad_norm": 6.027119159698486, + "learning_rate": 9.774515422057502e-06, + "loss": 0.201, + "step": 4150 + }, + { + "epoch": 0.10504339904344966, + "grad_norm": 3.731826066970825, + "learning_rate": 9.77439618752805e-06, + "loss": 0.1443, + "step": 4151 + }, + { + "epoch": 0.10506870460814333, + "grad_norm": 5.563605785369873, + "learning_rate": 9.774276922209407e-06, + "loss": 0.1929, + "step": 4152 + }, + { + "epoch": 0.10509401017283701, + "grad_norm": 3.1983909606933594, + "learning_rate": 9.774157626102337e-06, + "loss": 0.092, + "step": 4153 + }, + { + "epoch": 0.10511931573753068, + "grad_norm": 5.910723686218262, + "learning_rate": 9.774038299207615e-06, + "loss": 0.2602, + "step": 4154 + }, + { + "epoch": 0.10514462130222436, + "grad_norm": 24.83504295349121, + "learning_rate": 9.773918941526005e-06, + "loss": 0.2619, + "step": 4155 + }, + { + "epoch": 0.10516992686691803, + "grad_norm": 14.459009170532227, + "learning_rate": 9.773799553058283e-06, + "loss": 0.2963, + "step": 4156 + }, + { + "epoch": 0.10519523243161172, + "grad_norm": 7.041476249694824, + "learning_rate": 9.773680133805215e-06, + "loss": 0.2728, + "step": 4157 + }, + { + "epoch": 0.10522053799630539, + "grad_norm": 7.18710470199585, + "learning_rate": 9.773560683767572e-06, + "loss": 0.2348, + "step": 4158 + }, + { + "epoch": 0.10524584356099906, + "grad_norm": 5.6590070724487305, + "learning_rate": 9.773441202946124e-06, + "loss": 0.1837, + "step": 4159 + }, + { + "epoch": 0.10527114912569274, + "grad_norm": 4.527221202850342, + "learning_rate": 9.77332169134164e-06, + "loss": 0.2165, + "step": 4160 + }, + { + "epoch": 0.10529645469038641, + "grad_norm": 4.0869903564453125, + "learning_rate": 9.773202148954895e-06, + "loss": 0.1754, + "step": 4161 + }, + { + "epoch": 0.1053217602550801, + "grad_norm": 5.363789081573486, + "learning_rate": 9.773082575786654e-06, + "loss": 0.2114, + "step": 4162 + }, + { + "epoch": 0.10534706581977377, + "grad_norm": 6.814024925231934, + "learning_rate": 9.772962971837694e-06, + "loss": 0.2482, + "step": 4163 + }, + { + "epoch": 0.10537237138446745, + "grad_norm": 9.368720054626465, + "learning_rate": 9.772843337108783e-06, + "loss": 0.2788, + "step": 4164 + }, + { + "epoch": 0.10539767694916112, + "grad_norm": 4.830645561218262, + "learning_rate": 9.772723671600693e-06, + "loss": 0.179, + "step": 4165 + }, + { + "epoch": 0.10542298251385479, + "grad_norm": 12.260281562805176, + "learning_rate": 9.772603975314195e-06, + "loss": 0.2564, + "step": 4166 + }, + { + "epoch": 0.10544828807854847, + "grad_norm": 6.42793607711792, + "learning_rate": 9.772484248250062e-06, + "loss": 0.228, + "step": 4167 + }, + { + "epoch": 0.10547359364324214, + "grad_norm": 9.341279029846191, + "learning_rate": 9.772364490409067e-06, + "loss": 0.257, + "step": 4168 + }, + { + "epoch": 0.10549889920793583, + "grad_norm": 13.95773983001709, + "learning_rate": 9.77224470179198e-06, + "loss": 0.3462, + "step": 4169 + }, + { + "epoch": 0.1055242047726295, + "grad_norm": 6.663071155548096, + "learning_rate": 9.772124882399575e-06, + "loss": 0.1654, + "step": 4170 + }, + { + "epoch": 0.10554951033732318, + "grad_norm": 8.294903755187988, + "learning_rate": 9.772005032232625e-06, + "loss": 0.2586, + "step": 4171 + }, + { + "epoch": 0.10557481590201685, + "grad_norm": 7.8188347816467285, + "learning_rate": 9.771885151291902e-06, + "loss": 0.2191, + "step": 4172 + }, + { + "epoch": 0.10560012146671054, + "grad_norm": 7.997622013092041, + "learning_rate": 9.771765239578176e-06, + "loss": 0.182, + "step": 4173 + }, + { + "epoch": 0.1056254270314042, + "grad_norm": 4.566112518310547, + "learning_rate": 9.771645297092228e-06, + "loss": 0.2156, + "step": 4174 + }, + { + "epoch": 0.10565073259609788, + "grad_norm": 8.006340026855469, + "learning_rate": 9.771525323834826e-06, + "loss": 0.1583, + "step": 4175 + }, + { + "epoch": 0.10567603816079156, + "grad_norm": 5.919551849365234, + "learning_rate": 9.771405319806743e-06, + "loss": 0.1405, + "step": 4176 + }, + { + "epoch": 0.10570134372548523, + "grad_norm": 4.223970890045166, + "learning_rate": 9.771285285008756e-06, + "loss": 0.1685, + "step": 4177 + }, + { + "epoch": 0.10572664929017891, + "grad_norm": 3.7006988525390625, + "learning_rate": 9.771165219441639e-06, + "loss": 0.1939, + "step": 4178 + }, + { + "epoch": 0.10575195485487258, + "grad_norm": 5.788569450378418, + "learning_rate": 9.771045123106162e-06, + "loss": 0.128, + "step": 4179 + }, + { + "epoch": 0.10577726041956627, + "grad_norm": 17.82853126525879, + "learning_rate": 9.770924996003106e-06, + "loss": 0.3524, + "step": 4180 + }, + { + "epoch": 0.10580256598425994, + "grad_norm": 9.018245697021484, + "learning_rate": 9.770804838133239e-06, + "loss": 0.211, + "step": 4181 + }, + { + "epoch": 0.10582787154895361, + "grad_norm": 7.640196800231934, + "learning_rate": 9.770684649497341e-06, + "loss": 0.2071, + "step": 4182 + }, + { + "epoch": 0.10585317711364729, + "grad_norm": 10.123881340026855, + "learning_rate": 9.770564430096184e-06, + "loss": 0.2396, + "step": 4183 + }, + { + "epoch": 0.10587848267834096, + "grad_norm": 4.414417743682861, + "learning_rate": 9.770444179930544e-06, + "loss": 0.1812, + "step": 4184 + }, + { + "epoch": 0.10590378824303465, + "grad_norm": 3.1756949424743652, + "learning_rate": 9.770323899001198e-06, + "loss": 0.1957, + "step": 4185 + }, + { + "epoch": 0.10592909380772832, + "grad_norm": 16.492687225341797, + "learning_rate": 9.77020358730892e-06, + "loss": 0.2264, + "step": 4186 + }, + { + "epoch": 0.105954399372422, + "grad_norm": 8.014866828918457, + "learning_rate": 9.770083244854487e-06, + "loss": 0.2104, + "step": 4187 + }, + { + "epoch": 0.10597970493711567, + "grad_norm": 9.134064674377441, + "learning_rate": 9.769962871638674e-06, + "loss": 0.1966, + "step": 4188 + }, + { + "epoch": 0.10600501050180935, + "grad_norm": 3.2946548461914062, + "learning_rate": 9.769842467662258e-06, + "loss": 0.1777, + "step": 4189 + }, + { + "epoch": 0.10603031606650302, + "grad_norm": 7.775219917297363, + "learning_rate": 9.769722032926015e-06, + "loss": 0.2452, + "step": 4190 + }, + { + "epoch": 0.1060556216311967, + "grad_norm": 7.500355243682861, + "learning_rate": 9.769601567430722e-06, + "loss": 0.1585, + "step": 4191 + }, + { + "epoch": 0.10608092719589038, + "grad_norm": 14.068971633911133, + "learning_rate": 9.769481071177154e-06, + "loss": 0.4024, + "step": 4192 + }, + { + "epoch": 0.10610623276058405, + "grad_norm": 8.53516674041748, + "learning_rate": 9.769360544166092e-06, + "loss": 0.1717, + "step": 4193 + }, + { + "epoch": 0.10613153832527773, + "grad_norm": 2.6435117721557617, + "learning_rate": 9.76923998639831e-06, + "loss": 0.178, + "step": 4194 + }, + { + "epoch": 0.1061568438899714, + "grad_norm": 4.087888717651367, + "learning_rate": 9.769119397874587e-06, + "loss": 0.1479, + "step": 4195 + }, + { + "epoch": 0.10618214945466509, + "grad_norm": 5.170604228973389, + "learning_rate": 9.7689987785957e-06, + "loss": 0.2522, + "step": 4196 + }, + { + "epoch": 0.10620745501935876, + "grad_norm": 3.930145740509033, + "learning_rate": 9.768878128562427e-06, + "loss": 0.156, + "step": 4197 + }, + { + "epoch": 0.10623276058405243, + "grad_norm": 5.536209583282471, + "learning_rate": 9.768757447775546e-06, + "loss": 0.1418, + "step": 4198 + }, + { + "epoch": 0.10625806614874611, + "grad_norm": 7.428445339202881, + "learning_rate": 9.768636736235835e-06, + "loss": 0.1257, + "step": 4199 + }, + { + "epoch": 0.10628337171343978, + "grad_norm": 4.036035537719727, + "learning_rate": 9.768515993944072e-06, + "loss": 0.132, + "step": 4200 + }, + { + "epoch": 0.10630867727813346, + "grad_norm": 5.094107151031494, + "learning_rate": 9.768395220901037e-06, + "loss": 0.2881, + "step": 4201 + }, + { + "epoch": 0.10633398284282713, + "grad_norm": 7.24627161026001, + "learning_rate": 9.768274417107509e-06, + "loss": 0.2681, + "step": 4202 + }, + { + "epoch": 0.10635928840752082, + "grad_norm": 5.871206760406494, + "learning_rate": 9.768153582564266e-06, + "loss": 0.1494, + "step": 4203 + }, + { + "epoch": 0.10638459397221449, + "grad_norm": 7.876039028167725, + "learning_rate": 9.768032717272087e-06, + "loss": 0.2628, + "step": 4204 + }, + { + "epoch": 0.10640989953690817, + "grad_norm": 6.804625988006592, + "learning_rate": 9.767911821231753e-06, + "loss": 0.1945, + "step": 4205 + }, + { + "epoch": 0.10643520510160184, + "grad_norm": 6.843657970428467, + "learning_rate": 9.76779089444404e-06, + "loss": 0.2404, + "step": 4206 + }, + { + "epoch": 0.10646051066629551, + "grad_norm": 5.035196304321289, + "learning_rate": 9.767669936909733e-06, + "loss": 0.1114, + "step": 4207 + }, + { + "epoch": 0.1064858162309892, + "grad_norm": 4.062743186950684, + "learning_rate": 9.767548948629608e-06, + "loss": 0.1853, + "step": 4208 + }, + { + "epoch": 0.10651112179568287, + "grad_norm": 16.83101463317871, + "learning_rate": 9.767427929604448e-06, + "loss": 0.2973, + "step": 4209 + }, + { + "epoch": 0.10653642736037655, + "grad_norm": 3.812870502471924, + "learning_rate": 9.76730687983503e-06, + "loss": 0.1743, + "step": 4210 + }, + { + "epoch": 0.10656173292507022, + "grad_norm": 10.489871978759766, + "learning_rate": 9.767185799322139e-06, + "loss": 0.3597, + "step": 4211 + }, + { + "epoch": 0.1065870384897639, + "grad_norm": 8.838740348815918, + "learning_rate": 9.767064688066552e-06, + "loss": 0.2309, + "step": 4212 + }, + { + "epoch": 0.10661234405445758, + "grad_norm": 7.63209342956543, + "learning_rate": 9.766943546069052e-06, + "loss": 0.1719, + "step": 4213 + }, + { + "epoch": 0.10663764961915125, + "grad_norm": 3.5542614459991455, + "learning_rate": 9.76682237333042e-06, + "loss": 0.178, + "step": 4214 + }, + { + "epoch": 0.10666295518384493, + "grad_norm": 12.254000663757324, + "learning_rate": 9.766701169851436e-06, + "loss": 0.325, + "step": 4215 + }, + { + "epoch": 0.1066882607485386, + "grad_norm": 5.505725860595703, + "learning_rate": 9.766579935632885e-06, + "loss": 0.1727, + "step": 4216 + }, + { + "epoch": 0.10671356631323228, + "grad_norm": 9.085476875305176, + "learning_rate": 9.766458670675545e-06, + "loss": 0.2152, + "step": 4217 + }, + { + "epoch": 0.10673887187792595, + "grad_norm": 4.788471221923828, + "learning_rate": 9.766337374980201e-06, + "loss": 0.1285, + "step": 4218 + }, + { + "epoch": 0.10676417744261964, + "grad_norm": 7.455333709716797, + "learning_rate": 9.766216048547633e-06, + "loss": 0.2522, + "step": 4219 + }, + { + "epoch": 0.10678948300731331, + "grad_norm": 63.6682014465332, + "learning_rate": 9.766094691378625e-06, + "loss": 0.4203, + "step": 4220 + }, + { + "epoch": 0.10681478857200699, + "grad_norm": 6.596709251403809, + "learning_rate": 9.765973303473957e-06, + "loss": 0.2117, + "step": 4221 + }, + { + "epoch": 0.10684009413670066, + "grad_norm": 3.4778170585632324, + "learning_rate": 9.765851884834415e-06, + "loss": 0.1886, + "step": 4222 + }, + { + "epoch": 0.10686539970139433, + "grad_norm": 25.118488311767578, + "learning_rate": 9.76573043546078e-06, + "loss": 0.393, + "step": 4223 + }, + { + "epoch": 0.10689070526608802, + "grad_norm": 4.834495544433594, + "learning_rate": 9.765608955353838e-06, + "loss": 0.1986, + "step": 4224 + }, + { + "epoch": 0.10691601083078169, + "grad_norm": 8.959809303283691, + "learning_rate": 9.765487444514369e-06, + "loss": 0.3184, + "step": 4225 + }, + { + "epoch": 0.10694131639547537, + "grad_norm": 7.0230183601379395, + "learning_rate": 9.765365902943158e-06, + "loss": 0.1877, + "step": 4226 + }, + { + "epoch": 0.10696662196016904, + "grad_norm": 4.797277450561523, + "learning_rate": 9.765244330640988e-06, + "loss": 0.2128, + "step": 4227 + }, + { + "epoch": 0.10699192752486272, + "grad_norm": 4.111161231994629, + "learning_rate": 9.765122727608645e-06, + "loss": 0.1894, + "step": 4228 + }, + { + "epoch": 0.1070172330895564, + "grad_norm": 6.77517032623291, + "learning_rate": 9.76500109384691e-06, + "loss": 0.1675, + "step": 4229 + }, + { + "epoch": 0.10704253865425006, + "grad_norm": 8.89239501953125, + "learning_rate": 9.76487942935657e-06, + "loss": 0.2401, + "step": 4230 + }, + { + "epoch": 0.10706784421894375, + "grad_norm": 8.140174865722656, + "learning_rate": 9.76475773413841e-06, + "loss": 0.3378, + "step": 4231 + }, + { + "epoch": 0.10709314978363742, + "grad_norm": 11.94819164276123, + "learning_rate": 9.764636008193214e-06, + "loss": 0.2338, + "step": 4232 + }, + { + "epoch": 0.1071184553483311, + "grad_norm": 5.034268379211426, + "learning_rate": 9.764514251521764e-06, + "loss": 0.1402, + "step": 4233 + }, + { + "epoch": 0.10714376091302477, + "grad_norm": 4.189410209655762, + "learning_rate": 9.764392464124852e-06, + "loss": 0.1684, + "step": 4234 + }, + { + "epoch": 0.10716906647771846, + "grad_norm": 5.021290302276611, + "learning_rate": 9.764270646003255e-06, + "loss": 0.2062, + "step": 4235 + }, + { + "epoch": 0.10719437204241213, + "grad_norm": 9.585960388183594, + "learning_rate": 9.764148797157765e-06, + "loss": 0.2421, + "step": 4236 + }, + { + "epoch": 0.10721967760710581, + "grad_norm": 5.943109512329102, + "learning_rate": 9.764026917589165e-06, + "loss": 0.2311, + "step": 4237 + }, + { + "epoch": 0.10724498317179948, + "grad_norm": 5.16048526763916, + "learning_rate": 9.763905007298243e-06, + "loss": 0.2073, + "step": 4238 + }, + { + "epoch": 0.10727028873649315, + "grad_norm": 5.563462257385254, + "learning_rate": 9.763783066285783e-06, + "loss": 0.1586, + "step": 4239 + }, + { + "epoch": 0.10729559430118683, + "grad_norm": 4.02933931350708, + "learning_rate": 9.763661094552571e-06, + "loss": 0.1642, + "step": 4240 + }, + { + "epoch": 0.1073208998658805, + "grad_norm": 8.389654159545898, + "learning_rate": 9.763539092099396e-06, + "loss": 0.1849, + "step": 4241 + }, + { + "epoch": 0.10734620543057419, + "grad_norm": 6.367958068847656, + "learning_rate": 9.763417058927042e-06, + "loss": 0.2059, + "step": 4242 + }, + { + "epoch": 0.10737151099526786, + "grad_norm": 5.177082061767578, + "learning_rate": 9.763294995036299e-06, + "loss": 0.2349, + "step": 4243 + }, + { + "epoch": 0.10739681655996154, + "grad_norm": 7.881721496582031, + "learning_rate": 9.763172900427951e-06, + "loss": 0.1899, + "step": 4244 + }, + { + "epoch": 0.10742212212465521, + "grad_norm": 9.094575881958008, + "learning_rate": 9.763050775102789e-06, + "loss": 0.2156, + "step": 4245 + }, + { + "epoch": 0.10744742768934888, + "grad_norm": 10.536222457885742, + "learning_rate": 9.762928619061598e-06, + "loss": 0.2205, + "step": 4246 + }, + { + "epoch": 0.10747273325404257, + "grad_norm": 3.643033742904663, + "learning_rate": 9.762806432305166e-06, + "loss": 0.2028, + "step": 4247 + }, + { + "epoch": 0.10749803881873624, + "grad_norm": 15.42984390258789, + "learning_rate": 9.762684214834281e-06, + "loss": 0.321, + "step": 4248 + }, + { + "epoch": 0.10752334438342992, + "grad_norm": 17.487972259521484, + "learning_rate": 9.76256196664973e-06, + "loss": 0.2915, + "step": 4249 + }, + { + "epoch": 0.10754864994812359, + "grad_norm": 6.010347843170166, + "learning_rate": 9.762439687752305e-06, + "loss": 0.297, + "step": 4250 + }, + { + "epoch": 0.10757395551281727, + "grad_norm": 10.759389877319336, + "learning_rate": 9.762317378142792e-06, + "loss": 0.4, + "step": 4251 + }, + { + "epoch": 0.10759926107751094, + "grad_norm": 15.831520080566406, + "learning_rate": 9.76219503782198e-06, + "loss": 0.2624, + "step": 4252 + }, + { + "epoch": 0.10762456664220463, + "grad_norm": 4.450117111206055, + "learning_rate": 9.762072666790658e-06, + "loss": 0.1352, + "step": 4253 + }, + { + "epoch": 0.1076498722068983, + "grad_norm": 6.329897403717041, + "learning_rate": 9.761950265049615e-06, + "loss": 0.2011, + "step": 4254 + }, + { + "epoch": 0.10767517777159197, + "grad_norm": 4.7796173095703125, + "learning_rate": 9.761827832599639e-06, + "loss": 0.2416, + "step": 4255 + }, + { + "epoch": 0.10770048333628565, + "grad_norm": 3.98667049407959, + "learning_rate": 9.761705369441523e-06, + "loss": 0.1157, + "step": 4256 + }, + { + "epoch": 0.10772578890097932, + "grad_norm": 2.8487460613250732, + "learning_rate": 9.761582875576054e-06, + "loss": 0.106, + "step": 4257 + }, + { + "epoch": 0.107751094465673, + "grad_norm": 6.572669506072998, + "learning_rate": 9.761460351004025e-06, + "loss": 0.2684, + "step": 4258 + }, + { + "epoch": 0.10777640003036668, + "grad_norm": 6.133720397949219, + "learning_rate": 9.76133779572622e-06, + "loss": 0.2189, + "step": 4259 + }, + { + "epoch": 0.10780170559506036, + "grad_norm": 8.527758598327637, + "learning_rate": 9.761215209743437e-06, + "loss": 0.2236, + "step": 4260 + }, + { + "epoch": 0.10782701115975403, + "grad_norm": 8.388337135314941, + "learning_rate": 9.76109259305646e-06, + "loss": 0.2718, + "step": 4261 + }, + { + "epoch": 0.1078523167244477, + "grad_norm": 7.707720756530762, + "learning_rate": 9.760969945666084e-06, + "loss": 0.2966, + "step": 4262 + }, + { + "epoch": 0.10787762228914138, + "grad_norm": 5.400269985198975, + "learning_rate": 9.760847267573099e-06, + "loss": 0.1615, + "step": 4263 + }, + { + "epoch": 0.10790292785383505, + "grad_norm": 15.235986709594727, + "learning_rate": 9.760724558778294e-06, + "loss": 0.2828, + "step": 4264 + }, + { + "epoch": 0.10792823341852874, + "grad_norm": 5.195121765136719, + "learning_rate": 9.760601819282463e-06, + "loss": 0.2173, + "step": 4265 + }, + { + "epoch": 0.10795353898322241, + "grad_norm": 9.351164817810059, + "learning_rate": 9.760479049086396e-06, + "loss": 0.2676, + "step": 4266 + }, + { + "epoch": 0.10797884454791609, + "grad_norm": 12.640624046325684, + "learning_rate": 9.760356248190884e-06, + "loss": 0.2313, + "step": 4267 + }, + { + "epoch": 0.10800415011260976, + "grad_norm": 12.922294616699219, + "learning_rate": 9.76023341659672e-06, + "loss": 0.3598, + "step": 4268 + }, + { + "epoch": 0.10802945567730345, + "grad_norm": 5.614515781402588, + "learning_rate": 9.760110554304699e-06, + "loss": 0.1385, + "step": 4269 + }, + { + "epoch": 0.10805476124199712, + "grad_norm": 4.713812828063965, + "learning_rate": 9.75998766131561e-06, + "loss": 0.148, + "step": 4270 + }, + { + "epoch": 0.10808006680669079, + "grad_norm": 4.627410888671875, + "learning_rate": 9.759864737630242e-06, + "loss": 0.2819, + "step": 4271 + }, + { + "epoch": 0.10810537237138447, + "grad_norm": 6.146586894989014, + "learning_rate": 9.759741783249394e-06, + "loss": 0.2128, + "step": 4272 + }, + { + "epoch": 0.10813067793607814, + "grad_norm": 16.190223693847656, + "learning_rate": 9.759618798173857e-06, + "loss": 0.3106, + "step": 4273 + }, + { + "epoch": 0.10815598350077182, + "grad_norm": 9.641627311706543, + "learning_rate": 9.759495782404425e-06, + "loss": 0.2054, + "step": 4274 + }, + { + "epoch": 0.1081812890654655, + "grad_norm": 7.441507339477539, + "learning_rate": 9.759372735941888e-06, + "loss": 0.1632, + "step": 4275 + }, + { + "epoch": 0.10820659463015918, + "grad_norm": 2.9311206340789795, + "learning_rate": 9.759249658787043e-06, + "loss": 0.1911, + "step": 4276 + }, + { + "epoch": 0.10823190019485285, + "grad_norm": 4.818187236785889, + "learning_rate": 9.75912655094068e-06, + "loss": 0.157, + "step": 4277 + }, + { + "epoch": 0.10825720575954652, + "grad_norm": 11.064008712768555, + "learning_rate": 9.759003412403599e-06, + "loss": 0.3087, + "step": 4278 + }, + { + "epoch": 0.1082825113242402, + "grad_norm": 4.365088939666748, + "learning_rate": 9.758880243176587e-06, + "loss": 0.1787, + "step": 4279 + }, + { + "epoch": 0.10830781688893387, + "grad_norm": 5.7919020652771, + "learning_rate": 9.758757043260442e-06, + "loss": 0.2857, + "step": 4280 + }, + { + "epoch": 0.10833312245362756, + "grad_norm": 9.520122528076172, + "learning_rate": 9.75863381265596e-06, + "loss": 0.3149, + "step": 4281 + }, + { + "epoch": 0.10835842801832123, + "grad_norm": 6.911755561828613, + "learning_rate": 9.75851055136393e-06, + "loss": 0.1906, + "step": 4282 + }, + { + "epoch": 0.10838373358301491, + "grad_norm": 4.882405757904053, + "learning_rate": 9.758387259385154e-06, + "loss": 0.1953, + "step": 4283 + }, + { + "epoch": 0.10840903914770858, + "grad_norm": 4.833568572998047, + "learning_rate": 9.758263936720423e-06, + "loss": 0.2062, + "step": 4284 + }, + { + "epoch": 0.10843434471240226, + "grad_norm": 5.026228427886963, + "learning_rate": 9.758140583370532e-06, + "loss": 0.2046, + "step": 4285 + }, + { + "epoch": 0.10845965027709593, + "grad_norm": 4.254002094268799, + "learning_rate": 9.758017199336279e-06, + "loss": 0.1582, + "step": 4286 + }, + { + "epoch": 0.1084849558417896, + "grad_norm": 7.742366790771484, + "learning_rate": 9.757893784618455e-06, + "loss": 0.2486, + "step": 4287 + }, + { + "epoch": 0.10851026140648329, + "grad_norm": 8.608728408813477, + "learning_rate": 9.757770339217862e-06, + "loss": 0.1698, + "step": 4288 + }, + { + "epoch": 0.10853556697117696, + "grad_norm": 11.540090560913086, + "learning_rate": 9.757646863135291e-06, + "loss": 0.236, + "step": 4289 + }, + { + "epoch": 0.10856087253587064, + "grad_norm": 19.29571533203125, + "learning_rate": 9.757523356371542e-06, + "loss": 0.2192, + "step": 4290 + }, + { + "epoch": 0.10858617810056431, + "grad_norm": 5.141416072845459, + "learning_rate": 9.75739981892741e-06, + "loss": 0.1753, + "step": 4291 + }, + { + "epoch": 0.108611483665258, + "grad_norm": 8.348727226257324, + "learning_rate": 9.757276250803691e-06, + "loss": 0.1855, + "step": 4292 + }, + { + "epoch": 0.10863678922995167, + "grad_norm": 8.360726356506348, + "learning_rate": 9.757152652001181e-06, + "loss": 0.2091, + "step": 4293 + }, + { + "epoch": 0.10866209479464534, + "grad_norm": 8.424487113952637, + "learning_rate": 9.757029022520681e-06, + "loss": 0.1805, + "step": 4294 + }, + { + "epoch": 0.10868740035933902, + "grad_norm": 3.6664819717407227, + "learning_rate": 9.756905362362984e-06, + "loss": 0.1253, + "step": 4295 + }, + { + "epoch": 0.10871270592403269, + "grad_norm": 6.338119029998779, + "learning_rate": 9.75678167152889e-06, + "loss": 0.1162, + "step": 4296 + }, + { + "epoch": 0.10873801148872637, + "grad_norm": 11.212638854980469, + "learning_rate": 9.756657950019196e-06, + "loss": 0.245, + "step": 4297 + }, + { + "epoch": 0.10876331705342004, + "grad_norm": 6.325461387634277, + "learning_rate": 9.756534197834698e-06, + "loss": 0.2224, + "step": 4298 + }, + { + "epoch": 0.10878862261811373, + "grad_norm": 8.267169952392578, + "learning_rate": 9.756410414976195e-06, + "loss": 0.2188, + "step": 4299 + }, + { + "epoch": 0.1088139281828074, + "grad_norm": 4.065610885620117, + "learning_rate": 9.75628660144449e-06, + "loss": 0.1494, + "step": 4300 + }, + { + "epoch": 0.10883923374750108, + "grad_norm": 5.596909999847412, + "learning_rate": 9.756162757240375e-06, + "loss": 0.1895, + "step": 4301 + }, + { + "epoch": 0.10886453931219475, + "grad_norm": 5.072862148284912, + "learning_rate": 9.756038882364652e-06, + "loss": 0.1798, + "step": 4302 + }, + { + "epoch": 0.10888984487688842, + "grad_norm": 21.718399047851562, + "learning_rate": 9.755914976818118e-06, + "loss": 0.3758, + "step": 4303 + }, + { + "epoch": 0.1089151504415821, + "grad_norm": 7.726251602172852, + "learning_rate": 9.755791040601573e-06, + "loss": 0.3505, + "step": 4304 + }, + { + "epoch": 0.10894045600627578, + "grad_norm": 4.85481595993042, + "learning_rate": 9.755667073715817e-06, + "loss": 0.1384, + "step": 4305 + }, + { + "epoch": 0.10896576157096946, + "grad_norm": 8.80256175994873, + "learning_rate": 9.75554307616165e-06, + "loss": 0.229, + "step": 4306 + }, + { + "epoch": 0.10899106713566313, + "grad_norm": 10.973383903503418, + "learning_rate": 9.755419047939868e-06, + "loss": 0.2045, + "step": 4307 + }, + { + "epoch": 0.10901637270035681, + "grad_norm": 5.016086101531982, + "learning_rate": 9.755294989051272e-06, + "loss": 0.1654, + "step": 4308 + }, + { + "epoch": 0.10904167826505048, + "grad_norm": 17.19385528564453, + "learning_rate": 9.755170899496664e-06, + "loss": 0.1401, + "step": 4309 + }, + { + "epoch": 0.10906698382974415, + "grad_norm": 9.65201187133789, + "learning_rate": 9.755046779276844e-06, + "loss": 0.3414, + "step": 4310 + }, + { + "epoch": 0.10909228939443784, + "grad_norm": 7.179879188537598, + "learning_rate": 9.754922628392614e-06, + "loss": 0.17, + "step": 4311 + }, + { + "epoch": 0.10911759495913151, + "grad_norm": 4.4807257652282715, + "learning_rate": 9.754798446844769e-06, + "loss": 0.1778, + "step": 4312 + }, + { + "epoch": 0.10914290052382519, + "grad_norm": 6.348074913024902, + "learning_rate": 9.754674234634115e-06, + "loss": 0.2162, + "step": 4313 + }, + { + "epoch": 0.10916820608851886, + "grad_norm": 13.776793479919434, + "learning_rate": 9.75454999176145e-06, + "loss": 0.3143, + "step": 4314 + }, + { + "epoch": 0.10919351165321255, + "grad_norm": 17.318204879760742, + "learning_rate": 9.754425718227579e-06, + "loss": 0.2338, + "step": 4315 + }, + { + "epoch": 0.10921881721790622, + "grad_norm": 4.164055347442627, + "learning_rate": 9.7543014140333e-06, + "loss": 0.2108, + "step": 4316 + }, + { + "epoch": 0.1092441227825999, + "grad_norm": 8.901973724365234, + "learning_rate": 9.754177079179414e-06, + "loss": 0.2549, + "step": 4317 + }, + { + "epoch": 0.10926942834729357, + "grad_norm": 12.45174503326416, + "learning_rate": 9.754052713666726e-06, + "loss": 0.2007, + "step": 4318 + }, + { + "epoch": 0.10929473391198724, + "grad_norm": 6.100058078765869, + "learning_rate": 9.753928317496036e-06, + "loss": 0.192, + "step": 4319 + }, + { + "epoch": 0.10932003947668092, + "grad_norm": 7.0094146728515625, + "learning_rate": 9.753803890668146e-06, + "loss": 0.2521, + "step": 4320 + }, + { + "epoch": 0.1093453450413746, + "grad_norm": 7.596281051635742, + "learning_rate": 9.75367943318386e-06, + "loss": 0.1715, + "step": 4321 + }, + { + "epoch": 0.10937065060606828, + "grad_norm": 7.177692890167236, + "learning_rate": 9.753554945043978e-06, + "loss": 0.3121, + "step": 4322 + }, + { + "epoch": 0.10939595617076195, + "grad_norm": 5.478318691253662, + "learning_rate": 9.753430426249306e-06, + "loss": 0.208, + "step": 4323 + }, + { + "epoch": 0.10942126173545563, + "grad_norm": 6.6086225509643555, + "learning_rate": 9.753305876800645e-06, + "loss": 0.2199, + "step": 4324 + }, + { + "epoch": 0.1094465673001493, + "grad_norm": 8.135502815246582, + "learning_rate": 9.753181296698797e-06, + "loss": 0.3287, + "step": 4325 + }, + { + "epoch": 0.10947187286484297, + "grad_norm": 5.563817024230957, + "learning_rate": 9.753056685944568e-06, + "loss": 0.2762, + "step": 4326 + }, + { + "epoch": 0.10949717842953666, + "grad_norm": 6.679585933685303, + "learning_rate": 9.752932044538761e-06, + "loss": 0.1062, + "step": 4327 + }, + { + "epoch": 0.10952248399423033, + "grad_norm": 3.6042025089263916, + "learning_rate": 9.75280737248218e-06, + "loss": 0.1965, + "step": 4328 + }, + { + "epoch": 0.10954778955892401, + "grad_norm": 10.102043151855469, + "learning_rate": 9.752682669775628e-06, + "loss": 0.2583, + "step": 4329 + }, + { + "epoch": 0.10957309512361768, + "grad_norm": 4.711809158325195, + "learning_rate": 9.752557936419908e-06, + "loss": 0.127, + "step": 4330 + }, + { + "epoch": 0.10959840068831136, + "grad_norm": 6.876780033111572, + "learning_rate": 9.752433172415827e-06, + "loss": 0.2488, + "step": 4331 + }, + { + "epoch": 0.10962370625300503, + "grad_norm": 3.9874913692474365, + "learning_rate": 9.752308377764189e-06, + "loss": 0.1262, + "step": 4332 + }, + { + "epoch": 0.10964901181769872, + "grad_norm": 3.8517355918884277, + "learning_rate": 9.752183552465798e-06, + "loss": 0.1817, + "step": 4333 + }, + { + "epoch": 0.10967431738239239, + "grad_norm": 5.58670711517334, + "learning_rate": 9.752058696521458e-06, + "loss": 0.205, + "step": 4334 + }, + { + "epoch": 0.10969962294708606, + "grad_norm": 5.10333776473999, + "learning_rate": 9.751933809931977e-06, + "loss": 0.178, + "step": 4335 + }, + { + "epoch": 0.10972492851177974, + "grad_norm": 7.326529026031494, + "learning_rate": 9.751808892698158e-06, + "loss": 0.3094, + "step": 4336 + }, + { + "epoch": 0.10975023407647341, + "grad_norm": 4.357657432556152, + "learning_rate": 9.751683944820807e-06, + "loss": 0.2161, + "step": 4337 + }, + { + "epoch": 0.1097755396411671, + "grad_norm": 7.360988140106201, + "learning_rate": 9.751558966300731e-06, + "loss": 0.2234, + "step": 4338 + }, + { + "epoch": 0.10980084520586077, + "grad_norm": 7.741057395935059, + "learning_rate": 9.751433957138734e-06, + "loss": 0.2664, + "step": 4339 + }, + { + "epoch": 0.10982615077055445, + "grad_norm": 3.652590036392212, + "learning_rate": 9.751308917335624e-06, + "loss": 0.1303, + "step": 4340 + }, + { + "epoch": 0.10985145633524812, + "grad_norm": 5.970227241516113, + "learning_rate": 9.751183846892209e-06, + "loss": 0.2281, + "step": 4341 + }, + { + "epoch": 0.10987676189994179, + "grad_norm": 6.047496318817139, + "learning_rate": 9.75105874580929e-06, + "loss": 0.2669, + "step": 4342 + }, + { + "epoch": 0.10990206746463548, + "grad_norm": 3.866204261779785, + "learning_rate": 9.750933614087678e-06, + "loss": 0.2047, + "step": 4343 + }, + { + "epoch": 0.10992737302932915, + "grad_norm": 23.9704532623291, + "learning_rate": 9.750808451728178e-06, + "loss": 0.3361, + "step": 4344 + }, + { + "epoch": 0.10995267859402283, + "grad_norm": 5.550296783447266, + "learning_rate": 9.750683258731599e-06, + "loss": 0.1974, + "step": 4345 + }, + { + "epoch": 0.1099779841587165, + "grad_norm": 6.725069046020508, + "learning_rate": 9.750558035098746e-06, + "loss": 0.16, + "step": 4346 + }, + { + "epoch": 0.11000328972341018, + "grad_norm": 11.456296920776367, + "learning_rate": 9.75043278083043e-06, + "loss": 0.3227, + "step": 4347 + }, + { + "epoch": 0.11002859528810385, + "grad_norm": 8.587523460388184, + "learning_rate": 9.750307495927455e-06, + "loss": 0.0998, + "step": 4348 + }, + { + "epoch": 0.11005390085279754, + "grad_norm": 9.007246971130371, + "learning_rate": 9.75018218039063e-06, + "loss": 0.227, + "step": 4349 + }, + { + "epoch": 0.11007920641749121, + "grad_norm": 4.380722522735596, + "learning_rate": 9.750056834220764e-06, + "loss": 0.216, + "step": 4350 + }, + { + "epoch": 0.11010451198218488, + "grad_norm": 5.014342784881592, + "learning_rate": 9.749931457418667e-06, + "loss": 0.1772, + "step": 4351 + }, + { + "epoch": 0.11012981754687856, + "grad_norm": 5.20017671585083, + "learning_rate": 9.749806049985145e-06, + "loss": 0.1921, + "step": 4352 + }, + { + "epoch": 0.11015512311157223, + "grad_norm": 5.551333904266357, + "learning_rate": 9.749680611921007e-06, + "loss": 0.1885, + "step": 4353 + }, + { + "epoch": 0.11018042867626592, + "grad_norm": 6.468528747558594, + "learning_rate": 9.74955514322706e-06, + "loss": 0.2144, + "step": 4354 + }, + { + "epoch": 0.11020573424095959, + "grad_norm": 5.328756809234619, + "learning_rate": 9.749429643904117e-06, + "loss": 0.2492, + "step": 4355 + }, + { + "epoch": 0.11023103980565327, + "grad_norm": 7.445138931274414, + "learning_rate": 9.749304113952988e-06, + "loss": 0.2297, + "step": 4356 + }, + { + "epoch": 0.11025634537034694, + "grad_norm": 3.726928472518921, + "learning_rate": 9.749178553374478e-06, + "loss": 0.1771, + "step": 4357 + }, + { + "epoch": 0.11028165093504061, + "grad_norm": 4.568503379821777, + "learning_rate": 9.749052962169399e-06, + "loss": 0.2033, + "step": 4358 + }, + { + "epoch": 0.1103069564997343, + "grad_norm": 8.199727058410645, + "learning_rate": 9.748927340338561e-06, + "loss": 0.23, + "step": 4359 + }, + { + "epoch": 0.11033226206442796, + "grad_norm": 3.866816997528076, + "learning_rate": 9.748801687882775e-06, + "loss": 0.1773, + "step": 4360 + }, + { + "epoch": 0.11035756762912165, + "grad_norm": 3.6654744148254395, + "learning_rate": 9.748676004802848e-06, + "loss": 0.1154, + "step": 4361 + }, + { + "epoch": 0.11038287319381532, + "grad_norm": 5.394964218139648, + "learning_rate": 9.748550291099594e-06, + "loss": 0.1904, + "step": 4362 + }, + { + "epoch": 0.110408178758509, + "grad_norm": 5.4878716468811035, + "learning_rate": 9.748424546773822e-06, + "loss": 0.1839, + "step": 4363 + }, + { + "epoch": 0.11043348432320267, + "grad_norm": 5.636338233947754, + "learning_rate": 9.748298771826344e-06, + "loss": 0.2038, + "step": 4364 + }, + { + "epoch": 0.11045878988789636, + "grad_norm": 7.492364883422852, + "learning_rate": 9.74817296625797e-06, + "loss": 0.2172, + "step": 4365 + }, + { + "epoch": 0.11048409545259003, + "grad_norm": 13.438546180725098, + "learning_rate": 9.748047130069513e-06, + "loss": 0.3112, + "step": 4366 + }, + { + "epoch": 0.1105094010172837, + "grad_norm": 4.159576892852783, + "learning_rate": 9.747921263261781e-06, + "loss": 0.1532, + "step": 4367 + }, + { + "epoch": 0.11053470658197738, + "grad_norm": 11.484034538269043, + "learning_rate": 9.747795365835591e-06, + "loss": 0.2675, + "step": 4368 + }, + { + "epoch": 0.11056001214667105, + "grad_norm": 6.184978485107422, + "learning_rate": 9.747669437791749e-06, + "loss": 0.2032, + "step": 4369 + }, + { + "epoch": 0.11058531771136473, + "grad_norm": 6.767972469329834, + "learning_rate": 9.747543479131072e-06, + "loss": 0.1687, + "step": 4370 + }, + { + "epoch": 0.1106106232760584, + "grad_norm": 4.081185340881348, + "learning_rate": 9.74741748985437e-06, + "loss": 0.1701, + "step": 4371 + }, + { + "epoch": 0.11063592884075209, + "grad_norm": 7.6632843017578125, + "learning_rate": 9.747291469962454e-06, + "loss": 0.1573, + "step": 4372 + }, + { + "epoch": 0.11066123440544576, + "grad_norm": 5.345514297485352, + "learning_rate": 9.74716541945614e-06, + "loss": 0.1167, + "step": 4373 + }, + { + "epoch": 0.11068653997013943, + "grad_norm": 8.225020408630371, + "learning_rate": 9.747039338336237e-06, + "loss": 0.2809, + "step": 4374 + }, + { + "epoch": 0.11071184553483311, + "grad_norm": 6.123021125793457, + "learning_rate": 9.746913226603562e-06, + "loss": 0.1051, + "step": 4375 + }, + { + "epoch": 0.11073715109952678, + "grad_norm": 4.27128791809082, + "learning_rate": 9.746787084258927e-06, + "loss": 0.1667, + "step": 4376 + }, + { + "epoch": 0.11076245666422047, + "grad_norm": 3.731295585632324, + "learning_rate": 9.746660911303146e-06, + "loss": 0.1739, + "step": 4377 + }, + { + "epoch": 0.11078776222891414, + "grad_norm": 4.2237772941589355, + "learning_rate": 9.74653470773703e-06, + "loss": 0.2395, + "step": 4378 + }, + { + "epoch": 0.11081306779360782, + "grad_norm": 13.385733604431152, + "learning_rate": 9.746408473561393e-06, + "loss": 0.2619, + "step": 4379 + }, + { + "epoch": 0.11083837335830149, + "grad_norm": 6.797429084777832, + "learning_rate": 9.746282208777054e-06, + "loss": 0.2873, + "step": 4380 + }, + { + "epoch": 0.11086367892299517, + "grad_norm": 4.8965630531311035, + "learning_rate": 9.746155913384821e-06, + "loss": 0.2443, + "step": 4381 + }, + { + "epoch": 0.11088898448768884, + "grad_norm": 11.8800048828125, + "learning_rate": 9.746029587385513e-06, + "loss": 0.4032, + "step": 4382 + }, + { + "epoch": 0.11091429005238251, + "grad_norm": 4.136510848999023, + "learning_rate": 9.745903230779943e-06, + "loss": 0.1765, + "step": 4383 + }, + { + "epoch": 0.1109395956170762, + "grad_norm": 8.159950256347656, + "learning_rate": 9.745776843568927e-06, + "loss": 0.1851, + "step": 4384 + }, + { + "epoch": 0.11096490118176987, + "grad_norm": 15.602478981018066, + "learning_rate": 9.745650425753276e-06, + "loss": 0.4136, + "step": 4385 + }, + { + "epoch": 0.11099020674646355, + "grad_norm": 7.875895023345947, + "learning_rate": 9.74552397733381e-06, + "loss": 0.2126, + "step": 4386 + }, + { + "epoch": 0.11101551231115722, + "grad_norm": 4.231174468994141, + "learning_rate": 9.745397498311341e-06, + "loss": 0.2164, + "step": 4387 + }, + { + "epoch": 0.1110408178758509, + "grad_norm": 4.515319347381592, + "learning_rate": 9.74527098868669e-06, + "loss": 0.2134, + "step": 4388 + }, + { + "epoch": 0.11106612344054458, + "grad_norm": 3.9259555339813232, + "learning_rate": 9.745144448460667e-06, + "loss": 0.1702, + "step": 4389 + }, + { + "epoch": 0.11109142900523825, + "grad_norm": 4.173647880554199, + "learning_rate": 9.745017877634088e-06, + "loss": 0.2134, + "step": 4390 + }, + { + "epoch": 0.11111673456993193, + "grad_norm": 6.697768211364746, + "learning_rate": 9.744891276207774e-06, + "loss": 0.2139, + "step": 4391 + }, + { + "epoch": 0.1111420401346256, + "grad_norm": 3.7706046104431152, + "learning_rate": 9.744764644182538e-06, + "loss": 0.182, + "step": 4392 + }, + { + "epoch": 0.11116734569931928, + "grad_norm": 7.092607021331787, + "learning_rate": 9.744637981559197e-06, + "loss": 0.275, + "step": 4393 + }, + { + "epoch": 0.11119265126401295, + "grad_norm": 6.7909040451049805, + "learning_rate": 9.74451128833857e-06, + "loss": 0.2091, + "step": 4394 + }, + { + "epoch": 0.11121795682870664, + "grad_norm": 5.752892971038818, + "learning_rate": 9.74438456452147e-06, + "loss": 0.1678, + "step": 4395 + }, + { + "epoch": 0.11124326239340031, + "grad_norm": 3.0876104831695557, + "learning_rate": 9.744257810108718e-06, + "loss": 0.1581, + "step": 4396 + }, + { + "epoch": 0.11126856795809398, + "grad_norm": 5.234940528869629, + "learning_rate": 9.744131025101129e-06, + "loss": 0.1935, + "step": 4397 + }, + { + "epoch": 0.11129387352278766, + "grad_norm": 4.999234676361084, + "learning_rate": 9.74400420949952e-06, + "loss": 0.1383, + "step": 4398 + }, + { + "epoch": 0.11131917908748133, + "grad_norm": 4.61814546585083, + "learning_rate": 9.743877363304712e-06, + "loss": 0.161, + "step": 4399 + }, + { + "epoch": 0.11134448465217502, + "grad_norm": 4.608087539672852, + "learning_rate": 9.743750486517522e-06, + "loss": 0.1378, + "step": 4400 + }, + { + "epoch": 0.11136979021686869, + "grad_norm": 6.169746398925781, + "learning_rate": 9.743623579138766e-06, + "loss": 0.2154, + "step": 4401 + }, + { + "epoch": 0.11139509578156237, + "grad_norm": 7.368080139160156, + "learning_rate": 9.743496641169265e-06, + "loss": 0.3042, + "step": 4402 + }, + { + "epoch": 0.11142040134625604, + "grad_norm": 8.287271499633789, + "learning_rate": 9.743369672609836e-06, + "loss": 0.2263, + "step": 4403 + }, + { + "epoch": 0.11144570691094972, + "grad_norm": 3.6548402309417725, + "learning_rate": 9.743242673461299e-06, + "loss": 0.1388, + "step": 4404 + }, + { + "epoch": 0.1114710124756434, + "grad_norm": 17.406938552856445, + "learning_rate": 9.743115643724472e-06, + "loss": 0.2317, + "step": 4405 + }, + { + "epoch": 0.11149631804033706, + "grad_norm": 5.781492710113525, + "learning_rate": 9.742988583400175e-06, + "loss": 0.1785, + "step": 4406 + }, + { + "epoch": 0.11152162360503075, + "grad_norm": 8.182707786560059, + "learning_rate": 9.742861492489226e-06, + "loss": 0.2077, + "step": 4407 + }, + { + "epoch": 0.11154692916972442, + "grad_norm": 8.847641944885254, + "learning_rate": 9.742734370992445e-06, + "loss": 0.1981, + "step": 4408 + }, + { + "epoch": 0.1115722347344181, + "grad_norm": 6.469552040100098, + "learning_rate": 9.742607218910654e-06, + "loss": 0.2738, + "step": 4409 + }, + { + "epoch": 0.11159754029911177, + "grad_norm": 5.615752220153809, + "learning_rate": 9.742480036244671e-06, + "loss": 0.1959, + "step": 4410 + }, + { + "epoch": 0.11162284586380546, + "grad_norm": 5.639543533325195, + "learning_rate": 9.742352822995315e-06, + "loss": 0.1851, + "step": 4411 + }, + { + "epoch": 0.11164815142849913, + "grad_norm": 6.375070095062256, + "learning_rate": 9.742225579163407e-06, + "loss": 0.1908, + "step": 4412 + }, + { + "epoch": 0.1116734569931928, + "grad_norm": 11.64804744720459, + "learning_rate": 9.742098304749772e-06, + "loss": 0.2514, + "step": 4413 + }, + { + "epoch": 0.11169876255788648, + "grad_norm": 6.391574382781982, + "learning_rate": 9.741970999755224e-06, + "loss": 0.1449, + "step": 4414 + }, + { + "epoch": 0.11172406812258015, + "grad_norm": 7.159541606903076, + "learning_rate": 9.741843664180589e-06, + "loss": 0.2189, + "step": 4415 + }, + { + "epoch": 0.11174937368727383, + "grad_norm": 9.450810432434082, + "learning_rate": 9.741716298026685e-06, + "loss": 0.2933, + "step": 4416 + }, + { + "epoch": 0.1117746792519675, + "grad_norm": 4.406771183013916, + "learning_rate": 9.741588901294334e-06, + "loss": 0.1511, + "step": 4417 + }, + { + "epoch": 0.11179998481666119, + "grad_norm": 11.662858009338379, + "learning_rate": 9.74146147398436e-06, + "loss": 0.2061, + "step": 4418 + }, + { + "epoch": 0.11182529038135486, + "grad_norm": 13.348665237426758, + "learning_rate": 9.74133401609758e-06, + "loss": 0.3554, + "step": 4419 + }, + { + "epoch": 0.11185059594604854, + "grad_norm": 6.613575458526611, + "learning_rate": 9.741206527634823e-06, + "loss": 0.1715, + "step": 4420 + }, + { + "epoch": 0.11187590151074221, + "grad_norm": 7.9753217697143555, + "learning_rate": 9.741079008596905e-06, + "loss": 0.2157, + "step": 4421 + }, + { + "epoch": 0.11190120707543588, + "grad_norm": 8.567320823669434, + "learning_rate": 9.740951458984648e-06, + "loss": 0.3279, + "step": 4422 + }, + { + "epoch": 0.11192651264012957, + "grad_norm": 4.6434149742126465, + "learning_rate": 9.740823878798881e-06, + "loss": 0.217, + "step": 4423 + }, + { + "epoch": 0.11195181820482324, + "grad_norm": 7.991328239440918, + "learning_rate": 9.74069626804042e-06, + "loss": 0.2041, + "step": 4424 + }, + { + "epoch": 0.11197712376951692, + "grad_norm": 5.21807861328125, + "learning_rate": 9.740568626710091e-06, + "loss": 0.0588, + "step": 4425 + }, + { + "epoch": 0.11200242933421059, + "grad_norm": 4.090503692626953, + "learning_rate": 9.740440954808718e-06, + "loss": 0.1945, + "step": 4426 + }, + { + "epoch": 0.11202773489890427, + "grad_norm": 8.088930130004883, + "learning_rate": 9.740313252337122e-06, + "loss": 0.2056, + "step": 4427 + }, + { + "epoch": 0.11205304046359794, + "grad_norm": 9.88393497467041, + "learning_rate": 9.740185519296128e-06, + "loss": 0.2364, + "step": 4428 + }, + { + "epoch": 0.11207834602829161, + "grad_norm": 3.4917662143707275, + "learning_rate": 9.74005775568656e-06, + "loss": 0.1073, + "step": 4429 + }, + { + "epoch": 0.1121036515929853, + "grad_norm": 3.7313232421875, + "learning_rate": 9.73992996150924e-06, + "loss": 0.1297, + "step": 4430 + }, + { + "epoch": 0.11212895715767897, + "grad_norm": 9.62759017944336, + "learning_rate": 9.739802136764994e-06, + "loss": 0.3163, + "step": 4431 + }, + { + "epoch": 0.11215426272237265, + "grad_norm": 7.219558238983154, + "learning_rate": 9.739674281454645e-06, + "loss": 0.2544, + "step": 4432 + }, + { + "epoch": 0.11217956828706632, + "grad_norm": 4.795873641967773, + "learning_rate": 9.739546395579021e-06, + "loss": 0.238, + "step": 4433 + }, + { + "epoch": 0.11220487385176, + "grad_norm": 4.551512241363525, + "learning_rate": 9.73941847913894e-06, + "loss": 0.1445, + "step": 4434 + }, + { + "epoch": 0.11223017941645368, + "grad_norm": 6.115931034088135, + "learning_rate": 9.739290532135233e-06, + "loss": 0.2264, + "step": 4435 + }, + { + "epoch": 0.11225548498114736, + "grad_norm": 9.898385047912598, + "learning_rate": 9.739162554568723e-06, + "loss": 0.2105, + "step": 4436 + }, + { + "epoch": 0.11228079054584103, + "grad_norm": 6.541128635406494, + "learning_rate": 9.739034546440234e-06, + "loss": 0.177, + "step": 4437 + }, + { + "epoch": 0.1123060961105347, + "grad_norm": 5.663711071014404, + "learning_rate": 9.738906507750594e-06, + "loss": 0.2064, + "step": 4438 + }, + { + "epoch": 0.11233140167522838, + "grad_norm": 5.143519401550293, + "learning_rate": 9.738778438500628e-06, + "loss": 0.138, + "step": 4439 + }, + { + "epoch": 0.11235670723992205, + "grad_norm": 4.977030277252197, + "learning_rate": 9.738650338691157e-06, + "loss": 0.1522, + "step": 4440 + }, + { + "epoch": 0.11238201280461574, + "grad_norm": 9.068648338317871, + "learning_rate": 9.738522208323015e-06, + "loss": 0.2113, + "step": 4441 + }, + { + "epoch": 0.11240731836930941, + "grad_norm": 4.821164608001709, + "learning_rate": 9.738394047397026e-06, + "loss": 0.1376, + "step": 4442 + }, + { + "epoch": 0.11243262393400309, + "grad_norm": 16.451313018798828, + "learning_rate": 9.738265855914014e-06, + "loss": 0.3533, + "step": 4443 + }, + { + "epoch": 0.11245792949869676, + "grad_norm": 7.551180839538574, + "learning_rate": 9.738137633874806e-06, + "loss": 0.2729, + "step": 4444 + }, + { + "epoch": 0.11248323506339043, + "grad_norm": 6.4151482582092285, + "learning_rate": 9.73800938128023e-06, + "loss": 0.2668, + "step": 4445 + }, + { + "epoch": 0.11250854062808412, + "grad_norm": 12.406734466552734, + "learning_rate": 9.737881098131112e-06, + "loss": 0.4398, + "step": 4446 + }, + { + "epoch": 0.11253384619277779, + "grad_norm": 15.509040832519531, + "learning_rate": 9.737752784428282e-06, + "loss": 0.5198, + "step": 4447 + }, + { + "epoch": 0.11255915175747147, + "grad_norm": 3.409233331680298, + "learning_rate": 9.737624440172564e-06, + "loss": 0.1402, + "step": 4448 + }, + { + "epoch": 0.11258445732216514, + "grad_norm": 5.136625289916992, + "learning_rate": 9.737496065364789e-06, + "loss": 0.1162, + "step": 4449 + }, + { + "epoch": 0.11260976288685882, + "grad_norm": 4.615917205810547, + "learning_rate": 9.737367660005781e-06, + "loss": 0.2129, + "step": 4450 + }, + { + "epoch": 0.1126350684515525, + "grad_norm": 3.2970693111419678, + "learning_rate": 9.737239224096372e-06, + "loss": 0.1592, + "step": 4451 + }, + { + "epoch": 0.11266037401624618, + "grad_norm": 7.655593395233154, + "learning_rate": 9.737110757637387e-06, + "loss": 0.2787, + "step": 4452 + }, + { + "epoch": 0.11268567958093985, + "grad_norm": 6.128881454467773, + "learning_rate": 9.736982260629657e-06, + "loss": 0.1789, + "step": 4453 + }, + { + "epoch": 0.11271098514563352, + "grad_norm": 4.874983787536621, + "learning_rate": 9.73685373307401e-06, + "loss": 0.2103, + "step": 4454 + }, + { + "epoch": 0.1127362907103272, + "grad_norm": 12.513262748718262, + "learning_rate": 9.736725174971274e-06, + "loss": 0.314, + "step": 4455 + }, + { + "epoch": 0.11276159627502087, + "grad_norm": 4.219297885894775, + "learning_rate": 9.736596586322276e-06, + "loss": 0.2199, + "step": 4456 + }, + { + "epoch": 0.11278690183971456, + "grad_norm": 8.096887588500977, + "learning_rate": 9.73646796712785e-06, + "loss": 0.1464, + "step": 4457 + }, + { + "epoch": 0.11281220740440823, + "grad_norm": 4.005906105041504, + "learning_rate": 9.736339317388822e-06, + "loss": 0.1795, + "step": 4458 + }, + { + "epoch": 0.11283751296910191, + "grad_norm": 4.494263172149658, + "learning_rate": 9.736210637106025e-06, + "loss": 0.2056, + "step": 4459 + }, + { + "epoch": 0.11286281853379558, + "grad_norm": 19.66512107849121, + "learning_rate": 9.736081926280285e-06, + "loss": 0.2901, + "step": 4460 + }, + { + "epoch": 0.11288812409848925, + "grad_norm": 3.361786365509033, + "learning_rate": 9.735953184912435e-06, + "loss": 0.1997, + "step": 4461 + }, + { + "epoch": 0.11291342966318293, + "grad_norm": 10.589828491210938, + "learning_rate": 9.7358244130033e-06, + "loss": 0.2113, + "step": 4462 + }, + { + "epoch": 0.1129387352278766, + "grad_norm": 5.902021408081055, + "learning_rate": 9.735695610553719e-06, + "loss": 0.2563, + "step": 4463 + }, + { + "epoch": 0.11296404079257029, + "grad_norm": 6.5945281982421875, + "learning_rate": 9.735566777564515e-06, + "loss": 0.2121, + "step": 4464 + }, + { + "epoch": 0.11298934635726396, + "grad_norm": 13.243168830871582, + "learning_rate": 9.735437914036522e-06, + "loss": 0.2239, + "step": 4465 + }, + { + "epoch": 0.11301465192195764, + "grad_norm": 5.427128791809082, + "learning_rate": 9.735309019970573e-06, + "loss": 0.2236, + "step": 4466 + }, + { + "epoch": 0.11303995748665131, + "grad_norm": 3.0905957221984863, + "learning_rate": 9.735180095367495e-06, + "loss": 0.1831, + "step": 4467 + }, + { + "epoch": 0.113065263051345, + "grad_norm": 11.619546890258789, + "learning_rate": 9.73505114022812e-06, + "loss": 0.1522, + "step": 4468 + }, + { + "epoch": 0.11309056861603867, + "grad_norm": 9.793183326721191, + "learning_rate": 9.734922154553283e-06, + "loss": 0.2803, + "step": 4469 + }, + { + "epoch": 0.11311587418073234, + "grad_norm": 8.410017967224121, + "learning_rate": 9.734793138343814e-06, + "loss": 0.3019, + "step": 4470 + }, + { + "epoch": 0.11314117974542602, + "grad_norm": 3.5117416381835938, + "learning_rate": 9.734664091600545e-06, + "loss": 0.1028, + "step": 4471 + }, + { + "epoch": 0.11316648531011969, + "grad_norm": 3.5422072410583496, + "learning_rate": 9.734535014324307e-06, + "loss": 0.0591, + "step": 4472 + }, + { + "epoch": 0.11319179087481337, + "grad_norm": 4.737685203552246, + "learning_rate": 9.734405906515934e-06, + "loss": 0.1239, + "step": 4473 + }, + { + "epoch": 0.11321709643950705, + "grad_norm": 8.1100492477417, + "learning_rate": 9.734276768176257e-06, + "loss": 0.2616, + "step": 4474 + }, + { + "epoch": 0.11324240200420073, + "grad_norm": 8.32624626159668, + "learning_rate": 9.734147599306111e-06, + "loss": 0.3064, + "step": 4475 + }, + { + "epoch": 0.1132677075688944, + "grad_norm": 3.9249675273895264, + "learning_rate": 9.734018399906326e-06, + "loss": 0.1313, + "step": 4476 + }, + { + "epoch": 0.11329301313358807, + "grad_norm": 3.345865488052368, + "learning_rate": 9.733889169977739e-06, + "loss": 0.1431, + "step": 4477 + }, + { + "epoch": 0.11331831869828175, + "grad_norm": 7.244179725646973, + "learning_rate": 9.73375990952118e-06, + "loss": 0.1858, + "step": 4478 + }, + { + "epoch": 0.11334362426297542, + "grad_norm": 17.073198318481445, + "learning_rate": 9.733630618537483e-06, + "loss": 0.2574, + "step": 4479 + }, + { + "epoch": 0.11336892982766911, + "grad_norm": 6.2849040031433105, + "learning_rate": 9.733501297027485e-06, + "loss": 0.1078, + "step": 4480 + }, + { + "epoch": 0.11339423539236278, + "grad_norm": 3.619621515274048, + "learning_rate": 9.733371944992018e-06, + "loss": 0.1088, + "step": 4481 + }, + { + "epoch": 0.11341954095705646, + "grad_norm": 4.763803005218506, + "learning_rate": 9.733242562431913e-06, + "loss": 0.2316, + "step": 4482 + }, + { + "epoch": 0.11344484652175013, + "grad_norm": 4.136523246765137, + "learning_rate": 9.73311314934801e-06, + "loss": 0.1517, + "step": 4483 + }, + { + "epoch": 0.11347015208644382, + "grad_norm": 11.40373706817627, + "learning_rate": 9.732983705741138e-06, + "loss": 0.146, + "step": 4484 + }, + { + "epoch": 0.11349545765113749, + "grad_norm": 10.485091209411621, + "learning_rate": 9.732854231612137e-06, + "loss": 0.2006, + "step": 4485 + }, + { + "epoch": 0.11352076321583116, + "grad_norm": 8.269883155822754, + "learning_rate": 9.732724726961837e-06, + "loss": 0.1904, + "step": 4486 + }, + { + "epoch": 0.11354606878052484, + "grad_norm": 14.528176307678223, + "learning_rate": 9.732595191791077e-06, + "loss": 0.32, + "step": 4487 + }, + { + "epoch": 0.11357137434521851, + "grad_norm": 7.0732316970825195, + "learning_rate": 9.732465626100691e-06, + "loss": 0.2288, + "step": 4488 + }, + { + "epoch": 0.1135966799099122, + "grad_norm": 25.580360412597656, + "learning_rate": 9.732336029891513e-06, + "loss": 0.3956, + "step": 4489 + }, + { + "epoch": 0.11362198547460586, + "grad_norm": 9.648985862731934, + "learning_rate": 9.732206403164382e-06, + "loss": 0.2278, + "step": 4490 + }, + { + "epoch": 0.11364729103929955, + "grad_norm": 14.449674606323242, + "learning_rate": 9.732076745920131e-06, + "loss": 0.2831, + "step": 4491 + }, + { + "epoch": 0.11367259660399322, + "grad_norm": 5.487039089202881, + "learning_rate": 9.731947058159599e-06, + "loss": 0.1824, + "step": 4492 + }, + { + "epoch": 0.11369790216868689, + "grad_norm": 5.654101848602295, + "learning_rate": 9.731817339883619e-06, + "loss": 0.2593, + "step": 4493 + }, + { + "epoch": 0.11372320773338057, + "grad_norm": 4.317915916442871, + "learning_rate": 9.73168759109303e-06, + "loss": 0.2271, + "step": 4494 + }, + { + "epoch": 0.11374851329807424, + "grad_norm": 4.855752944946289, + "learning_rate": 9.731557811788668e-06, + "loss": 0.2107, + "step": 4495 + }, + { + "epoch": 0.11377381886276793, + "grad_norm": 4.770094871520996, + "learning_rate": 9.731428001971369e-06, + "loss": 0.18, + "step": 4496 + }, + { + "epoch": 0.1137991244274616, + "grad_norm": 4.812759876251221, + "learning_rate": 9.731298161641971e-06, + "loss": 0.1903, + "step": 4497 + }, + { + "epoch": 0.11382442999215528, + "grad_norm": 3.9005215167999268, + "learning_rate": 9.731168290801312e-06, + "loss": 0.17, + "step": 4498 + }, + { + "epoch": 0.11384973555684895, + "grad_norm": 4.721146106719971, + "learning_rate": 9.731038389450228e-06, + "loss": 0.1974, + "step": 4499 + }, + { + "epoch": 0.11387504112154263, + "grad_norm": 8.542325019836426, + "learning_rate": 9.730908457589559e-06, + "loss": 0.2396, + "step": 4500 + }, + { + "epoch": 0.1139003466862363, + "grad_norm": 6.072717666625977, + "learning_rate": 9.730778495220138e-06, + "loss": 0.2545, + "step": 4501 + }, + { + "epoch": 0.11392565225092997, + "grad_norm": 9.620133399963379, + "learning_rate": 9.730648502342809e-06, + "loss": 0.2382, + "step": 4502 + }, + { + "epoch": 0.11395095781562366, + "grad_norm": 11.354416847229004, + "learning_rate": 9.730518478958409e-06, + "loss": 0.2005, + "step": 4503 + }, + { + "epoch": 0.11397626338031733, + "grad_norm": 8.171156883239746, + "learning_rate": 9.730388425067774e-06, + "loss": 0.1748, + "step": 4504 + }, + { + "epoch": 0.11400156894501101, + "grad_norm": 11.123076438903809, + "learning_rate": 9.730258340671743e-06, + "loss": 0.2431, + "step": 4505 + }, + { + "epoch": 0.11402687450970468, + "grad_norm": 6.027850151062012, + "learning_rate": 9.730128225771155e-06, + "loss": 0.1759, + "step": 4506 + }, + { + "epoch": 0.11405218007439837, + "grad_norm": 5.880782127380371, + "learning_rate": 9.729998080366851e-06, + "loss": 0.1934, + "step": 4507 + }, + { + "epoch": 0.11407748563909204, + "grad_norm": 7.861382007598877, + "learning_rate": 9.729867904459671e-06, + "loss": 0.2528, + "step": 4508 + }, + { + "epoch": 0.1141027912037857, + "grad_norm": 7.372996807098389, + "learning_rate": 9.72973769805045e-06, + "loss": 0.1678, + "step": 4509 + }, + { + "epoch": 0.11412809676847939, + "grad_norm": 7.346508502960205, + "learning_rate": 9.729607461140034e-06, + "loss": 0.2332, + "step": 4510 + }, + { + "epoch": 0.11415340233317306, + "grad_norm": 15.825531959533691, + "learning_rate": 9.729477193729255e-06, + "loss": 0.2782, + "step": 4511 + }, + { + "epoch": 0.11417870789786674, + "grad_norm": 5.505288124084473, + "learning_rate": 9.729346895818961e-06, + "loss": 0.1419, + "step": 4512 + }, + { + "epoch": 0.11420401346256041, + "grad_norm": 4.7987565994262695, + "learning_rate": 9.729216567409987e-06, + "loss": 0.2023, + "step": 4513 + }, + { + "epoch": 0.1142293190272541, + "grad_norm": 10.433055877685547, + "learning_rate": 9.729086208503174e-06, + "loss": 0.3095, + "step": 4514 + }, + { + "epoch": 0.11425462459194777, + "grad_norm": 6.074895858764648, + "learning_rate": 9.728955819099366e-06, + "loss": 0.2388, + "step": 4515 + }, + { + "epoch": 0.11427993015664145, + "grad_norm": 5.132654190063477, + "learning_rate": 9.7288253991994e-06, + "loss": 0.2284, + "step": 4516 + }, + { + "epoch": 0.11430523572133512, + "grad_norm": 6.558721542358398, + "learning_rate": 9.728694948804118e-06, + "loss": 0.2061, + "step": 4517 + }, + { + "epoch": 0.11433054128602879, + "grad_norm": 5.617769241333008, + "learning_rate": 9.728564467914363e-06, + "loss": 0.167, + "step": 4518 + }, + { + "epoch": 0.11435584685072248, + "grad_norm": 5.696763515472412, + "learning_rate": 9.728433956530975e-06, + "loss": 0.2071, + "step": 4519 + }, + { + "epoch": 0.11438115241541615, + "grad_norm": 3.9076778888702393, + "learning_rate": 9.728303414654796e-06, + "loss": 0.2237, + "step": 4520 + }, + { + "epoch": 0.11440645798010983, + "grad_norm": 6.080982685089111, + "learning_rate": 9.728172842286667e-06, + "loss": 0.2366, + "step": 4521 + }, + { + "epoch": 0.1144317635448035, + "grad_norm": 5.171962738037109, + "learning_rate": 9.728042239427433e-06, + "loss": 0.167, + "step": 4522 + }, + { + "epoch": 0.11445706910949718, + "grad_norm": 7.492321491241455, + "learning_rate": 9.727911606077933e-06, + "loss": 0.1921, + "step": 4523 + }, + { + "epoch": 0.11448237467419085, + "grad_norm": 13.29776382446289, + "learning_rate": 9.72778094223901e-06, + "loss": 0.146, + "step": 4524 + }, + { + "epoch": 0.11450768023888452, + "grad_norm": 12.231032371520996, + "learning_rate": 9.727650247911507e-06, + "loss": 0.4681, + "step": 4525 + }, + { + "epoch": 0.11453298580357821, + "grad_norm": 4.3317155838012695, + "learning_rate": 9.727519523096267e-06, + "loss": 0.195, + "step": 4526 + }, + { + "epoch": 0.11455829136827188, + "grad_norm": 6.583286285400391, + "learning_rate": 9.727388767794135e-06, + "loss": 0.2478, + "step": 4527 + }, + { + "epoch": 0.11458359693296556, + "grad_norm": 6.151121616363525, + "learning_rate": 9.72725798200595e-06, + "loss": 0.201, + "step": 4528 + }, + { + "epoch": 0.11460890249765923, + "grad_norm": 3.6846237182617188, + "learning_rate": 9.727127165732558e-06, + "loss": 0.1784, + "step": 4529 + }, + { + "epoch": 0.11463420806235292, + "grad_norm": 9.121479034423828, + "learning_rate": 9.726996318974803e-06, + "loss": 0.3295, + "step": 4530 + }, + { + "epoch": 0.11465951362704659, + "grad_norm": 13.977977752685547, + "learning_rate": 9.726865441733527e-06, + "loss": 0.2267, + "step": 4531 + }, + { + "epoch": 0.11468481919174027, + "grad_norm": 11.400691032409668, + "learning_rate": 9.726734534009576e-06, + "loss": 0.2479, + "step": 4532 + }, + { + "epoch": 0.11471012475643394, + "grad_norm": 5.914066791534424, + "learning_rate": 9.726603595803793e-06, + "loss": 0.1792, + "step": 4533 + }, + { + "epoch": 0.11473543032112761, + "grad_norm": 7.45969820022583, + "learning_rate": 9.726472627117022e-06, + "loss": 0.2276, + "step": 4534 + }, + { + "epoch": 0.1147607358858213, + "grad_norm": 7.880381107330322, + "learning_rate": 9.726341627950109e-06, + "loss": 0.2282, + "step": 4535 + }, + { + "epoch": 0.11478604145051496, + "grad_norm": 6.7556962966918945, + "learning_rate": 9.726210598303897e-06, + "loss": 0.276, + "step": 4536 + }, + { + "epoch": 0.11481134701520865, + "grad_norm": 8.612677574157715, + "learning_rate": 9.726079538179233e-06, + "loss": 0.2431, + "step": 4537 + }, + { + "epoch": 0.11483665257990232, + "grad_norm": 8.9130277633667, + "learning_rate": 9.72594844757696e-06, + "loss": 0.3134, + "step": 4538 + }, + { + "epoch": 0.114861958144596, + "grad_norm": 6.0177178382873535, + "learning_rate": 9.725817326497925e-06, + "loss": 0.1542, + "step": 4539 + }, + { + "epoch": 0.11488726370928967, + "grad_norm": 8.399487495422363, + "learning_rate": 9.725686174942974e-06, + "loss": 0.2742, + "step": 4540 + }, + { + "epoch": 0.11491256927398334, + "grad_norm": 8.390628814697266, + "learning_rate": 9.72555499291295e-06, + "loss": 0.2412, + "step": 4541 + }, + { + "epoch": 0.11493787483867703, + "grad_norm": 9.41006851196289, + "learning_rate": 9.725423780408703e-06, + "loss": 0.2475, + "step": 4542 + }, + { + "epoch": 0.1149631804033707, + "grad_norm": 12.411730766296387, + "learning_rate": 9.725292537431076e-06, + "loss": 0.3161, + "step": 4543 + }, + { + "epoch": 0.11498848596806438, + "grad_norm": 9.022380828857422, + "learning_rate": 9.725161263980918e-06, + "loss": 0.2463, + "step": 4544 + }, + { + "epoch": 0.11501379153275805, + "grad_norm": 7.137384414672852, + "learning_rate": 9.725029960059072e-06, + "loss": 0.1359, + "step": 4545 + }, + { + "epoch": 0.11503909709745173, + "grad_norm": 3.845851182937622, + "learning_rate": 9.724898625666387e-06, + "loss": 0.1401, + "step": 4546 + }, + { + "epoch": 0.1150644026621454, + "grad_norm": 7.179487705230713, + "learning_rate": 9.724767260803711e-06, + "loss": 0.2012, + "step": 4547 + }, + { + "epoch": 0.11508970822683909, + "grad_norm": 7.881570816040039, + "learning_rate": 9.724635865471888e-06, + "loss": 0.264, + "step": 4548 + }, + { + "epoch": 0.11511501379153276, + "grad_norm": 8.965250968933105, + "learning_rate": 9.72450443967177e-06, + "loss": 0.236, + "step": 4549 + }, + { + "epoch": 0.11514031935622643, + "grad_norm": 10.491226196289062, + "learning_rate": 9.724372983404198e-06, + "loss": 0.3787, + "step": 4550 + }, + { + "epoch": 0.11516562492092011, + "grad_norm": 6.255393028259277, + "learning_rate": 9.724241496670025e-06, + "loss": 0.2397, + "step": 4551 + }, + { + "epoch": 0.11519093048561378, + "grad_norm": 3.69559383392334, + "learning_rate": 9.724109979470098e-06, + "loss": 0.1929, + "step": 4552 + }, + { + "epoch": 0.11521623605030747, + "grad_norm": 5.931342601776123, + "learning_rate": 9.723978431805264e-06, + "loss": 0.244, + "step": 4553 + }, + { + "epoch": 0.11524154161500114, + "grad_norm": 6.879518032073975, + "learning_rate": 9.723846853676371e-06, + "loss": 0.3196, + "step": 4554 + }, + { + "epoch": 0.11526684717969482, + "grad_norm": 9.193706512451172, + "learning_rate": 9.72371524508427e-06, + "loss": 0.1684, + "step": 4555 + }, + { + "epoch": 0.11529215274438849, + "grad_norm": 12.423624992370605, + "learning_rate": 9.723583606029807e-06, + "loss": 0.3168, + "step": 4556 + }, + { + "epoch": 0.11531745830908216, + "grad_norm": 6.193516731262207, + "learning_rate": 9.723451936513832e-06, + "loss": 0.2602, + "step": 4557 + }, + { + "epoch": 0.11534276387377584, + "grad_norm": 10.913820266723633, + "learning_rate": 9.723320236537193e-06, + "loss": 0.266, + "step": 4558 + }, + { + "epoch": 0.11536806943846951, + "grad_norm": 12.551465034484863, + "learning_rate": 9.723188506100742e-06, + "loss": 0.2535, + "step": 4559 + }, + { + "epoch": 0.1153933750031632, + "grad_norm": 8.030728340148926, + "learning_rate": 9.723056745205328e-06, + "loss": 0.2401, + "step": 4560 + }, + { + "epoch": 0.11541868056785687, + "grad_norm": 9.931229591369629, + "learning_rate": 9.722924953851798e-06, + "loss": 0.2725, + "step": 4561 + }, + { + "epoch": 0.11544398613255055, + "grad_norm": 6.468829154968262, + "learning_rate": 9.722793132041004e-06, + "loss": 0.191, + "step": 4562 + }, + { + "epoch": 0.11546929169724422, + "grad_norm": 5.896657466888428, + "learning_rate": 9.722661279773794e-06, + "loss": 0.2629, + "step": 4563 + }, + { + "epoch": 0.1154945972619379, + "grad_norm": 19.48906707763672, + "learning_rate": 9.722529397051022e-06, + "loss": 0.2908, + "step": 4564 + }, + { + "epoch": 0.11551990282663158, + "grad_norm": 5.574892044067383, + "learning_rate": 9.722397483873535e-06, + "loss": 0.1579, + "step": 4565 + }, + { + "epoch": 0.11554520839132525, + "grad_norm": 4.399411201477051, + "learning_rate": 9.722265540242185e-06, + "loss": 0.2177, + "step": 4566 + }, + { + "epoch": 0.11557051395601893, + "grad_norm": 8.983981132507324, + "learning_rate": 9.722133566157823e-06, + "loss": 0.2252, + "step": 4567 + }, + { + "epoch": 0.1155958195207126, + "grad_norm": 5.198063850402832, + "learning_rate": 9.722001561621302e-06, + "loss": 0.1471, + "step": 4568 + }, + { + "epoch": 0.11562112508540628, + "grad_norm": 5.758574485778809, + "learning_rate": 9.72186952663347e-06, + "loss": 0.2566, + "step": 4569 + }, + { + "epoch": 0.11564643065009995, + "grad_norm": 11.063091278076172, + "learning_rate": 9.721737461195179e-06, + "loss": 0.1866, + "step": 4570 + }, + { + "epoch": 0.11567173621479364, + "grad_norm": 5.975983142852783, + "learning_rate": 9.721605365307282e-06, + "loss": 0.2808, + "step": 4571 + }, + { + "epoch": 0.11569704177948731, + "grad_norm": 7.435004711151123, + "learning_rate": 9.72147323897063e-06, + "loss": 0.2347, + "step": 4572 + }, + { + "epoch": 0.11572234734418098, + "grad_norm": 8.783069610595703, + "learning_rate": 9.721341082186076e-06, + "loss": 0.2379, + "step": 4573 + }, + { + "epoch": 0.11574765290887466, + "grad_norm": 11.141565322875977, + "learning_rate": 9.721208894954471e-06, + "loss": 0.4045, + "step": 4574 + }, + { + "epoch": 0.11577295847356833, + "grad_norm": 4.443477153778076, + "learning_rate": 9.721076677276669e-06, + "loss": 0.1673, + "step": 4575 + }, + { + "epoch": 0.11579826403826202, + "grad_norm": 4.51447868347168, + "learning_rate": 9.720944429153521e-06, + "loss": 0.195, + "step": 4576 + }, + { + "epoch": 0.11582356960295569, + "grad_norm": 3.414604425430298, + "learning_rate": 9.72081215058588e-06, + "loss": 0.2077, + "step": 4577 + }, + { + "epoch": 0.11584887516764937, + "grad_norm": 8.932453155517578, + "learning_rate": 9.7206798415746e-06, + "loss": 0.193, + "step": 4578 + }, + { + "epoch": 0.11587418073234304, + "grad_norm": 23.068161010742188, + "learning_rate": 9.720547502120535e-06, + "loss": 0.241, + "step": 4579 + }, + { + "epoch": 0.11589948629703672, + "grad_norm": 5.1525959968566895, + "learning_rate": 9.720415132224534e-06, + "loss": 0.2185, + "step": 4580 + }, + { + "epoch": 0.1159247918617304, + "grad_norm": 4.206142902374268, + "learning_rate": 9.720282731887458e-06, + "loss": 0.1883, + "step": 4581 + }, + { + "epoch": 0.11595009742642406, + "grad_norm": 4.674678325653076, + "learning_rate": 9.720150301110154e-06, + "loss": 0.1914, + "step": 4582 + }, + { + "epoch": 0.11597540299111775, + "grad_norm": 10.871870040893555, + "learning_rate": 9.72001783989348e-06, + "loss": 0.2673, + "step": 4583 + }, + { + "epoch": 0.11600070855581142, + "grad_norm": 4.641600608825684, + "learning_rate": 9.719885348238288e-06, + "loss": 0.195, + "step": 4584 + }, + { + "epoch": 0.1160260141205051, + "grad_norm": 5.26608943939209, + "learning_rate": 9.719752826145433e-06, + "loss": 0.1984, + "step": 4585 + }, + { + "epoch": 0.11605131968519877, + "grad_norm": 6.335422992706299, + "learning_rate": 9.719620273615771e-06, + "loss": 0.249, + "step": 4586 + }, + { + "epoch": 0.11607662524989246, + "grad_norm": 6.9805521965026855, + "learning_rate": 9.719487690650155e-06, + "loss": 0.2185, + "step": 4587 + }, + { + "epoch": 0.11610193081458613, + "grad_norm": 4.714877128601074, + "learning_rate": 9.719355077249441e-06, + "loss": 0.1381, + "step": 4588 + }, + { + "epoch": 0.1161272363792798, + "grad_norm": 6.176710605621338, + "learning_rate": 9.719222433414484e-06, + "loss": 0.1999, + "step": 4589 + }, + { + "epoch": 0.11615254194397348, + "grad_norm": 4.089155197143555, + "learning_rate": 9.719089759146142e-06, + "loss": 0.1626, + "step": 4590 + }, + { + "epoch": 0.11617784750866715, + "grad_norm": 9.308416366577148, + "learning_rate": 9.718957054445264e-06, + "loss": 0.1942, + "step": 4591 + }, + { + "epoch": 0.11620315307336083, + "grad_norm": 6.4002909660339355, + "learning_rate": 9.718824319312712e-06, + "loss": 0.2522, + "step": 4592 + }, + { + "epoch": 0.1162284586380545, + "grad_norm": 7.264581203460693, + "learning_rate": 9.718691553749339e-06, + "loss": 0.2273, + "step": 4593 + }, + { + "epoch": 0.11625376420274819, + "grad_norm": 6.265960216522217, + "learning_rate": 9.718558757756002e-06, + "loss": 0.2048, + "step": 4594 + }, + { + "epoch": 0.11627906976744186, + "grad_norm": 3.343672752380371, + "learning_rate": 9.718425931333557e-06, + "loss": 0.1329, + "step": 4595 + }, + { + "epoch": 0.11630437533213554, + "grad_norm": 6.506674289703369, + "learning_rate": 9.71829307448286e-06, + "loss": 0.2422, + "step": 4596 + }, + { + "epoch": 0.11632968089682921, + "grad_norm": 11.28785228729248, + "learning_rate": 9.718160187204771e-06, + "loss": 0.1685, + "step": 4597 + }, + { + "epoch": 0.11635498646152288, + "grad_norm": 8.501084327697754, + "learning_rate": 9.718027269500144e-06, + "loss": 0.196, + "step": 4598 + }, + { + "epoch": 0.11638029202621657, + "grad_norm": 7.804808616638184, + "learning_rate": 9.717894321369836e-06, + "loss": 0.1427, + "step": 4599 + }, + { + "epoch": 0.11640559759091024, + "grad_norm": 5.048664569854736, + "learning_rate": 9.717761342814705e-06, + "loss": 0.1749, + "step": 4600 + }, + { + "epoch": 0.11643090315560392, + "grad_norm": 8.543719291687012, + "learning_rate": 9.717628333835608e-06, + "loss": 0.1957, + "step": 4601 + }, + { + "epoch": 0.11645620872029759, + "grad_norm": 11.078810691833496, + "learning_rate": 9.717495294433404e-06, + "loss": 0.1866, + "step": 4602 + }, + { + "epoch": 0.11648151428499127, + "grad_norm": 4.99090051651001, + "learning_rate": 9.71736222460895e-06, + "loss": 0.0915, + "step": 4603 + }, + { + "epoch": 0.11650681984968494, + "grad_norm": 8.068154335021973, + "learning_rate": 9.717229124363105e-06, + "loss": 0.2132, + "step": 4604 + }, + { + "epoch": 0.11653212541437862, + "grad_norm": 5.963909149169922, + "learning_rate": 9.717095993696728e-06, + "loss": 0.1802, + "step": 4605 + }, + { + "epoch": 0.1165574309790723, + "grad_norm": 5.023667812347412, + "learning_rate": 9.716962832610676e-06, + "loss": 0.2017, + "step": 4606 + }, + { + "epoch": 0.11658273654376597, + "grad_norm": 14.495763778686523, + "learning_rate": 9.716829641105806e-06, + "loss": 0.2603, + "step": 4607 + }, + { + "epoch": 0.11660804210845965, + "grad_norm": 13.725918769836426, + "learning_rate": 9.716696419182982e-06, + "loss": 0.1106, + "step": 4608 + }, + { + "epoch": 0.11663334767315332, + "grad_norm": 14.082701683044434, + "learning_rate": 9.716563166843058e-06, + "loss": 0.349, + "step": 4609 + }, + { + "epoch": 0.11665865323784701, + "grad_norm": 19.51284408569336, + "learning_rate": 9.716429884086898e-06, + "loss": 0.2207, + "step": 4610 + }, + { + "epoch": 0.11668395880254068, + "grad_norm": 3.85840106010437, + "learning_rate": 9.716296570915356e-06, + "loss": 0.1219, + "step": 4611 + }, + { + "epoch": 0.11670926436723436, + "grad_norm": 4.764227867126465, + "learning_rate": 9.716163227329297e-06, + "loss": 0.1215, + "step": 4612 + }, + { + "epoch": 0.11673456993192803, + "grad_norm": 10.4314546585083, + "learning_rate": 9.716029853329576e-06, + "loss": 0.2694, + "step": 4613 + }, + { + "epoch": 0.1167598754966217, + "grad_norm": 9.034160614013672, + "learning_rate": 9.715896448917057e-06, + "loss": 0.2427, + "step": 4614 + }, + { + "epoch": 0.11678518106131539, + "grad_norm": 10.73405647277832, + "learning_rate": 9.7157630140926e-06, + "loss": 0.2652, + "step": 4615 + }, + { + "epoch": 0.11681048662600906, + "grad_norm": 7.501205921173096, + "learning_rate": 9.715629548857063e-06, + "loss": 0.2837, + "step": 4616 + }, + { + "epoch": 0.11683579219070274, + "grad_norm": 7.314960956573486, + "learning_rate": 9.71549605321131e-06, + "loss": 0.2075, + "step": 4617 + }, + { + "epoch": 0.11686109775539641, + "grad_norm": 4.5704755783081055, + "learning_rate": 9.715362527156199e-06, + "loss": 0.2034, + "step": 4618 + }, + { + "epoch": 0.1168864033200901, + "grad_norm": 4.445754528045654, + "learning_rate": 9.715228970692593e-06, + "loss": 0.221, + "step": 4619 + }, + { + "epoch": 0.11691170888478376, + "grad_norm": 4.459304332733154, + "learning_rate": 9.715095383821351e-06, + "loss": 0.1693, + "step": 4620 + }, + { + "epoch": 0.11693701444947743, + "grad_norm": 6.584020614624023, + "learning_rate": 9.714961766543337e-06, + "loss": 0.1932, + "step": 4621 + }, + { + "epoch": 0.11696232001417112, + "grad_norm": 5.784090995788574, + "learning_rate": 9.714828118859413e-06, + "loss": 0.2867, + "step": 4622 + }, + { + "epoch": 0.11698762557886479, + "grad_norm": 14.416169166564941, + "learning_rate": 9.714694440770435e-06, + "loss": 0.1087, + "step": 4623 + }, + { + "epoch": 0.11701293114355847, + "grad_norm": 9.35970401763916, + "learning_rate": 9.714560732277273e-06, + "loss": 0.2298, + "step": 4624 + }, + { + "epoch": 0.11703823670825214, + "grad_norm": 7.7327880859375, + "learning_rate": 9.714426993380785e-06, + "loss": 0.2148, + "step": 4625 + }, + { + "epoch": 0.11706354227294583, + "grad_norm": 9.109554290771484, + "learning_rate": 9.714293224081835e-06, + "loss": 0.2624, + "step": 4626 + }, + { + "epoch": 0.1170888478376395, + "grad_norm": 5.02010440826416, + "learning_rate": 9.714159424381285e-06, + "loss": 0.2256, + "step": 4627 + }, + { + "epoch": 0.11711415340233318, + "grad_norm": 8.478276252746582, + "learning_rate": 9.714025594279997e-06, + "loss": 0.2321, + "step": 4628 + }, + { + "epoch": 0.11713945896702685, + "grad_norm": 4.5183796882629395, + "learning_rate": 9.713891733778836e-06, + "loss": 0.1676, + "step": 4629 + }, + { + "epoch": 0.11716476453172052, + "grad_norm": 6.792411804199219, + "learning_rate": 9.713757842878661e-06, + "loss": 0.1777, + "step": 4630 + }, + { + "epoch": 0.1171900700964142, + "grad_norm": 7.011117935180664, + "learning_rate": 9.713623921580342e-06, + "loss": 0.2197, + "step": 4631 + }, + { + "epoch": 0.11721537566110787, + "grad_norm": 4.986802577972412, + "learning_rate": 9.713489969884737e-06, + "loss": 0.2403, + "step": 4632 + }, + { + "epoch": 0.11724068122580156, + "grad_norm": 7.3548760414123535, + "learning_rate": 9.713355987792714e-06, + "loss": 0.1858, + "step": 4633 + }, + { + "epoch": 0.11726598679049523, + "grad_norm": 5.246963977813721, + "learning_rate": 9.713221975305135e-06, + "loss": 0.1622, + "step": 4634 + }, + { + "epoch": 0.11729129235518891, + "grad_norm": 3.546792507171631, + "learning_rate": 9.713087932422862e-06, + "loss": 0.1992, + "step": 4635 + }, + { + "epoch": 0.11731659791988258, + "grad_norm": 5.1275410652160645, + "learning_rate": 9.712953859146762e-06, + "loss": 0.1746, + "step": 4636 + }, + { + "epoch": 0.11734190348457625, + "grad_norm": 4.291863441467285, + "learning_rate": 9.7128197554777e-06, + "loss": 0.2294, + "step": 4637 + }, + { + "epoch": 0.11736720904926994, + "grad_norm": 4.112378120422363, + "learning_rate": 9.712685621416538e-06, + "loss": 0.2209, + "step": 4638 + }, + { + "epoch": 0.1173925146139636, + "grad_norm": 6.0046515464782715, + "learning_rate": 9.712551456964145e-06, + "loss": 0.1713, + "step": 4639 + }, + { + "epoch": 0.11741782017865729, + "grad_norm": 5.6324639320373535, + "learning_rate": 9.712417262121385e-06, + "loss": 0.1869, + "step": 4640 + }, + { + "epoch": 0.11744312574335096, + "grad_norm": 3.355693817138672, + "learning_rate": 9.71228303688912e-06, + "loss": 0.1065, + "step": 4641 + }, + { + "epoch": 0.11746843130804464, + "grad_norm": 5.5180983543396, + "learning_rate": 9.712148781268221e-06, + "loss": 0.2171, + "step": 4642 + }, + { + "epoch": 0.11749373687273831, + "grad_norm": 8.272201538085938, + "learning_rate": 9.71201449525955e-06, + "loss": 0.2614, + "step": 4643 + }, + { + "epoch": 0.117519042437432, + "grad_norm": 5.646500110626221, + "learning_rate": 9.711880178863974e-06, + "loss": 0.1904, + "step": 4644 + }, + { + "epoch": 0.11754434800212567, + "grad_norm": 5.971129417419434, + "learning_rate": 9.71174583208236e-06, + "loss": 0.2105, + "step": 4645 + }, + { + "epoch": 0.11756965356681934, + "grad_norm": 6.535452365875244, + "learning_rate": 9.711611454915572e-06, + "loss": 0.2156, + "step": 4646 + }, + { + "epoch": 0.11759495913151302, + "grad_norm": 5.651861190795898, + "learning_rate": 9.711477047364479e-06, + "loss": 0.2151, + "step": 4647 + }, + { + "epoch": 0.11762026469620669, + "grad_norm": 9.1570405960083, + "learning_rate": 9.711342609429948e-06, + "loss": 0.3329, + "step": 4648 + }, + { + "epoch": 0.11764557026090038, + "grad_norm": 5.728847503662109, + "learning_rate": 9.711208141112843e-06, + "loss": 0.152, + "step": 4649 + }, + { + "epoch": 0.11767087582559405, + "grad_norm": 8.172118186950684, + "learning_rate": 9.711073642414035e-06, + "loss": 0.2395, + "step": 4650 + }, + { + "epoch": 0.11769618139028773, + "grad_norm": 9.78782844543457, + "learning_rate": 9.710939113334388e-06, + "loss": 0.1763, + "step": 4651 + }, + { + "epoch": 0.1177214869549814, + "grad_norm": 7.44243049621582, + "learning_rate": 9.710804553874771e-06, + "loss": 0.2317, + "step": 4652 + }, + { + "epoch": 0.11774679251967507, + "grad_norm": 3.3409910202026367, + "learning_rate": 9.710669964036051e-06, + "loss": 0.1877, + "step": 4653 + }, + { + "epoch": 0.11777209808436875, + "grad_norm": 6.7825775146484375, + "learning_rate": 9.710535343819098e-06, + "loss": 0.2959, + "step": 4654 + }, + { + "epoch": 0.11779740364906242, + "grad_norm": 3.678410530090332, + "learning_rate": 9.710400693224779e-06, + "loss": 0.2436, + "step": 4655 + }, + { + "epoch": 0.11782270921375611, + "grad_norm": 10.23158073425293, + "learning_rate": 9.71026601225396e-06, + "loss": 0.3082, + "step": 4656 + }, + { + "epoch": 0.11784801477844978, + "grad_norm": 11.361495018005371, + "learning_rate": 9.710131300907515e-06, + "loss": 0.2335, + "step": 4657 + }, + { + "epoch": 0.11787332034314346, + "grad_norm": 9.447162628173828, + "learning_rate": 9.709996559186306e-06, + "loss": 0.3197, + "step": 4658 + }, + { + "epoch": 0.11789862590783713, + "grad_norm": 3.872178077697754, + "learning_rate": 9.709861787091207e-06, + "loss": 0.1191, + "step": 4659 + }, + { + "epoch": 0.11792393147253082, + "grad_norm": 8.037789344787598, + "learning_rate": 9.709726984623086e-06, + "loss": 0.2227, + "step": 4660 + }, + { + "epoch": 0.11794923703722449, + "grad_norm": 7.021259784698486, + "learning_rate": 9.709592151782811e-06, + "loss": 0.1417, + "step": 4661 + }, + { + "epoch": 0.11797454260191816, + "grad_norm": 16.025894165039062, + "learning_rate": 9.709457288571252e-06, + "loss": 0.3484, + "step": 4662 + }, + { + "epoch": 0.11799984816661184, + "grad_norm": 6.920129776000977, + "learning_rate": 9.70932239498928e-06, + "loss": 0.2013, + "step": 4663 + }, + { + "epoch": 0.11802515373130551, + "grad_norm": 9.498571395874023, + "learning_rate": 9.709187471037763e-06, + "loss": 0.1699, + "step": 4664 + }, + { + "epoch": 0.1180504592959992, + "grad_norm": 6.341936111450195, + "learning_rate": 9.709052516717572e-06, + "loss": 0.1983, + "step": 4665 + }, + { + "epoch": 0.11807576486069286, + "grad_norm": 4.384210586547852, + "learning_rate": 9.708917532029578e-06, + "loss": 0.1832, + "step": 4666 + }, + { + "epoch": 0.11810107042538655, + "grad_norm": 5.586032867431641, + "learning_rate": 9.708782516974648e-06, + "loss": 0.1731, + "step": 4667 + }, + { + "epoch": 0.11812637599008022, + "grad_norm": 8.196126937866211, + "learning_rate": 9.708647471553658e-06, + "loss": 0.306, + "step": 4668 + }, + { + "epoch": 0.11815168155477389, + "grad_norm": 12.127511978149414, + "learning_rate": 9.708512395767478e-06, + "loss": 0.2073, + "step": 4669 + }, + { + "epoch": 0.11817698711946757, + "grad_norm": 17.61423110961914, + "learning_rate": 9.708377289616976e-06, + "loss": 0.2032, + "step": 4670 + }, + { + "epoch": 0.11820229268416124, + "grad_norm": 8.616368293762207, + "learning_rate": 9.708242153103022e-06, + "loss": 0.2894, + "step": 4671 + }, + { + "epoch": 0.11822759824885493, + "grad_norm": 7.476250648498535, + "learning_rate": 9.708106986226492e-06, + "loss": 0.2507, + "step": 4672 + }, + { + "epoch": 0.1182529038135486, + "grad_norm": 21.118928909301758, + "learning_rate": 9.707971788988256e-06, + "loss": 0.3224, + "step": 4673 + }, + { + "epoch": 0.11827820937824228, + "grad_norm": 15.58924674987793, + "learning_rate": 9.707836561389186e-06, + "loss": 0.5234, + "step": 4674 + }, + { + "epoch": 0.11830351494293595, + "grad_norm": 8.527460098266602, + "learning_rate": 9.707701303430154e-06, + "loss": 0.2586, + "step": 4675 + }, + { + "epoch": 0.11832882050762963, + "grad_norm": 8.388026237487793, + "learning_rate": 9.70756601511203e-06, + "loss": 0.2525, + "step": 4676 + }, + { + "epoch": 0.1183541260723233, + "grad_norm": 3.964637517929077, + "learning_rate": 9.707430696435692e-06, + "loss": 0.1254, + "step": 4677 + }, + { + "epoch": 0.11837943163701697, + "grad_norm": 7.342624187469482, + "learning_rate": 9.707295347402006e-06, + "loss": 0.1339, + "step": 4678 + }, + { + "epoch": 0.11840473720171066, + "grad_norm": 12.725502967834473, + "learning_rate": 9.70715996801185e-06, + "loss": 0.3544, + "step": 4679 + }, + { + "epoch": 0.11843004276640433, + "grad_norm": 4.322381973266602, + "learning_rate": 9.707024558266093e-06, + "loss": 0.1983, + "step": 4680 + }, + { + "epoch": 0.11845534833109801, + "grad_norm": 9.936487197875977, + "learning_rate": 9.70688911816561e-06, + "loss": 0.2439, + "step": 4681 + }, + { + "epoch": 0.11848065389579168, + "grad_norm": 5.758763313293457, + "learning_rate": 9.706753647711276e-06, + "loss": 0.2236, + "step": 4682 + }, + { + "epoch": 0.11850595946048537, + "grad_norm": 3.936114549636841, + "learning_rate": 9.706618146903964e-06, + "loss": 0.1826, + "step": 4683 + }, + { + "epoch": 0.11853126502517904, + "grad_norm": 8.857226371765137, + "learning_rate": 9.706482615744546e-06, + "loss": 0.2423, + "step": 4684 + }, + { + "epoch": 0.1185565705898727, + "grad_norm": 7.44849967956543, + "learning_rate": 9.706347054233897e-06, + "loss": 0.2192, + "step": 4685 + }, + { + "epoch": 0.11858187615456639, + "grad_norm": 5.659938335418701, + "learning_rate": 9.70621146237289e-06, + "loss": 0.2305, + "step": 4686 + }, + { + "epoch": 0.11860718171926006, + "grad_norm": 6.543324947357178, + "learning_rate": 9.706075840162404e-06, + "loss": 0.2294, + "step": 4687 + }, + { + "epoch": 0.11863248728395374, + "grad_norm": 10.793770790100098, + "learning_rate": 9.705940187603306e-06, + "loss": 0.2574, + "step": 4688 + }, + { + "epoch": 0.11865779284864741, + "grad_norm": 4.124024868011475, + "learning_rate": 9.70580450469648e-06, + "loss": 0.1668, + "step": 4689 + }, + { + "epoch": 0.1186830984133411, + "grad_norm": 10.638385772705078, + "learning_rate": 9.705668791442793e-06, + "loss": 0.3172, + "step": 4690 + }, + { + "epoch": 0.11870840397803477, + "grad_norm": 3.162393808364868, + "learning_rate": 9.705533047843124e-06, + "loss": 0.1528, + "step": 4691 + }, + { + "epoch": 0.11873370954272845, + "grad_norm": 4.809152603149414, + "learning_rate": 9.705397273898346e-06, + "loss": 0.1208, + "step": 4692 + }, + { + "epoch": 0.11875901510742212, + "grad_norm": 11.439221382141113, + "learning_rate": 9.70526146960934e-06, + "loss": 0.2319, + "step": 4693 + }, + { + "epoch": 0.11878432067211579, + "grad_norm": 21.08500862121582, + "learning_rate": 9.705125634976975e-06, + "loss": 0.2356, + "step": 4694 + }, + { + "epoch": 0.11880962623680948, + "grad_norm": 19.309839248657227, + "learning_rate": 9.70498977000213e-06, + "loss": 0.2058, + "step": 4695 + }, + { + "epoch": 0.11883493180150315, + "grad_norm": 6.338422775268555, + "learning_rate": 9.704853874685682e-06, + "loss": 0.1892, + "step": 4696 + }, + { + "epoch": 0.11886023736619683, + "grad_norm": 7.232547760009766, + "learning_rate": 9.704717949028506e-06, + "loss": 0.2078, + "step": 4697 + }, + { + "epoch": 0.1188855429308905, + "grad_norm": 4.020184516906738, + "learning_rate": 9.704581993031481e-06, + "loss": 0.1928, + "step": 4698 + }, + { + "epoch": 0.11891084849558418, + "grad_norm": 4.540006637573242, + "learning_rate": 9.704446006695481e-06, + "loss": 0.194, + "step": 4699 + }, + { + "epoch": 0.11893615406027785, + "grad_norm": 3.229116201400757, + "learning_rate": 9.704309990021383e-06, + "loss": 0.1525, + "step": 4700 + }, + { + "epoch": 0.11896145962497152, + "grad_norm": 6.091355800628662, + "learning_rate": 9.704173943010065e-06, + "loss": 0.1237, + "step": 4701 + }, + { + "epoch": 0.11898676518966521, + "grad_norm": 4.169830322265625, + "learning_rate": 9.704037865662404e-06, + "loss": 0.1691, + "step": 4702 + }, + { + "epoch": 0.11901207075435888, + "grad_norm": 6.081337928771973, + "learning_rate": 9.703901757979278e-06, + "loss": 0.1491, + "step": 4703 + }, + { + "epoch": 0.11903737631905256, + "grad_norm": 6.66282844543457, + "learning_rate": 9.703765619961565e-06, + "loss": 0.1518, + "step": 4704 + }, + { + "epoch": 0.11906268188374623, + "grad_norm": 3.7211310863494873, + "learning_rate": 9.703629451610143e-06, + "loss": 0.1501, + "step": 4705 + }, + { + "epoch": 0.11908798744843992, + "grad_norm": 4.859346389770508, + "learning_rate": 9.703493252925888e-06, + "loss": 0.1916, + "step": 4706 + }, + { + "epoch": 0.11911329301313359, + "grad_norm": 5.977473735809326, + "learning_rate": 9.70335702390968e-06, + "loss": 0.1822, + "step": 4707 + }, + { + "epoch": 0.11913859857782727, + "grad_norm": 16.404390335083008, + "learning_rate": 9.7032207645624e-06, + "loss": 0.3775, + "step": 4708 + }, + { + "epoch": 0.11916390414252094, + "grad_norm": 12.72588062286377, + "learning_rate": 9.703084474884922e-06, + "loss": 0.3219, + "step": 4709 + }, + { + "epoch": 0.11918920970721461, + "grad_norm": 11.821474075317383, + "learning_rate": 9.702948154878127e-06, + "loss": 0.3546, + "step": 4710 + }, + { + "epoch": 0.1192145152719083, + "grad_norm": 6.539399147033691, + "learning_rate": 9.702811804542896e-06, + "loss": 0.25, + "step": 4711 + }, + { + "epoch": 0.11923982083660196, + "grad_norm": 12.277787208557129, + "learning_rate": 9.702675423880104e-06, + "loss": 0.2304, + "step": 4712 + }, + { + "epoch": 0.11926512640129565, + "grad_norm": 6.920677185058594, + "learning_rate": 9.702539012890634e-06, + "loss": 0.2059, + "step": 4713 + }, + { + "epoch": 0.11929043196598932, + "grad_norm": 11.095126152038574, + "learning_rate": 9.702402571575366e-06, + "loss": 0.2023, + "step": 4714 + }, + { + "epoch": 0.119315737530683, + "grad_norm": 5.8165717124938965, + "learning_rate": 9.702266099935177e-06, + "loss": 0.2244, + "step": 4715 + }, + { + "epoch": 0.11934104309537667, + "grad_norm": 5.903169631958008, + "learning_rate": 9.702129597970948e-06, + "loss": 0.1974, + "step": 4716 + }, + { + "epoch": 0.11936634866007034, + "grad_norm": 9.480598449707031, + "learning_rate": 9.701993065683562e-06, + "loss": 0.2156, + "step": 4717 + }, + { + "epoch": 0.11939165422476403, + "grad_norm": 6.761425495147705, + "learning_rate": 9.701856503073897e-06, + "loss": 0.1836, + "step": 4718 + }, + { + "epoch": 0.1194169597894577, + "grad_norm": 4.019590854644775, + "learning_rate": 9.701719910142832e-06, + "loss": 0.1849, + "step": 4719 + }, + { + "epoch": 0.11944226535415138, + "grad_norm": 7.593601703643799, + "learning_rate": 9.70158328689125e-06, + "loss": 0.1866, + "step": 4720 + }, + { + "epoch": 0.11946757091884505, + "grad_norm": 5.660490989685059, + "learning_rate": 9.701446633320034e-06, + "loss": 0.1499, + "step": 4721 + }, + { + "epoch": 0.11949287648353873, + "grad_norm": 6.85928201675415, + "learning_rate": 9.701309949430062e-06, + "loss": 0.226, + "step": 4722 + }, + { + "epoch": 0.1195181820482324, + "grad_norm": 4.138563632965088, + "learning_rate": 9.701173235222217e-06, + "loss": 0.1847, + "step": 4723 + }, + { + "epoch": 0.11954348761292609, + "grad_norm": 8.381975173950195, + "learning_rate": 9.70103649069738e-06, + "loss": 0.1854, + "step": 4724 + }, + { + "epoch": 0.11956879317761976, + "grad_norm": 8.055039405822754, + "learning_rate": 9.700899715856434e-06, + "loss": 0.2422, + "step": 4725 + }, + { + "epoch": 0.11959409874231343, + "grad_norm": 3.923943519592285, + "learning_rate": 9.700762910700258e-06, + "loss": 0.203, + "step": 4726 + }, + { + "epoch": 0.11961940430700711, + "grad_norm": 3.6325912475585938, + "learning_rate": 9.700626075229739e-06, + "loss": 0.1766, + "step": 4727 + }, + { + "epoch": 0.11964470987170078, + "grad_norm": 11.961127281188965, + "learning_rate": 9.700489209445755e-06, + "loss": 0.2866, + "step": 4728 + }, + { + "epoch": 0.11967001543639447, + "grad_norm": 3.474179744720459, + "learning_rate": 9.700352313349191e-06, + "loss": 0.1324, + "step": 4729 + }, + { + "epoch": 0.11969532100108814, + "grad_norm": 4.986298084259033, + "learning_rate": 9.70021538694093e-06, + "loss": 0.1759, + "step": 4730 + }, + { + "epoch": 0.11972062656578182, + "grad_norm": 8.617557525634766, + "learning_rate": 9.700078430221853e-06, + "loss": 0.179, + "step": 4731 + }, + { + "epoch": 0.11974593213047549, + "grad_norm": 8.173624992370605, + "learning_rate": 9.699941443192846e-06, + "loss": 0.2363, + "step": 4732 + }, + { + "epoch": 0.11977123769516916, + "grad_norm": 4.788870811462402, + "learning_rate": 9.69980442585479e-06, + "loss": 0.1716, + "step": 4733 + }, + { + "epoch": 0.11979654325986284, + "grad_norm": 3.0699615478515625, + "learning_rate": 9.699667378208569e-06, + "loss": 0.1556, + "step": 4734 + }, + { + "epoch": 0.11982184882455651, + "grad_norm": 11.066682815551758, + "learning_rate": 9.699530300255068e-06, + "loss": 0.3585, + "step": 4735 + }, + { + "epoch": 0.1198471543892502, + "grad_norm": 7.658775329589844, + "learning_rate": 9.69939319199517e-06, + "loss": 0.2696, + "step": 4736 + }, + { + "epoch": 0.11987245995394387, + "grad_norm": 3.6119773387908936, + "learning_rate": 9.69925605342976e-06, + "loss": 0.1072, + "step": 4737 + }, + { + "epoch": 0.11989776551863755, + "grad_norm": 5.982853412628174, + "learning_rate": 9.699118884559721e-06, + "loss": 0.2414, + "step": 4738 + }, + { + "epoch": 0.11992307108333122, + "grad_norm": 9.799171447753906, + "learning_rate": 9.69898168538594e-06, + "loss": 0.3539, + "step": 4739 + }, + { + "epoch": 0.11994837664802491, + "grad_norm": 3.9346120357513428, + "learning_rate": 9.698844455909297e-06, + "loss": 0.2001, + "step": 4740 + }, + { + "epoch": 0.11997368221271858, + "grad_norm": 8.464927673339844, + "learning_rate": 9.698707196130684e-06, + "loss": 0.1949, + "step": 4741 + }, + { + "epoch": 0.11999898777741225, + "grad_norm": 5.426105499267578, + "learning_rate": 9.69856990605098e-06, + "loss": 0.1162, + "step": 4742 + }, + { + "epoch": 0.12002429334210593, + "grad_norm": 5.9379987716674805, + "learning_rate": 9.698432585671073e-06, + "loss": 0.1413, + "step": 4743 + }, + { + "epoch": 0.1200495989067996, + "grad_norm": 14.04392147064209, + "learning_rate": 9.698295234991849e-06, + "loss": 0.2382, + "step": 4744 + }, + { + "epoch": 0.12007490447149329, + "grad_norm": 8.302522659301758, + "learning_rate": 9.698157854014192e-06, + "loss": 0.2974, + "step": 4745 + }, + { + "epoch": 0.12010021003618696, + "grad_norm": 6.345949172973633, + "learning_rate": 9.69802044273899e-06, + "loss": 0.255, + "step": 4746 + }, + { + "epoch": 0.12012551560088064, + "grad_norm": 5.44224214553833, + "learning_rate": 9.697883001167126e-06, + "loss": 0.1696, + "step": 4747 + }, + { + "epoch": 0.12015082116557431, + "grad_norm": 7.412600517272949, + "learning_rate": 9.69774552929949e-06, + "loss": 0.2711, + "step": 4748 + }, + { + "epoch": 0.12017612673026798, + "grad_norm": 6.3665971755981445, + "learning_rate": 9.697608027136966e-06, + "loss": 0.1559, + "step": 4749 + }, + { + "epoch": 0.12020143229496166, + "grad_norm": 4.0662360191345215, + "learning_rate": 9.697470494680443e-06, + "loss": 0.1911, + "step": 4750 + }, + { + "epoch": 0.12022673785965533, + "grad_norm": 4.9285688400268555, + "learning_rate": 9.697332931930806e-06, + "loss": 0.1637, + "step": 4751 + }, + { + "epoch": 0.12025204342434902, + "grad_norm": 7.82078742980957, + "learning_rate": 9.697195338888943e-06, + "loss": 0.274, + "step": 4752 + }, + { + "epoch": 0.12027734898904269, + "grad_norm": 3.053586721420288, + "learning_rate": 9.69705771555574e-06, + "loss": 0.1368, + "step": 4753 + }, + { + "epoch": 0.12030265455373637, + "grad_norm": 8.20456314086914, + "learning_rate": 9.696920061932086e-06, + "loss": 0.237, + "step": 4754 + }, + { + "epoch": 0.12032796011843004, + "grad_norm": 4.931854724884033, + "learning_rate": 9.696782378018867e-06, + "loss": 0.2136, + "step": 4755 + }, + { + "epoch": 0.12035326568312373, + "grad_norm": 3.498891830444336, + "learning_rate": 9.696644663816975e-06, + "loss": 0.217, + "step": 4756 + }, + { + "epoch": 0.1203785712478174, + "grad_norm": 5.194564342498779, + "learning_rate": 9.696506919327293e-06, + "loss": 0.0966, + "step": 4757 + }, + { + "epoch": 0.12040387681251107, + "grad_norm": 6.982698917388916, + "learning_rate": 9.696369144550712e-06, + "loss": 0.2034, + "step": 4758 + }, + { + "epoch": 0.12042918237720475, + "grad_norm": 4.695995330810547, + "learning_rate": 9.69623133948812e-06, + "loss": 0.1959, + "step": 4759 + }, + { + "epoch": 0.12045448794189842, + "grad_norm": 3.9105517864227295, + "learning_rate": 9.696093504140406e-06, + "loss": 0.1493, + "step": 4760 + }, + { + "epoch": 0.1204797935065921, + "grad_norm": 7.230266094207764, + "learning_rate": 9.695955638508459e-06, + "loss": 0.2436, + "step": 4761 + }, + { + "epoch": 0.12050509907128577, + "grad_norm": 4.498289108276367, + "learning_rate": 9.695817742593166e-06, + "loss": 0.2051, + "step": 4762 + }, + { + "epoch": 0.12053040463597946, + "grad_norm": 7.233725070953369, + "learning_rate": 9.695679816395419e-06, + "loss": 0.2465, + "step": 4763 + }, + { + "epoch": 0.12055571020067313, + "grad_norm": 4.812417984008789, + "learning_rate": 9.695541859916106e-06, + "loss": 0.2109, + "step": 4764 + }, + { + "epoch": 0.1205810157653668, + "grad_norm": 7.230075359344482, + "learning_rate": 9.695403873156118e-06, + "loss": 0.206, + "step": 4765 + }, + { + "epoch": 0.12060632133006048, + "grad_norm": 4.5535783767700195, + "learning_rate": 9.695265856116343e-06, + "loss": 0.1877, + "step": 4766 + }, + { + "epoch": 0.12063162689475415, + "grad_norm": 5.5049285888671875, + "learning_rate": 9.69512780879767e-06, + "loss": 0.2087, + "step": 4767 + }, + { + "epoch": 0.12065693245944784, + "grad_norm": 23.817289352416992, + "learning_rate": 9.694989731200995e-06, + "loss": 0.2664, + "step": 4768 + }, + { + "epoch": 0.1206822380241415, + "grad_norm": 14.242236137390137, + "learning_rate": 9.694851623327201e-06, + "loss": 0.3733, + "step": 4769 + }, + { + "epoch": 0.12070754358883519, + "grad_norm": 11.115914344787598, + "learning_rate": 9.694713485177183e-06, + "loss": 0.289, + "step": 4770 + }, + { + "epoch": 0.12073284915352886, + "grad_norm": 5.123577117919922, + "learning_rate": 9.694575316751833e-06, + "loss": 0.167, + "step": 4771 + }, + { + "epoch": 0.12075815471822254, + "grad_norm": 8.310234069824219, + "learning_rate": 9.694437118052038e-06, + "loss": 0.1935, + "step": 4772 + }, + { + "epoch": 0.12078346028291621, + "grad_norm": 4.827576160430908, + "learning_rate": 9.694298889078693e-06, + "loss": 0.1765, + "step": 4773 + }, + { + "epoch": 0.12080876584760988, + "grad_norm": 5.292753219604492, + "learning_rate": 9.694160629832686e-06, + "loss": 0.1849, + "step": 4774 + }, + { + "epoch": 0.12083407141230357, + "grad_norm": 5.1099853515625, + "learning_rate": 9.694022340314912e-06, + "loss": 0.1674, + "step": 4775 + }, + { + "epoch": 0.12085937697699724, + "grad_norm": 6.250189304351807, + "learning_rate": 9.693884020526258e-06, + "loss": 0.2116, + "step": 4776 + }, + { + "epoch": 0.12088468254169092, + "grad_norm": 3.3422107696533203, + "learning_rate": 9.693745670467623e-06, + "loss": 0.1867, + "step": 4777 + }, + { + "epoch": 0.12090998810638459, + "grad_norm": 15.002823829650879, + "learning_rate": 9.693607290139893e-06, + "loss": 0.3362, + "step": 4778 + }, + { + "epoch": 0.12093529367107828, + "grad_norm": 4.137721061706543, + "learning_rate": 9.693468879543963e-06, + "loss": 0.1648, + "step": 4779 + }, + { + "epoch": 0.12096059923577195, + "grad_norm": 6.639636993408203, + "learning_rate": 9.693330438680724e-06, + "loss": 0.1585, + "step": 4780 + }, + { + "epoch": 0.12098590480046562, + "grad_norm": 6.111191272735596, + "learning_rate": 9.693191967551073e-06, + "loss": 0.1386, + "step": 4781 + }, + { + "epoch": 0.1210112103651593, + "grad_norm": 8.844594955444336, + "learning_rate": 9.693053466155897e-06, + "loss": 0.3028, + "step": 4782 + }, + { + "epoch": 0.12103651592985297, + "grad_norm": 6.949151039123535, + "learning_rate": 9.692914934496094e-06, + "loss": 0.2414, + "step": 4783 + }, + { + "epoch": 0.12106182149454665, + "grad_norm": 6.335954189300537, + "learning_rate": 9.692776372572554e-06, + "loss": 0.1593, + "step": 4784 + }, + { + "epoch": 0.12108712705924032, + "grad_norm": 5.478625297546387, + "learning_rate": 9.692637780386175e-06, + "loss": 0.2321, + "step": 4785 + }, + { + "epoch": 0.12111243262393401, + "grad_norm": 16.74757957458496, + "learning_rate": 9.692499157937844e-06, + "loss": 0.1766, + "step": 4786 + }, + { + "epoch": 0.12113773818862768, + "grad_norm": 7.982388973236084, + "learning_rate": 9.692360505228463e-06, + "loss": 0.2603, + "step": 4787 + }, + { + "epoch": 0.12116304375332136, + "grad_norm": 11.454824447631836, + "learning_rate": 9.692221822258918e-06, + "loss": 0.2612, + "step": 4788 + }, + { + "epoch": 0.12118834931801503, + "grad_norm": 13.892426490783691, + "learning_rate": 9.69208310903011e-06, + "loss": 0.2883, + "step": 4789 + }, + { + "epoch": 0.1212136548827087, + "grad_norm": 5.302974700927734, + "learning_rate": 9.69194436554293e-06, + "loss": 0.1526, + "step": 4790 + }, + { + "epoch": 0.12123896044740239, + "grad_norm": 7.819581508636475, + "learning_rate": 9.691805591798274e-06, + "loss": 0.2582, + "step": 4791 + }, + { + "epoch": 0.12126426601209606, + "grad_norm": 11.276153564453125, + "learning_rate": 9.691666787797036e-06, + "loss": 0.2188, + "step": 4792 + }, + { + "epoch": 0.12128957157678974, + "grad_norm": 5.0378618240356445, + "learning_rate": 9.69152795354011e-06, + "loss": 0.1891, + "step": 4793 + }, + { + "epoch": 0.12131487714148341, + "grad_norm": 5.309053421020508, + "learning_rate": 9.691389089028394e-06, + "loss": 0.2435, + "step": 4794 + }, + { + "epoch": 0.1213401827061771, + "grad_norm": 4.231337070465088, + "learning_rate": 9.691250194262783e-06, + "loss": 0.2365, + "step": 4795 + }, + { + "epoch": 0.12136548827087076, + "grad_norm": 4.181152820587158, + "learning_rate": 9.691111269244172e-06, + "loss": 0.1565, + "step": 4796 + }, + { + "epoch": 0.12139079383556443, + "grad_norm": 6.50209379196167, + "learning_rate": 9.690972313973458e-06, + "loss": 0.2063, + "step": 4797 + }, + { + "epoch": 0.12141609940025812, + "grad_norm": 7.320619583129883, + "learning_rate": 9.690833328451534e-06, + "loss": 0.2758, + "step": 4798 + }, + { + "epoch": 0.12144140496495179, + "grad_norm": 5.404209613800049, + "learning_rate": 9.690694312679301e-06, + "loss": 0.1481, + "step": 4799 + }, + { + "epoch": 0.12146671052964547, + "grad_norm": 6.399345874786377, + "learning_rate": 9.690555266657651e-06, + "loss": 0.1947, + "step": 4800 + }, + { + "epoch": 0.12149201609433914, + "grad_norm": 22.07834815979004, + "learning_rate": 9.690416190387482e-06, + "loss": 0.2919, + "step": 4801 + }, + { + "epoch": 0.12151732165903283, + "grad_norm": 4.845158576965332, + "learning_rate": 9.690277083869692e-06, + "loss": 0.1768, + "step": 4802 + }, + { + "epoch": 0.1215426272237265, + "grad_norm": 5.811661243438721, + "learning_rate": 9.690137947105179e-06, + "loss": 0.112, + "step": 4803 + }, + { + "epoch": 0.12156793278842018, + "grad_norm": 3.7538201808929443, + "learning_rate": 9.689998780094839e-06, + "loss": 0.1308, + "step": 4804 + }, + { + "epoch": 0.12159323835311385, + "grad_norm": 9.04348373413086, + "learning_rate": 9.689859582839568e-06, + "loss": 0.2761, + "step": 4805 + }, + { + "epoch": 0.12161854391780752, + "grad_norm": 17.773147583007812, + "learning_rate": 9.689720355340263e-06, + "loss": 0.3567, + "step": 4806 + }, + { + "epoch": 0.1216438494825012, + "grad_norm": 6.633084297180176, + "learning_rate": 9.689581097597827e-06, + "loss": 0.1968, + "step": 4807 + }, + { + "epoch": 0.12166915504719487, + "grad_norm": 8.130508422851562, + "learning_rate": 9.689441809613154e-06, + "loss": 0.1786, + "step": 4808 + }, + { + "epoch": 0.12169446061188856, + "grad_norm": 5.993666648864746, + "learning_rate": 9.689302491387143e-06, + "loss": 0.1622, + "step": 4809 + }, + { + "epoch": 0.12171976617658223, + "grad_norm": 15.132580757141113, + "learning_rate": 9.689163142920692e-06, + "loss": 0.2541, + "step": 4810 + }, + { + "epoch": 0.12174507174127591, + "grad_norm": 6.211578369140625, + "learning_rate": 9.6890237642147e-06, + "loss": 0.186, + "step": 4811 + }, + { + "epoch": 0.12177037730596958, + "grad_norm": 5.292394161224365, + "learning_rate": 9.688884355270068e-06, + "loss": 0.2334, + "step": 4812 + }, + { + "epoch": 0.12179568287066325, + "grad_norm": 3.7481045722961426, + "learning_rate": 9.68874491608769e-06, + "loss": 0.1789, + "step": 4813 + }, + { + "epoch": 0.12182098843535694, + "grad_norm": 4.9604268074035645, + "learning_rate": 9.68860544666847e-06, + "loss": 0.2132, + "step": 4814 + }, + { + "epoch": 0.1218462940000506, + "grad_norm": 10.830204963684082, + "learning_rate": 9.688465947013306e-06, + "loss": 0.1559, + "step": 4815 + }, + { + "epoch": 0.12187159956474429, + "grad_norm": 11.493952751159668, + "learning_rate": 9.688326417123096e-06, + "loss": 0.2411, + "step": 4816 + }, + { + "epoch": 0.12189690512943796, + "grad_norm": 5.759165287017822, + "learning_rate": 9.688186856998742e-06, + "loss": 0.2295, + "step": 4817 + }, + { + "epoch": 0.12192221069413164, + "grad_norm": 6.327176570892334, + "learning_rate": 9.688047266641142e-06, + "loss": 0.1967, + "step": 4818 + }, + { + "epoch": 0.12194751625882531, + "grad_norm": 5.418521404266357, + "learning_rate": 9.687907646051197e-06, + "loss": 0.2078, + "step": 4819 + }, + { + "epoch": 0.121972821823519, + "grad_norm": 5.883377552032471, + "learning_rate": 9.687767995229808e-06, + "loss": 0.1808, + "step": 4820 + }, + { + "epoch": 0.12199812738821267, + "grad_norm": 6.049811840057373, + "learning_rate": 9.687628314177876e-06, + "loss": 0.2038, + "step": 4821 + }, + { + "epoch": 0.12202343295290634, + "grad_norm": 7.4383978843688965, + "learning_rate": 9.6874886028963e-06, + "loss": 0.2278, + "step": 4822 + }, + { + "epoch": 0.12204873851760002, + "grad_norm": 12.096234321594238, + "learning_rate": 9.687348861385984e-06, + "loss": 0.3173, + "step": 4823 + }, + { + "epoch": 0.12207404408229369, + "grad_norm": 9.787677764892578, + "learning_rate": 9.687209089647825e-06, + "loss": 0.1403, + "step": 4824 + }, + { + "epoch": 0.12209934964698738, + "grad_norm": 6.009631633758545, + "learning_rate": 9.687069287682727e-06, + "loss": 0.1854, + "step": 4825 + }, + { + "epoch": 0.12212465521168105, + "grad_norm": 7.727371692657471, + "learning_rate": 9.686929455491591e-06, + "loss": 0.2572, + "step": 4826 + }, + { + "epoch": 0.12214996077637473, + "grad_norm": 18.134449005126953, + "learning_rate": 9.686789593075319e-06, + "loss": 0.4334, + "step": 4827 + }, + { + "epoch": 0.1221752663410684, + "grad_norm": 9.483735084533691, + "learning_rate": 9.68664970043481e-06, + "loss": 0.2988, + "step": 4828 + }, + { + "epoch": 0.12220057190576207, + "grad_norm": 4.058422088623047, + "learning_rate": 9.686509777570974e-06, + "loss": 0.2404, + "step": 4829 + }, + { + "epoch": 0.12222587747045575, + "grad_norm": 4.297208786010742, + "learning_rate": 9.686369824484704e-06, + "loss": 0.2353, + "step": 4830 + }, + { + "epoch": 0.12225118303514942, + "grad_norm": 14.581790924072266, + "learning_rate": 9.68622984117691e-06, + "loss": 0.3402, + "step": 4831 + }, + { + "epoch": 0.12227648859984311, + "grad_norm": 5.983871936798096, + "learning_rate": 9.68608982764849e-06, + "loss": 0.2346, + "step": 4832 + }, + { + "epoch": 0.12230179416453678, + "grad_norm": 10.242904663085938, + "learning_rate": 9.685949783900348e-06, + "loss": 0.3069, + "step": 4833 + }, + { + "epoch": 0.12232709972923046, + "grad_norm": 6.334638595581055, + "learning_rate": 9.685809709933386e-06, + "loss": 0.2152, + "step": 4834 + }, + { + "epoch": 0.12235240529392413, + "grad_norm": 5.096194744110107, + "learning_rate": 9.68566960574851e-06, + "loss": 0.2008, + "step": 4835 + }, + { + "epoch": 0.12237771085861782, + "grad_norm": 5.536003112792969, + "learning_rate": 9.685529471346623e-06, + "loss": 0.154, + "step": 4836 + }, + { + "epoch": 0.12240301642331149, + "grad_norm": 8.76447868347168, + "learning_rate": 9.68538930672863e-06, + "loss": 0.3368, + "step": 4837 + }, + { + "epoch": 0.12242832198800516, + "grad_norm": 10.560314178466797, + "learning_rate": 9.68524911189543e-06, + "loss": 0.3681, + "step": 4838 + }, + { + "epoch": 0.12245362755269884, + "grad_norm": 6.940032005310059, + "learning_rate": 9.685108886847931e-06, + "loss": 0.2794, + "step": 4839 + }, + { + "epoch": 0.12247893311739251, + "grad_norm": 7.7239203453063965, + "learning_rate": 9.684968631587034e-06, + "loss": 0.1983, + "step": 4840 + }, + { + "epoch": 0.1225042386820862, + "grad_norm": 11.898085594177246, + "learning_rate": 9.68482834611365e-06, + "loss": 0.4214, + "step": 4841 + }, + { + "epoch": 0.12252954424677986, + "grad_norm": 6.155093669891357, + "learning_rate": 9.684688030428677e-06, + "loss": 0.2235, + "step": 4842 + }, + { + "epoch": 0.12255484981147355, + "grad_norm": 3.5743494033813477, + "learning_rate": 9.684547684533023e-06, + "loss": 0.1947, + "step": 4843 + }, + { + "epoch": 0.12258015537616722, + "grad_norm": 8.089584350585938, + "learning_rate": 9.684407308427592e-06, + "loss": 0.2128, + "step": 4844 + }, + { + "epoch": 0.12260546094086089, + "grad_norm": 4.960940361022949, + "learning_rate": 9.684266902113289e-06, + "loss": 0.2121, + "step": 4845 + }, + { + "epoch": 0.12263076650555457, + "grad_norm": 7.931694984436035, + "learning_rate": 9.68412646559102e-06, + "loss": 0.3121, + "step": 4846 + }, + { + "epoch": 0.12265607207024824, + "grad_norm": 4.274227619171143, + "learning_rate": 9.683985998861692e-06, + "loss": 0.169, + "step": 4847 + }, + { + "epoch": 0.12268137763494193, + "grad_norm": 11.424875259399414, + "learning_rate": 9.683845501926209e-06, + "loss": 0.3012, + "step": 4848 + }, + { + "epoch": 0.1227066831996356, + "grad_norm": 4.290745258331299, + "learning_rate": 9.683704974785478e-06, + "loss": 0.1407, + "step": 4849 + }, + { + "epoch": 0.12273198876432928, + "grad_norm": 17.07241439819336, + "learning_rate": 9.683564417440403e-06, + "loss": 0.2131, + "step": 4850 + }, + { + "epoch": 0.12275729432902295, + "grad_norm": 5.867480754852295, + "learning_rate": 9.683423829891895e-06, + "loss": 0.2355, + "step": 4851 + }, + { + "epoch": 0.12278259989371663, + "grad_norm": 4.217688083648682, + "learning_rate": 9.683283212140856e-06, + "loss": 0.1735, + "step": 4852 + }, + { + "epoch": 0.1228079054584103, + "grad_norm": 5.0090203285217285, + "learning_rate": 9.683142564188194e-06, + "loss": 0.1996, + "step": 4853 + }, + { + "epoch": 0.12283321102310397, + "grad_norm": 5.083445072174072, + "learning_rate": 9.68300188603482e-06, + "loss": 0.229, + "step": 4854 + }, + { + "epoch": 0.12285851658779766, + "grad_norm": 9.886690139770508, + "learning_rate": 9.682861177681636e-06, + "loss": 0.2903, + "step": 4855 + }, + { + "epoch": 0.12288382215249133, + "grad_norm": 3.7543139457702637, + "learning_rate": 9.682720439129552e-06, + "loss": 0.1854, + "step": 4856 + }, + { + "epoch": 0.12290912771718501, + "grad_norm": 5.877018451690674, + "learning_rate": 9.682579670379473e-06, + "loss": 0.2568, + "step": 4857 + }, + { + "epoch": 0.12293443328187868, + "grad_norm": 5.298611640930176, + "learning_rate": 9.68243887143231e-06, + "loss": 0.2212, + "step": 4858 + }, + { + "epoch": 0.12295973884657237, + "grad_norm": 10.450549125671387, + "learning_rate": 9.68229804228897e-06, + "loss": 0.2641, + "step": 4859 + }, + { + "epoch": 0.12298504441126604, + "grad_norm": 5.740719318389893, + "learning_rate": 9.682157182950361e-06, + "loss": 0.1525, + "step": 4860 + }, + { + "epoch": 0.1230103499759597, + "grad_norm": 11.160168647766113, + "learning_rate": 9.682016293417392e-06, + "loss": 0.32, + "step": 4861 + }, + { + "epoch": 0.12303565554065339, + "grad_norm": 4.438257694244385, + "learning_rate": 9.681875373690968e-06, + "loss": 0.1144, + "step": 4862 + }, + { + "epoch": 0.12306096110534706, + "grad_norm": 8.029613494873047, + "learning_rate": 9.681734423772002e-06, + "loss": 0.2287, + "step": 4863 + }, + { + "epoch": 0.12308626667004074, + "grad_norm": 8.39969253540039, + "learning_rate": 9.681593443661404e-06, + "loss": 0.135, + "step": 4864 + }, + { + "epoch": 0.12311157223473441, + "grad_norm": 4.880116939544678, + "learning_rate": 9.681452433360076e-06, + "loss": 0.2147, + "step": 4865 + }, + { + "epoch": 0.1231368777994281, + "grad_norm": 5.8614349365234375, + "learning_rate": 9.681311392868936e-06, + "loss": 0.1831, + "step": 4866 + }, + { + "epoch": 0.12316218336412177, + "grad_norm": 5.893483638763428, + "learning_rate": 9.68117032218889e-06, + "loss": 0.2489, + "step": 4867 + }, + { + "epoch": 0.12318748892881545, + "grad_norm": 15.32882022857666, + "learning_rate": 9.681029221320844e-06, + "loss": 0.2332, + "step": 4868 + }, + { + "epoch": 0.12321279449350912, + "grad_norm": 16.247900009155273, + "learning_rate": 9.680888090265714e-06, + "loss": 0.3572, + "step": 4869 + }, + { + "epoch": 0.12323810005820279, + "grad_norm": 6.644372463226318, + "learning_rate": 9.680746929024406e-06, + "loss": 0.2061, + "step": 4870 + }, + { + "epoch": 0.12326340562289648, + "grad_norm": 3.6704869270324707, + "learning_rate": 9.680605737597833e-06, + "loss": 0.1679, + "step": 4871 + }, + { + "epoch": 0.12328871118759015, + "grad_norm": 15.62206745147705, + "learning_rate": 9.680464515986903e-06, + "loss": 0.3155, + "step": 4872 + }, + { + "epoch": 0.12331401675228383, + "grad_norm": 10.630818367004395, + "learning_rate": 9.680323264192528e-06, + "loss": 0.3119, + "step": 4873 + }, + { + "epoch": 0.1233393223169775, + "grad_norm": 6.504001140594482, + "learning_rate": 9.68018198221562e-06, + "loss": 0.2636, + "step": 4874 + }, + { + "epoch": 0.12336462788167118, + "grad_norm": 7.169891357421875, + "learning_rate": 9.680040670057087e-06, + "loss": 0.2545, + "step": 4875 + }, + { + "epoch": 0.12338993344636486, + "grad_norm": 7.554673671722412, + "learning_rate": 9.679899327717843e-06, + "loss": 0.2496, + "step": 4876 + }, + { + "epoch": 0.12341523901105853, + "grad_norm": 3.8118529319763184, + "learning_rate": 9.679757955198801e-06, + "loss": 0.1405, + "step": 4877 + }, + { + "epoch": 0.12344054457575221, + "grad_norm": 8.53034496307373, + "learning_rate": 9.679616552500867e-06, + "loss": 0.2025, + "step": 4878 + }, + { + "epoch": 0.12346585014044588, + "grad_norm": 4.279938220977783, + "learning_rate": 9.67947511962496e-06, + "loss": 0.1617, + "step": 4879 + }, + { + "epoch": 0.12349115570513956, + "grad_norm": 3.634545087814331, + "learning_rate": 9.679333656571986e-06, + "loss": 0.1144, + "step": 4880 + }, + { + "epoch": 0.12351646126983323, + "grad_norm": 9.405831336975098, + "learning_rate": 9.679192163342859e-06, + "loss": 0.2029, + "step": 4881 + }, + { + "epoch": 0.12354176683452692, + "grad_norm": 7.967907905578613, + "learning_rate": 9.679050639938494e-06, + "loss": 0.3173, + "step": 4882 + }, + { + "epoch": 0.12356707239922059, + "grad_norm": 5.562282085418701, + "learning_rate": 9.678909086359801e-06, + "loss": 0.2747, + "step": 4883 + }, + { + "epoch": 0.12359237796391427, + "grad_norm": 3.987929105758667, + "learning_rate": 9.678767502607693e-06, + "loss": 0.223, + "step": 4884 + }, + { + "epoch": 0.12361768352860794, + "grad_norm": 4.727980613708496, + "learning_rate": 9.678625888683084e-06, + "loss": 0.165, + "step": 4885 + }, + { + "epoch": 0.12364298909330161, + "grad_norm": 10.464408874511719, + "learning_rate": 9.678484244586888e-06, + "loss": 0.2134, + "step": 4886 + }, + { + "epoch": 0.1236682946579953, + "grad_norm": 5.065913200378418, + "learning_rate": 9.678342570320018e-06, + "loss": 0.2779, + "step": 4887 + }, + { + "epoch": 0.12369360022268897, + "grad_norm": 5.078924655914307, + "learning_rate": 9.678200865883384e-06, + "loss": 0.157, + "step": 4888 + }, + { + "epoch": 0.12371890578738265, + "grad_norm": 6.419178009033203, + "learning_rate": 9.678059131277905e-06, + "loss": 0.2097, + "step": 4889 + }, + { + "epoch": 0.12374421135207632, + "grad_norm": 14.824954986572266, + "learning_rate": 9.677917366504493e-06, + "loss": 0.2376, + "step": 4890 + }, + { + "epoch": 0.12376951691677, + "grad_norm": 7.841866493225098, + "learning_rate": 9.67777557156406e-06, + "loss": 0.2259, + "step": 4891 + }, + { + "epoch": 0.12379482248146367, + "grad_norm": 11.608719825744629, + "learning_rate": 9.677633746457526e-06, + "loss": 0.2424, + "step": 4892 + }, + { + "epoch": 0.12382012804615734, + "grad_norm": 16.666881561279297, + "learning_rate": 9.677491891185799e-06, + "loss": 0.2821, + "step": 4893 + }, + { + "epoch": 0.12384543361085103, + "grad_norm": 5.299986839294434, + "learning_rate": 9.677350005749798e-06, + "loss": 0.1448, + "step": 4894 + }, + { + "epoch": 0.1238707391755447, + "grad_norm": 5.050096035003662, + "learning_rate": 9.677208090150436e-06, + "loss": 0.2014, + "step": 4895 + }, + { + "epoch": 0.12389604474023838, + "grad_norm": 9.219508171081543, + "learning_rate": 9.67706614438863e-06, + "loss": 0.2226, + "step": 4896 + }, + { + "epoch": 0.12392135030493205, + "grad_norm": 6.308465957641602, + "learning_rate": 9.676924168465295e-06, + "loss": 0.1926, + "step": 4897 + }, + { + "epoch": 0.12394665586962574, + "grad_norm": 11.088055610656738, + "learning_rate": 9.676782162381345e-06, + "loss": 0.282, + "step": 4898 + }, + { + "epoch": 0.1239719614343194, + "grad_norm": 7.326005458831787, + "learning_rate": 9.676640126137698e-06, + "loss": 0.1613, + "step": 4899 + }, + { + "epoch": 0.12399726699901309, + "grad_norm": 10.08672046661377, + "learning_rate": 9.676498059735266e-06, + "loss": 0.2407, + "step": 4900 + }, + { + "epoch": 0.12402257256370676, + "grad_norm": 11.435855865478516, + "learning_rate": 9.67635596317497e-06, + "loss": 0.1834, + "step": 4901 + }, + { + "epoch": 0.12404787812840043, + "grad_norm": 6.996180534362793, + "learning_rate": 9.676213836457723e-06, + "loss": 0.2522, + "step": 4902 + }, + { + "epoch": 0.12407318369309411, + "grad_norm": 20.222850799560547, + "learning_rate": 9.676071679584445e-06, + "loss": 0.2654, + "step": 4903 + }, + { + "epoch": 0.12409848925778778, + "grad_norm": 3.213191270828247, + "learning_rate": 9.67592949255605e-06, + "loss": 0.1348, + "step": 4904 + }, + { + "epoch": 0.12412379482248147, + "grad_norm": 16.428064346313477, + "learning_rate": 9.675787275373455e-06, + "loss": 0.387, + "step": 4905 + }, + { + "epoch": 0.12414910038717514, + "grad_norm": 7.485500335693359, + "learning_rate": 9.675645028037577e-06, + "loss": 0.216, + "step": 4906 + }, + { + "epoch": 0.12417440595186882, + "grad_norm": 6.375401973724365, + "learning_rate": 9.675502750549335e-06, + "loss": 0.2182, + "step": 4907 + }, + { + "epoch": 0.12419971151656249, + "grad_norm": 4.393087863922119, + "learning_rate": 9.675360442909645e-06, + "loss": 0.1717, + "step": 4908 + }, + { + "epoch": 0.12422501708125616, + "grad_norm": 7.878834247589111, + "learning_rate": 9.675218105119424e-06, + "loss": 0.18, + "step": 4909 + }, + { + "epoch": 0.12425032264594985, + "grad_norm": 7.3119964599609375, + "learning_rate": 9.675075737179592e-06, + "loss": 0.2545, + "step": 4910 + }, + { + "epoch": 0.12427562821064352, + "grad_norm": 6.108448505401611, + "learning_rate": 9.674933339091066e-06, + "loss": 0.1917, + "step": 4911 + }, + { + "epoch": 0.1243009337753372, + "grad_norm": 6.207770347595215, + "learning_rate": 9.674790910854766e-06, + "loss": 0.2451, + "step": 4912 + }, + { + "epoch": 0.12432623934003087, + "grad_norm": 5.326935291290283, + "learning_rate": 9.674648452471607e-06, + "loss": 0.1618, + "step": 4913 + }, + { + "epoch": 0.12435154490472455, + "grad_norm": 7.901306629180908, + "learning_rate": 9.67450596394251e-06, + "loss": 0.1736, + "step": 4914 + }, + { + "epoch": 0.12437685046941822, + "grad_norm": 5.2111897468566895, + "learning_rate": 9.674363445268395e-06, + "loss": 0.1901, + "step": 4915 + }, + { + "epoch": 0.12440215603411191, + "grad_norm": 8.087340354919434, + "learning_rate": 9.674220896450179e-06, + "loss": 0.1819, + "step": 4916 + }, + { + "epoch": 0.12442746159880558, + "grad_norm": 13.133322715759277, + "learning_rate": 9.674078317488781e-06, + "loss": 0.2388, + "step": 4917 + }, + { + "epoch": 0.12445276716349925, + "grad_norm": 5.172425746917725, + "learning_rate": 9.673935708385122e-06, + "loss": 0.2032, + "step": 4918 + }, + { + "epoch": 0.12447807272819293, + "grad_norm": 5.86233377456665, + "learning_rate": 9.673793069140121e-06, + "loss": 0.2056, + "step": 4919 + }, + { + "epoch": 0.1245033782928866, + "grad_norm": 5.0015435218811035, + "learning_rate": 9.673650399754697e-06, + "loss": 0.2124, + "step": 4920 + }, + { + "epoch": 0.12452868385758029, + "grad_norm": 4.791194915771484, + "learning_rate": 9.673507700229773e-06, + "loss": 0.1496, + "step": 4921 + }, + { + "epoch": 0.12455398942227396, + "grad_norm": 4.0185933113098145, + "learning_rate": 9.673364970566266e-06, + "loss": 0.1872, + "step": 4922 + }, + { + "epoch": 0.12457929498696764, + "grad_norm": 6.425978183746338, + "learning_rate": 9.673222210765098e-06, + "loss": 0.2163, + "step": 4923 + }, + { + "epoch": 0.12460460055166131, + "grad_norm": 11.574325561523438, + "learning_rate": 9.67307942082719e-06, + "loss": 0.2219, + "step": 4924 + }, + { + "epoch": 0.12462990611635498, + "grad_norm": 4.52811861038208, + "learning_rate": 9.67293660075346e-06, + "loss": 0.1938, + "step": 4925 + }, + { + "epoch": 0.12465521168104866, + "grad_norm": 4.057758808135986, + "learning_rate": 9.672793750544831e-06, + "loss": 0.1398, + "step": 4926 + }, + { + "epoch": 0.12468051724574233, + "grad_norm": 5.831533432006836, + "learning_rate": 9.672650870202227e-06, + "loss": 0.1972, + "step": 4927 + }, + { + "epoch": 0.12470582281043602, + "grad_norm": 5.497646331787109, + "learning_rate": 9.672507959726564e-06, + "loss": 0.1812, + "step": 4928 + }, + { + "epoch": 0.12473112837512969, + "grad_norm": 5.002539157867432, + "learning_rate": 9.672365019118769e-06, + "loss": 0.2323, + "step": 4929 + }, + { + "epoch": 0.12475643393982337, + "grad_norm": 11.304019927978516, + "learning_rate": 9.672222048379758e-06, + "loss": 0.2769, + "step": 4930 + }, + { + "epoch": 0.12478173950451704, + "grad_norm": 7.39884090423584, + "learning_rate": 9.672079047510458e-06, + "loss": 0.264, + "step": 4931 + }, + { + "epoch": 0.12480704506921073, + "grad_norm": 5.387747287750244, + "learning_rate": 9.671936016511789e-06, + "loss": 0.2162, + "step": 4932 + }, + { + "epoch": 0.1248323506339044, + "grad_norm": 4.774173736572266, + "learning_rate": 9.671792955384673e-06, + "loss": 0.1667, + "step": 4933 + }, + { + "epoch": 0.12485765619859807, + "grad_norm": 5.02263879776001, + "learning_rate": 9.671649864130034e-06, + "loss": 0.2294, + "step": 4934 + }, + { + "epoch": 0.12488296176329175, + "grad_norm": 6.216445446014404, + "learning_rate": 9.671506742748796e-06, + "loss": 0.157, + "step": 4935 + }, + { + "epoch": 0.12490826732798542, + "grad_norm": 7.869661808013916, + "learning_rate": 9.671363591241876e-06, + "loss": 0.2149, + "step": 4936 + }, + { + "epoch": 0.1249335728926791, + "grad_norm": 21.869449615478516, + "learning_rate": 9.671220409610203e-06, + "loss": 0.2469, + "step": 4937 + }, + { + "epoch": 0.12495887845737277, + "grad_norm": 8.263611793518066, + "learning_rate": 9.6710771978547e-06, + "loss": 0.26, + "step": 4938 + }, + { + "epoch": 0.12498418402206646, + "grad_norm": 6.475461483001709, + "learning_rate": 9.670933955976286e-06, + "loss": 0.2092, + "step": 4939 + }, + { + "epoch": 0.12500948958676014, + "grad_norm": 4.561783790588379, + "learning_rate": 9.67079068397589e-06, + "loss": 0.1614, + "step": 4940 + }, + { + "epoch": 0.1250347951514538, + "grad_norm": 6.243081569671631, + "learning_rate": 9.670647381854432e-06, + "loss": 0.2416, + "step": 4941 + }, + { + "epoch": 0.12506010071614748, + "grad_norm": 6.24957799911499, + "learning_rate": 9.670504049612838e-06, + "loss": 0.1994, + "step": 4942 + }, + { + "epoch": 0.12508540628084117, + "grad_norm": 7.0195207595825195, + "learning_rate": 9.670360687252034e-06, + "loss": 0.2435, + "step": 4943 + }, + { + "epoch": 0.12511071184553482, + "grad_norm": 12.927098274230957, + "learning_rate": 9.670217294772942e-06, + "loss": 0.2328, + "step": 4944 + }, + { + "epoch": 0.1251360174102285, + "grad_norm": 6.877899169921875, + "learning_rate": 9.670073872176487e-06, + "loss": 0.1611, + "step": 4945 + }, + { + "epoch": 0.1251613229749222, + "grad_norm": 5.462316036224365, + "learning_rate": 9.669930419463596e-06, + "loss": 0.2649, + "step": 4946 + }, + { + "epoch": 0.12518662853961587, + "grad_norm": 5.581876754760742, + "learning_rate": 9.669786936635191e-06, + "loss": 0.2324, + "step": 4947 + }, + { + "epoch": 0.12521193410430953, + "grad_norm": 8.25714111328125, + "learning_rate": 9.669643423692196e-06, + "loss": 0.3136, + "step": 4948 + }, + { + "epoch": 0.12523723966900321, + "grad_norm": 5.434625148773193, + "learning_rate": 9.669499880635542e-06, + "loss": 0.1771, + "step": 4949 + }, + { + "epoch": 0.1252625452336969, + "grad_norm": 8.3087158203125, + "learning_rate": 9.669356307466154e-06, + "loss": 0.2188, + "step": 4950 + }, + { + "epoch": 0.12528785079839055, + "grad_norm": 5.552030086517334, + "learning_rate": 9.669212704184952e-06, + "loss": 0.1389, + "step": 4951 + }, + { + "epoch": 0.12531315636308424, + "grad_norm": 3.966999053955078, + "learning_rate": 9.669069070792866e-06, + "loss": 0.1591, + "step": 4952 + }, + { + "epoch": 0.12533846192777792, + "grad_norm": 6.621290683746338, + "learning_rate": 9.668925407290824e-06, + "loss": 0.2365, + "step": 4953 + }, + { + "epoch": 0.1253637674924716, + "grad_norm": 5.164254665374756, + "learning_rate": 9.66878171367975e-06, + "loss": 0.2372, + "step": 4954 + }, + { + "epoch": 0.12538907305716526, + "grad_norm": 6.55429220199585, + "learning_rate": 9.66863798996057e-06, + "loss": 0.1872, + "step": 4955 + }, + { + "epoch": 0.12541437862185895, + "grad_norm": 21.14301872253418, + "learning_rate": 9.668494236134215e-06, + "loss": 0.3037, + "step": 4956 + }, + { + "epoch": 0.12543968418655263, + "grad_norm": 8.590311050415039, + "learning_rate": 9.668350452201606e-06, + "loss": 0.2379, + "step": 4957 + }, + { + "epoch": 0.1254649897512463, + "grad_norm": 7.0323262214660645, + "learning_rate": 9.668206638163674e-06, + "loss": 0.2095, + "step": 4958 + }, + { + "epoch": 0.12549029531593997, + "grad_norm": 7.650531768798828, + "learning_rate": 9.668062794021347e-06, + "loss": 0.1905, + "step": 4959 + }, + { + "epoch": 0.12551560088063365, + "grad_norm": 6.806257724761963, + "learning_rate": 9.66791891977555e-06, + "loss": 0.2163, + "step": 4960 + }, + { + "epoch": 0.12554090644532734, + "grad_norm": 2.6106808185577393, + "learning_rate": 9.667775015427212e-06, + "loss": 0.0995, + "step": 4961 + }, + { + "epoch": 0.125566212010021, + "grad_norm": 11.430364608764648, + "learning_rate": 9.667631080977261e-06, + "loss": 0.342, + "step": 4962 + }, + { + "epoch": 0.12559151757471468, + "grad_norm": 7.724496841430664, + "learning_rate": 9.667487116426628e-06, + "loss": 0.2851, + "step": 4963 + }, + { + "epoch": 0.12561682313940836, + "grad_norm": 6.477316379547119, + "learning_rate": 9.667343121776235e-06, + "loss": 0.1409, + "step": 4964 + }, + { + "epoch": 0.12564212870410202, + "grad_norm": 5.634525775909424, + "learning_rate": 9.667199097027018e-06, + "loss": 0.1568, + "step": 4965 + }, + { + "epoch": 0.1256674342687957, + "grad_norm": 4.455196380615234, + "learning_rate": 9.6670550421799e-06, + "loss": 0.1512, + "step": 4966 + }, + { + "epoch": 0.1256927398334894, + "grad_norm": 9.259379386901855, + "learning_rate": 9.666910957235813e-06, + "loss": 0.184, + "step": 4967 + }, + { + "epoch": 0.12571804539818307, + "grad_norm": 12.06889820098877, + "learning_rate": 9.666766842195685e-06, + "loss": 0.1617, + "step": 4968 + }, + { + "epoch": 0.12574335096287673, + "grad_norm": 5.536801815032959, + "learning_rate": 9.666622697060446e-06, + "loss": 0.1884, + "step": 4969 + }, + { + "epoch": 0.1257686565275704, + "grad_norm": 3.2204301357269287, + "learning_rate": 9.666478521831025e-06, + "loss": 0.1494, + "step": 4970 + }, + { + "epoch": 0.1257939620922641, + "grad_norm": 10.569050788879395, + "learning_rate": 9.666334316508352e-06, + "loss": 0.193, + "step": 4971 + }, + { + "epoch": 0.12581926765695778, + "grad_norm": 6.8018035888671875, + "learning_rate": 9.666190081093359e-06, + "loss": 0.2199, + "step": 4972 + }, + { + "epoch": 0.12584457322165143, + "grad_norm": 8.43602466583252, + "learning_rate": 9.666045815586971e-06, + "loss": 0.2168, + "step": 4973 + }, + { + "epoch": 0.12586987878634512, + "grad_norm": 4.437739372253418, + "learning_rate": 9.665901519990122e-06, + "loss": 0.1943, + "step": 4974 + }, + { + "epoch": 0.1258951843510388, + "grad_norm": 8.248429298400879, + "learning_rate": 9.665757194303745e-06, + "loss": 0.2276, + "step": 4975 + }, + { + "epoch": 0.12592048991573246, + "grad_norm": 5.223411560058594, + "learning_rate": 9.665612838528764e-06, + "loss": 0.0825, + "step": 4976 + }, + { + "epoch": 0.12594579548042614, + "grad_norm": 3.4853720664978027, + "learning_rate": 9.665468452666116e-06, + "loss": 0.1388, + "step": 4977 + }, + { + "epoch": 0.12597110104511983, + "grad_norm": 5.4345831871032715, + "learning_rate": 9.66532403671673e-06, + "loss": 0.1857, + "step": 4978 + }, + { + "epoch": 0.1259964066098135, + "grad_norm": 8.695576667785645, + "learning_rate": 9.665179590681536e-06, + "loss": 0.2166, + "step": 4979 + }, + { + "epoch": 0.12602171217450717, + "grad_norm": 5.323106288909912, + "learning_rate": 9.665035114561466e-06, + "loss": 0.1225, + "step": 4980 + }, + { + "epoch": 0.12604701773920085, + "grad_norm": 7.649448871612549, + "learning_rate": 9.664890608357454e-06, + "loss": 0.1547, + "step": 4981 + }, + { + "epoch": 0.12607232330389453, + "grad_norm": 11.017805099487305, + "learning_rate": 9.664746072070428e-06, + "loss": 0.224, + "step": 4982 + }, + { + "epoch": 0.1260976288685882, + "grad_norm": 5.760494232177734, + "learning_rate": 9.664601505701323e-06, + "loss": 0.2332, + "step": 4983 + }, + { + "epoch": 0.12612293443328187, + "grad_norm": 6.480969429016113, + "learning_rate": 9.664456909251071e-06, + "loss": 0.2554, + "step": 4984 + }, + { + "epoch": 0.12614823999797556, + "grad_norm": 6.256540775299072, + "learning_rate": 9.664312282720605e-06, + "loss": 0.2261, + "step": 4985 + }, + { + "epoch": 0.12617354556266924, + "grad_norm": 6.718878746032715, + "learning_rate": 9.664167626110857e-06, + "loss": 0.2147, + "step": 4986 + }, + { + "epoch": 0.1261988511273629, + "grad_norm": 10.990501403808594, + "learning_rate": 9.664022939422757e-06, + "loss": 0.2618, + "step": 4987 + }, + { + "epoch": 0.12622415669205658, + "grad_norm": 8.175066947937012, + "learning_rate": 9.663878222657244e-06, + "loss": 0.2539, + "step": 4988 + }, + { + "epoch": 0.12624946225675027, + "grad_norm": 6.430383205413818, + "learning_rate": 9.663733475815245e-06, + "loss": 0.2499, + "step": 4989 + }, + { + "epoch": 0.12627476782144392, + "grad_norm": 5.271198749542236, + "learning_rate": 9.663588698897697e-06, + "loss": 0.1918, + "step": 4990 + }, + { + "epoch": 0.1263000733861376, + "grad_norm": 5.552242755889893, + "learning_rate": 9.663443891905533e-06, + "loss": 0.1936, + "step": 4991 + }, + { + "epoch": 0.1263253789508313, + "grad_norm": 12.897761344909668, + "learning_rate": 9.663299054839688e-06, + "loss": 0.2649, + "step": 4992 + }, + { + "epoch": 0.12635068451552497, + "grad_norm": 6.744993209838867, + "learning_rate": 9.663154187701093e-06, + "loss": 0.245, + "step": 4993 + }, + { + "epoch": 0.12637599008021863, + "grad_norm": 7.3540120124816895, + "learning_rate": 9.663009290490686e-06, + "loss": 0.1891, + "step": 4994 + }, + { + "epoch": 0.12640129564491231, + "grad_norm": 9.227166175842285, + "learning_rate": 9.662864363209398e-06, + "loss": 0.1724, + "step": 4995 + }, + { + "epoch": 0.126426601209606, + "grad_norm": 6.783991813659668, + "learning_rate": 9.662719405858164e-06, + "loss": 0.1716, + "step": 4996 + }, + { + "epoch": 0.12645190677429965, + "grad_norm": 5.888018608093262, + "learning_rate": 9.662574418437923e-06, + "loss": 0.2076, + "step": 4997 + }, + { + "epoch": 0.12647721233899334, + "grad_norm": 7.728612899780273, + "learning_rate": 9.662429400949605e-06, + "loss": 0.2527, + "step": 4998 + }, + { + "epoch": 0.12650251790368702, + "grad_norm": 3.2289116382598877, + "learning_rate": 9.662284353394148e-06, + "loss": 0.1835, + "step": 4999 + }, + { + "epoch": 0.1265278234683807, + "grad_norm": 5.124027252197266, + "learning_rate": 9.662139275772486e-06, + "loss": 0.162, + "step": 5000 + }, + { + "epoch": 0.12655312903307436, + "grad_norm": 10.912276268005371, + "learning_rate": 9.661994168085555e-06, + "loss": 0.2861, + "step": 5001 + }, + { + "epoch": 0.12657843459776805, + "grad_norm": 4.849756240844727, + "learning_rate": 9.661849030334291e-06, + "loss": 0.1238, + "step": 5002 + }, + { + "epoch": 0.12660374016246173, + "grad_norm": 5.48518705368042, + "learning_rate": 9.66170386251963e-06, + "loss": 0.1027, + "step": 5003 + }, + { + "epoch": 0.12662904572715541, + "grad_norm": 4.956331253051758, + "learning_rate": 9.661558664642507e-06, + "loss": 0.1503, + "step": 5004 + }, + { + "epoch": 0.12665435129184907, + "grad_norm": 5.9757232666015625, + "learning_rate": 9.66141343670386e-06, + "loss": 0.1791, + "step": 5005 + }, + { + "epoch": 0.12667965685654275, + "grad_norm": 8.066028594970703, + "learning_rate": 9.661268178704624e-06, + "loss": 0.3109, + "step": 5006 + }, + { + "epoch": 0.12670496242123644, + "grad_norm": 6.551309585571289, + "learning_rate": 9.661122890645738e-06, + "loss": 0.1831, + "step": 5007 + }, + { + "epoch": 0.1267302679859301, + "grad_norm": 14.505240440368652, + "learning_rate": 9.660977572528136e-06, + "loss": 0.3432, + "step": 5008 + }, + { + "epoch": 0.12675557355062378, + "grad_norm": 5.662907123565674, + "learning_rate": 9.660832224352758e-06, + "loss": 0.1609, + "step": 5009 + }, + { + "epoch": 0.12678087911531746, + "grad_norm": 7.315140247344971, + "learning_rate": 9.660686846120537e-06, + "loss": 0.1636, + "step": 5010 + }, + { + "epoch": 0.12680618468001115, + "grad_norm": 20.0776309967041, + "learning_rate": 9.660541437832417e-06, + "loss": 0.2224, + "step": 5011 + }, + { + "epoch": 0.1268314902447048, + "grad_norm": 8.268023490905762, + "learning_rate": 9.660395999489331e-06, + "loss": 0.1598, + "step": 5012 + }, + { + "epoch": 0.1268567958093985, + "grad_norm": 7.854330539703369, + "learning_rate": 9.660250531092217e-06, + "loss": 0.1782, + "step": 5013 + }, + { + "epoch": 0.12688210137409217, + "grad_norm": 8.317123413085938, + "learning_rate": 9.660105032642017e-06, + "loss": 0.2993, + "step": 5014 + }, + { + "epoch": 0.12690740693878583, + "grad_norm": 7.517323970794678, + "learning_rate": 9.659959504139665e-06, + "loss": 0.1215, + "step": 5015 + }, + { + "epoch": 0.1269327125034795, + "grad_norm": 5.970787048339844, + "learning_rate": 9.6598139455861e-06, + "loss": 0.2038, + "step": 5016 + }, + { + "epoch": 0.1269580180681732, + "grad_norm": 3.672947645187378, + "learning_rate": 9.659668356982262e-06, + "loss": 0.175, + "step": 5017 + }, + { + "epoch": 0.12698332363286688, + "grad_norm": 5.725954055786133, + "learning_rate": 9.65952273832909e-06, + "loss": 0.2543, + "step": 5018 + }, + { + "epoch": 0.12700862919756054, + "grad_norm": 15.685206413269043, + "learning_rate": 9.659377089627522e-06, + "loss": 0.4146, + "step": 5019 + }, + { + "epoch": 0.12703393476225422, + "grad_norm": 17.86627960205078, + "learning_rate": 9.659231410878499e-06, + "loss": 0.2343, + "step": 5020 + }, + { + "epoch": 0.1270592403269479, + "grad_norm": 9.549832344055176, + "learning_rate": 9.659085702082958e-06, + "loss": 0.2452, + "step": 5021 + }, + { + "epoch": 0.12708454589164156, + "grad_norm": 5.329253196716309, + "learning_rate": 9.65893996324184e-06, + "loss": 0.2086, + "step": 5022 + }, + { + "epoch": 0.12710985145633524, + "grad_norm": 6.963690280914307, + "learning_rate": 9.658794194356087e-06, + "loss": 0.1966, + "step": 5023 + }, + { + "epoch": 0.12713515702102893, + "grad_norm": 4.192307949066162, + "learning_rate": 9.658648395426636e-06, + "loss": 0.1115, + "step": 5024 + }, + { + "epoch": 0.1271604625857226, + "grad_norm": 5.962058067321777, + "learning_rate": 9.658502566454427e-06, + "loss": 0.1573, + "step": 5025 + }, + { + "epoch": 0.12718576815041627, + "grad_norm": 4.861330986022949, + "learning_rate": 9.658356707440402e-06, + "loss": 0.1603, + "step": 5026 + }, + { + "epoch": 0.12721107371510995, + "grad_norm": 6.344979763031006, + "learning_rate": 9.658210818385501e-06, + "loss": 0.2408, + "step": 5027 + }, + { + "epoch": 0.12723637927980364, + "grad_norm": 6.979543209075928, + "learning_rate": 9.658064899290665e-06, + "loss": 0.1979, + "step": 5028 + }, + { + "epoch": 0.1272616848444973, + "grad_norm": 3.7632179260253906, + "learning_rate": 9.657918950156834e-06, + "loss": 0.0942, + "step": 5029 + }, + { + "epoch": 0.12728699040919098, + "grad_norm": 6.373358249664307, + "learning_rate": 9.657772970984951e-06, + "loss": 0.2764, + "step": 5030 + }, + { + "epoch": 0.12731229597388466, + "grad_norm": 7.757971286773682, + "learning_rate": 9.657626961775957e-06, + "loss": 0.2906, + "step": 5031 + }, + { + "epoch": 0.12733760153857834, + "grad_norm": 6.427656650543213, + "learning_rate": 9.657480922530792e-06, + "loss": 0.2439, + "step": 5032 + }, + { + "epoch": 0.127362907103272, + "grad_norm": 6.8537678718566895, + "learning_rate": 9.657334853250399e-06, + "loss": 0.2066, + "step": 5033 + }, + { + "epoch": 0.12738821266796568, + "grad_norm": 6.4888434410095215, + "learning_rate": 9.65718875393572e-06, + "loss": 0.2485, + "step": 5034 + }, + { + "epoch": 0.12741351823265937, + "grad_norm": 10.539905548095703, + "learning_rate": 9.657042624587698e-06, + "loss": 0.2089, + "step": 5035 + }, + { + "epoch": 0.12743882379735305, + "grad_norm": 7.001877784729004, + "learning_rate": 9.656896465207274e-06, + "loss": 0.2315, + "step": 5036 + }, + { + "epoch": 0.1274641293620467, + "grad_norm": 4.74961519241333, + "learning_rate": 9.65675027579539e-06, + "loss": 0.171, + "step": 5037 + }, + { + "epoch": 0.1274894349267404, + "grad_norm": 5.914671421051025, + "learning_rate": 9.656604056352991e-06, + "loss": 0.2112, + "step": 5038 + }, + { + "epoch": 0.12751474049143408, + "grad_norm": 5.832690715789795, + "learning_rate": 9.656457806881018e-06, + "loss": 0.1857, + "step": 5039 + }, + { + "epoch": 0.12754004605612773, + "grad_norm": 4.931840896606445, + "learning_rate": 9.656311527380415e-06, + "loss": 0.1479, + "step": 5040 + }, + { + "epoch": 0.12756535162082142, + "grad_norm": 11.348600387573242, + "learning_rate": 9.656165217852126e-06, + "loss": 0.2979, + "step": 5041 + }, + { + "epoch": 0.1275906571855151, + "grad_norm": 15.917150497436523, + "learning_rate": 9.65601887829709e-06, + "loss": 0.2848, + "step": 5042 + }, + { + "epoch": 0.12761596275020878, + "grad_norm": 6.056188106536865, + "learning_rate": 9.655872508716257e-06, + "loss": 0.2434, + "step": 5043 + }, + { + "epoch": 0.12764126831490244, + "grad_norm": 6.37506628036499, + "learning_rate": 9.655726109110569e-06, + "loss": 0.1676, + "step": 5044 + }, + { + "epoch": 0.12766657387959612, + "grad_norm": 4.255563259124756, + "learning_rate": 9.655579679480969e-06, + "loss": 0.1524, + "step": 5045 + }, + { + "epoch": 0.1276918794442898, + "grad_norm": 11.060731887817383, + "learning_rate": 9.6554332198284e-06, + "loss": 0.3029, + "step": 5046 + }, + { + "epoch": 0.12771718500898346, + "grad_norm": 7.261877059936523, + "learning_rate": 9.65528673015381e-06, + "loss": 0.2673, + "step": 5047 + }, + { + "epoch": 0.12774249057367715, + "grad_norm": 7.6596150398254395, + "learning_rate": 9.65514021045814e-06, + "loss": 0.2006, + "step": 5048 + }, + { + "epoch": 0.12776779613837083, + "grad_norm": 7.86625862121582, + "learning_rate": 9.654993660742339e-06, + "loss": 0.2548, + "step": 5049 + }, + { + "epoch": 0.12779310170306452, + "grad_norm": 5.333439826965332, + "learning_rate": 9.654847081007347e-06, + "loss": 0.1978, + "step": 5050 + }, + { + "epoch": 0.12781840726775817, + "grad_norm": 6.804747581481934, + "learning_rate": 9.654700471254115e-06, + "loss": 0.1854, + "step": 5051 + }, + { + "epoch": 0.12784371283245186, + "grad_norm": 11.858895301818848, + "learning_rate": 9.654553831483584e-06, + "loss": 0.2631, + "step": 5052 + }, + { + "epoch": 0.12786901839714554, + "grad_norm": 4.080764293670654, + "learning_rate": 9.654407161696702e-06, + "loss": 0.2125, + "step": 5053 + }, + { + "epoch": 0.1278943239618392, + "grad_norm": 6.060672760009766, + "learning_rate": 9.654260461894414e-06, + "loss": 0.1704, + "step": 5054 + }, + { + "epoch": 0.12791962952653288, + "grad_norm": 7.237622261047363, + "learning_rate": 9.654113732077663e-06, + "loss": 0.2543, + "step": 5055 + }, + { + "epoch": 0.12794493509122656, + "grad_norm": 3.3855018615722656, + "learning_rate": 9.653966972247402e-06, + "loss": 0.0887, + "step": 5056 + }, + { + "epoch": 0.12797024065592025, + "grad_norm": 4.3826799392700195, + "learning_rate": 9.653820182404571e-06, + "loss": 0.1488, + "step": 5057 + }, + { + "epoch": 0.1279955462206139, + "grad_norm": 8.061973571777344, + "learning_rate": 9.653673362550122e-06, + "loss": 0.2733, + "step": 5058 + }, + { + "epoch": 0.1280208517853076, + "grad_norm": 4.414258003234863, + "learning_rate": 9.653526512684997e-06, + "loss": 0.1881, + "step": 5059 + }, + { + "epoch": 0.12804615735000127, + "grad_norm": 10.297370910644531, + "learning_rate": 9.653379632810147e-06, + "loss": 0.1942, + "step": 5060 + }, + { + "epoch": 0.12807146291469493, + "grad_norm": 6.769359588623047, + "learning_rate": 9.653232722926515e-06, + "loss": 0.2467, + "step": 5061 + }, + { + "epoch": 0.1280967684793886, + "grad_norm": 8.089828491210938, + "learning_rate": 9.653085783035052e-06, + "loss": 0.1428, + "step": 5062 + }, + { + "epoch": 0.1281220740440823, + "grad_norm": 7.036204814910889, + "learning_rate": 9.652938813136706e-06, + "loss": 0.204, + "step": 5063 + }, + { + "epoch": 0.12814737960877598, + "grad_norm": 11.009182929992676, + "learning_rate": 9.65279181323242e-06, + "loss": 0.2173, + "step": 5064 + }, + { + "epoch": 0.12817268517346964, + "grad_norm": 4.146817684173584, + "learning_rate": 9.652644783323146e-06, + "loss": 0.143, + "step": 5065 + }, + { + "epoch": 0.12819799073816332, + "grad_norm": 7.506922245025635, + "learning_rate": 9.652497723409833e-06, + "loss": 0.2464, + "step": 5066 + }, + { + "epoch": 0.128223296302857, + "grad_norm": 3.4386234283447266, + "learning_rate": 9.652350633493426e-06, + "loss": 0.186, + "step": 5067 + }, + { + "epoch": 0.1282486018675507, + "grad_norm": 4.956689834594727, + "learning_rate": 9.652203513574875e-06, + "loss": 0.2064, + "step": 5068 + }, + { + "epoch": 0.12827390743224434, + "grad_norm": 8.529417991638184, + "learning_rate": 9.652056363655129e-06, + "loss": 0.3368, + "step": 5069 + }, + { + "epoch": 0.12829921299693803, + "grad_norm": 5.905375957489014, + "learning_rate": 9.651909183735137e-06, + "loss": 0.1281, + "step": 5070 + }, + { + "epoch": 0.1283245185616317, + "grad_norm": 6.728270530700684, + "learning_rate": 9.651761973815849e-06, + "loss": 0.2877, + "step": 5071 + }, + { + "epoch": 0.12834982412632537, + "grad_norm": 12.477978706359863, + "learning_rate": 9.651614733898213e-06, + "loss": 0.2096, + "step": 5072 + }, + { + "epoch": 0.12837512969101905, + "grad_norm": 22.41704750061035, + "learning_rate": 9.651467463983177e-06, + "loss": 0.2918, + "step": 5073 + }, + { + "epoch": 0.12840043525571274, + "grad_norm": 5.733108043670654, + "learning_rate": 9.651320164071696e-06, + "loss": 0.1624, + "step": 5074 + }, + { + "epoch": 0.12842574082040642, + "grad_norm": 4.5321269035339355, + "learning_rate": 9.651172834164714e-06, + "loss": 0.1343, + "step": 5075 + }, + { + "epoch": 0.12845104638510008, + "grad_norm": 6.547933578491211, + "learning_rate": 9.651025474263184e-06, + "loss": 0.2074, + "step": 5076 + }, + { + "epoch": 0.12847635194979376, + "grad_norm": 11.578390121459961, + "learning_rate": 9.650878084368057e-06, + "loss": 0.1634, + "step": 5077 + }, + { + "epoch": 0.12850165751448744, + "grad_norm": 5.997805595397949, + "learning_rate": 9.65073066448028e-06, + "loss": 0.2385, + "step": 5078 + }, + { + "epoch": 0.1285269630791811, + "grad_norm": 4.4382429122924805, + "learning_rate": 9.650583214600807e-06, + "loss": 0.1517, + "step": 5079 + }, + { + "epoch": 0.12855226864387478, + "grad_norm": 5.484348773956299, + "learning_rate": 9.650435734730589e-06, + "loss": 0.1954, + "step": 5080 + }, + { + "epoch": 0.12857757420856847, + "grad_norm": 5.651706218719482, + "learning_rate": 9.650288224870575e-06, + "loss": 0.165, + "step": 5081 + }, + { + "epoch": 0.12860287977326215, + "grad_norm": 9.63459587097168, + "learning_rate": 9.650140685021716e-06, + "loss": 0.2835, + "step": 5082 + }, + { + "epoch": 0.1286281853379558, + "grad_norm": 7.096347332000732, + "learning_rate": 9.649993115184968e-06, + "loss": 0.2312, + "step": 5083 + }, + { + "epoch": 0.1286534909026495, + "grad_norm": 5.33470344543457, + "learning_rate": 9.649845515361278e-06, + "loss": 0.1994, + "step": 5084 + }, + { + "epoch": 0.12867879646734318, + "grad_norm": 6.221845626831055, + "learning_rate": 9.649697885551598e-06, + "loss": 0.2241, + "step": 5085 + }, + { + "epoch": 0.12870410203203683, + "grad_norm": 3.399517297744751, + "learning_rate": 9.649550225756882e-06, + "loss": 0.1567, + "step": 5086 + }, + { + "epoch": 0.12872940759673052, + "grad_norm": 9.79856014251709, + "learning_rate": 9.649402535978082e-06, + "loss": 0.3398, + "step": 5087 + }, + { + "epoch": 0.1287547131614242, + "grad_norm": 4.952977657318115, + "learning_rate": 9.649254816216149e-06, + "loss": 0.1696, + "step": 5088 + }, + { + "epoch": 0.12878001872611788, + "grad_norm": 5.064163684844971, + "learning_rate": 9.649107066472037e-06, + "loss": 0.1906, + "step": 5089 + }, + { + "epoch": 0.12880532429081154, + "grad_norm": 8.326427459716797, + "learning_rate": 9.648959286746699e-06, + "loss": 0.2236, + "step": 5090 + }, + { + "epoch": 0.12883062985550522, + "grad_norm": 9.175077438354492, + "learning_rate": 9.648811477041087e-06, + "loss": 0.3223, + "step": 5091 + }, + { + "epoch": 0.1288559354201989, + "grad_norm": 5.265559196472168, + "learning_rate": 9.648663637356153e-06, + "loss": 0.2121, + "step": 5092 + }, + { + "epoch": 0.12888124098489256, + "grad_norm": 10.373367309570312, + "learning_rate": 9.648515767692854e-06, + "loss": 0.286, + "step": 5093 + }, + { + "epoch": 0.12890654654958625, + "grad_norm": 6.481996059417725, + "learning_rate": 9.64836786805214e-06, + "loss": 0.1806, + "step": 5094 + }, + { + "epoch": 0.12893185211427993, + "grad_norm": 4.627971649169922, + "learning_rate": 9.648219938434967e-06, + "loss": 0.216, + "step": 5095 + }, + { + "epoch": 0.12895715767897362, + "grad_norm": 4.583371162414551, + "learning_rate": 9.648071978842288e-06, + "loss": 0.1646, + "step": 5096 + }, + { + "epoch": 0.12898246324366727, + "grad_norm": 4.2524003982543945, + "learning_rate": 9.647923989275057e-06, + "loss": 0.1989, + "step": 5097 + }, + { + "epoch": 0.12900776880836096, + "grad_norm": 8.526751518249512, + "learning_rate": 9.64777596973423e-06, + "loss": 0.3134, + "step": 5098 + }, + { + "epoch": 0.12903307437305464, + "grad_norm": 6.677285194396973, + "learning_rate": 9.647627920220759e-06, + "loss": 0.2765, + "step": 5099 + }, + { + "epoch": 0.12905837993774832, + "grad_norm": 7.468000888824463, + "learning_rate": 9.6474798407356e-06, + "loss": 0.2484, + "step": 5100 + }, + { + "epoch": 0.12908368550244198, + "grad_norm": 6.082614421844482, + "learning_rate": 9.64733173127971e-06, + "loss": 0.2687, + "step": 5101 + }, + { + "epoch": 0.12910899106713566, + "grad_norm": 4.119299411773682, + "learning_rate": 9.64718359185404e-06, + "loss": 0.1878, + "step": 5102 + }, + { + "epoch": 0.12913429663182935, + "grad_norm": 3.3670833110809326, + "learning_rate": 9.64703542245955e-06, + "loss": 0.2225, + "step": 5103 + }, + { + "epoch": 0.129159602196523, + "grad_norm": 4.573888301849365, + "learning_rate": 9.64688722309719e-06, + "loss": 0.224, + "step": 5104 + }, + { + "epoch": 0.1291849077612167, + "grad_norm": 4.83845329284668, + "learning_rate": 9.64673899376792e-06, + "loss": 0.1756, + "step": 5105 + }, + { + "epoch": 0.12921021332591037, + "grad_norm": 5.5812273025512695, + "learning_rate": 9.646590734472693e-06, + "loss": 0.1576, + "step": 5106 + }, + { + "epoch": 0.12923551889060406, + "grad_norm": 4.500786304473877, + "learning_rate": 9.64644244521247e-06, + "loss": 0.1863, + "step": 5107 + }, + { + "epoch": 0.1292608244552977, + "grad_norm": 5.05211877822876, + "learning_rate": 9.646294125988202e-06, + "loss": 0.1937, + "step": 5108 + }, + { + "epoch": 0.1292861300199914, + "grad_norm": 5.5892839431762695, + "learning_rate": 9.646145776800848e-06, + "loss": 0.1817, + "step": 5109 + }, + { + "epoch": 0.12931143558468508, + "grad_norm": 11.852094650268555, + "learning_rate": 9.645997397651363e-06, + "loss": 0.3839, + "step": 5110 + }, + { + "epoch": 0.12933674114937874, + "grad_norm": 5.415852069854736, + "learning_rate": 9.645848988540707e-06, + "loss": 0.1916, + "step": 5111 + }, + { + "epoch": 0.12936204671407242, + "grad_norm": 5.328253269195557, + "learning_rate": 9.645700549469832e-06, + "loss": 0.1651, + "step": 5112 + }, + { + "epoch": 0.1293873522787661, + "grad_norm": 5.121105670928955, + "learning_rate": 9.645552080439702e-06, + "loss": 0.2321, + "step": 5113 + }, + { + "epoch": 0.1294126578434598, + "grad_norm": 5.3559675216674805, + "learning_rate": 9.645403581451268e-06, + "loss": 0.1682, + "step": 5114 + }, + { + "epoch": 0.12943796340815344, + "grad_norm": 6.159922122955322, + "learning_rate": 9.645255052505491e-06, + "loss": 0.1782, + "step": 5115 + }, + { + "epoch": 0.12946326897284713, + "grad_norm": 6.3014421463012695, + "learning_rate": 9.645106493603329e-06, + "loss": 0.2124, + "step": 5116 + }, + { + "epoch": 0.1294885745375408, + "grad_norm": 9.280250549316406, + "learning_rate": 9.64495790474574e-06, + "loss": 0.3039, + "step": 5117 + }, + { + "epoch": 0.12951388010223447, + "grad_norm": 7.1626763343811035, + "learning_rate": 9.64480928593368e-06, + "loss": 0.2213, + "step": 5118 + }, + { + "epoch": 0.12953918566692815, + "grad_norm": 4.226489067077637, + "learning_rate": 9.644660637168109e-06, + "loss": 0.2199, + "step": 5119 + }, + { + "epoch": 0.12956449123162184, + "grad_norm": 7.6660590171813965, + "learning_rate": 9.644511958449986e-06, + "loss": 0.2276, + "step": 5120 + }, + { + "epoch": 0.12958979679631552, + "grad_norm": 10.342109680175781, + "learning_rate": 9.64436324978027e-06, + "loss": 0.2451, + "step": 5121 + }, + { + "epoch": 0.12961510236100918, + "grad_norm": 14.644083976745605, + "learning_rate": 9.644214511159918e-06, + "loss": 0.2653, + "step": 5122 + }, + { + "epoch": 0.12964040792570286, + "grad_norm": 4.322844505310059, + "learning_rate": 9.644065742589889e-06, + "loss": 0.1815, + "step": 5123 + }, + { + "epoch": 0.12966571349039654, + "grad_norm": 9.68669319152832, + "learning_rate": 9.643916944071147e-06, + "loss": 0.2638, + "step": 5124 + }, + { + "epoch": 0.1296910190550902, + "grad_norm": 7.729945182800293, + "learning_rate": 9.643768115604649e-06, + "loss": 0.245, + "step": 5125 + }, + { + "epoch": 0.12971632461978388, + "grad_norm": 8.458714485168457, + "learning_rate": 9.643619257191352e-06, + "loss": 0.2789, + "step": 5126 + }, + { + "epoch": 0.12974163018447757, + "grad_norm": 4.464286804199219, + "learning_rate": 9.643470368832218e-06, + "loss": 0.2523, + "step": 5127 + }, + { + "epoch": 0.12976693574917125, + "grad_norm": 7.256535530090332, + "learning_rate": 9.643321450528207e-06, + "loss": 0.2171, + "step": 5128 + }, + { + "epoch": 0.1297922413138649, + "grad_norm": 6.958280563354492, + "learning_rate": 9.643172502280283e-06, + "loss": 0.1826, + "step": 5129 + }, + { + "epoch": 0.1298175468785586, + "grad_norm": 5.272419452667236, + "learning_rate": 9.6430235240894e-06, + "loss": 0.2077, + "step": 5130 + }, + { + "epoch": 0.12984285244325228, + "grad_norm": 4.364738941192627, + "learning_rate": 9.642874515956523e-06, + "loss": 0.1908, + "step": 5131 + }, + { + "epoch": 0.12986815800794596, + "grad_norm": 5.084549427032471, + "learning_rate": 9.642725477882613e-06, + "loss": 0.1903, + "step": 5132 + }, + { + "epoch": 0.12989346357263962, + "grad_norm": 6.102195739746094, + "learning_rate": 9.642576409868627e-06, + "loss": 0.1674, + "step": 5133 + }, + { + "epoch": 0.1299187691373333, + "grad_norm": 7.11787748336792, + "learning_rate": 9.642427311915532e-06, + "loss": 0.228, + "step": 5134 + }, + { + "epoch": 0.12994407470202698, + "grad_norm": 6.031714916229248, + "learning_rate": 9.642278184024285e-06, + "loss": 0.2163, + "step": 5135 + }, + { + "epoch": 0.12996938026672064, + "grad_norm": 3.8469338417053223, + "learning_rate": 9.64212902619585e-06, + "loss": 0.1126, + "step": 5136 + }, + { + "epoch": 0.12999468583141432, + "grad_norm": 5.586337566375732, + "learning_rate": 9.64197983843119e-06, + "loss": 0.244, + "step": 5137 + }, + { + "epoch": 0.130019991396108, + "grad_norm": 3.8169448375701904, + "learning_rate": 9.641830620731265e-06, + "loss": 0.1358, + "step": 5138 + }, + { + "epoch": 0.1300452969608017, + "grad_norm": 3.5164265632629395, + "learning_rate": 9.641681373097037e-06, + "loss": 0.2225, + "step": 5139 + }, + { + "epoch": 0.13007060252549535, + "grad_norm": 4.876254558563232, + "learning_rate": 9.641532095529468e-06, + "loss": 0.1473, + "step": 5140 + }, + { + "epoch": 0.13009590809018903, + "grad_norm": 4.553982734680176, + "learning_rate": 9.641382788029525e-06, + "loss": 0.2258, + "step": 5141 + }, + { + "epoch": 0.13012121365488272, + "grad_norm": 4.860623359680176, + "learning_rate": 9.641233450598167e-06, + "loss": 0.1898, + "step": 5142 + }, + { + "epoch": 0.13014651921957637, + "grad_norm": 7.952675819396973, + "learning_rate": 9.641084083236357e-06, + "loss": 0.1877, + "step": 5143 + }, + { + "epoch": 0.13017182478427006, + "grad_norm": 14.746378898620605, + "learning_rate": 9.640934685945057e-06, + "loss": 0.384, + "step": 5144 + }, + { + "epoch": 0.13019713034896374, + "grad_norm": 4.604210376739502, + "learning_rate": 9.640785258725236e-06, + "loss": 0.2622, + "step": 5145 + }, + { + "epoch": 0.13022243591365742, + "grad_norm": 6.825836658477783, + "learning_rate": 9.64063580157785e-06, + "loss": 0.168, + "step": 5146 + }, + { + "epoch": 0.13024774147835108, + "grad_norm": 9.440023422241211, + "learning_rate": 9.640486314503872e-06, + "loss": 0.2906, + "step": 5147 + }, + { + "epoch": 0.13027304704304477, + "grad_norm": 5.736621856689453, + "learning_rate": 9.640336797504258e-06, + "loss": 0.1761, + "step": 5148 + }, + { + "epoch": 0.13029835260773845, + "grad_norm": 5.050718784332275, + "learning_rate": 9.640187250579975e-06, + "loss": 0.1583, + "step": 5149 + }, + { + "epoch": 0.1303236581724321, + "grad_norm": 7.898765563964844, + "learning_rate": 9.640037673731989e-06, + "loss": 0.2386, + "step": 5150 + }, + { + "epoch": 0.1303489637371258, + "grad_norm": 6.166585922241211, + "learning_rate": 9.639888066961262e-06, + "loss": 0.2054, + "step": 5151 + }, + { + "epoch": 0.13037426930181947, + "grad_norm": 7.746421813964844, + "learning_rate": 9.639738430268762e-06, + "loss": 0.2323, + "step": 5152 + }, + { + "epoch": 0.13039957486651316, + "grad_norm": 4.64280366897583, + "learning_rate": 9.63958876365545e-06, + "loss": 0.2312, + "step": 5153 + }, + { + "epoch": 0.1304248804312068, + "grad_norm": 3.7083001136779785, + "learning_rate": 9.639439067122292e-06, + "loss": 0.188, + "step": 5154 + }, + { + "epoch": 0.1304501859959005, + "grad_norm": 7.5269060134887695, + "learning_rate": 9.639289340670256e-06, + "loss": 0.161, + "step": 5155 + }, + { + "epoch": 0.13047549156059418, + "grad_norm": 9.782353401184082, + "learning_rate": 9.639139584300306e-06, + "loss": 0.2214, + "step": 5156 + }, + { + "epoch": 0.13050079712528784, + "grad_norm": 5.008162021636963, + "learning_rate": 9.638989798013408e-06, + "loss": 0.2396, + "step": 5157 + }, + { + "epoch": 0.13052610268998152, + "grad_norm": 8.164440155029297, + "learning_rate": 9.638839981810526e-06, + "loss": 0.1256, + "step": 5158 + }, + { + "epoch": 0.1305514082546752, + "grad_norm": 4.034746170043945, + "learning_rate": 9.63869013569263e-06, + "loss": 0.1446, + "step": 5159 + }, + { + "epoch": 0.1305767138193689, + "grad_norm": 10.271174430847168, + "learning_rate": 9.638540259660683e-06, + "loss": 0.2591, + "step": 5160 + }, + { + "epoch": 0.13060201938406255, + "grad_norm": 7.463202953338623, + "learning_rate": 9.638390353715653e-06, + "loss": 0.2293, + "step": 5161 + }, + { + "epoch": 0.13062732494875623, + "grad_norm": 7.0328593254089355, + "learning_rate": 9.638240417858507e-06, + "loss": 0.2081, + "step": 5162 + }, + { + "epoch": 0.1306526305134499, + "grad_norm": 6.012906074523926, + "learning_rate": 9.638090452090211e-06, + "loss": 0.1645, + "step": 5163 + }, + { + "epoch": 0.1306779360781436, + "grad_norm": 6.637207984924316, + "learning_rate": 9.637940456411732e-06, + "loss": 0.2619, + "step": 5164 + }, + { + "epoch": 0.13070324164283725, + "grad_norm": 9.126391410827637, + "learning_rate": 9.637790430824037e-06, + "loss": 0.257, + "step": 5165 + }, + { + "epoch": 0.13072854720753094, + "grad_norm": 5.601104736328125, + "learning_rate": 9.637640375328095e-06, + "loss": 0.1874, + "step": 5166 + }, + { + "epoch": 0.13075385277222462, + "grad_norm": 4.635882377624512, + "learning_rate": 9.637490289924873e-06, + "loss": 0.1477, + "step": 5167 + }, + { + "epoch": 0.13077915833691828, + "grad_norm": 7.462405204772949, + "learning_rate": 9.63734017461534e-06, + "loss": 0.1806, + "step": 5168 + }, + { + "epoch": 0.13080446390161196, + "grad_norm": 10.486412048339844, + "learning_rate": 9.63719002940046e-06, + "loss": 0.18, + "step": 5169 + }, + { + "epoch": 0.13082976946630565, + "grad_norm": 5.065071105957031, + "learning_rate": 9.637039854281207e-06, + "loss": 0.1502, + "step": 5170 + }, + { + "epoch": 0.13085507503099933, + "grad_norm": 4.947219371795654, + "learning_rate": 9.636889649258546e-06, + "loss": 0.1147, + "step": 5171 + }, + { + "epoch": 0.13088038059569299, + "grad_norm": 8.87340259552002, + "learning_rate": 9.636739414333444e-06, + "loss": 0.299, + "step": 5172 + }, + { + "epoch": 0.13090568616038667, + "grad_norm": 4.318167686462402, + "learning_rate": 9.636589149506874e-06, + "loss": 0.1758, + "step": 5173 + }, + { + "epoch": 0.13093099172508035, + "grad_norm": 7.496587753295898, + "learning_rate": 9.636438854779801e-06, + "loss": 0.252, + "step": 5174 + }, + { + "epoch": 0.130956297289774, + "grad_norm": 7.362189292907715, + "learning_rate": 9.636288530153199e-06, + "loss": 0.2201, + "step": 5175 + }, + { + "epoch": 0.1309816028544677, + "grad_norm": 6.51007604598999, + "learning_rate": 9.636138175628034e-06, + "loss": 0.3173, + "step": 5176 + }, + { + "epoch": 0.13100690841916138, + "grad_norm": 7.313040256500244, + "learning_rate": 9.635987791205275e-06, + "loss": 0.2385, + "step": 5177 + }, + { + "epoch": 0.13103221398385506, + "grad_norm": 6.393057346343994, + "learning_rate": 9.635837376885893e-06, + "loss": 0.2617, + "step": 5178 + }, + { + "epoch": 0.13105751954854872, + "grad_norm": 8.150015830993652, + "learning_rate": 9.63568693267086e-06, + "loss": 0.1984, + "step": 5179 + }, + { + "epoch": 0.1310828251132424, + "grad_norm": 5.518680095672607, + "learning_rate": 9.635536458561143e-06, + "loss": 0.2214, + "step": 5180 + }, + { + "epoch": 0.13110813067793609, + "grad_norm": 3.930415153503418, + "learning_rate": 9.635385954557715e-06, + "loss": 0.1809, + "step": 5181 + }, + { + "epoch": 0.13113343624262974, + "grad_norm": 4.111059188842773, + "learning_rate": 9.635235420661544e-06, + "loss": 0.2374, + "step": 5182 + }, + { + "epoch": 0.13115874180732343, + "grad_norm": 4.530597686767578, + "learning_rate": 9.6350848568736e-06, + "loss": 0.2376, + "step": 5183 + }, + { + "epoch": 0.1311840473720171, + "grad_norm": 4.748114109039307, + "learning_rate": 9.63493426319486e-06, + "loss": 0.2006, + "step": 5184 + }, + { + "epoch": 0.1312093529367108, + "grad_norm": 8.303061485290527, + "learning_rate": 9.634783639626288e-06, + "loss": 0.3124, + "step": 5185 + }, + { + "epoch": 0.13123465850140445, + "grad_norm": 4.666988849639893, + "learning_rate": 9.63463298616886e-06, + "loss": 0.0623, + "step": 5186 + }, + { + "epoch": 0.13125996406609813, + "grad_norm": 6.298798084259033, + "learning_rate": 9.634482302823545e-06, + "loss": 0.1893, + "step": 5187 + }, + { + "epoch": 0.13128526963079182, + "grad_norm": 9.790925979614258, + "learning_rate": 9.634331589591317e-06, + "loss": 0.2709, + "step": 5188 + }, + { + "epoch": 0.13131057519548547, + "grad_norm": 6.932468414306641, + "learning_rate": 9.634180846473145e-06, + "loss": 0.2076, + "step": 5189 + }, + { + "epoch": 0.13133588076017916, + "grad_norm": 7.654328346252441, + "learning_rate": 9.634030073470005e-06, + "loss": 0.1971, + "step": 5190 + }, + { + "epoch": 0.13136118632487284, + "grad_norm": 3.929091691970825, + "learning_rate": 9.633879270582864e-06, + "loss": 0.1807, + "step": 5191 + }, + { + "epoch": 0.13138649188956653, + "grad_norm": 4.589252471923828, + "learning_rate": 9.6337284378127e-06, + "loss": 0.1715, + "step": 5192 + }, + { + "epoch": 0.13141179745426018, + "grad_norm": 4.9371418952941895, + "learning_rate": 9.633577575160481e-06, + "loss": 0.1757, + "step": 5193 + }, + { + "epoch": 0.13143710301895387, + "grad_norm": 3.2601318359375, + "learning_rate": 9.633426682627186e-06, + "loss": 0.1284, + "step": 5194 + }, + { + "epoch": 0.13146240858364755, + "grad_norm": 5.436182022094727, + "learning_rate": 9.633275760213781e-06, + "loss": 0.2635, + "step": 5195 + }, + { + "epoch": 0.13148771414834123, + "grad_norm": 3.53682541847229, + "learning_rate": 9.633124807921243e-06, + "loss": 0.1633, + "step": 5196 + }, + { + "epoch": 0.1315130197130349, + "grad_norm": 4.149437427520752, + "learning_rate": 9.632973825750545e-06, + "loss": 0.2047, + "step": 5197 + }, + { + "epoch": 0.13153832527772857, + "grad_norm": 6.9004292488098145, + "learning_rate": 9.63282281370266e-06, + "loss": 0.2035, + "step": 5198 + }, + { + "epoch": 0.13156363084242226, + "grad_norm": 8.822722434997559, + "learning_rate": 9.632671771778563e-06, + "loss": 0.1471, + "step": 5199 + }, + { + "epoch": 0.13158893640711591, + "grad_norm": 6.284319877624512, + "learning_rate": 9.632520699979228e-06, + "loss": 0.1787, + "step": 5200 + }, + { + "epoch": 0.1316142419718096, + "grad_norm": 5.751925468444824, + "learning_rate": 9.632369598305628e-06, + "loss": 0.1188, + "step": 5201 + }, + { + "epoch": 0.13163954753650328, + "grad_norm": 7.036798477172852, + "learning_rate": 9.632218466758739e-06, + "loss": 0.2068, + "step": 5202 + }, + { + "epoch": 0.13166485310119697, + "grad_norm": 6.718809604644775, + "learning_rate": 9.632067305339535e-06, + "loss": 0.1227, + "step": 5203 + }, + { + "epoch": 0.13169015866589062, + "grad_norm": 12.243675231933594, + "learning_rate": 9.631916114048989e-06, + "loss": 0.2057, + "step": 5204 + }, + { + "epoch": 0.1317154642305843, + "grad_norm": 18.198699951171875, + "learning_rate": 9.631764892888078e-06, + "loss": 0.3853, + "step": 5205 + }, + { + "epoch": 0.131740769795278, + "grad_norm": 5.818737506866455, + "learning_rate": 9.631613641857778e-06, + "loss": 0.1722, + "step": 5206 + }, + { + "epoch": 0.13176607535997165, + "grad_norm": 6.073861122131348, + "learning_rate": 9.631462360959062e-06, + "loss": 0.2707, + "step": 5207 + }, + { + "epoch": 0.13179138092466533, + "grad_norm": 4.421298503875732, + "learning_rate": 9.631311050192908e-06, + "loss": 0.1539, + "step": 5208 + }, + { + "epoch": 0.13181668648935901, + "grad_norm": 4.100461959838867, + "learning_rate": 9.631159709560289e-06, + "loss": 0.1384, + "step": 5209 + }, + { + "epoch": 0.1318419920540527, + "grad_norm": 4.351313591003418, + "learning_rate": 9.631008339062185e-06, + "loss": 0.1571, + "step": 5210 + }, + { + "epoch": 0.13186729761874635, + "grad_norm": 5.402710914611816, + "learning_rate": 9.630856938699567e-06, + "loss": 0.2112, + "step": 5211 + }, + { + "epoch": 0.13189260318344004, + "grad_norm": 10.334912300109863, + "learning_rate": 9.630705508473415e-06, + "loss": 0.1544, + "step": 5212 + }, + { + "epoch": 0.13191790874813372, + "grad_norm": 6.941708564758301, + "learning_rate": 9.630554048384704e-06, + "loss": 0.1648, + "step": 5213 + }, + { + "epoch": 0.13194321431282738, + "grad_norm": 13.842906951904297, + "learning_rate": 9.630402558434411e-06, + "loss": 0.2351, + "step": 5214 + }, + { + "epoch": 0.13196851987752106, + "grad_norm": 4.3707380294799805, + "learning_rate": 9.630251038623514e-06, + "loss": 0.2105, + "step": 5215 + }, + { + "epoch": 0.13199382544221475, + "grad_norm": 6.015904903411865, + "learning_rate": 9.630099488952991e-06, + "loss": 0.1738, + "step": 5216 + }, + { + "epoch": 0.13201913100690843, + "grad_norm": 5.398966312408447, + "learning_rate": 9.629947909423816e-06, + "loss": 0.239, + "step": 5217 + }, + { + "epoch": 0.1320444365716021, + "grad_norm": 7.126430034637451, + "learning_rate": 9.629796300036967e-06, + "loss": 0.1951, + "step": 5218 + }, + { + "epoch": 0.13206974213629577, + "grad_norm": 5.289400577545166, + "learning_rate": 9.629644660793425e-06, + "loss": 0.1102, + "step": 5219 + }, + { + "epoch": 0.13209504770098945, + "grad_norm": 6.238710403442383, + "learning_rate": 9.629492991694165e-06, + "loss": 0.2366, + "step": 5220 + }, + { + "epoch": 0.1321203532656831, + "grad_norm": 12.628365516662598, + "learning_rate": 9.629341292740163e-06, + "loss": 0.2704, + "step": 5221 + }, + { + "epoch": 0.1321456588303768, + "grad_norm": 8.08019733428955, + "learning_rate": 9.629189563932404e-06, + "loss": 0.2923, + "step": 5222 + }, + { + "epoch": 0.13217096439507048, + "grad_norm": 6.865862846374512, + "learning_rate": 9.629037805271861e-06, + "loss": 0.2376, + "step": 5223 + }, + { + "epoch": 0.13219626995976416, + "grad_norm": 8.525626182556152, + "learning_rate": 9.628886016759516e-06, + "loss": 0.1141, + "step": 5224 + }, + { + "epoch": 0.13222157552445782, + "grad_norm": 4.927068710327148, + "learning_rate": 9.628734198396344e-06, + "loss": 0.2194, + "step": 5225 + }, + { + "epoch": 0.1322468810891515, + "grad_norm": 7.03499174118042, + "learning_rate": 9.628582350183326e-06, + "loss": 0.1413, + "step": 5226 + }, + { + "epoch": 0.1322721866538452, + "grad_norm": 2.8432438373565674, + "learning_rate": 9.628430472121445e-06, + "loss": 0.1342, + "step": 5227 + }, + { + "epoch": 0.13229749221853887, + "grad_norm": 24.428401947021484, + "learning_rate": 9.628278564211672e-06, + "loss": 0.1617, + "step": 5228 + }, + { + "epoch": 0.13232279778323253, + "grad_norm": 7.888550281524658, + "learning_rate": 9.628126626454995e-06, + "loss": 0.2043, + "step": 5229 + }, + { + "epoch": 0.1323481033479262, + "grad_norm": 6.09145450592041, + "learning_rate": 9.627974658852388e-06, + "loss": 0.2267, + "step": 5230 + }, + { + "epoch": 0.1323734089126199, + "grad_norm": 27.089853286743164, + "learning_rate": 9.627822661404835e-06, + "loss": 0.2678, + "step": 5231 + }, + { + "epoch": 0.13239871447731355, + "grad_norm": 23.68894386291504, + "learning_rate": 9.627670634113314e-06, + "loss": 0.2767, + "step": 5232 + }, + { + "epoch": 0.13242402004200723, + "grad_norm": 8.527981758117676, + "learning_rate": 9.627518576978804e-06, + "loss": 0.2397, + "step": 5233 + }, + { + "epoch": 0.13244932560670092, + "grad_norm": 3.3706514835357666, + "learning_rate": 9.62736649000229e-06, + "loss": 0.1059, + "step": 5234 + }, + { + "epoch": 0.1324746311713946, + "grad_norm": 5.054283142089844, + "learning_rate": 9.62721437318475e-06, + "loss": 0.11, + "step": 5235 + }, + { + "epoch": 0.13249993673608826, + "grad_norm": 5.758924961090088, + "learning_rate": 9.627062226527163e-06, + "loss": 0.1972, + "step": 5236 + }, + { + "epoch": 0.13252524230078194, + "grad_norm": 4.994110584259033, + "learning_rate": 9.626910050030514e-06, + "loss": 0.1075, + "step": 5237 + }, + { + "epoch": 0.13255054786547563, + "grad_norm": 4.174153804779053, + "learning_rate": 9.626757843695783e-06, + "loss": 0.1938, + "step": 5238 + }, + { + "epoch": 0.13257585343016928, + "grad_norm": 7.562349796295166, + "learning_rate": 9.626605607523952e-06, + "loss": 0.2036, + "step": 5239 + }, + { + "epoch": 0.13260115899486297, + "grad_norm": 10.135679244995117, + "learning_rate": 9.626453341516e-06, + "loss": 0.1836, + "step": 5240 + }, + { + "epoch": 0.13262646455955665, + "grad_norm": 6.182308673858643, + "learning_rate": 9.626301045672912e-06, + "loss": 0.2177, + "step": 5241 + }, + { + "epoch": 0.13265177012425033, + "grad_norm": 5.50065279006958, + "learning_rate": 9.62614871999567e-06, + "loss": 0.1921, + "step": 5242 + }, + { + "epoch": 0.132677075688944, + "grad_norm": 6.651754856109619, + "learning_rate": 9.625996364485253e-06, + "loss": 0.2127, + "step": 5243 + }, + { + "epoch": 0.13270238125363767, + "grad_norm": 5.211285591125488, + "learning_rate": 9.625843979142648e-06, + "loss": 0.1468, + "step": 5244 + }, + { + "epoch": 0.13272768681833136, + "grad_norm": 5.703742980957031, + "learning_rate": 9.625691563968835e-06, + "loss": 0.2179, + "step": 5245 + }, + { + "epoch": 0.13275299238302501, + "grad_norm": 8.406461715698242, + "learning_rate": 9.625539118964798e-06, + "loss": 0.2504, + "step": 5246 + }, + { + "epoch": 0.1327782979477187, + "grad_norm": 20.60727882385254, + "learning_rate": 9.62538664413152e-06, + "loss": 0.338, + "step": 5247 + }, + { + "epoch": 0.13280360351241238, + "grad_norm": 8.637629508972168, + "learning_rate": 9.625234139469984e-06, + "loss": 0.1907, + "step": 5248 + }, + { + "epoch": 0.13282890907710607, + "grad_norm": 5.0497002601623535, + "learning_rate": 9.625081604981172e-06, + "loss": 0.2085, + "step": 5249 + }, + { + "epoch": 0.13285421464179972, + "grad_norm": 4.17037296295166, + "learning_rate": 9.62492904066607e-06, + "loss": 0.1501, + "step": 5250 + }, + { + "epoch": 0.1328795202064934, + "grad_norm": 4.03876256942749, + "learning_rate": 9.62477644652566e-06, + "loss": 0.1991, + "step": 5251 + }, + { + "epoch": 0.1329048257711871, + "grad_norm": 18.784801483154297, + "learning_rate": 9.624623822560928e-06, + "loss": 0.6056, + "step": 5252 + }, + { + "epoch": 0.13293013133588075, + "grad_norm": 12.963665008544922, + "learning_rate": 9.624471168772858e-06, + "loss": 0.2838, + "step": 5253 + }, + { + "epoch": 0.13295543690057443, + "grad_norm": 9.032705307006836, + "learning_rate": 9.624318485162432e-06, + "loss": 0.2221, + "step": 5254 + }, + { + "epoch": 0.13298074246526811, + "grad_norm": 6.253818988800049, + "learning_rate": 9.624165771730635e-06, + "loss": 0.2198, + "step": 5255 + }, + { + "epoch": 0.1330060480299618, + "grad_norm": 6.5723700523376465, + "learning_rate": 9.624013028478456e-06, + "loss": 0.1178, + "step": 5256 + }, + { + "epoch": 0.13303135359465545, + "grad_norm": 17.908832550048828, + "learning_rate": 9.623860255406876e-06, + "loss": 0.5437, + "step": 5257 + }, + { + "epoch": 0.13305665915934914, + "grad_norm": 9.880858421325684, + "learning_rate": 9.623707452516881e-06, + "loss": 0.2562, + "step": 5258 + }, + { + "epoch": 0.13308196472404282, + "grad_norm": 3.171027183532715, + "learning_rate": 9.623554619809457e-06, + "loss": 0.1466, + "step": 5259 + }, + { + "epoch": 0.1331072702887365, + "grad_norm": 6.31566858291626, + "learning_rate": 9.62340175728559e-06, + "loss": 0.2728, + "step": 5260 + }, + { + "epoch": 0.13313257585343016, + "grad_norm": 5.488141059875488, + "learning_rate": 9.623248864946264e-06, + "loss": 0.1791, + "step": 5261 + }, + { + "epoch": 0.13315788141812385, + "grad_norm": 3.8097164630889893, + "learning_rate": 9.623095942792467e-06, + "loss": 0.1669, + "step": 5262 + }, + { + "epoch": 0.13318318698281753, + "grad_norm": 5.947007656097412, + "learning_rate": 9.622942990825183e-06, + "loss": 0.2115, + "step": 5263 + }, + { + "epoch": 0.1332084925475112, + "grad_norm": 4.62166166305542, + "learning_rate": 9.6227900090454e-06, + "loss": 0.1936, + "step": 5264 + }, + { + "epoch": 0.13323379811220487, + "grad_norm": 4.623903274536133, + "learning_rate": 9.622636997454105e-06, + "loss": 0.1728, + "step": 5265 + }, + { + "epoch": 0.13325910367689855, + "grad_norm": 6.70378303527832, + "learning_rate": 9.622483956052284e-06, + "loss": 0.1486, + "step": 5266 + }, + { + "epoch": 0.13328440924159224, + "grad_norm": 5.572836399078369, + "learning_rate": 9.622330884840923e-06, + "loss": 0.2323, + "step": 5267 + }, + { + "epoch": 0.1333097148062859, + "grad_norm": 14.832475662231445, + "learning_rate": 9.622177783821009e-06, + "loss": 0.1831, + "step": 5268 + }, + { + "epoch": 0.13333502037097958, + "grad_norm": 9.185921669006348, + "learning_rate": 9.622024652993531e-06, + "loss": 0.2093, + "step": 5269 + }, + { + "epoch": 0.13336032593567326, + "grad_norm": 8.430559158325195, + "learning_rate": 9.621871492359477e-06, + "loss": 0.2031, + "step": 5270 + }, + { + "epoch": 0.13338563150036692, + "grad_norm": 7.357163429260254, + "learning_rate": 9.621718301919833e-06, + "loss": 0.2122, + "step": 5271 + }, + { + "epoch": 0.1334109370650606, + "grad_norm": 9.617640495300293, + "learning_rate": 9.621565081675586e-06, + "loss": 0.2218, + "step": 5272 + }, + { + "epoch": 0.1334362426297543, + "grad_norm": 7.434346675872803, + "learning_rate": 9.621411831627728e-06, + "loss": 0.2565, + "step": 5273 + }, + { + "epoch": 0.13346154819444797, + "grad_norm": 5.19126033782959, + "learning_rate": 9.621258551777242e-06, + "loss": 0.1617, + "step": 5274 + }, + { + "epoch": 0.13348685375914163, + "grad_norm": 12.726130485534668, + "learning_rate": 9.621105242125122e-06, + "loss": 0.1926, + "step": 5275 + }, + { + "epoch": 0.1335121593238353, + "grad_norm": 8.434976577758789, + "learning_rate": 9.62095190267235e-06, + "loss": 0.2843, + "step": 5276 + }, + { + "epoch": 0.133537464888529, + "grad_norm": 25.404634475708008, + "learning_rate": 9.620798533419922e-06, + "loss": 0.358, + "step": 5277 + }, + { + "epoch": 0.13356277045322265, + "grad_norm": 10.981632232666016, + "learning_rate": 9.620645134368823e-06, + "loss": 0.2616, + "step": 5278 + }, + { + "epoch": 0.13358807601791634, + "grad_norm": 5.457858562469482, + "learning_rate": 9.620491705520043e-06, + "loss": 0.1985, + "step": 5279 + }, + { + "epoch": 0.13361338158261002, + "grad_norm": 8.426986694335938, + "learning_rate": 9.620338246874572e-06, + "loss": 0.1976, + "step": 5280 + }, + { + "epoch": 0.1336386871473037, + "grad_norm": 8.49406623840332, + "learning_rate": 9.620184758433397e-06, + "loss": 0.1632, + "step": 5281 + }, + { + "epoch": 0.13366399271199736, + "grad_norm": 3.2781946659088135, + "learning_rate": 9.620031240197511e-06, + "loss": 0.1489, + "step": 5282 + }, + { + "epoch": 0.13368929827669104, + "grad_norm": 8.526286125183105, + "learning_rate": 9.619877692167906e-06, + "loss": 0.2831, + "step": 5283 + }, + { + "epoch": 0.13371460384138473, + "grad_norm": 5.053609371185303, + "learning_rate": 9.619724114345564e-06, + "loss": 0.1768, + "step": 5284 + }, + { + "epoch": 0.13373990940607838, + "grad_norm": 9.060948371887207, + "learning_rate": 9.619570506731484e-06, + "loss": 0.239, + "step": 5285 + }, + { + "epoch": 0.13376521497077207, + "grad_norm": 7.405439376831055, + "learning_rate": 9.619416869326651e-06, + "loss": 0.2891, + "step": 5286 + }, + { + "epoch": 0.13379052053546575, + "grad_norm": 9.050040245056152, + "learning_rate": 9.619263202132058e-06, + "loss": 0.3193, + "step": 5287 + }, + { + "epoch": 0.13381582610015944, + "grad_norm": 3.6322672367095947, + "learning_rate": 9.619109505148698e-06, + "loss": 0.2033, + "step": 5288 + }, + { + "epoch": 0.1338411316648531, + "grad_norm": 6.5681538581848145, + "learning_rate": 9.618955778377558e-06, + "loss": 0.2367, + "step": 5289 + }, + { + "epoch": 0.13386643722954678, + "grad_norm": 10.3670654296875, + "learning_rate": 9.618802021819634e-06, + "loss": 0.1737, + "step": 5290 + }, + { + "epoch": 0.13389174279424046, + "grad_norm": 14.01963996887207, + "learning_rate": 9.61864823547591e-06, + "loss": 0.3027, + "step": 5291 + }, + { + "epoch": 0.13391704835893414, + "grad_norm": 4.7342095375061035, + "learning_rate": 9.618494419347389e-06, + "loss": 0.2435, + "step": 5292 + }, + { + "epoch": 0.1339423539236278, + "grad_norm": 8.331653594970703, + "learning_rate": 9.618340573435053e-06, + "loss": 0.2023, + "step": 5293 + }, + { + "epoch": 0.13396765948832148, + "grad_norm": 7.942968845367432, + "learning_rate": 9.618186697739898e-06, + "loss": 0.2268, + "step": 5294 + }, + { + "epoch": 0.13399296505301517, + "grad_norm": 10.56554889678955, + "learning_rate": 9.618032792262915e-06, + "loss": 0.2773, + "step": 5295 + }, + { + "epoch": 0.13401827061770882, + "grad_norm": 4.597839832305908, + "learning_rate": 9.617878857005098e-06, + "loss": 0.1518, + "step": 5296 + }, + { + "epoch": 0.1340435761824025, + "grad_norm": 4.716428279876709, + "learning_rate": 9.617724891967442e-06, + "loss": 0.2389, + "step": 5297 + }, + { + "epoch": 0.1340688817470962, + "grad_norm": 15.099228858947754, + "learning_rate": 9.617570897150933e-06, + "loss": 0.3208, + "step": 5298 + }, + { + "epoch": 0.13409418731178988, + "grad_norm": 5.6706767082214355, + "learning_rate": 9.617416872556572e-06, + "loss": 0.1497, + "step": 5299 + }, + { + "epoch": 0.13411949287648353, + "grad_norm": 6.115284442901611, + "learning_rate": 9.617262818185346e-06, + "loss": 0.1806, + "step": 5300 + }, + { + "epoch": 0.13414479844117722, + "grad_norm": 6.731723308563232, + "learning_rate": 9.617108734038252e-06, + "loss": 0.1963, + "step": 5301 + }, + { + "epoch": 0.1341701040058709, + "grad_norm": 13.810909271240234, + "learning_rate": 9.61695462011628e-06, + "loss": 0.2777, + "step": 5302 + }, + { + "epoch": 0.13419540957056456, + "grad_norm": 3.889631509780884, + "learning_rate": 9.616800476420431e-06, + "loss": 0.1116, + "step": 5303 + }, + { + "epoch": 0.13422071513525824, + "grad_norm": 7.299005031585693, + "learning_rate": 9.616646302951692e-06, + "loss": 0.2222, + "step": 5304 + }, + { + "epoch": 0.13424602069995192, + "grad_norm": 8.650304794311523, + "learning_rate": 9.61649209971106e-06, + "loss": 0.2337, + "step": 5305 + }, + { + "epoch": 0.1342713262646456, + "grad_norm": 4.382131099700928, + "learning_rate": 9.61633786669953e-06, + "loss": 0.1322, + "step": 5306 + }, + { + "epoch": 0.13429663182933926, + "grad_norm": 8.421006202697754, + "learning_rate": 9.616183603918094e-06, + "loss": 0.2441, + "step": 5307 + }, + { + "epoch": 0.13432193739403295, + "grad_norm": 4.458134651184082, + "learning_rate": 9.61602931136775e-06, + "loss": 0.1876, + "step": 5308 + }, + { + "epoch": 0.13434724295872663, + "grad_norm": 3.913092851638794, + "learning_rate": 9.615874989049492e-06, + "loss": 0.1907, + "step": 5309 + }, + { + "epoch": 0.1343725485234203, + "grad_norm": 5.925967216491699, + "learning_rate": 9.615720636964312e-06, + "loss": 0.137, + "step": 5310 + }, + { + "epoch": 0.13439785408811397, + "grad_norm": 4.526166915893555, + "learning_rate": 9.615566255113212e-06, + "loss": 0.1728, + "step": 5311 + }, + { + "epoch": 0.13442315965280766, + "grad_norm": 5.289488792419434, + "learning_rate": 9.615411843497182e-06, + "loss": 0.1102, + "step": 5312 + }, + { + "epoch": 0.13444846521750134, + "grad_norm": 4.997550010681152, + "learning_rate": 9.61525740211722e-06, + "loss": 0.2198, + "step": 5313 + }, + { + "epoch": 0.134473770782195, + "grad_norm": 3.580177068710327, + "learning_rate": 9.61510293097432e-06, + "loss": 0.1438, + "step": 5314 + }, + { + "epoch": 0.13449907634688868, + "grad_norm": 7.3111186027526855, + "learning_rate": 9.614948430069481e-06, + "loss": 0.282, + "step": 5315 + }, + { + "epoch": 0.13452438191158236, + "grad_norm": 10.275283813476562, + "learning_rate": 9.614793899403697e-06, + "loss": 0.2936, + "step": 5316 + }, + { + "epoch": 0.13454968747627602, + "grad_norm": 6.228529930114746, + "learning_rate": 9.614639338977967e-06, + "loss": 0.145, + "step": 5317 + }, + { + "epoch": 0.1345749930409697, + "grad_norm": 8.582983016967773, + "learning_rate": 9.614484748793287e-06, + "loss": 0.2866, + "step": 5318 + }, + { + "epoch": 0.1346002986056634, + "grad_norm": 8.951250076293945, + "learning_rate": 9.614330128850651e-06, + "loss": 0.2415, + "step": 5319 + }, + { + "epoch": 0.13462560417035707, + "grad_norm": 6.385840892791748, + "learning_rate": 9.61417547915106e-06, + "loss": 0.152, + "step": 5320 + }, + { + "epoch": 0.13465090973505073, + "grad_norm": 9.160054206848145, + "learning_rate": 9.614020799695508e-06, + "loss": 0.1581, + "step": 5321 + }, + { + "epoch": 0.1346762152997444, + "grad_norm": 8.059442520141602, + "learning_rate": 9.613866090484996e-06, + "loss": 0.2229, + "step": 5322 + }, + { + "epoch": 0.1347015208644381, + "grad_norm": 5.942714691162109, + "learning_rate": 9.613711351520519e-06, + "loss": 0.206, + "step": 5323 + }, + { + "epoch": 0.13472682642913178, + "grad_norm": 7.70973014831543, + "learning_rate": 9.613556582803074e-06, + "loss": 0.2232, + "step": 5324 + }, + { + "epoch": 0.13475213199382544, + "grad_norm": 6.861318588256836, + "learning_rate": 9.613401784333662e-06, + "loss": 0.2881, + "step": 5325 + }, + { + "epoch": 0.13477743755851912, + "grad_norm": 7.7621541023254395, + "learning_rate": 9.613246956113282e-06, + "loss": 0.3334, + "step": 5326 + }, + { + "epoch": 0.1348027431232128, + "grad_norm": 4.230119228363037, + "learning_rate": 9.613092098142929e-06, + "loss": 0.2356, + "step": 5327 + }, + { + "epoch": 0.13482804868790646, + "grad_norm": 10.503564834594727, + "learning_rate": 9.612937210423601e-06, + "loss": 0.2789, + "step": 5328 + }, + { + "epoch": 0.13485335425260014, + "grad_norm": 4.853087425231934, + "learning_rate": 9.612782292956302e-06, + "loss": 0.1959, + "step": 5329 + }, + { + "epoch": 0.13487865981729383, + "grad_norm": 3.33382511138916, + "learning_rate": 9.612627345742028e-06, + "loss": 0.1315, + "step": 5330 + }, + { + "epoch": 0.1349039653819875, + "grad_norm": 5.156519412994385, + "learning_rate": 9.612472368781777e-06, + "loss": 0.2329, + "step": 5331 + }, + { + "epoch": 0.13492927094668117, + "grad_norm": 7.840662956237793, + "learning_rate": 9.61231736207655e-06, + "loss": 0.1926, + "step": 5332 + }, + { + "epoch": 0.13495457651137485, + "grad_norm": 7.121429443359375, + "learning_rate": 9.612162325627346e-06, + "loss": 0.2001, + "step": 5333 + }, + { + "epoch": 0.13497988207606854, + "grad_norm": 5.732529163360596, + "learning_rate": 9.612007259435165e-06, + "loss": 0.2121, + "step": 5334 + }, + { + "epoch": 0.1350051876407622, + "grad_norm": 12.353272438049316, + "learning_rate": 9.611852163501007e-06, + "loss": 0.1988, + "step": 5335 + }, + { + "epoch": 0.13503049320545588, + "grad_norm": 4.34060525894165, + "learning_rate": 9.611697037825874e-06, + "loss": 0.0964, + "step": 5336 + }, + { + "epoch": 0.13505579877014956, + "grad_norm": 5.261797904968262, + "learning_rate": 9.611541882410764e-06, + "loss": 0.2114, + "step": 5337 + }, + { + "epoch": 0.13508110433484324, + "grad_norm": 4.915639400482178, + "learning_rate": 9.611386697256678e-06, + "loss": 0.1611, + "step": 5338 + }, + { + "epoch": 0.1351064098995369, + "grad_norm": 6.693774223327637, + "learning_rate": 9.611231482364616e-06, + "loss": 0.2046, + "step": 5339 + }, + { + "epoch": 0.13513171546423058, + "grad_norm": 14.215187072753906, + "learning_rate": 9.61107623773558e-06, + "loss": 0.3712, + "step": 5340 + }, + { + "epoch": 0.13515702102892427, + "grad_norm": 6.597877025604248, + "learning_rate": 9.61092096337057e-06, + "loss": 0.2095, + "step": 5341 + }, + { + "epoch": 0.13518232659361792, + "grad_norm": 8.68776798248291, + "learning_rate": 9.610765659270592e-06, + "loss": 0.2775, + "step": 5342 + }, + { + "epoch": 0.1352076321583116, + "grad_norm": 5.588975429534912, + "learning_rate": 9.610610325436641e-06, + "loss": 0.1324, + "step": 5343 + }, + { + "epoch": 0.1352329377230053, + "grad_norm": 4.619362831115723, + "learning_rate": 9.610454961869723e-06, + "loss": 0.2275, + "step": 5344 + }, + { + "epoch": 0.13525824328769898, + "grad_norm": 8.665692329406738, + "learning_rate": 9.610299568570838e-06, + "loss": 0.3059, + "step": 5345 + }, + { + "epoch": 0.13528354885239263, + "grad_norm": 3.9445714950561523, + "learning_rate": 9.610144145540989e-06, + "loss": 0.238, + "step": 5346 + }, + { + "epoch": 0.13530885441708632, + "grad_norm": 4.749362468719482, + "learning_rate": 9.609988692781178e-06, + "loss": 0.1698, + "step": 5347 + }, + { + "epoch": 0.13533415998178, + "grad_norm": 6.769443511962891, + "learning_rate": 9.609833210292407e-06, + "loss": 0.2353, + "step": 5348 + }, + { + "epoch": 0.13535946554647366, + "grad_norm": 6.478213787078857, + "learning_rate": 9.609677698075679e-06, + "loss": 0.2571, + "step": 5349 + }, + { + "epoch": 0.13538477111116734, + "grad_norm": 11.70229721069336, + "learning_rate": 9.609522156131998e-06, + "loss": 0.1401, + "step": 5350 + }, + { + "epoch": 0.13541007667586102, + "grad_norm": 8.426337242126465, + "learning_rate": 9.609366584462365e-06, + "loss": 0.1881, + "step": 5351 + }, + { + "epoch": 0.1354353822405547, + "grad_norm": 4.65758752822876, + "learning_rate": 9.609210983067785e-06, + "loss": 0.2066, + "step": 5352 + }, + { + "epoch": 0.13546068780524836, + "grad_norm": 5.033788204193115, + "learning_rate": 9.609055351949258e-06, + "loss": 0.1058, + "step": 5353 + }, + { + "epoch": 0.13548599336994205, + "grad_norm": 16.294193267822266, + "learning_rate": 9.608899691107795e-06, + "loss": 0.2473, + "step": 5354 + }, + { + "epoch": 0.13551129893463573, + "grad_norm": 3.9824130535125732, + "learning_rate": 9.608744000544392e-06, + "loss": 0.1707, + "step": 5355 + }, + { + "epoch": 0.13553660449932942, + "grad_norm": 10.879539489746094, + "learning_rate": 9.608588280260058e-06, + "loss": 0.249, + "step": 5356 + }, + { + "epoch": 0.13556191006402307, + "grad_norm": 13.687895774841309, + "learning_rate": 9.608432530255795e-06, + "loss": 0.2341, + "step": 5357 + }, + { + "epoch": 0.13558721562871676, + "grad_norm": 4.80888557434082, + "learning_rate": 9.608276750532608e-06, + "loss": 0.1945, + "step": 5358 + }, + { + "epoch": 0.13561252119341044, + "grad_norm": 28.64473533630371, + "learning_rate": 9.6081209410915e-06, + "loss": 0.3028, + "step": 5359 + }, + { + "epoch": 0.1356378267581041, + "grad_norm": 3.8161072731018066, + "learning_rate": 9.607965101933479e-06, + "loss": 0.1603, + "step": 5360 + }, + { + "epoch": 0.13566313232279778, + "grad_norm": 5.183565139770508, + "learning_rate": 9.607809233059546e-06, + "loss": 0.1044, + "step": 5361 + }, + { + "epoch": 0.13568843788749146, + "grad_norm": 9.597620010375977, + "learning_rate": 9.60765333447071e-06, + "loss": 0.2599, + "step": 5362 + }, + { + "epoch": 0.13571374345218515, + "grad_norm": 4.919435024261475, + "learning_rate": 9.607497406167974e-06, + "loss": 0.183, + "step": 5363 + }, + { + "epoch": 0.1357390490168788, + "grad_norm": 5.473318576812744, + "learning_rate": 9.607341448152345e-06, + "loss": 0.217, + "step": 5364 + }, + { + "epoch": 0.1357643545815725, + "grad_norm": 10.684821128845215, + "learning_rate": 9.607185460424829e-06, + "loss": 0.2566, + "step": 5365 + }, + { + "epoch": 0.13578966014626617, + "grad_norm": 11.898456573486328, + "learning_rate": 9.60702944298643e-06, + "loss": 0.2781, + "step": 5366 + }, + { + "epoch": 0.13581496571095983, + "grad_norm": 11.105653762817383, + "learning_rate": 9.606873395838155e-06, + "loss": 0.2125, + "step": 5367 + }, + { + "epoch": 0.1358402712756535, + "grad_norm": 5.090908050537109, + "learning_rate": 9.60671731898101e-06, + "loss": 0.1567, + "step": 5368 + }, + { + "epoch": 0.1358655768403472, + "grad_norm": 16.749553680419922, + "learning_rate": 9.606561212416002e-06, + "loss": 0.2588, + "step": 5369 + }, + { + "epoch": 0.13589088240504088, + "grad_norm": 4.576550006866455, + "learning_rate": 9.606405076144139e-06, + "loss": 0.1887, + "step": 5370 + }, + { + "epoch": 0.13591618796973454, + "grad_norm": 3.101900815963745, + "learning_rate": 9.606248910166425e-06, + "loss": 0.1663, + "step": 5371 + }, + { + "epoch": 0.13594149353442822, + "grad_norm": 10.304763793945312, + "learning_rate": 9.606092714483867e-06, + "loss": 0.278, + "step": 5372 + }, + { + "epoch": 0.1359667990991219, + "grad_norm": 5.9509992599487305, + "learning_rate": 9.605936489097477e-06, + "loss": 0.2639, + "step": 5373 + }, + { + "epoch": 0.13599210466381556, + "grad_norm": 4.60014533996582, + "learning_rate": 9.605780234008258e-06, + "loss": 0.2433, + "step": 5374 + }, + { + "epoch": 0.13601741022850924, + "grad_norm": 16.72956085205078, + "learning_rate": 9.60562394921722e-06, + "loss": 0.2171, + "step": 5375 + }, + { + "epoch": 0.13604271579320293, + "grad_norm": 8.261191368103027, + "learning_rate": 9.60546763472537e-06, + "loss": 0.2118, + "step": 5376 + }, + { + "epoch": 0.1360680213578966, + "grad_norm": 5.048183917999268, + "learning_rate": 9.605311290533715e-06, + "loss": 0.104, + "step": 5377 + }, + { + "epoch": 0.13609332692259027, + "grad_norm": 6.771894931793213, + "learning_rate": 9.605154916643262e-06, + "loss": 0.2462, + "step": 5378 + }, + { + "epoch": 0.13611863248728395, + "grad_norm": 10.763435363769531, + "learning_rate": 9.604998513055025e-06, + "loss": 0.3507, + "step": 5379 + }, + { + "epoch": 0.13614393805197764, + "grad_norm": 4.147438049316406, + "learning_rate": 9.604842079770007e-06, + "loss": 0.1832, + "step": 5380 + }, + { + "epoch": 0.1361692436166713, + "grad_norm": 7.186886787414551, + "learning_rate": 9.60468561678922e-06, + "loss": 0.2607, + "step": 5381 + }, + { + "epoch": 0.13619454918136498, + "grad_norm": 10.287993431091309, + "learning_rate": 9.604529124113673e-06, + "loss": 0.3, + "step": 5382 + }, + { + "epoch": 0.13621985474605866, + "grad_norm": 4.397261142730713, + "learning_rate": 9.604372601744372e-06, + "loss": 0.225, + "step": 5383 + }, + { + "epoch": 0.13624516031075234, + "grad_norm": 17.670135498046875, + "learning_rate": 9.60421604968233e-06, + "loss": 0.2809, + "step": 5384 + }, + { + "epoch": 0.136270465875446, + "grad_norm": 14.062921524047852, + "learning_rate": 9.604059467928555e-06, + "loss": 0.3346, + "step": 5385 + }, + { + "epoch": 0.13629577144013968, + "grad_norm": 3.9921107292175293, + "learning_rate": 9.603902856484056e-06, + "loss": 0.1998, + "step": 5386 + }, + { + "epoch": 0.13632107700483337, + "grad_norm": 10.416763305664062, + "learning_rate": 9.603746215349845e-06, + "loss": 0.2608, + "step": 5387 + }, + { + "epoch": 0.13634638256952705, + "grad_norm": 5.545690059661865, + "learning_rate": 9.60358954452693e-06, + "loss": 0.2175, + "step": 5388 + }, + { + "epoch": 0.1363716881342207, + "grad_norm": 11.673556327819824, + "learning_rate": 9.603432844016324e-06, + "loss": 0.1982, + "step": 5389 + }, + { + "epoch": 0.1363969936989144, + "grad_norm": 3.41338849067688, + "learning_rate": 9.603276113819035e-06, + "loss": 0.1759, + "step": 5390 + }, + { + "epoch": 0.13642229926360808, + "grad_norm": 8.557936668395996, + "learning_rate": 9.603119353936075e-06, + "loss": 0.2966, + "step": 5391 + }, + { + "epoch": 0.13644760482830173, + "grad_norm": 14.427416801452637, + "learning_rate": 9.602962564368453e-06, + "loss": 0.2792, + "step": 5392 + }, + { + "epoch": 0.13647291039299542, + "grad_norm": 2.921792507171631, + "learning_rate": 9.602805745117183e-06, + "loss": 0.1655, + "step": 5393 + }, + { + "epoch": 0.1364982159576891, + "grad_norm": 3.9341325759887695, + "learning_rate": 9.602648896183274e-06, + "loss": 0.2189, + "step": 5394 + }, + { + "epoch": 0.13652352152238278, + "grad_norm": 10.791647911071777, + "learning_rate": 9.60249201756774e-06, + "loss": 0.2544, + "step": 5395 + }, + { + "epoch": 0.13654882708707644, + "grad_norm": 4.404546737670898, + "learning_rate": 9.60233510927159e-06, + "loss": 0.1662, + "step": 5396 + }, + { + "epoch": 0.13657413265177012, + "grad_norm": 12.644877433776855, + "learning_rate": 9.602178171295838e-06, + "loss": 0.3059, + "step": 5397 + }, + { + "epoch": 0.1365994382164638, + "grad_norm": 3.1279115676879883, + "learning_rate": 9.602021203641492e-06, + "loss": 0.1379, + "step": 5398 + }, + { + "epoch": 0.13662474378115746, + "grad_norm": 9.175057411193848, + "learning_rate": 9.60186420630957e-06, + "loss": 0.3073, + "step": 5399 + }, + { + "epoch": 0.13665004934585115, + "grad_norm": 3.9829752445220947, + "learning_rate": 9.60170717930108e-06, + "loss": 0.1665, + "step": 5400 + }, + { + "epoch": 0.13667535491054483, + "grad_norm": 5.036673545837402, + "learning_rate": 9.601550122617038e-06, + "loss": 0.1761, + "step": 5401 + }, + { + "epoch": 0.13670066047523852, + "grad_norm": 5.13842248916626, + "learning_rate": 9.601393036258455e-06, + "loss": 0.1466, + "step": 5402 + }, + { + "epoch": 0.13672596603993217, + "grad_norm": 16.919002532958984, + "learning_rate": 9.601235920226345e-06, + "loss": 0.3082, + "step": 5403 + }, + { + "epoch": 0.13675127160462586, + "grad_norm": 4.495138168334961, + "learning_rate": 9.60107877452172e-06, + "loss": 0.1751, + "step": 5404 + }, + { + "epoch": 0.13677657716931954, + "grad_norm": 11.866866111755371, + "learning_rate": 9.600921599145591e-06, + "loss": 0.3324, + "step": 5405 + }, + { + "epoch": 0.1368018827340132, + "grad_norm": 11.428393363952637, + "learning_rate": 9.600764394098978e-06, + "loss": 0.2832, + "step": 5406 + }, + { + "epoch": 0.13682718829870688, + "grad_norm": 5.397747039794922, + "learning_rate": 9.600607159382891e-06, + "loss": 0.15, + "step": 5407 + }, + { + "epoch": 0.13685249386340056, + "grad_norm": 6.026437282562256, + "learning_rate": 9.600449894998342e-06, + "loss": 0.242, + "step": 5408 + }, + { + "epoch": 0.13687779942809425, + "grad_norm": 8.699058532714844, + "learning_rate": 9.600292600946349e-06, + "loss": 0.2186, + "step": 5409 + }, + { + "epoch": 0.1369031049927879, + "grad_norm": 4.674177646636963, + "learning_rate": 9.600135277227924e-06, + "loss": 0.1844, + "step": 5410 + }, + { + "epoch": 0.1369284105574816, + "grad_norm": 3.989687442779541, + "learning_rate": 9.599977923844082e-06, + "loss": 0.2204, + "step": 5411 + }, + { + "epoch": 0.13695371612217527, + "grad_norm": 5.2790961265563965, + "learning_rate": 9.59982054079584e-06, + "loss": 0.1889, + "step": 5412 + }, + { + "epoch": 0.13697902168686893, + "grad_norm": 8.32160472869873, + "learning_rate": 9.599663128084208e-06, + "loss": 0.19, + "step": 5413 + }, + { + "epoch": 0.1370043272515626, + "grad_norm": 6.093139171600342, + "learning_rate": 9.599505685710207e-06, + "loss": 0.2089, + "step": 5414 + }, + { + "epoch": 0.1370296328162563, + "grad_norm": 4.581203937530518, + "learning_rate": 9.599348213674848e-06, + "loss": 0.171, + "step": 5415 + }, + { + "epoch": 0.13705493838094998, + "grad_norm": 5.845627307891846, + "learning_rate": 9.599190711979147e-06, + "loss": 0.2469, + "step": 5416 + }, + { + "epoch": 0.13708024394564364, + "grad_norm": 7.6371049880981445, + "learning_rate": 9.599033180624122e-06, + "loss": 0.1911, + "step": 5417 + }, + { + "epoch": 0.13710554951033732, + "grad_norm": 4.623056411743164, + "learning_rate": 9.598875619610788e-06, + "loss": 0.264, + "step": 5418 + }, + { + "epoch": 0.137130855075031, + "grad_norm": 5.676667213439941, + "learning_rate": 9.598718028940159e-06, + "loss": 0.2179, + "step": 5419 + }, + { + "epoch": 0.1371561606397247, + "grad_norm": 6.581503868103027, + "learning_rate": 9.598560408613253e-06, + "loss": 0.2837, + "step": 5420 + }, + { + "epoch": 0.13718146620441835, + "grad_norm": 3.737078905105591, + "learning_rate": 9.598402758631088e-06, + "loss": 0.0901, + "step": 5421 + }, + { + "epoch": 0.13720677176911203, + "grad_norm": 8.749200820922852, + "learning_rate": 9.598245078994678e-06, + "loss": 0.2421, + "step": 5422 + }, + { + "epoch": 0.1372320773338057, + "grad_norm": 7.18253755569458, + "learning_rate": 9.59808736970504e-06, + "loss": 0.2326, + "step": 5423 + }, + { + "epoch": 0.13725738289849937, + "grad_norm": 15.970293045043945, + "learning_rate": 9.597929630763192e-06, + "loss": 0.3483, + "step": 5424 + }, + { + "epoch": 0.13728268846319305, + "grad_norm": 5.5319013595581055, + "learning_rate": 9.597771862170153e-06, + "loss": 0.262, + "step": 5425 + }, + { + "epoch": 0.13730799402788674, + "grad_norm": 13.231831550598145, + "learning_rate": 9.597614063926936e-06, + "loss": 0.265, + "step": 5426 + }, + { + "epoch": 0.13733329959258042, + "grad_norm": 7.895750522613525, + "learning_rate": 9.597456236034564e-06, + "loss": 0.2028, + "step": 5427 + }, + { + "epoch": 0.13735860515727408, + "grad_norm": 5.213756561279297, + "learning_rate": 9.597298378494048e-06, + "loss": 0.2242, + "step": 5428 + }, + { + "epoch": 0.13738391072196776, + "grad_norm": 4.024433612823486, + "learning_rate": 9.597140491306414e-06, + "loss": 0.138, + "step": 5429 + }, + { + "epoch": 0.13740921628666145, + "grad_norm": 6.057895660400391, + "learning_rate": 9.596982574472673e-06, + "loss": 0.2219, + "step": 5430 + }, + { + "epoch": 0.1374345218513551, + "grad_norm": 4.801260471343994, + "learning_rate": 9.596824627993849e-06, + "loss": 0.2064, + "step": 5431 + }, + { + "epoch": 0.13745982741604879, + "grad_norm": 5.988558292388916, + "learning_rate": 9.596666651870957e-06, + "loss": 0.2301, + "step": 5432 + }, + { + "epoch": 0.13748513298074247, + "grad_norm": 8.421640396118164, + "learning_rate": 9.596508646105017e-06, + "loss": 0.3049, + "step": 5433 + }, + { + "epoch": 0.13751043854543615, + "grad_norm": 6.0732526779174805, + "learning_rate": 9.596350610697047e-06, + "loss": 0.2332, + "step": 5434 + }, + { + "epoch": 0.1375357441101298, + "grad_norm": 6.5735297203063965, + "learning_rate": 9.596192545648067e-06, + "loss": 0.2628, + "step": 5435 + }, + { + "epoch": 0.1375610496748235, + "grad_norm": 3.1991655826568604, + "learning_rate": 9.596034450959095e-06, + "loss": 0.1223, + "step": 5436 + }, + { + "epoch": 0.13758635523951718, + "grad_norm": 3.8393781185150146, + "learning_rate": 9.595876326631155e-06, + "loss": 0.1834, + "step": 5437 + }, + { + "epoch": 0.13761166080421083, + "grad_norm": 4.116111755371094, + "learning_rate": 9.59571817266526e-06, + "loss": 0.1101, + "step": 5438 + }, + { + "epoch": 0.13763696636890452, + "grad_norm": 5.959966659545898, + "learning_rate": 9.595559989062435e-06, + "loss": 0.2056, + "step": 5439 + }, + { + "epoch": 0.1376622719335982, + "grad_norm": 3.871455430984497, + "learning_rate": 9.595401775823698e-06, + "loss": 0.132, + "step": 5440 + }, + { + "epoch": 0.13768757749829189, + "grad_norm": 5.682863235473633, + "learning_rate": 9.595243532950071e-06, + "loss": 0.2165, + "step": 5441 + }, + { + "epoch": 0.13771288306298554, + "grad_norm": 7.7423834800720215, + "learning_rate": 9.595085260442573e-06, + "loss": 0.1616, + "step": 5442 + }, + { + "epoch": 0.13773818862767923, + "grad_norm": 13.653453826904297, + "learning_rate": 9.594926958302224e-06, + "loss": 0.2409, + "step": 5443 + }, + { + "epoch": 0.1377634941923729, + "grad_norm": 4.024818420410156, + "learning_rate": 9.594768626530046e-06, + "loss": 0.1938, + "step": 5444 + }, + { + "epoch": 0.13778879975706657, + "grad_norm": 5.150744438171387, + "learning_rate": 9.59461026512706e-06, + "loss": 0.2006, + "step": 5445 + }, + { + "epoch": 0.13781410532176025, + "grad_norm": 5.922123908996582, + "learning_rate": 9.594451874094287e-06, + "loss": 0.1829, + "step": 5446 + }, + { + "epoch": 0.13783941088645393, + "grad_norm": 7.806780815124512, + "learning_rate": 9.594293453432749e-06, + "loss": 0.1717, + "step": 5447 + }, + { + "epoch": 0.13786471645114762, + "grad_norm": 13.300353050231934, + "learning_rate": 9.594135003143467e-06, + "loss": 0.2956, + "step": 5448 + }, + { + "epoch": 0.13789002201584127, + "grad_norm": 7.936733245849609, + "learning_rate": 9.593976523227462e-06, + "loss": 0.2081, + "step": 5449 + }, + { + "epoch": 0.13791532758053496, + "grad_norm": 5.803137302398682, + "learning_rate": 9.593818013685759e-06, + "loss": 0.2433, + "step": 5450 + }, + { + "epoch": 0.13794063314522864, + "grad_norm": 5.537047863006592, + "learning_rate": 9.593659474519376e-06, + "loss": 0.153, + "step": 5451 + }, + { + "epoch": 0.13796593870992233, + "grad_norm": 5.0079522132873535, + "learning_rate": 9.593500905729338e-06, + "loss": 0.1812, + "step": 5452 + }, + { + "epoch": 0.13799124427461598, + "grad_norm": 8.408000946044922, + "learning_rate": 9.593342307316667e-06, + "loss": 0.2544, + "step": 5453 + }, + { + "epoch": 0.13801654983930967, + "grad_norm": 4.672859191894531, + "learning_rate": 9.593183679282387e-06, + "loss": 0.1696, + "step": 5454 + }, + { + "epoch": 0.13804185540400335, + "grad_norm": 4.644595146179199, + "learning_rate": 9.593025021627519e-06, + "loss": 0.199, + "step": 5455 + }, + { + "epoch": 0.138067160968697, + "grad_norm": 3.9467170238494873, + "learning_rate": 9.592866334353086e-06, + "loss": 0.125, + "step": 5456 + }, + { + "epoch": 0.1380924665333907, + "grad_norm": 6.307995319366455, + "learning_rate": 9.592707617460115e-06, + "loss": 0.2205, + "step": 5457 + }, + { + "epoch": 0.13811777209808437, + "grad_norm": 22.169967651367188, + "learning_rate": 9.592548870949625e-06, + "loss": 0.3445, + "step": 5458 + }, + { + "epoch": 0.13814307766277806, + "grad_norm": 4.404360294342041, + "learning_rate": 9.59239009482264e-06, + "loss": 0.1193, + "step": 5459 + }, + { + "epoch": 0.1381683832274717, + "grad_norm": 4.8162102699279785, + "learning_rate": 9.59223128908019e-06, + "loss": 0.2186, + "step": 5460 + }, + { + "epoch": 0.1381936887921654, + "grad_norm": 6.172119140625, + "learning_rate": 9.59207245372329e-06, + "loss": 0.2449, + "step": 5461 + }, + { + "epoch": 0.13821899435685908, + "grad_norm": 6.024612903594971, + "learning_rate": 9.591913588752972e-06, + "loss": 0.1929, + "step": 5462 + }, + { + "epoch": 0.13824429992155274, + "grad_norm": 5.104278087615967, + "learning_rate": 9.591754694170257e-06, + "loss": 0.205, + "step": 5463 + }, + { + "epoch": 0.13826960548624642, + "grad_norm": 4.7905731201171875, + "learning_rate": 9.591595769976169e-06, + "loss": 0.2264, + "step": 5464 + }, + { + "epoch": 0.1382949110509401, + "grad_norm": 10.950177192687988, + "learning_rate": 9.591436816171735e-06, + "loss": 0.1539, + "step": 5465 + }, + { + "epoch": 0.1383202166156338, + "grad_norm": 14.246177673339844, + "learning_rate": 9.591277832757978e-06, + "loss": 0.2091, + "step": 5466 + }, + { + "epoch": 0.13834552218032745, + "grad_norm": 4.049752235412598, + "learning_rate": 9.591118819735924e-06, + "loss": 0.1411, + "step": 5467 + }, + { + "epoch": 0.13837082774502113, + "grad_norm": 7.483647346496582, + "learning_rate": 9.590959777106601e-06, + "loss": 0.226, + "step": 5468 + }, + { + "epoch": 0.1383961333097148, + "grad_norm": 8.569842338562012, + "learning_rate": 9.59080070487103e-06, + "loss": 0.2239, + "step": 5469 + }, + { + "epoch": 0.13842143887440847, + "grad_norm": 10.873096466064453, + "learning_rate": 9.590641603030241e-06, + "loss": 0.3141, + "step": 5470 + }, + { + "epoch": 0.13844674443910215, + "grad_norm": 7.89854097366333, + "learning_rate": 9.590482471585258e-06, + "loss": 0.2181, + "step": 5471 + }, + { + "epoch": 0.13847205000379584, + "grad_norm": 2.7363626956939697, + "learning_rate": 9.590323310537107e-06, + "loss": 0.1404, + "step": 5472 + }, + { + "epoch": 0.13849735556848952, + "grad_norm": 3.9629714488983154, + "learning_rate": 9.590164119886814e-06, + "loss": 0.1778, + "step": 5473 + }, + { + "epoch": 0.13852266113318318, + "grad_norm": 12.675795555114746, + "learning_rate": 9.590004899635408e-06, + "loss": 0.3223, + "step": 5474 + }, + { + "epoch": 0.13854796669787686, + "grad_norm": 8.27682113647461, + "learning_rate": 9.589845649783913e-06, + "loss": 0.2503, + "step": 5475 + }, + { + "epoch": 0.13857327226257055, + "grad_norm": 10.1819429397583, + "learning_rate": 9.589686370333358e-06, + "loss": 0.1853, + "step": 5476 + }, + { + "epoch": 0.1385985778272642, + "grad_norm": 6.335493087768555, + "learning_rate": 9.589527061284769e-06, + "loss": 0.2399, + "step": 5477 + }, + { + "epoch": 0.13862388339195789, + "grad_norm": 9.549781799316406, + "learning_rate": 9.589367722639174e-06, + "loss": 0.2787, + "step": 5478 + }, + { + "epoch": 0.13864918895665157, + "grad_norm": 5.078299045562744, + "learning_rate": 9.589208354397599e-06, + "loss": 0.1837, + "step": 5479 + }, + { + "epoch": 0.13867449452134525, + "grad_norm": 6.7074689865112305, + "learning_rate": 9.589048956561074e-06, + "loss": 0.245, + "step": 5480 + }, + { + "epoch": 0.1386998000860389, + "grad_norm": 6.575143814086914, + "learning_rate": 9.588889529130625e-06, + "loss": 0.2018, + "step": 5481 + }, + { + "epoch": 0.1387251056507326, + "grad_norm": 8.391082763671875, + "learning_rate": 9.588730072107282e-06, + "loss": 0.2438, + "step": 5482 + }, + { + "epoch": 0.13875041121542628, + "grad_norm": 7.765320301055908, + "learning_rate": 9.588570585492071e-06, + "loss": 0.1768, + "step": 5483 + }, + { + "epoch": 0.13877571678011996, + "grad_norm": 9.582015991210938, + "learning_rate": 9.588411069286024e-06, + "loss": 0.1688, + "step": 5484 + }, + { + "epoch": 0.13880102234481362, + "grad_norm": 6.788969039916992, + "learning_rate": 9.588251523490167e-06, + "loss": 0.2186, + "step": 5485 + }, + { + "epoch": 0.1388263279095073, + "grad_norm": 5.918017387390137, + "learning_rate": 9.588091948105529e-06, + "loss": 0.1692, + "step": 5486 + }, + { + "epoch": 0.13885163347420099, + "grad_norm": 6.2189741134643555, + "learning_rate": 9.58793234313314e-06, + "loss": 0.2101, + "step": 5487 + }, + { + "epoch": 0.13887693903889464, + "grad_norm": 4.910626411437988, + "learning_rate": 9.587772708574028e-06, + "loss": 0.1798, + "step": 5488 + }, + { + "epoch": 0.13890224460358833, + "grad_norm": 6.280309677124023, + "learning_rate": 9.587613044429223e-06, + "loss": 0.1787, + "step": 5489 + }, + { + "epoch": 0.138927550168282, + "grad_norm": 12.894279479980469, + "learning_rate": 9.587453350699754e-06, + "loss": 0.2739, + "step": 5490 + }, + { + "epoch": 0.1389528557329757, + "grad_norm": 6.408495903015137, + "learning_rate": 9.587293627386653e-06, + "loss": 0.214, + "step": 5491 + }, + { + "epoch": 0.13897816129766935, + "grad_norm": 6.381372928619385, + "learning_rate": 9.587133874490949e-06, + "loss": 0.2298, + "step": 5492 + }, + { + "epoch": 0.13900346686236303, + "grad_norm": 8.467733383178711, + "learning_rate": 9.586974092013672e-06, + "loss": 0.1458, + "step": 5493 + }, + { + "epoch": 0.13902877242705672, + "grad_norm": 9.49759292602539, + "learning_rate": 9.586814279955852e-06, + "loss": 0.145, + "step": 5494 + }, + { + "epoch": 0.13905407799175037, + "grad_norm": 26.225746154785156, + "learning_rate": 9.586654438318518e-06, + "loss": 0.335, + "step": 5495 + }, + { + "epoch": 0.13907938355644406, + "grad_norm": 8.338041305541992, + "learning_rate": 9.586494567102706e-06, + "loss": 0.2997, + "step": 5496 + }, + { + "epoch": 0.13910468912113774, + "grad_norm": 10.722372055053711, + "learning_rate": 9.586334666309442e-06, + "loss": 0.2044, + "step": 5497 + }, + { + "epoch": 0.13912999468583143, + "grad_norm": 5.359621047973633, + "learning_rate": 9.586174735939759e-06, + "loss": 0.1672, + "step": 5498 + }, + { + "epoch": 0.13915530025052508, + "grad_norm": 9.109251022338867, + "learning_rate": 9.586014775994687e-06, + "loss": 0.3016, + "step": 5499 + }, + { + "epoch": 0.13918060581521877, + "grad_norm": 3.1806445121765137, + "learning_rate": 9.58585478647526e-06, + "loss": 0.1769, + "step": 5500 + }, + { + "epoch": 0.13920591137991245, + "grad_norm": 6.112001419067383, + "learning_rate": 9.585694767382508e-06, + "loss": 0.214, + "step": 5501 + }, + { + "epoch": 0.1392312169446061, + "grad_norm": 21.776798248291016, + "learning_rate": 9.585534718717464e-06, + "loss": 0.3128, + "step": 5502 + }, + { + "epoch": 0.1392565225092998, + "grad_norm": 5.497831344604492, + "learning_rate": 9.58537464048116e-06, + "loss": 0.1848, + "step": 5503 + }, + { + "epoch": 0.13928182807399347, + "grad_norm": 11.16723346710205, + "learning_rate": 9.585214532674627e-06, + "loss": 0.1867, + "step": 5504 + }, + { + "epoch": 0.13930713363868716, + "grad_norm": 9.89917278289795, + "learning_rate": 9.585054395298899e-06, + "loss": 0.2093, + "step": 5505 + }, + { + "epoch": 0.13933243920338081, + "grad_norm": 6.074802875518799, + "learning_rate": 9.584894228355008e-06, + "loss": 0.2752, + "step": 5506 + }, + { + "epoch": 0.1393577447680745, + "grad_norm": 30.979583740234375, + "learning_rate": 9.584734031843988e-06, + "loss": 0.2961, + "step": 5507 + }, + { + "epoch": 0.13938305033276818, + "grad_norm": 26.002155303955078, + "learning_rate": 9.584573805766868e-06, + "loss": 0.2656, + "step": 5508 + }, + { + "epoch": 0.13940835589746184, + "grad_norm": 7.944418430328369, + "learning_rate": 9.584413550124687e-06, + "loss": 0.1992, + "step": 5509 + }, + { + "epoch": 0.13943366146215552, + "grad_norm": 4.418741703033447, + "learning_rate": 9.584253264918476e-06, + "loss": 0.1614, + "step": 5510 + }, + { + "epoch": 0.1394589670268492, + "grad_norm": 7.275608062744141, + "learning_rate": 9.584092950149266e-06, + "loss": 0.2045, + "step": 5511 + }, + { + "epoch": 0.1394842725915429, + "grad_norm": 12.651259422302246, + "learning_rate": 9.583932605818095e-06, + "loss": 0.2358, + "step": 5512 + }, + { + "epoch": 0.13950957815623655, + "grad_norm": 4.527602195739746, + "learning_rate": 9.583772231925994e-06, + "loss": 0.2073, + "step": 5513 + }, + { + "epoch": 0.13953488372093023, + "grad_norm": 5.729404449462891, + "learning_rate": 9.583611828474e-06, + "loss": 0.2083, + "step": 5514 + }, + { + "epoch": 0.13956018928562391, + "grad_norm": 4.99163818359375, + "learning_rate": 9.583451395463146e-06, + "loss": 0.1341, + "step": 5515 + }, + { + "epoch": 0.1395854948503176, + "grad_norm": 3.990572929382324, + "learning_rate": 9.583290932894466e-06, + "loss": 0.0839, + "step": 5516 + }, + { + "epoch": 0.13961080041501125, + "grad_norm": 4.787992477416992, + "learning_rate": 9.583130440768995e-06, + "loss": 0.1638, + "step": 5517 + }, + { + "epoch": 0.13963610597970494, + "grad_norm": 11.666082382202148, + "learning_rate": 9.582969919087768e-06, + "loss": 0.2229, + "step": 5518 + }, + { + "epoch": 0.13966141154439862, + "grad_norm": 5.21742582321167, + "learning_rate": 9.582809367851822e-06, + "loss": 0.2519, + "step": 5519 + }, + { + "epoch": 0.13968671710909228, + "grad_norm": 7.476504802703857, + "learning_rate": 9.58264878706219e-06, + "loss": 0.2514, + "step": 5520 + }, + { + "epoch": 0.13971202267378596, + "grad_norm": 3.720337152481079, + "learning_rate": 9.582488176719908e-06, + "loss": 0.1526, + "step": 5521 + }, + { + "epoch": 0.13973732823847965, + "grad_norm": 5.726036548614502, + "learning_rate": 9.582327536826013e-06, + "loss": 0.221, + "step": 5522 + }, + { + "epoch": 0.13976263380317333, + "grad_norm": 9.389707565307617, + "learning_rate": 9.58216686738154e-06, + "loss": 0.2454, + "step": 5523 + }, + { + "epoch": 0.139787939367867, + "grad_norm": 10.62515640258789, + "learning_rate": 9.582006168387523e-06, + "loss": 0.209, + "step": 5524 + }, + { + "epoch": 0.13981324493256067, + "grad_norm": 4.163458347320557, + "learning_rate": 9.581845439845004e-06, + "loss": 0.2018, + "step": 5525 + }, + { + "epoch": 0.13983855049725435, + "grad_norm": 3.7283244132995605, + "learning_rate": 9.581684681755013e-06, + "loss": 0.1585, + "step": 5526 + }, + { + "epoch": 0.139863856061948, + "grad_norm": 6.5860490798950195, + "learning_rate": 9.581523894118592e-06, + "loss": 0.2112, + "step": 5527 + }, + { + "epoch": 0.1398891616266417, + "grad_norm": 3.922891855239868, + "learning_rate": 9.581363076936775e-06, + "loss": 0.2256, + "step": 5528 + }, + { + "epoch": 0.13991446719133538, + "grad_norm": 5.727550029754639, + "learning_rate": 9.5812022302106e-06, + "loss": 0.2263, + "step": 5529 + }, + { + "epoch": 0.13993977275602906, + "grad_norm": 11.312771797180176, + "learning_rate": 9.581041353941103e-06, + "loss": 0.178, + "step": 5530 + }, + { + "epoch": 0.13996507832072272, + "grad_norm": 6.761474609375, + "learning_rate": 9.580880448129324e-06, + "loss": 0.1779, + "step": 5531 + }, + { + "epoch": 0.1399903838854164, + "grad_norm": 11.387090682983398, + "learning_rate": 9.580719512776296e-06, + "loss": 0.2985, + "step": 5532 + }, + { + "epoch": 0.1400156894501101, + "grad_norm": 4.495622634887695, + "learning_rate": 9.580558547883063e-06, + "loss": 0.1649, + "step": 5533 + }, + { + "epoch": 0.14004099501480374, + "grad_norm": 5.981683254241943, + "learning_rate": 9.58039755345066e-06, + "loss": 0.1742, + "step": 5534 + }, + { + "epoch": 0.14006630057949743, + "grad_norm": 7.37001371383667, + "learning_rate": 9.580236529480123e-06, + "loss": 0.098, + "step": 5535 + }, + { + "epoch": 0.1400916061441911, + "grad_norm": 7.1402082443237305, + "learning_rate": 9.580075475972494e-06, + "loss": 0.208, + "step": 5536 + }, + { + "epoch": 0.1401169117088848, + "grad_norm": 4.917815685272217, + "learning_rate": 9.579914392928809e-06, + "loss": 0.1793, + "step": 5537 + }, + { + "epoch": 0.14014221727357845, + "grad_norm": 4.325852870941162, + "learning_rate": 9.57975328035011e-06, + "loss": 0.2317, + "step": 5538 + }, + { + "epoch": 0.14016752283827213, + "grad_norm": 10.402549743652344, + "learning_rate": 9.579592138237432e-06, + "loss": 0.1056, + "step": 5539 + }, + { + "epoch": 0.14019282840296582, + "grad_norm": 5.84243106842041, + "learning_rate": 9.579430966591818e-06, + "loss": 0.2334, + "step": 5540 + }, + { + "epoch": 0.14021813396765948, + "grad_norm": 5.409329414367676, + "learning_rate": 9.579269765414304e-06, + "loss": 0.1336, + "step": 5541 + }, + { + "epoch": 0.14024343953235316, + "grad_norm": 16.670591354370117, + "learning_rate": 9.579108534705933e-06, + "loss": 0.2978, + "step": 5542 + }, + { + "epoch": 0.14026874509704684, + "grad_norm": 3.7817604541778564, + "learning_rate": 9.578947274467741e-06, + "loss": 0.0773, + "step": 5543 + }, + { + "epoch": 0.14029405066174053, + "grad_norm": 3.9679410457611084, + "learning_rate": 9.57878598470077e-06, + "loss": 0.1904, + "step": 5544 + }, + { + "epoch": 0.14031935622643418, + "grad_norm": 4.485296249389648, + "learning_rate": 9.57862466540606e-06, + "loss": 0.2177, + "step": 5545 + }, + { + "epoch": 0.14034466179112787, + "grad_norm": 6.759088039398193, + "learning_rate": 9.578463316584651e-06, + "loss": 0.2208, + "step": 5546 + }, + { + "epoch": 0.14036996735582155, + "grad_norm": 4.387478828430176, + "learning_rate": 9.578301938237585e-06, + "loss": 0.082, + "step": 5547 + }, + { + "epoch": 0.14039527292051523, + "grad_norm": 4.354421615600586, + "learning_rate": 9.578140530365902e-06, + "loss": 0.1667, + "step": 5548 + }, + { + "epoch": 0.1404205784852089, + "grad_norm": 10.156109809875488, + "learning_rate": 9.57797909297064e-06, + "loss": 0.2902, + "step": 5549 + }, + { + "epoch": 0.14044588404990258, + "grad_norm": 6.7114176750183105, + "learning_rate": 9.577817626052844e-06, + "loss": 0.2054, + "step": 5550 + }, + { + "epoch": 0.14047118961459626, + "grad_norm": 10.331454277038574, + "learning_rate": 9.577656129613554e-06, + "loss": 0.2588, + "step": 5551 + }, + { + "epoch": 0.14049649517928992, + "grad_norm": 7.035944938659668, + "learning_rate": 9.577494603653808e-06, + "loss": 0.1955, + "step": 5552 + }, + { + "epoch": 0.1405218007439836, + "grad_norm": 9.679948806762695, + "learning_rate": 9.577333048174654e-06, + "loss": 0.2356, + "step": 5553 + }, + { + "epoch": 0.14054710630867728, + "grad_norm": 3.9091289043426514, + "learning_rate": 9.577171463177129e-06, + "loss": 0.1911, + "step": 5554 + }, + { + "epoch": 0.14057241187337097, + "grad_norm": 6.96608304977417, + "learning_rate": 9.577009848662277e-06, + "loss": 0.2527, + "step": 5555 + }, + { + "epoch": 0.14059771743806462, + "grad_norm": 7.885451793670654, + "learning_rate": 9.57684820463114e-06, + "loss": 0.1606, + "step": 5556 + }, + { + "epoch": 0.1406230230027583, + "grad_norm": 5.083280086517334, + "learning_rate": 9.576686531084761e-06, + "loss": 0.2566, + "step": 5557 + }, + { + "epoch": 0.140648328567452, + "grad_norm": 8.59396743774414, + "learning_rate": 9.576524828024182e-06, + "loss": 0.2569, + "step": 5558 + }, + { + "epoch": 0.14067363413214565, + "grad_norm": 5.365314483642578, + "learning_rate": 9.576363095450444e-06, + "loss": 0.2322, + "step": 5559 + }, + { + "epoch": 0.14069893969683933, + "grad_norm": 3.8159611225128174, + "learning_rate": 9.576201333364593e-06, + "loss": 0.1847, + "step": 5560 + }, + { + "epoch": 0.14072424526153302, + "grad_norm": 5.807662487030029, + "learning_rate": 9.576039541767672e-06, + "loss": 0.1698, + "step": 5561 + }, + { + "epoch": 0.1407495508262267, + "grad_norm": 12.527799606323242, + "learning_rate": 9.57587772066072e-06, + "loss": 0.1368, + "step": 5562 + }, + { + "epoch": 0.14077485639092036, + "grad_norm": 22.317270278930664, + "learning_rate": 9.575715870044786e-06, + "loss": 0.1729, + "step": 5563 + }, + { + "epoch": 0.14080016195561404, + "grad_norm": 7.304521083831787, + "learning_rate": 9.575553989920912e-06, + "loss": 0.271, + "step": 5564 + }, + { + "epoch": 0.14082546752030772, + "grad_norm": 5.404259204864502, + "learning_rate": 9.575392080290141e-06, + "loss": 0.1976, + "step": 5565 + }, + { + "epoch": 0.14085077308500138, + "grad_norm": 10.427204132080078, + "learning_rate": 9.575230141153516e-06, + "loss": 0.3218, + "step": 5566 + }, + { + "epoch": 0.14087607864969506, + "grad_norm": 12.486493110656738, + "learning_rate": 9.575068172512083e-06, + "loss": 0.2164, + "step": 5567 + }, + { + "epoch": 0.14090138421438875, + "grad_norm": 10.70232105255127, + "learning_rate": 9.574906174366887e-06, + "loss": 0.2111, + "step": 5568 + }, + { + "epoch": 0.14092668977908243, + "grad_norm": 6.5605034828186035, + "learning_rate": 9.574744146718973e-06, + "loss": 0.1457, + "step": 5569 + }, + { + "epoch": 0.1409519953437761, + "grad_norm": 7.088801860809326, + "learning_rate": 9.574582089569384e-06, + "loss": 0.2581, + "step": 5570 + }, + { + "epoch": 0.14097730090846977, + "grad_norm": 5.477907180786133, + "learning_rate": 9.574420002919168e-06, + "loss": 0.1413, + "step": 5571 + }, + { + "epoch": 0.14100260647316346, + "grad_norm": 3.6832327842712402, + "learning_rate": 9.574257886769365e-06, + "loss": 0.1528, + "step": 5572 + }, + { + "epoch": 0.1410279120378571, + "grad_norm": 4.7264018058776855, + "learning_rate": 9.574095741121025e-06, + "loss": 0.239, + "step": 5573 + }, + { + "epoch": 0.1410532176025508, + "grad_norm": 10.741972923278809, + "learning_rate": 9.573933565975193e-06, + "loss": 0.2521, + "step": 5574 + }, + { + "epoch": 0.14107852316724448, + "grad_norm": 6.347989082336426, + "learning_rate": 9.573771361332914e-06, + "loss": 0.269, + "step": 5575 + }, + { + "epoch": 0.14110382873193816, + "grad_norm": 4.9442243576049805, + "learning_rate": 9.573609127195235e-06, + "loss": 0.19, + "step": 5576 + }, + { + "epoch": 0.14112913429663182, + "grad_norm": 14.264694213867188, + "learning_rate": 9.5734468635632e-06, + "loss": 0.2517, + "step": 5577 + }, + { + "epoch": 0.1411544398613255, + "grad_norm": 5.283942699432373, + "learning_rate": 9.573284570437857e-06, + "loss": 0.1801, + "step": 5578 + }, + { + "epoch": 0.1411797454260192, + "grad_norm": 7.933864593505859, + "learning_rate": 9.573122247820255e-06, + "loss": 0.2656, + "step": 5579 + }, + { + "epoch": 0.14120505099071287, + "grad_norm": 4.400036811828613, + "learning_rate": 9.572959895711435e-06, + "loss": 0.1573, + "step": 5580 + }, + { + "epoch": 0.14123035655540653, + "grad_norm": 11.769125938415527, + "learning_rate": 9.572797514112449e-06, + "loss": 0.2987, + "step": 5581 + }, + { + "epoch": 0.1412556621201002, + "grad_norm": 14.4563627243042, + "learning_rate": 9.572635103024344e-06, + "loss": 0.2187, + "step": 5582 + }, + { + "epoch": 0.1412809676847939, + "grad_norm": 3.7947959899902344, + "learning_rate": 9.572472662448163e-06, + "loss": 0.1608, + "step": 5583 + }, + { + "epoch": 0.14130627324948755, + "grad_norm": 13.670777320861816, + "learning_rate": 9.572310192384957e-06, + "loss": 0.3703, + "step": 5584 + }, + { + "epoch": 0.14133157881418124, + "grad_norm": 7.145440578460693, + "learning_rate": 9.572147692835774e-06, + "loss": 0.1945, + "step": 5585 + }, + { + "epoch": 0.14135688437887492, + "grad_norm": 38.461849212646484, + "learning_rate": 9.57198516380166e-06, + "loss": 0.2112, + "step": 5586 + }, + { + "epoch": 0.1413821899435686, + "grad_norm": 11.155705451965332, + "learning_rate": 9.571822605283665e-06, + "loss": 0.1467, + "step": 5587 + }, + { + "epoch": 0.14140749550826226, + "grad_norm": 17.668481826782227, + "learning_rate": 9.571660017282836e-06, + "loss": 0.2992, + "step": 5588 + }, + { + "epoch": 0.14143280107295594, + "grad_norm": 4.2088165283203125, + "learning_rate": 9.571497399800222e-06, + "loss": 0.1706, + "step": 5589 + }, + { + "epoch": 0.14145810663764963, + "grad_norm": 7.997603893280029, + "learning_rate": 9.571334752836872e-06, + "loss": 0.2386, + "step": 5590 + }, + { + "epoch": 0.14148341220234328, + "grad_norm": 14.503689765930176, + "learning_rate": 9.571172076393833e-06, + "loss": 0.2559, + "step": 5591 + }, + { + "epoch": 0.14150871776703697, + "grad_norm": 8.604913711547852, + "learning_rate": 9.571009370472157e-06, + "loss": 0.2336, + "step": 5592 + }, + { + "epoch": 0.14153402333173065, + "grad_norm": 6.670801639556885, + "learning_rate": 9.570846635072892e-06, + "loss": 0.1922, + "step": 5593 + }, + { + "epoch": 0.14155932889642434, + "grad_norm": 15.059847831726074, + "learning_rate": 9.570683870197086e-06, + "loss": 0.2146, + "step": 5594 + }, + { + "epoch": 0.141584634461118, + "grad_norm": 6.777496337890625, + "learning_rate": 9.570521075845792e-06, + "loss": 0.1671, + "step": 5595 + }, + { + "epoch": 0.14160994002581168, + "grad_norm": 13.50027084350586, + "learning_rate": 9.570358252020055e-06, + "loss": 0.3309, + "step": 5596 + }, + { + "epoch": 0.14163524559050536, + "grad_norm": 9.46336555480957, + "learning_rate": 9.570195398720929e-06, + "loss": 0.1789, + "step": 5597 + }, + { + "epoch": 0.14166055115519902, + "grad_norm": 10.779165267944336, + "learning_rate": 9.570032515949465e-06, + "loss": 0.1784, + "step": 5598 + }, + { + "epoch": 0.1416858567198927, + "grad_norm": 6.367692947387695, + "learning_rate": 9.569869603706709e-06, + "loss": 0.2013, + "step": 5599 + }, + { + "epoch": 0.14171116228458638, + "grad_norm": 4.468124866485596, + "learning_rate": 9.569706661993715e-06, + "loss": 0.1995, + "step": 5600 + }, + { + "epoch": 0.14173646784928007, + "grad_norm": 12.910371780395508, + "learning_rate": 9.569543690811533e-06, + "loss": 0.2874, + "step": 5601 + }, + { + "epoch": 0.14176177341397372, + "grad_norm": 11.834373474121094, + "learning_rate": 9.569380690161213e-06, + "loss": 0.3581, + "step": 5602 + }, + { + "epoch": 0.1417870789786674, + "grad_norm": 5.832364082336426, + "learning_rate": 9.569217660043807e-06, + "loss": 0.2498, + "step": 5603 + }, + { + "epoch": 0.1418123845433611, + "grad_norm": 4.6135783195495605, + "learning_rate": 9.569054600460366e-06, + "loss": 0.1785, + "step": 5604 + }, + { + "epoch": 0.14183769010805475, + "grad_norm": 3.664667844772339, + "learning_rate": 9.568891511411942e-06, + "loss": 0.1743, + "step": 5605 + }, + { + "epoch": 0.14186299567274843, + "grad_norm": 9.2011079788208, + "learning_rate": 9.568728392899587e-06, + "loss": 0.2109, + "step": 5606 + }, + { + "epoch": 0.14188830123744212, + "grad_norm": 6.550989151000977, + "learning_rate": 9.568565244924353e-06, + "loss": 0.164, + "step": 5607 + }, + { + "epoch": 0.1419136068021358, + "grad_norm": 4.526059627532959, + "learning_rate": 9.568402067487291e-06, + "loss": 0.1404, + "step": 5608 + }, + { + "epoch": 0.14193891236682946, + "grad_norm": 4.961488723754883, + "learning_rate": 9.568238860589452e-06, + "loss": 0.2499, + "step": 5609 + }, + { + "epoch": 0.14196421793152314, + "grad_norm": 7.584564685821533, + "learning_rate": 9.568075624231892e-06, + "loss": 0.2089, + "step": 5610 + }, + { + "epoch": 0.14198952349621682, + "grad_norm": 7.134178161621094, + "learning_rate": 9.567912358415662e-06, + "loss": 0.1941, + "step": 5611 + }, + { + "epoch": 0.1420148290609105, + "grad_norm": 8.446942329406738, + "learning_rate": 9.567749063141816e-06, + "loss": 0.3018, + "step": 5612 + }, + { + "epoch": 0.14204013462560416, + "grad_norm": 4.988759994506836, + "learning_rate": 9.567585738411404e-06, + "loss": 0.211, + "step": 5613 + }, + { + "epoch": 0.14206544019029785, + "grad_norm": 7.538276672363281, + "learning_rate": 9.567422384225483e-06, + "loss": 0.2669, + "step": 5614 + }, + { + "epoch": 0.14209074575499153, + "grad_norm": 9.137469291687012, + "learning_rate": 9.567259000585103e-06, + "loss": 0.2329, + "step": 5615 + }, + { + "epoch": 0.1421160513196852, + "grad_norm": 7.368988990783691, + "learning_rate": 9.56709558749132e-06, + "loss": 0.1635, + "step": 5616 + }, + { + "epoch": 0.14214135688437887, + "grad_norm": 5.452740669250488, + "learning_rate": 9.566932144945187e-06, + "loss": 0.2136, + "step": 5617 + }, + { + "epoch": 0.14216666244907256, + "grad_norm": 9.106660842895508, + "learning_rate": 9.566768672947756e-06, + "loss": 0.2702, + "step": 5618 + }, + { + "epoch": 0.14219196801376624, + "grad_norm": 5.815059661865234, + "learning_rate": 9.566605171500087e-06, + "loss": 0.1803, + "step": 5619 + }, + { + "epoch": 0.1422172735784599, + "grad_norm": 7.107976913452148, + "learning_rate": 9.56644164060323e-06, + "loss": 0.1496, + "step": 5620 + }, + { + "epoch": 0.14224257914315358, + "grad_norm": 4.68714714050293, + "learning_rate": 9.566278080258237e-06, + "loss": 0.1366, + "step": 5621 + }, + { + "epoch": 0.14226788470784726, + "grad_norm": 7.98899507522583, + "learning_rate": 9.56611449046617e-06, + "loss": 0.2619, + "step": 5622 + }, + { + "epoch": 0.14229319027254092, + "grad_norm": 8.038018226623535, + "learning_rate": 9.565950871228079e-06, + "loss": 0.3109, + "step": 5623 + }, + { + "epoch": 0.1423184958372346, + "grad_norm": 12.897417068481445, + "learning_rate": 9.565787222545019e-06, + "loss": 0.2949, + "step": 5624 + }, + { + "epoch": 0.1423438014019283, + "grad_norm": 7.572901725769043, + "learning_rate": 9.565623544418048e-06, + "loss": 0.2375, + "step": 5625 + }, + { + "epoch": 0.14236910696662197, + "grad_norm": 3.291811227798462, + "learning_rate": 9.56545983684822e-06, + "loss": 0.1396, + "step": 5626 + }, + { + "epoch": 0.14239441253131563, + "grad_norm": 6.292844295501709, + "learning_rate": 9.565296099836587e-06, + "loss": 0.1275, + "step": 5627 + }, + { + "epoch": 0.1424197180960093, + "grad_norm": 4.124205589294434, + "learning_rate": 9.565132333384213e-06, + "loss": 0.1909, + "step": 5628 + }, + { + "epoch": 0.142445023660703, + "grad_norm": 4.191384792327881, + "learning_rate": 9.564968537492148e-06, + "loss": 0.2203, + "step": 5629 + }, + { + "epoch": 0.14247032922539665, + "grad_norm": 8.658994674682617, + "learning_rate": 9.564804712161451e-06, + "loss": 0.2024, + "step": 5630 + }, + { + "epoch": 0.14249563479009034, + "grad_norm": 3.988983392715454, + "learning_rate": 9.564640857393178e-06, + "loss": 0.2075, + "step": 5631 + }, + { + "epoch": 0.14252094035478402, + "grad_norm": 4.680359363555908, + "learning_rate": 9.564476973188382e-06, + "loss": 0.1847, + "step": 5632 + }, + { + "epoch": 0.1425462459194777, + "grad_norm": 5.829050540924072, + "learning_rate": 9.564313059548125e-06, + "loss": 0.1904, + "step": 5633 + }, + { + "epoch": 0.14257155148417136, + "grad_norm": 5.8943047523498535, + "learning_rate": 9.564149116473462e-06, + "loss": 0.1559, + "step": 5634 + }, + { + "epoch": 0.14259685704886504, + "grad_norm": 5.0860724449157715, + "learning_rate": 9.563985143965453e-06, + "loss": 0.2083, + "step": 5635 + }, + { + "epoch": 0.14262216261355873, + "grad_norm": 5.13686466217041, + "learning_rate": 9.563821142025149e-06, + "loss": 0.1991, + "step": 5636 + }, + { + "epoch": 0.14264746817825238, + "grad_norm": 4.4220452308654785, + "learning_rate": 9.563657110653613e-06, + "loss": 0.2123, + "step": 5637 + }, + { + "epoch": 0.14267277374294607, + "grad_norm": 8.4122314453125, + "learning_rate": 9.5634930498519e-06, + "loss": 0.2028, + "step": 5638 + }, + { + "epoch": 0.14269807930763975, + "grad_norm": 3.8051342964172363, + "learning_rate": 9.563328959621072e-06, + "loss": 0.1191, + "step": 5639 + }, + { + "epoch": 0.14272338487233344, + "grad_norm": 9.852866172790527, + "learning_rate": 9.563164839962182e-06, + "loss": 0.3307, + "step": 5640 + }, + { + "epoch": 0.1427486904370271, + "grad_norm": 2.8514950275421143, + "learning_rate": 9.56300069087629e-06, + "loss": 0.0824, + "step": 5641 + }, + { + "epoch": 0.14277399600172078, + "grad_norm": 2.943023920059204, + "learning_rate": 9.562836512364458e-06, + "loss": 0.1583, + "step": 5642 + }, + { + "epoch": 0.14279930156641446, + "grad_norm": 4.87596321105957, + "learning_rate": 9.56267230442774e-06, + "loss": 0.1714, + "step": 5643 + }, + { + "epoch": 0.14282460713110814, + "grad_norm": 5.614060878753662, + "learning_rate": 9.562508067067197e-06, + "loss": 0.166, + "step": 5644 + }, + { + "epoch": 0.1428499126958018, + "grad_norm": 9.254861831665039, + "learning_rate": 9.56234380028389e-06, + "loss": 0.254, + "step": 5645 + }, + { + "epoch": 0.14287521826049548, + "grad_norm": 11.236745834350586, + "learning_rate": 9.562179504078876e-06, + "loss": 0.21, + "step": 5646 + }, + { + "epoch": 0.14290052382518917, + "grad_norm": 7.745090007781982, + "learning_rate": 9.562015178453215e-06, + "loss": 0.1533, + "step": 5647 + }, + { + "epoch": 0.14292582938988282, + "grad_norm": 15.627212524414062, + "learning_rate": 9.561850823407966e-06, + "loss": 0.3188, + "step": 5648 + }, + { + "epoch": 0.1429511349545765, + "grad_norm": 3.568528652191162, + "learning_rate": 9.561686438944188e-06, + "loss": 0.1693, + "step": 5649 + }, + { + "epoch": 0.1429764405192702, + "grad_norm": 6.668076038360596, + "learning_rate": 9.561522025062946e-06, + "loss": 0.2316, + "step": 5650 + }, + { + "epoch": 0.14300174608396388, + "grad_norm": 13.119836807250977, + "learning_rate": 9.561357581765295e-06, + "loss": 0.2555, + "step": 5651 + }, + { + "epoch": 0.14302705164865753, + "grad_norm": 4.28045129776001, + "learning_rate": 9.561193109052299e-06, + "loss": 0.1932, + "step": 5652 + }, + { + "epoch": 0.14305235721335122, + "grad_norm": 7.472131252288818, + "learning_rate": 9.561028606925014e-06, + "loss": 0.2511, + "step": 5653 + }, + { + "epoch": 0.1430776627780449, + "grad_norm": 4.429595470428467, + "learning_rate": 9.560864075384506e-06, + "loss": 0.131, + "step": 5654 + }, + { + "epoch": 0.14310296834273856, + "grad_norm": 44.034576416015625, + "learning_rate": 9.560699514431836e-06, + "loss": 0.31, + "step": 5655 + }, + { + "epoch": 0.14312827390743224, + "grad_norm": 4.559661388397217, + "learning_rate": 9.560534924068059e-06, + "loss": 0.1405, + "step": 5656 + }, + { + "epoch": 0.14315357947212592, + "grad_norm": 10.194551467895508, + "learning_rate": 9.560370304294244e-06, + "loss": 0.2713, + "step": 5657 + }, + { + "epoch": 0.1431788850368196, + "grad_norm": 3.658590078353882, + "learning_rate": 9.560205655111447e-06, + "loss": 0.0956, + "step": 5658 + }, + { + "epoch": 0.14320419060151326, + "grad_norm": 4.234119415283203, + "learning_rate": 9.560040976520732e-06, + "loss": 0.1848, + "step": 5659 + }, + { + "epoch": 0.14322949616620695, + "grad_norm": 4.68237829208374, + "learning_rate": 9.559876268523164e-06, + "loss": 0.1326, + "step": 5660 + }, + { + "epoch": 0.14325480173090063, + "grad_norm": 3.700827121734619, + "learning_rate": 9.559711531119798e-06, + "loss": 0.1547, + "step": 5661 + }, + { + "epoch": 0.1432801072955943, + "grad_norm": 4.156147003173828, + "learning_rate": 9.559546764311703e-06, + "loss": 0.1563, + "step": 5662 + }, + { + "epoch": 0.14330541286028797, + "grad_norm": 3.831289291381836, + "learning_rate": 9.559381968099939e-06, + "loss": 0.1773, + "step": 5663 + }, + { + "epoch": 0.14333071842498166, + "grad_norm": 4.64357328414917, + "learning_rate": 9.559217142485568e-06, + "loss": 0.1504, + "step": 5664 + }, + { + "epoch": 0.14335602398967534, + "grad_norm": 3.99871826171875, + "learning_rate": 9.559052287469655e-06, + "loss": 0.1424, + "step": 5665 + }, + { + "epoch": 0.143381329554369, + "grad_norm": 4.453938961029053, + "learning_rate": 9.558887403053259e-06, + "loss": 0.211, + "step": 5666 + }, + { + "epoch": 0.14340663511906268, + "grad_norm": 8.340727806091309, + "learning_rate": 9.558722489237449e-06, + "loss": 0.2795, + "step": 5667 + }, + { + "epoch": 0.14343194068375636, + "grad_norm": 6.919480800628662, + "learning_rate": 9.558557546023283e-06, + "loss": 0.1933, + "step": 5668 + }, + { + "epoch": 0.14345724624845002, + "grad_norm": 3.4197170734405518, + "learning_rate": 9.55839257341183e-06, + "loss": 0.1269, + "step": 5669 + }, + { + "epoch": 0.1434825518131437, + "grad_norm": 5.593437671661377, + "learning_rate": 9.55822757140415e-06, + "loss": 0.1839, + "step": 5670 + }, + { + "epoch": 0.1435078573778374, + "grad_norm": 8.299286842346191, + "learning_rate": 9.558062540001309e-06, + "loss": 0.1782, + "step": 5671 + }, + { + "epoch": 0.14353316294253107, + "grad_norm": 4.828310489654541, + "learning_rate": 9.55789747920437e-06, + "loss": 0.1475, + "step": 5672 + }, + { + "epoch": 0.14355846850722473, + "grad_norm": 25.031843185424805, + "learning_rate": 9.557732389014398e-06, + "loss": 0.3207, + "step": 5673 + }, + { + "epoch": 0.1435837740719184, + "grad_norm": 6.662337303161621, + "learning_rate": 9.557567269432459e-06, + "loss": 0.1515, + "step": 5674 + }, + { + "epoch": 0.1436090796366121, + "grad_norm": 8.930885314941406, + "learning_rate": 9.557402120459615e-06, + "loss": 0.2439, + "step": 5675 + }, + { + "epoch": 0.14363438520130578, + "grad_norm": 2.827009439468384, + "learning_rate": 9.557236942096933e-06, + "loss": 0.1118, + "step": 5676 + }, + { + "epoch": 0.14365969076599944, + "grad_norm": 5.059515476226807, + "learning_rate": 9.557071734345477e-06, + "loss": 0.2231, + "step": 5677 + }, + { + "epoch": 0.14368499633069312, + "grad_norm": 7.451333522796631, + "learning_rate": 9.556906497206314e-06, + "loss": 0.2349, + "step": 5678 + }, + { + "epoch": 0.1437103018953868, + "grad_norm": 5.38057804107666, + "learning_rate": 9.556741230680506e-06, + "loss": 0.2137, + "step": 5679 + }, + { + "epoch": 0.14373560746008046, + "grad_norm": 15.242111206054688, + "learning_rate": 9.556575934769124e-06, + "loss": 0.174, + "step": 5680 + }, + { + "epoch": 0.14376091302477415, + "grad_norm": 15.74515151977539, + "learning_rate": 9.556410609473231e-06, + "loss": 0.3564, + "step": 5681 + }, + { + "epoch": 0.14378621858946783, + "grad_norm": 12.659552574157715, + "learning_rate": 9.556245254793894e-06, + "loss": 0.3052, + "step": 5682 + }, + { + "epoch": 0.1438115241541615, + "grad_norm": 6.138054847717285, + "learning_rate": 9.556079870732178e-06, + "loss": 0.2416, + "step": 5683 + }, + { + "epoch": 0.14383682971885517, + "grad_norm": 5.830741882324219, + "learning_rate": 9.55591445728915e-06, + "loss": 0.2233, + "step": 5684 + }, + { + "epoch": 0.14386213528354885, + "grad_norm": 6.987799644470215, + "learning_rate": 9.555749014465877e-06, + "loss": 0.1701, + "step": 5685 + }, + { + "epoch": 0.14388744084824254, + "grad_norm": 4.706925868988037, + "learning_rate": 9.555583542263428e-06, + "loss": 0.2363, + "step": 5686 + }, + { + "epoch": 0.1439127464129362, + "grad_norm": 6.755610942840576, + "learning_rate": 9.555418040682866e-06, + "loss": 0.2084, + "step": 5687 + }, + { + "epoch": 0.14393805197762988, + "grad_norm": 3.6907472610473633, + "learning_rate": 9.555252509725263e-06, + "loss": 0.195, + "step": 5688 + }, + { + "epoch": 0.14396335754232356, + "grad_norm": 3.5421876907348633, + "learning_rate": 9.555086949391681e-06, + "loss": 0.1419, + "step": 5689 + }, + { + "epoch": 0.14398866310701725, + "grad_norm": 2.807974100112915, + "learning_rate": 9.554921359683193e-06, + "loss": 0.146, + "step": 5690 + }, + { + "epoch": 0.1440139686717109, + "grad_norm": 7.711394309997559, + "learning_rate": 9.554755740600863e-06, + "loss": 0.198, + "step": 5691 + }, + { + "epoch": 0.14403927423640459, + "grad_norm": 5.919564723968506, + "learning_rate": 9.55459009214576e-06, + "loss": 0.2189, + "step": 5692 + }, + { + "epoch": 0.14406457980109827, + "grad_norm": 3.871798515319824, + "learning_rate": 9.554424414318955e-06, + "loss": 0.1318, + "step": 5693 + }, + { + "epoch": 0.14408988536579193, + "grad_norm": 3.252445697784424, + "learning_rate": 9.554258707121512e-06, + "loss": 0.1799, + "step": 5694 + }, + { + "epoch": 0.1441151909304856, + "grad_norm": 4.28021764755249, + "learning_rate": 9.554092970554502e-06, + "loss": 0.1446, + "step": 5695 + }, + { + "epoch": 0.1441404964951793, + "grad_norm": 13.026674270629883, + "learning_rate": 9.553927204618993e-06, + "loss": 0.1015, + "step": 5696 + }, + { + "epoch": 0.14416580205987298, + "grad_norm": 3.087053060531616, + "learning_rate": 9.553761409316056e-06, + "loss": 0.1546, + "step": 5697 + }, + { + "epoch": 0.14419110762456663, + "grad_norm": 6.830740928649902, + "learning_rate": 9.55359558464676e-06, + "loss": 0.2198, + "step": 5698 + }, + { + "epoch": 0.14421641318926032, + "grad_norm": 3.6543142795562744, + "learning_rate": 9.55342973061217e-06, + "loss": 0.1061, + "step": 5699 + }, + { + "epoch": 0.144241718753954, + "grad_norm": 10.429083824157715, + "learning_rate": 9.553263847213361e-06, + "loss": 0.2743, + "step": 5700 + }, + { + "epoch": 0.14426702431864766, + "grad_norm": 4.2434611320495605, + "learning_rate": 9.553097934451398e-06, + "loss": 0.0852, + "step": 5701 + }, + { + "epoch": 0.14429232988334134, + "grad_norm": 2.8749284744262695, + "learning_rate": 9.552931992327354e-06, + "loss": 0.0939, + "step": 5702 + }, + { + "epoch": 0.14431763544803503, + "grad_norm": 3.0734310150146484, + "learning_rate": 9.5527660208423e-06, + "loss": 0.1113, + "step": 5703 + }, + { + "epoch": 0.1443429410127287, + "grad_norm": 12.838891983032227, + "learning_rate": 9.552600019997305e-06, + "loss": 0.1819, + "step": 5704 + }, + { + "epoch": 0.14436824657742237, + "grad_norm": 5.287367820739746, + "learning_rate": 9.552433989793438e-06, + "loss": 0.1349, + "step": 5705 + }, + { + "epoch": 0.14439355214211605, + "grad_norm": 8.722601890563965, + "learning_rate": 9.55226793023177e-06, + "loss": 0.2054, + "step": 5706 + }, + { + "epoch": 0.14441885770680973, + "grad_norm": 4.361596584320068, + "learning_rate": 9.552101841313377e-06, + "loss": 0.1728, + "step": 5707 + }, + { + "epoch": 0.14444416327150342, + "grad_norm": 7.472800254821777, + "learning_rate": 9.551935723039323e-06, + "loss": 0.2315, + "step": 5708 + }, + { + "epoch": 0.14446946883619707, + "grad_norm": 9.930015563964844, + "learning_rate": 9.551769575410681e-06, + "loss": 0.2385, + "step": 5709 + }, + { + "epoch": 0.14449477440089076, + "grad_norm": 3.607792854309082, + "learning_rate": 9.551603398428527e-06, + "loss": 0.1665, + "step": 5710 + }, + { + "epoch": 0.14452007996558444, + "grad_norm": 13.912067413330078, + "learning_rate": 9.551437192093926e-06, + "loss": 0.366, + "step": 5711 + }, + { + "epoch": 0.1445453855302781, + "grad_norm": 15.3066987991333, + "learning_rate": 9.551270956407956e-06, + "loss": 0.233, + "step": 5712 + }, + { + "epoch": 0.14457069109497178, + "grad_norm": 5.499396324157715, + "learning_rate": 9.551104691371685e-06, + "loss": 0.1841, + "step": 5713 + }, + { + "epoch": 0.14459599665966547, + "grad_norm": 12.27018928527832, + "learning_rate": 9.550938396986186e-06, + "loss": 0.2866, + "step": 5714 + }, + { + "epoch": 0.14462130222435915, + "grad_norm": 16.149715423583984, + "learning_rate": 9.550772073252533e-06, + "loss": 0.2949, + "step": 5715 + }, + { + "epoch": 0.1446466077890528, + "grad_norm": 7.835735321044922, + "learning_rate": 9.550605720171794e-06, + "loss": 0.2498, + "step": 5716 + }, + { + "epoch": 0.1446719133537465, + "grad_norm": 9.108819007873535, + "learning_rate": 9.55043933774505e-06, + "loss": 0.2186, + "step": 5717 + }, + { + "epoch": 0.14469721891844017, + "grad_norm": 10.748117446899414, + "learning_rate": 9.550272925973368e-06, + "loss": 0.2316, + "step": 5718 + }, + { + "epoch": 0.14472252448313383, + "grad_norm": 11.069599151611328, + "learning_rate": 9.550106484857819e-06, + "loss": 0.3068, + "step": 5719 + }, + { + "epoch": 0.1447478300478275, + "grad_norm": 13.048917770385742, + "learning_rate": 9.549940014399483e-06, + "loss": 0.2124, + "step": 5720 + }, + { + "epoch": 0.1447731356125212, + "grad_norm": 4.908567905426025, + "learning_rate": 9.54977351459943e-06, + "loss": 0.1617, + "step": 5721 + }, + { + "epoch": 0.14479844117721488, + "grad_norm": 6.689101696014404, + "learning_rate": 9.549606985458733e-06, + "loss": 0.2437, + "step": 5722 + }, + { + "epoch": 0.14482374674190854, + "grad_norm": 7.923528671264648, + "learning_rate": 9.549440426978467e-06, + "loss": 0.2209, + "step": 5723 + }, + { + "epoch": 0.14484905230660222, + "grad_norm": 6.699520111083984, + "learning_rate": 9.549273839159705e-06, + "loss": 0.2315, + "step": 5724 + }, + { + "epoch": 0.1448743578712959, + "grad_norm": 6.188473224639893, + "learning_rate": 9.549107222003523e-06, + "loss": 0.2418, + "step": 5725 + }, + { + "epoch": 0.14489966343598956, + "grad_norm": 5.921144962310791, + "learning_rate": 9.548940575510993e-06, + "loss": 0.251, + "step": 5726 + }, + { + "epoch": 0.14492496900068325, + "grad_norm": 3.3862035274505615, + "learning_rate": 9.548773899683194e-06, + "loss": 0.1784, + "step": 5727 + }, + { + "epoch": 0.14495027456537693, + "grad_norm": 5.1420722007751465, + "learning_rate": 9.548607194521199e-06, + "loss": 0.2938, + "step": 5728 + }, + { + "epoch": 0.1449755801300706, + "grad_norm": 4.166703701019287, + "learning_rate": 9.54844046002608e-06, + "loss": 0.2103, + "step": 5729 + }, + { + "epoch": 0.14500088569476427, + "grad_norm": 6.327300071716309, + "learning_rate": 9.548273696198915e-06, + "loss": 0.2108, + "step": 5730 + }, + { + "epoch": 0.14502619125945795, + "grad_norm": 5.241597652435303, + "learning_rate": 9.54810690304078e-06, + "loss": 0.1552, + "step": 5731 + }, + { + "epoch": 0.14505149682415164, + "grad_norm": 7.04771089553833, + "learning_rate": 9.547940080552749e-06, + "loss": 0.1135, + "step": 5732 + }, + { + "epoch": 0.1450768023888453, + "grad_norm": 6.0866827964782715, + "learning_rate": 9.547773228735899e-06, + "loss": 0.1218, + "step": 5733 + }, + { + "epoch": 0.14510210795353898, + "grad_norm": 8.622493743896484, + "learning_rate": 9.547606347591305e-06, + "loss": 0.2076, + "step": 5734 + }, + { + "epoch": 0.14512741351823266, + "grad_norm": 4.740808963775635, + "learning_rate": 9.547439437120042e-06, + "loss": 0.2007, + "step": 5735 + }, + { + "epoch": 0.14515271908292635, + "grad_norm": 5.7649102210998535, + "learning_rate": 9.54727249732319e-06, + "loss": 0.2938, + "step": 5736 + }, + { + "epoch": 0.14517802464762, + "grad_norm": 12.662385940551758, + "learning_rate": 9.547105528201824e-06, + "loss": 0.317, + "step": 5737 + }, + { + "epoch": 0.14520333021231369, + "grad_norm": 5.232266426086426, + "learning_rate": 9.546938529757017e-06, + "loss": 0.1604, + "step": 5738 + }, + { + "epoch": 0.14522863577700737, + "grad_norm": 4.733714580535889, + "learning_rate": 9.546771501989852e-06, + "loss": 0.1657, + "step": 5739 + }, + { + "epoch": 0.14525394134170105, + "grad_norm": 5.89326286315918, + "learning_rate": 9.546604444901403e-06, + "loss": 0.1478, + "step": 5740 + }, + { + "epoch": 0.1452792469063947, + "grad_norm": 3.8410990238189697, + "learning_rate": 9.546437358492748e-06, + "loss": 0.1677, + "step": 5741 + }, + { + "epoch": 0.1453045524710884, + "grad_norm": 4.372282981872559, + "learning_rate": 9.546270242764962e-06, + "loss": 0.1232, + "step": 5742 + }, + { + "epoch": 0.14532985803578208, + "grad_norm": 11.533421516418457, + "learning_rate": 9.546103097719126e-06, + "loss": 0.2159, + "step": 5743 + }, + { + "epoch": 0.14535516360047573, + "grad_norm": 7.77659273147583, + "learning_rate": 9.545935923356318e-06, + "loss": 0.211, + "step": 5744 + }, + { + "epoch": 0.14538046916516942, + "grad_norm": 7.550324440002441, + "learning_rate": 9.545768719677614e-06, + "loss": 0.1902, + "step": 5745 + }, + { + "epoch": 0.1454057747298631, + "grad_norm": 24.26948356628418, + "learning_rate": 9.545601486684093e-06, + "loss": 0.2817, + "step": 5746 + }, + { + "epoch": 0.14543108029455679, + "grad_norm": 14.546647071838379, + "learning_rate": 9.545434224376832e-06, + "loss": 0.261, + "step": 5747 + }, + { + "epoch": 0.14545638585925044, + "grad_norm": 4.809877395629883, + "learning_rate": 9.545266932756914e-06, + "loss": 0.1586, + "step": 5748 + }, + { + "epoch": 0.14548169142394413, + "grad_norm": 5.198024749755859, + "learning_rate": 9.545099611825415e-06, + "loss": 0.1952, + "step": 5749 + }, + { + "epoch": 0.1455069969886378, + "grad_norm": 7.3964691162109375, + "learning_rate": 9.544932261583413e-06, + "loss": 0.259, + "step": 5750 + }, + { + "epoch": 0.14553230255333147, + "grad_norm": 6.105518817901611, + "learning_rate": 9.544764882031986e-06, + "loss": 0.2485, + "step": 5751 + }, + { + "epoch": 0.14555760811802515, + "grad_norm": 7.441194534301758, + "learning_rate": 9.544597473172218e-06, + "loss": 0.2416, + "step": 5752 + }, + { + "epoch": 0.14558291368271883, + "grad_norm": 5.5991291999816895, + "learning_rate": 9.544430035005188e-06, + "loss": 0.1846, + "step": 5753 + }, + { + "epoch": 0.14560821924741252, + "grad_norm": 5.185643196105957, + "learning_rate": 9.544262567531972e-06, + "loss": 0.1396, + "step": 5754 + }, + { + "epoch": 0.14563352481210617, + "grad_norm": 4.221414566040039, + "learning_rate": 9.544095070753652e-06, + "loss": 0.1627, + "step": 5755 + }, + { + "epoch": 0.14565883037679986, + "grad_norm": 10.524792671203613, + "learning_rate": 9.543927544671307e-06, + "loss": 0.2984, + "step": 5756 + }, + { + "epoch": 0.14568413594149354, + "grad_norm": 6.875045299530029, + "learning_rate": 9.543759989286021e-06, + "loss": 0.2244, + "step": 5757 + }, + { + "epoch": 0.1457094415061872, + "grad_norm": 10.668780326843262, + "learning_rate": 9.54359240459887e-06, + "loss": 0.1906, + "step": 5758 + }, + { + "epoch": 0.14573474707088088, + "grad_norm": 4.245319843292236, + "learning_rate": 9.543424790610937e-06, + "loss": 0.1595, + "step": 5759 + }, + { + "epoch": 0.14576005263557457, + "grad_norm": 10.144405364990234, + "learning_rate": 9.543257147323303e-06, + "loss": 0.1526, + "step": 5760 + }, + { + "epoch": 0.14578535820026825, + "grad_norm": 5.227084636688232, + "learning_rate": 9.54308947473705e-06, + "loss": 0.2011, + "step": 5761 + }, + { + "epoch": 0.1458106637649619, + "grad_norm": 9.43842887878418, + "learning_rate": 9.542921772853258e-06, + "loss": 0.2261, + "step": 5762 + }, + { + "epoch": 0.1458359693296556, + "grad_norm": 8.815017700195312, + "learning_rate": 9.542754041673008e-06, + "loss": 0.1832, + "step": 5763 + }, + { + "epoch": 0.14586127489434927, + "grad_norm": 3.9804344177246094, + "learning_rate": 9.542586281197382e-06, + "loss": 0.2107, + "step": 5764 + }, + { + "epoch": 0.14588658045904293, + "grad_norm": 13.379561424255371, + "learning_rate": 9.542418491427462e-06, + "loss": 0.2976, + "step": 5765 + }, + { + "epoch": 0.14591188602373661, + "grad_norm": 3.964684247970581, + "learning_rate": 9.542250672364328e-06, + "loss": 0.1939, + "step": 5766 + }, + { + "epoch": 0.1459371915884303, + "grad_norm": 7.334924221038818, + "learning_rate": 9.542082824009068e-06, + "loss": 0.1862, + "step": 5767 + }, + { + "epoch": 0.14596249715312398, + "grad_norm": 13.294106483459473, + "learning_rate": 9.541914946362759e-06, + "loss": 0.2144, + "step": 5768 + }, + { + "epoch": 0.14598780271781764, + "grad_norm": 16.394821166992188, + "learning_rate": 9.541747039426486e-06, + "loss": 0.2042, + "step": 5769 + }, + { + "epoch": 0.14601310828251132, + "grad_norm": 6.094120979309082, + "learning_rate": 9.54157910320133e-06, + "loss": 0.1561, + "step": 5770 + }, + { + "epoch": 0.146038413847205, + "grad_norm": 6.877468109130859, + "learning_rate": 9.541411137688375e-06, + "loss": 0.2934, + "step": 5771 + }, + { + "epoch": 0.1460637194118987, + "grad_norm": 5.639578819274902, + "learning_rate": 9.541243142888706e-06, + "loss": 0.1961, + "step": 5772 + }, + { + "epoch": 0.14608902497659235, + "grad_norm": 6.690487861633301, + "learning_rate": 9.541075118803404e-06, + "loss": 0.2508, + "step": 5773 + }, + { + "epoch": 0.14611433054128603, + "grad_norm": 6.069886207580566, + "learning_rate": 9.540907065433552e-06, + "loss": 0.2041, + "step": 5774 + }, + { + "epoch": 0.14613963610597971, + "grad_norm": 11.937782287597656, + "learning_rate": 9.540738982780237e-06, + "loss": 0.2565, + "step": 5775 + }, + { + "epoch": 0.14616494167067337, + "grad_norm": 6.221440315246582, + "learning_rate": 9.540570870844539e-06, + "loss": 0.1763, + "step": 5776 + }, + { + "epoch": 0.14619024723536705, + "grad_norm": 7.295203685760498, + "learning_rate": 9.540402729627545e-06, + "loss": 0.2458, + "step": 5777 + }, + { + "epoch": 0.14621555280006074, + "grad_norm": 5.858676910400391, + "learning_rate": 9.540234559130338e-06, + "loss": 0.1904, + "step": 5778 + }, + { + "epoch": 0.14624085836475442, + "grad_norm": 10.674344062805176, + "learning_rate": 9.540066359354005e-06, + "loss": 0.2317, + "step": 5779 + }, + { + "epoch": 0.14626616392944808, + "grad_norm": 3.57505202293396, + "learning_rate": 9.539898130299627e-06, + "loss": 0.1499, + "step": 5780 + }, + { + "epoch": 0.14629146949414176, + "grad_norm": 2.8396239280700684, + "learning_rate": 9.53972987196829e-06, + "loss": 0.1041, + "step": 5781 + }, + { + "epoch": 0.14631677505883545, + "grad_norm": 2.9911715984344482, + "learning_rate": 9.539561584361079e-06, + "loss": 0.1067, + "step": 5782 + }, + { + "epoch": 0.1463420806235291, + "grad_norm": 13.320570945739746, + "learning_rate": 9.539393267479081e-06, + "loss": 0.2264, + "step": 5783 + }, + { + "epoch": 0.1463673861882228, + "grad_norm": 8.102449417114258, + "learning_rate": 9.53922492132338e-06, + "loss": 0.2515, + "step": 5784 + }, + { + "epoch": 0.14639269175291647, + "grad_norm": 8.176665306091309, + "learning_rate": 9.539056545895063e-06, + "loss": 0.1309, + "step": 5785 + }, + { + "epoch": 0.14641799731761015, + "grad_norm": 20.41901397705078, + "learning_rate": 9.538888141195213e-06, + "loss": 0.2907, + "step": 5786 + }, + { + "epoch": 0.1464433028823038, + "grad_norm": 4.917073726654053, + "learning_rate": 9.538719707224917e-06, + "loss": 0.216, + "step": 5787 + }, + { + "epoch": 0.1464686084469975, + "grad_norm": 10.359766960144043, + "learning_rate": 9.538551243985264e-06, + "loss": 0.2588, + "step": 5788 + }, + { + "epoch": 0.14649391401169118, + "grad_norm": 8.449560165405273, + "learning_rate": 9.538382751477337e-06, + "loss": 0.189, + "step": 5789 + }, + { + "epoch": 0.14651921957638483, + "grad_norm": 10.813263893127441, + "learning_rate": 9.538214229702223e-06, + "loss": 0.2052, + "step": 5790 + }, + { + "epoch": 0.14654452514107852, + "grad_norm": 5.316347599029541, + "learning_rate": 9.53804567866101e-06, + "loss": 0.1554, + "step": 5791 + }, + { + "epoch": 0.1465698307057722, + "grad_norm": 9.952630043029785, + "learning_rate": 9.537877098354787e-06, + "loss": 0.1794, + "step": 5792 + }, + { + "epoch": 0.1465951362704659, + "grad_norm": 21.76690673828125, + "learning_rate": 9.537708488784637e-06, + "loss": 0.2629, + "step": 5793 + }, + { + "epoch": 0.14662044183515954, + "grad_norm": 8.100488662719727, + "learning_rate": 9.537539849951648e-06, + "loss": 0.2719, + "step": 5794 + }, + { + "epoch": 0.14664574739985323, + "grad_norm": 12.211793899536133, + "learning_rate": 9.537371181856908e-06, + "loss": 0.3023, + "step": 5795 + }, + { + "epoch": 0.1466710529645469, + "grad_norm": 7.25735330581665, + "learning_rate": 9.537202484501509e-06, + "loss": 0.2278, + "step": 5796 + }, + { + "epoch": 0.14669635852924057, + "grad_norm": 12.177145957946777, + "learning_rate": 9.537033757886532e-06, + "loss": 0.2177, + "step": 5797 + }, + { + "epoch": 0.14672166409393425, + "grad_norm": 6.982829570770264, + "learning_rate": 9.53686500201307e-06, + "loss": 0.1641, + "step": 5798 + }, + { + "epoch": 0.14674696965862793, + "grad_norm": 17.124547958374023, + "learning_rate": 9.536696216882209e-06, + "loss": 0.259, + "step": 5799 + }, + { + "epoch": 0.14677227522332162, + "grad_norm": 10.289484024047852, + "learning_rate": 9.536527402495037e-06, + "loss": 0.1986, + "step": 5800 + }, + { + "epoch": 0.14679758078801527, + "grad_norm": 15.636480331420898, + "learning_rate": 9.536358558852643e-06, + "loss": 0.3737, + "step": 5801 + }, + { + "epoch": 0.14682288635270896, + "grad_norm": 7.223194599151611, + "learning_rate": 9.536189685956119e-06, + "loss": 0.1897, + "step": 5802 + }, + { + "epoch": 0.14684819191740264, + "grad_norm": 7.128625869750977, + "learning_rate": 9.53602078380655e-06, + "loss": 0.2725, + "step": 5803 + }, + { + "epoch": 0.14687349748209633, + "grad_norm": 4.735269069671631, + "learning_rate": 9.535851852405027e-06, + "loss": 0.1735, + "step": 5804 + }, + { + "epoch": 0.14689880304678998, + "grad_norm": 5.678887844085693, + "learning_rate": 9.53568289175264e-06, + "loss": 0.2041, + "step": 5805 + }, + { + "epoch": 0.14692410861148367, + "grad_norm": 3.6094794273376465, + "learning_rate": 9.535513901850476e-06, + "loss": 0.1743, + "step": 5806 + }, + { + "epoch": 0.14694941417617735, + "grad_norm": 7.410816669464111, + "learning_rate": 9.535344882699628e-06, + "loss": 0.2886, + "step": 5807 + }, + { + "epoch": 0.146974719740871, + "grad_norm": 5.953976154327393, + "learning_rate": 9.535175834301183e-06, + "loss": 0.1981, + "step": 5808 + }, + { + "epoch": 0.1470000253055647, + "grad_norm": 3.7260448932647705, + "learning_rate": 9.535006756656233e-06, + "loss": 0.1413, + "step": 5809 + }, + { + "epoch": 0.14702533087025837, + "grad_norm": 5.822237968444824, + "learning_rate": 9.534837649765869e-06, + "loss": 0.0901, + "step": 5810 + }, + { + "epoch": 0.14705063643495206, + "grad_norm": 5.290262699127197, + "learning_rate": 9.53466851363118e-06, + "loss": 0.1902, + "step": 5811 + }, + { + "epoch": 0.14707594199964572, + "grad_norm": 4.92208194732666, + "learning_rate": 9.534499348253258e-06, + "loss": 0.1548, + "step": 5812 + }, + { + "epoch": 0.1471012475643394, + "grad_norm": 11.00516414642334, + "learning_rate": 9.534330153633193e-06, + "loss": 0.3433, + "step": 5813 + }, + { + "epoch": 0.14712655312903308, + "grad_norm": 6.152195930480957, + "learning_rate": 9.534160929772076e-06, + "loss": 0.2469, + "step": 5814 + }, + { + "epoch": 0.14715185869372674, + "grad_norm": 8.736042976379395, + "learning_rate": 9.533991676670997e-06, + "loss": 0.1892, + "step": 5815 + }, + { + "epoch": 0.14717716425842042, + "grad_norm": 5.598234176635742, + "learning_rate": 9.53382239433105e-06, + "loss": 0.1832, + "step": 5816 + }, + { + "epoch": 0.1472024698231141, + "grad_norm": 7.469332218170166, + "learning_rate": 9.533653082753326e-06, + "loss": 0.274, + "step": 5817 + }, + { + "epoch": 0.1472277753878078, + "grad_norm": 7.5972747802734375, + "learning_rate": 9.533483741938918e-06, + "loss": 0.2571, + "step": 5818 + }, + { + "epoch": 0.14725308095250145, + "grad_norm": 12.790627479553223, + "learning_rate": 9.533314371888915e-06, + "loss": 0.2429, + "step": 5819 + }, + { + "epoch": 0.14727838651719513, + "grad_norm": 6.745908737182617, + "learning_rate": 9.533144972604412e-06, + "loss": 0.2474, + "step": 5820 + }, + { + "epoch": 0.14730369208188882, + "grad_norm": 4.769514083862305, + "learning_rate": 9.532975544086501e-06, + "loss": 0.2543, + "step": 5821 + }, + { + "epoch": 0.14732899764658247, + "grad_norm": 7.377514839172363, + "learning_rate": 9.53280608633627e-06, + "loss": 0.1935, + "step": 5822 + }, + { + "epoch": 0.14735430321127616, + "grad_norm": 9.083264350891113, + "learning_rate": 9.532636599354819e-06, + "loss": 0.183, + "step": 5823 + }, + { + "epoch": 0.14737960877596984, + "grad_norm": 6.689853668212891, + "learning_rate": 9.532467083143238e-06, + "loss": 0.1951, + "step": 5824 + }, + { + "epoch": 0.14740491434066352, + "grad_norm": 3.8569729328155518, + "learning_rate": 9.532297537702619e-06, + "loss": 0.2479, + "step": 5825 + }, + { + "epoch": 0.14743021990535718, + "grad_norm": 12.268593788146973, + "learning_rate": 9.532127963034055e-06, + "loss": 0.2513, + "step": 5826 + }, + { + "epoch": 0.14745552547005086, + "grad_norm": 6.117206573486328, + "learning_rate": 9.531958359138642e-06, + "loss": 0.1988, + "step": 5827 + }, + { + "epoch": 0.14748083103474455, + "grad_norm": 6.406696796417236, + "learning_rate": 9.531788726017472e-06, + "loss": 0.1898, + "step": 5828 + }, + { + "epoch": 0.1475061365994382, + "grad_norm": 11.958508491516113, + "learning_rate": 9.53161906367164e-06, + "loss": 0.2544, + "step": 5829 + }, + { + "epoch": 0.1475314421641319, + "grad_norm": 4.6498260498046875, + "learning_rate": 9.53144937210224e-06, + "loss": 0.233, + "step": 5830 + }, + { + "epoch": 0.14755674772882557, + "grad_norm": 6.289292812347412, + "learning_rate": 9.531279651310366e-06, + "loss": 0.1925, + "step": 5831 + }, + { + "epoch": 0.14758205329351926, + "grad_norm": 7.969634056091309, + "learning_rate": 9.53110990129711e-06, + "loss": 0.1932, + "step": 5832 + }, + { + "epoch": 0.1476073588582129, + "grad_norm": 6.190080165863037, + "learning_rate": 9.530940122063572e-06, + "loss": 0.2729, + "step": 5833 + }, + { + "epoch": 0.1476326644229066, + "grad_norm": 5.8287811279296875, + "learning_rate": 9.530770313610845e-06, + "loss": 0.1585, + "step": 5834 + }, + { + "epoch": 0.14765796998760028, + "grad_norm": 3.6586122512817383, + "learning_rate": 9.53060047594002e-06, + "loss": 0.2191, + "step": 5835 + }, + { + "epoch": 0.14768327555229396, + "grad_norm": 4.515066623687744, + "learning_rate": 9.530430609052197e-06, + "loss": 0.1522, + "step": 5836 + }, + { + "epoch": 0.14770858111698762, + "grad_norm": 3.8746542930603027, + "learning_rate": 9.530260712948467e-06, + "loss": 0.1935, + "step": 5837 + }, + { + "epoch": 0.1477338866816813, + "grad_norm": 2.910515785217285, + "learning_rate": 9.53009078762993e-06, + "loss": 0.1785, + "step": 5838 + }, + { + "epoch": 0.147759192246375, + "grad_norm": 3.6298229694366455, + "learning_rate": 9.529920833097681e-06, + "loss": 0.1823, + "step": 5839 + }, + { + "epoch": 0.14778449781106864, + "grad_norm": 3.3601088523864746, + "learning_rate": 9.529750849352814e-06, + "loss": 0.174, + "step": 5840 + }, + { + "epoch": 0.14780980337576233, + "grad_norm": 7.902161598205566, + "learning_rate": 9.529580836396429e-06, + "loss": 0.1505, + "step": 5841 + }, + { + "epoch": 0.147835108940456, + "grad_norm": 7.845359802246094, + "learning_rate": 9.529410794229616e-06, + "loss": 0.1898, + "step": 5842 + }, + { + "epoch": 0.1478604145051497, + "grad_norm": 3.1397945880889893, + "learning_rate": 9.529240722853478e-06, + "loss": 0.1739, + "step": 5843 + }, + { + "epoch": 0.14788572006984335, + "grad_norm": 9.139513969421387, + "learning_rate": 9.529070622269109e-06, + "loss": 0.2466, + "step": 5844 + }, + { + "epoch": 0.14791102563453704, + "grad_norm": 4.046562194824219, + "learning_rate": 9.528900492477605e-06, + "loss": 0.1536, + "step": 5845 + }, + { + "epoch": 0.14793633119923072, + "grad_norm": 8.86604118347168, + "learning_rate": 9.528730333480064e-06, + "loss": 0.262, + "step": 5846 + }, + { + "epoch": 0.14796163676392438, + "grad_norm": 9.319292068481445, + "learning_rate": 9.528560145277585e-06, + "loss": 0.3477, + "step": 5847 + }, + { + "epoch": 0.14798694232861806, + "grad_norm": 6.83073091506958, + "learning_rate": 9.528389927871263e-06, + "loss": 0.216, + "step": 5848 + }, + { + "epoch": 0.14801224789331174, + "grad_norm": 7.148290634155273, + "learning_rate": 9.528219681262196e-06, + "loss": 0.2353, + "step": 5849 + }, + { + "epoch": 0.14803755345800543, + "grad_norm": 6.010673522949219, + "learning_rate": 9.528049405451483e-06, + "loss": 0.0936, + "step": 5850 + }, + { + "epoch": 0.14806285902269908, + "grad_norm": 5.80493688583374, + "learning_rate": 9.527879100440222e-06, + "loss": 0.1867, + "step": 5851 + }, + { + "epoch": 0.14808816458739277, + "grad_norm": 10.678481101989746, + "learning_rate": 9.527708766229511e-06, + "loss": 0.2412, + "step": 5852 + }, + { + "epoch": 0.14811347015208645, + "grad_norm": 5.5629987716674805, + "learning_rate": 9.527538402820447e-06, + "loss": 0.194, + "step": 5853 + }, + { + "epoch": 0.1481387757167801, + "grad_norm": 9.911746978759766, + "learning_rate": 9.527368010214132e-06, + "loss": 0.2146, + "step": 5854 + }, + { + "epoch": 0.1481640812814738, + "grad_norm": 3.01873779296875, + "learning_rate": 9.527197588411662e-06, + "loss": 0.1492, + "step": 5855 + }, + { + "epoch": 0.14818938684616748, + "grad_norm": 4.958164215087891, + "learning_rate": 9.527027137414137e-06, + "loss": 0.173, + "step": 5856 + }, + { + "epoch": 0.14821469241086116, + "grad_norm": 6.919050216674805, + "learning_rate": 9.526856657222657e-06, + "loss": 0.2313, + "step": 5857 + }, + { + "epoch": 0.14823999797555482, + "grad_norm": 13.886204719543457, + "learning_rate": 9.526686147838318e-06, + "loss": 0.2456, + "step": 5858 + }, + { + "epoch": 0.1482653035402485, + "grad_norm": 6.707104206085205, + "learning_rate": 9.526515609262224e-06, + "loss": 0.2062, + "step": 5859 + }, + { + "epoch": 0.14829060910494218, + "grad_norm": 4.966984272003174, + "learning_rate": 9.526345041495474e-06, + "loss": 0.1472, + "step": 5860 + }, + { + "epoch": 0.14831591466963584, + "grad_norm": 25.83479118347168, + "learning_rate": 9.526174444539163e-06, + "loss": 0.2831, + "step": 5861 + }, + { + "epoch": 0.14834122023432952, + "grad_norm": 7.402118682861328, + "learning_rate": 9.526003818394399e-06, + "loss": 0.2207, + "step": 5862 + }, + { + "epoch": 0.1483665257990232, + "grad_norm": 3.6912741661071777, + "learning_rate": 9.525833163062275e-06, + "loss": 0.1212, + "step": 5863 + }, + { + "epoch": 0.1483918313637169, + "grad_norm": 4.717308044433594, + "learning_rate": 9.525662478543895e-06, + "loss": 0.139, + "step": 5864 + }, + { + "epoch": 0.14841713692841055, + "grad_norm": 11.708765029907227, + "learning_rate": 9.525491764840362e-06, + "loss": 0.2231, + "step": 5865 + }, + { + "epoch": 0.14844244249310423, + "grad_norm": 10.543684959411621, + "learning_rate": 9.525321021952774e-06, + "loss": 0.3881, + "step": 5866 + }, + { + "epoch": 0.14846774805779792, + "grad_norm": 4.274234294891357, + "learning_rate": 9.52515024988223e-06, + "loss": 0.1278, + "step": 5867 + }, + { + "epoch": 0.14849305362249157, + "grad_norm": 6.5415120124816895, + "learning_rate": 9.524979448629837e-06, + "loss": 0.2445, + "step": 5868 + }, + { + "epoch": 0.14851835918718526, + "grad_norm": 5.6102752685546875, + "learning_rate": 9.52480861819669e-06, + "loss": 0.2991, + "step": 5869 + }, + { + "epoch": 0.14854366475187894, + "grad_norm": 6.456179141998291, + "learning_rate": 9.524637758583896e-06, + "loss": 0.2113, + "step": 5870 + }, + { + "epoch": 0.14856897031657262, + "grad_norm": 6.478338241577148, + "learning_rate": 9.524466869792555e-06, + "loss": 0.1089, + "step": 5871 + }, + { + "epoch": 0.14859427588126628, + "grad_norm": 5.167032241821289, + "learning_rate": 9.524295951823768e-06, + "loss": 0.1748, + "step": 5872 + }, + { + "epoch": 0.14861958144595996, + "grad_norm": 4.476031303405762, + "learning_rate": 9.524125004678637e-06, + "loss": 0.2131, + "step": 5873 + }, + { + "epoch": 0.14864488701065365, + "grad_norm": 4.367677688598633, + "learning_rate": 9.523954028358266e-06, + "loss": 0.2368, + "step": 5874 + }, + { + "epoch": 0.14867019257534733, + "grad_norm": 8.745646476745605, + "learning_rate": 9.523783022863759e-06, + "loss": 0.3463, + "step": 5875 + }, + { + "epoch": 0.148695498140041, + "grad_norm": 9.48337173461914, + "learning_rate": 9.523611988196216e-06, + "loss": 0.2919, + "step": 5876 + }, + { + "epoch": 0.14872080370473467, + "grad_norm": 14.525253295898438, + "learning_rate": 9.523440924356739e-06, + "loss": 0.2482, + "step": 5877 + }, + { + "epoch": 0.14874610926942836, + "grad_norm": 5.097122669219971, + "learning_rate": 9.523269831346433e-06, + "loss": 0.1174, + "step": 5878 + }, + { + "epoch": 0.148771414834122, + "grad_norm": 6.132641315460205, + "learning_rate": 9.523098709166404e-06, + "loss": 0.1855, + "step": 5879 + }, + { + "epoch": 0.1487967203988157, + "grad_norm": 5.698124408721924, + "learning_rate": 9.522927557817749e-06, + "loss": 0.131, + "step": 5880 + }, + { + "epoch": 0.14882202596350938, + "grad_norm": 6.57286262512207, + "learning_rate": 9.522756377301578e-06, + "loss": 0.2686, + "step": 5881 + }, + { + "epoch": 0.14884733152820306, + "grad_norm": 4.584482192993164, + "learning_rate": 9.522585167618992e-06, + "loss": 0.1615, + "step": 5882 + }, + { + "epoch": 0.14887263709289672, + "grad_norm": 8.742151260375977, + "learning_rate": 9.522413928771094e-06, + "loss": 0.2159, + "step": 5883 + }, + { + "epoch": 0.1488979426575904, + "grad_norm": 6.410709857940674, + "learning_rate": 9.522242660758993e-06, + "loss": 0.2407, + "step": 5884 + }, + { + "epoch": 0.1489232482222841, + "grad_norm": 9.53237247467041, + "learning_rate": 9.522071363583787e-06, + "loss": 0.29, + "step": 5885 + }, + { + "epoch": 0.14894855378697774, + "grad_norm": 4.168550968170166, + "learning_rate": 9.521900037246586e-06, + "loss": 0.2146, + "step": 5886 + }, + { + "epoch": 0.14897385935167143, + "grad_norm": 8.521587371826172, + "learning_rate": 9.521728681748492e-06, + "loss": 0.189, + "step": 5887 + }, + { + "epoch": 0.1489991649163651, + "grad_norm": 6.62928581237793, + "learning_rate": 9.52155729709061e-06, + "loss": 0.3269, + "step": 5888 + }, + { + "epoch": 0.1490244704810588, + "grad_norm": 2.893402099609375, + "learning_rate": 9.521385883274049e-06, + "loss": 0.1746, + "step": 5889 + }, + { + "epoch": 0.14904977604575245, + "grad_norm": 3.3061459064483643, + "learning_rate": 9.52121444029991e-06, + "loss": 0.1394, + "step": 5890 + }, + { + "epoch": 0.14907508161044614, + "grad_norm": 12.603063583374023, + "learning_rate": 9.521042968169298e-06, + "loss": 0.3189, + "step": 5891 + }, + { + "epoch": 0.14910038717513982, + "grad_norm": 4.824406147003174, + "learning_rate": 9.520871466883323e-06, + "loss": 0.2736, + "step": 5892 + }, + { + "epoch": 0.14912569273983348, + "grad_norm": 6.358041763305664, + "learning_rate": 9.52069993644309e-06, + "loss": 0.2066, + "step": 5893 + }, + { + "epoch": 0.14915099830452716, + "grad_norm": 7.838067531585693, + "learning_rate": 9.520528376849701e-06, + "loss": 0.2406, + "step": 5894 + }, + { + "epoch": 0.14917630386922084, + "grad_norm": 9.752437591552734, + "learning_rate": 9.520356788104268e-06, + "loss": 0.27, + "step": 5895 + }, + { + "epoch": 0.14920160943391453, + "grad_norm": 5.257207870483398, + "learning_rate": 9.520185170207893e-06, + "loss": 0.1871, + "step": 5896 + }, + { + "epoch": 0.14922691499860818, + "grad_norm": 2.8319790363311768, + "learning_rate": 9.520013523161685e-06, + "loss": 0.172, + "step": 5897 + }, + { + "epoch": 0.14925222056330187, + "grad_norm": 4.30122184753418, + "learning_rate": 9.51984184696675e-06, + "loss": 0.1535, + "step": 5898 + }, + { + "epoch": 0.14927752612799555, + "grad_norm": 4.33198881149292, + "learning_rate": 9.519670141624198e-06, + "loss": 0.1515, + "step": 5899 + }, + { + "epoch": 0.1493028316926892, + "grad_norm": 3.8703224658966064, + "learning_rate": 9.519498407135133e-06, + "loss": 0.1908, + "step": 5900 + }, + { + "epoch": 0.1493281372573829, + "grad_norm": 9.158268928527832, + "learning_rate": 9.519326643500662e-06, + "loss": 0.1663, + "step": 5901 + }, + { + "epoch": 0.14935344282207658, + "grad_norm": 9.11315631866455, + "learning_rate": 9.519154850721897e-06, + "loss": 0.1766, + "step": 5902 + }, + { + "epoch": 0.14937874838677026, + "grad_norm": 3.484975814819336, + "learning_rate": 9.51898302879994e-06, + "loss": 0.1542, + "step": 5903 + }, + { + "epoch": 0.14940405395146392, + "grad_norm": 5.113791465759277, + "learning_rate": 9.518811177735903e-06, + "loss": 0.2113, + "step": 5904 + }, + { + "epoch": 0.1494293595161576, + "grad_norm": 6.357874393463135, + "learning_rate": 9.518639297530894e-06, + "loss": 0.1858, + "step": 5905 + }, + { + "epoch": 0.14945466508085128, + "grad_norm": 8.740901947021484, + "learning_rate": 9.51846738818602e-06, + "loss": 0.262, + "step": 5906 + }, + { + "epoch": 0.14947997064554497, + "grad_norm": 8.399864196777344, + "learning_rate": 9.51829544970239e-06, + "loss": 0.1534, + "step": 5907 + }, + { + "epoch": 0.14950527621023862, + "grad_norm": 4.073001861572266, + "learning_rate": 9.518123482081113e-06, + "loss": 0.1739, + "step": 5908 + }, + { + "epoch": 0.1495305817749323, + "grad_norm": 20.01066780090332, + "learning_rate": 9.517951485323301e-06, + "loss": 0.3415, + "step": 5909 + }, + { + "epoch": 0.149555887339626, + "grad_norm": 6.514890193939209, + "learning_rate": 9.517779459430057e-06, + "loss": 0.2425, + "step": 5910 + }, + { + "epoch": 0.14958119290431965, + "grad_norm": 6.674981117248535, + "learning_rate": 9.517607404402495e-06, + "loss": 0.2873, + "step": 5911 + }, + { + "epoch": 0.14960649846901333, + "grad_norm": 12.19666576385498, + "learning_rate": 9.517435320241723e-06, + "loss": 0.1452, + "step": 5912 + }, + { + "epoch": 0.14963180403370702, + "grad_norm": 7.010603904724121, + "learning_rate": 9.51726320694885e-06, + "loss": 0.1982, + "step": 5913 + }, + { + "epoch": 0.1496571095984007, + "grad_norm": 4.622838020324707, + "learning_rate": 9.517091064524987e-06, + "loss": 0.1518, + "step": 5914 + }, + { + "epoch": 0.14968241516309436, + "grad_norm": 4.38731050491333, + "learning_rate": 9.516918892971245e-06, + "loss": 0.1444, + "step": 5915 + }, + { + "epoch": 0.14970772072778804, + "grad_norm": 15.553821563720703, + "learning_rate": 9.516746692288734e-06, + "loss": 0.1943, + "step": 5916 + }, + { + "epoch": 0.14973302629248172, + "grad_norm": 3.9579458236694336, + "learning_rate": 9.516574462478563e-06, + "loss": 0.1159, + "step": 5917 + }, + { + "epoch": 0.14975833185717538, + "grad_norm": 11.678235054016113, + "learning_rate": 9.516402203541842e-06, + "loss": 0.4226, + "step": 5918 + }, + { + "epoch": 0.14978363742186906, + "grad_norm": 4.23512601852417, + "learning_rate": 9.516229915479684e-06, + "loss": 0.1398, + "step": 5919 + }, + { + "epoch": 0.14980894298656275, + "grad_norm": 21.59330177307129, + "learning_rate": 9.5160575982932e-06, + "loss": 0.3722, + "step": 5920 + }, + { + "epoch": 0.14983424855125643, + "grad_norm": 6.6234211921691895, + "learning_rate": 9.515885251983502e-06, + "loss": 0.2467, + "step": 5921 + }, + { + "epoch": 0.1498595541159501, + "grad_norm": 10.505328178405762, + "learning_rate": 9.515712876551697e-06, + "loss": 0.2442, + "step": 5922 + }, + { + "epoch": 0.14988485968064377, + "grad_norm": 5.289226055145264, + "learning_rate": 9.515540471998901e-06, + "loss": 0.2405, + "step": 5923 + }, + { + "epoch": 0.14991016524533746, + "grad_norm": 4.628262996673584, + "learning_rate": 9.515368038326224e-06, + "loss": 0.1724, + "step": 5924 + }, + { + "epoch": 0.1499354708100311, + "grad_norm": 4.736710071563721, + "learning_rate": 9.515195575534779e-06, + "loss": 0.2185, + "step": 5925 + }, + { + "epoch": 0.1499607763747248, + "grad_norm": 4.446347236633301, + "learning_rate": 9.515023083625678e-06, + "loss": 0.209, + "step": 5926 + }, + { + "epoch": 0.14998608193941848, + "grad_norm": 5.783327579498291, + "learning_rate": 9.514850562600033e-06, + "loss": 0.183, + "step": 5927 + }, + { + "epoch": 0.15001138750411216, + "grad_norm": 3.6210522651672363, + "learning_rate": 9.514678012458955e-06, + "loss": 0.1654, + "step": 5928 + }, + { + "epoch": 0.15003669306880582, + "grad_norm": 3.3472766876220703, + "learning_rate": 9.51450543320356e-06, + "loss": 0.1271, + "step": 5929 + }, + { + "epoch": 0.1500619986334995, + "grad_norm": 8.138139724731445, + "learning_rate": 9.514332824834959e-06, + "loss": 0.2918, + "step": 5930 + }, + { + "epoch": 0.1500873041981932, + "grad_norm": 7.80588436126709, + "learning_rate": 9.514160187354264e-06, + "loss": 0.2093, + "step": 5931 + }, + { + "epoch": 0.15011260976288684, + "grad_norm": 4.923759937286377, + "learning_rate": 9.513987520762592e-06, + "loss": 0.1727, + "step": 5932 + }, + { + "epoch": 0.15013791532758053, + "grad_norm": 11.189945220947266, + "learning_rate": 9.513814825061052e-06, + "loss": 0.1296, + "step": 5933 + }, + { + "epoch": 0.1501632208922742, + "grad_norm": 6.945481777191162, + "learning_rate": 9.51364210025076e-06, + "loss": 0.2669, + "step": 5934 + }, + { + "epoch": 0.1501885264569679, + "grad_norm": 5.230526447296143, + "learning_rate": 9.51346934633283e-06, + "loss": 0.1751, + "step": 5935 + }, + { + "epoch": 0.15021383202166155, + "grad_norm": 9.092663764953613, + "learning_rate": 9.513296563308375e-06, + "loss": 0.2728, + "step": 5936 + }, + { + "epoch": 0.15023913758635524, + "grad_norm": 7.8595991134643555, + "learning_rate": 9.513123751178513e-06, + "loss": 0.3119, + "step": 5937 + }, + { + "epoch": 0.15026444315104892, + "grad_norm": 7.878296375274658, + "learning_rate": 9.512950909944353e-06, + "loss": 0.2262, + "step": 5938 + }, + { + "epoch": 0.1502897487157426, + "grad_norm": 5.668768405914307, + "learning_rate": 9.512778039607013e-06, + "loss": 0.221, + "step": 5939 + }, + { + "epoch": 0.15031505428043626, + "grad_norm": 5.880731105804443, + "learning_rate": 9.512605140167606e-06, + "loss": 0.177, + "step": 5940 + }, + { + "epoch": 0.15034035984512994, + "grad_norm": 7.954769134521484, + "learning_rate": 9.512432211627248e-06, + "loss": 0.1373, + "step": 5941 + }, + { + "epoch": 0.15036566540982363, + "grad_norm": 10.129841804504395, + "learning_rate": 9.512259253987056e-06, + "loss": 0.2292, + "step": 5942 + }, + { + "epoch": 0.15039097097451729, + "grad_norm": 6.320976734161377, + "learning_rate": 9.51208626724814e-06, + "loss": 0.2757, + "step": 5943 + }, + { + "epoch": 0.15041627653921097, + "grad_norm": 5.915867328643799, + "learning_rate": 9.51191325141162e-06, + "loss": 0.2088, + "step": 5944 + }, + { + "epoch": 0.15044158210390465, + "grad_norm": 4.532639980316162, + "learning_rate": 9.511740206478613e-06, + "loss": 0.2103, + "step": 5945 + }, + { + "epoch": 0.15046688766859834, + "grad_norm": 4.163997173309326, + "learning_rate": 9.51156713245023e-06, + "loss": 0.1917, + "step": 5946 + }, + { + "epoch": 0.150492193233292, + "grad_norm": 6.745870113372803, + "learning_rate": 9.511394029327591e-06, + "loss": 0.2164, + "step": 5947 + }, + { + "epoch": 0.15051749879798568, + "grad_norm": 9.57485580444336, + "learning_rate": 9.51122089711181e-06, + "loss": 0.2142, + "step": 5948 + }, + { + "epoch": 0.15054280436267936, + "grad_norm": 10.303861618041992, + "learning_rate": 9.511047735804007e-06, + "loss": 0.1515, + "step": 5949 + }, + { + "epoch": 0.15056810992737302, + "grad_norm": 17.515398025512695, + "learning_rate": 9.510874545405294e-06, + "loss": 0.2123, + "step": 5950 + }, + { + "epoch": 0.1505934154920667, + "grad_norm": 5.14113187789917, + "learning_rate": 9.510701325916792e-06, + "loss": 0.2168, + "step": 5951 + }, + { + "epoch": 0.15061872105676039, + "grad_norm": 8.863405227661133, + "learning_rate": 9.510528077339614e-06, + "loss": 0.3044, + "step": 5952 + }, + { + "epoch": 0.15064402662145407, + "grad_norm": 4.835205078125, + "learning_rate": 9.510354799674882e-06, + "loss": 0.1471, + "step": 5953 + }, + { + "epoch": 0.15066933218614773, + "grad_norm": 5.174808502197266, + "learning_rate": 9.510181492923709e-06, + "loss": 0.2032, + "step": 5954 + }, + { + "epoch": 0.1506946377508414, + "grad_norm": 6.570454120635986, + "learning_rate": 9.510008157087214e-06, + "loss": 0.2034, + "step": 5955 + }, + { + "epoch": 0.1507199433155351, + "grad_norm": 4.943114757537842, + "learning_rate": 9.509834792166517e-06, + "loss": 0.1613, + "step": 5956 + }, + { + "epoch": 0.15074524888022875, + "grad_norm": 4.606504440307617, + "learning_rate": 9.509661398162733e-06, + "loss": 0.1518, + "step": 5957 + }, + { + "epoch": 0.15077055444492243, + "grad_norm": 12.562688827514648, + "learning_rate": 9.509487975076982e-06, + "loss": 0.3241, + "step": 5958 + }, + { + "epoch": 0.15079586000961612, + "grad_norm": 3.233112096786499, + "learning_rate": 9.509314522910383e-06, + "loss": 0.1698, + "step": 5959 + }, + { + "epoch": 0.1508211655743098, + "grad_norm": 5.030848503112793, + "learning_rate": 9.509141041664052e-06, + "loss": 0.2405, + "step": 5960 + }, + { + "epoch": 0.15084647113900346, + "grad_norm": 5.1836934089660645, + "learning_rate": 9.508967531339109e-06, + "loss": 0.1562, + "step": 5961 + }, + { + "epoch": 0.15087177670369714, + "grad_norm": 5.786949157714844, + "learning_rate": 9.508793991936673e-06, + "loss": 0.1649, + "step": 5962 + }, + { + "epoch": 0.15089708226839083, + "grad_norm": 6.176273345947266, + "learning_rate": 9.508620423457864e-06, + "loss": 0.2051, + "step": 5963 + }, + { + "epoch": 0.15092238783308448, + "grad_norm": 5.8518524169921875, + "learning_rate": 9.508446825903801e-06, + "loss": 0.1526, + "step": 5964 + }, + { + "epoch": 0.15094769339777817, + "grad_norm": 3.075526475906372, + "learning_rate": 9.508273199275603e-06, + "loss": 0.1066, + "step": 5965 + }, + { + "epoch": 0.15097299896247185, + "grad_norm": 7.133418560028076, + "learning_rate": 9.50809954357439e-06, + "loss": 0.2866, + "step": 5966 + }, + { + "epoch": 0.15099830452716553, + "grad_norm": 6.7122087478637695, + "learning_rate": 9.507925858801282e-06, + "loss": 0.1718, + "step": 5967 + }, + { + "epoch": 0.1510236100918592, + "grad_norm": 5.040565490722656, + "learning_rate": 9.507752144957396e-06, + "loss": 0.2246, + "step": 5968 + }, + { + "epoch": 0.15104891565655287, + "grad_norm": 3.943077802658081, + "learning_rate": 9.507578402043856e-06, + "loss": 0.167, + "step": 5969 + }, + { + "epoch": 0.15107422122124656, + "grad_norm": 4.620900630950928, + "learning_rate": 9.507404630061782e-06, + "loss": 0.1158, + "step": 5970 + }, + { + "epoch": 0.15109952678594024, + "grad_norm": 7.322348117828369, + "learning_rate": 9.507230829012296e-06, + "loss": 0.2068, + "step": 5971 + }, + { + "epoch": 0.1511248323506339, + "grad_norm": 18.966899871826172, + "learning_rate": 9.507056998896514e-06, + "loss": 0.2587, + "step": 5972 + }, + { + "epoch": 0.15115013791532758, + "grad_norm": 4.342460632324219, + "learning_rate": 9.50688313971556e-06, + "loss": 0.187, + "step": 5973 + }, + { + "epoch": 0.15117544348002127, + "grad_norm": 5.503500938415527, + "learning_rate": 9.506709251470555e-06, + "loss": 0.1799, + "step": 5974 + }, + { + "epoch": 0.15120074904471492, + "grad_norm": 4.980278968811035, + "learning_rate": 9.506535334162621e-06, + "loss": 0.228, + "step": 5975 + }, + { + "epoch": 0.1512260546094086, + "grad_norm": 5.339017868041992, + "learning_rate": 9.506361387792879e-06, + "loss": 0.1513, + "step": 5976 + }, + { + "epoch": 0.1512513601741023, + "grad_norm": 4.321683883666992, + "learning_rate": 9.506187412362449e-06, + "loss": 0.1762, + "step": 5977 + }, + { + "epoch": 0.15127666573879597, + "grad_norm": 5.178703308105469, + "learning_rate": 9.506013407872457e-06, + "loss": 0.2037, + "step": 5978 + }, + { + "epoch": 0.15130197130348963, + "grad_norm": 6.5384039878845215, + "learning_rate": 9.505839374324022e-06, + "loss": 0.1379, + "step": 5979 + }, + { + "epoch": 0.1513272768681833, + "grad_norm": 5.72824239730835, + "learning_rate": 9.505665311718266e-06, + "loss": 0.1976, + "step": 5980 + }, + { + "epoch": 0.151352582432877, + "grad_norm": 8.386262893676758, + "learning_rate": 9.505491220056313e-06, + "loss": 0.2034, + "step": 5981 + }, + { + "epoch": 0.15137788799757065, + "grad_norm": 4.94400691986084, + "learning_rate": 9.505317099339286e-06, + "loss": 0.1633, + "step": 5982 + }, + { + "epoch": 0.15140319356226434, + "grad_norm": 12.898332595825195, + "learning_rate": 9.505142949568306e-06, + "loss": 0.2935, + "step": 5983 + }, + { + "epoch": 0.15142849912695802, + "grad_norm": 4.45135498046875, + "learning_rate": 9.504968770744498e-06, + "loss": 0.1908, + "step": 5984 + }, + { + "epoch": 0.1514538046916517, + "grad_norm": 5.737434387207031, + "learning_rate": 9.504794562868983e-06, + "loss": 0.1759, + "step": 5985 + }, + { + "epoch": 0.15147911025634536, + "grad_norm": 11.685519218444824, + "learning_rate": 9.504620325942888e-06, + "loss": 0.2039, + "step": 5986 + }, + { + "epoch": 0.15150441582103905, + "grad_norm": 8.032222747802734, + "learning_rate": 9.504446059967333e-06, + "loss": 0.2329, + "step": 5987 + }, + { + "epoch": 0.15152972138573273, + "grad_norm": 6.1496262550354, + "learning_rate": 9.504271764943444e-06, + "loss": 0.2043, + "step": 5988 + }, + { + "epoch": 0.15155502695042639, + "grad_norm": 4.373764514923096, + "learning_rate": 9.504097440872343e-06, + "loss": 0.1764, + "step": 5989 + }, + { + "epoch": 0.15158033251512007, + "grad_norm": 7.597569465637207, + "learning_rate": 9.503923087755156e-06, + "loss": 0.191, + "step": 5990 + }, + { + "epoch": 0.15160563807981375, + "grad_norm": 7.377484321594238, + "learning_rate": 9.503748705593007e-06, + "loss": 0.162, + "step": 5991 + }, + { + "epoch": 0.15163094364450744, + "grad_norm": 6.458983421325684, + "learning_rate": 9.503574294387021e-06, + "loss": 0.1531, + "step": 5992 + }, + { + "epoch": 0.1516562492092011, + "grad_norm": 9.769317626953125, + "learning_rate": 9.503399854138322e-06, + "loss": 0.2418, + "step": 5993 + }, + { + "epoch": 0.15168155477389478, + "grad_norm": 7.822906970977783, + "learning_rate": 9.503225384848033e-06, + "loss": 0.1684, + "step": 5994 + }, + { + "epoch": 0.15170686033858846, + "grad_norm": 8.957422256469727, + "learning_rate": 9.503050886517285e-06, + "loss": 0.2068, + "step": 5995 + }, + { + "epoch": 0.15173216590328212, + "grad_norm": 7.475324630737305, + "learning_rate": 9.502876359147195e-06, + "loss": 0.1811, + "step": 5996 + }, + { + "epoch": 0.1517574714679758, + "grad_norm": 16.374601364135742, + "learning_rate": 9.502701802738897e-06, + "loss": 0.2354, + "step": 5997 + }, + { + "epoch": 0.15178277703266949, + "grad_norm": 5.083106994628906, + "learning_rate": 9.502527217293509e-06, + "loss": 0.188, + "step": 5998 + }, + { + "epoch": 0.15180808259736317, + "grad_norm": 4.80810546875, + "learning_rate": 9.502352602812164e-06, + "loss": 0.1707, + "step": 5999 + }, + { + "epoch": 0.15183338816205683, + "grad_norm": 11.649181365966797, + "learning_rate": 9.502177959295983e-06, + "loss": 0.3375, + "step": 6000 + }, + { + "epoch": 0.1518586937267505, + "grad_norm": 4.825132369995117, + "learning_rate": 9.502003286746092e-06, + "loss": 0.1817, + "step": 6001 + }, + { + "epoch": 0.1518839992914442, + "grad_norm": 3.06528639793396, + "learning_rate": 9.501828585163622e-06, + "loss": 0.1463, + "step": 6002 + }, + { + "epoch": 0.15190930485613788, + "grad_norm": 4.8180766105651855, + "learning_rate": 9.501653854549694e-06, + "loss": 0.1535, + "step": 6003 + }, + { + "epoch": 0.15193461042083153, + "grad_norm": 19.088176727294922, + "learning_rate": 9.50147909490544e-06, + "loss": 0.326, + "step": 6004 + }, + { + "epoch": 0.15195991598552522, + "grad_norm": 6.293423652648926, + "learning_rate": 9.501304306231985e-06, + "loss": 0.1681, + "step": 6005 + }, + { + "epoch": 0.1519852215502189, + "grad_norm": 3.1601004600524902, + "learning_rate": 9.501129488530454e-06, + "loss": 0.1831, + "step": 6006 + }, + { + "epoch": 0.15201052711491256, + "grad_norm": 17.73317527770996, + "learning_rate": 9.500954641801979e-06, + "loss": 0.2003, + "step": 6007 + }, + { + "epoch": 0.15203583267960624, + "grad_norm": 5.27238130569458, + "learning_rate": 9.500779766047681e-06, + "loss": 0.1661, + "step": 6008 + }, + { + "epoch": 0.15206113824429993, + "grad_norm": 6.739080905914307, + "learning_rate": 9.500604861268694e-06, + "loss": 0.1765, + "step": 6009 + }, + { + "epoch": 0.1520864438089936, + "grad_norm": 7.623315811157227, + "learning_rate": 9.500429927466141e-06, + "loss": 0.1603, + "step": 6010 + }, + { + "epoch": 0.15211174937368727, + "grad_norm": 19.9696044921875, + "learning_rate": 9.500254964641157e-06, + "loss": 0.1619, + "step": 6011 + }, + { + "epoch": 0.15213705493838095, + "grad_norm": 4.388583183288574, + "learning_rate": 9.500079972794863e-06, + "loss": 0.1683, + "step": 6012 + }, + { + "epoch": 0.15216236050307463, + "grad_norm": 7.144293785095215, + "learning_rate": 9.499904951928389e-06, + "loss": 0.2629, + "step": 6013 + }, + { + "epoch": 0.1521876660677683, + "grad_norm": 3.285365581512451, + "learning_rate": 9.499729902042868e-06, + "loss": 0.1609, + "step": 6014 + }, + { + "epoch": 0.15221297163246197, + "grad_norm": 5.379047393798828, + "learning_rate": 9.499554823139422e-06, + "loss": 0.125, + "step": 6015 + }, + { + "epoch": 0.15223827719715566, + "grad_norm": 2.5644149780273438, + "learning_rate": 9.499379715219188e-06, + "loss": 0.1062, + "step": 6016 + }, + { + "epoch": 0.15226358276184934, + "grad_norm": 12.724791526794434, + "learning_rate": 9.49920457828329e-06, + "loss": 0.2514, + "step": 6017 + }, + { + "epoch": 0.152288888326543, + "grad_norm": 9.77806282043457, + "learning_rate": 9.499029412332856e-06, + "loss": 0.2031, + "step": 6018 + }, + { + "epoch": 0.15231419389123668, + "grad_norm": 37.6979866027832, + "learning_rate": 9.498854217369021e-06, + "loss": 0.4332, + "step": 6019 + }, + { + "epoch": 0.15233949945593037, + "grad_norm": 2.470508575439453, + "learning_rate": 9.498678993392911e-06, + "loss": 0.0953, + "step": 6020 + }, + { + "epoch": 0.15236480502062402, + "grad_norm": 7.551733016967773, + "learning_rate": 9.498503740405659e-06, + "loss": 0.1986, + "step": 6021 + }, + { + "epoch": 0.1523901105853177, + "grad_norm": 5.475766181945801, + "learning_rate": 9.49832845840839e-06, + "loss": 0.1797, + "step": 6022 + }, + { + "epoch": 0.1524154161500114, + "grad_norm": 4.495007038116455, + "learning_rate": 9.498153147402239e-06, + "loss": 0.1541, + "step": 6023 + }, + { + "epoch": 0.15244072171470507, + "grad_norm": 12.459345817565918, + "learning_rate": 9.497977807388334e-06, + "loss": 0.4264, + "step": 6024 + }, + { + "epoch": 0.15246602727939873, + "grad_norm": 5.35205078125, + "learning_rate": 9.497802438367808e-06, + "loss": 0.2364, + "step": 6025 + }, + { + "epoch": 0.15249133284409241, + "grad_norm": 18.486682891845703, + "learning_rate": 9.497627040341792e-06, + "loss": 0.7053, + "step": 6026 + }, + { + "epoch": 0.1525166384087861, + "grad_norm": 4.498195648193359, + "learning_rate": 9.497451613311413e-06, + "loss": 0.14, + "step": 6027 + }, + { + "epoch": 0.15254194397347975, + "grad_norm": 3.6466526985168457, + "learning_rate": 9.497276157277809e-06, + "loss": 0.1888, + "step": 6028 + }, + { + "epoch": 0.15256724953817344, + "grad_norm": 6.3784637451171875, + "learning_rate": 9.497100672242104e-06, + "loss": 0.2176, + "step": 6029 + }, + { + "epoch": 0.15259255510286712, + "grad_norm": 8.419543266296387, + "learning_rate": 9.496925158205435e-06, + "loss": 0.2826, + "step": 6030 + }, + { + "epoch": 0.1526178606675608, + "grad_norm": 4.092791557312012, + "learning_rate": 9.49674961516893e-06, + "loss": 0.1601, + "step": 6031 + }, + { + "epoch": 0.15264316623225446, + "grad_norm": 6.603612899780273, + "learning_rate": 9.496574043133725e-06, + "loss": 0.2017, + "step": 6032 + }, + { + "epoch": 0.15266847179694815, + "grad_norm": 3.2556071281433105, + "learning_rate": 9.49639844210095e-06, + "loss": 0.0959, + "step": 6033 + }, + { + "epoch": 0.15269377736164183, + "grad_norm": 7.388389587402344, + "learning_rate": 9.496222812071738e-06, + "loss": 0.2097, + "step": 6034 + }, + { + "epoch": 0.15271908292633551, + "grad_norm": 5.982498645782471, + "learning_rate": 9.496047153047223e-06, + "loss": 0.097, + "step": 6035 + }, + { + "epoch": 0.15274438849102917, + "grad_norm": 6.9642815589904785, + "learning_rate": 9.495871465028534e-06, + "loss": 0.2059, + "step": 6036 + }, + { + "epoch": 0.15276969405572285, + "grad_norm": 4.314016342163086, + "learning_rate": 9.495695748016806e-06, + "loss": 0.1574, + "step": 6037 + }, + { + "epoch": 0.15279499962041654, + "grad_norm": 4.366947174072266, + "learning_rate": 9.495520002013174e-06, + "loss": 0.1313, + "step": 6038 + }, + { + "epoch": 0.1528203051851102, + "grad_norm": 14.704797744750977, + "learning_rate": 9.49534422701877e-06, + "loss": 0.1904, + "step": 6039 + }, + { + "epoch": 0.15284561074980388, + "grad_norm": 6.739812850952148, + "learning_rate": 9.495168423034726e-06, + "loss": 0.1818, + "step": 6040 + }, + { + "epoch": 0.15287091631449756, + "grad_norm": 6.061249732971191, + "learning_rate": 9.494992590062178e-06, + "loss": 0.1749, + "step": 6041 + }, + { + "epoch": 0.15289622187919125, + "grad_norm": 4.033559322357178, + "learning_rate": 9.494816728102257e-06, + "loss": 0.1766, + "step": 6042 + }, + { + "epoch": 0.1529215274438849, + "grad_norm": 6.8535003662109375, + "learning_rate": 9.494640837156101e-06, + "loss": 0.2057, + "step": 6043 + }, + { + "epoch": 0.1529468330085786, + "grad_norm": 6.7177863121032715, + "learning_rate": 9.494464917224842e-06, + "loss": 0.2081, + "step": 6044 + }, + { + "epoch": 0.15297213857327227, + "grad_norm": 9.532546997070312, + "learning_rate": 9.494288968309612e-06, + "loss": 0.139, + "step": 6045 + }, + { + "epoch": 0.15299744413796593, + "grad_norm": 12.97177505493164, + "learning_rate": 9.494112990411553e-06, + "loss": 0.337, + "step": 6046 + }, + { + "epoch": 0.1530227497026596, + "grad_norm": 5.623614311218262, + "learning_rate": 9.493936983531793e-06, + "loss": 0.1507, + "step": 6047 + }, + { + "epoch": 0.1530480552673533, + "grad_norm": 4.793657302856445, + "learning_rate": 9.493760947671469e-06, + "loss": 0.1843, + "step": 6048 + }, + { + "epoch": 0.15307336083204698, + "grad_norm": 4.479475021362305, + "learning_rate": 9.493584882831717e-06, + "loss": 0.1771, + "step": 6049 + }, + { + "epoch": 0.15309866639674063, + "grad_norm": 9.069047927856445, + "learning_rate": 9.493408789013672e-06, + "loss": 0.2517, + "step": 6050 + }, + { + "epoch": 0.15312397196143432, + "grad_norm": 5.257626533508301, + "learning_rate": 9.493232666218469e-06, + "loss": 0.2374, + "step": 6051 + }, + { + "epoch": 0.153149277526128, + "grad_norm": 8.657352447509766, + "learning_rate": 9.493056514447245e-06, + "loss": 0.2029, + "step": 6052 + }, + { + "epoch": 0.15317458309082166, + "grad_norm": 6.30349063873291, + "learning_rate": 9.492880333701136e-06, + "loss": 0.1664, + "step": 6053 + }, + { + "epoch": 0.15319988865551534, + "grad_norm": 4.638247966766357, + "learning_rate": 9.492704123981275e-06, + "loss": 0.1799, + "step": 6054 + }, + { + "epoch": 0.15322519422020903, + "grad_norm": 5.839657783508301, + "learning_rate": 9.492527885288804e-06, + "loss": 0.2374, + "step": 6055 + }, + { + "epoch": 0.1532504997849027, + "grad_norm": 4.8268537521362305, + "learning_rate": 9.492351617624855e-06, + "loss": 0.1297, + "step": 6056 + }, + { + "epoch": 0.15327580534959637, + "grad_norm": 6.4018874168396, + "learning_rate": 9.492175320990566e-06, + "loss": 0.1701, + "step": 6057 + }, + { + "epoch": 0.15330111091429005, + "grad_norm": 8.018417358398438, + "learning_rate": 9.491998995387073e-06, + "loss": 0.2068, + "step": 6058 + }, + { + "epoch": 0.15332641647898373, + "grad_norm": 4.2851409912109375, + "learning_rate": 9.491822640815515e-06, + "loss": 0.1071, + "step": 6059 + }, + { + "epoch": 0.1533517220436774, + "grad_norm": 15.265859603881836, + "learning_rate": 9.491646257277027e-06, + "loss": 0.2059, + "step": 6060 + }, + { + "epoch": 0.15337702760837107, + "grad_norm": 9.610896110534668, + "learning_rate": 9.491469844772749e-06, + "loss": 0.2382, + "step": 6061 + }, + { + "epoch": 0.15340233317306476, + "grad_norm": 4.637914180755615, + "learning_rate": 9.491293403303819e-06, + "loss": 0.178, + "step": 6062 + }, + { + "epoch": 0.15342763873775844, + "grad_norm": 7.678048610687256, + "learning_rate": 9.491116932871371e-06, + "loss": 0.2117, + "step": 6063 + }, + { + "epoch": 0.1534529443024521, + "grad_norm": 6.747796058654785, + "learning_rate": 9.490940433476546e-06, + "loss": 0.2701, + "step": 6064 + }, + { + "epoch": 0.15347824986714578, + "grad_norm": 11.097407341003418, + "learning_rate": 9.49076390512048e-06, + "loss": 0.2937, + "step": 6065 + }, + { + "epoch": 0.15350355543183947, + "grad_norm": 11.435480117797852, + "learning_rate": 9.490587347804316e-06, + "loss": 0.183, + "step": 6066 + }, + { + "epoch": 0.15352886099653315, + "grad_norm": 6.435477256774902, + "learning_rate": 9.490410761529188e-06, + "loss": 0.1628, + "step": 6067 + }, + { + "epoch": 0.1535541665612268, + "grad_norm": 11.647188186645508, + "learning_rate": 9.490234146296236e-06, + "loss": 0.5031, + "step": 6068 + }, + { + "epoch": 0.1535794721259205, + "grad_norm": 5.895669937133789, + "learning_rate": 9.4900575021066e-06, + "loss": 0.2151, + "step": 6069 + }, + { + "epoch": 0.15360477769061417, + "grad_norm": 5.655144691467285, + "learning_rate": 9.489880828961417e-06, + "loss": 0.2166, + "step": 6070 + }, + { + "epoch": 0.15363008325530783, + "grad_norm": 5.486322402954102, + "learning_rate": 9.48970412686183e-06, + "loss": 0.1725, + "step": 6071 + }, + { + "epoch": 0.15365538882000151, + "grad_norm": 8.664081573486328, + "learning_rate": 9.489527395808976e-06, + "loss": 0.2579, + "step": 6072 + }, + { + "epoch": 0.1536806943846952, + "grad_norm": 5.732126235961914, + "learning_rate": 9.489350635803993e-06, + "loss": 0.1733, + "step": 6073 + }, + { + "epoch": 0.15370599994938888, + "grad_norm": 11.699235916137695, + "learning_rate": 9.489173846848023e-06, + "loss": 0.2425, + "step": 6074 + }, + { + "epoch": 0.15373130551408254, + "grad_norm": 8.732714653015137, + "learning_rate": 9.488997028942206e-06, + "loss": 0.2006, + "step": 6075 + }, + { + "epoch": 0.15375661107877622, + "grad_norm": 7.221104145050049, + "learning_rate": 9.488820182087683e-06, + "loss": 0.2249, + "step": 6076 + }, + { + "epoch": 0.1537819166434699, + "grad_norm": 6.875404357910156, + "learning_rate": 9.488643306285594e-06, + "loss": 0.2685, + "step": 6077 + }, + { + "epoch": 0.15380722220816356, + "grad_norm": 5.294724941253662, + "learning_rate": 9.488466401537078e-06, + "loss": 0.2323, + "step": 6078 + }, + { + "epoch": 0.15383252777285725, + "grad_norm": 15.64592456817627, + "learning_rate": 9.488289467843278e-06, + "loss": 0.352, + "step": 6079 + }, + { + "epoch": 0.15385783333755093, + "grad_norm": 4.44063663482666, + "learning_rate": 9.488112505205333e-06, + "loss": 0.1457, + "step": 6080 + }, + { + "epoch": 0.15388313890224461, + "grad_norm": 4.327639102935791, + "learning_rate": 9.487935513624386e-06, + "loss": 0.1489, + "step": 6081 + }, + { + "epoch": 0.15390844446693827, + "grad_norm": 4.105655193328857, + "learning_rate": 9.487758493101578e-06, + "loss": 0.1615, + "step": 6082 + }, + { + "epoch": 0.15393375003163196, + "grad_norm": 5.3733696937561035, + "learning_rate": 9.487581443638051e-06, + "loss": 0.152, + "step": 6083 + }, + { + "epoch": 0.15395905559632564, + "grad_norm": 2.9336133003234863, + "learning_rate": 9.487404365234945e-06, + "loss": 0.1683, + "step": 6084 + }, + { + "epoch": 0.1539843611610193, + "grad_norm": 9.522431373596191, + "learning_rate": 9.487227257893403e-06, + "loss": 0.1949, + "step": 6085 + }, + { + "epoch": 0.15400966672571298, + "grad_norm": 2.9196295738220215, + "learning_rate": 9.487050121614567e-06, + "loss": 0.1447, + "step": 6086 + }, + { + "epoch": 0.15403497229040666, + "grad_norm": 4.9311299324035645, + "learning_rate": 9.48687295639958e-06, + "loss": 0.2386, + "step": 6087 + }, + { + "epoch": 0.15406027785510035, + "grad_norm": 4.9618988037109375, + "learning_rate": 9.486695762249582e-06, + "loss": 0.173, + "step": 6088 + }, + { + "epoch": 0.154085583419794, + "grad_norm": 4.803892612457275, + "learning_rate": 9.486518539165721e-06, + "loss": 0.1991, + "step": 6089 + }, + { + "epoch": 0.1541108889844877, + "grad_norm": 16.730056762695312, + "learning_rate": 9.486341287149134e-06, + "loss": 0.2035, + "step": 6090 + }, + { + "epoch": 0.15413619454918137, + "grad_norm": 4.4595818519592285, + "learning_rate": 9.486164006200968e-06, + "loss": 0.1871, + "step": 6091 + }, + { + "epoch": 0.15416150011387503, + "grad_norm": 4.547821044921875, + "learning_rate": 9.485986696322364e-06, + "loss": 0.215, + "step": 6092 + }, + { + "epoch": 0.1541868056785687, + "grad_norm": 4.895432949066162, + "learning_rate": 9.485809357514466e-06, + "loss": 0.1347, + "step": 6093 + }, + { + "epoch": 0.1542121112432624, + "grad_norm": 5.701746940612793, + "learning_rate": 9.485631989778418e-06, + "loss": 0.2462, + "step": 6094 + }, + { + "epoch": 0.15423741680795608, + "grad_norm": 3.875042676925659, + "learning_rate": 9.485454593115363e-06, + "loss": 0.2086, + "step": 6095 + }, + { + "epoch": 0.15426272237264974, + "grad_norm": 8.26725959777832, + "learning_rate": 9.485277167526448e-06, + "loss": 0.2321, + "step": 6096 + }, + { + "epoch": 0.15428802793734342, + "grad_norm": 13.334552764892578, + "learning_rate": 9.485099713012813e-06, + "loss": 0.2251, + "step": 6097 + }, + { + "epoch": 0.1543133335020371, + "grad_norm": 6.723242282867432, + "learning_rate": 9.484922229575603e-06, + "loss": 0.2002, + "step": 6098 + }, + { + "epoch": 0.1543386390667308, + "grad_norm": 7.057797908782959, + "learning_rate": 9.484744717215967e-06, + "loss": 0.1373, + "step": 6099 + }, + { + "epoch": 0.15436394463142444, + "grad_norm": 11.628663063049316, + "learning_rate": 9.484567175935044e-06, + "loss": 0.2424, + "step": 6100 + }, + { + "epoch": 0.15438925019611813, + "grad_norm": 7.047635078430176, + "learning_rate": 9.484389605733984e-06, + "loss": 0.2062, + "step": 6101 + }, + { + "epoch": 0.1544145557608118, + "grad_norm": 7.666759014129639, + "learning_rate": 9.484212006613927e-06, + "loss": 0.1978, + "step": 6102 + }, + { + "epoch": 0.15443986132550547, + "grad_norm": 4.548895835876465, + "learning_rate": 9.484034378576022e-06, + "loss": 0.2433, + "step": 6103 + }, + { + "epoch": 0.15446516689019915, + "grad_norm": 5.850460529327393, + "learning_rate": 9.483856721621414e-06, + "loss": 0.2191, + "step": 6104 + }, + { + "epoch": 0.15449047245489284, + "grad_norm": 34.20394515991211, + "learning_rate": 9.483679035751248e-06, + "loss": 0.2246, + "step": 6105 + }, + { + "epoch": 0.15451577801958652, + "grad_norm": 5.971524238586426, + "learning_rate": 9.483501320966668e-06, + "loss": 0.2419, + "step": 6106 + }, + { + "epoch": 0.15454108358428018, + "grad_norm": 9.336790084838867, + "learning_rate": 9.483323577268824e-06, + "loss": 0.317, + "step": 6107 + }, + { + "epoch": 0.15456638914897386, + "grad_norm": 24.229917526245117, + "learning_rate": 9.483145804658859e-06, + "loss": 0.3512, + "step": 6108 + }, + { + "epoch": 0.15459169471366754, + "grad_norm": 3.8923230171203613, + "learning_rate": 9.482968003137921e-06, + "loss": 0.1822, + "step": 6109 + }, + { + "epoch": 0.1546170002783612, + "grad_norm": 4.475729465484619, + "learning_rate": 9.482790172707158e-06, + "loss": 0.1764, + "step": 6110 + }, + { + "epoch": 0.15464230584305488, + "grad_norm": 4.559519290924072, + "learning_rate": 9.482612313367712e-06, + "loss": 0.2585, + "step": 6111 + }, + { + "epoch": 0.15466761140774857, + "grad_norm": 5.232095241546631, + "learning_rate": 9.482434425120734e-06, + "loss": 0.1841, + "step": 6112 + }, + { + "epoch": 0.15469291697244225, + "grad_norm": 5.325733184814453, + "learning_rate": 9.482256507967371e-06, + "loss": 0.2722, + "step": 6113 + }, + { + "epoch": 0.1547182225371359, + "grad_norm": 11.889595985412598, + "learning_rate": 9.482078561908769e-06, + "loss": 0.3153, + "step": 6114 + }, + { + "epoch": 0.1547435281018296, + "grad_norm": 5.315407752990723, + "learning_rate": 9.481900586946077e-06, + "loss": 0.1512, + "step": 6115 + }, + { + "epoch": 0.15476883366652328, + "grad_norm": 5.740054130554199, + "learning_rate": 9.481722583080442e-06, + "loss": 0.1489, + "step": 6116 + }, + { + "epoch": 0.15479413923121693, + "grad_norm": 6.080564975738525, + "learning_rate": 9.481544550313009e-06, + "loss": 0.2117, + "step": 6117 + }, + { + "epoch": 0.15481944479591062, + "grad_norm": 3.862340211868286, + "learning_rate": 9.481366488644933e-06, + "loss": 0.1256, + "step": 6118 + }, + { + "epoch": 0.1548447503606043, + "grad_norm": 8.042943000793457, + "learning_rate": 9.481188398077354e-06, + "loss": 0.1843, + "step": 6119 + }, + { + "epoch": 0.15487005592529798, + "grad_norm": 13.210591316223145, + "learning_rate": 9.481010278611427e-06, + "loss": 0.1541, + "step": 6120 + }, + { + "epoch": 0.15489536148999164, + "grad_norm": 11.253837585449219, + "learning_rate": 9.480832130248297e-06, + "loss": 0.2189, + "step": 6121 + }, + { + "epoch": 0.15492066705468532, + "grad_norm": 5.282617092132568, + "learning_rate": 9.480653952989114e-06, + "loss": 0.274, + "step": 6122 + }, + { + "epoch": 0.154945972619379, + "grad_norm": 4.964048862457275, + "learning_rate": 9.480475746835028e-06, + "loss": 0.1797, + "step": 6123 + }, + { + "epoch": 0.15497127818407266, + "grad_norm": 9.292845726013184, + "learning_rate": 9.480297511787188e-06, + "loss": 0.3311, + "step": 6124 + }, + { + "epoch": 0.15499658374876635, + "grad_norm": 2.6862709522247314, + "learning_rate": 9.480119247846743e-06, + "loss": 0.1129, + "step": 6125 + }, + { + "epoch": 0.15502188931346003, + "grad_norm": 4.565157413482666, + "learning_rate": 9.479940955014838e-06, + "loss": 0.1958, + "step": 6126 + }, + { + "epoch": 0.15504719487815372, + "grad_norm": 9.349175453186035, + "learning_rate": 9.479762633292631e-06, + "loss": 0.2429, + "step": 6127 + }, + { + "epoch": 0.15507250044284737, + "grad_norm": 5.228845596313477, + "learning_rate": 9.479584282681269e-06, + "loss": 0.185, + "step": 6128 + }, + { + "epoch": 0.15509780600754106, + "grad_norm": 6.453636169433594, + "learning_rate": 9.4794059031819e-06, + "loss": 0.1873, + "step": 6129 + }, + { + "epoch": 0.15512311157223474, + "grad_norm": 4.473347187042236, + "learning_rate": 9.479227494795675e-06, + "loss": 0.181, + "step": 6130 + }, + { + "epoch": 0.15514841713692842, + "grad_norm": 4.106030464172363, + "learning_rate": 9.479049057523743e-06, + "loss": 0.1379, + "step": 6131 + }, + { + "epoch": 0.15517372270162208, + "grad_norm": 5.186275959014893, + "learning_rate": 9.478870591367261e-06, + "loss": 0.206, + "step": 6132 + }, + { + "epoch": 0.15519902826631576, + "grad_norm": 12.98534870147705, + "learning_rate": 9.478692096327373e-06, + "loss": 0.2175, + "step": 6133 + }, + { + "epoch": 0.15522433383100945, + "grad_norm": 8.937005043029785, + "learning_rate": 9.478513572405235e-06, + "loss": 0.2099, + "step": 6134 + }, + { + "epoch": 0.1552496393957031, + "grad_norm": 7.435971260070801, + "learning_rate": 9.478335019601994e-06, + "loss": 0.1992, + "step": 6135 + }, + { + "epoch": 0.1552749449603968, + "grad_norm": 7.557187080383301, + "learning_rate": 9.478156437918803e-06, + "loss": 0.1783, + "step": 6136 + }, + { + "epoch": 0.15530025052509047, + "grad_norm": 8.872733116149902, + "learning_rate": 9.477977827356816e-06, + "loss": 0.2963, + "step": 6137 + }, + { + "epoch": 0.15532555608978416, + "grad_norm": 7.038205623626709, + "learning_rate": 9.477799187917183e-06, + "loss": 0.1884, + "step": 6138 + }, + { + "epoch": 0.1553508616544778, + "grad_norm": 6.19425630569458, + "learning_rate": 9.477620519601054e-06, + "loss": 0.1764, + "step": 6139 + }, + { + "epoch": 0.1553761672191715, + "grad_norm": 3.3482754230499268, + "learning_rate": 9.477441822409585e-06, + "loss": 0.1587, + "step": 6140 + }, + { + "epoch": 0.15540147278386518, + "grad_norm": 5.280039310455322, + "learning_rate": 9.477263096343925e-06, + "loss": 0.1847, + "step": 6141 + }, + { + "epoch": 0.15542677834855884, + "grad_norm": 12.308512687683105, + "learning_rate": 9.47708434140523e-06, + "loss": 0.1889, + "step": 6142 + }, + { + "epoch": 0.15545208391325252, + "grad_norm": 2.97725248336792, + "learning_rate": 9.47690555759465e-06, + "loss": 0.1729, + "step": 6143 + }, + { + "epoch": 0.1554773894779462, + "grad_norm": 4.258358001708984, + "learning_rate": 9.476726744913339e-06, + "loss": 0.1919, + "step": 6144 + }, + { + "epoch": 0.1555026950426399, + "grad_norm": 6.93135929107666, + "learning_rate": 9.47654790336245e-06, + "loss": 0.1946, + "step": 6145 + }, + { + "epoch": 0.15552800060733354, + "grad_norm": 5.308907508850098, + "learning_rate": 9.476369032943134e-06, + "loss": 0.2308, + "step": 6146 + }, + { + "epoch": 0.15555330617202723, + "grad_norm": 3.2787184715270996, + "learning_rate": 9.47619013365655e-06, + "loss": 0.1325, + "step": 6147 + }, + { + "epoch": 0.1555786117367209, + "grad_norm": 7.512898921966553, + "learning_rate": 9.476011205503846e-06, + "loss": 0.1755, + "step": 6148 + }, + { + "epoch": 0.15560391730141457, + "grad_norm": 5.177562713623047, + "learning_rate": 9.47583224848618e-06, + "loss": 0.2092, + "step": 6149 + }, + { + "epoch": 0.15562922286610825, + "grad_norm": 7.025943279266357, + "learning_rate": 9.475653262604704e-06, + "loss": 0.1636, + "step": 6150 + }, + { + "epoch": 0.15565452843080194, + "grad_norm": 3.7625420093536377, + "learning_rate": 9.475474247860572e-06, + "loss": 0.1624, + "step": 6151 + }, + { + "epoch": 0.15567983399549562, + "grad_norm": 3.88848876953125, + "learning_rate": 9.47529520425494e-06, + "loss": 0.1322, + "step": 6152 + }, + { + "epoch": 0.15570513956018928, + "grad_norm": 5.640760898590088, + "learning_rate": 9.475116131788962e-06, + "loss": 0.126, + "step": 6153 + }, + { + "epoch": 0.15573044512488296, + "grad_norm": 18.85200309753418, + "learning_rate": 9.474937030463792e-06, + "loss": 0.4048, + "step": 6154 + }, + { + "epoch": 0.15575575068957664, + "grad_norm": 9.718636512756348, + "learning_rate": 9.474757900280584e-06, + "loss": 0.407, + "step": 6155 + }, + { + "epoch": 0.1557810562542703, + "grad_norm": 9.235925674438477, + "learning_rate": 9.474578741240496e-06, + "loss": 0.1467, + "step": 6156 + }, + { + "epoch": 0.15580636181896398, + "grad_norm": 3.5275027751922607, + "learning_rate": 9.474399553344682e-06, + "loss": 0.1352, + "step": 6157 + }, + { + "epoch": 0.15583166738365767, + "grad_norm": 5.43347692489624, + "learning_rate": 9.474220336594298e-06, + "loss": 0.1576, + "step": 6158 + }, + { + "epoch": 0.15585697294835135, + "grad_norm": 7.805102825164795, + "learning_rate": 9.474041090990498e-06, + "loss": 0.2483, + "step": 6159 + }, + { + "epoch": 0.155882278513045, + "grad_norm": 6.316664695739746, + "learning_rate": 9.473861816534443e-06, + "loss": 0.266, + "step": 6160 + }, + { + "epoch": 0.1559075840777387, + "grad_norm": 7.314826488494873, + "learning_rate": 9.473682513227281e-06, + "loss": 0.1924, + "step": 6161 + }, + { + "epoch": 0.15593288964243238, + "grad_norm": 12.695594787597656, + "learning_rate": 9.473503181070174e-06, + "loss": 0.1992, + "step": 6162 + }, + { + "epoch": 0.15595819520712606, + "grad_norm": 3.4877090454101562, + "learning_rate": 9.473323820064278e-06, + "loss": 0.127, + "step": 6163 + }, + { + "epoch": 0.15598350077181972, + "grad_norm": 9.91317367553711, + "learning_rate": 9.473144430210747e-06, + "loss": 0.3222, + "step": 6164 + }, + { + "epoch": 0.1560088063365134, + "grad_norm": 5.092978477478027, + "learning_rate": 9.47296501151074e-06, + "loss": 0.1567, + "step": 6165 + }, + { + "epoch": 0.15603411190120708, + "grad_norm": 6.22285795211792, + "learning_rate": 9.472785563965415e-06, + "loss": 0.1663, + "step": 6166 + }, + { + "epoch": 0.15605941746590074, + "grad_norm": 5.155788421630859, + "learning_rate": 9.472606087575929e-06, + "loss": 0.1976, + "step": 6167 + }, + { + "epoch": 0.15608472303059442, + "grad_norm": 3.844158172607422, + "learning_rate": 9.472426582343434e-06, + "loss": 0.0966, + "step": 6168 + }, + { + "epoch": 0.1561100285952881, + "grad_norm": 4.58372688293457, + "learning_rate": 9.472247048269097e-06, + "loss": 0.1703, + "step": 6169 + }, + { + "epoch": 0.1561353341599818, + "grad_norm": 10.589333534240723, + "learning_rate": 9.472067485354067e-06, + "loss": 0.2968, + "step": 6170 + }, + { + "epoch": 0.15616063972467545, + "grad_norm": 4.26143217086792, + "learning_rate": 9.471887893599508e-06, + "loss": 0.2459, + "step": 6171 + }, + { + "epoch": 0.15618594528936913, + "grad_norm": 11.91737174987793, + "learning_rate": 9.471708273006574e-06, + "loss": 0.2182, + "step": 6172 + }, + { + "epoch": 0.15621125085406282, + "grad_norm": 4.373501300811768, + "learning_rate": 9.471528623576425e-06, + "loss": 0.1494, + "step": 6173 + }, + { + "epoch": 0.15623655641875647, + "grad_norm": 5.8134236335754395, + "learning_rate": 9.471348945310221e-06, + "loss": 0.1935, + "step": 6174 + }, + { + "epoch": 0.15626186198345016, + "grad_norm": 5.1514105796813965, + "learning_rate": 9.47116923820912e-06, + "loss": 0.1566, + "step": 6175 + }, + { + "epoch": 0.15628716754814384, + "grad_norm": 4.074557781219482, + "learning_rate": 9.470989502274278e-06, + "loss": 0.1656, + "step": 6176 + }, + { + "epoch": 0.15631247311283752, + "grad_norm": 5.233521461486816, + "learning_rate": 9.470809737506858e-06, + "loss": 0.1515, + "step": 6177 + }, + { + "epoch": 0.15633777867753118, + "grad_norm": 8.0914888381958, + "learning_rate": 9.470629943908017e-06, + "loss": 0.179, + "step": 6178 + }, + { + "epoch": 0.15636308424222486, + "grad_norm": 3.8967790603637695, + "learning_rate": 9.470450121478914e-06, + "loss": 0.0781, + "step": 6179 + }, + { + "epoch": 0.15638838980691855, + "grad_norm": 3.5959904193878174, + "learning_rate": 9.470270270220711e-06, + "loss": 0.1866, + "step": 6180 + }, + { + "epoch": 0.1564136953716122, + "grad_norm": 5.453238010406494, + "learning_rate": 9.470090390134568e-06, + "loss": 0.1856, + "step": 6181 + }, + { + "epoch": 0.1564390009363059, + "grad_norm": 10.449686050415039, + "learning_rate": 9.469910481221641e-06, + "loss": 0.1747, + "step": 6182 + }, + { + "epoch": 0.15646430650099957, + "grad_norm": 18.449504852294922, + "learning_rate": 9.469730543483094e-06, + "loss": 0.2087, + "step": 6183 + }, + { + "epoch": 0.15648961206569326, + "grad_norm": 6.783027172088623, + "learning_rate": 9.469550576920086e-06, + "loss": 0.2858, + "step": 6184 + }, + { + "epoch": 0.1565149176303869, + "grad_norm": 6.881307125091553, + "learning_rate": 9.469370581533778e-06, + "loss": 0.2155, + "step": 6185 + }, + { + "epoch": 0.1565402231950806, + "grad_norm": 4.167217254638672, + "learning_rate": 9.46919055732533e-06, + "loss": 0.1814, + "step": 6186 + }, + { + "epoch": 0.15656552875977428, + "grad_norm": 3.831925868988037, + "learning_rate": 9.469010504295903e-06, + "loss": 0.1636, + "step": 6187 + }, + { + "epoch": 0.15659083432446794, + "grad_norm": 13.5957670211792, + "learning_rate": 9.46883042244666e-06, + "loss": 0.1988, + "step": 6188 + }, + { + "epoch": 0.15661613988916162, + "grad_norm": 4.334914207458496, + "learning_rate": 9.46865031177876e-06, + "loss": 0.1714, + "step": 6189 + }, + { + "epoch": 0.1566414454538553, + "grad_norm": 6.451932430267334, + "learning_rate": 9.468470172293366e-06, + "loss": 0.212, + "step": 6190 + }, + { + "epoch": 0.156666751018549, + "grad_norm": 7.50288200378418, + "learning_rate": 9.468290003991637e-06, + "loss": 0.2388, + "step": 6191 + }, + { + "epoch": 0.15669205658324264, + "grad_norm": 5.809343338012695, + "learning_rate": 9.468109806874739e-06, + "loss": 0.1973, + "step": 6192 + }, + { + "epoch": 0.15671736214793633, + "grad_norm": 5.091099739074707, + "learning_rate": 9.467929580943832e-06, + "loss": 0.0957, + "step": 6193 + }, + { + "epoch": 0.15674266771263, + "grad_norm": 20.694379806518555, + "learning_rate": 9.467749326200077e-06, + "loss": 0.3326, + "step": 6194 + }, + { + "epoch": 0.1567679732773237, + "grad_norm": 5.254334449768066, + "learning_rate": 9.46756904264464e-06, + "loss": 0.1823, + "step": 6195 + }, + { + "epoch": 0.15679327884201735, + "grad_norm": 8.403188705444336, + "learning_rate": 9.467388730278679e-06, + "loss": 0.2168, + "step": 6196 + }, + { + "epoch": 0.15681858440671104, + "grad_norm": 10.029640197753906, + "learning_rate": 9.46720838910336e-06, + "loss": 0.2481, + "step": 6197 + }, + { + "epoch": 0.15684388997140472, + "grad_norm": 4.403545379638672, + "learning_rate": 9.467028019119844e-06, + "loss": 0.1438, + "step": 6198 + }, + { + "epoch": 0.15686919553609838, + "grad_norm": 26.573583602905273, + "learning_rate": 9.466847620329296e-06, + "loss": 0.1462, + "step": 6199 + }, + { + "epoch": 0.15689450110079206, + "grad_norm": 5.333693027496338, + "learning_rate": 9.466667192732879e-06, + "loss": 0.1795, + "step": 6200 + }, + { + "epoch": 0.15691980666548574, + "grad_norm": 6.994149208068848, + "learning_rate": 9.466486736331757e-06, + "loss": 0.2167, + "step": 6201 + }, + { + "epoch": 0.15694511223017943, + "grad_norm": 5.442689418792725, + "learning_rate": 9.46630625112709e-06, + "loss": 0.1803, + "step": 6202 + }, + { + "epoch": 0.15697041779487308, + "grad_norm": 5.815001010894775, + "learning_rate": 9.466125737120047e-06, + "loss": 0.1353, + "step": 6203 + }, + { + "epoch": 0.15699572335956677, + "grad_norm": 8.284941673278809, + "learning_rate": 9.465945194311789e-06, + "loss": 0.2347, + "step": 6204 + }, + { + "epoch": 0.15702102892426045, + "grad_norm": 4.885751724243164, + "learning_rate": 9.465764622703481e-06, + "loss": 0.1285, + "step": 6205 + }, + { + "epoch": 0.1570463344889541, + "grad_norm": 4.49934720993042, + "learning_rate": 9.465584022296288e-06, + "loss": 0.196, + "step": 6206 + }, + { + "epoch": 0.1570716400536478, + "grad_norm": 17.74190330505371, + "learning_rate": 9.465403393091374e-06, + "loss": 0.2169, + "step": 6207 + }, + { + "epoch": 0.15709694561834148, + "grad_norm": 3.4989476203918457, + "learning_rate": 9.465222735089905e-06, + "loss": 0.1431, + "step": 6208 + }, + { + "epoch": 0.15712225118303516, + "grad_norm": 4.1969380378723145, + "learning_rate": 9.465042048293045e-06, + "loss": 0.1899, + "step": 6209 + }, + { + "epoch": 0.15714755674772882, + "grad_norm": 12.198019981384277, + "learning_rate": 9.46486133270196e-06, + "loss": 0.2702, + "step": 6210 + }, + { + "epoch": 0.1571728623124225, + "grad_norm": 5.15306282043457, + "learning_rate": 9.464680588317814e-06, + "loss": 0.1738, + "step": 6211 + }, + { + "epoch": 0.15719816787711618, + "grad_norm": 6.6073150634765625, + "learning_rate": 9.464499815141772e-06, + "loss": 0.2029, + "step": 6212 + }, + { + "epoch": 0.15722347344180984, + "grad_norm": 9.465625762939453, + "learning_rate": 9.464319013175003e-06, + "loss": 0.3224, + "step": 6213 + }, + { + "epoch": 0.15724877900650353, + "grad_norm": 8.172534942626953, + "learning_rate": 9.464138182418671e-06, + "loss": 0.2753, + "step": 6214 + }, + { + "epoch": 0.1572740845711972, + "grad_norm": 5.410919666290283, + "learning_rate": 9.463957322873942e-06, + "loss": 0.1285, + "step": 6215 + }, + { + "epoch": 0.1572993901358909, + "grad_norm": 7.281305313110352, + "learning_rate": 9.463776434541981e-06, + "loss": 0.261, + "step": 6216 + }, + { + "epoch": 0.15732469570058455, + "grad_norm": 2.8551418781280518, + "learning_rate": 9.463595517423959e-06, + "loss": 0.1048, + "step": 6217 + }, + { + "epoch": 0.15735000126527823, + "grad_norm": 6.47718620300293, + "learning_rate": 9.463414571521037e-06, + "loss": 0.2273, + "step": 6218 + }, + { + "epoch": 0.15737530682997192, + "grad_norm": 10.853639602661133, + "learning_rate": 9.463233596834387e-06, + "loss": 0.2461, + "step": 6219 + }, + { + "epoch": 0.15740061239466557, + "grad_norm": 5.836655616760254, + "learning_rate": 9.463052593365172e-06, + "loss": 0.2242, + "step": 6220 + }, + { + "epoch": 0.15742591795935926, + "grad_norm": 6.213253974914551, + "learning_rate": 9.462871561114561e-06, + "loss": 0.1972, + "step": 6221 + }, + { + "epoch": 0.15745122352405294, + "grad_norm": 6.571801662445068, + "learning_rate": 9.462690500083723e-06, + "loss": 0.1772, + "step": 6222 + }, + { + "epoch": 0.15747652908874663, + "grad_norm": 8.657561302185059, + "learning_rate": 9.462509410273824e-06, + "loss": 0.2781, + "step": 6223 + }, + { + "epoch": 0.15750183465344028, + "grad_norm": 4.919859409332275, + "learning_rate": 9.46232829168603e-06, + "loss": 0.219, + "step": 6224 + }, + { + "epoch": 0.15752714021813397, + "grad_norm": 4.317273139953613, + "learning_rate": 9.462147144321513e-06, + "loss": 0.1538, + "step": 6225 + }, + { + "epoch": 0.15755244578282765, + "grad_norm": 8.897234916687012, + "learning_rate": 9.461965968181438e-06, + "loss": 0.218, + "step": 6226 + }, + { + "epoch": 0.15757775134752133, + "grad_norm": 7.746687412261963, + "learning_rate": 9.461784763266975e-06, + "loss": 0.2503, + "step": 6227 + }, + { + "epoch": 0.157603056912215, + "grad_norm": 6.3006744384765625, + "learning_rate": 9.46160352957929e-06, + "loss": 0.1683, + "step": 6228 + }, + { + "epoch": 0.15762836247690867, + "grad_norm": 12.54702377319336, + "learning_rate": 9.461422267119557e-06, + "loss": 0.2655, + "step": 6229 + }, + { + "epoch": 0.15765366804160236, + "grad_norm": 7.696883201599121, + "learning_rate": 9.461240975888938e-06, + "loss": 0.241, + "step": 6230 + }, + { + "epoch": 0.157678973606296, + "grad_norm": 6.153310775756836, + "learning_rate": 9.461059655888609e-06, + "loss": 0.2194, + "step": 6231 + }, + { + "epoch": 0.1577042791709897, + "grad_norm": 6.5497870445251465, + "learning_rate": 9.460878307119736e-06, + "loss": 0.1823, + "step": 6232 + }, + { + "epoch": 0.15772958473568338, + "grad_norm": 2.7804031372070312, + "learning_rate": 9.460696929583487e-06, + "loss": 0.1212, + "step": 6233 + }, + { + "epoch": 0.15775489030037707, + "grad_norm": 4.9011125564575195, + "learning_rate": 9.460515523281034e-06, + "loss": 0.1822, + "step": 6234 + }, + { + "epoch": 0.15778019586507072, + "grad_norm": 10.21211051940918, + "learning_rate": 9.460334088213544e-06, + "loss": 0.3487, + "step": 6235 + }, + { + "epoch": 0.1578055014297644, + "grad_norm": 8.085674285888672, + "learning_rate": 9.460152624382192e-06, + "loss": 0.2188, + "step": 6236 + }, + { + "epoch": 0.1578308069944581, + "grad_norm": 2.801048517227173, + "learning_rate": 9.459971131788145e-06, + "loss": 0.1403, + "step": 6237 + }, + { + "epoch": 0.15785611255915175, + "grad_norm": 13.146331787109375, + "learning_rate": 9.459789610432574e-06, + "loss": 0.364, + "step": 6238 + }, + { + "epoch": 0.15788141812384543, + "grad_norm": 7.631584644317627, + "learning_rate": 9.459608060316649e-06, + "loss": 0.2212, + "step": 6239 + }, + { + "epoch": 0.1579067236885391, + "grad_norm": 4.5733866691589355, + "learning_rate": 9.45942648144154e-06, + "loss": 0.1926, + "step": 6240 + }, + { + "epoch": 0.1579320292532328, + "grad_norm": 6.831989288330078, + "learning_rate": 9.45924487380842e-06, + "loss": 0.2549, + "step": 6241 + }, + { + "epoch": 0.15795733481792645, + "grad_norm": 6.028754234313965, + "learning_rate": 9.45906323741846e-06, + "loss": 0.1999, + "step": 6242 + }, + { + "epoch": 0.15798264038262014, + "grad_norm": 7.450459957122803, + "learning_rate": 9.45888157227283e-06, + "loss": 0.2441, + "step": 6243 + }, + { + "epoch": 0.15800794594731382, + "grad_norm": 4.890110015869141, + "learning_rate": 9.4586998783727e-06, + "loss": 0.1376, + "step": 6244 + }, + { + "epoch": 0.15803325151200748, + "grad_norm": 4.351700782775879, + "learning_rate": 9.458518155719247e-06, + "loss": 0.2715, + "step": 6245 + }, + { + "epoch": 0.15805855707670116, + "grad_norm": 9.022796630859375, + "learning_rate": 9.45833640431364e-06, + "loss": 0.2417, + "step": 6246 + }, + { + "epoch": 0.15808386264139485, + "grad_norm": 6.274764060974121, + "learning_rate": 9.458154624157048e-06, + "loss": 0.2282, + "step": 6247 + }, + { + "epoch": 0.15810916820608853, + "grad_norm": 9.23851203918457, + "learning_rate": 9.457972815250648e-06, + "loss": 0.2391, + "step": 6248 + }, + { + "epoch": 0.15813447377078219, + "grad_norm": 6.173685550689697, + "learning_rate": 9.45779097759561e-06, + "loss": 0.2215, + "step": 6249 + }, + { + "epoch": 0.15815977933547587, + "grad_norm": 8.596477508544922, + "learning_rate": 9.457609111193109e-06, + "loss": 0.3004, + "step": 6250 + }, + { + "epoch": 0.15818508490016955, + "grad_norm": 5.252194881439209, + "learning_rate": 9.457427216044314e-06, + "loss": 0.1488, + "step": 6251 + }, + { + "epoch": 0.1582103904648632, + "grad_norm": 6.676657676696777, + "learning_rate": 9.4572452921504e-06, + "loss": 0.1845, + "step": 6252 + }, + { + "epoch": 0.1582356960295569, + "grad_norm": 6.5439653396606445, + "learning_rate": 9.45706333951254e-06, + "loss": 0.2697, + "step": 6253 + }, + { + "epoch": 0.15826100159425058, + "grad_norm": 4.498314380645752, + "learning_rate": 9.456881358131909e-06, + "loss": 0.1592, + "step": 6254 + }, + { + "epoch": 0.15828630715894426, + "grad_norm": 6.777843475341797, + "learning_rate": 9.45669934800968e-06, + "loss": 0.2657, + "step": 6255 + }, + { + "epoch": 0.15831161272363792, + "grad_norm": 2.950352907180786, + "learning_rate": 9.456517309147022e-06, + "loss": 0.1704, + "step": 6256 + }, + { + "epoch": 0.1583369182883316, + "grad_norm": 3.604584217071533, + "learning_rate": 9.456335241545116e-06, + "loss": 0.1875, + "step": 6257 + }, + { + "epoch": 0.15836222385302529, + "grad_norm": 5.13887357711792, + "learning_rate": 9.456153145205131e-06, + "loss": 0.2107, + "step": 6258 + }, + { + "epoch": 0.15838752941771897, + "grad_norm": 5.132934093475342, + "learning_rate": 9.455971020128245e-06, + "loss": 0.1896, + "step": 6259 + }, + { + "epoch": 0.15841283498241263, + "grad_norm": 11.112171173095703, + "learning_rate": 9.455788866315631e-06, + "loss": 0.3054, + "step": 6260 + }, + { + "epoch": 0.1584381405471063, + "grad_norm": 6.554523468017578, + "learning_rate": 9.455606683768463e-06, + "loss": 0.3108, + "step": 6261 + }, + { + "epoch": 0.1584634461118, + "grad_norm": 5.085146903991699, + "learning_rate": 9.455424472487916e-06, + "loss": 0.2149, + "step": 6262 + }, + { + "epoch": 0.15848875167649365, + "grad_norm": 2.821289539337158, + "learning_rate": 9.455242232475164e-06, + "loss": 0.1749, + "step": 6263 + }, + { + "epoch": 0.15851405724118733, + "grad_norm": 6.610503196716309, + "learning_rate": 9.455059963731385e-06, + "loss": 0.2145, + "step": 6264 + }, + { + "epoch": 0.15853936280588102, + "grad_norm": 6.736037254333496, + "learning_rate": 9.454877666257753e-06, + "loss": 0.2047, + "step": 6265 + }, + { + "epoch": 0.1585646683705747, + "grad_norm": 8.201794624328613, + "learning_rate": 9.454695340055445e-06, + "loss": 0.1543, + "step": 6266 + }, + { + "epoch": 0.15858997393526836, + "grad_norm": 7.9420390129089355, + "learning_rate": 9.454512985125632e-06, + "loss": 0.2882, + "step": 6267 + }, + { + "epoch": 0.15861527949996204, + "grad_norm": 8.988243103027344, + "learning_rate": 9.454330601469496e-06, + "loss": 0.2929, + "step": 6268 + }, + { + "epoch": 0.15864058506465573, + "grad_norm": 10.113003730773926, + "learning_rate": 9.454148189088211e-06, + "loss": 0.1499, + "step": 6269 + }, + { + "epoch": 0.15866589062934938, + "grad_norm": 5.218171119689941, + "learning_rate": 9.453965747982951e-06, + "loss": 0.1854, + "step": 6270 + }, + { + "epoch": 0.15869119619404307, + "grad_norm": 5.387177467346191, + "learning_rate": 9.453783278154894e-06, + "loss": 0.117, + "step": 6271 + }, + { + "epoch": 0.15871650175873675, + "grad_norm": 7.030371189117432, + "learning_rate": 9.453600779605219e-06, + "loss": 0.1696, + "step": 6272 + }, + { + "epoch": 0.15874180732343043, + "grad_norm": 7.342338562011719, + "learning_rate": 9.4534182523351e-06, + "loss": 0.1991, + "step": 6273 + }, + { + "epoch": 0.1587671128881241, + "grad_norm": 7.365732669830322, + "learning_rate": 9.453235696345715e-06, + "loss": 0.2329, + "step": 6274 + }, + { + "epoch": 0.15879241845281777, + "grad_norm": 8.136373519897461, + "learning_rate": 9.453053111638242e-06, + "loss": 0.2029, + "step": 6275 + }, + { + "epoch": 0.15881772401751146, + "grad_norm": 15.325011253356934, + "learning_rate": 9.452870498213858e-06, + "loss": 0.2837, + "step": 6276 + }, + { + "epoch": 0.15884302958220511, + "grad_norm": 4.07147216796875, + "learning_rate": 9.45268785607374e-06, + "loss": 0.2179, + "step": 6277 + }, + { + "epoch": 0.1588683351468988, + "grad_norm": 7.9635796546936035, + "learning_rate": 9.452505185219065e-06, + "loss": 0.3547, + "step": 6278 + }, + { + "epoch": 0.15889364071159248, + "grad_norm": 3.403987169265747, + "learning_rate": 9.452322485651014e-06, + "loss": 0.1645, + "step": 6279 + }, + { + "epoch": 0.15891894627628617, + "grad_norm": 3.6931819915771484, + "learning_rate": 9.452139757370761e-06, + "loss": 0.0886, + "step": 6280 + }, + { + "epoch": 0.15894425184097982, + "grad_norm": 5.8321943283081055, + "learning_rate": 9.45195700037949e-06, + "loss": 0.2429, + "step": 6281 + }, + { + "epoch": 0.1589695574056735, + "grad_norm": 7.189483642578125, + "learning_rate": 9.451774214678375e-06, + "loss": 0.2434, + "step": 6282 + }, + { + "epoch": 0.1589948629703672, + "grad_norm": 5.179967880249023, + "learning_rate": 9.451591400268595e-06, + "loss": 0.1891, + "step": 6283 + }, + { + "epoch": 0.15902016853506085, + "grad_norm": 6.434647083282471, + "learning_rate": 9.451408557151333e-06, + "loss": 0.2053, + "step": 6284 + }, + { + "epoch": 0.15904547409975453, + "grad_norm": 3.272200107574463, + "learning_rate": 9.451225685327765e-06, + "loss": 0.1795, + "step": 6285 + }, + { + "epoch": 0.15907077966444821, + "grad_norm": 15.08495807647705, + "learning_rate": 9.451042784799068e-06, + "loss": 0.2814, + "step": 6286 + }, + { + "epoch": 0.1590960852291419, + "grad_norm": 4.994421482086182, + "learning_rate": 9.450859855566426e-06, + "loss": 0.1057, + "step": 6287 + }, + { + "epoch": 0.15912139079383555, + "grad_norm": 3.9678828716278076, + "learning_rate": 9.450676897631014e-06, + "loss": 0.1084, + "step": 6288 + }, + { + "epoch": 0.15914669635852924, + "grad_norm": 7.826379776000977, + "learning_rate": 9.450493910994019e-06, + "loss": 0.2055, + "step": 6289 + }, + { + "epoch": 0.15917200192322292, + "grad_norm": 6.934129238128662, + "learning_rate": 9.450310895656613e-06, + "loss": 0.2101, + "step": 6290 + }, + { + "epoch": 0.1591973074879166, + "grad_norm": 8.021591186523438, + "learning_rate": 9.450127851619981e-06, + "loss": 0.2263, + "step": 6291 + }, + { + "epoch": 0.15922261305261026, + "grad_norm": 5.408292770385742, + "learning_rate": 9.449944778885302e-06, + "loss": 0.2257, + "step": 6292 + }, + { + "epoch": 0.15924791861730395, + "grad_norm": 6.949671268463135, + "learning_rate": 9.449761677453757e-06, + "loss": 0.2274, + "step": 6293 + }, + { + "epoch": 0.15927322418199763, + "grad_norm": 5.2135910987854, + "learning_rate": 9.449578547326527e-06, + "loss": 0.2102, + "step": 6294 + }, + { + "epoch": 0.1592985297466913, + "grad_norm": 9.318553924560547, + "learning_rate": 9.449395388504795e-06, + "loss": 0.2259, + "step": 6295 + }, + { + "epoch": 0.15932383531138497, + "grad_norm": 6.287254810333252, + "learning_rate": 9.449212200989736e-06, + "loss": 0.2287, + "step": 6296 + }, + { + "epoch": 0.15934914087607865, + "grad_norm": 6.092579364776611, + "learning_rate": 9.449028984782536e-06, + "loss": 0.2835, + "step": 6297 + }, + { + "epoch": 0.15937444644077234, + "grad_norm": 7.009377956390381, + "learning_rate": 9.448845739884375e-06, + "loss": 0.1904, + "step": 6298 + }, + { + "epoch": 0.159399752005466, + "grad_norm": 4.397865295410156, + "learning_rate": 9.448662466296438e-06, + "loss": 0.195, + "step": 6299 + }, + { + "epoch": 0.15942505757015968, + "grad_norm": 9.18659782409668, + "learning_rate": 9.448479164019902e-06, + "loss": 0.3072, + "step": 6300 + }, + { + "epoch": 0.15945036313485336, + "grad_norm": 4.367898941040039, + "learning_rate": 9.448295833055952e-06, + "loss": 0.2754, + "step": 6301 + }, + { + "epoch": 0.15947566869954702, + "grad_norm": 4.570737361907959, + "learning_rate": 9.44811247340577e-06, + "loss": 0.138, + "step": 6302 + }, + { + "epoch": 0.1595009742642407, + "grad_norm": 3.6781671047210693, + "learning_rate": 9.447929085070537e-06, + "loss": 0.1354, + "step": 6303 + }, + { + "epoch": 0.1595262798289344, + "grad_norm": 19.952133178710938, + "learning_rate": 9.447745668051437e-06, + "loss": 0.258, + "step": 6304 + }, + { + "epoch": 0.15955158539362807, + "grad_norm": 4.188070774078369, + "learning_rate": 9.447562222349654e-06, + "loss": 0.1957, + "step": 6305 + }, + { + "epoch": 0.15957689095832173, + "grad_norm": 4.294879913330078, + "learning_rate": 9.44737874796637e-06, + "loss": 0.1841, + "step": 6306 + }, + { + "epoch": 0.1596021965230154, + "grad_norm": 3.4072306156158447, + "learning_rate": 9.447195244902764e-06, + "loss": 0.1761, + "step": 6307 + }, + { + "epoch": 0.1596275020877091, + "grad_norm": 3.0185558795928955, + "learning_rate": 9.447011713160027e-06, + "loss": 0.117, + "step": 6308 + }, + { + "epoch": 0.15965280765240275, + "grad_norm": 10.71307373046875, + "learning_rate": 9.446828152739336e-06, + "loss": 0.2167, + "step": 6309 + }, + { + "epoch": 0.15967811321709643, + "grad_norm": 5.547906875610352, + "learning_rate": 9.44664456364188e-06, + "loss": 0.2059, + "step": 6310 + }, + { + "epoch": 0.15970341878179012, + "grad_norm": 3.5279922485351562, + "learning_rate": 9.446460945868838e-06, + "loss": 0.1875, + "step": 6311 + }, + { + "epoch": 0.1597287243464838, + "grad_norm": 10.065072059631348, + "learning_rate": 9.446277299421398e-06, + "loss": 0.2192, + "step": 6312 + }, + { + "epoch": 0.15975402991117746, + "grad_norm": 8.36650562286377, + "learning_rate": 9.446093624300741e-06, + "loss": 0.1487, + "step": 6313 + }, + { + "epoch": 0.15977933547587114, + "grad_norm": 6.504800796508789, + "learning_rate": 9.445909920508055e-06, + "loss": 0.167, + "step": 6314 + }, + { + "epoch": 0.15980464104056483, + "grad_norm": 6.769462585449219, + "learning_rate": 9.445726188044522e-06, + "loss": 0.2344, + "step": 6315 + }, + { + "epoch": 0.15982994660525848, + "grad_norm": 15.213961601257324, + "learning_rate": 9.445542426911329e-06, + "loss": 0.1733, + "step": 6316 + }, + { + "epoch": 0.15985525216995217, + "grad_norm": 6.789443492889404, + "learning_rate": 9.445358637109658e-06, + "loss": 0.2076, + "step": 6317 + }, + { + "epoch": 0.15988055773464585, + "grad_norm": 13.53662109375, + "learning_rate": 9.445174818640697e-06, + "loss": 0.2613, + "step": 6318 + }, + { + "epoch": 0.15990586329933953, + "grad_norm": 3.8934273719787598, + "learning_rate": 9.44499097150563e-06, + "loss": 0.2363, + "step": 6319 + }, + { + "epoch": 0.1599311688640332, + "grad_norm": 3.0048723220825195, + "learning_rate": 9.444807095705643e-06, + "loss": 0.1392, + "step": 6320 + }, + { + "epoch": 0.15995647442872687, + "grad_norm": 6.796597957611084, + "learning_rate": 9.444623191241924e-06, + "loss": 0.2036, + "step": 6321 + }, + { + "epoch": 0.15998177999342056, + "grad_norm": 7.69489860534668, + "learning_rate": 9.444439258115654e-06, + "loss": 0.1925, + "step": 6322 + }, + { + "epoch": 0.16000708555811424, + "grad_norm": 4.560435771942139, + "learning_rate": 9.444255296328023e-06, + "loss": 0.1738, + "step": 6323 + }, + { + "epoch": 0.1600323911228079, + "grad_norm": 5.2140936851501465, + "learning_rate": 9.444071305880217e-06, + "loss": 0.1127, + "step": 6324 + }, + { + "epoch": 0.16005769668750158, + "grad_norm": 4.021539688110352, + "learning_rate": 9.443887286773422e-06, + "loss": 0.1416, + "step": 6325 + }, + { + "epoch": 0.16008300225219527, + "grad_norm": 6.134873867034912, + "learning_rate": 9.443703239008822e-06, + "loss": 0.2192, + "step": 6326 + }, + { + "epoch": 0.16010830781688892, + "grad_norm": 6.598244667053223, + "learning_rate": 9.443519162587609e-06, + "loss": 0.1924, + "step": 6327 + }, + { + "epoch": 0.1601336133815826, + "grad_norm": 5.92717981338501, + "learning_rate": 9.443335057510966e-06, + "loss": 0.2076, + "step": 6328 + }, + { + "epoch": 0.1601589189462763, + "grad_norm": 6.195281982421875, + "learning_rate": 9.443150923780083e-06, + "loss": 0.1262, + "step": 6329 + }, + { + "epoch": 0.16018422451096997, + "grad_norm": 7.657509803771973, + "learning_rate": 9.442966761396144e-06, + "loss": 0.2147, + "step": 6330 + }, + { + "epoch": 0.16020953007566363, + "grad_norm": 5.389286994934082, + "learning_rate": 9.442782570360341e-06, + "loss": 0.2208, + "step": 6331 + }, + { + "epoch": 0.16023483564035731, + "grad_norm": 3.9194207191467285, + "learning_rate": 9.442598350673859e-06, + "loss": 0.1104, + "step": 6332 + }, + { + "epoch": 0.160260141205051, + "grad_norm": 6.713450908660889, + "learning_rate": 9.442414102337887e-06, + "loss": 0.2162, + "step": 6333 + }, + { + "epoch": 0.16028544676974465, + "grad_norm": 5.005957126617432, + "learning_rate": 9.442229825353612e-06, + "loss": 0.2083, + "step": 6334 + }, + { + "epoch": 0.16031075233443834, + "grad_norm": 8.39184856414795, + "learning_rate": 9.442045519722224e-06, + "loss": 0.21, + "step": 6335 + }, + { + "epoch": 0.16033605789913202, + "grad_norm": 7.276681423187256, + "learning_rate": 9.44186118544491e-06, + "loss": 0.2369, + "step": 6336 + }, + { + "epoch": 0.1603613634638257, + "grad_norm": 3.807199478149414, + "learning_rate": 9.441676822522859e-06, + "loss": 0.1607, + "step": 6337 + }, + { + "epoch": 0.16038666902851936, + "grad_norm": 40.88711929321289, + "learning_rate": 9.44149243095726e-06, + "loss": 0.3249, + "step": 6338 + }, + { + "epoch": 0.16041197459321305, + "grad_norm": 3.734103202819824, + "learning_rate": 9.441308010749305e-06, + "loss": 0.1357, + "step": 6339 + }, + { + "epoch": 0.16043728015790673, + "grad_norm": 5.165284156799316, + "learning_rate": 9.441123561900178e-06, + "loss": 0.181, + "step": 6340 + }, + { + "epoch": 0.1604625857226004, + "grad_norm": 10.026942253112793, + "learning_rate": 9.440939084411073e-06, + "loss": 0.2906, + "step": 6341 + }, + { + "epoch": 0.16048789128729407, + "grad_norm": 4.329538822174072, + "learning_rate": 9.440754578283178e-06, + "loss": 0.1277, + "step": 6342 + }, + { + "epoch": 0.16051319685198775, + "grad_norm": 5.351358413696289, + "learning_rate": 9.440570043517683e-06, + "loss": 0.2, + "step": 6343 + }, + { + "epoch": 0.16053850241668144, + "grad_norm": 5.269994735717773, + "learning_rate": 9.440385480115777e-06, + "loss": 0.2496, + "step": 6344 + }, + { + "epoch": 0.1605638079813751, + "grad_norm": 2.9594335556030273, + "learning_rate": 9.44020088807865e-06, + "loss": 0.1476, + "step": 6345 + }, + { + "epoch": 0.16058911354606878, + "grad_norm": 3.5513086318969727, + "learning_rate": 9.440016267407495e-06, + "loss": 0.1463, + "step": 6346 + }, + { + "epoch": 0.16061441911076246, + "grad_norm": 8.170598030090332, + "learning_rate": 9.439831618103501e-06, + "loss": 0.1913, + "step": 6347 + }, + { + "epoch": 0.16063972467545612, + "grad_norm": 4.741661548614502, + "learning_rate": 9.439646940167858e-06, + "loss": 0.1795, + "step": 6348 + }, + { + "epoch": 0.1606650302401498, + "grad_norm": 11.51826000213623, + "learning_rate": 9.439462233601757e-06, + "loss": 0.3197, + "step": 6349 + }, + { + "epoch": 0.1606903358048435, + "grad_norm": 8.587456703186035, + "learning_rate": 9.439277498406392e-06, + "loss": 0.3078, + "step": 6350 + }, + { + "epoch": 0.16071564136953717, + "grad_norm": 5.054083824157715, + "learning_rate": 9.439092734582951e-06, + "loss": 0.2719, + "step": 6351 + }, + { + "epoch": 0.16074094693423083, + "grad_norm": 6.285971641540527, + "learning_rate": 9.438907942132627e-06, + "loss": 0.1966, + "step": 6352 + }, + { + "epoch": 0.1607662524989245, + "grad_norm": 8.940530776977539, + "learning_rate": 9.43872312105661e-06, + "loss": 0.2491, + "step": 6353 + }, + { + "epoch": 0.1607915580636182, + "grad_norm": 5.3739142417907715, + "learning_rate": 9.438538271356095e-06, + "loss": 0.2187, + "step": 6354 + }, + { + "epoch": 0.16081686362831188, + "grad_norm": 3.8249542713165283, + "learning_rate": 9.438353393032272e-06, + "loss": 0.1561, + "step": 6355 + }, + { + "epoch": 0.16084216919300554, + "grad_norm": 5.533318042755127, + "learning_rate": 9.438168486086331e-06, + "loss": 0.2031, + "step": 6356 + }, + { + "epoch": 0.16086747475769922, + "grad_norm": 5.374282360076904, + "learning_rate": 9.43798355051947e-06, + "loss": 0.2579, + "step": 6357 + }, + { + "epoch": 0.1608927803223929, + "grad_norm": 5.456698417663574, + "learning_rate": 9.43779858633288e-06, + "loss": 0.132, + "step": 6358 + }, + { + "epoch": 0.16091808588708656, + "grad_norm": 8.320338249206543, + "learning_rate": 9.43761359352775e-06, + "loss": 0.2696, + "step": 6359 + }, + { + "epoch": 0.16094339145178024, + "grad_norm": 3.8839111328125, + "learning_rate": 9.437428572105276e-06, + "loss": 0.145, + "step": 6360 + }, + { + "epoch": 0.16096869701647393, + "grad_norm": 4.4024882316589355, + "learning_rate": 9.437243522066649e-06, + "loss": 0.1789, + "step": 6361 + }, + { + "epoch": 0.1609940025811676, + "grad_norm": 7.0887956619262695, + "learning_rate": 9.437058443413066e-06, + "loss": 0.2212, + "step": 6362 + }, + { + "epoch": 0.16101930814586127, + "grad_norm": 8.86989974975586, + "learning_rate": 9.436873336145717e-06, + "loss": 0.2355, + "step": 6363 + }, + { + "epoch": 0.16104461371055495, + "grad_norm": 7.093245029449463, + "learning_rate": 9.436688200265798e-06, + "loss": 0.194, + "step": 6364 + }, + { + "epoch": 0.16106991927524864, + "grad_norm": 2.25588321685791, + "learning_rate": 9.436503035774501e-06, + "loss": 0.13, + "step": 6365 + }, + { + "epoch": 0.1610952248399423, + "grad_norm": 4.121480464935303, + "learning_rate": 9.436317842673023e-06, + "loss": 0.1545, + "step": 6366 + }, + { + "epoch": 0.16112053040463598, + "grad_norm": 13.19347858428955, + "learning_rate": 9.436132620962554e-06, + "loss": 0.2278, + "step": 6367 + }, + { + "epoch": 0.16114583596932966, + "grad_norm": 3.9513673782348633, + "learning_rate": 9.435947370644293e-06, + "loss": 0.2214, + "step": 6368 + }, + { + "epoch": 0.16117114153402334, + "grad_norm": 5.8545756340026855, + "learning_rate": 9.435762091719433e-06, + "loss": 0.1497, + "step": 6369 + }, + { + "epoch": 0.161196447098717, + "grad_norm": 7.926573276519775, + "learning_rate": 9.435576784189167e-06, + "loss": 0.233, + "step": 6370 + }, + { + "epoch": 0.16122175266341068, + "grad_norm": 15.278483390808105, + "learning_rate": 9.435391448054691e-06, + "loss": 0.2393, + "step": 6371 + }, + { + "epoch": 0.16124705822810437, + "grad_norm": 14.603643417358398, + "learning_rate": 9.435206083317202e-06, + "loss": 0.3036, + "step": 6372 + }, + { + "epoch": 0.16127236379279802, + "grad_norm": 4.8016743659973145, + "learning_rate": 9.435020689977892e-06, + "loss": 0.2345, + "step": 6373 + }, + { + "epoch": 0.1612976693574917, + "grad_norm": 17.40947914123535, + "learning_rate": 9.434835268037961e-06, + "loss": 0.2994, + "step": 6374 + }, + { + "epoch": 0.1613229749221854, + "grad_norm": 4.886221408843994, + "learning_rate": 9.434649817498601e-06, + "loss": 0.2425, + "step": 6375 + }, + { + "epoch": 0.16134828048687908, + "grad_norm": 7.035793304443359, + "learning_rate": 9.43446433836101e-06, + "loss": 0.2181, + "step": 6376 + }, + { + "epoch": 0.16137358605157273, + "grad_norm": 9.70659065246582, + "learning_rate": 9.434278830626382e-06, + "loss": 0.3206, + "step": 6377 + }, + { + "epoch": 0.16139889161626642, + "grad_norm": 9.737120628356934, + "learning_rate": 9.434093294295917e-06, + "loss": 0.2362, + "step": 6378 + }, + { + "epoch": 0.1614241971809601, + "grad_norm": 9.430562973022461, + "learning_rate": 9.433907729370806e-06, + "loss": 0.3452, + "step": 6379 + }, + { + "epoch": 0.16144950274565376, + "grad_norm": 8.665970802307129, + "learning_rate": 9.43372213585225e-06, + "loss": 0.1318, + "step": 6380 + }, + { + "epoch": 0.16147480831034744, + "grad_norm": 10.211084365844727, + "learning_rate": 9.433536513741446e-06, + "loss": 0.2484, + "step": 6381 + }, + { + "epoch": 0.16150011387504112, + "grad_norm": 14.005661964416504, + "learning_rate": 9.433350863039588e-06, + "loss": 0.1509, + "step": 6382 + }, + { + "epoch": 0.1615254194397348, + "grad_norm": 4.998174667358398, + "learning_rate": 9.433165183747876e-06, + "loss": 0.1866, + "step": 6383 + }, + { + "epoch": 0.16155072500442846, + "grad_norm": 4.1154656410217285, + "learning_rate": 9.432979475867506e-06, + "loss": 0.1447, + "step": 6384 + }, + { + "epoch": 0.16157603056912215, + "grad_norm": 3.873284101486206, + "learning_rate": 9.432793739399678e-06, + "loss": 0.1393, + "step": 6385 + }, + { + "epoch": 0.16160133613381583, + "grad_norm": 4.676520347595215, + "learning_rate": 9.432607974345585e-06, + "loss": 0.182, + "step": 6386 + }, + { + "epoch": 0.16162664169850952, + "grad_norm": 3.099276304244995, + "learning_rate": 9.432422180706429e-06, + "loss": 0.1386, + "step": 6387 + }, + { + "epoch": 0.16165194726320317, + "grad_norm": 4.886888027191162, + "learning_rate": 9.432236358483406e-06, + "loss": 0.1346, + "step": 6388 + }, + { + "epoch": 0.16167725282789686, + "grad_norm": 4.461008071899414, + "learning_rate": 9.432050507677715e-06, + "loss": 0.1912, + "step": 6389 + }, + { + "epoch": 0.16170255839259054, + "grad_norm": 6.66319465637207, + "learning_rate": 9.431864628290555e-06, + "loss": 0.2025, + "step": 6390 + }, + { + "epoch": 0.1617278639572842, + "grad_norm": 5.349076747894287, + "learning_rate": 9.431678720323125e-06, + "loss": 0.1762, + "step": 6391 + }, + { + "epoch": 0.16175316952197788, + "grad_norm": 8.220946311950684, + "learning_rate": 9.431492783776622e-06, + "loss": 0.2604, + "step": 6392 + }, + { + "epoch": 0.16177847508667156, + "grad_norm": 6.483526229858398, + "learning_rate": 9.431306818652248e-06, + "loss": 0.1991, + "step": 6393 + }, + { + "epoch": 0.16180378065136525, + "grad_norm": 6.387860298156738, + "learning_rate": 9.4311208249512e-06, + "loss": 0.2163, + "step": 6394 + }, + { + "epoch": 0.1618290862160589, + "grad_norm": 8.68197250366211, + "learning_rate": 9.430934802674678e-06, + "loss": 0.1839, + "step": 6395 + }, + { + "epoch": 0.1618543917807526, + "grad_norm": 11.536637306213379, + "learning_rate": 9.430748751823882e-06, + "loss": 0.2464, + "step": 6396 + }, + { + "epoch": 0.16187969734544627, + "grad_norm": 4.742502689361572, + "learning_rate": 9.43056267240001e-06, + "loss": 0.1655, + "step": 6397 + }, + { + "epoch": 0.16190500291013993, + "grad_norm": 15.06602668762207, + "learning_rate": 9.430376564404265e-06, + "loss": 0.1453, + "step": 6398 + }, + { + "epoch": 0.1619303084748336, + "grad_norm": 5.844307899475098, + "learning_rate": 9.430190427837845e-06, + "loss": 0.228, + "step": 6399 + }, + { + "epoch": 0.1619556140395273, + "grad_norm": 9.774087905883789, + "learning_rate": 9.430004262701951e-06, + "loss": 0.1178, + "step": 6400 + }, + { + "epoch": 0.16198091960422098, + "grad_norm": 9.771039962768555, + "learning_rate": 9.429818068997784e-06, + "loss": 0.2331, + "step": 6401 + }, + { + "epoch": 0.16200622516891464, + "grad_norm": 3.2559802532196045, + "learning_rate": 9.429631846726543e-06, + "loss": 0.0739, + "step": 6402 + }, + { + "epoch": 0.16203153073360832, + "grad_norm": 7.218895435333252, + "learning_rate": 9.429445595889433e-06, + "loss": 0.2603, + "step": 6403 + }, + { + "epoch": 0.162056836298302, + "grad_norm": 5.133842468261719, + "learning_rate": 9.429259316487651e-06, + "loss": 0.1538, + "step": 6404 + }, + { + "epoch": 0.16208214186299566, + "grad_norm": 11.369463920593262, + "learning_rate": 9.429073008522399e-06, + "loss": 0.2318, + "step": 6405 + }, + { + "epoch": 0.16210744742768934, + "grad_norm": 5.589999675750732, + "learning_rate": 9.428886671994878e-06, + "loss": 0.2264, + "step": 6406 + }, + { + "epoch": 0.16213275299238303, + "grad_norm": 7.97866678237915, + "learning_rate": 9.428700306906291e-06, + "loss": 0.2031, + "step": 6407 + }, + { + "epoch": 0.1621580585570767, + "grad_norm": 8.924288749694824, + "learning_rate": 9.428513913257842e-06, + "loss": 0.2279, + "step": 6408 + }, + { + "epoch": 0.16218336412177037, + "grad_norm": 6.889601707458496, + "learning_rate": 9.428327491050729e-06, + "loss": 0.1933, + "step": 6409 + }, + { + "epoch": 0.16220866968646405, + "grad_norm": 6.592592716217041, + "learning_rate": 9.428141040286157e-06, + "loss": 0.1993, + "step": 6410 + }, + { + "epoch": 0.16223397525115774, + "grad_norm": 4.762804985046387, + "learning_rate": 9.427954560965324e-06, + "loss": 0.2488, + "step": 6411 + }, + { + "epoch": 0.1622592808158514, + "grad_norm": 6.251031875610352, + "learning_rate": 9.427768053089437e-06, + "loss": 0.2449, + "step": 6412 + }, + { + "epoch": 0.16228458638054508, + "grad_norm": 6.954991817474365, + "learning_rate": 9.427581516659699e-06, + "loss": 0.147, + "step": 6413 + }, + { + "epoch": 0.16230989194523876, + "grad_norm": 3.1794352531433105, + "learning_rate": 9.42739495167731e-06, + "loss": 0.19, + "step": 6414 + }, + { + "epoch": 0.16233519750993244, + "grad_norm": 5.913274765014648, + "learning_rate": 9.427208358143474e-06, + "loss": 0.1719, + "step": 6415 + }, + { + "epoch": 0.1623605030746261, + "grad_norm": 4.577851295471191, + "learning_rate": 9.427021736059397e-06, + "loss": 0.1268, + "step": 6416 + }, + { + "epoch": 0.16238580863931978, + "grad_norm": 10.251601219177246, + "learning_rate": 9.426835085426278e-06, + "loss": 0.1853, + "step": 6417 + }, + { + "epoch": 0.16241111420401347, + "grad_norm": 3.794581413269043, + "learning_rate": 9.426648406245325e-06, + "loss": 0.141, + "step": 6418 + }, + { + "epoch": 0.16243641976870715, + "grad_norm": 7.373172283172607, + "learning_rate": 9.426461698517738e-06, + "loss": 0.2011, + "step": 6419 + }, + { + "epoch": 0.1624617253334008, + "grad_norm": 11.816069602966309, + "learning_rate": 9.426274962244724e-06, + "loss": 0.1887, + "step": 6420 + }, + { + "epoch": 0.1624870308980945, + "grad_norm": 10.127960205078125, + "learning_rate": 9.426088197427486e-06, + "loss": 0.1467, + "step": 6421 + }, + { + "epoch": 0.16251233646278818, + "grad_norm": 2.1745476722717285, + "learning_rate": 9.425901404067228e-06, + "loss": 0.0751, + "step": 6422 + }, + { + "epoch": 0.16253764202748183, + "grad_norm": 9.273398399353027, + "learning_rate": 9.425714582165155e-06, + "loss": 0.161, + "step": 6423 + }, + { + "epoch": 0.16256294759217552, + "grad_norm": 3.2891244888305664, + "learning_rate": 9.425527731722473e-06, + "loss": 0.1826, + "step": 6424 + }, + { + "epoch": 0.1625882531568692, + "grad_norm": 9.061126708984375, + "learning_rate": 9.425340852740386e-06, + "loss": 0.2541, + "step": 6425 + }, + { + "epoch": 0.16261355872156288, + "grad_norm": 6.272447109222412, + "learning_rate": 9.425153945220097e-06, + "loss": 0.2297, + "step": 6426 + }, + { + "epoch": 0.16263886428625654, + "grad_norm": 2.9708006381988525, + "learning_rate": 9.424967009162815e-06, + "loss": 0.1378, + "step": 6427 + }, + { + "epoch": 0.16266416985095022, + "grad_norm": 7.302326679229736, + "learning_rate": 9.424780044569744e-06, + "loss": 0.2173, + "step": 6428 + }, + { + "epoch": 0.1626894754156439, + "grad_norm": 3.231797218322754, + "learning_rate": 9.424593051442088e-06, + "loss": 0.1581, + "step": 6429 + }, + { + "epoch": 0.16271478098033756, + "grad_norm": 13.527801513671875, + "learning_rate": 9.424406029781057e-06, + "loss": 0.2546, + "step": 6430 + }, + { + "epoch": 0.16274008654503125, + "grad_norm": 12.567400932312012, + "learning_rate": 9.424218979587852e-06, + "loss": 0.2832, + "step": 6431 + }, + { + "epoch": 0.16276539210972493, + "grad_norm": 5.986892223358154, + "learning_rate": 9.424031900863683e-06, + "loss": 0.239, + "step": 6432 + }, + { + "epoch": 0.16279069767441862, + "grad_norm": 6.073790550231934, + "learning_rate": 9.423844793609756e-06, + "loss": 0.2148, + "step": 6433 + }, + { + "epoch": 0.16281600323911227, + "grad_norm": 9.902475357055664, + "learning_rate": 9.423657657827276e-06, + "loss": 0.2478, + "step": 6434 + }, + { + "epoch": 0.16284130880380596, + "grad_norm": 2.625699996948242, + "learning_rate": 9.42347049351745e-06, + "loss": 0.1415, + "step": 6435 + }, + { + "epoch": 0.16286661436849964, + "grad_norm": 4.658029556274414, + "learning_rate": 9.423283300681485e-06, + "loss": 0.2256, + "step": 6436 + }, + { + "epoch": 0.1628919199331933, + "grad_norm": 5.907656669616699, + "learning_rate": 9.42309607932059e-06, + "loss": 0.1895, + "step": 6437 + }, + { + "epoch": 0.16291722549788698, + "grad_norm": 12.250284194946289, + "learning_rate": 9.422908829435971e-06, + "loss": 0.2373, + "step": 6438 + }, + { + "epoch": 0.16294253106258066, + "grad_norm": 3.9796435832977295, + "learning_rate": 9.422721551028837e-06, + "loss": 0.1548, + "step": 6439 + }, + { + "epoch": 0.16296783662727435, + "grad_norm": 7.141930103302002, + "learning_rate": 9.422534244100392e-06, + "loss": 0.1666, + "step": 6440 + }, + { + "epoch": 0.162993142191968, + "grad_norm": 3.268219232559204, + "learning_rate": 9.422346908651847e-06, + "loss": 0.1296, + "step": 6441 + }, + { + "epoch": 0.1630184477566617, + "grad_norm": 4.837988376617432, + "learning_rate": 9.42215954468441e-06, + "loss": 0.164, + "step": 6442 + }, + { + "epoch": 0.16304375332135537, + "grad_norm": 13.255522727966309, + "learning_rate": 9.421972152199289e-06, + "loss": 0.4017, + "step": 6443 + }, + { + "epoch": 0.16306905888604903, + "grad_norm": 7.08125638961792, + "learning_rate": 9.421784731197691e-06, + "loss": 0.2015, + "step": 6444 + }, + { + "epoch": 0.1630943644507427, + "grad_norm": 11.719327926635742, + "learning_rate": 9.421597281680828e-06, + "loss": 0.3266, + "step": 6445 + }, + { + "epoch": 0.1631196700154364, + "grad_norm": 6.7928242683410645, + "learning_rate": 9.421409803649904e-06, + "loss": 0.2418, + "step": 6446 + }, + { + "epoch": 0.16314497558013008, + "grad_norm": 8.386358261108398, + "learning_rate": 9.421222297106131e-06, + "loss": 0.2365, + "step": 6447 + }, + { + "epoch": 0.16317028114482374, + "grad_norm": 7.054584503173828, + "learning_rate": 9.42103476205072e-06, + "loss": 0.1904, + "step": 6448 + }, + { + "epoch": 0.16319558670951742, + "grad_norm": 4.15570592880249, + "learning_rate": 9.420847198484876e-06, + "loss": 0.1157, + "step": 6449 + }, + { + "epoch": 0.1632208922742111, + "grad_norm": 5.9775238037109375, + "learning_rate": 9.42065960640981e-06, + "loss": 0.2331, + "step": 6450 + }, + { + "epoch": 0.1632461978389048, + "grad_norm": 4.318760395050049, + "learning_rate": 9.420471985826736e-06, + "loss": 0.1343, + "step": 6451 + }, + { + "epoch": 0.16327150340359844, + "grad_norm": 5.84885835647583, + "learning_rate": 9.420284336736859e-06, + "loss": 0.1766, + "step": 6452 + }, + { + "epoch": 0.16329680896829213, + "grad_norm": 7.378305912017822, + "learning_rate": 9.420096659141389e-06, + "loss": 0.1777, + "step": 6453 + }, + { + "epoch": 0.1633221145329858, + "grad_norm": 7.179826736450195, + "learning_rate": 9.41990895304154e-06, + "loss": 0.1873, + "step": 6454 + }, + { + "epoch": 0.16334742009767947, + "grad_norm": 6.105809688568115, + "learning_rate": 9.419721218438518e-06, + "loss": 0.1833, + "step": 6455 + }, + { + "epoch": 0.16337272566237315, + "grad_norm": 5.982931137084961, + "learning_rate": 9.419533455333535e-06, + "loss": 0.1627, + "step": 6456 + }, + { + "epoch": 0.16339803122706684, + "grad_norm": 5.0247321128845215, + "learning_rate": 9.419345663727805e-06, + "loss": 0.2147, + "step": 6457 + }, + { + "epoch": 0.16342333679176052, + "grad_norm": 5.252191543579102, + "learning_rate": 9.419157843622537e-06, + "loss": 0.1447, + "step": 6458 + }, + { + "epoch": 0.16344864235645418, + "grad_norm": 7.998533725738525, + "learning_rate": 9.418969995018944e-06, + "loss": 0.3071, + "step": 6459 + }, + { + "epoch": 0.16347394792114786, + "grad_norm": 5.173703670501709, + "learning_rate": 9.418782117918233e-06, + "loss": 0.2543, + "step": 6460 + }, + { + "epoch": 0.16349925348584154, + "grad_norm": 4.839731216430664, + "learning_rate": 9.418594212321617e-06, + "loss": 0.2264, + "step": 6461 + }, + { + "epoch": 0.1635245590505352, + "grad_norm": 4.630385398864746, + "learning_rate": 9.41840627823031e-06, + "loss": 0.1833, + "step": 6462 + }, + { + "epoch": 0.16354986461522888, + "grad_norm": 7.537333011627197, + "learning_rate": 9.418218315645525e-06, + "loss": 0.2962, + "step": 6463 + }, + { + "epoch": 0.16357517017992257, + "grad_norm": 8.254283905029297, + "learning_rate": 9.41803032456847e-06, + "loss": 0.2616, + "step": 6464 + }, + { + "epoch": 0.16360047574461625, + "grad_norm": 13.542909622192383, + "learning_rate": 9.417842305000359e-06, + "loss": 0.2439, + "step": 6465 + }, + { + "epoch": 0.1636257813093099, + "grad_norm": 3.4528400897979736, + "learning_rate": 9.417654256942405e-06, + "loss": 0.1525, + "step": 6466 + }, + { + "epoch": 0.1636510868740036, + "grad_norm": 5.858790397644043, + "learning_rate": 9.41746618039582e-06, + "loss": 0.1614, + "step": 6467 + }, + { + "epoch": 0.16367639243869728, + "grad_norm": 7.309754848480225, + "learning_rate": 9.417278075361818e-06, + "loss": 0.1891, + "step": 6468 + }, + { + "epoch": 0.16370169800339093, + "grad_norm": 6.737215042114258, + "learning_rate": 9.417089941841611e-06, + "loss": 0.2922, + "step": 6469 + }, + { + "epoch": 0.16372700356808462, + "grad_norm": 7.40907621383667, + "learning_rate": 9.416901779836413e-06, + "loss": 0.2494, + "step": 6470 + }, + { + "epoch": 0.1637523091327783, + "grad_norm": 7.496338367462158, + "learning_rate": 9.416713589347437e-06, + "loss": 0.3064, + "step": 6471 + }, + { + "epoch": 0.16377761469747198, + "grad_norm": 5.246039390563965, + "learning_rate": 9.416525370375897e-06, + "loss": 0.2468, + "step": 6472 + }, + { + "epoch": 0.16380292026216564, + "grad_norm": 12.911567687988281, + "learning_rate": 9.416337122923005e-06, + "loss": 0.2232, + "step": 6473 + }, + { + "epoch": 0.16382822582685932, + "grad_norm": 6.223925590515137, + "learning_rate": 9.416148846989976e-06, + "loss": 0.1414, + "step": 6474 + }, + { + "epoch": 0.163853531391553, + "grad_norm": 5.858348369598389, + "learning_rate": 9.415960542578027e-06, + "loss": 0.1872, + "step": 6475 + }, + { + "epoch": 0.16387883695624667, + "grad_norm": 4.702218532562256, + "learning_rate": 9.41577220968837e-06, + "loss": 0.201, + "step": 6476 + }, + { + "epoch": 0.16390414252094035, + "grad_norm": 7.3233795166015625, + "learning_rate": 9.415583848322218e-06, + "loss": 0.2587, + "step": 6477 + }, + { + "epoch": 0.16392944808563403, + "grad_norm": 8.639924049377441, + "learning_rate": 9.415395458480787e-06, + "loss": 0.3373, + "step": 6478 + }, + { + "epoch": 0.16395475365032772, + "grad_norm": 5.414719581604004, + "learning_rate": 9.415207040165291e-06, + "loss": 0.158, + "step": 6479 + }, + { + "epoch": 0.16398005921502137, + "grad_norm": 5.390158176422119, + "learning_rate": 9.415018593376947e-06, + "loss": 0.1723, + "step": 6480 + }, + { + "epoch": 0.16400536477971506, + "grad_norm": 5.571353435516357, + "learning_rate": 9.414830118116971e-06, + "loss": 0.2868, + "step": 6481 + }, + { + "epoch": 0.16403067034440874, + "grad_norm": 11.829361915588379, + "learning_rate": 9.414641614386575e-06, + "loss": 0.188, + "step": 6482 + }, + { + "epoch": 0.16405597590910243, + "grad_norm": 4.842912197113037, + "learning_rate": 9.414453082186976e-06, + "loss": 0.1555, + "step": 6483 + }, + { + "epoch": 0.16408128147379608, + "grad_norm": 5.24864387512207, + "learning_rate": 9.414264521519392e-06, + "loss": 0.27, + "step": 6484 + }, + { + "epoch": 0.16410658703848977, + "grad_norm": 10.047249794006348, + "learning_rate": 9.414075932385035e-06, + "loss": 0.1987, + "step": 6485 + }, + { + "epoch": 0.16413189260318345, + "grad_norm": 5.523014068603516, + "learning_rate": 9.413887314785127e-06, + "loss": 0.1931, + "step": 6486 + }, + { + "epoch": 0.1641571981678771, + "grad_norm": 5.777325630187988, + "learning_rate": 9.413698668720878e-06, + "loss": 0.152, + "step": 6487 + }, + { + "epoch": 0.1641825037325708, + "grad_norm": 12.130219459533691, + "learning_rate": 9.413509994193507e-06, + "loss": 0.1939, + "step": 6488 + }, + { + "epoch": 0.16420780929726447, + "grad_norm": 8.813141822814941, + "learning_rate": 9.413321291204231e-06, + "loss": 0.2333, + "step": 6489 + }, + { + "epoch": 0.16423311486195816, + "grad_norm": 5.2674713134765625, + "learning_rate": 9.413132559754268e-06, + "loss": 0.202, + "step": 6490 + }, + { + "epoch": 0.1642584204266518, + "grad_norm": 2.9230408668518066, + "learning_rate": 9.412943799844835e-06, + "loss": 0.1135, + "step": 6491 + }, + { + "epoch": 0.1642837259913455, + "grad_norm": 3.758007287979126, + "learning_rate": 9.412755011477147e-06, + "loss": 0.1952, + "step": 6492 + }, + { + "epoch": 0.16430903155603918, + "grad_norm": 6.666371822357178, + "learning_rate": 9.412566194652422e-06, + "loss": 0.2026, + "step": 6493 + }, + { + "epoch": 0.16433433712073284, + "grad_norm": 18.22930908203125, + "learning_rate": 9.41237734937188e-06, + "loss": 0.3434, + "step": 6494 + }, + { + "epoch": 0.16435964268542652, + "grad_norm": 15.15005111694336, + "learning_rate": 9.412188475636737e-06, + "loss": 0.243, + "step": 6495 + }, + { + "epoch": 0.1643849482501202, + "grad_norm": 4.66493558883667, + "learning_rate": 9.41199957344821e-06, + "loss": 0.1887, + "step": 6496 + }, + { + "epoch": 0.1644102538148139, + "grad_norm": 10.204439163208008, + "learning_rate": 9.41181064280752e-06, + "loss": 0.2708, + "step": 6497 + }, + { + "epoch": 0.16443555937950755, + "grad_norm": 5.557548522949219, + "learning_rate": 9.411621683715883e-06, + "loss": 0.2197, + "step": 6498 + }, + { + "epoch": 0.16446086494420123, + "grad_norm": 7.127562522888184, + "learning_rate": 9.411432696174518e-06, + "loss": 0.1891, + "step": 6499 + }, + { + "epoch": 0.1644861705088949, + "grad_norm": 2.7388765811920166, + "learning_rate": 9.411243680184645e-06, + "loss": 0.142, + "step": 6500 + }, + { + "epoch": 0.16451147607358857, + "grad_norm": 10.193744659423828, + "learning_rate": 9.411054635747481e-06, + "loss": 0.2615, + "step": 6501 + }, + { + "epoch": 0.16453678163828225, + "grad_norm": 4.1723198890686035, + "learning_rate": 9.410865562864247e-06, + "loss": 0.1426, + "step": 6502 + }, + { + "epoch": 0.16456208720297594, + "grad_norm": 6.609566688537598, + "learning_rate": 9.410676461536161e-06, + "loss": 0.2448, + "step": 6503 + }, + { + "epoch": 0.16458739276766962, + "grad_norm": 4.863243103027344, + "learning_rate": 9.410487331764444e-06, + "loss": 0.1391, + "step": 6504 + }, + { + "epoch": 0.16461269833236328, + "grad_norm": 3.80049204826355, + "learning_rate": 9.410298173550313e-06, + "loss": 0.1267, + "step": 6505 + }, + { + "epoch": 0.16463800389705696, + "grad_norm": 6.605835437774658, + "learning_rate": 9.41010898689499e-06, + "loss": 0.2574, + "step": 6506 + }, + { + "epoch": 0.16466330946175065, + "grad_norm": 2.9178287982940674, + "learning_rate": 9.409919771799694e-06, + "loss": 0.1092, + "step": 6507 + }, + { + "epoch": 0.1646886150264443, + "grad_norm": 10.517829895019531, + "learning_rate": 9.409730528265648e-06, + "loss": 0.2274, + "step": 6508 + }, + { + "epoch": 0.16471392059113799, + "grad_norm": 7.4832844734191895, + "learning_rate": 9.409541256294068e-06, + "loss": 0.2726, + "step": 6509 + }, + { + "epoch": 0.16473922615583167, + "grad_norm": 4.689323425292969, + "learning_rate": 9.409351955886177e-06, + "loss": 0.0982, + "step": 6510 + }, + { + "epoch": 0.16476453172052535, + "grad_norm": 18.428325653076172, + "learning_rate": 9.409162627043196e-06, + "loss": 0.232, + "step": 6511 + }, + { + "epoch": 0.164789837285219, + "grad_norm": 10.30028247833252, + "learning_rate": 9.408973269766345e-06, + "loss": 0.2137, + "step": 6512 + }, + { + "epoch": 0.1648151428499127, + "grad_norm": 4.438777446746826, + "learning_rate": 9.408783884056844e-06, + "loss": 0.1541, + "step": 6513 + }, + { + "epoch": 0.16484044841460638, + "grad_norm": 8.55518913269043, + "learning_rate": 9.408594469915919e-06, + "loss": 0.26, + "step": 6514 + }, + { + "epoch": 0.16486575397930006, + "grad_norm": 8.808012008666992, + "learning_rate": 9.408405027344787e-06, + "loss": 0.2449, + "step": 6515 + }, + { + "epoch": 0.16489105954399372, + "grad_norm": 5.151573181152344, + "learning_rate": 9.40821555634467e-06, + "loss": 0.2802, + "step": 6516 + }, + { + "epoch": 0.1649163651086874, + "grad_norm": 5.806856632232666, + "learning_rate": 9.408026056916792e-06, + "loss": 0.126, + "step": 6517 + }, + { + "epoch": 0.16494167067338109, + "grad_norm": 6.937745571136475, + "learning_rate": 9.407836529062373e-06, + "loss": 0.2477, + "step": 6518 + }, + { + "epoch": 0.16496697623807474, + "grad_norm": 7.354395866394043, + "learning_rate": 9.407646972782637e-06, + "loss": 0.2203, + "step": 6519 + }, + { + "epoch": 0.16499228180276843, + "grad_norm": 3.7834856510162354, + "learning_rate": 9.407457388078804e-06, + "loss": 0.1595, + "step": 6520 + }, + { + "epoch": 0.1650175873674621, + "grad_norm": 3.6704561710357666, + "learning_rate": 9.4072677749521e-06, + "loss": 0.1662, + "step": 6521 + }, + { + "epoch": 0.1650428929321558, + "grad_norm": 4.293300628662109, + "learning_rate": 9.407078133403745e-06, + "loss": 0.2247, + "step": 6522 + }, + { + "epoch": 0.16506819849684945, + "grad_norm": 3.755939245223999, + "learning_rate": 9.406888463434965e-06, + "loss": 0.1731, + "step": 6523 + }, + { + "epoch": 0.16509350406154313, + "grad_norm": 4.114330291748047, + "learning_rate": 9.406698765046977e-06, + "loss": 0.156, + "step": 6524 + }, + { + "epoch": 0.16511880962623682, + "grad_norm": 5.899810791015625, + "learning_rate": 9.406509038241012e-06, + "loss": 0.2072, + "step": 6525 + }, + { + "epoch": 0.16514411519093047, + "grad_norm": 5.9483184814453125, + "learning_rate": 9.406319283018289e-06, + "loss": 0.1915, + "step": 6526 + }, + { + "epoch": 0.16516942075562416, + "grad_norm": 5.0234503746032715, + "learning_rate": 9.406129499380031e-06, + "loss": 0.1379, + "step": 6527 + }, + { + "epoch": 0.16519472632031784, + "grad_norm": 9.156627655029297, + "learning_rate": 9.405939687327466e-06, + "loss": 0.2951, + "step": 6528 + }, + { + "epoch": 0.16522003188501153, + "grad_norm": 5.81081485748291, + "learning_rate": 9.405749846861814e-06, + "loss": 0.1505, + "step": 6529 + }, + { + "epoch": 0.16524533744970518, + "grad_norm": 8.738752365112305, + "learning_rate": 9.4055599779843e-06, + "loss": 0.2478, + "step": 6530 + }, + { + "epoch": 0.16527064301439887, + "grad_norm": 9.174933433532715, + "learning_rate": 9.40537008069615e-06, + "loss": 0.1844, + "step": 6531 + }, + { + "epoch": 0.16529594857909255, + "grad_norm": 5.203457832336426, + "learning_rate": 9.405180154998587e-06, + "loss": 0.1732, + "step": 6532 + }, + { + "epoch": 0.1653212541437862, + "grad_norm": 6.4893622398376465, + "learning_rate": 9.404990200892839e-06, + "loss": 0.1958, + "step": 6533 + }, + { + "epoch": 0.1653465597084799, + "grad_norm": 8.42387866973877, + "learning_rate": 9.404800218380125e-06, + "loss": 0.1916, + "step": 6534 + }, + { + "epoch": 0.16537186527317357, + "grad_norm": 9.226655006408691, + "learning_rate": 9.404610207461678e-06, + "loss": 0.2595, + "step": 6535 + }, + { + "epoch": 0.16539717083786726, + "grad_norm": 10.901080131530762, + "learning_rate": 9.404420168138716e-06, + "loss": 0.2318, + "step": 6536 + }, + { + "epoch": 0.16542247640256091, + "grad_norm": 5.932149410247803, + "learning_rate": 9.404230100412469e-06, + "loss": 0.1581, + "step": 6537 + }, + { + "epoch": 0.1654477819672546, + "grad_norm": 19.954803466796875, + "learning_rate": 9.40404000428416e-06, + "loss": 0.1964, + "step": 6538 + }, + { + "epoch": 0.16547308753194828, + "grad_norm": 11.411225318908691, + "learning_rate": 9.403849879755016e-06, + "loss": 0.163, + "step": 6539 + }, + { + "epoch": 0.16549839309664194, + "grad_norm": 2.951465129852295, + "learning_rate": 9.403659726826265e-06, + "loss": 0.1354, + "step": 6540 + }, + { + "epoch": 0.16552369866133562, + "grad_norm": 5.256303787231445, + "learning_rate": 9.403469545499131e-06, + "loss": 0.1826, + "step": 6541 + }, + { + "epoch": 0.1655490042260293, + "grad_norm": 15.21177864074707, + "learning_rate": 9.40327933577484e-06, + "loss": 0.229, + "step": 6542 + }, + { + "epoch": 0.165574309790723, + "grad_norm": 5.4166083335876465, + "learning_rate": 9.403089097654621e-06, + "loss": 0.1601, + "step": 6543 + }, + { + "epoch": 0.16559961535541665, + "grad_norm": 4.90434455871582, + "learning_rate": 9.4028988311397e-06, + "loss": 0.1925, + "step": 6544 + }, + { + "epoch": 0.16562492092011033, + "grad_norm": 7.367715835571289, + "learning_rate": 9.402708536231301e-06, + "loss": 0.1941, + "step": 6545 + }, + { + "epoch": 0.16565022648480401, + "grad_norm": 3.0373995304107666, + "learning_rate": 9.402518212930655e-06, + "loss": 0.1695, + "step": 6546 + }, + { + "epoch": 0.1656755320494977, + "grad_norm": 7.206755638122559, + "learning_rate": 9.40232786123899e-06, + "loss": 0.2703, + "step": 6547 + }, + { + "epoch": 0.16570083761419135, + "grad_norm": 6.981588363647461, + "learning_rate": 9.402137481157529e-06, + "loss": 0.2818, + "step": 6548 + }, + { + "epoch": 0.16572614317888504, + "grad_norm": 2.791964292526245, + "learning_rate": 9.401947072687503e-06, + "loss": 0.0912, + "step": 6549 + }, + { + "epoch": 0.16575144874357872, + "grad_norm": 12.247304916381836, + "learning_rate": 9.401756635830139e-06, + "loss": 0.1595, + "step": 6550 + }, + { + "epoch": 0.16577675430827238, + "grad_norm": 6.914100646972656, + "learning_rate": 9.401566170586664e-06, + "loss": 0.186, + "step": 6551 + }, + { + "epoch": 0.16580205987296606, + "grad_norm": 3.653205156326294, + "learning_rate": 9.401375676958309e-06, + "loss": 0.1368, + "step": 6552 + }, + { + "epoch": 0.16582736543765975, + "grad_norm": 7.780629634857178, + "learning_rate": 9.401185154946302e-06, + "loss": 0.2408, + "step": 6553 + }, + { + "epoch": 0.16585267100235343, + "grad_norm": 5.025808334350586, + "learning_rate": 9.400994604551869e-06, + "loss": 0.1915, + "step": 6554 + }, + { + "epoch": 0.1658779765670471, + "grad_norm": 7.912924289703369, + "learning_rate": 9.40080402577624e-06, + "loss": 0.2036, + "step": 6555 + }, + { + "epoch": 0.16590328213174077, + "grad_norm": 3.6891980171203613, + "learning_rate": 9.400613418620645e-06, + "loss": 0.1693, + "step": 6556 + }, + { + "epoch": 0.16592858769643445, + "grad_norm": 25.452739715576172, + "learning_rate": 9.400422783086314e-06, + "loss": 0.1728, + "step": 6557 + }, + { + "epoch": 0.1659538932611281, + "grad_norm": 7.187661170959473, + "learning_rate": 9.400232119174472e-06, + "loss": 0.3244, + "step": 6558 + }, + { + "epoch": 0.1659791988258218, + "grad_norm": 8.557363510131836, + "learning_rate": 9.400041426886356e-06, + "loss": 0.2666, + "step": 6559 + }, + { + "epoch": 0.16600450439051548, + "grad_norm": 7.956151485443115, + "learning_rate": 9.399850706223187e-06, + "loss": 0.2324, + "step": 6560 + }, + { + "epoch": 0.16602980995520916, + "grad_norm": 4.381087779998779, + "learning_rate": 9.399659957186201e-06, + "loss": 0.2084, + "step": 6561 + }, + { + "epoch": 0.16605511551990282, + "grad_norm": 10.076704978942871, + "learning_rate": 9.399469179776626e-06, + "loss": 0.2128, + "step": 6562 + }, + { + "epoch": 0.1660804210845965, + "grad_norm": 8.761272430419922, + "learning_rate": 9.399278373995693e-06, + "loss": 0.2425, + "step": 6563 + }, + { + "epoch": 0.1661057266492902, + "grad_norm": 16.488948822021484, + "learning_rate": 9.399087539844632e-06, + "loss": 0.3861, + "step": 6564 + }, + { + "epoch": 0.16613103221398384, + "grad_norm": 5.078317165374756, + "learning_rate": 9.398896677324673e-06, + "loss": 0.2527, + "step": 6565 + }, + { + "epoch": 0.16615633777867753, + "grad_norm": 7.850916862487793, + "learning_rate": 9.39870578643705e-06, + "loss": 0.1807, + "step": 6566 + }, + { + "epoch": 0.1661816433433712, + "grad_norm": 7.594887733459473, + "learning_rate": 9.39851486718299e-06, + "loss": 0.1827, + "step": 6567 + }, + { + "epoch": 0.1662069489080649, + "grad_norm": 4.4038190841674805, + "learning_rate": 9.398323919563725e-06, + "loss": 0.1893, + "step": 6568 + }, + { + "epoch": 0.16623225447275855, + "grad_norm": 6.294721603393555, + "learning_rate": 9.398132943580488e-06, + "loss": 0.1807, + "step": 6569 + }, + { + "epoch": 0.16625756003745223, + "grad_norm": 3.543689012527466, + "learning_rate": 9.397941939234509e-06, + "loss": 0.16, + "step": 6570 + }, + { + "epoch": 0.16628286560214592, + "grad_norm": 13.000617980957031, + "learning_rate": 9.397750906527022e-06, + "loss": 0.2559, + "step": 6571 + }, + { + "epoch": 0.16630817116683957, + "grad_norm": 3.551058769226074, + "learning_rate": 9.397559845459256e-06, + "loss": 0.1568, + "step": 6572 + }, + { + "epoch": 0.16633347673153326, + "grad_norm": 9.63441276550293, + "learning_rate": 9.397368756032445e-06, + "loss": 0.1277, + "step": 6573 + }, + { + "epoch": 0.16635878229622694, + "grad_norm": 8.147014617919922, + "learning_rate": 9.397177638247822e-06, + "loss": 0.2035, + "step": 6574 + }, + { + "epoch": 0.16638408786092063, + "grad_norm": 6.099086761474609, + "learning_rate": 9.396986492106618e-06, + "loss": 0.2477, + "step": 6575 + }, + { + "epoch": 0.16640939342561428, + "grad_norm": 6.1402459144592285, + "learning_rate": 9.396795317610066e-06, + "loss": 0.1873, + "step": 6576 + }, + { + "epoch": 0.16643469899030797, + "grad_norm": 3.386946439743042, + "learning_rate": 9.396604114759398e-06, + "loss": 0.1685, + "step": 6577 + }, + { + "epoch": 0.16646000455500165, + "grad_norm": 7.315353870391846, + "learning_rate": 9.396412883555847e-06, + "loss": 0.2003, + "step": 6578 + }, + { + "epoch": 0.16648531011969533, + "grad_norm": 3.7243871688842773, + "learning_rate": 9.39622162400065e-06, + "loss": 0.0978, + "step": 6579 + }, + { + "epoch": 0.166510615684389, + "grad_norm": 10.768536567687988, + "learning_rate": 9.396030336095035e-06, + "loss": 0.2402, + "step": 6580 + }, + { + "epoch": 0.16653592124908267, + "grad_norm": 3.6833009719848633, + "learning_rate": 9.395839019840238e-06, + "loss": 0.1356, + "step": 6581 + }, + { + "epoch": 0.16656122681377636, + "grad_norm": 7.469836711883545, + "learning_rate": 9.395647675237495e-06, + "loss": 0.1654, + "step": 6582 + }, + { + "epoch": 0.16658653237847001, + "grad_norm": 4.600751876831055, + "learning_rate": 9.395456302288037e-06, + "loss": 0.188, + "step": 6583 + }, + { + "epoch": 0.1666118379431637, + "grad_norm": 3.9566640853881836, + "learning_rate": 9.395264900993098e-06, + "loss": 0.1459, + "step": 6584 + }, + { + "epoch": 0.16663714350785738, + "grad_norm": 21.211557388305664, + "learning_rate": 9.395073471353913e-06, + "loss": 0.1956, + "step": 6585 + }, + { + "epoch": 0.16666244907255107, + "grad_norm": 3.6863627433776855, + "learning_rate": 9.394882013371717e-06, + "loss": 0.1629, + "step": 6586 + }, + { + "epoch": 0.16668775463724472, + "grad_norm": 3.330552816390991, + "learning_rate": 9.394690527047745e-06, + "loss": 0.0988, + "step": 6587 + }, + { + "epoch": 0.1667130602019384, + "grad_norm": 8.112339973449707, + "learning_rate": 9.39449901238323e-06, + "loss": 0.2519, + "step": 6588 + }, + { + "epoch": 0.1667383657666321, + "grad_norm": 5.549391746520996, + "learning_rate": 9.394307469379411e-06, + "loss": 0.241, + "step": 6589 + }, + { + "epoch": 0.16676367133132575, + "grad_norm": 6.562213897705078, + "learning_rate": 9.394115898037519e-06, + "loss": 0.2253, + "step": 6590 + }, + { + "epoch": 0.16678897689601943, + "grad_norm": 5.10015344619751, + "learning_rate": 9.393924298358791e-06, + "loss": 0.163, + "step": 6591 + }, + { + "epoch": 0.16681428246071311, + "grad_norm": 2.8388054370880127, + "learning_rate": 9.393732670344463e-06, + "loss": 0.0606, + "step": 6592 + }, + { + "epoch": 0.1668395880254068, + "grad_norm": 9.94461441040039, + "learning_rate": 9.39354101399577e-06, + "loss": 0.2162, + "step": 6593 + }, + { + "epoch": 0.16686489359010045, + "grad_norm": 4.9861931800842285, + "learning_rate": 9.393349329313948e-06, + "loss": 0.1969, + "step": 6594 + }, + { + "epoch": 0.16689019915479414, + "grad_norm": 4.119276523590088, + "learning_rate": 9.393157616300233e-06, + "loss": 0.135, + "step": 6595 + }, + { + "epoch": 0.16691550471948782, + "grad_norm": 9.680429458618164, + "learning_rate": 9.392965874955863e-06, + "loss": 0.283, + "step": 6596 + }, + { + "epoch": 0.16694081028418148, + "grad_norm": 5.489605903625488, + "learning_rate": 9.392774105282072e-06, + "loss": 0.1649, + "step": 6597 + }, + { + "epoch": 0.16696611584887516, + "grad_norm": 5.060104846954346, + "learning_rate": 9.392582307280099e-06, + "loss": 0.1331, + "step": 6598 + }, + { + "epoch": 0.16699142141356885, + "grad_norm": 7.9865241050720215, + "learning_rate": 9.392390480951178e-06, + "loss": 0.1382, + "step": 6599 + }, + { + "epoch": 0.16701672697826253, + "grad_norm": 5.701747417449951, + "learning_rate": 9.39219862629655e-06, + "loss": 0.1863, + "step": 6600 + }, + { + "epoch": 0.1670420325429562, + "grad_norm": 5.702038764953613, + "learning_rate": 9.392006743317448e-06, + "loss": 0.1386, + "step": 6601 + }, + { + "epoch": 0.16706733810764987, + "grad_norm": 8.071221351623535, + "learning_rate": 9.391814832015113e-06, + "loss": 0.1854, + "step": 6602 + }, + { + "epoch": 0.16709264367234355, + "grad_norm": 3.9835405349731445, + "learning_rate": 9.39162289239078e-06, + "loss": 0.1237, + "step": 6603 + }, + { + "epoch": 0.1671179492370372, + "grad_norm": 9.500327110290527, + "learning_rate": 9.391430924445689e-06, + "loss": 0.2725, + "step": 6604 + }, + { + "epoch": 0.1671432548017309, + "grad_norm": 4.919365882873535, + "learning_rate": 9.391238928181075e-06, + "loss": 0.1617, + "step": 6605 + }, + { + "epoch": 0.16716856036642458, + "grad_norm": 26.56180763244629, + "learning_rate": 9.39104690359818e-06, + "loss": 0.2144, + "step": 6606 + }, + { + "epoch": 0.16719386593111826, + "grad_norm": 7.063427448272705, + "learning_rate": 9.39085485069824e-06, + "loss": 0.2119, + "step": 6607 + }, + { + "epoch": 0.16721917149581192, + "grad_norm": 5.274073123931885, + "learning_rate": 9.390662769482493e-06, + "loss": 0.214, + "step": 6608 + }, + { + "epoch": 0.1672444770605056, + "grad_norm": 6.112064838409424, + "learning_rate": 9.390470659952179e-06, + "loss": 0.2641, + "step": 6609 + }, + { + "epoch": 0.1672697826251993, + "grad_norm": 12.016668319702148, + "learning_rate": 9.390278522108536e-06, + "loss": 0.3282, + "step": 6610 + }, + { + "epoch": 0.16729508818989297, + "grad_norm": 9.237060546875, + "learning_rate": 9.390086355952804e-06, + "loss": 0.2052, + "step": 6611 + }, + { + "epoch": 0.16732039375458663, + "grad_norm": 6.060627460479736, + "learning_rate": 9.389894161486221e-06, + "loss": 0.2143, + "step": 6612 + }, + { + "epoch": 0.1673456993192803, + "grad_norm": 8.255448341369629, + "learning_rate": 9.389701938710029e-06, + "loss": 0.2438, + "step": 6613 + }, + { + "epoch": 0.167371004883974, + "grad_norm": 8.43892765045166, + "learning_rate": 9.389509687625465e-06, + "loss": 0.2362, + "step": 6614 + }, + { + "epoch": 0.16739631044866765, + "grad_norm": 4.5354509353637695, + "learning_rate": 9.389317408233768e-06, + "loss": 0.1608, + "step": 6615 + }, + { + "epoch": 0.16742161601336134, + "grad_norm": 4.402872085571289, + "learning_rate": 9.38912510053618e-06, + "loss": 0.2105, + "step": 6616 + }, + { + "epoch": 0.16744692157805502, + "grad_norm": 10.083643913269043, + "learning_rate": 9.38893276453394e-06, + "loss": 0.3469, + "step": 6617 + }, + { + "epoch": 0.1674722271427487, + "grad_norm": 19.418006896972656, + "learning_rate": 9.388740400228292e-06, + "loss": 0.19, + "step": 6618 + }, + { + "epoch": 0.16749753270744236, + "grad_norm": 8.111990928649902, + "learning_rate": 9.38854800762047e-06, + "loss": 0.2598, + "step": 6619 + }, + { + "epoch": 0.16752283827213604, + "grad_norm": 6.028273582458496, + "learning_rate": 9.38835558671172e-06, + "loss": 0.1978, + "step": 6620 + }, + { + "epoch": 0.16754814383682973, + "grad_norm": 5.074099540710449, + "learning_rate": 9.388163137503281e-06, + "loss": 0.2065, + "step": 6621 + }, + { + "epoch": 0.16757344940152338, + "grad_norm": 3.211012601852417, + "learning_rate": 9.387970659996395e-06, + "loss": 0.1531, + "step": 6622 + }, + { + "epoch": 0.16759875496621707, + "grad_norm": 5.888981819152832, + "learning_rate": 9.387778154192302e-06, + "loss": 0.1843, + "step": 6623 + }, + { + "epoch": 0.16762406053091075, + "grad_norm": 10.154402732849121, + "learning_rate": 9.387585620092243e-06, + "loss": 0.3113, + "step": 6624 + }, + { + "epoch": 0.16764936609560444, + "grad_norm": 4.004254341125488, + "learning_rate": 9.38739305769746e-06, + "loss": 0.1651, + "step": 6625 + }, + { + "epoch": 0.1676746716602981, + "grad_norm": 6.833828926086426, + "learning_rate": 9.387200467009197e-06, + "loss": 0.262, + "step": 6626 + }, + { + "epoch": 0.16769997722499178, + "grad_norm": 7.358512878417969, + "learning_rate": 9.387007848028695e-06, + "loss": 0.208, + "step": 6627 + }, + { + "epoch": 0.16772528278968546, + "grad_norm": 3.4632513523101807, + "learning_rate": 9.386815200757194e-06, + "loss": 0.1351, + "step": 6628 + }, + { + "epoch": 0.16775058835437912, + "grad_norm": 5.17762565612793, + "learning_rate": 9.386622525195936e-06, + "loss": 0.1802, + "step": 6629 + }, + { + "epoch": 0.1677758939190728, + "grad_norm": 3.9158809185028076, + "learning_rate": 9.386429821346168e-06, + "loss": 0.1391, + "step": 6630 + }, + { + "epoch": 0.16780119948376648, + "grad_norm": 4.41292142868042, + "learning_rate": 9.386237089209128e-06, + "loss": 0.2371, + "step": 6631 + }, + { + "epoch": 0.16782650504846017, + "grad_norm": 5.665665149688721, + "learning_rate": 9.386044328786062e-06, + "loss": 0.192, + "step": 6632 + }, + { + "epoch": 0.16785181061315382, + "grad_norm": 3.8309154510498047, + "learning_rate": 9.385851540078213e-06, + "loss": 0.1433, + "step": 6633 + }, + { + "epoch": 0.1678771161778475, + "grad_norm": 5.327815055847168, + "learning_rate": 9.385658723086822e-06, + "loss": 0.1505, + "step": 6634 + }, + { + "epoch": 0.1679024217425412, + "grad_norm": 3.845172882080078, + "learning_rate": 9.385465877813133e-06, + "loss": 0.1214, + "step": 6635 + }, + { + "epoch": 0.16792772730723485, + "grad_norm": 8.017242431640625, + "learning_rate": 9.385273004258391e-06, + "loss": 0.2286, + "step": 6636 + }, + { + "epoch": 0.16795303287192853, + "grad_norm": 4.022746562957764, + "learning_rate": 9.38508010242384e-06, + "loss": 0.1549, + "step": 6637 + }, + { + "epoch": 0.16797833843662222, + "grad_norm": 6.198230743408203, + "learning_rate": 9.384887172310723e-06, + "loss": 0.1986, + "step": 6638 + }, + { + "epoch": 0.1680036440013159, + "grad_norm": 18.041799545288086, + "learning_rate": 9.384694213920284e-06, + "loss": 0.3889, + "step": 6639 + }, + { + "epoch": 0.16802894956600956, + "grad_norm": 6.1641459465026855, + "learning_rate": 9.384501227253767e-06, + "loss": 0.2793, + "step": 6640 + }, + { + "epoch": 0.16805425513070324, + "grad_norm": 3.8641090393066406, + "learning_rate": 9.384308212312418e-06, + "loss": 0.1585, + "step": 6641 + }, + { + "epoch": 0.16807956069539692, + "grad_norm": 5.643777370452881, + "learning_rate": 9.38411516909748e-06, + "loss": 0.2455, + "step": 6642 + }, + { + "epoch": 0.1681048662600906, + "grad_norm": 4.249958038330078, + "learning_rate": 9.383922097610198e-06, + "loss": 0.2012, + "step": 6643 + }, + { + "epoch": 0.16813017182478426, + "grad_norm": 3.922595977783203, + "learning_rate": 9.38372899785182e-06, + "loss": 0.1541, + "step": 6644 + }, + { + "epoch": 0.16815547738947795, + "grad_norm": 7.0429277420043945, + "learning_rate": 9.383535869823588e-06, + "loss": 0.2606, + "step": 6645 + }, + { + "epoch": 0.16818078295417163, + "grad_norm": 18.542953491210938, + "learning_rate": 9.383342713526748e-06, + "loss": 0.2808, + "step": 6646 + }, + { + "epoch": 0.1682060885188653, + "grad_norm": 17.579896926879883, + "learning_rate": 9.383149528962547e-06, + "loss": 0.3469, + "step": 6647 + }, + { + "epoch": 0.16823139408355897, + "grad_norm": 3.0827174186706543, + "learning_rate": 9.382956316132231e-06, + "loss": 0.1128, + "step": 6648 + }, + { + "epoch": 0.16825669964825266, + "grad_norm": 12.870049476623535, + "learning_rate": 9.382763075037044e-06, + "loss": 0.3363, + "step": 6649 + }, + { + "epoch": 0.16828200521294634, + "grad_norm": 6.384428024291992, + "learning_rate": 9.382569805678233e-06, + "loss": 0.1981, + "step": 6650 + }, + { + "epoch": 0.16830731077764, + "grad_norm": 5.66657018661499, + "learning_rate": 9.382376508057044e-06, + "loss": 0.1324, + "step": 6651 + }, + { + "epoch": 0.16833261634233368, + "grad_norm": 14.38591194152832, + "learning_rate": 9.382183182174725e-06, + "loss": 0.2521, + "step": 6652 + }, + { + "epoch": 0.16835792190702736, + "grad_norm": 5.8313727378845215, + "learning_rate": 9.381989828032524e-06, + "loss": 0.2228, + "step": 6653 + }, + { + "epoch": 0.16838322747172102, + "grad_norm": 6.759738922119141, + "learning_rate": 9.381796445631682e-06, + "loss": 0.1787, + "step": 6654 + }, + { + "epoch": 0.1684085330364147, + "grad_norm": 9.369675636291504, + "learning_rate": 9.381603034973452e-06, + "loss": 0.1534, + "step": 6655 + }, + { + "epoch": 0.1684338386011084, + "grad_norm": 3.1732804775238037, + "learning_rate": 9.381409596059076e-06, + "loss": 0.1503, + "step": 6656 + }, + { + "epoch": 0.16845914416580207, + "grad_norm": 4.394157409667969, + "learning_rate": 9.381216128889807e-06, + "loss": 0.2014, + "step": 6657 + }, + { + "epoch": 0.16848444973049573, + "grad_norm": 5.160019874572754, + "learning_rate": 9.38102263346689e-06, + "loss": 0.2224, + "step": 6658 + }, + { + "epoch": 0.1685097552951894, + "grad_norm": 5.062007427215576, + "learning_rate": 9.380829109791573e-06, + "loss": 0.2536, + "step": 6659 + }, + { + "epoch": 0.1685350608598831, + "grad_norm": 25.695877075195312, + "learning_rate": 9.380635557865105e-06, + "loss": 0.2146, + "step": 6660 + }, + { + "epoch": 0.16856036642457675, + "grad_norm": 6.955696105957031, + "learning_rate": 9.380441977688731e-06, + "loss": 0.1429, + "step": 6661 + }, + { + "epoch": 0.16858567198927044, + "grad_norm": 5.020847797393799, + "learning_rate": 9.380248369263702e-06, + "loss": 0.1285, + "step": 6662 + }, + { + "epoch": 0.16861097755396412, + "grad_norm": 12.684264183044434, + "learning_rate": 9.380054732591267e-06, + "loss": 0.2782, + "step": 6663 + }, + { + "epoch": 0.1686362831186578, + "grad_norm": 21.801111221313477, + "learning_rate": 9.379861067672674e-06, + "loss": 0.3339, + "step": 6664 + }, + { + "epoch": 0.16866158868335146, + "grad_norm": 4.271626949310303, + "learning_rate": 9.37966737450917e-06, + "loss": 0.1197, + "step": 6665 + }, + { + "epoch": 0.16868689424804514, + "grad_norm": 5.654237270355225, + "learning_rate": 9.379473653102006e-06, + "loss": 0.2315, + "step": 6666 + }, + { + "epoch": 0.16871219981273883, + "grad_norm": 6.557236194610596, + "learning_rate": 9.379279903452432e-06, + "loss": 0.2648, + "step": 6667 + }, + { + "epoch": 0.16873750537743248, + "grad_norm": 6.043856620788574, + "learning_rate": 9.379086125561697e-06, + "loss": 0.2283, + "step": 6668 + }, + { + "epoch": 0.16876281094212617, + "grad_norm": 3.825425624847412, + "learning_rate": 9.37889231943105e-06, + "loss": 0.1406, + "step": 6669 + }, + { + "epoch": 0.16878811650681985, + "grad_norm": 4.7306928634643555, + "learning_rate": 9.37869848506174e-06, + "loss": 0.1511, + "step": 6670 + }, + { + "epoch": 0.16881342207151354, + "grad_norm": 20.160293579101562, + "learning_rate": 9.378504622455019e-06, + "loss": 0.3543, + "step": 6671 + }, + { + "epoch": 0.1688387276362072, + "grad_norm": 6.03337287902832, + "learning_rate": 9.378310731612135e-06, + "loss": 0.1439, + "step": 6672 + }, + { + "epoch": 0.16886403320090088, + "grad_norm": 18.72551918029785, + "learning_rate": 9.37811681253434e-06, + "loss": 0.2285, + "step": 6673 + }, + { + "epoch": 0.16888933876559456, + "grad_norm": 4.993171215057373, + "learning_rate": 9.377922865222886e-06, + "loss": 0.2535, + "step": 6674 + }, + { + "epoch": 0.16891464433028824, + "grad_norm": 10.979228019714355, + "learning_rate": 9.377728889679018e-06, + "loss": 0.1853, + "step": 6675 + }, + { + "epoch": 0.1689399498949819, + "grad_norm": 3.802600383758545, + "learning_rate": 9.377534885903993e-06, + "loss": 0.1607, + "step": 6676 + }, + { + "epoch": 0.16896525545967558, + "grad_norm": 5.210061550140381, + "learning_rate": 9.37734085389906e-06, + "loss": 0.2034, + "step": 6677 + }, + { + "epoch": 0.16899056102436927, + "grad_norm": 5.537067890167236, + "learning_rate": 9.37714679366547e-06, + "loss": 0.167, + "step": 6678 + }, + { + "epoch": 0.16901586658906292, + "grad_norm": 15.655715942382812, + "learning_rate": 9.376952705204474e-06, + "loss": 0.3493, + "step": 6679 + }, + { + "epoch": 0.1690411721537566, + "grad_norm": 8.007528305053711, + "learning_rate": 9.376758588517324e-06, + "loss": 0.2193, + "step": 6680 + }, + { + "epoch": 0.1690664777184503, + "grad_norm": 8.99441146850586, + "learning_rate": 9.376564443605272e-06, + "loss": 0.228, + "step": 6681 + }, + { + "epoch": 0.16909178328314398, + "grad_norm": 5.117008686065674, + "learning_rate": 9.37637027046957e-06, + "loss": 0.2122, + "step": 6682 + }, + { + "epoch": 0.16911708884783763, + "grad_norm": 11.067580223083496, + "learning_rate": 9.37617606911147e-06, + "loss": 0.2334, + "step": 6683 + }, + { + "epoch": 0.16914239441253132, + "grad_norm": 9.803935050964355, + "learning_rate": 9.375981839532226e-06, + "loss": 0.3452, + "step": 6684 + }, + { + "epoch": 0.169167699977225, + "grad_norm": 3.9748477935791016, + "learning_rate": 9.375787581733087e-06, + "loss": 0.2215, + "step": 6685 + }, + { + "epoch": 0.16919300554191866, + "grad_norm": 5.9499359130859375, + "learning_rate": 9.37559329571531e-06, + "loss": 0.1323, + "step": 6686 + }, + { + "epoch": 0.16921831110661234, + "grad_norm": 5.378055095672607, + "learning_rate": 9.375398981480145e-06, + "loss": 0.246, + "step": 6687 + }, + { + "epoch": 0.16924361667130602, + "grad_norm": 3.803631067276001, + "learning_rate": 9.375204639028846e-06, + "loss": 0.2043, + "step": 6688 + }, + { + "epoch": 0.1692689222359997, + "grad_norm": 5.650613784790039, + "learning_rate": 9.375010268362665e-06, + "loss": 0.1628, + "step": 6689 + }, + { + "epoch": 0.16929422780069336, + "grad_norm": 10.54760456085205, + "learning_rate": 9.374815869482859e-06, + "loss": 0.2348, + "step": 6690 + }, + { + "epoch": 0.16931953336538705, + "grad_norm": 3.9451260566711426, + "learning_rate": 9.374621442390677e-06, + "loss": 0.1486, + "step": 6691 + }, + { + "epoch": 0.16934483893008073, + "grad_norm": 9.595006942749023, + "learning_rate": 9.374426987087374e-06, + "loss": 0.1294, + "step": 6692 + }, + { + "epoch": 0.1693701444947744, + "grad_norm": 7.3606648445129395, + "learning_rate": 9.374232503574208e-06, + "loss": 0.2818, + "step": 6693 + }, + { + "epoch": 0.16939545005946807, + "grad_norm": 11.730530738830566, + "learning_rate": 9.374037991852428e-06, + "loss": 0.2315, + "step": 6694 + }, + { + "epoch": 0.16942075562416176, + "grad_norm": 6.040815353393555, + "learning_rate": 9.373843451923293e-06, + "loss": 0.1924, + "step": 6695 + }, + { + "epoch": 0.16944606118885544, + "grad_norm": 5.316272735595703, + "learning_rate": 9.373648883788052e-06, + "loss": 0.1439, + "step": 6696 + }, + { + "epoch": 0.1694713667535491, + "grad_norm": 2.840729236602783, + "learning_rate": 9.373454287447964e-06, + "loss": 0.0818, + "step": 6697 + }, + { + "epoch": 0.16949667231824278, + "grad_norm": 7.453365325927734, + "learning_rate": 9.373259662904285e-06, + "loss": 0.2257, + "step": 6698 + }, + { + "epoch": 0.16952197788293646, + "grad_norm": 8.413126945495605, + "learning_rate": 9.373065010158267e-06, + "loss": 0.2129, + "step": 6699 + }, + { + "epoch": 0.16954728344763012, + "grad_norm": 14.78431510925293, + "learning_rate": 9.372870329211165e-06, + "loss": 0.3064, + "step": 6700 + }, + { + "epoch": 0.1695725890123238, + "grad_norm": 4.448458194732666, + "learning_rate": 9.372675620064237e-06, + "loss": 0.2226, + "step": 6701 + }, + { + "epoch": 0.1695978945770175, + "grad_norm": 4.865854263305664, + "learning_rate": 9.372480882718736e-06, + "loss": 0.1201, + "step": 6702 + }, + { + "epoch": 0.16962320014171117, + "grad_norm": 8.737432479858398, + "learning_rate": 9.372286117175919e-06, + "loss": 0.2793, + "step": 6703 + }, + { + "epoch": 0.16964850570640483, + "grad_norm": 11.3775634765625, + "learning_rate": 9.372091323437042e-06, + "loss": 0.1899, + "step": 6704 + }, + { + "epoch": 0.1696738112710985, + "grad_norm": 7.414143085479736, + "learning_rate": 9.371896501503363e-06, + "loss": 0.1792, + "step": 6705 + }, + { + "epoch": 0.1696991168357922, + "grad_norm": 6.850212097167969, + "learning_rate": 9.371701651376137e-06, + "loss": 0.1895, + "step": 6706 + }, + { + "epoch": 0.16972442240048588, + "grad_norm": 4.165535926818848, + "learning_rate": 9.371506773056618e-06, + "loss": 0.1536, + "step": 6707 + }, + { + "epoch": 0.16974972796517954, + "grad_norm": 5.018568515777588, + "learning_rate": 9.371311866546067e-06, + "loss": 0.2023, + "step": 6708 + }, + { + "epoch": 0.16977503352987322, + "grad_norm": 6.177471160888672, + "learning_rate": 9.37111693184574e-06, + "loss": 0.1529, + "step": 6709 + }, + { + "epoch": 0.1698003390945669, + "grad_norm": 7.577075958251953, + "learning_rate": 9.370921968956888e-06, + "loss": 0.231, + "step": 6710 + }, + { + "epoch": 0.16982564465926056, + "grad_norm": 5.822270393371582, + "learning_rate": 9.370726977880777e-06, + "loss": 0.1189, + "step": 6711 + }, + { + "epoch": 0.16985095022395424, + "grad_norm": 6.950250148773193, + "learning_rate": 9.37053195861866e-06, + "loss": 0.2358, + "step": 6712 + }, + { + "epoch": 0.16987625578864793, + "grad_norm": 5.800390720367432, + "learning_rate": 9.370336911171796e-06, + "loss": 0.2088, + "step": 6713 + }, + { + "epoch": 0.1699015613533416, + "grad_norm": 4.818026542663574, + "learning_rate": 9.37014183554144e-06, + "loss": 0.167, + "step": 6714 + }, + { + "epoch": 0.16992686691803527, + "grad_norm": 13.311141967773438, + "learning_rate": 9.369946731728855e-06, + "loss": 0.0935, + "step": 6715 + }, + { + "epoch": 0.16995217248272895, + "grad_norm": 4.301824569702148, + "learning_rate": 9.369751599735296e-06, + "loss": 0.1607, + "step": 6716 + }, + { + "epoch": 0.16997747804742264, + "grad_norm": 9.99817180633545, + "learning_rate": 9.36955643956202e-06, + "loss": 0.2849, + "step": 6717 + }, + { + "epoch": 0.1700027836121163, + "grad_norm": 4.455733299255371, + "learning_rate": 9.369361251210288e-06, + "loss": 0.1071, + "step": 6718 + }, + { + "epoch": 0.17002808917680998, + "grad_norm": 6.322576522827148, + "learning_rate": 9.369166034681358e-06, + "loss": 0.2215, + "step": 6719 + }, + { + "epoch": 0.17005339474150366, + "grad_norm": 3.1542818546295166, + "learning_rate": 9.36897078997649e-06, + "loss": 0.1592, + "step": 6720 + }, + { + "epoch": 0.17007870030619734, + "grad_norm": 3.9298529624938965, + "learning_rate": 9.368775517096942e-06, + "loss": 0.125, + "step": 6721 + }, + { + "epoch": 0.170104005870891, + "grad_norm": 4.268465995788574, + "learning_rate": 9.368580216043972e-06, + "loss": 0.1116, + "step": 6722 + }, + { + "epoch": 0.17012931143558468, + "grad_norm": 10.158897399902344, + "learning_rate": 9.368384886818842e-06, + "loss": 0.1119, + "step": 6723 + }, + { + "epoch": 0.17015461700027837, + "grad_norm": 3.4960129261016846, + "learning_rate": 9.36818952942281e-06, + "loss": 0.1388, + "step": 6724 + }, + { + "epoch": 0.17017992256497202, + "grad_norm": 7.558712482452393, + "learning_rate": 9.367994143857136e-06, + "loss": 0.1677, + "step": 6725 + }, + { + "epoch": 0.1702052281296657, + "grad_norm": 5.867245674133301, + "learning_rate": 9.367798730123081e-06, + "loss": 0.2577, + "step": 6726 + }, + { + "epoch": 0.1702305336943594, + "grad_norm": 5.754895210266113, + "learning_rate": 9.367603288221903e-06, + "loss": 0.1993, + "step": 6727 + }, + { + "epoch": 0.17025583925905308, + "grad_norm": 10.543083190917969, + "learning_rate": 9.367407818154865e-06, + "loss": 0.2259, + "step": 6728 + }, + { + "epoch": 0.17028114482374673, + "grad_norm": 7.569628715515137, + "learning_rate": 9.367212319923226e-06, + "loss": 0.2494, + "step": 6729 + }, + { + "epoch": 0.17030645038844042, + "grad_norm": 25.657451629638672, + "learning_rate": 9.367016793528247e-06, + "loss": 0.3715, + "step": 6730 + }, + { + "epoch": 0.1703317559531341, + "grad_norm": 3.430884838104248, + "learning_rate": 9.36682123897119e-06, + "loss": 0.185, + "step": 6731 + }, + { + "epoch": 0.17035706151782776, + "grad_norm": 28.206340789794922, + "learning_rate": 9.366625656253314e-06, + "loss": 0.2858, + "step": 6732 + }, + { + "epoch": 0.17038236708252144, + "grad_norm": 13.495930671691895, + "learning_rate": 9.366430045375883e-06, + "loss": 0.2313, + "step": 6733 + }, + { + "epoch": 0.17040767264721512, + "grad_norm": 8.596074104309082, + "learning_rate": 9.366234406340155e-06, + "loss": 0.1939, + "step": 6734 + }, + { + "epoch": 0.1704329782119088, + "grad_norm": 7.604497909545898, + "learning_rate": 9.366038739147395e-06, + "loss": 0.2064, + "step": 6735 + }, + { + "epoch": 0.17045828377660246, + "grad_norm": 7.261715412139893, + "learning_rate": 9.365843043798863e-06, + "loss": 0.2098, + "step": 6736 + }, + { + "epoch": 0.17048358934129615, + "grad_norm": 7.018747329711914, + "learning_rate": 9.36564732029582e-06, + "loss": 0.2296, + "step": 6737 + }, + { + "epoch": 0.17050889490598983, + "grad_norm": 4.196245193481445, + "learning_rate": 9.36545156863953e-06, + "loss": 0.1717, + "step": 6738 + }, + { + "epoch": 0.17053420047068352, + "grad_norm": 11.513946533203125, + "learning_rate": 9.365255788831255e-06, + "loss": 0.1305, + "step": 6739 + }, + { + "epoch": 0.17055950603537717, + "grad_norm": 7.1759819984436035, + "learning_rate": 9.365059980872258e-06, + "loss": 0.1601, + "step": 6740 + }, + { + "epoch": 0.17058481160007086, + "grad_norm": 4.7165961265563965, + "learning_rate": 9.364864144763801e-06, + "loss": 0.1664, + "step": 6741 + }, + { + "epoch": 0.17061011716476454, + "grad_norm": 7.192108154296875, + "learning_rate": 9.364668280507147e-06, + "loss": 0.1819, + "step": 6742 + }, + { + "epoch": 0.1706354227294582, + "grad_norm": 7.042332172393799, + "learning_rate": 9.364472388103557e-06, + "loss": 0.1863, + "step": 6743 + }, + { + "epoch": 0.17066072829415188, + "grad_norm": 6.970243453979492, + "learning_rate": 9.364276467554301e-06, + "loss": 0.1865, + "step": 6744 + }, + { + "epoch": 0.17068603385884557, + "grad_norm": 3.726618528366089, + "learning_rate": 9.364080518860634e-06, + "loss": 0.1726, + "step": 6745 + }, + { + "epoch": 0.17071133942353925, + "grad_norm": 5.691264629364014, + "learning_rate": 9.363884542023825e-06, + "loss": 0.237, + "step": 6746 + }, + { + "epoch": 0.1707366449882329, + "grad_norm": 11.098023414611816, + "learning_rate": 9.363688537045137e-06, + "loss": 0.1986, + "step": 6747 + }, + { + "epoch": 0.1707619505529266, + "grad_norm": 5.1705169677734375, + "learning_rate": 9.363492503925832e-06, + "loss": 0.1672, + "step": 6748 + }, + { + "epoch": 0.17078725611762027, + "grad_norm": 15.920377731323242, + "learning_rate": 9.363296442667175e-06, + "loss": 0.2734, + "step": 6749 + }, + { + "epoch": 0.17081256168231393, + "grad_norm": 8.922563552856445, + "learning_rate": 9.363100353270434e-06, + "loss": 0.3197, + "step": 6750 + }, + { + "epoch": 0.1708378672470076, + "grad_norm": 16.440771102905273, + "learning_rate": 9.362904235736868e-06, + "loss": 0.1811, + "step": 6751 + }, + { + "epoch": 0.1708631728117013, + "grad_norm": 5.766258716583252, + "learning_rate": 9.362708090067744e-06, + "loss": 0.234, + "step": 6752 + }, + { + "epoch": 0.17088847837639498, + "grad_norm": 10.716845512390137, + "learning_rate": 9.362511916264327e-06, + "loss": 0.1827, + "step": 6753 + }, + { + "epoch": 0.17091378394108864, + "grad_norm": 6.614422798156738, + "learning_rate": 9.362315714327883e-06, + "loss": 0.1889, + "step": 6754 + }, + { + "epoch": 0.17093908950578232, + "grad_norm": 10.46297836303711, + "learning_rate": 9.362119484259675e-06, + "loss": 0.2225, + "step": 6755 + }, + { + "epoch": 0.170964395070476, + "grad_norm": 9.599376678466797, + "learning_rate": 9.361923226060971e-06, + "loss": 0.1928, + "step": 6756 + }, + { + "epoch": 0.17098970063516966, + "grad_norm": 10.178948402404785, + "learning_rate": 9.361726939733036e-06, + "loss": 0.3253, + "step": 6757 + }, + { + "epoch": 0.17101500619986335, + "grad_norm": 4.08784818649292, + "learning_rate": 9.361530625277135e-06, + "loss": 0.1146, + "step": 6758 + }, + { + "epoch": 0.17104031176455703, + "grad_norm": 5.6894378662109375, + "learning_rate": 9.361334282694535e-06, + "loss": 0.1911, + "step": 6759 + }, + { + "epoch": 0.1710656173292507, + "grad_norm": 7.082962512969971, + "learning_rate": 9.3611379119865e-06, + "loss": 0.208, + "step": 6760 + }, + { + "epoch": 0.17109092289394437, + "grad_norm": 9.600049018859863, + "learning_rate": 9.360941513154299e-06, + "loss": 0.2041, + "step": 6761 + }, + { + "epoch": 0.17111622845863805, + "grad_norm": 6.593135833740234, + "learning_rate": 9.360745086199195e-06, + "loss": 0.1169, + "step": 6762 + }, + { + "epoch": 0.17114153402333174, + "grad_norm": 3.08156418800354, + "learning_rate": 9.36054863112246e-06, + "loss": 0.1671, + "step": 6763 + }, + { + "epoch": 0.1711668395880254, + "grad_norm": 12.107463836669922, + "learning_rate": 9.360352147925356e-06, + "loss": 0.3001, + "step": 6764 + }, + { + "epoch": 0.17119214515271908, + "grad_norm": 7.50424337387085, + "learning_rate": 9.360155636609154e-06, + "loss": 0.1838, + "step": 6765 + }, + { + "epoch": 0.17121745071741276, + "grad_norm": 5.394518852233887, + "learning_rate": 9.359959097175118e-06, + "loss": 0.1967, + "step": 6766 + }, + { + "epoch": 0.17124275628210645, + "grad_norm": 4.270266056060791, + "learning_rate": 9.359762529624517e-06, + "loss": 0.1313, + "step": 6767 + }, + { + "epoch": 0.1712680618468001, + "grad_norm": 3.602637767791748, + "learning_rate": 9.35956593395862e-06, + "loss": 0.1577, + "step": 6768 + }, + { + "epoch": 0.17129336741149379, + "grad_norm": 38.60195541381836, + "learning_rate": 9.35936931017869e-06, + "loss": 0.3014, + "step": 6769 + }, + { + "epoch": 0.17131867297618747, + "grad_norm": 13.716398239135742, + "learning_rate": 9.359172658286e-06, + "loss": 0.3226, + "step": 6770 + }, + { + "epoch": 0.17134397854088115, + "grad_norm": 4.3702545166015625, + "learning_rate": 9.358975978281815e-06, + "loss": 0.1546, + "step": 6771 + }, + { + "epoch": 0.1713692841055748, + "grad_norm": 5.380885124206543, + "learning_rate": 9.358779270167408e-06, + "loss": 0.2204, + "step": 6772 + }, + { + "epoch": 0.1713945896702685, + "grad_norm": 7.841830730438232, + "learning_rate": 9.358582533944042e-06, + "loss": 0.227, + "step": 6773 + }, + { + "epoch": 0.17141989523496218, + "grad_norm": 10.673113822937012, + "learning_rate": 9.358385769612988e-06, + "loss": 0.2791, + "step": 6774 + }, + { + "epoch": 0.17144520079965583, + "grad_norm": 5.126916885375977, + "learning_rate": 9.358188977175516e-06, + "loss": 0.2059, + "step": 6775 + }, + { + "epoch": 0.17147050636434952, + "grad_norm": 9.903324127197266, + "learning_rate": 9.357992156632893e-06, + "loss": 0.1339, + "step": 6776 + }, + { + "epoch": 0.1714958119290432, + "grad_norm": 10.610599517822266, + "learning_rate": 9.357795307986389e-06, + "loss": 0.2781, + "step": 6777 + }, + { + "epoch": 0.17152111749373689, + "grad_norm": 25.726409912109375, + "learning_rate": 9.357598431237273e-06, + "loss": 0.2982, + "step": 6778 + }, + { + "epoch": 0.17154642305843054, + "grad_norm": 8.612374305725098, + "learning_rate": 9.357401526386818e-06, + "loss": 0.227, + "step": 6779 + }, + { + "epoch": 0.17157172862312423, + "grad_norm": 3.4557652473449707, + "learning_rate": 9.357204593436289e-06, + "loss": 0.1389, + "step": 6780 + }, + { + "epoch": 0.1715970341878179, + "grad_norm": 14.086133003234863, + "learning_rate": 9.357007632386958e-06, + "loss": 0.1882, + "step": 6781 + }, + { + "epoch": 0.17162233975251157, + "grad_norm": 6.403519153594971, + "learning_rate": 9.356810643240095e-06, + "loss": 0.2351, + "step": 6782 + }, + { + "epoch": 0.17164764531720525, + "grad_norm": 4.79519510269165, + "learning_rate": 9.356613625996972e-06, + "loss": 0.2585, + "step": 6783 + }, + { + "epoch": 0.17167295088189893, + "grad_norm": 5.05873441696167, + "learning_rate": 9.356416580658857e-06, + "loss": 0.1991, + "step": 6784 + }, + { + "epoch": 0.17169825644659262, + "grad_norm": 1.8520368337631226, + "learning_rate": 9.356219507227023e-06, + "loss": 0.0904, + "step": 6785 + }, + { + "epoch": 0.17172356201128627, + "grad_norm": 5.226218223571777, + "learning_rate": 9.35602240570274e-06, + "loss": 0.166, + "step": 6786 + }, + { + "epoch": 0.17174886757597996, + "grad_norm": 4.137634754180908, + "learning_rate": 9.355825276087278e-06, + "loss": 0.1214, + "step": 6787 + }, + { + "epoch": 0.17177417314067364, + "grad_norm": 8.688240051269531, + "learning_rate": 9.355628118381908e-06, + "loss": 0.1844, + "step": 6788 + }, + { + "epoch": 0.1717994787053673, + "grad_norm": 5.797042369842529, + "learning_rate": 9.355430932587904e-06, + "loss": 0.1844, + "step": 6789 + }, + { + "epoch": 0.17182478427006098, + "grad_norm": 4.982198715209961, + "learning_rate": 9.355233718706536e-06, + "loss": 0.1679, + "step": 6790 + }, + { + "epoch": 0.17185008983475467, + "grad_norm": 15.030769348144531, + "learning_rate": 9.355036476739075e-06, + "loss": 0.173, + "step": 6791 + }, + { + "epoch": 0.17187539539944835, + "grad_norm": 3.0058207511901855, + "learning_rate": 9.354839206686794e-06, + "loss": 0.0943, + "step": 6792 + }, + { + "epoch": 0.171900700964142, + "grad_norm": 3.963477373123169, + "learning_rate": 9.354641908550965e-06, + "loss": 0.1882, + "step": 6793 + }, + { + "epoch": 0.1719260065288357, + "grad_norm": 9.558378219604492, + "learning_rate": 9.354444582332861e-06, + "loss": 0.2803, + "step": 6794 + }, + { + "epoch": 0.17195131209352937, + "grad_norm": 15.332976341247559, + "learning_rate": 9.354247228033753e-06, + "loss": 0.2458, + "step": 6795 + }, + { + "epoch": 0.17197661765822303, + "grad_norm": 5.7552690505981445, + "learning_rate": 9.354049845654916e-06, + "loss": 0.247, + "step": 6796 + }, + { + "epoch": 0.1720019232229167, + "grad_norm": 4.509395599365234, + "learning_rate": 9.35385243519762e-06, + "loss": 0.1632, + "step": 6797 + }, + { + "epoch": 0.1720272287876104, + "grad_norm": 5.06679630279541, + "learning_rate": 9.35365499666314e-06, + "loss": 0.1656, + "step": 6798 + }, + { + "epoch": 0.17205253435230408, + "grad_norm": 18.176149368286133, + "learning_rate": 9.35345753005275e-06, + "loss": 0.4079, + "step": 6799 + }, + { + "epoch": 0.17207783991699774, + "grad_norm": 22.880290985107422, + "learning_rate": 9.35326003536772e-06, + "loss": 0.2191, + "step": 6800 + }, + { + "epoch": 0.17210314548169142, + "grad_norm": 4.581879138946533, + "learning_rate": 9.353062512609327e-06, + "loss": 0.1279, + "step": 6801 + }, + { + "epoch": 0.1721284510463851, + "grad_norm": 5.46740198135376, + "learning_rate": 9.352864961778842e-06, + "loss": 0.1581, + "step": 6802 + }, + { + "epoch": 0.1721537566110788, + "grad_norm": 6.080196380615234, + "learning_rate": 9.352667382877543e-06, + "loss": 0.2393, + "step": 6803 + }, + { + "epoch": 0.17217906217577245, + "grad_norm": 4.984930992126465, + "learning_rate": 9.3524697759067e-06, + "loss": 0.1695, + "step": 6804 + }, + { + "epoch": 0.17220436774046613, + "grad_norm": 5.32568883895874, + "learning_rate": 9.352272140867592e-06, + "loss": 0.1139, + "step": 6805 + }, + { + "epoch": 0.17222967330515981, + "grad_norm": 7.5475335121154785, + "learning_rate": 9.352074477761485e-06, + "loss": 0.1501, + "step": 6806 + }, + { + "epoch": 0.17225497886985347, + "grad_norm": 7.310075283050537, + "learning_rate": 9.351876786589664e-06, + "loss": 0.2459, + "step": 6807 + }, + { + "epoch": 0.17228028443454715, + "grad_norm": 3.7461555004119873, + "learning_rate": 9.351679067353397e-06, + "loss": 0.102, + "step": 6808 + }, + { + "epoch": 0.17230558999924084, + "grad_norm": 14.598490715026855, + "learning_rate": 9.351481320053962e-06, + "loss": 0.4352, + "step": 6809 + }, + { + "epoch": 0.17233089556393452, + "grad_norm": 5.3701019287109375, + "learning_rate": 9.351283544692633e-06, + "loss": 0.2334, + "step": 6810 + }, + { + "epoch": 0.17235620112862818, + "grad_norm": 4.511604309082031, + "learning_rate": 9.351085741270687e-06, + "loss": 0.1887, + "step": 6811 + }, + { + "epoch": 0.17238150669332186, + "grad_norm": 4.538031101226807, + "learning_rate": 9.350887909789396e-06, + "loss": 0.1278, + "step": 6812 + }, + { + "epoch": 0.17240681225801555, + "grad_norm": 12.398747444152832, + "learning_rate": 9.35069005025004e-06, + "loss": 0.3049, + "step": 6813 + }, + { + "epoch": 0.1724321178227092, + "grad_norm": 9.440991401672363, + "learning_rate": 9.350492162653893e-06, + "loss": 0.3649, + "step": 6814 + }, + { + "epoch": 0.1724574233874029, + "grad_norm": 4.171928882598877, + "learning_rate": 9.350294247002232e-06, + "loss": 0.1742, + "step": 6815 + }, + { + "epoch": 0.17248272895209657, + "grad_norm": 8.034317970275879, + "learning_rate": 9.350096303296331e-06, + "loss": 0.1772, + "step": 6816 + }, + { + "epoch": 0.17250803451679025, + "grad_norm": 4.888920307159424, + "learning_rate": 9.349898331537469e-06, + "loss": 0.1673, + "step": 6817 + }, + { + "epoch": 0.1725333400814839, + "grad_norm": 6.614371299743652, + "learning_rate": 9.349700331726921e-06, + "loss": 0.1164, + "step": 6818 + }, + { + "epoch": 0.1725586456461776, + "grad_norm": 3.29116153717041, + "learning_rate": 9.349502303865966e-06, + "loss": 0.1528, + "step": 6819 + }, + { + "epoch": 0.17258395121087128, + "grad_norm": 5.799214839935303, + "learning_rate": 9.34930424795588e-06, + "loss": 0.1069, + "step": 6820 + }, + { + "epoch": 0.17260925677556493, + "grad_norm": 6.972298622131348, + "learning_rate": 9.349106163997938e-06, + "loss": 0.1179, + "step": 6821 + }, + { + "epoch": 0.17263456234025862, + "grad_norm": 5.881075859069824, + "learning_rate": 9.34890805199342e-06, + "loss": 0.135, + "step": 6822 + }, + { + "epoch": 0.1726598679049523, + "grad_norm": 6.646449565887451, + "learning_rate": 9.348709911943603e-06, + "loss": 0.2736, + "step": 6823 + }, + { + "epoch": 0.172685173469646, + "grad_norm": 11.034160614013672, + "learning_rate": 9.348511743849764e-06, + "loss": 0.3405, + "step": 6824 + }, + { + "epoch": 0.17271047903433964, + "grad_norm": 6.575568675994873, + "learning_rate": 9.348313547713183e-06, + "loss": 0.239, + "step": 6825 + }, + { + "epoch": 0.17273578459903333, + "grad_norm": 4.738037109375, + "learning_rate": 9.348115323535136e-06, + "loss": 0.1739, + "step": 6826 + }, + { + "epoch": 0.172761090163727, + "grad_norm": 4.293935775756836, + "learning_rate": 9.347917071316903e-06, + "loss": 0.1835, + "step": 6827 + }, + { + "epoch": 0.17278639572842067, + "grad_norm": 8.652552604675293, + "learning_rate": 9.347718791059761e-06, + "loss": 0.2918, + "step": 6828 + }, + { + "epoch": 0.17281170129311435, + "grad_norm": 7.933583736419678, + "learning_rate": 9.34752048276499e-06, + "loss": 0.1235, + "step": 6829 + }, + { + "epoch": 0.17283700685780803, + "grad_norm": 3.1304125785827637, + "learning_rate": 9.347322146433866e-06, + "loss": 0.1701, + "step": 6830 + }, + { + "epoch": 0.17286231242250172, + "grad_norm": 4.578464508056641, + "learning_rate": 9.347123782067674e-06, + "loss": 0.114, + "step": 6831 + }, + { + "epoch": 0.17288761798719537, + "grad_norm": 8.749781608581543, + "learning_rate": 9.346925389667685e-06, + "loss": 0.2453, + "step": 6832 + }, + { + "epoch": 0.17291292355188906, + "grad_norm": 3.6669342517852783, + "learning_rate": 9.346726969235184e-06, + "loss": 0.1941, + "step": 6833 + }, + { + "epoch": 0.17293822911658274, + "grad_norm": 11.278275489807129, + "learning_rate": 9.34652852077145e-06, + "loss": 0.2774, + "step": 6834 + }, + { + "epoch": 0.17296353468127643, + "grad_norm": 6.511534214019775, + "learning_rate": 9.346330044277762e-06, + "loss": 0.1803, + "step": 6835 + }, + { + "epoch": 0.17298884024597008, + "grad_norm": 5.511619567871094, + "learning_rate": 9.346131539755401e-06, + "loss": 0.1588, + "step": 6836 + }, + { + "epoch": 0.17301414581066377, + "grad_norm": 5.207586288452148, + "learning_rate": 9.345933007205645e-06, + "loss": 0.1724, + "step": 6837 + }, + { + "epoch": 0.17303945137535745, + "grad_norm": 4.7350382804870605, + "learning_rate": 9.345734446629776e-06, + "loss": 0.1521, + "step": 6838 + }, + { + "epoch": 0.1730647569400511, + "grad_norm": 12.784790992736816, + "learning_rate": 9.345535858029073e-06, + "loss": 0.1949, + "step": 6839 + }, + { + "epoch": 0.1730900625047448, + "grad_norm": 4.895819664001465, + "learning_rate": 9.345337241404818e-06, + "loss": 0.1676, + "step": 6840 + }, + { + "epoch": 0.17311536806943847, + "grad_norm": 7.201969623565674, + "learning_rate": 9.345138596758291e-06, + "loss": 0.1206, + "step": 6841 + }, + { + "epoch": 0.17314067363413216, + "grad_norm": 10.999751091003418, + "learning_rate": 9.344939924090774e-06, + "loss": 0.2138, + "step": 6842 + }, + { + "epoch": 0.17316597919882581, + "grad_norm": 10.094195365905762, + "learning_rate": 9.344741223403548e-06, + "loss": 0.3072, + "step": 6843 + }, + { + "epoch": 0.1731912847635195, + "grad_norm": 7.708078861236572, + "learning_rate": 9.344542494697895e-06, + "loss": 0.1263, + "step": 6844 + }, + { + "epoch": 0.17321659032821318, + "grad_norm": 6.202820777893066, + "learning_rate": 9.344343737975093e-06, + "loss": 0.2661, + "step": 6845 + }, + { + "epoch": 0.17324189589290684, + "grad_norm": 7.320472717285156, + "learning_rate": 9.344144953236428e-06, + "loss": 0.2874, + "step": 6846 + }, + { + "epoch": 0.17326720145760052, + "grad_norm": 8.094647407531738, + "learning_rate": 9.34394614048318e-06, + "loss": 0.1474, + "step": 6847 + }, + { + "epoch": 0.1732925070222942, + "grad_norm": 20.691160202026367, + "learning_rate": 9.343747299716632e-06, + "loss": 0.3242, + "step": 6848 + }, + { + "epoch": 0.1733178125869879, + "grad_norm": 6.035953521728516, + "learning_rate": 9.343548430938064e-06, + "loss": 0.2106, + "step": 6849 + }, + { + "epoch": 0.17334311815168155, + "grad_norm": 8.16810417175293, + "learning_rate": 9.343349534148761e-06, + "loss": 0.3349, + "step": 6850 + }, + { + "epoch": 0.17336842371637523, + "grad_norm": 5.78212833404541, + "learning_rate": 9.343150609350004e-06, + "loss": 0.1961, + "step": 6851 + }, + { + "epoch": 0.17339372928106891, + "grad_norm": 5.2652411460876465, + "learning_rate": 9.342951656543079e-06, + "loss": 0.2671, + "step": 6852 + }, + { + "epoch": 0.17341903484576257, + "grad_norm": 11.243071556091309, + "learning_rate": 9.342752675729265e-06, + "loss": 0.2608, + "step": 6853 + }, + { + "epoch": 0.17344434041045625, + "grad_norm": 7.752415657043457, + "learning_rate": 9.342553666909847e-06, + "loss": 0.2077, + "step": 6854 + }, + { + "epoch": 0.17346964597514994, + "grad_norm": 8.544716835021973, + "learning_rate": 9.342354630086108e-06, + "loss": 0.2584, + "step": 6855 + }, + { + "epoch": 0.17349495153984362, + "grad_norm": 4.001067638397217, + "learning_rate": 9.342155565259332e-06, + "loss": 0.1408, + "step": 6856 + }, + { + "epoch": 0.17352025710453728, + "grad_norm": 6.656439304351807, + "learning_rate": 9.341956472430803e-06, + "loss": 0.2847, + "step": 6857 + }, + { + "epoch": 0.17354556266923096, + "grad_norm": 3.83164381980896, + "learning_rate": 9.341757351601803e-06, + "loss": 0.2082, + "step": 6858 + }, + { + "epoch": 0.17357086823392465, + "grad_norm": 4.898778915405273, + "learning_rate": 9.341558202773617e-06, + "loss": 0.1663, + "step": 6859 + }, + { + "epoch": 0.1735961737986183, + "grad_norm": 9.588056564331055, + "learning_rate": 9.341359025947533e-06, + "loss": 0.2932, + "step": 6860 + }, + { + "epoch": 0.173621479363312, + "grad_norm": 3.4737067222595215, + "learning_rate": 9.341159821124828e-06, + "loss": 0.1601, + "step": 6861 + }, + { + "epoch": 0.17364678492800567, + "grad_norm": 4.597464084625244, + "learning_rate": 9.340960588306793e-06, + "loss": 0.1595, + "step": 6862 + }, + { + "epoch": 0.17367209049269935, + "grad_norm": 13.941070556640625, + "learning_rate": 9.34076132749471e-06, + "loss": 0.365, + "step": 6863 + }, + { + "epoch": 0.173697396057393, + "grad_norm": 20.672893524169922, + "learning_rate": 9.340562038689867e-06, + "loss": 0.2204, + "step": 6864 + }, + { + "epoch": 0.1737227016220867, + "grad_norm": 5.329143047332764, + "learning_rate": 9.340362721893544e-06, + "loss": 0.2534, + "step": 6865 + }, + { + "epoch": 0.17374800718678038, + "grad_norm": 8.031635284423828, + "learning_rate": 9.34016337710703e-06, + "loss": 0.2528, + "step": 6866 + }, + { + "epoch": 0.17377331275147406, + "grad_norm": 3.3632867336273193, + "learning_rate": 9.33996400433161e-06, + "loss": 0.1702, + "step": 6867 + }, + { + "epoch": 0.17379861831616772, + "grad_norm": 6.984025478363037, + "learning_rate": 9.339764603568568e-06, + "loss": 0.1268, + "step": 6868 + }, + { + "epoch": 0.1738239238808614, + "grad_norm": 7.700283527374268, + "learning_rate": 9.339565174819195e-06, + "loss": 0.2475, + "step": 6869 + }, + { + "epoch": 0.1738492294455551, + "grad_norm": 6.9327311515808105, + "learning_rate": 9.33936571808477e-06, + "loss": 0.2389, + "step": 6870 + }, + { + "epoch": 0.17387453501024874, + "grad_norm": 8.539257049560547, + "learning_rate": 9.339166233366583e-06, + "loss": 0.2824, + "step": 6871 + }, + { + "epoch": 0.17389984057494243, + "grad_norm": 4.4880475997924805, + "learning_rate": 9.338966720665921e-06, + "loss": 0.1727, + "step": 6872 + }, + { + "epoch": 0.1739251461396361, + "grad_norm": 5.189909934997559, + "learning_rate": 9.33876717998407e-06, + "loss": 0.2265, + "step": 6873 + }, + { + "epoch": 0.1739504517043298, + "grad_norm": 5.408937931060791, + "learning_rate": 9.338567611322315e-06, + "loss": 0.2567, + "step": 6874 + }, + { + "epoch": 0.17397575726902345, + "grad_norm": 5.620266914367676, + "learning_rate": 9.338368014681947e-06, + "loss": 0.1698, + "step": 6875 + }, + { + "epoch": 0.17400106283371714, + "grad_norm": 6.280446529388428, + "learning_rate": 9.338168390064248e-06, + "loss": 0.2707, + "step": 6876 + }, + { + "epoch": 0.17402636839841082, + "grad_norm": 5.549182415008545, + "learning_rate": 9.337968737470511e-06, + "loss": 0.1433, + "step": 6877 + }, + { + "epoch": 0.17405167396310448, + "grad_norm": 5.041600704193115, + "learning_rate": 9.337769056902017e-06, + "loss": 0.1561, + "step": 6878 + }, + { + "epoch": 0.17407697952779816, + "grad_norm": 11.326645851135254, + "learning_rate": 9.33756934836006e-06, + "loss": 0.2468, + "step": 6879 + }, + { + "epoch": 0.17410228509249184, + "grad_norm": 14.036043167114258, + "learning_rate": 9.337369611845923e-06, + "loss": 0.2926, + "step": 6880 + }, + { + "epoch": 0.17412759065718553, + "grad_norm": 7.128543853759766, + "learning_rate": 9.3371698473609e-06, + "loss": 0.1758, + "step": 6881 + }, + { + "epoch": 0.17415289622187918, + "grad_norm": 4.121572971343994, + "learning_rate": 9.336970054906272e-06, + "loss": 0.1764, + "step": 6882 + }, + { + "epoch": 0.17417820178657287, + "grad_norm": 8.42479419708252, + "learning_rate": 9.336770234483331e-06, + "loss": 0.2314, + "step": 6883 + }, + { + "epoch": 0.17420350735126655, + "grad_norm": 4.346531867980957, + "learning_rate": 9.336570386093366e-06, + "loss": 0.1495, + "step": 6884 + }, + { + "epoch": 0.1742288129159602, + "grad_norm": 4.136104106903076, + "learning_rate": 9.336370509737664e-06, + "loss": 0.1949, + "step": 6885 + }, + { + "epoch": 0.1742541184806539, + "grad_norm": 7.975350379943848, + "learning_rate": 9.336170605417519e-06, + "loss": 0.2748, + "step": 6886 + }, + { + "epoch": 0.17427942404534758, + "grad_norm": 10.953435897827148, + "learning_rate": 9.335970673134213e-06, + "loss": 0.2266, + "step": 6887 + }, + { + "epoch": 0.17430472961004126, + "grad_norm": 6.609344482421875, + "learning_rate": 9.335770712889038e-06, + "loss": 0.2637, + "step": 6888 + }, + { + "epoch": 0.17433003517473492, + "grad_norm": 4.441615104675293, + "learning_rate": 9.335570724683287e-06, + "loss": 0.1951, + "step": 6889 + }, + { + "epoch": 0.1743553407394286, + "grad_norm": 5.478949069976807, + "learning_rate": 9.335370708518246e-06, + "loss": 0.2387, + "step": 6890 + }, + { + "epoch": 0.17438064630412228, + "grad_norm": 6.829658031463623, + "learning_rate": 9.335170664395206e-06, + "loss": 0.2337, + "step": 6891 + }, + { + "epoch": 0.17440595186881594, + "grad_norm": 5.169778823852539, + "learning_rate": 9.334970592315455e-06, + "loss": 0.1467, + "step": 6892 + }, + { + "epoch": 0.17443125743350962, + "grad_norm": 4.934110641479492, + "learning_rate": 9.334770492280288e-06, + "loss": 0.1903, + "step": 6893 + }, + { + "epoch": 0.1744565629982033, + "grad_norm": 6.22324275970459, + "learning_rate": 9.33457036429099e-06, + "loss": 0.2204, + "step": 6894 + }, + { + "epoch": 0.174481868562897, + "grad_norm": 5.289677143096924, + "learning_rate": 9.334370208348853e-06, + "loss": 0.1627, + "step": 6895 + }, + { + "epoch": 0.17450717412759065, + "grad_norm": 8.229637145996094, + "learning_rate": 9.334170024455172e-06, + "loss": 0.159, + "step": 6896 + }, + { + "epoch": 0.17453247969228433, + "grad_norm": 5.859115123748779, + "learning_rate": 9.333969812611232e-06, + "loss": 0.2009, + "step": 6897 + }, + { + "epoch": 0.17455778525697802, + "grad_norm": 10.26511001586914, + "learning_rate": 9.333769572818328e-06, + "loss": 0.1908, + "step": 6898 + }, + { + "epoch": 0.1745830908216717, + "grad_norm": 5.576926231384277, + "learning_rate": 9.33356930507775e-06, + "loss": 0.2046, + "step": 6899 + }, + { + "epoch": 0.17460839638636536, + "grad_norm": 3.743474006652832, + "learning_rate": 9.33336900939079e-06, + "loss": 0.1905, + "step": 6900 + }, + { + "epoch": 0.17463370195105904, + "grad_norm": 5.838858127593994, + "learning_rate": 9.33316868575874e-06, + "loss": 0.2498, + "step": 6901 + }, + { + "epoch": 0.17465900751575272, + "grad_norm": 3.2320661544799805, + "learning_rate": 9.33296833418289e-06, + "loss": 0.1218, + "step": 6902 + }, + { + "epoch": 0.17468431308044638, + "grad_norm": 7.281537055969238, + "learning_rate": 9.332767954664534e-06, + "loss": 0.1787, + "step": 6903 + }, + { + "epoch": 0.17470961864514006, + "grad_norm": 5.825101852416992, + "learning_rate": 9.332567547204962e-06, + "loss": 0.1808, + "step": 6904 + }, + { + "epoch": 0.17473492420983375, + "grad_norm": 7.50433349609375, + "learning_rate": 9.33236711180547e-06, + "loss": 0.2545, + "step": 6905 + }, + { + "epoch": 0.17476022977452743, + "grad_norm": 4.252110481262207, + "learning_rate": 9.332166648467347e-06, + "loss": 0.1686, + "step": 6906 + }, + { + "epoch": 0.1747855353392211, + "grad_norm": 25.576719284057617, + "learning_rate": 9.331966157191887e-06, + "loss": 0.3577, + "step": 6907 + }, + { + "epoch": 0.17481084090391477, + "grad_norm": 4.0477752685546875, + "learning_rate": 9.331765637980383e-06, + "loss": 0.1748, + "step": 6908 + }, + { + "epoch": 0.17483614646860846, + "grad_norm": 8.377290725708008, + "learning_rate": 9.331565090834127e-06, + "loss": 0.3137, + "step": 6909 + }, + { + "epoch": 0.1748614520333021, + "grad_norm": 4.116704940795898, + "learning_rate": 9.331364515754416e-06, + "loss": 0.1331, + "step": 6910 + }, + { + "epoch": 0.1748867575979958, + "grad_norm": 5.293659687042236, + "learning_rate": 9.33116391274254e-06, + "loss": 0.165, + "step": 6911 + }, + { + "epoch": 0.17491206316268948, + "grad_norm": 8.829760551452637, + "learning_rate": 9.330963281799794e-06, + "loss": 0.2383, + "step": 6912 + }, + { + "epoch": 0.17493736872738316, + "grad_norm": 8.503864288330078, + "learning_rate": 9.33076262292747e-06, + "loss": 0.2348, + "step": 6913 + }, + { + "epoch": 0.17496267429207682, + "grad_norm": 8.527604103088379, + "learning_rate": 9.330561936126864e-06, + "loss": 0.2635, + "step": 6914 + }, + { + "epoch": 0.1749879798567705, + "grad_norm": 6.9584879875183105, + "learning_rate": 9.33036122139927e-06, + "loss": 0.29, + "step": 6915 + }, + { + "epoch": 0.1750132854214642, + "grad_norm": 4.252775192260742, + "learning_rate": 9.330160478745983e-06, + "loss": 0.1542, + "step": 6916 + }, + { + "epoch": 0.17503859098615784, + "grad_norm": 4.471229553222656, + "learning_rate": 9.329959708168294e-06, + "loss": 0.1773, + "step": 6917 + }, + { + "epoch": 0.17506389655085153, + "grad_norm": 4.4788970947265625, + "learning_rate": 9.329758909667503e-06, + "loss": 0.2427, + "step": 6918 + }, + { + "epoch": 0.1750892021155452, + "grad_norm": 5.606412887573242, + "learning_rate": 9.3295580832449e-06, + "loss": 0.1717, + "step": 6919 + }, + { + "epoch": 0.1751145076802389, + "grad_norm": 5.330726146697998, + "learning_rate": 9.329357228901784e-06, + "loss": 0.1884, + "step": 6920 + }, + { + "epoch": 0.17513981324493255, + "grad_norm": 4.293797969818115, + "learning_rate": 9.329156346639448e-06, + "loss": 0.1487, + "step": 6921 + }, + { + "epoch": 0.17516511880962624, + "grad_norm": 4.3766021728515625, + "learning_rate": 9.328955436459188e-06, + "loss": 0.1898, + "step": 6922 + }, + { + "epoch": 0.17519042437431992, + "grad_norm": 4.77790641784668, + "learning_rate": 9.328754498362299e-06, + "loss": 0.1805, + "step": 6923 + }, + { + "epoch": 0.17521572993901358, + "grad_norm": 4.900589466094971, + "learning_rate": 9.328553532350077e-06, + "loss": 0.203, + "step": 6924 + }, + { + "epoch": 0.17524103550370726, + "grad_norm": 5.043643474578857, + "learning_rate": 9.328352538423819e-06, + "loss": 0.2233, + "step": 6925 + }, + { + "epoch": 0.17526634106840094, + "grad_norm": 4.420312404632568, + "learning_rate": 9.328151516584823e-06, + "loss": 0.1948, + "step": 6926 + }, + { + "epoch": 0.17529164663309463, + "grad_norm": 5.136519908905029, + "learning_rate": 9.327950466834378e-06, + "loss": 0.1995, + "step": 6927 + }, + { + "epoch": 0.17531695219778828, + "grad_norm": 13.634394645690918, + "learning_rate": 9.32774938917379e-06, + "loss": 0.249, + "step": 6928 + }, + { + "epoch": 0.17534225776248197, + "grad_norm": 10.19631290435791, + "learning_rate": 9.32754828360435e-06, + "loss": 0.3879, + "step": 6929 + }, + { + "epoch": 0.17536756332717565, + "grad_norm": 5.425992012023926, + "learning_rate": 9.327347150127355e-06, + "loss": 0.1028, + "step": 6930 + }, + { + "epoch": 0.17539286889186934, + "grad_norm": 10.420841217041016, + "learning_rate": 9.327145988744104e-06, + "loss": 0.281, + "step": 6931 + }, + { + "epoch": 0.175418174456563, + "grad_norm": 8.315710067749023, + "learning_rate": 9.326944799455893e-06, + "loss": 0.149, + "step": 6932 + }, + { + "epoch": 0.17544348002125668, + "grad_norm": 9.001465797424316, + "learning_rate": 9.32674358226402e-06, + "loss": 0.2886, + "step": 6933 + }, + { + "epoch": 0.17546878558595036, + "grad_norm": 5.377025127410889, + "learning_rate": 9.326542337169784e-06, + "loss": 0.2155, + "step": 6934 + }, + { + "epoch": 0.17549409115064402, + "grad_norm": 12.22855281829834, + "learning_rate": 9.32634106417448e-06, + "loss": 0.2449, + "step": 6935 + }, + { + "epoch": 0.1755193967153377, + "grad_norm": 4.563912391662598, + "learning_rate": 9.326139763279407e-06, + "loss": 0.1919, + "step": 6936 + }, + { + "epoch": 0.17554470228003138, + "grad_norm": 6.333463191986084, + "learning_rate": 9.325938434485864e-06, + "loss": 0.139, + "step": 6937 + }, + { + "epoch": 0.17557000784472507, + "grad_norm": 7.1755146980285645, + "learning_rate": 9.325737077795147e-06, + "loss": 0.1897, + "step": 6938 + }, + { + "epoch": 0.17559531340941872, + "grad_norm": 10.410775184631348, + "learning_rate": 9.325535693208559e-06, + "loss": 0.2421, + "step": 6939 + }, + { + "epoch": 0.1756206189741124, + "grad_norm": 8.88414478302002, + "learning_rate": 9.325334280727393e-06, + "loss": 0.1796, + "step": 6940 + }, + { + "epoch": 0.1756459245388061, + "grad_norm": 20.074905395507812, + "learning_rate": 9.325132840352953e-06, + "loss": 0.1568, + "step": 6941 + }, + { + "epoch": 0.17567123010349975, + "grad_norm": 7.037825584411621, + "learning_rate": 9.324931372086535e-06, + "loss": 0.16, + "step": 6942 + }, + { + "epoch": 0.17569653566819343, + "grad_norm": 15.175381660461426, + "learning_rate": 9.324729875929439e-06, + "loss": 0.1886, + "step": 6943 + }, + { + "epoch": 0.17572184123288712, + "grad_norm": 12.132315635681152, + "learning_rate": 9.324528351882964e-06, + "loss": 0.2203, + "step": 6944 + }, + { + "epoch": 0.1757471467975808, + "grad_norm": 6.681112289428711, + "learning_rate": 9.324326799948411e-06, + "loss": 0.2878, + "step": 6945 + }, + { + "epoch": 0.17577245236227446, + "grad_norm": 6.252891540527344, + "learning_rate": 9.324125220127078e-06, + "loss": 0.2889, + "step": 6946 + }, + { + "epoch": 0.17579775792696814, + "grad_norm": 9.874669075012207, + "learning_rate": 9.323923612420268e-06, + "loss": 0.2124, + "step": 6947 + }, + { + "epoch": 0.17582306349166182, + "grad_norm": 12.935711860656738, + "learning_rate": 9.323721976829277e-06, + "loss": 0.2555, + "step": 6948 + }, + { + "epoch": 0.17584836905635548, + "grad_norm": 3.7568650245666504, + "learning_rate": 9.323520313355408e-06, + "loss": 0.1509, + "step": 6949 + }, + { + "epoch": 0.17587367462104916, + "grad_norm": 4.650008201599121, + "learning_rate": 9.32331862199996e-06, + "loss": 0.2034, + "step": 6950 + }, + { + "epoch": 0.17589898018574285, + "grad_norm": 8.537698745727539, + "learning_rate": 9.323116902764235e-06, + "loss": 0.2351, + "step": 6951 + }, + { + "epoch": 0.17592428575043653, + "grad_norm": 6.467163562774658, + "learning_rate": 9.322915155649532e-06, + "loss": 0.2026, + "step": 6952 + }, + { + "epoch": 0.1759495913151302, + "grad_norm": 6.945534706115723, + "learning_rate": 9.322713380657154e-06, + "loss": 0.1596, + "step": 6953 + }, + { + "epoch": 0.17597489687982387, + "grad_norm": 7.621642112731934, + "learning_rate": 9.322511577788403e-06, + "loss": 0.2808, + "step": 6954 + }, + { + "epoch": 0.17600020244451756, + "grad_norm": 10.638258934020996, + "learning_rate": 9.322309747044577e-06, + "loss": 0.4016, + "step": 6955 + }, + { + "epoch": 0.1760255080092112, + "grad_norm": 5.588740348815918, + "learning_rate": 9.32210788842698e-06, + "loss": 0.2237, + "step": 6956 + }, + { + "epoch": 0.1760508135739049, + "grad_norm": 4.240406513214111, + "learning_rate": 9.321906001936913e-06, + "loss": 0.2276, + "step": 6957 + }, + { + "epoch": 0.17607611913859858, + "grad_norm": 4.371446132659912, + "learning_rate": 9.321704087575678e-06, + "loss": 0.1935, + "step": 6958 + }, + { + "epoch": 0.17610142470329226, + "grad_norm": 5.481623649597168, + "learning_rate": 9.321502145344578e-06, + "loss": 0.2136, + "step": 6959 + }, + { + "epoch": 0.17612673026798592, + "grad_norm": 4.178761959075928, + "learning_rate": 9.321300175244913e-06, + "loss": 0.1779, + "step": 6960 + }, + { + "epoch": 0.1761520358326796, + "grad_norm": 13.575218200683594, + "learning_rate": 9.32109817727799e-06, + "loss": 0.3332, + "step": 6961 + }, + { + "epoch": 0.1761773413973733, + "grad_norm": 6.578983783721924, + "learning_rate": 9.320896151445107e-06, + "loss": 0.1448, + "step": 6962 + }, + { + "epoch": 0.17620264696206697, + "grad_norm": 6.117470741271973, + "learning_rate": 9.320694097747567e-06, + "loss": 0.1872, + "step": 6963 + }, + { + "epoch": 0.17622795252676063, + "grad_norm": 10.954172134399414, + "learning_rate": 9.320492016186676e-06, + "loss": 0.1775, + "step": 6964 + }, + { + "epoch": 0.1762532580914543, + "grad_norm": 2.227628231048584, + "learning_rate": 9.320289906763735e-06, + "loss": 0.147, + "step": 6965 + }, + { + "epoch": 0.176278563656148, + "grad_norm": 6.832056522369385, + "learning_rate": 9.320087769480049e-06, + "loss": 0.25, + "step": 6966 + }, + { + "epoch": 0.17630386922084165, + "grad_norm": 5.35568904876709, + "learning_rate": 9.31988560433692e-06, + "loss": 0.207, + "step": 6967 + }, + { + "epoch": 0.17632917478553534, + "grad_norm": 3.714111328125, + "learning_rate": 9.319683411335652e-06, + "loss": 0.2133, + "step": 6968 + }, + { + "epoch": 0.17635448035022902, + "grad_norm": 3.7823874950408936, + "learning_rate": 9.319481190477551e-06, + "loss": 0.0983, + "step": 6969 + }, + { + "epoch": 0.1763797859149227, + "grad_norm": 15.895670890808105, + "learning_rate": 9.319278941763917e-06, + "loss": 0.1552, + "step": 6970 + }, + { + "epoch": 0.17640509147961636, + "grad_norm": 4.652440071105957, + "learning_rate": 9.319076665196057e-06, + "loss": 0.19, + "step": 6971 + }, + { + "epoch": 0.17643039704431004, + "grad_norm": 9.304423332214355, + "learning_rate": 9.318874360775277e-06, + "loss": 0.217, + "step": 6972 + }, + { + "epoch": 0.17645570260900373, + "grad_norm": 7.383551120758057, + "learning_rate": 9.318672028502877e-06, + "loss": 0.2427, + "step": 6973 + }, + { + "epoch": 0.17648100817369738, + "grad_norm": 4.004838466644287, + "learning_rate": 9.318469668380166e-06, + "loss": 0.1796, + "step": 6974 + }, + { + "epoch": 0.17650631373839107, + "grad_norm": 4.986091136932373, + "learning_rate": 9.318267280408446e-06, + "loss": 0.1449, + "step": 6975 + }, + { + "epoch": 0.17653161930308475, + "grad_norm": 9.40735912322998, + "learning_rate": 9.318064864589024e-06, + "loss": 0.2633, + "step": 6976 + }, + { + "epoch": 0.17655692486777844, + "grad_norm": 4.235361576080322, + "learning_rate": 9.317862420923205e-06, + "loss": 0.221, + "step": 6977 + }, + { + "epoch": 0.1765822304324721, + "grad_norm": 5.617639541625977, + "learning_rate": 9.317659949412295e-06, + "loss": 0.2707, + "step": 6978 + }, + { + "epoch": 0.17660753599716578, + "grad_norm": 7.727810859680176, + "learning_rate": 9.317457450057598e-06, + "loss": 0.1055, + "step": 6979 + }, + { + "epoch": 0.17663284156185946, + "grad_norm": 4.4078474044799805, + "learning_rate": 9.317254922860423e-06, + "loss": 0.1636, + "step": 6980 + }, + { + "epoch": 0.17665814712655312, + "grad_norm": 4.534428596496582, + "learning_rate": 9.317052367822073e-06, + "loss": 0.1604, + "step": 6981 + }, + { + "epoch": 0.1766834526912468, + "grad_norm": 3.5204827785491943, + "learning_rate": 9.316849784943856e-06, + "loss": 0.1728, + "step": 6982 + }, + { + "epoch": 0.17670875825594048, + "grad_norm": 5.167903423309326, + "learning_rate": 9.316647174227076e-06, + "loss": 0.2277, + "step": 6983 + }, + { + "epoch": 0.17673406382063417, + "grad_norm": 3.8549792766571045, + "learning_rate": 9.316444535673041e-06, + "loss": 0.1344, + "step": 6984 + }, + { + "epoch": 0.17675936938532782, + "grad_norm": 7.14749813079834, + "learning_rate": 9.31624186928306e-06, + "loss": 0.2015, + "step": 6985 + }, + { + "epoch": 0.1767846749500215, + "grad_norm": 3.730802536010742, + "learning_rate": 9.316039175058437e-06, + "loss": 0.2157, + "step": 6986 + }, + { + "epoch": 0.1768099805147152, + "grad_norm": 4.5704665184021, + "learning_rate": 9.315836453000483e-06, + "loss": 0.1839, + "step": 6987 + }, + { + "epoch": 0.17683528607940885, + "grad_norm": 13.461202621459961, + "learning_rate": 9.315633703110499e-06, + "loss": 0.2946, + "step": 6988 + }, + { + "epoch": 0.17686059164410253, + "grad_norm": 5.1254754066467285, + "learning_rate": 9.315430925389796e-06, + "loss": 0.1995, + "step": 6989 + }, + { + "epoch": 0.17688589720879622, + "grad_norm": 4.326883792877197, + "learning_rate": 9.315228119839685e-06, + "loss": 0.1883, + "step": 6990 + }, + { + "epoch": 0.1769112027734899, + "grad_norm": 8.118707656860352, + "learning_rate": 9.315025286461467e-06, + "loss": 0.2225, + "step": 6991 + }, + { + "epoch": 0.17693650833818356, + "grad_norm": 15.26432991027832, + "learning_rate": 9.314822425256455e-06, + "loss": 0.1932, + "step": 6992 + }, + { + "epoch": 0.17696181390287724, + "grad_norm": 5.983484268188477, + "learning_rate": 9.314619536225956e-06, + "loss": 0.1356, + "step": 6993 + }, + { + "epoch": 0.17698711946757092, + "grad_norm": 4.291076183319092, + "learning_rate": 9.314416619371278e-06, + "loss": 0.2226, + "step": 6994 + }, + { + "epoch": 0.1770124250322646, + "grad_norm": 4.284210205078125, + "learning_rate": 9.31421367469373e-06, + "loss": 0.1442, + "step": 6995 + }, + { + "epoch": 0.17703773059695826, + "grad_norm": 6.749930381774902, + "learning_rate": 9.31401070219462e-06, + "loss": 0.1943, + "step": 6996 + }, + { + "epoch": 0.17706303616165195, + "grad_norm": 13.361767768859863, + "learning_rate": 9.313807701875259e-06, + "loss": 0.3078, + "step": 6997 + }, + { + "epoch": 0.17708834172634563, + "grad_norm": 3.336937189102173, + "learning_rate": 9.313604673736953e-06, + "loss": 0.1326, + "step": 6998 + }, + { + "epoch": 0.1771136472910393, + "grad_norm": 6.608205795288086, + "learning_rate": 9.313401617781013e-06, + "loss": 0.1376, + "step": 6999 + }, + { + "epoch": 0.17713895285573297, + "grad_norm": 4.418377876281738, + "learning_rate": 9.313198534008749e-06, + "loss": 0.2008, + "step": 7000 + }, + { + "epoch": 0.17716425842042666, + "grad_norm": 7.14253044128418, + "learning_rate": 9.312995422421468e-06, + "loss": 0.2772, + "step": 7001 + }, + { + "epoch": 0.17718956398512034, + "grad_norm": 8.18836498260498, + "learning_rate": 9.312792283020484e-06, + "loss": 0.2118, + "step": 7002 + }, + { + "epoch": 0.177214869549814, + "grad_norm": 16.089399337768555, + "learning_rate": 9.312589115807105e-06, + "loss": 0.3068, + "step": 7003 + }, + { + "epoch": 0.17724017511450768, + "grad_norm": 5.076848030090332, + "learning_rate": 9.312385920782639e-06, + "loss": 0.1874, + "step": 7004 + }, + { + "epoch": 0.17726548067920136, + "grad_norm": 7.506100654602051, + "learning_rate": 9.3121826979484e-06, + "loss": 0.2448, + "step": 7005 + }, + { + "epoch": 0.17729078624389502, + "grad_norm": 10.582344055175781, + "learning_rate": 9.311979447305696e-06, + "loss": 0.1719, + "step": 7006 + }, + { + "epoch": 0.1773160918085887, + "grad_norm": 3.2410106658935547, + "learning_rate": 9.311776168855839e-06, + "loss": 0.1496, + "step": 7007 + }, + { + "epoch": 0.1773413973732824, + "grad_norm": 31.409631729125977, + "learning_rate": 9.31157286260014e-06, + "loss": 0.3588, + "step": 7008 + }, + { + "epoch": 0.17736670293797607, + "grad_norm": 3.1980738639831543, + "learning_rate": 9.311369528539908e-06, + "loss": 0.1277, + "step": 7009 + }, + { + "epoch": 0.17739200850266973, + "grad_norm": 7.463833808898926, + "learning_rate": 9.311166166676457e-06, + "loss": 0.264, + "step": 7010 + }, + { + "epoch": 0.1774173140673634, + "grad_norm": 7.027156352996826, + "learning_rate": 9.310962777011099e-06, + "loss": 0.2359, + "step": 7011 + }, + { + "epoch": 0.1774426196320571, + "grad_norm": 4.39315128326416, + "learning_rate": 9.310759359545142e-06, + "loss": 0.1894, + "step": 7012 + }, + { + "epoch": 0.17746792519675075, + "grad_norm": 3.7893195152282715, + "learning_rate": 9.3105559142799e-06, + "loss": 0.117, + "step": 7013 + }, + { + "epoch": 0.17749323076144444, + "grad_norm": 3.2922794818878174, + "learning_rate": 9.310352441216684e-06, + "loss": 0.1522, + "step": 7014 + }, + { + "epoch": 0.17751853632613812, + "grad_norm": 6.006938457489014, + "learning_rate": 9.310148940356808e-06, + "loss": 0.1892, + "step": 7015 + }, + { + "epoch": 0.1775438418908318, + "grad_norm": 4.317462921142578, + "learning_rate": 9.309945411701582e-06, + "loss": 0.1496, + "step": 7016 + }, + { + "epoch": 0.17756914745552546, + "grad_norm": 5.025449275970459, + "learning_rate": 9.309741855252322e-06, + "loss": 0.1535, + "step": 7017 + }, + { + "epoch": 0.17759445302021915, + "grad_norm": 3.907280683517456, + "learning_rate": 9.309538271010336e-06, + "loss": 0.1874, + "step": 7018 + }, + { + "epoch": 0.17761975858491283, + "grad_norm": 4.800095558166504, + "learning_rate": 9.30933465897694e-06, + "loss": 0.1852, + "step": 7019 + }, + { + "epoch": 0.17764506414960649, + "grad_norm": 6.6387248039245605, + "learning_rate": 9.309131019153448e-06, + "loss": 0.1888, + "step": 7020 + }, + { + "epoch": 0.17767036971430017, + "grad_norm": 4.646575927734375, + "learning_rate": 9.30892735154117e-06, + "loss": 0.1566, + "step": 7021 + }, + { + "epoch": 0.17769567527899385, + "grad_norm": 5.077683925628662, + "learning_rate": 9.308723656141422e-06, + "loss": 0.1551, + "step": 7022 + }, + { + "epoch": 0.17772098084368754, + "grad_norm": 4.631475925445557, + "learning_rate": 9.308519932955515e-06, + "loss": 0.1333, + "step": 7023 + }, + { + "epoch": 0.1777462864083812, + "grad_norm": 6.779780387878418, + "learning_rate": 9.308316181984767e-06, + "loss": 0.1887, + "step": 7024 + }, + { + "epoch": 0.17777159197307488, + "grad_norm": 5.27170991897583, + "learning_rate": 9.308112403230486e-06, + "loss": 0.1816, + "step": 7025 + }, + { + "epoch": 0.17779689753776856, + "grad_norm": 2.7730438709259033, + "learning_rate": 9.307908596693992e-06, + "loss": 0.1495, + "step": 7026 + }, + { + "epoch": 0.17782220310246225, + "grad_norm": 3.8276450634002686, + "learning_rate": 9.307704762376597e-06, + "loss": 0.1921, + "step": 7027 + }, + { + "epoch": 0.1778475086671559, + "grad_norm": 5.040582656860352, + "learning_rate": 9.307500900279615e-06, + "loss": 0.2316, + "step": 7028 + }, + { + "epoch": 0.17787281423184959, + "grad_norm": 11.649421691894531, + "learning_rate": 9.30729701040436e-06, + "loss": 0.1912, + "step": 7029 + }, + { + "epoch": 0.17789811979654327, + "grad_norm": 4.092268943786621, + "learning_rate": 9.30709309275215e-06, + "loss": 0.1753, + "step": 7030 + }, + { + "epoch": 0.17792342536123693, + "grad_norm": 7.391625881195068, + "learning_rate": 9.306889147324294e-06, + "loss": 0.2521, + "step": 7031 + }, + { + "epoch": 0.1779487309259306, + "grad_norm": 4.358129978179932, + "learning_rate": 9.306685174122112e-06, + "loss": 0.2114, + "step": 7032 + }, + { + "epoch": 0.1779740364906243, + "grad_norm": 5.803347110748291, + "learning_rate": 9.30648117314692e-06, + "loss": 0.1871, + "step": 7033 + }, + { + "epoch": 0.17799934205531798, + "grad_norm": 3.673407793045044, + "learning_rate": 9.306277144400032e-06, + "loss": 0.1258, + "step": 7034 + }, + { + "epoch": 0.17802464762001163, + "grad_norm": 9.457686424255371, + "learning_rate": 9.306073087882763e-06, + "loss": 0.1856, + "step": 7035 + }, + { + "epoch": 0.17804995318470532, + "grad_norm": 5.698096752166748, + "learning_rate": 9.305869003596431e-06, + "loss": 0.1153, + "step": 7036 + }, + { + "epoch": 0.178075258749399, + "grad_norm": 8.15119457244873, + "learning_rate": 9.305664891542348e-06, + "loss": 0.2098, + "step": 7037 + }, + { + "epoch": 0.17810056431409266, + "grad_norm": 5.876218795776367, + "learning_rate": 9.305460751721835e-06, + "loss": 0.1281, + "step": 7038 + }, + { + "epoch": 0.17812586987878634, + "grad_norm": 6.336813449859619, + "learning_rate": 9.305256584136206e-06, + "loss": 0.1354, + "step": 7039 + }, + { + "epoch": 0.17815117544348003, + "grad_norm": 8.335762977600098, + "learning_rate": 9.305052388786778e-06, + "loss": 0.169, + "step": 7040 + }, + { + "epoch": 0.1781764810081737, + "grad_norm": 8.786406517028809, + "learning_rate": 9.30484816567487e-06, + "loss": 0.1741, + "step": 7041 + }, + { + "epoch": 0.17820178657286737, + "grad_norm": 3.011301279067993, + "learning_rate": 9.304643914801793e-06, + "loss": 0.1776, + "step": 7042 + }, + { + "epoch": 0.17822709213756105, + "grad_norm": 6.181131362915039, + "learning_rate": 9.304439636168871e-06, + "loss": 0.164, + "step": 7043 + }, + { + "epoch": 0.17825239770225473, + "grad_norm": 6.106533527374268, + "learning_rate": 9.304235329777418e-06, + "loss": 0.2043, + "step": 7044 + }, + { + "epoch": 0.1782777032669484, + "grad_norm": 8.653458595275879, + "learning_rate": 9.30403099562875e-06, + "loss": 0.1535, + "step": 7045 + }, + { + "epoch": 0.17830300883164207, + "grad_norm": 5.625532150268555, + "learning_rate": 9.30382663372419e-06, + "loss": 0.1962, + "step": 7046 + }, + { + "epoch": 0.17832831439633576, + "grad_norm": 6.406497478485107, + "learning_rate": 9.30362224406505e-06, + "loss": 0.2466, + "step": 7047 + }, + { + "epoch": 0.17835361996102944, + "grad_norm": 8.340116500854492, + "learning_rate": 9.303417826652651e-06, + "loss": 0.3248, + "step": 7048 + }, + { + "epoch": 0.1783789255257231, + "grad_norm": 9.03573226928711, + "learning_rate": 9.303213381488311e-06, + "loss": 0.1736, + "step": 7049 + }, + { + "epoch": 0.17840423109041678, + "grad_norm": 12.985828399658203, + "learning_rate": 9.30300890857335e-06, + "loss": 0.2403, + "step": 7050 + }, + { + "epoch": 0.17842953665511047, + "grad_norm": 7.4810662269592285, + "learning_rate": 9.302804407909085e-06, + "loss": 0.2984, + "step": 7051 + }, + { + "epoch": 0.17845484221980412, + "grad_norm": 9.238042831420898, + "learning_rate": 9.302599879496834e-06, + "loss": 0.2892, + "step": 7052 + }, + { + "epoch": 0.1784801477844978, + "grad_norm": 3.709750175476074, + "learning_rate": 9.302395323337916e-06, + "loss": 0.086, + "step": 7053 + }, + { + "epoch": 0.1785054533491915, + "grad_norm": 8.649529457092285, + "learning_rate": 9.302190739433652e-06, + "loss": 0.2375, + "step": 7054 + }, + { + "epoch": 0.17853075891388517, + "grad_norm": 6.696395397186279, + "learning_rate": 9.301986127785359e-06, + "loss": 0.1776, + "step": 7055 + }, + { + "epoch": 0.17855606447857883, + "grad_norm": 5.462604999542236, + "learning_rate": 9.301781488394357e-06, + "loss": 0.137, + "step": 7056 + }, + { + "epoch": 0.1785813700432725, + "grad_norm": 11.494051933288574, + "learning_rate": 9.301576821261968e-06, + "loss": 0.2249, + "step": 7057 + }, + { + "epoch": 0.1786066756079662, + "grad_norm": 4.002531051635742, + "learning_rate": 9.30137212638951e-06, + "loss": 0.1969, + "step": 7058 + }, + { + "epoch": 0.17863198117265988, + "grad_norm": 6.820018768310547, + "learning_rate": 9.301167403778303e-06, + "loss": 0.1726, + "step": 7059 + }, + { + "epoch": 0.17865728673735354, + "grad_norm": 4.93993616104126, + "learning_rate": 9.300962653429668e-06, + "loss": 0.1339, + "step": 7060 + }, + { + "epoch": 0.17868259230204722, + "grad_norm": 5.648169040679932, + "learning_rate": 9.300757875344923e-06, + "loss": 0.2119, + "step": 7061 + }, + { + "epoch": 0.1787078978667409, + "grad_norm": 5.248458385467529, + "learning_rate": 9.300553069525393e-06, + "loss": 0.1549, + "step": 7062 + }, + { + "epoch": 0.17873320343143456, + "grad_norm": 5.0522003173828125, + "learning_rate": 9.300348235972396e-06, + "loss": 0.1233, + "step": 7063 + }, + { + "epoch": 0.17875850899612825, + "grad_norm": 8.602591514587402, + "learning_rate": 9.300143374687252e-06, + "loss": 0.2087, + "step": 7064 + }, + { + "epoch": 0.17878381456082193, + "grad_norm": 6.746963977813721, + "learning_rate": 9.299938485671283e-06, + "loss": 0.1981, + "step": 7065 + }, + { + "epoch": 0.1788091201255156, + "grad_norm": 7.710261821746826, + "learning_rate": 9.29973356892581e-06, + "loss": 0.2648, + "step": 7066 + }, + { + "epoch": 0.17883442569020927, + "grad_norm": 5.880885124206543, + "learning_rate": 9.299528624452156e-06, + "loss": 0.192, + "step": 7067 + }, + { + "epoch": 0.17885973125490295, + "grad_norm": 5.969942092895508, + "learning_rate": 9.299323652251643e-06, + "loss": 0.1197, + "step": 7068 + }, + { + "epoch": 0.17888503681959664, + "grad_norm": 8.728659629821777, + "learning_rate": 9.29911865232559e-06, + "loss": 0.2208, + "step": 7069 + }, + { + "epoch": 0.1789103423842903, + "grad_norm": 8.436484336853027, + "learning_rate": 9.29891362467532e-06, + "loss": 0.2086, + "step": 7070 + }, + { + "epoch": 0.17893564794898398, + "grad_norm": 6.285791397094727, + "learning_rate": 9.298708569302157e-06, + "loss": 0.224, + "step": 7071 + }, + { + "epoch": 0.17896095351367766, + "grad_norm": 6.28830099105835, + "learning_rate": 9.298503486207422e-06, + "loss": 0.1964, + "step": 7072 + }, + { + "epoch": 0.17898625907837135, + "grad_norm": 5.264509201049805, + "learning_rate": 9.298298375392437e-06, + "loss": 0.1741, + "step": 7073 + }, + { + "epoch": 0.179011564643065, + "grad_norm": 4.948216915130615, + "learning_rate": 9.298093236858526e-06, + "loss": 0.1285, + "step": 7074 + }, + { + "epoch": 0.17903687020775869, + "grad_norm": 9.507451057434082, + "learning_rate": 9.297888070607011e-06, + "loss": 0.1653, + "step": 7075 + }, + { + "epoch": 0.17906217577245237, + "grad_norm": 3.3979222774505615, + "learning_rate": 9.297682876639215e-06, + "loss": 0.1239, + "step": 7076 + }, + { + "epoch": 0.17908748133714603, + "grad_norm": 5.0559611320495605, + "learning_rate": 9.297477654956462e-06, + "loss": 0.2007, + "step": 7077 + }, + { + "epoch": 0.1791127869018397, + "grad_norm": 7.801389217376709, + "learning_rate": 9.297272405560073e-06, + "loss": 0.1906, + "step": 7078 + }, + { + "epoch": 0.1791380924665334, + "grad_norm": 5.878754138946533, + "learning_rate": 9.297067128451375e-06, + "loss": 0.1441, + "step": 7079 + }, + { + "epoch": 0.17916339803122708, + "grad_norm": 14.593664169311523, + "learning_rate": 9.296861823631691e-06, + "loss": 0.3425, + "step": 7080 + }, + { + "epoch": 0.17918870359592073, + "grad_norm": 5.2141923904418945, + "learning_rate": 9.296656491102345e-06, + "loss": 0.1411, + "step": 7081 + }, + { + "epoch": 0.17921400916061442, + "grad_norm": 6.199173927307129, + "learning_rate": 9.296451130864658e-06, + "loss": 0.1605, + "step": 7082 + }, + { + "epoch": 0.1792393147253081, + "grad_norm": 10.789734840393066, + "learning_rate": 9.29624574291996e-06, + "loss": 0.3147, + "step": 7083 + }, + { + "epoch": 0.17926462029000176, + "grad_norm": 8.00094985961914, + "learning_rate": 9.29604032726957e-06, + "loss": 0.2005, + "step": 7084 + }, + { + "epoch": 0.17928992585469544, + "grad_norm": 9.939878463745117, + "learning_rate": 9.295834883914816e-06, + "loss": 0.2028, + "step": 7085 + }, + { + "epoch": 0.17931523141938913, + "grad_norm": 3.8312413692474365, + "learning_rate": 9.295629412857022e-06, + "loss": 0.1401, + "step": 7086 + }, + { + "epoch": 0.1793405369840828, + "grad_norm": 4.378125190734863, + "learning_rate": 9.295423914097511e-06, + "loss": 0.2058, + "step": 7087 + }, + { + "epoch": 0.17936584254877647, + "grad_norm": 4.079626083374023, + "learning_rate": 9.295218387637613e-06, + "loss": 0.1454, + "step": 7088 + }, + { + "epoch": 0.17939114811347015, + "grad_norm": 11.065913200378418, + "learning_rate": 9.29501283347865e-06, + "loss": 0.2254, + "step": 7089 + }, + { + "epoch": 0.17941645367816383, + "grad_norm": 4.977231025695801, + "learning_rate": 9.294807251621947e-06, + "loss": 0.1747, + "step": 7090 + }, + { + "epoch": 0.17944175924285752, + "grad_norm": 7.943212032318115, + "learning_rate": 9.294601642068831e-06, + "loss": 0.2843, + "step": 7091 + }, + { + "epoch": 0.17946706480755117, + "grad_norm": 11.632683753967285, + "learning_rate": 9.29439600482063e-06, + "loss": 0.1187, + "step": 7092 + }, + { + "epoch": 0.17949237037224486, + "grad_norm": 4.5355544090271, + "learning_rate": 9.294190339878666e-06, + "loss": 0.1367, + "step": 7093 + }, + { + "epoch": 0.17951767593693854, + "grad_norm": 13.118632316589355, + "learning_rate": 9.293984647244268e-06, + "loss": 0.2404, + "step": 7094 + }, + { + "epoch": 0.1795429815016322, + "grad_norm": 21.87714958190918, + "learning_rate": 9.29377892691876e-06, + "loss": 0.257, + "step": 7095 + }, + { + "epoch": 0.17956828706632588, + "grad_norm": 6.5762481689453125, + "learning_rate": 9.293573178903472e-06, + "loss": 0.1912, + "step": 7096 + }, + { + "epoch": 0.17959359263101957, + "grad_norm": 14.687666893005371, + "learning_rate": 9.293367403199729e-06, + "loss": 0.2565, + "step": 7097 + }, + { + "epoch": 0.17961889819571325, + "grad_norm": 9.957643508911133, + "learning_rate": 9.293161599808858e-06, + "loss": 0.2986, + "step": 7098 + }, + { + "epoch": 0.1796442037604069, + "grad_norm": 6.229722499847412, + "learning_rate": 9.292955768732187e-06, + "loss": 0.1235, + "step": 7099 + }, + { + "epoch": 0.1796695093251006, + "grad_norm": 5.425766468048096, + "learning_rate": 9.292749909971043e-06, + "loss": 0.2069, + "step": 7100 + }, + { + "epoch": 0.17969481488979427, + "grad_norm": 5.478490829467773, + "learning_rate": 9.292544023526753e-06, + "loss": 0.1596, + "step": 7101 + }, + { + "epoch": 0.17972012045448793, + "grad_norm": 6.551464080810547, + "learning_rate": 9.292338109400644e-06, + "loss": 0.2089, + "step": 7102 + }, + { + "epoch": 0.17974542601918161, + "grad_norm": 4.390848636627197, + "learning_rate": 9.292132167594045e-06, + "loss": 0.186, + "step": 7103 + }, + { + "epoch": 0.1797707315838753, + "grad_norm": 3.8617897033691406, + "learning_rate": 9.291926198108287e-06, + "loss": 0.1411, + "step": 7104 + }, + { + "epoch": 0.17979603714856898, + "grad_norm": 4.297536849975586, + "learning_rate": 9.291720200944693e-06, + "loss": 0.1677, + "step": 7105 + }, + { + "epoch": 0.17982134271326264, + "grad_norm": 5.40979528427124, + "learning_rate": 9.291514176104593e-06, + "loss": 0.2597, + "step": 7106 + }, + { + "epoch": 0.17984664827795632, + "grad_norm": 5.059398174285889, + "learning_rate": 9.291308123589317e-06, + "loss": 0.2299, + "step": 7107 + }, + { + "epoch": 0.17987195384265, + "grad_norm": 9.187435150146484, + "learning_rate": 9.291102043400195e-06, + "loss": 0.2154, + "step": 7108 + }, + { + "epoch": 0.17989725940734366, + "grad_norm": 3.3772590160369873, + "learning_rate": 9.290895935538552e-06, + "loss": 0.1996, + "step": 7109 + }, + { + "epoch": 0.17992256497203735, + "grad_norm": 2.8432483673095703, + "learning_rate": 9.29068980000572e-06, + "loss": 0.1259, + "step": 7110 + }, + { + "epoch": 0.17994787053673103, + "grad_norm": 10.894735336303711, + "learning_rate": 9.290483636803028e-06, + "loss": 0.241, + "step": 7111 + }, + { + "epoch": 0.17997317610142471, + "grad_norm": 10.364006042480469, + "learning_rate": 9.290277445931803e-06, + "loss": 0.225, + "step": 7112 + }, + { + "epoch": 0.17999848166611837, + "grad_norm": 3.8737072944641113, + "learning_rate": 9.290071227393381e-06, + "loss": 0.1317, + "step": 7113 + }, + { + "epoch": 0.18002378723081205, + "grad_norm": 7.080484867095947, + "learning_rate": 9.289864981189084e-06, + "loss": 0.253, + "step": 7114 + }, + { + "epoch": 0.18004909279550574, + "grad_norm": 6.9996466636657715, + "learning_rate": 9.289658707320247e-06, + "loss": 0.2494, + "step": 7115 + }, + { + "epoch": 0.1800743983601994, + "grad_norm": 5.926032543182373, + "learning_rate": 9.289452405788198e-06, + "loss": 0.2204, + "step": 7116 + }, + { + "epoch": 0.18009970392489308, + "grad_norm": 4.160160064697266, + "learning_rate": 9.289246076594269e-06, + "loss": 0.124, + "step": 7117 + }, + { + "epoch": 0.18012500948958676, + "grad_norm": 9.239767074584961, + "learning_rate": 9.289039719739791e-06, + "loss": 0.2434, + "step": 7118 + }, + { + "epoch": 0.18015031505428045, + "grad_norm": 5.528353691101074, + "learning_rate": 9.288833335226093e-06, + "loss": 0.2, + "step": 7119 + }, + { + "epoch": 0.1801756206189741, + "grad_norm": 6.361721992492676, + "learning_rate": 9.288626923054506e-06, + "loss": 0.1633, + "step": 7120 + }, + { + "epoch": 0.1802009261836678, + "grad_norm": 9.08027458190918, + "learning_rate": 9.28842048322636e-06, + "loss": 0.2056, + "step": 7121 + }, + { + "epoch": 0.18022623174836147, + "grad_norm": 5.168491363525391, + "learning_rate": 9.288214015742992e-06, + "loss": 0.1113, + "step": 7122 + }, + { + "epoch": 0.18025153731305515, + "grad_norm": 10.43698787689209, + "learning_rate": 9.288007520605727e-06, + "loss": 0.4266, + "step": 7123 + }, + { + "epoch": 0.1802768428777488, + "grad_norm": 4.025913715362549, + "learning_rate": 9.287800997815899e-06, + "loss": 0.142, + "step": 7124 + }, + { + "epoch": 0.1803021484424425, + "grad_norm": 3.855832576751709, + "learning_rate": 9.287594447374841e-06, + "loss": 0.1218, + "step": 7125 + }, + { + "epoch": 0.18032745400713618, + "grad_norm": 5.43759298324585, + "learning_rate": 9.287387869283884e-06, + "loss": 0.2006, + "step": 7126 + }, + { + "epoch": 0.18035275957182983, + "grad_norm": 4.008563995361328, + "learning_rate": 9.28718126354436e-06, + "loss": 0.1433, + "step": 7127 + }, + { + "epoch": 0.18037806513652352, + "grad_norm": 3.629920721054077, + "learning_rate": 9.286974630157601e-06, + "loss": 0.1965, + "step": 7128 + }, + { + "epoch": 0.1804033707012172, + "grad_norm": 18.11212158203125, + "learning_rate": 9.28676796912494e-06, + "loss": 0.2588, + "step": 7129 + }, + { + "epoch": 0.1804286762659109, + "grad_norm": 8.145552635192871, + "learning_rate": 9.28656128044771e-06, + "loss": 0.1812, + "step": 7130 + }, + { + "epoch": 0.18045398183060454, + "grad_norm": 5.829428195953369, + "learning_rate": 9.286354564127245e-06, + "loss": 0.2385, + "step": 7131 + }, + { + "epoch": 0.18047928739529823, + "grad_norm": 3.4562692642211914, + "learning_rate": 9.286147820164875e-06, + "loss": 0.2051, + "step": 7132 + }, + { + "epoch": 0.1805045929599919, + "grad_norm": 5.329333782196045, + "learning_rate": 9.285941048561935e-06, + "loss": 0.2427, + "step": 7133 + }, + { + "epoch": 0.18052989852468557, + "grad_norm": 9.826545715332031, + "learning_rate": 9.285734249319758e-06, + "loss": 0.3597, + "step": 7134 + }, + { + "epoch": 0.18055520408937925, + "grad_norm": 10.272750854492188, + "learning_rate": 9.28552742243968e-06, + "loss": 0.3388, + "step": 7135 + }, + { + "epoch": 0.18058050965407293, + "grad_norm": 6.864361763000488, + "learning_rate": 9.28532056792303e-06, + "loss": 0.1782, + "step": 7136 + }, + { + "epoch": 0.18060581521876662, + "grad_norm": 5.715782165527344, + "learning_rate": 9.285113685771148e-06, + "loss": 0.2094, + "step": 7137 + }, + { + "epoch": 0.18063112078346028, + "grad_norm": 5.239694595336914, + "learning_rate": 9.284906775985363e-06, + "loss": 0.1673, + "step": 7138 + }, + { + "epoch": 0.18065642634815396, + "grad_norm": 4.4374165534973145, + "learning_rate": 9.284699838567012e-06, + "loss": 0.2229, + "step": 7139 + }, + { + "epoch": 0.18068173191284764, + "grad_norm": 5.166074275970459, + "learning_rate": 9.284492873517428e-06, + "loss": 0.2077, + "step": 7140 + }, + { + "epoch": 0.1807070374775413, + "grad_norm": 4.6798319816589355, + "learning_rate": 9.284285880837947e-06, + "loss": 0.1886, + "step": 7141 + }, + { + "epoch": 0.18073234304223498, + "grad_norm": 6.3238959312438965, + "learning_rate": 9.284078860529904e-06, + "loss": 0.2247, + "step": 7142 + }, + { + "epoch": 0.18075764860692867, + "grad_norm": 14.763148307800293, + "learning_rate": 9.28387181259463e-06, + "loss": 0.3283, + "step": 7143 + }, + { + "epoch": 0.18078295417162235, + "grad_norm": 12.5813570022583, + "learning_rate": 9.283664737033468e-06, + "loss": 0.3045, + "step": 7144 + }, + { + "epoch": 0.180808259736316, + "grad_norm": 5.546518802642822, + "learning_rate": 9.283457633847747e-06, + "loss": 0.2166, + "step": 7145 + }, + { + "epoch": 0.1808335653010097, + "grad_norm": 5.604958534240723, + "learning_rate": 9.283250503038805e-06, + "loss": 0.152, + "step": 7146 + }, + { + "epoch": 0.18085887086570338, + "grad_norm": 12.166036605834961, + "learning_rate": 9.283043344607975e-06, + "loss": 0.2975, + "step": 7147 + }, + { + "epoch": 0.18088417643039703, + "grad_norm": 3.785609245300293, + "learning_rate": 9.2828361585566e-06, + "loss": 0.1745, + "step": 7148 + }, + { + "epoch": 0.18090948199509072, + "grad_norm": 4.361323833465576, + "learning_rate": 9.282628944886006e-06, + "loss": 0.1668, + "step": 7149 + }, + { + "epoch": 0.1809347875597844, + "grad_norm": 8.581879615783691, + "learning_rate": 9.282421703597537e-06, + "loss": 0.0905, + "step": 7150 + }, + { + "epoch": 0.18096009312447808, + "grad_norm": 3.289496660232544, + "learning_rate": 9.282214434692527e-06, + "loss": 0.1061, + "step": 7151 + }, + { + "epoch": 0.18098539868917174, + "grad_norm": 8.604721069335938, + "learning_rate": 9.282007138172312e-06, + "loss": 0.3144, + "step": 7152 + }, + { + "epoch": 0.18101070425386542, + "grad_norm": 3.2030200958251953, + "learning_rate": 9.281799814038231e-06, + "loss": 0.0974, + "step": 7153 + }, + { + "epoch": 0.1810360098185591, + "grad_norm": 6.642822265625, + "learning_rate": 9.281592462291618e-06, + "loss": 0.1724, + "step": 7154 + }, + { + "epoch": 0.1810613153832528, + "grad_norm": 13.714324951171875, + "learning_rate": 9.281385082933812e-06, + "loss": 0.3792, + "step": 7155 + }, + { + "epoch": 0.18108662094794645, + "grad_norm": 4.114198684692383, + "learning_rate": 9.281177675966148e-06, + "loss": 0.2233, + "step": 7156 + }, + { + "epoch": 0.18111192651264013, + "grad_norm": 7.375676155090332, + "learning_rate": 9.280970241389968e-06, + "loss": 0.2431, + "step": 7157 + }, + { + "epoch": 0.18113723207733382, + "grad_norm": 3.638950824737549, + "learning_rate": 9.280762779206608e-06, + "loss": 0.1759, + "step": 7158 + }, + { + "epoch": 0.18116253764202747, + "grad_norm": 8.00721263885498, + "learning_rate": 9.280555289417403e-06, + "loss": 0.1327, + "step": 7159 + }, + { + "epoch": 0.18118784320672116, + "grad_norm": 4.4102044105529785, + "learning_rate": 9.280347772023694e-06, + "loss": 0.2043, + "step": 7160 + }, + { + "epoch": 0.18121314877141484, + "grad_norm": 10.388111114501953, + "learning_rate": 9.28014022702682e-06, + "loss": 0.2132, + "step": 7161 + }, + { + "epoch": 0.18123845433610852, + "grad_norm": 7.8655314445495605, + "learning_rate": 9.279932654428116e-06, + "loss": 0.2748, + "step": 7162 + }, + { + "epoch": 0.18126375990080218, + "grad_norm": 5.12955904006958, + "learning_rate": 9.279725054228923e-06, + "loss": 0.1748, + "step": 7163 + }, + { + "epoch": 0.18128906546549586, + "grad_norm": 4.155306339263916, + "learning_rate": 9.279517426430578e-06, + "loss": 0.1039, + "step": 7164 + }, + { + "epoch": 0.18131437103018955, + "grad_norm": 2.905670166015625, + "learning_rate": 9.279309771034423e-06, + "loss": 0.124, + "step": 7165 + }, + { + "epoch": 0.1813396765948832, + "grad_norm": 5.074924468994141, + "learning_rate": 9.279102088041793e-06, + "loss": 0.2229, + "step": 7166 + }, + { + "epoch": 0.1813649821595769, + "grad_norm": 6.558659553527832, + "learning_rate": 9.278894377454033e-06, + "loss": 0.1658, + "step": 7167 + }, + { + "epoch": 0.18139028772427057, + "grad_norm": 17.182147979736328, + "learning_rate": 9.278686639272475e-06, + "loss": 0.224, + "step": 7168 + }, + { + "epoch": 0.18141559328896426, + "grad_norm": 3.5467593669891357, + "learning_rate": 9.278478873498467e-06, + "loss": 0.1692, + "step": 7169 + }, + { + "epoch": 0.1814408988536579, + "grad_norm": 8.58435344696045, + "learning_rate": 9.278271080133341e-06, + "loss": 0.1806, + "step": 7170 + }, + { + "epoch": 0.1814662044183516, + "grad_norm": 5.087329864501953, + "learning_rate": 9.278063259178444e-06, + "loss": 0.1988, + "step": 7171 + }, + { + "epoch": 0.18149150998304528, + "grad_norm": 9.219524383544922, + "learning_rate": 9.27785541063511e-06, + "loss": 0.2995, + "step": 7172 + }, + { + "epoch": 0.18151681554773894, + "grad_norm": 5.329097747802734, + "learning_rate": 9.277647534504684e-06, + "loss": 0.2136, + "step": 7173 + }, + { + "epoch": 0.18154212111243262, + "grad_norm": 4.379832744598389, + "learning_rate": 9.277439630788502e-06, + "loss": 0.1337, + "step": 7174 + }, + { + "epoch": 0.1815674266771263, + "grad_norm": 6.9192328453063965, + "learning_rate": 9.27723169948791e-06, + "loss": 0.211, + "step": 7175 + }, + { + "epoch": 0.18159273224182, + "grad_norm": 11.602486610412598, + "learning_rate": 9.277023740604247e-06, + "loss": 0.2807, + "step": 7176 + }, + { + "epoch": 0.18161803780651364, + "grad_norm": 3.553828001022339, + "learning_rate": 9.276815754138852e-06, + "loss": 0.1702, + "step": 7177 + }, + { + "epoch": 0.18164334337120733, + "grad_norm": 4.485715389251709, + "learning_rate": 9.276607740093069e-06, + "loss": 0.1437, + "step": 7178 + }, + { + "epoch": 0.181668648935901, + "grad_norm": 3.5593724250793457, + "learning_rate": 9.276399698468237e-06, + "loss": 0.125, + "step": 7179 + }, + { + "epoch": 0.18169395450059467, + "grad_norm": 3.3677310943603516, + "learning_rate": 9.276191629265698e-06, + "loss": 0.1646, + "step": 7180 + }, + { + "epoch": 0.18171926006528835, + "grad_norm": 5.265438079833984, + "learning_rate": 9.275983532486797e-06, + "loss": 0.1301, + "step": 7181 + }, + { + "epoch": 0.18174456562998204, + "grad_norm": 5.49753999710083, + "learning_rate": 9.275775408132871e-06, + "loss": 0.1819, + "step": 7182 + }, + { + "epoch": 0.18176987119467572, + "grad_norm": 7.701435565948486, + "learning_rate": 9.275567256205267e-06, + "loss": 0.1372, + "step": 7183 + }, + { + "epoch": 0.18179517675936938, + "grad_norm": 5.898725986480713, + "learning_rate": 9.275359076705324e-06, + "loss": 0.1767, + "step": 7184 + }, + { + "epoch": 0.18182048232406306, + "grad_norm": 16.488101959228516, + "learning_rate": 9.275150869634385e-06, + "loss": 0.3666, + "step": 7185 + }, + { + "epoch": 0.18184578788875674, + "grad_norm": 7.934884071350098, + "learning_rate": 9.274942634993795e-06, + "loss": 0.1684, + "step": 7186 + }, + { + "epoch": 0.18187109345345043, + "grad_norm": 8.56485652923584, + "learning_rate": 9.274734372784893e-06, + "loss": 0.2137, + "step": 7187 + }, + { + "epoch": 0.18189639901814408, + "grad_norm": 5.637012481689453, + "learning_rate": 9.274526083009026e-06, + "loss": 0.2414, + "step": 7188 + }, + { + "epoch": 0.18192170458283777, + "grad_norm": 18.853628158569336, + "learning_rate": 9.274317765667535e-06, + "loss": 0.2546, + "step": 7189 + }, + { + "epoch": 0.18194701014753145, + "grad_norm": 4.765817642211914, + "learning_rate": 9.274109420761763e-06, + "loss": 0.1746, + "step": 7190 + }, + { + "epoch": 0.1819723157122251, + "grad_norm": 4.786428928375244, + "learning_rate": 9.273901048293055e-06, + "loss": 0.1571, + "step": 7191 + }, + { + "epoch": 0.1819976212769188, + "grad_norm": 5.00604248046875, + "learning_rate": 9.273692648262755e-06, + "loss": 0.1641, + "step": 7192 + }, + { + "epoch": 0.18202292684161248, + "grad_norm": 8.526365280151367, + "learning_rate": 9.273484220672205e-06, + "loss": 0.2275, + "step": 7193 + }, + { + "epoch": 0.18204823240630616, + "grad_norm": 13.231293678283691, + "learning_rate": 9.27327576552275e-06, + "loss": 0.2635, + "step": 7194 + }, + { + "epoch": 0.18207353797099982, + "grad_norm": 6.625739097595215, + "learning_rate": 9.273067282815735e-06, + "loss": 0.1724, + "step": 7195 + }, + { + "epoch": 0.1820988435356935, + "grad_norm": 9.547313690185547, + "learning_rate": 9.272858772552504e-06, + "loss": 0.2101, + "step": 7196 + }, + { + "epoch": 0.18212414910038718, + "grad_norm": 10.070318222045898, + "learning_rate": 9.272650234734402e-06, + "loss": 0.1934, + "step": 7197 + }, + { + "epoch": 0.18214945466508084, + "grad_norm": 4.672276496887207, + "learning_rate": 9.272441669362772e-06, + "loss": 0.1342, + "step": 7198 + }, + { + "epoch": 0.18217476022977452, + "grad_norm": 2.723440408706665, + "learning_rate": 9.272233076438961e-06, + "loss": 0.1204, + "step": 7199 + }, + { + "epoch": 0.1822000657944682, + "grad_norm": 5.944004058837891, + "learning_rate": 9.272024455964313e-06, + "loss": 0.2051, + "step": 7200 + }, + { + "epoch": 0.1822253713591619, + "grad_norm": 8.555737495422363, + "learning_rate": 9.271815807940175e-06, + "loss": 0.2557, + "step": 7201 + }, + { + "epoch": 0.18225067692385555, + "grad_norm": 5.92703104019165, + "learning_rate": 9.271607132367892e-06, + "loss": 0.197, + "step": 7202 + }, + { + "epoch": 0.18227598248854923, + "grad_norm": 4.227445602416992, + "learning_rate": 9.271398429248808e-06, + "loss": 0.2188, + "step": 7203 + }, + { + "epoch": 0.18230128805324292, + "grad_norm": 9.4781494140625, + "learning_rate": 9.27118969858427e-06, + "loss": 0.2322, + "step": 7204 + }, + { + "epoch": 0.18232659361793657, + "grad_norm": 11.514862060546875, + "learning_rate": 9.270980940375624e-06, + "loss": 0.3128, + "step": 7205 + }, + { + "epoch": 0.18235189918263026, + "grad_norm": 7.643739223480225, + "learning_rate": 9.270772154624217e-06, + "loss": 0.2123, + "step": 7206 + }, + { + "epoch": 0.18237720474732394, + "grad_norm": 5.047801971435547, + "learning_rate": 9.270563341331395e-06, + "loss": 0.1539, + "step": 7207 + }, + { + "epoch": 0.18240251031201762, + "grad_norm": 7.620893478393555, + "learning_rate": 9.270354500498503e-06, + "loss": 0.1462, + "step": 7208 + }, + { + "epoch": 0.18242781587671128, + "grad_norm": 8.127671241760254, + "learning_rate": 9.270145632126893e-06, + "loss": 0.1155, + "step": 7209 + }, + { + "epoch": 0.18245312144140496, + "grad_norm": 9.023584365844727, + "learning_rate": 9.269936736217905e-06, + "loss": 0.3927, + "step": 7210 + }, + { + "epoch": 0.18247842700609865, + "grad_norm": 4.252960681915283, + "learning_rate": 9.269727812772888e-06, + "loss": 0.1742, + "step": 7211 + }, + { + "epoch": 0.1825037325707923, + "grad_norm": 6.147472381591797, + "learning_rate": 9.269518861793193e-06, + "loss": 0.2201, + "step": 7212 + }, + { + "epoch": 0.182529038135486, + "grad_norm": 9.423853874206543, + "learning_rate": 9.269309883280164e-06, + "loss": 0.1811, + "step": 7213 + }, + { + "epoch": 0.18255434370017967, + "grad_norm": 6.685093879699707, + "learning_rate": 9.269100877235151e-06, + "loss": 0.3282, + "step": 7214 + }, + { + "epoch": 0.18257964926487336, + "grad_norm": 7.408766746520996, + "learning_rate": 9.2688918436595e-06, + "loss": 0.2324, + "step": 7215 + }, + { + "epoch": 0.182604954829567, + "grad_norm": 4.4473700523376465, + "learning_rate": 9.268682782554559e-06, + "loss": 0.163, + "step": 7216 + }, + { + "epoch": 0.1826302603942607, + "grad_norm": 13.827202796936035, + "learning_rate": 9.268473693921676e-06, + "loss": 0.1747, + "step": 7217 + }, + { + "epoch": 0.18265556595895438, + "grad_norm": 3.126946210861206, + "learning_rate": 9.268264577762202e-06, + "loss": 0.1425, + "step": 7218 + }, + { + "epoch": 0.18268087152364806, + "grad_norm": 3.719735860824585, + "learning_rate": 9.268055434077483e-06, + "loss": 0.1915, + "step": 7219 + }, + { + "epoch": 0.18270617708834172, + "grad_norm": 10.031670570373535, + "learning_rate": 9.267846262868868e-06, + "loss": 0.1923, + "step": 7220 + }, + { + "epoch": 0.1827314826530354, + "grad_norm": 7.332655429840088, + "learning_rate": 9.267637064137706e-06, + "loss": 0.219, + "step": 7221 + }, + { + "epoch": 0.1827567882177291, + "grad_norm": 3.2241272926330566, + "learning_rate": 9.267427837885348e-06, + "loss": 0.0873, + "step": 7222 + }, + { + "epoch": 0.18278209378242274, + "grad_norm": 2.70017409324646, + "learning_rate": 9.26721858411314e-06, + "loss": 0.128, + "step": 7223 + }, + { + "epoch": 0.18280739934711643, + "grad_norm": 2.8699798583984375, + "learning_rate": 9.267009302822435e-06, + "loss": 0.1199, + "step": 7224 + }, + { + "epoch": 0.1828327049118101, + "grad_norm": 4.392472267150879, + "learning_rate": 9.266799994014578e-06, + "loss": 0.1601, + "step": 7225 + }, + { + "epoch": 0.1828580104765038, + "grad_norm": 16.901466369628906, + "learning_rate": 9.266590657690924e-06, + "loss": 0.2877, + "step": 7226 + }, + { + "epoch": 0.18288331604119745, + "grad_norm": 14.454489707946777, + "learning_rate": 9.266381293852819e-06, + "loss": 0.2933, + "step": 7227 + }, + { + "epoch": 0.18290862160589114, + "grad_norm": 6.253414154052734, + "learning_rate": 9.266171902501615e-06, + "loss": 0.1218, + "step": 7228 + }, + { + "epoch": 0.18293392717058482, + "grad_norm": 3.2615630626678467, + "learning_rate": 9.265962483638662e-06, + "loss": 0.1395, + "step": 7229 + }, + { + "epoch": 0.18295923273527848, + "grad_norm": 4.5597243309021, + "learning_rate": 9.265753037265311e-06, + "loss": 0.1958, + "step": 7230 + }, + { + "epoch": 0.18298453829997216, + "grad_norm": 5.85781717300415, + "learning_rate": 9.265543563382912e-06, + "loss": 0.2236, + "step": 7231 + }, + { + "epoch": 0.18300984386466584, + "grad_norm": 10.199003219604492, + "learning_rate": 9.265334061992814e-06, + "loss": 0.2709, + "step": 7232 + }, + { + "epoch": 0.18303514942935953, + "grad_norm": 7.930936336517334, + "learning_rate": 9.265124533096372e-06, + "loss": 0.2809, + "step": 7233 + }, + { + "epoch": 0.18306045499405318, + "grad_norm": 6.474140644073486, + "learning_rate": 9.264914976694936e-06, + "loss": 0.2001, + "step": 7234 + }, + { + "epoch": 0.18308576055874687, + "grad_norm": 4.912583827972412, + "learning_rate": 9.264705392789857e-06, + "loss": 0.1451, + "step": 7235 + }, + { + "epoch": 0.18311106612344055, + "grad_norm": 7.037714958190918, + "learning_rate": 9.264495781382483e-06, + "loss": 0.2164, + "step": 7236 + }, + { + "epoch": 0.1831363716881342, + "grad_norm": 6.523293495178223, + "learning_rate": 9.264286142474171e-06, + "loss": 0.2035, + "step": 7237 + }, + { + "epoch": 0.1831616772528279, + "grad_norm": 39.896793365478516, + "learning_rate": 9.26407647606627e-06, + "loss": 0.2255, + "step": 7238 + }, + { + "epoch": 0.18318698281752158, + "grad_norm": 5.538957595825195, + "learning_rate": 9.263866782160135e-06, + "loss": 0.1826, + "step": 7239 + }, + { + "epoch": 0.18321228838221526, + "grad_norm": 7.459085464477539, + "learning_rate": 9.263657060757116e-06, + "loss": 0.2858, + "step": 7240 + }, + { + "epoch": 0.18323759394690892, + "grad_norm": 4.123482704162598, + "learning_rate": 9.263447311858565e-06, + "loss": 0.1996, + "step": 7241 + }, + { + "epoch": 0.1832628995116026, + "grad_norm": 8.319372177124023, + "learning_rate": 9.263237535465834e-06, + "loss": 0.21, + "step": 7242 + }, + { + "epoch": 0.18328820507629628, + "grad_norm": 13.05124282836914, + "learning_rate": 9.26302773158028e-06, + "loss": 0.3861, + "step": 7243 + }, + { + "epoch": 0.18331351064098994, + "grad_norm": 7.617503643035889, + "learning_rate": 9.262817900203249e-06, + "loss": 0.2737, + "step": 7244 + }, + { + "epoch": 0.18333881620568362, + "grad_norm": 8.90355110168457, + "learning_rate": 9.262608041336101e-06, + "loss": 0.2631, + "step": 7245 + }, + { + "epoch": 0.1833641217703773, + "grad_norm": 7.29623556137085, + "learning_rate": 9.262398154980186e-06, + "loss": 0.1419, + "step": 7246 + }, + { + "epoch": 0.183389427335071, + "grad_norm": 6.939629077911377, + "learning_rate": 9.26218824113686e-06, + "loss": 0.174, + "step": 7247 + }, + { + "epoch": 0.18341473289976465, + "grad_norm": 13.827350616455078, + "learning_rate": 9.261978299807473e-06, + "loss": 0.294, + "step": 7248 + }, + { + "epoch": 0.18344003846445833, + "grad_norm": 5.1271281242370605, + "learning_rate": 9.261768330993382e-06, + "loss": 0.1884, + "step": 7249 + }, + { + "epoch": 0.18346534402915202, + "grad_norm": 8.646145820617676, + "learning_rate": 9.261558334695938e-06, + "loss": 0.3331, + "step": 7250 + }, + { + "epoch": 0.1834906495938457, + "grad_norm": 5.053031921386719, + "learning_rate": 9.261348310916498e-06, + "loss": 0.2323, + "step": 7251 + }, + { + "epoch": 0.18351595515853936, + "grad_norm": 13.712712287902832, + "learning_rate": 9.261138259656414e-06, + "loss": 0.2377, + "step": 7252 + }, + { + "epoch": 0.18354126072323304, + "grad_norm": 6.400597095489502, + "learning_rate": 9.260928180917043e-06, + "loss": 0.2401, + "step": 7253 + }, + { + "epoch": 0.18356656628792672, + "grad_norm": 3.567808151245117, + "learning_rate": 9.260718074699741e-06, + "loss": 0.1619, + "step": 7254 + }, + { + "epoch": 0.18359187185262038, + "grad_norm": 3.4426674842834473, + "learning_rate": 9.260507941005859e-06, + "loss": 0.1138, + "step": 7255 + }, + { + "epoch": 0.18361717741731406, + "grad_norm": 10.958163261413574, + "learning_rate": 9.260297779836753e-06, + "loss": 0.177, + "step": 7256 + }, + { + "epoch": 0.18364248298200775, + "grad_norm": 4.365667819976807, + "learning_rate": 9.26008759119378e-06, + "loss": 0.1578, + "step": 7257 + }, + { + "epoch": 0.18366778854670143, + "grad_norm": 5.48730993270874, + "learning_rate": 9.259877375078293e-06, + "loss": 0.2242, + "step": 7258 + }, + { + "epoch": 0.1836930941113951, + "grad_norm": 7.042131423950195, + "learning_rate": 9.25966713149165e-06, + "loss": 0.2354, + "step": 7259 + }, + { + "epoch": 0.18371839967608877, + "grad_norm": 4.17920446395874, + "learning_rate": 9.259456860435207e-06, + "loss": 0.2179, + "step": 7260 + }, + { + "epoch": 0.18374370524078246, + "grad_norm": 2.804896831512451, + "learning_rate": 9.259246561910319e-06, + "loss": 0.135, + "step": 7261 + }, + { + "epoch": 0.1837690108054761, + "grad_norm": 2.5561234951019287, + "learning_rate": 9.259036235918341e-06, + "loss": 0.1458, + "step": 7262 + }, + { + "epoch": 0.1837943163701698, + "grad_norm": 8.182222366333008, + "learning_rate": 9.25882588246063e-06, + "loss": 0.2966, + "step": 7263 + }, + { + "epoch": 0.18381962193486348, + "grad_norm": 3.832768678665161, + "learning_rate": 9.258615501538546e-06, + "loss": 0.1446, + "step": 7264 + }, + { + "epoch": 0.18384492749955716, + "grad_norm": 12.794037818908691, + "learning_rate": 9.258405093153442e-06, + "loss": 0.1773, + "step": 7265 + }, + { + "epoch": 0.18387023306425082, + "grad_norm": 6.368136882781982, + "learning_rate": 9.258194657306673e-06, + "loss": 0.2269, + "step": 7266 + }, + { + "epoch": 0.1838955386289445, + "grad_norm": 8.096918106079102, + "learning_rate": 9.257984193999601e-06, + "loss": 0.2133, + "step": 7267 + }, + { + "epoch": 0.1839208441936382, + "grad_norm": 4.3367085456848145, + "learning_rate": 9.257773703233581e-06, + "loss": 0.2005, + "step": 7268 + }, + { + "epoch": 0.18394614975833185, + "grad_norm": 4.014474868774414, + "learning_rate": 9.25756318500997e-06, + "loss": 0.1439, + "step": 7269 + }, + { + "epoch": 0.18397145532302553, + "grad_norm": 6.217222213745117, + "learning_rate": 9.257352639330125e-06, + "loss": 0.2183, + "step": 7270 + }, + { + "epoch": 0.1839967608877192, + "grad_norm": 3.712836503982544, + "learning_rate": 9.257142066195406e-06, + "loss": 0.1613, + "step": 7271 + }, + { + "epoch": 0.1840220664524129, + "grad_norm": 5.900671005249023, + "learning_rate": 9.256931465607171e-06, + "loss": 0.2103, + "step": 7272 + }, + { + "epoch": 0.18404737201710655, + "grad_norm": 3.3158347606658936, + "learning_rate": 9.256720837566773e-06, + "loss": 0.169, + "step": 7273 + }, + { + "epoch": 0.18407267758180024, + "grad_norm": 8.660181045532227, + "learning_rate": 9.256510182075577e-06, + "loss": 0.2478, + "step": 7274 + }, + { + "epoch": 0.18409798314649392, + "grad_norm": 4.832549571990967, + "learning_rate": 9.256299499134939e-06, + "loss": 0.179, + "step": 7275 + }, + { + "epoch": 0.18412328871118758, + "grad_norm": 6.159467697143555, + "learning_rate": 9.256088788746216e-06, + "loss": 0.1946, + "step": 7276 + }, + { + "epoch": 0.18414859427588126, + "grad_norm": 5.259505271911621, + "learning_rate": 9.255878050910767e-06, + "loss": 0.209, + "step": 7277 + }, + { + "epoch": 0.18417389984057495, + "grad_norm": 3.7866051197052, + "learning_rate": 9.255667285629955e-06, + "loss": 0.1478, + "step": 7278 + }, + { + "epoch": 0.18419920540526863, + "grad_norm": 5.8361029624938965, + "learning_rate": 9.255456492905134e-06, + "loss": 0.1382, + "step": 7279 + }, + { + "epoch": 0.18422451096996229, + "grad_norm": 4.1406636238098145, + "learning_rate": 9.255245672737665e-06, + "loss": 0.1583, + "step": 7280 + }, + { + "epoch": 0.18424981653465597, + "grad_norm": 4.7673821449279785, + "learning_rate": 9.25503482512891e-06, + "loss": 0.1334, + "step": 7281 + }, + { + "epoch": 0.18427512209934965, + "grad_norm": 4.038175106048584, + "learning_rate": 9.254823950080227e-06, + "loss": 0.1819, + "step": 7282 + }, + { + "epoch": 0.18430042766404334, + "grad_norm": 7.855571269989014, + "learning_rate": 9.254613047592975e-06, + "loss": 0.2509, + "step": 7283 + }, + { + "epoch": 0.184325733228737, + "grad_norm": 7.604547023773193, + "learning_rate": 9.254402117668515e-06, + "loss": 0.2499, + "step": 7284 + }, + { + "epoch": 0.18435103879343068, + "grad_norm": 9.899079322814941, + "learning_rate": 9.254191160308208e-06, + "loss": 0.2169, + "step": 7285 + }, + { + "epoch": 0.18437634435812436, + "grad_norm": 6.231081485748291, + "learning_rate": 9.253980175513413e-06, + "loss": 0.2495, + "step": 7286 + }, + { + "epoch": 0.18440164992281802, + "grad_norm": 7.1001691818237305, + "learning_rate": 9.25376916328549e-06, + "loss": 0.2958, + "step": 7287 + }, + { + "epoch": 0.1844269554875117, + "grad_norm": 9.073775291442871, + "learning_rate": 9.253558123625803e-06, + "loss": 0.2164, + "step": 7288 + }, + { + "epoch": 0.18445226105220539, + "grad_norm": 8.465052604675293, + "learning_rate": 9.253347056535709e-06, + "loss": 0.3633, + "step": 7289 + }, + { + "epoch": 0.18447756661689907, + "grad_norm": 4.776576995849609, + "learning_rate": 9.253135962016572e-06, + "loss": 0.2158, + "step": 7290 + }, + { + "epoch": 0.18450287218159273, + "grad_norm": 4.62740421295166, + "learning_rate": 9.252924840069752e-06, + "loss": 0.1763, + "step": 7291 + }, + { + "epoch": 0.1845281777462864, + "grad_norm": 9.409817695617676, + "learning_rate": 9.25271369069661e-06, + "loss": 0.4134, + "step": 7292 + }, + { + "epoch": 0.1845534833109801, + "grad_norm": 10.120574951171875, + "learning_rate": 9.25250251389851e-06, + "loss": 0.2908, + "step": 7293 + }, + { + "epoch": 0.18457878887567375, + "grad_norm": 9.359343528747559, + "learning_rate": 9.252291309676812e-06, + "loss": 0.3106, + "step": 7294 + }, + { + "epoch": 0.18460409444036743, + "grad_norm": 4.565522193908691, + "learning_rate": 9.252080078032877e-06, + "loss": 0.1959, + "step": 7295 + }, + { + "epoch": 0.18462940000506112, + "grad_norm": 3.1559808254241943, + "learning_rate": 9.25186881896807e-06, + "loss": 0.1553, + "step": 7296 + }, + { + "epoch": 0.1846547055697548, + "grad_norm": 4.49196195602417, + "learning_rate": 9.251657532483751e-06, + "loss": 0.1765, + "step": 7297 + }, + { + "epoch": 0.18468001113444846, + "grad_norm": 8.848149299621582, + "learning_rate": 9.251446218581285e-06, + "loss": 0.2878, + "step": 7298 + }, + { + "epoch": 0.18470531669914214, + "grad_norm": 3.750244617462158, + "learning_rate": 9.251234877262032e-06, + "loss": 0.1887, + "step": 7299 + }, + { + "epoch": 0.18473062226383583, + "grad_norm": 4.192558288574219, + "learning_rate": 9.251023508527356e-06, + "loss": 0.1637, + "step": 7300 + }, + { + "epoch": 0.18475592782852948, + "grad_norm": 3.56480073928833, + "learning_rate": 9.25081211237862e-06, + "loss": 0.0508, + "step": 7301 + }, + { + "epoch": 0.18478123339322317, + "grad_norm": 5.954595565795898, + "learning_rate": 9.250600688817189e-06, + "loss": 0.1971, + "step": 7302 + }, + { + "epoch": 0.18480653895791685, + "grad_norm": 6.024895668029785, + "learning_rate": 9.250389237844421e-06, + "loss": 0.3475, + "step": 7303 + }, + { + "epoch": 0.18483184452261053, + "grad_norm": 3.008204936981201, + "learning_rate": 9.250177759461687e-06, + "loss": 0.2016, + "step": 7304 + }, + { + "epoch": 0.1848571500873042, + "grad_norm": 4.821243762969971, + "learning_rate": 9.249966253670347e-06, + "loss": 0.2766, + "step": 7305 + }, + { + "epoch": 0.18488245565199787, + "grad_norm": 5.64265775680542, + "learning_rate": 9.249754720471764e-06, + "loss": 0.1593, + "step": 7306 + }, + { + "epoch": 0.18490776121669156, + "grad_norm": 3.616736650466919, + "learning_rate": 9.249543159867302e-06, + "loss": 0.1855, + "step": 7307 + }, + { + "epoch": 0.1849330667813852, + "grad_norm": 4.364917755126953, + "learning_rate": 9.249331571858329e-06, + "loss": 0.1297, + "step": 7308 + }, + { + "epoch": 0.1849583723460789, + "grad_norm": 5.436645984649658, + "learning_rate": 9.249119956446207e-06, + "loss": 0.2399, + "step": 7309 + }, + { + "epoch": 0.18498367791077258, + "grad_norm": 9.27431869506836, + "learning_rate": 9.2489083136323e-06, + "loss": 0.2134, + "step": 7310 + }, + { + "epoch": 0.18500898347546627, + "grad_norm": 7.583409309387207, + "learning_rate": 9.248696643417974e-06, + "loss": 0.2073, + "step": 7311 + }, + { + "epoch": 0.18503428904015992, + "grad_norm": 7.6728105545043945, + "learning_rate": 9.248484945804593e-06, + "loss": 0.2332, + "step": 7312 + }, + { + "epoch": 0.1850595946048536, + "grad_norm": 3.9738495349884033, + "learning_rate": 9.248273220793524e-06, + "loss": 0.224, + "step": 7313 + }, + { + "epoch": 0.1850849001695473, + "grad_norm": 8.595680236816406, + "learning_rate": 9.24806146838613e-06, + "loss": 0.2105, + "step": 7314 + }, + { + "epoch": 0.18511020573424097, + "grad_norm": 7.648444652557373, + "learning_rate": 9.247849688583778e-06, + "loss": 0.2204, + "step": 7315 + }, + { + "epoch": 0.18513551129893463, + "grad_norm": 5.473320007324219, + "learning_rate": 9.247637881387835e-06, + "loss": 0.2467, + "step": 7316 + }, + { + "epoch": 0.1851608168636283, + "grad_norm": 5.293506145477295, + "learning_rate": 9.247426046799663e-06, + "loss": 0.1589, + "step": 7317 + }, + { + "epoch": 0.185186122428322, + "grad_norm": 3.718804121017456, + "learning_rate": 9.247214184820633e-06, + "loss": 0.1274, + "step": 7318 + }, + { + "epoch": 0.18521142799301565, + "grad_norm": 4.956206321716309, + "learning_rate": 9.247002295452107e-06, + "loss": 0.2049, + "step": 7319 + }, + { + "epoch": 0.18523673355770934, + "grad_norm": 2.904557466506958, + "learning_rate": 9.246790378695454e-06, + "loss": 0.1669, + "step": 7320 + }, + { + "epoch": 0.18526203912240302, + "grad_norm": 5.720641613006592, + "learning_rate": 9.246578434552041e-06, + "loss": 0.1472, + "step": 7321 + }, + { + "epoch": 0.1852873446870967, + "grad_norm": 8.20901107788086, + "learning_rate": 9.24636646302323e-06, + "loss": 0.2736, + "step": 7322 + }, + { + "epoch": 0.18531265025179036, + "grad_norm": 3.5680923461914062, + "learning_rate": 9.246154464110394e-06, + "loss": 0.2001, + "step": 7323 + }, + { + "epoch": 0.18533795581648405, + "grad_norm": 7.819835186004639, + "learning_rate": 9.245942437814899e-06, + "loss": 0.3305, + "step": 7324 + }, + { + "epoch": 0.18536326138117773, + "grad_norm": 5.4471516609191895, + "learning_rate": 9.245730384138109e-06, + "loss": 0.1722, + "step": 7325 + }, + { + "epoch": 0.18538856694587139, + "grad_norm": 3.3837602138519287, + "learning_rate": 9.245518303081393e-06, + "loss": 0.1632, + "step": 7326 + }, + { + "epoch": 0.18541387251056507, + "grad_norm": 6.883366584777832, + "learning_rate": 9.245306194646122e-06, + "loss": 0.1611, + "step": 7327 + }, + { + "epoch": 0.18543917807525875, + "grad_norm": 4.391929626464844, + "learning_rate": 9.24509405883366e-06, + "loss": 0.1536, + "step": 7328 + }, + { + "epoch": 0.18546448363995244, + "grad_norm": 8.352392196655273, + "learning_rate": 9.244881895645374e-06, + "loss": 0.1459, + "step": 7329 + }, + { + "epoch": 0.1854897892046461, + "grad_norm": 4.869713306427002, + "learning_rate": 9.244669705082636e-06, + "loss": 0.1958, + "step": 7330 + }, + { + "epoch": 0.18551509476933978, + "grad_norm": 11.22594928741455, + "learning_rate": 9.244457487146812e-06, + "loss": 0.2351, + "step": 7331 + }, + { + "epoch": 0.18554040033403346, + "grad_norm": 8.223175048828125, + "learning_rate": 9.244245241839272e-06, + "loss": 0.2917, + "step": 7332 + }, + { + "epoch": 0.18556570589872712, + "grad_norm": 6.165021896362305, + "learning_rate": 9.244032969161382e-06, + "loss": 0.1422, + "step": 7333 + }, + { + "epoch": 0.1855910114634208, + "grad_norm": 6.332920551300049, + "learning_rate": 9.243820669114514e-06, + "loss": 0.1267, + "step": 7334 + }, + { + "epoch": 0.18561631702811449, + "grad_norm": 4.59925651550293, + "learning_rate": 9.243608341700036e-06, + "loss": 0.1836, + "step": 7335 + }, + { + "epoch": 0.18564162259280817, + "grad_norm": 3.656554698944092, + "learning_rate": 9.243395986919319e-06, + "loss": 0.1686, + "step": 7336 + }, + { + "epoch": 0.18566692815750183, + "grad_norm": 2.989265203475952, + "learning_rate": 9.243183604773728e-06, + "loss": 0.1492, + "step": 7337 + }, + { + "epoch": 0.1856922337221955, + "grad_norm": 16.71387481689453, + "learning_rate": 9.242971195264637e-06, + "loss": 0.3167, + "step": 7338 + }, + { + "epoch": 0.1857175392868892, + "grad_norm": 3.881087064743042, + "learning_rate": 9.242758758393413e-06, + "loss": 0.1131, + "step": 7339 + }, + { + "epoch": 0.18574284485158285, + "grad_norm": 26.95444679260254, + "learning_rate": 9.242546294161426e-06, + "loss": 0.2485, + "step": 7340 + }, + { + "epoch": 0.18576815041627653, + "grad_norm": 14.40998649597168, + "learning_rate": 9.24233380257005e-06, + "loss": 0.2329, + "step": 7341 + }, + { + "epoch": 0.18579345598097022, + "grad_norm": 6.369855880737305, + "learning_rate": 9.24212128362065e-06, + "loss": 0.1986, + "step": 7342 + }, + { + "epoch": 0.1858187615456639, + "grad_norm": 11.065469741821289, + "learning_rate": 9.2419087373146e-06, + "loss": 0.1751, + "step": 7343 + }, + { + "epoch": 0.18584406711035756, + "grad_norm": 4.11049222946167, + "learning_rate": 9.241696163653268e-06, + "loss": 0.1337, + "step": 7344 + }, + { + "epoch": 0.18586937267505124, + "grad_norm": 4.113131046295166, + "learning_rate": 9.241483562638027e-06, + "loss": 0.1321, + "step": 7345 + }, + { + "epoch": 0.18589467823974493, + "grad_norm": 8.506893157958984, + "learning_rate": 9.24127093427025e-06, + "loss": 0.2202, + "step": 7346 + }, + { + "epoch": 0.18591998380443858, + "grad_norm": 5.285325050354004, + "learning_rate": 9.241058278551303e-06, + "loss": 0.1968, + "step": 7347 + }, + { + "epoch": 0.18594528936913227, + "grad_norm": 6.591526508331299, + "learning_rate": 9.240845595482562e-06, + "loss": 0.2191, + "step": 7348 + }, + { + "epoch": 0.18597059493382595, + "grad_norm": 6.5262346267700195, + "learning_rate": 9.240632885065394e-06, + "loss": 0.2144, + "step": 7349 + }, + { + "epoch": 0.18599590049851963, + "grad_norm": 8.736224174499512, + "learning_rate": 9.240420147301176e-06, + "loss": 0.1444, + "step": 7350 + }, + { + "epoch": 0.1860212060632133, + "grad_norm": 10.198739051818848, + "learning_rate": 9.240207382191275e-06, + "loss": 0.2125, + "step": 7351 + }, + { + "epoch": 0.18604651162790697, + "grad_norm": 6.093356132507324, + "learning_rate": 9.239994589737067e-06, + "loss": 0.1294, + "step": 7352 + }, + { + "epoch": 0.18607181719260066, + "grad_norm": 5.280349254608154, + "learning_rate": 9.239781769939922e-06, + "loss": 0.1375, + "step": 7353 + }, + { + "epoch": 0.18609712275729434, + "grad_norm": 4.0743207931518555, + "learning_rate": 9.239568922801213e-06, + "loss": 0.1487, + "step": 7354 + }, + { + "epoch": 0.186122428321988, + "grad_norm": 25.583595275878906, + "learning_rate": 9.239356048322313e-06, + "loss": 0.2569, + "step": 7355 + }, + { + "epoch": 0.18614773388668168, + "grad_norm": 7.146260738372803, + "learning_rate": 9.239143146504594e-06, + "loss": 0.2462, + "step": 7356 + }, + { + "epoch": 0.18617303945137537, + "grad_norm": 11.075724601745605, + "learning_rate": 9.23893021734943e-06, + "loss": 0.1935, + "step": 7357 + }, + { + "epoch": 0.18619834501606902, + "grad_norm": 7.088604927062988, + "learning_rate": 9.238717260858192e-06, + "loss": 0.204, + "step": 7358 + }, + { + "epoch": 0.1862236505807627, + "grad_norm": 8.404023170471191, + "learning_rate": 9.238504277032257e-06, + "loss": 0.2547, + "step": 7359 + }, + { + "epoch": 0.1862489561454564, + "grad_norm": 4.524331569671631, + "learning_rate": 9.238291265872995e-06, + "loss": 0.169, + "step": 7360 + }, + { + "epoch": 0.18627426171015007, + "grad_norm": 2.308520793914795, + "learning_rate": 9.238078227381782e-06, + "loss": 0.1816, + "step": 7361 + }, + { + "epoch": 0.18629956727484373, + "grad_norm": 3.882309913635254, + "learning_rate": 9.237865161559989e-06, + "loss": 0.1429, + "step": 7362 + }, + { + "epoch": 0.18632487283953741, + "grad_norm": 4.762967586517334, + "learning_rate": 9.237652068408993e-06, + "loss": 0.1428, + "step": 7363 + }, + { + "epoch": 0.1863501784042311, + "grad_norm": 3.425232172012329, + "learning_rate": 9.237438947930166e-06, + "loss": 0.1288, + "step": 7364 + }, + { + "epoch": 0.18637548396892475, + "grad_norm": 4.1165008544921875, + "learning_rate": 9.237225800124885e-06, + "loss": 0.1286, + "step": 7365 + }, + { + "epoch": 0.18640078953361844, + "grad_norm": 26.018321990966797, + "learning_rate": 9.237012624994523e-06, + "loss": 0.235, + "step": 7366 + }, + { + "epoch": 0.18642609509831212, + "grad_norm": 9.731244087219238, + "learning_rate": 9.236799422540454e-06, + "loss": 0.2841, + "step": 7367 + }, + { + "epoch": 0.1864514006630058, + "grad_norm": 4.3638458251953125, + "learning_rate": 9.236586192764054e-06, + "loss": 0.1044, + "step": 7368 + }, + { + "epoch": 0.18647670622769946, + "grad_norm": 8.326388359069824, + "learning_rate": 9.236372935666697e-06, + "loss": 0.2674, + "step": 7369 + }, + { + "epoch": 0.18650201179239315, + "grad_norm": 8.759661674499512, + "learning_rate": 9.23615965124976e-06, + "loss": 0.3311, + "step": 7370 + }, + { + "epoch": 0.18652731735708683, + "grad_norm": 5.814236164093018, + "learning_rate": 9.235946339514617e-06, + "loss": 0.2365, + "step": 7371 + }, + { + "epoch": 0.1865526229217805, + "grad_norm": 4.914188385009766, + "learning_rate": 9.235733000462643e-06, + "loss": 0.2221, + "step": 7372 + }, + { + "epoch": 0.18657792848647417, + "grad_norm": 27.261964797973633, + "learning_rate": 9.235519634095216e-06, + "loss": 0.1997, + "step": 7373 + }, + { + "epoch": 0.18660323405116785, + "grad_norm": 6.883638381958008, + "learning_rate": 9.235306240413711e-06, + "loss": 0.1282, + "step": 7374 + }, + { + "epoch": 0.18662853961586154, + "grad_norm": 5.3345232009887695, + "learning_rate": 9.235092819419503e-06, + "loss": 0.215, + "step": 7375 + }, + { + "epoch": 0.1866538451805552, + "grad_norm": 6.1867756843566895, + "learning_rate": 9.23487937111397e-06, + "loss": 0.1934, + "step": 7376 + }, + { + "epoch": 0.18667915074524888, + "grad_norm": 9.87980842590332, + "learning_rate": 9.234665895498488e-06, + "loss": 0.3156, + "step": 7377 + }, + { + "epoch": 0.18670445630994256, + "grad_norm": 4.51318359375, + "learning_rate": 9.234452392574431e-06, + "loss": 0.1979, + "step": 7378 + }, + { + "epoch": 0.18672976187463622, + "grad_norm": 6.875176906585693, + "learning_rate": 9.23423886234318e-06, + "loss": 0.2052, + "step": 7379 + }, + { + "epoch": 0.1867550674393299, + "grad_norm": 4.218818664550781, + "learning_rate": 9.23402530480611e-06, + "loss": 0.1863, + "step": 7380 + }, + { + "epoch": 0.1867803730040236, + "grad_norm": 4.6474738121032715, + "learning_rate": 9.233811719964598e-06, + "loss": 0.1269, + "step": 7381 + }, + { + "epoch": 0.18680567856871727, + "grad_norm": 2.948190450668335, + "learning_rate": 9.233598107820024e-06, + "loss": 0.1591, + "step": 7382 + }, + { + "epoch": 0.18683098413341093, + "grad_norm": 4.905131816864014, + "learning_rate": 9.233384468373761e-06, + "loss": 0.1727, + "step": 7383 + }, + { + "epoch": 0.1868562896981046, + "grad_norm": 3.814654588699341, + "learning_rate": 9.23317080162719e-06, + "loss": 0.108, + "step": 7384 + }, + { + "epoch": 0.1868815952627983, + "grad_norm": 5.9958696365356445, + "learning_rate": 9.232957107581685e-06, + "loss": 0.1902, + "step": 7385 + }, + { + "epoch": 0.18690690082749198, + "grad_norm": 7.062072277069092, + "learning_rate": 9.23274338623863e-06, + "loss": 0.2934, + "step": 7386 + }, + { + "epoch": 0.18693220639218563, + "grad_norm": 16.81883430480957, + "learning_rate": 9.2325296375994e-06, + "loss": 0.3975, + "step": 7387 + }, + { + "epoch": 0.18695751195687932, + "grad_norm": 10.237370491027832, + "learning_rate": 9.232315861665373e-06, + "loss": 0.347, + "step": 7388 + }, + { + "epoch": 0.186982817521573, + "grad_norm": 8.046952247619629, + "learning_rate": 9.232102058437928e-06, + "loss": 0.3049, + "step": 7389 + }, + { + "epoch": 0.18700812308626666, + "grad_norm": 6.523675918579102, + "learning_rate": 9.231888227918446e-06, + "loss": 0.2503, + "step": 7390 + }, + { + "epoch": 0.18703342865096034, + "grad_norm": 7.723688125610352, + "learning_rate": 9.231674370108303e-06, + "loss": 0.2673, + "step": 7391 + }, + { + "epoch": 0.18705873421565403, + "grad_norm": 6.643231391906738, + "learning_rate": 9.231460485008879e-06, + "loss": 0.2188, + "step": 7392 + }, + { + "epoch": 0.1870840397803477, + "grad_norm": 5.340329647064209, + "learning_rate": 9.231246572621553e-06, + "loss": 0.1972, + "step": 7393 + }, + { + "epoch": 0.18710934534504137, + "grad_norm": 3.9168472290039062, + "learning_rate": 9.231032632947705e-06, + "loss": 0.1964, + "step": 7394 + }, + { + "epoch": 0.18713465090973505, + "grad_norm": 18.35198402404785, + "learning_rate": 9.230818665988714e-06, + "loss": 0.2347, + "step": 7395 + }, + { + "epoch": 0.18715995647442873, + "grad_norm": 4.499203205108643, + "learning_rate": 9.230604671745962e-06, + "loss": 0.1614, + "step": 7396 + }, + { + "epoch": 0.1871852620391224, + "grad_norm": 5.594235897064209, + "learning_rate": 9.230390650220827e-06, + "loss": 0.1822, + "step": 7397 + }, + { + "epoch": 0.18721056760381607, + "grad_norm": 4.088644504547119, + "learning_rate": 9.230176601414688e-06, + "loss": 0.2303, + "step": 7398 + }, + { + "epoch": 0.18723587316850976, + "grad_norm": 3.692673444747925, + "learning_rate": 9.229962525328928e-06, + "loss": 0.1701, + "step": 7399 + }, + { + "epoch": 0.18726117873320344, + "grad_norm": 5.825982093811035, + "learning_rate": 9.229748421964928e-06, + "loss": 0.2373, + "step": 7400 + }, + { + "epoch": 0.1872864842978971, + "grad_norm": 5.053061485290527, + "learning_rate": 9.229534291324065e-06, + "loss": 0.1976, + "step": 7401 + }, + { + "epoch": 0.18731178986259078, + "grad_norm": 7.5349297523498535, + "learning_rate": 9.229320133407723e-06, + "loss": 0.2819, + "step": 7402 + }, + { + "epoch": 0.18733709542728447, + "grad_norm": 4.321945667266846, + "learning_rate": 9.229105948217284e-06, + "loss": 0.1271, + "step": 7403 + }, + { + "epoch": 0.18736240099197812, + "grad_norm": 4.047527313232422, + "learning_rate": 9.228891735754124e-06, + "loss": 0.1821, + "step": 7404 + }, + { + "epoch": 0.1873877065566718, + "grad_norm": 6.734079837799072, + "learning_rate": 9.228677496019628e-06, + "loss": 0.2388, + "step": 7405 + }, + { + "epoch": 0.1874130121213655, + "grad_norm": 4.628611087799072, + "learning_rate": 9.228463229015182e-06, + "loss": 0.194, + "step": 7406 + }, + { + "epoch": 0.18743831768605917, + "grad_norm": 6.877040863037109, + "learning_rate": 9.228248934742159e-06, + "loss": 0.1523, + "step": 7407 + }, + { + "epoch": 0.18746362325075283, + "grad_norm": 7.452498435974121, + "learning_rate": 9.228034613201947e-06, + "loss": 0.2522, + "step": 7408 + }, + { + "epoch": 0.18748892881544652, + "grad_norm": 4.6081085205078125, + "learning_rate": 9.227820264395923e-06, + "loss": 0.2096, + "step": 7409 + }, + { + "epoch": 0.1875142343801402, + "grad_norm": 3.3972232341766357, + "learning_rate": 9.227605888325476e-06, + "loss": 0.1658, + "step": 7410 + }, + { + "epoch": 0.18753953994483386, + "grad_norm": 3.1321537494659424, + "learning_rate": 9.227391484991985e-06, + "loss": 0.1321, + "step": 7411 + }, + { + "epoch": 0.18756484550952754, + "grad_norm": 3.716325044631958, + "learning_rate": 9.227177054396831e-06, + "loss": 0.1161, + "step": 7412 + }, + { + "epoch": 0.18759015107422122, + "grad_norm": 2.501553773880005, + "learning_rate": 9.226962596541399e-06, + "loss": 0.1734, + "step": 7413 + }, + { + "epoch": 0.1876154566389149, + "grad_norm": 5.501908302307129, + "learning_rate": 9.226748111427071e-06, + "loss": 0.2061, + "step": 7414 + }, + { + "epoch": 0.18764076220360856, + "grad_norm": 9.834477424621582, + "learning_rate": 9.226533599055232e-06, + "loss": 0.2808, + "step": 7415 + }, + { + "epoch": 0.18766606776830225, + "grad_norm": 5.001461029052734, + "learning_rate": 9.226319059427263e-06, + "loss": 0.1778, + "step": 7416 + }, + { + "epoch": 0.18769137333299593, + "grad_norm": 6.792590141296387, + "learning_rate": 9.22610449254455e-06, + "loss": 0.1551, + "step": 7417 + }, + { + "epoch": 0.18771667889768962, + "grad_norm": 9.189826965332031, + "learning_rate": 9.225889898408473e-06, + "loss": 0.2011, + "step": 7418 + }, + { + "epoch": 0.18774198446238327, + "grad_norm": 2.939786672592163, + "learning_rate": 9.22567527702042e-06, + "loss": 0.1609, + "step": 7419 + }, + { + "epoch": 0.18776729002707696, + "grad_norm": 6.597641468048096, + "learning_rate": 9.225460628381772e-06, + "loss": 0.2155, + "step": 7420 + }, + { + "epoch": 0.18779259559177064, + "grad_norm": 4.985378265380859, + "learning_rate": 9.225245952493915e-06, + "loss": 0.1895, + "step": 7421 + }, + { + "epoch": 0.1878179011564643, + "grad_norm": 7.067831993103027, + "learning_rate": 9.225031249358233e-06, + "loss": 0.1121, + "step": 7422 + }, + { + "epoch": 0.18784320672115798, + "grad_norm": 11.464816093444824, + "learning_rate": 9.22481651897611e-06, + "loss": 0.2785, + "step": 7423 + }, + { + "epoch": 0.18786851228585166, + "grad_norm": 3.482712507247925, + "learning_rate": 9.22460176134893e-06, + "loss": 0.0917, + "step": 7424 + }, + { + "epoch": 0.18789381785054535, + "grad_norm": 11.552490234375, + "learning_rate": 9.224386976478082e-06, + "loss": 0.2661, + "step": 7425 + }, + { + "epoch": 0.187919123415239, + "grad_norm": 7.073760986328125, + "learning_rate": 9.224172164364947e-06, + "loss": 0.2141, + "step": 7426 + }, + { + "epoch": 0.1879444289799327, + "grad_norm": 4.945969104766846, + "learning_rate": 9.22395732501091e-06, + "loss": 0.2093, + "step": 7427 + }, + { + "epoch": 0.18796973454462637, + "grad_norm": 5.695584297180176, + "learning_rate": 9.223742458417358e-06, + "loss": 0.21, + "step": 7428 + }, + { + "epoch": 0.18799504010932003, + "grad_norm": 8.17972183227539, + "learning_rate": 9.223527564585677e-06, + "loss": 0.2185, + "step": 7429 + }, + { + "epoch": 0.1880203456740137, + "grad_norm": 17.499452590942383, + "learning_rate": 9.223312643517252e-06, + "loss": 0.2757, + "step": 7430 + }, + { + "epoch": 0.1880456512387074, + "grad_norm": 5.277899742126465, + "learning_rate": 9.223097695213472e-06, + "loss": 0.1804, + "step": 7431 + }, + { + "epoch": 0.18807095680340108, + "grad_norm": 3.3449301719665527, + "learning_rate": 9.222882719675718e-06, + "loss": 0.1183, + "step": 7432 + }, + { + "epoch": 0.18809626236809474, + "grad_norm": 9.487072944641113, + "learning_rate": 9.22266771690538e-06, + "loss": 0.2785, + "step": 7433 + }, + { + "epoch": 0.18812156793278842, + "grad_norm": 9.44406795501709, + "learning_rate": 9.222452686903842e-06, + "loss": 0.2599, + "step": 7434 + }, + { + "epoch": 0.1881468734974821, + "grad_norm": 4.1313934326171875, + "learning_rate": 9.222237629672493e-06, + "loss": 0.1488, + "step": 7435 + }, + { + "epoch": 0.18817217906217576, + "grad_norm": 3.9869720935821533, + "learning_rate": 9.222022545212719e-06, + "loss": 0.2099, + "step": 7436 + }, + { + "epoch": 0.18819748462686944, + "grad_norm": 10.11157512664795, + "learning_rate": 9.221807433525907e-06, + "loss": 0.1693, + "step": 7437 + }, + { + "epoch": 0.18822279019156313, + "grad_norm": 4.8288469314575195, + "learning_rate": 9.221592294613444e-06, + "loss": 0.1701, + "step": 7438 + }, + { + "epoch": 0.1882480957562568, + "grad_norm": 5.178727626800537, + "learning_rate": 9.221377128476717e-06, + "loss": 0.1377, + "step": 7439 + }, + { + "epoch": 0.18827340132095047, + "grad_norm": 3.612271785736084, + "learning_rate": 9.221161935117115e-06, + "loss": 0.1739, + "step": 7440 + }, + { + "epoch": 0.18829870688564415, + "grad_norm": 4.823600769042969, + "learning_rate": 9.220946714536026e-06, + "loss": 0.1747, + "step": 7441 + }, + { + "epoch": 0.18832401245033784, + "grad_norm": 7.115347385406494, + "learning_rate": 9.220731466734833e-06, + "loss": 0.2058, + "step": 7442 + }, + { + "epoch": 0.1883493180150315, + "grad_norm": 5.29157018661499, + "learning_rate": 9.220516191714931e-06, + "loss": 0.113, + "step": 7443 + }, + { + "epoch": 0.18837462357972518, + "grad_norm": 7.170475006103516, + "learning_rate": 9.220300889477704e-06, + "loss": 0.1903, + "step": 7444 + }, + { + "epoch": 0.18839992914441886, + "grad_norm": 5.342553615570068, + "learning_rate": 9.220085560024543e-06, + "loss": 0.1778, + "step": 7445 + }, + { + "epoch": 0.18842523470911254, + "grad_norm": 3.850954532623291, + "learning_rate": 9.219870203356835e-06, + "loss": 0.1816, + "step": 7446 + }, + { + "epoch": 0.1884505402738062, + "grad_norm": 10.766861915588379, + "learning_rate": 9.219654819475967e-06, + "loss": 0.3106, + "step": 7447 + }, + { + "epoch": 0.18847584583849988, + "grad_norm": 3.6220879554748535, + "learning_rate": 9.219439408383332e-06, + "loss": 0.1038, + "step": 7448 + }, + { + "epoch": 0.18850115140319357, + "grad_norm": 7.533324241638184, + "learning_rate": 9.219223970080317e-06, + "loss": 0.1997, + "step": 7449 + }, + { + "epoch": 0.18852645696788725, + "grad_norm": 2.8877155780792236, + "learning_rate": 9.21900850456831e-06, + "loss": 0.1091, + "step": 7450 + }, + { + "epoch": 0.1885517625325809, + "grad_norm": 4.08153772354126, + "learning_rate": 9.218793011848703e-06, + "loss": 0.1485, + "step": 7451 + }, + { + "epoch": 0.1885770680972746, + "grad_norm": 5.339501857757568, + "learning_rate": 9.218577491922884e-06, + "loss": 0.1965, + "step": 7452 + }, + { + "epoch": 0.18860237366196828, + "grad_norm": 6.966603755950928, + "learning_rate": 9.218361944792245e-06, + "loss": 0.1546, + "step": 7453 + }, + { + "epoch": 0.18862767922666193, + "grad_norm": 5.322533130645752, + "learning_rate": 9.218146370458172e-06, + "loss": 0.1655, + "step": 7454 + }, + { + "epoch": 0.18865298479135562, + "grad_norm": 7.300174713134766, + "learning_rate": 9.217930768922058e-06, + "loss": 0.2106, + "step": 7455 + }, + { + "epoch": 0.1886782903560493, + "grad_norm": 6.562990188598633, + "learning_rate": 9.217715140185295e-06, + "loss": 0.2415, + "step": 7456 + }, + { + "epoch": 0.18870359592074298, + "grad_norm": 6.358510494232178, + "learning_rate": 9.21749948424927e-06, + "loss": 0.1406, + "step": 7457 + }, + { + "epoch": 0.18872890148543664, + "grad_norm": 4.884017467498779, + "learning_rate": 9.217283801115378e-06, + "loss": 0.2177, + "step": 7458 + }, + { + "epoch": 0.18875420705013032, + "grad_norm": 7.152364253997803, + "learning_rate": 9.217068090785004e-06, + "loss": 0.1518, + "step": 7459 + }, + { + "epoch": 0.188779512614824, + "grad_norm": 4.091202259063721, + "learning_rate": 9.216852353259545e-06, + "loss": 0.1683, + "step": 7460 + }, + { + "epoch": 0.18880481817951766, + "grad_norm": 7.013462066650391, + "learning_rate": 9.216636588540387e-06, + "loss": 0.2807, + "step": 7461 + }, + { + "epoch": 0.18883012374421135, + "grad_norm": 3.901876449584961, + "learning_rate": 9.216420796628925e-06, + "loss": 0.1782, + "step": 7462 + }, + { + "epoch": 0.18885542930890503, + "grad_norm": 2.3260116577148438, + "learning_rate": 9.216204977526551e-06, + "loss": 0.1143, + "step": 7463 + }, + { + "epoch": 0.18888073487359872, + "grad_norm": 2.985105276107788, + "learning_rate": 9.215989131234653e-06, + "loss": 0.1438, + "step": 7464 + }, + { + "epoch": 0.18890604043829237, + "grad_norm": 9.769664764404297, + "learning_rate": 9.215773257754628e-06, + "loss": 0.2972, + "step": 7465 + }, + { + "epoch": 0.18893134600298606, + "grad_norm": 17.934110641479492, + "learning_rate": 9.215557357087863e-06, + "loss": 0.2337, + "step": 7466 + }, + { + "epoch": 0.18895665156767974, + "grad_norm": 3.3821380138397217, + "learning_rate": 9.215341429235754e-06, + "loss": 0.1442, + "step": 7467 + }, + { + "epoch": 0.1889819571323734, + "grad_norm": 6.045826435089111, + "learning_rate": 9.21512547419969e-06, + "loss": 0.2158, + "step": 7468 + }, + { + "epoch": 0.18900726269706708, + "grad_norm": 6.7477617263793945, + "learning_rate": 9.21490949198107e-06, + "loss": 0.1844, + "step": 7469 + }, + { + "epoch": 0.18903256826176076, + "grad_norm": 6.405612468719482, + "learning_rate": 9.214693482581281e-06, + "loss": 0.151, + "step": 7470 + }, + { + "epoch": 0.18905787382645445, + "grad_norm": 6.356578350067139, + "learning_rate": 9.214477446001715e-06, + "loss": 0.1927, + "step": 7471 + }, + { + "epoch": 0.1890831793911481, + "grad_norm": 5.296367168426514, + "learning_rate": 9.214261382243771e-06, + "loss": 0.1855, + "step": 7472 + }, + { + "epoch": 0.1891084849558418, + "grad_norm": 3.372157096862793, + "learning_rate": 9.214045291308839e-06, + "loss": 0.1799, + "step": 7473 + }, + { + "epoch": 0.18913379052053547, + "grad_norm": 5.173106670379639, + "learning_rate": 9.213829173198311e-06, + "loss": 0.1745, + "step": 7474 + }, + { + "epoch": 0.18915909608522913, + "grad_norm": 4.962325572967529, + "learning_rate": 9.213613027913584e-06, + "loss": 0.1825, + "step": 7475 + }, + { + "epoch": 0.1891844016499228, + "grad_norm": 4.461088180541992, + "learning_rate": 9.213396855456051e-06, + "loss": 0.2173, + "step": 7476 + }, + { + "epoch": 0.1892097072146165, + "grad_norm": 8.547017097473145, + "learning_rate": 9.213180655827105e-06, + "loss": 0.152, + "step": 7477 + }, + { + "epoch": 0.18923501277931018, + "grad_norm": 5.183667182922363, + "learning_rate": 9.21296442902814e-06, + "loss": 0.1568, + "step": 7478 + }, + { + "epoch": 0.18926031834400384, + "grad_norm": 7.035439491271973, + "learning_rate": 9.21274817506055e-06, + "loss": 0.2194, + "step": 7479 + }, + { + "epoch": 0.18928562390869752, + "grad_norm": 4.717964172363281, + "learning_rate": 9.212531893925733e-06, + "loss": 0.1661, + "step": 7480 + }, + { + "epoch": 0.1893109294733912, + "grad_norm": 3.605336904525757, + "learning_rate": 9.21231558562508e-06, + "loss": 0.1414, + "step": 7481 + }, + { + "epoch": 0.1893362350380849, + "grad_norm": 5.366586208343506, + "learning_rate": 9.212099250159987e-06, + "loss": 0.1656, + "step": 7482 + }, + { + "epoch": 0.18936154060277854, + "grad_norm": 3.3757505416870117, + "learning_rate": 9.21188288753185e-06, + "loss": 0.1394, + "step": 7483 + }, + { + "epoch": 0.18938684616747223, + "grad_norm": 8.4558744430542, + "learning_rate": 9.211666497742065e-06, + "loss": 0.251, + "step": 7484 + }, + { + "epoch": 0.1894121517321659, + "grad_norm": 17.422916412353516, + "learning_rate": 9.211450080792023e-06, + "loss": 0.2709, + "step": 7485 + }, + { + "epoch": 0.18943745729685957, + "grad_norm": 7.765401363372803, + "learning_rate": 9.211233636683126e-06, + "loss": 0.2063, + "step": 7486 + }, + { + "epoch": 0.18946276286155325, + "grad_norm": 5.801289081573486, + "learning_rate": 9.211017165416766e-06, + "loss": 0.2556, + "step": 7487 + }, + { + "epoch": 0.18948806842624694, + "grad_norm": 8.3783597946167, + "learning_rate": 9.21080066699434e-06, + "loss": 0.1686, + "step": 7488 + }, + { + "epoch": 0.18951337399094062, + "grad_norm": 8.547811508178711, + "learning_rate": 9.210584141417242e-06, + "loss": 0.2389, + "step": 7489 + }, + { + "epoch": 0.18953867955563428, + "grad_norm": 6.479634761810303, + "learning_rate": 9.21036758868687e-06, + "loss": 0.1811, + "step": 7490 + }, + { + "epoch": 0.18956398512032796, + "grad_norm": 4.063691139221191, + "learning_rate": 9.210151008804623e-06, + "loss": 0.2036, + "step": 7491 + }, + { + "epoch": 0.18958929068502164, + "grad_norm": 6.860466957092285, + "learning_rate": 9.209934401771892e-06, + "loss": 0.2083, + "step": 7492 + }, + { + "epoch": 0.1896145962497153, + "grad_norm": 6.177178382873535, + "learning_rate": 9.20971776759008e-06, + "loss": 0.1936, + "step": 7493 + }, + { + "epoch": 0.18963990181440898, + "grad_norm": 6.026276111602783, + "learning_rate": 9.20950110626058e-06, + "loss": 0.2486, + "step": 7494 + }, + { + "epoch": 0.18966520737910267, + "grad_norm": 21.715402603149414, + "learning_rate": 9.209284417784791e-06, + "loss": 0.159, + "step": 7495 + }, + { + "epoch": 0.18969051294379635, + "grad_norm": 3.7123045921325684, + "learning_rate": 9.209067702164109e-06, + "loss": 0.1164, + "step": 7496 + }, + { + "epoch": 0.18971581850849, + "grad_norm": 2.7808122634887695, + "learning_rate": 9.208850959399934e-06, + "loss": 0.1378, + "step": 7497 + }, + { + "epoch": 0.1897411240731837, + "grad_norm": 6.446653366088867, + "learning_rate": 9.208634189493659e-06, + "loss": 0.2532, + "step": 7498 + }, + { + "epoch": 0.18976642963787738, + "grad_norm": 4.7315826416015625, + "learning_rate": 9.208417392446686e-06, + "loss": 0.2108, + "step": 7499 + }, + { + "epoch": 0.18979173520257103, + "grad_norm": 4.004515647888184, + "learning_rate": 9.208200568260414e-06, + "loss": 0.0968, + "step": 7500 + }, + { + "epoch": 0.18981704076726472, + "grad_norm": 3.7840158939361572, + "learning_rate": 9.207983716936238e-06, + "loss": 0.181, + "step": 7501 + }, + { + "epoch": 0.1898423463319584, + "grad_norm": 8.731989860534668, + "learning_rate": 9.207766838475559e-06, + "loss": 0.1672, + "step": 7502 + }, + { + "epoch": 0.18986765189665208, + "grad_norm": 9.425962448120117, + "learning_rate": 9.20754993287977e-06, + "loss": 0.2465, + "step": 7503 + }, + { + "epoch": 0.18989295746134574, + "grad_norm": 14.041167259216309, + "learning_rate": 9.20733300015028e-06, + "loss": 0.2472, + "step": 7504 + }, + { + "epoch": 0.18991826302603942, + "grad_norm": 8.822427749633789, + "learning_rate": 9.20711604028848e-06, + "loss": 0.2604, + "step": 7505 + }, + { + "epoch": 0.1899435685907331, + "grad_norm": 10.878252029418945, + "learning_rate": 9.206899053295771e-06, + "loss": 0.2298, + "step": 7506 + }, + { + "epoch": 0.18996887415542676, + "grad_norm": 8.175151824951172, + "learning_rate": 9.206682039173551e-06, + "loss": 0.2458, + "step": 7507 + }, + { + "epoch": 0.18999417972012045, + "grad_norm": 11.241057395935059, + "learning_rate": 9.206464997923223e-06, + "loss": 0.3143, + "step": 7508 + }, + { + "epoch": 0.19001948528481413, + "grad_norm": 5.716048717498779, + "learning_rate": 9.206247929546184e-06, + "loss": 0.1417, + "step": 7509 + }, + { + "epoch": 0.19004479084950782, + "grad_norm": 4.379761695861816, + "learning_rate": 9.206030834043833e-06, + "loss": 0.1485, + "step": 7510 + }, + { + "epoch": 0.19007009641420147, + "grad_norm": 9.884217262268066, + "learning_rate": 9.205813711417573e-06, + "loss": 0.2673, + "step": 7511 + }, + { + "epoch": 0.19009540197889516, + "grad_norm": 6.425127029418945, + "learning_rate": 9.205596561668804e-06, + "loss": 0.1675, + "step": 7512 + }, + { + "epoch": 0.19012070754358884, + "grad_norm": 5.3159403800964355, + "learning_rate": 9.205379384798924e-06, + "loss": 0.1653, + "step": 7513 + }, + { + "epoch": 0.19014601310828252, + "grad_norm": 3.1175100803375244, + "learning_rate": 9.205162180809334e-06, + "loss": 0.1558, + "step": 7514 + }, + { + "epoch": 0.19017131867297618, + "grad_norm": 5.521666049957275, + "learning_rate": 9.204944949701436e-06, + "loss": 0.1406, + "step": 7515 + }, + { + "epoch": 0.19019662423766986, + "grad_norm": 4.028675556182861, + "learning_rate": 9.20472769147663e-06, + "loss": 0.1067, + "step": 7516 + }, + { + "epoch": 0.19022192980236355, + "grad_norm": 13.895008087158203, + "learning_rate": 9.204510406136317e-06, + "loss": 0.2737, + "step": 7517 + }, + { + "epoch": 0.1902472353670572, + "grad_norm": 11.480191230773926, + "learning_rate": 9.204293093681898e-06, + "loss": 0.3167, + "step": 7518 + }, + { + "epoch": 0.1902725409317509, + "grad_norm": 5.9373321533203125, + "learning_rate": 9.204075754114777e-06, + "loss": 0.2544, + "step": 7519 + }, + { + "epoch": 0.19029784649644457, + "grad_norm": 7.915765762329102, + "learning_rate": 9.203858387436352e-06, + "loss": 0.2522, + "step": 7520 + }, + { + "epoch": 0.19032315206113826, + "grad_norm": 6.318707466125488, + "learning_rate": 9.203640993648027e-06, + "loss": 0.2697, + "step": 7521 + }, + { + "epoch": 0.1903484576258319, + "grad_norm": 11.350150108337402, + "learning_rate": 9.2034235727512e-06, + "loss": 0.2626, + "step": 7522 + }, + { + "epoch": 0.1903737631905256, + "grad_norm": 4.022933006286621, + "learning_rate": 9.203206124747279e-06, + "loss": 0.1479, + "step": 7523 + }, + { + "epoch": 0.19039906875521928, + "grad_norm": 8.610818862915039, + "learning_rate": 9.202988649637664e-06, + "loss": 0.3291, + "step": 7524 + }, + { + "epoch": 0.19042437431991294, + "grad_norm": 8.234959602355957, + "learning_rate": 9.202771147423757e-06, + "loss": 0.166, + "step": 7525 + }, + { + "epoch": 0.19044967988460662, + "grad_norm": 8.929224014282227, + "learning_rate": 9.202553618106959e-06, + "loss": 0.2229, + "step": 7526 + }, + { + "epoch": 0.1904749854493003, + "grad_norm": 8.874557495117188, + "learning_rate": 9.202336061688676e-06, + "loss": 0.2997, + "step": 7527 + }, + { + "epoch": 0.190500291013994, + "grad_norm": 5.493492126464844, + "learning_rate": 9.20211847817031e-06, + "loss": 0.197, + "step": 7528 + }, + { + "epoch": 0.19052559657868764, + "grad_norm": 6.771122932434082, + "learning_rate": 9.201900867553261e-06, + "loss": 0.2407, + "step": 7529 + }, + { + "epoch": 0.19055090214338133, + "grad_norm": 18.165536880493164, + "learning_rate": 9.201683229838937e-06, + "loss": 0.342, + "step": 7530 + }, + { + "epoch": 0.190576207708075, + "grad_norm": 4.591786861419678, + "learning_rate": 9.201465565028738e-06, + "loss": 0.213, + "step": 7531 + }, + { + "epoch": 0.19060151327276867, + "grad_norm": 7.231898784637451, + "learning_rate": 9.20124787312407e-06, + "loss": 0.1349, + "step": 7532 + }, + { + "epoch": 0.19062681883746235, + "grad_norm": 5.898644924163818, + "learning_rate": 9.201030154126336e-06, + "loss": 0.2219, + "step": 7533 + }, + { + "epoch": 0.19065212440215604, + "grad_norm": 11.938454627990723, + "learning_rate": 9.20081240803694e-06, + "loss": 0.2231, + "step": 7534 + }, + { + "epoch": 0.19067742996684972, + "grad_norm": 4.349595069885254, + "learning_rate": 9.200594634857286e-06, + "loss": 0.184, + "step": 7535 + }, + { + "epoch": 0.19070273553154338, + "grad_norm": 6.363295078277588, + "learning_rate": 9.200376834588778e-06, + "loss": 0.2191, + "step": 7536 + }, + { + "epoch": 0.19072804109623706, + "grad_norm": 11.352869033813477, + "learning_rate": 9.200159007232823e-06, + "loss": 0.2635, + "step": 7537 + }, + { + "epoch": 0.19075334666093074, + "grad_norm": 4.764845371246338, + "learning_rate": 9.199941152790822e-06, + "loss": 0.1211, + "step": 7538 + }, + { + "epoch": 0.1907786522256244, + "grad_norm": 4.003209114074707, + "learning_rate": 9.199723271264182e-06, + "loss": 0.2155, + "step": 7539 + }, + { + "epoch": 0.19080395779031809, + "grad_norm": 3.9046521186828613, + "learning_rate": 9.199505362654308e-06, + "loss": 0.198, + "step": 7540 + }, + { + "epoch": 0.19082926335501177, + "grad_norm": 4.377660274505615, + "learning_rate": 9.199287426962606e-06, + "loss": 0.1371, + "step": 7541 + }, + { + "epoch": 0.19085456891970545, + "grad_norm": 10.959867477416992, + "learning_rate": 9.19906946419048e-06, + "loss": 0.2953, + "step": 7542 + }, + { + "epoch": 0.1908798744843991, + "grad_norm": 5.012214660644531, + "learning_rate": 9.198851474339335e-06, + "loss": 0.1807, + "step": 7543 + }, + { + "epoch": 0.1909051800490928, + "grad_norm": 2.3071446418762207, + "learning_rate": 9.198633457410579e-06, + "loss": 0.0746, + "step": 7544 + }, + { + "epoch": 0.19093048561378648, + "grad_norm": 4.448593616485596, + "learning_rate": 9.198415413405617e-06, + "loss": 0.2504, + "step": 7545 + }, + { + "epoch": 0.19095579117848016, + "grad_norm": 3.116849422454834, + "learning_rate": 9.198197342325855e-06, + "loss": 0.156, + "step": 7546 + }, + { + "epoch": 0.19098109674317382, + "grad_norm": 6.845058917999268, + "learning_rate": 9.197979244172699e-06, + "loss": 0.2476, + "step": 7547 + }, + { + "epoch": 0.1910064023078675, + "grad_norm": 5.157723903656006, + "learning_rate": 9.197761118947555e-06, + "loss": 0.2198, + "step": 7548 + }, + { + "epoch": 0.19103170787256119, + "grad_norm": 5.306150913238525, + "learning_rate": 9.19754296665183e-06, + "loss": 0.1903, + "step": 7549 + }, + { + "epoch": 0.19105701343725484, + "grad_norm": 3.7776925563812256, + "learning_rate": 9.197324787286933e-06, + "loss": 0.2041, + "step": 7550 + }, + { + "epoch": 0.19108231900194853, + "grad_norm": 3.6134777069091797, + "learning_rate": 9.197106580854268e-06, + "loss": 0.1421, + "step": 7551 + }, + { + "epoch": 0.1911076245666422, + "grad_norm": 4.688652038574219, + "learning_rate": 9.196888347355244e-06, + "loss": 0.1785, + "step": 7552 + }, + { + "epoch": 0.1911329301313359, + "grad_norm": 4.768962383270264, + "learning_rate": 9.196670086791269e-06, + "loss": 0.1646, + "step": 7553 + }, + { + "epoch": 0.19115823569602955, + "grad_norm": 9.013197898864746, + "learning_rate": 9.196451799163746e-06, + "loss": 0.2086, + "step": 7554 + }, + { + "epoch": 0.19118354126072323, + "grad_norm": 5.74928617477417, + "learning_rate": 9.196233484474088e-06, + "loss": 0.2616, + "step": 7555 + }, + { + "epoch": 0.19120884682541692, + "grad_norm": 7.568410396575928, + "learning_rate": 9.1960151427237e-06, + "loss": 0.2589, + "step": 7556 + }, + { + "epoch": 0.19123415239011057, + "grad_norm": 4.800210952758789, + "learning_rate": 9.195796773913992e-06, + "loss": 0.1505, + "step": 7557 + }, + { + "epoch": 0.19125945795480426, + "grad_norm": 6.039072513580322, + "learning_rate": 9.19557837804637e-06, + "loss": 0.2613, + "step": 7558 + }, + { + "epoch": 0.19128476351949794, + "grad_norm": 4.758134365081787, + "learning_rate": 9.195359955122244e-06, + "loss": 0.2432, + "step": 7559 + }, + { + "epoch": 0.19131006908419163, + "grad_norm": 7.175486087799072, + "learning_rate": 9.195141505143021e-06, + "loss": 0.1677, + "step": 7560 + }, + { + "epoch": 0.19133537464888528, + "grad_norm": 4.372587203979492, + "learning_rate": 9.194923028110112e-06, + "loss": 0.1799, + "step": 7561 + }, + { + "epoch": 0.19136068021357897, + "grad_norm": 4.666967391967773, + "learning_rate": 9.194704524024925e-06, + "loss": 0.1292, + "step": 7562 + }, + { + "epoch": 0.19138598577827265, + "grad_norm": 4.299922943115234, + "learning_rate": 9.194485992888869e-06, + "loss": 0.1455, + "step": 7563 + }, + { + "epoch": 0.1914112913429663, + "grad_norm": 30.528244018554688, + "learning_rate": 9.194267434703352e-06, + "loss": 0.281, + "step": 7564 + }, + { + "epoch": 0.19143659690766, + "grad_norm": 7.981289386749268, + "learning_rate": 9.194048849469785e-06, + "loss": 0.2152, + "step": 7565 + }, + { + "epoch": 0.19146190247235367, + "grad_norm": 6.673203468322754, + "learning_rate": 9.193830237189576e-06, + "loss": 0.2473, + "step": 7566 + }, + { + "epoch": 0.19148720803704736, + "grad_norm": 10.720682144165039, + "learning_rate": 9.193611597864138e-06, + "loss": 0.1958, + "step": 7567 + }, + { + "epoch": 0.191512513601741, + "grad_norm": 4.509429931640625, + "learning_rate": 9.193392931494877e-06, + "loss": 0.1248, + "step": 7568 + }, + { + "epoch": 0.1915378191664347, + "grad_norm": 14.376008987426758, + "learning_rate": 9.193174238083207e-06, + "loss": 0.181, + "step": 7569 + }, + { + "epoch": 0.19156312473112838, + "grad_norm": 12.68601131439209, + "learning_rate": 9.192955517630533e-06, + "loss": 0.3473, + "step": 7570 + }, + { + "epoch": 0.19158843029582204, + "grad_norm": 3.741626024246216, + "learning_rate": 9.192736770138272e-06, + "loss": 0.2173, + "step": 7571 + }, + { + "epoch": 0.19161373586051572, + "grad_norm": 4.529228210449219, + "learning_rate": 9.19251799560783e-06, + "loss": 0.2113, + "step": 7572 + }, + { + "epoch": 0.1916390414252094, + "grad_norm": 6.039937496185303, + "learning_rate": 9.19229919404062e-06, + "loss": 0.1793, + "step": 7573 + }, + { + "epoch": 0.1916643469899031, + "grad_norm": 6.437211513519287, + "learning_rate": 9.192080365438052e-06, + "loss": 0.2425, + "step": 7574 + }, + { + "epoch": 0.19168965255459675, + "grad_norm": 3.7557668685913086, + "learning_rate": 9.191861509801539e-06, + "loss": 0.1288, + "step": 7575 + }, + { + "epoch": 0.19171495811929043, + "grad_norm": 3.773231029510498, + "learning_rate": 9.19164262713249e-06, + "loss": 0.1965, + "step": 7576 + }, + { + "epoch": 0.1917402636839841, + "grad_norm": 7.144568920135498, + "learning_rate": 9.191423717432315e-06, + "loss": 0.2927, + "step": 7577 + }, + { + "epoch": 0.1917655692486778, + "grad_norm": 6.957852840423584, + "learning_rate": 9.19120478070243e-06, + "loss": 0.2794, + "step": 7578 + }, + { + "epoch": 0.19179087481337145, + "grad_norm": 5.344756603240967, + "learning_rate": 9.190985816944244e-06, + "loss": 0.1339, + "step": 7579 + }, + { + "epoch": 0.19181618037806514, + "grad_norm": 8.686148643493652, + "learning_rate": 9.190766826159173e-06, + "loss": 0.2436, + "step": 7580 + }, + { + "epoch": 0.19184148594275882, + "grad_norm": 6.563179016113281, + "learning_rate": 9.190547808348624e-06, + "loss": 0.2155, + "step": 7581 + }, + { + "epoch": 0.19186679150745248, + "grad_norm": 4.6380462646484375, + "learning_rate": 9.190328763514012e-06, + "loss": 0.2085, + "step": 7582 + }, + { + "epoch": 0.19189209707214616, + "grad_norm": 6.182232856750488, + "learning_rate": 9.190109691656747e-06, + "loss": 0.2128, + "step": 7583 + }, + { + "epoch": 0.19191740263683985, + "grad_norm": 8.28166675567627, + "learning_rate": 9.189890592778248e-06, + "loss": 0.2449, + "step": 7584 + }, + { + "epoch": 0.19194270820153353, + "grad_norm": 8.941286087036133, + "learning_rate": 9.18967146687992e-06, + "loss": 0.2627, + "step": 7585 + }, + { + "epoch": 0.19196801376622719, + "grad_norm": 6.540243148803711, + "learning_rate": 9.189452313963185e-06, + "loss": 0.2304, + "step": 7586 + }, + { + "epoch": 0.19199331933092087, + "grad_norm": 3.244724750518799, + "learning_rate": 9.189233134029447e-06, + "loss": 0.1772, + "step": 7587 + }, + { + "epoch": 0.19201862489561455, + "grad_norm": 5.080626010894775, + "learning_rate": 9.189013927080126e-06, + "loss": 0.2199, + "step": 7588 + }, + { + "epoch": 0.1920439304603082, + "grad_norm": 5.949594974517822, + "learning_rate": 9.188794693116631e-06, + "loss": 0.2173, + "step": 7589 + }, + { + "epoch": 0.1920692360250019, + "grad_norm": 4.928884029388428, + "learning_rate": 9.18857543214038e-06, + "loss": 0.1695, + "step": 7590 + }, + { + "epoch": 0.19209454158969558, + "grad_norm": 5.312558174133301, + "learning_rate": 9.188356144152786e-06, + "loss": 0.1472, + "step": 7591 + }, + { + "epoch": 0.19211984715438926, + "grad_norm": 7.624845027923584, + "learning_rate": 9.188136829155263e-06, + "loss": 0.2431, + "step": 7592 + }, + { + "epoch": 0.19214515271908292, + "grad_norm": 3.462839126586914, + "learning_rate": 9.187917487149221e-06, + "loss": 0.1685, + "step": 7593 + }, + { + "epoch": 0.1921704582837766, + "grad_norm": 14.645837783813477, + "learning_rate": 9.187698118136081e-06, + "loss": 0.3301, + "step": 7594 + }, + { + "epoch": 0.19219576384847029, + "grad_norm": 8.607986450195312, + "learning_rate": 9.187478722117254e-06, + "loss": 0.1781, + "step": 7595 + }, + { + "epoch": 0.19222106941316394, + "grad_norm": 5.869731426239014, + "learning_rate": 9.187259299094156e-06, + "loss": 0.1855, + "step": 7596 + }, + { + "epoch": 0.19224637497785763, + "grad_norm": 4.803187847137451, + "learning_rate": 9.187039849068202e-06, + "loss": 0.1781, + "step": 7597 + }, + { + "epoch": 0.1922716805425513, + "grad_norm": 6.448910713195801, + "learning_rate": 9.186820372040805e-06, + "loss": 0.2081, + "step": 7598 + }, + { + "epoch": 0.192296986107245, + "grad_norm": 5.575407981872559, + "learning_rate": 9.186600868013385e-06, + "loss": 0.1596, + "step": 7599 + }, + { + "epoch": 0.19232229167193865, + "grad_norm": 6.379859924316406, + "learning_rate": 9.18638133698735e-06, + "loss": 0.2602, + "step": 7600 + }, + { + "epoch": 0.19234759723663233, + "grad_norm": 5.040486812591553, + "learning_rate": 9.186161778964126e-06, + "loss": 0.2096, + "step": 7601 + }, + { + "epoch": 0.19237290280132602, + "grad_norm": 4.873586654663086, + "learning_rate": 9.185942193945122e-06, + "loss": 0.1774, + "step": 7602 + }, + { + "epoch": 0.19239820836601967, + "grad_norm": 14.280661582946777, + "learning_rate": 9.185722581931753e-06, + "loss": 0.2762, + "step": 7603 + }, + { + "epoch": 0.19242351393071336, + "grad_norm": 5.041785717010498, + "learning_rate": 9.18550294292544e-06, + "loss": 0.1556, + "step": 7604 + }, + { + "epoch": 0.19244881949540704, + "grad_norm": 10.30062198638916, + "learning_rate": 9.185283276927595e-06, + "loss": 0.2327, + "step": 7605 + }, + { + "epoch": 0.19247412506010073, + "grad_norm": 6.952793121337891, + "learning_rate": 9.185063583939638e-06, + "loss": 0.2177, + "step": 7606 + }, + { + "epoch": 0.19249943062479438, + "grad_norm": 4.129718780517578, + "learning_rate": 9.184843863962984e-06, + "loss": 0.1674, + "step": 7607 + }, + { + "epoch": 0.19252473618948807, + "grad_norm": 4.421944618225098, + "learning_rate": 9.18462411699905e-06, + "loss": 0.1535, + "step": 7608 + }, + { + "epoch": 0.19255004175418175, + "grad_norm": 6.0564656257629395, + "learning_rate": 9.184404343049256e-06, + "loss": 0.2055, + "step": 7609 + }, + { + "epoch": 0.19257534731887543, + "grad_norm": 9.19377613067627, + "learning_rate": 9.184184542115014e-06, + "loss": 0.1889, + "step": 7610 + }, + { + "epoch": 0.1926006528835691, + "grad_norm": 19.857749938964844, + "learning_rate": 9.183964714197746e-06, + "loss": 0.2411, + "step": 7611 + }, + { + "epoch": 0.19262595844826277, + "grad_norm": 5.094814777374268, + "learning_rate": 9.183744859298867e-06, + "loss": 0.2038, + "step": 7612 + }, + { + "epoch": 0.19265126401295646, + "grad_norm": 13.816823959350586, + "learning_rate": 9.183524977419795e-06, + "loss": 0.3137, + "step": 7613 + }, + { + "epoch": 0.19267656957765011, + "grad_norm": 5.513500690460205, + "learning_rate": 9.18330506856195e-06, + "loss": 0.1374, + "step": 7614 + }, + { + "epoch": 0.1927018751423438, + "grad_norm": 4.418592929840088, + "learning_rate": 9.183085132726748e-06, + "loss": 0.1405, + "step": 7615 + }, + { + "epoch": 0.19272718070703748, + "grad_norm": 3.365116596221924, + "learning_rate": 9.182865169915606e-06, + "loss": 0.1096, + "step": 7616 + }, + { + "epoch": 0.19275248627173117, + "grad_norm": 3.5916364192962646, + "learning_rate": 9.182645180129948e-06, + "loss": 0.1623, + "step": 7617 + }, + { + "epoch": 0.19277779183642482, + "grad_norm": 5.38197135925293, + "learning_rate": 9.182425163371189e-06, + "loss": 0.2457, + "step": 7618 + }, + { + "epoch": 0.1928030974011185, + "grad_norm": 9.899272918701172, + "learning_rate": 9.182205119640746e-06, + "loss": 0.3199, + "step": 7619 + }, + { + "epoch": 0.1928284029658122, + "grad_norm": 9.740480422973633, + "learning_rate": 9.181985048940041e-06, + "loss": 0.1959, + "step": 7620 + }, + { + "epoch": 0.19285370853050585, + "grad_norm": 9.18591022491455, + "learning_rate": 9.181764951270493e-06, + "loss": 0.2677, + "step": 7621 + }, + { + "epoch": 0.19287901409519953, + "grad_norm": 4.145220756530762, + "learning_rate": 9.18154482663352e-06, + "loss": 0.1887, + "step": 7622 + }, + { + "epoch": 0.19290431965989321, + "grad_norm": 6.648749351501465, + "learning_rate": 9.181324675030543e-06, + "loss": 0.2231, + "step": 7623 + }, + { + "epoch": 0.1929296252245869, + "grad_norm": 4.673592567443848, + "learning_rate": 9.18110449646298e-06, + "loss": 0.2128, + "step": 7624 + }, + { + "epoch": 0.19295493078928055, + "grad_norm": 6.4567670822143555, + "learning_rate": 9.180884290932253e-06, + "loss": 0.2819, + "step": 7625 + }, + { + "epoch": 0.19298023635397424, + "grad_norm": 9.409998893737793, + "learning_rate": 9.18066405843978e-06, + "loss": 0.351, + "step": 7626 + }, + { + "epoch": 0.19300554191866792, + "grad_norm": 11.232563972473145, + "learning_rate": 9.180443798986982e-06, + "loss": 0.2398, + "step": 7627 + }, + { + "epoch": 0.19303084748336158, + "grad_norm": 16.095678329467773, + "learning_rate": 9.18022351257528e-06, + "loss": 0.3113, + "step": 7628 + }, + { + "epoch": 0.19305615304805526, + "grad_norm": 5.657938003540039, + "learning_rate": 9.180003199206093e-06, + "loss": 0.2516, + "step": 7629 + }, + { + "epoch": 0.19308145861274895, + "grad_norm": 5.387292861938477, + "learning_rate": 9.179782858880845e-06, + "loss": 0.2623, + "step": 7630 + }, + { + "epoch": 0.19310676417744263, + "grad_norm": 6.482067584991455, + "learning_rate": 9.179562491600954e-06, + "loss": 0.1511, + "step": 7631 + }, + { + "epoch": 0.1931320697421363, + "grad_norm": 7.59641695022583, + "learning_rate": 9.17934209736784e-06, + "loss": 0.2417, + "step": 7632 + }, + { + "epoch": 0.19315737530682997, + "grad_norm": 4.928979396820068, + "learning_rate": 9.179121676182928e-06, + "loss": 0.1089, + "step": 7633 + }, + { + "epoch": 0.19318268087152365, + "grad_norm": 5.829622268676758, + "learning_rate": 9.178901228047638e-06, + "loss": 0.2454, + "step": 7634 + }, + { + "epoch": 0.1932079864362173, + "grad_norm": 2.406785488128662, + "learning_rate": 9.178680752963392e-06, + "loss": 0.0587, + "step": 7635 + }, + { + "epoch": 0.193233292000911, + "grad_norm": 3.972357749938965, + "learning_rate": 9.178460250931609e-06, + "loss": 0.1598, + "step": 7636 + }, + { + "epoch": 0.19325859756560468, + "grad_norm": 6.978273868560791, + "learning_rate": 9.178239721953714e-06, + "loss": 0.3123, + "step": 7637 + }, + { + "epoch": 0.19328390313029836, + "grad_norm": 7.620541095733643, + "learning_rate": 9.178019166031129e-06, + "loss": 0.1464, + "step": 7638 + }, + { + "epoch": 0.19330920869499202, + "grad_norm": 4.447360038757324, + "learning_rate": 9.177798583165276e-06, + "loss": 0.1416, + "step": 7639 + }, + { + "epoch": 0.1933345142596857, + "grad_norm": 5.499796390533447, + "learning_rate": 9.177577973357575e-06, + "loss": 0.184, + "step": 7640 + }, + { + "epoch": 0.1933598198243794, + "grad_norm": 6.024085521697998, + "learning_rate": 9.177357336609452e-06, + "loss": 0.2231, + "step": 7641 + }, + { + "epoch": 0.19338512538907307, + "grad_norm": 8.84868049621582, + "learning_rate": 9.177136672922329e-06, + "loss": 0.2626, + "step": 7642 + }, + { + "epoch": 0.19341043095376673, + "grad_norm": 5.322149753570557, + "learning_rate": 9.176915982297627e-06, + "loss": 0.2167, + "step": 7643 + }, + { + "epoch": 0.1934357365184604, + "grad_norm": 4.6946587562561035, + "learning_rate": 9.176695264736772e-06, + "loss": 0.2361, + "step": 7644 + }, + { + "epoch": 0.1934610420831541, + "grad_norm": 4.218125820159912, + "learning_rate": 9.176474520241185e-06, + "loss": 0.1668, + "step": 7645 + }, + { + "epoch": 0.19348634764784775, + "grad_norm": 5.908261775970459, + "learning_rate": 9.176253748812291e-06, + "loss": 0.1246, + "step": 7646 + }, + { + "epoch": 0.19351165321254143, + "grad_norm": 6.1131463050842285, + "learning_rate": 9.176032950451515e-06, + "loss": 0.1862, + "step": 7647 + }, + { + "epoch": 0.19353695877723512, + "grad_norm": 4.7066850662231445, + "learning_rate": 9.175812125160279e-06, + "loss": 0.1915, + "step": 7648 + }, + { + "epoch": 0.1935622643419288, + "grad_norm": 8.053093910217285, + "learning_rate": 9.175591272940007e-06, + "loss": 0.2308, + "step": 7649 + }, + { + "epoch": 0.19358756990662246, + "grad_norm": 10.344560623168945, + "learning_rate": 9.175370393792122e-06, + "loss": 0.3201, + "step": 7650 + }, + { + "epoch": 0.19361287547131614, + "grad_norm": 7.244174003601074, + "learning_rate": 9.175149487718051e-06, + "loss": 0.2492, + "step": 7651 + }, + { + "epoch": 0.19363818103600983, + "grad_norm": 4.3227338790893555, + "learning_rate": 9.174928554719218e-06, + "loss": 0.1599, + "step": 7652 + }, + { + "epoch": 0.19366348660070348, + "grad_norm": 4.301417827606201, + "learning_rate": 9.174707594797047e-06, + "loss": 0.2574, + "step": 7653 + }, + { + "epoch": 0.19368879216539717, + "grad_norm": 4.295656681060791, + "learning_rate": 9.174486607952965e-06, + "loss": 0.1972, + "step": 7654 + }, + { + "epoch": 0.19371409773009085, + "grad_norm": 3.0284512042999268, + "learning_rate": 9.174265594188394e-06, + "loss": 0.1653, + "step": 7655 + }, + { + "epoch": 0.19373940329478453, + "grad_norm": 16.374095916748047, + "learning_rate": 9.174044553504761e-06, + "loss": 0.1662, + "step": 7656 + }, + { + "epoch": 0.1937647088594782, + "grad_norm": 7.598485469818115, + "learning_rate": 9.17382348590349e-06, + "loss": 0.2255, + "step": 7657 + }, + { + "epoch": 0.19379001442417187, + "grad_norm": 4.412242889404297, + "learning_rate": 9.173602391386009e-06, + "loss": 0.1886, + "step": 7658 + }, + { + "epoch": 0.19381531998886556, + "grad_norm": 5.333999156951904, + "learning_rate": 9.173381269953741e-06, + "loss": 0.2298, + "step": 7659 + }, + { + "epoch": 0.19384062555355921, + "grad_norm": 3.9316329956054688, + "learning_rate": 9.173160121608116e-06, + "loss": 0.1154, + "step": 7660 + }, + { + "epoch": 0.1938659311182529, + "grad_norm": 3.865525007247925, + "learning_rate": 9.172938946350556e-06, + "loss": 0.1291, + "step": 7661 + }, + { + "epoch": 0.19389123668294658, + "grad_norm": 4.7812371253967285, + "learning_rate": 9.17271774418249e-06, + "loss": 0.1533, + "step": 7662 + }, + { + "epoch": 0.19391654224764027, + "grad_norm": 6.779796600341797, + "learning_rate": 9.17249651510534e-06, + "loss": 0.3137, + "step": 7663 + }, + { + "epoch": 0.19394184781233392, + "grad_norm": 3.890514850616455, + "learning_rate": 9.172275259120538e-06, + "loss": 0.1302, + "step": 7664 + }, + { + "epoch": 0.1939671533770276, + "grad_norm": 2.9558260440826416, + "learning_rate": 9.17205397622951e-06, + "loss": 0.1361, + "step": 7665 + }, + { + "epoch": 0.1939924589417213, + "grad_norm": 8.09666919708252, + "learning_rate": 9.171832666433681e-06, + "loss": 0.1619, + "step": 7666 + }, + { + "epoch": 0.19401776450641495, + "grad_norm": 6.242908000946045, + "learning_rate": 9.17161132973448e-06, + "loss": 0.2545, + "step": 7667 + }, + { + "epoch": 0.19404307007110863, + "grad_norm": 22.403310775756836, + "learning_rate": 9.171389966133335e-06, + "loss": 0.4632, + "step": 7668 + }, + { + "epoch": 0.19406837563580231, + "grad_norm": 5.31709623336792, + "learning_rate": 9.171168575631668e-06, + "loss": 0.1717, + "step": 7669 + }, + { + "epoch": 0.194093681200496, + "grad_norm": 6.141355037689209, + "learning_rate": 9.170947158230914e-06, + "loss": 0.2589, + "step": 7670 + }, + { + "epoch": 0.19411898676518966, + "grad_norm": 4.15690279006958, + "learning_rate": 9.170725713932497e-06, + "loss": 0.145, + "step": 7671 + }, + { + "epoch": 0.19414429232988334, + "grad_norm": 6.660842418670654, + "learning_rate": 9.170504242737846e-06, + "loss": 0.1823, + "step": 7672 + }, + { + "epoch": 0.19416959789457702, + "grad_norm": 4.635875225067139, + "learning_rate": 9.170282744648387e-06, + "loss": 0.1711, + "step": 7673 + }, + { + "epoch": 0.1941949034592707, + "grad_norm": 3.3798677921295166, + "learning_rate": 9.17006121966555e-06, + "loss": 0.1285, + "step": 7674 + }, + { + "epoch": 0.19422020902396436, + "grad_norm": 5.047544479370117, + "learning_rate": 9.169839667790767e-06, + "loss": 0.2374, + "step": 7675 + }, + { + "epoch": 0.19424551458865805, + "grad_norm": 23.642824172973633, + "learning_rate": 9.169618089025462e-06, + "loss": 0.2434, + "step": 7676 + }, + { + "epoch": 0.19427082015335173, + "grad_norm": 6.878213882446289, + "learning_rate": 9.169396483371067e-06, + "loss": 0.223, + "step": 7677 + }, + { + "epoch": 0.1942961257180454, + "grad_norm": 4.623343467712402, + "learning_rate": 9.169174850829008e-06, + "loss": 0.1912, + "step": 7678 + }, + { + "epoch": 0.19432143128273907, + "grad_norm": 11.488605499267578, + "learning_rate": 9.168953191400716e-06, + "loss": 0.2107, + "step": 7679 + }, + { + "epoch": 0.19434673684743276, + "grad_norm": 4.578749656677246, + "learning_rate": 9.16873150508762e-06, + "loss": 0.2313, + "step": 7680 + }, + { + "epoch": 0.19437204241212644, + "grad_norm": 10.994672775268555, + "learning_rate": 9.168509791891152e-06, + "loss": 0.1577, + "step": 7681 + }, + { + "epoch": 0.1943973479768201, + "grad_norm": 5.491765022277832, + "learning_rate": 9.168288051812736e-06, + "loss": 0.1646, + "step": 7682 + }, + { + "epoch": 0.19442265354151378, + "grad_norm": 4.429698944091797, + "learning_rate": 9.168066284853809e-06, + "loss": 0.2017, + "step": 7683 + }, + { + "epoch": 0.19444795910620746, + "grad_norm": 7.146637439727783, + "learning_rate": 9.167844491015797e-06, + "loss": 0.1852, + "step": 7684 + }, + { + "epoch": 0.19447326467090112, + "grad_norm": 3.5178146362304688, + "learning_rate": 9.167622670300131e-06, + "loss": 0.1736, + "step": 7685 + }, + { + "epoch": 0.1944985702355948, + "grad_norm": 9.020615577697754, + "learning_rate": 9.167400822708241e-06, + "loss": 0.1893, + "step": 7686 + }, + { + "epoch": 0.1945238758002885, + "grad_norm": 2.732459306716919, + "learning_rate": 9.167178948241559e-06, + "loss": 0.0786, + "step": 7687 + }, + { + "epoch": 0.19454918136498217, + "grad_norm": 7.593354225158691, + "learning_rate": 9.166957046901515e-06, + "loss": 0.2099, + "step": 7688 + }, + { + "epoch": 0.19457448692967583, + "grad_norm": 8.805987358093262, + "learning_rate": 9.16673511868954e-06, + "loss": 0.306, + "step": 7689 + }, + { + "epoch": 0.1945997924943695, + "grad_norm": 7.09770393371582, + "learning_rate": 9.166513163607067e-06, + "loss": 0.324, + "step": 7690 + }, + { + "epoch": 0.1946250980590632, + "grad_norm": 8.539405822753906, + "learning_rate": 9.166291181655522e-06, + "loss": 0.2223, + "step": 7691 + }, + { + "epoch": 0.19465040362375685, + "grad_norm": 5.881454944610596, + "learning_rate": 9.166069172836343e-06, + "loss": 0.2614, + "step": 7692 + }, + { + "epoch": 0.19467570918845054, + "grad_norm": 15.495336532592773, + "learning_rate": 9.165847137150958e-06, + "loss": 0.3246, + "step": 7693 + }, + { + "epoch": 0.19470101475314422, + "grad_norm": 6.640043258666992, + "learning_rate": 9.1656250746008e-06, + "loss": 0.1759, + "step": 7694 + }, + { + "epoch": 0.1947263203178379, + "grad_norm": 5.032530307769775, + "learning_rate": 9.1654029851873e-06, + "loss": 0.152, + "step": 7695 + }, + { + "epoch": 0.19475162588253156, + "grad_norm": 8.661602020263672, + "learning_rate": 9.165180868911892e-06, + "loss": 0.2502, + "step": 7696 + }, + { + "epoch": 0.19477693144722524, + "grad_norm": 22.906389236450195, + "learning_rate": 9.164958725776008e-06, + "loss": 0.143, + "step": 7697 + }, + { + "epoch": 0.19480223701191893, + "grad_norm": 7.343883991241455, + "learning_rate": 9.164736555781077e-06, + "loss": 0.2338, + "step": 7698 + }, + { + "epoch": 0.19482754257661258, + "grad_norm": 4.807402610778809, + "learning_rate": 9.164514358928538e-06, + "loss": 0.1475, + "step": 7699 + }, + { + "epoch": 0.19485284814130627, + "grad_norm": 6.253566265106201, + "learning_rate": 9.164292135219819e-06, + "loss": 0.222, + "step": 7700 + }, + { + "epoch": 0.19487815370599995, + "grad_norm": 6.903411865234375, + "learning_rate": 9.164069884656354e-06, + "loss": 0.1417, + "step": 7701 + }, + { + "epoch": 0.19490345927069364, + "grad_norm": 4.7483038902282715, + "learning_rate": 9.163847607239577e-06, + "loss": 0.176, + "step": 7702 + }, + { + "epoch": 0.1949287648353873, + "grad_norm": 6.878358840942383, + "learning_rate": 9.163625302970922e-06, + "loss": 0.222, + "step": 7703 + }, + { + "epoch": 0.19495407040008098, + "grad_norm": 9.230731964111328, + "learning_rate": 9.163402971851822e-06, + "loss": 0.2016, + "step": 7704 + }, + { + "epoch": 0.19497937596477466, + "grad_norm": 5.254789352416992, + "learning_rate": 9.16318061388371e-06, + "loss": 0.1494, + "step": 7705 + }, + { + "epoch": 0.19500468152946834, + "grad_norm": 6.816583156585693, + "learning_rate": 9.162958229068021e-06, + "loss": 0.2481, + "step": 7706 + }, + { + "epoch": 0.195029987094162, + "grad_norm": 5.0328779220581055, + "learning_rate": 9.16273581740619e-06, + "loss": 0.203, + "step": 7707 + }, + { + "epoch": 0.19505529265885568, + "grad_norm": 12.535542488098145, + "learning_rate": 9.16251337889965e-06, + "loss": 0.322, + "step": 7708 + }, + { + "epoch": 0.19508059822354937, + "grad_norm": 2.2619199752807617, + "learning_rate": 9.162290913549833e-06, + "loss": 0.1496, + "step": 7709 + }, + { + "epoch": 0.19510590378824302, + "grad_norm": 3.7975645065307617, + "learning_rate": 9.162068421358177e-06, + "loss": 0.215, + "step": 7710 + }, + { + "epoch": 0.1951312093529367, + "grad_norm": 4.118989944458008, + "learning_rate": 9.161845902326117e-06, + "loss": 0.1881, + "step": 7711 + }, + { + "epoch": 0.1951565149176304, + "grad_norm": 7.686540126800537, + "learning_rate": 9.161623356455086e-06, + "loss": 0.1749, + "step": 7712 + }, + { + "epoch": 0.19518182048232408, + "grad_norm": 3.2530298233032227, + "learning_rate": 9.161400783746522e-06, + "loss": 0.1418, + "step": 7713 + }, + { + "epoch": 0.19520712604701773, + "grad_norm": 3.866123914718628, + "learning_rate": 9.161178184201856e-06, + "loss": 0.1526, + "step": 7714 + }, + { + "epoch": 0.19523243161171142, + "grad_norm": 3.0114758014678955, + "learning_rate": 9.160955557822525e-06, + "loss": 0.1265, + "step": 7715 + }, + { + "epoch": 0.1952577371764051, + "grad_norm": 4.826830863952637, + "learning_rate": 9.160732904609969e-06, + "loss": 0.2024, + "step": 7716 + }, + { + "epoch": 0.19528304274109876, + "grad_norm": 5.861417770385742, + "learning_rate": 9.16051022456562e-06, + "loss": 0.164, + "step": 7717 + }, + { + "epoch": 0.19530834830579244, + "grad_norm": 27.975337982177734, + "learning_rate": 9.160287517690912e-06, + "loss": 0.2263, + "step": 7718 + }, + { + "epoch": 0.19533365387048612, + "grad_norm": 2.9780304431915283, + "learning_rate": 9.160064783987286e-06, + "loss": 0.1996, + "step": 7719 + }, + { + "epoch": 0.1953589594351798, + "grad_norm": 10.701652526855469, + "learning_rate": 9.159842023456175e-06, + "loss": 0.3278, + "step": 7720 + }, + { + "epoch": 0.19538426499987346, + "grad_norm": 8.62348461151123, + "learning_rate": 9.159619236099016e-06, + "loss": 0.253, + "step": 7721 + }, + { + "epoch": 0.19540957056456715, + "grad_norm": 7.540221691131592, + "learning_rate": 9.159396421917246e-06, + "loss": 0.1886, + "step": 7722 + }, + { + "epoch": 0.19543487612926083, + "grad_norm": 3.6051182746887207, + "learning_rate": 9.159173580912303e-06, + "loss": 0.0841, + "step": 7723 + }, + { + "epoch": 0.1954601816939545, + "grad_norm": 5.056974411010742, + "learning_rate": 9.158950713085625e-06, + "loss": 0.1417, + "step": 7724 + }, + { + "epoch": 0.19548548725864817, + "grad_norm": 7.127732753753662, + "learning_rate": 9.158727818438646e-06, + "loss": 0.2832, + "step": 7725 + }, + { + "epoch": 0.19551079282334186, + "grad_norm": 4.7398362159729, + "learning_rate": 9.158504896972803e-06, + "loss": 0.1987, + "step": 7726 + }, + { + "epoch": 0.19553609838803554, + "grad_norm": 6.56605863571167, + "learning_rate": 9.158281948689538e-06, + "loss": 0.2134, + "step": 7727 + }, + { + "epoch": 0.1955614039527292, + "grad_norm": 3.0559475421905518, + "learning_rate": 9.158058973590288e-06, + "loss": 0.1703, + "step": 7728 + }, + { + "epoch": 0.19558670951742288, + "grad_norm": 5.726400375366211, + "learning_rate": 9.157835971676485e-06, + "loss": 0.1779, + "step": 7729 + }, + { + "epoch": 0.19561201508211656, + "grad_norm": 5.666218280792236, + "learning_rate": 9.157612942949572e-06, + "loss": 0.1709, + "step": 7730 + }, + { + "epoch": 0.19563732064681022, + "grad_norm": 6.699759483337402, + "learning_rate": 9.157389887410987e-06, + "loss": 0.2739, + "step": 7731 + }, + { + "epoch": 0.1956626262115039, + "grad_norm": 4.84144926071167, + "learning_rate": 9.15716680506217e-06, + "loss": 0.2385, + "step": 7732 + }, + { + "epoch": 0.1956879317761976, + "grad_norm": 9.674922943115234, + "learning_rate": 9.156943695904556e-06, + "loss": 0.216, + "step": 7733 + }, + { + "epoch": 0.19571323734089127, + "grad_norm": 5.811367988586426, + "learning_rate": 9.156720559939585e-06, + "loss": 0.1273, + "step": 7734 + }, + { + "epoch": 0.19573854290558493, + "grad_norm": 4.3535637855529785, + "learning_rate": 9.156497397168695e-06, + "loss": 0.1689, + "step": 7735 + }, + { + "epoch": 0.1957638484702786, + "grad_norm": 6.209511756896973, + "learning_rate": 9.15627420759333e-06, + "loss": 0.1808, + "step": 7736 + }, + { + "epoch": 0.1957891540349723, + "grad_norm": 5.133718967437744, + "learning_rate": 9.156050991214923e-06, + "loss": 0.0941, + "step": 7737 + }, + { + "epoch": 0.19581445959966598, + "grad_norm": 4.4322190284729, + "learning_rate": 9.155827748034916e-06, + "loss": 0.128, + "step": 7738 + }, + { + "epoch": 0.19583976516435964, + "grad_norm": 6.581161022186279, + "learning_rate": 9.15560447805475e-06, + "loss": 0.2339, + "step": 7739 + }, + { + "epoch": 0.19586507072905332, + "grad_norm": 9.809861183166504, + "learning_rate": 9.155381181275864e-06, + "loss": 0.2328, + "step": 7740 + }, + { + "epoch": 0.195890376293747, + "grad_norm": 9.118487358093262, + "learning_rate": 9.155157857699697e-06, + "loss": 0.1375, + "step": 7741 + }, + { + "epoch": 0.19591568185844066, + "grad_norm": 18.385835647583008, + "learning_rate": 9.154934507327691e-06, + "loss": 0.3672, + "step": 7742 + }, + { + "epoch": 0.19594098742313434, + "grad_norm": 7.146213531494141, + "learning_rate": 9.154711130161285e-06, + "loss": 0.1385, + "step": 7743 + }, + { + "epoch": 0.19596629298782803, + "grad_norm": 5.771479606628418, + "learning_rate": 9.154487726201918e-06, + "loss": 0.2128, + "step": 7744 + }, + { + "epoch": 0.1959915985525217, + "grad_norm": 9.609026908874512, + "learning_rate": 9.154264295451034e-06, + "loss": 0.3028, + "step": 7745 + }, + { + "epoch": 0.19601690411721537, + "grad_norm": 2.023775815963745, + "learning_rate": 9.154040837910072e-06, + "loss": 0.1155, + "step": 7746 + }, + { + "epoch": 0.19604220968190905, + "grad_norm": 3.349984645843506, + "learning_rate": 9.153817353580473e-06, + "loss": 0.1403, + "step": 7747 + }, + { + "epoch": 0.19606751524660274, + "grad_norm": 6.312962055206299, + "learning_rate": 9.15359384246368e-06, + "loss": 0.1998, + "step": 7748 + }, + { + "epoch": 0.1960928208112964, + "grad_norm": 5.296465873718262, + "learning_rate": 9.15337030456113e-06, + "loss": 0.1339, + "step": 7749 + }, + { + "epoch": 0.19611812637599008, + "grad_norm": 4.116501808166504, + "learning_rate": 9.15314673987427e-06, + "loss": 0.1238, + "step": 7750 + }, + { + "epoch": 0.19614343194068376, + "grad_norm": 4.44504451751709, + "learning_rate": 9.15292314840454e-06, + "loss": 0.1396, + "step": 7751 + }, + { + "epoch": 0.19616873750537744, + "grad_norm": 3.462054967880249, + "learning_rate": 9.152699530153378e-06, + "loss": 0.117, + "step": 7752 + }, + { + "epoch": 0.1961940430700711, + "grad_norm": 5.2954559326171875, + "learning_rate": 9.152475885122232e-06, + "loss": 0.2482, + "step": 7753 + }, + { + "epoch": 0.19621934863476478, + "grad_norm": 4.2955827713012695, + "learning_rate": 9.15225221331254e-06, + "loss": 0.2003, + "step": 7754 + }, + { + "epoch": 0.19624465419945847, + "grad_norm": 4.056842803955078, + "learning_rate": 9.152028514725745e-06, + "loss": 0.1725, + "step": 7755 + }, + { + "epoch": 0.19626995976415212, + "grad_norm": 13.765250205993652, + "learning_rate": 9.151804789363292e-06, + "loss": 0.2771, + "step": 7756 + }, + { + "epoch": 0.1962952653288458, + "grad_norm": 4.845197677612305, + "learning_rate": 9.151581037226622e-06, + "loss": 0.1192, + "step": 7757 + }, + { + "epoch": 0.1963205708935395, + "grad_norm": 3.01301908493042, + "learning_rate": 9.151357258317179e-06, + "loss": 0.1019, + "step": 7758 + }, + { + "epoch": 0.19634587645823318, + "grad_norm": 5.914816379547119, + "learning_rate": 9.151133452636402e-06, + "loss": 0.2098, + "step": 7759 + }, + { + "epoch": 0.19637118202292683, + "grad_norm": 3.3095829486846924, + "learning_rate": 9.15090962018574e-06, + "loss": 0.1672, + "step": 7760 + }, + { + "epoch": 0.19639648758762052, + "grad_norm": 6.852296352386475, + "learning_rate": 9.150685760966635e-06, + "loss": 0.2353, + "step": 7761 + }, + { + "epoch": 0.1964217931523142, + "grad_norm": 5.766999244689941, + "learning_rate": 9.150461874980528e-06, + "loss": 0.2136, + "step": 7762 + }, + { + "epoch": 0.19644709871700786, + "grad_norm": 3.1473820209503174, + "learning_rate": 9.150237962228864e-06, + "loss": 0.1644, + "step": 7763 + }, + { + "epoch": 0.19647240428170154, + "grad_norm": 7.138420581817627, + "learning_rate": 9.150014022713089e-06, + "loss": 0.1399, + "step": 7764 + }, + { + "epoch": 0.19649770984639522, + "grad_norm": 11.460942268371582, + "learning_rate": 9.149790056434643e-06, + "loss": 0.3514, + "step": 7765 + }, + { + "epoch": 0.1965230154110889, + "grad_norm": 5.401416778564453, + "learning_rate": 9.149566063394976e-06, + "loss": 0.1611, + "step": 7766 + }, + { + "epoch": 0.19654832097578256, + "grad_norm": 10.59506893157959, + "learning_rate": 9.149342043595528e-06, + "loss": 0.3145, + "step": 7767 + }, + { + "epoch": 0.19657362654047625, + "grad_norm": 15.378829002380371, + "learning_rate": 9.149117997037744e-06, + "loss": 0.1379, + "step": 7768 + }, + { + "epoch": 0.19659893210516993, + "grad_norm": 6.140577793121338, + "learning_rate": 9.14889392372307e-06, + "loss": 0.2399, + "step": 7769 + }, + { + "epoch": 0.19662423766986362, + "grad_norm": 4.069941997528076, + "learning_rate": 9.148669823652952e-06, + "loss": 0.2667, + "step": 7770 + }, + { + "epoch": 0.19664954323455727, + "grad_norm": 10.599328994750977, + "learning_rate": 9.148445696828834e-06, + "loss": 0.2702, + "step": 7771 + }, + { + "epoch": 0.19667484879925096, + "grad_norm": 4.046878337860107, + "learning_rate": 9.14822154325216e-06, + "loss": 0.1192, + "step": 7772 + }, + { + "epoch": 0.19670015436394464, + "grad_norm": 4.71923828125, + "learning_rate": 9.14799736292438e-06, + "loss": 0.1507, + "step": 7773 + }, + { + "epoch": 0.1967254599286383, + "grad_norm": 3.2780954837799072, + "learning_rate": 9.147773155846932e-06, + "loss": 0.1602, + "step": 7774 + }, + { + "epoch": 0.19675076549333198, + "grad_norm": 11.294175148010254, + "learning_rate": 9.14754892202127e-06, + "loss": 0.264, + "step": 7775 + }, + { + "epoch": 0.19677607105802566, + "grad_norm": 9.930092811584473, + "learning_rate": 9.147324661448835e-06, + "loss": 0.3081, + "step": 7776 + }, + { + "epoch": 0.19680137662271935, + "grad_norm": 12.45425033569336, + "learning_rate": 9.147100374131074e-06, + "loss": 0.2045, + "step": 7777 + }, + { + "epoch": 0.196826682187413, + "grad_norm": 7.026828289031982, + "learning_rate": 9.146876060069437e-06, + "loss": 0.2149, + "step": 7778 + }, + { + "epoch": 0.1968519877521067, + "grad_norm": 2.704655408859253, + "learning_rate": 9.146651719265365e-06, + "loss": 0.1285, + "step": 7779 + }, + { + "epoch": 0.19687729331680037, + "grad_norm": 6.7525105476379395, + "learning_rate": 9.14642735172031e-06, + "loss": 0.2062, + "step": 7780 + }, + { + "epoch": 0.19690259888149403, + "grad_norm": 5.771271228790283, + "learning_rate": 9.146202957435712e-06, + "loss": 0.2428, + "step": 7781 + }, + { + "epoch": 0.1969279044461877, + "grad_norm": 4.199929237365723, + "learning_rate": 9.145978536413026e-06, + "loss": 0.1524, + "step": 7782 + }, + { + "epoch": 0.1969532100108814, + "grad_norm": 8.230833053588867, + "learning_rate": 9.145754088653694e-06, + "loss": 0.1983, + "step": 7783 + }, + { + "epoch": 0.19697851557557508, + "grad_norm": 4.713881015777588, + "learning_rate": 9.145529614159165e-06, + "loss": 0.1628, + "step": 7784 + }, + { + "epoch": 0.19700382114026874, + "grad_norm": 4.914071083068848, + "learning_rate": 9.145305112930889e-06, + "loss": 0.1512, + "step": 7785 + }, + { + "epoch": 0.19702912670496242, + "grad_norm": 14.373673439025879, + "learning_rate": 9.14508058497031e-06, + "loss": 0.204, + "step": 7786 + }, + { + "epoch": 0.1970544322696561, + "grad_norm": 9.243921279907227, + "learning_rate": 9.144856030278877e-06, + "loss": 0.2573, + "step": 7787 + }, + { + "epoch": 0.19707973783434976, + "grad_norm": 5.764886379241943, + "learning_rate": 9.144631448858038e-06, + "loss": 0.2827, + "step": 7788 + }, + { + "epoch": 0.19710504339904344, + "grad_norm": 7.734719753265381, + "learning_rate": 9.144406840709243e-06, + "loss": 0.2407, + "step": 7789 + }, + { + "epoch": 0.19713034896373713, + "grad_norm": 4.003348350524902, + "learning_rate": 9.144182205833938e-06, + "loss": 0.0924, + "step": 7790 + }, + { + "epoch": 0.1971556545284308, + "grad_norm": 8.664437294006348, + "learning_rate": 9.143957544233573e-06, + "loss": 0.1825, + "step": 7791 + }, + { + "epoch": 0.19718096009312447, + "grad_norm": 4.20923376083374, + "learning_rate": 9.143732855909596e-06, + "loss": 0.1672, + "step": 7792 + }, + { + "epoch": 0.19720626565781815, + "grad_norm": 12.157539367675781, + "learning_rate": 9.143508140863458e-06, + "loss": 0.2818, + "step": 7793 + }, + { + "epoch": 0.19723157122251184, + "grad_norm": 5.564530372619629, + "learning_rate": 9.143283399096606e-06, + "loss": 0.2169, + "step": 7794 + }, + { + "epoch": 0.1972568767872055, + "grad_norm": 5.519768238067627, + "learning_rate": 9.14305863061049e-06, + "loss": 0.1979, + "step": 7795 + }, + { + "epoch": 0.19728218235189918, + "grad_norm": 12.623595237731934, + "learning_rate": 9.14283383540656e-06, + "loss": 0.2183, + "step": 7796 + }, + { + "epoch": 0.19730748791659286, + "grad_norm": 13.442761421203613, + "learning_rate": 9.142609013486266e-06, + "loss": 0.2693, + "step": 7797 + }, + { + "epoch": 0.19733279348128654, + "grad_norm": 10.930052757263184, + "learning_rate": 9.142384164851055e-06, + "loss": 0.2118, + "step": 7798 + }, + { + "epoch": 0.1973580990459802, + "grad_norm": 6.108536243438721, + "learning_rate": 9.142159289502382e-06, + "loss": 0.1897, + "step": 7799 + }, + { + "epoch": 0.19738340461067388, + "grad_norm": 6.388487339019775, + "learning_rate": 9.14193438744169e-06, + "loss": 0.2618, + "step": 7800 + }, + { + "epoch": 0.19740871017536757, + "grad_norm": 6.656213283538818, + "learning_rate": 9.141709458670439e-06, + "loss": 0.3015, + "step": 7801 + }, + { + "epoch": 0.19743401574006125, + "grad_norm": 10.316140174865723, + "learning_rate": 9.14148450319007e-06, + "loss": 0.2832, + "step": 7802 + }, + { + "epoch": 0.1974593213047549, + "grad_norm": 3.4224343299865723, + "learning_rate": 9.141259521002039e-06, + "loss": 0.1261, + "step": 7803 + }, + { + "epoch": 0.1974846268694486, + "grad_norm": 3.552520275115967, + "learning_rate": 9.141034512107796e-06, + "loss": 0.1942, + "step": 7804 + }, + { + "epoch": 0.19750993243414228, + "grad_norm": 4.991799354553223, + "learning_rate": 9.140809476508791e-06, + "loss": 0.1529, + "step": 7805 + }, + { + "epoch": 0.19753523799883593, + "grad_norm": 14.452225685119629, + "learning_rate": 9.140584414206475e-06, + "loss": 0.2441, + "step": 7806 + }, + { + "epoch": 0.19756054356352962, + "grad_norm": 6.328886032104492, + "learning_rate": 9.140359325202302e-06, + "loss": 0.258, + "step": 7807 + }, + { + "epoch": 0.1975858491282233, + "grad_norm": 4.598789691925049, + "learning_rate": 9.140134209497721e-06, + "loss": 0.1664, + "step": 7808 + }, + { + "epoch": 0.19761115469291698, + "grad_norm": 4.60610294342041, + "learning_rate": 9.139909067094186e-06, + "loss": 0.1499, + "step": 7809 + }, + { + "epoch": 0.19763646025761064, + "grad_norm": 4.427441120147705, + "learning_rate": 9.139683897993146e-06, + "loss": 0.1837, + "step": 7810 + }, + { + "epoch": 0.19766176582230433, + "grad_norm": 10.442099571228027, + "learning_rate": 9.139458702196056e-06, + "loss": 0.197, + "step": 7811 + }, + { + "epoch": 0.197687071386998, + "grad_norm": 5.752194404602051, + "learning_rate": 9.139233479704366e-06, + "loss": 0.2231, + "step": 7812 + }, + { + "epoch": 0.19771237695169167, + "grad_norm": 9.126596450805664, + "learning_rate": 9.139008230519529e-06, + "loss": 0.1341, + "step": 7813 + }, + { + "epoch": 0.19773768251638535, + "grad_norm": 9.045042991638184, + "learning_rate": 9.138782954642998e-06, + "loss": 0.2453, + "step": 7814 + }, + { + "epoch": 0.19776298808107903, + "grad_norm": 4.36015510559082, + "learning_rate": 9.138557652076227e-06, + "loss": 0.2033, + "step": 7815 + }, + { + "epoch": 0.19778829364577272, + "grad_norm": 7.091445446014404, + "learning_rate": 9.138332322820666e-06, + "loss": 0.2658, + "step": 7816 + }, + { + "epoch": 0.19781359921046637, + "grad_norm": 5.452569484710693, + "learning_rate": 9.138106966877768e-06, + "loss": 0.1651, + "step": 7817 + }, + { + "epoch": 0.19783890477516006, + "grad_norm": 8.001421928405762, + "learning_rate": 9.13788158424899e-06, + "loss": 0.1176, + "step": 7818 + }, + { + "epoch": 0.19786421033985374, + "grad_norm": 7.9006218910217285, + "learning_rate": 9.137656174935785e-06, + "loss": 0.164, + "step": 7819 + }, + { + "epoch": 0.1978895159045474, + "grad_norm": 6.258629322052002, + "learning_rate": 9.137430738939604e-06, + "loss": 0.2513, + "step": 7820 + }, + { + "epoch": 0.19791482146924108, + "grad_norm": 5.3595662117004395, + "learning_rate": 9.1372052762619e-06, + "loss": 0.1971, + "step": 7821 + }, + { + "epoch": 0.19794012703393477, + "grad_norm": 8.342884063720703, + "learning_rate": 9.13697978690413e-06, + "loss": 0.3013, + "step": 7822 + }, + { + "epoch": 0.19796543259862845, + "grad_norm": 8.220264434814453, + "learning_rate": 9.136754270867746e-06, + "loss": 0.2498, + "step": 7823 + }, + { + "epoch": 0.1979907381633221, + "grad_norm": 4.386321544647217, + "learning_rate": 9.136528728154204e-06, + "loss": 0.1551, + "step": 7824 + }, + { + "epoch": 0.1980160437280158, + "grad_norm": 12.664323806762695, + "learning_rate": 9.136303158764956e-06, + "loss": 0.2513, + "step": 7825 + }, + { + "epoch": 0.19804134929270947, + "grad_norm": 4.1855387687683105, + "learning_rate": 9.13607756270146e-06, + "loss": 0.1638, + "step": 7826 + }, + { + "epoch": 0.19806665485740313, + "grad_norm": 10.73205852508545, + "learning_rate": 9.13585193996517e-06, + "loss": 0.3383, + "step": 7827 + }, + { + "epoch": 0.1980919604220968, + "grad_norm": 6.579288482666016, + "learning_rate": 9.13562629055754e-06, + "loss": 0.2161, + "step": 7828 + }, + { + "epoch": 0.1981172659867905, + "grad_norm": 4.050027847290039, + "learning_rate": 9.135400614480023e-06, + "loss": 0.2056, + "step": 7829 + }, + { + "epoch": 0.19814257155148418, + "grad_norm": 9.46254825592041, + "learning_rate": 9.13517491173408e-06, + "loss": 0.3255, + "step": 7830 + }, + { + "epoch": 0.19816787711617784, + "grad_norm": 5.391755104064941, + "learning_rate": 9.13494918232116e-06, + "loss": 0.1918, + "step": 7831 + }, + { + "epoch": 0.19819318268087152, + "grad_norm": 2.7290163040161133, + "learning_rate": 9.134723426242724e-06, + "loss": 0.1314, + "step": 7832 + }, + { + "epoch": 0.1982184882455652, + "grad_norm": 11.200592041015625, + "learning_rate": 9.134497643500224e-06, + "loss": 0.1534, + "step": 7833 + }, + { + "epoch": 0.1982437938102589, + "grad_norm": 4.639080047607422, + "learning_rate": 9.13427183409512e-06, + "loss": 0.118, + "step": 7834 + }, + { + "epoch": 0.19826909937495255, + "grad_norm": 6.068765163421631, + "learning_rate": 9.134045998028865e-06, + "loss": 0.1398, + "step": 7835 + }, + { + "epoch": 0.19829440493964623, + "grad_norm": 5.600907325744629, + "learning_rate": 9.133820135302915e-06, + "loss": 0.1531, + "step": 7836 + }, + { + "epoch": 0.1983197105043399, + "grad_norm": 10.002199172973633, + "learning_rate": 9.133594245918729e-06, + "loss": 0.1836, + "step": 7837 + }, + { + "epoch": 0.19834501606903357, + "grad_norm": 4.813011646270752, + "learning_rate": 9.133368329877763e-06, + "loss": 0.1302, + "step": 7838 + }, + { + "epoch": 0.19837032163372725, + "grad_norm": 10.78818416595459, + "learning_rate": 9.133142387181473e-06, + "loss": 0.2967, + "step": 7839 + }, + { + "epoch": 0.19839562719842094, + "grad_norm": 4.359085559844971, + "learning_rate": 9.132916417831316e-06, + "loss": 0.1984, + "step": 7840 + }, + { + "epoch": 0.19842093276311462, + "grad_norm": 6.233381271362305, + "learning_rate": 9.132690421828751e-06, + "loss": 0.2383, + "step": 7841 + }, + { + "epoch": 0.19844623832780828, + "grad_norm": 6.795114517211914, + "learning_rate": 9.132464399175235e-06, + "loss": 0.2251, + "step": 7842 + }, + { + "epoch": 0.19847154389250196, + "grad_norm": 7.051027297973633, + "learning_rate": 9.132238349872222e-06, + "loss": 0.1674, + "step": 7843 + }, + { + "epoch": 0.19849684945719565, + "grad_norm": 5.202233791351318, + "learning_rate": 9.132012273921173e-06, + "loss": 0.212, + "step": 7844 + }, + { + "epoch": 0.1985221550218893, + "grad_norm": 4.131586074829102, + "learning_rate": 9.131786171323548e-06, + "loss": 0.1714, + "step": 7845 + }, + { + "epoch": 0.19854746058658299, + "grad_norm": 5.064265727996826, + "learning_rate": 9.1315600420808e-06, + "loss": 0.2026, + "step": 7846 + }, + { + "epoch": 0.19857276615127667, + "grad_norm": 4.126780033111572, + "learning_rate": 9.131333886194391e-06, + "loss": 0.2118, + "step": 7847 + }, + { + "epoch": 0.19859807171597035, + "grad_norm": 5.190652370452881, + "learning_rate": 9.131107703665777e-06, + "loss": 0.1667, + "step": 7848 + }, + { + "epoch": 0.198623377280664, + "grad_norm": 6.949294567108154, + "learning_rate": 9.13088149449642e-06, + "loss": 0.2382, + "step": 7849 + }, + { + "epoch": 0.1986486828453577, + "grad_norm": 4.163298606872559, + "learning_rate": 9.130655258687776e-06, + "loss": 0.1788, + "step": 7850 + }, + { + "epoch": 0.19867398841005138, + "grad_norm": 4.0460968017578125, + "learning_rate": 9.130428996241306e-06, + "loss": 0.1546, + "step": 7851 + }, + { + "epoch": 0.19869929397474503, + "grad_norm": 3.5727081298828125, + "learning_rate": 9.130202707158465e-06, + "loss": 0.1864, + "step": 7852 + }, + { + "epoch": 0.19872459953943872, + "grad_norm": 2.814938545227051, + "learning_rate": 9.129976391440717e-06, + "loss": 0.1178, + "step": 7853 + }, + { + "epoch": 0.1987499051041324, + "grad_norm": 5.9205708503723145, + "learning_rate": 9.129750049089517e-06, + "loss": 0.1972, + "step": 7854 + }, + { + "epoch": 0.19877521066882609, + "grad_norm": 5.473782539367676, + "learning_rate": 9.129523680106329e-06, + "loss": 0.1709, + "step": 7855 + }, + { + "epoch": 0.19880051623351974, + "grad_norm": 10.954131126403809, + "learning_rate": 9.12929728449261e-06, + "loss": 0.2562, + "step": 7856 + }, + { + "epoch": 0.19882582179821343, + "grad_norm": 9.927347183227539, + "learning_rate": 9.129070862249821e-06, + "loss": 0.1801, + "step": 7857 + }, + { + "epoch": 0.1988511273629071, + "grad_norm": 9.482071876525879, + "learning_rate": 9.128844413379422e-06, + "loss": 0.2083, + "step": 7858 + }, + { + "epoch": 0.19887643292760077, + "grad_norm": 3.672374963760376, + "learning_rate": 9.128617937882875e-06, + "loss": 0.194, + "step": 7859 + }, + { + "epoch": 0.19890173849229445, + "grad_norm": 10.00898551940918, + "learning_rate": 9.128391435761637e-06, + "loss": 0.313, + "step": 7860 + }, + { + "epoch": 0.19892704405698813, + "grad_norm": 21.53856086730957, + "learning_rate": 9.128164907017171e-06, + "loss": 0.3013, + "step": 7861 + }, + { + "epoch": 0.19895234962168182, + "grad_norm": 8.637887001037598, + "learning_rate": 9.127938351650939e-06, + "loss": 0.2346, + "step": 7862 + }, + { + "epoch": 0.19897765518637547, + "grad_norm": 6.387328624725342, + "learning_rate": 9.1277117696644e-06, + "loss": 0.16, + "step": 7863 + }, + { + "epoch": 0.19900296075106916, + "grad_norm": 8.408289909362793, + "learning_rate": 9.127485161059013e-06, + "loss": 0.2343, + "step": 7864 + }, + { + "epoch": 0.19902826631576284, + "grad_norm": 8.581971168518066, + "learning_rate": 9.127258525836244e-06, + "loss": 0.2354, + "step": 7865 + }, + { + "epoch": 0.19905357188045653, + "grad_norm": 4.58689546585083, + "learning_rate": 9.127031863997552e-06, + "loss": 0.1825, + "step": 7866 + }, + { + "epoch": 0.19907887744515018, + "grad_norm": 9.260891914367676, + "learning_rate": 9.1268051755444e-06, + "loss": 0.1544, + "step": 7867 + }, + { + "epoch": 0.19910418300984387, + "grad_norm": 6.894949913024902, + "learning_rate": 9.126578460478249e-06, + "loss": 0.2104, + "step": 7868 + }, + { + "epoch": 0.19912948857453755, + "grad_norm": 4.875341892242432, + "learning_rate": 9.12635171880056e-06, + "loss": 0.2146, + "step": 7869 + }, + { + "epoch": 0.1991547941392312, + "grad_norm": 9.176263809204102, + "learning_rate": 9.126124950512799e-06, + "loss": 0.3099, + "step": 7870 + }, + { + "epoch": 0.1991800997039249, + "grad_norm": 3.951230049133301, + "learning_rate": 9.125898155616422e-06, + "loss": 0.1311, + "step": 7871 + }, + { + "epoch": 0.19920540526861857, + "grad_norm": 5.386782169342041, + "learning_rate": 9.125671334112897e-06, + "loss": 0.294, + "step": 7872 + }, + { + "epoch": 0.19923071083331226, + "grad_norm": 8.017911911010742, + "learning_rate": 9.125444486003684e-06, + "loss": 0.1493, + "step": 7873 + }, + { + "epoch": 0.19925601639800591, + "grad_norm": 19.80481719970703, + "learning_rate": 9.12521761129025e-06, + "loss": 0.1858, + "step": 7874 + }, + { + "epoch": 0.1992813219626996, + "grad_norm": 4.1731486320495605, + "learning_rate": 9.124990709974052e-06, + "loss": 0.2226, + "step": 7875 + }, + { + "epoch": 0.19930662752739328, + "grad_norm": 11.983566284179688, + "learning_rate": 9.124763782056557e-06, + "loss": 0.2917, + "step": 7876 + }, + { + "epoch": 0.19933193309208694, + "grad_norm": 6.566020965576172, + "learning_rate": 9.124536827539228e-06, + "loss": 0.2401, + "step": 7877 + }, + { + "epoch": 0.19935723865678062, + "grad_norm": 7.088867664337158, + "learning_rate": 9.124309846423527e-06, + "loss": 0.2044, + "step": 7878 + }, + { + "epoch": 0.1993825442214743, + "grad_norm": 9.932748794555664, + "learning_rate": 9.12408283871092e-06, + "loss": 0.3425, + "step": 7879 + }, + { + "epoch": 0.199407849786168, + "grad_norm": 3.853482246398926, + "learning_rate": 9.12385580440287e-06, + "loss": 0.2135, + "step": 7880 + }, + { + "epoch": 0.19943315535086165, + "grad_norm": 3.5274624824523926, + "learning_rate": 9.12362874350084e-06, + "loss": 0.1667, + "step": 7881 + }, + { + "epoch": 0.19945846091555533, + "grad_norm": 5.65885066986084, + "learning_rate": 9.123401656006297e-06, + "loss": 0.1875, + "step": 7882 + }, + { + "epoch": 0.19948376648024901, + "grad_norm": 7.985793590545654, + "learning_rate": 9.123174541920704e-06, + "loss": 0.262, + "step": 7883 + }, + { + "epoch": 0.19950907204494267, + "grad_norm": 7.740478515625, + "learning_rate": 9.122947401245523e-06, + "loss": 0.1688, + "step": 7884 + }, + { + "epoch": 0.19953437760963635, + "grad_norm": 3.84360408782959, + "learning_rate": 9.122720233982223e-06, + "loss": 0.1694, + "step": 7885 + }, + { + "epoch": 0.19955968317433004, + "grad_norm": 4.080310344696045, + "learning_rate": 9.122493040132266e-06, + "loss": 0.1407, + "step": 7886 + }, + { + "epoch": 0.19958498873902372, + "grad_norm": 3.6544463634490967, + "learning_rate": 9.122265819697119e-06, + "loss": 0.1114, + "step": 7887 + }, + { + "epoch": 0.19961029430371738, + "grad_norm": 12.352694511413574, + "learning_rate": 9.122038572678246e-06, + "loss": 0.2116, + "step": 7888 + }, + { + "epoch": 0.19963559986841106, + "grad_norm": 3.526496648788452, + "learning_rate": 9.121811299077113e-06, + "loss": 0.1654, + "step": 7889 + }, + { + "epoch": 0.19966090543310475, + "grad_norm": 4.161818981170654, + "learning_rate": 9.121583998895187e-06, + "loss": 0.1424, + "step": 7890 + }, + { + "epoch": 0.1996862109977984, + "grad_norm": 3.2258193492889404, + "learning_rate": 9.12135667213393e-06, + "loss": 0.08, + "step": 7891 + }, + { + "epoch": 0.1997115165624921, + "grad_norm": 3.4736597537994385, + "learning_rate": 9.121129318794813e-06, + "loss": 0.2045, + "step": 7892 + }, + { + "epoch": 0.19973682212718577, + "grad_norm": 5.803914546966553, + "learning_rate": 9.120901938879297e-06, + "loss": 0.2769, + "step": 7893 + }, + { + "epoch": 0.19976212769187945, + "grad_norm": 8.204154968261719, + "learning_rate": 9.120674532388853e-06, + "loss": 0.2652, + "step": 7894 + }, + { + "epoch": 0.1997874332565731, + "grad_norm": 2.749875545501709, + "learning_rate": 9.120447099324945e-06, + "loss": 0.1161, + "step": 7895 + }, + { + "epoch": 0.1998127388212668, + "grad_norm": 4.021431922912598, + "learning_rate": 9.120219639689038e-06, + "loss": 0.1861, + "step": 7896 + }, + { + "epoch": 0.19983804438596048, + "grad_norm": 12.725284576416016, + "learning_rate": 9.119992153482604e-06, + "loss": 0.2474, + "step": 7897 + }, + { + "epoch": 0.19986334995065416, + "grad_norm": 8.9203462600708, + "learning_rate": 9.119764640707106e-06, + "loss": 0.3527, + "step": 7898 + }, + { + "epoch": 0.19988865551534782, + "grad_norm": 9.179801940917969, + "learning_rate": 9.119537101364012e-06, + "loss": 0.1871, + "step": 7899 + }, + { + "epoch": 0.1999139610800415, + "grad_norm": 4.480133533477783, + "learning_rate": 9.119309535454789e-06, + "loss": 0.156, + "step": 7900 + }, + { + "epoch": 0.1999392666447352, + "grad_norm": 8.595481872558594, + "learning_rate": 9.119081942980904e-06, + "loss": 0.277, + "step": 7901 + }, + { + "epoch": 0.19996457220942884, + "grad_norm": 4.541461944580078, + "learning_rate": 9.118854323943828e-06, + "loss": 0.1265, + "step": 7902 + }, + { + "epoch": 0.19998987777412253, + "grad_norm": 6.351028919219971, + "learning_rate": 9.118626678345027e-06, + "loss": 0.3053, + "step": 7903 + }, + { + "epoch": 0.2000151833388162, + "grad_norm": 5.371601581573486, + "learning_rate": 9.118399006185968e-06, + "loss": 0.1541, + "step": 7904 + }, + { + "epoch": 0.2000151833388162, + "eval_loss": 0.21083411574363708, + "eval_runtime": 69.8338, + "eval_samples_per_second": 45.737, + "eval_steps_per_second": 5.728, + "step": 7904 + }, + { + "epoch": 0.2000404889035099, + "grad_norm": 6.395238876342773, + "learning_rate": 9.11817130746812e-06, + "loss": 0.26, + "step": 7905 + }, + { + "epoch": 0.20006579446820355, + "grad_norm": 4.720654010772705, + "learning_rate": 9.11794358219295e-06, + "loss": 0.1812, + "step": 7906 + }, + { + "epoch": 0.20009110003289723, + "grad_norm": 16.208133697509766, + "learning_rate": 9.117715830361928e-06, + "loss": 0.3818, + "step": 7907 + }, + { + "epoch": 0.20011640559759092, + "grad_norm": 3.518639087677002, + "learning_rate": 9.117488051976524e-06, + "loss": 0.1167, + "step": 7908 + }, + { + "epoch": 0.20014171116228457, + "grad_norm": 5.174818515777588, + "learning_rate": 9.117260247038205e-06, + "loss": 0.1867, + "step": 7909 + }, + { + "epoch": 0.20016701672697826, + "grad_norm": 3.947333574295044, + "learning_rate": 9.11703241554844e-06, + "loss": 0.1674, + "step": 7910 + }, + { + "epoch": 0.20019232229167194, + "grad_norm": 8.933723449707031, + "learning_rate": 9.116804557508698e-06, + "loss": 0.1962, + "step": 7911 + }, + { + "epoch": 0.20021762785636563, + "grad_norm": 4.887424468994141, + "learning_rate": 9.11657667292045e-06, + "loss": 0.1459, + "step": 7912 + }, + { + "epoch": 0.20024293342105928, + "grad_norm": 5.62775182723999, + "learning_rate": 9.116348761785165e-06, + "loss": 0.2247, + "step": 7913 + }, + { + "epoch": 0.20026823898575297, + "grad_norm": 5.441728591918945, + "learning_rate": 9.116120824104313e-06, + "loss": 0.1217, + "step": 7914 + }, + { + "epoch": 0.20029354455044665, + "grad_norm": 8.75740909576416, + "learning_rate": 9.115892859879362e-06, + "loss": 0.2562, + "step": 7915 + }, + { + "epoch": 0.2003188501151403, + "grad_norm": 9.149409294128418, + "learning_rate": 9.115664869111786e-06, + "loss": 0.3546, + "step": 7916 + }, + { + "epoch": 0.200344155679834, + "grad_norm": 12.610291481018066, + "learning_rate": 9.115436851803052e-06, + "loss": 0.3024, + "step": 7917 + }, + { + "epoch": 0.20036946124452767, + "grad_norm": 5.319777011871338, + "learning_rate": 9.115208807954629e-06, + "loss": 0.2108, + "step": 7918 + }, + { + "epoch": 0.20039476680922136, + "grad_norm": 3.704057455062866, + "learning_rate": 9.114980737567991e-06, + "loss": 0.2035, + "step": 7919 + }, + { + "epoch": 0.20042007237391501, + "grad_norm": 5.716034889221191, + "learning_rate": 9.114752640644608e-06, + "loss": 0.1945, + "step": 7920 + }, + { + "epoch": 0.2004453779386087, + "grad_norm": 4.415262699127197, + "learning_rate": 9.114524517185952e-06, + "loss": 0.1191, + "step": 7921 + }, + { + "epoch": 0.20047068350330238, + "grad_norm": 4.36647367477417, + "learning_rate": 9.114296367193491e-06, + "loss": 0.11, + "step": 7922 + }, + { + "epoch": 0.20049598906799604, + "grad_norm": 10.291486740112305, + "learning_rate": 9.1140681906687e-06, + "loss": 0.1894, + "step": 7923 + }, + { + "epoch": 0.20052129463268972, + "grad_norm": 6.697264194488525, + "learning_rate": 9.113839987613046e-06, + "loss": 0.2713, + "step": 7924 + }, + { + "epoch": 0.2005466001973834, + "grad_norm": 4.5609822273254395, + "learning_rate": 9.113611758028005e-06, + "loss": 0.175, + "step": 7925 + }, + { + "epoch": 0.2005719057620771, + "grad_norm": 4.68949556350708, + "learning_rate": 9.113383501915045e-06, + "loss": 0.1476, + "step": 7926 + }, + { + "epoch": 0.20059721132677075, + "grad_norm": 14.989462852478027, + "learning_rate": 9.113155219275642e-06, + "loss": 0.2628, + "step": 7927 + }, + { + "epoch": 0.20062251689146443, + "grad_norm": 5.594372272491455, + "learning_rate": 9.112926910111265e-06, + "loss": 0.2621, + "step": 7928 + }, + { + "epoch": 0.20064782245615811, + "grad_norm": 5.50510835647583, + "learning_rate": 9.112698574423387e-06, + "loss": 0.1459, + "step": 7929 + }, + { + "epoch": 0.2006731280208518, + "grad_norm": 10.796647071838379, + "learning_rate": 9.112470212213482e-06, + "loss": 0.3171, + "step": 7930 + }, + { + "epoch": 0.20069843358554545, + "grad_norm": 3.6109366416931152, + "learning_rate": 9.112241823483022e-06, + "loss": 0.1847, + "step": 7931 + }, + { + "epoch": 0.20072373915023914, + "grad_norm": 6.341274261474609, + "learning_rate": 9.112013408233478e-06, + "loss": 0.2802, + "step": 7932 + }, + { + "epoch": 0.20074904471493282, + "grad_norm": 4.5921549797058105, + "learning_rate": 9.111784966466325e-06, + "loss": 0.2628, + "step": 7933 + }, + { + "epoch": 0.20077435027962648, + "grad_norm": 22.23598861694336, + "learning_rate": 9.111556498183037e-06, + "loss": 0.4189, + "step": 7934 + }, + { + "epoch": 0.20079965584432016, + "grad_norm": 3.700943946838379, + "learning_rate": 9.111328003385083e-06, + "loss": 0.2134, + "step": 7935 + }, + { + "epoch": 0.20082496140901385, + "grad_norm": 3.9856412410736084, + "learning_rate": 9.111099482073942e-06, + "loss": 0.1872, + "step": 7936 + }, + { + "epoch": 0.20085026697370753, + "grad_norm": 12.539490699768066, + "learning_rate": 9.110870934251084e-06, + "loss": 0.2031, + "step": 7937 + }, + { + "epoch": 0.2008755725384012, + "grad_norm": 4.887551307678223, + "learning_rate": 9.110642359917984e-06, + "loss": 0.1688, + "step": 7938 + }, + { + "epoch": 0.20090087810309487, + "grad_norm": 5.194907188415527, + "learning_rate": 9.110413759076115e-06, + "loss": 0.1386, + "step": 7939 + }, + { + "epoch": 0.20092618366778855, + "grad_norm": 5.178788185119629, + "learning_rate": 9.110185131726954e-06, + "loss": 0.2265, + "step": 7940 + }, + { + "epoch": 0.2009514892324822, + "grad_norm": 13.537059783935547, + "learning_rate": 9.109956477871973e-06, + "loss": 0.2493, + "step": 7941 + }, + { + "epoch": 0.2009767947971759, + "grad_norm": 4.864574432373047, + "learning_rate": 9.109727797512647e-06, + "loss": 0.1862, + "step": 7942 + }, + { + "epoch": 0.20100210036186958, + "grad_norm": 4.656571388244629, + "learning_rate": 9.109499090650452e-06, + "loss": 0.2378, + "step": 7943 + }, + { + "epoch": 0.20102740592656326, + "grad_norm": 7.474507808685303, + "learning_rate": 9.109270357286861e-06, + "loss": 0.2887, + "step": 7944 + }, + { + "epoch": 0.20105271149125692, + "grad_norm": 3.2460289001464844, + "learning_rate": 9.10904159742335e-06, + "loss": 0.1512, + "step": 7945 + }, + { + "epoch": 0.2010780170559506, + "grad_norm": 7.4515838623046875, + "learning_rate": 9.108812811061394e-06, + "loss": 0.256, + "step": 7946 + }, + { + "epoch": 0.2011033226206443, + "grad_norm": 6.704629898071289, + "learning_rate": 9.10858399820247e-06, + "loss": 0.1767, + "step": 7947 + }, + { + "epoch": 0.20112862818533794, + "grad_norm": 7.742275238037109, + "learning_rate": 9.108355158848049e-06, + "loss": 0.2631, + "step": 7948 + }, + { + "epoch": 0.20115393375003163, + "grad_norm": 4.236429214477539, + "learning_rate": 9.108126292999613e-06, + "loss": 0.1197, + "step": 7949 + }, + { + "epoch": 0.2011792393147253, + "grad_norm": 6.525941371917725, + "learning_rate": 9.107897400658632e-06, + "loss": 0.2874, + "step": 7950 + }, + { + "epoch": 0.201204544879419, + "grad_norm": 4.0183234214782715, + "learning_rate": 9.107668481826584e-06, + "loss": 0.1435, + "step": 7951 + }, + { + "epoch": 0.20122985044411265, + "grad_norm": 4.714328289031982, + "learning_rate": 9.10743953650495e-06, + "loss": 0.22, + "step": 7952 + }, + { + "epoch": 0.20125515600880634, + "grad_norm": 9.163739204406738, + "learning_rate": 9.1072105646952e-06, + "loss": 0.3388, + "step": 7953 + }, + { + "epoch": 0.20128046157350002, + "grad_norm": 8.184938430786133, + "learning_rate": 9.106981566398813e-06, + "loss": 0.3114, + "step": 7954 + }, + { + "epoch": 0.20130576713819368, + "grad_norm": 5.955254077911377, + "learning_rate": 9.106752541617265e-06, + "loss": 0.2118, + "step": 7955 + }, + { + "epoch": 0.20133107270288736, + "grad_norm": 6.563122749328613, + "learning_rate": 9.106523490352035e-06, + "loss": 0.1982, + "step": 7956 + }, + { + "epoch": 0.20135637826758104, + "grad_norm": 6.249114036560059, + "learning_rate": 9.106294412604599e-06, + "loss": 0.2417, + "step": 7957 + }, + { + "epoch": 0.20138168383227473, + "grad_norm": 6.848107814788818, + "learning_rate": 9.106065308376436e-06, + "loss": 0.1108, + "step": 7958 + }, + { + "epoch": 0.20140698939696838, + "grad_norm": 6.434474945068359, + "learning_rate": 9.105836177669018e-06, + "loss": 0.172, + "step": 7959 + }, + { + "epoch": 0.20143229496166207, + "grad_norm": 3.502516269683838, + "learning_rate": 9.105607020483829e-06, + "loss": 0.1686, + "step": 7960 + }, + { + "epoch": 0.20145760052635575, + "grad_norm": 5.878865718841553, + "learning_rate": 9.105377836822343e-06, + "loss": 0.1931, + "step": 7961 + }, + { + "epoch": 0.20148290609104944, + "grad_norm": 6.27651309967041, + "learning_rate": 9.105148626686039e-06, + "loss": 0.2205, + "step": 7962 + }, + { + "epoch": 0.2015082116557431, + "grad_norm": 10.041112899780273, + "learning_rate": 9.104919390076394e-06, + "loss": 0.2255, + "step": 7963 + }, + { + "epoch": 0.20153351722043678, + "grad_norm": 7.516108989715576, + "learning_rate": 9.104690126994888e-06, + "loss": 0.2481, + "step": 7964 + }, + { + "epoch": 0.20155882278513046, + "grad_norm": 5.977839469909668, + "learning_rate": 9.104460837443e-06, + "loss": 0.1862, + "step": 7965 + }, + { + "epoch": 0.20158412834982412, + "grad_norm": 13.571541786193848, + "learning_rate": 9.104231521422206e-06, + "loss": 0.2373, + "step": 7966 + }, + { + "epoch": 0.2016094339145178, + "grad_norm": 7.40570592880249, + "learning_rate": 9.104002178933989e-06, + "loss": 0.25, + "step": 7967 + }, + { + "epoch": 0.20163473947921148, + "grad_norm": 4.611997127532959, + "learning_rate": 9.103772809979822e-06, + "loss": 0.199, + "step": 7968 + }, + { + "epoch": 0.20166004504390517, + "grad_norm": 9.50308609008789, + "learning_rate": 9.10354341456119e-06, + "loss": 0.2531, + "step": 7969 + }, + { + "epoch": 0.20168535060859882, + "grad_norm": 5.180412292480469, + "learning_rate": 9.103313992679568e-06, + "loss": 0.2384, + "step": 7970 + }, + { + "epoch": 0.2017106561732925, + "grad_norm": 4.3015570640563965, + "learning_rate": 9.103084544336439e-06, + "loss": 0.1508, + "step": 7971 + }, + { + "epoch": 0.2017359617379862, + "grad_norm": 6.451689720153809, + "learning_rate": 9.102855069533281e-06, + "loss": 0.142, + "step": 7972 + }, + { + "epoch": 0.20176126730267985, + "grad_norm": 8.56763744354248, + "learning_rate": 9.102625568271572e-06, + "loss": 0.2432, + "step": 7973 + }, + { + "epoch": 0.20178657286737353, + "grad_norm": 3.7301690578460693, + "learning_rate": 9.102396040552795e-06, + "loss": 0.1156, + "step": 7974 + }, + { + "epoch": 0.20181187843206722, + "grad_norm": 9.2139892578125, + "learning_rate": 9.10216648637843e-06, + "loss": 0.2509, + "step": 7975 + }, + { + "epoch": 0.2018371839967609, + "grad_norm": 4.794434070587158, + "learning_rate": 9.101936905749956e-06, + "loss": 0.18, + "step": 7976 + }, + { + "epoch": 0.20186248956145456, + "grad_norm": 4.3544440269470215, + "learning_rate": 9.101707298668852e-06, + "loss": 0.254, + "step": 7977 + }, + { + "epoch": 0.20188779512614824, + "grad_norm": 8.486947059631348, + "learning_rate": 9.101477665136602e-06, + "loss": 0.2087, + "step": 7978 + }, + { + "epoch": 0.20191310069084192, + "grad_norm": 3.632610321044922, + "learning_rate": 9.101248005154687e-06, + "loss": 0.2208, + "step": 7979 + }, + { + "epoch": 0.20193840625553558, + "grad_norm": 4.80309534072876, + "learning_rate": 9.101018318724584e-06, + "loss": 0.2319, + "step": 7980 + }, + { + "epoch": 0.20196371182022926, + "grad_norm": 2.7614266872406006, + "learning_rate": 9.100788605847776e-06, + "loss": 0.0771, + "step": 7981 + }, + { + "epoch": 0.20198901738492295, + "grad_norm": 11.812291145324707, + "learning_rate": 9.100558866525748e-06, + "loss": 0.2477, + "step": 7982 + }, + { + "epoch": 0.20201432294961663, + "grad_norm": 13.41158676147461, + "learning_rate": 9.100329100759977e-06, + "loss": 0.2693, + "step": 7983 + }, + { + "epoch": 0.2020396285143103, + "grad_norm": 6.297445774078369, + "learning_rate": 9.100099308551946e-06, + "loss": 0.1963, + "step": 7984 + }, + { + "epoch": 0.20206493407900397, + "grad_norm": 5.578211784362793, + "learning_rate": 9.09986948990314e-06, + "loss": 0.1199, + "step": 7985 + }, + { + "epoch": 0.20209023964369766, + "grad_norm": 7.135880947113037, + "learning_rate": 9.099639644815034e-06, + "loss": 0.2076, + "step": 7986 + }, + { + "epoch": 0.2021155452083913, + "grad_norm": 10.546565055847168, + "learning_rate": 9.099409773289118e-06, + "loss": 0.3407, + "step": 7987 + }, + { + "epoch": 0.202140850773085, + "grad_norm": 5.336878776550293, + "learning_rate": 9.099179875326868e-06, + "loss": 0.1539, + "step": 7988 + }, + { + "epoch": 0.20216615633777868, + "grad_norm": 9.849529266357422, + "learning_rate": 9.098949950929772e-06, + "loss": 0.279, + "step": 7989 + }, + { + "epoch": 0.20219146190247236, + "grad_norm": 14.71402645111084, + "learning_rate": 9.09872000009931e-06, + "loss": 0.2404, + "step": 7990 + }, + { + "epoch": 0.20221676746716602, + "grad_norm": 5.393197059631348, + "learning_rate": 9.098490022836966e-06, + "loss": 0.1471, + "step": 7991 + }, + { + "epoch": 0.2022420730318597, + "grad_norm": 3.4768447875976562, + "learning_rate": 9.09826001914422e-06, + "loss": 0.1725, + "step": 7992 + }, + { + "epoch": 0.2022673785965534, + "grad_norm": 3.7657015323638916, + "learning_rate": 9.098029989022558e-06, + "loss": 0.1718, + "step": 7993 + }, + { + "epoch": 0.20229268416124707, + "grad_norm": 4.72985315322876, + "learning_rate": 9.097799932473464e-06, + "loss": 0.2045, + "step": 7994 + }, + { + "epoch": 0.20231798972594073, + "grad_norm": 5.852225303649902, + "learning_rate": 9.097569849498417e-06, + "loss": 0.1721, + "step": 7995 + }, + { + "epoch": 0.2023432952906344, + "grad_norm": 5.539883136749268, + "learning_rate": 9.097339740098909e-06, + "loss": 0.2498, + "step": 7996 + }, + { + "epoch": 0.2023686008553281, + "grad_norm": 5.897795677185059, + "learning_rate": 9.097109604276416e-06, + "loss": 0.2065, + "step": 7997 + }, + { + "epoch": 0.20239390642002175, + "grad_norm": 4.618927955627441, + "learning_rate": 9.096879442032428e-06, + "loss": 0.1737, + "step": 7998 + }, + { + "epoch": 0.20241921198471544, + "grad_norm": 5.029360294342041, + "learning_rate": 9.096649253368423e-06, + "loss": 0.1702, + "step": 7999 + }, + { + "epoch": 0.20244451754940912, + "grad_norm": 12.166600227355957, + "learning_rate": 9.096419038285891e-06, + "loss": 0.3355, + "step": 8000 + }, + { + "epoch": 0.2024698231141028, + "grad_norm": 8.435458183288574, + "learning_rate": 9.096188796786316e-06, + "loss": 0.2084, + "step": 8001 + }, + { + "epoch": 0.20249512867879646, + "grad_norm": 4.971983432769775, + "learning_rate": 9.095958528871178e-06, + "loss": 0.1704, + "step": 8002 + }, + { + "epoch": 0.20252043424349014, + "grad_norm": 5.081822395324707, + "learning_rate": 9.095728234541968e-06, + "loss": 0.1577, + "step": 8003 + }, + { + "epoch": 0.20254573980818383, + "grad_norm": 6.302639484405518, + "learning_rate": 9.095497913800168e-06, + "loss": 0.2403, + "step": 8004 + }, + { + "epoch": 0.20257104537287748, + "grad_norm": 6.752810478210449, + "learning_rate": 9.095267566647263e-06, + "loss": 0.1883, + "step": 8005 + }, + { + "epoch": 0.20259635093757117, + "grad_norm": 5.996938705444336, + "learning_rate": 9.09503719308474e-06, + "loss": 0.197, + "step": 8006 + }, + { + "epoch": 0.20262165650226485, + "grad_norm": 5.058488845825195, + "learning_rate": 9.094806793114084e-06, + "loss": 0.1423, + "step": 8007 + }, + { + "epoch": 0.20264696206695854, + "grad_norm": 6.944398403167725, + "learning_rate": 9.09457636673678e-06, + "loss": 0.2451, + "step": 8008 + }, + { + "epoch": 0.2026722676316522, + "grad_norm": 5.192162036895752, + "learning_rate": 9.094345913954315e-06, + "loss": 0.1755, + "step": 8009 + }, + { + "epoch": 0.20269757319634588, + "grad_norm": 4.909603118896484, + "learning_rate": 9.094115434768176e-06, + "loss": 0.1928, + "step": 8010 + }, + { + "epoch": 0.20272287876103956, + "grad_norm": 9.751919746398926, + "learning_rate": 9.093884929179846e-06, + "loss": 0.1943, + "step": 8011 + }, + { + "epoch": 0.20274818432573322, + "grad_norm": 5.322269916534424, + "learning_rate": 9.093654397190815e-06, + "loss": 0.1968, + "step": 8012 + }, + { + "epoch": 0.2027734898904269, + "grad_norm": 5.693955421447754, + "learning_rate": 9.093423838802567e-06, + "loss": 0.1594, + "step": 8013 + }, + { + "epoch": 0.20279879545512058, + "grad_norm": 9.570640563964844, + "learning_rate": 9.09319325401659e-06, + "loss": 0.2033, + "step": 8014 + }, + { + "epoch": 0.20282410101981427, + "grad_norm": 6.652688503265381, + "learning_rate": 9.092962642834373e-06, + "loss": 0.273, + "step": 8015 + }, + { + "epoch": 0.20284940658450792, + "grad_norm": 8.200197219848633, + "learning_rate": 9.092732005257399e-06, + "loss": 0.1853, + "step": 8016 + }, + { + "epoch": 0.2028747121492016, + "grad_norm": 11.922615051269531, + "learning_rate": 9.092501341287158e-06, + "loss": 0.2643, + "step": 8017 + }, + { + "epoch": 0.2029000177138953, + "grad_norm": 9.85617733001709, + "learning_rate": 9.092270650925139e-06, + "loss": 0.2318, + "step": 8018 + }, + { + "epoch": 0.20292532327858895, + "grad_norm": 4.317425727844238, + "learning_rate": 9.092039934172828e-06, + "loss": 0.2578, + "step": 8019 + }, + { + "epoch": 0.20295062884328263, + "grad_norm": 4.4232001304626465, + "learning_rate": 9.091809191031711e-06, + "loss": 0.1737, + "step": 8020 + }, + { + "epoch": 0.20297593440797632, + "grad_norm": 3.210054874420166, + "learning_rate": 9.091578421503277e-06, + "loss": 0.1539, + "step": 8021 + }, + { + "epoch": 0.20300123997267, + "grad_norm": 5.728116512298584, + "learning_rate": 9.091347625589018e-06, + "loss": 0.24, + "step": 8022 + }, + { + "epoch": 0.20302654553736366, + "grad_norm": 5.309719562530518, + "learning_rate": 9.091116803290416e-06, + "loss": 0.1619, + "step": 8023 + }, + { + "epoch": 0.20305185110205734, + "grad_norm": 5.163084983825684, + "learning_rate": 9.090885954608963e-06, + "loss": 0.1611, + "step": 8024 + }, + { + "epoch": 0.20307715666675102, + "grad_norm": 3.4486470222473145, + "learning_rate": 9.09065507954615e-06, + "loss": 0.1647, + "step": 8025 + }, + { + "epoch": 0.2031024622314447, + "grad_norm": 3.684554100036621, + "learning_rate": 9.09042417810346e-06, + "loss": 0.2016, + "step": 8026 + }, + { + "epoch": 0.20312776779613836, + "grad_norm": 4.424521446228027, + "learning_rate": 9.090193250282387e-06, + "loss": 0.1844, + "step": 8027 + }, + { + "epoch": 0.20315307336083205, + "grad_norm": 4.513114929199219, + "learning_rate": 9.08996229608442e-06, + "loss": 0.1653, + "step": 8028 + }, + { + "epoch": 0.20317837892552573, + "grad_norm": 9.148407936096191, + "learning_rate": 9.089731315511045e-06, + "loss": 0.2524, + "step": 8029 + }, + { + "epoch": 0.2032036844902194, + "grad_norm": 6.904575824737549, + "learning_rate": 9.089500308563754e-06, + "loss": 0.2151, + "step": 8030 + }, + { + "epoch": 0.20322899005491307, + "grad_norm": 5.4486894607543945, + "learning_rate": 9.089269275244035e-06, + "loss": 0.2432, + "step": 8031 + }, + { + "epoch": 0.20325429561960676, + "grad_norm": 8.406908988952637, + "learning_rate": 9.089038215553381e-06, + "loss": 0.1508, + "step": 8032 + }, + { + "epoch": 0.20327960118430044, + "grad_norm": 7.758333683013916, + "learning_rate": 9.08880712949328e-06, + "loss": 0.2722, + "step": 8033 + }, + { + "epoch": 0.2033049067489941, + "grad_norm": 4.1851301193237305, + "learning_rate": 9.088576017065221e-06, + "loss": 0.1581, + "step": 8034 + }, + { + "epoch": 0.20333021231368778, + "grad_norm": 6.164541244506836, + "learning_rate": 9.088344878270697e-06, + "loss": 0.2097, + "step": 8035 + }, + { + "epoch": 0.20335551787838146, + "grad_norm": 11.183712005615234, + "learning_rate": 9.088113713111198e-06, + "loss": 0.1843, + "step": 8036 + }, + { + "epoch": 0.20338082344307512, + "grad_norm": 4.895061016082764, + "learning_rate": 9.087882521588212e-06, + "loss": 0.2025, + "step": 8037 + }, + { + "epoch": 0.2034061290077688, + "grad_norm": 5.822433948516846, + "learning_rate": 9.087651303703233e-06, + "loss": 0.2105, + "step": 8038 + }, + { + "epoch": 0.2034314345724625, + "grad_norm": 7.011116027832031, + "learning_rate": 9.087420059457752e-06, + "loss": 0.2582, + "step": 8039 + }, + { + "epoch": 0.20345674013715617, + "grad_norm": 4.225038528442383, + "learning_rate": 9.087188788853258e-06, + "loss": 0.1227, + "step": 8040 + }, + { + "epoch": 0.20348204570184983, + "grad_norm": 3.9352173805236816, + "learning_rate": 9.086957491891243e-06, + "loss": 0.2146, + "step": 8041 + }, + { + "epoch": 0.2035073512665435, + "grad_norm": 3.048858404159546, + "learning_rate": 9.086726168573202e-06, + "loss": 0.1466, + "step": 8042 + }, + { + "epoch": 0.2035326568312372, + "grad_norm": 2.617502450942993, + "learning_rate": 9.086494818900624e-06, + "loss": 0.1855, + "step": 8043 + }, + { + "epoch": 0.20355796239593085, + "grad_norm": 6.2492241859436035, + "learning_rate": 9.086263442874998e-06, + "loss": 0.2003, + "step": 8044 + }, + { + "epoch": 0.20358326796062454, + "grad_norm": 4.548287868499756, + "learning_rate": 9.086032040497823e-06, + "loss": 0.2242, + "step": 8045 + }, + { + "epoch": 0.20360857352531822, + "grad_norm": 2.972184658050537, + "learning_rate": 9.085800611770585e-06, + "loss": 0.1428, + "step": 8046 + }, + { + "epoch": 0.2036338790900119, + "grad_norm": 9.126022338867188, + "learning_rate": 9.08556915669478e-06, + "loss": 0.2303, + "step": 8047 + }, + { + "epoch": 0.20365918465470556, + "grad_norm": 4.197001934051514, + "learning_rate": 9.085337675271898e-06, + "loss": 0.2191, + "step": 8048 + }, + { + "epoch": 0.20368449021939924, + "grad_norm": 12.17724895477295, + "learning_rate": 9.085106167503435e-06, + "loss": 0.1668, + "step": 8049 + }, + { + "epoch": 0.20370979578409293, + "grad_norm": 6.362460613250732, + "learning_rate": 9.084874633390881e-06, + "loss": 0.2083, + "step": 8050 + }, + { + "epoch": 0.20373510134878658, + "grad_norm": 3.9103612899780273, + "learning_rate": 9.08464307293573e-06, + "loss": 0.1491, + "step": 8051 + }, + { + "epoch": 0.20376040691348027, + "grad_norm": 6.5658979415893555, + "learning_rate": 9.084411486139477e-06, + "loss": 0.2194, + "step": 8052 + }, + { + "epoch": 0.20378571247817395, + "grad_norm": 3.9206862449645996, + "learning_rate": 9.084179873003614e-06, + "loss": 0.2232, + "step": 8053 + }, + { + "epoch": 0.20381101804286764, + "grad_norm": 5.143557071685791, + "learning_rate": 9.083948233529634e-06, + "loss": 0.185, + "step": 8054 + }, + { + "epoch": 0.2038363236075613, + "grad_norm": 2.9128568172454834, + "learning_rate": 9.083716567719031e-06, + "loss": 0.1169, + "step": 8055 + }, + { + "epoch": 0.20386162917225498, + "grad_norm": 6.172805309295654, + "learning_rate": 9.0834848755733e-06, + "loss": 0.1755, + "step": 8056 + }, + { + "epoch": 0.20388693473694866, + "grad_norm": 3.769996166229248, + "learning_rate": 9.083253157093934e-06, + "loss": 0.1923, + "step": 8057 + }, + { + "epoch": 0.20391224030164234, + "grad_norm": 4.440304279327393, + "learning_rate": 9.083021412282428e-06, + "loss": 0.1789, + "step": 8058 + }, + { + "epoch": 0.203937545866336, + "grad_norm": 4.928411483764648, + "learning_rate": 9.082789641140279e-06, + "loss": 0.237, + "step": 8059 + }, + { + "epoch": 0.20396285143102968, + "grad_norm": 8.166753768920898, + "learning_rate": 9.082557843668977e-06, + "loss": 0.231, + "step": 8060 + }, + { + "epoch": 0.20398815699572337, + "grad_norm": 10.54537582397461, + "learning_rate": 9.082326019870018e-06, + "loss": 0.2797, + "step": 8061 + }, + { + "epoch": 0.20401346256041702, + "grad_norm": 5.7192063331604, + "learning_rate": 9.082094169744899e-06, + "loss": 0.1924, + "step": 8062 + }, + { + "epoch": 0.2040387681251107, + "grad_norm": 4.177818298339844, + "learning_rate": 9.081862293295114e-06, + "loss": 0.1489, + "step": 8063 + }, + { + "epoch": 0.2040640736898044, + "grad_norm": 6.033791542053223, + "learning_rate": 9.081630390522158e-06, + "loss": 0.1711, + "step": 8064 + }, + { + "epoch": 0.20408937925449808, + "grad_norm": 9.118475914001465, + "learning_rate": 9.081398461427529e-06, + "loss": 0.2301, + "step": 8065 + }, + { + "epoch": 0.20411468481919173, + "grad_norm": 3.971339702606201, + "learning_rate": 9.081166506012718e-06, + "loss": 0.1592, + "step": 8066 + }, + { + "epoch": 0.20413999038388542, + "grad_norm": 5.067959785461426, + "learning_rate": 9.080934524279223e-06, + "loss": 0.2381, + "step": 8067 + }, + { + "epoch": 0.2041652959485791, + "grad_norm": 3.6657209396362305, + "learning_rate": 9.080702516228541e-06, + "loss": 0.1653, + "step": 8068 + }, + { + "epoch": 0.20419060151327276, + "grad_norm": 5.354133129119873, + "learning_rate": 9.08047048186217e-06, + "loss": 0.1863, + "step": 8069 + }, + { + "epoch": 0.20421590707796644, + "grad_norm": 13.0159273147583, + "learning_rate": 9.0802384211816e-06, + "loss": 0.2943, + "step": 8070 + }, + { + "epoch": 0.20424121264266012, + "grad_norm": 3.1772725582122803, + "learning_rate": 9.080006334188334e-06, + "loss": 0.1136, + "step": 8071 + }, + { + "epoch": 0.2042665182073538, + "grad_norm": 4.9530348777771, + "learning_rate": 9.079774220883866e-06, + "loss": 0.1313, + "step": 8072 + }, + { + "epoch": 0.20429182377204747, + "grad_norm": 5.821621417999268, + "learning_rate": 9.079542081269692e-06, + "loss": 0.1794, + "step": 8073 + }, + { + "epoch": 0.20431712933674115, + "grad_norm": 7.747300148010254, + "learning_rate": 9.079309915347309e-06, + "loss": 0.2114, + "step": 8074 + }, + { + "epoch": 0.20434243490143483, + "grad_norm": 3.219278573989868, + "learning_rate": 9.079077723118217e-06, + "loss": 0.2082, + "step": 8075 + }, + { + "epoch": 0.2043677404661285, + "grad_norm": 6.328221321105957, + "learning_rate": 9.078845504583912e-06, + "loss": 0.1382, + "step": 8076 + }, + { + "epoch": 0.20439304603082217, + "grad_norm": 6.021063327789307, + "learning_rate": 9.07861325974589e-06, + "loss": 0.2013, + "step": 8077 + }, + { + "epoch": 0.20441835159551586, + "grad_norm": 19.41734504699707, + "learning_rate": 9.078380988605649e-06, + "loss": 0.3831, + "step": 8078 + }, + { + "epoch": 0.20444365716020954, + "grad_norm": 3.5054588317871094, + "learning_rate": 9.078148691164687e-06, + "loss": 0.0828, + "step": 8079 + }, + { + "epoch": 0.2044689627249032, + "grad_norm": 2.6802403926849365, + "learning_rate": 9.077916367424506e-06, + "loss": 0.1344, + "step": 8080 + }, + { + "epoch": 0.20449426828959688, + "grad_norm": 5.293497085571289, + "learning_rate": 9.077684017386599e-06, + "loss": 0.2128, + "step": 8081 + }, + { + "epoch": 0.20451957385429057, + "grad_norm": 12.694757461547852, + "learning_rate": 9.077451641052466e-06, + "loss": 0.4267, + "step": 8082 + }, + { + "epoch": 0.20454487941898422, + "grad_norm": 41.653411865234375, + "learning_rate": 9.077219238423605e-06, + "loss": 0.2966, + "step": 8083 + }, + { + "epoch": 0.2045701849836779, + "grad_norm": 6.094161510467529, + "learning_rate": 9.076986809501518e-06, + "loss": 0.25, + "step": 8084 + }, + { + "epoch": 0.2045954905483716, + "grad_norm": 22.960691452026367, + "learning_rate": 9.076754354287698e-06, + "loss": 0.3994, + "step": 8085 + }, + { + "epoch": 0.20462079611306527, + "grad_norm": 2.962705612182617, + "learning_rate": 9.07652187278365e-06, + "loss": 0.1251, + "step": 8086 + }, + { + "epoch": 0.20464610167775893, + "grad_norm": 8.573068618774414, + "learning_rate": 9.07628936499087e-06, + "loss": 0.1898, + "step": 8087 + }, + { + "epoch": 0.2046714072424526, + "grad_norm": 3.8229408264160156, + "learning_rate": 9.07605683091086e-06, + "loss": 0.1971, + "step": 8088 + }, + { + "epoch": 0.2046967128071463, + "grad_norm": 3.5167534351348877, + "learning_rate": 9.075824270545116e-06, + "loss": 0.1478, + "step": 8089 + }, + { + "epoch": 0.20472201837183998, + "grad_norm": 3.5692620277404785, + "learning_rate": 9.075591683895139e-06, + "loss": 0.1246, + "step": 8090 + }, + { + "epoch": 0.20474732393653364, + "grad_norm": 6.427013397216797, + "learning_rate": 9.07535907096243e-06, + "loss": 0.2162, + "step": 8091 + }, + { + "epoch": 0.20477262950122732, + "grad_norm": 3.958272933959961, + "learning_rate": 9.075126431748489e-06, + "loss": 0.2161, + "step": 8092 + }, + { + "epoch": 0.204797935065921, + "grad_norm": 8.893757820129395, + "learning_rate": 9.074893766254816e-06, + "loss": 0.1387, + "step": 8093 + }, + { + "epoch": 0.20482324063061466, + "grad_norm": 3.9981141090393066, + "learning_rate": 9.074661074482908e-06, + "loss": 0.1749, + "step": 8094 + }, + { + "epoch": 0.20484854619530835, + "grad_norm": 9.308635711669922, + "learning_rate": 9.074428356434272e-06, + "loss": 0.3258, + "step": 8095 + }, + { + "epoch": 0.20487385176000203, + "grad_norm": 16.691072463989258, + "learning_rate": 9.074195612110404e-06, + "loss": 0.2447, + "step": 8096 + }, + { + "epoch": 0.2048991573246957, + "grad_norm": 6.652346134185791, + "learning_rate": 9.073962841512807e-06, + "loss": 0.2667, + "step": 8097 + }, + { + "epoch": 0.20492446288938937, + "grad_norm": 3.927319288253784, + "learning_rate": 9.073730044642981e-06, + "loss": 0.2225, + "step": 8098 + }, + { + "epoch": 0.20494976845408305, + "grad_norm": 4.219431400299072, + "learning_rate": 9.073497221502428e-06, + "loss": 0.1923, + "step": 8099 + }, + { + "epoch": 0.20497507401877674, + "grad_norm": 9.028204917907715, + "learning_rate": 9.073264372092648e-06, + "loss": 0.1952, + "step": 8100 + }, + { + "epoch": 0.2050003795834704, + "grad_norm": 6.257613182067871, + "learning_rate": 9.073031496415144e-06, + "loss": 0.2051, + "step": 8101 + }, + { + "epoch": 0.20502568514816408, + "grad_norm": 6.050264835357666, + "learning_rate": 9.072798594471419e-06, + "loss": 0.1802, + "step": 8102 + }, + { + "epoch": 0.20505099071285776, + "grad_norm": 8.960165977478027, + "learning_rate": 9.072565666262973e-06, + "loss": 0.2102, + "step": 8103 + }, + { + "epoch": 0.20507629627755145, + "grad_norm": 7.309792518615723, + "learning_rate": 9.072332711791307e-06, + "loss": 0.183, + "step": 8104 + }, + { + "epoch": 0.2051016018422451, + "grad_norm": 9.702596664428711, + "learning_rate": 9.072099731057926e-06, + "loss": 0.2264, + "step": 8105 + }, + { + "epoch": 0.20512690740693879, + "grad_norm": 7.2438459396362305, + "learning_rate": 9.071866724064332e-06, + "loss": 0.2739, + "step": 8106 + }, + { + "epoch": 0.20515221297163247, + "grad_norm": 6.078071594238281, + "learning_rate": 9.071633690812026e-06, + "loss": 0.1322, + "step": 8107 + }, + { + "epoch": 0.20517751853632613, + "grad_norm": 14.90393352508545, + "learning_rate": 9.071400631302513e-06, + "loss": 0.5253, + "step": 8108 + }, + { + "epoch": 0.2052028241010198, + "grad_norm": 5.058172225952148, + "learning_rate": 9.071167545537294e-06, + "loss": 0.1985, + "step": 8109 + }, + { + "epoch": 0.2052281296657135, + "grad_norm": 7.026741027832031, + "learning_rate": 9.070934433517872e-06, + "loss": 0.1953, + "step": 8110 + }, + { + "epoch": 0.20525343523040718, + "grad_norm": 6.789827346801758, + "learning_rate": 9.070701295245752e-06, + "loss": 0.2474, + "step": 8111 + }, + { + "epoch": 0.20527874079510083, + "grad_norm": 3.1792781352996826, + "learning_rate": 9.070468130722435e-06, + "loss": 0.154, + "step": 8112 + }, + { + "epoch": 0.20530404635979452, + "grad_norm": 7.465359687805176, + "learning_rate": 9.070234939949427e-06, + "loss": 0.1926, + "step": 8113 + }, + { + "epoch": 0.2053293519244882, + "grad_norm": 3.2848613262176514, + "learning_rate": 9.070001722928232e-06, + "loss": 0.1369, + "step": 8114 + }, + { + "epoch": 0.20535465748918186, + "grad_norm": 10.197179794311523, + "learning_rate": 9.069768479660352e-06, + "loss": 0.2747, + "step": 8115 + }, + { + "epoch": 0.20537996305387554, + "grad_norm": 6.449770927429199, + "learning_rate": 9.069535210147293e-06, + "loss": 0.2107, + "step": 8116 + }, + { + "epoch": 0.20540526861856923, + "grad_norm": 4.7728705406188965, + "learning_rate": 9.069301914390558e-06, + "loss": 0.1834, + "step": 8117 + }, + { + "epoch": 0.2054305741832629, + "grad_norm": 9.900177001953125, + "learning_rate": 9.069068592391652e-06, + "loss": 0.2308, + "step": 8118 + }, + { + "epoch": 0.20545587974795657, + "grad_norm": 6.10776424407959, + "learning_rate": 9.06883524415208e-06, + "loss": 0.1578, + "step": 8119 + }, + { + "epoch": 0.20548118531265025, + "grad_norm": 8.513026237487793, + "learning_rate": 9.068601869673344e-06, + "loss": 0.2358, + "step": 8120 + }, + { + "epoch": 0.20550649087734393, + "grad_norm": 5.968698978424072, + "learning_rate": 9.068368468956952e-06, + "loss": 0.1745, + "step": 8121 + }, + { + "epoch": 0.20553179644203762, + "grad_norm": 5.260853290557861, + "learning_rate": 9.06813504200441e-06, + "loss": 0.1741, + "step": 8122 + }, + { + "epoch": 0.20555710200673127, + "grad_norm": 13.583504676818848, + "learning_rate": 9.067901588817221e-06, + "loss": 0.3127, + "step": 8123 + }, + { + "epoch": 0.20558240757142496, + "grad_norm": 6.299941539764404, + "learning_rate": 9.067668109396892e-06, + "loss": 0.2621, + "step": 8124 + }, + { + "epoch": 0.20560771313611864, + "grad_norm": 11.838956832885742, + "learning_rate": 9.067434603744929e-06, + "loss": 0.2834, + "step": 8125 + }, + { + "epoch": 0.2056330187008123, + "grad_norm": 4.137224197387695, + "learning_rate": 9.067201071862834e-06, + "loss": 0.205, + "step": 8126 + }, + { + "epoch": 0.20565832426550598, + "grad_norm": 3.5912153720855713, + "learning_rate": 9.066967513752118e-06, + "loss": 0.1167, + "step": 8127 + }, + { + "epoch": 0.20568362983019967, + "grad_norm": 7.947838306427002, + "learning_rate": 9.066733929414283e-06, + "loss": 0.247, + "step": 8128 + }, + { + "epoch": 0.20570893539489335, + "grad_norm": 7.960760116577148, + "learning_rate": 9.06650031885084e-06, + "loss": 0.1881, + "step": 8129 + }, + { + "epoch": 0.205734240959587, + "grad_norm": 6.067989826202393, + "learning_rate": 9.066266682063291e-06, + "loss": 0.1811, + "step": 8130 + }, + { + "epoch": 0.2057595465242807, + "grad_norm": 4.705331802368164, + "learning_rate": 9.066033019053147e-06, + "loss": 0.139, + "step": 8131 + }, + { + "epoch": 0.20578485208897437, + "grad_norm": 5.904500484466553, + "learning_rate": 9.06579932982191e-06, + "loss": 0.1132, + "step": 8132 + }, + { + "epoch": 0.20581015765366803, + "grad_norm": 3.703953266143799, + "learning_rate": 9.06556561437109e-06, + "loss": 0.1802, + "step": 8133 + }, + { + "epoch": 0.20583546321836171, + "grad_norm": 6.213325500488281, + "learning_rate": 9.065331872702194e-06, + "loss": 0.2693, + "step": 8134 + }, + { + "epoch": 0.2058607687830554, + "grad_norm": 10.808267593383789, + "learning_rate": 9.065098104816728e-06, + "loss": 0.1813, + "step": 8135 + }, + { + "epoch": 0.20588607434774908, + "grad_norm": 4.666353702545166, + "learning_rate": 9.0648643107162e-06, + "loss": 0.1724, + "step": 8136 + }, + { + "epoch": 0.20591137991244274, + "grad_norm": 5.390063762664795, + "learning_rate": 9.06463049040212e-06, + "loss": 0.2015, + "step": 8137 + }, + { + "epoch": 0.20593668547713642, + "grad_norm": 4.940870761871338, + "learning_rate": 9.064396643875993e-06, + "loss": 0.1727, + "step": 8138 + }, + { + "epoch": 0.2059619910418301, + "grad_norm": 3.1728789806365967, + "learning_rate": 9.064162771139329e-06, + "loss": 0.1085, + "step": 8139 + }, + { + "epoch": 0.20598729660652376, + "grad_norm": 9.35116195678711, + "learning_rate": 9.063928872193635e-06, + "loss": 0.3029, + "step": 8140 + }, + { + "epoch": 0.20601260217121745, + "grad_norm": 8.20859146118164, + "learning_rate": 9.06369494704042e-06, + "loss": 0.1922, + "step": 8141 + }, + { + "epoch": 0.20603790773591113, + "grad_norm": 4.059065818786621, + "learning_rate": 9.06346099568119e-06, + "loss": 0.224, + "step": 8142 + }, + { + "epoch": 0.20606321330060481, + "grad_norm": 5.9281325340271, + "learning_rate": 9.063227018117457e-06, + "loss": 0.1726, + "step": 8143 + }, + { + "epoch": 0.20608851886529847, + "grad_norm": 13.485159873962402, + "learning_rate": 9.062993014350729e-06, + "loss": 0.2379, + "step": 8144 + }, + { + "epoch": 0.20611382442999215, + "grad_norm": 6.20458459854126, + "learning_rate": 9.062758984382515e-06, + "loss": 0.1908, + "step": 8145 + }, + { + "epoch": 0.20613912999468584, + "grad_norm": 3.7584147453308105, + "learning_rate": 9.062524928214325e-06, + "loss": 0.1823, + "step": 8146 + }, + { + "epoch": 0.2061644355593795, + "grad_norm": 5.377167224884033, + "learning_rate": 9.062290845847666e-06, + "loss": 0.1748, + "step": 8147 + }, + { + "epoch": 0.20618974112407318, + "grad_norm": 3.6684422492980957, + "learning_rate": 9.06205673728405e-06, + "loss": 0.1667, + "step": 8148 + }, + { + "epoch": 0.20621504668876686, + "grad_norm": 4.103124141693115, + "learning_rate": 9.061822602524985e-06, + "loss": 0.1986, + "step": 8149 + }, + { + "epoch": 0.20624035225346055, + "grad_norm": 4.381398677825928, + "learning_rate": 9.06158844157198e-06, + "loss": 0.1826, + "step": 8150 + }, + { + "epoch": 0.2062656578181542, + "grad_norm": 3.2848281860351562, + "learning_rate": 9.06135425442655e-06, + "loss": 0.1761, + "step": 8151 + }, + { + "epoch": 0.2062909633828479, + "grad_norm": 4.279769420623779, + "learning_rate": 9.0611200410902e-06, + "loss": 0.1746, + "step": 8152 + }, + { + "epoch": 0.20631626894754157, + "grad_norm": 7.778155326843262, + "learning_rate": 9.060885801564443e-06, + "loss": 0.3002, + "step": 8153 + }, + { + "epoch": 0.20634157451223525, + "grad_norm": 13.654439926147461, + "learning_rate": 9.060651535850788e-06, + "loss": 0.1851, + "step": 8154 + }, + { + "epoch": 0.2063668800769289, + "grad_norm": 7.432271480560303, + "learning_rate": 9.060417243950748e-06, + "loss": 0.2743, + "step": 8155 + }, + { + "epoch": 0.2063921856416226, + "grad_norm": 8.91952896118164, + "learning_rate": 9.06018292586583e-06, + "loss": 0.2604, + "step": 8156 + }, + { + "epoch": 0.20641749120631628, + "grad_norm": 7.328738212585449, + "learning_rate": 9.059948581597549e-06, + "loss": 0.198, + "step": 8157 + }, + { + "epoch": 0.20644279677100993, + "grad_norm": 4.704005718231201, + "learning_rate": 9.059714211147415e-06, + "loss": 0.2599, + "step": 8158 + }, + { + "epoch": 0.20646810233570362, + "grad_norm": 5.877425670623779, + "learning_rate": 9.05947981451694e-06, + "loss": 0.1286, + "step": 8159 + }, + { + "epoch": 0.2064934079003973, + "grad_norm": 4.935370445251465, + "learning_rate": 9.059245391707635e-06, + "loss": 0.1343, + "step": 8160 + }, + { + "epoch": 0.206518713465091, + "grad_norm": 4.747280120849609, + "learning_rate": 9.059010942721009e-06, + "loss": 0.2047, + "step": 8161 + }, + { + "epoch": 0.20654401902978464, + "grad_norm": 5.492842197418213, + "learning_rate": 9.058776467558578e-06, + "loss": 0.2156, + "step": 8162 + }, + { + "epoch": 0.20656932459447833, + "grad_norm": 4.046675205230713, + "learning_rate": 9.058541966221852e-06, + "loss": 0.2017, + "step": 8163 + }, + { + "epoch": 0.206594630159172, + "grad_norm": 4.546823978424072, + "learning_rate": 9.058307438712343e-06, + "loss": 0.2095, + "step": 8164 + }, + { + "epoch": 0.20661993572386567, + "grad_norm": 4.90784215927124, + "learning_rate": 9.058072885031568e-06, + "loss": 0.2103, + "step": 8165 + }, + { + "epoch": 0.20664524128855935, + "grad_norm": 8.823704719543457, + "learning_rate": 9.057838305181033e-06, + "loss": 0.1871, + "step": 8166 + }, + { + "epoch": 0.20667054685325303, + "grad_norm": 8.07016658782959, + "learning_rate": 9.057603699162253e-06, + "loss": 0.2233, + "step": 8167 + }, + { + "epoch": 0.20669585241794672, + "grad_norm": 3.637629985809326, + "learning_rate": 9.057369066976743e-06, + "loss": 0.2212, + "step": 8168 + }, + { + "epoch": 0.20672115798264037, + "grad_norm": 4.15571403503418, + "learning_rate": 9.057134408626014e-06, + "loss": 0.1884, + "step": 8169 + }, + { + "epoch": 0.20674646354733406, + "grad_norm": 10.972412109375, + "learning_rate": 9.056899724111579e-06, + "loss": 0.2634, + "step": 8170 + }, + { + "epoch": 0.20677176911202774, + "grad_norm": 4.77396821975708, + "learning_rate": 9.056665013434954e-06, + "loss": 0.2688, + "step": 8171 + }, + { + "epoch": 0.2067970746767214, + "grad_norm": 4.562601089477539, + "learning_rate": 9.056430276597649e-06, + "loss": 0.1804, + "step": 8172 + }, + { + "epoch": 0.20682238024141508, + "grad_norm": 37.10982894897461, + "learning_rate": 9.056195513601182e-06, + "loss": 0.371, + "step": 8173 + }, + { + "epoch": 0.20684768580610877, + "grad_norm": 14.187033653259277, + "learning_rate": 9.055960724447063e-06, + "loss": 0.1706, + "step": 8174 + }, + { + "epoch": 0.20687299137080245, + "grad_norm": 6.768634796142578, + "learning_rate": 9.055725909136807e-06, + "loss": 0.1896, + "step": 8175 + }, + { + "epoch": 0.2068982969354961, + "grad_norm": 3.29219388961792, + "learning_rate": 9.05549106767193e-06, + "loss": 0.1655, + "step": 8176 + }, + { + "epoch": 0.2069236025001898, + "grad_norm": 3.787531614303589, + "learning_rate": 9.055256200053947e-06, + "loss": 0.1777, + "step": 8177 + }, + { + "epoch": 0.20694890806488347, + "grad_norm": 3.3501358032226562, + "learning_rate": 9.055021306284369e-06, + "loss": 0.1293, + "step": 8178 + }, + { + "epoch": 0.20697421362957713, + "grad_norm": 2.699298858642578, + "learning_rate": 9.054786386364714e-06, + "loss": 0.1033, + "step": 8179 + }, + { + "epoch": 0.20699951919427081, + "grad_norm": 3.5978643894195557, + "learning_rate": 9.054551440296497e-06, + "loss": 0.1662, + "step": 8180 + }, + { + "epoch": 0.2070248247589645, + "grad_norm": 9.219245910644531, + "learning_rate": 9.05431646808123e-06, + "loss": 0.2177, + "step": 8181 + }, + { + "epoch": 0.20705013032365818, + "grad_norm": 4.659676551818848, + "learning_rate": 9.05408146972043e-06, + "loss": 0.1772, + "step": 8182 + }, + { + "epoch": 0.20707543588835184, + "grad_norm": 5.125293254852295, + "learning_rate": 9.053846445215615e-06, + "loss": 0.109, + "step": 8183 + }, + { + "epoch": 0.20710074145304552, + "grad_norm": 5.691486835479736, + "learning_rate": 9.053611394568295e-06, + "loss": 0.198, + "step": 8184 + }, + { + "epoch": 0.2071260470177392, + "grad_norm": 5.050751686096191, + "learning_rate": 9.053376317779992e-06, + "loss": 0.2277, + "step": 8185 + }, + { + "epoch": 0.2071513525824329, + "grad_norm": 4.6980881690979, + "learning_rate": 9.053141214852218e-06, + "loss": 0.2269, + "step": 8186 + }, + { + "epoch": 0.20717665814712655, + "grad_norm": 28.66170883178711, + "learning_rate": 9.05290608578649e-06, + "loss": 0.5071, + "step": 8187 + }, + { + "epoch": 0.20720196371182023, + "grad_norm": 4.056826591491699, + "learning_rate": 9.052670930584325e-06, + "loss": 0.1573, + "step": 8188 + }, + { + "epoch": 0.20722726927651391, + "grad_norm": 3.0655479431152344, + "learning_rate": 9.05243574924724e-06, + "loss": 0.1218, + "step": 8189 + }, + { + "epoch": 0.20725257484120757, + "grad_norm": 3.669187545776367, + "learning_rate": 9.052200541776748e-06, + "loss": 0.1794, + "step": 8190 + }, + { + "epoch": 0.20727788040590125, + "grad_norm": 5.048906326293945, + "learning_rate": 9.051965308174371e-06, + "loss": 0.1894, + "step": 8191 + }, + { + "epoch": 0.20730318597059494, + "grad_norm": 7.469935417175293, + "learning_rate": 9.051730048441623e-06, + "loss": 0.2758, + "step": 8192 + }, + { + "epoch": 0.20732849153528862, + "grad_norm": 3.855705738067627, + "learning_rate": 9.05149476258002e-06, + "loss": 0.2142, + "step": 8193 + }, + { + "epoch": 0.20735379709998228, + "grad_norm": 20.525197982788086, + "learning_rate": 9.051259450591083e-06, + "loss": 0.2497, + "step": 8194 + }, + { + "epoch": 0.20737910266467596, + "grad_norm": 6.223742961883545, + "learning_rate": 9.051024112476325e-06, + "loss": 0.174, + "step": 8195 + }, + { + "epoch": 0.20740440822936965, + "grad_norm": 6.187188148498535, + "learning_rate": 9.05078874823727e-06, + "loss": 0.1696, + "step": 8196 + }, + { + "epoch": 0.2074297137940633, + "grad_norm": 6.409437656402588, + "learning_rate": 9.050553357875428e-06, + "loss": 0.2013, + "step": 8197 + }, + { + "epoch": 0.207455019358757, + "grad_norm": 9.455550193786621, + "learning_rate": 9.050317941392321e-06, + "loss": 0.2058, + "step": 8198 + }, + { + "epoch": 0.20748032492345067, + "grad_norm": 3.7789602279663086, + "learning_rate": 9.05008249878947e-06, + "loss": 0.1688, + "step": 8199 + }, + { + "epoch": 0.20750563048814435, + "grad_norm": 4.411416053771973, + "learning_rate": 9.049847030068388e-06, + "loss": 0.1555, + "step": 8200 + }, + { + "epoch": 0.207530936052838, + "grad_norm": 7.270355701446533, + "learning_rate": 9.049611535230596e-06, + "loss": 0.2199, + "step": 8201 + }, + { + "epoch": 0.2075562416175317, + "grad_norm": 5.573863506317139, + "learning_rate": 9.049376014277614e-06, + "loss": 0.2187, + "step": 8202 + }, + { + "epoch": 0.20758154718222538, + "grad_norm": 3.498944044113159, + "learning_rate": 9.049140467210957e-06, + "loss": 0.1375, + "step": 8203 + }, + { + "epoch": 0.20760685274691904, + "grad_norm": 3.939495325088501, + "learning_rate": 9.048904894032147e-06, + "loss": 0.0963, + "step": 8204 + }, + { + "epoch": 0.20763215831161272, + "grad_norm": 5.5374979972839355, + "learning_rate": 9.048669294742704e-06, + "loss": 0.1685, + "step": 8205 + }, + { + "epoch": 0.2076574638763064, + "grad_norm": 5.926839351654053, + "learning_rate": 9.048433669344146e-06, + "loss": 0.1928, + "step": 8206 + }, + { + "epoch": 0.2076827694410001, + "grad_norm": 4.252636432647705, + "learning_rate": 9.04819801783799e-06, + "loss": 0.1723, + "step": 8207 + }, + { + "epoch": 0.20770807500569374, + "grad_norm": 3.6179184913635254, + "learning_rate": 9.047962340225759e-06, + "loss": 0.1118, + "step": 8208 + }, + { + "epoch": 0.20773338057038743, + "grad_norm": 4.37580680847168, + "learning_rate": 9.047726636508972e-06, + "loss": 0.2031, + "step": 8209 + }, + { + "epoch": 0.2077586861350811, + "grad_norm": 6.680218696594238, + "learning_rate": 9.04749090668915e-06, + "loss": 0.3264, + "step": 8210 + }, + { + "epoch": 0.20778399169977477, + "grad_norm": 5.788675785064697, + "learning_rate": 9.047255150767812e-06, + "loss": 0.1305, + "step": 8211 + }, + { + "epoch": 0.20780929726446845, + "grad_norm": 5.54313850402832, + "learning_rate": 9.047019368746475e-06, + "loss": 0.2239, + "step": 8212 + }, + { + "epoch": 0.20783460282916214, + "grad_norm": 5.447913646697998, + "learning_rate": 9.046783560626665e-06, + "loss": 0.221, + "step": 8213 + }, + { + "epoch": 0.20785990839385582, + "grad_norm": 4.384561538696289, + "learning_rate": 9.046547726409902e-06, + "loss": 0.2061, + "step": 8214 + }, + { + "epoch": 0.20788521395854948, + "grad_norm": 8.234725952148438, + "learning_rate": 9.046311866097704e-06, + "loss": 0.1791, + "step": 8215 + }, + { + "epoch": 0.20791051952324316, + "grad_norm": 12.142243385314941, + "learning_rate": 9.046075979691593e-06, + "loss": 0.1677, + "step": 8216 + }, + { + "epoch": 0.20793582508793684, + "grad_norm": 10.304376602172852, + "learning_rate": 9.045840067193093e-06, + "loss": 0.1995, + "step": 8217 + }, + { + "epoch": 0.20796113065263053, + "grad_norm": 5.143742084503174, + "learning_rate": 9.04560412860372e-06, + "loss": 0.1489, + "step": 8218 + }, + { + "epoch": 0.20798643621732418, + "grad_norm": 8.777105331420898, + "learning_rate": 9.045368163925e-06, + "loss": 0.2646, + "step": 8219 + }, + { + "epoch": 0.20801174178201787, + "grad_norm": 4.916921138763428, + "learning_rate": 9.045132173158452e-06, + "loss": 0.1721, + "step": 8220 + }, + { + "epoch": 0.20803704734671155, + "grad_norm": 4.403266429901123, + "learning_rate": 9.0448961563056e-06, + "loss": 0.2457, + "step": 8221 + }, + { + "epoch": 0.2080623529114052, + "grad_norm": 3.877652406692505, + "learning_rate": 9.044660113367966e-06, + "loss": 0.1542, + "step": 8222 + }, + { + "epoch": 0.2080876584760989, + "grad_norm": 3.2666187286376953, + "learning_rate": 9.04442404434707e-06, + "loss": 0.1017, + "step": 8223 + }, + { + "epoch": 0.20811296404079258, + "grad_norm": 3.5824029445648193, + "learning_rate": 9.044187949244437e-06, + "loss": 0.1193, + "step": 8224 + }, + { + "epoch": 0.20813826960548626, + "grad_norm": 12.283221244812012, + "learning_rate": 9.043951828061587e-06, + "loss": 0.2627, + "step": 8225 + }, + { + "epoch": 0.20816357517017992, + "grad_norm": 10.392684936523438, + "learning_rate": 9.043715680800045e-06, + "loss": 0.2062, + "step": 8226 + }, + { + "epoch": 0.2081888807348736, + "grad_norm": 13.845457077026367, + "learning_rate": 9.043479507461332e-06, + "loss": 0.3037, + "step": 8227 + }, + { + "epoch": 0.20821418629956728, + "grad_norm": 3.5981476306915283, + "learning_rate": 9.043243308046971e-06, + "loss": 0.1267, + "step": 8228 + }, + { + "epoch": 0.20823949186426094, + "grad_norm": 3.7087817192077637, + "learning_rate": 9.043007082558486e-06, + "loss": 0.1342, + "step": 8229 + }, + { + "epoch": 0.20826479742895462, + "grad_norm": 5.85768461227417, + "learning_rate": 9.042770830997401e-06, + "loss": 0.1453, + "step": 8230 + }, + { + "epoch": 0.2082901029936483, + "grad_norm": 6.555765151977539, + "learning_rate": 9.04253455336524e-06, + "loss": 0.2697, + "step": 8231 + }, + { + "epoch": 0.208315408558342, + "grad_norm": 4.808698654174805, + "learning_rate": 9.042298249663523e-06, + "loss": 0.175, + "step": 8232 + }, + { + "epoch": 0.20834071412303565, + "grad_norm": 7.353789329528809, + "learning_rate": 9.04206191989378e-06, + "loss": 0.1713, + "step": 8233 + }, + { + "epoch": 0.20836601968772933, + "grad_norm": 14.709165573120117, + "learning_rate": 9.04182556405753e-06, + "loss": 0.271, + "step": 8234 + }, + { + "epoch": 0.20839132525242302, + "grad_norm": 29.45014762878418, + "learning_rate": 9.041589182156299e-06, + "loss": 0.3574, + "step": 8235 + }, + { + "epoch": 0.20841663081711667, + "grad_norm": 5.6057963371276855, + "learning_rate": 9.04135277419161e-06, + "loss": 0.1854, + "step": 8236 + }, + { + "epoch": 0.20844193638181036, + "grad_norm": 9.629960060119629, + "learning_rate": 9.041116340164989e-06, + "loss": 0.2304, + "step": 8237 + }, + { + "epoch": 0.20846724194650404, + "grad_norm": 7.052785396575928, + "learning_rate": 9.040879880077962e-06, + "loss": 0.2089, + "step": 8238 + }, + { + "epoch": 0.20849254751119772, + "grad_norm": 5.645647048950195, + "learning_rate": 9.040643393932051e-06, + "loss": 0.2168, + "step": 8239 + }, + { + "epoch": 0.20851785307589138, + "grad_norm": 10.935060501098633, + "learning_rate": 9.040406881728781e-06, + "loss": 0.4061, + "step": 8240 + }, + { + "epoch": 0.20854315864058506, + "grad_norm": 5.140719890594482, + "learning_rate": 9.040170343469681e-06, + "loss": 0.2492, + "step": 8241 + }, + { + "epoch": 0.20856846420527875, + "grad_norm": 5.010900020599365, + "learning_rate": 9.039933779156274e-06, + "loss": 0.14, + "step": 8242 + }, + { + "epoch": 0.2085937697699724, + "grad_norm": 5.941256046295166, + "learning_rate": 9.039697188790084e-06, + "loss": 0.1995, + "step": 8243 + }, + { + "epoch": 0.2086190753346661, + "grad_norm": 5.1136322021484375, + "learning_rate": 9.039460572372639e-06, + "loss": 0.188, + "step": 8244 + }, + { + "epoch": 0.20864438089935977, + "grad_norm": 9.638968467712402, + "learning_rate": 9.039223929905464e-06, + "loss": 0.2405, + "step": 8245 + }, + { + "epoch": 0.20866968646405346, + "grad_norm": 12.319721221923828, + "learning_rate": 9.038987261390085e-06, + "loss": 0.2975, + "step": 8246 + }, + { + "epoch": 0.2086949920287471, + "grad_norm": 3.212176561355591, + "learning_rate": 9.03875056682803e-06, + "loss": 0.1985, + "step": 8247 + }, + { + "epoch": 0.2087202975934408, + "grad_norm": 7.083092212677002, + "learning_rate": 9.038513846220823e-06, + "loss": 0.2938, + "step": 8248 + }, + { + "epoch": 0.20874560315813448, + "grad_norm": 9.50341796875, + "learning_rate": 9.03827709956999e-06, + "loss": 0.2585, + "step": 8249 + }, + { + "epoch": 0.20877090872282816, + "grad_norm": 7.886209964752197, + "learning_rate": 9.03804032687706e-06, + "loss": 0.1086, + "step": 8250 + }, + { + "epoch": 0.20879621428752182, + "grad_norm": 5.3189473152160645, + "learning_rate": 9.037803528143559e-06, + "loss": 0.259, + "step": 8251 + }, + { + "epoch": 0.2088215198522155, + "grad_norm": 4.26790189743042, + "learning_rate": 9.037566703371015e-06, + "loss": 0.1125, + "step": 8252 + }, + { + "epoch": 0.2088468254169092, + "grad_norm": 7.749162197113037, + "learning_rate": 9.037329852560954e-06, + "loss": 0.1965, + "step": 8253 + }, + { + "epoch": 0.20887213098160284, + "grad_norm": 5.010400295257568, + "learning_rate": 9.037092975714902e-06, + "loss": 0.1794, + "step": 8254 + }, + { + "epoch": 0.20889743654629653, + "grad_norm": 7.902381896972656, + "learning_rate": 9.03685607283439e-06, + "loss": 0.1945, + "step": 8255 + }, + { + "epoch": 0.2089227421109902, + "grad_norm": 6.025662422180176, + "learning_rate": 9.036619143920945e-06, + "loss": 0.1871, + "step": 8256 + }, + { + "epoch": 0.2089480476756839, + "grad_norm": 4.432151794433594, + "learning_rate": 9.036382188976094e-06, + "loss": 0.2302, + "step": 8257 + }, + { + "epoch": 0.20897335324037755, + "grad_norm": 4.739956855773926, + "learning_rate": 9.036145208001364e-06, + "loss": 0.1741, + "step": 8258 + }, + { + "epoch": 0.20899865880507124, + "grad_norm": 5.292924880981445, + "learning_rate": 9.035908200998285e-06, + "loss": 0.1846, + "step": 8259 + }, + { + "epoch": 0.20902396436976492, + "grad_norm": 23.895069122314453, + "learning_rate": 9.035671167968383e-06, + "loss": 0.3093, + "step": 8260 + }, + { + "epoch": 0.20904926993445858, + "grad_norm": 6.68243408203125, + "learning_rate": 9.03543410891319e-06, + "loss": 0.2124, + "step": 8261 + }, + { + "epoch": 0.20907457549915226, + "grad_norm": 3.6821274757385254, + "learning_rate": 9.035197023834233e-06, + "loss": 0.1771, + "step": 8262 + }, + { + "epoch": 0.20909988106384594, + "grad_norm": 4.68776273727417, + "learning_rate": 9.03495991273304e-06, + "loss": 0.1718, + "step": 8263 + }, + { + "epoch": 0.20912518662853963, + "grad_norm": 6.907690525054932, + "learning_rate": 9.034722775611143e-06, + "loss": 0.1863, + "step": 8264 + }, + { + "epoch": 0.20915049219323328, + "grad_norm": 5.922850131988525, + "learning_rate": 9.034485612470066e-06, + "loss": 0.2071, + "step": 8265 + }, + { + "epoch": 0.20917579775792697, + "grad_norm": 13.393206596374512, + "learning_rate": 9.034248423311345e-06, + "loss": 0.336, + "step": 8266 + }, + { + "epoch": 0.20920110332262065, + "grad_norm": 4.625543117523193, + "learning_rate": 9.034011208136504e-06, + "loss": 0.2051, + "step": 8267 + }, + { + "epoch": 0.2092264088873143, + "grad_norm": 3.422848701477051, + "learning_rate": 9.033773966947077e-06, + "loss": 0.0967, + "step": 8268 + }, + { + "epoch": 0.209251714452008, + "grad_norm": 7.453725814819336, + "learning_rate": 9.033536699744591e-06, + "loss": 0.2703, + "step": 8269 + }, + { + "epoch": 0.20927702001670168, + "grad_norm": 7.921491622924805, + "learning_rate": 9.033299406530577e-06, + "loss": 0.1975, + "step": 8270 + }, + { + "epoch": 0.20930232558139536, + "grad_norm": 7.237805366516113, + "learning_rate": 9.033062087306565e-06, + "loss": 0.2111, + "step": 8271 + }, + { + "epoch": 0.20932763114608902, + "grad_norm": 4.205026626586914, + "learning_rate": 9.032824742074088e-06, + "loss": 0.1703, + "step": 8272 + }, + { + "epoch": 0.2093529367107827, + "grad_norm": 6.668095588684082, + "learning_rate": 9.032587370834673e-06, + "loss": 0.1755, + "step": 8273 + }, + { + "epoch": 0.20937824227547638, + "grad_norm": 4.712621212005615, + "learning_rate": 9.032349973589852e-06, + "loss": 0.1858, + "step": 8274 + }, + { + "epoch": 0.20940354784017004, + "grad_norm": 5.701255798339844, + "learning_rate": 9.032112550341157e-06, + "loss": 0.1717, + "step": 8275 + }, + { + "epoch": 0.20942885340486372, + "grad_norm": 2.915520191192627, + "learning_rate": 9.031875101090117e-06, + "loss": 0.1366, + "step": 8276 + }, + { + "epoch": 0.2094541589695574, + "grad_norm": 4.4604363441467285, + "learning_rate": 9.031637625838265e-06, + "loss": 0.1082, + "step": 8277 + }, + { + "epoch": 0.2094794645342511, + "grad_norm": 3.573277711868286, + "learning_rate": 9.031400124587132e-06, + "loss": 0.1552, + "step": 8278 + }, + { + "epoch": 0.20950477009894475, + "grad_norm": 7.893520832061768, + "learning_rate": 9.03116259733825e-06, + "loss": 0.2949, + "step": 8279 + }, + { + "epoch": 0.20953007566363843, + "grad_norm": 4.827880382537842, + "learning_rate": 9.030925044093149e-06, + "loss": 0.1988, + "step": 8280 + }, + { + "epoch": 0.20955538122833212, + "grad_norm": 4.888383388519287, + "learning_rate": 9.030687464853363e-06, + "loss": 0.1941, + "step": 8281 + }, + { + "epoch": 0.2095806867930258, + "grad_norm": 8.761669158935547, + "learning_rate": 9.030449859620423e-06, + "loss": 0.2103, + "step": 8282 + }, + { + "epoch": 0.20960599235771946, + "grad_norm": 6.941095352172852, + "learning_rate": 9.030212228395861e-06, + "loss": 0.1389, + "step": 8283 + }, + { + "epoch": 0.20963129792241314, + "grad_norm": 10.107081413269043, + "learning_rate": 9.02997457118121e-06, + "loss": 0.2248, + "step": 8284 + }, + { + "epoch": 0.20965660348710682, + "grad_norm": 4.23944616317749, + "learning_rate": 9.029736887978004e-06, + "loss": 0.1444, + "step": 8285 + }, + { + "epoch": 0.20968190905180048, + "grad_norm": 4.83998966217041, + "learning_rate": 9.029499178787774e-06, + "loss": 0.2483, + "step": 8286 + }, + { + "epoch": 0.20970721461649416, + "grad_norm": 6.45383358001709, + "learning_rate": 9.029261443612053e-06, + "loss": 0.2172, + "step": 8287 + }, + { + "epoch": 0.20973252018118785, + "grad_norm": 6.390013694763184, + "learning_rate": 9.029023682452373e-06, + "loss": 0.1284, + "step": 8288 + }, + { + "epoch": 0.20975782574588153, + "grad_norm": 7.80691385269165, + "learning_rate": 9.02878589531027e-06, + "loss": 0.211, + "step": 8289 + }, + { + "epoch": 0.2097831313105752, + "grad_norm": 4.885219573974609, + "learning_rate": 9.028548082187278e-06, + "loss": 0.2369, + "step": 8290 + }, + { + "epoch": 0.20980843687526887, + "grad_norm": 9.916589736938477, + "learning_rate": 9.028310243084927e-06, + "loss": 0.2863, + "step": 8291 + }, + { + "epoch": 0.20983374243996256, + "grad_norm": 5.535555362701416, + "learning_rate": 9.028072378004752e-06, + "loss": 0.2035, + "step": 8292 + }, + { + "epoch": 0.2098590480046562, + "grad_norm": 6.4805216789245605, + "learning_rate": 9.027834486948287e-06, + "loss": 0.1563, + "step": 8293 + }, + { + "epoch": 0.2098843535693499, + "grad_norm": 4.127923011779785, + "learning_rate": 9.02759656991707e-06, + "loss": 0.1793, + "step": 8294 + }, + { + "epoch": 0.20990965913404358, + "grad_norm": 11.208585739135742, + "learning_rate": 9.02735862691263e-06, + "loss": 0.1339, + "step": 8295 + }, + { + "epoch": 0.20993496469873726, + "grad_norm": 5.832517147064209, + "learning_rate": 9.027120657936503e-06, + "loss": 0.2728, + "step": 8296 + }, + { + "epoch": 0.20996027026343092, + "grad_norm": 4.722412586212158, + "learning_rate": 9.026882662990224e-06, + "loss": 0.1891, + "step": 8297 + }, + { + "epoch": 0.2099855758281246, + "grad_norm": 11.518465042114258, + "learning_rate": 9.026644642075328e-06, + "loss": 0.2028, + "step": 8298 + }, + { + "epoch": 0.2100108813928183, + "grad_norm": 9.549799919128418, + "learning_rate": 9.026406595193349e-06, + "loss": 0.1308, + "step": 8299 + }, + { + "epoch": 0.21003618695751194, + "grad_norm": 13.200551986694336, + "learning_rate": 9.026168522345824e-06, + "loss": 0.3032, + "step": 8300 + }, + { + "epoch": 0.21006149252220563, + "grad_norm": 3.655590534210205, + "learning_rate": 9.025930423534287e-06, + "loss": 0.1433, + "step": 8301 + }, + { + "epoch": 0.2100867980868993, + "grad_norm": 7.134367942810059, + "learning_rate": 9.025692298760273e-06, + "loss": 0.2409, + "step": 8302 + }, + { + "epoch": 0.210112103651593, + "grad_norm": 4.743054389953613, + "learning_rate": 9.025454148025317e-06, + "loss": 0.1944, + "step": 8303 + }, + { + "epoch": 0.21013740921628665, + "grad_norm": 5.4892072677612305, + "learning_rate": 9.025215971330959e-06, + "loss": 0.1267, + "step": 8304 + }, + { + "epoch": 0.21016271478098034, + "grad_norm": 3.2764711380004883, + "learning_rate": 9.024977768678728e-06, + "loss": 0.1438, + "step": 8305 + }, + { + "epoch": 0.21018802034567402, + "grad_norm": 6.472110748291016, + "learning_rate": 9.024739540070166e-06, + "loss": 0.184, + "step": 8306 + }, + { + "epoch": 0.21021332591036768, + "grad_norm": 5.491683006286621, + "learning_rate": 9.02450128550681e-06, + "loss": 0.2362, + "step": 8307 + }, + { + "epoch": 0.21023863147506136, + "grad_norm": 4.5747599601745605, + "learning_rate": 9.024263004990191e-06, + "loss": 0.1691, + "step": 8308 + }, + { + "epoch": 0.21026393703975504, + "grad_norm": 4.1320271492004395, + "learning_rate": 9.02402469852185e-06, + "loss": 0.1489, + "step": 8309 + }, + { + "epoch": 0.21028924260444873, + "grad_norm": 7.191992282867432, + "learning_rate": 9.02378636610332e-06, + "loss": 0.2214, + "step": 8310 + }, + { + "epoch": 0.21031454816914238, + "grad_norm": 3.035165548324585, + "learning_rate": 9.023548007736142e-06, + "loss": 0.1101, + "step": 8311 + }, + { + "epoch": 0.21033985373383607, + "grad_norm": 7.107935905456543, + "learning_rate": 9.02330962342185e-06, + "loss": 0.2625, + "step": 8312 + }, + { + "epoch": 0.21036515929852975, + "grad_norm": 6.912434101104736, + "learning_rate": 9.023071213161983e-06, + "loss": 0.0939, + "step": 8313 + }, + { + "epoch": 0.21039046486322344, + "grad_norm": 4.077386856079102, + "learning_rate": 9.02283277695808e-06, + "loss": 0.0936, + "step": 8314 + }, + { + "epoch": 0.2104157704279171, + "grad_norm": 3.61800479888916, + "learning_rate": 9.022594314811675e-06, + "loss": 0.1902, + "step": 8315 + }, + { + "epoch": 0.21044107599261078, + "grad_norm": 3.090602159500122, + "learning_rate": 9.022355826724308e-06, + "loss": 0.1286, + "step": 8316 + }, + { + "epoch": 0.21046638155730446, + "grad_norm": 3.6820173263549805, + "learning_rate": 9.022117312697516e-06, + "loss": 0.1453, + "step": 8317 + }, + { + "epoch": 0.21049168712199812, + "grad_norm": 9.88864803314209, + "learning_rate": 9.021878772732837e-06, + "loss": 0.3101, + "step": 8318 + }, + { + "epoch": 0.2105169926866918, + "grad_norm": 7.495756149291992, + "learning_rate": 9.021640206831811e-06, + "loss": 0.2719, + "step": 8319 + }, + { + "epoch": 0.21054229825138548, + "grad_norm": 15.647445678710938, + "learning_rate": 9.021401614995975e-06, + "loss": 0.2004, + "step": 8320 + }, + { + "epoch": 0.21056760381607917, + "grad_norm": 5.469583988189697, + "learning_rate": 9.021162997226868e-06, + "loss": 0.1366, + "step": 8321 + }, + { + "epoch": 0.21059290938077282, + "grad_norm": 5.184152126312256, + "learning_rate": 9.020924353526029e-06, + "loss": 0.1436, + "step": 8322 + }, + { + "epoch": 0.2106182149454665, + "grad_norm": 9.03649616241455, + "learning_rate": 9.020685683894997e-06, + "loss": 0.2487, + "step": 8323 + }, + { + "epoch": 0.2106435205101602, + "grad_norm": 2.8144278526306152, + "learning_rate": 9.02044698833531e-06, + "loss": 0.1437, + "step": 8324 + }, + { + "epoch": 0.21066882607485385, + "grad_norm": 3.1980178356170654, + "learning_rate": 9.02020826684851e-06, + "loss": 0.1251, + "step": 8325 + }, + { + "epoch": 0.21069413163954753, + "grad_norm": 7.731018543243408, + "learning_rate": 9.019969519436133e-06, + "loss": 0.2472, + "step": 8326 + }, + { + "epoch": 0.21071943720424122, + "grad_norm": 5.417757987976074, + "learning_rate": 9.019730746099722e-06, + "loss": 0.151, + "step": 8327 + }, + { + "epoch": 0.2107447427689349, + "grad_norm": 9.007912635803223, + "learning_rate": 9.019491946840814e-06, + "loss": 0.2759, + "step": 8328 + }, + { + "epoch": 0.21077004833362856, + "grad_norm": 3.116832971572876, + "learning_rate": 9.01925312166095e-06, + "loss": 0.1485, + "step": 8329 + }, + { + "epoch": 0.21079535389832224, + "grad_norm": 8.073911666870117, + "learning_rate": 9.01901427056167e-06, + "loss": 0.1689, + "step": 8330 + }, + { + "epoch": 0.21082065946301592, + "grad_norm": 6.204314708709717, + "learning_rate": 9.018775393544515e-06, + "loss": 0.2199, + "step": 8331 + }, + { + "epoch": 0.21084596502770958, + "grad_norm": 6.978521823883057, + "learning_rate": 9.018536490611024e-06, + "loss": 0.1718, + "step": 8332 + }, + { + "epoch": 0.21087127059240326, + "grad_norm": 3.3489251136779785, + "learning_rate": 9.018297561762742e-06, + "loss": 0.1142, + "step": 8333 + }, + { + "epoch": 0.21089657615709695, + "grad_norm": 5.200564861297607, + "learning_rate": 9.018058607001204e-06, + "loss": 0.2036, + "step": 8334 + }, + { + "epoch": 0.21092188172179063, + "grad_norm": 3.325432538986206, + "learning_rate": 9.017819626327953e-06, + "loss": 0.0904, + "step": 8335 + }, + { + "epoch": 0.2109471872864843, + "grad_norm": 4.073355197906494, + "learning_rate": 9.017580619744533e-06, + "loss": 0.1316, + "step": 8336 + }, + { + "epoch": 0.21097249285117797, + "grad_norm": 9.075081825256348, + "learning_rate": 9.017341587252479e-06, + "loss": 0.2415, + "step": 8337 + }, + { + "epoch": 0.21099779841587166, + "grad_norm": 3.6010348796844482, + "learning_rate": 9.01710252885334e-06, + "loss": 0.1156, + "step": 8338 + }, + { + "epoch": 0.2110231039805653, + "grad_norm": 4.6761088371276855, + "learning_rate": 9.016863444548653e-06, + "loss": 0.2274, + "step": 8339 + }, + { + "epoch": 0.211048409545259, + "grad_norm": 13.591646194458008, + "learning_rate": 9.01662433433996e-06, + "loss": 0.2232, + "step": 8340 + }, + { + "epoch": 0.21107371510995268, + "grad_norm": 3.531100273132324, + "learning_rate": 9.016385198228804e-06, + "loss": 0.1412, + "step": 8341 + }, + { + "epoch": 0.21109902067464636, + "grad_norm": 13.328577995300293, + "learning_rate": 9.016146036216727e-06, + "loss": 0.2158, + "step": 8342 + }, + { + "epoch": 0.21112432623934002, + "grad_norm": 3.8106136322021484, + "learning_rate": 9.015906848305273e-06, + "loss": 0.1727, + "step": 8343 + }, + { + "epoch": 0.2111496318040337, + "grad_norm": 12.792584419250488, + "learning_rate": 9.015667634495981e-06, + "loss": 0.2439, + "step": 8344 + }, + { + "epoch": 0.2111749373687274, + "grad_norm": 2.3079733848571777, + "learning_rate": 9.015428394790395e-06, + "loss": 0.0927, + "step": 8345 + }, + { + "epoch": 0.21120024293342107, + "grad_norm": 18.726144790649414, + "learning_rate": 9.01518912919006e-06, + "loss": 0.2136, + "step": 8346 + }, + { + "epoch": 0.21122554849811473, + "grad_norm": 6.633214473724365, + "learning_rate": 9.014949837696516e-06, + "loss": 0.1976, + "step": 8347 + }, + { + "epoch": 0.2112508540628084, + "grad_norm": 6.114283084869385, + "learning_rate": 9.014710520311307e-06, + "loss": 0.2344, + "step": 8348 + }, + { + "epoch": 0.2112761596275021, + "grad_norm": 8.041839599609375, + "learning_rate": 9.014471177035977e-06, + "loss": 0.1755, + "step": 8349 + }, + { + "epoch": 0.21130146519219575, + "grad_norm": 7.097731113433838, + "learning_rate": 9.014231807872069e-06, + "loss": 0.212, + "step": 8350 + }, + { + "epoch": 0.21132677075688944, + "grad_norm": 5.479612827301025, + "learning_rate": 9.013992412821126e-06, + "loss": 0.2255, + "step": 8351 + }, + { + "epoch": 0.21135207632158312, + "grad_norm": 13.475677490234375, + "learning_rate": 9.013752991884694e-06, + "loss": 0.1545, + "step": 8352 + }, + { + "epoch": 0.2113773818862768, + "grad_norm": 5.299386024475098, + "learning_rate": 9.013513545064316e-06, + "loss": 0.2466, + "step": 8353 + }, + { + "epoch": 0.21140268745097046, + "grad_norm": 11.276626586914062, + "learning_rate": 9.013274072361535e-06, + "loss": 0.2008, + "step": 8354 + }, + { + "epoch": 0.21142799301566415, + "grad_norm": 10.696271896362305, + "learning_rate": 9.013034573777895e-06, + "loss": 0.1973, + "step": 8355 + }, + { + "epoch": 0.21145329858035783, + "grad_norm": 6.845823764801025, + "learning_rate": 9.012795049314943e-06, + "loss": 0.2583, + "step": 8356 + }, + { + "epoch": 0.21147860414505149, + "grad_norm": 19.913515090942383, + "learning_rate": 9.012555498974222e-06, + "loss": 0.2426, + "step": 8357 + }, + { + "epoch": 0.21150390970974517, + "grad_norm": 3.837057113647461, + "learning_rate": 9.012315922757277e-06, + "loss": 0.1888, + "step": 8358 + }, + { + "epoch": 0.21152921527443885, + "grad_norm": 7.721858978271484, + "learning_rate": 9.012076320665653e-06, + "loss": 0.2091, + "step": 8359 + }, + { + "epoch": 0.21155452083913254, + "grad_norm": 4.660994052886963, + "learning_rate": 9.011836692700894e-06, + "loss": 0.1664, + "step": 8360 + }, + { + "epoch": 0.2115798264038262, + "grad_norm": 5.971701145172119, + "learning_rate": 9.011597038864547e-06, + "loss": 0.2112, + "step": 8361 + }, + { + "epoch": 0.21160513196851988, + "grad_norm": 4.504244327545166, + "learning_rate": 9.011357359158157e-06, + "loss": 0.129, + "step": 8362 + }, + { + "epoch": 0.21163043753321356, + "grad_norm": 18.119606018066406, + "learning_rate": 9.01111765358327e-06, + "loss": 0.3241, + "step": 8363 + }, + { + "epoch": 0.21165574309790722, + "grad_norm": 7.754940509796143, + "learning_rate": 9.01087792214143e-06, + "loss": 0.2178, + "step": 8364 + }, + { + "epoch": 0.2116810486626009, + "grad_norm": 4.089425563812256, + "learning_rate": 9.010638164834186e-06, + "loss": 0.1907, + "step": 8365 + }, + { + "epoch": 0.21170635422729459, + "grad_norm": 4.2671074867248535, + "learning_rate": 9.010398381663082e-06, + "loss": 0.1502, + "step": 8366 + }, + { + "epoch": 0.21173165979198827, + "grad_norm": 4.462687969207764, + "learning_rate": 9.010158572629664e-06, + "loss": 0.1402, + "step": 8367 + }, + { + "epoch": 0.21175696535668193, + "grad_norm": 5.637714862823486, + "learning_rate": 9.009918737735479e-06, + "loss": 0.1995, + "step": 8368 + }, + { + "epoch": 0.2117822709213756, + "grad_norm": 6.389106750488281, + "learning_rate": 9.009678876982075e-06, + "loss": 0.2108, + "step": 8369 + }, + { + "epoch": 0.2118075764860693, + "grad_norm": 5.806633949279785, + "learning_rate": 9.009438990370998e-06, + "loss": 0.2023, + "step": 8370 + }, + { + "epoch": 0.21183288205076295, + "grad_norm": 7.583001613616943, + "learning_rate": 9.009199077903794e-06, + "loss": 0.2566, + "step": 8371 + }, + { + "epoch": 0.21185818761545663, + "grad_norm": 3.1314961910247803, + "learning_rate": 9.008959139582011e-06, + "loss": 0.1022, + "step": 8372 + }, + { + "epoch": 0.21188349318015032, + "grad_norm": 9.08366584777832, + "learning_rate": 9.008719175407197e-06, + "loss": 0.3042, + "step": 8373 + }, + { + "epoch": 0.211908798744844, + "grad_norm": 3.6297597885131836, + "learning_rate": 9.0084791853809e-06, + "loss": 0.1471, + "step": 8374 + }, + { + "epoch": 0.21193410430953766, + "grad_norm": 5.49296760559082, + "learning_rate": 9.008239169504665e-06, + "loss": 0.2467, + "step": 8375 + }, + { + "epoch": 0.21195940987423134, + "grad_norm": 5.953841209411621, + "learning_rate": 9.007999127780041e-06, + "loss": 0.1895, + "step": 8376 + }, + { + "epoch": 0.21198471543892503, + "grad_norm": 7.230778217315674, + "learning_rate": 9.007759060208574e-06, + "loss": 0.1867, + "step": 8377 + }, + { + "epoch": 0.2120100210036187, + "grad_norm": 4.07774019241333, + "learning_rate": 9.007518966791818e-06, + "loss": 0.1836, + "step": 8378 + }, + { + "epoch": 0.21203532656831237, + "grad_norm": 14.330789566040039, + "learning_rate": 9.007278847531315e-06, + "loss": 0.2754, + "step": 8379 + }, + { + "epoch": 0.21206063213300605, + "grad_norm": 4.0979084968566895, + "learning_rate": 9.007038702428617e-06, + "loss": 0.1626, + "step": 8380 + }, + { + "epoch": 0.21208593769769973, + "grad_norm": 3.747246265411377, + "learning_rate": 9.006798531485273e-06, + "loss": 0.1181, + "step": 8381 + }, + { + "epoch": 0.2121112432623934, + "grad_norm": 6.865072727203369, + "learning_rate": 9.006558334702829e-06, + "loss": 0.2358, + "step": 8382 + }, + { + "epoch": 0.21213654882708707, + "grad_norm": 5.230052471160889, + "learning_rate": 9.006318112082837e-06, + "loss": 0.1449, + "step": 8383 + }, + { + "epoch": 0.21216185439178076, + "grad_norm": 14.052891731262207, + "learning_rate": 9.006077863626843e-06, + "loss": 0.2514, + "step": 8384 + }, + { + "epoch": 0.21218715995647444, + "grad_norm": 11.687732696533203, + "learning_rate": 9.0058375893364e-06, + "loss": 0.3988, + "step": 8385 + }, + { + "epoch": 0.2122124655211681, + "grad_norm": 6.2596564292907715, + "learning_rate": 9.005597289213053e-06, + "loss": 0.1836, + "step": 8386 + }, + { + "epoch": 0.21223777108586178, + "grad_norm": 5.926446437835693, + "learning_rate": 9.005356963258356e-06, + "loss": 0.1985, + "step": 8387 + }, + { + "epoch": 0.21226307665055547, + "grad_norm": 4.576792240142822, + "learning_rate": 9.005116611473857e-06, + "loss": 0.1791, + "step": 8388 + }, + { + "epoch": 0.21228838221524912, + "grad_norm": 4.265875816345215, + "learning_rate": 9.004876233861104e-06, + "loss": 0.185, + "step": 8389 + }, + { + "epoch": 0.2123136877799428, + "grad_norm": 10.6087646484375, + "learning_rate": 9.004635830421651e-06, + "loss": 0.1823, + "step": 8390 + }, + { + "epoch": 0.2123389933446365, + "grad_norm": 13.380814552307129, + "learning_rate": 9.004395401157046e-06, + "loss": 0.2438, + "step": 8391 + }, + { + "epoch": 0.21236429890933017, + "grad_norm": 4.559227466583252, + "learning_rate": 9.004154946068841e-06, + "loss": 0.1716, + "step": 8392 + }, + { + "epoch": 0.21238960447402383, + "grad_norm": 10.371009826660156, + "learning_rate": 9.003914465158584e-06, + "loss": 0.3007, + "step": 8393 + }, + { + "epoch": 0.2124149100387175, + "grad_norm": 4.077297210693359, + "learning_rate": 9.003673958427828e-06, + "loss": 0.172, + "step": 8394 + }, + { + "epoch": 0.2124402156034112, + "grad_norm": 3.2565629482269287, + "learning_rate": 9.003433425878123e-06, + "loss": 0.1146, + "step": 8395 + }, + { + "epoch": 0.21246552116810485, + "grad_norm": 5.186214447021484, + "learning_rate": 9.003192867511021e-06, + "loss": 0.1938, + "step": 8396 + }, + { + "epoch": 0.21249082673279854, + "grad_norm": 5.030869483947754, + "learning_rate": 9.002952283328073e-06, + "loss": 0.178, + "step": 8397 + }, + { + "epoch": 0.21251613229749222, + "grad_norm": 5.459820747375488, + "learning_rate": 9.002711673330831e-06, + "loss": 0.2133, + "step": 8398 + }, + { + "epoch": 0.2125414378621859, + "grad_norm": 9.697821617126465, + "learning_rate": 9.002471037520844e-06, + "loss": 0.2642, + "step": 8399 + }, + { + "epoch": 0.21256674342687956, + "grad_norm": 5.735559463500977, + "learning_rate": 9.002230375899668e-06, + "loss": 0.2158, + "step": 8400 + }, + { + "epoch": 0.21259204899157325, + "grad_norm": 6.04610538482666, + "learning_rate": 9.00198968846885e-06, + "loss": 0.1995, + "step": 8401 + }, + { + "epoch": 0.21261735455626693, + "grad_norm": 2.878509283065796, + "learning_rate": 9.001748975229947e-06, + "loss": 0.163, + "step": 8402 + }, + { + "epoch": 0.21264266012096059, + "grad_norm": 15.1868257522583, + "learning_rate": 9.001508236184509e-06, + "loss": 0.1853, + "step": 8403 + }, + { + "epoch": 0.21266796568565427, + "grad_norm": 38.90547180175781, + "learning_rate": 9.001267471334088e-06, + "loss": 0.3756, + "step": 8404 + }, + { + "epoch": 0.21269327125034795, + "grad_norm": 13.265949249267578, + "learning_rate": 9.001026680680239e-06, + "loss": 0.2841, + "step": 8405 + }, + { + "epoch": 0.21271857681504164, + "grad_norm": 5.740435600280762, + "learning_rate": 9.000785864224514e-06, + "loss": 0.1854, + "step": 8406 + }, + { + "epoch": 0.2127438823797353, + "grad_norm": 5.435031890869141, + "learning_rate": 9.000545021968461e-06, + "loss": 0.1841, + "step": 8407 + }, + { + "epoch": 0.21276918794442898, + "grad_norm": 6.9328131675720215, + "learning_rate": 9.000304153913639e-06, + "loss": 0.1649, + "step": 8408 + }, + { + "epoch": 0.21279449350912266, + "grad_norm": 11.003037452697754, + "learning_rate": 9.0000632600616e-06, + "loss": 0.2879, + "step": 8409 + }, + { + "epoch": 0.21281979907381635, + "grad_norm": 5.965943336486816, + "learning_rate": 8.999822340413897e-06, + "loss": 0.1705, + "step": 8410 + }, + { + "epoch": 0.21284510463851, + "grad_norm": 11.624044418334961, + "learning_rate": 8.999581394972086e-06, + "loss": 0.2394, + "step": 8411 + }, + { + "epoch": 0.21287041020320369, + "grad_norm": 3.2312915325164795, + "learning_rate": 8.999340423737715e-06, + "loss": 0.1457, + "step": 8412 + }, + { + "epoch": 0.21289571576789737, + "grad_norm": 10.401586532592773, + "learning_rate": 8.999099426712343e-06, + "loss": 0.3205, + "step": 8413 + }, + { + "epoch": 0.21292102133259103, + "grad_norm": 5.567553520202637, + "learning_rate": 8.998858403897523e-06, + "loss": 0.1878, + "step": 8414 + }, + { + "epoch": 0.2129463268972847, + "grad_norm": 9.036304473876953, + "learning_rate": 8.998617355294809e-06, + "loss": 0.2454, + "step": 8415 + }, + { + "epoch": 0.2129716324619784, + "grad_norm": 4.202988147735596, + "learning_rate": 8.998376280905756e-06, + "loss": 0.1924, + "step": 8416 + }, + { + "epoch": 0.21299693802667208, + "grad_norm": 3.9665348529815674, + "learning_rate": 8.998135180731917e-06, + "loss": 0.1075, + "step": 8417 + }, + { + "epoch": 0.21302224359136573, + "grad_norm": 31.0739688873291, + "learning_rate": 8.99789405477485e-06, + "loss": 0.2342, + "step": 8418 + }, + { + "epoch": 0.21304754915605942, + "grad_norm": 7.323211193084717, + "learning_rate": 8.997652903036105e-06, + "loss": 0.1973, + "step": 8419 + }, + { + "epoch": 0.2130728547207531, + "grad_norm": 12.826631546020508, + "learning_rate": 8.997411725517243e-06, + "loss": 0.2535, + "step": 8420 + }, + { + "epoch": 0.21309816028544676, + "grad_norm": 8.022686004638672, + "learning_rate": 8.997170522219814e-06, + "loss": 0.2593, + "step": 8421 + }, + { + "epoch": 0.21312346585014044, + "grad_norm": 10.277687072753906, + "learning_rate": 8.996929293145375e-06, + "loss": 0.2649, + "step": 8422 + }, + { + "epoch": 0.21314877141483413, + "grad_norm": 9.938939094543457, + "learning_rate": 8.996688038295485e-06, + "loss": 0.2688, + "step": 8423 + }, + { + "epoch": 0.2131740769795278, + "grad_norm": 7.319474697113037, + "learning_rate": 8.996446757671696e-06, + "loss": 0.152, + "step": 8424 + }, + { + "epoch": 0.21319938254422147, + "grad_norm": 14.796005249023438, + "learning_rate": 8.996205451275565e-06, + "loss": 0.1635, + "step": 8425 + }, + { + "epoch": 0.21322468810891515, + "grad_norm": 7.5186028480529785, + "learning_rate": 8.99596411910865e-06, + "loss": 0.1451, + "step": 8426 + }, + { + "epoch": 0.21324999367360883, + "grad_norm": 4.071700096130371, + "learning_rate": 8.995722761172503e-06, + "loss": 0.1773, + "step": 8427 + }, + { + "epoch": 0.2132752992383025, + "grad_norm": 4.182036876678467, + "learning_rate": 8.995481377468686e-06, + "loss": 0.1299, + "step": 8428 + }, + { + "epoch": 0.21330060480299617, + "grad_norm": 4.264776229858398, + "learning_rate": 8.995239967998752e-06, + "loss": 0.1403, + "step": 8429 + }, + { + "epoch": 0.21332591036768986, + "grad_norm": 13.915092468261719, + "learning_rate": 8.994998532764258e-06, + "loss": 0.2366, + "step": 8430 + }, + { + "epoch": 0.21335121593238354, + "grad_norm": 6.943350315093994, + "learning_rate": 8.994757071766762e-06, + "loss": 0.2056, + "step": 8431 + }, + { + "epoch": 0.2133765214970772, + "grad_norm": 5.287805080413818, + "learning_rate": 8.99451558500782e-06, + "loss": 0.2337, + "step": 8432 + }, + { + "epoch": 0.21340182706177088, + "grad_norm": 4.833090782165527, + "learning_rate": 8.994274072488992e-06, + "loss": 0.2204, + "step": 8433 + }, + { + "epoch": 0.21342713262646457, + "grad_norm": 9.747961044311523, + "learning_rate": 8.994032534211831e-06, + "loss": 0.2322, + "step": 8434 + }, + { + "epoch": 0.21345243819115822, + "grad_norm": 5.578670978546143, + "learning_rate": 8.993790970177898e-06, + "loss": 0.1651, + "step": 8435 + }, + { + "epoch": 0.2134777437558519, + "grad_norm": 9.181635856628418, + "learning_rate": 8.993549380388751e-06, + "loss": 0.1821, + "step": 8436 + }, + { + "epoch": 0.2135030493205456, + "grad_norm": 11.042889595031738, + "learning_rate": 8.993307764845945e-06, + "loss": 0.1957, + "step": 8437 + }, + { + "epoch": 0.21352835488523927, + "grad_norm": 6.152070045471191, + "learning_rate": 8.993066123551042e-06, + "loss": 0.1533, + "step": 8438 + }, + { + "epoch": 0.21355366044993293, + "grad_norm": 3.995028018951416, + "learning_rate": 8.992824456505599e-06, + "loss": 0.2087, + "step": 8439 + }, + { + "epoch": 0.21357896601462661, + "grad_norm": 4.897697448730469, + "learning_rate": 8.99258276371117e-06, + "loss": 0.2809, + "step": 8440 + }, + { + "epoch": 0.2136042715793203, + "grad_norm": 4.71058464050293, + "learning_rate": 8.99234104516932e-06, + "loss": 0.2164, + "step": 8441 + }, + { + "epoch": 0.21362957714401398, + "grad_norm": 10.97534465789795, + "learning_rate": 8.992099300881606e-06, + "loss": 0.244, + "step": 8442 + }, + { + "epoch": 0.21365488270870764, + "grad_norm": 3.465588092803955, + "learning_rate": 8.991857530849584e-06, + "loss": 0.1596, + "step": 8443 + }, + { + "epoch": 0.21368018827340132, + "grad_norm": 6.916377544403076, + "learning_rate": 8.991615735074817e-06, + "loss": 0.2172, + "step": 8444 + }, + { + "epoch": 0.213705493838095, + "grad_norm": 5.883934497833252, + "learning_rate": 8.99137391355886e-06, + "loss": 0.1996, + "step": 8445 + }, + { + "epoch": 0.21373079940278866, + "grad_norm": 6.422900676727295, + "learning_rate": 8.991132066303278e-06, + "loss": 0.1065, + "step": 8446 + }, + { + "epoch": 0.21375610496748235, + "grad_norm": 9.690683364868164, + "learning_rate": 8.990890193309627e-06, + "loss": 0.1249, + "step": 8447 + }, + { + "epoch": 0.21378141053217603, + "grad_norm": 5.4256272315979, + "learning_rate": 8.990648294579466e-06, + "loss": 0.2172, + "step": 8448 + }, + { + "epoch": 0.21380671609686971, + "grad_norm": 4.548116683959961, + "learning_rate": 8.990406370114359e-06, + "loss": 0.1585, + "step": 8449 + }, + { + "epoch": 0.21383202166156337, + "grad_norm": 5.422043323516846, + "learning_rate": 8.990164419915863e-06, + "loss": 0.1943, + "step": 8450 + }, + { + "epoch": 0.21385732722625705, + "grad_norm": 6.076163291931152, + "learning_rate": 8.989922443985538e-06, + "loss": 0.183, + "step": 8451 + }, + { + "epoch": 0.21388263279095074, + "grad_norm": 6.072235107421875, + "learning_rate": 8.989680442324947e-06, + "loss": 0.1876, + "step": 8452 + }, + { + "epoch": 0.2139079383556444, + "grad_norm": 5.202139377593994, + "learning_rate": 8.989438414935647e-06, + "loss": 0.2169, + "step": 8453 + }, + { + "epoch": 0.21393324392033808, + "grad_norm": 5.8376359939575195, + "learning_rate": 8.989196361819201e-06, + "loss": 0.2688, + "step": 8454 + }, + { + "epoch": 0.21395854948503176, + "grad_norm": 4.339588165283203, + "learning_rate": 8.98895428297717e-06, + "loss": 0.2207, + "step": 8455 + }, + { + "epoch": 0.21398385504972545, + "grad_norm": 4.078848838806152, + "learning_rate": 8.988712178411114e-06, + "loss": 0.171, + "step": 8456 + }, + { + "epoch": 0.2140091606144191, + "grad_norm": 6.923192977905273, + "learning_rate": 8.988470048122598e-06, + "loss": 0.2363, + "step": 8457 + }, + { + "epoch": 0.2140344661791128, + "grad_norm": 7.5330376625061035, + "learning_rate": 8.988227892113178e-06, + "loss": 0.1706, + "step": 8458 + }, + { + "epoch": 0.21405977174380647, + "grad_norm": 4.108709812164307, + "learning_rate": 8.987985710384418e-06, + "loss": 0.1436, + "step": 8459 + }, + { + "epoch": 0.21408507730850013, + "grad_norm": 3.271775960922241, + "learning_rate": 8.98774350293788e-06, + "loss": 0.1422, + "step": 8460 + }, + { + "epoch": 0.2141103828731938, + "grad_norm": 5.844564437866211, + "learning_rate": 8.987501269775128e-06, + "loss": 0.2311, + "step": 8461 + }, + { + "epoch": 0.2141356884378875, + "grad_norm": 4.731941223144531, + "learning_rate": 8.98725901089772e-06, + "loss": 0.1528, + "step": 8462 + }, + { + "epoch": 0.21416099400258118, + "grad_norm": 10.711053848266602, + "learning_rate": 8.987016726307221e-06, + "loss": 0.1923, + "step": 8463 + }, + { + "epoch": 0.21418629956727483, + "grad_norm": 3.874225616455078, + "learning_rate": 8.986774416005193e-06, + "loss": 0.1575, + "step": 8464 + }, + { + "epoch": 0.21421160513196852, + "grad_norm": 4.604304790496826, + "learning_rate": 8.986532079993197e-06, + "loss": 0.1319, + "step": 8465 + }, + { + "epoch": 0.2142369106966622, + "grad_norm": 3.928396701812744, + "learning_rate": 8.9862897182728e-06, + "loss": 0.1822, + "step": 8466 + }, + { + "epoch": 0.21426221626135586, + "grad_norm": 3.4976348876953125, + "learning_rate": 8.98604733084556e-06, + "loss": 0.1003, + "step": 8467 + }, + { + "epoch": 0.21428752182604954, + "grad_norm": 8.804192543029785, + "learning_rate": 8.985804917713043e-06, + "loss": 0.2499, + "step": 8468 + }, + { + "epoch": 0.21431282739074323, + "grad_norm": 6.852408409118652, + "learning_rate": 8.985562478876811e-06, + "loss": 0.2516, + "step": 8469 + }, + { + "epoch": 0.2143381329554369, + "grad_norm": 4.843634605407715, + "learning_rate": 8.985320014338428e-06, + "loss": 0.1788, + "step": 8470 + }, + { + "epoch": 0.21436343852013057, + "grad_norm": 8.953132629394531, + "learning_rate": 8.985077524099458e-06, + "loss": 0.24, + "step": 8471 + }, + { + "epoch": 0.21438874408482425, + "grad_norm": 7.7193522453308105, + "learning_rate": 8.984835008161464e-06, + "loss": 0.2371, + "step": 8472 + }, + { + "epoch": 0.21441404964951793, + "grad_norm": 8.047340393066406, + "learning_rate": 8.98459246652601e-06, + "loss": 0.2577, + "step": 8473 + }, + { + "epoch": 0.21443935521421162, + "grad_norm": 9.76678466796875, + "learning_rate": 8.984349899194661e-06, + "loss": 0.2077, + "step": 8474 + }, + { + "epoch": 0.21446466077890528, + "grad_norm": 4.656784534454346, + "learning_rate": 8.984107306168982e-06, + "loss": 0.1656, + "step": 8475 + }, + { + "epoch": 0.21448996634359896, + "grad_norm": 5.971406936645508, + "learning_rate": 8.983864687450533e-06, + "loss": 0.1883, + "step": 8476 + }, + { + "epoch": 0.21451527190829264, + "grad_norm": 4.159972190856934, + "learning_rate": 8.983622043040883e-06, + "loss": 0.1451, + "step": 8477 + }, + { + "epoch": 0.2145405774729863, + "grad_norm": 4.6911702156066895, + "learning_rate": 8.983379372941597e-06, + "loss": 0.1851, + "step": 8478 + }, + { + "epoch": 0.21456588303767998, + "grad_norm": 4.037179946899414, + "learning_rate": 8.983136677154236e-06, + "loss": 0.1706, + "step": 8479 + }, + { + "epoch": 0.21459118860237367, + "grad_norm": 25.754289627075195, + "learning_rate": 8.982893955680367e-06, + "loss": 0.2313, + "step": 8480 + }, + { + "epoch": 0.21461649416706735, + "grad_norm": 8.05750560760498, + "learning_rate": 8.982651208521558e-06, + "loss": 0.299, + "step": 8481 + }, + { + "epoch": 0.214641799731761, + "grad_norm": 6.090150833129883, + "learning_rate": 8.982408435679371e-06, + "loss": 0.1831, + "step": 8482 + }, + { + "epoch": 0.2146671052964547, + "grad_norm": 8.417072296142578, + "learning_rate": 8.982165637155374e-06, + "loss": 0.2973, + "step": 8483 + }, + { + "epoch": 0.21469241086114838, + "grad_norm": 18.942171096801758, + "learning_rate": 8.98192281295113e-06, + "loss": 0.231, + "step": 8484 + }, + { + "epoch": 0.21471771642584203, + "grad_norm": 5.395334720611572, + "learning_rate": 8.981679963068208e-06, + "loss": 0.1567, + "step": 8485 + }, + { + "epoch": 0.21474302199053572, + "grad_norm": 6.444736003875732, + "learning_rate": 8.981437087508171e-06, + "loss": 0.237, + "step": 8486 + }, + { + "epoch": 0.2147683275552294, + "grad_norm": 9.413355827331543, + "learning_rate": 8.981194186272587e-06, + "loss": 0.2479, + "step": 8487 + }, + { + "epoch": 0.21479363311992308, + "grad_norm": 8.47735595703125, + "learning_rate": 8.980951259363023e-06, + "loss": 0.1725, + "step": 8488 + }, + { + "epoch": 0.21481893868461674, + "grad_norm": 3.725318193435669, + "learning_rate": 8.980708306781045e-06, + "loss": 0.1416, + "step": 8489 + }, + { + "epoch": 0.21484424424931042, + "grad_norm": 3.998075246810913, + "learning_rate": 8.98046532852822e-06, + "loss": 0.1602, + "step": 8490 + }, + { + "epoch": 0.2148695498140041, + "grad_norm": 5.907116413116455, + "learning_rate": 8.980222324606113e-06, + "loss": 0.2345, + "step": 8491 + }, + { + "epoch": 0.21489485537869776, + "grad_norm": 9.197464942932129, + "learning_rate": 8.979979295016295e-06, + "loss": 0.219, + "step": 8492 + }, + { + "epoch": 0.21492016094339145, + "grad_norm": 12.950334548950195, + "learning_rate": 8.979736239760328e-06, + "loss": 0.1724, + "step": 8493 + }, + { + "epoch": 0.21494546650808513, + "grad_norm": 9.204604148864746, + "learning_rate": 8.979493158839784e-06, + "loss": 0.1469, + "step": 8494 + }, + { + "epoch": 0.21497077207277882, + "grad_norm": 7.142143249511719, + "learning_rate": 8.979250052256228e-06, + "loss": 0.2757, + "step": 8495 + }, + { + "epoch": 0.21499607763747247, + "grad_norm": 6.459751129150391, + "learning_rate": 8.979006920011229e-06, + "loss": 0.1744, + "step": 8496 + }, + { + "epoch": 0.21502138320216616, + "grad_norm": 6.206418991088867, + "learning_rate": 8.978763762106354e-06, + "loss": 0.1615, + "step": 8497 + }, + { + "epoch": 0.21504668876685984, + "grad_norm": 6.5825300216674805, + "learning_rate": 8.978520578543173e-06, + "loss": 0.2422, + "step": 8498 + }, + { + "epoch": 0.2150719943315535, + "grad_norm": 8.936675071716309, + "learning_rate": 8.978277369323252e-06, + "loss": 0.1277, + "step": 8499 + }, + { + "epoch": 0.21509729989624718, + "grad_norm": 3.714491605758667, + "learning_rate": 8.97803413444816e-06, + "loss": 0.1248, + "step": 8500 + }, + { + "epoch": 0.21512260546094086, + "grad_norm": 10.63693904876709, + "learning_rate": 8.977790873919465e-06, + "loss": 0.2625, + "step": 8501 + }, + { + "epoch": 0.21514791102563455, + "grad_norm": 3.4618663787841797, + "learning_rate": 8.977547587738739e-06, + "loss": 0.1327, + "step": 8502 + }, + { + "epoch": 0.2151732165903282, + "grad_norm": 4.411441326141357, + "learning_rate": 8.977304275907547e-06, + "loss": 0.1027, + "step": 8503 + }, + { + "epoch": 0.2151985221550219, + "grad_norm": 3.59899640083313, + "learning_rate": 8.97706093842746e-06, + "loss": 0.1453, + "step": 8504 + }, + { + "epoch": 0.21522382771971557, + "grad_norm": 7.490694046020508, + "learning_rate": 8.976817575300045e-06, + "loss": 0.2046, + "step": 8505 + }, + { + "epoch": 0.21524913328440926, + "grad_norm": 7.954771995544434, + "learning_rate": 8.976574186526876e-06, + "loss": 0.1518, + "step": 8506 + }, + { + "epoch": 0.2152744388491029, + "grad_norm": 7.850590705871582, + "learning_rate": 8.976330772109518e-06, + "loss": 0.2436, + "step": 8507 + }, + { + "epoch": 0.2152997444137966, + "grad_norm": 13.32821273803711, + "learning_rate": 8.976087332049543e-06, + "loss": 0.4248, + "step": 8508 + }, + { + "epoch": 0.21532504997849028, + "grad_norm": 4.455726146697998, + "learning_rate": 8.97584386634852e-06, + "loss": 0.179, + "step": 8509 + }, + { + "epoch": 0.21535035554318394, + "grad_norm": 15.596914291381836, + "learning_rate": 8.97560037500802e-06, + "loss": 0.2015, + "step": 8510 + }, + { + "epoch": 0.21537566110787762, + "grad_norm": 4.584231376647949, + "learning_rate": 8.975356858029614e-06, + "loss": 0.1975, + "step": 8511 + }, + { + "epoch": 0.2154009666725713, + "grad_norm": 9.262380599975586, + "learning_rate": 8.975113315414868e-06, + "loss": 0.2569, + "step": 8512 + }, + { + "epoch": 0.215426272237265, + "grad_norm": 13.255398750305176, + "learning_rate": 8.974869747165358e-06, + "loss": 0.2112, + "step": 8513 + }, + { + "epoch": 0.21545157780195864, + "grad_norm": 3.365062952041626, + "learning_rate": 8.974626153282651e-06, + "loss": 0.0934, + "step": 8514 + }, + { + "epoch": 0.21547688336665233, + "grad_norm": 4.685547351837158, + "learning_rate": 8.974382533768319e-06, + "loss": 0.2356, + "step": 8515 + }, + { + "epoch": 0.215502188931346, + "grad_norm": 8.550244331359863, + "learning_rate": 8.974138888623935e-06, + "loss": 0.3158, + "step": 8516 + }, + { + "epoch": 0.21552749449603967, + "grad_norm": 3.957136631011963, + "learning_rate": 8.973895217851069e-06, + "loss": 0.1892, + "step": 8517 + }, + { + "epoch": 0.21555280006073335, + "grad_norm": 7.95678186416626, + "learning_rate": 8.97365152145129e-06, + "loss": 0.2073, + "step": 8518 + }, + { + "epoch": 0.21557810562542704, + "grad_norm": 5.391383171081543, + "learning_rate": 8.973407799426172e-06, + "loss": 0.2009, + "step": 8519 + }, + { + "epoch": 0.21560341119012072, + "grad_norm": 4.422752857208252, + "learning_rate": 8.973164051777288e-06, + "loss": 0.1265, + "step": 8520 + }, + { + "epoch": 0.21562871675481438, + "grad_norm": 9.519980430603027, + "learning_rate": 8.972920278506205e-06, + "loss": 0.2188, + "step": 8521 + }, + { + "epoch": 0.21565402231950806, + "grad_norm": 7.600645065307617, + "learning_rate": 8.9726764796145e-06, + "loss": 0.1495, + "step": 8522 + }, + { + "epoch": 0.21567932788420174, + "grad_norm": 6.265585422515869, + "learning_rate": 8.972432655103745e-06, + "loss": 0.2413, + "step": 8523 + }, + { + "epoch": 0.2157046334488954, + "grad_norm": 4.875582695007324, + "learning_rate": 8.972188804975507e-06, + "loss": 0.1985, + "step": 8524 + }, + { + "epoch": 0.21572993901358908, + "grad_norm": 3.871258020401001, + "learning_rate": 8.971944929231364e-06, + "loss": 0.1889, + "step": 8525 + }, + { + "epoch": 0.21575524457828277, + "grad_norm": 4.291952133178711, + "learning_rate": 8.971701027872888e-06, + "loss": 0.1878, + "step": 8526 + }, + { + "epoch": 0.21578055014297645, + "grad_norm": 3.995680570602417, + "learning_rate": 8.971457100901649e-06, + "loss": 0.1801, + "step": 8527 + }, + { + "epoch": 0.2158058557076701, + "grad_norm": 13.155547142028809, + "learning_rate": 8.971213148319223e-06, + "loss": 0.2267, + "step": 8528 + }, + { + "epoch": 0.2158311612723638, + "grad_norm": 3.3154449462890625, + "learning_rate": 8.970969170127182e-06, + "loss": 0.171, + "step": 8529 + }, + { + "epoch": 0.21585646683705748, + "grad_norm": 10.46595287322998, + "learning_rate": 8.9707251663271e-06, + "loss": 0.2, + "step": 8530 + }, + { + "epoch": 0.21588177240175113, + "grad_norm": 6.953920364379883, + "learning_rate": 8.97048113692055e-06, + "loss": 0.2269, + "step": 8531 + }, + { + "epoch": 0.21590707796644482, + "grad_norm": 5.0415167808532715, + "learning_rate": 8.970237081909105e-06, + "loss": 0.2112, + "step": 8532 + }, + { + "epoch": 0.2159323835311385, + "grad_norm": 18.1259765625, + "learning_rate": 8.969993001294339e-06, + "loss": 0.1986, + "step": 8533 + }, + { + "epoch": 0.21595768909583218, + "grad_norm": 2.3838603496551514, + "learning_rate": 8.969748895077828e-06, + "loss": 0.0922, + "step": 8534 + }, + { + "epoch": 0.21598299466052584, + "grad_norm": 19.277002334594727, + "learning_rate": 8.969504763261144e-06, + "loss": 0.2801, + "step": 8535 + }, + { + "epoch": 0.21600830022521952, + "grad_norm": 4.741073131561279, + "learning_rate": 8.969260605845861e-06, + "loss": 0.1024, + "step": 8536 + }, + { + "epoch": 0.2160336057899132, + "grad_norm": 5.279925346374512, + "learning_rate": 8.969016422833557e-06, + "loss": 0.2542, + "step": 8537 + }, + { + "epoch": 0.2160589113546069, + "grad_norm": 5.760890007019043, + "learning_rate": 8.968772214225804e-06, + "loss": 0.2114, + "step": 8538 + }, + { + "epoch": 0.21608421691930055, + "grad_norm": 3.919179677963257, + "learning_rate": 8.968527980024177e-06, + "loss": 0.1796, + "step": 8539 + }, + { + "epoch": 0.21610952248399423, + "grad_norm": 5.374841690063477, + "learning_rate": 8.96828372023025e-06, + "loss": 0.2236, + "step": 8540 + }, + { + "epoch": 0.21613482804868792, + "grad_norm": 8.659976959228516, + "learning_rate": 8.968039434845602e-06, + "loss": 0.2225, + "step": 8541 + }, + { + "epoch": 0.21616013361338157, + "grad_norm": 7.663945198059082, + "learning_rate": 8.967795123871803e-06, + "loss": 0.172, + "step": 8542 + }, + { + "epoch": 0.21618543917807526, + "grad_norm": 5.071283340454102, + "learning_rate": 8.967550787310433e-06, + "loss": 0.265, + "step": 8543 + }, + { + "epoch": 0.21621074474276894, + "grad_norm": 6.301100730895996, + "learning_rate": 8.967306425163064e-06, + "loss": 0.2425, + "step": 8544 + }, + { + "epoch": 0.21623605030746262, + "grad_norm": 3.7669014930725098, + "learning_rate": 8.967062037431276e-06, + "loss": 0.1587, + "step": 8545 + }, + { + "epoch": 0.21626135587215628, + "grad_norm": 6.8060407638549805, + "learning_rate": 8.966817624116642e-06, + "loss": 0.1636, + "step": 8546 + }, + { + "epoch": 0.21628666143684996, + "grad_norm": 2.7409377098083496, + "learning_rate": 8.96657318522074e-06, + "loss": 0.2104, + "step": 8547 + }, + { + "epoch": 0.21631196700154365, + "grad_norm": 4.23255729675293, + "learning_rate": 8.966328720745143e-06, + "loss": 0.197, + "step": 8548 + }, + { + "epoch": 0.2163372725662373, + "grad_norm": 13.289803504943848, + "learning_rate": 8.966084230691432e-06, + "loss": 0.2001, + "step": 8549 + }, + { + "epoch": 0.216362578130931, + "grad_norm": 9.409523010253906, + "learning_rate": 8.965839715061182e-06, + "loss": 0.4, + "step": 8550 + }, + { + "epoch": 0.21638788369562467, + "grad_norm": 7.149208068847656, + "learning_rate": 8.965595173855968e-06, + "loss": 0.2552, + "step": 8551 + }, + { + "epoch": 0.21641318926031836, + "grad_norm": 9.616747856140137, + "learning_rate": 8.965350607077369e-06, + "loss": 0.2576, + "step": 8552 + }, + { + "epoch": 0.216438494825012, + "grad_norm": 5.453551292419434, + "learning_rate": 8.96510601472696e-06, + "loss": 0.1921, + "step": 8553 + }, + { + "epoch": 0.2164638003897057, + "grad_norm": 7.603755950927734, + "learning_rate": 8.96486139680632e-06, + "loss": 0.1161, + "step": 8554 + }, + { + "epoch": 0.21648910595439938, + "grad_norm": 10.065412521362305, + "learning_rate": 8.96461675331703e-06, + "loss": 0.1465, + "step": 8555 + }, + { + "epoch": 0.21651441151909304, + "grad_norm": 4.642651557922363, + "learning_rate": 8.964372084260663e-06, + "loss": 0.1152, + "step": 8556 + }, + { + "epoch": 0.21653971708378672, + "grad_norm": 5.692375183105469, + "learning_rate": 8.964127389638795e-06, + "loss": 0.222, + "step": 8557 + }, + { + "epoch": 0.2165650226484804, + "grad_norm": 8.61133098602295, + "learning_rate": 8.963882669453008e-06, + "loss": 0.1727, + "step": 8558 + }, + { + "epoch": 0.2165903282131741, + "grad_norm": 5.750070095062256, + "learning_rate": 8.963637923704878e-06, + "loss": 0.1303, + "step": 8559 + }, + { + "epoch": 0.21661563377786774, + "grad_norm": 6.291069507598877, + "learning_rate": 8.963393152395986e-06, + "loss": 0.25, + "step": 8560 + }, + { + "epoch": 0.21664093934256143, + "grad_norm": 5.462020397186279, + "learning_rate": 8.963148355527909e-06, + "loss": 0.1554, + "step": 8561 + }, + { + "epoch": 0.2166662449072551, + "grad_norm": 3.76153564453125, + "learning_rate": 8.962903533102225e-06, + "loss": 0.1467, + "step": 8562 + }, + { + "epoch": 0.21669155047194877, + "grad_norm": 6.9630327224731445, + "learning_rate": 8.962658685120514e-06, + "loss": 0.1712, + "step": 8563 + }, + { + "epoch": 0.21671685603664245, + "grad_norm": 9.723196029663086, + "learning_rate": 8.962413811584352e-06, + "loss": 0.249, + "step": 8564 + }, + { + "epoch": 0.21674216160133614, + "grad_norm": 9.783561706542969, + "learning_rate": 8.962168912495322e-06, + "loss": 0.2337, + "step": 8565 + }, + { + "epoch": 0.21676746716602982, + "grad_norm": 5.308958530426025, + "learning_rate": 8.961923987855001e-06, + "loss": 0.1878, + "step": 8566 + }, + { + "epoch": 0.21679277273072348, + "grad_norm": 16.8566951751709, + "learning_rate": 8.96167903766497e-06, + "loss": 0.2423, + "step": 8567 + }, + { + "epoch": 0.21681807829541716, + "grad_norm": 7.2014007568359375, + "learning_rate": 8.961434061926807e-06, + "loss": 0.3276, + "step": 8568 + }, + { + "epoch": 0.21684338386011084, + "grad_norm": 2.989917278289795, + "learning_rate": 8.961189060642094e-06, + "loss": 0.1217, + "step": 8569 + }, + { + "epoch": 0.21686868942480453, + "grad_norm": 9.574421882629395, + "learning_rate": 8.960944033812408e-06, + "loss": 0.253, + "step": 8570 + }, + { + "epoch": 0.21689399498949818, + "grad_norm": 7.948975563049316, + "learning_rate": 8.960698981439332e-06, + "loss": 0.2422, + "step": 8571 + }, + { + "epoch": 0.21691930055419187, + "grad_norm": 7.666871547698975, + "learning_rate": 8.960453903524444e-06, + "loss": 0.2114, + "step": 8572 + }, + { + "epoch": 0.21694460611888555, + "grad_norm": 8.360780715942383, + "learning_rate": 8.960208800069325e-06, + "loss": 0.166, + "step": 8573 + }, + { + "epoch": 0.2169699116835792, + "grad_norm": 3.9618959426879883, + "learning_rate": 8.959963671075559e-06, + "loss": 0.1612, + "step": 8574 + }, + { + "epoch": 0.2169952172482729, + "grad_norm": 5.774713516235352, + "learning_rate": 8.95971851654472e-06, + "loss": 0.1997, + "step": 8575 + }, + { + "epoch": 0.21702052281296658, + "grad_norm": 4.4299445152282715, + "learning_rate": 8.959473336478396e-06, + "loss": 0.176, + "step": 8576 + }, + { + "epoch": 0.21704582837766026, + "grad_norm": 2.4991016387939453, + "learning_rate": 8.959228130878163e-06, + "loss": 0.122, + "step": 8577 + }, + { + "epoch": 0.21707113394235392, + "grad_norm": 3.437831401824951, + "learning_rate": 8.958982899745606e-06, + "loss": 0.0866, + "step": 8578 + }, + { + "epoch": 0.2170964395070476, + "grad_norm": 3.726733684539795, + "learning_rate": 8.958737643082303e-06, + "loss": 0.1636, + "step": 8579 + }, + { + "epoch": 0.21712174507174128, + "grad_norm": 4.2478766441345215, + "learning_rate": 8.95849236088984e-06, + "loss": 0.1934, + "step": 8580 + }, + { + "epoch": 0.21714705063643494, + "grad_norm": 3.6697733402252197, + "learning_rate": 8.958247053169793e-06, + "loss": 0.1783, + "step": 8581 + }, + { + "epoch": 0.21717235620112862, + "grad_norm": 8.624124526977539, + "learning_rate": 8.958001719923748e-06, + "loss": 0.2732, + "step": 8582 + }, + { + "epoch": 0.2171976617658223, + "grad_norm": 5.793237686157227, + "learning_rate": 8.957756361153285e-06, + "loss": 0.2119, + "step": 8583 + }, + { + "epoch": 0.217222967330516, + "grad_norm": 2.992094039916992, + "learning_rate": 8.957510976859989e-06, + "loss": 0.1386, + "step": 8584 + }, + { + "epoch": 0.21724827289520965, + "grad_norm": 9.790671348571777, + "learning_rate": 8.957265567045442e-06, + "loss": 0.2501, + "step": 8585 + }, + { + "epoch": 0.21727357845990333, + "grad_norm": 9.623562812805176, + "learning_rate": 8.957020131711223e-06, + "loss": 0.1987, + "step": 8586 + }, + { + "epoch": 0.21729888402459702, + "grad_norm": 3.6294593811035156, + "learning_rate": 8.95677467085892e-06, + "loss": 0.1118, + "step": 8587 + }, + { + "epoch": 0.21732418958929067, + "grad_norm": 5.333874702453613, + "learning_rate": 8.95652918449011e-06, + "loss": 0.2376, + "step": 8588 + }, + { + "epoch": 0.21734949515398436, + "grad_norm": 5.978618144989014, + "learning_rate": 8.956283672606381e-06, + "loss": 0.2931, + "step": 8589 + }, + { + "epoch": 0.21737480071867804, + "grad_norm": 10.792621612548828, + "learning_rate": 8.956038135209314e-06, + "loss": 0.3357, + "step": 8590 + }, + { + "epoch": 0.21740010628337172, + "grad_norm": 3.6917366981506348, + "learning_rate": 8.955792572300492e-06, + "loss": 0.1699, + "step": 8591 + }, + { + "epoch": 0.21742541184806538, + "grad_norm": 6.65817403793335, + "learning_rate": 8.955546983881503e-06, + "loss": 0.2328, + "step": 8592 + }, + { + "epoch": 0.21745071741275906, + "grad_norm": 6.176294326782227, + "learning_rate": 8.955301369953923e-06, + "loss": 0.2327, + "step": 8593 + }, + { + "epoch": 0.21747602297745275, + "grad_norm": 5.89373254776001, + "learning_rate": 8.955055730519342e-06, + "loss": 0.2174, + "step": 8594 + }, + { + "epoch": 0.2175013285421464, + "grad_norm": 6.290531635284424, + "learning_rate": 8.954810065579344e-06, + "loss": 0.1585, + "step": 8595 + }, + { + "epoch": 0.2175266341068401, + "grad_norm": 4.102779865264893, + "learning_rate": 8.954564375135509e-06, + "loss": 0.1805, + "step": 8596 + }, + { + "epoch": 0.21755193967153377, + "grad_norm": 3.961047649383545, + "learning_rate": 8.954318659189425e-06, + "loss": 0.2123, + "step": 8597 + }, + { + "epoch": 0.21757724523622746, + "grad_norm": 7.4979119300842285, + "learning_rate": 8.954072917742676e-06, + "loss": 0.2436, + "step": 8598 + }, + { + "epoch": 0.2176025508009211, + "grad_norm": 6.398622035980225, + "learning_rate": 8.953827150796845e-06, + "loss": 0.2257, + "step": 8599 + }, + { + "epoch": 0.2176278563656148, + "grad_norm": 2.8310916423797607, + "learning_rate": 8.95358135835352e-06, + "loss": 0.174, + "step": 8600 + }, + { + "epoch": 0.21765316193030848, + "grad_norm": 10.592679977416992, + "learning_rate": 8.953335540414283e-06, + "loss": 0.2458, + "step": 8601 + }, + { + "epoch": 0.21767846749500216, + "grad_norm": 3.824323892593384, + "learning_rate": 8.95308969698072e-06, + "loss": 0.1263, + "step": 8602 + }, + { + "epoch": 0.21770377305969582, + "grad_norm": 8.224188804626465, + "learning_rate": 8.952843828054418e-06, + "loss": 0.2087, + "step": 8603 + }, + { + "epoch": 0.2177290786243895, + "grad_norm": 5.1560750007629395, + "learning_rate": 8.95259793363696e-06, + "loss": 0.1959, + "step": 8604 + }, + { + "epoch": 0.2177543841890832, + "grad_norm": 5.425243377685547, + "learning_rate": 8.952352013729934e-06, + "loss": 0.1494, + "step": 8605 + }, + { + "epoch": 0.21777968975377685, + "grad_norm": 4.764886856079102, + "learning_rate": 8.952106068334925e-06, + "loss": 0.2137, + "step": 8606 + }, + { + "epoch": 0.21780499531847053, + "grad_norm": 4.1869730949401855, + "learning_rate": 8.951860097453521e-06, + "loss": 0.2013, + "step": 8607 + }, + { + "epoch": 0.2178303008831642, + "grad_norm": 10.167938232421875, + "learning_rate": 8.951614101087305e-06, + "loss": 0.1696, + "step": 8608 + }, + { + "epoch": 0.2178556064478579, + "grad_norm": 8.607876777648926, + "learning_rate": 8.951368079237863e-06, + "loss": 0.1987, + "step": 8609 + }, + { + "epoch": 0.21788091201255155, + "grad_norm": 5.409172058105469, + "learning_rate": 8.951122031906784e-06, + "loss": 0.1786, + "step": 8610 + }, + { + "epoch": 0.21790621757724524, + "grad_norm": 2.562429189682007, + "learning_rate": 8.950875959095656e-06, + "loss": 0.1201, + "step": 8611 + }, + { + "epoch": 0.21793152314193892, + "grad_norm": 7.500498294830322, + "learning_rate": 8.950629860806062e-06, + "loss": 0.1727, + "step": 8612 + }, + { + "epoch": 0.21795682870663258, + "grad_norm": 14.285513877868652, + "learning_rate": 8.950383737039592e-06, + "loss": 0.2881, + "step": 8613 + }, + { + "epoch": 0.21798213427132626, + "grad_norm": 6.210323333740234, + "learning_rate": 8.950137587797832e-06, + "loss": 0.1814, + "step": 8614 + }, + { + "epoch": 0.21800743983601995, + "grad_norm": 5.762773513793945, + "learning_rate": 8.949891413082369e-06, + "loss": 0.248, + "step": 8615 + }, + { + "epoch": 0.21803274540071363, + "grad_norm": 6.441662311553955, + "learning_rate": 8.94964521289479e-06, + "loss": 0.1792, + "step": 8616 + }, + { + "epoch": 0.21805805096540729, + "grad_norm": 4.30980920791626, + "learning_rate": 8.949398987236686e-06, + "loss": 0.1388, + "step": 8617 + }, + { + "epoch": 0.21808335653010097, + "grad_norm": 11.999780654907227, + "learning_rate": 8.94915273610964e-06, + "loss": 0.1858, + "step": 8618 + }, + { + "epoch": 0.21810866209479465, + "grad_norm": 4.152886867523193, + "learning_rate": 8.948906459515245e-06, + "loss": 0.1355, + "step": 8619 + }, + { + "epoch": 0.2181339676594883, + "grad_norm": 4.842062950134277, + "learning_rate": 8.948660157455085e-06, + "loss": 0.162, + "step": 8620 + }, + { + "epoch": 0.218159273224182, + "grad_norm": 3.7725143432617188, + "learning_rate": 8.948413829930752e-06, + "loss": 0.1737, + "step": 8621 + }, + { + "epoch": 0.21818457878887568, + "grad_norm": 12.350003242492676, + "learning_rate": 8.94816747694383e-06, + "loss": 0.2224, + "step": 8622 + }, + { + "epoch": 0.21820988435356936, + "grad_norm": 4.861994743347168, + "learning_rate": 8.947921098495912e-06, + "loss": 0.1551, + "step": 8623 + }, + { + "epoch": 0.21823518991826302, + "grad_norm": 4.182019233703613, + "learning_rate": 8.947674694588585e-06, + "loss": 0.1448, + "step": 8624 + }, + { + "epoch": 0.2182604954829567, + "grad_norm": 6.049222946166992, + "learning_rate": 8.94742826522344e-06, + "loss": 0.2224, + "step": 8625 + }, + { + "epoch": 0.21828580104765039, + "grad_norm": 4.370765209197998, + "learning_rate": 8.947181810402061e-06, + "loss": 0.2107, + "step": 8626 + }, + { + "epoch": 0.21831110661234404, + "grad_norm": 6.742657661437988, + "learning_rate": 8.946935330126043e-06, + "loss": 0.2667, + "step": 8627 + }, + { + "epoch": 0.21833641217703773, + "grad_norm": 11.363924026489258, + "learning_rate": 8.946688824396972e-06, + "loss": 0.2625, + "step": 8628 + }, + { + "epoch": 0.2183617177417314, + "grad_norm": 6.3689727783203125, + "learning_rate": 8.94644229321644e-06, + "loss": 0.2775, + "step": 8629 + }, + { + "epoch": 0.2183870233064251, + "grad_norm": 4.8997273445129395, + "learning_rate": 8.946195736586035e-06, + "loss": 0.21, + "step": 8630 + }, + { + "epoch": 0.21841232887111875, + "grad_norm": 3.442777156829834, + "learning_rate": 8.945949154507347e-06, + "loss": 0.1491, + "step": 8631 + }, + { + "epoch": 0.21843763443581243, + "grad_norm": 3.3553357124328613, + "learning_rate": 8.94570254698197e-06, + "loss": 0.2299, + "step": 8632 + }, + { + "epoch": 0.21846294000050612, + "grad_norm": 5.817130088806152, + "learning_rate": 8.945455914011488e-06, + "loss": 0.2028, + "step": 8633 + }, + { + "epoch": 0.2184882455651998, + "grad_norm": 9.36555290222168, + "learning_rate": 8.945209255597496e-06, + "loss": 0.158, + "step": 8634 + }, + { + "epoch": 0.21851355112989346, + "grad_norm": 3.978936195373535, + "learning_rate": 8.944962571741584e-06, + "loss": 0.1808, + "step": 8635 + }, + { + "epoch": 0.21853885669458714, + "grad_norm": 5.482710838317871, + "learning_rate": 8.94471586244534e-06, + "loss": 0.2108, + "step": 8636 + }, + { + "epoch": 0.21856416225928083, + "grad_norm": 7.871764659881592, + "learning_rate": 8.94446912771036e-06, + "loss": 0.252, + "step": 8637 + }, + { + "epoch": 0.21858946782397448, + "grad_norm": 8.131762504577637, + "learning_rate": 8.944222367538231e-06, + "loss": 0.1553, + "step": 8638 + }, + { + "epoch": 0.21861477338866817, + "grad_norm": 6.523644924163818, + "learning_rate": 8.943975581930545e-06, + "loss": 0.1944, + "step": 8639 + }, + { + "epoch": 0.21864007895336185, + "grad_norm": 6.325746536254883, + "learning_rate": 8.943728770888895e-06, + "loss": 0.2185, + "step": 8640 + }, + { + "epoch": 0.21866538451805553, + "grad_norm": 5.192887306213379, + "learning_rate": 8.94348193441487e-06, + "loss": 0.2243, + "step": 8641 + }, + { + "epoch": 0.2186906900827492, + "grad_norm": 3.7688405513763428, + "learning_rate": 8.943235072510066e-06, + "loss": 0.0543, + "step": 8642 + }, + { + "epoch": 0.21871599564744287, + "grad_norm": 3.724594831466675, + "learning_rate": 8.94298818517607e-06, + "loss": 0.1757, + "step": 8643 + }, + { + "epoch": 0.21874130121213656, + "grad_norm": 3.639026403427124, + "learning_rate": 8.942741272414478e-06, + "loss": 0.1547, + "step": 8644 + }, + { + "epoch": 0.2187666067768302, + "grad_norm": 13.359231948852539, + "learning_rate": 8.942494334226883e-06, + "loss": 0.3214, + "step": 8645 + }, + { + "epoch": 0.2187919123415239, + "grad_norm": 7.879061698913574, + "learning_rate": 8.942247370614874e-06, + "loss": 0.2284, + "step": 8646 + }, + { + "epoch": 0.21881721790621758, + "grad_norm": 8.433684349060059, + "learning_rate": 8.942000381580044e-06, + "loss": 0.2917, + "step": 8647 + }, + { + "epoch": 0.21884252347091127, + "grad_norm": 6.090966701507568, + "learning_rate": 8.941753367123986e-06, + "loss": 0.1788, + "step": 8648 + }, + { + "epoch": 0.21886782903560492, + "grad_norm": 6.355635166168213, + "learning_rate": 8.941506327248296e-06, + "loss": 0.2465, + "step": 8649 + }, + { + "epoch": 0.2188931346002986, + "grad_norm": 4.833603382110596, + "learning_rate": 8.941259261954564e-06, + "loss": 0.1427, + "step": 8650 + }, + { + "epoch": 0.2189184401649923, + "grad_norm": 6.572903633117676, + "learning_rate": 8.941012171244386e-06, + "loss": 0.2398, + "step": 8651 + }, + { + "epoch": 0.21894374572968595, + "grad_norm": 4.012027740478516, + "learning_rate": 8.940765055119351e-06, + "loss": 0.1581, + "step": 8652 + }, + { + "epoch": 0.21896905129437963, + "grad_norm": 3.4514684677124023, + "learning_rate": 8.940517913581055e-06, + "loss": 0.2314, + "step": 8653 + }, + { + "epoch": 0.2189943568590733, + "grad_norm": 6.847410678863525, + "learning_rate": 8.940270746631094e-06, + "loss": 0.1792, + "step": 8654 + }, + { + "epoch": 0.219019662423767, + "grad_norm": 9.080938339233398, + "learning_rate": 8.940023554271058e-06, + "loss": 0.1939, + "step": 8655 + }, + { + "epoch": 0.21904496798846065, + "grad_norm": 12.377106666564941, + "learning_rate": 8.939776336502544e-06, + "loss": 0.2495, + "step": 8656 + }, + { + "epoch": 0.21907027355315434, + "grad_norm": 5.799506187438965, + "learning_rate": 8.939529093327145e-06, + "loss": 0.1865, + "step": 8657 + }, + { + "epoch": 0.21909557911784802, + "grad_norm": 10.86510944366455, + "learning_rate": 8.939281824746456e-06, + "loss": 0.3656, + "step": 8658 + }, + { + "epoch": 0.21912088468254168, + "grad_norm": 2.9567060470581055, + "learning_rate": 8.93903453076207e-06, + "loss": 0.1556, + "step": 8659 + }, + { + "epoch": 0.21914619024723536, + "grad_norm": 6.276323318481445, + "learning_rate": 8.938787211375584e-06, + "loss": 0.2115, + "step": 8660 + }, + { + "epoch": 0.21917149581192905, + "grad_norm": 4.578521728515625, + "learning_rate": 8.938539866588593e-06, + "loss": 0.1373, + "step": 8661 + }, + { + "epoch": 0.21919680137662273, + "grad_norm": 5.494046688079834, + "learning_rate": 8.93829249640269e-06, + "loss": 0.2179, + "step": 8662 + }, + { + "epoch": 0.21922210694131639, + "grad_norm": 4.811040878295898, + "learning_rate": 8.93804510081947e-06, + "loss": 0.1831, + "step": 8663 + }, + { + "epoch": 0.21924741250601007, + "grad_norm": 13.828702926635742, + "learning_rate": 8.937797679840532e-06, + "loss": 0.2288, + "step": 8664 + }, + { + "epoch": 0.21927271807070375, + "grad_norm": 2.9384968280792236, + "learning_rate": 8.937550233467466e-06, + "loss": 0.1047, + "step": 8665 + }, + { + "epoch": 0.21929802363539744, + "grad_norm": 4.9536919593811035, + "learning_rate": 8.937302761701875e-06, + "loss": 0.1976, + "step": 8666 + }, + { + "epoch": 0.2193233292000911, + "grad_norm": 11.31653881072998, + "learning_rate": 8.937055264545348e-06, + "loss": 0.2696, + "step": 8667 + }, + { + "epoch": 0.21934863476478478, + "grad_norm": 6.126646041870117, + "learning_rate": 8.936807741999486e-06, + "loss": 0.3106, + "step": 8668 + }, + { + "epoch": 0.21937394032947846, + "grad_norm": 19.977924346923828, + "learning_rate": 8.936560194065883e-06, + "loss": 0.2185, + "step": 8669 + }, + { + "epoch": 0.21939924589417212, + "grad_norm": 2.454775094985962, + "learning_rate": 8.936312620746134e-06, + "loss": 0.1604, + "step": 8670 + }, + { + "epoch": 0.2194245514588658, + "grad_norm": 4.304953575134277, + "learning_rate": 8.936065022041837e-06, + "loss": 0.1891, + "step": 8671 + }, + { + "epoch": 0.21944985702355949, + "grad_norm": 11.112907409667969, + "learning_rate": 8.93581739795459e-06, + "loss": 0.3193, + "step": 8672 + }, + { + "epoch": 0.21947516258825317, + "grad_norm": 2.7912566661834717, + "learning_rate": 8.935569748485988e-06, + "loss": 0.1574, + "step": 8673 + }, + { + "epoch": 0.21950046815294683, + "grad_norm": 5.0230889320373535, + "learning_rate": 8.93532207363763e-06, + "loss": 0.1671, + "step": 8674 + }, + { + "epoch": 0.2195257737176405, + "grad_norm": 4.679023742675781, + "learning_rate": 8.935074373411111e-06, + "loss": 0.2595, + "step": 8675 + }, + { + "epoch": 0.2195510792823342, + "grad_norm": 7.2388997077941895, + "learning_rate": 8.934826647808029e-06, + "loss": 0.194, + "step": 8676 + }, + { + "epoch": 0.21957638484702785, + "grad_norm": 8.858641624450684, + "learning_rate": 8.934578896829983e-06, + "loss": 0.2246, + "step": 8677 + }, + { + "epoch": 0.21960169041172153, + "grad_norm": 5.964995861053467, + "learning_rate": 8.934331120478568e-06, + "loss": 0.239, + "step": 8678 + }, + { + "epoch": 0.21962699597641522, + "grad_norm": 4.349976539611816, + "learning_rate": 8.934083318755386e-06, + "loss": 0.2103, + "step": 8679 + }, + { + "epoch": 0.2196523015411089, + "grad_norm": 2.0935373306274414, + "learning_rate": 8.93383549166203e-06, + "loss": 0.0601, + "step": 8680 + }, + { + "epoch": 0.21967760710580256, + "grad_norm": 8.699769020080566, + "learning_rate": 8.933587639200101e-06, + "loss": 0.3132, + "step": 8681 + }, + { + "epoch": 0.21970291267049624, + "grad_norm": 7.345704555511475, + "learning_rate": 8.933339761371199e-06, + "loss": 0.2785, + "step": 8682 + }, + { + "epoch": 0.21972821823518993, + "grad_norm": 9.214067459106445, + "learning_rate": 8.933091858176919e-06, + "loss": 0.2158, + "step": 8683 + }, + { + "epoch": 0.21975352379988358, + "grad_norm": 8.936450004577637, + "learning_rate": 8.932843929618862e-06, + "loss": 0.3299, + "step": 8684 + }, + { + "epoch": 0.21977882936457727, + "grad_norm": 4.625047206878662, + "learning_rate": 8.932595975698625e-06, + "loss": 0.104, + "step": 8685 + }, + { + "epoch": 0.21980413492927095, + "grad_norm": 4.663252830505371, + "learning_rate": 8.932347996417807e-06, + "loss": 0.1737, + "step": 8686 + }, + { + "epoch": 0.21982944049396463, + "grad_norm": 8.539923667907715, + "learning_rate": 8.93209999177801e-06, + "loss": 0.2534, + "step": 8687 + }, + { + "epoch": 0.2198547460586583, + "grad_norm": 5.617743015289307, + "learning_rate": 8.931851961780832e-06, + "loss": 0.19, + "step": 8688 + }, + { + "epoch": 0.21988005162335197, + "grad_norm": 4.777050018310547, + "learning_rate": 8.93160390642787e-06, + "loss": 0.2502, + "step": 8689 + }, + { + "epoch": 0.21990535718804566, + "grad_norm": 4.870433807373047, + "learning_rate": 8.931355825720728e-06, + "loss": 0.2488, + "step": 8690 + }, + { + "epoch": 0.21993066275273931, + "grad_norm": 11.850105285644531, + "learning_rate": 8.931107719661002e-06, + "loss": 0.3033, + "step": 8691 + }, + { + "epoch": 0.219955968317433, + "grad_norm": 7.03664493560791, + "learning_rate": 8.930859588250296e-06, + "loss": 0.1977, + "step": 8692 + }, + { + "epoch": 0.21998127388212668, + "grad_norm": 5.306541442871094, + "learning_rate": 8.930611431490205e-06, + "loss": 0.1746, + "step": 8693 + }, + { + "epoch": 0.22000657944682037, + "grad_norm": 3.4210479259490967, + "learning_rate": 8.930363249382333e-06, + "loss": 0.1793, + "step": 8694 + }, + { + "epoch": 0.22003188501151402, + "grad_norm": 6.026905536651611, + "learning_rate": 8.93011504192828e-06, + "loss": 0.1723, + "step": 8695 + }, + { + "epoch": 0.2200571905762077, + "grad_norm": 6.871109962463379, + "learning_rate": 8.929866809129645e-06, + "loss": 0.2713, + "step": 8696 + }, + { + "epoch": 0.2200824961409014, + "grad_norm": 6.438204765319824, + "learning_rate": 8.92961855098803e-06, + "loss": 0.2211, + "step": 8697 + }, + { + "epoch": 0.22010780170559507, + "grad_norm": 6.505000591278076, + "learning_rate": 8.929370267505037e-06, + "loss": 0.1765, + "step": 8698 + }, + { + "epoch": 0.22013310727028873, + "grad_norm": 6.4855570793151855, + "learning_rate": 8.929121958682266e-06, + "loss": 0.1725, + "step": 8699 + }, + { + "epoch": 0.22015841283498241, + "grad_norm": 3.244206190109253, + "learning_rate": 8.928873624521317e-06, + "loss": 0.1081, + "step": 8700 + }, + { + "epoch": 0.2201837183996761, + "grad_norm": 5.912468910217285, + "learning_rate": 8.928625265023795e-06, + "loss": 0.2146, + "step": 8701 + }, + { + "epoch": 0.22020902396436975, + "grad_norm": 4.661192893981934, + "learning_rate": 8.928376880191299e-06, + "loss": 0.189, + "step": 8702 + }, + { + "epoch": 0.22023432952906344, + "grad_norm": 2.7853119373321533, + "learning_rate": 8.92812847002543e-06, + "loss": 0.1928, + "step": 8703 + }, + { + "epoch": 0.22025963509375712, + "grad_norm": 3.525231122970581, + "learning_rate": 8.927880034527792e-06, + "loss": 0.2031, + "step": 8704 + }, + { + "epoch": 0.2202849406584508, + "grad_norm": 5.835777759552002, + "learning_rate": 8.927631573699985e-06, + "loss": 0.1947, + "step": 8705 + }, + { + "epoch": 0.22031024622314446, + "grad_norm": 9.132120132446289, + "learning_rate": 8.927383087543613e-06, + "loss": 0.3022, + "step": 8706 + }, + { + "epoch": 0.22033555178783815, + "grad_norm": 9.800749778747559, + "learning_rate": 8.92713457606028e-06, + "loss": 0.3036, + "step": 8707 + }, + { + "epoch": 0.22036085735253183, + "grad_norm": 4.366161346435547, + "learning_rate": 8.926886039251584e-06, + "loss": 0.1444, + "step": 8708 + }, + { + "epoch": 0.2203861629172255, + "grad_norm": 4.003454685211182, + "learning_rate": 8.92663747711913e-06, + "loss": 0.166, + "step": 8709 + }, + { + "epoch": 0.22041146848191917, + "grad_norm": 5.242945671081543, + "learning_rate": 8.926388889664524e-06, + "loss": 0.1723, + "step": 8710 + }, + { + "epoch": 0.22043677404661285, + "grad_norm": 5.241084098815918, + "learning_rate": 8.926140276889365e-06, + "loss": 0.25, + "step": 8711 + }, + { + "epoch": 0.22046207961130654, + "grad_norm": 5.516785621643066, + "learning_rate": 8.925891638795258e-06, + "loss": 0.2238, + "step": 8712 + }, + { + "epoch": 0.2204873851760002, + "grad_norm": 4.346506595611572, + "learning_rate": 8.925642975383807e-06, + "loss": 0.1621, + "step": 8713 + }, + { + "epoch": 0.22051269074069388, + "grad_norm": 3.8152899742126465, + "learning_rate": 8.925394286656613e-06, + "loss": 0.1343, + "step": 8714 + }, + { + "epoch": 0.22053799630538756, + "grad_norm": 6.31015682220459, + "learning_rate": 8.925145572615282e-06, + "loss": 0.2139, + "step": 8715 + }, + { + "epoch": 0.22056330187008122, + "grad_norm": 5.040715217590332, + "learning_rate": 8.924896833261419e-06, + "loss": 0.2343, + "step": 8716 + }, + { + "epoch": 0.2205886074347749, + "grad_norm": 3.5372049808502197, + "learning_rate": 8.924648068596625e-06, + "loss": 0.1574, + "step": 8717 + }, + { + "epoch": 0.2206139129994686, + "grad_norm": 12.961199760437012, + "learning_rate": 8.924399278622504e-06, + "loss": 0.2708, + "step": 8718 + }, + { + "epoch": 0.22063921856416227, + "grad_norm": 14.557462692260742, + "learning_rate": 8.924150463340664e-06, + "loss": 0.2644, + "step": 8719 + }, + { + "epoch": 0.22066452412885593, + "grad_norm": 7.008004665374756, + "learning_rate": 8.923901622752706e-06, + "loss": 0.1886, + "step": 8720 + }, + { + "epoch": 0.2206898296935496, + "grad_norm": 4.73413610458374, + "learning_rate": 8.923652756860238e-06, + "loss": 0.095, + "step": 8721 + }, + { + "epoch": 0.2207151352582433, + "grad_norm": 11.768245697021484, + "learning_rate": 8.923403865664863e-06, + "loss": 0.2733, + "step": 8722 + }, + { + "epoch": 0.22074044082293695, + "grad_norm": 8.697568893432617, + "learning_rate": 8.923154949168187e-06, + "loss": 0.2428, + "step": 8723 + }, + { + "epoch": 0.22076574638763063, + "grad_norm": 4.748321056365967, + "learning_rate": 8.922906007371813e-06, + "loss": 0.1917, + "step": 8724 + }, + { + "epoch": 0.22079105195232432, + "grad_norm": 15.063642501831055, + "learning_rate": 8.92265704027735e-06, + "loss": 0.2597, + "step": 8725 + }, + { + "epoch": 0.220816357517018, + "grad_norm": 9.21662425994873, + "learning_rate": 8.9224080478864e-06, + "loss": 0.2891, + "step": 8726 + }, + { + "epoch": 0.22084166308171166, + "grad_norm": 4.135088920593262, + "learning_rate": 8.922159030200569e-06, + "loss": 0.1555, + "step": 8727 + }, + { + "epoch": 0.22086696864640534, + "grad_norm": 7.1328887939453125, + "learning_rate": 8.921909987221464e-06, + "loss": 0.1421, + "step": 8728 + }, + { + "epoch": 0.22089227421109903, + "grad_norm": 5.713837623596191, + "learning_rate": 8.921660918950692e-06, + "loss": 0.2042, + "step": 8729 + }, + { + "epoch": 0.2209175797757927, + "grad_norm": 5.826309680938721, + "learning_rate": 8.921411825389859e-06, + "loss": 0.1993, + "step": 8730 + }, + { + "epoch": 0.22094288534048637, + "grad_norm": 4.973749160766602, + "learning_rate": 8.921162706540569e-06, + "loss": 0.1359, + "step": 8731 + }, + { + "epoch": 0.22096819090518005, + "grad_norm": 10.6233549118042, + "learning_rate": 8.92091356240443e-06, + "loss": 0.3158, + "step": 8732 + }, + { + "epoch": 0.22099349646987373, + "grad_norm": 4.021721839904785, + "learning_rate": 8.92066439298305e-06, + "loss": 0.1768, + "step": 8733 + }, + { + "epoch": 0.2210188020345674, + "grad_norm": 5.1836090087890625, + "learning_rate": 8.920415198278034e-06, + "loss": 0.1887, + "step": 8734 + }, + { + "epoch": 0.22104410759926107, + "grad_norm": 40.38928985595703, + "learning_rate": 8.920165978290991e-06, + "loss": 0.2526, + "step": 8735 + }, + { + "epoch": 0.22106941316395476, + "grad_norm": 3.524470806121826, + "learning_rate": 8.919916733023527e-06, + "loss": 0.1374, + "step": 8736 + }, + { + "epoch": 0.22109471872864844, + "grad_norm": 3.800023078918457, + "learning_rate": 8.919667462477247e-06, + "loss": 0.153, + "step": 8737 + }, + { + "epoch": 0.2211200242933421, + "grad_norm": 5.829605579376221, + "learning_rate": 8.91941816665376e-06, + "loss": 0.1777, + "step": 8738 + }, + { + "epoch": 0.22114532985803578, + "grad_norm": 3.783867120742798, + "learning_rate": 8.919168845554677e-06, + "loss": 0.1285, + "step": 8739 + }, + { + "epoch": 0.22117063542272947, + "grad_norm": 6.299962520599365, + "learning_rate": 8.918919499181602e-06, + "loss": 0.1462, + "step": 8740 + }, + { + "epoch": 0.22119594098742312, + "grad_norm": 2.839280366897583, + "learning_rate": 8.918670127536144e-06, + "loss": 0.0974, + "step": 8741 + }, + { + "epoch": 0.2212212465521168, + "grad_norm": 4.598496437072754, + "learning_rate": 8.918420730619911e-06, + "loss": 0.2438, + "step": 8742 + }, + { + "epoch": 0.2212465521168105, + "grad_norm": 10.054732322692871, + "learning_rate": 8.918171308434511e-06, + "loss": 0.3122, + "step": 8743 + }, + { + "epoch": 0.22127185768150417, + "grad_norm": 3.904961347579956, + "learning_rate": 8.917921860981556e-06, + "loss": 0.1642, + "step": 8744 + }, + { + "epoch": 0.22129716324619783, + "grad_norm": 6.438225746154785, + "learning_rate": 8.917672388262648e-06, + "loss": 0.2649, + "step": 8745 + }, + { + "epoch": 0.22132246881089152, + "grad_norm": 5.037119388580322, + "learning_rate": 8.9174228902794e-06, + "loss": 0.1969, + "step": 8746 + }, + { + "epoch": 0.2213477743755852, + "grad_norm": 4.434383392333984, + "learning_rate": 8.917173367033424e-06, + "loss": 0.1876, + "step": 8747 + }, + { + "epoch": 0.22137307994027886, + "grad_norm": 13.608945846557617, + "learning_rate": 8.916923818526323e-06, + "loss": 0.1255, + "step": 8748 + }, + { + "epoch": 0.22139838550497254, + "grad_norm": 3.1446425914764404, + "learning_rate": 8.916674244759709e-06, + "loss": 0.1409, + "step": 8749 + }, + { + "epoch": 0.22142369106966622, + "grad_norm": 13.336889266967773, + "learning_rate": 8.916424645735192e-06, + "loss": 0.2169, + "step": 8750 + }, + { + "epoch": 0.2214489966343599, + "grad_norm": 3.8197641372680664, + "learning_rate": 8.91617502145438e-06, + "loss": 0.1583, + "step": 8751 + }, + { + "epoch": 0.22147430219905356, + "grad_norm": 6.708371639251709, + "learning_rate": 8.915925371918884e-06, + "loss": 0.1891, + "step": 8752 + }, + { + "epoch": 0.22149960776374725, + "grad_norm": 8.291884422302246, + "learning_rate": 8.915675697130315e-06, + "loss": 0.2361, + "step": 8753 + }, + { + "epoch": 0.22152491332844093, + "grad_norm": 8.394672393798828, + "learning_rate": 8.91542599709028e-06, + "loss": 0.1768, + "step": 8754 + }, + { + "epoch": 0.2215502188931346, + "grad_norm": 11.354536056518555, + "learning_rate": 8.915176271800393e-06, + "loss": 0.3512, + "step": 8755 + }, + { + "epoch": 0.22157552445782827, + "grad_norm": 4.407454490661621, + "learning_rate": 8.91492652126226e-06, + "loss": 0.2432, + "step": 8756 + }, + { + "epoch": 0.22160083002252196, + "grad_norm": 5.479608058929443, + "learning_rate": 8.914676745477496e-06, + "loss": 0.2201, + "step": 8757 + }, + { + "epoch": 0.22162613558721564, + "grad_norm": 4.1874189376831055, + "learning_rate": 8.91442694444771e-06, + "loss": 0.1707, + "step": 8758 + }, + { + "epoch": 0.2216514411519093, + "grad_norm": 15.237220764160156, + "learning_rate": 8.914177118174512e-06, + "loss": 0.3634, + "step": 8759 + }, + { + "epoch": 0.22167674671660298, + "grad_norm": 7.486152648925781, + "learning_rate": 8.913927266659514e-06, + "loss": 0.1598, + "step": 8760 + }, + { + "epoch": 0.22170205228129666, + "grad_norm": 4.1130194664001465, + "learning_rate": 8.913677389904325e-06, + "loss": 0.1965, + "step": 8761 + }, + { + "epoch": 0.22172735784599035, + "grad_norm": 11.048828125, + "learning_rate": 8.913427487910562e-06, + "loss": 0.1819, + "step": 8762 + }, + { + "epoch": 0.221752663410684, + "grad_norm": 4.909826278686523, + "learning_rate": 8.91317756067983e-06, + "loss": 0.2036, + "step": 8763 + }, + { + "epoch": 0.2217779689753777, + "grad_norm": 5.101602554321289, + "learning_rate": 8.912927608213745e-06, + "loss": 0.1783, + "step": 8764 + }, + { + "epoch": 0.22180327454007137, + "grad_norm": 7.973651885986328, + "learning_rate": 8.91267763051392e-06, + "loss": 0.2138, + "step": 8765 + }, + { + "epoch": 0.22182858010476503, + "grad_norm": 10.79550838470459, + "learning_rate": 8.912427627581962e-06, + "loss": 0.2753, + "step": 8766 + }, + { + "epoch": 0.2218538856694587, + "grad_norm": 4.632380485534668, + "learning_rate": 8.912177599419487e-06, + "loss": 0.1569, + "step": 8767 + }, + { + "epoch": 0.2218791912341524, + "grad_norm": 39.645111083984375, + "learning_rate": 8.911927546028108e-06, + "loss": 0.1243, + "step": 8768 + }, + { + "epoch": 0.22190449679884608, + "grad_norm": 5.926843166351318, + "learning_rate": 8.911677467409435e-06, + "loss": 0.1766, + "step": 8769 + }, + { + "epoch": 0.22192980236353974, + "grad_norm": 5.447268486022949, + "learning_rate": 8.911427363565079e-06, + "loss": 0.1891, + "step": 8770 + }, + { + "epoch": 0.22195510792823342, + "grad_norm": 6.76448917388916, + "learning_rate": 8.911177234496657e-06, + "loss": 0.2109, + "step": 8771 + }, + { + "epoch": 0.2219804134929271, + "grad_norm": 6.542407035827637, + "learning_rate": 8.910927080205782e-06, + "loss": 0.2402, + "step": 8772 + }, + { + "epoch": 0.22200571905762076, + "grad_norm": 4.460249900817871, + "learning_rate": 8.910676900694064e-06, + "loss": 0.2213, + "step": 8773 + }, + { + "epoch": 0.22203102462231444, + "grad_norm": 5.274564743041992, + "learning_rate": 8.91042669596312e-06, + "loss": 0.2301, + "step": 8774 + }, + { + "epoch": 0.22205633018700813, + "grad_norm": 6.680665493011475, + "learning_rate": 8.91017646601456e-06, + "loss": 0.2173, + "step": 8775 + }, + { + "epoch": 0.2220816357517018, + "grad_norm": 4.339986801147461, + "learning_rate": 8.90992621085e-06, + "loss": 0.1724, + "step": 8776 + }, + { + "epoch": 0.22210694131639547, + "grad_norm": 7.149774074554443, + "learning_rate": 8.909675930471053e-06, + "loss": 0.1668, + "step": 8777 + }, + { + "epoch": 0.22213224688108915, + "grad_norm": 3.6091294288635254, + "learning_rate": 8.909425624879332e-06, + "loss": 0.1811, + "step": 8778 + }, + { + "epoch": 0.22215755244578284, + "grad_norm": 7.802359580993652, + "learning_rate": 8.909175294076454e-06, + "loss": 0.2898, + "step": 8779 + }, + { + "epoch": 0.2221828580104765, + "grad_norm": 3.7014851570129395, + "learning_rate": 8.908924938064031e-06, + "loss": 0.2375, + "step": 8780 + }, + { + "epoch": 0.22220816357517018, + "grad_norm": 5.141650199890137, + "learning_rate": 8.908674556843678e-06, + "loss": 0.1676, + "step": 8781 + }, + { + "epoch": 0.22223346913986386, + "grad_norm": 4.782454967498779, + "learning_rate": 8.90842415041701e-06, + "loss": 0.1929, + "step": 8782 + }, + { + "epoch": 0.22225877470455754, + "grad_norm": 5.031164646148682, + "learning_rate": 8.908173718785642e-06, + "loss": 0.2205, + "step": 8783 + }, + { + "epoch": 0.2222840802692512, + "grad_norm": 5.361345291137695, + "learning_rate": 8.90792326195119e-06, + "loss": 0.2021, + "step": 8784 + }, + { + "epoch": 0.22230938583394488, + "grad_norm": 2.616190195083618, + "learning_rate": 8.907672779915266e-06, + "loss": 0.0891, + "step": 8785 + }, + { + "epoch": 0.22233469139863857, + "grad_norm": 7.008584022521973, + "learning_rate": 8.907422272679486e-06, + "loss": 0.2135, + "step": 8786 + }, + { + "epoch": 0.22235999696333222, + "grad_norm": 2.7698307037353516, + "learning_rate": 8.90717174024547e-06, + "loss": 0.1389, + "step": 8787 + }, + { + "epoch": 0.2223853025280259, + "grad_norm": 15.147929191589355, + "learning_rate": 8.906921182614827e-06, + "loss": 0.3395, + "step": 8788 + }, + { + "epoch": 0.2224106080927196, + "grad_norm": 7.835666179656982, + "learning_rate": 8.906670599789178e-06, + "loss": 0.2036, + "step": 8789 + }, + { + "epoch": 0.22243591365741328, + "grad_norm": 6.596282482147217, + "learning_rate": 8.906419991770136e-06, + "loss": 0.1797, + "step": 8790 + }, + { + "epoch": 0.22246121922210693, + "grad_norm": 2.859818696975708, + "learning_rate": 8.906169358559319e-06, + "loss": 0.07, + "step": 8791 + }, + { + "epoch": 0.22248652478680062, + "grad_norm": 10.862080574035645, + "learning_rate": 8.905918700158342e-06, + "loss": 0.2007, + "step": 8792 + }, + { + "epoch": 0.2225118303514943, + "grad_norm": 8.098214149475098, + "learning_rate": 8.905668016568821e-06, + "loss": 0.2877, + "step": 8793 + }, + { + "epoch": 0.22253713591618796, + "grad_norm": 7.650639057159424, + "learning_rate": 8.905417307792375e-06, + "loss": 0.2422, + "step": 8794 + }, + { + "epoch": 0.22256244148088164, + "grad_norm": 8.28975772857666, + "learning_rate": 8.905166573830616e-06, + "loss": 0.1977, + "step": 8795 + }, + { + "epoch": 0.22258774704557532, + "grad_norm": 2.3676273822784424, + "learning_rate": 8.904915814685168e-06, + "loss": 0.1189, + "step": 8796 + }, + { + "epoch": 0.222613052610269, + "grad_norm": 10.5542573928833, + "learning_rate": 8.904665030357644e-06, + "loss": 0.2333, + "step": 8797 + }, + { + "epoch": 0.22263835817496266, + "grad_norm": 4.3771185874938965, + "learning_rate": 8.90441422084966e-06, + "loss": 0.1831, + "step": 8798 + }, + { + "epoch": 0.22266366373965635, + "grad_norm": 5.4397735595703125, + "learning_rate": 8.904163386162836e-06, + "loss": 0.1758, + "step": 8799 + }, + { + "epoch": 0.22268896930435003, + "grad_norm": 5.695253849029541, + "learning_rate": 8.903912526298788e-06, + "loss": 0.1425, + "step": 8800 + }, + { + "epoch": 0.22271427486904372, + "grad_norm": 6.2024688720703125, + "learning_rate": 8.903661641259133e-06, + "loss": 0.184, + "step": 8801 + }, + { + "epoch": 0.22273958043373737, + "grad_norm": 4.201321125030518, + "learning_rate": 8.903410731045493e-06, + "loss": 0.1626, + "step": 8802 + }, + { + "epoch": 0.22276488599843106, + "grad_norm": 13.083808898925781, + "learning_rate": 8.903159795659482e-06, + "loss": 0.2665, + "step": 8803 + }, + { + "epoch": 0.22279019156312474, + "grad_norm": 3.3372223377227783, + "learning_rate": 8.902908835102718e-06, + "loss": 0.1516, + "step": 8804 + }, + { + "epoch": 0.2228154971278184, + "grad_norm": 6.316431045532227, + "learning_rate": 8.902657849376822e-06, + "loss": 0.229, + "step": 8805 + }, + { + "epoch": 0.22284080269251208, + "grad_norm": 7.8767619132995605, + "learning_rate": 8.902406838483413e-06, + "loss": 0.1532, + "step": 8806 + }, + { + "epoch": 0.22286610825720576, + "grad_norm": 5.086261749267578, + "learning_rate": 8.902155802424106e-06, + "loss": 0.1457, + "step": 8807 + }, + { + "epoch": 0.22289141382189945, + "grad_norm": 5.812704086303711, + "learning_rate": 8.901904741200524e-06, + "loss": 0.1548, + "step": 8808 + }, + { + "epoch": 0.2229167193865931, + "grad_norm": 5.542610168457031, + "learning_rate": 8.901653654814281e-06, + "loss": 0.1352, + "step": 8809 + }, + { + "epoch": 0.2229420249512868, + "grad_norm": 2.8159313201904297, + "learning_rate": 8.901402543267003e-06, + "loss": 0.0705, + "step": 8810 + }, + { + "epoch": 0.22296733051598047, + "grad_norm": 5.106937885284424, + "learning_rate": 8.901151406560305e-06, + "loss": 0.0842, + "step": 8811 + }, + { + "epoch": 0.22299263608067413, + "grad_norm": 5.372989654541016, + "learning_rate": 8.900900244695806e-06, + "loss": 0.2123, + "step": 8812 + }, + { + "epoch": 0.2230179416453678, + "grad_norm": 4.490477085113525, + "learning_rate": 8.900649057675127e-06, + "loss": 0.203, + "step": 8813 + }, + { + "epoch": 0.2230432472100615, + "grad_norm": 6.5395379066467285, + "learning_rate": 8.90039784549989e-06, + "loss": 0.1586, + "step": 8814 + }, + { + "epoch": 0.22306855277475518, + "grad_norm": 5.774198055267334, + "learning_rate": 8.90014660817171e-06, + "loss": 0.1549, + "step": 8815 + }, + { + "epoch": 0.22309385833944884, + "grad_norm": 6.796389102935791, + "learning_rate": 8.89989534569221e-06, + "loss": 0.1566, + "step": 8816 + }, + { + "epoch": 0.22311916390414252, + "grad_norm": 6.1573381423950195, + "learning_rate": 8.89964405806301e-06, + "loss": 0.1648, + "step": 8817 + }, + { + "epoch": 0.2231444694688362, + "grad_norm": 4.281301498413086, + "learning_rate": 8.899392745285733e-06, + "loss": 0.1446, + "step": 8818 + }, + { + "epoch": 0.22316977503352986, + "grad_norm": 4.550563812255859, + "learning_rate": 8.899141407361998e-06, + "loss": 0.2121, + "step": 8819 + }, + { + "epoch": 0.22319508059822354, + "grad_norm": 7.908170700073242, + "learning_rate": 8.898890044293423e-06, + "loss": 0.2094, + "step": 8820 + }, + { + "epoch": 0.22322038616291723, + "grad_norm": 6.451504230499268, + "learning_rate": 8.898638656081633e-06, + "loss": 0.2081, + "step": 8821 + }, + { + "epoch": 0.2232456917276109, + "grad_norm": 13.309481620788574, + "learning_rate": 8.898387242728247e-06, + "loss": 0.3962, + "step": 8822 + }, + { + "epoch": 0.22327099729230457, + "grad_norm": 9.671228408813477, + "learning_rate": 8.898135804234886e-06, + "loss": 0.3017, + "step": 8823 + }, + { + "epoch": 0.22329630285699825, + "grad_norm": 3.402340888977051, + "learning_rate": 8.897884340603174e-06, + "loss": 0.1607, + "step": 8824 + }, + { + "epoch": 0.22332160842169194, + "grad_norm": 3.660799026489258, + "learning_rate": 8.89763285183473e-06, + "loss": 0.1715, + "step": 8825 + }, + { + "epoch": 0.2233469139863856, + "grad_norm": 3.801076889038086, + "learning_rate": 8.897381337931177e-06, + "loss": 0.1609, + "step": 8826 + }, + { + "epoch": 0.22337221955107928, + "grad_norm": 5.268182277679443, + "learning_rate": 8.897129798894136e-06, + "loss": 0.257, + "step": 8827 + }, + { + "epoch": 0.22339752511577296, + "grad_norm": 8.046890258789062, + "learning_rate": 8.89687823472523e-06, + "loss": 0.1736, + "step": 8828 + }, + { + "epoch": 0.22342283068046664, + "grad_norm": 12.909693717956543, + "learning_rate": 8.896626645426083e-06, + "loss": 0.3355, + "step": 8829 + }, + { + "epoch": 0.2234481362451603, + "grad_norm": 8.376177787780762, + "learning_rate": 8.896375030998313e-06, + "loss": 0.2501, + "step": 8830 + }, + { + "epoch": 0.22347344180985398, + "grad_norm": 7.327095031738281, + "learning_rate": 8.896123391443548e-06, + "loss": 0.2522, + "step": 8831 + }, + { + "epoch": 0.22349874737454767, + "grad_norm": 13.118626594543457, + "learning_rate": 8.895871726763406e-06, + "loss": 0.1873, + "step": 8832 + }, + { + "epoch": 0.22352405293924135, + "grad_norm": 4.091648578643799, + "learning_rate": 8.895620036959512e-06, + "loss": 0.193, + "step": 8833 + }, + { + "epoch": 0.223549358503935, + "grad_norm": 5.349668502807617, + "learning_rate": 8.895368322033489e-06, + "loss": 0.1599, + "step": 8834 + }, + { + "epoch": 0.2235746640686287, + "grad_norm": 5.499974250793457, + "learning_rate": 8.89511658198696e-06, + "loss": 0.2011, + "step": 8835 + }, + { + "epoch": 0.22359996963332238, + "grad_norm": 8.049606323242188, + "learning_rate": 8.894864816821549e-06, + "loss": 0.1831, + "step": 8836 + }, + { + "epoch": 0.22362527519801603, + "grad_norm": 20.821170806884766, + "learning_rate": 8.894613026538879e-06, + "loss": 0.278, + "step": 8837 + }, + { + "epoch": 0.22365058076270972, + "grad_norm": 9.413115501403809, + "learning_rate": 8.894361211140576e-06, + "loss": 0.2083, + "step": 8838 + }, + { + "epoch": 0.2236758863274034, + "grad_norm": 4.965612888336182, + "learning_rate": 8.89410937062826e-06, + "loss": 0.1965, + "step": 8839 + }, + { + "epoch": 0.22370119189209708, + "grad_norm": 8.155631065368652, + "learning_rate": 8.893857505003558e-06, + "loss": 0.2864, + "step": 8840 + }, + { + "epoch": 0.22372649745679074, + "grad_norm": 5.148904323577881, + "learning_rate": 8.893605614268093e-06, + "loss": 0.1654, + "step": 8841 + }, + { + "epoch": 0.22375180302148442, + "grad_norm": 11.486411094665527, + "learning_rate": 8.893353698423488e-06, + "loss": 0.3007, + "step": 8842 + }, + { + "epoch": 0.2237771085861781, + "grad_norm": 5.170057773590088, + "learning_rate": 8.893101757471372e-06, + "loss": 0.19, + "step": 8843 + }, + { + "epoch": 0.22380241415087176, + "grad_norm": 4.897408962249756, + "learning_rate": 8.892849791413365e-06, + "loss": 0.2992, + "step": 8844 + }, + { + "epoch": 0.22382771971556545, + "grad_norm": 3.7088234424591064, + "learning_rate": 8.892597800251094e-06, + "loss": 0.1869, + "step": 8845 + }, + { + "epoch": 0.22385302528025913, + "grad_norm": 3.691671371459961, + "learning_rate": 8.892345783986185e-06, + "loss": 0.1374, + "step": 8846 + }, + { + "epoch": 0.22387833084495282, + "grad_norm": 4.189391613006592, + "learning_rate": 8.892093742620261e-06, + "loss": 0.143, + "step": 8847 + }, + { + "epoch": 0.22390363640964647, + "grad_norm": 5.4666242599487305, + "learning_rate": 8.891841676154949e-06, + "loss": 0.1929, + "step": 8848 + }, + { + "epoch": 0.22392894197434016, + "grad_norm": 11.556304931640625, + "learning_rate": 8.891589584591874e-06, + "loss": 0.4082, + "step": 8849 + }, + { + "epoch": 0.22395424753903384, + "grad_norm": 7.777013301849365, + "learning_rate": 8.89133746793266e-06, + "loss": 0.2648, + "step": 8850 + }, + { + "epoch": 0.2239795531037275, + "grad_norm": 9.835602760314941, + "learning_rate": 8.891085326178938e-06, + "loss": 0.2895, + "step": 8851 + }, + { + "epoch": 0.22400485866842118, + "grad_norm": 3.8870596885681152, + "learning_rate": 8.890833159332328e-06, + "loss": 0.1911, + "step": 8852 + }, + { + "epoch": 0.22403016423311486, + "grad_norm": 7.887063503265381, + "learning_rate": 8.89058096739446e-06, + "loss": 0.256, + "step": 8853 + }, + { + "epoch": 0.22405546979780855, + "grad_norm": 10.532811164855957, + "learning_rate": 8.890328750366957e-06, + "loss": 0.3294, + "step": 8854 + }, + { + "epoch": 0.2240807753625022, + "grad_norm": 5.378879547119141, + "learning_rate": 8.89007650825145e-06, + "loss": 0.1387, + "step": 8855 + }, + { + "epoch": 0.2241060809271959, + "grad_norm": 5.852137088775635, + "learning_rate": 8.889824241049561e-06, + "loss": 0.1717, + "step": 8856 + }, + { + "epoch": 0.22413138649188957, + "grad_norm": 4.035078525543213, + "learning_rate": 8.88957194876292e-06, + "loss": 0.1738, + "step": 8857 + }, + { + "epoch": 0.22415669205658323, + "grad_norm": 6.306805610656738, + "learning_rate": 8.889319631393153e-06, + "loss": 0.178, + "step": 8858 + }, + { + "epoch": 0.2241819976212769, + "grad_norm": 7.200897693634033, + "learning_rate": 8.889067288941889e-06, + "loss": 0.3201, + "step": 8859 + }, + { + "epoch": 0.2242073031859706, + "grad_norm": 9.444605827331543, + "learning_rate": 8.888814921410751e-06, + "loss": 0.277, + "step": 8860 + }, + { + "epoch": 0.22423260875066428, + "grad_norm": 5.662010192871094, + "learning_rate": 8.888562528801369e-06, + "loss": 0.1407, + "step": 8861 + }, + { + "epoch": 0.22425791431535794, + "grad_norm": 6.889223098754883, + "learning_rate": 8.888310111115371e-06, + "loss": 0.2817, + "step": 8862 + }, + { + "epoch": 0.22428321988005162, + "grad_norm": 7.113540172576904, + "learning_rate": 8.888057668354386e-06, + "loss": 0.3176, + "step": 8863 + }, + { + "epoch": 0.2243085254447453, + "grad_norm": 4.7414164543151855, + "learning_rate": 8.887805200520038e-06, + "loss": 0.2106, + "step": 8864 + }, + { + "epoch": 0.224333831009439, + "grad_norm": 3.988520622253418, + "learning_rate": 8.887552707613958e-06, + "loss": 0.1716, + "step": 8865 + }, + { + "epoch": 0.22435913657413264, + "grad_norm": 5.390191555023193, + "learning_rate": 8.887300189637774e-06, + "loss": 0.131, + "step": 8866 + }, + { + "epoch": 0.22438444213882633, + "grad_norm": 9.791234970092773, + "learning_rate": 8.887047646593114e-06, + "loss": 0.234, + "step": 8867 + }, + { + "epoch": 0.22440974770352, + "grad_norm": 5.106468200683594, + "learning_rate": 8.886795078481606e-06, + "loss": 0.2414, + "step": 8868 + }, + { + "epoch": 0.22443505326821367, + "grad_norm": 8.934859275817871, + "learning_rate": 8.886542485304881e-06, + "loss": 0.2841, + "step": 8869 + }, + { + "epoch": 0.22446035883290735, + "grad_norm": 5.3846845626831055, + "learning_rate": 8.886289867064568e-06, + "loss": 0.1982, + "step": 8870 + }, + { + "epoch": 0.22448566439760104, + "grad_norm": 3.8942668437957764, + "learning_rate": 8.886037223762292e-06, + "loss": 0.2182, + "step": 8871 + }, + { + "epoch": 0.22451096996229472, + "grad_norm": 5.857364654541016, + "learning_rate": 8.885784555399687e-06, + "loss": 0.219, + "step": 8872 + }, + { + "epoch": 0.22453627552698838, + "grad_norm": 2.96408748626709, + "learning_rate": 8.885531861978377e-06, + "loss": 0.1924, + "step": 8873 + }, + { + "epoch": 0.22456158109168206, + "grad_norm": 4.653412342071533, + "learning_rate": 8.885279143499996e-06, + "loss": 0.1941, + "step": 8874 + }, + { + "epoch": 0.22458688665637574, + "grad_norm": 8.843084335327148, + "learning_rate": 8.885026399966174e-06, + "loss": 0.291, + "step": 8875 + }, + { + "epoch": 0.2246121922210694, + "grad_norm": 12.844362258911133, + "learning_rate": 8.884773631378538e-06, + "loss": 0.2714, + "step": 8876 + }, + { + "epoch": 0.22463749778576309, + "grad_norm": 7.477929592132568, + "learning_rate": 8.88452083773872e-06, + "loss": 0.3296, + "step": 8877 + }, + { + "epoch": 0.22466280335045677, + "grad_norm": 3.6030404567718506, + "learning_rate": 8.88426801904835e-06, + "loss": 0.188, + "step": 8878 + }, + { + "epoch": 0.22468810891515045, + "grad_norm": 10.759100914001465, + "learning_rate": 8.884015175309057e-06, + "loss": 0.3473, + "step": 8879 + }, + { + "epoch": 0.2247134144798441, + "grad_norm": 3.5526955127716064, + "learning_rate": 8.883762306522473e-06, + "loss": 0.1411, + "step": 8880 + }, + { + "epoch": 0.2247387200445378, + "grad_norm": 3.9403750896453857, + "learning_rate": 8.883509412690227e-06, + "loss": 0.1608, + "step": 8881 + }, + { + "epoch": 0.22476402560923148, + "grad_norm": 5.129638671875, + "learning_rate": 8.883256493813954e-06, + "loss": 0.1598, + "step": 8882 + }, + { + "epoch": 0.22478933117392513, + "grad_norm": 17.85686683654785, + "learning_rate": 8.883003549895281e-06, + "loss": 0.2643, + "step": 8883 + }, + { + "epoch": 0.22481463673861882, + "grad_norm": 4.413265228271484, + "learning_rate": 8.88275058093584e-06, + "loss": 0.2894, + "step": 8884 + }, + { + "epoch": 0.2248399423033125, + "grad_norm": 5.6879682540893555, + "learning_rate": 8.882497586937264e-06, + "loss": 0.1749, + "step": 8885 + }, + { + "epoch": 0.22486524786800619, + "grad_norm": 3.186241865158081, + "learning_rate": 8.882244567901181e-06, + "loss": 0.2053, + "step": 8886 + }, + { + "epoch": 0.22489055343269984, + "grad_norm": 4.422640800476074, + "learning_rate": 8.881991523829228e-06, + "loss": 0.1591, + "step": 8887 + }, + { + "epoch": 0.22491585899739353, + "grad_norm": 3.8294684886932373, + "learning_rate": 8.881738454723032e-06, + "loss": 0.203, + "step": 8888 + }, + { + "epoch": 0.2249411645620872, + "grad_norm": 4.904381275177002, + "learning_rate": 8.881485360584225e-06, + "loss": 0.2136, + "step": 8889 + }, + { + "epoch": 0.22496647012678087, + "grad_norm": 4.559602737426758, + "learning_rate": 8.881232241414444e-06, + "loss": 0.1883, + "step": 8890 + }, + { + "epoch": 0.22499177569147455, + "grad_norm": 4.435028553009033, + "learning_rate": 8.880979097215316e-06, + "loss": 0.1922, + "step": 8891 + }, + { + "epoch": 0.22501708125616823, + "grad_norm": 4.764965534210205, + "learning_rate": 8.880725927988478e-06, + "loss": 0.1685, + "step": 8892 + }, + { + "epoch": 0.22504238682086192, + "grad_norm": 12.660016059875488, + "learning_rate": 8.88047273373556e-06, + "loss": 0.2765, + "step": 8893 + }, + { + "epoch": 0.22506769238555557, + "grad_norm": 7.261476516723633, + "learning_rate": 8.880219514458192e-06, + "loss": 0.1803, + "step": 8894 + }, + { + "epoch": 0.22509299795024926, + "grad_norm": 6.21509313583374, + "learning_rate": 8.879966270158014e-06, + "loss": 0.2373, + "step": 8895 + }, + { + "epoch": 0.22511830351494294, + "grad_norm": 5.127520561218262, + "learning_rate": 8.879713000836653e-06, + "loss": 0.1559, + "step": 8896 + }, + { + "epoch": 0.22514360907963663, + "grad_norm": 5.992597579956055, + "learning_rate": 8.879459706495746e-06, + "loss": 0.1796, + "step": 8897 + }, + { + "epoch": 0.22516891464433028, + "grad_norm": 6.3721113204956055, + "learning_rate": 8.879206387136923e-06, + "loss": 0.1958, + "step": 8898 + }, + { + "epoch": 0.22519422020902397, + "grad_norm": 4.199694633483887, + "learning_rate": 8.87895304276182e-06, + "loss": 0.1983, + "step": 8899 + }, + { + "epoch": 0.22521952577371765, + "grad_norm": 9.287208557128906, + "learning_rate": 8.878699673372071e-06, + "loss": 0.1958, + "step": 8900 + }, + { + "epoch": 0.2252448313384113, + "grad_norm": 33.73911666870117, + "learning_rate": 8.87844627896931e-06, + "loss": 0.4065, + "step": 8901 + }, + { + "epoch": 0.225270136903105, + "grad_norm": 5.8023858070373535, + "learning_rate": 8.87819285955517e-06, + "loss": 0.2139, + "step": 8902 + }, + { + "epoch": 0.22529544246779867, + "grad_norm": 8.127710342407227, + "learning_rate": 8.877939415131286e-06, + "loss": 0.16, + "step": 8903 + }, + { + "epoch": 0.22532074803249236, + "grad_norm": 5.231276988983154, + "learning_rate": 8.87768594569929e-06, + "loss": 0.1989, + "step": 8904 + }, + { + "epoch": 0.225346053597186, + "grad_norm": 4.830030918121338, + "learning_rate": 8.877432451260821e-06, + "loss": 0.1485, + "step": 8905 + }, + { + "epoch": 0.2253713591618797, + "grad_norm": 4.06162166595459, + "learning_rate": 8.877178931817512e-06, + "loss": 0.2144, + "step": 8906 + }, + { + "epoch": 0.22539666472657338, + "grad_norm": 4.150369167327881, + "learning_rate": 8.876925387370994e-06, + "loss": 0.0976, + "step": 8907 + }, + { + "epoch": 0.22542197029126704, + "grad_norm": 2.7940168380737305, + "learning_rate": 8.876671817922908e-06, + "loss": 0.1286, + "step": 8908 + }, + { + "epoch": 0.22544727585596072, + "grad_norm": 3.9611494541168213, + "learning_rate": 8.876418223474886e-06, + "loss": 0.2035, + "step": 8909 + }, + { + "epoch": 0.2254725814206544, + "grad_norm": 5.858065128326416, + "learning_rate": 8.876164604028563e-06, + "loss": 0.2424, + "step": 8910 + }, + { + "epoch": 0.2254978869853481, + "grad_norm": 6.786761283874512, + "learning_rate": 8.875910959585576e-06, + "loss": 0.2132, + "step": 8911 + }, + { + "epoch": 0.22552319255004175, + "grad_norm": 4.062539577484131, + "learning_rate": 8.875657290147561e-06, + "loss": 0.1398, + "step": 8912 + }, + { + "epoch": 0.22554849811473543, + "grad_norm": 14.885895729064941, + "learning_rate": 8.875403595716152e-06, + "loss": 0.2442, + "step": 8913 + }, + { + "epoch": 0.2255738036794291, + "grad_norm": 12.443146705627441, + "learning_rate": 8.875149876292987e-06, + "loss": 0.3924, + "step": 8914 + }, + { + "epoch": 0.22559910924412277, + "grad_norm": 4.241669178009033, + "learning_rate": 8.8748961318797e-06, + "loss": 0.2014, + "step": 8915 + }, + { + "epoch": 0.22562441480881645, + "grad_norm": 6.813391208648682, + "learning_rate": 8.87464236247793e-06, + "loss": 0.2234, + "step": 8916 + }, + { + "epoch": 0.22564972037351014, + "grad_norm": 2.3871994018554688, + "learning_rate": 8.874388568089312e-06, + "loss": 0.1132, + "step": 8917 + }, + { + "epoch": 0.22567502593820382, + "grad_norm": 4.556662082672119, + "learning_rate": 8.874134748715482e-06, + "loss": 0.1735, + "step": 8918 + }, + { + "epoch": 0.22570033150289748, + "grad_norm": 3.765167236328125, + "learning_rate": 8.873880904358079e-06, + "loss": 0.181, + "step": 8919 + }, + { + "epoch": 0.22572563706759116, + "grad_norm": 4.1664838790893555, + "learning_rate": 8.873627035018737e-06, + "loss": 0.2192, + "step": 8920 + }, + { + "epoch": 0.22575094263228485, + "grad_norm": 5.3850531578063965, + "learning_rate": 8.873373140699096e-06, + "loss": 0.2439, + "step": 8921 + }, + { + "epoch": 0.2257762481969785, + "grad_norm": 7.864750385284424, + "learning_rate": 8.87311922140079e-06, + "loss": 0.2183, + "step": 8922 + }, + { + "epoch": 0.22580155376167219, + "grad_norm": 6.85085916519165, + "learning_rate": 8.87286527712546e-06, + "loss": 0.207, + "step": 8923 + }, + { + "epoch": 0.22582685932636587, + "grad_norm": 4.003817558288574, + "learning_rate": 8.872611307874744e-06, + "loss": 0.1913, + "step": 8924 + }, + { + "epoch": 0.22585216489105955, + "grad_norm": 8.65869140625, + "learning_rate": 8.872357313650275e-06, + "loss": 0.1694, + "step": 8925 + }, + { + "epoch": 0.2258774704557532, + "grad_norm": 5.342836380004883, + "learning_rate": 8.872103294453695e-06, + "loss": 0.2236, + "step": 8926 + }, + { + "epoch": 0.2259027760204469, + "grad_norm": 5.250886917114258, + "learning_rate": 8.871849250286641e-06, + "loss": 0.2145, + "step": 8927 + }, + { + "epoch": 0.22592808158514058, + "grad_norm": 3.962881088256836, + "learning_rate": 8.871595181150753e-06, + "loss": 0.1345, + "step": 8928 + }, + { + "epoch": 0.22595338714983426, + "grad_norm": 5.997687816619873, + "learning_rate": 8.871341087047667e-06, + "loss": 0.199, + "step": 8929 + }, + { + "epoch": 0.22597869271452792, + "grad_norm": 8.136863708496094, + "learning_rate": 8.871086967979022e-06, + "loss": 0.3279, + "step": 8930 + }, + { + "epoch": 0.2260039982792216, + "grad_norm": 4.348442554473877, + "learning_rate": 8.870832823946459e-06, + "loss": 0.1357, + "step": 8931 + }, + { + "epoch": 0.22602930384391529, + "grad_norm": 7.705530643463135, + "learning_rate": 8.870578654951612e-06, + "loss": 0.1797, + "step": 8932 + }, + { + "epoch": 0.22605460940860894, + "grad_norm": 7.506080150604248, + "learning_rate": 8.870324460996124e-06, + "loss": 0.307, + "step": 8933 + }, + { + "epoch": 0.22607991497330263, + "grad_norm": 3.720966339111328, + "learning_rate": 8.870070242081636e-06, + "loss": 0.1473, + "step": 8934 + }, + { + "epoch": 0.2261052205379963, + "grad_norm": 6.4813971519470215, + "learning_rate": 8.869815998209784e-06, + "loss": 0.255, + "step": 8935 + }, + { + "epoch": 0.22613052610269, + "grad_norm": 5.299229145050049, + "learning_rate": 8.869561729382206e-06, + "loss": 0.1651, + "step": 8936 + }, + { + "epoch": 0.22615583166738365, + "grad_norm": 4.850813388824463, + "learning_rate": 8.869307435600546e-06, + "loss": 0.2237, + "step": 8937 + }, + { + "epoch": 0.22618113723207733, + "grad_norm": 3.6329545974731445, + "learning_rate": 8.869053116866443e-06, + "loss": 0.1771, + "step": 8938 + }, + { + "epoch": 0.22620644279677102, + "grad_norm": 8.701059341430664, + "learning_rate": 8.868798773181535e-06, + "loss": 0.1761, + "step": 8939 + }, + { + "epoch": 0.22623174836146467, + "grad_norm": 5.066177845001221, + "learning_rate": 8.868544404547464e-06, + "loss": 0.1705, + "step": 8940 + }, + { + "epoch": 0.22625705392615836, + "grad_norm": 4.271900177001953, + "learning_rate": 8.868290010965869e-06, + "loss": 0.1815, + "step": 8941 + }, + { + "epoch": 0.22628235949085204, + "grad_norm": 4.42783260345459, + "learning_rate": 8.86803559243839e-06, + "loss": 0.2151, + "step": 8942 + }, + { + "epoch": 0.22630766505554573, + "grad_norm": 4.268255710601807, + "learning_rate": 8.867781148966673e-06, + "loss": 0.1923, + "step": 8943 + }, + { + "epoch": 0.22633297062023938, + "grad_norm": 3.4418697357177734, + "learning_rate": 8.867526680552352e-06, + "loss": 0.1602, + "step": 8944 + }, + { + "epoch": 0.22635827618493307, + "grad_norm": 4.770813465118408, + "learning_rate": 8.86727218719707e-06, + "loss": 0.1984, + "step": 8945 + }, + { + "epoch": 0.22638358174962675, + "grad_norm": 9.422350883483887, + "learning_rate": 8.86701766890247e-06, + "loss": 0.2555, + "step": 8946 + }, + { + "epoch": 0.2264088873143204, + "grad_norm": 6.551853656768799, + "learning_rate": 8.866763125670194e-06, + "loss": 0.1233, + "step": 8947 + }, + { + "epoch": 0.2264341928790141, + "grad_norm": 5.764827728271484, + "learning_rate": 8.86650855750188e-06, + "loss": 0.183, + "step": 8948 + }, + { + "epoch": 0.22645949844370777, + "grad_norm": 12.243721008300781, + "learning_rate": 8.866253964399172e-06, + "loss": 0.241, + "step": 8949 + }, + { + "epoch": 0.22648480400840146, + "grad_norm": 11.899360656738281, + "learning_rate": 8.865999346363712e-06, + "loss": 0.2502, + "step": 8950 + }, + { + "epoch": 0.22651010957309511, + "grad_norm": 3.994112730026245, + "learning_rate": 8.86574470339714e-06, + "loss": 0.1538, + "step": 8951 + }, + { + "epoch": 0.2265354151377888, + "grad_norm": 18.523529052734375, + "learning_rate": 8.8654900355011e-06, + "loss": 0.2631, + "step": 8952 + }, + { + "epoch": 0.22656072070248248, + "grad_norm": 4.911791801452637, + "learning_rate": 8.865235342677235e-06, + "loss": 0.1628, + "step": 8953 + }, + { + "epoch": 0.22658602626717614, + "grad_norm": 14.400473594665527, + "learning_rate": 8.864980624927184e-06, + "loss": 0.1566, + "step": 8954 + }, + { + "epoch": 0.22661133183186982, + "grad_norm": 11.906131744384766, + "learning_rate": 8.864725882252594e-06, + "loss": 0.2479, + "step": 8955 + }, + { + "epoch": 0.2266366373965635, + "grad_norm": 13.584700584411621, + "learning_rate": 8.864471114655105e-06, + "loss": 0.45, + "step": 8956 + }, + { + "epoch": 0.2266619429612572, + "grad_norm": 3.515805244445801, + "learning_rate": 8.864216322136362e-06, + "loss": 0.1143, + "step": 8957 + }, + { + "epoch": 0.22668724852595085, + "grad_norm": 10.052229881286621, + "learning_rate": 8.863961504698004e-06, + "loss": 0.2753, + "step": 8958 + }, + { + "epoch": 0.22671255409064453, + "grad_norm": 10.769420623779297, + "learning_rate": 8.863706662341678e-06, + "loss": 0.2184, + "step": 8959 + }, + { + "epoch": 0.22673785965533821, + "grad_norm": 6.392117023468018, + "learning_rate": 8.863451795069026e-06, + "loss": 0.1873, + "step": 8960 + }, + { + "epoch": 0.2267631652200319, + "grad_norm": 17.656463623046875, + "learning_rate": 8.863196902881693e-06, + "loss": 0.2101, + "step": 8961 + }, + { + "epoch": 0.22678847078472555, + "grad_norm": 4.989457607269287, + "learning_rate": 8.862941985781321e-06, + "loss": 0.2074, + "step": 8962 + }, + { + "epoch": 0.22681377634941924, + "grad_norm": 3.6943325996398926, + "learning_rate": 8.862687043769555e-06, + "loss": 0.1276, + "step": 8963 + }, + { + "epoch": 0.22683908191411292, + "grad_norm": 5.046115875244141, + "learning_rate": 8.86243207684804e-06, + "loss": 0.2178, + "step": 8964 + }, + { + "epoch": 0.22686438747880658, + "grad_norm": 9.068890571594238, + "learning_rate": 8.862177085018418e-06, + "loss": 0.2932, + "step": 8965 + }, + { + "epoch": 0.22688969304350026, + "grad_norm": 3.2273976802825928, + "learning_rate": 8.861922068282334e-06, + "loss": 0.1749, + "step": 8966 + }, + { + "epoch": 0.22691499860819395, + "grad_norm": 4.891936302185059, + "learning_rate": 8.861667026641434e-06, + "loss": 0.2144, + "step": 8967 + }, + { + "epoch": 0.22694030417288763, + "grad_norm": 2.359856605529785, + "learning_rate": 8.861411960097361e-06, + "loss": 0.0855, + "step": 8968 + }, + { + "epoch": 0.2269656097375813, + "grad_norm": 5.595662593841553, + "learning_rate": 8.86115686865176e-06, + "loss": 0.2031, + "step": 8969 + }, + { + "epoch": 0.22699091530227497, + "grad_norm": 6.068289279937744, + "learning_rate": 8.860901752306278e-06, + "loss": 0.1887, + "step": 8970 + }, + { + "epoch": 0.22701622086696865, + "grad_norm": 4.706778526306152, + "learning_rate": 8.860646611062559e-06, + "loss": 0.1078, + "step": 8971 + }, + { + "epoch": 0.2270415264316623, + "grad_norm": 7.2127580642700195, + "learning_rate": 8.860391444922244e-06, + "loss": 0.2764, + "step": 8972 + }, + { + "epoch": 0.227066831996356, + "grad_norm": 3.6702499389648438, + "learning_rate": 8.860136253886986e-06, + "loss": 0.0846, + "step": 8973 + }, + { + "epoch": 0.22709213756104968, + "grad_norm": 3.6247260570526123, + "learning_rate": 8.859881037958427e-06, + "loss": 0.1338, + "step": 8974 + }, + { + "epoch": 0.22711744312574336, + "grad_norm": 3.7556140422821045, + "learning_rate": 8.859625797138213e-06, + "loss": 0.1824, + "step": 8975 + }, + { + "epoch": 0.22714274869043702, + "grad_norm": 7.849806785583496, + "learning_rate": 8.859370531427992e-06, + "loss": 0.3104, + "step": 8976 + }, + { + "epoch": 0.2271680542551307, + "grad_norm": 3.252556562423706, + "learning_rate": 8.859115240829406e-06, + "loss": 0.1097, + "step": 8977 + }, + { + "epoch": 0.2271933598198244, + "grad_norm": 4.278405666351318, + "learning_rate": 8.858859925344105e-06, + "loss": 0.1163, + "step": 8978 + }, + { + "epoch": 0.22721866538451804, + "grad_norm": 4.63279390335083, + "learning_rate": 8.858604584973733e-06, + "loss": 0.1532, + "step": 8979 + }, + { + "epoch": 0.22724397094921173, + "grad_norm": 7.587115287780762, + "learning_rate": 8.858349219719939e-06, + "loss": 0.2562, + "step": 8980 + }, + { + "epoch": 0.2272692765139054, + "grad_norm": 12.405365943908691, + "learning_rate": 8.858093829584367e-06, + "loss": 0.2511, + "step": 8981 + }, + { + "epoch": 0.2272945820785991, + "grad_norm": 8.196453094482422, + "learning_rate": 8.857838414568667e-06, + "loss": 0.1384, + "step": 8982 + }, + { + "epoch": 0.22731988764329275, + "grad_norm": 9.444193840026855, + "learning_rate": 8.857582974674484e-06, + "loss": 0.1484, + "step": 8983 + }, + { + "epoch": 0.22734519320798643, + "grad_norm": 4.120792388916016, + "learning_rate": 8.857327509903466e-06, + "loss": 0.1867, + "step": 8984 + }, + { + "epoch": 0.22737049877268012, + "grad_norm": 4.959303379058838, + "learning_rate": 8.857072020257261e-06, + "loss": 0.1437, + "step": 8985 + }, + { + "epoch": 0.22739580433737377, + "grad_norm": 5.279537200927734, + "learning_rate": 8.856816505737515e-06, + "loss": 0.1811, + "step": 8986 + }, + { + "epoch": 0.22742110990206746, + "grad_norm": 3.8496479988098145, + "learning_rate": 8.856560966345878e-06, + "loss": 0.1286, + "step": 8987 + }, + { + "epoch": 0.22744641546676114, + "grad_norm": 6.081680774688721, + "learning_rate": 8.856305402083996e-06, + "loss": 0.2133, + "step": 8988 + }, + { + "epoch": 0.22747172103145483, + "grad_norm": 3.098691940307617, + "learning_rate": 8.856049812953517e-06, + "loss": 0.1574, + "step": 8989 + }, + { + "epoch": 0.22749702659614848, + "grad_norm": 5.960718154907227, + "learning_rate": 8.855794198956092e-06, + "loss": 0.2006, + "step": 8990 + }, + { + "epoch": 0.22752233216084217, + "grad_norm": 3.7688233852386475, + "learning_rate": 8.855538560093366e-06, + "loss": 0.1822, + "step": 8991 + }, + { + "epoch": 0.22754763772553585, + "grad_norm": 4.284513473510742, + "learning_rate": 8.85528289636699e-06, + "loss": 0.1572, + "step": 8992 + }, + { + "epoch": 0.22757294329022953, + "grad_norm": 4.103488922119141, + "learning_rate": 8.85502720777861e-06, + "loss": 0.2266, + "step": 8993 + }, + { + "epoch": 0.2275982488549232, + "grad_norm": 6.6319355964660645, + "learning_rate": 8.854771494329877e-06, + "loss": 0.2393, + "step": 8994 + }, + { + "epoch": 0.22762355441961687, + "grad_norm": 10.6194486618042, + "learning_rate": 8.854515756022441e-06, + "loss": 0.1793, + "step": 8995 + }, + { + "epoch": 0.22764885998431056, + "grad_norm": 3.3650104999542236, + "learning_rate": 8.854259992857948e-06, + "loss": 0.2148, + "step": 8996 + }, + { + "epoch": 0.22767416554900421, + "grad_norm": 7.611555576324463, + "learning_rate": 8.854004204838052e-06, + "loss": 0.2919, + "step": 8997 + }, + { + "epoch": 0.2276994711136979, + "grad_norm": 3.945265054702759, + "learning_rate": 8.853748391964397e-06, + "loss": 0.1579, + "step": 8998 + }, + { + "epoch": 0.22772477667839158, + "grad_norm": 50.63380813598633, + "learning_rate": 8.853492554238637e-06, + "loss": 0.2972, + "step": 8999 + }, + { + "epoch": 0.22775008224308527, + "grad_norm": 3.5256564617156982, + "learning_rate": 8.853236691662421e-06, + "loss": 0.1538, + "step": 9000 + }, + { + "epoch": 0.22777538780777892, + "grad_norm": 5.282230854034424, + "learning_rate": 8.852980804237397e-06, + "loss": 0.1957, + "step": 9001 + }, + { + "epoch": 0.2278006933724726, + "grad_norm": 4.595561981201172, + "learning_rate": 8.852724891965216e-06, + "loss": 0.1389, + "step": 9002 + }, + { + "epoch": 0.2278259989371663, + "grad_norm": 3.757225513458252, + "learning_rate": 8.85246895484753e-06, + "loss": 0.1838, + "step": 9003 + }, + { + "epoch": 0.22785130450185995, + "grad_norm": 7.266322612762451, + "learning_rate": 8.852212992885989e-06, + "loss": 0.2369, + "step": 9004 + }, + { + "epoch": 0.22787661006655363, + "grad_norm": 4.185303211212158, + "learning_rate": 8.851957006082242e-06, + "loss": 0.1236, + "step": 9005 + }, + { + "epoch": 0.22790191563124731, + "grad_norm": 6.032547473907471, + "learning_rate": 8.851700994437941e-06, + "loss": 0.1095, + "step": 9006 + }, + { + "epoch": 0.227927221195941, + "grad_norm": 4.644323348999023, + "learning_rate": 8.851444957954736e-06, + "loss": 0.1853, + "step": 9007 + }, + { + "epoch": 0.22795252676063466, + "grad_norm": 11.14380931854248, + "learning_rate": 8.85118889663428e-06, + "loss": 0.2072, + "step": 9008 + }, + { + "epoch": 0.22797783232532834, + "grad_norm": 6.819361686706543, + "learning_rate": 8.850932810478222e-06, + "loss": 0.2305, + "step": 9009 + }, + { + "epoch": 0.22800313789002202, + "grad_norm": 5.687652587890625, + "learning_rate": 8.850676699488215e-06, + "loss": 0.2271, + "step": 9010 + }, + { + "epoch": 0.22802844345471568, + "grad_norm": 4.129985809326172, + "learning_rate": 8.850420563665912e-06, + "loss": 0.1935, + "step": 9011 + }, + { + "epoch": 0.22805374901940936, + "grad_norm": 4.010630130767822, + "learning_rate": 8.850164403012959e-06, + "loss": 0.0976, + "step": 9012 + }, + { + "epoch": 0.22807905458410305, + "grad_norm": 3.2048139572143555, + "learning_rate": 8.849908217531015e-06, + "loss": 0.1272, + "step": 9013 + }, + { + "epoch": 0.22810436014879673, + "grad_norm": 4.801650047302246, + "learning_rate": 8.84965200722173e-06, + "loss": 0.1374, + "step": 9014 + }, + { + "epoch": 0.2281296657134904, + "grad_norm": 4.8348846435546875, + "learning_rate": 8.849395772086752e-06, + "loss": 0.1497, + "step": 9015 + }, + { + "epoch": 0.22815497127818407, + "grad_norm": 3.9636359214782715, + "learning_rate": 8.849139512127741e-06, + "loss": 0.1737, + "step": 9016 + }, + { + "epoch": 0.22818027684287776, + "grad_norm": 10.804226875305176, + "learning_rate": 8.848883227346341e-06, + "loss": 0.3204, + "step": 9017 + }, + { + "epoch": 0.2282055824075714, + "grad_norm": 8.689821243286133, + "learning_rate": 8.848626917744211e-06, + "loss": 0.3236, + "step": 9018 + }, + { + "epoch": 0.2282308879722651, + "grad_norm": 3.7583227157592773, + "learning_rate": 8.848370583323003e-06, + "loss": 0.1919, + "step": 9019 + }, + { + "epoch": 0.22825619353695878, + "grad_norm": 7.5419840812683105, + "learning_rate": 8.848114224084368e-06, + "loss": 0.2771, + "step": 9020 + }, + { + "epoch": 0.22828149910165246, + "grad_norm": 17.610532760620117, + "learning_rate": 8.847857840029958e-06, + "loss": 0.1775, + "step": 9021 + }, + { + "epoch": 0.22830680466634612, + "grad_norm": 4.9900078773498535, + "learning_rate": 8.847601431161431e-06, + "loss": 0.1877, + "step": 9022 + }, + { + "epoch": 0.2283321102310398, + "grad_norm": 4.367607116699219, + "learning_rate": 8.847344997480438e-06, + "loss": 0.1667, + "step": 9023 + }, + { + "epoch": 0.2283574157957335, + "grad_norm": 4.6637067794799805, + "learning_rate": 8.847088538988632e-06, + "loss": 0.1702, + "step": 9024 + }, + { + "epoch": 0.22838272136042717, + "grad_norm": 7.313059329986572, + "learning_rate": 8.846832055687668e-06, + "loss": 0.1731, + "step": 9025 + }, + { + "epoch": 0.22840802692512083, + "grad_norm": 3.807828426361084, + "learning_rate": 8.8465755475792e-06, + "loss": 0.1938, + "step": 9026 + }, + { + "epoch": 0.2284333324898145, + "grad_norm": 3.947455406188965, + "learning_rate": 8.846319014664882e-06, + "loss": 0.1939, + "step": 9027 + }, + { + "epoch": 0.2284586380545082, + "grad_norm": 3.724869966506958, + "learning_rate": 8.846062456946366e-06, + "loss": 0.1599, + "step": 9028 + }, + { + "epoch": 0.22848394361920185, + "grad_norm": 4.464668273925781, + "learning_rate": 8.845805874425311e-06, + "loss": 0.1506, + "step": 9029 + }, + { + "epoch": 0.22850924918389554, + "grad_norm": 3.2335901260375977, + "learning_rate": 8.845549267103369e-06, + "loss": 0.1379, + "step": 9030 + }, + { + "epoch": 0.22853455474858922, + "grad_norm": 3.6285665035247803, + "learning_rate": 8.845292634982194e-06, + "loss": 0.1434, + "step": 9031 + }, + { + "epoch": 0.2285598603132829, + "grad_norm": 8.67365550994873, + "learning_rate": 8.845035978063442e-06, + "loss": 0.2239, + "step": 9032 + }, + { + "epoch": 0.22858516587797656, + "grad_norm": 14.869178771972656, + "learning_rate": 8.844779296348769e-06, + "loss": 0.2708, + "step": 9033 + }, + { + "epoch": 0.22861047144267024, + "grad_norm": 7.413976669311523, + "learning_rate": 8.84452258983983e-06, + "loss": 0.229, + "step": 9034 + }, + { + "epoch": 0.22863577700736393, + "grad_norm": 4.3438029289245605, + "learning_rate": 8.844265858538279e-06, + "loss": 0.1838, + "step": 9035 + }, + { + "epoch": 0.22866108257205758, + "grad_norm": 12.206024169921875, + "learning_rate": 8.844009102445772e-06, + "loss": 0.2148, + "step": 9036 + }, + { + "epoch": 0.22868638813675127, + "grad_norm": 2.8913378715515137, + "learning_rate": 8.843752321563966e-06, + "loss": 0.1398, + "step": 9037 + }, + { + "epoch": 0.22871169370144495, + "grad_norm": 9.450557708740234, + "learning_rate": 8.843495515894518e-06, + "loss": 0.1953, + "step": 9038 + }, + { + "epoch": 0.22873699926613864, + "grad_norm": 6.974820137023926, + "learning_rate": 8.843238685439079e-06, + "loss": 0.2605, + "step": 9039 + }, + { + "epoch": 0.2287623048308323, + "grad_norm": 4.291203498840332, + "learning_rate": 8.84298183019931e-06, + "loss": 0.1336, + "step": 9040 + }, + { + "epoch": 0.22878761039552598, + "grad_norm": 9.052224159240723, + "learning_rate": 8.842724950176866e-06, + "loss": 0.2116, + "step": 9041 + }, + { + "epoch": 0.22881291596021966, + "grad_norm": 3.1094181537628174, + "learning_rate": 8.842468045373405e-06, + "loss": 0.1318, + "step": 9042 + }, + { + "epoch": 0.22883822152491332, + "grad_norm": 3.231224298477173, + "learning_rate": 8.842211115790583e-06, + "loss": 0.195, + "step": 9043 + }, + { + "epoch": 0.228863527089607, + "grad_norm": 4.296218395233154, + "learning_rate": 8.841954161430053e-06, + "loss": 0.1549, + "step": 9044 + }, + { + "epoch": 0.22888883265430068, + "grad_norm": 8.401008605957031, + "learning_rate": 8.841697182293478e-06, + "loss": 0.2448, + "step": 9045 + }, + { + "epoch": 0.22891413821899437, + "grad_norm": 3.283003807067871, + "learning_rate": 8.84144017838251e-06, + "loss": 0.1651, + "step": 9046 + }, + { + "epoch": 0.22893944378368802, + "grad_norm": 5.468053340911865, + "learning_rate": 8.84118314969881e-06, + "loss": 0.2316, + "step": 9047 + }, + { + "epoch": 0.2289647493483817, + "grad_norm": 5.8346076011657715, + "learning_rate": 8.840926096244035e-06, + "loss": 0.1403, + "step": 9048 + }, + { + "epoch": 0.2289900549130754, + "grad_norm": 3.2325990200042725, + "learning_rate": 8.840669018019842e-06, + "loss": 0.1584, + "step": 9049 + }, + { + "epoch": 0.22901536047776905, + "grad_norm": 7.296062469482422, + "learning_rate": 8.84041191502789e-06, + "loss": 0.2651, + "step": 9050 + }, + { + "epoch": 0.22904066604246273, + "grad_norm": 14.68761157989502, + "learning_rate": 8.840154787269833e-06, + "loss": 0.4209, + "step": 9051 + }, + { + "epoch": 0.22906597160715642, + "grad_norm": 3.8710947036743164, + "learning_rate": 8.839897634747334e-06, + "loss": 0.2062, + "step": 9052 + }, + { + "epoch": 0.2290912771718501, + "grad_norm": 7.875890731811523, + "learning_rate": 8.83964045746205e-06, + "loss": 0.1858, + "step": 9053 + }, + { + "epoch": 0.22911658273654376, + "grad_norm": 6.226352214813232, + "learning_rate": 8.839383255415637e-06, + "loss": 0.2028, + "step": 9054 + }, + { + "epoch": 0.22914188830123744, + "grad_norm": 2.367873191833496, + "learning_rate": 8.839126028609757e-06, + "loss": 0.1738, + "step": 9055 + }, + { + "epoch": 0.22916719386593112, + "grad_norm": 3.5026297569274902, + "learning_rate": 8.838868777046066e-06, + "loss": 0.1972, + "step": 9056 + }, + { + "epoch": 0.2291924994306248, + "grad_norm": 3.254161834716797, + "learning_rate": 8.838611500726226e-06, + "loss": 0.1507, + "step": 9057 + }, + { + "epoch": 0.22921780499531846, + "grad_norm": 5.4574127197265625, + "learning_rate": 8.838354199651892e-06, + "loss": 0.1658, + "step": 9058 + }, + { + "epoch": 0.22924311056001215, + "grad_norm": 4.186513900756836, + "learning_rate": 8.838096873824728e-06, + "loss": 0.1668, + "step": 9059 + }, + { + "epoch": 0.22926841612470583, + "grad_norm": 8.972687721252441, + "learning_rate": 8.83783952324639e-06, + "loss": 0.2395, + "step": 9060 + }, + { + "epoch": 0.2292937216893995, + "grad_norm": 4.660366058349609, + "learning_rate": 8.837582147918542e-06, + "loss": 0.2147, + "step": 9061 + }, + { + "epoch": 0.22931902725409317, + "grad_norm": 3.1776673793792725, + "learning_rate": 8.837324747842838e-06, + "loss": 0.1288, + "step": 9062 + }, + { + "epoch": 0.22934433281878686, + "grad_norm": 3.529775619506836, + "learning_rate": 8.83706732302094e-06, + "loss": 0.1242, + "step": 9063 + }, + { + "epoch": 0.22936963838348054, + "grad_norm": 4.208033561706543, + "learning_rate": 8.836809873454508e-06, + "loss": 0.1569, + "step": 9064 + }, + { + "epoch": 0.2293949439481742, + "grad_norm": 6.7624592781066895, + "learning_rate": 8.836552399145204e-06, + "loss": 0.1894, + "step": 9065 + }, + { + "epoch": 0.22942024951286788, + "grad_norm": 6.089359283447266, + "learning_rate": 8.836294900094687e-06, + "loss": 0.191, + "step": 9066 + }, + { + "epoch": 0.22944555507756156, + "grad_norm": 4.306403636932373, + "learning_rate": 8.836037376304618e-06, + "loss": 0.152, + "step": 9067 + }, + { + "epoch": 0.22947086064225522, + "grad_norm": 3.2091917991638184, + "learning_rate": 8.835779827776658e-06, + "loss": 0.1891, + "step": 9068 + }, + { + "epoch": 0.2294961662069489, + "grad_norm": 16.646282196044922, + "learning_rate": 8.835522254512468e-06, + "loss": 0.3027, + "step": 9069 + }, + { + "epoch": 0.2295214717716426, + "grad_norm": 4.8543524742126465, + "learning_rate": 8.835264656513707e-06, + "loss": 0.1644, + "step": 9070 + }, + { + "epoch": 0.22954677733633627, + "grad_norm": 10.451217651367188, + "learning_rate": 8.835007033782037e-06, + "loss": 0.2027, + "step": 9071 + }, + { + "epoch": 0.22957208290102993, + "grad_norm": 2.4436728954315186, + "learning_rate": 8.834749386319121e-06, + "loss": 0.1187, + "step": 9072 + }, + { + "epoch": 0.2295973884657236, + "grad_norm": 5.886061668395996, + "learning_rate": 8.834491714126619e-06, + "loss": 0.1744, + "step": 9073 + }, + { + "epoch": 0.2296226940304173, + "grad_norm": 5.545167922973633, + "learning_rate": 8.834234017206194e-06, + "loss": 0.1706, + "step": 9074 + }, + { + "epoch": 0.22964799959511095, + "grad_norm": 5.226049423217773, + "learning_rate": 8.833976295559507e-06, + "loss": 0.1228, + "step": 9075 + }, + { + "epoch": 0.22967330515980464, + "grad_norm": 13.781731605529785, + "learning_rate": 8.83371854918822e-06, + "loss": 0.1876, + "step": 9076 + }, + { + "epoch": 0.22969861072449832, + "grad_norm": 5.744904518127441, + "learning_rate": 8.833460778093994e-06, + "loss": 0.1018, + "step": 9077 + }, + { + "epoch": 0.229723916289192, + "grad_norm": 5.709130764007568, + "learning_rate": 8.833202982278496e-06, + "loss": 0.2219, + "step": 9078 + }, + { + "epoch": 0.22974922185388566, + "grad_norm": 4.860949516296387, + "learning_rate": 8.83294516174338e-06, + "loss": 0.1752, + "step": 9079 + }, + { + "epoch": 0.22977452741857934, + "grad_norm": 6.510313987731934, + "learning_rate": 8.832687316490317e-06, + "loss": 0.1908, + "step": 9080 + }, + { + "epoch": 0.22979983298327303, + "grad_norm": 3.7105789184570312, + "learning_rate": 8.832429446520965e-06, + "loss": 0.0867, + "step": 9081 + }, + { + "epoch": 0.22982513854796668, + "grad_norm": 19.669157028198242, + "learning_rate": 8.83217155183699e-06, + "loss": 0.173, + "step": 9082 + }, + { + "epoch": 0.22985044411266037, + "grad_norm": 7.014414310455322, + "learning_rate": 8.831913632440052e-06, + "loss": 0.2556, + "step": 9083 + }, + { + "epoch": 0.22987574967735405, + "grad_norm": 7.360530853271484, + "learning_rate": 8.831655688331817e-06, + "loss": 0.1465, + "step": 9084 + }, + { + "epoch": 0.22990105524204774, + "grad_norm": 10.432516098022461, + "learning_rate": 8.831397719513946e-06, + "loss": 0.2991, + "step": 9085 + }, + { + "epoch": 0.2299263608067414, + "grad_norm": 4.627267837524414, + "learning_rate": 8.831139725988103e-06, + "loss": 0.1667, + "step": 9086 + }, + { + "epoch": 0.22995166637143508, + "grad_norm": 4.878843307495117, + "learning_rate": 8.830881707755955e-06, + "loss": 0.178, + "step": 9087 + }, + { + "epoch": 0.22997697193612876, + "grad_norm": 6.897216796875, + "learning_rate": 8.830623664819163e-06, + "loss": 0.1514, + "step": 9088 + }, + { + "epoch": 0.23000227750082244, + "grad_norm": 14.999512672424316, + "learning_rate": 8.83036559717939e-06, + "loss": 0.1968, + "step": 9089 + }, + { + "epoch": 0.2300275830655161, + "grad_norm": 12.382305145263672, + "learning_rate": 8.830107504838302e-06, + "loss": 0.2777, + "step": 9090 + }, + { + "epoch": 0.23005288863020978, + "grad_norm": 3.13804030418396, + "learning_rate": 8.829849387797564e-06, + "loss": 0.1072, + "step": 9091 + }, + { + "epoch": 0.23007819419490347, + "grad_norm": 5.2675347328186035, + "learning_rate": 8.829591246058839e-06, + "loss": 0.1066, + "step": 9092 + }, + { + "epoch": 0.23010349975959712, + "grad_norm": 14.584823608398438, + "learning_rate": 8.829333079623795e-06, + "loss": 0.2403, + "step": 9093 + }, + { + "epoch": 0.2301288053242908, + "grad_norm": 5.960893154144287, + "learning_rate": 8.82907488849409e-06, + "loss": 0.115, + "step": 9094 + }, + { + "epoch": 0.2301541108889845, + "grad_norm": 2.8784594535827637, + "learning_rate": 8.828816672671396e-06, + "loss": 0.1096, + "step": 9095 + }, + { + "epoch": 0.23017941645367818, + "grad_norm": 6.641319274902344, + "learning_rate": 8.828558432157374e-06, + "loss": 0.233, + "step": 9096 + }, + { + "epoch": 0.23020472201837183, + "grad_norm": 2.8725569248199463, + "learning_rate": 8.828300166953692e-06, + "loss": 0.1605, + "step": 9097 + }, + { + "epoch": 0.23023002758306552, + "grad_norm": 4.011223316192627, + "learning_rate": 8.828041877062015e-06, + "loss": 0.2125, + "step": 9098 + }, + { + "epoch": 0.2302553331477592, + "grad_norm": 8.481375694274902, + "learning_rate": 8.827783562484006e-06, + "loss": 0.2165, + "step": 9099 + }, + { + "epoch": 0.23028063871245286, + "grad_norm": 3.3173725605010986, + "learning_rate": 8.827525223221335e-06, + "loss": 0.174, + "step": 9100 + }, + { + "epoch": 0.23030594427714654, + "grad_norm": 3.0722997188568115, + "learning_rate": 8.827266859275665e-06, + "loss": 0.1177, + "step": 9101 + }, + { + "epoch": 0.23033124984184022, + "grad_norm": 11.650716781616211, + "learning_rate": 8.827008470648662e-06, + "loss": 0.3298, + "step": 9102 + }, + { + "epoch": 0.2303565554065339, + "grad_norm": 7.149286270141602, + "learning_rate": 8.826750057341994e-06, + "loss": 0.1884, + "step": 9103 + }, + { + "epoch": 0.23038186097122756, + "grad_norm": 10.35311222076416, + "learning_rate": 8.826491619357326e-06, + "loss": 0.1929, + "step": 9104 + }, + { + "epoch": 0.23040716653592125, + "grad_norm": 4.207490921020508, + "learning_rate": 8.826233156696327e-06, + "loss": 0.1554, + "step": 9105 + }, + { + "epoch": 0.23043247210061493, + "grad_norm": 5.545538902282715, + "learning_rate": 8.825974669360662e-06, + "loss": 0.1222, + "step": 9106 + }, + { + "epoch": 0.2304577776653086, + "grad_norm": 9.031952857971191, + "learning_rate": 8.825716157351996e-06, + "loss": 0.231, + "step": 9107 + }, + { + "epoch": 0.23048308323000227, + "grad_norm": 6.583849906921387, + "learning_rate": 8.825457620671999e-06, + "loss": 0.1093, + "step": 9108 + }, + { + "epoch": 0.23050838879469596, + "grad_norm": 7.902552604675293, + "learning_rate": 8.825199059322337e-06, + "loss": 0.2698, + "step": 9109 + }, + { + "epoch": 0.23053369435938964, + "grad_norm": 4.234184265136719, + "learning_rate": 8.82494047330468e-06, + "loss": 0.1759, + "step": 9110 + }, + { + "epoch": 0.2305589999240833, + "grad_norm": 7.804285526275635, + "learning_rate": 8.824681862620691e-06, + "loss": 0.1677, + "step": 9111 + }, + { + "epoch": 0.23058430548877698, + "grad_norm": 11.641921997070312, + "learning_rate": 8.824423227272043e-06, + "loss": 0.2038, + "step": 9112 + }, + { + "epoch": 0.23060961105347066, + "grad_norm": 3.713268518447876, + "learning_rate": 8.824164567260398e-06, + "loss": 0.2257, + "step": 9113 + }, + { + "epoch": 0.23063491661816432, + "grad_norm": 30.039247512817383, + "learning_rate": 8.823905882587427e-06, + "loss": 0.4206, + "step": 9114 + }, + { + "epoch": 0.230660222182858, + "grad_norm": 3.0422308444976807, + "learning_rate": 8.823647173254799e-06, + "loss": 0.1394, + "step": 9115 + }, + { + "epoch": 0.2306855277475517, + "grad_norm": 5.793537139892578, + "learning_rate": 8.823388439264183e-06, + "loss": 0.1121, + "step": 9116 + }, + { + "epoch": 0.23071083331224537, + "grad_norm": 8.278068542480469, + "learning_rate": 8.823129680617244e-06, + "loss": 0.2885, + "step": 9117 + }, + { + "epoch": 0.23073613887693903, + "grad_norm": 3.958761692047119, + "learning_rate": 8.822870897315653e-06, + "loss": 0.1645, + "step": 9118 + }, + { + "epoch": 0.2307614444416327, + "grad_norm": 7.323860168457031, + "learning_rate": 8.822612089361082e-06, + "loss": 0.2122, + "step": 9119 + }, + { + "epoch": 0.2307867500063264, + "grad_norm": 5.851650238037109, + "learning_rate": 8.822353256755194e-06, + "loss": 0.2214, + "step": 9120 + }, + { + "epoch": 0.23081205557102008, + "grad_norm": 7.729881763458252, + "learning_rate": 8.82209439949966e-06, + "loss": 0.2691, + "step": 9121 + }, + { + "epoch": 0.23083736113571374, + "grad_norm": 5.16257905960083, + "learning_rate": 8.821835517596154e-06, + "loss": 0.2396, + "step": 9122 + }, + { + "epoch": 0.23086266670040742, + "grad_norm": 4.705042839050293, + "learning_rate": 8.821576611046339e-06, + "loss": 0.2061, + "step": 9123 + }, + { + "epoch": 0.2308879722651011, + "grad_norm": 3.8644888401031494, + "learning_rate": 8.821317679851886e-06, + "loss": 0.1286, + "step": 9124 + }, + { + "epoch": 0.23091327782979476, + "grad_norm": 10.573563575744629, + "learning_rate": 8.821058724014467e-06, + "loss": 0.2117, + "step": 9125 + }, + { + "epoch": 0.23093858339448844, + "grad_norm": 3.3611090183258057, + "learning_rate": 8.820799743535752e-06, + "loss": 0.1091, + "step": 9126 + }, + { + "epoch": 0.23096388895918213, + "grad_norm": 9.658782958984375, + "learning_rate": 8.820540738417408e-06, + "loss": 0.296, + "step": 9127 + }, + { + "epoch": 0.2309891945238758, + "grad_norm": 8.010263442993164, + "learning_rate": 8.82028170866111e-06, + "loss": 0.303, + "step": 9128 + }, + { + "epoch": 0.23101450008856947, + "grad_norm": 6.69428825378418, + "learning_rate": 8.820022654268525e-06, + "loss": 0.2481, + "step": 9129 + }, + { + "epoch": 0.23103980565326315, + "grad_norm": 2.4906198978424072, + "learning_rate": 8.819763575241325e-06, + "loss": 0.135, + "step": 9130 + }, + { + "epoch": 0.23106511121795684, + "grad_norm": 5.8681416511535645, + "learning_rate": 8.819504471581179e-06, + "loss": 0.2667, + "step": 9131 + }, + { + "epoch": 0.2310904167826505, + "grad_norm": 10.445540428161621, + "learning_rate": 8.81924534328976e-06, + "loss": 0.2313, + "step": 9132 + }, + { + "epoch": 0.23111572234734418, + "grad_norm": 4.78541898727417, + "learning_rate": 8.818986190368739e-06, + "loss": 0.2418, + "step": 9133 + }, + { + "epoch": 0.23114102791203786, + "grad_norm": 6.341945648193359, + "learning_rate": 8.818727012819786e-06, + "loss": 0.139, + "step": 9134 + }, + { + "epoch": 0.23116633347673154, + "grad_norm": 3.94534969329834, + "learning_rate": 8.818467810644573e-06, + "loss": 0.1637, + "step": 9135 + }, + { + "epoch": 0.2311916390414252, + "grad_norm": 7.817538261413574, + "learning_rate": 8.818208583844769e-06, + "loss": 0.237, + "step": 9136 + }, + { + "epoch": 0.23121694460611888, + "grad_norm": 4.096109867095947, + "learning_rate": 8.81794933242205e-06, + "loss": 0.2118, + "step": 9137 + }, + { + "epoch": 0.23124225017081257, + "grad_norm": 5.092865467071533, + "learning_rate": 8.817690056378086e-06, + "loss": 0.2192, + "step": 9138 + }, + { + "epoch": 0.23126755573550623, + "grad_norm": 3.7222952842712402, + "learning_rate": 8.817430755714549e-06, + "loss": 0.182, + "step": 9139 + }, + { + "epoch": 0.2312928613001999, + "grad_norm": 2.831017017364502, + "learning_rate": 8.81717143043311e-06, + "loss": 0.1849, + "step": 9140 + }, + { + "epoch": 0.2313181668648936, + "grad_norm": 4.784504413604736, + "learning_rate": 8.816912080535445e-06, + "loss": 0.1229, + "step": 9141 + }, + { + "epoch": 0.23134347242958728, + "grad_norm": 5.405756950378418, + "learning_rate": 8.81665270602322e-06, + "loss": 0.242, + "step": 9142 + }, + { + "epoch": 0.23136877799428093, + "grad_norm": 3.483268976211548, + "learning_rate": 8.816393306898116e-06, + "loss": 0.1098, + "step": 9143 + }, + { + "epoch": 0.23139408355897462, + "grad_norm": 5.310311317443848, + "learning_rate": 8.8161338831618e-06, + "loss": 0.2145, + "step": 9144 + }, + { + "epoch": 0.2314193891236683, + "grad_norm": 4.342601299285889, + "learning_rate": 8.815874434815946e-06, + "loss": 0.1849, + "step": 9145 + }, + { + "epoch": 0.23144469468836196, + "grad_norm": 10.626973152160645, + "learning_rate": 8.815614961862226e-06, + "loss": 0.2818, + "step": 9146 + }, + { + "epoch": 0.23147000025305564, + "grad_norm": 9.573466300964355, + "learning_rate": 8.815355464302317e-06, + "loss": 0.1958, + "step": 9147 + }, + { + "epoch": 0.23149530581774933, + "grad_norm": 5.262455463409424, + "learning_rate": 8.81509594213789e-06, + "loss": 0.1441, + "step": 9148 + }, + { + "epoch": 0.231520611382443, + "grad_norm": 8.95622730255127, + "learning_rate": 8.814836395370618e-06, + "loss": 0.2323, + "step": 9149 + }, + { + "epoch": 0.23154591694713667, + "grad_norm": 5.101030349731445, + "learning_rate": 8.814576824002177e-06, + "loss": 0.1749, + "step": 9150 + }, + { + "epoch": 0.23157122251183035, + "grad_norm": 4.737504482269287, + "learning_rate": 8.814317228034239e-06, + "loss": 0.1857, + "step": 9151 + }, + { + "epoch": 0.23159652807652403, + "grad_norm": 11.752699851989746, + "learning_rate": 8.814057607468479e-06, + "loss": 0.3564, + "step": 9152 + }, + { + "epoch": 0.23162183364121772, + "grad_norm": 5.156250953674316, + "learning_rate": 8.81379796230657e-06, + "loss": 0.1626, + "step": 9153 + }, + { + "epoch": 0.23164713920591137, + "grad_norm": 5.0390167236328125, + "learning_rate": 8.813538292550188e-06, + "loss": 0.2403, + "step": 9154 + }, + { + "epoch": 0.23167244477060506, + "grad_norm": 5.293144702911377, + "learning_rate": 8.813278598201007e-06, + "loss": 0.1733, + "step": 9155 + }, + { + "epoch": 0.23169775033529874, + "grad_norm": 9.128421783447266, + "learning_rate": 8.813018879260702e-06, + "loss": 0.2886, + "step": 9156 + }, + { + "epoch": 0.2317230558999924, + "grad_norm": 8.190834045410156, + "learning_rate": 8.812759135730946e-06, + "loss": 0.0931, + "step": 9157 + }, + { + "epoch": 0.23174836146468608, + "grad_norm": 6.136649131774902, + "learning_rate": 8.812499367613417e-06, + "loss": 0.2379, + "step": 9158 + }, + { + "epoch": 0.23177366702937977, + "grad_norm": 6.470221519470215, + "learning_rate": 8.81223957490979e-06, + "loss": 0.2277, + "step": 9159 + }, + { + "epoch": 0.23179897259407345, + "grad_norm": 17.839487075805664, + "learning_rate": 8.811979757621736e-06, + "loss": 0.2291, + "step": 9160 + }, + { + "epoch": 0.2318242781587671, + "grad_norm": 11.222451210021973, + "learning_rate": 8.811719915750936e-06, + "loss": 0.404, + "step": 9161 + }, + { + "epoch": 0.2318495837234608, + "grad_norm": 8.61713695526123, + "learning_rate": 8.811460049299062e-06, + "loss": 0.207, + "step": 9162 + }, + { + "epoch": 0.23187488928815447, + "grad_norm": 7.9243011474609375, + "learning_rate": 8.811200158267792e-06, + "loss": 0.2483, + "step": 9163 + }, + { + "epoch": 0.23190019485284813, + "grad_norm": 6.53853702545166, + "learning_rate": 8.8109402426588e-06, + "loss": 0.1973, + "step": 9164 + }, + { + "epoch": 0.2319255004175418, + "grad_norm": 6.081085205078125, + "learning_rate": 8.810680302473763e-06, + "loss": 0.1211, + "step": 9165 + }, + { + "epoch": 0.2319508059822355, + "grad_norm": 7.626801013946533, + "learning_rate": 8.810420337714361e-06, + "loss": 0.2285, + "step": 9166 + }, + { + "epoch": 0.23197611154692918, + "grad_norm": 11.601179122924805, + "learning_rate": 8.810160348382264e-06, + "loss": 0.2977, + "step": 9167 + }, + { + "epoch": 0.23200141711162284, + "grad_norm": 9.404379844665527, + "learning_rate": 8.809900334479152e-06, + "loss": 0.1447, + "step": 9168 + }, + { + "epoch": 0.23202672267631652, + "grad_norm": 3.1555888652801514, + "learning_rate": 8.8096402960067e-06, + "loss": 0.0902, + "step": 9169 + }, + { + "epoch": 0.2320520282410102, + "grad_norm": 8.172049522399902, + "learning_rate": 8.80938023296659e-06, + "loss": 0.1315, + "step": 9170 + }, + { + "epoch": 0.23207733380570386, + "grad_norm": 3.93029522895813, + "learning_rate": 8.809120145360493e-06, + "loss": 0.1833, + "step": 9171 + }, + { + "epoch": 0.23210263937039755, + "grad_norm": 5.194059371948242, + "learning_rate": 8.80886003319009e-06, + "loss": 0.2125, + "step": 9172 + }, + { + "epoch": 0.23212794493509123, + "grad_norm": 3.526480197906494, + "learning_rate": 8.808599896457055e-06, + "loss": 0.1469, + "step": 9173 + }, + { + "epoch": 0.2321532504997849, + "grad_norm": 5.187831401824951, + "learning_rate": 8.80833973516307e-06, + "loss": 0.2447, + "step": 9174 + }, + { + "epoch": 0.23217855606447857, + "grad_norm": 4.235044956207275, + "learning_rate": 8.80807954930981e-06, + "loss": 0.1958, + "step": 9175 + }, + { + "epoch": 0.23220386162917225, + "grad_norm": 3.777177095413208, + "learning_rate": 8.807819338898954e-06, + "loss": 0.0813, + "step": 9176 + }, + { + "epoch": 0.23222916719386594, + "grad_norm": 25.487287521362305, + "learning_rate": 8.807559103932177e-06, + "loss": 0.3184, + "step": 9177 + }, + { + "epoch": 0.2322544727585596, + "grad_norm": 7.512444972991943, + "learning_rate": 8.80729884441116e-06, + "loss": 0.1876, + "step": 9178 + }, + { + "epoch": 0.23227977832325328, + "grad_norm": 5.089847087860107, + "learning_rate": 8.807038560337581e-06, + "loss": 0.0819, + "step": 9179 + }, + { + "epoch": 0.23230508388794696, + "grad_norm": 3.7580933570861816, + "learning_rate": 8.806778251713121e-06, + "loss": 0.1696, + "step": 9180 + }, + { + "epoch": 0.23233038945264065, + "grad_norm": 4.335800647735596, + "learning_rate": 8.806517918539454e-06, + "loss": 0.119, + "step": 9181 + }, + { + "epoch": 0.2323556950173343, + "grad_norm": 4.122987270355225, + "learning_rate": 8.806257560818263e-06, + "loss": 0.2019, + "step": 9182 + }, + { + "epoch": 0.23238100058202799, + "grad_norm": 4.614480495452881, + "learning_rate": 8.805997178551222e-06, + "loss": 0.1739, + "step": 9183 + }, + { + "epoch": 0.23240630614672167, + "grad_norm": 5.2706780433654785, + "learning_rate": 8.805736771740014e-06, + "loss": 0.1566, + "step": 9184 + }, + { + "epoch": 0.23243161171141535, + "grad_norm": 6.00616979598999, + "learning_rate": 8.805476340386319e-06, + "loss": 0.2433, + "step": 9185 + }, + { + "epoch": 0.232456917276109, + "grad_norm": 4.050119876861572, + "learning_rate": 8.805215884491815e-06, + "loss": 0.1703, + "step": 9186 + }, + { + "epoch": 0.2324822228408027, + "grad_norm": 7.742641925811768, + "learning_rate": 8.80495540405818e-06, + "loss": 0.2086, + "step": 9187 + }, + { + "epoch": 0.23250752840549638, + "grad_norm": 6.604145050048828, + "learning_rate": 8.804694899087097e-06, + "loss": 0.1746, + "step": 9188 + }, + { + "epoch": 0.23253283397019003, + "grad_norm": 12.84156608581543, + "learning_rate": 8.804434369580243e-06, + "loss": 0.247, + "step": 9189 + }, + { + "epoch": 0.23255813953488372, + "grad_norm": 5.826672554016113, + "learning_rate": 8.804173815539299e-06, + "loss": 0.2242, + "step": 9190 + }, + { + "epoch": 0.2325834450995774, + "grad_norm": 4.279433250427246, + "learning_rate": 8.803913236965947e-06, + "loss": 0.1986, + "step": 9191 + }, + { + "epoch": 0.23260875066427109, + "grad_norm": 10.131429672241211, + "learning_rate": 8.803652633861865e-06, + "loss": 0.1169, + "step": 9192 + }, + { + "epoch": 0.23263405622896474, + "grad_norm": 6.735658645629883, + "learning_rate": 8.803392006228736e-06, + "loss": 0.2034, + "step": 9193 + }, + { + "epoch": 0.23265936179365843, + "grad_norm": 10.337409019470215, + "learning_rate": 8.80313135406824e-06, + "loss": 0.2332, + "step": 9194 + }, + { + "epoch": 0.2326846673583521, + "grad_norm": 3.90735125541687, + "learning_rate": 8.802870677382056e-06, + "loss": 0.1508, + "step": 9195 + }, + { + "epoch": 0.23270997292304577, + "grad_norm": 10.465649604797363, + "learning_rate": 8.802609976171866e-06, + "loss": 0.2838, + "step": 9196 + }, + { + "epoch": 0.23273527848773945, + "grad_norm": 6.248128890991211, + "learning_rate": 8.802349250439353e-06, + "loss": 0.175, + "step": 9197 + }, + { + "epoch": 0.23276058405243313, + "grad_norm": 4.84774923324585, + "learning_rate": 8.802088500186195e-06, + "loss": 0.1387, + "step": 9198 + }, + { + "epoch": 0.23278588961712682, + "grad_norm": 4.351926326751709, + "learning_rate": 8.801827725414076e-06, + "loss": 0.186, + "step": 9199 + }, + { + "epoch": 0.23281119518182047, + "grad_norm": 4.0753326416015625, + "learning_rate": 8.801566926124678e-06, + "loss": 0.194, + "step": 9200 + }, + { + "epoch": 0.23283650074651416, + "grad_norm": 4.609501838684082, + "learning_rate": 8.801306102319682e-06, + "loss": 0.1744, + "step": 9201 + }, + { + "epoch": 0.23286180631120784, + "grad_norm": 7.363157272338867, + "learning_rate": 8.80104525400077e-06, + "loss": 0.1539, + "step": 9202 + }, + { + "epoch": 0.2328871118759015, + "grad_norm": 10.59093189239502, + "learning_rate": 8.800784381169624e-06, + "loss": 0.2052, + "step": 9203 + }, + { + "epoch": 0.23291241744059518, + "grad_norm": 5.686209678649902, + "learning_rate": 8.800523483827928e-06, + "loss": 0.1579, + "step": 9204 + }, + { + "epoch": 0.23293772300528887, + "grad_norm": 3.057779312133789, + "learning_rate": 8.80026256197736e-06, + "loss": 0.1037, + "step": 9205 + }, + { + "epoch": 0.23296302856998255, + "grad_norm": 4.103731632232666, + "learning_rate": 8.800001615619607e-06, + "loss": 0.1606, + "step": 9206 + }, + { + "epoch": 0.2329883341346762, + "grad_norm": 18.36577606201172, + "learning_rate": 8.799740644756352e-06, + "loss": 0.2309, + "step": 9207 + }, + { + "epoch": 0.2330136396993699, + "grad_norm": 5.272091865539551, + "learning_rate": 8.799479649389275e-06, + "loss": 0.2206, + "step": 9208 + }, + { + "epoch": 0.23303894526406357, + "grad_norm": 4.8400397300720215, + "learning_rate": 8.799218629520061e-06, + "loss": 0.2191, + "step": 9209 + }, + { + "epoch": 0.23306425082875723, + "grad_norm": 14.695822715759277, + "learning_rate": 8.798957585150391e-06, + "loss": 0.3398, + "step": 9210 + }, + { + "epoch": 0.23308955639345091, + "grad_norm": 4.963815689086914, + "learning_rate": 8.798696516281953e-06, + "loss": 0.1167, + "step": 9211 + }, + { + "epoch": 0.2331148619581446, + "grad_norm": 10.466712951660156, + "learning_rate": 8.798435422916425e-06, + "loss": 0.3635, + "step": 9212 + }, + { + "epoch": 0.23314016752283828, + "grad_norm": 12.486979484558105, + "learning_rate": 8.798174305055496e-06, + "loss": 0.1482, + "step": 9213 + }, + { + "epoch": 0.23316547308753194, + "grad_norm": 6.571579933166504, + "learning_rate": 8.797913162700846e-06, + "loss": 0.2589, + "step": 9214 + }, + { + "epoch": 0.23319077865222562, + "grad_norm": 10.598034858703613, + "learning_rate": 8.79765199585416e-06, + "loss": 0.3895, + "step": 9215 + }, + { + "epoch": 0.2332160842169193, + "grad_norm": 4.766707420349121, + "learning_rate": 8.797390804517124e-06, + "loss": 0.1311, + "step": 9216 + }, + { + "epoch": 0.233241389781613, + "grad_norm": 22.057851791381836, + "learning_rate": 8.79712958869142e-06, + "loss": 0.4703, + "step": 9217 + }, + { + "epoch": 0.23326669534630665, + "grad_norm": 9.047327995300293, + "learning_rate": 8.796868348378735e-06, + "loss": 0.1, + "step": 9218 + }, + { + "epoch": 0.23329200091100033, + "grad_norm": 5.070095062255859, + "learning_rate": 8.79660708358075e-06, + "loss": 0.2424, + "step": 9219 + }, + { + "epoch": 0.23331730647569401, + "grad_norm": 9.608473777770996, + "learning_rate": 8.796345794299155e-06, + "loss": 0.1705, + "step": 9220 + }, + { + "epoch": 0.23334261204038767, + "grad_norm": 2.183706521987915, + "learning_rate": 8.79608448053563e-06, + "loss": 0.1115, + "step": 9221 + }, + { + "epoch": 0.23336791760508135, + "grad_norm": 3.986206531524658, + "learning_rate": 8.795823142291863e-06, + "loss": 0.1911, + "step": 9222 + }, + { + "epoch": 0.23339322316977504, + "grad_norm": 5.975214004516602, + "learning_rate": 8.795561779569539e-06, + "loss": 0.179, + "step": 9223 + }, + { + "epoch": 0.23341852873446872, + "grad_norm": 4.56141996383667, + "learning_rate": 8.795300392370342e-06, + "loss": 0.1866, + "step": 9224 + }, + { + "epoch": 0.23344383429916238, + "grad_norm": 11.084256172180176, + "learning_rate": 8.79503898069596e-06, + "loss": 0.2327, + "step": 9225 + }, + { + "epoch": 0.23346913986385606, + "grad_norm": 4.854404926300049, + "learning_rate": 8.794777544548078e-06, + "loss": 0.2464, + "step": 9226 + }, + { + "epoch": 0.23349444542854975, + "grad_norm": 6.757501125335693, + "learning_rate": 8.794516083928379e-06, + "loss": 0.2022, + "step": 9227 + }, + { + "epoch": 0.2335197509932434, + "grad_norm": 5.167013645172119, + "learning_rate": 8.794254598838554e-06, + "loss": 0.1386, + "step": 9228 + }, + { + "epoch": 0.2335450565579371, + "grad_norm": 3.883929491043091, + "learning_rate": 8.793993089280286e-06, + "loss": 0.1385, + "step": 9229 + }, + { + "epoch": 0.23357036212263077, + "grad_norm": 4.6334123611450195, + "learning_rate": 8.793731555255262e-06, + "loss": 0.2236, + "step": 9230 + }, + { + "epoch": 0.23359566768732445, + "grad_norm": 5.060256481170654, + "learning_rate": 8.793469996765169e-06, + "loss": 0.1621, + "step": 9231 + }, + { + "epoch": 0.2336209732520181, + "grad_norm": 5.961134910583496, + "learning_rate": 8.793208413811695e-06, + "loss": 0.1705, + "step": 9232 + }, + { + "epoch": 0.2336462788167118, + "grad_norm": 4.811519622802734, + "learning_rate": 8.792946806396524e-06, + "loss": 0.1787, + "step": 9233 + }, + { + "epoch": 0.23367158438140548, + "grad_norm": 9.753837585449219, + "learning_rate": 8.792685174521344e-06, + "loss": 0.2644, + "step": 9234 + }, + { + "epoch": 0.23369688994609913, + "grad_norm": 15.406986236572266, + "learning_rate": 8.792423518187846e-06, + "loss": 0.3379, + "step": 9235 + }, + { + "epoch": 0.23372219551079282, + "grad_norm": 5.603180408477783, + "learning_rate": 8.79216183739771e-06, + "loss": 0.2302, + "step": 9236 + }, + { + "epoch": 0.2337475010754865, + "grad_norm": 14.215389251708984, + "learning_rate": 8.791900132152632e-06, + "loss": 0.2967, + "step": 9237 + }, + { + "epoch": 0.2337728066401802, + "grad_norm": 7.111508846282959, + "learning_rate": 8.791638402454293e-06, + "loss": 0.2548, + "step": 9238 + }, + { + "epoch": 0.23379811220487384, + "grad_norm": 3.0030083656311035, + "learning_rate": 8.791376648304382e-06, + "loss": 0.1249, + "step": 9239 + }, + { + "epoch": 0.23382341776956753, + "grad_norm": 3.781914472579956, + "learning_rate": 8.791114869704588e-06, + "loss": 0.1445, + "step": 9240 + }, + { + "epoch": 0.2338487233342612, + "grad_norm": 10.147327423095703, + "learning_rate": 8.790853066656601e-06, + "loss": 0.2097, + "step": 9241 + }, + { + "epoch": 0.23387402889895487, + "grad_norm": 9.566787719726562, + "learning_rate": 8.79059123916211e-06, + "loss": 0.3031, + "step": 9242 + }, + { + "epoch": 0.23389933446364855, + "grad_norm": 4.522811412811279, + "learning_rate": 8.790329387222798e-06, + "loss": 0.1577, + "step": 9243 + }, + { + "epoch": 0.23392464002834223, + "grad_norm": 5.538619518280029, + "learning_rate": 8.790067510840358e-06, + "loss": 0.2147, + "step": 9244 + }, + { + "epoch": 0.23394994559303592, + "grad_norm": 7.1776909828186035, + "learning_rate": 8.789805610016478e-06, + "loss": 0.2329, + "step": 9245 + }, + { + "epoch": 0.23397525115772957, + "grad_norm": 7.25636625289917, + "learning_rate": 8.789543684752845e-06, + "loss": 0.1661, + "step": 9246 + }, + { + "epoch": 0.23400055672242326, + "grad_norm": 21.97743797302246, + "learning_rate": 8.78928173505115e-06, + "loss": 0.1701, + "step": 9247 + }, + { + "epoch": 0.23402586228711694, + "grad_norm": 8.299034118652344, + "learning_rate": 8.789019760913082e-06, + "loss": 0.2467, + "step": 9248 + }, + { + "epoch": 0.23405116785181063, + "grad_norm": 7.492396354675293, + "learning_rate": 8.788757762340332e-06, + "loss": 0.1936, + "step": 9249 + }, + { + "epoch": 0.23407647341650428, + "grad_norm": 4.720326900482178, + "learning_rate": 8.788495739334587e-06, + "loss": 0.231, + "step": 9250 + }, + { + "epoch": 0.23410177898119797, + "grad_norm": 4.408481121063232, + "learning_rate": 8.788233691897537e-06, + "loss": 0.1617, + "step": 9251 + }, + { + "epoch": 0.23412708454589165, + "grad_norm": 6.818581581115723, + "learning_rate": 8.787971620030872e-06, + "loss": 0.2306, + "step": 9252 + }, + { + "epoch": 0.2341523901105853, + "grad_norm": 4.009725570678711, + "learning_rate": 8.787709523736286e-06, + "loss": 0.2145, + "step": 9253 + }, + { + "epoch": 0.234177695675279, + "grad_norm": 11.135357856750488, + "learning_rate": 8.787447403015463e-06, + "loss": 0.2043, + "step": 9254 + }, + { + "epoch": 0.23420300123997267, + "grad_norm": 5.551290988922119, + "learning_rate": 8.787185257870097e-06, + "loss": 0.2075, + "step": 9255 + }, + { + "epoch": 0.23422830680466636, + "grad_norm": 9.12482738494873, + "learning_rate": 8.786923088301877e-06, + "loss": 0.1922, + "step": 9256 + }, + { + "epoch": 0.23425361236936001, + "grad_norm": 4.969911098480225, + "learning_rate": 8.786660894312496e-06, + "loss": 0.1606, + "step": 9257 + }, + { + "epoch": 0.2342789179340537, + "grad_norm": 8.010116577148438, + "learning_rate": 8.786398675903643e-06, + "loss": 0.2465, + "step": 9258 + }, + { + "epoch": 0.23430422349874738, + "grad_norm": 4.0121073722839355, + "learning_rate": 8.786136433077008e-06, + "loss": 0.1997, + "step": 9259 + }, + { + "epoch": 0.23432952906344104, + "grad_norm": 3.726735830307007, + "learning_rate": 8.785874165834284e-06, + "loss": 0.1418, + "step": 9260 + }, + { + "epoch": 0.23435483462813472, + "grad_norm": 6.195417881011963, + "learning_rate": 8.785611874177161e-06, + "loss": 0.227, + "step": 9261 + }, + { + "epoch": 0.2343801401928284, + "grad_norm": 4.0161638259887695, + "learning_rate": 8.785349558107332e-06, + "loss": 0.1724, + "step": 9262 + }, + { + "epoch": 0.2344054457575221, + "grad_norm": 4.239056587219238, + "learning_rate": 8.785087217626487e-06, + "loss": 0.2068, + "step": 9263 + }, + { + "epoch": 0.23443075132221575, + "grad_norm": 5.536708354949951, + "learning_rate": 8.78482485273632e-06, + "loss": 0.2358, + "step": 9264 + }, + { + "epoch": 0.23445605688690943, + "grad_norm": 3.6531410217285156, + "learning_rate": 8.784562463438522e-06, + "loss": 0.1901, + "step": 9265 + }, + { + "epoch": 0.23448136245160311, + "grad_norm": 7.1669111251831055, + "learning_rate": 8.784300049734783e-06, + "loss": 0.2797, + "step": 9266 + }, + { + "epoch": 0.23450666801629677, + "grad_norm": 3.285456895828247, + "learning_rate": 8.784037611626798e-06, + "loss": 0.1509, + "step": 9267 + }, + { + "epoch": 0.23453197358099045, + "grad_norm": 4.774399757385254, + "learning_rate": 8.783775149116258e-06, + "loss": 0.2573, + "step": 9268 + }, + { + "epoch": 0.23455727914568414, + "grad_norm": 4.1857099533081055, + "learning_rate": 8.783512662204855e-06, + "loss": 0.1426, + "step": 9269 + }, + { + "epoch": 0.23458258471037782, + "grad_norm": 7.032052516937256, + "learning_rate": 8.783250150894284e-06, + "loss": 0.2367, + "step": 9270 + }, + { + "epoch": 0.23460789027507148, + "grad_norm": 5.465542316436768, + "learning_rate": 8.782987615186236e-06, + "loss": 0.1284, + "step": 9271 + }, + { + "epoch": 0.23463319583976516, + "grad_norm": 3.6850602626800537, + "learning_rate": 8.782725055082403e-06, + "loss": 0.2003, + "step": 9272 + }, + { + "epoch": 0.23465850140445885, + "grad_norm": 7.013312816619873, + "learning_rate": 8.782462470584482e-06, + "loss": 0.2364, + "step": 9273 + }, + { + "epoch": 0.2346838069691525, + "grad_norm": 7.026103496551514, + "learning_rate": 8.782199861694164e-06, + "loss": 0.1566, + "step": 9274 + }, + { + "epoch": 0.2347091125338462, + "grad_norm": 4.3273396492004395, + "learning_rate": 8.781937228413142e-06, + "loss": 0.1002, + "step": 9275 + }, + { + "epoch": 0.23473441809853987, + "grad_norm": 7.294978618621826, + "learning_rate": 8.781674570743109e-06, + "loss": 0.3312, + "step": 9276 + }, + { + "epoch": 0.23475972366323355, + "grad_norm": 4.633492469787598, + "learning_rate": 8.781411888685762e-06, + "loss": 0.1369, + "step": 9277 + }, + { + "epoch": 0.2347850292279272, + "grad_norm": 8.860252380371094, + "learning_rate": 8.781149182242791e-06, + "loss": 0.2778, + "step": 9278 + }, + { + "epoch": 0.2348103347926209, + "grad_norm": 7.910706996917725, + "learning_rate": 8.780886451415895e-06, + "loss": 0.1536, + "step": 9279 + }, + { + "epoch": 0.23483564035731458, + "grad_norm": 6.780004978179932, + "learning_rate": 8.780623696206766e-06, + "loss": 0.2186, + "step": 9280 + }, + { + "epoch": 0.23486094592200826, + "grad_norm": 3.715291976928711, + "learning_rate": 8.780360916617094e-06, + "loss": 0.1691, + "step": 9281 + }, + { + "epoch": 0.23488625148670192, + "grad_norm": 9.378487586975098, + "learning_rate": 8.78009811264858e-06, + "loss": 0.2734, + "step": 9282 + }, + { + "epoch": 0.2349115570513956, + "grad_norm": 3.005950450897217, + "learning_rate": 8.779835284302918e-06, + "loss": 0.1604, + "step": 9283 + }, + { + "epoch": 0.2349368626160893, + "grad_norm": 5.091832160949707, + "learning_rate": 8.7795724315818e-06, + "loss": 0.1127, + "step": 9284 + }, + { + "epoch": 0.23496216818078294, + "grad_norm": 8.739545822143555, + "learning_rate": 8.77930955448692e-06, + "loss": 0.2901, + "step": 9285 + }, + { + "epoch": 0.23498747374547663, + "grad_norm": 3.6496667861938477, + "learning_rate": 8.779046653019979e-06, + "loss": 0.1695, + "step": 9286 + }, + { + "epoch": 0.2350127793101703, + "grad_norm": 5.898557662963867, + "learning_rate": 8.778783727182667e-06, + "loss": 0.197, + "step": 9287 + }, + { + "epoch": 0.235038084874864, + "grad_norm": 2.790466547012329, + "learning_rate": 8.778520776976684e-06, + "loss": 0.1544, + "step": 9288 + }, + { + "epoch": 0.23506339043955765, + "grad_norm": 5.459575653076172, + "learning_rate": 8.778257802403722e-06, + "loss": 0.2687, + "step": 9289 + }, + { + "epoch": 0.23508869600425134, + "grad_norm": 17.417951583862305, + "learning_rate": 8.777994803465479e-06, + "loss": 0.2827, + "step": 9290 + }, + { + "epoch": 0.23511400156894502, + "grad_norm": 5.100456714630127, + "learning_rate": 8.77773178016365e-06, + "loss": 0.2092, + "step": 9291 + }, + { + "epoch": 0.23513930713363868, + "grad_norm": 7.797430515289307, + "learning_rate": 8.77746873249993e-06, + "loss": 0.1257, + "step": 9292 + }, + { + "epoch": 0.23516461269833236, + "grad_norm": 7.770002365112305, + "learning_rate": 8.777205660476017e-06, + "loss": 0.2224, + "step": 9293 + }, + { + "epoch": 0.23518991826302604, + "grad_norm": 3.2854936122894287, + "learning_rate": 8.77694256409361e-06, + "loss": 0.1822, + "step": 9294 + }, + { + "epoch": 0.23521522382771973, + "grad_norm": 7.1103620529174805, + "learning_rate": 8.776679443354401e-06, + "loss": 0.186, + "step": 9295 + }, + { + "epoch": 0.23524052939241338, + "grad_norm": 6.1909637451171875, + "learning_rate": 8.776416298260089e-06, + "loss": 0.1997, + "step": 9296 + }, + { + "epoch": 0.23526583495710707, + "grad_norm": 8.60075855255127, + "learning_rate": 8.77615312881237e-06, + "loss": 0.1617, + "step": 9297 + }, + { + "epoch": 0.23529114052180075, + "grad_norm": 4.435788631439209, + "learning_rate": 8.775889935012942e-06, + "loss": 0.1864, + "step": 9298 + }, + { + "epoch": 0.2353164460864944, + "grad_norm": 5.000490665435791, + "learning_rate": 8.775626716863503e-06, + "loss": 0.2545, + "step": 9299 + }, + { + "epoch": 0.2353417516511881, + "grad_norm": 5.530342102050781, + "learning_rate": 8.77536347436575e-06, + "loss": 0.1249, + "step": 9300 + }, + { + "epoch": 0.23536705721588178, + "grad_norm": 8.226361274719238, + "learning_rate": 8.775100207521378e-06, + "loss": 0.2246, + "step": 9301 + }, + { + "epoch": 0.23539236278057546, + "grad_norm": 12.411422729492188, + "learning_rate": 8.77483691633209e-06, + "loss": 0.2795, + "step": 9302 + }, + { + "epoch": 0.23541766834526912, + "grad_norm": 3.4643194675445557, + "learning_rate": 8.774573600799579e-06, + "loss": 0.1845, + "step": 9303 + }, + { + "epoch": 0.2354429739099628, + "grad_norm": 6.7028584480285645, + "learning_rate": 8.774310260925546e-06, + "loss": 0.245, + "step": 9304 + }, + { + "epoch": 0.23546827947465648, + "grad_norm": 5.8373637199401855, + "learning_rate": 8.774046896711687e-06, + "loss": 0.2699, + "step": 9305 + }, + { + "epoch": 0.23549358503935014, + "grad_norm": 4.236074447631836, + "learning_rate": 8.773783508159703e-06, + "loss": 0.1687, + "step": 9306 + }, + { + "epoch": 0.23551889060404382, + "grad_norm": 11.230164527893066, + "learning_rate": 8.773520095271289e-06, + "loss": 0.3443, + "step": 9307 + }, + { + "epoch": 0.2355441961687375, + "grad_norm": 2.1822965145111084, + "learning_rate": 8.773256658048148e-06, + "loss": 0.0863, + "step": 9308 + }, + { + "epoch": 0.2355695017334312, + "grad_norm": 6.0108962059021, + "learning_rate": 8.772993196491975e-06, + "loss": 0.2125, + "step": 9309 + }, + { + "epoch": 0.23559480729812485, + "grad_norm": 6.383946895599365, + "learning_rate": 8.772729710604473e-06, + "loss": 0.2458, + "step": 9310 + }, + { + "epoch": 0.23562011286281853, + "grad_norm": 5.022489547729492, + "learning_rate": 8.772466200387338e-06, + "loss": 0.1681, + "step": 9311 + }, + { + "epoch": 0.23564541842751222, + "grad_norm": 5.332653045654297, + "learning_rate": 8.772202665842269e-06, + "loss": 0.1482, + "step": 9312 + }, + { + "epoch": 0.2356707239922059, + "grad_norm": 5.367880344390869, + "learning_rate": 8.771939106970967e-06, + "loss": 0.1927, + "step": 9313 + }, + { + "epoch": 0.23569602955689956, + "grad_norm": 4.311965465545654, + "learning_rate": 8.771675523775131e-06, + "loss": 0.1356, + "step": 9314 + }, + { + "epoch": 0.23572133512159324, + "grad_norm": 3.9552040100097656, + "learning_rate": 8.771411916256463e-06, + "loss": 0.1855, + "step": 9315 + }, + { + "epoch": 0.23574664068628692, + "grad_norm": 7.762195110321045, + "learning_rate": 8.77114828441666e-06, + "loss": 0.3731, + "step": 9316 + }, + { + "epoch": 0.23577194625098058, + "grad_norm": 4.986079692840576, + "learning_rate": 8.770884628257424e-06, + "loss": 0.2182, + "step": 9317 + }, + { + "epoch": 0.23579725181567426, + "grad_norm": 3.3082034587860107, + "learning_rate": 8.770620947780452e-06, + "loss": 0.1288, + "step": 9318 + }, + { + "epoch": 0.23582255738036795, + "grad_norm": 8.626445770263672, + "learning_rate": 8.770357242987448e-06, + "loss": 0.228, + "step": 9319 + }, + { + "epoch": 0.23584786294506163, + "grad_norm": 4.223798751831055, + "learning_rate": 8.770093513880114e-06, + "loss": 0.1788, + "step": 9320 + }, + { + "epoch": 0.2358731685097553, + "grad_norm": 4.32236385345459, + "learning_rate": 8.769829760460145e-06, + "loss": 0.2184, + "step": 9321 + }, + { + "epoch": 0.23589847407444897, + "grad_norm": 13.867345809936523, + "learning_rate": 8.769565982729247e-06, + "loss": 0.2879, + "step": 9322 + }, + { + "epoch": 0.23592377963914266, + "grad_norm": 5.95239782333374, + "learning_rate": 8.769302180689119e-06, + "loss": 0.2551, + "step": 9323 + }, + { + "epoch": 0.2359490852038363, + "grad_norm": 4.703500747680664, + "learning_rate": 8.769038354341462e-06, + "loss": 0.1729, + "step": 9324 + }, + { + "epoch": 0.23597439076853, + "grad_norm": 5.522946357727051, + "learning_rate": 8.768774503687976e-06, + "loss": 0.1511, + "step": 9325 + }, + { + "epoch": 0.23599969633322368, + "grad_norm": 8.272146224975586, + "learning_rate": 8.768510628730366e-06, + "loss": 0.2372, + "step": 9326 + }, + { + "epoch": 0.23602500189791736, + "grad_norm": 6.330533504486084, + "learning_rate": 8.768246729470332e-06, + "loss": 0.2223, + "step": 9327 + }, + { + "epoch": 0.23605030746261102, + "grad_norm": 4.24091911315918, + "learning_rate": 8.767982805909576e-06, + "loss": 0.1534, + "step": 9328 + }, + { + "epoch": 0.2360756130273047, + "grad_norm": 8.383865356445312, + "learning_rate": 8.767718858049798e-06, + "loss": 0.1699, + "step": 9329 + }, + { + "epoch": 0.2361009185919984, + "grad_norm": 7.79979133605957, + "learning_rate": 8.767454885892704e-06, + "loss": 0.2639, + "step": 9330 + }, + { + "epoch": 0.23612622415669204, + "grad_norm": 6.102581024169922, + "learning_rate": 8.767190889439993e-06, + "loss": 0.1986, + "step": 9331 + }, + { + "epoch": 0.23615152972138573, + "grad_norm": 5.70891809463501, + "learning_rate": 8.76692686869337e-06, + "loss": 0.2564, + "step": 9332 + }, + { + "epoch": 0.2361768352860794, + "grad_norm": 5.664266109466553, + "learning_rate": 8.766662823654538e-06, + "loss": 0.2485, + "step": 9333 + }, + { + "epoch": 0.2362021408507731, + "grad_norm": 3.8388311862945557, + "learning_rate": 8.766398754325194e-06, + "loss": 0.1555, + "step": 9334 + }, + { + "epoch": 0.23622744641546675, + "grad_norm": 9.441571235656738, + "learning_rate": 8.766134660707046e-06, + "loss": 0.2026, + "step": 9335 + }, + { + "epoch": 0.23625275198016044, + "grad_norm": 5.243715763092041, + "learning_rate": 8.765870542801797e-06, + "loss": 0.2448, + "step": 9336 + }, + { + "epoch": 0.23627805754485412, + "grad_norm": 7.0171661376953125, + "learning_rate": 8.76560640061115e-06, + "loss": 0.2342, + "step": 9337 + }, + { + "epoch": 0.23630336310954778, + "grad_norm": 6.0829315185546875, + "learning_rate": 8.765342234136807e-06, + "loss": 0.1834, + "step": 9338 + }, + { + "epoch": 0.23632866867424146, + "grad_norm": 5.417539596557617, + "learning_rate": 8.765078043380472e-06, + "loss": 0.2514, + "step": 9339 + }, + { + "epoch": 0.23635397423893514, + "grad_norm": 5.402170658111572, + "learning_rate": 8.76481382834385e-06, + "loss": 0.2399, + "step": 9340 + }, + { + "epoch": 0.23637927980362883, + "grad_norm": 13.408720016479492, + "learning_rate": 8.764549589028644e-06, + "loss": 0.388, + "step": 9341 + }, + { + "epoch": 0.23640458536832248, + "grad_norm": 8.854355812072754, + "learning_rate": 8.764285325436558e-06, + "loss": 0.2714, + "step": 9342 + }, + { + "epoch": 0.23642989093301617, + "grad_norm": 9.064724922180176, + "learning_rate": 8.764021037569294e-06, + "loss": 0.2381, + "step": 9343 + }, + { + "epoch": 0.23645519649770985, + "grad_norm": 6.816880702972412, + "learning_rate": 8.763756725428561e-06, + "loss": 0.2631, + "step": 9344 + }, + { + "epoch": 0.23648050206240354, + "grad_norm": 6.486067771911621, + "learning_rate": 8.76349238901606e-06, + "loss": 0.2343, + "step": 9345 + }, + { + "epoch": 0.2365058076270972, + "grad_norm": 9.58495807647705, + "learning_rate": 8.763228028333495e-06, + "loss": 0.2225, + "step": 9346 + }, + { + "epoch": 0.23653111319179088, + "grad_norm": 6.323162078857422, + "learning_rate": 8.762963643382575e-06, + "loss": 0.1494, + "step": 9347 + }, + { + "epoch": 0.23655641875648456, + "grad_norm": 9.223576545715332, + "learning_rate": 8.762699234165002e-06, + "loss": 0.155, + "step": 9348 + }, + { + "epoch": 0.23658172432117822, + "grad_norm": 6.903299331665039, + "learning_rate": 8.76243480068248e-06, + "loss": 0.2755, + "step": 9349 + }, + { + "epoch": 0.2366070298858719, + "grad_norm": 7.70264196395874, + "learning_rate": 8.762170342936717e-06, + "loss": 0.2343, + "step": 9350 + }, + { + "epoch": 0.23663233545056558, + "grad_norm": 8.356361389160156, + "learning_rate": 8.761905860929417e-06, + "loss": 0.2011, + "step": 9351 + }, + { + "epoch": 0.23665764101525927, + "grad_norm": 8.06752872467041, + "learning_rate": 8.761641354662287e-06, + "loss": 0.2225, + "step": 9352 + }, + { + "epoch": 0.23668294657995292, + "grad_norm": 5.04988956451416, + "learning_rate": 8.761376824137028e-06, + "loss": 0.2288, + "step": 9353 + }, + { + "epoch": 0.2367082521446466, + "grad_norm": 11.699213981628418, + "learning_rate": 8.761112269355353e-06, + "loss": 0.1622, + "step": 9354 + }, + { + "epoch": 0.2367335577093403, + "grad_norm": 3.126943349838257, + "learning_rate": 8.760847690318965e-06, + "loss": 0.1551, + "step": 9355 + }, + { + "epoch": 0.23675886327403395, + "grad_norm": 3.9168522357940674, + "learning_rate": 8.760583087029568e-06, + "loss": 0.1971, + "step": 9356 + }, + { + "epoch": 0.23678416883872763, + "grad_norm": 4.2194318771362305, + "learning_rate": 8.760318459488871e-06, + "loss": 0.1655, + "step": 9357 + }, + { + "epoch": 0.23680947440342132, + "grad_norm": 13.205842018127441, + "learning_rate": 8.76005380769858e-06, + "loss": 0.2835, + "step": 9358 + }, + { + "epoch": 0.236834779968115, + "grad_norm": 13.673007011413574, + "learning_rate": 8.759789131660401e-06, + "loss": 0.255, + "step": 9359 + }, + { + "epoch": 0.23686008553280866, + "grad_norm": 12.11800765991211, + "learning_rate": 8.75952443137604e-06, + "loss": 0.2302, + "step": 9360 + }, + { + "epoch": 0.23688539109750234, + "grad_norm": 2.6119587421417236, + "learning_rate": 8.759259706847208e-06, + "loss": 0.1654, + "step": 9361 + }, + { + "epoch": 0.23691069666219602, + "grad_norm": 6.0173516273498535, + "learning_rate": 8.758994958075608e-06, + "loss": 0.1878, + "step": 9362 + }, + { + "epoch": 0.23693600222688968, + "grad_norm": 5.67067813873291, + "learning_rate": 8.758730185062948e-06, + "loss": 0.1779, + "step": 9363 + }, + { + "epoch": 0.23696130779158336, + "grad_norm": 5.193602085113525, + "learning_rate": 8.758465387810937e-06, + "loss": 0.2129, + "step": 9364 + }, + { + "epoch": 0.23698661335627705, + "grad_norm": 11.836381912231445, + "learning_rate": 8.758200566321282e-06, + "loss": 0.1836, + "step": 9365 + }, + { + "epoch": 0.23701191892097073, + "grad_norm": 3.5557920932769775, + "learning_rate": 8.757935720595689e-06, + "loss": 0.1304, + "step": 9366 + }, + { + "epoch": 0.2370372244856644, + "grad_norm": 6.667800426483154, + "learning_rate": 8.75767085063587e-06, + "loss": 0.2458, + "step": 9367 + }, + { + "epoch": 0.23706253005035807, + "grad_norm": 3.558250904083252, + "learning_rate": 8.75740595644353e-06, + "loss": 0.166, + "step": 9368 + }, + { + "epoch": 0.23708783561505176, + "grad_norm": 8.51478099822998, + "learning_rate": 8.757141038020377e-06, + "loss": 0.2116, + "step": 9369 + }, + { + "epoch": 0.2371131411797454, + "grad_norm": 16.321447372436523, + "learning_rate": 8.75687609536812e-06, + "loss": 0.178, + "step": 9370 + }, + { + "epoch": 0.2371384467444391, + "grad_norm": 5.9730753898620605, + "learning_rate": 8.75661112848847e-06, + "loss": 0.2362, + "step": 9371 + }, + { + "epoch": 0.23716375230913278, + "grad_norm": 3.347379446029663, + "learning_rate": 8.756346137383132e-06, + "loss": 0.1609, + "step": 9372 + }, + { + "epoch": 0.23718905787382646, + "grad_norm": 8.354248046875, + "learning_rate": 8.756081122053816e-06, + "loss": 0.3261, + "step": 9373 + }, + { + "epoch": 0.23721436343852012, + "grad_norm": 3.215977191925049, + "learning_rate": 8.755816082502233e-06, + "loss": 0.1357, + "step": 9374 + }, + { + "epoch": 0.2372396690032138, + "grad_norm": 3.7380354404449463, + "learning_rate": 8.75555101873009e-06, + "loss": 0.148, + "step": 9375 + }, + { + "epoch": 0.2372649745679075, + "grad_norm": 5.971784591674805, + "learning_rate": 8.755285930739097e-06, + "loss": 0.1546, + "step": 9376 + }, + { + "epoch": 0.23729028013260117, + "grad_norm": 4.3031415939331055, + "learning_rate": 8.755020818530964e-06, + "loss": 0.21, + "step": 9377 + }, + { + "epoch": 0.23731558569729483, + "grad_norm": 4.169820785522461, + "learning_rate": 8.754755682107399e-06, + "loss": 0.1643, + "step": 9378 + }, + { + "epoch": 0.2373408912619885, + "grad_norm": 9.022138595581055, + "learning_rate": 8.754490521470113e-06, + "loss": 0.1826, + "step": 9379 + }, + { + "epoch": 0.2373661968266822, + "grad_norm": 3.279472589492798, + "learning_rate": 8.754225336620819e-06, + "loss": 0.176, + "step": 9380 + }, + { + "epoch": 0.23739150239137585, + "grad_norm": 6.70741081237793, + "learning_rate": 8.75396012756122e-06, + "loss": 0.1641, + "step": 9381 + }, + { + "epoch": 0.23741680795606954, + "grad_norm": 7.336593151092529, + "learning_rate": 8.753694894293034e-06, + "loss": 0.1645, + "step": 9382 + }, + { + "epoch": 0.23744211352076322, + "grad_norm": 4.663640022277832, + "learning_rate": 8.753429636817966e-06, + "loss": 0.1576, + "step": 9383 + }, + { + "epoch": 0.2374674190854569, + "grad_norm": 5.932092189788818, + "learning_rate": 8.753164355137729e-06, + "loss": 0.2077, + "step": 9384 + }, + { + "epoch": 0.23749272465015056, + "grad_norm": 6.430821418762207, + "learning_rate": 8.752899049254034e-06, + "loss": 0.2051, + "step": 9385 + }, + { + "epoch": 0.23751803021484424, + "grad_norm": 4.232224941253662, + "learning_rate": 8.752633719168589e-06, + "loss": 0.2006, + "step": 9386 + }, + { + "epoch": 0.23754333577953793, + "grad_norm": 13.830942153930664, + "learning_rate": 8.75236836488311e-06, + "loss": 0.2815, + "step": 9387 + }, + { + "epoch": 0.23756864134423158, + "grad_norm": 10.273097038269043, + "learning_rate": 8.752102986399302e-06, + "loss": 0.2272, + "step": 9388 + }, + { + "epoch": 0.23759394690892527, + "grad_norm": 8.313619613647461, + "learning_rate": 8.751837583718882e-06, + "loss": 0.2849, + "step": 9389 + }, + { + "epoch": 0.23761925247361895, + "grad_norm": 9.97758674621582, + "learning_rate": 8.751572156843559e-06, + "loss": 0.3201, + "step": 9390 + }, + { + "epoch": 0.23764455803831264, + "grad_norm": 4.774307727813721, + "learning_rate": 8.751306705775045e-06, + "loss": 0.1292, + "step": 9391 + }, + { + "epoch": 0.2376698636030063, + "grad_norm": 6.7406721115112305, + "learning_rate": 8.751041230515052e-06, + "loss": 0.1741, + "step": 9392 + }, + { + "epoch": 0.23769516916769998, + "grad_norm": 7.003917217254639, + "learning_rate": 8.750775731065291e-06, + "loss": 0.2794, + "step": 9393 + }, + { + "epoch": 0.23772047473239366, + "grad_norm": 6.645174980163574, + "learning_rate": 8.750510207427477e-06, + "loss": 0.2726, + "step": 9394 + }, + { + "epoch": 0.23774578029708732, + "grad_norm": 5.773153305053711, + "learning_rate": 8.750244659603318e-06, + "loss": 0.1988, + "step": 9395 + }, + { + "epoch": 0.237771085861781, + "grad_norm": 7.892769813537598, + "learning_rate": 8.74997908759453e-06, + "loss": 0.3408, + "step": 9396 + }, + { + "epoch": 0.23779639142647468, + "grad_norm": 3.7142419815063477, + "learning_rate": 8.749713491402824e-06, + "loss": 0.0908, + "step": 9397 + }, + { + "epoch": 0.23782169699116837, + "grad_norm": 4.652118682861328, + "learning_rate": 8.749447871029914e-06, + "loss": 0.21, + "step": 9398 + }, + { + "epoch": 0.23784700255586202, + "grad_norm": 7.3275532722473145, + "learning_rate": 8.749182226477511e-06, + "loss": 0.1924, + "step": 9399 + }, + { + "epoch": 0.2378723081205557, + "grad_norm": 6.628086090087891, + "learning_rate": 8.74891655774733e-06, + "loss": 0.189, + "step": 9400 + }, + { + "epoch": 0.2378976136852494, + "grad_norm": 7.023498058319092, + "learning_rate": 8.748650864841083e-06, + "loss": 0.1924, + "step": 9401 + }, + { + "epoch": 0.23792291924994305, + "grad_norm": 5.134088516235352, + "learning_rate": 8.748385147760484e-06, + "loss": 0.1682, + "step": 9402 + }, + { + "epoch": 0.23794822481463673, + "grad_norm": 3.712636947631836, + "learning_rate": 8.748119406507248e-06, + "loss": 0.1761, + "step": 9403 + }, + { + "epoch": 0.23797353037933042, + "grad_norm": 7.433045864105225, + "learning_rate": 8.747853641083084e-06, + "loss": 0.2115, + "step": 9404 + }, + { + "epoch": 0.2379988359440241, + "grad_norm": 9.724787712097168, + "learning_rate": 8.747587851489711e-06, + "loss": 0.2183, + "step": 9405 + }, + { + "epoch": 0.23802414150871776, + "grad_norm": 7.1876420974731445, + "learning_rate": 8.74732203772884e-06, + "loss": 0.161, + "step": 9406 + }, + { + "epoch": 0.23804944707341144, + "grad_norm": 5.308559417724609, + "learning_rate": 8.747056199802188e-06, + "loss": 0.2118, + "step": 9407 + }, + { + "epoch": 0.23807475263810512, + "grad_norm": 4.5457868576049805, + "learning_rate": 8.746790337711467e-06, + "loss": 0.1491, + "step": 9408 + }, + { + "epoch": 0.2381000582027988, + "grad_norm": 2.961089849472046, + "learning_rate": 8.746524451458392e-06, + "loss": 0.1449, + "step": 9409 + }, + { + "epoch": 0.23812536376749247, + "grad_norm": 3.9122354984283447, + "learning_rate": 8.746258541044677e-06, + "loss": 0.2009, + "step": 9410 + }, + { + "epoch": 0.23815066933218615, + "grad_norm": 7.463719367980957, + "learning_rate": 8.745992606472036e-06, + "loss": 0.2601, + "step": 9411 + }, + { + "epoch": 0.23817597489687983, + "grad_norm": 10.172625541687012, + "learning_rate": 8.745726647742188e-06, + "loss": 0.2308, + "step": 9412 + }, + { + "epoch": 0.2382012804615735, + "grad_norm": 4.612297534942627, + "learning_rate": 8.745460664856845e-06, + "loss": 0.0936, + "step": 9413 + }, + { + "epoch": 0.23822658602626717, + "grad_norm": 5.292971611022949, + "learning_rate": 8.745194657817722e-06, + "loss": 0.1905, + "step": 9414 + }, + { + "epoch": 0.23825189159096086, + "grad_norm": 5.88694429397583, + "learning_rate": 8.744928626626536e-06, + "loss": 0.1983, + "step": 9415 + }, + { + "epoch": 0.23827719715565454, + "grad_norm": 10.167266845703125, + "learning_rate": 8.744662571285004e-06, + "loss": 0.2185, + "step": 9416 + }, + { + "epoch": 0.2383025027203482, + "grad_norm": 16.272987365722656, + "learning_rate": 8.744396491794837e-06, + "loss": 0.1994, + "step": 9417 + }, + { + "epoch": 0.23832780828504188, + "grad_norm": 9.836631774902344, + "learning_rate": 8.744130388157754e-06, + "loss": 0.238, + "step": 9418 + }, + { + "epoch": 0.23835311384973557, + "grad_norm": 6.014159202575684, + "learning_rate": 8.743864260375471e-06, + "loss": 0.1374, + "step": 9419 + }, + { + "epoch": 0.23837841941442922, + "grad_norm": 7.239490032196045, + "learning_rate": 8.743598108449704e-06, + "loss": 0.1826, + "step": 9420 + }, + { + "epoch": 0.2384037249791229, + "grad_norm": 14.245532035827637, + "learning_rate": 8.743331932382169e-06, + "loss": 0.3298, + "step": 9421 + }, + { + "epoch": 0.2384290305438166, + "grad_norm": 11.228875160217285, + "learning_rate": 8.74306573217458e-06, + "loss": 0.2091, + "step": 9422 + }, + { + "epoch": 0.23845433610851027, + "grad_norm": 3.893404960632324, + "learning_rate": 8.74279950782866e-06, + "loss": 0.1102, + "step": 9423 + }, + { + "epoch": 0.23847964167320393, + "grad_norm": 8.457877159118652, + "learning_rate": 8.742533259346122e-06, + "loss": 0.2936, + "step": 9424 + }, + { + "epoch": 0.2385049472378976, + "grad_norm": 4.680324554443359, + "learning_rate": 8.74226698672868e-06, + "loss": 0.2278, + "step": 9425 + }, + { + "epoch": 0.2385302528025913, + "grad_norm": 4.250125885009766, + "learning_rate": 8.742000689978057e-06, + "loss": 0.151, + "step": 9426 + }, + { + "epoch": 0.23855555836728495, + "grad_norm": 11.98482894897461, + "learning_rate": 8.741734369095968e-06, + "loss": 0.1679, + "step": 9427 + }, + { + "epoch": 0.23858086393197864, + "grad_norm": 5.507715225219727, + "learning_rate": 8.741468024084128e-06, + "loss": 0.1796, + "step": 9428 + }, + { + "epoch": 0.23860616949667232, + "grad_norm": 4.922824382781982, + "learning_rate": 8.741201654944258e-06, + "loss": 0.1828, + "step": 9429 + }, + { + "epoch": 0.238631475061366, + "grad_norm": 4.815858840942383, + "learning_rate": 8.740935261678075e-06, + "loss": 0.218, + "step": 9430 + }, + { + "epoch": 0.23865678062605966, + "grad_norm": 6.309937000274658, + "learning_rate": 8.740668844287295e-06, + "loss": 0.2283, + "step": 9431 + }, + { + "epoch": 0.23868208619075335, + "grad_norm": 8.07723617553711, + "learning_rate": 8.740402402773639e-06, + "loss": 0.1968, + "step": 9432 + }, + { + "epoch": 0.23870739175544703, + "grad_norm": 3.2561564445495605, + "learning_rate": 8.740135937138823e-06, + "loss": 0.1595, + "step": 9433 + }, + { + "epoch": 0.23873269732014069, + "grad_norm": 6.340638160705566, + "learning_rate": 8.739869447384565e-06, + "loss": 0.1562, + "step": 9434 + }, + { + "epoch": 0.23875800288483437, + "grad_norm": 6.956690311431885, + "learning_rate": 8.739602933512586e-06, + "loss": 0.1552, + "step": 9435 + }, + { + "epoch": 0.23878330844952805, + "grad_norm": 6.383976459503174, + "learning_rate": 8.739336395524603e-06, + "loss": 0.2911, + "step": 9436 + }, + { + "epoch": 0.23880861401422174, + "grad_norm": 4.96024751663208, + "learning_rate": 8.739069833422333e-06, + "loss": 0.2251, + "step": 9437 + }, + { + "epoch": 0.2388339195789154, + "grad_norm": 4.936514377593994, + "learning_rate": 8.7388032472075e-06, + "loss": 0.2175, + "step": 9438 + }, + { + "epoch": 0.23885922514360908, + "grad_norm": 5.229740619659424, + "learning_rate": 8.73853663688182e-06, + "loss": 0.1661, + "step": 9439 + }, + { + "epoch": 0.23888453070830276, + "grad_norm": 7.032295227050781, + "learning_rate": 8.73827000244701e-06, + "loss": 0.2116, + "step": 9440 + }, + { + "epoch": 0.23890983627299645, + "grad_norm": 6.68683385848999, + "learning_rate": 8.738003343904795e-06, + "loss": 0.1461, + "step": 9441 + }, + { + "epoch": 0.2389351418376901, + "grad_norm": 4.36771821975708, + "learning_rate": 8.737736661256892e-06, + "loss": 0.1055, + "step": 9442 + }, + { + "epoch": 0.23896044740238379, + "grad_norm": 16.04199981689453, + "learning_rate": 8.737469954505019e-06, + "loss": 0.2224, + "step": 9443 + }, + { + "epoch": 0.23898575296707747, + "grad_norm": 8.704912185668945, + "learning_rate": 8.7372032236509e-06, + "loss": 0.3014, + "step": 9444 + }, + { + "epoch": 0.23901105853177113, + "grad_norm": 3.5231738090515137, + "learning_rate": 8.736936468696248e-06, + "loss": 0.1588, + "step": 9445 + }, + { + "epoch": 0.2390363640964648, + "grad_norm": 10.020288467407227, + "learning_rate": 8.736669689642791e-06, + "loss": 0.2287, + "step": 9446 + }, + { + "epoch": 0.2390616696611585, + "grad_norm": 18.3988037109375, + "learning_rate": 8.736402886492245e-06, + "loss": 0.3218, + "step": 9447 + }, + { + "epoch": 0.23908697522585218, + "grad_norm": 15.335262298583984, + "learning_rate": 8.736136059246332e-06, + "loss": 0.218, + "step": 9448 + }, + { + "epoch": 0.23911228079054583, + "grad_norm": 3.206005334854126, + "learning_rate": 8.735869207906774e-06, + "loss": 0.1354, + "step": 9449 + }, + { + "epoch": 0.23913758635523952, + "grad_norm": 10.952378273010254, + "learning_rate": 8.735602332475289e-06, + "loss": 0.2738, + "step": 9450 + }, + { + "epoch": 0.2391628919199332, + "grad_norm": 8.842470169067383, + "learning_rate": 8.7353354329536e-06, + "loss": 0.1945, + "step": 9451 + }, + { + "epoch": 0.23918819748462686, + "grad_norm": 3.7545506954193115, + "learning_rate": 8.735068509343427e-06, + "loss": 0.1785, + "step": 9452 + }, + { + "epoch": 0.23921350304932054, + "grad_norm": 5.480612277984619, + "learning_rate": 8.734801561646493e-06, + "loss": 0.2667, + "step": 9453 + }, + { + "epoch": 0.23923880861401423, + "grad_norm": 6.98356819152832, + "learning_rate": 8.734534589864517e-06, + "loss": 0.196, + "step": 9454 + }, + { + "epoch": 0.2392641141787079, + "grad_norm": 7.586127758026123, + "learning_rate": 8.734267593999224e-06, + "loss": 0.2806, + "step": 9455 + }, + { + "epoch": 0.23928941974340157, + "grad_norm": 6.012825965881348, + "learning_rate": 8.734000574052333e-06, + "loss": 0.1131, + "step": 9456 + }, + { + "epoch": 0.23931472530809525, + "grad_norm": 2.685889959335327, + "learning_rate": 8.733733530025565e-06, + "loss": 0.0649, + "step": 9457 + }, + { + "epoch": 0.23934003087278893, + "grad_norm": 5.4239501953125, + "learning_rate": 8.733466461920647e-06, + "loss": 0.1619, + "step": 9458 + }, + { + "epoch": 0.2393653364374826, + "grad_norm": 3.8897275924682617, + "learning_rate": 8.733199369739296e-06, + "loss": 0.1597, + "step": 9459 + }, + { + "epoch": 0.23939064200217627, + "grad_norm": 3.831468105316162, + "learning_rate": 8.732932253483239e-06, + "loss": 0.2026, + "step": 9460 + }, + { + "epoch": 0.23941594756686996, + "grad_norm": 11.398386001586914, + "learning_rate": 8.732665113154194e-06, + "loss": 0.2887, + "step": 9461 + }, + { + "epoch": 0.23944125313156364, + "grad_norm": 3.909191846847534, + "learning_rate": 8.732397948753887e-06, + "loss": 0.1777, + "step": 9462 + }, + { + "epoch": 0.2394665586962573, + "grad_norm": 3.63578462600708, + "learning_rate": 8.73213076028404e-06, + "loss": 0.1943, + "step": 9463 + }, + { + "epoch": 0.23949186426095098, + "grad_norm": 8.21634578704834, + "learning_rate": 8.731863547746375e-06, + "loss": 0.2516, + "step": 9464 + }, + { + "epoch": 0.23951716982564467, + "grad_norm": 3.079822063446045, + "learning_rate": 8.731596311142617e-06, + "loss": 0.1602, + "step": 9465 + }, + { + "epoch": 0.23954247539033832, + "grad_norm": 5.816110610961914, + "learning_rate": 8.731329050474488e-06, + "loss": 0.2406, + "step": 9466 + }, + { + "epoch": 0.239567780955032, + "grad_norm": 6.806967735290527, + "learning_rate": 8.731061765743714e-06, + "loss": 0.2117, + "step": 9467 + }, + { + "epoch": 0.2395930865197257, + "grad_norm": 16.479774475097656, + "learning_rate": 8.730794456952015e-06, + "loss": 0.2937, + "step": 9468 + }, + { + "epoch": 0.23961839208441937, + "grad_norm": 3.361931324005127, + "learning_rate": 8.730527124101116e-06, + "loss": 0.2014, + "step": 9469 + }, + { + "epoch": 0.23964369764911303, + "grad_norm": 4.118569374084473, + "learning_rate": 8.730259767192743e-06, + "loss": 0.1837, + "step": 9470 + }, + { + "epoch": 0.23966900321380671, + "grad_norm": 3.059751510620117, + "learning_rate": 8.729992386228617e-06, + "loss": 0.1162, + "step": 9471 + }, + { + "epoch": 0.2396943087785004, + "grad_norm": 4.409551620483398, + "learning_rate": 8.729724981210465e-06, + "loss": 0.1052, + "step": 9472 + }, + { + "epoch": 0.23971961434319408, + "grad_norm": 3.8634486198425293, + "learning_rate": 8.72945755214001e-06, + "loss": 0.2066, + "step": 9473 + }, + { + "epoch": 0.23974491990788774, + "grad_norm": 4.6349196434021, + "learning_rate": 8.729190099018979e-06, + "loss": 0.1526, + "step": 9474 + }, + { + "epoch": 0.23977022547258142, + "grad_norm": 5.810095310211182, + "learning_rate": 8.728922621849091e-06, + "loss": 0.2334, + "step": 9475 + }, + { + "epoch": 0.2397955310372751, + "grad_norm": 5.877584457397461, + "learning_rate": 8.728655120632076e-06, + "loss": 0.1884, + "step": 9476 + }, + { + "epoch": 0.23982083660196876, + "grad_norm": 11.125903129577637, + "learning_rate": 8.728387595369659e-06, + "loss": 0.3229, + "step": 9477 + }, + { + "epoch": 0.23984614216666245, + "grad_norm": 4.078487873077393, + "learning_rate": 8.728120046063564e-06, + "loss": 0.1997, + "step": 9478 + }, + { + "epoch": 0.23987144773135613, + "grad_norm": 2.9105935096740723, + "learning_rate": 8.727852472715515e-06, + "loss": 0.1222, + "step": 9479 + }, + { + "epoch": 0.23989675329604981, + "grad_norm": 10.915939331054688, + "learning_rate": 8.72758487532724e-06, + "loss": 0.2727, + "step": 9480 + }, + { + "epoch": 0.23992205886074347, + "grad_norm": 10.228116035461426, + "learning_rate": 8.727317253900462e-06, + "loss": 0.2883, + "step": 9481 + }, + { + "epoch": 0.23994736442543715, + "grad_norm": 10.919265747070312, + "learning_rate": 8.72704960843691e-06, + "loss": 0.2273, + "step": 9482 + }, + { + "epoch": 0.23997266999013084, + "grad_norm": 4.685222148895264, + "learning_rate": 8.726781938938306e-06, + "loss": 0.1298, + "step": 9483 + }, + { + "epoch": 0.2399979755548245, + "grad_norm": 3.9218218326568604, + "learning_rate": 8.726514245406382e-06, + "loss": 0.0858, + "step": 9484 + }, + { + "epoch": 0.24002328111951818, + "grad_norm": 2.737060070037842, + "learning_rate": 8.726246527842859e-06, + "loss": 0.1348, + "step": 9485 + }, + { + "epoch": 0.24004858668421186, + "grad_norm": 2.311819314956665, + "learning_rate": 8.725978786249465e-06, + "loss": 0.1202, + "step": 9486 + }, + { + "epoch": 0.24007389224890555, + "grad_norm": 3.8654158115386963, + "learning_rate": 8.725711020627927e-06, + "loss": 0.1069, + "step": 9487 + }, + { + "epoch": 0.2400991978135992, + "grad_norm": 10.250136375427246, + "learning_rate": 8.725443230979971e-06, + "loss": 0.1128, + "step": 9488 + }, + { + "epoch": 0.2401245033782929, + "grad_norm": 5.746701717376709, + "learning_rate": 8.725175417307325e-06, + "loss": 0.1697, + "step": 9489 + }, + { + "epoch": 0.24014980894298657, + "grad_norm": 3.2868247032165527, + "learning_rate": 8.724907579611716e-06, + "loss": 0.1354, + "step": 9490 + }, + { + "epoch": 0.24017511450768023, + "grad_norm": 5.321997165679932, + "learning_rate": 8.724639717894871e-06, + "loss": 0.1623, + "step": 9491 + }, + { + "epoch": 0.2402004200723739, + "grad_norm": 3.552274227142334, + "learning_rate": 8.724371832158518e-06, + "loss": 0.1966, + "step": 9492 + }, + { + "epoch": 0.2402257256370676, + "grad_norm": 4.7984795570373535, + "learning_rate": 8.724103922404383e-06, + "loss": 0.1549, + "step": 9493 + }, + { + "epoch": 0.24025103120176128, + "grad_norm": 4.858424186706543, + "learning_rate": 8.723835988634195e-06, + "loss": 0.2176, + "step": 9494 + }, + { + "epoch": 0.24027633676645493, + "grad_norm": 7.006882667541504, + "learning_rate": 8.72356803084968e-06, + "loss": 0.1978, + "step": 9495 + }, + { + "epoch": 0.24030164233114862, + "grad_norm": 6.065154552459717, + "learning_rate": 8.72330004905257e-06, + "loss": 0.2635, + "step": 9496 + }, + { + "epoch": 0.2403269478958423, + "grad_norm": 6.863412857055664, + "learning_rate": 8.723032043244588e-06, + "loss": 0.2359, + "step": 9497 + }, + { + "epoch": 0.24035225346053596, + "grad_norm": 3.9606988430023193, + "learning_rate": 8.722764013427465e-06, + "loss": 0.0786, + "step": 9498 + }, + { + "epoch": 0.24037755902522964, + "grad_norm": 7.82757043838501, + "learning_rate": 8.722495959602931e-06, + "loss": 0.2789, + "step": 9499 + }, + { + "epoch": 0.24040286458992333, + "grad_norm": 7.779689311981201, + "learning_rate": 8.72222788177271e-06, + "loss": 0.2414, + "step": 9500 + }, + { + "epoch": 0.240428170154617, + "grad_norm": 3.8206686973571777, + "learning_rate": 8.721959779938537e-06, + "loss": 0.1501, + "step": 9501 + }, + { + "epoch": 0.24045347571931067, + "grad_norm": 5.133087158203125, + "learning_rate": 8.721691654102137e-06, + "loss": 0.1592, + "step": 9502 + }, + { + "epoch": 0.24047878128400435, + "grad_norm": 3.6047186851501465, + "learning_rate": 8.721423504265241e-06, + "loss": 0.1703, + "step": 9503 + }, + { + "epoch": 0.24050408684869803, + "grad_norm": 7.636030197143555, + "learning_rate": 8.721155330429576e-06, + "loss": 0.2024, + "step": 9504 + }, + { + "epoch": 0.24052939241339172, + "grad_norm": 10.36331558227539, + "learning_rate": 8.720887132596871e-06, + "loss": 0.2376, + "step": 9505 + }, + { + "epoch": 0.24055469797808537, + "grad_norm": 9.320265769958496, + "learning_rate": 8.72061891076886e-06, + "loss": 0.2197, + "step": 9506 + }, + { + "epoch": 0.24058000354277906, + "grad_norm": 6.324916839599609, + "learning_rate": 8.720350664947266e-06, + "loss": 0.174, + "step": 9507 + }, + { + "epoch": 0.24060530910747274, + "grad_norm": 3.969545364379883, + "learning_rate": 8.720082395133825e-06, + "loss": 0.1993, + "step": 9508 + }, + { + "epoch": 0.2406306146721664, + "grad_norm": 4.5527753829956055, + "learning_rate": 8.719814101330264e-06, + "loss": 0.2084, + "step": 9509 + }, + { + "epoch": 0.24065592023686008, + "grad_norm": 10.944422721862793, + "learning_rate": 8.719545783538312e-06, + "loss": 0.2552, + "step": 9510 + }, + { + "epoch": 0.24068122580155377, + "grad_norm": 3.372008800506592, + "learning_rate": 8.719277441759703e-06, + "loss": 0.1631, + "step": 9511 + }, + { + "epoch": 0.24070653136624745, + "grad_norm": 5.725810527801514, + "learning_rate": 8.719009075996167e-06, + "loss": 0.2507, + "step": 9512 + }, + { + "epoch": 0.2407318369309411, + "grad_norm": 7.294891834259033, + "learning_rate": 8.718740686249432e-06, + "loss": 0.1188, + "step": 9513 + }, + { + "epoch": 0.2407571424956348, + "grad_norm": 5.642937660217285, + "learning_rate": 8.718472272521231e-06, + "loss": 0.1613, + "step": 9514 + }, + { + "epoch": 0.24078244806032847, + "grad_norm": 5.978453636169434, + "learning_rate": 8.718203834813291e-06, + "loss": 0.1781, + "step": 9515 + }, + { + "epoch": 0.24080775362502213, + "grad_norm": 18.275888442993164, + "learning_rate": 8.717935373127348e-06, + "loss": 0.3161, + "step": 9516 + }, + { + "epoch": 0.24083305918971581, + "grad_norm": 6.97698974609375, + "learning_rate": 8.717666887465131e-06, + "loss": 0.2649, + "step": 9517 + }, + { + "epoch": 0.2408583647544095, + "grad_norm": 9.85849380493164, + "learning_rate": 8.717398377828372e-06, + "loss": 0.1786, + "step": 9518 + }, + { + "epoch": 0.24088367031910318, + "grad_norm": 6.316752910614014, + "learning_rate": 8.717129844218803e-06, + "loss": 0.2633, + "step": 9519 + }, + { + "epoch": 0.24090897588379684, + "grad_norm": 4.0597920417785645, + "learning_rate": 8.716861286638155e-06, + "loss": 0.2247, + "step": 9520 + }, + { + "epoch": 0.24093428144849052, + "grad_norm": 10.975320816040039, + "learning_rate": 8.716592705088159e-06, + "loss": 0.2653, + "step": 9521 + }, + { + "epoch": 0.2409595870131842, + "grad_norm": 5.176411151885986, + "learning_rate": 8.71632409957055e-06, + "loss": 0.2161, + "step": 9522 + }, + { + "epoch": 0.24098489257787786, + "grad_norm": 3.6061034202575684, + "learning_rate": 8.716055470087055e-06, + "loss": 0.1771, + "step": 9523 + }, + { + "epoch": 0.24101019814257155, + "grad_norm": 9.832749366760254, + "learning_rate": 8.715786816639412e-06, + "loss": 0.2183, + "step": 9524 + }, + { + "epoch": 0.24103550370726523, + "grad_norm": 5.170926570892334, + "learning_rate": 8.715518139229349e-06, + "loss": 0.1894, + "step": 9525 + }, + { + "epoch": 0.24106080927195891, + "grad_norm": 2.8010036945343018, + "learning_rate": 8.715249437858603e-06, + "loss": 0.1543, + "step": 9526 + }, + { + "epoch": 0.24108611483665257, + "grad_norm": 3.148148536682129, + "learning_rate": 8.714980712528903e-06, + "loss": 0.1713, + "step": 9527 + }, + { + "epoch": 0.24111142040134625, + "grad_norm": 4.508944988250732, + "learning_rate": 8.714711963241985e-06, + "loss": 0.1566, + "step": 9528 + }, + { + "epoch": 0.24113672596603994, + "grad_norm": 5.391952991485596, + "learning_rate": 8.714443189999577e-06, + "loss": 0.2424, + "step": 9529 + }, + { + "epoch": 0.2411620315307336, + "grad_norm": 3.629870653152466, + "learning_rate": 8.714174392803419e-06, + "loss": 0.1772, + "step": 9530 + }, + { + "epoch": 0.24118733709542728, + "grad_norm": 5.945162296295166, + "learning_rate": 8.713905571655242e-06, + "loss": 0.2158, + "step": 9531 + }, + { + "epoch": 0.24121264266012096, + "grad_norm": 4.714727401733398, + "learning_rate": 8.713636726556775e-06, + "loss": 0.2154, + "step": 9532 + }, + { + "epoch": 0.24123794822481465, + "grad_norm": 4.202945232391357, + "learning_rate": 8.713367857509757e-06, + "loss": 0.1399, + "step": 9533 + }, + { + "epoch": 0.2412632537895083, + "grad_norm": 4.678526878356934, + "learning_rate": 8.713098964515923e-06, + "loss": 0.199, + "step": 9534 + }, + { + "epoch": 0.241288559354202, + "grad_norm": 4.162632942199707, + "learning_rate": 8.712830047577001e-06, + "loss": 0.1712, + "step": 9535 + }, + { + "epoch": 0.24131386491889567, + "grad_norm": 2.8804874420166016, + "learning_rate": 8.712561106694732e-06, + "loss": 0.1213, + "step": 9536 + }, + { + "epoch": 0.24133917048358935, + "grad_norm": 4.799121379852295, + "learning_rate": 8.712292141870844e-06, + "loss": 0.1061, + "step": 9537 + }, + { + "epoch": 0.241364476048283, + "grad_norm": 6.179125785827637, + "learning_rate": 8.712023153107077e-06, + "loss": 0.2367, + "step": 9538 + }, + { + "epoch": 0.2413897816129767, + "grad_norm": 9.146215438842773, + "learning_rate": 8.711754140405162e-06, + "loss": 0.2214, + "step": 9539 + }, + { + "epoch": 0.24141508717767038, + "grad_norm": 5.961746692657471, + "learning_rate": 8.711485103766834e-06, + "loss": 0.237, + "step": 9540 + }, + { + "epoch": 0.24144039274236404, + "grad_norm": 7.377232551574707, + "learning_rate": 8.711216043193828e-06, + "loss": 0.2347, + "step": 9541 + }, + { + "epoch": 0.24146569830705772, + "grad_norm": 5.923223972320557, + "learning_rate": 8.710946958687884e-06, + "loss": 0.1916, + "step": 9542 + }, + { + "epoch": 0.2414910038717514, + "grad_norm": 4.392657279968262, + "learning_rate": 8.71067785025073e-06, + "loss": 0.1974, + "step": 9543 + }, + { + "epoch": 0.2415163094364451, + "grad_norm": 24.983888626098633, + "learning_rate": 8.710408717884107e-06, + "loss": 0.2285, + "step": 9544 + }, + { + "epoch": 0.24154161500113874, + "grad_norm": 4.6939263343811035, + "learning_rate": 8.710139561589747e-06, + "loss": 0.1322, + "step": 9545 + }, + { + "epoch": 0.24156692056583243, + "grad_norm": 3.1597914695739746, + "learning_rate": 8.709870381369387e-06, + "loss": 0.1521, + "step": 9546 + }, + { + "epoch": 0.2415922261305261, + "grad_norm": 9.871986389160156, + "learning_rate": 8.709601177224764e-06, + "loss": 0.2042, + "step": 9547 + }, + { + "epoch": 0.24161753169521977, + "grad_norm": 9.276507377624512, + "learning_rate": 8.709331949157614e-06, + "loss": 0.1988, + "step": 9548 + }, + { + "epoch": 0.24164283725991345, + "grad_norm": 3.529892683029175, + "learning_rate": 8.70906269716967e-06, + "loss": 0.118, + "step": 9549 + }, + { + "epoch": 0.24166814282460714, + "grad_norm": 5.1801958084106445, + "learning_rate": 8.708793421262671e-06, + "loss": 0.2113, + "step": 9550 + }, + { + "epoch": 0.24169344838930082, + "grad_norm": 7.986947059631348, + "learning_rate": 8.708524121438353e-06, + "loss": 0.317, + "step": 9551 + }, + { + "epoch": 0.24171875395399448, + "grad_norm": 4.528393268585205, + "learning_rate": 8.708254797698454e-06, + "loss": 0.1531, + "step": 9552 + }, + { + "epoch": 0.24174405951868816, + "grad_norm": 10.772794723510742, + "learning_rate": 8.70798545004471e-06, + "loss": 0.1723, + "step": 9553 + }, + { + "epoch": 0.24176936508338184, + "grad_norm": 5.5860981941223145, + "learning_rate": 8.707716078478856e-06, + "loss": 0.1337, + "step": 9554 + }, + { + "epoch": 0.2417946706480755, + "grad_norm": 7.330665111541748, + "learning_rate": 8.707446683002632e-06, + "loss": 0.1335, + "step": 9555 + }, + { + "epoch": 0.24181997621276918, + "grad_norm": 8.78172779083252, + "learning_rate": 8.707177263617772e-06, + "loss": 0.3063, + "step": 9556 + }, + { + "epoch": 0.24184528177746287, + "grad_norm": 6.717369556427002, + "learning_rate": 8.706907820326016e-06, + "loss": 0.1124, + "step": 9557 + }, + { + "epoch": 0.24187058734215655, + "grad_norm": 6.257438659667969, + "learning_rate": 8.706638353129103e-06, + "loss": 0.1751, + "step": 9558 + }, + { + "epoch": 0.2418958929068502, + "grad_norm": 6.446613788604736, + "learning_rate": 8.706368862028767e-06, + "loss": 0.2628, + "step": 9559 + }, + { + "epoch": 0.2419211984715439, + "grad_norm": 2.5718772411346436, + "learning_rate": 8.706099347026747e-06, + "loss": 0.1092, + "step": 9560 + }, + { + "epoch": 0.24194650403623758, + "grad_norm": 2.036389112472534, + "learning_rate": 8.705829808124783e-06, + "loss": 0.0677, + "step": 9561 + }, + { + "epoch": 0.24197180960093123, + "grad_norm": 3.6765129566192627, + "learning_rate": 8.705560245324612e-06, + "loss": 0.1836, + "step": 9562 + }, + { + "epoch": 0.24199711516562492, + "grad_norm": 3.6850745677948, + "learning_rate": 8.705290658627972e-06, + "loss": 0.202, + "step": 9563 + }, + { + "epoch": 0.2420224207303186, + "grad_norm": 8.427109718322754, + "learning_rate": 8.705021048036602e-06, + "loss": 0.2003, + "step": 9564 + }, + { + "epoch": 0.24204772629501228, + "grad_norm": 7.5941362380981445, + "learning_rate": 8.70475141355224e-06, + "loss": 0.2357, + "step": 9565 + }, + { + "epoch": 0.24207303185970594, + "grad_norm": 4.077803134918213, + "learning_rate": 8.704481755176624e-06, + "loss": 0.1098, + "step": 9566 + }, + { + "epoch": 0.24209833742439962, + "grad_norm": 11.752724647521973, + "learning_rate": 8.704212072911497e-06, + "loss": 0.1575, + "step": 9567 + }, + { + "epoch": 0.2421236429890933, + "grad_norm": 11.898929595947266, + "learning_rate": 8.703942366758593e-06, + "loss": 0.2351, + "step": 9568 + }, + { + "epoch": 0.242148948553787, + "grad_norm": 6.703600883483887, + "learning_rate": 8.703672636719655e-06, + "loss": 0.2546, + "step": 9569 + }, + { + "epoch": 0.24217425411848065, + "grad_norm": 6.558358669281006, + "learning_rate": 8.70340288279642e-06, + "loss": 0.2014, + "step": 9570 + }, + { + "epoch": 0.24219955968317433, + "grad_norm": 7.136911392211914, + "learning_rate": 8.703133104990632e-06, + "loss": 0.1794, + "step": 9571 + }, + { + "epoch": 0.24222486524786802, + "grad_norm": 4.601885795593262, + "learning_rate": 8.702863303304023e-06, + "loss": 0.2077, + "step": 9572 + }, + { + "epoch": 0.24225017081256167, + "grad_norm": 5.012329578399658, + "learning_rate": 8.702593477738339e-06, + "loss": 0.2215, + "step": 9573 + }, + { + "epoch": 0.24227547637725536, + "grad_norm": 7.156667232513428, + "learning_rate": 8.702323628295319e-06, + "loss": 0.1752, + "step": 9574 + }, + { + "epoch": 0.24230078194194904, + "grad_norm": 7.339443206787109, + "learning_rate": 8.702053754976703e-06, + "loss": 0.2394, + "step": 9575 + }, + { + "epoch": 0.24232608750664272, + "grad_norm": 13.626090049743652, + "learning_rate": 8.70178385778423e-06, + "loss": 0.193, + "step": 9576 + }, + { + "epoch": 0.24235139307133638, + "grad_norm": 4.69013786315918, + "learning_rate": 8.70151393671964e-06, + "loss": 0.2136, + "step": 9577 + }, + { + "epoch": 0.24237669863603006, + "grad_norm": 3.5807905197143555, + "learning_rate": 8.701243991784678e-06, + "loss": 0.1513, + "step": 9578 + }, + { + "epoch": 0.24240200420072375, + "grad_norm": 16.827911376953125, + "learning_rate": 8.700974022981082e-06, + "loss": 0.5576, + "step": 9579 + }, + { + "epoch": 0.2424273097654174, + "grad_norm": 4.7224836349487305, + "learning_rate": 8.700704030310592e-06, + "loss": 0.2066, + "step": 9580 + }, + { + "epoch": 0.2424526153301111, + "grad_norm": 4.393967151641846, + "learning_rate": 8.700434013774948e-06, + "loss": 0.2267, + "step": 9581 + }, + { + "epoch": 0.24247792089480477, + "grad_norm": 7.2268452644348145, + "learning_rate": 8.700163973375896e-06, + "loss": 0.1635, + "step": 9582 + }, + { + "epoch": 0.24250322645949846, + "grad_norm": 5.297142505645752, + "learning_rate": 8.699893909115174e-06, + "loss": 0.2459, + "step": 9583 + }, + { + "epoch": 0.2425285320241921, + "grad_norm": 5.498746395111084, + "learning_rate": 8.699623820994524e-06, + "loss": 0.2723, + "step": 9584 + }, + { + "epoch": 0.2425538375888858, + "grad_norm": 4.298901081085205, + "learning_rate": 8.69935370901569e-06, + "loss": 0.1874, + "step": 9585 + }, + { + "epoch": 0.24257914315357948, + "grad_norm": 8.488195419311523, + "learning_rate": 8.699083573180409e-06, + "loss": 0.173, + "step": 9586 + }, + { + "epoch": 0.24260444871827314, + "grad_norm": 3.1141231060028076, + "learning_rate": 8.698813413490429e-06, + "loss": 0.2033, + "step": 9587 + }, + { + "epoch": 0.24262975428296682, + "grad_norm": 5.38545560836792, + "learning_rate": 8.698543229947487e-06, + "loss": 0.2051, + "step": 9588 + }, + { + "epoch": 0.2426550598476605, + "grad_norm": 5.944283962249756, + "learning_rate": 8.69827302255333e-06, + "loss": 0.252, + "step": 9589 + }, + { + "epoch": 0.2426803654123542, + "grad_norm": 5.004329681396484, + "learning_rate": 8.698002791309696e-06, + "loss": 0.18, + "step": 9590 + }, + { + "epoch": 0.24270567097704784, + "grad_norm": 6.0710673332214355, + "learning_rate": 8.697732536218331e-06, + "loss": 0.1899, + "step": 9591 + }, + { + "epoch": 0.24273097654174153, + "grad_norm": 10.224383354187012, + "learning_rate": 8.697462257280978e-06, + "loss": 0.2315, + "step": 9592 + }, + { + "epoch": 0.2427562821064352, + "grad_norm": 9.402692794799805, + "learning_rate": 8.697191954499377e-06, + "loss": 0.28, + "step": 9593 + }, + { + "epoch": 0.24278158767112887, + "grad_norm": 12.19609546661377, + "learning_rate": 8.696921627875272e-06, + "loss": 0.3147, + "step": 9594 + }, + { + "epoch": 0.24280689323582255, + "grad_norm": 5.4198431968688965, + "learning_rate": 8.696651277410409e-06, + "loss": 0.2402, + "step": 9595 + }, + { + "epoch": 0.24283219880051624, + "grad_norm": 13.601188659667969, + "learning_rate": 8.696380903106529e-06, + "loss": 0.2875, + "step": 9596 + }, + { + "epoch": 0.24285750436520992, + "grad_norm": 3.1290111541748047, + "learning_rate": 8.696110504965377e-06, + "loss": 0.1838, + "step": 9597 + }, + { + "epoch": 0.24288280992990358, + "grad_norm": 4.061845779418945, + "learning_rate": 8.695840082988693e-06, + "loss": 0.1926, + "step": 9598 + }, + { + "epoch": 0.24290811549459726, + "grad_norm": 2.6180286407470703, + "learning_rate": 8.695569637178226e-06, + "loss": 0.1097, + "step": 9599 + }, + { + "epoch": 0.24293342105929094, + "grad_norm": 4.360866546630859, + "learning_rate": 8.695299167535717e-06, + "loss": 0.2692, + "step": 9600 + }, + { + "epoch": 0.24295872662398463, + "grad_norm": 3.8326821327209473, + "learning_rate": 8.695028674062913e-06, + "loss": 0.1971, + "step": 9601 + }, + { + "epoch": 0.24298403218867828, + "grad_norm": 3.4604337215423584, + "learning_rate": 8.694758156761553e-06, + "loss": 0.1445, + "step": 9602 + }, + { + "epoch": 0.24300933775337197, + "grad_norm": 5.064058303833008, + "learning_rate": 8.694487615633388e-06, + "loss": 0.1665, + "step": 9603 + }, + { + "epoch": 0.24303464331806565, + "grad_norm": 4.395284652709961, + "learning_rate": 8.694217050680158e-06, + "loss": 0.2397, + "step": 9604 + }, + { + "epoch": 0.2430599488827593, + "grad_norm": 3.887331485748291, + "learning_rate": 8.693946461903611e-06, + "loss": 0.1694, + "step": 9605 + }, + { + "epoch": 0.243085254447453, + "grad_norm": 3.693027973175049, + "learning_rate": 8.693675849305489e-06, + "loss": 0.2018, + "step": 9606 + }, + { + "epoch": 0.24311056001214668, + "grad_norm": 5.346983909606934, + "learning_rate": 8.693405212887538e-06, + "loss": 0.1637, + "step": 9607 + }, + { + "epoch": 0.24313586557684036, + "grad_norm": 3.3906242847442627, + "learning_rate": 8.693134552651506e-06, + "loss": 0.1577, + "step": 9608 + }, + { + "epoch": 0.24316117114153402, + "grad_norm": 6.505084991455078, + "learning_rate": 8.692863868599133e-06, + "loss": 0.1996, + "step": 9609 + }, + { + "epoch": 0.2431864767062277, + "grad_norm": 2.56831693649292, + "learning_rate": 8.69259316073217e-06, + "loss": 0.1368, + "step": 9610 + }, + { + "epoch": 0.24321178227092138, + "grad_norm": 6.536100387573242, + "learning_rate": 8.69232242905236e-06, + "loss": 0.2595, + "step": 9611 + }, + { + "epoch": 0.24323708783561504, + "grad_norm": 6.378600120544434, + "learning_rate": 8.69205167356145e-06, + "loss": 0.2292, + "step": 9612 + }, + { + "epoch": 0.24326239340030872, + "grad_norm": 4.541906833648682, + "learning_rate": 8.691780894261186e-06, + "loss": 0.2136, + "step": 9613 + }, + { + "epoch": 0.2432876989650024, + "grad_norm": 5.362764835357666, + "learning_rate": 8.691510091153312e-06, + "loss": 0.2456, + "step": 9614 + }, + { + "epoch": 0.2433130045296961, + "grad_norm": 6.7364397048950195, + "learning_rate": 8.691239264239577e-06, + "loss": 0.1688, + "step": 9615 + }, + { + "epoch": 0.24333831009438975, + "grad_norm": 3.95532488822937, + "learning_rate": 8.690968413521727e-06, + "loss": 0.1617, + "step": 9616 + }, + { + "epoch": 0.24336361565908343, + "grad_norm": 7.351770401000977, + "learning_rate": 8.690697539001506e-06, + "loss": 0.1855, + "step": 9617 + }, + { + "epoch": 0.24338892122377712, + "grad_norm": 11.618837356567383, + "learning_rate": 8.690426640680666e-06, + "loss": 0.2174, + "step": 9618 + }, + { + "epoch": 0.24341422678847077, + "grad_norm": 8.141599655151367, + "learning_rate": 8.69015571856095e-06, + "loss": 0.1688, + "step": 9619 + }, + { + "epoch": 0.24343953235316446, + "grad_norm": 4.295202255249023, + "learning_rate": 8.689884772644107e-06, + "loss": 0.1386, + "step": 9620 + }, + { + "epoch": 0.24346483791785814, + "grad_norm": 2.715075731277466, + "learning_rate": 8.689613802931883e-06, + "loss": 0.1703, + "step": 9621 + }, + { + "epoch": 0.24349014348255182, + "grad_norm": 3.9446139335632324, + "learning_rate": 8.689342809426026e-06, + "loss": 0.1495, + "step": 9622 + }, + { + "epoch": 0.24351544904724548, + "grad_norm": 4.338237762451172, + "learning_rate": 8.689071792128284e-06, + "loss": 0.128, + "step": 9623 + }, + { + "epoch": 0.24354075461193916, + "grad_norm": 11.78996753692627, + "learning_rate": 8.688800751040403e-06, + "loss": 0.2844, + "step": 9624 + }, + { + "epoch": 0.24356606017663285, + "grad_norm": 11.439993858337402, + "learning_rate": 8.68852968616413e-06, + "loss": 0.1991, + "step": 9625 + }, + { + "epoch": 0.2435913657413265, + "grad_norm": 13.15718936920166, + "learning_rate": 8.68825859750122e-06, + "loss": 0.2772, + "step": 9626 + }, + { + "epoch": 0.2436166713060202, + "grad_norm": 7.72974157333374, + "learning_rate": 8.687987485053412e-06, + "loss": 0.2259, + "step": 9627 + }, + { + "epoch": 0.24364197687071387, + "grad_norm": 8.2037935256958, + "learning_rate": 8.687716348822462e-06, + "loss": 0.1725, + "step": 9628 + }, + { + "epoch": 0.24366728243540756, + "grad_norm": 6.1258063316345215, + "learning_rate": 8.687445188810114e-06, + "loss": 0.2053, + "step": 9629 + }, + { + "epoch": 0.2436925880001012, + "grad_norm": 5.91240930557251, + "learning_rate": 8.687174005018117e-06, + "loss": 0.211, + "step": 9630 + }, + { + "epoch": 0.2437178935647949, + "grad_norm": 3.484224557876587, + "learning_rate": 8.686902797448222e-06, + "loss": 0.1413, + "step": 9631 + }, + { + "epoch": 0.24374319912948858, + "grad_norm": 6.562892913818359, + "learning_rate": 8.686631566102177e-06, + "loss": 0.2212, + "step": 9632 + }, + { + "epoch": 0.24376850469418226, + "grad_norm": 5.55271053314209, + "learning_rate": 8.68636031098173e-06, + "loss": 0.2012, + "step": 9633 + }, + { + "epoch": 0.24379381025887592, + "grad_norm": 3.4480841159820557, + "learning_rate": 8.686089032088632e-06, + "loss": 0.1807, + "step": 9634 + }, + { + "epoch": 0.2438191158235696, + "grad_norm": 13.47215747833252, + "learning_rate": 8.68581772942463e-06, + "loss": 0.2866, + "step": 9635 + }, + { + "epoch": 0.2438444213882633, + "grad_norm": 4.619564533233643, + "learning_rate": 8.685546402991475e-06, + "loss": 0.1752, + "step": 9636 + }, + { + "epoch": 0.24386972695295694, + "grad_norm": 6.799777030944824, + "learning_rate": 8.685275052790918e-06, + "loss": 0.1697, + "step": 9637 + }, + { + "epoch": 0.24389503251765063, + "grad_norm": 2.499762535095215, + "learning_rate": 8.685003678824708e-06, + "loss": 0.1185, + "step": 9638 + }, + { + "epoch": 0.2439203380823443, + "grad_norm": 3.7310752868652344, + "learning_rate": 8.684732281094595e-06, + "loss": 0.2041, + "step": 9639 + }, + { + "epoch": 0.243945643647038, + "grad_norm": 9.284575462341309, + "learning_rate": 8.684460859602327e-06, + "loss": 0.1452, + "step": 9640 + }, + { + "epoch": 0.24397094921173165, + "grad_norm": 5.796928405761719, + "learning_rate": 8.684189414349657e-06, + "loss": 0.2563, + "step": 9641 + }, + { + "epoch": 0.24399625477642534, + "grad_norm": 4.150679588317871, + "learning_rate": 8.683917945338336e-06, + "loss": 0.1457, + "step": 9642 + }, + { + "epoch": 0.24402156034111902, + "grad_norm": 3.5681543350219727, + "learning_rate": 8.683646452570112e-06, + "loss": 0.1235, + "step": 9643 + }, + { + "epoch": 0.24404686590581268, + "grad_norm": 9.087066650390625, + "learning_rate": 8.683374936046738e-06, + "loss": 0.2602, + "step": 9644 + }, + { + "epoch": 0.24407217147050636, + "grad_norm": 6.663735866546631, + "learning_rate": 8.683103395769963e-06, + "loss": 0.2084, + "step": 9645 + }, + { + "epoch": 0.24409747703520004, + "grad_norm": 4.672973155975342, + "learning_rate": 8.68283183174154e-06, + "loss": 0.139, + "step": 9646 + }, + { + "epoch": 0.24412278259989373, + "grad_norm": 6.142241954803467, + "learning_rate": 8.68256024396322e-06, + "loss": 0.3038, + "step": 9647 + }, + { + "epoch": 0.24414808816458738, + "grad_norm": 11.449589729309082, + "learning_rate": 8.682288632436752e-06, + "loss": 0.1814, + "step": 9648 + }, + { + "epoch": 0.24417339372928107, + "grad_norm": 3.706153631210327, + "learning_rate": 8.682016997163894e-06, + "loss": 0.1839, + "step": 9649 + }, + { + "epoch": 0.24419869929397475, + "grad_norm": 3.7516841888427734, + "learning_rate": 8.68174533814639e-06, + "loss": 0.1972, + "step": 9650 + }, + { + "epoch": 0.2442240048586684, + "grad_norm": 4.064740180969238, + "learning_rate": 8.681473655385994e-06, + "loss": 0.1703, + "step": 9651 + }, + { + "epoch": 0.2442493104233621, + "grad_norm": 4.195969104766846, + "learning_rate": 8.681201948884461e-06, + "loss": 0.1673, + "step": 9652 + }, + { + "epoch": 0.24427461598805578, + "grad_norm": 10.767261505126953, + "learning_rate": 8.680930218643543e-06, + "loss": 0.2983, + "step": 9653 + }, + { + "epoch": 0.24429992155274946, + "grad_norm": 4.115212917327881, + "learning_rate": 8.680658464664988e-06, + "loss": 0.1525, + "step": 9654 + }, + { + "epoch": 0.24432522711744312, + "grad_norm": 6.499784469604492, + "learning_rate": 8.680386686950553e-06, + "loss": 0.206, + "step": 9655 + }, + { + "epoch": 0.2443505326821368, + "grad_norm": 5.152589321136475, + "learning_rate": 8.680114885501989e-06, + "loss": 0.1889, + "step": 9656 + }, + { + "epoch": 0.24437583824683048, + "grad_norm": 7.073915958404541, + "learning_rate": 8.679843060321047e-06, + "loss": 0.1465, + "step": 9657 + }, + { + "epoch": 0.24440114381152414, + "grad_norm": 5.7621169090271, + "learning_rate": 8.679571211409483e-06, + "loss": 0.1224, + "step": 9658 + }, + { + "epoch": 0.24442644937621782, + "grad_norm": 6.35337495803833, + "learning_rate": 8.679299338769047e-06, + "loss": 0.1515, + "step": 9659 + }, + { + "epoch": 0.2444517549409115, + "grad_norm": 5.500761032104492, + "learning_rate": 8.679027442401495e-06, + "loss": 0.2232, + "step": 9660 + }, + { + "epoch": 0.2444770605056052, + "grad_norm": 3.166121006011963, + "learning_rate": 8.67875552230858e-06, + "loss": 0.1432, + "step": 9661 + }, + { + "epoch": 0.24450236607029885, + "grad_norm": 2.8692333698272705, + "learning_rate": 8.678483578492055e-06, + "loss": 0.1281, + "step": 9662 + }, + { + "epoch": 0.24452767163499253, + "grad_norm": 22.875579833984375, + "learning_rate": 8.678211610953673e-06, + "loss": 0.2648, + "step": 9663 + }, + { + "epoch": 0.24455297719968622, + "grad_norm": 13.535743713378906, + "learning_rate": 8.677939619695188e-06, + "loss": 0.1377, + "step": 9664 + }, + { + "epoch": 0.2445782827643799, + "grad_norm": 11.772814750671387, + "learning_rate": 8.677667604718356e-06, + "loss": 0.1657, + "step": 9665 + }, + { + "epoch": 0.24460358832907356, + "grad_norm": 5.241633892059326, + "learning_rate": 8.67739556602493e-06, + "loss": 0.168, + "step": 9666 + }, + { + "epoch": 0.24462889389376724, + "grad_norm": 5.768234729766846, + "learning_rate": 8.677123503616661e-06, + "loss": 0.1629, + "step": 9667 + }, + { + "epoch": 0.24465419945846092, + "grad_norm": 5.2784647941589355, + "learning_rate": 8.676851417495309e-06, + "loss": 0.2324, + "step": 9668 + }, + { + "epoch": 0.24467950502315458, + "grad_norm": 4.093019485473633, + "learning_rate": 8.676579307662624e-06, + "loss": 0.173, + "step": 9669 + }, + { + "epoch": 0.24470481058784826, + "grad_norm": 5.052532196044922, + "learning_rate": 8.676307174120364e-06, + "loss": 0.1909, + "step": 9670 + }, + { + "epoch": 0.24473011615254195, + "grad_norm": 17.627397537231445, + "learning_rate": 8.676035016870286e-06, + "loss": 0.2868, + "step": 9671 + }, + { + "epoch": 0.24475542171723563, + "grad_norm": 6.719974994659424, + "learning_rate": 8.675762835914139e-06, + "loss": 0.17, + "step": 9672 + }, + { + "epoch": 0.2447807272819293, + "grad_norm": 6.785614013671875, + "learning_rate": 8.67549063125368e-06, + "loss": 0.2185, + "step": 9673 + }, + { + "epoch": 0.24480603284662297, + "grad_norm": 4.947588920593262, + "learning_rate": 8.675218402890668e-06, + "loss": 0.2111, + "step": 9674 + }, + { + "epoch": 0.24483133841131666, + "grad_norm": 4.959394931793213, + "learning_rate": 8.674946150826855e-06, + "loss": 0.1737, + "step": 9675 + }, + { + "epoch": 0.2448566439760103, + "grad_norm": 5.773176670074463, + "learning_rate": 8.674673875064e-06, + "loss": 0.1512, + "step": 9676 + }, + { + "epoch": 0.244881949540704, + "grad_norm": 7.5616374015808105, + "learning_rate": 8.674401575603854e-06, + "loss": 0.2011, + "step": 9677 + }, + { + "epoch": 0.24490725510539768, + "grad_norm": 4.840736389160156, + "learning_rate": 8.674129252448176e-06, + "loss": 0.1791, + "step": 9678 + }, + { + "epoch": 0.24493256067009136, + "grad_norm": 4.543796539306641, + "learning_rate": 8.673856905598723e-06, + "loss": 0.1617, + "step": 9679 + }, + { + "epoch": 0.24495786623478502, + "grad_norm": 5.589093208312988, + "learning_rate": 8.673584535057249e-06, + "loss": 0.1846, + "step": 9680 + }, + { + "epoch": 0.2449831717994787, + "grad_norm": 13.940597534179688, + "learning_rate": 8.673312140825512e-06, + "loss": 0.2124, + "step": 9681 + }, + { + "epoch": 0.2450084773641724, + "grad_norm": 3.916501522064209, + "learning_rate": 8.67303972290527e-06, + "loss": 0.1593, + "step": 9682 + }, + { + "epoch": 0.24503378292886605, + "grad_norm": 8.623858451843262, + "learning_rate": 8.672767281298277e-06, + "loss": 0.2548, + "step": 9683 + }, + { + "epoch": 0.24505908849355973, + "grad_norm": 7.092390060424805, + "learning_rate": 8.67249481600629e-06, + "loss": 0.2578, + "step": 9684 + }, + { + "epoch": 0.2450843940582534, + "grad_norm": 5.011534214019775, + "learning_rate": 8.672222327031067e-06, + "loss": 0.1958, + "step": 9685 + }, + { + "epoch": 0.2451096996229471, + "grad_norm": 7.8842668533325195, + "learning_rate": 8.671949814374367e-06, + "loss": 0.1937, + "step": 9686 + }, + { + "epoch": 0.24513500518764075, + "grad_norm": 12.385220527648926, + "learning_rate": 8.671677278037946e-06, + "loss": 0.1741, + "step": 9687 + }, + { + "epoch": 0.24516031075233444, + "grad_norm": 10.744393348693848, + "learning_rate": 8.671404718023559e-06, + "loss": 0.2536, + "step": 9688 + }, + { + "epoch": 0.24518561631702812, + "grad_norm": 11.65833854675293, + "learning_rate": 8.671132134332968e-06, + "loss": 0.2674, + "step": 9689 + }, + { + "epoch": 0.24521092188172178, + "grad_norm": 6.031578540802002, + "learning_rate": 8.670859526967928e-06, + "loss": 0.1092, + "step": 9690 + }, + { + "epoch": 0.24523622744641546, + "grad_norm": 3.362067222595215, + "learning_rate": 8.670586895930196e-06, + "loss": 0.1036, + "step": 9691 + }, + { + "epoch": 0.24526153301110915, + "grad_norm": 6.979641914367676, + "learning_rate": 8.670314241221533e-06, + "loss": 0.2164, + "step": 9692 + }, + { + "epoch": 0.24528683857580283, + "grad_norm": 9.214783668518066, + "learning_rate": 8.670041562843697e-06, + "loss": 0.2764, + "step": 9693 + }, + { + "epoch": 0.24531214414049649, + "grad_norm": 3.7245359420776367, + "learning_rate": 8.669768860798446e-06, + "loss": 0.1422, + "step": 9694 + }, + { + "epoch": 0.24533744970519017, + "grad_norm": 5.743599891662598, + "learning_rate": 8.669496135087536e-06, + "loss": 0.1648, + "step": 9695 + }, + { + "epoch": 0.24536275526988385, + "grad_norm": 4.867973327636719, + "learning_rate": 8.66922338571273e-06, + "loss": 0.1949, + "step": 9696 + }, + { + "epoch": 0.24538806083457754, + "grad_norm": 12.405287742614746, + "learning_rate": 8.668950612675784e-06, + "loss": 0.255, + "step": 9697 + }, + { + "epoch": 0.2454133663992712, + "grad_norm": 3.4026732444763184, + "learning_rate": 8.668677815978459e-06, + "loss": 0.1335, + "step": 9698 + }, + { + "epoch": 0.24543867196396488, + "grad_norm": 5.474335670471191, + "learning_rate": 8.668404995622512e-06, + "loss": 0.2365, + "step": 9699 + }, + { + "epoch": 0.24546397752865856, + "grad_norm": 5.673740863800049, + "learning_rate": 8.668132151609706e-06, + "loss": 0.2222, + "step": 9700 + }, + { + "epoch": 0.24548928309335222, + "grad_norm": 3.924407482147217, + "learning_rate": 8.667859283941795e-06, + "loss": 0.1761, + "step": 9701 + }, + { + "epoch": 0.2455145886580459, + "grad_norm": 14.539606094360352, + "learning_rate": 8.667586392620542e-06, + "loss": 0.2, + "step": 9702 + }, + { + "epoch": 0.24553989422273959, + "grad_norm": 3.900944948196411, + "learning_rate": 8.667313477647708e-06, + "loss": 0.1331, + "step": 9703 + }, + { + "epoch": 0.24556519978743327, + "grad_norm": 5.643644332885742, + "learning_rate": 8.667040539025052e-06, + "loss": 0.1915, + "step": 9704 + }, + { + "epoch": 0.24559050535212693, + "grad_norm": 5.2978925704956055, + "learning_rate": 8.666767576754332e-06, + "loss": 0.1932, + "step": 9705 + }, + { + "epoch": 0.2456158109168206, + "grad_norm": 4.322944641113281, + "learning_rate": 8.666494590837312e-06, + "loss": 0.1759, + "step": 9706 + }, + { + "epoch": 0.2456411164815143, + "grad_norm": 6.807122230529785, + "learning_rate": 8.666221581275746e-06, + "loss": 0.1622, + "step": 9707 + }, + { + "epoch": 0.24566642204620795, + "grad_norm": 11.412843704223633, + "learning_rate": 8.665948548071404e-06, + "loss": 0.1763, + "step": 9708 + }, + { + "epoch": 0.24569172761090163, + "grad_norm": 5.68076753616333, + "learning_rate": 8.66567549122604e-06, + "loss": 0.249, + "step": 9709 + }, + { + "epoch": 0.24571703317559532, + "grad_norm": 5.004478454589844, + "learning_rate": 8.665402410741417e-06, + "loss": 0.1646, + "step": 9710 + }, + { + "epoch": 0.245742338740289, + "grad_norm": 2.610873222351074, + "learning_rate": 8.665129306619293e-06, + "loss": 0.0708, + "step": 9711 + }, + { + "epoch": 0.24576764430498266, + "grad_norm": 8.049712181091309, + "learning_rate": 8.664856178861434e-06, + "loss": 0.2568, + "step": 9712 + }, + { + "epoch": 0.24579294986967634, + "grad_norm": 5.9428887367248535, + "learning_rate": 8.664583027469599e-06, + "loss": 0.2456, + "step": 9713 + }, + { + "epoch": 0.24581825543437003, + "grad_norm": 4.199779033660889, + "learning_rate": 8.664309852445551e-06, + "loss": 0.1706, + "step": 9714 + }, + { + "epoch": 0.24584356099906368, + "grad_norm": 4.952336311340332, + "learning_rate": 8.66403665379105e-06, + "loss": 0.1773, + "step": 9715 + }, + { + "epoch": 0.24586886656375737, + "grad_norm": 10.061334609985352, + "learning_rate": 8.663763431507857e-06, + "loss": 0.2822, + "step": 9716 + }, + { + "epoch": 0.24589417212845105, + "grad_norm": 3.807541608810425, + "learning_rate": 8.663490185597736e-06, + "loss": 0.1875, + "step": 9717 + }, + { + "epoch": 0.24591947769314473, + "grad_norm": 8.63152027130127, + "learning_rate": 8.663216916062447e-06, + "loss": 0.3287, + "step": 9718 + }, + { + "epoch": 0.2459447832578384, + "grad_norm": 3.327887535095215, + "learning_rate": 8.662943622903754e-06, + "loss": 0.1609, + "step": 9719 + }, + { + "epoch": 0.24597008882253207, + "grad_norm": 5.659451484680176, + "learning_rate": 8.66267030612342e-06, + "loss": 0.2135, + "step": 9720 + }, + { + "epoch": 0.24599539438722576, + "grad_norm": 25.60486602783203, + "learning_rate": 8.662396965723206e-06, + "loss": 0.158, + "step": 9721 + }, + { + "epoch": 0.2460206999519194, + "grad_norm": 3.0157454013824463, + "learning_rate": 8.662123601704875e-06, + "loss": 0.1086, + "step": 9722 + }, + { + "epoch": 0.2460460055166131, + "grad_norm": 24.452560424804688, + "learning_rate": 8.66185021407019e-06, + "loss": 0.1679, + "step": 9723 + }, + { + "epoch": 0.24607131108130678, + "grad_norm": 10.549466133117676, + "learning_rate": 8.661576802820915e-06, + "loss": 0.2705, + "step": 9724 + }, + { + "epoch": 0.24609661664600047, + "grad_norm": 8.002100944519043, + "learning_rate": 8.661303367958811e-06, + "loss": 0.2902, + "step": 9725 + }, + { + "epoch": 0.24612192221069412, + "grad_norm": 7.797903060913086, + "learning_rate": 8.661029909485644e-06, + "loss": 0.2211, + "step": 9726 + }, + { + "epoch": 0.2461472277753878, + "grad_norm": 3.352015256881714, + "learning_rate": 8.660756427403174e-06, + "loss": 0.1073, + "step": 9727 + }, + { + "epoch": 0.2461725333400815, + "grad_norm": 4.498810291290283, + "learning_rate": 8.66048292171317e-06, + "loss": 0.1539, + "step": 9728 + }, + { + "epoch": 0.24619783890477517, + "grad_norm": 5.327233791351318, + "learning_rate": 8.66020939241739e-06, + "loss": 0.1547, + "step": 9729 + }, + { + "epoch": 0.24622314446946883, + "grad_norm": 5.144904613494873, + "learning_rate": 8.6599358395176e-06, + "loss": 0.2054, + "step": 9730 + }, + { + "epoch": 0.2462484500341625, + "grad_norm": 4.123590469360352, + "learning_rate": 8.659662263015565e-06, + "loss": 0.1341, + "step": 9731 + }, + { + "epoch": 0.2462737555988562, + "grad_norm": 7.088906764984131, + "learning_rate": 8.659388662913051e-06, + "loss": 0.2445, + "step": 9732 + }, + { + "epoch": 0.24629906116354985, + "grad_norm": 9.030234336853027, + "learning_rate": 8.659115039211819e-06, + "loss": 0.2452, + "step": 9733 + }, + { + "epoch": 0.24632436672824354, + "grad_norm": 5.10770845413208, + "learning_rate": 8.658841391913633e-06, + "loss": 0.1848, + "step": 9734 + }, + { + "epoch": 0.24634967229293722, + "grad_norm": 3.05452299118042, + "learning_rate": 8.65856772102026e-06, + "loss": 0.1374, + "step": 9735 + }, + { + "epoch": 0.2463749778576309, + "grad_norm": 7.2218217849731445, + "learning_rate": 8.658294026533466e-06, + "loss": 0.2444, + "step": 9736 + }, + { + "epoch": 0.24640028342232456, + "grad_norm": 6.828534126281738, + "learning_rate": 8.658020308455015e-06, + "loss": 0.1337, + "step": 9737 + }, + { + "epoch": 0.24642558898701825, + "grad_norm": 6.689272403717041, + "learning_rate": 8.657746566786668e-06, + "loss": 0.2373, + "step": 9738 + }, + { + "epoch": 0.24645089455171193, + "grad_norm": 4.417629241943359, + "learning_rate": 8.657472801530198e-06, + "loss": 0.2049, + "step": 9739 + }, + { + "epoch": 0.24647620011640559, + "grad_norm": 6.859844207763672, + "learning_rate": 8.657199012687362e-06, + "loss": 0.2318, + "step": 9740 + }, + { + "epoch": 0.24650150568109927, + "grad_norm": 3.760514974594116, + "learning_rate": 8.656925200259934e-06, + "loss": 0.2051, + "step": 9741 + }, + { + "epoch": 0.24652681124579295, + "grad_norm": 5.569518566131592, + "learning_rate": 8.656651364249674e-06, + "loss": 0.2178, + "step": 9742 + }, + { + "epoch": 0.24655211681048664, + "grad_norm": 6.7094407081604, + "learning_rate": 8.656377504658348e-06, + "loss": 0.2161, + "step": 9743 + }, + { + "epoch": 0.2465774223751803, + "grad_norm": 4.294581413269043, + "learning_rate": 8.656103621487724e-06, + "loss": 0.1628, + "step": 9744 + }, + { + "epoch": 0.24660272793987398, + "grad_norm": 5.44440221786499, + "learning_rate": 8.65582971473957e-06, + "loss": 0.1724, + "step": 9745 + }, + { + "epoch": 0.24662803350456766, + "grad_norm": 5.595747470855713, + "learning_rate": 8.655555784415648e-06, + "loss": 0.1932, + "step": 9746 + }, + { + "epoch": 0.24665333906926132, + "grad_norm": 8.858844757080078, + "learning_rate": 8.655281830517727e-06, + "loss": 0.179, + "step": 9747 + }, + { + "epoch": 0.246678644633955, + "grad_norm": 5.268465995788574, + "learning_rate": 8.655007853047574e-06, + "loss": 0.1767, + "step": 9748 + }, + { + "epoch": 0.24670395019864869, + "grad_norm": 4.741480350494385, + "learning_rate": 8.654733852006956e-06, + "loss": 0.1991, + "step": 9749 + }, + { + "epoch": 0.24672925576334237, + "grad_norm": 11.800919532775879, + "learning_rate": 8.654459827397637e-06, + "loss": 0.1608, + "step": 9750 + }, + { + "epoch": 0.24675456132803603, + "grad_norm": 6.324162483215332, + "learning_rate": 8.654185779221388e-06, + "loss": 0.1947, + "step": 9751 + }, + { + "epoch": 0.2467798668927297, + "grad_norm": 4.039731502532959, + "learning_rate": 8.653911707479973e-06, + "loss": 0.1281, + "step": 9752 + }, + { + "epoch": 0.2468051724574234, + "grad_norm": 7.298461437225342, + "learning_rate": 8.653637612175162e-06, + "loss": 0.2793, + "step": 9753 + }, + { + "epoch": 0.24683047802211705, + "grad_norm": 5.0664896965026855, + "learning_rate": 8.653363493308724e-06, + "loss": 0.1489, + "step": 9754 + }, + { + "epoch": 0.24685578358681073, + "grad_norm": 5.755301475524902, + "learning_rate": 8.653089350882423e-06, + "loss": 0.2397, + "step": 9755 + }, + { + "epoch": 0.24688108915150442, + "grad_norm": 7.972052097320557, + "learning_rate": 8.652815184898026e-06, + "loss": 0.1457, + "step": 9756 + }, + { + "epoch": 0.2469063947161981, + "grad_norm": 13.746660232543945, + "learning_rate": 8.652540995357305e-06, + "loss": 0.2416, + "step": 9757 + }, + { + "epoch": 0.24693170028089176, + "grad_norm": 9.302023887634277, + "learning_rate": 8.652266782262027e-06, + "loss": 0.2095, + "step": 9758 + }, + { + "epoch": 0.24695700584558544, + "grad_norm": 3.7153737545013428, + "learning_rate": 8.651992545613962e-06, + "loss": 0.1187, + "step": 9759 + }, + { + "epoch": 0.24698231141027913, + "grad_norm": 5.298496246337891, + "learning_rate": 8.651718285414871e-06, + "loss": 0.1765, + "step": 9760 + }, + { + "epoch": 0.2470076169749728, + "grad_norm": 7.473146438598633, + "learning_rate": 8.651444001666532e-06, + "loss": 0.2069, + "step": 9761 + }, + { + "epoch": 0.24703292253966647, + "grad_norm": 12.092522621154785, + "learning_rate": 8.65116969437071e-06, + "loss": 0.2417, + "step": 9762 + }, + { + "epoch": 0.24705822810436015, + "grad_norm": 5.9547438621521, + "learning_rate": 8.650895363529172e-06, + "loss": 0.1931, + "step": 9763 + }, + { + "epoch": 0.24708353366905383, + "grad_norm": 6.452813625335693, + "learning_rate": 8.650621009143689e-06, + "loss": 0.1744, + "step": 9764 + }, + { + "epoch": 0.2471088392337475, + "grad_norm": 6.415065288543701, + "learning_rate": 8.650346631216032e-06, + "loss": 0.1805, + "step": 9765 + }, + { + "epoch": 0.24713414479844117, + "grad_norm": 19.714210510253906, + "learning_rate": 8.650072229747967e-06, + "loss": 0.2207, + "step": 9766 + }, + { + "epoch": 0.24715945036313486, + "grad_norm": 8.161277770996094, + "learning_rate": 8.649797804741266e-06, + "loss": 0.2429, + "step": 9767 + }, + { + "epoch": 0.24718475592782854, + "grad_norm": 4.106656074523926, + "learning_rate": 8.649523356197697e-06, + "loss": 0.1456, + "step": 9768 + }, + { + "epoch": 0.2472100614925222, + "grad_norm": 4.751427173614502, + "learning_rate": 8.649248884119031e-06, + "loss": 0.1748, + "step": 9769 + }, + { + "epoch": 0.24723536705721588, + "grad_norm": 5.036435127258301, + "learning_rate": 8.648974388507037e-06, + "loss": 0.153, + "step": 9770 + }, + { + "epoch": 0.24726067262190957, + "grad_norm": 9.57483196258545, + "learning_rate": 8.648699869363488e-06, + "loss": 0.2895, + "step": 9771 + }, + { + "epoch": 0.24728597818660322, + "grad_norm": 8.23924446105957, + "learning_rate": 8.64842532669015e-06, + "loss": 0.3214, + "step": 9772 + }, + { + "epoch": 0.2473112837512969, + "grad_norm": 6.273495197296143, + "learning_rate": 8.648150760488799e-06, + "loss": 0.2276, + "step": 9773 + }, + { + "epoch": 0.2473365893159906, + "grad_norm": 5.484777450561523, + "learning_rate": 8.647876170761198e-06, + "loss": 0.2357, + "step": 9774 + }, + { + "epoch": 0.24736189488068427, + "grad_norm": 8.345627784729004, + "learning_rate": 8.647601557509125e-06, + "loss": 0.245, + "step": 9775 + }, + { + "epoch": 0.24738720044537793, + "grad_norm": 6.459839820861816, + "learning_rate": 8.647326920734347e-06, + "loss": 0.1492, + "step": 9776 + }, + { + "epoch": 0.24741250601007161, + "grad_norm": 5.707108497619629, + "learning_rate": 8.647052260438636e-06, + "loss": 0.2049, + "step": 9777 + }, + { + "epoch": 0.2474378115747653, + "grad_norm": 4.899604797363281, + "learning_rate": 8.646777576623764e-06, + "loss": 0.1649, + "step": 9778 + }, + { + "epoch": 0.24746311713945895, + "grad_norm": 4.005462646484375, + "learning_rate": 8.646502869291502e-06, + "loss": 0.1872, + "step": 9779 + }, + { + "epoch": 0.24748842270415264, + "grad_norm": 5.5869951248168945, + "learning_rate": 8.64622813844362e-06, + "loss": 0.1935, + "step": 9780 + }, + { + "epoch": 0.24751372826884632, + "grad_norm": 2.969971179962158, + "learning_rate": 8.645953384081894e-06, + "loss": 0.169, + "step": 9781 + }, + { + "epoch": 0.24753903383354, + "grad_norm": 21.2325382232666, + "learning_rate": 8.64567860620809e-06, + "loss": 0.2578, + "step": 9782 + }, + { + "epoch": 0.24756433939823366, + "grad_norm": 3.357218027114868, + "learning_rate": 8.645403804823984e-06, + "loss": 0.1864, + "step": 9783 + }, + { + "epoch": 0.24758964496292735, + "grad_norm": 4.712965488433838, + "learning_rate": 8.645128979931347e-06, + "loss": 0.1937, + "step": 9784 + }, + { + "epoch": 0.24761495052762103, + "grad_norm": 8.518141746520996, + "learning_rate": 8.64485413153195e-06, + "loss": 0.2938, + "step": 9785 + }, + { + "epoch": 0.2476402560923147, + "grad_norm": 3.9961345195770264, + "learning_rate": 8.644579259627568e-06, + "loss": 0.1871, + "step": 9786 + }, + { + "epoch": 0.24766556165700837, + "grad_norm": 11.768349647521973, + "learning_rate": 8.644304364219972e-06, + "loss": 0.2075, + "step": 9787 + }, + { + "epoch": 0.24769086722170205, + "grad_norm": 7.512240409851074, + "learning_rate": 8.644029445310933e-06, + "loss": 0.2164, + "step": 9788 + }, + { + "epoch": 0.24771617278639574, + "grad_norm": 6.051864147186279, + "learning_rate": 8.643754502902229e-06, + "loss": 0.2736, + "step": 9789 + }, + { + "epoch": 0.2477414783510894, + "grad_norm": 14.056877136230469, + "learning_rate": 8.643479536995628e-06, + "loss": 0.2297, + "step": 9790 + }, + { + "epoch": 0.24776678391578308, + "grad_norm": 4.030829429626465, + "learning_rate": 8.643204547592905e-06, + "loss": 0.1762, + "step": 9791 + }, + { + "epoch": 0.24779208948047676, + "grad_norm": 12.235846519470215, + "learning_rate": 8.642929534695837e-06, + "loss": 0.1582, + "step": 9792 + }, + { + "epoch": 0.24781739504517045, + "grad_norm": 6.17356014251709, + "learning_rate": 8.642654498306192e-06, + "loss": 0.1767, + "step": 9793 + }, + { + "epoch": 0.2478427006098641, + "grad_norm": 4.435286521911621, + "learning_rate": 8.642379438425744e-06, + "loss": 0.2216, + "step": 9794 + }, + { + "epoch": 0.2478680061745578, + "grad_norm": 9.72603988647461, + "learning_rate": 8.64210435505627e-06, + "loss": 0.2103, + "step": 9795 + }, + { + "epoch": 0.24789331173925147, + "grad_norm": 8.666370391845703, + "learning_rate": 8.641829248199543e-06, + "loss": 0.21, + "step": 9796 + }, + { + "epoch": 0.24791861730394513, + "grad_norm": 8.514034271240234, + "learning_rate": 8.641554117857336e-06, + "loss": 0.282, + "step": 9797 + }, + { + "epoch": 0.2479439228686388, + "grad_norm": 2.5884766578674316, + "learning_rate": 8.641278964031422e-06, + "loss": 0.1118, + "step": 9798 + }, + { + "epoch": 0.2479692284333325, + "grad_norm": 9.093385696411133, + "learning_rate": 8.64100378672358e-06, + "loss": 0.2037, + "step": 9799 + }, + { + "epoch": 0.24799453399802618, + "grad_norm": 7.620001316070557, + "learning_rate": 8.64072858593558e-06, + "loss": 0.1578, + "step": 9800 + }, + { + "epoch": 0.24801983956271983, + "grad_norm": 2.5103933811187744, + "learning_rate": 8.6404533616692e-06, + "loss": 0.132, + "step": 9801 + }, + { + "epoch": 0.24804514512741352, + "grad_norm": 3.0304365158081055, + "learning_rate": 8.640178113926215e-06, + "loss": 0.1571, + "step": 9802 + }, + { + "epoch": 0.2480704506921072, + "grad_norm": 9.640218734741211, + "learning_rate": 8.639902842708396e-06, + "loss": 0.1974, + "step": 9803 + }, + { + "epoch": 0.24809575625680086, + "grad_norm": 4.253366947174072, + "learning_rate": 8.63962754801752e-06, + "loss": 0.1434, + "step": 9804 + }, + { + "epoch": 0.24812106182149454, + "grad_norm": 3.0886690616607666, + "learning_rate": 8.639352229855364e-06, + "loss": 0.1216, + "step": 9805 + }, + { + "epoch": 0.24814636738618823, + "grad_norm": 5.94567346572876, + "learning_rate": 8.639076888223703e-06, + "loss": 0.1954, + "step": 9806 + }, + { + "epoch": 0.2481716729508819, + "grad_norm": 4.837891578674316, + "learning_rate": 8.638801523124311e-06, + "loss": 0.1567, + "step": 9807 + }, + { + "epoch": 0.24819697851557557, + "grad_norm": 4.744117736816406, + "learning_rate": 8.638526134558966e-06, + "loss": 0.1923, + "step": 9808 + }, + { + "epoch": 0.24822228408026925, + "grad_norm": 9.482297897338867, + "learning_rate": 8.638250722529443e-06, + "loss": 0.1919, + "step": 9809 + }, + { + "epoch": 0.24824758964496293, + "grad_norm": 8.79946231842041, + "learning_rate": 8.637975287037517e-06, + "loss": 0.2781, + "step": 9810 + }, + { + "epoch": 0.2482728952096566, + "grad_norm": 4.228246212005615, + "learning_rate": 8.637699828084965e-06, + "loss": 0.1395, + "step": 9811 + }, + { + "epoch": 0.24829820077435028, + "grad_norm": 5.362427711486816, + "learning_rate": 8.637424345673563e-06, + "loss": 0.2814, + "step": 9812 + }, + { + "epoch": 0.24832350633904396, + "grad_norm": 4.848025798797607, + "learning_rate": 8.63714883980509e-06, + "loss": 0.2147, + "step": 9813 + }, + { + "epoch": 0.24834881190373764, + "grad_norm": 6.941290378570557, + "learning_rate": 8.636873310481319e-06, + "loss": 0.1956, + "step": 9814 + }, + { + "epoch": 0.2483741174684313, + "grad_norm": 15.176249504089355, + "learning_rate": 8.636597757704029e-06, + "loss": 0.306, + "step": 9815 + }, + { + "epoch": 0.24839942303312498, + "grad_norm": 11.788896560668945, + "learning_rate": 8.636322181474997e-06, + "loss": 0.2457, + "step": 9816 + }, + { + "epoch": 0.24842472859781867, + "grad_norm": 4.176814556121826, + "learning_rate": 8.636046581795997e-06, + "loss": 0.1477, + "step": 9817 + }, + { + "epoch": 0.24845003416251232, + "grad_norm": 5.230217456817627, + "learning_rate": 8.635770958668812e-06, + "loss": 0.2116, + "step": 9818 + }, + { + "epoch": 0.248475339727206, + "grad_norm": 5.2758660316467285, + "learning_rate": 8.635495312095214e-06, + "loss": 0.2818, + "step": 9819 + }, + { + "epoch": 0.2485006452918997, + "grad_norm": 13.566376686096191, + "learning_rate": 8.635219642076985e-06, + "loss": 0.2469, + "step": 9820 + }, + { + "epoch": 0.24852595085659338, + "grad_norm": 16.549884796142578, + "learning_rate": 8.6349439486159e-06, + "loss": 0.3504, + "step": 9821 + }, + { + "epoch": 0.24855125642128703, + "grad_norm": 4.260953426361084, + "learning_rate": 8.634668231713738e-06, + "loss": 0.1776, + "step": 9822 + }, + { + "epoch": 0.24857656198598072, + "grad_norm": 8.256208419799805, + "learning_rate": 8.634392491372276e-06, + "loss": 0.181, + "step": 9823 + }, + { + "epoch": 0.2486018675506744, + "grad_norm": 9.414713859558105, + "learning_rate": 8.634116727593293e-06, + "loss": 0.1825, + "step": 9824 + }, + { + "epoch": 0.24862717311536808, + "grad_norm": 5.749857425689697, + "learning_rate": 8.633840940378568e-06, + "loss": 0.1693, + "step": 9825 + }, + { + "epoch": 0.24865247868006174, + "grad_norm": 6.31205415725708, + "learning_rate": 8.63356512972988e-06, + "loss": 0.228, + "step": 9826 + }, + { + "epoch": 0.24867778424475542, + "grad_norm": 6.078405380249023, + "learning_rate": 8.633289295649004e-06, + "loss": 0.1405, + "step": 9827 + }, + { + "epoch": 0.2487030898094491, + "grad_norm": 3.525998115539551, + "learning_rate": 8.63301343813772e-06, + "loss": 0.135, + "step": 9828 + }, + { + "epoch": 0.24872839537414276, + "grad_norm": 3.916470527648926, + "learning_rate": 8.632737557197811e-06, + "loss": 0.2276, + "step": 9829 + }, + { + "epoch": 0.24875370093883645, + "grad_norm": 3.1808664798736572, + "learning_rate": 8.632461652831053e-06, + "loss": 0.1864, + "step": 9830 + }, + { + "epoch": 0.24877900650353013, + "grad_norm": 11.896666526794434, + "learning_rate": 8.632185725039224e-06, + "loss": 0.22, + "step": 9831 + }, + { + "epoch": 0.24880431206822382, + "grad_norm": 7.215083599090576, + "learning_rate": 8.631909773824106e-06, + "loss": 0.241, + "step": 9832 + }, + { + "epoch": 0.24882961763291747, + "grad_norm": 4.750816822052002, + "learning_rate": 8.631633799187475e-06, + "loss": 0.1027, + "step": 9833 + }, + { + "epoch": 0.24885492319761116, + "grad_norm": 4.199499607086182, + "learning_rate": 8.631357801131118e-06, + "loss": 0.1395, + "step": 9834 + }, + { + "epoch": 0.24888022876230484, + "grad_norm": 5.535975933074951, + "learning_rate": 8.631081779656807e-06, + "loss": 0.1917, + "step": 9835 + }, + { + "epoch": 0.2489055343269985, + "grad_norm": 8.554847717285156, + "learning_rate": 8.630805734766324e-06, + "loss": 0.2059, + "step": 9836 + }, + { + "epoch": 0.24893083989169218, + "grad_norm": 8.614359855651855, + "learning_rate": 8.630529666461451e-06, + "loss": 0.1968, + "step": 9837 + }, + { + "epoch": 0.24895614545638586, + "grad_norm": 4.765392303466797, + "learning_rate": 8.630253574743968e-06, + "loss": 0.1354, + "step": 9838 + }, + { + "epoch": 0.24898145102107955, + "grad_norm": 6.187159538269043, + "learning_rate": 8.629977459615655e-06, + "loss": 0.2281, + "step": 9839 + }, + { + "epoch": 0.2490067565857732, + "grad_norm": 4.254953861236572, + "learning_rate": 8.629701321078292e-06, + "loss": 0.1109, + "step": 9840 + }, + { + "epoch": 0.2490320621504669, + "grad_norm": 8.576170921325684, + "learning_rate": 8.629425159133661e-06, + "loss": 0.1385, + "step": 9841 + }, + { + "epoch": 0.24905736771516057, + "grad_norm": 3.59592866897583, + "learning_rate": 8.629148973783543e-06, + "loss": 0.1693, + "step": 9842 + }, + { + "epoch": 0.24908267327985423, + "grad_norm": 6.788307189941406, + "learning_rate": 8.628872765029717e-06, + "loss": 0.1912, + "step": 9843 + }, + { + "epoch": 0.2491079788445479, + "grad_norm": 3.3832454681396484, + "learning_rate": 8.628596532873966e-06, + "loss": 0.0812, + "step": 9844 + }, + { + "epoch": 0.2491332844092416, + "grad_norm": 4.586738586425781, + "learning_rate": 8.628320277318072e-06, + "loss": 0.1795, + "step": 9845 + }, + { + "epoch": 0.24915858997393528, + "grad_norm": 3.6528055667877197, + "learning_rate": 8.628043998363813e-06, + "loss": 0.1525, + "step": 9846 + }, + { + "epoch": 0.24918389553862894, + "grad_norm": 8.583043098449707, + "learning_rate": 8.627767696012975e-06, + "loss": 0.2174, + "step": 9847 + }, + { + "epoch": 0.24920920110332262, + "grad_norm": 14.856968879699707, + "learning_rate": 8.627491370267338e-06, + "loss": 0.2549, + "step": 9848 + }, + { + "epoch": 0.2492345066680163, + "grad_norm": 3.5186212062835693, + "learning_rate": 8.627215021128683e-06, + "loss": 0.1626, + "step": 9849 + }, + { + "epoch": 0.24925981223270996, + "grad_norm": 10.773308753967285, + "learning_rate": 8.626938648598792e-06, + "loss": 0.2215, + "step": 9850 + }, + { + "epoch": 0.24928511779740364, + "grad_norm": 4.451940536499023, + "learning_rate": 8.626662252679452e-06, + "loss": 0.1831, + "step": 9851 + }, + { + "epoch": 0.24931042336209733, + "grad_norm": 9.77845287322998, + "learning_rate": 8.626385833372439e-06, + "loss": 0.2299, + "step": 9852 + }, + { + "epoch": 0.249335728926791, + "grad_norm": 5.675522804260254, + "learning_rate": 8.626109390679537e-06, + "loss": 0.1515, + "step": 9853 + }, + { + "epoch": 0.24936103449148467, + "grad_norm": 6.074417591094971, + "learning_rate": 8.625832924602533e-06, + "loss": 0.1335, + "step": 9854 + }, + { + "epoch": 0.24938634005617835, + "grad_norm": 18.503061294555664, + "learning_rate": 8.625556435143206e-06, + "loss": 0.1942, + "step": 9855 + }, + { + "epoch": 0.24941164562087204, + "grad_norm": 4.639101028442383, + "learning_rate": 8.62527992230334e-06, + "loss": 0.097, + "step": 9856 + }, + { + "epoch": 0.24943695118556572, + "grad_norm": 7.127091884613037, + "learning_rate": 8.625003386084718e-06, + "loss": 0.147, + "step": 9857 + }, + { + "epoch": 0.24946225675025938, + "grad_norm": 6.956714153289795, + "learning_rate": 8.624726826489122e-06, + "loss": 0.274, + "step": 9858 + }, + { + "epoch": 0.24948756231495306, + "grad_norm": 8.898577690124512, + "learning_rate": 8.62445024351834e-06, + "loss": 0.1949, + "step": 9859 + }, + { + "epoch": 0.24951286787964674, + "grad_norm": 5.345767498016357, + "learning_rate": 8.62417363717415e-06, + "loss": 0.2017, + "step": 9860 + }, + { + "epoch": 0.2495381734443404, + "grad_norm": 6.254638671875, + "learning_rate": 8.62389700745834e-06, + "loss": 0.172, + "step": 9861 + }, + { + "epoch": 0.24956347900903408, + "grad_norm": 10.287147521972656, + "learning_rate": 8.623620354372691e-06, + "loss": 0.1707, + "step": 9862 + }, + { + "epoch": 0.24958878457372777, + "grad_norm": 3.30696439743042, + "learning_rate": 8.62334367791899e-06, + "loss": 0.1351, + "step": 9863 + }, + { + "epoch": 0.24961409013842145, + "grad_norm": 4.99864387512207, + "learning_rate": 8.623066978099018e-06, + "loss": 0.2029, + "step": 9864 + }, + { + "epoch": 0.2496393957031151, + "grad_norm": 3.7189226150512695, + "learning_rate": 8.622790254914561e-06, + "loss": 0.1777, + "step": 9865 + }, + { + "epoch": 0.2496647012678088, + "grad_norm": 8.011542320251465, + "learning_rate": 8.622513508367404e-06, + "loss": 0.1817, + "step": 9866 + }, + { + "epoch": 0.24969000683250248, + "grad_norm": 14.791790008544922, + "learning_rate": 8.622236738459331e-06, + "loss": 0.1819, + "step": 9867 + }, + { + "epoch": 0.24971531239719613, + "grad_norm": 4.697065830230713, + "learning_rate": 8.621959945192127e-06, + "loss": 0.2045, + "step": 9868 + }, + { + "epoch": 0.24974061796188982, + "grad_norm": 5.611972332000732, + "learning_rate": 8.621683128567578e-06, + "loss": 0.2114, + "step": 9869 + }, + { + "epoch": 0.2497659235265835, + "grad_norm": 4.453611373901367, + "learning_rate": 8.621406288587469e-06, + "loss": 0.2204, + "step": 9870 + }, + { + "epoch": 0.24979122909127718, + "grad_norm": 3.0570719242095947, + "learning_rate": 8.621129425253581e-06, + "loss": 0.1067, + "step": 9871 + }, + { + "epoch": 0.24981653465597084, + "grad_norm": 4.48840856552124, + "learning_rate": 8.620852538567706e-06, + "loss": 0.1421, + "step": 9872 + }, + { + "epoch": 0.24984184022066452, + "grad_norm": 8.69397258758545, + "learning_rate": 8.620575628531625e-06, + "loss": 0.2047, + "step": 9873 + }, + { + "epoch": 0.2498671457853582, + "grad_norm": 9.874652862548828, + "learning_rate": 8.620298695147125e-06, + "loss": 0.2255, + "step": 9874 + }, + { + "epoch": 0.24989245135005186, + "grad_norm": 7.02459716796875, + "learning_rate": 8.620021738415993e-06, + "loss": 0.1972, + "step": 9875 + }, + { + "epoch": 0.24991775691474555, + "grad_norm": 12.013799667358398, + "learning_rate": 8.619744758340014e-06, + "loss": 0.2543, + "step": 9876 + }, + { + "epoch": 0.24994306247943923, + "grad_norm": 5.766481399536133, + "learning_rate": 8.619467754920976e-06, + "loss": 0.3, + "step": 9877 + }, + { + "epoch": 0.24996836804413292, + "grad_norm": 4.544289588928223, + "learning_rate": 8.619190728160662e-06, + "loss": 0.2, + "step": 9878 + }, + { + "epoch": 0.24999367360882657, + "grad_norm": 4.250052452087402, + "learning_rate": 8.618913678060859e-06, + "loss": 0.217, + "step": 9879 + }, + { + "epoch": 0.2500189791735203, + "grad_norm": 3.555340051651001, + "learning_rate": 8.618636604623357e-06, + "loss": 0.1559, + "step": 9880 + }, + { + "epoch": 0.25004428473821394, + "grad_norm": 3.2974274158477783, + "learning_rate": 8.61835950784994e-06, + "loss": 0.1291, + "step": 9881 + }, + { + "epoch": 0.2500695903029076, + "grad_norm": 3.405900239944458, + "learning_rate": 8.618082387742395e-06, + "loss": 0.1526, + "step": 9882 + }, + { + "epoch": 0.2500948958676013, + "grad_norm": 6.016334056854248, + "learning_rate": 8.617805244302511e-06, + "loss": 0.209, + "step": 9883 + }, + { + "epoch": 0.25012020143229496, + "grad_norm": 5.737435817718506, + "learning_rate": 8.617528077532073e-06, + "loss": 0.1852, + "step": 9884 + }, + { + "epoch": 0.2501455069969886, + "grad_norm": 5.971340656280518, + "learning_rate": 8.61725088743287e-06, + "loss": 0.2275, + "step": 9885 + }, + { + "epoch": 0.25017081256168233, + "grad_norm": 4.396924018859863, + "learning_rate": 8.616973674006688e-06, + "loss": 0.1845, + "step": 9886 + }, + { + "epoch": 0.250196118126376, + "grad_norm": 4.750046253204346, + "learning_rate": 8.616696437255317e-06, + "loss": 0.1726, + "step": 9887 + }, + { + "epoch": 0.25022142369106964, + "grad_norm": 12.524999618530273, + "learning_rate": 8.616419177180542e-06, + "loss": 0.245, + "step": 9888 + }, + { + "epoch": 0.25024672925576336, + "grad_norm": 5.015815734863281, + "learning_rate": 8.61614189378415e-06, + "loss": 0.155, + "step": 9889 + }, + { + "epoch": 0.250272034820457, + "grad_norm": 3.0358033180236816, + "learning_rate": 8.615864587067937e-06, + "loss": 0.161, + "step": 9890 + }, + { + "epoch": 0.25029734038515067, + "grad_norm": 3.3915717601776123, + "learning_rate": 8.615587257033681e-06, + "loss": 0.1496, + "step": 9891 + }, + { + "epoch": 0.2503226459498444, + "grad_norm": 13.514963150024414, + "learning_rate": 8.615309903683178e-06, + "loss": 0.3274, + "step": 9892 + }, + { + "epoch": 0.25034795151453804, + "grad_norm": 4.6953229904174805, + "learning_rate": 8.615032527018215e-06, + "loss": 0.1395, + "step": 9893 + }, + { + "epoch": 0.25037325707923175, + "grad_norm": 3.987619638442993, + "learning_rate": 8.614755127040578e-06, + "loss": 0.1525, + "step": 9894 + }, + { + "epoch": 0.2503985626439254, + "grad_norm": 3.4234282970428467, + "learning_rate": 8.614477703752058e-06, + "loss": 0.1206, + "step": 9895 + }, + { + "epoch": 0.25042386820861906, + "grad_norm": 6.142043590545654, + "learning_rate": 8.614200257154442e-06, + "loss": 0.213, + "step": 9896 + }, + { + "epoch": 0.25044917377331277, + "grad_norm": 5.68034029006958, + "learning_rate": 8.613922787249522e-06, + "loss": 0.2326, + "step": 9897 + }, + { + "epoch": 0.25047447933800643, + "grad_norm": 9.10778522491455, + "learning_rate": 8.613645294039088e-06, + "loss": 0.1911, + "step": 9898 + }, + { + "epoch": 0.2504997849027001, + "grad_norm": 6.335321426391602, + "learning_rate": 8.613367777524927e-06, + "loss": 0.1871, + "step": 9899 + }, + { + "epoch": 0.2505250904673938, + "grad_norm": 4.328324317932129, + "learning_rate": 8.613090237708827e-06, + "loss": 0.1838, + "step": 9900 + }, + { + "epoch": 0.25055039603208745, + "grad_norm": 8.060245513916016, + "learning_rate": 8.612812674592582e-06, + "loss": 0.1405, + "step": 9901 + }, + { + "epoch": 0.2505757015967811, + "grad_norm": 4.116305351257324, + "learning_rate": 8.61253508817798e-06, + "loss": 0.1621, + "step": 9902 + }, + { + "epoch": 0.2506010071614748, + "grad_norm": 4.44330358505249, + "learning_rate": 8.61225747846681e-06, + "loss": 0.1167, + "step": 9903 + }, + { + "epoch": 0.2506263127261685, + "grad_norm": 10.870309829711914, + "learning_rate": 8.611979845460865e-06, + "loss": 0.2504, + "step": 9904 + }, + { + "epoch": 0.2506516182908622, + "grad_norm": 60.519004821777344, + "learning_rate": 8.611702189161932e-06, + "loss": 0.2335, + "step": 9905 + }, + { + "epoch": 0.25067692385555584, + "grad_norm": 10.558815956115723, + "learning_rate": 8.611424509571807e-06, + "loss": 0.2441, + "step": 9906 + }, + { + "epoch": 0.2507022294202495, + "grad_norm": 5.465696334838867, + "learning_rate": 8.611146806692275e-06, + "loss": 0.225, + "step": 9907 + }, + { + "epoch": 0.2507275349849432, + "grad_norm": 9.025534629821777, + "learning_rate": 8.610869080525128e-06, + "loss": 0.1663, + "step": 9908 + }, + { + "epoch": 0.25075284054963687, + "grad_norm": 6.844824314117432, + "learning_rate": 8.610591331072157e-06, + "loss": 0.1679, + "step": 9909 + }, + { + "epoch": 0.2507781461143305, + "grad_norm": 11.438518524169922, + "learning_rate": 8.610313558335157e-06, + "loss": 0.3212, + "step": 9910 + }, + { + "epoch": 0.25080345167902424, + "grad_norm": 3.0670034885406494, + "learning_rate": 8.610035762315915e-06, + "loss": 0.1228, + "step": 9911 + }, + { + "epoch": 0.2508287572437179, + "grad_norm": 7.358963489532471, + "learning_rate": 8.609757943016223e-06, + "loss": 0.145, + "step": 9912 + }, + { + "epoch": 0.25085406280841155, + "grad_norm": 6.579077243804932, + "learning_rate": 8.609480100437875e-06, + "loss": 0.2156, + "step": 9913 + }, + { + "epoch": 0.25087936837310526, + "grad_norm": 4.259300708770752, + "learning_rate": 8.60920223458266e-06, + "loss": 0.1389, + "step": 9914 + }, + { + "epoch": 0.2509046739377989, + "grad_norm": 3.967587947845459, + "learning_rate": 8.608924345452374e-06, + "loss": 0.12, + "step": 9915 + }, + { + "epoch": 0.2509299795024926, + "grad_norm": 2.4885950088500977, + "learning_rate": 8.608646433048805e-06, + "loss": 0.1031, + "step": 9916 + }, + { + "epoch": 0.2509552850671863, + "grad_norm": 4.555812358856201, + "learning_rate": 8.608368497373745e-06, + "loss": 0.2047, + "step": 9917 + }, + { + "epoch": 0.25098059063187994, + "grad_norm": 3.2523303031921387, + "learning_rate": 8.608090538428987e-06, + "loss": 0.1129, + "step": 9918 + }, + { + "epoch": 0.25100589619657365, + "grad_norm": 15.51524829864502, + "learning_rate": 8.607812556216327e-06, + "loss": 0.2782, + "step": 9919 + }, + { + "epoch": 0.2510312017612673, + "grad_norm": 5.145576000213623, + "learning_rate": 8.607534550737553e-06, + "loss": 0.1946, + "step": 9920 + }, + { + "epoch": 0.25105650732596096, + "grad_norm": 4.106423377990723, + "learning_rate": 8.60725652199446e-06, + "loss": 0.1644, + "step": 9921 + }, + { + "epoch": 0.2510818128906547, + "grad_norm": 6.113478660583496, + "learning_rate": 8.606978469988842e-06, + "loss": 0.26, + "step": 9922 + }, + { + "epoch": 0.25110711845534833, + "grad_norm": 4.923524379730225, + "learning_rate": 8.60670039472249e-06, + "loss": 0.225, + "step": 9923 + }, + { + "epoch": 0.251132424020042, + "grad_norm": 6.410646915435791, + "learning_rate": 8.606422296197196e-06, + "loss": 0.1793, + "step": 9924 + }, + { + "epoch": 0.2511577295847357, + "grad_norm": 3.465108633041382, + "learning_rate": 8.606144174414759e-06, + "loss": 0.1616, + "step": 9925 + }, + { + "epoch": 0.25118303514942936, + "grad_norm": 5.333579063415527, + "learning_rate": 8.605866029376966e-06, + "loss": 0.1953, + "step": 9926 + }, + { + "epoch": 0.251208340714123, + "grad_norm": 11.49817943572998, + "learning_rate": 8.605587861085616e-06, + "loss": 0.2177, + "step": 9927 + }, + { + "epoch": 0.2512336462788167, + "grad_norm": 5.483430862426758, + "learning_rate": 8.6053096695425e-06, + "loss": 0.2076, + "step": 9928 + }, + { + "epoch": 0.2512589518435104, + "grad_norm": 7.304688930511475, + "learning_rate": 8.605031454749412e-06, + "loss": 0.2347, + "step": 9929 + }, + { + "epoch": 0.25128425740820404, + "grad_norm": 6.231929779052734, + "learning_rate": 8.604753216708147e-06, + "loss": 0.1855, + "step": 9930 + }, + { + "epoch": 0.25130956297289775, + "grad_norm": 4.416334629058838, + "learning_rate": 8.604474955420499e-06, + "loss": 0.1473, + "step": 9931 + }, + { + "epoch": 0.2513348685375914, + "grad_norm": 14.530125617980957, + "learning_rate": 8.604196670888264e-06, + "loss": 0.2237, + "step": 9932 + }, + { + "epoch": 0.2513601741022851, + "grad_norm": 6.0415449142456055, + "learning_rate": 8.603918363113233e-06, + "loss": 0.204, + "step": 9933 + }, + { + "epoch": 0.2513854796669788, + "grad_norm": 5.531843185424805, + "learning_rate": 8.603640032097202e-06, + "loss": 0.2292, + "step": 9934 + }, + { + "epoch": 0.25141078523167243, + "grad_norm": 3.37847900390625, + "learning_rate": 8.603361677841968e-06, + "loss": 0.1521, + "step": 9935 + }, + { + "epoch": 0.25143609079636614, + "grad_norm": 6.681996822357178, + "learning_rate": 8.603083300349326e-06, + "loss": 0.2027, + "step": 9936 + }, + { + "epoch": 0.2514613963610598, + "grad_norm": 6.708804130554199, + "learning_rate": 8.602804899621068e-06, + "loss": 0.1964, + "step": 9937 + }, + { + "epoch": 0.25148670192575345, + "grad_norm": 5.504685878753662, + "learning_rate": 8.602526475658994e-06, + "loss": 0.2098, + "step": 9938 + }, + { + "epoch": 0.25151200749044716, + "grad_norm": 11.609944343566895, + "learning_rate": 8.602248028464895e-06, + "loss": 0.1906, + "step": 9939 + }, + { + "epoch": 0.2515373130551408, + "grad_norm": 4.7335710525512695, + "learning_rate": 8.601969558040568e-06, + "loss": 0.1603, + "step": 9940 + }, + { + "epoch": 0.2515626186198345, + "grad_norm": 5.171718597412109, + "learning_rate": 8.601691064387812e-06, + "loss": 0.1397, + "step": 9941 + }, + { + "epoch": 0.2515879241845282, + "grad_norm": 6.97899055480957, + "learning_rate": 8.601412547508417e-06, + "loss": 0.1986, + "step": 9942 + }, + { + "epoch": 0.25161322974922185, + "grad_norm": 6.131107330322266, + "learning_rate": 8.601134007404185e-06, + "loss": 0.2279, + "step": 9943 + }, + { + "epoch": 0.25163853531391556, + "grad_norm": 5.588132381439209, + "learning_rate": 8.600855444076907e-06, + "loss": 0.1866, + "step": 9944 + }, + { + "epoch": 0.2516638408786092, + "grad_norm": 4.230302333831787, + "learning_rate": 8.600576857528385e-06, + "loss": 0.195, + "step": 9945 + }, + { + "epoch": 0.25168914644330287, + "grad_norm": 7.592627048492432, + "learning_rate": 8.600298247760411e-06, + "loss": 0.2327, + "step": 9946 + }, + { + "epoch": 0.2517144520079966, + "grad_norm": 4.121680736541748, + "learning_rate": 8.600019614774784e-06, + "loss": 0.1892, + "step": 9947 + }, + { + "epoch": 0.25173975757269024, + "grad_norm": 4.671170234680176, + "learning_rate": 8.5997409585733e-06, + "loss": 0.1871, + "step": 9948 + }, + { + "epoch": 0.2517650631373839, + "grad_norm": 5.189410209655762, + "learning_rate": 8.599462279157755e-06, + "loss": 0.1293, + "step": 9949 + }, + { + "epoch": 0.2517903687020776, + "grad_norm": 3.739457607269287, + "learning_rate": 8.599183576529947e-06, + "loss": 0.1927, + "step": 9950 + }, + { + "epoch": 0.25181567426677126, + "grad_norm": 4.954107761383057, + "learning_rate": 8.598904850691676e-06, + "loss": 0.1692, + "step": 9951 + }, + { + "epoch": 0.2518409798314649, + "grad_norm": 5.3014020919799805, + "learning_rate": 8.598626101644735e-06, + "loss": 0.2541, + "step": 9952 + }, + { + "epoch": 0.25186628539615863, + "grad_norm": 6.590268611907959, + "learning_rate": 8.598347329390926e-06, + "loss": 0.215, + "step": 9953 + }, + { + "epoch": 0.2518915909608523, + "grad_norm": 5.127480506896973, + "learning_rate": 8.598068533932042e-06, + "loss": 0.178, + "step": 9954 + }, + { + "epoch": 0.25191689652554594, + "grad_norm": 6.821988582611084, + "learning_rate": 8.597789715269883e-06, + "loss": 0.2635, + "step": 9955 + }, + { + "epoch": 0.25194220209023965, + "grad_norm": 13.044836044311523, + "learning_rate": 8.597510873406249e-06, + "loss": 0.2098, + "step": 9956 + }, + { + "epoch": 0.2519675076549333, + "grad_norm": 3.1152546405792236, + "learning_rate": 8.597232008342938e-06, + "loss": 0.1456, + "step": 9957 + }, + { + "epoch": 0.251992813219627, + "grad_norm": 6.208017349243164, + "learning_rate": 8.596953120081744e-06, + "loss": 0.2321, + "step": 9958 + }, + { + "epoch": 0.2520181187843207, + "grad_norm": 8.412837982177734, + "learning_rate": 8.59667420862447e-06, + "loss": 0.2041, + "step": 9959 + }, + { + "epoch": 0.25204342434901433, + "grad_norm": 7.561116695404053, + "learning_rate": 8.596395273972911e-06, + "loss": 0.3039, + "step": 9960 + }, + { + "epoch": 0.25206872991370805, + "grad_norm": 4.7463297843933105, + "learning_rate": 8.59611631612887e-06, + "loss": 0.1926, + "step": 9961 + }, + { + "epoch": 0.2520940354784017, + "grad_norm": 8.631179809570312, + "learning_rate": 8.595837335094143e-06, + "loss": 0.1833, + "step": 9962 + }, + { + "epoch": 0.25211934104309536, + "grad_norm": 4.92233943939209, + "learning_rate": 8.59555833087053e-06, + "loss": 0.183, + "step": 9963 + }, + { + "epoch": 0.25214464660778907, + "grad_norm": 4.516829967498779, + "learning_rate": 8.595279303459831e-06, + "loss": 0.1827, + "step": 9964 + }, + { + "epoch": 0.2521699521724827, + "grad_norm": 4.964354991912842, + "learning_rate": 8.595000252863843e-06, + "loss": 0.1684, + "step": 9965 + }, + { + "epoch": 0.2521952577371764, + "grad_norm": 3.829294204711914, + "learning_rate": 8.59472117908437e-06, + "loss": 0.1087, + "step": 9966 + }, + { + "epoch": 0.2522205633018701, + "grad_norm": 5.01642370223999, + "learning_rate": 8.594442082123204e-06, + "loss": 0.1799, + "step": 9967 + }, + { + "epoch": 0.25224586886656375, + "grad_norm": 5.498959064483643, + "learning_rate": 8.594162961982153e-06, + "loss": 0.1675, + "step": 9968 + }, + { + "epoch": 0.25227117443125746, + "grad_norm": 4.445042610168457, + "learning_rate": 8.593883818663013e-06, + "loss": 0.1551, + "step": 9969 + }, + { + "epoch": 0.2522964799959511, + "grad_norm": 5.494027137756348, + "learning_rate": 8.593604652167584e-06, + "loss": 0.1177, + "step": 9970 + }, + { + "epoch": 0.2523217855606448, + "grad_norm": 8.07705020904541, + "learning_rate": 8.59332546249767e-06, + "loss": 0.1973, + "step": 9971 + }, + { + "epoch": 0.2523470911253385, + "grad_norm": 4.593314170837402, + "learning_rate": 8.593046249655067e-06, + "loss": 0.1303, + "step": 9972 + }, + { + "epoch": 0.25237239669003214, + "grad_norm": 6.576028823852539, + "learning_rate": 8.592767013641574e-06, + "loss": 0.24, + "step": 9973 + }, + { + "epoch": 0.2523977022547258, + "grad_norm": 8.4435453414917, + "learning_rate": 8.592487754458998e-06, + "loss": 0.1885, + "step": 9974 + }, + { + "epoch": 0.2524230078194195, + "grad_norm": 7.839654445648193, + "learning_rate": 8.592208472109137e-06, + "loss": 0.2563, + "step": 9975 + }, + { + "epoch": 0.25244831338411317, + "grad_norm": 12.492718696594238, + "learning_rate": 8.591929166593789e-06, + "loss": 0.2577, + "step": 9976 + }, + { + "epoch": 0.2524736189488068, + "grad_norm": 6.419128894805908, + "learning_rate": 8.59164983791476e-06, + "loss": 0.2153, + "step": 9977 + }, + { + "epoch": 0.25249892451350053, + "grad_norm": 5.197327613830566, + "learning_rate": 8.59137048607385e-06, + "loss": 0.1627, + "step": 9978 + }, + { + "epoch": 0.2525242300781942, + "grad_norm": 5.879818916320801, + "learning_rate": 8.591091111072859e-06, + "loss": 0.1997, + "step": 9979 + }, + { + "epoch": 0.25254953564288785, + "grad_norm": 3.6908557415008545, + "learning_rate": 8.590811712913587e-06, + "loss": 0.1293, + "step": 9980 + }, + { + "epoch": 0.25257484120758156, + "grad_norm": 6.284695625305176, + "learning_rate": 8.59053229159784e-06, + "loss": 0.1991, + "step": 9981 + }, + { + "epoch": 0.2526001467722752, + "grad_norm": 5.600692272186279, + "learning_rate": 8.590252847127419e-06, + "loss": 0.2647, + "step": 9982 + }, + { + "epoch": 0.2526254523369689, + "grad_norm": 2.9918949604034424, + "learning_rate": 8.589973379504125e-06, + "loss": 0.1405, + "step": 9983 + }, + { + "epoch": 0.2526507579016626, + "grad_norm": 6.311496257781982, + "learning_rate": 8.58969388872976e-06, + "loss": 0.1858, + "step": 9984 + }, + { + "epoch": 0.25267606346635624, + "grad_norm": 5.153480529785156, + "learning_rate": 8.589414374806127e-06, + "loss": 0.1868, + "step": 9985 + }, + { + "epoch": 0.25270136903104995, + "grad_norm": 10.43376350402832, + "learning_rate": 8.58913483773503e-06, + "loss": 0.1758, + "step": 9986 + }, + { + "epoch": 0.2527266745957436, + "grad_norm": 10.946304321289062, + "learning_rate": 8.588855277518267e-06, + "loss": 0.2909, + "step": 9987 + }, + { + "epoch": 0.25275198016043726, + "grad_norm": 7.596711158752441, + "learning_rate": 8.588575694157647e-06, + "loss": 0.1852, + "step": 9988 + }, + { + "epoch": 0.252777285725131, + "grad_norm": 5.6303606033325195, + "learning_rate": 8.588296087654965e-06, + "loss": 0.1756, + "step": 9989 + }, + { + "epoch": 0.25280259128982463, + "grad_norm": 7.931446075439453, + "learning_rate": 8.588016458012035e-06, + "loss": 0.2015, + "step": 9990 + }, + { + "epoch": 0.2528278968545183, + "grad_norm": 5.699656963348389, + "learning_rate": 8.58773680523065e-06, + "loss": 0.1779, + "step": 9991 + }, + { + "epoch": 0.252853202419212, + "grad_norm": 4.231354713439941, + "learning_rate": 8.58745712931262e-06, + "loss": 0.1164, + "step": 9992 + }, + { + "epoch": 0.25287850798390565, + "grad_norm": 8.68911075592041, + "learning_rate": 8.587177430259745e-06, + "loss": 0.2165, + "step": 9993 + }, + { + "epoch": 0.2529038135485993, + "grad_norm": 5.175604343414307, + "learning_rate": 8.586897708073832e-06, + "loss": 0.1897, + "step": 9994 + }, + { + "epoch": 0.252929119113293, + "grad_norm": 7.13427734375, + "learning_rate": 8.58661796275668e-06, + "loss": 0.1465, + "step": 9995 + }, + { + "epoch": 0.2529544246779867, + "grad_norm": 2.6265246868133545, + "learning_rate": 8.586338194310098e-06, + "loss": 0.1353, + "step": 9996 + }, + { + "epoch": 0.2529797302426804, + "grad_norm": 5.813927173614502, + "learning_rate": 8.586058402735886e-06, + "loss": 0.2058, + "step": 9997 + }, + { + "epoch": 0.25300503580737405, + "grad_norm": 7.127507209777832, + "learning_rate": 8.585778588035852e-06, + "loss": 0.2231, + "step": 9998 + }, + { + "epoch": 0.2530303413720677, + "grad_norm": 8.713883399963379, + "learning_rate": 8.5854987502118e-06, + "loss": 0.1995, + "step": 9999 + }, + { + "epoch": 0.2530556469367614, + "grad_norm": 7.804343223571777, + "learning_rate": 8.585218889265532e-06, + "loss": 0.2294, + "step": 10000 + }, + { + "epoch": 0.25308095250145507, + "grad_norm": 12.684911727905273, + "learning_rate": 8.584939005198854e-06, + "loss": 0.2207, + "step": 10001 + }, + { + "epoch": 0.2531062580661487, + "grad_norm": 8.255081176757812, + "learning_rate": 8.584659098013573e-06, + "loss": 0.1988, + "step": 10002 + }, + { + "epoch": 0.25313156363084244, + "grad_norm": 6.880980014801025, + "learning_rate": 8.58437916771149e-06, + "loss": 0.2396, + "step": 10003 + }, + { + "epoch": 0.2531568691955361, + "grad_norm": 4.2153801918029785, + "learning_rate": 8.584099214294413e-06, + "loss": 0.1117, + "step": 10004 + }, + { + "epoch": 0.25318217476022975, + "grad_norm": 5.678420066833496, + "learning_rate": 8.583819237764148e-06, + "loss": 0.1966, + "step": 10005 + }, + { + "epoch": 0.25320748032492346, + "grad_norm": 3.4437761306762695, + "learning_rate": 8.583539238122498e-06, + "loss": 0.1408, + "step": 10006 + }, + { + "epoch": 0.2532327858896171, + "grad_norm": 13.704096794128418, + "learning_rate": 8.583259215371272e-06, + "loss": 0.2793, + "step": 10007 + }, + { + "epoch": 0.25325809145431083, + "grad_norm": 6.064497470855713, + "learning_rate": 8.582979169512272e-06, + "loss": 0.2169, + "step": 10008 + }, + { + "epoch": 0.2532833970190045, + "grad_norm": 13.725611686706543, + "learning_rate": 8.582699100547308e-06, + "loss": 0.2239, + "step": 10009 + }, + { + "epoch": 0.25330870258369814, + "grad_norm": 3.3431107997894287, + "learning_rate": 8.58241900847818e-06, + "loss": 0.1083, + "step": 10010 + }, + { + "epoch": 0.25333400814839185, + "grad_norm": 9.935626983642578, + "learning_rate": 8.582138893306702e-06, + "loss": 0.2583, + "step": 10011 + }, + { + "epoch": 0.2533593137130855, + "grad_norm": 3.9718315601348877, + "learning_rate": 8.581858755034675e-06, + "loss": 0.1477, + "step": 10012 + }, + { + "epoch": 0.25338461927777917, + "grad_norm": 9.875374794006348, + "learning_rate": 8.581578593663906e-06, + "loss": 0.2489, + "step": 10013 + }, + { + "epoch": 0.2534099248424729, + "grad_norm": 3.7642815113067627, + "learning_rate": 8.581298409196206e-06, + "loss": 0.1725, + "step": 10014 + }, + { + "epoch": 0.25343523040716653, + "grad_norm": 4.105684757232666, + "learning_rate": 8.581018201633376e-06, + "loss": 0.1552, + "step": 10015 + }, + { + "epoch": 0.2534605359718602, + "grad_norm": 3.8163275718688965, + "learning_rate": 8.580737970977227e-06, + "loss": 0.1406, + "step": 10016 + }, + { + "epoch": 0.2534858415365539, + "grad_norm": 7.130276679992676, + "learning_rate": 8.580457717229564e-06, + "loss": 0.2186, + "step": 10017 + }, + { + "epoch": 0.25351114710124756, + "grad_norm": 6.472238540649414, + "learning_rate": 8.580177440392197e-06, + "loss": 0.2469, + "step": 10018 + }, + { + "epoch": 0.2535364526659412, + "grad_norm": 9.013903617858887, + "learning_rate": 8.57989714046693e-06, + "loss": 0.2066, + "step": 10019 + }, + { + "epoch": 0.2535617582306349, + "grad_norm": 3.5709855556488037, + "learning_rate": 8.579616817455572e-06, + "loss": 0.1543, + "step": 10020 + }, + { + "epoch": 0.2535870637953286, + "grad_norm": 11.353063583374023, + "learning_rate": 8.579336471359933e-06, + "loss": 0.1344, + "step": 10021 + }, + { + "epoch": 0.2536123693600223, + "grad_norm": 4.588673114776611, + "learning_rate": 8.579056102181818e-06, + "loss": 0.1852, + "step": 10022 + }, + { + "epoch": 0.25363767492471595, + "grad_norm": 18.362516403198242, + "learning_rate": 8.578775709923034e-06, + "loss": 0.2468, + "step": 10023 + }, + { + "epoch": 0.2536629804894096, + "grad_norm": 4.7773027420043945, + "learning_rate": 8.578495294585393e-06, + "loss": 0.203, + "step": 10024 + }, + { + "epoch": 0.2536882860541033, + "grad_norm": 3.3730170726776123, + "learning_rate": 8.578214856170702e-06, + "loss": 0.1171, + "step": 10025 + }, + { + "epoch": 0.253713591618797, + "grad_norm": 5.143283367156982, + "learning_rate": 8.577934394680768e-06, + "loss": 0.1812, + "step": 10026 + }, + { + "epoch": 0.25373889718349063, + "grad_norm": 7.0044169425964355, + "learning_rate": 8.5776539101174e-06, + "loss": 0.2129, + "step": 10027 + }, + { + "epoch": 0.25376420274818434, + "grad_norm": 6.227170944213867, + "learning_rate": 8.577373402482409e-06, + "loss": 0.1464, + "step": 10028 + }, + { + "epoch": 0.253789508312878, + "grad_norm": 9.021735191345215, + "learning_rate": 8.577092871777601e-06, + "loss": 0.2964, + "step": 10029 + }, + { + "epoch": 0.25381481387757165, + "grad_norm": 6.387619495391846, + "learning_rate": 8.576812318004788e-06, + "loss": 0.2064, + "step": 10030 + }, + { + "epoch": 0.25384011944226537, + "grad_norm": 4.775890827178955, + "learning_rate": 8.576531741165777e-06, + "loss": 0.1627, + "step": 10031 + }, + { + "epoch": 0.253865425006959, + "grad_norm": 8.566510200500488, + "learning_rate": 8.576251141262377e-06, + "loss": 0.2521, + "step": 10032 + }, + { + "epoch": 0.25389073057165273, + "grad_norm": 5.652292728424072, + "learning_rate": 8.5759705182964e-06, + "loss": 0.2754, + "step": 10033 + }, + { + "epoch": 0.2539160361363464, + "grad_norm": 3.6754133701324463, + "learning_rate": 8.575689872269655e-06, + "loss": 0.1108, + "step": 10034 + }, + { + "epoch": 0.25394134170104005, + "grad_norm": 10.000675201416016, + "learning_rate": 8.575409203183948e-06, + "loss": 0.1478, + "step": 10035 + }, + { + "epoch": 0.25396664726573376, + "grad_norm": 3.786045551300049, + "learning_rate": 8.575128511041095e-06, + "loss": 0.1129, + "step": 10036 + }, + { + "epoch": 0.2539919528304274, + "grad_norm": 3.061523199081421, + "learning_rate": 8.574847795842902e-06, + "loss": 0.1054, + "step": 10037 + }, + { + "epoch": 0.25401725839512107, + "grad_norm": 5.324776649475098, + "learning_rate": 8.574567057591182e-06, + "loss": 0.1687, + "step": 10038 + }, + { + "epoch": 0.2540425639598148, + "grad_norm": 5.467930316925049, + "learning_rate": 8.574286296287741e-06, + "loss": 0.1995, + "step": 10039 + }, + { + "epoch": 0.25406786952450844, + "grad_norm": 4.1292724609375, + "learning_rate": 8.574005511934396e-06, + "loss": 0.1436, + "step": 10040 + }, + { + "epoch": 0.2540931750892021, + "grad_norm": 4.195946216583252, + "learning_rate": 8.573724704532952e-06, + "loss": 0.1937, + "step": 10041 + }, + { + "epoch": 0.2541184806538958, + "grad_norm": 4.225431442260742, + "learning_rate": 8.573443874085224e-06, + "loss": 0.2078, + "step": 10042 + }, + { + "epoch": 0.25414378621858946, + "grad_norm": 5.34292459487915, + "learning_rate": 8.573163020593018e-06, + "loss": 0.1644, + "step": 10043 + }, + { + "epoch": 0.2541690917832831, + "grad_norm": 16.312406539916992, + "learning_rate": 8.57288214405815e-06, + "loss": 0.2702, + "step": 10044 + }, + { + "epoch": 0.25419439734797683, + "grad_norm": 3.5683999061584473, + "learning_rate": 8.57260124448243e-06, + "loss": 0.1485, + "step": 10045 + }, + { + "epoch": 0.2542197029126705, + "grad_norm": 10.996941566467285, + "learning_rate": 8.57232032186767e-06, + "loss": 0.2728, + "step": 10046 + }, + { + "epoch": 0.2542450084773642, + "grad_norm": 7.990415096282959, + "learning_rate": 8.572039376215679e-06, + "loss": 0.194, + "step": 10047 + }, + { + "epoch": 0.25427031404205785, + "grad_norm": 17.602210998535156, + "learning_rate": 8.571758407528273e-06, + "loss": 0.2865, + "step": 10048 + }, + { + "epoch": 0.2542956196067515, + "grad_norm": 10.88884449005127, + "learning_rate": 8.571477415807259e-06, + "loss": 0.3454, + "step": 10049 + }, + { + "epoch": 0.2543209251714452, + "grad_norm": 6.834782600402832, + "learning_rate": 8.571196401054453e-06, + "loss": 0.1685, + "step": 10050 + }, + { + "epoch": 0.2543462307361389, + "grad_norm": 5.110536098480225, + "learning_rate": 8.570915363271664e-06, + "loss": 0.1089, + "step": 10051 + }, + { + "epoch": 0.25437153630083253, + "grad_norm": 8.293745040893555, + "learning_rate": 8.570634302460707e-06, + "loss": 0.1471, + "step": 10052 + }, + { + "epoch": 0.25439684186552625, + "grad_norm": 16.16700553894043, + "learning_rate": 8.570353218623395e-06, + "loss": 0.2367, + "step": 10053 + }, + { + "epoch": 0.2544221474302199, + "grad_norm": 4.210175037384033, + "learning_rate": 8.570072111761536e-06, + "loss": 0.2428, + "step": 10054 + }, + { + "epoch": 0.25444745299491356, + "grad_norm": 4.497589588165283, + "learning_rate": 8.569790981876948e-06, + "loss": 0.2007, + "step": 10055 + }, + { + "epoch": 0.25447275855960727, + "grad_norm": 6.887942790985107, + "learning_rate": 8.569509828971443e-06, + "loss": 0.2621, + "step": 10056 + }, + { + "epoch": 0.2544980641243009, + "grad_norm": 5.712172508239746, + "learning_rate": 8.569228653046831e-06, + "loss": 0.2288, + "step": 10057 + }, + { + "epoch": 0.2545233696889946, + "grad_norm": 5.068460941314697, + "learning_rate": 8.56894745410493e-06, + "loss": 0.2051, + "step": 10058 + }, + { + "epoch": 0.2545486752536883, + "grad_norm": 11.24236011505127, + "learning_rate": 8.568666232147548e-06, + "loss": 0.1912, + "step": 10059 + }, + { + "epoch": 0.25457398081838195, + "grad_norm": 10.49569320678711, + "learning_rate": 8.568384987176504e-06, + "loss": 0.2273, + "step": 10060 + }, + { + "epoch": 0.25459928638307566, + "grad_norm": 3.513216257095337, + "learning_rate": 8.568103719193605e-06, + "loss": 0.1428, + "step": 10061 + }, + { + "epoch": 0.2546245919477693, + "grad_norm": 4.233588218688965, + "learning_rate": 8.567822428200672e-06, + "loss": 0.1749, + "step": 10062 + }, + { + "epoch": 0.254649897512463, + "grad_norm": 4.657106399536133, + "learning_rate": 8.567541114199516e-06, + "loss": 0.2296, + "step": 10063 + }, + { + "epoch": 0.2546752030771567, + "grad_norm": 5.318558692932129, + "learning_rate": 8.56725977719195e-06, + "loss": 0.2165, + "step": 10064 + }, + { + "epoch": 0.25470050864185034, + "grad_norm": 7.089815616607666, + "learning_rate": 8.566978417179787e-06, + "loss": 0.181, + "step": 10065 + }, + { + "epoch": 0.254725814206544, + "grad_norm": 3.760647773742676, + "learning_rate": 8.566697034164848e-06, + "loss": 0.164, + "step": 10066 + }, + { + "epoch": 0.2547511197712377, + "grad_norm": 9.728596687316895, + "learning_rate": 8.566415628148942e-06, + "loss": 0.2994, + "step": 10067 + }, + { + "epoch": 0.25477642533593137, + "grad_norm": 3.521934986114502, + "learning_rate": 8.566134199133882e-06, + "loss": 0.1414, + "step": 10068 + }, + { + "epoch": 0.254801730900625, + "grad_norm": 5.886298656463623, + "learning_rate": 8.565852747121488e-06, + "loss": 0.1869, + "step": 10069 + }, + { + "epoch": 0.25482703646531873, + "grad_norm": 7.11690092086792, + "learning_rate": 8.565571272113575e-06, + "loss": 0.1827, + "step": 10070 + }, + { + "epoch": 0.2548523420300124, + "grad_norm": 5.254795551300049, + "learning_rate": 8.565289774111953e-06, + "loss": 0.2158, + "step": 10071 + }, + { + "epoch": 0.2548776475947061, + "grad_norm": 3.896702766418457, + "learning_rate": 8.565008253118443e-06, + "loss": 0.1596, + "step": 10072 + }, + { + "epoch": 0.25490295315939976, + "grad_norm": 11.241191864013672, + "learning_rate": 8.564726709134856e-06, + "loss": 0.3888, + "step": 10073 + }, + { + "epoch": 0.2549282587240934, + "grad_norm": 19.062835693359375, + "learning_rate": 8.56444514216301e-06, + "loss": 0.1778, + "step": 10074 + }, + { + "epoch": 0.2549535642887871, + "grad_norm": 11.477234840393066, + "learning_rate": 8.564163552204721e-06, + "loss": 0.3423, + "step": 10075 + }, + { + "epoch": 0.2549788698534808, + "grad_norm": 7.787164688110352, + "learning_rate": 8.563881939261802e-06, + "loss": 0.2458, + "step": 10076 + }, + { + "epoch": 0.25500417541817444, + "grad_norm": 6.6901092529296875, + "learning_rate": 8.563600303336073e-06, + "loss": 0.2861, + "step": 10077 + }, + { + "epoch": 0.25502948098286815, + "grad_norm": 6.840521335601807, + "learning_rate": 8.563318644429348e-06, + "loss": 0.1622, + "step": 10078 + }, + { + "epoch": 0.2550547865475618, + "grad_norm": 6.432548522949219, + "learning_rate": 8.563036962543445e-06, + "loss": 0.1589, + "step": 10079 + }, + { + "epoch": 0.25508009211225546, + "grad_norm": 2.6344776153564453, + "learning_rate": 8.56275525768018e-06, + "loss": 0.1501, + "step": 10080 + }, + { + "epoch": 0.2551053976769492, + "grad_norm": 7.756394386291504, + "learning_rate": 8.562473529841366e-06, + "loss": 0.2712, + "step": 10081 + }, + { + "epoch": 0.25513070324164283, + "grad_norm": 4.386800765991211, + "learning_rate": 8.562191779028825e-06, + "loss": 0.1532, + "step": 10082 + }, + { + "epoch": 0.2551560088063365, + "grad_norm": 6.269327640533447, + "learning_rate": 8.561910005244371e-06, + "loss": 0.1989, + "step": 10083 + }, + { + "epoch": 0.2551813143710302, + "grad_norm": 3.7523326873779297, + "learning_rate": 8.561628208489822e-06, + "loss": 0.137, + "step": 10084 + }, + { + "epoch": 0.25520661993572386, + "grad_norm": 3.267238140106201, + "learning_rate": 8.561346388766997e-06, + "loss": 0.1555, + "step": 10085 + }, + { + "epoch": 0.25523192550041757, + "grad_norm": 4.940674781799316, + "learning_rate": 8.561064546077711e-06, + "loss": 0.2076, + "step": 10086 + }, + { + "epoch": 0.2552572310651112, + "grad_norm": 2.9380996227264404, + "learning_rate": 8.560782680423781e-06, + "loss": 0.1109, + "step": 10087 + }, + { + "epoch": 0.2552825366298049, + "grad_norm": 6.342121601104736, + "learning_rate": 8.560500791807026e-06, + "loss": 0.191, + "step": 10088 + }, + { + "epoch": 0.2553078421944986, + "grad_norm": 2.6429193019866943, + "learning_rate": 8.560218880229264e-06, + "loss": 0.099, + "step": 10089 + }, + { + "epoch": 0.25533314775919225, + "grad_norm": 15.059836387634277, + "learning_rate": 8.559936945692314e-06, + "loss": 0.1936, + "step": 10090 + }, + { + "epoch": 0.2553584533238859, + "grad_norm": 13.312015533447266, + "learning_rate": 8.55965498819799e-06, + "loss": 0.2078, + "step": 10091 + }, + { + "epoch": 0.2553837588885796, + "grad_norm": 7.999607086181641, + "learning_rate": 8.559373007748115e-06, + "loss": 0.2428, + "step": 10092 + }, + { + "epoch": 0.25540906445327327, + "grad_norm": 18.733753204345703, + "learning_rate": 8.559091004344506e-06, + "loss": 0.194, + "step": 10093 + }, + { + "epoch": 0.2554343700179669, + "grad_norm": 9.413012504577637, + "learning_rate": 8.55880897798898e-06, + "loss": 0.2524, + "step": 10094 + }, + { + "epoch": 0.25545967558266064, + "grad_norm": 3.1471621990203857, + "learning_rate": 8.558526928683358e-06, + "loss": 0.0938, + "step": 10095 + }, + { + "epoch": 0.2554849811473543, + "grad_norm": 4.11922550201416, + "learning_rate": 8.558244856429456e-06, + "loss": 0.2025, + "step": 10096 + }, + { + "epoch": 0.255510286712048, + "grad_norm": 5.419373989105225, + "learning_rate": 8.557962761229095e-06, + "loss": 0.2355, + "step": 10097 + }, + { + "epoch": 0.25553559227674166, + "grad_norm": 3.481405258178711, + "learning_rate": 8.557680643084095e-06, + "loss": 0.1042, + "step": 10098 + }, + { + "epoch": 0.2555608978414353, + "grad_norm": 4.112757682800293, + "learning_rate": 8.557398501996275e-06, + "loss": 0.1749, + "step": 10099 + }, + { + "epoch": 0.25558620340612903, + "grad_norm": 3.169080972671509, + "learning_rate": 8.557116337967452e-06, + "loss": 0.1131, + "step": 10100 + }, + { + "epoch": 0.2556115089708227, + "grad_norm": 4.792452812194824, + "learning_rate": 8.55683415099945e-06, + "loss": 0.2102, + "step": 10101 + }, + { + "epoch": 0.25563681453551634, + "grad_norm": 3.957625389099121, + "learning_rate": 8.556551941094084e-06, + "loss": 0.132, + "step": 10102 + }, + { + "epoch": 0.25566212010021006, + "grad_norm": 7.136697292327881, + "learning_rate": 8.556269708253175e-06, + "loss": 0.1728, + "step": 10103 + }, + { + "epoch": 0.2556874256649037, + "grad_norm": 5.028072357177734, + "learning_rate": 8.555987452478546e-06, + "loss": 0.1994, + "step": 10104 + }, + { + "epoch": 0.25571273122959737, + "grad_norm": 7.3652753829956055, + "learning_rate": 8.555705173772015e-06, + "loss": 0.13, + "step": 10105 + }, + { + "epoch": 0.2557380367942911, + "grad_norm": 7.523593902587891, + "learning_rate": 8.555422872135404e-06, + "loss": 0.165, + "step": 10106 + }, + { + "epoch": 0.25576334235898474, + "grad_norm": 3.52194881439209, + "learning_rate": 8.55514054757053e-06, + "loss": 0.112, + "step": 10107 + }, + { + "epoch": 0.2557886479236784, + "grad_norm": 7.465062141418457, + "learning_rate": 8.554858200079218e-06, + "loss": 0.1825, + "step": 10108 + }, + { + "epoch": 0.2558139534883721, + "grad_norm": 7.353593826293945, + "learning_rate": 8.554575829663286e-06, + "loss": 0.1812, + "step": 10109 + }, + { + "epoch": 0.25583925905306576, + "grad_norm": 4.475555419921875, + "learning_rate": 8.554293436324555e-06, + "loss": 0.1381, + "step": 10110 + }, + { + "epoch": 0.25586456461775947, + "grad_norm": 13.898128509521484, + "learning_rate": 8.554011020064847e-06, + "loss": 0.1753, + "step": 10111 + }, + { + "epoch": 0.2558898701824531, + "grad_norm": 9.443598747253418, + "learning_rate": 8.553728580885984e-06, + "loss": 0.3358, + "step": 10112 + }, + { + "epoch": 0.2559151757471468, + "grad_norm": 13.42105484008789, + "learning_rate": 8.553446118789785e-06, + "loss": 0.3238, + "step": 10113 + }, + { + "epoch": 0.2559404813118405, + "grad_norm": 7.052289009094238, + "learning_rate": 8.553163633778075e-06, + "loss": 0.1869, + "step": 10114 + }, + { + "epoch": 0.25596578687653415, + "grad_norm": 11.53709888458252, + "learning_rate": 8.55288112585267e-06, + "loss": 0.3215, + "step": 10115 + }, + { + "epoch": 0.2559910924412278, + "grad_norm": 3.735334634780884, + "learning_rate": 8.552598595015398e-06, + "loss": 0.1628, + "step": 10116 + }, + { + "epoch": 0.2560163980059215, + "grad_norm": 20.568937301635742, + "learning_rate": 8.552316041268079e-06, + "loss": 0.2335, + "step": 10117 + }, + { + "epoch": 0.2560417035706152, + "grad_norm": 4.297054767608643, + "learning_rate": 8.552033464612534e-06, + "loss": 0.2044, + "step": 10118 + }, + { + "epoch": 0.25606700913530883, + "grad_norm": 4.656481742858887, + "learning_rate": 8.551750865050586e-06, + "loss": 0.2006, + "step": 10119 + }, + { + "epoch": 0.25609231470000254, + "grad_norm": 4.579344272613525, + "learning_rate": 8.551468242584058e-06, + "loss": 0.1343, + "step": 10120 + }, + { + "epoch": 0.2561176202646962, + "grad_norm": 2.9524128437042236, + "learning_rate": 8.55118559721477e-06, + "loss": 0.101, + "step": 10121 + }, + { + "epoch": 0.25614292582938986, + "grad_norm": 9.445821762084961, + "learning_rate": 8.550902928944549e-06, + "loss": 0.1953, + "step": 10122 + }, + { + "epoch": 0.25616823139408357, + "grad_norm": 6.995548248291016, + "learning_rate": 8.550620237775213e-06, + "loss": 0.183, + "step": 10123 + }, + { + "epoch": 0.2561935369587772, + "grad_norm": 9.513328552246094, + "learning_rate": 8.550337523708589e-06, + "loss": 0.2422, + "step": 10124 + }, + { + "epoch": 0.25621884252347094, + "grad_norm": 4.541510105133057, + "learning_rate": 8.5500547867465e-06, + "loss": 0.1283, + "step": 10125 + }, + { + "epoch": 0.2562441480881646, + "grad_norm": 3.3051106929779053, + "learning_rate": 8.549772026890766e-06, + "loss": 0.0824, + "step": 10126 + }, + { + "epoch": 0.25626945365285825, + "grad_norm": 2.9913065433502197, + "learning_rate": 8.549489244143213e-06, + "loss": 0.1383, + "step": 10127 + }, + { + "epoch": 0.25629475921755196, + "grad_norm": 3.9603075981140137, + "learning_rate": 8.549206438505664e-06, + "loss": 0.1357, + "step": 10128 + }, + { + "epoch": 0.2563200647822456, + "grad_norm": 3.7941200733184814, + "learning_rate": 8.548923609979945e-06, + "loss": 0.0911, + "step": 10129 + }, + { + "epoch": 0.25634537034693927, + "grad_norm": 6.350209712982178, + "learning_rate": 8.548640758567875e-06, + "loss": 0.2221, + "step": 10130 + }, + { + "epoch": 0.256370675911633, + "grad_norm": 7.281110763549805, + "learning_rate": 8.54835788427128e-06, + "loss": 0.2387, + "step": 10131 + }, + { + "epoch": 0.25639598147632664, + "grad_norm": 3.8910813331604004, + "learning_rate": 8.548074987091987e-06, + "loss": 0.1427, + "step": 10132 + }, + { + "epoch": 0.2564212870410203, + "grad_norm": 5.739709854125977, + "learning_rate": 8.547792067031819e-06, + "loss": 0.2009, + "step": 10133 + }, + { + "epoch": 0.256446592605714, + "grad_norm": 8.450925827026367, + "learning_rate": 8.5475091240926e-06, + "loss": 0.1762, + "step": 10134 + }, + { + "epoch": 0.25647189817040766, + "grad_norm": 5.131655216217041, + "learning_rate": 8.547226158276154e-06, + "loss": 0.1414, + "step": 10135 + }, + { + "epoch": 0.2564972037351014, + "grad_norm": 9.083166122436523, + "learning_rate": 8.546943169584306e-06, + "loss": 0.1708, + "step": 10136 + }, + { + "epoch": 0.25652250929979503, + "grad_norm": 4.310462951660156, + "learning_rate": 8.54666015801888e-06, + "loss": 0.1804, + "step": 10137 + }, + { + "epoch": 0.2565478148644887, + "grad_norm": 6.319972515106201, + "learning_rate": 8.546377123581704e-06, + "loss": 0.1475, + "step": 10138 + }, + { + "epoch": 0.2565731204291824, + "grad_norm": 3.9207231998443604, + "learning_rate": 8.546094066274602e-06, + "loss": 0.1559, + "step": 10139 + }, + { + "epoch": 0.25659842599387606, + "grad_norm": 4.359175682067871, + "learning_rate": 8.545810986099399e-06, + "loss": 0.1384, + "step": 10140 + }, + { + "epoch": 0.2566237315585697, + "grad_norm": 7.36332368850708, + "learning_rate": 8.54552788305792e-06, + "loss": 0.2168, + "step": 10141 + }, + { + "epoch": 0.2566490371232634, + "grad_norm": 4.74539041519165, + "learning_rate": 8.54524475715199e-06, + "loss": 0.188, + "step": 10142 + }, + { + "epoch": 0.2566743426879571, + "grad_norm": 2.812490701675415, + "learning_rate": 8.544961608383437e-06, + "loss": 0.1256, + "step": 10143 + }, + { + "epoch": 0.25669964825265074, + "grad_norm": 4.2463154792785645, + "learning_rate": 8.544678436754085e-06, + "loss": 0.1791, + "step": 10144 + }, + { + "epoch": 0.25672495381734445, + "grad_norm": 7.039431095123291, + "learning_rate": 8.544395242265764e-06, + "loss": 0.273, + "step": 10145 + }, + { + "epoch": 0.2567502593820381, + "grad_norm": 24.478363037109375, + "learning_rate": 8.544112024920295e-06, + "loss": 0.3304, + "step": 10146 + }, + { + "epoch": 0.25677556494673176, + "grad_norm": 6.898641586303711, + "learning_rate": 8.543828784719508e-06, + "loss": 0.1311, + "step": 10147 + }, + { + "epoch": 0.25680087051142547, + "grad_norm": 9.784061431884766, + "learning_rate": 8.543545521665226e-06, + "loss": 0.3455, + "step": 10148 + }, + { + "epoch": 0.25682617607611913, + "grad_norm": 4.3329033851623535, + "learning_rate": 8.54326223575928e-06, + "loss": 0.1678, + "step": 10149 + }, + { + "epoch": 0.25685148164081284, + "grad_norm": 14.013123512268066, + "learning_rate": 8.542978927003495e-06, + "loss": 0.3492, + "step": 10150 + }, + { + "epoch": 0.2568767872055065, + "grad_norm": 6.307936191558838, + "learning_rate": 8.542695595399698e-06, + "loss": 0.1872, + "step": 10151 + }, + { + "epoch": 0.25690209277020015, + "grad_norm": 2.893657684326172, + "learning_rate": 8.542412240949715e-06, + "loss": 0.1292, + "step": 10152 + }, + { + "epoch": 0.25692739833489386, + "grad_norm": 6.031981945037842, + "learning_rate": 8.542128863655377e-06, + "loss": 0.1437, + "step": 10153 + }, + { + "epoch": 0.2569527038995875, + "grad_norm": 8.361865997314453, + "learning_rate": 8.541845463518507e-06, + "loss": 0.3003, + "step": 10154 + }, + { + "epoch": 0.2569780094642812, + "grad_norm": 4.7129058837890625, + "learning_rate": 8.541562040540934e-06, + "loss": 0.1796, + "step": 10155 + }, + { + "epoch": 0.2570033150289749, + "grad_norm": 6.158433437347412, + "learning_rate": 8.541278594724488e-06, + "loss": 0.2363, + "step": 10156 + }, + { + "epoch": 0.25702862059366854, + "grad_norm": 15.917238235473633, + "learning_rate": 8.540995126070993e-06, + "loss": 0.2979, + "step": 10157 + }, + { + "epoch": 0.2570539261583622, + "grad_norm": 8.604390144348145, + "learning_rate": 8.540711634582279e-06, + "loss": 0.2352, + "step": 10158 + }, + { + "epoch": 0.2570792317230559, + "grad_norm": 3.682990789413452, + "learning_rate": 8.540428120260175e-06, + "loss": 0.1594, + "step": 10159 + }, + { + "epoch": 0.25710453728774957, + "grad_norm": 8.409561157226562, + "learning_rate": 8.540144583106509e-06, + "loss": 0.2856, + "step": 10160 + }, + { + "epoch": 0.2571298428524433, + "grad_norm": 9.124186515808105, + "learning_rate": 8.539861023123108e-06, + "loss": 0.2358, + "step": 10161 + }, + { + "epoch": 0.25715514841713694, + "grad_norm": 8.044088363647461, + "learning_rate": 8.539577440311805e-06, + "loss": 0.2305, + "step": 10162 + }, + { + "epoch": 0.2571804539818306, + "grad_norm": 4.1068902015686035, + "learning_rate": 8.539293834674422e-06, + "loss": 0.2073, + "step": 10163 + }, + { + "epoch": 0.2572057595465243, + "grad_norm": 3.31335186958313, + "learning_rate": 8.539010206212791e-06, + "loss": 0.1963, + "step": 10164 + }, + { + "epoch": 0.25723106511121796, + "grad_norm": 6.3824334144592285, + "learning_rate": 8.538726554928744e-06, + "loss": 0.1926, + "step": 10165 + }, + { + "epoch": 0.2572563706759116, + "grad_norm": 8.702608108520508, + "learning_rate": 8.538442880824106e-06, + "loss": 0.1791, + "step": 10166 + }, + { + "epoch": 0.25728167624060533, + "grad_norm": 3.348850727081299, + "learning_rate": 8.538159183900709e-06, + "loss": 0.1638, + "step": 10167 + }, + { + "epoch": 0.257306981805299, + "grad_norm": 4.100390911102295, + "learning_rate": 8.53787546416038e-06, + "loss": 0.1737, + "step": 10168 + }, + { + "epoch": 0.25733228736999264, + "grad_norm": 7.151303291320801, + "learning_rate": 8.537591721604954e-06, + "loss": 0.2607, + "step": 10169 + }, + { + "epoch": 0.25735759293468635, + "grad_norm": 3.627558469772339, + "learning_rate": 8.537307956236254e-06, + "loss": 0.1494, + "step": 10170 + }, + { + "epoch": 0.25738289849938, + "grad_norm": 5.602145195007324, + "learning_rate": 8.537024168056114e-06, + "loss": 0.195, + "step": 10171 + }, + { + "epoch": 0.25740820406407366, + "grad_norm": 5.770579814910889, + "learning_rate": 8.536740357066361e-06, + "loss": 0.2684, + "step": 10172 + }, + { + "epoch": 0.2574335096287674, + "grad_norm": 6.072679042816162, + "learning_rate": 8.53645652326883e-06, + "loss": 0.2464, + "step": 10173 + }, + { + "epoch": 0.25745881519346103, + "grad_norm": 5.34559440612793, + "learning_rate": 8.536172666665348e-06, + "loss": 0.1979, + "step": 10174 + }, + { + "epoch": 0.25748412075815474, + "grad_norm": 8.592824935913086, + "learning_rate": 8.535888787257745e-06, + "loss": 0.1755, + "step": 10175 + }, + { + "epoch": 0.2575094263228484, + "grad_norm": 30.451675415039062, + "learning_rate": 8.535604885047855e-06, + "loss": 0.3606, + "step": 10176 + }, + { + "epoch": 0.25753473188754206, + "grad_norm": 10.50244140625, + "learning_rate": 8.535320960037506e-06, + "loss": 0.27, + "step": 10177 + }, + { + "epoch": 0.25756003745223577, + "grad_norm": 3.866452217102051, + "learning_rate": 8.535037012228528e-06, + "loss": 0.1647, + "step": 10178 + }, + { + "epoch": 0.2575853430169294, + "grad_norm": 7.645660400390625, + "learning_rate": 8.534753041622757e-06, + "loss": 0.1419, + "step": 10179 + }, + { + "epoch": 0.2576106485816231, + "grad_norm": 8.504585266113281, + "learning_rate": 8.534469048222019e-06, + "loss": 0.2263, + "step": 10180 + }, + { + "epoch": 0.2576359541463168, + "grad_norm": 9.096549987792969, + "learning_rate": 8.534185032028146e-06, + "loss": 0.1863, + "step": 10181 + }, + { + "epoch": 0.25766125971101045, + "grad_norm": 3.6712262630462646, + "learning_rate": 8.533900993042974e-06, + "loss": 0.1785, + "step": 10182 + }, + { + "epoch": 0.2576865652757041, + "grad_norm": 19.513879776000977, + "learning_rate": 8.533616931268332e-06, + "loss": 0.3657, + "step": 10183 + }, + { + "epoch": 0.2577118708403978, + "grad_norm": 9.3374605178833, + "learning_rate": 8.53333284670605e-06, + "loss": 0.1496, + "step": 10184 + }, + { + "epoch": 0.2577371764050915, + "grad_norm": 7.051568984985352, + "learning_rate": 8.53304873935796e-06, + "loss": 0.2312, + "step": 10185 + }, + { + "epoch": 0.25776248196978513, + "grad_norm": 6.5381574630737305, + "learning_rate": 8.532764609225902e-06, + "loss": 0.2688, + "step": 10186 + }, + { + "epoch": 0.25778778753447884, + "grad_norm": 5.367345333099365, + "learning_rate": 8.532480456311696e-06, + "loss": 0.178, + "step": 10187 + }, + { + "epoch": 0.2578130930991725, + "grad_norm": 6.278810977935791, + "learning_rate": 8.532196280617184e-06, + "loss": 0.153, + "step": 10188 + }, + { + "epoch": 0.2578383986638662, + "grad_norm": 3.121494770050049, + "learning_rate": 8.531912082144193e-06, + "loss": 0.2166, + "step": 10189 + }, + { + "epoch": 0.25786370422855986, + "grad_norm": 3.511225700378418, + "learning_rate": 8.53162786089456e-06, + "loss": 0.1937, + "step": 10190 + }, + { + "epoch": 0.2578890097932535, + "grad_norm": 9.025677680969238, + "learning_rate": 8.531343616870116e-06, + "loss": 0.3069, + "step": 10191 + }, + { + "epoch": 0.25791431535794723, + "grad_norm": 3.3115878105163574, + "learning_rate": 8.531059350072693e-06, + "loss": 0.1428, + "step": 10192 + }, + { + "epoch": 0.2579396209226409, + "grad_norm": 8.984540939331055, + "learning_rate": 8.530775060504124e-06, + "loss": 0.2413, + "step": 10193 + }, + { + "epoch": 0.25796492648733454, + "grad_norm": 4.833586692810059, + "learning_rate": 8.530490748166245e-06, + "loss": 0.185, + "step": 10194 + }, + { + "epoch": 0.25799023205202826, + "grad_norm": 3.217331647872925, + "learning_rate": 8.530206413060887e-06, + "loss": 0.1492, + "step": 10195 + }, + { + "epoch": 0.2580155376167219, + "grad_norm": 4.671605110168457, + "learning_rate": 8.529922055189885e-06, + "loss": 0.1651, + "step": 10196 + }, + { + "epoch": 0.25804084318141557, + "grad_norm": 7.923281669616699, + "learning_rate": 8.529637674555074e-06, + "loss": 0.2484, + "step": 10197 + }, + { + "epoch": 0.2580661487461093, + "grad_norm": 8.107341766357422, + "learning_rate": 8.529353271158282e-06, + "loss": 0.2296, + "step": 10198 + }, + { + "epoch": 0.25809145431080294, + "grad_norm": 2.766785144805908, + "learning_rate": 8.52906884500135e-06, + "loss": 0.1131, + "step": 10199 + }, + { + "epoch": 0.25811675987549665, + "grad_norm": 4.624353885650635, + "learning_rate": 8.52878439608611e-06, + "loss": 0.1484, + "step": 10200 + }, + { + "epoch": 0.2581420654401903, + "grad_norm": 4.712363243103027, + "learning_rate": 8.528499924414395e-06, + "loss": 0.2315, + "step": 10201 + }, + { + "epoch": 0.25816737100488396, + "grad_norm": 4.82125186920166, + "learning_rate": 8.528215429988041e-06, + "loss": 0.1283, + "step": 10202 + }, + { + "epoch": 0.2581926765695777, + "grad_norm": 4.348428726196289, + "learning_rate": 8.527930912808881e-06, + "loss": 0.2068, + "step": 10203 + }, + { + "epoch": 0.25821798213427133, + "grad_norm": 6.246733665466309, + "learning_rate": 8.52764637287875e-06, + "loss": 0.2136, + "step": 10204 + }, + { + "epoch": 0.258243287698965, + "grad_norm": 8.170109748840332, + "learning_rate": 8.527361810199486e-06, + "loss": 0.1321, + "step": 10205 + }, + { + "epoch": 0.2582685932636587, + "grad_norm": 6.516417980194092, + "learning_rate": 8.527077224772919e-06, + "loss": 0.2193, + "step": 10206 + }, + { + "epoch": 0.25829389882835235, + "grad_norm": 6.625659942626953, + "learning_rate": 8.526792616600889e-06, + "loss": 0.2618, + "step": 10207 + }, + { + "epoch": 0.258319204393046, + "grad_norm": 5.49139404296875, + "learning_rate": 8.52650798568523e-06, + "loss": 0.2268, + "step": 10208 + }, + { + "epoch": 0.2583445099577397, + "grad_norm": 3.6240761280059814, + "learning_rate": 8.526223332027777e-06, + "loss": 0.1143, + "step": 10209 + }, + { + "epoch": 0.2583698155224334, + "grad_norm": 12.150742530822754, + "learning_rate": 8.525938655630364e-06, + "loss": 0.1023, + "step": 10210 + }, + { + "epoch": 0.25839512108712703, + "grad_norm": 5.0894246101379395, + "learning_rate": 8.525653956494829e-06, + "loss": 0.1242, + "step": 10211 + }, + { + "epoch": 0.25842042665182074, + "grad_norm": 3.0759501457214355, + "learning_rate": 8.525369234623009e-06, + "loss": 0.1193, + "step": 10212 + }, + { + "epoch": 0.2584457322165144, + "grad_norm": 3.719278573989868, + "learning_rate": 8.525084490016736e-06, + "loss": 0.0933, + "step": 10213 + }, + { + "epoch": 0.2584710377812081, + "grad_norm": 6.593709468841553, + "learning_rate": 8.52479972267785e-06, + "loss": 0.2123, + "step": 10214 + }, + { + "epoch": 0.25849634334590177, + "grad_norm": 6.861702919006348, + "learning_rate": 8.524514932608187e-06, + "loss": 0.1734, + "step": 10215 + }, + { + "epoch": 0.2585216489105954, + "grad_norm": 5.731334209442139, + "learning_rate": 8.524230119809582e-06, + "loss": 0.2537, + "step": 10216 + }, + { + "epoch": 0.25854695447528914, + "grad_norm": 7.119470119476318, + "learning_rate": 8.523945284283872e-06, + "loss": 0.1757, + "step": 10217 + }, + { + "epoch": 0.2585722600399828, + "grad_norm": 5.162485122680664, + "learning_rate": 8.523660426032897e-06, + "loss": 0.2134, + "step": 10218 + }, + { + "epoch": 0.25859756560467645, + "grad_norm": 6.940744400024414, + "learning_rate": 8.523375545058487e-06, + "loss": 0.1071, + "step": 10219 + }, + { + "epoch": 0.25862287116937016, + "grad_norm": 11.152063369750977, + "learning_rate": 8.523090641362488e-06, + "loss": 0.1766, + "step": 10220 + }, + { + "epoch": 0.2586481767340638, + "grad_norm": 3.931844472885132, + "learning_rate": 8.522805714946729e-06, + "loss": 0.1518, + "step": 10221 + }, + { + "epoch": 0.2586734822987575, + "grad_norm": 5.668583393096924, + "learning_rate": 8.522520765813053e-06, + "loss": 0.1848, + "step": 10222 + }, + { + "epoch": 0.2586987878634512, + "grad_norm": 5.9531354904174805, + "learning_rate": 8.522235793963298e-06, + "loss": 0.2306, + "step": 10223 + }, + { + "epoch": 0.25872409342814484, + "grad_norm": 3.711829423904419, + "learning_rate": 8.521950799399296e-06, + "loss": 0.1406, + "step": 10224 + }, + { + "epoch": 0.25874939899283855, + "grad_norm": 8.464627265930176, + "learning_rate": 8.521665782122889e-06, + "loss": 0.2939, + "step": 10225 + }, + { + "epoch": 0.2587747045575322, + "grad_norm": 3.892896890640259, + "learning_rate": 8.521380742135916e-06, + "loss": 0.1836, + "step": 10226 + }, + { + "epoch": 0.25880001012222587, + "grad_norm": 5.356791019439697, + "learning_rate": 8.521095679440213e-06, + "loss": 0.1768, + "step": 10227 + }, + { + "epoch": 0.2588253156869196, + "grad_norm": 2.72729754447937, + "learning_rate": 8.520810594037618e-06, + "loss": 0.0986, + "step": 10228 + }, + { + "epoch": 0.25885062125161323, + "grad_norm": 6.085396766662598, + "learning_rate": 8.520525485929972e-06, + "loss": 0.2183, + "step": 10229 + }, + { + "epoch": 0.2588759268163069, + "grad_norm": 7.091953754425049, + "learning_rate": 8.520240355119112e-06, + "loss": 0.2394, + "step": 10230 + }, + { + "epoch": 0.2589012323810006, + "grad_norm": 5.087151527404785, + "learning_rate": 8.519955201606876e-06, + "loss": 0.2099, + "step": 10231 + }, + { + "epoch": 0.25892653794569426, + "grad_norm": 4.158445358276367, + "learning_rate": 8.519670025395104e-06, + "loss": 0.1008, + "step": 10232 + }, + { + "epoch": 0.2589518435103879, + "grad_norm": 6.604459762573242, + "learning_rate": 8.519384826485632e-06, + "loss": 0.2454, + "step": 10233 + }, + { + "epoch": 0.2589771490750816, + "grad_norm": 5.262744426727295, + "learning_rate": 8.519099604880305e-06, + "loss": 0.2136, + "step": 10234 + }, + { + "epoch": 0.2590024546397753, + "grad_norm": 7.950146198272705, + "learning_rate": 8.518814360580959e-06, + "loss": 0.2763, + "step": 10235 + }, + { + "epoch": 0.25902776020446894, + "grad_norm": 4.027386665344238, + "learning_rate": 8.518529093589434e-06, + "loss": 0.2063, + "step": 10236 + }, + { + "epoch": 0.25905306576916265, + "grad_norm": 7.410518169403076, + "learning_rate": 8.51824380390757e-06, + "loss": 0.1737, + "step": 10237 + }, + { + "epoch": 0.2590783713338563, + "grad_norm": 3.669050693511963, + "learning_rate": 8.517958491537203e-06, + "loss": 0.1371, + "step": 10238 + }, + { + "epoch": 0.25910367689855, + "grad_norm": 10.087275505065918, + "learning_rate": 8.517673156480178e-06, + "loss": 0.26, + "step": 10239 + }, + { + "epoch": 0.2591289824632437, + "grad_norm": 19.09530258178711, + "learning_rate": 8.517387798738332e-06, + "loss": 0.1571, + "step": 10240 + }, + { + "epoch": 0.25915428802793733, + "grad_norm": 10.634439468383789, + "learning_rate": 8.517102418313508e-06, + "loss": 0.2346, + "step": 10241 + }, + { + "epoch": 0.25917959359263104, + "grad_norm": 5.270219326019287, + "learning_rate": 8.516817015207544e-06, + "loss": 0.1467, + "step": 10242 + }, + { + "epoch": 0.2592048991573247, + "grad_norm": 12.242191314697266, + "learning_rate": 8.516531589422281e-06, + "loss": 0.2717, + "step": 10243 + }, + { + "epoch": 0.25923020472201835, + "grad_norm": 7.873970985412598, + "learning_rate": 8.516246140959561e-06, + "loss": 0.2658, + "step": 10244 + }, + { + "epoch": 0.25925551028671207, + "grad_norm": 3.2659366130828857, + "learning_rate": 8.515960669821222e-06, + "loss": 0.1994, + "step": 10245 + }, + { + "epoch": 0.2592808158514057, + "grad_norm": 8.509653091430664, + "learning_rate": 8.515675176009108e-06, + "loss": 0.1939, + "step": 10246 + }, + { + "epoch": 0.2593061214160994, + "grad_norm": 16.800085067749023, + "learning_rate": 8.51538965952506e-06, + "loss": 0.2762, + "step": 10247 + }, + { + "epoch": 0.2593314269807931, + "grad_norm": 4.212732315063477, + "learning_rate": 8.515104120370915e-06, + "loss": 0.2044, + "step": 10248 + }, + { + "epoch": 0.25935673254548675, + "grad_norm": 4.110699653625488, + "learning_rate": 8.514818558548519e-06, + "loss": 0.1338, + "step": 10249 + }, + { + "epoch": 0.2593820381101804, + "grad_norm": 2.4519128799438477, + "learning_rate": 8.514532974059712e-06, + "loss": 0.1397, + "step": 10250 + }, + { + "epoch": 0.2594073436748741, + "grad_norm": 3.7380659580230713, + "learning_rate": 8.514247366906334e-06, + "loss": 0.0886, + "step": 10251 + }, + { + "epoch": 0.25943264923956777, + "grad_norm": 4.349316596984863, + "learning_rate": 8.51396173709023e-06, + "loss": 0.1952, + "step": 10252 + }, + { + "epoch": 0.2594579548042615, + "grad_norm": 4.12303352355957, + "learning_rate": 8.51367608461324e-06, + "loss": 0.1264, + "step": 10253 + }, + { + "epoch": 0.25948326036895514, + "grad_norm": 5.085989952087402, + "learning_rate": 8.513390409477207e-06, + "loss": 0.1623, + "step": 10254 + }, + { + "epoch": 0.2595085659336488, + "grad_norm": 4.382436752319336, + "learning_rate": 8.513104711683973e-06, + "loss": 0.1449, + "step": 10255 + }, + { + "epoch": 0.2595338714983425, + "grad_norm": 2.296245574951172, + "learning_rate": 8.51281899123538e-06, + "loss": 0.1012, + "step": 10256 + }, + { + "epoch": 0.25955917706303616, + "grad_norm": 6.220541954040527, + "learning_rate": 8.51253324813327e-06, + "loss": 0.224, + "step": 10257 + }, + { + "epoch": 0.2595844826277298, + "grad_norm": 2.845630645751953, + "learning_rate": 8.512247482379488e-06, + "loss": 0.138, + "step": 10258 + }, + { + "epoch": 0.25960978819242353, + "grad_norm": 3.5328593254089355, + "learning_rate": 8.511961693975873e-06, + "loss": 0.1249, + "step": 10259 + }, + { + "epoch": 0.2596350937571172, + "grad_norm": 3.6364076137542725, + "learning_rate": 8.511675882924271e-06, + "loss": 0.1298, + "step": 10260 + }, + { + "epoch": 0.25966039932181084, + "grad_norm": 4.528350830078125, + "learning_rate": 8.511390049226525e-06, + "loss": 0.1605, + "step": 10261 + }, + { + "epoch": 0.25968570488650455, + "grad_norm": 8.750112533569336, + "learning_rate": 8.511104192884479e-06, + "loss": 0.2362, + "step": 10262 + }, + { + "epoch": 0.2597110104511982, + "grad_norm": 2.8430018424987793, + "learning_rate": 8.510818313899974e-06, + "loss": 0.1476, + "step": 10263 + }, + { + "epoch": 0.2597363160158919, + "grad_norm": 6.276622772216797, + "learning_rate": 8.510532412274854e-06, + "loss": 0.3078, + "step": 10264 + }, + { + "epoch": 0.2597616215805856, + "grad_norm": 4.816963195800781, + "learning_rate": 8.510246488010964e-06, + "loss": 0.1264, + "step": 10265 + }, + { + "epoch": 0.25978692714527923, + "grad_norm": 20.64894676208496, + "learning_rate": 8.509960541110148e-06, + "loss": 0.2342, + "step": 10266 + }, + { + "epoch": 0.25981223270997295, + "grad_norm": 4.462568283081055, + "learning_rate": 8.50967457157425e-06, + "loss": 0.1113, + "step": 10267 + }, + { + "epoch": 0.2598375382746666, + "grad_norm": 5.34818696975708, + "learning_rate": 8.50938857940511e-06, + "loss": 0.1169, + "step": 10268 + }, + { + "epoch": 0.25986284383936026, + "grad_norm": 11.29842758178711, + "learning_rate": 8.509102564604578e-06, + "loss": 0.2113, + "step": 10269 + }, + { + "epoch": 0.25988814940405397, + "grad_norm": 3.768117904663086, + "learning_rate": 8.508816527174496e-06, + "loss": 0.1317, + "step": 10270 + }, + { + "epoch": 0.2599134549687476, + "grad_norm": 10.54957389831543, + "learning_rate": 8.50853046711671e-06, + "loss": 0.3323, + "step": 10271 + }, + { + "epoch": 0.2599387605334413, + "grad_norm": 4.39597225189209, + "learning_rate": 8.508244384433064e-06, + "loss": 0.1863, + "step": 10272 + }, + { + "epoch": 0.259964066098135, + "grad_norm": 8.409516334533691, + "learning_rate": 8.5079582791254e-06, + "loss": 0.288, + "step": 10273 + }, + { + "epoch": 0.25998937166282865, + "grad_norm": 3.9883484840393066, + "learning_rate": 8.507672151195569e-06, + "loss": 0.1217, + "step": 10274 + }, + { + "epoch": 0.2600146772275223, + "grad_norm": 5.764760971069336, + "learning_rate": 8.50738600064541e-06, + "loss": 0.2061, + "step": 10275 + }, + { + "epoch": 0.260039982792216, + "grad_norm": 4.677880764007568, + "learning_rate": 8.507099827476772e-06, + "loss": 0.1957, + "step": 10276 + }, + { + "epoch": 0.2600652883569097, + "grad_norm": 5.707921981811523, + "learning_rate": 8.5068136316915e-06, + "loss": 0.2243, + "step": 10277 + }, + { + "epoch": 0.2600905939216034, + "grad_norm": 12.734086990356445, + "learning_rate": 8.50652741329144e-06, + "loss": 0.2608, + "step": 10278 + }, + { + "epoch": 0.26011589948629704, + "grad_norm": 25.44917869567871, + "learning_rate": 8.506241172278435e-06, + "loss": 0.3324, + "step": 10279 + }, + { + "epoch": 0.2601412050509907, + "grad_norm": 4.703967094421387, + "learning_rate": 8.505954908654335e-06, + "loss": 0.1019, + "step": 10280 + }, + { + "epoch": 0.2601665106156844, + "grad_norm": 5.107739448547363, + "learning_rate": 8.505668622420982e-06, + "loss": 0.1156, + "step": 10281 + }, + { + "epoch": 0.26019181618037807, + "grad_norm": 7.858736991882324, + "learning_rate": 8.505382313580226e-06, + "loss": 0.2556, + "step": 10282 + }, + { + "epoch": 0.2602171217450717, + "grad_norm": 14.535285949707031, + "learning_rate": 8.50509598213391e-06, + "loss": 0.2722, + "step": 10283 + }, + { + "epoch": 0.26024242730976543, + "grad_norm": 3.200212240219116, + "learning_rate": 8.504809628083882e-06, + "loss": 0.1318, + "step": 10284 + }, + { + "epoch": 0.2602677328744591, + "grad_norm": 6.103607654571533, + "learning_rate": 8.504523251431989e-06, + "loss": 0.2337, + "step": 10285 + }, + { + "epoch": 0.26029303843915275, + "grad_norm": 12.692668914794922, + "learning_rate": 8.504236852180076e-06, + "loss": 0.2983, + "step": 10286 + }, + { + "epoch": 0.26031834400384646, + "grad_norm": 5.886824607849121, + "learning_rate": 8.503950430329995e-06, + "loss": 0.1431, + "step": 10287 + }, + { + "epoch": 0.2603436495685401, + "grad_norm": 4.610296249389648, + "learning_rate": 8.503663985883585e-06, + "loss": 0.1761, + "step": 10288 + }, + { + "epoch": 0.26036895513323377, + "grad_norm": 10.084341049194336, + "learning_rate": 8.503377518842701e-06, + "loss": 0.2584, + "step": 10289 + }, + { + "epoch": 0.2603942606979275, + "grad_norm": 3.997173547744751, + "learning_rate": 8.503091029209186e-06, + "loss": 0.1005, + "step": 10290 + }, + { + "epoch": 0.26041956626262114, + "grad_norm": 7.567362308502197, + "learning_rate": 8.502804516984888e-06, + "loss": 0.2046, + "step": 10291 + }, + { + "epoch": 0.26044487182731485, + "grad_norm": 4.794182300567627, + "learning_rate": 8.502517982171657e-06, + "loss": 0.1935, + "step": 10292 + }, + { + "epoch": 0.2604701773920085, + "grad_norm": 3.788034439086914, + "learning_rate": 8.502231424771337e-06, + "loss": 0.1492, + "step": 10293 + }, + { + "epoch": 0.26049548295670216, + "grad_norm": 4.4905686378479, + "learning_rate": 8.50194484478578e-06, + "loss": 0.2144, + "step": 10294 + }, + { + "epoch": 0.2605207885213959, + "grad_norm": 7.182879447937012, + "learning_rate": 8.50165824221683e-06, + "loss": 0.2634, + "step": 10295 + }, + { + "epoch": 0.26054609408608953, + "grad_norm": 11.931489944458008, + "learning_rate": 8.50137161706634e-06, + "loss": 0.2594, + "step": 10296 + }, + { + "epoch": 0.2605713996507832, + "grad_norm": 5.631324768066406, + "learning_rate": 8.501084969336153e-06, + "loss": 0.2033, + "step": 10297 + }, + { + "epoch": 0.2605967052154769, + "grad_norm": 5.411634922027588, + "learning_rate": 8.500798299028122e-06, + "loss": 0.2001, + "step": 10298 + }, + { + "epoch": 0.26062201078017055, + "grad_norm": 21.99734878540039, + "learning_rate": 8.500511606144093e-06, + "loss": 0.2959, + "step": 10299 + }, + { + "epoch": 0.2606473163448642, + "grad_norm": 5.282745361328125, + "learning_rate": 8.500224890685918e-06, + "loss": 0.1809, + "step": 10300 + }, + { + "epoch": 0.2606726219095579, + "grad_norm": 3.9418578147888184, + "learning_rate": 8.499938152655441e-06, + "loss": 0.1466, + "step": 10301 + }, + { + "epoch": 0.2606979274742516, + "grad_norm": 5.179017543792725, + "learning_rate": 8.499651392054514e-06, + "loss": 0.2069, + "step": 10302 + }, + { + "epoch": 0.2607232330389453, + "grad_norm": 7.244589328765869, + "learning_rate": 8.499364608884987e-06, + "loss": 0.1357, + "step": 10303 + }, + { + "epoch": 0.26074853860363895, + "grad_norm": 7.742295742034912, + "learning_rate": 8.499077803148709e-06, + "loss": 0.1902, + "step": 10304 + }, + { + "epoch": 0.2607738441683326, + "grad_norm": 10.189184188842773, + "learning_rate": 8.498790974847529e-06, + "loss": 0.1956, + "step": 10305 + }, + { + "epoch": 0.2607991497330263, + "grad_norm": 6.105958461761475, + "learning_rate": 8.498504123983295e-06, + "loss": 0.1802, + "step": 10306 + }, + { + "epoch": 0.26082445529771997, + "grad_norm": 5.469505786895752, + "learning_rate": 8.498217250557863e-06, + "loss": 0.1912, + "step": 10307 + }, + { + "epoch": 0.2608497608624136, + "grad_norm": 4.100525856018066, + "learning_rate": 8.497930354573075e-06, + "loss": 0.2016, + "step": 10308 + }, + { + "epoch": 0.26087506642710734, + "grad_norm": 2.355360746383667, + "learning_rate": 8.497643436030785e-06, + "loss": 0.0985, + "step": 10309 + }, + { + "epoch": 0.260900371991801, + "grad_norm": 6.273809909820557, + "learning_rate": 8.497356494932844e-06, + "loss": 0.1673, + "step": 10310 + }, + { + "epoch": 0.26092567755649465, + "grad_norm": 5.282484531402588, + "learning_rate": 8.497069531281101e-06, + "loss": 0.2343, + "step": 10311 + }, + { + "epoch": 0.26095098312118836, + "grad_norm": 8.18370532989502, + "learning_rate": 8.496782545077408e-06, + "loss": 0.1512, + "step": 10312 + }, + { + "epoch": 0.260976288685882, + "grad_norm": 3.6563613414764404, + "learning_rate": 8.496495536323614e-06, + "loss": 0.1273, + "step": 10313 + }, + { + "epoch": 0.2610015942505757, + "grad_norm": 5.262667655944824, + "learning_rate": 8.496208505021572e-06, + "loss": 0.2163, + "step": 10314 + }, + { + "epoch": 0.2610268998152694, + "grad_norm": 15.399640083312988, + "learning_rate": 8.49592145117313e-06, + "loss": 0.2466, + "step": 10315 + }, + { + "epoch": 0.26105220537996304, + "grad_norm": 3.1625916957855225, + "learning_rate": 8.495634374780141e-06, + "loss": 0.1633, + "step": 10316 + }, + { + "epoch": 0.26107751094465675, + "grad_norm": 6.561440944671631, + "learning_rate": 8.495347275844457e-06, + "loss": 0.1895, + "step": 10317 + }, + { + "epoch": 0.2611028165093504, + "grad_norm": 10.87917423248291, + "learning_rate": 8.49506015436793e-06, + "loss": 0.3846, + "step": 10318 + }, + { + "epoch": 0.26112812207404407, + "grad_norm": 4.048357963562012, + "learning_rate": 8.494773010352406e-06, + "loss": 0.1716, + "step": 10319 + }, + { + "epoch": 0.2611534276387378, + "grad_norm": 12.053279876708984, + "learning_rate": 8.494485843799745e-06, + "loss": 0.2228, + "step": 10320 + }, + { + "epoch": 0.26117873320343143, + "grad_norm": 11.19942569732666, + "learning_rate": 8.494198654711791e-06, + "loss": 0.2437, + "step": 10321 + }, + { + "epoch": 0.2612040387681251, + "grad_norm": 3.7910003662109375, + "learning_rate": 8.493911443090404e-06, + "loss": 0.148, + "step": 10322 + }, + { + "epoch": 0.2612293443328188, + "grad_norm": 5.196164131164551, + "learning_rate": 8.49362420893743e-06, + "loss": 0.236, + "step": 10323 + }, + { + "epoch": 0.26125464989751246, + "grad_norm": 3.6011412143707275, + "learning_rate": 8.493336952254723e-06, + "loss": 0.1363, + "step": 10324 + }, + { + "epoch": 0.2612799554622061, + "grad_norm": 7.080883979797363, + "learning_rate": 8.493049673044135e-06, + "loss": 0.2606, + "step": 10325 + }, + { + "epoch": 0.2613052610268998, + "grad_norm": 5.862204551696777, + "learning_rate": 8.492762371307521e-06, + "loss": 0.3044, + "step": 10326 + }, + { + "epoch": 0.2613305665915935, + "grad_norm": 4.138370513916016, + "learning_rate": 8.492475047046733e-06, + "loss": 0.1626, + "step": 10327 + }, + { + "epoch": 0.2613558721562872, + "grad_norm": 8.052160263061523, + "learning_rate": 8.49218770026362e-06, + "loss": 0.1973, + "step": 10328 + }, + { + "epoch": 0.26138117772098085, + "grad_norm": 6.2708306312561035, + "learning_rate": 8.49190033096004e-06, + "loss": 0.2126, + "step": 10329 + }, + { + "epoch": 0.2614064832856745, + "grad_norm": 6.954400062561035, + "learning_rate": 8.491612939137846e-06, + "loss": 0.1406, + "step": 10330 + }, + { + "epoch": 0.2614317888503682, + "grad_norm": 4.780607223510742, + "learning_rate": 8.491325524798888e-06, + "loss": 0.13, + "step": 10331 + }, + { + "epoch": 0.2614570944150619, + "grad_norm": 4.636606216430664, + "learning_rate": 8.49103808794502e-06, + "loss": 0.2597, + "step": 10332 + }, + { + "epoch": 0.26148239997975553, + "grad_norm": 7.213512420654297, + "learning_rate": 8.490750628578098e-06, + "loss": 0.1718, + "step": 10333 + }, + { + "epoch": 0.26150770554444924, + "grad_norm": 4.827089786529541, + "learning_rate": 8.490463146699974e-06, + "loss": 0.2899, + "step": 10334 + }, + { + "epoch": 0.2615330111091429, + "grad_norm": 32.12724685668945, + "learning_rate": 8.490175642312504e-06, + "loss": 0.2883, + "step": 10335 + }, + { + "epoch": 0.26155831667383656, + "grad_norm": 2.043424129486084, + "learning_rate": 8.489888115417538e-06, + "loss": 0.1486, + "step": 10336 + }, + { + "epoch": 0.26158362223853027, + "grad_norm": 3.8654890060424805, + "learning_rate": 8.489600566016935e-06, + "loss": 0.1509, + "step": 10337 + }, + { + "epoch": 0.2616089278032239, + "grad_norm": 4.874197483062744, + "learning_rate": 8.489312994112546e-06, + "loss": 0.1701, + "step": 10338 + }, + { + "epoch": 0.2616342333679176, + "grad_norm": 3.03303861618042, + "learning_rate": 8.489025399706226e-06, + "loss": 0.118, + "step": 10339 + }, + { + "epoch": 0.2616595389326113, + "grad_norm": 3.9509079456329346, + "learning_rate": 8.488737782799832e-06, + "loss": 0.1874, + "step": 10340 + }, + { + "epoch": 0.26168484449730495, + "grad_norm": 2.3906021118164062, + "learning_rate": 8.488450143395216e-06, + "loss": 0.0904, + "step": 10341 + }, + { + "epoch": 0.26171015006199866, + "grad_norm": 5.9672675132751465, + "learning_rate": 8.488162481494233e-06, + "loss": 0.1859, + "step": 10342 + }, + { + "epoch": 0.2617354556266923, + "grad_norm": 4.063562870025635, + "learning_rate": 8.487874797098738e-06, + "loss": 0.2038, + "step": 10343 + }, + { + "epoch": 0.26176076119138597, + "grad_norm": 2.258735418319702, + "learning_rate": 8.48758709021059e-06, + "loss": 0.0718, + "step": 10344 + }, + { + "epoch": 0.2617860667560797, + "grad_norm": 8.034360885620117, + "learning_rate": 8.487299360831641e-06, + "loss": 0.2163, + "step": 10345 + }, + { + "epoch": 0.26181137232077334, + "grad_norm": 4.270712852478027, + "learning_rate": 8.487011608963746e-06, + "loss": 0.1349, + "step": 10346 + }, + { + "epoch": 0.261836677885467, + "grad_norm": 4.6165995597839355, + "learning_rate": 8.486723834608764e-06, + "loss": 0.1439, + "step": 10347 + }, + { + "epoch": 0.2618619834501607, + "grad_norm": 3.9792897701263428, + "learning_rate": 8.486436037768546e-06, + "loss": 0.1734, + "step": 10348 + }, + { + "epoch": 0.26188728901485436, + "grad_norm": 5.353747367858887, + "learning_rate": 8.486148218444952e-06, + "loss": 0.1249, + "step": 10349 + }, + { + "epoch": 0.261912594579548, + "grad_norm": 4.788697242736816, + "learning_rate": 8.485860376639836e-06, + "loss": 0.2082, + "step": 10350 + }, + { + "epoch": 0.26193790014424173, + "grad_norm": 9.623342514038086, + "learning_rate": 8.485572512355055e-06, + "loss": 0.3432, + "step": 10351 + }, + { + "epoch": 0.2619632057089354, + "grad_norm": 15.578344345092773, + "learning_rate": 8.485284625592464e-06, + "loss": 0.2736, + "step": 10352 + }, + { + "epoch": 0.26198851127362904, + "grad_norm": 10.008689880371094, + "learning_rate": 8.484996716353924e-06, + "loss": 0.1607, + "step": 10353 + }, + { + "epoch": 0.26201381683832276, + "grad_norm": 4.854345321655273, + "learning_rate": 8.484708784641285e-06, + "loss": 0.0987, + "step": 10354 + }, + { + "epoch": 0.2620391224030164, + "grad_norm": 2.5146894454956055, + "learning_rate": 8.484420830456408e-06, + "loss": 0.1203, + "step": 10355 + }, + { + "epoch": 0.2620644279677101, + "grad_norm": 4.785123825073242, + "learning_rate": 8.48413285380115e-06, + "loss": 0.2477, + "step": 10356 + }, + { + "epoch": 0.2620897335324038, + "grad_norm": 4.595946311950684, + "learning_rate": 8.483844854677367e-06, + "loss": 0.1354, + "step": 10357 + }, + { + "epoch": 0.26211503909709744, + "grad_norm": 9.337820053100586, + "learning_rate": 8.483556833086918e-06, + "loss": 0.2801, + "step": 10358 + }, + { + "epoch": 0.26214034466179115, + "grad_norm": 16.291166305541992, + "learning_rate": 8.483268789031658e-06, + "loss": 0.3037, + "step": 10359 + }, + { + "epoch": 0.2621656502264848, + "grad_norm": 5.614597797393799, + "learning_rate": 8.482980722513446e-06, + "loss": 0.1303, + "step": 10360 + }, + { + "epoch": 0.26219095579117846, + "grad_norm": 23.265731811523438, + "learning_rate": 8.482692633534138e-06, + "loss": 0.2424, + "step": 10361 + }, + { + "epoch": 0.26221626135587217, + "grad_norm": 5.698431491851807, + "learning_rate": 8.482404522095592e-06, + "loss": 0.1961, + "step": 10362 + }, + { + "epoch": 0.2622415669205658, + "grad_norm": 28.222240447998047, + "learning_rate": 8.482116388199669e-06, + "loss": 0.3482, + "step": 10363 + }, + { + "epoch": 0.2622668724852595, + "grad_norm": 5.061816692352295, + "learning_rate": 8.481828231848226e-06, + "loss": 0.1799, + "step": 10364 + }, + { + "epoch": 0.2622921780499532, + "grad_norm": 7.29575252532959, + "learning_rate": 8.481540053043119e-06, + "loss": 0.193, + "step": 10365 + }, + { + "epoch": 0.26231748361464685, + "grad_norm": 4.608319282531738, + "learning_rate": 8.481251851786207e-06, + "loss": 0.1784, + "step": 10366 + }, + { + "epoch": 0.26234278917934056, + "grad_norm": 6.873600959777832, + "learning_rate": 8.480963628079351e-06, + "loss": 0.1577, + "step": 10367 + }, + { + "epoch": 0.2623680947440342, + "grad_norm": 5.474859714508057, + "learning_rate": 8.480675381924406e-06, + "loss": 0.1599, + "step": 10368 + }, + { + "epoch": 0.2623934003087279, + "grad_norm": 7.645293235778809, + "learning_rate": 8.480387113323236e-06, + "loss": 0.2155, + "step": 10369 + }, + { + "epoch": 0.2624187058734216, + "grad_norm": 3.3457751274108887, + "learning_rate": 8.480098822277694e-06, + "loss": 0.149, + "step": 10370 + }, + { + "epoch": 0.26244401143811524, + "grad_norm": 3.3271284103393555, + "learning_rate": 8.479810508789642e-06, + "loss": 0.1316, + "step": 10371 + }, + { + "epoch": 0.2624693170028089, + "grad_norm": 8.862865447998047, + "learning_rate": 8.479522172860941e-06, + "loss": 0.2628, + "step": 10372 + }, + { + "epoch": 0.2624946225675026, + "grad_norm": 7.119082927703857, + "learning_rate": 8.47923381449345e-06, + "loss": 0.2348, + "step": 10373 + }, + { + "epoch": 0.26251992813219627, + "grad_norm": 6.726749897003174, + "learning_rate": 8.478945433689025e-06, + "loss": 0.1923, + "step": 10374 + }, + { + "epoch": 0.2625452336968899, + "grad_norm": 3.4838321208953857, + "learning_rate": 8.478657030449528e-06, + "loss": 0.1572, + "step": 10375 + }, + { + "epoch": 0.26257053926158364, + "grad_norm": 4.618091583251953, + "learning_rate": 8.47836860477682e-06, + "loss": 0.1463, + "step": 10376 + }, + { + "epoch": 0.2625958448262773, + "grad_norm": 8.756314277648926, + "learning_rate": 8.478080156672758e-06, + "loss": 0.1761, + "step": 10377 + }, + { + "epoch": 0.26262115039097095, + "grad_norm": 9.376907348632812, + "learning_rate": 8.477791686139207e-06, + "loss": 0.1807, + "step": 10378 + }, + { + "epoch": 0.26264645595566466, + "grad_norm": 3.2621817588806152, + "learning_rate": 8.477503193178023e-06, + "loss": 0.0939, + "step": 10379 + }, + { + "epoch": 0.2626717615203583, + "grad_norm": 6.888204097747803, + "learning_rate": 8.477214677791067e-06, + "loss": 0.1998, + "step": 10380 + }, + { + "epoch": 0.262697067085052, + "grad_norm": 23.762081146240234, + "learning_rate": 8.476926139980202e-06, + "loss": 0.1835, + "step": 10381 + }, + { + "epoch": 0.2627223726497457, + "grad_norm": 9.000612258911133, + "learning_rate": 8.476637579747285e-06, + "loss": 0.2539, + "step": 10382 + }, + { + "epoch": 0.26274767821443934, + "grad_norm": 5.111517429351807, + "learning_rate": 8.47634899709418e-06, + "loss": 0.1483, + "step": 10383 + }, + { + "epoch": 0.26277298377913305, + "grad_norm": 6.931931018829346, + "learning_rate": 8.476060392022747e-06, + "loss": 0.1371, + "step": 10384 + }, + { + "epoch": 0.2627982893438267, + "grad_norm": 4.738119602203369, + "learning_rate": 8.475771764534849e-06, + "loss": 0.1684, + "step": 10385 + }, + { + "epoch": 0.26282359490852036, + "grad_norm": 5.76654052734375, + "learning_rate": 8.475483114632342e-06, + "loss": 0.1996, + "step": 10386 + }, + { + "epoch": 0.2628489004732141, + "grad_norm": 9.878705978393555, + "learning_rate": 8.475194442317092e-06, + "loss": 0.2952, + "step": 10387 + }, + { + "epoch": 0.26287420603790773, + "grad_norm": 4.171550750732422, + "learning_rate": 8.47490574759096e-06, + "loss": 0.2159, + "step": 10388 + }, + { + "epoch": 0.2628995116026014, + "grad_norm": 7.352310657501221, + "learning_rate": 8.474617030455807e-06, + "loss": 0.2236, + "step": 10389 + }, + { + "epoch": 0.2629248171672951, + "grad_norm": 4.805974006652832, + "learning_rate": 8.474328290913496e-06, + "loss": 0.1407, + "step": 10390 + }, + { + "epoch": 0.26295012273198876, + "grad_norm": 6.4951863288879395, + "learning_rate": 8.474039528965888e-06, + "loss": 0.2679, + "step": 10391 + }, + { + "epoch": 0.26297542829668247, + "grad_norm": 4.858984470367432, + "learning_rate": 8.473750744614844e-06, + "loss": 0.0951, + "step": 10392 + }, + { + "epoch": 0.2630007338613761, + "grad_norm": 6.033475399017334, + "learning_rate": 8.473461937862228e-06, + "loss": 0.144, + "step": 10393 + }, + { + "epoch": 0.2630260394260698, + "grad_norm": 15.284578323364258, + "learning_rate": 8.4731731087099e-06, + "loss": 0.1999, + "step": 10394 + }, + { + "epoch": 0.2630513449907635, + "grad_norm": 14.0071439743042, + "learning_rate": 8.472884257159727e-06, + "loss": 0.2573, + "step": 10395 + }, + { + "epoch": 0.26307665055545715, + "grad_norm": 8.204195976257324, + "learning_rate": 8.47259538321357e-06, + "loss": 0.2036, + "step": 10396 + }, + { + "epoch": 0.2631019561201508, + "grad_norm": 7.6877570152282715, + "learning_rate": 8.472306486873291e-06, + "loss": 0.2695, + "step": 10397 + }, + { + "epoch": 0.2631272616848445, + "grad_norm": 4.010733127593994, + "learning_rate": 8.472017568140754e-06, + "loss": 0.1374, + "step": 10398 + }, + { + "epoch": 0.26315256724953817, + "grad_norm": 9.563157081604004, + "learning_rate": 8.47172862701782e-06, + "loss": 0.2615, + "step": 10399 + }, + { + "epoch": 0.26317787281423183, + "grad_norm": 5.533722877502441, + "learning_rate": 8.471439663506356e-06, + "loss": 0.1639, + "step": 10400 + }, + { + "epoch": 0.26320317837892554, + "grad_norm": 7.677048683166504, + "learning_rate": 8.47115067760822e-06, + "loss": 0.1052, + "step": 10401 + }, + { + "epoch": 0.2632284839436192, + "grad_norm": 3.8037450313568115, + "learning_rate": 8.470861669325281e-06, + "loss": 0.1824, + "step": 10402 + }, + { + "epoch": 0.26325378950831285, + "grad_norm": 4.649457931518555, + "learning_rate": 8.470572638659401e-06, + "loss": 0.1797, + "step": 10403 + }, + { + "epoch": 0.26327909507300656, + "grad_norm": 4.38084077835083, + "learning_rate": 8.470283585612445e-06, + "loss": 0.2564, + "step": 10404 + }, + { + "epoch": 0.2633044006377002, + "grad_norm": 4.086110591888428, + "learning_rate": 8.469994510186273e-06, + "loss": 0.1739, + "step": 10405 + }, + { + "epoch": 0.26332970620239393, + "grad_norm": 15.115222930908203, + "learning_rate": 8.469705412382752e-06, + "loss": 0.3742, + "step": 10406 + }, + { + "epoch": 0.2633550117670876, + "grad_norm": 5.575117588043213, + "learning_rate": 8.469416292203747e-06, + "loss": 0.1658, + "step": 10407 + }, + { + "epoch": 0.26338031733178124, + "grad_norm": 3.9042184352874756, + "learning_rate": 8.469127149651122e-06, + "loss": 0.1483, + "step": 10408 + }, + { + "epoch": 0.26340562289647496, + "grad_norm": 3.4874379634857178, + "learning_rate": 8.46883798472674e-06, + "loss": 0.1658, + "step": 10409 + }, + { + "epoch": 0.2634309284611686, + "grad_norm": 8.545494079589844, + "learning_rate": 8.468548797432468e-06, + "loss": 0.2593, + "step": 10410 + }, + { + "epoch": 0.26345623402586227, + "grad_norm": 8.94041919708252, + "learning_rate": 8.468259587770171e-06, + "loss": 0.1872, + "step": 10411 + }, + { + "epoch": 0.263481539590556, + "grad_norm": 5.413268566131592, + "learning_rate": 8.467970355741712e-06, + "loss": 0.1801, + "step": 10412 + }, + { + "epoch": 0.26350684515524964, + "grad_norm": 8.20463752746582, + "learning_rate": 8.467681101348957e-06, + "loss": 0.1518, + "step": 10413 + }, + { + "epoch": 0.2635321507199433, + "grad_norm": 6.054373741149902, + "learning_rate": 8.467391824593772e-06, + "loss": 0.2185, + "step": 10414 + }, + { + "epoch": 0.263557456284637, + "grad_norm": 7.471255779266357, + "learning_rate": 8.467102525478023e-06, + "loss": 0.2014, + "step": 10415 + }, + { + "epoch": 0.26358276184933066, + "grad_norm": 6.7190165519714355, + "learning_rate": 8.466813204003573e-06, + "loss": 0.1993, + "step": 10416 + }, + { + "epoch": 0.2636080674140243, + "grad_norm": 11.482385635375977, + "learning_rate": 8.46652386017229e-06, + "loss": 0.2911, + "step": 10417 + }, + { + "epoch": 0.26363337297871803, + "grad_norm": 5.627030849456787, + "learning_rate": 8.46623449398604e-06, + "loss": 0.2267, + "step": 10418 + }, + { + "epoch": 0.2636586785434117, + "grad_norm": 7.296575546264648, + "learning_rate": 8.465945105446687e-06, + "loss": 0.2097, + "step": 10419 + }, + { + "epoch": 0.2636839841081054, + "grad_norm": 8.015409469604492, + "learning_rate": 8.465655694556101e-06, + "loss": 0.2402, + "step": 10420 + }, + { + "epoch": 0.26370928967279905, + "grad_norm": 4.468583106994629, + "learning_rate": 8.465366261316146e-06, + "loss": 0.1689, + "step": 10421 + }, + { + "epoch": 0.2637345952374927, + "grad_norm": 5.03550910949707, + "learning_rate": 8.465076805728687e-06, + "loss": 0.1611, + "step": 10422 + }, + { + "epoch": 0.2637599008021864, + "grad_norm": 3.1216955184936523, + "learning_rate": 8.464787327795593e-06, + "loss": 0.0708, + "step": 10423 + }, + { + "epoch": 0.2637852063668801, + "grad_norm": 3.4966015815734863, + "learning_rate": 8.464497827518731e-06, + "loss": 0.1572, + "step": 10424 + }, + { + "epoch": 0.26381051193157373, + "grad_norm": 4.738998889923096, + "learning_rate": 8.464208304899966e-06, + "loss": 0.2079, + "step": 10425 + }, + { + "epoch": 0.26383581749626744, + "grad_norm": 3.915222644805908, + "learning_rate": 8.463918759941165e-06, + "loss": 0.1716, + "step": 10426 + }, + { + "epoch": 0.2638611230609611, + "grad_norm": 11.615762710571289, + "learning_rate": 8.463629192644199e-06, + "loss": 0.245, + "step": 10427 + }, + { + "epoch": 0.26388642862565476, + "grad_norm": 3.9855635166168213, + "learning_rate": 8.46333960301093e-06, + "loss": 0.2227, + "step": 10428 + }, + { + "epoch": 0.26391173419034847, + "grad_norm": 3.461153745651245, + "learning_rate": 8.463049991043228e-06, + "loss": 0.1669, + "step": 10429 + }, + { + "epoch": 0.2639370397550421, + "grad_norm": 4.281308174133301, + "learning_rate": 8.462760356742963e-06, + "loss": 0.1659, + "step": 10430 + }, + { + "epoch": 0.26396234531973584, + "grad_norm": 6.220182418823242, + "learning_rate": 8.462470700111998e-06, + "loss": 0.216, + "step": 10431 + }, + { + "epoch": 0.2639876508844295, + "grad_norm": 2.718522310256958, + "learning_rate": 8.462181021152205e-06, + "loss": 0.1523, + "step": 10432 + }, + { + "epoch": 0.26401295644912315, + "grad_norm": 7.214269638061523, + "learning_rate": 8.461891319865451e-06, + "loss": 0.2392, + "step": 10433 + }, + { + "epoch": 0.26403826201381686, + "grad_norm": 4.656429290771484, + "learning_rate": 8.461601596253603e-06, + "loss": 0.1844, + "step": 10434 + }, + { + "epoch": 0.2640635675785105, + "grad_norm": 4.494876861572266, + "learning_rate": 8.46131185031853e-06, + "loss": 0.166, + "step": 10435 + }, + { + "epoch": 0.2640888731432042, + "grad_norm": 3.9903573989868164, + "learning_rate": 8.461022082062103e-06, + "loss": 0.1959, + "step": 10436 + }, + { + "epoch": 0.2641141787078979, + "grad_norm": 8.801342010498047, + "learning_rate": 8.460732291486186e-06, + "loss": 0.353, + "step": 10437 + }, + { + "epoch": 0.26413948427259154, + "grad_norm": 3.481517791748047, + "learning_rate": 8.460442478592649e-06, + "loss": 0.1257, + "step": 10438 + }, + { + "epoch": 0.2641647898372852, + "grad_norm": 9.204092979431152, + "learning_rate": 8.460152643383366e-06, + "loss": 0.2911, + "step": 10439 + }, + { + "epoch": 0.2641900954019789, + "grad_norm": 3.0649404525756836, + "learning_rate": 8.4598627858602e-06, + "loss": 0.1618, + "step": 10440 + }, + { + "epoch": 0.26421540096667256, + "grad_norm": 6.2808637619018555, + "learning_rate": 8.459572906025022e-06, + "loss": 0.1792, + "step": 10441 + }, + { + "epoch": 0.2642407065313662, + "grad_norm": 4.886424541473389, + "learning_rate": 8.459283003879703e-06, + "loss": 0.2349, + "step": 10442 + }, + { + "epoch": 0.26426601209605993, + "grad_norm": 5.3235907554626465, + "learning_rate": 8.45899307942611e-06, + "loss": 0.151, + "step": 10443 + }, + { + "epoch": 0.2642913176607536, + "grad_norm": 7.037683010101318, + "learning_rate": 8.458703132666116e-06, + "loss": 0.2229, + "step": 10444 + }, + { + "epoch": 0.2643166232254473, + "grad_norm": 4.152225971221924, + "learning_rate": 8.458413163601589e-06, + "loss": 0.1667, + "step": 10445 + }, + { + "epoch": 0.26434192879014096, + "grad_norm": 5.6992902755737305, + "learning_rate": 8.458123172234398e-06, + "loss": 0.244, + "step": 10446 + }, + { + "epoch": 0.2643672343548346, + "grad_norm": 7.729330539703369, + "learning_rate": 8.457833158566413e-06, + "loss": 0.2892, + "step": 10447 + }, + { + "epoch": 0.2643925399195283, + "grad_norm": 12.22103500366211, + "learning_rate": 8.457543122599506e-06, + "loss": 0.2375, + "step": 10448 + }, + { + "epoch": 0.264417845484222, + "grad_norm": 18.229034423828125, + "learning_rate": 8.457253064335545e-06, + "loss": 0.2458, + "step": 10449 + }, + { + "epoch": 0.26444315104891564, + "grad_norm": 3.0856571197509766, + "learning_rate": 8.456962983776405e-06, + "loss": 0.152, + "step": 10450 + }, + { + "epoch": 0.26446845661360935, + "grad_norm": 12.040923118591309, + "learning_rate": 8.45667288092395e-06, + "loss": 0.1782, + "step": 10451 + }, + { + "epoch": 0.264493762178303, + "grad_norm": 7.34976053237915, + "learning_rate": 8.456382755780057e-06, + "loss": 0.2463, + "step": 10452 + }, + { + "epoch": 0.26451906774299666, + "grad_norm": 4.4559831619262695, + "learning_rate": 8.456092608346596e-06, + "loss": 0.2635, + "step": 10453 + }, + { + "epoch": 0.2645443733076904, + "grad_norm": 6.527637004852295, + "learning_rate": 8.455802438625434e-06, + "loss": 0.1817, + "step": 10454 + }, + { + "epoch": 0.26456967887238403, + "grad_norm": 4.799999237060547, + "learning_rate": 8.455512246618447e-06, + "loss": 0.2321, + "step": 10455 + }, + { + "epoch": 0.26459498443707774, + "grad_norm": 6.915521144866943, + "learning_rate": 8.455222032327502e-06, + "loss": 0.2127, + "step": 10456 + }, + { + "epoch": 0.2646202900017714, + "grad_norm": 7.302552223205566, + "learning_rate": 8.454931795754475e-06, + "loss": 0.2825, + "step": 10457 + }, + { + "epoch": 0.26464559556646505, + "grad_norm": 5.81635856628418, + "learning_rate": 8.454641536901233e-06, + "loss": 0.1929, + "step": 10458 + }, + { + "epoch": 0.26467090113115876, + "grad_norm": 4.859588623046875, + "learning_rate": 8.454351255769653e-06, + "loss": 0.1861, + "step": 10459 + }, + { + "epoch": 0.2646962066958524, + "grad_norm": 12.468504905700684, + "learning_rate": 8.454060952361602e-06, + "loss": 0.2274, + "step": 10460 + }, + { + "epoch": 0.2647215122605461, + "grad_norm": 2.362738847732544, + "learning_rate": 8.453770626678957e-06, + "loss": 0.1533, + "step": 10461 + }, + { + "epoch": 0.2647468178252398, + "grad_norm": 4.1639628410339355, + "learning_rate": 8.453480278723586e-06, + "loss": 0.0981, + "step": 10462 + }, + { + "epoch": 0.26477212338993344, + "grad_norm": 8.3252592086792, + "learning_rate": 8.453189908497363e-06, + "loss": 0.1952, + "step": 10463 + }, + { + "epoch": 0.2647974289546271, + "grad_norm": 5.145864486694336, + "learning_rate": 8.452899516002162e-06, + "loss": 0.1443, + "step": 10464 + }, + { + "epoch": 0.2648227345193208, + "grad_norm": 4.2279767990112305, + "learning_rate": 8.452609101239852e-06, + "loss": 0.2108, + "step": 10465 + }, + { + "epoch": 0.26484804008401447, + "grad_norm": 4.023159027099609, + "learning_rate": 8.45231866421231e-06, + "loss": 0.1842, + "step": 10466 + }, + { + "epoch": 0.2648733456487081, + "grad_norm": 4.319878101348877, + "learning_rate": 8.452028204921407e-06, + "loss": 0.1608, + "step": 10467 + }, + { + "epoch": 0.26489865121340184, + "grad_norm": 7.781618118286133, + "learning_rate": 8.451737723369015e-06, + "loss": 0.2438, + "step": 10468 + }, + { + "epoch": 0.2649239567780955, + "grad_norm": 6.435972690582275, + "learning_rate": 8.45144721955701e-06, + "loss": 0.1717, + "step": 10469 + }, + { + "epoch": 0.2649492623427892, + "grad_norm": 3.7579057216644287, + "learning_rate": 8.451156693487264e-06, + "loss": 0.147, + "step": 10470 + }, + { + "epoch": 0.26497456790748286, + "grad_norm": 9.02859115600586, + "learning_rate": 8.45086614516165e-06, + "loss": 0.2632, + "step": 10471 + }, + { + "epoch": 0.2649998734721765, + "grad_norm": 3.293238401412964, + "learning_rate": 8.45057557458204e-06, + "loss": 0.1175, + "step": 10472 + }, + { + "epoch": 0.26502517903687023, + "grad_norm": 4.013329982757568, + "learning_rate": 8.450284981750313e-06, + "loss": 0.1789, + "step": 10473 + }, + { + "epoch": 0.2650504846015639, + "grad_norm": 9.888769149780273, + "learning_rate": 8.449994366668337e-06, + "loss": 0.2895, + "step": 10474 + }, + { + "epoch": 0.26507579016625754, + "grad_norm": 2.7985692024230957, + "learning_rate": 8.449703729337992e-06, + "loss": 0.1423, + "step": 10475 + }, + { + "epoch": 0.26510109573095125, + "grad_norm": 5.068385601043701, + "learning_rate": 8.449413069761147e-06, + "loss": 0.1655, + "step": 10476 + }, + { + "epoch": 0.2651264012956449, + "grad_norm": 6.680750846862793, + "learning_rate": 8.44912238793968e-06, + "loss": 0.2326, + "step": 10477 + }, + { + "epoch": 0.26515170686033857, + "grad_norm": 4.213906288146973, + "learning_rate": 8.448831683875465e-06, + "loss": 0.1667, + "step": 10478 + }, + { + "epoch": 0.2651770124250323, + "grad_norm": 7.152377128601074, + "learning_rate": 8.448540957570374e-06, + "loss": 0.1566, + "step": 10479 + }, + { + "epoch": 0.26520231798972593, + "grad_norm": 3.7971482276916504, + "learning_rate": 8.448250209026284e-06, + "loss": 0.159, + "step": 10480 + }, + { + "epoch": 0.2652276235544196, + "grad_norm": 3.4783661365509033, + "learning_rate": 8.447959438245073e-06, + "loss": 0.1907, + "step": 10481 + }, + { + "epoch": 0.2652529291191133, + "grad_norm": 2.5371451377868652, + "learning_rate": 8.447668645228611e-06, + "loss": 0.1184, + "step": 10482 + }, + { + "epoch": 0.26527823468380696, + "grad_norm": 4.518753528594971, + "learning_rate": 8.447377829978774e-06, + "loss": 0.2008, + "step": 10483 + }, + { + "epoch": 0.26530354024850067, + "grad_norm": 7.37626838684082, + "learning_rate": 8.44708699249744e-06, + "loss": 0.1783, + "step": 10484 + }, + { + "epoch": 0.2653288458131943, + "grad_norm": 8.143056869506836, + "learning_rate": 8.446796132786483e-06, + "loss": 0.2889, + "step": 10485 + }, + { + "epoch": 0.265354151377888, + "grad_norm": 8.631343841552734, + "learning_rate": 8.446505250847779e-06, + "loss": 0.2933, + "step": 10486 + }, + { + "epoch": 0.2653794569425817, + "grad_norm": 4.481607437133789, + "learning_rate": 8.446214346683203e-06, + "loss": 0.1707, + "step": 10487 + }, + { + "epoch": 0.26540476250727535, + "grad_norm": 3.3422765731811523, + "learning_rate": 8.445923420294633e-06, + "loss": 0.1638, + "step": 10488 + }, + { + "epoch": 0.265430068071969, + "grad_norm": 4.476866245269775, + "learning_rate": 8.445632471683942e-06, + "loss": 0.2196, + "step": 10489 + }, + { + "epoch": 0.2654553736366627, + "grad_norm": 7.108537197113037, + "learning_rate": 8.44534150085301e-06, + "loss": 0.2563, + "step": 10490 + }, + { + "epoch": 0.2654806792013564, + "grad_norm": 7.445459842681885, + "learning_rate": 8.44505050780371e-06, + "loss": 0.2453, + "step": 10491 + }, + { + "epoch": 0.26550598476605003, + "grad_norm": 4.7804951667785645, + "learning_rate": 8.444759492537921e-06, + "loss": 0.2001, + "step": 10492 + }, + { + "epoch": 0.26553129033074374, + "grad_norm": 8.877863883972168, + "learning_rate": 8.444468455057518e-06, + "loss": 0.2251, + "step": 10493 + }, + { + "epoch": 0.2655565958954374, + "grad_norm": 4.933835506439209, + "learning_rate": 8.44417739536438e-06, + "loss": 0.1932, + "step": 10494 + }, + { + "epoch": 0.2655819014601311, + "grad_norm": 3.089728832244873, + "learning_rate": 8.443886313460382e-06, + "loss": 0.1565, + "step": 10495 + }, + { + "epoch": 0.26560720702482477, + "grad_norm": 5.978271961212158, + "learning_rate": 8.443595209347401e-06, + "loss": 0.2212, + "step": 10496 + }, + { + "epoch": 0.2656325125895184, + "grad_norm": 5.860185623168945, + "learning_rate": 8.443304083027317e-06, + "loss": 0.2112, + "step": 10497 + }, + { + "epoch": 0.26565781815421213, + "grad_norm": 5.6536455154418945, + "learning_rate": 8.443012934502004e-06, + "loss": 0.1751, + "step": 10498 + }, + { + "epoch": 0.2656831237189058, + "grad_norm": 8.21766471862793, + "learning_rate": 8.442721763773341e-06, + "loss": 0.3078, + "step": 10499 + }, + { + "epoch": 0.26570842928359945, + "grad_norm": 5.994294166564941, + "learning_rate": 8.442430570843206e-06, + "loss": 0.2379, + "step": 10500 + }, + { + "epoch": 0.26573373484829316, + "grad_norm": 6.876434803009033, + "learning_rate": 8.442139355713477e-06, + "loss": 0.2119, + "step": 10501 + }, + { + "epoch": 0.2657590404129868, + "grad_norm": 3.5361289978027344, + "learning_rate": 8.441848118386032e-06, + "loss": 0.1643, + "step": 10502 + }, + { + "epoch": 0.26578434597768047, + "grad_norm": 9.407776832580566, + "learning_rate": 8.441556858862747e-06, + "loss": 0.2133, + "step": 10503 + }, + { + "epoch": 0.2658096515423742, + "grad_norm": 3.5675435066223145, + "learning_rate": 8.441265577145503e-06, + "loss": 0.1873, + "step": 10504 + }, + { + "epoch": 0.26583495710706784, + "grad_norm": 8.780586242675781, + "learning_rate": 8.440974273236176e-06, + "loss": 0.176, + "step": 10505 + }, + { + "epoch": 0.2658602626717615, + "grad_norm": 8.473772048950195, + "learning_rate": 8.440682947136649e-06, + "loss": 0.289, + "step": 10506 + }, + { + "epoch": 0.2658855682364552, + "grad_norm": 4.888726234436035, + "learning_rate": 8.440391598848796e-06, + "loss": 0.1389, + "step": 10507 + }, + { + "epoch": 0.26591087380114886, + "grad_norm": 7.528891563415527, + "learning_rate": 8.440100228374496e-06, + "loss": 0.1694, + "step": 10508 + }, + { + "epoch": 0.2659361793658426, + "grad_norm": 4.638913631439209, + "learning_rate": 8.439808835715632e-06, + "loss": 0.1178, + "step": 10509 + }, + { + "epoch": 0.26596148493053623, + "grad_norm": 3.342214584350586, + "learning_rate": 8.439517420874079e-06, + "loss": 0.1805, + "step": 10510 + }, + { + "epoch": 0.2659867904952299, + "grad_norm": 9.583842277526855, + "learning_rate": 8.439225983851718e-06, + "loss": 0.2137, + "step": 10511 + }, + { + "epoch": 0.2660120960599236, + "grad_norm": 7.905214786529541, + "learning_rate": 8.43893452465043e-06, + "loss": 0.2267, + "step": 10512 + }, + { + "epoch": 0.26603740162461725, + "grad_norm": 4.340991497039795, + "learning_rate": 8.438643043272093e-06, + "loss": 0.1476, + "step": 10513 + }, + { + "epoch": 0.2660627071893109, + "grad_norm": 6.423403263092041, + "learning_rate": 8.438351539718586e-06, + "loss": 0.2753, + "step": 10514 + }, + { + "epoch": 0.2660880127540046, + "grad_norm": 3.658921480178833, + "learning_rate": 8.438060013991789e-06, + "loss": 0.1501, + "step": 10515 + }, + { + "epoch": 0.2661133183186983, + "grad_norm": 7.590980052947998, + "learning_rate": 8.43776846609358e-06, + "loss": 0.1623, + "step": 10516 + }, + { + "epoch": 0.26613862388339193, + "grad_norm": 3.949100971221924, + "learning_rate": 8.437476896025847e-06, + "loss": 0.1664, + "step": 10517 + }, + { + "epoch": 0.26616392944808565, + "grad_norm": 11.939329147338867, + "learning_rate": 8.43718530379046e-06, + "loss": 0.3397, + "step": 10518 + }, + { + "epoch": 0.2661892350127793, + "grad_norm": 6.584211826324463, + "learning_rate": 8.436893689389307e-06, + "loss": 0.1848, + "step": 10519 + }, + { + "epoch": 0.266214540577473, + "grad_norm": 2.912442684173584, + "learning_rate": 8.436602052824265e-06, + "loss": 0.148, + "step": 10520 + }, + { + "epoch": 0.26623984614216667, + "grad_norm": 22.110149383544922, + "learning_rate": 8.436310394097218e-06, + "loss": 0.3001, + "step": 10521 + }, + { + "epoch": 0.2662651517068603, + "grad_norm": 4.7100510597229, + "learning_rate": 8.436018713210042e-06, + "loss": 0.1415, + "step": 10522 + }, + { + "epoch": 0.26629045727155404, + "grad_norm": 2.8868489265441895, + "learning_rate": 8.43572701016462e-06, + "loss": 0.1525, + "step": 10523 + }, + { + "epoch": 0.2663157628362477, + "grad_norm": 3.148186206817627, + "learning_rate": 8.435435284962835e-06, + "loss": 0.145, + "step": 10524 + }, + { + "epoch": 0.26634106840094135, + "grad_norm": 5.78278112411499, + "learning_rate": 8.435143537606568e-06, + "loss": 0.1789, + "step": 10525 + }, + { + "epoch": 0.26636637396563506, + "grad_norm": 6.394472122192383, + "learning_rate": 8.434851768097697e-06, + "loss": 0.2575, + "step": 10526 + }, + { + "epoch": 0.2663916795303287, + "grad_norm": 8.105911254882812, + "learning_rate": 8.434559976438106e-06, + "loss": 0.2091, + "step": 10527 + }, + { + "epoch": 0.2664169850950224, + "grad_norm": 4.343158721923828, + "learning_rate": 8.434268162629678e-06, + "loss": 0.1651, + "step": 10528 + }, + { + "epoch": 0.2664422906597161, + "grad_norm": 4.8212175369262695, + "learning_rate": 8.433976326674292e-06, + "loss": 0.2029, + "step": 10529 + }, + { + "epoch": 0.26646759622440974, + "grad_norm": 7.209071636199951, + "learning_rate": 8.433684468573832e-06, + "loss": 0.2526, + "step": 10530 + }, + { + "epoch": 0.2664929017891034, + "grad_norm": 3.951819896697998, + "learning_rate": 8.43339258833018e-06, + "loss": 0.1246, + "step": 10531 + }, + { + "epoch": 0.2665182073537971, + "grad_norm": 7.177188396453857, + "learning_rate": 8.43310068594522e-06, + "loss": 0.2157, + "step": 10532 + }, + { + "epoch": 0.26654351291849077, + "grad_norm": 5.548970699310303, + "learning_rate": 8.432808761420827e-06, + "loss": 0.1721, + "step": 10533 + }, + { + "epoch": 0.2665688184831845, + "grad_norm": 7.4221110343933105, + "learning_rate": 8.432516814758893e-06, + "loss": 0.1958, + "step": 10534 + }, + { + "epoch": 0.26659412404787813, + "grad_norm": 3.711730718612671, + "learning_rate": 8.432224845961294e-06, + "loss": 0.0913, + "step": 10535 + }, + { + "epoch": 0.2666194296125718, + "grad_norm": 9.19107437133789, + "learning_rate": 8.431932855029918e-06, + "loss": 0.218, + "step": 10536 + }, + { + "epoch": 0.2666447351772655, + "grad_norm": 7.150022029876709, + "learning_rate": 8.431640841966643e-06, + "loss": 0.1581, + "step": 10537 + }, + { + "epoch": 0.26667004074195916, + "grad_norm": 6.637162685394287, + "learning_rate": 8.431348806773356e-06, + "loss": 0.1539, + "step": 10538 + }, + { + "epoch": 0.2666953463066528, + "grad_norm": 2.441136121749878, + "learning_rate": 8.431056749451939e-06, + "loss": 0.079, + "step": 10539 + }, + { + "epoch": 0.2667206518713465, + "grad_norm": 5.084435939788818, + "learning_rate": 8.430764670004275e-06, + "loss": 0.1694, + "step": 10540 + }, + { + "epoch": 0.2667459574360402, + "grad_norm": 3.6731090545654297, + "learning_rate": 8.430472568432247e-06, + "loss": 0.135, + "step": 10541 + }, + { + "epoch": 0.26677126300073384, + "grad_norm": 4.042828559875488, + "learning_rate": 8.43018044473774e-06, + "loss": 0.1975, + "step": 10542 + }, + { + "epoch": 0.26679656856542755, + "grad_norm": 6.359329700469971, + "learning_rate": 8.429888298922637e-06, + "loss": 0.2727, + "step": 10543 + }, + { + "epoch": 0.2668218741301212, + "grad_norm": 6.8864312171936035, + "learning_rate": 8.429596130988824e-06, + "loss": 0.1748, + "step": 10544 + }, + { + "epoch": 0.26684717969481486, + "grad_norm": 7.326634407043457, + "learning_rate": 8.429303940938183e-06, + "loss": 0.2356, + "step": 10545 + }, + { + "epoch": 0.2668724852595086, + "grad_norm": 6.613637447357178, + "learning_rate": 8.429011728772598e-06, + "loss": 0.2532, + "step": 10546 + }, + { + "epoch": 0.26689779082420223, + "grad_norm": 5.753690242767334, + "learning_rate": 8.428719494493953e-06, + "loss": 0.188, + "step": 10547 + }, + { + "epoch": 0.26692309638889594, + "grad_norm": 3.049239158630371, + "learning_rate": 8.428427238104135e-06, + "loss": 0.1675, + "step": 10548 + }, + { + "epoch": 0.2669484019535896, + "grad_norm": 4.2541184425354, + "learning_rate": 8.428134959605028e-06, + "loss": 0.2005, + "step": 10549 + }, + { + "epoch": 0.26697370751828325, + "grad_norm": 5.365501880645752, + "learning_rate": 8.427842658998515e-06, + "loss": 0.1745, + "step": 10550 + }, + { + "epoch": 0.26699901308297697, + "grad_norm": 9.578052520751953, + "learning_rate": 8.427550336286483e-06, + "loss": 0.1325, + "step": 10551 + }, + { + "epoch": 0.2670243186476706, + "grad_norm": 8.141839027404785, + "learning_rate": 8.427257991470817e-06, + "loss": 0.266, + "step": 10552 + }, + { + "epoch": 0.2670496242123643, + "grad_norm": 3.7901432514190674, + "learning_rate": 8.426965624553401e-06, + "loss": 0.1074, + "step": 10553 + }, + { + "epoch": 0.267074929777058, + "grad_norm": 7.37481164932251, + "learning_rate": 8.426673235536123e-06, + "loss": 0.2434, + "step": 10554 + }, + { + "epoch": 0.26710023534175165, + "grad_norm": 12.685559272766113, + "learning_rate": 8.426380824420864e-06, + "loss": 0.2739, + "step": 10555 + }, + { + "epoch": 0.2671255409064453, + "grad_norm": 5.89685583114624, + "learning_rate": 8.426088391209515e-06, + "loss": 0.2196, + "step": 10556 + }, + { + "epoch": 0.267150846471139, + "grad_norm": 4.370602607727051, + "learning_rate": 8.425795935903957e-06, + "loss": 0.2106, + "step": 10557 + }, + { + "epoch": 0.26717615203583267, + "grad_norm": 6.067553520202637, + "learning_rate": 8.425503458506079e-06, + "loss": 0.1928, + "step": 10558 + }, + { + "epoch": 0.2672014576005264, + "grad_norm": 5.8692545890808105, + "learning_rate": 8.425210959017766e-06, + "loss": 0.1433, + "step": 10559 + }, + { + "epoch": 0.26722676316522004, + "grad_norm": 5.150245666503906, + "learning_rate": 8.424918437440905e-06, + "loss": 0.1403, + "step": 10560 + }, + { + "epoch": 0.2672520687299137, + "grad_norm": 10.756708145141602, + "learning_rate": 8.424625893777384e-06, + "loss": 0.2833, + "step": 10561 + }, + { + "epoch": 0.2672773742946074, + "grad_norm": 3.3245625495910645, + "learning_rate": 8.424333328029086e-06, + "loss": 0.1851, + "step": 10562 + }, + { + "epoch": 0.26730267985930106, + "grad_norm": 4.0671515464782715, + "learning_rate": 8.424040740197898e-06, + "loss": 0.1657, + "step": 10563 + }, + { + "epoch": 0.2673279854239947, + "grad_norm": 10.171281814575195, + "learning_rate": 8.42374813028571e-06, + "loss": 0.2545, + "step": 10564 + }, + { + "epoch": 0.26735329098868843, + "grad_norm": 4.944435119628906, + "learning_rate": 8.423455498294408e-06, + "loss": 0.1644, + "step": 10565 + }, + { + "epoch": 0.2673785965533821, + "grad_norm": 3.782287120819092, + "learning_rate": 8.423162844225877e-06, + "loss": 0.1703, + "step": 10566 + }, + { + "epoch": 0.26740390211807574, + "grad_norm": 7.8666605949401855, + "learning_rate": 8.422870168082004e-06, + "loss": 0.1586, + "step": 10567 + }, + { + "epoch": 0.26742920768276945, + "grad_norm": 6.326907634735107, + "learning_rate": 8.42257746986468e-06, + "loss": 0.1646, + "step": 10568 + }, + { + "epoch": 0.2674545132474631, + "grad_norm": 5.2834343910217285, + "learning_rate": 8.42228474957579e-06, + "loss": 0.2069, + "step": 10569 + }, + { + "epoch": 0.26747981881215677, + "grad_norm": 10.756415367126465, + "learning_rate": 8.421992007217222e-06, + "loss": 0.2629, + "step": 10570 + }, + { + "epoch": 0.2675051243768505, + "grad_norm": 13.97558879852295, + "learning_rate": 8.421699242790864e-06, + "loss": 0.3483, + "step": 10571 + }, + { + "epoch": 0.26753042994154413, + "grad_norm": 4.765318393707275, + "learning_rate": 8.421406456298605e-06, + "loss": 0.2183, + "step": 10572 + }, + { + "epoch": 0.26755573550623785, + "grad_norm": 2.8172004222869873, + "learning_rate": 8.421113647742331e-06, + "loss": 0.1087, + "step": 10573 + }, + { + "epoch": 0.2675810410709315, + "grad_norm": 6.5759477615356445, + "learning_rate": 8.420820817123933e-06, + "loss": 0.1829, + "step": 10574 + }, + { + "epoch": 0.26760634663562516, + "grad_norm": 3.364884614944458, + "learning_rate": 8.420527964445297e-06, + "loss": 0.1399, + "step": 10575 + }, + { + "epoch": 0.26763165220031887, + "grad_norm": 6.321227073669434, + "learning_rate": 8.420235089708313e-06, + "loss": 0.1972, + "step": 10576 + }, + { + "epoch": 0.2676569577650125, + "grad_norm": 4.442440032958984, + "learning_rate": 8.419942192914868e-06, + "loss": 0.1662, + "step": 10577 + }, + { + "epoch": 0.2676822633297062, + "grad_norm": 9.27501392364502, + "learning_rate": 8.419649274066853e-06, + "loss": 0.3687, + "step": 10578 + }, + { + "epoch": 0.2677075688943999, + "grad_norm": 6.833491325378418, + "learning_rate": 8.419356333166155e-06, + "loss": 0.1553, + "step": 10579 + }, + { + "epoch": 0.26773287445909355, + "grad_norm": 9.04385757446289, + "learning_rate": 8.419063370214665e-06, + "loss": 0.2725, + "step": 10580 + }, + { + "epoch": 0.2677581800237872, + "grad_norm": 5.271190643310547, + "learning_rate": 8.41877038521427e-06, + "loss": 0.1657, + "step": 10581 + }, + { + "epoch": 0.2677834855884809, + "grad_norm": 3.0701522827148438, + "learning_rate": 8.418477378166861e-06, + "loss": 0.1565, + "step": 10582 + }, + { + "epoch": 0.2678087911531746, + "grad_norm": 5.761259078979492, + "learning_rate": 8.418184349074329e-06, + "loss": 0.2287, + "step": 10583 + }, + { + "epoch": 0.2678340967178683, + "grad_norm": 3.488149881362915, + "learning_rate": 8.41789129793856e-06, + "loss": 0.1673, + "step": 10584 + }, + { + "epoch": 0.26785940228256194, + "grad_norm": 13.57160758972168, + "learning_rate": 8.417598224761448e-06, + "loss": 0.282, + "step": 10585 + }, + { + "epoch": 0.2678847078472556, + "grad_norm": 4.763064384460449, + "learning_rate": 8.417305129544879e-06, + "loss": 0.1597, + "step": 10586 + }, + { + "epoch": 0.2679100134119493, + "grad_norm": 6.768884658813477, + "learning_rate": 8.417012012290745e-06, + "loss": 0.1535, + "step": 10587 + }, + { + "epoch": 0.26793531897664297, + "grad_norm": 4.214169025421143, + "learning_rate": 8.416718873000936e-06, + "loss": 0.1743, + "step": 10588 + }, + { + "epoch": 0.2679606245413366, + "grad_norm": 3.8767480850219727, + "learning_rate": 8.416425711677343e-06, + "loss": 0.1451, + "step": 10589 + }, + { + "epoch": 0.26798593010603033, + "grad_norm": 5.8767595291137695, + "learning_rate": 8.416132528321856e-06, + "loss": 0.1465, + "step": 10590 + }, + { + "epoch": 0.268011235670724, + "grad_norm": 7.517460346221924, + "learning_rate": 8.415839322936366e-06, + "loss": 0.2339, + "step": 10591 + }, + { + "epoch": 0.26803654123541765, + "grad_norm": 13.23035717010498, + "learning_rate": 8.415546095522763e-06, + "loss": 0.2743, + "step": 10592 + }, + { + "epoch": 0.26806184680011136, + "grad_norm": 6.854432582855225, + "learning_rate": 8.41525284608294e-06, + "loss": 0.1889, + "step": 10593 + }, + { + "epoch": 0.268087152364805, + "grad_norm": 6.257035732269287, + "learning_rate": 8.414959574618786e-06, + "loss": 0.2096, + "step": 10594 + }, + { + "epoch": 0.26811245792949867, + "grad_norm": 4.477064609527588, + "learning_rate": 8.414666281132191e-06, + "loss": 0.1547, + "step": 10595 + }, + { + "epoch": 0.2681377634941924, + "grad_norm": 9.072380065917969, + "learning_rate": 8.414372965625051e-06, + "loss": 0.2065, + "step": 10596 + }, + { + "epoch": 0.26816306905888604, + "grad_norm": 4.544955253601074, + "learning_rate": 8.414079628099253e-06, + "loss": 0.1835, + "step": 10597 + }, + { + "epoch": 0.26818837462357975, + "grad_norm": 3.949650287628174, + "learning_rate": 8.413786268556692e-06, + "loss": 0.1581, + "step": 10598 + }, + { + "epoch": 0.2682136801882734, + "grad_norm": 6.7983832359313965, + "learning_rate": 8.413492886999256e-06, + "loss": 0.1373, + "step": 10599 + }, + { + "epoch": 0.26823898575296706, + "grad_norm": 6.7847490310668945, + "learning_rate": 8.413199483428842e-06, + "loss": 0.135, + "step": 10600 + }, + { + "epoch": 0.2682642913176608, + "grad_norm": 7.148129940032959, + "learning_rate": 8.41290605784734e-06, + "loss": 0.2045, + "step": 10601 + }, + { + "epoch": 0.26828959688235443, + "grad_norm": 5.137709617614746, + "learning_rate": 8.412612610256638e-06, + "loss": 0.136, + "step": 10602 + }, + { + "epoch": 0.2683149024470481, + "grad_norm": 10.997811317443848, + "learning_rate": 8.412319140658636e-06, + "loss": 0.3051, + "step": 10603 + }, + { + "epoch": 0.2683402080117418, + "grad_norm": 17.93840217590332, + "learning_rate": 8.41202564905522e-06, + "loss": 0.1559, + "step": 10604 + }, + { + "epoch": 0.26836551357643545, + "grad_norm": 6.392353057861328, + "learning_rate": 8.411732135448287e-06, + "loss": 0.2108, + "step": 10605 + }, + { + "epoch": 0.2683908191411291, + "grad_norm": 12.853509902954102, + "learning_rate": 8.411438599839725e-06, + "loss": 0.2967, + "step": 10606 + }, + { + "epoch": 0.2684161247058228, + "grad_norm": 8.126588821411133, + "learning_rate": 8.411145042231433e-06, + "loss": 0.2351, + "step": 10607 + }, + { + "epoch": 0.2684414302705165, + "grad_norm": 15.122687339782715, + "learning_rate": 8.4108514626253e-06, + "loss": 0.312, + "step": 10608 + }, + { + "epoch": 0.26846673583521014, + "grad_norm": 5.010795593261719, + "learning_rate": 8.410557861023222e-06, + "loss": 0.1606, + "step": 10609 + }, + { + "epoch": 0.26849204139990385, + "grad_norm": 6.9271559715271, + "learning_rate": 8.410264237427089e-06, + "loss": 0.1721, + "step": 10610 + }, + { + "epoch": 0.2685173469645975, + "grad_norm": 15.39150333404541, + "learning_rate": 8.409970591838797e-06, + "loss": 0.1331, + "step": 10611 + }, + { + "epoch": 0.2685426525292912, + "grad_norm": 6.27674674987793, + "learning_rate": 8.409676924260239e-06, + "loss": 0.1953, + "step": 10612 + }, + { + "epoch": 0.26856795809398487, + "grad_norm": 3.433522939682007, + "learning_rate": 8.409383234693307e-06, + "loss": 0.0981, + "step": 10613 + }, + { + "epoch": 0.2685932636586785, + "grad_norm": 3.2398409843444824, + "learning_rate": 8.4090895231399e-06, + "loss": 0.1484, + "step": 10614 + }, + { + "epoch": 0.26861856922337224, + "grad_norm": 2.3429312705993652, + "learning_rate": 8.408795789601907e-06, + "loss": 0.1271, + "step": 10615 + }, + { + "epoch": 0.2686438747880659, + "grad_norm": 5.637628078460693, + "learning_rate": 8.408502034081222e-06, + "loss": 0.2572, + "step": 10616 + }, + { + "epoch": 0.26866918035275955, + "grad_norm": 3.2931337356567383, + "learning_rate": 8.408208256579745e-06, + "loss": 0.1867, + "step": 10617 + }, + { + "epoch": 0.26869448591745326, + "grad_norm": 4.887974739074707, + "learning_rate": 8.407914457099366e-06, + "loss": 0.2063, + "step": 10618 + }, + { + "epoch": 0.2687197914821469, + "grad_norm": 4.008730411529541, + "learning_rate": 8.40762063564198e-06, + "loss": 0.1381, + "step": 10619 + }, + { + "epoch": 0.2687450970468406, + "grad_norm": 4.972774505615234, + "learning_rate": 8.407326792209483e-06, + "loss": 0.2431, + "step": 10620 + }, + { + "epoch": 0.2687704026115343, + "grad_norm": 6.889462947845459, + "learning_rate": 8.407032926803768e-06, + "loss": 0.2515, + "step": 10621 + }, + { + "epoch": 0.26879570817622794, + "grad_norm": 3.9115028381347656, + "learning_rate": 8.406739039426731e-06, + "loss": 0.1851, + "step": 10622 + }, + { + "epoch": 0.26882101374092165, + "grad_norm": 9.575657844543457, + "learning_rate": 8.40644513008027e-06, + "loss": 0.2107, + "step": 10623 + }, + { + "epoch": 0.2688463193056153, + "grad_norm": 3.585357666015625, + "learning_rate": 8.406151198766276e-06, + "loss": 0.1063, + "step": 10624 + }, + { + "epoch": 0.26887162487030897, + "grad_norm": 4.919020652770996, + "learning_rate": 8.405857245486648e-06, + "loss": 0.16, + "step": 10625 + }, + { + "epoch": 0.2688969304350027, + "grad_norm": 3.324946403503418, + "learning_rate": 8.405563270243278e-06, + "loss": 0.1557, + "step": 10626 + }, + { + "epoch": 0.26892223599969634, + "grad_norm": 3.6278176307678223, + "learning_rate": 8.405269273038066e-06, + "loss": 0.1318, + "step": 10627 + }, + { + "epoch": 0.26894754156439, + "grad_norm": 3.0157713890075684, + "learning_rate": 8.404975253872905e-06, + "loss": 0.13, + "step": 10628 + }, + { + "epoch": 0.2689728471290837, + "grad_norm": 4.370317459106445, + "learning_rate": 8.404681212749693e-06, + "loss": 0.1965, + "step": 10629 + }, + { + "epoch": 0.26899815269377736, + "grad_norm": 4.454805850982666, + "learning_rate": 8.404387149670323e-06, + "loss": 0.2482, + "step": 10630 + }, + { + "epoch": 0.269023458258471, + "grad_norm": 5.409299373626709, + "learning_rate": 8.404093064636695e-06, + "loss": 0.1892, + "step": 10631 + }, + { + "epoch": 0.2690487638231647, + "grad_norm": 4.038141250610352, + "learning_rate": 8.403798957650705e-06, + "loss": 0.1906, + "step": 10632 + }, + { + "epoch": 0.2690740693878584, + "grad_norm": 6.003714561462402, + "learning_rate": 8.403504828714248e-06, + "loss": 0.2811, + "step": 10633 + }, + { + "epoch": 0.26909937495255204, + "grad_norm": 7.291495323181152, + "learning_rate": 8.40321067782922e-06, + "loss": 0.209, + "step": 10634 + }, + { + "epoch": 0.26912468051724575, + "grad_norm": 4.0454888343811035, + "learning_rate": 8.40291650499752e-06, + "loss": 0.2114, + "step": 10635 + }, + { + "epoch": 0.2691499860819394, + "grad_norm": 5.18910026550293, + "learning_rate": 8.402622310221046e-06, + "loss": 0.2029, + "step": 10636 + }, + { + "epoch": 0.2691752916466331, + "grad_norm": 7.175722599029541, + "learning_rate": 8.402328093501693e-06, + "loss": 0.1937, + "step": 10637 + }, + { + "epoch": 0.2692005972113268, + "grad_norm": 13.50097370147705, + "learning_rate": 8.402033854841358e-06, + "loss": 0.5254, + "step": 10638 + }, + { + "epoch": 0.26922590277602043, + "grad_norm": 4.650014877319336, + "learning_rate": 8.40173959424194e-06, + "loss": 0.2195, + "step": 10639 + }, + { + "epoch": 0.26925120834071414, + "grad_norm": 2.7712013721466064, + "learning_rate": 8.401445311705338e-06, + "loss": 0.1237, + "step": 10640 + }, + { + "epoch": 0.2692765139054078, + "grad_norm": 3.1440064907073975, + "learning_rate": 8.401151007233446e-06, + "loss": 0.1347, + "step": 10641 + }, + { + "epoch": 0.26930181947010146, + "grad_norm": 3.0790534019470215, + "learning_rate": 8.400856680828163e-06, + "loss": 0.0963, + "step": 10642 + }, + { + "epoch": 0.26932712503479517, + "grad_norm": 5.886397838592529, + "learning_rate": 8.400562332491389e-06, + "loss": 0.1509, + "step": 10643 + }, + { + "epoch": 0.2693524305994888, + "grad_norm": 13.958436012268066, + "learning_rate": 8.400267962225021e-06, + "loss": 0.2988, + "step": 10644 + }, + { + "epoch": 0.2693777361641825, + "grad_norm": 10.685567855834961, + "learning_rate": 8.399973570030958e-06, + "loss": 0.2555, + "step": 10645 + }, + { + "epoch": 0.2694030417288762, + "grad_norm": 15.310393333435059, + "learning_rate": 8.399679155911096e-06, + "loss": 0.3173, + "step": 10646 + }, + { + "epoch": 0.26942834729356985, + "grad_norm": 4.622008800506592, + "learning_rate": 8.399384719867338e-06, + "loss": 0.1786, + "step": 10647 + }, + { + "epoch": 0.26945365285826356, + "grad_norm": 3.5572378635406494, + "learning_rate": 8.399090261901578e-06, + "loss": 0.1757, + "step": 10648 + }, + { + "epoch": 0.2694789584229572, + "grad_norm": 6.038553237915039, + "learning_rate": 8.398795782015718e-06, + "loss": 0.1694, + "step": 10649 + }, + { + "epoch": 0.26950426398765087, + "grad_norm": 6.825554370880127, + "learning_rate": 8.398501280211657e-06, + "loss": 0.2835, + "step": 10650 + }, + { + "epoch": 0.2695295695523446, + "grad_norm": 4.268792152404785, + "learning_rate": 8.398206756491293e-06, + "loss": 0.1596, + "step": 10651 + }, + { + "epoch": 0.26955487511703824, + "grad_norm": 5.235597133636475, + "learning_rate": 8.397912210856525e-06, + "loss": 0.1781, + "step": 10652 + }, + { + "epoch": 0.2695801806817319, + "grad_norm": 3.690849781036377, + "learning_rate": 8.397617643309256e-06, + "loss": 0.1579, + "step": 10653 + }, + { + "epoch": 0.2696054862464256, + "grad_norm": 3.9226653575897217, + "learning_rate": 8.397323053851378e-06, + "loss": 0.1793, + "step": 10654 + }, + { + "epoch": 0.26963079181111926, + "grad_norm": 6.151307106018066, + "learning_rate": 8.397028442484798e-06, + "loss": 0.1801, + "step": 10655 + }, + { + "epoch": 0.2696560973758129, + "grad_norm": 5.269989967346191, + "learning_rate": 8.396733809211414e-06, + "loss": 0.1937, + "step": 10656 + }, + { + "epoch": 0.26968140294050663, + "grad_norm": 12.62187385559082, + "learning_rate": 8.396439154033124e-06, + "loss": 0.2085, + "step": 10657 + }, + { + "epoch": 0.2697067085052003, + "grad_norm": 8.913442611694336, + "learning_rate": 8.396144476951833e-06, + "loss": 0.2269, + "step": 10658 + }, + { + "epoch": 0.26973201406989394, + "grad_norm": 4.061830043792725, + "learning_rate": 8.395849777969436e-06, + "loss": 0.1385, + "step": 10659 + }, + { + "epoch": 0.26975731963458766, + "grad_norm": 3.503369092941284, + "learning_rate": 8.395555057087834e-06, + "loss": 0.1784, + "step": 10660 + }, + { + "epoch": 0.2697826251992813, + "grad_norm": 5.0060529708862305, + "learning_rate": 8.39526031430893e-06, + "loss": 0.2174, + "step": 10661 + }, + { + "epoch": 0.269807930763975, + "grad_norm": 5.566481113433838, + "learning_rate": 8.394965549634625e-06, + "loss": 0.1301, + "step": 10662 + }, + { + "epoch": 0.2698332363286687, + "grad_norm": 3.7328734397888184, + "learning_rate": 8.394670763066818e-06, + "loss": 0.201, + "step": 10663 + }, + { + "epoch": 0.26985854189336234, + "grad_norm": 6.673259258270264, + "learning_rate": 8.394375954607411e-06, + "loss": 0.2137, + "step": 10664 + }, + { + "epoch": 0.26988384745805605, + "grad_norm": 14.237056732177734, + "learning_rate": 8.394081124258305e-06, + "loss": 0.3654, + "step": 10665 + }, + { + "epoch": 0.2699091530227497, + "grad_norm": 3.5580523014068604, + "learning_rate": 8.3937862720214e-06, + "loss": 0.1388, + "step": 10666 + }, + { + "epoch": 0.26993445858744336, + "grad_norm": 5.080211639404297, + "learning_rate": 8.393491397898598e-06, + "loss": 0.1585, + "step": 10667 + }, + { + "epoch": 0.26995976415213707, + "grad_norm": 2.9516241550445557, + "learning_rate": 8.393196501891802e-06, + "loss": 0.1461, + "step": 10668 + }, + { + "epoch": 0.26998506971683073, + "grad_norm": 7.3636369705200195, + "learning_rate": 8.392901584002915e-06, + "loss": 0.2931, + "step": 10669 + }, + { + "epoch": 0.2700103752815244, + "grad_norm": 4.187100887298584, + "learning_rate": 8.392606644233833e-06, + "loss": 0.1588, + "step": 10670 + }, + { + "epoch": 0.2700356808462181, + "grad_norm": 6.053644180297852, + "learning_rate": 8.392311682586465e-06, + "loss": 0.2418, + "step": 10671 + }, + { + "epoch": 0.27006098641091175, + "grad_norm": 3.9131202697753906, + "learning_rate": 8.39201669906271e-06, + "loss": 0.1882, + "step": 10672 + }, + { + "epoch": 0.2700862919756054, + "grad_norm": 5.103616714477539, + "learning_rate": 8.39172169366447e-06, + "loss": 0.2086, + "step": 10673 + }, + { + "epoch": 0.2701115975402991, + "grad_norm": 3.136467456817627, + "learning_rate": 8.391426666393646e-06, + "loss": 0.2116, + "step": 10674 + }, + { + "epoch": 0.2701369031049928, + "grad_norm": 5.4653544425964355, + "learning_rate": 8.391131617252143e-06, + "loss": 0.0971, + "step": 10675 + }, + { + "epoch": 0.2701622086696865, + "grad_norm": 4.393897533416748, + "learning_rate": 8.390836546241864e-06, + "loss": 0.1233, + "step": 10676 + }, + { + "epoch": 0.27018751423438014, + "grad_norm": 9.09140682220459, + "learning_rate": 8.39054145336471e-06, + "loss": 0.1873, + "step": 10677 + }, + { + "epoch": 0.2702128197990738, + "grad_norm": 3.473334550857544, + "learning_rate": 8.390246338622585e-06, + "loss": 0.1712, + "step": 10678 + }, + { + "epoch": 0.2702381253637675, + "grad_norm": 12.359743118286133, + "learning_rate": 8.38995120201739e-06, + "loss": 0.257, + "step": 10679 + }, + { + "epoch": 0.27026343092846117, + "grad_norm": 6.543669700622559, + "learning_rate": 8.389656043551032e-06, + "loss": 0.173, + "step": 10680 + }, + { + "epoch": 0.2702887364931548, + "grad_norm": 9.151365280151367, + "learning_rate": 8.389360863225415e-06, + "loss": 0.19, + "step": 10681 + }, + { + "epoch": 0.27031404205784854, + "grad_norm": 4.9746575355529785, + "learning_rate": 8.389065661042439e-06, + "loss": 0.1573, + "step": 10682 + }, + { + "epoch": 0.2703393476225422, + "grad_norm": 5.857110023498535, + "learning_rate": 8.388770437004008e-06, + "loss": 0.2018, + "step": 10683 + }, + { + "epoch": 0.27036465318723585, + "grad_norm": 11.343490600585938, + "learning_rate": 8.388475191112027e-06, + "loss": 0.2585, + "step": 10684 + }, + { + "epoch": 0.27038995875192956, + "grad_norm": 5.699913024902344, + "learning_rate": 8.388179923368398e-06, + "loss": 0.2261, + "step": 10685 + }, + { + "epoch": 0.2704152643166232, + "grad_norm": 7.508904933929443, + "learning_rate": 8.38788463377503e-06, + "loss": 0.2171, + "step": 10686 + }, + { + "epoch": 0.27044056988131693, + "grad_norm": 18.65547752380371, + "learning_rate": 8.387589322333822e-06, + "loss": 0.2991, + "step": 10687 + }, + { + "epoch": 0.2704658754460106, + "grad_norm": 3.889817953109741, + "learning_rate": 8.387293989046682e-06, + "loss": 0.1664, + "step": 10688 + }, + { + "epoch": 0.27049118101070424, + "grad_norm": 4.089715003967285, + "learning_rate": 8.386998633915513e-06, + "loss": 0.1316, + "step": 10689 + }, + { + "epoch": 0.27051648657539795, + "grad_norm": 5.0177812576293945, + "learning_rate": 8.386703256942219e-06, + "loss": 0.1936, + "step": 10690 + }, + { + "epoch": 0.2705417921400916, + "grad_norm": 2.4272491931915283, + "learning_rate": 8.386407858128707e-06, + "loss": 0.1465, + "step": 10691 + }, + { + "epoch": 0.27056709770478526, + "grad_norm": 4.684676170349121, + "learning_rate": 8.386112437476878e-06, + "loss": 0.1628, + "step": 10692 + }, + { + "epoch": 0.270592403269479, + "grad_norm": 12.581374168395996, + "learning_rate": 8.385816994988643e-06, + "loss": 0.291, + "step": 10693 + }, + { + "epoch": 0.27061770883417263, + "grad_norm": 6.329986572265625, + "learning_rate": 8.385521530665903e-06, + "loss": 0.2222, + "step": 10694 + }, + { + "epoch": 0.2706430143988663, + "grad_norm": 4.143374443054199, + "learning_rate": 8.385226044510563e-06, + "loss": 0.1298, + "step": 10695 + }, + { + "epoch": 0.27066831996356, + "grad_norm": 4.814084053039551, + "learning_rate": 8.384930536524532e-06, + "loss": 0.209, + "step": 10696 + }, + { + "epoch": 0.27069362552825366, + "grad_norm": 4.087100028991699, + "learning_rate": 8.384635006709712e-06, + "loss": 0.1455, + "step": 10697 + }, + { + "epoch": 0.2707189310929473, + "grad_norm": 5.454858303070068, + "learning_rate": 8.38433945506801e-06, + "loss": 0.1648, + "step": 10698 + }, + { + "epoch": 0.270744236657641, + "grad_norm": 5.792229652404785, + "learning_rate": 8.384043881601333e-06, + "loss": 0.2103, + "step": 10699 + }, + { + "epoch": 0.2707695422223347, + "grad_norm": 6.773155689239502, + "learning_rate": 8.383748286311589e-06, + "loss": 0.2401, + "step": 10700 + }, + { + "epoch": 0.2707948477870284, + "grad_norm": 4.619763374328613, + "learning_rate": 8.383452669200678e-06, + "loss": 0.2034, + "step": 10701 + }, + { + "epoch": 0.27082015335172205, + "grad_norm": 10.889423370361328, + "learning_rate": 8.383157030270512e-06, + "loss": 0.3067, + "step": 10702 + }, + { + "epoch": 0.2708454589164157, + "grad_norm": 4.600412845611572, + "learning_rate": 8.382861369522994e-06, + "loss": 0.1512, + "step": 10703 + }, + { + "epoch": 0.2708707644811094, + "grad_norm": 10.963038444519043, + "learning_rate": 8.382565686960035e-06, + "loss": 0.223, + "step": 10704 + }, + { + "epoch": 0.2708960700458031, + "grad_norm": 7.7343220710754395, + "learning_rate": 8.382269982583538e-06, + "loss": 0.1492, + "step": 10705 + }, + { + "epoch": 0.27092137561049673, + "grad_norm": 5.083706378936768, + "learning_rate": 8.38197425639541e-06, + "loss": 0.139, + "step": 10706 + }, + { + "epoch": 0.27094668117519044, + "grad_norm": 5.704322814941406, + "learning_rate": 8.38167850839756e-06, + "loss": 0.1969, + "step": 10707 + }, + { + "epoch": 0.2709719867398841, + "grad_norm": 12.652010917663574, + "learning_rate": 8.381382738591895e-06, + "loss": 0.2433, + "step": 10708 + }, + { + "epoch": 0.27099729230457775, + "grad_norm": 4.004083633422852, + "learning_rate": 8.38108694698032e-06, + "loss": 0.195, + "step": 10709 + }, + { + "epoch": 0.27102259786927146, + "grad_norm": 4.232246398925781, + "learning_rate": 8.380791133564746e-06, + "loss": 0.2067, + "step": 10710 + }, + { + "epoch": 0.2710479034339651, + "grad_norm": 4.915969371795654, + "learning_rate": 8.380495298347078e-06, + "loss": 0.1506, + "step": 10711 + }, + { + "epoch": 0.27107320899865883, + "grad_norm": 5.265605449676514, + "learning_rate": 8.380199441329223e-06, + "loss": 0.1719, + "step": 10712 + }, + { + "epoch": 0.2710985145633525, + "grad_norm": 3.429213047027588, + "learning_rate": 8.379903562513092e-06, + "loss": 0.1649, + "step": 10713 + }, + { + "epoch": 0.27112382012804614, + "grad_norm": 3.18399977684021, + "learning_rate": 8.379607661900592e-06, + "loss": 0.1085, + "step": 10714 + }, + { + "epoch": 0.27114912569273986, + "grad_norm": 6.015758514404297, + "learning_rate": 8.379311739493632e-06, + "loss": 0.2049, + "step": 10715 + }, + { + "epoch": 0.2711744312574335, + "grad_norm": 3.8607981204986572, + "learning_rate": 8.379015795294116e-06, + "loss": 0.1429, + "step": 10716 + }, + { + "epoch": 0.27119973682212717, + "grad_norm": 3.949488401412964, + "learning_rate": 8.378719829303958e-06, + "loss": 0.1728, + "step": 10717 + }, + { + "epoch": 0.2712250423868209, + "grad_norm": 6.160884857177734, + "learning_rate": 8.378423841525064e-06, + "loss": 0.223, + "step": 10718 + }, + { + "epoch": 0.27125034795151454, + "grad_norm": 5.950523376464844, + "learning_rate": 8.378127831959343e-06, + "loss": 0.239, + "step": 10719 + }, + { + "epoch": 0.2712756535162082, + "grad_norm": 4.959014415740967, + "learning_rate": 8.377831800608704e-06, + "loss": 0.2388, + "step": 10720 + }, + { + "epoch": 0.2713009590809019, + "grad_norm": 4.912403106689453, + "learning_rate": 8.377535747475056e-06, + "loss": 0.1968, + "step": 10721 + }, + { + "epoch": 0.27132626464559556, + "grad_norm": 6.12144136428833, + "learning_rate": 8.37723967256031e-06, + "loss": 0.2787, + "step": 10722 + }, + { + "epoch": 0.2713515702102892, + "grad_norm": 3.7702574729919434, + "learning_rate": 8.37694357586637e-06, + "loss": 0.1622, + "step": 10723 + }, + { + "epoch": 0.27137687577498293, + "grad_norm": 7.738291263580322, + "learning_rate": 8.376647457395151e-06, + "loss": 0.1921, + "step": 10724 + }, + { + "epoch": 0.2714021813396766, + "grad_norm": 5.151636123657227, + "learning_rate": 8.376351317148559e-06, + "loss": 0.1469, + "step": 10725 + }, + { + "epoch": 0.2714274869043703, + "grad_norm": 6.595466613769531, + "learning_rate": 8.376055155128509e-06, + "loss": 0.1662, + "step": 10726 + }, + { + "epoch": 0.27145279246906395, + "grad_norm": 3.831758499145508, + "learning_rate": 8.375758971336903e-06, + "loss": 0.2025, + "step": 10727 + }, + { + "epoch": 0.2714780980337576, + "grad_norm": 5.561741828918457, + "learning_rate": 8.375462765775658e-06, + "loss": 0.2179, + "step": 10728 + }, + { + "epoch": 0.2715034035984513, + "grad_norm": 11.724949836730957, + "learning_rate": 8.375166538446682e-06, + "loss": 0.2488, + "step": 10729 + }, + { + "epoch": 0.271528709163145, + "grad_norm": 5.360397815704346, + "learning_rate": 8.374870289351882e-06, + "loss": 0.1383, + "step": 10730 + }, + { + "epoch": 0.27155401472783863, + "grad_norm": 12.607794761657715, + "learning_rate": 8.374574018493174e-06, + "loss": 0.3541, + "step": 10731 + }, + { + "epoch": 0.27157932029253234, + "grad_norm": 3.8765344619750977, + "learning_rate": 8.374277725872464e-06, + "loss": 0.1718, + "step": 10732 + }, + { + "epoch": 0.271604625857226, + "grad_norm": 3.127899646759033, + "learning_rate": 8.373981411491666e-06, + "loss": 0.1415, + "step": 10733 + }, + { + "epoch": 0.27162993142191966, + "grad_norm": 7.6644978523254395, + "learning_rate": 8.37368507535269e-06, + "loss": 0.1999, + "step": 10734 + }, + { + "epoch": 0.27165523698661337, + "grad_norm": 9.8234281539917, + "learning_rate": 8.373388717457444e-06, + "loss": 0.2355, + "step": 10735 + }, + { + "epoch": 0.271680542551307, + "grad_norm": 3.9708967208862305, + "learning_rate": 8.373092337807845e-06, + "loss": 0.1633, + "step": 10736 + }, + { + "epoch": 0.2717058481160007, + "grad_norm": 3.670524835586548, + "learning_rate": 8.3727959364058e-06, + "loss": 0.1157, + "step": 10737 + }, + { + "epoch": 0.2717311536806944, + "grad_norm": 14.487397193908691, + "learning_rate": 8.372499513253218e-06, + "loss": 0.276, + "step": 10738 + }, + { + "epoch": 0.27175645924538805, + "grad_norm": 6.994666576385498, + "learning_rate": 8.372203068352016e-06, + "loss": 0.1485, + "step": 10739 + }, + { + "epoch": 0.27178176481008176, + "grad_norm": 5.038339138031006, + "learning_rate": 8.371906601704105e-06, + "loss": 0.1906, + "step": 10740 + }, + { + "epoch": 0.2718070703747754, + "grad_norm": 8.899870872497559, + "learning_rate": 8.371610113311394e-06, + "loss": 0.2243, + "step": 10741 + }, + { + "epoch": 0.2718323759394691, + "grad_norm": 14.681918144226074, + "learning_rate": 8.371313603175798e-06, + "loss": 0.2682, + "step": 10742 + }, + { + "epoch": 0.2718576815041628, + "grad_norm": 9.335860252380371, + "learning_rate": 8.371017071299229e-06, + "loss": 0.2496, + "step": 10743 + }, + { + "epoch": 0.27188298706885644, + "grad_norm": 7.221581935882568, + "learning_rate": 8.370720517683595e-06, + "loss": 0.2547, + "step": 10744 + }, + { + "epoch": 0.2719082926335501, + "grad_norm": 8.28762435913086, + "learning_rate": 8.370423942330811e-06, + "loss": 0.2137, + "step": 10745 + }, + { + "epoch": 0.2719335981982438, + "grad_norm": 5.020345687866211, + "learning_rate": 8.370127345242793e-06, + "loss": 0.1757, + "step": 10746 + }, + { + "epoch": 0.27195890376293747, + "grad_norm": 3.485100269317627, + "learning_rate": 8.369830726421449e-06, + "loss": 0.1533, + "step": 10747 + }, + { + "epoch": 0.2719842093276311, + "grad_norm": 7.180148124694824, + "learning_rate": 8.369534085868693e-06, + "loss": 0.1982, + "step": 10748 + }, + { + "epoch": 0.27200951489232483, + "grad_norm": 9.57802963256836, + "learning_rate": 8.369237423586438e-06, + "loss": 0.2631, + "step": 10749 + }, + { + "epoch": 0.2720348204570185, + "grad_norm": 9.459364891052246, + "learning_rate": 8.3689407395766e-06, + "loss": 0.1725, + "step": 10750 + }, + { + "epoch": 0.2720601260217122, + "grad_norm": 5.340977668762207, + "learning_rate": 8.368644033841087e-06, + "loss": 0.2268, + "step": 10751 + }, + { + "epoch": 0.27208543158640586, + "grad_norm": 5.940357208251953, + "learning_rate": 8.368347306381816e-06, + "loss": 0.231, + "step": 10752 + }, + { + "epoch": 0.2721107371510995, + "grad_norm": 23.93462562561035, + "learning_rate": 8.3680505572007e-06, + "loss": 0.2409, + "step": 10753 + }, + { + "epoch": 0.2721360427157932, + "grad_norm": 2.8520615100860596, + "learning_rate": 8.367753786299653e-06, + "loss": 0.1471, + "step": 10754 + }, + { + "epoch": 0.2721613482804869, + "grad_norm": 9.988829612731934, + "learning_rate": 8.36745699368059e-06, + "loss": 0.318, + "step": 10755 + }, + { + "epoch": 0.27218665384518054, + "grad_norm": 6.223102569580078, + "learning_rate": 8.367160179345419e-06, + "loss": 0.2406, + "step": 10756 + }, + { + "epoch": 0.27221195940987425, + "grad_norm": 5.207132816314697, + "learning_rate": 8.366863343296062e-06, + "loss": 0.2204, + "step": 10757 + }, + { + "epoch": 0.2722372649745679, + "grad_norm": 3.8881776332855225, + "learning_rate": 8.366566485534428e-06, + "loss": 0.1822, + "step": 10758 + }, + { + "epoch": 0.27226257053926156, + "grad_norm": 4.041469573974609, + "learning_rate": 8.366269606062433e-06, + "loss": 0.1909, + "step": 10759 + }, + { + "epoch": 0.2722878761039553, + "grad_norm": 8.360347747802734, + "learning_rate": 8.36597270488199e-06, + "loss": 0.2424, + "step": 10760 + }, + { + "epoch": 0.27231318166864893, + "grad_norm": 6.958494663238525, + "learning_rate": 8.365675781995017e-06, + "loss": 0.2073, + "step": 10761 + }, + { + "epoch": 0.2723384872333426, + "grad_norm": 5.9020185470581055, + "learning_rate": 8.365378837403429e-06, + "loss": 0.2359, + "step": 10762 + }, + { + "epoch": 0.2723637927980363, + "grad_norm": 3.8336100578308105, + "learning_rate": 8.365081871109136e-06, + "loss": 0.1437, + "step": 10763 + }, + { + "epoch": 0.27238909836272995, + "grad_norm": 9.1887845993042, + "learning_rate": 8.364784883114057e-06, + "loss": 0.351, + "step": 10764 + }, + { + "epoch": 0.27241440392742367, + "grad_norm": 5.139823913574219, + "learning_rate": 8.364487873420106e-06, + "loss": 0.2792, + "step": 10765 + }, + { + "epoch": 0.2724397094921173, + "grad_norm": 4.906932353973389, + "learning_rate": 8.364190842029198e-06, + "loss": 0.1944, + "step": 10766 + }, + { + "epoch": 0.272465015056811, + "grad_norm": 8.643292427062988, + "learning_rate": 8.363893788943249e-06, + "loss": 0.3679, + "step": 10767 + }, + { + "epoch": 0.2724903206215047, + "grad_norm": 7.296391010284424, + "learning_rate": 8.363596714164176e-06, + "loss": 0.2705, + "step": 10768 + }, + { + "epoch": 0.27251562618619835, + "grad_norm": 6.091707229614258, + "learning_rate": 8.363299617693892e-06, + "loss": 0.2434, + "step": 10769 + }, + { + "epoch": 0.272540931750892, + "grad_norm": 4.3489837646484375, + "learning_rate": 8.363002499534314e-06, + "loss": 0.1845, + "step": 10770 + }, + { + "epoch": 0.2725662373155857, + "grad_norm": 4.951250076293945, + "learning_rate": 8.36270535968736e-06, + "loss": 0.2302, + "step": 10771 + }, + { + "epoch": 0.27259154288027937, + "grad_norm": 3.2485573291778564, + "learning_rate": 8.362408198154944e-06, + "loss": 0.1444, + "step": 10772 + }, + { + "epoch": 0.272616848444973, + "grad_norm": 4.577859401702881, + "learning_rate": 8.362111014938982e-06, + "loss": 0.2515, + "step": 10773 + }, + { + "epoch": 0.27264215400966674, + "grad_norm": 10.035866737365723, + "learning_rate": 8.361813810041393e-06, + "loss": 0.2847, + "step": 10774 + }, + { + "epoch": 0.2726674595743604, + "grad_norm": 3.0486342906951904, + "learning_rate": 8.36151658346409e-06, + "loss": 0.1164, + "step": 10775 + }, + { + "epoch": 0.2726927651390541, + "grad_norm": 2.5232982635498047, + "learning_rate": 8.361219335208994e-06, + "loss": 0.152, + "step": 10776 + }, + { + "epoch": 0.27271807070374776, + "grad_norm": 12.960687637329102, + "learning_rate": 8.36092206527802e-06, + "loss": 0.2437, + "step": 10777 + }, + { + "epoch": 0.2727433762684414, + "grad_norm": 4.564014434814453, + "learning_rate": 8.360624773673085e-06, + "loss": 0.1771, + "step": 10778 + }, + { + "epoch": 0.27276868183313513, + "grad_norm": 3.809333562850952, + "learning_rate": 8.360327460396104e-06, + "loss": 0.1952, + "step": 10779 + }, + { + "epoch": 0.2727939873978288, + "grad_norm": 9.499610900878906, + "learning_rate": 8.360030125448996e-06, + "loss": 0.3461, + "step": 10780 + }, + { + "epoch": 0.27281929296252244, + "grad_norm": 4.774816513061523, + "learning_rate": 8.359732768833681e-06, + "loss": 0.1593, + "step": 10781 + }, + { + "epoch": 0.27284459852721615, + "grad_norm": 6.114917278289795, + "learning_rate": 8.359435390552073e-06, + "loss": 0.2068, + "step": 10782 + }, + { + "epoch": 0.2728699040919098, + "grad_norm": 5.202179908752441, + "learning_rate": 8.359137990606093e-06, + "loss": 0.2041, + "step": 10783 + }, + { + "epoch": 0.27289520965660347, + "grad_norm": 4.007955551147461, + "learning_rate": 8.358840568997655e-06, + "loss": 0.1757, + "step": 10784 + }, + { + "epoch": 0.2729205152212972, + "grad_norm": 7.384390354156494, + "learning_rate": 8.358543125728677e-06, + "loss": 0.1979, + "step": 10785 + }, + { + "epoch": 0.27294582078599083, + "grad_norm": 4.273658275604248, + "learning_rate": 8.358245660801084e-06, + "loss": 0.1561, + "step": 10786 + }, + { + "epoch": 0.2729711263506845, + "grad_norm": 7.23166561126709, + "learning_rate": 8.357948174216786e-06, + "loss": 0.1892, + "step": 10787 + }, + { + "epoch": 0.2729964319153782, + "grad_norm": 3.221923351287842, + "learning_rate": 8.357650665977706e-06, + "loss": 0.1548, + "step": 10788 + }, + { + "epoch": 0.27302173748007186, + "grad_norm": 6.703615665435791, + "learning_rate": 8.357353136085759e-06, + "loss": 0.1945, + "step": 10789 + }, + { + "epoch": 0.27304704304476557, + "grad_norm": 5.022793292999268, + "learning_rate": 8.357055584542868e-06, + "loss": 0.2004, + "step": 10790 + }, + { + "epoch": 0.2730723486094592, + "grad_norm": 3.8237407207489014, + "learning_rate": 8.356758011350952e-06, + "loss": 0.1195, + "step": 10791 + }, + { + "epoch": 0.2730976541741529, + "grad_norm": 4.426592826843262, + "learning_rate": 8.356460416511925e-06, + "loss": 0.1811, + "step": 10792 + }, + { + "epoch": 0.2731229597388466, + "grad_norm": 5.189620494842529, + "learning_rate": 8.35616280002771e-06, + "loss": 0.2291, + "step": 10793 + }, + { + "epoch": 0.27314826530354025, + "grad_norm": 16.03660011291504, + "learning_rate": 8.355865161900226e-06, + "loss": 0.1721, + "step": 10794 + }, + { + "epoch": 0.2731735708682339, + "grad_norm": 4.95162296295166, + "learning_rate": 8.355567502131392e-06, + "loss": 0.1961, + "step": 10795 + }, + { + "epoch": 0.2731988764329276, + "grad_norm": 3.7932469844818115, + "learning_rate": 8.355269820723126e-06, + "loss": 0.1313, + "step": 10796 + }, + { + "epoch": 0.2732241819976213, + "grad_norm": 4.149731159210205, + "learning_rate": 8.354972117677351e-06, + "loss": 0.1728, + "step": 10797 + }, + { + "epoch": 0.27324948756231493, + "grad_norm": 4.628711223602295, + "learning_rate": 8.354674392995983e-06, + "loss": 0.1593, + "step": 10798 + }, + { + "epoch": 0.27327479312700864, + "grad_norm": 8.737862586975098, + "learning_rate": 8.354376646680944e-06, + "loss": 0.2169, + "step": 10799 + }, + { + "epoch": 0.2733000986917023, + "grad_norm": 8.942673683166504, + "learning_rate": 8.354078878734155e-06, + "loss": 0.3022, + "step": 10800 + }, + { + "epoch": 0.27332540425639595, + "grad_norm": 6.3152756690979, + "learning_rate": 8.353781089157535e-06, + "loss": 0.1476, + "step": 10801 + }, + { + "epoch": 0.27335070982108967, + "grad_norm": 5.510531902313232, + "learning_rate": 8.353483277953005e-06, + "loss": 0.1512, + "step": 10802 + }, + { + "epoch": 0.2733760153857833, + "grad_norm": 9.917067527770996, + "learning_rate": 8.353185445122484e-06, + "loss": 0.2229, + "step": 10803 + }, + { + "epoch": 0.27340132095047703, + "grad_norm": 3.7238609790802, + "learning_rate": 8.352887590667896e-06, + "loss": 0.1198, + "step": 10804 + }, + { + "epoch": 0.2734266265151707, + "grad_norm": 5.932008743286133, + "learning_rate": 8.352589714591157e-06, + "loss": 0.1961, + "step": 10805 + }, + { + "epoch": 0.27345193207986435, + "grad_norm": 5.001956939697266, + "learning_rate": 8.352291816894192e-06, + "loss": 0.1136, + "step": 10806 + }, + { + "epoch": 0.27347723764455806, + "grad_norm": 5.148382186889648, + "learning_rate": 8.35199389757892e-06, + "loss": 0.1485, + "step": 10807 + }, + { + "epoch": 0.2735025432092517, + "grad_norm": 10.575511932373047, + "learning_rate": 8.351695956647262e-06, + "loss": 0.2546, + "step": 10808 + }, + { + "epoch": 0.27352784877394537, + "grad_norm": 13.238920211791992, + "learning_rate": 8.351397994101141e-06, + "loss": 0.2889, + "step": 10809 + }, + { + "epoch": 0.2735531543386391, + "grad_norm": 9.367731094360352, + "learning_rate": 8.351100009942478e-06, + "loss": 0.1797, + "step": 10810 + }, + { + "epoch": 0.27357845990333274, + "grad_norm": 4.042380332946777, + "learning_rate": 8.350802004173196e-06, + "loss": 0.1414, + "step": 10811 + }, + { + "epoch": 0.2736037654680264, + "grad_norm": 5.015152931213379, + "learning_rate": 8.350503976795212e-06, + "loss": 0.171, + "step": 10812 + }, + { + "epoch": 0.2736290710327201, + "grad_norm": 5.151336669921875, + "learning_rate": 8.350205927810453e-06, + "loss": 0.1258, + "step": 10813 + }, + { + "epoch": 0.27365437659741376, + "grad_norm": 3.7559263706207275, + "learning_rate": 8.349907857220838e-06, + "loss": 0.1306, + "step": 10814 + }, + { + "epoch": 0.2736796821621075, + "grad_norm": 4.516400337219238, + "learning_rate": 8.349609765028293e-06, + "loss": 0.1386, + "step": 10815 + }, + { + "epoch": 0.27370498772680113, + "grad_norm": 5.813481330871582, + "learning_rate": 8.349311651234736e-06, + "loss": 0.1209, + "step": 10816 + }, + { + "epoch": 0.2737302932914948, + "grad_norm": 4.731585502624512, + "learning_rate": 8.34901351584209e-06, + "loss": 0.1915, + "step": 10817 + }, + { + "epoch": 0.2737555988561885, + "grad_norm": 25.95937728881836, + "learning_rate": 8.348715358852279e-06, + "loss": 0.2068, + "step": 10818 + }, + { + "epoch": 0.27378090442088215, + "grad_norm": 35.49095153808594, + "learning_rate": 8.348417180267228e-06, + "loss": 0.1892, + "step": 10819 + }, + { + "epoch": 0.2738062099855758, + "grad_norm": 6.839831829071045, + "learning_rate": 8.348118980088855e-06, + "loss": 0.1957, + "step": 10820 + }, + { + "epoch": 0.2738315155502695, + "grad_norm": 6.480659484863281, + "learning_rate": 8.347820758319087e-06, + "loss": 0.2257, + "step": 10821 + }, + { + "epoch": 0.2738568211149632, + "grad_norm": 3.324939727783203, + "learning_rate": 8.347522514959844e-06, + "loss": 0.0993, + "step": 10822 + }, + { + "epoch": 0.27388212667965683, + "grad_norm": 8.269657135009766, + "learning_rate": 8.347224250013053e-06, + "loss": 0.2229, + "step": 10823 + }, + { + "epoch": 0.27390743224435055, + "grad_norm": 7.844018936157227, + "learning_rate": 8.346925963480634e-06, + "loss": 0.1581, + "step": 10824 + }, + { + "epoch": 0.2739327378090442, + "grad_norm": 11.157275199890137, + "learning_rate": 8.346627655364514e-06, + "loss": 0.2679, + "step": 10825 + }, + { + "epoch": 0.27395804337373786, + "grad_norm": 3.7995126247406006, + "learning_rate": 8.346329325666614e-06, + "loss": 0.1816, + "step": 10826 + }, + { + "epoch": 0.27398334893843157, + "grad_norm": 15.640998840332031, + "learning_rate": 8.346030974388857e-06, + "loss": 0.3507, + "step": 10827 + }, + { + "epoch": 0.2740086545031252, + "grad_norm": 4.718337059020996, + "learning_rate": 8.34573260153317e-06, + "loss": 0.1717, + "step": 10828 + }, + { + "epoch": 0.27403396006781894, + "grad_norm": 7.383962631225586, + "learning_rate": 8.345434207101474e-06, + "loss": 0.2015, + "step": 10829 + }, + { + "epoch": 0.2740592656325126, + "grad_norm": 4.8968610763549805, + "learning_rate": 8.345135791095698e-06, + "loss": 0.1491, + "step": 10830 + }, + { + "epoch": 0.27408457119720625, + "grad_norm": 7.095733165740967, + "learning_rate": 8.34483735351776e-06, + "loss": 0.2331, + "step": 10831 + }, + { + "epoch": 0.27410987676189996, + "grad_norm": 7.5580735206604, + "learning_rate": 8.344538894369592e-06, + "loss": 0.1805, + "step": 10832 + }, + { + "epoch": 0.2741351823265936, + "grad_norm": 3.7956507205963135, + "learning_rate": 8.344240413653112e-06, + "loss": 0.1248, + "step": 10833 + }, + { + "epoch": 0.2741604878912873, + "grad_norm": 4.595688819885254, + "learning_rate": 8.34394191137025e-06, + "loss": 0.1928, + "step": 10834 + }, + { + "epoch": 0.274185793455981, + "grad_norm": 11.80144214630127, + "learning_rate": 8.343643387522927e-06, + "loss": 0.1381, + "step": 10835 + }, + { + "epoch": 0.27421109902067464, + "grad_norm": 6.704380989074707, + "learning_rate": 8.343344842113069e-06, + "loss": 0.2113, + "step": 10836 + }, + { + "epoch": 0.2742364045853683, + "grad_norm": 4.043241500854492, + "learning_rate": 8.343046275142604e-06, + "loss": 0.1882, + "step": 10837 + }, + { + "epoch": 0.274261710150062, + "grad_norm": 5.185323238372803, + "learning_rate": 8.342747686613453e-06, + "loss": 0.1832, + "step": 10838 + }, + { + "epoch": 0.27428701571475567, + "grad_norm": 4.44733190536499, + "learning_rate": 8.342449076527545e-06, + "loss": 0.1445, + "step": 10839 + }, + { + "epoch": 0.2743123212794494, + "grad_norm": 4.47720193862915, + "learning_rate": 8.342150444886806e-06, + "loss": 0.1624, + "step": 10840 + }, + { + "epoch": 0.27433762684414303, + "grad_norm": 5.633653163909912, + "learning_rate": 8.341851791693159e-06, + "loss": 0.1806, + "step": 10841 + }, + { + "epoch": 0.2743629324088367, + "grad_norm": 8.84292221069336, + "learning_rate": 8.341553116948533e-06, + "loss": 0.2901, + "step": 10842 + }, + { + "epoch": 0.2743882379735304, + "grad_norm": 8.82027530670166, + "learning_rate": 8.341254420654853e-06, + "loss": 0.2508, + "step": 10843 + }, + { + "epoch": 0.27441354353822406, + "grad_norm": 11.615265846252441, + "learning_rate": 8.34095570281404e-06, + "loss": 0.2931, + "step": 10844 + }, + { + "epoch": 0.2744388491029177, + "grad_norm": 9.562122344970703, + "learning_rate": 8.34065696342803e-06, + "loss": 0.1932, + "step": 10845 + }, + { + "epoch": 0.2744641546676114, + "grad_norm": 3.023063898086548, + "learning_rate": 8.340358202498741e-06, + "loss": 0.185, + "step": 10846 + }, + { + "epoch": 0.2744894602323051, + "grad_norm": 6.075410842895508, + "learning_rate": 8.340059420028105e-06, + "loss": 0.1903, + "step": 10847 + }, + { + "epoch": 0.27451476579699874, + "grad_norm": 4.938581466674805, + "learning_rate": 8.339760616018048e-06, + "loss": 0.1885, + "step": 10848 + }, + { + "epoch": 0.27454007136169245, + "grad_norm": 3.5243964195251465, + "learning_rate": 8.339461790470496e-06, + "loss": 0.172, + "step": 10849 + }, + { + "epoch": 0.2745653769263861, + "grad_norm": 6.897732257843018, + "learning_rate": 8.339162943387376e-06, + "loss": 0.24, + "step": 10850 + }, + { + "epoch": 0.27459068249107976, + "grad_norm": 16.283554077148438, + "learning_rate": 8.338864074770614e-06, + "loss": 0.3505, + "step": 10851 + }, + { + "epoch": 0.2746159880557735, + "grad_norm": 7.327380180358887, + "learning_rate": 8.33856518462214e-06, + "loss": 0.164, + "step": 10852 + }, + { + "epoch": 0.27464129362046713, + "grad_norm": 3.111697196960449, + "learning_rate": 8.33826627294388e-06, + "loss": 0.1598, + "step": 10853 + }, + { + "epoch": 0.27466659918516084, + "grad_norm": 3.3092994689941406, + "learning_rate": 8.337967339737763e-06, + "loss": 0.1405, + "step": 10854 + }, + { + "epoch": 0.2746919047498545, + "grad_norm": 4.164897441864014, + "learning_rate": 8.337668385005713e-06, + "loss": 0.152, + "step": 10855 + }, + { + "epoch": 0.27471721031454815, + "grad_norm": 6.313178062438965, + "learning_rate": 8.33736940874966e-06, + "loss": 0.1914, + "step": 10856 + }, + { + "epoch": 0.27474251587924187, + "grad_norm": 4.748938083648682, + "learning_rate": 8.337070410971534e-06, + "loss": 0.1387, + "step": 10857 + }, + { + "epoch": 0.2747678214439355, + "grad_norm": 18.091812133789062, + "learning_rate": 8.336771391673261e-06, + "loss": 0.1816, + "step": 10858 + }, + { + "epoch": 0.2747931270086292, + "grad_norm": 4.283593654632568, + "learning_rate": 8.336472350856772e-06, + "loss": 0.1969, + "step": 10859 + }, + { + "epoch": 0.2748184325733229, + "grad_norm": 19.94152069091797, + "learning_rate": 8.336173288523992e-06, + "loss": 0.1408, + "step": 10860 + }, + { + "epoch": 0.27484373813801655, + "grad_norm": 6.650412082672119, + "learning_rate": 8.335874204676851e-06, + "loss": 0.2614, + "step": 10861 + }, + { + "epoch": 0.2748690437027102, + "grad_norm": 5.284524440765381, + "learning_rate": 8.335575099317278e-06, + "loss": 0.1855, + "step": 10862 + }, + { + "epoch": 0.2748943492674039, + "grad_norm": 5.760485649108887, + "learning_rate": 8.335275972447201e-06, + "loss": 0.2469, + "step": 10863 + }, + { + "epoch": 0.27491965483209757, + "grad_norm": 7.729361534118652, + "learning_rate": 8.33497682406855e-06, + "loss": 0.1782, + "step": 10864 + }, + { + "epoch": 0.2749449603967912, + "grad_norm": 7.899010181427002, + "learning_rate": 8.334677654183254e-06, + "loss": 0.2035, + "step": 10865 + }, + { + "epoch": 0.27497026596148494, + "grad_norm": 4.347992420196533, + "learning_rate": 8.334378462793241e-06, + "loss": 0.2297, + "step": 10866 + }, + { + "epoch": 0.2749955715261786, + "grad_norm": 4.973597049713135, + "learning_rate": 8.334079249900443e-06, + "loss": 0.1905, + "step": 10867 + }, + { + "epoch": 0.2750208770908723, + "grad_norm": 7.45785665512085, + "learning_rate": 8.33378001550679e-06, + "loss": 0.232, + "step": 10868 + }, + { + "epoch": 0.27504618265556596, + "grad_norm": 3.2233269214630127, + "learning_rate": 8.333480759614205e-06, + "loss": 0.131, + "step": 10869 + }, + { + "epoch": 0.2750714882202596, + "grad_norm": 14.12924861907959, + "learning_rate": 8.333181482224625e-06, + "loss": 0.2934, + "step": 10870 + }, + { + "epoch": 0.27509679378495333, + "grad_norm": 8.276695251464844, + "learning_rate": 8.332882183339978e-06, + "loss": 0.1195, + "step": 10871 + }, + { + "epoch": 0.275122099349647, + "grad_norm": 11.667404174804688, + "learning_rate": 8.332582862962196e-06, + "loss": 0.2589, + "step": 10872 + }, + { + "epoch": 0.27514740491434064, + "grad_norm": 4.999871253967285, + "learning_rate": 8.332283521093204e-06, + "loss": 0.2232, + "step": 10873 + }, + { + "epoch": 0.27517271047903435, + "grad_norm": 7.6086745262146, + "learning_rate": 8.331984157734937e-06, + "loss": 0.2102, + "step": 10874 + }, + { + "epoch": 0.275198016043728, + "grad_norm": 4.742599964141846, + "learning_rate": 8.33168477288932e-06, + "loss": 0.2637, + "step": 10875 + }, + { + "epoch": 0.27522332160842167, + "grad_norm": 5.435588836669922, + "learning_rate": 8.331385366558292e-06, + "loss": 0.2229, + "step": 10876 + }, + { + "epoch": 0.2752486271731154, + "grad_norm": 9.993029594421387, + "learning_rate": 8.331085938743777e-06, + "loss": 0.1201, + "step": 10877 + }, + { + "epoch": 0.27527393273780904, + "grad_norm": 5.308750629425049, + "learning_rate": 8.33078648944771e-06, + "loss": 0.2407, + "step": 10878 + }, + { + "epoch": 0.27529923830250275, + "grad_norm": 5.74211311340332, + "learning_rate": 8.33048701867202e-06, + "loss": 0.1666, + "step": 10879 + }, + { + "epoch": 0.2753245438671964, + "grad_norm": 4.114747524261475, + "learning_rate": 8.330187526418639e-06, + "loss": 0.2023, + "step": 10880 + }, + { + "epoch": 0.27534984943189006, + "grad_norm": 5.863961219787598, + "learning_rate": 8.329888012689497e-06, + "loss": 0.216, + "step": 10881 + }, + { + "epoch": 0.27537515499658377, + "grad_norm": 8.312832832336426, + "learning_rate": 8.329588477486528e-06, + "loss": 0.1953, + "step": 10882 + }, + { + "epoch": 0.2754004605612774, + "grad_norm": 5.088934898376465, + "learning_rate": 8.329288920811662e-06, + "loss": 0.2594, + "step": 10883 + }, + { + "epoch": 0.2754257661259711, + "grad_norm": 4.258525371551514, + "learning_rate": 8.32898934266683e-06, + "loss": 0.1404, + "step": 10884 + }, + { + "epoch": 0.2754510716906648, + "grad_norm": 4.475386142730713, + "learning_rate": 8.328689743053966e-06, + "loss": 0.1506, + "step": 10885 + }, + { + "epoch": 0.27547637725535845, + "grad_norm": 5.962909698486328, + "learning_rate": 8.328390121975e-06, + "loss": 0.1598, + "step": 10886 + }, + { + "epoch": 0.2755016828200521, + "grad_norm": 5.72971773147583, + "learning_rate": 8.328090479431868e-06, + "loss": 0.1967, + "step": 10887 + }, + { + "epoch": 0.2755269883847458, + "grad_norm": 5.4188666343688965, + "learning_rate": 8.327790815426497e-06, + "loss": 0.1841, + "step": 10888 + }, + { + "epoch": 0.2755522939494395, + "grad_norm": 3.1605865955352783, + "learning_rate": 8.327491129960822e-06, + "loss": 0.1767, + "step": 10889 + }, + { + "epoch": 0.27557759951413313, + "grad_norm": 5.608345985412598, + "learning_rate": 8.327191423036776e-06, + "loss": 0.1629, + "step": 10890 + }, + { + "epoch": 0.27560290507882684, + "grad_norm": 11.934582710266113, + "learning_rate": 8.326891694656291e-06, + "loss": 0.2136, + "step": 10891 + }, + { + "epoch": 0.2756282106435205, + "grad_norm": 5.439445495605469, + "learning_rate": 8.326591944821303e-06, + "loss": 0.2393, + "step": 10892 + }, + { + "epoch": 0.2756535162082142, + "grad_norm": 8.681201934814453, + "learning_rate": 8.326292173533739e-06, + "loss": 0.2217, + "step": 10893 + }, + { + "epoch": 0.27567882177290787, + "grad_norm": 4.612142086029053, + "learning_rate": 8.325992380795538e-06, + "loss": 0.2021, + "step": 10894 + }, + { + "epoch": 0.2757041273376015, + "grad_norm": 5.66092586517334, + "learning_rate": 8.325692566608628e-06, + "loss": 0.1867, + "step": 10895 + }, + { + "epoch": 0.27572943290229524, + "grad_norm": 3.9436538219451904, + "learning_rate": 8.325392730974949e-06, + "loss": 0.1729, + "step": 10896 + }, + { + "epoch": 0.2757547384669889, + "grad_norm": 4.621226787567139, + "learning_rate": 8.325092873896427e-06, + "loss": 0.1449, + "step": 10897 + }, + { + "epoch": 0.27578004403168255, + "grad_norm": 3.34588885307312, + "learning_rate": 8.324792995375001e-06, + "loss": 0.1129, + "step": 10898 + }, + { + "epoch": 0.27580534959637626, + "grad_norm": 4.8059983253479, + "learning_rate": 8.324493095412604e-06, + "loss": 0.1813, + "step": 10899 + }, + { + "epoch": 0.2758306551610699, + "grad_norm": 11.396769523620605, + "learning_rate": 8.324193174011169e-06, + "loss": 0.205, + "step": 10900 + }, + { + "epoch": 0.27585596072576357, + "grad_norm": 9.632155418395996, + "learning_rate": 8.323893231172631e-06, + "loss": 0.2785, + "step": 10901 + }, + { + "epoch": 0.2758812662904573, + "grad_norm": 10.35649585723877, + "learning_rate": 8.323593266898923e-06, + "loss": 0.1832, + "step": 10902 + }, + { + "epoch": 0.27590657185515094, + "grad_norm": 3.413151741027832, + "learning_rate": 8.323293281191982e-06, + "loss": 0.1498, + "step": 10903 + }, + { + "epoch": 0.27593187741984465, + "grad_norm": 18.706693649291992, + "learning_rate": 8.32299327405374e-06, + "loss": 0.4446, + "step": 10904 + }, + { + "epoch": 0.2759571829845383, + "grad_norm": 5.8808488845825195, + "learning_rate": 8.322693245486132e-06, + "loss": 0.2278, + "step": 10905 + }, + { + "epoch": 0.27598248854923196, + "grad_norm": 4.6967363357543945, + "learning_rate": 8.322393195491094e-06, + "loss": 0.2312, + "step": 10906 + }, + { + "epoch": 0.2760077941139257, + "grad_norm": 3.6919689178466797, + "learning_rate": 8.322093124070559e-06, + "loss": 0.1428, + "step": 10907 + }, + { + "epoch": 0.27603309967861933, + "grad_norm": 6.017293930053711, + "learning_rate": 8.321793031226464e-06, + "loss": 0.1816, + "step": 10908 + }, + { + "epoch": 0.276058405243313, + "grad_norm": 7.693678855895996, + "learning_rate": 8.321492916960745e-06, + "loss": 0.2755, + "step": 10909 + }, + { + "epoch": 0.2760837108080067, + "grad_norm": 8.020801544189453, + "learning_rate": 8.321192781275334e-06, + "loss": 0.273, + "step": 10910 + }, + { + "epoch": 0.27610901637270036, + "grad_norm": 7.628782272338867, + "learning_rate": 8.320892624172169e-06, + "loss": 0.2567, + "step": 10911 + }, + { + "epoch": 0.276134321937394, + "grad_norm": 5.337718963623047, + "learning_rate": 8.320592445653186e-06, + "loss": 0.1634, + "step": 10912 + }, + { + "epoch": 0.2761596275020877, + "grad_norm": 5.561782360076904, + "learning_rate": 8.32029224572032e-06, + "loss": 0.2099, + "step": 10913 + }, + { + "epoch": 0.2761849330667814, + "grad_norm": 4.2986226081848145, + "learning_rate": 8.319992024375506e-06, + "loss": 0.209, + "step": 10914 + }, + { + "epoch": 0.27621023863147504, + "grad_norm": 10.33103084564209, + "learning_rate": 8.319691781620682e-06, + "loss": 0.2005, + "step": 10915 + }, + { + "epoch": 0.27623554419616875, + "grad_norm": 5.908377647399902, + "learning_rate": 8.319391517457781e-06, + "loss": 0.2083, + "step": 10916 + }, + { + "epoch": 0.2762608497608624, + "grad_norm": 4.369060039520264, + "learning_rate": 8.319091231888744e-06, + "loss": 0.207, + "step": 10917 + }, + { + "epoch": 0.2762861553255561, + "grad_norm": 6.533609390258789, + "learning_rate": 8.318790924915506e-06, + "loss": 0.1703, + "step": 10918 + }, + { + "epoch": 0.27631146089024977, + "grad_norm": 4.622640609741211, + "learning_rate": 8.31849059654e-06, + "loss": 0.2585, + "step": 10919 + }, + { + "epoch": 0.2763367664549434, + "grad_norm": 3.999263048171997, + "learning_rate": 8.318190246764166e-06, + "loss": 0.1442, + "step": 10920 + }, + { + "epoch": 0.27636207201963714, + "grad_norm": 11.065071105957031, + "learning_rate": 8.31788987558994e-06, + "loss": 0.1725, + "step": 10921 + }, + { + "epoch": 0.2763873775843308, + "grad_norm": 2.960496187210083, + "learning_rate": 8.31758948301926e-06, + "loss": 0.1796, + "step": 10922 + }, + { + "epoch": 0.27641268314902445, + "grad_norm": 7.1328630447387695, + "learning_rate": 8.317289069054064e-06, + "loss": 0.1815, + "step": 10923 + }, + { + "epoch": 0.27643798871371816, + "grad_norm": 4.398410797119141, + "learning_rate": 8.316988633696286e-06, + "loss": 0.1335, + "step": 10924 + }, + { + "epoch": 0.2764632942784118, + "grad_norm": 7.6274824142456055, + "learning_rate": 8.316688176947866e-06, + "loss": 0.2413, + "step": 10925 + }, + { + "epoch": 0.2764885998431055, + "grad_norm": 3.843219041824341, + "learning_rate": 8.31638769881074e-06, + "loss": 0.1843, + "step": 10926 + }, + { + "epoch": 0.2765139054077992, + "grad_norm": 4.791032314300537, + "learning_rate": 8.316087199286848e-06, + "loss": 0.1722, + "step": 10927 + }, + { + "epoch": 0.27653921097249284, + "grad_norm": 4.548641204833984, + "learning_rate": 8.315786678378126e-06, + "loss": 0.2067, + "step": 10928 + }, + { + "epoch": 0.2765645165371865, + "grad_norm": 3.5082478523254395, + "learning_rate": 8.315486136086512e-06, + "loss": 0.1411, + "step": 10929 + }, + { + "epoch": 0.2765898221018802, + "grad_norm": 5.543331146240234, + "learning_rate": 8.315185572413944e-06, + "loss": 0.219, + "step": 10930 + }, + { + "epoch": 0.27661512766657387, + "grad_norm": 7.75986385345459, + "learning_rate": 8.314884987362363e-06, + "loss": 0.0922, + "step": 10931 + }, + { + "epoch": 0.2766404332312676, + "grad_norm": 4.041108131408691, + "learning_rate": 8.314584380933704e-06, + "loss": 0.1587, + "step": 10932 + }, + { + "epoch": 0.27666573879596124, + "grad_norm": 5.764856815338135, + "learning_rate": 8.314283753129907e-06, + "loss": 0.2189, + "step": 10933 + }, + { + "epoch": 0.2766910443606549, + "grad_norm": 5.1198859214782715, + "learning_rate": 8.313983103952912e-06, + "loss": 0.2106, + "step": 10934 + }, + { + "epoch": 0.2767163499253486, + "grad_norm": 10.206798553466797, + "learning_rate": 8.313682433404654e-06, + "loss": 0.2698, + "step": 10935 + }, + { + "epoch": 0.27674165549004226, + "grad_norm": 4.730399131774902, + "learning_rate": 8.313381741487076e-06, + "loss": 0.2351, + "step": 10936 + }, + { + "epoch": 0.2767669610547359, + "grad_norm": 2.9862964153289795, + "learning_rate": 8.313081028202114e-06, + "loss": 0.1867, + "step": 10937 + }, + { + "epoch": 0.2767922666194296, + "grad_norm": 27.68906021118164, + "learning_rate": 8.31278029355171e-06, + "loss": 0.3423, + "step": 10938 + }, + { + "epoch": 0.2768175721841233, + "grad_norm": 2.9944961071014404, + "learning_rate": 8.312479537537803e-06, + "loss": 0.1136, + "step": 10939 + }, + { + "epoch": 0.27684287774881694, + "grad_norm": 2.779853343963623, + "learning_rate": 8.31217876016233e-06, + "loss": 0.1185, + "step": 10940 + }, + { + "epoch": 0.27686818331351065, + "grad_norm": 4.52351713180542, + "learning_rate": 8.311877961427233e-06, + "loss": 0.1638, + "step": 10941 + }, + { + "epoch": 0.2768934888782043, + "grad_norm": 16.83654022216797, + "learning_rate": 8.311577141334451e-06, + "loss": 0.4608, + "step": 10942 + }, + { + "epoch": 0.276918794442898, + "grad_norm": 3.475454092025757, + "learning_rate": 8.311276299885924e-06, + "loss": 0.1419, + "step": 10943 + }, + { + "epoch": 0.2769441000075917, + "grad_norm": 8.587817192077637, + "learning_rate": 8.31097543708359e-06, + "loss": 0.1923, + "step": 10944 + }, + { + "epoch": 0.27696940557228533, + "grad_norm": 4.585927963256836, + "learning_rate": 8.310674552929395e-06, + "loss": 0.2087, + "step": 10945 + }, + { + "epoch": 0.27699471113697904, + "grad_norm": 8.346138954162598, + "learning_rate": 8.310373647425273e-06, + "loss": 0.1889, + "step": 10946 + }, + { + "epoch": 0.2770200167016727, + "grad_norm": 3.6729085445404053, + "learning_rate": 8.310072720573168e-06, + "loss": 0.1203, + "step": 10947 + }, + { + "epoch": 0.27704532226636636, + "grad_norm": 12.881928443908691, + "learning_rate": 8.30977177237502e-06, + "loss": 0.103, + "step": 10948 + }, + { + "epoch": 0.27707062783106007, + "grad_norm": 3.4915246963500977, + "learning_rate": 8.30947080283277e-06, + "loss": 0.1613, + "step": 10949 + }, + { + "epoch": 0.2770959333957537, + "grad_norm": 7.322198390960693, + "learning_rate": 8.309169811948357e-06, + "loss": 0.2106, + "step": 10950 + }, + { + "epoch": 0.2771212389604474, + "grad_norm": 8.078361511230469, + "learning_rate": 8.308868799723725e-06, + "loss": 0.258, + "step": 10951 + }, + { + "epoch": 0.2771465445251411, + "grad_norm": 7.683990955352783, + "learning_rate": 8.308567766160811e-06, + "loss": 0.1818, + "step": 10952 + }, + { + "epoch": 0.27717185008983475, + "grad_norm": 3.228022336959839, + "learning_rate": 8.308266711261562e-06, + "loss": 0.1307, + "step": 10953 + }, + { + "epoch": 0.2771971556545284, + "grad_norm": 8.581711769104004, + "learning_rate": 8.307965635027914e-06, + "loss": 0.1839, + "step": 10954 + }, + { + "epoch": 0.2772224612192221, + "grad_norm": 11.775225639343262, + "learning_rate": 8.307664537461812e-06, + "loss": 0.2572, + "step": 10955 + }, + { + "epoch": 0.27724776678391577, + "grad_norm": 13.11594009399414, + "learning_rate": 8.307363418565197e-06, + "loss": 0.1426, + "step": 10956 + }, + { + "epoch": 0.2772730723486095, + "grad_norm": 12.858345985412598, + "learning_rate": 8.30706227834001e-06, + "loss": 0.3418, + "step": 10957 + }, + { + "epoch": 0.27729837791330314, + "grad_norm": 9.326244354248047, + "learning_rate": 8.306761116788193e-06, + "loss": 0.2777, + "step": 10958 + }, + { + "epoch": 0.2773236834779968, + "grad_norm": 4.126452922821045, + "learning_rate": 8.306459933911689e-06, + "loss": 0.1282, + "step": 10959 + }, + { + "epoch": 0.2773489890426905, + "grad_norm": 4.511930465698242, + "learning_rate": 8.306158729712442e-06, + "loss": 0.1732, + "step": 10960 + }, + { + "epoch": 0.27737429460738416, + "grad_norm": 3.670100450515747, + "learning_rate": 8.30585750419239e-06, + "loss": 0.1643, + "step": 10961 + }, + { + "epoch": 0.2773996001720778, + "grad_norm": 23.601354598999023, + "learning_rate": 8.305556257353477e-06, + "loss": 0.2131, + "step": 10962 + }, + { + "epoch": 0.27742490573677153, + "grad_norm": 3.7912673950195312, + "learning_rate": 8.305254989197648e-06, + "loss": 0.1436, + "step": 10963 + }, + { + "epoch": 0.2774502113014652, + "grad_norm": 8.67268180847168, + "learning_rate": 8.304953699726846e-06, + "loss": 0.3354, + "step": 10964 + }, + { + "epoch": 0.27747551686615884, + "grad_norm": 4.934019565582275, + "learning_rate": 8.30465238894301e-06, + "loss": 0.2044, + "step": 10965 + }, + { + "epoch": 0.27750082243085256, + "grad_norm": 5.291407585144043, + "learning_rate": 8.304351056848085e-06, + "loss": 0.1829, + "step": 10966 + }, + { + "epoch": 0.2775261279955462, + "grad_norm": 8.636017799377441, + "learning_rate": 8.304049703444016e-06, + "loss": 0.1236, + "step": 10967 + }, + { + "epoch": 0.2775514335602399, + "grad_norm": 4.319156646728516, + "learning_rate": 8.303748328732746e-06, + "loss": 0.1349, + "step": 10968 + }, + { + "epoch": 0.2775767391249336, + "grad_norm": 10.96149730682373, + "learning_rate": 8.303446932716217e-06, + "loss": 0.2053, + "step": 10969 + }, + { + "epoch": 0.27760204468962724, + "grad_norm": 12.75313949584961, + "learning_rate": 8.303145515396372e-06, + "loss": 0.3985, + "step": 10970 + }, + { + "epoch": 0.27762735025432095, + "grad_norm": 12.817320823669434, + "learning_rate": 8.302844076775158e-06, + "loss": 0.2824, + "step": 10971 + }, + { + "epoch": 0.2776526558190146, + "grad_norm": 9.457571983337402, + "learning_rate": 8.302542616854514e-06, + "loss": 0.3483, + "step": 10972 + }, + { + "epoch": 0.27767796138370826, + "grad_norm": 7.72842264175415, + "learning_rate": 8.30224113563639e-06, + "loss": 0.2307, + "step": 10973 + }, + { + "epoch": 0.27770326694840197, + "grad_norm": 4.095849514007568, + "learning_rate": 8.301939633122726e-06, + "loss": 0.183, + "step": 10974 + }, + { + "epoch": 0.27772857251309563, + "grad_norm": 4.260350227355957, + "learning_rate": 8.301638109315466e-06, + "loss": 0.1656, + "step": 10975 + }, + { + "epoch": 0.2777538780777893, + "grad_norm": 17.95979881286621, + "learning_rate": 8.301336564216557e-06, + "loss": 0.3704, + "step": 10976 + }, + { + "epoch": 0.277779183642483, + "grad_norm": 6.39158821105957, + "learning_rate": 8.301034997827943e-06, + "loss": 0.1728, + "step": 10977 + }, + { + "epoch": 0.27780448920717665, + "grad_norm": 7.081830978393555, + "learning_rate": 8.30073341015157e-06, + "loss": 0.237, + "step": 10978 + }, + { + "epoch": 0.2778297947718703, + "grad_norm": 9.806777000427246, + "learning_rate": 8.300431801189379e-06, + "loss": 0.1908, + "step": 10979 + }, + { + "epoch": 0.277855100336564, + "grad_norm": 10.32082462310791, + "learning_rate": 8.300130170943318e-06, + "loss": 0.3411, + "step": 10980 + }, + { + "epoch": 0.2778804059012577, + "grad_norm": 12.668102264404297, + "learning_rate": 8.299828519415331e-06, + "loss": 0.1614, + "step": 10981 + }, + { + "epoch": 0.2779057114659514, + "grad_norm": 4.8289313316345215, + "learning_rate": 8.299526846607364e-06, + "loss": 0.1972, + "step": 10982 + }, + { + "epoch": 0.27793101703064504, + "grad_norm": 34.98325729370117, + "learning_rate": 8.299225152521362e-06, + "loss": 0.2889, + "step": 10983 + }, + { + "epoch": 0.2779563225953387, + "grad_norm": 4.151366710662842, + "learning_rate": 8.29892343715927e-06, + "loss": 0.2218, + "step": 10984 + }, + { + "epoch": 0.2779816281600324, + "grad_norm": 4.760356426239014, + "learning_rate": 8.298621700523035e-06, + "loss": 0.218, + "step": 10985 + }, + { + "epoch": 0.27800693372472607, + "grad_norm": 10.98462200164795, + "learning_rate": 8.298319942614603e-06, + "loss": 0.2228, + "step": 10986 + }, + { + "epoch": 0.2780322392894197, + "grad_norm": 4.055786609649658, + "learning_rate": 8.298018163435919e-06, + "loss": 0.197, + "step": 10987 + }, + { + "epoch": 0.27805754485411344, + "grad_norm": 4.17803430557251, + "learning_rate": 8.297716362988928e-06, + "loss": 0.1688, + "step": 10988 + }, + { + "epoch": 0.2780828504188071, + "grad_norm": 5.676901340484619, + "learning_rate": 8.29741454127558e-06, + "loss": 0.2149, + "step": 10989 + }, + { + "epoch": 0.27810815598350075, + "grad_norm": 3.204300880432129, + "learning_rate": 8.297112698297816e-06, + "loss": 0.1306, + "step": 10990 + }, + { + "epoch": 0.27813346154819446, + "grad_norm": 8.815461158752441, + "learning_rate": 8.296810834057589e-06, + "loss": 0.1758, + "step": 10991 + }, + { + "epoch": 0.2781587671128881, + "grad_norm": 5.871033191680908, + "learning_rate": 8.296508948556839e-06, + "loss": 0.2222, + "step": 10992 + }, + { + "epoch": 0.2781840726775818, + "grad_norm": 3.1817233562469482, + "learning_rate": 8.296207041797518e-06, + "loss": 0.1336, + "step": 10993 + }, + { + "epoch": 0.2782093782422755, + "grad_norm": 3.9266860485076904, + "learning_rate": 8.295905113781568e-06, + "loss": 0.1483, + "step": 10994 + }, + { + "epoch": 0.27823468380696914, + "grad_norm": 3.9794199466705322, + "learning_rate": 8.295603164510941e-06, + "loss": 0.1216, + "step": 10995 + }, + { + "epoch": 0.27825998937166285, + "grad_norm": 2.826538324356079, + "learning_rate": 8.295301193987583e-06, + "loss": 0.1204, + "step": 10996 + }, + { + "epoch": 0.2782852949363565, + "grad_norm": 10.164022445678711, + "learning_rate": 8.294999202213441e-06, + "loss": 0.3495, + "step": 10997 + }, + { + "epoch": 0.27831060050105016, + "grad_norm": 5.546726226806641, + "learning_rate": 8.29469718919046e-06, + "loss": 0.175, + "step": 10998 + }, + { + "epoch": 0.2783359060657439, + "grad_norm": 10.787284851074219, + "learning_rate": 8.29439515492059e-06, + "loss": 0.203, + "step": 10999 + }, + { + "epoch": 0.27836121163043753, + "grad_norm": 3.611607313156128, + "learning_rate": 8.29409309940578e-06, + "loss": 0.1657, + "step": 11000 + }, + { + "epoch": 0.2783865171951312, + "grad_norm": 3.3908748626708984, + "learning_rate": 8.293791022647976e-06, + "loss": 0.1306, + "step": 11001 + }, + { + "epoch": 0.2784118227598249, + "grad_norm": 9.540270805358887, + "learning_rate": 8.293488924649126e-06, + "loss": 0.1914, + "step": 11002 + }, + { + "epoch": 0.27843712832451856, + "grad_norm": 5.08247709274292, + "learning_rate": 8.293186805411177e-06, + "loss": 0.1892, + "step": 11003 + }, + { + "epoch": 0.2784624338892122, + "grad_norm": 5.68326473236084, + "learning_rate": 8.292884664936082e-06, + "loss": 0.1901, + "step": 11004 + }, + { + "epoch": 0.2784877394539059, + "grad_norm": 5.138784885406494, + "learning_rate": 8.292582503225784e-06, + "loss": 0.1379, + "step": 11005 + }, + { + "epoch": 0.2785130450185996, + "grad_norm": 7.548203945159912, + "learning_rate": 8.292280320282233e-06, + "loss": 0.2413, + "step": 11006 + }, + { + "epoch": 0.2785383505832933, + "grad_norm": 4.576005935668945, + "learning_rate": 8.291978116107383e-06, + "loss": 0.1321, + "step": 11007 + }, + { + "epoch": 0.27856365614798695, + "grad_norm": 9.889975547790527, + "learning_rate": 8.291675890703173e-06, + "loss": 0.2889, + "step": 11008 + }, + { + "epoch": 0.2785889617126806, + "grad_norm": 6.00705099105835, + "learning_rate": 8.29137364407156e-06, + "loss": 0.1651, + "step": 11009 + }, + { + "epoch": 0.2786142672773743, + "grad_norm": 5.121209621429443, + "learning_rate": 8.291071376214493e-06, + "loss": 0.1862, + "step": 11010 + }, + { + "epoch": 0.278639572842068, + "grad_norm": 5.370815277099609, + "learning_rate": 8.290769087133916e-06, + "loss": 0.1993, + "step": 11011 + }, + { + "epoch": 0.27866487840676163, + "grad_norm": 8.430990219116211, + "learning_rate": 8.290466776831782e-06, + "loss": 0.2562, + "step": 11012 + }, + { + "epoch": 0.27869018397145534, + "grad_norm": 7.666898727416992, + "learning_rate": 8.290164445310039e-06, + "loss": 0.3397, + "step": 11013 + }, + { + "epoch": 0.278715489536149, + "grad_norm": 7.307922840118408, + "learning_rate": 8.289862092570638e-06, + "loss": 0.2601, + "step": 11014 + }, + { + "epoch": 0.27874079510084265, + "grad_norm": 3.6860413551330566, + "learning_rate": 8.28955971861553e-06, + "loss": 0.1547, + "step": 11015 + }, + { + "epoch": 0.27876610066553636, + "grad_norm": 8.682196617126465, + "learning_rate": 8.28925732344666e-06, + "loss": 0.2049, + "step": 11016 + }, + { + "epoch": 0.27879140623023, + "grad_norm": 4.6129608154296875, + "learning_rate": 8.288954907065983e-06, + "loss": 0.1425, + "step": 11017 + }, + { + "epoch": 0.2788167117949237, + "grad_norm": 5.912996292114258, + "learning_rate": 8.288652469475449e-06, + "loss": 0.2258, + "step": 11018 + }, + { + "epoch": 0.2788420173596174, + "grad_norm": 4.855042934417725, + "learning_rate": 8.288350010677006e-06, + "loss": 0.2307, + "step": 11019 + }, + { + "epoch": 0.27886732292431105, + "grad_norm": 2.8988802433013916, + "learning_rate": 8.288047530672606e-06, + "loss": 0.1311, + "step": 11020 + }, + { + "epoch": 0.27889262848900476, + "grad_norm": 5.964665412902832, + "learning_rate": 8.287745029464195e-06, + "loss": 0.1719, + "step": 11021 + }, + { + "epoch": 0.2789179340536984, + "grad_norm": 10.698740005493164, + "learning_rate": 8.287442507053734e-06, + "loss": 0.2415, + "step": 11022 + }, + { + "epoch": 0.27894323961839207, + "grad_norm": 5.667135715484619, + "learning_rate": 8.287139963443164e-06, + "loss": 0.1462, + "step": 11023 + }, + { + "epoch": 0.2789685451830858, + "grad_norm": 2.4018380641937256, + "learning_rate": 8.286837398634442e-06, + "loss": 0.1203, + "step": 11024 + }, + { + "epoch": 0.27899385074777944, + "grad_norm": 7.280552864074707, + "learning_rate": 8.286534812629514e-06, + "loss": 0.2004, + "step": 11025 + }, + { + "epoch": 0.2790191563124731, + "grad_norm": 4.902568817138672, + "learning_rate": 8.286232205430336e-06, + "loss": 0.1575, + "step": 11026 + }, + { + "epoch": 0.2790444618771668, + "grad_norm": 17.578479766845703, + "learning_rate": 8.285929577038858e-06, + "loss": 0.1976, + "step": 11027 + }, + { + "epoch": 0.27906976744186046, + "grad_norm": 5.690922260284424, + "learning_rate": 8.285626927457032e-06, + "loss": 0.1996, + "step": 11028 + }, + { + "epoch": 0.2790950730065541, + "grad_norm": 6.297248840332031, + "learning_rate": 8.285324256686808e-06, + "loss": 0.1632, + "step": 11029 + }, + { + "epoch": 0.27912037857124783, + "grad_norm": 8.440645217895508, + "learning_rate": 8.285021564730141e-06, + "loss": 0.2656, + "step": 11030 + }, + { + "epoch": 0.2791456841359415, + "grad_norm": 2.545527935028076, + "learning_rate": 8.284718851588978e-06, + "loss": 0.1068, + "step": 11031 + }, + { + "epoch": 0.2791709897006352, + "grad_norm": 7.569794654846191, + "learning_rate": 8.284416117265276e-06, + "loss": 0.2334, + "step": 11032 + }, + { + "epoch": 0.27919629526532885, + "grad_norm": 10.891339302062988, + "learning_rate": 8.284113361760985e-06, + "loss": 0.2394, + "step": 11033 + }, + { + "epoch": 0.2792216008300225, + "grad_norm": 6.194827556610107, + "learning_rate": 8.283810585078058e-06, + "loss": 0.2376, + "step": 11034 + }, + { + "epoch": 0.2792469063947162, + "grad_norm": 2.945727825164795, + "learning_rate": 8.283507787218447e-06, + "loss": 0.1347, + "step": 11035 + }, + { + "epoch": 0.2792722119594099, + "grad_norm": 3.7184722423553467, + "learning_rate": 8.283204968184103e-06, + "loss": 0.1408, + "step": 11036 + }, + { + "epoch": 0.27929751752410353, + "grad_norm": 11.02608585357666, + "learning_rate": 8.282902127976985e-06, + "loss": 0.2031, + "step": 11037 + }, + { + "epoch": 0.27932282308879725, + "grad_norm": 4.584911823272705, + "learning_rate": 8.282599266599041e-06, + "loss": 0.1843, + "step": 11038 + }, + { + "epoch": 0.2793481286534909, + "grad_norm": 5.390825271606445, + "learning_rate": 8.282296384052222e-06, + "loss": 0.1733, + "step": 11039 + }, + { + "epoch": 0.27937343421818456, + "grad_norm": 6.062172889709473, + "learning_rate": 8.281993480338487e-06, + "loss": 0.1578, + "step": 11040 + }, + { + "epoch": 0.27939873978287827, + "grad_norm": 4.8249616622924805, + "learning_rate": 8.281690555459785e-06, + "loss": 0.1818, + "step": 11041 + }, + { + "epoch": 0.2794240453475719, + "grad_norm": 4.036491394042969, + "learning_rate": 8.281387609418072e-06, + "loss": 0.0855, + "step": 11042 + }, + { + "epoch": 0.2794493509122656, + "grad_norm": 11.187702178955078, + "learning_rate": 8.2810846422153e-06, + "loss": 0.3662, + "step": 11043 + }, + { + "epoch": 0.2794746564769593, + "grad_norm": 5.917755603790283, + "learning_rate": 8.280781653853424e-06, + "loss": 0.2018, + "step": 11044 + }, + { + "epoch": 0.27949996204165295, + "grad_norm": 6.162342548370361, + "learning_rate": 8.280478644334398e-06, + "loss": 0.2211, + "step": 11045 + }, + { + "epoch": 0.27952526760634666, + "grad_norm": 8.642840385437012, + "learning_rate": 8.280175613660175e-06, + "loss": 0.2215, + "step": 11046 + }, + { + "epoch": 0.2795505731710403, + "grad_norm": 5.341919898986816, + "learning_rate": 8.279872561832709e-06, + "loss": 0.1543, + "step": 11047 + }, + { + "epoch": 0.279575878735734, + "grad_norm": 5.656745433807373, + "learning_rate": 8.279569488853954e-06, + "loss": 0.158, + "step": 11048 + }, + { + "epoch": 0.2796011843004277, + "grad_norm": 6.96244478225708, + "learning_rate": 8.279266394725867e-06, + "loss": 0.229, + "step": 11049 + }, + { + "epoch": 0.27962648986512134, + "grad_norm": 15.424239158630371, + "learning_rate": 8.2789632794504e-06, + "loss": 0.2512, + "step": 11050 + }, + { + "epoch": 0.279651795429815, + "grad_norm": 8.07665729522705, + "learning_rate": 8.27866014302951e-06, + "loss": 0.1462, + "step": 11051 + }, + { + "epoch": 0.2796771009945087, + "grad_norm": 4.685385704040527, + "learning_rate": 8.278356985465151e-06, + "loss": 0.2022, + "step": 11052 + }, + { + "epoch": 0.27970240655920237, + "grad_norm": 3.7599756717681885, + "learning_rate": 8.278053806759276e-06, + "loss": 0.1247, + "step": 11053 + }, + { + "epoch": 0.279727712123896, + "grad_norm": 5.859435081481934, + "learning_rate": 8.277750606913842e-06, + "loss": 0.1969, + "step": 11054 + }, + { + "epoch": 0.27975301768858973, + "grad_norm": 8.760396957397461, + "learning_rate": 8.277447385930804e-06, + "loss": 0.1448, + "step": 11055 + }, + { + "epoch": 0.2797783232532834, + "grad_norm": 8.940452575683594, + "learning_rate": 8.27714414381212e-06, + "loss": 0.2972, + "step": 11056 + }, + { + "epoch": 0.27980362881797705, + "grad_norm": 3.453493595123291, + "learning_rate": 8.27684088055974e-06, + "loss": 0.1322, + "step": 11057 + }, + { + "epoch": 0.27982893438267076, + "grad_norm": 4.626375675201416, + "learning_rate": 8.276537596175623e-06, + "loss": 0.1778, + "step": 11058 + }, + { + "epoch": 0.2798542399473644, + "grad_norm": 7.304650783538818, + "learning_rate": 8.276234290661724e-06, + "loss": 0.2379, + "step": 11059 + }, + { + "epoch": 0.2798795455120581, + "grad_norm": 4.603764057159424, + "learning_rate": 8.275930964020002e-06, + "loss": 0.1894, + "step": 11060 + }, + { + "epoch": 0.2799048510767518, + "grad_norm": 7.889980792999268, + "learning_rate": 8.275627616252408e-06, + "loss": 0.2032, + "step": 11061 + }, + { + "epoch": 0.27993015664144544, + "grad_norm": 3.8519599437713623, + "learning_rate": 8.275324247360903e-06, + "loss": 0.1434, + "step": 11062 + }, + { + "epoch": 0.27995546220613915, + "grad_norm": 5.823288440704346, + "learning_rate": 8.27502085734744e-06, + "loss": 0.174, + "step": 11063 + }, + { + "epoch": 0.2799807677708328, + "grad_norm": 4.842787265777588, + "learning_rate": 8.274717446213977e-06, + "loss": 0.0881, + "step": 11064 + }, + { + "epoch": 0.28000607333552646, + "grad_norm": 11.71456527709961, + "learning_rate": 8.27441401396247e-06, + "loss": 0.2428, + "step": 11065 + }, + { + "epoch": 0.2800313789002202, + "grad_norm": 6.740334510803223, + "learning_rate": 8.274110560594877e-06, + "loss": 0.185, + "step": 11066 + }, + { + "epoch": 0.28005668446491383, + "grad_norm": 3.432466506958008, + "learning_rate": 8.273807086113152e-06, + "loss": 0.1478, + "step": 11067 + }, + { + "epoch": 0.2800819900296075, + "grad_norm": 5.595333576202393, + "learning_rate": 8.273503590519256e-06, + "loss": 0.1257, + "step": 11068 + }, + { + "epoch": 0.2801072955943012, + "grad_norm": 10.96634578704834, + "learning_rate": 8.273200073815144e-06, + "loss": 0.3037, + "step": 11069 + }, + { + "epoch": 0.28013260115899485, + "grad_norm": 9.49665641784668, + "learning_rate": 8.272896536002773e-06, + "loss": 0.2439, + "step": 11070 + }, + { + "epoch": 0.28015790672368857, + "grad_norm": 3.2998597621917725, + "learning_rate": 8.272592977084102e-06, + "loss": 0.1464, + "step": 11071 + }, + { + "epoch": 0.2801832122883822, + "grad_norm": 5.148952484130859, + "learning_rate": 8.272289397061085e-06, + "loss": 0.1505, + "step": 11072 + }, + { + "epoch": 0.2802085178530759, + "grad_norm": 4.759096622467041, + "learning_rate": 8.271985795935687e-06, + "loss": 0.1433, + "step": 11073 + }, + { + "epoch": 0.2802338234177696, + "grad_norm": 5.515710353851318, + "learning_rate": 8.271682173709857e-06, + "loss": 0.1932, + "step": 11074 + }, + { + "epoch": 0.28025912898246325, + "grad_norm": 3.7431347370147705, + "learning_rate": 8.27137853038556e-06, + "loss": 0.1361, + "step": 11075 + }, + { + "epoch": 0.2802844345471569, + "grad_norm": 5.8860392570495605, + "learning_rate": 8.27107486596475e-06, + "loss": 0.2071, + "step": 11076 + }, + { + "epoch": 0.2803097401118506, + "grad_norm": 4.131628036499023, + "learning_rate": 8.270771180449386e-06, + "loss": 0.1998, + "step": 11077 + }, + { + "epoch": 0.28033504567654427, + "grad_norm": 11.299020767211914, + "learning_rate": 8.270467473841428e-06, + "loss": 0.1929, + "step": 11078 + }, + { + "epoch": 0.2803603512412379, + "grad_norm": 6.65916109085083, + "learning_rate": 8.270163746142834e-06, + "loss": 0.25, + "step": 11079 + }, + { + "epoch": 0.28038565680593164, + "grad_norm": 8.003178596496582, + "learning_rate": 8.269859997355562e-06, + "loss": 0.2444, + "step": 11080 + }, + { + "epoch": 0.2804109623706253, + "grad_norm": 4.602385520935059, + "learning_rate": 8.26955622748157e-06, + "loss": 0.1076, + "step": 11081 + }, + { + "epoch": 0.28043626793531895, + "grad_norm": 7.487829685211182, + "learning_rate": 8.26925243652282e-06, + "loss": 0.1447, + "step": 11082 + }, + { + "epoch": 0.28046157350001266, + "grad_norm": 4.350066184997559, + "learning_rate": 8.268948624481267e-06, + "loss": 0.241, + "step": 11083 + }, + { + "epoch": 0.2804868790647063, + "grad_norm": 5.302410125732422, + "learning_rate": 8.268644791358875e-06, + "loss": 0.1967, + "step": 11084 + }, + { + "epoch": 0.28051218462940003, + "grad_norm": 9.63899040222168, + "learning_rate": 8.2683409371576e-06, + "loss": 0.161, + "step": 11085 + }, + { + "epoch": 0.2805374901940937, + "grad_norm": 3.4280755519866943, + "learning_rate": 8.2680370618794e-06, + "loss": 0.1634, + "step": 11086 + }, + { + "epoch": 0.28056279575878734, + "grad_norm": 13.039061546325684, + "learning_rate": 8.267733165526239e-06, + "loss": 0.2288, + "step": 11087 + }, + { + "epoch": 0.28058810132348105, + "grad_norm": 8.791802406311035, + "learning_rate": 8.267429248100074e-06, + "loss": 0.2613, + "step": 11088 + }, + { + "epoch": 0.2806134068881747, + "grad_norm": 7.973074436187744, + "learning_rate": 8.267125309602866e-06, + "loss": 0.264, + "step": 11089 + }, + { + "epoch": 0.28063871245286837, + "grad_norm": 6.670782566070557, + "learning_rate": 8.266821350036575e-06, + "loss": 0.3003, + "step": 11090 + }, + { + "epoch": 0.2806640180175621, + "grad_norm": 4.159941673278809, + "learning_rate": 8.26651736940316e-06, + "loss": 0.1979, + "step": 11091 + }, + { + "epoch": 0.28068932358225573, + "grad_norm": 4.780151844024658, + "learning_rate": 8.266213367704581e-06, + "loss": 0.1997, + "step": 11092 + }, + { + "epoch": 0.2807146291469494, + "grad_norm": 13.220701217651367, + "learning_rate": 8.265909344942802e-06, + "loss": 0.3166, + "step": 11093 + }, + { + "epoch": 0.2807399347116431, + "grad_norm": 4.047801971435547, + "learning_rate": 8.265605301119779e-06, + "loss": 0.1786, + "step": 11094 + }, + { + "epoch": 0.28076524027633676, + "grad_norm": 4.657801151275635, + "learning_rate": 8.265301236237477e-06, + "loss": 0.1836, + "step": 11095 + }, + { + "epoch": 0.28079054584103047, + "grad_norm": 3.8421573638916016, + "learning_rate": 8.264997150297853e-06, + "loss": 0.1191, + "step": 11096 + }, + { + "epoch": 0.2808158514057241, + "grad_norm": 6.824842929840088, + "learning_rate": 8.264693043302869e-06, + "loss": 0.2591, + "step": 11097 + }, + { + "epoch": 0.2808411569704178, + "grad_norm": 4.844263553619385, + "learning_rate": 8.264388915254489e-06, + "loss": 0.2309, + "step": 11098 + }, + { + "epoch": 0.2808664625351115, + "grad_norm": 4.609888553619385, + "learning_rate": 8.264084766154671e-06, + "loss": 0.1233, + "step": 11099 + }, + { + "epoch": 0.28089176809980515, + "grad_norm": 3.915151834487915, + "learning_rate": 8.263780596005378e-06, + "loss": 0.1557, + "step": 11100 + }, + { + "epoch": 0.2809170736644988, + "grad_norm": 6.913577556610107, + "learning_rate": 8.263476404808571e-06, + "loss": 0.2941, + "step": 11101 + }, + { + "epoch": 0.2809423792291925, + "grad_norm": 10.358964920043945, + "learning_rate": 8.263172192566211e-06, + "loss": 0.1469, + "step": 11102 + }, + { + "epoch": 0.2809676847938862, + "grad_norm": 5.504085540771484, + "learning_rate": 8.26286795928026e-06, + "loss": 0.208, + "step": 11103 + }, + { + "epoch": 0.28099299035857983, + "grad_norm": 16.717124938964844, + "learning_rate": 8.262563704952682e-06, + "loss": 0.2476, + "step": 11104 + }, + { + "epoch": 0.28101829592327354, + "grad_norm": 3.485687017440796, + "learning_rate": 8.262259429585438e-06, + "loss": 0.1735, + "step": 11105 + }, + { + "epoch": 0.2810436014879672, + "grad_norm": 4.557295799255371, + "learning_rate": 8.261955133180488e-06, + "loss": 0.1884, + "step": 11106 + }, + { + "epoch": 0.28106890705266085, + "grad_norm": 14.484380722045898, + "learning_rate": 8.261650815739797e-06, + "loss": 0.2306, + "step": 11107 + }, + { + "epoch": 0.28109421261735457, + "grad_norm": 10.059917449951172, + "learning_rate": 8.261346477265326e-06, + "loss": 0.2126, + "step": 11108 + }, + { + "epoch": 0.2811195181820482, + "grad_norm": 4.61664342880249, + "learning_rate": 8.26104211775904e-06, + "loss": 0.0761, + "step": 11109 + }, + { + "epoch": 0.28114482374674193, + "grad_norm": 3.9117953777313232, + "learning_rate": 8.260737737222896e-06, + "loss": 0.1848, + "step": 11110 + }, + { + "epoch": 0.2811701293114356, + "grad_norm": 4.580993175506592, + "learning_rate": 8.260433335658865e-06, + "loss": 0.1561, + "step": 11111 + }, + { + "epoch": 0.28119543487612925, + "grad_norm": 5.247560024261475, + "learning_rate": 8.260128913068904e-06, + "loss": 0.1303, + "step": 11112 + }, + { + "epoch": 0.28122074044082296, + "grad_norm": 6.564403057098389, + "learning_rate": 8.25982446945498e-06, + "loss": 0.1471, + "step": 11113 + }, + { + "epoch": 0.2812460460055166, + "grad_norm": 4.438492298126221, + "learning_rate": 8.25952000481905e-06, + "loss": 0.1663, + "step": 11114 + }, + { + "epoch": 0.28127135157021027, + "grad_norm": 11.985430717468262, + "learning_rate": 8.259215519163087e-06, + "loss": 0.2514, + "step": 11115 + }, + { + "epoch": 0.281296657134904, + "grad_norm": 5.958883762359619, + "learning_rate": 8.258911012489047e-06, + "loss": 0.2194, + "step": 11116 + }, + { + "epoch": 0.28132196269959764, + "grad_norm": 6.674321174621582, + "learning_rate": 8.258606484798896e-06, + "loss": 0.2088, + "step": 11117 + }, + { + "epoch": 0.2813472682642913, + "grad_norm": 9.629273414611816, + "learning_rate": 8.258301936094597e-06, + "loss": 0.2132, + "step": 11118 + }, + { + "epoch": 0.281372573828985, + "grad_norm": 5.330239295959473, + "learning_rate": 8.257997366378116e-06, + "loss": 0.1652, + "step": 11119 + }, + { + "epoch": 0.28139787939367866, + "grad_norm": 7.1992034912109375, + "learning_rate": 8.257692775651416e-06, + "loss": 0.1799, + "step": 11120 + }, + { + "epoch": 0.2814231849583723, + "grad_norm": 5.467199802398682, + "learning_rate": 8.257388163916463e-06, + "loss": 0.1631, + "step": 11121 + }, + { + "epoch": 0.28144849052306603, + "grad_norm": 8.693733215332031, + "learning_rate": 8.257083531175218e-06, + "loss": 0.2601, + "step": 11122 + }, + { + "epoch": 0.2814737960877597, + "grad_norm": 9.184314727783203, + "learning_rate": 8.256778877429645e-06, + "loss": 0.1795, + "step": 11123 + }, + { + "epoch": 0.2814991016524534, + "grad_norm": 12.870973587036133, + "learning_rate": 8.256474202681715e-06, + "loss": 0.308, + "step": 11124 + }, + { + "epoch": 0.28152440721714705, + "grad_norm": 3.4056947231292725, + "learning_rate": 8.256169506933385e-06, + "loss": 0.1232, + "step": 11125 + }, + { + "epoch": 0.2815497127818407, + "grad_norm": 4.8347320556640625, + "learning_rate": 8.255864790186625e-06, + "loss": 0.2061, + "step": 11126 + }, + { + "epoch": 0.2815750183465344, + "grad_norm": 4.0841827392578125, + "learning_rate": 8.2555600524434e-06, + "loss": 0.1574, + "step": 11127 + }, + { + "epoch": 0.2816003239112281, + "grad_norm": 3.95766282081604, + "learning_rate": 8.255255293705672e-06, + "loss": 0.1419, + "step": 11128 + }, + { + "epoch": 0.28162562947592173, + "grad_norm": 6.135948657989502, + "learning_rate": 8.254950513975408e-06, + "loss": 0.2285, + "step": 11129 + }, + { + "epoch": 0.28165093504061545, + "grad_norm": 6.473555088043213, + "learning_rate": 8.254645713254575e-06, + "loss": 0.1815, + "step": 11130 + }, + { + "epoch": 0.2816762406053091, + "grad_norm": 7.013535022735596, + "learning_rate": 8.254340891545136e-06, + "loss": 0.2014, + "step": 11131 + }, + { + "epoch": 0.28170154617000276, + "grad_norm": 2.8655834197998047, + "learning_rate": 8.254036048849059e-06, + "loss": 0.0884, + "step": 11132 + }, + { + "epoch": 0.28172685173469647, + "grad_norm": 2.914095401763916, + "learning_rate": 8.253731185168307e-06, + "loss": 0.1111, + "step": 11133 + }, + { + "epoch": 0.2817521572993901, + "grad_norm": 4.004434585571289, + "learning_rate": 8.25342630050485e-06, + "loss": 0.1269, + "step": 11134 + }, + { + "epoch": 0.28177746286408384, + "grad_norm": 5.724282741546631, + "learning_rate": 8.25312139486065e-06, + "loss": 0.1573, + "step": 11135 + }, + { + "epoch": 0.2818027684287775, + "grad_norm": 8.933989524841309, + "learning_rate": 8.252816468237676e-06, + "loss": 0.1831, + "step": 11136 + }, + { + "epoch": 0.28182807399347115, + "grad_norm": 3.0924301147460938, + "learning_rate": 8.252511520637896e-06, + "loss": 0.2028, + "step": 11137 + }, + { + "epoch": 0.28185337955816486, + "grad_norm": 6.062920093536377, + "learning_rate": 8.252206552063271e-06, + "loss": 0.2077, + "step": 11138 + }, + { + "epoch": 0.2818786851228585, + "grad_norm": 4.200484275817871, + "learning_rate": 8.251901562515772e-06, + "loss": 0.1567, + "step": 11139 + }, + { + "epoch": 0.2819039906875522, + "grad_norm": 2.616887331008911, + "learning_rate": 8.251596551997365e-06, + "loss": 0.1164, + "step": 11140 + }, + { + "epoch": 0.2819292962522459, + "grad_norm": 9.028138160705566, + "learning_rate": 8.251291520510017e-06, + "loss": 0.2621, + "step": 11141 + }, + { + "epoch": 0.28195460181693954, + "grad_norm": 10.962625503540039, + "learning_rate": 8.250986468055693e-06, + "loss": 0.2259, + "step": 11142 + }, + { + "epoch": 0.2819799073816332, + "grad_norm": 8.86406421661377, + "learning_rate": 8.250681394636364e-06, + "loss": 0.2571, + "step": 11143 + }, + { + "epoch": 0.2820052129463269, + "grad_norm": 7.2113356590271, + "learning_rate": 8.250376300253995e-06, + "loss": 0.2447, + "step": 11144 + }, + { + "epoch": 0.28203051851102057, + "grad_norm": 6.889649391174316, + "learning_rate": 8.250071184910553e-06, + "loss": 0.2287, + "step": 11145 + }, + { + "epoch": 0.2820558240757142, + "grad_norm": 7.964534759521484, + "learning_rate": 8.249766048608006e-06, + "loss": 0.3186, + "step": 11146 + }, + { + "epoch": 0.28208112964040793, + "grad_norm": 5.8886895179748535, + "learning_rate": 8.249460891348324e-06, + "loss": 0.1204, + "step": 11147 + }, + { + "epoch": 0.2821064352051016, + "grad_norm": 4.444962024688721, + "learning_rate": 8.249155713133472e-06, + "loss": 0.1453, + "step": 11148 + }, + { + "epoch": 0.2821317407697953, + "grad_norm": 5.248854160308838, + "learning_rate": 8.24885051396542e-06, + "loss": 0.2222, + "step": 11149 + }, + { + "epoch": 0.28215704633448896, + "grad_norm": 20.832992553710938, + "learning_rate": 8.248545293846135e-06, + "loss": 0.1666, + "step": 11150 + }, + { + "epoch": 0.2821823518991826, + "grad_norm": 3.9713034629821777, + "learning_rate": 8.248240052777586e-06, + "loss": 0.1768, + "step": 11151 + }, + { + "epoch": 0.2822076574638763, + "grad_norm": 6.237461566925049, + "learning_rate": 8.24793479076174e-06, + "loss": 0.1937, + "step": 11152 + }, + { + "epoch": 0.28223296302857, + "grad_norm": 7.37210750579834, + "learning_rate": 8.247629507800568e-06, + "loss": 0.2357, + "step": 11153 + }, + { + "epoch": 0.28225826859326364, + "grad_norm": 7.120695114135742, + "learning_rate": 8.247324203896037e-06, + "loss": 0.1549, + "step": 11154 + }, + { + "epoch": 0.28228357415795735, + "grad_norm": 8.061257362365723, + "learning_rate": 8.247018879050115e-06, + "loss": 0.1897, + "step": 11155 + }, + { + "epoch": 0.282308879722651, + "grad_norm": 5.150970935821533, + "learning_rate": 8.246713533264773e-06, + "loss": 0.1687, + "step": 11156 + }, + { + "epoch": 0.28233418528734466, + "grad_norm": 5.599131107330322, + "learning_rate": 8.24640816654198e-06, + "loss": 0.1517, + "step": 11157 + }, + { + "epoch": 0.2823594908520384, + "grad_norm": 13.176787376403809, + "learning_rate": 8.246102778883705e-06, + "loss": 0.1977, + "step": 11158 + }, + { + "epoch": 0.28238479641673203, + "grad_norm": 4.331790924072266, + "learning_rate": 8.245797370291915e-06, + "loss": 0.133, + "step": 11159 + }, + { + "epoch": 0.28241010198142574, + "grad_norm": 12.774518966674805, + "learning_rate": 8.245491940768584e-06, + "loss": 0.217, + "step": 11160 + }, + { + "epoch": 0.2824354075461194, + "grad_norm": 4.430160045623779, + "learning_rate": 8.245186490315678e-06, + "loss": 0.176, + "step": 11161 + }, + { + "epoch": 0.28246071311081306, + "grad_norm": 3.6802666187286377, + "learning_rate": 8.244881018935167e-06, + "loss": 0.2166, + "step": 11162 + }, + { + "epoch": 0.28248601867550677, + "grad_norm": 4.123634338378906, + "learning_rate": 8.244575526629024e-06, + "loss": 0.2571, + "step": 11163 + }, + { + "epoch": 0.2825113242402004, + "grad_norm": 6.0652594566345215, + "learning_rate": 8.244270013399215e-06, + "loss": 0.1934, + "step": 11164 + }, + { + "epoch": 0.2825366298048941, + "grad_norm": 8.240222930908203, + "learning_rate": 8.243964479247712e-06, + "loss": 0.3224, + "step": 11165 + }, + { + "epoch": 0.2825619353695878, + "grad_norm": 3.6353752613067627, + "learning_rate": 8.243658924176487e-06, + "loss": 0.1498, + "step": 11166 + }, + { + "epoch": 0.28258724093428145, + "grad_norm": 9.698081970214844, + "learning_rate": 8.243353348187509e-06, + "loss": 0.2358, + "step": 11167 + }, + { + "epoch": 0.2826125464989751, + "grad_norm": 5.964770317077637, + "learning_rate": 8.243047751282748e-06, + "loss": 0.1854, + "step": 11168 + }, + { + "epoch": 0.2826378520636688, + "grad_norm": 8.230062484741211, + "learning_rate": 8.242742133464174e-06, + "loss": 0.174, + "step": 11169 + }, + { + "epoch": 0.28266315762836247, + "grad_norm": 3.5026116371154785, + "learning_rate": 8.242436494733761e-06, + "loss": 0.1765, + "step": 11170 + }, + { + "epoch": 0.2826884631930561, + "grad_norm": 11.762604713439941, + "learning_rate": 8.242130835093478e-06, + "loss": 0.228, + "step": 11171 + }, + { + "epoch": 0.28271376875774984, + "grad_norm": 5.304136276245117, + "learning_rate": 8.241825154545296e-06, + "loss": 0.2447, + "step": 11172 + }, + { + "epoch": 0.2827390743224435, + "grad_norm": 11.370168685913086, + "learning_rate": 8.241519453091187e-06, + "loss": 0.2079, + "step": 11173 + }, + { + "epoch": 0.2827643798871372, + "grad_norm": 15.9974946975708, + "learning_rate": 8.241213730733122e-06, + "loss": 0.2827, + "step": 11174 + }, + { + "epoch": 0.28278968545183086, + "grad_norm": 7.099074840545654, + "learning_rate": 8.240907987473072e-06, + "loss": 0.2029, + "step": 11175 + }, + { + "epoch": 0.2828149910165245, + "grad_norm": 4.8544602394104, + "learning_rate": 8.240602223313007e-06, + "loss": 0.2113, + "step": 11176 + }, + { + "epoch": 0.28284029658121823, + "grad_norm": 4.623033046722412, + "learning_rate": 8.240296438254905e-06, + "loss": 0.2677, + "step": 11177 + }, + { + "epoch": 0.2828656021459119, + "grad_norm": 4.316504001617432, + "learning_rate": 8.239990632300732e-06, + "loss": 0.2332, + "step": 11178 + }, + { + "epoch": 0.28289090771060554, + "grad_norm": 8.265233039855957, + "learning_rate": 8.239684805452464e-06, + "loss": 0.2113, + "step": 11179 + }, + { + "epoch": 0.28291621327529926, + "grad_norm": 4.929754734039307, + "learning_rate": 8.23937895771207e-06, + "loss": 0.1985, + "step": 11180 + }, + { + "epoch": 0.2829415188399929, + "grad_norm": 6.615025997161865, + "learning_rate": 8.239073089081523e-06, + "loss": 0.2144, + "step": 11181 + }, + { + "epoch": 0.28296682440468657, + "grad_norm": 7.1105265617370605, + "learning_rate": 8.238767199562797e-06, + "loss": 0.2564, + "step": 11182 + }, + { + "epoch": 0.2829921299693803, + "grad_norm": 8.513827323913574, + "learning_rate": 8.238461289157864e-06, + "loss": 0.2164, + "step": 11183 + }, + { + "epoch": 0.28301743553407394, + "grad_norm": 6.281893253326416, + "learning_rate": 8.238155357868695e-06, + "loss": 0.1111, + "step": 11184 + }, + { + "epoch": 0.2830427410987676, + "grad_norm": 3.0903987884521484, + "learning_rate": 8.237849405697267e-06, + "loss": 0.1898, + "step": 11185 + }, + { + "epoch": 0.2830680466634613, + "grad_norm": 5.312710285186768, + "learning_rate": 8.237543432645547e-06, + "loss": 0.2228, + "step": 11186 + }, + { + "epoch": 0.28309335222815496, + "grad_norm": 8.080476760864258, + "learning_rate": 8.237237438715513e-06, + "loss": 0.2129, + "step": 11187 + }, + { + "epoch": 0.28311865779284867, + "grad_norm": 4.182003498077393, + "learning_rate": 8.23693142390914e-06, + "loss": 0.152, + "step": 11188 + }, + { + "epoch": 0.2831439633575423, + "grad_norm": 10.295482635498047, + "learning_rate": 8.236625388228393e-06, + "loss": 0.3145, + "step": 11189 + }, + { + "epoch": 0.283169268922236, + "grad_norm": 3.7050178050994873, + "learning_rate": 8.236319331675256e-06, + "loss": 0.1534, + "step": 11190 + }, + { + "epoch": 0.2831945744869297, + "grad_norm": 4.878856658935547, + "learning_rate": 8.236013254251695e-06, + "loss": 0.1078, + "step": 11191 + }, + { + "epoch": 0.28321988005162335, + "grad_norm": 5.101975917816162, + "learning_rate": 8.235707155959686e-06, + "loss": 0.2243, + "step": 11192 + }, + { + "epoch": 0.283245185616317, + "grad_norm": 3.1325066089630127, + "learning_rate": 8.235401036801205e-06, + "loss": 0.178, + "step": 11193 + }, + { + "epoch": 0.2832704911810107, + "grad_norm": 4.9011454582214355, + "learning_rate": 8.235094896778224e-06, + "loss": 0.1202, + "step": 11194 + }, + { + "epoch": 0.2832957967457044, + "grad_norm": 6.961948871612549, + "learning_rate": 8.234788735892717e-06, + "loss": 0.3068, + "step": 11195 + }, + { + "epoch": 0.28332110231039803, + "grad_norm": 3.197080135345459, + "learning_rate": 8.234482554146662e-06, + "loss": 0.1392, + "step": 11196 + }, + { + "epoch": 0.28334640787509174, + "grad_norm": 3.688319206237793, + "learning_rate": 8.234176351542027e-06, + "loss": 0.1861, + "step": 11197 + }, + { + "epoch": 0.2833717134397854, + "grad_norm": 3.795448064804077, + "learning_rate": 8.233870128080793e-06, + "loss": 0.1629, + "step": 11198 + }, + { + "epoch": 0.2833970190044791, + "grad_norm": 4.090415954589844, + "learning_rate": 8.23356388376493e-06, + "loss": 0.1495, + "step": 11199 + }, + { + "epoch": 0.28342232456917277, + "grad_norm": 6.564752578735352, + "learning_rate": 8.233257618596415e-06, + "loss": 0.2384, + "step": 11200 + }, + { + "epoch": 0.2834476301338664, + "grad_norm": 14.472301483154297, + "learning_rate": 8.232951332577226e-06, + "loss": 0.4296, + "step": 11201 + }, + { + "epoch": 0.28347293569856014, + "grad_norm": 6.62538480758667, + "learning_rate": 8.232645025709333e-06, + "loss": 0.2273, + "step": 11202 + }, + { + "epoch": 0.2834982412632538, + "grad_norm": 5.218963146209717, + "learning_rate": 8.232338697994713e-06, + "loss": 0.1646, + "step": 11203 + }, + { + "epoch": 0.28352354682794745, + "grad_norm": 3.99359393119812, + "learning_rate": 8.232032349435344e-06, + "loss": 0.1796, + "step": 11204 + }, + { + "epoch": 0.28354885239264116, + "grad_norm": 20.5157470703125, + "learning_rate": 8.231725980033198e-06, + "loss": 0.2453, + "step": 11205 + }, + { + "epoch": 0.2835741579573348, + "grad_norm": 4.631371974945068, + "learning_rate": 8.231419589790253e-06, + "loss": 0.2064, + "step": 11206 + }, + { + "epoch": 0.28359946352202847, + "grad_norm": 9.513243675231934, + "learning_rate": 8.231113178708484e-06, + "loss": 0.1568, + "step": 11207 + }, + { + "epoch": 0.2836247690867222, + "grad_norm": 2.9802286624908447, + "learning_rate": 8.230806746789865e-06, + "loss": 0.1096, + "step": 11208 + }, + { + "epoch": 0.28365007465141584, + "grad_norm": 4.161352157592773, + "learning_rate": 8.230500294036376e-06, + "loss": 0.1665, + "step": 11209 + }, + { + "epoch": 0.2836753802161095, + "grad_norm": 7.341386795043945, + "learning_rate": 8.230193820449993e-06, + "loss": 0.1466, + "step": 11210 + }, + { + "epoch": 0.2837006857808032, + "grad_norm": 5.7411346435546875, + "learning_rate": 8.229887326032688e-06, + "loss": 0.2509, + "step": 11211 + }, + { + "epoch": 0.28372599134549686, + "grad_norm": 10.108762741088867, + "learning_rate": 8.229580810786444e-06, + "loss": 0.2479, + "step": 11212 + }, + { + "epoch": 0.2837512969101906, + "grad_norm": 6.668600082397461, + "learning_rate": 8.229274274713232e-06, + "loss": 0.2311, + "step": 11213 + }, + { + "epoch": 0.28377660247488423, + "grad_norm": 4.904072284698486, + "learning_rate": 8.22896771781503e-06, + "loss": 0.257, + "step": 11214 + }, + { + "epoch": 0.2838019080395779, + "grad_norm": 3.606811046600342, + "learning_rate": 8.228661140093816e-06, + "loss": 0.1456, + "step": 11215 + }, + { + "epoch": 0.2838272136042716, + "grad_norm": 2.8136565685272217, + "learning_rate": 8.22835454155157e-06, + "loss": 0.172, + "step": 11216 + }, + { + "epoch": 0.28385251916896526, + "grad_norm": 8.770195960998535, + "learning_rate": 8.228047922190264e-06, + "loss": 0.1551, + "step": 11217 + }, + { + "epoch": 0.2838778247336589, + "grad_norm": 5.009883880615234, + "learning_rate": 8.227741282011876e-06, + "loss": 0.173, + "step": 11218 + }, + { + "epoch": 0.2839031302983526, + "grad_norm": 5.610533714294434, + "learning_rate": 8.227434621018387e-06, + "loss": 0.2716, + "step": 11219 + }, + { + "epoch": 0.2839284358630463, + "grad_norm": 3.5669171810150146, + "learning_rate": 8.22712793921177e-06, + "loss": 0.1104, + "step": 11220 + }, + { + "epoch": 0.28395374142773994, + "grad_norm": 7.686405658721924, + "learning_rate": 8.22682123659401e-06, + "loss": 0.2281, + "step": 11221 + }, + { + "epoch": 0.28397904699243365, + "grad_norm": 5.165277481079102, + "learning_rate": 8.226514513167074e-06, + "loss": 0.1304, + "step": 11222 + }, + { + "epoch": 0.2840043525571273, + "grad_norm": 3.644408702850342, + "learning_rate": 8.22620776893295e-06, + "loss": 0.1388, + "step": 11223 + }, + { + "epoch": 0.284029658121821, + "grad_norm": 5.787392616271973, + "learning_rate": 8.22590100389361e-06, + "loss": 0.1788, + "step": 11224 + }, + { + "epoch": 0.28405496368651467, + "grad_norm": 4.304913520812988, + "learning_rate": 8.225594218051038e-06, + "loss": 0.2128, + "step": 11225 + }, + { + "epoch": 0.28408026925120833, + "grad_norm": 9.176848411560059, + "learning_rate": 8.225287411407206e-06, + "loss": 0.3898, + "step": 11226 + }, + { + "epoch": 0.28410557481590204, + "grad_norm": 3.700321912765503, + "learning_rate": 8.224980583964096e-06, + "loss": 0.1624, + "step": 11227 + }, + { + "epoch": 0.2841308803805957, + "grad_norm": 3.8292057514190674, + "learning_rate": 8.224673735723686e-06, + "loss": 0.1689, + "step": 11228 + }, + { + "epoch": 0.28415618594528935, + "grad_norm": 6.480971813201904, + "learning_rate": 8.224366866687956e-06, + "loss": 0.1895, + "step": 11229 + }, + { + "epoch": 0.28418149150998306, + "grad_norm": 6.171059608459473, + "learning_rate": 8.224059976858882e-06, + "loss": 0.1117, + "step": 11230 + }, + { + "epoch": 0.2842067970746767, + "grad_norm": 4.910263538360596, + "learning_rate": 8.223753066238446e-06, + "loss": 0.1744, + "step": 11231 + }, + { + "epoch": 0.2842321026393704, + "grad_norm": 5.832354545593262, + "learning_rate": 8.223446134828625e-06, + "loss": 0.1828, + "step": 11232 + }, + { + "epoch": 0.2842574082040641, + "grad_norm": 5.910681247711182, + "learning_rate": 8.2231391826314e-06, + "loss": 0.1421, + "step": 11233 + }, + { + "epoch": 0.28428271376875774, + "grad_norm": 5.954283237457275, + "learning_rate": 8.222832209648752e-06, + "loss": 0.2381, + "step": 11234 + }, + { + "epoch": 0.2843080193334514, + "grad_norm": 5.070044994354248, + "learning_rate": 8.222525215882657e-06, + "loss": 0.1114, + "step": 11235 + }, + { + "epoch": 0.2843333248981451, + "grad_norm": 4.121721267700195, + "learning_rate": 8.222218201335096e-06, + "loss": 0.1894, + "step": 11236 + }, + { + "epoch": 0.28435863046283877, + "grad_norm": 7.734220027923584, + "learning_rate": 8.22191116600805e-06, + "loss": 0.1594, + "step": 11237 + }, + { + "epoch": 0.2843839360275325, + "grad_norm": 5.796983242034912, + "learning_rate": 8.221604109903498e-06, + "loss": 0.2, + "step": 11238 + }, + { + "epoch": 0.28440924159222614, + "grad_norm": 5.041720390319824, + "learning_rate": 8.221297033023418e-06, + "loss": 0.1517, + "step": 11239 + }, + { + "epoch": 0.2844345471569198, + "grad_norm": 3.131209373474121, + "learning_rate": 8.220989935369796e-06, + "loss": 0.1684, + "step": 11240 + }, + { + "epoch": 0.2844598527216135, + "grad_norm": 7.2218780517578125, + "learning_rate": 8.220682816944607e-06, + "loss": 0.1914, + "step": 11241 + }, + { + "epoch": 0.28448515828630716, + "grad_norm": 4.947218418121338, + "learning_rate": 8.220375677749835e-06, + "loss": 0.1437, + "step": 11242 + }, + { + "epoch": 0.2845104638510008, + "grad_norm": 3.969176769256592, + "learning_rate": 8.220068517787458e-06, + "loss": 0.1674, + "step": 11243 + }, + { + "epoch": 0.28453576941569453, + "grad_norm": 4.166779041290283, + "learning_rate": 8.21976133705946e-06, + "loss": 0.186, + "step": 11244 + }, + { + "epoch": 0.2845610749803882, + "grad_norm": 5.175046920776367, + "learning_rate": 8.219454135567819e-06, + "loss": 0.1558, + "step": 11245 + }, + { + "epoch": 0.28458638054508184, + "grad_norm": 6.283176422119141, + "learning_rate": 8.219146913314517e-06, + "loss": 0.199, + "step": 11246 + }, + { + "epoch": 0.28461168610977555, + "grad_norm": 8.729870796203613, + "learning_rate": 8.218839670301535e-06, + "loss": 0.2173, + "step": 11247 + }, + { + "epoch": 0.2846369916744692, + "grad_norm": 5.495711326599121, + "learning_rate": 8.218532406530854e-06, + "loss": 0.1528, + "step": 11248 + }, + { + "epoch": 0.28466229723916286, + "grad_norm": 9.5139741897583, + "learning_rate": 8.218225122004458e-06, + "loss": 0.3493, + "step": 11249 + }, + { + "epoch": 0.2846876028038566, + "grad_norm": 4.147099018096924, + "learning_rate": 8.217917816724327e-06, + "loss": 0.1372, + "step": 11250 + }, + { + "epoch": 0.28471290836855023, + "grad_norm": 16.812469482421875, + "learning_rate": 8.21761049069244e-06, + "loss": 0.1991, + "step": 11251 + }, + { + "epoch": 0.28473821393324394, + "grad_norm": 7.685302257537842, + "learning_rate": 8.217303143910784e-06, + "loss": 0.2663, + "step": 11252 + }, + { + "epoch": 0.2847635194979376, + "grad_norm": 4.651264667510986, + "learning_rate": 8.216995776381339e-06, + "loss": 0.1322, + "step": 11253 + }, + { + "epoch": 0.28478882506263126, + "grad_norm": 5.251461505889893, + "learning_rate": 8.216688388106084e-06, + "loss": 0.1499, + "step": 11254 + }, + { + "epoch": 0.28481413062732497, + "grad_norm": 5.8311920166015625, + "learning_rate": 8.216380979087005e-06, + "loss": 0.2579, + "step": 11255 + }, + { + "epoch": 0.2848394361920186, + "grad_norm": 6.891823768615723, + "learning_rate": 8.216073549326084e-06, + "loss": 0.1596, + "step": 11256 + }, + { + "epoch": 0.2848647417567123, + "grad_norm": 3.985670804977417, + "learning_rate": 8.215766098825301e-06, + "loss": 0.1772, + "step": 11257 + }, + { + "epoch": 0.284890047321406, + "grad_norm": 3.240744113922119, + "learning_rate": 8.215458627586643e-06, + "loss": 0.1331, + "step": 11258 + }, + { + "epoch": 0.28491535288609965, + "grad_norm": 5.003453254699707, + "learning_rate": 8.215151135612089e-06, + "loss": 0.1493, + "step": 11259 + }, + { + "epoch": 0.2849406584507933, + "grad_norm": 5.116835117340088, + "learning_rate": 8.214843622903625e-06, + "loss": 0.1523, + "step": 11260 + }, + { + "epoch": 0.284965964015487, + "grad_norm": 7.7622175216674805, + "learning_rate": 8.214536089463233e-06, + "loss": 0.2639, + "step": 11261 + }, + { + "epoch": 0.2849912695801807, + "grad_norm": 5.619132041931152, + "learning_rate": 8.214228535292891e-06, + "loss": 0.2619, + "step": 11262 + }, + { + "epoch": 0.2850165751448744, + "grad_norm": 6.04612922668457, + "learning_rate": 8.213920960394593e-06, + "loss": 0.2663, + "step": 11263 + }, + { + "epoch": 0.28504188070956804, + "grad_norm": 6.573853015899658, + "learning_rate": 8.213613364770311e-06, + "loss": 0.233, + "step": 11264 + }, + { + "epoch": 0.2850671862742617, + "grad_norm": 5.11031436920166, + "learning_rate": 8.213305748422039e-06, + "loss": 0.1771, + "step": 11265 + }, + { + "epoch": 0.2850924918389554, + "grad_norm": 3.809817314147949, + "learning_rate": 8.212998111351753e-06, + "loss": 0.1555, + "step": 11266 + }, + { + "epoch": 0.28511779740364906, + "grad_norm": 5.24392557144165, + "learning_rate": 8.212690453561442e-06, + "loss": 0.2022, + "step": 11267 + }, + { + "epoch": 0.2851431029683427, + "grad_norm": 3.509215831756592, + "learning_rate": 8.212382775053085e-06, + "loss": 0.1465, + "step": 11268 + }, + { + "epoch": 0.28516840853303643, + "grad_norm": 3.74226450920105, + "learning_rate": 8.212075075828672e-06, + "loss": 0.1523, + "step": 11269 + }, + { + "epoch": 0.2851937140977301, + "grad_norm": 3.236424684524536, + "learning_rate": 8.211767355890183e-06, + "loss": 0.0998, + "step": 11270 + }, + { + "epoch": 0.28521901966242375, + "grad_norm": 3.354083299636841, + "learning_rate": 8.211459615239603e-06, + "loss": 0.1258, + "step": 11271 + }, + { + "epoch": 0.28524432522711746, + "grad_norm": 5.030895233154297, + "learning_rate": 8.211151853878918e-06, + "loss": 0.1474, + "step": 11272 + }, + { + "epoch": 0.2852696307918111, + "grad_norm": 4.8688836097717285, + "learning_rate": 8.210844071810111e-06, + "loss": 0.2151, + "step": 11273 + }, + { + "epoch": 0.28529493635650477, + "grad_norm": 5.0279340744018555, + "learning_rate": 8.21053626903517e-06, + "loss": 0.1879, + "step": 11274 + }, + { + "epoch": 0.2853202419211985, + "grad_norm": 5.9573259353637695, + "learning_rate": 8.210228445556076e-06, + "loss": 0.1671, + "step": 11275 + }, + { + "epoch": 0.28534554748589214, + "grad_norm": 5.15043306350708, + "learning_rate": 8.209920601374816e-06, + "loss": 0.2238, + "step": 11276 + }, + { + "epoch": 0.28537085305058585, + "grad_norm": 5.2498602867126465, + "learning_rate": 8.209612736493377e-06, + "loss": 0.2179, + "step": 11277 + }, + { + "epoch": 0.2853961586152795, + "grad_norm": 7.686700820922852, + "learning_rate": 8.209304850913743e-06, + "loss": 0.1893, + "step": 11278 + }, + { + "epoch": 0.28542146417997316, + "grad_norm": 5.320690155029297, + "learning_rate": 8.208996944637895e-06, + "loss": 0.1786, + "step": 11279 + }, + { + "epoch": 0.2854467697446669, + "grad_norm": 5.2806477546691895, + "learning_rate": 8.208689017667826e-06, + "loss": 0.1798, + "step": 11280 + }, + { + "epoch": 0.28547207530936053, + "grad_norm": 3.80666446685791, + "learning_rate": 8.208381070005517e-06, + "loss": 0.1082, + "step": 11281 + }, + { + "epoch": 0.2854973808740542, + "grad_norm": 3.937439441680908, + "learning_rate": 8.208073101652956e-06, + "loss": 0.1703, + "step": 11282 + }, + { + "epoch": 0.2855226864387479, + "grad_norm": 13.719468116760254, + "learning_rate": 8.207765112612128e-06, + "loss": 0.2983, + "step": 11283 + }, + { + "epoch": 0.28554799200344155, + "grad_norm": 3.923118829727173, + "learning_rate": 8.207457102885018e-06, + "loss": 0.1576, + "step": 11284 + }, + { + "epoch": 0.2855732975681352, + "grad_norm": 10.013233184814453, + "learning_rate": 8.207149072473617e-06, + "loss": 0.2379, + "step": 11285 + }, + { + "epoch": 0.2855986031328289, + "grad_norm": 3.45090913772583, + "learning_rate": 8.206841021379904e-06, + "loss": 0.1012, + "step": 11286 + }, + { + "epoch": 0.2856239086975226, + "grad_norm": 8.421844482421875, + "learning_rate": 8.206532949605873e-06, + "loss": 0.265, + "step": 11287 + }, + { + "epoch": 0.2856492142622163, + "grad_norm": 6.442477703094482, + "learning_rate": 8.206224857153507e-06, + "loss": 0.1531, + "step": 11288 + }, + { + "epoch": 0.28567451982690995, + "grad_norm": 4.013957977294922, + "learning_rate": 8.205916744024792e-06, + "loss": 0.1529, + "step": 11289 + }, + { + "epoch": 0.2856998253916036, + "grad_norm": 3.953707218170166, + "learning_rate": 8.205608610221717e-06, + "loss": 0.1432, + "step": 11290 + }, + { + "epoch": 0.2857251309562973, + "grad_norm": 9.26586627960205, + "learning_rate": 8.20530045574627e-06, + "loss": 0.2058, + "step": 11291 + }, + { + "epoch": 0.28575043652099097, + "grad_norm": 4.933180332183838, + "learning_rate": 8.204992280600435e-06, + "loss": 0.1541, + "step": 11292 + }, + { + "epoch": 0.2857757420856846, + "grad_norm": 17.28968620300293, + "learning_rate": 8.204684084786201e-06, + "loss": 0.2317, + "step": 11293 + }, + { + "epoch": 0.28580104765037834, + "grad_norm": 6.829824447631836, + "learning_rate": 8.204375868305553e-06, + "loss": 0.1735, + "step": 11294 + }, + { + "epoch": 0.285826353215072, + "grad_norm": 7.451723575592041, + "learning_rate": 8.204067631160485e-06, + "loss": 0.1826, + "step": 11295 + }, + { + "epoch": 0.28585165877976565, + "grad_norm": 3.9177753925323486, + "learning_rate": 8.203759373352979e-06, + "loss": 0.1326, + "step": 11296 + }, + { + "epoch": 0.28587696434445936, + "grad_norm": 3.8719065189361572, + "learning_rate": 8.203451094885025e-06, + "loss": 0.1866, + "step": 11297 + }, + { + "epoch": 0.285902269909153, + "grad_norm": 5.782723903656006, + "learning_rate": 8.203142795758612e-06, + "loss": 0.1086, + "step": 11298 + }, + { + "epoch": 0.2859275754738467, + "grad_norm": 2.946188449859619, + "learning_rate": 8.202834475975725e-06, + "loss": 0.1784, + "step": 11299 + }, + { + "epoch": 0.2859528810385404, + "grad_norm": 6.742173671722412, + "learning_rate": 8.202526135538356e-06, + "loss": 0.148, + "step": 11300 + }, + { + "epoch": 0.28597818660323404, + "grad_norm": 3.872746467590332, + "learning_rate": 8.20221777444849e-06, + "loss": 0.1696, + "step": 11301 + }, + { + "epoch": 0.28600349216792775, + "grad_norm": 5.752420425415039, + "learning_rate": 8.201909392708118e-06, + "loss": 0.2205, + "step": 11302 + }, + { + "epoch": 0.2860287977326214, + "grad_norm": 3.8185250759124756, + "learning_rate": 8.201600990319227e-06, + "loss": 0.1592, + "step": 11303 + }, + { + "epoch": 0.28605410329731507, + "grad_norm": 4.719186305999756, + "learning_rate": 8.20129256728381e-06, + "loss": 0.1852, + "step": 11304 + }, + { + "epoch": 0.2860794088620088, + "grad_norm": 7.681201457977295, + "learning_rate": 8.20098412360385e-06, + "loss": 0.2299, + "step": 11305 + }, + { + "epoch": 0.28610471442670243, + "grad_norm": 6.338661193847656, + "learning_rate": 8.200675659281338e-06, + "loss": 0.2293, + "step": 11306 + }, + { + "epoch": 0.2861300199913961, + "grad_norm": 13.801689147949219, + "learning_rate": 8.200367174318266e-06, + "loss": 0.2092, + "step": 11307 + }, + { + "epoch": 0.2861553255560898, + "grad_norm": 12.911260604858398, + "learning_rate": 8.200058668716621e-06, + "loss": 0.2178, + "step": 11308 + }, + { + "epoch": 0.28618063112078346, + "grad_norm": 10.490174293518066, + "learning_rate": 8.199750142478393e-06, + "loss": 0.1464, + "step": 11309 + }, + { + "epoch": 0.2862059366854771, + "grad_norm": 8.446579933166504, + "learning_rate": 8.199441595605571e-06, + "loss": 0.1375, + "step": 11310 + }, + { + "epoch": 0.2862312422501708, + "grad_norm": 13.39366626739502, + "learning_rate": 8.199133028100145e-06, + "loss": 0.1837, + "step": 11311 + }, + { + "epoch": 0.2862565478148645, + "grad_norm": 4.814571380615234, + "learning_rate": 8.198824439964104e-06, + "loss": 0.2311, + "step": 11312 + }, + { + "epoch": 0.28628185337955814, + "grad_norm": 4.919283866882324, + "learning_rate": 8.198515831199442e-06, + "loss": 0.1504, + "step": 11313 + }, + { + "epoch": 0.28630715894425185, + "grad_norm": 5.394707202911377, + "learning_rate": 8.198207201808146e-06, + "loss": 0.2538, + "step": 11314 + }, + { + "epoch": 0.2863324645089455, + "grad_norm": 10.963178634643555, + "learning_rate": 8.197898551792205e-06, + "loss": 0.3802, + "step": 11315 + }, + { + "epoch": 0.2863577700736392, + "grad_norm": 5.3793745040893555, + "learning_rate": 8.197589881153612e-06, + "loss": 0.245, + "step": 11316 + }, + { + "epoch": 0.2863830756383329, + "grad_norm": 2.9854583740234375, + "learning_rate": 8.197281189894356e-06, + "loss": 0.0881, + "step": 11317 + }, + { + "epoch": 0.28640838120302653, + "grad_norm": 5.460810661315918, + "learning_rate": 8.19697247801643e-06, + "loss": 0.171, + "step": 11318 + }, + { + "epoch": 0.28643368676772024, + "grad_norm": 7.35358190536499, + "learning_rate": 8.196663745521821e-06, + "loss": 0.1966, + "step": 11319 + }, + { + "epoch": 0.2864589923324139, + "grad_norm": 4.158198833465576, + "learning_rate": 8.196354992412523e-06, + "loss": 0.1173, + "step": 11320 + }, + { + "epoch": 0.28648429789710755, + "grad_norm": 4.8177266120910645, + "learning_rate": 8.196046218690527e-06, + "loss": 0.217, + "step": 11321 + }, + { + "epoch": 0.28650960346180127, + "grad_norm": 8.747153282165527, + "learning_rate": 8.195737424357824e-06, + "loss": 0.2934, + "step": 11322 + }, + { + "epoch": 0.2865349090264949, + "grad_norm": 4.695549488067627, + "learning_rate": 8.195428609416403e-06, + "loss": 0.1716, + "step": 11323 + }, + { + "epoch": 0.2865602145911886, + "grad_norm": 5.115114688873291, + "learning_rate": 8.195119773868258e-06, + "loss": 0.2082, + "step": 11324 + }, + { + "epoch": 0.2865855201558823, + "grad_norm": 10.649513244628906, + "learning_rate": 8.194810917715379e-06, + "loss": 0.2412, + "step": 11325 + }, + { + "epoch": 0.28661082572057595, + "grad_norm": 5.6652421951293945, + "learning_rate": 8.19450204095976e-06, + "loss": 0.1944, + "step": 11326 + }, + { + "epoch": 0.28663613128526966, + "grad_norm": 4.252448558807373, + "learning_rate": 8.194193143603392e-06, + "loss": 0.1243, + "step": 11327 + }, + { + "epoch": 0.2866614368499633, + "grad_norm": 2.912275791168213, + "learning_rate": 8.193884225648265e-06, + "loss": 0.1709, + "step": 11328 + }, + { + "epoch": 0.28668674241465697, + "grad_norm": 10.098254203796387, + "learning_rate": 8.193575287096373e-06, + "loss": 0.2181, + "step": 11329 + }, + { + "epoch": 0.2867120479793507, + "grad_norm": 5.600988388061523, + "learning_rate": 8.193266327949709e-06, + "loss": 0.1679, + "step": 11330 + }, + { + "epoch": 0.28673735354404434, + "grad_norm": 8.195527076721191, + "learning_rate": 8.192957348210264e-06, + "loss": 0.2205, + "step": 11331 + }, + { + "epoch": 0.286762659108738, + "grad_norm": 6.121243953704834, + "learning_rate": 8.19264834788003e-06, + "loss": 0.1711, + "step": 11332 + }, + { + "epoch": 0.2867879646734317, + "grad_norm": 4.952574253082275, + "learning_rate": 8.192339326961004e-06, + "loss": 0.1814, + "step": 11333 + }, + { + "epoch": 0.28681327023812536, + "grad_norm": 6.021895885467529, + "learning_rate": 8.192030285455171e-06, + "loss": 0.1673, + "step": 11334 + }, + { + "epoch": 0.286838575802819, + "grad_norm": 3.979536771774292, + "learning_rate": 8.191721223364532e-06, + "loss": 0.0945, + "step": 11335 + }, + { + "epoch": 0.28686388136751273, + "grad_norm": 5.086989402770996, + "learning_rate": 8.191412140691075e-06, + "loss": 0.2084, + "step": 11336 + }, + { + "epoch": 0.2868891869322064, + "grad_norm": 3.443723440170288, + "learning_rate": 8.191103037436795e-06, + "loss": 0.1285, + "step": 11337 + }, + { + "epoch": 0.28691449249690004, + "grad_norm": 5.97582483291626, + "learning_rate": 8.190793913603684e-06, + "loss": 0.1983, + "step": 11338 + }, + { + "epoch": 0.28693979806159375, + "grad_norm": 6.665950775146484, + "learning_rate": 8.190484769193738e-06, + "loss": 0.1889, + "step": 11339 + }, + { + "epoch": 0.2869651036262874, + "grad_norm": 6.122060775756836, + "learning_rate": 8.190175604208949e-06, + "loss": 0.2574, + "step": 11340 + }, + { + "epoch": 0.2869904091909811, + "grad_norm": 3.706421375274658, + "learning_rate": 8.189866418651309e-06, + "loss": 0.1862, + "step": 11341 + }, + { + "epoch": 0.2870157147556748, + "grad_norm": 7.857560634613037, + "learning_rate": 8.189557212522816e-06, + "loss": 0.1949, + "step": 11342 + }, + { + "epoch": 0.28704102032036843, + "grad_norm": 13.872440338134766, + "learning_rate": 8.189247985825462e-06, + "loss": 0.2563, + "step": 11343 + }, + { + "epoch": 0.28706632588506215, + "grad_norm": 4.348609447479248, + "learning_rate": 8.18893873856124e-06, + "loss": 0.1688, + "step": 11344 + }, + { + "epoch": 0.2870916314497558, + "grad_norm": 4.990092754364014, + "learning_rate": 8.188629470732146e-06, + "loss": 0.2023, + "step": 11345 + }, + { + "epoch": 0.28711693701444946, + "grad_norm": 3.7192375659942627, + "learning_rate": 8.188320182340173e-06, + "loss": 0.1529, + "step": 11346 + }, + { + "epoch": 0.28714224257914317, + "grad_norm": 6.271886348724365, + "learning_rate": 8.188010873387316e-06, + "loss": 0.1897, + "step": 11347 + }, + { + "epoch": 0.2871675481438368, + "grad_norm": 12.403935432434082, + "learning_rate": 8.18770154387557e-06, + "loss": 0.19, + "step": 11348 + }, + { + "epoch": 0.2871928537085305, + "grad_norm": 5.584383964538574, + "learning_rate": 8.18739219380693e-06, + "loss": 0.08, + "step": 11349 + }, + { + "epoch": 0.2872181592732242, + "grad_norm": 5.159824848175049, + "learning_rate": 8.187082823183392e-06, + "loss": 0.2054, + "step": 11350 + }, + { + "epoch": 0.28724346483791785, + "grad_norm": 4.280668258666992, + "learning_rate": 8.18677343200695e-06, + "loss": 0.1695, + "step": 11351 + }, + { + "epoch": 0.28726877040261156, + "grad_norm": 3.5635321140289307, + "learning_rate": 8.186464020279597e-06, + "loss": 0.1499, + "step": 11352 + }, + { + "epoch": 0.2872940759673052, + "grad_norm": 4.5677595138549805, + "learning_rate": 8.18615458800333e-06, + "loss": 0.174, + "step": 11353 + }, + { + "epoch": 0.2873193815319989, + "grad_norm": 5.243188381195068, + "learning_rate": 8.185845135180145e-06, + "loss": 0.1596, + "step": 11354 + }, + { + "epoch": 0.2873446870966926, + "grad_norm": 3.8880362510681152, + "learning_rate": 8.185535661812038e-06, + "loss": 0.1509, + "step": 11355 + }, + { + "epoch": 0.28736999266138624, + "grad_norm": 10.285181045532227, + "learning_rate": 8.185226167901004e-06, + "loss": 0.3191, + "step": 11356 + }, + { + "epoch": 0.2873952982260799, + "grad_norm": 3.006600856781006, + "learning_rate": 8.184916653449038e-06, + "loss": 0.1467, + "step": 11357 + }, + { + "epoch": 0.2874206037907736, + "grad_norm": 4.278427600860596, + "learning_rate": 8.184607118458137e-06, + "loss": 0.2093, + "step": 11358 + }, + { + "epoch": 0.28744590935546727, + "grad_norm": 4.097467422485352, + "learning_rate": 8.184297562930298e-06, + "loss": 0.1862, + "step": 11359 + }, + { + "epoch": 0.2874712149201609, + "grad_norm": 4.362137794494629, + "learning_rate": 8.183987986867517e-06, + "loss": 0.1944, + "step": 11360 + }, + { + "epoch": 0.28749652048485463, + "grad_norm": 23.629308700561523, + "learning_rate": 8.183678390271788e-06, + "loss": 0.2137, + "step": 11361 + }, + { + "epoch": 0.2875218260495483, + "grad_norm": 7.866022109985352, + "learning_rate": 8.18336877314511e-06, + "loss": 0.2273, + "step": 11362 + }, + { + "epoch": 0.28754713161424195, + "grad_norm": 2.966339111328125, + "learning_rate": 8.18305913548948e-06, + "loss": 0.1218, + "step": 11363 + }, + { + "epoch": 0.28757243717893566, + "grad_norm": 4.291317462921143, + "learning_rate": 8.182749477306892e-06, + "loss": 0.2061, + "step": 11364 + }, + { + "epoch": 0.2875977427436293, + "grad_norm": 3.8304309844970703, + "learning_rate": 8.182439798599346e-06, + "loss": 0.2191, + "step": 11365 + }, + { + "epoch": 0.287623048308323, + "grad_norm": 4.188916206359863, + "learning_rate": 8.182130099368836e-06, + "loss": 0.2277, + "step": 11366 + }, + { + "epoch": 0.2876483538730167, + "grad_norm": 10.75063705444336, + "learning_rate": 8.181820379617363e-06, + "loss": 0.3658, + "step": 11367 + }, + { + "epoch": 0.28767365943771034, + "grad_norm": 9.921475410461426, + "learning_rate": 8.181510639346921e-06, + "loss": 0.1855, + "step": 11368 + }, + { + "epoch": 0.28769896500240405, + "grad_norm": 5.223006248474121, + "learning_rate": 8.181200878559509e-06, + "loss": 0.1741, + "step": 11369 + }, + { + "epoch": 0.2877242705670977, + "grad_norm": 6.076648235321045, + "learning_rate": 8.180891097257125e-06, + "loss": 0.2075, + "step": 11370 + }, + { + "epoch": 0.28774957613179136, + "grad_norm": 7.0586347579956055, + "learning_rate": 8.180581295441765e-06, + "loss": 0.2156, + "step": 11371 + }, + { + "epoch": 0.2877748816964851, + "grad_norm": 6.372061252593994, + "learning_rate": 8.180271473115426e-06, + "loss": 0.1781, + "step": 11372 + }, + { + "epoch": 0.28780018726117873, + "grad_norm": 10.175994873046875, + "learning_rate": 8.17996163028011e-06, + "loss": 0.2251, + "step": 11373 + }, + { + "epoch": 0.2878254928258724, + "grad_norm": 6.832915306091309, + "learning_rate": 8.179651766937813e-06, + "loss": 0.1804, + "step": 11374 + }, + { + "epoch": 0.2878507983905661, + "grad_norm": 8.732851028442383, + "learning_rate": 8.179341883090535e-06, + "loss": 0.1934, + "step": 11375 + }, + { + "epoch": 0.28787610395525975, + "grad_norm": 4.088726043701172, + "learning_rate": 8.179031978740271e-06, + "loss": 0.1681, + "step": 11376 + }, + { + "epoch": 0.2879014095199534, + "grad_norm": 9.180573463439941, + "learning_rate": 8.17872205388902e-06, + "loss": 0.2489, + "step": 11377 + }, + { + "epoch": 0.2879267150846471, + "grad_norm": 3.285862922668457, + "learning_rate": 8.178412108538782e-06, + "loss": 0.0984, + "step": 11378 + }, + { + "epoch": 0.2879520206493408, + "grad_norm": 4.105904579162598, + "learning_rate": 8.178102142691558e-06, + "loss": 0.1703, + "step": 11379 + }, + { + "epoch": 0.2879773262140345, + "grad_norm": 6.776450157165527, + "learning_rate": 8.177792156349342e-06, + "loss": 0.2573, + "step": 11380 + }, + { + "epoch": 0.28800263177872815, + "grad_norm": 4.13872766494751, + "learning_rate": 8.177482149514136e-06, + "loss": 0.1567, + "step": 11381 + }, + { + "epoch": 0.2880279373434218, + "grad_norm": 8.839495658874512, + "learning_rate": 8.177172122187939e-06, + "loss": 0.1489, + "step": 11382 + }, + { + "epoch": 0.2880532429081155, + "grad_norm": 6.263126373291016, + "learning_rate": 8.17686207437275e-06, + "loss": 0.2038, + "step": 11383 + }, + { + "epoch": 0.28807854847280917, + "grad_norm": 5.371701240539551, + "learning_rate": 8.176552006070568e-06, + "loss": 0.2303, + "step": 11384 + }, + { + "epoch": 0.2881038540375028, + "grad_norm": 5.989731788635254, + "learning_rate": 8.176241917283392e-06, + "loss": 0.2114, + "step": 11385 + }, + { + "epoch": 0.28812915960219654, + "grad_norm": 7.7452287673950195, + "learning_rate": 8.175931808013225e-06, + "loss": 0.2305, + "step": 11386 + }, + { + "epoch": 0.2881544651668902, + "grad_norm": 2.399832248687744, + "learning_rate": 8.175621678262062e-06, + "loss": 0.1013, + "step": 11387 + }, + { + "epoch": 0.28817977073158385, + "grad_norm": 2.3141896724700928, + "learning_rate": 8.17531152803191e-06, + "loss": 0.0843, + "step": 11388 + }, + { + "epoch": 0.28820507629627756, + "grad_norm": 5.658171653747559, + "learning_rate": 8.175001357324759e-06, + "loss": 0.1966, + "step": 11389 + }, + { + "epoch": 0.2882303818609712, + "grad_norm": 18.139680862426758, + "learning_rate": 8.174691166142616e-06, + "loss": 0.3198, + "step": 11390 + }, + { + "epoch": 0.28825568742566493, + "grad_norm": 5.733304023742676, + "learning_rate": 8.174380954487484e-06, + "loss": 0.1313, + "step": 11391 + }, + { + "epoch": 0.2882809929903586, + "grad_norm": 2.920541763305664, + "learning_rate": 8.174070722361356e-06, + "loss": 0.148, + "step": 11392 + }, + { + "epoch": 0.28830629855505224, + "grad_norm": 4.84035587310791, + "learning_rate": 8.173760469766238e-06, + "loss": 0.2049, + "step": 11393 + }, + { + "epoch": 0.28833160411974595, + "grad_norm": 4.489033222198486, + "learning_rate": 8.173450196704129e-06, + "loss": 0.1886, + "step": 11394 + }, + { + "epoch": 0.2883569096844396, + "grad_norm": 3.9483141899108887, + "learning_rate": 8.17313990317703e-06, + "loss": 0.1079, + "step": 11395 + }, + { + "epoch": 0.28838221524913327, + "grad_norm": 11.403517723083496, + "learning_rate": 8.172829589186941e-06, + "loss": 0.1761, + "step": 11396 + }, + { + "epoch": 0.288407520813827, + "grad_norm": 3.520829677581787, + "learning_rate": 8.172519254735866e-06, + "loss": 0.1942, + "step": 11397 + }, + { + "epoch": 0.28843282637852063, + "grad_norm": 9.210052490234375, + "learning_rate": 8.172208899825802e-06, + "loss": 0.249, + "step": 11398 + }, + { + "epoch": 0.2884581319432143, + "grad_norm": 5.543241024017334, + "learning_rate": 8.171898524458755e-06, + "loss": 0.0619, + "step": 11399 + }, + { + "epoch": 0.288483437507908, + "grad_norm": 4.341203212738037, + "learning_rate": 8.171588128636722e-06, + "loss": 0.1377, + "step": 11400 + }, + { + "epoch": 0.28850874307260166, + "grad_norm": 4.050184726715088, + "learning_rate": 8.17127771236171e-06, + "loss": 0.1807, + "step": 11401 + }, + { + "epoch": 0.2885340486372953, + "grad_norm": 5.612389087677002, + "learning_rate": 8.170967275635717e-06, + "loss": 0.1063, + "step": 11402 + }, + { + "epoch": 0.288559354201989, + "grad_norm": 5.3092265129089355, + "learning_rate": 8.170656818460745e-06, + "loss": 0.1763, + "step": 11403 + }, + { + "epoch": 0.2885846597666827, + "grad_norm": 3.960026979446411, + "learning_rate": 8.170346340838798e-06, + "loss": 0.151, + "step": 11404 + }, + { + "epoch": 0.2886099653313764, + "grad_norm": 6.550901412963867, + "learning_rate": 8.170035842771877e-06, + "loss": 0.1936, + "step": 11405 + }, + { + "epoch": 0.28863527089607005, + "grad_norm": 4.733722686767578, + "learning_rate": 8.169725324261984e-06, + "loss": 0.208, + "step": 11406 + }, + { + "epoch": 0.2886605764607637, + "grad_norm": 4.596617698669434, + "learning_rate": 8.169414785311123e-06, + "loss": 0.2536, + "step": 11407 + }, + { + "epoch": 0.2886858820254574, + "grad_norm": 4.9304094314575195, + "learning_rate": 8.169104225921294e-06, + "loss": 0.1627, + "step": 11408 + }, + { + "epoch": 0.2887111875901511, + "grad_norm": 4.267944812774658, + "learning_rate": 8.168793646094502e-06, + "loss": 0.1429, + "step": 11409 + }, + { + "epoch": 0.28873649315484473, + "grad_norm": 6.812602519989014, + "learning_rate": 8.16848304583275e-06, + "loss": 0.2727, + "step": 11410 + }, + { + "epoch": 0.28876179871953844, + "grad_norm": 6.818572998046875, + "learning_rate": 8.16817242513804e-06, + "loss": 0.2239, + "step": 11411 + }, + { + "epoch": 0.2887871042842321, + "grad_norm": 4.217419147491455, + "learning_rate": 8.167861784012375e-06, + "loss": 0.153, + "step": 11412 + }, + { + "epoch": 0.28881240984892576, + "grad_norm": 12.406728744506836, + "learning_rate": 8.16755112245776e-06, + "loss": 0.3362, + "step": 11413 + }, + { + "epoch": 0.28883771541361947, + "grad_norm": 6.880361557006836, + "learning_rate": 8.167240440476194e-06, + "loss": 0.2277, + "step": 11414 + }, + { + "epoch": 0.2888630209783131, + "grad_norm": 2.929924726486206, + "learning_rate": 8.166929738069686e-06, + "loss": 0.167, + "step": 11415 + }, + { + "epoch": 0.28888832654300683, + "grad_norm": 5.675448894500732, + "learning_rate": 8.166619015240236e-06, + "loss": 0.1088, + "step": 11416 + }, + { + "epoch": 0.2889136321077005, + "grad_norm": 4.922084331512451, + "learning_rate": 8.166308271989849e-06, + "loss": 0.1779, + "step": 11417 + }, + { + "epoch": 0.28893893767239415, + "grad_norm": 11.13778305053711, + "learning_rate": 8.16599750832053e-06, + "loss": 0.3214, + "step": 11418 + }, + { + "epoch": 0.28896424323708786, + "grad_norm": 3.8006458282470703, + "learning_rate": 8.16568672423428e-06, + "loss": 0.1562, + "step": 11419 + }, + { + "epoch": 0.2889895488017815, + "grad_norm": 7.792923927307129, + "learning_rate": 8.165375919733107e-06, + "loss": 0.2707, + "step": 11420 + }, + { + "epoch": 0.28901485436647517, + "grad_norm": 6.295653343200684, + "learning_rate": 8.165065094819013e-06, + "loss": 0.255, + "step": 11421 + }, + { + "epoch": 0.2890401599311689, + "grad_norm": 3.9065730571746826, + "learning_rate": 8.164754249494003e-06, + "loss": 0.1648, + "step": 11422 + }, + { + "epoch": 0.28906546549586254, + "grad_norm": 7.018754005432129, + "learning_rate": 8.164443383760081e-06, + "loss": 0.1515, + "step": 11423 + }, + { + "epoch": 0.2890907710605562, + "grad_norm": 3.4685418605804443, + "learning_rate": 8.164132497619254e-06, + "loss": 0.112, + "step": 11424 + }, + { + "epoch": 0.2891160766252499, + "grad_norm": 3.7194643020629883, + "learning_rate": 8.163821591073525e-06, + "loss": 0.1427, + "step": 11425 + }, + { + "epoch": 0.28914138218994356, + "grad_norm": 7.777151584625244, + "learning_rate": 8.163510664124896e-06, + "loss": 0.2114, + "step": 11426 + }, + { + "epoch": 0.2891666877546372, + "grad_norm": 4.651371002197266, + "learning_rate": 8.163199716775378e-06, + "loss": 0.2386, + "step": 11427 + }, + { + "epoch": 0.28919199331933093, + "grad_norm": 7.216926574707031, + "learning_rate": 8.162888749026973e-06, + "loss": 0.2185, + "step": 11428 + }, + { + "epoch": 0.2892172988840246, + "grad_norm": 4.474124908447266, + "learning_rate": 8.162577760881684e-06, + "loss": 0.1693, + "step": 11429 + }, + { + "epoch": 0.2892426044487183, + "grad_norm": 7.935265064239502, + "learning_rate": 8.162266752341523e-06, + "loss": 0.1627, + "step": 11430 + }, + { + "epoch": 0.28926791001341196, + "grad_norm": 3.4751482009887695, + "learning_rate": 8.16195572340849e-06, + "loss": 0.1155, + "step": 11431 + }, + { + "epoch": 0.2892932155781056, + "grad_norm": 7.960752964019775, + "learning_rate": 8.161644674084593e-06, + "loss": 0.2768, + "step": 11432 + }, + { + "epoch": 0.2893185211427993, + "grad_norm": 3.423041343688965, + "learning_rate": 8.16133360437184e-06, + "loss": 0.1901, + "step": 11433 + }, + { + "epoch": 0.289343826707493, + "grad_norm": 10.652932167053223, + "learning_rate": 8.161022514272231e-06, + "loss": 0.2077, + "step": 11434 + }, + { + "epoch": 0.28936913227218664, + "grad_norm": 5.531343936920166, + "learning_rate": 8.160711403787778e-06, + "loss": 0.1489, + "step": 11435 + }, + { + "epoch": 0.28939443783688035, + "grad_norm": 12.348569869995117, + "learning_rate": 8.160400272920484e-06, + "loss": 0.2666, + "step": 11436 + }, + { + "epoch": 0.289419743401574, + "grad_norm": 10.595879554748535, + "learning_rate": 8.160089121672356e-06, + "loss": 0.0957, + "step": 11437 + }, + { + "epoch": 0.28944504896626766, + "grad_norm": 3.6858623027801514, + "learning_rate": 8.159777950045403e-06, + "loss": 0.1504, + "step": 11438 + }, + { + "epoch": 0.28947035453096137, + "grad_norm": 3.7827181816101074, + "learning_rate": 8.15946675804163e-06, + "loss": 0.1886, + "step": 11439 + }, + { + "epoch": 0.289495660095655, + "grad_norm": 3.3999900817871094, + "learning_rate": 8.159155545663043e-06, + "loss": 0.203, + "step": 11440 + }, + { + "epoch": 0.2895209656603487, + "grad_norm": 7.762533664703369, + "learning_rate": 8.158844312911648e-06, + "loss": 0.2683, + "step": 11441 + }, + { + "epoch": 0.2895462712250424, + "grad_norm": 2.53779935836792, + "learning_rate": 8.158533059789454e-06, + "loss": 0.1175, + "step": 11442 + }, + { + "epoch": 0.28957157678973605, + "grad_norm": 3.8326945304870605, + "learning_rate": 8.158221786298467e-06, + "loss": 0.0808, + "step": 11443 + }, + { + "epoch": 0.28959688235442976, + "grad_norm": 3.744307279586792, + "learning_rate": 8.157910492440697e-06, + "loss": 0.1307, + "step": 11444 + }, + { + "epoch": 0.2896221879191234, + "grad_norm": 3.210881471633911, + "learning_rate": 8.15759917821815e-06, + "loss": 0.1555, + "step": 11445 + }, + { + "epoch": 0.2896474934838171, + "grad_norm": 4.503856658935547, + "learning_rate": 8.157287843632833e-06, + "loss": 0.2329, + "step": 11446 + }, + { + "epoch": 0.2896727990485108, + "grad_norm": 18.92955207824707, + "learning_rate": 8.156976488686753e-06, + "loss": 0.2819, + "step": 11447 + }, + { + "epoch": 0.28969810461320444, + "grad_norm": 4.980787754058838, + "learning_rate": 8.156665113381918e-06, + "loss": 0.1847, + "step": 11448 + }, + { + "epoch": 0.2897234101778981, + "grad_norm": 8.022329330444336, + "learning_rate": 8.156353717720339e-06, + "loss": 0.2068, + "step": 11449 + }, + { + "epoch": 0.2897487157425918, + "grad_norm": 6.712928771972656, + "learning_rate": 8.156042301704021e-06, + "loss": 0.2175, + "step": 11450 + }, + { + "epoch": 0.28977402130728547, + "grad_norm": 7.22198486328125, + "learning_rate": 8.155730865334974e-06, + "loss": 0.1999, + "step": 11451 + }, + { + "epoch": 0.2897993268719791, + "grad_norm": 7.122495651245117, + "learning_rate": 8.155419408615206e-06, + "loss": 0.2051, + "step": 11452 + }, + { + "epoch": 0.28982463243667284, + "grad_norm": 3.469182252883911, + "learning_rate": 8.155107931546723e-06, + "loss": 0.1192, + "step": 11453 + }, + { + "epoch": 0.2898499380013665, + "grad_norm": 6.8875250816345215, + "learning_rate": 8.154796434131537e-06, + "loss": 0.2559, + "step": 11454 + }, + { + "epoch": 0.2898752435660602, + "grad_norm": 6.609772682189941, + "learning_rate": 8.154484916371656e-06, + "loss": 0.2206, + "step": 11455 + }, + { + "epoch": 0.28990054913075386, + "grad_norm": 2.535957098007202, + "learning_rate": 8.154173378269088e-06, + "loss": 0.1239, + "step": 11456 + }, + { + "epoch": 0.2899258546954475, + "grad_norm": 11.973065376281738, + "learning_rate": 8.153861819825844e-06, + "loss": 0.2714, + "step": 11457 + }, + { + "epoch": 0.2899511602601412, + "grad_norm": 5.910017967224121, + "learning_rate": 8.153550241043931e-06, + "loss": 0.1852, + "step": 11458 + }, + { + "epoch": 0.2899764658248349, + "grad_norm": 5.13132905960083, + "learning_rate": 8.153238641925358e-06, + "loss": 0.2224, + "step": 11459 + }, + { + "epoch": 0.29000177138952854, + "grad_norm": 4.057261943817139, + "learning_rate": 8.152927022472136e-06, + "loss": 0.1551, + "step": 11460 + }, + { + "epoch": 0.29002707695422225, + "grad_norm": 4.511441230773926, + "learning_rate": 8.152615382686275e-06, + "loss": 0.1391, + "step": 11461 + }, + { + "epoch": 0.2900523825189159, + "grad_norm": 5.074634552001953, + "learning_rate": 8.152303722569783e-06, + "loss": 0.2082, + "step": 11462 + }, + { + "epoch": 0.29007768808360956, + "grad_norm": 8.432220458984375, + "learning_rate": 8.151992042124672e-06, + "loss": 0.2921, + "step": 11463 + }, + { + "epoch": 0.2901029936483033, + "grad_norm": 5.510247707366943, + "learning_rate": 8.151680341352948e-06, + "loss": 0.1892, + "step": 11464 + }, + { + "epoch": 0.29012829921299693, + "grad_norm": 5.289615154266357, + "learning_rate": 8.151368620256625e-06, + "loss": 0.1916, + "step": 11465 + }, + { + "epoch": 0.2901536047776906, + "grad_norm": 6.207977771759033, + "learning_rate": 8.151056878837713e-06, + "loss": 0.1813, + "step": 11466 + }, + { + "epoch": 0.2901789103423843, + "grad_norm": 6.731039047241211, + "learning_rate": 8.150745117098222e-06, + "loss": 0.2153, + "step": 11467 + }, + { + "epoch": 0.29020421590707796, + "grad_norm": 5.4799909591674805, + "learning_rate": 8.15043333504016e-06, + "loss": 0.1604, + "step": 11468 + }, + { + "epoch": 0.29022952147177167, + "grad_norm": 6.5925726890563965, + "learning_rate": 8.15012153266554e-06, + "loss": 0.1965, + "step": 11469 + }, + { + "epoch": 0.2902548270364653, + "grad_norm": 3.6513164043426514, + "learning_rate": 8.149809709976373e-06, + "loss": 0.1335, + "step": 11470 + }, + { + "epoch": 0.290280132601159, + "grad_norm": 7.93606424331665, + "learning_rate": 8.149497866974667e-06, + "loss": 0.2619, + "step": 11471 + }, + { + "epoch": 0.2903054381658527, + "grad_norm": 5.030745983123779, + "learning_rate": 8.149186003662437e-06, + "loss": 0.1646, + "step": 11472 + }, + { + "epoch": 0.29033074373054635, + "grad_norm": 5.338039398193359, + "learning_rate": 8.14887412004169e-06, + "loss": 0.2307, + "step": 11473 + }, + { + "epoch": 0.29035604929524, + "grad_norm": 3.1183013916015625, + "learning_rate": 8.148562216114442e-06, + "loss": 0.1712, + "step": 11474 + }, + { + "epoch": 0.2903813548599337, + "grad_norm": 6.848255157470703, + "learning_rate": 8.148250291882701e-06, + "loss": 0.1186, + "step": 11475 + }, + { + "epoch": 0.29040666042462737, + "grad_norm": 3.3188438415527344, + "learning_rate": 8.147938347348478e-06, + "loss": 0.1518, + "step": 11476 + }, + { + "epoch": 0.29043196598932103, + "grad_norm": 5.24747371673584, + "learning_rate": 8.14762638251379e-06, + "loss": 0.1419, + "step": 11477 + }, + { + "epoch": 0.29045727155401474, + "grad_norm": 6.694678783416748, + "learning_rate": 8.147314397380641e-06, + "loss": 0.213, + "step": 11478 + }, + { + "epoch": 0.2904825771187084, + "grad_norm": 4.05702018737793, + "learning_rate": 8.14700239195105e-06, + "loss": 0.1676, + "step": 11479 + }, + { + "epoch": 0.2905078826834021, + "grad_norm": 4.22557258605957, + "learning_rate": 8.146690366227023e-06, + "loss": 0.2047, + "step": 11480 + }, + { + "epoch": 0.29053318824809576, + "grad_norm": 22.641883850097656, + "learning_rate": 8.146378320210577e-06, + "loss": 0.3962, + "step": 11481 + }, + { + "epoch": 0.2905584938127894, + "grad_norm": 7.540339469909668, + "learning_rate": 8.146066253903723e-06, + "loss": 0.3174, + "step": 11482 + }, + { + "epoch": 0.29058379937748313, + "grad_norm": 10.49332046508789, + "learning_rate": 8.145754167308474e-06, + "loss": 0.2484, + "step": 11483 + }, + { + "epoch": 0.2906091049421768, + "grad_norm": 2.2989768981933594, + "learning_rate": 8.145442060426837e-06, + "loss": 0.0993, + "step": 11484 + }, + { + "epoch": 0.29063441050687044, + "grad_norm": 3.490846872329712, + "learning_rate": 8.145129933260834e-06, + "loss": 0.1748, + "step": 11485 + }, + { + "epoch": 0.29065971607156416, + "grad_norm": 3.063368320465088, + "learning_rate": 8.14481778581247e-06, + "loss": 0.1237, + "step": 11486 + }, + { + "epoch": 0.2906850216362578, + "grad_norm": 3.6982038021087646, + "learning_rate": 8.144505618083762e-06, + "loss": 0.1203, + "step": 11487 + }, + { + "epoch": 0.29071032720095147, + "grad_norm": 3.149120807647705, + "learning_rate": 8.144193430076722e-06, + "loss": 0.1952, + "step": 11488 + }, + { + "epoch": 0.2907356327656452, + "grad_norm": 3.486938238143921, + "learning_rate": 8.143881221793365e-06, + "loss": 0.1442, + "step": 11489 + }, + { + "epoch": 0.29076093833033884, + "grad_norm": 3.0600624084472656, + "learning_rate": 8.1435689932357e-06, + "loss": 0.1297, + "step": 11490 + }, + { + "epoch": 0.2907862438950325, + "grad_norm": 4.583457946777344, + "learning_rate": 8.143256744405746e-06, + "loss": 0.1711, + "step": 11491 + }, + { + "epoch": 0.2908115494597262, + "grad_norm": 3.3275704383850098, + "learning_rate": 8.142944475305511e-06, + "loss": 0.1454, + "step": 11492 + }, + { + "epoch": 0.29083685502441986, + "grad_norm": 4.836212158203125, + "learning_rate": 8.142632185937013e-06, + "loss": 0.1815, + "step": 11493 + }, + { + "epoch": 0.29086216058911357, + "grad_norm": 3.784604787826538, + "learning_rate": 8.142319876302268e-06, + "loss": 0.0866, + "step": 11494 + }, + { + "epoch": 0.29088746615380723, + "grad_norm": 11.128608703613281, + "learning_rate": 8.142007546403282e-06, + "loss": 0.1887, + "step": 11495 + }, + { + "epoch": 0.2909127717185009, + "grad_norm": 6.8843512535095215, + "learning_rate": 8.141695196242076e-06, + "loss": 0.1435, + "step": 11496 + }, + { + "epoch": 0.2909380772831946, + "grad_norm": 4.107492923736572, + "learning_rate": 8.141382825820661e-06, + "loss": 0.1662, + "step": 11497 + }, + { + "epoch": 0.29096338284788825, + "grad_norm": 4.065109729766846, + "learning_rate": 8.141070435141052e-06, + "loss": 0.1797, + "step": 11498 + }, + { + "epoch": 0.2909886884125819, + "grad_norm": 13.146905899047852, + "learning_rate": 8.140758024205266e-06, + "loss": 0.2803, + "step": 11499 + }, + { + "epoch": 0.2910139939772756, + "grad_norm": 5.507149696350098, + "learning_rate": 8.140445593015316e-06, + "loss": 0.1849, + "step": 11500 + }, + { + "epoch": 0.2910392995419693, + "grad_norm": 6.391613006591797, + "learning_rate": 8.140133141573214e-06, + "loss": 0.2065, + "step": 11501 + }, + { + "epoch": 0.29106460510666293, + "grad_norm": 16.296337127685547, + "learning_rate": 8.13982066988098e-06, + "loss": 0.2597, + "step": 11502 + }, + { + "epoch": 0.29108991067135664, + "grad_norm": 6.1365532875061035, + "learning_rate": 8.139508177940625e-06, + "loss": 0.2476, + "step": 11503 + }, + { + "epoch": 0.2911152162360503, + "grad_norm": 2.4448153972625732, + "learning_rate": 8.139195665754166e-06, + "loss": 0.128, + "step": 11504 + }, + { + "epoch": 0.29114052180074396, + "grad_norm": 5.9542236328125, + "learning_rate": 8.138883133323619e-06, + "loss": 0.1519, + "step": 11505 + }, + { + "epoch": 0.29116582736543767, + "grad_norm": 5.687209129333496, + "learning_rate": 8.138570580650997e-06, + "loss": 0.2351, + "step": 11506 + }, + { + "epoch": 0.2911911329301313, + "grad_norm": 20.61739730834961, + "learning_rate": 8.138258007738318e-06, + "loss": 0.3006, + "step": 11507 + }, + { + "epoch": 0.29121643849482504, + "grad_norm": 3.6756441593170166, + "learning_rate": 8.137945414587596e-06, + "loss": 0.2163, + "step": 11508 + }, + { + "epoch": 0.2912417440595187, + "grad_norm": 8.80003547668457, + "learning_rate": 8.13763280120085e-06, + "loss": 0.2303, + "step": 11509 + }, + { + "epoch": 0.29126704962421235, + "grad_norm": 4.565417289733887, + "learning_rate": 8.137320167580092e-06, + "loss": 0.2055, + "step": 11510 + }, + { + "epoch": 0.29129235518890606, + "grad_norm": 7.008752822875977, + "learning_rate": 8.13700751372734e-06, + "loss": 0.1567, + "step": 11511 + }, + { + "epoch": 0.2913176607535997, + "grad_norm": 3.0042037963867188, + "learning_rate": 8.13669483964461e-06, + "loss": 0.1249, + "step": 11512 + }, + { + "epoch": 0.2913429663182934, + "grad_norm": 9.685907363891602, + "learning_rate": 8.13638214533392e-06, + "loss": 0.1525, + "step": 11513 + }, + { + "epoch": 0.2913682718829871, + "grad_norm": 8.663229942321777, + "learning_rate": 8.136069430797283e-06, + "loss": 0.2054, + "step": 11514 + }, + { + "epoch": 0.29139357744768074, + "grad_norm": 4.59173059463501, + "learning_rate": 8.135756696036718e-06, + "loss": 0.1799, + "step": 11515 + }, + { + "epoch": 0.2914188830123744, + "grad_norm": 9.231627464294434, + "learning_rate": 8.135443941054242e-06, + "loss": 0.4045, + "step": 11516 + }, + { + "epoch": 0.2914441885770681, + "grad_norm": 3.721224546432495, + "learning_rate": 8.135131165851871e-06, + "loss": 0.1791, + "step": 11517 + }, + { + "epoch": 0.29146949414176176, + "grad_norm": 5.082040309906006, + "learning_rate": 8.13481837043162e-06, + "loss": 0.239, + "step": 11518 + }, + { + "epoch": 0.2914947997064555, + "grad_norm": 5.602968215942383, + "learning_rate": 8.134505554795512e-06, + "loss": 0.1997, + "step": 11519 + }, + { + "epoch": 0.29152010527114913, + "grad_norm": 4.229550361633301, + "learning_rate": 8.134192718945559e-06, + "loss": 0.1289, + "step": 11520 + }, + { + "epoch": 0.2915454108358428, + "grad_norm": 10.395035743713379, + "learning_rate": 8.13387986288378e-06, + "loss": 0.2191, + "step": 11521 + }, + { + "epoch": 0.2915707164005365, + "grad_norm": 6.227656841278076, + "learning_rate": 8.133566986612193e-06, + "loss": 0.2712, + "step": 11522 + }, + { + "epoch": 0.29159602196523016, + "grad_norm": 8.29024887084961, + "learning_rate": 8.133254090132816e-06, + "loss": 0.1674, + "step": 11523 + }, + { + "epoch": 0.2916213275299238, + "grad_norm": 4.563589572906494, + "learning_rate": 8.132941173447665e-06, + "loss": 0.1993, + "step": 11524 + }, + { + "epoch": 0.2916466330946175, + "grad_norm": 2.8150458335876465, + "learning_rate": 8.13262823655876e-06, + "loss": 0.1325, + "step": 11525 + }, + { + "epoch": 0.2916719386593112, + "grad_norm": 8.65768051147461, + "learning_rate": 8.132315279468119e-06, + "loss": 0.2444, + "step": 11526 + }, + { + "epoch": 0.29169724422400484, + "grad_norm": 2.689312696456909, + "learning_rate": 8.132002302177759e-06, + "loss": 0.1611, + "step": 11527 + }, + { + "epoch": 0.29172254978869855, + "grad_norm": 4.634989261627197, + "learning_rate": 8.131689304689698e-06, + "loss": 0.22, + "step": 11528 + }, + { + "epoch": 0.2917478553533922, + "grad_norm": 3.3187925815582275, + "learning_rate": 8.131376287005954e-06, + "loss": 0.085, + "step": 11529 + }, + { + "epoch": 0.29177316091808586, + "grad_norm": 4.678548812866211, + "learning_rate": 8.131063249128549e-06, + "loss": 0.1589, + "step": 11530 + }, + { + "epoch": 0.2917984664827796, + "grad_norm": 13.395692825317383, + "learning_rate": 8.1307501910595e-06, + "loss": 0.3114, + "step": 11531 + }, + { + "epoch": 0.29182377204747323, + "grad_norm": 7.471484184265137, + "learning_rate": 8.130437112800823e-06, + "loss": 0.2763, + "step": 11532 + }, + { + "epoch": 0.29184907761216694, + "grad_norm": 4.122398376464844, + "learning_rate": 8.130124014354541e-06, + "loss": 0.1727, + "step": 11533 + }, + { + "epoch": 0.2918743831768606, + "grad_norm": 5.095211029052734, + "learning_rate": 8.129810895722671e-06, + "loss": 0.1571, + "step": 11534 + }, + { + "epoch": 0.29189968874155425, + "grad_norm": 4.392849922180176, + "learning_rate": 8.129497756907231e-06, + "loss": 0.1102, + "step": 11535 + }, + { + "epoch": 0.29192499430624796, + "grad_norm": 3.660672664642334, + "learning_rate": 8.129184597910245e-06, + "loss": 0.1748, + "step": 11536 + }, + { + "epoch": 0.2919502998709416, + "grad_norm": 3.9758124351501465, + "learning_rate": 8.128871418733728e-06, + "loss": 0.1653, + "step": 11537 + }, + { + "epoch": 0.2919756054356353, + "grad_norm": 7.926916599273682, + "learning_rate": 8.128558219379703e-06, + "loss": 0.2421, + "step": 11538 + }, + { + "epoch": 0.292000911000329, + "grad_norm": 6.491769313812256, + "learning_rate": 8.128244999850186e-06, + "loss": 0.1838, + "step": 11539 + }, + { + "epoch": 0.29202621656502264, + "grad_norm": 4.357585430145264, + "learning_rate": 8.1279317601472e-06, + "loss": 0.1713, + "step": 11540 + }, + { + "epoch": 0.2920515221297163, + "grad_norm": 5.192793369293213, + "learning_rate": 8.127618500272762e-06, + "loss": 0.2239, + "step": 11541 + }, + { + "epoch": 0.29207682769441, + "grad_norm": 3.8551464080810547, + "learning_rate": 8.127305220228896e-06, + "loss": 0.1366, + "step": 11542 + }, + { + "epoch": 0.29210213325910367, + "grad_norm": 6.454930782318115, + "learning_rate": 8.126991920017621e-06, + "loss": 0.2213, + "step": 11543 + }, + { + "epoch": 0.2921274388237974, + "grad_norm": 7.7038960456848145, + "learning_rate": 8.126678599640955e-06, + "loss": 0.2022, + "step": 11544 + }, + { + "epoch": 0.29215274438849104, + "grad_norm": 6.066104888916016, + "learning_rate": 8.12636525910092e-06, + "loss": 0.2, + "step": 11545 + }, + { + "epoch": 0.2921780499531847, + "grad_norm": 2.8504583835601807, + "learning_rate": 8.12605189839954e-06, + "loss": 0.1077, + "step": 11546 + }, + { + "epoch": 0.2922033555178784, + "grad_norm": 3.852677345275879, + "learning_rate": 8.12573851753883e-06, + "loss": 0.1156, + "step": 11547 + }, + { + "epoch": 0.29222866108257206, + "grad_norm": 2.1448071002960205, + "learning_rate": 8.125425116520815e-06, + "loss": 0.0789, + "step": 11548 + }, + { + "epoch": 0.2922539666472657, + "grad_norm": 4.198514461517334, + "learning_rate": 8.125111695347513e-06, + "loss": 0.1575, + "step": 11549 + }, + { + "epoch": 0.29227927221195943, + "grad_norm": 5.909639358520508, + "learning_rate": 8.12479825402095e-06, + "loss": 0.2454, + "step": 11550 + }, + { + "epoch": 0.2923045777766531, + "grad_norm": 9.524950981140137, + "learning_rate": 8.124484792543141e-06, + "loss": 0.2125, + "step": 11551 + }, + { + "epoch": 0.29232988334134674, + "grad_norm": 7.027233123779297, + "learning_rate": 8.124171310916113e-06, + "loss": 0.1773, + "step": 11552 + }, + { + "epoch": 0.29235518890604045, + "grad_norm": 9.773150444030762, + "learning_rate": 8.123857809141885e-06, + "loss": 0.2552, + "step": 11553 + }, + { + "epoch": 0.2923804944707341, + "grad_norm": 6.470432758331299, + "learning_rate": 8.12354428722248e-06, + "loss": 0.1954, + "step": 11554 + }, + { + "epoch": 0.29240580003542777, + "grad_norm": 4.452539920806885, + "learning_rate": 8.123230745159918e-06, + "loss": 0.1795, + "step": 11555 + }, + { + "epoch": 0.2924311056001215, + "grad_norm": 3.292236566543579, + "learning_rate": 8.122917182956221e-06, + "loss": 0.1569, + "step": 11556 + }, + { + "epoch": 0.29245641116481513, + "grad_norm": 7.071910858154297, + "learning_rate": 8.122603600613413e-06, + "loss": 0.1953, + "step": 11557 + }, + { + "epoch": 0.29248171672950884, + "grad_norm": 3.8578600883483887, + "learning_rate": 8.122289998133515e-06, + "loss": 0.1967, + "step": 11558 + }, + { + "epoch": 0.2925070222942025, + "grad_norm": 5.92374324798584, + "learning_rate": 8.12197637551855e-06, + "loss": 0.1195, + "step": 11559 + }, + { + "epoch": 0.29253232785889616, + "grad_norm": 5.7487406730651855, + "learning_rate": 8.121662732770537e-06, + "loss": 0.1659, + "step": 11560 + }, + { + "epoch": 0.29255763342358987, + "grad_norm": 4.03800106048584, + "learning_rate": 8.121349069891506e-06, + "loss": 0.1599, + "step": 11561 + }, + { + "epoch": 0.2925829389882835, + "grad_norm": 6.679915428161621, + "learning_rate": 8.121035386883474e-06, + "loss": 0.1738, + "step": 11562 + }, + { + "epoch": 0.2926082445529772, + "grad_norm": 10.240608215332031, + "learning_rate": 8.120721683748465e-06, + "loss": 0.1869, + "step": 11563 + }, + { + "epoch": 0.2926335501176709, + "grad_norm": 5.003573894500732, + "learning_rate": 8.120407960488501e-06, + "loss": 0.1739, + "step": 11564 + }, + { + "epoch": 0.29265885568236455, + "grad_norm": 18.770484924316406, + "learning_rate": 8.120094217105609e-06, + "loss": 0.2461, + "step": 11565 + }, + { + "epoch": 0.2926841612470582, + "grad_norm": 4.160163402557373, + "learning_rate": 8.119780453601808e-06, + "loss": 0.1362, + "step": 11566 + }, + { + "epoch": 0.2927094668117519, + "grad_norm": 8.137343406677246, + "learning_rate": 8.119466669979125e-06, + "loss": 0.2205, + "step": 11567 + }, + { + "epoch": 0.2927347723764456, + "grad_norm": 6.883715629577637, + "learning_rate": 8.119152866239579e-06, + "loss": 0.271, + "step": 11568 + }, + { + "epoch": 0.29276007794113923, + "grad_norm": 8.353708267211914, + "learning_rate": 8.1188390423852e-06, + "loss": 0.2659, + "step": 11569 + }, + { + "epoch": 0.29278538350583294, + "grad_norm": 8.74693489074707, + "learning_rate": 8.118525198418005e-06, + "loss": 0.2023, + "step": 11570 + }, + { + "epoch": 0.2928106890705266, + "grad_norm": 4.766046047210693, + "learning_rate": 8.118211334340023e-06, + "loss": 0.2156, + "step": 11571 + }, + { + "epoch": 0.2928359946352203, + "grad_norm": 6.3869171142578125, + "learning_rate": 8.117897450153275e-06, + "loss": 0.2564, + "step": 11572 + }, + { + "epoch": 0.29286130019991397, + "grad_norm": 3.6833224296569824, + "learning_rate": 8.117583545859787e-06, + "loss": 0.164, + "step": 11573 + }, + { + "epoch": 0.2928866057646076, + "grad_norm": 6.512035846710205, + "learning_rate": 8.117269621461581e-06, + "loss": 0.0932, + "step": 11574 + }, + { + "epoch": 0.29291191132930133, + "grad_norm": 3.687194585800171, + "learning_rate": 8.116955676960685e-06, + "loss": 0.2118, + "step": 11575 + }, + { + "epoch": 0.292937216893995, + "grad_norm": 5.245865345001221, + "learning_rate": 8.116641712359122e-06, + "loss": 0.1868, + "step": 11576 + }, + { + "epoch": 0.29296252245868865, + "grad_norm": 7.5034942626953125, + "learning_rate": 8.116327727658915e-06, + "loss": 0.1793, + "step": 11577 + }, + { + "epoch": 0.29298782802338236, + "grad_norm": 2.99371600151062, + "learning_rate": 8.11601372286209e-06, + "loss": 0.1635, + "step": 11578 + }, + { + "epoch": 0.293013133588076, + "grad_norm": 3.89426589012146, + "learning_rate": 8.115699697970673e-06, + "loss": 0.1591, + "step": 11579 + }, + { + "epoch": 0.29303843915276967, + "grad_norm": 3.2574117183685303, + "learning_rate": 8.115385652986689e-06, + "loss": 0.1313, + "step": 11580 + }, + { + "epoch": 0.2930637447174634, + "grad_norm": 12.22158145904541, + "learning_rate": 8.11507158791216e-06, + "loss": 0.2796, + "step": 11581 + }, + { + "epoch": 0.29308905028215704, + "grad_norm": 7.3313517570495605, + "learning_rate": 8.114757502749114e-06, + "loss": 0.1719, + "step": 11582 + }, + { + "epoch": 0.29311435584685075, + "grad_norm": 21.45802116394043, + "learning_rate": 8.114443397499578e-06, + "loss": 0.4889, + "step": 11583 + }, + { + "epoch": 0.2931396614115444, + "grad_norm": 30.34442901611328, + "learning_rate": 8.114129272165576e-06, + "loss": 0.3231, + "step": 11584 + }, + { + "epoch": 0.29316496697623806, + "grad_norm": 5.04325532913208, + "learning_rate": 8.113815126749132e-06, + "loss": 0.1859, + "step": 11585 + }, + { + "epoch": 0.2931902725409318, + "grad_norm": 4.315030574798584, + "learning_rate": 8.113500961252273e-06, + "loss": 0.1577, + "step": 11586 + }, + { + "epoch": 0.29321557810562543, + "grad_norm": 11.386841773986816, + "learning_rate": 8.113186775677029e-06, + "loss": 0.1957, + "step": 11587 + }, + { + "epoch": 0.2932408836703191, + "grad_norm": 9.45108699798584, + "learning_rate": 8.11287257002542e-06, + "loss": 0.1966, + "step": 11588 + }, + { + "epoch": 0.2932661892350128, + "grad_norm": 3.8967044353485107, + "learning_rate": 8.112558344299475e-06, + "loss": 0.1309, + "step": 11589 + }, + { + "epoch": 0.29329149479970645, + "grad_norm": 6.601212501525879, + "learning_rate": 8.112244098501221e-06, + "loss": 0.1368, + "step": 11590 + }, + { + "epoch": 0.2933168003644001, + "grad_norm": 3.1357293128967285, + "learning_rate": 8.111929832632683e-06, + "loss": 0.1552, + "step": 11591 + }, + { + "epoch": 0.2933421059290938, + "grad_norm": 7.678310394287109, + "learning_rate": 8.111615546695887e-06, + "loss": 0.2484, + "step": 11592 + }, + { + "epoch": 0.2933674114937875, + "grad_norm": 3.376429796218872, + "learning_rate": 8.111301240692863e-06, + "loss": 0.1842, + "step": 11593 + }, + { + "epoch": 0.29339271705848113, + "grad_norm": 4.37099027633667, + "learning_rate": 8.110986914625635e-06, + "loss": 0.1951, + "step": 11594 + }, + { + "epoch": 0.29341802262317485, + "grad_norm": 2.714282512664795, + "learning_rate": 8.110672568496232e-06, + "loss": 0.1091, + "step": 11595 + }, + { + "epoch": 0.2934433281878685, + "grad_norm": 8.070829391479492, + "learning_rate": 8.11035820230668e-06, + "loss": 0.2748, + "step": 11596 + }, + { + "epoch": 0.2934686337525622, + "grad_norm": 4.811413764953613, + "learning_rate": 8.110043816059005e-06, + "loss": 0.2009, + "step": 11597 + }, + { + "epoch": 0.29349393931725587, + "grad_norm": 13.722198486328125, + "learning_rate": 8.10972940975524e-06, + "loss": 0.1648, + "step": 11598 + }, + { + "epoch": 0.2935192448819495, + "grad_norm": 8.178766250610352, + "learning_rate": 8.109414983397405e-06, + "loss": 0.2196, + "step": 11599 + }, + { + "epoch": 0.29354455044664324, + "grad_norm": 14.017691612243652, + "learning_rate": 8.109100536987531e-06, + "loss": 0.1875, + "step": 11600 + }, + { + "epoch": 0.2935698560113369, + "grad_norm": 6.472843170166016, + "learning_rate": 8.108786070527647e-06, + "loss": 0.2274, + "step": 11601 + }, + { + "epoch": 0.29359516157603055, + "grad_norm": 7.7691650390625, + "learning_rate": 8.108471584019782e-06, + "loss": 0.2426, + "step": 11602 + }, + { + "epoch": 0.29362046714072426, + "grad_norm": 5.571775913238525, + "learning_rate": 8.10815707746596e-06, + "loss": 0.1482, + "step": 11603 + }, + { + "epoch": 0.2936457727054179, + "grad_norm": 3.2204666137695312, + "learning_rate": 8.10784255086821e-06, + "loss": 0.1073, + "step": 11604 + }, + { + "epoch": 0.2936710782701116, + "grad_norm": 12.39403247833252, + "learning_rate": 8.107528004228563e-06, + "loss": 0.216, + "step": 11605 + }, + { + "epoch": 0.2936963838348053, + "grad_norm": 7.175530910491943, + "learning_rate": 8.107213437549045e-06, + "loss": 0.2545, + "step": 11606 + }, + { + "epoch": 0.29372168939949894, + "grad_norm": 6.178318977355957, + "learning_rate": 8.106898850831687e-06, + "loss": 0.2571, + "step": 11607 + }, + { + "epoch": 0.29374699496419265, + "grad_norm": 5.30674934387207, + "learning_rate": 8.106584244078514e-06, + "loss": 0.1295, + "step": 11608 + }, + { + "epoch": 0.2937723005288863, + "grad_norm": 3.826617479324341, + "learning_rate": 8.10626961729156e-06, + "loss": 0.2056, + "step": 11609 + }, + { + "epoch": 0.29379760609357997, + "grad_norm": 5.006311416625977, + "learning_rate": 8.105954970472848e-06, + "loss": 0.1411, + "step": 11610 + }, + { + "epoch": 0.2938229116582737, + "grad_norm": 6.097585678100586, + "learning_rate": 8.105640303624412e-06, + "loss": 0.1817, + "step": 11611 + }, + { + "epoch": 0.29384821722296733, + "grad_norm": 3.878235101699829, + "learning_rate": 8.105325616748279e-06, + "loss": 0.1395, + "step": 11612 + }, + { + "epoch": 0.293873522787661, + "grad_norm": 5.624078273773193, + "learning_rate": 8.105010909846478e-06, + "loss": 0.1604, + "step": 11613 + }, + { + "epoch": 0.2938988283523547, + "grad_norm": 5.300279140472412, + "learning_rate": 8.10469618292104e-06, + "loss": 0.1907, + "step": 11614 + }, + { + "epoch": 0.29392413391704836, + "grad_norm": 4.998624801635742, + "learning_rate": 8.104381435973993e-06, + "loss": 0.1935, + "step": 11615 + }, + { + "epoch": 0.293949439481742, + "grad_norm": 4.1593804359436035, + "learning_rate": 8.10406666900737e-06, + "loss": 0.1303, + "step": 11616 + }, + { + "epoch": 0.2939747450464357, + "grad_norm": 6.628014087677002, + "learning_rate": 8.103751882023197e-06, + "loss": 0.2374, + "step": 11617 + }, + { + "epoch": 0.2940000506111294, + "grad_norm": 3.400984764099121, + "learning_rate": 8.103437075023504e-06, + "loss": 0.19, + "step": 11618 + }, + { + "epoch": 0.29402535617582304, + "grad_norm": 5.4202775955200195, + "learning_rate": 8.103122248010324e-06, + "loss": 0.1263, + "step": 11619 + }, + { + "epoch": 0.29405066174051675, + "grad_norm": 4.6451568603515625, + "learning_rate": 8.102807400985686e-06, + "loss": 0.1726, + "step": 11620 + }, + { + "epoch": 0.2940759673052104, + "grad_norm": 4.984665870666504, + "learning_rate": 8.10249253395162e-06, + "loss": 0.2105, + "step": 11621 + }, + { + "epoch": 0.2941012728699041, + "grad_norm": 4.013720512390137, + "learning_rate": 8.102177646910157e-06, + "loss": 0.1491, + "step": 11622 + }, + { + "epoch": 0.2941265784345978, + "grad_norm": 6.940145969390869, + "learning_rate": 8.101862739863327e-06, + "loss": 0.2127, + "step": 11623 + }, + { + "epoch": 0.29415188399929143, + "grad_norm": 6.693833351135254, + "learning_rate": 8.101547812813159e-06, + "loss": 0.2446, + "step": 11624 + }, + { + "epoch": 0.29417718956398514, + "grad_norm": 4.6506195068359375, + "learning_rate": 8.10123286576169e-06, + "loss": 0.2041, + "step": 11625 + }, + { + "epoch": 0.2942024951286788, + "grad_norm": 6.7989654541015625, + "learning_rate": 8.100917898710945e-06, + "loss": 0.2624, + "step": 11626 + }, + { + "epoch": 0.29422780069337245, + "grad_norm": 6.072089672088623, + "learning_rate": 8.100602911662958e-06, + "loss": 0.2174, + "step": 11627 + }, + { + "epoch": 0.29425310625806617, + "grad_norm": 7.2189860343933105, + "learning_rate": 8.100287904619758e-06, + "loss": 0.1462, + "step": 11628 + }, + { + "epoch": 0.2942784118227598, + "grad_norm": 4.040782451629639, + "learning_rate": 8.099972877583381e-06, + "loss": 0.1625, + "step": 11629 + }, + { + "epoch": 0.2943037173874535, + "grad_norm": 7.64746618270874, + "learning_rate": 8.099657830555853e-06, + "loss": 0.2191, + "step": 11630 + }, + { + "epoch": 0.2943290229521472, + "grad_norm": 4.59202241897583, + "learning_rate": 8.09934276353921e-06, + "loss": 0.2058, + "step": 11631 + }, + { + "epoch": 0.29435432851684085, + "grad_norm": 3.0403976440429688, + "learning_rate": 8.09902767653548e-06, + "loss": 0.1037, + "step": 11632 + }, + { + "epoch": 0.2943796340815345, + "grad_norm": 8.678194999694824, + "learning_rate": 8.098712569546699e-06, + "loss": 0.2149, + "step": 11633 + }, + { + "epoch": 0.2944049396462282, + "grad_norm": 2.5119125843048096, + "learning_rate": 8.098397442574897e-06, + "loss": 0.1196, + "step": 11634 + }, + { + "epoch": 0.29443024521092187, + "grad_norm": 5.881099224090576, + "learning_rate": 8.098082295622104e-06, + "loss": 0.2506, + "step": 11635 + }, + { + "epoch": 0.2944555507756156, + "grad_norm": 10.490458488464355, + "learning_rate": 8.097767128690356e-06, + "loss": 0.2573, + "step": 11636 + }, + { + "epoch": 0.29448085634030924, + "grad_norm": 4.684136867523193, + "learning_rate": 8.097451941781684e-06, + "loss": 0.1403, + "step": 11637 + }, + { + "epoch": 0.2945061619050029, + "grad_norm": 9.616984367370605, + "learning_rate": 8.09713673489812e-06, + "loss": 0.1061, + "step": 11638 + }, + { + "epoch": 0.2945314674696966, + "grad_norm": 7.656881332397461, + "learning_rate": 8.0968215080417e-06, + "loss": 0.2777, + "step": 11639 + }, + { + "epoch": 0.29455677303439026, + "grad_norm": 7.756229877471924, + "learning_rate": 8.09650626121445e-06, + "loss": 0.2479, + "step": 11640 + }, + { + "epoch": 0.2945820785990839, + "grad_norm": 18.065147399902344, + "learning_rate": 8.09619099441841e-06, + "loss": 0.1587, + "step": 11641 + }, + { + "epoch": 0.29460738416377763, + "grad_norm": 5.424914360046387, + "learning_rate": 8.09587570765561e-06, + "loss": 0.1488, + "step": 11642 + }, + { + "epoch": 0.2946326897284713, + "grad_norm": 3.8150253295898438, + "learning_rate": 8.095560400928082e-06, + "loss": 0.1989, + "step": 11643 + }, + { + "epoch": 0.29465799529316494, + "grad_norm": 2.819789409637451, + "learning_rate": 8.095245074237862e-06, + "loss": 0.1601, + "step": 11644 + }, + { + "epoch": 0.29468330085785865, + "grad_norm": 14.613340377807617, + "learning_rate": 8.094929727586982e-06, + "loss": 0.3436, + "step": 11645 + }, + { + "epoch": 0.2947086064225523, + "grad_norm": 5.5326995849609375, + "learning_rate": 8.094614360977475e-06, + "loss": 0.1263, + "step": 11646 + }, + { + "epoch": 0.294733911987246, + "grad_norm": 2.6795923709869385, + "learning_rate": 8.094298974411377e-06, + "loss": 0.1655, + "step": 11647 + }, + { + "epoch": 0.2947592175519397, + "grad_norm": 7.088406562805176, + "learning_rate": 8.09398356789072e-06, + "loss": 0.2536, + "step": 11648 + }, + { + "epoch": 0.29478452311663333, + "grad_norm": 12.43612289428711, + "learning_rate": 8.093668141417539e-06, + "loss": 0.1835, + "step": 11649 + }, + { + "epoch": 0.29480982868132705, + "grad_norm": 8.715843200683594, + "learning_rate": 8.093352694993865e-06, + "loss": 0.2148, + "step": 11650 + }, + { + "epoch": 0.2948351342460207, + "grad_norm": 3.807281970977783, + "learning_rate": 8.093037228621738e-06, + "loss": 0.1373, + "step": 11651 + }, + { + "epoch": 0.29486043981071436, + "grad_norm": 6.8127970695495605, + "learning_rate": 8.092721742303189e-06, + "loss": 0.2965, + "step": 11652 + }, + { + "epoch": 0.29488574537540807, + "grad_norm": 9.507308959960938, + "learning_rate": 8.092406236040251e-06, + "loss": 0.1564, + "step": 11653 + }, + { + "epoch": 0.2949110509401017, + "grad_norm": 4.623828887939453, + "learning_rate": 8.092090709834961e-06, + "loss": 0.1422, + "step": 11654 + }, + { + "epoch": 0.2949363565047954, + "grad_norm": 9.947859764099121, + "learning_rate": 8.091775163689354e-06, + "loss": 0.2196, + "step": 11655 + }, + { + "epoch": 0.2949616620694891, + "grad_norm": 4.443120956420898, + "learning_rate": 8.091459597605463e-06, + "loss": 0.202, + "step": 11656 + }, + { + "epoch": 0.29498696763418275, + "grad_norm": 2.832225799560547, + "learning_rate": 8.091144011585325e-06, + "loss": 0.1321, + "step": 11657 + }, + { + "epoch": 0.2950122731988764, + "grad_norm": 3.4911258220672607, + "learning_rate": 8.090828405630974e-06, + "loss": 0.1684, + "step": 11658 + }, + { + "epoch": 0.2950375787635701, + "grad_norm": 3.871915102005005, + "learning_rate": 8.090512779744444e-06, + "loss": 0.1423, + "step": 11659 + }, + { + "epoch": 0.2950628843282638, + "grad_norm": 6.403805255889893, + "learning_rate": 8.090197133927775e-06, + "loss": 0.282, + "step": 11660 + }, + { + "epoch": 0.2950881898929575, + "grad_norm": 5.423400402069092, + "learning_rate": 8.089881468182998e-06, + "loss": 0.2486, + "step": 11661 + }, + { + "epoch": 0.29511349545765114, + "grad_norm": 3.3278636932373047, + "learning_rate": 8.08956578251215e-06, + "loss": 0.1162, + "step": 11662 + }, + { + "epoch": 0.2951388010223448, + "grad_norm": 5.984988212585449, + "learning_rate": 8.089250076917268e-06, + "loss": 0.3119, + "step": 11663 + }, + { + "epoch": 0.2951641065870385, + "grad_norm": 7.923135280609131, + "learning_rate": 8.088934351400386e-06, + "loss": 0.216, + "step": 11664 + }, + { + "epoch": 0.29518941215173217, + "grad_norm": 8.634416580200195, + "learning_rate": 8.08861860596354e-06, + "loss": 0.2614, + "step": 11665 + }, + { + "epoch": 0.2952147177164258, + "grad_norm": 5.763696670532227, + "learning_rate": 8.088302840608769e-06, + "loss": 0.1709, + "step": 11666 + }, + { + "epoch": 0.29524002328111953, + "grad_norm": 6.416005611419678, + "learning_rate": 8.087987055338106e-06, + "loss": 0.1604, + "step": 11667 + }, + { + "epoch": 0.2952653288458132, + "grad_norm": 10.317317008972168, + "learning_rate": 8.087671250153588e-06, + "loss": 0.2196, + "step": 11668 + }, + { + "epoch": 0.29529063441050685, + "grad_norm": 5.414995193481445, + "learning_rate": 8.087355425057255e-06, + "loss": 0.2043, + "step": 11669 + }, + { + "epoch": 0.29531593997520056, + "grad_norm": 3.1104564666748047, + "learning_rate": 8.08703958005114e-06, + "loss": 0.1571, + "step": 11670 + }, + { + "epoch": 0.2953412455398942, + "grad_norm": 4.711106300354004, + "learning_rate": 8.086723715137281e-06, + "loss": 0.2029, + "step": 11671 + }, + { + "epoch": 0.2953665511045879, + "grad_norm": 13.61816120147705, + "learning_rate": 8.086407830317714e-06, + "loss": 0.4259, + "step": 11672 + }, + { + "epoch": 0.2953918566692816, + "grad_norm": 6.095036506652832, + "learning_rate": 8.08609192559448e-06, + "loss": 0.2958, + "step": 11673 + }, + { + "epoch": 0.29541716223397524, + "grad_norm": 10.524020195007324, + "learning_rate": 8.08577600096961e-06, + "loss": 0.1695, + "step": 11674 + }, + { + "epoch": 0.29544246779866895, + "grad_norm": 6.140295028686523, + "learning_rate": 8.085460056445147e-06, + "loss": 0.242, + "step": 11675 + }, + { + "epoch": 0.2954677733633626, + "grad_norm": 6.8282647132873535, + "learning_rate": 8.085144092023123e-06, + "loss": 0.2032, + "step": 11676 + }, + { + "epoch": 0.29549307892805626, + "grad_norm": 6.88934850692749, + "learning_rate": 8.084828107705583e-06, + "loss": 0.2204, + "step": 11677 + }, + { + "epoch": 0.29551838449275, + "grad_norm": 7.729603290557861, + "learning_rate": 8.084512103494556e-06, + "loss": 0.2944, + "step": 11678 + }, + { + "epoch": 0.29554369005744363, + "grad_norm": 4.7977681159973145, + "learning_rate": 8.084196079392087e-06, + "loss": 0.2108, + "step": 11679 + }, + { + "epoch": 0.2955689956221373, + "grad_norm": 5.307918548583984, + "learning_rate": 8.083880035400208e-06, + "loss": 0.1744, + "step": 11680 + }, + { + "epoch": 0.295594301186831, + "grad_norm": 5.023597240447998, + "learning_rate": 8.083563971520963e-06, + "loss": 0.1919, + "step": 11681 + }, + { + "epoch": 0.29561960675152466, + "grad_norm": 4.111904144287109, + "learning_rate": 8.083247887756388e-06, + "loss": 0.1772, + "step": 11682 + }, + { + "epoch": 0.2956449123162183, + "grad_norm": 19.094451904296875, + "learning_rate": 8.08293178410852e-06, + "loss": 0.349, + "step": 11683 + }, + { + "epoch": 0.295670217880912, + "grad_norm": 4.557724952697754, + "learning_rate": 8.082615660579398e-06, + "loss": 0.1665, + "step": 11684 + }, + { + "epoch": 0.2956955234456057, + "grad_norm": 6.2378950119018555, + "learning_rate": 8.082299517171061e-06, + "loss": 0.2465, + "step": 11685 + }, + { + "epoch": 0.2957208290102994, + "grad_norm": 7.367788314819336, + "learning_rate": 8.08198335388555e-06, + "loss": 0.2572, + "step": 11686 + }, + { + "epoch": 0.29574613457499305, + "grad_norm": 5.4537811279296875, + "learning_rate": 8.081667170724899e-06, + "loss": 0.2114, + "step": 11687 + }, + { + "epoch": 0.2957714401396867, + "grad_norm": 9.4976167678833, + "learning_rate": 8.081350967691151e-06, + "loss": 0.2184, + "step": 11688 + }, + { + "epoch": 0.2957967457043804, + "grad_norm": 4.8570685386657715, + "learning_rate": 8.081034744786342e-06, + "loss": 0.2098, + "step": 11689 + }, + { + "epoch": 0.29582205126907407, + "grad_norm": 4.262640476226807, + "learning_rate": 8.080718502012514e-06, + "loss": 0.1716, + "step": 11690 + }, + { + "epoch": 0.2958473568337677, + "grad_norm": 4.5270538330078125, + "learning_rate": 8.080402239371706e-06, + "loss": 0.1408, + "step": 11691 + }, + { + "epoch": 0.29587266239846144, + "grad_norm": 4.402870178222656, + "learning_rate": 8.080085956865958e-06, + "loss": 0.1682, + "step": 11692 + }, + { + "epoch": 0.2958979679631551, + "grad_norm": 5.486601829528809, + "learning_rate": 8.079769654497307e-06, + "loss": 0.1542, + "step": 11693 + }, + { + "epoch": 0.29592327352784875, + "grad_norm": 4.476035118103027, + "learning_rate": 8.079453332267794e-06, + "loss": 0.1004, + "step": 11694 + }, + { + "epoch": 0.29594857909254246, + "grad_norm": 3.494961977005005, + "learning_rate": 8.07913699017946e-06, + "loss": 0.1376, + "step": 11695 + }, + { + "epoch": 0.2959738846572361, + "grad_norm": 2.7097060680389404, + "learning_rate": 8.078820628234343e-06, + "loss": 0.1441, + "step": 11696 + }, + { + "epoch": 0.2959991902219298, + "grad_norm": 20.45442771911621, + "learning_rate": 8.078504246434487e-06, + "loss": 0.2033, + "step": 11697 + }, + { + "epoch": 0.2960244957866235, + "grad_norm": 6.959649562835693, + "learning_rate": 8.078187844781928e-06, + "loss": 0.2678, + "step": 11698 + }, + { + "epoch": 0.29604980135131714, + "grad_norm": 13.913368225097656, + "learning_rate": 8.077871423278708e-06, + "loss": 0.1871, + "step": 11699 + }, + { + "epoch": 0.29607510691601086, + "grad_norm": 7.6300530433654785, + "learning_rate": 8.077554981926869e-06, + "loss": 0.2251, + "step": 11700 + }, + { + "epoch": 0.2961004124807045, + "grad_norm": 5.795248508453369, + "learning_rate": 8.07723852072845e-06, + "loss": 0.1864, + "step": 11701 + }, + { + "epoch": 0.29612571804539817, + "grad_norm": 4.287088394165039, + "learning_rate": 8.076922039685492e-06, + "loss": 0.1712, + "step": 11702 + }, + { + "epoch": 0.2961510236100919, + "grad_norm": 6.742890357971191, + "learning_rate": 8.076605538800035e-06, + "loss": 0.1995, + "step": 11703 + }, + { + "epoch": 0.29617632917478554, + "grad_norm": 4.708189010620117, + "learning_rate": 8.076289018074122e-06, + "loss": 0.1435, + "step": 11704 + }, + { + "epoch": 0.2962016347394792, + "grad_norm": 6.6027913093566895, + "learning_rate": 8.075972477509795e-06, + "loss": 0.1741, + "step": 11705 + }, + { + "epoch": 0.2962269403041729, + "grad_norm": 5.125168323516846, + "learning_rate": 8.075655917109092e-06, + "loss": 0.2058, + "step": 11706 + }, + { + "epoch": 0.29625224586886656, + "grad_norm": 6.127476215362549, + "learning_rate": 8.075339336874057e-06, + "loss": 0.1629, + "step": 11707 + }, + { + "epoch": 0.2962775514335602, + "grad_norm": 3.047588586807251, + "learning_rate": 8.07502273680673e-06, + "loss": 0.073, + "step": 11708 + }, + { + "epoch": 0.2963028569982539, + "grad_norm": 4.64502477645874, + "learning_rate": 8.074706116909151e-06, + "loss": 0.1787, + "step": 11709 + }, + { + "epoch": 0.2963281625629476, + "grad_norm": 3.2618961334228516, + "learning_rate": 8.074389477183368e-06, + "loss": 0.1221, + "step": 11710 + }, + { + "epoch": 0.2963534681276413, + "grad_norm": 4.830432891845703, + "learning_rate": 8.074072817631417e-06, + "loss": 0.2, + "step": 11711 + }, + { + "epoch": 0.29637877369233495, + "grad_norm": 7.396939754486084, + "learning_rate": 8.073756138255344e-06, + "loss": 0.1539, + "step": 11712 + }, + { + "epoch": 0.2964040792570286, + "grad_norm": 2.938793659210205, + "learning_rate": 8.073439439057188e-06, + "loss": 0.1421, + "step": 11713 + }, + { + "epoch": 0.2964293848217223, + "grad_norm": 4.537446975708008, + "learning_rate": 8.073122720038992e-06, + "loss": 0.1958, + "step": 11714 + }, + { + "epoch": 0.296454690386416, + "grad_norm": 12.004964828491211, + "learning_rate": 8.072805981202802e-06, + "loss": 0.1976, + "step": 11715 + }, + { + "epoch": 0.29647999595110963, + "grad_norm": 10.731422424316406, + "learning_rate": 8.072489222550656e-06, + "loss": 0.2444, + "step": 11716 + }, + { + "epoch": 0.29650530151580334, + "grad_norm": 20.34027099609375, + "learning_rate": 8.072172444084599e-06, + "loss": 0.208, + "step": 11717 + }, + { + "epoch": 0.296530607080497, + "grad_norm": 5.243487358093262, + "learning_rate": 8.071855645806672e-06, + "loss": 0.1841, + "step": 11718 + }, + { + "epoch": 0.29655591264519066, + "grad_norm": 6.264786243438721, + "learning_rate": 8.071538827718922e-06, + "loss": 0.2219, + "step": 11719 + }, + { + "epoch": 0.29658121820988437, + "grad_norm": 4.552664756774902, + "learning_rate": 8.071221989823387e-06, + "loss": 0.164, + "step": 11720 + }, + { + "epoch": 0.296606523774578, + "grad_norm": 4.258606433868408, + "learning_rate": 8.070905132122113e-06, + "loss": 0.1226, + "step": 11721 + }, + { + "epoch": 0.2966318293392717, + "grad_norm": 4.271297931671143, + "learning_rate": 8.070588254617144e-06, + "loss": 0.1176, + "step": 11722 + }, + { + "epoch": 0.2966571349039654, + "grad_norm": 4.5435638427734375, + "learning_rate": 8.070271357310522e-06, + "loss": 0.2074, + "step": 11723 + }, + { + "epoch": 0.29668244046865905, + "grad_norm": 7.051817417144775, + "learning_rate": 8.069954440204291e-06, + "loss": 0.137, + "step": 11724 + }, + { + "epoch": 0.29670774603335276, + "grad_norm": 5.003664970397949, + "learning_rate": 8.069637503300495e-06, + "loss": 0.134, + "step": 11725 + }, + { + "epoch": 0.2967330515980464, + "grad_norm": 5.3420023918151855, + "learning_rate": 8.069320546601178e-06, + "loss": 0.1799, + "step": 11726 + }, + { + "epoch": 0.29675835716274007, + "grad_norm": 7.144336223602295, + "learning_rate": 8.069003570108383e-06, + "loss": 0.201, + "step": 11727 + }, + { + "epoch": 0.2967836627274338, + "grad_norm": 7.785359859466553, + "learning_rate": 8.068686573824155e-06, + "loss": 0.2835, + "step": 11728 + }, + { + "epoch": 0.29680896829212744, + "grad_norm": 4.81992769241333, + "learning_rate": 8.068369557750539e-06, + "loss": 0.2213, + "step": 11729 + }, + { + "epoch": 0.2968342738568211, + "grad_norm": 6.714131832122803, + "learning_rate": 8.06805252188958e-06, + "loss": 0.2248, + "step": 11730 + }, + { + "epoch": 0.2968595794215148, + "grad_norm": 7.088565349578857, + "learning_rate": 8.067735466243319e-06, + "loss": 0.2265, + "step": 11731 + }, + { + "epoch": 0.29688488498620846, + "grad_norm": 8.409042358398438, + "learning_rate": 8.067418390813801e-06, + "loss": 0.1661, + "step": 11732 + }, + { + "epoch": 0.2969101905509021, + "grad_norm": 28.889116287231445, + "learning_rate": 8.067101295603074e-06, + "loss": 0.1626, + "step": 11733 + }, + { + "epoch": 0.29693549611559583, + "grad_norm": 4.213003158569336, + "learning_rate": 8.066784180613183e-06, + "loss": 0.166, + "step": 11734 + }, + { + "epoch": 0.2969608016802895, + "grad_norm": 6.26441764831543, + "learning_rate": 8.06646704584617e-06, + "loss": 0.1533, + "step": 11735 + }, + { + "epoch": 0.29698610724498314, + "grad_norm": 4.110362529754639, + "learning_rate": 8.06614989130408e-06, + "loss": 0.1139, + "step": 11736 + }, + { + "epoch": 0.29701141280967686, + "grad_norm": 5.071873188018799, + "learning_rate": 8.06583271698896e-06, + "loss": 0.1207, + "step": 11737 + }, + { + "epoch": 0.2970367183743705, + "grad_norm": 5.152367115020752, + "learning_rate": 8.065515522902856e-06, + "loss": 0.2116, + "step": 11738 + }, + { + "epoch": 0.2970620239390642, + "grad_norm": 4.495487689971924, + "learning_rate": 8.065198309047812e-06, + "loss": 0.2731, + "step": 11739 + }, + { + "epoch": 0.2970873295037579, + "grad_norm": 7.775115966796875, + "learning_rate": 8.064881075425874e-06, + "loss": 0.2184, + "step": 11740 + }, + { + "epoch": 0.29711263506845154, + "grad_norm": 11.749404907226562, + "learning_rate": 8.064563822039088e-06, + "loss": 0.2524, + "step": 11741 + }, + { + "epoch": 0.29713794063314525, + "grad_norm": 5.247134208679199, + "learning_rate": 8.064246548889502e-06, + "loss": 0.1654, + "step": 11742 + }, + { + "epoch": 0.2971632461978389, + "grad_norm": 4.847548484802246, + "learning_rate": 8.063929255979159e-06, + "loss": 0.2397, + "step": 11743 + }, + { + "epoch": 0.29718855176253256, + "grad_norm": 6.9875640869140625, + "learning_rate": 8.063611943310106e-06, + "loss": 0.1792, + "step": 11744 + }, + { + "epoch": 0.29721385732722627, + "grad_norm": 2.9994208812713623, + "learning_rate": 8.06329461088439e-06, + "loss": 0.1055, + "step": 11745 + }, + { + "epoch": 0.29723916289191993, + "grad_norm": 12.407438278198242, + "learning_rate": 8.062977258704056e-06, + "loss": 0.3342, + "step": 11746 + }, + { + "epoch": 0.2972644684566136, + "grad_norm": 4.113555431365967, + "learning_rate": 8.062659886771152e-06, + "loss": 0.1897, + "step": 11747 + }, + { + "epoch": 0.2972897740213073, + "grad_norm": 3.3669352531433105, + "learning_rate": 8.062342495087723e-06, + "loss": 0.1999, + "step": 11748 + }, + { + "epoch": 0.29731507958600095, + "grad_norm": 4.375404357910156, + "learning_rate": 8.062025083655818e-06, + "loss": 0.1853, + "step": 11749 + }, + { + "epoch": 0.29734038515069466, + "grad_norm": 5.413273334503174, + "learning_rate": 8.061707652477482e-06, + "loss": 0.1971, + "step": 11750 + }, + { + "epoch": 0.2973656907153883, + "grad_norm": 7.737677574157715, + "learning_rate": 8.061390201554764e-06, + "loss": 0.2158, + "step": 11751 + }, + { + "epoch": 0.297390996280082, + "grad_norm": 11.339937210083008, + "learning_rate": 8.061072730889707e-06, + "loss": 0.2016, + "step": 11752 + }, + { + "epoch": 0.2974163018447757, + "grad_norm": 6.347370147705078, + "learning_rate": 8.060755240484366e-06, + "loss": 0.2869, + "step": 11753 + }, + { + "epoch": 0.29744160740946934, + "grad_norm": 11.408212661743164, + "learning_rate": 8.06043773034078e-06, + "loss": 0.2483, + "step": 11754 + }, + { + "epoch": 0.297466912974163, + "grad_norm": 11.743000030517578, + "learning_rate": 8.060120200461003e-06, + "loss": 0.2027, + "step": 11755 + }, + { + "epoch": 0.2974922185388567, + "grad_norm": 4.513606548309326, + "learning_rate": 8.059802650847078e-06, + "loss": 0.2006, + "step": 11756 + }, + { + "epoch": 0.29751752410355037, + "grad_norm": 4.016789436340332, + "learning_rate": 8.059485081501056e-06, + "loss": 0.144, + "step": 11757 + }, + { + "epoch": 0.297542829668244, + "grad_norm": 3.7953245639801025, + "learning_rate": 8.059167492424984e-06, + "loss": 0.1808, + "step": 11758 + }, + { + "epoch": 0.29756813523293774, + "grad_norm": 5.711161136627197, + "learning_rate": 8.058849883620909e-06, + "loss": 0.1866, + "step": 11759 + }, + { + "epoch": 0.2975934407976314, + "grad_norm": 6.364774703979492, + "learning_rate": 8.05853225509088e-06, + "loss": 0.1886, + "step": 11760 + }, + { + "epoch": 0.29761874636232505, + "grad_norm": 5.416629314422607, + "learning_rate": 8.058214606836947e-06, + "loss": 0.1518, + "step": 11761 + }, + { + "epoch": 0.29764405192701876, + "grad_norm": 3.460418939590454, + "learning_rate": 8.057896938861156e-06, + "loss": 0.1265, + "step": 11762 + }, + { + "epoch": 0.2976693574917124, + "grad_norm": 9.043815612792969, + "learning_rate": 8.057579251165557e-06, + "loss": 0.2252, + "step": 11763 + }, + { + "epoch": 0.29769466305640613, + "grad_norm": 4.71518087387085, + "learning_rate": 8.057261543752197e-06, + "loss": 0.2227, + "step": 11764 + }, + { + "epoch": 0.2977199686210998, + "grad_norm": 4.522095680236816, + "learning_rate": 8.056943816623126e-06, + "loss": 0.1465, + "step": 11765 + }, + { + "epoch": 0.29774527418579344, + "grad_norm": 5.141330242156982, + "learning_rate": 8.056626069780393e-06, + "loss": 0.2177, + "step": 11766 + }, + { + "epoch": 0.29777057975048715, + "grad_norm": 4.321918487548828, + "learning_rate": 8.056308303226048e-06, + "loss": 0.1382, + "step": 11767 + }, + { + "epoch": 0.2977958853151808, + "grad_norm": 6.803455352783203, + "learning_rate": 8.05599051696214e-06, + "loss": 0.2129, + "step": 11768 + }, + { + "epoch": 0.29782119087987446, + "grad_norm": 2.6024386882781982, + "learning_rate": 8.055672710990714e-06, + "loss": 0.1179, + "step": 11769 + }, + { + "epoch": 0.2978464964445682, + "grad_norm": 3.7518703937530518, + "learning_rate": 8.055354885313827e-06, + "loss": 0.0929, + "step": 11770 + }, + { + "epoch": 0.29787180200926183, + "grad_norm": 9.373302459716797, + "learning_rate": 8.055037039933522e-06, + "loss": 0.2436, + "step": 11771 + }, + { + "epoch": 0.2978971075739555, + "grad_norm": 5.158785343170166, + "learning_rate": 8.054719174851854e-06, + "loss": 0.1999, + "step": 11772 + }, + { + "epoch": 0.2979224131386492, + "grad_norm": 6.821712017059326, + "learning_rate": 8.054401290070868e-06, + "loss": 0.2413, + "step": 11773 + }, + { + "epoch": 0.29794771870334286, + "grad_norm": 6.901726722717285, + "learning_rate": 8.054083385592616e-06, + "loss": 0.2332, + "step": 11774 + }, + { + "epoch": 0.29797302426803657, + "grad_norm": 4.955403804779053, + "learning_rate": 8.053765461419152e-06, + "loss": 0.1332, + "step": 11775 + }, + { + "epoch": 0.2979983298327302, + "grad_norm": 8.148571968078613, + "learning_rate": 8.053447517552518e-06, + "loss": 0.3148, + "step": 11776 + }, + { + "epoch": 0.2980236353974239, + "grad_norm": 17.783981323242188, + "learning_rate": 8.053129553994772e-06, + "loss": 0.2616, + "step": 11777 + }, + { + "epoch": 0.2980489409621176, + "grad_norm": 3.832481622695923, + "learning_rate": 8.052811570747961e-06, + "loss": 0.2043, + "step": 11778 + }, + { + "epoch": 0.29807424652681125, + "grad_norm": 5.082639694213867, + "learning_rate": 8.052493567814136e-06, + "loss": 0.1879, + "step": 11779 + }, + { + "epoch": 0.2980995520915049, + "grad_norm": 8.01483154296875, + "learning_rate": 8.052175545195346e-06, + "loss": 0.2449, + "step": 11780 + }, + { + "epoch": 0.2981248576561986, + "grad_norm": 4.273539066314697, + "learning_rate": 8.051857502893644e-06, + "loss": 0.2471, + "step": 11781 + }, + { + "epoch": 0.2981501632208923, + "grad_norm": 4.858164310455322, + "learning_rate": 8.051539440911082e-06, + "loss": 0.1728, + "step": 11782 + }, + { + "epoch": 0.29817546878558593, + "grad_norm": 3.2660229206085205, + "learning_rate": 8.05122135924971e-06, + "loss": 0.2146, + "step": 11783 + }, + { + "epoch": 0.29820077435027964, + "grad_norm": 3.52006459236145, + "learning_rate": 8.050903257911577e-06, + "loss": 0.1394, + "step": 11784 + }, + { + "epoch": 0.2982260799149733, + "grad_norm": 6.163203716278076, + "learning_rate": 8.050585136898738e-06, + "loss": 0.2762, + "step": 11785 + }, + { + "epoch": 0.29825138547966695, + "grad_norm": 2.893882989883423, + "learning_rate": 8.050266996213242e-06, + "loss": 0.1095, + "step": 11786 + }, + { + "epoch": 0.29827669104436066, + "grad_norm": 4.8072075843811035, + "learning_rate": 8.049948835857142e-06, + "loss": 0.1702, + "step": 11787 + }, + { + "epoch": 0.2983019966090543, + "grad_norm": 5.143196105957031, + "learning_rate": 8.04963065583249e-06, + "loss": 0.2234, + "step": 11788 + }, + { + "epoch": 0.29832730217374803, + "grad_norm": 8.915507316589355, + "learning_rate": 8.049312456141337e-06, + "loss": 0.1404, + "step": 11789 + }, + { + "epoch": 0.2983526077384417, + "grad_norm": 6.501884460449219, + "learning_rate": 8.048994236785735e-06, + "loss": 0.1703, + "step": 11790 + }, + { + "epoch": 0.29837791330313534, + "grad_norm": 5.8854289054870605, + "learning_rate": 8.048675997767737e-06, + "loss": 0.2066, + "step": 11791 + }, + { + "epoch": 0.29840321886782906, + "grad_norm": 4.529021263122559, + "learning_rate": 8.048357739089392e-06, + "loss": 0.1945, + "step": 11792 + }, + { + "epoch": 0.2984285244325227, + "grad_norm": 3.33062481880188, + "learning_rate": 8.048039460752757e-06, + "loss": 0.1884, + "step": 11793 + }, + { + "epoch": 0.29845382999721637, + "grad_norm": 8.27435302734375, + "learning_rate": 8.047721162759883e-06, + "loss": 0.2021, + "step": 11794 + }, + { + "epoch": 0.2984791355619101, + "grad_norm": 7.425567150115967, + "learning_rate": 8.04740284511282e-06, + "loss": 0.212, + "step": 11795 + }, + { + "epoch": 0.29850444112660374, + "grad_norm": 3.3669612407684326, + "learning_rate": 8.047084507813625e-06, + "loss": 0.1132, + "step": 11796 + }, + { + "epoch": 0.2985297466912974, + "grad_norm": 2.6543078422546387, + "learning_rate": 8.04676615086435e-06, + "loss": 0.0897, + "step": 11797 + }, + { + "epoch": 0.2985550522559911, + "grad_norm": 4.562434673309326, + "learning_rate": 8.046447774267044e-06, + "loss": 0.1963, + "step": 11798 + }, + { + "epoch": 0.29858035782068476, + "grad_norm": 3.7903971672058105, + "learning_rate": 8.046129378023764e-06, + "loss": 0.1334, + "step": 11799 + }, + { + "epoch": 0.2986056633853784, + "grad_norm": 3.772700548171997, + "learning_rate": 8.04581096213656e-06, + "loss": 0.2094, + "step": 11800 + }, + { + "epoch": 0.29863096895007213, + "grad_norm": 4.478537559509277, + "learning_rate": 8.04549252660749e-06, + "loss": 0.2321, + "step": 11801 + }, + { + "epoch": 0.2986562745147658, + "grad_norm": 5.2227277755737305, + "learning_rate": 8.045174071438606e-06, + "loss": 0.1916, + "step": 11802 + }, + { + "epoch": 0.2986815800794595, + "grad_norm": 3.456967830657959, + "learning_rate": 8.04485559663196e-06, + "loss": 0.1828, + "step": 11803 + }, + { + "epoch": 0.29870688564415315, + "grad_norm": 4.129673957824707, + "learning_rate": 8.044537102189607e-06, + "loss": 0.2055, + "step": 11804 + }, + { + "epoch": 0.2987321912088468, + "grad_norm": 51.177696228027344, + "learning_rate": 8.044218588113599e-06, + "loss": 0.3193, + "step": 11805 + }, + { + "epoch": 0.2987574967735405, + "grad_norm": 4.045809745788574, + "learning_rate": 8.043900054405993e-06, + "loss": 0.1407, + "step": 11806 + }, + { + "epoch": 0.2987828023382342, + "grad_norm": 6.365997791290283, + "learning_rate": 8.043581501068842e-06, + "loss": 0.2244, + "step": 11807 + }, + { + "epoch": 0.29880810790292783, + "grad_norm": 6.184141159057617, + "learning_rate": 8.043262928104198e-06, + "loss": 0.2666, + "step": 11808 + }, + { + "epoch": 0.29883341346762154, + "grad_norm": 9.307280540466309, + "learning_rate": 8.04294433551412e-06, + "loss": 0.162, + "step": 11809 + }, + { + "epoch": 0.2988587190323152, + "grad_norm": 3.7227118015289307, + "learning_rate": 8.04262572330066e-06, + "loss": 0.1502, + "step": 11810 + }, + { + "epoch": 0.29888402459700886, + "grad_norm": 4.709445476531982, + "learning_rate": 8.04230709146587e-06, + "loss": 0.1201, + "step": 11811 + }, + { + "epoch": 0.29890933016170257, + "grad_norm": 4.872000694274902, + "learning_rate": 8.04198844001181e-06, + "loss": 0.1905, + "step": 11812 + }, + { + "epoch": 0.2989346357263962, + "grad_norm": 7.963230133056641, + "learning_rate": 8.041669768940531e-06, + "loss": 0.1943, + "step": 11813 + }, + { + "epoch": 0.29895994129108994, + "grad_norm": 7.349377155303955, + "learning_rate": 8.04135107825409e-06, + "loss": 0.1884, + "step": 11814 + }, + { + "epoch": 0.2989852468557836, + "grad_norm": 3.7129480838775635, + "learning_rate": 8.041032367954542e-06, + "loss": 0.1851, + "step": 11815 + }, + { + "epoch": 0.29901055242047725, + "grad_norm": 5.297959327697754, + "learning_rate": 8.04071363804394e-06, + "loss": 0.1544, + "step": 11816 + }, + { + "epoch": 0.29903585798517096, + "grad_norm": 10.564549446105957, + "learning_rate": 8.040394888524345e-06, + "loss": 0.1989, + "step": 11817 + }, + { + "epoch": 0.2990611635498646, + "grad_norm": 5.700638294219971, + "learning_rate": 8.040076119397805e-06, + "loss": 0.2172, + "step": 11818 + }, + { + "epoch": 0.2990864691145583, + "grad_norm": 3.3601553440093994, + "learning_rate": 8.039757330666382e-06, + "loss": 0.1243, + "step": 11819 + }, + { + "epoch": 0.299111774679252, + "grad_norm": 10.553484916687012, + "learning_rate": 8.039438522332129e-06, + "loss": 0.2802, + "step": 11820 + }, + { + "epoch": 0.29913708024394564, + "grad_norm": 3.728782892227173, + "learning_rate": 8.039119694397101e-06, + "loss": 0.1301, + "step": 11821 + }, + { + "epoch": 0.2991623858086393, + "grad_norm": 4.630340099334717, + "learning_rate": 8.038800846863358e-06, + "loss": 0.2061, + "step": 11822 + }, + { + "epoch": 0.299187691373333, + "grad_norm": 4.625114440917969, + "learning_rate": 8.038481979732953e-06, + "loss": 0.2108, + "step": 11823 + }, + { + "epoch": 0.29921299693802667, + "grad_norm": 7.62315559387207, + "learning_rate": 8.038163093007942e-06, + "loss": 0.2216, + "step": 11824 + }, + { + "epoch": 0.2992383025027203, + "grad_norm": 4.504459381103516, + "learning_rate": 8.037844186690383e-06, + "loss": 0.2074, + "step": 11825 + }, + { + "epoch": 0.29926360806741403, + "grad_norm": 3.605356216430664, + "learning_rate": 8.037525260782331e-06, + "loss": 0.1466, + "step": 11826 + }, + { + "epoch": 0.2992889136321077, + "grad_norm": 5.648887634277344, + "learning_rate": 8.037206315285842e-06, + "loss": 0.1315, + "step": 11827 + }, + { + "epoch": 0.2993142191968014, + "grad_norm": 8.257638931274414, + "learning_rate": 8.036887350202977e-06, + "loss": 0.1523, + "step": 11828 + }, + { + "epoch": 0.29933952476149506, + "grad_norm": 4.096458911895752, + "learning_rate": 8.03656836553579e-06, + "loss": 0.1882, + "step": 11829 + }, + { + "epoch": 0.2993648303261887, + "grad_norm": 7.551666259765625, + "learning_rate": 8.036249361286339e-06, + "loss": 0.2377, + "step": 11830 + }, + { + "epoch": 0.2993901358908824, + "grad_norm": 11.796234130859375, + "learning_rate": 8.035930337456678e-06, + "loss": 0.1934, + "step": 11831 + }, + { + "epoch": 0.2994154414555761, + "grad_norm": 7.320418834686279, + "learning_rate": 8.03561129404887e-06, + "loss": 0.2048, + "step": 11832 + }, + { + "epoch": 0.29944074702026974, + "grad_norm": 13.044047355651855, + "learning_rate": 8.035292231064968e-06, + "loss": 0.1893, + "step": 11833 + }, + { + "epoch": 0.29946605258496345, + "grad_norm": 5.580687046051025, + "learning_rate": 8.034973148507032e-06, + "loss": 0.2183, + "step": 11834 + }, + { + "epoch": 0.2994913581496571, + "grad_norm": 8.148185729980469, + "learning_rate": 8.034654046377118e-06, + "loss": 0.17, + "step": 11835 + }, + { + "epoch": 0.29951666371435076, + "grad_norm": 3.9135053157806396, + "learning_rate": 8.034334924677285e-06, + "loss": 0.1789, + "step": 11836 + }, + { + "epoch": 0.2995419692790445, + "grad_norm": 25.216331481933594, + "learning_rate": 8.03401578340959e-06, + "loss": 0.2799, + "step": 11837 + }, + { + "epoch": 0.29956727484373813, + "grad_norm": 2.5976054668426514, + "learning_rate": 8.033696622576092e-06, + "loss": 0.0983, + "step": 11838 + }, + { + "epoch": 0.29959258040843184, + "grad_norm": 3.718010902404785, + "learning_rate": 8.03337744217885e-06, + "loss": 0.1449, + "step": 11839 + }, + { + "epoch": 0.2996178859731255, + "grad_norm": 9.24990463256836, + "learning_rate": 8.033058242219918e-06, + "loss": 0.306, + "step": 11840 + }, + { + "epoch": 0.29964319153781915, + "grad_norm": 9.215478897094727, + "learning_rate": 8.032739022701359e-06, + "loss": 0.2694, + "step": 11841 + }, + { + "epoch": 0.29966849710251287, + "grad_norm": 4.574280738830566, + "learning_rate": 8.032419783625231e-06, + "loss": 0.227, + "step": 11842 + }, + { + "epoch": 0.2996938026672065, + "grad_norm": 4.404741287231445, + "learning_rate": 8.032100524993591e-06, + "loss": 0.1852, + "step": 11843 + }, + { + "epoch": 0.2997191082319002, + "grad_norm": 6.1229071617126465, + "learning_rate": 8.031781246808499e-06, + "loss": 0.2647, + "step": 11844 + }, + { + "epoch": 0.2997444137965939, + "grad_norm": 7.265507698059082, + "learning_rate": 8.031461949072014e-06, + "loss": 0.1732, + "step": 11845 + }, + { + "epoch": 0.29976971936128755, + "grad_norm": 4.175209999084473, + "learning_rate": 8.031142631786193e-06, + "loss": 0.1403, + "step": 11846 + }, + { + "epoch": 0.2997950249259812, + "grad_norm": 12.573667526245117, + "learning_rate": 8.0308232949531e-06, + "loss": 0.159, + "step": 11847 + }, + { + "epoch": 0.2998203304906749, + "grad_norm": 2.618347406387329, + "learning_rate": 8.03050393857479e-06, + "loss": 0.149, + "step": 11848 + }, + { + "epoch": 0.29984563605536857, + "grad_norm": 5.253021240234375, + "learning_rate": 8.030184562653322e-06, + "loss": 0.1548, + "step": 11849 + }, + { + "epoch": 0.2998709416200622, + "grad_norm": 5.301075458526611, + "learning_rate": 8.02986516719076e-06, + "loss": 0.2423, + "step": 11850 + }, + { + "epoch": 0.29989624718475594, + "grad_norm": 9.086211204528809, + "learning_rate": 8.02954575218916e-06, + "loss": 0.1738, + "step": 11851 + }, + { + "epoch": 0.2999215527494496, + "grad_norm": 8.721212387084961, + "learning_rate": 8.029226317650583e-06, + "loss": 0.132, + "step": 11852 + }, + { + "epoch": 0.2999468583141433, + "grad_norm": 7.059730529785156, + "learning_rate": 8.02890686357709e-06, + "loss": 0.1303, + "step": 11853 + }, + { + "epoch": 0.29997216387883696, + "grad_norm": 3.8988537788391113, + "learning_rate": 8.028587389970739e-06, + "loss": 0.1438, + "step": 11854 + }, + { + "epoch": 0.2999974694435306, + "grad_norm": 5.671333312988281, + "learning_rate": 8.02826789683359e-06, + "loss": 0.1508, + "step": 11855 + }, + { + "epoch": 0.30002277500822433, + "grad_norm": 4.099856853485107, + "learning_rate": 8.027948384167708e-06, + "loss": 0.1519, + "step": 11856 + }, + { + "epoch": 0.30002277500822433, + "eval_loss": 0.20302896201610565, + "eval_runtime": 69.8332, + "eval_samples_per_second": 45.738, + "eval_steps_per_second": 5.728, + "step": 11856 + }, + { + "epoch": 0.300048080572918, + "grad_norm": 7.044040679931641, + "learning_rate": 8.027628851975146e-06, + "loss": 0.1812, + "step": 11857 + }, + { + "epoch": 0.30007338613761164, + "grad_norm": 3.4302351474761963, + "learning_rate": 8.027309300257972e-06, + "loss": 0.1421, + "step": 11858 + }, + { + "epoch": 0.30009869170230535, + "grad_norm": 9.370220184326172, + "learning_rate": 8.026989729018242e-06, + "loss": 0.198, + "step": 11859 + }, + { + "epoch": 0.300123997266999, + "grad_norm": 5.442381858825684, + "learning_rate": 8.026670138258019e-06, + "loss": 0.1658, + "step": 11860 + }, + { + "epoch": 0.30014930283169267, + "grad_norm": 4.275566101074219, + "learning_rate": 8.026350527979363e-06, + "loss": 0.1769, + "step": 11861 + }, + { + "epoch": 0.3001746083963864, + "grad_norm": 6.135824680328369, + "learning_rate": 8.026030898184336e-06, + "loss": 0.1762, + "step": 11862 + }, + { + "epoch": 0.30019991396108003, + "grad_norm": 9.056774139404297, + "learning_rate": 8.025711248874997e-06, + "loss": 0.275, + "step": 11863 + }, + { + "epoch": 0.3002252195257737, + "grad_norm": 7.444146633148193, + "learning_rate": 8.02539158005341e-06, + "loss": 0.2494, + "step": 11864 + }, + { + "epoch": 0.3002505250904674, + "grad_norm": 5.889921188354492, + "learning_rate": 8.025071891721636e-06, + "loss": 0.1649, + "step": 11865 + }, + { + "epoch": 0.30027583065516106, + "grad_norm": 12.403197288513184, + "learning_rate": 8.024752183881736e-06, + "loss": 0.268, + "step": 11866 + }, + { + "epoch": 0.30030113621985477, + "grad_norm": 4.977078437805176, + "learning_rate": 8.024432456535771e-06, + "loss": 0.2112, + "step": 11867 + }, + { + "epoch": 0.3003264417845484, + "grad_norm": 7.29057502746582, + "learning_rate": 8.024112709685804e-06, + "loss": 0.2512, + "step": 11868 + }, + { + "epoch": 0.3003517473492421, + "grad_norm": 7.112475395202637, + "learning_rate": 8.023792943333897e-06, + "loss": 0.2258, + "step": 11869 + }, + { + "epoch": 0.3003770529139358, + "grad_norm": 4.931941509246826, + "learning_rate": 8.023473157482112e-06, + "loss": 0.1829, + "step": 11870 + }, + { + "epoch": 0.30040235847862945, + "grad_norm": 5.78624963760376, + "learning_rate": 8.023153352132509e-06, + "loss": 0.175, + "step": 11871 + }, + { + "epoch": 0.3004276640433231, + "grad_norm": 2.8749687671661377, + "learning_rate": 8.022833527287155e-06, + "loss": 0.0796, + "step": 11872 + }, + { + "epoch": 0.3004529696080168, + "grad_norm": 18.57029151916504, + "learning_rate": 8.022513682948107e-06, + "loss": 0.2624, + "step": 11873 + }, + { + "epoch": 0.3004782751727105, + "grad_norm": 5.2755513191223145, + "learning_rate": 8.022193819117433e-06, + "loss": 0.2067, + "step": 11874 + }, + { + "epoch": 0.30050358073740413, + "grad_norm": 4.677342891693115, + "learning_rate": 8.021873935797192e-06, + "loss": 0.192, + "step": 11875 + }, + { + "epoch": 0.30052888630209784, + "grad_norm": 7.854928970336914, + "learning_rate": 8.021554032989449e-06, + "loss": 0.1075, + "step": 11876 + }, + { + "epoch": 0.3005541918667915, + "grad_norm": 6.737752437591553, + "learning_rate": 8.021234110696266e-06, + "loss": 0.1462, + "step": 11877 + }, + { + "epoch": 0.3005794974314852, + "grad_norm": 5.788865566253662, + "learning_rate": 8.020914168919706e-06, + "loss": 0.2442, + "step": 11878 + }, + { + "epoch": 0.30060480299617887, + "grad_norm": 5.219752788543701, + "learning_rate": 8.020594207661832e-06, + "loss": 0.1222, + "step": 11879 + }, + { + "epoch": 0.3006301085608725, + "grad_norm": 8.663932800292969, + "learning_rate": 8.020274226924706e-06, + "loss": 0.2133, + "step": 11880 + }, + { + "epoch": 0.30065541412556623, + "grad_norm": 5.944777488708496, + "learning_rate": 8.019954226710398e-06, + "loss": 0.1836, + "step": 11881 + }, + { + "epoch": 0.3006807196902599, + "grad_norm": 3.7638332843780518, + "learning_rate": 8.019634207020963e-06, + "loss": 0.1978, + "step": 11882 + }, + { + "epoch": 0.30070602525495355, + "grad_norm": 3.8601667881011963, + "learning_rate": 8.01931416785847e-06, + "loss": 0.1336, + "step": 11883 + }, + { + "epoch": 0.30073133081964726, + "grad_norm": 5.296444892883301, + "learning_rate": 8.018994109224982e-06, + "loss": 0.1572, + "step": 11884 + }, + { + "epoch": 0.3007566363843409, + "grad_norm": 7.480373859405518, + "learning_rate": 8.018674031122563e-06, + "loss": 0.1588, + "step": 11885 + }, + { + "epoch": 0.30078194194903457, + "grad_norm": 3.818444013595581, + "learning_rate": 8.018353933553276e-06, + "loss": 0.1951, + "step": 11886 + }, + { + "epoch": 0.3008072475137283, + "grad_norm": 4.193634033203125, + "learning_rate": 8.018033816519185e-06, + "loss": 0.1272, + "step": 11887 + }, + { + "epoch": 0.30083255307842194, + "grad_norm": 3.8019285202026367, + "learning_rate": 8.017713680022357e-06, + "loss": 0.1686, + "step": 11888 + }, + { + "epoch": 0.3008578586431156, + "grad_norm": 3.3962531089782715, + "learning_rate": 8.017393524064854e-06, + "loss": 0.1402, + "step": 11889 + }, + { + "epoch": 0.3008831642078093, + "grad_norm": 3.543130874633789, + "learning_rate": 8.01707334864874e-06, + "loss": 0.1145, + "step": 11890 + }, + { + "epoch": 0.30090846977250296, + "grad_norm": 10.313347816467285, + "learning_rate": 8.016753153776083e-06, + "loss": 0.1915, + "step": 11891 + }, + { + "epoch": 0.3009337753371967, + "grad_norm": 5.05756950378418, + "learning_rate": 8.016432939448945e-06, + "loss": 0.2599, + "step": 11892 + }, + { + "epoch": 0.30095908090189033, + "grad_norm": 7.650054931640625, + "learning_rate": 8.016112705669395e-06, + "loss": 0.1943, + "step": 11893 + }, + { + "epoch": 0.300984386466584, + "grad_norm": 11.011927604675293, + "learning_rate": 8.015792452439491e-06, + "loss": 0.2638, + "step": 11894 + }, + { + "epoch": 0.3010096920312777, + "grad_norm": 8.44832706451416, + "learning_rate": 8.015472179761306e-06, + "loss": 0.258, + "step": 11895 + }, + { + "epoch": 0.30103499759597135, + "grad_norm": 2.856278419494629, + "learning_rate": 8.0151518876369e-06, + "loss": 0.1349, + "step": 11896 + }, + { + "epoch": 0.301060303160665, + "grad_norm": 3.727907180786133, + "learning_rate": 8.01483157606834e-06, + "loss": 0.1534, + "step": 11897 + }, + { + "epoch": 0.3010856087253587, + "grad_norm": 4.028025150299072, + "learning_rate": 8.014511245057692e-06, + "loss": 0.1641, + "step": 11898 + }, + { + "epoch": 0.3011109142900524, + "grad_norm": 6.620236873626709, + "learning_rate": 8.014190894607023e-06, + "loss": 0.1526, + "step": 11899 + }, + { + "epoch": 0.30113621985474603, + "grad_norm": 5.361276626586914, + "learning_rate": 8.013870524718397e-06, + "loss": 0.2061, + "step": 11900 + }, + { + "epoch": 0.30116152541943975, + "grad_norm": 3.7450339794158936, + "learning_rate": 8.013550135393879e-06, + "loss": 0.1439, + "step": 11901 + }, + { + "epoch": 0.3011868309841334, + "grad_norm": 44.92051315307617, + "learning_rate": 8.01322972663554e-06, + "loss": 0.1313, + "step": 11902 + }, + { + "epoch": 0.3012121365488271, + "grad_norm": 4.234721660614014, + "learning_rate": 8.01290929844544e-06, + "loss": 0.1947, + "step": 11903 + }, + { + "epoch": 0.30123744211352077, + "grad_norm": 8.274702072143555, + "learning_rate": 8.012588850825648e-06, + "loss": 0.175, + "step": 11904 + }, + { + "epoch": 0.3012627476782144, + "grad_norm": 8.515840530395508, + "learning_rate": 8.012268383778235e-06, + "loss": 0.0781, + "step": 11905 + }, + { + "epoch": 0.30128805324290814, + "grad_norm": 8.612916946411133, + "learning_rate": 8.01194789730526e-06, + "loss": 0.2034, + "step": 11906 + }, + { + "epoch": 0.3013133588076018, + "grad_norm": 8.463837623596191, + "learning_rate": 8.011627391408795e-06, + "loss": 0.2064, + "step": 11907 + }, + { + "epoch": 0.30133866437229545, + "grad_norm": 2.720226526260376, + "learning_rate": 8.011306866090904e-06, + "loss": 0.1006, + "step": 11908 + }, + { + "epoch": 0.30136396993698916, + "grad_norm": 8.30967903137207, + "learning_rate": 8.010986321353655e-06, + "loss": 0.236, + "step": 11909 + }, + { + "epoch": 0.3013892755016828, + "grad_norm": 4.776269912719727, + "learning_rate": 8.010665757199117e-06, + "loss": 0.2399, + "step": 11910 + }, + { + "epoch": 0.3014145810663765, + "grad_norm": 5.3224945068359375, + "learning_rate": 8.010345173629355e-06, + "loss": 0.1589, + "step": 11911 + }, + { + "epoch": 0.3014398866310702, + "grad_norm": 14.496912956237793, + "learning_rate": 8.010024570646436e-06, + "loss": 0.2313, + "step": 11912 + }, + { + "epoch": 0.30146519219576384, + "grad_norm": 5.408419132232666, + "learning_rate": 8.009703948252428e-06, + "loss": 0.1953, + "step": 11913 + }, + { + "epoch": 0.3014904977604575, + "grad_norm": 2.8486201763153076, + "learning_rate": 8.0093833064494e-06, + "loss": 0.1609, + "step": 11914 + }, + { + "epoch": 0.3015158033251512, + "grad_norm": 6.190127849578857, + "learning_rate": 8.009062645239418e-06, + "loss": 0.1797, + "step": 11915 + }, + { + "epoch": 0.30154110888984487, + "grad_norm": 4.3768510818481445, + "learning_rate": 8.008741964624553e-06, + "loss": 0.1785, + "step": 11916 + }, + { + "epoch": 0.3015664144545386, + "grad_norm": 6.369144439697266, + "learning_rate": 8.008421264606869e-06, + "loss": 0.2094, + "step": 11917 + }, + { + "epoch": 0.30159172001923223, + "grad_norm": 3.2488017082214355, + "learning_rate": 8.008100545188437e-06, + "loss": 0.148, + "step": 11918 + }, + { + "epoch": 0.3016170255839259, + "grad_norm": 4.258906364440918, + "learning_rate": 8.007779806371321e-06, + "loss": 0.1912, + "step": 11919 + }, + { + "epoch": 0.3016423311486196, + "grad_norm": 3.2861194610595703, + "learning_rate": 8.007459048157596e-06, + "loss": 0.1903, + "step": 11920 + }, + { + "epoch": 0.30166763671331326, + "grad_norm": 4.541302680969238, + "learning_rate": 8.007138270549326e-06, + "loss": 0.1778, + "step": 11921 + }, + { + "epoch": 0.3016929422780069, + "grad_norm": 13.892099380493164, + "learning_rate": 8.006817473548581e-06, + "loss": 0.2648, + "step": 11922 + }, + { + "epoch": 0.3017182478427006, + "grad_norm": 3.944925308227539, + "learning_rate": 8.006496657157428e-06, + "loss": 0.18, + "step": 11923 + }, + { + "epoch": 0.3017435534073943, + "grad_norm": 5.313460826873779, + "learning_rate": 8.006175821377938e-06, + "loss": 0.1817, + "step": 11924 + }, + { + "epoch": 0.30176885897208794, + "grad_norm": 9.097987174987793, + "learning_rate": 8.00585496621218e-06, + "loss": 0.1961, + "step": 11925 + }, + { + "epoch": 0.30179416453678165, + "grad_norm": 8.90963077545166, + "learning_rate": 8.005534091662222e-06, + "loss": 0.1674, + "step": 11926 + }, + { + "epoch": 0.3018194701014753, + "grad_norm": 7.636899471282959, + "learning_rate": 8.005213197730134e-06, + "loss": 0.2067, + "step": 11927 + }, + { + "epoch": 0.30184477566616896, + "grad_norm": 6.610861778259277, + "learning_rate": 8.004892284417985e-06, + "loss": 0.1362, + "step": 11928 + }, + { + "epoch": 0.3018700812308627, + "grad_norm": 5.952230453491211, + "learning_rate": 8.004571351727843e-06, + "loss": 0.248, + "step": 11929 + }, + { + "epoch": 0.30189538679555633, + "grad_norm": 8.326805114746094, + "learning_rate": 8.004250399661782e-06, + "loss": 0.1229, + "step": 11930 + }, + { + "epoch": 0.30192069236025004, + "grad_norm": 8.274858474731445, + "learning_rate": 8.003929428221868e-06, + "loss": 0.3254, + "step": 11931 + }, + { + "epoch": 0.3019459979249437, + "grad_norm": 4.527010440826416, + "learning_rate": 8.003608437410171e-06, + "loss": 0.164, + "step": 11932 + }, + { + "epoch": 0.30197130348963735, + "grad_norm": 5.9610090255737305, + "learning_rate": 8.003287427228761e-06, + "loss": 0.1481, + "step": 11933 + }, + { + "epoch": 0.30199660905433107, + "grad_norm": 12.33957290649414, + "learning_rate": 8.002966397679712e-06, + "loss": 0.2052, + "step": 11934 + }, + { + "epoch": 0.3020219146190247, + "grad_norm": 3.2458105087280273, + "learning_rate": 8.00264534876509e-06, + "loss": 0.1579, + "step": 11935 + }, + { + "epoch": 0.3020472201837184, + "grad_norm": 4.6680755615234375, + "learning_rate": 8.002324280486966e-06, + "loss": 0.1802, + "step": 11936 + }, + { + "epoch": 0.3020725257484121, + "grad_norm": 3.4652836322784424, + "learning_rate": 8.002003192847412e-06, + "loss": 0.1621, + "step": 11937 + }, + { + "epoch": 0.30209783131310575, + "grad_norm": 16.906869888305664, + "learning_rate": 8.001682085848496e-06, + "loss": 0.3087, + "step": 11938 + }, + { + "epoch": 0.3021231368777994, + "grad_norm": 3.147728681564331, + "learning_rate": 8.001360959492292e-06, + "loss": 0.1055, + "step": 11939 + }, + { + "epoch": 0.3021484424424931, + "grad_norm": 4.083227634429932, + "learning_rate": 8.001039813780872e-06, + "loss": 0.1052, + "step": 11940 + }, + { + "epoch": 0.30217374800718677, + "grad_norm": 5.523993968963623, + "learning_rate": 8.0007186487163e-06, + "loss": 0.1507, + "step": 11941 + }, + { + "epoch": 0.3021990535718805, + "grad_norm": 4.345670700073242, + "learning_rate": 8.000397464300655e-06, + "loss": 0.1414, + "step": 11942 + }, + { + "epoch": 0.30222435913657414, + "grad_norm": 5.147322177886963, + "learning_rate": 8.000076260536002e-06, + "loss": 0.1405, + "step": 11943 + }, + { + "epoch": 0.3022496647012678, + "grad_norm": 7.959371089935303, + "learning_rate": 7.999755037424417e-06, + "loss": 0.3771, + "step": 11944 + }, + { + "epoch": 0.3022749702659615, + "grad_norm": 13.338513374328613, + "learning_rate": 7.99943379496797e-06, + "loss": 0.2643, + "step": 11945 + }, + { + "epoch": 0.30230027583065516, + "grad_norm": 3.047257661819458, + "learning_rate": 7.999112533168731e-06, + "loss": 0.0951, + "step": 11946 + }, + { + "epoch": 0.3023255813953488, + "grad_norm": 4.175169944763184, + "learning_rate": 7.998791252028774e-06, + "loss": 0.104, + "step": 11947 + }, + { + "epoch": 0.30235088696004253, + "grad_norm": 4.6024699211120605, + "learning_rate": 7.998469951550169e-06, + "loss": 0.135, + "step": 11948 + }, + { + "epoch": 0.3023761925247362, + "grad_norm": 5.5665364265441895, + "learning_rate": 7.998148631734992e-06, + "loss": 0.1329, + "step": 11949 + }, + { + "epoch": 0.30240149808942984, + "grad_norm": 6.118223667144775, + "learning_rate": 7.99782729258531e-06, + "loss": 0.208, + "step": 11950 + }, + { + "epoch": 0.30242680365412355, + "grad_norm": 5.069662570953369, + "learning_rate": 7.997505934103197e-06, + "loss": 0.2345, + "step": 11951 + }, + { + "epoch": 0.3024521092188172, + "grad_norm": 6.516354560852051, + "learning_rate": 7.997184556290728e-06, + "loss": 0.1581, + "step": 11952 + }, + { + "epoch": 0.30247741478351087, + "grad_norm": 12.374565124511719, + "learning_rate": 7.996863159149971e-06, + "loss": 0.1745, + "step": 11953 + }, + { + "epoch": 0.3025027203482046, + "grad_norm": 7.015350341796875, + "learning_rate": 7.996541742683002e-06, + "loss": 0.1425, + "step": 11954 + }, + { + "epoch": 0.30252802591289824, + "grad_norm": 7.370038032531738, + "learning_rate": 7.996220306891891e-06, + "loss": 0.2083, + "step": 11955 + }, + { + "epoch": 0.30255333147759195, + "grad_norm": 21.39289093017578, + "learning_rate": 7.995898851778716e-06, + "loss": 0.3811, + "step": 11956 + }, + { + "epoch": 0.3025786370422856, + "grad_norm": 7.1666579246521, + "learning_rate": 7.995577377345543e-06, + "loss": 0.1699, + "step": 11957 + }, + { + "epoch": 0.30260394260697926, + "grad_norm": 7.427956581115723, + "learning_rate": 7.995255883594452e-06, + "loss": 0.263, + "step": 11958 + }, + { + "epoch": 0.30262924817167297, + "grad_norm": 21.110137939453125, + "learning_rate": 7.99493437052751e-06, + "loss": 0.3374, + "step": 11959 + }, + { + "epoch": 0.3026545537363666, + "grad_norm": 5.177831649780273, + "learning_rate": 7.994612838146797e-06, + "loss": 0.1353, + "step": 11960 + }, + { + "epoch": 0.3026798593010603, + "grad_norm": 3.9948554039001465, + "learning_rate": 7.994291286454379e-06, + "loss": 0.1245, + "step": 11961 + }, + { + "epoch": 0.302705164865754, + "grad_norm": 10.397147178649902, + "learning_rate": 7.993969715452335e-06, + "loss": 0.1944, + "step": 11962 + }, + { + "epoch": 0.30273047043044765, + "grad_norm": 4.383563041687012, + "learning_rate": 7.993648125142737e-06, + "loss": 0.1308, + "step": 11963 + }, + { + "epoch": 0.3027557759951413, + "grad_norm": 7.261713027954102, + "learning_rate": 7.99332651552766e-06, + "loss": 0.1286, + "step": 11964 + }, + { + "epoch": 0.302781081559835, + "grad_norm": 6.20150899887085, + "learning_rate": 7.993004886609176e-06, + "loss": 0.1848, + "step": 11965 + }, + { + "epoch": 0.3028063871245287, + "grad_norm": 4.2565999031066895, + "learning_rate": 7.99268323838936e-06, + "loss": 0.1168, + "step": 11966 + }, + { + "epoch": 0.3028316926892224, + "grad_norm": 7.549849033355713, + "learning_rate": 7.992361570870289e-06, + "loss": 0.2293, + "step": 11967 + }, + { + "epoch": 0.30285699825391604, + "grad_norm": 9.394305229187012, + "learning_rate": 7.99203988405403e-06, + "loss": 0.2223, + "step": 11968 + }, + { + "epoch": 0.3028823038186097, + "grad_norm": 5.280980587005615, + "learning_rate": 7.991718177942667e-06, + "loss": 0.177, + "step": 11969 + }, + { + "epoch": 0.3029076093833034, + "grad_norm": 5.005942344665527, + "learning_rate": 7.991396452538269e-06, + "loss": 0.1784, + "step": 11970 + }, + { + "epoch": 0.30293291494799707, + "grad_norm": 8.48609447479248, + "learning_rate": 7.99107470784291e-06, + "loss": 0.2281, + "step": 11971 + }, + { + "epoch": 0.3029582205126907, + "grad_norm": 4.2009758949279785, + "learning_rate": 7.990752943858668e-06, + "loss": 0.1437, + "step": 11972 + }, + { + "epoch": 0.30298352607738444, + "grad_norm": 7.719625473022461, + "learning_rate": 7.990431160587617e-06, + "loss": 0.2983, + "step": 11973 + }, + { + "epoch": 0.3030088316420781, + "grad_norm": 7.101787090301514, + "learning_rate": 7.990109358031831e-06, + "loss": 0.1829, + "step": 11974 + }, + { + "epoch": 0.30303413720677175, + "grad_norm": 3.3399226665496826, + "learning_rate": 7.989787536193386e-06, + "loss": 0.1158, + "step": 11975 + }, + { + "epoch": 0.30305944277146546, + "grad_norm": 8.910511016845703, + "learning_rate": 7.989465695074357e-06, + "loss": 0.2631, + "step": 11976 + }, + { + "epoch": 0.3030847483361591, + "grad_norm": 33.59355926513672, + "learning_rate": 7.989143834676817e-06, + "loss": 0.2804, + "step": 11977 + }, + { + "epoch": 0.30311005390085277, + "grad_norm": 3.1658523082733154, + "learning_rate": 7.988821955002851e-06, + "loss": 0.1764, + "step": 11978 + }, + { + "epoch": 0.3031353594655465, + "grad_norm": 2.896862030029297, + "learning_rate": 7.988500056054523e-06, + "loss": 0.1608, + "step": 11979 + }, + { + "epoch": 0.30316066503024014, + "grad_norm": 3.602653980255127, + "learning_rate": 7.988178137833915e-06, + "loss": 0.1033, + "step": 11980 + }, + { + "epoch": 0.30318597059493385, + "grad_norm": 5.494304180145264, + "learning_rate": 7.987856200343104e-06, + "loss": 0.2569, + "step": 11981 + }, + { + "epoch": 0.3032112761596275, + "grad_norm": 3.782209634780884, + "learning_rate": 7.987534243584162e-06, + "loss": 0.168, + "step": 11982 + }, + { + "epoch": 0.30323658172432116, + "grad_norm": 3.306375026702881, + "learning_rate": 7.987212267559168e-06, + "loss": 0.1477, + "step": 11983 + }, + { + "epoch": 0.3032618872890149, + "grad_norm": 5.290353775024414, + "learning_rate": 7.986890272270198e-06, + "loss": 0.212, + "step": 11984 + }, + { + "epoch": 0.30328719285370853, + "grad_norm": 5.286633491516113, + "learning_rate": 7.986568257719329e-06, + "loss": 0.2226, + "step": 11985 + }, + { + "epoch": 0.3033124984184022, + "grad_norm": 3.6865572929382324, + "learning_rate": 7.986246223908635e-06, + "loss": 0.1547, + "step": 11986 + }, + { + "epoch": 0.3033378039830959, + "grad_norm": 4.05324649810791, + "learning_rate": 7.985924170840198e-06, + "loss": 0.1248, + "step": 11987 + }, + { + "epoch": 0.30336310954778956, + "grad_norm": 7.217466831207275, + "learning_rate": 7.985602098516087e-06, + "loss": 0.2975, + "step": 11988 + }, + { + "epoch": 0.3033884151124832, + "grad_norm": 6.239953994750977, + "learning_rate": 7.985280006938386e-06, + "loss": 0.2347, + "step": 11989 + }, + { + "epoch": 0.3034137206771769, + "grad_norm": 4.736719608306885, + "learning_rate": 7.984957896109169e-06, + "loss": 0.1402, + "step": 11990 + }, + { + "epoch": 0.3034390262418706, + "grad_norm": 7.402833938598633, + "learning_rate": 7.984635766030514e-06, + "loss": 0.2082, + "step": 11991 + }, + { + "epoch": 0.30346433180656424, + "grad_norm": 9.617876052856445, + "learning_rate": 7.984313616704497e-06, + "loss": 0.2951, + "step": 11992 + }, + { + "epoch": 0.30348963737125795, + "grad_norm": 4.153014183044434, + "learning_rate": 7.9839914481332e-06, + "loss": 0.1941, + "step": 11993 + }, + { + "epoch": 0.3035149429359516, + "grad_norm": 10.264415740966797, + "learning_rate": 7.983669260318695e-06, + "loss": 0.2013, + "step": 11994 + }, + { + "epoch": 0.3035402485006453, + "grad_norm": 5.967888832092285, + "learning_rate": 7.98334705326306e-06, + "loss": 0.214, + "step": 11995 + }, + { + "epoch": 0.30356555406533897, + "grad_norm": 4.867498874664307, + "learning_rate": 7.983024826968377e-06, + "loss": 0.2211, + "step": 11996 + }, + { + "epoch": 0.30359085963003263, + "grad_norm": 14.349323272705078, + "learning_rate": 7.982702581436721e-06, + "loss": 0.2964, + "step": 11997 + }, + { + "epoch": 0.30361616519472634, + "grad_norm": 5.2898125648498535, + "learning_rate": 7.982380316670171e-06, + "loss": 0.1736, + "step": 11998 + }, + { + "epoch": 0.30364147075942, + "grad_norm": 5.746641635894775, + "learning_rate": 7.982058032670806e-06, + "loss": 0.1951, + "step": 11999 + }, + { + "epoch": 0.30366677632411365, + "grad_norm": 4.365914821624756, + "learning_rate": 7.981735729440702e-06, + "loss": 0.1513, + "step": 12000 + }, + { + "epoch": 0.30369208188880736, + "grad_norm": 6.4528913497924805, + "learning_rate": 7.98141340698194e-06, + "loss": 0.2353, + "step": 12001 + }, + { + "epoch": 0.303717387453501, + "grad_norm": 5.0878777503967285, + "learning_rate": 7.981091065296599e-06, + "loss": 0.2209, + "step": 12002 + }, + { + "epoch": 0.3037426930181947, + "grad_norm": 5.740851879119873, + "learning_rate": 7.980768704386752e-06, + "loss": 0.1695, + "step": 12003 + }, + { + "epoch": 0.3037679985828884, + "grad_norm": 5.618620872497559, + "learning_rate": 7.980446324254485e-06, + "loss": 0.178, + "step": 12004 + }, + { + "epoch": 0.30379330414758204, + "grad_norm": 8.336987495422363, + "learning_rate": 7.980123924901873e-06, + "loss": 0.1773, + "step": 12005 + }, + { + "epoch": 0.30381860971227576, + "grad_norm": 7.393466472625732, + "learning_rate": 7.979801506330997e-06, + "loss": 0.2379, + "step": 12006 + }, + { + "epoch": 0.3038439152769694, + "grad_norm": 7.728920936584473, + "learning_rate": 7.979479068543935e-06, + "loss": 0.1866, + "step": 12007 + }, + { + "epoch": 0.30386922084166307, + "grad_norm": 2.90968656539917, + "learning_rate": 7.979156611542768e-06, + "loss": 0.1381, + "step": 12008 + }, + { + "epoch": 0.3038945264063568, + "grad_norm": 5.225203514099121, + "learning_rate": 7.978834135329572e-06, + "loss": 0.101, + "step": 12009 + }, + { + "epoch": 0.30391983197105044, + "grad_norm": 6.049739837646484, + "learning_rate": 7.97851163990643e-06, + "loss": 0.1756, + "step": 12010 + }, + { + "epoch": 0.3039451375357441, + "grad_norm": 3.3365817070007324, + "learning_rate": 7.97818912527542e-06, + "loss": 0.2134, + "step": 12011 + }, + { + "epoch": 0.3039704431004378, + "grad_norm": 6.015886306762695, + "learning_rate": 7.977866591438623e-06, + "loss": 0.2271, + "step": 12012 + }, + { + "epoch": 0.30399574866513146, + "grad_norm": 9.559386253356934, + "learning_rate": 7.97754403839812e-06, + "loss": 0.2943, + "step": 12013 + }, + { + "epoch": 0.3040210542298251, + "grad_norm": 4.659024238586426, + "learning_rate": 7.977221466155985e-06, + "loss": 0.1253, + "step": 12014 + }, + { + "epoch": 0.30404635979451883, + "grad_norm": 8.005361557006836, + "learning_rate": 7.976898874714305e-06, + "loss": 0.1845, + "step": 12015 + }, + { + "epoch": 0.3040716653592125, + "grad_norm": 9.666098594665527, + "learning_rate": 7.976576264075158e-06, + "loss": 0.3347, + "step": 12016 + }, + { + "epoch": 0.30409697092390614, + "grad_norm": 7.042448043823242, + "learning_rate": 7.976253634240625e-06, + "loss": 0.1802, + "step": 12017 + }, + { + "epoch": 0.30412227648859985, + "grad_norm": 3.3023335933685303, + "learning_rate": 7.975930985212783e-06, + "loss": 0.1826, + "step": 12018 + }, + { + "epoch": 0.3041475820532935, + "grad_norm": 5.982184410095215, + "learning_rate": 7.975608316993717e-06, + "loss": 0.1766, + "step": 12019 + }, + { + "epoch": 0.3041728876179872, + "grad_norm": 5.188113212585449, + "learning_rate": 7.975285629585509e-06, + "loss": 0.1808, + "step": 12020 + }, + { + "epoch": 0.3041981931826809, + "grad_norm": 4.233210563659668, + "learning_rate": 7.974962922990236e-06, + "loss": 0.0977, + "step": 12021 + }, + { + "epoch": 0.30422349874737453, + "grad_norm": 6.600615978240967, + "learning_rate": 7.97464019720998e-06, + "loss": 0.2529, + "step": 12022 + }, + { + "epoch": 0.30424880431206824, + "grad_norm": 6.497523307800293, + "learning_rate": 7.974317452246822e-06, + "loss": 0.1331, + "step": 12023 + }, + { + "epoch": 0.3042741098767619, + "grad_norm": 6.175530910491943, + "learning_rate": 7.973994688102845e-06, + "loss": 0.2144, + "step": 12024 + }, + { + "epoch": 0.30429941544145556, + "grad_norm": 3.7014126777648926, + "learning_rate": 7.973671904780128e-06, + "loss": 0.1331, + "step": 12025 + }, + { + "epoch": 0.30432472100614927, + "grad_norm": 5.562847137451172, + "learning_rate": 7.973349102280756e-06, + "loss": 0.1947, + "step": 12026 + }, + { + "epoch": 0.3043500265708429, + "grad_norm": 3.903942823410034, + "learning_rate": 7.973026280606809e-06, + "loss": 0.1351, + "step": 12027 + }, + { + "epoch": 0.3043753321355366, + "grad_norm": 2.743316888809204, + "learning_rate": 7.972703439760366e-06, + "loss": 0.0948, + "step": 12028 + }, + { + "epoch": 0.3044006377002303, + "grad_norm": 3.920419454574585, + "learning_rate": 7.972380579743512e-06, + "loss": 0.1464, + "step": 12029 + }, + { + "epoch": 0.30442594326492395, + "grad_norm": 4.4140625, + "learning_rate": 7.972057700558332e-06, + "loss": 0.1974, + "step": 12030 + }, + { + "epoch": 0.30445124882961766, + "grad_norm": 7.1709699630737305, + "learning_rate": 7.971734802206901e-06, + "loss": 0.3134, + "step": 12031 + }, + { + "epoch": 0.3044765543943113, + "grad_norm": 9.65030288696289, + "learning_rate": 7.971411884691305e-06, + "loss": 0.3012, + "step": 12032 + }, + { + "epoch": 0.304501859959005, + "grad_norm": 4.761721611022949, + "learning_rate": 7.97108894801363e-06, + "loss": 0.1574, + "step": 12033 + }, + { + "epoch": 0.3045271655236987, + "grad_norm": 6.141003608703613, + "learning_rate": 7.970765992175952e-06, + "loss": 0.1827, + "step": 12034 + }, + { + "epoch": 0.30455247108839234, + "grad_norm": 6.119683265686035, + "learning_rate": 7.970443017180357e-06, + "loss": 0.1677, + "step": 12035 + }, + { + "epoch": 0.304577776653086, + "grad_norm": 4.683807849884033, + "learning_rate": 7.970120023028928e-06, + "loss": 0.2144, + "step": 12036 + }, + { + "epoch": 0.3046030822177797, + "grad_norm": 3.9139983654022217, + "learning_rate": 7.969797009723748e-06, + "loss": 0.1783, + "step": 12037 + }, + { + "epoch": 0.30462838778247336, + "grad_norm": 4.551696300506592, + "learning_rate": 7.969473977266899e-06, + "loss": 0.1521, + "step": 12038 + }, + { + "epoch": 0.304653693347167, + "grad_norm": 8.074399948120117, + "learning_rate": 7.969150925660464e-06, + "loss": 0.1791, + "step": 12039 + }, + { + "epoch": 0.30467899891186073, + "grad_norm": 7.40624475479126, + "learning_rate": 7.96882785490653e-06, + "loss": 0.2396, + "step": 12040 + }, + { + "epoch": 0.3047043044765544, + "grad_norm": 6.8931989669799805, + "learning_rate": 7.968504765007174e-06, + "loss": 0.183, + "step": 12041 + }, + { + "epoch": 0.30472961004124804, + "grad_norm": 2.7883310317993164, + "learning_rate": 7.968181655964484e-06, + "loss": 0.1319, + "step": 12042 + }, + { + "epoch": 0.30475491560594176, + "grad_norm": 5.569948673248291, + "learning_rate": 7.967858527780541e-06, + "loss": 0.1859, + "step": 12043 + }, + { + "epoch": 0.3047802211706354, + "grad_norm": 6.911693096160889, + "learning_rate": 7.967535380457434e-06, + "loss": 0.1424, + "step": 12044 + }, + { + "epoch": 0.3048055267353291, + "grad_norm": 4.675998210906982, + "learning_rate": 7.967212213997241e-06, + "loss": 0.1738, + "step": 12045 + }, + { + "epoch": 0.3048308323000228, + "grad_norm": 10.128270149230957, + "learning_rate": 7.96688902840205e-06, + "loss": 0.218, + "step": 12046 + }, + { + "epoch": 0.30485613786471644, + "grad_norm": 3.5757319927215576, + "learning_rate": 7.96656582367394e-06, + "loss": 0.1902, + "step": 12047 + }, + { + "epoch": 0.30488144342941015, + "grad_norm": 4.882094383239746, + "learning_rate": 7.966242599815003e-06, + "loss": 0.2752, + "step": 12048 + }, + { + "epoch": 0.3049067489941038, + "grad_norm": 3.841686725616455, + "learning_rate": 7.965919356827316e-06, + "loss": 0.1499, + "step": 12049 + }, + { + "epoch": 0.30493205455879746, + "grad_norm": 6.458188056945801, + "learning_rate": 7.965596094712969e-06, + "loss": 0.2398, + "step": 12050 + }, + { + "epoch": 0.3049573601234912, + "grad_norm": 15.544960021972656, + "learning_rate": 7.965272813474044e-06, + "loss": 0.2271, + "step": 12051 + }, + { + "epoch": 0.30498266568818483, + "grad_norm": 3.786329984664917, + "learning_rate": 7.964949513112625e-06, + "loss": 0.1875, + "step": 12052 + }, + { + "epoch": 0.3050079712528785, + "grad_norm": 3.675184965133667, + "learning_rate": 7.9646261936308e-06, + "loss": 0.1449, + "step": 12053 + }, + { + "epoch": 0.3050332768175722, + "grad_norm": 4.734977722167969, + "learning_rate": 7.96430285503065e-06, + "loss": 0.129, + "step": 12054 + }, + { + "epoch": 0.30505858238226585, + "grad_norm": 4.0214009284973145, + "learning_rate": 7.963979497314264e-06, + "loss": 0.206, + "step": 12055 + }, + { + "epoch": 0.3050838879469595, + "grad_norm": 6.2411885261535645, + "learning_rate": 7.963656120483724e-06, + "loss": 0.1705, + "step": 12056 + }, + { + "epoch": 0.3051091935116532, + "grad_norm": 6.6229753494262695, + "learning_rate": 7.96333272454112e-06, + "loss": 0.1656, + "step": 12057 + }, + { + "epoch": 0.3051344990763469, + "grad_norm": 4.452500820159912, + "learning_rate": 7.96300930948853e-06, + "loss": 0.2232, + "step": 12058 + }, + { + "epoch": 0.3051598046410406, + "grad_norm": 4.183586120605469, + "learning_rate": 7.962685875328047e-06, + "loss": 0.1392, + "step": 12059 + }, + { + "epoch": 0.30518511020573424, + "grad_norm": 10.73587417602539, + "learning_rate": 7.962362422061753e-06, + "loss": 0.26, + "step": 12060 + }, + { + "epoch": 0.3052104157704279, + "grad_norm": 9.923794746398926, + "learning_rate": 7.962038949691734e-06, + "loss": 0.2359, + "step": 12061 + }, + { + "epoch": 0.3052357213351216, + "grad_norm": 4.233339309692383, + "learning_rate": 7.96171545822008e-06, + "loss": 0.1738, + "step": 12062 + }, + { + "epoch": 0.30526102689981527, + "grad_norm": 5.130425930023193, + "learning_rate": 7.96139194764887e-06, + "loss": 0.0982, + "step": 12063 + }, + { + "epoch": 0.3052863324645089, + "grad_norm": 3.8836066722869873, + "learning_rate": 7.961068417980198e-06, + "loss": 0.1228, + "step": 12064 + }, + { + "epoch": 0.30531163802920264, + "grad_norm": 5.763112545013428, + "learning_rate": 7.960744869216144e-06, + "loss": 0.1506, + "step": 12065 + }, + { + "epoch": 0.3053369435938963, + "grad_norm": 7.103333950042725, + "learning_rate": 7.960421301358798e-06, + "loss": 0.1953, + "step": 12066 + }, + { + "epoch": 0.30536224915858995, + "grad_norm": 7.042933464050293, + "learning_rate": 7.960097714410246e-06, + "loss": 0.1776, + "step": 12067 + }, + { + "epoch": 0.30538755472328366, + "grad_norm": 2.3519668579101562, + "learning_rate": 7.959774108372573e-06, + "loss": 0.0948, + "step": 12068 + }, + { + "epoch": 0.3054128602879773, + "grad_norm": 4.221383094787598, + "learning_rate": 7.959450483247869e-06, + "loss": 0.1489, + "step": 12069 + }, + { + "epoch": 0.30543816585267103, + "grad_norm": 3.814544916152954, + "learning_rate": 7.959126839038218e-06, + "loss": 0.1877, + "step": 12070 + }, + { + "epoch": 0.3054634714173647, + "grad_norm": 6.165023326873779, + "learning_rate": 7.95880317574571e-06, + "loss": 0.1883, + "step": 12071 + }, + { + "epoch": 0.30548877698205834, + "grad_norm": 5.424685478210449, + "learning_rate": 7.958479493372429e-06, + "loss": 0.2041, + "step": 12072 + }, + { + "epoch": 0.30551408254675205, + "grad_norm": 17.69914436340332, + "learning_rate": 7.958155791920465e-06, + "loss": 0.3417, + "step": 12073 + }, + { + "epoch": 0.3055393881114457, + "grad_norm": 11.860493659973145, + "learning_rate": 7.957832071391904e-06, + "loss": 0.3382, + "step": 12074 + }, + { + "epoch": 0.30556469367613937, + "grad_norm": 7.576938152313232, + "learning_rate": 7.957508331788834e-06, + "loss": 0.2095, + "step": 12075 + }, + { + "epoch": 0.3055899992408331, + "grad_norm": 4.883270263671875, + "learning_rate": 7.957184573113343e-06, + "loss": 0.112, + "step": 12076 + }, + { + "epoch": 0.30561530480552673, + "grad_norm": 3.0432190895080566, + "learning_rate": 7.956860795367521e-06, + "loss": 0.1284, + "step": 12077 + }, + { + "epoch": 0.3056406103702204, + "grad_norm": 3.1498050689697266, + "learning_rate": 7.956536998553451e-06, + "loss": 0.1638, + "step": 12078 + }, + { + "epoch": 0.3056659159349141, + "grad_norm": 11.093947410583496, + "learning_rate": 7.956213182673225e-06, + "loss": 0.2037, + "step": 12079 + }, + { + "epoch": 0.30569122149960776, + "grad_norm": 4.135753631591797, + "learning_rate": 7.95588934772893e-06, + "loss": 0.2396, + "step": 12080 + }, + { + "epoch": 0.3057165270643014, + "grad_norm": 2.942107677459717, + "learning_rate": 7.955565493722655e-06, + "loss": 0.0877, + "step": 12081 + }, + { + "epoch": 0.3057418326289951, + "grad_norm": 9.40880012512207, + "learning_rate": 7.955241620656487e-06, + "loss": 0.2162, + "step": 12082 + }, + { + "epoch": 0.3057671381936888, + "grad_norm": 4.3434038162231445, + "learning_rate": 7.954917728532515e-06, + "loss": 0.1263, + "step": 12083 + }, + { + "epoch": 0.3057924437583825, + "grad_norm": 4.573331832885742, + "learning_rate": 7.954593817352829e-06, + "loss": 0.1722, + "step": 12084 + }, + { + "epoch": 0.30581774932307615, + "grad_norm": 7.711184024810791, + "learning_rate": 7.954269887119516e-06, + "loss": 0.2743, + "step": 12085 + }, + { + "epoch": 0.3058430548877698, + "grad_norm": 5.800144195556641, + "learning_rate": 7.953945937834669e-06, + "loss": 0.2188, + "step": 12086 + }, + { + "epoch": 0.3058683604524635, + "grad_norm": 12.118208885192871, + "learning_rate": 7.953621969500371e-06, + "loss": 0.3293, + "step": 12087 + }, + { + "epoch": 0.3058936660171572, + "grad_norm": 6.664429664611816, + "learning_rate": 7.953297982118718e-06, + "loss": 0.2586, + "step": 12088 + }, + { + "epoch": 0.30591897158185083, + "grad_norm": 5.444357395172119, + "learning_rate": 7.952973975691791e-06, + "loss": 0.1634, + "step": 12089 + }, + { + "epoch": 0.30594427714654454, + "grad_norm": 2.4257664680480957, + "learning_rate": 7.952649950221687e-06, + "loss": 0.0785, + "step": 12090 + }, + { + "epoch": 0.3059695827112382, + "grad_norm": 3.261112689971924, + "learning_rate": 7.952325905710492e-06, + "loss": 0.126, + "step": 12091 + }, + { + "epoch": 0.30599488827593185, + "grad_norm": 3.856752872467041, + "learning_rate": 7.952001842160296e-06, + "loss": 0.2122, + "step": 12092 + }, + { + "epoch": 0.30602019384062557, + "grad_norm": 5.52935266494751, + "learning_rate": 7.95167775957319e-06, + "loss": 0.2116, + "step": 12093 + }, + { + "epoch": 0.3060454994053192, + "grad_norm": 13.161210060119629, + "learning_rate": 7.951353657951263e-06, + "loss": 0.1468, + "step": 12094 + }, + { + "epoch": 0.30607080497001293, + "grad_norm": 3.258439779281616, + "learning_rate": 7.951029537296603e-06, + "loss": 0.1567, + "step": 12095 + }, + { + "epoch": 0.3060961105347066, + "grad_norm": 8.116998672485352, + "learning_rate": 7.950705397611306e-06, + "loss": 0.229, + "step": 12096 + }, + { + "epoch": 0.30612141609940025, + "grad_norm": 8.785374641418457, + "learning_rate": 7.950381238897458e-06, + "loss": 0.2554, + "step": 12097 + }, + { + "epoch": 0.30614672166409396, + "grad_norm": 2.796206474304199, + "learning_rate": 7.950057061157149e-06, + "loss": 0.1538, + "step": 12098 + }, + { + "epoch": 0.3061720272287876, + "grad_norm": 7.523889064788818, + "learning_rate": 7.949732864392473e-06, + "loss": 0.1852, + "step": 12099 + }, + { + "epoch": 0.30619733279348127, + "grad_norm": 10.953359603881836, + "learning_rate": 7.949408648605514e-06, + "loss": 0.2502, + "step": 12100 + }, + { + "epoch": 0.306222638358175, + "grad_norm": 3.7299468517303467, + "learning_rate": 7.94908441379837e-06, + "loss": 0.1678, + "step": 12101 + }, + { + "epoch": 0.30624794392286864, + "grad_norm": 3.2401554584503174, + "learning_rate": 7.94876015997313e-06, + "loss": 0.1948, + "step": 12102 + }, + { + "epoch": 0.3062732494875623, + "grad_norm": 4.694333553314209, + "learning_rate": 7.948435887131884e-06, + "loss": 0.1603, + "step": 12103 + }, + { + "epoch": 0.306298555052256, + "grad_norm": 3.7591447830200195, + "learning_rate": 7.948111595276721e-06, + "loss": 0.1455, + "step": 12104 + }, + { + "epoch": 0.30632386061694966, + "grad_norm": 7.248936176300049, + "learning_rate": 7.947787284409738e-06, + "loss": 0.2079, + "step": 12105 + }, + { + "epoch": 0.3063491661816433, + "grad_norm": 3.3941233158111572, + "learning_rate": 7.947462954533022e-06, + "loss": 0.1309, + "step": 12106 + }, + { + "epoch": 0.30637447174633703, + "grad_norm": 6.663671493530273, + "learning_rate": 7.947138605648665e-06, + "loss": 0.2338, + "step": 12107 + }, + { + "epoch": 0.3063997773110307, + "grad_norm": 3.829761028289795, + "learning_rate": 7.94681423775876e-06, + "loss": 0.155, + "step": 12108 + }, + { + "epoch": 0.3064250828757244, + "grad_norm": 7.152969837188721, + "learning_rate": 7.946489850865398e-06, + "loss": 0.2049, + "step": 12109 + }, + { + "epoch": 0.30645038844041805, + "grad_norm": 5.1726179122924805, + "learning_rate": 7.94616544497067e-06, + "loss": 0.1348, + "step": 12110 + }, + { + "epoch": 0.3064756940051117, + "grad_norm": 7.258720874786377, + "learning_rate": 7.94584102007667e-06, + "loss": 0.1711, + "step": 12111 + }, + { + "epoch": 0.3065009995698054, + "grad_norm": 6.671204090118408, + "learning_rate": 7.94551657618549e-06, + "loss": 0.2002, + "step": 12112 + }, + { + "epoch": 0.3065263051344991, + "grad_norm": 5.746369361877441, + "learning_rate": 7.94519211329922e-06, + "loss": 0.2312, + "step": 12113 + }, + { + "epoch": 0.30655161069919273, + "grad_norm": 4.745060443878174, + "learning_rate": 7.944867631419955e-06, + "loss": 0.1653, + "step": 12114 + }, + { + "epoch": 0.30657691626388645, + "grad_norm": 9.313458442687988, + "learning_rate": 7.944543130549787e-06, + "loss": 0.2273, + "step": 12115 + }, + { + "epoch": 0.3066022218285801, + "grad_norm": 7.257711887359619, + "learning_rate": 7.944218610690806e-06, + "loss": 0.2725, + "step": 12116 + }, + { + "epoch": 0.30662752739327376, + "grad_norm": 11.466397285461426, + "learning_rate": 7.943894071845108e-06, + "loss": 0.3533, + "step": 12117 + }, + { + "epoch": 0.30665283295796747, + "grad_norm": 3.376417875289917, + "learning_rate": 7.943569514014785e-06, + "loss": 0.0956, + "step": 12118 + }, + { + "epoch": 0.3066781385226611, + "grad_norm": 9.0325288772583, + "learning_rate": 7.94324493720193e-06, + "loss": 0.2227, + "step": 12119 + }, + { + "epoch": 0.3067034440873548, + "grad_norm": 10.237431526184082, + "learning_rate": 7.942920341408635e-06, + "loss": 0.1551, + "step": 12120 + }, + { + "epoch": 0.3067287496520485, + "grad_norm": 6.273176193237305, + "learning_rate": 7.942595726636994e-06, + "loss": 0.2161, + "step": 12121 + }, + { + "epoch": 0.30675405521674215, + "grad_norm": 13.294154167175293, + "learning_rate": 7.9422710928891e-06, + "loss": 0.2389, + "step": 12122 + }, + { + "epoch": 0.30677936078143586, + "grad_norm": 5.320262908935547, + "learning_rate": 7.94194644016705e-06, + "loss": 0.2015, + "step": 12123 + }, + { + "epoch": 0.3068046663461295, + "grad_norm": 11.745688438415527, + "learning_rate": 7.941621768472931e-06, + "loss": 0.2249, + "step": 12124 + }, + { + "epoch": 0.3068299719108232, + "grad_norm": 5.59930944442749, + "learning_rate": 7.941297077808843e-06, + "loss": 0.1933, + "step": 12125 + }, + { + "epoch": 0.3068552774755169, + "grad_norm": 4.073618412017822, + "learning_rate": 7.940972368176876e-06, + "loss": 0.1258, + "step": 12126 + }, + { + "epoch": 0.30688058304021054, + "grad_norm": 6.881129264831543, + "learning_rate": 7.940647639579125e-06, + "loss": 0.2499, + "step": 12127 + }, + { + "epoch": 0.3069058886049042, + "grad_norm": 3.270545482635498, + "learning_rate": 7.940322892017686e-06, + "loss": 0.194, + "step": 12128 + }, + { + "epoch": 0.3069311941695979, + "grad_norm": 4.3619608879089355, + "learning_rate": 7.93999812549465e-06, + "loss": 0.1544, + "step": 12129 + }, + { + "epoch": 0.30695649973429157, + "grad_norm": 10.077081680297852, + "learning_rate": 7.939673340012114e-06, + "loss": 0.1936, + "step": 12130 + }, + { + "epoch": 0.3069818052989852, + "grad_norm": 4.45304536819458, + "learning_rate": 7.939348535572172e-06, + "loss": 0.1536, + "step": 12131 + }, + { + "epoch": 0.30700711086367893, + "grad_norm": 5.467031002044678, + "learning_rate": 7.939023712176916e-06, + "loss": 0.1567, + "step": 12132 + }, + { + "epoch": 0.3070324164283726, + "grad_norm": 7.724366188049316, + "learning_rate": 7.938698869828442e-06, + "loss": 0.1105, + "step": 12133 + }, + { + "epoch": 0.3070577219930663, + "grad_norm": 4.045492172241211, + "learning_rate": 7.938374008528848e-06, + "loss": 0.1203, + "step": 12134 + }, + { + "epoch": 0.30708302755775996, + "grad_norm": 5.9016337394714355, + "learning_rate": 7.938049128280228e-06, + "loss": 0.2579, + "step": 12135 + }, + { + "epoch": 0.3071083331224536, + "grad_norm": 6.27346134185791, + "learning_rate": 7.937724229084673e-06, + "loss": 0.1678, + "step": 12136 + }, + { + "epoch": 0.3071336386871473, + "grad_norm": 2.9512336254119873, + "learning_rate": 7.937399310944282e-06, + "loss": 0.1507, + "step": 12137 + }, + { + "epoch": 0.307158944251841, + "grad_norm": 3.6363539695739746, + "learning_rate": 7.937074373861147e-06, + "loss": 0.1616, + "step": 12138 + }, + { + "epoch": 0.30718424981653464, + "grad_norm": 4.9640913009643555, + "learning_rate": 7.936749417837369e-06, + "loss": 0.2086, + "step": 12139 + }, + { + "epoch": 0.30720955538122835, + "grad_norm": 4.1761298179626465, + "learning_rate": 7.936424442875039e-06, + "loss": 0.1539, + "step": 12140 + }, + { + "epoch": 0.307234860945922, + "grad_norm": 5.704134464263916, + "learning_rate": 7.936099448976254e-06, + "loss": 0.1724, + "step": 12141 + }, + { + "epoch": 0.30726016651061566, + "grad_norm": 3.6502678394317627, + "learning_rate": 7.935774436143109e-06, + "loss": 0.1613, + "step": 12142 + }, + { + "epoch": 0.3072854720753094, + "grad_norm": 3.7357304096221924, + "learning_rate": 7.935449404377703e-06, + "loss": 0.1832, + "step": 12143 + }, + { + "epoch": 0.30731077764000303, + "grad_norm": 3.554302215576172, + "learning_rate": 7.935124353682126e-06, + "loss": 0.1556, + "step": 12144 + }, + { + "epoch": 0.3073360832046967, + "grad_norm": 6.851524829864502, + "learning_rate": 7.93479928405848e-06, + "loss": 0.1289, + "step": 12145 + }, + { + "epoch": 0.3073613887693904, + "grad_norm": 5.88759708404541, + "learning_rate": 7.93447419550886e-06, + "loss": 0.1446, + "step": 12146 + }, + { + "epoch": 0.30738669433408405, + "grad_norm": 5.256760120391846, + "learning_rate": 7.934149088035361e-06, + "loss": 0.1456, + "step": 12147 + }, + { + "epoch": 0.30741199989877777, + "grad_norm": 5.8366923332214355, + "learning_rate": 7.933823961640081e-06, + "loss": 0.1432, + "step": 12148 + }, + { + "epoch": 0.3074373054634714, + "grad_norm": 3.312195062637329, + "learning_rate": 7.933498816325115e-06, + "loss": 0.0955, + "step": 12149 + }, + { + "epoch": 0.3074626110281651, + "grad_norm": 6.221615314483643, + "learning_rate": 7.933173652092563e-06, + "loss": 0.1959, + "step": 12150 + }, + { + "epoch": 0.3074879165928588, + "grad_norm": 5.311913967132568, + "learning_rate": 7.932848468944519e-06, + "loss": 0.214, + "step": 12151 + }, + { + "epoch": 0.30751322215755245, + "grad_norm": 5.146218776702881, + "learning_rate": 7.93252326688308e-06, + "loss": 0.2011, + "step": 12152 + }, + { + "epoch": 0.3075385277222461, + "grad_norm": 20.030099868774414, + "learning_rate": 7.932198045910342e-06, + "loss": 0.3403, + "step": 12153 + }, + { + "epoch": 0.3075638332869398, + "grad_norm": 7.224399566650391, + "learning_rate": 7.931872806028407e-06, + "loss": 0.2344, + "step": 12154 + }, + { + "epoch": 0.30758913885163347, + "grad_norm": 9.889847755432129, + "learning_rate": 7.931547547239368e-06, + "loss": 0.1529, + "step": 12155 + }, + { + "epoch": 0.3076144444163271, + "grad_norm": 13.414008140563965, + "learning_rate": 7.931222269545326e-06, + "loss": 0.3219, + "step": 12156 + }, + { + "epoch": 0.30763974998102084, + "grad_norm": 5.701589584350586, + "learning_rate": 7.930896972948377e-06, + "loss": 0.1956, + "step": 12157 + }, + { + "epoch": 0.3076650555457145, + "grad_norm": 7.319240570068359, + "learning_rate": 7.930571657450616e-06, + "loss": 0.2395, + "step": 12158 + }, + { + "epoch": 0.3076903611104082, + "grad_norm": 5.602306365966797, + "learning_rate": 7.930246323054146e-06, + "loss": 0.1541, + "step": 12159 + }, + { + "epoch": 0.30771566667510186, + "grad_norm": 5.210147380828857, + "learning_rate": 7.92992096976106e-06, + "loss": 0.2208, + "step": 12160 + }, + { + "epoch": 0.3077409722397955, + "grad_norm": 5.835524082183838, + "learning_rate": 7.929595597573461e-06, + "loss": 0.2482, + "step": 12161 + }, + { + "epoch": 0.30776627780448923, + "grad_norm": 5.661257266998291, + "learning_rate": 7.929270206493444e-06, + "loss": 0.1808, + "step": 12162 + }, + { + "epoch": 0.3077915833691829, + "grad_norm": 5.134428024291992, + "learning_rate": 7.928944796523109e-06, + "loss": 0.1996, + "step": 12163 + }, + { + "epoch": 0.30781688893387654, + "grad_norm": 6.193796157836914, + "learning_rate": 7.928619367664554e-06, + "loss": 0.206, + "step": 12164 + }, + { + "epoch": 0.30784219449857025, + "grad_norm": 6.441089630126953, + "learning_rate": 7.928293919919876e-06, + "loss": 0.228, + "step": 12165 + }, + { + "epoch": 0.3078675000632639, + "grad_norm": 2.5572669506073, + "learning_rate": 7.927968453291174e-06, + "loss": 0.1555, + "step": 12166 + }, + { + "epoch": 0.30789280562795757, + "grad_norm": 5.426469802856445, + "learning_rate": 7.927642967780552e-06, + "loss": 0.1363, + "step": 12167 + }, + { + "epoch": 0.3079181111926513, + "grad_norm": 3.9773714542388916, + "learning_rate": 7.927317463390103e-06, + "loss": 0.1634, + "step": 12168 + }, + { + "epoch": 0.30794341675734493, + "grad_norm": 3.923398494720459, + "learning_rate": 7.926991940121928e-06, + "loss": 0.1667, + "step": 12169 + }, + { + "epoch": 0.3079687223220386, + "grad_norm": 24.345876693725586, + "learning_rate": 7.926666397978127e-06, + "loss": 0.3339, + "step": 12170 + }, + { + "epoch": 0.3079940278867323, + "grad_norm": 7.112741470336914, + "learning_rate": 7.926340836960799e-06, + "loss": 0.187, + "step": 12171 + }, + { + "epoch": 0.30801933345142596, + "grad_norm": 3.237759590148926, + "learning_rate": 7.926015257072042e-06, + "loss": 0.1196, + "step": 12172 + }, + { + "epoch": 0.30804463901611967, + "grad_norm": 4.603772163391113, + "learning_rate": 7.925689658313957e-06, + "loss": 0.1986, + "step": 12173 + }, + { + "epoch": 0.3080699445808133, + "grad_norm": 2.717571258544922, + "learning_rate": 7.925364040688646e-06, + "loss": 0.0902, + "step": 12174 + }, + { + "epoch": 0.308095250145507, + "grad_norm": 4.559843063354492, + "learning_rate": 7.925038404198203e-06, + "loss": 0.1504, + "step": 12175 + }, + { + "epoch": 0.3081205557102007, + "grad_norm": 4.430372714996338, + "learning_rate": 7.924712748844736e-06, + "loss": 0.2208, + "step": 12176 + }, + { + "epoch": 0.30814586127489435, + "grad_norm": 4.480404853820801, + "learning_rate": 7.924387074630336e-06, + "loss": 0.1557, + "step": 12177 + }, + { + "epoch": 0.308171166839588, + "grad_norm": 10.248217582702637, + "learning_rate": 7.924061381557112e-06, + "loss": 0.2588, + "step": 12178 + }, + { + "epoch": 0.3081964724042817, + "grad_norm": 3.383157253265381, + "learning_rate": 7.923735669627156e-06, + "loss": 0.1566, + "step": 12179 + }, + { + "epoch": 0.3082217779689754, + "grad_norm": 2.6077523231506348, + "learning_rate": 7.923409938842573e-06, + "loss": 0.1525, + "step": 12180 + }, + { + "epoch": 0.30824708353366903, + "grad_norm": 3.6444251537323, + "learning_rate": 7.923084189205466e-06, + "loss": 0.2361, + "step": 12181 + }, + { + "epoch": 0.30827238909836274, + "grad_norm": 10.921581268310547, + "learning_rate": 7.92275842071793e-06, + "loss": 0.2117, + "step": 12182 + }, + { + "epoch": 0.3082976946630564, + "grad_norm": 4.40740442276001, + "learning_rate": 7.92243263338207e-06, + "loss": 0.161, + "step": 12183 + }, + { + "epoch": 0.30832300022775005, + "grad_norm": 3.9465436935424805, + "learning_rate": 7.922106827199984e-06, + "loss": 0.2047, + "step": 12184 + }, + { + "epoch": 0.30834830579244377, + "grad_norm": 3.8467612266540527, + "learning_rate": 7.921781002173777e-06, + "loss": 0.1801, + "step": 12185 + }, + { + "epoch": 0.3083736113571374, + "grad_norm": 3.560858726501465, + "learning_rate": 7.921455158305547e-06, + "loss": 0.1636, + "step": 12186 + }, + { + "epoch": 0.30839891692183113, + "grad_norm": 7.5938239097595215, + "learning_rate": 7.921129295597394e-06, + "loss": 0.2056, + "step": 12187 + }, + { + "epoch": 0.3084242224865248, + "grad_norm": 3.2282474040985107, + "learning_rate": 7.920803414051423e-06, + "loss": 0.1074, + "step": 12188 + }, + { + "epoch": 0.30844952805121845, + "grad_norm": 4.438357353210449, + "learning_rate": 7.920477513669736e-06, + "loss": 0.1693, + "step": 12189 + }, + { + "epoch": 0.30847483361591216, + "grad_norm": 3.7108314037323, + "learning_rate": 7.920151594454431e-06, + "loss": 0.1541, + "step": 12190 + }, + { + "epoch": 0.3085001391806058, + "grad_norm": 4.9682393074035645, + "learning_rate": 7.91982565640761e-06, + "loss": 0.2008, + "step": 12191 + }, + { + "epoch": 0.30852544474529947, + "grad_norm": 6.905528545379639, + "learning_rate": 7.919499699531379e-06, + "loss": 0.1593, + "step": 12192 + }, + { + "epoch": 0.3085507503099932, + "grad_norm": 8.939891815185547, + "learning_rate": 7.919173723827836e-06, + "loss": 0.2376, + "step": 12193 + }, + { + "epoch": 0.30857605587468684, + "grad_norm": 15.025738716125488, + "learning_rate": 7.918847729299086e-06, + "loss": 0.2262, + "step": 12194 + }, + { + "epoch": 0.3086013614393805, + "grad_norm": 4.627471446990967, + "learning_rate": 7.918521715947229e-06, + "loss": 0.2443, + "step": 12195 + }, + { + "epoch": 0.3086266670040742, + "grad_norm": 8.043971061706543, + "learning_rate": 7.91819568377437e-06, + "loss": 0.2572, + "step": 12196 + }, + { + "epoch": 0.30865197256876786, + "grad_norm": 5.319970607757568, + "learning_rate": 7.917869632782607e-06, + "loss": 0.1783, + "step": 12197 + }, + { + "epoch": 0.3086772781334616, + "grad_norm": 4.238616943359375, + "learning_rate": 7.917543562974047e-06, + "loss": 0.1842, + "step": 12198 + }, + { + "epoch": 0.30870258369815523, + "grad_norm": 9.624513626098633, + "learning_rate": 7.917217474350791e-06, + "loss": 0.2825, + "step": 12199 + }, + { + "epoch": 0.3087278892628489, + "grad_norm": 4.173855781555176, + "learning_rate": 7.916891366914943e-06, + "loss": 0.1111, + "step": 12200 + }, + { + "epoch": 0.3087531948275426, + "grad_norm": 3.6698989868164062, + "learning_rate": 7.916565240668604e-06, + "loss": 0.1866, + "step": 12201 + }, + { + "epoch": 0.30877850039223625, + "grad_norm": 6.9011712074279785, + "learning_rate": 7.91623909561388e-06, + "loss": 0.1291, + "step": 12202 + }, + { + "epoch": 0.3088038059569299, + "grad_norm": 3.367361307144165, + "learning_rate": 7.915912931752872e-06, + "loss": 0.1484, + "step": 12203 + }, + { + "epoch": 0.3088291115216236, + "grad_norm": 14.73681640625, + "learning_rate": 7.91558674908768e-06, + "loss": 0.1576, + "step": 12204 + }, + { + "epoch": 0.3088544170863173, + "grad_norm": 4.64976692199707, + "learning_rate": 7.915260547620416e-06, + "loss": 0.1525, + "step": 12205 + }, + { + "epoch": 0.30887972265101094, + "grad_norm": 5.723728179931641, + "learning_rate": 7.914934327353178e-06, + "loss": 0.1773, + "step": 12206 + }, + { + "epoch": 0.30890502821570465, + "grad_norm": 4.329639434814453, + "learning_rate": 7.914608088288071e-06, + "loss": 0.276, + "step": 12207 + }, + { + "epoch": 0.3089303337803983, + "grad_norm": 3.520646572113037, + "learning_rate": 7.914281830427198e-06, + "loss": 0.1847, + "step": 12208 + }, + { + "epoch": 0.30895563934509196, + "grad_norm": 4.168717861175537, + "learning_rate": 7.913955553772663e-06, + "loss": 0.0965, + "step": 12209 + }, + { + "epoch": 0.30898094490978567, + "grad_norm": 6.829684734344482, + "learning_rate": 7.913629258326571e-06, + "loss": 0.2101, + "step": 12210 + }, + { + "epoch": 0.3090062504744793, + "grad_norm": 8.857081413269043, + "learning_rate": 7.913302944091025e-06, + "loss": 0.1462, + "step": 12211 + }, + { + "epoch": 0.30903155603917304, + "grad_norm": 13.434744834899902, + "learning_rate": 7.912976611068133e-06, + "loss": 0.2189, + "step": 12212 + }, + { + "epoch": 0.3090568616038667, + "grad_norm": 4.657166481018066, + "learning_rate": 7.912650259259995e-06, + "loss": 0.1979, + "step": 12213 + }, + { + "epoch": 0.30908216716856035, + "grad_norm": 4.1788787841796875, + "learning_rate": 7.912323888668716e-06, + "loss": 0.1564, + "step": 12214 + }, + { + "epoch": 0.30910747273325406, + "grad_norm": 4.0920610427856445, + "learning_rate": 7.911997499296405e-06, + "loss": 0.1492, + "step": 12215 + }, + { + "epoch": 0.3091327782979477, + "grad_norm": 4.938377857208252, + "learning_rate": 7.911671091145161e-06, + "loss": 0.1708, + "step": 12216 + }, + { + "epoch": 0.3091580838626414, + "grad_norm": 6.037628650665283, + "learning_rate": 7.911344664217093e-06, + "loss": 0.2377, + "step": 12217 + }, + { + "epoch": 0.3091833894273351, + "grad_norm": 5.081456661224365, + "learning_rate": 7.911018218514306e-06, + "loss": 0.0803, + "step": 12218 + }, + { + "epoch": 0.30920869499202874, + "grad_norm": 5.1522345542907715, + "learning_rate": 7.910691754038902e-06, + "loss": 0.1946, + "step": 12219 + }, + { + "epoch": 0.3092340005567224, + "grad_norm": 3.788393259048462, + "learning_rate": 7.91036527079299e-06, + "loss": 0.1563, + "step": 12220 + }, + { + "epoch": 0.3092593061214161, + "grad_norm": 3.4776971340179443, + "learning_rate": 7.910038768778671e-06, + "loss": 0.108, + "step": 12221 + }, + { + "epoch": 0.30928461168610977, + "grad_norm": 4.642300128936768, + "learning_rate": 7.909712247998056e-06, + "loss": 0.1356, + "step": 12222 + }, + { + "epoch": 0.3093099172508035, + "grad_norm": 6.388111591339111, + "learning_rate": 7.909385708453246e-06, + "loss": 0.1696, + "step": 12223 + }, + { + "epoch": 0.30933522281549714, + "grad_norm": 10.827768325805664, + "learning_rate": 7.909059150146349e-06, + "loss": 0.2092, + "step": 12224 + }, + { + "epoch": 0.3093605283801908, + "grad_norm": 3.9747190475463867, + "learning_rate": 7.908732573079473e-06, + "loss": 0.1046, + "step": 12225 + }, + { + "epoch": 0.3093858339448845, + "grad_norm": 5.306041717529297, + "learning_rate": 7.90840597725472e-06, + "loss": 0.1633, + "step": 12226 + }, + { + "epoch": 0.30941113950957816, + "grad_norm": 3.2617430686950684, + "learning_rate": 7.908079362674198e-06, + "loss": 0.1752, + "step": 12227 + }, + { + "epoch": 0.3094364450742718, + "grad_norm": 13.440601348876953, + "learning_rate": 7.907752729340012e-06, + "loss": 0.2782, + "step": 12228 + }, + { + "epoch": 0.3094617506389655, + "grad_norm": 10.610136985778809, + "learning_rate": 7.907426077254273e-06, + "loss": 0.2316, + "step": 12229 + }, + { + "epoch": 0.3094870562036592, + "grad_norm": 2.9987614154815674, + "learning_rate": 7.90709940641908e-06, + "loss": 0.1391, + "step": 12230 + }, + { + "epoch": 0.30951236176835284, + "grad_norm": 6.201592445373535, + "learning_rate": 7.906772716836547e-06, + "loss": 0.1833, + "step": 12231 + }, + { + "epoch": 0.30953766733304655, + "grad_norm": 6.441122055053711, + "learning_rate": 7.906446008508775e-06, + "loss": 0.1786, + "step": 12232 + }, + { + "epoch": 0.3095629728977402, + "grad_norm": 5.138381004333496, + "learning_rate": 7.906119281437873e-06, + "loss": 0.1641, + "step": 12233 + }, + { + "epoch": 0.30958827846243386, + "grad_norm": 4.987646579742432, + "learning_rate": 7.90579253562595e-06, + "loss": 0.1357, + "step": 12234 + }, + { + "epoch": 0.3096135840271276, + "grad_norm": 6.90881872177124, + "learning_rate": 7.90546577107511e-06, + "loss": 0.1317, + "step": 12235 + }, + { + "epoch": 0.30963888959182123, + "grad_norm": 6.390951633453369, + "learning_rate": 7.905138987787464e-06, + "loss": 0.1462, + "step": 12236 + }, + { + "epoch": 0.30966419515651494, + "grad_norm": 7.350165843963623, + "learning_rate": 7.904812185765114e-06, + "loss": 0.2414, + "step": 12237 + }, + { + "epoch": 0.3096895007212086, + "grad_norm": 4.795356750488281, + "learning_rate": 7.904485365010172e-06, + "loss": 0.1885, + "step": 12238 + }, + { + "epoch": 0.30971480628590226, + "grad_norm": 7.9793314933776855, + "learning_rate": 7.904158525524746e-06, + "loss": 0.1996, + "step": 12239 + }, + { + "epoch": 0.30974011185059597, + "grad_norm": 4.177329063415527, + "learning_rate": 7.90383166731094e-06, + "loss": 0.2114, + "step": 12240 + }, + { + "epoch": 0.3097654174152896, + "grad_norm": 5.851536273956299, + "learning_rate": 7.903504790370864e-06, + "loss": 0.1762, + "step": 12241 + }, + { + "epoch": 0.3097907229799833, + "grad_norm": 3.5999515056610107, + "learning_rate": 7.903177894706626e-06, + "loss": 0.1336, + "step": 12242 + }, + { + "epoch": 0.309816028544677, + "grad_norm": 5.044424533843994, + "learning_rate": 7.902850980320334e-06, + "loss": 0.2129, + "step": 12243 + }, + { + "epoch": 0.30984133410937065, + "grad_norm": 7.311453342437744, + "learning_rate": 7.902524047214096e-06, + "loss": 0.2484, + "step": 12244 + }, + { + "epoch": 0.3098666396740643, + "grad_norm": 5.890646457672119, + "learning_rate": 7.902197095390019e-06, + "loss": 0.1424, + "step": 12245 + }, + { + "epoch": 0.309891945238758, + "grad_norm": 10.146862983703613, + "learning_rate": 7.901870124850213e-06, + "loss": 0.2423, + "step": 12246 + }, + { + "epoch": 0.30991725080345167, + "grad_norm": 4.657558441162109, + "learning_rate": 7.901543135596788e-06, + "loss": 0.1718, + "step": 12247 + }, + { + "epoch": 0.3099425563681453, + "grad_norm": 7.884860992431641, + "learning_rate": 7.901216127631848e-06, + "loss": 0.1488, + "step": 12248 + }, + { + "epoch": 0.30996786193283904, + "grad_norm": 4.799698352813721, + "learning_rate": 7.900889100957509e-06, + "loss": 0.1484, + "step": 12249 + }, + { + "epoch": 0.3099931674975327, + "grad_norm": 7.132423400878906, + "learning_rate": 7.900562055575873e-06, + "loss": 0.2881, + "step": 12250 + }, + { + "epoch": 0.3100184730622264, + "grad_norm": 6.490077018737793, + "learning_rate": 7.900234991489054e-06, + "loss": 0.2414, + "step": 12251 + }, + { + "epoch": 0.31004377862692006, + "grad_norm": 6.231410503387451, + "learning_rate": 7.899907908699156e-06, + "loss": 0.225, + "step": 12252 + }, + { + "epoch": 0.3100690841916137, + "grad_norm": 4.181406021118164, + "learning_rate": 7.89958080720829e-06, + "loss": 0.1323, + "step": 12253 + }, + { + "epoch": 0.31009438975630743, + "grad_norm": 3.9537339210510254, + "learning_rate": 7.899253687018571e-06, + "loss": 0.1095, + "step": 12254 + }, + { + "epoch": 0.3101196953210011, + "grad_norm": 4.603212833404541, + "learning_rate": 7.898926548132102e-06, + "loss": 0.1016, + "step": 12255 + }, + { + "epoch": 0.31014500088569474, + "grad_norm": 4.599337100982666, + "learning_rate": 7.898599390550998e-06, + "loss": 0.1615, + "step": 12256 + }, + { + "epoch": 0.31017030645038846, + "grad_norm": 7.816036701202393, + "learning_rate": 7.898272214277362e-06, + "loss": 0.1704, + "step": 12257 + }, + { + "epoch": 0.3101956120150821, + "grad_norm": 5.116186141967773, + "learning_rate": 7.897945019313311e-06, + "loss": 0.1228, + "step": 12258 + }, + { + "epoch": 0.31022091757977577, + "grad_norm": 6.679928302764893, + "learning_rate": 7.897617805660948e-06, + "loss": 0.1215, + "step": 12259 + }, + { + "epoch": 0.3102462231444695, + "grad_norm": 11.35606575012207, + "learning_rate": 7.89729057332239e-06, + "loss": 0.1887, + "step": 12260 + }, + { + "epoch": 0.31027152870916314, + "grad_norm": 3.466402769088745, + "learning_rate": 7.896963322299743e-06, + "loss": 0.1436, + "step": 12261 + }, + { + "epoch": 0.31029683427385685, + "grad_norm": 4.3621416091918945, + "learning_rate": 7.896636052595117e-06, + "loss": 0.1315, + "step": 12262 + }, + { + "epoch": 0.3103221398385505, + "grad_norm": 9.482474327087402, + "learning_rate": 7.896308764210626e-06, + "loss": 0.2522, + "step": 12263 + }, + { + "epoch": 0.31034744540324416, + "grad_norm": 8.409112930297852, + "learning_rate": 7.895981457148376e-06, + "loss": 0.1553, + "step": 12264 + }, + { + "epoch": 0.31037275096793787, + "grad_norm": 15.930463790893555, + "learning_rate": 7.895654131410483e-06, + "loss": 0.2468, + "step": 12265 + }, + { + "epoch": 0.3103980565326315, + "grad_norm": 7.224931240081787, + "learning_rate": 7.895326786999052e-06, + "loss": 0.1618, + "step": 12266 + }, + { + "epoch": 0.3104233620973252, + "grad_norm": 5.092609405517578, + "learning_rate": 7.894999423916199e-06, + "loss": 0.2214, + "step": 12267 + }, + { + "epoch": 0.3104486676620189, + "grad_norm": 6.990938663482666, + "learning_rate": 7.894672042164034e-06, + "loss": 0.2097, + "step": 12268 + }, + { + "epoch": 0.31047397322671255, + "grad_norm": 7.005984783172607, + "learning_rate": 7.894344641744667e-06, + "loss": 0.1761, + "step": 12269 + }, + { + "epoch": 0.3104992787914062, + "grad_norm": 4.477050304412842, + "learning_rate": 7.894017222660209e-06, + "loss": 0.188, + "step": 12270 + }, + { + "epoch": 0.3105245843560999, + "grad_norm": 6.244382858276367, + "learning_rate": 7.893689784912773e-06, + "loss": 0.1963, + "step": 12271 + }, + { + "epoch": 0.3105498899207936, + "grad_norm": 3.2568957805633545, + "learning_rate": 7.893362328504468e-06, + "loss": 0.1683, + "step": 12272 + }, + { + "epoch": 0.31057519548548723, + "grad_norm": 4.13332986831665, + "learning_rate": 7.893034853437411e-06, + "loss": 0.1683, + "step": 12273 + }, + { + "epoch": 0.31060050105018094, + "grad_norm": 4.737875938415527, + "learning_rate": 7.892707359713707e-06, + "loss": 0.2077, + "step": 12274 + }, + { + "epoch": 0.3106258066148746, + "grad_norm": 5.441653728485107, + "learning_rate": 7.892379847335474e-06, + "loss": 0.2419, + "step": 12275 + }, + { + "epoch": 0.3106511121795683, + "grad_norm": 6.36318826675415, + "learning_rate": 7.89205231630482e-06, + "loss": 0.1823, + "step": 12276 + }, + { + "epoch": 0.31067641774426197, + "grad_norm": 5.760793209075928, + "learning_rate": 7.891724766623859e-06, + "loss": 0.1822, + "step": 12277 + }, + { + "epoch": 0.3107017233089556, + "grad_norm": 3.997852087020874, + "learning_rate": 7.891397198294703e-06, + "loss": 0.1295, + "step": 12278 + }, + { + "epoch": 0.31072702887364934, + "grad_norm": 5.149819850921631, + "learning_rate": 7.891069611319463e-06, + "loss": 0.1435, + "step": 12279 + }, + { + "epoch": 0.310752334438343, + "grad_norm": 4.738735675811768, + "learning_rate": 7.890742005700254e-06, + "loss": 0.1973, + "step": 12280 + }, + { + "epoch": 0.31077764000303665, + "grad_norm": 16.317380905151367, + "learning_rate": 7.890414381439187e-06, + "loss": 0.3315, + "step": 12281 + }, + { + "epoch": 0.31080294556773036, + "grad_norm": 6.606010913848877, + "learning_rate": 7.890086738538377e-06, + "loss": 0.1796, + "step": 12282 + }, + { + "epoch": 0.310828251132424, + "grad_norm": 4.967373847961426, + "learning_rate": 7.889759076999931e-06, + "loss": 0.169, + "step": 12283 + }, + { + "epoch": 0.31085355669711767, + "grad_norm": 4.665699005126953, + "learning_rate": 7.88943139682597e-06, + "loss": 0.1687, + "step": 12284 + }, + { + "epoch": 0.3108788622618114, + "grad_norm": 4.387739658355713, + "learning_rate": 7.889103698018603e-06, + "loss": 0.1515, + "step": 12285 + }, + { + "epoch": 0.31090416782650504, + "grad_norm": 11.486573219299316, + "learning_rate": 7.888775980579944e-06, + "loss": 0.275, + "step": 12286 + }, + { + "epoch": 0.31092947339119875, + "grad_norm": 6.273064136505127, + "learning_rate": 7.888448244512105e-06, + "loss": 0.2115, + "step": 12287 + }, + { + "epoch": 0.3109547789558924, + "grad_norm": 2.638784885406494, + "learning_rate": 7.8881204898172e-06, + "loss": 0.0964, + "step": 12288 + }, + { + "epoch": 0.31098008452058606, + "grad_norm": 5.290248870849609, + "learning_rate": 7.887792716497344e-06, + "loss": 0.1394, + "step": 12289 + }, + { + "epoch": 0.3110053900852798, + "grad_norm": 5.790405750274658, + "learning_rate": 7.887464924554651e-06, + "loss": 0.1932, + "step": 12290 + }, + { + "epoch": 0.31103069564997343, + "grad_norm": 4.6122612953186035, + "learning_rate": 7.887137113991232e-06, + "loss": 0.1572, + "step": 12291 + }, + { + "epoch": 0.3110560012146671, + "grad_norm": 4.356723308563232, + "learning_rate": 7.886809284809204e-06, + "loss": 0.1165, + "step": 12292 + }, + { + "epoch": 0.3110813067793608, + "grad_norm": 9.445384979248047, + "learning_rate": 7.88648143701068e-06, + "loss": 0.1812, + "step": 12293 + }, + { + "epoch": 0.31110661234405446, + "grad_norm": 6.542433261871338, + "learning_rate": 7.886153570597772e-06, + "loss": 0.2103, + "step": 12294 + }, + { + "epoch": 0.3111319179087481, + "grad_norm": 6.567593574523926, + "learning_rate": 7.8858256855726e-06, + "loss": 0.1544, + "step": 12295 + }, + { + "epoch": 0.3111572234734418, + "grad_norm": 3.2896993160247803, + "learning_rate": 7.885497781937272e-06, + "loss": 0.1718, + "step": 12296 + }, + { + "epoch": 0.3111825290381355, + "grad_norm": 6.375735759735107, + "learning_rate": 7.885169859693907e-06, + "loss": 0.2171, + "step": 12297 + }, + { + "epoch": 0.31120783460282914, + "grad_norm": 8.182901382446289, + "learning_rate": 7.884841918844617e-06, + "loss": 0.1843, + "step": 12298 + }, + { + "epoch": 0.31123314016752285, + "grad_norm": 4.036660671234131, + "learning_rate": 7.884513959391518e-06, + "loss": 0.1594, + "step": 12299 + }, + { + "epoch": 0.3112584457322165, + "grad_norm": 6.547786712646484, + "learning_rate": 7.884185981336727e-06, + "loss": 0.239, + "step": 12300 + }, + { + "epoch": 0.3112837512969102, + "grad_norm": 6.4362945556640625, + "learning_rate": 7.883857984682355e-06, + "loss": 0.1874, + "step": 12301 + }, + { + "epoch": 0.3113090568616039, + "grad_norm": 7.562295436859131, + "learning_rate": 7.88352996943052e-06, + "loss": 0.2057, + "step": 12302 + }, + { + "epoch": 0.31133436242629753, + "grad_norm": 5.503317832946777, + "learning_rate": 7.883201935583337e-06, + "loss": 0.2038, + "step": 12303 + }, + { + "epoch": 0.31135966799099124, + "grad_norm": 35.694095611572266, + "learning_rate": 7.882873883142922e-06, + "loss": 0.1993, + "step": 12304 + }, + { + "epoch": 0.3113849735556849, + "grad_norm": 5.097553253173828, + "learning_rate": 7.882545812111389e-06, + "loss": 0.2632, + "step": 12305 + }, + { + "epoch": 0.31141027912037855, + "grad_norm": 6.6246418952941895, + "learning_rate": 7.882217722490854e-06, + "loss": 0.2224, + "step": 12306 + }, + { + "epoch": 0.31143558468507226, + "grad_norm": 3.933525323867798, + "learning_rate": 7.881889614283433e-06, + "loss": 0.1328, + "step": 12307 + }, + { + "epoch": 0.3114608902497659, + "grad_norm": 6.512136936187744, + "learning_rate": 7.88156148749124e-06, + "loss": 0.2569, + "step": 12308 + }, + { + "epoch": 0.3114861958144596, + "grad_norm": 7.339579105377197, + "learning_rate": 7.881233342116396e-06, + "loss": 0.1764, + "step": 12309 + }, + { + "epoch": 0.3115115013791533, + "grad_norm": 7.150688648223877, + "learning_rate": 7.880905178161013e-06, + "loss": 0.2025, + "step": 12310 + }, + { + "epoch": 0.31153680694384694, + "grad_norm": 7.964991092681885, + "learning_rate": 7.88057699562721e-06, + "loss": 0.1951, + "step": 12311 + }, + { + "epoch": 0.3115621125085406, + "grad_norm": 6.301798343658447, + "learning_rate": 7.880248794517099e-06, + "loss": 0.112, + "step": 12312 + }, + { + "epoch": 0.3115874180732343, + "grad_norm": 10.718547821044922, + "learning_rate": 7.8799205748328e-06, + "loss": 0.2163, + "step": 12313 + }, + { + "epoch": 0.31161272363792797, + "grad_norm": 6.005395889282227, + "learning_rate": 7.879592336576429e-06, + "loss": 0.228, + "step": 12314 + }, + { + "epoch": 0.3116380292026217, + "grad_norm": 4.749809265136719, + "learning_rate": 7.879264079750104e-06, + "loss": 0.2192, + "step": 12315 + }, + { + "epoch": 0.31166333476731534, + "grad_norm": 3.076564311981201, + "learning_rate": 7.878935804355938e-06, + "loss": 0.1241, + "step": 12316 + }, + { + "epoch": 0.311688640332009, + "grad_norm": 3.6299712657928467, + "learning_rate": 7.878607510396052e-06, + "loss": 0.109, + "step": 12317 + }, + { + "epoch": 0.3117139458967027, + "grad_norm": 6.863562107086182, + "learning_rate": 7.878279197872563e-06, + "loss": 0.1471, + "step": 12318 + }, + { + "epoch": 0.31173925146139636, + "grad_norm": 5.3864617347717285, + "learning_rate": 7.877950866787585e-06, + "loss": 0.235, + "step": 12319 + }, + { + "epoch": 0.31176455702609, + "grad_norm": 8.30197811126709, + "learning_rate": 7.877622517143238e-06, + "loss": 0.1931, + "step": 12320 + }, + { + "epoch": 0.31178986259078373, + "grad_norm": 7.154334545135498, + "learning_rate": 7.877294148941638e-06, + "loss": 0.217, + "step": 12321 + }, + { + "epoch": 0.3118151681554774, + "grad_norm": 3.735438346862793, + "learning_rate": 7.876965762184903e-06, + "loss": 0.1953, + "step": 12322 + }, + { + "epoch": 0.31184047372017104, + "grad_norm": 6.528133392333984, + "learning_rate": 7.876637356875152e-06, + "loss": 0.1843, + "step": 12323 + }, + { + "epoch": 0.31186577928486475, + "grad_norm": 9.983258247375488, + "learning_rate": 7.876308933014502e-06, + "loss": 0.189, + "step": 12324 + }, + { + "epoch": 0.3118910848495584, + "grad_norm": 6.5024213790893555, + "learning_rate": 7.87598049060507e-06, + "loss": 0.1306, + "step": 12325 + }, + { + "epoch": 0.3119163904142521, + "grad_norm": 6.826806545257568, + "learning_rate": 7.875652029648974e-06, + "loss": 0.2395, + "step": 12326 + }, + { + "epoch": 0.3119416959789458, + "grad_norm": 2.6888155937194824, + "learning_rate": 7.875323550148335e-06, + "loss": 0.0787, + "step": 12327 + }, + { + "epoch": 0.31196700154363943, + "grad_norm": 12.117980003356934, + "learning_rate": 7.874995052105267e-06, + "loss": 0.2589, + "step": 12328 + }, + { + "epoch": 0.31199230710833314, + "grad_norm": 3.1019551753997803, + "learning_rate": 7.874666535521891e-06, + "loss": 0.1188, + "step": 12329 + }, + { + "epoch": 0.3120176126730268, + "grad_norm": 9.294692993164062, + "learning_rate": 7.874338000400327e-06, + "loss": 0.2679, + "step": 12330 + }, + { + "epoch": 0.31204291823772046, + "grad_norm": 5.672503471374512, + "learning_rate": 7.874009446742692e-06, + "loss": 0.1851, + "step": 12331 + }, + { + "epoch": 0.31206822380241417, + "grad_norm": 4.287254810333252, + "learning_rate": 7.873680874551102e-06, + "loss": 0.0978, + "step": 12332 + }, + { + "epoch": 0.3120935293671078, + "grad_norm": 4.809192657470703, + "learning_rate": 7.873352283827681e-06, + "loss": 0.166, + "step": 12333 + }, + { + "epoch": 0.3121188349318015, + "grad_norm": 9.003876686096191, + "learning_rate": 7.873023674574544e-06, + "loss": 0.2212, + "step": 12334 + }, + { + "epoch": 0.3121441404964952, + "grad_norm": 3.7333364486694336, + "learning_rate": 7.872695046793815e-06, + "loss": 0.1562, + "step": 12335 + }, + { + "epoch": 0.31216944606118885, + "grad_norm": 2.744185447692871, + "learning_rate": 7.872366400487606e-06, + "loss": 0.1273, + "step": 12336 + }, + { + "epoch": 0.3121947516258825, + "grad_norm": 5.962776184082031, + "learning_rate": 7.872037735658042e-06, + "loss": 0.235, + "step": 12337 + }, + { + "epoch": 0.3122200571905762, + "grad_norm": 8.737730026245117, + "learning_rate": 7.871709052307241e-06, + "loss": 0.2905, + "step": 12338 + }, + { + "epoch": 0.3122453627552699, + "grad_norm": 10.71473503112793, + "learning_rate": 7.871380350437323e-06, + "loss": 0.3899, + "step": 12339 + }, + { + "epoch": 0.3122706683199636, + "grad_norm": 7.923318386077881, + "learning_rate": 7.871051630050405e-06, + "loss": 0.1703, + "step": 12340 + }, + { + "epoch": 0.31229597388465724, + "grad_norm": 3.888714075088501, + "learning_rate": 7.870722891148613e-06, + "loss": 0.1533, + "step": 12341 + }, + { + "epoch": 0.3123212794493509, + "grad_norm": 7.0793938636779785, + "learning_rate": 7.87039413373406e-06, + "loss": 0.1635, + "step": 12342 + }, + { + "epoch": 0.3123465850140446, + "grad_norm": 15.682988166809082, + "learning_rate": 7.870065357808869e-06, + "loss": 0.2932, + "step": 12343 + }, + { + "epoch": 0.31237189057873826, + "grad_norm": 8.853097915649414, + "learning_rate": 7.869736563375163e-06, + "loss": 0.1929, + "step": 12344 + }, + { + "epoch": 0.3123971961434319, + "grad_norm": 5.33103609085083, + "learning_rate": 7.869407750435056e-06, + "loss": 0.2258, + "step": 12345 + }, + { + "epoch": 0.31242250170812563, + "grad_norm": 12.361608505249023, + "learning_rate": 7.869078918990675e-06, + "loss": 0.2162, + "step": 12346 + }, + { + "epoch": 0.3124478072728193, + "grad_norm": 7.602045059204102, + "learning_rate": 7.868750069044137e-06, + "loss": 0.2541, + "step": 12347 + }, + { + "epoch": 0.31247311283751295, + "grad_norm": 6.273939609527588, + "learning_rate": 7.868421200597563e-06, + "loss": 0.1282, + "step": 12348 + }, + { + "epoch": 0.31249841840220666, + "grad_norm": 14.728657722473145, + "learning_rate": 7.868092313653075e-06, + "loss": 0.3406, + "step": 12349 + }, + { + "epoch": 0.3125237239669003, + "grad_norm": 4.421978950500488, + "learning_rate": 7.867763408212792e-06, + "loss": 0.1179, + "step": 12350 + }, + { + "epoch": 0.312549029531594, + "grad_norm": 3.9506773948669434, + "learning_rate": 7.867434484278838e-06, + "loss": 0.1616, + "step": 12351 + }, + { + "epoch": 0.3125743350962877, + "grad_norm": 8.806584358215332, + "learning_rate": 7.86710554185333e-06, + "loss": 0.1551, + "step": 12352 + }, + { + "epoch": 0.31259964066098134, + "grad_norm": 6.474493026733398, + "learning_rate": 7.866776580938396e-06, + "loss": 0.2202, + "step": 12353 + }, + { + "epoch": 0.31262494622567505, + "grad_norm": 4.725464344024658, + "learning_rate": 7.86644760153615e-06, + "loss": 0.1455, + "step": 12354 + }, + { + "epoch": 0.3126502517903687, + "grad_norm": 4.626962661743164, + "learning_rate": 7.866118603648716e-06, + "loss": 0.2257, + "step": 12355 + }, + { + "epoch": 0.31267555735506236, + "grad_norm": 10.64315128326416, + "learning_rate": 7.865789587278217e-06, + "loss": 0.0883, + "step": 12356 + }, + { + "epoch": 0.3127008629197561, + "grad_norm": 5.062471866607666, + "learning_rate": 7.865460552426774e-06, + "loss": 0.1888, + "step": 12357 + }, + { + "epoch": 0.31272616848444973, + "grad_norm": 3.478276491165161, + "learning_rate": 7.865131499096509e-06, + "loss": 0.1387, + "step": 12358 + }, + { + "epoch": 0.3127514740491434, + "grad_norm": 6.147737979888916, + "learning_rate": 7.864802427289545e-06, + "loss": 0.1817, + "step": 12359 + }, + { + "epoch": 0.3127767796138371, + "grad_norm": 4.284268856048584, + "learning_rate": 7.864473337008001e-06, + "loss": 0.0818, + "step": 12360 + }, + { + "epoch": 0.31280208517853075, + "grad_norm": 7.126986503601074, + "learning_rate": 7.864144228254002e-06, + "loss": 0.2322, + "step": 12361 + }, + { + "epoch": 0.3128273907432244, + "grad_norm": 7.152819633483887, + "learning_rate": 7.86381510102967e-06, + "loss": 0.3165, + "step": 12362 + }, + { + "epoch": 0.3128526963079181, + "grad_norm": 18.607437133789062, + "learning_rate": 7.863485955337127e-06, + "loss": 0.2807, + "step": 12363 + }, + { + "epoch": 0.3128780018726118, + "grad_norm": 7.127747058868408, + "learning_rate": 7.863156791178496e-06, + "loss": 0.2035, + "step": 12364 + }, + { + "epoch": 0.3129033074373055, + "grad_norm": 4.394076347351074, + "learning_rate": 7.862827608555899e-06, + "loss": 0.0722, + "step": 12365 + }, + { + "epoch": 0.31292861300199915, + "grad_norm": 6.469498157501221, + "learning_rate": 7.86249840747146e-06, + "loss": 0.2153, + "step": 12366 + }, + { + "epoch": 0.3129539185666928, + "grad_norm": 9.391091346740723, + "learning_rate": 7.8621691879273e-06, + "loss": 0.2169, + "step": 12367 + }, + { + "epoch": 0.3129792241313865, + "grad_norm": 5.845604419708252, + "learning_rate": 7.861839949925544e-06, + "loss": 0.2013, + "step": 12368 + }, + { + "epoch": 0.31300452969608017, + "grad_norm": 4.688770294189453, + "learning_rate": 7.861510693468315e-06, + "loss": 0.206, + "step": 12369 + }, + { + "epoch": 0.3130298352607738, + "grad_norm": 2.950723886489868, + "learning_rate": 7.861181418557734e-06, + "loss": 0.1071, + "step": 12370 + }, + { + "epoch": 0.31305514082546754, + "grad_norm": 2.6457583904266357, + "learning_rate": 7.860852125195928e-06, + "loss": 0.1623, + "step": 12371 + }, + { + "epoch": 0.3130804463901612, + "grad_norm": 5.889560699462891, + "learning_rate": 7.860522813385018e-06, + "loss": 0.2106, + "step": 12372 + }, + { + "epoch": 0.31310575195485485, + "grad_norm": 5.343464374542236, + "learning_rate": 7.86019348312713e-06, + "loss": 0.1603, + "step": 12373 + }, + { + "epoch": 0.31313105751954856, + "grad_norm": 9.38215160369873, + "learning_rate": 7.859864134424383e-06, + "loss": 0.2226, + "step": 12374 + }, + { + "epoch": 0.3131563630842422, + "grad_norm": 3.6013190746307373, + "learning_rate": 7.85953476727891e-06, + "loss": 0.1133, + "step": 12375 + }, + { + "epoch": 0.3131816686489359, + "grad_norm": 5.32080078125, + "learning_rate": 7.859205381692824e-06, + "loss": 0.2394, + "step": 12376 + }, + { + "epoch": 0.3132069742136296, + "grad_norm": 5.7811174392700195, + "learning_rate": 7.858875977668255e-06, + "loss": 0.1727, + "step": 12377 + }, + { + "epoch": 0.31323227977832324, + "grad_norm": 4.888936996459961, + "learning_rate": 7.858546555207329e-06, + "loss": 0.1451, + "step": 12378 + }, + { + "epoch": 0.31325758534301695, + "grad_norm": 3.6687817573547363, + "learning_rate": 7.858217114312165e-06, + "loss": 0.1926, + "step": 12379 + }, + { + "epoch": 0.3132828909077106, + "grad_norm": 5.161199569702148, + "learning_rate": 7.857887654984892e-06, + "loss": 0.1245, + "step": 12380 + }, + { + "epoch": 0.31330819647240427, + "grad_norm": 13.31037712097168, + "learning_rate": 7.857558177227634e-06, + "loss": 0.2391, + "step": 12381 + }, + { + "epoch": 0.313333502037098, + "grad_norm": 14.792901039123535, + "learning_rate": 7.857228681042514e-06, + "loss": 0.2848, + "step": 12382 + }, + { + "epoch": 0.31335880760179163, + "grad_norm": 5.306642532348633, + "learning_rate": 7.856899166431657e-06, + "loss": 0.1087, + "step": 12383 + }, + { + "epoch": 0.3133841131664853, + "grad_norm": 6.542719841003418, + "learning_rate": 7.85656963339719e-06, + "loss": 0.1948, + "step": 12384 + }, + { + "epoch": 0.313409418731179, + "grad_norm": 18.00804901123047, + "learning_rate": 7.856240081941236e-06, + "loss": 0.2437, + "step": 12385 + }, + { + "epoch": 0.31343472429587266, + "grad_norm": 5.371174335479736, + "learning_rate": 7.85591051206592e-06, + "loss": 0.2076, + "step": 12386 + }, + { + "epoch": 0.3134600298605663, + "grad_norm": 3.4010894298553467, + "learning_rate": 7.85558092377337e-06, + "loss": 0.0959, + "step": 12387 + }, + { + "epoch": 0.31348533542526, + "grad_norm": 7.298402786254883, + "learning_rate": 7.855251317065711e-06, + "loss": 0.2081, + "step": 12388 + }, + { + "epoch": 0.3135106409899537, + "grad_norm": 8.405644416809082, + "learning_rate": 7.854921691945064e-06, + "loss": 0.2442, + "step": 12389 + }, + { + "epoch": 0.3135359465546474, + "grad_norm": 4.07550573348999, + "learning_rate": 7.854592048413561e-06, + "loss": 0.0963, + "step": 12390 + }, + { + "epoch": 0.31356125211934105, + "grad_norm": 14.337911605834961, + "learning_rate": 7.854262386473322e-06, + "loss": 0.2245, + "step": 12391 + }, + { + "epoch": 0.3135865576840347, + "grad_norm": 4.1259541511535645, + "learning_rate": 7.853932706126477e-06, + "loss": 0.1586, + "step": 12392 + }, + { + "epoch": 0.3136118632487284, + "grad_norm": 8.938579559326172, + "learning_rate": 7.853603007375152e-06, + "loss": 0.2723, + "step": 12393 + }, + { + "epoch": 0.3136371688134221, + "grad_norm": 5.089425086975098, + "learning_rate": 7.853273290221471e-06, + "loss": 0.177, + "step": 12394 + }, + { + "epoch": 0.31366247437811573, + "grad_norm": 7.340662956237793, + "learning_rate": 7.852943554667562e-06, + "loss": 0.1705, + "step": 12395 + }, + { + "epoch": 0.31368777994280944, + "grad_norm": 7.659252166748047, + "learning_rate": 7.85261380071555e-06, + "loss": 0.2076, + "step": 12396 + }, + { + "epoch": 0.3137130855075031, + "grad_norm": 6.542447566986084, + "learning_rate": 7.852284028367564e-06, + "loss": 0.1816, + "step": 12397 + }, + { + "epoch": 0.31373839107219675, + "grad_norm": 7.890366554260254, + "learning_rate": 7.851954237625728e-06, + "loss": 0.2385, + "step": 12398 + }, + { + "epoch": 0.31376369663689047, + "grad_norm": 2.627312660217285, + "learning_rate": 7.851624428492169e-06, + "loss": 0.1176, + "step": 12399 + }, + { + "epoch": 0.3137890022015841, + "grad_norm": 4.256875038146973, + "learning_rate": 7.851294600969013e-06, + "loss": 0.1428, + "step": 12400 + }, + { + "epoch": 0.3138143077662778, + "grad_norm": 9.082621574401855, + "learning_rate": 7.85096475505839e-06, + "loss": 0.2548, + "step": 12401 + }, + { + "epoch": 0.3138396133309715, + "grad_norm": 6.673600196838379, + "learning_rate": 7.850634890762425e-06, + "loss": 0.1993, + "step": 12402 + }, + { + "epoch": 0.31386491889566515, + "grad_norm": 7.314170837402344, + "learning_rate": 7.850305008083248e-06, + "loss": 0.3317, + "step": 12403 + }, + { + "epoch": 0.31389022446035886, + "grad_norm": 4.650983810424805, + "learning_rate": 7.849975107022983e-06, + "loss": 0.1671, + "step": 12404 + }, + { + "epoch": 0.3139155300250525, + "grad_norm": 9.068941116333008, + "learning_rate": 7.849645187583757e-06, + "loss": 0.3468, + "step": 12405 + }, + { + "epoch": 0.31394083558974617, + "grad_norm": 7.4350762367248535, + "learning_rate": 7.8493152497677e-06, + "loss": 0.244, + "step": 12406 + }, + { + "epoch": 0.3139661411544399, + "grad_norm": 4.783388137817383, + "learning_rate": 7.84898529357694e-06, + "loss": 0.1917, + "step": 12407 + }, + { + "epoch": 0.31399144671913354, + "grad_norm": 4.234922409057617, + "learning_rate": 7.848655319013604e-06, + "loss": 0.1885, + "step": 12408 + }, + { + "epoch": 0.3140167522838272, + "grad_norm": 4.1014790534973145, + "learning_rate": 7.84832532607982e-06, + "loss": 0.1646, + "step": 12409 + }, + { + "epoch": 0.3140420578485209, + "grad_norm": 4.51956033706665, + "learning_rate": 7.847995314777714e-06, + "loss": 0.1684, + "step": 12410 + }, + { + "epoch": 0.31406736341321456, + "grad_norm": 5.0085601806640625, + "learning_rate": 7.847665285109417e-06, + "loss": 0.2067, + "step": 12411 + }, + { + "epoch": 0.3140926689779082, + "grad_norm": 4.70938777923584, + "learning_rate": 7.847335237077055e-06, + "loss": 0.1642, + "step": 12412 + }, + { + "epoch": 0.31411797454260193, + "grad_norm": 3.811657667160034, + "learning_rate": 7.84700517068276e-06, + "loss": 0.1544, + "step": 12413 + }, + { + "epoch": 0.3141432801072956, + "grad_norm": 5.675346374511719, + "learning_rate": 7.846675085928657e-06, + "loss": 0.1251, + "step": 12414 + }, + { + "epoch": 0.3141685856719893, + "grad_norm": 7.445523262023926, + "learning_rate": 7.846344982816876e-06, + "loss": 0.2348, + "step": 12415 + }, + { + "epoch": 0.31419389123668295, + "grad_norm": 8.495972633361816, + "learning_rate": 7.846014861349546e-06, + "loss": 0.2462, + "step": 12416 + }, + { + "epoch": 0.3142191968013766, + "grad_norm": 5.027403831481934, + "learning_rate": 7.845684721528795e-06, + "loss": 0.0899, + "step": 12417 + }, + { + "epoch": 0.3142445023660703, + "grad_norm": 8.588398933410645, + "learning_rate": 7.845354563356753e-06, + "loss": 0.157, + "step": 12418 + }, + { + "epoch": 0.314269807930764, + "grad_norm": 4.607921123504639, + "learning_rate": 7.845024386835548e-06, + "loss": 0.2202, + "step": 12419 + }, + { + "epoch": 0.31429511349545763, + "grad_norm": 7.870711803436279, + "learning_rate": 7.84469419196731e-06, + "loss": 0.1835, + "step": 12420 + }, + { + "epoch": 0.31432041906015135, + "grad_norm": 3.7266905307769775, + "learning_rate": 7.84436397875417e-06, + "loss": 0.1142, + "step": 12421 + }, + { + "epoch": 0.314345724624845, + "grad_norm": 6.204106330871582, + "learning_rate": 7.844033747198254e-06, + "loss": 0.1368, + "step": 12422 + }, + { + "epoch": 0.31437103018953866, + "grad_norm": 3.531604290008545, + "learning_rate": 7.843703497301694e-06, + "loss": 0.1199, + "step": 12423 + }, + { + "epoch": 0.31439633575423237, + "grad_norm": 3.6500449180603027, + "learning_rate": 7.84337322906662e-06, + "loss": 0.1853, + "step": 12424 + }, + { + "epoch": 0.314421641318926, + "grad_norm": 4.299424171447754, + "learning_rate": 7.843042942495159e-06, + "loss": 0.1166, + "step": 12425 + }, + { + "epoch": 0.3144469468836197, + "grad_norm": 5.875302791595459, + "learning_rate": 7.842712637589445e-06, + "loss": 0.1204, + "step": 12426 + }, + { + "epoch": 0.3144722524483134, + "grad_norm": 3.40742564201355, + "learning_rate": 7.842382314351605e-06, + "loss": 0.1308, + "step": 12427 + }, + { + "epoch": 0.31449755801300705, + "grad_norm": 11.173999786376953, + "learning_rate": 7.84205197278377e-06, + "loss": 0.2492, + "step": 12428 + }, + { + "epoch": 0.31452286357770076, + "grad_norm": 12.562252044677734, + "learning_rate": 7.84172161288807e-06, + "loss": 0.2276, + "step": 12429 + }, + { + "epoch": 0.3145481691423944, + "grad_norm": 11.201213836669922, + "learning_rate": 7.841391234666637e-06, + "loss": 0.3129, + "step": 12430 + }, + { + "epoch": 0.3145734747070881, + "grad_norm": 2.971912145614624, + "learning_rate": 7.841060838121599e-06, + "loss": 0.161, + "step": 12431 + }, + { + "epoch": 0.3145987802717818, + "grad_norm": 6.2718586921691895, + "learning_rate": 7.840730423255089e-06, + "loss": 0.197, + "step": 12432 + }, + { + "epoch": 0.31462408583647544, + "grad_norm": 4.0633416175842285, + "learning_rate": 7.840399990069239e-06, + "loss": 0.1504, + "step": 12433 + }, + { + "epoch": 0.3146493914011691, + "grad_norm": 6.857402324676514, + "learning_rate": 7.840069538566175e-06, + "loss": 0.1976, + "step": 12434 + }, + { + "epoch": 0.3146746969658628, + "grad_norm": 4.883126735687256, + "learning_rate": 7.839739068748032e-06, + "loss": 0.1676, + "step": 12435 + }, + { + "epoch": 0.31470000253055647, + "grad_norm": 9.680171012878418, + "learning_rate": 7.839408580616938e-06, + "loss": 0.1784, + "step": 12436 + }, + { + "epoch": 0.3147253080952501, + "grad_norm": 3.0005555152893066, + "learning_rate": 7.83907807417503e-06, + "loss": 0.1325, + "step": 12437 + }, + { + "epoch": 0.31475061365994383, + "grad_norm": 5.3666510581970215, + "learning_rate": 7.838747549424432e-06, + "loss": 0.2013, + "step": 12438 + }, + { + "epoch": 0.3147759192246375, + "grad_norm": 9.80034065246582, + "learning_rate": 7.838417006367281e-06, + "loss": 0.1858, + "step": 12439 + }, + { + "epoch": 0.31480122478933115, + "grad_norm": 4.217283248901367, + "learning_rate": 7.838086445005707e-06, + "loss": 0.1926, + "step": 12440 + }, + { + "epoch": 0.31482653035402486, + "grad_norm": 6.326805591583252, + "learning_rate": 7.83775586534184e-06, + "loss": 0.2398, + "step": 12441 + }, + { + "epoch": 0.3148518359187185, + "grad_norm": 5.281457901000977, + "learning_rate": 7.837425267377814e-06, + "loss": 0.0972, + "step": 12442 + }, + { + "epoch": 0.3148771414834122, + "grad_norm": 7.520423889160156, + "learning_rate": 7.837094651115762e-06, + "loss": 0.3357, + "step": 12443 + }, + { + "epoch": 0.3149024470481059, + "grad_norm": 3.40700364112854, + "learning_rate": 7.836764016557811e-06, + "loss": 0.1642, + "step": 12444 + }, + { + "epoch": 0.31492775261279954, + "grad_norm": 5.908158302307129, + "learning_rate": 7.836433363706097e-06, + "loss": 0.2095, + "step": 12445 + }, + { + "epoch": 0.31495305817749325, + "grad_norm": 4.530211448669434, + "learning_rate": 7.836102692562755e-06, + "loss": 0.1613, + "step": 12446 + }, + { + "epoch": 0.3149783637421869, + "grad_norm": 9.365180969238281, + "learning_rate": 7.835772003129911e-06, + "loss": 0.2591, + "step": 12447 + }, + { + "epoch": 0.31500366930688056, + "grad_norm": 5.092117786407471, + "learning_rate": 7.835441295409703e-06, + "loss": 0.2128, + "step": 12448 + }, + { + "epoch": 0.3150289748715743, + "grad_norm": 6.343091011047363, + "learning_rate": 7.835110569404261e-06, + "loss": 0.1978, + "step": 12449 + }, + { + "epoch": 0.31505428043626793, + "grad_norm": 4.273496150970459, + "learning_rate": 7.834779825115716e-06, + "loss": 0.1011, + "step": 12450 + }, + { + "epoch": 0.3150795860009616, + "grad_norm": 5.7862548828125, + "learning_rate": 7.834449062546207e-06, + "loss": 0.1961, + "step": 12451 + }, + { + "epoch": 0.3151048915656553, + "grad_norm": 3.5637729167938232, + "learning_rate": 7.834118281697862e-06, + "loss": 0.1793, + "step": 12452 + }, + { + "epoch": 0.31513019713034895, + "grad_norm": 4.05575704574585, + "learning_rate": 7.833787482572814e-06, + "loss": 0.1389, + "step": 12453 + }, + { + "epoch": 0.31515550269504267, + "grad_norm": 5.963667392730713, + "learning_rate": 7.8334566651732e-06, + "loss": 0.1709, + "step": 12454 + }, + { + "epoch": 0.3151808082597363, + "grad_norm": 6.020655155181885, + "learning_rate": 7.833125829501149e-06, + "loss": 0.2189, + "step": 12455 + }, + { + "epoch": 0.31520611382443, + "grad_norm": 5.030003547668457, + "learning_rate": 7.832794975558795e-06, + "loss": 0.1772, + "step": 12456 + }, + { + "epoch": 0.3152314193891237, + "grad_norm": 18.649629592895508, + "learning_rate": 7.832464103348277e-06, + "loss": 0.4208, + "step": 12457 + }, + { + "epoch": 0.31525672495381735, + "grad_norm": 5.427563190460205, + "learning_rate": 7.832133212871722e-06, + "loss": 0.1988, + "step": 12458 + }, + { + "epoch": 0.315282030518511, + "grad_norm": 4.883903980255127, + "learning_rate": 7.831802304131268e-06, + "loss": 0.1877, + "step": 12459 + }, + { + "epoch": 0.3153073360832047, + "grad_norm": 10.418322563171387, + "learning_rate": 7.831471377129048e-06, + "loss": 0.2229, + "step": 12460 + }, + { + "epoch": 0.31533264164789837, + "grad_norm": 4.815146446228027, + "learning_rate": 7.831140431867195e-06, + "loss": 0.1831, + "step": 12461 + }, + { + "epoch": 0.315357947212592, + "grad_norm": 7.95730447769165, + "learning_rate": 7.830809468347845e-06, + "loss": 0.177, + "step": 12462 + }, + { + "epoch": 0.31538325277728574, + "grad_norm": 4.529356956481934, + "learning_rate": 7.83047848657313e-06, + "loss": 0.1476, + "step": 12463 + }, + { + "epoch": 0.3154085583419794, + "grad_norm": 6.609053134918213, + "learning_rate": 7.830147486545185e-06, + "loss": 0.1784, + "step": 12464 + }, + { + "epoch": 0.31543386390667305, + "grad_norm": 7.639070987701416, + "learning_rate": 7.829816468266149e-06, + "loss": 0.1613, + "step": 12465 + }, + { + "epoch": 0.31545916947136676, + "grad_norm": 5.184242248535156, + "learning_rate": 7.829485431738149e-06, + "loss": 0.1968, + "step": 12466 + }, + { + "epoch": 0.3154844750360604, + "grad_norm": 11.547382354736328, + "learning_rate": 7.829154376963326e-06, + "loss": 0.253, + "step": 12467 + }, + { + "epoch": 0.31550978060075413, + "grad_norm": 12.257869720458984, + "learning_rate": 7.828823303943812e-06, + "loss": 0.238, + "step": 12468 + }, + { + "epoch": 0.3155350861654478, + "grad_norm": 13.671601295471191, + "learning_rate": 7.828492212681743e-06, + "loss": 0.2725, + "step": 12469 + }, + { + "epoch": 0.31556039173014144, + "grad_norm": 4.571345329284668, + "learning_rate": 7.828161103179254e-06, + "loss": 0.1924, + "step": 12470 + }, + { + "epoch": 0.31558569729483515, + "grad_norm": 5.816637992858887, + "learning_rate": 7.827829975438479e-06, + "loss": 0.249, + "step": 12471 + }, + { + "epoch": 0.3156110028595288, + "grad_norm": 4.877383708953857, + "learning_rate": 7.827498829461556e-06, + "loss": 0.1769, + "step": 12472 + }, + { + "epoch": 0.31563630842422247, + "grad_norm": 5.498697757720947, + "learning_rate": 7.827167665250617e-06, + "loss": 0.2095, + "step": 12473 + }, + { + "epoch": 0.3156616139889162, + "grad_norm": 4.336915969848633, + "learning_rate": 7.826836482807802e-06, + "loss": 0.1642, + "step": 12474 + }, + { + "epoch": 0.31568691955360983, + "grad_norm": 2.361487627029419, + "learning_rate": 7.826505282135242e-06, + "loss": 0.1079, + "step": 12475 + }, + { + "epoch": 0.3157122251183035, + "grad_norm": 4.084493160247803, + "learning_rate": 7.826174063235077e-06, + "loss": 0.1233, + "step": 12476 + }, + { + "epoch": 0.3157375306829972, + "grad_norm": 3.5702383518218994, + "learning_rate": 7.82584282610944e-06, + "loss": 0.1283, + "step": 12477 + }, + { + "epoch": 0.31576283624769086, + "grad_norm": 6.0453925132751465, + "learning_rate": 7.82551157076047e-06, + "loss": 0.1237, + "step": 12478 + }, + { + "epoch": 0.31578814181238457, + "grad_norm": 3.1261751651763916, + "learning_rate": 7.8251802971903e-06, + "loss": 0.1346, + "step": 12479 + }, + { + "epoch": 0.3158134473770782, + "grad_norm": 3.2960121631622314, + "learning_rate": 7.824849005401069e-06, + "loss": 0.0603, + "step": 12480 + }, + { + "epoch": 0.3158387529417719, + "grad_norm": 4.579973220825195, + "learning_rate": 7.824517695394911e-06, + "loss": 0.125, + "step": 12481 + }, + { + "epoch": 0.3158640585064656, + "grad_norm": 5.824780464172363, + "learning_rate": 7.824186367173963e-06, + "loss": 0.2474, + "step": 12482 + }, + { + "epoch": 0.31588936407115925, + "grad_norm": 5.521779537200928, + "learning_rate": 7.823855020740366e-06, + "loss": 0.2392, + "step": 12483 + }, + { + "epoch": 0.3159146696358529, + "grad_norm": 6.4212541580200195, + "learning_rate": 7.82352365609625e-06, + "loss": 0.2302, + "step": 12484 + }, + { + "epoch": 0.3159399752005466, + "grad_norm": 9.679384231567383, + "learning_rate": 7.823192273243757e-06, + "loss": 0.1704, + "step": 12485 + }, + { + "epoch": 0.3159652807652403, + "grad_norm": 8.639045715332031, + "learning_rate": 7.82286087218502e-06, + "loss": 0.239, + "step": 12486 + }, + { + "epoch": 0.31599058632993393, + "grad_norm": 6.994393348693848, + "learning_rate": 7.822529452922179e-06, + "loss": 0.2504, + "step": 12487 + }, + { + "epoch": 0.31601589189462764, + "grad_norm": 12.024301528930664, + "learning_rate": 7.822198015457371e-06, + "loss": 0.231, + "step": 12488 + }, + { + "epoch": 0.3160411974593213, + "grad_norm": 3.6527645587921143, + "learning_rate": 7.821866559792732e-06, + "loss": 0.1808, + "step": 12489 + }, + { + "epoch": 0.31606650302401496, + "grad_norm": 8.458640098571777, + "learning_rate": 7.821535085930404e-06, + "loss": 0.2591, + "step": 12490 + }, + { + "epoch": 0.31609180858870867, + "grad_norm": 3.573415517807007, + "learning_rate": 7.821203593872518e-06, + "loss": 0.1557, + "step": 12491 + }, + { + "epoch": 0.3161171141534023, + "grad_norm": 11.746204376220703, + "learning_rate": 7.820872083621215e-06, + "loss": 0.1627, + "step": 12492 + }, + { + "epoch": 0.31614241971809603, + "grad_norm": 7.754673957824707, + "learning_rate": 7.820540555178633e-06, + "loss": 0.1741, + "step": 12493 + }, + { + "epoch": 0.3161677252827897, + "grad_norm": 4.4818902015686035, + "learning_rate": 7.82020900854691e-06, + "loss": 0.209, + "step": 12494 + }, + { + "epoch": 0.31619303084748335, + "grad_norm": 4.004260540008545, + "learning_rate": 7.819877443728184e-06, + "loss": 0.1332, + "step": 12495 + }, + { + "epoch": 0.31621833641217706, + "grad_norm": 9.283679008483887, + "learning_rate": 7.819545860724591e-06, + "loss": 0.1518, + "step": 12496 + }, + { + "epoch": 0.3162436419768707, + "grad_norm": 3.246201753616333, + "learning_rate": 7.819214259538272e-06, + "loss": 0.1606, + "step": 12497 + }, + { + "epoch": 0.31626894754156437, + "grad_norm": 6.974591255187988, + "learning_rate": 7.818882640171367e-06, + "loss": 0.258, + "step": 12498 + }, + { + "epoch": 0.3162942531062581, + "grad_norm": 4.352072238922119, + "learning_rate": 7.81855100262601e-06, + "loss": 0.189, + "step": 12499 + }, + { + "epoch": 0.31631955867095174, + "grad_norm": 5.06545877456665, + "learning_rate": 7.81821934690434e-06, + "loss": 0.1274, + "step": 12500 + }, + { + "epoch": 0.3163448642356454, + "grad_norm": 5.336413860321045, + "learning_rate": 7.817887673008501e-06, + "loss": 0.1867, + "step": 12501 + }, + { + "epoch": 0.3163701698003391, + "grad_norm": 8.485148429870605, + "learning_rate": 7.817555980940628e-06, + "loss": 0.1653, + "step": 12502 + }, + { + "epoch": 0.31639547536503276, + "grad_norm": 9.647459030151367, + "learning_rate": 7.81722427070286e-06, + "loss": 0.3873, + "step": 12503 + }, + { + "epoch": 0.3164207809297264, + "grad_norm": 6.4916863441467285, + "learning_rate": 7.816892542297336e-06, + "loss": 0.2593, + "step": 12504 + }, + { + "epoch": 0.31644608649442013, + "grad_norm": 6.225339889526367, + "learning_rate": 7.816560795726198e-06, + "loss": 0.2581, + "step": 12505 + }, + { + "epoch": 0.3164713920591138, + "grad_norm": 3.977055549621582, + "learning_rate": 7.816229030991583e-06, + "loss": 0.1964, + "step": 12506 + }, + { + "epoch": 0.3164966976238075, + "grad_norm": 5.987802028656006, + "learning_rate": 7.81589724809563e-06, + "loss": 0.2396, + "step": 12507 + }, + { + "epoch": 0.31652200318850116, + "grad_norm": 5.9926629066467285, + "learning_rate": 7.815565447040479e-06, + "loss": 0.1727, + "step": 12508 + }, + { + "epoch": 0.3165473087531948, + "grad_norm": 5.450361251831055, + "learning_rate": 7.815233627828269e-06, + "loss": 0.2164, + "step": 12509 + }, + { + "epoch": 0.3165726143178885, + "grad_norm": 3.4068455696105957, + "learning_rate": 7.814901790461144e-06, + "loss": 0.1741, + "step": 12510 + }, + { + "epoch": 0.3165979198825822, + "grad_norm": 3.805483341217041, + "learning_rate": 7.814569934941238e-06, + "loss": 0.2291, + "step": 12511 + }, + { + "epoch": 0.31662322544727584, + "grad_norm": 5.212125301361084, + "learning_rate": 7.814238061270697e-06, + "loss": 0.2056, + "step": 12512 + }, + { + "epoch": 0.31664853101196955, + "grad_norm": 5.068088054656982, + "learning_rate": 7.813906169451658e-06, + "loss": 0.1877, + "step": 12513 + }, + { + "epoch": 0.3166738365766632, + "grad_norm": 6.784142017364502, + "learning_rate": 7.813574259486258e-06, + "loss": 0.2755, + "step": 12514 + }, + { + "epoch": 0.31669914214135686, + "grad_norm": 3.196730375289917, + "learning_rate": 7.813242331376645e-06, + "loss": 0.0994, + "step": 12515 + }, + { + "epoch": 0.31672444770605057, + "grad_norm": 14.723172187805176, + "learning_rate": 7.812910385124953e-06, + "loss": 0.2307, + "step": 12516 + }, + { + "epoch": 0.3167497532707442, + "grad_norm": 4.868274211883545, + "learning_rate": 7.812578420733325e-06, + "loss": 0.2391, + "step": 12517 + }, + { + "epoch": 0.31677505883543794, + "grad_norm": 8.261750221252441, + "learning_rate": 7.812246438203905e-06, + "loss": 0.1404, + "step": 12518 + }, + { + "epoch": 0.3168003644001316, + "grad_norm": 3.7572696208953857, + "learning_rate": 7.811914437538828e-06, + "loss": 0.1716, + "step": 12519 + }, + { + "epoch": 0.31682566996482525, + "grad_norm": 3.220576763153076, + "learning_rate": 7.811582418740238e-06, + "loss": 0.152, + "step": 12520 + }, + { + "epoch": 0.31685097552951896, + "grad_norm": 3.352341413497925, + "learning_rate": 7.811250381810276e-06, + "loss": 0.172, + "step": 12521 + }, + { + "epoch": 0.3168762810942126, + "grad_norm": 3.1215267181396484, + "learning_rate": 7.810918326751085e-06, + "loss": 0.1388, + "step": 12522 + }, + { + "epoch": 0.3169015866589063, + "grad_norm": 6.334549903869629, + "learning_rate": 7.810586253564803e-06, + "loss": 0.209, + "step": 12523 + }, + { + "epoch": 0.3169268922236, + "grad_norm": 3.398574113845825, + "learning_rate": 7.810254162253574e-06, + "loss": 0.1556, + "step": 12524 + }, + { + "epoch": 0.31695219778829364, + "grad_norm": 6.312050819396973, + "learning_rate": 7.809922052819538e-06, + "loss": 0.2027, + "step": 12525 + }, + { + "epoch": 0.3169775033529873, + "grad_norm": 14.043780326843262, + "learning_rate": 7.809589925264838e-06, + "loss": 0.3332, + "step": 12526 + }, + { + "epoch": 0.317002808917681, + "grad_norm": 7.430623531341553, + "learning_rate": 7.809257779591615e-06, + "loss": 0.1806, + "step": 12527 + }, + { + "epoch": 0.31702811448237467, + "grad_norm": 12.819899559020996, + "learning_rate": 7.80892561580201e-06, + "loss": 0.2913, + "step": 12528 + }, + { + "epoch": 0.3170534200470683, + "grad_norm": 5.884408473968506, + "learning_rate": 7.808593433898168e-06, + "loss": 0.1506, + "step": 12529 + }, + { + "epoch": 0.31707872561176204, + "grad_norm": 4.21409273147583, + "learning_rate": 7.808261233882227e-06, + "loss": 0.189, + "step": 12530 + }, + { + "epoch": 0.3171040311764557, + "grad_norm": 7.622764587402344, + "learning_rate": 7.807929015756332e-06, + "loss": 0.2112, + "step": 12531 + }, + { + "epoch": 0.3171293367411494, + "grad_norm": 3.684648036956787, + "learning_rate": 7.807596779522628e-06, + "loss": 0.1426, + "step": 12532 + }, + { + "epoch": 0.31715464230584306, + "grad_norm": 3.7891485691070557, + "learning_rate": 7.807264525183252e-06, + "loss": 0.1715, + "step": 12533 + }, + { + "epoch": 0.3171799478705367, + "grad_norm": 7.791707992553711, + "learning_rate": 7.806932252740351e-06, + "loss": 0.3184, + "step": 12534 + }, + { + "epoch": 0.3172052534352304, + "grad_norm": 13.28450870513916, + "learning_rate": 7.806599962196065e-06, + "loss": 0.2239, + "step": 12535 + }, + { + "epoch": 0.3172305589999241, + "grad_norm": 5.605217933654785, + "learning_rate": 7.806267653552538e-06, + "loss": 0.1359, + "step": 12536 + }, + { + "epoch": 0.31725586456461774, + "grad_norm": 4.6907501220703125, + "learning_rate": 7.805935326811913e-06, + "loss": 0.1605, + "step": 12537 + }, + { + "epoch": 0.31728117012931145, + "grad_norm": 9.72302532196045, + "learning_rate": 7.805602981976332e-06, + "loss": 0.3338, + "step": 12538 + }, + { + "epoch": 0.3173064756940051, + "grad_norm": 3.8915657997131348, + "learning_rate": 7.80527061904794e-06, + "loss": 0.2197, + "step": 12539 + }, + { + "epoch": 0.31733178125869876, + "grad_norm": 18.25269889831543, + "learning_rate": 7.80493823802888e-06, + "loss": 0.3058, + "step": 12540 + }, + { + "epoch": 0.3173570868233925, + "grad_norm": 4.7640700340271, + "learning_rate": 7.804605838921296e-06, + "loss": 0.1623, + "step": 12541 + }, + { + "epoch": 0.31738239238808613, + "grad_norm": 3.9999024868011475, + "learning_rate": 7.804273421727329e-06, + "loss": 0.1876, + "step": 12542 + }, + { + "epoch": 0.31740769795277984, + "grad_norm": 3.0488972663879395, + "learning_rate": 7.803940986449125e-06, + "loss": 0.1196, + "step": 12543 + }, + { + "epoch": 0.3174330035174735, + "grad_norm": 7.623456954956055, + "learning_rate": 7.803608533088826e-06, + "loss": 0.3477, + "step": 12544 + }, + { + "epoch": 0.31745830908216716, + "grad_norm": 4.1941728591918945, + "learning_rate": 7.803276061648579e-06, + "loss": 0.1766, + "step": 12545 + }, + { + "epoch": 0.31748361464686087, + "grad_norm": 8.162933349609375, + "learning_rate": 7.802943572130524e-06, + "loss": 0.2175, + "step": 12546 + }, + { + "epoch": 0.3175089202115545, + "grad_norm": 5.073642253875732, + "learning_rate": 7.80261106453681e-06, + "loss": 0.1833, + "step": 12547 + }, + { + "epoch": 0.3175342257762482, + "grad_norm": 5.542023658752441, + "learning_rate": 7.802278538869577e-06, + "loss": 0.2478, + "step": 12548 + }, + { + "epoch": 0.3175595313409419, + "grad_norm": 4.762641429901123, + "learning_rate": 7.80194599513097e-06, + "loss": 0.1573, + "step": 12549 + }, + { + "epoch": 0.31758483690563555, + "grad_norm": 6.0935282707214355, + "learning_rate": 7.801613433323137e-06, + "loss": 0.1685, + "step": 12550 + }, + { + "epoch": 0.3176101424703292, + "grad_norm": 7.75428581237793, + "learning_rate": 7.801280853448219e-06, + "loss": 0.2339, + "step": 12551 + }, + { + "epoch": 0.3176354480350229, + "grad_norm": 6.999661445617676, + "learning_rate": 7.80094825550836e-06, + "loss": 0.1666, + "step": 12552 + }, + { + "epoch": 0.31766075359971657, + "grad_norm": 4.1627960205078125, + "learning_rate": 7.800615639505708e-06, + "loss": 0.1211, + "step": 12553 + }, + { + "epoch": 0.31768605916441023, + "grad_norm": 3.8865256309509277, + "learning_rate": 7.800283005442408e-06, + "loss": 0.1994, + "step": 12554 + }, + { + "epoch": 0.31771136472910394, + "grad_norm": 3.818070888519287, + "learning_rate": 7.799950353320602e-06, + "loss": 0.1508, + "step": 12555 + }, + { + "epoch": 0.3177366702937976, + "grad_norm": 5.060222625732422, + "learning_rate": 7.799617683142437e-06, + "loss": 0.1659, + "step": 12556 + }, + { + "epoch": 0.3177619758584913, + "grad_norm": 8.445176124572754, + "learning_rate": 7.79928499491006e-06, + "loss": 0.1781, + "step": 12557 + }, + { + "epoch": 0.31778728142318496, + "grad_norm": 10.155661582946777, + "learning_rate": 7.798952288625614e-06, + "loss": 0.2589, + "step": 12558 + }, + { + "epoch": 0.3178125869878786, + "grad_norm": 6.915836811065674, + "learning_rate": 7.798619564291244e-06, + "loss": 0.2418, + "step": 12559 + }, + { + "epoch": 0.31783789255257233, + "grad_norm": 4.013150215148926, + "learning_rate": 7.7982868219091e-06, + "loss": 0.1282, + "step": 12560 + }, + { + "epoch": 0.317863198117266, + "grad_norm": 5.530273914337158, + "learning_rate": 7.79795406148132e-06, + "loss": 0.1722, + "step": 12561 + }, + { + "epoch": 0.31788850368195964, + "grad_norm": 5.512455940246582, + "learning_rate": 7.79762128301006e-06, + "loss": 0.1976, + "step": 12562 + }, + { + "epoch": 0.31791380924665336, + "grad_norm": 3.661902904510498, + "learning_rate": 7.797288486497459e-06, + "loss": 0.1685, + "step": 12563 + }, + { + "epoch": 0.317939114811347, + "grad_norm": 7.944306373596191, + "learning_rate": 7.796955671945662e-06, + "loss": 0.2993, + "step": 12564 + }, + { + "epoch": 0.31796442037604067, + "grad_norm": 5.821650981903076, + "learning_rate": 7.796622839356822e-06, + "loss": 0.1579, + "step": 12565 + }, + { + "epoch": 0.3179897259407344, + "grad_norm": 7.905322074890137, + "learning_rate": 7.796289988733079e-06, + "loss": 0.2273, + "step": 12566 + }, + { + "epoch": 0.31801503150542804, + "grad_norm": 5.154605865478516, + "learning_rate": 7.795957120076584e-06, + "loss": 0.2288, + "step": 12567 + }, + { + "epoch": 0.3180403370701217, + "grad_norm": 8.495622634887695, + "learning_rate": 7.79562423338948e-06, + "loss": 0.2221, + "step": 12568 + }, + { + "epoch": 0.3180656426348154, + "grad_norm": 2.7496137619018555, + "learning_rate": 7.795291328673917e-06, + "loss": 0.0765, + "step": 12569 + }, + { + "epoch": 0.31809094819950906, + "grad_norm": 4.788698673248291, + "learning_rate": 7.794958405932038e-06, + "loss": 0.1618, + "step": 12570 + }, + { + "epoch": 0.31811625376420277, + "grad_norm": 10.04921817779541, + "learning_rate": 7.794625465165993e-06, + "loss": 0.2827, + "step": 12571 + }, + { + "epoch": 0.31814155932889643, + "grad_norm": 7.527470111846924, + "learning_rate": 7.79429250637793e-06, + "loss": 0.2662, + "step": 12572 + }, + { + "epoch": 0.3181668648935901, + "grad_norm": 3.4827210903167725, + "learning_rate": 7.793959529569992e-06, + "loss": 0.1514, + "step": 12573 + }, + { + "epoch": 0.3181921704582838, + "grad_norm": 7.321206092834473, + "learning_rate": 7.79362653474433e-06, + "loss": 0.1362, + "step": 12574 + }, + { + "epoch": 0.31821747602297745, + "grad_norm": 6.7343525886535645, + "learning_rate": 7.793293521903089e-06, + "loss": 0.1966, + "step": 12575 + }, + { + "epoch": 0.3182427815876711, + "grad_norm": 2.9428632259368896, + "learning_rate": 7.792960491048419e-06, + "loss": 0.1622, + "step": 12576 + }, + { + "epoch": 0.3182680871523648, + "grad_norm": 8.985167503356934, + "learning_rate": 7.792627442182465e-06, + "loss": 0.1312, + "step": 12577 + }, + { + "epoch": 0.3182933927170585, + "grad_norm": 3.4344255924224854, + "learning_rate": 7.792294375307377e-06, + "loss": 0.1815, + "step": 12578 + }, + { + "epoch": 0.31831869828175213, + "grad_norm": 4.2538886070251465, + "learning_rate": 7.791961290425301e-06, + "loss": 0.1392, + "step": 12579 + }, + { + "epoch": 0.31834400384644584, + "grad_norm": 3.2473771572113037, + "learning_rate": 7.791628187538387e-06, + "loss": 0.1525, + "step": 12580 + }, + { + "epoch": 0.3183693094111395, + "grad_norm": 26.02132797241211, + "learning_rate": 7.791295066648781e-06, + "loss": 0.3229, + "step": 12581 + }, + { + "epoch": 0.3183946149758332, + "grad_norm": 7.016847610473633, + "learning_rate": 7.790961927758635e-06, + "loss": 0.1438, + "step": 12582 + }, + { + "epoch": 0.31841992054052687, + "grad_norm": 7.058066368103027, + "learning_rate": 7.790628770870092e-06, + "loss": 0.2251, + "step": 12583 + }, + { + "epoch": 0.3184452261052205, + "grad_norm": 7.76125431060791, + "learning_rate": 7.790295595985304e-06, + "loss": 0.2451, + "step": 12584 + }, + { + "epoch": 0.31847053166991424, + "grad_norm": 10.530628204345703, + "learning_rate": 7.78996240310642e-06, + "loss": 0.2413, + "step": 12585 + }, + { + "epoch": 0.3184958372346079, + "grad_norm": 5.0042829513549805, + "learning_rate": 7.789629192235586e-06, + "loss": 0.1684, + "step": 12586 + }, + { + "epoch": 0.31852114279930155, + "grad_norm": 16.02667236328125, + "learning_rate": 7.789295963374952e-06, + "loss": 0.208, + "step": 12587 + }, + { + "epoch": 0.31854644836399526, + "grad_norm": 4.8902201652526855, + "learning_rate": 7.788962716526667e-06, + "loss": 0.1383, + "step": 12588 + }, + { + "epoch": 0.3185717539286889, + "grad_norm": 6.953856945037842, + "learning_rate": 7.788629451692882e-06, + "loss": 0.2988, + "step": 12589 + }, + { + "epoch": 0.3185970594933826, + "grad_norm": 4.277731895446777, + "learning_rate": 7.788296168875742e-06, + "loss": 0.1619, + "step": 12590 + }, + { + "epoch": 0.3186223650580763, + "grad_norm": 7.24662446975708, + "learning_rate": 7.787962868077401e-06, + "loss": 0.2465, + "step": 12591 + }, + { + "epoch": 0.31864767062276994, + "grad_norm": 8.581341743469238, + "learning_rate": 7.787629549300005e-06, + "loss": 0.1892, + "step": 12592 + }, + { + "epoch": 0.3186729761874636, + "grad_norm": 5.394927024841309, + "learning_rate": 7.787296212545705e-06, + "loss": 0.172, + "step": 12593 + }, + { + "epoch": 0.3186982817521573, + "grad_norm": 5.793895244598389, + "learning_rate": 7.786962857816649e-06, + "loss": 0.1325, + "step": 12594 + }, + { + "epoch": 0.31872358731685096, + "grad_norm": 3.8782734870910645, + "learning_rate": 7.78662948511499e-06, + "loss": 0.14, + "step": 12595 + }, + { + "epoch": 0.3187488928815447, + "grad_norm": 4.666974067687988, + "learning_rate": 7.786296094442874e-06, + "loss": 0.1201, + "step": 12596 + }, + { + "epoch": 0.31877419844623833, + "grad_norm": 6.153609752655029, + "learning_rate": 7.785962685802452e-06, + "loss": 0.2833, + "step": 12597 + }, + { + "epoch": 0.318799504010932, + "grad_norm": 3.596668004989624, + "learning_rate": 7.785629259195876e-06, + "loss": 0.2228, + "step": 12598 + }, + { + "epoch": 0.3188248095756257, + "grad_norm": 4.233908176422119, + "learning_rate": 7.785295814625295e-06, + "loss": 0.1888, + "step": 12599 + }, + { + "epoch": 0.31885011514031936, + "grad_norm": 11.214886665344238, + "learning_rate": 7.784962352092862e-06, + "loss": 0.1888, + "step": 12600 + }, + { + "epoch": 0.318875420705013, + "grad_norm": 5.161268711090088, + "learning_rate": 7.784628871600721e-06, + "loss": 0.2524, + "step": 12601 + }, + { + "epoch": 0.3189007262697067, + "grad_norm": 6.860062122344971, + "learning_rate": 7.784295373151028e-06, + "loss": 0.1537, + "step": 12602 + }, + { + "epoch": 0.3189260318344004, + "grad_norm": 4.998114585876465, + "learning_rate": 7.783961856745932e-06, + "loss": 0.1398, + "step": 12603 + }, + { + "epoch": 0.31895133739909404, + "grad_norm": 4.996494770050049, + "learning_rate": 7.783628322387585e-06, + "loss": 0.1907, + "step": 12604 + }, + { + "epoch": 0.31897664296378775, + "grad_norm": 3.3192877769470215, + "learning_rate": 7.783294770078133e-06, + "loss": 0.1882, + "step": 12605 + }, + { + "epoch": 0.3190019485284814, + "grad_norm": 5.719119071960449, + "learning_rate": 7.782961199819734e-06, + "loss": 0.2065, + "step": 12606 + }, + { + "epoch": 0.3190272540931751, + "grad_norm": 7.614871501922607, + "learning_rate": 7.782627611614535e-06, + "loss": 0.25, + "step": 12607 + }, + { + "epoch": 0.3190525596578688, + "grad_norm": 4.521573066711426, + "learning_rate": 7.782294005464688e-06, + "loss": 0.2084, + "step": 12608 + }, + { + "epoch": 0.31907786522256243, + "grad_norm": 6.596709728240967, + "learning_rate": 7.781960381372346e-06, + "loss": 0.1854, + "step": 12609 + }, + { + "epoch": 0.31910317078725614, + "grad_norm": 5.366518020629883, + "learning_rate": 7.781626739339657e-06, + "loss": 0.1385, + "step": 12610 + }, + { + "epoch": 0.3191284763519498, + "grad_norm": 3.700026273727417, + "learning_rate": 7.781293079368776e-06, + "loss": 0.1182, + "step": 12611 + }, + { + "epoch": 0.31915378191664345, + "grad_norm": 8.986875534057617, + "learning_rate": 7.780959401461853e-06, + "loss": 0.296, + "step": 12612 + }, + { + "epoch": 0.31917908748133716, + "grad_norm": 3.5343713760375977, + "learning_rate": 7.780625705621039e-06, + "loss": 0.1314, + "step": 12613 + }, + { + "epoch": 0.3192043930460308, + "grad_norm": 4.912128448486328, + "learning_rate": 7.780291991848487e-06, + "loss": 0.1642, + "step": 12614 + }, + { + "epoch": 0.3192296986107245, + "grad_norm": 3.4438636302948, + "learning_rate": 7.779958260146353e-06, + "loss": 0.1224, + "step": 12615 + }, + { + "epoch": 0.3192550041754182, + "grad_norm": 8.235267639160156, + "learning_rate": 7.779624510516781e-06, + "loss": 0.2498, + "step": 12616 + }, + { + "epoch": 0.31928030974011185, + "grad_norm": 4.88595724105835, + "learning_rate": 7.779290742961929e-06, + "loss": 0.1378, + "step": 12617 + }, + { + "epoch": 0.3193056153048055, + "grad_norm": 2.9909191131591797, + "learning_rate": 7.778956957483947e-06, + "loss": 0.1547, + "step": 12618 + }, + { + "epoch": 0.3193309208694992, + "grad_norm": 4.113757610321045, + "learning_rate": 7.77862315408499e-06, + "loss": 0.2165, + "step": 12619 + }, + { + "epoch": 0.31935622643419287, + "grad_norm": 8.222410202026367, + "learning_rate": 7.778289332767208e-06, + "loss": 0.1487, + "step": 12620 + }, + { + "epoch": 0.3193815319988866, + "grad_norm": 5.233488082885742, + "learning_rate": 7.777955493532755e-06, + "loss": 0.1224, + "step": 12621 + }, + { + "epoch": 0.31940683756358024, + "grad_norm": 3.424848794937134, + "learning_rate": 7.777621636383786e-06, + "loss": 0.1514, + "step": 12622 + }, + { + "epoch": 0.3194321431282739, + "grad_norm": 5.842631816864014, + "learning_rate": 7.777287761322448e-06, + "loss": 0.2355, + "step": 12623 + }, + { + "epoch": 0.3194574486929676, + "grad_norm": 9.872713088989258, + "learning_rate": 7.7769538683509e-06, + "loss": 0.3014, + "step": 12624 + }, + { + "epoch": 0.31948275425766126, + "grad_norm": 11.073836326599121, + "learning_rate": 7.776619957471291e-06, + "loss": 0.2699, + "step": 12625 + }, + { + "epoch": 0.3195080598223549, + "grad_norm": 7.36972713470459, + "learning_rate": 7.77628602868578e-06, + "loss": 0.1939, + "step": 12626 + }, + { + "epoch": 0.31953336538704863, + "grad_norm": 4.739633560180664, + "learning_rate": 7.775952081996515e-06, + "loss": 0.1505, + "step": 12627 + }, + { + "epoch": 0.3195586709517423, + "grad_norm": 3.605863094329834, + "learning_rate": 7.775618117405651e-06, + "loss": 0.1726, + "step": 12628 + }, + { + "epoch": 0.31958397651643594, + "grad_norm": 4.202856063842773, + "learning_rate": 7.775284134915343e-06, + "loss": 0.2479, + "step": 12629 + }, + { + "epoch": 0.31960928208112965, + "grad_norm": 3.88979434967041, + "learning_rate": 7.774950134527744e-06, + "loss": 0.1508, + "step": 12630 + }, + { + "epoch": 0.3196345876458233, + "grad_norm": 5.864480018615723, + "learning_rate": 7.774616116245006e-06, + "loss": 0.2062, + "step": 12631 + }, + { + "epoch": 0.31965989321051697, + "grad_norm": 15.777886390686035, + "learning_rate": 7.774282080069286e-06, + "loss": 0.2118, + "step": 12632 + }, + { + "epoch": 0.3196851987752107, + "grad_norm": 4.235927104949951, + "learning_rate": 7.773948026002737e-06, + "loss": 0.1443, + "step": 12633 + }, + { + "epoch": 0.31971050433990433, + "grad_norm": 3.0977323055267334, + "learning_rate": 7.773613954047512e-06, + "loss": 0.0816, + "step": 12634 + }, + { + "epoch": 0.31973580990459805, + "grad_norm": 5.493777751922607, + "learning_rate": 7.77327986420577e-06, + "loss": 0.2216, + "step": 12635 + }, + { + "epoch": 0.3197611154692917, + "grad_norm": 3.5476386547088623, + "learning_rate": 7.772945756479658e-06, + "loss": 0.1999, + "step": 12636 + }, + { + "epoch": 0.31978642103398536, + "grad_norm": 4.148077011108398, + "learning_rate": 7.772611630871334e-06, + "loss": 0.1487, + "step": 12637 + }, + { + "epoch": 0.31981172659867907, + "grad_norm": 3.9369494915008545, + "learning_rate": 7.772277487382958e-06, + "loss": 0.1557, + "step": 12638 + }, + { + "epoch": 0.3198370321633727, + "grad_norm": 4.607567310333252, + "learning_rate": 7.771943326016676e-06, + "loss": 0.2443, + "step": 12639 + }, + { + "epoch": 0.3198623377280664, + "grad_norm": 4.892247676849365, + "learning_rate": 7.77160914677465e-06, + "loss": 0.1917, + "step": 12640 + }, + { + "epoch": 0.3198876432927601, + "grad_norm": 8.157421112060547, + "learning_rate": 7.77127494965903e-06, + "loss": 0.2496, + "step": 12641 + }, + { + "epoch": 0.31991294885745375, + "grad_norm": 4.577882289886475, + "learning_rate": 7.770940734671975e-06, + "loss": 0.1456, + "step": 12642 + }, + { + "epoch": 0.3199382544221474, + "grad_norm": 4.875659465789795, + "learning_rate": 7.770606501815638e-06, + "loss": 0.1291, + "step": 12643 + }, + { + "epoch": 0.3199635599868411, + "grad_norm": 5.924720764160156, + "learning_rate": 7.770272251092174e-06, + "loss": 0.1554, + "step": 12644 + }, + { + "epoch": 0.3199888655515348, + "grad_norm": 6.939540863037109, + "learning_rate": 7.76993798250374e-06, + "loss": 0.22, + "step": 12645 + }, + { + "epoch": 0.3200141711162285, + "grad_norm": 3.7502448558807373, + "learning_rate": 7.769603696052493e-06, + "loss": 0.1325, + "step": 12646 + }, + { + "epoch": 0.32003947668092214, + "grad_norm": 2.819978713989258, + "learning_rate": 7.769269391740585e-06, + "loss": 0.1524, + "step": 12647 + }, + { + "epoch": 0.3200647822456158, + "grad_norm": 3.2385411262512207, + "learning_rate": 7.768935069570175e-06, + "loss": 0.1301, + "step": 12648 + }, + { + "epoch": 0.3200900878103095, + "grad_norm": 17.682565689086914, + "learning_rate": 7.768600729543417e-06, + "loss": 0.2987, + "step": 12649 + }, + { + "epoch": 0.32011539337500317, + "grad_norm": 3.476546049118042, + "learning_rate": 7.768266371662469e-06, + "loss": 0.1767, + "step": 12650 + }, + { + "epoch": 0.3201406989396968, + "grad_norm": 1.732560396194458, + "learning_rate": 7.767931995929485e-06, + "loss": 0.089, + "step": 12651 + }, + { + "epoch": 0.32016600450439053, + "grad_norm": 11.806270599365234, + "learning_rate": 7.767597602346622e-06, + "loss": 0.2636, + "step": 12652 + }, + { + "epoch": 0.3201913100690842, + "grad_norm": 4.276787281036377, + "learning_rate": 7.767263190916039e-06, + "loss": 0.1234, + "step": 12653 + }, + { + "epoch": 0.32021661563377785, + "grad_norm": 4.498136043548584, + "learning_rate": 7.766928761639888e-06, + "loss": 0.1249, + "step": 12654 + }, + { + "epoch": 0.32024192119847156, + "grad_norm": 15.616194725036621, + "learning_rate": 7.76659431452033e-06, + "loss": 0.1594, + "step": 12655 + }, + { + "epoch": 0.3202672267631652, + "grad_norm": 3.7682738304138184, + "learning_rate": 7.76625984955952e-06, + "loss": 0.0922, + "step": 12656 + }, + { + "epoch": 0.32029253232785887, + "grad_norm": 7.580065727233887, + "learning_rate": 7.765925366759613e-06, + "loss": 0.227, + "step": 12657 + }, + { + "epoch": 0.3203178378925526, + "grad_norm": 4.7201080322265625, + "learning_rate": 7.765590866122768e-06, + "loss": 0.172, + "step": 12658 + }, + { + "epoch": 0.32034314345724624, + "grad_norm": 7.721548557281494, + "learning_rate": 7.765256347651142e-06, + "loss": 0.2451, + "step": 12659 + }, + { + "epoch": 0.32036844902193995, + "grad_norm": 5.100618839263916, + "learning_rate": 7.764921811346893e-06, + "loss": 0.1652, + "step": 12660 + }, + { + "epoch": 0.3203937545866336, + "grad_norm": 12.00085163116455, + "learning_rate": 7.764587257212178e-06, + "loss": 0.2482, + "step": 12661 + }, + { + "epoch": 0.32041906015132726, + "grad_norm": 5.821018218994141, + "learning_rate": 7.764252685249153e-06, + "loss": 0.1146, + "step": 12662 + }, + { + "epoch": 0.320444365716021, + "grad_norm": 8.378334999084473, + "learning_rate": 7.763918095459977e-06, + "loss": 0.2918, + "step": 12663 + }, + { + "epoch": 0.32046967128071463, + "grad_norm": 6.7365522384643555, + "learning_rate": 7.763583487846806e-06, + "loss": 0.253, + "step": 12664 + }, + { + "epoch": 0.3204949768454083, + "grad_norm": 5.276801586151123, + "learning_rate": 7.763248862411801e-06, + "loss": 0.1711, + "step": 12665 + }, + { + "epoch": 0.320520282410102, + "grad_norm": 6.602205753326416, + "learning_rate": 7.762914219157119e-06, + "loss": 0.2368, + "step": 12666 + }, + { + "epoch": 0.32054558797479565, + "grad_norm": 3.6813104152679443, + "learning_rate": 7.762579558084915e-06, + "loss": 0.2004, + "step": 12667 + }, + { + "epoch": 0.3205708935394893, + "grad_norm": 4.933360576629639, + "learning_rate": 7.76224487919735e-06, + "loss": 0.1426, + "step": 12668 + }, + { + "epoch": 0.320596199104183, + "grad_norm": 8.627992630004883, + "learning_rate": 7.76191018249658e-06, + "loss": 0.1664, + "step": 12669 + }, + { + "epoch": 0.3206215046688767, + "grad_norm": 12.071220397949219, + "learning_rate": 7.761575467984767e-06, + "loss": 0.175, + "step": 12670 + }, + { + "epoch": 0.3206468102335704, + "grad_norm": 5.523820877075195, + "learning_rate": 7.761240735664067e-06, + "loss": 0.1972, + "step": 12671 + }, + { + "epoch": 0.32067211579826405, + "grad_norm": 10.172884941101074, + "learning_rate": 7.76090598553664e-06, + "loss": 0.106, + "step": 12672 + }, + { + "epoch": 0.3206974213629577, + "grad_norm": 10.568052291870117, + "learning_rate": 7.760571217604643e-06, + "loss": 0.1936, + "step": 12673 + }, + { + "epoch": 0.3207227269276514, + "grad_norm": 6.767277240753174, + "learning_rate": 7.760236431870237e-06, + "loss": 0.2243, + "step": 12674 + }, + { + "epoch": 0.32074803249234507, + "grad_norm": 5.250727653503418, + "learning_rate": 7.759901628335577e-06, + "loss": 0.2093, + "step": 12675 + }, + { + "epoch": 0.3207733380570387, + "grad_norm": 4.457923889160156, + "learning_rate": 7.759566807002825e-06, + "loss": 0.1319, + "step": 12676 + }, + { + "epoch": 0.32079864362173244, + "grad_norm": 7.16160249710083, + "learning_rate": 7.759231967874144e-06, + "loss": 0.1307, + "step": 12677 + }, + { + "epoch": 0.3208239491864261, + "grad_norm": 5.175529956817627, + "learning_rate": 7.758897110951685e-06, + "loss": 0.1721, + "step": 12678 + }, + { + "epoch": 0.32084925475111975, + "grad_norm": 3.933140277862549, + "learning_rate": 7.758562236237614e-06, + "loss": 0.1681, + "step": 12679 + }, + { + "epoch": 0.32087456031581346, + "grad_norm": 2.1589128971099854, + "learning_rate": 7.758227343734086e-06, + "loss": 0.0972, + "step": 12680 + }, + { + "epoch": 0.3208998658805071, + "grad_norm": 9.068336486816406, + "learning_rate": 7.757892433443265e-06, + "loss": 0.1949, + "step": 12681 + }, + { + "epoch": 0.3209251714452008, + "grad_norm": 3.114227533340454, + "learning_rate": 7.757557505367308e-06, + "loss": 0.0976, + "step": 12682 + }, + { + "epoch": 0.3209504770098945, + "grad_norm": 13.9700927734375, + "learning_rate": 7.757222559508377e-06, + "loss": 0.279, + "step": 12683 + }, + { + "epoch": 0.32097578257458814, + "grad_norm": 5.95958948135376, + "learning_rate": 7.756887595868628e-06, + "loss": 0.2084, + "step": 12684 + }, + { + "epoch": 0.32100108813928185, + "grad_norm": 9.340654373168945, + "learning_rate": 7.756552614450226e-06, + "loss": 0.2185, + "step": 12685 + }, + { + "epoch": 0.3210263937039755, + "grad_norm": 4.553046703338623, + "learning_rate": 7.756217615255328e-06, + "loss": 0.2014, + "step": 12686 + }, + { + "epoch": 0.32105169926866917, + "grad_norm": 5.19481897354126, + "learning_rate": 7.755882598286095e-06, + "loss": 0.1939, + "step": 12687 + }, + { + "epoch": 0.3210770048333629, + "grad_norm": 12.038787841796875, + "learning_rate": 7.755547563544688e-06, + "loss": 0.1163, + "step": 12688 + }, + { + "epoch": 0.32110231039805653, + "grad_norm": 10.447816848754883, + "learning_rate": 7.755212511033267e-06, + "loss": 0.1654, + "step": 12689 + }, + { + "epoch": 0.3211276159627502, + "grad_norm": 3.537322521209717, + "learning_rate": 7.754877440753994e-06, + "loss": 0.2085, + "step": 12690 + }, + { + "epoch": 0.3211529215274439, + "grad_norm": 5.437306880950928, + "learning_rate": 7.754542352709029e-06, + "loss": 0.1556, + "step": 12691 + }, + { + "epoch": 0.32117822709213756, + "grad_norm": 6.819316387176514, + "learning_rate": 7.754207246900531e-06, + "loss": 0.1418, + "step": 12692 + }, + { + "epoch": 0.3212035326568312, + "grad_norm": 7.003757953643799, + "learning_rate": 7.753872123330664e-06, + "loss": 0.192, + "step": 12693 + }, + { + "epoch": 0.3212288382215249, + "grad_norm": 4.34937858581543, + "learning_rate": 7.753536982001587e-06, + "loss": 0.2083, + "step": 12694 + }, + { + "epoch": 0.3212541437862186, + "grad_norm": 3.705256462097168, + "learning_rate": 7.753201822915465e-06, + "loss": 0.1104, + "step": 12695 + }, + { + "epoch": 0.32127944935091224, + "grad_norm": 8.575274467468262, + "learning_rate": 7.752866646074454e-06, + "loss": 0.1307, + "step": 12696 + }, + { + "epoch": 0.32130475491560595, + "grad_norm": 4.825028419494629, + "learning_rate": 7.752531451480718e-06, + "loss": 0.1837, + "step": 12697 + }, + { + "epoch": 0.3213300604802996, + "grad_norm": 4.066221714019775, + "learning_rate": 7.752196239136418e-06, + "loss": 0.2114, + "step": 12698 + }, + { + "epoch": 0.3213553660449933, + "grad_norm": 3.5974478721618652, + "learning_rate": 7.75186100904372e-06, + "loss": 0.1892, + "step": 12699 + }, + { + "epoch": 0.321380671609687, + "grad_norm": 4.069952487945557, + "learning_rate": 7.75152576120478e-06, + "loss": 0.1667, + "step": 12700 + }, + { + "epoch": 0.32140597717438063, + "grad_norm": 3.908754587173462, + "learning_rate": 7.751190495621762e-06, + "loss": 0.1747, + "step": 12701 + }, + { + "epoch": 0.32143128273907434, + "grad_norm": 3.3660261631011963, + "learning_rate": 7.750855212296827e-06, + "loss": 0.1488, + "step": 12702 + }, + { + "epoch": 0.321456588303768, + "grad_norm": 1.9876123666763306, + "learning_rate": 7.75051991123214e-06, + "loss": 0.0782, + "step": 12703 + }, + { + "epoch": 0.32148189386846165, + "grad_norm": 7.3164167404174805, + "learning_rate": 7.750184592429862e-06, + "loss": 0.1762, + "step": 12704 + }, + { + "epoch": 0.32150719943315537, + "grad_norm": 4.700590133666992, + "learning_rate": 7.749849255892154e-06, + "loss": 0.1395, + "step": 12705 + }, + { + "epoch": 0.321532504997849, + "grad_norm": 11.826855659484863, + "learning_rate": 7.749513901621181e-06, + "loss": 0.284, + "step": 12706 + }, + { + "epoch": 0.3215578105625427, + "grad_norm": 6.081523418426514, + "learning_rate": 7.749178529619102e-06, + "loss": 0.2002, + "step": 12707 + }, + { + "epoch": 0.3215831161272364, + "grad_norm": 4.002323150634766, + "learning_rate": 7.748843139888085e-06, + "loss": 0.1312, + "step": 12708 + }, + { + "epoch": 0.32160842169193005, + "grad_norm": 4.40864896774292, + "learning_rate": 7.748507732430289e-06, + "loss": 0.1472, + "step": 12709 + }, + { + "epoch": 0.32163372725662376, + "grad_norm": 2.693157911300659, + "learning_rate": 7.748172307247876e-06, + "loss": 0.1002, + "step": 12710 + }, + { + "epoch": 0.3216590328213174, + "grad_norm": 2.607656478881836, + "learning_rate": 7.747836864343012e-06, + "loss": 0.1362, + "step": 12711 + }, + { + "epoch": 0.32168433838601107, + "grad_norm": 4.127254009246826, + "learning_rate": 7.74750140371786e-06, + "loss": 0.1125, + "step": 12712 + }, + { + "epoch": 0.3217096439507048, + "grad_norm": 7.339848518371582, + "learning_rate": 7.747165925374583e-06, + "loss": 0.2235, + "step": 12713 + }, + { + "epoch": 0.32173494951539844, + "grad_norm": 4.701685905456543, + "learning_rate": 7.746830429315343e-06, + "loss": 0.1811, + "step": 12714 + }, + { + "epoch": 0.3217602550800921, + "grad_norm": 5.088680267333984, + "learning_rate": 7.746494915542305e-06, + "loss": 0.2049, + "step": 12715 + }, + { + "epoch": 0.3217855606447858, + "grad_norm": 7.604069709777832, + "learning_rate": 7.746159384057632e-06, + "loss": 0.1709, + "step": 12716 + }, + { + "epoch": 0.32181086620947946, + "grad_norm": 4.129343509674072, + "learning_rate": 7.745823834863487e-06, + "loss": 0.1961, + "step": 12717 + }, + { + "epoch": 0.3218361717741731, + "grad_norm": 4.662707328796387, + "learning_rate": 7.745488267962035e-06, + "loss": 0.148, + "step": 12718 + }, + { + "epoch": 0.32186147733886683, + "grad_norm": 3.285128593444824, + "learning_rate": 7.745152683355442e-06, + "loss": 0.2167, + "step": 12719 + }, + { + "epoch": 0.3218867829035605, + "grad_norm": 3.6101927757263184, + "learning_rate": 7.744817081045868e-06, + "loss": 0.204, + "step": 12720 + }, + { + "epoch": 0.32191208846825414, + "grad_norm": 5.954335689544678, + "learning_rate": 7.74448146103548e-06, + "loss": 0.1387, + "step": 12721 + }, + { + "epoch": 0.32193739403294785, + "grad_norm": 4.01973819732666, + "learning_rate": 7.74414582332644e-06, + "loss": 0.1847, + "step": 12722 + }, + { + "epoch": 0.3219626995976415, + "grad_norm": 5.313425540924072, + "learning_rate": 7.743810167920915e-06, + "loss": 0.2182, + "step": 12723 + }, + { + "epoch": 0.3219880051623352, + "grad_norm": 5.6299333572387695, + "learning_rate": 7.743474494821071e-06, + "loss": 0.1988, + "step": 12724 + }, + { + "epoch": 0.3220133107270289, + "grad_norm": 10.416181564331055, + "learning_rate": 7.743138804029067e-06, + "loss": 0.3015, + "step": 12725 + }, + { + "epoch": 0.32203861629172253, + "grad_norm": 4.1181769371032715, + "learning_rate": 7.742803095547073e-06, + "loss": 0.1273, + "step": 12726 + }, + { + "epoch": 0.32206392185641625, + "grad_norm": 9.874327659606934, + "learning_rate": 7.74246736937725e-06, + "loss": 0.1884, + "step": 12727 + }, + { + "epoch": 0.3220892274211099, + "grad_norm": 2.483067035675049, + "learning_rate": 7.742131625521768e-06, + "loss": 0.0959, + "step": 12728 + }, + { + "epoch": 0.32211453298580356, + "grad_norm": 7.572810649871826, + "learning_rate": 7.741795863982787e-06, + "loss": 0.1598, + "step": 12729 + }, + { + "epoch": 0.32213983855049727, + "grad_norm": 7.660792827606201, + "learning_rate": 7.741460084762476e-06, + "loss": 0.2064, + "step": 12730 + }, + { + "epoch": 0.3221651441151909, + "grad_norm": 6.758990287780762, + "learning_rate": 7.741124287862997e-06, + "loss": 0.1268, + "step": 12731 + }, + { + "epoch": 0.3221904496798846, + "grad_norm": 4.6327385902404785, + "learning_rate": 7.740788473286518e-06, + "loss": 0.1491, + "step": 12732 + }, + { + "epoch": 0.3222157552445783, + "grad_norm": 4.840311527252197, + "learning_rate": 7.740452641035206e-06, + "loss": 0.1546, + "step": 12733 + }, + { + "epoch": 0.32224106080927195, + "grad_norm": 5.882189750671387, + "learning_rate": 7.74011679111122e-06, + "loss": 0.2107, + "step": 12734 + }, + { + "epoch": 0.32226636637396566, + "grad_norm": 8.862467765808105, + "learning_rate": 7.739780923516736e-06, + "loss": 0.2561, + "step": 12735 + }, + { + "epoch": 0.3222916719386593, + "grad_norm": 2.9642655849456787, + "learning_rate": 7.73944503825391e-06, + "loss": 0.1245, + "step": 12736 + }, + { + "epoch": 0.322316977503353, + "grad_norm": 7.910043716430664, + "learning_rate": 7.739109135324916e-06, + "loss": 0.1962, + "step": 12737 + }, + { + "epoch": 0.3223422830680467, + "grad_norm": 5.325186252593994, + "learning_rate": 7.738773214731914e-06, + "loss": 0.255, + "step": 12738 + }, + { + "epoch": 0.32236758863274034, + "grad_norm": 11.712589263916016, + "learning_rate": 7.738437276477075e-06, + "loss": 0.2282, + "step": 12739 + }, + { + "epoch": 0.322392894197434, + "grad_norm": 3.16605544090271, + "learning_rate": 7.738101320562562e-06, + "loss": 0.0934, + "step": 12740 + }, + { + "epoch": 0.3224181997621277, + "grad_norm": 5.934406757354736, + "learning_rate": 7.737765346990545e-06, + "loss": 0.242, + "step": 12741 + }, + { + "epoch": 0.32244350532682137, + "grad_norm": 15.523408889770508, + "learning_rate": 7.737429355763187e-06, + "loss": 0.2343, + "step": 12742 + }, + { + "epoch": 0.322468810891515, + "grad_norm": 9.190190315246582, + "learning_rate": 7.737093346882655e-06, + "loss": 0.2213, + "step": 12743 + }, + { + "epoch": 0.32249411645620873, + "grad_norm": 6.6954450607299805, + "learning_rate": 7.736757320351118e-06, + "loss": 0.2683, + "step": 12744 + }, + { + "epoch": 0.3225194220209024, + "grad_norm": 7.99918270111084, + "learning_rate": 7.736421276170742e-06, + "loss": 0.1785, + "step": 12745 + }, + { + "epoch": 0.32254472758559605, + "grad_norm": 4.959445476531982, + "learning_rate": 7.736085214343696e-06, + "loss": 0.2237, + "step": 12746 + }, + { + "epoch": 0.32257003315028976, + "grad_norm": 4.189716339111328, + "learning_rate": 7.735749134872143e-06, + "loss": 0.1945, + "step": 12747 + }, + { + "epoch": 0.3225953387149834, + "grad_norm": 3.2003517150878906, + "learning_rate": 7.735413037758257e-06, + "loss": 0.1456, + "step": 12748 + }, + { + "epoch": 0.3226206442796771, + "grad_norm": 3.5275135040283203, + "learning_rate": 7.735076923004197e-06, + "loss": 0.1626, + "step": 12749 + }, + { + "epoch": 0.3226459498443708, + "grad_norm": 3.864685297012329, + "learning_rate": 7.734740790612137e-06, + "loss": 0.2531, + "step": 12750 + }, + { + "epoch": 0.32267125540906444, + "grad_norm": 3.851057767868042, + "learning_rate": 7.73440464058424e-06, + "loss": 0.1825, + "step": 12751 + }, + { + "epoch": 0.32269656097375815, + "grad_norm": 4.032713890075684, + "learning_rate": 7.734068472922677e-06, + "loss": 0.0411, + "step": 12752 + }, + { + "epoch": 0.3227218665384518, + "grad_norm": 14.184779167175293, + "learning_rate": 7.733732287629613e-06, + "loss": 0.1706, + "step": 12753 + }, + { + "epoch": 0.32274717210314546, + "grad_norm": 16.432275772094727, + "learning_rate": 7.73339608470722e-06, + "loss": 0.2058, + "step": 12754 + }, + { + "epoch": 0.3227724776678392, + "grad_norm": 6.403563499450684, + "learning_rate": 7.733059864157667e-06, + "loss": 0.1637, + "step": 12755 + }, + { + "epoch": 0.32279778323253283, + "grad_norm": 7.033827781677246, + "learning_rate": 7.732723625983116e-06, + "loss": 0.2113, + "step": 12756 + }, + { + "epoch": 0.3228230887972265, + "grad_norm": 5.423579692840576, + "learning_rate": 7.73238737018574e-06, + "loss": 0.1312, + "step": 12757 + }, + { + "epoch": 0.3228483943619202, + "grad_norm": 4.210136890411377, + "learning_rate": 7.732051096767705e-06, + "loss": 0.1913, + "step": 12758 + }, + { + "epoch": 0.32287369992661386, + "grad_norm": 2.9498064517974854, + "learning_rate": 7.731714805731181e-06, + "loss": 0.136, + "step": 12759 + }, + { + "epoch": 0.3228990054913075, + "grad_norm": 5.364416122436523, + "learning_rate": 7.731378497078336e-06, + "loss": 0.1449, + "step": 12760 + }, + { + "epoch": 0.3229243110560012, + "grad_norm": 13.169132232666016, + "learning_rate": 7.73104217081134e-06, + "loss": 0.295, + "step": 12761 + }, + { + "epoch": 0.3229496166206949, + "grad_norm": 11.189956665039062, + "learning_rate": 7.73070582693236e-06, + "loss": 0.1811, + "step": 12762 + }, + { + "epoch": 0.3229749221853886, + "grad_norm": 4.263969898223877, + "learning_rate": 7.730369465443568e-06, + "loss": 0.1568, + "step": 12763 + }, + { + "epoch": 0.32300022775008225, + "grad_norm": 12.581185340881348, + "learning_rate": 7.730033086347129e-06, + "loss": 0.1572, + "step": 12764 + }, + { + "epoch": 0.3230255333147759, + "grad_norm": 3.9749319553375244, + "learning_rate": 7.729696689645214e-06, + "loss": 0.1678, + "step": 12765 + }, + { + "epoch": 0.3230508388794696, + "grad_norm": 6.046675205230713, + "learning_rate": 7.729360275339996e-06, + "loss": 0.269, + "step": 12766 + }, + { + "epoch": 0.32307614444416327, + "grad_norm": 2.777322292327881, + "learning_rate": 7.729023843433638e-06, + "loss": 0.1285, + "step": 12767 + }, + { + "epoch": 0.3231014500088569, + "grad_norm": 3.304610013961792, + "learning_rate": 7.728687393928314e-06, + "loss": 0.1323, + "step": 12768 + }, + { + "epoch": 0.32312675557355064, + "grad_norm": 3.509061336517334, + "learning_rate": 7.728350926826194e-06, + "loss": 0.0974, + "step": 12769 + }, + { + "epoch": 0.3231520611382443, + "grad_norm": 9.070908546447754, + "learning_rate": 7.728014442129446e-06, + "loss": 0.2479, + "step": 12770 + }, + { + "epoch": 0.32317736670293795, + "grad_norm": 4.2139973640441895, + "learning_rate": 7.72767793984024e-06, + "loss": 0.1348, + "step": 12771 + }, + { + "epoch": 0.32320267226763166, + "grad_norm": 12.651687622070312, + "learning_rate": 7.727341419960745e-06, + "loss": 0.2837, + "step": 12772 + }, + { + "epoch": 0.3232279778323253, + "grad_norm": 7.211435317993164, + "learning_rate": 7.727004882493133e-06, + "loss": 0.1879, + "step": 12773 + }, + { + "epoch": 0.32325328339701903, + "grad_norm": 4.423880100250244, + "learning_rate": 7.726668327439575e-06, + "loss": 0.1495, + "step": 12774 + }, + { + "epoch": 0.3232785889617127, + "grad_norm": 3.2781732082366943, + "learning_rate": 7.726331754802239e-06, + "loss": 0.1564, + "step": 12775 + }, + { + "epoch": 0.32330389452640634, + "grad_norm": 3.9626381397247314, + "learning_rate": 7.725995164583296e-06, + "loss": 0.1715, + "step": 12776 + }, + { + "epoch": 0.32332920009110006, + "grad_norm": 8.15209674835205, + "learning_rate": 7.725658556784919e-06, + "loss": 0.1279, + "step": 12777 + }, + { + "epoch": 0.3233545056557937, + "grad_norm": 4.6219587326049805, + "learning_rate": 7.725321931409275e-06, + "loss": 0.2198, + "step": 12778 + }, + { + "epoch": 0.32337981122048737, + "grad_norm": 3.6394801139831543, + "learning_rate": 7.72498528845854e-06, + "loss": 0.1261, + "step": 12779 + }, + { + "epoch": 0.3234051167851811, + "grad_norm": 23.421985626220703, + "learning_rate": 7.724648627934879e-06, + "loss": 0.1814, + "step": 12780 + }, + { + "epoch": 0.32343042234987474, + "grad_norm": 2.584982395172119, + "learning_rate": 7.724311949840467e-06, + "loss": 0.1217, + "step": 12781 + }, + { + "epoch": 0.3234557279145684, + "grad_norm": 4.359592437744141, + "learning_rate": 7.723975254177473e-06, + "loss": 0.1287, + "step": 12782 + }, + { + "epoch": 0.3234810334792621, + "grad_norm": 5.28851842880249, + "learning_rate": 7.72363854094807e-06, + "loss": 0.1692, + "step": 12783 + }, + { + "epoch": 0.32350633904395576, + "grad_norm": 5.742042064666748, + "learning_rate": 7.72330181015443e-06, + "loss": 0.2258, + "step": 12784 + }, + { + "epoch": 0.3235316446086494, + "grad_norm": 8.711884498596191, + "learning_rate": 7.722965061798722e-06, + "loss": 0.2298, + "step": 12785 + }, + { + "epoch": 0.3235569501733431, + "grad_norm": 11.066028594970703, + "learning_rate": 7.722628295883116e-06, + "loss": 0.245, + "step": 12786 + }, + { + "epoch": 0.3235822557380368, + "grad_norm": 8.342145919799805, + "learning_rate": 7.722291512409789e-06, + "loss": 0.1819, + "step": 12787 + }, + { + "epoch": 0.3236075613027305, + "grad_norm": 5.55684232711792, + "learning_rate": 7.721954711380913e-06, + "loss": 0.1788, + "step": 12788 + }, + { + "epoch": 0.32363286686742415, + "grad_norm": 8.22565746307373, + "learning_rate": 7.721617892798655e-06, + "loss": 0.1646, + "step": 12789 + }, + { + "epoch": 0.3236581724321178, + "grad_norm": 3.078636407852173, + "learning_rate": 7.72128105666519e-06, + "loss": 0.1394, + "step": 12790 + }, + { + "epoch": 0.3236834779968115, + "grad_norm": 3.910337448120117, + "learning_rate": 7.720944202982687e-06, + "loss": 0.1237, + "step": 12791 + }, + { + "epoch": 0.3237087835615052, + "grad_norm": 11.838135719299316, + "learning_rate": 7.720607331753325e-06, + "loss": 0.3148, + "step": 12792 + }, + { + "epoch": 0.32373408912619883, + "grad_norm": 4.717576503753662, + "learning_rate": 7.720270442979269e-06, + "loss": 0.2252, + "step": 12793 + }, + { + "epoch": 0.32375939469089254, + "grad_norm": 9.238871574401855, + "learning_rate": 7.719933536662697e-06, + "loss": 0.2387, + "step": 12794 + }, + { + "epoch": 0.3237847002555862, + "grad_norm": 3.618093252182007, + "learning_rate": 7.719596612805778e-06, + "loss": 0.2086, + "step": 12795 + }, + { + "epoch": 0.32381000582027986, + "grad_norm": 6.049127578735352, + "learning_rate": 7.719259671410688e-06, + "loss": 0.1809, + "step": 12796 + }, + { + "epoch": 0.32383531138497357, + "grad_norm": 3.408630609512329, + "learning_rate": 7.718922712479597e-06, + "loss": 0.1133, + "step": 12797 + }, + { + "epoch": 0.3238606169496672, + "grad_norm": 6.299231052398682, + "learning_rate": 7.71858573601468e-06, + "loss": 0.2204, + "step": 12798 + }, + { + "epoch": 0.32388592251436094, + "grad_norm": 7.226955413818359, + "learning_rate": 7.718248742018109e-06, + "loss": 0.2721, + "step": 12799 + }, + { + "epoch": 0.3239112280790546, + "grad_norm": 5.710111618041992, + "learning_rate": 7.717911730492059e-06, + "loss": 0.1984, + "step": 12800 + }, + { + "epoch": 0.32393653364374825, + "grad_norm": 6.277602195739746, + "learning_rate": 7.7175747014387e-06, + "loss": 0.1574, + "step": 12801 + }, + { + "epoch": 0.32396183920844196, + "grad_norm": 4.8899078369140625, + "learning_rate": 7.717237654860207e-06, + "loss": 0.236, + "step": 12802 + }, + { + "epoch": 0.3239871447731356, + "grad_norm": 7.763626575469971, + "learning_rate": 7.716900590758754e-06, + "loss": 0.2096, + "step": 12803 + }, + { + "epoch": 0.32401245033782927, + "grad_norm": 4.740170001983643, + "learning_rate": 7.716563509136516e-06, + "loss": 0.175, + "step": 12804 + }, + { + "epoch": 0.324037755902523, + "grad_norm": 7.103297233581543, + "learning_rate": 7.716226409995665e-06, + "loss": 0.1845, + "step": 12805 + }, + { + "epoch": 0.32406306146721664, + "grad_norm": 4.015678882598877, + "learning_rate": 7.715889293338372e-06, + "loss": 0.1537, + "step": 12806 + }, + { + "epoch": 0.3240883670319103, + "grad_norm": 7.034729957580566, + "learning_rate": 7.715552159166816e-06, + "loss": 0.1231, + "step": 12807 + }, + { + "epoch": 0.324113672596604, + "grad_norm": 4.6808929443359375, + "learning_rate": 7.71521500748317e-06, + "loss": 0.1967, + "step": 12808 + }, + { + "epoch": 0.32413897816129766, + "grad_norm": 17.995485305786133, + "learning_rate": 7.714877838289607e-06, + "loss": 0.3076, + "step": 12809 + }, + { + "epoch": 0.3241642837259913, + "grad_norm": 6.707090854644775, + "learning_rate": 7.714540651588302e-06, + "loss": 0.2411, + "step": 12810 + }, + { + "epoch": 0.32418958929068503, + "grad_norm": 3.678947687149048, + "learning_rate": 7.71420344738143e-06, + "loss": 0.0645, + "step": 12811 + }, + { + "epoch": 0.3242148948553787, + "grad_norm": 3.352808952331543, + "learning_rate": 7.713866225671164e-06, + "loss": 0.154, + "step": 12812 + }, + { + "epoch": 0.3242402004200724, + "grad_norm": 2.6188199520111084, + "learning_rate": 7.71352898645968e-06, + "loss": 0.1555, + "step": 12813 + }, + { + "epoch": 0.32426550598476606, + "grad_norm": 3.920567035675049, + "learning_rate": 7.713191729749152e-06, + "loss": 0.1211, + "step": 12814 + }, + { + "epoch": 0.3242908115494597, + "grad_norm": 3.1293833255767822, + "learning_rate": 7.712854455541754e-06, + "loss": 0.0996, + "step": 12815 + }, + { + "epoch": 0.3243161171141534, + "grad_norm": 10.182207107543945, + "learning_rate": 7.712517163839663e-06, + "loss": 0.2903, + "step": 12816 + }, + { + "epoch": 0.3243414226788471, + "grad_norm": 7.187110424041748, + "learning_rate": 7.712179854645054e-06, + "loss": 0.1504, + "step": 12817 + }, + { + "epoch": 0.32436672824354074, + "grad_norm": 5.687452793121338, + "learning_rate": 7.711842527960101e-06, + "loss": 0.2485, + "step": 12818 + }, + { + "epoch": 0.32439203380823445, + "grad_norm": 3.2260992527008057, + "learning_rate": 7.711505183786982e-06, + "loss": 0.0794, + "step": 12819 + }, + { + "epoch": 0.3244173393729281, + "grad_norm": 10.603902816772461, + "learning_rate": 7.711167822127868e-06, + "loss": 0.2148, + "step": 12820 + }, + { + "epoch": 0.32444264493762176, + "grad_norm": 4.209174156188965, + "learning_rate": 7.710830442984938e-06, + "loss": 0.1319, + "step": 12821 + }, + { + "epoch": 0.32446795050231547, + "grad_norm": 9.105396270751953, + "learning_rate": 7.710493046360367e-06, + "loss": 0.2133, + "step": 12822 + }, + { + "epoch": 0.32449325606700913, + "grad_norm": 9.75525188446045, + "learning_rate": 7.71015563225633e-06, + "loss": 0.2279, + "step": 12823 + }, + { + "epoch": 0.3245185616317028, + "grad_norm": 6.466670513153076, + "learning_rate": 7.709818200675003e-06, + "loss": 0.2263, + "step": 12824 + }, + { + "epoch": 0.3245438671963965, + "grad_norm": 5.06690788269043, + "learning_rate": 7.709480751618563e-06, + "loss": 0.1536, + "step": 12825 + }, + { + "epoch": 0.32456917276109015, + "grad_norm": 70.12763214111328, + "learning_rate": 7.709143285089187e-06, + "loss": 0.5528, + "step": 12826 + }, + { + "epoch": 0.32459447832578386, + "grad_norm": 3.3747448921203613, + "learning_rate": 7.708805801089047e-06, + "loss": 0.1529, + "step": 12827 + }, + { + "epoch": 0.3246197838904775, + "grad_norm": 6.766327857971191, + "learning_rate": 7.708468299620324e-06, + "loss": 0.2339, + "step": 12828 + }, + { + "epoch": 0.3246450894551712, + "grad_norm": 4.963756561279297, + "learning_rate": 7.708130780685192e-06, + "loss": 0.1425, + "step": 12829 + }, + { + "epoch": 0.3246703950198649, + "grad_norm": 4.93489408493042, + "learning_rate": 7.70779324428583e-06, + "loss": 0.1364, + "step": 12830 + }, + { + "epoch": 0.32469570058455854, + "grad_norm": 6.255474090576172, + "learning_rate": 7.707455690424413e-06, + "loss": 0.1646, + "step": 12831 + }, + { + "epoch": 0.3247210061492522, + "grad_norm": 5.981197357177734, + "learning_rate": 7.707118119103116e-06, + "loss": 0.2793, + "step": 12832 + }, + { + "epoch": 0.3247463117139459, + "grad_norm": 6.663332462310791, + "learning_rate": 7.706780530324119e-06, + "loss": 0.1938, + "step": 12833 + }, + { + "epoch": 0.32477161727863957, + "grad_norm": 7.351893424987793, + "learning_rate": 7.706442924089597e-06, + "loss": 0.2264, + "step": 12834 + }, + { + "epoch": 0.3247969228433332, + "grad_norm": 4.856276035308838, + "learning_rate": 7.70610530040173e-06, + "loss": 0.2047, + "step": 12835 + }, + { + "epoch": 0.32482222840802694, + "grad_norm": 5.046750545501709, + "learning_rate": 7.70576765926269e-06, + "loss": 0.1993, + "step": 12836 + }, + { + "epoch": 0.3248475339727206, + "grad_norm": 4.374218940734863, + "learning_rate": 7.705430000674661e-06, + "loss": 0.1848, + "step": 12837 + }, + { + "epoch": 0.3248728395374143, + "grad_norm": 4.507775783538818, + "learning_rate": 7.705092324639815e-06, + "loss": 0.1464, + "step": 12838 + }, + { + "epoch": 0.32489814510210796, + "grad_norm": 4.36163330078125, + "learning_rate": 7.704754631160333e-06, + "loss": 0.1345, + "step": 12839 + }, + { + "epoch": 0.3249234506668016, + "grad_norm": 3.655473232269287, + "learning_rate": 7.70441692023839e-06, + "loss": 0.2429, + "step": 12840 + }, + { + "epoch": 0.32494875623149533, + "grad_norm": 3.2927491664886475, + "learning_rate": 7.704079191876167e-06, + "loss": 0.1652, + "step": 12841 + }, + { + "epoch": 0.324974061796189, + "grad_norm": 8.366461753845215, + "learning_rate": 7.703741446075838e-06, + "loss": 0.2082, + "step": 12842 + }, + { + "epoch": 0.32499936736088264, + "grad_norm": 7.975167751312256, + "learning_rate": 7.703403682839588e-06, + "loss": 0.3014, + "step": 12843 + }, + { + "epoch": 0.32502467292557635, + "grad_norm": 4.747790813446045, + "learning_rate": 7.703065902169586e-06, + "loss": 0.1638, + "step": 12844 + }, + { + "epoch": 0.32504997849027, + "grad_norm": 4.612624168395996, + "learning_rate": 7.702728104068016e-06, + "loss": 0.1637, + "step": 12845 + }, + { + "epoch": 0.32507528405496366, + "grad_norm": 4.758296966552734, + "learning_rate": 7.702390288537056e-06, + "loss": 0.1621, + "step": 12846 + }, + { + "epoch": 0.3251005896196574, + "grad_norm": 4.363203048706055, + "learning_rate": 7.702052455578884e-06, + "loss": 0.1623, + "step": 12847 + }, + { + "epoch": 0.32512589518435103, + "grad_norm": 3.376725912094116, + "learning_rate": 7.701714605195677e-06, + "loss": 0.1062, + "step": 12848 + }, + { + "epoch": 0.3251512007490447, + "grad_norm": 4.0224103927612305, + "learning_rate": 7.701376737389616e-06, + "loss": 0.1383, + "step": 12849 + }, + { + "epoch": 0.3251765063137384, + "grad_norm": 7.638915061950684, + "learning_rate": 7.701038852162877e-06, + "loss": 0.1932, + "step": 12850 + }, + { + "epoch": 0.32520181187843206, + "grad_norm": 8.71313190460205, + "learning_rate": 7.700700949517643e-06, + "loss": 0.1657, + "step": 12851 + }, + { + "epoch": 0.32522711744312577, + "grad_norm": 4.106019973754883, + "learning_rate": 7.700363029456091e-06, + "loss": 0.199, + "step": 12852 + }, + { + "epoch": 0.3252524230078194, + "grad_norm": 3.406608819961548, + "learning_rate": 7.7000250919804e-06, + "loss": 0.1409, + "step": 12853 + }, + { + "epoch": 0.3252777285725131, + "grad_norm": 2.3551201820373535, + "learning_rate": 7.699687137092748e-06, + "loss": 0.0902, + "step": 12854 + }, + { + "epoch": 0.3253030341372068, + "grad_norm": 5.700803756713867, + "learning_rate": 7.699349164795318e-06, + "loss": 0.1669, + "step": 12855 + }, + { + "epoch": 0.32532833970190045, + "grad_norm": 5.720608711242676, + "learning_rate": 7.699011175090286e-06, + "loss": 0.2246, + "step": 12856 + }, + { + "epoch": 0.3253536452665941, + "grad_norm": 5.718557357788086, + "learning_rate": 7.698673167979833e-06, + "loss": 0.1684, + "step": 12857 + }, + { + "epoch": 0.3253789508312878, + "grad_norm": 4.8211541175842285, + "learning_rate": 7.698335143466139e-06, + "loss": 0.1563, + "step": 12858 + }, + { + "epoch": 0.3254042563959815, + "grad_norm": 8.774637222290039, + "learning_rate": 7.697997101551381e-06, + "loss": 0.2399, + "step": 12859 + }, + { + "epoch": 0.32542956196067513, + "grad_norm": 10.726341247558594, + "learning_rate": 7.697659042237746e-06, + "loss": 0.2061, + "step": 12860 + }, + { + "epoch": 0.32545486752536884, + "grad_norm": 5.018823146820068, + "learning_rate": 7.697320965527408e-06, + "loss": 0.1282, + "step": 12861 + }, + { + "epoch": 0.3254801730900625, + "grad_norm": 5.860963821411133, + "learning_rate": 7.696982871422546e-06, + "loss": 0.1825, + "step": 12862 + }, + { + "epoch": 0.3255054786547562, + "grad_norm": 3.0467844009399414, + "learning_rate": 7.696644759925347e-06, + "loss": 0.0904, + "step": 12863 + }, + { + "epoch": 0.32553078421944986, + "grad_norm": 5.120993614196777, + "learning_rate": 7.696306631037986e-06, + "loss": 0.2367, + "step": 12864 + }, + { + "epoch": 0.3255560897841435, + "grad_norm": 5.602052211761475, + "learning_rate": 7.695968484762646e-06, + "loss": 0.2434, + "step": 12865 + }, + { + "epoch": 0.32558139534883723, + "grad_norm": 3.3592727184295654, + "learning_rate": 7.695630321101507e-06, + "loss": 0.1223, + "step": 12866 + }, + { + "epoch": 0.3256067009135309, + "grad_norm": 5.876312255859375, + "learning_rate": 7.695292140056748e-06, + "loss": 0.2033, + "step": 12867 + }, + { + "epoch": 0.32563200647822454, + "grad_norm": 3.2555744647979736, + "learning_rate": 7.69495394163055e-06, + "loss": 0.1287, + "step": 12868 + }, + { + "epoch": 0.32565731204291826, + "grad_norm": 51.70414352416992, + "learning_rate": 7.6946157258251e-06, + "loss": 0.1974, + "step": 12869 + }, + { + "epoch": 0.3256826176076119, + "grad_norm": 13.243642807006836, + "learning_rate": 7.69427749264257e-06, + "loss": 0.2414, + "step": 12870 + }, + { + "epoch": 0.32570792317230557, + "grad_norm": 4.2905378341674805, + "learning_rate": 7.693939242085146e-06, + "loss": 0.1913, + "step": 12871 + }, + { + "epoch": 0.3257332287369993, + "grad_norm": 8.304967880249023, + "learning_rate": 7.69360097415501e-06, + "loss": 0.2855, + "step": 12872 + }, + { + "epoch": 0.32575853430169294, + "grad_norm": 7.6933369636535645, + "learning_rate": 7.693262688854343e-06, + "loss": 0.1898, + "step": 12873 + }, + { + "epoch": 0.3257838398663866, + "grad_norm": 5.094778537750244, + "learning_rate": 7.692924386185326e-06, + "loss": 0.1267, + "step": 12874 + }, + { + "epoch": 0.3258091454310803, + "grad_norm": 18.72052001953125, + "learning_rate": 7.692586066150139e-06, + "loss": 0.3157, + "step": 12875 + }, + { + "epoch": 0.32583445099577396, + "grad_norm": 5.666948318481445, + "learning_rate": 7.692247728750969e-06, + "loss": 0.1246, + "step": 12876 + }, + { + "epoch": 0.3258597565604677, + "grad_norm": 6.311001300811768, + "learning_rate": 7.69190937398999e-06, + "loss": 0.1749, + "step": 12877 + }, + { + "epoch": 0.32588506212516133, + "grad_norm": 24.48395538330078, + "learning_rate": 7.691571001869388e-06, + "loss": 0.2516, + "step": 12878 + }, + { + "epoch": 0.325910367689855, + "grad_norm": 6.099520683288574, + "learning_rate": 7.691232612391349e-06, + "loss": 0.2144, + "step": 12879 + }, + { + "epoch": 0.3259356732545487, + "grad_norm": 8.410412788391113, + "learning_rate": 7.690894205558049e-06, + "loss": 0.1975, + "step": 12880 + }, + { + "epoch": 0.32596097881924235, + "grad_norm": 5.66348123550415, + "learning_rate": 7.690555781371672e-06, + "loss": 0.1764, + "step": 12881 + }, + { + "epoch": 0.325986284383936, + "grad_norm": 4.150737762451172, + "learning_rate": 7.690217339834402e-06, + "loss": 0.115, + "step": 12882 + }, + { + "epoch": 0.3260115899486297, + "grad_norm": 4.468952178955078, + "learning_rate": 7.689878880948423e-06, + "loss": 0.1356, + "step": 12883 + }, + { + "epoch": 0.3260368955133234, + "grad_norm": 9.646047592163086, + "learning_rate": 7.689540404715914e-06, + "loss": 0.2394, + "step": 12884 + }, + { + "epoch": 0.32606220107801703, + "grad_norm": 5.777393341064453, + "learning_rate": 7.689201911139059e-06, + "loss": 0.1717, + "step": 12885 + }, + { + "epoch": 0.32608750664271074, + "grad_norm": 5.359890460968018, + "learning_rate": 7.688863400220041e-06, + "loss": 0.2016, + "step": 12886 + }, + { + "epoch": 0.3261128122074044, + "grad_norm": 12.062003135681152, + "learning_rate": 7.688524871961042e-06, + "loss": 0.1587, + "step": 12887 + }, + { + "epoch": 0.32613811777209806, + "grad_norm": 4.130220890045166, + "learning_rate": 7.688186326364248e-06, + "loss": 0.1663, + "step": 12888 + }, + { + "epoch": 0.32616342333679177, + "grad_norm": 5.756045818328857, + "learning_rate": 7.687847763431839e-06, + "loss": 0.1941, + "step": 12889 + }, + { + "epoch": 0.3261887289014854, + "grad_norm": 4.072643280029297, + "learning_rate": 7.687509183166001e-06, + "loss": 0.1903, + "step": 12890 + }, + { + "epoch": 0.32621403446617914, + "grad_norm": 4.4787821769714355, + "learning_rate": 7.687170585568916e-06, + "loss": 0.1588, + "step": 12891 + }, + { + "epoch": 0.3262393400308728, + "grad_norm": 4.968559741973877, + "learning_rate": 7.686831970642768e-06, + "loss": 0.1985, + "step": 12892 + }, + { + "epoch": 0.32626464559556645, + "grad_norm": 10.887052536010742, + "learning_rate": 7.68649333838974e-06, + "loss": 0.273, + "step": 12893 + }, + { + "epoch": 0.32628995116026016, + "grad_norm": 12.17192554473877, + "learning_rate": 7.686154688812016e-06, + "loss": 0.2381, + "step": 12894 + }, + { + "epoch": 0.3263152567249538, + "grad_norm": 4.1734700202941895, + "learning_rate": 7.685816021911779e-06, + "loss": 0.1802, + "step": 12895 + }, + { + "epoch": 0.3263405622896475, + "grad_norm": 8.194321632385254, + "learning_rate": 7.685477337691216e-06, + "loss": 0.2688, + "step": 12896 + }, + { + "epoch": 0.3263658678543412, + "grad_norm": 25.41254997253418, + "learning_rate": 7.685138636152507e-06, + "loss": 0.248, + "step": 12897 + }, + { + "epoch": 0.32639117341903484, + "grad_norm": 10.116728782653809, + "learning_rate": 7.68479991729784e-06, + "loss": 0.1815, + "step": 12898 + }, + { + "epoch": 0.3264164789837285, + "grad_norm": 2.8847146034240723, + "learning_rate": 7.684461181129397e-06, + "loss": 0.147, + "step": 12899 + }, + { + "epoch": 0.3264417845484222, + "grad_norm": 9.671289443969727, + "learning_rate": 7.684122427649366e-06, + "loss": 0.2789, + "step": 12900 + }, + { + "epoch": 0.32646709011311587, + "grad_norm": 2.9100046157836914, + "learning_rate": 7.683783656859925e-06, + "loss": 0.1363, + "step": 12901 + }, + { + "epoch": 0.3264923956778096, + "grad_norm": 10.232650756835938, + "learning_rate": 7.683444868763264e-06, + "loss": 0.2449, + "step": 12902 + }, + { + "epoch": 0.32651770124250323, + "grad_norm": 6.590419769287109, + "learning_rate": 7.683106063361567e-06, + "loss": 0.3407, + "step": 12903 + }, + { + "epoch": 0.3265430068071969, + "grad_norm": 6.870750904083252, + "learning_rate": 7.682767240657016e-06, + "loss": 0.2173, + "step": 12904 + }, + { + "epoch": 0.3265683123718906, + "grad_norm": 3.0080225467681885, + "learning_rate": 7.6824284006518e-06, + "loss": 0.116, + "step": 12905 + }, + { + "epoch": 0.32659361793658426, + "grad_norm": 4.475757122039795, + "learning_rate": 7.682089543348102e-06, + "loss": 0.1666, + "step": 12906 + }, + { + "epoch": 0.3266189235012779, + "grad_norm": 7.0129265785217285, + "learning_rate": 7.681750668748107e-06, + "loss": 0.1303, + "step": 12907 + }, + { + "epoch": 0.3266442290659716, + "grad_norm": 11.080130577087402, + "learning_rate": 7.681411776854e-06, + "loss": 0.2423, + "step": 12908 + }, + { + "epoch": 0.3266695346306653, + "grad_norm": 4.156808376312256, + "learning_rate": 7.681072867667968e-06, + "loss": 0.1221, + "step": 12909 + }, + { + "epoch": 0.32669484019535894, + "grad_norm": 4.915095329284668, + "learning_rate": 7.680733941192195e-06, + "loss": 0.1729, + "step": 12910 + }, + { + "epoch": 0.32672014576005265, + "grad_norm": 2.7499310970306396, + "learning_rate": 7.680394997428869e-06, + "loss": 0.1201, + "step": 12911 + }, + { + "epoch": 0.3267454513247463, + "grad_norm": 4.131078243255615, + "learning_rate": 7.680056036380173e-06, + "loss": 0.1791, + "step": 12912 + }, + { + "epoch": 0.32677075688943996, + "grad_norm": 3.1553702354431152, + "learning_rate": 7.679717058048295e-06, + "loss": 0.1561, + "step": 12913 + }, + { + "epoch": 0.3267960624541337, + "grad_norm": 11.989316940307617, + "learning_rate": 7.679378062435421e-06, + "loss": 0.2493, + "step": 12914 + }, + { + "epoch": 0.32682136801882733, + "grad_norm": 2.455745220184326, + "learning_rate": 7.679039049543733e-06, + "loss": 0.1382, + "step": 12915 + }, + { + "epoch": 0.32684667358352104, + "grad_norm": 2.5544493198394775, + "learning_rate": 7.678700019375425e-06, + "loss": 0.1457, + "step": 12916 + }, + { + "epoch": 0.3268719791482147, + "grad_norm": 4.772746562957764, + "learning_rate": 7.678360971932675e-06, + "loss": 0.1672, + "step": 12917 + }, + { + "epoch": 0.32689728471290835, + "grad_norm": 13.182934761047363, + "learning_rate": 7.678021907217675e-06, + "loss": 0.2041, + "step": 12918 + }, + { + "epoch": 0.32692259027760207, + "grad_norm": 2.1206886768341064, + "learning_rate": 7.67768282523261e-06, + "loss": 0.0689, + "step": 12919 + }, + { + "epoch": 0.3269478958422957, + "grad_norm": 5.467552185058594, + "learning_rate": 7.677343725979667e-06, + "loss": 0.1714, + "step": 12920 + }, + { + "epoch": 0.3269732014069894, + "grad_norm": 16.37981414794922, + "learning_rate": 7.67700460946103e-06, + "loss": 0.211, + "step": 12921 + }, + { + "epoch": 0.3269985069716831, + "grad_norm": 4.9153947830200195, + "learning_rate": 7.67666547567889e-06, + "loss": 0.1764, + "step": 12922 + }, + { + "epoch": 0.32702381253637675, + "grad_norm": 11.873151779174805, + "learning_rate": 7.676326324635432e-06, + "loss": 0.2733, + "step": 12923 + }, + { + "epoch": 0.3270491181010704, + "grad_norm": 8.389345169067383, + "learning_rate": 7.675987156332844e-06, + "loss": 0.2535, + "step": 12924 + }, + { + "epoch": 0.3270744236657641, + "grad_norm": 11.695171356201172, + "learning_rate": 7.67564797077331e-06, + "loss": 0.2004, + "step": 12925 + }, + { + "epoch": 0.32709972923045777, + "grad_norm": 5.656132698059082, + "learning_rate": 7.675308767959023e-06, + "loss": 0.1894, + "step": 12926 + }, + { + "epoch": 0.3271250347951515, + "grad_norm": 3.624652862548828, + "learning_rate": 7.674969547892166e-06, + "loss": 0.1553, + "step": 12927 + }, + { + "epoch": 0.32715034035984514, + "grad_norm": 4.83629846572876, + "learning_rate": 7.674630310574928e-06, + "loss": 0.1738, + "step": 12928 + }, + { + "epoch": 0.3271756459245388, + "grad_norm": 5.078412055969238, + "learning_rate": 7.674291056009497e-06, + "loss": 0.2005, + "step": 12929 + }, + { + "epoch": 0.3272009514892325, + "grad_norm": 6.970969200134277, + "learning_rate": 7.67395178419806e-06, + "loss": 0.1499, + "step": 12930 + }, + { + "epoch": 0.32722625705392616, + "grad_norm": 11.206748962402344, + "learning_rate": 7.673612495142806e-06, + "loss": 0.3071, + "step": 12931 + }, + { + "epoch": 0.3272515626186198, + "grad_norm": 7.944115161895752, + "learning_rate": 7.673273188845922e-06, + "loss": 0.1317, + "step": 12932 + }, + { + "epoch": 0.32727686818331353, + "grad_norm": 10.119227409362793, + "learning_rate": 7.672933865309597e-06, + "loss": 0.2299, + "step": 12933 + }, + { + "epoch": 0.3273021737480072, + "grad_norm": 3.7449278831481934, + "learning_rate": 7.672594524536019e-06, + "loss": 0.1567, + "step": 12934 + }, + { + "epoch": 0.32732747931270084, + "grad_norm": 3.660634994506836, + "learning_rate": 7.672255166527375e-06, + "loss": 0.1341, + "step": 12935 + }, + { + "epoch": 0.32735278487739455, + "grad_norm": 4.735842227935791, + "learning_rate": 7.671915791285853e-06, + "loss": 0.2339, + "step": 12936 + }, + { + "epoch": 0.3273780904420882, + "grad_norm": 3.043344736099243, + "learning_rate": 7.671576398813644e-06, + "loss": 0.1566, + "step": 12937 + }, + { + "epoch": 0.32740339600678187, + "grad_norm": 3.9285671710968018, + "learning_rate": 7.671236989112938e-06, + "loss": 0.1202, + "step": 12938 + }, + { + "epoch": 0.3274287015714756, + "grad_norm": 11.286105155944824, + "learning_rate": 7.67089756218592e-06, + "loss": 0.2165, + "step": 12939 + }, + { + "epoch": 0.32745400713616923, + "grad_norm": 8.665923118591309, + "learning_rate": 7.67055811803478e-06, + "loss": 0.1556, + "step": 12940 + }, + { + "epoch": 0.32747931270086295, + "grad_norm": 6.329555511474609, + "learning_rate": 7.67021865666171e-06, + "loss": 0.2909, + "step": 12941 + }, + { + "epoch": 0.3275046182655566, + "grad_norm": 4.459111213684082, + "learning_rate": 7.669879178068892e-06, + "loss": 0.1613, + "step": 12942 + }, + { + "epoch": 0.32752992383025026, + "grad_norm": 4.770596027374268, + "learning_rate": 7.669539682258525e-06, + "loss": 0.2475, + "step": 12943 + }, + { + "epoch": 0.32755522939494397, + "grad_norm": 5.318233489990234, + "learning_rate": 7.66920016923279e-06, + "loss": 0.1558, + "step": 12944 + }, + { + "epoch": 0.3275805349596376, + "grad_norm": 4.60222864151001, + "learning_rate": 7.66886063899388e-06, + "loss": 0.119, + "step": 12945 + }, + { + "epoch": 0.3276058405243313, + "grad_norm": 8.314388275146484, + "learning_rate": 7.668521091543984e-06, + "loss": 0.1838, + "step": 12946 + }, + { + "epoch": 0.327631146089025, + "grad_norm": 3.4694087505340576, + "learning_rate": 7.668181526885293e-06, + "loss": 0.1213, + "step": 12947 + }, + { + "epoch": 0.32765645165371865, + "grad_norm": 4.040299415588379, + "learning_rate": 7.667841945019996e-06, + "loss": 0.2317, + "step": 12948 + }, + { + "epoch": 0.3276817572184123, + "grad_norm": 4.2755889892578125, + "learning_rate": 7.667502345950281e-06, + "loss": 0.1944, + "step": 12949 + }, + { + "epoch": 0.327707062783106, + "grad_norm": 4.1866044998168945, + "learning_rate": 7.66716272967834e-06, + "loss": 0.162, + "step": 12950 + }, + { + "epoch": 0.3277323683477997, + "grad_norm": 6.3819122314453125, + "learning_rate": 7.666823096206362e-06, + "loss": 0.1131, + "step": 12951 + }, + { + "epoch": 0.32775767391249333, + "grad_norm": 4.2771406173706055, + "learning_rate": 7.666483445536539e-06, + "loss": 0.1484, + "step": 12952 + }, + { + "epoch": 0.32778297947718704, + "grad_norm": 2.9221534729003906, + "learning_rate": 7.66614377767106e-06, + "loss": 0.0892, + "step": 12953 + }, + { + "epoch": 0.3278082850418807, + "grad_norm": 10.070229530334473, + "learning_rate": 7.665804092612114e-06, + "loss": 0.1786, + "step": 12954 + }, + { + "epoch": 0.3278335906065744, + "grad_norm": 2.967841148376465, + "learning_rate": 7.665464390361896e-06, + "loss": 0.1598, + "step": 12955 + }, + { + "epoch": 0.32785889617126807, + "grad_norm": 14.705224990844727, + "learning_rate": 7.665124670922593e-06, + "loss": 0.2846, + "step": 12956 + }, + { + "epoch": 0.3278842017359617, + "grad_norm": 4.676906108856201, + "learning_rate": 7.664784934296397e-06, + "loss": 0.1989, + "step": 12957 + }, + { + "epoch": 0.32790950730065543, + "grad_norm": 2.61537766456604, + "learning_rate": 7.664445180485498e-06, + "loss": 0.0981, + "step": 12958 + }, + { + "epoch": 0.3279348128653491, + "grad_norm": 6.947556972503662, + "learning_rate": 7.664105409492087e-06, + "loss": 0.1828, + "step": 12959 + }, + { + "epoch": 0.32796011843004275, + "grad_norm": 3.307847261428833, + "learning_rate": 7.663765621318357e-06, + "loss": 0.1487, + "step": 12960 + }, + { + "epoch": 0.32798542399473646, + "grad_norm": 6.433609962463379, + "learning_rate": 7.663425815966498e-06, + "loss": 0.1533, + "step": 12961 + }, + { + "epoch": 0.3280107295594301, + "grad_norm": 5.404277324676514, + "learning_rate": 7.663085993438701e-06, + "loss": 0.1522, + "step": 12962 + }, + { + "epoch": 0.32803603512412377, + "grad_norm": 3.057248592376709, + "learning_rate": 7.662746153737157e-06, + "loss": 0.1346, + "step": 12963 + }, + { + "epoch": 0.3280613406888175, + "grad_norm": 18.37408447265625, + "learning_rate": 7.66240629686406e-06, + "loss": 0.2251, + "step": 12964 + }, + { + "epoch": 0.32808664625351114, + "grad_norm": 5.571263313293457, + "learning_rate": 7.6620664228216e-06, + "loss": 0.1518, + "step": 12965 + }, + { + "epoch": 0.32811195181820485, + "grad_norm": 4.034793853759766, + "learning_rate": 7.661726531611968e-06, + "loss": 0.1372, + "step": 12966 + }, + { + "epoch": 0.3281372573828985, + "grad_norm": 8.783512115478516, + "learning_rate": 7.661386623237357e-06, + "loss": 0.2031, + "step": 12967 + }, + { + "epoch": 0.32816256294759216, + "grad_norm": 3.3676505088806152, + "learning_rate": 7.661046697699957e-06, + "loss": 0.143, + "step": 12968 + }, + { + "epoch": 0.3281878685122859, + "grad_norm": 3.9186465740203857, + "learning_rate": 7.660706755001965e-06, + "loss": 0.1151, + "step": 12969 + }, + { + "epoch": 0.32821317407697953, + "grad_norm": 5.327577114105225, + "learning_rate": 7.660366795145568e-06, + "loss": 0.2077, + "step": 12970 + }, + { + "epoch": 0.3282384796416732, + "grad_norm": 5.199295997619629, + "learning_rate": 7.66002681813296e-06, + "loss": 0.2102, + "step": 12971 + }, + { + "epoch": 0.3282637852063669, + "grad_norm": 6.036950588226318, + "learning_rate": 7.659686823966335e-06, + "loss": 0.1323, + "step": 12972 + }, + { + "epoch": 0.32828909077106055, + "grad_norm": 3.425966501235962, + "learning_rate": 7.659346812647884e-06, + "loss": 0.138, + "step": 12973 + }, + { + "epoch": 0.3283143963357542, + "grad_norm": 7.84598970413208, + "learning_rate": 7.659006784179801e-06, + "loss": 0.2061, + "step": 12974 + }, + { + "epoch": 0.3283397019004479, + "grad_norm": 4.899274826049805, + "learning_rate": 7.658666738564278e-06, + "loss": 0.1934, + "step": 12975 + }, + { + "epoch": 0.3283650074651416, + "grad_norm": 4.147278308868408, + "learning_rate": 7.658326675803506e-06, + "loss": 0.102, + "step": 12976 + }, + { + "epoch": 0.32839031302983523, + "grad_norm": 3.8444597721099854, + "learning_rate": 7.65798659589968e-06, + "loss": 0.1397, + "step": 12977 + }, + { + "epoch": 0.32841561859452895, + "grad_norm": 6.685508728027344, + "learning_rate": 7.657646498854994e-06, + "loss": 0.2246, + "step": 12978 + }, + { + "epoch": 0.3284409241592226, + "grad_norm": 5.607038974761963, + "learning_rate": 7.65730638467164e-06, + "loss": 0.1948, + "step": 12979 + }, + { + "epoch": 0.3284662297239163, + "grad_norm": 4.945211887359619, + "learning_rate": 7.65696625335181e-06, + "loss": 0.1606, + "step": 12980 + }, + { + "epoch": 0.32849153528860997, + "grad_norm": 7.4785685539245605, + "learning_rate": 7.6566261048977e-06, + "loss": 0.2122, + "step": 12981 + }, + { + "epoch": 0.3285168408533036, + "grad_norm": 7.592663288116455, + "learning_rate": 7.656285939311502e-06, + "loss": 0.2403, + "step": 12982 + }, + { + "epoch": 0.32854214641799734, + "grad_norm": 3.566206693649292, + "learning_rate": 7.65594575659541e-06, + "loss": 0.2076, + "step": 12983 + }, + { + "epoch": 0.328567451982691, + "grad_norm": 3.010392665863037, + "learning_rate": 7.65560555675162e-06, + "loss": 0.1176, + "step": 12984 + }, + { + "epoch": 0.32859275754738465, + "grad_norm": 3.2583117485046387, + "learning_rate": 7.65526533978232e-06, + "loss": 0.1291, + "step": 12985 + }, + { + "epoch": 0.32861806311207836, + "grad_norm": 2.738710403442383, + "learning_rate": 7.65492510568971e-06, + "loss": 0.0779, + "step": 12986 + }, + { + "epoch": 0.328643368676772, + "grad_norm": 3.23233962059021, + "learning_rate": 7.65458485447598e-06, + "loss": 0.1309, + "step": 12987 + }, + { + "epoch": 0.3286686742414657, + "grad_norm": 3.7540082931518555, + "learning_rate": 7.654244586143329e-06, + "loss": 0.1405, + "step": 12988 + }, + { + "epoch": 0.3286939798061594, + "grad_norm": 9.238636016845703, + "learning_rate": 7.653904300693946e-06, + "loss": 0.1571, + "step": 12989 + }, + { + "epoch": 0.32871928537085304, + "grad_norm": 6.160572528839111, + "learning_rate": 7.65356399813003e-06, + "loss": 0.2005, + "step": 12990 + }, + { + "epoch": 0.32874459093554675, + "grad_norm": 3.3294339179992676, + "learning_rate": 7.653223678453771e-06, + "loss": 0.1284, + "step": 12991 + }, + { + "epoch": 0.3287698965002404, + "grad_norm": 9.369833946228027, + "learning_rate": 7.652883341667368e-06, + "loss": 0.3672, + "step": 12992 + }, + { + "epoch": 0.32879520206493407, + "grad_norm": 3.2200064659118652, + "learning_rate": 7.652542987773013e-06, + "loss": 0.1265, + "step": 12993 + }, + { + "epoch": 0.3288205076296278, + "grad_norm": 10.196139335632324, + "learning_rate": 7.652202616772902e-06, + "loss": 0.1478, + "step": 12994 + }, + { + "epoch": 0.32884581319432143, + "grad_norm": 4.247600078582764, + "learning_rate": 7.651862228669229e-06, + "loss": 0.1677, + "step": 12995 + }, + { + "epoch": 0.3288711187590151, + "grad_norm": 2.2910544872283936, + "learning_rate": 7.65152182346419e-06, + "loss": 0.1049, + "step": 12996 + }, + { + "epoch": 0.3288964243237088, + "grad_norm": 3.4475083351135254, + "learning_rate": 7.65118140115998e-06, + "loss": 0.1098, + "step": 12997 + }, + { + "epoch": 0.32892172988840246, + "grad_norm": 5.050864219665527, + "learning_rate": 7.650840961758794e-06, + "loss": 0.2178, + "step": 12998 + }, + { + "epoch": 0.3289470354530961, + "grad_norm": 5.587236404418945, + "learning_rate": 7.650500505262828e-06, + "loss": 0.1292, + "step": 12999 + }, + { + "epoch": 0.3289723410177898, + "grad_norm": 10.46838092803955, + "learning_rate": 7.650160031674278e-06, + "loss": 0.1657, + "step": 13000 + }, + { + "epoch": 0.3289976465824835, + "grad_norm": 9.345162391662598, + "learning_rate": 7.649819540995338e-06, + "loss": 0.1594, + "step": 13001 + }, + { + "epoch": 0.32902295214717714, + "grad_norm": 6.426964282989502, + "learning_rate": 7.649479033228206e-06, + "loss": 0.1724, + "step": 13002 + }, + { + "epoch": 0.32904825771187085, + "grad_norm": 5.490148544311523, + "learning_rate": 7.649138508375075e-06, + "loss": 0.1915, + "step": 13003 + }, + { + "epoch": 0.3290735632765645, + "grad_norm": 10.664018630981445, + "learning_rate": 7.648797966438144e-06, + "loss": 0.1326, + "step": 13004 + }, + { + "epoch": 0.3290988688412582, + "grad_norm": 4.582333564758301, + "learning_rate": 7.648457407419606e-06, + "loss": 0.1806, + "step": 13005 + }, + { + "epoch": 0.3291241744059519, + "grad_norm": 2.1572160720825195, + "learning_rate": 7.64811683132166e-06, + "loss": 0.0738, + "step": 13006 + }, + { + "epoch": 0.32914947997064553, + "grad_norm": 4.431854724884033, + "learning_rate": 7.6477762381465e-06, + "loss": 0.193, + "step": 13007 + }, + { + "epoch": 0.32917478553533924, + "grad_norm": 5.609034538269043, + "learning_rate": 7.647435627896325e-06, + "loss": 0.2175, + "step": 13008 + }, + { + "epoch": 0.3292000911000329, + "grad_norm": 3.597609043121338, + "learning_rate": 7.647095000573327e-06, + "loss": 0.1378, + "step": 13009 + }, + { + "epoch": 0.32922539666472656, + "grad_norm": 10.2607421875, + "learning_rate": 7.64675435617971e-06, + "loss": 0.175, + "step": 13010 + }, + { + "epoch": 0.32925070222942027, + "grad_norm": 5.790213108062744, + "learning_rate": 7.646413694717664e-06, + "loss": 0.1048, + "step": 13011 + }, + { + "epoch": 0.3292760077941139, + "grad_norm": 6.883979797363281, + "learning_rate": 7.646073016189389e-06, + "loss": 0.243, + "step": 13012 + }, + { + "epoch": 0.3293013133588076, + "grad_norm": 3.9501240253448486, + "learning_rate": 7.645732320597081e-06, + "loss": 0.1612, + "step": 13013 + }, + { + "epoch": 0.3293266189235013, + "grad_norm": 6.466934680938721, + "learning_rate": 7.645391607942938e-06, + "loss": 0.1354, + "step": 13014 + }, + { + "epoch": 0.32935192448819495, + "grad_norm": 5.763943672180176, + "learning_rate": 7.645050878229156e-06, + "loss": 0.1496, + "step": 13015 + }, + { + "epoch": 0.3293772300528886, + "grad_norm": 4.407000541687012, + "learning_rate": 7.644710131457933e-06, + "loss": 0.1523, + "step": 13016 + }, + { + "epoch": 0.3294025356175823, + "grad_norm": 12.801785469055176, + "learning_rate": 7.644369367631469e-06, + "loss": 0.1723, + "step": 13017 + }, + { + "epoch": 0.32942784118227597, + "grad_norm": 5.580350399017334, + "learning_rate": 7.644028586751955e-06, + "loss": 0.2135, + "step": 13018 + }, + { + "epoch": 0.3294531467469697, + "grad_norm": 5.011892318725586, + "learning_rate": 7.643687788821593e-06, + "loss": 0.1559, + "step": 13019 + }, + { + "epoch": 0.32947845231166334, + "grad_norm": 10.44852066040039, + "learning_rate": 7.643346973842581e-06, + "loss": 0.1725, + "step": 13020 + }, + { + "epoch": 0.329503757876357, + "grad_norm": 7.0286664962768555, + "learning_rate": 7.643006141817115e-06, + "loss": 0.2029, + "step": 13021 + }, + { + "epoch": 0.3295290634410507, + "grad_norm": 4.488757610321045, + "learning_rate": 7.642665292747395e-06, + "loss": 0.1762, + "step": 13022 + }, + { + "epoch": 0.32955436900574436, + "grad_norm": 4.524243354797363, + "learning_rate": 7.642324426635619e-06, + "loss": 0.1616, + "step": 13023 + }, + { + "epoch": 0.329579674570438, + "grad_norm": 9.554527282714844, + "learning_rate": 7.641983543483984e-06, + "loss": 0.3243, + "step": 13024 + }, + { + "epoch": 0.32960498013513173, + "grad_norm": 9.717704772949219, + "learning_rate": 7.641642643294687e-06, + "loss": 0.284, + "step": 13025 + }, + { + "epoch": 0.3296302856998254, + "grad_norm": 3.2034358978271484, + "learning_rate": 7.641301726069929e-06, + "loss": 0.1761, + "step": 13026 + }, + { + "epoch": 0.32965559126451904, + "grad_norm": 3.824756145477295, + "learning_rate": 7.640960791811909e-06, + "loss": 0.1553, + "step": 13027 + }, + { + "epoch": 0.32968089682921276, + "grad_norm": 3.9135525226593018, + "learning_rate": 7.640619840522823e-06, + "loss": 0.1797, + "step": 13028 + }, + { + "epoch": 0.3297062023939064, + "grad_norm": 4.516472339630127, + "learning_rate": 7.64027887220487e-06, + "loss": 0.1736, + "step": 13029 + }, + { + "epoch": 0.3297315079586001, + "grad_norm": 5.763438701629639, + "learning_rate": 7.639937886860249e-06, + "loss": 0.2549, + "step": 13030 + }, + { + "epoch": 0.3297568135232938, + "grad_norm": 9.634855270385742, + "learning_rate": 7.639596884491163e-06, + "loss": 0.2916, + "step": 13031 + }, + { + "epoch": 0.32978211908798744, + "grad_norm": 5.519450664520264, + "learning_rate": 7.639255865099804e-06, + "loss": 0.2141, + "step": 13032 + }, + { + "epoch": 0.32980742465268115, + "grad_norm": 7.006271839141846, + "learning_rate": 7.638914828688378e-06, + "loss": 0.2294, + "step": 13033 + }, + { + "epoch": 0.3298327302173748, + "grad_norm": 10.734480857849121, + "learning_rate": 7.638573775259078e-06, + "loss": 0.2172, + "step": 13034 + }, + { + "epoch": 0.32985803578206846, + "grad_norm": 4.135466575622559, + "learning_rate": 7.638232704814108e-06, + "loss": 0.1111, + "step": 13035 + }, + { + "epoch": 0.32988334134676217, + "grad_norm": 6.3524169921875, + "learning_rate": 7.637891617355666e-06, + "loss": 0.2665, + "step": 13036 + }, + { + "epoch": 0.3299086469114558, + "grad_norm": 3.712879180908203, + "learning_rate": 7.637550512885952e-06, + "loss": 0.1959, + "step": 13037 + }, + { + "epoch": 0.3299339524761495, + "grad_norm": 4.4868974685668945, + "learning_rate": 7.637209391407166e-06, + "loss": 0.2096, + "step": 13038 + }, + { + "epoch": 0.3299592580408432, + "grad_norm": 3.608374834060669, + "learning_rate": 7.636868252921506e-06, + "loss": 0.1613, + "step": 13039 + }, + { + "epoch": 0.32998456360553685, + "grad_norm": 4.4318156242370605, + "learning_rate": 7.636527097431172e-06, + "loss": 0.1839, + "step": 13040 + }, + { + "epoch": 0.3300098691702305, + "grad_norm": 14.857242584228516, + "learning_rate": 7.636185924938368e-06, + "loss": 0.2561, + "step": 13041 + }, + { + "epoch": 0.3300351747349242, + "grad_norm": 3.9901251792907715, + "learning_rate": 7.635844735445289e-06, + "loss": 0.1862, + "step": 13042 + }, + { + "epoch": 0.3300604802996179, + "grad_norm": 7.6908650398254395, + "learning_rate": 7.635503528954139e-06, + "loss": 0.2218, + "step": 13043 + }, + { + "epoch": 0.3300857858643116, + "grad_norm": 3.2043750286102295, + "learning_rate": 7.635162305467117e-06, + "loss": 0.1515, + "step": 13044 + }, + { + "epoch": 0.33011109142900524, + "grad_norm": 3.73254132270813, + "learning_rate": 7.634821064986422e-06, + "loss": 0.2136, + "step": 13045 + }, + { + "epoch": 0.3301363969936989, + "grad_norm": 6.208174705505371, + "learning_rate": 7.634479807514256e-06, + "loss": 0.1356, + "step": 13046 + }, + { + "epoch": 0.3301617025583926, + "grad_norm": 5.133847713470459, + "learning_rate": 7.634138533052822e-06, + "loss": 0.2242, + "step": 13047 + }, + { + "epoch": 0.33018700812308627, + "grad_norm": 6.229996681213379, + "learning_rate": 7.633797241604317e-06, + "loss": 0.1553, + "step": 13048 + }, + { + "epoch": 0.3302123136877799, + "grad_norm": 3.0228586196899414, + "learning_rate": 7.633455933170944e-06, + "loss": 0.1809, + "step": 13049 + }, + { + "epoch": 0.33023761925247364, + "grad_norm": 28.913856506347656, + "learning_rate": 7.633114607754903e-06, + "loss": 0.2154, + "step": 13050 + }, + { + "epoch": 0.3302629248171673, + "grad_norm": 14.186614036560059, + "learning_rate": 7.632773265358397e-06, + "loss": 0.1455, + "step": 13051 + }, + { + "epoch": 0.33028823038186095, + "grad_norm": 3.7268903255462646, + "learning_rate": 7.632431905983626e-06, + "loss": 0.1747, + "step": 13052 + }, + { + "epoch": 0.33031353594655466, + "grad_norm": 8.271648406982422, + "learning_rate": 7.63209052963279e-06, + "loss": 0.3314, + "step": 13053 + }, + { + "epoch": 0.3303388415112483, + "grad_norm": 2.5012876987457275, + "learning_rate": 7.63174913630809e-06, + "loss": 0.1049, + "step": 13054 + }, + { + "epoch": 0.330364147075942, + "grad_norm": 6.090512275695801, + "learning_rate": 7.631407726011733e-06, + "loss": 0.2025, + "step": 13055 + }, + { + "epoch": 0.3303894526406357, + "grad_norm": 3.21659255027771, + "learning_rate": 7.631066298745914e-06, + "loss": 0.1819, + "step": 13056 + }, + { + "epoch": 0.33041475820532934, + "grad_norm": 4.820655822753906, + "learning_rate": 7.630724854512841e-06, + "loss": 0.1892, + "step": 13057 + }, + { + "epoch": 0.33044006377002305, + "grad_norm": 3.147367477416992, + "learning_rate": 7.630383393314712e-06, + "loss": 0.142, + "step": 13058 + }, + { + "epoch": 0.3304653693347167, + "grad_norm": 5.061694145202637, + "learning_rate": 7.63004191515373e-06, + "loss": 0.1337, + "step": 13059 + }, + { + "epoch": 0.33049067489941036, + "grad_norm": 7.2110466957092285, + "learning_rate": 7.629700420032096e-06, + "loss": 0.2127, + "step": 13060 + }, + { + "epoch": 0.3305159804641041, + "grad_norm": 7.562376976013184, + "learning_rate": 7.629358907952011e-06, + "loss": 0.1741, + "step": 13061 + }, + { + "epoch": 0.33054128602879773, + "grad_norm": 4.12213659286499, + "learning_rate": 7.629017378915683e-06, + "loss": 0.2183, + "step": 13062 + }, + { + "epoch": 0.3305665915934914, + "grad_norm": 3.7920994758605957, + "learning_rate": 7.62867583292531e-06, + "loss": 0.1811, + "step": 13063 + }, + { + "epoch": 0.3305918971581851, + "grad_norm": 5.843165874481201, + "learning_rate": 7.628334269983095e-06, + "loss": 0.1598, + "step": 13064 + }, + { + "epoch": 0.33061720272287876, + "grad_norm": 18.596900939941406, + "learning_rate": 7.627992690091241e-06, + "loss": 0.2925, + "step": 13065 + }, + { + "epoch": 0.3306425082875724, + "grad_norm": 3.8463144302368164, + "learning_rate": 7.627651093251951e-06, + "loss": 0.166, + "step": 13066 + }, + { + "epoch": 0.3306678138522661, + "grad_norm": 4.55462121963501, + "learning_rate": 7.627309479467428e-06, + "loss": 0.1476, + "step": 13067 + }, + { + "epoch": 0.3306931194169598, + "grad_norm": 5.344258785247803, + "learning_rate": 7.626967848739874e-06, + "loss": 0.2026, + "step": 13068 + }, + { + "epoch": 0.3307184249816535, + "grad_norm": 3.671799659729004, + "learning_rate": 7.626626201071494e-06, + "loss": 0.1708, + "step": 13069 + }, + { + "epoch": 0.33074373054634715, + "grad_norm": 5.513296127319336, + "learning_rate": 7.6262845364644925e-06, + "loss": 0.1448, + "step": 13070 + }, + { + "epoch": 0.3307690361110408, + "grad_norm": 4.340055465698242, + "learning_rate": 7.625942854921067e-06, + "loss": 0.1907, + "step": 13071 + }, + { + "epoch": 0.3307943416757345, + "grad_norm": 4.090754985809326, + "learning_rate": 7.625601156443426e-06, + "loss": 0.209, + "step": 13072 + }, + { + "epoch": 0.33081964724042817, + "grad_norm": 6.324199199676514, + "learning_rate": 7.6252594410337714e-06, + "loss": 0.2523, + "step": 13073 + }, + { + "epoch": 0.33084495280512183, + "grad_norm": 4.307615756988525, + "learning_rate": 7.624917708694306e-06, + "loss": 0.166, + "step": 13074 + }, + { + "epoch": 0.33087025836981554, + "grad_norm": 3.4004666805267334, + "learning_rate": 7.624575959427236e-06, + "loss": 0.0856, + "step": 13075 + }, + { + "epoch": 0.3308955639345092, + "grad_norm": 5.271688461303711, + "learning_rate": 7.624234193234763e-06, + "loss": 0.1559, + "step": 13076 + }, + { + "epoch": 0.33092086949920285, + "grad_norm": 4.038361549377441, + "learning_rate": 7.623892410119092e-06, + "loss": 0.1723, + "step": 13077 + }, + { + "epoch": 0.33094617506389656, + "grad_norm": 6.746001243591309, + "learning_rate": 7.623550610082428e-06, + "loss": 0.1725, + "step": 13078 + }, + { + "epoch": 0.3309714806285902, + "grad_norm": 4.82464075088501, + "learning_rate": 7.623208793126973e-06, + "loss": 0.2028, + "step": 13079 + }, + { + "epoch": 0.3309967861932839, + "grad_norm": 10.363554954528809, + "learning_rate": 7.622866959254932e-06, + "loss": 0.2139, + "step": 13080 + }, + { + "epoch": 0.3310220917579776, + "grad_norm": 6.673834323883057, + "learning_rate": 7.622525108468511e-06, + "loss": 0.2463, + "step": 13081 + }, + { + "epoch": 0.33104739732267124, + "grad_norm": 6.195114612579346, + "learning_rate": 7.622183240769914e-06, + "loss": 0.1992, + "step": 13082 + }, + { + "epoch": 0.33107270288736496, + "grad_norm": 8.572896957397461, + "learning_rate": 7.621841356161344e-06, + "loss": 0.2092, + "step": 13083 + }, + { + "epoch": 0.3310980084520586, + "grad_norm": 5.955389976501465, + "learning_rate": 7.621499454645005e-06, + "loss": 0.1535, + "step": 13084 + }, + { + "epoch": 0.33112331401675227, + "grad_norm": 5.49383020401001, + "learning_rate": 7.621157536223105e-06, + "loss": 0.2771, + "step": 13085 + }, + { + "epoch": 0.331148619581446, + "grad_norm": 6.925850868225098, + "learning_rate": 7.620815600897849e-06, + "loss": 0.2407, + "step": 13086 + }, + { + "epoch": 0.33117392514613964, + "grad_norm": 7.569515705108643, + "learning_rate": 7.620473648671441e-06, + "loss": 0.2425, + "step": 13087 + }, + { + "epoch": 0.3311992307108333, + "grad_norm": 5.848243236541748, + "learning_rate": 7.620131679546083e-06, + "loss": 0.2214, + "step": 13088 + }, + { + "epoch": 0.331224536275527, + "grad_norm": 3.974907875061035, + "learning_rate": 7.6197896935239865e-06, + "loss": 0.143, + "step": 13089 + }, + { + "epoch": 0.33124984184022066, + "grad_norm": 4.212835311889648, + "learning_rate": 7.619447690607351e-06, + "loss": 0.2056, + "step": 13090 + }, + { + "epoch": 0.3312751474049143, + "grad_norm": 4.972081661224365, + "learning_rate": 7.619105670798385e-06, + "loss": 0.1095, + "step": 13091 + }, + { + "epoch": 0.33130045296960803, + "grad_norm": 4.230614185333252, + "learning_rate": 7.618763634099296e-06, + "loss": 0.2131, + "step": 13092 + }, + { + "epoch": 0.3313257585343017, + "grad_norm": 5.492215633392334, + "learning_rate": 7.618421580512285e-06, + "loss": 0.1581, + "step": 13093 + }, + { + "epoch": 0.3313510640989954, + "grad_norm": 13.426286697387695, + "learning_rate": 7.618079510039562e-06, + "loss": 0.2169, + "step": 13094 + }, + { + "epoch": 0.33137636966368905, + "grad_norm": 3.3769729137420654, + "learning_rate": 7.617737422683329e-06, + "loss": 0.1879, + "step": 13095 + }, + { + "epoch": 0.3314016752283827, + "grad_norm": 2.914703130722046, + "learning_rate": 7.6173953184457946e-06, + "loss": 0.161, + "step": 13096 + }, + { + "epoch": 0.3314269807930764, + "grad_norm": 10.897500991821289, + "learning_rate": 7.617053197329167e-06, + "loss": 0.2526, + "step": 13097 + }, + { + "epoch": 0.3314522863577701, + "grad_norm": 4.166365623474121, + "learning_rate": 7.6167110593356485e-06, + "loss": 0.2184, + "step": 13098 + }, + { + "epoch": 0.33147759192246373, + "grad_norm": 4.25540828704834, + "learning_rate": 7.616368904467447e-06, + "loss": 0.0965, + "step": 13099 + }, + { + "epoch": 0.33150289748715744, + "grad_norm": 4.1143341064453125, + "learning_rate": 7.616026732726769e-06, + "loss": 0.1562, + "step": 13100 + }, + { + "epoch": 0.3315282030518511, + "grad_norm": 3.7298943996429443, + "learning_rate": 7.615684544115822e-06, + "loss": 0.1257, + "step": 13101 + }, + { + "epoch": 0.33155350861654476, + "grad_norm": 8.672476768493652, + "learning_rate": 7.615342338636811e-06, + "loss": 0.1868, + "step": 13102 + }, + { + "epoch": 0.33157881418123847, + "grad_norm": 6.254241943359375, + "learning_rate": 7.615000116291946e-06, + "loss": 0.1712, + "step": 13103 + }, + { + "epoch": 0.3316041197459321, + "grad_norm": 5.5813307762146, + "learning_rate": 7.614657877083429e-06, + "loss": 0.1597, + "step": 13104 + }, + { + "epoch": 0.3316294253106258, + "grad_norm": 5.071139335632324, + "learning_rate": 7.61431562101347e-06, + "loss": 0.2906, + "step": 13105 + }, + { + "epoch": 0.3316547308753195, + "grad_norm": 4.329270839691162, + "learning_rate": 7.613973348084278e-06, + "loss": 0.144, + "step": 13106 + }, + { + "epoch": 0.33168003644001315, + "grad_norm": 13.791906356811523, + "learning_rate": 7.613631058298056e-06, + "loss": 0.3542, + "step": 13107 + }, + { + "epoch": 0.33170534200470686, + "grad_norm": 5.603874683380127, + "learning_rate": 7.613288751657014e-06, + "loss": 0.1885, + "step": 13108 + }, + { + "epoch": 0.3317306475694005, + "grad_norm": 6.930412769317627, + "learning_rate": 7.6129464281633595e-06, + "loss": 0.2512, + "step": 13109 + }, + { + "epoch": 0.3317559531340942, + "grad_norm": 3.8259024620056152, + "learning_rate": 7.6126040878193e-06, + "loss": 0.1965, + "step": 13110 + }, + { + "epoch": 0.3317812586987879, + "grad_norm": 6.411436080932617, + "learning_rate": 7.6122617306270405e-06, + "loss": 0.1456, + "step": 13111 + }, + { + "epoch": 0.33180656426348154, + "grad_norm": 4.844552993774414, + "learning_rate": 7.611919356588793e-06, + "loss": 0.1963, + "step": 13112 + }, + { + "epoch": 0.3318318698281752, + "grad_norm": 16.4615535736084, + "learning_rate": 7.611576965706762e-06, + "loss": 0.1654, + "step": 13113 + }, + { + "epoch": 0.3318571753928689, + "grad_norm": 10.284278869628906, + "learning_rate": 7.611234557983158e-06, + "loss": 0.1954, + "step": 13114 + }, + { + "epoch": 0.33188248095756256, + "grad_norm": 8.813898086547852, + "learning_rate": 7.610892133420188e-06, + "loss": 0.268, + "step": 13115 + }, + { + "epoch": 0.3319077865222562, + "grad_norm": 5.54910945892334, + "learning_rate": 7.610549692020061e-06, + "loss": 0.1847, + "step": 13116 + }, + { + "epoch": 0.33193309208694993, + "grad_norm": 4.577683925628662, + "learning_rate": 7.610207233784983e-06, + "loss": 0.193, + "step": 13117 + }, + { + "epoch": 0.3319583976516436, + "grad_norm": 8.273907661437988, + "learning_rate": 7.609864758717165e-06, + "loss": 0.3068, + "step": 13118 + }, + { + "epoch": 0.3319837032163373, + "grad_norm": 5.563830852508545, + "learning_rate": 7.609522266818814e-06, + "loss": 0.1558, + "step": 13119 + }, + { + "epoch": 0.33200900878103096, + "grad_norm": 6.020695686340332, + "learning_rate": 7.60917975809214e-06, + "loss": 0.1762, + "step": 13120 + }, + { + "epoch": 0.3320343143457246, + "grad_norm": 4.960463047027588, + "learning_rate": 7.608837232539352e-06, + "loss": 0.2087, + "step": 13121 + }, + { + "epoch": 0.3320596199104183, + "grad_norm": 5.943263530731201, + "learning_rate": 7.608494690162657e-06, + "loss": 0.1368, + "step": 13122 + }, + { + "epoch": 0.332084925475112, + "grad_norm": 7.11954927444458, + "learning_rate": 7.608152130964265e-06, + "loss": 0.1859, + "step": 13123 + }, + { + "epoch": 0.33211023103980564, + "grad_norm": 5.8123016357421875, + "learning_rate": 7.607809554946384e-06, + "loss": 0.2107, + "step": 13124 + }, + { + "epoch": 0.33213553660449935, + "grad_norm": 13.92271900177002, + "learning_rate": 7.607466962111225e-06, + "loss": 0.254, + "step": 13125 + }, + { + "epoch": 0.332160842169193, + "grad_norm": 3.711606502532959, + "learning_rate": 7.6071243524609975e-06, + "loss": 0.164, + "step": 13126 + }, + { + "epoch": 0.33218614773388666, + "grad_norm": 3.984539747238159, + "learning_rate": 7.606781725997909e-06, + "loss": 0.1576, + "step": 13127 + }, + { + "epoch": 0.3322114532985804, + "grad_norm": 6.289523601531982, + "learning_rate": 7.606439082724171e-06, + "loss": 0.2206, + "step": 13128 + }, + { + "epoch": 0.33223675886327403, + "grad_norm": 28.56496810913086, + "learning_rate": 7.606096422641991e-06, + "loss": 0.3919, + "step": 13129 + }, + { + "epoch": 0.3322620644279677, + "grad_norm": 3.5594849586486816, + "learning_rate": 7.60575374575358e-06, + "loss": 0.0933, + "step": 13130 + }, + { + "epoch": 0.3322873699926614, + "grad_norm": 6.083924293518066, + "learning_rate": 7.605411052061148e-06, + "loss": 0.1848, + "step": 13131 + }, + { + "epoch": 0.33231267555735505, + "grad_norm": 7.548995494842529, + "learning_rate": 7.605068341566906e-06, + "loss": 0.2012, + "step": 13132 + }, + { + "epoch": 0.33233798112204876, + "grad_norm": 4.245636463165283, + "learning_rate": 7.604725614273061e-06, + "loss": 0.2281, + "step": 13133 + }, + { + "epoch": 0.3323632866867424, + "grad_norm": 3.5966479778289795, + "learning_rate": 7.604382870181825e-06, + "loss": 0.1768, + "step": 13134 + }, + { + "epoch": 0.3323885922514361, + "grad_norm": 7.029331207275391, + "learning_rate": 7.60404010929541e-06, + "loss": 0.1785, + "step": 13135 + }, + { + "epoch": 0.3324138978161298, + "grad_norm": 2.4659268856048584, + "learning_rate": 7.6036973316160245e-06, + "loss": 0.1208, + "step": 13136 + }, + { + "epoch": 0.33243920338082344, + "grad_norm": 11.653802871704102, + "learning_rate": 7.603354537145878e-06, + "loss": 0.1691, + "step": 13137 + }, + { + "epoch": 0.3324645089455171, + "grad_norm": 5.575159072875977, + "learning_rate": 7.603011725887183e-06, + "loss": 0.1804, + "step": 13138 + }, + { + "epoch": 0.3324898145102108, + "grad_norm": 13.366500854492188, + "learning_rate": 7.602668897842149e-06, + "loss": 0.2721, + "step": 13139 + }, + { + "epoch": 0.33251512007490447, + "grad_norm": 6.678605556488037, + "learning_rate": 7.602326053012988e-06, + "loss": 0.1937, + "step": 13140 + }, + { + "epoch": 0.3325404256395981, + "grad_norm": 3.891061544418335, + "learning_rate": 7.601983191401911e-06, + "loss": 0.0886, + "step": 13141 + }, + { + "epoch": 0.33256573120429184, + "grad_norm": 11.327417373657227, + "learning_rate": 7.601640313011127e-06, + "loss": 0.2981, + "step": 13142 + }, + { + "epoch": 0.3325910367689855, + "grad_norm": 9.215835571289062, + "learning_rate": 7.60129741784285e-06, + "loss": 0.2859, + "step": 13143 + }, + { + "epoch": 0.33261634233367915, + "grad_norm": 5.958109378814697, + "learning_rate": 7.600954505899289e-06, + "loss": 0.2246, + "step": 13144 + }, + { + "epoch": 0.33264164789837286, + "grad_norm": 3.773601531982422, + "learning_rate": 7.600611577182657e-06, + "loss": 0.1658, + "step": 13145 + }, + { + "epoch": 0.3326669534630665, + "grad_norm": 3.6944613456726074, + "learning_rate": 7.6002686316951635e-06, + "loss": 0.1482, + "step": 13146 + }, + { + "epoch": 0.33269225902776023, + "grad_norm": 5.903700351715088, + "learning_rate": 7.599925669439022e-06, + "loss": 0.2176, + "step": 13147 + }, + { + "epoch": 0.3327175645924539, + "grad_norm": 4.532757759094238, + "learning_rate": 7.599582690416445e-06, + "loss": 0.2137, + "step": 13148 + }, + { + "epoch": 0.33274287015714754, + "grad_norm": 8.26686954498291, + "learning_rate": 7.59923969462964e-06, + "loss": 0.2453, + "step": 13149 + }, + { + "epoch": 0.33276817572184125, + "grad_norm": 20.392568588256836, + "learning_rate": 7.598896682080823e-06, + "loss": 0.2601, + "step": 13150 + }, + { + "epoch": 0.3327934812865349, + "grad_norm": 3.0218312740325928, + "learning_rate": 7.598553652772205e-06, + "loss": 0.1327, + "step": 13151 + }, + { + "epoch": 0.33281878685122857, + "grad_norm": 3.711373805999756, + "learning_rate": 7.598210606705997e-06, + "loss": 0.1672, + "step": 13152 + }, + { + "epoch": 0.3328440924159223, + "grad_norm": 2.8965017795562744, + "learning_rate": 7.597867543884412e-06, + "loss": 0.1483, + "step": 13153 + }, + { + "epoch": 0.33286939798061593, + "grad_norm": 7.281460762023926, + "learning_rate": 7.597524464309664e-06, + "loss": 0.217, + "step": 13154 + }, + { + "epoch": 0.3328947035453096, + "grad_norm": 3.4974570274353027, + "learning_rate": 7.5971813679839615e-06, + "loss": 0.1501, + "step": 13155 + }, + { + "epoch": 0.3329200091100033, + "grad_norm": 4.279049873352051, + "learning_rate": 7.596838254909522e-06, + "loss": 0.1103, + "step": 13156 + }, + { + "epoch": 0.33294531467469696, + "grad_norm": 4.954763889312744, + "learning_rate": 7.596495125088555e-06, + "loss": 0.2089, + "step": 13157 + }, + { + "epoch": 0.33297062023939067, + "grad_norm": 3.3318216800689697, + "learning_rate": 7.596151978523272e-06, + "loss": 0.1886, + "step": 13158 + }, + { + "epoch": 0.3329959258040843, + "grad_norm": 6.014683723449707, + "learning_rate": 7.59580881521589e-06, + "loss": 0.2905, + "step": 13159 + }, + { + "epoch": 0.333021231368778, + "grad_norm": 4.786255836486816, + "learning_rate": 7.5954656351686175e-06, + "loss": 0.181, + "step": 13160 + }, + { + "epoch": 0.3330465369334717, + "grad_norm": 2.971428871154785, + "learning_rate": 7.595122438383673e-06, + "loss": 0.107, + "step": 13161 + }, + { + "epoch": 0.33307184249816535, + "grad_norm": 6.274554252624512, + "learning_rate": 7.594779224863264e-06, + "loss": 0.2401, + "step": 13162 + }, + { + "epoch": 0.333097148062859, + "grad_norm": 5.073665618896484, + "learning_rate": 7.594435994609608e-06, + "loss": 0.2277, + "step": 13163 + }, + { + "epoch": 0.3331224536275527, + "grad_norm": 2.4158036708831787, + "learning_rate": 7.594092747624916e-06, + "loss": 0.1089, + "step": 13164 + }, + { + "epoch": 0.3331477591922464, + "grad_norm": 11.791193008422852, + "learning_rate": 7.593749483911403e-06, + "loss": 0.2182, + "step": 13165 + }, + { + "epoch": 0.33317306475694003, + "grad_norm": 3.298830986022949, + "learning_rate": 7.59340620347128e-06, + "loss": 0.1476, + "step": 13166 + }, + { + "epoch": 0.33319837032163374, + "grad_norm": 5.726194381713867, + "learning_rate": 7.593062906306767e-06, + "loss": 0.2377, + "step": 13167 + }, + { + "epoch": 0.3332236758863274, + "grad_norm": 3.9442055225372314, + "learning_rate": 7.59271959242007e-06, + "loss": 0.1205, + "step": 13168 + }, + { + "epoch": 0.33324898145102105, + "grad_norm": 4.905116558074951, + "learning_rate": 7.592376261813409e-06, + "loss": 0.1997, + "step": 13169 + }, + { + "epoch": 0.33327428701571477, + "grad_norm": 3.2311594486236572, + "learning_rate": 7.592032914488994e-06, + "loss": 0.1194, + "step": 13170 + }, + { + "epoch": 0.3332995925804084, + "grad_norm": 5.949219703674316, + "learning_rate": 7.591689550449042e-06, + "loss": 0.2104, + "step": 13171 + }, + { + "epoch": 0.33332489814510213, + "grad_norm": 5.153510093688965, + "learning_rate": 7.591346169695766e-06, + "loss": 0.1569, + "step": 13172 + }, + { + "epoch": 0.3333502037097958, + "grad_norm": 3.797083616256714, + "learning_rate": 7.59100277223138e-06, + "loss": 0.2096, + "step": 13173 + }, + { + "epoch": 0.33337550927448945, + "grad_norm": 3.89151668548584, + "learning_rate": 7.590659358058099e-06, + "loss": 0.1179, + "step": 13174 + }, + { + "epoch": 0.33340081483918316, + "grad_norm": 5.218163967132568, + "learning_rate": 7.590315927178138e-06, + "loss": 0.1359, + "step": 13175 + }, + { + "epoch": 0.3334261204038768, + "grad_norm": 5.844834327697754, + "learning_rate": 7.589972479593712e-06, + "loss": 0.2124, + "step": 13176 + }, + { + "epoch": 0.33345142596857047, + "grad_norm": 6.721480846405029, + "learning_rate": 7.589629015307034e-06, + "loss": 0.1883, + "step": 13177 + }, + { + "epoch": 0.3334767315332642, + "grad_norm": 4.848079681396484, + "learning_rate": 7.5892855343203205e-06, + "loss": 0.1299, + "step": 13178 + }, + { + "epoch": 0.33350203709795784, + "grad_norm": 10.98773193359375, + "learning_rate": 7.588942036635788e-06, + "loss": 0.1857, + "step": 13179 + }, + { + "epoch": 0.3335273426626515, + "grad_norm": 6.963271617889404, + "learning_rate": 7.588598522255648e-06, + "loss": 0.2356, + "step": 13180 + }, + { + "epoch": 0.3335526482273452, + "grad_norm": 7.554630279541016, + "learning_rate": 7.588254991182118e-06, + "loss": 0.1353, + "step": 13181 + }, + { + "epoch": 0.33357795379203886, + "grad_norm": 5.098311424255371, + "learning_rate": 7.587911443417413e-06, + "loss": 0.1797, + "step": 13182 + }, + { + "epoch": 0.3336032593567325, + "grad_norm": 9.703444480895996, + "learning_rate": 7.58756787896375e-06, + "loss": 0.2467, + "step": 13183 + }, + { + "epoch": 0.33362856492142623, + "grad_norm": 8.845782279968262, + "learning_rate": 7.587224297823341e-06, + "loss": 0.2704, + "step": 13184 + }, + { + "epoch": 0.3336538704861199, + "grad_norm": 8.611519813537598, + "learning_rate": 7.586880699998404e-06, + "loss": 0.2551, + "step": 13185 + }, + { + "epoch": 0.3336791760508136, + "grad_norm": 3.569443941116333, + "learning_rate": 7.586537085491156e-06, + "loss": 0.102, + "step": 13186 + }, + { + "epoch": 0.33370448161550725, + "grad_norm": 7.222901344299316, + "learning_rate": 7.586193454303811e-06, + "loss": 0.2945, + "step": 13187 + }, + { + "epoch": 0.3337297871802009, + "grad_norm": 13.095071792602539, + "learning_rate": 7.585849806438584e-06, + "loss": 0.1887, + "step": 13188 + }, + { + "epoch": 0.3337550927448946, + "grad_norm": 6.643560886383057, + "learning_rate": 7.585506141897694e-06, + "loss": 0.1545, + "step": 13189 + }, + { + "epoch": 0.3337803983095883, + "grad_norm": 3.92816162109375, + "learning_rate": 7.585162460683355e-06, + "loss": 0.1745, + "step": 13190 + }, + { + "epoch": 0.33380570387428193, + "grad_norm": 7.638027191162109, + "learning_rate": 7.5848187627977855e-06, + "loss": 0.2494, + "step": 13191 + }, + { + "epoch": 0.33383100943897565, + "grad_norm": 4.275501251220703, + "learning_rate": 7.584475048243199e-06, + "loss": 0.1767, + "step": 13192 + }, + { + "epoch": 0.3338563150036693, + "grad_norm": 3.6375679969787598, + "learning_rate": 7.5841313170218144e-06, + "loss": 0.1557, + "step": 13193 + }, + { + "epoch": 0.33388162056836296, + "grad_norm": 2.9749531745910645, + "learning_rate": 7.583787569135847e-06, + "loss": 0.1506, + "step": 13194 + }, + { + "epoch": 0.33390692613305667, + "grad_norm": 3.1742265224456787, + "learning_rate": 7.583443804587516e-06, + "loss": 0.0698, + "step": 13195 + }, + { + "epoch": 0.3339322316977503, + "grad_norm": 3.687190055847168, + "learning_rate": 7.5831000233790354e-06, + "loss": 0.1492, + "step": 13196 + }, + { + "epoch": 0.33395753726244404, + "grad_norm": 5.53673791885376, + "learning_rate": 7.582756225512622e-06, + "loss": 0.1886, + "step": 13197 + }, + { + "epoch": 0.3339828428271377, + "grad_norm": 4.907255172729492, + "learning_rate": 7.582412410990496e-06, + "loss": 0.1984, + "step": 13198 + }, + { + "epoch": 0.33400814839183135, + "grad_norm": 12.023426055908203, + "learning_rate": 7.582068579814872e-06, + "loss": 0.2394, + "step": 13199 + }, + { + "epoch": 0.33403345395652506, + "grad_norm": 10.97154426574707, + "learning_rate": 7.581724731987968e-06, + "loss": 0.2233, + "step": 13200 + }, + { + "epoch": 0.3340587595212187, + "grad_norm": 10.536355018615723, + "learning_rate": 7.581380867512e-06, + "loss": 0.2451, + "step": 13201 + }, + { + "epoch": 0.3340840650859124, + "grad_norm": 4.219127655029297, + "learning_rate": 7.58103698638919e-06, + "loss": 0.1551, + "step": 13202 + }, + { + "epoch": 0.3341093706506061, + "grad_norm": 5.689638614654541, + "learning_rate": 7.5806930886217514e-06, + "loss": 0.1214, + "step": 13203 + }, + { + "epoch": 0.33413467621529974, + "grad_norm": 6.080116271972656, + "learning_rate": 7.580349174211903e-06, + "loss": 0.196, + "step": 13204 + }, + { + "epoch": 0.3341599817799934, + "grad_norm": 6.181838512420654, + "learning_rate": 7.580005243161861e-06, + "loss": 0.1736, + "step": 13205 + }, + { + "epoch": 0.3341852873446871, + "grad_norm": 3.9406542778015137, + "learning_rate": 7.579661295473848e-06, + "loss": 0.1933, + "step": 13206 + }, + { + "epoch": 0.33421059290938077, + "grad_norm": 5.270648002624512, + "learning_rate": 7.579317331150076e-06, + "loss": 0.2107, + "step": 13207 + }, + { + "epoch": 0.3342358984740744, + "grad_norm": 9.460755348205566, + "learning_rate": 7.5789733501927685e-06, + "loss": 0.21, + "step": 13208 + }, + { + "epoch": 0.33426120403876813, + "grad_norm": 13.316154479980469, + "learning_rate": 7.578629352604143e-06, + "loss": 0.1904, + "step": 13209 + }, + { + "epoch": 0.3342865096034618, + "grad_norm": 4.26582670211792, + "learning_rate": 7.578285338386413e-06, + "loss": 0.179, + "step": 13210 + }, + { + "epoch": 0.3343118151681555, + "grad_norm": 12.203317642211914, + "learning_rate": 7.577941307541803e-06, + "loss": 0.2713, + "step": 13211 + }, + { + "epoch": 0.33433712073284916, + "grad_norm": 7.5415849685668945, + "learning_rate": 7.577597260072528e-06, + "loss": 0.233, + "step": 13212 + }, + { + "epoch": 0.3343624262975428, + "grad_norm": 8.141093254089355, + "learning_rate": 7.577253195980807e-06, + "loss": 0.2134, + "step": 13213 + }, + { + "epoch": 0.3343877318622365, + "grad_norm": 5.911406993865967, + "learning_rate": 7.576909115268861e-06, + "loss": 0.2142, + "step": 13214 + }, + { + "epoch": 0.3344130374269302, + "grad_norm": 4.043821811676025, + "learning_rate": 7.5765650179389064e-06, + "loss": 0.134, + "step": 13215 + }, + { + "epoch": 0.33443834299162384, + "grad_norm": 5.055861473083496, + "learning_rate": 7.5762209039931644e-06, + "loss": 0.1716, + "step": 13216 + }, + { + "epoch": 0.33446364855631755, + "grad_norm": 3.6353087425231934, + "learning_rate": 7.575876773433852e-06, + "loss": 0.1661, + "step": 13217 + }, + { + "epoch": 0.3344889541210112, + "grad_norm": 7.229534149169922, + "learning_rate": 7.57553262626319e-06, + "loss": 0.0676, + "step": 13218 + }, + { + "epoch": 0.33451425968570486, + "grad_norm": 5.827766418457031, + "learning_rate": 7.575188462483396e-06, + "loss": 0.2335, + "step": 13219 + }, + { + "epoch": 0.3345395652503986, + "grad_norm": 8.78532600402832, + "learning_rate": 7.574844282096692e-06, + "loss": 0.2519, + "step": 13220 + }, + { + "epoch": 0.33456487081509223, + "grad_norm": 3.905836343765259, + "learning_rate": 7.574500085105297e-06, + "loss": 0.1432, + "step": 13221 + }, + { + "epoch": 0.33459017637978594, + "grad_norm": 5.564059734344482, + "learning_rate": 7.574155871511428e-06, + "loss": 0.1638, + "step": 13222 + }, + { + "epoch": 0.3346154819444796, + "grad_norm": 3.9339382648468018, + "learning_rate": 7.573811641317307e-06, + "loss": 0.1383, + "step": 13223 + }, + { + "epoch": 0.33464078750917325, + "grad_norm": 4.9554548263549805, + "learning_rate": 7.573467394525153e-06, + "loss": 0.1766, + "step": 13224 + }, + { + "epoch": 0.33466609307386697, + "grad_norm": 5.015204906463623, + "learning_rate": 7.573123131137188e-06, + "loss": 0.1963, + "step": 13225 + }, + { + "epoch": 0.3346913986385606, + "grad_norm": 4.712169647216797, + "learning_rate": 7.572778851155629e-06, + "loss": 0.1513, + "step": 13226 + }, + { + "epoch": 0.3347167042032543, + "grad_norm": 10.171887397766113, + "learning_rate": 7.5724345545827e-06, + "loss": 0.2467, + "step": 13227 + }, + { + "epoch": 0.334742009767948, + "grad_norm": 4.880277633666992, + "learning_rate": 7.572090241420617e-06, + "loss": 0.1549, + "step": 13228 + }, + { + "epoch": 0.33476731533264165, + "grad_norm": 3.720991373062134, + "learning_rate": 7.571745911671603e-06, + "loss": 0.1627, + "step": 13229 + }, + { + "epoch": 0.3347926208973353, + "grad_norm": 4.200597286224365, + "learning_rate": 7.571401565337877e-06, + "loss": 0.1693, + "step": 13230 + }, + { + "epoch": 0.334817926462029, + "grad_norm": 6.315365791320801, + "learning_rate": 7.571057202421663e-06, + "loss": 0.2573, + "step": 13231 + }, + { + "epoch": 0.33484323202672267, + "grad_norm": 4.914342880249023, + "learning_rate": 7.570712822925177e-06, + "loss": 0.1769, + "step": 13232 + }, + { + "epoch": 0.3348685375914163, + "grad_norm": 6.753527641296387, + "learning_rate": 7.570368426850644e-06, + "loss": 0.2156, + "step": 13233 + }, + { + "epoch": 0.33489384315611004, + "grad_norm": 6.515764236450195, + "learning_rate": 7.570024014200281e-06, + "loss": 0.2297, + "step": 13234 + }, + { + "epoch": 0.3349191487208037, + "grad_norm": 10.518945693969727, + "learning_rate": 7.569679584976314e-06, + "loss": 0.3282, + "step": 13235 + }, + { + "epoch": 0.3349444542854974, + "grad_norm": 4.71016788482666, + "learning_rate": 7.56933513918096e-06, + "loss": 0.1717, + "step": 13236 + }, + { + "epoch": 0.33496975985019106, + "grad_norm": 10.993781089782715, + "learning_rate": 7.568990676816441e-06, + "loss": 0.1377, + "step": 13237 + }, + { + "epoch": 0.3349950654148847, + "grad_norm": 5.272708892822266, + "learning_rate": 7.568646197884981e-06, + "loss": 0.1173, + "step": 13238 + }, + { + "epoch": 0.33502037097957843, + "grad_norm": 6.646285057067871, + "learning_rate": 7.568301702388797e-06, + "loss": 0.159, + "step": 13239 + }, + { + "epoch": 0.3350456765442721, + "grad_norm": 5.161362648010254, + "learning_rate": 7.567957190330115e-06, + "loss": 0.1708, + "step": 13240 + }, + { + "epoch": 0.33507098210896574, + "grad_norm": 13.978615760803223, + "learning_rate": 7.567612661711153e-06, + "loss": 0.27, + "step": 13241 + }, + { + "epoch": 0.33509628767365945, + "grad_norm": 20.313697814941406, + "learning_rate": 7.567268116534137e-06, + "loss": 0.3011, + "step": 13242 + }, + { + "epoch": 0.3351215932383531, + "grad_norm": 2.8902480602264404, + "learning_rate": 7.566923554801283e-06, + "loss": 0.1723, + "step": 13243 + }, + { + "epoch": 0.33514689880304677, + "grad_norm": 3.8914072513580322, + "learning_rate": 7.566578976514819e-06, + "loss": 0.1249, + "step": 13244 + }, + { + "epoch": 0.3351722043677405, + "grad_norm": 10.609570503234863, + "learning_rate": 7.566234381676964e-06, + "loss": 0.2046, + "step": 13245 + }, + { + "epoch": 0.33519750993243413, + "grad_norm": 6.322993755340576, + "learning_rate": 7.56588977028994e-06, + "loss": 0.2037, + "step": 13246 + }, + { + "epoch": 0.3352228154971278, + "grad_norm": 9.744775772094727, + "learning_rate": 7.565545142355971e-06, + "loss": 0.2038, + "step": 13247 + }, + { + "epoch": 0.3352481210618215, + "grad_norm": 6.4377288818359375, + "learning_rate": 7.565200497877278e-06, + "loss": 0.2414, + "step": 13248 + }, + { + "epoch": 0.33527342662651516, + "grad_norm": 6.126889228820801, + "learning_rate": 7.5648558368560865e-06, + "loss": 0.187, + "step": 13249 + }, + { + "epoch": 0.33529873219120887, + "grad_norm": 11.535117149353027, + "learning_rate": 7.564511159294614e-06, + "loss": 0.1784, + "step": 13250 + }, + { + "epoch": 0.3353240377559025, + "grad_norm": 9.700878143310547, + "learning_rate": 7.564166465195087e-06, + "loss": 0.1695, + "step": 13251 + }, + { + "epoch": 0.3353493433205962, + "grad_norm": 4.007117748260498, + "learning_rate": 7.563821754559728e-06, + "loss": 0.2079, + "step": 13252 + }, + { + "epoch": 0.3353746488852899, + "grad_norm": 6.427014350891113, + "learning_rate": 7.563477027390759e-06, + "loss": 0.1483, + "step": 13253 + }, + { + "epoch": 0.33539995444998355, + "grad_norm": 7.613846302032471, + "learning_rate": 7.563132283690402e-06, + "loss": 0.3597, + "step": 13254 + }, + { + "epoch": 0.3354252600146772, + "grad_norm": 4.502518177032471, + "learning_rate": 7.562787523460884e-06, + "loss": 0.1611, + "step": 13255 + }, + { + "epoch": 0.3354505655793709, + "grad_norm": 6.454935550689697, + "learning_rate": 7.5624427467044244e-06, + "loss": 0.1966, + "step": 13256 + }, + { + "epoch": 0.3354758711440646, + "grad_norm": 5.42455530166626, + "learning_rate": 7.5620979534232485e-06, + "loss": 0.1591, + "step": 13257 + }, + { + "epoch": 0.33550117670875823, + "grad_norm": 3.403679847717285, + "learning_rate": 7.561753143619581e-06, + "loss": 0.186, + "step": 13258 + }, + { + "epoch": 0.33552648227345194, + "grad_norm": 3.7445802688598633, + "learning_rate": 7.561408317295642e-06, + "loss": 0.1669, + "step": 13259 + }, + { + "epoch": 0.3355517878381456, + "grad_norm": 10.49937629699707, + "learning_rate": 7.561063474453658e-06, + "loss": 0.2349, + "step": 13260 + }, + { + "epoch": 0.3355770934028393, + "grad_norm": 4.318219184875488, + "learning_rate": 7.5607186150958525e-06, + "loss": 0.1592, + "step": 13261 + }, + { + "epoch": 0.33560239896753297, + "grad_norm": 6.668302059173584, + "learning_rate": 7.5603737392244495e-06, + "loss": 0.1338, + "step": 13262 + }, + { + "epoch": 0.3356277045322266, + "grad_norm": 8.06200885772705, + "learning_rate": 7.560028846841671e-06, + "loss": 0.1487, + "step": 13263 + }, + { + "epoch": 0.33565301009692033, + "grad_norm": 11.616703987121582, + "learning_rate": 7.559683937949744e-06, + "loss": 0.258, + "step": 13264 + }, + { + "epoch": 0.335678315661614, + "grad_norm": 6.545408725738525, + "learning_rate": 7.55933901255089e-06, + "loss": 0.2184, + "step": 13265 + }, + { + "epoch": 0.33570362122630765, + "grad_norm": 8.436500549316406, + "learning_rate": 7.5589940706473355e-06, + "loss": 0.2143, + "step": 13266 + }, + { + "epoch": 0.33572892679100136, + "grad_norm": 4.252138137817383, + "learning_rate": 7.5586491122413055e-06, + "loss": 0.1612, + "step": 13267 + }, + { + "epoch": 0.335754232355695, + "grad_norm": 3.9127397537231445, + "learning_rate": 7.55830413733502e-06, + "loss": 0.162, + "step": 13268 + }, + { + "epoch": 0.33577953792038867, + "grad_norm": 5.244645118713379, + "learning_rate": 7.55795914593071e-06, + "loss": 0.1867, + "step": 13269 + }, + { + "epoch": 0.3358048434850824, + "grad_norm": 3.2655739784240723, + "learning_rate": 7.557614138030595e-06, + "loss": 0.1907, + "step": 13270 + }, + { + "epoch": 0.33583014904977604, + "grad_norm": 5.405508518218994, + "learning_rate": 7.557269113636905e-06, + "loss": 0.1251, + "step": 13271 + }, + { + "epoch": 0.3358554546144697, + "grad_norm": 4.071292877197266, + "learning_rate": 7.55692407275186e-06, + "loss": 0.1336, + "step": 13272 + }, + { + "epoch": 0.3358807601791634, + "grad_norm": 7.3240556716918945, + "learning_rate": 7.556579015377688e-06, + "loss": 0.2419, + "step": 13273 + }, + { + "epoch": 0.33590606574385706, + "grad_norm": 7.455846786499023, + "learning_rate": 7.556233941516613e-06, + "loss": 0.2006, + "step": 13274 + }, + { + "epoch": 0.3359313713085508, + "grad_norm": 4.383024215698242, + "learning_rate": 7.555888851170863e-06, + "loss": 0.17, + "step": 13275 + }, + { + "epoch": 0.33595667687324443, + "grad_norm": 2.81872296333313, + "learning_rate": 7.555543744342658e-06, + "loss": 0.1363, + "step": 13276 + }, + { + "epoch": 0.3359819824379381, + "grad_norm": 4.518769264221191, + "learning_rate": 7.555198621034228e-06, + "loss": 0.1356, + "step": 13277 + }, + { + "epoch": 0.3360072880026318, + "grad_norm": 5.796823024749756, + "learning_rate": 7.554853481247798e-06, + "loss": 0.1712, + "step": 13278 + }, + { + "epoch": 0.33603259356732545, + "grad_norm": 11.896479606628418, + "learning_rate": 7.554508324985592e-06, + "loss": 0.2493, + "step": 13279 + }, + { + "epoch": 0.3360578991320191, + "grad_norm": 4.572514057159424, + "learning_rate": 7.554163152249838e-06, + "loss": 0.2162, + "step": 13280 + }, + { + "epoch": 0.3360832046967128, + "grad_norm": 4.317484378814697, + "learning_rate": 7.553817963042761e-06, + "loss": 0.1745, + "step": 13281 + }, + { + "epoch": 0.3361085102614065, + "grad_norm": 4.165070533752441, + "learning_rate": 7.5534727573665865e-06, + "loss": 0.1322, + "step": 13282 + }, + { + "epoch": 0.33613381582610014, + "grad_norm": 6.544311046600342, + "learning_rate": 7.5531275352235415e-06, + "loss": 0.2638, + "step": 13283 + }, + { + "epoch": 0.33615912139079385, + "grad_norm": 5.200934886932373, + "learning_rate": 7.552782296615852e-06, + "loss": 0.2059, + "step": 13284 + }, + { + "epoch": 0.3361844269554875, + "grad_norm": 15.288359642028809, + "learning_rate": 7.552437041545744e-06, + "loss": 0.1813, + "step": 13285 + }, + { + "epoch": 0.3362097325201812, + "grad_norm": 7.009294509887695, + "learning_rate": 7.552091770015445e-06, + "loss": 0.2375, + "step": 13286 + }, + { + "epoch": 0.33623503808487487, + "grad_norm": 4.712545871734619, + "learning_rate": 7.55174648202718e-06, + "loss": 0.1381, + "step": 13287 + }, + { + "epoch": 0.3362603436495685, + "grad_norm": 5.764260292053223, + "learning_rate": 7.551401177583176e-06, + "loss": 0.1959, + "step": 13288 + }, + { + "epoch": 0.33628564921426224, + "grad_norm": 4.7285542488098145, + "learning_rate": 7.5510558566856625e-06, + "loss": 0.192, + "step": 13289 + }, + { + "epoch": 0.3363109547789559, + "grad_norm": 9.269702911376953, + "learning_rate": 7.550710519336863e-06, + "loss": 0.1767, + "step": 13290 + }, + { + "epoch": 0.33633626034364955, + "grad_norm": 4.762983798980713, + "learning_rate": 7.550365165539006e-06, + "loss": 0.1616, + "step": 13291 + }, + { + "epoch": 0.33636156590834326, + "grad_norm": 6.7957892417907715, + "learning_rate": 7.550019795294319e-06, + "loss": 0.1953, + "step": 13292 + }, + { + "epoch": 0.3363868714730369, + "grad_norm": 3.336327314376831, + "learning_rate": 7.549674408605027e-06, + "loss": 0.1466, + "step": 13293 + }, + { + "epoch": 0.3364121770377306, + "grad_norm": 3.855917453765869, + "learning_rate": 7.549329005473362e-06, + "loss": 0.1835, + "step": 13294 + }, + { + "epoch": 0.3364374826024243, + "grad_norm": 5.5377397537231445, + "learning_rate": 7.5489835859015456e-06, + "loss": 0.225, + "step": 13295 + }, + { + "epoch": 0.33646278816711794, + "grad_norm": 3.878026008605957, + "learning_rate": 7.548638149891809e-06, + "loss": 0.1552, + "step": 13296 + }, + { + "epoch": 0.3364880937318116, + "grad_norm": 11.247760772705078, + "learning_rate": 7.54829269744638e-06, + "loss": 0.2064, + "step": 13297 + }, + { + "epoch": 0.3365133992965053, + "grad_norm": 4.190451145172119, + "learning_rate": 7.547947228567485e-06, + "loss": 0.1886, + "step": 13298 + }, + { + "epoch": 0.33653870486119897, + "grad_norm": 3.1738767623901367, + "learning_rate": 7.547601743257351e-06, + "loss": 0.1275, + "step": 13299 + }, + { + "epoch": 0.3365640104258927, + "grad_norm": 10.81003475189209, + "learning_rate": 7.5472562415182085e-06, + "loss": 0.169, + "step": 13300 + }, + { + "epoch": 0.33658931599058634, + "grad_norm": 9.508584022521973, + "learning_rate": 7.546910723352283e-06, + "loss": 0.2402, + "step": 13301 + }, + { + "epoch": 0.33661462155528, + "grad_norm": 4.95872163772583, + "learning_rate": 7.546565188761806e-06, + "loss": 0.2147, + "step": 13302 + }, + { + "epoch": 0.3366399271199737, + "grad_norm": 2.880197286605835, + "learning_rate": 7.546219637749002e-06, + "loss": 0.1331, + "step": 13303 + }, + { + "epoch": 0.33666523268466736, + "grad_norm": 2.7161362171173096, + "learning_rate": 7.545874070316104e-06, + "loss": 0.1327, + "step": 13304 + }, + { + "epoch": 0.336690538249361, + "grad_norm": 4.741302490234375, + "learning_rate": 7.545528486465335e-06, + "loss": 0.181, + "step": 13305 + }, + { + "epoch": 0.3367158438140547, + "grad_norm": 4.1304707527160645, + "learning_rate": 7.5451828861989275e-06, + "loss": 0.199, + "step": 13306 + }, + { + "epoch": 0.3367411493787484, + "grad_norm": 7.790441989898682, + "learning_rate": 7.544837269519107e-06, + "loss": 0.2368, + "step": 13307 + }, + { + "epoch": 0.33676645494344204, + "grad_norm": 5.068296432495117, + "learning_rate": 7.544491636428107e-06, + "loss": 0.163, + "step": 13308 + }, + { + "epoch": 0.33679176050813575, + "grad_norm": 4.832935333251953, + "learning_rate": 7.544145986928152e-06, + "loss": 0.102, + "step": 13309 + }, + { + "epoch": 0.3368170660728294, + "grad_norm": 7.943174362182617, + "learning_rate": 7.543800321021473e-06, + "loss": 0.2641, + "step": 13310 + }, + { + "epoch": 0.33684237163752306, + "grad_norm": 9.060044288635254, + "learning_rate": 7.543454638710299e-06, + "loss": 0.2986, + "step": 13311 + }, + { + "epoch": 0.3368676772022168, + "grad_norm": 4.473021507263184, + "learning_rate": 7.54310893999686e-06, + "loss": 0.1839, + "step": 13312 + }, + { + "epoch": 0.33689298276691043, + "grad_norm": 4.387174606323242, + "learning_rate": 7.542763224883384e-06, + "loss": 0.1559, + "step": 13313 + }, + { + "epoch": 0.33691828833160414, + "grad_norm": 5.7582268714904785, + "learning_rate": 7.5424174933721005e-06, + "loss": 0.1423, + "step": 13314 + }, + { + "epoch": 0.3369435938962978, + "grad_norm": 7.073176860809326, + "learning_rate": 7.54207174546524e-06, + "loss": 0.1999, + "step": 13315 + }, + { + "epoch": 0.33696889946099146, + "grad_norm": 4.201976299285889, + "learning_rate": 7.541725981165031e-06, + "loss": 0.1825, + "step": 13316 + }, + { + "epoch": 0.33699420502568517, + "grad_norm": 4.146292209625244, + "learning_rate": 7.5413802004737045e-06, + "loss": 0.173, + "step": 13317 + }, + { + "epoch": 0.3370195105903788, + "grad_norm": 13.61258316040039, + "learning_rate": 7.54103440339349e-06, + "loss": 0.1887, + "step": 13318 + }, + { + "epoch": 0.3370448161550725, + "grad_norm": 5.127865791320801, + "learning_rate": 7.540688589926617e-06, + "loss": 0.154, + "step": 13319 + }, + { + "epoch": 0.3370701217197662, + "grad_norm": 4.503152370452881, + "learning_rate": 7.540342760075316e-06, + "loss": 0.1726, + "step": 13320 + }, + { + "epoch": 0.33709542728445985, + "grad_norm": 14.643987655639648, + "learning_rate": 7.539996913841816e-06, + "loss": 0.2178, + "step": 13321 + }, + { + "epoch": 0.3371207328491535, + "grad_norm": 3.1048529148101807, + "learning_rate": 7.539651051228352e-06, + "loss": 0.1922, + "step": 13322 + }, + { + "epoch": 0.3371460384138472, + "grad_norm": 7.294281482696533, + "learning_rate": 7.539305172237148e-06, + "loss": 0.175, + "step": 13323 + }, + { + "epoch": 0.33717134397854087, + "grad_norm": 5.243082523345947, + "learning_rate": 7.538959276870437e-06, + "loss": 0.1548, + "step": 13324 + }, + { + "epoch": 0.3371966495432346, + "grad_norm": 6.932604789733887, + "learning_rate": 7.538613365130451e-06, + "loss": 0.1661, + "step": 13325 + }, + { + "epoch": 0.33722195510792824, + "grad_norm": 9.986969947814941, + "learning_rate": 7.53826743701942e-06, + "loss": 0.2734, + "step": 13326 + }, + { + "epoch": 0.3372472606726219, + "grad_norm": 7.338990211486816, + "learning_rate": 7.537921492539573e-06, + "loss": 0.1391, + "step": 13327 + }, + { + "epoch": 0.3372725662373156, + "grad_norm": 13.714395523071289, + "learning_rate": 7.537575531693142e-06, + "loss": 0.1752, + "step": 13328 + }, + { + "epoch": 0.33729787180200926, + "grad_norm": 4.13303279876709, + "learning_rate": 7.537229554482359e-06, + "loss": 0.1711, + "step": 13329 + }, + { + "epoch": 0.3373231773667029, + "grad_norm": 4.2650146484375, + "learning_rate": 7.5368835609094545e-06, + "loss": 0.1709, + "step": 13330 + }, + { + "epoch": 0.33734848293139663, + "grad_norm": 6.745903015136719, + "learning_rate": 7.536537550976661e-06, + "loss": 0.1578, + "step": 13331 + }, + { + "epoch": 0.3373737884960903, + "grad_norm": 2.775615930557251, + "learning_rate": 7.536191524686207e-06, + "loss": 0.1399, + "step": 13332 + }, + { + "epoch": 0.33739909406078394, + "grad_norm": 3.2255048751831055, + "learning_rate": 7.535845482040327e-06, + "loss": 0.1864, + "step": 13333 + }, + { + "epoch": 0.33742439962547766, + "grad_norm": 9.206546783447266, + "learning_rate": 7.535499423041251e-06, + "loss": 0.1907, + "step": 13334 + }, + { + "epoch": 0.3374497051901713, + "grad_norm": 8.37462043762207, + "learning_rate": 7.535153347691211e-06, + "loss": 0.1884, + "step": 13335 + }, + { + "epoch": 0.33747501075486497, + "grad_norm": 3.8940908908843994, + "learning_rate": 7.534807255992438e-06, + "loss": 0.1524, + "step": 13336 + }, + { + "epoch": 0.3375003163195587, + "grad_norm": 5.045400619506836, + "learning_rate": 7.534461147947165e-06, + "loss": 0.1921, + "step": 13337 + }, + { + "epoch": 0.33752562188425234, + "grad_norm": 4.460097789764404, + "learning_rate": 7.5341150235576225e-06, + "loss": 0.1511, + "step": 13338 + }, + { + "epoch": 0.33755092744894605, + "grad_norm": 5.522242069244385, + "learning_rate": 7.5337688828260445e-06, + "loss": 0.1804, + "step": 13339 + }, + { + "epoch": 0.3375762330136397, + "grad_norm": 5.606051445007324, + "learning_rate": 7.533422725754662e-06, + "loss": 0.2224, + "step": 13340 + }, + { + "epoch": 0.33760153857833336, + "grad_norm": 2.8288705348968506, + "learning_rate": 7.533076552345708e-06, + "loss": 0.1373, + "step": 13341 + }, + { + "epoch": 0.33762684414302707, + "grad_norm": 4.745129108428955, + "learning_rate": 7.5327303626014134e-06, + "loss": 0.1609, + "step": 13342 + }, + { + "epoch": 0.33765214970772073, + "grad_norm": 3.667386770248413, + "learning_rate": 7.532384156524013e-06, + "loss": 0.1408, + "step": 13343 + }, + { + "epoch": 0.3376774552724144, + "grad_norm": 4.875473976135254, + "learning_rate": 7.5320379341157375e-06, + "loss": 0.1886, + "step": 13344 + }, + { + "epoch": 0.3377027608371081, + "grad_norm": 6.45291805267334, + "learning_rate": 7.53169169537882e-06, + "loss": 0.1982, + "step": 13345 + }, + { + "epoch": 0.33772806640180175, + "grad_norm": 7.8385186195373535, + "learning_rate": 7.531345440315496e-06, + "loss": 0.2425, + "step": 13346 + }, + { + "epoch": 0.3377533719664954, + "grad_norm": 4.825141429901123, + "learning_rate": 7.530999168927994e-06, + "loss": 0.1519, + "step": 13347 + }, + { + "epoch": 0.3377786775311891, + "grad_norm": 5.609509468078613, + "learning_rate": 7.53065288121855e-06, + "loss": 0.2121, + "step": 13348 + }, + { + "epoch": 0.3378039830958828, + "grad_norm": 18.087697982788086, + "learning_rate": 7.530306577189394e-06, + "loss": 0.2892, + "step": 13349 + }, + { + "epoch": 0.3378292886605765, + "grad_norm": 5.5184645652771, + "learning_rate": 7.529960256842765e-06, + "loss": 0.1464, + "step": 13350 + }, + { + "epoch": 0.33785459422527014, + "grad_norm": 5.849862098693848, + "learning_rate": 7.529613920180892e-06, + "loss": 0.1779, + "step": 13351 + }, + { + "epoch": 0.3378798997899638, + "grad_norm": 4.885343074798584, + "learning_rate": 7.529267567206009e-06, + "loss": 0.2079, + "step": 13352 + }, + { + "epoch": 0.3379052053546575, + "grad_norm": 3.7981374263763428, + "learning_rate": 7.528921197920349e-06, + "loss": 0.1544, + "step": 13353 + }, + { + "epoch": 0.33793051091935117, + "grad_norm": 8.412837982177734, + "learning_rate": 7.528574812326148e-06, + "loss": 0.2081, + "step": 13354 + }, + { + "epoch": 0.3379558164840448, + "grad_norm": 3.4531984329223633, + "learning_rate": 7.528228410425638e-06, + "loss": 0.1471, + "step": 13355 + }, + { + "epoch": 0.33798112204873854, + "grad_norm": 4.521352767944336, + "learning_rate": 7.527881992221053e-06, + "loss": 0.1655, + "step": 13356 + }, + { + "epoch": 0.3380064276134322, + "grad_norm": 2.949613332748413, + "learning_rate": 7.527535557714627e-06, + "loss": 0.1172, + "step": 13357 + }, + { + "epoch": 0.33803173317812585, + "grad_norm": 7.232264518737793, + "learning_rate": 7.527189106908594e-06, + "loss": 0.1955, + "step": 13358 + }, + { + "epoch": 0.33805703874281956, + "grad_norm": 4.1807684898376465, + "learning_rate": 7.526842639805189e-06, + "loss": 0.1314, + "step": 13359 + }, + { + "epoch": 0.3380823443075132, + "grad_norm": 4.116983413696289, + "learning_rate": 7.526496156406646e-06, + "loss": 0.1273, + "step": 13360 + }, + { + "epoch": 0.3381076498722069, + "grad_norm": 3.000542402267456, + "learning_rate": 7.5261496567152e-06, + "loss": 0.0975, + "step": 13361 + }, + { + "epoch": 0.3381329554369006, + "grad_norm": 6.820765972137451, + "learning_rate": 7.525803140733083e-06, + "loss": 0.1944, + "step": 13362 + }, + { + "epoch": 0.33815826100159424, + "grad_norm": 3.459205389022827, + "learning_rate": 7.525456608462534e-06, + "loss": 0.1433, + "step": 13363 + }, + { + "epoch": 0.33818356656628795, + "grad_norm": 9.8279390335083, + "learning_rate": 7.525110059905783e-06, + "loss": 0.1916, + "step": 13364 + }, + { + "epoch": 0.3382088721309816, + "grad_norm": 4.887001991271973, + "learning_rate": 7.524763495065066e-06, + "loss": 0.1731, + "step": 13365 + }, + { + "epoch": 0.33823417769567526, + "grad_norm": 6.855721950531006, + "learning_rate": 7.524416913942622e-06, + "loss": 0.3256, + "step": 13366 + }, + { + "epoch": 0.338259483260369, + "grad_norm": 3.8947458267211914, + "learning_rate": 7.524070316540681e-06, + "loss": 0.1411, + "step": 13367 + }, + { + "epoch": 0.33828478882506263, + "grad_norm": 4.577896595001221, + "learning_rate": 7.523723702861481e-06, + "loss": 0.2274, + "step": 13368 + }, + { + "epoch": 0.3383100943897563, + "grad_norm": 9.01406192779541, + "learning_rate": 7.523377072907254e-06, + "loss": 0.1637, + "step": 13369 + }, + { + "epoch": 0.33833539995445, + "grad_norm": 19.656597137451172, + "learning_rate": 7.52303042668024e-06, + "loss": 0.2403, + "step": 13370 + }, + { + "epoch": 0.33836070551914366, + "grad_norm": 7.289614677429199, + "learning_rate": 7.52268376418267e-06, + "loss": 0.2389, + "step": 13371 + }, + { + "epoch": 0.3383860110838373, + "grad_norm": 8.907171249389648, + "learning_rate": 7.522337085416784e-06, + "loss": 0.1798, + "step": 13372 + }, + { + "epoch": 0.338411316648531, + "grad_norm": 17.444135665893555, + "learning_rate": 7.521990390384815e-06, + "loss": 0.1664, + "step": 13373 + }, + { + "epoch": 0.3384366222132247, + "grad_norm": 27.67461395263672, + "learning_rate": 7.521643679088997e-06, + "loss": 0.2616, + "step": 13374 + }, + { + "epoch": 0.33846192777791834, + "grad_norm": 6.473365306854248, + "learning_rate": 7.521296951531569e-06, + "loss": 0.1651, + "step": 13375 + }, + { + "epoch": 0.33848723334261205, + "grad_norm": 5.28648042678833, + "learning_rate": 7.520950207714765e-06, + "loss": 0.282, + "step": 13376 + }, + { + "epoch": 0.3385125389073057, + "grad_norm": 3.614351987838745, + "learning_rate": 7.520603447640825e-06, + "loss": 0.2287, + "step": 13377 + }, + { + "epoch": 0.3385378444719994, + "grad_norm": 3.520953416824341, + "learning_rate": 7.520256671311981e-06, + "loss": 0.1826, + "step": 13378 + }, + { + "epoch": 0.3385631500366931, + "grad_norm": 12.658754348754883, + "learning_rate": 7.51990987873047e-06, + "loss": 0.225, + "step": 13379 + }, + { + "epoch": 0.33858845560138673, + "grad_norm": 5.576518535614014, + "learning_rate": 7.519563069898528e-06, + "loss": 0.2574, + "step": 13380 + }, + { + "epoch": 0.33861376116608044, + "grad_norm": 2.8928511142730713, + "learning_rate": 7.519216244818395e-06, + "loss": 0.1049, + "step": 13381 + }, + { + "epoch": 0.3386390667307741, + "grad_norm": 8.52193832397461, + "learning_rate": 7.5188694034923025e-06, + "loss": 0.2382, + "step": 13382 + }, + { + "epoch": 0.33866437229546775, + "grad_norm": 5.0953168869018555, + "learning_rate": 7.518522545922491e-06, + "loss": 0.1515, + "step": 13383 + }, + { + "epoch": 0.33868967786016146, + "grad_norm": 4.972285747528076, + "learning_rate": 7.518175672111196e-06, + "loss": 0.2277, + "step": 13384 + }, + { + "epoch": 0.3387149834248551, + "grad_norm": 3.6107497215270996, + "learning_rate": 7.517828782060654e-06, + "loss": 0.1466, + "step": 13385 + }, + { + "epoch": 0.3387402889895488, + "grad_norm": 6.358749866485596, + "learning_rate": 7.517481875773104e-06, + "loss": 0.2773, + "step": 13386 + }, + { + "epoch": 0.3387655945542425, + "grad_norm": 4.866247177124023, + "learning_rate": 7.51713495325078e-06, + "loss": 0.1354, + "step": 13387 + }, + { + "epoch": 0.33879090011893614, + "grad_norm": 3.48099684715271, + "learning_rate": 7.516788014495921e-06, + "loss": 0.1613, + "step": 13388 + }, + { + "epoch": 0.33881620568362986, + "grad_norm": 9.948617935180664, + "learning_rate": 7.516441059510765e-06, + "loss": 0.296, + "step": 13389 + }, + { + "epoch": 0.3388415112483235, + "grad_norm": 5.358748435974121, + "learning_rate": 7.51609408829755e-06, + "loss": 0.1766, + "step": 13390 + }, + { + "epoch": 0.33886681681301717, + "grad_norm": 11.536456108093262, + "learning_rate": 7.5157471008585095e-06, + "loss": 0.3389, + "step": 13391 + }, + { + "epoch": 0.3388921223777109, + "grad_norm": 11.925089836120605, + "learning_rate": 7.5154000971958864e-06, + "loss": 0.2694, + "step": 13392 + }, + { + "epoch": 0.33891742794240454, + "grad_norm": 4.163150310516357, + "learning_rate": 7.515053077311913e-06, + "loss": 0.1559, + "step": 13393 + }, + { + "epoch": 0.3389427335070982, + "grad_norm": 2.5449774265289307, + "learning_rate": 7.514706041208832e-06, + "loss": 0.1217, + "step": 13394 + }, + { + "epoch": 0.3389680390717919, + "grad_norm": 3.1841859817504883, + "learning_rate": 7.514358988888879e-06, + "loss": 0.1993, + "step": 13395 + }, + { + "epoch": 0.33899334463648556, + "grad_norm": 4.049499034881592, + "learning_rate": 7.514011920354293e-06, + "loss": 0.2095, + "step": 13396 + }, + { + "epoch": 0.3390186502011792, + "grad_norm": 6.5527729988098145, + "learning_rate": 7.513664835607313e-06, + "loss": 0.1406, + "step": 13397 + }, + { + "epoch": 0.33904395576587293, + "grad_norm": 13.627796173095703, + "learning_rate": 7.513317734650174e-06, + "loss": 0.1579, + "step": 13398 + }, + { + "epoch": 0.3390692613305666, + "grad_norm": 5.757753849029541, + "learning_rate": 7.512970617485117e-06, + "loss": 0.2089, + "step": 13399 + }, + { + "epoch": 0.33909456689526024, + "grad_norm": 4.071067810058594, + "learning_rate": 7.51262348411438e-06, + "loss": 0.1253, + "step": 13400 + }, + { + "epoch": 0.33911987245995395, + "grad_norm": 8.116642951965332, + "learning_rate": 7.512276334540202e-06, + "loss": 0.1912, + "step": 13401 + }, + { + "epoch": 0.3391451780246476, + "grad_norm": 16.574443817138672, + "learning_rate": 7.51192916876482e-06, + "loss": 0.3122, + "step": 13402 + }, + { + "epoch": 0.3391704835893413, + "grad_norm": 10.988641738891602, + "learning_rate": 7.511581986790475e-06, + "loss": 0.3187, + "step": 13403 + }, + { + "epoch": 0.339195789154035, + "grad_norm": 3.957008123397827, + "learning_rate": 7.5112347886194044e-06, + "loss": 0.1948, + "step": 13404 + }, + { + "epoch": 0.33922109471872863, + "grad_norm": 2.617839813232422, + "learning_rate": 7.5108875742538486e-06, + "loss": 0.0924, + "step": 13405 + }, + { + "epoch": 0.33924640028342234, + "grad_norm": 3.6726291179656982, + "learning_rate": 7.510540343696047e-06, + "loss": 0.1487, + "step": 13406 + }, + { + "epoch": 0.339271705848116, + "grad_norm": 8.611543655395508, + "learning_rate": 7.510193096948235e-06, + "loss": 0.3274, + "step": 13407 + }, + { + "epoch": 0.33929701141280966, + "grad_norm": 5.147753715515137, + "learning_rate": 7.509845834012657e-06, + "loss": 0.1554, + "step": 13408 + }, + { + "epoch": 0.33932231697750337, + "grad_norm": 7.241278648376465, + "learning_rate": 7.509498554891547e-06, + "loss": 0.3121, + "step": 13409 + }, + { + "epoch": 0.339347622542197, + "grad_norm": 5.1904988288879395, + "learning_rate": 7.509151259587151e-06, + "loss": 0.1832, + "step": 13410 + }, + { + "epoch": 0.3393729281068907, + "grad_norm": 11.261208534240723, + "learning_rate": 7.508803948101704e-06, + "loss": 0.2673, + "step": 13411 + }, + { + "epoch": 0.3393982336715844, + "grad_norm": 15.939671516418457, + "learning_rate": 7.508456620437447e-06, + "loss": 0.1521, + "step": 13412 + }, + { + "epoch": 0.33942353923627805, + "grad_norm": 2.0819664001464844, + "learning_rate": 7.50810927659662e-06, + "loss": 0.115, + "step": 13413 + }, + { + "epoch": 0.33944884480097176, + "grad_norm": 3.3158299922943115, + "learning_rate": 7.507761916581462e-06, + "loss": 0.1309, + "step": 13414 + }, + { + "epoch": 0.3394741503656654, + "grad_norm": 8.15933895111084, + "learning_rate": 7.507414540394215e-06, + "loss": 0.2294, + "step": 13415 + }, + { + "epoch": 0.3394994559303591, + "grad_norm": 7.454897880554199, + "learning_rate": 7.507067148037117e-06, + "loss": 0.1709, + "step": 13416 + }, + { + "epoch": 0.3395247614950528, + "grad_norm": 3.1538360118865967, + "learning_rate": 7.50671973951241e-06, + "loss": 0.1967, + "step": 13417 + }, + { + "epoch": 0.33955006705974644, + "grad_norm": 5.186992168426514, + "learning_rate": 7.5063723148223334e-06, + "loss": 0.2218, + "step": 13418 + }, + { + "epoch": 0.3395753726244401, + "grad_norm": 9.476861000061035, + "learning_rate": 7.506024873969129e-06, + "loss": 0.2043, + "step": 13419 + }, + { + "epoch": 0.3396006781891338, + "grad_norm": 9.24765682220459, + "learning_rate": 7.505677416955035e-06, + "loss": 0.231, + "step": 13420 + }, + { + "epoch": 0.33962598375382747, + "grad_norm": 4.883594036102295, + "learning_rate": 7.505329943782293e-06, + "loss": 0.1585, + "step": 13421 + }, + { + "epoch": 0.3396512893185211, + "grad_norm": 12.585648536682129, + "learning_rate": 7.504982454453145e-06, + "loss": 0.3111, + "step": 13422 + }, + { + "epoch": 0.33967659488321483, + "grad_norm": 6.506510257720947, + "learning_rate": 7.504634948969833e-06, + "loss": 0.1768, + "step": 13423 + }, + { + "epoch": 0.3397019004479085, + "grad_norm": 6.065308094024658, + "learning_rate": 7.504287427334593e-06, + "loss": 0.1692, + "step": 13424 + }, + { + "epoch": 0.33972720601260215, + "grad_norm": 4.729558944702148, + "learning_rate": 7.503939889549671e-06, + "loss": 0.1501, + "step": 13425 + }, + { + "epoch": 0.33975251157729586, + "grad_norm": 2.7687625885009766, + "learning_rate": 7.503592335617304e-06, + "loss": 0.1514, + "step": 13426 + }, + { + "epoch": 0.3397778171419895, + "grad_norm": 5.065991401672363, + "learning_rate": 7.503244765539739e-06, + "loss": 0.1358, + "step": 13427 + }, + { + "epoch": 0.3398031227066832, + "grad_norm": 4.676320552825928, + "learning_rate": 7.502897179319213e-06, + "loss": 0.1653, + "step": 13428 + }, + { + "epoch": 0.3398284282713769, + "grad_norm": 6.230667591094971, + "learning_rate": 7.502549576957967e-06, + "loss": 0.1733, + "step": 13429 + }, + { + "epoch": 0.33985373383607054, + "grad_norm": 12.986666679382324, + "learning_rate": 7.502201958458247e-06, + "loss": 0.2162, + "step": 13430 + }, + { + "epoch": 0.33987903940076425, + "grad_norm": 7.816709041595459, + "learning_rate": 7.501854323822291e-06, + "loss": 0.3041, + "step": 13431 + }, + { + "epoch": 0.3399043449654579, + "grad_norm": 4.662293910980225, + "learning_rate": 7.501506673052341e-06, + "loss": 0.1548, + "step": 13432 + }, + { + "epoch": 0.33992965053015156, + "grad_norm": 4.797255039215088, + "learning_rate": 7.50115900615064e-06, + "loss": 0.1892, + "step": 13433 + }, + { + "epoch": 0.3399549560948453, + "grad_norm": 19.323898315429688, + "learning_rate": 7.5008113231194316e-06, + "loss": 0.3695, + "step": 13434 + }, + { + "epoch": 0.33998026165953893, + "grad_norm": 4.800852298736572, + "learning_rate": 7.500463623960954e-06, + "loss": 0.1986, + "step": 13435 + }, + { + "epoch": 0.3400055672242326, + "grad_norm": 3.5782358646392822, + "learning_rate": 7.500115908677452e-06, + "loss": 0.1713, + "step": 13436 + }, + { + "epoch": 0.3400308727889263, + "grad_norm": 14.568897247314453, + "learning_rate": 7.499768177271168e-06, + "loss": 0.16, + "step": 13437 + }, + { + "epoch": 0.34005617835361995, + "grad_norm": 4.733387470245361, + "learning_rate": 7.499420429744344e-06, + "loss": 0.178, + "step": 13438 + }, + { + "epoch": 0.3400814839183136, + "grad_norm": 7.0947747230529785, + "learning_rate": 7.499072666099223e-06, + "loss": 0.1862, + "step": 13439 + }, + { + "epoch": 0.3401067894830073, + "grad_norm": 7.4683308601379395, + "learning_rate": 7.498724886338047e-06, + "loss": 0.2602, + "step": 13440 + }, + { + "epoch": 0.340132095047701, + "grad_norm": 6.585425853729248, + "learning_rate": 7.498377090463058e-06, + "loss": 0.2569, + "step": 13441 + }, + { + "epoch": 0.3401574006123947, + "grad_norm": 2.9023396968841553, + "learning_rate": 7.4980292784765015e-06, + "loss": 0.0842, + "step": 13442 + }, + { + "epoch": 0.34018270617708835, + "grad_norm": 5.115022659301758, + "learning_rate": 7.497681450380618e-06, + "loss": 0.1711, + "step": 13443 + }, + { + "epoch": 0.340208011741782, + "grad_norm": 3.728100061416626, + "learning_rate": 7.497333606177651e-06, + "loss": 0.1473, + "step": 13444 + }, + { + "epoch": 0.3402333173064757, + "grad_norm": 4.555905818939209, + "learning_rate": 7.496985745869846e-06, + "loss": 0.1058, + "step": 13445 + }, + { + "epoch": 0.34025862287116937, + "grad_norm": 6.983024597167969, + "learning_rate": 7.496637869459442e-06, + "loss": 0.1935, + "step": 13446 + }, + { + "epoch": 0.340283928435863, + "grad_norm": 9.47528076171875, + "learning_rate": 7.496289976948685e-06, + "loss": 0.1863, + "step": 13447 + }, + { + "epoch": 0.34030923400055674, + "grad_norm": 5.24548864364624, + "learning_rate": 7.49594206833982e-06, + "loss": 0.1559, + "step": 13448 + }, + { + "epoch": 0.3403345395652504, + "grad_norm": 5.083047866821289, + "learning_rate": 7.495594143635088e-06, + "loss": 0.1774, + "step": 13449 + }, + { + "epoch": 0.34035984512994405, + "grad_norm": 5.047209739685059, + "learning_rate": 7.495246202836733e-06, + "loss": 0.1641, + "step": 13450 + }, + { + "epoch": 0.34038515069463776, + "grad_norm": 4.131845951080322, + "learning_rate": 7.494898245947e-06, + "loss": 0.2105, + "step": 13451 + }, + { + "epoch": 0.3404104562593314, + "grad_norm": 3.4901552200317383, + "learning_rate": 7.494550272968133e-06, + "loss": 0.1784, + "step": 13452 + }, + { + "epoch": 0.34043576182402513, + "grad_norm": 5.173212051391602, + "learning_rate": 7.494202283902374e-06, + "loss": 0.111, + "step": 13453 + }, + { + "epoch": 0.3404610673887188, + "grad_norm": 3.0724036693573, + "learning_rate": 7.493854278751969e-06, + "loss": 0.0901, + "step": 13454 + }, + { + "epoch": 0.34048637295341244, + "grad_norm": 5.1407999992370605, + "learning_rate": 7.4935062575191605e-06, + "loss": 0.1774, + "step": 13455 + }, + { + "epoch": 0.34051167851810615, + "grad_norm": 5.890231609344482, + "learning_rate": 7.493158220206196e-06, + "loss": 0.1943, + "step": 13456 + }, + { + "epoch": 0.3405369840827998, + "grad_norm": 4.682126522064209, + "learning_rate": 7.492810166815316e-06, + "loss": 0.1613, + "step": 13457 + }, + { + "epoch": 0.34056228964749347, + "grad_norm": 3.5500893592834473, + "learning_rate": 7.492462097348768e-06, + "loss": 0.1944, + "step": 13458 + }, + { + "epoch": 0.3405875952121872, + "grad_norm": 5.597775459289551, + "learning_rate": 7.492114011808796e-06, + "loss": 0.181, + "step": 13459 + }, + { + "epoch": 0.34061290077688083, + "grad_norm": 18.372098922729492, + "learning_rate": 7.4917659101976435e-06, + "loss": 0.2571, + "step": 13460 + }, + { + "epoch": 0.3406382063415745, + "grad_norm": 3.45009446144104, + "learning_rate": 7.491417792517557e-06, + "loss": 0.14, + "step": 13461 + }, + { + "epoch": 0.3406635119062682, + "grad_norm": 70.01094818115234, + "learning_rate": 7.49106965877078e-06, + "loss": 0.3739, + "step": 13462 + }, + { + "epoch": 0.34068881747096186, + "grad_norm": 4.580502033233643, + "learning_rate": 7.490721508959558e-06, + "loss": 0.2501, + "step": 13463 + }, + { + "epoch": 0.3407141230356555, + "grad_norm": 5.72512674331665, + "learning_rate": 7.4903733430861355e-06, + "loss": 0.2865, + "step": 13464 + }, + { + "epoch": 0.3407394286003492, + "grad_norm": 4.1176605224609375, + "learning_rate": 7.490025161152761e-06, + "loss": 0.1836, + "step": 13465 + }, + { + "epoch": 0.3407647341650429, + "grad_norm": 9.036401748657227, + "learning_rate": 7.489676963161675e-06, + "loss": 0.2584, + "step": 13466 + }, + { + "epoch": 0.3407900397297366, + "grad_norm": 3.8806169033050537, + "learning_rate": 7.489328749115127e-06, + "loss": 0.1786, + "step": 13467 + }, + { + "epoch": 0.34081534529443025, + "grad_norm": 4.355708599090576, + "learning_rate": 7.48898051901536e-06, + "loss": 0.1496, + "step": 13468 + }, + { + "epoch": 0.3408406508591239, + "grad_norm": 4.3946027755737305, + "learning_rate": 7.488632272864621e-06, + "loss": 0.2059, + "step": 13469 + }, + { + "epoch": 0.3408659564238176, + "grad_norm": 9.310702323913574, + "learning_rate": 7.488284010665155e-06, + "loss": 0.2433, + "step": 13470 + }, + { + "epoch": 0.3408912619885113, + "grad_norm": 5.639653205871582, + "learning_rate": 7.487935732419209e-06, + "loss": 0.1847, + "step": 13471 + }, + { + "epoch": 0.34091656755320493, + "grad_norm": 2.9000332355499268, + "learning_rate": 7.487587438129028e-06, + "loss": 0.1356, + "step": 13472 + }, + { + "epoch": 0.34094187311789864, + "grad_norm": 4.8675923347473145, + "learning_rate": 7.487239127796858e-06, + "loss": 0.186, + "step": 13473 + }, + { + "epoch": 0.3409671786825923, + "grad_norm": 3.179417610168457, + "learning_rate": 7.4868908014249465e-06, + "loss": 0.097, + "step": 13474 + }, + { + "epoch": 0.34099248424728595, + "grad_norm": 4.000964164733887, + "learning_rate": 7.4865424590155365e-06, + "loss": 0.1228, + "step": 13475 + }, + { + "epoch": 0.34101778981197967, + "grad_norm": 12.813002586364746, + "learning_rate": 7.48619410057088e-06, + "loss": 0.2343, + "step": 13476 + }, + { + "epoch": 0.3410430953766733, + "grad_norm": 2.331521511077881, + "learning_rate": 7.485845726093217e-06, + "loss": 0.0892, + "step": 13477 + }, + { + "epoch": 0.34106840094136703, + "grad_norm": 9.872968673706055, + "learning_rate": 7.485497335584799e-06, + "loss": 0.1772, + "step": 13478 + }, + { + "epoch": 0.3410937065060607, + "grad_norm": 3.5138189792633057, + "learning_rate": 7.4851489290478706e-06, + "loss": 0.1115, + "step": 13479 + }, + { + "epoch": 0.34111901207075435, + "grad_norm": 9.619598388671875, + "learning_rate": 7.484800506484679e-06, + "loss": 0.3009, + "step": 13480 + }, + { + "epoch": 0.34114431763544806, + "grad_norm": 6.097079277038574, + "learning_rate": 7.484452067897472e-06, + "loss": 0.1926, + "step": 13481 + }, + { + "epoch": 0.3411696232001417, + "grad_norm": 4.591494560241699, + "learning_rate": 7.484103613288494e-06, + "loss": 0.1836, + "step": 13482 + }, + { + "epoch": 0.34119492876483537, + "grad_norm": 5.225560188293457, + "learning_rate": 7.483755142659995e-06, + "loss": 0.1125, + "step": 13483 + }, + { + "epoch": 0.3412202343295291, + "grad_norm": 10.899263381958008, + "learning_rate": 7.483406656014221e-06, + "loss": 0.2294, + "step": 13484 + }, + { + "epoch": 0.34124553989422274, + "grad_norm": 3.8487162590026855, + "learning_rate": 7.48305815335342e-06, + "loss": 0.1592, + "step": 13485 + }, + { + "epoch": 0.3412708454589164, + "grad_norm": 3.7824015617370605, + "learning_rate": 7.4827096346798375e-06, + "loss": 0.1081, + "step": 13486 + }, + { + "epoch": 0.3412961510236101, + "grad_norm": 10.68786907196045, + "learning_rate": 7.482361099995724e-06, + "loss": 0.2257, + "step": 13487 + }, + { + "epoch": 0.34132145658830376, + "grad_norm": 2.7137298583984375, + "learning_rate": 7.482012549303324e-06, + "loss": 0.1382, + "step": 13488 + }, + { + "epoch": 0.3413467621529974, + "grad_norm": 8.374763488769531, + "learning_rate": 7.481663982604889e-06, + "loss": 0.2819, + "step": 13489 + }, + { + "epoch": 0.34137206771769113, + "grad_norm": 7.712489604949951, + "learning_rate": 7.481315399902662e-06, + "loss": 0.1943, + "step": 13490 + }, + { + "epoch": 0.3413973732823848, + "grad_norm": 6.981956958770752, + "learning_rate": 7.4809668011988945e-06, + "loss": 0.1767, + "step": 13491 + }, + { + "epoch": 0.3414226788470785, + "grad_norm": 4.6934075355529785, + "learning_rate": 7.480618186495834e-06, + "loss": 0.1221, + "step": 13492 + }, + { + "epoch": 0.34144798441177215, + "grad_norm": 4.4598917961120605, + "learning_rate": 7.480269555795727e-06, + "loss": 0.2179, + "step": 13493 + }, + { + "epoch": 0.3414732899764658, + "grad_norm": 27.317121505737305, + "learning_rate": 7.479920909100825e-06, + "loss": 0.3343, + "step": 13494 + }, + { + "epoch": 0.3414985955411595, + "grad_norm": 6.25692081451416, + "learning_rate": 7.479572246413372e-06, + "loss": 0.1563, + "step": 13495 + }, + { + "epoch": 0.3415239011058532, + "grad_norm": 7.495860576629639, + "learning_rate": 7.479223567735621e-06, + "loss": 0.3176, + "step": 13496 + }, + { + "epoch": 0.34154920667054683, + "grad_norm": 3.441965103149414, + "learning_rate": 7.478874873069817e-06, + "loss": 0.1635, + "step": 13497 + }, + { + "epoch": 0.34157451223524055, + "grad_norm": 3.6476292610168457, + "learning_rate": 7.478526162418212e-06, + "loss": 0.1236, + "step": 13498 + }, + { + "epoch": 0.3415998177999342, + "grad_norm": 11.869138717651367, + "learning_rate": 7.478177435783051e-06, + "loss": 0.2234, + "step": 13499 + }, + { + "epoch": 0.34162512336462786, + "grad_norm": 26.59531593322754, + "learning_rate": 7.477828693166585e-06, + "loss": 0.1998, + "step": 13500 + }, + { + "epoch": 0.34165042892932157, + "grad_norm": 5.727459907531738, + "learning_rate": 7.477479934571064e-06, + "loss": 0.1297, + "step": 13501 + }, + { + "epoch": 0.3416757344940152, + "grad_norm": 9.241803169250488, + "learning_rate": 7.477131159998734e-06, + "loss": 0.2113, + "step": 13502 + }, + { + "epoch": 0.3417010400587089, + "grad_norm": 3.9810004234313965, + "learning_rate": 7.476782369451846e-06, + "loss": 0.1313, + "step": 13503 + }, + { + "epoch": 0.3417263456234026, + "grad_norm": 3.23724365234375, + "learning_rate": 7.47643356293265e-06, + "loss": 0.1414, + "step": 13504 + }, + { + "epoch": 0.34175165118809625, + "grad_norm": 4.651614189147949, + "learning_rate": 7.476084740443394e-06, + "loss": 0.128, + "step": 13505 + }, + { + "epoch": 0.34177695675278996, + "grad_norm": 3.14780855178833, + "learning_rate": 7.475735901986328e-06, + "loss": 0.1562, + "step": 13506 + }, + { + "epoch": 0.3418022623174836, + "grad_norm": 10.787154197692871, + "learning_rate": 7.475387047563703e-06, + "loss": 0.2011, + "step": 13507 + }, + { + "epoch": 0.3418275678821773, + "grad_norm": 6.868342876434326, + "learning_rate": 7.4750381771777654e-06, + "loss": 0.2671, + "step": 13508 + }, + { + "epoch": 0.341852873446871, + "grad_norm": 5.187498569488525, + "learning_rate": 7.474689290830769e-06, + "loss": 0.1633, + "step": 13509 + }, + { + "epoch": 0.34187817901156464, + "grad_norm": 3.41078782081604, + "learning_rate": 7.47434038852496e-06, + "loss": 0.0955, + "step": 13510 + }, + { + "epoch": 0.3419034845762583, + "grad_norm": 6.511963844299316, + "learning_rate": 7.47399147026259e-06, + "loss": 0.2345, + "step": 13511 + }, + { + "epoch": 0.341928790140952, + "grad_norm": 4.324894428253174, + "learning_rate": 7.473642536045911e-06, + "loss": 0.1339, + "step": 13512 + }, + { + "epoch": 0.34195409570564567, + "grad_norm": 7.979368686676025, + "learning_rate": 7.47329358587717e-06, + "loss": 0.3037, + "step": 13513 + }, + { + "epoch": 0.3419794012703393, + "grad_norm": 7.858458995819092, + "learning_rate": 7.47294461975862e-06, + "loss": 0.1782, + "step": 13514 + }, + { + "epoch": 0.34200470683503303, + "grad_norm": 4.149681568145752, + "learning_rate": 7.472595637692508e-06, + "loss": 0.1845, + "step": 13515 + }, + { + "epoch": 0.3420300123997267, + "grad_norm": 2.4830703735351562, + "learning_rate": 7.472246639681089e-06, + "loss": 0.1041, + "step": 13516 + }, + { + "epoch": 0.3420553179644204, + "grad_norm": 9.281346321105957, + "learning_rate": 7.47189762572661e-06, + "loss": 0.1656, + "step": 13517 + }, + { + "epoch": 0.34208062352911406, + "grad_norm": 6.609033107757568, + "learning_rate": 7.471548595831324e-06, + "loss": 0.1701, + "step": 13518 + }, + { + "epoch": 0.3421059290938077, + "grad_norm": 6.0133843421936035, + "learning_rate": 7.471199549997478e-06, + "loss": 0.2043, + "step": 13519 + }, + { + "epoch": 0.3421312346585014, + "grad_norm": 4.616486549377441, + "learning_rate": 7.47085048822733e-06, + "loss": 0.1312, + "step": 13520 + }, + { + "epoch": 0.3421565402231951, + "grad_norm": 4.564270496368408, + "learning_rate": 7.470501410523123e-06, + "loss": 0.1393, + "step": 13521 + }, + { + "epoch": 0.34218184578788874, + "grad_norm": 3.9455292224884033, + "learning_rate": 7.470152316887114e-06, + "loss": 0.1514, + "step": 13522 + }, + { + "epoch": 0.34220715135258245, + "grad_norm": 8.4133939743042, + "learning_rate": 7.4698032073215535e-06, + "loss": 0.3406, + "step": 13523 + }, + { + "epoch": 0.3422324569172761, + "grad_norm": 5.336572170257568, + "learning_rate": 7.469454081828689e-06, + "loss": 0.1258, + "step": 13524 + }, + { + "epoch": 0.34225776248196976, + "grad_norm": 5.163942813873291, + "learning_rate": 7.469104940410775e-06, + "loss": 0.111, + "step": 13525 + }, + { + "epoch": 0.3422830680466635, + "grad_norm": 16.474546432495117, + "learning_rate": 7.468755783070063e-06, + "loss": 0.3333, + "step": 13526 + }, + { + "epoch": 0.34230837361135713, + "grad_norm": 4.272092342376709, + "learning_rate": 7.468406609808805e-06, + "loss": 0.1271, + "step": 13527 + }, + { + "epoch": 0.3423336791760508, + "grad_norm": 2.630671501159668, + "learning_rate": 7.46805742062925e-06, + "loss": 0.1345, + "step": 13528 + }, + { + "epoch": 0.3423589847407445, + "grad_norm": 5.843874931335449, + "learning_rate": 7.467708215533653e-06, + "loss": 0.1539, + "step": 13529 + }, + { + "epoch": 0.34238429030543815, + "grad_norm": 11.063204765319824, + "learning_rate": 7.467358994524265e-06, + "loss": 0.2295, + "step": 13530 + }, + { + "epoch": 0.34240959587013187, + "grad_norm": 12.254335403442383, + "learning_rate": 7.467009757603338e-06, + "loss": 0.4009, + "step": 13531 + }, + { + "epoch": 0.3424349014348255, + "grad_norm": 10.847898483276367, + "learning_rate": 7.466660504773123e-06, + "loss": 0.2667, + "step": 13532 + }, + { + "epoch": 0.3424602069995192, + "grad_norm": 4.058257579803467, + "learning_rate": 7.466311236035873e-06, + "loss": 0.1935, + "step": 13533 + }, + { + "epoch": 0.3424855125642129, + "grad_norm": 5.682099342346191, + "learning_rate": 7.465961951393842e-06, + "loss": 0.1315, + "step": 13534 + }, + { + "epoch": 0.34251081812890655, + "grad_norm": 6.828039169311523, + "learning_rate": 7.46561265084928e-06, + "loss": 0.3338, + "step": 13535 + }, + { + "epoch": 0.3425361236936002, + "grad_norm": 4.206899166107178, + "learning_rate": 7.465263334404441e-06, + "loss": 0.2104, + "step": 13536 + }, + { + "epoch": 0.3425614292582939, + "grad_norm": 3.5285027027130127, + "learning_rate": 7.464914002061578e-06, + "loss": 0.1183, + "step": 13537 + }, + { + "epoch": 0.34258673482298757, + "grad_norm": 6.57982873916626, + "learning_rate": 7.4645646538229426e-06, + "loss": 0.2251, + "step": 13538 + }, + { + "epoch": 0.3426120403876812, + "grad_norm": 4.551399230957031, + "learning_rate": 7.464215289690787e-06, + "loss": 0.1628, + "step": 13539 + }, + { + "epoch": 0.34263734595237494, + "grad_norm": 3.897141218185425, + "learning_rate": 7.463865909667367e-06, + "loss": 0.1656, + "step": 13540 + }, + { + "epoch": 0.3426626515170686, + "grad_norm": 8.85788631439209, + "learning_rate": 7.463516513754932e-06, + "loss": 0.2596, + "step": 13541 + }, + { + "epoch": 0.3426879570817623, + "grad_norm": 5.620447158813477, + "learning_rate": 7.463167101955739e-06, + "loss": 0.2329, + "step": 13542 + }, + { + "epoch": 0.34271326264645596, + "grad_norm": 5.5514817237854, + "learning_rate": 7.462817674272039e-06, + "loss": 0.1707, + "step": 13543 + }, + { + "epoch": 0.3427385682111496, + "grad_norm": 4.376486301422119, + "learning_rate": 7.462468230706086e-06, + "loss": 0.1656, + "step": 13544 + }, + { + "epoch": 0.34276387377584333, + "grad_norm": 6.295670032501221, + "learning_rate": 7.462118771260134e-06, + "loss": 0.1855, + "step": 13545 + }, + { + "epoch": 0.342789179340537, + "grad_norm": 15.96481704711914, + "learning_rate": 7.461769295936435e-06, + "loss": 0.2705, + "step": 13546 + }, + { + "epoch": 0.34281448490523064, + "grad_norm": 3.440204381942749, + "learning_rate": 7.461419804737245e-06, + "loss": 0.1803, + "step": 13547 + }, + { + "epoch": 0.34283979046992435, + "grad_norm": 3.283709764480591, + "learning_rate": 7.461070297664816e-06, + "loss": 0.1496, + "step": 13548 + }, + { + "epoch": 0.342865096034618, + "grad_norm": 8.166308403015137, + "learning_rate": 7.4607207747214025e-06, + "loss": 0.2186, + "step": 13549 + }, + { + "epoch": 0.34289040159931167, + "grad_norm": 19.678390502929688, + "learning_rate": 7.460371235909258e-06, + "loss": 0.1557, + "step": 13550 + }, + { + "epoch": 0.3429157071640054, + "grad_norm": 3.9322805404663086, + "learning_rate": 7.460021681230638e-06, + "loss": 0.1449, + "step": 13551 + }, + { + "epoch": 0.34294101272869904, + "grad_norm": 4.4837117195129395, + "learning_rate": 7.459672110687795e-06, + "loss": 0.1614, + "step": 13552 + }, + { + "epoch": 0.3429663182933927, + "grad_norm": 5.196079730987549, + "learning_rate": 7.459322524282983e-06, + "loss": 0.2424, + "step": 13553 + }, + { + "epoch": 0.3429916238580864, + "grad_norm": 5.562961101531982, + "learning_rate": 7.458972922018459e-06, + "loss": 0.15, + "step": 13554 + }, + { + "epoch": 0.34301692942278006, + "grad_norm": 6.980226516723633, + "learning_rate": 7.458623303896475e-06, + "loss": 0.1629, + "step": 13555 + }, + { + "epoch": 0.34304223498747377, + "grad_norm": 2.92840313911438, + "learning_rate": 7.458273669919289e-06, + "loss": 0.1577, + "step": 13556 + }, + { + "epoch": 0.3430675405521674, + "grad_norm": 10.571218490600586, + "learning_rate": 7.4579240200891524e-06, + "loss": 0.1866, + "step": 13557 + }, + { + "epoch": 0.3430928461168611, + "grad_norm": 4.311107158660889, + "learning_rate": 7.457574354408319e-06, + "loss": 0.1867, + "step": 13558 + }, + { + "epoch": 0.3431181516815548, + "grad_norm": 4.9219255447387695, + "learning_rate": 7.457224672879048e-06, + "loss": 0.1177, + "step": 13559 + }, + { + "epoch": 0.34314345724624845, + "grad_norm": 8.51948356628418, + "learning_rate": 7.456874975503591e-06, + "loss": 0.1726, + "step": 13560 + }, + { + "epoch": 0.3431687628109421, + "grad_norm": 3.655799627304077, + "learning_rate": 7.456525262284204e-06, + "loss": 0.1651, + "step": 13561 + }, + { + "epoch": 0.3431940683756358, + "grad_norm": 5.6042351722717285, + "learning_rate": 7.456175533223145e-06, + "loss": 0.122, + "step": 13562 + }, + { + "epoch": 0.3432193739403295, + "grad_norm": 8.347644805908203, + "learning_rate": 7.4558257883226634e-06, + "loss": 0.2021, + "step": 13563 + }, + { + "epoch": 0.34324467950502313, + "grad_norm": 4.850529670715332, + "learning_rate": 7.45547602758502e-06, + "loss": 0.1361, + "step": 13564 + }, + { + "epoch": 0.34326998506971684, + "grad_norm": 6.82844352722168, + "learning_rate": 7.455126251012469e-06, + "loss": 0.1998, + "step": 13565 + }, + { + "epoch": 0.3432952906344105, + "grad_norm": 4.524291038513184, + "learning_rate": 7.4547764586072635e-06, + "loss": 0.1505, + "step": 13566 + }, + { + "epoch": 0.34332059619910416, + "grad_norm": 10.10136604309082, + "learning_rate": 7.454426650371664e-06, + "loss": 0.2807, + "step": 13567 + }, + { + "epoch": 0.34334590176379787, + "grad_norm": 2.97861909866333, + "learning_rate": 7.454076826307921e-06, + "loss": 0.1924, + "step": 13568 + }, + { + "epoch": 0.3433712073284915, + "grad_norm": 3.9203574657440186, + "learning_rate": 7.453726986418293e-06, + "loss": 0.1544, + "step": 13569 + }, + { + "epoch": 0.34339651289318524, + "grad_norm": 3.720639228820801, + "learning_rate": 7.4533771307050375e-06, + "loss": 0.1457, + "step": 13570 + }, + { + "epoch": 0.3434218184578789, + "grad_norm": 5.28305721282959, + "learning_rate": 7.45302725917041e-06, + "loss": 0.163, + "step": 13571 + }, + { + "epoch": 0.34344712402257255, + "grad_norm": 6.284279823303223, + "learning_rate": 7.452677371816663e-06, + "loss": 0.2249, + "step": 13572 + }, + { + "epoch": 0.34347242958726626, + "grad_norm": 8.85888385772705, + "learning_rate": 7.452327468646059e-06, + "loss": 0.2119, + "step": 13573 + }, + { + "epoch": 0.3434977351519599, + "grad_norm": 5.490776538848877, + "learning_rate": 7.451977549660849e-06, + "loss": 0.2037, + "step": 13574 + }, + { + "epoch": 0.34352304071665357, + "grad_norm": 4.703118324279785, + "learning_rate": 7.451627614863293e-06, + "loss": 0.1306, + "step": 13575 + }, + { + "epoch": 0.3435483462813473, + "grad_norm": 4.001275539398193, + "learning_rate": 7.451277664255647e-06, + "loss": 0.2177, + "step": 13576 + }, + { + "epoch": 0.34357365184604094, + "grad_norm": 5.564486503601074, + "learning_rate": 7.450927697840167e-06, + "loss": 0.1308, + "step": 13577 + }, + { + "epoch": 0.3435989574107346, + "grad_norm": 4.02154016494751, + "learning_rate": 7.450577715619109e-06, + "loss": 0.1551, + "step": 13578 + }, + { + "epoch": 0.3436242629754283, + "grad_norm": 3.4250054359436035, + "learning_rate": 7.450227717594732e-06, + "loss": 0.148, + "step": 13579 + }, + { + "epoch": 0.34364956854012196, + "grad_norm": 6.861491680145264, + "learning_rate": 7.449877703769294e-06, + "loss": 0.2776, + "step": 13580 + }, + { + "epoch": 0.3436748741048157, + "grad_norm": 3.718130350112915, + "learning_rate": 7.449527674145047e-06, + "loss": 0.1788, + "step": 13581 + }, + { + "epoch": 0.34370017966950933, + "grad_norm": 4.413766860961914, + "learning_rate": 7.449177628724254e-06, + "loss": 0.1485, + "step": 13582 + }, + { + "epoch": 0.343725485234203, + "grad_norm": 12.13066291809082, + "learning_rate": 7.44882756750917e-06, + "loss": 0.269, + "step": 13583 + }, + { + "epoch": 0.3437507907988967, + "grad_norm": 5.46685266494751, + "learning_rate": 7.448477490502053e-06, + "loss": 0.2042, + "step": 13584 + }, + { + "epoch": 0.34377609636359036, + "grad_norm": 4.948602676391602, + "learning_rate": 7.448127397705158e-06, + "loss": 0.221, + "step": 13585 + }, + { + "epoch": 0.343801401928284, + "grad_norm": 16.55699348449707, + "learning_rate": 7.447777289120746e-06, + "loss": 0.2302, + "step": 13586 + }, + { + "epoch": 0.3438267074929777, + "grad_norm": 4.302700042724609, + "learning_rate": 7.447427164751073e-06, + "loss": 0.1809, + "step": 13587 + }, + { + "epoch": 0.3438520130576714, + "grad_norm": 3.9535651206970215, + "learning_rate": 7.447077024598399e-06, + "loss": 0.2145, + "step": 13588 + }, + { + "epoch": 0.34387731862236504, + "grad_norm": 10.234630584716797, + "learning_rate": 7.44672686866498e-06, + "loss": 0.2169, + "step": 13589 + }, + { + "epoch": 0.34390262418705875, + "grad_norm": 6.463864326477051, + "learning_rate": 7.4463766969530736e-06, + "loss": 0.2666, + "step": 13590 + }, + { + "epoch": 0.3439279297517524, + "grad_norm": 5.237135887145996, + "learning_rate": 7.44602650946494e-06, + "loss": 0.1469, + "step": 13591 + }, + { + "epoch": 0.34395323531644606, + "grad_norm": 7.94114875793457, + "learning_rate": 7.445676306202835e-06, + "loss": 0.1653, + "step": 13592 + }, + { + "epoch": 0.34397854088113977, + "grad_norm": 6.213430404663086, + "learning_rate": 7.44532608716902e-06, + "loss": 0.2202, + "step": 13593 + }, + { + "epoch": 0.3440038464458334, + "grad_norm": 3.024148464202881, + "learning_rate": 7.4449758523657505e-06, + "loss": 0.1039, + "step": 13594 + }, + { + "epoch": 0.34402915201052714, + "grad_norm": 11.480051040649414, + "learning_rate": 7.4446256017952876e-06, + "loss": 0.1726, + "step": 13595 + }, + { + "epoch": 0.3440544575752208, + "grad_norm": 3.6626200675964355, + "learning_rate": 7.444275335459887e-06, + "loss": 0.1112, + "step": 13596 + }, + { + "epoch": 0.34407976313991445, + "grad_norm": 6.599711894989014, + "learning_rate": 7.443925053361811e-06, + "loss": 0.2192, + "step": 13597 + }, + { + "epoch": 0.34410506870460816, + "grad_norm": 3.936736822128296, + "learning_rate": 7.443574755503317e-06, + "loss": 0.2151, + "step": 13598 + }, + { + "epoch": 0.3441303742693018, + "grad_norm": 8.021611213684082, + "learning_rate": 7.443224441886664e-06, + "loss": 0.2601, + "step": 13599 + }, + { + "epoch": 0.3441556798339955, + "grad_norm": 9.129144668579102, + "learning_rate": 7.442874112514111e-06, + "loss": 0.2579, + "step": 13600 + }, + { + "epoch": 0.3441809853986892, + "grad_norm": 5.614837169647217, + "learning_rate": 7.442523767387917e-06, + "loss": 0.1449, + "step": 13601 + }, + { + "epoch": 0.34420629096338284, + "grad_norm": 4.085674285888672, + "learning_rate": 7.442173406510342e-06, + "loss": 0.1781, + "step": 13602 + }, + { + "epoch": 0.3442315965280765, + "grad_norm": 4.3688178062438965, + "learning_rate": 7.441823029883644e-06, + "loss": 0.1542, + "step": 13603 + }, + { + "epoch": 0.3442569020927702, + "grad_norm": 5.5314764976501465, + "learning_rate": 7.441472637510085e-06, + "loss": 0.2103, + "step": 13604 + }, + { + "epoch": 0.34428220765746387, + "grad_norm": 4.2093119621276855, + "learning_rate": 7.441122229391922e-06, + "loss": 0.2241, + "step": 13605 + }, + { + "epoch": 0.3443075132221576, + "grad_norm": 3.8668105602264404, + "learning_rate": 7.440771805531417e-06, + "loss": 0.1229, + "step": 13606 + }, + { + "epoch": 0.34433281878685124, + "grad_norm": 3.383004903793335, + "learning_rate": 7.440421365930827e-06, + "loss": 0.0956, + "step": 13607 + }, + { + "epoch": 0.3443581243515449, + "grad_norm": 17.815685272216797, + "learning_rate": 7.440070910592414e-06, + "loss": 0.2478, + "step": 13608 + }, + { + "epoch": 0.3443834299162386, + "grad_norm": 7.80714750289917, + "learning_rate": 7.439720439518439e-06, + "loss": 0.2667, + "step": 13609 + }, + { + "epoch": 0.34440873548093226, + "grad_norm": 4.087882041931152, + "learning_rate": 7.439369952711158e-06, + "loss": 0.0975, + "step": 13610 + }, + { + "epoch": 0.3444340410456259, + "grad_norm": 5.179803848266602, + "learning_rate": 7.4390194501728365e-06, + "loss": 0.1797, + "step": 13611 + }, + { + "epoch": 0.34445934661031963, + "grad_norm": 7.470865249633789, + "learning_rate": 7.43866893190573e-06, + "loss": 0.1821, + "step": 13612 + }, + { + "epoch": 0.3444846521750133, + "grad_norm": 2.773651123046875, + "learning_rate": 7.438318397912103e-06, + "loss": 0.1895, + "step": 13613 + }, + { + "epoch": 0.34450995773970694, + "grad_norm": 7.0435991287231445, + "learning_rate": 7.4379678481942145e-06, + "loss": 0.127, + "step": 13614 + }, + { + "epoch": 0.34453526330440065, + "grad_norm": 2.646594762802124, + "learning_rate": 7.437617282754324e-06, + "loss": 0.187, + "step": 13615 + }, + { + "epoch": 0.3445605688690943, + "grad_norm": 10.729513168334961, + "learning_rate": 7.437266701594692e-06, + "loss": 0.3127, + "step": 13616 + }, + { + "epoch": 0.34458587443378796, + "grad_norm": 6.5947136878967285, + "learning_rate": 7.436916104717582e-06, + "loss": 0.1531, + "step": 13617 + }, + { + "epoch": 0.3446111799984817, + "grad_norm": 6.282017230987549, + "learning_rate": 7.436565492125254e-06, + "loss": 0.2258, + "step": 13618 + }, + { + "epoch": 0.34463648556317533, + "grad_norm": 5.9660844802856445, + "learning_rate": 7.436214863819966e-06, + "loss": 0.1959, + "step": 13619 + }, + { + "epoch": 0.34466179112786904, + "grad_norm": 3.6725056171417236, + "learning_rate": 7.4358642198039835e-06, + "loss": 0.1363, + "step": 13620 + }, + { + "epoch": 0.3446870966925627, + "grad_norm": 3.1698319911956787, + "learning_rate": 7.435513560079565e-06, + "loss": 0.1069, + "step": 13621 + }, + { + "epoch": 0.34471240225725636, + "grad_norm": 7.273515701293945, + "learning_rate": 7.435162884648973e-06, + "loss": 0.2026, + "step": 13622 + }, + { + "epoch": 0.34473770782195007, + "grad_norm": 8.2747163772583, + "learning_rate": 7.4348121935144675e-06, + "loss": 0.1472, + "step": 13623 + }, + { + "epoch": 0.3447630133866437, + "grad_norm": 3.439519166946411, + "learning_rate": 7.4344614866783124e-06, + "loss": 0.146, + "step": 13624 + }, + { + "epoch": 0.3447883189513374, + "grad_norm": 6.704033374786377, + "learning_rate": 7.434110764142767e-06, + "loss": 0.2082, + "step": 13625 + }, + { + "epoch": 0.3448136245160311, + "grad_norm": 2.3336524963378906, + "learning_rate": 7.4337600259100954e-06, + "loss": 0.1084, + "step": 13626 + }, + { + "epoch": 0.34483893008072475, + "grad_norm": 3.6011412143707275, + "learning_rate": 7.433409271982556e-06, + "loss": 0.2022, + "step": 13627 + }, + { + "epoch": 0.3448642356454184, + "grad_norm": 11.169617652893066, + "learning_rate": 7.4330585023624145e-06, + "loss": 0.3694, + "step": 13628 + }, + { + "epoch": 0.3448895412101121, + "grad_norm": 5.726242542266846, + "learning_rate": 7.43270771705193e-06, + "loss": 0.2182, + "step": 13629 + }, + { + "epoch": 0.3449148467748058, + "grad_norm": 3.3142142295837402, + "learning_rate": 7.432356916053366e-06, + "loss": 0.148, + "step": 13630 + }, + { + "epoch": 0.34494015233949943, + "grad_norm": 5.947891712188721, + "learning_rate": 7.432006099368986e-06, + "loss": 0.2063, + "step": 13631 + }, + { + "epoch": 0.34496545790419314, + "grad_norm": 12.813484191894531, + "learning_rate": 7.43165526700105e-06, + "loss": 0.1502, + "step": 13632 + }, + { + "epoch": 0.3449907634688868, + "grad_norm": 4.575421333312988, + "learning_rate": 7.431304418951823e-06, + "loss": 0.1547, + "step": 13633 + }, + { + "epoch": 0.3450160690335805, + "grad_norm": 4.550850868225098, + "learning_rate": 7.430953555223565e-06, + "loss": 0.2171, + "step": 13634 + }, + { + "epoch": 0.34504137459827416, + "grad_norm": 2.971590042114258, + "learning_rate": 7.430602675818539e-06, + "loss": 0.1362, + "step": 13635 + }, + { + "epoch": 0.3450666801629678, + "grad_norm": 4.696249008178711, + "learning_rate": 7.430251780739008e-06, + "loss": 0.1927, + "step": 13636 + }, + { + "epoch": 0.34509198572766153, + "grad_norm": 6.715738773345947, + "learning_rate": 7.4299008699872375e-06, + "loss": 0.2137, + "step": 13637 + }, + { + "epoch": 0.3451172912923552, + "grad_norm": 4.3649468421936035, + "learning_rate": 7.429549943565487e-06, + "loss": 0.2676, + "step": 13638 + }, + { + "epoch": 0.34514259685704884, + "grad_norm": 12.18580150604248, + "learning_rate": 7.4291990014760215e-06, + "loss": 0.2352, + "step": 13639 + }, + { + "epoch": 0.34516790242174256, + "grad_norm": 13.62482738494873, + "learning_rate": 7.428848043721103e-06, + "loss": 0.2087, + "step": 13640 + }, + { + "epoch": 0.3451932079864362, + "grad_norm": 10.211967468261719, + "learning_rate": 7.428497070302995e-06, + "loss": 0.2006, + "step": 13641 + }, + { + "epoch": 0.34521851355112987, + "grad_norm": 4.89219331741333, + "learning_rate": 7.428146081223962e-06, + "loss": 0.1769, + "step": 13642 + }, + { + "epoch": 0.3452438191158236, + "grad_norm": 3.4676976203918457, + "learning_rate": 7.427795076486266e-06, + "loss": 0.194, + "step": 13643 + }, + { + "epoch": 0.34526912468051724, + "grad_norm": 4.730061054229736, + "learning_rate": 7.427444056092172e-06, + "loss": 0.1955, + "step": 13644 + }, + { + "epoch": 0.34529443024521095, + "grad_norm": 7.244555473327637, + "learning_rate": 7.427093020043942e-06, + "loss": 0.2753, + "step": 13645 + }, + { + "epoch": 0.3453197358099046, + "grad_norm": 3.556748867034912, + "learning_rate": 7.4267419683438405e-06, + "loss": 0.1793, + "step": 13646 + }, + { + "epoch": 0.34534504137459826, + "grad_norm": 8.659043312072754, + "learning_rate": 7.426390900994132e-06, + "loss": 0.2507, + "step": 13647 + }, + { + "epoch": 0.345370346939292, + "grad_norm": 14.799986839294434, + "learning_rate": 7.426039817997079e-06, + "loss": 0.1917, + "step": 13648 + }, + { + "epoch": 0.34539565250398563, + "grad_norm": 2.7827839851379395, + "learning_rate": 7.425688719354948e-06, + "loss": 0.1181, + "step": 13649 + }, + { + "epoch": 0.3454209580686793, + "grad_norm": 4.320609092712402, + "learning_rate": 7.42533760507e-06, + "loss": 0.1943, + "step": 13650 + }, + { + "epoch": 0.345446263633373, + "grad_norm": 7.056897163391113, + "learning_rate": 7.424986475144502e-06, + "loss": 0.2258, + "step": 13651 + }, + { + "epoch": 0.34547156919806665, + "grad_norm": 14.018725395202637, + "learning_rate": 7.424635329580717e-06, + "loss": 0.2712, + "step": 13652 + }, + { + "epoch": 0.3454968747627603, + "grad_norm": 4.044495105743408, + "learning_rate": 7.424284168380911e-06, + "loss": 0.12, + "step": 13653 + }, + { + "epoch": 0.345522180327454, + "grad_norm": 6.090994358062744, + "learning_rate": 7.423932991547344e-06, + "loss": 0.2134, + "step": 13654 + }, + { + "epoch": 0.3455474858921477, + "grad_norm": 4.416069984436035, + "learning_rate": 7.423581799082288e-06, + "loss": 0.1351, + "step": 13655 + }, + { + "epoch": 0.34557279145684133, + "grad_norm": 5.7810187339782715, + "learning_rate": 7.423230590988001e-06, + "loss": 0.1621, + "step": 13656 + }, + { + "epoch": 0.34559809702153504, + "grad_norm": 4.452706336975098, + "learning_rate": 7.422879367266752e-06, + "loss": 0.1231, + "step": 13657 + }, + { + "epoch": 0.3456234025862287, + "grad_norm": 5.626373291015625, + "learning_rate": 7.422528127920804e-06, + "loss": 0.1767, + "step": 13658 + }, + { + "epoch": 0.3456487081509224, + "grad_norm": 7.0534563064575195, + "learning_rate": 7.422176872952423e-06, + "loss": 0.2134, + "step": 13659 + }, + { + "epoch": 0.34567401371561607, + "grad_norm": 3.6428163051605225, + "learning_rate": 7.421825602363874e-06, + "loss": 0.1399, + "step": 13660 + }, + { + "epoch": 0.3456993192803097, + "grad_norm": 3.9309756755828857, + "learning_rate": 7.421474316157421e-06, + "loss": 0.1376, + "step": 13661 + }, + { + "epoch": 0.34572462484500344, + "grad_norm": 6.430425643920898, + "learning_rate": 7.421123014335332e-06, + "loss": 0.2088, + "step": 13662 + }, + { + "epoch": 0.3457499304096971, + "grad_norm": 2.9296908378601074, + "learning_rate": 7.420771696899871e-06, + "loss": 0.1684, + "step": 13663 + }, + { + "epoch": 0.34577523597439075, + "grad_norm": 2.9564437866210938, + "learning_rate": 7.420420363853303e-06, + "loss": 0.1738, + "step": 13664 + }, + { + "epoch": 0.34580054153908446, + "grad_norm": 4.854440212249756, + "learning_rate": 7.420069015197895e-06, + "loss": 0.1748, + "step": 13665 + }, + { + "epoch": 0.3458258471037781, + "grad_norm": 5.206188678741455, + "learning_rate": 7.419717650935912e-06, + "loss": 0.1836, + "step": 13666 + }, + { + "epoch": 0.3458511526684718, + "grad_norm": 3.768170118331909, + "learning_rate": 7.419366271069619e-06, + "loss": 0.1378, + "step": 13667 + }, + { + "epoch": 0.3458764582331655, + "grad_norm": 3.851041555404663, + "learning_rate": 7.419014875601284e-06, + "loss": 0.2428, + "step": 13668 + }, + { + "epoch": 0.34590176379785914, + "grad_norm": 7.18710470199585, + "learning_rate": 7.418663464533171e-06, + "loss": 0.1706, + "step": 13669 + }, + { + "epoch": 0.34592706936255285, + "grad_norm": 7.3737053871154785, + "learning_rate": 7.418312037867549e-06, + "loss": 0.2255, + "step": 13670 + }, + { + "epoch": 0.3459523749272465, + "grad_norm": 5.555451393127441, + "learning_rate": 7.41796059560668e-06, + "loss": 0.1075, + "step": 13671 + }, + { + "epoch": 0.34597768049194016, + "grad_norm": 3.932713031768799, + "learning_rate": 7.417609137752835e-06, + "loss": 0.1237, + "step": 13672 + }, + { + "epoch": 0.3460029860566339, + "grad_norm": 6.266505241394043, + "learning_rate": 7.4172576643082775e-06, + "loss": 0.2228, + "step": 13673 + }, + { + "epoch": 0.34602829162132753, + "grad_norm": 8.059361457824707, + "learning_rate": 7.416906175275276e-06, + "loss": 0.1658, + "step": 13674 + }, + { + "epoch": 0.3460535971860212, + "grad_norm": 9.571030616760254, + "learning_rate": 7.416554670656094e-06, + "loss": 0.1525, + "step": 13675 + }, + { + "epoch": 0.3460789027507149, + "grad_norm": 3.7386510372161865, + "learning_rate": 7.4162031504530026e-06, + "loss": 0.1784, + "step": 13676 + }, + { + "epoch": 0.34610420831540856, + "grad_norm": 5.014374256134033, + "learning_rate": 7.415851614668265e-06, + "loss": 0.202, + "step": 13677 + }, + { + "epoch": 0.3461295138801022, + "grad_norm": 4.583553314208984, + "learning_rate": 7.41550006330415e-06, + "loss": 0.2045, + "step": 13678 + }, + { + "epoch": 0.3461548194447959, + "grad_norm": 4.328359127044678, + "learning_rate": 7.415148496362926e-06, + "loss": 0.1974, + "step": 13679 + }, + { + "epoch": 0.3461801250094896, + "grad_norm": 3.5216002464294434, + "learning_rate": 7.4147969138468575e-06, + "loss": 0.1829, + "step": 13680 + }, + { + "epoch": 0.34620543057418324, + "grad_norm": 21.982147216796875, + "learning_rate": 7.414445315758213e-06, + "loss": 0.2362, + "step": 13681 + }, + { + "epoch": 0.34623073613887695, + "grad_norm": 5.13161563873291, + "learning_rate": 7.414093702099259e-06, + "loss": 0.1554, + "step": 13682 + }, + { + "epoch": 0.3462560417035706, + "grad_norm": 6.135571479797363, + "learning_rate": 7.413742072872264e-06, + "loss": 0.1763, + "step": 13683 + }, + { + "epoch": 0.3462813472682643, + "grad_norm": 3.443866729736328, + "learning_rate": 7.413390428079496e-06, + "loss": 0.0754, + "step": 13684 + }, + { + "epoch": 0.346306652832958, + "grad_norm": 9.118894577026367, + "learning_rate": 7.413038767723222e-06, + "loss": 0.2923, + "step": 13685 + }, + { + "epoch": 0.34633195839765163, + "grad_norm": 7.966063022613525, + "learning_rate": 7.412687091805709e-06, + "loss": 0.2674, + "step": 13686 + }, + { + "epoch": 0.34635726396234534, + "grad_norm": 7.107515335083008, + "learning_rate": 7.412335400329227e-06, + "loss": 0.1857, + "step": 13687 + }, + { + "epoch": 0.346382569527039, + "grad_norm": 6.67046594619751, + "learning_rate": 7.411983693296042e-06, + "loss": 0.2198, + "step": 13688 + }, + { + "epoch": 0.34640787509173265, + "grad_norm": 5.815803527832031, + "learning_rate": 7.411631970708423e-06, + "loss": 0.2209, + "step": 13689 + }, + { + "epoch": 0.34643318065642636, + "grad_norm": 5.8369879722595215, + "learning_rate": 7.411280232568639e-06, + "loss": 0.2035, + "step": 13690 + }, + { + "epoch": 0.34645848622112, + "grad_norm": 5.51198148727417, + "learning_rate": 7.410928478878955e-06, + "loss": 0.1632, + "step": 13691 + }, + { + "epoch": 0.3464837917858137, + "grad_norm": 6.02046537399292, + "learning_rate": 7.4105767096416435e-06, + "loss": 0.2212, + "step": 13692 + }, + { + "epoch": 0.3465090973505074, + "grad_norm": 4.303238391876221, + "learning_rate": 7.410224924858969e-06, + "loss": 0.2326, + "step": 13693 + }, + { + "epoch": 0.34653440291520105, + "grad_norm": 16.104610443115234, + "learning_rate": 7.409873124533204e-06, + "loss": 0.1572, + "step": 13694 + }, + { + "epoch": 0.3465597084798947, + "grad_norm": 4.862789154052734, + "learning_rate": 7.409521308666616e-06, + "loss": 0.2258, + "step": 13695 + }, + { + "epoch": 0.3465850140445884, + "grad_norm": 5.938526153564453, + "learning_rate": 7.409169477261472e-06, + "loss": 0.1964, + "step": 13696 + }, + { + "epoch": 0.34661031960928207, + "grad_norm": 10.366247177124023, + "learning_rate": 7.4088176303200445e-06, + "loss": 0.2263, + "step": 13697 + }, + { + "epoch": 0.3466356251739758, + "grad_norm": 6.361989974975586, + "learning_rate": 7.408465767844597e-06, + "loss": 0.2497, + "step": 13698 + }, + { + "epoch": 0.34666093073866944, + "grad_norm": 3.9010961055755615, + "learning_rate": 7.408113889837404e-06, + "loss": 0.1132, + "step": 13699 + }, + { + "epoch": 0.3466862363033631, + "grad_norm": 4.1356096267700195, + "learning_rate": 7.407761996300731e-06, + "loss": 0.1924, + "step": 13700 + }, + { + "epoch": 0.3467115418680568, + "grad_norm": 4.117496490478516, + "learning_rate": 7.40741008723685e-06, + "loss": 0.2233, + "step": 13701 + }, + { + "epoch": 0.34673684743275046, + "grad_norm": 6.261764049530029, + "learning_rate": 7.407058162648029e-06, + "loss": 0.1443, + "step": 13702 + }, + { + "epoch": 0.3467621529974441, + "grad_norm": 10.996105194091797, + "learning_rate": 7.4067062225365374e-06, + "loss": 0.1972, + "step": 13703 + }, + { + "epoch": 0.34678745856213783, + "grad_norm": 7.02742338180542, + "learning_rate": 7.4063542669046454e-06, + "loss": 0.1627, + "step": 13704 + }, + { + "epoch": 0.3468127641268315, + "grad_norm": 3.271928310394287, + "learning_rate": 7.406002295754622e-06, + "loss": 0.2208, + "step": 13705 + }, + { + "epoch": 0.34683806969152514, + "grad_norm": 10.895051956176758, + "learning_rate": 7.405650309088737e-06, + "loss": 0.3239, + "step": 13706 + }, + { + "epoch": 0.34686337525621885, + "grad_norm": 4.577854156494141, + "learning_rate": 7.405298306909261e-06, + "loss": 0.1746, + "step": 13707 + }, + { + "epoch": 0.3468886808209125, + "grad_norm": 3.5823349952697754, + "learning_rate": 7.404946289218464e-06, + "loss": 0.1174, + "step": 13708 + }, + { + "epoch": 0.3469139863856062, + "grad_norm": 6.76181173324585, + "learning_rate": 7.404594256018616e-06, + "loss": 0.2859, + "step": 13709 + }, + { + "epoch": 0.3469392919502999, + "grad_norm": 4.518527507781982, + "learning_rate": 7.404242207311986e-06, + "loss": 0.1727, + "step": 13710 + }, + { + "epoch": 0.34696459751499353, + "grad_norm": 5.754664421081543, + "learning_rate": 7.403890143100845e-06, + "loss": 0.2721, + "step": 13711 + }, + { + "epoch": 0.34698990307968725, + "grad_norm": 5.755553722381592, + "learning_rate": 7.403538063387464e-06, + "loss": 0.1752, + "step": 13712 + }, + { + "epoch": 0.3470152086443809, + "grad_norm": 6.469030857086182, + "learning_rate": 7.403185968174114e-06, + "loss": 0.283, + "step": 13713 + }, + { + "epoch": 0.34704051420907456, + "grad_norm": 4.447511672973633, + "learning_rate": 7.4028338574630636e-06, + "loss": 0.2097, + "step": 13714 + }, + { + "epoch": 0.34706581977376827, + "grad_norm": 4.577086448669434, + "learning_rate": 7.402481731256585e-06, + "loss": 0.2242, + "step": 13715 + }, + { + "epoch": 0.3470911253384619, + "grad_norm": 9.123611450195312, + "learning_rate": 7.402129589556948e-06, + "loss": 0.2725, + "step": 13716 + }, + { + "epoch": 0.3471164309031556, + "grad_norm": 4.774433135986328, + "learning_rate": 7.401777432366425e-06, + "loss": 0.2248, + "step": 13717 + }, + { + "epoch": 0.3471417364678493, + "grad_norm": 3.266184091567993, + "learning_rate": 7.4014252596872845e-06, + "loss": 0.1778, + "step": 13718 + }, + { + "epoch": 0.34716704203254295, + "grad_norm": 4.852959156036377, + "learning_rate": 7.4010730715218015e-06, + "loss": 0.1737, + "step": 13719 + }, + { + "epoch": 0.3471923475972366, + "grad_norm": 8.572929382324219, + "learning_rate": 7.400720867872245e-06, + "loss": 0.2531, + "step": 13720 + }, + { + "epoch": 0.3472176531619303, + "grad_norm": 8.453852653503418, + "learning_rate": 7.4003686487408846e-06, + "loss": 0.2432, + "step": 13721 + }, + { + "epoch": 0.347242958726624, + "grad_norm": 13.195289611816406, + "learning_rate": 7.400016414129993e-06, + "loss": 0.311, + "step": 13722 + }, + { + "epoch": 0.3472682642913177, + "grad_norm": 3.5410356521606445, + "learning_rate": 7.399664164041843e-06, + "loss": 0.1563, + "step": 13723 + }, + { + "epoch": 0.34729356985601134, + "grad_norm": 12.003546714782715, + "learning_rate": 7.399311898478705e-06, + "loss": 0.1809, + "step": 13724 + }, + { + "epoch": 0.347318875420705, + "grad_norm": 5.461714744567871, + "learning_rate": 7.39895961744285e-06, + "loss": 0.2112, + "step": 13725 + }, + { + "epoch": 0.3473441809853987, + "grad_norm": 3.345853805541992, + "learning_rate": 7.3986073209365525e-06, + "loss": 0.1606, + "step": 13726 + }, + { + "epoch": 0.34736948655009237, + "grad_norm": 4.1598405838012695, + "learning_rate": 7.398255008962081e-06, + "loss": 0.112, + "step": 13727 + }, + { + "epoch": 0.347394792114786, + "grad_norm": 17.087162017822266, + "learning_rate": 7.3979026815217094e-06, + "loss": 0.2215, + "step": 13728 + }, + { + "epoch": 0.34742009767947973, + "grad_norm": 6.846421241760254, + "learning_rate": 7.397550338617709e-06, + "loss": 0.3291, + "step": 13729 + }, + { + "epoch": 0.3474454032441734, + "grad_norm": 4.111597537994385, + "learning_rate": 7.397197980252354e-06, + "loss": 0.2273, + "step": 13730 + }, + { + "epoch": 0.34747070880886705, + "grad_norm": 7.113844394683838, + "learning_rate": 7.396845606427914e-06, + "loss": 0.3201, + "step": 13731 + }, + { + "epoch": 0.34749601437356076, + "grad_norm": 6.156606674194336, + "learning_rate": 7.3964932171466634e-06, + "loss": 0.1622, + "step": 13732 + }, + { + "epoch": 0.3475213199382544, + "grad_norm": 3.305210590362549, + "learning_rate": 7.396140812410872e-06, + "loss": 0.1812, + "step": 13733 + }, + { + "epoch": 0.3475466255029481, + "grad_norm": 12.03675365447998, + "learning_rate": 7.395788392222816e-06, + "loss": 0.2174, + "step": 13734 + }, + { + "epoch": 0.3475719310676418, + "grad_norm": 5.103421211242676, + "learning_rate": 7.395435956584767e-06, + "loss": 0.1737, + "step": 13735 + }, + { + "epoch": 0.34759723663233544, + "grad_norm": 4.84855842590332, + "learning_rate": 7.395083505498994e-06, + "loss": 0.1457, + "step": 13736 + }, + { + "epoch": 0.34762254219702915, + "grad_norm": 3.816692590713501, + "learning_rate": 7.3947310389677765e-06, + "loss": 0.1728, + "step": 13737 + }, + { + "epoch": 0.3476478477617228, + "grad_norm": 5.12485408782959, + "learning_rate": 7.394378556993381e-06, + "loss": 0.1753, + "step": 13738 + }, + { + "epoch": 0.34767315332641646, + "grad_norm": 4.211846351623535, + "learning_rate": 7.394026059578085e-06, + "loss": 0.1666, + "step": 13739 + }, + { + "epoch": 0.3476984588911102, + "grad_norm": 8.732784271240234, + "learning_rate": 7.39367354672416e-06, + "loss": 0.2512, + "step": 13740 + }, + { + "epoch": 0.34772376445580383, + "grad_norm": 3.4739460945129395, + "learning_rate": 7.393321018433881e-06, + "loss": 0.1648, + "step": 13741 + }, + { + "epoch": 0.3477490700204975, + "grad_norm": 4.248836994171143, + "learning_rate": 7.392968474709518e-06, + "loss": 0.1329, + "step": 13742 + }, + { + "epoch": 0.3477743755851912, + "grad_norm": 8.201577186584473, + "learning_rate": 7.3926159155533475e-06, + "loss": 0.216, + "step": 13743 + }, + { + "epoch": 0.34779968114988485, + "grad_norm": 11.563515663146973, + "learning_rate": 7.392263340967641e-06, + "loss": 0.2431, + "step": 13744 + }, + { + "epoch": 0.3478249867145785, + "grad_norm": 4.011210918426514, + "learning_rate": 7.391910750954674e-06, + "loss": 0.143, + "step": 13745 + }, + { + "epoch": 0.3478502922792722, + "grad_norm": 8.718657493591309, + "learning_rate": 7.3915581455167195e-06, + "loss": 0.1906, + "step": 13746 + }, + { + "epoch": 0.3478755978439659, + "grad_norm": 3.4344561100006104, + "learning_rate": 7.391205524656051e-06, + "loss": 0.1291, + "step": 13747 + }, + { + "epoch": 0.3479009034086596, + "grad_norm": 4.854847431182861, + "learning_rate": 7.390852888374943e-06, + "loss": 0.2592, + "step": 13748 + }, + { + "epoch": 0.34792620897335325, + "grad_norm": 3.9145209789276123, + "learning_rate": 7.390500236675668e-06, + "loss": 0.1532, + "step": 13749 + }, + { + "epoch": 0.3479515145380469, + "grad_norm": 3.500386953353882, + "learning_rate": 7.390147569560504e-06, + "loss": 0.113, + "step": 13750 + }, + { + "epoch": 0.3479768201027406, + "grad_norm": 8.82613468170166, + "learning_rate": 7.389794887031723e-06, + "loss": 0.273, + "step": 13751 + }, + { + "epoch": 0.34800212566743427, + "grad_norm": 4.626077651977539, + "learning_rate": 7.389442189091598e-06, + "loss": 0.1557, + "step": 13752 + }, + { + "epoch": 0.3480274312321279, + "grad_norm": 4.957943916320801, + "learning_rate": 7.389089475742405e-06, + "loss": 0.1693, + "step": 13753 + }, + { + "epoch": 0.34805273679682164, + "grad_norm": 4.689600944519043, + "learning_rate": 7.388736746986419e-06, + "loss": 0.1476, + "step": 13754 + }, + { + "epoch": 0.3480780423615153, + "grad_norm": 14.69162368774414, + "learning_rate": 7.388384002825913e-06, + "loss": 0.1503, + "step": 13755 + }, + { + "epoch": 0.34810334792620895, + "grad_norm": 4.231713771820068, + "learning_rate": 7.388031243263164e-06, + "loss": 0.1833, + "step": 13756 + }, + { + "epoch": 0.34812865349090266, + "grad_norm": 4.515694618225098, + "learning_rate": 7.387678468300444e-06, + "loss": 0.1043, + "step": 13757 + }, + { + "epoch": 0.3481539590555963, + "grad_norm": 11.919556617736816, + "learning_rate": 7.3873256779400315e-06, + "loss": 0.251, + "step": 13758 + }, + { + "epoch": 0.34817926462029, + "grad_norm": 5.511648654937744, + "learning_rate": 7.3869728721842e-06, + "loss": 0.2015, + "step": 13759 + }, + { + "epoch": 0.3482045701849837, + "grad_norm": 12.364596366882324, + "learning_rate": 7.386620051035223e-06, + "loss": 0.3018, + "step": 13760 + }, + { + "epoch": 0.34822987574967734, + "grad_norm": 13.133872985839844, + "learning_rate": 7.386267214495378e-06, + "loss": 0.1841, + "step": 13761 + }, + { + "epoch": 0.34825518131437105, + "grad_norm": 6.462825298309326, + "learning_rate": 7.385914362566938e-06, + "loss": 0.2374, + "step": 13762 + }, + { + "epoch": 0.3482804868790647, + "grad_norm": 5.033787250518799, + "learning_rate": 7.385561495252183e-06, + "loss": 0.0876, + "step": 13763 + }, + { + "epoch": 0.34830579244375837, + "grad_norm": 2.2495267391204834, + "learning_rate": 7.385208612553384e-06, + "loss": 0.1303, + "step": 13764 + }, + { + "epoch": 0.3483310980084521, + "grad_norm": 17.50810432434082, + "learning_rate": 7.384855714472818e-06, + "loss": 0.4028, + "step": 13765 + }, + { + "epoch": 0.34835640357314573, + "grad_norm": 4.924755573272705, + "learning_rate": 7.384502801012763e-06, + "loss": 0.1992, + "step": 13766 + }, + { + "epoch": 0.3483817091378394, + "grad_norm": 7.699958324432373, + "learning_rate": 7.3841498721754915e-06, + "loss": 0.2187, + "step": 13767 + }, + { + "epoch": 0.3484070147025331, + "grad_norm": 5.225532054901123, + "learning_rate": 7.3837969279632805e-06, + "loss": 0.1246, + "step": 13768 + }, + { + "epoch": 0.34843232026722676, + "grad_norm": 8.192322731018066, + "learning_rate": 7.3834439683784075e-06, + "loss": 0.1116, + "step": 13769 + }, + { + "epoch": 0.3484576258319204, + "grad_norm": 5.309086322784424, + "learning_rate": 7.3830909934231475e-06, + "loss": 0.1583, + "step": 13770 + }, + { + "epoch": 0.3484829313966141, + "grad_norm": 2.6502864360809326, + "learning_rate": 7.382738003099777e-06, + "loss": 0.1082, + "step": 13771 + }, + { + "epoch": 0.3485082369613078, + "grad_norm": 6.816653251647949, + "learning_rate": 7.3823849974105725e-06, + "loss": 0.2006, + "step": 13772 + }, + { + "epoch": 0.3485335425260015, + "grad_norm": 5.715769290924072, + "learning_rate": 7.3820319763578085e-06, + "loss": 0.1704, + "step": 13773 + }, + { + "epoch": 0.34855884809069515, + "grad_norm": 3.7965595722198486, + "learning_rate": 7.381678939943766e-06, + "loss": 0.0926, + "step": 13774 + }, + { + "epoch": 0.3485841536553888, + "grad_norm": 3.99589467048645, + "learning_rate": 7.381325888170718e-06, + "loss": 0.1218, + "step": 13775 + }, + { + "epoch": 0.3486094592200825, + "grad_norm": 5.994110584259033, + "learning_rate": 7.380972821040942e-06, + "loss": 0.13, + "step": 13776 + }, + { + "epoch": 0.3486347647847762, + "grad_norm": 11.506826400756836, + "learning_rate": 7.3806197385567144e-06, + "loss": 0.334, + "step": 13777 + }, + { + "epoch": 0.34866007034946983, + "grad_norm": 5.14497184753418, + "learning_rate": 7.380266640720314e-06, + "loss": 0.1624, + "step": 13778 + }, + { + "epoch": 0.34868537591416354, + "grad_norm": 9.359794616699219, + "learning_rate": 7.379913527534017e-06, + "loss": 0.396, + "step": 13779 + }, + { + "epoch": 0.3487106814788572, + "grad_norm": 7.465999603271484, + "learning_rate": 7.3795603990001e-06, + "loss": 0.3374, + "step": 13780 + }, + { + "epoch": 0.34873598704355085, + "grad_norm": 6.0292887687683105, + "learning_rate": 7.3792072551208404e-06, + "loss": 0.1524, + "step": 13781 + }, + { + "epoch": 0.34876129260824457, + "grad_norm": 6.500025272369385, + "learning_rate": 7.378854095898515e-06, + "loss": 0.1495, + "step": 13782 + }, + { + "epoch": 0.3487865981729382, + "grad_norm": 11.35360050201416, + "learning_rate": 7.378500921335404e-06, + "loss": 0.2675, + "step": 13783 + }, + { + "epoch": 0.3488119037376319, + "grad_norm": 4.203456401824951, + "learning_rate": 7.378147731433783e-06, + "loss": 0.1153, + "step": 13784 + }, + { + "epoch": 0.3488372093023256, + "grad_norm": 6.008092403411865, + "learning_rate": 7.377794526195929e-06, + "loss": 0.149, + "step": 13785 + }, + { + "epoch": 0.34886251486701925, + "grad_norm": 8.567648887634277, + "learning_rate": 7.377441305624119e-06, + "loss": 0.1897, + "step": 13786 + }, + { + "epoch": 0.34888782043171296, + "grad_norm": 3.518831729888916, + "learning_rate": 7.3770880697206335e-06, + "loss": 0.1689, + "step": 13787 + }, + { + "epoch": 0.3489131259964066, + "grad_norm": 3.4716999530792236, + "learning_rate": 7.3767348184877494e-06, + "loss": 0.1389, + "step": 13788 + }, + { + "epoch": 0.34893843156110027, + "grad_norm": 5.185070991516113, + "learning_rate": 7.376381551927744e-06, + "loss": 0.1105, + "step": 13789 + }, + { + "epoch": 0.348963737125794, + "grad_norm": 5.689918041229248, + "learning_rate": 7.376028270042897e-06, + "loss": 0.1526, + "step": 13790 + }, + { + "epoch": 0.34898904269048764, + "grad_norm": 4.656894683837891, + "learning_rate": 7.3756749728354835e-06, + "loss": 0.1754, + "step": 13791 + }, + { + "epoch": 0.3490143482551813, + "grad_norm": 8.066327095031738, + "learning_rate": 7.375321660307786e-06, + "loss": 0.2543, + "step": 13792 + }, + { + "epoch": 0.349039653819875, + "grad_norm": 4.195034503936768, + "learning_rate": 7.3749683324620815e-06, + "loss": 0.2275, + "step": 13793 + }, + { + "epoch": 0.34906495938456866, + "grad_norm": 7.935795783996582, + "learning_rate": 7.374614989300646e-06, + "loss": 0.1069, + "step": 13794 + }, + { + "epoch": 0.3490902649492623, + "grad_norm": 8.80964183807373, + "learning_rate": 7.374261630825761e-06, + "loss": 0.2288, + "step": 13795 + }, + { + "epoch": 0.34911557051395603, + "grad_norm": 4.942139148712158, + "learning_rate": 7.373908257039706e-06, + "loss": 0.1304, + "step": 13796 + }, + { + "epoch": 0.3491408760786497, + "grad_norm": 9.099757194519043, + "learning_rate": 7.373554867944756e-06, + "loss": 0.2461, + "step": 13797 + }, + { + "epoch": 0.3491661816433434, + "grad_norm": 7.34781551361084, + "learning_rate": 7.373201463543193e-06, + "loss": 0.2572, + "step": 13798 + }, + { + "epoch": 0.34919148720803705, + "grad_norm": 7.357944965362549, + "learning_rate": 7.372848043837295e-06, + "loss": 0.195, + "step": 13799 + }, + { + "epoch": 0.3492167927727307, + "grad_norm": 6.219390392303467, + "learning_rate": 7.37249460882934e-06, + "loss": 0.1442, + "step": 13800 + }, + { + "epoch": 0.3492420983374244, + "grad_norm": 5.407147407531738, + "learning_rate": 7.372141158521611e-06, + "loss": 0.14, + "step": 13801 + }, + { + "epoch": 0.3492674039021181, + "grad_norm": 5.604129791259766, + "learning_rate": 7.371787692916383e-06, + "loss": 0.1565, + "step": 13802 + }, + { + "epoch": 0.34929270946681173, + "grad_norm": 6.7347564697265625, + "learning_rate": 7.371434212015939e-06, + "loss": 0.1476, + "step": 13803 + }, + { + "epoch": 0.34931801503150545, + "grad_norm": 15.643017768859863, + "learning_rate": 7.371080715822556e-06, + "loss": 0.2102, + "step": 13804 + }, + { + "epoch": 0.3493433205961991, + "grad_norm": 5.572001934051514, + "learning_rate": 7.370727204338515e-06, + "loss": 0.2002, + "step": 13805 + }, + { + "epoch": 0.34936862616089276, + "grad_norm": 6.058245658874512, + "learning_rate": 7.370373677566094e-06, + "loss": 0.2135, + "step": 13806 + }, + { + "epoch": 0.34939393172558647, + "grad_norm": 15.480299949645996, + "learning_rate": 7.3700201355075765e-06, + "loss": 0.256, + "step": 13807 + }, + { + "epoch": 0.3494192372902801, + "grad_norm": 5.369250774383545, + "learning_rate": 7.369666578165238e-06, + "loss": 0.2289, + "step": 13808 + }, + { + "epoch": 0.3494445428549738, + "grad_norm": 11.604406356811523, + "learning_rate": 7.369313005541361e-06, + "loss": 0.2339, + "step": 13809 + }, + { + "epoch": 0.3494698484196675, + "grad_norm": 4.401304244995117, + "learning_rate": 7.3689594176382255e-06, + "loss": 0.1394, + "step": 13810 + }, + { + "epoch": 0.34949515398436115, + "grad_norm": 3.6951956748962402, + "learning_rate": 7.3686058144581095e-06, + "loss": 0.1516, + "step": 13811 + }, + { + "epoch": 0.34952045954905486, + "grad_norm": 4.13602352142334, + "learning_rate": 7.368252196003298e-06, + "loss": 0.1567, + "step": 13812 + }, + { + "epoch": 0.3495457651137485, + "grad_norm": 2.5755856037139893, + "learning_rate": 7.3678985622760665e-06, + "loss": 0.1255, + "step": 13813 + }, + { + "epoch": 0.3495710706784422, + "grad_norm": 9.914406776428223, + "learning_rate": 7.367544913278699e-06, + "loss": 0.1757, + "step": 13814 + }, + { + "epoch": 0.3495963762431359, + "grad_norm": 3.287874698638916, + "learning_rate": 7.367191249013473e-06, + "loss": 0.2099, + "step": 13815 + }, + { + "epoch": 0.34962168180782954, + "grad_norm": 3.294797658920288, + "learning_rate": 7.366837569482673e-06, + "loss": 0.1815, + "step": 13816 + }, + { + "epoch": 0.3496469873725232, + "grad_norm": 2.260923385620117, + "learning_rate": 7.366483874688577e-06, + "loss": 0.0869, + "step": 13817 + }, + { + "epoch": 0.3496722929372169, + "grad_norm": 7.337021350860596, + "learning_rate": 7.366130164633467e-06, + "loss": 0.3069, + "step": 13818 + }, + { + "epoch": 0.34969759850191057, + "grad_norm": 9.542250633239746, + "learning_rate": 7.365776439319623e-06, + "loss": 0.135, + "step": 13819 + }, + { + "epoch": 0.3497229040666042, + "grad_norm": 6.460012912750244, + "learning_rate": 7.365422698749328e-06, + "loss": 0.1331, + "step": 13820 + }, + { + "epoch": 0.34974820963129793, + "grad_norm": 6.173953056335449, + "learning_rate": 7.36506894292486e-06, + "loss": 0.2014, + "step": 13821 + }, + { + "epoch": 0.3497735151959916, + "grad_norm": 10.134836196899414, + "learning_rate": 7.364715171848504e-06, + "loss": 0.1331, + "step": 13822 + }, + { + "epoch": 0.34979882076068525, + "grad_norm": 5.785647392272949, + "learning_rate": 7.364361385522539e-06, + "loss": 0.2761, + "step": 13823 + }, + { + "epoch": 0.34982412632537896, + "grad_norm": 3.313167095184326, + "learning_rate": 7.364007583949249e-06, + "loss": 0.109, + "step": 13824 + }, + { + "epoch": 0.3498494318900726, + "grad_norm": 4.353607654571533, + "learning_rate": 7.363653767130912e-06, + "loss": 0.1396, + "step": 13825 + }, + { + "epoch": 0.3498747374547663, + "grad_norm": 7.442469596862793, + "learning_rate": 7.363299935069812e-06, + "loss": 0.2493, + "step": 13826 + }, + { + "epoch": 0.34990004301946, + "grad_norm": 11.131248474121094, + "learning_rate": 7.36294608776823e-06, + "loss": 0.2936, + "step": 13827 + }, + { + "epoch": 0.34992534858415364, + "grad_norm": 3.8833420276641846, + "learning_rate": 7.362592225228449e-06, + "loss": 0.1505, + "step": 13828 + }, + { + "epoch": 0.34995065414884735, + "grad_norm": 2.4706053733825684, + "learning_rate": 7.362238347452749e-06, + "loss": 0.1168, + "step": 13829 + }, + { + "epoch": 0.349975959713541, + "grad_norm": 6.225584030151367, + "learning_rate": 7.3618844544434145e-06, + "loss": 0.1743, + "step": 13830 + }, + { + "epoch": 0.35000126527823466, + "grad_norm": 2.555933952331543, + "learning_rate": 7.361530546202726e-06, + "loss": 0.1763, + "step": 13831 + }, + { + "epoch": 0.3500265708429284, + "grad_norm": 7.3502397537231445, + "learning_rate": 7.3611766227329664e-06, + "loss": 0.1702, + "step": 13832 + }, + { + "epoch": 0.35005187640762203, + "grad_norm": 11.408029556274414, + "learning_rate": 7.360822684036417e-06, + "loss": 0.3283, + "step": 13833 + }, + { + "epoch": 0.3500771819723157, + "grad_norm": 3.242629051208496, + "learning_rate": 7.3604687301153624e-06, + "loss": 0.1738, + "step": 13834 + }, + { + "epoch": 0.3501024875370094, + "grad_norm": 8.694705963134766, + "learning_rate": 7.360114760972083e-06, + "loss": 0.235, + "step": 13835 + }, + { + "epoch": 0.35012779310170306, + "grad_norm": 6.579079627990723, + "learning_rate": 7.3597607766088645e-06, + "loss": 0.1378, + "step": 13836 + }, + { + "epoch": 0.35015309866639677, + "grad_norm": 12.286937713623047, + "learning_rate": 7.359406777027986e-06, + "loss": 0.2365, + "step": 13837 + }, + { + "epoch": 0.3501784042310904, + "grad_norm": 4.7499871253967285, + "learning_rate": 7.359052762231732e-06, + "loss": 0.1656, + "step": 13838 + }, + { + "epoch": 0.3502037097957841, + "grad_norm": 4.148696422576904, + "learning_rate": 7.3586987322223855e-06, + "loss": 0.1765, + "step": 13839 + }, + { + "epoch": 0.3502290153604778, + "grad_norm": 26.08310890197754, + "learning_rate": 7.3583446870022314e-06, + "loss": 0.3228, + "step": 13840 + }, + { + "epoch": 0.35025432092517145, + "grad_norm": 4.930622100830078, + "learning_rate": 7.357990626573549e-06, + "loss": 0.1925, + "step": 13841 + }, + { + "epoch": 0.3502796264898651, + "grad_norm": 6.787783622741699, + "learning_rate": 7.357636550938624e-06, + "loss": 0.1458, + "step": 13842 + }, + { + "epoch": 0.3503049320545588, + "grad_norm": 6.792390823364258, + "learning_rate": 7.357282460099741e-06, + "loss": 0.2428, + "step": 13843 + }, + { + "epoch": 0.35033023761925247, + "grad_norm": 6.13002872467041, + "learning_rate": 7.35692835405918e-06, + "loss": 0.2147, + "step": 13844 + }, + { + "epoch": 0.3503555431839461, + "grad_norm": 4.441368579864502, + "learning_rate": 7.356574232819228e-06, + "loss": 0.1894, + "step": 13845 + }, + { + "epoch": 0.35038084874863984, + "grad_norm": 12.000511169433594, + "learning_rate": 7.356220096382165e-06, + "loss": 0.2477, + "step": 13846 + }, + { + "epoch": 0.3504061543133335, + "grad_norm": 6.49365758895874, + "learning_rate": 7.35586594475028e-06, + "loss": 0.2184, + "step": 13847 + }, + { + "epoch": 0.35043145987802715, + "grad_norm": 7.222270488739014, + "learning_rate": 7.355511777925851e-06, + "loss": 0.2171, + "step": 13848 + }, + { + "epoch": 0.35045676544272086, + "grad_norm": 8.467408180236816, + "learning_rate": 7.355157595911167e-06, + "loss": 0.2437, + "step": 13849 + }, + { + "epoch": 0.3504820710074145, + "grad_norm": 4.9505720138549805, + "learning_rate": 7.354803398708508e-06, + "loss": 0.0986, + "step": 13850 + }, + { + "epoch": 0.35050737657210823, + "grad_norm": 3.4882729053497314, + "learning_rate": 7.35444918632016e-06, + "loss": 0.1753, + "step": 13851 + }, + { + "epoch": 0.3505326821368019, + "grad_norm": 3.3945326805114746, + "learning_rate": 7.354094958748408e-06, + "loss": 0.1, + "step": 13852 + }, + { + "epoch": 0.35055798770149554, + "grad_norm": 7.826551914215088, + "learning_rate": 7.353740715995536e-06, + "loss": 0.1124, + "step": 13853 + }, + { + "epoch": 0.35058329326618926, + "grad_norm": 3.8861401081085205, + "learning_rate": 7.353386458063827e-06, + "loss": 0.1352, + "step": 13854 + }, + { + "epoch": 0.3506085988308829, + "grad_norm": 8.955710411071777, + "learning_rate": 7.353032184955566e-06, + "loss": 0.2243, + "step": 13855 + }, + { + "epoch": 0.35063390439557657, + "grad_norm": 3.0424180030822754, + "learning_rate": 7.352677896673039e-06, + "loss": 0.1537, + "step": 13856 + }, + { + "epoch": 0.3506592099602703, + "grad_norm": 9.611512184143066, + "learning_rate": 7.352323593218529e-06, + "loss": 0.1854, + "step": 13857 + }, + { + "epoch": 0.35068451552496394, + "grad_norm": 11.990276336669922, + "learning_rate": 7.351969274594324e-06, + "loss": 0.1558, + "step": 13858 + }, + { + "epoch": 0.3507098210896576, + "grad_norm": 4.51541805267334, + "learning_rate": 7.351614940802704e-06, + "loss": 0.1942, + "step": 13859 + }, + { + "epoch": 0.3507351266543513, + "grad_norm": 3.80084228515625, + "learning_rate": 7.351260591845958e-06, + "loss": 0.1638, + "step": 13860 + }, + { + "epoch": 0.35076043221904496, + "grad_norm": 3.755131483078003, + "learning_rate": 7.350906227726369e-06, + "loss": 0.1794, + "step": 13861 + }, + { + "epoch": 0.35078573778373867, + "grad_norm": 6.449428081512451, + "learning_rate": 7.350551848446223e-06, + "loss": 0.1819, + "step": 13862 + }, + { + "epoch": 0.3508110433484323, + "grad_norm": 5.859111785888672, + "learning_rate": 7.350197454007806e-06, + "loss": 0.156, + "step": 13863 + }, + { + "epoch": 0.350836348913126, + "grad_norm": 4.176905155181885, + "learning_rate": 7.349843044413403e-06, + "loss": 0.1168, + "step": 13864 + }, + { + "epoch": 0.3508616544778197, + "grad_norm": 7.581145763397217, + "learning_rate": 7.3494886196652995e-06, + "loss": 0.2679, + "step": 13865 + }, + { + "epoch": 0.35088696004251335, + "grad_norm": 4.289201736450195, + "learning_rate": 7.34913417976578e-06, + "loss": 0.1178, + "step": 13866 + }, + { + "epoch": 0.350912265607207, + "grad_norm": 8.32038402557373, + "learning_rate": 7.348779724717132e-06, + "loss": 0.3168, + "step": 13867 + }, + { + "epoch": 0.3509375711719007, + "grad_norm": 2.65423321723938, + "learning_rate": 7.348425254521639e-06, + "loss": 0.111, + "step": 13868 + }, + { + "epoch": 0.3509628767365944, + "grad_norm": 13.394664764404297, + "learning_rate": 7.34807076918159e-06, + "loss": 0.1385, + "step": 13869 + }, + { + "epoch": 0.35098818230128803, + "grad_norm": 3.0398662090301514, + "learning_rate": 7.347716268699267e-06, + "loss": 0.135, + "step": 13870 + }, + { + "epoch": 0.35101348786598174, + "grad_norm": 6.301492214202881, + "learning_rate": 7.347361753076963e-06, + "loss": 0.1807, + "step": 13871 + }, + { + "epoch": 0.3510387934306754, + "grad_norm": 5.6115241050720215, + "learning_rate": 7.347007222316955e-06, + "loss": 0.2267, + "step": 13872 + }, + { + "epoch": 0.35106409899536906, + "grad_norm": 17.431930541992188, + "learning_rate": 7.346652676421537e-06, + "loss": 0.3104, + "step": 13873 + }, + { + "epoch": 0.35108940456006277, + "grad_norm": 4.987429141998291, + "learning_rate": 7.3462981153929914e-06, + "loss": 0.099, + "step": 13874 + }, + { + "epoch": 0.3511147101247564, + "grad_norm": 1.7968066930770874, + "learning_rate": 7.345943539233604e-06, + "loss": 0.0631, + "step": 13875 + }, + { + "epoch": 0.35114001568945014, + "grad_norm": 9.349733352661133, + "learning_rate": 7.345588947945665e-06, + "loss": 0.1806, + "step": 13876 + }, + { + "epoch": 0.3511653212541438, + "grad_norm": 3.729969024658203, + "learning_rate": 7.345234341531459e-06, + "loss": 0.169, + "step": 13877 + }, + { + "epoch": 0.35119062681883745, + "grad_norm": 3.617255210876465, + "learning_rate": 7.3448797199932744e-06, + "loss": 0.182, + "step": 13878 + }, + { + "epoch": 0.35121593238353116, + "grad_norm": 4.13559627532959, + "learning_rate": 7.344525083333394e-06, + "loss": 0.1901, + "step": 13879 + }, + { + "epoch": 0.3512412379482248, + "grad_norm": 7.0877814292907715, + "learning_rate": 7.344170431554109e-06, + "loss": 0.2202, + "step": 13880 + }, + { + "epoch": 0.35126654351291847, + "grad_norm": 4.394799709320068, + "learning_rate": 7.343815764657705e-06, + "loss": 0.157, + "step": 13881 + }, + { + "epoch": 0.3512918490776122, + "grad_norm": 6.771627902984619, + "learning_rate": 7.343461082646468e-06, + "loss": 0.2108, + "step": 13882 + }, + { + "epoch": 0.35131715464230584, + "grad_norm": 9.742725372314453, + "learning_rate": 7.343106385522688e-06, + "loss": 0.224, + "step": 13883 + }, + { + "epoch": 0.3513424602069995, + "grad_norm": 4.962099075317383, + "learning_rate": 7.342751673288649e-06, + "loss": 0.2162, + "step": 13884 + }, + { + "epoch": 0.3513677657716932, + "grad_norm": 4.568495750427246, + "learning_rate": 7.342396945946642e-06, + "loss": 0.1215, + "step": 13885 + }, + { + "epoch": 0.35139307133638686, + "grad_norm": 14.113426208496094, + "learning_rate": 7.342042203498952e-06, + "loss": 0.3081, + "step": 13886 + }, + { + "epoch": 0.3514183769010805, + "grad_norm": 3.9206161499023438, + "learning_rate": 7.341687445947867e-06, + "loss": 0.1732, + "step": 13887 + }, + { + "epoch": 0.35144368246577423, + "grad_norm": 3.578187942504883, + "learning_rate": 7.341332673295676e-06, + "loss": 0.1838, + "step": 13888 + }, + { + "epoch": 0.3514689880304679, + "grad_norm": 7.284815788269043, + "learning_rate": 7.340977885544666e-06, + "loss": 0.2062, + "step": 13889 + }, + { + "epoch": 0.3514942935951616, + "grad_norm": 8.439688682556152, + "learning_rate": 7.340623082697124e-06, + "loss": 0.1484, + "step": 13890 + }, + { + "epoch": 0.35151959915985526, + "grad_norm": 3.9610800743103027, + "learning_rate": 7.34026826475534e-06, + "loss": 0.1719, + "step": 13891 + }, + { + "epoch": 0.3515449047245489, + "grad_norm": 5.344090938568115, + "learning_rate": 7.339913431721602e-06, + "loss": 0.1036, + "step": 13892 + }, + { + "epoch": 0.3515702102892426, + "grad_norm": 9.001408576965332, + "learning_rate": 7.339558583598196e-06, + "loss": 0.1833, + "step": 13893 + }, + { + "epoch": 0.3515955158539363, + "grad_norm": 7.041150093078613, + "learning_rate": 7.339203720387413e-06, + "loss": 0.2647, + "step": 13894 + }, + { + "epoch": 0.35162082141862994, + "grad_norm": 23.82269287109375, + "learning_rate": 7.3388488420915396e-06, + "loss": 0.3917, + "step": 13895 + }, + { + "epoch": 0.35164612698332365, + "grad_norm": 4.1115851402282715, + "learning_rate": 7.338493948712866e-06, + "loss": 0.216, + "step": 13896 + }, + { + "epoch": 0.3516714325480173, + "grad_norm": 5.758035659790039, + "learning_rate": 7.338139040253679e-06, + "loss": 0.1599, + "step": 13897 + }, + { + "epoch": 0.35169673811271096, + "grad_norm": 3.9368085861206055, + "learning_rate": 7.337784116716269e-06, + "loss": 0.1474, + "step": 13898 + }, + { + "epoch": 0.35172204367740467, + "grad_norm": 9.13571548461914, + "learning_rate": 7.337429178102923e-06, + "loss": 0.3107, + "step": 13899 + }, + { + "epoch": 0.35174734924209833, + "grad_norm": 2.6276228427886963, + "learning_rate": 7.337074224415932e-06, + "loss": 0.172, + "step": 13900 + }, + { + "epoch": 0.35177265480679204, + "grad_norm": 5.507945537567139, + "learning_rate": 7.3367192556575835e-06, + "loss": 0.1646, + "step": 13901 + }, + { + "epoch": 0.3517979603714857, + "grad_norm": 5.782341480255127, + "learning_rate": 7.336364271830168e-06, + "loss": 0.1591, + "step": 13902 + }, + { + "epoch": 0.35182326593617935, + "grad_norm": 3.773582935333252, + "learning_rate": 7.3360092729359734e-06, + "loss": 0.1583, + "step": 13903 + }, + { + "epoch": 0.35184857150087306, + "grad_norm": 6.110756874084473, + "learning_rate": 7.3356542589772895e-06, + "loss": 0.1909, + "step": 13904 + }, + { + "epoch": 0.3518738770655667, + "grad_norm": 4.786040782928467, + "learning_rate": 7.335299229956406e-06, + "loss": 0.2098, + "step": 13905 + }, + { + "epoch": 0.3518991826302604, + "grad_norm": 22.80280303955078, + "learning_rate": 7.334944185875611e-06, + "loss": 0.3136, + "step": 13906 + }, + { + "epoch": 0.3519244881949541, + "grad_norm": 3.0147171020507812, + "learning_rate": 7.334589126737197e-06, + "loss": 0.2174, + "step": 13907 + }, + { + "epoch": 0.35194979375964774, + "grad_norm": 5.335355758666992, + "learning_rate": 7.334234052543451e-06, + "loss": 0.2117, + "step": 13908 + }, + { + "epoch": 0.3519750993243414, + "grad_norm": 5.496492862701416, + "learning_rate": 7.333878963296663e-06, + "loss": 0.1429, + "step": 13909 + }, + { + "epoch": 0.3520004048890351, + "grad_norm": 3.818814754486084, + "learning_rate": 7.333523858999126e-06, + "loss": 0.2132, + "step": 13910 + }, + { + "epoch": 0.35202571045372877, + "grad_norm": 5.136870861053467, + "learning_rate": 7.333168739653126e-06, + "loss": 0.1853, + "step": 13911 + }, + { + "epoch": 0.3520510160184224, + "grad_norm": 5.7928032875061035, + "learning_rate": 7.332813605260955e-06, + "loss": 0.2218, + "step": 13912 + }, + { + "epoch": 0.35207632158311614, + "grad_norm": 4.618797302246094, + "learning_rate": 7.332458455824904e-06, + "loss": 0.1358, + "step": 13913 + }, + { + "epoch": 0.3521016271478098, + "grad_norm": 4.347586154937744, + "learning_rate": 7.332103291347261e-06, + "loss": 0.1532, + "step": 13914 + }, + { + "epoch": 0.3521269327125035, + "grad_norm": 2.853423833847046, + "learning_rate": 7.3317481118303195e-06, + "loss": 0.0926, + "step": 13915 + }, + { + "epoch": 0.35215223827719716, + "grad_norm": 3.391007900238037, + "learning_rate": 7.331392917276367e-06, + "loss": 0.1419, + "step": 13916 + }, + { + "epoch": 0.3521775438418908, + "grad_norm": 6.293397426605225, + "learning_rate": 7.331037707687694e-06, + "loss": 0.2194, + "step": 13917 + }, + { + "epoch": 0.35220284940658453, + "grad_norm": 5.021198749542236, + "learning_rate": 7.330682483066594e-06, + "loss": 0.2308, + "step": 13918 + }, + { + "epoch": 0.3522281549712782, + "grad_norm": 3.852430820465088, + "learning_rate": 7.330327243415355e-06, + "loss": 0.184, + "step": 13919 + }, + { + "epoch": 0.35225346053597184, + "grad_norm": 6.037027359008789, + "learning_rate": 7.32997198873627e-06, + "loss": 0.1683, + "step": 13920 + }, + { + "epoch": 0.35227876610066555, + "grad_norm": 4.719030857086182, + "learning_rate": 7.329616719031629e-06, + "loss": 0.1781, + "step": 13921 + }, + { + "epoch": 0.3523040716653592, + "grad_norm": 3.097109317779541, + "learning_rate": 7.329261434303723e-06, + "loss": 0.0955, + "step": 13922 + }, + { + "epoch": 0.35232937723005286, + "grad_norm": 4.524515151977539, + "learning_rate": 7.328906134554843e-06, + "loss": 0.1312, + "step": 13923 + }, + { + "epoch": 0.3523546827947466, + "grad_norm": 3.2052931785583496, + "learning_rate": 7.3285508197872816e-06, + "loss": 0.1717, + "step": 13924 + }, + { + "epoch": 0.35237998835944023, + "grad_norm": 7.682070255279541, + "learning_rate": 7.328195490003327e-06, + "loss": 0.2739, + "step": 13925 + }, + { + "epoch": 0.35240529392413394, + "grad_norm": 12.35523509979248, + "learning_rate": 7.327840145205275e-06, + "loss": 0.2094, + "step": 13926 + }, + { + "epoch": 0.3524305994888276, + "grad_norm": 9.267152786254883, + "learning_rate": 7.3274847853954135e-06, + "loss": 0.1753, + "step": 13927 + }, + { + "epoch": 0.35245590505352126, + "grad_norm": 6.209259510040283, + "learning_rate": 7.327129410576036e-06, + "loss": 0.1992, + "step": 13928 + }, + { + "epoch": 0.35248121061821497, + "grad_norm": 13.68467903137207, + "learning_rate": 7.3267740207494345e-06, + "loss": 0.1816, + "step": 13929 + }, + { + "epoch": 0.3525065161829086, + "grad_norm": 6.106749534606934, + "learning_rate": 7.326418615917898e-06, + "loss": 0.1877, + "step": 13930 + }, + { + "epoch": 0.3525318217476023, + "grad_norm": 4.616002559661865, + "learning_rate": 7.326063196083723e-06, + "loss": 0.1311, + "step": 13931 + }, + { + "epoch": 0.352557127312296, + "grad_norm": 7.699233531951904, + "learning_rate": 7.325707761249199e-06, + "loss": 0.1806, + "step": 13932 + }, + { + "epoch": 0.35258243287698965, + "grad_norm": 10.674337387084961, + "learning_rate": 7.325352311416617e-06, + "loss": 0.2079, + "step": 13933 + }, + { + "epoch": 0.3526077384416833, + "grad_norm": 3.780599355697632, + "learning_rate": 7.32499684658827e-06, + "loss": 0.1085, + "step": 13934 + }, + { + "epoch": 0.352633044006377, + "grad_norm": 15.566624641418457, + "learning_rate": 7.324641366766453e-06, + "loss": 0.2353, + "step": 13935 + }, + { + "epoch": 0.3526583495710707, + "grad_norm": 2.591214179992676, + "learning_rate": 7.324285871953455e-06, + "loss": 0.1142, + "step": 13936 + }, + { + "epoch": 0.35268365513576433, + "grad_norm": 5.552353382110596, + "learning_rate": 7.32393036215157e-06, + "loss": 0.1144, + "step": 13937 + }, + { + "epoch": 0.35270896070045804, + "grad_norm": 4.235361099243164, + "learning_rate": 7.32357483736309e-06, + "loss": 0.1659, + "step": 13938 + }, + { + "epoch": 0.3527342662651517, + "grad_norm": 7.039766311645508, + "learning_rate": 7.323219297590308e-06, + "loss": 0.2215, + "step": 13939 + }, + { + "epoch": 0.3527595718298454, + "grad_norm": 6.49176025390625, + "learning_rate": 7.322863742835517e-06, + "loss": 0.212, + "step": 13940 + }, + { + "epoch": 0.35278487739453906, + "grad_norm": 4.202331066131592, + "learning_rate": 7.322508173101011e-06, + "loss": 0.1443, + "step": 13941 + }, + { + "epoch": 0.3528101829592327, + "grad_norm": 3.6920955181121826, + "learning_rate": 7.322152588389081e-06, + "loss": 0.155, + "step": 13942 + }, + { + "epoch": 0.35283548852392643, + "grad_norm": 7.171718120574951, + "learning_rate": 7.3217969887020205e-06, + "loss": 0.2123, + "step": 13943 + }, + { + "epoch": 0.3528607940886201, + "grad_norm": 4.482614040374756, + "learning_rate": 7.321441374042124e-06, + "loss": 0.1442, + "step": 13944 + }, + { + "epoch": 0.35288609965331375, + "grad_norm": 35.06601333618164, + "learning_rate": 7.321085744411683e-06, + "loss": 0.4249, + "step": 13945 + }, + { + "epoch": 0.35291140521800746, + "grad_norm": 3.9348878860473633, + "learning_rate": 7.320730099812992e-06, + "loss": 0.0854, + "step": 13946 + }, + { + "epoch": 0.3529367107827011, + "grad_norm": 3.8035671710968018, + "learning_rate": 7.320374440248343e-06, + "loss": 0.1513, + "step": 13947 + }, + { + "epoch": 0.35296201634739477, + "grad_norm": 12.370026588439941, + "learning_rate": 7.3200187657200335e-06, + "loss": 0.1763, + "step": 13948 + }, + { + "epoch": 0.3529873219120885, + "grad_norm": 5.282337188720703, + "learning_rate": 7.319663076230352e-06, + "loss": 0.2287, + "step": 13949 + }, + { + "epoch": 0.35301262747678214, + "grad_norm": 4.031398296356201, + "learning_rate": 7.319307371781596e-06, + "loss": 0.1559, + "step": 13950 + }, + { + "epoch": 0.3530379330414758, + "grad_norm": 4.139734745025635, + "learning_rate": 7.318951652376058e-06, + "loss": 0.1428, + "step": 13951 + }, + { + "epoch": 0.3530632386061695, + "grad_norm": 3.497319221496582, + "learning_rate": 7.318595918016033e-06, + "loss": 0.0939, + "step": 13952 + }, + { + "epoch": 0.35308854417086316, + "grad_norm": 4.426158428192139, + "learning_rate": 7.318240168703814e-06, + "loss": 0.15, + "step": 13953 + }, + { + "epoch": 0.3531138497355569, + "grad_norm": 17.251476287841797, + "learning_rate": 7.317884404441694e-06, + "loss": 0.4294, + "step": 13954 + }, + { + "epoch": 0.35313915530025053, + "grad_norm": 6.6568121910095215, + "learning_rate": 7.317528625231971e-06, + "loss": 0.1652, + "step": 13955 + }, + { + "epoch": 0.3531644608649442, + "grad_norm": 6.473673343658447, + "learning_rate": 7.317172831076935e-06, + "loss": 0.1729, + "step": 13956 + }, + { + "epoch": 0.3531897664296379, + "grad_norm": 5.660766124725342, + "learning_rate": 7.3168170219788836e-06, + "loss": 0.3023, + "step": 13957 + }, + { + "epoch": 0.35321507199433155, + "grad_norm": 14.345907211303711, + "learning_rate": 7.316461197940109e-06, + "loss": 0.1465, + "step": 13958 + }, + { + "epoch": 0.3532403775590252, + "grad_norm": 9.491374969482422, + "learning_rate": 7.316105358962907e-06, + "loss": 0.2433, + "step": 13959 + }, + { + "epoch": 0.3532656831237189, + "grad_norm": 8.66729736328125, + "learning_rate": 7.315749505049573e-06, + "loss": 0.2074, + "step": 13960 + }, + { + "epoch": 0.3532909886884126, + "grad_norm": 14.491925239562988, + "learning_rate": 7.3153936362024016e-06, + "loss": 0.2588, + "step": 13961 + }, + { + "epoch": 0.35331629425310623, + "grad_norm": 5.160829544067383, + "learning_rate": 7.315037752423687e-06, + "loss": 0.1639, + "step": 13962 + }, + { + "epoch": 0.35334159981779995, + "grad_norm": 4.473805904388428, + "learning_rate": 7.314681853715724e-06, + "loss": 0.2448, + "step": 13963 + }, + { + "epoch": 0.3533669053824936, + "grad_norm": 3.6316890716552734, + "learning_rate": 7.314325940080809e-06, + "loss": 0.1556, + "step": 13964 + }, + { + "epoch": 0.3533922109471873, + "grad_norm": 4.729853630065918, + "learning_rate": 7.3139700115212365e-06, + "loss": 0.1743, + "step": 13965 + }, + { + "epoch": 0.35341751651188097, + "grad_norm": 2.979191541671753, + "learning_rate": 7.313614068039302e-06, + "loss": 0.128, + "step": 13966 + }, + { + "epoch": 0.3534428220765746, + "grad_norm": 6.767559051513672, + "learning_rate": 7.3132581096373e-06, + "loss": 0.2182, + "step": 13967 + }, + { + "epoch": 0.35346812764126834, + "grad_norm": 8.20809268951416, + "learning_rate": 7.312902136317528e-06, + "loss": 0.272, + "step": 13968 + }, + { + "epoch": 0.353493433205962, + "grad_norm": 5.437412261962891, + "learning_rate": 7.312546148082278e-06, + "loss": 0.182, + "step": 13969 + }, + { + "epoch": 0.35351873877065565, + "grad_norm": 6.187009334564209, + "learning_rate": 7.31219014493385e-06, + "loss": 0.1788, + "step": 13970 + }, + { + "epoch": 0.35354404433534936, + "grad_norm": 6.635514736175537, + "learning_rate": 7.311834126874538e-06, + "loss": 0.239, + "step": 13971 + }, + { + "epoch": 0.353569349900043, + "grad_norm": 4.717441558837891, + "learning_rate": 7.311478093906636e-06, + "loss": 0.1029, + "step": 13972 + }, + { + "epoch": 0.3535946554647367, + "grad_norm": 4.490231037139893, + "learning_rate": 7.311122046032444e-06, + "loss": 0.1844, + "step": 13973 + }, + { + "epoch": 0.3536199610294304, + "grad_norm": 4.428189277648926, + "learning_rate": 7.310765983254254e-06, + "loss": 0.1569, + "step": 13974 + }, + { + "epoch": 0.35364526659412404, + "grad_norm": 12.544137954711914, + "learning_rate": 7.310409905574365e-06, + "loss": 0.2615, + "step": 13975 + }, + { + "epoch": 0.3536705721588177, + "grad_norm": 5.477504253387451, + "learning_rate": 7.3100538129950726e-06, + "loss": 0.1528, + "step": 13976 + }, + { + "epoch": 0.3536958777235114, + "grad_norm": 8.096985816955566, + "learning_rate": 7.309697705518672e-06, + "loss": 0.2624, + "step": 13977 + }, + { + "epoch": 0.35372118328820507, + "grad_norm": 4.661945343017578, + "learning_rate": 7.30934158314746e-06, + "loss": 0.184, + "step": 13978 + }, + { + "epoch": 0.3537464888528988, + "grad_norm": 8.654141426086426, + "learning_rate": 7.308985445883736e-06, + "loss": 0.2541, + "step": 13979 + }, + { + "epoch": 0.35377179441759243, + "grad_norm": 4.602313041687012, + "learning_rate": 7.308629293729792e-06, + "loss": 0.2041, + "step": 13980 + }, + { + "epoch": 0.3537970999822861, + "grad_norm": 5.983795166015625, + "learning_rate": 7.308273126687928e-06, + "loss": 0.229, + "step": 13981 + }, + { + "epoch": 0.3538224055469798, + "grad_norm": 4.051845550537109, + "learning_rate": 7.30791694476044e-06, + "loss": 0.2148, + "step": 13982 + }, + { + "epoch": 0.35384771111167346, + "grad_norm": 5.162802696228027, + "learning_rate": 7.307560747949625e-06, + "loss": 0.1985, + "step": 13983 + }, + { + "epoch": 0.3538730166763671, + "grad_norm": 9.314122200012207, + "learning_rate": 7.307204536257779e-06, + "loss": 0.2494, + "step": 13984 + }, + { + "epoch": 0.3538983222410608, + "grad_norm": 4.814642429351807, + "learning_rate": 7.306848309687201e-06, + "loss": 0.1652, + "step": 13985 + }, + { + "epoch": 0.3539236278057545, + "grad_norm": 4.384979724884033, + "learning_rate": 7.306492068240189e-06, + "loss": 0.1139, + "step": 13986 + }, + { + "epoch": 0.35394893337044814, + "grad_norm": 5.516266345977783, + "learning_rate": 7.306135811919035e-06, + "loss": 0.2571, + "step": 13987 + }, + { + "epoch": 0.35397423893514185, + "grad_norm": 2.998734474182129, + "learning_rate": 7.305779540726043e-06, + "loss": 0.1482, + "step": 13988 + }, + { + "epoch": 0.3539995444998355, + "grad_norm": 4.578073501586914, + "learning_rate": 7.3054232546635066e-06, + "loss": 0.2049, + "step": 13989 + }, + { + "epoch": 0.3540248500645292, + "grad_norm": 5.65899658203125, + "learning_rate": 7.305066953733725e-06, + "loss": 0.1597, + "step": 13990 + }, + { + "epoch": 0.3540501556292229, + "grad_norm": 4.54791259765625, + "learning_rate": 7.3047106379389945e-06, + "loss": 0.1642, + "step": 13991 + }, + { + "epoch": 0.35407546119391653, + "grad_norm": 4.301231861114502, + "learning_rate": 7.304354307281613e-06, + "loss": 0.1756, + "step": 13992 + }, + { + "epoch": 0.35410076675861024, + "grad_norm": 3.769170045852661, + "learning_rate": 7.303997961763883e-06, + "loss": 0.1454, + "step": 13993 + }, + { + "epoch": 0.3541260723233039, + "grad_norm": 4.361507892608643, + "learning_rate": 7.303641601388095e-06, + "loss": 0.1845, + "step": 13994 + }, + { + "epoch": 0.35415137788799755, + "grad_norm": 4.887667655944824, + "learning_rate": 7.303285226156553e-06, + "loss": 0.1873, + "step": 13995 + }, + { + "epoch": 0.35417668345269127, + "grad_norm": 5.000020980834961, + "learning_rate": 7.302928836071551e-06, + "loss": 0.2154, + "step": 13996 + }, + { + "epoch": 0.3542019890173849, + "grad_norm": 3.435567855834961, + "learning_rate": 7.302572431135392e-06, + "loss": 0.115, + "step": 13997 + }, + { + "epoch": 0.3542272945820786, + "grad_norm": 6.318670749664307, + "learning_rate": 7.30221601135037e-06, + "loss": 0.2226, + "step": 13998 + }, + { + "epoch": 0.3542526001467723, + "grad_norm": 3.9469430446624756, + "learning_rate": 7.301859576718785e-06, + "loss": 0.135, + "step": 13999 + }, + { + "epoch": 0.35427790571146595, + "grad_norm": 4.9719157218933105, + "learning_rate": 7.301503127242937e-06, + "loss": 0.1504, + "step": 14000 + }, + { + "epoch": 0.3543032112761596, + "grad_norm": 3.708716630935669, + "learning_rate": 7.301146662925121e-06, + "loss": 0.175, + "step": 14001 + }, + { + "epoch": 0.3543285168408533, + "grad_norm": 3.4609482288360596, + "learning_rate": 7.300790183767641e-06, + "loss": 0.2001, + "step": 14002 + }, + { + "epoch": 0.35435382240554697, + "grad_norm": 4.7628936767578125, + "learning_rate": 7.3004336897727915e-06, + "loss": 0.1917, + "step": 14003 + }, + { + "epoch": 0.3543791279702407, + "grad_norm": 6.772756099700928, + "learning_rate": 7.3000771809428735e-06, + "loss": 0.0766, + "step": 14004 + }, + { + "epoch": 0.35440443353493434, + "grad_norm": 14.631917953491211, + "learning_rate": 7.2997206572801846e-06, + "loss": 0.1739, + "step": 14005 + }, + { + "epoch": 0.354429739099628, + "grad_norm": 8.628311157226562, + "learning_rate": 7.299364118787027e-06, + "loss": 0.2475, + "step": 14006 + }, + { + "epoch": 0.3544550446643217, + "grad_norm": 4.430951118469238, + "learning_rate": 7.299007565465696e-06, + "loss": 0.1862, + "step": 14007 + }, + { + "epoch": 0.35448035022901536, + "grad_norm": 5.964262008666992, + "learning_rate": 7.298650997318495e-06, + "loss": 0.1169, + "step": 14008 + }, + { + "epoch": 0.354505655793709, + "grad_norm": 5.810975074768066, + "learning_rate": 7.29829441434772e-06, + "loss": 0.1681, + "step": 14009 + }, + { + "epoch": 0.35453096135840273, + "grad_norm": 6.663167476654053, + "learning_rate": 7.297937816555671e-06, + "loss": 0.2616, + "step": 14010 + }, + { + "epoch": 0.3545562669230964, + "grad_norm": 3.1567800045013428, + "learning_rate": 7.297581203944649e-06, + "loss": 0.1787, + "step": 14011 + }, + { + "epoch": 0.35458157248779004, + "grad_norm": 6.895911693572998, + "learning_rate": 7.297224576516955e-06, + "loss": 0.1505, + "step": 14012 + }, + { + "epoch": 0.35460687805248375, + "grad_norm": 6.777425765991211, + "learning_rate": 7.2968679342748845e-06, + "loss": 0.2297, + "step": 14013 + }, + { + "epoch": 0.3546321836171774, + "grad_norm": 5.363424301147461, + "learning_rate": 7.296511277220741e-06, + "loss": 0.1455, + "step": 14014 + }, + { + "epoch": 0.35465748918187107, + "grad_norm": 2.7903432846069336, + "learning_rate": 7.296154605356823e-06, + "loss": 0.1293, + "step": 14015 + }, + { + "epoch": 0.3546827947465648, + "grad_norm": 2.7232768535614014, + "learning_rate": 7.295797918685431e-06, + "loss": 0.119, + "step": 14016 + }, + { + "epoch": 0.35470810031125843, + "grad_norm": 2.727677345275879, + "learning_rate": 7.295441217208866e-06, + "loss": 0.1275, + "step": 14017 + }, + { + "epoch": 0.35473340587595215, + "grad_norm": 12.131278991699219, + "learning_rate": 7.295084500929427e-06, + "loss": 0.2203, + "step": 14018 + }, + { + "epoch": 0.3547587114406458, + "grad_norm": 15.057408332824707, + "learning_rate": 7.294727769849416e-06, + "loss": 0.2628, + "step": 14019 + }, + { + "epoch": 0.35478401700533946, + "grad_norm": 6.082690238952637, + "learning_rate": 7.294371023971132e-06, + "loss": 0.2092, + "step": 14020 + }, + { + "epoch": 0.35480932257003317, + "grad_norm": 8.643447875976562, + "learning_rate": 7.2940142632968754e-06, + "loss": 0.21, + "step": 14021 + }, + { + "epoch": 0.3548346281347268, + "grad_norm": 6.83982515335083, + "learning_rate": 7.293657487828948e-06, + "loss": 0.1197, + "step": 14022 + }, + { + "epoch": 0.3548599336994205, + "grad_norm": 9.153412818908691, + "learning_rate": 7.2933006975696496e-06, + "loss": 0.1829, + "step": 14023 + }, + { + "epoch": 0.3548852392641142, + "grad_norm": 10.3685941696167, + "learning_rate": 7.292943892521282e-06, + "loss": 0.1683, + "step": 14024 + }, + { + "epoch": 0.35491054482880785, + "grad_norm": 7.4125776290893555, + "learning_rate": 7.292587072686146e-06, + "loss": 0.2618, + "step": 14025 + }, + { + "epoch": 0.3549358503935015, + "grad_norm": 14.9762601852417, + "learning_rate": 7.292230238066542e-06, + "loss": 0.282, + "step": 14026 + }, + { + "epoch": 0.3549611559581952, + "grad_norm": 8.1624174118042, + "learning_rate": 7.291873388664772e-06, + "loss": 0.166, + "step": 14027 + }, + { + "epoch": 0.3549864615228889, + "grad_norm": 5.690176010131836, + "learning_rate": 7.291516524483136e-06, + "loss": 0.2467, + "step": 14028 + }, + { + "epoch": 0.3550117670875826, + "grad_norm": 3.0727717876434326, + "learning_rate": 7.291159645523937e-06, + "loss": 0.1999, + "step": 14029 + }, + { + "epoch": 0.35503707265227624, + "grad_norm": 2.434544563293457, + "learning_rate": 7.290802751789476e-06, + "loss": 0.0608, + "step": 14030 + }, + { + "epoch": 0.3550623782169699, + "grad_norm": 3.0850493907928467, + "learning_rate": 7.290445843282054e-06, + "loss": 0.1578, + "step": 14031 + }, + { + "epoch": 0.3550876837816636, + "grad_norm": 6.142644882202148, + "learning_rate": 7.290088920003973e-06, + "loss": 0.1948, + "step": 14032 + }, + { + "epoch": 0.35511298934635727, + "grad_norm": 10.71252155303955, + "learning_rate": 7.289731981957533e-06, + "loss": 0.2328, + "step": 14033 + }, + { + "epoch": 0.3551382949110509, + "grad_norm": 8.426237106323242, + "learning_rate": 7.289375029145038e-06, + "loss": 0.2562, + "step": 14034 + }, + { + "epoch": 0.35516360047574463, + "grad_norm": 9.543999671936035, + "learning_rate": 7.289018061568791e-06, + "loss": 0.0787, + "step": 14035 + }, + { + "epoch": 0.3551889060404383, + "grad_norm": 3.1309244632720947, + "learning_rate": 7.288661079231089e-06, + "loss": 0.1392, + "step": 14036 + }, + { + "epoch": 0.35521421160513195, + "grad_norm": 11.946228981018066, + "learning_rate": 7.288304082134241e-06, + "loss": 0.1724, + "step": 14037 + }, + { + "epoch": 0.35523951716982566, + "grad_norm": 3.12956166267395, + "learning_rate": 7.287947070280543e-06, + "loss": 0.1999, + "step": 14038 + }, + { + "epoch": 0.3552648227345193, + "grad_norm": 3.7698452472686768, + "learning_rate": 7.287590043672301e-06, + "loss": 0.1791, + "step": 14039 + }, + { + "epoch": 0.35529012829921297, + "grad_norm": 6.369462966918945, + "learning_rate": 7.287233002311817e-06, + "loss": 0.2207, + "step": 14040 + }, + { + "epoch": 0.3553154338639067, + "grad_norm": 5.054734230041504, + "learning_rate": 7.2868759462013925e-06, + "loss": 0.1557, + "step": 14041 + }, + { + "epoch": 0.35534073942860034, + "grad_norm": 7.141181945800781, + "learning_rate": 7.286518875343329e-06, + "loss": 0.2251, + "step": 14042 + }, + { + "epoch": 0.35536604499329405, + "grad_norm": 5.345818519592285, + "learning_rate": 7.286161789739932e-06, + "loss": 0.1608, + "step": 14043 + }, + { + "epoch": 0.3553913505579877, + "grad_norm": 5.6699910163879395, + "learning_rate": 7.285804689393503e-06, + "loss": 0.1876, + "step": 14044 + }, + { + "epoch": 0.35541665612268136, + "grad_norm": 3.659161329269409, + "learning_rate": 7.285447574306344e-06, + "loss": 0.1629, + "step": 14045 + }, + { + "epoch": 0.3554419616873751, + "grad_norm": 3.9590392112731934, + "learning_rate": 7.285090444480759e-06, + "loss": 0.1941, + "step": 14046 + }, + { + "epoch": 0.35546726725206873, + "grad_norm": 9.80388069152832, + "learning_rate": 7.28473329991905e-06, + "loss": 0.21, + "step": 14047 + }, + { + "epoch": 0.3554925728167624, + "grad_norm": 3.243807792663574, + "learning_rate": 7.2843761406235225e-06, + "loss": 0.1387, + "step": 14048 + }, + { + "epoch": 0.3555178783814561, + "grad_norm": 3.893613815307617, + "learning_rate": 7.284018966596477e-06, + "loss": 0.1433, + "step": 14049 + }, + { + "epoch": 0.35554318394614975, + "grad_norm": 5.375444412231445, + "learning_rate": 7.2836617778402185e-06, + "loss": 0.2065, + "step": 14050 + }, + { + "epoch": 0.3555684895108434, + "grad_norm": 4.148630142211914, + "learning_rate": 7.28330457435705e-06, + "loss": 0.1613, + "step": 14051 + }, + { + "epoch": 0.3555937950755371, + "grad_norm": 4.029128074645996, + "learning_rate": 7.282947356149275e-06, + "loss": 0.2006, + "step": 14052 + }, + { + "epoch": 0.3556191006402308, + "grad_norm": 3.931576728820801, + "learning_rate": 7.282590123219196e-06, + "loss": 0.2162, + "step": 14053 + }, + { + "epoch": 0.3556444062049245, + "grad_norm": 3.4613757133483887, + "learning_rate": 7.28223287556912e-06, + "loss": 0.1694, + "step": 14054 + }, + { + "epoch": 0.35566971176961815, + "grad_norm": 14.265591621398926, + "learning_rate": 7.281875613201348e-06, + "loss": 0.2369, + "step": 14055 + }, + { + "epoch": 0.3556950173343118, + "grad_norm": 7.034685134887695, + "learning_rate": 7.281518336118184e-06, + "loss": 0.1487, + "step": 14056 + }, + { + "epoch": 0.3557203228990055, + "grad_norm": 5.417492866516113, + "learning_rate": 7.281161044321933e-06, + "loss": 0.1809, + "step": 14057 + }, + { + "epoch": 0.35574562846369917, + "grad_norm": 12.217312812805176, + "learning_rate": 7.280803737814899e-06, + "loss": 0.2059, + "step": 14058 + }, + { + "epoch": 0.3557709340283928, + "grad_norm": 10.052889823913574, + "learning_rate": 7.280446416599386e-06, + "loss": 0.1977, + "step": 14059 + }, + { + "epoch": 0.35579623959308654, + "grad_norm": 4.423162460327148, + "learning_rate": 7.280089080677698e-06, + "loss": 0.1703, + "step": 14060 + }, + { + "epoch": 0.3558215451577802, + "grad_norm": 8.493451118469238, + "learning_rate": 7.27973173005214e-06, + "loss": 0.2281, + "step": 14061 + }, + { + "epoch": 0.35584685072247385, + "grad_norm": 18.17963218688965, + "learning_rate": 7.279374364725015e-06, + "loss": 0.1401, + "step": 14062 + }, + { + "epoch": 0.35587215628716756, + "grad_norm": 16.557680130004883, + "learning_rate": 7.27901698469863e-06, + "loss": 0.2769, + "step": 14063 + }, + { + "epoch": 0.3558974618518612, + "grad_norm": 9.57496452331543, + "learning_rate": 7.278659589975287e-06, + "loss": 0.1775, + "step": 14064 + }, + { + "epoch": 0.3559227674165549, + "grad_norm": 7.112194061279297, + "learning_rate": 7.278302180557294e-06, + "loss": 0.215, + "step": 14065 + }, + { + "epoch": 0.3559480729812486, + "grad_norm": 5.159282207489014, + "learning_rate": 7.2779447564469526e-06, + "loss": 0.151, + "step": 14066 + }, + { + "epoch": 0.35597337854594224, + "grad_norm": 5.64794921875, + "learning_rate": 7.277587317646569e-06, + "loss": 0.1898, + "step": 14067 + }, + { + "epoch": 0.35599868411063595, + "grad_norm": 8.06303596496582, + "learning_rate": 7.27722986415845e-06, + "loss": 0.1999, + "step": 14068 + }, + { + "epoch": 0.3560239896753296, + "grad_norm": 5.242639064788818, + "learning_rate": 7.276872395984897e-06, + "loss": 0.1541, + "step": 14069 + }, + { + "epoch": 0.35604929524002327, + "grad_norm": 9.170147895812988, + "learning_rate": 7.2765149131282195e-06, + "loss": 0.298, + "step": 14070 + }, + { + "epoch": 0.356074600804717, + "grad_norm": 7.505256175994873, + "learning_rate": 7.27615741559072e-06, + "loss": 0.2248, + "step": 14071 + }, + { + "epoch": 0.35609990636941063, + "grad_norm": 3.8489654064178467, + "learning_rate": 7.275799903374705e-06, + "loss": 0.1331, + "step": 14072 + }, + { + "epoch": 0.3561252119341043, + "grad_norm": 8.449942588806152, + "learning_rate": 7.275442376482479e-06, + "loss": 0.2306, + "step": 14073 + }, + { + "epoch": 0.356150517498798, + "grad_norm": 11.339497566223145, + "learning_rate": 7.27508483491635e-06, + "loss": 0.2244, + "step": 14074 + }, + { + "epoch": 0.35617582306349166, + "grad_norm": 3.5823652744293213, + "learning_rate": 7.274727278678621e-06, + "loss": 0.1316, + "step": 14075 + }, + { + "epoch": 0.3562011286281853, + "grad_norm": 4.9534173011779785, + "learning_rate": 7.274369707771598e-06, + "loss": 0.1944, + "step": 14076 + }, + { + "epoch": 0.356226434192879, + "grad_norm": 6.298049449920654, + "learning_rate": 7.274012122197589e-06, + "loss": 0.1681, + "step": 14077 + }, + { + "epoch": 0.3562517397575727, + "grad_norm": 4.245490074157715, + "learning_rate": 7.273654521958897e-06, + "loss": 0.1754, + "step": 14078 + }, + { + "epoch": 0.35627704532226634, + "grad_norm": 5.375612735748291, + "learning_rate": 7.273296907057833e-06, + "loss": 0.1685, + "step": 14079 + }, + { + "epoch": 0.35630235088696005, + "grad_norm": 3.542633295059204, + "learning_rate": 7.272939277496698e-06, + "loss": 0.2035, + "step": 14080 + }, + { + "epoch": 0.3563276564516537, + "grad_norm": 6.621676921844482, + "learning_rate": 7.2725816332778025e-06, + "loss": 0.1443, + "step": 14081 + }, + { + "epoch": 0.3563529620163474, + "grad_norm": 8.589860916137695, + "learning_rate": 7.272223974403447e-06, + "loss": 0.1775, + "step": 14082 + }, + { + "epoch": 0.3563782675810411, + "grad_norm": 7.201625347137451, + "learning_rate": 7.271866300875945e-06, + "loss": 0.1226, + "step": 14083 + }, + { + "epoch": 0.35640357314573473, + "grad_norm": 5.4749603271484375, + "learning_rate": 7.271508612697599e-06, + "loss": 0.1212, + "step": 14084 + }, + { + "epoch": 0.35642887871042844, + "grad_norm": 47.71369934082031, + "learning_rate": 7.2711509098707165e-06, + "loss": 0.3388, + "step": 14085 + }, + { + "epoch": 0.3564541842751221, + "grad_norm": 4.645035743713379, + "learning_rate": 7.270793192397604e-06, + "loss": 0.2209, + "step": 14086 + }, + { + "epoch": 0.35647948983981576, + "grad_norm": 8.215394020080566, + "learning_rate": 7.27043546028057e-06, + "loss": 0.2642, + "step": 14087 + }, + { + "epoch": 0.35650479540450947, + "grad_norm": 4.754138946533203, + "learning_rate": 7.270077713521918e-06, + "loss": 0.1931, + "step": 14088 + }, + { + "epoch": 0.3565301009692031, + "grad_norm": 3.5856292247772217, + "learning_rate": 7.269719952123957e-06, + "loss": 0.1569, + "step": 14089 + }, + { + "epoch": 0.3565554065338968, + "grad_norm": 10.546218872070312, + "learning_rate": 7.269362176088996e-06, + "loss": 0.1595, + "step": 14090 + }, + { + "epoch": 0.3565807120985905, + "grad_norm": 2.941803455352783, + "learning_rate": 7.269004385419339e-06, + "loss": 0.1286, + "step": 14091 + }, + { + "epoch": 0.35660601766328415, + "grad_norm": 8.931739807128906, + "learning_rate": 7.268646580117296e-06, + "loss": 0.1377, + "step": 14092 + }, + { + "epoch": 0.35663132322797786, + "grad_norm": 4.535534381866455, + "learning_rate": 7.268288760185172e-06, + "loss": 0.0888, + "step": 14093 + }, + { + "epoch": 0.3566566287926715, + "grad_norm": 11.264342308044434, + "learning_rate": 7.267930925625277e-06, + "loss": 0.2622, + "step": 14094 + }, + { + "epoch": 0.35668193435736517, + "grad_norm": 5.670280933380127, + "learning_rate": 7.267573076439916e-06, + "loss": 0.1713, + "step": 14095 + }, + { + "epoch": 0.3567072399220589, + "grad_norm": 5.514698505401611, + "learning_rate": 7.267215212631399e-06, + "loss": 0.1597, + "step": 14096 + }, + { + "epoch": 0.35673254548675254, + "grad_norm": 9.90002727508545, + "learning_rate": 7.266857334202031e-06, + "loss": 0.2878, + "step": 14097 + }, + { + "epoch": 0.3567578510514462, + "grad_norm": 3.9040045738220215, + "learning_rate": 7.266499441154123e-06, + "loss": 0.1808, + "step": 14098 + }, + { + "epoch": 0.3567831566161399, + "grad_norm": 5.959760665893555, + "learning_rate": 7.2661415334899835e-06, + "loss": 0.2208, + "step": 14099 + }, + { + "epoch": 0.35680846218083356, + "grad_norm": 4.714351177215576, + "learning_rate": 7.2657836112119165e-06, + "loss": 0.0656, + "step": 14100 + }, + { + "epoch": 0.3568337677455272, + "grad_norm": 4.86702299118042, + "learning_rate": 7.265425674322233e-06, + "loss": 0.2426, + "step": 14101 + }, + { + "epoch": 0.35685907331022093, + "grad_norm": 5.63591194152832, + "learning_rate": 7.26506772282324e-06, + "loss": 0.2604, + "step": 14102 + }, + { + "epoch": 0.3568843788749146, + "grad_norm": 4.9829206466674805, + "learning_rate": 7.2647097567172475e-06, + "loss": 0.2317, + "step": 14103 + }, + { + "epoch": 0.35690968443960824, + "grad_norm": 6.085807800292969, + "learning_rate": 7.264351776006563e-06, + "loss": 0.196, + "step": 14104 + }, + { + "epoch": 0.35693499000430196, + "grad_norm": 5.883573055267334, + "learning_rate": 7.263993780693496e-06, + "loss": 0.28, + "step": 14105 + }, + { + "epoch": 0.3569602955689956, + "grad_norm": 4.788015365600586, + "learning_rate": 7.263635770780353e-06, + "loss": 0.1263, + "step": 14106 + }, + { + "epoch": 0.3569856011336893, + "grad_norm": 5.3842244148254395, + "learning_rate": 7.263277746269444e-06, + "loss": 0.2315, + "step": 14107 + }, + { + "epoch": 0.357010906698383, + "grad_norm": 6.395328044891357, + "learning_rate": 7.2629197071630775e-06, + "loss": 0.1989, + "step": 14108 + }, + { + "epoch": 0.35703621226307664, + "grad_norm": 6.9136457443237305, + "learning_rate": 7.262561653463564e-06, + "loss": 0.2577, + "step": 14109 + }, + { + "epoch": 0.35706151782777035, + "grad_norm": 2.565901041030884, + "learning_rate": 7.262203585173211e-06, + "loss": 0.1009, + "step": 14110 + }, + { + "epoch": 0.357086823392464, + "grad_norm": 7.162531852722168, + "learning_rate": 7.261845502294327e-06, + "loss": 0.2499, + "step": 14111 + }, + { + "epoch": 0.35711212895715766, + "grad_norm": 7.972986698150635, + "learning_rate": 7.261487404829223e-06, + "loss": 0.1343, + "step": 14112 + }, + { + "epoch": 0.35713743452185137, + "grad_norm": 3.8760876655578613, + "learning_rate": 7.261129292780206e-06, + "loss": 0.1841, + "step": 14113 + }, + { + "epoch": 0.357162740086545, + "grad_norm": 8.388885498046875, + "learning_rate": 7.260771166149588e-06, + "loss": 0.1564, + "step": 14114 + }, + { + "epoch": 0.3571880456512387, + "grad_norm": 8.8675537109375, + "learning_rate": 7.260413024939676e-06, + "loss": 0.2352, + "step": 14115 + }, + { + "epoch": 0.3572133512159324, + "grad_norm": 3.11971378326416, + "learning_rate": 7.2600548691527825e-06, + "loss": 0.1518, + "step": 14116 + }, + { + "epoch": 0.35723865678062605, + "grad_norm": 3.5439510345458984, + "learning_rate": 7.259696698791217e-06, + "loss": 0.1376, + "step": 14117 + }, + { + "epoch": 0.35726396234531976, + "grad_norm": 8.238470077514648, + "learning_rate": 7.259338513857285e-06, + "loss": 0.2364, + "step": 14118 + }, + { + "epoch": 0.3572892679100134, + "grad_norm": 4.1179914474487305, + "learning_rate": 7.2589803143533e-06, + "loss": 0.2124, + "step": 14119 + }, + { + "epoch": 0.3573145734747071, + "grad_norm": 9.589800834655762, + "learning_rate": 7.25862210028157e-06, + "loss": 0.3015, + "step": 14120 + }, + { + "epoch": 0.3573398790394008, + "grad_norm": 11.512701988220215, + "learning_rate": 7.258263871644409e-06, + "loss": 0.1838, + "step": 14121 + }, + { + "epoch": 0.35736518460409444, + "grad_norm": 4.313660621643066, + "learning_rate": 7.2579056284441215e-06, + "loss": 0.157, + "step": 14122 + }, + { + "epoch": 0.3573904901687881, + "grad_norm": 4.89666748046875, + "learning_rate": 7.257547370683022e-06, + "loss": 0.1463, + "step": 14123 + }, + { + "epoch": 0.3574157957334818, + "grad_norm": 8.349385261535645, + "learning_rate": 7.257189098363419e-06, + "loss": 0.2185, + "step": 14124 + }, + { + "epoch": 0.35744110129817547, + "grad_norm": 6.211122512817383, + "learning_rate": 7.256830811487625e-06, + "loss": 0.2267, + "step": 14125 + }, + { + "epoch": 0.3574664068628691, + "grad_norm": 5.20823860168457, + "learning_rate": 7.256472510057947e-06, + "loss": 0.1617, + "step": 14126 + }, + { + "epoch": 0.35749171242756284, + "grad_norm": 5.717545032501221, + "learning_rate": 7.2561141940766975e-06, + "loss": 0.2288, + "step": 14127 + }, + { + "epoch": 0.3575170179922565, + "grad_norm": 3.9549012184143066, + "learning_rate": 7.255755863546188e-06, + "loss": 0.1405, + "step": 14128 + }, + { + "epoch": 0.35754232355695015, + "grad_norm": 4.336534023284912, + "learning_rate": 7.255397518468728e-06, + "loss": 0.2273, + "step": 14129 + }, + { + "epoch": 0.35756762912164386, + "grad_norm": 4.577698707580566, + "learning_rate": 7.25503915884663e-06, + "loss": 0.1568, + "step": 14130 + }, + { + "epoch": 0.3575929346863375, + "grad_norm": 3.5587151050567627, + "learning_rate": 7.254680784682202e-06, + "loss": 0.1334, + "step": 14131 + }, + { + "epoch": 0.3576182402510312, + "grad_norm": 4.817620277404785, + "learning_rate": 7.254322395977758e-06, + "loss": 0.1492, + "step": 14132 + }, + { + "epoch": 0.3576435458157249, + "grad_norm": 3.8098392486572266, + "learning_rate": 7.2539639927356085e-06, + "loss": 0.1276, + "step": 14133 + }, + { + "epoch": 0.35766885138041854, + "grad_norm": 3.3187687397003174, + "learning_rate": 7.253605574958065e-06, + "loss": 0.1215, + "step": 14134 + }, + { + "epoch": 0.35769415694511225, + "grad_norm": 5.5720930099487305, + "learning_rate": 7.253247142647437e-06, + "loss": 0.2113, + "step": 14135 + }, + { + "epoch": 0.3577194625098059, + "grad_norm": 4.924854278564453, + "learning_rate": 7.25288869580604e-06, + "loss": 0.2203, + "step": 14136 + }, + { + "epoch": 0.35774476807449956, + "grad_norm": 3.5816502571105957, + "learning_rate": 7.252530234436179e-06, + "loss": 0.2009, + "step": 14137 + }, + { + "epoch": 0.3577700736391933, + "grad_norm": 6.532973289489746, + "learning_rate": 7.252171758540173e-06, + "loss": 0.2525, + "step": 14138 + }, + { + "epoch": 0.35779537920388693, + "grad_norm": 7.260556697845459, + "learning_rate": 7.251813268120328e-06, + "loss": 0.3509, + "step": 14139 + }, + { + "epoch": 0.3578206847685806, + "grad_norm": 2.4396278858184814, + "learning_rate": 7.251454763178959e-06, + "loss": 0.0897, + "step": 14140 + }, + { + "epoch": 0.3578459903332743, + "grad_norm": 3.2073044776916504, + "learning_rate": 7.251096243718376e-06, + "loss": 0.1448, + "step": 14141 + }, + { + "epoch": 0.35787129589796796, + "grad_norm": 3.1808958053588867, + "learning_rate": 7.250737709740892e-06, + "loss": 0.1101, + "step": 14142 + }, + { + "epoch": 0.3578966014626616, + "grad_norm": 8.317333221435547, + "learning_rate": 7.250379161248822e-06, + "loss": 0.2479, + "step": 14143 + }, + { + "epoch": 0.3579219070273553, + "grad_norm": 7.755032062530518, + "learning_rate": 7.250020598244474e-06, + "loss": 0.1615, + "step": 14144 + }, + { + "epoch": 0.357947212592049, + "grad_norm": 13.329980850219727, + "learning_rate": 7.24966202073016e-06, + "loss": 0.2466, + "step": 14145 + }, + { + "epoch": 0.3579725181567427, + "grad_norm": 4.74941873550415, + "learning_rate": 7.249303428708196e-06, + "loss": 0.1867, + "step": 14146 + }, + { + "epoch": 0.35799782372143635, + "grad_norm": 2.5852346420288086, + "learning_rate": 7.248944822180892e-06, + "loss": 0.1138, + "step": 14147 + }, + { + "epoch": 0.35802312928613, + "grad_norm": 3.9852771759033203, + "learning_rate": 7.2485862011505616e-06, + "loss": 0.1181, + "step": 14148 + }, + { + "epoch": 0.3580484348508237, + "grad_norm": 5.143370628356934, + "learning_rate": 7.248227565619517e-06, + "loss": 0.1362, + "step": 14149 + }, + { + "epoch": 0.35807374041551737, + "grad_norm": 6.047898292541504, + "learning_rate": 7.247868915590069e-06, + "loss": 0.2974, + "step": 14150 + }, + { + "epoch": 0.35809904598021103, + "grad_norm": 6.44905424118042, + "learning_rate": 7.247510251064534e-06, + "loss": 0.2339, + "step": 14151 + }, + { + "epoch": 0.35812435154490474, + "grad_norm": 4.250176429748535, + "learning_rate": 7.247151572045225e-06, + "loss": 0.182, + "step": 14152 + }, + { + "epoch": 0.3581496571095984, + "grad_norm": 3.8860435485839844, + "learning_rate": 7.246792878534451e-06, + "loss": 0.1999, + "step": 14153 + }, + { + "epoch": 0.35817496267429205, + "grad_norm": 9.033397674560547, + "learning_rate": 7.2464341705345285e-06, + "loss": 0.1446, + "step": 14154 + }, + { + "epoch": 0.35820026823898576, + "grad_norm": 7.133526802062988, + "learning_rate": 7.24607544804777e-06, + "loss": 0.2201, + "step": 14155 + }, + { + "epoch": 0.3582255738036794, + "grad_norm": 15.357769966125488, + "learning_rate": 7.245716711076489e-06, + "loss": 0.4506, + "step": 14156 + }, + { + "epoch": 0.35825087936837313, + "grad_norm": 2.738572597503662, + "learning_rate": 7.2453579596229984e-06, + "loss": 0.1478, + "step": 14157 + }, + { + "epoch": 0.3582761849330668, + "grad_norm": 4.547314167022705, + "learning_rate": 7.244999193689613e-06, + "loss": 0.1665, + "step": 14158 + }, + { + "epoch": 0.35830149049776044, + "grad_norm": 5.564113140106201, + "learning_rate": 7.244640413278643e-06, + "loss": 0.1157, + "step": 14159 + }, + { + "epoch": 0.35832679606245416, + "grad_norm": 3.2271976470947266, + "learning_rate": 7.244281618392407e-06, + "loss": 0.1287, + "step": 14160 + }, + { + "epoch": 0.3583521016271478, + "grad_norm": 5.169583320617676, + "learning_rate": 7.243922809033215e-06, + "loss": 0.1798, + "step": 14161 + }, + { + "epoch": 0.35837740719184147, + "grad_norm": 10.876480102539062, + "learning_rate": 7.2435639852033825e-06, + "loss": 0.2322, + "step": 14162 + }, + { + "epoch": 0.3584027127565352, + "grad_norm": 8.454768180847168, + "learning_rate": 7.2432051469052235e-06, + "loss": 0.2031, + "step": 14163 + }, + { + "epoch": 0.35842801832122884, + "grad_norm": 13.977784156799316, + "learning_rate": 7.24284629414105e-06, + "loss": 0.2549, + "step": 14164 + }, + { + "epoch": 0.3584533238859225, + "grad_norm": 4.426716327667236, + "learning_rate": 7.242487426913181e-06, + "loss": 0.1514, + "step": 14165 + }, + { + "epoch": 0.3584786294506162, + "grad_norm": 5.894128322601318, + "learning_rate": 7.242128545223925e-06, + "loss": 0.1442, + "step": 14166 + }, + { + "epoch": 0.35850393501530986, + "grad_norm": 6.4859700202941895, + "learning_rate": 7.2417696490756e-06, + "loss": 0.2491, + "step": 14167 + }, + { + "epoch": 0.3585292405800035, + "grad_norm": 13.123003959655762, + "learning_rate": 7.2414107384705205e-06, + "loss": 0.2979, + "step": 14168 + }, + { + "epoch": 0.35855454614469723, + "grad_norm": 4.523940563201904, + "learning_rate": 7.2410518134109995e-06, + "loss": 0.2076, + "step": 14169 + }, + { + "epoch": 0.3585798517093909, + "grad_norm": 2.678544521331787, + "learning_rate": 7.240692873899352e-06, + "loss": 0.0987, + "step": 14170 + }, + { + "epoch": 0.3586051572740846, + "grad_norm": 2.9551663398742676, + "learning_rate": 7.240333919937893e-06, + "loss": 0.1396, + "step": 14171 + }, + { + "epoch": 0.35863046283877825, + "grad_norm": 10.573247909545898, + "learning_rate": 7.239974951528936e-06, + "loss": 0.1944, + "step": 14172 + }, + { + "epoch": 0.3586557684034719, + "grad_norm": 5.952528953552246, + "learning_rate": 7.2396159686748e-06, + "loss": 0.1594, + "step": 14173 + }, + { + "epoch": 0.3586810739681656, + "grad_norm": 7.700791358947754, + "learning_rate": 7.239256971377795e-06, + "loss": 0.1935, + "step": 14174 + }, + { + "epoch": 0.3587063795328593, + "grad_norm": 3.5904760360717773, + "learning_rate": 7.23889795964024e-06, + "loss": 0.1672, + "step": 14175 + }, + { + "epoch": 0.35873168509755293, + "grad_norm": 5.178941249847412, + "learning_rate": 7.2385389334644476e-06, + "loss": 0.1447, + "step": 14176 + }, + { + "epoch": 0.35875699066224664, + "grad_norm": 3.8231289386749268, + "learning_rate": 7.238179892852733e-06, + "loss": 0.1532, + "step": 14177 + }, + { + "epoch": 0.3587822962269403, + "grad_norm": 2.1371209621429443, + "learning_rate": 7.237820837807414e-06, + "loss": 0.1119, + "step": 14178 + }, + { + "epoch": 0.35880760179163396, + "grad_norm": 3.2098562717437744, + "learning_rate": 7.2374617683308044e-06, + "loss": 0.1067, + "step": 14179 + }, + { + "epoch": 0.35883290735632767, + "grad_norm": 4.847023010253906, + "learning_rate": 7.237102684425221e-06, + "loss": 0.1591, + "step": 14180 + }, + { + "epoch": 0.3588582129210213, + "grad_norm": 5.489414691925049, + "learning_rate": 7.236743586092977e-06, + "loss": 0.2276, + "step": 14181 + }, + { + "epoch": 0.35888351848571504, + "grad_norm": 6.686070919036865, + "learning_rate": 7.2363844733363904e-06, + "loss": 0.1997, + "step": 14182 + }, + { + "epoch": 0.3589088240504087, + "grad_norm": 2.991318702697754, + "learning_rate": 7.236025346157775e-06, + "loss": 0.2001, + "step": 14183 + }, + { + "epoch": 0.35893412961510235, + "grad_norm": 11.712363243103027, + "learning_rate": 7.23566620455945e-06, + "loss": 0.2006, + "step": 14184 + }, + { + "epoch": 0.35895943517979606, + "grad_norm": 5.265027046203613, + "learning_rate": 7.235307048543728e-06, + "loss": 0.1826, + "step": 14185 + }, + { + "epoch": 0.3589847407444897, + "grad_norm": 8.11996078491211, + "learning_rate": 7.2349478781129275e-06, + "loss": 0.2062, + "step": 14186 + }, + { + "epoch": 0.3590100463091834, + "grad_norm": 3.838688373565674, + "learning_rate": 7.234588693269365e-06, + "loss": 0.1239, + "step": 14187 + }, + { + "epoch": 0.3590353518738771, + "grad_norm": 4.279470920562744, + "learning_rate": 7.234229494015352e-06, + "loss": 0.1998, + "step": 14188 + }, + { + "epoch": 0.35906065743857074, + "grad_norm": 3.495833158493042, + "learning_rate": 7.233870280353212e-06, + "loss": 0.1165, + "step": 14189 + }, + { + "epoch": 0.3590859630032644, + "grad_norm": 3.7442948818206787, + "learning_rate": 7.233511052285257e-06, + "loss": 0.1681, + "step": 14190 + }, + { + "epoch": 0.3591112685679581, + "grad_norm": 11.92442512512207, + "learning_rate": 7.233151809813805e-06, + "loss": 0.1976, + "step": 14191 + }, + { + "epoch": 0.35913657413265176, + "grad_norm": 4.473238468170166, + "learning_rate": 7.232792552941171e-06, + "loss": 0.1921, + "step": 14192 + }, + { + "epoch": 0.3591618796973454, + "grad_norm": 8.276012420654297, + "learning_rate": 7.232433281669676e-06, + "loss": 0.2464, + "step": 14193 + }, + { + "epoch": 0.35918718526203913, + "grad_norm": 4.752829074859619, + "learning_rate": 7.232073996001632e-06, + "loss": 0.2229, + "step": 14194 + }, + { + "epoch": 0.3592124908267328, + "grad_norm": 7.952308654785156, + "learning_rate": 7.231714695939357e-06, + "loss": 0.3444, + "step": 14195 + }, + { + "epoch": 0.3592377963914265, + "grad_norm": 3.717958450317383, + "learning_rate": 7.231355381485171e-06, + "loss": 0.2141, + "step": 14196 + }, + { + "epoch": 0.35926310195612016, + "grad_norm": 8.125123977661133, + "learning_rate": 7.230996052641388e-06, + "loss": 0.1871, + "step": 14197 + }, + { + "epoch": 0.3592884075208138, + "grad_norm": 5.137533664703369, + "learning_rate": 7.230636709410328e-06, + "loss": 0.2216, + "step": 14198 + }, + { + "epoch": 0.3593137130855075, + "grad_norm": 4.872967720031738, + "learning_rate": 7.230277351794305e-06, + "loss": 0.2188, + "step": 14199 + }, + { + "epoch": 0.3593390186502012, + "grad_norm": 8.45586109161377, + "learning_rate": 7.22991797979564e-06, + "loss": 0.1863, + "step": 14200 + }, + { + "epoch": 0.35936432421489484, + "grad_norm": 5.480071067810059, + "learning_rate": 7.2295585934166464e-06, + "loss": 0.1914, + "step": 14201 + }, + { + "epoch": 0.35938962977958855, + "grad_norm": 4.976081371307373, + "learning_rate": 7.229199192659646e-06, + "loss": 0.2296, + "step": 14202 + }, + { + "epoch": 0.3594149353442822, + "grad_norm": 3.1744773387908936, + "learning_rate": 7.2288397775269535e-06, + "loss": 0.1287, + "step": 14203 + }, + { + "epoch": 0.35944024090897586, + "grad_norm": 7.0131449699401855, + "learning_rate": 7.22848034802089e-06, + "loss": 0.2314, + "step": 14204 + }, + { + "epoch": 0.3594655464736696, + "grad_norm": 3.9816009998321533, + "learning_rate": 7.228120904143769e-06, + "loss": 0.1663, + "step": 14205 + }, + { + "epoch": 0.35949085203836323, + "grad_norm": 5.174711227416992, + "learning_rate": 7.227761445897912e-06, + "loss": 0.1912, + "step": 14206 + }, + { + "epoch": 0.3595161576030569, + "grad_norm": 4.303305625915527, + "learning_rate": 7.227401973285636e-06, + "loss": 0.2241, + "step": 14207 + }, + { + "epoch": 0.3595414631677506, + "grad_norm": 8.295419692993164, + "learning_rate": 7.227042486309258e-06, + "loss": 0.1429, + "step": 14208 + }, + { + "epoch": 0.35956676873244425, + "grad_norm": 13.042191505432129, + "learning_rate": 7.226682984971099e-06, + "loss": 0.2337, + "step": 14209 + }, + { + "epoch": 0.35959207429713796, + "grad_norm": 4.162139415740967, + "learning_rate": 7.226323469273473e-06, + "loss": 0.1536, + "step": 14210 + }, + { + "epoch": 0.3596173798618316, + "grad_norm": 7.553391933441162, + "learning_rate": 7.225963939218703e-06, + "loss": 0.2722, + "step": 14211 + }, + { + "epoch": 0.3596426854265253, + "grad_norm": 5.351119518280029, + "learning_rate": 7.225604394809105e-06, + "loss": 0.2671, + "step": 14212 + }, + { + "epoch": 0.359667990991219, + "grad_norm": 4.098875999450684, + "learning_rate": 7.225244836046999e-06, + "loss": 0.1575, + "step": 14213 + }, + { + "epoch": 0.35969329655591264, + "grad_norm": 7.695374488830566, + "learning_rate": 7.224885262934701e-06, + "loss": 0.2698, + "step": 14214 + }, + { + "epoch": 0.3597186021206063, + "grad_norm": 4.527026176452637, + "learning_rate": 7.224525675474532e-06, + "loss": 0.1281, + "step": 14215 + }, + { + "epoch": 0.3597439076853, + "grad_norm": 5.685694694519043, + "learning_rate": 7.224166073668812e-06, + "loss": 0.2272, + "step": 14216 + }, + { + "epoch": 0.35976921324999367, + "grad_norm": 5.576969146728516, + "learning_rate": 7.223806457519858e-06, + "loss": 0.1373, + "step": 14217 + }, + { + "epoch": 0.3597945188146873, + "grad_norm": 5.486759185791016, + "learning_rate": 7.223446827029992e-06, + "loss": 0.2151, + "step": 14218 + }, + { + "epoch": 0.35981982437938104, + "grad_norm": 3.5054633617401123, + "learning_rate": 7.2230871822015284e-06, + "loss": 0.1284, + "step": 14219 + }, + { + "epoch": 0.3598451299440747, + "grad_norm": 5.5359930992126465, + "learning_rate": 7.22272752303679e-06, + "loss": 0.2261, + "step": 14220 + }, + { + "epoch": 0.3598704355087684, + "grad_norm": 6.447904586791992, + "learning_rate": 7.222367849538095e-06, + "loss": 0.2226, + "step": 14221 + }, + { + "epoch": 0.35989574107346206, + "grad_norm": 5.9664998054504395, + "learning_rate": 7.2220081617077645e-06, + "loss": 0.2238, + "step": 14222 + }, + { + "epoch": 0.3599210466381557, + "grad_norm": 6.92598295211792, + "learning_rate": 7.221648459548114e-06, + "loss": 0.241, + "step": 14223 + }, + { + "epoch": 0.35994635220284943, + "grad_norm": 5.166162967681885, + "learning_rate": 7.2212887430614685e-06, + "loss": 0.114, + "step": 14224 + }, + { + "epoch": 0.3599716577675431, + "grad_norm": 2.781986951828003, + "learning_rate": 7.2209290122501445e-06, + "loss": 0.1516, + "step": 14225 + }, + { + "epoch": 0.35999696333223674, + "grad_norm": 6.672741889953613, + "learning_rate": 7.220569267116461e-06, + "loss": 0.2469, + "step": 14226 + }, + { + "epoch": 0.36002226889693045, + "grad_norm": 3.9096124172210693, + "learning_rate": 7.2202095076627395e-06, + "loss": 0.1675, + "step": 14227 + }, + { + "epoch": 0.3600475744616241, + "grad_norm": 4.3736701011657715, + "learning_rate": 7.219849733891301e-06, + "loss": 0.1723, + "step": 14228 + }, + { + "epoch": 0.36007288002631777, + "grad_norm": 7.928844451904297, + "learning_rate": 7.219489945804464e-06, + "loss": 0.1761, + "step": 14229 + }, + { + "epoch": 0.3600981855910115, + "grad_norm": 9.064918518066406, + "learning_rate": 7.219130143404549e-06, + "loss": 0.1364, + "step": 14230 + }, + { + "epoch": 0.36012349115570513, + "grad_norm": 3.627261161804199, + "learning_rate": 7.218770326693878e-06, + "loss": 0.2039, + "step": 14231 + }, + { + "epoch": 0.3601487967203988, + "grad_norm": 10.43649959564209, + "learning_rate": 7.218410495674768e-06, + "loss": 0.2776, + "step": 14232 + }, + { + "epoch": 0.3601741022850925, + "grad_norm": 5.70747184753418, + "learning_rate": 7.218050650349542e-06, + "loss": 0.1579, + "step": 14233 + }, + { + "epoch": 0.36019940784978616, + "grad_norm": 7.322774887084961, + "learning_rate": 7.21769079072052e-06, + "loss": 0.2153, + "step": 14234 + }, + { + "epoch": 0.36022471341447987, + "grad_norm": 7.162778377532959, + "learning_rate": 7.217330916790023e-06, + "loss": 0.1887, + "step": 14235 + }, + { + "epoch": 0.3602500189791735, + "grad_norm": 5.477730751037598, + "learning_rate": 7.216971028560371e-06, + "loss": 0.2377, + "step": 14236 + }, + { + "epoch": 0.3602753245438672, + "grad_norm": 7.16928768157959, + "learning_rate": 7.216611126033886e-06, + "loss": 0.2258, + "step": 14237 + }, + { + "epoch": 0.3603006301085609, + "grad_norm": 3.109241008758545, + "learning_rate": 7.2162512092128875e-06, + "loss": 0.0709, + "step": 14238 + }, + { + "epoch": 0.36032593567325455, + "grad_norm": 8.50182056427002, + "learning_rate": 7.2158912780996955e-06, + "loss": 0.2006, + "step": 14239 + }, + { + "epoch": 0.3603512412379482, + "grad_norm": 3.2096409797668457, + "learning_rate": 7.215531332696636e-06, + "loss": 0.1227, + "step": 14240 + }, + { + "epoch": 0.3603765468026419, + "grad_norm": 3.7738966941833496, + "learning_rate": 7.215171373006024e-06, + "loss": 0.1233, + "step": 14241 + }, + { + "epoch": 0.3604018523673356, + "grad_norm": 3.360567331314087, + "learning_rate": 7.214811399030187e-06, + "loss": 0.1463, + "step": 14242 + }, + { + "epoch": 0.36042715793202923, + "grad_norm": 11.336292266845703, + "learning_rate": 7.214451410771441e-06, + "loss": 0.2666, + "step": 14243 + }, + { + "epoch": 0.36045246349672294, + "grad_norm": 5.567352771759033, + "learning_rate": 7.214091408232111e-06, + "loss": 0.1713, + "step": 14244 + }, + { + "epoch": 0.3604777690614166, + "grad_norm": 3.119025945663452, + "learning_rate": 7.213731391414516e-06, + "loss": 0.1625, + "step": 14245 + }, + { + "epoch": 0.3605030746261103, + "grad_norm": 6.569650650024414, + "learning_rate": 7.2133713603209795e-06, + "loss": 0.2156, + "step": 14246 + }, + { + "epoch": 0.36052838019080397, + "grad_norm": 5.976787090301514, + "learning_rate": 7.213011314953824e-06, + "loss": 0.1973, + "step": 14247 + }, + { + "epoch": 0.3605536857554976, + "grad_norm": 3.670809745788574, + "learning_rate": 7.2126512553153685e-06, + "loss": 0.1703, + "step": 14248 + }, + { + "epoch": 0.36057899132019133, + "grad_norm": 6.0093536376953125, + "learning_rate": 7.2122911814079375e-06, + "loss": 0.2386, + "step": 14249 + }, + { + "epoch": 0.360604296884885, + "grad_norm": 2.900876522064209, + "learning_rate": 7.211931093233853e-06, + "loss": 0.0984, + "step": 14250 + }, + { + "epoch": 0.36062960244957865, + "grad_norm": 4.215332508087158, + "learning_rate": 7.211570990795436e-06, + "loss": 0.1124, + "step": 14251 + }, + { + "epoch": 0.36065490801427236, + "grad_norm": 7.2548747062683105, + "learning_rate": 7.211210874095007e-06, + "loss": 0.278, + "step": 14252 + }, + { + "epoch": 0.360680213578966, + "grad_norm": 4.7898149490356445, + "learning_rate": 7.210850743134893e-06, + "loss": 0.181, + "step": 14253 + }, + { + "epoch": 0.36070551914365967, + "grad_norm": 5.482934951782227, + "learning_rate": 7.210490597917412e-06, + "loss": 0.2091, + "step": 14254 + }, + { + "epoch": 0.3607308247083534, + "grad_norm": 3.1520495414733887, + "learning_rate": 7.210130438444889e-06, + "loss": 0.1863, + "step": 14255 + }, + { + "epoch": 0.36075613027304704, + "grad_norm": 4.222506046295166, + "learning_rate": 7.209770264719646e-06, + "loss": 0.1781, + "step": 14256 + }, + { + "epoch": 0.3607814358377407, + "grad_norm": 5.901435852050781, + "learning_rate": 7.2094100767440055e-06, + "loss": 0.1134, + "step": 14257 + }, + { + "epoch": 0.3608067414024344, + "grad_norm": 5.904341220855713, + "learning_rate": 7.209049874520291e-06, + "loss": 0.1939, + "step": 14258 + }, + { + "epoch": 0.36083204696712806, + "grad_norm": 5.478646278381348, + "learning_rate": 7.2086896580508225e-06, + "loss": 0.1994, + "step": 14259 + }, + { + "epoch": 0.3608573525318218, + "grad_norm": 7.450321674346924, + "learning_rate": 7.208329427337928e-06, + "loss": 0.2518, + "step": 14260 + }, + { + "epoch": 0.36088265809651543, + "grad_norm": 3.326650619506836, + "learning_rate": 7.207969182383927e-06, + "loss": 0.1648, + "step": 14261 + }, + { + "epoch": 0.3609079636612091, + "grad_norm": 6.179543972015381, + "learning_rate": 7.207608923191143e-06, + "loss": 0.2095, + "step": 14262 + }, + { + "epoch": 0.3609332692259028, + "grad_norm": 2.946239471435547, + "learning_rate": 7.207248649761901e-06, + "loss": 0.1529, + "step": 14263 + }, + { + "epoch": 0.36095857479059645, + "grad_norm": 4.914819240570068, + "learning_rate": 7.2068883620985204e-06, + "loss": 0.2134, + "step": 14264 + }, + { + "epoch": 0.3609838803552901, + "grad_norm": 6.465766906738281, + "learning_rate": 7.20652806020333e-06, + "loss": 0.1424, + "step": 14265 + }, + { + "epoch": 0.3610091859199838, + "grad_norm": 6.584743499755859, + "learning_rate": 7.20616774407865e-06, + "loss": 0.2134, + "step": 14266 + }, + { + "epoch": 0.3610344914846775, + "grad_norm": 8.253076553344727, + "learning_rate": 7.2058074137268045e-06, + "loss": 0.2448, + "step": 14267 + }, + { + "epoch": 0.36105979704937113, + "grad_norm": 10.258932113647461, + "learning_rate": 7.205447069150116e-06, + "loss": 0.1366, + "step": 14268 + }, + { + "epoch": 0.36108510261406485, + "grad_norm": 5.138733386993408, + "learning_rate": 7.205086710350911e-06, + "loss": 0.2031, + "step": 14269 + }, + { + "epoch": 0.3611104081787585, + "grad_norm": 8.423977851867676, + "learning_rate": 7.20472633733151e-06, + "loss": 0.1938, + "step": 14270 + }, + { + "epoch": 0.36113571374345216, + "grad_norm": 4.4324517250061035, + "learning_rate": 7.204365950094241e-06, + "loss": 0.1784, + "step": 14271 + }, + { + "epoch": 0.36116101930814587, + "grad_norm": 6.952647686004639, + "learning_rate": 7.204005548641425e-06, + "loss": 0.1841, + "step": 14272 + }, + { + "epoch": 0.3611863248728395, + "grad_norm": 4.260460376739502, + "learning_rate": 7.203645132975389e-06, + "loss": 0.1364, + "step": 14273 + }, + { + "epoch": 0.36121163043753324, + "grad_norm": 6.640723705291748, + "learning_rate": 7.203284703098453e-06, + "loss": 0.2752, + "step": 14274 + }, + { + "epoch": 0.3612369360022269, + "grad_norm": 3.640368700027466, + "learning_rate": 7.202924259012945e-06, + "loss": 0.2119, + "step": 14275 + }, + { + "epoch": 0.36126224156692055, + "grad_norm": 5.173888206481934, + "learning_rate": 7.202563800721187e-06, + "loss": 0.2269, + "step": 14276 + }, + { + "epoch": 0.36128754713161426, + "grad_norm": 3.764486074447632, + "learning_rate": 7.202203328225508e-06, + "loss": 0.1573, + "step": 14277 + }, + { + "epoch": 0.3613128526963079, + "grad_norm": 4.946791172027588, + "learning_rate": 7.201842841528226e-06, + "loss": 0.1491, + "step": 14278 + }, + { + "epoch": 0.3613381582610016, + "grad_norm": 5.759818077087402, + "learning_rate": 7.201482340631669e-06, + "loss": 0.1227, + "step": 14279 + }, + { + "epoch": 0.3613634638256953, + "grad_norm": 7.315062999725342, + "learning_rate": 7.201121825538164e-06, + "loss": 0.2746, + "step": 14280 + }, + { + "epoch": 0.36138876939038894, + "grad_norm": 9.845338821411133, + "learning_rate": 7.200761296250033e-06, + "loss": 0.2061, + "step": 14281 + }, + { + "epoch": 0.3614140749550826, + "grad_norm": 3.006823778152466, + "learning_rate": 7.200400752769602e-06, + "loss": 0.1587, + "step": 14282 + }, + { + "epoch": 0.3614393805197763, + "grad_norm": 6.037185192108154, + "learning_rate": 7.200040195099195e-06, + "loss": 0.1769, + "step": 14283 + }, + { + "epoch": 0.36146468608446997, + "grad_norm": 3.6091794967651367, + "learning_rate": 7.1996796232411394e-06, + "loss": 0.1525, + "step": 14284 + }, + { + "epoch": 0.3614899916491637, + "grad_norm": 5.400124549865723, + "learning_rate": 7.199319037197759e-06, + "loss": 0.1739, + "step": 14285 + }, + { + "epoch": 0.36151529721385733, + "grad_norm": 6.485711574554443, + "learning_rate": 7.198958436971377e-06, + "loss": 0.1586, + "step": 14286 + }, + { + "epoch": 0.361540602778551, + "grad_norm": 9.633151054382324, + "learning_rate": 7.198597822564323e-06, + "loss": 0.2053, + "step": 14287 + }, + { + "epoch": 0.3615659083432447, + "grad_norm": 5.2138190269470215, + "learning_rate": 7.198237193978921e-06, + "loss": 0.1679, + "step": 14288 + }, + { + "epoch": 0.36159121390793836, + "grad_norm": 7.137990474700928, + "learning_rate": 7.1978765512174955e-06, + "loss": 0.3056, + "step": 14289 + }, + { + "epoch": 0.361616519472632, + "grad_norm": 7.61697244644165, + "learning_rate": 7.197515894282373e-06, + "loss": 0.2974, + "step": 14290 + }, + { + "epoch": 0.3616418250373257, + "grad_norm": 6.467564582824707, + "learning_rate": 7.197155223175878e-06, + "loss": 0.2035, + "step": 14291 + }, + { + "epoch": 0.3616671306020194, + "grad_norm": 4.575799942016602, + "learning_rate": 7.19679453790034e-06, + "loss": 0.1921, + "step": 14292 + }, + { + "epoch": 0.36169243616671304, + "grad_norm": 4.233867645263672, + "learning_rate": 7.196433838458081e-06, + "loss": 0.1269, + "step": 14293 + }, + { + "epoch": 0.36171774173140675, + "grad_norm": 7.0417680740356445, + "learning_rate": 7.196073124851429e-06, + "loss": 0.232, + "step": 14294 + }, + { + "epoch": 0.3617430472961004, + "grad_norm": 4.249754905700684, + "learning_rate": 7.1957123970827115e-06, + "loss": 0.2024, + "step": 14295 + }, + { + "epoch": 0.36176835286079406, + "grad_norm": 5.305327892303467, + "learning_rate": 7.195351655154251e-06, + "loss": 0.1776, + "step": 14296 + }, + { + "epoch": 0.3617936584254878, + "grad_norm": 3.085980176925659, + "learning_rate": 7.194990899068377e-06, + "loss": 0.1795, + "step": 14297 + }, + { + "epoch": 0.36181896399018143, + "grad_norm": 6.639301300048828, + "learning_rate": 7.194630128827415e-06, + "loss": 0.1567, + "step": 14298 + }, + { + "epoch": 0.36184426955487514, + "grad_norm": 8.728753089904785, + "learning_rate": 7.194269344433692e-06, + "loss": 0.2185, + "step": 14299 + }, + { + "epoch": 0.3618695751195688, + "grad_norm": 2.160536766052246, + "learning_rate": 7.193908545889533e-06, + "loss": 0.1044, + "step": 14300 + }, + { + "epoch": 0.36189488068426245, + "grad_norm": 6.910897731781006, + "learning_rate": 7.1935477331972656e-06, + "loss": 0.1745, + "step": 14301 + }, + { + "epoch": 0.36192018624895617, + "grad_norm": 4.987204551696777, + "learning_rate": 7.193186906359219e-06, + "loss": 0.17, + "step": 14302 + }, + { + "epoch": 0.3619454918136498, + "grad_norm": 3.865283250808716, + "learning_rate": 7.192826065377716e-06, + "loss": 0.1625, + "step": 14303 + }, + { + "epoch": 0.3619707973783435, + "grad_norm": 3.628909111022949, + "learning_rate": 7.1924652102550865e-06, + "loss": 0.1249, + "step": 14304 + }, + { + "epoch": 0.3619961029430372, + "grad_norm": 3.782696008682251, + "learning_rate": 7.192104340993656e-06, + "loss": 0.1268, + "step": 14305 + }, + { + "epoch": 0.36202140850773085, + "grad_norm": 8.429167747497559, + "learning_rate": 7.191743457595754e-06, + "loss": 0.1788, + "step": 14306 + }, + { + "epoch": 0.3620467140724245, + "grad_norm": 4.9547224044799805, + "learning_rate": 7.191382560063704e-06, + "loss": 0.135, + "step": 14307 + }, + { + "epoch": 0.3620720196371182, + "grad_norm": 8.842081069946289, + "learning_rate": 7.191021648399838e-06, + "loss": 0.1871, + "step": 14308 + }, + { + "epoch": 0.36209732520181187, + "grad_norm": 4.993475437164307, + "learning_rate": 7.190660722606478e-06, + "loss": 0.1796, + "step": 14309 + }, + { + "epoch": 0.3621226307665056, + "grad_norm": 11.258370399475098, + "learning_rate": 7.190299782685957e-06, + "loss": 0.3116, + "step": 14310 + }, + { + "epoch": 0.36214793633119924, + "grad_norm": 3.857403516769409, + "learning_rate": 7.189938828640599e-06, + "loss": 0.1035, + "step": 14311 + }, + { + "epoch": 0.3621732418958929, + "grad_norm": 10.527884483337402, + "learning_rate": 7.189577860472732e-06, + "loss": 0.2121, + "step": 14312 + }, + { + "epoch": 0.3621985474605866, + "grad_norm": 4.737531661987305, + "learning_rate": 7.189216878184686e-06, + "loss": 0.2077, + "step": 14313 + }, + { + "epoch": 0.36222385302528026, + "grad_norm": 41.10659408569336, + "learning_rate": 7.188855881778787e-06, + "loss": 0.1842, + "step": 14314 + }, + { + "epoch": 0.3622491585899739, + "grad_norm": 4.348485469818115, + "learning_rate": 7.188494871257365e-06, + "loss": 0.2133, + "step": 14315 + }, + { + "epoch": 0.36227446415466763, + "grad_norm": 4.988929271697998, + "learning_rate": 7.1881338466227446e-06, + "loss": 0.1831, + "step": 14316 + }, + { + "epoch": 0.3622997697193613, + "grad_norm": 4.244132041931152, + "learning_rate": 7.187772807877258e-06, + "loss": 0.2123, + "step": 14317 + }, + { + "epoch": 0.36232507528405494, + "grad_norm": 3.8823750019073486, + "learning_rate": 7.1874117550232294e-06, + "loss": 0.1694, + "step": 14318 + }, + { + "epoch": 0.36235038084874865, + "grad_norm": 4.552765846252441, + "learning_rate": 7.1870506880629905e-06, + "loss": 0.1793, + "step": 14319 + }, + { + "epoch": 0.3623756864134423, + "grad_norm": 7.920347690582275, + "learning_rate": 7.186689606998869e-06, + "loss": 0.2282, + "step": 14320 + }, + { + "epoch": 0.36240099197813597, + "grad_norm": 4.524187088012695, + "learning_rate": 7.186328511833192e-06, + "loss": 0.1747, + "step": 14321 + }, + { + "epoch": 0.3624262975428297, + "grad_norm": 4.848586559295654, + "learning_rate": 7.18596740256829e-06, + "loss": 0.1987, + "step": 14322 + }, + { + "epoch": 0.36245160310752333, + "grad_norm": 3.026209592819214, + "learning_rate": 7.1856062792064896e-06, + "loss": 0.1561, + "step": 14323 + }, + { + "epoch": 0.36247690867221705, + "grad_norm": 4.269382476806641, + "learning_rate": 7.185245141750123e-06, + "loss": 0.158, + "step": 14324 + }, + { + "epoch": 0.3625022142369107, + "grad_norm": 7.6753435134887695, + "learning_rate": 7.184883990201514e-06, + "loss": 0.1814, + "step": 14325 + }, + { + "epoch": 0.36252751980160436, + "grad_norm": 4.7874369621276855, + "learning_rate": 7.184522824562998e-06, + "loss": 0.1821, + "step": 14326 + }, + { + "epoch": 0.36255282536629807, + "grad_norm": 7.924256324768066, + "learning_rate": 7.184161644836898e-06, + "loss": 0.2589, + "step": 14327 + }, + { + "epoch": 0.3625781309309917, + "grad_norm": 4.159043312072754, + "learning_rate": 7.183800451025546e-06, + "loss": 0.1814, + "step": 14328 + }, + { + "epoch": 0.3626034364956854, + "grad_norm": 3.198178768157959, + "learning_rate": 7.183439243131271e-06, + "loss": 0.1509, + "step": 14329 + }, + { + "epoch": 0.3626287420603791, + "grad_norm": 6.898970603942871, + "learning_rate": 7.183078021156404e-06, + "loss": 0.192, + "step": 14330 + }, + { + "epoch": 0.36265404762507275, + "grad_norm": 4.174157619476318, + "learning_rate": 7.182716785103272e-06, + "loss": 0.1473, + "step": 14331 + }, + { + "epoch": 0.3626793531897664, + "grad_norm": 5.922574043273926, + "learning_rate": 7.1823555349742055e-06, + "loss": 0.2386, + "step": 14332 + }, + { + "epoch": 0.3627046587544601, + "grad_norm": 3.033538341522217, + "learning_rate": 7.181994270771535e-06, + "loss": 0.1206, + "step": 14333 + }, + { + "epoch": 0.3627299643191538, + "grad_norm": 10.512043952941895, + "learning_rate": 7.181632992497588e-06, + "loss": 0.2616, + "step": 14334 + }, + { + "epoch": 0.36275526988384743, + "grad_norm": 4.417410373687744, + "learning_rate": 7.181271700154696e-06, + "loss": 0.1792, + "step": 14335 + }, + { + "epoch": 0.36278057544854114, + "grad_norm": 6.520939350128174, + "learning_rate": 7.180910393745188e-06, + "loss": 0.2403, + "step": 14336 + }, + { + "epoch": 0.3628058810132348, + "grad_norm": 5.752292156219482, + "learning_rate": 7.180549073271397e-06, + "loss": 0.2844, + "step": 14337 + }, + { + "epoch": 0.3628311865779285, + "grad_norm": 8.914359092712402, + "learning_rate": 7.180187738735648e-06, + "loss": 0.2503, + "step": 14338 + }, + { + "epoch": 0.36285649214262217, + "grad_norm": 3.539283275604248, + "learning_rate": 7.179826390140275e-06, + "loss": 0.1199, + "step": 14339 + }, + { + "epoch": 0.3628817977073158, + "grad_norm": 5.819514751434326, + "learning_rate": 7.179465027487606e-06, + "loss": 0.1562, + "step": 14340 + }, + { + "epoch": 0.36290710327200953, + "grad_norm": 6.396612167358398, + "learning_rate": 7.179103650779974e-06, + "loss": 0.2276, + "step": 14341 + }, + { + "epoch": 0.3629324088367032, + "grad_norm": 4.064968585968018, + "learning_rate": 7.178742260019706e-06, + "loss": 0.2051, + "step": 14342 + }, + { + "epoch": 0.36295771440139685, + "grad_norm": 22.92877769470215, + "learning_rate": 7.178380855209136e-06, + "loss": 0.383, + "step": 14343 + }, + { + "epoch": 0.36298301996609056, + "grad_norm": 3.8021609783172607, + "learning_rate": 7.178019436350593e-06, + "loss": 0.2192, + "step": 14344 + }, + { + "epoch": 0.3630083255307842, + "grad_norm": 4.737054824829102, + "learning_rate": 7.177658003446407e-06, + "loss": 0.2056, + "step": 14345 + }, + { + "epoch": 0.36303363109547787, + "grad_norm": 3.3314359188079834, + "learning_rate": 7.1772965564989105e-06, + "loss": 0.1659, + "step": 14346 + }, + { + "epoch": 0.3630589366601716, + "grad_norm": 3.7970387935638428, + "learning_rate": 7.176935095510434e-06, + "loss": 0.1999, + "step": 14347 + }, + { + "epoch": 0.36308424222486524, + "grad_norm": 4.067307949066162, + "learning_rate": 7.176573620483307e-06, + "loss": 0.22, + "step": 14348 + }, + { + "epoch": 0.36310954778955895, + "grad_norm": 5.903380870819092, + "learning_rate": 7.176212131419861e-06, + "loss": 0.1813, + "step": 14349 + }, + { + "epoch": 0.3631348533542526, + "grad_norm": 7.794025897979736, + "learning_rate": 7.175850628322428e-06, + "loss": 0.3181, + "step": 14350 + }, + { + "epoch": 0.36316015891894626, + "grad_norm": 2.9473183155059814, + "learning_rate": 7.175489111193341e-06, + "loss": 0.1488, + "step": 14351 + }, + { + "epoch": 0.36318546448364, + "grad_norm": 6.655990123748779, + "learning_rate": 7.175127580034927e-06, + "loss": 0.1939, + "step": 14352 + }, + { + "epoch": 0.36321077004833363, + "grad_norm": 4.397918224334717, + "learning_rate": 7.17476603484952e-06, + "loss": 0.1119, + "step": 14353 + }, + { + "epoch": 0.3632360756130273, + "grad_norm": 8.062810897827148, + "learning_rate": 7.174404475639454e-06, + "loss": 0.2026, + "step": 14354 + }, + { + "epoch": 0.363261381177721, + "grad_norm": 4.030273914337158, + "learning_rate": 7.1740429024070555e-06, + "loss": 0.1068, + "step": 14355 + }, + { + "epoch": 0.36328668674241466, + "grad_norm": 7.341643810272217, + "learning_rate": 7.1736813151546594e-06, + "loss": 0.2271, + "step": 14356 + }, + { + "epoch": 0.3633119923071083, + "grad_norm": 12.525126457214355, + "learning_rate": 7.1733197138845966e-06, + "loss": 0.2784, + "step": 14357 + }, + { + "epoch": 0.363337297871802, + "grad_norm": 8.429288864135742, + "learning_rate": 7.172958098599199e-06, + "loss": 0.1313, + "step": 14358 + }, + { + "epoch": 0.3633626034364957, + "grad_norm": 5.562105178833008, + "learning_rate": 7.1725964693007996e-06, + "loss": 0.2215, + "step": 14359 + }, + { + "epoch": 0.36338790900118934, + "grad_norm": 4.562878131866455, + "learning_rate": 7.172234825991729e-06, + "loss": 0.1773, + "step": 14360 + }, + { + "epoch": 0.36341321456588305, + "grad_norm": 9.033378601074219, + "learning_rate": 7.171873168674321e-06, + "loss": 0.2049, + "step": 14361 + }, + { + "epoch": 0.3634385201305767, + "grad_norm": 9.63735294342041, + "learning_rate": 7.171511497350906e-06, + "loss": 0.2402, + "step": 14362 + }, + { + "epoch": 0.3634638256952704, + "grad_norm": 5.043582439422607, + "learning_rate": 7.171149812023819e-06, + "loss": 0.2243, + "step": 14363 + }, + { + "epoch": 0.36348913125996407, + "grad_norm": 6.733304977416992, + "learning_rate": 7.170788112695388e-06, + "loss": 0.235, + "step": 14364 + }, + { + "epoch": 0.3635144368246577, + "grad_norm": 4.59484338760376, + "learning_rate": 7.170426399367949e-06, + "loss": 0.1732, + "step": 14365 + }, + { + "epoch": 0.36353974238935144, + "grad_norm": 9.855474472045898, + "learning_rate": 7.170064672043835e-06, + "loss": 0.2142, + "step": 14366 + }, + { + "epoch": 0.3635650479540451, + "grad_norm": 2.665109157562256, + "learning_rate": 7.169702930725377e-06, + "loss": 0.113, + "step": 14367 + }, + { + "epoch": 0.36359035351873875, + "grad_norm": 5.773416042327881, + "learning_rate": 7.169341175414909e-06, + "loss": 0.2547, + "step": 14368 + }, + { + "epoch": 0.36361565908343246, + "grad_norm": 3.5434534549713135, + "learning_rate": 7.168979406114762e-06, + "loss": 0.109, + "step": 14369 + }, + { + "epoch": 0.3636409646481261, + "grad_norm": 4.050022602081299, + "learning_rate": 7.168617622827272e-06, + "loss": 0.2147, + "step": 14370 + }, + { + "epoch": 0.3636662702128198, + "grad_norm": 3.5589213371276855, + "learning_rate": 7.168255825554768e-06, + "loss": 0.1591, + "step": 14371 + }, + { + "epoch": 0.3636915757775135, + "grad_norm": 3.0772461891174316, + "learning_rate": 7.167894014299588e-06, + "loss": 0.1488, + "step": 14372 + }, + { + "epoch": 0.36371688134220714, + "grad_norm": 5.664663791656494, + "learning_rate": 7.167532189064061e-06, + "loss": 0.1933, + "step": 14373 + }, + { + "epoch": 0.36374218690690086, + "grad_norm": 6.109489440917969, + "learning_rate": 7.167170349850524e-06, + "loss": 0.1426, + "step": 14374 + }, + { + "epoch": 0.3637674924715945, + "grad_norm": 4.709804534912109, + "learning_rate": 7.166808496661306e-06, + "loss": 0.1416, + "step": 14375 + }, + { + "epoch": 0.36379279803628817, + "grad_norm": 4.103699684143066, + "learning_rate": 7.166446629498744e-06, + "loss": 0.1454, + "step": 14376 + }, + { + "epoch": 0.3638181036009819, + "grad_norm": 6.402338027954102, + "learning_rate": 7.166084748365172e-06, + "loss": 0.1818, + "step": 14377 + }, + { + "epoch": 0.36384340916567554, + "grad_norm": 3.830629825592041, + "learning_rate": 7.165722853262921e-06, + "loss": 0.1043, + "step": 14378 + }, + { + "epoch": 0.3638687147303692, + "grad_norm": 2.949674129486084, + "learning_rate": 7.165360944194327e-06, + "loss": 0.0988, + "step": 14379 + }, + { + "epoch": 0.3638940202950629, + "grad_norm": 3.6195178031921387, + "learning_rate": 7.164999021161724e-06, + "loss": 0.1475, + "step": 14380 + }, + { + "epoch": 0.36391932585975656, + "grad_norm": 5.228402137756348, + "learning_rate": 7.1646370841674426e-06, + "loss": 0.1205, + "step": 14381 + }, + { + "epoch": 0.3639446314244502, + "grad_norm": 4.720870494842529, + "learning_rate": 7.16427513321382e-06, + "loss": 0.2119, + "step": 14382 + }, + { + "epoch": 0.3639699369891439, + "grad_norm": 6.447768688201904, + "learning_rate": 7.163913168303192e-06, + "loss": 0.1304, + "step": 14383 + }, + { + "epoch": 0.3639952425538376, + "grad_norm": 6.175440788269043, + "learning_rate": 7.1635511894378874e-06, + "loss": 0.1091, + "step": 14384 + }, + { + "epoch": 0.36402054811853124, + "grad_norm": 4.628201484680176, + "learning_rate": 7.163189196620246e-06, + "loss": 0.1641, + "step": 14385 + }, + { + "epoch": 0.36404585368322495, + "grad_norm": 20.923194885253906, + "learning_rate": 7.162827189852598e-06, + "loss": 0.2609, + "step": 14386 + }, + { + "epoch": 0.3640711592479186, + "grad_norm": 3.920509099960327, + "learning_rate": 7.162465169137279e-06, + "loss": 0.1414, + "step": 14387 + }, + { + "epoch": 0.3640964648126123, + "grad_norm": 3.1096434593200684, + "learning_rate": 7.1621031344766264e-06, + "loss": 0.1683, + "step": 14388 + }, + { + "epoch": 0.364121770377306, + "grad_norm": 3.659106492996216, + "learning_rate": 7.161741085872973e-06, + "loss": 0.1452, + "step": 14389 + }, + { + "epoch": 0.36414707594199963, + "grad_norm": 3.24204421043396, + "learning_rate": 7.161379023328653e-06, + "loss": 0.0831, + "step": 14390 + }, + { + "epoch": 0.36417238150669334, + "grad_norm": 4.115426540374756, + "learning_rate": 7.161016946846e-06, + "loss": 0.1277, + "step": 14391 + }, + { + "epoch": 0.364197687071387, + "grad_norm": 7.1900248527526855, + "learning_rate": 7.160654856427354e-06, + "loss": 0.2139, + "step": 14392 + }, + { + "epoch": 0.36422299263608066, + "grad_norm": 5.2485833168029785, + "learning_rate": 7.1602927520750436e-06, + "loss": 0.126, + "step": 14393 + }, + { + "epoch": 0.36424829820077437, + "grad_norm": 7.857799053192139, + "learning_rate": 7.159930633791409e-06, + "loss": 0.2473, + "step": 14394 + }, + { + "epoch": 0.364273603765468, + "grad_norm": 6.034533977508545, + "learning_rate": 7.159568501578782e-06, + "loss": 0.1783, + "step": 14395 + }, + { + "epoch": 0.3642989093301617, + "grad_norm": 10.01115894317627, + "learning_rate": 7.1592063554394994e-06, + "loss": 0.2631, + "step": 14396 + }, + { + "epoch": 0.3643242148948554, + "grad_norm": 5.596843719482422, + "learning_rate": 7.158844195375898e-06, + "loss": 0.2093, + "step": 14397 + }, + { + "epoch": 0.36434952045954905, + "grad_norm": 3.43430757522583, + "learning_rate": 7.158482021390312e-06, + "loss": 0.1531, + "step": 14398 + }, + { + "epoch": 0.3643748260242427, + "grad_norm": 7.347231388092041, + "learning_rate": 7.158119833485077e-06, + "loss": 0.2303, + "step": 14399 + }, + { + "epoch": 0.3644001315889364, + "grad_norm": 5.522922515869141, + "learning_rate": 7.157757631662528e-06, + "loss": 0.1256, + "step": 14400 + }, + { + "epoch": 0.36442543715363007, + "grad_norm": 3.6350669860839844, + "learning_rate": 7.157395415925002e-06, + "loss": 0.1023, + "step": 14401 + }, + { + "epoch": 0.3644507427183238, + "grad_norm": 5.299045562744141, + "learning_rate": 7.1570331862748335e-06, + "loss": 0.1759, + "step": 14402 + }, + { + "epoch": 0.36447604828301744, + "grad_norm": 7.601853370666504, + "learning_rate": 7.156670942714359e-06, + "loss": 0.2142, + "step": 14403 + }, + { + "epoch": 0.3645013538477111, + "grad_norm": 7.512778282165527, + "learning_rate": 7.156308685245915e-06, + "loss": 0.2116, + "step": 14404 + }, + { + "epoch": 0.3645266594124048, + "grad_norm": 4.374242782592773, + "learning_rate": 7.155946413871838e-06, + "loss": 0.179, + "step": 14405 + }, + { + "epoch": 0.36455196497709846, + "grad_norm": 4.880007743835449, + "learning_rate": 7.155584128594463e-06, + "loss": 0.1707, + "step": 14406 + }, + { + "epoch": 0.3645772705417921, + "grad_norm": 4.817750453948975, + "learning_rate": 7.155221829416128e-06, + "loss": 0.1079, + "step": 14407 + }, + { + "epoch": 0.36460257610648583, + "grad_norm": 10.3385648727417, + "learning_rate": 7.154859516339166e-06, + "loss": 0.3057, + "step": 14408 + }, + { + "epoch": 0.3646278816711795, + "grad_norm": 4.216435432434082, + "learning_rate": 7.154497189365919e-06, + "loss": 0.1621, + "step": 14409 + }, + { + "epoch": 0.36465318723587314, + "grad_norm": 4.560703277587891, + "learning_rate": 7.154134848498719e-06, + "loss": 0.1422, + "step": 14410 + }, + { + "epoch": 0.36467849280056686, + "grad_norm": 6.607154846191406, + "learning_rate": 7.153772493739904e-06, + "loss": 0.124, + "step": 14411 + }, + { + "epoch": 0.3647037983652605, + "grad_norm": 4.489040851593018, + "learning_rate": 7.153410125091811e-06, + "loss": 0.1478, + "step": 14412 + }, + { + "epoch": 0.3647291039299542, + "grad_norm": 6.951314926147461, + "learning_rate": 7.153047742556776e-06, + "loss": 0.2137, + "step": 14413 + }, + { + "epoch": 0.3647544094946479, + "grad_norm": 6.187180995941162, + "learning_rate": 7.152685346137136e-06, + "loss": 0.2422, + "step": 14414 + }, + { + "epoch": 0.36477971505934154, + "grad_norm": 4.804900169372559, + "learning_rate": 7.1523229358352295e-06, + "loss": 0.1514, + "step": 14415 + }, + { + "epoch": 0.36480502062403525, + "grad_norm": 4.446528434753418, + "learning_rate": 7.151960511653393e-06, + "loss": 0.1464, + "step": 14416 + }, + { + "epoch": 0.3648303261887289, + "grad_norm": 6.480020046234131, + "learning_rate": 7.151598073593964e-06, + "loss": 0.1995, + "step": 14417 + }, + { + "epoch": 0.36485563175342256, + "grad_norm": 4.439349174499512, + "learning_rate": 7.151235621659277e-06, + "loss": 0.1789, + "step": 14418 + }, + { + "epoch": 0.36488093731811627, + "grad_norm": 6.532121181488037, + "learning_rate": 7.150873155851674e-06, + "loss": 0.1485, + "step": 14419 + }, + { + "epoch": 0.36490624288280993, + "grad_norm": 8.73677921295166, + "learning_rate": 7.150510676173489e-06, + "loss": 0.1775, + "step": 14420 + }, + { + "epoch": 0.3649315484475036, + "grad_norm": 4.4835028648376465, + "learning_rate": 7.150148182627061e-06, + "loss": 0.1306, + "step": 14421 + }, + { + "epoch": 0.3649568540121973, + "grad_norm": 3.80554461479187, + "learning_rate": 7.149785675214727e-06, + "loss": 0.1407, + "step": 14422 + }, + { + "epoch": 0.36498215957689095, + "grad_norm": 6.714746475219727, + "learning_rate": 7.149423153938826e-06, + "loss": 0.2746, + "step": 14423 + }, + { + "epoch": 0.3650074651415846, + "grad_norm": 3.8280675411224365, + "learning_rate": 7.149060618801693e-06, + "loss": 0.1505, + "step": 14424 + }, + { + "epoch": 0.3650327707062783, + "grad_norm": 12.670907020568848, + "learning_rate": 7.148698069805669e-06, + "loss": 0.2502, + "step": 14425 + }, + { + "epoch": 0.365058076270972, + "grad_norm": 4.984179496765137, + "learning_rate": 7.148335506953091e-06, + "loss": 0.2297, + "step": 14426 + }, + { + "epoch": 0.3650833818356657, + "grad_norm": 4.5173139572143555, + "learning_rate": 7.1479729302462975e-06, + "loss": 0.1545, + "step": 14427 + }, + { + "epoch": 0.36510868740035934, + "grad_norm": 3.501049041748047, + "learning_rate": 7.147610339687625e-06, + "loss": 0.1602, + "step": 14428 + }, + { + "epoch": 0.365133992965053, + "grad_norm": 6.882030963897705, + "learning_rate": 7.1472477352794124e-06, + "loss": 0.2977, + "step": 14429 + }, + { + "epoch": 0.3651592985297467, + "grad_norm": 3.6634886264801025, + "learning_rate": 7.146885117024001e-06, + "loss": 0.1703, + "step": 14430 + }, + { + "epoch": 0.36518460409444037, + "grad_norm": 2.3118269443511963, + "learning_rate": 7.146522484923725e-06, + "loss": 0.1025, + "step": 14431 + }, + { + "epoch": 0.365209909659134, + "grad_norm": 4.235665798187256, + "learning_rate": 7.146159838980925e-06, + "loss": 0.1988, + "step": 14432 + }, + { + "epoch": 0.36523521522382774, + "grad_norm": 12.619232177734375, + "learning_rate": 7.14579717919794e-06, + "loss": 0.1943, + "step": 14433 + }, + { + "epoch": 0.3652605207885214, + "grad_norm": 14.587130546569824, + "learning_rate": 7.145434505577107e-06, + "loss": 0.225, + "step": 14434 + }, + { + "epoch": 0.36528582635321505, + "grad_norm": 3.9938857555389404, + "learning_rate": 7.145071818120767e-06, + "loss": 0.1581, + "step": 14435 + }, + { + "epoch": 0.36531113191790876, + "grad_norm": 11.963455200195312, + "learning_rate": 7.144709116831258e-06, + "loss": 0.2763, + "step": 14436 + }, + { + "epoch": 0.3653364374826024, + "grad_norm": 11.846827507019043, + "learning_rate": 7.144346401710918e-06, + "loss": 0.2161, + "step": 14437 + }, + { + "epoch": 0.36536174304729613, + "grad_norm": 4.553040504455566, + "learning_rate": 7.143983672762087e-06, + "loss": 0.1974, + "step": 14438 + }, + { + "epoch": 0.3653870486119898, + "grad_norm": 12.214086532592773, + "learning_rate": 7.143620929987104e-06, + "loss": 0.1754, + "step": 14439 + }, + { + "epoch": 0.36541235417668344, + "grad_norm": 12.484457969665527, + "learning_rate": 7.14325817338831e-06, + "loss": 0.2432, + "step": 14440 + }, + { + "epoch": 0.36543765974137715, + "grad_norm": 5.1610612869262695, + "learning_rate": 7.142895402968041e-06, + "loss": 0.1545, + "step": 14441 + }, + { + "epoch": 0.3654629653060708, + "grad_norm": 4.7865190505981445, + "learning_rate": 7.142532618728638e-06, + "loss": 0.2347, + "step": 14442 + }, + { + "epoch": 0.36548827087076446, + "grad_norm": 11.684181213378906, + "learning_rate": 7.142169820672442e-06, + "loss": 0.2598, + "step": 14443 + }, + { + "epoch": 0.3655135764354582, + "grad_norm": 3.0122673511505127, + "learning_rate": 7.141807008801791e-06, + "loss": 0.1648, + "step": 14444 + }, + { + "epoch": 0.36553888200015183, + "grad_norm": 4.949476718902588, + "learning_rate": 7.141444183119024e-06, + "loss": 0.1667, + "step": 14445 + }, + { + "epoch": 0.3655641875648455, + "grad_norm": 3.850224256515503, + "learning_rate": 7.141081343626482e-06, + "loss": 0.1914, + "step": 14446 + }, + { + "epoch": 0.3655894931295392, + "grad_norm": 4.538756847381592, + "learning_rate": 7.140718490326506e-06, + "loss": 0.1942, + "step": 14447 + }, + { + "epoch": 0.36561479869423286, + "grad_norm": 5.363064765930176, + "learning_rate": 7.140355623221433e-06, + "loss": 0.1996, + "step": 14448 + }, + { + "epoch": 0.3656401042589265, + "grad_norm": 3.7905080318450928, + "learning_rate": 7.139992742313606e-06, + "loss": 0.1915, + "step": 14449 + }, + { + "epoch": 0.3656654098236202, + "grad_norm": 6.516941070556641, + "learning_rate": 7.139629847605362e-06, + "loss": 0.1934, + "step": 14450 + }, + { + "epoch": 0.3656907153883139, + "grad_norm": 5.665782928466797, + "learning_rate": 7.139266939099045e-06, + "loss": 0.1489, + "step": 14451 + }, + { + "epoch": 0.3657160209530076, + "grad_norm": 2.839418888092041, + "learning_rate": 7.138904016796993e-06, + "loss": 0.1245, + "step": 14452 + }, + { + "epoch": 0.36574132651770125, + "grad_norm": 4.847457408905029, + "learning_rate": 7.138541080701547e-06, + "loss": 0.1993, + "step": 14453 + }, + { + "epoch": 0.3657666320823949, + "grad_norm": 6.13198184967041, + "learning_rate": 7.138178130815048e-06, + "loss": 0.2063, + "step": 14454 + }, + { + "epoch": 0.3657919376470886, + "grad_norm": 5.048129558563232, + "learning_rate": 7.137815167139834e-06, + "loss": 0.135, + "step": 14455 + }, + { + "epoch": 0.3658172432117823, + "grad_norm": 8.7410306930542, + "learning_rate": 7.137452189678249e-06, + "loss": 0.2709, + "step": 14456 + }, + { + "epoch": 0.36584254877647593, + "grad_norm": 3.6026690006256104, + "learning_rate": 7.137089198432632e-06, + "loss": 0.1787, + "step": 14457 + }, + { + "epoch": 0.36586785434116964, + "grad_norm": 8.912224769592285, + "learning_rate": 7.136726193405326e-06, + "loss": 0.3728, + "step": 14458 + }, + { + "epoch": 0.3658931599058633, + "grad_norm": 7.355819225311279, + "learning_rate": 7.136363174598667e-06, + "loss": 0.2163, + "step": 14459 + }, + { + "epoch": 0.36591846547055695, + "grad_norm": 6.024326801300049, + "learning_rate": 7.136000142015001e-06, + "loss": 0.1501, + "step": 14460 + }, + { + "epoch": 0.36594377103525066, + "grad_norm": 4.828467845916748, + "learning_rate": 7.135637095656668e-06, + "loss": 0.1407, + "step": 14461 + }, + { + "epoch": 0.3659690765999443, + "grad_norm": 4.360918045043945, + "learning_rate": 7.1352740355260075e-06, + "loss": 0.1536, + "step": 14462 + }, + { + "epoch": 0.365994382164638, + "grad_norm": 4.454730033874512, + "learning_rate": 7.134910961625363e-06, + "loss": 0.178, + "step": 14463 + }, + { + "epoch": 0.3660196877293317, + "grad_norm": 2.538383960723877, + "learning_rate": 7.134547873957074e-06, + "loss": 0.155, + "step": 14464 + }, + { + "epoch": 0.36604499329402534, + "grad_norm": 3.9374301433563232, + "learning_rate": 7.134184772523484e-06, + "loss": 0.1728, + "step": 14465 + }, + { + "epoch": 0.36607029885871906, + "grad_norm": 3.4139750003814697, + "learning_rate": 7.133821657326931e-06, + "loss": 0.163, + "step": 14466 + }, + { + "epoch": 0.3660956044234127, + "grad_norm": 3.0299856662750244, + "learning_rate": 7.133458528369761e-06, + "loss": 0.1509, + "step": 14467 + }, + { + "epoch": 0.36612090998810637, + "grad_norm": 6.985439777374268, + "learning_rate": 7.1330953856543135e-06, + "loss": 0.2294, + "step": 14468 + }, + { + "epoch": 0.3661462155528001, + "grad_norm": 4.3951311111450195, + "learning_rate": 7.1327322291829306e-06, + "loss": 0.165, + "step": 14469 + }, + { + "epoch": 0.36617152111749374, + "grad_norm": 13.920162200927734, + "learning_rate": 7.132369058957955e-06, + "loss": 0.2917, + "step": 14470 + }, + { + "epoch": 0.3661968266821874, + "grad_norm": 22.21415138244629, + "learning_rate": 7.132005874981727e-06, + "loss": 0.206, + "step": 14471 + }, + { + "epoch": 0.3662221322468811, + "grad_norm": 4.65394926071167, + "learning_rate": 7.13164267725659e-06, + "loss": 0.1693, + "step": 14472 + }, + { + "epoch": 0.36624743781157476, + "grad_norm": 5.376835346221924, + "learning_rate": 7.131279465784886e-06, + "loss": 0.2043, + "step": 14473 + }, + { + "epoch": 0.3662727433762684, + "grad_norm": 3.7627878189086914, + "learning_rate": 7.1309162405689565e-06, + "loss": 0.1369, + "step": 14474 + }, + { + "epoch": 0.36629804894096213, + "grad_norm": 4.2165608406066895, + "learning_rate": 7.1305530016111445e-06, + "loss": 0.2335, + "step": 14475 + }, + { + "epoch": 0.3663233545056558, + "grad_norm": 9.228195190429688, + "learning_rate": 7.130189748913794e-06, + "loss": 0.1853, + "step": 14476 + }, + { + "epoch": 0.3663486600703495, + "grad_norm": 3.598768472671509, + "learning_rate": 7.129826482479245e-06, + "loss": 0.1451, + "step": 14477 + }, + { + "epoch": 0.36637396563504315, + "grad_norm": 3.3274424076080322, + "learning_rate": 7.129463202309841e-06, + "loss": 0.1352, + "step": 14478 + }, + { + "epoch": 0.3663992711997368, + "grad_norm": 5.8263726234436035, + "learning_rate": 7.129099908407925e-06, + "loss": 0.2035, + "step": 14479 + }, + { + "epoch": 0.3664245767644305, + "grad_norm": 4.081235408782959, + "learning_rate": 7.1287366007758405e-06, + "loss": 0.1964, + "step": 14480 + }, + { + "epoch": 0.3664498823291242, + "grad_norm": 4.739854335784912, + "learning_rate": 7.128373279415929e-06, + "loss": 0.172, + "step": 14481 + }, + { + "epoch": 0.36647518789381783, + "grad_norm": 4.752556800842285, + "learning_rate": 7.1280099443305324e-06, + "loss": 0.2422, + "step": 14482 + }, + { + "epoch": 0.36650049345851154, + "grad_norm": 4.376026153564453, + "learning_rate": 7.127646595521998e-06, + "loss": 0.141, + "step": 14483 + }, + { + "epoch": 0.3665257990232052, + "grad_norm": 6.111319065093994, + "learning_rate": 7.1272832329926655e-06, + "loss": 0.2136, + "step": 14484 + }, + { + "epoch": 0.36655110458789886, + "grad_norm": 5.328105449676514, + "learning_rate": 7.12691985674488e-06, + "loss": 0.1979, + "step": 14485 + }, + { + "epoch": 0.36657641015259257, + "grad_norm": 4.400324821472168, + "learning_rate": 7.126556466780983e-06, + "loss": 0.1078, + "step": 14486 + }, + { + "epoch": 0.3666017157172862, + "grad_norm": 8.119994163513184, + "learning_rate": 7.12619306310332e-06, + "loss": 0.1857, + "step": 14487 + }, + { + "epoch": 0.3666270212819799, + "grad_norm": 5.251693248748779, + "learning_rate": 7.125829645714232e-06, + "loss": 0.1584, + "step": 14488 + }, + { + "epoch": 0.3666523268466736, + "grad_norm": 4.088431358337402, + "learning_rate": 7.125466214616066e-06, + "loss": 0.1852, + "step": 14489 + }, + { + "epoch": 0.36667763241136725, + "grad_norm": 4.11200475692749, + "learning_rate": 7.125102769811161e-06, + "loss": 0.1673, + "step": 14490 + }, + { + "epoch": 0.36670293797606096, + "grad_norm": 13.103554725646973, + "learning_rate": 7.124739311301866e-06, + "loss": 0.2642, + "step": 14491 + }, + { + "epoch": 0.3667282435407546, + "grad_norm": 3.319040060043335, + "learning_rate": 7.124375839090521e-06, + "loss": 0.0891, + "step": 14492 + }, + { + "epoch": 0.3667535491054483, + "grad_norm": 5.226715087890625, + "learning_rate": 7.124012353179471e-06, + "loss": 0.1627, + "step": 14493 + }, + { + "epoch": 0.366778854670142, + "grad_norm": 4.190304756164551, + "learning_rate": 7.123648853571063e-06, + "loss": 0.2073, + "step": 14494 + }, + { + "epoch": 0.36680416023483564, + "grad_norm": 5.304349899291992, + "learning_rate": 7.123285340267636e-06, + "loss": 0.1952, + "step": 14495 + }, + { + "epoch": 0.3668294657995293, + "grad_norm": 4.200331211090088, + "learning_rate": 7.122921813271537e-06, + "loss": 0.1436, + "step": 14496 + }, + { + "epoch": 0.366854771364223, + "grad_norm": 6.251322269439697, + "learning_rate": 7.122558272585111e-06, + "loss": 0.2145, + "step": 14497 + }, + { + "epoch": 0.36688007692891667, + "grad_norm": 11.321024894714355, + "learning_rate": 7.122194718210702e-06, + "loss": 0.3241, + "step": 14498 + }, + { + "epoch": 0.3669053824936103, + "grad_norm": 12.224210739135742, + "learning_rate": 7.121831150150653e-06, + "loss": 0.1559, + "step": 14499 + }, + { + "epoch": 0.36693068805830403, + "grad_norm": 3.954035520553589, + "learning_rate": 7.12146756840731e-06, + "loss": 0.124, + "step": 14500 + }, + { + "epoch": 0.3669559936229977, + "grad_norm": 9.364997863769531, + "learning_rate": 7.121103972983018e-06, + "loss": 0.1667, + "step": 14501 + }, + { + "epoch": 0.3669812991876914, + "grad_norm": 5.065418720245361, + "learning_rate": 7.120740363880121e-06, + "loss": 0.2253, + "step": 14502 + }, + { + "epoch": 0.36700660475238506, + "grad_norm": 5.086967468261719, + "learning_rate": 7.120376741100962e-06, + "loss": 0.2186, + "step": 14503 + }, + { + "epoch": 0.3670319103170787, + "grad_norm": 15.960570335388184, + "learning_rate": 7.120013104647888e-06, + "loss": 0.2622, + "step": 14504 + }, + { + "epoch": 0.3670572158817724, + "grad_norm": 3.4801204204559326, + "learning_rate": 7.119649454523245e-06, + "loss": 0.1585, + "step": 14505 + }, + { + "epoch": 0.3670825214464661, + "grad_norm": 7.309345245361328, + "learning_rate": 7.119285790729377e-06, + "loss": 0.2782, + "step": 14506 + }, + { + "epoch": 0.36710782701115974, + "grad_norm": 8.765371322631836, + "learning_rate": 7.1189221132686295e-06, + "loss": 0.26, + "step": 14507 + }, + { + "epoch": 0.36713313257585345, + "grad_norm": 3.2275490760803223, + "learning_rate": 7.118558422143347e-06, + "loss": 0.143, + "step": 14508 + }, + { + "epoch": 0.3671584381405471, + "grad_norm": 4.389556407928467, + "learning_rate": 7.118194717355875e-06, + "loss": 0.1506, + "step": 14509 + }, + { + "epoch": 0.36718374370524076, + "grad_norm": 6.0007004737854, + "learning_rate": 7.11783099890856e-06, + "loss": 0.1371, + "step": 14510 + }, + { + "epoch": 0.3672090492699345, + "grad_norm": 8.87425708770752, + "learning_rate": 7.117467266803746e-06, + "loss": 0.1939, + "step": 14511 + }, + { + "epoch": 0.36723435483462813, + "grad_norm": 4.8898210525512695, + "learning_rate": 7.117103521043779e-06, + "loss": 0.2482, + "step": 14512 + }, + { + "epoch": 0.3672596603993218, + "grad_norm": 6.1367950439453125, + "learning_rate": 7.116739761631006e-06, + "loss": 0.1664, + "step": 14513 + }, + { + "epoch": 0.3672849659640155, + "grad_norm": 4.379637718200684, + "learning_rate": 7.116375988567772e-06, + "loss": 0.1435, + "step": 14514 + }, + { + "epoch": 0.36731027152870915, + "grad_norm": 4.9583024978637695, + "learning_rate": 7.116012201856421e-06, + "loss": 0.227, + "step": 14515 + }, + { + "epoch": 0.36733557709340287, + "grad_norm": 5.07706880569458, + "learning_rate": 7.1156484014993046e-06, + "loss": 0.1074, + "step": 14516 + }, + { + "epoch": 0.3673608826580965, + "grad_norm": 9.595906257629395, + "learning_rate": 7.115284587498763e-06, + "loss": 0.2075, + "step": 14517 + }, + { + "epoch": 0.3673861882227902, + "grad_norm": 4.51278829574585, + "learning_rate": 7.114920759857145e-06, + "loss": 0.189, + "step": 14518 + }, + { + "epoch": 0.3674114937874839, + "grad_norm": 3.4256439208984375, + "learning_rate": 7.114556918576796e-06, + "loss": 0.1523, + "step": 14519 + }, + { + "epoch": 0.36743679935217755, + "grad_norm": 7.615924835205078, + "learning_rate": 7.114193063660064e-06, + "loss": 0.1462, + "step": 14520 + }, + { + "epoch": 0.3674621049168712, + "grad_norm": 7.662829399108887, + "learning_rate": 7.113829195109293e-06, + "loss": 0.2765, + "step": 14521 + }, + { + "epoch": 0.3674874104815649, + "grad_norm": 6.822933197021484, + "learning_rate": 7.113465312926832e-06, + "loss": 0.1755, + "step": 14522 + }, + { + "epoch": 0.36751271604625857, + "grad_norm": 7.307027816772461, + "learning_rate": 7.113101417115025e-06, + "loss": 0.2342, + "step": 14523 + }, + { + "epoch": 0.3675380216109522, + "grad_norm": 5.045020580291748, + "learning_rate": 7.11273750767622e-06, + "loss": 0.1234, + "step": 14524 + }, + { + "epoch": 0.36756332717564594, + "grad_norm": 8.681553840637207, + "learning_rate": 7.112373584612764e-06, + "loss": 0.2587, + "step": 14525 + }, + { + "epoch": 0.3675886327403396, + "grad_norm": 4.326169490814209, + "learning_rate": 7.112009647927005e-06, + "loss": 0.0786, + "step": 14526 + }, + { + "epoch": 0.36761393830503325, + "grad_norm": 6.7851176261901855, + "learning_rate": 7.111645697621287e-06, + "loss": 0.2816, + "step": 14527 + }, + { + "epoch": 0.36763924386972696, + "grad_norm": 10.540407180786133, + "learning_rate": 7.111281733697959e-06, + "loss": 0.2304, + "step": 14528 + }, + { + "epoch": 0.3676645494344206, + "grad_norm": 3.0978550910949707, + "learning_rate": 7.110917756159369e-06, + "loss": 0.1657, + "step": 14529 + }, + { + "epoch": 0.36768985499911433, + "grad_norm": 4.83272123336792, + "learning_rate": 7.110553765007861e-06, + "loss": 0.186, + "step": 14530 + }, + { + "epoch": 0.367715160563808, + "grad_norm": 7.184851169586182, + "learning_rate": 7.110189760245787e-06, + "loss": 0.2079, + "step": 14531 + }, + { + "epoch": 0.36774046612850164, + "grad_norm": 4.828956604003906, + "learning_rate": 7.109825741875489e-06, + "loss": 0.2114, + "step": 14532 + }, + { + "epoch": 0.36776577169319535, + "grad_norm": 20.101778030395508, + "learning_rate": 7.109461709899319e-06, + "loss": 0.1868, + "step": 14533 + }, + { + "epoch": 0.367791077257889, + "grad_norm": 6.708409309387207, + "learning_rate": 7.109097664319622e-06, + "loss": 0.1889, + "step": 14534 + }, + { + "epoch": 0.36781638282258267, + "grad_norm": 3.504366159439087, + "learning_rate": 7.108733605138747e-06, + "loss": 0.1185, + "step": 14535 + }, + { + "epoch": 0.3678416883872764, + "grad_norm": 5.6326704025268555, + "learning_rate": 7.108369532359041e-06, + "loss": 0.1751, + "step": 14536 + }, + { + "epoch": 0.36786699395197003, + "grad_norm": 3.78764009475708, + "learning_rate": 7.108005445982852e-06, + "loss": 0.1298, + "step": 14537 + }, + { + "epoch": 0.3678922995166637, + "grad_norm": 6.049242973327637, + "learning_rate": 7.107641346012528e-06, + "loss": 0.1319, + "step": 14538 + }, + { + "epoch": 0.3679176050813574, + "grad_norm": 3.6243033409118652, + "learning_rate": 7.107277232450417e-06, + "loss": 0.1333, + "step": 14539 + }, + { + "epoch": 0.36794291064605106, + "grad_norm": 4.047354698181152, + "learning_rate": 7.106913105298868e-06, + "loss": 0.1406, + "step": 14540 + }, + { + "epoch": 0.36796821621074477, + "grad_norm": 5.399641990661621, + "learning_rate": 7.106548964560227e-06, + "loss": 0.1245, + "step": 14541 + }, + { + "epoch": 0.3679935217754384, + "grad_norm": 3.3903186321258545, + "learning_rate": 7.106184810236844e-06, + "loss": 0.1537, + "step": 14542 + }, + { + "epoch": 0.3680188273401321, + "grad_norm": 12.271658897399902, + "learning_rate": 7.105820642331067e-06, + "loss": 0.1185, + "step": 14543 + }, + { + "epoch": 0.3680441329048258, + "grad_norm": 5.56791877746582, + "learning_rate": 7.105456460845246e-06, + "loss": 0.1723, + "step": 14544 + }, + { + "epoch": 0.36806943846951945, + "grad_norm": 4.836367130279541, + "learning_rate": 7.105092265781726e-06, + "loss": 0.1555, + "step": 14545 + }, + { + "epoch": 0.3680947440342131, + "grad_norm": 5.100318908691406, + "learning_rate": 7.104728057142857e-06, + "loss": 0.1965, + "step": 14546 + }, + { + "epoch": 0.3681200495989068, + "grad_norm": 6.11450719833374, + "learning_rate": 7.104363834930989e-06, + "loss": 0.1831, + "step": 14547 + }, + { + "epoch": 0.3681453551636005, + "grad_norm": 4.922724723815918, + "learning_rate": 7.10399959914847e-06, + "loss": 0.1609, + "step": 14548 + }, + { + "epoch": 0.36817066072829413, + "grad_norm": 9.719407081604004, + "learning_rate": 7.10363534979765e-06, + "loss": 0.2567, + "step": 14549 + }, + { + "epoch": 0.36819596629298784, + "grad_norm": 5.277806758880615, + "learning_rate": 7.103271086880876e-06, + "loss": 0.2294, + "step": 14550 + }, + { + "epoch": 0.3682212718576815, + "grad_norm": 7.773805141448975, + "learning_rate": 7.1029068104004985e-06, + "loss": 0.2233, + "step": 14551 + }, + { + "epoch": 0.36824657742237515, + "grad_norm": 11.122552871704102, + "learning_rate": 7.102542520358865e-06, + "loss": 0.3092, + "step": 14552 + }, + { + "epoch": 0.36827188298706887, + "grad_norm": 7.896976470947266, + "learning_rate": 7.1021782167583275e-06, + "loss": 0.1685, + "step": 14553 + }, + { + "epoch": 0.3682971885517625, + "grad_norm": 5.860850811004639, + "learning_rate": 7.101813899601233e-06, + "loss": 0.2231, + "step": 14554 + }, + { + "epoch": 0.36832249411645623, + "grad_norm": 3.435516834259033, + "learning_rate": 7.101449568889931e-06, + "loss": 0.1484, + "step": 14555 + }, + { + "epoch": 0.3683477996811499, + "grad_norm": 3.6440510749816895, + "learning_rate": 7.1010852246267725e-06, + "loss": 0.1354, + "step": 14556 + }, + { + "epoch": 0.36837310524584355, + "grad_norm": 6.147468090057373, + "learning_rate": 7.100720866814106e-06, + "loss": 0.266, + "step": 14557 + }, + { + "epoch": 0.36839841081053726, + "grad_norm": 2.308074951171875, + "learning_rate": 7.1003564954542825e-06, + "loss": 0.0864, + "step": 14558 + }, + { + "epoch": 0.3684237163752309, + "grad_norm": 8.821622848510742, + "learning_rate": 7.099992110549648e-06, + "loss": 0.2083, + "step": 14559 + }, + { + "epoch": 0.36844902193992457, + "grad_norm": 3.4851346015930176, + "learning_rate": 7.099627712102557e-06, + "loss": 0.1411, + "step": 14560 + }, + { + "epoch": 0.3684743275046183, + "grad_norm": 10.408310890197754, + "learning_rate": 7.099263300115358e-06, + "loss": 0.1947, + "step": 14561 + }, + { + "epoch": 0.36849963306931194, + "grad_norm": 5.517858982086182, + "learning_rate": 7.098898874590399e-06, + "loss": 0.1877, + "step": 14562 + }, + { + "epoch": 0.3685249386340056, + "grad_norm": 7.8660197257995605, + "learning_rate": 7.098534435530031e-06, + "loss": 0.2289, + "step": 14563 + }, + { + "epoch": 0.3685502441986993, + "grad_norm": 12.40723991394043, + "learning_rate": 7.0981699829366065e-06, + "loss": 0.1815, + "step": 14564 + }, + { + "epoch": 0.36857554976339296, + "grad_norm": 2.990736484527588, + "learning_rate": 7.0978055168124725e-06, + "loss": 0.1187, + "step": 14565 + }, + { + "epoch": 0.3686008553280867, + "grad_norm": 3.9879724979400635, + "learning_rate": 7.097441037159982e-06, + "loss": 0.1498, + "step": 14566 + }, + { + "epoch": 0.36862616089278033, + "grad_norm": 3.9725186824798584, + "learning_rate": 7.0970765439814845e-06, + "loss": 0.1533, + "step": 14567 + }, + { + "epoch": 0.368651466457474, + "grad_norm": 8.461746215820312, + "learning_rate": 7.09671203727933e-06, + "loss": 0.1882, + "step": 14568 + }, + { + "epoch": 0.3686767720221677, + "grad_norm": 5.66470193862915, + "learning_rate": 7.096347517055871e-06, + "loss": 0.2423, + "step": 14569 + }, + { + "epoch": 0.36870207758686135, + "grad_norm": 8.622276306152344, + "learning_rate": 7.095982983313454e-06, + "loss": 0.2738, + "step": 14570 + }, + { + "epoch": 0.368727383151555, + "grad_norm": 11.3418607711792, + "learning_rate": 7.095618436054434e-06, + "loss": 0.2625, + "step": 14571 + }, + { + "epoch": 0.3687526887162487, + "grad_norm": 3.6310741901397705, + "learning_rate": 7.095253875281161e-06, + "loss": 0.1705, + "step": 14572 + }, + { + "epoch": 0.3687779942809424, + "grad_norm": 4.374404430389404, + "learning_rate": 7.0948893009959856e-06, + "loss": 0.1449, + "step": 14573 + }, + { + "epoch": 0.36880329984563603, + "grad_norm": 4.849564075469971, + "learning_rate": 7.094524713201258e-06, + "loss": 0.1854, + "step": 14574 + }, + { + "epoch": 0.36882860541032975, + "grad_norm": 8.046095848083496, + "learning_rate": 7.094160111899331e-06, + "loss": 0.2311, + "step": 14575 + }, + { + "epoch": 0.3688539109750234, + "grad_norm": 3.7621116638183594, + "learning_rate": 7.093795497092553e-06, + "loss": 0.1682, + "step": 14576 + }, + { + "epoch": 0.36887921653971706, + "grad_norm": 6.69598913192749, + "learning_rate": 7.09343086878328e-06, + "loss": 0.2148, + "step": 14577 + }, + { + "epoch": 0.36890452210441077, + "grad_norm": 5.366689205169678, + "learning_rate": 7.093066226973859e-06, + "loss": 0.1445, + "step": 14578 + }, + { + "epoch": 0.3689298276691044, + "grad_norm": 2.7600460052490234, + "learning_rate": 7.0927015716666435e-06, + "loss": 0.1575, + "step": 14579 + }, + { + "epoch": 0.36895513323379814, + "grad_norm": 6.574641227722168, + "learning_rate": 7.0923369028639855e-06, + "loss": 0.2041, + "step": 14580 + }, + { + "epoch": 0.3689804387984918, + "grad_norm": 3.885110378265381, + "learning_rate": 7.091972220568235e-06, + "loss": 0.1799, + "step": 14581 + }, + { + "epoch": 0.36900574436318545, + "grad_norm": 3.5249083042144775, + "learning_rate": 7.091607524781746e-06, + "loss": 0.1783, + "step": 14582 + }, + { + "epoch": 0.36903104992787916, + "grad_norm": 6.11361837387085, + "learning_rate": 7.091242815506868e-06, + "loss": 0.1441, + "step": 14583 + }, + { + "epoch": 0.3690563554925728, + "grad_norm": 8.688329696655273, + "learning_rate": 7.0908780927459555e-06, + "loss": 0.1713, + "step": 14584 + }, + { + "epoch": 0.3690816610572665, + "grad_norm": 7.452057361602783, + "learning_rate": 7.090513356501359e-06, + "loss": 0.2237, + "step": 14585 + }, + { + "epoch": 0.3691069666219602, + "grad_norm": 3.814182758331299, + "learning_rate": 7.090148606775429e-06, + "loss": 0.0999, + "step": 14586 + }, + { + "epoch": 0.36913227218665384, + "grad_norm": 5.630589008331299, + "learning_rate": 7.089783843570521e-06, + "loss": 0.1804, + "step": 14587 + }, + { + "epoch": 0.3691575777513475, + "grad_norm": 3.6252591609954834, + "learning_rate": 7.089419066888985e-06, + "loss": 0.2084, + "step": 14588 + }, + { + "epoch": 0.3691828833160412, + "grad_norm": 7.897738933563232, + "learning_rate": 7.089054276733174e-06, + "loss": 0.2103, + "step": 14589 + }, + { + "epoch": 0.36920818888073487, + "grad_norm": 7.217687606811523, + "learning_rate": 7.088689473105441e-06, + "loss": 0.186, + "step": 14590 + }, + { + "epoch": 0.3692334944454285, + "grad_norm": 7.469285488128662, + "learning_rate": 7.088324656008138e-06, + "loss": 0.2343, + "step": 14591 + }, + { + "epoch": 0.36925880001012223, + "grad_norm": 5.25341796875, + "learning_rate": 7.087959825443617e-06, + "loss": 0.1721, + "step": 14592 + }, + { + "epoch": 0.3692841055748159, + "grad_norm": 3.593125104904175, + "learning_rate": 7.087594981414233e-06, + "loss": 0.2103, + "step": 14593 + }, + { + "epoch": 0.3693094111395096, + "grad_norm": 4.967260360717773, + "learning_rate": 7.0872301239223365e-06, + "loss": 0.2144, + "step": 14594 + }, + { + "epoch": 0.36933471670420326, + "grad_norm": 7.604156970977783, + "learning_rate": 7.086865252970281e-06, + "loss": 0.2238, + "step": 14595 + }, + { + "epoch": 0.3693600222688969, + "grad_norm": 3.3371951580047607, + "learning_rate": 7.086500368560419e-06, + "loss": 0.1272, + "step": 14596 + }, + { + "epoch": 0.3693853278335906, + "grad_norm": 3.321139335632324, + "learning_rate": 7.086135470695106e-06, + "loss": 0.1196, + "step": 14597 + }, + { + "epoch": 0.3694106333982843, + "grad_norm": 4.444307327270508, + "learning_rate": 7.085770559376692e-06, + "loss": 0.126, + "step": 14598 + }, + { + "epoch": 0.36943593896297794, + "grad_norm": 6.830066204071045, + "learning_rate": 7.085405634607532e-06, + "loss": 0.1971, + "step": 14599 + }, + { + "epoch": 0.36946124452767165, + "grad_norm": 7.689294338226318, + "learning_rate": 7.085040696389978e-06, + "loss": 0.2932, + "step": 14600 + }, + { + "epoch": 0.3694865500923653, + "grad_norm": 5.212336540222168, + "learning_rate": 7.084675744726386e-06, + "loss": 0.2318, + "step": 14601 + }, + { + "epoch": 0.36951185565705896, + "grad_norm": 3.5519065856933594, + "learning_rate": 7.084310779619108e-06, + "loss": 0.1828, + "step": 14602 + }, + { + "epoch": 0.3695371612217527, + "grad_norm": 4.435414791107178, + "learning_rate": 7.0839458010704965e-06, + "loss": 0.2334, + "step": 14603 + }, + { + "epoch": 0.36956246678644633, + "grad_norm": 5.040058135986328, + "learning_rate": 7.083580809082906e-06, + "loss": 0.1557, + "step": 14604 + }, + { + "epoch": 0.36958777235114004, + "grad_norm": 4.155810356140137, + "learning_rate": 7.083215803658692e-06, + "loss": 0.1475, + "step": 14605 + }, + { + "epoch": 0.3696130779158337, + "grad_norm": 4.484760284423828, + "learning_rate": 7.082850784800206e-06, + "loss": 0.2377, + "step": 14606 + }, + { + "epoch": 0.36963838348052735, + "grad_norm": 3.133566379547119, + "learning_rate": 7.0824857525098015e-06, + "loss": 0.1806, + "step": 14607 + }, + { + "epoch": 0.36966368904522107, + "grad_norm": 16.14387321472168, + "learning_rate": 7.082120706789834e-06, + "loss": 0.2381, + "step": 14608 + }, + { + "epoch": 0.3696889946099147, + "grad_norm": 6.339475631713867, + "learning_rate": 7.0817556476426586e-06, + "loss": 0.1357, + "step": 14609 + }, + { + "epoch": 0.3697143001746084, + "grad_norm": 10.225591659545898, + "learning_rate": 7.081390575070628e-06, + "loss": 0.1292, + "step": 14610 + }, + { + "epoch": 0.3697396057393021, + "grad_norm": 15.489540100097656, + "learning_rate": 7.081025489076097e-06, + "loss": 0.1898, + "step": 14611 + }, + { + "epoch": 0.36976491130399575, + "grad_norm": 3.3419249057769775, + "learning_rate": 7.0806603896614176e-06, + "loss": 0.116, + "step": 14612 + }, + { + "epoch": 0.3697902168686894, + "grad_norm": 4.3215179443359375, + "learning_rate": 7.080295276828949e-06, + "loss": 0.176, + "step": 14613 + }, + { + "epoch": 0.3698155224333831, + "grad_norm": 2.5904417037963867, + "learning_rate": 7.079930150581043e-06, + "loss": 0.1402, + "step": 14614 + }, + { + "epoch": 0.36984082799807677, + "grad_norm": 4.006889820098877, + "learning_rate": 7.079565010920053e-06, + "loss": 0.2333, + "step": 14615 + }, + { + "epoch": 0.3698661335627704, + "grad_norm": 5.7575531005859375, + "learning_rate": 7.079199857848335e-06, + "loss": 0.1704, + "step": 14616 + }, + { + "epoch": 0.36989143912746414, + "grad_norm": 7.715060710906982, + "learning_rate": 7.078834691368245e-06, + "loss": 0.2328, + "step": 14617 + }, + { + "epoch": 0.3699167446921578, + "grad_norm": 4.040050983428955, + "learning_rate": 7.078469511482136e-06, + "loss": 0.1602, + "step": 14618 + }, + { + "epoch": 0.3699420502568515, + "grad_norm": 14.297908782958984, + "learning_rate": 7.078104318192364e-06, + "loss": 0.2359, + "step": 14619 + }, + { + "epoch": 0.36996735582154516, + "grad_norm": 12.542035102844238, + "learning_rate": 7.0777391115012835e-06, + "loss": 0.2412, + "step": 14620 + }, + { + "epoch": 0.3699926613862388, + "grad_norm": 5.415945053100586, + "learning_rate": 7.07737389141125e-06, + "loss": 0.1483, + "step": 14621 + }, + { + "epoch": 0.37001796695093253, + "grad_norm": 10.89547061920166, + "learning_rate": 7.07700865792462e-06, + "loss": 0.2788, + "step": 14622 + }, + { + "epoch": 0.3700432725156262, + "grad_norm": 5.903773784637451, + "learning_rate": 7.076643411043747e-06, + "loss": 0.2055, + "step": 14623 + }, + { + "epoch": 0.37006857808031984, + "grad_norm": 8.956474304199219, + "learning_rate": 7.0762781507709855e-06, + "loss": 0.2388, + "step": 14624 + }, + { + "epoch": 0.37009388364501355, + "grad_norm": 6.107313632965088, + "learning_rate": 7.0759128771086924e-06, + "loss": 0.2007, + "step": 14625 + }, + { + "epoch": 0.3701191892097072, + "grad_norm": 4.768820285797119, + "learning_rate": 7.075547590059225e-06, + "loss": 0.1871, + "step": 14626 + }, + { + "epoch": 0.37014449477440087, + "grad_norm": 6.524638652801514, + "learning_rate": 7.075182289624937e-06, + "loss": 0.1497, + "step": 14627 + }, + { + "epoch": 0.3701698003390946, + "grad_norm": 3.197211742401123, + "learning_rate": 7.074816975808183e-06, + "loss": 0.1306, + "step": 14628 + }, + { + "epoch": 0.37019510590378824, + "grad_norm": 3.923525094985962, + "learning_rate": 7.074451648611321e-06, + "loss": 0.149, + "step": 14629 + }, + { + "epoch": 0.37022041146848195, + "grad_norm": 2.8854382038116455, + "learning_rate": 7.0740863080367075e-06, + "loss": 0.154, + "step": 14630 + }, + { + "epoch": 0.3702457170331756, + "grad_norm": 5.273980140686035, + "learning_rate": 7.073720954086696e-06, + "loss": 0.2015, + "step": 14631 + }, + { + "epoch": 0.37027102259786926, + "grad_norm": 33.0447883605957, + "learning_rate": 7.073355586763643e-06, + "loss": 0.4353, + "step": 14632 + }, + { + "epoch": 0.37029632816256297, + "grad_norm": 3.188835382461548, + "learning_rate": 7.072990206069907e-06, + "loss": 0.1262, + "step": 14633 + }, + { + "epoch": 0.3703216337272566, + "grad_norm": 4.500261306762695, + "learning_rate": 7.0726248120078415e-06, + "loss": 0.229, + "step": 14634 + }, + { + "epoch": 0.3703469392919503, + "grad_norm": 5.494414329528809, + "learning_rate": 7.072259404579804e-06, + "loss": 0.2457, + "step": 14635 + }, + { + "epoch": 0.370372244856644, + "grad_norm": 4.759035587310791, + "learning_rate": 7.071893983788151e-06, + "loss": 0.2219, + "step": 14636 + }, + { + "epoch": 0.37039755042133765, + "grad_norm": 6.361189365386963, + "learning_rate": 7.071528549635242e-06, + "loss": 0.2329, + "step": 14637 + }, + { + "epoch": 0.3704228559860313, + "grad_norm": 3.833136796951294, + "learning_rate": 7.071163102123428e-06, + "loss": 0.1711, + "step": 14638 + }, + { + "epoch": 0.370448161550725, + "grad_norm": 2.964674711227417, + "learning_rate": 7.070797641255069e-06, + "loss": 0.1538, + "step": 14639 + }, + { + "epoch": 0.3704734671154187, + "grad_norm": 9.14280891418457, + "learning_rate": 7.070432167032521e-06, + "loss": 0.1919, + "step": 14640 + }, + { + "epoch": 0.37049877268011233, + "grad_norm": 6.556356906890869, + "learning_rate": 7.070066679458142e-06, + "loss": 0.1941, + "step": 14641 + }, + { + "epoch": 0.37052407824480604, + "grad_norm": 6.290951251983643, + "learning_rate": 7.069701178534287e-06, + "loss": 0.1959, + "step": 14642 + }, + { + "epoch": 0.3705493838094997, + "grad_norm": 5.487074375152588, + "learning_rate": 7.069335664263315e-06, + "loss": 0.1567, + "step": 14643 + }, + { + "epoch": 0.3705746893741934, + "grad_norm": 2.4436888694763184, + "learning_rate": 7.068970136647582e-06, + "loss": 0.0824, + "step": 14644 + }, + { + "epoch": 0.37059999493888707, + "grad_norm": 8.366844177246094, + "learning_rate": 7.068604595689445e-06, + "loss": 0.2166, + "step": 14645 + }, + { + "epoch": 0.3706253005035807, + "grad_norm": 4.570041179656982, + "learning_rate": 7.068239041391262e-06, + "loss": 0.1604, + "step": 14646 + }, + { + "epoch": 0.37065060606827444, + "grad_norm": 4.215144634246826, + "learning_rate": 7.06787347375539e-06, + "loss": 0.2111, + "step": 14647 + }, + { + "epoch": 0.3706759116329681, + "grad_norm": 7.05499792098999, + "learning_rate": 7.067507892784186e-06, + "loss": 0.1927, + "step": 14648 + }, + { + "epoch": 0.37070121719766175, + "grad_norm": 4.166233539581299, + "learning_rate": 7.06714229848001e-06, + "loss": 0.1737, + "step": 14649 + }, + { + "epoch": 0.37072652276235546, + "grad_norm": 10.100103378295898, + "learning_rate": 7.066776690845216e-06, + "loss": 0.3099, + "step": 14650 + }, + { + "epoch": 0.3707518283270491, + "grad_norm": 5.827634334564209, + "learning_rate": 7.0664110698821645e-06, + "loss": 0.1638, + "step": 14651 + }, + { + "epoch": 0.37077713389174277, + "grad_norm": 5.7162766456604, + "learning_rate": 7.066045435593211e-06, + "loss": 0.2403, + "step": 14652 + }, + { + "epoch": 0.3708024394564365, + "grad_norm": 6.855136394500732, + "learning_rate": 7.065679787980717e-06, + "loss": 0.1707, + "step": 14653 + }, + { + "epoch": 0.37082774502113014, + "grad_norm": 9.732962608337402, + "learning_rate": 7.0653141270470375e-06, + "loss": 0.2883, + "step": 14654 + }, + { + "epoch": 0.3708530505858238, + "grad_norm": 4.2407636642456055, + "learning_rate": 7.064948452794531e-06, + "loss": 0.1632, + "step": 14655 + }, + { + "epoch": 0.3708783561505175, + "grad_norm": 9.168915748596191, + "learning_rate": 7.064582765225556e-06, + "loss": 0.157, + "step": 14656 + }, + { + "epoch": 0.37090366171521116, + "grad_norm": 2.5427613258361816, + "learning_rate": 7.064217064342472e-06, + "loss": 0.071, + "step": 14657 + }, + { + "epoch": 0.3709289672799049, + "grad_norm": 4.167209148406982, + "learning_rate": 7.063851350147635e-06, + "loss": 0.1464, + "step": 14658 + }, + { + "epoch": 0.37095427284459853, + "grad_norm": 3.733994722366333, + "learning_rate": 7.0634856226434045e-06, + "loss": 0.197, + "step": 14659 + }, + { + "epoch": 0.3709795784092922, + "grad_norm": 2.761453866958618, + "learning_rate": 7.063119881832139e-06, + "loss": 0.0793, + "step": 14660 + }, + { + "epoch": 0.3710048839739859, + "grad_norm": 7.933006286621094, + "learning_rate": 7.062754127716197e-06, + "loss": 0.201, + "step": 14661 + }, + { + "epoch": 0.37103018953867956, + "grad_norm": 6.764613151550293, + "learning_rate": 7.062388360297938e-06, + "loss": 0.1871, + "step": 14662 + }, + { + "epoch": 0.3710554951033732, + "grad_norm": 12.394457817077637, + "learning_rate": 7.06202257957972e-06, + "loss": 0.2095, + "step": 14663 + }, + { + "epoch": 0.3710808006680669, + "grad_norm": 3.8519351482391357, + "learning_rate": 7.0616567855639e-06, + "loss": 0.1453, + "step": 14664 + }, + { + "epoch": 0.3711061062327606, + "grad_norm": 5.092653274536133, + "learning_rate": 7.061290978252842e-06, + "loss": 0.1888, + "step": 14665 + }, + { + "epoch": 0.37113141179745424, + "grad_norm": 3.722374677658081, + "learning_rate": 7.0609251576489e-06, + "loss": 0.0922, + "step": 14666 + }, + { + "epoch": 0.37115671736214795, + "grad_norm": 3.2534360885620117, + "learning_rate": 7.060559323754436e-06, + "loss": 0.1284, + "step": 14667 + }, + { + "epoch": 0.3711820229268416, + "grad_norm": 3.2203009128570557, + "learning_rate": 7.060193476571808e-06, + "loss": 0.1469, + "step": 14668 + }, + { + "epoch": 0.3712073284915353, + "grad_norm": 6.694163799285889, + "learning_rate": 7.059827616103374e-06, + "loss": 0.1982, + "step": 14669 + }, + { + "epoch": 0.37123263405622897, + "grad_norm": 11.951541900634766, + "learning_rate": 7.059461742351497e-06, + "loss": 0.2704, + "step": 14670 + }, + { + "epoch": 0.37125793962092263, + "grad_norm": 4.78844690322876, + "learning_rate": 7.059095855318533e-06, + "loss": 0.1662, + "step": 14671 + }, + { + "epoch": 0.37128324518561634, + "grad_norm": 5.008557319641113, + "learning_rate": 7.058729955006843e-06, + "loss": 0.1819, + "step": 14672 + }, + { + "epoch": 0.37130855075031, + "grad_norm": 6.8100666999816895, + "learning_rate": 7.058364041418787e-06, + "loss": 0.2249, + "step": 14673 + }, + { + "epoch": 0.37133385631500365, + "grad_norm": 7.776528358459473, + "learning_rate": 7.057998114556725e-06, + "loss": 0.1951, + "step": 14674 + }, + { + "epoch": 0.37135916187969736, + "grad_norm": 6.486180305480957, + "learning_rate": 7.057632174423015e-06, + "loss": 0.2474, + "step": 14675 + }, + { + "epoch": 0.371384467444391, + "grad_norm": 4.6446027755737305, + "learning_rate": 7.0572662210200174e-06, + "loss": 0.1656, + "step": 14676 + }, + { + "epoch": 0.3714097730090847, + "grad_norm": 4.281264781951904, + "learning_rate": 7.056900254350093e-06, + "loss": 0.1291, + "step": 14677 + }, + { + "epoch": 0.3714350785737784, + "grad_norm": 2.6075925827026367, + "learning_rate": 7.056534274415601e-06, + "loss": 0.1252, + "step": 14678 + }, + { + "epoch": 0.37146038413847204, + "grad_norm": 5.9319281578063965, + "learning_rate": 7.056168281218904e-06, + "loss": 0.1576, + "step": 14679 + }, + { + "epoch": 0.3714856897031657, + "grad_norm": 4.0732221603393555, + "learning_rate": 7.0558022747623574e-06, + "loss": 0.1258, + "step": 14680 + }, + { + "epoch": 0.3715109952678594, + "grad_norm": 6.618837833404541, + "learning_rate": 7.055436255048327e-06, + "loss": 0.2829, + "step": 14681 + }, + { + "epoch": 0.37153630083255307, + "grad_norm": 3.4388139247894287, + "learning_rate": 7.055070222079168e-06, + "loss": 0.1916, + "step": 14682 + }, + { + "epoch": 0.3715616063972468, + "grad_norm": 9.089104652404785, + "learning_rate": 7.054704175857246e-06, + "loss": 0.2814, + "step": 14683 + }, + { + "epoch": 0.37158691196194044, + "grad_norm": 5.536989688873291, + "learning_rate": 7.054338116384916e-06, + "loss": 0.2358, + "step": 14684 + }, + { + "epoch": 0.3716122175266341, + "grad_norm": 4.394877910614014, + "learning_rate": 7.053972043664543e-06, + "loss": 0.2062, + "step": 14685 + }, + { + "epoch": 0.3716375230913278, + "grad_norm": 3.961885929107666, + "learning_rate": 7.053605957698486e-06, + "loss": 0.0802, + "step": 14686 + }, + { + "epoch": 0.37166282865602146, + "grad_norm": 4.797359466552734, + "learning_rate": 7.053239858489107e-06, + "loss": 0.1926, + "step": 14687 + }, + { + "epoch": 0.3716881342207151, + "grad_norm": 6.2561163902282715, + "learning_rate": 7.052873746038766e-06, + "loss": 0.2545, + "step": 14688 + }, + { + "epoch": 0.37171343978540883, + "grad_norm": 41.272151947021484, + "learning_rate": 7.0525076203498245e-06, + "loss": 0.192, + "step": 14689 + }, + { + "epoch": 0.3717387453501025, + "grad_norm": 2.5712201595306396, + "learning_rate": 7.052141481424643e-06, + "loss": 0.0659, + "step": 14690 + }, + { + "epoch": 0.37176405091479614, + "grad_norm": 5.214028358459473, + "learning_rate": 7.051775329265581e-06, + "loss": 0.2335, + "step": 14691 + }, + { + "epoch": 0.37178935647948985, + "grad_norm": 11.925971984863281, + "learning_rate": 7.051409163875004e-06, + "loss": 0.2478, + "step": 14692 + }, + { + "epoch": 0.3718146620441835, + "grad_norm": 2.9256162643432617, + "learning_rate": 7.051042985255269e-06, + "loss": 0.0798, + "step": 14693 + }, + { + "epoch": 0.37183996760887716, + "grad_norm": 8.033437728881836, + "learning_rate": 7.050676793408741e-06, + "loss": 0.1982, + "step": 14694 + }, + { + "epoch": 0.3718652731735709, + "grad_norm": 10.108537673950195, + "learning_rate": 7.050310588337777e-06, + "loss": 0.324, + "step": 14695 + }, + { + "epoch": 0.37189057873826453, + "grad_norm": 10.7552490234375, + "learning_rate": 7.0499443700447435e-06, + "loss": 0.2904, + "step": 14696 + }, + { + "epoch": 0.37191588430295824, + "grad_norm": 3.351033926010132, + "learning_rate": 7.0495781385320006e-06, + "loss": 0.211, + "step": 14697 + }, + { + "epoch": 0.3719411898676519, + "grad_norm": 4.8322834968566895, + "learning_rate": 7.049211893801908e-06, + "loss": 0.152, + "step": 14698 + }, + { + "epoch": 0.37196649543234556, + "grad_norm": 10.890924453735352, + "learning_rate": 7.048845635856831e-06, + "loss": 0.4242, + "step": 14699 + }, + { + "epoch": 0.37199180099703927, + "grad_norm": 5.184764385223389, + "learning_rate": 7.048479364699128e-06, + "loss": 0.1254, + "step": 14700 + }, + { + "epoch": 0.3720171065617329, + "grad_norm": 6.618014335632324, + "learning_rate": 7.048113080331164e-06, + "loss": 0.2438, + "step": 14701 + }, + { + "epoch": 0.3720424121264266, + "grad_norm": 3.2336158752441406, + "learning_rate": 7.047746782755298e-06, + "loss": 0.1364, + "step": 14702 + }, + { + "epoch": 0.3720677176911203, + "grad_norm": 5.420582294464111, + "learning_rate": 7.047380471973894e-06, + "loss": 0.1681, + "step": 14703 + }, + { + "epoch": 0.37209302325581395, + "grad_norm": 7.243800163269043, + "learning_rate": 7.047014147989317e-06, + "loss": 0.1863, + "step": 14704 + }, + { + "epoch": 0.3721183288205076, + "grad_norm": 5.289560317993164, + "learning_rate": 7.046647810803924e-06, + "loss": 0.2142, + "step": 14705 + }, + { + "epoch": 0.3721436343852013, + "grad_norm": 6.542332172393799, + "learning_rate": 7.046281460420082e-06, + "loss": 0.1768, + "step": 14706 + }, + { + "epoch": 0.372168939949895, + "grad_norm": 4.131228923797607, + "learning_rate": 7.045915096840148e-06, + "loss": 0.2055, + "step": 14707 + }, + { + "epoch": 0.3721942455145887, + "grad_norm": 3.9857022762298584, + "learning_rate": 7.045548720066491e-06, + "loss": 0.1445, + "step": 14708 + }, + { + "epoch": 0.37221955107928234, + "grad_norm": 5.050914287567139, + "learning_rate": 7.04518233010147e-06, + "loss": 0.2099, + "step": 14709 + }, + { + "epoch": 0.372244856643976, + "grad_norm": 3.9826159477233887, + "learning_rate": 7.044815926947448e-06, + "loss": 0.0774, + "step": 14710 + }, + { + "epoch": 0.3722701622086697, + "grad_norm": 6.814582347869873, + "learning_rate": 7.044449510606789e-06, + "loss": 0.1605, + "step": 14711 + }, + { + "epoch": 0.37229546777336336, + "grad_norm": 9.234103202819824, + "learning_rate": 7.044083081081856e-06, + "loss": 0.1972, + "step": 14712 + }, + { + "epoch": 0.372320773338057, + "grad_norm": 3.9755072593688965, + "learning_rate": 7.04371663837501e-06, + "loss": 0.1521, + "step": 14713 + }, + { + "epoch": 0.37234607890275073, + "grad_norm": 2.646028757095337, + "learning_rate": 7.043350182488616e-06, + "loss": 0.1592, + "step": 14714 + }, + { + "epoch": 0.3723713844674444, + "grad_norm": 2.6730973720550537, + "learning_rate": 7.042983713425038e-06, + "loss": 0.1041, + "step": 14715 + }, + { + "epoch": 0.37239669003213804, + "grad_norm": 3.756389856338501, + "learning_rate": 7.042617231186636e-06, + "loss": 0.17, + "step": 14716 + }, + { + "epoch": 0.37242199559683176, + "grad_norm": 6.609350681304932, + "learning_rate": 7.042250735775777e-06, + "loss": 0.1682, + "step": 14717 + }, + { + "epoch": 0.3724473011615254, + "grad_norm": 6.230995178222656, + "learning_rate": 7.041884227194822e-06, + "loss": 0.183, + "step": 14718 + }, + { + "epoch": 0.37247260672621907, + "grad_norm": 7.0803303718566895, + "learning_rate": 7.041517705446137e-06, + "loss": 0.2029, + "step": 14719 + }, + { + "epoch": 0.3724979122909128, + "grad_norm": 4.2363457679748535, + "learning_rate": 7.041151170532081e-06, + "loss": 0.1709, + "step": 14720 + }, + { + "epoch": 0.37252321785560644, + "grad_norm": 6.430394172668457, + "learning_rate": 7.0407846224550235e-06, + "loss": 0.2078, + "step": 14721 + }, + { + "epoch": 0.37254852342030015, + "grad_norm": 6.607444763183594, + "learning_rate": 7.040418061217325e-06, + "loss": 0.2062, + "step": 14722 + }, + { + "epoch": 0.3725738289849938, + "grad_norm": 8.175498962402344, + "learning_rate": 7.040051486821351e-06, + "loss": 0.2461, + "step": 14723 + }, + { + "epoch": 0.37259913454968746, + "grad_norm": 3.928309679031372, + "learning_rate": 7.0396848992694624e-06, + "loss": 0.1783, + "step": 14724 + }, + { + "epoch": 0.3726244401143812, + "grad_norm": 8.841934204101562, + "learning_rate": 7.039318298564026e-06, + "loss": 0.1512, + "step": 14725 + }, + { + "epoch": 0.37264974567907483, + "grad_norm": 8.73119831085205, + "learning_rate": 7.038951684707405e-06, + "loss": 0.1849, + "step": 14726 + }, + { + "epoch": 0.3726750512437685, + "grad_norm": 7.714687347412109, + "learning_rate": 7.038585057701964e-06, + "loss": 0.1536, + "step": 14727 + }, + { + "epoch": 0.3727003568084622, + "grad_norm": 3.3773269653320312, + "learning_rate": 7.0382184175500674e-06, + "loss": 0.1316, + "step": 14728 + }, + { + "epoch": 0.37272566237315585, + "grad_norm": 14.428640365600586, + "learning_rate": 7.03785176425408e-06, + "loss": 0.2462, + "step": 14729 + }, + { + "epoch": 0.3727509679378495, + "grad_norm": 3.8035647869110107, + "learning_rate": 7.037485097816366e-06, + "loss": 0.1637, + "step": 14730 + }, + { + "epoch": 0.3727762735025432, + "grad_norm": 4.79435920715332, + "learning_rate": 7.037118418239288e-06, + "loss": 0.1699, + "step": 14731 + }, + { + "epoch": 0.3728015790672369, + "grad_norm": 4.452937602996826, + "learning_rate": 7.036751725525212e-06, + "loss": 0.1091, + "step": 14732 + }, + { + "epoch": 0.3728268846319306, + "grad_norm": 5.879352569580078, + "learning_rate": 7.036385019676505e-06, + "loss": 0.1999, + "step": 14733 + }, + { + "epoch": 0.37285219019662424, + "grad_norm": 3.858743190765381, + "learning_rate": 7.036018300695528e-06, + "loss": 0.1724, + "step": 14734 + }, + { + "epoch": 0.3728774957613179, + "grad_norm": 4.349798202514648, + "learning_rate": 7.035651568584649e-06, + "loss": 0.1459, + "step": 14735 + }, + { + "epoch": 0.3729028013260116, + "grad_norm": 6.126262664794922, + "learning_rate": 7.035284823346231e-06, + "loss": 0.1831, + "step": 14736 + }, + { + "epoch": 0.37292810689070527, + "grad_norm": 4.507683753967285, + "learning_rate": 7.03491806498264e-06, + "loss": 0.1711, + "step": 14737 + }, + { + "epoch": 0.3729534124553989, + "grad_norm": 16.161104202270508, + "learning_rate": 7.03455129349624e-06, + "loss": 0.2443, + "step": 14738 + }, + { + "epoch": 0.37297871802009264, + "grad_norm": 5.932376384735107, + "learning_rate": 7.034184508889399e-06, + "loss": 0.1354, + "step": 14739 + }, + { + "epoch": 0.3730040235847863, + "grad_norm": 1.7522627115249634, + "learning_rate": 7.033817711164479e-06, + "loss": 0.0478, + "step": 14740 + }, + { + "epoch": 0.37302932914947995, + "grad_norm": 4.202563762664795, + "learning_rate": 7.033450900323847e-06, + "loss": 0.1444, + "step": 14741 + }, + { + "epoch": 0.37305463471417366, + "grad_norm": 8.575486183166504, + "learning_rate": 7.033084076369868e-06, + "loss": 0.2044, + "step": 14742 + }, + { + "epoch": 0.3730799402788673, + "grad_norm": 6.956895351409912, + "learning_rate": 7.032717239304909e-06, + "loss": 0.1921, + "step": 14743 + }, + { + "epoch": 0.373105245843561, + "grad_norm": 10.652044296264648, + "learning_rate": 7.032350389131335e-06, + "loss": 0.2531, + "step": 14744 + }, + { + "epoch": 0.3731305514082547, + "grad_norm": 7.243531227111816, + "learning_rate": 7.031983525851511e-06, + "loss": 0.2003, + "step": 14745 + }, + { + "epoch": 0.37315585697294834, + "grad_norm": 5.740936756134033, + "learning_rate": 7.031616649467803e-06, + "loss": 0.1394, + "step": 14746 + }, + { + "epoch": 0.37318116253764205, + "grad_norm": 14.343866348266602, + "learning_rate": 7.031249759982577e-06, + "loss": 0.2676, + "step": 14747 + }, + { + "epoch": 0.3732064681023357, + "grad_norm": 9.024553298950195, + "learning_rate": 7.030882857398198e-06, + "loss": 0.1637, + "step": 14748 + }, + { + "epoch": 0.37323177366702937, + "grad_norm": 3.4751322269439697, + "learning_rate": 7.0305159417170345e-06, + "loss": 0.1392, + "step": 14749 + }, + { + "epoch": 0.3732570792317231, + "grad_norm": 7.316894054412842, + "learning_rate": 7.030149012941452e-06, + "loss": 0.2089, + "step": 14750 + }, + { + "epoch": 0.37328238479641673, + "grad_norm": 5.858447074890137, + "learning_rate": 7.029782071073815e-06, + "loss": 0.1824, + "step": 14751 + }, + { + "epoch": 0.3733076903611104, + "grad_norm": 11.519142150878906, + "learning_rate": 7.029415116116492e-06, + "loss": 0.2258, + "step": 14752 + }, + { + "epoch": 0.3733329959258041, + "grad_norm": 6.5504841804504395, + "learning_rate": 7.0290481480718476e-06, + "loss": 0.1456, + "step": 14753 + }, + { + "epoch": 0.37335830149049776, + "grad_norm": 4.396661758422852, + "learning_rate": 7.028681166942249e-06, + "loss": 0.1219, + "step": 14754 + }, + { + "epoch": 0.3733836070551914, + "grad_norm": 4.1861066818237305, + "learning_rate": 7.028314172730063e-06, + "loss": 0.1686, + "step": 14755 + }, + { + "epoch": 0.3734089126198851, + "grad_norm": 7.978463649749756, + "learning_rate": 7.027947165437658e-06, + "loss": 0.1541, + "step": 14756 + }, + { + "epoch": 0.3734342181845788, + "grad_norm": 13.121612548828125, + "learning_rate": 7.0275801450673964e-06, + "loss": 0.2092, + "step": 14757 + }, + { + "epoch": 0.37345952374927244, + "grad_norm": 3.848515748977661, + "learning_rate": 7.027213111621649e-06, + "loss": 0.1291, + "step": 14758 + }, + { + "epoch": 0.37348482931396615, + "grad_norm": 7.362691879272461, + "learning_rate": 7.0268460651027814e-06, + "loss": 0.2464, + "step": 14759 + }, + { + "epoch": 0.3735101348786598, + "grad_norm": 5.269174098968506, + "learning_rate": 7.02647900551316e-06, + "loss": 0.1376, + "step": 14760 + }, + { + "epoch": 0.3735354404433535, + "grad_norm": 11.998791694641113, + "learning_rate": 7.026111932855152e-06, + "loss": 0.21, + "step": 14761 + }, + { + "epoch": 0.3735607460080472, + "grad_norm": 8.191720008850098, + "learning_rate": 7.025744847131126e-06, + "loss": 0.178, + "step": 14762 + }, + { + "epoch": 0.37358605157274083, + "grad_norm": 9.580185890197754, + "learning_rate": 7.025377748343448e-06, + "loss": 0.2877, + "step": 14763 + }, + { + "epoch": 0.37361135713743454, + "grad_norm": 7.136445045471191, + "learning_rate": 7.025010636494486e-06, + "loss": 0.209, + "step": 14764 + }, + { + "epoch": 0.3736366627021282, + "grad_norm": 6.140591621398926, + "learning_rate": 7.024643511586607e-06, + "loss": 0.1364, + "step": 14765 + }, + { + "epoch": 0.37366196826682185, + "grad_norm": 4.747672080993652, + "learning_rate": 7.0242763736221774e-06, + "loss": 0.206, + "step": 14766 + }, + { + "epoch": 0.37368727383151557, + "grad_norm": 12.657905578613281, + "learning_rate": 7.023909222603567e-06, + "loss": 0.3421, + "step": 14767 + }, + { + "epoch": 0.3737125793962092, + "grad_norm": 4.936875820159912, + "learning_rate": 7.0235420585331425e-06, + "loss": 0.1551, + "step": 14768 + }, + { + "epoch": 0.3737378849609029, + "grad_norm": 3.8796427249908447, + "learning_rate": 7.023174881413273e-06, + "loss": 0.1385, + "step": 14769 + }, + { + "epoch": 0.3737631905255966, + "grad_norm": 4.557696342468262, + "learning_rate": 7.022807691246323e-06, + "loss": 0.1736, + "step": 14770 + }, + { + "epoch": 0.37378849609029025, + "grad_norm": 4.5697102546691895, + "learning_rate": 7.022440488034662e-06, + "loss": 0.1236, + "step": 14771 + }, + { + "epoch": 0.37381380165498396, + "grad_norm": 4.149712562561035, + "learning_rate": 7.0220732717806605e-06, + "loss": 0.1472, + "step": 14772 + }, + { + "epoch": 0.3738391072196776, + "grad_norm": 4.348977565765381, + "learning_rate": 7.021706042486682e-06, + "loss": 0.1589, + "step": 14773 + }, + { + "epoch": 0.37386441278437127, + "grad_norm": 3.409213066101074, + "learning_rate": 7.0213388001550995e-06, + "loss": 0.13, + "step": 14774 + }, + { + "epoch": 0.373889718349065, + "grad_norm": 6.512276649475098, + "learning_rate": 7.020971544788279e-06, + "loss": 0.1963, + "step": 14775 + }, + { + "epoch": 0.37391502391375864, + "grad_norm": 5.016767978668213, + "learning_rate": 7.02060427638859e-06, + "loss": 0.14, + "step": 14776 + }, + { + "epoch": 0.3739403294784523, + "grad_norm": 8.253716468811035, + "learning_rate": 7.020236994958397e-06, + "loss": 0.3146, + "step": 14777 + }, + { + "epoch": 0.373965635043146, + "grad_norm": 5.289658069610596, + "learning_rate": 7.019869700500072e-06, + "loss": 0.1944, + "step": 14778 + }, + { + "epoch": 0.37399094060783966, + "grad_norm": 7.615782737731934, + "learning_rate": 7.019502393015984e-06, + "loss": 0.1844, + "step": 14779 + }, + { + "epoch": 0.3740162461725333, + "grad_norm": 5.205264091491699, + "learning_rate": 7.0191350725085006e-06, + "loss": 0.1129, + "step": 14780 + }, + { + "epoch": 0.37404155173722703, + "grad_norm": 4.440075397491455, + "learning_rate": 7.018767738979991e-06, + "loss": 0.1912, + "step": 14781 + }, + { + "epoch": 0.3740668573019207, + "grad_norm": 3.5960593223571777, + "learning_rate": 7.018400392432823e-06, + "loss": 0.129, + "step": 14782 + }, + { + "epoch": 0.37409216286661434, + "grad_norm": 6.261009693145752, + "learning_rate": 7.018033032869367e-06, + "loss": 0.1715, + "step": 14783 + }, + { + "epoch": 0.37411746843130805, + "grad_norm": 9.641587257385254, + "learning_rate": 7.01766566029199e-06, + "loss": 0.1369, + "step": 14784 + }, + { + "epoch": 0.3741427739960017, + "grad_norm": 4.248437881469727, + "learning_rate": 7.017298274703065e-06, + "loss": 0.1724, + "step": 14785 + }, + { + "epoch": 0.3741680795606954, + "grad_norm": 5.420570373535156, + "learning_rate": 7.016930876104957e-06, + "loss": 0.1184, + "step": 14786 + }, + { + "epoch": 0.3741933851253891, + "grad_norm": 6.014224052429199, + "learning_rate": 7.016563464500037e-06, + "loss": 0.1751, + "step": 14787 + }, + { + "epoch": 0.37421869069008273, + "grad_norm": 5.350011348724365, + "learning_rate": 7.016196039890675e-06, + "loss": 0.1995, + "step": 14788 + }, + { + "epoch": 0.37424399625477645, + "grad_norm": 9.932777404785156, + "learning_rate": 7.015828602279239e-06, + "loss": 0.2593, + "step": 14789 + }, + { + "epoch": 0.3742693018194701, + "grad_norm": 3.911388397216797, + "learning_rate": 7.015461151668099e-06, + "loss": 0.1736, + "step": 14790 + }, + { + "epoch": 0.37429460738416376, + "grad_norm": 6.792376518249512, + "learning_rate": 7.015093688059627e-06, + "loss": 0.1623, + "step": 14791 + }, + { + "epoch": 0.37431991294885747, + "grad_norm": 6.520549297332764, + "learning_rate": 7.0147262114561885e-06, + "loss": 0.2775, + "step": 14792 + }, + { + "epoch": 0.3743452185135511, + "grad_norm": 4.730758190155029, + "learning_rate": 7.014358721860156e-06, + "loss": 0.1964, + "step": 14793 + }, + { + "epoch": 0.3743705240782448, + "grad_norm": 4.608858108520508, + "learning_rate": 7.013991219273899e-06, + "loss": 0.1365, + "step": 14794 + }, + { + "epoch": 0.3743958296429385, + "grad_norm": 4.209932804107666, + "learning_rate": 7.013623703699787e-06, + "loss": 0.1574, + "step": 14795 + }, + { + "epoch": 0.37442113520763215, + "grad_norm": 15.7230863571167, + "learning_rate": 7.013256175140191e-06, + "loss": 0.2268, + "step": 14796 + }, + { + "epoch": 0.37444644077232586, + "grad_norm": 3.2659130096435547, + "learning_rate": 7.012888633597479e-06, + "loss": 0.1327, + "step": 14797 + }, + { + "epoch": 0.3744717463370195, + "grad_norm": 7.3836188316345215, + "learning_rate": 7.012521079074024e-06, + "loss": 0.2101, + "step": 14798 + }, + { + "epoch": 0.3744970519017132, + "grad_norm": 6.124066352844238, + "learning_rate": 7.012153511572195e-06, + "loss": 0.1353, + "step": 14799 + }, + { + "epoch": 0.3745223574664069, + "grad_norm": 15.405309677124023, + "learning_rate": 7.011785931094361e-06, + "loss": 0.1764, + "step": 14800 + }, + { + "epoch": 0.37454766303110054, + "grad_norm": 6.800795555114746, + "learning_rate": 7.011418337642895e-06, + "loss": 0.2916, + "step": 14801 + }, + { + "epoch": 0.3745729685957942, + "grad_norm": 5.942075729370117, + "learning_rate": 7.011050731220165e-06, + "loss": 0.2248, + "step": 14802 + }, + { + "epoch": 0.3745982741604879, + "grad_norm": 4.064310073852539, + "learning_rate": 7.010683111828545e-06, + "loss": 0.1714, + "step": 14803 + }, + { + "epoch": 0.37462357972518157, + "grad_norm": 5.874645233154297, + "learning_rate": 7.010315479470401e-06, + "loss": 0.2016, + "step": 14804 + }, + { + "epoch": 0.3746488852898752, + "grad_norm": 3.039365291595459, + "learning_rate": 7.009947834148109e-06, + "loss": 0.1401, + "step": 14805 + }, + { + "epoch": 0.37467419085456893, + "grad_norm": 4.676660537719727, + "learning_rate": 7.009580175864035e-06, + "loss": 0.2199, + "step": 14806 + }, + { + "epoch": 0.3746994964192626, + "grad_norm": 5.64682149887085, + "learning_rate": 7.009212504620554e-06, + "loss": 0.2471, + "step": 14807 + }, + { + "epoch": 0.37472480198395625, + "grad_norm": 2.699033737182617, + "learning_rate": 7.0088448204200336e-06, + "loss": 0.1083, + "step": 14808 + }, + { + "epoch": 0.37475010754864996, + "grad_norm": 5.975103855133057, + "learning_rate": 7.008477123264849e-06, + "loss": 0.269, + "step": 14809 + }, + { + "epoch": 0.3747754131133436, + "grad_norm": 4.0225300788879395, + "learning_rate": 7.008109413157367e-06, + "loss": 0.1674, + "step": 14810 + }, + { + "epoch": 0.3748007186780373, + "grad_norm": 2.6901988983154297, + "learning_rate": 7.007741690099962e-06, + "loss": 0.1392, + "step": 14811 + }, + { + "epoch": 0.374826024242731, + "grad_norm": 3.9226460456848145, + "learning_rate": 7.0073739540950026e-06, + "loss": 0.0967, + "step": 14812 + }, + { + "epoch": 0.37485132980742464, + "grad_norm": 5.062632083892822, + "learning_rate": 7.0070062051448625e-06, + "loss": 0.1223, + "step": 14813 + }, + { + "epoch": 0.37487663537211835, + "grad_norm": 3.2316765785217285, + "learning_rate": 7.006638443251914e-06, + "loss": 0.1606, + "step": 14814 + }, + { + "epoch": 0.374901940936812, + "grad_norm": 12.475214958190918, + "learning_rate": 7.006270668418527e-06, + "loss": 0.4071, + "step": 14815 + }, + { + "epoch": 0.37492724650150566, + "grad_norm": 12.219926834106445, + "learning_rate": 7.005902880647074e-06, + "loss": 0.332, + "step": 14816 + }, + { + "epoch": 0.3749525520661994, + "grad_norm": 3.250378131866455, + "learning_rate": 7.005535079939926e-06, + "loss": 0.1531, + "step": 14817 + }, + { + "epoch": 0.37497785763089303, + "grad_norm": 4.202767372131348, + "learning_rate": 7.005167266299455e-06, + "loss": 0.1467, + "step": 14818 + }, + { + "epoch": 0.3750031631955867, + "grad_norm": 6.4825005531311035, + "learning_rate": 7.004799439728034e-06, + "loss": 0.2133, + "step": 14819 + }, + { + "epoch": 0.3750284687602804, + "grad_norm": 6.126625061035156, + "learning_rate": 7.0044316002280345e-06, + "loss": 0.24, + "step": 14820 + }, + { + "epoch": 0.37505377432497405, + "grad_norm": 4.706504821777344, + "learning_rate": 7.004063747801828e-06, + "loss": 0.1485, + "step": 14821 + }, + { + "epoch": 0.3750790798896677, + "grad_norm": 6.687238693237305, + "learning_rate": 7.003695882451787e-06, + "loss": 0.2293, + "step": 14822 + }, + { + "epoch": 0.3751043854543614, + "grad_norm": 6.211894512176514, + "learning_rate": 7.003328004180284e-06, + "loss": 0.1327, + "step": 14823 + }, + { + "epoch": 0.3751296910190551, + "grad_norm": 6.992819786071777, + "learning_rate": 7.002960112989692e-06, + "loss": 0.2828, + "step": 14824 + }, + { + "epoch": 0.3751549965837488, + "grad_norm": 8.727996826171875, + "learning_rate": 7.0025922088823815e-06, + "loss": 0.3238, + "step": 14825 + }, + { + "epoch": 0.37518030214844245, + "grad_norm": 3.6212775707244873, + "learning_rate": 7.0022242918607275e-06, + "loss": 0.1061, + "step": 14826 + }, + { + "epoch": 0.3752056077131361, + "grad_norm": 3.0135951042175293, + "learning_rate": 7.0018563619271015e-06, + "loss": 0.1496, + "step": 14827 + }, + { + "epoch": 0.3752309132778298, + "grad_norm": 10.213644981384277, + "learning_rate": 7.001488419083875e-06, + "loss": 0.2764, + "step": 14828 + }, + { + "epoch": 0.37525621884252347, + "grad_norm": 3.3794546127319336, + "learning_rate": 7.001120463333424e-06, + "loss": 0.1716, + "step": 14829 + }, + { + "epoch": 0.3752815244072171, + "grad_norm": 10.600831031799316, + "learning_rate": 7.000752494678117e-06, + "loss": 0.3012, + "step": 14830 + }, + { + "epoch": 0.37530682997191084, + "grad_norm": 7.630206108093262, + "learning_rate": 7.00038451312033e-06, + "loss": 0.1832, + "step": 14831 + }, + { + "epoch": 0.3753321355366045, + "grad_norm": 4.228780746459961, + "learning_rate": 7.000016518662437e-06, + "loss": 0.191, + "step": 14832 + }, + { + "epoch": 0.37535744110129815, + "grad_norm": 3.3737943172454834, + "learning_rate": 6.999648511306808e-06, + "loss": 0.1749, + "step": 14833 + }, + { + "epoch": 0.37538274666599186, + "grad_norm": 5.142396926879883, + "learning_rate": 6.9992804910558175e-06, + "loss": 0.1892, + "step": 14834 + }, + { + "epoch": 0.3754080522306855, + "grad_norm": 4.503765106201172, + "learning_rate": 6.998912457911839e-06, + "loss": 0.1401, + "step": 14835 + }, + { + "epoch": 0.37543335779537923, + "grad_norm": 2.515960693359375, + "learning_rate": 6.998544411877247e-06, + "loss": 0.1006, + "step": 14836 + }, + { + "epoch": 0.3754586633600729, + "grad_norm": 2.5700080394744873, + "learning_rate": 6.998176352954413e-06, + "loss": 0.1138, + "step": 14837 + }, + { + "epoch": 0.37548396892476654, + "grad_norm": 3.594733953475952, + "learning_rate": 6.9978082811457126e-06, + "loss": 0.1397, + "step": 14838 + }, + { + "epoch": 0.37550927448946025, + "grad_norm": 4.091944694519043, + "learning_rate": 6.9974401964535165e-06, + "loss": 0.2009, + "step": 14839 + }, + { + "epoch": 0.3755345800541539, + "grad_norm": 7.002494812011719, + "learning_rate": 6.9970720988802024e-06, + "loss": 0.1348, + "step": 14840 + }, + { + "epoch": 0.37555988561884757, + "grad_norm": 3.844586133956909, + "learning_rate": 6.9967039884281395e-06, + "loss": 0.1846, + "step": 14841 + }, + { + "epoch": 0.3755851911835413, + "grad_norm": 4.355047702789307, + "learning_rate": 6.9963358650997045e-06, + "loss": 0.2706, + "step": 14842 + }, + { + "epoch": 0.37561049674823493, + "grad_norm": 3.63665509223938, + "learning_rate": 6.995967728897272e-06, + "loss": 0.1411, + "step": 14843 + }, + { + "epoch": 0.3756358023129286, + "grad_norm": 5.000973701477051, + "learning_rate": 6.995599579823213e-06, + "loss": 0.1877, + "step": 14844 + }, + { + "epoch": 0.3756611078776223, + "grad_norm": 12.579392433166504, + "learning_rate": 6.995231417879905e-06, + "loss": 0.1711, + "step": 14845 + }, + { + "epoch": 0.37568641344231596, + "grad_norm": 5.1653618812561035, + "learning_rate": 6.9948632430697205e-06, + "loss": 0.2924, + "step": 14846 + }, + { + "epoch": 0.3757117190070096, + "grad_norm": 6.984127998352051, + "learning_rate": 6.994495055395034e-06, + "loss": 0.2257, + "step": 14847 + }, + { + "epoch": 0.3757370245717033, + "grad_norm": 3.958178758621216, + "learning_rate": 6.994126854858219e-06, + "loss": 0.1455, + "step": 14848 + }, + { + "epoch": 0.375762330136397, + "grad_norm": 11.593714714050293, + "learning_rate": 6.993758641461652e-06, + "loss": 0.2799, + "step": 14849 + }, + { + "epoch": 0.3757876357010907, + "grad_norm": 8.25718879699707, + "learning_rate": 6.993390415207707e-06, + "loss": 0.1966, + "step": 14850 + }, + { + "epoch": 0.37581294126578435, + "grad_norm": 5.5305023193359375, + "learning_rate": 6.993022176098756e-06, + "loss": 0.2385, + "step": 14851 + }, + { + "epoch": 0.375838246830478, + "grad_norm": 4.129225254058838, + "learning_rate": 6.992653924137177e-06, + "loss": 0.1803, + "step": 14852 + }, + { + "epoch": 0.3758635523951717, + "grad_norm": 3.102992057800293, + "learning_rate": 6.992285659325344e-06, + "loss": 0.1096, + "step": 14853 + }, + { + "epoch": 0.3758888579598654, + "grad_norm": 3.421271800994873, + "learning_rate": 6.991917381665631e-06, + "loss": 0.1795, + "step": 14854 + }, + { + "epoch": 0.37591416352455903, + "grad_norm": 4.225742340087891, + "learning_rate": 6.991549091160412e-06, + "loss": 0.2068, + "step": 14855 + }, + { + "epoch": 0.37593946908925274, + "grad_norm": 9.69595718383789, + "learning_rate": 6.991180787812064e-06, + "loss": 0.2001, + "step": 14856 + }, + { + "epoch": 0.3759647746539464, + "grad_norm": 8.79620361328125, + "learning_rate": 6.990812471622962e-06, + "loss": 0.2471, + "step": 14857 + }, + { + "epoch": 0.37599008021864005, + "grad_norm": 4.349132537841797, + "learning_rate": 6.9904441425954805e-06, + "loss": 0.1237, + "step": 14858 + }, + { + "epoch": 0.37601538578333377, + "grad_norm": 7.005450248718262, + "learning_rate": 6.9900758007319945e-06, + "loss": 0.2021, + "step": 14859 + }, + { + "epoch": 0.3760406913480274, + "grad_norm": 7.303987503051758, + "learning_rate": 6.989707446034881e-06, + "loss": 0.1987, + "step": 14860 + }, + { + "epoch": 0.37606599691272113, + "grad_norm": 6.369832992553711, + "learning_rate": 6.989339078506513e-06, + "loss": 0.1891, + "step": 14861 + }, + { + "epoch": 0.3760913024774148, + "grad_norm": 14.407415390014648, + "learning_rate": 6.988970698149269e-06, + "loss": 0.4396, + "step": 14862 + }, + { + "epoch": 0.37611660804210845, + "grad_norm": 3.309114933013916, + "learning_rate": 6.988602304965521e-06, + "loss": 0.1895, + "step": 14863 + }, + { + "epoch": 0.37614191360680216, + "grad_norm": 3.376957416534424, + "learning_rate": 6.988233898957648e-06, + "loss": 0.1314, + "step": 14864 + }, + { + "epoch": 0.3761672191714958, + "grad_norm": 5.188787937164307, + "learning_rate": 6.987865480128023e-06, + "loss": 0.227, + "step": 14865 + }, + { + "epoch": 0.37619252473618947, + "grad_norm": 5.285853862762451, + "learning_rate": 6.987497048479024e-06, + "loss": 0.1483, + "step": 14866 + }, + { + "epoch": 0.3762178303008832, + "grad_norm": 11.181612968444824, + "learning_rate": 6.987128604013027e-06, + "loss": 0.2672, + "step": 14867 + }, + { + "epoch": 0.37624313586557684, + "grad_norm": 8.71496295928955, + "learning_rate": 6.986760146732406e-06, + "loss": 0.2077, + "step": 14868 + }, + { + "epoch": 0.3762684414302705, + "grad_norm": 14.607306480407715, + "learning_rate": 6.98639167663954e-06, + "loss": 0.1911, + "step": 14869 + }, + { + "epoch": 0.3762937469949642, + "grad_norm": 7.565850734710693, + "learning_rate": 6.986023193736802e-06, + "loss": 0.2747, + "step": 14870 + }, + { + "epoch": 0.37631905255965786, + "grad_norm": 6.1866936683654785, + "learning_rate": 6.985654698026572e-06, + "loss": 0.1663, + "step": 14871 + }, + { + "epoch": 0.3763443581243515, + "grad_norm": 5.555373191833496, + "learning_rate": 6.985286189511221e-06, + "loss": 0.1217, + "step": 14872 + }, + { + "epoch": 0.37636966368904523, + "grad_norm": 5.340723991394043, + "learning_rate": 6.984917668193132e-06, + "loss": 0.158, + "step": 14873 + }, + { + "epoch": 0.3763949692537389, + "grad_norm": 4.989706993103027, + "learning_rate": 6.984549134074675e-06, + "loss": 0.1844, + "step": 14874 + }, + { + "epoch": 0.3764202748184326, + "grad_norm": 4.443443775177002, + "learning_rate": 6.9841805871582325e-06, + "loss": 0.2316, + "step": 14875 + }, + { + "epoch": 0.37644558038312625, + "grad_norm": 14.412357330322266, + "learning_rate": 6.983812027446178e-06, + "loss": 0.2031, + "step": 14876 + }, + { + "epoch": 0.3764708859478199, + "grad_norm": 4.577278137207031, + "learning_rate": 6.983443454940887e-06, + "loss": 0.1377, + "step": 14877 + }, + { + "epoch": 0.3764961915125136, + "grad_norm": 4.365828990936279, + "learning_rate": 6.983074869644739e-06, + "loss": 0.2037, + "step": 14878 + }, + { + "epoch": 0.3765214970772073, + "grad_norm": 2.9596567153930664, + "learning_rate": 6.982706271560111e-06, + "loss": 0.1101, + "step": 14879 + }, + { + "epoch": 0.37654680264190094, + "grad_norm": 9.083524703979492, + "learning_rate": 6.982337660689378e-06, + "loss": 0.1604, + "step": 14880 + }, + { + "epoch": 0.37657210820659465, + "grad_norm": 4.753423690795898, + "learning_rate": 6.981969037034919e-06, + "loss": 0.2168, + "step": 14881 + }, + { + "epoch": 0.3765974137712883, + "grad_norm": 13.204550743103027, + "learning_rate": 6.98160040059911e-06, + "loss": 0.2107, + "step": 14882 + }, + { + "epoch": 0.37662271933598196, + "grad_norm": 12.584369659423828, + "learning_rate": 6.9812317513843275e-06, + "loss": 0.3519, + "step": 14883 + }, + { + "epoch": 0.37664802490067567, + "grad_norm": 3.881896495819092, + "learning_rate": 6.980863089392952e-06, + "loss": 0.124, + "step": 14884 + }, + { + "epoch": 0.3766733304653693, + "grad_norm": 4.668941974639893, + "learning_rate": 6.980494414627356e-06, + "loss": 0.1804, + "step": 14885 + }, + { + "epoch": 0.376698636030063, + "grad_norm": 6.0961995124816895, + "learning_rate": 6.980125727089924e-06, + "loss": 0.2176, + "step": 14886 + }, + { + "epoch": 0.3767239415947567, + "grad_norm": 5.285163402557373, + "learning_rate": 6.979757026783026e-06, + "loss": 0.2134, + "step": 14887 + }, + { + "epoch": 0.37674924715945035, + "grad_norm": 3.524684429168701, + "learning_rate": 6.979388313709044e-06, + "loss": 0.1426, + "step": 14888 + }, + { + "epoch": 0.37677455272414406, + "grad_norm": 6.93832540512085, + "learning_rate": 6.979019587870356e-06, + "loss": 0.1908, + "step": 14889 + }, + { + "epoch": 0.3767998582888377, + "grad_norm": 5.317320823669434, + "learning_rate": 6.978650849269338e-06, + "loss": 0.174, + "step": 14890 + }, + { + "epoch": 0.3768251638535314, + "grad_norm": 3.361860513687134, + "learning_rate": 6.9782820979083695e-06, + "loss": 0.1185, + "step": 14891 + }, + { + "epoch": 0.3768504694182251, + "grad_norm": 6.816385746002197, + "learning_rate": 6.9779133337898274e-06, + "loss": 0.1818, + "step": 14892 + }, + { + "epoch": 0.37687577498291874, + "grad_norm": 2.330596446990967, + "learning_rate": 6.977544556916089e-06, + "loss": 0.1378, + "step": 14893 + }, + { + "epoch": 0.3769010805476124, + "grad_norm": 9.377045631408691, + "learning_rate": 6.977175767289534e-06, + "loss": 0.1718, + "step": 14894 + }, + { + "epoch": 0.3769263861123061, + "grad_norm": 5.6812310218811035, + "learning_rate": 6.976806964912543e-06, + "loss": 0.2057, + "step": 14895 + }, + { + "epoch": 0.37695169167699977, + "grad_norm": 4.659709453582764, + "learning_rate": 6.976438149787487e-06, + "loss": 0.1951, + "step": 14896 + }, + { + "epoch": 0.3769769972416934, + "grad_norm": 4.856293678283691, + "learning_rate": 6.976069321916753e-06, + "loss": 0.1823, + "step": 14897 + }, + { + "epoch": 0.37700230280638714, + "grad_norm": 4.979188919067383, + "learning_rate": 6.975700481302714e-06, + "loss": 0.1691, + "step": 14898 + }, + { + "epoch": 0.3770276083710808, + "grad_norm": 2.6130740642547607, + "learning_rate": 6.975331627947749e-06, + "loss": 0.124, + "step": 14899 + }, + { + "epoch": 0.3770529139357745, + "grad_norm": 3.731893301010132, + "learning_rate": 6.9749627618542395e-06, + "loss": 0.1169, + "step": 14900 + }, + { + "epoch": 0.37707821950046816, + "grad_norm": 4.893628120422363, + "learning_rate": 6.974593883024561e-06, + "loss": 0.1277, + "step": 14901 + }, + { + "epoch": 0.3771035250651618, + "grad_norm": 4.1872076988220215, + "learning_rate": 6.974224991461096e-06, + "loss": 0.1561, + "step": 14902 + }, + { + "epoch": 0.3771288306298555, + "grad_norm": 5.6842451095581055, + "learning_rate": 6.973856087166218e-06, + "loss": 0.1961, + "step": 14903 + }, + { + "epoch": 0.3771541361945492, + "grad_norm": 5.4293131828308105, + "learning_rate": 6.973487170142313e-06, + "loss": 0.1863, + "step": 14904 + }, + { + "epoch": 0.37717944175924284, + "grad_norm": 4.782637119293213, + "learning_rate": 6.973118240391754e-06, + "loss": 0.1936, + "step": 14905 + }, + { + "epoch": 0.37720474732393655, + "grad_norm": 9.388668060302734, + "learning_rate": 6.972749297916923e-06, + "loss": 0.2575, + "step": 14906 + }, + { + "epoch": 0.3772300528886302, + "grad_norm": 11.477081298828125, + "learning_rate": 6.972380342720199e-06, + "loss": 0.2874, + "step": 14907 + }, + { + "epoch": 0.37725535845332386, + "grad_norm": 12.112527847290039, + "learning_rate": 6.972011374803961e-06, + "loss": 0.2298, + "step": 14908 + }, + { + "epoch": 0.3772806640180176, + "grad_norm": 4.94147253036499, + "learning_rate": 6.971642394170589e-06, + "loss": 0.1822, + "step": 14909 + }, + { + "epoch": 0.37730596958271123, + "grad_norm": 7.67999792098999, + "learning_rate": 6.97127340082246e-06, + "loss": 0.3027, + "step": 14910 + }, + { + "epoch": 0.3773312751474049, + "grad_norm": 4.1117119789123535, + "learning_rate": 6.970904394761958e-06, + "loss": 0.0923, + "step": 14911 + }, + { + "epoch": 0.3773565807120986, + "grad_norm": 4.44710111618042, + "learning_rate": 6.9705353759914585e-06, + "loss": 0.2142, + "step": 14912 + }, + { + "epoch": 0.37738188627679226, + "grad_norm": 36.526798248291016, + "learning_rate": 6.970166344513344e-06, + "loss": 0.3277, + "step": 14913 + }, + { + "epoch": 0.37740719184148597, + "grad_norm": 10.914224624633789, + "learning_rate": 6.969797300329993e-06, + "loss": 0.2578, + "step": 14914 + }, + { + "epoch": 0.3774324974061796, + "grad_norm": 5.632599830627441, + "learning_rate": 6.969428243443784e-06, + "loss": 0.133, + "step": 14915 + }, + { + "epoch": 0.3774578029708733, + "grad_norm": 8.71716594696045, + "learning_rate": 6.9690591738571e-06, + "loss": 0.2449, + "step": 14916 + }, + { + "epoch": 0.377483108535567, + "grad_norm": 2.227729558944702, + "learning_rate": 6.96869009157232e-06, + "loss": 0.1003, + "step": 14917 + }, + { + "epoch": 0.37750841410026065, + "grad_norm": 2.26403546333313, + "learning_rate": 6.968320996591823e-06, + "loss": 0.1524, + "step": 14918 + }, + { + "epoch": 0.3775337196649543, + "grad_norm": 2.8036727905273438, + "learning_rate": 6.96795188891799e-06, + "loss": 0.144, + "step": 14919 + }, + { + "epoch": 0.377559025229648, + "grad_norm": 2.213148832321167, + "learning_rate": 6.967582768553202e-06, + "loss": 0.1236, + "step": 14920 + }, + { + "epoch": 0.37758433079434167, + "grad_norm": 4.038415908813477, + "learning_rate": 6.967213635499838e-06, + "loss": 0.1926, + "step": 14921 + }, + { + "epoch": 0.37760963635903533, + "grad_norm": 3.293687105178833, + "learning_rate": 6.96684448976028e-06, + "loss": 0.1836, + "step": 14922 + }, + { + "epoch": 0.37763494192372904, + "grad_norm": 7.4289655685424805, + "learning_rate": 6.966475331336906e-06, + "loss": 0.1981, + "step": 14923 + }, + { + "epoch": 0.3776602474884227, + "grad_norm": 4.68851900100708, + "learning_rate": 6.9661061602321e-06, + "loss": 0.156, + "step": 14924 + }, + { + "epoch": 0.3776855530531164, + "grad_norm": 8.55237102508545, + "learning_rate": 6.965736976448241e-06, + "loss": 0.2793, + "step": 14925 + }, + { + "epoch": 0.37771085861781006, + "grad_norm": 8.711570739746094, + "learning_rate": 6.96536777998771e-06, + "loss": 0.3141, + "step": 14926 + }, + { + "epoch": 0.3777361641825037, + "grad_norm": 7.746304988861084, + "learning_rate": 6.964998570852887e-06, + "loss": 0.2064, + "step": 14927 + }, + { + "epoch": 0.37776146974719743, + "grad_norm": 4.735496520996094, + "learning_rate": 6.964629349046155e-06, + "loss": 0.1973, + "step": 14928 + }, + { + "epoch": 0.3777867753118911, + "grad_norm": 2.491684913635254, + "learning_rate": 6.964260114569892e-06, + "loss": 0.0788, + "step": 14929 + }, + { + "epoch": 0.37781208087658474, + "grad_norm": 3.5882408618927, + "learning_rate": 6.9638908674264815e-06, + "loss": 0.1136, + "step": 14930 + }, + { + "epoch": 0.37783738644127846, + "grad_norm": 9.013569831848145, + "learning_rate": 6.963521607618306e-06, + "loss": 0.2547, + "step": 14931 + }, + { + "epoch": 0.3778626920059721, + "grad_norm": 1.6772743463516235, + "learning_rate": 6.963152335147743e-06, + "loss": 0.0756, + "step": 14932 + }, + { + "epoch": 0.37788799757066577, + "grad_norm": 14.130337715148926, + "learning_rate": 6.962783050017176e-06, + "loss": 0.2362, + "step": 14933 + }, + { + "epoch": 0.3779133031353595, + "grad_norm": 8.35792064666748, + "learning_rate": 6.9624137522289856e-06, + "loss": 0.2039, + "step": 14934 + }, + { + "epoch": 0.37793860870005314, + "grad_norm": 4.680477619171143, + "learning_rate": 6.962044441785555e-06, + "loss": 0.2258, + "step": 14935 + }, + { + "epoch": 0.3779639142647468, + "grad_norm": 3.9462993144989014, + "learning_rate": 6.961675118689264e-06, + "loss": 0.1474, + "step": 14936 + }, + { + "epoch": 0.3779892198294405, + "grad_norm": 4.428168773651123, + "learning_rate": 6.961305782942496e-06, + "loss": 0.164, + "step": 14937 + }, + { + "epoch": 0.37801452539413416, + "grad_norm": 4.062443733215332, + "learning_rate": 6.960936434547632e-06, + "loss": 0.2142, + "step": 14938 + }, + { + "epoch": 0.37803983095882787, + "grad_norm": 19.322628021240234, + "learning_rate": 6.960567073507053e-06, + "loss": 0.2648, + "step": 14939 + }, + { + "epoch": 0.37806513652352153, + "grad_norm": 4.598119735717773, + "learning_rate": 6.960197699823142e-06, + "loss": 0.165, + "step": 14940 + }, + { + "epoch": 0.3780904420882152, + "grad_norm": 5.829938888549805, + "learning_rate": 6.95982831349828e-06, + "loss": 0.2076, + "step": 14941 + }, + { + "epoch": 0.3781157476529089, + "grad_norm": 10.588399887084961, + "learning_rate": 6.95945891453485e-06, + "loss": 0.2697, + "step": 14942 + }, + { + "epoch": 0.37814105321760255, + "grad_norm": 7.809830665588379, + "learning_rate": 6.959089502935233e-06, + "loss": 0.2263, + "step": 14943 + }, + { + "epoch": 0.3781663587822962, + "grad_norm": 4.700123310089111, + "learning_rate": 6.958720078701815e-06, + "loss": 0.1081, + "step": 14944 + }, + { + "epoch": 0.3781916643469899, + "grad_norm": 4.438076972961426, + "learning_rate": 6.958350641836973e-06, + "loss": 0.1957, + "step": 14945 + }, + { + "epoch": 0.3782169699116836, + "grad_norm": 6.127580642700195, + "learning_rate": 6.957981192343093e-06, + "loss": 0.1242, + "step": 14946 + }, + { + "epoch": 0.37824227547637723, + "grad_norm": 2.405393362045288, + "learning_rate": 6.957611730222556e-06, + "loss": 0.1015, + "step": 14947 + }, + { + "epoch": 0.37826758104107094, + "grad_norm": 9.640022277832031, + "learning_rate": 6.957242255477747e-06, + "loss": 0.2504, + "step": 14948 + }, + { + "epoch": 0.3782928866057646, + "grad_norm": 4.786396503448486, + "learning_rate": 6.956872768111044e-06, + "loss": 0.1436, + "step": 14949 + }, + { + "epoch": 0.37831819217045826, + "grad_norm": 8.978445053100586, + "learning_rate": 6.956503268124835e-06, + "loss": 0.2575, + "step": 14950 + }, + { + "epoch": 0.37834349773515197, + "grad_norm": 5.3263630867004395, + "learning_rate": 6.956133755521497e-06, + "loss": 0.1316, + "step": 14951 + }, + { + "epoch": 0.3783688032998456, + "grad_norm": 3.8415753841400146, + "learning_rate": 6.955764230303419e-06, + "loss": 0.1364, + "step": 14952 + }, + { + "epoch": 0.37839410886453934, + "grad_norm": 2.9434385299682617, + "learning_rate": 6.955394692472981e-06, + "loss": 0.141, + "step": 14953 + }, + { + "epoch": 0.378419414429233, + "grad_norm": 6.096809387207031, + "learning_rate": 6.955025142032565e-06, + "loss": 0.1691, + "step": 14954 + }, + { + "epoch": 0.37844471999392665, + "grad_norm": 4.82597541809082, + "learning_rate": 6.954655578984557e-06, + "loss": 0.1858, + "step": 14955 + }, + { + "epoch": 0.37847002555862036, + "grad_norm": 5.316607475280762, + "learning_rate": 6.954286003331339e-06, + "loss": 0.1293, + "step": 14956 + }, + { + "epoch": 0.378495331123314, + "grad_norm": 5.186221122741699, + "learning_rate": 6.953916415075292e-06, + "loss": 0.1879, + "step": 14957 + }, + { + "epoch": 0.3785206366880077, + "grad_norm": 7.646273612976074, + "learning_rate": 6.953546814218803e-06, + "loss": 0.2429, + "step": 14958 + }, + { + "epoch": 0.3785459422527014, + "grad_norm": 9.485292434692383, + "learning_rate": 6.953177200764254e-06, + "loss": 0.3355, + "step": 14959 + }, + { + "epoch": 0.37857124781739504, + "grad_norm": 12.064408302307129, + "learning_rate": 6.952807574714029e-06, + "loss": 0.2939, + "step": 14960 + }, + { + "epoch": 0.3785965533820887, + "grad_norm": 2.7456629276275635, + "learning_rate": 6.9524379360705105e-06, + "loss": 0.1237, + "step": 14961 + }, + { + "epoch": 0.3786218589467824, + "grad_norm": 10.161188125610352, + "learning_rate": 6.952068284836082e-06, + "loss": 0.2423, + "step": 14962 + }, + { + "epoch": 0.37864716451147606, + "grad_norm": 5.835043907165527, + "learning_rate": 6.95169862101313e-06, + "loss": 0.1525, + "step": 14963 + }, + { + "epoch": 0.3786724700761698, + "grad_norm": 2.844187021255493, + "learning_rate": 6.951328944604036e-06, + "loss": 0.0785, + "step": 14964 + }, + { + "epoch": 0.37869777564086343, + "grad_norm": 12.17940616607666, + "learning_rate": 6.9509592556111835e-06, + "loss": 0.21, + "step": 14965 + }, + { + "epoch": 0.3787230812055571, + "grad_norm": 3.854112386703491, + "learning_rate": 6.9505895540369594e-06, + "loss": 0.1505, + "step": 14966 + }, + { + "epoch": 0.3787483867702508, + "grad_norm": 5.8420586585998535, + "learning_rate": 6.950219839883746e-06, + "loss": 0.2044, + "step": 14967 + }, + { + "epoch": 0.37877369233494446, + "grad_norm": 13.282618522644043, + "learning_rate": 6.949850113153926e-06, + "loss": 0.1222, + "step": 14968 + }, + { + "epoch": 0.3787989978996381, + "grad_norm": 5.8042426109313965, + "learning_rate": 6.949480373849887e-06, + "loss": 0.1647, + "step": 14969 + }, + { + "epoch": 0.3788243034643318, + "grad_norm": 3.9179306030273438, + "learning_rate": 6.949110621974012e-06, + "loss": 0.1814, + "step": 14970 + }, + { + "epoch": 0.3788496090290255, + "grad_norm": 4.362908840179443, + "learning_rate": 6.9487408575286844e-06, + "loss": 0.1321, + "step": 14971 + }, + { + "epoch": 0.37887491459371914, + "grad_norm": 5.326789379119873, + "learning_rate": 6.948371080516288e-06, + "loss": 0.2015, + "step": 14972 + }, + { + "epoch": 0.37890022015841285, + "grad_norm": 2.58284330368042, + "learning_rate": 6.94800129093921e-06, + "loss": 0.0987, + "step": 14973 + }, + { + "epoch": 0.3789255257231065, + "grad_norm": 7.216279029846191, + "learning_rate": 6.9476314887998355e-06, + "loss": 0.208, + "step": 14974 + }, + { + "epoch": 0.37895083128780016, + "grad_norm": 2.5239744186401367, + "learning_rate": 6.947261674100546e-06, + "loss": 0.0745, + "step": 14975 + }, + { + "epoch": 0.3789761368524939, + "grad_norm": 4.160551071166992, + "learning_rate": 6.9468918468437285e-06, + "loss": 0.1394, + "step": 14976 + }, + { + "epoch": 0.37900144241718753, + "grad_norm": 4.213732719421387, + "learning_rate": 6.946522007031768e-06, + "loss": 0.1957, + "step": 14977 + }, + { + "epoch": 0.37902674798188124, + "grad_norm": 7.514348983764648, + "learning_rate": 6.94615215466705e-06, + "loss": 0.1896, + "step": 14978 + }, + { + "epoch": 0.3790520535465749, + "grad_norm": 12.528801918029785, + "learning_rate": 6.945782289751958e-06, + "loss": 0.247, + "step": 14979 + }, + { + "epoch": 0.37907735911126855, + "grad_norm": 5.86763858795166, + "learning_rate": 6.9454124122888765e-06, + "loss": 0.1651, + "step": 14980 + }, + { + "epoch": 0.37910266467596226, + "grad_norm": 3.9803690910339355, + "learning_rate": 6.9450425222801946e-06, + "loss": 0.184, + "step": 14981 + }, + { + "epoch": 0.3791279702406559, + "grad_norm": 5.178952217102051, + "learning_rate": 6.944672619728293e-06, + "loss": 0.2047, + "step": 14982 + }, + { + "epoch": 0.3791532758053496, + "grad_norm": 3.474747657775879, + "learning_rate": 6.94430270463556e-06, + "loss": 0.0961, + "step": 14983 + }, + { + "epoch": 0.3791785813700433, + "grad_norm": 5.891303539276123, + "learning_rate": 6.943932777004382e-06, + "loss": 0.2047, + "step": 14984 + }, + { + "epoch": 0.37920388693473694, + "grad_norm": 5.472965240478516, + "learning_rate": 6.943562836837141e-06, + "loss": 0.1848, + "step": 14985 + }, + { + "epoch": 0.3792291924994306, + "grad_norm": 3.311727285385132, + "learning_rate": 6.943192884136227e-06, + "loss": 0.1282, + "step": 14986 + }, + { + "epoch": 0.3792544980641243, + "grad_norm": 6.679988861083984, + "learning_rate": 6.942822918904022e-06, + "loss": 0.2094, + "step": 14987 + }, + { + "epoch": 0.37927980362881797, + "grad_norm": 4.1290693283081055, + "learning_rate": 6.942452941142914e-06, + "loss": 0.1699, + "step": 14988 + }, + { + "epoch": 0.3793051091935117, + "grad_norm": 3.841566801071167, + "learning_rate": 6.942082950855289e-06, + "loss": 0.1428, + "step": 14989 + }, + { + "epoch": 0.37933041475820534, + "grad_norm": 4.071282863616943, + "learning_rate": 6.9417129480435315e-06, + "loss": 0.1862, + "step": 14990 + }, + { + "epoch": 0.379355720322899, + "grad_norm": 4.238287925720215, + "learning_rate": 6.941342932710028e-06, + "loss": 0.1702, + "step": 14991 + }, + { + "epoch": 0.3793810258875927, + "grad_norm": 21.132675170898438, + "learning_rate": 6.940972904857166e-06, + "loss": 0.3229, + "step": 14992 + }, + { + "epoch": 0.37940633145228636, + "grad_norm": 18.81490135192871, + "learning_rate": 6.9406028644873306e-06, + "loss": 0.2069, + "step": 14993 + }, + { + "epoch": 0.37943163701698, + "grad_norm": 3.7144129276275635, + "learning_rate": 6.9402328116029085e-06, + "loss": 0.1382, + "step": 14994 + }, + { + "epoch": 0.37945694258167373, + "grad_norm": 4.447633743286133, + "learning_rate": 6.939862746206284e-06, + "loss": 0.1732, + "step": 14995 + }, + { + "epoch": 0.3794822481463674, + "grad_norm": 3.969170331954956, + "learning_rate": 6.9394926682998474e-06, + "loss": 0.1367, + "step": 14996 + }, + { + "epoch": 0.37950755371106104, + "grad_norm": 5.488568305969238, + "learning_rate": 6.939122577885983e-06, + "loss": 0.1529, + "step": 14997 + }, + { + "epoch": 0.37953285927575475, + "grad_norm": 6.088753700256348, + "learning_rate": 6.9387524749670776e-06, + "loss": 0.2056, + "step": 14998 + }, + { + "epoch": 0.3795581648404484, + "grad_norm": 8.343880653381348, + "learning_rate": 6.938382359545519e-06, + "loss": 0.2842, + "step": 14999 + }, + { + "epoch": 0.37958347040514206, + "grad_norm": 3.9377095699310303, + "learning_rate": 6.938012231623692e-06, + "loss": 0.1794, + "step": 15000 + }, + { + "epoch": 0.3796087759698358, + "grad_norm": 4.206009387969971, + "learning_rate": 6.937642091203986e-06, + "loss": 0.1618, + "step": 15001 + }, + { + "epoch": 0.37963408153452943, + "grad_norm": 2.3263676166534424, + "learning_rate": 6.937271938288784e-06, + "loss": 0.1146, + "step": 15002 + }, + { + "epoch": 0.37965938709922314, + "grad_norm": 5.238368988037109, + "learning_rate": 6.936901772880478e-06, + "loss": 0.1819, + "step": 15003 + }, + { + "epoch": 0.3796846926639168, + "grad_norm": 4.436710834503174, + "learning_rate": 6.936531594981452e-06, + "loss": 0.134, + "step": 15004 + }, + { + "epoch": 0.37970999822861046, + "grad_norm": 7.230082988739014, + "learning_rate": 6.936161404594093e-06, + "loss": 0.1914, + "step": 15005 + }, + { + "epoch": 0.37973530379330417, + "grad_norm": 36.068450927734375, + "learning_rate": 6.935791201720791e-06, + "loss": 0.2699, + "step": 15006 + }, + { + "epoch": 0.3797606093579978, + "grad_norm": 6.0538411140441895, + "learning_rate": 6.93542098636393e-06, + "loss": 0.1508, + "step": 15007 + }, + { + "epoch": 0.3797859149226915, + "grad_norm": 3.055041790008545, + "learning_rate": 6.9350507585259e-06, + "loss": 0.1022, + "step": 15008 + }, + { + "epoch": 0.3798112204873852, + "grad_norm": 16.296375274658203, + "learning_rate": 6.934680518209087e-06, + "loss": 0.2203, + "step": 15009 + }, + { + "epoch": 0.37983652605207885, + "grad_norm": 5.747393608093262, + "learning_rate": 6.93431026541588e-06, + "loss": 0.1712, + "step": 15010 + }, + { + "epoch": 0.3798618316167725, + "grad_norm": 4.261577606201172, + "learning_rate": 6.9339400001486645e-06, + "loss": 0.1481, + "step": 15011 + }, + { + "epoch": 0.3798871371814662, + "grad_norm": 16.644807815551758, + "learning_rate": 6.933569722409832e-06, + "loss": 0.2731, + "step": 15012 + }, + { + "epoch": 0.3799124427461599, + "grad_norm": 3.642287015914917, + "learning_rate": 6.933199432201766e-06, + "loss": 0.11, + "step": 15013 + }, + { + "epoch": 0.37993774831085353, + "grad_norm": 4.873890399932861, + "learning_rate": 6.932829129526858e-06, + "loss": 0.1438, + "step": 15014 + }, + { + "epoch": 0.37996305387554724, + "grad_norm": 8.002645492553711, + "learning_rate": 6.932458814387493e-06, + "loss": 0.1981, + "step": 15015 + }, + { + "epoch": 0.3799883594402409, + "grad_norm": 6.01218843460083, + "learning_rate": 6.932088486786062e-06, + "loss": 0.2124, + "step": 15016 + }, + { + "epoch": 0.3800136650049346, + "grad_norm": 3.513490676879883, + "learning_rate": 6.9317181467249505e-06, + "loss": 0.1258, + "step": 15017 + }, + { + "epoch": 0.38003897056962826, + "grad_norm": 3.5611677169799805, + "learning_rate": 6.931347794206548e-06, + "loss": 0.1968, + "step": 15018 + }, + { + "epoch": 0.3800642761343219, + "grad_norm": 5.335304260253906, + "learning_rate": 6.930977429233244e-06, + "loss": 0.1292, + "step": 15019 + }, + { + "epoch": 0.38008958169901563, + "grad_norm": 4.7789692878723145, + "learning_rate": 6.9306070518074245e-06, + "loss": 0.1726, + "step": 15020 + }, + { + "epoch": 0.3801148872637093, + "grad_norm": 6.174996852874756, + "learning_rate": 6.93023666193148e-06, + "loss": 0.1919, + "step": 15021 + }, + { + "epoch": 0.38014019282840295, + "grad_norm": 5.679439067840576, + "learning_rate": 6.929866259607798e-06, + "loss": 0.1554, + "step": 15022 + }, + { + "epoch": 0.38016549839309666, + "grad_norm": 6.895664691925049, + "learning_rate": 6.9294958448387685e-06, + "loss": 0.2231, + "step": 15023 + }, + { + "epoch": 0.3801908039577903, + "grad_norm": 5.067355155944824, + "learning_rate": 6.929125417626778e-06, + "loss": 0.2104, + "step": 15024 + }, + { + "epoch": 0.38021610952248397, + "grad_norm": 3.1653058528900146, + "learning_rate": 6.928754977974217e-06, + "loss": 0.1693, + "step": 15025 + }, + { + "epoch": 0.3802414150871777, + "grad_norm": 6.344904899597168, + "learning_rate": 6.928384525883473e-06, + "loss": 0.1713, + "step": 15026 + }, + { + "epoch": 0.38026672065187134, + "grad_norm": 12.004781723022461, + "learning_rate": 6.928014061356938e-06, + "loss": 0.1893, + "step": 15027 + }, + { + "epoch": 0.38029202621656505, + "grad_norm": 2.99320125579834, + "learning_rate": 6.927643584396998e-06, + "loss": 0.0908, + "step": 15028 + }, + { + "epoch": 0.3803173317812587, + "grad_norm": 4.601027011871338, + "learning_rate": 6.927273095006042e-06, + "loss": 0.1535, + "step": 15029 + }, + { + "epoch": 0.38034263734595236, + "grad_norm": 5.2794389724731445, + "learning_rate": 6.926902593186462e-06, + "loss": 0.1432, + "step": 15030 + }, + { + "epoch": 0.3803679429106461, + "grad_norm": 7.051529407501221, + "learning_rate": 6.926532078940645e-06, + "loss": 0.2461, + "step": 15031 + }, + { + "epoch": 0.38039324847533973, + "grad_norm": 2.826040506362915, + "learning_rate": 6.92616155227098e-06, + "loss": 0.1142, + "step": 15032 + }, + { + "epoch": 0.3804185540400334, + "grad_norm": 4.661758899688721, + "learning_rate": 6.9257910131798574e-06, + "loss": 0.1239, + "step": 15033 + }, + { + "epoch": 0.3804438596047271, + "grad_norm": 3.1797640323638916, + "learning_rate": 6.925420461669667e-06, + "loss": 0.1905, + "step": 15034 + }, + { + "epoch": 0.38046916516942075, + "grad_norm": 5.729575157165527, + "learning_rate": 6.9250498977427994e-06, + "loss": 0.1653, + "step": 15035 + }, + { + "epoch": 0.3804944707341144, + "grad_norm": 20.777795791625977, + "learning_rate": 6.924679321401642e-06, + "loss": 0.1645, + "step": 15036 + }, + { + "epoch": 0.3805197762988081, + "grad_norm": 9.21125602722168, + "learning_rate": 6.924308732648586e-06, + "loss": 0.1481, + "step": 15037 + }, + { + "epoch": 0.3805450818635018, + "grad_norm": 7.981427192687988, + "learning_rate": 6.923938131486021e-06, + "loss": 0.2873, + "step": 15038 + }, + { + "epoch": 0.38057038742819543, + "grad_norm": 2.8698508739471436, + "learning_rate": 6.923567517916336e-06, + "loss": 0.1421, + "step": 15039 + }, + { + "epoch": 0.38059569299288915, + "grad_norm": 3.162398338317871, + "learning_rate": 6.923196891941923e-06, + "loss": 0.0749, + "step": 15040 + }, + { + "epoch": 0.3806209985575828, + "grad_norm": 4.461495876312256, + "learning_rate": 6.922826253565171e-06, + "loss": 0.2285, + "step": 15041 + }, + { + "epoch": 0.3806463041222765, + "grad_norm": 4.053493499755859, + "learning_rate": 6.922455602788469e-06, + "loss": 0.1886, + "step": 15042 + }, + { + "epoch": 0.38067160968697017, + "grad_norm": 6.466688632965088, + "learning_rate": 6.9220849396142106e-06, + "loss": 0.1154, + "step": 15043 + }, + { + "epoch": 0.3806969152516638, + "grad_norm": 3.8676698207855225, + "learning_rate": 6.921714264044782e-06, + "loss": 0.1298, + "step": 15044 + }, + { + "epoch": 0.38072222081635754, + "grad_norm": 6.6984944343566895, + "learning_rate": 6.921343576082578e-06, + "loss": 0.307, + "step": 15045 + }, + { + "epoch": 0.3807475263810512, + "grad_norm": 8.62158489227295, + "learning_rate": 6.9209728757299834e-06, + "loss": 0.1518, + "step": 15046 + }, + { + "epoch": 0.38077283194574485, + "grad_norm": 11.35078239440918, + "learning_rate": 6.920602162989394e-06, + "loss": 0.1906, + "step": 15047 + }, + { + "epoch": 0.38079813751043856, + "grad_norm": 9.46497917175293, + "learning_rate": 6.9202314378631976e-06, + "loss": 0.2377, + "step": 15048 + }, + { + "epoch": 0.3808234430751322, + "grad_norm": 8.039145469665527, + "learning_rate": 6.919860700353787e-06, + "loss": 0.2412, + "step": 15049 + }, + { + "epoch": 0.3808487486398259, + "grad_norm": 3.3472185134887695, + "learning_rate": 6.91948995046355e-06, + "loss": 0.1182, + "step": 15050 + }, + { + "epoch": 0.3808740542045196, + "grad_norm": 10.724239349365234, + "learning_rate": 6.91911918819488e-06, + "loss": 0.284, + "step": 15051 + }, + { + "epoch": 0.38089935976921324, + "grad_norm": 19.176029205322266, + "learning_rate": 6.91874841355017e-06, + "loss": 0.2054, + "step": 15052 + }, + { + "epoch": 0.38092466533390695, + "grad_norm": 14.195915222167969, + "learning_rate": 6.918377626531805e-06, + "loss": 0.2865, + "step": 15053 + }, + { + "epoch": 0.3809499708986006, + "grad_norm": 6.298263072967529, + "learning_rate": 6.91800682714218e-06, + "loss": 0.2092, + "step": 15054 + }, + { + "epoch": 0.38097527646329427, + "grad_norm": 3.4056921005249023, + "learning_rate": 6.9176360153836885e-06, + "loss": 0.1798, + "step": 15055 + }, + { + "epoch": 0.381000582027988, + "grad_norm": 5.622430801391602, + "learning_rate": 6.917265191258716e-06, + "loss": 0.2013, + "step": 15056 + }, + { + "epoch": 0.38102588759268163, + "grad_norm": 5.828993320465088, + "learning_rate": 6.9168943547696585e-06, + "loss": 0.2209, + "step": 15057 + }, + { + "epoch": 0.3810511931573753, + "grad_norm": 10.616520881652832, + "learning_rate": 6.916523505918906e-06, + "loss": 0.3589, + "step": 15058 + }, + { + "epoch": 0.381076498722069, + "grad_norm": 6.093019962310791, + "learning_rate": 6.9161526447088485e-06, + "loss": 0.2457, + "step": 15059 + }, + { + "epoch": 0.38110180428676266, + "grad_norm": 5.987102508544922, + "learning_rate": 6.9157817711418805e-06, + "loss": 0.1909, + "step": 15060 + }, + { + "epoch": 0.3811271098514563, + "grad_norm": 3.460277557373047, + "learning_rate": 6.9154108852203915e-06, + "loss": 0.1113, + "step": 15061 + }, + { + "epoch": 0.38115241541615, + "grad_norm": 4.006099700927734, + "learning_rate": 6.915039986946774e-06, + "loss": 0.13, + "step": 15062 + }, + { + "epoch": 0.3811777209808437, + "grad_norm": 10.295079231262207, + "learning_rate": 6.91466907632342e-06, + "loss": 0.1958, + "step": 15063 + }, + { + "epoch": 0.38120302654553734, + "grad_norm": 21.025381088256836, + "learning_rate": 6.914298153352721e-06, + "loss": 0.2783, + "step": 15064 + }, + { + "epoch": 0.38122833211023105, + "grad_norm": 5.080828666687012, + "learning_rate": 6.91392721803707e-06, + "loss": 0.1435, + "step": 15065 + }, + { + "epoch": 0.3812536376749247, + "grad_norm": 5.382139205932617, + "learning_rate": 6.913556270378857e-06, + "loss": 0.1943, + "step": 15066 + }, + { + "epoch": 0.3812789432396184, + "grad_norm": 8.702701568603516, + "learning_rate": 6.913185310380477e-06, + "loss": 0.2008, + "step": 15067 + }, + { + "epoch": 0.3813042488043121, + "grad_norm": 3.828641414642334, + "learning_rate": 6.91281433804432e-06, + "loss": 0.1293, + "step": 15068 + }, + { + "epoch": 0.38132955436900573, + "grad_norm": 3.6268324851989746, + "learning_rate": 6.9124433533727785e-06, + "loss": 0.1365, + "step": 15069 + }, + { + "epoch": 0.38135485993369944, + "grad_norm": 8.783693313598633, + "learning_rate": 6.912072356368248e-06, + "loss": 0.2961, + "step": 15070 + }, + { + "epoch": 0.3813801654983931, + "grad_norm": 5.625563144683838, + "learning_rate": 6.911701347033117e-06, + "loss": 0.1736, + "step": 15071 + }, + { + "epoch": 0.38140547106308675, + "grad_norm": 6.701103687286377, + "learning_rate": 6.91133032536978e-06, + "loss": 0.1948, + "step": 15072 + }, + { + "epoch": 0.38143077662778047, + "grad_norm": 5.039541244506836, + "learning_rate": 6.910959291380628e-06, + "loss": 0.2313, + "step": 15073 + }, + { + "epoch": 0.3814560821924741, + "grad_norm": 3.7289810180664062, + "learning_rate": 6.9105882450680565e-06, + "loss": 0.1422, + "step": 15074 + }, + { + "epoch": 0.3814813877571678, + "grad_norm": 3.496492862701416, + "learning_rate": 6.910217186434455e-06, + "loss": 0.1198, + "step": 15075 + }, + { + "epoch": 0.3815066933218615, + "grad_norm": 3.4071297645568848, + "learning_rate": 6.909846115482219e-06, + "loss": 0.1675, + "step": 15076 + }, + { + "epoch": 0.38153199888655515, + "grad_norm": 5.021637916564941, + "learning_rate": 6.909475032213742e-06, + "loss": 0.2179, + "step": 15077 + }, + { + "epoch": 0.3815573044512488, + "grad_norm": 8.325719833374023, + "learning_rate": 6.909103936631415e-06, + "loss": 0.2459, + "step": 15078 + }, + { + "epoch": 0.3815826100159425, + "grad_norm": 2.877882957458496, + "learning_rate": 6.908732828737631e-06, + "loss": 0.1775, + "step": 15079 + }, + { + "epoch": 0.38160791558063617, + "grad_norm": 8.277984619140625, + "learning_rate": 6.908361708534785e-06, + "loss": 0.2244, + "step": 15080 + }, + { + "epoch": 0.3816332211453299, + "grad_norm": 2.814387321472168, + "learning_rate": 6.907990576025269e-06, + "loss": 0.1076, + "step": 15081 + }, + { + "epoch": 0.38165852671002354, + "grad_norm": 9.512349128723145, + "learning_rate": 6.907619431211475e-06, + "loss": 0.2398, + "step": 15082 + }, + { + "epoch": 0.3816838322747172, + "grad_norm": 2.348180055618286, + "learning_rate": 6.907248274095801e-06, + "loss": 0.1199, + "step": 15083 + }, + { + "epoch": 0.3817091378394109, + "grad_norm": 4.92818021774292, + "learning_rate": 6.906877104680637e-06, + "loss": 0.1605, + "step": 15084 + }, + { + "epoch": 0.38173444340410456, + "grad_norm": 3.920013666152954, + "learning_rate": 6.906505922968376e-06, + "loss": 0.1785, + "step": 15085 + }, + { + "epoch": 0.3817597489687982, + "grad_norm": 4.2582011222839355, + "learning_rate": 6.906134728961414e-06, + "loss": 0.2144, + "step": 15086 + }, + { + "epoch": 0.38178505453349193, + "grad_norm": 19.934267044067383, + "learning_rate": 6.905763522662143e-06, + "loss": 0.2286, + "step": 15087 + }, + { + "epoch": 0.3818103600981856, + "grad_norm": 5.247166156768799, + "learning_rate": 6.905392304072958e-06, + "loss": 0.2051, + "step": 15088 + }, + { + "epoch": 0.38183566566287924, + "grad_norm": 3.7012481689453125, + "learning_rate": 6.905021073196253e-06, + "loss": 0.1826, + "step": 15089 + }, + { + "epoch": 0.38186097122757295, + "grad_norm": 2.8195254802703857, + "learning_rate": 6.90464983003442e-06, + "loss": 0.1074, + "step": 15090 + }, + { + "epoch": 0.3818862767922666, + "grad_norm": 3.464019536972046, + "learning_rate": 6.904278574589856e-06, + "loss": 0.1361, + "step": 15091 + }, + { + "epoch": 0.3819115823569603, + "grad_norm": 5.1509904861450195, + "learning_rate": 6.903907306864953e-06, + "loss": 0.1558, + "step": 15092 + }, + { + "epoch": 0.381936887921654, + "grad_norm": 7.13140344619751, + "learning_rate": 6.903536026862105e-06, + "loss": 0.1661, + "step": 15093 + }, + { + "epoch": 0.38196219348634763, + "grad_norm": 7.053470611572266, + "learning_rate": 6.9031647345837095e-06, + "loss": 0.1618, + "step": 15094 + }, + { + "epoch": 0.38198749905104135, + "grad_norm": 4.084783554077148, + "learning_rate": 6.902793430032156e-06, + "loss": 0.1088, + "step": 15095 + }, + { + "epoch": 0.382012804615735, + "grad_norm": 9.97586727142334, + "learning_rate": 6.902422113209843e-06, + "loss": 0.2017, + "step": 15096 + }, + { + "epoch": 0.38203811018042866, + "grad_norm": 6.555049896240234, + "learning_rate": 6.9020507841191646e-06, + "loss": 0.2101, + "step": 15097 + }, + { + "epoch": 0.38206341574512237, + "grad_norm": 5.5203046798706055, + "learning_rate": 6.9016794427625136e-06, + "loss": 0.1775, + "step": 15098 + }, + { + "epoch": 0.382088721309816, + "grad_norm": 3.771228075027466, + "learning_rate": 6.901308089142285e-06, + "loss": 0.1263, + "step": 15099 + }, + { + "epoch": 0.3821140268745097, + "grad_norm": 6.73240852355957, + "learning_rate": 6.900936723260874e-06, + "loss": 0.176, + "step": 15100 + }, + { + "epoch": 0.3821393324392034, + "grad_norm": 5.741216659545898, + "learning_rate": 6.900565345120676e-06, + "loss": 0.2201, + "step": 15101 + }, + { + "epoch": 0.38216463800389705, + "grad_norm": 3.234269380569458, + "learning_rate": 6.900193954724086e-06, + "loss": 0.1664, + "step": 15102 + }, + { + "epoch": 0.3821899435685907, + "grad_norm": 3.253297805786133, + "learning_rate": 6.8998225520735e-06, + "loss": 0.1211, + "step": 15103 + }, + { + "epoch": 0.3822152491332844, + "grad_norm": 7.226421356201172, + "learning_rate": 6.899451137171309e-06, + "loss": 0.264, + "step": 15104 + }, + { + "epoch": 0.3822405546979781, + "grad_norm": 6.40209436416626, + "learning_rate": 6.899079710019914e-06, + "loss": 0.1665, + "step": 15105 + }, + { + "epoch": 0.3822658602626718, + "grad_norm": 7.980622291564941, + "learning_rate": 6.898708270621705e-06, + "loss": 0.1347, + "step": 15106 + }, + { + "epoch": 0.38229116582736544, + "grad_norm": 2.315612554550171, + "learning_rate": 6.89833681897908e-06, + "loss": 0.1358, + "step": 15107 + }, + { + "epoch": 0.3823164713920591, + "grad_norm": 10.846545219421387, + "learning_rate": 6.897965355094434e-06, + "loss": 0.147, + "step": 15108 + }, + { + "epoch": 0.3823417769567528, + "grad_norm": 5.397785663604736, + "learning_rate": 6.8975938789701634e-06, + "loss": 0.1787, + "step": 15109 + }, + { + "epoch": 0.38236708252144647, + "grad_norm": 9.167792320251465, + "learning_rate": 6.897222390608661e-06, + "loss": 0.2225, + "step": 15110 + }, + { + "epoch": 0.3823923880861401, + "grad_norm": 3.964975595474243, + "learning_rate": 6.8968508900123255e-06, + "loss": 0.1411, + "step": 15111 + }, + { + "epoch": 0.38241769365083383, + "grad_norm": 4.116896629333496, + "learning_rate": 6.896479377183551e-06, + "loss": 0.1865, + "step": 15112 + }, + { + "epoch": 0.3824429992155275, + "grad_norm": 5.350859642028809, + "learning_rate": 6.896107852124735e-06, + "loss": 0.1379, + "step": 15113 + }, + { + "epoch": 0.38246830478022115, + "grad_norm": 10.711161613464355, + "learning_rate": 6.8957363148382705e-06, + "loss": 0.3545, + "step": 15114 + }, + { + "epoch": 0.38249361034491486, + "grad_norm": 4.200926780700684, + "learning_rate": 6.895364765326557e-06, + "loss": 0.2169, + "step": 15115 + }, + { + "epoch": 0.3825189159096085, + "grad_norm": 7.837900638580322, + "learning_rate": 6.894993203591987e-06, + "loss": 0.1294, + "step": 15116 + }, + { + "epoch": 0.3825442214743022, + "grad_norm": 6.233958721160889, + "learning_rate": 6.894621629636959e-06, + "loss": 0.193, + "step": 15117 + }, + { + "epoch": 0.3825695270389959, + "grad_norm": 3.695158004760742, + "learning_rate": 6.8942500434638695e-06, + "loss": 0.1647, + "step": 15118 + }, + { + "epoch": 0.38259483260368954, + "grad_norm": 16.458356857299805, + "learning_rate": 6.893878445075111e-06, + "loss": 0.1937, + "step": 15119 + }, + { + "epoch": 0.38262013816838325, + "grad_norm": 5.91002082824707, + "learning_rate": 6.893506834473086e-06, + "loss": 0.2286, + "step": 15120 + }, + { + "epoch": 0.3826454437330769, + "grad_norm": 10.179964065551758, + "learning_rate": 6.893135211660184e-06, + "loss": 0.1798, + "step": 15121 + }, + { + "epoch": 0.38267074929777056, + "grad_norm": 4.990475177764893, + "learning_rate": 6.892763576638808e-06, + "loss": 0.149, + "step": 15122 + }, + { + "epoch": 0.3826960548624643, + "grad_norm": 12.91901683807373, + "learning_rate": 6.8923919294113515e-06, + "loss": 0.3726, + "step": 15123 + }, + { + "epoch": 0.38272136042715793, + "grad_norm": 9.912290573120117, + "learning_rate": 6.892020269980211e-06, + "loss": 0.1672, + "step": 15124 + }, + { + "epoch": 0.3827466659918516, + "grad_norm": 3.0549166202545166, + "learning_rate": 6.891648598347785e-06, + "loss": 0.1535, + "step": 15125 + }, + { + "epoch": 0.3827719715565453, + "grad_norm": 5.62178897857666, + "learning_rate": 6.891276914516466e-06, + "loss": 0.1577, + "step": 15126 + }, + { + "epoch": 0.38279727712123895, + "grad_norm": 4.190532207489014, + "learning_rate": 6.890905218488657e-06, + "loss": 0.1863, + "step": 15127 + }, + { + "epoch": 0.3828225826859326, + "grad_norm": 4.139204978942871, + "learning_rate": 6.89053351026675e-06, + "loss": 0.1377, + "step": 15128 + }, + { + "epoch": 0.3828478882506263, + "grad_norm": 4.928879261016846, + "learning_rate": 6.890161789853145e-06, + "loss": 0.2034, + "step": 15129 + }, + { + "epoch": 0.38287319381532, + "grad_norm": 15.844016075134277, + "learning_rate": 6.889790057250238e-06, + "loss": 0.2232, + "step": 15130 + }, + { + "epoch": 0.3828984993800137, + "grad_norm": 4.102311134338379, + "learning_rate": 6.889418312460427e-06, + "loss": 0.1711, + "step": 15131 + }, + { + "epoch": 0.38292380494470735, + "grad_norm": 4.2905402183532715, + "learning_rate": 6.889046555486107e-06, + "loss": 0.1496, + "step": 15132 + }, + { + "epoch": 0.382949110509401, + "grad_norm": 3.611222505569458, + "learning_rate": 6.888674786329678e-06, + "loss": 0.1011, + "step": 15133 + }, + { + "epoch": 0.3829744160740947, + "grad_norm": 6.473349571228027, + "learning_rate": 6.8883030049935375e-06, + "loss": 0.2361, + "step": 15134 + }, + { + "epoch": 0.38299972163878837, + "grad_norm": 3.400311231613159, + "learning_rate": 6.887931211480081e-06, + "loss": 0.1174, + "step": 15135 + }, + { + "epoch": 0.383025027203482, + "grad_norm": 2.68160343170166, + "learning_rate": 6.887559405791708e-06, + "loss": 0.1124, + "step": 15136 + }, + { + "epoch": 0.38305033276817574, + "grad_norm": 5.320291042327881, + "learning_rate": 6.887187587930816e-06, + "loss": 0.1499, + "step": 15137 + }, + { + "epoch": 0.3830756383328694, + "grad_norm": 4.229643821716309, + "learning_rate": 6.8868157578998016e-06, + "loss": 0.1406, + "step": 15138 + }, + { + "epoch": 0.38310094389756305, + "grad_norm": 4.5967631340026855, + "learning_rate": 6.886443915701065e-06, + "loss": 0.1919, + "step": 15139 + }, + { + "epoch": 0.38312624946225676, + "grad_norm": 11.023869514465332, + "learning_rate": 6.886072061337e-06, + "loss": 0.1637, + "step": 15140 + }, + { + "epoch": 0.3831515550269504, + "grad_norm": 3.41326904296875, + "learning_rate": 6.885700194810008e-06, + "loss": 0.1201, + "step": 15141 + }, + { + "epoch": 0.3831768605916441, + "grad_norm": 5.643435478210449, + "learning_rate": 6.885328316122486e-06, + "loss": 0.2261, + "step": 15142 + }, + { + "epoch": 0.3832021661563378, + "grad_norm": 5.628629684448242, + "learning_rate": 6.884956425276833e-06, + "loss": 0.1883, + "step": 15143 + }, + { + "epoch": 0.38322747172103144, + "grad_norm": 3.5868871212005615, + "learning_rate": 6.884584522275447e-06, + "loss": 0.1382, + "step": 15144 + }, + { + "epoch": 0.38325277728572515, + "grad_norm": 4.700436592102051, + "learning_rate": 6.884212607120726e-06, + "loss": 0.1316, + "step": 15145 + }, + { + "epoch": 0.3832780828504188, + "grad_norm": 5.068686008453369, + "learning_rate": 6.883840679815067e-06, + "loss": 0.2806, + "step": 15146 + }, + { + "epoch": 0.38330338841511247, + "grad_norm": 5.165341377258301, + "learning_rate": 6.8834687403608715e-06, + "loss": 0.0964, + "step": 15147 + }, + { + "epoch": 0.3833286939798062, + "grad_norm": 5.7451090812683105, + "learning_rate": 6.883096788760535e-06, + "loss": 0.1572, + "step": 15148 + }, + { + "epoch": 0.38335399954449983, + "grad_norm": 4.226415157318115, + "learning_rate": 6.882724825016458e-06, + "loss": 0.1882, + "step": 15149 + }, + { + "epoch": 0.3833793051091935, + "grad_norm": 2.7576651573181152, + "learning_rate": 6.8823528491310384e-06, + "loss": 0.0866, + "step": 15150 + }, + { + "epoch": 0.3834046106738872, + "grad_norm": 4.995689868927002, + "learning_rate": 6.8819808611066764e-06, + "loss": 0.1752, + "step": 15151 + }, + { + "epoch": 0.38342991623858086, + "grad_norm": 4.623729228973389, + "learning_rate": 6.88160886094577e-06, + "loss": 0.2027, + "step": 15152 + }, + { + "epoch": 0.3834552218032745, + "grad_norm": 5.472260475158691, + "learning_rate": 6.881236848650717e-06, + "loss": 0.1572, + "step": 15153 + }, + { + "epoch": 0.3834805273679682, + "grad_norm": 7.46604061126709, + "learning_rate": 6.880864824223918e-06, + "loss": 0.0825, + "step": 15154 + }, + { + "epoch": 0.3835058329326619, + "grad_norm": 3.1979899406433105, + "learning_rate": 6.880492787667772e-06, + "loss": 0.1046, + "step": 15155 + }, + { + "epoch": 0.3835311384973556, + "grad_norm": 8.752520561218262, + "learning_rate": 6.880120738984677e-06, + "loss": 0.1821, + "step": 15156 + }, + { + "epoch": 0.38355644406204925, + "grad_norm": 3.226004123687744, + "learning_rate": 6.879748678177034e-06, + "loss": 0.1331, + "step": 15157 + }, + { + "epoch": 0.3835817496267429, + "grad_norm": 5.285010814666748, + "learning_rate": 6.879376605247241e-06, + "loss": 0.2038, + "step": 15158 + }, + { + "epoch": 0.3836070551914366, + "grad_norm": 4.707796573638916, + "learning_rate": 6.8790045201976975e-06, + "loss": 0.1884, + "step": 15159 + }, + { + "epoch": 0.3836323607561303, + "grad_norm": 4.72330379486084, + "learning_rate": 6.878632423030804e-06, + "loss": 0.1497, + "step": 15160 + }, + { + "epoch": 0.38365766632082393, + "grad_norm": 6.3471150398254395, + "learning_rate": 6.878260313748959e-06, + "loss": 0.1364, + "step": 15161 + }, + { + "epoch": 0.38368297188551764, + "grad_norm": 7.164623737335205, + "learning_rate": 6.877888192354562e-06, + "loss": 0.1794, + "step": 15162 + }, + { + "epoch": 0.3837082774502113, + "grad_norm": 3.786081314086914, + "learning_rate": 6.877516058850015e-06, + "loss": 0.1422, + "step": 15163 + }, + { + "epoch": 0.38373358301490496, + "grad_norm": 4.703909397125244, + "learning_rate": 6.877143913237714e-06, + "loss": 0.1501, + "step": 15164 + }, + { + "epoch": 0.38375888857959867, + "grad_norm": 14.389296531677246, + "learning_rate": 6.876771755520063e-06, + "loss": 0.2745, + "step": 15165 + }, + { + "epoch": 0.3837841941442923, + "grad_norm": 5.618731498718262, + "learning_rate": 6.876399585699459e-06, + "loss": 0.182, + "step": 15166 + }, + { + "epoch": 0.383809499708986, + "grad_norm": 8.712013244628906, + "learning_rate": 6.876027403778305e-06, + "loss": 0.119, + "step": 15167 + }, + { + "epoch": 0.3838348052736797, + "grad_norm": 4.237080097198486, + "learning_rate": 6.875655209758997e-06, + "loss": 0.1724, + "step": 15168 + }, + { + "epoch": 0.38386011083837335, + "grad_norm": 7.218325138092041, + "learning_rate": 6.8752830036439385e-06, + "loss": 0.215, + "step": 15169 + }, + { + "epoch": 0.38388541640306706, + "grad_norm": 2.744220495223999, + "learning_rate": 6.874910785435528e-06, + "loss": 0.0834, + "step": 15170 + }, + { + "epoch": 0.3839107219677607, + "grad_norm": 4.212800025939941, + "learning_rate": 6.8745385551361675e-06, + "loss": 0.1378, + "step": 15171 + }, + { + "epoch": 0.38393602753245437, + "grad_norm": 9.492410659790039, + "learning_rate": 6.874166312748256e-06, + "loss": 0.1741, + "step": 15172 + }, + { + "epoch": 0.3839613330971481, + "grad_norm": 2.8210630416870117, + "learning_rate": 6.873794058274194e-06, + "loss": 0.1587, + "step": 15173 + }, + { + "epoch": 0.38398663866184174, + "grad_norm": 16.622880935668945, + "learning_rate": 6.873421791716386e-06, + "loss": 0.2811, + "step": 15174 + }, + { + "epoch": 0.3840119442265354, + "grad_norm": 8.065581321716309, + "learning_rate": 6.873049513077225e-06, + "loss": 0.1827, + "step": 15175 + }, + { + "epoch": 0.3840372497912291, + "grad_norm": 7.604254722595215, + "learning_rate": 6.872677222359119e-06, + "loss": 0.2067, + "step": 15176 + }, + { + "epoch": 0.38406255535592276, + "grad_norm": 3.1133668422698975, + "learning_rate": 6.8723049195644644e-06, + "loss": 0.1169, + "step": 15177 + }, + { + "epoch": 0.3840878609206164, + "grad_norm": 6.255014896392822, + "learning_rate": 6.871932604695665e-06, + "loss": 0.167, + "step": 15178 + }, + { + "epoch": 0.38411316648531013, + "grad_norm": 5.525323390960693, + "learning_rate": 6.871560277755118e-06, + "loss": 0.2242, + "step": 15179 + }, + { + "epoch": 0.3841384720500038, + "grad_norm": 3.087406873703003, + "learning_rate": 6.871187938745229e-06, + "loss": 0.1613, + "step": 15180 + }, + { + "epoch": 0.3841637776146975, + "grad_norm": 7.821084976196289, + "learning_rate": 6.870815587668396e-06, + "loss": 0.2004, + "step": 15181 + }, + { + "epoch": 0.38418908317939116, + "grad_norm": 7.268228530883789, + "learning_rate": 6.870443224527023e-06, + "loss": 0.1848, + "step": 15182 + }, + { + "epoch": 0.3842143887440848, + "grad_norm": 5.1873393058776855, + "learning_rate": 6.8700708493235066e-06, + "loss": 0.1922, + "step": 15183 + }, + { + "epoch": 0.3842396943087785, + "grad_norm": 4.6749677658081055, + "learning_rate": 6.869698462060255e-06, + "loss": 0.1479, + "step": 15184 + }, + { + "epoch": 0.3842649998734722, + "grad_norm": 5.1352338790893555, + "learning_rate": 6.8693260627396625e-06, + "loss": 0.1708, + "step": 15185 + }, + { + "epoch": 0.38429030543816584, + "grad_norm": 3.4907991886138916, + "learning_rate": 6.868953651364134e-06, + "loss": 0.1289, + "step": 15186 + }, + { + "epoch": 0.38431561100285955, + "grad_norm": 5.377363204956055, + "learning_rate": 6.8685812279360734e-06, + "loss": 0.227, + "step": 15187 + }, + { + "epoch": 0.3843409165675532, + "grad_norm": 5.704521179199219, + "learning_rate": 6.8682087924578775e-06, + "loss": 0.1945, + "step": 15188 + }, + { + "epoch": 0.38436622213224686, + "grad_norm": 10.094426155090332, + "learning_rate": 6.867836344931953e-06, + "loss": 0.3134, + "step": 15189 + }, + { + "epoch": 0.38439152769694057, + "grad_norm": 5.520411491394043, + "learning_rate": 6.867463885360698e-06, + "loss": 0.1988, + "step": 15190 + }, + { + "epoch": 0.3844168332616342, + "grad_norm": 3.334933042526245, + "learning_rate": 6.867091413746516e-06, + "loss": 0.1482, + "step": 15191 + }, + { + "epoch": 0.3844421388263279, + "grad_norm": 2.9486746788024902, + "learning_rate": 6.866718930091809e-06, + "loss": 0.1343, + "step": 15192 + }, + { + "epoch": 0.3844674443910216, + "grad_norm": 10.768167495727539, + "learning_rate": 6.866346434398978e-06, + "loss": 0.2081, + "step": 15193 + }, + { + "epoch": 0.38449274995571525, + "grad_norm": 17.630659103393555, + "learning_rate": 6.865973926670427e-06, + "loss": 0.3402, + "step": 15194 + }, + { + "epoch": 0.38451805552040896, + "grad_norm": 5.010204792022705, + "learning_rate": 6.865601406908557e-06, + "loss": 0.1455, + "step": 15195 + }, + { + "epoch": 0.3845433610851026, + "grad_norm": 2.1097216606140137, + "learning_rate": 6.8652288751157705e-06, + "loss": 0.083, + "step": 15196 + }, + { + "epoch": 0.3845686666497963, + "grad_norm": 5.249341011047363, + "learning_rate": 6.8648563312944695e-06, + "loss": 0.1775, + "step": 15197 + }, + { + "epoch": 0.38459397221449, + "grad_norm": 7.108743190765381, + "learning_rate": 6.864483775447057e-06, + "loss": 0.1362, + "step": 15198 + }, + { + "epoch": 0.38461927777918364, + "grad_norm": 6.106245517730713, + "learning_rate": 6.864111207575936e-06, + "loss": 0.2214, + "step": 15199 + }, + { + "epoch": 0.3846445833438773, + "grad_norm": 7.048960208892822, + "learning_rate": 6.8637386276835095e-06, + "loss": 0.148, + "step": 15200 + }, + { + "epoch": 0.384669888908571, + "grad_norm": 6.0395002365112305, + "learning_rate": 6.863366035772179e-06, + "loss": 0.192, + "step": 15201 + }, + { + "epoch": 0.38469519447326467, + "grad_norm": 3.676309823989868, + "learning_rate": 6.862993431844345e-06, + "loss": 0.1146, + "step": 15202 + }, + { + "epoch": 0.3847205000379583, + "grad_norm": 6.531455993652344, + "learning_rate": 6.862620815902415e-06, + "loss": 0.2052, + "step": 15203 + }, + { + "epoch": 0.38474580560265204, + "grad_norm": 4.283148765563965, + "learning_rate": 6.862248187948789e-06, + "loss": 0.1944, + "step": 15204 + }, + { + "epoch": 0.3847711111673457, + "grad_norm": 3.5193700790405273, + "learning_rate": 6.8618755479858725e-06, + "loss": 0.116, + "step": 15205 + }, + { + "epoch": 0.38479641673203935, + "grad_norm": 2.671785593032837, + "learning_rate": 6.8615028960160655e-06, + "loss": 0.1658, + "step": 15206 + }, + { + "epoch": 0.38482172229673306, + "grad_norm": 3.1920785903930664, + "learning_rate": 6.861130232041773e-06, + "loss": 0.124, + "step": 15207 + }, + { + "epoch": 0.3848470278614267, + "grad_norm": 3.1512935161590576, + "learning_rate": 6.860757556065396e-06, + "loss": 0.1123, + "step": 15208 + }, + { + "epoch": 0.3848723334261204, + "grad_norm": 3.2122082710266113, + "learning_rate": 6.860384868089343e-06, + "loss": 0.1308, + "step": 15209 + }, + { + "epoch": 0.3848976389908141, + "grad_norm": 3.537140369415283, + "learning_rate": 6.860012168116012e-06, + "loss": 0.1017, + "step": 15210 + }, + { + "epoch": 0.38492294455550774, + "grad_norm": 6.991912364959717, + "learning_rate": 6.8596394561478084e-06, + "loss": 0.1687, + "step": 15211 + }, + { + "epoch": 0.38494825012020145, + "grad_norm": 11.423884391784668, + "learning_rate": 6.859266732187136e-06, + "loss": 0.3035, + "step": 15212 + }, + { + "epoch": 0.3849735556848951, + "grad_norm": 4.200794696807861, + "learning_rate": 6.8588939962364e-06, + "loss": 0.1397, + "step": 15213 + }, + { + "epoch": 0.38499886124958876, + "grad_norm": 3.940643072128296, + "learning_rate": 6.858521248298e-06, + "loss": 0.1162, + "step": 15214 + }, + { + "epoch": 0.3850241668142825, + "grad_norm": 3.1087398529052734, + "learning_rate": 6.858148488374343e-06, + "loss": 0.1415, + "step": 15215 + }, + { + "epoch": 0.38504947237897613, + "grad_norm": 4.617380142211914, + "learning_rate": 6.857775716467832e-06, + "loss": 0.098, + "step": 15216 + }, + { + "epoch": 0.3850747779436698, + "grad_norm": 13.931717872619629, + "learning_rate": 6.857402932580871e-06, + "loss": 0.3429, + "step": 15217 + }, + { + "epoch": 0.3851000835083635, + "grad_norm": 3.0048370361328125, + "learning_rate": 6.857030136715865e-06, + "loss": 0.0811, + "step": 15218 + }, + { + "epoch": 0.38512538907305716, + "grad_norm": 4.71550989151001, + "learning_rate": 6.8566573288752156e-06, + "loss": 0.1596, + "step": 15219 + }, + { + "epoch": 0.38515069463775087, + "grad_norm": 14.26432991027832, + "learning_rate": 6.85628450906133e-06, + "loss": 0.185, + "step": 15220 + }, + { + "epoch": 0.3851760002024445, + "grad_norm": 16.002431869506836, + "learning_rate": 6.8559116772766085e-06, + "loss": 0.2791, + "step": 15221 + }, + { + "epoch": 0.3852013057671382, + "grad_norm": 4.15530252456665, + "learning_rate": 6.855538833523461e-06, + "loss": 0.1685, + "step": 15222 + }, + { + "epoch": 0.3852266113318319, + "grad_norm": 4.7045392990112305, + "learning_rate": 6.8551659778042855e-06, + "loss": 0.185, + "step": 15223 + }, + { + "epoch": 0.38525191689652555, + "grad_norm": 11.646505355834961, + "learning_rate": 6.854793110121491e-06, + "loss": 0.1856, + "step": 15224 + }, + { + "epoch": 0.3852772224612192, + "grad_norm": 3.718644857406616, + "learning_rate": 6.854420230477481e-06, + "loss": 0.189, + "step": 15225 + }, + { + "epoch": 0.3853025280259129, + "grad_norm": 2.8177967071533203, + "learning_rate": 6.854047338874661e-06, + "loss": 0.1325, + "step": 15226 + }, + { + "epoch": 0.38532783359060657, + "grad_norm": 6.039247035980225, + "learning_rate": 6.853674435315432e-06, + "loss": 0.1636, + "step": 15227 + }, + { + "epoch": 0.38535313915530023, + "grad_norm": 3.022486925125122, + "learning_rate": 6.853301519802203e-06, + "loss": 0.0948, + "step": 15228 + }, + { + "epoch": 0.38537844471999394, + "grad_norm": 5.839437007904053, + "learning_rate": 6.852928592337378e-06, + "loss": 0.1901, + "step": 15229 + }, + { + "epoch": 0.3854037502846876, + "grad_norm": 5.279316425323486, + "learning_rate": 6.852555652923361e-06, + "loss": 0.1657, + "step": 15230 + }, + { + "epoch": 0.38542905584938125, + "grad_norm": 16.67914390563965, + "learning_rate": 6.852182701562555e-06, + "loss": 0.1552, + "step": 15231 + }, + { + "epoch": 0.38545436141407496, + "grad_norm": 3.828744649887085, + "learning_rate": 6.85180973825737e-06, + "loss": 0.1401, + "step": 15232 + }, + { + "epoch": 0.3854796669787686, + "grad_norm": 9.267197608947754, + "learning_rate": 6.8514367630102065e-06, + "loss": 0.2389, + "step": 15233 + }, + { + "epoch": 0.38550497254346233, + "grad_norm": 5.035271644592285, + "learning_rate": 6.851063775823472e-06, + "loss": 0.1166, + "step": 15234 + }, + { + "epoch": 0.385530278108156, + "grad_norm": 4.170678615570068, + "learning_rate": 6.850690776699574e-06, + "loss": 0.1322, + "step": 15235 + }, + { + "epoch": 0.38555558367284964, + "grad_norm": 6.979654312133789, + "learning_rate": 6.850317765640913e-06, + "loss": 0.1509, + "step": 15236 + }, + { + "epoch": 0.38558088923754336, + "grad_norm": 5.825316905975342, + "learning_rate": 6.849944742649896e-06, + "loss": 0.2363, + "step": 15237 + }, + { + "epoch": 0.385606194802237, + "grad_norm": 5.959096908569336, + "learning_rate": 6.8495717077289316e-06, + "loss": 0.2086, + "step": 15238 + }, + { + "epoch": 0.38563150036693067, + "grad_norm": 3.643115997314453, + "learning_rate": 6.8491986608804215e-06, + "loss": 0.1698, + "step": 15239 + }, + { + "epoch": 0.3856568059316244, + "grad_norm": 3.7800350189208984, + "learning_rate": 6.848825602106774e-06, + "loss": 0.1337, + "step": 15240 + }, + { + "epoch": 0.38568211149631804, + "grad_norm": 5.326610565185547, + "learning_rate": 6.848452531410395e-06, + "loss": 0.1697, + "step": 15241 + }, + { + "epoch": 0.3857074170610117, + "grad_norm": 3.176562547683716, + "learning_rate": 6.848079448793688e-06, + "loss": 0.1415, + "step": 15242 + }, + { + "epoch": 0.3857327226257054, + "grad_norm": 4.841870307922363, + "learning_rate": 6.847706354259061e-06, + "loss": 0.1911, + "step": 15243 + }, + { + "epoch": 0.38575802819039906, + "grad_norm": 2.4146783351898193, + "learning_rate": 6.84733324780892e-06, + "loss": 0.1166, + "step": 15244 + }, + { + "epoch": 0.38578333375509277, + "grad_norm": 10.186796188354492, + "learning_rate": 6.84696012944567e-06, + "loss": 0.1989, + "step": 15245 + }, + { + "epoch": 0.38580863931978643, + "grad_norm": 6.278020858764648, + "learning_rate": 6.846586999171718e-06, + "loss": 0.1894, + "step": 15246 + }, + { + "epoch": 0.3858339448844801, + "grad_norm": 4.861022472381592, + "learning_rate": 6.84621385698947e-06, + "loss": 0.1362, + "step": 15247 + }, + { + "epoch": 0.3858592504491738, + "grad_norm": 3.1412413120269775, + "learning_rate": 6.845840702901332e-06, + "loss": 0.1589, + "step": 15248 + }, + { + "epoch": 0.38588455601386745, + "grad_norm": 4.2452826499938965, + "learning_rate": 6.84546753690971e-06, + "loss": 0.1752, + "step": 15249 + }, + { + "epoch": 0.3859098615785611, + "grad_norm": 3.453613758087158, + "learning_rate": 6.845094359017011e-06, + "loss": 0.1395, + "step": 15250 + }, + { + "epoch": 0.3859351671432548, + "grad_norm": 3.8258230686187744, + "learning_rate": 6.8447211692256436e-06, + "loss": 0.1408, + "step": 15251 + }, + { + "epoch": 0.3859604727079485, + "grad_norm": 4.840778827667236, + "learning_rate": 6.844347967538011e-06, + "loss": 0.2223, + "step": 15252 + }, + { + "epoch": 0.38598577827264213, + "grad_norm": 3.900656223297119, + "learning_rate": 6.843974753956522e-06, + "loss": 0.1457, + "step": 15253 + }, + { + "epoch": 0.38601108383733584, + "grad_norm": 14.408124923706055, + "learning_rate": 6.843601528483583e-06, + "loss": 0.1976, + "step": 15254 + }, + { + "epoch": 0.3860363894020295, + "grad_norm": 7.209978103637695, + "learning_rate": 6.843228291121599e-06, + "loss": 0.201, + "step": 15255 + }, + { + "epoch": 0.38606169496672316, + "grad_norm": 5.705668926239014, + "learning_rate": 6.84285504187298e-06, + "loss": 0.1567, + "step": 15256 + }, + { + "epoch": 0.38608700053141687, + "grad_norm": 9.503933906555176, + "learning_rate": 6.842481780740134e-06, + "loss": 0.2411, + "step": 15257 + }, + { + "epoch": 0.3861123060961105, + "grad_norm": 7.71392822265625, + "learning_rate": 6.8421085077254614e-06, + "loss": 0.1949, + "step": 15258 + }, + { + "epoch": 0.38613761166080424, + "grad_norm": 2.433742046356201, + "learning_rate": 6.841735222831376e-06, + "loss": 0.1147, + "step": 15259 + }, + { + "epoch": 0.3861629172254979, + "grad_norm": 10.862592697143555, + "learning_rate": 6.841361926060281e-06, + "loss": 0.1727, + "step": 15260 + }, + { + "epoch": 0.38618822279019155, + "grad_norm": 4.29536247253418, + "learning_rate": 6.840988617414588e-06, + "loss": 0.1546, + "step": 15261 + }, + { + "epoch": 0.38621352835488526, + "grad_norm": 3.5191638469696045, + "learning_rate": 6.840615296896702e-06, + "loss": 0.1155, + "step": 15262 + }, + { + "epoch": 0.3862388339195789, + "grad_norm": 2.7564570903778076, + "learning_rate": 6.840241964509028e-06, + "loss": 0.1813, + "step": 15263 + }, + { + "epoch": 0.3862641394842726, + "grad_norm": 4.613610744476318, + "learning_rate": 6.839868620253977e-06, + "loss": 0.1815, + "step": 15264 + }, + { + "epoch": 0.3862894450489663, + "grad_norm": 6.498239517211914, + "learning_rate": 6.839495264133955e-06, + "loss": 0.1535, + "step": 15265 + }, + { + "epoch": 0.38631475061365994, + "grad_norm": 9.374258995056152, + "learning_rate": 6.839121896151371e-06, + "loss": 0.2371, + "step": 15266 + }, + { + "epoch": 0.3863400561783536, + "grad_norm": 9.22726058959961, + "learning_rate": 6.8387485163086305e-06, + "loss": 0.1793, + "step": 15267 + }, + { + "epoch": 0.3863653617430473, + "grad_norm": 3.5976386070251465, + "learning_rate": 6.8383751246081455e-06, + "loss": 0.1713, + "step": 15268 + }, + { + "epoch": 0.38639066730774096, + "grad_norm": 2.86002779006958, + "learning_rate": 6.838001721052319e-06, + "loss": 0.0893, + "step": 15269 + }, + { + "epoch": 0.3864159728724346, + "grad_norm": 8.977190017700195, + "learning_rate": 6.837628305643562e-06, + "loss": 0.2866, + "step": 15270 + }, + { + "epoch": 0.38644127843712833, + "grad_norm": 3.1975271701812744, + "learning_rate": 6.837254878384282e-06, + "loss": 0.113, + "step": 15271 + }, + { + "epoch": 0.386466584001822, + "grad_norm": 6.079636096954346, + "learning_rate": 6.836881439276886e-06, + "loss": 0.1683, + "step": 15272 + }, + { + "epoch": 0.3864918895665157, + "grad_norm": 6.0375657081604, + "learning_rate": 6.836507988323785e-06, + "loss": 0.2136, + "step": 15273 + }, + { + "epoch": 0.38651719513120936, + "grad_norm": 4.951823711395264, + "learning_rate": 6.836134525527385e-06, + "loss": 0.2381, + "step": 15274 + }, + { + "epoch": 0.386542500695903, + "grad_norm": 10.352021217346191, + "learning_rate": 6.835761050890095e-06, + "loss": 0.3185, + "step": 15275 + }, + { + "epoch": 0.3865678062605967, + "grad_norm": 7.667544364929199, + "learning_rate": 6.835387564414322e-06, + "loss": 0.258, + "step": 15276 + }, + { + "epoch": 0.3865931118252904, + "grad_norm": 7.042190074920654, + "learning_rate": 6.835014066102478e-06, + "loss": 0.1714, + "step": 15277 + }, + { + "epoch": 0.38661841738998404, + "grad_norm": 4.227297306060791, + "learning_rate": 6.834640555956968e-06, + "loss": 0.1724, + "step": 15278 + }, + { + "epoch": 0.38664372295467775, + "grad_norm": 9.144803047180176, + "learning_rate": 6.834267033980204e-06, + "loss": 0.1565, + "step": 15279 + }, + { + "epoch": 0.3866690285193714, + "grad_norm": 4.2505269050598145, + "learning_rate": 6.833893500174592e-06, + "loss": 0.1732, + "step": 15280 + }, + { + "epoch": 0.38669433408406506, + "grad_norm": 5.928356170654297, + "learning_rate": 6.8335199545425424e-06, + "loss": 0.1731, + "step": 15281 + }, + { + "epoch": 0.3867196396487588, + "grad_norm": 3.1816139221191406, + "learning_rate": 6.833146397086463e-06, + "loss": 0.1142, + "step": 15282 + }, + { + "epoch": 0.38674494521345243, + "grad_norm": 3.7427964210510254, + "learning_rate": 6.832772827808765e-06, + "loss": 0.1914, + "step": 15283 + }, + { + "epoch": 0.38677025077814614, + "grad_norm": 4.570962429046631, + "learning_rate": 6.832399246711854e-06, + "loss": 0.1793, + "step": 15284 + }, + { + "epoch": 0.3867955563428398, + "grad_norm": 4.769894123077393, + "learning_rate": 6.832025653798142e-06, + "loss": 0.171, + "step": 15285 + }, + { + "epoch": 0.38682086190753345, + "grad_norm": 4.528384208679199, + "learning_rate": 6.831652049070038e-06, + "loss": 0.0833, + "step": 15286 + }, + { + "epoch": 0.38684616747222716, + "grad_norm": 4.060110569000244, + "learning_rate": 6.831278432529949e-06, + "loss": 0.1705, + "step": 15287 + }, + { + "epoch": 0.3868714730369208, + "grad_norm": 5.174388408660889, + "learning_rate": 6.830904804180289e-06, + "loss": 0.2015, + "step": 15288 + }, + { + "epoch": 0.3868967786016145, + "grad_norm": 12.449336051940918, + "learning_rate": 6.830531164023462e-06, + "loss": 0.2368, + "step": 15289 + }, + { + "epoch": 0.3869220841663082, + "grad_norm": 7.662501335144043, + "learning_rate": 6.83015751206188e-06, + "loss": 0.3277, + "step": 15290 + }, + { + "epoch": 0.38694738973100185, + "grad_norm": 5.540419578552246, + "learning_rate": 6.829783848297954e-06, + "loss": 0.2282, + "step": 15291 + }, + { + "epoch": 0.3869726952956955, + "grad_norm": 4.454988479614258, + "learning_rate": 6.8294101727340915e-06, + "loss": 0.2053, + "step": 15292 + }, + { + "epoch": 0.3869980008603892, + "grad_norm": 3.771087408065796, + "learning_rate": 6.829036485372703e-06, + "loss": 0.1542, + "step": 15293 + }, + { + "epoch": 0.38702330642508287, + "grad_norm": 8.490189552307129, + "learning_rate": 6.828662786216198e-06, + "loss": 0.2672, + "step": 15294 + }, + { + "epoch": 0.3870486119897765, + "grad_norm": 3.447559118270874, + "learning_rate": 6.828289075266988e-06, + "loss": 0.1804, + "step": 15295 + }, + { + "epoch": 0.38707391755447024, + "grad_norm": 5.468057632446289, + "learning_rate": 6.827915352527481e-06, + "loss": 0.2582, + "step": 15296 + }, + { + "epoch": 0.3870992231191639, + "grad_norm": 3.7139925956726074, + "learning_rate": 6.827541618000088e-06, + "loss": 0.1459, + "step": 15297 + }, + { + "epoch": 0.3871245286838576, + "grad_norm": 7.981595039367676, + "learning_rate": 6.82716787168722e-06, + "loss": 0.2254, + "step": 15298 + }, + { + "epoch": 0.38714983424855126, + "grad_norm": 4.205367088317871, + "learning_rate": 6.826794113591286e-06, + "loss": 0.0923, + "step": 15299 + }, + { + "epoch": 0.3871751398132449, + "grad_norm": 17.195154190063477, + "learning_rate": 6.826420343714696e-06, + "loss": 0.241, + "step": 15300 + }, + { + "epoch": 0.38720044537793863, + "grad_norm": 7.106306552886963, + "learning_rate": 6.826046562059861e-06, + "loss": 0.2637, + "step": 15301 + }, + { + "epoch": 0.3872257509426323, + "grad_norm": 7.818571090698242, + "learning_rate": 6.825672768629191e-06, + "loss": 0.156, + "step": 15302 + }, + { + "epoch": 0.38725105650732594, + "grad_norm": 10.106225967407227, + "learning_rate": 6.8252989634250965e-06, + "loss": 0.3086, + "step": 15303 + }, + { + "epoch": 0.38727636207201965, + "grad_norm": 4.550287246704102, + "learning_rate": 6.82492514644999e-06, + "loss": 0.1229, + "step": 15304 + }, + { + "epoch": 0.3873016676367133, + "grad_norm": 5.764514923095703, + "learning_rate": 6.824551317706281e-06, + "loss": 0.1544, + "step": 15305 + }, + { + "epoch": 0.38732697320140697, + "grad_norm": 3.785491943359375, + "learning_rate": 6.824177477196379e-06, + "loss": 0.2006, + "step": 15306 + }, + { + "epoch": 0.3873522787661007, + "grad_norm": 2.331188917160034, + "learning_rate": 6.823803624922694e-06, + "loss": 0.1321, + "step": 15307 + }, + { + "epoch": 0.38737758433079433, + "grad_norm": 6.102585315704346, + "learning_rate": 6.823429760887642e-06, + "loss": 0.1747, + "step": 15308 + }, + { + "epoch": 0.38740288989548805, + "grad_norm": 3.917529344558716, + "learning_rate": 6.8230558850936275e-06, + "loss": 0.1627, + "step": 15309 + }, + { + "epoch": 0.3874281954601817, + "grad_norm": 6.038415431976318, + "learning_rate": 6.822681997543068e-06, + "loss": 0.1644, + "step": 15310 + }, + { + "epoch": 0.38745350102487536, + "grad_norm": 9.489595413208008, + "learning_rate": 6.822308098238368e-06, + "loss": 0.2165, + "step": 15311 + }, + { + "epoch": 0.38747880658956907, + "grad_norm": 12.175439834594727, + "learning_rate": 6.8219341871819455e-06, + "loss": 0.1936, + "step": 15312 + }, + { + "epoch": 0.3875041121542627, + "grad_norm": 3.3150691986083984, + "learning_rate": 6.821560264376206e-06, + "loss": 0.1553, + "step": 15313 + }, + { + "epoch": 0.3875294177189564, + "grad_norm": 4.061993598937988, + "learning_rate": 6.821186329823562e-06, + "loss": 0.1257, + "step": 15314 + }, + { + "epoch": 0.3875547232836501, + "grad_norm": 4.47136926651001, + "learning_rate": 6.82081238352643e-06, + "loss": 0.1853, + "step": 15315 + }, + { + "epoch": 0.38758002884834375, + "grad_norm": 3.6933035850524902, + "learning_rate": 6.820438425487215e-06, + "loss": 0.1783, + "step": 15316 + }, + { + "epoch": 0.3876053344130374, + "grad_norm": 3.6496567726135254, + "learning_rate": 6.820064455708332e-06, + "loss": 0.1654, + "step": 15317 + }, + { + "epoch": 0.3876306399777311, + "grad_norm": 7.614336013793945, + "learning_rate": 6.819690474192192e-06, + "loss": 0.1226, + "step": 15318 + }, + { + "epoch": 0.3876559455424248, + "grad_norm": 5.847638130187988, + "learning_rate": 6.819316480941205e-06, + "loss": 0.236, + "step": 15319 + }, + { + "epoch": 0.38768125110711843, + "grad_norm": 6.66862678527832, + "learning_rate": 6.818942475957786e-06, + "loss": 0.1457, + "step": 15320 + }, + { + "epoch": 0.38770655667181214, + "grad_norm": 12.050944328308105, + "learning_rate": 6.8185684592443456e-06, + "loss": 0.4672, + "step": 15321 + }, + { + "epoch": 0.3877318622365058, + "grad_norm": 2.818118095397949, + "learning_rate": 6.818194430803295e-06, + "loss": 0.1073, + "step": 15322 + }, + { + "epoch": 0.3877571678011995, + "grad_norm": 5.6082234382629395, + "learning_rate": 6.8178203906370465e-06, + "loss": 0.2457, + "step": 15323 + }, + { + "epoch": 0.38778247336589317, + "grad_norm": 7.678555488586426, + "learning_rate": 6.8174463387480126e-06, + "loss": 0.2966, + "step": 15324 + }, + { + "epoch": 0.3878077789305868, + "grad_norm": 6.291634559631348, + "learning_rate": 6.817072275138606e-06, + "loss": 0.1675, + "step": 15325 + }, + { + "epoch": 0.38783308449528053, + "grad_norm": 5.371745586395264, + "learning_rate": 6.816698199811237e-06, + "loss": 0.1746, + "step": 15326 + }, + { + "epoch": 0.3878583900599742, + "grad_norm": 6.341283321380615, + "learning_rate": 6.81632411276832e-06, + "loss": 0.2482, + "step": 15327 + }, + { + "epoch": 0.38788369562466785, + "grad_norm": 3.510112762451172, + "learning_rate": 6.815950014012266e-06, + "loss": 0.166, + "step": 15328 + }, + { + "epoch": 0.38790900118936156, + "grad_norm": 5.600684642791748, + "learning_rate": 6.815575903545488e-06, + "loss": 0.2682, + "step": 15329 + }, + { + "epoch": 0.3879343067540552, + "grad_norm": 7.609000205993652, + "learning_rate": 6.8152017813704e-06, + "loss": 0.2059, + "step": 15330 + }, + { + "epoch": 0.38795961231874887, + "grad_norm": 4.327236652374268, + "learning_rate": 6.814827647489413e-06, + "loss": 0.1457, + "step": 15331 + }, + { + "epoch": 0.3879849178834426, + "grad_norm": 3.4889683723449707, + "learning_rate": 6.81445350190494e-06, + "loss": 0.1657, + "step": 15332 + }, + { + "epoch": 0.38801022344813624, + "grad_norm": 6.300575256347656, + "learning_rate": 6.814079344619393e-06, + "loss": 0.1762, + "step": 15333 + }, + { + "epoch": 0.3880355290128299, + "grad_norm": 7.9866414070129395, + "learning_rate": 6.813705175635187e-06, + "loss": 0.1878, + "step": 15334 + }, + { + "epoch": 0.3880608345775236, + "grad_norm": 5.122518539428711, + "learning_rate": 6.813330994954732e-06, + "loss": 0.1882, + "step": 15335 + }, + { + "epoch": 0.38808614014221726, + "grad_norm": 5.567081928253174, + "learning_rate": 6.812956802580444e-06, + "loss": 0.223, + "step": 15336 + }, + { + "epoch": 0.388111445706911, + "grad_norm": 10.618758201599121, + "learning_rate": 6.8125825985147356e-06, + "loss": 0.1528, + "step": 15337 + }, + { + "epoch": 0.38813675127160463, + "grad_norm": 3.190983533859253, + "learning_rate": 6.812208382760017e-06, + "loss": 0.1105, + "step": 15338 + }, + { + "epoch": 0.3881620568362983, + "grad_norm": 5.8412628173828125, + "learning_rate": 6.811834155318706e-06, + "loss": 0.1539, + "step": 15339 + }, + { + "epoch": 0.388187362400992, + "grad_norm": 5.103126049041748, + "learning_rate": 6.811459916193211e-06, + "loss": 0.1731, + "step": 15340 + }, + { + "epoch": 0.38821266796568565, + "grad_norm": 4.725388050079346, + "learning_rate": 6.8110856653859504e-06, + "loss": 0.1769, + "step": 15341 + }, + { + "epoch": 0.3882379735303793, + "grad_norm": 8.889245986938477, + "learning_rate": 6.810711402899333e-06, + "loss": 0.197, + "step": 15342 + }, + { + "epoch": 0.388263279095073, + "grad_norm": 5.172481060028076, + "learning_rate": 6.8103371287357775e-06, + "loss": 0.2097, + "step": 15343 + }, + { + "epoch": 0.3882885846597667, + "grad_norm": 10.094427108764648, + "learning_rate": 6.809962842897693e-06, + "loss": 0.3339, + "step": 15344 + }, + { + "epoch": 0.38831389022446033, + "grad_norm": 14.74405288696289, + "learning_rate": 6.809588545387493e-06, + "loss": 0.3068, + "step": 15345 + }, + { + "epoch": 0.38833919578915405, + "grad_norm": 4.371132850646973, + "learning_rate": 6.8092142362075954e-06, + "loss": 0.2107, + "step": 15346 + }, + { + "epoch": 0.3883645013538477, + "grad_norm": 5.667905330657959, + "learning_rate": 6.8088399153604125e-06, + "loss": 0.191, + "step": 15347 + }, + { + "epoch": 0.3883898069185414, + "grad_norm": 14.76968765258789, + "learning_rate": 6.808465582848355e-06, + "loss": 0.1769, + "step": 15348 + }, + { + "epoch": 0.38841511248323507, + "grad_norm": 6.25742244720459, + "learning_rate": 6.808091238673841e-06, + "loss": 0.1959, + "step": 15349 + }, + { + "epoch": 0.3884404180479287, + "grad_norm": 3.6173157691955566, + "learning_rate": 6.807716882839282e-06, + "loss": 0.1815, + "step": 15350 + }, + { + "epoch": 0.38846572361262244, + "grad_norm": 7.696859359741211, + "learning_rate": 6.807342515347094e-06, + "loss": 0.1869, + "step": 15351 + }, + { + "epoch": 0.3884910291773161, + "grad_norm": 3.31744122505188, + "learning_rate": 6.8069681361996895e-06, + "loss": 0.1907, + "step": 15352 + }, + { + "epoch": 0.38851633474200975, + "grad_norm": 4.656799793243408, + "learning_rate": 6.8065937453994834e-06, + "loss": 0.1502, + "step": 15353 + }, + { + "epoch": 0.38854164030670346, + "grad_norm": 5.449112415313721, + "learning_rate": 6.806219342948892e-06, + "loss": 0.1651, + "step": 15354 + }, + { + "epoch": 0.3885669458713971, + "grad_norm": 4.443235874176025, + "learning_rate": 6.805844928850327e-06, + "loss": 0.2231, + "step": 15355 + }, + { + "epoch": 0.3885922514360908, + "grad_norm": 24.61186408996582, + "learning_rate": 6.805470503106203e-06, + "loss": 0.3224, + "step": 15356 + }, + { + "epoch": 0.3886175570007845, + "grad_norm": 4.325736045837402, + "learning_rate": 6.805096065718937e-06, + "loss": 0.2335, + "step": 15357 + }, + { + "epoch": 0.38864286256547814, + "grad_norm": 13.2725191116333, + "learning_rate": 6.80472161669094e-06, + "loss": 0.1844, + "step": 15358 + }, + { + "epoch": 0.3886681681301718, + "grad_norm": 5.046485900878906, + "learning_rate": 6.804347156024631e-06, + "loss": 0.1756, + "step": 15359 + }, + { + "epoch": 0.3886934736948655, + "grad_norm": 4.796178817749023, + "learning_rate": 6.803972683722423e-06, + "loss": 0.1249, + "step": 15360 + }, + { + "epoch": 0.38871877925955917, + "grad_norm": 5.32880163192749, + "learning_rate": 6.8035981997867304e-06, + "loss": 0.1644, + "step": 15361 + }, + { + "epoch": 0.3887440848242529, + "grad_norm": 3.730945348739624, + "learning_rate": 6.803223704219968e-06, + "loss": 0.1146, + "step": 15362 + }, + { + "epoch": 0.38876939038894653, + "grad_norm": 6.937705993652344, + "learning_rate": 6.802849197024553e-06, + "loss": 0.2365, + "step": 15363 + }, + { + "epoch": 0.3887946959536402, + "grad_norm": 4.225322246551514, + "learning_rate": 6.802474678202896e-06, + "loss": 0.1155, + "step": 15364 + }, + { + "epoch": 0.3888200015183339, + "grad_norm": 4.70347261428833, + "learning_rate": 6.802100147757416e-06, + "loss": 0.197, + "step": 15365 + }, + { + "epoch": 0.38884530708302756, + "grad_norm": 3.198179244995117, + "learning_rate": 6.8017256056905276e-06, + "loss": 0.1177, + "step": 15366 + }, + { + "epoch": 0.3888706126477212, + "grad_norm": 2.922268867492676, + "learning_rate": 6.8013510520046465e-06, + "loss": 0.1872, + "step": 15367 + }, + { + "epoch": 0.3888959182124149, + "grad_norm": 3.8800048828125, + "learning_rate": 6.800976486702188e-06, + "loss": 0.0998, + "step": 15368 + }, + { + "epoch": 0.3889212237771086, + "grad_norm": 3.607003927230835, + "learning_rate": 6.800601909785566e-06, + "loss": 0.1422, + "step": 15369 + }, + { + "epoch": 0.38894652934180224, + "grad_norm": 6.752297401428223, + "learning_rate": 6.800227321257198e-06, + "loss": 0.1626, + "step": 15370 + }, + { + "epoch": 0.38897183490649595, + "grad_norm": 10.922398567199707, + "learning_rate": 6.7998527211194975e-06, + "loss": 0.1472, + "step": 15371 + }, + { + "epoch": 0.3889971404711896, + "grad_norm": 6.109681129455566, + "learning_rate": 6.799478109374884e-06, + "loss": 0.2339, + "step": 15372 + }, + { + "epoch": 0.3890224460358833, + "grad_norm": 5.337420463562012, + "learning_rate": 6.7991034860257684e-06, + "loss": 0.1676, + "step": 15373 + }, + { + "epoch": 0.389047751600577, + "grad_norm": 3.6615259647369385, + "learning_rate": 6.798728851074571e-06, + "loss": 0.1068, + "step": 15374 + }, + { + "epoch": 0.38907305716527063, + "grad_norm": 6.137481212615967, + "learning_rate": 6.798354204523705e-06, + "loss": 0.1885, + "step": 15375 + }, + { + "epoch": 0.38909836272996434, + "grad_norm": 3.126514434814453, + "learning_rate": 6.797979546375587e-06, + "loss": 0.0834, + "step": 15376 + }, + { + "epoch": 0.389123668294658, + "grad_norm": 5.898769378662109, + "learning_rate": 6.797604876632633e-06, + "loss": 0.2379, + "step": 15377 + }, + { + "epoch": 0.38914897385935165, + "grad_norm": 6.431203842163086, + "learning_rate": 6.797230195297262e-06, + "loss": 0.2355, + "step": 15378 + }, + { + "epoch": 0.38917427942404537, + "grad_norm": 3.764113664627075, + "learning_rate": 6.796855502371886e-06, + "loss": 0.1882, + "step": 15379 + }, + { + "epoch": 0.389199584988739, + "grad_norm": 6.854066371917725, + "learning_rate": 6.796480797858922e-06, + "loss": 0.235, + "step": 15380 + }, + { + "epoch": 0.3892248905534327, + "grad_norm": 4.350749969482422, + "learning_rate": 6.796106081760789e-06, + "loss": 0.1747, + "step": 15381 + }, + { + "epoch": 0.3892501961181264, + "grad_norm": 11.092860221862793, + "learning_rate": 6.795731354079902e-06, + "loss": 0.2468, + "step": 15382 + }, + { + "epoch": 0.38927550168282005, + "grad_norm": 19.88367462158203, + "learning_rate": 6.795356614818677e-06, + "loss": 0.2241, + "step": 15383 + }, + { + "epoch": 0.3893008072475137, + "grad_norm": 3.4636666774749756, + "learning_rate": 6.794981863979531e-06, + "loss": 0.1594, + "step": 15384 + }, + { + "epoch": 0.3893261128122074, + "grad_norm": 4.371118068695068, + "learning_rate": 6.7946071015648816e-06, + "loss": 0.1574, + "step": 15385 + }, + { + "epoch": 0.38935141837690107, + "grad_norm": 4.468760967254639, + "learning_rate": 6.794232327577144e-06, + "loss": 0.1828, + "step": 15386 + }, + { + "epoch": 0.3893767239415948, + "grad_norm": 7.56680965423584, + "learning_rate": 6.793857542018736e-06, + "loss": 0.2383, + "step": 15387 + }, + { + "epoch": 0.38940202950628844, + "grad_norm": 10.33316707611084, + "learning_rate": 6.793482744892075e-06, + "loss": 0.3748, + "step": 15388 + }, + { + "epoch": 0.3894273350709821, + "grad_norm": 13.386212348937988, + "learning_rate": 6.7931079361995764e-06, + "loss": 0.3491, + "step": 15389 + }, + { + "epoch": 0.3894526406356758, + "grad_norm": 2.9589686393737793, + "learning_rate": 6.792733115943659e-06, + "loss": 0.1571, + "step": 15390 + }, + { + "epoch": 0.38947794620036946, + "grad_norm": 9.781989097595215, + "learning_rate": 6.792358284126738e-06, + "loss": 0.15, + "step": 15391 + }, + { + "epoch": 0.3895032517650631, + "grad_norm": 17.190401077270508, + "learning_rate": 6.791983440751234e-06, + "loss": 0.1651, + "step": 15392 + }, + { + "epoch": 0.38952855732975683, + "grad_norm": 9.242436408996582, + "learning_rate": 6.79160858581956e-06, + "loss": 0.1635, + "step": 15393 + }, + { + "epoch": 0.3895538628944505, + "grad_norm": 6.188953399658203, + "learning_rate": 6.791233719334137e-06, + "loss": 0.183, + "step": 15394 + }, + { + "epoch": 0.38957916845914414, + "grad_norm": 4.619524955749512, + "learning_rate": 6.790858841297381e-06, + "loss": 0.1062, + "step": 15395 + }, + { + "epoch": 0.38960447402383785, + "grad_norm": 6.643299579620361, + "learning_rate": 6.790483951711708e-06, + "loss": 0.2629, + "step": 15396 + }, + { + "epoch": 0.3896297795885315, + "grad_norm": 3.6959762573242188, + "learning_rate": 6.790109050579538e-06, + "loss": 0.1615, + "step": 15397 + }, + { + "epoch": 0.38965508515322517, + "grad_norm": 11.739465713500977, + "learning_rate": 6.789734137903289e-06, + "loss": 0.1991, + "step": 15398 + }, + { + "epoch": 0.3896803907179189, + "grad_norm": 8.325201988220215, + "learning_rate": 6.789359213685375e-06, + "loss": 0.1433, + "step": 15399 + }, + { + "epoch": 0.38970569628261253, + "grad_norm": 8.993839263916016, + "learning_rate": 6.788984277928217e-06, + "loss": 0.174, + "step": 15400 + }, + { + "epoch": 0.38973100184730625, + "grad_norm": 3.273559093475342, + "learning_rate": 6.788609330634232e-06, + "loss": 0.1695, + "step": 15401 + }, + { + "epoch": 0.3897563074119999, + "grad_norm": 6.633016109466553, + "learning_rate": 6.78823437180584e-06, + "loss": 0.1929, + "step": 15402 + }, + { + "epoch": 0.38978161297669356, + "grad_norm": 10.861433982849121, + "learning_rate": 6.787859401445456e-06, + "loss": 0.2021, + "step": 15403 + }, + { + "epoch": 0.38980691854138727, + "grad_norm": 4.177833557128906, + "learning_rate": 6.787484419555499e-06, + "loss": 0.144, + "step": 15404 + }, + { + "epoch": 0.3898322241060809, + "grad_norm": 3.896606922149658, + "learning_rate": 6.787109426138387e-06, + "loss": 0.1968, + "step": 15405 + }, + { + "epoch": 0.3898575296707746, + "grad_norm": 4.457096576690674, + "learning_rate": 6.786734421196539e-06, + "loss": 0.2245, + "step": 15406 + }, + { + "epoch": 0.3898828352354683, + "grad_norm": 11.289063453674316, + "learning_rate": 6.786359404732375e-06, + "loss": 0.3768, + "step": 15407 + }, + { + "epoch": 0.38990814080016195, + "grad_norm": 7.833276271820068, + "learning_rate": 6.78598437674831e-06, + "loss": 0.1718, + "step": 15408 + }, + { + "epoch": 0.3899334463648556, + "grad_norm": 11.355215072631836, + "learning_rate": 6.785609337246764e-06, + "loss": 0.1865, + "step": 15409 + }, + { + "epoch": 0.3899587519295493, + "grad_norm": 2.506875514984131, + "learning_rate": 6.785234286230156e-06, + "loss": 0.1108, + "step": 15410 + }, + { + "epoch": 0.389984057494243, + "grad_norm": 4.178235054016113, + "learning_rate": 6.784859223700903e-06, + "loss": 0.2028, + "step": 15411 + }, + { + "epoch": 0.3900093630589367, + "grad_norm": 4.733356475830078, + "learning_rate": 6.784484149661425e-06, + "loss": 0.1579, + "step": 15412 + }, + { + "epoch": 0.39003466862363034, + "grad_norm": 6.222299098968506, + "learning_rate": 6.784109064114142e-06, + "loss": 0.2277, + "step": 15413 + }, + { + "epoch": 0.390059974188324, + "grad_norm": 4.4459028244018555, + "learning_rate": 6.78373396706147e-06, + "loss": 0.196, + "step": 15414 + }, + { + "epoch": 0.3900852797530177, + "grad_norm": 3.230461597442627, + "learning_rate": 6.78335885850583e-06, + "loss": 0.1441, + "step": 15415 + }, + { + "epoch": 0.39011058531771137, + "grad_norm": 16.492517471313477, + "learning_rate": 6.782983738449641e-06, + "loss": 0.2956, + "step": 15416 + }, + { + "epoch": 0.390135890882405, + "grad_norm": 8.634200096130371, + "learning_rate": 6.78260860689532e-06, + "loss": 0.1898, + "step": 15417 + }, + { + "epoch": 0.39016119644709873, + "grad_norm": 5.326013088226318, + "learning_rate": 6.782233463845291e-06, + "loss": 0.2068, + "step": 15418 + }, + { + "epoch": 0.3901865020117924, + "grad_norm": 6.506650447845459, + "learning_rate": 6.781858309301966e-06, + "loss": 0.2223, + "step": 15419 + }, + { + "epoch": 0.39021180757648605, + "grad_norm": 9.05278491973877, + "learning_rate": 6.78148314326777e-06, + "loss": 0.3113, + "step": 15420 + }, + { + "epoch": 0.39023711314117976, + "grad_norm": 14.517824172973633, + "learning_rate": 6.781107965745121e-06, + "loss": 0.3347, + "step": 15421 + }, + { + "epoch": 0.3902624187058734, + "grad_norm": 6.828853130340576, + "learning_rate": 6.780732776736436e-06, + "loss": 0.2542, + "step": 15422 + }, + { + "epoch": 0.39028772427056707, + "grad_norm": 6.002462387084961, + "learning_rate": 6.780357576244138e-06, + "loss": 0.1574, + "step": 15423 + }, + { + "epoch": 0.3903130298352608, + "grad_norm": 4.6500372886657715, + "learning_rate": 6.779982364270645e-06, + "loss": 0.1846, + "step": 15424 + }, + { + "epoch": 0.39033833539995444, + "grad_norm": 5.04671573638916, + "learning_rate": 6.779607140818378e-06, + "loss": 0.1767, + "step": 15425 + }, + { + "epoch": 0.39036364096464815, + "grad_norm": 7.086812973022461, + "learning_rate": 6.779231905889755e-06, + "loss": 0.2685, + "step": 15426 + }, + { + "epoch": 0.3903889465293418, + "grad_norm": 4.918341636657715, + "learning_rate": 6.7788566594871964e-06, + "loss": 0.1621, + "step": 15427 + }, + { + "epoch": 0.39041425209403546, + "grad_norm": 6.801792621612549, + "learning_rate": 6.778481401613122e-06, + "loss": 0.1624, + "step": 15428 + }, + { + "epoch": 0.3904395576587292, + "grad_norm": 4.745988845825195, + "learning_rate": 6.778106132269951e-06, + "loss": 0.1646, + "step": 15429 + }, + { + "epoch": 0.39046486322342283, + "grad_norm": 3.090423583984375, + "learning_rate": 6.777730851460105e-06, + "loss": 0.2035, + "step": 15430 + }, + { + "epoch": 0.3904901687881165, + "grad_norm": 5.0616044998168945, + "learning_rate": 6.777355559186003e-06, + "loss": 0.1504, + "step": 15431 + }, + { + "epoch": 0.3905154743528102, + "grad_norm": 2.2499966621398926, + "learning_rate": 6.7769802554500664e-06, + "loss": 0.1129, + "step": 15432 + }, + { + "epoch": 0.39054077991750386, + "grad_norm": 4.328911781311035, + "learning_rate": 6.776604940254714e-06, + "loss": 0.2548, + "step": 15433 + }, + { + "epoch": 0.3905660854821975, + "grad_norm": 18.632272720336914, + "learning_rate": 6.776229613602368e-06, + "loss": 0.1966, + "step": 15434 + }, + { + "epoch": 0.3905913910468912, + "grad_norm": 5.515592575073242, + "learning_rate": 6.775854275495446e-06, + "loss": 0.218, + "step": 15435 + }, + { + "epoch": 0.3906166966115849, + "grad_norm": 6.334281921386719, + "learning_rate": 6.775478925936371e-06, + "loss": 0.159, + "step": 15436 + }, + { + "epoch": 0.3906420021762786, + "grad_norm": 5.420837879180908, + "learning_rate": 6.775103564927563e-06, + "loss": 0.1432, + "step": 15437 + }, + { + "epoch": 0.39066730774097225, + "grad_norm": 5.235005855560303, + "learning_rate": 6.77472819247144e-06, + "loss": 0.2595, + "step": 15438 + }, + { + "epoch": 0.3906926133056659, + "grad_norm": 5.651984214782715, + "learning_rate": 6.774352808570428e-06, + "loss": 0.161, + "step": 15439 + }, + { + "epoch": 0.3907179188703596, + "grad_norm": 4.6031904220581055, + "learning_rate": 6.773977413226944e-06, + "loss": 0.2154, + "step": 15440 + }, + { + "epoch": 0.39074322443505327, + "grad_norm": 4.5501227378845215, + "learning_rate": 6.77360200644341e-06, + "loss": 0.1411, + "step": 15441 + }, + { + "epoch": 0.3907685299997469, + "grad_norm": 1.9562828540802002, + "learning_rate": 6.773226588222245e-06, + "loss": 0.1233, + "step": 15442 + }, + { + "epoch": 0.39079383556444064, + "grad_norm": 2.420133352279663, + "learning_rate": 6.7728511585658714e-06, + "loss": 0.168, + "step": 15443 + }, + { + "epoch": 0.3908191411291343, + "grad_norm": 2.5831432342529297, + "learning_rate": 6.772475717476712e-06, + "loss": 0.1096, + "step": 15444 + }, + { + "epoch": 0.39084444669382795, + "grad_norm": 5.218894004821777, + "learning_rate": 6.772100264957186e-06, + "loss": 0.1834, + "step": 15445 + }, + { + "epoch": 0.39086975225852166, + "grad_norm": 8.0121431350708, + "learning_rate": 6.771724801009715e-06, + "loss": 0.1907, + "step": 15446 + }, + { + "epoch": 0.3908950578232153, + "grad_norm": 4.892730236053467, + "learning_rate": 6.7713493256367206e-06, + "loss": 0.2426, + "step": 15447 + }, + { + "epoch": 0.390920363387909, + "grad_norm": 3.28442645072937, + "learning_rate": 6.770973838840623e-06, + "loss": 0.1776, + "step": 15448 + }, + { + "epoch": 0.3909456689526027, + "grad_norm": 3.1828083992004395, + "learning_rate": 6.770598340623845e-06, + "loss": 0.1931, + "step": 15449 + }, + { + "epoch": 0.39097097451729634, + "grad_norm": 2.522033214569092, + "learning_rate": 6.770222830988808e-06, + "loss": 0.1122, + "step": 15450 + }, + { + "epoch": 0.39099628008199006, + "grad_norm": 3.5432868003845215, + "learning_rate": 6.7698473099379335e-06, + "loss": 0.164, + "step": 15451 + }, + { + "epoch": 0.3910215856466837, + "grad_norm": 3.451371908187866, + "learning_rate": 6.769471777473642e-06, + "loss": 0.1958, + "step": 15452 + }, + { + "epoch": 0.39104689121137737, + "grad_norm": 3.965008020401001, + "learning_rate": 6.769096233598355e-06, + "loss": 0.2172, + "step": 15453 + }, + { + "epoch": 0.3910721967760711, + "grad_norm": 3.3532190322875977, + "learning_rate": 6.768720678314497e-06, + "loss": 0.1787, + "step": 15454 + }, + { + "epoch": 0.39109750234076474, + "grad_norm": 3.7123265266418457, + "learning_rate": 6.768345111624489e-06, + "loss": 0.1415, + "step": 15455 + }, + { + "epoch": 0.3911228079054584, + "grad_norm": 7.025033950805664, + "learning_rate": 6.7679695335307514e-06, + "loss": 0.2275, + "step": 15456 + }, + { + "epoch": 0.3911481134701521, + "grad_norm": 3.3403050899505615, + "learning_rate": 6.767593944035708e-06, + "loss": 0.1543, + "step": 15457 + }, + { + "epoch": 0.39117341903484576, + "grad_norm": 2.942775011062622, + "learning_rate": 6.767218343141779e-06, + "loss": 0.1601, + "step": 15458 + }, + { + "epoch": 0.3911987245995394, + "grad_norm": 3.8929989337921143, + "learning_rate": 6.766842730851388e-06, + "loss": 0.1415, + "step": 15459 + }, + { + "epoch": 0.3912240301642331, + "grad_norm": 7.398610591888428, + "learning_rate": 6.7664671071669575e-06, + "loss": 0.2764, + "step": 15460 + }, + { + "epoch": 0.3912493357289268, + "grad_norm": 4.071935176849365, + "learning_rate": 6.766091472090908e-06, + "loss": 0.1515, + "step": 15461 + }, + { + "epoch": 0.39127464129362044, + "grad_norm": 4.4712748527526855, + "learning_rate": 6.765715825625665e-06, + "loss": 0.1602, + "step": 15462 + }, + { + "epoch": 0.39129994685831415, + "grad_norm": 5.737335205078125, + "learning_rate": 6.765340167773646e-06, + "loss": 0.1823, + "step": 15463 + }, + { + "epoch": 0.3913252524230078, + "grad_norm": 6.091115951538086, + "learning_rate": 6.76496449853728e-06, + "loss": 0.1231, + "step": 15464 + }, + { + "epoch": 0.3913505579877015, + "grad_norm": 3.0582456588745117, + "learning_rate": 6.764588817918984e-06, + "loss": 0.1207, + "step": 15465 + }, + { + "epoch": 0.3913758635523952, + "grad_norm": 8.132124900817871, + "learning_rate": 6.764213125921184e-06, + "loss": 0.2036, + "step": 15466 + }, + { + "epoch": 0.39140116911708883, + "grad_norm": 3.186397075653076, + "learning_rate": 6.763837422546301e-06, + "loss": 0.1627, + "step": 15467 + }, + { + "epoch": 0.39142647468178254, + "grad_norm": 6.100031852722168, + "learning_rate": 6.7634617077967605e-06, + "loss": 0.2012, + "step": 15468 + }, + { + "epoch": 0.3914517802464762, + "grad_norm": 7.201234817504883, + "learning_rate": 6.7630859816749815e-06, + "loss": 0.2336, + "step": 15469 + }, + { + "epoch": 0.39147708581116986, + "grad_norm": 4.298344135284424, + "learning_rate": 6.762710244183389e-06, + "loss": 0.1642, + "step": 15470 + }, + { + "epoch": 0.39150239137586357, + "grad_norm": 5.663259029388428, + "learning_rate": 6.762334495324408e-06, + "loss": 0.1239, + "step": 15471 + }, + { + "epoch": 0.3915276969405572, + "grad_norm": 3.9966816902160645, + "learning_rate": 6.761958735100457e-06, + "loss": 0.1461, + "step": 15472 + }, + { + "epoch": 0.3915530025052509, + "grad_norm": 5.0460004806518555, + "learning_rate": 6.761582963513962e-06, + "loss": 0.1888, + "step": 15473 + }, + { + "epoch": 0.3915783080699446, + "grad_norm": 5.079953670501709, + "learning_rate": 6.7612071805673486e-06, + "loss": 0.1864, + "step": 15474 + }, + { + "epoch": 0.39160361363463825, + "grad_norm": 10.639154434204102, + "learning_rate": 6.760831386263036e-06, + "loss": 0.2611, + "step": 15475 + }, + { + "epoch": 0.39162891919933196, + "grad_norm": 7.514072895050049, + "learning_rate": 6.760455580603449e-06, + "loss": 0.2382, + "step": 15476 + }, + { + "epoch": 0.3916542247640256, + "grad_norm": 3.313251256942749, + "learning_rate": 6.760079763591012e-06, + "loss": 0.1792, + "step": 15477 + }, + { + "epoch": 0.39167953032871927, + "grad_norm": 5.6676740646362305, + "learning_rate": 6.759703935228148e-06, + "loss": 0.2352, + "step": 15478 + }, + { + "epoch": 0.391704835893413, + "grad_norm": 3.19644832611084, + "learning_rate": 6.75932809551728e-06, + "loss": 0.1321, + "step": 15479 + }, + { + "epoch": 0.39173014145810664, + "grad_norm": 7.46331787109375, + "learning_rate": 6.758952244460835e-06, + "loss": 0.1124, + "step": 15480 + }, + { + "epoch": 0.3917554470228003, + "grad_norm": 8.340005874633789, + "learning_rate": 6.75857638206123e-06, + "loss": 0.3272, + "step": 15481 + }, + { + "epoch": 0.391780752587494, + "grad_norm": 5.274707317352295, + "learning_rate": 6.758200508320896e-06, + "loss": 0.1632, + "step": 15482 + }, + { + "epoch": 0.39180605815218766, + "grad_norm": 4.014041423797607, + "learning_rate": 6.757824623242253e-06, + "loss": 0.1876, + "step": 15483 + }, + { + "epoch": 0.3918313637168813, + "grad_norm": 4.226926326751709, + "learning_rate": 6.7574487268277255e-06, + "loss": 0.1251, + "step": 15484 + }, + { + "epoch": 0.39185666928157503, + "grad_norm": 4.254776477813721, + "learning_rate": 6.7570728190797395e-06, + "loss": 0.1406, + "step": 15485 + }, + { + "epoch": 0.3918819748462687, + "grad_norm": 12.658411979675293, + "learning_rate": 6.756696900000717e-06, + "loss": 0.1443, + "step": 15486 + }, + { + "epoch": 0.39190728041096234, + "grad_norm": 14.111344337463379, + "learning_rate": 6.756320969593083e-06, + "loss": 0.1402, + "step": 15487 + }, + { + "epoch": 0.39193258597565606, + "grad_norm": 4.742029190063477, + "learning_rate": 6.755945027859261e-06, + "loss": 0.122, + "step": 15488 + }, + { + "epoch": 0.3919578915403497, + "grad_norm": 5.963810443878174, + "learning_rate": 6.755569074801678e-06, + "loss": 0.234, + "step": 15489 + }, + { + "epoch": 0.3919831971050434, + "grad_norm": 5.856215476989746, + "learning_rate": 6.755193110422755e-06, + "loss": 0.1837, + "step": 15490 + }, + { + "epoch": 0.3920085026697371, + "grad_norm": 8.912259101867676, + "learning_rate": 6.754817134724919e-06, + "loss": 0.1919, + "step": 15491 + }, + { + "epoch": 0.39203380823443074, + "grad_norm": 6.647262096405029, + "learning_rate": 6.754441147710593e-06, + "loss": 0.2269, + "step": 15492 + }, + { + "epoch": 0.39205911379912445, + "grad_norm": 2.431948661804199, + "learning_rate": 6.754065149382203e-06, + "loss": 0.0931, + "step": 15493 + }, + { + "epoch": 0.3920844193638181, + "grad_norm": 6.601036548614502, + "learning_rate": 6.753689139742172e-06, + "loss": 0.1845, + "step": 15494 + }, + { + "epoch": 0.39210972492851176, + "grad_norm": 6.4015913009643555, + "learning_rate": 6.753313118792928e-06, + "loss": 0.135, + "step": 15495 + }, + { + "epoch": 0.39213503049320547, + "grad_norm": 8.378345489501953, + "learning_rate": 6.752937086536891e-06, + "loss": 0.2245, + "step": 15496 + }, + { + "epoch": 0.39216033605789913, + "grad_norm": 6.280643939971924, + "learning_rate": 6.752561042976491e-06, + "loss": 0.1836, + "step": 15497 + }, + { + "epoch": 0.3921856416225928, + "grad_norm": 5.237199783325195, + "learning_rate": 6.752184988114151e-06, + "loss": 0.1986, + "step": 15498 + }, + { + "epoch": 0.3922109471872865, + "grad_norm": 4.047425746917725, + "learning_rate": 6.751808921952295e-06, + "loss": 0.0725, + "step": 15499 + }, + { + "epoch": 0.39223625275198015, + "grad_norm": 9.142860412597656, + "learning_rate": 6.75143284449335e-06, + "loss": 0.1793, + "step": 15500 + }, + { + "epoch": 0.39226155831667386, + "grad_norm": 4.270633220672607, + "learning_rate": 6.751056755739739e-06, + "loss": 0.137, + "step": 15501 + }, + { + "epoch": 0.3922868638813675, + "grad_norm": 14.439434051513672, + "learning_rate": 6.75068065569389e-06, + "loss": 0.3085, + "step": 15502 + }, + { + "epoch": 0.3923121694460612, + "grad_norm": 5.327470779418945, + "learning_rate": 6.750304544358226e-06, + "loss": 0.173, + "step": 15503 + }, + { + "epoch": 0.3923374750107549, + "grad_norm": 3.1598947048187256, + "learning_rate": 6.749928421735175e-06, + "loss": 0.1312, + "step": 15504 + }, + { + "epoch": 0.39236278057544854, + "grad_norm": 9.657624244689941, + "learning_rate": 6.749552287827159e-06, + "loss": 0.3087, + "step": 15505 + }, + { + "epoch": 0.3923880861401422, + "grad_norm": 2.774827003479004, + "learning_rate": 6.749176142636607e-06, + "loss": 0.1254, + "step": 15506 + }, + { + "epoch": 0.3924133917048359, + "grad_norm": 27.622827529907227, + "learning_rate": 6.748799986165943e-06, + "loss": 0.1852, + "step": 15507 + }, + { + "epoch": 0.39243869726952957, + "grad_norm": 9.799884796142578, + "learning_rate": 6.7484238184175934e-06, + "loss": 0.1823, + "step": 15508 + }, + { + "epoch": 0.3924640028342232, + "grad_norm": 4.027931213378906, + "learning_rate": 6.748047639393984e-06, + "loss": 0.1782, + "step": 15509 + }, + { + "epoch": 0.39248930839891694, + "grad_norm": 5.558581829071045, + "learning_rate": 6.747671449097542e-06, + "loss": 0.2078, + "step": 15510 + }, + { + "epoch": 0.3925146139636106, + "grad_norm": 5.535958766937256, + "learning_rate": 6.74729524753069e-06, + "loss": 0.2052, + "step": 15511 + }, + { + "epoch": 0.39253991952830425, + "grad_norm": 6.494235515594482, + "learning_rate": 6.746919034695856e-06, + "loss": 0.2026, + "step": 15512 + }, + { + "epoch": 0.39256522509299796, + "grad_norm": 3.862103223800659, + "learning_rate": 6.7465428105954665e-06, + "loss": 0.1553, + "step": 15513 + }, + { + "epoch": 0.3925905306576916, + "grad_norm": 5.780959129333496, + "learning_rate": 6.7461665752319476e-06, + "loss": 0.1957, + "step": 15514 + }, + { + "epoch": 0.39261583622238533, + "grad_norm": 6.907562255859375, + "learning_rate": 6.745790328607725e-06, + "loss": 0.1469, + "step": 15515 + }, + { + "epoch": 0.392641141787079, + "grad_norm": 7.10875940322876, + "learning_rate": 6.745414070725225e-06, + "loss": 0.1108, + "step": 15516 + }, + { + "epoch": 0.39266644735177264, + "grad_norm": 6.711361885070801, + "learning_rate": 6.7450378015868736e-06, + "loss": 0.1187, + "step": 15517 + }, + { + "epoch": 0.39269175291646635, + "grad_norm": 7.411181449890137, + "learning_rate": 6.744661521195099e-06, + "loss": 0.2035, + "step": 15518 + }, + { + "epoch": 0.39271705848116, + "grad_norm": 9.783740043640137, + "learning_rate": 6.744285229552328e-06, + "loss": 0.1572, + "step": 15519 + }, + { + "epoch": 0.39274236404585366, + "grad_norm": 3.5843231678009033, + "learning_rate": 6.743908926660985e-06, + "loss": 0.1523, + "step": 15520 + }, + { + "epoch": 0.3927676696105474, + "grad_norm": 9.351107597351074, + "learning_rate": 6.743532612523497e-06, + "loss": 0.2629, + "step": 15521 + }, + { + "epoch": 0.39279297517524103, + "grad_norm": 10.602156639099121, + "learning_rate": 6.743156287142292e-06, + "loss": 0.2495, + "step": 15522 + }, + { + "epoch": 0.3928182807399347, + "grad_norm": 9.832101821899414, + "learning_rate": 6.742779950519796e-06, + "loss": 0.2284, + "step": 15523 + }, + { + "epoch": 0.3928435863046284, + "grad_norm": 4.338719367980957, + "learning_rate": 6.742403602658437e-06, + "loss": 0.1613, + "step": 15524 + }, + { + "epoch": 0.39286889186932206, + "grad_norm": 12.470149040222168, + "learning_rate": 6.742027243560642e-06, + "loss": 0.0995, + "step": 15525 + }, + { + "epoch": 0.3928941974340157, + "grad_norm": 8.067164421081543, + "learning_rate": 6.7416508732288354e-06, + "loss": 0.2535, + "step": 15526 + }, + { + "epoch": 0.3929195029987094, + "grad_norm": 4.55942964553833, + "learning_rate": 6.741274491665449e-06, + "loss": 0.2001, + "step": 15527 + }, + { + "epoch": 0.3929448085634031, + "grad_norm": 4.648953914642334, + "learning_rate": 6.740898098872904e-06, + "loss": 0.1702, + "step": 15528 + }, + { + "epoch": 0.3929701141280968, + "grad_norm": 5.122621059417725, + "learning_rate": 6.740521694853632e-06, + "loss": 0.1632, + "step": 15529 + }, + { + "epoch": 0.39299541969279045, + "grad_norm": 6.901252746582031, + "learning_rate": 6.74014527961006e-06, + "loss": 0.219, + "step": 15530 + }, + { + "epoch": 0.3930207252574841, + "grad_norm": 8.402234077453613, + "learning_rate": 6.739768853144615e-06, + "loss": 0.1204, + "step": 15531 + }, + { + "epoch": 0.3930460308221778, + "grad_norm": 3.3703455924987793, + "learning_rate": 6.739392415459723e-06, + "loss": 0.1518, + "step": 15532 + }, + { + "epoch": 0.3930713363868715, + "grad_norm": 7.965439796447754, + "learning_rate": 6.739015966557815e-06, + "loss": 0.2013, + "step": 15533 + }, + { + "epoch": 0.39309664195156513, + "grad_norm": 9.29715347290039, + "learning_rate": 6.7386395064413144e-06, + "loss": 0.1767, + "step": 15534 + }, + { + "epoch": 0.39312194751625884, + "grad_norm": 8.04635238647461, + "learning_rate": 6.738263035112653e-06, + "loss": 0.2149, + "step": 15535 + }, + { + "epoch": 0.3931472530809525, + "grad_norm": 4.348053932189941, + "learning_rate": 6.737886552574255e-06, + "loss": 0.2526, + "step": 15536 + }, + { + "epoch": 0.39317255864564615, + "grad_norm": 4.636786460876465, + "learning_rate": 6.73751005882855e-06, + "loss": 0.1241, + "step": 15537 + }, + { + "epoch": 0.39319786421033986, + "grad_norm": 3.3267829418182373, + "learning_rate": 6.737133553877967e-06, + "loss": 0.1666, + "step": 15538 + }, + { + "epoch": 0.3932231697750335, + "grad_norm": 7.593628883361816, + "learning_rate": 6.736757037724932e-06, + "loss": 0.1934, + "step": 15539 + }, + { + "epoch": 0.39324847533972723, + "grad_norm": 3.0212254524230957, + "learning_rate": 6.736380510371874e-06, + "loss": 0.1209, + "step": 15540 + }, + { + "epoch": 0.3932737809044209, + "grad_norm": 8.876128196716309, + "learning_rate": 6.736003971821221e-06, + "loss": 0.208, + "step": 15541 + }, + { + "epoch": 0.39329908646911454, + "grad_norm": 2.9879517555236816, + "learning_rate": 6.735627422075401e-06, + "loss": 0.1351, + "step": 15542 + }, + { + "epoch": 0.39332439203380826, + "grad_norm": 3.304511785507202, + "learning_rate": 6.735250861136844e-06, + "loss": 0.18, + "step": 15543 + }, + { + "epoch": 0.3933496975985019, + "grad_norm": 4.881955146789551, + "learning_rate": 6.7348742890079756e-06, + "loss": 0.1821, + "step": 15544 + }, + { + "epoch": 0.39337500316319557, + "grad_norm": 6.810164451599121, + "learning_rate": 6.7344977056912256e-06, + "loss": 0.2334, + "step": 15545 + }, + { + "epoch": 0.3934003087278893, + "grad_norm": 4.741627216339111, + "learning_rate": 6.7341211111890235e-06, + "loss": 0.2509, + "step": 15546 + }, + { + "epoch": 0.39342561429258294, + "grad_norm": 4.439504623413086, + "learning_rate": 6.733744505503796e-06, + "loss": 0.1478, + "step": 15547 + }, + { + "epoch": 0.3934509198572766, + "grad_norm": 5.50680685043335, + "learning_rate": 6.733367888637973e-06, + "loss": 0.1317, + "step": 15548 + }, + { + "epoch": 0.3934762254219703, + "grad_norm": 10.2858247756958, + "learning_rate": 6.7329912605939825e-06, + "loss": 0.2592, + "step": 15549 + }, + { + "epoch": 0.39350153098666396, + "grad_norm": 3.754167079925537, + "learning_rate": 6.732614621374254e-06, + "loss": 0.0891, + "step": 15550 + }, + { + "epoch": 0.3935268365513576, + "grad_norm": 11.93274211883545, + "learning_rate": 6.7322379709812165e-06, + "loss": 0.4119, + "step": 15551 + }, + { + "epoch": 0.39355214211605133, + "grad_norm": 10.280088424682617, + "learning_rate": 6.731861309417299e-06, + "loss": 0.2752, + "step": 15552 + }, + { + "epoch": 0.393577447680745, + "grad_norm": 5.000765323638916, + "learning_rate": 6.73148463668493e-06, + "loss": 0.2432, + "step": 15553 + }, + { + "epoch": 0.3936027532454387, + "grad_norm": 5.90366792678833, + "learning_rate": 6.7311079527865386e-06, + "loss": 0.2299, + "step": 15554 + }, + { + "epoch": 0.39362805881013235, + "grad_norm": 4.972300052642822, + "learning_rate": 6.730731257724553e-06, + "loss": 0.2267, + "step": 15555 + }, + { + "epoch": 0.393653364374826, + "grad_norm": 8.393941879272461, + "learning_rate": 6.7303545515014045e-06, + "loss": 0.1595, + "step": 15556 + }, + { + "epoch": 0.3936786699395197, + "grad_norm": 4.261453628540039, + "learning_rate": 6.7299778341195195e-06, + "loss": 0.1612, + "step": 15557 + }, + { + "epoch": 0.3937039755042134, + "grad_norm": 3.1935880184173584, + "learning_rate": 6.729601105581331e-06, + "loss": 0.1667, + "step": 15558 + }, + { + "epoch": 0.39372928106890703, + "grad_norm": 3.3093159198760986, + "learning_rate": 6.729224365889266e-06, + "loss": 0.167, + "step": 15559 + }, + { + "epoch": 0.39375458663360074, + "grad_norm": 4.3016676902771, + "learning_rate": 6.728847615045756e-06, + "loss": 0.1974, + "step": 15560 + }, + { + "epoch": 0.3937798921982944, + "grad_norm": 9.453057289123535, + "learning_rate": 6.7284708530532285e-06, + "loss": 0.226, + "step": 15561 + }, + { + "epoch": 0.39380519776298806, + "grad_norm": 10.881773948669434, + "learning_rate": 6.728094079914114e-06, + "loss": 0.216, + "step": 15562 + }, + { + "epoch": 0.39383050332768177, + "grad_norm": 3.6411819458007812, + "learning_rate": 6.727717295630841e-06, + "loss": 0.133, + "step": 15563 + }, + { + "epoch": 0.3938558088923754, + "grad_norm": 3.7657086849212646, + "learning_rate": 6.7273405002058425e-06, + "loss": 0.199, + "step": 15564 + }, + { + "epoch": 0.39388111445706914, + "grad_norm": 3.7794885635375977, + "learning_rate": 6.726963693641545e-06, + "loss": 0.1702, + "step": 15565 + }, + { + "epoch": 0.3939064200217628, + "grad_norm": 8.93663501739502, + "learning_rate": 6.726586875940381e-06, + "loss": 0.2849, + "step": 15566 + }, + { + "epoch": 0.39393172558645645, + "grad_norm": 3.9230048656463623, + "learning_rate": 6.726210047104779e-06, + "loss": 0.2217, + "step": 15567 + }, + { + "epoch": 0.39395703115115016, + "grad_norm": 29.023836135864258, + "learning_rate": 6.725833207137169e-06, + "loss": 0.4163, + "step": 15568 + }, + { + "epoch": 0.3939823367158438, + "grad_norm": 7.143198490142822, + "learning_rate": 6.725456356039982e-06, + "loss": 0.1954, + "step": 15569 + }, + { + "epoch": 0.3940076422805375, + "grad_norm": 5.289846420288086, + "learning_rate": 6.725079493815647e-06, + "loss": 0.1483, + "step": 15570 + }, + { + "epoch": 0.3940329478452312, + "grad_norm": 7.893951892852783, + "learning_rate": 6.7247026204665965e-06, + "loss": 0.2169, + "step": 15571 + }, + { + "epoch": 0.39405825340992484, + "grad_norm": 2.6718389987945557, + "learning_rate": 6.7243257359952595e-06, + "loss": 0.1985, + "step": 15572 + }, + { + "epoch": 0.3940835589746185, + "grad_norm": 5.483427047729492, + "learning_rate": 6.723948840404066e-06, + "loss": 0.1989, + "step": 15573 + }, + { + "epoch": 0.3941088645393122, + "grad_norm": 4.763908863067627, + "learning_rate": 6.723571933695447e-06, + "loss": 0.1997, + "step": 15574 + }, + { + "epoch": 0.39413417010400587, + "grad_norm": 3.1700029373168945, + "learning_rate": 6.7231950158718354e-06, + "loss": 0.136, + "step": 15575 + }, + { + "epoch": 0.3941594756686995, + "grad_norm": 8.385258674621582, + "learning_rate": 6.722818086935656e-06, + "loss": 0.1934, + "step": 15576 + }, + { + "epoch": 0.39418478123339323, + "grad_norm": 5.417388916015625, + "learning_rate": 6.722441146889347e-06, + "loss": 0.2511, + "step": 15577 + }, + { + "epoch": 0.3942100867980869, + "grad_norm": 5.757866382598877, + "learning_rate": 6.722064195735332e-06, + "loss": 0.1908, + "step": 15578 + }, + { + "epoch": 0.3942353923627806, + "grad_norm": 6.3581671714782715, + "learning_rate": 6.721687233476049e-06, + "loss": 0.1973, + "step": 15579 + }, + { + "epoch": 0.39426069792747426, + "grad_norm": 7.536074638366699, + "learning_rate": 6.721310260113922e-06, + "loss": 0.1792, + "step": 15580 + }, + { + "epoch": 0.3942860034921679, + "grad_norm": 4.1361870765686035, + "learning_rate": 6.720933275651387e-06, + "loss": 0.1355, + "step": 15581 + }, + { + "epoch": 0.3943113090568616, + "grad_norm": 3.573173761367798, + "learning_rate": 6.720556280090873e-06, + "loss": 0.1757, + "step": 15582 + }, + { + "epoch": 0.3943366146215553, + "grad_norm": 3.815920352935791, + "learning_rate": 6.720179273434812e-06, + "loss": 0.2421, + "step": 15583 + }, + { + "epoch": 0.39436192018624894, + "grad_norm": 4.314471244812012, + "learning_rate": 6.719802255685633e-06, + "loss": 0.1253, + "step": 15584 + }, + { + "epoch": 0.39438722575094265, + "grad_norm": 5.432374954223633, + "learning_rate": 6.719425226845772e-06, + "loss": 0.1736, + "step": 15585 + }, + { + "epoch": 0.3944125313156363, + "grad_norm": 11.390629768371582, + "learning_rate": 6.719048186917656e-06, + "loss": 0.1402, + "step": 15586 + }, + { + "epoch": 0.39443783688032996, + "grad_norm": 6.005814552307129, + "learning_rate": 6.718671135903718e-06, + "loss": 0.2158, + "step": 15587 + }, + { + "epoch": 0.3944631424450237, + "grad_norm": 9.28184986114502, + "learning_rate": 6.71829407380639e-06, + "loss": 0.1462, + "step": 15588 + }, + { + "epoch": 0.39448844800971733, + "grad_norm": 9.352922439575195, + "learning_rate": 6.717917000628102e-06, + "loss": 0.2061, + "step": 15589 + }, + { + "epoch": 0.394513753574411, + "grad_norm": 3.5453274250030518, + "learning_rate": 6.7175399163712884e-06, + "loss": 0.0932, + "step": 15590 + }, + { + "epoch": 0.3945390591391047, + "grad_norm": 8.618471145629883, + "learning_rate": 6.717162821038379e-06, + "loss": 0.21, + "step": 15591 + }, + { + "epoch": 0.39456436470379835, + "grad_norm": 4.523120880126953, + "learning_rate": 6.716785714631804e-06, + "loss": 0.223, + "step": 15592 + }, + { + "epoch": 0.39458967026849207, + "grad_norm": 4.919244289398193, + "learning_rate": 6.716408597154e-06, + "loss": 0.1952, + "step": 15593 + }, + { + "epoch": 0.3946149758331857, + "grad_norm": 5.823119640350342, + "learning_rate": 6.716031468607395e-06, + "loss": 0.1868, + "step": 15594 + }, + { + "epoch": 0.3946402813978794, + "grad_norm": 7.8263750076293945, + "learning_rate": 6.715654328994422e-06, + "loss": 0.2088, + "step": 15595 + }, + { + "epoch": 0.3946655869625731, + "grad_norm": 7.5131611824035645, + "learning_rate": 6.7152771783175144e-06, + "loss": 0.2256, + "step": 15596 + }, + { + "epoch": 0.39469089252726675, + "grad_norm": 5.516415596008301, + "learning_rate": 6.714900016579102e-06, + "loss": 0.1177, + "step": 15597 + }, + { + "epoch": 0.3947161980919604, + "grad_norm": 11.203606605529785, + "learning_rate": 6.714522843781619e-06, + "loss": 0.1714, + "step": 15598 + }, + { + "epoch": 0.3947415036566541, + "grad_norm": 8.297408103942871, + "learning_rate": 6.714145659927498e-06, + "loss": 0.2814, + "step": 15599 + }, + { + "epoch": 0.39476680922134777, + "grad_norm": 5.43072509765625, + "learning_rate": 6.7137684650191695e-06, + "loss": 0.1956, + "step": 15600 + }, + { + "epoch": 0.3947921147860414, + "grad_norm": 2.170464038848877, + "learning_rate": 6.713391259059066e-06, + "loss": 0.0873, + "step": 15601 + }, + { + "epoch": 0.39481742035073514, + "grad_norm": 11.560474395751953, + "learning_rate": 6.713014042049622e-06, + "loss": 0.1891, + "step": 15602 + }, + { + "epoch": 0.3948427259154288, + "grad_norm": 6.146017551422119, + "learning_rate": 6.712636813993269e-06, + "loss": 0.1066, + "step": 15603 + }, + { + "epoch": 0.3948680314801225, + "grad_norm": 9.09431266784668, + "learning_rate": 6.712259574892441e-06, + "loss": 0.2495, + "step": 15604 + }, + { + "epoch": 0.39489333704481616, + "grad_norm": 6.581905841827393, + "learning_rate": 6.711882324749568e-06, + "loss": 0.2809, + "step": 15605 + }, + { + "epoch": 0.3949186426095098, + "grad_norm": 4.021815776824951, + "learning_rate": 6.7115050635670855e-06, + "loss": 0.1617, + "step": 15606 + }, + { + "epoch": 0.39494394817420353, + "grad_norm": 4.8865790367126465, + "learning_rate": 6.711127791347424e-06, + "loss": 0.2167, + "step": 15607 + }, + { + "epoch": 0.3949692537388972, + "grad_norm": 3.699692964553833, + "learning_rate": 6.7107505080930176e-06, + "loss": 0.1071, + "step": 15608 + }, + { + "epoch": 0.39499455930359084, + "grad_norm": 3.297919273376465, + "learning_rate": 6.7103732138063e-06, + "loss": 0.1524, + "step": 15609 + }, + { + "epoch": 0.39501986486828455, + "grad_norm": 3.921090602874756, + "learning_rate": 6.709995908489705e-06, + "loss": 0.204, + "step": 15610 + }, + { + "epoch": 0.3950451704329782, + "grad_norm": 3.999302387237549, + "learning_rate": 6.709618592145662e-06, + "loss": 0.1272, + "step": 15611 + }, + { + "epoch": 0.39507047599767187, + "grad_norm": 3.9630255699157715, + "learning_rate": 6.70924126477661e-06, + "loss": 0.1764, + "step": 15612 + }, + { + "epoch": 0.3950957815623656, + "grad_norm": 4.779752254486084, + "learning_rate": 6.708863926384976e-06, + "loss": 0.1457, + "step": 15613 + }, + { + "epoch": 0.39512108712705923, + "grad_norm": 8.989718437194824, + "learning_rate": 6.708486576973197e-06, + "loss": 0.2198, + "step": 15614 + }, + { + "epoch": 0.3951463926917529, + "grad_norm": 4.672971248626709, + "learning_rate": 6.708109216543707e-06, + "loss": 0.1603, + "step": 15615 + }, + { + "epoch": 0.3951716982564466, + "grad_norm": 5.522369861602783, + "learning_rate": 6.707731845098937e-06, + "loss": 0.1837, + "step": 15616 + }, + { + "epoch": 0.39519700382114026, + "grad_norm": 3.2569780349731445, + "learning_rate": 6.707354462641322e-06, + "loss": 0.1728, + "step": 15617 + }, + { + "epoch": 0.39522230938583397, + "grad_norm": 9.200366020202637, + "learning_rate": 6.706977069173296e-06, + "loss": 0.2378, + "step": 15618 + }, + { + "epoch": 0.3952476149505276, + "grad_norm": 3.618330717086792, + "learning_rate": 6.706599664697293e-06, + "loss": 0.1341, + "step": 15619 + }, + { + "epoch": 0.3952729205152213, + "grad_norm": 13.161518096923828, + "learning_rate": 6.706222249215746e-06, + "loss": 0.3164, + "step": 15620 + }, + { + "epoch": 0.395298226079915, + "grad_norm": 5.642309665679932, + "learning_rate": 6.705844822731089e-06, + "loss": 0.2088, + "step": 15621 + }, + { + "epoch": 0.39532353164460865, + "grad_norm": 6.518332481384277, + "learning_rate": 6.705467385245757e-06, + "loss": 0.2519, + "step": 15622 + }, + { + "epoch": 0.3953488372093023, + "grad_norm": 3.6630401611328125, + "learning_rate": 6.70508993676218e-06, + "loss": 0.1134, + "step": 15623 + }, + { + "epoch": 0.395374142773996, + "grad_norm": 4.492221832275391, + "learning_rate": 6.704712477282798e-06, + "loss": 0.1166, + "step": 15624 + }, + { + "epoch": 0.3953994483386897, + "grad_norm": 3.2491207122802734, + "learning_rate": 6.704335006810041e-06, + "loss": 0.176, + "step": 15625 + }, + { + "epoch": 0.39542475390338333, + "grad_norm": 7.559648513793945, + "learning_rate": 6.7039575253463455e-06, + "loss": 0.2252, + "step": 15626 + }, + { + "epoch": 0.39545005946807704, + "grad_norm": 10.074692726135254, + "learning_rate": 6.703580032894143e-06, + "loss": 0.1876, + "step": 15627 + }, + { + "epoch": 0.3954753650327707, + "grad_norm": 5.761122226715088, + "learning_rate": 6.703202529455874e-06, + "loss": 0.139, + "step": 15628 + }, + { + "epoch": 0.3955006705974644, + "grad_norm": 4.588712215423584, + "learning_rate": 6.7028250150339646e-06, + "loss": 0.1821, + "step": 15629 + }, + { + "epoch": 0.39552597616215807, + "grad_norm": 4.347973823547363, + "learning_rate": 6.702447489630856e-06, + "loss": 0.2076, + "step": 15630 + }, + { + "epoch": 0.3955512817268517, + "grad_norm": 4.950854778289795, + "learning_rate": 6.702069953248977e-06, + "loss": 0.1897, + "step": 15631 + }, + { + "epoch": 0.39557658729154543, + "grad_norm": 5.086580753326416, + "learning_rate": 6.701692405890769e-06, + "loss": 0.1909, + "step": 15632 + }, + { + "epoch": 0.3956018928562391, + "grad_norm": 7.178652286529541, + "learning_rate": 6.701314847558661e-06, + "loss": 0.1989, + "step": 15633 + }, + { + "epoch": 0.39562719842093275, + "grad_norm": 5.704336643218994, + "learning_rate": 6.700937278255091e-06, + "loss": 0.1612, + "step": 15634 + }, + { + "epoch": 0.39565250398562646, + "grad_norm": 2.9456565380096436, + "learning_rate": 6.700559697982492e-06, + "loss": 0.1581, + "step": 15635 + }, + { + "epoch": 0.3956778095503201, + "grad_norm": 5.8261189460754395, + "learning_rate": 6.7001821067433e-06, + "loss": 0.1948, + "step": 15636 + }, + { + "epoch": 0.39570311511501377, + "grad_norm": 3.746920585632324, + "learning_rate": 6.6998045045399505e-06, + "loss": 0.129, + "step": 15637 + }, + { + "epoch": 0.3957284206797075, + "grad_norm": 6.040470123291016, + "learning_rate": 6.699426891374878e-06, + "loss": 0.1762, + "step": 15638 + }, + { + "epoch": 0.39575372624440114, + "grad_norm": 11.633291244506836, + "learning_rate": 6.699049267250518e-06, + "loss": 0.1815, + "step": 15639 + }, + { + "epoch": 0.3957790318090948, + "grad_norm": 4.268375873565674, + "learning_rate": 6.698671632169304e-06, + "loss": 0.1794, + "step": 15640 + }, + { + "epoch": 0.3958043373737885, + "grad_norm": 5.424485683441162, + "learning_rate": 6.6982939861336735e-06, + "loss": 0.1566, + "step": 15641 + }, + { + "epoch": 0.39582964293848216, + "grad_norm": 3.7567875385284424, + "learning_rate": 6.697916329146062e-06, + "loss": 0.1392, + "step": 15642 + }, + { + "epoch": 0.3958549485031759, + "grad_norm": 3.4489176273345947, + "learning_rate": 6.697538661208902e-06, + "loss": 0.1697, + "step": 15643 + }, + { + "epoch": 0.39588025406786953, + "grad_norm": 3.7700917720794678, + "learning_rate": 6.697160982324632e-06, + "loss": 0.1824, + "step": 15644 + }, + { + "epoch": 0.3959055596325632, + "grad_norm": 5.347061634063721, + "learning_rate": 6.696783292495685e-06, + "loss": 0.2671, + "step": 15645 + }, + { + "epoch": 0.3959308651972569, + "grad_norm": 4.487453460693359, + "learning_rate": 6.696405591724501e-06, + "loss": 0.2195, + "step": 15646 + }, + { + "epoch": 0.39595617076195055, + "grad_norm": 17.540193557739258, + "learning_rate": 6.6960278800135104e-06, + "loss": 0.2355, + "step": 15647 + }, + { + "epoch": 0.3959814763266442, + "grad_norm": 12.02097225189209, + "learning_rate": 6.695650157365152e-06, + "loss": 0.2742, + "step": 15648 + }, + { + "epoch": 0.3960067818913379, + "grad_norm": 3.6800994873046875, + "learning_rate": 6.695272423781861e-06, + "loss": 0.1734, + "step": 15649 + }, + { + "epoch": 0.3960320874560316, + "grad_norm": 4.931004047393799, + "learning_rate": 6.694894679266075e-06, + "loss": 0.1657, + "step": 15650 + }, + { + "epoch": 0.39605739302072523, + "grad_norm": 5.805148601531982, + "learning_rate": 6.694516923820227e-06, + "loss": 0.172, + "step": 15651 + }, + { + "epoch": 0.39608269858541895, + "grad_norm": 17.34506607055664, + "learning_rate": 6.694139157446755e-06, + "loss": 0.1209, + "step": 15652 + }, + { + "epoch": 0.3961080041501126, + "grad_norm": 6.625548362731934, + "learning_rate": 6.693761380148095e-06, + "loss": 0.193, + "step": 15653 + }, + { + "epoch": 0.39613330971480626, + "grad_norm": 8.511496543884277, + "learning_rate": 6.693383591926683e-06, + "loss": 0.257, + "step": 15654 + }, + { + "epoch": 0.39615861527949997, + "grad_norm": 4.4445881843566895, + "learning_rate": 6.693005792784955e-06, + "loss": 0.1511, + "step": 15655 + }, + { + "epoch": 0.3961839208441936, + "grad_norm": 6.377466201782227, + "learning_rate": 6.692627982725346e-06, + "loss": 0.142, + "step": 15656 + }, + { + "epoch": 0.39620922640888734, + "grad_norm": 4.397558212280273, + "learning_rate": 6.692250161750295e-06, + "loss": 0.2056, + "step": 15657 + }, + { + "epoch": 0.396234531973581, + "grad_norm": 5.219418048858643, + "learning_rate": 6.691872329862239e-06, + "loss": 0.1398, + "step": 15658 + }, + { + "epoch": 0.39625983753827465, + "grad_norm": 9.933883666992188, + "learning_rate": 6.691494487063612e-06, + "loss": 0.2886, + "step": 15659 + }, + { + "epoch": 0.39628514310296836, + "grad_norm": 5.433518409729004, + "learning_rate": 6.6911166333568515e-06, + "loss": 0.1809, + "step": 15660 + }, + { + "epoch": 0.396310448667662, + "grad_norm": 5.543102741241455, + "learning_rate": 6.690738768744395e-06, + "loss": 0.172, + "step": 15661 + }, + { + "epoch": 0.3963357542323557, + "grad_norm": 14.01550006866455, + "learning_rate": 6.6903608932286776e-06, + "loss": 0.2623, + "step": 15662 + }, + { + "epoch": 0.3963610597970494, + "grad_norm": 5.381435394287109, + "learning_rate": 6.6899830068121385e-06, + "loss": 0.2085, + "step": 15663 + }, + { + "epoch": 0.39638636536174304, + "grad_norm": 6.223716735839844, + "learning_rate": 6.689605109497213e-06, + "loss": 0.2197, + "step": 15664 + }, + { + "epoch": 0.3964116709264367, + "grad_norm": 7.1862945556640625, + "learning_rate": 6.689227201286337e-06, + "loss": 0.1761, + "step": 15665 + }, + { + "epoch": 0.3964369764911304, + "grad_norm": 3.689713716506958, + "learning_rate": 6.68884928218195e-06, + "loss": 0.1898, + "step": 15666 + }, + { + "epoch": 0.39646228205582407, + "grad_norm": 5.416590213775635, + "learning_rate": 6.688471352186488e-06, + "loss": 0.2424, + "step": 15667 + }, + { + "epoch": 0.3964875876205178, + "grad_norm": 5.933425426483154, + "learning_rate": 6.6880934113023885e-06, + "loss": 0.2154, + "step": 15668 + }, + { + "epoch": 0.39651289318521143, + "grad_norm": 9.236079216003418, + "learning_rate": 6.687715459532089e-06, + "loss": 0.2416, + "step": 15669 + }, + { + "epoch": 0.3965381987499051, + "grad_norm": 2.8646655082702637, + "learning_rate": 6.687337496878026e-06, + "loss": 0.1623, + "step": 15670 + }, + { + "epoch": 0.3965635043145988, + "grad_norm": 4.376464366912842, + "learning_rate": 6.686959523342639e-06, + "loss": 0.1524, + "step": 15671 + }, + { + "epoch": 0.39658880987929246, + "grad_norm": 11.78936767578125, + "learning_rate": 6.686581538928362e-06, + "loss": 0.1682, + "step": 15672 + }, + { + "epoch": 0.3966141154439861, + "grad_norm": 3.4212515354156494, + "learning_rate": 6.686203543637635e-06, + "loss": 0.1508, + "step": 15673 + }, + { + "epoch": 0.3966394210086798, + "grad_norm": 4.605261325836182, + "learning_rate": 6.6858255374728945e-06, + "loss": 0.2536, + "step": 15674 + }, + { + "epoch": 0.3966647265733735, + "grad_norm": 6.315799236297607, + "learning_rate": 6.6854475204365786e-06, + "loss": 0.1775, + "step": 15675 + }, + { + "epoch": 0.39669003213806714, + "grad_norm": 5.122149467468262, + "learning_rate": 6.685069492531126e-06, + "loss": 0.1385, + "step": 15676 + }, + { + "epoch": 0.39671533770276085, + "grad_norm": 11.184601783752441, + "learning_rate": 6.684691453758973e-06, + "loss": 0.3176, + "step": 15677 + }, + { + "epoch": 0.3967406432674545, + "grad_norm": 7.019400596618652, + "learning_rate": 6.684313404122558e-06, + "loss": 0.1618, + "step": 15678 + }, + { + "epoch": 0.39676594883214816, + "grad_norm": 4.954742908477783, + "learning_rate": 6.68393534362432e-06, + "loss": 0.2096, + "step": 15679 + }, + { + "epoch": 0.3967912543968419, + "grad_norm": 3.43019962310791, + "learning_rate": 6.683557272266695e-06, + "loss": 0.1025, + "step": 15680 + }, + { + "epoch": 0.39681655996153553, + "grad_norm": 4.133613586425781, + "learning_rate": 6.683179190052124e-06, + "loss": 0.1578, + "step": 15681 + }, + { + "epoch": 0.39684186552622924, + "grad_norm": 10.877959251403809, + "learning_rate": 6.682801096983041e-06, + "loss": 0.1437, + "step": 15682 + }, + { + "epoch": 0.3968671710909229, + "grad_norm": 10.6312255859375, + "learning_rate": 6.68242299306189e-06, + "loss": 0.2495, + "step": 15683 + }, + { + "epoch": 0.39689247665561656, + "grad_norm": 4.553867340087891, + "learning_rate": 6.682044878291103e-06, + "loss": 0.2224, + "step": 15684 + }, + { + "epoch": 0.39691778222031027, + "grad_norm": 7.859106063842773, + "learning_rate": 6.681666752673124e-06, + "loss": 0.2407, + "step": 15685 + }, + { + "epoch": 0.3969430877850039, + "grad_norm": 4.868499279022217, + "learning_rate": 6.681288616210386e-06, + "loss": 0.1255, + "step": 15686 + }, + { + "epoch": 0.3969683933496976, + "grad_norm": 4.813555717468262, + "learning_rate": 6.680910468905333e-06, + "loss": 0.2299, + "step": 15687 + }, + { + "epoch": 0.3969936989143913, + "grad_norm": 7.427838325500488, + "learning_rate": 6.6805323107604005e-06, + "loss": 0.0995, + "step": 15688 + }, + { + "epoch": 0.39701900447908495, + "grad_norm": 4.187067031860352, + "learning_rate": 6.680154141778027e-06, + "loss": 0.1738, + "step": 15689 + }, + { + "epoch": 0.3970443100437786, + "grad_norm": 7.376218318939209, + "learning_rate": 6.679775961960653e-06, + "loss": 0.3797, + "step": 15690 + }, + { + "epoch": 0.3970696156084723, + "grad_norm": 4.3561601638793945, + "learning_rate": 6.679397771310715e-06, + "loss": 0.1557, + "step": 15691 + }, + { + "epoch": 0.39709492117316597, + "grad_norm": 3.7567129135131836, + "learning_rate": 6.679019569830655e-06, + "loss": 0.1395, + "step": 15692 + }, + { + "epoch": 0.3971202267378597, + "grad_norm": 3.1152186393737793, + "learning_rate": 6.678641357522908e-06, + "loss": 0.1374, + "step": 15693 + }, + { + "epoch": 0.39714553230255334, + "grad_norm": 4.193688869476318, + "learning_rate": 6.678263134389918e-06, + "loss": 0.1509, + "step": 15694 + }, + { + "epoch": 0.397170837867247, + "grad_norm": 2.9960081577301025, + "learning_rate": 6.677884900434117e-06, + "loss": 0.139, + "step": 15695 + }, + { + "epoch": 0.3971961434319407, + "grad_norm": 3.2343974113464355, + "learning_rate": 6.6775066556579524e-06, + "loss": 0.1801, + "step": 15696 + }, + { + "epoch": 0.39722144899663436, + "grad_norm": 2.8985307216644287, + "learning_rate": 6.677128400063857e-06, + "loss": 0.1368, + "step": 15697 + }, + { + "epoch": 0.397246754561328, + "grad_norm": 4.0292510986328125, + "learning_rate": 6.676750133654272e-06, + "loss": 0.226, + "step": 15698 + }, + { + "epoch": 0.39727206012602173, + "grad_norm": 6.431515693664551, + "learning_rate": 6.67637185643164e-06, + "loss": 0.1579, + "step": 15699 + }, + { + "epoch": 0.3972973656907154, + "grad_norm": 10.965458869934082, + "learning_rate": 6.6759935683983965e-06, + "loss": 0.2532, + "step": 15700 + }, + { + "epoch": 0.39732267125540904, + "grad_norm": 3.1465976238250732, + "learning_rate": 6.675615269556982e-06, + "loss": 0.1543, + "step": 15701 + }, + { + "epoch": 0.39734797682010276, + "grad_norm": 4.9910807609558105, + "learning_rate": 6.675236959909836e-06, + "loss": 0.1388, + "step": 15702 + }, + { + "epoch": 0.3973732823847964, + "grad_norm": 5.346615791320801, + "learning_rate": 6.674858639459401e-06, + "loss": 0.188, + "step": 15703 + }, + { + "epoch": 0.39739858794949007, + "grad_norm": 4.9488606452941895, + "learning_rate": 6.674480308208111e-06, + "loss": 0.1685, + "step": 15704 + }, + { + "epoch": 0.3974238935141838, + "grad_norm": 3.226154088973999, + "learning_rate": 6.67410196615841e-06, + "loss": 0.1495, + "step": 15705 + }, + { + "epoch": 0.39744919907887744, + "grad_norm": 3.5713679790496826, + "learning_rate": 6.673723613312737e-06, + "loss": 0.1248, + "step": 15706 + }, + { + "epoch": 0.39747450464357115, + "grad_norm": 4.894337177276611, + "learning_rate": 6.673345249673532e-06, + "loss": 0.1485, + "step": 15707 + }, + { + "epoch": 0.3974998102082648, + "grad_norm": 6.420823574066162, + "learning_rate": 6.6729668752432345e-06, + "loss": 0.1997, + "step": 15708 + }, + { + "epoch": 0.39752511577295846, + "grad_norm": 5.895207405090332, + "learning_rate": 6.672588490024283e-06, + "loss": 0.2304, + "step": 15709 + }, + { + "epoch": 0.39755042133765217, + "grad_norm": 3.1607189178466797, + "learning_rate": 6.672210094019122e-06, + "loss": 0.1598, + "step": 15710 + }, + { + "epoch": 0.3975757269023458, + "grad_norm": 5.673055648803711, + "learning_rate": 6.671831687230188e-06, + "loss": 0.1728, + "step": 15711 + }, + { + "epoch": 0.3976010324670395, + "grad_norm": 3.230921506881714, + "learning_rate": 6.671453269659922e-06, + "loss": 0.1231, + "step": 15712 + }, + { + "epoch": 0.3976263380317332, + "grad_norm": 8.774234771728516, + "learning_rate": 6.6710748413107654e-06, + "loss": 0.2579, + "step": 15713 + }, + { + "epoch": 0.39765164359642685, + "grad_norm": 3.2172887325286865, + "learning_rate": 6.6706964021851595e-06, + "loss": 0.1229, + "step": 15714 + }, + { + "epoch": 0.3976769491611205, + "grad_norm": 2.2269461154937744, + "learning_rate": 6.670317952285541e-06, + "loss": 0.1469, + "step": 15715 + }, + { + "epoch": 0.3977022547258142, + "grad_norm": 4.882124423980713, + "learning_rate": 6.669939491614354e-06, + "loss": 0.114, + "step": 15716 + }, + { + "epoch": 0.3977275602905079, + "grad_norm": 3.9318482875823975, + "learning_rate": 6.669561020174037e-06, + "loss": 0.1959, + "step": 15717 + }, + { + "epoch": 0.39775286585520153, + "grad_norm": 5.540093421936035, + "learning_rate": 6.669182537967031e-06, + "loss": 0.2018, + "step": 15718 + }, + { + "epoch": 0.39777817141989524, + "grad_norm": 3.6636459827423096, + "learning_rate": 6.668804044995779e-06, + "loss": 0.169, + "step": 15719 + }, + { + "epoch": 0.3978034769845889, + "grad_norm": 3.33297061920166, + "learning_rate": 6.668425541262719e-06, + "loss": 0.1302, + "step": 15720 + }, + { + "epoch": 0.3978287825492826, + "grad_norm": 4.731149196624756, + "learning_rate": 6.668047026770295e-06, + "loss": 0.1446, + "step": 15721 + }, + { + "epoch": 0.39785408811397627, + "grad_norm": 3.168412208557129, + "learning_rate": 6.667668501520943e-06, + "loss": 0.1234, + "step": 15722 + }, + { + "epoch": 0.3978793936786699, + "grad_norm": 5.431639194488525, + "learning_rate": 6.667289965517109e-06, + "loss": 0.2081, + "step": 15723 + }, + { + "epoch": 0.39790469924336364, + "grad_norm": 6.345787048339844, + "learning_rate": 6.66691141876123e-06, + "loss": 0.1919, + "step": 15724 + }, + { + "epoch": 0.3979300048080573, + "grad_norm": 12.61830997467041, + "learning_rate": 6.666532861255752e-06, + "loss": 0.2108, + "step": 15725 + }, + { + "epoch": 0.39795531037275095, + "grad_norm": 6.672201633453369, + "learning_rate": 6.666154293003113e-06, + "loss": 0.1802, + "step": 15726 + }, + { + "epoch": 0.39798061593744466, + "grad_norm": 3.2860755920410156, + "learning_rate": 6.665775714005754e-06, + "loss": 0.1722, + "step": 15727 + }, + { + "epoch": 0.3980059215021383, + "grad_norm": 7.486366271972656, + "learning_rate": 6.665397124266117e-06, + "loss": 0.1327, + "step": 15728 + }, + { + "epoch": 0.39803122706683197, + "grad_norm": 5.851344585418701, + "learning_rate": 6.665018523786646e-06, + "loss": 0.1639, + "step": 15729 + }, + { + "epoch": 0.3980565326315257, + "grad_norm": 6.165992736816406, + "learning_rate": 6.664639912569778e-06, + "loss": 0.1269, + "step": 15730 + }, + { + "epoch": 0.39808183819621934, + "grad_norm": 4.394519329071045, + "learning_rate": 6.664261290617958e-06, + "loss": 0.1612, + "step": 15731 + }, + { + "epoch": 0.39810714376091305, + "grad_norm": 3.2067055702209473, + "learning_rate": 6.663882657933627e-06, + "loss": 0.1358, + "step": 15732 + }, + { + "epoch": 0.3981324493256067, + "grad_norm": 2.6845927238464355, + "learning_rate": 6.663504014519225e-06, + "loss": 0.1308, + "step": 15733 + }, + { + "epoch": 0.39815775489030036, + "grad_norm": 2.7873921394348145, + "learning_rate": 6.663125360377195e-06, + "loss": 0.1031, + "step": 15734 + }, + { + "epoch": 0.3981830604549941, + "grad_norm": 9.027064323425293, + "learning_rate": 6.662746695509979e-06, + "loss": 0.2167, + "step": 15735 + }, + { + "epoch": 0.39820836601968773, + "grad_norm": 8.23211669921875, + "learning_rate": 6.66236801992002e-06, + "loss": 0.195, + "step": 15736 + }, + { + "epoch": 0.3982336715843814, + "grad_norm": 3.740687847137451, + "learning_rate": 6.661989333609757e-06, + "loss": 0.1461, + "step": 15737 + }, + { + "epoch": 0.3982589771490751, + "grad_norm": 3.8015239238739014, + "learning_rate": 6.6616106365816354e-06, + "loss": 0.1714, + "step": 15738 + }, + { + "epoch": 0.39828428271376876, + "grad_norm": 4.093530654907227, + "learning_rate": 6.6612319288380955e-06, + "loss": 0.1181, + "step": 15739 + }, + { + "epoch": 0.3983095882784624, + "grad_norm": 3.579793691635132, + "learning_rate": 6.660853210381579e-06, + "loss": 0.1505, + "step": 15740 + }, + { + "epoch": 0.3983348938431561, + "grad_norm": 4.711146831512451, + "learning_rate": 6.660474481214531e-06, + "loss": 0.2374, + "step": 15741 + }, + { + "epoch": 0.3983601994078498, + "grad_norm": 7.949561595916748, + "learning_rate": 6.66009574133939e-06, + "loss": 0.1797, + "step": 15742 + }, + { + "epoch": 0.39838550497254344, + "grad_norm": 4.898477554321289, + "learning_rate": 6.659716990758603e-06, + "loss": 0.2497, + "step": 15743 + }, + { + "epoch": 0.39841081053723715, + "grad_norm": 3.4845473766326904, + "learning_rate": 6.6593382294746065e-06, + "loss": 0.2236, + "step": 15744 + }, + { + "epoch": 0.3984361161019308, + "grad_norm": 4.5156378746032715, + "learning_rate": 6.658959457489848e-06, + "loss": 0.1617, + "step": 15745 + }, + { + "epoch": 0.3984614216666245, + "grad_norm": 9.009622573852539, + "learning_rate": 6.6585806748067685e-06, + "loss": 0.2815, + "step": 15746 + }, + { + "epoch": 0.39848672723131817, + "grad_norm": 4.350069046020508, + "learning_rate": 6.658201881427811e-06, + "loss": 0.1625, + "step": 15747 + }, + { + "epoch": 0.39851203279601183, + "grad_norm": 3.5498480796813965, + "learning_rate": 6.657823077355417e-06, + "loss": 0.1902, + "step": 15748 + }, + { + "epoch": 0.39853733836070554, + "grad_norm": 3.512477397918701, + "learning_rate": 6.657444262592032e-06, + "loss": 0.1638, + "step": 15749 + }, + { + "epoch": 0.3985626439253992, + "grad_norm": 2.6130785942077637, + "learning_rate": 6.657065437140096e-06, + "loss": 0.1066, + "step": 15750 + }, + { + "epoch": 0.39858794949009285, + "grad_norm": 8.937309265136719, + "learning_rate": 6.656686601002053e-06, + "loss": 0.3425, + "step": 15751 + }, + { + "epoch": 0.39861325505478656, + "grad_norm": 7.3685126304626465, + "learning_rate": 6.656307754180348e-06, + "loss": 0.1767, + "step": 15752 + }, + { + "epoch": 0.3986385606194802, + "grad_norm": 8.055916786193848, + "learning_rate": 6.65592889667742e-06, + "loss": 0.1633, + "step": 15753 + }, + { + "epoch": 0.3986638661841739, + "grad_norm": 8.148664474487305, + "learning_rate": 6.655550028495717e-06, + "loss": 0.2452, + "step": 15754 + }, + { + "epoch": 0.3986891717488676, + "grad_norm": 3.615410327911377, + "learning_rate": 6.655171149637678e-06, + "loss": 0.1557, + "step": 15755 + }, + { + "epoch": 0.39871447731356124, + "grad_norm": 3.7566895484924316, + "learning_rate": 6.654792260105749e-06, + "loss": 0.0534, + "step": 15756 + }, + { + "epoch": 0.39873978287825496, + "grad_norm": 8.236655235290527, + "learning_rate": 6.654413359902373e-06, + "loss": 0.2169, + "step": 15757 + }, + { + "epoch": 0.3987650884429486, + "grad_norm": 3.5510642528533936, + "learning_rate": 6.654034449029993e-06, + "loss": 0.104, + "step": 15758 + }, + { + "epoch": 0.39879039400764227, + "grad_norm": 4.025068283081055, + "learning_rate": 6.653655527491051e-06, + "loss": 0.1068, + "step": 15759 + }, + { + "epoch": 0.398815699572336, + "grad_norm": 5.297296524047852, + "learning_rate": 6.653276595287994e-06, + "loss": 0.1843, + "step": 15760 + }, + { + "epoch": 0.39884100513702964, + "grad_norm": 5.087904453277588, + "learning_rate": 6.652897652423262e-06, + "loss": 0.2005, + "step": 15761 + }, + { + "epoch": 0.3988663107017233, + "grad_norm": 5.600838661193848, + "learning_rate": 6.652518698899302e-06, + "loss": 0.1503, + "step": 15762 + }, + { + "epoch": 0.398891616266417, + "grad_norm": 4.032707691192627, + "learning_rate": 6.6521397347185556e-06, + "loss": 0.1767, + "step": 15763 + }, + { + "epoch": 0.39891692183111066, + "grad_norm": 3.659497022628784, + "learning_rate": 6.6517607598834664e-06, + "loss": 0.1661, + "step": 15764 + }, + { + "epoch": 0.3989422273958043, + "grad_norm": 5.730302810668945, + "learning_rate": 6.6513817743964814e-06, + "loss": 0.1568, + "step": 15765 + }, + { + "epoch": 0.39896753296049803, + "grad_norm": 8.813892364501953, + "learning_rate": 6.651002778260041e-06, + "loss": 0.2332, + "step": 15766 + }, + { + "epoch": 0.3989928385251917, + "grad_norm": 5.9136881828308105, + "learning_rate": 6.650623771476592e-06, + "loss": 0.1806, + "step": 15767 + }, + { + "epoch": 0.39901814408988534, + "grad_norm": 4.963469505310059, + "learning_rate": 6.650244754048576e-06, + "loss": 0.2116, + "step": 15768 + }, + { + "epoch": 0.39904344965457905, + "grad_norm": 4.009708881378174, + "learning_rate": 6.64986572597844e-06, + "loss": 0.1457, + "step": 15769 + }, + { + "epoch": 0.3990687552192727, + "grad_norm": 3.37668776512146, + "learning_rate": 6.649486687268625e-06, + "loss": 0.1376, + "step": 15770 + }, + { + "epoch": 0.3990940607839664, + "grad_norm": 7.404079437255859, + "learning_rate": 6.649107637921578e-06, + "loss": 0.2103, + "step": 15771 + }, + { + "epoch": 0.3991193663486601, + "grad_norm": 8.688417434692383, + "learning_rate": 6.648728577939743e-06, + "loss": 0.1675, + "step": 15772 + }, + { + "epoch": 0.39914467191335373, + "grad_norm": 5.467282295227051, + "learning_rate": 6.648349507325562e-06, + "loss": 0.1662, + "step": 15773 + }, + { + "epoch": 0.39916997747804744, + "grad_norm": 3.830720901489258, + "learning_rate": 6.647970426081485e-06, + "loss": 0.1614, + "step": 15774 + }, + { + "epoch": 0.3991952830427411, + "grad_norm": 3.112027168273926, + "learning_rate": 6.6475913342099505e-06, + "loss": 0.1314, + "step": 15775 + }, + { + "epoch": 0.39922058860743476, + "grad_norm": 3.414628267288208, + "learning_rate": 6.647212231713408e-06, + "loss": 0.1152, + "step": 15776 + }, + { + "epoch": 0.39924589417212847, + "grad_norm": 7.631807327270508, + "learning_rate": 6.646833118594298e-06, + "loss": 0.0791, + "step": 15777 + }, + { + "epoch": 0.3992711997368221, + "grad_norm": 8.87423324584961, + "learning_rate": 6.64645399485507e-06, + "loss": 0.1748, + "step": 15778 + }, + { + "epoch": 0.3992965053015158, + "grad_norm": 9.219552993774414, + "learning_rate": 6.646074860498165e-06, + "loss": 0.136, + "step": 15779 + }, + { + "epoch": 0.3993218108662095, + "grad_norm": 3.8057944774627686, + "learning_rate": 6.645695715526031e-06, + "loss": 0.1434, + "step": 15780 + }, + { + "epoch": 0.39934711643090315, + "grad_norm": 8.31525993347168, + "learning_rate": 6.645316559941109e-06, + "loss": 0.2883, + "step": 15781 + }, + { + "epoch": 0.3993724219955968, + "grad_norm": 6.0112996101379395, + "learning_rate": 6.644937393745848e-06, + "loss": 0.1963, + "step": 15782 + }, + { + "epoch": 0.3993977275602905, + "grad_norm": 5.28558349609375, + "learning_rate": 6.644558216942692e-06, + "loss": 0.1554, + "step": 15783 + }, + { + "epoch": 0.3994230331249842, + "grad_norm": 12.352517127990723, + "learning_rate": 6.644179029534085e-06, + "loss": 0.3039, + "step": 15784 + }, + { + "epoch": 0.3994483386896779, + "grad_norm": 2.9042041301727295, + "learning_rate": 6.643799831522474e-06, + "loss": 0.096, + "step": 15785 + }, + { + "epoch": 0.39947364425437154, + "grad_norm": 3.1430306434631348, + "learning_rate": 6.643420622910302e-06, + "loss": 0.1595, + "step": 15786 + }, + { + "epoch": 0.3994989498190652, + "grad_norm": 3.561141014099121, + "learning_rate": 6.643041403700019e-06, + "loss": 0.1781, + "step": 15787 + }, + { + "epoch": 0.3995242553837589, + "grad_norm": 2.54425311088562, + "learning_rate": 6.642662173894067e-06, + "loss": 0.1078, + "step": 15788 + }, + { + "epoch": 0.39954956094845256, + "grad_norm": 21.763137817382812, + "learning_rate": 6.642282933494891e-06, + "loss": 0.4166, + "step": 15789 + }, + { + "epoch": 0.3995748665131462, + "grad_norm": 8.057673454284668, + "learning_rate": 6.641903682504938e-06, + "loss": 0.1544, + "step": 15790 + }, + { + "epoch": 0.39960017207783993, + "grad_norm": 7.210511207580566, + "learning_rate": 6.641524420926653e-06, + "loss": 0.2072, + "step": 15791 + }, + { + "epoch": 0.3996254776425336, + "grad_norm": 8.254266738891602, + "learning_rate": 6.6411451487624835e-06, + "loss": 0.3141, + "step": 15792 + }, + { + "epoch": 0.39965078320722724, + "grad_norm": 2.075474977493286, + "learning_rate": 6.6407658660148735e-06, + "loss": 0.0722, + "step": 15793 + }, + { + "epoch": 0.39967608877192096, + "grad_norm": 3.7818686962127686, + "learning_rate": 6.64038657268627e-06, + "loss": 0.1619, + "step": 15794 + }, + { + "epoch": 0.3997013943366146, + "grad_norm": 2.832521438598633, + "learning_rate": 6.640007268779118e-06, + "loss": 0.1531, + "step": 15795 + }, + { + "epoch": 0.3997266999013083, + "grad_norm": 4.649170398712158, + "learning_rate": 6.639627954295866e-06, + "loss": 0.2824, + "step": 15796 + }, + { + "epoch": 0.399752005466002, + "grad_norm": 4.025277614593506, + "learning_rate": 6.6392486292389565e-06, + "loss": 0.1706, + "step": 15797 + }, + { + "epoch": 0.39977731103069564, + "grad_norm": 2.5707998275756836, + "learning_rate": 6.638869293610839e-06, + "loss": 0.1032, + "step": 15798 + }, + { + "epoch": 0.39980261659538935, + "grad_norm": 2.295198440551758, + "learning_rate": 6.638489947413956e-06, + "loss": 0.1381, + "step": 15799 + }, + { + "epoch": 0.399827922160083, + "grad_norm": 4.491826057434082, + "learning_rate": 6.638110590650759e-06, + "loss": 0.1175, + "step": 15800 + }, + { + "epoch": 0.39985322772477666, + "grad_norm": 6.16759729385376, + "learning_rate": 6.637731223323689e-06, + "loss": 0.2186, + "step": 15801 + }, + { + "epoch": 0.3998785332894704, + "grad_norm": 9.263199806213379, + "learning_rate": 6.637351845435197e-06, + "loss": 0.235, + "step": 15802 + }, + { + "epoch": 0.39990383885416403, + "grad_norm": 4.560266971588135, + "learning_rate": 6.636972456987726e-06, + "loss": 0.1248, + "step": 15803 + }, + { + "epoch": 0.3999291444188577, + "grad_norm": 6.0166120529174805, + "learning_rate": 6.6365930579837236e-06, + "loss": 0.2076, + "step": 15804 + }, + { + "epoch": 0.3999544499835514, + "grad_norm": 4.152877330780029, + "learning_rate": 6.636213648425639e-06, + "loss": 0.1447, + "step": 15805 + }, + { + "epoch": 0.39997975554824505, + "grad_norm": 3.284693717956543, + "learning_rate": 6.635834228315915e-06, + "loss": 0.1395, + "step": 15806 + }, + { + "epoch": 0.4000050611129387, + "grad_norm": 5.729011535644531, + "learning_rate": 6.635454797657002e-06, + "loss": 0.1953, + "step": 15807 + }, + { + "epoch": 0.4000303666776324, + "grad_norm": 7.492619037628174, + "learning_rate": 6.635075356451343e-06, + "loss": 0.3499, + "step": 15808 + }, + { + "epoch": 0.4000303666776324, + "eval_loss": 0.19394682347774506, + "eval_runtime": 69.8231, + "eval_samples_per_second": 45.744, + "eval_steps_per_second": 5.729, + "step": 15808 + }, + { + "epoch": 0.4000556722423261, + "grad_norm": 5.3607001304626465, + "learning_rate": 6.6346959047013904e-06, + "loss": 0.175, + "step": 15809 + }, + { + "epoch": 0.4000809778070198, + "grad_norm": 4.20883321762085, + "learning_rate": 6.6343164424095865e-06, + "loss": 0.1108, + "step": 15810 + }, + { + "epoch": 0.40010628337171344, + "grad_norm": 5.941440105438232, + "learning_rate": 6.633936969578379e-06, + "loss": 0.304, + "step": 15811 + }, + { + "epoch": 0.4001315889364071, + "grad_norm": 5.247284889221191, + "learning_rate": 6.633557486210217e-06, + "loss": 0.1267, + "step": 15812 + }, + { + "epoch": 0.4001568945011008, + "grad_norm": 7.1405558586120605, + "learning_rate": 6.633177992307547e-06, + "loss": 0.1722, + "step": 15813 + }, + { + "epoch": 0.40018220006579447, + "grad_norm": 5.45837926864624, + "learning_rate": 6.632798487872814e-06, + "loss": 0.1052, + "step": 15814 + }, + { + "epoch": 0.4002075056304881, + "grad_norm": 3.650791645050049, + "learning_rate": 6.632418972908467e-06, + "loss": 0.1253, + "step": 15815 + }, + { + "epoch": 0.40023281119518184, + "grad_norm": 9.22518539428711, + "learning_rate": 6.632039447416957e-06, + "loss": 0.1629, + "step": 15816 + }, + { + "epoch": 0.4002581167598755, + "grad_norm": 2.992055892944336, + "learning_rate": 6.6316599114007265e-06, + "loss": 0.1157, + "step": 15817 + }, + { + "epoch": 0.40028342232456915, + "grad_norm": 4.530770301818848, + "learning_rate": 6.631280364862225e-06, + "loss": 0.1718, + "step": 15818 + }, + { + "epoch": 0.40030872788926286, + "grad_norm": 3.418536424636841, + "learning_rate": 6.630900807803899e-06, + "loss": 0.1282, + "step": 15819 + }, + { + "epoch": 0.4003340334539565, + "grad_norm": 13.332216262817383, + "learning_rate": 6.630521240228197e-06, + "loss": 0.3019, + "step": 15820 + }, + { + "epoch": 0.40035933901865023, + "grad_norm": 2.9711897373199463, + "learning_rate": 6.630141662137568e-06, + "loss": 0.0862, + "step": 15821 + }, + { + "epoch": 0.4003846445833439, + "grad_norm": 5.410966396331787, + "learning_rate": 6.629762073534458e-06, + "loss": 0.1891, + "step": 15822 + }, + { + "epoch": 0.40040995014803754, + "grad_norm": 2.277804374694824, + "learning_rate": 6.629382474421315e-06, + "loss": 0.1087, + "step": 15823 + }, + { + "epoch": 0.40043525571273125, + "grad_norm": 3.314241409301758, + "learning_rate": 6.629002864800589e-06, + "loss": 0.1205, + "step": 15824 + }, + { + "epoch": 0.4004605612774249, + "grad_norm": 4.208225250244141, + "learning_rate": 6.628623244674726e-06, + "loss": 0.1155, + "step": 15825 + }, + { + "epoch": 0.40048586684211857, + "grad_norm": 4.814334392547607, + "learning_rate": 6.628243614046173e-06, + "loss": 0.1601, + "step": 15826 + }, + { + "epoch": 0.4005111724068123, + "grad_norm": 5.580595016479492, + "learning_rate": 6.6278639729173814e-06, + "loss": 0.1656, + "step": 15827 + }, + { + "epoch": 0.40053647797150593, + "grad_norm": 5.264279842376709, + "learning_rate": 6.627484321290797e-06, + "loss": 0.2019, + "step": 15828 + }, + { + "epoch": 0.4005617835361996, + "grad_norm": 3.6790387630462646, + "learning_rate": 6.62710465916887e-06, + "loss": 0.1435, + "step": 15829 + }, + { + "epoch": 0.4005870891008933, + "grad_norm": 6.03679895401001, + "learning_rate": 6.626724986554047e-06, + "loss": 0.1562, + "step": 15830 + }, + { + "epoch": 0.40061239466558696, + "grad_norm": 6.530252456665039, + "learning_rate": 6.626345303448778e-06, + "loss": 0.2272, + "step": 15831 + }, + { + "epoch": 0.4006377002302806, + "grad_norm": 2.7345266342163086, + "learning_rate": 6.625965609855509e-06, + "loss": 0.1178, + "step": 15832 + }, + { + "epoch": 0.4006630057949743, + "grad_norm": 3.6597185134887695, + "learning_rate": 6.625585905776692e-06, + "loss": 0.1755, + "step": 15833 + }, + { + "epoch": 0.400688311359668, + "grad_norm": 5.059306621551514, + "learning_rate": 6.625206191214773e-06, + "loss": 0.2139, + "step": 15834 + }, + { + "epoch": 0.4007136169243617, + "grad_norm": 5.721815586090088, + "learning_rate": 6.6248264661722015e-06, + "loss": 0.2232, + "step": 15835 + }, + { + "epoch": 0.40073892248905535, + "grad_norm": 3.8909378051757812, + "learning_rate": 6.624446730651426e-06, + "loss": 0.148, + "step": 15836 + }, + { + "epoch": 0.400764228053749, + "grad_norm": 6.293478012084961, + "learning_rate": 6.624066984654896e-06, + "loss": 0.1006, + "step": 15837 + }, + { + "epoch": 0.4007895336184427, + "grad_norm": 14.296002388000488, + "learning_rate": 6.62368722818506e-06, + "loss": 0.2517, + "step": 15838 + }, + { + "epoch": 0.4008148391831364, + "grad_norm": 4.475332260131836, + "learning_rate": 6.623307461244367e-06, + "loss": 0.1694, + "step": 15839 + }, + { + "epoch": 0.40084014474783003, + "grad_norm": 4.649043560028076, + "learning_rate": 6.622927683835267e-06, + "loss": 0.1722, + "step": 15840 + }, + { + "epoch": 0.40086545031252374, + "grad_norm": 5.5034356117248535, + "learning_rate": 6.622547895960207e-06, + "loss": 0.2139, + "step": 15841 + }, + { + "epoch": 0.4008907558772174, + "grad_norm": 10.486111640930176, + "learning_rate": 6.62216809762164e-06, + "loss": 0.2593, + "step": 15842 + }, + { + "epoch": 0.40091606144191105, + "grad_norm": 3.472959041595459, + "learning_rate": 6.62178828882201e-06, + "loss": 0.1156, + "step": 15843 + }, + { + "epoch": 0.40094136700660477, + "grad_norm": 3.2505974769592285, + "learning_rate": 6.621408469563771e-06, + "loss": 0.1265, + "step": 15844 + }, + { + "epoch": 0.4009666725712984, + "grad_norm": 8.60783576965332, + "learning_rate": 6.621028639849368e-06, + "loss": 0.2516, + "step": 15845 + }, + { + "epoch": 0.4009919781359921, + "grad_norm": 12.21835994720459, + "learning_rate": 6.620648799681255e-06, + "loss": 0.1822, + "step": 15846 + }, + { + "epoch": 0.4010172837006858, + "grad_norm": 2.8459982872009277, + "learning_rate": 6.620268949061877e-06, + "loss": 0.1405, + "step": 15847 + }, + { + "epoch": 0.40104258926537945, + "grad_norm": 5.531050205230713, + "learning_rate": 6.61988908799369e-06, + "loss": 0.2, + "step": 15848 + }, + { + "epoch": 0.40106789483007316, + "grad_norm": 10.242383003234863, + "learning_rate": 6.619509216479136e-06, + "loss": 0.1884, + "step": 15849 + }, + { + "epoch": 0.4010932003947668, + "grad_norm": 4.431921005249023, + "learning_rate": 6.61912933452067e-06, + "loss": 0.16, + "step": 15850 + }, + { + "epoch": 0.40111850595946047, + "grad_norm": 3.8198866844177246, + "learning_rate": 6.61874944212074e-06, + "loss": 0.0851, + "step": 15851 + }, + { + "epoch": 0.4011438115241542, + "grad_norm": 7.916647434234619, + "learning_rate": 6.618369539281795e-06, + "loss": 0.2465, + "step": 15852 + }, + { + "epoch": 0.40116911708884784, + "grad_norm": 4.509405136108398, + "learning_rate": 6.617989626006286e-06, + "loss": 0.1562, + "step": 15853 + }, + { + "epoch": 0.4011944226535415, + "grad_norm": 4.503248691558838, + "learning_rate": 6.617609702296664e-06, + "loss": 0.1456, + "step": 15854 + }, + { + "epoch": 0.4012197282182352, + "grad_norm": 7.550464630126953, + "learning_rate": 6.617229768155377e-06, + "loss": 0.2318, + "step": 15855 + }, + { + "epoch": 0.40124503378292886, + "grad_norm": 9.123357772827148, + "learning_rate": 6.616849823584875e-06, + "loss": 0.2891, + "step": 15856 + }, + { + "epoch": 0.4012703393476225, + "grad_norm": 6.949467182159424, + "learning_rate": 6.616469868587609e-06, + "loss": 0.2464, + "step": 15857 + }, + { + "epoch": 0.40129564491231623, + "grad_norm": 3.208705425262451, + "learning_rate": 6.6160899031660316e-06, + "loss": 0.1562, + "step": 15858 + }, + { + "epoch": 0.4013209504770099, + "grad_norm": 4.563851356506348, + "learning_rate": 6.615709927322591e-06, + "loss": 0.1781, + "step": 15859 + }, + { + "epoch": 0.4013462560417036, + "grad_norm": 2.3182320594787598, + "learning_rate": 6.615329941059737e-06, + "loss": 0.1204, + "step": 15860 + }, + { + "epoch": 0.40137156160639725, + "grad_norm": 5.346834182739258, + "learning_rate": 6.61494994437992e-06, + "loss": 0.1834, + "step": 15861 + }, + { + "epoch": 0.4013968671710909, + "grad_norm": 7.53691291809082, + "learning_rate": 6.614569937285593e-06, + "loss": 0.1894, + "step": 15862 + }, + { + "epoch": 0.4014221727357846, + "grad_norm": 2.9831559658050537, + "learning_rate": 6.614189919779202e-06, + "loss": 0.1345, + "step": 15863 + }, + { + "epoch": 0.4014474783004783, + "grad_norm": 5.792959690093994, + "learning_rate": 6.613809891863203e-06, + "loss": 0.2063, + "step": 15864 + }, + { + "epoch": 0.40147278386517193, + "grad_norm": 8.561556816101074, + "learning_rate": 6.6134298535400435e-06, + "loss": 0.2131, + "step": 15865 + }, + { + "epoch": 0.40149808942986565, + "grad_norm": 9.027210235595703, + "learning_rate": 6.613049804812175e-06, + "loss": 0.2863, + "step": 15866 + }, + { + "epoch": 0.4015233949945593, + "grad_norm": 4.030777931213379, + "learning_rate": 6.612669745682048e-06, + "loss": 0.1961, + "step": 15867 + }, + { + "epoch": 0.40154870055925296, + "grad_norm": 5.364043712615967, + "learning_rate": 6.612289676152112e-06, + "loss": 0.1651, + "step": 15868 + }, + { + "epoch": 0.40157400612394667, + "grad_norm": 4.120664596557617, + "learning_rate": 6.6119095962248235e-06, + "loss": 0.1373, + "step": 15869 + }, + { + "epoch": 0.4015993116886403, + "grad_norm": 7.188621520996094, + "learning_rate": 6.611529505902627e-06, + "loss": 0.2302, + "step": 15870 + }, + { + "epoch": 0.401624617253334, + "grad_norm": 4.889364719390869, + "learning_rate": 6.611149405187979e-06, + "loss": 0.1773, + "step": 15871 + }, + { + "epoch": 0.4016499228180277, + "grad_norm": 3.761199474334717, + "learning_rate": 6.610769294083325e-06, + "loss": 0.1795, + "step": 15872 + }, + { + "epoch": 0.40167522838272135, + "grad_norm": 3.006486177444458, + "learning_rate": 6.610389172591123e-06, + "loss": 0.1846, + "step": 15873 + }, + { + "epoch": 0.40170053394741506, + "grad_norm": 4.0024189949035645, + "learning_rate": 6.610009040713818e-06, + "loss": 0.1828, + "step": 15874 + }, + { + "epoch": 0.4017258395121087, + "grad_norm": 2.1906187534332275, + "learning_rate": 6.609628898453866e-06, + "loss": 0.1304, + "step": 15875 + }, + { + "epoch": 0.4017511450768024, + "grad_norm": 6.216200828552246, + "learning_rate": 6.609248745813715e-06, + "loss": 0.1756, + "step": 15876 + }, + { + "epoch": 0.4017764506414961, + "grad_norm": 5.766873836517334, + "learning_rate": 6.60886858279582e-06, + "loss": 0.2315, + "step": 15877 + }, + { + "epoch": 0.40180175620618974, + "grad_norm": 6.04135799407959, + "learning_rate": 6.6084884094026305e-06, + "loss": 0.1239, + "step": 15878 + }, + { + "epoch": 0.4018270617708834, + "grad_norm": 5.384582996368408, + "learning_rate": 6.608108225636597e-06, + "loss": 0.1929, + "step": 15879 + }, + { + "epoch": 0.4018523673355771, + "grad_norm": 4.421394348144531, + "learning_rate": 6.607728031500174e-06, + "loss": 0.1472, + "step": 15880 + }, + { + "epoch": 0.40187767290027077, + "grad_norm": 4.812070369720459, + "learning_rate": 6.607347826995812e-06, + "loss": 0.2319, + "step": 15881 + }, + { + "epoch": 0.4019029784649644, + "grad_norm": 6.728211402893066, + "learning_rate": 6.606967612125962e-06, + "loss": 0.1193, + "step": 15882 + }, + { + "epoch": 0.40192828402965813, + "grad_norm": 8.011621475219727, + "learning_rate": 6.606587386893076e-06, + "loss": 0.2625, + "step": 15883 + }, + { + "epoch": 0.4019535895943518, + "grad_norm": 3.1405158042907715, + "learning_rate": 6.606207151299609e-06, + "loss": 0.1635, + "step": 15884 + }, + { + "epoch": 0.4019788951590455, + "grad_norm": 14.137369155883789, + "learning_rate": 6.6058269053480094e-06, + "loss": 0.3068, + "step": 15885 + }, + { + "epoch": 0.40200420072373916, + "grad_norm": 5.575997829437256, + "learning_rate": 6.605446649040731e-06, + "loss": 0.1989, + "step": 15886 + }, + { + "epoch": 0.4020295062884328, + "grad_norm": 4.7639594078063965, + "learning_rate": 6.605066382380225e-06, + "loss": 0.1695, + "step": 15887 + }, + { + "epoch": 0.4020548118531265, + "grad_norm": 5.956568241119385, + "learning_rate": 6.604686105368947e-06, + "loss": 0.2293, + "step": 15888 + }, + { + "epoch": 0.4020801174178202, + "grad_norm": 4.440559387207031, + "learning_rate": 6.604305818009345e-06, + "loss": 0.212, + "step": 15889 + }, + { + "epoch": 0.40210542298251384, + "grad_norm": 6.554767608642578, + "learning_rate": 6.603925520303872e-06, + "loss": 0.1637, + "step": 15890 + }, + { + "epoch": 0.40213072854720755, + "grad_norm": 6.746870994567871, + "learning_rate": 6.603545212254984e-06, + "loss": 0.2072, + "step": 15891 + }, + { + "epoch": 0.4021560341119012, + "grad_norm": 3.9109976291656494, + "learning_rate": 6.603164893865129e-06, + "loss": 0.1577, + "step": 15892 + }, + { + "epoch": 0.40218133967659486, + "grad_norm": 3.5086889266967773, + "learning_rate": 6.602784565136764e-06, + "loss": 0.1631, + "step": 15893 + }, + { + "epoch": 0.4022066452412886, + "grad_norm": 3.9675769805908203, + "learning_rate": 6.602404226072339e-06, + "loss": 0.1998, + "step": 15894 + }, + { + "epoch": 0.40223195080598223, + "grad_norm": 3.180741548538208, + "learning_rate": 6.602023876674306e-06, + "loss": 0.1883, + "step": 15895 + }, + { + "epoch": 0.4022572563706759, + "grad_norm": 4.345023155212402, + "learning_rate": 6.60164351694512e-06, + "loss": 0.1428, + "step": 15896 + }, + { + "epoch": 0.4022825619353696, + "grad_norm": 20.749225616455078, + "learning_rate": 6.601263146887234e-06, + "loss": 0.3459, + "step": 15897 + }, + { + "epoch": 0.40230786750006325, + "grad_norm": 4.457442283630371, + "learning_rate": 6.600882766503097e-06, + "loss": 0.1547, + "step": 15898 + }, + { + "epoch": 0.40233317306475697, + "grad_norm": 12.989458084106445, + "learning_rate": 6.600502375795167e-06, + "loss": 0.3107, + "step": 15899 + }, + { + "epoch": 0.4023584786294506, + "grad_norm": 2.741339921951294, + "learning_rate": 6.600121974765894e-06, + "loss": 0.1296, + "step": 15900 + }, + { + "epoch": 0.4023837841941443, + "grad_norm": 2.6546239852905273, + "learning_rate": 6.599741563417731e-06, + "loss": 0.1188, + "step": 15901 + }, + { + "epoch": 0.402409089758838, + "grad_norm": 3.3779468536376953, + "learning_rate": 6.5993611417531345e-06, + "loss": 0.1546, + "step": 15902 + }, + { + "epoch": 0.40243439532353165, + "grad_norm": 4.551624298095703, + "learning_rate": 6.598980709774554e-06, + "loss": 0.186, + "step": 15903 + }, + { + "epoch": 0.4024597008882253, + "grad_norm": 9.832775115966797, + "learning_rate": 6.598600267484445e-06, + "loss": 0.1652, + "step": 15904 + }, + { + "epoch": 0.402485006452919, + "grad_norm": 8.210289001464844, + "learning_rate": 6.59821981488526e-06, + "loss": 0.2095, + "step": 15905 + }, + { + "epoch": 0.40251031201761267, + "grad_norm": 5.2985100746154785, + "learning_rate": 6.597839351979453e-06, + "loss": 0.1506, + "step": 15906 + }, + { + "epoch": 0.4025356175823063, + "grad_norm": 5.220443248748779, + "learning_rate": 6.597458878769477e-06, + "loss": 0.2128, + "step": 15907 + }, + { + "epoch": 0.40256092314700004, + "grad_norm": 3.7529308795928955, + "learning_rate": 6.5970783952577856e-06, + "loss": 0.1906, + "step": 15908 + }, + { + "epoch": 0.4025862287116937, + "grad_norm": 3.6900734901428223, + "learning_rate": 6.596697901446833e-06, + "loss": 0.178, + "step": 15909 + }, + { + "epoch": 0.40261153427638735, + "grad_norm": 2.8150229454040527, + "learning_rate": 6.596317397339073e-06, + "loss": 0.0932, + "step": 15910 + }, + { + "epoch": 0.40263683984108106, + "grad_norm": 6.181107044219971, + "learning_rate": 6.595936882936959e-06, + "loss": 0.0898, + "step": 15911 + }, + { + "epoch": 0.4026621454057747, + "grad_norm": 6.601971626281738, + "learning_rate": 6.595556358242944e-06, + "loss": 0.2164, + "step": 15912 + }, + { + "epoch": 0.40268745097046843, + "grad_norm": 19.29395294189453, + "learning_rate": 6.5951758232594835e-06, + "loss": 0.2414, + "step": 15913 + }, + { + "epoch": 0.4027127565351621, + "grad_norm": 6.938826560974121, + "learning_rate": 6.59479527798903e-06, + "loss": 0.1701, + "step": 15914 + }, + { + "epoch": 0.40273806209985574, + "grad_norm": 3.093099355697632, + "learning_rate": 6.594414722434041e-06, + "loss": 0.1677, + "step": 15915 + }, + { + "epoch": 0.40276336766454945, + "grad_norm": 3.285578489303589, + "learning_rate": 6.594034156596965e-06, + "loss": 0.1546, + "step": 15916 + }, + { + "epoch": 0.4027886732292431, + "grad_norm": 7.235725402832031, + "learning_rate": 6.5936535804802616e-06, + "loss": 0.2904, + "step": 15917 + }, + { + "epoch": 0.40281397879393677, + "grad_norm": 5.504022121429443, + "learning_rate": 6.5932729940863805e-06, + "loss": 0.2747, + "step": 15918 + }, + { + "epoch": 0.4028392843586305, + "grad_norm": 2.678968906402588, + "learning_rate": 6.59289239741778e-06, + "loss": 0.1295, + "step": 15919 + }, + { + "epoch": 0.40286458992332413, + "grad_norm": 3.8730616569519043, + "learning_rate": 6.592511790476911e-06, + "loss": 0.1271, + "step": 15920 + }, + { + "epoch": 0.4028898954880178, + "grad_norm": 4.290369510650635, + "learning_rate": 6.5921311732662295e-06, + "loss": 0.173, + "step": 15921 + }, + { + "epoch": 0.4029152010527115, + "grad_norm": 2.763498306274414, + "learning_rate": 6.591750545788193e-06, + "loss": 0.0652, + "step": 15922 + }, + { + "epoch": 0.40294050661740516, + "grad_norm": 2.5220723152160645, + "learning_rate": 6.591369908045251e-06, + "loss": 0.1235, + "step": 15923 + }, + { + "epoch": 0.40296581218209887, + "grad_norm": 5.58475399017334, + "learning_rate": 6.590989260039862e-06, + "loss": 0.1816, + "step": 15924 + }, + { + "epoch": 0.4029911177467925, + "grad_norm": 6.016635417938232, + "learning_rate": 6.590608601774477e-06, + "loss": 0.1727, + "step": 15925 + }, + { + "epoch": 0.4030164233114862, + "grad_norm": 7.146072864532471, + "learning_rate": 6.590227933251555e-06, + "loss": 0.2227, + "step": 15926 + }, + { + "epoch": 0.4030417288761799, + "grad_norm": 9.743425369262695, + "learning_rate": 6.589847254473548e-06, + "loss": 0.2557, + "step": 15927 + }, + { + "epoch": 0.40306703444087355, + "grad_norm": 10.970337867736816, + "learning_rate": 6.589466565442913e-06, + "loss": 0.156, + "step": 15928 + }, + { + "epoch": 0.4030923400055672, + "grad_norm": 8.37062931060791, + "learning_rate": 6.589085866162101e-06, + "loss": 0.1833, + "step": 15929 + }, + { + "epoch": 0.4031176455702609, + "grad_norm": 13.405892372131348, + "learning_rate": 6.588705156633573e-06, + "loss": 0.3978, + "step": 15930 + }, + { + "epoch": 0.4031429511349546, + "grad_norm": 2.609586238861084, + "learning_rate": 6.5883244368597785e-06, + "loss": 0.0947, + "step": 15931 + }, + { + "epoch": 0.40316825669964823, + "grad_norm": 3.334312677383423, + "learning_rate": 6.587943706843176e-06, + "loss": 0.1294, + "step": 15932 + }, + { + "epoch": 0.40319356226434194, + "grad_norm": 5.448371887207031, + "learning_rate": 6.5875629665862204e-06, + "loss": 0.1274, + "step": 15933 + }, + { + "epoch": 0.4032188678290356, + "grad_norm": 12.973134994506836, + "learning_rate": 6.587182216091368e-06, + "loss": 0.2718, + "step": 15934 + }, + { + "epoch": 0.40324417339372925, + "grad_norm": 3.5817625522613525, + "learning_rate": 6.586801455361071e-06, + "loss": 0.197, + "step": 15935 + }, + { + "epoch": 0.40326947895842297, + "grad_norm": 10.39903450012207, + "learning_rate": 6.586420684397786e-06, + "loss": 0.2355, + "step": 15936 + }, + { + "epoch": 0.4032947845231166, + "grad_norm": 4.748671054840088, + "learning_rate": 6.58603990320397e-06, + "loss": 0.203, + "step": 15937 + }, + { + "epoch": 0.40332009008781033, + "grad_norm": 5.913177490234375, + "learning_rate": 6.585659111782078e-06, + "loss": 0.2082, + "step": 15938 + }, + { + "epoch": 0.403345395652504, + "grad_norm": 6.179413795471191, + "learning_rate": 6.585278310134565e-06, + "loss": 0.1582, + "step": 15939 + }, + { + "epoch": 0.40337070121719765, + "grad_norm": 7.312123775482178, + "learning_rate": 6.584897498263887e-06, + "loss": 0.2117, + "step": 15940 + }, + { + "epoch": 0.40339600678189136, + "grad_norm": 4.4849090576171875, + "learning_rate": 6.5845166761725e-06, + "loss": 0.1981, + "step": 15941 + }, + { + "epoch": 0.403421312346585, + "grad_norm": 3.8342273235321045, + "learning_rate": 6.584135843862859e-06, + "loss": 0.1171, + "step": 15942 + }, + { + "epoch": 0.40344661791127867, + "grad_norm": 5.419796466827393, + "learning_rate": 6.58375500133742e-06, + "loss": 0.2507, + "step": 15943 + }, + { + "epoch": 0.4034719234759724, + "grad_norm": 2.783832311630249, + "learning_rate": 6.5833741485986416e-06, + "loss": 0.1381, + "step": 15944 + }, + { + "epoch": 0.40349722904066604, + "grad_norm": 5.391597747802734, + "learning_rate": 6.582993285648977e-06, + "loss": 0.195, + "step": 15945 + }, + { + "epoch": 0.4035225346053597, + "grad_norm": 3.6442768573760986, + "learning_rate": 6.582612412490883e-06, + "loss": 0.151, + "step": 15946 + }, + { + "epoch": 0.4035478401700534, + "grad_norm": 4.556238651275635, + "learning_rate": 6.582231529126815e-06, + "loss": 0.1675, + "step": 15947 + }, + { + "epoch": 0.40357314573474706, + "grad_norm": 4.202931880950928, + "learning_rate": 6.581850635559232e-06, + "loss": 0.1699, + "step": 15948 + }, + { + "epoch": 0.4035984512994408, + "grad_norm": 6.738760471343994, + "learning_rate": 6.581469731790586e-06, + "loss": 0.1735, + "step": 15949 + }, + { + "epoch": 0.40362375686413443, + "grad_norm": 5.920482158660889, + "learning_rate": 6.581088817823339e-06, + "loss": 0.1653, + "step": 15950 + }, + { + "epoch": 0.4036490624288281, + "grad_norm": 5.520725250244141, + "learning_rate": 6.580707893659942e-06, + "loss": 0.17, + "step": 15951 + }, + { + "epoch": 0.4036743679935218, + "grad_norm": 5.9941325187683105, + "learning_rate": 6.580326959302855e-06, + "loss": 0.2257, + "step": 15952 + }, + { + "epoch": 0.40369967355821545, + "grad_norm": 8.034440040588379, + "learning_rate": 6.5799460147545324e-06, + "loss": 0.193, + "step": 15953 + }, + { + "epoch": 0.4037249791229091, + "grad_norm": 4.224371433258057, + "learning_rate": 6.579565060017432e-06, + "loss": 0.1671, + "step": 15954 + }, + { + "epoch": 0.4037502846876028, + "grad_norm": 7.556125164031982, + "learning_rate": 6.579184095094012e-06, + "loss": 0.1962, + "step": 15955 + }, + { + "epoch": 0.4037755902522965, + "grad_norm": 6.170813083648682, + "learning_rate": 6.578803119986726e-06, + "loss": 0.1921, + "step": 15956 + }, + { + "epoch": 0.40380089581699014, + "grad_norm": 5.5024800300598145, + "learning_rate": 6.578422134698033e-06, + "loss": 0.1209, + "step": 15957 + }, + { + "epoch": 0.40382620138168385, + "grad_norm": 5.37725305557251, + "learning_rate": 6.5780411392303885e-06, + "loss": 0.1577, + "step": 15958 + }, + { + "epoch": 0.4038515069463775, + "grad_norm": 7.807758808135986, + "learning_rate": 6.5776601335862525e-06, + "loss": 0.2378, + "step": 15959 + }, + { + "epoch": 0.40387681251107116, + "grad_norm": 7.00758695602417, + "learning_rate": 6.577279117768076e-06, + "loss": 0.1289, + "step": 15960 + }, + { + "epoch": 0.40390211807576487, + "grad_norm": 4.193323612213135, + "learning_rate": 6.576898091778323e-06, + "loss": 0.1466, + "step": 15961 + }, + { + "epoch": 0.4039274236404585, + "grad_norm": 11.665471076965332, + "learning_rate": 6.5765170556194456e-06, + "loss": 0.2327, + "step": 15962 + }, + { + "epoch": 0.40395272920515224, + "grad_norm": 3.981645345687866, + "learning_rate": 6.5761360092939044e-06, + "loss": 0.2429, + "step": 15963 + }, + { + "epoch": 0.4039780347698459, + "grad_norm": 6.560240745544434, + "learning_rate": 6.575754952804154e-06, + "loss": 0.2552, + "step": 15964 + }, + { + "epoch": 0.40400334033453955, + "grad_norm": 4.57296085357666, + "learning_rate": 6.575373886152654e-06, + "loss": 0.1803, + "step": 15965 + }, + { + "epoch": 0.40402864589923326, + "grad_norm": 4.260028839111328, + "learning_rate": 6.574992809341861e-06, + "loss": 0.1449, + "step": 15966 + }, + { + "epoch": 0.4040539514639269, + "grad_norm": 8.569753646850586, + "learning_rate": 6.5746117223742315e-06, + "loss": 0.2355, + "step": 15967 + }, + { + "epoch": 0.4040792570286206, + "grad_norm": 4.489577293395996, + "learning_rate": 6.5742306252522245e-06, + "loss": 0.1221, + "step": 15968 + }, + { + "epoch": 0.4041045625933143, + "grad_norm": 8.148420333862305, + "learning_rate": 6.573849517978297e-06, + "loss": 0.2209, + "step": 15969 + }, + { + "epoch": 0.40412986815800794, + "grad_norm": 5.972466468811035, + "learning_rate": 6.573468400554907e-06, + "loss": 0.1367, + "step": 15970 + }, + { + "epoch": 0.4041551737227016, + "grad_norm": 7.104945659637451, + "learning_rate": 6.573087272984511e-06, + "loss": 0.2393, + "step": 15971 + }, + { + "epoch": 0.4041804792873953, + "grad_norm": 3.924907684326172, + "learning_rate": 6.572706135269569e-06, + "loss": 0.1746, + "step": 15972 + }, + { + "epoch": 0.40420578485208897, + "grad_norm": 14.032925605773926, + "learning_rate": 6.572324987412536e-06, + "loss": 0.5835, + "step": 15973 + }, + { + "epoch": 0.4042310904167826, + "grad_norm": 4.813501834869385, + "learning_rate": 6.571943829415873e-06, + "loss": 0.1693, + "step": 15974 + }, + { + "epoch": 0.40425639598147634, + "grad_norm": 7.096410274505615, + "learning_rate": 6.5715626612820364e-06, + "loss": 0.2305, + "step": 15975 + }, + { + "epoch": 0.40428170154617, + "grad_norm": 5.30460786819458, + "learning_rate": 6.571181483013484e-06, + "loss": 0.1544, + "step": 15976 + }, + { + "epoch": 0.4043070071108637, + "grad_norm": 30.580842971801758, + "learning_rate": 6.5708002946126744e-06, + "loss": 0.2673, + "step": 15977 + }, + { + "epoch": 0.40433231267555736, + "grad_norm": 5.032062530517578, + "learning_rate": 6.570419096082066e-06, + "loss": 0.2081, + "step": 15978 + }, + { + "epoch": 0.404357618240251, + "grad_norm": 5.825603008270264, + "learning_rate": 6.5700378874241175e-06, + "loss": 0.1781, + "step": 15979 + }, + { + "epoch": 0.4043829238049447, + "grad_norm": 5.793405055999756, + "learning_rate": 6.569656668641287e-06, + "loss": 0.1988, + "step": 15980 + }, + { + "epoch": 0.4044082293696384, + "grad_norm": 4.7329583168029785, + "learning_rate": 6.569275439736031e-06, + "loss": 0.2004, + "step": 15981 + }, + { + "epoch": 0.40443353493433204, + "grad_norm": 7.4749836921691895, + "learning_rate": 6.5688942007108115e-06, + "loss": 0.1749, + "step": 15982 + }, + { + "epoch": 0.40445884049902575, + "grad_norm": 5.288230895996094, + "learning_rate": 6.568512951568085e-06, + "loss": 0.1723, + "step": 15983 + }, + { + "epoch": 0.4044841460637194, + "grad_norm": 5.449615001678467, + "learning_rate": 6.568131692310309e-06, + "loss": 0.1913, + "step": 15984 + }, + { + "epoch": 0.40450945162841306, + "grad_norm": 4.493563175201416, + "learning_rate": 6.567750422939944e-06, + "loss": 0.2151, + "step": 15985 + }, + { + "epoch": 0.4045347571931068, + "grad_norm": 11.796643257141113, + "learning_rate": 6.567369143459448e-06, + "loss": 0.3605, + "step": 15986 + }, + { + "epoch": 0.40456006275780043, + "grad_norm": 7.673746585845947, + "learning_rate": 6.566987853871279e-06, + "loss": 0.1959, + "step": 15987 + }, + { + "epoch": 0.40458536832249414, + "grad_norm": 3.00770902633667, + "learning_rate": 6.566606554177899e-06, + "loss": 0.1382, + "step": 15988 + }, + { + "epoch": 0.4046106738871878, + "grad_norm": 5.2203240394592285, + "learning_rate": 6.566225244381763e-06, + "loss": 0.1334, + "step": 15989 + }, + { + "epoch": 0.40463597945188146, + "grad_norm": 3.7422678470611572, + "learning_rate": 6.565843924485333e-06, + "loss": 0.1751, + "step": 15990 + }, + { + "epoch": 0.40466128501657517, + "grad_norm": 6.267037868499756, + "learning_rate": 6.565462594491065e-06, + "loss": 0.1784, + "step": 15991 + }, + { + "epoch": 0.4046865905812688, + "grad_norm": 8.473969459533691, + "learning_rate": 6.565081254401422e-06, + "loss": 0.3173, + "step": 15992 + }, + { + "epoch": 0.4047118961459625, + "grad_norm": 1.9046858549118042, + "learning_rate": 6.5646999042188594e-06, + "loss": 0.0781, + "step": 15993 + }, + { + "epoch": 0.4047372017106562, + "grad_norm": 9.413363456726074, + "learning_rate": 6.56431854394584e-06, + "loss": 0.2826, + "step": 15994 + }, + { + "epoch": 0.40476250727534985, + "grad_norm": 5.107834339141846, + "learning_rate": 6.563937173584821e-06, + "loss": 0.1291, + "step": 15995 + }, + { + "epoch": 0.4047878128400435, + "grad_norm": 7.937182903289795, + "learning_rate": 6.563555793138261e-06, + "loss": 0.1416, + "step": 15996 + }, + { + "epoch": 0.4048131184047372, + "grad_norm": 3.3497674465179443, + "learning_rate": 6.5631744026086205e-06, + "loss": 0.1666, + "step": 15997 + }, + { + "epoch": 0.40483842396943087, + "grad_norm": 4.515687942504883, + "learning_rate": 6.56279300199836e-06, + "loss": 0.164, + "step": 15998 + }, + { + "epoch": 0.40486372953412453, + "grad_norm": 4.033939361572266, + "learning_rate": 6.562411591309938e-06, + "loss": 0.1397, + "step": 15999 + }, + { + "epoch": 0.40488903509881824, + "grad_norm": 19.847679138183594, + "learning_rate": 6.562030170545814e-06, + "loss": 0.2763, + "step": 16000 + }, + { + "epoch": 0.4049143406635119, + "grad_norm": 2.6031312942504883, + "learning_rate": 6.561648739708448e-06, + "loss": 0.1199, + "step": 16001 + }, + { + "epoch": 0.4049396462282056, + "grad_norm": 4.708900451660156, + "learning_rate": 6.561267298800299e-06, + "loss": 0.1913, + "step": 16002 + }, + { + "epoch": 0.40496495179289926, + "grad_norm": 5.003931045532227, + "learning_rate": 6.5608858478238295e-06, + "loss": 0.0921, + "step": 16003 + }, + { + "epoch": 0.4049902573575929, + "grad_norm": 4.580280780792236, + "learning_rate": 6.560504386781494e-06, + "loss": 0.2048, + "step": 16004 + }, + { + "epoch": 0.40501556292228663, + "grad_norm": 7.672571182250977, + "learning_rate": 6.56012291567576e-06, + "loss": 0.2186, + "step": 16005 + }, + { + "epoch": 0.4050408684869803, + "grad_norm": 11.947443008422852, + "learning_rate": 6.5597414345090814e-06, + "loss": 0.3216, + "step": 16006 + }, + { + "epoch": 0.40506617405167394, + "grad_norm": 4.75075101852417, + "learning_rate": 6.559359943283921e-06, + "loss": 0.1705, + "step": 16007 + }, + { + "epoch": 0.40509147961636766, + "grad_norm": 6.171209335327148, + "learning_rate": 6.558978442002738e-06, + "loss": 0.1111, + "step": 16008 + }, + { + "epoch": 0.4051167851810613, + "grad_norm": 5.387854099273682, + "learning_rate": 6.558596930667992e-06, + "loss": 0.1506, + "step": 16009 + }, + { + "epoch": 0.40514209074575497, + "grad_norm": 8.04572582244873, + "learning_rate": 6.558215409282147e-06, + "loss": 0.2048, + "step": 16010 + }, + { + "epoch": 0.4051673963104487, + "grad_norm": 4.916393756866455, + "learning_rate": 6.557833877847658e-06, + "loss": 0.1478, + "step": 16011 + }, + { + "epoch": 0.40519270187514234, + "grad_norm": 5.506516933441162, + "learning_rate": 6.55745233636699e-06, + "loss": 0.2468, + "step": 16012 + }, + { + "epoch": 0.40521800743983605, + "grad_norm": 14.239439010620117, + "learning_rate": 6.557070784842601e-06, + "loss": 0.2156, + "step": 16013 + }, + { + "epoch": 0.4052433130045297, + "grad_norm": 7.715312480926514, + "learning_rate": 6.556689223276952e-06, + "loss": 0.1136, + "step": 16014 + }, + { + "epoch": 0.40526861856922336, + "grad_norm": 19.00605583190918, + "learning_rate": 6.556307651672504e-06, + "loss": 0.202, + "step": 16015 + }, + { + "epoch": 0.40529392413391707, + "grad_norm": 3.6574647426605225, + "learning_rate": 6.555926070031717e-06, + "loss": 0.1611, + "step": 16016 + }, + { + "epoch": 0.40531922969861073, + "grad_norm": 3.224407196044922, + "learning_rate": 6.555544478357052e-06, + "loss": 0.1433, + "step": 16017 + }, + { + "epoch": 0.4053445352633044, + "grad_norm": 4.755006313323975, + "learning_rate": 6.55516287665097e-06, + "loss": 0.1513, + "step": 16018 + }, + { + "epoch": 0.4053698408279981, + "grad_norm": 9.05808162689209, + "learning_rate": 6.5547812649159324e-06, + "loss": 0.1982, + "step": 16019 + }, + { + "epoch": 0.40539514639269175, + "grad_norm": 3.7101640701293945, + "learning_rate": 6.554399643154399e-06, + "loss": 0.1879, + "step": 16020 + }, + { + "epoch": 0.4054204519573854, + "grad_norm": 3.2275867462158203, + "learning_rate": 6.5540180113688325e-06, + "loss": 0.1964, + "step": 16021 + }, + { + "epoch": 0.4054457575220791, + "grad_norm": 4.261284828186035, + "learning_rate": 6.5536363695616925e-06, + "loss": 0.1341, + "step": 16022 + }, + { + "epoch": 0.4054710630867728, + "grad_norm": 4.700982093811035, + "learning_rate": 6.553254717735441e-06, + "loss": 0.2318, + "step": 16023 + }, + { + "epoch": 0.40549636865146643, + "grad_norm": 7.500061988830566, + "learning_rate": 6.552873055892539e-06, + "loss": 0.1515, + "step": 16024 + }, + { + "epoch": 0.40552167421616014, + "grad_norm": 4.783015727996826, + "learning_rate": 6.552491384035446e-06, + "loss": 0.1523, + "step": 16025 + }, + { + "epoch": 0.4055469797808538, + "grad_norm": 6.134341239929199, + "learning_rate": 6.552109702166626e-06, + "loss": 0.2074, + "step": 16026 + }, + { + "epoch": 0.4055722853455475, + "grad_norm": 4.265987396240234, + "learning_rate": 6.551728010288539e-06, + "loss": 0.1895, + "step": 16027 + }, + { + "epoch": 0.40559759091024117, + "grad_norm": 5.302531719207764, + "learning_rate": 6.551346308403647e-06, + "loss": 0.192, + "step": 16028 + }, + { + "epoch": 0.4056228964749348, + "grad_norm": 4.246065139770508, + "learning_rate": 6.550964596514409e-06, + "loss": 0.2019, + "step": 16029 + }, + { + "epoch": 0.40564820203962854, + "grad_norm": 5.14409065246582, + "learning_rate": 6.5505828746232925e-06, + "loss": 0.1997, + "step": 16030 + }, + { + "epoch": 0.4056735076043222, + "grad_norm": 6.324789047241211, + "learning_rate": 6.550201142732753e-06, + "loss": 0.1219, + "step": 16031 + }, + { + "epoch": 0.40569881316901585, + "grad_norm": 7.371140956878662, + "learning_rate": 6.5498194008452555e-06, + "loss": 0.2528, + "step": 16032 + }, + { + "epoch": 0.40572411873370956, + "grad_norm": 4.520929336547852, + "learning_rate": 6.549437648963261e-06, + "loss": 0.1372, + "step": 16033 + }, + { + "epoch": 0.4057494242984032, + "grad_norm": 6.715509414672852, + "learning_rate": 6.549055887089231e-06, + "loss": 0.2437, + "step": 16034 + }, + { + "epoch": 0.4057747298630969, + "grad_norm": 6.578685283660889, + "learning_rate": 6.5486741152256274e-06, + "loss": 0.2052, + "step": 16035 + }, + { + "epoch": 0.4058000354277906, + "grad_norm": 3.311046600341797, + "learning_rate": 6.5482923333749125e-06, + "loss": 0.0952, + "step": 16036 + }, + { + "epoch": 0.40582534099248424, + "grad_norm": 4.275001525878906, + "learning_rate": 6.547910541539549e-06, + "loss": 0.1153, + "step": 16037 + }, + { + "epoch": 0.4058506465571779, + "grad_norm": 5.920114994049072, + "learning_rate": 6.547528739721999e-06, + "loss": 0.1972, + "step": 16038 + }, + { + "epoch": 0.4058759521218716, + "grad_norm": 6.3940300941467285, + "learning_rate": 6.547146927924722e-06, + "loss": 0.1853, + "step": 16039 + }, + { + "epoch": 0.40590125768656526, + "grad_norm": 4.32354736328125, + "learning_rate": 6.546765106150183e-06, + "loss": 0.1055, + "step": 16040 + }, + { + "epoch": 0.405926563251259, + "grad_norm": 7.456913948059082, + "learning_rate": 6.546383274400845e-06, + "loss": 0.1797, + "step": 16041 + }, + { + "epoch": 0.40595186881595263, + "grad_norm": 4.463170051574707, + "learning_rate": 6.546001432679167e-06, + "loss": 0.1123, + "step": 16042 + }, + { + "epoch": 0.4059771743806463, + "grad_norm": 7.596860408782959, + "learning_rate": 6.545619580987614e-06, + "loss": 0.1592, + "step": 16043 + }, + { + "epoch": 0.40600247994534, + "grad_norm": 9.219172477722168, + "learning_rate": 6.545237719328648e-06, + "loss": 0.2462, + "step": 16044 + }, + { + "epoch": 0.40602778551003366, + "grad_norm": 3.866154670715332, + "learning_rate": 6.544855847704732e-06, + "loss": 0.2226, + "step": 16045 + }, + { + "epoch": 0.4060530910747273, + "grad_norm": 4.8171067237854, + "learning_rate": 6.544473966118327e-06, + "loss": 0.1697, + "step": 16046 + }, + { + "epoch": 0.406078396639421, + "grad_norm": 3.9597103595733643, + "learning_rate": 6.544092074571898e-06, + "loss": 0.1859, + "step": 16047 + }, + { + "epoch": 0.4061037022041147, + "grad_norm": 3.0272297859191895, + "learning_rate": 6.543710173067904e-06, + "loss": 0.1093, + "step": 16048 + }, + { + "epoch": 0.40612900776880834, + "grad_norm": 4.380964279174805, + "learning_rate": 6.543328261608812e-06, + "loss": 0.0901, + "step": 16049 + }, + { + "epoch": 0.40615431333350205, + "grad_norm": 9.955771446228027, + "learning_rate": 6.542946340197083e-06, + "loss": 0.1545, + "step": 16050 + }, + { + "epoch": 0.4061796188981957, + "grad_norm": 11.778307914733887, + "learning_rate": 6.54256440883518e-06, + "loss": 0.2073, + "step": 16051 + }, + { + "epoch": 0.4062049244628894, + "grad_norm": 9.529869079589844, + "learning_rate": 6.542182467525566e-06, + "loss": 0.1626, + "step": 16052 + }, + { + "epoch": 0.4062302300275831, + "grad_norm": 2.294539213180542, + "learning_rate": 6.5418005162707045e-06, + "loss": 0.0617, + "step": 16053 + }, + { + "epoch": 0.40625553559227673, + "grad_norm": 3.3666179180145264, + "learning_rate": 6.541418555073058e-06, + "loss": 0.1614, + "step": 16054 + }, + { + "epoch": 0.40628084115697044, + "grad_norm": 4.262901782989502, + "learning_rate": 6.54103658393509e-06, + "loss": 0.1402, + "step": 16055 + }, + { + "epoch": 0.4063061467216641, + "grad_norm": 4.5480732917785645, + "learning_rate": 6.540654602859264e-06, + "loss": 0.1347, + "step": 16056 + }, + { + "epoch": 0.40633145228635775, + "grad_norm": 4.206072807312012, + "learning_rate": 6.540272611848042e-06, + "loss": 0.1645, + "step": 16057 + }, + { + "epoch": 0.40635675785105146, + "grad_norm": 1.945169448852539, + "learning_rate": 6.5398906109038895e-06, + "loss": 0.1097, + "step": 16058 + }, + { + "epoch": 0.4063820634157451, + "grad_norm": 7.355815887451172, + "learning_rate": 6.539508600029268e-06, + "loss": 0.2165, + "step": 16059 + }, + { + "epoch": 0.4064073689804388, + "grad_norm": 7.554340839385986, + "learning_rate": 6.539126579226642e-06, + "loss": 0.1899, + "step": 16060 + }, + { + "epoch": 0.4064326745451325, + "grad_norm": 10.698681831359863, + "learning_rate": 6.538744548498477e-06, + "loss": 0.335, + "step": 16061 + }, + { + "epoch": 0.40645798010982614, + "grad_norm": 4.427137851715088, + "learning_rate": 6.5383625078472316e-06, + "loss": 0.1499, + "step": 16062 + }, + { + "epoch": 0.4064832856745198, + "grad_norm": 4.09649133682251, + "learning_rate": 6.537980457275374e-06, + "loss": 0.1736, + "step": 16063 + }, + { + "epoch": 0.4065085912392135, + "grad_norm": 4.569270610809326, + "learning_rate": 6.537598396785366e-06, + "loss": 0.2094, + "step": 16064 + }, + { + "epoch": 0.40653389680390717, + "grad_norm": 4.306431293487549, + "learning_rate": 6.537216326379674e-06, + "loss": 0.1463, + "step": 16065 + }, + { + "epoch": 0.4065592023686009, + "grad_norm": 7.431410789489746, + "learning_rate": 6.536834246060757e-06, + "loss": 0.2075, + "step": 16066 + }, + { + "epoch": 0.40658450793329454, + "grad_norm": 4.366668701171875, + "learning_rate": 6.5364521558310835e-06, + "loss": 0.1846, + "step": 16067 + }, + { + "epoch": 0.4066098134979882, + "grad_norm": 5.632320404052734, + "learning_rate": 6.5360700556931155e-06, + "loss": 0.1718, + "step": 16068 + }, + { + "epoch": 0.4066351190626819, + "grad_norm": 2.4554316997528076, + "learning_rate": 6.535687945649318e-06, + "loss": 0.1257, + "step": 16069 + }, + { + "epoch": 0.40666042462737556, + "grad_norm": 4.436645030975342, + "learning_rate": 6.535305825702154e-06, + "loss": 0.1407, + "step": 16070 + }, + { + "epoch": 0.4066857301920692, + "grad_norm": 5.192242622375488, + "learning_rate": 6.534923695854087e-06, + "loss": 0.1348, + "step": 16071 + }, + { + "epoch": 0.40671103575676293, + "grad_norm": 8.182937622070312, + "learning_rate": 6.5345415561075845e-06, + "loss": 0.2691, + "step": 16072 + }, + { + "epoch": 0.4067363413214566, + "grad_norm": 5.248981952667236, + "learning_rate": 6.534159406465108e-06, + "loss": 0.1533, + "step": 16073 + }, + { + "epoch": 0.40676164688615024, + "grad_norm": 3.091470956802368, + "learning_rate": 6.533777246929124e-06, + "loss": 0.1407, + "step": 16074 + }, + { + "epoch": 0.40678695245084395, + "grad_norm": 9.72236442565918, + "learning_rate": 6.533395077502094e-06, + "loss": 0.2821, + "step": 16075 + }, + { + "epoch": 0.4068122580155376, + "grad_norm": 7.328351020812988, + "learning_rate": 6.533012898186486e-06, + "loss": 0.197, + "step": 16076 + }, + { + "epoch": 0.4068375635802313, + "grad_norm": 15.262632369995117, + "learning_rate": 6.532630708984762e-06, + "loss": 0.3007, + "step": 16077 + }, + { + "epoch": 0.406862869144925, + "grad_norm": 4.288548469543457, + "learning_rate": 6.5322485098993885e-06, + "loss": 0.1482, + "step": 16078 + }, + { + "epoch": 0.40688817470961863, + "grad_norm": 4.697421073913574, + "learning_rate": 6.531866300932828e-06, + "loss": 0.2038, + "step": 16079 + }, + { + "epoch": 0.40691348027431234, + "grad_norm": 5.1208953857421875, + "learning_rate": 6.531484082087549e-06, + "loss": 0.1403, + "step": 16080 + }, + { + "epoch": 0.406938785839006, + "grad_norm": 2.996387004852295, + "learning_rate": 6.531101853366012e-06, + "loss": 0.1212, + "step": 16081 + }, + { + "epoch": 0.40696409140369966, + "grad_norm": 4.864319801330566, + "learning_rate": 6.530719614770685e-06, + "loss": 0.1892, + "step": 16082 + }, + { + "epoch": 0.40698939696839337, + "grad_norm": 4.3684611320495605, + "learning_rate": 6.530337366304032e-06, + "loss": 0.1686, + "step": 16083 + }, + { + "epoch": 0.407014702533087, + "grad_norm": 7.763709545135498, + "learning_rate": 6.529955107968518e-06, + "loss": 0.2144, + "step": 16084 + }, + { + "epoch": 0.4070400080977807, + "grad_norm": 6.8032355308532715, + "learning_rate": 6.529572839766608e-06, + "loss": 0.1913, + "step": 16085 + }, + { + "epoch": 0.4070653136624744, + "grad_norm": 4.303435802459717, + "learning_rate": 6.5291905617007665e-06, + "loss": 0.104, + "step": 16086 + }, + { + "epoch": 0.40709061922716805, + "grad_norm": 3.5649259090423584, + "learning_rate": 6.52880827377346e-06, + "loss": 0.1408, + "step": 16087 + }, + { + "epoch": 0.4071159247918617, + "grad_norm": 5.199815273284912, + "learning_rate": 6.5284259759871525e-06, + "loss": 0.0884, + "step": 16088 + }, + { + "epoch": 0.4071412303565554, + "grad_norm": 9.736223220825195, + "learning_rate": 6.528043668344312e-06, + "loss": 0.145, + "step": 16089 + }, + { + "epoch": 0.4071665359212491, + "grad_norm": 4.356729030609131, + "learning_rate": 6.5276613508474005e-06, + "loss": 0.121, + "step": 16090 + }, + { + "epoch": 0.4071918414859428, + "grad_norm": 7.227162837982178, + "learning_rate": 6.5272790234988855e-06, + "loss": 0.1885, + "step": 16091 + }, + { + "epoch": 0.40721714705063644, + "grad_norm": 3.463297128677368, + "learning_rate": 6.526896686301233e-06, + "loss": 0.1707, + "step": 16092 + }, + { + "epoch": 0.4072424526153301, + "grad_norm": 5.610171794891357, + "learning_rate": 6.5265143392569065e-06, + "loss": 0.1994, + "step": 16093 + }, + { + "epoch": 0.4072677581800238, + "grad_norm": 5.3662428855896, + "learning_rate": 6.526131982368373e-06, + "loss": 0.2282, + "step": 16094 + }, + { + "epoch": 0.40729306374471747, + "grad_norm": 6.236879825592041, + "learning_rate": 6.5257496156381e-06, + "loss": 0.1848, + "step": 16095 + }, + { + "epoch": 0.4073183693094111, + "grad_norm": 9.699804306030273, + "learning_rate": 6.52536723906855e-06, + "loss": 0.2213, + "step": 16096 + }, + { + "epoch": 0.40734367487410483, + "grad_norm": 12.85109806060791, + "learning_rate": 6.524984852662191e-06, + "loss": 0.2204, + "step": 16097 + }, + { + "epoch": 0.4073689804387985, + "grad_norm": 5.378658771514893, + "learning_rate": 6.5246024564214884e-06, + "loss": 0.1722, + "step": 16098 + }, + { + "epoch": 0.40739428600349215, + "grad_norm": 2.6332449913024902, + "learning_rate": 6.524220050348907e-06, + "loss": 0.0931, + "step": 16099 + }, + { + "epoch": 0.40741959156818586, + "grad_norm": 4.905941009521484, + "learning_rate": 6.523837634446916e-06, + "loss": 0.2258, + "step": 16100 + }, + { + "epoch": 0.4074448971328795, + "grad_norm": 4.761995792388916, + "learning_rate": 6.523455208717978e-06, + "loss": 0.206, + "step": 16101 + }, + { + "epoch": 0.40747020269757317, + "grad_norm": 5.989381790161133, + "learning_rate": 6.52307277316456e-06, + "loss": 0.2014, + "step": 16102 + }, + { + "epoch": 0.4074955082622669, + "grad_norm": 9.840538024902344, + "learning_rate": 6.522690327789132e-06, + "loss": 0.3077, + "step": 16103 + }, + { + "epoch": 0.40752081382696054, + "grad_norm": 3.9542055130004883, + "learning_rate": 6.522307872594155e-06, + "loss": 0.1358, + "step": 16104 + }, + { + "epoch": 0.40754611939165425, + "grad_norm": 13.160609245300293, + "learning_rate": 6.5219254075820985e-06, + "loss": 0.1695, + "step": 16105 + }, + { + "epoch": 0.4075714249563479, + "grad_norm": 5.433384418487549, + "learning_rate": 6.521542932755427e-06, + "loss": 0.1501, + "step": 16106 + }, + { + "epoch": 0.40759673052104156, + "grad_norm": 4.490730285644531, + "learning_rate": 6.52116044811661e-06, + "loss": 0.1441, + "step": 16107 + }, + { + "epoch": 0.4076220360857353, + "grad_norm": 5.462545871734619, + "learning_rate": 6.52077795366811e-06, + "loss": 0.1647, + "step": 16108 + }, + { + "epoch": 0.40764734165042893, + "grad_norm": 7.730300426483154, + "learning_rate": 6.520395449412398e-06, + "loss": 0.2294, + "step": 16109 + }, + { + "epoch": 0.4076726472151226, + "grad_norm": 3.451366662979126, + "learning_rate": 6.520012935351938e-06, + "loss": 0.1391, + "step": 16110 + }, + { + "epoch": 0.4076979527798163, + "grad_norm": 4.062559604644775, + "learning_rate": 6.519630411489198e-06, + "loss": 0.1454, + "step": 16111 + }, + { + "epoch": 0.40772325834450995, + "grad_norm": 4.991991996765137, + "learning_rate": 6.5192478778266424e-06, + "loss": 0.1645, + "step": 16112 + }, + { + "epoch": 0.4077485639092036, + "grad_norm": 20.92011260986328, + "learning_rate": 6.518865334366741e-06, + "loss": 0.143, + "step": 16113 + }, + { + "epoch": 0.4077738694738973, + "grad_norm": 10.794523239135742, + "learning_rate": 6.518482781111959e-06, + "loss": 0.3158, + "step": 16114 + }, + { + "epoch": 0.407799175038591, + "grad_norm": 13.242091178894043, + "learning_rate": 6.518100218064764e-06, + "loss": 0.2056, + "step": 16115 + }, + { + "epoch": 0.4078244806032847, + "grad_norm": 4.341546058654785, + "learning_rate": 6.517717645227624e-06, + "loss": 0.1889, + "step": 16116 + }, + { + "epoch": 0.40784978616797835, + "grad_norm": 3.236510753631592, + "learning_rate": 6.517335062603004e-06, + "loss": 0.1392, + "step": 16117 + }, + { + "epoch": 0.407875091732672, + "grad_norm": 7.413667678833008, + "learning_rate": 6.516952470193374e-06, + "loss": 0.2171, + "step": 16118 + }, + { + "epoch": 0.4079003972973657, + "grad_norm": 4.472068786621094, + "learning_rate": 6.516569868001198e-06, + "loss": 0.1601, + "step": 16119 + }, + { + "epoch": 0.40792570286205937, + "grad_norm": 3.6252620220184326, + "learning_rate": 6.5161872560289456e-06, + "loss": 0.1743, + "step": 16120 + }, + { + "epoch": 0.407951008426753, + "grad_norm": 3.0645415782928467, + "learning_rate": 6.515804634279084e-06, + "loss": 0.1627, + "step": 16121 + }, + { + "epoch": 0.40797631399144674, + "grad_norm": 5.184676170349121, + "learning_rate": 6.515422002754079e-06, + "loss": 0.1869, + "step": 16122 + }, + { + "epoch": 0.4080016195561404, + "grad_norm": 9.850235939025879, + "learning_rate": 6.515039361456401e-06, + "loss": 0.2654, + "step": 16123 + }, + { + "epoch": 0.40802692512083405, + "grad_norm": 3.5968453884124756, + "learning_rate": 6.514656710388515e-06, + "loss": 0.1025, + "step": 16124 + }, + { + "epoch": 0.40805223068552776, + "grad_norm": 4.131848335266113, + "learning_rate": 6.51427404955289e-06, + "loss": 0.1816, + "step": 16125 + }, + { + "epoch": 0.4080775362502214, + "grad_norm": 5.919795989990234, + "learning_rate": 6.513891378951993e-06, + "loss": 0.1515, + "step": 16126 + }, + { + "epoch": 0.4081028418149151, + "grad_norm": 9.563819885253906, + "learning_rate": 6.513508698588294e-06, + "loss": 0.2088, + "step": 16127 + }, + { + "epoch": 0.4081281473796088, + "grad_norm": 3.682537794113159, + "learning_rate": 6.513126008464255e-06, + "loss": 0.0984, + "step": 16128 + }, + { + "epoch": 0.40815345294430244, + "grad_norm": 10.0595703125, + "learning_rate": 6.512743308582351e-06, + "loss": 0.1638, + "step": 16129 + }, + { + "epoch": 0.40817875850899615, + "grad_norm": 14.098114967346191, + "learning_rate": 6.512360598945046e-06, + "loss": 0.3617, + "step": 16130 + }, + { + "epoch": 0.4082040640736898, + "grad_norm": 4.67425537109375, + "learning_rate": 6.5119778795548095e-06, + "loss": 0.1928, + "step": 16131 + }, + { + "epoch": 0.40822936963838347, + "grad_norm": 7.539485454559326, + "learning_rate": 6.511595150414108e-06, + "loss": 0.167, + "step": 16132 + }, + { + "epoch": 0.4082546752030772, + "grad_norm": 5.397181987762451, + "learning_rate": 6.51121241152541e-06, + "loss": 0.258, + "step": 16133 + }, + { + "epoch": 0.40827998076777083, + "grad_norm": 3.5842368602752686, + "learning_rate": 6.510829662891185e-06, + "loss": 0.1026, + "step": 16134 + }, + { + "epoch": 0.4083052863324645, + "grad_norm": 5.207937240600586, + "learning_rate": 6.510446904513901e-06, + "loss": 0.2293, + "step": 16135 + }, + { + "epoch": 0.4083305918971582, + "grad_norm": 6.890843868255615, + "learning_rate": 6.510064136396025e-06, + "loss": 0.1548, + "step": 16136 + }, + { + "epoch": 0.40835589746185186, + "grad_norm": 18.15354347229004, + "learning_rate": 6.509681358540027e-06, + "loss": 0.4132, + "step": 16137 + }, + { + "epoch": 0.4083812030265455, + "grad_norm": 6.379809379577637, + "learning_rate": 6.509298570948375e-06, + "loss": 0.2368, + "step": 16138 + }, + { + "epoch": 0.4084065085912392, + "grad_norm": 1.5447869300842285, + "learning_rate": 6.508915773623536e-06, + "loss": 0.0765, + "step": 16139 + }, + { + "epoch": 0.4084318141559329, + "grad_norm": 3.80031156539917, + "learning_rate": 6.5085329665679805e-06, + "loss": 0.1691, + "step": 16140 + }, + { + "epoch": 0.40845711972062654, + "grad_norm": 6.570369243621826, + "learning_rate": 6.508150149784177e-06, + "loss": 0.1679, + "step": 16141 + }, + { + "epoch": 0.40848242528532025, + "grad_norm": 5.678110599517822, + "learning_rate": 6.507767323274593e-06, + "loss": 0.1913, + "step": 16142 + }, + { + "epoch": 0.4085077308500139, + "grad_norm": 9.350544929504395, + "learning_rate": 6.507384487041699e-06, + "loss": 0.197, + "step": 16143 + }, + { + "epoch": 0.4085330364147076, + "grad_norm": 3.8217525482177734, + "learning_rate": 6.507001641087962e-06, + "loss": 0.1084, + "step": 16144 + }, + { + "epoch": 0.4085583419794013, + "grad_norm": 5.428701400756836, + "learning_rate": 6.506618785415852e-06, + "loss": 0.2332, + "step": 16145 + }, + { + "epoch": 0.40858364754409493, + "grad_norm": 10.647323608398438, + "learning_rate": 6.506235920027837e-06, + "loss": 0.2174, + "step": 16146 + }, + { + "epoch": 0.40860895310878864, + "grad_norm": 4.561981678009033, + "learning_rate": 6.505853044926388e-06, + "loss": 0.1042, + "step": 16147 + }, + { + "epoch": 0.4086342586734823, + "grad_norm": 3.4986722469329834, + "learning_rate": 6.505470160113972e-06, + "loss": 0.1841, + "step": 16148 + }, + { + "epoch": 0.40865956423817595, + "grad_norm": 5.387007236480713, + "learning_rate": 6.50508726559306e-06, + "loss": 0.1946, + "step": 16149 + }, + { + "epoch": 0.40868486980286967, + "grad_norm": 4.246971130371094, + "learning_rate": 6.504704361366119e-06, + "loss": 0.1551, + "step": 16150 + }, + { + "epoch": 0.4087101753675633, + "grad_norm": 5.553134441375732, + "learning_rate": 6.50432144743562e-06, + "loss": 0.2331, + "step": 16151 + }, + { + "epoch": 0.408735480932257, + "grad_norm": 4.606706142425537, + "learning_rate": 6.50393852380403e-06, + "loss": 0.1791, + "step": 16152 + }, + { + "epoch": 0.4087607864969507, + "grad_norm": 9.005457878112793, + "learning_rate": 6.5035555904738225e-06, + "loss": 0.2726, + "step": 16153 + }, + { + "epoch": 0.40878609206164435, + "grad_norm": 4.095228672027588, + "learning_rate": 6.503172647447464e-06, + "loss": 0.2326, + "step": 16154 + }, + { + "epoch": 0.40881139762633806, + "grad_norm": 3.5507640838623047, + "learning_rate": 6.502789694727425e-06, + "loss": 0.1565, + "step": 16155 + }, + { + "epoch": 0.4088367031910317, + "grad_norm": 3.611074209213257, + "learning_rate": 6.502406732316175e-06, + "loss": 0.1042, + "step": 16156 + }, + { + "epoch": 0.40886200875572537, + "grad_norm": 2.723721981048584, + "learning_rate": 6.502023760216182e-06, + "loss": 0.1593, + "step": 16157 + }, + { + "epoch": 0.4088873143204191, + "grad_norm": 4.603994846343994, + "learning_rate": 6.501640778429918e-06, + "loss": 0.1985, + "step": 16158 + }, + { + "epoch": 0.40891261988511274, + "grad_norm": 3.9715757369995117, + "learning_rate": 6.501257786959852e-06, + "loss": 0.1457, + "step": 16159 + }, + { + "epoch": 0.4089379254498064, + "grad_norm": 11.478631973266602, + "learning_rate": 6.500874785808454e-06, + "loss": 0.4087, + "step": 16160 + }, + { + "epoch": 0.4089632310145001, + "grad_norm": 3.739258050918579, + "learning_rate": 6.5004917749781925e-06, + "loss": 0.1541, + "step": 16161 + }, + { + "epoch": 0.40898853657919376, + "grad_norm": 7.2433576583862305, + "learning_rate": 6.5001087544715416e-06, + "loss": 0.2322, + "step": 16162 + }, + { + "epoch": 0.4090138421438874, + "grad_norm": 8.105842590332031, + "learning_rate": 6.499725724290965e-06, + "loss": 0.1051, + "step": 16163 + }, + { + "epoch": 0.40903914770858113, + "grad_norm": 4.24422550201416, + "learning_rate": 6.499342684438938e-06, + "loss": 0.1738, + "step": 16164 + }, + { + "epoch": 0.4090644532732748, + "grad_norm": 5.683735370635986, + "learning_rate": 6.498959634917929e-06, + "loss": 0.1846, + "step": 16165 + }, + { + "epoch": 0.40908975883796844, + "grad_norm": 4.663596153259277, + "learning_rate": 6.498576575730407e-06, + "loss": 0.1445, + "step": 16166 + }, + { + "epoch": 0.40911506440266215, + "grad_norm": 4.128351211547852, + "learning_rate": 6.498193506878845e-06, + "loss": 0.1631, + "step": 16167 + }, + { + "epoch": 0.4091403699673558, + "grad_norm": 3.8577065467834473, + "learning_rate": 6.49781042836571e-06, + "loss": 0.1485, + "step": 16168 + }, + { + "epoch": 0.4091656755320495, + "grad_norm": 4.714451313018799, + "learning_rate": 6.497427340193475e-06, + "loss": 0.1836, + "step": 16169 + }, + { + "epoch": 0.4091909810967432, + "grad_norm": 3.8995227813720703, + "learning_rate": 6.49704424236461e-06, + "loss": 0.1401, + "step": 16170 + }, + { + "epoch": 0.40921628666143683, + "grad_norm": 6.228597164154053, + "learning_rate": 6.496661134881584e-06, + "loss": 0.1573, + "step": 16171 + }, + { + "epoch": 0.40924159222613055, + "grad_norm": 7.772462368011475, + "learning_rate": 6.496278017746871e-06, + "loss": 0.2585, + "step": 16172 + }, + { + "epoch": 0.4092668977908242, + "grad_norm": 2.768493890762329, + "learning_rate": 6.495894890962938e-06, + "loss": 0.1215, + "step": 16173 + }, + { + "epoch": 0.40929220335551786, + "grad_norm": 12.523523330688477, + "learning_rate": 6.495511754532258e-06, + "loss": 0.2384, + "step": 16174 + }, + { + "epoch": 0.40931750892021157, + "grad_norm": 5.1930437088012695, + "learning_rate": 6.4951286084573e-06, + "loss": 0.1571, + "step": 16175 + }, + { + "epoch": 0.4093428144849052, + "grad_norm": 3.319000005722046, + "learning_rate": 6.494745452740534e-06, + "loss": 0.1469, + "step": 16176 + }, + { + "epoch": 0.4093681200495989, + "grad_norm": 7.179980278015137, + "learning_rate": 6.4943622873844345e-06, + "loss": 0.1116, + "step": 16177 + }, + { + "epoch": 0.4093934256142926, + "grad_norm": 13.056537628173828, + "learning_rate": 6.493979112391471e-06, + "loss": 0.1914, + "step": 16178 + }, + { + "epoch": 0.40941873117898625, + "grad_norm": 3.7494213581085205, + "learning_rate": 6.493595927764113e-06, + "loss": 0.2004, + "step": 16179 + }, + { + "epoch": 0.40944403674367996, + "grad_norm": 5.914219379425049, + "learning_rate": 6.493212733504834e-06, + "loss": 0.284, + "step": 16180 + }, + { + "epoch": 0.4094693423083736, + "grad_norm": 5.300785541534424, + "learning_rate": 6.492829529616103e-06, + "loss": 0.1246, + "step": 16181 + }, + { + "epoch": 0.4094946478730673, + "grad_norm": 10.58362102508545, + "learning_rate": 6.492446316100392e-06, + "loss": 0.3024, + "step": 16182 + }, + { + "epoch": 0.409519953437761, + "grad_norm": 2.644174814224243, + "learning_rate": 6.4920630929601716e-06, + "loss": 0.1088, + "step": 16183 + }, + { + "epoch": 0.40954525900245464, + "grad_norm": 2.512528896331787, + "learning_rate": 6.4916798601979146e-06, + "loss": 0.1079, + "step": 16184 + }, + { + "epoch": 0.4095705645671483, + "grad_norm": 9.197282791137695, + "learning_rate": 6.491296617816092e-06, + "loss": 0.2576, + "step": 16185 + }, + { + "epoch": 0.409595870131842, + "grad_norm": 3.956658124923706, + "learning_rate": 6.490913365817175e-06, + "loss": 0.1809, + "step": 16186 + }, + { + "epoch": 0.40962117569653567, + "grad_norm": 8.18507194519043, + "learning_rate": 6.490530104203634e-06, + "loss": 0.2222, + "step": 16187 + }, + { + "epoch": 0.4096464812612293, + "grad_norm": 5.4819159507751465, + "learning_rate": 6.4901468329779414e-06, + "loss": 0.1101, + "step": 16188 + }, + { + "epoch": 0.40967178682592303, + "grad_norm": 4.638059616088867, + "learning_rate": 6.489763552142571e-06, + "loss": 0.1495, + "step": 16189 + }, + { + "epoch": 0.4096970923906167, + "grad_norm": 5.002542018890381, + "learning_rate": 6.489380261699991e-06, + "loss": 0.2122, + "step": 16190 + }, + { + "epoch": 0.40972239795531035, + "grad_norm": 3.870279312133789, + "learning_rate": 6.488996961652676e-06, + "loss": 0.1255, + "step": 16191 + }, + { + "epoch": 0.40974770352000406, + "grad_norm": 5.060985088348389, + "learning_rate": 6.488613652003095e-06, + "loss": 0.1066, + "step": 16192 + }, + { + "epoch": 0.4097730090846977, + "grad_norm": 6.510085105895996, + "learning_rate": 6.4882303327537224e-06, + "loss": 0.104, + "step": 16193 + }, + { + "epoch": 0.4097983146493914, + "grad_norm": 8.791022300720215, + "learning_rate": 6.487847003907029e-06, + "loss": 0.1456, + "step": 16194 + }, + { + "epoch": 0.4098236202140851, + "grad_norm": 3.589691162109375, + "learning_rate": 6.487463665465488e-06, + "loss": 0.1853, + "step": 16195 + }, + { + "epoch": 0.40984892577877874, + "grad_norm": 4.2590012550354, + "learning_rate": 6.487080317431569e-06, + "loss": 0.1189, + "step": 16196 + }, + { + "epoch": 0.40987423134347245, + "grad_norm": 10.25905704498291, + "learning_rate": 6.4866969598077465e-06, + "loss": 0.3295, + "step": 16197 + }, + { + "epoch": 0.4098995369081661, + "grad_norm": 6.01643705368042, + "learning_rate": 6.486313592596492e-06, + "loss": 0.1816, + "step": 16198 + }, + { + "epoch": 0.40992484247285976, + "grad_norm": 4.0145111083984375, + "learning_rate": 6.485930215800277e-06, + "loss": 0.1877, + "step": 16199 + }, + { + "epoch": 0.4099501480375535, + "grad_norm": 2.0366885662078857, + "learning_rate": 6.485546829421574e-06, + "loss": 0.0735, + "step": 16200 + }, + { + "epoch": 0.40997545360224713, + "grad_norm": 6.63487434387207, + "learning_rate": 6.485163433462858e-06, + "loss": 0.2549, + "step": 16201 + }, + { + "epoch": 0.4100007591669408, + "grad_norm": 15.71954345703125, + "learning_rate": 6.484780027926598e-06, + "loss": 0.274, + "step": 16202 + }, + { + "epoch": 0.4100260647316345, + "grad_norm": 3.495190382003784, + "learning_rate": 6.484396612815267e-06, + "loss": 0.1939, + "step": 16203 + }, + { + "epoch": 0.41005137029632815, + "grad_norm": 4.773420333862305, + "learning_rate": 6.484013188131339e-06, + "loss": 0.2117, + "step": 16204 + }, + { + "epoch": 0.4100766758610218, + "grad_norm": 6.332388401031494, + "learning_rate": 6.4836297538772855e-06, + "loss": 0.2789, + "step": 16205 + }, + { + "epoch": 0.4101019814257155, + "grad_norm": 7.715866565704346, + "learning_rate": 6.483246310055581e-06, + "loss": 0.1601, + "step": 16206 + }, + { + "epoch": 0.4101272869904092, + "grad_norm": 2.1380696296691895, + "learning_rate": 6.482862856668695e-06, + "loss": 0.1099, + "step": 16207 + }, + { + "epoch": 0.4101525925551029, + "grad_norm": 3.173830032348633, + "learning_rate": 6.482479393719104e-06, + "loss": 0.1416, + "step": 16208 + }, + { + "epoch": 0.41017789811979655, + "grad_norm": 10.220200538635254, + "learning_rate": 6.482095921209278e-06, + "loss": 0.1806, + "step": 16209 + }, + { + "epoch": 0.4102032036844902, + "grad_norm": 3.6122865676879883, + "learning_rate": 6.481712439141692e-06, + "loss": 0.1627, + "step": 16210 + }, + { + "epoch": 0.4102285092491839, + "grad_norm": 6.931642055511475, + "learning_rate": 6.481328947518818e-06, + "loss": 0.1858, + "step": 16211 + }, + { + "epoch": 0.41025381481387757, + "grad_norm": 5.080339431762695, + "learning_rate": 6.480945446343128e-06, + "loss": 0.1592, + "step": 16212 + }, + { + "epoch": 0.4102791203785712, + "grad_norm": 12.55587100982666, + "learning_rate": 6.480561935617098e-06, + "loss": 0.2339, + "step": 16213 + }, + { + "epoch": 0.41030442594326494, + "grad_norm": 4.043633937835693, + "learning_rate": 6.480178415343198e-06, + "loss": 0.1692, + "step": 16214 + }, + { + "epoch": 0.4103297315079586, + "grad_norm": 4.738168716430664, + "learning_rate": 6.4797948855239045e-06, + "loss": 0.1197, + "step": 16215 + }, + { + "epoch": 0.41035503707265225, + "grad_norm": 13.626299858093262, + "learning_rate": 6.479411346161687e-06, + "loss": 0.3707, + "step": 16216 + }, + { + "epoch": 0.41038034263734596, + "grad_norm": 5.564418792724609, + "learning_rate": 6.479027797259022e-06, + "loss": 0.2289, + "step": 16217 + }, + { + "epoch": 0.4104056482020396, + "grad_norm": 3.593752145767212, + "learning_rate": 6.478644238818381e-06, + "loss": 0.1311, + "step": 16218 + }, + { + "epoch": 0.41043095376673333, + "grad_norm": 10.71879768371582, + "learning_rate": 6.478260670842241e-06, + "loss": 0.1478, + "step": 16219 + }, + { + "epoch": 0.410456259331427, + "grad_norm": 9.107569694519043, + "learning_rate": 6.4778770933330695e-06, + "loss": 0.1894, + "step": 16220 + }, + { + "epoch": 0.41048156489612064, + "grad_norm": 10.803641319274902, + "learning_rate": 6.477493506293344e-06, + "loss": 0.3083, + "step": 16221 + }, + { + "epoch": 0.41050687046081435, + "grad_norm": 11.153191566467285, + "learning_rate": 6.477109909725539e-06, + "loss": 0.1907, + "step": 16222 + }, + { + "epoch": 0.410532176025508, + "grad_norm": 7.189937591552734, + "learning_rate": 6.476726303632127e-06, + "loss": 0.3484, + "step": 16223 + }, + { + "epoch": 0.41055748159020167, + "grad_norm": 6.322831153869629, + "learning_rate": 6.476342688015582e-06, + "loss": 0.1884, + "step": 16224 + }, + { + "epoch": 0.4105827871548954, + "grad_norm": 2.8733763694763184, + "learning_rate": 6.475959062878376e-06, + "loss": 0.1544, + "step": 16225 + }, + { + "epoch": 0.41060809271958904, + "grad_norm": 7.212851047515869, + "learning_rate": 6.475575428222986e-06, + "loss": 0.3335, + "step": 16226 + }, + { + "epoch": 0.4106333982842827, + "grad_norm": 9.103588104248047, + "learning_rate": 6.475191784051884e-06, + "loss": 0.4357, + "step": 16227 + }, + { + "epoch": 0.4106587038489764, + "grad_norm": 35.14611053466797, + "learning_rate": 6.474808130367546e-06, + "loss": 0.2254, + "step": 16228 + }, + { + "epoch": 0.41068400941367006, + "grad_norm": 7.203104496002197, + "learning_rate": 6.4744244671724425e-06, + "loss": 0.1833, + "step": 16229 + }, + { + "epoch": 0.4107093149783637, + "grad_norm": 10.461089134216309, + "learning_rate": 6.474040794469052e-06, + "loss": 0.3046, + "step": 16230 + }, + { + "epoch": 0.4107346205430574, + "grad_norm": 6.774494171142578, + "learning_rate": 6.473657112259846e-06, + "loss": 0.2719, + "step": 16231 + }, + { + "epoch": 0.4107599261077511, + "grad_norm": 4.508748531341553, + "learning_rate": 6.473273420547298e-06, + "loss": 0.1425, + "step": 16232 + }, + { + "epoch": 0.4107852316724448, + "grad_norm": 5.651979446411133, + "learning_rate": 6.472889719333884e-06, + "loss": 0.1458, + "step": 16233 + }, + { + "epoch": 0.41081053723713845, + "grad_norm": 6.288952827453613, + "learning_rate": 6.472506008622079e-06, + "loss": 0.1908, + "step": 16234 + }, + { + "epoch": 0.4108358428018321, + "grad_norm": 5.507772445678711, + "learning_rate": 6.472122288414357e-06, + "loss": 0.2076, + "step": 16235 + }, + { + "epoch": 0.4108611483665258, + "grad_norm": 2.532672166824341, + "learning_rate": 6.471738558713192e-06, + "loss": 0.1442, + "step": 16236 + }, + { + "epoch": 0.4108864539312195, + "grad_norm": 4.345170497894287, + "learning_rate": 6.471354819521058e-06, + "loss": 0.1655, + "step": 16237 + }, + { + "epoch": 0.41091175949591313, + "grad_norm": 5.661883354187012, + "learning_rate": 6.47097107084043e-06, + "loss": 0.2523, + "step": 16238 + }, + { + "epoch": 0.41093706506060684, + "grad_norm": 6.258575439453125, + "learning_rate": 6.470587312673785e-06, + "loss": 0.1892, + "step": 16239 + }, + { + "epoch": 0.4109623706253005, + "grad_norm": 3.331226110458374, + "learning_rate": 6.470203545023595e-06, + "loss": 0.111, + "step": 16240 + }, + { + "epoch": 0.41098767618999416, + "grad_norm": 9.6090726852417, + "learning_rate": 6.469819767892336e-06, + "loss": 0.245, + "step": 16241 + }, + { + "epoch": 0.41101298175468787, + "grad_norm": 8.144838333129883, + "learning_rate": 6.469435981282482e-06, + "loss": 0.2425, + "step": 16242 + }, + { + "epoch": 0.4110382873193815, + "grad_norm": 4.52287483215332, + "learning_rate": 6.469052185196509e-06, + "loss": 0.1708, + "step": 16243 + }, + { + "epoch": 0.41106359288407524, + "grad_norm": 8.290511131286621, + "learning_rate": 6.4686683796368936e-06, + "loss": 0.2538, + "step": 16244 + }, + { + "epoch": 0.4110888984487689, + "grad_norm": 6.320822715759277, + "learning_rate": 6.468284564606106e-06, + "loss": 0.1203, + "step": 16245 + }, + { + "epoch": 0.41111420401346255, + "grad_norm": 23.768491744995117, + "learning_rate": 6.467900740106626e-06, + "loss": 0.2628, + "step": 16246 + }, + { + "epoch": 0.41113950957815626, + "grad_norm": 5.139041900634766, + "learning_rate": 6.467516906140927e-06, + "loss": 0.1737, + "step": 16247 + }, + { + "epoch": 0.4111648151428499, + "grad_norm": 4.7045440673828125, + "learning_rate": 6.467133062711485e-06, + "loss": 0.2084, + "step": 16248 + }, + { + "epoch": 0.41119012070754357, + "grad_norm": 5.007162094116211, + "learning_rate": 6.466749209820773e-06, + "loss": 0.205, + "step": 16249 + }, + { + "epoch": 0.4112154262722373, + "grad_norm": 4.460322856903076, + "learning_rate": 6.466365347471271e-06, + "loss": 0.1378, + "step": 16250 + }, + { + "epoch": 0.41124073183693094, + "grad_norm": 2.4546217918395996, + "learning_rate": 6.46598147566545e-06, + "loss": 0.1241, + "step": 16251 + }, + { + "epoch": 0.4112660374016246, + "grad_norm": 7.550950050354004, + "learning_rate": 6.4655975944057855e-06, + "loss": 0.2017, + "step": 16252 + }, + { + "epoch": 0.4112913429663183, + "grad_norm": 4.411004543304443, + "learning_rate": 6.465213703694757e-06, + "loss": 0.1831, + "step": 16253 + }, + { + "epoch": 0.41131664853101196, + "grad_norm": 9.522101402282715, + "learning_rate": 6.464829803534837e-06, + "loss": 0.1088, + "step": 16254 + }, + { + "epoch": 0.4113419540957056, + "grad_norm": 4.407721996307373, + "learning_rate": 6.464445893928503e-06, + "loss": 0.1584, + "step": 16255 + }, + { + "epoch": 0.41136725966039933, + "grad_norm": 4.283201694488525, + "learning_rate": 6.464061974878228e-06, + "loss": 0.1215, + "step": 16256 + }, + { + "epoch": 0.411392565225093, + "grad_norm": 3.997004508972168, + "learning_rate": 6.4636780463864925e-06, + "loss": 0.178, + "step": 16257 + }, + { + "epoch": 0.4114178707897867, + "grad_norm": 3.8705503940582275, + "learning_rate": 6.463294108455768e-06, + "loss": 0.2241, + "step": 16258 + }, + { + "epoch": 0.41144317635448036, + "grad_norm": 6.31650972366333, + "learning_rate": 6.462910161088531e-06, + "loss": 0.1039, + "step": 16259 + }, + { + "epoch": 0.411468481919174, + "grad_norm": 14.608023643493652, + "learning_rate": 6.46252620428726e-06, + "loss": 0.2921, + "step": 16260 + }, + { + "epoch": 0.4114937874838677, + "grad_norm": 7.026795864105225, + "learning_rate": 6.462142238054428e-06, + "loss": 0.1686, + "step": 16261 + }, + { + "epoch": 0.4115190930485614, + "grad_norm": 3.4023566246032715, + "learning_rate": 6.461758262392514e-06, + "loss": 0.1629, + "step": 16262 + }, + { + "epoch": 0.41154439861325504, + "grad_norm": 3.0265448093414307, + "learning_rate": 6.461374277303994e-06, + "loss": 0.2099, + "step": 16263 + }, + { + "epoch": 0.41156970417794875, + "grad_norm": 4.199304103851318, + "learning_rate": 6.46099028279134e-06, + "loss": 0.0984, + "step": 16264 + }, + { + "epoch": 0.4115950097426424, + "grad_norm": 6.575396537780762, + "learning_rate": 6.460606278857034e-06, + "loss": 0.253, + "step": 16265 + }, + { + "epoch": 0.41162031530733606, + "grad_norm": 6.101557731628418, + "learning_rate": 6.4602222655035486e-06, + "loss": 0.2072, + "step": 16266 + }, + { + "epoch": 0.41164562087202977, + "grad_norm": 3.517760992050171, + "learning_rate": 6.459838242733362e-06, + "loss": 0.1406, + "step": 16267 + }, + { + "epoch": 0.41167092643672343, + "grad_norm": 4.615301132202148, + "learning_rate": 6.45945421054895e-06, + "loss": 0.1929, + "step": 16268 + }, + { + "epoch": 0.4116962320014171, + "grad_norm": 3.0718138217926025, + "learning_rate": 6.459070168952788e-06, + "loss": 0.1331, + "step": 16269 + }, + { + "epoch": 0.4117215375661108, + "grad_norm": 5.5854878425598145, + "learning_rate": 6.4586861179473556e-06, + "loss": 0.1931, + "step": 16270 + }, + { + "epoch": 0.41174684313080445, + "grad_norm": 2.7008039951324463, + "learning_rate": 6.458302057535127e-06, + "loss": 0.1752, + "step": 16271 + }, + { + "epoch": 0.41177214869549816, + "grad_norm": 4.101554870605469, + "learning_rate": 6.45791798771858e-06, + "loss": 0.1353, + "step": 16272 + }, + { + "epoch": 0.4117974542601918, + "grad_norm": 5.496594429016113, + "learning_rate": 6.457533908500191e-06, + "loss": 0.2141, + "step": 16273 + }, + { + "epoch": 0.4118227598248855, + "grad_norm": 4.7423505783081055, + "learning_rate": 6.4571498198824355e-06, + "loss": 0.1544, + "step": 16274 + }, + { + "epoch": 0.4118480653895792, + "grad_norm": 10.383878707885742, + "learning_rate": 6.456765721867795e-06, + "loss": 0.3231, + "step": 16275 + }, + { + "epoch": 0.41187337095427284, + "grad_norm": 4.4654340744018555, + "learning_rate": 6.45638161445874e-06, + "loss": 0.1745, + "step": 16276 + }, + { + "epoch": 0.4118986765189665, + "grad_norm": 29.739028930664062, + "learning_rate": 6.4559974976577535e-06, + "loss": 0.227, + "step": 16277 + }, + { + "epoch": 0.4119239820836602, + "grad_norm": 5.575271129608154, + "learning_rate": 6.455613371467309e-06, + "loss": 0.2341, + "step": 16278 + }, + { + "epoch": 0.41194928764835387, + "grad_norm": 3.7367918491363525, + "learning_rate": 6.455229235889885e-06, + "loss": 0.1532, + "step": 16279 + }, + { + "epoch": 0.4119745932130475, + "grad_norm": 3.507344961166382, + "learning_rate": 6.454845090927957e-06, + "loss": 0.0996, + "step": 16280 + }, + { + "epoch": 0.41199989877774124, + "grad_norm": 7.23200798034668, + "learning_rate": 6.454460936584005e-06, + "loss": 0.1649, + "step": 16281 + }, + { + "epoch": 0.4120252043424349, + "grad_norm": 4.234334945678711, + "learning_rate": 6.454076772860505e-06, + "loss": 0.1519, + "step": 16282 + }, + { + "epoch": 0.4120505099071286, + "grad_norm": 9.157177925109863, + "learning_rate": 6.453692599759933e-06, + "loss": 0.2772, + "step": 16283 + }, + { + "epoch": 0.41207581547182226, + "grad_norm": 8.722951889038086, + "learning_rate": 6.453308417284769e-06, + "loss": 0.2026, + "step": 16284 + }, + { + "epoch": 0.4121011210365159, + "grad_norm": 4.47851037979126, + "learning_rate": 6.452924225437488e-06, + "loss": 0.2095, + "step": 16285 + }, + { + "epoch": 0.41212642660120963, + "grad_norm": 3.77720308303833, + "learning_rate": 6.452540024220571e-06, + "loss": 0.1164, + "step": 16286 + }, + { + "epoch": 0.4121517321659033, + "grad_norm": 5.443455219268799, + "learning_rate": 6.452155813636494e-06, + "loss": 0.133, + "step": 16287 + }, + { + "epoch": 0.41217703773059694, + "grad_norm": 8.88525676727295, + "learning_rate": 6.451771593687731e-06, + "loss": 0.2859, + "step": 16288 + }, + { + "epoch": 0.41220234329529065, + "grad_norm": 4.199431419372559, + "learning_rate": 6.4513873643767654e-06, + "loss": 0.1664, + "step": 16289 + }, + { + "epoch": 0.4122276488599843, + "grad_norm": 4.158698081970215, + "learning_rate": 6.451003125706073e-06, + "loss": 0.2238, + "step": 16290 + }, + { + "epoch": 0.41225295442467796, + "grad_norm": 4.608798027038574, + "learning_rate": 6.45061887767813e-06, + "loss": 0.1635, + "step": 16291 + }, + { + "epoch": 0.4122782599893717, + "grad_norm": 3.963991403579712, + "learning_rate": 6.450234620295417e-06, + "loss": 0.2134, + "step": 16292 + }, + { + "epoch": 0.41230356555406533, + "grad_norm": 3.914518356323242, + "learning_rate": 6.449850353560409e-06, + "loss": 0.2111, + "step": 16293 + }, + { + "epoch": 0.412328871118759, + "grad_norm": 13.601329803466797, + "learning_rate": 6.4494660774755865e-06, + "loss": 0.235, + "step": 16294 + }, + { + "epoch": 0.4123541766834527, + "grad_norm": 7.35319185256958, + "learning_rate": 6.449081792043428e-06, + "loss": 0.1276, + "step": 16295 + }, + { + "epoch": 0.41237948224814636, + "grad_norm": 5.532989501953125, + "learning_rate": 6.448697497266408e-06, + "loss": 0.197, + "step": 16296 + }, + { + "epoch": 0.41240478781284007, + "grad_norm": 3.463047981262207, + "learning_rate": 6.44831319314701e-06, + "loss": 0.1388, + "step": 16297 + }, + { + "epoch": 0.4124300933775337, + "grad_norm": 3.83392596244812, + "learning_rate": 6.4479288796877086e-06, + "loss": 0.1555, + "step": 16298 + }, + { + "epoch": 0.4124553989422274, + "grad_norm": 3.3884499073028564, + "learning_rate": 6.447544556890984e-06, + "loss": 0.1128, + "step": 16299 + }, + { + "epoch": 0.4124807045069211, + "grad_norm": 11.0790433883667, + "learning_rate": 6.447160224759312e-06, + "loss": 0.1944, + "step": 16300 + }, + { + "epoch": 0.41250601007161475, + "grad_norm": 6.253647804260254, + "learning_rate": 6.446775883295175e-06, + "loss": 0.212, + "step": 16301 + }, + { + "epoch": 0.4125313156363084, + "grad_norm": 2.7265520095825195, + "learning_rate": 6.446391532501047e-06, + "loss": 0.1218, + "step": 16302 + }, + { + "epoch": 0.4125566212010021, + "grad_norm": 3.872788429260254, + "learning_rate": 6.4460071723794114e-06, + "loss": 0.166, + "step": 16303 + }, + { + "epoch": 0.4125819267656958, + "grad_norm": 7.5300397872924805, + "learning_rate": 6.445622802932743e-06, + "loss": 0.1974, + "step": 16304 + }, + { + "epoch": 0.41260723233038943, + "grad_norm": 4.1559953689575195, + "learning_rate": 6.4452384241635224e-06, + "loss": 0.2247, + "step": 16305 + }, + { + "epoch": 0.41263253789508314, + "grad_norm": 6.139566898345947, + "learning_rate": 6.4448540360742285e-06, + "loss": 0.2146, + "step": 16306 + }, + { + "epoch": 0.4126578434597768, + "grad_norm": 3.3527603149414062, + "learning_rate": 6.444469638667339e-06, + "loss": 0.1767, + "step": 16307 + }, + { + "epoch": 0.4126831490244705, + "grad_norm": 12.650555610656738, + "learning_rate": 6.444085231945334e-06, + "loss": 0.2377, + "step": 16308 + }, + { + "epoch": 0.41270845458916416, + "grad_norm": 3.1498985290527344, + "learning_rate": 6.443700815910692e-06, + "loss": 0.1391, + "step": 16309 + }, + { + "epoch": 0.4127337601538578, + "grad_norm": 9.926396369934082, + "learning_rate": 6.443316390565891e-06, + "loss": 0.2331, + "step": 16310 + }, + { + "epoch": 0.41275906571855153, + "grad_norm": 3.8998172283172607, + "learning_rate": 6.442931955913412e-06, + "loss": 0.1427, + "step": 16311 + }, + { + "epoch": 0.4127843712832452, + "grad_norm": 3.672290563583374, + "learning_rate": 6.442547511955733e-06, + "loss": 0.1603, + "step": 16312 + }, + { + "epoch": 0.41280967684793884, + "grad_norm": 7.250021457672119, + "learning_rate": 6.442163058695334e-06, + "loss": 0.1801, + "step": 16313 + }, + { + "epoch": 0.41283498241263256, + "grad_norm": 3.691516399383545, + "learning_rate": 6.441778596134693e-06, + "loss": 0.1778, + "step": 16314 + }, + { + "epoch": 0.4128602879773262, + "grad_norm": 6.967352390289307, + "learning_rate": 6.4413941242762904e-06, + "loss": 0.1377, + "step": 16315 + }, + { + "epoch": 0.41288559354201987, + "grad_norm": 10.870962142944336, + "learning_rate": 6.441009643122606e-06, + "loss": 0.2749, + "step": 16316 + }, + { + "epoch": 0.4129108991067136, + "grad_norm": 10.69870662689209, + "learning_rate": 6.4406251526761165e-06, + "loss": 0.0966, + "step": 16317 + }, + { + "epoch": 0.41293620467140724, + "grad_norm": 2.1597933769226074, + "learning_rate": 6.440240652939305e-06, + "loss": 0.084, + "step": 16318 + }, + { + "epoch": 0.4129615102361009, + "grad_norm": 4.923957347869873, + "learning_rate": 6.439856143914649e-06, + "loss": 0.1567, + "step": 16319 + }, + { + "epoch": 0.4129868158007946, + "grad_norm": 4.409444332122803, + "learning_rate": 6.439471625604629e-06, + "loss": 0.2096, + "step": 16320 + }, + { + "epoch": 0.41301212136548826, + "grad_norm": 8.236361503601074, + "learning_rate": 6.439087098011724e-06, + "loss": 0.292, + "step": 16321 + }, + { + "epoch": 0.413037426930182, + "grad_norm": 5.327805519104004, + "learning_rate": 6.438702561138412e-06, + "loss": 0.1644, + "step": 16322 + }, + { + "epoch": 0.41306273249487563, + "grad_norm": 3.0669620037078857, + "learning_rate": 6.438318014987177e-06, + "loss": 0.0563, + "step": 16323 + }, + { + "epoch": 0.4130880380595693, + "grad_norm": 6.9688401222229, + "learning_rate": 6.437933459560497e-06, + "loss": 0.1601, + "step": 16324 + }, + { + "epoch": 0.413113343624263, + "grad_norm": 12.611193656921387, + "learning_rate": 6.43754889486085e-06, + "loss": 0.145, + "step": 16325 + }, + { + "epoch": 0.41313864918895665, + "grad_norm": 3.158541202545166, + "learning_rate": 6.437164320890719e-06, + "loss": 0.1544, + "step": 16326 + }, + { + "epoch": 0.4131639547536503, + "grad_norm": 4.208701133728027, + "learning_rate": 6.436779737652581e-06, + "loss": 0.1699, + "step": 16327 + }, + { + "epoch": 0.413189260318344, + "grad_norm": 4.247009754180908, + "learning_rate": 6.436395145148919e-06, + "loss": 0.1534, + "step": 16328 + }, + { + "epoch": 0.4132145658830377, + "grad_norm": 4.265746116638184, + "learning_rate": 6.4360105433822114e-06, + "loss": 0.2142, + "step": 16329 + }, + { + "epoch": 0.41323987144773133, + "grad_norm": 3.1922738552093506, + "learning_rate": 6.435625932354939e-06, + "loss": 0.1207, + "step": 16330 + }, + { + "epoch": 0.41326517701242504, + "grad_norm": 4.457470417022705, + "learning_rate": 6.435241312069583e-06, + "loss": 0.1663, + "step": 16331 + }, + { + "epoch": 0.4132904825771187, + "grad_norm": 5.9825520515441895, + "learning_rate": 6.434856682528621e-06, + "loss": 0.1999, + "step": 16332 + }, + { + "epoch": 0.41331578814181236, + "grad_norm": 5.242756366729736, + "learning_rate": 6.434472043734536e-06, + "loss": 0.2147, + "step": 16333 + }, + { + "epoch": 0.41334109370650607, + "grad_norm": 13.33189582824707, + "learning_rate": 6.434087395689808e-06, + "loss": 0.1846, + "step": 16334 + }, + { + "epoch": 0.4133663992711997, + "grad_norm": 8.238950729370117, + "learning_rate": 6.4337027383969166e-06, + "loss": 0.3046, + "step": 16335 + }, + { + "epoch": 0.41339170483589344, + "grad_norm": 4.897183418273926, + "learning_rate": 6.433318071858343e-06, + "loss": 0.1435, + "step": 16336 + }, + { + "epoch": 0.4134170104005871, + "grad_norm": 3.727085828781128, + "learning_rate": 6.432933396076569e-06, + "loss": 0.1896, + "step": 16337 + }, + { + "epoch": 0.41344231596528075, + "grad_norm": 11.445954322814941, + "learning_rate": 6.43254871105407e-06, + "loss": 0.257, + "step": 16338 + }, + { + "epoch": 0.41346762152997446, + "grad_norm": 6.601124286651611, + "learning_rate": 6.432164016793335e-06, + "loss": 0.1996, + "step": 16339 + }, + { + "epoch": 0.4134929270946681, + "grad_norm": 4.507190704345703, + "learning_rate": 6.43177931329684e-06, + "loss": 0.1349, + "step": 16340 + }, + { + "epoch": 0.4135182326593618, + "grad_norm": 4.322299480438232, + "learning_rate": 6.4313946005670655e-06, + "loss": 0.1685, + "step": 16341 + }, + { + "epoch": 0.4135435382240555, + "grad_norm": 4.55602502822876, + "learning_rate": 6.431009878606494e-06, + "loss": 0.2089, + "step": 16342 + }, + { + "epoch": 0.41356884378874914, + "grad_norm": 6.728602886199951, + "learning_rate": 6.4306251474176065e-06, + "loss": 0.223, + "step": 16343 + }, + { + "epoch": 0.4135941493534428, + "grad_norm": 5.211871147155762, + "learning_rate": 6.430240407002882e-06, + "loss": 0.2152, + "step": 16344 + }, + { + "epoch": 0.4136194549181365, + "grad_norm": 9.511690139770508, + "learning_rate": 6.429855657364805e-06, + "loss": 0.334, + "step": 16345 + }, + { + "epoch": 0.41364476048283016, + "grad_norm": 4.649808406829834, + "learning_rate": 6.429470898505854e-06, + "loss": 0.2057, + "step": 16346 + }, + { + "epoch": 0.4136700660475239, + "grad_norm": 7.460762977600098, + "learning_rate": 6.429086130428511e-06, + "loss": 0.2102, + "step": 16347 + }, + { + "epoch": 0.41369537161221753, + "grad_norm": 4.888692855834961, + "learning_rate": 6.4287013531352586e-06, + "loss": 0.182, + "step": 16348 + }, + { + "epoch": 0.4137206771769112, + "grad_norm": 5.310260772705078, + "learning_rate": 6.428316566628575e-06, + "loss": 0.1984, + "step": 16349 + }, + { + "epoch": 0.4137459827416049, + "grad_norm": 3.654008150100708, + "learning_rate": 6.427931770910945e-06, + "loss": 0.1764, + "step": 16350 + }, + { + "epoch": 0.41377128830629856, + "grad_norm": 7.013370513916016, + "learning_rate": 6.427546965984848e-06, + "loss": 0.2136, + "step": 16351 + }, + { + "epoch": 0.4137965938709922, + "grad_norm": 3.598487615585327, + "learning_rate": 6.427162151852766e-06, + "loss": 0.1558, + "step": 16352 + }, + { + "epoch": 0.4138218994356859, + "grad_norm": 5.741316795349121, + "learning_rate": 6.42677732851718e-06, + "loss": 0.1713, + "step": 16353 + }, + { + "epoch": 0.4138472050003796, + "grad_norm": 4.551168918609619, + "learning_rate": 6.426392495980573e-06, + "loss": 0.198, + "step": 16354 + }, + { + "epoch": 0.41387251056507324, + "grad_norm": 3.786712408065796, + "learning_rate": 6.426007654245428e-06, + "loss": 0.1963, + "step": 16355 + }, + { + "epoch": 0.41389781612976695, + "grad_norm": 3.764242172241211, + "learning_rate": 6.4256228033142236e-06, + "loss": 0.1454, + "step": 16356 + }, + { + "epoch": 0.4139231216944606, + "grad_norm": 3.8651721477508545, + "learning_rate": 6.4252379431894415e-06, + "loss": 0.1944, + "step": 16357 + }, + { + "epoch": 0.41394842725915426, + "grad_norm": 3.2416234016418457, + "learning_rate": 6.424853073873565e-06, + "loss": 0.1527, + "step": 16358 + }, + { + "epoch": 0.413973732823848, + "grad_norm": 4.829578399658203, + "learning_rate": 6.424468195369078e-06, + "loss": 0.2096, + "step": 16359 + }, + { + "epoch": 0.41399903838854163, + "grad_norm": 8.013213157653809, + "learning_rate": 6.4240833076784596e-06, + "loss": 0.2205, + "step": 16360 + }, + { + "epoch": 0.41402434395323534, + "grad_norm": 3.897385358810425, + "learning_rate": 6.423698410804193e-06, + "loss": 0.1549, + "step": 16361 + }, + { + "epoch": 0.414049649517929, + "grad_norm": 7.735998153686523, + "learning_rate": 6.423313504748761e-06, + "loss": 0.1848, + "step": 16362 + }, + { + "epoch": 0.41407495508262265, + "grad_norm": 2.538228988647461, + "learning_rate": 6.422928589514644e-06, + "loss": 0.1072, + "step": 16363 + }, + { + "epoch": 0.41410026064731636, + "grad_norm": 11.06942081451416, + "learning_rate": 6.422543665104325e-06, + "loss": 0.1375, + "step": 16364 + }, + { + "epoch": 0.41412556621201, + "grad_norm": 6.37086296081543, + "learning_rate": 6.422158731520287e-06, + "loss": 0.2138, + "step": 16365 + }, + { + "epoch": 0.4141508717767037, + "grad_norm": 6.36887264251709, + "learning_rate": 6.421773788765011e-06, + "loss": 0.2061, + "step": 16366 + }, + { + "epoch": 0.4141761773413974, + "grad_norm": 5.007132053375244, + "learning_rate": 6.421388836840982e-06, + "loss": 0.1432, + "step": 16367 + }, + { + "epoch": 0.41420148290609105, + "grad_norm": 5.490185260772705, + "learning_rate": 6.42100387575068e-06, + "loss": 0.1739, + "step": 16368 + }, + { + "epoch": 0.4142267884707847, + "grad_norm": 4.258702754974365, + "learning_rate": 6.420618905496587e-06, + "loss": 0.1261, + "step": 16369 + }, + { + "epoch": 0.4142520940354784, + "grad_norm": 13.025593757629395, + "learning_rate": 6.4202339260811885e-06, + "loss": 0.3116, + "step": 16370 + }, + { + "epoch": 0.41427739960017207, + "grad_norm": 8.353126525878906, + "learning_rate": 6.419848937506965e-06, + "loss": 0.2204, + "step": 16371 + }, + { + "epoch": 0.4143027051648658, + "grad_norm": 6.059648036956787, + "learning_rate": 6.4194639397764e-06, + "loss": 0.1799, + "step": 16372 + }, + { + "epoch": 0.41432801072955944, + "grad_norm": 4.159360885620117, + "learning_rate": 6.419078932891977e-06, + "loss": 0.1978, + "step": 16373 + }, + { + "epoch": 0.4143533162942531, + "grad_norm": 4.878326892852783, + "learning_rate": 6.418693916856177e-06, + "loss": 0.2683, + "step": 16374 + }, + { + "epoch": 0.4143786218589468, + "grad_norm": 6.8941473960876465, + "learning_rate": 6.418308891671484e-06, + "loss": 0.2522, + "step": 16375 + }, + { + "epoch": 0.41440392742364046, + "grad_norm": 5.970033645629883, + "learning_rate": 6.41792385734038e-06, + "loss": 0.1409, + "step": 16376 + }, + { + "epoch": 0.4144292329883341, + "grad_norm": 8.5984525680542, + "learning_rate": 6.41753881386535e-06, + "loss": 0.27, + "step": 16377 + }, + { + "epoch": 0.41445453855302783, + "grad_norm": 2.8652007579803467, + "learning_rate": 6.417153761248876e-06, + "loss": 0.1185, + "step": 16378 + }, + { + "epoch": 0.4144798441177215, + "grad_norm": 8.460771560668945, + "learning_rate": 6.416768699493441e-06, + "loss": 0.1113, + "step": 16379 + }, + { + "epoch": 0.41450514968241514, + "grad_norm": 3.8963544368743896, + "learning_rate": 6.416383628601527e-06, + "loss": 0.1082, + "step": 16380 + }, + { + "epoch": 0.41453045524710885, + "grad_norm": 3.4815096855163574, + "learning_rate": 6.415998548575619e-06, + "loss": 0.1511, + "step": 16381 + }, + { + "epoch": 0.4145557608118025, + "grad_norm": 4.245198726654053, + "learning_rate": 6.4156134594181995e-06, + "loss": 0.1755, + "step": 16382 + }, + { + "epoch": 0.41458106637649617, + "grad_norm": 4.039816856384277, + "learning_rate": 6.415228361131753e-06, + "loss": 0.1509, + "step": 16383 + }, + { + "epoch": 0.4146063719411899, + "grad_norm": 4.006134033203125, + "learning_rate": 6.414843253718761e-06, + "loss": 0.1705, + "step": 16384 + }, + { + "epoch": 0.41463167750588353, + "grad_norm": 14.089180946350098, + "learning_rate": 6.414458137181709e-06, + "loss": 0.1784, + "step": 16385 + }, + { + "epoch": 0.41465698307057725, + "grad_norm": 5.110132694244385, + "learning_rate": 6.414073011523079e-06, + "loss": 0.1459, + "step": 16386 + }, + { + "epoch": 0.4146822886352709, + "grad_norm": 4.818756580352783, + "learning_rate": 6.413687876745356e-06, + "loss": 0.1852, + "step": 16387 + }, + { + "epoch": 0.41470759419996456, + "grad_norm": 6.3820414543151855, + "learning_rate": 6.413302732851021e-06, + "loss": 0.1456, + "step": 16388 + }, + { + "epoch": 0.41473289976465827, + "grad_norm": 5.565542697906494, + "learning_rate": 6.412917579842561e-06, + "loss": 0.2166, + "step": 16389 + }, + { + "epoch": 0.4147582053293519, + "grad_norm": 8.286502838134766, + "learning_rate": 6.412532417722458e-06, + "loss": 0.1331, + "step": 16390 + }, + { + "epoch": 0.4147835108940456, + "grad_norm": 17.792327880859375, + "learning_rate": 6.412147246493196e-06, + "loss": 0.2594, + "step": 16391 + }, + { + "epoch": 0.4148088164587393, + "grad_norm": 5.383924961090088, + "learning_rate": 6.411762066157259e-06, + "loss": 0.1462, + "step": 16392 + }, + { + "epoch": 0.41483412202343295, + "grad_norm": 10.51274299621582, + "learning_rate": 6.41137687671713e-06, + "loss": 0.262, + "step": 16393 + }, + { + "epoch": 0.4148594275881266, + "grad_norm": 3.359192132949829, + "learning_rate": 6.410991678175297e-06, + "loss": 0.1297, + "step": 16394 + }, + { + "epoch": 0.4148847331528203, + "grad_norm": 11.491480827331543, + "learning_rate": 6.410606470534239e-06, + "loss": 0.2518, + "step": 16395 + }, + { + "epoch": 0.414910038717514, + "grad_norm": 4.754621505737305, + "learning_rate": 6.410221253796444e-06, + "loss": 0.2126, + "step": 16396 + }, + { + "epoch": 0.41493534428220763, + "grad_norm": 6.403365612030029, + "learning_rate": 6.409836027964392e-06, + "loss": 0.2501, + "step": 16397 + }, + { + "epoch": 0.41496064984690134, + "grad_norm": 11.706393241882324, + "learning_rate": 6.409450793040571e-06, + "loss": 0.2268, + "step": 16398 + }, + { + "epoch": 0.414985955411595, + "grad_norm": 9.308382034301758, + "learning_rate": 6.409065549027465e-06, + "loss": 0.1193, + "step": 16399 + }, + { + "epoch": 0.4150112609762887, + "grad_norm": 6.702581405639648, + "learning_rate": 6.4086802959275565e-06, + "loss": 0.2265, + "step": 16400 + }, + { + "epoch": 0.41503656654098237, + "grad_norm": 7.654646396636963, + "learning_rate": 6.408295033743329e-06, + "loss": 0.2026, + "step": 16401 + }, + { + "epoch": 0.415061872105676, + "grad_norm": 4.400134086608887, + "learning_rate": 6.40790976247727e-06, + "loss": 0.1432, + "step": 16402 + }, + { + "epoch": 0.41508717767036973, + "grad_norm": 3.2322192192077637, + "learning_rate": 6.407524482131864e-06, + "loss": 0.1541, + "step": 16403 + }, + { + "epoch": 0.4151124832350634, + "grad_norm": 3.252068042755127, + "learning_rate": 6.407139192709595e-06, + "loss": 0.1688, + "step": 16404 + }, + { + "epoch": 0.41513778879975705, + "grad_norm": 3.4212734699249268, + "learning_rate": 6.406753894212945e-06, + "loss": 0.1509, + "step": 16405 + }, + { + "epoch": 0.41516309436445076, + "grad_norm": 4.497738838195801, + "learning_rate": 6.406368586644403e-06, + "loss": 0.1618, + "step": 16406 + }, + { + "epoch": 0.4151883999291444, + "grad_norm": 3.0007123947143555, + "learning_rate": 6.405983270006448e-06, + "loss": 0.1398, + "step": 16407 + }, + { + "epoch": 0.41521370549383807, + "grad_norm": 10.52847671508789, + "learning_rate": 6.405597944301571e-06, + "loss": 0.1579, + "step": 16408 + }, + { + "epoch": 0.4152390110585318, + "grad_norm": 3.5293710231781006, + "learning_rate": 6.405212609532256e-06, + "loss": 0.108, + "step": 16409 + }, + { + "epoch": 0.41526431662322544, + "grad_norm": 2.7280051708221436, + "learning_rate": 6.4048272657009835e-06, + "loss": 0.0884, + "step": 16410 + }, + { + "epoch": 0.41528962218791915, + "grad_norm": 14.837200164794922, + "learning_rate": 6.404441912810242e-06, + "loss": 0.1691, + "step": 16411 + }, + { + "epoch": 0.4153149277526128, + "grad_norm": 3.4611799716949463, + "learning_rate": 6.404056550862515e-06, + "loss": 0.1896, + "step": 16412 + }, + { + "epoch": 0.41534023331730646, + "grad_norm": 4.957793235778809, + "learning_rate": 6.403671179860289e-06, + "loss": 0.1815, + "step": 16413 + }, + { + "epoch": 0.4153655388820002, + "grad_norm": 5.650022506713867, + "learning_rate": 6.403285799806049e-06, + "loss": 0.2666, + "step": 16414 + }, + { + "epoch": 0.41539084444669383, + "grad_norm": 4.220700740814209, + "learning_rate": 6.402900410702281e-06, + "loss": 0.2086, + "step": 16415 + }, + { + "epoch": 0.4154161500113875, + "grad_norm": 9.383481979370117, + "learning_rate": 6.4025150125514675e-06, + "loss": 0.224, + "step": 16416 + }, + { + "epoch": 0.4154414555760812, + "grad_norm": 9.8196439743042, + "learning_rate": 6.402129605356096e-06, + "loss": 0.2067, + "step": 16417 + }, + { + "epoch": 0.41546676114077485, + "grad_norm": 35.67236328125, + "learning_rate": 6.4017441891186525e-06, + "loss": 0.264, + "step": 16418 + }, + { + "epoch": 0.4154920667054685, + "grad_norm": 23.211299896240234, + "learning_rate": 6.401358763841619e-06, + "loss": 0.2114, + "step": 16419 + }, + { + "epoch": 0.4155173722701622, + "grad_norm": 4.33696985244751, + "learning_rate": 6.400973329527487e-06, + "loss": 0.188, + "step": 16420 + }, + { + "epoch": 0.4155426778348559, + "grad_norm": 7.262385368347168, + "learning_rate": 6.400587886178736e-06, + "loss": 0.1454, + "step": 16421 + }, + { + "epoch": 0.41556798339954953, + "grad_norm": 8.75245475769043, + "learning_rate": 6.400202433797854e-06, + "loss": 0.2994, + "step": 16422 + }, + { + "epoch": 0.41559328896424325, + "grad_norm": 3.8560147285461426, + "learning_rate": 6.399816972387329e-06, + "loss": 0.1305, + "step": 16423 + }, + { + "epoch": 0.4156185945289369, + "grad_norm": 3.333146333694458, + "learning_rate": 6.399431501949643e-06, + "loss": 0.1819, + "step": 16424 + }, + { + "epoch": 0.4156439000936306, + "grad_norm": 9.034219741821289, + "learning_rate": 6.399046022487285e-06, + "loss": 0.3691, + "step": 16425 + }, + { + "epoch": 0.41566920565832427, + "grad_norm": 2.8069562911987305, + "learning_rate": 6.398660534002739e-06, + "loss": 0.1665, + "step": 16426 + }, + { + "epoch": 0.4156945112230179, + "grad_norm": 6.992701053619385, + "learning_rate": 6.3982750364984915e-06, + "loss": 0.1818, + "step": 16427 + }, + { + "epoch": 0.41571981678771164, + "grad_norm": 12.409369468688965, + "learning_rate": 6.397889529977027e-06, + "loss": 0.3502, + "step": 16428 + }, + { + "epoch": 0.4157451223524053, + "grad_norm": 5.509536266326904, + "learning_rate": 6.397504014440834e-06, + "loss": 0.1604, + "step": 16429 + }, + { + "epoch": 0.41577042791709895, + "grad_norm": 2.7741334438323975, + "learning_rate": 6.397118489892398e-06, + "loss": 0.1473, + "step": 16430 + }, + { + "epoch": 0.41579573348179266, + "grad_norm": 5.252544403076172, + "learning_rate": 6.396732956334204e-06, + "loss": 0.1843, + "step": 16431 + }, + { + "epoch": 0.4158210390464863, + "grad_norm": 3.288712739944458, + "learning_rate": 6.396347413768738e-06, + "loss": 0.1363, + "step": 16432 + }, + { + "epoch": 0.41584634461118, + "grad_norm": 19.910659790039062, + "learning_rate": 6.39596186219849e-06, + "loss": 0.3345, + "step": 16433 + }, + { + "epoch": 0.4158716501758737, + "grad_norm": 4.502277851104736, + "learning_rate": 6.395576301625941e-06, + "loss": 0.1474, + "step": 16434 + }, + { + "epoch": 0.41589695574056734, + "grad_norm": 9.212013244628906, + "learning_rate": 6.3951907320535825e-06, + "loss": 0.1257, + "step": 16435 + }, + { + "epoch": 0.41592226130526105, + "grad_norm": 6.360762119293213, + "learning_rate": 6.394805153483897e-06, + "loss": 0.1222, + "step": 16436 + }, + { + "epoch": 0.4159475668699547, + "grad_norm": 4.666161060333252, + "learning_rate": 6.394419565919372e-06, + "loss": 0.2389, + "step": 16437 + }, + { + "epoch": 0.41597287243464837, + "grad_norm": 25.407018661499023, + "learning_rate": 6.394033969362496e-06, + "loss": 0.3161, + "step": 16438 + }, + { + "epoch": 0.4159981779993421, + "grad_norm": 10.307953834533691, + "learning_rate": 6.393648363815753e-06, + "loss": 0.1955, + "step": 16439 + }, + { + "epoch": 0.41602348356403573, + "grad_norm": 5.283689498901367, + "learning_rate": 6.393262749281632e-06, + "loss": 0.1394, + "step": 16440 + }, + { + "epoch": 0.4160487891287294, + "grad_norm": 6.749607563018799, + "learning_rate": 6.392877125762618e-06, + "loss": 0.1564, + "step": 16441 + }, + { + "epoch": 0.4160740946934231, + "grad_norm": 5.6400465965271, + "learning_rate": 6.392491493261199e-06, + "loss": 0.2076, + "step": 16442 + }, + { + "epoch": 0.41609940025811676, + "grad_norm": 2.055654525756836, + "learning_rate": 6.392105851779861e-06, + "loss": 0.1173, + "step": 16443 + }, + { + "epoch": 0.4161247058228104, + "grad_norm": 7.1274333000183105, + "learning_rate": 6.39172020132109e-06, + "loss": 0.2196, + "step": 16444 + }, + { + "epoch": 0.4161500113875041, + "grad_norm": 3.06510329246521, + "learning_rate": 6.391334541887376e-06, + "loss": 0.1847, + "step": 16445 + }, + { + "epoch": 0.4161753169521978, + "grad_norm": 4.077672004699707, + "learning_rate": 6.390948873481204e-06, + "loss": 0.1157, + "step": 16446 + }, + { + "epoch": 0.41620062251689144, + "grad_norm": 3.9320075511932373, + "learning_rate": 6.390563196105062e-06, + "loss": 0.2021, + "step": 16447 + }, + { + "epoch": 0.41622592808158515, + "grad_norm": 3.325004816055298, + "learning_rate": 6.390177509761435e-06, + "loss": 0.1298, + "step": 16448 + }, + { + "epoch": 0.4162512336462788, + "grad_norm": 3.4638898372650146, + "learning_rate": 6.389791814452814e-06, + "loss": 0.1592, + "step": 16449 + }, + { + "epoch": 0.4162765392109725, + "grad_norm": 5.733802318572998, + "learning_rate": 6.3894061101816815e-06, + "loss": 0.2102, + "step": 16450 + }, + { + "epoch": 0.4163018447756662, + "grad_norm": 7.156231880187988, + "learning_rate": 6.38902039695053e-06, + "loss": 0.1887, + "step": 16451 + }, + { + "epoch": 0.41632715034035983, + "grad_norm": 7.580562114715576, + "learning_rate": 6.388634674761843e-06, + "loss": 0.1651, + "step": 16452 + }, + { + "epoch": 0.41635245590505354, + "grad_norm": 3.1636464595794678, + "learning_rate": 6.388248943618111e-06, + "loss": 0.1797, + "step": 16453 + }, + { + "epoch": 0.4163777614697472, + "grad_norm": 4.418694019317627, + "learning_rate": 6.387863203521817e-06, + "loss": 0.1208, + "step": 16454 + }, + { + "epoch": 0.41640306703444085, + "grad_norm": 4.313361167907715, + "learning_rate": 6.387477454475452e-06, + "loss": 0.1938, + "step": 16455 + }, + { + "epoch": 0.41642837259913457, + "grad_norm": 5.107470512390137, + "learning_rate": 6.387091696481504e-06, + "loss": 0.184, + "step": 16456 + }, + { + "epoch": 0.4164536781638282, + "grad_norm": 3.213057518005371, + "learning_rate": 6.386705929542459e-06, + "loss": 0.1523, + "step": 16457 + }, + { + "epoch": 0.4164789837285219, + "grad_norm": 3.131612539291382, + "learning_rate": 6.386320153660807e-06, + "loss": 0.1387, + "step": 16458 + }, + { + "epoch": 0.4165042892932156, + "grad_norm": 5.87909460067749, + "learning_rate": 6.385934368839032e-06, + "loss": 0.1799, + "step": 16459 + }, + { + "epoch": 0.41652959485790925, + "grad_norm": 4.490966796875, + "learning_rate": 6.385548575079625e-06, + "loss": 0.144, + "step": 16460 + }, + { + "epoch": 0.4165549004226029, + "grad_norm": 9.267151832580566, + "learning_rate": 6.3851627723850725e-06, + "loss": 0.2095, + "step": 16461 + }, + { + "epoch": 0.4165802059872966, + "grad_norm": 8.078911781311035, + "learning_rate": 6.384776960757865e-06, + "loss": 0.1529, + "step": 16462 + }, + { + "epoch": 0.41660551155199027, + "grad_norm": 5.085240840911865, + "learning_rate": 6.384391140200486e-06, + "loss": 0.2, + "step": 16463 + }, + { + "epoch": 0.416630817116684, + "grad_norm": 3.7676002979278564, + "learning_rate": 6.384005310715428e-06, + "loss": 0.116, + "step": 16464 + }, + { + "epoch": 0.41665612268137764, + "grad_norm": 19.599802017211914, + "learning_rate": 6.3836194723051755e-06, + "loss": 0.2361, + "step": 16465 + }, + { + "epoch": 0.4166814282460713, + "grad_norm": 4.326320648193359, + "learning_rate": 6.3832336249722195e-06, + "loss": 0.2203, + "step": 16466 + }, + { + "epoch": 0.416706733810765, + "grad_norm": 6.746753215789795, + "learning_rate": 6.382847768719047e-06, + "loss": 0.1801, + "step": 16467 + }, + { + "epoch": 0.41673203937545866, + "grad_norm": 5.049201965332031, + "learning_rate": 6.382461903548146e-06, + "loss": 0.1413, + "step": 16468 + }, + { + "epoch": 0.4167573449401523, + "grad_norm": 7.484864234924316, + "learning_rate": 6.382076029462005e-06, + "loss": 0.217, + "step": 16469 + }, + { + "epoch": 0.41678265050484603, + "grad_norm": 4.824761867523193, + "learning_rate": 6.3816901464631134e-06, + "loss": 0.1987, + "step": 16470 + }, + { + "epoch": 0.4168079560695397, + "grad_norm": 3.7140278816223145, + "learning_rate": 6.381304254553959e-06, + "loss": 0.1171, + "step": 16471 + }, + { + "epoch": 0.41683326163423334, + "grad_norm": 6.69542121887207, + "learning_rate": 6.3809183537370315e-06, + "loss": 0.237, + "step": 16472 + }, + { + "epoch": 0.41685856719892705, + "grad_norm": 3.3265628814697266, + "learning_rate": 6.380532444014818e-06, + "loss": 0.1196, + "step": 16473 + }, + { + "epoch": 0.4168838727636207, + "grad_norm": 3.188021421432495, + "learning_rate": 6.380146525389806e-06, + "loss": 0.144, + "step": 16474 + }, + { + "epoch": 0.4169091783283144, + "grad_norm": 5.45313835144043, + "learning_rate": 6.3797605978644865e-06, + "loss": 0.184, + "step": 16475 + }, + { + "epoch": 0.4169344838930081, + "grad_norm": 5.097681045532227, + "learning_rate": 6.379374661441346e-06, + "loss": 0.2282, + "step": 16476 + }, + { + "epoch": 0.41695978945770173, + "grad_norm": 6.620461940765381, + "learning_rate": 6.378988716122877e-06, + "loss": 0.1954, + "step": 16477 + }, + { + "epoch": 0.41698509502239545, + "grad_norm": 4.041810989379883, + "learning_rate": 6.378602761911566e-06, + "loss": 0.1627, + "step": 16478 + }, + { + "epoch": 0.4170104005870891, + "grad_norm": 2.9545681476593018, + "learning_rate": 6.378216798809902e-06, + "loss": 0.1284, + "step": 16479 + }, + { + "epoch": 0.41703570615178276, + "grad_norm": 2.9089818000793457, + "learning_rate": 6.377830826820375e-06, + "loss": 0.0894, + "step": 16480 + }, + { + "epoch": 0.41706101171647647, + "grad_norm": 6.519807815551758, + "learning_rate": 6.377444845945472e-06, + "loss": 0.1717, + "step": 16481 + }, + { + "epoch": 0.4170863172811701, + "grad_norm": 15.369099617004395, + "learning_rate": 6.377058856187685e-06, + "loss": 0.2362, + "step": 16482 + }, + { + "epoch": 0.4171116228458638, + "grad_norm": 5.281896591186523, + "learning_rate": 6.3766728575494995e-06, + "loss": 0.0465, + "step": 16483 + }, + { + "epoch": 0.4171369284105575, + "grad_norm": 5.6639509201049805, + "learning_rate": 6.3762868500334085e-06, + "loss": 0.152, + "step": 16484 + }, + { + "epoch": 0.41716223397525115, + "grad_norm": 4.001278877258301, + "learning_rate": 6.3759008336418995e-06, + "loss": 0.1856, + "step": 16485 + }, + { + "epoch": 0.4171875395399448, + "grad_norm": 3.7914650440216064, + "learning_rate": 6.375514808377461e-06, + "loss": 0.1851, + "step": 16486 + }, + { + "epoch": 0.4172128451046385, + "grad_norm": 8.003710746765137, + "learning_rate": 6.375128774242584e-06, + "loss": 0.1672, + "step": 16487 + }, + { + "epoch": 0.4172381506693322, + "grad_norm": 7.405947685241699, + "learning_rate": 6.374742731239757e-06, + "loss": 0.2185, + "step": 16488 + }, + { + "epoch": 0.4172634562340259, + "grad_norm": 9.319747924804688, + "learning_rate": 6.374356679371469e-06, + "loss": 0.226, + "step": 16489 + }, + { + "epoch": 0.41728876179871954, + "grad_norm": 3.8366830348968506, + "learning_rate": 6.3739706186402114e-06, + "loss": 0.1582, + "step": 16490 + }, + { + "epoch": 0.4173140673634132, + "grad_norm": 5.168505668640137, + "learning_rate": 6.373584549048475e-06, + "loss": 0.1475, + "step": 16491 + }, + { + "epoch": 0.4173393729281069, + "grad_norm": 6.162989616394043, + "learning_rate": 6.373198470598744e-06, + "loss": 0.2431, + "step": 16492 + }, + { + "epoch": 0.41736467849280057, + "grad_norm": 4.256481170654297, + "learning_rate": 6.372812383293513e-06, + "loss": 0.1778, + "step": 16493 + }, + { + "epoch": 0.4173899840574942, + "grad_norm": 4.888459205627441, + "learning_rate": 6.3724262871352715e-06, + "loss": 0.2247, + "step": 16494 + }, + { + "epoch": 0.41741528962218793, + "grad_norm": 3.7774715423583984, + "learning_rate": 6.372040182126506e-06, + "loss": 0.1157, + "step": 16495 + }, + { + "epoch": 0.4174405951868816, + "grad_norm": 7.856675624847412, + "learning_rate": 6.371654068269709e-06, + "loss": 0.1806, + "step": 16496 + }, + { + "epoch": 0.41746590075157525, + "grad_norm": 5.695201396942139, + "learning_rate": 6.37126794556737e-06, + "loss": 0.2287, + "step": 16497 + }, + { + "epoch": 0.41749120631626896, + "grad_norm": 2.9304721355438232, + "learning_rate": 6.370881814021981e-06, + "loss": 0.1501, + "step": 16498 + }, + { + "epoch": 0.4175165118809626, + "grad_norm": 4.869497299194336, + "learning_rate": 6.370495673636028e-06, + "loss": 0.155, + "step": 16499 + }, + { + "epoch": 0.4175418174456563, + "grad_norm": 3.490616798400879, + "learning_rate": 6.370109524412005e-06, + "loss": 0.1743, + "step": 16500 + }, + { + "epoch": 0.41756712301035, + "grad_norm": 2.4455904960632324, + "learning_rate": 6.369723366352398e-06, + "loss": 0.1408, + "step": 16501 + }, + { + "epoch": 0.41759242857504364, + "grad_norm": 3.180677890777588, + "learning_rate": 6.369337199459702e-06, + "loss": 0.0943, + "step": 16502 + }, + { + "epoch": 0.41761773413973735, + "grad_norm": 2.7617621421813965, + "learning_rate": 6.368951023736404e-06, + "loss": 0.1447, + "step": 16503 + }, + { + "epoch": 0.417643039704431, + "grad_norm": 4.333040237426758, + "learning_rate": 6.368564839184997e-06, + "loss": 0.1624, + "step": 16504 + }, + { + "epoch": 0.41766834526912466, + "grad_norm": 8.380416870117188, + "learning_rate": 6.368178645807968e-06, + "loss": 0.1378, + "step": 16505 + }, + { + "epoch": 0.4176936508338184, + "grad_norm": 4.806975364685059, + "learning_rate": 6.3677924436078106e-06, + "loss": 0.1654, + "step": 16506 + }, + { + "epoch": 0.41771895639851203, + "grad_norm": 4.208510875701904, + "learning_rate": 6.367406232587013e-06, + "loss": 0.2101, + "step": 16507 + }, + { + "epoch": 0.4177442619632057, + "grad_norm": 3.0934622287750244, + "learning_rate": 6.3670200127480685e-06, + "loss": 0.1149, + "step": 16508 + }, + { + "epoch": 0.4177695675278994, + "grad_norm": 10.574403762817383, + "learning_rate": 6.366633784093466e-06, + "loss": 0.151, + "step": 16509 + }, + { + "epoch": 0.41779487309259306, + "grad_norm": 4.122795581817627, + "learning_rate": 6.366247546625694e-06, + "loss": 0.1764, + "step": 16510 + }, + { + "epoch": 0.4178201786572867, + "grad_norm": 4.321975231170654, + "learning_rate": 6.365861300347249e-06, + "loss": 0.1561, + "step": 16511 + }, + { + "epoch": 0.4178454842219804, + "grad_norm": 5.723717212677002, + "learning_rate": 6.365475045260617e-06, + "loss": 0.1599, + "step": 16512 + }, + { + "epoch": 0.4178707897866741, + "grad_norm": 6.100600719451904, + "learning_rate": 6.365088781368292e-06, + "loss": 0.1588, + "step": 16513 + }, + { + "epoch": 0.4178960953513678, + "grad_norm": 4.278990268707275, + "learning_rate": 6.364702508672761e-06, + "loss": 0.1111, + "step": 16514 + }, + { + "epoch": 0.41792140091606145, + "grad_norm": 3.6285078525543213, + "learning_rate": 6.364316227176519e-06, + "loss": 0.0798, + "step": 16515 + }, + { + "epoch": 0.4179467064807551, + "grad_norm": 4.868627548217773, + "learning_rate": 6.363929936882054e-06, + "loss": 0.2046, + "step": 16516 + }, + { + "epoch": 0.4179720120454488, + "grad_norm": 3.3905324935913086, + "learning_rate": 6.3635436377918605e-06, + "loss": 0.1816, + "step": 16517 + }, + { + "epoch": 0.41799731761014247, + "grad_norm": 2.4918198585510254, + "learning_rate": 6.363157329908426e-06, + "loss": 0.1178, + "step": 16518 + }, + { + "epoch": 0.4180226231748361, + "grad_norm": 7.407105922698975, + "learning_rate": 6.362771013234244e-06, + "loss": 0.2972, + "step": 16519 + }, + { + "epoch": 0.41804792873952984, + "grad_norm": 4.65818452835083, + "learning_rate": 6.362384687771806e-06, + "loss": 0.0832, + "step": 16520 + }, + { + "epoch": 0.4180732343042235, + "grad_norm": 4.174408912658691, + "learning_rate": 6.3619983535236016e-06, + "loss": 0.1647, + "step": 16521 + }, + { + "epoch": 0.41809853986891715, + "grad_norm": 4.456803321838379, + "learning_rate": 6.361612010492123e-06, + "loss": 0.2074, + "step": 16522 + }, + { + "epoch": 0.41812384543361086, + "grad_norm": 16.294530868530273, + "learning_rate": 6.361225658679862e-06, + "loss": 0.3263, + "step": 16523 + }, + { + "epoch": 0.4181491509983045, + "grad_norm": 5.353614807128906, + "learning_rate": 6.36083929808931e-06, + "loss": 0.2344, + "step": 16524 + }, + { + "epoch": 0.4181744565629982, + "grad_norm": 14.908978462219238, + "learning_rate": 6.36045292872296e-06, + "loss": 0.1509, + "step": 16525 + }, + { + "epoch": 0.4181997621276919, + "grad_norm": 3.7444498538970947, + "learning_rate": 6.360066550583301e-06, + "loss": 0.1552, + "step": 16526 + }, + { + "epoch": 0.41822506769238554, + "grad_norm": 14.568346977233887, + "learning_rate": 6.359680163672824e-06, + "loss": 0.295, + "step": 16527 + }, + { + "epoch": 0.41825037325707926, + "grad_norm": 3.7733898162841797, + "learning_rate": 6.3592937679940255e-06, + "loss": 0.1609, + "step": 16528 + }, + { + "epoch": 0.4182756788217729, + "grad_norm": 2.8566198348999023, + "learning_rate": 6.358907363549392e-06, + "loss": 0.1384, + "step": 16529 + }, + { + "epoch": 0.41830098438646657, + "grad_norm": 3.995438575744629, + "learning_rate": 6.358520950341419e-06, + "loss": 0.1232, + "step": 16530 + }, + { + "epoch": 0.4183262899511603, + "grad_norm": 5.8651652336120605, + "learning_rate": 6.358134528372597e-06, + "loss": 0.143, + "step": 16531 + }, + { + "epoch": 0.41835159551585394, + "grad_norm": 4.073379039764404, + "learning_rate": 6.3577480976454175e-06, + "loss": 0.1281, + "step": 16532 + }, + { + "epoch": 0.4183769010805476, + "grad_norm": 2.8779306411743164, + "learning_rate": 6.3573616581623745e-06, + "loss": 0.0725, + "step": 16533 + }, + { + "epoch": 0.4184022066452413, + "grad_norm": 3.733762264251709, + "learning_rate": 6.3569752099259566e-06, + "loss": 0.1523, + "step": 16534 + }, + { + "epoch": 0.41842751220993496, + "grad_norm": 5.902475833892822, + "learning_rate": 6.356588752938659e-06, + "loss": 0.2022, + "step": 16535 + }, + { + "epoch": 0.4184528177746286, + "grad_norm": 3.671541213989258, + "learning_rate": 6.3562022872029726e-06, + "loss": 0.1706, + "step": 16536 + }, + { + "epoch": 0.4184781233393223, + "grad_norm": 3.384941816329956, + "learning_rate": 6.355815812721391e-06, + "loss": 0.1164, + "step": 16537 + }, + { + "epoch": 0.418503428904016, + "grad_norm": 4.755102634429932, + "learning_rate": 6.355429329496404e-06, + "loss": 0.1976, + "step": 16538 + }, + { + "epoch": 0.4185287344687097, + "grad_norm": 6.13411283493042, + "learning_rate": 6.3550428375305065e-06, + "loss": 0.1134, + "step": 16539 + }, + { + "epoch": 0.41855404003340335, + "grad_norm": 3.1059365272521973, + "learning_rate": 6.354656336826188e-06, + "loss": 0.135, + "step": 16540 + }, + { + "epoch": 0.418579345598097, + "grad_norm": 4.925956726074219, + "learning_rate": 6.354269827385944e-06, + "loss": 0.1659, + "step": 16541 + }, + { + "epoch": 0.4186046511627907, + "grad_norm": 2.6513092517852783, + "learning_rate": 6.353883309212266e-06, + "loss": 0.1735, + "step": 16542 + }, + { + "epoch": 0.4186299567274844, + "grad_norm": 7.671940326690674, + "learning_rate": 6.353496782307646e-06, + "loss": 0.1642, + "step": 16543 + }, + { + "epoch": 0.41865526229217803, + "grad_norm": 4.066686630249023, + "learning_rate": 6.353110246674579e-06, + "loss": 0.1794, + "step": 16544 + }, + { + "epoch": 0.41868056785687174, + "grad_norm": 2.9013853073120117, + "learning_rate": 6.352723702315553e-06, + "loss": 0.0993, + "step": 16545 + }, + { + "epoch": 0.4187058734215654, + "grad_norm": 6.514447212219238, + "learning_rate": 6.352337149233065e-06, + "loss": 0.148, + "step": 16546 + }, + { + "epoch": 0.41873117898625906, + "grad_norm": 15.40223217010498, + "learning_rate": 6.351950587429605e-06, + "loss": 0.2851, + "step": 16547 + }, + { + "epoch": 0.41875648455095277, + "grad_norm": 2.654093027114868, + "learning_rate": 6.351564016907669e-06, + "loss": 0.0609, + "step": 16548 + }, + { + "epoch": 0.4187817901156464, + "grad_norm": 4.073518753051758, + "learning_rate": 6.351177437669745e-06, + "loss": 0.1487, + "step": 16549 + }, + { + "epoch": 0.4188070956803401, + "grad_norm": 7.9905290603637695, + "learning_rate": 6.350790849718333e-06, + "loss": 0.2685, + "step": 16550 + }, + { + "epoch": 0.4188324012450338, + "grad_norm": 2.7097506523132324, + "learning_rate": 6.350404253055919e-06, + "loss": 0.1469, + "step": 16551 + }, + { + "epoch": 0.41885770680972745, + "grad_norm": 3.436023473739624, + "learning_rate": 6.350017647685001e-06, + "loss": 0.1728, + "step": 16552 + }, + { + "epoch": 0.41888301237442116, + "grad_norm": 3.4462931156158447, + "learning_rate": 6.349631033608068e-06, + "loss": 0.1482, + "step": 16553 + }, + { + "epoch": 0.4189083179391148, + "grad_norm": 5.206860542297363, + "learning_rate": 6.349244410827618e-06, + "loss": 0.2118, + "step": 16554 + }, + { + "epoch": 0.41893362350380847, + "grad_norm": 4.533592224121094, + "learning_rate": 6.3488577793461414e-06, + "loss": 0.1616, + "step": 16555 + }, + { + "epoch": 0.4189589290685022, + "grad_norm": 3.520015239715576, + "learning_rate": 6.34847113916613e-06, + "loss": 0.0925, + "step": 16556 + }, + { + "epoch": 0.41898423463319584, + "grad_norm": 3.8279898166656494, + "learning_rate": 6.348084490290081e-06, + "loss": 0.1366, + "step": 16557 + }, + { + "epoch": 0.4190095401978895, + "grad_norm": 7.797133922576904, + "learning_rate": 6.347697832720485e-06, + "loss": 0.2182, + "step": 16558 + }, + { + "epoch": 0.4190348457625832, + "grad_norm": 3.312527894973755, + "learning_rate": 6.3473111664598375e-06, + "loss": 0.1897, + "step": 16559 + }, + { + "epoch": 0.41906015132727686, + "grad_norm": 6.579885482788086, + "learning_rate": 6.34692449151063e-06, + "loss": 0.2535, + "step": 16560 + }, + { + "epoch": 0.4190854568919705, + "grad_norm": 36.74619674682617, + "learning_rate": 6.346537807875356e-06, + "loss": 0.1444, + "step": 16561 + }, + { + "epoch": 0.41911076245666423, + "grad_norm": 4.749919414520264, + "learning_rate": 6.346151115556512e-06, + "loss": 0.1785, + "step": 16562 + }, + { + "epoch": 0.4191360680213579, + "grad_norm": 4.752631187438965, + "learning_rate": 6.345764414556588e-06, + "loss": 0.1468, + "step": 16563 + }, + { + "epoch": 0.4191613735860516, + "grad_norm": 4.006305694580078, + "learning_rate": 6.345377704878082e-06, + "loss": 0.157, + "step": 16564 + }, + { + "epoch": 0.41918667915074526, + "grad_norm": 7.2491374015808105, + "learning_rate": 6.344990986523483e-06, + "loss": 0.2982, + "step": 16565 + }, + { + "epoch": 0.4192119847154389, + "grad_norm": 6.322995185852051, + "learning_rate": 6.3446042594952895e-06, + "loss": 0.1504, + "step": 16566 + }, + { + "epoch": 0.4192372902801326, + "grad_norm": 5.722057819366455, + "learning_rate": 6.344217523795991e-06, + "loss": 0.2719, + "step": 16567 + }, + { + "epoch": 0.4192625958448263, + "grad_norm": 5.582450866699219, + "learning_rate": 6.343830779428086e-06, + "loss": 0.167, + "step": 16568 + }, + { + "epoch": 0.41928790140951994, + "grad_norm": 4.541382312774658, + "learning_rate": 6.343444026394065e-06, + "loss": 0.1631, + "step": 16569 + }, + { + "epoch": 0.41931320697421365, + "grad_norm": 6.6591410636901855, + "learning_rate": 6.343057264696424e-06, + "loss": 0.2138, + "step": 16570 + }, + { + "epoch": 0.4193385125389073, + "grad_norm": 5.898808479309082, + "learning_rate": 6.342670494337655e-06, + "loss": 0.2177, + "step": 16571 + }, + { + "epoch": 0.41936381810360096, + "grad_norm": 4.366759300231934, + "learning_rate": 6.342283715320254e-06, + "loss": 0.1643, + "step": 16572 + }, + { + "epoch": 0.41938912366829467, + "grad_norm": 11.284272193908691, + "learning_rate": 6.341896927646717e-06, + "loss": 0.2181, + "step": 16573 + }, + { + "epoch": 0.41941442923298833, + "grad_norm": 9.132164001464844, + "learning_rate": 6.3415101313195345e-06, + "loss": 0.2183, + "step": 16574 + }, + { + "epoch": 0.419439734797682, + "grad_norm": 10.539074897766113, + "learning_rate": 6.341123326341203e-06, + "loss": 0.3142, + "step": 16575 + }, + { + "epoch": 0.4194650403623757, + "grad_norm": 7.459339618682861, + "learning_rate": 6.340736512714217e-06, + "loss": 0.2159, + "step": 16576 + }, + { + "epoch": 0.41949034592706935, + "grad_norm": 4.870186805725098, + "learning_rate": 6.340349690441071e-06, + "loss": 0.1544, + "step": 16577 + }, + { + "epoch": 0.41951565149176306, + "grad_norm": 4.6086883544921875, + "learning_rate": 6.339962859524258e-06, + "loss": 0.2049, + "step": 16578 + }, + { + "epoch": 0.4195409570564567, + "grad_norm": 3.6711783409118652, + "learning_rate": 6.339576019966276e-06, + "loss": 0.1643, + "step": 16579 + }, + { + "epoch": 0.4195662626211504, + "grad_norm": 8.087148666381836, + "learning_rate": 6.339189171769615e-06, + "loss": 0.1872, + "step": 16580 + }, + { + "epoch": 0.4195915681858441, + "grad_norm": 5.285057544708252, + "learning_rate": 6.338802314936774e-06, + "loss": 0.1683, + "step": 16581 + }, + { + "epoch": 0.41961687375053774, + "grad_norm": 4.346757888793945, + "learning_rate": 6.338415449470245e-06, + "loss": 0.1529, + "step": 16582 + }, + { + "epoch": 0.4196421793152314, + "grad_norm": 3.6542904376983643, + "learning_rate": 6.3380285753725235e-06, + "loss": 0.1907, + "step": 16583 + }, + { + "epoch": 0.4196674848799251, + "grad_norm": 7.802975177764893, + "learning_rate": 6.337641692646106e-06, + "loss": 0.183, + "step": 16584 + }, + { + "epoch": 0.41969279044461877, + "grad_norm": 4.005954265594482, + "learning_rate": 6.337254801293484e-06, + "loss": 0.168, + "step": 16585 + }, + { + "epoch": 0.4197180960093124, + "grad_norm": 2.5688059329986572, + "learning_rate": 6.336867901317156e-06, + "loss": 0.1052, + "step": 16586 + }, + { + "epoch": 0.41974340157400614, + "grad_norm": 7.658480167388916, + "learning_rate": 6.336480992719614e-06, + "loss": 0.2916, + "step": 16587 + }, + { + "epoch": 0.4197687071386998, + "grad_norm": 4.544412612915039, + "learning_rate": 6.336094075503356e-06, + "loss": 0.1501, + "step": 16588 + }, + { + "epoch": 0.41979401270339345, + "grad_norm": 9.297635078430176, + "learning_rate": 6.335707149670875e-06, + "loss": 0.1938, + "step": 16589 + }, + { + "epoch": 0.41981931826808716, + "grad_norm": 5.7477850914001465, + "learning_rate": 6.335320215224668e-06, + "loss": 0.1436, + "step": 16590 + }, + { + "epoch": 0.4198446238327808, + "grad_norm": 3.7706551551818848, + "learning_rate": 6.334933272167229e-06, + "loss": 0.1604, + "step": 16591 + }, + { + "epoch": 0.41986992939747453, + "grad_norm": 4.2159528732299805, + "learning_rate": 6.334546320501053e-06, + "loss": 0.1776, + "step": 16592 + }, + { + "epoch": 0.4198952349621682, + "grad_norm": 13.792285919189453, + "learning_rate": 6.334159360228638e-06, + "loss": 0.2799, + "step": 16593 + }, + { + "epoch": 0.41992054052686184, + "grad_norm": 2.889493227005005, + "learning_rate": 6.3337723913524755e-06, + "loss": 0.1375, + "step": 16594 + }, + { + "epoch": 0.41994584609155555, + "grad_norm": 1.9564571380615234, + "learning_rate": 6.3333854138750635e-06, + "loss": 0.0902, + "step": 16595 + }, + { + "epoch": 0.4199711516562492, + "grad_norm": 4.971933364868164, + "learning_rate": 6.3329984277988966e-06, + "loss": 0.2286, + "step": 16596 + }, + { + "epoch": 0.41999645722094286, + "grad_norm": 3.7086079120635986, + "learning_rate": 6.332611433126471e-06, + "loss": 0.1402, + "step": 16597 + }, + { + "epoch": 0.4200217627856366, + "grad_norm": 4.313940048217773, + "learning_rate": 6.332224429860282e-06, + "loss": 0.1625, + "step": 16598 + }, + { + "epoch": 0.42004706835033023, + "grad_norm": 1.7390048503875732, + "learning_rate": 6.331837418002827e-06, + "loss": 0.0934, + "step": 16599 + }, + { + "epoch": 0.4200723739150239, + "grad_norm": 4.851327896118164, + "learning_rate": 6.331450397556598e-06, + "loss": 0.2, + "step": 16600 + }, + { + "epoch": 0.4200976794797176, + "grad_norm": 13.65083122253418, + "learning_rate": 6.331063368524095e-06, + "loss": 0.2615, + "step": 16601 + }, + { + "epoch": 0.42012298504441126, + "grad_norm": 7.779336929321289, + "learning_rate": 6.33067633090781e-06, + "loss": 0.143, + "step": 16602 + }, + { + "epoch": 0.42014829060910497, + "grad_norm": 9.6620512008667, + "learning_rate": 6.330289284710243e-06, + "loss": 0.2583, + "step": 16603 + }, + { + "epoch": 0.4201735961737986, + "grad_norm": 4.681717872619629, + "learning_rate": 6.3299022299338865e-06, + "loss": 0.1464, + "step": 16604 + }, + { + "epoch": 0.4201989017384923, + "grad_norm": 12.963294982910156, + "learning_rate": 6.329515166581237e-06, + "loss": 0.2256, + "step": 16605 + }, + { + "epoch": 0.420224207303186, + "grad_norm": 5.4656853675842285, + "learning_rate": 6.329128094654793e-06, + "loss": 0.1577, + "step": 16606 + }, + { + "epoch": 0.42024951286787965, + "grad_norm": 3.77731990814209, + "learning_rate": 6.328741014157048e-06, + "loss": 0.1059, + "step": 16607 + }, + { + "epoch": 0.4202748184325733, + "grad_norm": 4.856914520263672, + "learning_rate": 6.328353925090501e-06, + "loss": 0.1543, + "step": 16608 + }, + { + "epoch": 0.420300123997267, + "grad_norm": 4.41018533706665, + "learning_rate": 6.327966827457645e-06, + "loss": 0.1905, + "step": 16609 + }, + { + "epoch": 0.4203254295619607, + "grad_norm": 9.64320182800293, + "learning_rate": 6.327579721260979e-06, + "loss": 0.2422, + "step": 16610 + }, + { + "epoch": 0.42035073512665433, + "grad_norm": 7.032690048217773, + "learning_rate": 6.3271926065029985e-06, + "loss": 0.2347, + "step": 16611 + }, + { + "epoch": 0.42037604069134804, + "grad_norm": 9.875717163085938, + "learning_rate": 6.326805483186199e-06, + "loss": 0.1222, + "step": 16612 + }, + { + "epoch": 0.4204013462560417, + "grad_norm": 4.224916458129883, + "learning_rate": 6.32641835131308e-06, + "loss": 0.1203, + "step": 16613 + }, + { + "epoch": 0.42042665182073535, + "grad_norm": 3.9108235836029053, + "learning_rate": 6.326031210886132e-06, + "loss": 0.1828, + "step": 16614 + }, + { + "epoch": 0.42045195738542906, + "grad_norm": 5.106527328491211, + "learning_rate": 6.325644061907859e-06, + "loss": 0.2135, + "step": 16615 + }, + { + "epoch": 0.4204772629501227, + "grad_norm": 4.454598903656006, + "learning_rate": 6.325256904380751e-06, + "loss": 0.2236, + "step": 16616 + }, + { + "epoch": 0.42050256851481643, + "grad_norm": 3.168334722518921, + "learning_rate": 6.32486973830731e-06, + "loss": 0.2072, + "step": 16617 + }, + { + "epoch": 0.4205278740795101, + "grad_norm": 5.147884845733643, + "learning_rate": 6.324482563690028e-06, + "loss": 0.1643, + "step": 16618 + }, + { + "epoch": 0.42055317964420375, + "grad_norm": 4.776247024536133, + "learning_rate": 6.324095380531407e-06, + "loss": 0.1155, + "step": 16619 + }, + { + "epoch": 0.42057848520889746, + "grad_norm": 4.9886250495910645, + "learning_rate": 6.323708188833939e-06, + "loss": 0.1724, + "step": 16620 + }, + { + "epoch": 0.4206037907735911, + "grad_norm": 6.898421287536621, + "learning_rate": 6.323320988600126e-06, + "loss": 0.1357, + "step": 16621 + }, + { + "epoch": 0.42062909633828477, + "grad_norm": 4.692674160003662, + "learning_rate": 6.322933779832459e-06, + "loss": 0.1987, + "step": 16622 + }, + { + "epoch": 0.4206544019029785, + "grad_norm": 6.380527496337891, + "learning_rate": 6.32254656253344e-06, + "loss": 0.2223, + "step": 16623 + }, + { + "epoch": 0.42067970746767214, + "grad_norm": 9.98502254486084, + "learning_rate": 6.3221593367055624e-06, + "loss": 0.2177, + "step": 16624 + }, + { + "epoch": 0.4207050130323658, + "grad_norm": 4.870119571685791, + "learning_rate": 6.321772102351326e-06, + "loss": 0.1709, + "step": 16625 + }, + { + "epoch": 0.4207303185970595, + "grad_norm": 6.1266703605651855, + "learning_rate": 6.321384859473228e-06, + "loss": 0.1824, + "step": 16626 + }, + { + "epoch": 0.42075562416175316, + "grad_norm": 4.174295902252197, + "learning_rate": 6.320997608073765e-06, + "loss": 0.1436, + "step": 16627 + }, + { + "epoch": 0.4207809297264469, + "grad_norm": 3.0644400119781494, + "learning_rate": 6.320610348155434e-06, + "loss": 0.1275, + "step": 16628 + }, + { + "epoch": 0.42080623529114053, + "grad_norm": 9.85848331451416, + "learning_rate": 6.320223079720731e-06, + "loss": 0.1841, + "step": 16629 + }, + { + "epoch": 0.4208315408558342, + "grad_norm": 7.053539276123047, + "learning_rate": 6.319835802772156e-06, + "loss": 0.3565, + "step": 16630 + }, + { + "epoch": 0.4208568464205279, + "grad_norm": 7.824062824249268, + "learning_rate": 6.319448517312206e-06, + "loss": 0.2055, + "step": 16631 + }, + { + "epoch": 0.42088215198522155, + "grad_norm": 4.331011772155762, + "learning_rate": 6.319061223343377e-06, + "loss": 0.1551, + "step": 16632 + }, + { + "epoch": 0.4209074575499152, + "grad_norm": 2.967036485671997, + "learning_rate": 6.318673920868168e-06, + "loss": 0.1572, + "step": 16633 + }, + { + "epoch": 0.4209327631146089, + "grad_norm": 13.163298606872559, + "learning_rate": 6.3182866098890764e-06, + "loss": 0.1759, + "step": 16634 + }, + { + "epoch": 0.4209580686793026, + "grad_norm": 6.271088600158691, + "learning_rate": 6.317899290408599e-06, + "loss": 0.1304, + "step": 16635 + }, + { + "epoch": 0.42098337424399623, + "grad_norm": 8.933450698852539, + "learning_rate": 6.317511962429234e-06, + "loss": 0.1928, + "step": 16636 + }, + { + "epoch": 0.42100867980868995, + "grad_norm": 4.836925029754639, + "learning_rate": 6.31712462595348e-06, + "loss": 0.1668, + "step": 16637 + }, + { + "epoch": 0.4210339853733836, + "grad_norm": 4.758237361907959, + "learning_rate": 6.316737280983835e-06, + "loss": 0.1511, + "step": 16638 + }, + { + "epoch": 0.42105929093807726, + "grad_norm": 13.527741432189941, + "learning_rate": 6.316349927522796e-06, + "loss": 0.1647, + "step": 16639 + }, + { + "epoch": 0.42108459650277097, + "grad_norm": 7.69994592666626, + "learning_rate": 6.315962565572861e-06, + "loss": 0.2362, + "step": 16640 + }, + { + "epoch": 0.4211099020674646, + "grad_norm": 6.529125213623047, + "learning_rate": 6.315575195136527e-06, + "loss": 0.2128, + "step": 16641 + }, + { + "epoch": 0.42113520763215834, + "grad_norm": 4.690347194671631, + "learning_rate": 6.315187816216295e-06, + "loss": 0.101, + "step": 16642 + }, + { + "epoch": 0.421160513196852, + "grad_norm": 2.796919584274292, + "learning_rate": 6.3148004288146606e-06, + "loss": 0.1588, + "step": 16643 + }, + { + "epoch": 0.42118581876154565, + "grad_norm": 4.937994003295898, + "learning_rate": 6.314413032934122e-06, + "loss": 0.104, + "step": 16644 + }, + { + "epoch": 0.42121112432623936, + "grad_norm": 4.367544174194336, + "learning_rate": 6.31402562857718e-06, + "loss": 0.1399, + "step": 16645 + }, + { + "epoch": 0.421236429890933, + "grad_norm": 2.5576305389404297, + "learning_rate": 6.3136382157463294e-06, + "loss": 0.0992, + "step": 16646 + }, + { + "epoch": 0.4212617354556267, + "grad_norm": 3.463608503341675, + "learning_rate": 6.31325079444407e-06, + "loss": 0.0856, + "step": 16647 + }, + { + "epoch": 0.4212870410203204, + "grad_norm": 5.456703186035156, + "learning_rate": 6.312863364672903e-06, + "loss": 0.1465, + "step": 16648 + }, + { + "epoch": 0.42131234658501404, + "grad_norm": 3.890570878982544, + "learning_rate": 6.312475926435322e-06, + "loss": 0.1645, + "step": 16649 + }, + { + "epoch": 0.4213376521497077, + "grad_norm": 7.268948554992676, + "learning_rate": 6.31208847973383e-06, + "loss": 0.1651, + "step": 16650 + }, + { + "epoch": 0.4213629577144014, + "grad_norm": 4.8011860847473145, + "learning_rate": 6.311701024570922e-06, + "loss": 0.1315, + "step": 16651 + }, + { + "epoch": 0.42138826327909507, + "grad_norm": 6.510429382324219, + "learning_rate": 6.311313560949098e-06, + "loss": 0.1715, + "step": 16652 + }, + { + "epoch": 0.4214135688437887, + "grad_norm": 4.508202075958252, + "learning_rate": 6.310926088870856e-06, + "loss": 0.2062, + "step": 16653 + }, + { + "epoch": 0.42143887440848243, + "grad_norm": 8.613738059997559, + "learning_rate": 6.310538608338699e-06, + "loss": 0.2206, + "step": 16654 + }, + { + "epoch": 0.4214641799731761, + "grad_norm": 6.589836597442627, + "learning_rate": 6.310151119355119e-06, + "loss": 0.1746, + "step": 16655 + }, + { + "epoch": 0.4214894855378698, + "grad_norm": 5.3149237632751465, + "learning_rate": 6.30976362192262e-06, + "loss": 0.161, + "step": 16656 + }, + { + "epoch": 0.42151479110256346, + "grad_norm": 3.2266314029693604, + "learning_rate": 6.309376116043697e-06, + "loss": 0.172, + "step": 16657 + }, + { + "epoch": 0.4215400966672571, + "grad_norm": 3.435154438018799, + "learning_rate": 6.308988601720851e-06, + "loss": 0.0919, + "step": 16658 + }, + { + "epoch": 0.4215654022319508, + "grad_norm": 3.39787220954895, + "learning_rate": 6.308601078956583e-06, + "loss": 0.1385, + "step": 16659 + }, + { + "epoch": 0.4215907077966445, + "grad_norm": 2.7211575508117676, + "learning_rate": 6.308213547753389e-06, + "loss": 0.0633, + "step": 16660 + }, + { + "epoch": 0.42161601336133814, + "grad_norm": 2.9033045768737793, + "learning_rate": 6.3078260081137695e-06, + "loss": 0.1177, + "step": 16661 + }, + { + "epoch": 0.42164131892603185, + "grad_norm": 5.020246505737305, + "learning_rate": 6.307438460040224e-06, + "loss": 0.206, + "step": 16662 + }, + { + "epoch": 0.4216666244907255, + "grad_norm": 15.789374351501465, + "learning_rate": 6.30705090353525e-06, + "loss": 0.2614, + "step": 16663 + }, + { + "epoch": 0.42169193005541916, + "grad_norm": 5.030179023742676, + "learning_rate": 6.3066633386013475e-06, + "loss": 0.1535, + "step": 16664 + }, + { + "epoch": 0.4217172356201129, + "grad_norm": 7.106213569641113, + "learning_rate": 6.3062757652410175e-06, + "loss": 0.1421, + "step": 16665 + }, + { + "epoch": 0.42174254118480653, + "grad_norm": 10.476670265197754, + "learning_rate": 6.3058881834567575e-06, + "loss": 0.2403, + "step": 16666 + }, + { + "epoch": 0.42176784674950024, + "grad_norm": 2.8003883361816406, + "learning_rate": 6.305500593251068e-06, + "loss": 0.0705, + "step": 16667 + }, + { + "epoch": 0.4217931523141939, + "grad_norm": 3.0872321128845215, + "learning_rate": 6.305112994626446e-06, + "loss": 0.1287, + "step": 16668 + }, + { + "epoch": 0.42181845787888755, + "grad_norm": 3.1321234703063965, + "learning_rate": 6.304725387585395e-06, + "loss": 0.157, + "step": 16669 + }, + { + "epoch": 0.42184376344358127, + "grad_norm": 4.143288612365723, + "learning_rate": 6.304337772130412e-06, + "loss": 0.1991, + "step": 16670 + }, + { + "epoch": 0.4218690690082749, + "grad_norm": 9.848963737487793, + "learning_rate": 6.303950148263998e-06, + "loss": 0.2497, + "step": 16671 + }, + { + "epoch": 0.4218943745729686, + "grad_norm": 6.33345890045166, + "learning_rate": 6.303562515988652e-06, + "loss": 0.2796, + "step": 16672 + }, + { + "epoch": 0.4219196801376623, + "grad_norm": 4.20860481262207, + "learning_rate": 6.303174875306873e-06, + "loss": 0.1783, + "step": 16673 + }, + { + "epoch": 0.42194498570235595, + "grad_norm": 3.0476233959198, + "learning_rate": 6.302787226221163e-06, + "loss": 0.1542, + "step": 16674 + }, + { + "epoch": 0.4219702912670496, + "grad_norm": 5.279098987579346, + "learning_rate": 6.3023995687340186e-06, + "loss": 0.215, + "step": 16675 + }, + { + "epoch": 0.4219955968317433, + "grad_norm": 3.6135826110839844, + "learning_rate": 6.3020119028479445e-06, + "loss": 0.1819, + "step": 16676 + }, + { + "epoch": 0.42202090239643697, + "grad_norm": 5.274544715881348, + "learning_rate": 6.301624228565435e-06, + "loss": 0.0964, + "step": 16677 + }, + { + "epoch": 0.4220462079611306, + "grad_norm": 3.1847968101501465, + "learning_rate": 6.301236545888993e-06, + "loss": 0.1309, + "step": 16678 + }, + { + "epoch": 0.42207151352582434, + "grad_norm": 5.988005638122559, + "learning_rate": 6.30084885482112e-06, + "loss": 0.2334, + "step": 16679 + }, + { + "epoch": 0.422096819090518, + "grad_norm": 5.1106038093566895, + "learning_rate": 6.300461155364314e-06, + "loss": 0.131, + "step": 16680 + }, + { + "epoch": 0.4221221246552117, + "grad_norm": 4.679058074951172, + "learning_rate": 6.300073447521076e-06, + "loss": 0.1414, + "step": 16681 + }, + { + "epoch": 0.42214743021990536, + "grad_norm": 5.390780925750732, + "learning_rate": 6.299685731293905e-06, + "loss": 0.1839, + "step": 16682 + }, + { + "epoch": 0.422172735784599, + "grad_norm": 3.622929573059082, + "learning_rate": 6.299298006685305e-06, + "loss": 0.1693, + "step": 16683 + }, + { + "epoch": 0.42219804134929273, + "grad_norm": 14.124863624572754, + "learning_rate": 6.2989102736977725e-06, + "loss": 0.2, + "step": 16684 + }, + { + "epoch": 0.4222233469139864, + "grad_norm": 4.602981090545654, + "learning_rate": 6.29852253233381e-06, + "loss": 0.185, + "step": 16685 + }, + { + "epoch": 0.42224865247868004, + "grad_norm": 3.6025807857513428, + "learning_rate": 6.298134782595916e-06, + "loss": 0.13, + "step": 16686 + }, + { + "epoch": 0.42227395804337375, + "grad_norm": 7.168011665344238, + "learning_rate": 6.2977470244865935e-06, + "loss": 0.1829, + "step": 16687 + }, + { + "epoch": 0.4222992636080674, + "grad_norm": 8.967961311340332, + "learning_rate": 6.29735925800834e-06, + "loss": 0.3084, + "step": 16688 + }, + { + "epoch": 0.42232456917276107, + "grad_norm": 3.8836147785186768, + "learning_rate": 6.296971483163659e-06, + "loss": 0.1085, + "step": 16689 + }, + { + "epoch": 0.4223498747374548, + "grad_norm": 4.192149639129639, + "learning_rate": 6.29658369995505e-06, + "loss": 0.0685, + "step": 16690 + }, + { + "epoch": 0.42237518030214843, + "grad_norm": 5.751493453979492, + "learning_rate": 6.2961959083850145e-06, + "loss": 0.1822, + "step": 16691 + }, + { + "epoch": 0.42240048586684215, + "grad_norm": 3.9288570880889893, + "learning_rate": 6.295808108456053e-06, + "loss": 0.1962, + "step": 16692 + }, + { + "epoch": 0.4224257914315358, + "grad_norm": 3.2470099925994873, + "learning_rate": 6.295420300170665e-06, + "loss": 0.1374, + "step": 16693 + }, + { + "epoch": 0.42245109699622946, + "grad_norm": 4.66343355178833, + "learning_rate": 6.295032483531354e-06, + "loss": 0.1641, + "step": 16694 + }, + { + "epoch": 0.42247640256092317, + "grad_norm": 5.418320178985596, + "learning_rate": 6.294644658540617e-06, + "loss": 0.2145, + "step": 16695 + }, + { + "epoch": 0.4225017081256168, + "grad_norm": 9.495857238769531, + "learning_rate": 6.29425682520096e-06, + "loss": 0.2482, + "step": 16696 + }, + { + "epoch": 0.4225270136903105, + "grad_norm": 5.8782124519348145, + "learning_rate": 6.293868983514879e-06, + "loss": 0.1335, + "step": 16697 + }, + { + "epoch": 0.4225523192550042, + "grad_norm": 4.564225196838379, + "learning_rate": 6.293481133484879e-06, + "loss": 0.1688, + "step": 16698 + }, + { + "epoch": 0.42257762481969785, + "grad_norm": 5.339531421661377, + "learning_rate": 6.29309327511346e-06, + "loss": 0.1905, + "step": 16699 + }, + { + "epoch": 0.4226029303843915, + "grad_norm": 3.8082313537597656, + "learning_rate": 6.2927054084031225e-06, + "loss": 0.1162, + "step": 16700 + }, + { + "epoch": 0.4226282359490852, + "grad_norm": 2.2137248516082764, + "learning_rate": 6.2923175333563695e-06, + "loss": 0.0497, + "step": 16701 + }, + { + "epoch": 0.4226535415137789, + "grad_norm": 6.408285140991211, + "learning_rate": 6.291929649975699e-06, + "loss": 0.2221, + "step": 16702 + }, + { + "epoch": 0.42267884707847253, + "grad_norm": 4.3848772048950195, + "learning_rate": 6.291541758263616e-06, + "loss": 0.159, + "step": 16703 + }, + { + "epoch": 0.42270415264316624, + "grad_norm": 5.57785701751709, + "learning_rate": 6.29115385822262e-06, + "loss": 0.1463, + "step": 16704 + }, + { + "epoch": 0.4227294582078599, + "grad_norm": 5.063394069671631, + "learning_rate": 6.290765949855213e-06, + "loss": 0.1655, + "step": 16705 + }, + { + "epoch": 0.4227547637725536, + "grad_norm": 4.638571739196777, + "learning_rate": 6.290378033163895e-06, + "loss": 0.1761, + "step": 16706 + }, + { + "epoch": 0.42278006933724727, + "grad_norm": 2.558542490005493, + "learning_rate": 6.289990108151172e-06, + "loss": 0.1408, + "step": 16707 + }, + { + "epoch": 0.4228053749019409, + "grad_norm": 4.182583332061768, + "learning_rate": 6.28960217481954e-06, + "loss": 0.132, + "step": 16708 + }, + { + "epoch": 0.42283068046663463, + "grad_norm": 5.410524845123291, + "learning_rate": 6.289214233171506e-06, + "loss": 0.1649, + "step": 16709 + }, + { + "epoch": 0.4228559860313283, + "grad_norm": 2.444103479385376, + "learning_rate": 6.288826283209567e-06, + "loss": 0.1437, + "step": 16710 + }, + { + "epoch": 0.42288129159602195, + "grad_norm": 4.214954376220703, + "learning_rate": 6.288438324936228e-06, + "loss": 0.1663, + "step": 16711 + }, + { + "epoch": 0.42290659716071566, + "grad_norm": 3.7957894802093506, + "learning_rate": 6.28805035835399e-06, + "loss": 0.1397, + "step": 16712 + }, + { + "epoch": 0.4229319027254093, + "grad_norm": 3.0989067554473877, + "learning_rate": 6.2876623834653535e-06, + "loss": 0.1485, + "step": 16713 + }, + { + "epoch": 0.42295720829010297, + "grad_norm": 12.31967830657959, + "learning_rate": 6.287274400272824e-06, + "loss": 0.2922, + "step": 16714 + }, + { + "epoch": 0.4229825138547967, + "grad_norm": 2.1204516887664795, + "learning_rate": 6.286886408778899e-06, + "loss": 0.0839, + "step": 16715 + }, + { + "epoch": 0.42300781941949034, + "grad_norm": 8.364523887634277, + "learning_rate": 6.286498408986086e-06, + "loss": 0.2191, + "step": 16716 + }, + { + "epoch": 0.423033124984184, + "grad_norm": 6.379190921783447, + "learning_rate": 6.286110400896881e-06, + "loss": 0.1995, + "step": 16717 + }, + { + "epoch": 0.4230584305488777, + "grad_norm": 6.200942516326904, + "learning_rate": 6.285722384513791e-06, + "loss": 0.2111, + "step": 16718 + }, + { + "epoch": 0.42308373611357136, + "grad_norm": 4.170877933502197, + "learning_rate": 6.285334359839316e-06, + "loss": 0.2017, + "step": 16719 + }, + { + "epoch": 0.4231090416782651, + "grad_norm": 3.1429078578948975, + "learning_rate": 6.284946326875959e-06, + "loss": 0.1503, + "step": 16720 + }, + { + "epoch": 0.42313434724295873, + "grad_norm": 5.419804096221924, + "learning_rate": 6.284558285626222e-06, + "loss": 0.1061, + "step": 16721 + }, + { + "epoch": 0.4231596528076524, + "grad_norm": 21.955957412719727, + "learning_rate": 6.284170236092607e-06, + "loss": 0.3964, + "step": 16722 + }, + { + "epoch": 0.4231849583723461, + "grad_norm": 3.402202606201172, + "learning_rate": 6.283782178277619e-06, + "loss": 0.1533, + "step": 16723 + }, + { + "epoch": 0.42321026393703975, + "grad_norm": 5.134410381317139, + "learning_rate": 6.283394112183756e-06, + "loss": 0.1999, + "step": 16724 + }, + { + "epoch": 0.4232355695017334, + "grad_norm": 2.138801097869873, + "learning_rate": 6.283006037813525e-06, + "loss": 0.1133, + "step": 16725 + }, + { + "epoch": 0.4232608750664271, + "grad_norm": 4.419536113739014, + "learning_rate": 6.282617955169426e-06, + "loss": 0.1742, + "step": 16726 + }, + { + "epoch": 0.4232861806311208, + "grad_norm": 4.4452056884765625, + "learning_rate": 6.282229864253962e-06, + "loss": 0.1461, + "step": 16727 + }, + { + "epoch": 0.42331148619581443, + "grad_norm": 10.646965980529785, + "learning_rate": 6.2818417650696375e-06, + "loss": 0.2424, + "step": 16728 + }, + { + "epoch": 0.42333679176050815, + "grad_norm": 5.68970251083374, + "learning_rate": 6.281453657618952e-06, + "loss": 0.1911, + "step": 16729 + }, + { + "epoch": 0.4233620973252018, + "grad_norm": 6.742825984954834, + "learning_rate": 6.281065541904412e-06, + "loss": 0.2827, + "step": 16730 + }, + { + "epoch": 0.4233874028898955, + "grad_norm": 7.195694446563721, + "learning_rate": 6.280677417928517e-06, + "loss": 0.2118, + "step": 16731 + }, + { + "epoch": 0.42341270845458917, + "grad_norm": 7.867141246795654, + "learning_rate": 6.280289285693772e-06, + "loss": 0.2645, + "step": 16732 + }, + { + "epoch": 0.4234380140192828, + "grad_norm": 19.177406311035156, + "learning_rate": 6.279901145202681e-06, + "loss": 0.4934, + "step": 16733 + }, + { + "epoch": 0.42346331958397654, + "grad_norm": 6.793220043182373, + "learning_rate": 6.279512996457744e-06, + "loss": 0.2704, + "step": 16734 + }, + { + "epoch": 0.4234886251486702, + "grad_norm": 3.11271595954895, + "learning_rate": 6.279124839461466e-06, + "loss": 0.1565, + "step": 16735 + }, + { + "epoch": 0.42351393071336385, + "grad_norm": 4.222502708435059, + "learning_rate": 6.27873667421635e-06, + "loss": 0.2007, + "step": 16736 + }, + { + "epoch": 0.42353923627805756, + "grad_norm": 6.786605358123779, + "learning_rate": 6.278348500724899e-06, + "loss": 0.2371, + "step": 16737 + }, + { + "epoch": 0.4235645418427512, + "grad_norm": 6.057841777801514, + "learning_rate": 6.277960318989617e-06, + "loss": 0.2679, + "step": 16738 + }, + { + "epoch": 0.4235898474074449, + "grad_norm": 7.535494804382324, + "learning_rate": 6.277572129013005e-06, + "loss": 0.1372, + "step": 16739 + }, + { + "epoch": 0.4236151529721386, + "grad_norm": 4.245554447174072, + "learning_rate": 6.27718393079757e-06, + "loss": 0.1584, + "step": 16740 + }, + { + "epoch": 0.42364045853683224, + "grad_norm": 3.308310031890869, + "learning_rate": 6.276795724345812e-06, + "loss": 0.1282, + "step": 16741 + }, + { + "epoch": 0.4236657641015259, + "grad_norm": 4.070578098297119, + "learning_rate": 6.276407509660235e-06, + "loss": 0.1718, + "step": 16742 + }, + { + "epoch": 0.4236910696662196, + "grad_norm": 12.494282722473145, + "learning_rate": 6.2760192867433455e-06, + "loss": 0.2008, + "step": 16743 + }, + { + "epoch": 0.42371637523091327, + "grad_norm": 5.52153205871582, + "learning_rate": 6.275631055597644e-06, + "loss": 0.2174, + "step": 16744 + }, + { + "epoch": 0.423741680795607, + "grad_norm": 3.2242019176483154, + "learning_rate": 6.275242816225635e-06, + "loss": 0.1829, + "step": 16745 + }, + { + "epoch": 0.42376698636030063, + "grad_norm": 4.728222846984863, + "learning_rate": 6.2748545686298226e-06, + "loss": 0.2824, + "step": 16746 + }, + { + "epoch": 0.4237922919249943, + "grad_norm": 4.0655741691589355, + "learning_rate": 6.274466312812711e-06, + "loss": 0.1645, + "step": 16747 + }, + { + "epoch": 0.423817597489688, + "grad_norm": 5.880916595458984, + "learning_rate": 6.274078048776802e-06, + "loss": 0.1709, + "step": 16748 + }, + { + "epoch": 0.42384290305438166, + "grad_norm": 3.9461495876312256, + "learning_rate": 6.2736897765246005e-06, + "loss": 0.1359, + "step": 16749 + }, + { + "epoch": 0.4238682086190753, + "grad_norm": 4.15543270111084, + "learning_rate": 6.273301496058611e-06, + "loss": 0.1708, + "step": 16750 + }, + { + "epoch": 0.423893514183769, + "grad_norm": 6.603239059448242, + "learning_rate": 6.272913207381337e-06, + "loss": 0.2166, + "step": 16751 + }, + { + "epoch": 0.4239188197484627, + "grad_norm": 4.887662887573242, + "learning_rate": 6.272524910495283e-06, + "loss": 0.1397, + "step": 16752 + }, + { + "epoch": 0.42394412531315634, + "grad_norm": 10.211627006530762, + "learning_rate": 6.272136605402952e-06, + "loss": 0.1728, + "step": 16753 + }, + { + "epoch": 0.42396943087785005, + "grad_norm": 4.520246505737305, + "learning_rate": 6.271748292106849e-06, + "loss": 0.2247, + "step": 16754 + }, + { + "epoch": 0.4239947364425437, + "grad_norm": 7.799563407897949, + "learning_rate": 6.271359970609477e-06, + "loss": 0.3215, + "step": 16755 + }, + { + "epoch": 0.4240200420072374, + "grad_norm": 5.195701599121094, + "learning_rate": 6.2709716409133414e-06, + "loss": 0.1647, + "step": 16756 + }, + { + "epoch": 0.4240453475719311, + "grad_norm": 4.022425651550293, + "learning_rate": 6.270583303020947e-06, + "loss": 0.1402, + "step": 16757 + }, + { + "epoch": 0.42407065313662473, + "grad_norm": 3.124732732772827, + "learning_rate": 6.270194956934796e-06, + "loss": 0.1044, + "step": 16758 + }, + { + "epoch": 0.42409595870131844, + "grad_norm": 5.456474304199219, + "learning_rate": 6.2698066026573955e-06, + "loss": 0.1049, + "step": 16759 + }, + { + "epoch": 0.4241212642660121, + "grad_norm": 3.8383781909942627, + "learning_rate": 6.269418240191249e-06, + "loss": 0.238, + "step": 16760 + }, + { + "epoch": 0.42414656983070576, + "grad_norm": 3.5865638256073, + "learning_rate": 6.269029869538859e-06, + "loss": 0.1633, + "step": 16761 + }, + { + "epoch": 0.42417187539539947, + "grad_norm": 10.888493537902832, + "learning_rate": 6.268641490702731e-06, + "loss": 0.2627, + "step": 16762 + }, + { + "epoch": 0.4241971809600931, + "grad_norm": 4.494633197784424, + "learning_rate": 6.26825310368537e-06, + "loss": 0.2466, + "step": 16763 + }, + { + "epoch": 0.4242224865247868, + "grad_norm": 12.596009254455566, + "learning_rate": 6.267864708489279e-06, + "loss": 0.2088, + "step": 16764 + }, + { + "epoch": 0.4242477920894805, + "grad_norm": 3.655872344970703, + "learning_rate": 6.267476305116968e-06, + "loss": 0.1563, + "step": 16765 + }, + { + "epoch": 0.42427309765417415, + "grad_norm": 5.159757137298584, + "learning_rate": 6.2670878935709355e-06, + "loss": 0.275, + "step": 16766 + }, + { + "epoch": 0.4242984032188678, + "grad_norm": 2.890044689178467, + "learning_rate": 6.26669947385369e-06, + "loss": 0.0833, + "step": 16767 + }, + { + "epoch": 0.4243237087835615, + "grad_norm": 4.433992862701416, + "learning_rate": 6.266311045967736e-06, + "loss": 0.1278, + "step": 16768 + }, + { + "epoch": 0.42434901434825517, + "grad_norm": 4.475409030914307, + "learning_rate": 6.265922609915577e-06, + "loss": 0.1119, + "step": 16769 + }, + { + "epoch": 0.4243743199129489, + "grad_norm": 11.716137886047363, + "learning_rate": 6.265534165699717e-06, + "loss": 0.2408, + "step": 16770 + }, + { + "epoch": 0.42439962547764254, + "grad_norm": 12.086993217468262, + "learning_rate": 6.265145713322664e-06, + "loss": 0.2219, + "step": 16771 + }, + { + "epoch": 0.4244249310423362, + "grad_norm": 7.689574718475342, + "learning_rate": 6.264757252786922e-06, + "loss": 0.2008, + "step": 16772 + }, + { + "epoch": 0.4244502366070299, + "grad_norm": 12.278388023376465, + "learning_rate": 6.2643687840949955e-06, + "loss": 0.3408, + "step": 16773 + }, + { + "epoch": 0.42447554217172356, + "grad_norm": 10.926179885864258, + "learning_rate": 6.263980307249389e-06, + "loss": 0.2647, + "step": 16774 + }, + { + "epoch": 0.4245008477364172, + "grad_norm": 3.3826024532318115, + "learning_rate": 6.263591822252608e-06, + "loss": 0.1181, + "step": 16775 + }, + { + "epoch": 0.42452615330111093, + "grad_norm": 6.812875270843506, + "learning_rate": 6.26320332910716e-06, + "loss": 0.2646, + "step": 16776 + }, + { + "epoch": 0.4245514588658046, + "grad_norm": 21.09720802307129, + "learning_rate": 6.262814827815549e-06, + "loss": 0.2356, + "step": 16777 + }, + { + "epoch": 0.42457676443049824, + "grad_norm": 7.925963878631592, + "learning_rate": 6.262426318380279e-06, + "loss": 0.1903, + "step": 16778 + }, + { + "epoch": 0.42460206999519196, + "grad_norm": 10.439091682434082, + "learning_rate": 6.2620378008038565e-06, + "loss": 0.168, + "step": 16779 + }, + { + "epoch": 0.4246273755598856, + "grad_norm": 8.411081314086914, + "learning_rate": 6.261649275088788e-06, + "loss": 0.2088, + "step": 16780 + }, + { + "epoch": 0.42465268112457927, + "grad_norm": 39.91005325317383, + "learning_rate": 6.261260741237576e-06, + "loss": 0.2418, + "step": 16781 + }, + { + "epoch": 0.424677986689273, + "grad_norm": 6.78342342376709, + "learning_rate": 6.26087219925273e-06, + "loss": 0.2337, + "step": 16782 + }, + { + "epoch": 0.42470329225396664, + "grad_norm": 4.88775110244751, + "learning_rate": 6.2604836491367525e-06, + "loss": 0.1538, + "step": 16783 + }, + { + "epoch": 0.42472859781866035, + "grad_norm": 3.5303573608398438, + "learning_rate": 6.260095090892152e-06, + "loss": 0.1844, + "step": 16784 + }, + { + "epoch": 0.424753903383354, + "grad_norm": 8.994867324829102, + "learning_rate": 6.259706524521432e-06, + "loss": 0.2095, + "step": 16785 + }, + { + "epoch": 0.42477920894804766, + "grad_norm": 6.843161106109619, + "learning_rate": 6.259317950027099e-06, + "loss": 0.0927, + "step": 16786 + }, + { + "epoch": 0.42480451451274137, + "grad_norm": 10.600810050964355, + "learning_rate": 6.258929367411657e-06, + "loss": 0.2145, + "step": 16787 + }, + { + "epoch": 0.424829820077435, + "grad_norm": 8.065776824951172, + "learning_rate": 6.258540776677616e-06, + "loss": 0.2478, + "step": 16788 + }, + { + "epoch": 0.4248551256421287, + "grad_norm": 6.299200534820557, + "learning_rate": 6.258152177827479e-06, + "loss": 0.2115, + "step": 16789 + }, + { + "epoch": 0.4248804312068224, + "grad_norm": 5.175220489501953, + "learning_rate": 6.257763570863753e-06, + "loss": 0.2012, + "step": 16790 + }, + { + "epoch": 0.42490573677151605, + "grad_norm": 3.445725440979004, + "learning_rate": 6.257374955788943e-06, + "loss": 0.1512, + "step": 16791 + }, + { + "epoch": 0.4249310423362097, + "grad_norm": 6.739169120788574, + "learning_rate": 6.256986332605556e-06, + "loss": 0.2281, + "step": 16792 + }, + { + "epoch": 0.4249563479009034, + "grad_norm": 10.691543579101562, + "learning_rate": 6.256597701316099e-06, + "loss": 0.3253, + "step": 16793 + }, + { + "epoch": 0.4249816534655971, + "grad_norm": 2.2162234783172607, + "learning_rate": 6.2562090619230746e-06, + "loss": 0.0932, + "step": 16794 + }, + { + "epoch": 0.4250069590302908, + "grad_norm": 5.687121868133545, + "learning_rate": 6.2558204144289934e-06, + "loss": 0.2339, + "step": 16795 + }, + { + "epoch": 0.42503226459498444, + "grad_norm": 7.257637977600098, + "learning_rate": 6.255431758836359e-06, + "loss": 0.1428, + "step": 16796 + }, + { + "epoch": 0.4250575701596781, + "grad_norm": 8.920211791992188, + "learning_rate": 6.255043095147679e-06, + "loss": 0.3025, + "step": 16797 + }, + { + "epoch": 0.4250828757243718, + "grad_norm": 2.919984817504883, + "learning_rate": 6.254654423365461e-06, + "loss": 0.1426, + "step": 16798 + }, + { + "epoch": 0.42510818128906547, + "grad_norm": 4.131363391876221, + "learning_rate": 6.254265743492207e-06, + "loss": 0.1241, + "step": 16799 + }, + { + "epoch": 0.4251334868537591, + "grad_norm": 7.166411876678467, + "learning_rate": 6.25387705553043e-06, + "loss": 0.2198, + "step": 16800 + }, + { + "epoch": 0.42515879241845284, + "grad_norm": 2.9621829986572266, + "learning_rate": 6.253488359482631e-06, + "loss": 0.0999, + "step": 16801 + }, + { + "epoch": 0.4251840979831465, + "grad_norm": 3.0880420207977295, + "learning_rate": 6.25309965535132e-06, + "loss": 0.1002, + "step": 16802 + }, + { + "epoch": 0.42520940354784015, + "grad_norm": 2.508664131164551, + "learning_rate": 6.252710943139001e-06, + "loss": 0.0882, + "step": 16803 + }, + { + "epoch": 0.42523470911253386, + "grad_norm": 6.400160312652588, + "learning_rate": 6.252322222848183e-06, + "loss": 0.1904, + "step": 16804 + }, + { + "epoch": 0.4252600146772275, + "grad_norm": 3.3419313430786133, + "learning_rate": 6.251933494481371e-06, + "loss": 0.092, + "step": 16805 + }, + { + "epoch": 0.42528532024192117, + "grad_norm": 4.675371170043945, + "learning_rate": 6.251544758041073e-06, + "loss": 0.1721, + "step": 16806 + }, + { + "epoch": 0.4253106258066149, + "grad_norm": 8.655302047729492, + "learning_rate": 6.251156013529798e-06, + "loss": 0.2606, + "step": 16807 + }, + { + "epoch": 0.42533593137130854, + "grad_norm": 7.577735424041748, + "learning_rate": 6.250767260950047e-06, + "loss": 0.1754, + "step": 16808 + }, + { + "epoch": 0.42536123693600225, + "grad_norm": 5.635955333709717, + "learning_rate": 6.250378500304333e-06, + "loss": 0.0969, + "step": 16809 + }, + { + "epoch": 0.4253865425006959, + "grad_norm": 7.324398994445801, + "learning_rate": 6.24998973159516e-06, + "loss": 0.2879, + "step": 16810 + }, + { + "epoch": 0.42541184806538956, + "grad_norm": 4.355375289916992, + "learning_rate": 6.249600954825036e-06, + "loss": 0.1952, + "step": 16811 + }, + { + "epoch": 0.4254371536300833, + "grad_norm": 6.187519073486328, + "learning_rate": 6.249212169996467e-06, + "loss": 0.0891, + "step": 16812 + }, + { + "epoch": 0.42546245919477693, + "grad_norm": 5.053980350494385, + "learning_rate": 6.248823377111962e-06, + "loss": 0.165, + "step": 16813 + }, + { + "epoch": 0.4254877647594706, + "grad_norm": 6.200580596923828, + "learning_rate": 6.248434576174027e-06, + "loss": 0.2534, + "step": 16814 + }, + { + "epoch": 0.4255130703241643, + "grad_norm": 4.939740180969238, + "learning_rate": 6.2480457671851706e-06, + "loss": 0.1795, + "step": 16815 + }, + { + "epoch": 0.42553837588885796, + "grad_norm": 2.926276445388794, + "learning_rate": 6.247656950147898e-06, + "loss": 0.1495, + "step": 16816 + }, + { + "epoch": 0.4255636814535516, + "grad_norm": 11.341848373413086, + "learning_rate": 6.247268125064718e-06, + "loss": 0.2557, + "step": 16817 + }, + { + "epoch": 0.4255889870182453, + "grad_norm": 8.230217933654785, + "learning_rate": 6.246879291938138e-06, + "loss": 0.2365, + "step": 16818 + }, + { + "epoch": 0.425614292582939, + "grad_norm": 9.908304214477539, + "learning_rate": 6.2464904507706654e-06, + "loss": 0.2383, + "step": 16819 + }, + { + "epoch": 0.4256395981476327, + "grad_norm": 6.814682483673096, + "learning_rate": 6.246101601564809e-06, + "loss": 0.2481, + "step": 16820 + }, + { + "epoch": 0.42566490371232635, + "grad_norm": 9.286133766174316, + "learning_rate": 6.245712744323073e-06, + "loss": 0.2639, + "step": 16821 + }, + { + "epoch": 0.42569020927702, + "grad_norm": 3.6114370822906494, + "learning_rate": 6.245323879047968e-06, + "loss": 0.0959, + "step": 16822 + }, + { + "epoch": 0.4257155148417137, + "grad_norm": 5.439844131469727, + "learning_rate": 6.244935005742001e-06, + "loss": 0.1134, + "step": 16823 + }, + { + "epoch": 0.42574082040640737, + "grad_norm": 7.115851402282715, + "learning_rate": 6.24454612440768e-06, + "loss": 0.21, + "step": 16824 + }, + { + "epoch": 0.42576612597110103, + "grad_norm": 7.424627304077148, + "learning_rate": 6.244157235047512e-06, + "loss": 0.2247, + "step": 16825 + }, + { + "epoch": 0.42579143153579474, + "grad_norm": 9.523443222045898, + "learning_rate": 6.243768337664007e-06, + "loss": 0.2809, + "step": 16826 + }, + { + "epoch": 0.4258167371004884, + "grad_norm": 6.253267765045166, + "learning_rate": 6.243379432259668e-06, + "loss": 0.2624, + "step": 16827 + }, + { + "epoch": 0.42584204266518205, + "grad_norm": 6.2276692390441895, + "learning_rate": 6.242990518837009e-06, + "loss": 0.1975, + "step": 16828 + }, + { + "epoch": 0.42586734822987576, + "grad_norm": 5.833733558654785, + "learning_rate": 6.242601597398536e-06, + "loss": 0.2852, + "step": 16829 + }, + { + "epoch": 0.4258926537945694, + "grad_norm": 7.934849739074707, + "learning_rate": 6.242212667946755e-06, + "loss": 0.224, + "step": 16830 + }, + { + "epoch": 0.4259179593592631, + "grad_norm": 5.654536724090576, + "learning_rate": 6.2418237304841755e-06, + "loss": 0.1802, + "step": 16831 + }, + { + "epoch": 0.4259432649239568, + "grad_norm": 5.583306789398193, + "learning_rate": 6.241434785013305e-06, + "loss": 0.1328, + "step": 16832 + }, + { + "epoch": 0.42596857048865044, + "grad_norm": 7.000573635101318, + "learning_rate": 6.2410458315366555e-06, + "loss": 0.2289, + "step": 16833 + }, + { + "epoch": 0.42599387605334416, + "grad_norm": 4.099005699157715, + "learning_rate": 6.240656870056729e-06, + "loss": 0.1522, + "step": 16834 + }, + { + "epoch": 0.4260191816180378, + "grad_norm": 4.408815383911133, + "learning_rate": 6.240267900576039e-06, + "loss": 0.1304, + "step": 16835 + }, + { + "epoch": 0.42604448718273147, + "grad_norm": 6.3413825035095215, + "learning_rate": 6.239878923097091e-06, + "loss": 0.1738, + "step": 16836 + }, + { + "epoch": 0.4260697927474252, + "grad_norm": 5.8752312660217285, + "learning_rate": 6.239489937622396e-06, + "loss": 0.1101, + "step": 16837 + }, + { + "epoch": 0.42609509831211884, + "grad_norm": 3.694607973098755, + "learning_rate": 6.239100944154459e-06, + "loss": 0.1417, + "step": 16838 + }, + { + "epoch": 0.4261204038768125, + "grad_norm": 11.660273551940918, + "learning_rate": 6.2387119426957906e-06, + "loss": 0.256, + "step": 16839 + }, + { + "epoch": 0.4261457094415062, + "grad_norm": 3.870990514755249, + "learning_rate": 6.238322933248902e-06, + "loss": 0.1947, + "step": 16840 + }, + { + "epoch": 0.42617101500619986, + "grad_norm": 3.8676624298095703, + "learning_rate": 6.237933915816295e-06, + "loss": 0.1486, + "step": 16841 + }, + { + "epoch": 0.4261963205708935, + "grad_norm": 4.418107986450195, + "learning_rate": 6.2375448904004865e-06, + "loss": 0.1107, + "step": 16842 + }, + { + "epoch": 0.42622162613558723, + "grad_norm": 5.3491291999816895, + "learning_rate": 6.237155857003978e-06, + "loss": 0.1864, + "step": 16843 + }, + { + "epoch": 0.4262469317002809, + "grad_norm": 4.83969259262085, + "learning_rate": 6.236766815629283e-06, + "loss": 0.1908, + "step": 16844 + }, + { + "epoch": 0.42627223726497454, + "grad_norm": 4.100856781005859, + "learning_rate": 6.23637776627891e-06, + "loss": 0.1489, + "step": 16845 + }, + { + "epoch": 0.42629754282966825, + "grad_norm": 5.225693225860596, + "learning_rate": 6.235988708955365e-06, + "loss": 0.1454, + "step": 16846 + }, + { + "epoch": 0.4263228483943619, + "grad_norm": 4.358093738555908, + "learning_rate": 6.23559964366116e-06, + "loss": 0.1339, + "step": 16847 + }, + { + "epoch": 0.4263481539590556, + "grad_norm": 3.7416837215423584, + "learning_rate": 6.235210570398802e-06, + "loss": 0.1929, + "step": 16848 + }, + { + "epoch": 0.4263734595237493, + "grad_norm": 4.766019344329834, + "learning_rate": 6.234821489170801e-06, + "loss": 0.2339, + "step": 16849 + }, + { + "epoch": 0.42639876508844293, + "grad_norm": 4.054784774780273, + "learning_rate": 6.234432399979665e-06, + "loss": 0.1968, + "step": 16850 + }, + { + "epoch": 0.42642407065313664, + "grad_norm": 5.76554012298584, + "learning_rate": 6.234043302827906e-06, + "loss": 0.2517, + "step": 16851 + }, + { + "epoch": 0.4264493762178303, + "grad_norm": 5.976505756378174, + "learning_rate": 6.233654197718029e-06, + "loss": 0.1569, + "step": 16852 + }, + { + "epoch": 0.42647468178252396, + "grad_norm": 5.07197904586792, + "learning_rate": 6.2332650846525475e-06, + "loss": 0.179, + "step": 16853 + }, + { + "epoch": 0.42649998734721767, + "grad_norm": 4.47657585144043, + "learning_rate": 6.232875963633967e-06, + "loss": 0.1754, + "step": 16854 + }, + { + "epoch": 0.4265252929119113, + "grad_norm": 3.292304039001465, + "learning_rate": 6.2324868346648e-06, + "loss": 0.131, + "step": 16855 + }, + { + "epoch": 0.426550598476605, + "grad_norm": 5.433279037475586, + "learning_rate": 6.2320976977475545e-06, + "loss": 0.1944, + "step": 16856 + }, + { + "epoch": 0.4265759040412987, + "grad_norm": 6.248443126678467, + "learning_rate": 6.231708552884741e-06, + "loss": 0.165, + "step": 16857 + }, + { + "epoch": 0.42660120960599235, + "grad_norm": 4.610507965087891, + "learning_rate": 6.231319400078866e-06, + "loss": 0.1856, + "step": 16858 + }, + { + "epoch": 0.42662651517068606, + "grad_norm": 4.53454065322876, + "learning_rate": 6.2309302393324435e-06, + "loss": 0.117, + "step": 16859 + }, + { + "epoch": 0.4266518207353797, + "grad_norm": 4.140827655792236, + "learning_rate": 6.230541070647979e-06, + "loss": 0.1398, + "step": 16860 + }, + { + "epoch": 0.4266771263000734, + "grad_norm": 6.955878734588623, + "learning_rate": 6.2301518940279846e-06, + "loss": 0.1657, + "step": 16861 + }, + { + "epoch": 0.4267024318647671, + "grad_norm": 3.8081436157226562, + "learning_rate": 6.2297627094749705e-06, + "loss": 0.1517, + "step": 16862 + }, + { + "epoch": 0.42672773742946074, + "grad_norm": 6.984840393066406, + "learning_rate": 6.229373516991444e-06, + "loss": 0.1541, + "step": 16863 + }, + { + "epoch": 0.4267530429941544, + "grad_norm": 4.516531467437744, + "learning_rate": 6.228984316579918e-06, + "loss": 0.1815, + "step": 16864 + }, + { + "epoch": 0.4267783485588481, + "grad_norm": 7.987200736999512, + "learning_rate": 6.228595108242899e-06, + "loss": 0.2444, + "step": 16865 + }, + { + "epoch": 0.42680365412354176, + "grad_norm": 3.7656948566436768, + "learning_rate": 6.228205891982899e-06, + "loss": 0.1594, + "step": 16866 + }, + { + "epoch": 0.4268289596882354, + "grad_norm": 17.05943489074707, + "learning_rate": 6.2278166678024285e-06, + "loss": 0.288, + "step": 16867 + }, + { + "epoch": 0.42685426525292913, + "grad_norm": 7.159346580505371, + "learning_rate": 6.227427435703997e-06, + "loss": 0.2248, + "step": 16868 + }, + { + "epoch": 0.4268795708176228, + "grad_norm": 12.546538352966309, + "learning_rate": 6.227038195690113e-06, + "loss": 0.2156, + "step": 16869 + }, + { + "epoch": 0.42690487638231644, + "grad_norm": 5.721296787261963, + "learning_rate": 6.226648947763287e-06, + "loss": 0.1004, + "step": 16870 + }, + { + "epoch": 0.42693018194701016, + "grad_norm": 6.221960067749023, + "learning_rate": 6.226259691926034e-06, + "loss": 0.2228, + "step": 16871 + }, + { + "epoch": 0.4269554875117038, + "grad_norm": 3.898256778717041, + "learning_rate": 6.225870428180857e-06, + "loss": 0.0877, + "step": 16872 + }, + { + "epoch": 0.4269807930763975, + "grad_norm": 6.211648464202881, + "learning_rate": 6.225481156530271e-06, + "loss": 0.2366, + "step": 16873 + }, + { + "epoch": 0.4270060986410912, + "grad_norm": 2.697516918182373, + "learning_rate": 6.2250918769767845e-06, + "loss": 0.1215, + "step": 16874 + }, + { + "epoch": 0.42703140420578484, + "grad_norm": 2.919862747192383, + "learning_rate": 6.224702589522909e-06, + "loss": 0.1107, + "step": 16875 + }, + { + "epoch": 0.42705670977047855, + "grad_norm": 13.77723217010498, + "learning_rate": 6.224313294171154e-06, + "loss": 0.2364, + "step": 16876 + }, + { + "epoch": 0.4270820153351722, + "grad_norm": 7.4668989181518555, + "learning_rate": 6.223923990924029e-06, + "loss": 0.2223, + "step": 16877 + }, + { + "epoch": 0.42710732089986586, + "grad_norm": 4.689192771911621, + "learning_rate": 6.2235346797840475e-06, + "loss": 0.1288, + "step": 16878 + }, + { + "epoch": 0.4271326264645596, + "grad_norm": 7.3718390464782715, + "learning_rate": 6.223145360753719e-06, + "loss": 0.22, + "step": 16879 + }, + { + "epoch": 0.42715793202925323, + "grad_norm": 6.076222896575928, + "learning_rate": 6.222756033835552e-06, + "loss": 0.1344, + "step": 16880 + }, + { + "epoch": 0.4271832375939469, + "grad_norm": 3.447666645050049, + "learning_rate": 6.222366699032059e-06, + "loss": 0.1931, + "step": 16881 + }, + { + "epoch": 0.4272085431586406, + "grad_norm": 7.396284103393555, + "learning_rate": 6.221977356345751e-06, + "loss": 0.1416, + "step": 16882 + }, + { + "epoch": 0.42723384872333425, + "grad_norm": 5.57206916809082, + "learning_rate": 6.221588005779138e-06, + "loss": 0.1799, + "step": 16883 + }, + { + "epoch": 0.42725915428802796, + "grad_norm": 5.7830491065979, + "learning_rate": 6.221198647334731e-06, + "loss": 0.1945, + "step": 16884 + }, + { + "epoch": 0.4272844598527216, + "grad_norm": 4.11606502532959, + "learning_rate": 6.2208092810150415e-06, + "loss": 0.1252, + "step": 16885 + }, + { + "epoch": 0.4273097654174153, + "grad_norm": 7.681854248046875, + "learning_rate": 6.22041990682258e-06, + "loss": 0.1129, + "step": 16886 + }, + { + "epoch": 0.427335070982109, + "grad_norm": 4.429594039916992, + "learning_rate": 6.220030524759857e-06, + "loss": 0.1812, + "step": 16887 + }, + { + "epoch": 0.42736037654680264, + "grad_norm": 9.560883522033691, + "learning_rate": 6.219641134829385e-06, + "loss": 0.2603, + "step": 16888 + }, + { + "epoch": 0.4273856821114963, + "grad_norm": 12.52023696899414, + "learning_rate": 6.2192517370336734e-06, + "loss": 0.3085, + "step": 16889 + }, + { + "epoch": 0.42741098767619, + "grad_norm": 3.158911943435669, + "learning_rate": 6.218862331375235e-06, + "loss": 0.165, + "step": 16890 + }, + { + "epoch": 0.42743629324088367, + "grad_norm": 9.763028144836426, + "learning_rate": 6.21847291785658e-06, + "loss": 0.2589, + "step": 16891 + }, + { + "epoch": 0.4274615988055773, + "grad_norm": 13.107942581176758, + "learning_rate": 6.218083496480219e-06, + "loss": 0.4117, + "step": 16892 + }, + { + "epoch": 0.42748690437027104, + "grad_norm": 3.67366886138916, + "learning_rate": 6.217694067248664e-06, + "loss": 0.156, + "step": 16893 + }, + { + "epoch": 0.4275122099349647, + "grad_norm": 7.316140651702881, + "learning_rate": 6.217304630164427e-06, + "loss": 0.2089, + "step": 16894 + }, + { + "epoch": 0.42753751549965835, + "grad_norm": 3.496534824371338, + "learning_rate": 6.2169151852300195e-06, + "loss": 0.1709, + "step": 16895 + }, + { + "epoch": 0.42756282106435206, + "grad_norm": 10.02302074432373, + "learning_rate": 6.216525732447951e-06, + "loss": 0.2096, + "step": 16896 + }, + { + "epoch": 0.4275881266290457, + "grad_norm": 3.611103057861328, + "learning_rate": 6.216136271820736e-06, + "loss": 0.1765, + "step": 16897 + }, + { + "epoch": 0.42761343219373943, + "grad_norm": 4.2504987716674805, + "learning_rate": 6.215746803350883e-06, + "loss": 0.1168, + "step": 16898 + }, + { + "epoch": 0.4276387377584331, + "grad_norm": 8.43069076538086, + "learning_rate": 6.2153573270409055e-06, + "loss": 0.1256, + "step": 16899 + }, + { + "epoch": 0.42766404332312674, + "grad_norm": 4.230891704559326, + "learning_rate": 6.214967842893315e-06, + "loss": 0.1645, + "step": 16900 + }, + { + "epoch": 0.42768934888782045, + "grad_norm": 7.788119792938232, + "learning_rate": 6.2145783509106225e-06, + "loss": 0.2832, + "step": 16901 + }, + { + "epoch": 0.4277146544525141, + "grad_norm": 8.232802391052246, + "learning_rate": 6.2141888510953395e-06, + "loss": 0.2943, + "step": 16902 + }, + { + "epoch": 0.42773996001720777, + "grad_norm": 7.717423915863037, + "learning_rate": 6.21379934344998e-06, + "loss": 0.1402, + "step": 16903 + }, + { + "epoch": 0.4277652655819015, + "grad_norm": 2.5889387130737305, + "learning_rate": 6.213409827977052e-06, + "loss": 0.1447, + "step": 16904 + }, + { + "epoch": 0.42779057114659513, + "grad_norm": 2.9485437870025635, + "learning_rate": 6.213020304679073e-06, + "loss": 0.1212, + "step": 16905 + }, + { + "epoch": 0.4278158767112888, + "grad_norm": 9.340048789978027, + "learning_rate": 6.21263077355855e-06, + "loss": 0.243, + "step": 16906 + }, + { + "epoch": 0.4278411822759825, + "grad_norm": 6.194571495056152, + "learning_rate": 6.212241234617996e-06, + "loss": 0.1726, + "step": 16907 + }, + { + "epoch": 0.42786648784067616, + "grad_norm": 2.9239747524261475, + "learning_rate": 6.211851687859924e-06, + "loss": 0.1161, + "step": 16908 + }, + { + "epoch": 0.4278917934053698, + "grad_norm": 11.960965156555176, + "learning_rate": 6.211462133286846e-06, + "loss": 0.1443, + "step": 16909 + }, + { + "epoch": 0.4279170989700635, + "grad_norm": 6.20154333114624, + "learning_rate": 6.211072570901275e-06, + "loss": 0.1946, + "step": 16910 + }, + { + "epoch": 0.4279424045347572, + "grad_norm": 8.109938621520996, + "learning_rate": 6.2106830007057214e-06, + "loss": 0.1704, + "step": 16911 + }, + { + "epoch": 0.4279677100994509, + "grad_norm": 4.189765930175781, + "learning_rate": 6.210293422702699e-06, + "loss": 0.1629, + "step": 16912 + }, + { + "epoch": 0.42799301566414455, + "grad_norm": 6.052337646484375, + "learning_rate": 6.209903836894719e-06, + "loss": 0.2014, + "step": 16913 + }, + { + "epoch": 0.4280183212288382, + "grad_norm": 3.7653753757476807, + "learning_rate": 6.209514243284294e-06, + "loss": 0.1674, + "step": 16914 + }, + { + "epoch": 0.4280436267935319, + "grad_norm": 6.24968957901001, + "learning_rate": 6.209124641873937e-06, + "loss": 0.17, + "step": 16915 + }, + { + "epoch": 0.4280689323582256, + "grad_norm": 4.968007564544678, + "learning_rate": 6.208735032666159e-06, + "loss": 0.2109, + "step": 16916 + }, + { + "epoch": 0.42809423792291923, + "grad_norm": 2.8923397064208984, + "learning_rate": 6.2083454156634746e-06, + "loss": 0.1333, + "step": 16917 + }, + { + "epoch": 0.42811954348761294, + "grad_norm": 9.849477767944336, + "learning_rate": 6.207955790868396e-06, + "loss": 0.2458, + "step": 16918 + }, + { + "epoch": 0.4281448490523066, + "grad_norm": 7.533113479614258, + "learning_rate": 6.207566158283435e-06, + "loss": 0.0943, + "step": 16919 + }, + { + "epoch": 0.42817015461700025, + "grad_norm": 3.754892349243164, + "learning_rate": 6.207176517911103e-06, + "loss": 0.1302, + "step": 16920 + }, + { + "epoch": 0.42819546018169397, + "grad_norm": 5.274411201477051, + "learning_rate": 6.206786869753915e-06, + "loss": 0.168, + "step": 16921 + }, + { + "epoch": 0.4282207657463876, + "grad_norm": 5.134113788604736, + "learning_rate": 6.2063972138143836e-06, + "loss": 0.1475, + "step": 16922 + }, + { + "epoch": 0.42824607131108133, + "grad_norm": 4.999051570892334, + "learning_rate": 6.20600755009502e-06, + "loss": 0.2408, + "step": 16923 + }, + { + "epoch": 0.428271376875775, + "grad_norm": 5.27949857711792, + "learning_rate": 6.205617878598339e-06, + "loss": 0.2207, + "step": 16924 + }, + { + "epoch": 0.42829668244046865, + "grad_norm": 6.97127103805542, + "learning_rate": 6.205228199326851e-06, + "loss": 0.1848, + "step": 16925 + }, + { + "epoch": 0.42832198800516236, + "grad_norm": 9.812671661376953, + "learning_rate": 6.204838512283073e-06, + "loss": 0.3099, + "step": 16926 + }, + { + "epoch": 0.428347293569856, + "grad_norm": 7.335204601287842, + "learning_rate": 6.204448817469513e-06, + "loss": 0.1865, + "step": 16927 + }, + { + "epoch": 0.42837259913454967, + "grad_norm": 6.157294750213623, + "learning_rate": 6.2040591148886885e-06, + "loss": 0.2761, + "step": 16928 + }, + { + "epoch": 0.4283979046992434, + "grad_norm": 5.875093936920166, + "learning_rate": 6.20366940454311e-06, + "loss": 0.1506, + "step": 16929 + }, + { + "epoch": 0.42842321026393704, + "grad_norm": 3.830233573913574, + "learning_rate": 6.203279686435292e-06, + "loss": 0.1419, + "step": 16930 + }, + { + "epoch": 0.4284485158286307, + "grad_norm": 9.742680549621582, + "learning_rate": 6.202889960567746e-06, + "loss": 0.2182, + "step": 16931 + }, + { + "epoch": 0.4284738213933244, + "grad_norm": 2.8304290771484375, + "learning_rate": 6.202500226942987e-06, + "loss": 0.1527, + "step": 16932 + }, + { + "epoch": 0.42849912695801806, + "grad_norm": 5.490128993988037, + "learning_rate": 6.202110485563527e-06, + "loss": 0.1127, + "step": 16933 + }, + { + "epoch": 0.4285244325227117, + "grad_norm": 2.24048113822937, + "learning_rate": 6.201720736431882e-06, + "loss": 0.1017, + "step": 16934 + }, + { + "epoch": 0.42854973808740543, + "grad_norm": 7.997925281524658, + "learning_rate": 6.201330979550563e-06, + "loss": 0.2216, + "step": 16935 + }, + { + "epoch": 0.4285750436520991, + "grad_norm": 3.4565820693969727, + "learning_rate": 6.200941214922083e-06, + "loss": 0.2024, + "step": 16936 + }, + { + "epoch": 0.4286003492167928, + "grad_norm": 4.789795875549316, + "learning_rate": 6.200551442548956e-06, + "loss": 0.1826, + "step": 16937 + }, + { + "epoch": 0.42862565478148645, + "grad_norm": 2.6716153621673584, + "learning_rate": 6.2001616624336965e-06, + "loss": 0.1192, + "step": 16938 + }, + { + "epoch": 0.4286509603461801, + "grad_norm": 13.037659645080566, + "learning_rate": 6.19977187457882e-06, + "loss": 0.2701, + "step": 16939 + }, + { + "epoch": 0.4286762659108738, + "grad_norm": 5.776813507080078, + "learning_rate": 6.199382078986836e-06, + "loss": 0.2422, + "step": 16940 + }, + { + "epoch": 0.4287015714755675, + "grad_norm": 7.0880889892578125, + "learning_rate": 6.198992275660259e-06, + "loss": 0.1562, + "step": 16941 + }, + { + "epoch": 0.42872687704026113, + "grad_norm": 6.732861042022705, + "learning_rate": 6.198602464601604e-06, + "loss": 0.223, + "step": 16942 + }, + { + "epoch": 0.42875218260495485, + "grad_norm": 9.143133163452148, + "learning_rate": 6.1982126458133865e-06, + "loss": 0.179, + "step": 16943 + }, + { + "epoch": 0.4287774881696485, + "grad_norm": 7.762526512145996, + "learning_rate": 6.1978228192981175e-06, + "loss": 0.2074, + "step": 16944 + }, + { + "epoch": 0.42880279373434216, + "grad_norm": 6.668876647949219, + "learning_rate": 6.197432985058311e-06, + "loss": 0.2039, + "step": 16945 + }, + { + "epoch": 0.42882809929903587, + "grad_norm": 2.5797066688537598, + "learning_rate": 6.197043143096484e-06, + "loss": 0.1402, + "step": 16946 + }, + { + "epoch": 0.4288534048637295, + "grad_norm": 5.819618225097656, + "learning_rate": 6.1966532934151454e-06, + "loss": 0.1605, + "step": 16947 + }, + { + "epoch": 0.42887871042842324, + "grad_norm": 5.670480728149414, + "learning_rate": 6.196263436016816e-06, + "loss": 0.1908, + "step": 16948 + }, + { + "epoch": 0.4289040159931169, + "grad_norm": 5.709988594055176, + "learning_rate": 6.1958735709040026e-06, + "loss": 0.223, + "step": 16949 + }, + { + "epoch": 0.42892932155781055, + "grad_norm": 6.9101715087890625, + "learning_rate": 6.1954836980792266e-06, + "loss": 0.2486, + "step": 16950 + }, + { + "epoch": 0.42895462712250426, + "grad_norm": 4.423928260803223, + "learning_rate": 6.195093817544996e-06, + "loss": 0.2431, + "step": 16951 + }, + { + "epoch": 0.4289799326871979, + "grad_norm": 2.73193097114563, + "learning_rate": 6.194703929303829e-06, + "loss": 0.1631, + "step": 16952 + }, + { + "epoch": 0.4290052382518916, + "grad_norm": 3.6558127403259277, + "learning_rate": 6.194314033358237e-06, + "loss": 0.1159, + "step": 16953 + }, + { + "epoch": 0.4290305438165853, + "grad_norm": 4.195829391479492, + "learning_rate": 6.193924129710738e-06, + "loss": 0.1665, + "step": 16954 + }, + { + "epoch": 0.42905584938127894, + "grad_norm": 5.635091304779053, + "learning_rate": 6.193534218363843e-06, + "loss": 0.308, + "step": 16955 + }, + { + "epoch": 0.4290811549459726, + "grad_norm": 4.323437213897705, + "learning_rate": 6.193144299320068e-06, + "loss": 0.1245, + "step": 16956 + }, + { + "epoch": 0.4291064605106663, + "grad_norm": 4.359760761260986, + "learning_rate": 6.192754372581928e-06, + "loss": 0.133, + "step": 16957 + }, + { + "epoch": 0.42913176607535997, + "grad_norm": 2.708284854888916, + "learning_rate": 6.192364438151937e-06, + "loss": 0.1546, + "step": 16958 + }, + { + "epoch": 0.4291570716400536, + "grad_norm": 3.25964093208313, + "learning_rate": 6.191974496032609e-06, + "loss": 0.1547, + "step": 16959 + }, + { + "epoch": 0.42918237720474733, + "grad_norm": 4.265096664428711, + "learning_rate": 6.191584546226459e-06, + "loss": 0.1895, + "step": 16960 + }, + { + "epoch": 0.429207682769441, + "grad_norm": 4.785131454467773, + "learning_rate": 6.1911945887360026e-06, + "loss": 0.1781, + "step": 16961 + }, + { + "epoch": 0.4292329883341347, + "grad_norm": 3.223275899887085, + "learning_rate": 6.190804623563754e-06, + "loss": 0.1322, + "step": 16962 + }, + { + "epoch": 0.42925829389882836, + "grad_norm": 4.84380578994751, + "learning_rate": 6.1904146507122264e-06, + "loss": 0.1534, + "step": 16963 + }, + { + "epoch": 0.429283599463522, + "grad_norm": 4.29537296295166, + "learning_rate": 6.190024670183938e-06, + "loss": 0.1987, + "step": 16964 + }, + { + "epoch": 0.4293089050282157, + "grad_norm": 7.6193037033081055, + "learning_rate": 6.1896346819814e-06, + "loss": 0.1821, + "step": 16965 + }, + { + "epoch": 0.4293342105929094, + "grad_norm": 4.911333084106445, + "learning_rate": 6.189244686107131e-06, + "loss": 0.1358, + "step": 16966 + }, + { + "epoch": 0.42935951615760304, + "grad_norm": 2.866140127182007, + "learning_rate": 6.188854682563643e-06, + "loss": 0.1185, + "step": 16967 + }, + { + "epoch": 0.42938482172229675, + "grad_norm": 18.825681686401367, + "learning_rate": 6.188464671353454e-06, + "loss": 0.298, + "step": 16968 + }, + { + "epoch": 0.4294101272869904, + "grad_norm": 3.413177251815796, + "learning_rate": 6.188074652479074e-06, + "loss": 0.1474, + "step": 16969 + }, + { + "epoch": 0.42943543285168406, + "grad_norm": 7.07952356338501, + "learning_rate": 6.187684625943026e-06, + "loss": 0.2083, + "step": 16970 + }, + { + "epoch": 0.4294607384163778, + "grad_norm": 11.124515533447266, + "learning_rate": 6.187294591747817e-06, + "loss": 0.2484, + "step": 16971 + }, + { + "epoch": 0.42948604398107143, + "grad_norm": 17.767419815063477, + "learning_rate": 6.186904549895968e-06, + "loss": 0.2642, + "step": 16972 + }, + { + "epoch": 0.4295113495457651, + "grad_norm": 21.7273006439209, + "learning_rate": 6.186514500389991e-06, + "loss": 0.285, + "step": 16973 + }, + { + "epoch": 0.4295366551104588, + "grad_norm": 8.31013298034668, + "learning_rate": 6.186124443232405e-06, + "loss": 0.2295, + "step": 16974 + }, + { + "epoch": 0.42956196067515245, + "grad_norm": 4.309634685516357, + "learning_rate": 6.18573437842572e-06, + "loss": 0.1433, + "step": 16975 + }, + { + "epoch": 0.42958726623984617, + "grad_norm": 10.41352653503418, + "learning_rate": 6.185344305972458e-06, + "loss": 0.2241, + "step": 16976 + }, + { + "epoch": 0.4296125718045398, + "grad_norm": 3.2111504077911377, + "learning_rate": 6.1849542258751285e-06, + "loss": 0.1374, + "step": 16977 + }, + { + "epoch": 0.4296378773692335, + "grad_norm": 4.925569534301758, + "learning_rate": 6.18456413813625e-06, + "loss": 0.1941, + "step": 16978 + }, + { + "epoch": 0.4296631829339272, + "grad_norm": 16.07540512084961, + "learning_rate": 6.184174042758339e-06, + "loss": 0.2639, + "step": 16979 + }, + { + "epoch": 0.42968848849862085, + "grad_norm": 10.327054023742676, + "learning_rate": 6.183783939743908e-06, + "loss": 0.2324, + "step": 16980 + }, + { + "epoch": 0.4297137940633145, + "grad_norm": 19.804895401000977, + "learning_rate": 6.183393829095476e-06, + "loss": 0.1882, + "step": 16981 + }, + { + "epoch": 0.4297390996280082, + "grad_norm": 4.3148417472839355, + "learning_rate": 6.183003710815556e-06, + "loss": 0.1663, + "step": 16982 + }, + { + "epoch": 0.42976440519270187, + "grad_norm": 5.439178466796875, + "learning_rate": 6.1826135849066675e-06, + "loss": 0.1604, + "step": 16983 + }, + { + "epoch": 0.4297897107573955, + "grad_norm": 3.9084205627441406, + "learning_rate": 6.182223451371321e-06, + "loss": 0.1972, + "step": 16984 + }, + { + "epoch": 0.42981501632208924, + "grad_norm": 8.10000228881836, + "learning_rate": 6.1818333102120375e-06, + "loss": 0.1674, + "step": 16985 + }, + { + "epoch": 0.4298403218867829, + "grad_norm": 3.998556613922119, + "learning_rate": 6.1814431614313295e-06, + "loss": 0.1619, + "step": 16986 + }, + { + "epoch": 0.4298656274514766, + "grad_norm": 4.057188034057617, + "learning_rate": 6.181053005031715e-06, + "loss": 0.138, + "step": 16987 + }, + { + "epoch": 0.42989093301617026, + "grad_norm": 5.455568790435791, + "learning_rate": 6.18066284101571e-06, + "loss": 0.1108, + "step": 16988 + }, + { + "epoch": 0.4299162385808639, + "grad_norm": 5.315383434295654, + "learning_rate": 6.180272669385828e-06, + "loss": 0.1991, + "step": 16989 + }, + { + "epoch": 0.42994154414555763, + "grad_norm": 7.517213821411133, + "learning_rate": 6.17988249014459e-06, + "loss": 0.185, + "step": 16990 + }, + { + "epoch": 0.4299668497102513, + "grad_norm": 8.422274589538574, + "learning_rate": 6.179492303294508e-06, + "loss": 0.2755, + "step": 16991 + }, + { + "epoch": 0.42999215527494494, + "grad_norm": 4.903663158416748, + "learning_rate": 6.179102108838099e-06, + "loss": 0.1288, + "step": 16992 + }, + { + "epoch": 0.43001746083963865, + "grad_norm": 7.585780620574951, + "learning_rate": 6.17871190677788e-06, + "loss": 0.2641, + "step": 16993 + }, + { + "epoch": 0.4300427664043323, + "grad_norm": 3.2884533405303955, + "learning_rate": 6.178321697116367e-06, + "loss": 0.177, + "step": 16994 + }, + { + "epoch": 0.43006807196902597, + "grad_norm": 6.609457015991211, + "learning_rate": 6.177931479856078e-06, + "loss": 0.2858, + "step": 16995 + }, + { + "epoch": 0.4300933775337197, + "grad_norm": 4.2300591468811035, + "learning_rate": 6.1775412549995265e-06, + "loss": 0.1757, + "step": 16996 + }, + { + "epoch": 0.43011868309841333, + "grad_norm": 5.252183437347412, + "learning_rate": 6.17715102254923e-06, + "loss": 0.2179, + "step": 16997 + }, + { + "epoch": 0.430143988663107, + "grad_norm": 5.3312859535217285, + "learning_rate": 6.176760782507706e-06, + "loss": 0.1346, + "step": 16998 + }, + { + "epoch": 0.4301692942278007, + "grad_norm": 6.420897483825684, + "learning_rate": 6.176370534877472e-06, + "loss": 0.1994, + "step": 16999 + }, + { + "epoch": 0.43019459979249436, + "grad_norm": 12.387910842895508, + "learning_rate": 6.175980279661041e-06, + "loss": 0.2112, + "step": 17000 + }, + { + "epoch": 0.43021990535718807, + "grad_norm": 7.7332234382629395, + "learning_rate": 6.175590016860934e-06, + "loss": 0.2664, + "step": 17001 + }, + { + "epoch": 0.4302452109218817, + "grad_norm": 3.2321829795837402, + "learning_rate": 6.175199746479664e-06, + "loss": 0.1723, + "step": 17002 + }, + { + "epoch": 0.4302705164865754, + "grad_norm": 3.56392502784729, + "learning_rate": 6.174809468519751e-06, + "loss": 0.1076, + "step": 17003 + }, + { + "epoch": 0.4302958220512691, + "grad_norm": 4.338596820831299, + "learning_rate": 6.1744191829837085e-06, + "loss": 0.1032, + "step": 17004 + }, + { + "epoch": 0.43032112761596275, + "grad_norm": 4.999401569366455, + "learning_rate": 6.174028889874057e-06, + "loss": 0.2092, + "step": 17005 + }, + { + "epoch": 0.4303464331806564, + "grad_norm": 3.4003689289093018, + "learning_rate": 6.1736385891933105e-06, + "loss": 0.1164, + "step": 17006 + }, + { + "epoch": 0.4303717387453501, + "grad_norm": 5.058379173278809, + "learning_rate": 6.173248280943987e-06, + "loss": 0.1872, + "step": 17007 + }, + { + "epoch": 0.4303970443100438, + "grad_norm": 13.586429595947266, + "learning_rate": 6.172857965128604e-06, + "loss": 0.2128, + "step": 17008 + }, + { + "epoch": 0.43042234987473743, + "grad_norm": 7.161649227142334, + "learning_rate": 6.172467641749678e-06, + "loss": 0.2366, + "step": 17009 + }, + { + "epoch": 0.43044765543943114, + "grad_norm": 7.178647518157959, + "learning_rate": 6.172077310809725e-06, + "loss": 0.1253, + "step": 17010 + }, + { + "epoch": 0.4304729610041248, + "grad_norm": 4.136682033538818, + "learning_rate": 6.171686972311265e-06, + "loss": 0.2252, + "step": 17011 + }, + { + "epoch": 0.4304982665688185, + "grad_norm": 5.583125591278076, + "learning_rate": 6.171296626256814e-06, + "loss": 0.1725, + "step": 17012 + }, + { + "epoch": 0.43052357213351217, + "grad_norm": 5.352365493774414, + "learning_rate": 6.170906272648888e-06, + "loss": 0.1649, + "step": 17013 + }, + { + "epoch": 0.4305488776982058, + "grad_norm": 4.643167495727539, + "learning_rate": 6.170515911490006e-06, + "loss": 0.1352, + "step": 17014 + }, + { + "epoch": 0.43057418326289953, + "grad_norm": 2.8186159133911133, + "learning_rate": 6.170125542782683e-06, + "loss": 0.1622, + "step": 17015 + }, + { + "epoch": 0.4305994888275932, + "grad_norm": 8.13008975982666, + "learning_rate": 6.16973516652944e-06, + "loss": 0.1998, + "step": 17016 + }, + { + "epoch": 0.43062479439228685, + "grad_norm": 2.3908751010894775, + "learning_rate": 6.1693447827327915e-06, + "loss": 0.1175, + "step": 17017 + }, + { + "epoch": 0.43065009995698056, + "grad_norm": 4.906034469604492, + "learning_rate": 6.168954391395257e-06, + "loss": 0.1396, + "step": 17018 + }, + { + "epoch": 0.4306754055216742, + "grad_norm": 4.353033542633057, + "learning_rate": 6.168563992519352e-06, + "loss": 0.1533, + "step": 17019 + }, + { + "epoch": 0.43070071108636787, + "grad_norm": 3.9685676097869873, + "learning_rate": 6.168173586107595e-06, + "loss": 0.1665, + "step": 17020 + }, + { + "epoch": 0.4307260166510616, + "grad_norm": 2.1484155654907227, + "learning_rate": 6.167783172162505e-06, + "loss": 0.1098, + "step": 17021 + }, + { + "epoch": 0.43075132221575524, + "grad_norm": 4.551858901977539, + "learning_rate": 6.167392750686597e-06, + "loss": 0.1631, + "step": 17022 + }, + { + "epoch": 0.4307766277804489, + "grad_norm": 6.286726951599121, + "learning_rate": 6.1670023216823924e-06, + "loss": 0.1891, + "step": 17023 + }, + { + "epoch": 0.4308019333451426, + "grad_norm": 3.9551074504852295, + "learning_rate": 6.166611885152405e-06, + "loss": 0.131, + "step": 17024 + }, + { + "epoch": 0.43082723890983626, + "grad_norm": 6.0604424476623535, + "learning_rate": 6.166221441099156e-06, + "loss": 0.1597, + "step": 17025 + }, + { + "epoch": 0.43085254447453, + "grad_norm": 4.474228382110596, + "learning_rate": 6.16583098952516e-06, + "loss": 0.1719, + "step": 17026 + }, + { + "epoch": 0.43087785003922363, + "grad_norm": 9.63746166229248, + "learning_rate": 6.165440530432939e-06, + "loss": 0.216, + "step": 17027 + }, + { + "epoch": 0.4309031556039173, + "grad_norm": 4.998224258422852, + "learning_rate": 6.165050063825007e-06, + "loss": 0.1676, + "step": 17028 + }, + { + "epoch": 0.430928461168611, + "grad_norm": 4.959636688232422, + "learning_rate": 6.1646595897038855e-06, + "loss": 0.1839, + "step": 17029 + }, + { + "epoch": 0.43095376673330466, + "grad_norm": 11.159582138061523, + "learning_rate": 6.16426910807209e-06, + "loss": 0.1975, + "step": 17030 + }, + { + "epoch": 0.4309790722979983, + "grad_norm": 6.446164608001709, + "learning_rate": 6.1638786189321374e-06, + "loss": 0.1384, + "step": 17031 + }, + { + "epoch": 0.431004377862692, + "grad_norm": 4.720762729644775, + "learning_rate": 6.1634881222865515e-06, + "loss": 0.1783, + "step": 17032 + }, + { + "epoch": 0.4310296834273857, + "grad_norm": 9.776093482971191, + "learning_rate": 6.163097618137845e-06, + "loss": 0.1672, + "step": 17033 + }, + { + "epoch": 0.43105498899207934, + "grad_norm": 5.766147613525391, + "learning_rate": 6.16270710648854e-06, + "loss": 0.1689, + "step": 17034 + }, + { + "epoch": 0.43108029455677305, + "grad_norm": 7.532618522644043, + "learning_rate": 6.162316587341151e-06, + "loss": 0.1567, + "step": 17035 + }, + { + "epoch": 0.4311056001214667, + "grad_norm": 13.502399444580078, + "learning_rate": 6.161926060698201e-06, + "loss": 0.3086, + "step": 17036 + }, + { + "epoch": 0.43113090568616036, + "grad_norm": 4.352858543395996, + "learning_rate": 6.161535526562204e-06, + "loss": 0.1911, + "step": 17037 + }, + { + "epoch": 0.43115621125085407, + "grad_norm": 10.541683197021484, + "learning_rate": 6.161144984935683e-06, + "loss": 0.2255, + "step": 17038 + }, + { + "epoch": 0.4311815168155477, + "grad_norm": 3.4369451999664307, + "learning_rate": 6.160754435821151e-06, + "loss": 0.1853, + "step": 17039 + }, + { + "epoch": 0.43120682238024144, + "grad_norm": 4.292471885681152, + "learning_rate": 6.160363879221132e-06, + "loss": 0.1684, + "step": 17040 + }, + { + "epoch": 0.4312321279449351, + "grad_norm": 8.190987586975098, + "learning_rate": 6.15997331513814e-06, + "loss": 0.1914, + "step": 17041 + }, + { + "epoch": 0.43125743350962875, + "grad_norm": 4.496415615081787, + "learning_rate": 6.159582743574697e-06, + "loss": 0.1666, + "step": 17042 + }, + { + "epoch": 0.43128273907432246, + "grad_norm": 8.090195655822754, + "learning_rate": 6.159192164533321e-06, + "loss": 0.2491, + "step": 17043 + }, + { + "epoch": 0.4313080446390161, + "grad_norm": 4.968238353729248, + "learning_rate": 6.158801578016529e-06, + "loss": 0.1294, + "step": 17044 + }, + { + "epoch": 0.4313333502037098, + "grad_norm": 3.61598539352417, + "learning_rate": 6.158410984026843e-06, + "loss": 0.159, + "step": 17045 + }, + { + "epoch": 0.4313586557684035, + "grad_norm": 3.0179314613342285, + "learning_rate": 6.158020382566778e-06, + "loss": 0.1301, + "step": 17046 + }, + { + "epoch": 0.43138396133309714, + "grad_norm": 3.5110507011413574, + "learning_rate": 6.157629773638856e-06, + "loss": 0.1655, + "step": 17047 + }, + { + "epoch": 0.4314092668977908, + "grad_norm": 7.276761054992676, + "learning_rate": 6.157239157245594e-06, + "loss": 0.2076, + "step": 17048 + }, + { + "epoch": 0.4314345724624845, + "grad_norm": 5.679546356201172, + "learning_rate": 6.156848533389515e-06, + "loss": 0.1781, + "step": 17049 + }, + { + "epoch": 0.43145987802717817, + "grad_norm": 8.778738975524902, + "learning_rate": 6.156457902073132e-06, + "loss": 0.2644, + "step": 17050 + }, + { + "epoch": 0.4314851835918719, + "grad_norm": 5.2826948165893555, + "learning_rate": 6.1560672632989685e-06, + "loss": 0.2135, + "step": 17051 + }, + { + "epoch": 0.43151048915656554, + "grad_norm": 4.8181867599487305, + "learning_rate": 6.155676617069543e-06, + "loss": 0.1914, + "step": 17052 + }, + { + "epoch": 0.4315357947212592, + "grad_norm": 9.225372314453125, + "learning_rate": 6.155285963387372e-06, + "loss": 0.2202, + "step": 17053 + }, + { + "epoch": 0.4315611002859529, + "grad_norm": 4.054037570953369, + "learning_rate": 6.1548953022549775e-06, + "loss": 0.1444, + "step": 17054 + }, + { + "epoch": 0.43158640585064656, + "grad_norm": 3.389511823654175, + "learning_rate": 6.154504633674877e-06, + "loss": 0.1522, + "step": 17055 + }, + { + "epoch": 0.4316117114153402, + "grad_norm": 3.7504961490631104, + "learning_rate": 6.154113957649592e-06, + "loss": 0.1433, + "step": 17056 + }, + { + "epoch": 0.4316370169800339, + "grad_norm": 3.405566692352295, + "learning_rate": 6.153723274181641e-06, + "loss": 0.1512, + "step": 17057 + }, + { + "epoch": 0.4316623225447276, + "grad_norm": 2.96138858795166, + "learning_rate": 6.1533325832735435e-06, + "loss": 0.1129, + "step": 17058 + }, + { + "epoch": 0.43168762810942124, + "grad_norm": 4.305537700653076, + "learning_rate": 6.152941884927818e-06, + "loss": 0.1272, + "step": 17059 + }, + { + "epoch": 0.43171293367411495, + "grad_norm": 9.216772079467773, + "learning_rate": 6.152551179146985e-06, + "loss": 0.2272, + "step": 17060 + }, + { + "epoch": 0.4317382392388086, + "grad_norm": 4.903234004974365, + "learning_rate": 6.1521604659335635e-06, + "loss": 0.2122, + "step": 17061 + }, + { + "epoch": 0.43176354480350226, + "grad_norm": 2.457115411758423, + "learning_rate": 6.151769745290072e-06, + "loss": 0.0888, + "step": 17062 + }, + { + "epoch": 0.431788850368196, + "grad_norm": 5.337090015411377, + "learning_rate": 6.151379017219035e-06, + "loss": 0.2478, + "step": 17063 + }, + { + "epoch": 0.43181415593288963, + "grad_norm": 4.376774311065674, + "learning_rate": 6.150988281722966e-06, + "loss": 0.1112, + "step": 17064 + }, + { + "epoch": 0.43183946149758334, + "grad_norm": 2.9200868606567383, + "learning_rate": 6.150597538804389e-06, + "loss": 0.1567, + "step": 17065 + }, + { + "epoch": 0.431864767062277, + "grad_norm": 4.793420314788818, + "learning_rate": 6.150206788465822e-06, + "loss": 0.1791, + "step": 17066 + }, + { + "epoch": 0.43189007262697066, + "grad_norm": 11.602654457092285, + "learning_rate": 6.149816030709786e-06, + "loss": 0.0981, + "step": 17067 + }, + { + "epoch": 0.43191537819166437, + "grad_norm": 4.513610363006592, + "learning_rate": 6.1494252655388e-06, + "loss": 0.1362, + "step": 17068 + }, + { + "epoch": 0.431940683756358, + "grad_norm": 6.594783782958984, + "learning_rate": 6.149034492955385e-06, + "loss": 0.1343, + "step": 17069 + }, + { + "epoch": 0.4319659893210517, + "grad_norm": 5.918323993682861, + "learning_rate": 6.148643712962058e-06, + "loss": 0.1297, + "step": 17070 + }, + { + "epoch": 0.4319912948857454, + "grad_norm": 4.080094814300537, + "learning_rate": 6.148252925561344e-06, + "loss": 0.1549, + "step": 17071 + }, + { + "epoch": 0.43201660045043905, + "grad_norm": 4.844101905822754, + "learning_rate": 6.147862130755758e-06, + "loss": 0.1909, + "step": 17072 + }, + { + "epoch": 0.4320419060151327, + "grad_norm": 5.388741970062256, + "learning_rate": 6.147471328547824e-06, + "loss": 0.134, + "step": 17073 + }, + { + "epoch": 0.4320672115798264, + "grad_norm": 4.662194728851318, + "learning_rate": 6.1470805189400616e-06, + "loss": 0.1541, + "step": 17074 + }, + { + "epoch": 0.43209251714452007, + "grad_norm": 11.730843544006348, + "learning_rate": 6.1466897019349894e-06, + "loss": 0.1911, + "step": 17075 + }, + { + "epoch": 0.4321178227092138, + "grad_norm": 5.179938316345215, + "learning_rate": 6.146298877535128e-06, + "loss": 0.1103, + "step": 17076 + }, + { + "epoch": 0.43214312827390744, + "grad_norm": 4.93917179107666, + "learning_rate": 6.145908045743e-06, + "loss": 0.2444, + "step": 17077 + }, + { + "epoch": 0.4321684338386011, + "grad_norm": 6.081783294677734, + "learning_rate": 6.1455172065611236e-06, + "loss": 0.2583, + "step": 17078 + }, + { + "epoch": 0.4321937394032948, + "grad_norm": 7.719630718231201, + "learning_rate": 6.145126359992018e-06, + "loss": 0.249, + "step": 17079 + }, + { + "epoch": 0.43221904496798846, + "grad_norm": 4.562711715698242, + "learning_rate": 6.1447355060382084e-06, + "loss": 0.1277, + "step": 17080 + }, + { + "epoch": 0.4322443505326821, + "grad_norm": 4.764218330383301, + "learning_rate": 6.144344644702212e-06, + "loss": 0.1366, + "step": 17081 + }, + { + "epoch": 0.43226965609737583, + "grad_norm": 6.18129825592041, + "learning_rate": 6.143953775986548e-06, + "loss": 0.2373, + "step": 17082 + }, + { + "epoch": 0.4322949616620695, + "grad_norm": 12.658524513244629, + "learning_rate": 6.14356289989374e-06, + "loss": 0.3181, + "step": 17083 + }, + { + "epoch": 0.43232026722676314, + "grad_norm": 4.324499130249023, + "learning_rate": 6.1431720164263075e-06, + "loss": 0.1703, + "step": 17084 + }, + { + "epoch": 0.43234557279145686, + "grad_norm": 4.949752330780029, + "learning_rate": 6.142781125586772e-06, + "loss": 0.1393, + "step": 17085 + }, + { + "epoch": 0.4323708783561505, + "grad_norm": 4.667751312255859, + "learning_rate": 6.142390227377652e-06, + "loss": 0.2054, + "step": 17086 + }, + { + "epoch": 0.43239618392084417, + "grad_norm": 6.318285942077637, + "learning_rate": 6.141999321801472e-06, + "loss": 0.1983, + "step": 17087 + }, + { + "epoch": 0.4324214894855379, + "grad_norm": 14.494661331176758, + "learning_rate": 6.141608408860749e-06, + "loss": 0.2042, + "step": 17088 + }, + { + "epoch": 0.43244679505023154, + "grad_norm": 8.265984535217285, + "learning_rate": 6.141217488558006e-06, + "loss": 0.2338, + "step": 17089 + }, + { + "epoch": 0.43247210061492525, + "grad_norm": 7.209480285644531, + "learning_rate": 6.140826560895764e-06, + "loss": 0.1473, + "step": 17090 + }, + { + "epoch": 0.4324974061796189, + "grad_norm": 3.373816967010498, + "learning_rate": 6.140435625876544e-06, + "loss": 0.1657, + "step": 17091 + }, + { + "epoch": 0.43252271174431256, + "grad_norm": 4.19316291809082, + "learning_rate": 6.140044683502866e-06, + "loss": 0.1953, + "step": 17092 + }, + { + "epoch": 0.43254801730900627, + "grad_norm": 6.709948539733887, + "learning_rate": 6.139653733777253e-06, + "loss": 0.2276, + "step": 17093 + }, + { + "epoch": 0.43257332287369993, + "grad_norm": 3.701366424560547, + "learning_rate": 6.139262776702224e-06, + "loss": 0.1534, + "step": 17094 + }, + { + "epoch": 0.4325986284383936, + "grad_norm": 3.01282000541687, + "learning_rate": 6.1388718122803014e-06, + "loss": 0.1394, + "step": 17095 + }, + { + "epoch": 0.4326239340030873, + "grad_norm": 4.15083122253418, + "learning_rate": 6.138480840514007e-06, + "loss": 0.2098, + "step": 17096 + }, + { + "epoch": 0.43264923956778095, + "grad_norm": 5.628993511199951, + "learning_rate": 6.13808986140586e-06, + "loss": 0.1699, + "step": 17097 + }, + { + "epoch": 0.4326745451324746, + "grad_norm": 7.538201332092285, + "learning_rate": 6.137698874958385e-06, + "loss": 0.1746, + "step": 17098 + }, + { + "epoch": 0.4326998506971683, + "grad_norm": 8.230688095092773, + "learning_rate": 6.137307881174101e-06, + "loss": 0.1585, + "step": 17099 + }, + { + "epoch": 0.432725156261862, + "grad_norm": 8.332822799682617, + "learning_rate": 6.1369168800555305e-06, + "loss": 0.1692, + "step": 17100 + }, + { + "epoch": 0.43275046182655563, + "grad_norm": 4.037268161773682, + "learning_rate": 6.136525871605193e-06, + "loss": 0.144, + "step": 17101 + }, + { + "epoch": 0.43277576739124934, + "grad_norm": 3.0001466274261475, + "learning_rate": 6.136134855825613e-06, + "loss": 0.1184, + "step": 17102 + }, + { + "epoch": 0.432801072955943, + "grad_norm": 5.692552089691162, + "learning_rate": 6.13574383271931e-06, + "loss": 0.1946, + "step": 17103 + }, + { + "epoch": 0.4328263785206367, + "grad_norm": 3.4389235973358154, + "learning_rate": 6.135352802288806e-06, + "loss": 0.1984, + "step": 17104 + }, + { + "epoch": 0.43285168408533037, + "grad_norm": 5.8061017990112305, + "learning_rate": 6.134961764536624e-06, + "loss": 0.1567, + "step": 17105 + }, + { + "epoch": 0.432876989650024, + "grad_norm": 3.4434306621551514, + "learning_rate": 6.1345707194652835e-06, + "loss": 0.1041, + "step": 17106 + }, + { + "epoch": 0.43290229521471774, + "grad_norm": 9.342938423156738, + "learning_rate": 6.1341796670773075e-06, + "loss": 0.1811, + "step": 17107 + }, + { + "epoch": 0.4329276007794114, + "grad_norm": 4.056958198547363, + "learning_rate": 6.13378860737522e-06, + "loss": 0.1244, + "step": 17108 + }, + { + "epoch": 0.43295290634410505, + "grad_norm": 3.5618534088134766, + "learning_rate": 6.133397540361539e-06, + "loss": 0.1693, + "step": 17109 + }, + { + "epoch": 0.43297821190879876, + "grad_norm": 2.6101415157318115, + "learning_rate": 6.133006466038788e-06, + "loss": 0.1275, + "step": 17110 + }, + { + "epoch": 0.4330035174734924, + "grad_norm": 5.458543300628662, + "learning_rate": 6.132615384409488e-06, + "loss": 0.2187, + "step": 17111 + }, + { + "epoch": 0.4330288230381861, + "grad_norm": 11.772298812866211, + "learning_rate": 6.132224295476165e-06, + "loss": 0.221, + "step": 17112 + }, + { + "epoch": 0.4330541286028798, + "grad_norm": 3.454089879989624, + "learning_rate": 6.131833199241336e-06, + "loss": 0.1499, + "step": 17113 + }, + { + "epoch": 0.43307943416757344, + "grad_norm": 9.572577476501465, + "learning_rate": 6.131442095707525e-06, + "loss": 0.2047, + "step": 17114 + }, + { + "epoch": 0.43310473973226715, + "grad_norm": 7.916538238525391, + "learning_rate": 6.131050984877255e-06, + "loss": 0.2412, + "step": 17115 + }, + { + "epoch": 0.4331300452969608, + "grad_norm": 5.315547943115234, + "learning_rate": 6.130659866753048e-06, + "loss": 0.2365, + "step": 17116 + }, + { + "epoch": 0.43315535086165446, + "grad_norm": 3.689087152481079, + "learning_rate": 6.130268741337425e-06, + "loss": 0.1489, + "step": 17117 + }, + { + "epoch": 0.4331806564263482, + "grad_norm": 9.363164901733398, + "learning_rate": 6.12987760863291e-06, + "loss": 0.2699, + "step": 17118 + }, + { + "epoch": 0.43320596199104183, + "grad_norm": 3.1413636207580566, + "learning_rate": 6.129486468642024e-06, + "loss": 0.1576, + "step": 17119 + }, + { + "epoch": 0.4332312675557355, + "grad_norm": 4.7128214836120605, + "learning_rate": 6.12909532136729e-06, + "loss": 0.1706, + "step": 17120 + }, + { + "epoch": 0.4332565731204292, + "grad_norm": 20.354516983032227, + "learning_rate": 6.12870416681123e-06, + "loss": 0.0899, + "step": 17121 + }, + { + "epoch": 0.43328187868512286, + "grad_norm": 4.350188732147217, + "learning_rate": 6.128313004976368e-06, + "loss": 0.1818, + "step": 17122 + }, + { + "epoch": 0.4333071842498165, + "grad_norm": 9.282391548156738, + "learning_rate": 6.127921835865223e-06, + "loss": 0.2003, + "step": 17123 + }, + { + "epoch": 0.4333324898145102, + "grad_norm": 3.6931345462799072, + "learning_rate": 6.127530659480322e-06, + "loss": 0.1944, + "step": 17124 + }, + { + "epoch": 0.4333577953792039, + "grad_norm": 6.399353981018066, + "learning_rate": 6.1271394758241846e-06, + "loss": 0.171, + "step": 17125 + }, + { + "epoch": 0.43338310094389754, + "grad_norm": 3.855135679244995, + "learning_rate": 6.126748284899333e-06, + "loss": 0.2249, + "step": 17126 + }, + { + "epoch": 0.43340840650859125, + "grad_norm": 4.838399887084961, + "learning_rate": 6.126357086708295e-06, + "loss": 0.1987, + "step": 17127 + }, + { + "epoch": 0.4334337120732849, + "grad_norm": 4.152438640594482, + "learning_rate": 6.125965881253586e-06, + "loss": 0.1713, + "step": 17128 + }, + { + "epoch": 0.4334590176379786, + "grad_norm": 21.156763076782227, + "learning_rate": 6.125574668537735e-06, + "loss": 0.2594, + "step": 17129 + }, + { + "epoch": 0.4334843232026723, + "grad_norm": 4.109052658081055, + "learning_rate": 6.12518344856326e-06, + "loss": 0.1226, + "step": 17130 + }, + { + "epoch": 0.43350962876736593, + "grad_norm": 3.2352912425994873, + "learning_rate": 6.124792221332688e-06, + "loss": 0.1773, + "step": 17131 + }, + { + "epoch": 0.43353493433205964, + "grad_norm": 3.6320955753326416, + "learning_rate": 6.12440098684854e-06, + "loss": 0.1869, + "step": 17132 + }, + { + "epoch": 0.4335602398967533, + "grad_norm": 3.24759578704834, + "learning_rate": 6.124009745113338e-06, + "loss": 0.1388, + "step": 17133 + }, + { + "epoch": 0.43358554546144695, + "grad_norm": 4.547995090484619, + "learning_rate": 6.123618496129607e-06, + "loss": 0.1086, + "step": 17134 + }, + { + "epoch": 0.43361085102614066, + "grad_norm": 7.409869194030762, + "learning_rate": 6.12322723989987e-06, + "loss": 0.2051, + "step": 17135 + }, + { + "epoch": 0.4336361565908343, + "grad_norm": 4.467811107635498, + "learning_rate": 6.122835976426649e-06, + "loss": 0.1157, + "step": 17136 + }, + { + "epoch": 0.433661462155528, + "grad_norm": 4.369858264923096, + "learning_rate": 6.122444705712468e-06, + "loss": 0.0879, + "step": 17137 + }, + { + "epoch": 0.4336867677202217, + "grad_norm": 12.839949607849121, + "learning_rate": 6.122053427759849e-06, + "loss": 0.2892, + "step": 17138 + }, + { + "epoch": 0.43371207328491534, + "grad_norm": 5.45657205581665, + "learning_rate": 6.121662142571317e-06, + "loss": 0.1643, + "step": 17139 + }, + { + "epoch": 0.43373737884960906, + "grad_norm": 4.86348819732666, + "learning_rate": 6.121270850149393e-06, + "loss": 0.2268, + "step": 17140 + }, + { + "epoch": 0.4337626844143027, + "grad_norm": 6.244470119476318, + "learning_rate": 6.120879550496604e-06, + "loss": 0.2173, + "step": 17141 + }, + { + "epoch": 0.43378798997899637, + "grad_norm": 3.869917154312134, + "learning_rate": 6.120488243615471e-06, + "loss": 0.1726, + "step": 17142 + }, + { + "epoch": 0.4338132955436901, + "grad_norm": 3.9471659660339355, + "learning_rate": 6.1200969295085165e-06, + "loss": 0.1284, + "step": 17143 + }, + { + "epoch": 0.43383860110838374, + "grad_norm": 4.132373809814453, + "learning_rate": 6.119705608178266e-06, + "loss": 0.1353, + "step": 17144 + }, + { + "epoch": 0.4338639066730774, + "grad_norm": 6.008312702178955, + "learning_rate": 6.119314279627242e-06, + "loss": 0.1408, + "step": 17145 + }, + { + "epoch": 0.4338892122377711, + "grad_norm": 4.122181415557861, + "learning_rate": 6.118922943857969e-06, + "loss": 0.1485, + "step": 17146 + }, + { + "epoch": 0.43391451780246476, + "grad_norm": 6.1600022315979, + "learning_rate": 6.118531600872969e-06, + "loss": 0.1731, + "step": 17147 + }, + { + "epoch": 0.4339398233671584, + "grad_norm": 3.2388999462127686, + "learning_rate": 6.118140250674767e-06, + "loss": 0.1173, + "step": 17148 + }, + { + "epoch": 0.43396512893185213, + "grad_norm": 7.360684871673584, + "learning_rate": 6.117748893265887e-06, + "loss": 0.1227, + "step": 17149 + }, + { + "epoch": 0.4339904344965458, + "grad_norm": 5.58213996887207, + "learning_rate": 6.117357528648852e-06, + "loss": 0.1654, + "step": 17150 + }, + { + "epoch": 0.43401574006123944, + "grad_norm": 6.649238586425781, + "learning_rate": 6.116966156826186e-06, + "loss": 0.2668, + "step": 17151 + }, + { + "epoch": 0.43404104562593315, + "grad_norm": 5.609832763671875, + "learning_rate": 6.116574777800413e-06, + "loss": 0.1157, + "step": 17152 + }, + { + "epoch": 0.4340663511906268, + "grad_norm": 5.729649543762207, + "learning_rate": 6.1161833915740575e-06, + "loss": 0.2361, + "step": 17153 + }, + { + "epoch": 0.4340916567553205, + "grad_norm": 17.41558265686035, + "learning_rate": 6.115791998149643e-06, + "loss": 0.1388, + "step": 17154 + }, + { + "epoch": 0.4341169623200142, + "grad_norm": 6.155879974365234, + "learning_rate": 6.115400597529694e-06, + "loss": 0.1135, + "step": 17155 + }, + { + "epoch": 0.43414226788470783, + "grad_norm": 3.177696943283081, + "learning_rate": 6.115009189716732e-06, + "loss": 0.1225, + "step": 17156 + }, + { + "epoch": 0.43416757344940154, + "grad_norm": 4.716815948486328, + "learning_rate": 6.114617774713285e-06, + "loss": 0.1964, + "step": 17157 + }, + { + "epoch": 0.4341928790140952, + "grad_norm": 5.530507564544678, + "learning_rate": 6.114226352521874e-06, + "loss": 0.2265, + "step": 17158 + }, + { + "epoch": 0.43421818457878886, + "grad_norm": 5.2012505531311035, + "learning_rate": 6.113834923145024e-06, + "loss": 0.1814, + "step": 17159 + }, + { + "epoch": 0.43424349014348257, + "grad_norm": 6.566393852233887, + "learning_rate": 6.113443486585262e-06, + "loss": 0.1429, + "step": 17160 + }, + { + "epoch": 0.4342687957081762, + "grad_norm": 6.465014934539795, + "learning_rate": 6.113052042845109e-06, + "loss": 0.1908, + "step": 17161 + }, + { + "epoch": 0.4342941012728699, + "grad_norm": 7.446199417114258, + "learning_rate": 6.112660591927091e-06, + "loss": 0.1379, + "step": 17162 + }, + { + "epoch": 0.4343194068375636, + "grad_norm": 2.97672700881958, + "learning_rate": 6.11226913383373e-06, + "loss": 0.16, + "step": 17163 + }, + { + "epoch": 0.43434471240225725, + "grad_norm": 5.916031837463379, + "learning_rate": 6.111877668567554e-06, + "loss": 0.2154, + "step": 17164 + }, + { + "epoch": 0.4343700179669509, + "grad_norm": 3.7201287746429443, + "learning_rate": 6.1114861961310844e-06, + "loss": 0.1108, + "step": 17165 + }, + { + "epoch": 0.4343953235316446, + "grad_norm": 5.978524684906006, + "learning_rate": 6.111094716526848e-06, + "loss": 0.2155, + "step": 17166 + }, + { + "epoch": 0.4344206290963383, + "grad_norm": 3.6623613834381104, + "learning_rate": 6.110703229757369e-06, + "loss": 0.1079, + "step": 17167 + }, + { + "epoch": 0.434445934661032, + "grad_norm": 5.69042444229126, + "learning_rate": 6.1103117358251715e-06, + "loss": 0.204, + "step": 17168 + }, + { + "epoch": 0.43447124022572564, + "grad_norm": 4.437573432922363, + "learning_rate": 6.10992023473278e-06, + "loss": 0.1437, + "step": 17169 + }, + { + "epoch": 0.4344965457904193, + "grad_norm": 5.096078872680664, + "learning_rate": 6.109528726482719e-06, + "loss": 0.2145, + "step": 17170 + }, + { + "epoch": 0.434521851355113, + "grad_norm": 11.977147102355957, + "learning_rate": 6.109137211077514e-06, + "loss": 0.3582, + "step": 17171 + }, + { + "epoch": 0.43454715691980667, + "grad_norm": 6.488426208496094, + "learning_rate": 6.108745688519687e-06, + "loss": 0.2108, + "step": 17172 + }, + { + "epoch": 0.4345724624845003, + "grad_norm": 6.434595584869385, + "learning_rate": 6.108354158811768e-06, + "loss": 0.1968, + "step": 17173 + }, + { + "epoch": 0.43459776804919403, + "grad_norm": 4.687381267547607, + "learning_rate": 6.107962621956278e-06, + "loss": 0.1367, + "step": 17174 + }, + { + "epoch": 0.4346230736138877, + "grad_norm": 6.838201522827148, + "learning_rate": 6.107571077955744e-06, + "loss": 0.2052, + "step": 17175 + }, + { + "epoch": 0.43464837917858135, + "grad_norm": 5.828670978546143, + "learning_rate": 6.10717952681269e-06, + "loss": 0.1886, + "step": 17176 + }, + { + "epoch": 0.43467368474327506, + "grad_norm": 4.591522693634033, + "learning_rate": 6.1067879685296405e-06, + "loss": 0.1756, + "step": 17177 + }, + { + "epoch": 0.4346989903079687, + "grad_norm": 3.400949716567993, + "learning_rate": 6.106396403109122e-06, + "loss": 0.1184, + "step": 17178 + }, + { + "epoch": 0.4347242958726624, + "grad_norm": 5.6375837326049805, + "learning_rate": 6.106004830553658e-06, + "loss": 0.1668, + "step": 17179 + }, + { + "epoch": 0.4347496014373561, + "grad_norm": 6.0338263511657715, + "learning_rate": 6.105613250865775e-06, + "loss": 0.2216, + "step": 17180 + }, + { + "epoch": 0.43477490700204974, + "grad_norm": 5.226568698883057, + "learning_rate": 6.105221664047997e-06, + "loss": 0.1508, + "step": 17181 + }, + { + "epoch": 0.43480021256674345, + "grad_norm": 8.612467765808105, + "learning_rate": 6.104830070102851e-06, + "loss": 0.2258, + "step": 17182 + }, + { + "epoch": 0.4348255181314371, + "grad_norm": 36.186866760253906, + "learning_rate": 6.1044384690328615e-06, + "loss": 0.3601, + "step": 17183 + }, + { + "epoch": 0.43485082369613076, + "grad_norm": 5.185696125030518, + "learning_rate": 6.1040468608405545e-06, + "loss": 0.1845, + "step": 17184 + }, + { + "epoch": 0.4348761292608245, + "grad_norm": 3.4148120880126953, + "learning_rate": 6.1036552455284536e-06, + "loss": 0.1398, + "step": 17185 + }, + { + "epoch": 0.43490143482551813, + "grad_norm": 6.340676307678223, + "learning_rate": 6.103263623099086e-06, + "loss": 0.1518, + "step": 17186 + }, + { + "epoch": 0.4349267403902118, + "grad_norm": 5.259401321411133, + "learning_rate": 6.102871993554975e-06, + "loss": 0.1735, + "step": 17187 + }, + { + "epoch": 0.4349520459549055, + "grad_norm": 6.932137966156006, + "learning_rate": 6.10248035689865e-06, + "loss": 0.0928, + "step": 17188 + }, + { + "epoch": 0.43497735151959915, + "grad_norm": 2.9616777896881104, + "learning_rate": 6.102088713132632e-06, + "loss": 0.132, + "step": 17189 + }, + { + "epoch": 0.4350026570842928, + "grad_norm": 2.9642415046691895, + "learning_rate": 6.10169706225945e-06, + "loss": 0.107, + "step": 17190 + }, + { + "epoch": 0.4350279626489865, + "grad_norm": 6.319185733795166, + "learning_rate": 6.101305404281629e-06, + "loss": 0.1715, + "step": 17191 + }, + { + "epoch": 0.4350532682136802, + "grad_norm": 6.29387903213501, + "learning_rate": 6.100913739201692e-06, + "loss": 0.1783, + "step": 17192 + }, + { + "epoch": 0.4350785737783739, + "grad_norm": 7.738245487213135, + "learning_rate": 6.10052206702217e-06, + "loss": 0.239, + "step": 17193 + }, + { + "epoch": 0.43510387934306755, + "grad_norm": 6.507307052612305, + "learning_rate": 6.100130387745584e-06, + "loss": 0.155, + "step": 17194 + }, + { + "epoch": 0.4351291849077612, + "grad_norm": 5.3070068359375, + "learning_rate": 6.099738701374464e-06, + "loss": 0.2442, + "step": 17195 + }, + { + "epoch": 0.4351544904724549, + "grad_norm": 3.7285590171813965, + "learning_rate": 6.099347007911331e-06, + "loss": 0.1315, + "step": 17196 + }, + { + "epoch": 0.43517979603714857, + "grad_norm": 2.697465658187866, + "learning_rate": 6.098955307358716e-06, + "loss": 0.1342, + "step": 17197 + }, + { + "epoch": 0.4352051016018422, + "grad_norm": 10.530816078186035, + "learning_rate": 6.098563599719141e-06, + "loss": 0.3492, + "step": 17198 + }, + { + "epoch": 0.43523040716653594, + "grad_norm": 9.498261451721191, + "learning_rate": 6.0981718849951345e-06, + "loss": 0.2207, + "step": 17199 + }, + { + "epoch": 0.4352557127312296, + "grad_norm": 2.445791244506836, + "learning_rate": 6.097780163189222e-06, + "loss": 0.0666, + "step": 17200 + }, + { + "epoch": 0.43528101829592325, + "grad_norm": 5.312227249145508, + "learning_rate": 6.097388434303928e-06, + "loss": 0.1811, + "step": 17201 + }, + { + "epoch": 0.43530632386061696, + "grad_norm": 5.951584815979004, + "learning_rate": 6.096996698341782e-06, + "loss": 0.1815, + "step": 17202 + }, + { + "epoch": 0.4353316294253106, + "grad_norm": 4.835674285888672, + "learning_rate": 6.096604955305307e-06, + "loss": 0.1647, + "step": 17203 + }, + { + "epoch": 0.43535693499000433, + "grad_norm": 6.920533657073975, + "learning_rate": 6.096213205197031e-06, + "loss": 0.1397, + "step": 17204 + }, + { + "epoch": 0.435382240554698, + "grad_norm": 4.544297218322754, + "learning_rate": 6.095821448019479e-06, + "loss": 0.2026, + "step": 17205 + }, + { + "epoch": 0.43540754611939164, + "grad_norm": 7.706244945526123, + "learning_rate": 6.09542968377518e-06, + "loss": 0.2503, + "step": 17206 + }, + { + "epoch": 0.43543285168408535, + "grad_norm": 2.60880708694458, + "learning_rate": 6.095037912466658e-06, + "loss": 0.0677, + "step": 17207 + }, + { + "epoch": 0.435458157248779, + "grad_norm": 3.657871961593628, + "learning_rate": 6.0946461340964405e-06, + "loss": 0.0892, + "step": 17208 + }, + { + "epoch": 0.43548346281347267, + "grad_norm": 5.861593246459961, + "learning_rate": 6.094254348667052e-06, + "loss": 0.1834, + "step": 17209 + }, + { + "epoch": 0.4355087683781664, + "grad_norm": 9.39528751373291, + "learning_rate": 6.093862556181024e-06, + "loss": 0.1709, + "step": 17210 + }, + { + "epoch": 0.43553407394286003, + "grad_norm": 4.1073713302612305, + "learning_rate": 6.0934707566408765e-06, + "loss": 0.1414, + "step": 17211 + }, + { + "epoch": 0.4355593795075537, + "grad_norm": 9.470732688903809, + "learning_rate": 6.093078950049141e-06, + "loss": 0.1681, + "step": 17212 + }, + { + "epoch": 0.4355846850722474, + "grad_norm": 4.0381011962890625, + "learning_rate": 6.092687136408343e-06, + "loss": 0.1759, + "step": 17213 + }, + { + "epoch": 0.43560999063694106, + "grad_norm": 6.265688419342041, + "learning_rate": 6.092295315721008e-06, + "loss": 0.1742, + "step": 17214 + }, + { + "epoch": 0.4356352962016347, + "grad_norm": 5.329999923706055, + "learning_rate": 6.091903487989665e-06, + "loss": 0.1639, + "step": 17215 + }, + { + "epoch": 0.4356606017663284, + "grad_norm": 4.925227165222168, + "learning_rate": 6.091511653216839e-06, + "loss": 0.1775, + "step": 17216 + }, + { + "epoch": 0.4356859073310221, + "grad_norm": 3.108933687210083, + "learning_rate": 6.091119811405057e-06, + "loss": 0.1395, + "step": 17217 + }, + { + "epoch": 0.4357112128957158, + "grad_norm": 4.2339372634887695, + "learning_rate": 6.090727962556846e-06, + "loss": 0.2119, + "step": 17218 + }, + { + "epoch": 0.43573651846040945, + "grad_norm": 9.028596878051758, + "learning_rate": 6.090336106674735e-06, + "loss": 0.2679, + "step": 17219 + }, + { + "epoch": 0.4357618240251031, + "grad_norm": 3.656251907348633, + "learning_rate": 6.089944243761247e-06, + "loss": 0.1362, + "step": 17220 + }, + { + "epoch": 0.4357871295897968, + "grad_norm": 8.179462432861328, + "learning_rate": 6.089552373818912e-06, + "loss": 0.3061, + "step": 17221 + }, + { + "epoch": 0.4358124351544905, + "grad_norm": 12.645394325256348, + "learning_rate": 6.0891604968502575e-06, + "loss": 0.2309, + "step": 17222 + }, + { + "epoch": 0.43583774071918413, + "grad_norm": 7.1840901374816895, + "learning_rate": 6.088768612857807e-06, + "loss": 0.1939, + "step": 17223 + }, + { + "epoch": 0.43586304628387784, + "grad_norm": 7.4802069664001465, + "learning_rate": 6.088376721844093e-06, + "loss": 0.2003, + "step": 17224 + }, + { + "epoch": 0.4358883518485715, + "grad_norm": 4.865143775939941, + "learning_rate": 6.087984823811639e-06, + "loss": 0.2114, + "step": 17225 + }, + { + "epoch": 0.43591365741326515, + "grad_norm": 5.33474588394165, + "learning_rate": 6.087592918762973e-06, + "loss": 0.1248, + "step": 17226 + }, + { + "epoch": 0.43593896297795887, + "grad_norm": 5.561079978942871, + "learning_rate": 6.087201006700624e-06, + "loss": 0.1724, + "step": 17227 + }, + { + "epoch": 0.4359642685426525, + "grad_norm": 4.251108169555664, + "learning_rate": 6.086809087627116e-06, + "loss": 0.151, + "step": 17228 + }, + { + "epoch": 0.4359895741073462, + "grad_norm": 5.0601396560668945, + "learning_rate": 6.08641716154498e-06, + "loss": 0.1582, + "step": 17229 + }, + { + "epoch": 0.4360148796720399, + "grad_norm": 4.087623119354248, + "learning_rate": 6.086025228456741e-06, + "loss": 0.1128, + "step": 17230 + }, + { + "epoch": 0.43604018523673355, + "grad_norm": 3.5999655723571777, + "learning_rate": 6.0856332883649275e-06, + "loss": 0.1659, + "step": 17231 + }, + { + "epoch": 0.43606549080142726, + "grad_norm": 6.854796886444092, + "learning_rate": 6.085241341272067e-06, + "loss": 0.1731, + "step": 17232 + }, + { + "epoch": 0.4360907963661209, + "grad_norm": 7.321910381317139, + "learning_rate": 6.084849387180686e-06, + "loss": 0.194, + "step": 17233 + }, + { + "epoch": 0.43611610193081457, + "grad_norm": 5.480162620544434, + "learning_rate": 6.084457426093313e-06, + "loss": 0.1637, + "step": 17234 + }, + { + "epoch": 0.4361414074955083, + "grad_norm": 5.134415149688721, + "learning_rate": 6.0840654580124765e-06, + "loss": 0.2114, + "step": 17235 + }, + { + "epoch": 0.43616671306020194, + "grad_norm": 3.3755908012390137, + "learning_rate": 6.083673482940703e-06, + "loss": 0.1479, + "step": 17236 + }, + { + "epoch": 0.4361920186248956, + "grad_norm": 3.8783154487609863, + "learning_rate": 6.083281500880522e-06, + "loss": 0.1278, + "step": 17237 + }, + { + "epoch": 0.4362173241895893, + "grad_norm": 4.255475044250488, + "learning_rate": 6.082889511834459e-06, + "loss": 0.1499, + "step": 17238 + }, + { + "epoch": 0.43624262975428296, + "grad_norm": 4.177158355712891, + "learning_rate": 6.082497515805044e-06, + "loss": 0.1634, + "step": 17239 + }, + { + "epoch": 0.4362679353189766, + "grad_norm": 5.269420623779297, + "learning_rate": 6.082105512794803e-06, + "loss": 0.1087, + "step": 17240 + }, + { + "epoch": 0.43629324088367033, + "grad_norm": 21.262739181518555, + "learning_rate": 6.081713502806264e-06, + "loss": 0.2312, + "step": 17241 + }, + { + "epoch": 0.436318546448364, + "grad_norm": 3.062218189239502, + "learning_rate": 6.081321485841956e-06, + "loss": 0.1552, + "step": 17242 + }, + { + "epoch": 0.4363438520130577, + "grad_norm": 2.4668214321136475, + "learning_rate": 6.080929461904408e-06, + "loss": 0.1044, + "step": 17243 + }, + { + "epoch": 0.43636915757775135, + "grad_norm": 5.536475658416748, + "learning_rate": 6.080537430996147e-06, + "loss": 0.1821, + "step": 17244 + }, + { + "epoch": 0.436394463142445, + "grad_norm": 4.334217548370361, + "learning_rate": 6.0801453931197e-06, + "loss": 0.104, + "step": 17245 + }, + { + "epoch": 0.4364197687071387, + "grad_norm": 3.5355920791625977, + "learning_rate": 6.0797533482775985e-06, + "loss": 0.1621, + "step": 17246 + }, + { + "epoch": 0.4364450742718324, + "grad_norm": 5.223726272583008, + "learning_rate": 6.0793612964723665e-06, + "loss": 0.1477, + "step": 17247 + }, + { + "epoch": 0.43647037983652603, + "grad_norm": 18.24125862121582, + "learning_rate": 6.0789692377065354e-06, + "loss": 0.1522, + "step": 17248 + }, + { + "epoch": 0.43649568540121975, + "grad_norm": 6.614469528198242, + "learning_rate": 6.078577171982631e-06, + "loss": 0.1895, + "step": 17249 + }, + { + "epoch": 0.4365209909659134, + "grad_norm": 11.611088752746582, + "learning_rate": 6.078185099303185e-06, + "loss": 0.1667, + "step": 17250 + }, + { + "epoch": 0.43654629653060706, + "grad_norm": 4.594958782196045, + "learning_rate": 6.077793019670722e-06, + "loss": 0.1979, + "step": 17251 + }, + { + "epoch": 0.43657160209530077, + "grad_norm": 11.970721244812012, + "learning_rate": 6.077400933087773e-06, + "loss": 0.3454, + "step": 17252 + }, + { + "epoch": 0.4365969076599944, + "grad_norm": 9.178016662597656, + "learning_rate": 6.077008839556866e-06, + "loss": 0.2894, + "step": 17253 + }, + { + "epoch": 0.4366222132246881, + "grad_norm": 6.74098014831543, + "learning_rate": 6.076616739080529e-06, + "loss": 0.2412, + "step": 17254 + }, + { + "epoch": 0.4366475187893818, + "grad_norm": 8.889581680297852, + "learning_rate": 6.076224631661292e-06, + "loss": 0.1643, + "step": 17255 + }, + { + "epoch": 0.43667282435407545, + "grad_norm": 3.6344690322875977, + "learning_rate": 6.075832517301682e-06, + "loss": 0.171, + "step": 17256 + }, + { + "epoch": 0.43669812991876916, + "grad_norm": 9.2606782913208, + "learning_rate": 6.0754403960042274e-06, + "loss": 0.3107, + "step": 17257 + }, + { + "epoch": 0.4367234354834628, + "grad_norm": 3.210519313812256, + "learning_rate": 6.075048267771458e-06, + "loss": 0.1156, + "step": 17258 + }, + { + "epoch": 0.4367487410481565, + "grad_norm": 6.938523292541504, + "learning_rate": 6.074656132605904e-06, + "loss": 0.2102, + "step": 17259 + }, + { + "epoch": 0.4367740466128502, + "grad_norm": 9.11628532409668, + "learning_rate": 6.07426399051009e-06, + "loss": 0.223, + "step": 17260 + }, + { + "epoch": 0.43679935217754384, + "grad_norm": 8.519817352294922, + "learning_rate": 6.0738718414865495e-06, + "loss": 0.184, + "step": 17261 + }, + { + "epoch": 0.4368246577422375, + "grad_norm": 7.223987579345703, + "learning_rate": 6.073479685537808e-06, + "loss": 0.1501, + "step": 17262 + }, + { + "epoch": 0.4368499633069312, + "grad_norm": 5.129644393920898, + "learning_rate": 6.073087522666395e-06, + "loss": 0.187, + "step": 17263 + }, + { + "epoch": 0.43687526887162487, + "grad_norm": 4.570638179779053, + "learning_rate": 6.0726953528748414e-06, + "loss": 0.0927, + "step": 17264 + }, + { + "epoch": 0.4369005744363185, + "grad_norm": 3.6142375469207764, + "learning_rate": 6.072303176165674e-06, + "loss": 0.153, + "step": 17265 + }, + { + "epoch": 0.43692588000101223, + "grad_norm": 7.702927112579346, + "learning_rate": 6.071910992541425e-06, + "loss": 0.2913, + "step": 17266 + }, + { + "epoch": 0.4369511855657059, + "grad_norm": 6.7826151847839355, + "learning_rate": 6.071518802004618e-06, + "loss": 0.2075, + "step": 17267 + }, + { + "epoch": 0.4369764911303996, + "grad_norm": 5.239496231079102, + "learning_rate": 6.071126604557789e-06, + "loss": 0.2498, + "step": 17268 + }, + { + "epoch": 0.43700179669509326, + "grad_norm": 5.844523906707764, + "learning_rate": 6.070734400203461e-06, + "loss": 0.2332, + "step": 17269 + }, + { + "epoch": 0.4370271022597869, + "grad_norm": 6.561092853546143, + "learning_rate": 6.070342188944167e-06, + "loss": 0.2046, + "step": 17270 + }, + { + "epoch": 0.4370524078244806, + "grad_norm": 2.666677236557007, + "learning_rate": 6.069949970782436e-06, + "loss": 0.1386, + "step": 17271 + }, + { + "epoch": 0.4370777133891743, + "grad_norm": 2.349424123764038, + "learning_rate": 6.069557745720797e-06, + "loss": 0.0926, + "step": 17272 + }, + { + "epoch": 0.43710301895386794, + "grad_norm": 3.6665475368499756, + "learning_rate": 6.069165513761777e-06, + "loss": 0.1726, + "step": 17273 + }, + { + "epoch": 0.43712832451856165, + "grad_norm": 6.209871768951416, + "learning_rate": 6.06877327490791e-06, + "loss": 0.1839, + "step": 17274 + }, + { + "epoch": 0.4371536300832553, + "grad_norm": 17.295944213867188, + "learning_rate": 6.0683810291617205e-06, + "loss": 0.2156, + "step": 17275 + }, + { + "epoch": 0.43717893564794896, + "grad_norm": 5.007942199707031, + "learning_rate": 6.067988776525742e-06, + "loss": 0.2319, + "step": 17276 + }, + { + "epoch": 0.4372042412126427, + "grad_norm": 9.978919982910156, + "learning_rate": 6.067596517002502e-06, + "loss": 0.2011, + "step": 17277 + }, + { + "epoch": 0.43722954677733633, + "grad_norm": 4.48874044418335, + "learning_rate": 6.067204250594531e-06, + "loss": 0.1862, + "step": 17278 + }, + { + "epoch": 0.43725485234203, + "grad_norm": 6.21754789352417, + "learning_rate": 6.066811977304359e-06, + "loss": 0.1937, + "step": 17279 + }, + { + "epoch": 0.4372801579067237, + "grad_norm": 5.294201374053955, + "learning_rate": 6.066419697134514e-06, + "loss": 0.1461, + "step": 17280 + }, + { + "epoch": 0.43730546347141735, + "grad_norm": 3.252835988998413, + "learning_rate": 6.066027410087526e-06, + "loss": 0.1669, + "step": 17281 + }, + { + "epoch": 0.43733076903611107, + "grad_norm": 8.187901496887207, + "learning_rate": 6.065635116165927e-06, + "loss": 0.1722, + "step": 17282 + }, + { + "epoch": 0.4373560746008047, + "grad_norm": 3.173222780227661, + "learning_rate": 6.0652428153722455e-06, + "loss": 0.1241, + "step": 17283 + }, + { + "epoch": 0.4373813801654984, + "grad_norm": 10.26978588104248, + "learning_rate": 6.064850507709009e-06, + "loss": 0.3731, + "step": 17284 + }, + { + "epoch": 0.4374066857301921, + "grad_norm": 4.4729180335998535, + "learning_rate": 6.064458193178752e-06, + "loss": 0.1562, + "step": 17285 + }, + { + "epoch": 0.43743199129488575, + "grad_norm": 4.47359037399292, + "learning_rate": 6.0640658717840015e-06, + "loss": 0.1592, + "step": 17286 + }, + { + "epoch": 0.4374572968595794, + "grad_norm": 9.671412467956543, + "learning_rate": 6.063673543527288e-06, + "loss": 0.2052, + "step": 17287 + }, + { + "epoch": 0.4374826024242731, + "grad_norm": 4.302094459533691, + "learning_rate": 6.063281208411143e-06, + "loss": 0.1289, + "step": 17288 + }, + { + "epoch": 0.43750790798896677, + "grad_norm": 3.976436138153076, + "learning_rate": 6.062888866438092e-06, + "loss": 0.1223, + "step": 17289 + }, + { + "epoch": 0.4375332135536604, + "grad_norm": 15.387293815612793, + "learning_rate": 6.062496517610671e-06, + "loss": 0.3006, + "step": 17290 + }, + { + "epoch": 0.43755851911835414, + "grad_norm": 2.803377151489258, + "learning_rate": 6.062104161931407e-06, + "loss": 0.1285, + "step": 17291 + }, + { + "epoch": 0.4375838246830478, + "grad_norm": 7.116221904754639, + "learning_rate": 6.061711799402831e-06, + "loss": 0.1554, + "step": 17292 + }, + { + "epoch": 0.43760913024774145, + "grad_norm": 4.896697044372559, + "learning_rate": 6.061319430027472e-06, + "loss": 0.1452, + "step": 17293 + }, + { + "epoch": 0.43763443581243516, + "grad_norm": 5.56837272644043, + "learning_rate": 6.060927053807863e-06, + "loss": 0.1393, + "step": 17294 + }, + { + "epoch": 0.4376597413771288, + "grad_norm": 11.260807991027832, + "learning_rate": 6.060534670746532e-06, + "loss": 0.3239, + "step": 17295 + }, + { + "epoch": 0.43768504694182253, + "grad_norm": 3.435119390487671, + "learning_rate": 6.060142280846011e-06, + "loss": 0.1757, + "step": 17296 + }, + { + "epoch": 0.4377103525065162, + "grad_norm": 16.117361068725586, + "learning_rate": 6.0597498841088275e-06, + "loss": 0.2285, + "step": 17297 + }, + { + "epoch": 0.43773565807120984, + "grad_norm": 4.736571788787842, + "learning_rate": 6.059357480537515e-06, + "loss": 0.1618, + "step": 17298 + }, + { + "epoch": 0.43776096363590355, + "grad_norm": 6.64975118637085, + "learning_rate": 6.058965070134605e-06, + "loss": 0.1555, + "step": 17299 + }, + { + "epoch": 0.4377862692005972, + "grad_norm": 8.0746431350708, + "learning_rate": 6.058572652902623e-06, + "loss": 0.2322, + "step": 17300 + }, + { + "epoch": 0.43781157476529087, + "grad_norm": 5.507502555847168, + "learning_rate": 6.058180228844106e-06, + "loss": 0.1739, + "step": 17301 + }, + { + "epoch": 0.4378368803299846, + "grad_norm": 7.109441757202148, + "learning_rate": 6.057787797961579e-06, + "loss": 0.1843, + "step": 17302 + }, + { + "epoch": 0.43786218589467824, + "grad_norm": 9.21638298034668, + "learning_rate": 6.0573953602575776e-06, + "loss": 0.1788, + "step": 17303 + }, + { + "epoch": 0.4378874914593719, + "grad_norm": 2.901348829269409, + "learning_rate": 6.057002915734629e-06, + "loss": 0.1112, + "step": 17304 + }, + { + "epoch": 0.4379127970240656, + "grad_norm": 6.044012546539307, + "learning_rate": 6.056610464395266e-06, + "loss": 0.1298, + "step": 17305 + }, + { + "epoch": 0.43793810258875926, + "grad_norm": 9.478589057922363, + "learning_rate": 6.056218006242018e-06, + "loss": 0.1948, + "step": 17306 + }, + { + "epoch": 0.43796340815345297, + "grad_norm": 4.188211441040039, + "learning_rate": 6.055825541277416e-06, + "loss": 0.1474, + "step": 17307 + }, + { + "epoch": 0.4379887137181466, + "grad_norm": 10.926850318908691, + "learning_rate": 6.055433069503992e-06, + "loss": 0.1962, + "step": 17308 + }, + { + "epoch": 0.4380140192828403, + "grad_norm": 4.722874164581299, + "learning_rate": 6.055040590924276e-06, + "loss": 0.1599, + "step": 17309 + }, + { + "epoch": 0.438039324847534, + "grad_norm": 6.925675392150879, + "learning_rate": 6.0546481055408e-06, + "loss": 0.2436, + "step": 17310 + }, + { + "epoch": 0.43806463041222765, + "grad_norm": 11.108378410339355, + "learning_rate": 6.054255613356095e-06, + "loss": 0.3007, + "step": 17311 + }, + { + "epoch": 0.4380899359769213, + "grad_norm": 8.973616600036621, + "learning_rate": 6.0538631143726915e-06, + "loss": 0.2322, + "step": 17312 + }, + { + "epoch": 0.438115241541615, + "grad_norm": 10.997138023376465, + "learning_rate": 6.053470608593119e-06, + "loss": 0.1759, + "step": 17313 + }, + { + "epoch": 0.4381405471063087, + "grad_norm": 4.223853588104248, + "learning_rate": 6.053078096019913e-06, + "loss": 0.1913, + "step": 17314 + }, + { + "epoch": 0.43816585267100233, + "grad_norm": 6.639877796173096, + "learning_rate": 6.0526855766556025e-06, + "loss": 0.1298, + "step": 17315 + }, + { + "epoch": 0.43819115823569604, + "grad_norm": 4.5064568519592285, + "learning_rate": 6.052293050502717e-06, + "loss": 0.1735, + "step": 17316 + }, + { + "epoch": 0.4382164638003897, + "grad_norm": 6.03921365737915, + "learning_rate": 6.051900517563789e-06, + "loss": 0.1733, + "step": 17317 + }, + { + "epoch": 0.43824176936508336, + "grad_norm": 3.8453264236450195, + "learning_rate": 6.051507977841351e-06, + "loss": 0.1463, + "step": 17318 + }, + { + "epoch": 0.43826707492977707, + "grad_norm": 3.3697125911712646, + "learning_rate": 6.051115431337934e-06, + "loss": 0.1966, + "step": 17319 + }, + { + "epoch": 0.4382923804944707, + "grad_norm": 4.539254665374756, + "learning_rate": 6.050722878056068e-06, + "loss": 0.176, + "step": 17320 + }, + { + "epoch": 0.43831768605916444, + "grad_norm": 2.7905611991882324, + "learning_rate": 6.050330317998288e-06, + "loss": 0.1008, + "step": 17321 + }, + { + "epoch": 0.4383429916238581, + "grad_norm": 6.969156265258789, + "learning_rate": 6.04993775116712e-06, + "loss": 0.372, + "step": 17322 + }, + { + "epoch": 0.43836829718855175, + "grad_norm": 4.019207954406738, + "learning_rate": 6.049545177565101e-06, + "loss": 0.1714, + "step": 17323 + }, + { + "epoch": 0.43839360275324546, + "grad_norm": 4.378273010253906, + "learning_rate": 6.049152597194759e-06, + "loss": 0.1785, + "step": 17324 + }, + { + "epoch": 0.4384189083179391, + "grad_norm": 15.406441688537598, + "learning_rate": 6.048760010058627e-06, + "loss": 0.1221, + "step": 17325 + }, + { + "epoch": 0.43844421388263277, + "grad_norm": 5.200798511505127, + "learning_rate": 6.048367416159238e-06, + "loss": 0.2165, + "step": 17326 + }, + { + "epoch": 0.4384695194473265, + "grad_norm": 7.933413982391357, + "learning_rate": 6.047974815499123e-06, + "loss": 0.2048, + "step": 17327 + }, + { + "epoch": 0.43849482501202014, + "grad_norm": 17.02503204345703, + "learning_rate": 6.047582208080811e-06, + "loss": 0.5141, + "step": 17328 + }, + { + "epoch": 0.4385201305767138, + "grad_norm": 4.6831464767456055, + "learning_rate": 6.047189593906837e-06, + "loss": 0.1921, + "step": 17329 + }, + { + "epoch": 0.4385454361414075, + "grad_norm": 7.224671840667725, + "learning_rate": 6.046796972979732e-06, + "loss": 0.3035, + "step": 17330 + }, + { + "epoch": 0.43857074170610116, + "grad_norm": 5.462852478027344, + "learning_rate": 6.046404345302029e-06, + "loss": 0.1439, + "step": 17331 + }, + { + "epoch": 0.4385960472707949, + "grad_norm": 2.8653337955474854, + "learning_rate": 6.0460117108762595e-06, + "loss": 0.1467, + "step": 17332 + }, + { + "epoch": 0.43862135283548853, + "grad_norm": 4.772624492645264, + "learning_rate": 6.045619069704953e-06, + "loss": 0.1707, + "step": 17333 + }, + { + "epoch": 0.4386466584001822, + "grad_norm": 8.304576873779297, + "learning_rate": 6.045226421790645e-06, + "loss": 0.1681, + "step": 17334 + }, + { + "epoch": 0.4386719639648759, + "grad_norm": 5.793102264404297, + "learning_rate": 6.044833767135866e-06, + "loss": 0.2162, + "step": 17335 + }, + { + "epoch": 0.43869726952956956, + "grad_norm": 4.261153221130371, + "learning_rate": 6.04444110574315e-06, + "loss": 0.1801, + "step": 17336 + }, + { + "epoch": 0.4387225750942632, + "grad_norm": 11.966611862182617, + "learning_rate": 6.044048437615024e-06, + "loss": 0.0912, + "step": 17337 + }, + { + "epoch": 0.4387478806589569, + "grad_norm": 6.0869269371032715, + "learning_rate": 6.043655762754027e-06, + "loss": 0.2525, + "step": 17338 + }, + { + "epoch": 0.4387731862236506, + "grad_norm": 5.011748790740967, + "learning_rate": 6.043263081162687e-06, + "loss": 0.1546, + "step": 17339 + }, + { + "epoch": 0.43879849178834424, + "grad_norm": 2.9034931659698486, + "learning_rate": 6.042870392843536e-06, + "loss": 0.1322, + "step": 17340 + }, + { + "epoch": 0.43882379735303795, + "grad_norm": 12.138790130615234, + "learning_rate": 6.0424776977991096e-06, + "loss": 0.2173, + "step": 17341 + }, + { + "epoch": 0.4388491029177316, + "grad_norm": 4.339144229888916, + "learning_rate": 6.0420849960319386e-06, + "loss": 0.1633, + "step": 17342 + }, + { + "epoch": 0.43887440848242526, + "grad_norm": 6.03432035446167, + "learning_rate": 6.041692287544555e-06, + "loss": 0.1979, + "step": 17343 + }, + { + "epoch": 0.43889971404711897, + "grad_norm": 3.8388023376464844, + "learning_rate": 6.0412995723394914e-06, + "loss": 0.2321, + "step": 17344 + }, + { + "epoch": 0.43892501961181263, + "grad_norm": 7.379733562469482, + "learning_rate": 6.040906850419281e-06, + "loss": 0.2091, + "step": 17345 + }, + { + "epoch": 0.43895032517650634, + "grad_norm": 4.980563640594482, + "learning_rate": 6.0405141217864564e-06, + "loss": 0.1523, + "step": 17346 + }, + { + "epoch": 0.4389756307412, + "grad_norm": 10.249044418334961, + "learning_rate": 6.040121386443548e-06, + "loss": 0.2427, + "step": 17347 + }, + { + "epoch": 0.43900093630589365, + "grad_norm": 11.75985336303711, + "learning_rate": 6.039728644393091e-06, + "loss": 0.2103, + "step": 17348 + }, + { + "epoch": 0.43902624187058736, + "grad_norm": 3.4174134731292725, + "learning_rate": 6.039335895637618e-06, + "loss": 0.1648, + "step": 17349 + }, + { + "epoch": 0.439051547435281, + "grad_norm": 7.961544036865234, + "learning_rate": 6.038943140179661e-06, + "loss": 0.2113, + "step": 17350 + }, + { + "epoch": 0.4390768529999747, + "grad_norm": 4.005214214324951, + "learning_rate": 6.038550378021751e-06, + "loss": 0.1866, + "step": 17351 + }, + { + "epoch": 0.4391021585646684, + "grad_norm": 5.106936931610107, + "learning_rate": 6.038157609166425e-06, + "loss": 0.1597, + "step": 17352 + }, + { + "epoch": 0.43912746412936204, + "grad_norm": 15.01632022857666, + "learning_rate": 6.037764833616213e-06, + "loss": 0.2569, + "step": 17353 + }, + { + "epoch": 0.4391527696940557, + "grad_norm": 3.3265388011932373, + "learning_rate": 6.037372051373649e-06, + "loss": 0.1139, + "step": 17354 + }, + { + "epoch": 0.4391780752587494, + "grad_norm": 3.5601630210876465, + "learning_rate": 6.036979262441265e-06, + "loss": 0.1815, + "step": 17355 + }, + { + "epoch": 0.43920338082344307, + "grad_norm": 5.72426700592041, + "learning_rate": 6.0365864668215955e-06, + "loss": 0.1709, + "step": 17356 + }, + { + "epoch": 0.4392286863881367, + "grad_norm": 6.271353721618652, + "learning_rate": 6.036193664517172e-06, + "loss": 0.1918, + "step": 17357 + }, + { + "epoch": 0.43925399195283044, + "grad_norm": 3.851020336151123, + "learning_rate": 6.035800855530529e-06, + "loss": 0.1284, + "step": 17358 + }, + { + "epoch": 0.4392792975175241, + "grad_norm": 9.311121940612793, + "learning_rate": 6.035408039864198e-06, + "loss": 0.3229, + "step": 17359 + }, + { + "epoch": 0.4393046030822178, + "grad_norm": 2.936352014541626, + "learning_rate": 6.035015217520713e-06, + "loss": 0.129, + "step": 17360 + }, + { + "epoch": 0.43932990864691146, + "grad_norm": 10.627729415893555, + "learning_rate": 6.0346223885026074e-06, + "loss": 0.3568, + "step": 17361 + }, + { + "epoch": 0.4393552142116051, + "grad_norm": 6.818484783172607, + "learning_rate": 6.034229552812415e-06, + "loss": 0.2047, + "step": 17362 + }, + { + "epoch": 0.43938051977629883, + "grad_norm": 4.497427463531494, + "learning_rate": 6.03383671045267e-06, + "loss": 0.2414, + "step": 17363 + }, + { + "epoch": 0.4394058253409925, + "grad_norm": 4.596368312835693, + "learning_rate": 6.033443861425902e-06, + "loss": 0.1225, + "step": 17364 + }, + { + "epoch": 0.43943113090568614, + "grad_norm": 3.4552831649780273, + "learning_rate": 6.033051005734648e-06, + "loss": 0.1624, + "step": 17365 + }, + { + "epoch": 0.43945643647037985, + "grad_norm": 3.5132179260253906, + "learning_rate": 6.0326581433814394e-06, + "loss": 0.1564, + "step": 17366 + }, + { + "epoch": 0.4394817420350735, + "grad_norm": 2.7977893352508545, + "learning_rate": 6.0322652743688116e-06, + "loss": 0.1033, + "step": 17367 + }, + { + "epoch": 0.43950704759976716, + "grad_norm": 3.111579656600952, + "learning_rate": 6.031872398699297e-06, + "loss": 0.129, + "step": 17368 + }, + { + "epoch": 0.4395323531644609, + "grad_norm": 9.491976737976074, + "learning_rate": 6.031479516375429e-06, + "loss": 0.2086, + "step": 17369 + }, + { + "epoch": 0.43955765872915453, + "grad_norm": 4.293730735778809, + "learning_rate": 6.031086627399741e-06, + "loss": 0.2097, + "step": 17370 + }, + { + "epoch": 0.43958296429384824, + "grad_norm": 4.503945350646973, + "learning_rate": 6.0306937317747674e-06, + "loss": 0.1996, + "step": 17371 + }, + { + "epoch": 0.4396082698585419, + "grad_norm": 5.47363805770874, + "learning_rate": 6.030300829503041e-06, + "loss": 0.1563, + "step": 17372 + }, + { + "epoch": 0.43963357542323556, + "grad_norm": 4.64865255355835, + "learning_rate": 6.029907920587097e-06, + "loss": 0.1953, + "step": 17373 + }, + { + "epoch": 0.43965888098792927, + "grad_norm": 4.125728607177734, + "learning_rate": 6.029515005029468e-06, + "loss": 0.1092, + "step": 17374 + }, + { + "epoch": 0.4396841865526229, + "grad_norm": 3.3311214447021484, + "learning_rate": 6.029122082832689e-06, + "loss": 0.1653, + "step": 17375 + }, + { + "epoch": 0.4397094921173166, + "grad_norm": 8.080460548400879, + "learning_rate": 6.028729153999293e-06, + "loss": 0.1972, + "step": 17376 + }, + { + "epoch": 0.4397347976820103, + "grad_norm": 4.975393295288086, + "learning_rate": 6.028336218531814e-06, + "loss": 0.1402, + "step": 17377 + }, + { + "epoch": 0.43976010324670395, + "grad_norm": 11.548638343811035, + "learning_rate": 6.027943276432786e-06, + "loss": 0.2023, + "step": 17378 + }, + { + "epoch": 0.4397854088113976, + "grad_norm": 5.686656951904297, + "learning_rate": 6.027550327704742e-06, + "loss": 0.2698, + "step": 17379 + }, + { + "epoch": 0.4398107143760913, + "grad_norm": 3.6769659519195557, + "learning_rate": 6.027157372350219e-06, + "loss": 0.1048, + "step": 17380 + }, + { + "epoch": 0.439836019940785, + "grad_norm": 7.4419708251953125, + "learning_rate": 6.026764410371747e-06, + "loss": 0.1982, + "step": 17381 + }, + { + "epoch": 0.43986132550547863, + "grad_norm": 4.768306732177734, + "learning_rate": 6.026371441771862e-06, + "loss": 0.1459, + "step": 17382 + }, + { + "epoch": 0.43988663107017234, + "grad_norm": 7.6123948097229, + "learning_rate": 6.025978466553101e-06, + "loss": 0.2296, + "step": 17383 + }, + { + "epoch": 0.439911936634866, + "grad_norm": 4.684370994567871, + "learning_rate": 6.025585484717994e-06, + "loss": 0.2067, + "step": 17384 + }, + { + "epoch": 0.4399372421995597, + "grad_norm": 7.868861198425293, + "learning_rate": 6.025192496269078e-06, + "loss": 0.1871, + "step": 17385 + }, + { + "epoch": 0.43996254776425336, + "grad_norm": 3.433471202850342, + "learning_rate": 6.024799501208884e-06, + "loss": 0.1494, + "step": 17386 + }, + { + "epoch": 0.439987853328947, + "grad_norm": 4.646951198577881, + "learning_rate": 6.024406499539952e-06, + "loss": 0.2366, + "step": 17387 + }, + { + "epoch": 0.44001315889364073, + "grad_norm": 4.7939677238464355, + "learning_rate": 6.024013491264811e-06, + "loss": 0.1625, + "step": 17388 + }, + { + "epoch": 0.4400384644583344, + "grad_norm": 2.4291694164276123, + "learning_rate": 6.023620476385998e-06, + "loss": 0.0878, + "step": 17389 + }, + { + "epoch": 0.44006377002302804, + "grad_norm": 8.119730949401855, + "learning_rate": 6.023227454906047e-06, + "loss": 0.24, + "step": 17390 + }, + { + "epoch": 0.44008907558772176, + "grad_norm": 10.629459381103516, + "learning_rate": 6.022834426827492e-06, + "loss": 0.2388, + "step": 17391 + }, + { + "epoch": 0.4401143811524154, + "grad_norm": 12.985307693481445, + "learning_rate": 6.022441392152868e-06, + "loss": 0.2562, + "step": 17392 + }, + { + "epoch": 0.44013968671710907, + "grad_norm": 3.6747477054595947, + "learning_rate": 6.022048350884709e-06, + "loss": 0.1479, + "step": 17393 + }, + { + "epoch": 0.4401649922818028, + "grad_norm": 10.608771324157715, + "learning_rate": 6.021655303025552e-06, + "loss": 0.2712, + "step": 17394 + }, + { + "epoch": 0.44019029784649644, + "grad_norm": 13.095009803771973, + "learning_rate": 6.0212622485779294e-06, + "loss": 0.1778, + "step": 17395 + }, + { + "epoch": 0.44021560341119015, + "grad_norm": 3.633094072341919, + "learning_rate": 6.020869187544377e-06, + "loss": 0.164, + "step": 17396 + }, + { + "epoch": 0.4402409089758838, + "grad_norm": 3.6933579444885254, + "learning_rate": 6.020476119927428e-06, + "loss": 0.1856, + "step": 17397 + }, + { + "epoch": 0.44026621454057746, + "grad_norm": 6.807697772979736, + "learning_rate": 6.0200830457296185e-06, + "loss": 0.2329, + "step": 17398 + }, + { + "epoch": 0.4402915201052712, + "grad_norm": 9.110183715820312, + "learning_rate": 6.0196899649534836e-06, + "loss": 0.2782, + "step": 17399 + }, + { + "epoch": 0.44031682566996483, + "grad_norm": 5.282838344573975, + "learning_rate": 6.019296877601558e-06, + "loss": 0.167, + "step": 17400 + }, + { + "epoch": 0.4403421312346585, + "grad_norm": 2.749589204788208, + "learning_rate": 6.018903783676376e-06, + "loss": 0.0683, + "step": 17401 + }, + { + "epoch": 0.4403674367993522, + "grad_norm": 9.141866683959961, + "learning_rate": 6.018510683180474e-06, + "loss": 0.1753, + "step": 17402 + }, + { + "epoch": 0.44039274236404585, + "grad_norm": 15.853253364562988, + "learning_rate": 6.018117576116384e-06, + "loss": 0.3818, + "step": 17403 + }, + { + "epoch": 0.4404180479287395, + "grad_norm": 6.258516311645508, + "learning_rate": 6.017724462486646e-06, + "loss": 0.2521, + "step": 17404 + }, + { + "epoch": 0.4404433534934332, + "grad_norm": 3.8283958435058594, + "learning_rate": 6.01733134229379e-06, + "loss": 0.1404, + "step": 17405 + }, + { + "epoch": 0.4404686590581269, + "grad_norm": 4.101873874664307, + "learning_rate": 6.016938215540355e-06, + "loss": 0.2044, + "step": 17406 + }, + { + "epoch": 0.44049396462282053, + "grad_norm": 2.7378318309783936, + "learning_rate": 6.016545082228874e-06, + "loss": 0.1125, + "step": 17407 + }, + { + "epoch": 0.44051927018751424, + "grad_norm": 13.130958557128906, + "learning_rate": 6.016151942361882e-06, + "loss": 0.1491, + "step": 17408 + }, + { + "epoch": 0.4405445757522079, + "grad_norm": 4.169888973236084, + "learning_rate": 6.0157587959419165e-06, + "loss": 0.1662, + "step": 17409 + }, + { + "epoch": 0.4405698813169016, + "grad_norm": 5.936046123504639, + "learning_rate": 6.015365642971509e-06, + "loss": 0.1512, + "step": 17410 + }, + { + "epoch": 0.44059518688159527, + "grad_norm": 4.788860321044922, + "learning_rate": 6.014972483453201e-06, + "loss": 0.1815, + "step": 17411 + }, + { + "epoch": 0.4406204924462889, + "grad_norm": 5.636556148529053, + "learning_rate": 6.01457931738952e-06, + "loss": 0.2012, + "step": 17412 + }, + { + "epoch": 0.44064579801098264, + "grad_norm": 6.930774688720703, + "learning_rate": 6.01418614478301e-06, + "loss": 0.1777, + "step": 17413 + }, + { + "epoch": 0.4406711035756763, + "grad_norm": 4.7886152267456055, + "learning_rate": 6.013792965636199e-06, + "loss": 0.1772, + "step": 17414 + }, + { + "epoch": 0.44069640914036995, + "grad_norm": 5.498892307281494, + "learning_rate": 6.013399779951626e-06, + "loss": 0.261, + "step": 17415 + }, + { + "epoch": 0.44072171470506366, + "grad_norm": 7.3651933670043945, + "learning_rate": 6.013006587731829e-06, + "loss": 0.1935, + "step": 17416 + }, + { + "epoch": 0.4407470202697573, + "grad_norm": 6.305220603942871, + "learning_rate": 6.012613388979338e-06, + "loss": 0.2389, + "step": 17417 + }, + { + "epoch": 0.440772325834451, + "grad_norm": 11.276580810546875, + "learning_rate": 6.012220183696693e-06, + "loss": 0.2934, + "step": 17418 + }, + { + "epoch": 0.4407976313991447, + "grad_norm": 3.885573148727417, + "learning_rate": 6.011826971886428e-06, + "loss": 0.1286, + "step": 17419 + }, + { + "epoch": 0.44082293696383834, + "grad_norm": 8.096595764160156, + "learning_rate": 6.011433753551079e-06, + "loss": 0.1655, + "step": 17420 + }, + { + "epoch": 0.440848242528532, + "grad_norm": 3.9106462001800537, + "learning_rate": 6.0110405286931815e-06, + "loss": 0.1401, + "step": 17421 + }, + { + "epoch": 0.4408735480932257, + "grad_norm": 4.029569625854492, + "learning_rate": 6.010647297315274e-06, + "loss": 0.142, + "step": 17422 + }, + { + "epoch": 0.44089885365791937, + "grad_norm": 7.03490686416626, + "learning_rate": 6.010254059419888e-06, + "loss": 0.1638, + "step": 17423 + }, + { + "epoch": 0.4409241592226131, + "grad_norm": 9.414576530456543, + "learning_rate": 6.009860815009562e-06, + "loss": 0.1232, + "step": 17424 + }, + { + "epoch": 0.44094946478730673, + "grad_norm": 4.655203342437744, + "learning_rate": 6.0094675640868305e-06, + "loss": 0.1503, + "step": 17425 + }, + { + "epoch": 0.4409747703520004, + "grad_norm": 3.603003978729248, + "learning_rate": 6.009074306654231e-06, + "loss": 0.1842, + "step": 17426 + }, + { + "epoch": 0.4410000759166941, + "grad_norm": 5.297211647033691, + "learning_rate": 6.0086810427143e-06, + "loss": 0.219, + "step": 17427 + }, + { + "epoch": 0.44102538148138776, + "grad_norm": 5.113056182861328, + "learning_rate": 6.008287772269571e-06, + "loss": 0.1171, + "step": 17428 + }, + { + "epoch": 0.4410506870460814, + "grad_norm": 4.056393146514893, + "learning_rate": 6.007894495322583e-06, + "loss": 0.1388, + "step": 17429 + }, + { + "epoch": 0.4410759926107751, + "grad_norm": 3.7267725467681885, + "learning_rate": 6.00750121187587e-06, + "loss": 0.1859, + "step": 17430 + }, + { + "epoch": 0.4411012981754688, + "grad_norm": 10.34444522857666, + "learning_rate": 6.007107921931969e-06, + "loss": 0.143, + "step": 17431 + }, + { + "epoch": 0.44112660374016244, + "grad_norm": 9.03085708618164, + "learning_rate": 6.006714625493418e-06, + "loss": 0.2907, + "step": 17432 + }, + { + "epoch": 0.44115190930485615, + "grad_norm": 6.906283855438232, + "learning_rate": 6.0063213225627505e-06, + "loss": 0.241, + "step": 17433 + }, + { + "epoch": 0.4411772148695498, + "grad_norm": 11.197589874267578, + "learning_rate": 6.005928013142504e-06, + "loss": 0.1423, + "step": 17434 + }, + { + "epoch": 0.4412025204342435, + "grad_norm": 14.346537590026855, + "learning_rate": 6.005534697235215e-06, + "loss": 0.1921, + "step": 17435 + }, + { + "epoch": 0.4412278259989372, + "grad_norm": 4.699794292449951, + "learning_rate": 6.005141374843419e-06, + "loss": 0.2443, + "step": 17436 + }, + { + "epoch": 0.44125313156363083, + "grad_norm": 2.567248821258545, + "learning_rate": 6.004748045969654e-06, + "loss": 0.144, + "step": 17437 + }, + { + "epoch": 0.44127843712832454, + "grad_norm": 4.299542427062988, + "learning_rate": 6.004354710616457e-06, + "loss": 0.1793, + "step": 17438 + }, + { + "epoch": 0.4413037426930182, + "grad_norm": 4.6003217697143555, + "learning_rate": 6.003961368786361e-06, + "loss": 0.1758, + "step": 17439 + }, + { + "epoch": 0.44132904825771185, + "grad_norm": 9.856976509094238, + "learning_rate": 6.003568020481906e-06, + "loss": 0.1759, + "step": 17440 + }, + { + "epoch": 0.44135435382240557, + "grad_norm": 5.9955525398254395, + "learning_rate": 6.003174665705627e-06, + "loss": 0.1933, + "step": 17441 + }, + { + "epoch": 0.4413796593870992, + "grad_norm": 5.385063171386719, + "learning_rate": 6.002781304460063e-06, + "loss": 0.1768, + "step": 17442 + }, + { + "epoch": 0.4414049649517929, + "grad_norm": 3.3718624114990234, + "learning_rate": 6.002387936747745e-06, + "loss": 0.1733, + "step": 17443 + }, + { + "epoch": 0.4414302705164866, + "grad_norm": 4.595629692077637, + "learning_rate": 6.001994562571217e-06, + "loss": 0.1569, + "step": 17444 + }, + { + "epoch": 0.44145557608118025, + "grad_norm": 4.619056701660156, + "learning_rate": 6.001601181933011e-06, + "loss": 0.1917, + "step": 17445 + }, + { + "epoch": 0.4414808816458739, + "grad_norm": 16.18606185913086, + "learning_rate": 6.001207794835666e-06, + "loss": 0.1896, + "step": 17446 + }, + { + "epoch": 0.4415061872105676, + "grad_norm": 2.525614023208618, + "learning_rate": 6.000814401281718e-06, + "loss": 0.1263, + "step": 17447 + }, + { + "epoch": 0.44153149277526127, + "grad_norm": 4.584095478057861, + "learning_rate": 6.000421001273704e-06, + "loss": 0.1343, + "step": 17448 + }, + { + "epoch": 0.441556798339955, + "grad_norm": 4.945160865783691, + "learning_rate": 6.000027594814161e-06, + "loss": 0.1634, + "step": 17449 + }, + { + "epoch": 0.44158210390464864, + "grad_norm": 11.9261474609375, + "learning_rate": 5.999634181905626e-06, + "loss": 0.3489, + "step": 17450 + }, + { + "epoch": 0.4416074094693423, + "grad_norm": 3.1249001026153564, + "learning_rate": 5.999240762550637e-06, + "loss": 0.115, + "step": 17451 + }, + { + "epoch": 0.441632715034036, + "grad_norm": 3.4473233222961426, + "learning_rate": 5.998847336751729e-06, + "loss": 0.1245, + "step": 17452 + }, + { + "epoch": 0.44165802059872966, + "grad_norm": 4.677611827850342, + "learning_rate": 5.998453904511441e-06, + "loss": 0.218, + "step": 17453 + }, + { + "epoch": 0.4416833261634233, + "grad_norm": 12.027748107910156, + "learning_rate": 5.998060465832309e-06, + "loss": 0.3461, + "step": 17454 + }, + { + "epoch": 0.44170863172811703, + "grad_norm": 4.856778621673584, + "learning_rate": 5.9976670207168715e-06, + "loss": 0.1904, + "step": 17455 + }, + { + "epoch": 0.4417339372928107, + "grad_norm": 3.1125118732452393, + "learning_rate": 5.9972735691676646e-06, + "loss": 0.1415, + "step": 17456 + }, + { + "epoch": 0.44175924285750434, + "grad_norm": 6.985477924346924, + "learning_rate": 5.9968801111872244e-06, + "loss": 0.1477, + "step": 17457 + }, + { + "epoch": 0.44178454842219805, + "grad_norm": 4.032716274261475, + "learning_rate": 5.996486646778092e-06, + "loss": 0.0926, + "step": 17458 + }, + { + "epoch": 0.4418098539868917, + "grad_norm": 5.137876033782959, + "learning_rate": 5.996093175942801e-06, + "loss": 0.114, + "step": 17459 + }, + { + "epoch": 0.4418351595515854, + "grad_norm": 3.452089786529541, + "learning_rate": 5.995699698683892e-06, + "loss": 0.1543, + "step": 17460 + }, + { + "epoch": 0.4418604651162791, + "grad_norm": 6.187185764312744, + "learning_rate": 5.995306215003901e-06, + "loss": 0.202, + "step": 17461 + }, + { + "epoch": 0.44188577068097273, + "grad_norm": 3.9729559421539307, + "learning_rate": 5.994912724905364e-06, + "loss": 0.1594, + "step": 17462 + }, + { + "epoch": 0.44191107624566645, + "grad_norm": 3.662167549133301, + "learning_rate": 5.994519228390821e-06, + "loss": 0.2033, + "step": 17463 + }, + { + "epoch": 0.4419363818103601, + "grad_norm": 5.988982200622559, + "learning_rate": 5.994125725462807e-06, + "loss": 0.1425, + "step": 17464 + }, + { + "epoch": 0.44196168737505376, + "grad_norm": 8.054101943969727, + "learning_rate": 5.993732216123862e-06, + "loss": 0.159, + "step": 17465 + }, + { + "epoch": 0.44198699293974747, + "grad_norm": 7.479201316833496, + "learning_rate": 5.993338700376523e-06, + "loss": 0.1745, + "step": 17466 + }, + { + "epoch": 0.4420122985044411, + "grad_norm": 4.791321277618408, + "learning_rate": 5.992945178223327e-06, + "loss": 0.23, + "step": 17467 + }, + { + "epoch": 0.4420376040691348, + "grad_norm": 7.553130149841309, + "learning_rate": 5.9925516496668126e-06, + "loss": 0.1695, + "step": 17468 + }, + { + "epoch": 0.4420629096338285, + "grad_norm": 10.930124282836914, + "learning_rate": 5.9921581147095175e-06, + "loss": 0.3017, + "step": 17469 + }, + { + "epoch": 0.44208821519852215, + "grad_norm": 2.2646965980529785, + "learning_rate": 5.991764573353978e-06, + "loss": 0.1145, + "step": 17470 + }, + { + "epoch": 0.4421135207632158, + "grad_norm": 3.9508864879608154, + "learning_rate": 5.991371025602734e-06, + "loss": 0.1716, + "step": 17471 + }, + { + "epoch": 0.4421388263279095, + "grad_norm": 5.22780179977417, + "learning_rate": 5.990977471458324e-06, + "loss": 0.1572, + "step": 17472 + }, + { + "epoch": 0.4421641318926032, + "grad_norm": 10.817130088806152, + "learning_rate": 5.990583910923283e-06, + "loss": 0.2478, + "step": 17473 + }, + { + "epoch": 0.4421894374572969, + "grad_norm": 15.997904777526855, + "learning_rate": 5.990190344000151e-06, + "loss": 0.2532, + "step": 17474 + }, + { + "epoch": 0.44221474302199054, + "grad_norm": 7.4831647872924805, + "learning_rate": 5.989796770691466e-06, + "loss": 0.1861, + "step": 17475 + }, + { + "epoch": 0.4422400485866842, + "grad_norm": 6.047751426696777, + "learning_rate": 5.989403190999765e-06, + "loss": 0.1691, + "step": 17476 + }, + { + "epoch": 0.4422653541513779, + "grad_norm": 5.050997734069824, + "learning_rate": 5.989009604927587e-06, + "loss": 0.2262, + "step": 17477 + }, + { + "epoch": 0.44229065971607157, + "grad_norm": 3.9489593505859375, + "learning_rate": 5.9886160124774706e-06, + "loss": 0.1611, + "step": 17478 + }, + { + "epoch": 0.4423159652807652, + "grad_norm": 4.329639911651611, + "learning_rate": 5.9882224136519515e-06, + "loss": 0.1608, + "step": 17479 + }, + { + "epoch": 0.44234127084545893, + "grad_norm": 10.113198280334473, + "learning_rate": 5.9878288084535726e-06, + "loss": 0.3615, + "step": 17480 + }, + { + "epoch": 0.4423665764101526, + "grad_norm": 5.79472541809082, + "learning_rate": 5.987435196884868e-06, + "loss": 0.1787, + "step": 17481 + }, + { + "epoch": 0.44239188197484625, + "grad_norm": 4.213649749755859, + "learning_rate": 5.987041578948378e-06, + "loss": 0.1358, + "step": 17482 + }, + { + "epoch": 0.44241718753953996, + "grad_norm": 3.303555727005005, + "learning_rate": 5.986647954646641e-06, + "loss": 0.1653, + "step": 17483 + }, + { + "epoch": 0.4424424931042336, + "grad_norm": 5.056873798370361, + "learning_rate": 5.986254323982195e-06, + "loss": 0.2243, + "step": 17484 + }, + { + "epoch": 0.44246779866892727, + "grad_norm": 4.1287455558776855, + "learning_rate": 5.985860686957576e-06, + "loss": 0.1682, + "step": 17485 + }, + { + "epoch": 0.442493104233621, + "grad_norm": 3.278254270553589, + "learning_rate": 5.985467043575327e-06, + "loss": 0.1536, + "step": 17486 + }, + { + "epoch": 0.44251840979831464, + "grad_norm": 5.396832466125488, + "learning_rate": 5.985073393837983e-06, + "loss": 0.1671, + "step": 17487 + }, + { + "epoch": 0.44254371536300835, + "grad_norm": 2.9430952072143555, + "learning_rate": 5.984679737748086e-06, + "loss": 0.1328, + "step": 17488 + }, + { + "epoch": 0.442569020927702, + "grad_norm": 8.715937614440918, + "learning_rate": 5.98428607530817e-06, + "loss": 0.2584, + "step": 17489 + }, + { + "epoch": 0.44259432649239566, + "grad_norm": 4.800792217254639, + "learning_rate": 5.983892406520778e-06, + "loss": 0.1981, + "step": 17490 + }, + { + "epoch": 0.4426196320570894, + "grad_norm": 20.5609188079834, + "learning_rate": 5.983498731388446e-06, + "loss": 0.1803, + "step": 17491 + }, + { + "epoch": 0.44264493762178303, + "grad_norm": 6.512129783630371, + "learning_rate": 5.983105049913714e-06, + "loss": 0.1402, + "step": 17492 + }, + { + "epoch": 0.4426702431864767, + "grad_norm": 2.7138266563415527, + "learning_rate": 5.9827113620991205e-06, + "loss": 0.1178, + "step": 17493 + }, + { + "epoch": 0.4426955487511704, + "grad_norm": 7.112427234649658, + "learning_rate": 5.982317667947204e-06, + "loss": 0.1766, + "step": 17494 + }, + { + "epoch": 0.44272085431586405, + "grad_norm": 8.235785484313965, + "learning_rate": 5.981923967460504e-06, + "loss": 0.334, + "step": 17495 + }, + { + "epoch": 0.4427461598805577, + "grad_norm": 4.355844497680664, + "learning_rate": 5.981530260641558e-06, + "loss": 0.1958, + "step": 17496 + }, + { + "epoch": 0.4427714654452514, + "grad_norm": 3.255382537841797, + "learning_rate": 5.981136547492907e-06, + "loss": 0.1408, + "step": 17497 + }, + { + "epoch": 0.4427967710099451, + "grad_norm": 4.667569160461426, + "learning_rate": 5.980742828017087e-06, + "loss": 0.2043, + "step": 17498 + }, + { + "epoch": 0.4428220765746388, + "grad_norm": 9.175904273986816, + "learning_rate": 5.98034910221664e-06, + "loss": 0.2403, + "step": 17499 + }, + { + "epoch": 0.44284738213933245, + "grad_norm": 3.8941166400909424, + "learning_rate": 5.979955370094106e-06, + "loss": 0.1825, + "step": 17500 + }, + { + "epoch": 0.4428726877040261, + "grad_norm": 5.324511528015137, + "learning_rate": 5.97956163165202e-06, + "loss": 0.2248, + "step": 17501 + }, + { + "epoch": 0.4428979932687198, + "grad_norm": 4.616427421569824, + "learning_rate": 5.9791678868929235e-06, + "loss": 0.1687, + "step": 17502 + }, + { + "epoch": 0.44292329883341347, + "grad_norm": 10.33081340789795, + "learning_rate": 5.978774135819356e-06, + "loss": 0.1365, + "step": 17503 + }, + { + "epoch": 0.4429486043981071, + "grad_norm": 8.213223457336426, + "learning_rate": 5.978380378433855e-06, + "loss": 0.2789, + "step": 17504 + }, + { + "epoch": 0.44297390996280084, + "grad_norm": 18.27195167541504, + "learning_rate": 5.977986614738962e-06, + "loss": 0.2938, + "step": 17505 + }, + { + "epoch": 0.4429992155274945, + "grad_norm": 7.9855146408081055, + "learning_rate": 5.977592844737215e-06, + "loss": 0.2312, + "step": 17506 + }, + { + "epoch": 0.44302452109218815, + "grad_norm": 9.139443397521973, + "learning_rate": 5.977199068431154e-06, + "loss": 0.2033, + "step": 17507 + }, + { + "epoch": 0.44304982665688186, + "grad_norm": 6.725934982299805, + "learning_rate": 5.976805285823318e-06, + "loss": 0.1876, + "step": 17508 + }, + { + "epoch": 0.4430751322215755, + "grad_norm": 4.495759010314941, + "learning_rate": 5.976411496916246e-06, + "loss": 0.1492, + "step": 17509 + }, + { + "epoch": 0.4431004377862692, + "grad_norm": 3.387298107147217, + "learning_rate": 5.9760177017124765e-06, + "loss": 0.1079, + "step": 17510 + }, + { + "epoch": 0.4431257433509629, + "grad_norm": 4.272172927856445, + "learning_rate": 5.9756239002145525e-06, + "loss": 0.1865, + "step": 17511 + }, + { + "epoch": 0.44315104891565654, + "grad_norm": 5.3670501708984375, + "learning_rate": 5.975230092425011e-06, + "loss": 0.1151, + "step": 17512 + }, + { + "epoch": 0.44317635448035025, + "grad_norm": 3.726883888244629, + "learning_rate": 5.974836278346391e-06, + "loss": 0.0993, + "step": 17513 + }, + { + "epoch": 0.4432016600450439, + "grad_norm": 4.4653520584106445, + "learning_rate": 5.9744424579812336e-06, + "loss": 0.1702, + "step": 17514 + }, + { + "epoch": 0.44322696560973757, + "grad_norm": 21.105745315551758, + "learning_rate": 5.974048631332079e-06, + "loss": 0.3433, + "step": 17515 + }, + { + "epoch": 0.4432522711744313, + "grad_norm": 6.8437323570251465, + "learning_rate": 5.973654798401465e-06, + "loss": 0.1829, + "step": 17516 + }, + { + "epoch": 0.44327757673912493, + "grad_norm": 6.695671081542969, + "learning_rate": 5.973260959191933e-06, + "loss": 0.1968, + "step": 17517 + }, + { + "epoch": 0.4433028823038186, + "grad_norm": 3.912290096282959, + "learning_rate": 5.972867113706021e-06, + "loss": 0.1697, + "step": 17518 + }, + { + "epoch": 0.4433281878685123, + "grad_norm": 4.092713832855225, + "learning_rate": 5.972473261946271e-06, + "loss": 0.127, + "step": 17519 + }, + { + "epoch": 0.44335349343320596, + "grad_norm": 4.135295391082764, + "learning_rate": 5.9720794039152215e-06, + "loss": 0.1595, + "step": 17520 + }, + { + "epoch": 0.4433787989978996, + "grad_norm": 4.813827037811279, + "learning_rate": 5.971685539615413e-06, + "loss": 0.145, + "step": 17521 + }, + { + "epoch": 0.4434041045625933, + "grad_norm": 7.067859649658203, + "learning_rate": 5.971291669049386e-06, + "loss": 0.1205, + "step": 17522 + }, + { + "epoch": 0.443429410127287, + "grad_norm": 3.8938210010528564, + "learning_rate": 5.9708977922196775e-06, + "loss": 0.1651, + "step": 17523 + }, + { + "epoch": 0.4434547156919807, + "grad_norm": 16.288053512573242, + "learning_rate": 5.9705039091288315e-06, + "loss": 0.2103, + "step": 17524 + }, + { + "epoch": 0.44348002125667435, + "grad_norm": 5.193823337554932, + "learning_rate": 5.970110019779386e-06, + "loss": 0.2029, + "step": 17525 + }, + { + "epoch": 0.443505326821368, + "grad_norm": 4.715298175811768, + "learning_rate": 5.969716124173881e-06, + "loss": 0.1845, + "step": 17526 + }, + { + "epoch": 0.4435306323860617, + "grad_norm": 5.488327503204346, + "learning_rate": 5.9693222223148575e-06, + "loss": 0.206, + "step": 17527 + }, + { + "epoch": 0.4435559379507554, + "grad_norm": 6.0899786949157715, + "learning_rate": 5.968928314204855e-06, + "loss": 0.1436, + "step": 17528 + }, + { + "epoch": 0.44358124351544903, + "grad_norm": 3.9796602725982666, + "learning_rate": 5.968534399846414e-06, + "loss": 0.2034, + "step": 17529 + }, + { + "epoch": 0.44360654908014274, + "grad_norm": 4.640303134918213, + "learning_rate": 5.968140479242075e-06, + "loss": 0.1099, + "step": 17530 + }, + { + "epoch": 0.4436318546448364, + "grad_norm": 2.4050405025482178, + "learning_rate": 5.967746552394378e-06, + "loss": 0.0786, + "step": 17531 + }, + { + "epoch": 0.44365716020953005, + "grad_norm": 13.908824920654297, + "learning_rate": 5.967352619305863e-06, + "loss": 0.2088, + "step": 17532 + }, + { + "epoch": 0.44368246577422377, + "grad_norm": 4.24261474609375, + "learning_rate": 5.966958679979071e-06, + "loss": 0.1311, + "step": 17533 + }, + { + "epoch": 0.4437077713389174, + "grad_norm": 12.33821964263916, + "learning_rate": 5.966564734416544e-06, + "loss": 0.2526, + "step": 17534 + }, + { + "epoch": 0.4437330769036111, + "grad_norm": 3.4747071266174316, + "learning_rate": 5.966170782620819e-06, + "loss": 0.1218, + "step": 17535 + }, + { + "epoch": 0.4437583824683048, + "grad_norm": 7.600726127624512, + "learning_rate": 5.965776824594438e-06, + "loss": 0.2038, + "step": 17536 + }, + { + "epoch": 0.44378368803299845, + "grad_norm": 4.442448616027832, + "learning_rate": 5.965382860339943e-06, + "loss": 0.149, + "step": 17537 + }, + { + "epoch": 0.44380899359769216, + "grad_norm": 6.916049480438232, + "learning_rate": 5.964988889859874e-06, + "loss": 0.2323, + "step": 17538 + }, + { + "epoch": 0.4438342991623858, + "grad_norm": 5.368960380554199, + "learning_rate": 5.964594913156771e-06, + "loss": 0.2498, + "step": 17539 + }, + { + "epoch": 0.44385960472707947, + "grad_norm": 4.431614398956299, + "learning_rate": 5.964200930233174e-06, + "loss": 0.1493, + "step": 17540 + }, + { + "epoch": 0.4438849102917732, + "grad_norm": 6.885064601898193, + "learning_rate": 5.963806941091625e-06, + "loss": 0.2213, + "step": 17541 + }, + { + "epoch": 0.44391021585646684, + "grad_norm": 3.8846936225891113, + "learning_rate": 5.963412945734663e-06, + "loss": 0.1146, + "step": 17542 + }, + { + "epoch": 0.4439355214211605, + "grad_norm": 10.857306480407715, + "learning_rate": 5.963018944164831e-06, + "loss": 0.1382, + "step": 17543 + }, + { + "epoch": 0.4439608269858542, + "grad_norm": 37.249847412109375, + "learning_rate": 5.96262493638467e-06, + "loss": 0.2001, + "step": 17544 + }, + { + "epoch": 0.44398613255054786, + "grad_norm": 4.8540191650390625, + "learning_rate": 5.962230922396719e-06, + "loss": 0.1773, + "step": 17545 + }, + { + "epoch": 0.4440114381152415, + "grad_norm": 5.206819534301758, + "learning_rate": 5.9618369022035205e-06, + "loss": 0.1437, + "step": 17546 + }, + { + "epoch": 0.44403674367993523, + "grad_norm": 5.234552383422852, + "learning_rate": 5.961442875807614e-06, + "loss": 0.1358, + "step": 17547 + }, + { + "epoch": 0.4440620492446289, + "grad_norm": 3.10097074508667, + "learning_rate": 5.961048843211542e-06, + "loss": 0.1313, + "step": 17548 + }, + { + "epoch": 0.44408735480932254, + "grad_norm": 3.9055047035217285, + "learning_rate": 5.960654804417843e-06, + "loss": 0.1875, + "step": 17549 + }, + { + "epoch": 0.44411266037401625, + "grad_norm": 7.181069374084473, + "learning_rate": 5.960260759429062e-06, + "loss": 0.2357, + "step": 17550 + }, + { + "epoch": 0.4441379659387099, + "grad_norm": 7.1432905197143555, + "learning_rate": 5.959866708247737e-06, + "loss": 0.1354, + "step": 17551 + }, + { + "epoch": 0.4441632715034036, + "grad_norm": 8.50016975402832, + "learning_rate": 5.95947265087641e-06, + "loss": 0.131, + "step": 17552 + }, + { + "epoch": 0.4441885770680973, + "grad_norm": 3.0423831939697266, + "learning_rate": 5.959078587317623e-06, + "loss": 0.1121, + "step": 17553 + }, + { + "epoch": 0.44421388263279094, + "grad_norm": 4.21330451965332, + "learning_rate": 5.958684517573916e-06, + "loss": 0.1442, + "step": 17554 + }, + { + "epoch": 0.44423918819748465, + "grad_norm": 4.772010803222656, + "learning_rate": 5.958290441647832e-06, + "loss": 0.1495, + "step": 17555 + }, + { + "epoch": 0.4442644937621783, + "grad_norm": 7.010209560394287, + "learning_rate": 5.957896359541909e-06, + "loss": 0.1633, + "step": 17556 + }, + { + "epoch": 0.44428979932687196, + "grad_norm": 8.042009353637695, + "learning_rate": 5.957502271258693e-06, + "loss": 0.1635, + "step": 17557 + }, + { + "epoch": 0.44431510489156567, + "grad_norm": 6.178457736968994, + "learning_rate": 5.95710817680072e-06, + "loss": 0.1996, + "step": 17558 + }, + { + "epoch": 0.4443404104562593, + "grad_norm": 6.701766490936279, + "learning_rate": 5.956714076170536e-06, + "loss": 0.1815, + "step": 17559 + }, + { + "epoch": 0.444365716020953, + "grad_norm": 3.8923885822296143, + "learning_rate": 5.956319969370682e-06, + "loss": 0.1513, + "step": 17560 + }, + { + "epoch": 0.4443910215856467, + "grad_norm": 5.659015655517578, + "learning_rate": 5.955925856403697e-06, + "loss": 0.1843, + "step": 17561 + }, + { + "epoch": 0.44441632715034035, + "grad_norm": 7.258667945861816, + "learning_rate": 5.955531737272125e-06, + "loss": 0.2177, + "step": 17562 + }, + { + "epoch": 0.44444163271503406, + "grad_norm": 5.715603828430176, + "learning_rate": 5.9551376119785044e-06, + "loss": 0.2028, + "step": 17563 + }, + { + "epoch": 0.4444669382797277, + "grad_norm": 7.421856880187988, + "learning_rate": 5.954743480525382e-06, + "loss": 0.2861, + "step": 17564 + }, + { + "epoch": 0.4444922438444214, + "grad_norm": 5.581406116485596, + "learning_rate": 5.954349342915294e-06, + "loss": 0.1811, + "step": 17565 + }, + { + "epoch": 0.4445175494091151, + "grad_norm": 3.1371328830718994, + "learning_rate": 5.953955199150786e-06, + "loss": 0.1577, + "step": 17566 + }, + { + "epoch": 0.44454285497380874, + "grad_norm": 6.376339435577393, + "learning_rate": 5.953561049234397e-06, + "loss": 0.1138, + "step": 17567 + }, + { + "epoch": 0.4445681605385024, + "grad_norm": 2.8060193061828613, + "learning_rate": 5.953166893168672e-06, + "loss": 0.1807, + "step": 17568 + }, + { + "epoch": 0.4445934661031961, + "grad_norm": 10.54316234588623, + "learning_rate": 5.95277273095615e-06, + "loss": 0.3032, + "step": 17569 + }, + { + "epoch": 0.44461877166788977, + "grad_norm": 4.680283546447754, + "learning_rate": 5.952378562599375e-06, + "loss": 0.1714, + "step": 17570 + }, + { + "epoch": 0.4446440772325834, + "grad_norm": 10.869783401489258, + "learning_rate": 5.951984388100886e-06, + "loss": 0.2456, + "step": 17571 + }, + { + "epoch": 0.44466938279727714, + "grad_norm": 10.505690574645996, + "learning_rate": 5.951590207463228e-06, + "loss": 0.2219, + "step": 17572 + }, + { + "epoch": 0.4446946883619708, + "grad_norm": 7.463143348693848, + "learning_rate": 5.9511960206889404e-06, + "loss": 0.1732, + "step": 17573 + }, + { + "epoch": 0.44471999392666445, + "grad_norm": 12.279876708984375, + "learning_rate": 5.950801827780568e-06, + "loss": 0.3001, + "step": 17574 + }, + { + "epoch": 0.44474529949135816, + "grad_norm": 3.9932491779327393, + "learning_rate": 5.950407628740651e-06, + "loss": 0.176, + "step": 17575 + }, + { + "epoch": 0.4447706050560518, + "grad_norm": 8.31275749206543, + "learning_rate": 5.950013423571732e-06, + "loss": 0.1957, + "step": 17576 + }, + { + "epoch": 0.4447959106207455, + "grad_norm": 6.727406978607178, + "learning_rate": 5.949619212276354e-06, + "loss": 0.1566, + "step": 17577 + }, + { + "epoch": 0.4448212161854392, + "grad_norm": 3.2881250381469727, + "learning_rate": 5.949224994857058e-06, + "loss": 0.1061, + "step": 17578 + }, + { + "epoch": 0.44484652175013284, + "grad_norm": 4.612724304199219, + "learning_rate": 5.948830771316385e-06, + "loss": 0.1641, + "step": 17579 + }, + { + "epoch": 0.44487182731482655, + "grad_norm": 7.08917760848999, + "learning_rate": 5.948436541656882e-06, + "loss": 0.1496, + "step": 17580 + }, + { + "epoch": 0.4448971328795202, + "grad_norm": 3.8571414947509766, + "learning_rate": 5.948042305881085e-06, + "loss": 0.1243, + "step": 17581 + }, + { + "epoch": 0.44492243844421386, + "grad_norm": 7.803898334503174, + "learning_rate": 5.947648063991541e-06, + "loss": 0.1707, + "step": 17582 + }, + { + "epoch": 0.4449477440089076, + "grad_norm": 8.108543395996094, + "learning_rate": 5.947253815990791e-06, + "loss": 0.1696, + "step": 17583 + }, + { + "epoch": 0.44497304957360123, + "grad_norm": 4.239437103271484, + "learning_rate": 5.946859561881377e-06, + "loss": 0.1757, + "step": 17584 + }, + { + "epoch": 0.4449983551382949, + "grad_norm": 9.639147758483887, + "learning_rate": 5.94646530166584e-06, + "loss": 0.1881, + "step": 17585 + }, + { + "epoch": 0.4450236607029886, + "grad_norm": 4.422510623931885, + "learning_rate": 5.946071035346728e-06, + "loss": 0.1799, + "step": 17586 + }, + { + "epoch": 0.44504896626768226, + "grad_norm": 4.086554050445557, + "learning_rate": 5.945676762926577e-06, + "loss": 0.1635, + "step": 17587 + }, + { + "epoch": 0.4450742718323759, + "grad_norm": 6.139786720275879, + "learning_rate": 5.945282484407934e-06, + "loss": 0.1885, + "step": 17588 + }, + { + "epoch": 0.4450995773970696, + "grad_norm": 8.140674591064453, + "learning_rate": 5.944888199793338e-06, + "loss": 0.1667, + "step": 17589 + }, + { + "epoch": 0.4451248829617633, + "grad_norm": 3.361158609390259, + "learning_rate": 5.944493909085336e-06, + "loss": 0.1394, + "step": 17590 + }, + { + "epoch": 0.445150188526457, + "grad_norm": 8.549291610717773, + "learning_rate": 5.944099612286468e-06, + "loss": 0.2168, + "step": 17591 + }, + { + "epoch": 0.44517549409115065, + "grad_norm": 3.862700939178467, + "learning_rate": 5.943705309399277e-06, + "loss": 0.1868, + "step": 17592 + }, + { + "epoch": 0.4452007996558443, + "grad_norm": 9.025004386901855, + "learning_rate": 5.943311000426306e-06, + "loss": 0.2644, + "step": 17593 + }, + { + "epoch": 0.445226105220538, + "grad_norm": 5.236820220947266, + "learning_rate": 5.942916685370098e-06, + "loss": 0.1975, + "step": 17594 + }, + { + "epoch": 0.44525141078523167, + "grad_norm": 6.68310022354126, + "learning_rate": 5.9425223642331955e-06, + "loss": 0.2168, + "step": 17595 + }, + { + "epoch": 0.44527671634992533, + "grad_norm": 3.7824151515960693, + "learning_rate": 5.94212803701814e-06, + "loss": 0.0864, + "step": 17596 + }, + { + "epoch": 0.44530202191461904, + "grad_norm": 2.457509994506836, + "learning_rate": 5.9417337037274795e-06, + "loss": 0.1127, + "step": 17597 + }, + { + "epoch": 0.4453273274793127, + "grad_norm": 3.271277904510498, + "learning_rate": 5.941339364363751e-06, + "loss": 0.1586, + "step": 17598 + }, + { + "epoch": 0.44535263304400635, + "grad_norm": 5.470251560211182, + "learning_rate": 5.940945018929501e-06, + "loss": 0.2048, + "step": 17599 + }, + { + "epoch": 0.44537793860870006, + "grad_norm": 4.617398738861084, + "learning_rate": 5.940550667427271e-06, + "loss": 0.171, + "step": 17600 + }, + { + "epoch": 0.4454032441733937, + "grad_norm": 3.1865148544311523, + "learning_rate": 5.940156309859605e-06, + "loss": 0.1228, + "step": 17601 + }, + { + "epoch": 0.44542854973808743, + "grad_norm": 6.348570346832275, + "learning_rate": 5.9397619462290454e-06, + "loss": 0.2917, + "step": 17602 + }, + { + "epoch": 0.4454538553027811, + "grad_norm": 3.5128936767578125, + "learning_rate": 5.9393675765381365e-06, + "loss": 0.1321, + "step": 17603 + }, + { + "epoch": 0.44547916086747474, + "grad_norm": 5.018960475921631, + "learning_rate": 5.93897320078942e-06, + "loss": 0.1603, + "step": 17604 + }, + { + "epoch": 0.44550446643216846, + "grad_norm": 9.652339935302734, + "learning_rate": 5.938578818985441e-06, + "loss": 0.2306, + "step": 17605 + }, + { + "epoch": 0.4455297719968621, + "grad_norm": 3.003086805343628, + "learning_rate": 5.938184431128741e-06, + "loss": 0.1455, + "step": 17606 + }, + { + "epoch": 0.44555507756155577, + "grad_norm": 4.955486297607422, + "learning_rate": 5.937790037221865e-06, + "loss": 0.1986, + "step": 17607 + }, + { + "epoch": 0.4455803831262495, + "grad_norm": 7.283166408538818, + "learning_rate": 5.937395637267354e-06, + "loss": 0.14, + "step": 17608 + }, + { + "epoch": 0.44560568869094314, + "grad_norm": 12.080907821655273, + "learning_rate": 5.937001231267755e-06, + "loss": 0.1953, + "step": 17609 + }, + { + "epoch": 0.4456309942556368, + "grad_norm": 14.286191940307617, + "learning_rate": 5.9366068192256076e-06, + "loss": 0.2554, + "step": 17610 + }, + { + "epoch": 0.4456562998203305, + "grad_norm": 3.9339706897735596, + "learning_rate": 5.936212401143458e-06, + "loss": 0.2283, + "step": 17611 + }, + { + "epoch": 0.44568160538502416, + "grad_norm": 7.500340461730957, + "learning_rate": 5.9358179770238485e-06, + "loss": 0.2406, + "step": 17612 + }, + { + "epoch": 0.4457069109497178, + "grad_norm": 3.9759624004364014, + "learning_rate": 5.935423546869323e-06, + "loss": 0.1428, + "step": 17613 + }, + { + "epoch": 0.44573221651441153, + "grad_norm": 5.557118892669678, + "learning_rate": 5.9350291106824255e-06, + "loss": 0.2964, + "step": 17614 + }, + { + "epoch": 0.4457575220791052, + "grad_norm": 12.823001861572266, + "learning_rate": 5.934634668465699e-06, + "loss": 0.4558, + "step": 17615 + }, + { + "epoch": 0.4457828276437989, + "grad_norm": 4.674006462097168, + "learning_rate": 5.934240220221687e-06, + "loss": 0.1022, + "step": 17616 + }, + { + "epoch": 0.44580813320849255, + "grad_norm": 2.6583657264709473, + "learning_rate": 5.933845765952932e-06, + "loss": 0.1231, + "step": 17617 + }, + { + "epoch": 0.4458334387731862, + "grad_norm": 8.140459060668945, + "learning_rate": 5.9334513056619815e-06, + "loss": 0.1468, + "step": 17618 + }, + { + "epoch": 0.4458587443378799, + "grad_norm": 3.7417054176330566, + "learning_rate": 5.933056839351376e-06, + "loss": 0.1021, + "step": 17619 + }, + { + "epoch": 0.4458840499025736, + "grad_norm": 3.007988691329956, + "learning_rate": 5.932662367023661e-06, + "loss": 0.1305, + "step": 17620 + }, + { + "epoch": 0.44590935546726723, + "grad_norm": 3.763984441757202, + "learning_rate": 5.932267888681381e-06, + "loss": 0.1567, + "step": 17621 + }, + { + "epoch": 0.44593466103196094, + "grad_norm": 4.426640510559082, + "learning_rate": 5.931873404327078e-06, + "loss": 0.1289, + "step": 17622 + }, + { + "epoch": 0.4459599665966546, + "grad_norm": 8.735203742980957, + "learning_rate": 5.931478913963297e-06, + "loss": 0.2437, + "step": 17623 + }, + { + "epoch": 0.44598527216134826, + "grad_norm": 5.257031440734863, + "learning_rate": 5.93108441759258e-06, + "loss": 0.1635, + "step": 17624 + }, + { + "epoch": 0.44601057772604197, + "grad_norm": 5.218186855316162, + "learning_rate": 5.930689915217475e-06, + "loss": 0.237, + "step": 17625 + }, + { + "epoch": 0.4460358832907356, + "grad_norm": 8.85445499420166, + "learning_rate": 5.930295406840523e-06, + "loss": 0.2984, + "step": 17626 + }, + { + "epoch": 0.44606118885542934, + "grad_norm": 4.665561676025391, + "learning_rate": 5.929900892464267e-06, + "loss": 0.2006, + "step": 17627 + }, + { + "epoch": 0.446086494420123, + "grad_norm": 4.848722457885742, + "learning_rate": 5.929506372091256e-06, + "loss": 0.1443, + "step": 17628 + }, + { + "epoch": 0.44611179998481665, + "grad_norm": 6.887314796447754, + "learning_rate": 5.92911184572403e-06, + "loss": 0.1456, + "step": 17629 + }, + { + "epoch": 0.44613710554951036, + "grad_norm": 4.978161811828613, + "learning_rate": 5.928717313365136e-06, + "loss": 0.2292, + "step": 17630 + }, + { + "epoch": 0.446162411114204, + "grad_norm": 4.873518466949463, + "learning_rate": 5.928322775017115e-06, + "loss": 0.1316, + "step": 17631 + }, + { + "epoch": 0.4461877166788977, + "grad_norm": 3.6081724166870117, + "learning_rate": 5.927928230682515e-06, + "loss": 0.1856, + "step": 17632 + }, + { + "epoch": 0.4462130222435914, + "grad_norm": 4.594324588775635, + "learning_rate": 5.927533680363876e-06, + "loss": 0.1429, + "step": 17633 + }, + { + "epoch": 0.44623832780828504, + "grad_norm": 4.581669330596924, + "learning_rate": 5.9271391240637464e-06, + "loss": 0.1906, + "step": 17634 + }, + { + "epoch": 0.4462636333729787, + "grad_norm": 4.912128448486328, + "learning_rate": 5.926744561784669e-06, + "loss": 0.1304, + "step": 17635 + }, + { + "epoch": 0.4462889389376724, + "grad_norm": 5.764298915863037, + "learning_rate": 5.926349993529188e-06, + "loss": 0.1711, + "step": 17636 + }, + { + "epoch": 0.44631424450236606, + "grad_norm": 3.2105534076690674, + "learning_rate": 5.925955419299848e-06, + "loss": 0.0904, + "step": 17637 + }, + { + "epoch": 0.4463395500670597, + "grad_norm": 6.873018741607666, + "learning_rate": 5.925560839099195e-06, + "loss": 0.1646, + "step": 17638 + }, + { + "epoch": 0.44636485563175343, + "grad_norm": 3.1302032470703125, + "learning_rate": 5.9251662529297716e-06, + "loss": 0.0769, + "step": 17639 + }, + { + "epoch": 0.4463901611964471, + "grad_norm": 4.736154556274414, + "learning_rate": 5.924771660794122e-06, + "loss": 0.1533, + "step": 17640 + }, + { + "epoch": 0.4464154667611408, + "grad_norm": 6.516411304473877, + "learning_rate": 5.924377062694794e-06, + "loss": 0.2056, + "step": 17641 + }, + { + "epoch": 0.44644077232583446, + "grad_norm": 14.605110168457031, + "learning_rate": 5.923982458634328e-06, + "loss": 0.1876, + "step": 17642 + }, + { + "epoch": 0.4464660778905281, + "grad_norm": 5.297914028167725, + "learning_rate": 5.923587848615273e-06, + "loss": 0.1751, + "step": 17643 + }, + { + "epoch": 0.4464913834552218, + "grad_norm": 5.05255126953125, + "learning_rate": 5.92319323264017e-06, + "loss": 0.1426, + "step": 17644 + }, + { + "epoch": 0.4465166890199155, + "grad_norm": 2.1945247650146484, + "learning_rate": 5.922798610711566e-06, + "loss": 0.0735, + "step": 17645 + }, + { + "epoch": 0.44654199458460914, + "grad_norm": 3.0261118412017822, + "learning_rate": 5.922403982832005e-06, + "loss": 0.1513, + "step": 17646 + }, + { + "epoch": 0.44656730014930285, + "grad_norm": 4.358367443084717, + "learning_rate": 5.922009349004035e-06, + "loss": 0.1119, + "step": 17647 + }, + { + "epoch": 0.4465926057139965, + "grad_norm": 3.0719306468963623, + "learning_rate": 5.921614709230196e-06, + "loss": 0.1943, + "step": 17648 + }, + { + "epoch": 0.44661791127869016, + "grad_norm": 7.338424205780029, + "learning_rate": 5.921220063513035e-06, + "loss": 0.1769, + "step": 17649 + }, + { + "epoch": 0.4466432168433839, + "grad_norm": 5.291741371154785, + "learning_rate": 5.920825411855098e-06, + "loss": 0.2007, + "step": 17650 + }, + { + "epoch": 0.44666852240807753, + "grad_norm": 4.170469284057617, + "learning_rate": 5.920430754258928e-06, + "loss": 0.1379, + "step": 17651 + }, + { + "epoch": 0.4466938279727712, + "grad_norm": 6.719758033752441, + "learning_rate": 5.920036090727072e-06, + "loss": 0.1302, + "step": 17652 + }, + { + "epoch": 0.4467191335374649, + "grad_norm": 4.387815475463867, + "learning_rate": 5.919641421262076e-06, + "loss": 0.1729, + "step": 17653 + }, + { + "epoch": 0.44674443910215855, + "grad_norm": 6.578435897827148, + "learning_rate": 5.919246745866481e-06, + "loss": 0.1997, + "step": 17654 + }, + { + "epoch": 0.44676974466685226, + "grad_norm": 3.367806911468506, + "learning_rate": 5.918852064542836e-06, + "loss": 0.1382, + "step": 17655 + }, + { + "epoch": 0.4467950502315459, + "grad_norm": 6.69748067855835, + "learning_rate": 5.918457377293686e-06, + "loss": 0.1636, + "step": 17656 + }, + { + "epoch": 0.4468203557962396, + "grad_norm": 11.723243713378906, + "learning_rate": 5.918062684121573e-06, + "loss": 0.189, + "step": 17657 + }, + { + "epoch": 0.4468456613609333, + "grad_norm": 5.618924617767334, + "learning_rate": 5.917667985029047e-06, + "loss": 0.1736, + "step": 17658 + }, + { + "epoch": 0.44687096692562694, + "grad_norm": 4.679636478424072, + "learning_rate": 5.917273280018648e-06, + "loss": 0.1729, + "step": 17659 + }, + { + "epoch": 0.4468962724903206, + "grad_norm": 18.290719985961914, + "learning_rate": 5.916878569092925e-06, + "loss": 0.174, + "step": 17660 + }, + { + "epoch": 0.4469215780550143, + "grad_norm": 2.798269510269165, + "learning_rate": 5.9164838522544244e-06, + "loss": 0.1084, + "step": 17661 + }, + { + "epoch": 0.44694688361970797, + "grad_norm": 2.64030122756958, + "learning_rate": 5.916089129505688e-06, + "loss": 0.1161, + "step": 17662 + }, + { + "epoch": 0.4469721891844016, + "grad_norm": 7.371844291687012, + "learning_rate": 5.915694400849265e-06, + "loss": 0.2075, + "step": 17663 + }, + { + "epoch": 0.44699749474909534, + "grad_norm": 2.9342615604400635, + "learning_rate": 5.9152996662876975e-06, + "loss": 0.1233, + "step": 17664 + }, + { + "epoch": 0.447022800313789, + "grad_norm": 5.2642292976379395, + "learning_rate": 5.914904925823534e-06, + "loss": 0.1957, + "step": 17665 + }, + { + "epoch": 0.4470481058784827, + "grad_norm": 3.5158185958862305, + "learning_rate": 5.914510179459318e-06, + "loss": 0.1727, + "step": 17666 + }, + { + "epoch": 0.44707341144317636, + "grad_norm": 3.5271217823028564, + "learning_rate": 5.914115427197595e-06, + "loss": 0.0989, + "step": 17667 + }, + { + "epoch": 0.44709871700787, + "grad_norm": 9.714454650878906, + "learning_rate": 5.913720669040914e-06, + "loss": 0.2558, + "step": 17668 + }, + { + "epoch": 0.44712402257256373, + "grad_norm": 4.282528877258301, + "learning_rate": 5.913325904991817e-06, + "loss": 0.1694, + "step": 17669 + }, + { + "epoch": 0.4471493281372574, + "grad_norm": 3.4076406955718994, + "learning_rate": 5.912931135052852e-06, + "loss": 0.1252, + "step": 17670 + }, + { + "epoch": 0.44717463370195104, + "grad_norm": 2.450507879257202, + "learning_rate": 5.912536359226563e-06, + "loss": 0.1017, + "step": 17671 + }, + { + "epoch": 0.44719993926664475, + "grad_norm": 4.7015485763549805, + "learning_rate": 5.912141577515497e-06, + "loss": 0.1856, + "step": 17672 + }, + { + "epoch": 0.4472252448313384, + "grad_norm": 4.223250865936279, + "learning_rate": 5.911746789922199e-06, + "loss": 0.1694, + "step": 17673 + }, + { + "epoch": 0.44725055039603206, + "grad_norm": 5.784406661987305, + "learning_rate": 5.911351996449217e-06, + "loss": 0.1811, + "step": 17674 + }, + { + "epoch": 0.4472758559607258, + "grad_norm": 3.5362887382507324, + "learning_rate": 5.910957197099094e-06, + "loss": 0.1672, + "step": 17675 + }, + { + "epoch": 0.44730116152541943, + "grad_norm": 10.870721817016602, + "learning_rate": 5.91056239187438e-06, + "loss": 0.2418, + "step": 17676 + }, + { + "epoch": 0.4473264670901131, + "grad_norm": 9.895663261413574, + "learning_rate": 5.910167580777615e-06, + "loss": 0.1912, + "step": 17677 + }, + { + "epoch": 0.4473517726548068, + "grad_norm": 6.790787220001221, + "learning_rate": 5.909772763811351e-06, + "loss": 0.1184, + "step": 17678 + }, + { + "epoch": 0.44737707821950046, + "grad_norm": 3.6435790061950684, + "learning_rate": 5.9093779409781305e-06, + "loss": 0.1177, + "step": 17679 + }, + { + "epoch": 0.44740238378419417, + "grad_norm": 6.037413597106934, + "learning_rate": 5.908983112280502e-06, + "loss": 0.1358, + "step": 17680 + }, + { + "epoch": 0.4474276893488878, + "grad_norm": 3.6088943481445312, + "learning_rate": 5.908588277721009e-06, + "loss": 0.0668, + "step": 17681 + }, + { + "epoch": 0.4474529949135815, + "grad_norm": 10.13019847869873, + "learning_rate": 5.908193437302199e-06, + "loss": 0.2124, + "step": 17682 + }, + { + "epoch": 0.4474783004782752, + "grad_norm": 3.6302027702331543, + "learning_rate": 5.90779859102662e-06, + "loss": 0.0328, + "step": 17683 + }, + { + "epoch": 0.44750360604296885, + "grad_norm": 4.769733428955078, + "learning_rate": 5.907403738896815e-06, + "loss": 0.1169, + "step": 17684 + }, + { + "epoch": 0.4475289116076625, + "grad_norm": 4.77787446975708, + "learning_rate": 5.907008880915335e-06, + "loss": 0.1297, + "step": 17685 + }, + { + "epoch": 0.4475542171723562, + "grad_norm": 10.158708572387695, + "learning_rate": 5.906614017084721e-06, + "loss": 0.1679, + "step": 17686 + }, + { + "epoch": 0.4475795227370499, + "grad_norm": 5.324489116668701, + "learning_rate": 5.906219147407522e-06, + "loss": 0.2445, + "step": 17687 + }, + { + "epoch": 0.44760482830174353, + "grad_norm": 5.524595737457275, + "learning_rate": 5.905824271886286e-06, + "loss": 0.217, + "step": 17688 + }, + { + "epoch": 0.44763013386643724, + "grad_norm": 5.948727130889893, + "learning_rate": 5.905429390523556e-06, + "loss": 0.1381, + "step": 17689 + }, + { + "epoch": 0.4476554394311309, + "grad_norm": 4.292420387268066, + "learning_rate": 5.905034503321881e-06, + "loss": 0.1812, + "step": 17690 + }, + { + "epoch": 0.4476807449958246, + "grad_norm": 4.534790992736816, + "learning_rate": 5.904639610283806e-06, + "loss": 0.1822, + "step": 17691 + }, + { + "epoch": 0.44770605056051826, + "grad_norm": 6.065516948699951, + "learning_rate": 5.904244711411879e-06, + "loss": 0.2081, + "step": 17692 + }, + { + "epoch": 0.4477313561252119, + "grad_norm": 7.683677673339844, + "learning_rate": 5.903849806708646e-06, + "loss": 0.2793, + "step": 17693 + }, + { + "epoch": 0.44775666168990563, + "grad_norm": 4.684831619262695, + "learning_rate": 5.903454896176655e-06, + "loss": 0.089, + "step": 17694 + }, + { + "epoch": 0.4477819672545993, + "grad_norm": 3.177314519882202, + "learning_rate": 5.903059979818451e-06, + "loss": 0.1317, + "step": 17695 + }, + { + "epoch": 0.44780727281929295, + "grad_norm": 7.495219707489014, + "learning_rate": 5.90266505763658e-06, + "loss": 0.0848, + "step": 17696 + }, + { + "epoch": 0.44783257838398666, + "grad_norm": 3.932246208190918, + "learning_rate": 5.9022701296335906e-06, + "loss": 0.1879, + "step": 17697 + }, + { + "epoch": 0.4478578839486803, + "grad_norm": 9.194354057312012, + "learning_rate": 5.901875195812028e-06, + "loss": 0.1845, + "step": 17698 + }, + { + "epoch": 0.44788318951337397, + "grad_norm": 4.489124298095703, + "learning_rate": 5.901480256174442e-06, + "loss": 0.2015, + "step": 17699 + }, + { + "epoch": 0.4479084950780677, + "grad_norm": 8.84531307220459, + "learning_rate": 5.901085310723377e-06, + "loss": 0.2415, + "step": 17700 + }, + { + "epoch": 0.44793380064276134, + "grad_norm": 5.416690826416016, + "learning_rate": 5.900690359461379e-06, + "loss": 0.0736, + "step": 17701 + }, + { + "epoch": 0.447959106207455, + "grad_norm": 2.3200244903564453, + "learning_rate": 5.9002954023909965e-06, + "loss": 0.1123, + "step": 17702 + }, + { + "epoch": 0.4479844117721487, + "grad_norm": 7.53682279586792, + "learning_rate": 5.899900439514779e-06, + "loss": 0.1858, + "step": 17703 + }, + { + "epoch": 0.44800971733684236, + "grad_norm": 5.447776794433594, + "learning_rate": 5.899505470835267e-06, + "loss": 0.1195, + "step": 17704 + }, + { + "epoch": 0.4480350229015361, + "grad_norm": 8.750325202941895, + "learning_rate": 5.899110496355015e-06, + "loss": 0.1716, + "step": 17705 + }, + { + "epoch": 0.44806032846622973, + "grad_norm": 3.8752992153167725, + "learning_rate": 5.898715516076565e-06, + "loss": 0.1219, + "step": 17706 + }, + { + "epoch": 0.4480856340309234, + "grad_norm": 8.027374267578125, + "learning_rate": 5.898320530002466e-06, + "loss": 0.1615, + "step": 17707 + }, + { + "epoch": 0.4481109395956171, + "grad_norm": 6.056629180908203, + "learning_rate": 5.897925538135265e-06, + "loss": 0.2765, + "step": 17708 + }, + { + "epoch": 0.44813624516031075, + "grad_norm": 3.9513895511627197, + "learning_rate": 5.89753054047751e-06, + "loss": 0.0954, + "step": 17709 + }, + { + "epoch": 0.4481615507250044, + "grad_norm": 12.431145668029785, + "learning_rate": 5.897135537031746e-06, + "loss": 0.3548, + "step": 17710 + }, + { + "epoch": 0.4481868562896981, + "grad_norm": 3.7342641353607178, + "learning_rate": 5.896740527800524e-06, + "loss": 0.1449, + "step": 17711 + }, + { + "epoch": 0.4482121618543918, + "grad_norm": 4.128779888153076, + "learning_rate": 5.896345512786387e-06, + "loss": 0.1756, + "step": 17712 + }, + { + "epoch": 0.44823746741908543, + "grad_norm": 2.9736368656158447, + "learning_rate": 5.895950491991884e-06, + "loss": 0.1209, + "step": 17713 + }, + { + "epoch": 0.44826277298377915, + "grad_norm": 4.126072883605957, + "learning_rate": 5.8955554654195655e-06, + "loss": 0.1646, + "step": 17714 + }, + { + "epoch": 0.4482880785484728, + "grad_norm": 3.3601253032684326, + "learning_rate": 5.895160433071975e-06, + "loss": 0.1555, + "step": 17715 + }, + { + "epoch": 0.44831338411316646, + "grad_norm": 4.024438381195068, + "learning_rate": 5.894765394951661e-06, + "loss": 0.1792, + "step": 17716 + }, + { + "epoch": 0.44833868967786017, + "grad_norm": 4.095068454742432, + "learning_rate": 5.894370351061171e-06, + "loss": 0.1248, + "step": 17717 + }, + { + "epoch": 0.4483639952425538, + "grad_norm": 5.124794006347656, + "learning_rate": 5.8939753014030554e-06, + "loss": 0.2135, + "step": 17718 + }, + { + "epoch": 0.44838930080724754, + "grad_norm": 2.789762258529663, + "learning_rate": 5.893580245979857e-06, + "loss": 0.129, + "step": 17719 + }, + { + "epoch": 0.4484146063719412, + "grad_norm": 7.337907314300537, + "learning_rate": 5.893185184794128e-06, + "loss": 0.2435, + "step": 17720 + }, + { + "epoch": 0.44843991193663485, + "grad_norm": 6.792024612426758, + "learning_rate": 5.892790117848411e-06, + "loss": 0.2358, + "step": 17721 + }, + { + "epoch": 0.44846521750132856, + "grad_norm": 3.7834115028381348, + "learning_rate": 5.89239504514526e-06, + "loss": 0.1917, + "step": 17722 + }, + { + "epoch": 0.4484905230660222, + "grad_norm": 4.255050182342529, + "learning_rate": 5.891999966687216e-06, + "loss": 0.1715, + "step": 17723 + }, + { + "epoch": 0.4485158286307159, + "grad_norm": 5.2513580322265625, + "learning_rate": 5.891604882476833e-06, + "loss": 0.1747, + "step": 17724 + }, + { + "epoch": 0.4485411341954096, + "grad_norm": 5.73833703994751, + "learning_rate": 5.891209792516654e-06, + "loss": 0.1703, + "step": 17725 + }, + { + "epoch": 0.44856643976010324, + "grad_norm": 8.41666316986084, + "learning_rate": 5.89081469680923e-06, + "loss": 0.1695, + "step": 17726 + }, + { + "epoch": 0.4485917453247969, + "grad_norm": 8.246052742004395, + "learning_rate": 5.890419595357106e-06, + "loss": 0.1832, + "step": 17727 + }, + { + "epoch": 0.4486170508894906, + "grad_norm": 4.981477737426758, + "learning_rate": 5.890024488162835e-06, + "loss": 0.1529, + "step": 17728 + }, + { + "epoch": 0.44864235645418427, + "grad_norm": 3.296046018600464, + "learning_rate": 5.889629375228959e-06, + "loss": 0.1048, + "step": 17729 + }, + { + "epoch": 0.448667662018878, + "grad_norm": 4.700202465057373, + "learning_rate": 5.889234256558029e-06, + "loss": 0.209, + "step": 17730 + }, + { + "epoch": 0.44869296758357163, + "grad_norm": 2.6038875579833984, + "learning_rate": 5.8888391321525925e-06, + "loss": 0.1156, + "step": 17731 + }, + { + "epoch": 0.4487182731482653, + "grad_norm": 3.9785988330841064, + "learning_rate": 5.888444002015198e-06, + "loss": 0.1429, + "step": 17732 + }, + { + "epoch": 0.448743578712959, + "grad_norm": 5.826600551605225, + "learning_rate": 5.8880488661483945e-06, + "loss": 0.1508, + "step": 17733 + }, + { + "epoch": 0.44876888427765266, + "grad_norm": 6.033418655395508, + "learning_rate": 5.887653724554727e-06, + "loss": 0.18, + "step": 17734 + }, + { + "epoch": 0.4487941898423463, + "grad_norm": 6.803613662719727, + "learning_rate": 5.887258577236746e-06, + "loss": 0.1687, + "step": 17735 + }, + { + "epoch": 0.44881949540704, + "grad_norm": 3.8940467834472656, + "learning_rate": 5.886863424197001e-06, + "loss": 0.2232, + "step": 17736 + }, + { + "epoch": 0.4488448009717337, + "grad_norm": 6.967885494232178, + "learning_rate": 5.886468265438038e-06, + "loss": 0.3486, + "step": 17737 + }, + { + "epoch": 0.44887010653642734, + "grad_norm": 3.0214860439300537, + "learning_rate": 5.886073100962406e-06, + "loss": 0.1114, + "step": 17738 + }, + { + "epoch": 0.44889541210112105, + "grad_norm": 2.375884532928467, + "learning_rate": 5.885677930772653e-06, + "loss": 0.092, + "step": 17739 + }, + { + "epoch": 0.4489207176658147, + "grad_norm": 7.121694564819336, + "learning_rate": 5.8852827548713285e-06, + "loss": 0.2107, + "step": 17740 + }, + { + "epoch": 0.44894602323050836, + "grad_norm": 10.295188903808594, + "learning_rate": 5.88488757326098e-06, + "loss": 0.1419, + "step": 17741 + }, + { + "epoch": 0.4489713287952021, + "grad_norm": 7.850282669067383, + "learning_rate": 5.884492385944157e-06, + "loss": 0.2567, + "step": 17742 + }, + { + "epoch": 0.44899663435989573, + "grad_norm": 6.992788314819336, + "learning_rate": 5.884097192923406e-06, + "loss": 0.1959, + "step": 17743 + }, + { + "epoch": 0.44902193992458944, + "grad_norm": 3.855677843093872, + "learning_rate": 5.8837019942012765e-06, + "loss": 0.1953, + "step": 17744 + }, + { + "epoch": 0.4490472454892831, + "grad_norm": 10.470562934875488, + "learning_rate": 5.883306789780317e-06, + "loss": 0.2473, + "step": 17745 + }, + { + "epoch": 0.44907255105397675, + "grad_norm": 12.17405891418457, + "learning_rate": 5.882911579663077e-06, + "loss": 0.3597, + "step": 17746 + }, + { + "epoch": 0.44909785661867047, + "grad_norm": 4.937358856201172, + "learning_rate": 5.882516363852104e-06, + "loss": 0.1641, + "step": 17747 + }, + { + "epoch": 0.4491231621833641, + "grad_norm": 3.032757043838501, + "learning_rate": 5.882121142349948e-06, + "loss": 0.115, + "step": 17748 + }, + { + "epoch": 0.4491484677480578, + "grad_norm": 8.722918510437012, + "learning_rate": 5.881725915159157e-06, + "loss": 0.2582, + "step": 17749 + }, + { + "epoch": 0.4491737733127515, + "grad_norm": 2.8675858974456787, + "learning_rate": 5.881330682282278e-06, + "loss": 0.1348, + "step": 17750 + }, + { + "epoch": 0.44919907887744515, + "grad_norm": 4.494919776916504, + "learning_rate": 5.8809354437218625e-06, + "loss": 0.162, + "step": 17751 + }, + { + "epoch": 0.4492243844421388, + "grad_norm": 6.979153633117676, + "learning_rate": 5.880540199480458e-06, + "loss": 0.1987, + "step": 17752 + }, + { + "epoch": 0.4492496900068325, + "grad_norm": 4.330902576446533, + "learning_rate": 5.880144949560613e-06, + "loss": 0.1461, + "step": 17753 + }, + { + "epoch": 0.44927499557152617, + "grad_norm": 6.493044376373291, + "learning_rate": 5.879749693964878e-06, + "loss": 0.1709, + "step": 17754 + }, + { + "epoch": 0.4493003011362199, + "grad_norm": 4.4723944664001465, + "learning_rate": 5.879354432695801e-06, + "loss": 0.2187, + "step": 17755 + }, + { + "epoch": 0.44932560670091354, + "grad_norm": 3.556413412094116, + "learning_rate": 5.878959165755931e-06, + "loss": 0.1487, + "step": 17756 + }, + { + "epoch": 0.4493509122656072, + "grad_norm": 3.0766563415527344, + "learning_rate": 5.878563893147815e-06, + "loss": 0.1599, + "step": 17757 + }, + { + "epoch": 0.4493762178303009, + "grad_norm": 3.5639450550079346, + "learning_rate": 5.878168614874005e-06, + "loss": 0.1545, + "step": 17758 + }, + { + "epoch": 0.44940152339499456, + "grad_norm": 4.942489147186279, + "learning_rate": 5.877773330937049e-06, + "loss": 0.1654, + "step": 17759 + }, + { + "epoch": 0.4494268289596882, + "grad_norm": 3.723736047744751, + "learning_rate": 5.877378041339497e-06, + "loss": 0.1922, + "step": 17760 + }, + { + "epoch": 0.44945213452438193, + "grad_norm": 4.188976764678955, + "learning_rate": 5.876982746083895e-06, + "loss": 0.1163, + "step": 17761 + }, + { + "epoch": 0.4494774400890756, + "grad_norm": 2.816633462905884, + "learning_rate": 5.8765874451727954e-06, + "loss": 0.1302, + "step": 17762 + }, + { + "epoch": 0.44950274565376924, + "grad_norm": 6.500363349914551, + "learning_rate": 5.876192138608746e-06, + "loss": 0.1704, + "step": 17763 + }, + { + "epoch": 0.44952805121846295, + "grad_norm": 4.90587854385376, + "learning_rate": 5.875796826394298e-06, + "loss": 0.1761, + "step": 17764 + }, + { + "epoch": 0.4495533567831566, + "grad_norm": 14.444221496582031, + "learning_rate": 5.875401508531997e-06, + "loss": 0.2417, + "step": 17765 + }, + { + "epoch": 0.44957866234785027, + "grad_norm": 7.136011123657227, + "learning_rate": 5.875006185024396e-06, + "loss": 0.265, + "step": 17766 + }, + { + "epoch": 0.449603967912544, + "grad_norm": 4.674108505249023, + "learning_rate": 5.874610855874044e-06, + "loss": 0.1928, + "step": 17767 + }, + { + "epoch": 0.44962927347723763, + "grad_norm": 3.8283159732818604, + "learning_rate": 5.874215521083486e-06, + "loss": 0.2117, + "step": 17768 + }, + { + "epoch": 0.44965457904193135, + "grad_norm": 4.030856609344482, + "learning_rate": 5.873820180655278e-06, + "loss": 0.0789, + "step": 17769 + }, + { + "epoch": 0.449679884606625, + "grad_norm": 3.6705968379974365, + "learning_rate": 5.873424834591963e-06, + "loss": 0.1464, + "step": 17770 + }, + { + "epoch": 0.44970519017131866, + "grad_norm": 4.166032314300537, + "learning_rate": 5.873029482896097e-06, + "loss": 0.1699, + "step": 17771 + }, + { + "epoch": 0.44973049573601237, + "grad_norm": 2.7979166507720947, + "learning_rate": 5.872634125570224e-06, + "loss": 0.1289, + "step": 17772 + }, + { + "epoch": 0.449755801300706, + "grad_norm": 2.919480085372925, + "learning_rate": 5.8722387626168976e-06, + "loss": 0.1494, + "step": 17773 + }, + { + "epoch": 0.4497811068653997, + "grad_norm": 5.147322654724121, + "learning_rate": 5.871843394038664e-06, + "loss": 0.1963, + "step": 17774 + }, + { + "epoch": 0.4498064124300934, + "grad_norm": 26.698043823242188, + "learning_rate": 5.871448019838076e-06, + "loss": 0.2029, + "step": 17775 + }, + { + "epoch": 0.44983171799478705, + "grad_norm": 3.0108280181884766, + "learning_rate": 5.871052640017681e-06, + "loss": 0.1352, + "step": 17776 + }, + { + "epoch": 0.4498570235594807, + "grad_norm": 3.276380777359009, + "learning_rate": 5.8706572545800275e-06, + "loss": 0.1682, + "step": 17777 + }, + { + "epoch": 0.4498823291241744, + "grad_norm": 6.427766799926758, + "learning_rate": 5.870261863527671e-06, + "loss": 0.1678, + "step": 17778 + }, + { + "epoch": 0.4499076346888681, + "grad_norm": 3.1809394359588623, + "learning_rate": 5.869866466863154e-06, + "loss": 0.0893, + "step": 17779 + }, + { + "epoch": 0.44993294025356173, + "grad_norm": 4.639119625091553, + "learning_rate": 5.869471064589033e-06, + "loss": 0.1666, + "step": 17780 + }, + { + "epoch": 0.44995824581825544, + "grad_norm": 2.2226743698120117, + "learning_rate": 5.869075656707854e-06, + "loss": 0.1102, + "step": 17781 + }, + { + "epoch": 0.4499835513829491, + "grad_norm": 9.960333824157715, + "learning_rate": 5.868680243222168e-06, + "loss": 0.2611, + "step": 17782 + }, + { + "epoch": 0.4500088569476428, + "grad_norm": 6.626659393310547, + "learning_rate": 5.8682848241345234e-06, + "loss": 0.1866, + "step": 17783 + }, + { + "epoch": 0.45003416251233647, + "grad_norm": 3.9531362056732178, + "learning_rate": 5.867889399447473e-06, + "loss": 0.1115, + "step": 17784 + }, + { + "epoch": 0.4500594680770301, + "grad_norm": 3.542407274246216, + "learning_rate": 5.867493969163566e-06, + "loss": 0.1082, + "step": 17785 + }, + { + "epoch": 0.45008477364172383, + "grad_norm": 7.182417869567871, + "learning_rate": 5.867098533285349e-06, + "loss": 0.1626, + "step": 17786 + }, + { + "epoch": 0.4501100792064175, + "grad_norm": 16.111473083496094, + "learning_rate": 5.866703091815376e-06, + "loss": 0.1779, + "step": 17787 + }, + { + "epoch": 0.45013538477111115, + "grad_norm": 2.0634005069732666, + "learning_rate": 5.866307644756197e-06, + "loss": 0.0807, + "step": 17788 + }, + { + "epoch": 0.45016069033580486, + "grad_norm": 3.5130221843719482, + "learning_rate": 5.865912192110361e-06, + "loss": 0.1227, + "step": 17789 + }, + { + "epoch": 0.4501859959004985, + "grad_norm": 3.942080497741699, + "learning_rate": 5.865516733880417e-06, + "loss": 0.1373, + "step": 17790 + }, + { + "epoch": 0.45021130146519217, + "grad_norm": 3.78589129447937, + "learning_rate": 5.865121270068917e-06, + "loss": 0.119, + "step": 17791 + }, + { + "epoch": 0.4502366070298859, + "grad_norm": 5.0132527351379395, + "learning_rate": 5.864725800678411e-06, + "loss": 0.1868, + "step": 17792 + }, + { + "epoch": 0.45026191259457954, + "grad_norm": 5.474946975708008, + "learning_rate": 5.864330325711451e-06, + "loss": 0.2046, + "step": 17793 + }, + { + "epoch": 0.45028721815927325, + "grad_norm": 4.95151948928833, + "learning_rate": 5.863934845170582e-06, + "loss": 0.2373, + "step": 17794 + }, + { + "epoch": 0.4503125237239669, + "grad_norm": 3.7281932830810547, + "learning_rate": 5.863539359058361e-06, + "loss": 0.1164, + "step": 17795 + }, + { + "epoch": 0.45033782928866056, + "grad_norm": 3.633411169052124, + "learning_rate": 5.8631438673773335e-06, + "loss": 0.117, + "step": 17796 + }, + { + "epoch": 0.4503631348533543, + "grad_norm": 4.13263463973999, + "learning_rate": 5.862748370130052e-06, + "loss": 0.139, + "step": 17797 + }, + { + "epoch": 0.45038844041804793, + "grad_norm": 8.716581344604492, + "learning_rate": 5.862352867319067e-06, + "loss": 0.2039, + "step": 17798 + }, + { + "epoch": 0.4504137459827416, + "grad_norm": 14.198080062866211, + "learning_rate": 5.861957358946928e-06, + "loss": 0.1365, + "step": 17799 + }, + { + "epoch": 0.4504390515474353, + "grad_norm": 3.8232479095458984, + "learning_rate": 5.861561845016188e-06, + "loss": 0.1162, + "step": 17800 + }, + { + "epoch": 0.45046435711212895, + "grad_norm": 10.342103004455566, + "learning_rate": 5.861166325529395e-06, + "loss": 0.234, + "step": 17801 + }, + { + "epoch": 0.4504896626768226, + "grad_norm": 9.079019546508789, + "learning_rate": 5.8607708004891e-06, + "loss": 0.1959, + "step": 17802 + }, + { + "epoch": 0.4505149682415163, + "grad_norm": 6.260583877563477, + "learning_rate": 5.8603752698978556e-06, + "loss": 0.1861, + "step": 17803 + }, + { + "epoch": 0.45054027380621, + "grad_norm": 5.948270797729492, + "learning_rate": 5.859979733758211e-06, + "loss": 0.1436, + "step": 17804 + }, + { + "epoch": 0.45056557937090363, + "grad_norm": 4.756509304046631, + "learning_rate": 5.859584192072716e-06, + "loss": 0.2178, + "step": 17805 + }, + { + "epoch": 0.45059088493559735, + "grad_norm": 12.129626274108887, + "learning_rate": 5.859188644843923e-06, + "loss": 0.1344, + "step": 17806 + }, + { + "epoch": 0.450616190500291, + "grad_norm": 3.617537021636963, + "learning_rate": 5.858793092074382e-06, + "loss": 0.1477, + "step": 17807 + }, + { + "epoch": 0.4506414960649847, + "grad_norm": 4.7353692054748535, + "learning_rate": 5.8583975337666435e-06, + "loss": 0.2445, + "step": 17808 + }, + { + "epoch": 0.45066680162967837, + "grad_norm": 6.186666011810303, + "learning_rate": 5.8580019699232595e-06, + "loss": 0.1886, + "step": 17809 + }, + { + "epoch": 0.450692107194372, + "grad_norm": 7.777130603790283, + "learning_rate": 5.85760640054678e-06, + "loss": 0.2032, + "step": 17810 + }, + { + "epoch": 0.45071741275906574, + "grad_norm": 13.679420471191406, + "learning_rate": 5.857210825639757e-06, + "loss": 0.1804, + "step": 17811 + }, + { + "epoch": 0.4507427183237594, + "grad_norm": 3.5743534564971924, + "learning_rate": 5.856815245204741e-06, + "loss": 0.1523, + "step": 17812 + }, + { + "epoch": 0.45076802388845305, + "grad_norm": 4.017624378204346, + "learning_rate": 5.856419659244283e-06, + "loss": 0.1556, + "step": 17813 + }, + { + "epoch": 0.45079332945314676, + "grad_norm": 6.031736850738525, + "learning_rate": 5.856024067760935e-06, + "loss": 0.1841, + "step": 17814 + }, + { + "epoch": 0.4508186350178404, + "grad_norm": 6.695245742797852, + "learning_rate": 5.8556284707572445e-06, + "loss": 0.1854, + "step": 17815 + }, + { + "epoch": 0.4508439405825341, + "grad_norm": 6.121678829193115, + "learning_rate": 5.855232868235767e-06, + "loss": 0.2167, + "step": 17816 + }, + { + "epoch": 0.4508692461472278, + "grad_norm": 4.642695426940918, + "learning_rate": 5.854837260199051e-06, + "loss": 0.1427, + "step": 17817 + }, + { + "epoch": 0.45089455171192144, + "grad_norm": 5.509214401245117, + "learning_rate": 5.854441646649649e-06, + "loss": 0.2007, + "step": 17818 + }, + { + "epoch": 0.45091985727661515, + "grad_norm": 4.952060222625732, + "learning_rate": 5.8540460275901105e-06, + "loss": 0.1865, + "step": 17819 + }, + { + "epoch": 0.4509451628413088, + "grad_norm": 10.748091697692871, + "learning_rate": 5.853650403022989e-06, + "loss": 0.2437, + "step": 17820 + }, + { + "epoch": 0.45097046840600247, + "grad_norm": 5.544546127319336, + "learning_rate": 5.8532547729508345e-06, + "loss": 0.119, + "step": 17821 + }, + { + "epoch": 0.4509957739706962, + "grad_norm": 18.267038345336914, + "learning_rate": 5.852859137376199e-06, + "loss": 0.5602, + "step": 17822 + }, + { + "epoch": 0.45102107953538983, + "grad_norm": 4.489382743835449, + "learning_rate": 5.8524634963016335e-06, + "loss": 0.2091, + "step": 17823 + }, + { + "epoch": 0.4510463851000835, + "grad_norm": 3.3126204013824463, + "learning_rate": 5.85206784972969e-06, + "loss": 0.1333, + "step": 17824 + }, + { + "epoch": 0.4510716906647772, + "grad_norm": 5.2497076988220215, + "learning_rate": 5.8516721976629185e-06, + "loss": 0.1864, + "step": 17825 + }, + { + "epoch": 0.45109699622947086, + "grad_norm": 7.590040683746338, + "learning_rate": 5.851276540103872e-06, + "loss": 0.1603, + "step": 17826 + }, + { + "epoch": 0.4511223017941645, + "grad_norm": 2.647491455078125, + "learning_rate": 5.8508808770551005e-06, + "loss": 0.1095, + "step": 17827 + }, + { + "epoch": 0.4511476073588582, + "grad_norm": 15.80896282196045, + "learning_rate": 5.850485208519157e-06, + "loss": 0.4027, + "step": 17828 + }, + { + "epoch": 0.4511729129235519, + "grad_norm": 4.818155288696289, + "learning_rate": 5.850089534498593e-06, + "loss": 0.2175, + "step": 17829 + }, + { + "epoch": 0.45119821848824554, + "grad_norm": 5.694084644317627, + "learning_rate": 5.849693854995959e-06, + "loss": 0.1835, + "step": 17830 + }, + { + "epoch": 0.45122352405293925, + "grad_norm": 23.74183464050293, + "learning_rate": 5.849298170013808e-06, + "loss": 0.3693, + "step": 17831 + }, + { + "epoch": 0.4512488296176329, + "grad_norm": 3.966210126876831, + "learning_rate": 5.848902479554691e-06, + "loss": 0.1392, + "step": 17832 + }, + { + "epoch": 0.4512741351823266, + "grad_norm": 6.292324542999268, + "learning_rate": 5.84850678362116e-06, + "loss": 0.1546, + "step": 17833 + }, + { + "epoch": 0.4512994407470203, + "grad_norm": 3.2682323455810547, + "learning_rate": 5.848111082215766e-06, + "loss": 0.1055, + "step": 17834 + }, + { + "epoch": 0.45132474631171393, + "grad_norm": 5.213133335113525, + "learning_rate": 5.847715375341063e-06, + "loss": 0.2575, + "step": 17835 + }, + { + "epoch": 0.45135005187640764, + "grad_norm": 4.487202167510986, + "learning_rate": 5.8473196629996e-06, + "loss": 0.1249, + "step": 17836 + }, + { + "epoch": 0.4513753574411013, + "grad_norm": 8.68106746673584, + "learning_rate": 5.84692394519393e-06, + "loss": 0.2514, + "step": 17837 + }, + { + "epoch": 0.45140066300579496, + "grad_norm": 6.51512336730957, + "learning_rate": 5.846528221926605e-06, + "loss": 0.1719, + "step": 17838 + }, + { + "epoch": 0.45142596857048867, + "grad_norm": 5.704730033874512, + "learning_rate": 5.8461324932001774e-06, + "loss": 0.1105, + "step": 17839 + }, + { + "epoch": 0.4514512741351823, + "grad_norm": 4.226597309112549, + "learning_rate": 5.845736759017198e-06, + "loss": 0.1654, + "step": 17840 + }, + { + "epoch": 0.451476579699876, + "grad_norm": 3.170269012451172, + "learning_rate": 5.845341019380221e-06, + "loss": 0.0736, + "step": 17841 + }, + { + "epoch": 0.4515018852645697, + "grad_norm": 4.340223789215088, + "learning_rate": 5.8449452742917955e-06, + "loss": 0.1544, + "step": 17842 + }, + { + "epoch": 0.45152719082926335, + "grad_norm": 8.737712860107422, + "learning_rate": 5.844549523754476e-06, + "loss": 0.3273, + "step": 17843 + }, + { + "epoch": 0.451552496393957, + "grad_norm": 7.565097808837891, + "learning_rate": 5.844153767770812e-06, + "loss": 0.2524, + "step": 17844 + }, + { + "epoch": 0.4515778019586507, + "grad_norm": 8.992145538330078, + "learning_rate": 5.843758006343361e-06, + "loss": 0.353, + "step": 17845 + }, + { + "epoch": 0.45160310752334437, + "grad_norm": 4.440752029418945, + "learning_rate": 5.843362239474669e-06, + "loss": 0.1244, + "step": 17846 + }, + { + "epoch": 0.4516284130880381, + "grad_norm": 9.147401809692383, + "learning_rate": 5.84296646716729e-06, + "loss": 0.2264, + "step": 17847 + }, + { + "epoch": 0.45165371865273174, + "grad_norm": 3.3058669567108154, + "learning_rate": 5.842570689423779e-06, + "loss": 0.1583, + "step": 17848 + }, + { + "epoch": 0.4516790242174254, + "grad_norm": 2.8204495906829834, + "learning_rate": 5.842174906246685e-06, + "loss": 0.125, + "step": 17849 + }, + { + "epoch": 0.4517043297821191, + "grad_norm": 4.136582374572754, + "learning_rate": 5.841779117638563e-06, + "loss": 0.1466, + "step": 17850 + }, + { + "epoch": 0.45172963534681276, + "grad_norm": 4.048214912414551, + "learning_rate": 5.841383323601962e-06, + "loss": 0.1721, + "step": 17851 + }, + { + "epoch": 0.4517549409115064, + "grad_norm": 2.497095823287964, + "learning_rate": 5.840987524139437e-06, + "loss": 0.1261, + "step": 17852 + }, + { + "epoch": 0.45178024647620013, + "grad_norm": 6.817932605743408, + "learning_rate": 5.840591719253542e-06, + "loss": 0.1827, + "step": 17853 + }, + { + "epoch": 0.4518055520408938, + "grad_norm": 4.897149085998535, + "learning_rate": 5.840195908946825e-06, + "loss": 0.209, + "step": 17854 + }, + { + "epoch": 0.45183085760558744, + "grad_norm": 7.439420223236084, + "learning_rate": 5.839800093221844e-06, + "loss": 0.1865, + "step": 17855 + }, + { + "epoch": 0.45185616317028116, + "grad_norm": 4.252800464630127, + "learning_rate": 5.839404272081145e-06, + "loss": 0.1431, + "step": 17856 + }, + { + "epoch": 0.4518814687349748, + "grad_norm": 4.907557964324951, + "learning_rate": 5.839008445527285e-06, + "loss": 0.1334, + "step": 17857 + }, + { + "epoch": 0.4519067742996685, + "grad_norm": 8.397111892700195, + "learning_rate": 5.838612613562816e-06, + "loss": 0.2514, + "step": 17858 + }, + { + "epoch": 0.4519320798643622, + "grad_norm": 3.7720065116882324, + "learning_rate": 5.838216776190291e-06, + "loss": 0.1733, + "step": 17859 + }, + { + "epoch": 0.45195738542905584, + "grad_norm": 5.840717315673828, + "learning_rate": 5.83782093341226e-06, + "loss": 0.1905, + "step": 17860 + }, + { + "epoch": 0.45198269099374955, + "grad_norm": 4.755884647369385, + "learning_rate": 5.8374250852312794e-06, + "loss": 0.1076, + "step": 17861 + }, + { + "epoch": 0.4520079965584432, + "grad_norm": 6.1483564376831055, + "learning_rate": 5.837029231649899e-06, + "loss": 0.1396, + "step": 17862 + }, + { + "epoch": 0.45203330212313686, + "grad_norm": 4.827469825744629, + "learning_rate": 5.836633372670673e-06, + "loss": 0.1734, + "step": 17863 + }, + { + "epoch": 0.45205860768783057, + "grad_norm": 5.7058796882629395, + "learning_rate": 5.836237508296154e-06, + "loss": 0.1338, + "step": 17864 + }, + { + "epoch": 0.4520839132525242, + "grad_norm": 9.339279174804688, + "learning_rate": 5.8358416385288956e-06, + "loss": 0.2188, + "step": 17865 + }, + { + "epoch": 0.4521092188172179, + "grad_norm": 4.748101711273193, + "learning_rate": 5.83544576337145e-06, + "loss": 0.2389, + "step": 17866 + }, + { + "epoch": 0.4521345243819116, + "grad_norm": 7.144073963165283, + "learning_rate": 5.835049882826369e-06, + "loss": 0.2001, + "step": 17867 + }, + { + "epoch": 0.45215982994660525, + "grad_norm": 2.776272773742676, + "learning_rate": 5.834653996896207e-06, + "loss": 0.1549, + "step": 17868 + }, + { + "epoch": 0.4521851355112989, + "grad_norm": 6.956029891967773, + "learning_rate": 5.834258105583517e-06, + "loss": 0.3128, + "step": 17869 + }, + { + "epoch": 0.4522104410759926, + "grad_norm": 4.55123233795166, + "learning_rate": 5.833862208890851e-06, + "loss": 0.1492, + "step": 17870 + }, + { + "epoch": 0.4522357466406863, + "grad_norm": 6.626185894012451, + "learning_rate": 5.833466306820763e-06, + "loss": 0.2528, + "step": 17871 + }, + { + "epoch": 0.45226105220538, + "grad_norm": 3.347703456878662, + "learning_rate": 5.8330703993758065e-06, + "loss": 0.1476, + "step": 17872 + }, + { + "epoch": 0.45228635777007364, + "grad_norm": 11.432832717895508, + "learning_rate": 5.832674486558534e-06, + "loss": 0.31, + "step": 17873 + }, + { + "epoch": 0.4523116633347673, + "grad_norm": 4.213618755340576, + "learning_rate": 5.832278568371498e-06, + "loss": 0.1578, + "step": 17874 + }, + { + "epoch": 0.452336968899461, + "grad_norm": 5.452638626098633, + "learning_rate": 5.831882644817253e-06, + "loss": 0.1534, + "step": 17875 + }, + { + "epoch": 0.45236227446415467, + "grad_norm": 17.525104522705078, + "learning_rate": 5.83148671589835e-06, + "loss": 0.2562, + "step": 17876 + }, + { + "epoch": 0.4523875800288483, + "grad_norm": 7.552540302276611, + "learning_rate": 5.831090781617346e-06, + "loss": 0.2402, + "step": 17877 + }, + { + "epoch": 0.45241288559354204, + "grad_norm": 3.227299690246582, + "learning_rate": 5.830694841976791e-06, + "loss": 0.1135, + "step": 17878 + }, + { + "epoch": 0.4524381911582357, + "grad_norm": 7.250618934631348, + "learning_rate": 5.83029889697924e-06, + "loss": 0.2126, + "step": 17879 + }, + { + "epoch": 0.45246349672292935, + "grad_norm": 2.296637773513794, + "learning_rate": 5.829902946627244e-06, + "loss": 0.1064, + "step": 17880 + }, + { + "epoch": 0.45248880228762306, + "grad_norm": 6.398763179779053, + "learning_rate": 5.829506990923361e-06, + "loss": 0.175, + "step": 17881 + }, + { + "epoch": 0.4525141078523167, + "grad_norm": 5.465606689453125, + "learning_rate": 5.829111029870139e-06, + "loss": 0.2232, + "step": 17882 + }, + { + "epoch": 0.4525394134170104, + "grad_norm": 8.398533821105957, + "learning_rate": 5.828715063470135e-06, + "loss": 0.2276, + "step": 17883 + }, + { + "epoch": 0.4525647189817041, + "grad_norm": 6.386651039123535, + "learning_rate": 5.828319091725903e-06, + "loss": 0.237, + "step": 17884 + }, + { + "epoch": 0.45259002454639774, + "grad_norm": 6.375921249389648, + "learning_rate": 5.827923114639994e-06, + "loss": 0.1254, + "step": 17885 + }, + { + "epoch": 0.45261533011109145, + "grad_norm": 4.0039801597595215, + "learning_rate": 5.827527132214964e-06, + "loss": 0.187, + "step": 17886 + }, + { + "epoch": 0.4526406356757851, + "grad_norm": 8.600364685058594, + "learning_rate": 5.827131144453364e-06, + "loss": 0.156, + "step": 17887 + }, + { + "epoch": 0.45266594124047876, + "grad_norm": 5.573305606842041, + "learning_rate": 5.826735151357751e-06, + "loss": 0.2458, + "step": 17888 + }, + { + "epoch": 0.4526912468051725, + "grad_norm": 3.553335189819336, + "learning_rate": 5.826339152930674e-06, + "loss": 0.1211, + "step": 17889 + }, + { + "epoch": 0.45271655236986613, + "grad_norm": 3.183181047439575, + "learning_rate": 5.825943149174692e-06, + "loss": 0.152, + "step": 17890 + }, + { + "epoch": 0.4527418579345598, + "grad_norm": 9.513338088989258, + "learning_rate": 5.825547140092356e-06, + "loss": 0.3293, + "step": 17891 + }, + { + "epoch": 0.4527671634992535, + "grad_norm": 4.243681907653809, + "learning_rate": 5.825151125686219e-06, + "loss": 0.1243, + "step": 17892 + }, + { + "epoch": 0.45279246906394716, + "grad_norm": 5.136783599853516, + "learning_rate": 5.824755105958836e-06, + "loss": 0.1388, + "step": 17893 + }, + { + "epoch": 0.4528177746286408, + "grad_norm": 5.906114101409912, + "learning_rate": 5.8243590809127605e-06, + "loss": 0.1706, + "step": 17894 + }, + { + "epoch": 0.4528430801933345, + "grad_norm": 2.8963842391967773, + "learning_rate": 5.823963050550549e-06, + "loss": 0.0896, + "step": 17895 + }, + { + "epoch": 0.4528683857580282, + "grad_norm": 3.752589702606201, + "learning_rate": 5.82356701487475e-06, + "loss": 0.1226, + "step": 17896 + }, + { + "epoch": 0.4528936913227219, + "grad_norm": 3.1678874492645264, + "learning_rate": 5.823170973887923e-06, + "loss": 0.1077, + "step": 17897 + }, + { + "epoch": 0.45291899688741555, + "grad_norm": 6.607258319854736, + "learning_rate": 5.822774927592617e-06, + "loss": 0.152, + "step": 17898 + }, + { + "epoch": 0.4529443024521092, + "grad_norm": 6.53751277923584, + "learning_rate": 5.82237887599139e-06, + "loss": 0.2305, + "step": 17899 + }, + { + "epoch": 0.4529696080168029, + "grad_norm": 3.772799253463745, + "learning_rate": 5.8219828190867946e-06, + "loss": 0.1001, + "step": 17900 + }, + { + "epoch": 0.45299491358149657, + "grad_norm": 4.007343769073486, + "learning_rate": 5.821586756881385e-06, + "loss": 0.1687, + "step": 17901 + }, + { + "epoch": 0.45302021914619023, + "grad_norm": 11.79387092590332, + "learning_rate": 5.8211906893777156e-06, + "loss": 0.3541, + "step": 17902 + }, + { + "epoch": 0.45304552471088394, + "grad_norm": 5.0613884925842285, + "learning_rate": 5.820794616578339e-06, + "loss": 0.1396, + "step": 17903 + }, + { + "epoch": 0.4530708302755776, + "grad_norm": 4.981830596923828, + "learning_rate": 5.820398538485812e-06, + "loss": 0.0936, + "step": 17904 + }, + { + "epoch": 0.45309613584027125, + "grad_norm": 3.6942739486694336, + "learning_rate": 5.820002455102687e-06, + "loss": 0.2342, + "step": 17905 + }, + { + "epoch": 0.45312144140496496, + "grad_norm": 4.638552188873291, + "learning_rate": 5.819606366431518e-06, + "loss": 0.1411, + "step": 17906 + }, + { + "epoch": 0.4531467469696586, + "grad_norm": 7.32778787612915, + "learning_rate": 5.81921027247486e-06, + "loss": 0.219, + "step": 17907 + }, + { + "epoch": 0.4531720525343523, + "grad_norm": 3.4230360984802246, + "learning_rate": 5.818814173235269e-06, + "loss": 0.1169, + "step": 17908 + }, + { + "epoch": 0.453197358099046, + "grad_norm": 5.809146881103516, + "learning_rate": 5.818418068715295e-06, + "loss": 0.1375, + "step": 17909 + }, + { + "epoch": 0.45322266366373964, + "grad_norm": 9.390049934387207, + "learning_rate": 5.818021958917497e-06, + "loss": 0.2733, + "step": 17910 + }, + { + "epoch": 0.45324796922843336, + "grad_norm": 3.5633835792541504, + "learning_rate": 5.8176258438444265e-06, + "loss": 0.1712, + "step": 17911 + }, + { + "epoch": 0.453273274793127, + "grad_norm": 3.598724603652954, + "learning_rate": 5.817229723498641e-06, + "loss": 0.1415, + "step": 17912 + }, + { + "epoch": 0.45329858035782067, + "grad_norm": 9.974339485168457, + "learning_rate": 5.81683359788269e-06, + "loss": 0.2459, + "step": 17913 + }, + { + "epoch": 0.4533238859225144, + "grad_norm": 8.692185401916504, + "learning_rate": 5.816437466999134e-06, + "loss": 0.2037, + "step": 17914 + }, + { + "epoch": 0.45334919148720804, + "grad_norm": 4.803985595703125, + "learning_rate": 5.816041330850522e-06, + "loss": 0.2663, + "step": 17915 + }, + { + "epoch": 0.4533744970519017, + "grad_norm": 4.1018877029418945, + "learning_rate": 5.815645189439413e-06, + "loss": 0.1407, + "step": 17916 + }, + { + "epoch": 0.4533998026165954, + "grad_norm": 5.91693639755249, + "learning_rate": 5.815249042768359e-06, + "loss": 0.1717, + "step": 17917 + }, + { + "epoch": 0.45342510818128906, + "grad_norm": 5.1057868003845215, + "learning_rate": 5.814852890839916e-06, + "loss": 0.1534, + "step": 17918 + }, + { + "epoch": 0.4534504137459827, + "grad_norm": 13.639692306518555, + "learning_rate": 5.814456733656639e-06, + "loss": 0.2503, + "step": 17919 + }, + { + "epoch": 0.45347571931067643, + "grad_norm": 2.455566644668579, + "learning_rate": 5.814060571221081e-06, + "loss": 0.1062, + "step": 17920 + }, + { + "epoch": 0.4535010248753701, + "grad_norm": 5.106945037841797, + "learning_rate": 5.813664403535799e-06, + "loss": 0.1521, + "step": 17921 + }, + { + "epoch": 0.4535263304400638, + "grad_norm": 6.740016937255859, + "learning_rate": 5.813268230603345e-06, + "loss": 0.2249, + "step": 17922 + }, + { + "epoch": 0.45355163600475745, + "grad_norm": 3.813311815261841, + "learning_rate": 5.8128720524262775e-06, + "loss": 0.1753, + "step": 17923 + }, + { + "epoch": 0.4535769415694511, + "grad_norm": 11.381692886352539, + "learning_rate": 5.812475869007147e-06, + "loss": 0.4066, + "step": 17924 + }, + { + "epoch": 0.4536022471341448, + "grad_norm": 3.435905933380127, + "learning_rate": 5.812079680348512e-06, + "loss": 0.1477, + "step": 17925 + }, + { + "epoch": 0.4536275526988385, + "grad_norm": 9.608573913574219, + "learning_rate": 5.811683486452926e-06, + "loss": 0.1417, + "step": 17926 + }, + { + "epoch": 0.45365285826353213, + "grad_norm": 3.9088404178619385, + "learning_rate": 5.811287287322943e-06, + "loss": 0.1529, + "step": 17927 + }, + { + "epoch": 0.45367816382822584, + "grad_norm": 7.320326328277588, + "learning_rate": 5.810891082961121e-06, + "loss": 0.2481, + "step": 17928 + }, + { + "epoch": 0.4537034693929195, + "grad_norm": 3.956594467163086, + "learning_rate": 5.810494873370013e-06, + "loss": 0.1522, + "step": 17929 + }, + { + "epoch": 0.45372877495761316, + "grad_norm": 4.101210594177246, + "learning_rate": 5.810098658552174e-06, + "loss": 0.1654, + "step": 17930 + }, + { + "epoch": 0.45375408052230687, + "grad_norm": 5.996525764465332, + "learning_rate": 5.80970243851016e-06, + "loss": 0.1781, + "step": 17931 + }, + { + "epoch": 0.4537793860870005, + "grad_norm": 4.940202713012695, + "learning_rate": 5.809306213246524e-06, + "loss": 0.2041, + "step": 17932 + }, + { + "epoch": 0.4538046916516942, + "grad_norm": 9.58212947845459, + "learning_rate": 5.808909982763825e-06, + "loss": 0.1677, + "step": 17933 + }, + { + "epoch": 0.4538299972163879, + "grad_norm": 3.7596795558929443, + "learning_rate": 5.808513747064614e-06, + "loss": 0.1564, + "step": 17934 + }, + { + "epoch": 0.45385530278108155, + "grad_norm": 3.0554020404815674, + "learning_rate": 5.80811750615145e-06, + "loss": 0.0693, + "step": 17935 + }, + { + "epoch": 0.45388060834577526, + "grad_norm": 8.057969093322754, + "learning_rate": 5.807721260026884e-06, + "loss": 0.2488, + "step": 17936 + }, + { + "epoch": 0.4539059139104689, + "grad_norm": 18.05027961730957, + "learning_rate": 5.807325008693476e-06, + "loss": 0.3392, + "step": 17937 + }, + { + "epoch": 0.4539312194751626, + "grad_norm": 2.3267014026641846, + "learning_rate": 5.806928752153778e-06, + "loss": 0.1198, + "step": 17938 + }, + { + "epoch": 0.4539565250398563, + "grad_norm": 3.647958993911743, + "learning_rate": 5.806532490410348e-06, + "loss": 0.0808, + "step": 17939 + }, + { + "epoch": 0.45398183060454994, + "grad_norm": 3.7822933197021484, + "learning_rate": 5.806136223465739e-06, + "loss": 0.1149, + "step": 17940 + }, + { + "epoch": 0.4540071361692436, + "grad_norm": 5.514209270477295, + "learning_rate": 5.805739951322508e-06, + "loss": 0.1653, + "step": 17941 + }, + { + "epoch": 0.4540324417339373, + "grad_norm": 5.840514659881592, + "learning_rate": 5.805343673983209e-06, + "loss": 0.1624, + "step": 17942 + }, + { + "epoch": 0.45405774729863096, + "grad_norm": 9.451852798461914, + "learning_rate": 5.804947391450399e-06, + "loss": 0.113, + "step": 17943 + }, + { + "epoch": 0.4540830528633246, + "grad_norm": 8.759188652038574, + "learning_rate": 5.804551103726634e-06, + "loss": 0.2241, + "step": 17944 + }, + { + "epoch": 0.45410835842801833, + "grad_norm": 7.725876808166504, + "learning_rate": 5.8041548108144675e-06, + "loss": 0.2107, + "step": 17945 + }, + { + "epoch": 0.454133663992712, + "grad_norm": 4.083990097045898, + "learning_rate": 5.803758512716456e-06, + "loss": 0.0737, + "step": 17946 + }, + { + "epoch": 0.4541589695574057, + "grad_norm": 5.061824798583984, + "learning_rate": 5.8033622094351545e-06, + "loss": 0.1676, + "step": 17947 + }, + { + "epoch": 0.45418427512209936, + "grad_norm": 3.5481865406036377, + "learning_rate": 5.802965900973121e-06, + "loss": 0.1243, + "step": 17948 + }, + { + "epoch": 0.454209580686793, + "grad_norm": 2.8345186710357666, + "learning_rate": 5.802569587332909e-06, + "loss": 0.1026, + "step": 17949 + }, + { + "epoch": 0.4542348862514867, + "grad_norm": 5.808192253112793, + "learning_rate": 5.8021732685170765e-06, + "loss": 0.1207, + "step": 17950 + }, + { + "epoch": 0.4542601918161804, + "grad_norm": 12.498065948486328, + "learning_rate": 5.801776944528176e-06, + "loss": 0.2164, + "step": 17951 + }, + { + "epoch": 0.45428549738087404, + "grad_norm": 6.240914821624756, + "learning_rate": 5.801380615368766e-06, + "loss": 0.1622, + "step": 17952 + }, + { + "epoch": 0.45431080294556775, + "grad_norm": 5.871642112731934, + "learning_rate": 5.8009842810414005e-06, + "loss": 0.1356, + "step": 17953 + }, + { + "epoch": 0.4543361085102614, + "grad_norm": 6.69291877746582, + "learning_rate": 5.800587941548639e-06, + "loss": 0.1927, + "step": 17954 + }, + { + "epoch": 0.45436141407495506, + "grad_norm": 5.500673294067383, + "learning_rate": 5.800191596893032e-06, + "loss": 0.1907, + "step": 17955 + }, + { + "epoch": 0.4543867196396488, + "grad_norm": 4.018215179443359, + "learning_rate": 5.79979524707714e-06, + "loss": 0.1604, + "step": 17956 + }, + { + "epoch": 0.45441202520434243, + "grad_norm": 3.752997636795044, + "learning_rate": 5.7993988921035164e-06, + "loss": 0.1444, + "step": 17957 + }, + { + "epoch": 0.4544373307690361, + "grad_norm": 3.9664931297302246, + "learning_rate": 5.799002531974718e-06, + "loss": 0.1347, + "step": 17958 + }, + { + "epoch": 0.4544626363337298, + "grad_norm": 5.715167999267578, + "learning_rate": 5.7986061666933e-06, + "loss": 0.1838, + "step": 17959 + }, + { + "epoch": 0.45448794189842345, + "grad_norm": 5.969326019287109, + "learning_rate": 5.7982097962618215e-06, + "loss": 0.1447, + "step": 17960 + }, + { + "epoch": 0.45451324746311716, + "grad_norm": 9.645233154296875, + "learning_rate": 5.797813420682835e-06, + "loss": 0.2753, + "step": 17961 + }, + { + "epoch": 0.4545385530278108, + "grad_norm": 6.077528953552246, + "learning_rate": 5.797417039958899e-06, + "loss": 0.1675, + "step": 17962 + }, + { + "epoch": 0.4545638585925045, + "grad_norm": 4.6503586769104, + "learning_rate": 5.797020654092569e-06, + "loss": 0.1712, + "step": 17963 + }, + { + "epoch": 0.4545891641571982, + "grad_norm": 4.36507511138916, + "learning_rate": 5.7966242630864e-06, + "loss": 0.1043, + "step": 17964 + }, + { + "epoch": 0.45461446972189185, + "grad_norm": 3.820814609527588, + "learning_rate": 5.796227866942951e-06, + "loss": 0.1385, + "step": 17965 + }, + { + "epoch": 0.4546397752865855, + "grad_norm": 4.074512481689453, + "learning_rate": 5.795831465664774e-06, + "loss": 0.1509, + "step": 17966 + }, + { + "epoch": 0.4546650808512792, + "grad_norm": 12.86319351196289, + "learning_rate": 5.795435059254431e-06, + "loss": 0.2525, + "step": 17967 + }, + { + "epoch": 0.45469038641597287, + "grad_norm": 5.341440200805664, + "learning_rate": 5.795038647714473e-06, + "loss": 0.207, + "step": 17968 + }, + { + "epoch": 0.4547156919806665, + "grad_norm": 4.101435661315918, + "learning_rate": 5.794642231047458e-06, + "loss": 0.1103, + "step": 17969 + }, + { + "epoch": 0.45474099754536024, + "grad_norm": 6.007150650024414, + "learning_rate": 5.794245809255945e-06, + "loss": 0.1786, + "step": 17970 + }, + { + "epoch": 0.4547663031100539, + "grad_norm": 3.3112988471984863, + "learning_rate": 5.793849382342487e-06, + "loss": 0.1702, + "step": 17971 + }, + { + "epoch": 0.45479160867474755, + "grad_norm": 5.3709564208984375, + "learning_rate": 5.793452950309643e-06, + "loss": 0.1395, + "step": 17972 + }, + { + "epoch": 0.45481691423944126, + "grad_norm": 7.212253093719482, + "learning_rate": 5.793056513159968e-06, + "loss": 0.1957, + "step": 17973 + }, + { + "epoch": 0.4548422198041349, + "grad_norm": 3.785005569458008, + "learning_rate": 5.79266007089602e-06, + "loss": 0.161, + "step": 17974 + }, + { + "epoch": 0.45486752536882863, + "grad_norm": 6.960729598999023, + "learning_rate": 5.792263623520353e-06, + "loss": 0.2281, + "step": 17975 + }, + { + "epoch": 0.4548928309335223, + "grad_norm": 4.967981338500977, + "learning_rate": 5.791867171035528e-06, + "loss": 0.1057, + "step": 17976 + }, + { + "epoch": 0.45491813649821594, + "grad_norm": 7.1806960105896, + "learning_rate": 5.791470713444096e-06, + "loss": 0.1991, + "step": 17977 + }, + { + "epoch": 0.45494344206290965, + "grad_norm": 6.9463210105896, + "learning_rate": 5.791074250748619e-06, + "loss": 0.1472, + "step": 17978 + }, + { + "epoch": 0.4549687476276033, + "grad_norm": 6.817173957824707, + "learning_rate": 5.7906777829516495e-06, + "loss": 0.2614, + "step": 17979 + }, + { + "epoch": 0.45499405319229697, + "grad_norm": 3.314671754837036, + "learning_rate": 5.790281310055745e-06, + "loss": 0.1676, + "step": 17980 + }, + { + "epoch": 0.4550193587569907, + "grad_norm": 5.682547092437744, + "learning_rate": 5.789884832063466e-06, + "loss": 0.143, + "step": 17981 + }, + { + "epoch": 0.45504466432168433, + "grad_norm": 3.5926482677459717, + "learning_rate": 5.789488348977366e-06, + "loss": 0.168, + "step": 17982 + }, + { + "epoch": 0.455069969886378, + "grad_norm": 11.12494945526123, + "learning_rate": 5.789091860800002e-06, + "loss": 0.2086, + "step": 17983 + }, + { + "epoch": 0.4550952754510717, + "grad_norm": 4.97916841506958, + "learning_rate": 5.788695367533931e-06, + "loss": 0.1948, + "step": 17984 + }, + { + "epoch": 0.45512058101576536, + "grad_norm": 3.50864577293396, + "learning_rate": 5.788298869181711e-06, + "loss": 0.1359, + "step": 17985 + }, + { + "epoch": 0.45514588658045907, + "grad_norm": 5.876657485961914, + "learning_rate": 5.787902365745897e-06, + "loss": 0.1519, + "step": 17986 + }, + { + "epoch": 0.4551711921451527, + "grad_norm": 3.651787757873535, + "learning_rate": 5.78750585722905e-06, + "loss": 0.1025, + "step": 17987 + }, + { + "epoch": 0.4551964977098464, + "grad_norm": 2.3743224143981934, + "learning_rate": 5.787109343633721e-06, + "loss": 0.0486, + "step": 17988 + }, + { + "epoch": 0.4552218032745401, + "grad_norm": 9.523290634155273, + "learning_rate": 5.7867128249624725e-06, + "loss": 0.2699, + "step": 17989 + }, + { + "epoch": 0.45524710883923375, + "grad_norm": 6.655198097229004, + "learning_rate": 5.786316301217858e-06, + "loss": 0.224, + "step": 17990 + }, + { + "epoch": 0.4552724144039274, + "grad_norm": 5.6612629890441895, + "learning_rate": 5.785919772402437e-06, + "loss": 0.1865, + "step": 17991 + }, + { + "epoch": 0.4552977199686211, + "grad_norm": 10.981315612792969, + "learning_rate": 5.785523238518767e-06, + "loss": 0.1929, + "step": 17992 + }, + { + "epoch": 0.4553230255333148, + "grad_norm": 3.1798007488250732, + "learning_rate": 5.7851266995694e-06, + "loss": 0.1312, + "step": 17993 + }, + { + "epoch": 0.45534833109800843, + "grad_norm": 4.206971168518066, + "learning_rate": 5.784730155556899e-06, + "loss": 0.1719, + "step": 17994 + }, + { + "epoch": 0.45537363666270214, + "grad_norm": 7.3674421310424805, + "learning_rate": 5.784333606483818e-06, + "loss": 0.2118, + "step": 17995 + }, + { + "epoch": 0.4553989422273958, + "grad_norm": 4.178085803985596, + "learning_rate": 5.783937052352718e-06, + "loss": 0.2372, + "step": 17996 + }, + { + "epoch": 0.45542424779208945, + "grad_norm": 8.344233512878418, + "learning_rate": 5.783540493166152e-06, + "loss": 0.1637, + "step": 17997 + }, + { + "epoch": 0.45544955335678317, + "grad_norm": 26.40130043029785, + "learning_rate": 5.78314392892668e-06, + "loss": 0.2949, + "step": 17998 + }, + { + "epoch": 0.4554748589214768, + "grad_norm": 3.195465326309204, + "learning_rate": 5.782747359636856e-06, + "loss": 0.1904, + "step": 17999 + }, + { + "epoch": 0.45550016448617053, + "grad_norm": 4.815218448638916, + "learning_rate": 5.782350785299242e-06, + "loss": 0.2101, + "step": 18000 + }, + { + "epoch": 0.4555254700508642, + "grad_norm": 9.07787799835205, + "learning_rate": 5.781954205916394e-06, + "loss": 0.1566, + "step": 18001 + }, + { + "epoch": 0.45555077561555785, + "grad_norm": 3.5155820846557617, + "learning_rate": 5.781557621490867e-06, + "loss": 0.1552, + "step": 18002 + }, + { + "epoch": 0.45557608118025156, + "grad_norm": 6.18891716003418, + "learning_rate": 5.781161032025221e-06, + "loss": 0.1907, + "step": 18003 + }, + { + "epoch": 0.4556013867449452, + "grad_norm": 9.865959167480469, + "learning_rate": 5.780764437522013e-06, + "loss": 0.1634, + "step": 18004 + }, + { + "epoch": 0.45562669230963887, + "grad_norm": 4.6307878494262695, + "learning_rate": 5.7803678379838e-06, + "loss": 0.1413, + "step": 18005 + }, + { + "epoch": 0.4556519978743326, + "grad_norm": 8.001279830932617, + "learning_rate": 5.779971233413139e-06, + "loss": 0.224, + "step": 18006 + }, + { + "epoch": 0.45567730343902624, + "grad_norm": 4.52906608581543, + "learning_rate": 5.779574623812591e-06, + "loss": 0.1377, + "step": 18007 + }, + { + "epoch": 0.4557026090037199, + "grad_norm": 4.721108913421631, + "learning_rate": 5.779178009184711e-06, + "loss": 0.1581, + "step": 18008 + }, + { + "epoch": 0.4557279145684136, + "grad_norm": 2.88875412940979, + "learning_rate": 5.778781389532056e-06, + "loss": 0.128, + "step": 18009 + }, + { + "epoch": 0.45575322013310726, + "grad_norm": 7.2043914794921875, + "learning_rate": 5.778384764857184e-06, + "loss": 0.2206, + "step": 18010 + }, + { + "epoch": 0.455778525697801, + "grad_norm": 6.566478252410889, + "learning_rate": 5.7779881351626535e-06, + "loss": 0.1411, + "step": 18011 + }, + { + "epoch": 0.45580383126249463, + "grad_norm": 6.735239505767822, + "learning_rate": 5.777591500451023e-06, + "loss": 0.2004, + "step": 18012 + }, + { + "epoch": 0.4558291368271883, + "grad_norm": 3.167902708053589, + "learning_rate": 5.777194860724849e-06, + "loss": 0.1773, + "step": 18013 + }, + { + "epoch": 0.455854442391882, + "grad_norm": 4.241565227508545, + "learning_rate": 5.776798215986691e-06, + "loss": 0.1442, + "step": 18014 + }, + { + "epoch": 0.45587974795657565, + "grad_norm": 5.021191120147705, + "learning_rate": 5.776401566239105e-06, + "loss": 0.228, + "step": 18015 + }, + { + "epoch": 0.4559050535212693, + "grad_norm": 4.021469593048096, + "learning_rate": 5.77600491148465e-06, + "loss": 0.0595, + "step": 18016 + }, + { + "epoch": 0.455930359085963, + "grad_norm": 3.40537691116333, + "learning_rate": 5.775608251725884e-06, + "loss": 0.1134, + "step": 18017 + }, + { + "epoch": 0.4559556646506567, + "grad_norm": 4.944868564605713, + "learning_rate": 5.775211586965364e-06, + "loss": 0.157, + "step": 18018 + }, + { + "epoch": 0.45598097021535033, + "grad_norm": 5.164699077606201, + "learning_rate": 5.7748149172056486e-06, + "loss": 0.1888, + "step": 18019 + }, + { + "epoch": 0.45600627578004405, + "grad_norm": 4.109217643737793, + "learning_rate": 5.774418242449297e-06, + "loss": 0.1975, + "step": 18020 + }, + { + "epoch": 0.4560315813447377, + "grad_norm": 3.8807618618011475, + "learning_rate": 5.774021562698866e-06, + "loss": 0.2066, + "step": 18021 + }, + { + "epoch": 0.45605688690943136, + "grad_norm": 6.041171073913574, + "learning_rate": 5.773624877956913e-06, + "loss": 0.1756, + "step": 18022 + }, + { + "epoch": 0.45608219247412507, + "grad_norm": 4.668891906738281, + "learning_rate": 5.773228188225998e-06, + "loss": 0.1622, + "step": 18023 + }, + { + "epoch": 0.4561074980388187, + "grad_norm": 3.627788543701172, + "learning_rate": 5.772831493508677e-06, + "loss": 0.1693, + "step": 18024 + }, + { + "epoch": 0.45613280360351244, + "grad_norm": 6.028866291046143, + "learning_rate": 5.77243479380751e-06, + "loss": 0.1932, + "step": 18025 + }, + { + "epoch": 0.4561581091682061, + "grad_norm": 4.488667964935303, + "learning_rate": 5.772038089125055e-06, + "loss": 0.1739, + "step": 18026 + }, + { + "epoch": 0.45618341473289975, + "grad_norm": 3.196415662765503, + "learning_rate": 5.77164137946387e-06, + "loss": 0.1576, + "step": 18027 + }, + { + "epoch": 0.45620872029759346, + "grad_norm": 5.1418023109436035, + "learning_rate": 5.771244664826512e-06, + "loss": 0.1729, + "step": 18028 + }, + { + "epoch": 0.4562340258622871, + "grad_norm": 2.8337812423706055, + "learning_rate": 5.770847945215543e-06, + "loss": 0.1345, + "step": 18029 + }, + { + "epoch": 0.4562593314269808, + "grad_norm": 5.26474142074585, + "learning_rate": 5.770451220633517e-06, + "loss": 0.1545, + "step": 18030 + }, + { + "epoch": 0.4562846369916745, + "grad_norm": 3.0661849975585938, + "learning_rate": 5.770054491082995e-06, + "loss": 0.1274, + "step": 18031 + }, + { + "epoch": 0.45630994255636814, + "grad_norm": 5.801074981689453, + "learning_rate": 5.769657756566534e-06, + "loss": 0.2146, + "step": 18032 + }, + { + "epoch": 0.4563352481210618, + "grad_norm": 4.059481620788574, + "learning_rate": 5.769261017086695e-06, + "loss": 0.0963, + "step": 18033 + }, + { + "epoch": 0.4563605536857555, + "grad_norm": 112.75420379638672, + "learning_rate": 5.7688642726460345e-06, + "loss": 0.2968, + "step": 18034 + }, + { + "epoch": 0.45638585925044917, + "grad_norm": 5.234495639801025, + "learning_rate": 5.76846752324711e-06, + "loss": 0.2087, + "step": 18035 + }, + { + "epoch": 0.4564111648151428, + "grad_norm": 4.471516132354736, + "learning_rate": 5.768070768892482e-06, + "loss": 0.0908, + "step": 18036 + }, + { + "epoch": 0.45643647037983653, + "grad_norm": 2.732849359512329, + "learning_rate": 5.7676740095847084e-06, + "loss": 0.1018, + "step": 18037 + }, + { + "epoch": 0.4564617759445302, + "grad_norm": 3.424647808074951, + "learning_rate": 5.767277245326349e-06, + "loss": 0.1511, + "step": 18038 + }, + { + "epoch": 0.4564870815092239, + "grad_norm": 7.697932243347168, + "learning_rate": 5.766880476119961e-06, + "loss": 0.2178, + "step": 18039 + }, + { + "epoch": 0.45651238707391756, + "grad_norm": 8.419836044311523, + "learning_rate": 5.766483701968102e-06, + "loss": 0.1974, + "step": 18040 + }, + { + "epoch": 0.4565376926386112, + "grad_norm": 5.402807235717773, + "learning_rate": 5.766086922873333e-06, + "loss": 0.1552, + "step": 18041 + }, + { + "epoch": 0.4565629982033049, + "grad_norm": 5.497433185577393, + "learning_rate": 5.765690138838212e-06, + "loss": 0.143, + "step": 18042 + }, + { + "epoch": 0.4565883037679986, + "grad_norm": 5.593165397644043, + "learning_rate": 5.765293349865298e-06, + "loss": 0.207, + "step": 18043 + }, + { + "epoch": 0.45661360933269224, + "grad_norm": 5.350326061248779, + "learning_rate": 5.764896555957149e-06, + "loss": 0.1685, + "step": 18044 + }, + { + "epoch": 0.45663891489738595, + "grad_norm": 4.95159387588501, + "learning_rate": 5.764499757116325e-06, + "loss": 0.1551, + "step": 18045 + }, + { + "epoch": 0.4566642204620796, + "grad_norm": 9.5763521194458, + "learning_rate": 5.764102953345384e-06, + "loss": 0.2594, + "step": 18046 + }, + { + "epoch": 0.45668952602677326, + "grad_norm": 5.866811275482178, + "learning_rate": 5.763706144646886e-06, + "loss": 0.1956, + "step": 18047 + }, + { + "epoch": 0.456714831591467, + "grad_norm": 3.547731637954712, + "learning_rate": 5.763309331023389e-06, + "loss": 0.1483, + "step": 18048 + }, + { + "epoch": 0.45674013715616063, + "grad_norm": 4.313631057739258, + "learning_rate": 5.762912512477451e-06, + "loss": 0.1524, + "step": 18049 + }, + { + "epoch": 0.45676544272085434, + "grad_norm": 11.983772277832031, + "learning_rate": 5.762515689011633e-06, + "loss": 0.1113, + "step": 18050 + }, + { + "epoch": 0.456790748285548, + "grad_norm": 5.523300647735596, + "learning_rate": 5.762118860628491e-06, + "loss": 0.2117, + "step": 18051 + }, + { + "epoch": 0.45681605385024165, + "grad_norm": 6.309043884277344, + "learning_rate": 5.76172202733059e-06, + "loss": 0.1795, + "step": 18052 + }, + { + "epoch": 0.45684135941493537, + "grad_norm": 27.780977249145508, + "learning_rate": 5.761325189120484e-06, + "loss": 0.17, + "step": 18053 + }, + { + "epoch": 0.456866664979629, + "grad_norm": 4.62968635559082, + "learning_rate": 5.760928346000731e-06, + "loss": 0.1395, + "step": 18054 + }, + { + "epoch": 0.4568919705443227, + "grad_norm": 7.506433486938477, + "learning_rate": 5.760531497973894e-06, + "loss": 0.2364, + "step": 18055 + }, + { + "epoch": 0.4569172761090164, + "grad_norm": 11.037301063537598, + "learning_rate": 5.760134645042532e-06, + "loss": 0.399, + "step": 18056 + }, + { + "epoch": 0.45694258167371005, + "grad_norm": 3.265310764312744, + "learning_rate": 5.759737787209203e-06, + "loss": 0.1383, + "step": 18057 + }, + { + "epoch": 0.4569678872384037, + "grad_norm": 5.065240859985352, + "learning_rate": 5.759340924476466e-06, + "loss": 0.0907, + "step": 18058 + }, + { + "epoch": 0.4569931928030974, + "grad_norm": 4.1538214683532715, + "learning_rate": 5.75894405684688e-06, + "loss": 0.1304, + "step": 18059 + }, + { + "epoch": 0.45701849836779107, + "grad_norm": 5.226109027862549, + "learning_rate": 5.758547184323006e-06, + "loss": 0.1749, + "step": 18060 + }, + { + "epoch": 0.4570438039324847, + "grad_norm": 3.11462140083313, + "learning_rate": 5.758150306907401e-06, + "loss": 0.1384, + "step": 18061 + }, + { + "epoch": 0.45706910949717844, + "grad_norm": 5.162060737609863, + "learning_rate": 5.7577534246026264e-06, + "loss": 0.1981, + "step": 18062 + }, + { + "epoch": 0.4570944150618721, + "grad_norm": 3.5889992713928223, + "learning_rate": 5.757356537411241e-06, + "loss": 0.1906, + "step": 18063 + }, + { + "epoch": 0.4571197206265658, + "grad_norm": 10.154007911682129, + "learning_rate": 5.756959645335802e-06, + "loss": 0.2428, + "step": 18064 + }, + { + "epoch": 0.45714502619125946, + "grad_norm": 4.169907093048096, + "learning_rate": 5.7565627483788745e-06, + "loss": 0.1327, + "step": 18065 + }, + { + "epoch": 0.4571703317559531, + "grad_norm": 4.890477180480957, + "learning_rate": 5.7561658465430125e-06, + "loss": 0.2216, + "step": 18066 + }, + { + "epoch": 0.45719563732064683, + "grad_norm": 2.9970107078552246, + "learning_rate": 5.755768939830779e-06, + "loss": 0.1588, + "step": 18067 + }, + { + "epoch": 0.4572209428853405, + "grad_norm": 5.804233551025391, + "learning_rate": 5.7553720282447304e-06, + "loss": 0.0818, + "step": 18068 + }, + { + "epoch": 0.45724624845003414, + "grad_norm": 4.803791046142578, + "learning_rate": 5.75497511178743e-06, + "loss": 0.1237, + "step": 18069 + }, + { + "epoch": 0.45727155401472785, + "grad_norm": 10.7996826171875, + "learning_rate": 5.754578190461434e-06, + "loss": 0.2156, + "step": 18070 + }, + { + "epoch": 0.4572968595794215, + "grad_norm": 4.511005878448486, + "learning_rate": 5.754181264269305e-06, + "loss": 0.1526, + "step": 18071 + }, + { + "epoch": 0.45732216514411517, + "grad_norm": 2.853464126586914, + "learning_rate": 5.7537843332136e-06, + "loss": 0.096, + "step": 18072 + }, + { + "epoch": 0.4573474707088089, + "grad_norm": 5.819447994232178, + "learning_rate": 5.753387397296881e-06, + "loss": 0.2354, + "step": 18073 + }, + { + "epoch": 0.45737277627350253, + "grad_norm": 7.326763153076172, + "learning_rate": 5.752990456521705e-06, + "loss": 0.201, + "step": 18074 + }, + { + "epoch": 0.45739808183819625, + "grad_norm": 4.83976411819458, + "learning_rate": 5.752593510890635e-06, + "loss": 0.1489, + "step": 18075 + }, + { + "epoch": 0.4574233874028899, + "grad_norm": 3.507131576538086, + "learning_rate": 5.75219656040623e-06, + "loss": 0.1224, + "step": 18076 + }, + { + "epoch": 0.45744869296758356, + "grad_norm": 6.540421962738037, + "learning_rate": 5.75179960507105e-06, + "loss": 0.1927, + "step": 18077 + }, + { + "epoch": 0.45747399853227727, + "grad_norm": 3.5491697788238525, + "learning_rate": 5.751402644887653e-06, + "loss": 0.05, + "step": 18078 + }, + { + "epoch": 0.4574993040969709, + "grad_norm": 14.513737678527832, + "learning_rate": 5.7510056798586e-06, + "loss": 0.118, + "step": 18079 + }, + { + "epoch": 0.4575246096616646, + "grad_norm": 5.8890533447265625, + "learning_rate": 5.750608709986452e-06, + "loss": 0.1867, + "step": 18080 + }, + { + "epoch": 0.4575499152263583, + "grad_norm": 7.838472843170166, + "learning_rate": 5.7502117352737665e-06, + "loss": 0.1816, + "step": 18081 + }, + { + "epoch": 0.45757522079105195, + "grad_norm": 5.899770259857178, + "learning_rate": 5.749814755723107e-06, + "loss": 0.1837, + "step": 18082 + }, + { + "epoch": 0.4576005263557456, + "grad_norm": 2.6169729232788086, + "learning_rate": 5.749417771337029e-06, + "loss": 0.0798, + "step": 18083 + }, + { + "epoch": 0.4576258319204393, + "grad_norm": 4.3853864669799805, + "learning_rate": 5.749020782118097e-06, + "loss": 0.1892, + "step": 18084 + }, + { + "epoch": 0.457651137485133, + "grad_norm": 3.3162965774536133, + "learning_rate": 5.748623788068869e-06, + "loss": 0.1882, + "step": 18085 + }, + { + "epoch": 0.45767644304982663, + "grad_norm": 4.228824615478516, + "learning_rate": 5.748226789191904e-06, + "loss": 0.1337, + "step": 18086 + }, + { + "epoch": 0.45770174861452034, + "grad_norm": 4.118845462799072, + "learning_rate": 5.747829785489766e-06, + "loss": 0.1849, + "step": 18087 + }, + { + "epoch": 0.457727054179214, + "grad_norm": 18.632116317749023, + "learning_rate": 5.747432776965012e-06, + "loss": 0.2916, + "step": 18088 + }, + { + "epoch": 0.4577523597439077, + "grad_norm": 7.004303932189941, + "learning_rate": 5.747035763620202e-06, + "loss": 0.2903, + "step": 18089 + }, + { + "epoch": 0.45777766530860137, + "grad_norm": 6.314294338226318, + "learning_rate": 5.746638745457899e-06, + "loss": 0.2299, + "step": 18090 + }, + { + "epoch": 0.457802970873295, + "grad_norm": 4.14078950881958, + "learning_rate": 5.746241722480661e-06, + "loss": 0.1384, + "step": 18091 + }, + { + "epoch": 0.45782827643798873, + "grad_norm": 4.256282329559326, + "learning_rate": 5.745844694691047e-06, + "loss": 0.1294, + "step": 18092 + }, + { + "epoch": 0.4578535820026824, + "grad_norm": 7.218438625335693, + "learning_rate": 5.745447662091622e-06, + "loss": 0.1923, + "step": 18093 + }, + { + "epoch": 0.45787888756737605, + "grad_norm": 9.726664543151855, + "learning_rate": 5.745050624684942e-06, + "loss": 0.1258, + "step": 18094 + }, + { + "epoch": 0.45790419313206976, + "grad_norm": 7.19756555557251, + "learning_rate": 5.744653582473571e-06, + "loss": 0.273, + "step": 18095 + }, + { + "epoch": 0.4579294986967634, + "grad_norm": 20.275434494018555, + "learning_rate": 5.744256535460066e-06, + "loss": 0.1891, + "step": 18096 + }, + { + "epoch": 0.45795480426145707, + "grad_norm": 3.224778890609741, + "learning_rate": 5.743859483646988e-06, + "loss": 0.1157, + "step": 18097 + }, + { + "epoch": 0.4579801098261508, + "grad_norm": 4.682370185852051, + "learning_rate": 5.743462427036901e-06, + "loss": 0.1675, + "step": 18098 + }, + { + "epoch": 0.45800541539084444, + "grad_norm": 10.895954132080078, + "learning_rate": 5.743065365632362e-06, + "loss": 0.2142, + "step": 18099 + }, + { + "epoch": 0.4580307209555381, + "grad_norm": 2.6162219047546387, + "learning_rate": 5.742668299435933e-06, + "loss": 0.1731, + "step": 18100 + }, + { + "epoch": 0.4580560265202318, + "grad_norm": 8.873987197875977, + "learning_rate": 5.742271228450174e-06, + "loss": 0.1293, + "step": 18101 + }, + { + "epoch": 0.45808133208492546, + "grad_norm": 6.296229839324951, + "learning_rate": 5.741874152677647e-06, + "loss": 0.1615, + "step": 18102 + }, + { + "epoch": 0.4581066376496192, + "grad_norm": 8.263472557067871, + "learning_rate": 5.741477072120909e-06, + "loss": 0.1313, + "step": 18103 + }, + { + "epoch": 0.45813194321431283, + "grad_norm": 8.1638765335083, + "learning_rate": 5.7410799867825255e-06, + "loss": 0.2677, + "step": 18104 + }, + { + "epoch": 0.4581572487790065, + "grad_norm": 4.210021018981934, + "learning_rate": 5.7406828966650544e-06, + "loss": 0.1338, + "step": 18105 + }, + { + "epoch": 0.4581825543437002, + "grad_norm": 6.6199049949646, + "learning_rate": 5.740285801771057e-06, + "loss": 0.171, + "step": 18106 + }, + { + "epoch": 0.45820785990839386, + "grad_norm": 14.786312103271484, + "learning_rate": 5.739888702103094e-06, + "loss": 0.143, + "step": 18107 + }, + { + "epoch": 0.4582331654730875, + "grad_norm": 4.8993635177612305, + "learning_rate": 5.739491597663726e-06, + "loss": 0.1471, + "step": 18108 + }, + { + "epoch": 0.4582584710377812, + "grad_norm": 3.3848559856414795, + "learning_rate": 5.739094488455517e-06, + "loss": 0.119, + "step": 18109 + }, + { + "epoch": 0.4582837766024749, + "grad_norm": 30.131103515625, + "learning_rate": 5.738697374481021e-06, + "loss": 0.4409, + "step": 18110 + }, + { + "epoch": 0.45830908216716854, + "grad_norm": 4.796324729919434, + "learning_rate": 5.738300255742807e-06, + "loss": 0.244, + "step": 18111 + }, + { + "epoch": 0.45833438773186225, + "grad_norm": 15.139331817626953, + "learning_rate": 5.7379031322434285e-06, + "loss": 0.1606, + "step": 18112 + }, + { + "epoch": 0.4583596932965559, + "grad_norm": 3.937662363052368, + "learning_rate": 5.737506003985452e-06, + "loss": 0.1703, + "step": 18113 + }, + { + "epoch": 0.4583849988612496, + "grad_norm": 3.499318838119507, + "learning_rate": 5.7371088709714365e-06, + "loss": 0.1533, + "step": 18114 + }, + { + "epoch": 0.45841030442594327, + "grad_norm": 5.647938251495361, + "learning_rate": 5.736711733203942e-06, + "loss": 0.1314, + "step": 18115 + }, + { + "epoch": 0.4584356099906369, + "grad_norm": 4.4508056640625, + "learning_rate": 5.736314590685531e-06, + "loss": 0.1499, + "step": 18116 + }, + { + "epoch": 0.45846091555533064, + "grad_norm": 10.694990158081055, + "learning_rate": 5.735917443418765e-06, + "loss": 0.2823, + "step": 18117 + }, + { + "epoch": 0.4584862211200243, + "grad_norm": 5.884726524353027, + "learning_rate": 5.735520291406203e-06, + "loss": 0.2139, + "step": 18118 + }, + { + "epoch": 0.45851152668471795, + "grad_norm": 5.201089382171631, + "learning_rate": 5.735123134650407e-06, + "loss": 0.0712, + "step": 18119 + }, + { + "epoch": 0.45853683224941166, + "grad_norm": 2.126143455505371, + "learning_rate": 5.734725973153941e-06, + "loss": 0.0807, + "step": 18120 + }, + { + "epoch": 0.4585621378141053, + "grad_norm": 6.041030406951904, + "learning_rate": 5.734328806919361e-06, + "loss": 0.1764, + "step": 18121 + }, + { + "epoch": 0.458587443378799, + "grad_norm": 12.195162773132324, + "learning_rate": 5.7339316359492325e-06, + "loss": 0.2486, + "step": 18122 + }, + { + "epoch": 0.4586127489434927, + "grad_norm": 5.377807140350342, + "learning_rate": 5.733534460246115e-06, + "loss": 0.1489, + "step": 18123 + }, + { + "epoch": 0.45863805450818634, + "grad_norm": 8.14116382598877, + "learning_rate": 5.733137279812572e-06, + "loss": 0.3303, + "step": 18124 + }, + { + "epoch": 0.45866336007288, + "grad_norm": 10.611861228942871, + "learning_rate": 5.732740094651161e-06, + "loss": 0.252, + "step": 18125 + }, + { + "epoch": 0.4586886656375737, + "grad_norm": 6.852093696594238, + "learning_rate": 5.732342904764446e-06, + "loss": 0.2138, + "step": 18126 + }, + { + "epoch": 0.45871397120226737, + "grad_norm": 3.8061130046844482, + "learning_rate": 5.731945710154988e-06, + "loss": 0.1689, + "step": 18127 + }, + { + "epoch": 0.4587392767669611, + "grad_norm": 4.975374221801758, + "learning_rate": 5.731548510825347e-06, + "loss": 0.1713, + "step": 18128 + }, + { + "epoch": 0.45876458233165474, + "grad_norm": 2.3241682052612305, + "learning_rate": 5.731151306778089e-06, + "loss": 0.1088, + "step": 18129 + }, + { + "epoch": 0.4587898878963484, + "grad_norm": 5.025210857391357, + "learning_rate": 5.73075409801577e-06, + "loss": 0.1675, + "step": 18130 + }, + { + "epoch": 0.4588151934610421, + "grad_norm": 4.336568355560303, + "learning_rate": 5.730356884540954e-06, + "loss": 0.2219, + "step": 18131 + }, + { + "epoch": 0.45884049902573576, + "grad_norm": 6.3281426429748535, + "learning_rate": 5.729959666356202e-06, + "loss": 0.1996, + "step": 18132 + }, + { + "epoch": 0.4588658045904294, + "grad_norm": 4.810301780700684, + "learning_rate": 5.729562443464076e-06, + "loss": 0.1661, + "step": 18133 + }, + { + "epoch": 0.4588911101551231, + "grad_norm": 5.249233245849609, + "learning_rate": 5.729165215867137e-06, + "loss": 0.1673, + "step": 18134 + }, + { + "epoch": 0.4589164157198168, + "grad_norm": 3.3327476978302, + "learning_rate": 5.728767983567949e-06, + "loss": 0.1408, + "step": 18135 + }, + { + "epoch": 0.45894172128451044, + "grad_norm": 2.863013982772827, + "learning_rate": 5.728370746569069e-06, + "loss": 0.0924, + "step": 18136 + }, + { + "epoch": 0.45896702684920415, + "grad_norm": 3.51126766204834, + "learning_rate": 5.7279735048730645e-06, + "loss": 0.1554, + "step": 18137 + }, + { + "epoch": 0.4589923324138978, + "grad_norm": 3.172037124633789, + "learning_rate": 5.727576258482494e-06, + "loss": 0.1202, + "step": 18138 + }, + { + "epoch": 0.4590176379785915, + "grad_norm": 3.3225502967834473, + "learning_rate": 5.727179007399918e-06, + "loss": 0.1369, + "step": 18139 + }, + { + "epoch": 0.4590429435432852, + "grad_norm": 12.694025039672852, + "learning_rate": 5.726781751627901e-06, + "loss": 0.1412, + "step": 18140 + }, + { + "epoch": 0.45906824910797883, + "grad_norm": 3.27286696434021, + "learning_rate": 5.726384491169002e-06, + "loss": 0.1064, + "step": 18141 + }, + { + "epoch": 0.45909355467267254, + "grad_norm": 3.1014575958251953, + "learning_rate": 5.725987226025786e-06, + "loss": 0.1568, + "step": 18142 + }, + { + "epoch": 0.4591188602373662, + "grad_norm": 4.106802463531494, + "learning_rate": 5.725589956200813e-06, + "loss": 0.1334, + "step": 18143 + }, + { + "epoch": 0.45914416580205986, + "grad_norm": 7.689774036407471, + "learning_rate": 5.725192681696646e-06, + "loss": 0.1801, + "step": 18144 + }, + { + "epoch": 0.45916947136675357, + "grad_norm": 2.7549216747283936, + "learning_rate": 5.724795402515845e-06, + "loss": 0.0959, + "step": 18145 + }, + { + "epoch": 0.4591947769314472, + "grad_norm": 4.834024429321289, + "learning_rate": 5.7243981186609745e-06, + "loss": 0.1932, + "step": 18146 + }, + { + "epoch": 0.4592200824961409, + "grad_norm": 12.110532760620117, + "learning_rate": 5.724000830134594e-06, + "loss": 0.3938, + "step": 18147 + }, + { + "epoch": 0.4592453880608346, + "grad_norm": 4.332831859588623, + "learning_rate": 5.723603536939269e-06, + "loss": 0.1403, + "step": 18148 + }, + { + "epoch": 0.45927069362552825, + "grad_norm": 8.225069046020508, + "learning_rate": 5.723206239077557e-06, + "loss": 0.142, + "step": 18149 + }, + { + "epoch": 0.4592959991902219, + "grad_norm": 5.877838611602783, + "learning_rate": 5.722808936552022e-06, + "loss": 0.2411, + "step": 18150 + }, + { + "epoch": 0.4593213047549156, + "grad_norm": 4.009754180908203, + "learning_rate": 5.72241162936523e-06, + "loss": 0.2072, + "step": 18151 + }, + { + "epoch": 0.45934661031960927, + "grad_norm": 8.96208381652832, + "learning_rate": 5.722014317519737e-06, + "loss": 0.1718, + "step": 18152 + }, + { + "epoch": 0.459371915884303, + "grad_norm": 4.985168933868408, + "learning_rate": 5.721617001018109e-06, + "loss": 0.1351, + "step": 18153 + }, + { + "epoch": 0.45939722144899664, + "grad_norm": 4.068843841552734, + "learning_rate": 5.721219679862906e-06, + "loss": 0.1958, + "step": 18154 + }, + { + "epoch": 0.4594225270136903, + "grad_norm": 10.134055137634277, + "learning_rate": 5.7208223540566935e-06, + "loss": 0.2062, + "step": 18155 + }, + { + "epoch": 0.459447832578384, + "grad_norm": 3.0874509811401367, + "learning_rate": 5.72042502360203e-06, + "loss": 0.1364, + "step": 18156 + }, + { + "epoch": 0.45947313814307766, + "grad_norm": 4.44993257522583, + "learning_rate": 5.720027688501481e-06, + "loss": 0.1642, + "step": 18157 + }, + { + "epoch": 0.4594984437077713, + "grad_norm": 3.121439218521118, + "learning_rate": 5.7196303487576056e-06, + "loss": 0.1264, + "step": 18158 + }, + { + "epoch": 0.45952374927246503, + "grad_norm": 3.197758674621582, + "learning_rate": 5.719233004372969e-06, + "loss": 0.1799, + "step": 18159 + }, + { + "epoch": 0.4595490548371587, + "grad_norm": 8.57316780090332, + "learning_rate": 5.7188356553501326e-06, + "loss": 0.2177, + "step": 18160 + }, + { + "epoch": 0.45957436040185234, + "grad_norm": 4.536457061767578, + "learning_rate": 5.7184383016916575e-06, + "loss": 0.1769, + "step": 18161 + }, + { + "epoch": 0.45959966596654606, + "grad_norm": 4.188388824462891, + "learning_rate": 5.71804094340011e-06, + "loss": 0.179, + "step": 18162 + }, + { + "epoch": 0.4596249715312397, + "grad_norm": 12.07284164428711, + "learning_rate": 5.717643580478047e-06, + "loss": 0.3216, + "step": 18163 + }, + { + "epoch": 0.45965027709593337, + "grad_norm": 9.570598602294922, + "learning_rate": 5.717246212928037e-06, + "loss": 0.2015, + "step": 18164 + }, + { + "epoch": 0.4596755826606271, + "grad_norm": 4.580332279205322, + "learning_rate": 5.716848840752639e-06, + "loss": 0.1447, + "step": 18165 + }, + { + "epoch": 0.45970088822532074, + "grad_norm": 5.804299831390381, + "learning_rate": 5.716451463954415e-06, + "loss": 0.267, + "step": 18166 + }, + { + "epoch": 0.45972619379001445, + "grad_norm": 13.889534950256348, + "learning_rate": 5.716054082535929e-06, + "loss": 0.2228, + "step": 18167 + }, + { + "epoch": 0.4597514993547081, + "grad_norm": 9.311699867248535, + "learning_rate": 5.7156566964997425e-06, + "loss": 0.2654, + "step": 18168 + }, + { + "epoch": 0.45977680491940176, + "grad_norm": 4.509383201599121, + "learning_rate": 5.7152593058484215e-06, + "loss": 0.1028, + "step": 18169 + }, + { + "epoch": 0.45980211048409547, + "grad_norm": 3.75419545173645, + "learning_rate": 5.714861910584526e-06, + "loss": 0.2343, + "step": 18170 + }, + { + "epoch": 0.45982741604878913, + "grad_norm": 9.531604766845703, + "learning_rate": 5.714464510710617e-06, + "loss": 0.2057, + "step": 18171 + }, + { + "epoch": 0.4598527216134828, + "grad_norm": 6.106935977935791, + "learning_rate": 5.71406710622926e-06, + "loss": 0.1105, + "step": 18172 + }, + { + "epoch": 0.4598780271781765, + "grad_norm": 5.2133378982543945, + "learning_rate": 5.713669697143017e-06, + "loss": 0.199, + "step": 18173 + }, + { + "epoch": 0.45990333274287015, + "grad_norm": 10.111860275268555, + "learning_rate": 5.7132722834544505e-06, + "loss": 0.2865, + "step": 18174 + }, + { + "epoch": 0.4599286383075638, + "grad_norm": 6.404642581939697, + "learning_rate": 5.712874865166126e-06, + "loss": 0.0987, + "step": 18175 + }, + { + "epoch": 0.4599539438722575, + "grad_norm": 4.185973167419434, + "learning_rate": 5.712477442280602e-06, + "loss": 0.2085, + "step": 18176 + }, + { + "epoch": 0.4599792494369512, + "grad_norm": 6.67301082611084, + "learning_rate": 5.712080014800444e-06, + "loss": 0.2374, + "step": 18177 + }, + { + "epoch": 0.4600045550016449, + "grad_norm": 7.628779888153076, + "learning_rate": 5.711682582728214e-06, + "loss": 0.2604, + "step": 18178 + }, + { + "epoch": 0.46002986056633854, + "grad_norm": 4.5737762451171875, + "learning_rate": 5.711285146066477e-06, + "loss": 0.1194, + "step": 18179 + }, + { + "epoch": 0.4600551661310322, + "grad_norm": 3.3420166969299316, + "learning_rate": 5.710887704817793e-06, + "loss": 0.162, + "step": 18180 + }, + { + "epoch": 0.4600804716957259, + "grad_norm": 8.013585090637207, + "learning_rate": 5.7104902589847264e-06, + "loss": 0.2331, + "step": 18181 + }, + { + "epoch": 0.46010577726041957, + "grad_norm": 7.345845699310303, + "learning_rate": 5.7100928085698405e-06, + "loss": 0.2542, + "step": 18182 + }, + { + "epoch": 0.4601310828251132, + "grad_norm": 3.867255926132202, + "learning_rate": 5.709695353575697e-06, + "loss": 0.1761, + "step": 18183 + }, + { + "epoch": 0.46015638838980694, + "grad_norm": 5.871723175048828, + "learning_rate": 5.709297894004862e-06, + "loss": 0.1939, + "step": 18184 + }, + { + "epoch": 0.4601816939545006, + "grad_norm": 4.718438625335693, + "learning_rate": 5.708900429859896e-06, + "loss": 0.1877, + "step": 18185 + }, + { + "epoch": 0.46020699951919425, + "grad_norm": 6.064533233642578, + "learning_rate": 5.708502961143364e-06, + "loss": 0.1118, + "step": 18186 + }, + { + "epoch": 0.46023230508388796, + "grad_norm": 8.653069496154785, + "learning_rate": 5.708105487857826e-06, + "loss": 0.2294, + "step": 18187 + }, + { + "epoch": 0.4602576106485816, + "grad_norm": 8.968476295471191, + "learning_rate": 5.70770801000585e-06, + "loss": 0.1949, + "step": 18188 + }, + { + "epoch": 0.4602829162132753, + "grad_norm": 9.632043838500977, + "learning_rate": 5.707310527589995e-06, + "loss": 0.2395, + "step": 18189 + }, + { + "epoch": 0.460308221777969, + "grad_norm": 4.462306022644043, + "learning_rate": 5.706913040612827e-06, + "loss": 0.1393, + "step": 18190 + }, + { + "epoch": 0.46033352734266264, + "grad_norm": 4.869180679321289, + "learning_rate": 5.706515549076908e-06, + "loss": 0.2184, + "step": 18191 + }, + { + "epoch": 0.46035883290735635, + "grad_norm": 6.604115962982178, + "learning_rate": 5.7061180529848005e-06, + "loss": 0.305, + "step": 18192 + }, + { + "epoch": 0.46038413847205, + "grad_norm": 3.1644840240478516, + "learning_rate": 5.70572055233907e-06, + "loss": 0.1405, + "step": 18193 + }, + { + "epoch": 0.46040944403674366, + "grad_norm": 13.753568649291992, + "learning_rate": 5.705323047142279e-06, + "loss": 0.2782, + "step": 18194 + }, + { + "epoch": 0.4604347496014374, + "grad_norm": 2.771085023880005, + "learning_rate": 5.70492553739699e-06, + "loss": 0.1218, + "step": 18195 + }, + { + "epoch": 0.46046005516613103, + "grad_norm": 6.45886754989624, + "learning_rate": 5.70452802310577e-06, + "loss": 0.1678, + "step": 18196 + }, + { + "epoch": 0.4604853607308247, + "grad_norm": 3.577101945877075, + "learning_rate": 5.704130504271177e-06, + "loss": 0.1578, + "step": 18197 + }, + { + "epoch": 0.4605106662955184, + "grad_norm": 6.373189449310303, + "learning_rate": 5.703732980895779e-06, + "loss": 0.1819, + "step": 18198 + }, + { + "epoch": 0.46053597186021206, + "grad_norm": 5.730478286743164, + "learning_rate": 5.703335452982138e-06, + "loss": 0.2061, + "step": 18199 + }, + { + "epoch": 0.4605612774249057, + "grad_norm": 4.0348639488220215, + "learning_rate": 5.702937920532816e-06, + "loss": 0.1895, + "step": 18200 + }, + { + "epoch": 0.4605865829895994, + "grad_norm": 4.551974296569824, + "learning_rate": 5.702540383550379e-06, + "loss": 0.1842, + "step": 18201 + }, + { + "epoch": 0.4606118885542931, + "grad_norm": 3.241865634918213, + "learning_rate": 5.702142842037389e-06, + "loss": 0.1208, + "step": 18202 + }, + { + "epoch": 0.4606371941189868, + "grad_norm": 6.618608474731445, + "learning_rate": 5.70174529599641e-06, + "loss": 0.1667, + "step": 18203 + }, + { + "epoch": 0.46066249968368045, + "grad_norm": 3.989746332168579, + "learning_rate": 5.701347745430008e-06, + "loss": 0.1591, + "step": 18204 + }, + { + "epoch": 0.4606878052483741, + "grad_norm": 3.02236008644104, + "learning_rate": 5.700950190340744e-06, + "loss": 0.1748, + "step": 18205 + }, + { + "epoch": 0.4607131108130678, + "grad_norm": 9.66954231262207, + "learning_rate": 5.700552630731182e-06, + "loss": 0.3095, + "step": 18206 + }, + { + "epoch": 0.4607384163777615, + "grad_norm": 6.564736843109131, + "learning_rate": 5.700155066603887e-06, + "loss": 0.3117, + "step": 18207 + }, + { + "epoch": 0.46076372194245513, + "grad_norm": 8.002483367919922, + "learning_rate": 5.6997574979614225e-06, + "loss": 0.1819, + "step": 18208 + }, + { + "epoch": 0.46078902750714884, + "grad_norm": 7.712819576263428, + "learning_rate": 5.6993599248063516e-06, + "loss": 0.1708, + "step": 18209 + }, + { + "epoch": 0.4608143330718425, + "grad_norm": 4.1336846351623535, + "learning_rate": 5.698962347141239e-06, + "loss": 0.1865, + "step": 18210 + }, + { + "epoch": 0.46083963863653615, + "grad_norm": 2.3003549575805664, + "learning_rate": 5.698564764968647e-06, + "loss": 0.1475, + "step": 18211 + }, + { + "epoch": 0.46086494420122986, + "grad_norm": 3.4722397327423096, + "learning_rate": 5.698167178291143e-06, + "loss": 0.1113, + "step": 18212 + }, + { + "epoch": 0.4608902497659235, + "grad_norm": 4.61907958984375, + "learning_rate": 5.697769587111287e-06, + "loss": 0.1733, + "step": 18213 + }, + { + "epoch": 0.4609155553306172, + "grad_norm": 3.413224458694458, + "learning_rate": 5.697371991431645e-06, + "loss": 0.1476, + "step": 18214 + }, + { + "epoch": 0.4609408608953109, + "grad_norm": 5.487843036651611, + "learning_rate": 5.696974391254781e-06, + "loss": 0.2245, + "step": 18215 + }, + { + "epoch": 0.46096616646000454, + "grad_norm": 2.7293553352355957, + "learning_rate": 5.6965767865832585e-06, + "loss": 0.1219, + "step": 18216 + }, + { + "epoch": 0.46099147202469826, + "grad_norm": 4.826001167297363, + "learning_rate": 5.696179177419643e-06, + "loss": 0.1829, + "step": 18217 + }, + { + "epoch": 0.4610167775893919, + "grad_norm": 5.749216556549072, + "learning_rate": 5.695781563766496e-06, + "loss": 0.2542, + "step": 18218 + }, + { + "epoch": 0.46104208315408557, + "grad_norm": 3.1341655254364014, + "learning_rate": 5.695383945626384e-06, + "loss": 0.1853, + "step": 18219 + }, + { + "epoch": 0.4610673887187793, + "grad_norm": 1.766446590423584, + "learning_rate": 5.6949863230018696e-06, + "loss": 0.0897, + "step": 18220 + }, + { + "epoch": 0.46109269428347294, + "grad_norm": 3.0628273487091064, + "learning_rate": 5.694588695895519e-06, + "loss": 0.1276, + "step": 18221 + }, + { + "epoch": 0.4611179998481666, + "grad_norm": 5.625432968139648, + "learning_rate": 5.694191064309893e-06, + "loss": 0.1735, + "step": 18222 + }, + { + "epoch": 0.4611433054128603, + "grad_norm": 3.609005928039551, + "learning_rate": 5.6937934282475605e-06, + "loss": 0.1669, + "step": 18223 + }, + { + "epoch": 0.46116861097755396, + "grad_norm": 5.540987968444824, + "learning_rate": 5.693395787711081e-06, + "loss": 0.2002, + "step": 18224 + }, + { + "epoch": 0.4611939165422476, + "grad_norm": 4.893381118774414, + "learning_rate": 5.692998142703023e-06, + "loss": 0.1358, + "step": 18225 + }, + { + "epoch": 0.46121922210694133, + "grad_norm": 4.1355743408203125, + "learning_rate": 5.692600493225948e-06, + "loss": 0.2039, + "step": 18226 + }, + { + "epoch": 0.461244527671635, + "grad_norm": 3.1267213821411133, + "learning_rate": 5.69220283928242e-06, + "loss": 0.1169, + "step": 18227 + }, + { + "epoch": 0.46126983323632864, + "grad_norm": 4.4750237464904785, + "learning_rate": 5.691805180875006e-06, + "loss": 0.1936, + "step": 18228 + }, + { + "epoch": 0.46129513880102235, + "grad_norm": 17.06293296813965, + "learning_rate": 5.691407518006269e-06, + "loss": 0.1829, + "step": 18229 + }, + { + "epoch": 0.461320444365716, + "grad_norm": 8.882309913635254, + "learning_rate": 5.691009850678773e-06, + "loss": 0.2653, + "step": 18230 + }, + { + "epoch": 0.4613457499304097, + "grad_norm": 7.471750259399414, + "learning_rate": 5.690612178895084e-06, + "loss": 0.1991, + "step": 18231 + }, + { + "epoch": 0.4613710554951034, + "grad_norm": 3.914829969406128, + "learning_rate": 5.690214502657765e-06, + "loss": 0.2134, + "step": 18232 + }, + { + "epoch": 0.46139636105979703, + "grad_norm": 3.6884281635284424, + "learning_rate": 5.68981682196938e-06, + "loss": 0.1363, + "step": 18233 + }, + { + "epoch": 0.46142166662449074, + "grad_norm": 4.161468982696533, + "learning_rate": 5.689419136832497e-06, + "loss": 0.1565, + "step": 18234 + }, + { + "epoch": 0.4614469721891844, + "grad_norm": 5.5529890060424805, + "learning_rate": 5.689021447249676e-06, + "loss": 0.1584, + "step": 18235 + }, + { + "epoch": 0.46147227775387806, + "grad_norm": 2.7129361629486084, + "learning_rate": 5.688623753223485e-06, + "loss": 0.1857, + "step": 18236 + }, + { + "epoch": 0.46149758331857177, + "grad_norm": 5.147030830383301, + "learning_rate": 5.6882260547564875e-06, + "loss": 0.1713, + "step": 18237 + }, + { + "epoch": 0.4615228888832654, + "grad_norm": 5.328941345214844, + "learning_rate": 5.687828351851247e-06, + "loss": 0.155, + "step": 18238 + }, + { + "epoch": 0.4615481944479591, + "grad_norm": 5.149411678314209, + "learning_rate": 5.687430644510331e-06, + "loss": 0.1463, + "step": 18239 + }, + { + "epoch": 0.4615735000126528, + "grad_norm": 4.497766494750977, + "learning_rate": 5.687032932736301e-06, + "loss": 0.1776, + "step": 18240 + }, + { + "epoch": 0.46159880557734645, + "grad_norm": 8.622447967529297, + "learning_rate": 5.6866352165317255e-06, + "loss": 0.2365, + "step": 18241 + }, + { + "epoch": 0.46162411114204016, + "grad_norm": 4.448295593261719, + "learning_rate": 5.686237495899165e-06, + "loss": 0.1255, + "step": 18242 + }, + { + "epoch": 0.4616494167067338, + "grad_norm": 8.331028938293457, + "learning_rate": 5.685839770841188e-06, + "loss": 0.2685, + "step": 18243 + }, + { + "epoch": 0.4616747222714275, + "grad_norm": 5.770516872406006, + "learning_rate": 5.685442041360358e-06, + "loss": 0.1848, + "step": 18244 + }, + { + "epoch": 0.4617000278361212, + "grad_norm": 7.470926284790039, + "learning_rate": 5.685044307459239e-06, + "loss": 0.2017, + "step": 18245 + }, + { + "epoch": 0.46172533340081484, + "grad_norm": 3.2203123569488525, + "learning_rate": 5.684646569140396e-06, + "loss": 0.1883, + "step": 18246 + }, + { + "epoch": 0.4617506389655085, + "grad_norm": 8.223852157592773, + "learning_rate": 5.684248826406396e-06, + "loss": 0.3439, + "step": 18247 + }, + { + "epoch": 0.4617759445302022, + "grad_norm": 4.102904796600342, + "learning_rate": 5.683851079259802e-06, + "loss": 0.1795, + "step": 18248 + }, + { + "epoch": 0.46180125009489587, + "grad_norm": 12.445528030395508, + "learning_rate": 5.683453327703179e-06, + "loss": 0.1758, + "step": 18249 + }, + { + "epoch": 0.4618265556595895, + "grad_norm": 6.053071022033691, + "learning_rate": 5.683055571739094e-06, + "loss": 0.1899, + "step": 18250 + }, + { + "epoch": 0.46185186122428323, + "grad_norm": 6.050212383270264, + "learning_rate": 5.6826578113701095e-06, + "loss": 0.1357, + "step": 18251 + }, + { + "epoch": 0.4618771667889769, + "grad_norm": 4.365492343902588, + "learning_rate": 5.682260046598793e-06, + "loss": 0.1207, + "step": 18252 + }, + { + "epoch": 0.46190247235367055, + "grad_norm": 4.078673362731934, + "learning_rate": 5.681862277427708e-06, + "loss": 0.1594, + "step": 18253 + }, + { + "epoch": 0.46192777791836426, + "grad_norm": 6.677987098693848, + "learning_rate": 5.681464503859419e-06, + "loss": 0.1608, + "step": 18254 + }, + { + "epoch": 0.4619530834830579, + "grad_norm": 2.7622952461242676, + "learning_rate": 5.681066725896494e-06, + "loss": 0.1406, + "step": 18255 + }, + { + "epoch": 0.4619783890477516, + "grad_norm": 2.574083089828491, + "learning_rate": 5.680668943541496e-06, + "loss": 0.1398, + "step": 18256 + }, + { + "epoch": 0.4620036946124453, + "grad_norm": 11.722114562988281, + "learning_rate": 5.680271156796989e-06, + "loss": 0.1498, + "step": 18257 + }, + { + "epoch": 0.46202900017713894, + "grad_norm": 5.857301235198975, + "learning_rate": 5.67987336566554e-06, + "loss": 0.204, + "step": 18258 + }, + { + "epoch": 0.46205430574183265, + "grad_norm": 4.908626079559326, + "learning_rate": 5.679475570149716e-06, + "loss": 0.1781, + "step": 18259 + }, + { + "epoch": 0.4620796113065263, + "grad_norm": 6.74491024017334, + "learning_rate": 5.679077770252079e-06, + "loss": 0.1845, + "step": 18260 + }, + { + "epoch": 0.46210491687121996, + "grad_norm": 11.587404251098633, + "learning_rate": 5.678679965975195e-06, + "loss": 0.355, + "step": 18261 + }, + { + "epoch": 0.4621302224359137, + "grad_norm": 3.9815621376037598, + "learning_rate": 5.678282157321632e-06, + "loss": 0.1345, + "step": 18262 + }, + { + "epoch": 0.46215552800060733, + "grad_norm": 6.135654926300049, + "learning_rate": 5.677884344293954e-06, + "loss": 0.1122, + "step": 18263 + }, + { + "epoch": 0.462180833565301, + "grad_norm": 2.9105942249298096, + "learning_rate": 5.677486526894723e-06, + "loss": 0.1244, + "step": 18264 + }, + { + "epoch": 0.4622061391299947, + "grad_norm": 5.716476917266846, + "learning_rate": 5.67708870512651e-06, + "loss": 0.1604, + "step": 18265 + }, + { + "epoch": 0.46223144469468835, + "grad_norm": 7.776455402374268, + "learning_rate": 5.676690878991876e-06, + "loss": 0.1803, + "step": 18266 + }, + { + "epoch": 0.46225675025938207, + "grad_norm": 5.227721214294434, + "learning_rate": 5.676293048493389e-06, + "loss": 0.1358, + "step": 18267 + }, + { + "epoch": 0.4622820558240757, + "grad_norm": 4.044839382171631, + "learning_rate": 5.675895213633615e-06, + "loss": 0.1372, + "step": 18268 + }, + { + "epoch": 0.4623073613887694, + "grad_norm": 6.609045028686523, + "learning_rate": 5.675497374415117e-06, + "loss": 0.2037, + "step": 18269 + }, + { + "epoch": 0.4623326669534631, + "grad_norm": 4.5129714012146, + "learning_rate": 5.675099530840464e-06, + "loss": 0.2014, + "step": 18270 + }, + { + "epoch": 0.46235797251815675, + "grad_norm": 7.284674167633057, + "learning_rate": 5.674701682912218e-06, + "loss": 0.151, + "step": 18271 + }, + { + "epoch": 0.4623832780828504, + "grad_norm": 3.3852109909057617, + "learning_rate": 5.674303830632948e-06, + "loss": 0.1359, + "step": 18272 + }, + { + "epoch": 0.4624085836475441, + "grad_norm": 2.410024881362915, + "learning_rate": 5.673905974005216e-06, + "loss": 0.1255, + "step": 18273 + }, + { + "epoch": 0.46243388921223777, + "grad_norm": 23.27411460876465, + "learning_rate": 5.673508113031592e-06, + "loss": 0.2996, + "step": 18274 + }, + { + "epoch": 0.4624591947769314, + "grad_norm": 3.436847448348999, + "learning_rate": 5.673110247714637e-06, + "loss": 0.1474, + "step": 18275 + }, + { + "epoch": 0.46248450034162514, + "grad_norm": 7.7756123542785645, + "learning_rate": 5.67271237805692e-06, + "loss": 0.2124, + "step": 18276 + }, + { + "epoch": 0.4625098059063188, + "grad_norm": 5.0994391441345215, + "learning_rate": 5.672314504061006e-06, + "loss": 0.2189, + "step": 18277 + }, + { + "epoch": 0.46253511147101245, + "grad_norm": 3.5643680095672607, + "learning_rate": 5.671916625729461e-06, + "loss": 0.1169, + "step": 18278 + }, + { + "epoch": 0.46256041703570616, + "grad_norm": 10.087624549865723, + "learning_rate": 5.671518743064852e-06, + "loss": 0.1453, + "step": 18279 + }, + { + "epoch": 0.4625857226003998, + "grad_norm": 4.032385349273682, + "learning_rate": 5.671120856069741e-06, + "loss": 0.1336, + "step": 18280 + }, + { + "epoch": 0.46261102816509353, + "grad_norm": 5.124936103820801, + "learning_rate": 5.670722964746699e-06, + "loss": 0.1335, + "step": 18281 + }, + { + "epoch": 0.4626363337297872, + "grad_norm": 6.528341293334961, + "learning_rate": 5.670325069098289e-06, + "loss": 0.1796, + "step": 18282 + }, + { + "epoch": 0.46266163929448084, + "grad_norm": 15.325634002685547, + "learning_rate": 5.6699271691270764e-06, + "loss": 0.2251, + "step": 18283 + }, + { + "epoch": 0.46268694485917455, + "grad_norm": 3.7935633659362793, + "learning_rate": 5.6695292648356284e-06, + "loss": 0.1441, + "step": 18284 + }, + { + "epoch": 0.4627122504238682, + "grad_norm": 4.353637218475342, + "learning_rate": 5.66913135622651e-06, + "loss": 0.1957, + "step": 18285 + }, + { + "epoch": 0.46273755598856187, + "grad_norm": 5.440654277801514, + "learning_rate": 5.668733443302289e-06, + "loss": 0.1521, + "step": 18286 + }, + { + "epoch": 0.4627628615532556, + "grad_norm": 5.556585311889648, + "learning_rate": 5.668335526065531e-06, + "loss": 0.1518, + "step": 18287 + }, + { + "epoch": 0.46278816711794923, + "grad_norm": 50.469181060791016, + "learning_rate": 5.667937604518799e-06, + "loss": 0.3609, + "step": 18288 + }, + { + "epoch": 0.4628134726826429, + "grad_norm": 4.601277828216553, + "learning_rate": 5.667539678664663e-06, + "loss": 0.1837, + "step": 18289 + }, + { + "epoch": 0.4628387782473366, + "grad_norm": 3.257659673690796, + "learning_rate": 5.667141748505688e-06, + "loss": 0.1142, + "step": 18290 + }, + { + "epoch": 0.46286408381203026, + "grad_norm": 5.925180435180664, + "learning_rate": 5.66674381404444e-06, + "loss": 0.1826, + "step": 18291 + }, + { + "epoch": 0.4628893893767239, + "grad_norm": 7.969171524047852, + "learning_rate": 5.666345875283486e-06, + "loss": 0.216, + "step": 18292 + }, + { + "epoch": 0.4629146949414176, + "grad_norm": 6.371164798736572, + "learning_rate": 5.665947932225389e-06, + "loss": 0.1162, + "step": 18293 + }, + { + "epoch": 0.4629400005061113, + "grad_norm": 5.075747966766357, + "learning_rate": 5.6655499848727195e-06, + "loss": 0.1625, + "step": 18294 + }, + { + "epoch": 0.462965306070805, + "grad_norm": 4.488386631011963, + "learning_rate": 5.665152033228041e-06, + "loss": 0.1195, + "step": 18295 + }, + { + "epoch": 0.46299061163549865, + "grad_norm": 4.5330328941345215, + "learning_rate": 5.664754077293921e-06, + "loss": 0.1982, + "step": 18296 + }, + { + "epoch": 0.4630159172001923, + "grad_norm": 3.5527021884918213, + "learning_rate": 5.6643561170729264e-06, + "loss": 0.1696, + "step": 18297 + }, + { + "epoch": 0.463041222764886, + "grad_norm": 2.2316715717315674, + "learning_rate": 5.663958152567622e-06, + "loss": 0.0801, + "step": 18298 + }, + { + "epoch": 0.4630665283295797, + "grad_norm": 6.43716287612915, + "learning_rate": 5.663560183780575e-06, + "loss": 0.1339, + "step": 18299 + }, + { + "epoch": 0.46309183389427333, + "grad_norm": 5.64736795425415, + "learning_rate": 5.663162210714351e-06, + "loss": 0.1405, + "step": 18300 + }, + { + "epoch": 0.46311713945896704, + "grad_norm": 4.270741939544678, + "learning_rate": 5.662764233371519e-06, + "loss": 0.1544, + "step": 18301 + }, + { + "epoch": 0.4631424450236607, + "grad_norm": 4.805022239685059, + "learning_rate": 5.662366251754643e-06, + "loss": 0.1711, + "step": 18302 + }, + { + "epoch": 0.46316775058835435, + "grad_norm": 5.187915325164795, + "learning_rate": 5.661968265866291e-06, + "loss": 0.167, + "step": 18303 + }, + { + "epoch": 0.46319305615304807, + "grad_norm": 4.775258541107178, + "learning_rate": 5.6615702757090276e-06, + "loss": 0.2404, + "step": 18304 + }, + { + "epoch": 0.4632183617177417, + "grad_norm": 14.50751781463623, + "learning_rate": 5.661172281285422e-06, + "loss": 0.2572, + "step": 18305 + }, + { + "epoch": 0.46324366728243543, + "grad_norm": 5.876557350158691, + "learning_rate": 5.660774282598037e-06, + "loss": 0.2607, + "step": 18306 + }, + { + "epoch": 0.4632689728471291, + "grad_norm": 7.085258483886719, + "learning_rate": 5.660376279649444e-06, + "loss": 0.1284, + "step": 18307 + }, + { + "epoch": 0.46329427841182275, + "grad_norm": 4.314311981201172, + "learning_rate": 5.659978272442206e-06, + "loss": 0.1435, + "step": 18308 + }, + { + "epoch": 0.46331958397651646, + "grad_norm": 4.927788257598877, + "learning_rate": 5.6595802609788905e-06, + "loss": 0.1121, + "step": 18309 + }, + { + "epoch": 0.4633448895412101, + "grad_norm": 9.467938423156738, + "learning_rate": 5.659182245262065e-06, + "loss": 0.1923, + "step": 18310 + }, + { + "epoch": 0.46337019510590377, + "grad_norm": 4.573506832122803, + "learning_rate": 5.658784225294297e-06, + "loss": 0.1635, + "step": 18311 + }, + { + "epoch": 0.4633955006705975, + "grad_norm": 8.645530700683594, + "learning_rate": 5.65838620107815e-06, + "loss": 0.2152, + "step": 18312 + }, + { + "epoch": 0.46342080623529114, + "grad_norm": 3.6732704639434814, + "learning_rate": 5.657988172616194e-06, + "loss": 0.2087, + "step": 18313 + }, + { + "epoch": 0.4634461117999848, + "grad_norm": 3.098435640335083, + "learning_rate": 5.657590139910995e-06, + "loss": 0.1348, + "step": 18314 + }, + { + "epoch": 0.4634714173646785, + "grad_norm": 5.441720008850098, + "learning_rate": 5.657192102965119e-06, + "loss": 0.2399, + "step": 18315 + }, + { + "epoch": 0.46349672292937216, + "grad_norm": 4.875546932220459, + "learning_rate": 5.656794061781135e-06, + "loss": 0.1441, + "step": 18316 + }, + { + "epoch": 0.4635220284940658, + "grad_norm": 6.708584308624268, + "learning_rate": 5.656396016361606e-06, + "loss": 0.1202, + "step": 18317 + }, + { + "epoch": 0.46354733405875953, + "grad_norm": 5.623757362365723, + "learning_rate": 5.655997966709102e-06, + "loss": 0.1538, + "step": 18318 + }, + { + "epoch": 0.4635726396234532, + "grad_norm": 3.891397714614868, + "learning_rate": 5.6555999128261885e-06, + "loss": 0.1506, + "step": 18319 + }, + { + "epoch": 0.4635979451881469, + "grad_norm": 3.6860313415527344, + "learning_rate": 5.655201854715433e-06, + "loss": 0.1152, + "step": 18320 + }, + { + "epoch": 0.46362325075284055, + "grad_norm": 15.63703727722168, + "learning_rate": 5.6548037923794045e-06, + "loss": 0.2411, + "step": 18321 + }, + { + "epoch": 0.4636485563175342, + "grad_norm": 5.525969505310059, + "learning_rate": 5.6544057258206664e-06, + "loss": 0.2553, + "step": 18322 + }, + { + "epoch": 0.4636738618822279, + "grad_norm": 5.367804527282715, + "learning_rate": 5.6540076550417874e-06, + "loss": 0.1651, + "step": 18323 + }, + { + "epoch": 0.4636991674469216, + "grad_norm": 7.5163116455078125, + "learning_rate": 5.653609580045335e-06, + "loss": 0.1816, + "step": 18324 + }, + { + "epoch": 0.46372447301161523, + "grad_norm": 3.3211286067962646, + "learning_rate": 5.653211500833877e-06, + "loss": 0.1533, + "step": 18325 + }, + { + "epoch": 0.46374977857630895, + "grad_norm": 7.9581708908081055, + "learning_rate": 5.652813417409979e-06, + "loss": 0.2099, + "step": 18326 + }, + { + "epoch": 0.4637750841410026, + "grad_norm": 5.245842933654785, + "learning_rate": 5.652415329776209e-06, + "loss": 0.0941, + "step": 18327 + }, + { + "epoch": 0.46380038970569626, + "grad_norm": 4.6551737785339355, + "learning_rate": 5.652017237935132e-06, + "loss": 0.1717, + "step": 18328 + }, + { + "epoch": 0.46382569527038997, + "grad_norm": 8.53148365020752, + "learning_rate": 5.6516191418893195e-06, + "loss": 0.2122, + "step": 18329 + }, + { + "epoch": 0.4638510008350836, + "grad_norm": 6.00888729095459, + "learning_rate": 5.651221041641335e-06, + "loss": 0.1324, + "step": 18330 + }, + { + "epoch": 0.46387630639977734, + "grad_norm": 3.231954574584961, + "learning_rate": 5.650822937193748e-06, + "loss": 0.112, + "step": 18331 + }, + { + "epoch": 0.463901611964471, + "grad_norm": 7.123315334320068, + "learning_rate": 5.650424828549124e-06, + "loss": 0.2511, + "step": 18332 + }, + { + "epoch": 0.46392691752916465, + "grad_norm": 11.432069778442383, + "learning_rate": 5.650026715710032e-06, + "loss": 0.2166, + "step": 18333 + }, + { + "epoch": 0.46395222309385836, + "grad_norm": 6.897238254547119, + "learning_rate": 5.649628598679038e-06, + "loss": 0.1669, + "step": 18334 + }, + { + "epoch": 0.463977528658552, + "grad_norm": 4.2354559898376465, + "learning_rate": 5.64923047745871e-06, + "loss": 0.1343, + "step": 18335 + }, + { + "epoch": 0.4640028342232457, + "grad_norm": 13.054593086242676, + "learning_rate": 5.648832352051616e-06, + "loss": 0.2679, + "step": 18336 + }, + { + "epoch": 0.4640281397879394, + "grad_norm": 5.215747356414795, + "learning_rate": 5.6484342224603224e-06, + "loss": 0.1028, + "step": 18337 + }, + { + "epoch": 0.46405344535263304, + "grad_norm": 4.227297306060791, + "learning_rate": 5.648036088687399e-06, + "loss": 0.1278, + "step": 18338 + }, + { + "epoch": 0.4640787509173267, + "grad_norm": 13.005176544189453, + "learning_rate": 5.647637950735408e-06, + "loss": 0.1619, + "step": 18339 + }, + { + "epoch": 0.4641040564820204, + "grad_norm": 5.78295373916626, + "learning_rate": 5.647239808606923e-06, + "loss": 0.1707, + "step": 18340 + }, + { + "epoch": 0.46412936204671407, + "grad_norm": 6.596550941467285, + "learning_rate": 5.646841662304507e-06, + "loss": 0.2091, + "step": 18341 + }, + { + "epoch": 0.4641546676114077, + "grad_norm": 8.754168510437012, + "learning_rate": 5.646443511830732e-06, + "loss": 0.2958, + "step": 18342 + }, + { + "epoch": 0.46417997317610143, + "grad_norm": 3.9405057430267334, + "learning_rate": 5.6460453571881615e-06, + "loss": 0.1609, + "step": 18343 + }, + { + "epoch": 0.4642052787407951, + "grad_norm": 3.1765079498291016, + "learning_rate": 5.645647198379365e-06, + "loss": 0.1465, + "step": 18344 + }, + { + "epoch": 0.4642305843054888, + "grad_norm": 5.347836494445801, + "learning_rate": 5.6452490354069086e-06, + "loss": 0.1703, + "step": 18345 + }, + { + "epoch": 0.46425588987018246, + "grad_norm": 5.838015556335449, + "learning_rate": 5.644850868273363e-06, + "loss": 0.2947, + "step": 18346 + }, + { + "epoch": 0.4642811954348761, + "grad_norm": 4.180912971496582, + "learning_rate": 5.644452696981293e-06, + "loss": 0.168, + "step": 18347 + }, + { + "epoch": 0.4643065009995698, + "grad_norm": 4.37100887298584, + "learning_rate": 5.644054521533266e-06, + "loss": 0.152, + "step": 18348 + }, + { + "epoch": 0.4643318065642635, + "grad_norm": 7.317220211029053, + "learning_rate": 5.643656341931855e-06, + "loss": 0.2389, + "step": 18349 + }, + { + "epoch": 0.46435711212895714, + "grad_norm": 5.891505718231201, + "learning_rate": 5.643258158179621e-06, + "loss": 0.2105, + "step": 18350 + }, + { + "epoch": 0.46438241769365085, + "grad_norm": 6.091395854949951, + "learning_rate": 5.6428599702791355e-06, + "loss": 0.1413, + "step": 18351 + }, + { + "epoch": 0.4644077232583445, + "grad_norm": 11.679107666015625, + "learning_rate": 5.642461778232966e-06, + "loss": 0.1885, + "step": 18352 + }, + { + "epoch": 0.46443302882303816, + "grad_norm": 2.7256267070770264, + "learning_rate": 5.642063582043679e-06, + "loss": 0.1337, + "step": 18353 + }, + { + "epoch": 0.4644583343877319, + "grad_norm": 7.992190837860107, + "learning_rate": 5.641665381713845e-06, + "loss": 0.1851, + "step": 18354 + }, + { + "epoch": 0.46448363995242553, + "grad_norm": 5.265643119812012, + "learning_rate": 5.641267177246028e-06, + "loss": 0.2204, + "step": 18355 + }, + { + "epoch": 0.4645089455171192, + "grad_norm": 8.009198188781738, + "learning_rate": 5.640868968642801e-06, + "loss": 0.1948, + "step": 18356 + }, + { + "epoch": 0.4645342510818129, + "grad_norm": 4.506221294403076, + "learning_rate": 5.640470755906726e-06, + "loss": 0.1586, + "step": 18357 + }, + { + "epoch": 0.46455955664650656, + "grad_norm": 2.3763105869293213, + "learning_rate": 5.640072539040377e-06, + "loss": 0.0965, + "step": 18358 + }, + { + "epoch": 0.46458486221120027, + "grad_norm": 5.656421661376953, + "learning_rate": 5.639674318046317e-06, + "loss": 0.1687, + "step": 18359 + }, + { + "epoch": 0.4646101677758939, + "grad_norm": 2.934715509414673, + "learning_rate": 5.6392760929271195e-06, + "loss": 0.0942, + "step": 18360 + }, + { + "epoch": 0.4646354733405876, + "grad_norm": 3.6460728645324707, + "learning_rate": 5.638877863685347e-06, + "loss": 0.1404, + "step": 18361 + }, + { + "epoch": 0.4646607789052813, + "grad_norm": 9.690824508666992, + "learning_rate": 5.638479630323571e-06, + "loss": 0.1097, + "step": 18362 + }, + { + "epoch": 0.46468608446997495, + "grad_norm": 2.1691243648529053, + "learning_rate": 5.638081392844358e-06, + "loss": 0.1136, + "step": 18363 + }, + { + "epoch": 0.4647113900346686, + "grad_norm": 8.48240852355957, + "learning_rate": 5.637683151250276e-06, + "loss": 0.1853, + "step": 18364 + }, + { + "epoch": 0.4647366955993623, + "grad_norm": 13.018386840820312, + "learning_rate": 5.637284905543896e-06, + "loss": 0.248, + "step": 18365 + }, + { + "epoch": 0.46476200116405597, + "grad_norm": 4.046893119812012, + "learning_rate": 5.636886655727782e-06, + "loss": 0.2051, + "step": 18366 + }, + { + "epoch": 0.4647873067287496, + "grad_norm": 3.1211071014404297, + "learning_rate": 5.636488401804506e-06, + "loss": 0.1384, + "step": 18367 + }, + { + "epoch": 0.46481261229344334, + "grad_norm": 5.414921760559082, + "learning_rate": 5.636090143776635e-06, + "loss": 0.1116, + "step": 18368 + }, + { + "epoch": 0.464837917858137, + "grad_norm": 8.932573318481445, + "learning_rate": 5.6356918816467365e-06, + "loss": 0.1793, + "step": 18369 + }, + { + "epoch": 0.4648632234228307, + "grad_norm": 3.2336716651916504, + "learning_rate": 5.635293615417378e-06, + "loss": 0.1377, + "step": 18370 + }, + { + "epoch": 0.46488852898752436, + "grad_norm": 7.1392107009887695, + "learning_rate": 5.634895345091132e-06, + "loss": 0.1785, + "step": 18371 + }, + { + "epoch": 0.464913834552218, + "grad_norm": 3.2573792934417725, + "learning_rate": 5.634497070670562e-06, + "loss": 0.1447, + "step": 18372 + }, + { + "epoch": 0.46493914011691173, + "grad_norm": 4.535565376281738, + "learning_rate": 5.634098792158238e-06, + "loss": 0.179, + "step": 18373 + }, + { + "epoch": 0.4649644456816054, + "grad_norm": 3.783993721008301, + "learning_rate": 5.63370050955673e-06, + "loss": 0.1404, + "step": 18374 + }, + { + "epoch": 0.46498975124629904, + "grad_norm": 3.8148701190948486, + "learning_rate": 5.633302222868604e-06, + "loss": 0.1183, + "step": 18375 + }, + { + "epoch": 0.46501505681099276, + "grad_norm": 3.1585915088653564, + "learning_rate": 5.6329039320964315e-06, + "loss": 0.1296, + "step": 18376 + }, + { + "epoch": 0.4650403623756864, + "grad_norm": 6.251175403594971, + "learning_rate": 5.632505637242778e-06, + "loss": 0.1534, + "step": 18377 + }, + { + "epoch": 0.46506566794038007, + "grad_norm": 7.030656814575195, + "learning_rate": 5.632107338310213e-06, + "loss": 0.2434, + "step": 18378 + }, + { + "epoch": 0.4650909735050738, + "grad_norm": 4.077747821807861, + "learning_rate": 5.631709035301306e-06, + "loss": 0.1558, + "step": 18379 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 4.487153053283691, + "learning_rate": 5.631310728218625e-06, + "loss": 0.1357, + "step": 18380 + }, + { + "epoch": 0.4651415846344611, + "grad_norm": 11.814873695373535, + "learning_rate": 5.630912417064739e-06, + "loss": 0.2576, + "step": 18381 + }, + { + "epoch": 0.4651668901991548, + "grad_norm": 2.7025413513183594, + "learning_rate": 5.630514101842216e-06, + "loss": 0.1263, + "step": 18382 + }, + { + "epoch": 0.46519219576384846, + "grad_norm": 3.973696708679199, + "learning_rate": 5.630115782553624e-06, + "loss": 0.0912, + "step": 18383 + }, + { + "epoch": 0.46521750132854217, + "grad_norm": 7.14940881729126, + "learning_rate": 5.629717459201532e-06, + "loss": 0.2276, + "step": 18384 + }, + { + "epoch": 0.4652428068932358, + "grad_norm": 4.03320837020874, + "learning_rate": 5.629319131788511e-06, + "loss": 0.1598, + "step": 18385 + }, + { + "epoch": 0.4652681124579295, + "grad_norm": 7.896145820617676, + "learning_rate": 5.628920800317127e-06, + "loss": 0.2874, + "step": 18386 + }, + { + "epoch": 0.4652934180226232, + "grad_norm": 3.3739383220672607, + "learning_rate": 5.62852246478995e-06, + "loss": 0.107, + "step": 18387 + }, + { + "epoch": 0.46531872358731685, + "grad_norm": 10.019685745239258, + "learning_rate": 5.628124125209549e-06, + "loss": 0.2361, + "step": 18388 + }, + { + "epoch": 0.4653440291520105, + "grad_norm": 5.3110880851745605, + "learning_rate": 5.627725781578491e-06, + "loss": 0.1694, + "step": 18389 + }, + { + "epoch": 0.4653693347167042, + "grad_norm": 5.455934524536133, + "learning_rate": 5.627327433899348e-06, + "loss": 0.2271, + "step": 18390 + }, + { + "epoch": 0.4653946402813979, + "grad_norm": 7.823591232299805, + "learning_rate": 5.626929082174686e-06, + "loss": 0.221, + "step": 18391 + }, + { + "epoch": 0.46541994584609153, + "grad_norm": 5.42030668258667, + "learning_rate": 5.626530726407075e-06, + "loss": 0.2132, + "step": 18392 + }, + { + "epoch": 0.46544525141078524, + "grad_norm": 5.2753987312316895, + "learning_rate": 5.6261323665990845e-06, + "loss": 0.2194, + "step": 18393 + }, + { + "epoch": 0.4654705569754789, + "grad_norm": 6.3943610191345215, + "learning_rate": 5.625734002753281e-06, + "loss": 0.183, + "step": 18394 + }, + { + "epoch": 0.4654958625401726, + "grad_norm": 14.05651569366455, + "learning_rate": 5.625335634872237e-06, + "loss": 0.2808, + "step": 18395 + }, + { + "epoch": 0.46552116810486627, + "grad_norm": 5.483514308929443, + "learning_rate": 5.624937262958519e-06, + "loss": 0.1279, + "step": 18396 + }, + { + "epoch": 0.4655464736695599, + "grad_norm": 7.0835280418396, + "learning_rate": 5.624538887014698e-06, + "loss": 0.2244, + "step": 18397 + }, + { + "epoch": 0.46557177923425364, + "grad_norm": 8.246818542480469, + "learning_rate": 5.624140507043341e-06, + "loss": 0.2198, + "step": 18398 + }, + { + "epoch": 0.4655970847989473, + "grad_norm": 5.182062149047852, + "learning_rate": 5.623742123047018e-06, + "loss": 0.1873, + "step": 18399 + }, + { + "epoch": 0.46562239036364095, + "grad_norm": 4.1353983879089355, + "learning_rate": 5.6233437350282985e-06, + "loss": 0.1768, + "step": 18400 + }, + { + "epoch": 0.46564769592833466, + "grad_norm": 3.0305984020233154, + "learning_rate": 5.6229453429897516e-06, + "loss": 0.1261, + "step": 18401 + }, + { + "epoch": 0.4656730014930283, + "grad_norm": 8.140427589416504, + "learning_rate": 5.622546946933944e-06, + "loss": 0.201, + "step": 18402 + }, + { + "epoch": 0.46569830705772197, + "grad_norm": 7.719430446624756, + "learning_rate": 5.62214854686345e-06, + "loss": 0.205, + "step": 18403 + }, + { + "epoch": 0.4657236126224157, + "grad_norm": 4.302043437957764, + "learning_rate": 5.621750142780834e-06, + "loss": 0.1743, + "step": 18404 + }, + { + "epoch": 0.46574891818710934, + "grad_norm": 7.244979381561279, + "learning_rate": 5.621351734688667e-06, + "loss": 0.1586, + "step": 18405 + }, + { + "epoch": 0.465774223751803, + "grad_norm": 8.958011627197266, + "learning_rate": 5.620953322589517e-06, + "loss": 0.1454, + "step": 18406 + }, + { + "epoch": 0.4657995293164967, + "grad_norm": 5.619622707366943, + "learning_rate": 5.620554906485958e-06, + "loss": 0.2324, + "step": 18407 + }, + { + "epoch": 0.46582483488119036, + "grad_norm": 3.39573073387146, + "learning_rate": 5.620156486380552e-06, + "loss": 0.1403, + "step": 18408 + }, + { + "epoch": 0.4658501404458841, + "grad_norm": 16.99907875061035, + "learning_rate": 5.619758062275875e-06, + "loss": 0.2434, + "step": 18409 + }, + { + "epoch": 0.46587544601057773, + "grad_norm": 3.7826945781707764, + "learning_rate": 5.619359634174492e-06, + "loss": 0.1922, + "step": 18410 + }, + { + "epoch": 0.4659007515752714, + "grad_norm": 3.422536611557007, + "learning_rate": 5.618961202078974e-06, + "loss": 0.1694, + "step": 18411 + }, + { + "epoch": 0.4659260571399651, + "grad_norm": 12.179620742797852, + "learning_rate": 5.618562765991891e-06, + "loss": 0.2384, + "step": 18412 + }, + { + "epoch": 0.46595136270465876, + "grad_norm": 8.995352745056152, + "learning_rate": 5.618164325915811e-06, + "loss": 0.2243, + "step": 18413 + }, + { + "epoch": 0.4659766682693524, + "grad_norm": 4.531476020812988, + "learning_rate": 5.617765881853305e-06, + "loss": 0.1243, + "step": 18414 + }, + { + "epoch": 0.4660019738340461, + "grad_norm": 5.967689514160156, + "learning_rate": 5.617367433806942e-06, + "loss": 0.1507, + "step": 18415 + }, + { + "epoch": 0.4660272793987398, + "grad_norm": 9.23958969116211, + "learning_rate": 5.61696898177929e-06, + "loss": 0.1385, + "step": 18416 + }, + { + "epoch": 0.46605258496343344, + "grad_norm": 2.5381412506103516, + "learning_rate": 5.6165705257729195e-06, + "loss": 0.1188, + "step": 18417 + }, + { + "epoch": 0.46607789052812715, + "grad_norm": 3.2810521125793457, + "learning_rate": 5.616172065790403e-06, + "loss": 0.2016, + "step": 18418 + }, + { + "epoch": 0.4661031960928208, + "grad_norm": 8.709403991699219, + "learning_rate": 5.6157736018343045e-06, + "loss": 0.227, + "step": 18419 + }, + { + "epoch": 0.46612850165751446, + "grad_norm": 9.302809715270996, + "learning_rate": 5.615375133907199e-06, + "loss": 0.1991, + "step": 18420 + }, + { + "epoch": 0.46615380722220817, + "grad_norm": 5.979795932769775, + "learning_rate": 5.614976662011652e-06, + "loss": 0.2659, + "step": 18421 + }, + { + "epoch": 0.46617911278690183, + "grad_norm": 2.4856574535369873, + "learning_rate": 5.6145781861502355e-06, + "loss": 0.0734, + "step": 18422 + }, + { + "epoch": 0.46620441835159554, + "grad_norm": 5.632476329803467, + "learning_rate": 5.614179706325518e-06, + "loss": 0.2186, + "step": 18423 + }, + { + "epoch": 0.4662297239162892, + "grad_norm": 4.299849510192871, + "learning_rate": 5.613781222540071e-06, + "loss": 0.0971, + "step": 18424 + }, + { + "epoch": 0.46625502948098285, + "grad_norm": 3.693532705307007, + "learning_rate": 5.613382734796463e-06, + "loss": 0.1106, + "step": 18425 + }, + { + "epoch": 0.46628033504567656, + "grad_norm": 3.447342872619629, + "learning_rate": 5.612984243097264e-06, + "loss": 0.118, + "step": 18426 + }, + { + "epoch": 0.4663056406103702, + "grad_norm": 12.48961353302002, + "learning_rate": 5.612585747445042e-06, + "loss": 0.2323, + "step": 18427 + }, + { + "epoch": 0.4663309461750639, + "grad_norm": 6.020676612854004, + "learning_rate": 5.61218724784237e-06, + "loss": 0.1938, + "step": 18428 + }, + { + "epoch": 0.4663562517397576, + "grad_norm": 10.666191101074219, + "learning_rate": 5.611788744291815e-06, + "loss": 0.2046, + "step": 18429 + }, + { + "epoch": 0.46638155730445124, + "grad_norm": 5.262463092803955, + "learning_rate": 5.611390236795951e-06, + "loss": 0.1111, + "step": 18430 + }, + { + "epoch": 0.4664068628691449, + "grad_norm": 3.8808584213256836, + "learning_rate": 5.610991725357342e-06, + "loss": 0.1273, + "step": 18431 + }, + { + "epoch": 0.4664321684338386, + "grad_norm": 4.255000591278076, + "learning_rate": 5.610593209978562e-06, + "loss": 0.1376, + "step": 18432 + }, + { + "epoch": 0.46645747399853227, + "grad_norm": 4.982208728790283, + "learning_rate": 5.610194690662181e-06, + "loss": 0.1479, + "step": 18433 + }, + { + "epoch": 0.466482779563226, + "grad_norm": 6.137782096862793, + "learning_rate": 5.609796167410766e-06, + "loss": 0.2131, + "step": 18434 + }, + { + "epoch": 0.46650808512791964, + "grad_norm": 3.448896884918213, + "learning_rate": 5.609397640226891e-06, + "loss": 0.1056, + "step": 18435 + }, + { + "epoch": 0.4665333906926133, + "grad_norm": 9.458191871643066, + "learning_rate": 5.608999109113122e-06, + "loss": 0.1976, + "step": 18436 + }, + { + "epoch": 0.466558696257307, + "grad_norm": 2.8670196533203125, + "learning_rate": 5.608600574072033e-06, + "loss": 0.115, + "step": 18437 + }, + { + "epoch": 0.46658400182200066, + "grad_norm": 8.02505874633789, + "learning_rate": 5.608202035106191e-06, + "loss": 0.2655, + "step": 18438 + }, + { + "epoch": 0.4666093073866943, + "grad_norm": 27.326313018798828, + "learning_rate": 5.6078034922181655e-06, + "loss": 0.2183, + "step": 18439 + }, + { + "epoch": 0.46663461295138803, + "grad_norm": 4.99728536605835, + "learning_rate": 5.607404945410532e-06, + "loss": 0.1401, + "step": 18440 + }, + { + "epoch": 0.4666599185160817, + "grad_norm": 5.207333564758301, + "learning_rate": 5.607006394685854e-06, + "loss": 0.1521, + "step": 18441 + }, + { + "epoch": 0.46668522408077534, + "grad_norm": 6.330989360809326, + "learning_rate": 5.606607840046706e-06, + "loss": 0.1892, + "step": 18442 + }, + { + "epoch": 0.46671052964546905, + "grad_norm": 5.506035327911377, + "learning_rate": 5.606209281495656e-06, + "loss": 0.2016, + "step": 18443 + }, + { + "epoch": 0.4667358352101627, + "grad_norm": 3.6645290851593018, + "learning_rate": 5.605810719035277e-06, + "loss": 0.1447, + "step": 18444 + }, + { + "epoch": 0.46676114077485636, + "grad_norm": 12.966212272644043, + "learning_rate": 5.605412152668135e-06, + "loss": 0.2867, + "step": 18445 + }, + { + "epoch": 0.4667864463395501, + "grad_norm": 5.5749831199646, + "learning_rate": 5.6050135823968045e-06, + "loss": 0.14, + "step": 18446 + }, + { + "epoch": 0.46681175190424373, + "grad_norm": 11.238981246948242, + "learning_rate": 5.604615008223853e-06, + "loss": 0.2116, + "step": 18447 + }, + { + "epoch": 0.46683705746893744, + "grad_norm": 14.19011116027832, + "learning_rate": 5.604216430151851e-06, + "loss": 0.2572, + "step": 18448 + }, + { + "epoch": 0.4668623630336311, + "grad_norm": 10.310437202453613, + "learning_rate": 5.603817848183372e-06, + "loss": 0.0987, + "step": 18449 + }, + { + "epoch": 0.46688766859832476, + "grad_norm": 6.331991195678711, + "learning_rate": 5.603419262320982e-06, + "loss": 0.1936, + "step": 18450 + }, + { + "epoch": 0.46691297416301847, + "grad_norm": 7.295206546783447, + "learning_rate": 5.6030206725672546e-06, + "loss": 0.2352, + "step": 18451 + }, + { + "epoch": 0.4669382797277121, + "grad_norm": 4.262105464935303, + "learning_rate": 5.602622078924758e-06, + "loss": 0.1776, + "step": 18452 + }, + { + "epoch": 0.4669635852924058, + "grad_norm": 4.780152320861816, + "learning_rate": 5.602223481396064e-06, + "loss": 0.1193, + "step": 18453 + }, + { + "epoch": 0.4669888908570995, + "grad_norm": 13.33358383178711, + "learning_rate": 5.601824879983743e-06, + "loss": 0.3075, + "step": 18454 + }, + { + "epoch": 0.46701419642179315, + "grad_norm": 4.680087089538574, + "learning_rate": 5.601426274690366e-06, + "loss": 0.0988, + "step": 18455 + }, + { + "epoch": 0.4670395019864868, + "grad_norm": 4.244994640350342, + "learning_rate": 5.601027665518501e-06, + "loss": 0.1672, + "step": 18456 + }, + { + "epoch": 0.4670648075511805, + "grad_norm": 12.21169376373291, + "learning_rate": 5.600629052470723e-06, + "loss": 0.2128, + "step": 18457 + }, + { + "epoch": 0.4670901131158742, + "grad_norm": 3.6815860271453857, + "learning_rate": 5.600230435549597e-06, + "loss": 0.163, + "step": 18458 + }, + { + "epoch": 0.4671154186805679, + "grad_norm": 5.598977088928223, + "learning_rate": 5.5998318147576995e-06, + "loss": 0.2617, + "step": 18459 + }, + { + "epoch": 0.46714072424526154, + "grad_norm": 7.806373119354248, + "learning_rate": 5.5994331900975964e-06, + "loss": 0.2228, + "step": 18460 + }, + { + "epoch": 0.4671660298099552, + "grad_norm": 3.4142088890075684, + "learning_rate": 5.599034561571861e-06, + "loss": 0.116, + "step": 18461 + }, + { + "epoch": 0.4671913353746489, + "grad_norm": 9.591764450073242, + "learning_rate": 5.598635929183063e-06, + "loss": 0.1344, + "step": 18462 + }, + { + "epoch": 0.46721664093934256, + "grad_norm": 6.921687126159668, + "learning_rate": 5.598237292933773e-06, + "loss": 0.1862, + "step": 18463 + }, + { + "epoch": 0.4672419465040362, + "grad_norm": 3.0155959129333496, + "learning_rate": 5.597838652826562e-06, + "loss": 0.1861, + "step": 18464 + }, + { + "epoch": 0.46726725206872993, + "grad_norm": 5.407291412353516, + "learning_rate": 5.597440008864e-06, + "loss": 0.1642, + "step": 18465 + }, + { + "epoch": 0.4672925576334236, + "grad_norm": 9.680330276489258, + "learning_rate": 5.59704136104866e-06, + "loss": 0.2957, + "step": 18466 + }, + { + "epoch": 0.46731786319811724, + "grad_norm": 5.230595111846924, + "learning_rate": 5.59664270938311e-06, + "loss": 0.0963, + "step": 18467 + }, + { + "epoch": 0.46734316876281096, + "grad_norm": 4.073513984680176, + "learning_rate": 5.596244053869923e-06, + "loss": 0.1576, + "step": 18468 + }, + { + "epoch": 0.4673684743275046, + "grad_norm": 6.076649188995361, + "learning_rate": 5.595845394511667e-06, + "loss": 0.1914, + "step": 18469 + }, + { + "epoch": 0.46739377989219827, + "grad_norm": 3.4918901920318604, + "learning_rate": 5.595446731310917e-06, + "loss": 0.1247, + "step": 18470 + }, + { + "epoch": 0.467419085456892, + "grad_norm": 6.731710910797119, + "learning_rate": 5.595048064270242e-06, + "loss": 0.1834, + "step": 18471 + }, + { + "epoch": 0.46744439102158564, + "grad_norm": 6.929751396179199, + "learning_rate": 5.594649393392211e-06, + "loss": 0.2071, + "step": 18472 + }, + { + "epoch": 0.46746969658627935, + "grad_norm": 5.343993186950684, + "learning_rate": 5.5942507186793985e-06, + "loss": 0.1394, + "step": 18473 + }, + { + "epoch": 0.467495002150973, + "grad_norm": 6.745351314544678, + "learning_rate": 5.593852040134372e-06, + "loss": 0.2413, + "step": 18474 + }, + { + "epoch": 0.46752030771566666, + "grad_norm": 3.908794641494751, + "learning_rate": 5.593453357759706e-06, + "loss": 0.1845, + "step": 18475 + }, + { + "epoch": 0.4675456132803604, + "grad_norm": 4.566552639007568, + "learning_rate": 5.593054671557968e-06, + "loss": 0.2087, + "step": 18476 + }, + { + "epoch": 0.46757091884505403, + "grad_norm": 5.701021671295166, + "learning_rate": 5.592655981531733e-06, + "loss": 0.2434, + "step": 18477 + }, + { + "epoch": 0.4675962244097477, + "grad_norm": 8.004213333129883, + "learning_rate": 5.592257287683567e-06, + "loss": 0.2333, + "step": 18478 + }, + { + "epoch": 0.4676215299744414, + "grad_norm": 10.03848648071289, + "learning_rate": 5.591858590016046e-06, + "loss": 0.2737, + "step": 18479 + }, + { + "epoch": 0.46764683553913505, + "grad_norm": 8.79952335357666, + "learning_rate": 5.5914598885317376e-06, + "loss": 0.2744, + "step": 18480 + }, + { + "epoch": 0.4676721411038287, + "grad_norm": 6.962131023406982, + "learning_rate": 5.591061183233214e-06, + "loss": 0.1783, + "step": 18481 + }, + { + "epoch": 0.4676974466685224, + "grad_norm": 5.849704265594482, + "learning_rate": 5.590662474123049e-06, + "loss": 0.1691, + "step": 18482 + }, + { + "epoch": 0.4677227522332161, + "grad_norm": 4.9119873046875, + "learning_rate": 5.5902637612038095e-06, + "loss": 0.1815, + "step": 18483 + }, + { + "epoch": 0.46774805779790973, + "grad_norm": 16.151077270507812, + "learning_rate": 5.58986504447807e-06, + "loss": 0.2693, + "step": 18484 + }, + { + "epoch": 0.46777336336260344, + "grad_norm": 4.205448627471924, + "learning_rate": 5.5894663239484e-06, + "loss": 0.1848, + "step": 18485 + }, + { + "epoch": 0.4677986689272971, + "grad_norm": 4.165178298950195, + "learning_rate": 5.589067599617373e-06, + "loss": 0.1475, + "step": 18486 + }, + { + "epoch": 0.4678239744919908, + "grad_norm": 3.509690046310425, + "learning_rate": 5.588668871487556e-06, + "loss": 0.1353, + "step": 18487 + }, + { + "epoch": 0.46784928005668447, + "grad_norm": 3.5739810466766357, + "learning_rate": 5.588270139561525e-06, + "loss": 0.1322, + "step": 18488 + }, + { + "epoch": 0.4678745856213781, + "grad_norm": 4.2486138343811035, + "learning_rate": 5.587871403841848e-06, + "loss": 0.1158, + "step": 18489 + }, + { + "epoch": 0.46789989118607184, + "grad_norm": 5.9324951171875, + "learning_rate": 5.587472664331099e-06, + "loss": 0.1994, + "step": 18490 + }, + { + "epoch": 0.4679251967507655, + "grad_norm": 4.386294364929199, + "learning_rate": 5.587073921031847e-06, + "loss": 0.1768, + "step": 18491 + }, + { + "epoch": 0.46795050231545915, + "grad_norm": 7.629611968994141, + "learning_rate": 5.586675173946664e-06, + "loss": 0.1541, + "step": 18492 + }, + { + "epoch": 0.46797580788015286, + "grad_norm": 4.1919684410095215, + "learning_rate": 5.586276423078123e-06, + "loss": 0.1326, + "step": 18493 + }, + { + "epoch": 0.4680011134448465, + "grad_norm": 7.024995803833008, + "learning_rate": 5.585877668428794e-06, + "loss": 0.2254, + "step": 18494 + }, + { + "epoch": 0.4680264190095402, + "grad_norm": 5.726794719696045, + "learning_rate": 5.585478910001249e-06, + "loss": 0.2377, + "step": 18495 + }, + { + "epoch": 0.4680517245742339, + "grad_norm": 7.133894443511963, + "learning_rate": 5.5850801477980585e-06, + "loss": 0.1646, + "step": 18496 + }, + { + "epoch": 0.46807703013892754, + "grad_norm": 4.906076431274414, + "learning_rate": 5.5846813818217975e-06, + "loss": 0.2143, + "step": 18497 + }, + { + "epoch": 0.46810233570362125, + "grad_norm": 2.558729410171509, + "learning_rate": 5.584282612075032e-06, + "loss": 0.1278, + "step": 18498 + }, + { + "epoch": 0.4681276412683149, + "grad_norm": 3.0294439792633057, + "learning_rate": 5.583883838560339e-06, + "loss": 0.1304, + "step": 18499 + }, + { + "epoch": 0.46815294683300857, + "grad_norm": 2.6708920001983643, + "learning_rate": 5.583485061280286e-06, + "loss": 0.1257, + "step": 18500 + }, + { + "epoch": 0.4681782523977023, + "grad_norm": 6.832402229309082, + "learning_rate": 5.583086280237447e-06, + "loss": 0.2729, + "step": 18501 + }, + { + "epoch": 0.46820355796239593, + "grad_norm": 5.308705806732178, + "learning_rate": 5.582687495434392e-06, + "loss": 0.2414, + "step": 18502 + }, + { + "epoch": 0.4682288635270896, + "grad_norm": 3.52638578414917, + "learning_rate": 5.582288706873695e-06, + "loss": 0.1495, + "step": 18503 + }, + { + "epoch": 0.4682541690917833, + "grad_norm": 3.5245919227600098, + "learning_rate": 5.5818899145579265e-06, + "loss": 0.0791, + "step": 18504 + }, + { + "epoch": 0.46827947465647696, + "grad_norm": 26.090932846069336, + "learning_rate": 5.581491118489658e-06, + "loss": 0.1765, + "step": 18505 + }, + { + "epoch": 0.4683047802211706, + "grad_norm": 8.502975463867188, + "learning_rate": 5.581092318671462e-06, + "loss": 0.1574, + "step": 18506 + }, + { + "epoch": 0.4683300857858643, + "grad_norm": 4.510061264038086, + "learning_rate": 5.580693515105908e-06, + "loss": 0.1567, + "step": 18507 + }, + { + "epoch": 0.468355391350558, + "grad_norm": 4.012888431549072, + "learning_rate": 5.580294707795572e-06, + "loss": 0.1558, + "step": 18508 + }, + { + "epoch": 0.46838069691525164, + "grad_norm": 3.4079761505126953, + "learning_rate": 5.579895896743021e-06, + "loss": 0.1305, + "step": 18509 + }, + { + "epoch": 0.46840600247994535, + "grad_norm": 8.844307899475098, + "learning_rate": 5.579497081950831e-06, + "loss": 0.2421, + "step": 18510 + }, + { + "epoch": 0.468431308044639, + "grad_norm": 5.064499378204346, + "learning_rate": 5.579098263421571e-06, + "loss": 0.2358, + "step": 18511 + }, + { + "epoch": 0.4684566136093327, + "grad_norm": 5.225212097167969, + "learning_rate": 5.578699441157813e-06, + "loss": 0.1747, + "step": 18512 + }, + { + "epoch": 0.4684819191740264, + "grad_norm": 2.939307451248169, + "learning_rate": 5.578300615162132e-06, + "loss": 0.0701, + "step": 18513 + }, + { + "epoch": 0.46850722473872003, + "grad_norm": 6.20678186416626, + "learning_rate": 5.5779017854370965e-06, + "loss": 0.1463, + "step": 18514 + }, + { + "epoch": 0.46853253030341374, + "grad_norm": 18.176015853881836, + "learning_rate": 5.577502951985282e-06, + "loss": 0.2865, + "step": 18515 + }, + { + "epoch": 0.4685578358681074, + "grad_norm": 12.079939842224121, + "learning_rate": 5.577104114809257e-06, + "loss": 0.2901, + "step": 18516 + }, + { + "epoch": 0.46858314143280105, + "grad_norm": 3.2770302295684814, + "learning_rate": 5.576705273911595e-06, + "loss": 0.0866, + "step": 18517 + }, + { + "epoch": 0.46860844699749477, + "grad_norm": 5.8056559562683105, + "learning_rate": 5.576306429294868e-06, + "loss": 0.159, + "step": 18518 + }, + { + "epoch": 0.4686337525621884, + "grad_norm": 8.889853477478027, + "learning_rate": 5.575907580961647e-06, + "loss": 0.2014, + "step": 18519 + }, + { + "epoch": 0.4686590581268821, + "grad_norm": 8.90974235534668, + "learning_rate": 5.575508728914507e-06, + "loss": 0.2459, + "step": 18520 + }, + { + "epoch": 0.4686843636915758, + "grad_norm": 12.563671112060547, + "learning_rate": 5.5751098731560175e-06, + "loss": 0.258, + "step": 18521 + }, + { + "epoch": 0.46870966925626945, + "grad_norm": 9.676763534545898, + "learning_rate": 5.574711013688751e-06, + "loss": 0.234, + "step": 18522 + }, + { + "epoch": 0.46873497482096316, + "grad_norm": 6.673013687133789, + "learning_rate": 5.574312150515279e-06, + "loss": 0.2942, + "step": 18523 + }, + { + "epoch": 0.4687602803856568, + "grad_norm": 5.98151159286499, + "learning_rate": 5.573913283638177e-06, + "loss": 0.1395, + "step": 18524 + }, + { + "epoch": 0.46878558595035047, + "grad_norm": 4.745412826538086, + "learning_rate": 5.573514413060014e-06, + "loss": 0.1321, + "step": 18525 + }, + { + "epoch": 0.4688108915150442, + "grad_norm": 3.9920148849487305, + "learning_rate": 5.5731155387833635e-06, + "loss": 0.2066, + "step": 18526 + }, + { + "epoch": 0.46883619707973784, + "grad_norm": 4.676527500152588, + "learning_rate": 5.572716660810798e-06, + "loss": 0.1769, + "step": 18527 + }, + { + "epoch": 0.4688615026444315, + "grad_norm": 8.608376502990723, + "learning_rate": 5.572317779144888e-06, + "loss": 0.2747, + "step": 18528 + }, + { + "epoch": 0.4688868082091252, + "grad_norm": 4.66041374206543, + "learning_rate": 5.571918893788207e-06, + "loss": 0.248, + "step": 18529 + }, + { + "epoch": 0.46891211377381886, + "grad_norm": 4.285167217254639, + "learning_rate": 5.571520004743329e-06, + "loss": 0.1669, + "step": 18530 + }, + { + "epoch": 0.4689374193385125, + "grad_norm": 3.493797779083252, + "learning_rate": 5.571121112012823e-06, + "loss": 0.1851, + "step": 18531 + }, + { + "epoch": 0.46896272490320623, + "grad_norm": 4.20836877822876, + "learning_rate": 5.5707222155992645e-06, + "loss": 0.1749, + "step": 18532 + }, + { + "epoch": 0.4689880304678999, + "grad_norm": 3.100832223892212, + "learning_rate": 5.570323315505224e-06, + "loss": 0.1515, + "step": 18533 + }, + { + "epoch": 0.46901333603259354, + "grad_norm": 5.1960062980651855, + "learning_rate": 5.569924411733274e-06, + "loss": 0.1607, + "step": 18534 + }, + { + "epoch": 0.46903864159728725, + "grad_norm": 20.975990295410156, + "learning_rate": 5.569525504285989e-06, + "loss": 0.4681, + "step": 18535 + }, + { + "epoch": 0.4690639471619809, + "grad_norm": 6.168128490447998, + "learning_rate": 5.569126593165939e-06, + "loss": 0.2279, + "step": 18536 + }, + { + "epoch": 0.4690892527266746, + "grad_norm": 5.021334171295166, + "learning_rate": 5.5687276783756985e-06, + "loss": 0.1774, + "step": 18537 + }, + { + "epoch": 0.4691145582913683, + "grad_norm": 4.335803985595703, + "learning_rate": 5.568328759917837e-06, + "loss": 0.1648, + "step": 18538 + }, + { + "epoch": 0.46913986385606193, + "grad_norm": 5.3533854484558105, + "learning_rate": 5.567929837794931e-06, + "loss": 0.2707, + "step": 18539 + }, + { + "epoch": 0.46916516942075565, + "grad_norm": 6.720860481262207, + "learning_rate": 5.5675309120095486e-06, + "loss": 0.2069, + "step": 18540 + }, + { + "epoch": 0.4691904749854493, + "grad_norm": 4.008394241333008, + "learning_rate": 5.567131982564268e-06, + "loss": 0.1597, + "step": 18541 + }, + { + "epoch": 0.46921578055014296, + "grad_norm": 2.9903111457824707, + "learning_rate": 5.566733049461657e-06, + "loss": 0.1685, + "step": 18542 + }, + { + "epoch": 0.46924108611483667, + "grad_norm": 4.728090286254883, + "learning_rate": 5.5663341127042906e-06, + "loss": 0.1486, + "step": 18543 + }, + { + "epoch": 0.4692663916795303, + "grad_norm": 8.14154052734375, + "learning_rate": 5.56593517229474e-06, + "loss": 0.2582, + "step": 18544 + }, + { + "epoch": 0.469291697244224, + "grad_norm": 3.8206217288970947, + "learning_rate": 5.5655362282355785e-06, + "loss": 0.1629, + "step": 18545 + }, + { + "epoch": 0.4693170028089177, + "grad_norm": 13.6985445022583, + "learning_rate": 5.565137280529381e-06, + "loss": 0.2736, + "step": 18546 + }, + { + "epoch": 0.46934230837361135, + "grad_norm": 5.074478626251221, + "learning_rate": 5.564738329178717e-06, + "loss": 0.1949, + "step": 18547 + }, + { + "epoch": 0.469367613938305, + "grad_norm": 4.206547260284424, + "learning_rate": 5.564339374186161e-06, + "loss": 0.185, + "step": 18548 + }, + { + "epoch": 0.4693929195029987, + "grad_norm": 3.3944876194000244, + "learning_rate": 5.563940415554286e-06, + "loss": 0.11, + "step": 18549 + }, + { + "epoch": 0.4694182250676924, + "grad_norm": 3.9711973667144775, + "learning_rate": 5.563541453285663e-06, + "loss": 0.1717, + "step": 18550 + }, + { + "epoch": 0.4694435306323861, + "grad_norm": 5.581778049468994, + "learning_rate": 5.563142487382866e-06, + "loss": 0.1999, + "step": 18551 + }, + { + "epoch": 0.46946883619707974, + "grad_norm": 5.9612627029418945, + "learning_rate": 5.562743517848469e-06, + "loss": 0.2143, + "step": 18552 + }, + { + "epoch": 0.4694941417617734, + "grad_norm": 4.711045742034912, + "learning_rate": 5.562344544685042e-06, + "loss": 0.1555, + "step": 18553 + }, + { + "epoch": 0.4695194473264671, + "grad_norm": 4.000046253204346, + "learning_rate": 5.561945567895162e-06, + "loss": 0.1457, + "step": 18554 + }, + { + "epoch": 0.46954475289116077, + "grad_norm": 7.119393348693848, + "learning_rate": 5.5615465874813965e-06, + "loss": 0.2526, + "step": 18555 + }, + { + "epoch": 0.4695700584558544, + "grad_norm": 12.383140563964844, + "learning_rate": 5.561147603446323e-06, + "loss": 0.2876, + "step": 18556 + }, + { + "epoch": 0.46959536402054813, + "grad_norm": 4.945431709289551, + "learning_rate": 5.560748615792513e-06, + "loss": 0.1609, + "step": 18557 + }, + { + "epoch": 0.4696206695852418, + "grad_norm": 13.100432395935059, + "learning_rate": 5.56034962452254e-06, + "loss": 0.2119, + "step": 18558 + }, + { + "epoch": 0.46964597514993545, + "grad_norm": 5.787097930908203, + "learning_rate": 5.559950629638976e-06, + "loss": 0.1701, + "step": 18559 + }, + { + "epoch": 0.46967128071462916, + "grad_norm": 3.15084171295166, + "learning_rate": 5.559551631144394e-06, + "loss": 0.1581, + "step": 18560 + }, + { + "epoch": 0.4696965862793228, + "grad_norm": 3.805873394012451, + "learning_rate": 5.559152629041369e-06, + "loss": 0.1299, + "step": 18561 + }, + { + "epoch": 0.4697218918440165, + "grad_norm": 3.1761906147003174, + "learning_rate": 5.5587536233324715e-06, + "loss": 0.1468, + "step": 18562 + }, + { + "epoch": 0.4697471974087102, + "grad_norm": 8.991036415100098, + "learning_rate": 5.558354614020276e-06, + "loss": 0.2065, + "step": 18563 + }, + { + "epoch": 0.46977250297340384, + "grad_norm": 7.972304821014404, + "learning_rate": 5.557955601107356e-06, + "loss": 0.1769, + "step": 18564 + }, + { + "epoch": 0.46979780853809755, + "grad_norm": 9.318489074707031, + "learning_rate": 5.557556584596284e-06, + "loss": 0.1871, + "step": 18565 + }, + { + "epoch": 0.4698231141027912, + "grad_norm": 6.390841484069824, + "learning_rate": 5.557157564489632e-06, + "loss": 0.2274, + "step": 18566 + }, + { + "epoch": 0.46984841966748486, + "grad_norm": 3.7921197414398193, + "learning_rate": 5.556758540789975e-06, + "loss": 0.1831, + "step": 18567 + }, + { + "epoch": 0.4698737252321786, + "grad_norm": 5.65586519241333, + "learning_rate": 5.556359513499887e-06, + "loss": 0.1656, + "step": 18568 + }, + { + "epoch": 0.46989903079687223, + "grad_norm": 4.1854963302612305, + "learning_rate": 5.555960482621939e-06, + "loss": 0.2001, + "step": 18569 + }, + { + "epoch": 0.4699243363615659, + "grad_norm": 3.38310170173645, + "learning_rate": 5.555561448158706e-06, + "loss": 0.0927, + "step": 18570 + }, + { + "epoch": 0.4699496419262596, + "grad_norm": 2.712507486343384, + "learning_rate": 5.55516241011276e-06, + "loss": 0.1269, + "step": 18571 + }, + { + "epoch": 0.46997494749095325, + "grad_norm": 11.82339859008789, + "learning_rate": 5.554763368486675e-06, + "loss": 0.1611, + "step": 18572 + }, + { + "epoch": 0.4700002530556469, + "grad_norm": 6.634538650512695, + "learning_rate": 5.554364323283025e-06, + "loss": 0.2618, + "step": 18573 + }, + { + "epoch": 0.4700255586203406, + "grad_norm": 4.767358779907227, + "learning_rate": 5.5539652745043816e-06, + "loss": 0.1653, + "step": 18574 + }, + { + "epoch": 0.4700508641850343, + "grad_norm": 4.3511457443237305, + "learning_rate": 5.553566222153319e-06, + "loss": 0.1947, + "step": 18575 + }, + { + "epoch": 0.470076169749728, + "grad_norm": 8.081940650939941, + "learning_rate": 5.553167166232412e-06, + "loss": 0.1816, + "step": 18576 + }, + { + "epoch": 0.47010147531442165, + "grad_norm": 7.321653842926025, + "learning_rate": 5.552768106744232e-06, + "loss": 0.2449, + "step": 18577 + }, + { + "epoch": 0.4701267808791153, + "grad_norm": 6.7881975173950195, + "learning_rate": 5.552369043691353e-06, + "loss": 0.1893, + "step": 18578 + }, + { + "epoch": 0.470152086443809, + "grad_norm": 5.443038463592529, + "learning_rate": 5.55196997707635e-06, + "loss": 0.2621, + "step": 18579 + }, + { + "epoch": 0.47017739200850267, + "grad_norm": 5.317254543304443, + "learning_rate": 5.551570906901794e-06, + "loss": 0.1788, + "step": 18580 + }, + { + "epoch": 0.4702026975731963, + "grad_norm": 6.9628586769104, + "learning_rate": 5.551171833170261e-06, + "loss": 0.1951, + "step": 18581 + }, + { + "epoch": 0.47022800313789004, + "grad_norm": 4.077950954437256, + "learning_rate": 5.550772755884322e-06, + "loss": 0.1503, + "step": 18582 + }, + { + "epoch": 0.4702533087025837, + "grad_norm": 8.772905349731445, + "learning_rate": 5.5503736750465535e-06, + "loss": 0.1785, + "step": 18583 + }, + { + "epoch": 0.47027861426727735, + "grad_norm": 6.457885265350342, + "learning_rate": 5.549974590659526e-06, + "loss": 0.1536, + "step": 18584 + }, + { + "epoch": 0.47030391983197106, + "grad_norm": 3.5523500442504883, + "learning_rate": 5.549575502725815e-06, + "loss": 0.1206, + "step": 18585 + }, + { + "epoch": 0.4703292253966647, + "grad_norm": 6.3632683753967285, + "learning_rate": 5.549176411247994e-06, + "loss": 0.135, + "step": 18586 + }, + { + "epoch": 0.47035453096135843, + "grad_norm": 5.841668128967285, + "learning_rate": 5.548777316228637e-06, + "loss": 0.1439, + "step": 18587 + }, + { + "epoch": 0.4703798365260521, + "grad_norm": 8.707667350769043, + "learning_rate": 5.5483782176703175e-06, + "loss": 0.2327, + "step": 18588 + }, + { + "epoch": 0.47040514209074574, + "grad_norm": 23.991207122802734, + "learning_rate": 5.547979115575607e-06, + "loss": 0.2699, + "step": 18589 + }, + { + "epoch": 0.47043044765543945, + "grad_norm": 3.0710246562957764, + "learning_rate": 5.5475800099470835e-06, + "loss": 0.1118, + "step": 18590 + }, + { + "epoch": 0.4704557532201331, + "grad_norm": 3.0072007179260254, + "learning_rate": 5.547180900787316e-06, + "loss": 0.1121, + "step": 18591 + }, + { + "epoch": 0.47048105878482677, + "grad_norm": 5.72627592086792, + "learning_rate": 5.546781788098883e-06, + "loss": 0.2417, + "step": 18592 + }, + { + "epoch": 0.4705063643495205, + "grad_norm": 7.813490867614746, + "learning_rate": 5.5463826718843535e-06, + "loss": 0.1281, + "step": 18593 + }, + { + "epoch": 0.47053166991421413, + "grad_norm": 5.6731462478637695, + "learning_rate": 5.545983552146306e-06, + "loss": 0.222, + "step": 18594 + }, + { + "epoch": 0.4705569754789078, + "grad_norm": 5.326502799987793, + "learning_rate": 5.545584428887311e-06, + "loss": 0.1521, + "step": 18595 + }, + { + "epoch": 0.4705822810436015, + "grad_norm": 5.39505672454834, + "learning_rate": 5.545185302109943e-06, + "loss": 0.2109, + "step": 18596 + }, + { + "epoch": 0.47060758660829516, + "grad_norm": 4.809668064117432, + "learning_rate": 5.5447861718167775e-06, + "loss": 0.1262, + "step": 18597 + }, + { + "epoch": 0.4706328921729888, + "grad_norm": 12.903005599975586, + "learning_rate": 5.544387038010386e-06, + "loss": 0.1646, + "step": 18598 + }, + { + "epoch": 0.4706581977376825, + "grad_norm": 8.651582717895508, + "learning_rate": 5.543987900693345e-06, + "loss": 0.2285, + "step": 18599 + }, + { + "epoch": 0.4706835033023762, + "grad_norm": 4.2241597175598145, + "learning_rate": 5.543588759868227e-06, + "loss": 0.1593, + "step": 18600 + }, + { + "epoch": 0.4707088088670699, + "grad_norm": 4.091240882873535, + "learning_rate": 5.5431896155376066e-06, + "loss": 0.1924, + "step": 18601 + }, + { + "epoch": 0.47073411443176355, + "grad_norm": 5.115074634552002, + "learning_rate": 5.542790467704057e-06, + "loss": 0.2023, + "step": 18602 + }, + { + "epoch": 0.4707594199964572, + "grad_norm": 3.0829687118530273, + "learning_rate": 5.5423913163701525e-06, + "loss": 0.1283, + "step": 18603 + }, + { + "epoch": 0.4707847255611509, + "grad_norm": 4.576842308044434, + "learning_rate": 5.541992161538467e-06, + "loss": 0.1449, + "step": 18604 + }, + { + "epoch": 0.4708100311258446, + "grad_norm": 6.445399284362793, + "learning_rate": 5.541593003211575e-06, + "loss": 0.2302, + "step": 18605 + }, + { + "epoch": 0.47083533669053823, + "grad_norm": 12.76693344116211, + "learning_rate": 5.541193841392052e-06, + "loss": 0.2108, + "step": 18606 + }, + { + "epoch": 0.47086064225523194, + "grad_norm": 5.1717529296875, + "learning_rate": 5.540794676082468e-06, + "loss": 0.157, + "step": 18607 + }, + { + "epoch": 0.4708859478199256, + "grad_norm": 6.226653099060059, + "learning_rate": 5.540395507285402e-06, + "loss": 0.1088, + "step": 18608 + }, + { + "epoch": 0.47091125338461925, + "grad_norm": 6.4751200675964355, + "learning_rate": 5.539996335003425e-06, + "loss": 0.2094, + "step": 18609 + }, + { + "epoch": 0.47093655894931297, + "grad_norm": 12.208467483520508, + "learning_rate": 5.539597159239112e-06, + "loss": 0.2148, + "step": 18610 + }, + { + "epoch": 0.4709618645140066, + "grad_norm": 4.899963855743408, + "learning_rate": 5.5391979799950365e-06, + "loss": 0.1493, + "step": 18611 + }, + { + "epoch": 0.4709871700787003, + "grad_norm": 5.151488304138184, + "learning_rate": 5.538798797273775e-06, + "loss": 0.1844, + "step": 18612 + }, + { + "epoch": 0.471012475643394, + "grad_norm": 6.595297336578369, + "learning_rate": 5.5383996110779e-06, + "loss": 0.201, + "step": 18613 + }, + { + "epoch": 0.47103778120808765, + "grad_norm": 15.080434799194336, + "learning_rate": 5.538000421409985e-06, + "loss": 0.2052, + "step": 18614 + }, + { + "epoch": 0.47106308677278136, + "grad_norm": 6.654687881469727, + "learning_rate": 5.537601228272606e-06, + "loss": 0.1791, + "step": 18615 + }, + { + "epoch": 0.471088392337475, + "grad_norm": 4.372917175292969, + "learning_rate": 5.537202031668337e-06, + "loss": 0.1492, + "step": 18616 + }, + { + "epoch": 0.47111369790216867, + "grad_norm": 6.242335796356201, + "learning_rate": 5.536802831599752e-06, + "loss": 0.2655, + "step": 18617 + }, + { + "epoch": 0.4711390034668624, + "grad_norm": 4.053506374359131, + "learning_rate": 5.536403628069425e-06, + "loss": 0.154, + "step": 18618 + }, + { + "epoch": 0.47116430903155604, + "grad_norm": 4.591828346252441, + "learning_rate": 5.53600442107993e-06, + "loss": 0.2352, + "step": 18619 + }, + { + "epoch": 0.4711896145962497, + "grad_norm": 4.220290660858154, + "learning_rate": 5.535605210633842e-06, + "loss": 0.1896, + "step": 18620 + }, + { + "epoch": 0.4712149201609434, + "grad_norm": 4.7441534996032715, + "learning_rate": 5.535205996733738e-06, + "loss": 0.2362, + "step": 18621 + }, + { + "epoch": 0.47124022572563706, + "grad_norm": 3.059889554977417, + "learning_rate": 5.534806779382188e-06, + "loss": 0.1398, + "step": 18622 + }, + { + "epoch": 0.4712655312903307, + "grad_norm": 5.517565727233887, + "learning_rate": 5.534407558581771e-06, + "loss": 0.1351, + "step": 18623 + }, + { + "epoch": 0.47129083685502443, + "grad_norm": 4.5309858322143555, + "learning_rate": 5.5340083343350556e-06, + "loss": 0.204, + "step": 18624 + }, + { + "epoch": 0.4713161424197181, + "grad_norm": 4.369451522827148, + "learning_rate": 5.533609106644622e-06, + "loss": 0.1105, + "step": 18625 + }, + { + "epoch": 0.4713414479844118, + "grad_norm": 4.2166948318481445, + "learning_rate": 5.533209875513041e-06, + "loss": 0.0893, + "step": 18626 + }, + { + "epoch": 0.47136675354910545, + "grad_norm": 5.345422744750977, + "learning_rate": 5.53281064094289e-06, + "loss": 0.2121, + "step": 18627 + }, + { + "epoch": 0.4713920591137991, + "grad_norm": 8.36584758758545, + "learning_rate": 5.5324114029367424e-06, + "loss": 0.1666, + "step": 18628 + }, + { + "epoch": 0.4714173646784928, + "grad_norm": 13.442209243774414, + "learning_rate": 5.532012161497174e-06, + "loss": 0.2763, + "step": 18629 + }, + { + "epoch": 0.4714426702431865, + "grad_norm": 8.313520431518555, + "learning_rate": 5.531612916626755e-06, + "loss": 0.2088, + "step": 18630 + }, + { + "epoch": 0.47146797580788014, + "grad_norm": 3.2747933864593506, + "learning_rate": 5.531213668328065e-06, + "loss": 0.1235, + "step": 18631 + }, + { + "epoch": 0.47149328137257385, + "grad_norm": 4.277101039886475, + "learning_rate": 5.5308144166036756e-06, + "loss": 0.2004, + "step": 18632 + }, + { + "epoch": 0.4715185869372675, + "grad_norm": 2.4012372493743896, + "learning_rate": 5.530415161456164e-06, + "loss": 0.1323, + "step": 18633 + }, + { + "epoch": 0.47154389250196116, + "grad_norm": 3.6272366046905518, + "learning_rate": 5.530015902888104e-06, + "loss": 0.1129, + "step": 18634 + }, + { + "epoch": 0.47156919806665487, + "grad_norm": 5.004520416259766, + "learning_rate": 5.52961664090207e-06, + "loss": 0.1799, + "step": 18635 + }, + { + "epoch": 0.4715945036313485, + "grad_norm": 9.004631042480469, + "learning_rate": 5.529217375500636e-06, + "loss": 0.1213, + "step": 18636 + }, + { + "epoch": 0.4716198091960422, + "grad_norm": 6.663000583648682, + "learning_rate": 5.528818106686378e-06, + "loss": 0.131, + "step": 18637 + }, + { + "epoch": 0.4716451147607359, + "grad_norm": 4.572682857513428, + "learning_rate": 5.528418834461869e-06, + "loss": 0.1185, + "step": 18638 + }, + { + "epoch": 0.47167042032542955, + "grad_norm": 5.766245365142822, + "learning_rate": 5.528019558829688e-06, + "loss": 0.1722, + "step": 18639 + }, + { + "epoch": 0.47169572589012326, + "grad_norm": 3.9201571941375732, + "learning_rate": 5.527620279792405e-06, + "loss": 0.1912, + "step": 18640 + }, + { + "epoch": 0.4717210314548169, + "grad_norm": 15.401473045349121, + "learning_rate": 5.527220997352598e-06, + "loss": 0.1207, + "step": 18641 + }, + { + "epoch": 0.4717463370195106, + "grad_norm": 3.093790292739868, + "learning_rate": 5.526821711512839e-06, + "loss": 0.1823, + "step": 18642 + }, + { + "epoch": 0.4717716425842043, + "grad_norm": 3.795447587966919, + "learning_rate": 5.526422422275708e-06, + "loss": 0.143, + "step": 18643 + }, + { + "epoch": 0.47179694814889794, + "grad_norm": 4.662725448608398, + "learning_rate": 5.5260231296437735e-06, + "loss": 0.2087, + "step": 18644 + }, + { + "epoch": 0.4718222537135916, + "grad_norm": 4.985104084014893, + "learning_rate": 5.525623833619615e-06, + "loss": 0.1574, + "step": 18645 + }, + { + "epoch": 0.4718475592782853, + "grad_norm": 2.894484043121338, + "learning_rate": 5.5252245342058065e-06, + "loss": 0.1309, + "step": 18646 + }, + { + "epoch": 0.47187286484297897, + "grad_norm": 13.297242164611816, + "learning_rate": 5.524825231404923e-06, + "loss": 0.1807, + "step": 18647 + }, + { + "epoch": 0.4718981704076726, + "grad_norm": 20.883481979370117, + "learning_rate": 5.524425925219539e-06, + "loss": 0.1913, + "step": 18648 + }, + { + "epoch": 0.47192347597236634, + "grad_norm": 13.284300804138184, + "learning_rate": 5.524026615652229e-06, + "loss": 0.137, + "step": 18649 + }, + { + "epoch": 0.47194878153706, + "grad_norm": 9.318253517150879, + "learning_rate": 5.523627302705568e-06, + "loss": 0.2936, + "step": 18650 + }, + { + "epoch": 0.4719740871017537, + "grad_norm": 10.490493774414062, + "learning_rate": 5.523227986382133e-06, + "loss": 0.1649, + "step": 18651 + }, + { + "epoch": 0.47199939266644736, + "grad_norm": 8.697186470031738, + "learning_rate": 5.522828666684499e-06, + "loss": 0.2524, + "step": 18652 + }, + { + "epoch": 0.472024698231141, + "grad_norm": 4.684582710266113, + "learning_rate": 5.522429343615239e-06, + "loss": 0.1561, + "step": 18653 + }, + { + "epoch": 0.4720500037958347, + "grad_norm": 3.5309998989105225, + "learning_rate": 5.5220300171769305e-06, + "loss": 0.1331, + "step": 18654 + }, + { + "epoch": 0.4720753093605284, + "grad_norm": 4.567336559295654, + "learning_rate": 5.521630687372146e-06, + "loss": 0.1507, + "step": 18655 + }, + { + "epoch": 0.47210061492522204, + "grad_norm": 8.367026329040527, + "learning_rate": 5.521231354203464e-06, + "loss": 0.1415, + "step": 18656 + }, + { + "epoch": 0.47212592048991575, + "grad_norm": 6.016554355621338, + "learning_rate": 5.5208320176734566e-06, + "loss": 0.1605, + "step": 18657 + }, + { + "epoch": 0.4721512260546094, + "grad_norm": 6.1975483894348145, + "learning_rate": 5.5204326777847e-06, + "loss": 0.1904, + "step": 18658 + }, + { + "epoch": 0.47217653161930306, + "grad_norm": 7.935934066772461, + "learning_rate": 5.5200333345397706e-06, + "loss": 0.2221, + "step": 18659 + }, + { + "epoch": 0.4722018371839968, + "grad_norm": 4.499493598937988, + "learning_rate": 5.519633987941244e-06, + "loss": 0.1946, + "step": 18660 + }, + { + "epoch": 0.47222714274869043, + "grad_norm": 6.193641662597656, + "learning_rate": 5.519234637991692e-06, + "loss": 0.2133, + "step": 18661 + }, + { + "epoch": 0.4722524483133841, + "grad_norm": 2.7825021743774414, + "learning_rate": 5.518835284693693e-06, + "loss": 0.095, + "step": 18662 + }, + { + "epoch": 0.4722777538780778, + "grad_norm": 4.459418773651123, + "learning_rate": 5.518435928049824e-06, + "loss": 0.1377, + "step": 18663 + }, + { + "epoch": 0.47230305944277146, + "grad_norm": 8.636606216430664, + "learning_rate": 5.518036568062657e-06, + "loss": 0.293, + "step": 18664 + }, + { + "epoch": 0.47232836500746517, + "grad_norm": 4.31587553024292, + "learning_rate": 5.517637204734768e-06, + "loss": 0.1334, + "step": 18665 + }, + { + "epoch": 0.4723536705721588, + "grad_norm": 7.3572869300842285, + "learning_rate": 5.517237838068734e-06, + "loss": 0.1624, + "step": 18666 + }, + { + "epoch": 0.4723789761368525, + "grad_norm": 7.183802604675293, + "learning_rate": 5.516838468067129e-06, + "loss": 0.2252, + "step": 18667 + }, + { + "epoch": 0.4724042817015462, + "grad_norm": 6.438315391540527, + "learning_rate": 5.516439094732528e-06, + "loss": 0.2387, + "step": 18668 + }, + { + "epoch": 0.47242958726623985, + "grad_norm": 3.6388890743255615, + "learning_rate": 5.516039718067508e-06, + "loss": 0.1384, + "step": 18669 + }, + { + "epoch": 0.4724548928309335, + "grad_norm": 12.951021194458008, + "learning_rate": 5.515640338074644e-06, + "loss": 0.1789, + "step": 18670 + }, + { + "epoch": 0.4724801983956272, + "grad_norm": 4.329195499420166, + "learning_rate": 5.515240954756511e-06, + "loss": 0.2077, + "step": 18671 + }, + { + "epoch": 0.47250550396032087, + "grad_norm": 4.319631576538086, + "learning_rate": 5.5148415681156845e-06, + "loss": 0.0976, + "step": 18672 + }, + { + "epoch": 0.47253080952501453, + "grad_norm": 5.011600017547607, + "learning_rate": 5.51444217815474e-06, + "loss": 0.197, + "step": 18673 + }, + { + "epoch": 0.47255611508970824, + "grad_norm": 3.2284820079803467, + "learning_rate": 5.514042784876256e-06, + "loss": 0.0829, + "step": 18674 + }, + { + "epoch": 0.4725814206544019, + "grad_norm": 5.3032941818237305, + "learning_rate": 5.513643388282803e-06, + "loss": 0.1903, + "step": 18675 + }, + { + "epoch": 0.47260672621909555, + "grad_norm": 9.0501708984375, + "learning_rate": 5.513243988376961e-06, + "loss": 0.1336, + "step": 18676 + }, + { + "epoch": 0.47263203178378926, + "grad_norm": 17.002103805541992, + "learning_rate": 5.512844585161303e-06, + "loss": 0.2291, + "step": 18677 + }, + { + "epoch": 0.4726573373484829, + "grad_norm": 4.854750633239746, + "learning_rate": 5.512445178638407e-06, + "loss": 0.1872, + "step": 18678 + }, + { + "epoch": 0.47268264291317663, + "grad_norm": 4.711109638214111, + "learning_rate": 5.512045768810845e-06, + "loss": 0.1786, + "step": 18679 + }, + { + "epoch": 0.4727079484778703, + "grad_norm": 5.032989978790283, + "learning_rate": 5.511646355681199e-06, + "loss": 0.195, + "step": 18680 + }, + { + "epoch": 0.47273325404256394, + "grad_norm": 5.285067558288574, + "learning_rate": 5.511246939252037e-06, + "loss": 0.236, + "step": 18681 + }, + { + "epoch": 0.47275855960725766, + "grad_norm": 9.212058067321777, + "learning_rate": 5.5108475195259405e-06, + "loss": 0.2002, + "step": 18682 + }, + { + "epoch": 0.4727838651719513, + "grad_norm": 4.584778785705566, + "learning_rate": 5.510448096505483e-06, + "loss": 0.196, + "step": 18683 + }, + { + "epoch": 0.47280917073664497, + "grad_norm": 4.999844074249268, + "learning_rate": 5.51004867019324e-06, + "loss": 0.1163, + "step": 18684 + }, + { + "epoch": 0.4728344763013387, + "grad_norm": 5.560978889465332, + "learning_rate": 5.509649240591789e-06, + "loss": 0.1355, + "step": 18685 + }, + { + "epoch": 0.47285978186603234, + "grad_norm": 9.731921195983887, + "learning_rate": 5.509249807703704e-06, + "loss": 0.2209, + "step": 18686 + }, + { + "epoch": 0.472885087430726, + "grad_norm": 20.778757095336914, + "learning_rate": 5.508850371531562e-06, + "loss": 0.2272, + "step": 18687 + }, + { + "epoch": 0.4729103929954197, + "grad_norm": 12.320895195007324, + "learning_rate": 5.508450932077939e-06, + "loss": 0.1955, + "step": 18688 + }, + { + "epoch": 0.47293569856011336, + "grad_norm": 4.198253631591797, + "learning_rate": 5.508051489345411e-06, + "loss": 0.1198, + "step": 18689 + }, + { + "epoch": 0.47296100412480707, + "grad_norm": 3.0565834045410156, + "learning_rate": 5.507652043336552e-06, + "loss": 0.1321, + "step": 18690 + }, + { + "epoch": 0.47298630968950073, + "grad_norm": 6.221500873565674, + "learning_rate": 5.507252594053941e-06, + "loss": 0.1318, + "step": 18691 + }, + { + "epoch": 0.4730116152541944, + "grad_norm": 3.009183406829834, + "learning_rate": 5.50685314150015e-06, + "loss": 0.0822, + "step": 18692 + }, + { + "epoch": 0.4730369208188881, + "grad_norm": 2.517764091491699, + "learning_rate": 5.50645368567776e-06, + "loss": 0.0925, + "step": 18693 + }, + { + "epoch": 0.47306222638358175, + "grad_norm": 4.642289161682129, + "learning_rate": 5.506054226589344e-06, + "loss": 0.1388, + "step": 18694 + }, + { + "epoch": 0.4730875319482754, + "grad_norm": 8.413841247558594, + "learning_rate": 5.505654764237477e-06, + "loss": 0.1557, + "step": 18695 + }, + { + "epoch": 0.4731128375129691, + "grad_norm": 6.872251510620117, + "learning_rate": 5.505255298624738e-06, + "loss": 0.1844, + "step": 18696 + }, + { + "epoch": 0.4731381430776628, + "grad_norm": 9.781872749328613, + "learning_rate": 5.5048558297537e-06, + "loss": 0.3608, + "step": 18697 + }, + { + "epoch": 0.47316344864235643, + "grad_norm": 3.108341932296753, + "learning_rate": 5.504456357626943e-06, + "loss": 0.1227, + "step": 18698 + }, + { + "epoch": 0.47318875420705014, + "grad_norm": 10.503026008605957, + "learning_rate": 5.5040568822470385e-06, + "loss": 0.1943, + "step": 18699 + }, + { + "epoch": 0.4732140597717438, + "grad_norm": 4.894301891326904, + "learning_rate": 5.5036574036165655e-06, + "loss": 0.1828, + "step": 18700 + }, + { + "epoch": 0.47323936533643746, + "grad_norm": 8.995322227478027, + "learning_rate": 5.5032579217381e-06, + "loss": 0.2667, + "step": 18701 + }, + { + "epoch": 0.47326467090113117, + "grad_norm": 9.30223560333252, + "learning_rate": 5.502858436614218e-06, + "loss": 0.1432, + "step": 18702 + }, + { + "epoch": 0.4732899764658248, + "grad_norm": 6.899332046508789, + "learning_rate": 5.5024589482474946e-06, + "loss": 0.1396, + "step": 18703 + }, + { + "epoch": 0.47331528203051854, + "grad_norm": 8.73587703704834, + "learning_rate": 5.5020594566405065e-06, + "loss": 0.2041, + "step": 18704 + }, + { + "epoch": 0.4733405875952122, + "grad_norm": 10.905622482299805, + "learning_rate": 5.501659961795831e-06, + "loss": 0.2543, + "step": 18705 + }, + { + "epoch": 0.47336589315990585, + "grad_norm": 2.807756185531616, + "learning_rate": 5.501260463716044e-06, + "loss": 0.1726, + "step": 18706 + }, + { + "epoch": 0.47339119872459956, + "grad_norm": 4.061851501464844, + "learning_rate": 5.500860962403723e-06, + "loss": 0.1252, + "step": 18707 + }, + { + "epoch": 0.4734165042892932, + "grad_norm": 8.407186508178711, + "learning_rate": 5.5004614578614405e-06, + "loss": 0.1981, + "step": 18708 + }, + { + "epoch": 0.4734418098539869, + "grad_norm": 6.335296154022217, + "learning_rate": 5.5000619500917755e-06, + "loss": 0.1951, + "step": 18709 + }, + { + "epoch": 0.4734671154186806, + "grad_norm": 4.493675708770752, + "learning_rate": 5.4996624390973045e-06, + "loss": 0.198, + "step": 18710 + }, + { + "epoch": 0.47349242098337424, + "grad_norm": 3.116034984588623, + "learning_rate": 5.499262924880603e-06, + "loss": 0.1578, + "step": 18711 + }, + { + "epoch": 0.4735177265480679, + "grad_norm": 4.5751166343688965, + "learning_rate": 5.498863407444248e-06, + "loss": 0.1663, + "step": 18712 + }, + { + "epoch": 0.4735430321127616, + "grad_norm": 8.283087730407715, + "learning_rate": 5.498463886790816e-06, + "loss": 0.1056, + "step": 18713 + }, + { + "epoch": 0.47356833767745526, + "grad_norm": 5.685159206390381, + "learning_rate": 5.498064362922882e-06, + "loss": 0.1854, + "step": 18714 + }, + { + "epoch": 0.473593643242149, + "grad_norm": 5.7245330810546875, + "learning_rate": 5.497664835843024e-06, + "loss": 0.2383, + "step": 18715 + }, + { + "epoch": 0.47361894880684263, + "grad_norm": 6.792112827301025, + "learning_rate": 5.4972653055538195e-06, + "loss": 0.2355, + "step": 18716 + }, + { + "epoch": 0.4736442543715363, + "grad_norm": 3.7976596355438232, + "learning_rate": 5.496865772057842e-06, + "loss": 0.1854, + "step": 18717 + }, + { + "epoch": 0.47366955993623, + "grad_norm": 7.031550407409668, + "learning_rate": 5.49646623535767e-06, + "loss": 0.151, + "step": 18718 + }, + { + "epoch": 0.47369486550092366, + "grad_norm": 3.4170637130737305, + "learning_rate": 5.49606669545588e-06, + "loss": 0.1217, + "step": 18719 + }, + { + "epoch": 0.4737201710656173, + "grad_norm": 4.017053604125977, + "learning_rate": 5.4956671523550474e-06, + "loss": 0.1272, + "step": 18720 + }, + { + "epoch": 0.473745476630311, + "grad_norm": 5.563299655914307, + "learning_rate": 5.49526760605775e-06, + "loss": 0.2196, + "step": 18721 + }, + { + "epoch": 0.4737707821950047, + "grad_norm": 5.623768329620361, + "learning_rate": 5.494868056566565e-06, + "loss": 0.2249, + "step": 18722 + }, + { + "epoch": 0.47379608775969834, + "grad_norm": 3.2063214778900146, + "learning_rate": 5.494468503884065e-06, + "loss": 0.1325, + "step": 18723 + }, + { + "epoch": 0.47382139332439205, + "grad_norm": 6.495119571685791, + "learning_rate": 5.494068948012833e-06, + "loss": 0.1569, + "step": 18724 + }, + { + "epoch": 0.4738466988890857, + "grad_norm": 15.437870025634766, + "learning_rate": 5.49366938895544e-06, + "loss": 0.155, + "step": 18725 + }, + { + "epoch": 0.47387200445377936, + "grad_norm": 4.197417259216309, + "learning_rate": 5.4932698267144655e-06, + "loss": 0.1993, + "step": 18726 + }, + { + "epoch": 0.4738973100184731, + "grad_norm": 3.0999536514282227, + "learning_rate": 5.492870261292486e-06, + "loss": 0.1244, + "step": 18727 + }, + { + "epoch": 0.47392261558316673, + "grad_norm": 8.319680213928223, + "learning_rate": 5.492470692692078e-06, + "loss": 0.326, + "step": 18728 + }, + { + "epoch": 0.47394792114786044, + "grad_norm": 4.212965488433838, + "learning_rate": 5.4920711209158185e-06, + "loss": 0.1758, + "step": 18729 + }, + { + "epoch": 0.4739732267125541, + "grad_norm": 7.014509677886963, + "learning_rate": 5.4916715459662815e-06, + "loss": 0.2657, + "step": 18730 + }, + { + "epoch": 0.47399853227724775, + "grad_norm": 11.618760108947754, + "learning_rate": 5.4912719678460494e-06, + "loss": 0.2738, + "step": 18731 + }, + { + "epoch": 0.47402383784194146, + "grad_norm": 4.901417255401611, + "learning_rate": 5.490872386557693e-06, + "loss": 0.1963, + "step": 18732 + }, + { + "epoch": 0.4740491434066351, + "grad_norm": 5.023443698883057, + "learning_rate": 5.490472802103793e-06, + "loss": 0.1791, + "step": 18733 + }, + { + "epoch": 0.4740744489713288, + "grad_norm": 3.7219648361206055, + "learning_rate": 5.490073214486925e-06, + "loss": 0.1468, + "step": 18734 + }, + { + "epoch": 0.4740997545360225, + "grad_norm": 5.464147567749023, + "learning_rate": 5.489673623709667e-06, + "loss": 0.1454, + "step": 18735 + }, + { + "epoch": 0.47412506010071614, + "grad_norm": 2.91292667388916, + "learning_rate": 5.489274029774594e-06, + "loss": 0.1329, + "step": 18736 + }, + { + "epoch": 0.4741503656654098, + "grad_norm": 10.589162826538086, + "learning_rate": 5.488874432684283e-06, + "loss": 0.1911, + "step": 18737 + }, + { + "epoch": 0.4741756712301035, + "grad_norm": 4.7761921882629395, + "learning_rate": 5.488474832441313e-06, + "loss": 0.0837, + "step": 18738 + }, + { + "epoch": 0.47420097679479717, + "grad_norm": 4.0225372314453125, + "learning_rate": 5.488075229048259e-06, + "loss": 0.1659, + "step": 18739 + }, + { + "epoch": 0.4742262823594908, + "grad_norm": 4.726649761199951, + "learning_rate": 5.487675622507699e-06, + "loss": 0.1764, + "step": 18740 + }, + { + "epoch": 0.47425158792418454, + "grad_norm": 7.474396228790283, + "learning_rate": 5.487276012822209e-06, + "loss": 0.2699, + "step": 18741 + }, + { + "epoch": 0.4742768934888782, + "grad_norm": 12.259014129638672, + "learning_rate": 5.486876399994367e-06, + "loss": 0.2655, + "step": 18742 + }, + { + "epoch": 0.4743021990535719, + "grad_norm": 2.3859102725982666, + "learning_rate": 5.48647678402675e-06, + "loss": 0.112, + "step": 18743 + }, + { + "epoch": 0.47432750461826556, + "grad_norm": 13.969513893127441, + "learning_rate": 5.486077164921935e-06, + "loss": 0.2572, + "step": 18744 + }, + { + "epoch": 0.4743528101829592, + "grad_norm": 6.080133438110352, + "learning_rate": 5.485677542682498e-06, + "loss": 0.1178, + "step": 18745 + }, + { + "epoch": 0.47437811574765293, + "grad_norm": 3.6520609855651855, + "learning_rate": 5.485277917311017e-06, + "loss": 0.1324, + "step": 18746 + }, + { + "epoch": 0.4744034213123466, + "grad_norm": 7.341870307922363, + "learning_rate": 5.484878288810069e-06, + "loss": 0.2737, + "step": 18747 + }, + { + "epoch": 0.47442872687704024, + "grad_norm": 3.8436083793640137, + "learning_rate": 5.48447865718223e-06, + "loss": 0.1541, + "step": 18748 + }, + { + "epoch": 0.47445403244173395, + "grad_norm": 3.488659620285034, + "learning_rate": 5.484079022430081e-06, + "loss": 0.1447, + "step": 18749 + }, + { + "epoch": 0.4744793380064276, + "grad_norm": 4.479864597320557, + "learning_rate": 5.483679384556194e-06, + "loss": 0.1351, + "step": 18750 + }, + { + "epoch": 0.47450464357112127, + "grad_norm": 4.72374963760376, + "learning_rate": 5.48327974356315e-06, + "loss": 0.1134, + "step": 18751 + }, + { + "epoch": 0.474529949135815, + "grad_norm": 4.316508769989014, + "learning_rate": 5.4828800994535255e-06, + "loss": 0.1621, + "step": 18752 + }, + { + "epoch": 0.47455525470050863, + "grad_norm": 9.648299217224121, + "learning_rate": 5.482480452229894e-06, + "loss": 0.3001, + "step": 18753 + }, + { + "epoch": 0.47458056026520234, + "grad_norm": 11.795605659484863, + "learning_rate": 5.48208080189484e-06, + "loss": 0.2228, + "step": 18754 + }, + { + "epoch": 0.474605865829896, + "grad_norm": 6.618343353271484, + "learning_rate": 5.4816811484509334e-06, + "loss": 0.2612, + "step": 18755 + }, + { + "epoch": 0.47463117139458966, + "grad_norm": 2.9979541301727295, + "learning_rate": 5.481281491900756e-06, + "loss": 0.1247, + "step": 18756 + }, + { + "epoch": 0.47465647695928337, + "grad_norm": 5.373459815979004, + "learning_rate": 5.480881832246883e-06, + "loss": 0.2248, + "step": 18757 + }, + { + "epoch": 0.474681782523977, + "grad_norm": 4.305915832519531, + "learning_rate": 5.480482169491893e-06, + "loss": 0.0721, + "step": 18758 + }, + { + "epoch": 0.4747070880886707, + "grad_norm": 2.9877140522003174, + "learning_rate": 5.480082503638363e-06, + "loss": 0.1347, + "step": 18759 + }, + { + "epoch": 0.4747323936533644, + "grad_norm": 7.171365737915039, + "learning_rate": 5.479682834688871e-06, + "loss": 0.2196, + "step": 18760 + }, + { + "epoch": 0.47475769921805805, + "grad_norm": 4.53373908996582, + "learning_rate": 5.479283162645992e-06, + "loss": 0.1315, + "step": 18761 + }, + { + "epoch": 0.4747830047827517, + "grad_norm": 3.0653703212738037, + "learning_rate": 5.478883487512305e-06, + "loss": 0.1438, + "step": 18762 + }, + { + "epoch": 0.4748083103474454, + "grad_norm": 12.481111526489258, + "learning_rate": 5.478483809290388e-06, + "loss": 0.1494, + "step": 18763 + }, + { + "epoch": 0.4748336159121391, + "grad_norm": 11.399232864379883, + "learning_rate": 5.478084127982819e-06, + "loss": 0.2992, + "step": 18764 + }, + { + "epoch": 0.47485892147683273, + "grad_norm": 8.766605377197266, + "learning_rate": 5.477684443592173e-06, + "loss": 0.1054, + "step": 18765 + }, + { + "epoch": 0.47488422704152644, + "grad_norm": 4.531998157501221, + "learning_rate": 5.477284756121029e-06, + "loss": 0.2579, + "step": 18766 + }, + { + "epoch": 0.4749095326062201, + "grad_norm": 7.126244068145752, + "learning_rate": 5.476885065571965e-06, + "loss": 0.2459, + "step": 18767 + }, + { + "epoch": 0.4749348381709138, + "grad_norm": 7.331558704376221, + "learning_rate": 5.4764853719475575e-06, + "loss": 0.21, + "step": 18768 + }, + { + "epoch": 0.47496014373560747, + "grad_norm": 6.717774868011475, + "learning_rate": 5.476085675250384e-06, + "loss": 0.1355, + "step": 18769 + }, + { + "epoch": 0.4749854493003011, + "grad_norm": 6.081692695617676, + "learning_rate": 5.475685975483023e-06, + "loss": 0.2014, + "step": 18770 + }, + { + "epoch": 0.47501075486499483, + "grad_norm": 9.597807884216309, + "learning_rate": 5.475286272648052e-06, + "loss": 0.2163, + "step": 18771 + }, + { + "epoch": 0.4750360604296885, + "grad_norm": 8.932127952575684, + "learning_rate": 5.4748865667480475e-06, + "loss": 0.1805, + "step": 18772 + }, + { + "epoch": 0.47506136599438215, + "grad_norm": 5.427384853363037, + "learning_rate": 5.474486857785589e-06, + "loss": 0.1328, + "step": 18773 + }, + { + "epoch": 0.47508667155907586, + "grad_norm": 5.401612758636475, + "learning_rate": 5.474087145763253e-06, + "loss": 0.1736, + "step": 18774 + }, + { + "epoch": 0.4751119771237695, + "grad_norm": 4.064564228057861, + "learning_rate": 5.473687430683617e-06, + "loss": 0.1415, + "step": 18775 + }, + { + "epoch": 0.47513728268846317, + "grad_norm": 4.544040203094482, + "learning_rate": 5.473287712549258e-06, + "loss": 0.0945, + "step": 18776 + }, + { + "epoch": 0.4751625882531569, + "grad_norm": 8.7015962600708, + "learning_rate": 5.472887991362755e-06, + "loss": 0.1686, + "step": 18777 + }, + { + "epoch": 0.47518789381785054, + "grad_norm": 4.027169704437256, + "learning_rate": 5.472488267126684e-06, + "loss": 0.189, + "step": 18778 + }, + { + "epoch": 0.47521319938254425, + "grad_norm": 5.433554172515869, + "learning_rate": 5.472088539843625e-06, + "loss": 0.2068, + "step": 18779 + }, + { + "epoch": 0.4752385049472379, + "grad_norm": 5.430238246917725, + "learning_rate": 5.4716888095161565e-06, + "loss": 0.1835, + "step": 18780 + }, + { + "epoch": 0.47526381051193156, + "grad_norm": 4.083221912384033, + "learning_rate": 5.471289076146854e-06, + "loss": 0.1124, + "step": 18781 + }, + { + "epoch": 0.4752891160766253, + "grad_norm": 7.407946586608887, + "learning_rate": 5.470889339738294e-06, + "loss": 0.2218, + "step": 18782 + }, + { + "epoch": 0.47531442164131893, + "grad_norm": 3.7217695713043213, + "learning_rate": 5.470489600293059e-06, + "loss": 0.1289, + "step": 18783 + }, + { + "epoch": 0.4753397272060126, + "grad_norm": 5.108719348907471, + "learning_rate": 5.470089857813722e-06, + "loss": 0.1721, + "step": 18784 + }, + { + "epoch": 0.4753650327707063, + "grad_norm": 5.137397766113281, + "learning_rate": 5.4696901123028634e-06, + "loss": 0.2064, + "step": 18785 + }, + { + "epoch": 0.47539033833539995, + "grad_norm": 4.701879978179932, + "learning_rate": 5.469290363763061e-06, + "loss": 0.2185, + "step": 18786 + }, + { + "epoch": 0.4754156439000936, + "grad_norm": 3.385434627532959, + "learning_rate": 5.4688906121968925e-06, + "loss": 0.17, + "step": 18787 + }, + { + "epoch": 0.4754409494647873, + "grad_norm": 2.5355560779571533, + "learning_rate": 5.468490857606935e-06, + "loss": 0.1071, + "step": 18788 + }, + { + "epoch": 0.475466255029481, + "grad_norm": 7.064792633056641, + "learning_rate": 5.468091099995767e-06, + "loss": 0.2716, + "step": 18789 + }, + { + "epoch": 0.47549156059417463, + "grad_norm": 4.514261245727539, + "learning_rate": 5.467691339365967e-06, + "loss": 0.1884, + "step": 18790 + }, + { + "epoch": 0.47551686615886835, + "grad_norm": 4.884471893310547, + "learning_rate": 5.4672915757201125e-06, + "loss": 0.1716, + "step": 18791 + }, + { + "epoch": 0.475542171723562, + "grad_norm": 6.439272880554199, + "learning_rate": 5.466891809060782e-06, + "loss": 0.118, + "step": 18792 + }, + { + "epoch": 0.4755674772882557, + "grad_norm": 6.077876567840576, + "learning_rate": 5.466492039390552e-06, + "loss": 0.1819, + "step": 18793 + }, + { + "epoch": 0.47559278285294937, + "grad_norm": 9.741032600402832, + "learning_rate": 5.466092266712002e-06, + "loss": 0.2403, + "step": 18794 + }, + { + "epoch": 0.475618088417643, + "grad_norm": 4.834181785583496, + "learning_rate": 5.465692491027711e-06, + "loss": 0.1078, + "step": 18795 + }, + { + "epoch": 0.47564339398233674, + "grad_norm": 21.44202995300293, + "learning_rate": 5.465292712340253e-06, + "loss": 0.2548, + "step": 18796 + }, + { + "epoch": 0.4756686995470304, + "grad_norm": 2.947451114654541, + "learning_rate": 5.4648929306522115e-06, + "loss": 0.1315, + "step": 18797 + }, + { + "epoch": 0.47569400511172405, + "grad_norm": 5.478090286254883, + "learning_rate": 5.464493145966161e-06, + "loss": 0.2349, + "step": 18798 + }, + { + "epoch": 0.47571931067641776, + "grad_norm": 4.574894905090332, + "learning_rate": 5.464093358284681e-06, + "loss": 0.1476, + "step": 18799 + }, + { + "epoch": 0.4757446162411114, + "grad_norm": 16.290380477905273, + "learning_rate": 5.463693567610347e-06, + "loss": 0.3838, + "step": 18800 + }, + { + "epoch": 0.4757699218058051, + "grad_norm": 4.540464401245117, + "learning_rate": 5.463293773945741e-06, + "loss": 0.1595, + "step": 18801 + }, + { + "epoch": 0.4757952273704988, + "grad_norm": 5.890608310699463, + "learning_rate": 5.4628939772934405e-06, + "loss": 0.1186, + "step": 18802 + }, + { + "epoch": 0.47582053293519244, + "grad_norm": 5.561759948730469, + "learning_rate": 5.462494177656023e-06, + "loss": 0.2028, + "step": 18803 + }, + { + "epoch": 0.4758458384998861, + "grad_norm": 4.797126293182373, + "learning_rate": 5.462094375036065e-06, + "loss": 0.1882, + "step": 18804 + }, + { + "epoch": 0.4758711440645798, + "grad_norm": 3.6546401977539062, + "learning_rate": 5.461694569436146e-06, + "loss": 0.2058, + "step": 18805 + }, + { + "epoch": 0.47589644962927347, + "grad_norm": 6.626344680786133, + "learning_rate": 5.4612947608588475e-06, + "loss": 0.1936, + "step": 18806 + }, + { + "epoch": 0.4759217551939672, + "grad_norm": 4.264914512634277, + "learning_rate": 5.4608949493067435e-06, + "loss": 0.1861, + "step": 18807 + }, + { + "epoch": 0.47594706075866083, + "grad_norm": 4.4030232429504395, + "learning_rate": 5.460495134782413e-06, + "loss": 0.1361, + "step": 18808 + }, + { + "epoch": 0.4759723663233545, + "grad_norm": 5.828490734100342, + "learning_rate": 5.460095317288435e-06, + "loss": 0.1527, + "step": 18809 + }, + { + "epoch": 0.4759976718880482, + "grad_norm": 2.4323484897613525, + "learning_rate": 5.459695496827389e-06, + "loss": 0.0983, + "step": 18810 + }, + { + "epoch": 0.47602297745274186, + "grad_norm": 3.4893007278442383, + "learning_rate": 5.459295673401851e-06, + "loss": 0.1343, + "step": 18811 + }, + { + "epoch": 0.4760482830174355, + "grad_norm": 5.248344421386719, + "learning_rate": 5.4588958470144015e-06, + "loss": 0.1888, + "step": 18812 + }, + { + "epoch": 0.4760735885821292, + "grad_norm": 4.320318222045898, + "learning_rate": 5.4584960176676184e-06, + "loss": 0.2136, + "step": 18813 + }, + { + "epoch": 0.4760988941468229, + "grad_norm": 6.308640480041504, + "learning_rate": 5.458096185364078e-06, + "loss": 0.1672, + "step": 18814 + }, + { + "epoch": 0.47612419971151654, + "grad_norm": 9.182683944702148, + "learning_rate": 5.457696350106362e-06, + "loss": 0.2784, + "step": 18815 + }, + { + "epoch": 0.47614950527621025, + "grad_norm": 3.10343337059021, + "learning_rate": 5.457296511897047e-06, + "loss": 0.0811, + "step": 18816 + }, + { + "epoch": 0.4761748108409039, + "grad_norm": 3.9973227977752686, + "learning_rate": 5.456896670738712e-06, + "loss": 0.1385, + "step": 18817 + }, + { + "epoch": 0.4762001164055976, + "grad_norm": 8.362346649169922, + "learning_rate": 5.456496826633935e-06, + "loss": 0.1525, + "step": 18818 + }, + { + "epoch": 0.4762254219702913, + "grad_norm": 7.023516654968262, + "learning_rate": 5.456096979585295e-06, + "loss": 0.2304, + "step": 18819 + }, + { + "epoch": 0.47625072753498493, + "grad_norm": 14.284088134765625, + "learning_rate": 5.4556971295953695e-06, + "loss": 0.1413, + "step": 18820 + }, + { + "epoch": 0.47627603309967864, + "grad_norm": 15.920510292053223, + "learning_rate": 5.4552972766667386e-06, + "loss": 0.1909, + "step": 18821 + }, + { + "epoch": 0.4763013386643723, + "grad_norm": 5.860274314880371, + "learning_rate": 5.454897420801981e-06, + "loss": 0.101, + "step": 18822 + }, + { + "epoch": 0.47632664422906595, + "grad_norm": 6.6195573806762695, + "learning_rate": 5.454497562003675e-06, + "loss": 0.2103, + "step": 18823 + }, + { + "epoch": 0.47635194979375967, + "grad_norm": 5.003382205963135, + "learning_rate": 5.454097700274398e-06, + "loss": 0.1421, + "step": 18824 + }, + { + "epoch": 0.4763772553584533, + "grad_norm": 5.0965447425842285, + "learning_rate": 5.453697835616728e-06, + "loss": 0.2453, + "step": 18825 + }, + { + "epoch": 0.476402560923147, + "grad_norm": 6.614572525024414, + "learning_rate": 5.453297968033248e-06, + "loss": 0.1713, + "step": 18826 + }, + { + "epoch": 0.4764278664878407, + "grad_norm": 7.464051246643066, + "learning_rate": 5.452898097526531e-06, + "loss": 0.1433, + "step": 18827 + }, + { + "epoch": 0.47645317205253435, + "grad_norm": 6.673650741577148, + "learning_rate": 5.45249822409916e-06, + "loss": 0.1801, + "step": 18828 + }, + { + "epoch": 0.476478477617228, + "grad_norm": 3.7945430278778076, + "learning_rate": 5.452098347753712e-06, + "loss": 0.1049, + "step": 18829 + }, + { + "epoch": 0.4765037831819217, + "grad_norm": 5.905089855194092, + "learning_rate": 5.4516984684927654e-06, + "loss": 0.2007, + "step": 18830 + }, + { + "epoch": 0.47652908874661537, + "grad_norm": 3.2264323234558105, + "learning_rate": 5.451298586318898e-06, + "loss": 0.1295, + "step": 18831 + }, + { + "epoch": 0.4765543943113091, + "grad_norm": 15.659674644470215, + "learning_rate": 5.450898701234692e-06, + "loss": 0.1463, + "step": 18832 + }, + { + "epoch": 0.47657969987600274, + "grad_norm": 8.356801986694336, + "learning_rate": 5.450498813242724e-06, + "loss": 0.2362, + "step": 18833 + }, + { + "epoch": 0.4766050054406964, + "grad_norm": 8.776655197143555, + "learning_rate": 5.4500989223455715e-06, + "loss": 0.2126, + "step": 18834 + }, + { + "epoch": 0.4766303110053901, + "grad_norm": 2.599283218383789, + "learning_rate": 5.449699028545816e-06, + "loss": 0.0684, + "step": 18835 + }, + { + "epoch": 0.47665561657008376, + "grad_norm": 4.929304599761963, + "learning_rate": 5.449299131846035e-06, + "loss": 0.1468, + "step": 18836 + }, + { + "epoch": 0.4766809221347774, + "grad_norm": 5.405296325683594, + "learning_rate": 5.448899232248808e-06, + "loss": 0.1711, + "step": 18837 + }, + { + "epoch": 0.47670622769947113, + "grad_norm": 16.61544418334961, + "learning_rate": 5.448499329756712e-06, + "loss": 0.3045, + "step": 18838 + }, + { + "epoch": 0.4767315332641648, + "grad_norm": 4.913428783416748, + "learning_rate": 5.448099424372329e-06, + "loss": 0.2578, + "step": 18839 + }, + { + "epoch": 0.47675683882885844, + "grad_norm": 5.270944118499756, + "learning_rate": 5.4476995160982345e-06, + "loss": 0.2104, + "step": 18840 + }, + { + "epoch": 0.47678214439355215, + "grad_norm": 3.6645050048828125, + "learning_rate": 5.447299604937011e-06, + "loss": 0.1408, + "step": 18841 + }, + { + "epoch": 0.4768074499582458, + "grad_norm": 3.88901948928833, + "learning_rate": 5.446899690891234e-06, + "loss": 0.1531, + "step": 18842 + }, + { + "epoch": 0.4768327555229395, + "grad_norm": 2.6449599266052246, + "learning_rate": 5.446499773963484e-06, + "loss": 0.1204, + "step": 18843 + }, + { + "epoch": 0.4768580610876332, + "grad_norm": 6.5640034675598145, + "learning_rate": 5.446099854156341e-06, + "loss": 0.165, + "step": 18844 + }, + { + "epoch": 0.47688336665232683, + "grad_norm": 15.509445190429688, + "learning_rate": 5.445699931472381e-06, + "loss": 0.1439, + "step": 18845 + }, + { + "epoch": 0.47690867221702055, + "grad_norm": 2.7571773529052734, + "learning_rate": 5.445300005914188e-06, + "loss": 0.1454, + "step": 18846 + }, + { + "epoch": 0.4769339777817142, + "grad_norm": 14.062682151794434, + "learning_rate": 5.444900077484336e-06, + "loss": 0.249, + "step": 18847 + }, + { + "epoch": 0.47695928334640786, + "grad_norm": 2.806863784790039, + "learning_rate": 5.4445001461854075e-06, + "loss": 0.1368, + "step": 18848 + }, + { + "epoch": 0.47698458891110157, + "grad_norm": 8.819119453430176, + "learning_rate": 5.4441002120199786e-06, + "loss": 0.2229, + "step": 18849 + }, + { + "epoch": 0.4770098944757952, + "grad_norm": 4.3353986740112305, + "learning_rate": 5.443700274990631e-06, + "loss": 0.1933, + "step": 18850 + }, + { + "epoch": 0.4770352000404889, + "grad_norm": 4.895081520080566, + "learning_rate": 5.443300335099942e-06, + "loss": 0.2344, + "step": 18851 + }, + { + "epoch": 0.4770605056051826, + "grad_norm": 6.114600658416748, + "learning_rate": 5.442900392350493e-06, + "loss": 0.2087, + "step": 18852 + }, + { + "epoch": 0.47708581116987625, + "grad_norm": 3.8912038803100586, + "learning_rate": 5.4425004467448606e-06, + "loss": 0.1166, + "step": 18853 + }, + { + "epoch": 0.4771111167345699, + "grad_norm": 5.475554466247559, + "learning_rate": 5.442100498285625e-06, + "loss": 0.1838, + "step": 18854 + }, + { + "epoch": 0.4771364222992636, + "grad_norm": 4.767504692077637, + "learning_rate": 5.441700546975367e-06, + "loss": 0.1714, + "step": 18855 + }, + { + "epoch": 0.4771617278639573, + "grad_norm": 8.23734188079834, + "learning_rate": 5.441300592816663e-06, + "loss": 0.1611, + "step": 18856 + }, + { + "epoch": 0.477187033428651, + "grad_norm": 7.974939823150635, + "learning_rate": 5.440900635812094e-06, + "loss": 0.2405, + "step": 18857 + }, + { + "epoch": 0.47721233899334464, + "grad_norm": 4.87809944152832, + "learning_rate": 5.4405006759642384e-06, + "loss": 0.2412, + "step": 18858 + }, + { + "epoch": 0.4772376445580383, + "grad_norm": 10.838605880737305, + "learning_rate": 5.440100713275678e-06, + "loss": 0.2524, + "step": 18859 + }, + { + "epoch": 0.477262950122732, + "grad_norm": 6.66356086730957, + "learning_rate": 5.439700747748987e-06, + "loss": 0.2525, + "step": 18860 + }, + { + "epoch": 0.47728825568742567, + "grad_norm": 6.011073112487793, + "learning_rate": 5.4393007793867494e-06, + "loss": 0.2281, + "step": 18861 + }, + { + "epoch": 0.4773135612521193, + "grad_norm": 12.418572425842285, + "learning_rate": 5.438900808191541e-06, + "loss": 0.1931, + "step": 18862 + }, + { + "epoch": 0.47733886681681303, + "grad_norm": 2.6840176582336426, + "learning_rate": 5.438500834165946e-06, + "loss": 0.1106, + "step": 18863 + }, + { + "epoch": 0.4773641723815067, + "grad_norm": 3.115678310394287, + "learning_rate": 5.438100857312538e-06, + "loss": 0.1625, + "step": 18864 + }, + { + "epoch": 0.47738947794620035, + "grad_norm": 4.130786418914795, + "learning_rate": 5.4377008776339e-06, + "loss": 0.1759, + "step": 18865 + }, + { + "epoch": 0.47741478351089406, + "grad_norm": 6.62556791305542, + "learning_rate": 5.4373008951326115e-06, + "loss": 0.2468, + "step": 18866 + }, + { + "epoch": 0.4774400890755877, + "grad_norm": 4.208568572998047, + "learning_rate": 5.436900909811249e-06, + "loss": 0.1566, + "step": 18867 + }, + { + "epoch": 0.47746539464028137, + "grad_norm": 5.185756683349609, + "learning_rate": 5.436500921672395e-06, + "loss": 0.1529, + "step": 18868 + }, + { + "epoch": 0.4774907002049751, + "grad_norm": 2.9193148612976074, + "learning_rate": 5.436100930718626e-06, + "loss": 0.122, + "step": 18869 + }, + { + "epoch": 0.47751600576966874, + "grad_norm": 7.117950916290283, + "learning_rate": 5.435700936952526e-06, + "loss": 0.2624, + "step": 18870 + }, + { + "epoch": 0.47754131133436245, + "grad_norm": 4.435156345367432, + "learning_rate": 5.435300940376671e-06, + "loss": 0.1751, + "step": 18871 + }, + { + "epoch": 0.4775666168990561, + "grad_norm": 2.935112714767456, + "learning_rate": 5.434900940993641e-06, + "loss": 0.0999, + "step": 18872 + }, + { + "epoch": 0.47759192246374976, + "grad_norm": 9.258733749389648, + "learning_rate": 5.4345009388060146e-06, + "loss": 0.1442, + "step": 18873 + }, + { + "epoch": 0.4776172280284435, + "grad_norm": 8.427133560180664, + "learning_rate": 5.434100933816374e-06, + "loss": 0.2191, + "step": 18874 + }, + { + "epoch": 0.47764253359313713, + "grad_norm": 3.485626459121704, + "learning_rate": 5.433700926027295e-06, + "loss": 0.127, + "step": 18875 + }, + { + "epoch": 0.4776678391578308, + "grad_norm": 9.904471397399902, + "learning_rate": 5.433300915441361e-06, + "loss": 0.296, + "step": 18876 + }, + { + "epoch": 0.4776931447225245, + "grad_norm": 9.293073654174805, + "learning_rate": 5.43290090206115e-06, + "loss": 0.2121, + "step": 18877 + }, + { + "epoch": 0.47771845028721815, + "grad_norm": 3.2903666496276855, + "learning_rate": 5.432500885889241e-06, + "loss": 0.1389, + "step": 18878 + }, + { + "epoch": 0.4777437558519118, + "grad_norm": 3.3258607387542725, + "learning_rate": 5.432100866928215e-06, + "loss": 0.0629, + "step": 18879 + }, + { + "epoch": 0.4777690614166055, + "grad_norm": 2.9474573135375977, + "learning_rate": 5.4317008451806505e-06, + "loss": 0.119, + "step": 18880 + }, + { + "epoch": 0.4777943669812992, + "grad_norm": 4.170381546020508, + "learning_rate": 5.431300820649128e-06, + "loss": 0.1871, + "step": 18881 + }, + { + "epoch": 0.4778196725459929, + "grad_norm": 12.572412490844727, + "learning_rate": 5.430900793336225e-06, + "loss": 0.1849, + "step": 18882 + }, + { + "epoch": 0.47784497811068655, + "grad_norm": 6.013125896453857, + "learning_rate": 5.430500763244526e-06, + "loss": 0.1598, + "step": 18883 + }, + { + "epoch": 0.4778702836753802, + "grad_norm": 4.190911769866943, + "learning_rate": 5.430100730376605e-06, + "loss": 0.105, + "step": 18884 + }, + { + "epoch": 0.4778955892400739, + "grad_norm": 2.5619938373565674, + "learning_rate": 5.429700694735045e-06, + "loss": 0.0962, + "step": 18885 + }, + { + "epoch": 0.47792089480476757, + "grad_norm": 4.154772758483887, + "learning_rate": 5.429300656322426e-06, + "loss": 0.1475, + "step": 18886 + }, + { + "epoch": 0.4779462003694612, + "grad_norm": 9.114875793457031, + "learning_rate": 5.428900615141326e-06, + "loss": 0.2883, + "step": 18887 + }, + { + "epoch": 0.47797150593415494, + "grad_norm": 8.038917541503906, + "learning_rate": 5.4285005711943266e-06, + "loss": 0.2479, + "step": 18888 + }, + { + "epoch": 0.4779968114988486, + "grad_norm": 5.19905948638916, + "learning_rate": 5.428100524484006e-06, + "loss": 0.1348, + "step": 18889 + }, + { + "epoch": 0.47802211706354225, + "grad_norm": 3.968698263168335, + "learning_rate": 5.427700475012945e-06, + "loss": 0.1628, + "step": 18890 + }, + { + "epoch": 0.47804742262823596, + "grad_norm": 4.031611442565918, + "learning_rate": 5.427300422783722e-06, + "loss": 0.1406, + "step": 18891 + }, + { + "epoch": 0.4780727281929296, + "grad_norm": 3.812906503677368, + "learning_rate": 5.42690036779892e-06, + "loss": 0.1608, + "step": 18892 + }, + { + "epoch": 0.4780980337576233, + "grad_norm": 2.0819461345672607, + "learning_rate": 5.426500310061116e-06, + "loss": 0.1045, + "step": 18893 + }, + { + "epoch": 0.478123339322317, + "grad_norm": 6.301721572875977, + "learning_rate": 5.426100249572891e-06, + "loss": 0.165, + "step": 18894 + }, + { + "epoch": 0.47814864488701064, + "grad_norm": 5.597560405731201, + "learning_rate": 5.425700186336824e-06, + "loss": 0.2138, + "step": 18895 + }, + { + "epoch": 0.47817395045170435, + "grad_norm": 6.9595537185668945, + "learning_rate": 5.425300120355496e-06, + "loss": 0.2516, + "step": 18896 + }, + { + "epoch": 0.478199256016398, + "grad_norm": 7.231682777404785, + "learning_rate": 5.4249000516314874e-06, + "loss": 0.1586, + "step": 18897 + }, + { + "epoch": 0.47822456158109167, + "grad_norm": 14.611374855041504, + "learning_rate": 5.424499980167378e-06, + "loss": 0.1205, + "step": 18898 + }, + { + "epoch": 0.4782498671457854, + "grad_norm": 8.263890266418457, + "learning_rate": 5.424099905965747e-06, + "loss": 0.1663, + "step": 18899 + }, + { + "epoch": 0.47827517271047904, + "grad_norm": 5.581569671630859, + "learning_rate": 5.423699829029173e-06, + "loss": 0.1691, + "step": 18900 + }, + { + "epoch": 0.4783004782751727, + "grad_norm": 6.20822286605835, + "learning_rate": 5.423299749360238e-06, + "loss": 0.237, + "step": 18901 + }, + { + "epoch": 0.4783257838398664, + "grad_norm": 8.35029125213623, + "learning_rate": 5.422899666961523e-06, + "loss": 0.1593, + "step": 18902 + }, + { + "epoch": 0.47835108940456006, + "grad_norm": 3.719242811203003, + "learning_rate": 5.422499581835607e-06, + "loss": 0.1278, + "step": 18903 + }, + { + "epoch": 0.4783763949692537, + "grad_norm": 4.979197025299072, + "learning_rate": 5.422099493985067e-06, + "loss": 0.1375, + "step": 18904 + }, + { + "epoch": 0.4784017005339474, + "grad_norm": 3.2339892387390137, + "learning_rate": 5.421699403412488e-06, + "loss": 0.1555, + "step": 18905 + }, + { + "epoch": 0.4784270060986411, + "grad_norm": 6.159908294677734, + "learning_rate": 5.421299310120447e-06, + "loss": 0.1969, + "step": 18906 + }, + { + "epoch": 0.4784523116633348, + "grad_norm": 3.7607641220092773, + "learning_rate": 5.420899214111525e-06, + "loss": 0.1058, + "step": 18907 + }, + { + "epoch": 0.47847761722802845, + "grad_norm": 6.462229251861572, + "learning_rate": 5.420499115388304e-06, + "loss": 0.2171, + "step": 18908 + }, + { + "epoch": 0.4785029227927221, + "grad_norm": 7.490584373474121, + "learning_rate": 5.420099013953361e-06, + "loss": 0.1302, + "step": 18909 + }, + { + "epoch": 0.4785282283574158, + "grad_norm": 6.932230472564697, + "learning_rate": 5.4196989098092774e-06, + "loss": 0.1883, + "step": 18910 + }, + { + "epoch": 0.4785535339221095, + "grad_norm": 8.044218063354492, + "learning_rate": 5.419298802958633e-06, + "loss": 0.2435, + "step": 18911 + }, + { + "epoch": 0.47857883948680313, + "grad_norm": 5.9455366134643555, + "learning_rate": 5.418898693404011e-06, + "loss": 0.2866, + "step": 18912 + }, + { + "epoch": 0.47860414505149684, + "grad_norm": 3.5592100620269775, + "learning_rate": 5.418498581147987e-06, + "loss": 0.2, + "step": 18913 + }, + { + "epoch": 0.4786294506161905, + "grad_norm": 7.167057037353516, + "learning_rate": 5.4180984661931455e-06, + "loss": 0.1789, + "step": 18914 + }, + { + "epoch": 0.47865475618088416, + "grad_norm": 9.291489601135254, + "learning_rate": 5.417698348542063e-06, + "loss": 0.2432, + "step": 18915 + }, + { + "epoch": 0.47868006174557787, + "grad_norm": 8.78926944732666, + "learning_rate": 5.417298228197323e-06, + "loss": 0.1849, + "step": 18916 + }, + { + "epoch": 0.4787053673102715, + "grad_norm": 6.384859561920166, + "learning_rate": 5.416898105161503e-06, + "loss": 0.1147, + "step": 18917 + }, + { + "epoch": 0.4787306728749652, + "grad_norm": 7.987329006195068, + "learning_rate": 5.416497979437185e-06, + "loss": 0.1646, + "step": 18918 + }, + { + "epoch": 0.4787559784396589, + "grad_norm": 4.139039516448975, + "learning_rate": 5.4160978510269505e-06, + "loss": 0.1045, + "step": 18919 + }, + { + "epoch": 0.47878128400435255, + "grad_norm": 9.0148344039917, + "learning_rate": 5.415697719933376e-06, + "loss": 0.222, + "step": 18920 + }, + { + "epoch": 0.47880658956904626, + "grad_norm": 8.6368408203125, + "learning_rate": 5.4152975861590465e-06, + "loss": 0.1876, + "step": 18921 + }, + { + "epoch": 0.4788318951337399, + "grad_norm": 16.870386123657227, + "learning_rate": 5.414897449706537e-06, + "loss": 0.1498, + "step": 18922 + }, + { + "epoch": 0.47885720069843357, + "grad_norm": 6.671758651733398, + "learning_rate": 5.414497310578435e-06, + "loss": 0.1709, + "step": 18923 + }, + { + "epoch": 0.4788825062631273, + "grad_norm": 4.843606948852539, + "learning_rate": 5.414097168777314e-06, + "loss": 0.1615, + "step": 18924 + }, + { + "epoch": 0.47890781182782094, + "grad_norm": 7.797877788543701, + "learning_rate": 5.413697024305758e-06, + "loss": 0.2017, + "step": 18925 + }, + { + "epoch": 0.4789331173925146, + "grad_norm": 6.5774407386779785, + "learning_rate": 5.413296877166348e-06, + "loss": 0.2189, + "step": 18926 + }, + { + "epoch": 0.4789584229572083, + "grad_norm": 8.321554183959961, + "learning_rate": 5.412896727361663e-06, + "loss": 0.1762, + "step": 18927 + }, + { + "epoch": 0.47898372852190196, + "grad_norm": 13.043951034545898, + "learning_rate": 5.412496574894283e-06, + "loss": 0.3169, + "step": 18928 + }, + { + "epoch": 0.4790090340865956, + "grad_norm": 3.5990169048309326, + "learning_rate": 5.412096419766789e-06, + "loss": 0.1002, + "step": 18929 + }, + { + "epoch": 0.47903433965128933, + "grad_norm": 9.792182922363281, + "learning_rate": 5.411696261981763e-06, + "loss": 0.1613, + "step": 18930 + }, + { + "epoch": 0.479059645215983, + "grad_norm": 4.550982475280762, + "learning_rate": 5.411296101541784e-06, + "loss": 0.2307, + "step": 18931 + }, + { + "epoch": 0.47908495078067664, + "grad_norm": 7.542774200439453, + "learning_rate": 5.410895938449433e-06, + "loss": 0.2143, + "step": 18932 + }, + { + "epoch": 0.47911025634537036, + "grad_norm": 6.750308513641357, + "learning_rate": 5.410495772707291e-06, + "loss": 0.1869, + "step": 18933 + }, + { + "epoch": 0.479135561910064, + "grad_norm": 6.911073207855225, + "learning_rate": 5.410095604317936e-06, + "loss": 0.1823, + "step": 18934 + }, + { + "epoch": 0.4791608674747577, + "grad_norm": 7.024639129638672, + "learning_rate": 5.4096954332839525e-06, + "loss": 0.2594, + "step": 18935 + }, + { + "epoch": 0.4791861730394514, + "grad_norm": 9.258682250976562, + "learning_rate": 5.40929525960792e-06, + "loss": 0.2254, + "step": 18936 + }, + { + "epoch": 0.47921147860414504, + "grad_norm": 3.1377668380737305, + "learning_rate": 5.408895083292417e-06, + "loss": 0.175, + "step": 18937 + }, + { + "epoch": 0.47923678416883875, + "grad_norm": 5.639079570770264, + "learning_rate": 5.408494904340027e-06, + "loss": 0.1728, + "step": 18938 + }, + { + "epoch": 0.4792620897335324, + "grad_norm": 6.499063491821289, + "learning_rate": 5.4080947227533275e-06, + "loss": 0.2113, + "step": 18939 + }, + { + "epoch": 0.47928739529822606, + "grad_norm": 7.175816535949707, + "learning_rate": 5.407694538534902e-06, + "loss": 0.2648, + "step": 18940 + }, + { + "epoch": 0.47931270086291977, + "grad_norm": 21.075756072998047, + "learning_rate": 5.407294351687331e-06, + "loss": 0.2773, + "step": 18941 + }, + { + "epoch": 0.47933800642761343, + "grad_norm": 4.918161869049072, + "learning_rate": 5.406894162213194e-06, + "loss": 0.1539, + "step": 18942 + }, + { + "epoch": 0.4793633119923071, + "grad_norm": 6.502063274383545, + "learning_rate": 5.406493970115072e-06, + "loss": 0.1831, + "step": 18943 + }, + { + "epoch": 0.4793886175570008, + "grad_norm": 4.440613746643066, + "learning_rate": 5.4060937753955455e-06, + "loss": 0.1369, + "step": 18944 + }, + { + "epoch": 0.47941392312169445, + "grad_norm": 14.792337417602539, + "learning_rate": 5.405693578057196e-06, + "loss": 0.2402, + "step": 18945 + }, + { + "epoch": 0.47943922868638816, + "grad_norm": 14.061626434326172, + "learning_rate": 5.405293378102604e-06, + "loss": 0.1941, + "step": 18946 + }, + { + "epoch": 0.4794645342510818, + "grad_norm": 2.752817392349243, + "learning_rate": 5.404893175534352e-06, + "loss": 0.1238, + "step": 18947 + }, + { + "epoch": 0.4794898398157755, + "grad_norm": 16.648759841918945, + "learning_rate": 5.404492970355017e-06, + "loss": 0.2732, + "step": 18948 + }, + { + "epoch": 0.4795151453804692, + "grad_norm": 2.4427437782287598, + "learning_rate": 5.404092762567184e-06, + "loss": 0.116, + "step": 18949 + }, + { + "epoch": 0.47954045094516284, + "grad_norm": 23.3778018951416, + "learning_rate": 5.4036925521734305e-06, + "loss": 0.2178, + "step": 18950 + }, + { + "epoch": 0.4795657565098565, + "grad_norm": 3.0364158153533936, + "learning_rate": 5.403292339176339e-06, + "loss": 0.0989, + "step": 18951 + }, + { + "epoch": 0.4795910620745502, + "grad_norm": 3.513249635696411, + "learning_rate": 5.402892123578491e-06, + "loss": 0.1439, + "step": 18952 + }, + { + "epoch": 0.47961636763924387, + "grad_norm": 3.4792282581329346, + "learning_rate": 5.402491905382467e-06, + "loss": 0.1346, + "step": 18953 + }, + { + "epoch": 0.4796416732039375, + "grad_norm": 6.311466217041016, + "learning_rate": 5.402091684590846e-06, + "loss": 0.1756, + "step": 18954 + }, + { + "epoch": 0.47966697876863124, + "grad_norm": 2.2637288570404053, + "learning_rate": 5.401691461206211e-06, + "loss": 0.0826, + "step": 18955 + }, + { + "epoch": 0.4796922843333249, + "grad_norm": 7.05148458480835, + "learning_rate": 5.401291235231144e-06, + "loss": 0.204, + "step": 18956 + }, + { + "epoch": 0.47971758989801855, + "grad_norm": 3.8566994667053223, + "learning_rate": 5.400891006668223e-06, + "loss": 0.1226, + "step": 18957 + }, + { + "epoch": 0.47974289546271226, + "grad_norm": 11.202971458435059, + "learning_rate": 5.400490775520031e-06, + "loss": 0.3484, + "step": 18958 + }, + { + "epoch": 0.4797682010274059, + "grad_norm": 2.949420213699341, + "learning_rate": 5.400090541789149e-06, + "loss": 0.1487, + "step": 18959 + }, + { + "epoch": 0.47979350659209963, + "grad_norm": 9.495076179504395, + "learning_rate": 5.399690305478156e-06, + "loss": 0.1673, + "step": 18960 + }, + { + "epoch": 0.4798188121567933, + "grad_norm": 9.553946495056152, + "learning_rate": 5.399290066589636e-06, + "loss": 0.2351, + "step": 18961 + }, + { + "epoch": 0.47984411772148694, + "grad_norm": 3.878941774368286, + "learning_rate": 5.398889825126168e-06, + "loss": 0.1674, + "step": 18962 + }, + { + "epoch": 0.47986942328618065, + "grad_norm": 6.538437366485596, + "learning_rate": 5.398489581090334e-06, + "loss": 0.1662, + "step": 18963 + }, + { + "epoch": 0.4798947288508743, + "grad_norm": 3.9839189052581787, + "learning_rate": 5.398089334484714e-06, + "loss": 0.1797, + "step": 18964 + }, + { + "epoch": 0.47992003441556796, + "grad_norm": 2.738737106323242, + "learning_rate": 5.397689085311891e-06, + "loss": 0.1133, + "step": 18965 + }, + { + "epoch": 0.4799453399802617, + "grad_norm": 6.6545281410217285, + "learning_rate": 5.397288833574443e-06, + "loss": 0.1634, + "step": 18966 + }, + { + "epoch": 0.47997064554495533, + "grad_norm": 3.6320853233337402, + "learning_rate": 5.3968885792749555e-06, + "loss": 0.1034, + "step": 18967 + }, + { + "epoch": 0.479995951109649, + "grad_norm": 5.115923881530762, + "learning_rate": 5.396488322416007e-06, + "loss": 0.2446, + "step": 18968 + }, + { + "epoch": 0.4800212566743427, + "grad_norm": 5.067730903625488, + "learning_rate": 5.396088063000179e-06, + "loss": 0.1808, + "step": 18969 + }, + { + "epoch": 0.48004656223903636, + "grad_norm": 5.611173629760742, + "learning_rate": 5.395687801030052e-06, + "loss": 0.1932, + "step": 18970 + }, + { + "epoch": 0.48007186780373007, + "grad_norm": 3.3257997035980225, + "learning_rate": 5.395287536508208e-06, + "loss": 0.1161, + "step": 18971 + }, + { + "epoch": 0.4800971733684237, + "grad_norm": 5.283451557159424, + "learning_rate": 5.394887269437229e-06, + "loss": 0.2063, + "step": 18972 + }, + { + "epoch": 0.4801224789331174, + "grad_norm": 2.885929822921753, + "learning_rate": 5.394486999819695e-06, + "loss": 0.139, + "step": 18973 + }, + { + "epoch": 0.4801477844978111, + "grad_norm": 4.591769218444824, + "learning_rate": 5.394086727658187e-06, + "loss": 0.1264, + "step": 18974 + }, + { + "epoch": 0.48017309006250475, + "grad_norm": 5.7346296310424805, + "learning_rate": 5.393686452955288e-06, + "loss": 0.2177, + "step": 18975 + }, + { + "epoch": 0.4801983956271984, + "grad_norm": 3.704594373703003, + "learning_rate": 5.393286175713579e-06, + "loss": 0.1666, + "step": 18976 + }, + { + "epoch": 0.4802237011918921, + "grad_norm": 6.547710418701172, + "learning_rate": 5.392885895935639e-06, + "loss": 0.1567, + "step": 18977 + }, + { + "epoch": 0.4802490067565858, + "grad_norm": 3.1243977546691895, + "learning_rate": 5.392485613624052e-06, + "loss": 0.1015, + "step": 18978 + }, + { + "epoch": 0.48027431232127943, + "grad_norm": 7.219628810882568, + "learning_rate": 5.392085328781398e-06, + "loss": 0.2147, + "step": 18979 + }, + { + "epoch": 0.48029961788597314, + "grad_norm": 4.978738784790039, + "learning_rate": 5.3916850414102585e-06, + "loss": 0.1323, + "step": 18980 + }, + { + "epoch": 0.4803249234506668, + "grad_norm": 5.7269744873046875, + "learning_rate": 5.391284751513215e-06, + "loss": 0.163, + "step": 18981 + }, + { + "epoch": 0.48035022901536045, + "grad_norm": 4.222884654998779, + "learning_rate": 5.390884459092849e-06, + "loss": 0.1726, + "step": 18982 + }, + { + "epoch": 0.48037553458005416, + "grad_norm": 3.3055319786071777, + "learning_rate": 5.390484164151742e-06, + "loss": 0.1021, + "step": 18983 + }, + { + "epoch": 0.4804008401447478, + "grad_norm": 6.312157154083252, + "learning_rate": 5.390083866692475e-06, + "loss": 0.1594, + "step": 18984 + }, + { + "epoch": 0.48042614570944153, + "grad_norm": 19.159696578979492, + "learning_rate": 5.389683566717629e-06, + "loss": 0.2496, + "step": 18985 + }, + { + "epoch": 0.4804514512741352, + "grad_norm": 5.391548156738281, + "learning_rate": 5.389283264229788e-06, + "loss": 0.1771, + "step": 18986 + }, + { + "epoch": 0.48047675683882884, + "grad_norm": 3.502096176147461, + "learning_rate": 5.38888295923153e-06, + "loss": 0.1333, + "step": 18987 + }, + { + "epoch": 0.48050206240352256, + "grad_norm": 5.3338141441345215, + "learning_rate": 5.38848265172544e-06, + "loss": 0.233, + "step": 18988 + }, + { + "epoch": 0.4805273679682162, + "grad_norm": 4.704446792602539, + "learning_rate": 5.388082341714096e-06, + "loss": 0.1265, + "step": 18989 + }, + { + "epoch": 0.48055267353290987, + "grad_norm": 2.664215087890625, + "learning_rate": 5.387682029200082e-06, + "loss": 0.0799, + "step": 18990 + }, + { + "epoch": 0.4805779790976036, + "grad_norm": 23.23398780822754, + "learning_rate": 5.3872817141859775e-06, + "loss": 0.2551, + "step": 18991 + }, + { + "epoch": 0.48060328466229724, + "grad_norm": 4.885574817657471, + "learning_rate": 5.386881396674367e-06, + "loss": 0.2355, + "step": 18992 + }, + { + "epoch": 0.4806285902269909, + "grad_norm": 5.549118995666504, + "learning_rate": 5.386481076667829e-06, + "loss": 0.186, + "step": 18993 + }, + { + "epoch": 0.4806538957916846, + "grad_norm": 5.1302618980407715, + "learning_rate": 5.386080754168948e-06, + "loss": 0.2205, + "step": 18994 + }, + { + "epoch": 0.48067920135637826, + "grad_norm": 4.183847904205322, + "learning_rate": 5.385680429180301e-06, + "loss": 0.1377, + "step": 18995 + }, + { + "epoch": 0.4807045069210719, + "grad_norm": 3.478555917739868, + "learning_rate": 5.385280101704475e-06, + "loss": 0.1563, + "step": 18996 + }, + { + "epoch": 0.48072981248576563, + "grad_norm": 6.467959880828857, + "learning_rate": 5.3848797717440484e-06, + "loss": 0.2324, + "step": 18997 + }, + { + "epoch": 0.4807551180504593, + "grad_norm": 10.852886199951172, + "learning_rate": 5.384479439301605e-06, + "loss": 0.2524, + "step": 18998 + }, + { + "epoch": 0.480780423615153, + "grad_norm": 8.605643272399902, + "learning_rate": 5.3840791043797234e-06, + "loss": 0.2185, + "step": 18999 + }, + { + "epoch": 0.48080572917984665, + "grad_norm": 3.54068922996521, + "learning_rate": 5.383678766980989e-06, + "loss": 0.1341, + "step": 19000 + }, + { + "epoch": 0.4808310347445403, + "grad_norm": 5.413647651672363, + "learning_rate": 5.383278427107979e-06, + "loss": 0.1829, + "step": 19001 + }, + { + "epoch": 0.480856340309234, + "grad_norm": 5.441218376159668, + "learning_rate": 5.382878084763281e-06, + "loss": 0.1477, + "step": 19002 + }, + { + "epoch": 0.4808816458739277, + "grad_norm": 13.085578918457031, + "learning_rate": 5.382477739949471e-06, + "loss": 0.2679, + "step": 19003 + }, + { + "epoch": 0.48090695143862133, + "grad_norm": 3.229654550552368, + "learning_rate": 5.382077392669134e-06, + "loss": 0.1507, + "step": 19004 + }, + { + "epoch": 0.48093225700331504, + "grad_norm": 7.637664794921875, + "learning_rate": 5.381677042924851e-06, + "loss": 0.1702, + "step": 19005 + }, + { + "epoch": 0.4809575625680087, + "grad_norm": 3.9336986541748047, + "learning_rate": 5.381276690719203e-06, + "loss": 0.1521, + "step": 19006 + }, + { + "epoch": 0.48098286813270236, + "grad_norm": 4.47012186050415, + "learning_rate": 5.380876336054774e-06, + "loss": 0.189, + "step": 19007 + }, + { + "epoch": 0.48100817369739607, + "grad_norm": 3.2530505657196045, + "learning_rate": 5.380475978934143e-06, + "loss": 0.1529, + "step": 19008 + }, + { + "epoch": 0.4810334792620897, + "grad_norm": 6.951356887817383, + "learning_rate": 5.380075619359894e-06, + "loss": 0.2505, + "step": 19009 + }, + { + "epoch": 0.48105878482678344, + "grad_norm": 3.521334648132324, + "learning_rate": 5.3796752573346075e-06, + "loss": 0.1732, + "step": 19010 + }, + { + "epoch": 0.4810840903914771, + "grad_norm": 4.072854518890381, + "learning_rate": 5.379274892860867e-06, + "loss": 0.1142, + "step": 19011 + }, + { + "epoch": 0.48110939595617075, + "grad_norm": 4.312936782836914, + "learning_rate": 5.378874525941252e-06, + "loss": 0.1783, + "step": 19012 + }, + { + "epoch": 0.48113470152086446, + "grad_norm": 3.9054999351501465, + "learning_rate": 5.378474156578346e-06, + "loss": 0.1608, + "step": 19013 + }, + { + "epoch": 0.4811600070855581, + "grad_norm": 2.0600171089172363, + "learning_rate": 5.378073784774731e-06, + "loss": 0.0991, + "step": 19014 + }, + { + "epoch": 0.4811853126502518, + "grad_norm": 4.123847961425781, + "learning_rate": 5.37767341053299e-06, + "loss": 0.1133, + "step": 19015 + }, + { + "epoch": 0.4812106182149455, + "grad_norm": 10.106639862060547, + "learning_rate": 5.3772730338557e-06, + "loss": 0.235, + "step": 19016 + }, + { + "epoch": 0.48123592377963914, + "grad_norm": 9.29550552368164, + "learning_rate": 5.3768726547454485e-06, + "loss": 0.3221, + "step": 19017 + }, + { + "epoch": 0.4812612293443328, + "grad_norm": 5.732336044311523, + "learning_rate": 5.376472273204815e-06, + "loss": 0.1841, + "step": 19018 + }, + { + "epoch": 0.4812865349090265, + "grad_norm": 2.9106783866882324, + "learning_rate": 5.376071889236382e-06, + "loss": 0.1581, + "step": 19019 + }, + { + "epoch": 0.48131184047372016, + "grad_norm": 5.658786296844482, + "learning_rate": 5.375671502842732e-06, + "loss": 0.1851, + "step": 19020 + }, + { + "epoch": 0.4813371460384138, + "grad_norm": 11.683354377746582, + "learning_rate": 5.375271114026444e-06, + "loss": 0.3429, + "step": 19021 + }, + { + "epoch": 0.48136245160310753, + "grad_norm": 2.421748638153076, + "learning_rate": 5.374870722790105e-06, + "loss": 0.1411, + "step": 19022 + }, + { + "epoch": 0.4813877571678012, + "grad_norm": 7.650420188903809, + "learning_rate": 5.374470329136293e-06, + "loss": 0.2769, + "step": 19023 + }, + { + "epoch": 0.4814130627324949, + "grad_norm": 6.826962947845459, + "learning_rate": 5.3740699330675916e-06, + "loss": 0.196, + "step": 19024 + }, + { + "epoch": 0.48143836829718856, + "grad_norm": 9.581671714782715, + "learning_rate": 5.373669534586583e-06, + "loss": 0.1865, + "step": 19025 + }, + { + "epoch": 0.4814636738618822, + "grad_norm": 6.73284387588501, + "learning_rate": 5.373269133695849e-06, + "loss": 0.163, + "step": 19026 + }, + { + "epoch": 0.4814889794265759, + "grad_norm": 5.951926231384277, + "learning_rate": 5.372868730397972e-06, + "loss": 0.1348, + "step": 19027 + }, + { + "epoch": 0.4815142849912696, + "grad_norm": 4.977705001831055, + "learning_rate": 5.372468324695533e-06, + "loss": 0.1884, + "step": 19028 + }, + { + "epoch": 0.48153959055596324, + "grad_norm": 4.235353946685791, + "learning_rate": 5.372067916591116e-06, + "loss": 0.1677, + "step": 19029 + }, + { + "epoch": 0.48156489612065695, + "grad_norm": 6.242561340332031, + "learning_rate": 5.371667506087302e-06, + "loss": 0.2428, + "step": 19030 + }, + { + "epoch": 0.4815902016853506, + "grad_norm": 4.908924102783203, + "learning_rate": 5.371267093186674e-06, + "loss": 0.1476, + "step": 19031 + }, + { + "epoch": 0.48161550725004426, + "grad_norm": 3.5393526554107666, + "learning_rate": 5.370866677891813e-06, + "loss": 0.1715, + "step": 19032 + }, + { + "epoch": 0.481640812814738, + "grad_norm": 4.810617446899414, + "learning_rate": 5.370466260205302e-06, + "loss": 0.1654, + "step": 19033 + }, + { + "epoch": 0.48166611837943163, + "grad_norm": 7.593467712402344, + "learning_rate": 5.3700658401297225e-06, + "loss": 0.1677, + "step": 19034 + }, + { + "epoch": 0.4816914239441253, + "grad_norm": 11.553018569946289, + "learning_rate": 5.369665417667657e-06, + "loss": 0.3795, + "step": 19035 + }, + { + "epoch": 0.481716729508819, + "grad_norm": 14.087331771850586, + "learning_rate": 5.3692649928216885e-06, + "loss": 0.2269, + "step": 19036 + }, + { + "epoch": 0.48174203507351265, + "grad_norm": 4.559049606323242, + "learning_rate": 5.368864565594398e-06, + "loss": 0.0893, + "step": 19037 + }, + { + "epoch": 0.48176734063820636, + "grad_norm": 3.641143560409546, + "learning_rate": 5.36846413598837e-06, + "loss": 0.1871, + "step": 19038 + }, + { + "epoch": 0.4817926462029, + "grad_norm": 4.433121681213379, + "learning_rate": 5.368063704006184e-06, + "loss": 0.1486, + "step": 19039 + }, + { + "epoch": 0.4818179517675937, + "grad_norm": 4.411611080169678, + "learning_rate": 5.367663269650425e-06, + "loss": 0.1313, + "step": 19040 + }, + { + "epoch": 0.4818432573322874, + "grad_norm": 3.5850913524627686, + "learning_rate": 5.367262832923672e-06, + "loss": 0.1353, + "step": 19041 + }, + { + "epoch": 0.48186856289698105, + "grad_norm": 9.254858016967773, + "learning_rate": 5.366862393828511e-06, + "loss": 0.2169, + "step": 19042 + }, + { + "epoch": 0.4818938684616747, + "grad_norm": 3.797995090484619, + "learning_rate": 5.366461952367522e-06, + "loss": 0.1878, + "step": 19043 + }, + { + "epoch": 0.4819191740263684, + "grad_norm": 11.214860916137695, + "learning_rate": 5.3660615085432885e-06, + "loss": 0.2572, + "step": 19044 + }, + { + "epoch": 0.48194447959106207, + "grad_norm": 4.4758806228637695, + "learning_rate": 5.365661062358391e-06, + "loss": 0.1324, + "step": 19045 + }, + { + "epoch": 0.4819697851557557, + "grad_norm": 2.486877918243408, + "learning_rate": 5.365260613815416e-06, + "loss": 0.1445, + "step": 19046 + }, + { + "epoch": 0.48199509072044944, + "grad_norm": 6.394100189208984, + "learning_rate": 5.364860162916941e-06, + "loss": 0.2027, + "step": 19047 + }, + { + "epoch": 0.4820203962851431, + "grad_norm": 3.130831480026245, + "learning_rate": 5.364459709665552e-06, + "loss": 0.1177, + "step": 19048 + }, + { + "epoch": 0.4820457018498368, + "grad_norm": 7.421813011169434, + "learning_rate": 5.3640592540638305e-06, + "loss": 0.3154, + "step": 19049 + }, + { + "epoch": 0.48207100741453046, + "grad_norm": 5.276668548583984, + "learning_rate": 5.363658796114357e-06, + "loss": 0.238, + "step": 19050 + }, + { + "epoch": 0.4820963129792241, + "grad_norm": 3.2619848251342773, + "learning_rate": 5.363258335819716e-06, + "loss": 0.1332, + "step": 19051 + }, + { + "epoch": 0.48212161854391783, + "grad_norm": 3.6288726329803467, + "learning_rate": 5.36285787318249e-06, + "loss": 0.1822, + "step": 19052 + }, + { + "epoch": 0.4821469241086115, + "grad_norm": 7.924162864685059, + "learning_rate": 5.362457408205262e-06, + "loss": 0.1588, + "step": 19053 + }, + { + "epoch": 0.48217222967330514, + "grad_norm": 4.106112957000732, + "learning_rate": 5.362056940890611e-06, + "loss": 0.1262, + "step": 19054 + }, + { + "epoch": 0.48219753523799885, + "grad_norm": 3.821817636489868, + "learning_rate": 5.361656471241125e-06, + "loss": 0.1514, + "step": 19055 + }, + { + "epoch": 0.4822228408026925, + "grad_norm": 6.469440460205078, + "learning_rate": 5.36125599925938e-06, + "loss": 0.1743, + "step": 19056 + }, + { + "epoch": 0.48224814636738617, + "grad_norm": 2.1684489250183105, + "learning_rate": 5.360855524947964e-06, + "loss": 0.1099, + "step": 19057 + }, + { + "epoch": 0.4822734519320799, + "grad_norm": 4.560723781585693, + "learning_rate": 5.360455048309459e-06, + "loss": 0.2122, + "step": 19058 + }, + { + "epoch": 0.48229875749677353, + "grad_norm": 4.171738624572754, + "learning_rate": 5.360054569346445e-06, + "loss": 0.1618, + "step": 19059 + }, + { + "epoch": 0.4823240630614672, + "grad_norm": 4.463534355163574, + "learning_rate": 5.359654088061508e-06, + "loss": 0.157, + "step": 19060 + }, + { + "epoch": 0.4823493686261609, + "grad_norm": 11.057083129882812, + "learning_rate": 5.359253604457227e-06, + "loss": 0.2965, + "step": 19061 + }, + { + "epoch": 0.48237467419085456, + "grad_norm": 4.426933288574219, + "learning_rate": 5.358853118536188e-06, + "loss": 0.236, + "step": 19062 + }, + { + "epoch": 0.48239997975554827, + "grad_norm": 13.39944076538086, + "learning_rate": 5.35845263030097e-06, + "loss": 0.3393, + "step": 19063 + }, + { + "epoch": 0.4824252853202419, + "grad_norm": 6.245274543762207, + "learning_rate": 5.358052139754159e-06, + "loss": 0.261, + "step": 19064 + }, + { + "epoch": 0.4824505908849356, + "grad_norm": 7.236595153808594, + "learning_rate": 5.357651646898335e-06, + "loss": 0.1867, + "step": 19065 + }, + { + "epoch": 0.4824758964496293, + "grad_norm": 9.266748428344727, + "learning_rate": 5.357251151736083e-06, + "loss": 0.302, + "step": 19066 + }, + { + "epoch": 0.48250120201432295, + "grad_norm": 4.716102123260498, + "learning_rate": 5.356850654269985e-06, + "loss": 0.1824, + "step": 19067 + }, + { + "epoch": 0.4825265075790166, + "grad_norm": 5.176210403442383, + "learning_rate": 5.356450154502622e-06, + "loss": 0.1627, + "step": 19068 + }, + { + "epoch": 0.4825518131437103, + "grad_norm": 4.160138130187988, + "learning_rate": 5.3560496524365804e-06, + "loss": 0.1257, + "step": 19069 + }, + { + "epoch": 0.482577118708404, + "grad_norm": 7.543055057525635, + "learning_rate": 5.35564914807444e-06, + "loss": 0.2202, + "step": 19070 + }, + { + "epoch": 0.48260242427309763, + "grad_norm": 4.489489555358887, + "learning_rate": 5.355248641418785e-06, + "loss": 0.2051, + "step": 19071 + }, + { + "epoch": 0.48262772983779134, + "grad_norm": 4.324267387390137, + "learning_rate": 5.354848132472196e-06, + "loss": 0.1078, + "step": 19072 + }, + { + "epoch": 0.482653035402485, + "grad_norm": 5.858402252197266, + "learning_rate": 5.354447621237259e-06, + "loss": 0.145, + "step": 19073 + }, + { + "epoch": 0.4826783409671787, + "grad_norm": 3.2379961013793945, + "learning_rate": 5.354047107716553e-06, + "loss": 0.1461, + "step": 19074 + }, + { + "epoch": 0.48270364653187237, + "grad_norm": 4.261657238006592, + "learning_rate": 5.353646591912665e-06, + "loss": 0.1914, + "step": 19075 + }, + { + "epoch": 0.482728952096566, + "grad_norm": 19.294431686401367, + "learning_rate": 5.353246073828177e-06, + "loss": 0.2303, + "step": 19076 + }, + { + "epoch": 0.48275425766125973, + "grad_norm": 5.611207485198975, + "learning_rate": 5.352845553465668e-06, + "loss": 0.1668, + "step": 19077 + }, + { + "epoch": 0.4827795632259534, + "grad_norm": 2.7467849254608154, + "learning_rate": 5.3524450308277255e-06, + "loss": 0.1401, + "step": 19078 + }, + { + "epoch": 0.48280486879064705, + "grad_norm": 6.123804092407227, + "learning_rate": 5.35204450591693e-06, + "loss": 0.167, + "step": 19079 + }, + { + "epoch": 0.48283017435534076, + "grad_norm": 4.711783409118652, + "learning_rate": 5.351643978735866e-06, + "loss": 0.1682, + "step": 19080 + }, + { + "epoch": 0.4828554799200344, + "grad_norm": 4.280665397644043, + "learning_rate": 5.351243449287114e-06, + "loss": 0.1331, + "step": 19081 + }, + { + "epoch": 0.48288078548472807, + "grad_norm": 4.613447666168213, + "learning_rate": 5.350842917573258e-06, + "loss": 0.1281, + "step": 19082 + }, + { + "epoch": 0.4829060910494218, + "grad_norm": 10.696396827697754, + "learning_rate": 5.350442383596881e-06, + "loss": 0.1879, + "step": 19083 + }, + { + "epoch": 0.48293139661411544, + "grad_norm": 11.395025253295898, + "learning_rate": 5.350041847360569e-06, + "loss": 0.2248, + "step": 19084 + }, + { + "epoch": 0.4829567021788091, + "grad_norm": 5.162757396697998, + "learning_rate": 5.3496413088669e-06, + "loss": 0.1853, + "step": 19085 + }, + { + "epoch": 0.4829820077435028, + "grad_norm": 3.280768394470215, + "learning_rate": 5.34924076811846e-06, + "loss": 0.2027, + "step": 19086 + }, + { + "epoch": 0.48300731330819646, + "grad_norm": 3.2256076335906982, + "learning_rate": 5.348840225117829e-06, + "loss": 0.1545, + "step": 19087 + }, + { + "epoch": 0.4830326188728902, + "grad_norm": 4.614922523498535, + "learning_rate": 5.348439679867594e-06, + "loss": 0.1737, + "step": 19088 + }, + { + "epoch": 0.48305792443758383, + "grad_norm": 4.6548171043396, + "learning_rate": 5.3480391323703376e-06, + "loss": 0.1348, + "step": 19089 + }, + { + "epoch": 0.4830832300022775, + "grad_norm": 5.379119396209717, + "learning_rate": 5.3476385826286406e-06, + "loss": 0.1615, + "step": 19090 + }, + { + "epoch": 0.4831085355669712, + "grad_norm": 5.57354736328125, + "learning_rate": 5.347238030645086e-06, + "loss": 0.2049, + "step": 19091 + }, + { + "epoch": 0.48313384113166485, + "grad_norm": 5.231253623962402, + "learning_rate": 5.34683747642226e-06, + "loss": 0.1996, + "step": 19092 + }, + { + "epoch": 0.4831591466963585, + "grad_norm": 6.568708419799805, + "learning_rate": 5.346436919962742e-06, + "loss": 0.2058, + "step": 19093 + }, + { + "epoch": 0.4831844522610522, + "grad_norm": 4.826976776123047, + "learning_rate": 5.346036361269117e-06, + "loss": 0.1427, + "step": 19094 + }, + { + "epoch": 0.4832097578257459, + "grad_norm": 3.1266376972198486, + "learning_rate": 5.345635800343969e-06, + "loss": 0.1248, + "step": 19095 + }, + { + "epoch": 0.48323506339043953, + "grad_norm": 7.994058609008789, + "learning_rate": 5.3452352371898784e-06, + "loss": 0.1726, + "step": 19096 + }, + { + "epoch": 0.48326036895513325, + "grad_norm": 5.4840006828308105, + "learning_rate": 5.344834671809432e-06, + "loss": 0.1648, + "step": 19097 + }, + { + "epoch": 0.4832856745198269, + "grad_norm": 4.747194766998291, + "learning_rate": 5.344434104205208e-06, + "loss": 0.2064, + "step": 19098 + }, + { + "epoch": 0.48331098008452056, + "grad_norm": 4.859586238861084, + "learning_rate": 5.344033534379793e-06, + "loss": 0.1341, + "step": 19099 + }, + { + "epoch": 0.48333628564921427, + "grad_norm": 5.600736618041992, + "learning_rate": 5.343632962335772e-06, + "loss": 0.2038, + "step": 19100 + }, + { + "epoch": 0.4833615912139079, + "grad_norm": 6.268443584442139, + "learning_rate": 5.343232388075724e-06, + "loss": 0.2393, + "step": 19101 + }, + { + "epoch": 0.48338689677860164, + "grad_norm": 5.979513645172119, + "learning_rate": 5.3428318116022356e-06, + "loss": 0.1578, + "step": 19102 + }, + { + "epoch": 0.4834122023432953, + "grad_norm": 5.1087212562561035, + "learning_rate": 5.342431232917889e-06, + "loss": 0.1432, + "step": 19103 + }, + { + "epoch": 0.48343750790798895, + "grad_norm": 3.31103515625, + "learning_rate": 5.342030652025267e-06, + "loss": 0.142, + "step": 19104 + }, + { + "epoch": 0.48346281347268266, + "grad_norm": 7.124085903167725, + "learning_rate": 5.341630068926951e-06, + "loss": 0.2609, + "step": 19105 + }, + { + "epoch": 0.4834881190373763, + "grad_norm": 4.794886112213135, + "learning_rate": 5.341229483625527e-06, + "loss": 0.1561, + "step": 19106 + }, + { + "epoch": 0.48351342460207, + "grad_norm": 5.998894691467285, + "learning_rate": 5.340828896123579e-06, + "loss": 0.2337, + "step": 19107 + }, + { + "epoch": 0.4835387301667637, + "grad_norm": 3.9744396209716797, + "learning_rate": 5.340428306423687e-06, + "loss": 0.0853, + "step": 19108 + }, + { + "epoch": 0.48356403573145734, + "grad_norm": 5.518787860870361, + "learning_rate": 5.340027714528438e-06, + "loss": 0.1926, + "step": 19109 + }, + { + "epoch": 0.483589341296151, + "grad_norm": 8.009552955627441, + "learning_rate": 5.339627120440411e-06, + "loss": 0.139, + "step": 19110 + }, + { + "epoch": 0.4836146468608447, + "grad_norm": 5.127096176147461, + "learning_rate": 5.339226524162195e-06, + "loss": 0.1597, + "step": 19111 + }, + { + "epoch": 0.48363995242553837, + "grad_norm": 4.87502908706665, + "learning_rate": 5.338825925696367e-06, + "loss": 0.1662, + "step": 19112 + }, + { + "epoch": 0.4836652579902321, + "grad_norm": 7.966646194458008, + "learning_rate": 5.338425325045515e-06, + "loss": 0.1652, + "step": 19113 + }, + { + "epoch": 0.48369056355492573, + "grad_norm": 4.3322014808654785, + "learning_rate": 5.338024722212222e-06, + "loss": 0.1326, + "step": 19114 + }, + { + "epoch": 0.4837158691196194, + "grad_norm": 5.6454386711120605, + "learning_rate": 5.337624117199069e-06, + "loss": 0.1804, + "step": 19115 + }, + { + "epoch": 0.4837411746843131, + "grad_norm": 11.136911392211914, + "learning_rate": 5.337223510008641e-06, + "loss": 0.3618, + "step": 19116 + }, + { + "epoch": 0.48376648024900676, + "grad_norm": 3.6911165714263916, + "learning_rate": 5.336822900643524e-06, + "loss": 0.1476, + "step": 19117 + }, + { + "epoch": 0.4837917858137004, + "grad_norm": 5.9563493728637695, + "learning_rate": 5.336422289106294e-06, + "loss": 0.1964, + "step": 19118 + }, + { + "epoch": 0.4838170913783941, + "grad_norm": 11.525778770446777, + "learning_rate": 5.336021675399543e-06, + "loss": 0.1744, + "step": 19119 + }, + { + "epoch": 0.4838423969430878, + "grad_norm": 9.264458656311035, + "learning_rate": 5.3356210595258495e-06, + "loss": 0.2789, + "step": 19120 + }, + { + "epoch": 0.48386770250778144, + "grad_norm": 3.6440300941467285, + "learning_rate": 5.335220441487797e-06, + "loss": 0.1234, + "step": 19121 + }, + { + "epoch": 0.48389300807247515, + "grad_norm": 5.933020114898682, + "learning_rate": 5.334819821287972e-06, + "loss": 0.1783, + "step": 19122 + }, + { + "epoch": 0.4839183136371688, + "grad_norm": 4.947238922119141, + "learning_rate": 5.334419198928956e-06, + "loss": 0.182, + "step": 19123 + }, + { + "epoch": 0.48394361920186246, + "grad_norm": 22.287437438964844, + "learning_rate": 5.334018574413333e-06, + "loss": 0.1903, + "step": 19124 + }, + { + "epoch": 0.4839689247665562, + "grad_norm": 4.4566426277160645, + "learning_rate": 5.333617947743686e-06, + "loss": 0.149, + "step": 19125 + }, + { + "epoch": 0.48399423033124983, + "grad_norm": 5.360538005828857, + "learning_rate": 5.333217318922599e-06, + "loss": 0.2436, + "step": 19126 + }, + { + "epoch": 0.48401953589594354, + "grad_norm": 17.252153396606445, + "learning_rate": 5.332816687952655e-06, + "loss": 0.3035, + "step": 19127 + }, + { + "epoch": 0.4840448414606372, + "grad_norm": 4.60029935836792, + "learning_rate": 5.332416054836438e-06, + "loss": 0.1639, + "step": 19128 + }, + { + "epoch": 0.48407014702533085, + "grad_norm": 3.3098394870758057, + "learning_rate": 5.332015419576531e-06, + "loss": 0.1486, + "step": 19129 + }, + { + "epoch": 0.48409545259002457, + "grad_norm": 11.263856887817383, + "learning_rate": 5.33161478217552e-06, + "loss": 0.2833, + "step": 19130 + }, + { + "epoch": 0.4841207581547182, + "grad_norm": 3.767939805984497, + "learning_rate": 5.331214142635987e-06, + "loss": 0.1132, + "step": 19131 + }, + { + "epoch": 0.4841460637194119, + "grad_norm": 3.6822094917297363, + "learning_rate": 5.330813500960514e-06, + "loss": 0.1268, + "step": 19132 + }, + { + "epoch": 0.4841713692841056, + "grad_norm": 4.605626583099365, + "learning_rate": 5.330412857151688e-06, + "loss": 0.2167, + "step": 19133 + }, + { + "epoch": 0.48419667484879925, + "grad_norm": 8.66714096069336, + "learning_rate": 5.33001221121209e-06, + "loss": 0.3863, + "step": 19134 + }, + { + "epoch": 0.4842219804134929, + "grad_norm": 3.257368803024292, + "learning_rate": 5.329611563144304e-06, + "loss": 0.1499, + "step": 19135 + }, + { + "epoch": 0.4842472859781866, + "grad_norm": 4.941524028778076, + "learning_rate": 5.329210912950916e-06, + "loss": 0.1403, + "step": 19136 + }, + { + "epoch": 0.48427259154288027, + "grad_norm": 5.180666923522949, + "learning_rate": 5.328810260634508e-06, + "loss": 0.1478, + "step": 19137 + }, + { + "epoch": 0.484297897107574, + "grad_norm": 8.40989875793457, + "learning_rate": 5.328409606197662e-06, + "loss": 0.2017, + "step": 19138 + }, + { + "epoch": 0.48432320267226764, + "grad_norm": 4.387956142425537, + "learning_rate": 5.328008949642966e-06, + "loss": 0.1478, + "step": 19139 + }, + { + "epoch": 0.4843485082369613, + "grad_norm": 4.523999214172363, + "learning_rate": 5.3276082909729984e-06, + "loss": 0.0968, + "step": 19140 + }, + { + "epoch": 0.484373813801655, + "grad_norm": 5.97154426574707, + "learning_rate": 5.327207630190347e-06, + "loss": 0.1385, + "step": 19141 + }, + { + "epoch": 0.48439911936634866, + "grad_norm": 6.128042697906494, + "learning_rate": 5.326806967297596e-06, + "loss": 0.2057, + "step": 19142 + }, + { + "epoch": 0.4844244249310423, + "grad_norm": 4.28710412979126, + "learning_rate": 5.326406302297325e-06, + "loss": 0.1448, + "step": 19143 + }, + { + "epoch": 0.48444973049573603, + "grad_norm": 4.129437446594238, + "learning_rate": 5.326005635192123e-06, + "loss": 0.1573, + "step": 19144 + }, + { + "epoch": 0.4844750360604297, + "grad_norm": 3.88047456741333, + "learning_rate": 5.3256049659845695e-06, + "loss": 0.1506, + "step": 19145 + }, + { + "epoch": 0.48450034162512334, + "grad_norm": 5.715331077575684, + "learning_rate": 5.325204294677251e-06, + "loss": 0.1245, + "step": 19146 + }, + { + "epoch": 0.48452564718981705, + "grad_norm": 4.987372875213623, + "learning_rate": 5.32480362127275e-06, + "loss": 0.2112, + "step": 19147 + }, + { + "epoch": 0.4845509527545107, + "grad_norm": 4.070266246795654, + "learning_rate": 5.324402945773652e-06, + "loss": 0.1953, + "step": 19148 + }, + { + "epoch": 0.48457625831920437, + "grad_norm": 5.41783332824707, + "learning_rate": 5.3240022681825385e-06, + "loss": 0.1606, + "step": 19149 + }, + { + "epoch": 0.4846015638838981, + "grad_norm": 4.378880500793457, + "learning_rate": 5.323601588501995e-06, + "loss": 0.1183, + "step": 19150 + }, + { + "epoch": 0.48462686944859173, + "grad_norm": 10.02487850189209, + "learning_rate": 5.323200906734604e-06, + "loss": 0.1848, + "step": 19151 + }, + { + "epoch": 0.48465217501328545, + "grad_norm": 4.156554222106934, + "learning_rate": 5.3228002228829525e-06, + "loss": 0.2123, + "step": 19152 + }, + { + "epoch": 0.4846774805779791, + "grad_norm": 5.894131183624268, + "learning_rate": 5.322399536949621e-06, + "loss": 0.2154, + "step": 19153 + }, + { + "epoch": 0.48470278614267276, + "grad_norm": 3.470278263092041, + "learning_rate": 5.3219988489371955e-06, + "loss": 0.1237, + "step": 19154 + }, + { + "epoch": 0.48472809170736647, + "grad_norm": 4.293227195739746, + "learning_rate": 5.32159815884826e-06, + "loss": 0.1904, + "step": 19155 + }, + { + "epoch": 0.4847533972720601, + "grad_norm": 5.395094394683838, + "learning_rate": 5.321197466685396e-06, + "loss": 0.2514, + "step": 19156 + }, + { + "epoch": 0.4847787028367538, + "grad_norm": 3.618487596511841, + "learning_rate": 5.320796772451191e-06, + "loss": 0.1501, + "step": 19157 + }, + { + "epoch": 0.4848040084014475, + "grad_norm": 6.599936485290527, + "learning_rate": 5.320396076148226e-06, + "loss": 0.2469, + "step": 19158 + }, + { + "epoch": 0.48482931396614115, + "grad_norm": 3.634038209915161, + "learning_rate": 5.319995377779087e-06, + "loss": 0.1963, + "step": 19159 + }, + { + "epoch": 0.4848546195308348, + "grad_norm": 4.023831844329834, + "learning_rate": 5.319594677346357e-06, + "loss": 0.1588, + "step": 19160 + }, + { + "epoch": 0.4848799250955285, + "grad_norm": 5.127782344818115, + "learning_rate": 5.319193974852621e-06, + "loss": 0.1721, + "step": 19161 + }, + { + "epoch": 0.4849052306602222, + "grad_norm": 6.053174018859863, + "learning_rate": 5.318793270300462e-06, + "loss": 0.2406, + "step": 19162 + }, + { + "epoch": 0.48493053622491583, + "grad_norm": 3.1695265769958496, + "learning_rate": 5.318392563692465e-06, + "loss": 0.185, + "step": 19163 + }, + { + "epoch": 0.48495584178960954, + "grad_norm": 4.423689842224121, + "learning_rate": 5.317991855031213e-06, + "loss": 0.1652, + "step": 19164 + }, + { + "epoch": 0.4849811473543032, + "grad_norm": 4.99392032623291, + "learning_rate": 5.3175911443192905e-06, + "loss": 0.2224, + "step": 19165 + }, + { + "epoch": 0.4850064529189969, + "grad_norm": 5.011122226715088, + "learning_rate": 5.3171904315592825e-06, + "loss": 0.1654, + "step": 19166 + }, + { + "epoch": 0.48503175848369057, + "grad_norm": 2.8340189456939697, + "learning_rate": 5.316789716753772e-06, + "loss": 0.1414, + "step": 19167 + }, + { + "epoch": 0.4850570640483842, + "grad_norm": 4.839725971221924, + "learning_rate": 5.316388999905343e-06, + "loss": 0.1838, + "step": 19168 + }, + { + "epoch": 0.48508236961307793, + "grad_norm": 4.299044609069824, + "learning_rate": 5.31598828101658e-06, + "loss": 0.1601, + "step": 19169 + }, + { + "epoch": 0.4851076751777716, + "grad_norm": 6.893726348876953, + "learning_rate": 5.315587560090069e-06, + "loss": 0.2232, + "step": 19170 + }, + { + "epoch": 0.48513298074246525, + "grad_norm": 6.084508419036865, + "learning_rate": 5.315186837128391e-06, + "loss": 0.2516, + "step": 19171 + }, + { + "epoch": 0.48515828630715896, + "grad_norm": 4.735083103179932, + "learning_rate": 5.3147861121341335e-06, + "loss": 0.1826, + "step": 19172 + }, + { + "epoch": 0.4851835918718526, + "grad_norm": 9.57999324798584, + "learning_rate": 5.314385385109877e-06, + "loss": 0.1581, + "step": 19173 + }, + { + "epoch": 0.48520889743654627, + "grad_norm": 2.6156201362609863, + "learning_rate": 5.313984656058208e-06, + "loss": 0.0866, + "step": 19174 + }, + { + "epoch": 0.48523420300124, + "grad_norm": 9.545805931091309, + "learning_rate": 5.313583924981711e-06, + "loss": 0.1705, + "step": 19175 + }, + { + "epoch": 0.48525950856593364, + "grad_norm": 5.247419834136963, + "learning_rate": 5.313183191882969e-06, + "loss": 0.1257, + "step": 19176 + }, + { + "epoch": 0.48528481413062735, + "grad_norm": 17.4737491607666, + "learning_rate": 5.312782456764567e-06, + "loss": 0.2307, + "step": 19177 + }, + { + "epoch": 0.485310119695321, + "grad_norm": 5.608828544616699, + "learning_rate": 5.312381719629089e-06, + "loss": 0.1724, + "step": 19178 + }, + { + "epoch": 0.48533542526001466, + "grad_norm": 2.0595881938934326, + "learning_rate": 5.31198098047912e-06, + "loss": 0.0681, + "step": 19179 + }, + { + "epoch": 0.4853607308247084, + "grad_norm": 9.091389656066895, + "learning_rate": 5.311580239317244e-06, + "loss": 0.2321, + "step": 19180 + }, + { + "epoch": 0.48538603638940203, + "grad_norm": 7.368200778961182, + "learning_rate": 5.311179496146044e-06, + "loss": 0.1707, + "step": 19181 + }, + { + "epoch": 0.4854113419540957, + "grad_norm": 5.27150821685791, + "learning_rate": 5.3107787509681055e-06, + "loss": 0.2059, + "step": 19182 + }, + { + "epoch": 0.4854366475187894, + "grad_norm": 8.59292221069336, + "learning_rate": 5.3103780037860135e-06, + "loss": 0.2858, + "step": 19183 + }, + { + "epoch": 0.48546195308348306, + "grad_norm": 9.808664321899414, + "learning_rate": 5.3099772546023495e-06, + "loss": 0.2676, + "step": 19184 + }, + { + "epoch": 0.4854872586481767, + "grad_norm": 3.8456039428710938, + "learning_rate": 5.309576503419701e-06, + "loss": 0.1656, + "step": 19185 + }, + { + "epoch": 0.4855125642128704, + "grad_norm": 4.573565483093262, + "learning_rate": 5.309175750240652e-06, + "loss": 0.1482, + "step": 19186 + }, + { + "epoch": 0.4855378697775641, + "grad_norm": 2.6587746143341064, + "learning_rate": 5.308774995067785e-06, + "loss": 0.1104, + "step": 19187 + }, + { + "epoch": 0.48556317534225774, + "grad_norm": 5.6583638191223145, + "learning_rate": 5.308374237903686e-06, + "loss": 0.2149, + "step": 19188 + }, + { + "epoch": 0.48558848090695145, + "grad_norm": 3.8857150077819824, + "learning_rate": 5.307973478750938e-06, + "loss": 0.1584, + "step": 19189 + }, + { + "epoch": 0.4856137864716451, + "grad_norm": 3.8419594764709473, + "learning_rate": 5.3075727176121285e-06, + "loss": 0.1713, + "step": 19190 + }, + { + "epoch": 0.4856390920363388, + "grad_norm": 11.425999641418457, + "learning_rate": 5.307171954489837e-06, + "loss": 0.2452, + "step": 19191 + }, + { + "epoch": 0.48566439760103247, + "grad_norm": 5.557638645172119, + "learning_rate": 5.306771189386654e-06, + "loss": 0.1604, + "step": 19192 + }, + { + "epoch": 0.4856897031657261, + "grad_norm": 3.930752754211426, + "learning_rate": 5.306370422305157e-06, + "loss": 0.1511, + "step": 19193 + }, + { + "epoch": 0.48571500873041984, + "grad_norm": 6.6143012046813965, + "learning_rate": 5.305969653247936e-06, + "loss": 0.1545, + "step": 19194 + }, + { + "epoch": 0.4857403142951135, + "grad_norm": 3.4651598930358887, + "learning_rate": 5.305568882217576e-06, + "loss": 0.1874, + "step": 19195 + }, + { + "epoch": 0.48576561985980715, + "grad_norm": 5.487849712371826, + "learning_rate": 5.305168109216655e-06, + "loss": 0.1466, + "step": 19196 + }, + { + "epoch": 0.48579092542450086, + "grad_norm": 5.692021369934082, + "learning_rate": 5.304767334247765e-06, + "loss": 0.2387, + "step": 19197 + }, + { + "epoch": 0.4858162309891945, + "grad_norm": 8.5118408203125, + "learning_rate": 5.304366557313484e-06, + "loss": 0.2001, + "step": 19198 + }, + { + "epoch": 0.4858415365538882, + "grad_norm": 4.9384870529174805, + "learning_rate": 5.303965778416401e-06, + "loss": 0.1723, + "step": 19199 + }, + { + "epoch": 0.4858668421185819, + "grad_norm": 2.7013132572174072, + "learning_rate": 5.3035649975591e-06, + "loss": 0.1328, + "step": 19200 + }, + { + "epoch": 0.48589214768327554, + "grad_norm": 3.4459121227264404, + "learning_rate": 5.3031642147441645e-06, + "loss": 0.1642, + "step": 19201 + }, + { + "epoch": 0.48591745324796926, + "grad_norm": 21.224586486816406, + "learning_rate": 5.302763429974178e-06, + "loss": 0.2593, + "step": 19202 + }, + { + "epoch": 0.4859427588126629, + "grad_norm": 3.4357073307037354, + "learning_rate": 5.302362643251728e-06, + "loss": 0.1326, + "step": 19203 + }, + { + "epoch": 0.48596806437735657, + "grad_norm": 5.3851518630981445, + "learning_rate": 5.301961854579397e-06, + "loss": 0.1838, + "step": 19204 + }, + { + "epoch": 0.4859933699420503, + "grad_norm": 4.6892476081848145, + "learning_rate": 5.301561063959769e-06, + "loss": 0.1984, + "step": 19205 + }, + { + "epoch": 0.48601867550674394, + "grad_norm": 6.652675628662109, + "learning_rate": 5.301160271395433e-06, + "loss": 0.1946, + "step": 19206 + }, + { + "epoch": 0.4860439810714376, + "grad_norm": 4.359813690185547, + "learning_rate": 5.300759476888967e-06, + "loss": 0.118, + "step": 19207 + }, + { + "epoch": 0.4860692866361313, + "grad_norm": 6.172244071960449, + "learning_rate": 5.300358680442961e-06, + "loss": 0.2415, + "step": 19208 + }, + { + "epoch": 0.48609459220082496, + "grad_norm": 4.9697957038879395, + "learning_rate": 5.2999578820599974e-06, + "loss": 0.1875, + "step": 19209 + }, + { + "epoch": 0.4861198977655186, + "grad_norm": 4.120348930358887, + "learning_rate": 5.299557081742661e-06, + "loss": 0.0818, + "step": 19210 + }, + { + "epoch": 0.4861452033302123, + "grad_norm": 10.342289924621582, + "learning_rate": 5.299156279493536e-06, + "loss": 0.1754, + "step": 19211 + }, + { + "epoch": 0.486170508894906, + "grad_norm": 4.313837051391602, + "learning_rate": 5.298755475315209e-06, + "loss": 0.1448, + "step": 19212 + }, + { + "epoch": 0.48619581445959964, + "grad_norm": 5.981358528137207, + "learning_rate": 5.298354669210263e-06, + "loss": 0.1954, + "step": 19213 + }, + { + "epoch": 0.48622112002429335, + "grad_norm": 5.914984703063965, + "learning_rate": 5.297953861181284e-06, + "loss": 0.2356, + "step": 19214 + }, + { + "epoch": 0.486246425588987, + "grad_norm": 4.963232517242432, + "learning_rate": 5.2975530512308545e-06, + "loss": 0.1647, + "step": 19215 + }, + { + "epoch": 0.4862717311536807, + "grad_norm": 2.658466100692749, + "learning_rate": 5.297152239361562e-06, + "loss": 0.1012, + "step": 19216 + }, + { + "epoch": 0.4862970367183744, + "grad_norm": 11.122313499450684, + "learning_rate": 5.2967514255759885e-06, + "loss": 0.2731, + "step": 19217 + }, + { + "epoch": 0.48632234228306803, + "grad_norm": 11.418301582336426, + "learning_rate": 5.296350609876721e-06, + "loss": 0.1946, + "step": 19218 + }, + { + "epoch": 0.48634764784776174, + "grad_norm": 3.012584686279297, + "learning_rate": 5.295949792266345e-06, + "loss": 0.1352, + "step": 19219 + }, + { + "epoch": 0.4863729534124554, + "grad_norm": 6.283451557159424, + "learning_rate": 5.295548972747442e-06, + "loss": 0.15, + "step": 19220 + }, + { + "epoch": 0.48639825897714906, + "grad_norm": 4.469119548797607, + "learning_rate": 5.2951481513226e-06, + "loss": 0.1416, + "step": 19221 + }, + { + "epoch": 0.48642356454184277, + "grad_norm": 4.295948505401611, + "learning_rate": 5.294747327994402e-06, + "loss": 0.1668, + "step": 19222 + }, + { + "epoch": 0.4864488701065364, + "grad_norm": 4.240654468536377, + "learning_rate": 5.294346502765433e-06, + "loss": 0.0932, + "step": 19223 + }, + { + "epoch": 0.4864741756712301, + "grad_norm": 3.715848445892334, + "learning_rate": 5.293945675638279e-06, + "loss": 0.2101, + "step": 19224 + }, + { + "epoch": 0.4864994812359238, + "grad_norm": 7.808820724487305, + "learning_rate": 5.293544846615522e-06, + "loss": 0.3208, + "step": 19225 + }, + { + "epoch": 0.48652478680061745, + "grad_norm": 4.489157199859619, + "learning_rate": 5.29314401569975e-06, + "loss": 0.1699, + "step": 19226 + }, + { + "epoch": 0.4865500923653111, + "grad_norm": 7.009937763214111, + "learning_rate": 5.292743182893547e-06, + "loss": 0.217, + "step": 19227 + }, + { + "epoch": 0.4865753979300048, + "grad_norm": 4.279334545135498, + "learning_rate": 5.292342348199498e-06, + "loss": 0.1722, + "step": 19228 + }, + { + "epoch": 0.48660070349469847, + "grad_norm": 6.5604681968688965, + "learning_rate": 5.291941511620187e-06, + "loss": 0.2337, + "step": 19229 + }, + { + "epoch": 0.4866260090593922, + "grad_norm": 5.20847225189209, + "learning_rate": 5.2915406731582e-06, + "loss": 0.1443, + "step": 19230 + }, + { + "epoch": 0.48665131462408584, + "grad_norm": 9.263228416442871, + "learning_rate": 5.291139832816121e-06, + "loss": 0.2774, + "step": 19231 + }, + { + "epoch": 0.4866766201887795, + "grad_norm": 7.323754787445068, + "learning_rate": 5.290738990596536e-06, + "loss": 0.2183, + "step": 19232 + }, + { + "epoch": 0.4867019257534732, + "grad_norm": 4.842527866363525, + "learning_rate": 5.290338146502027e-06, + "loss": 0.1513, + "step": 19233 + }, + { + "epoch": 0.48672723131816686, + "grad_norm": 3.8176064491271973, + "learning_rate": 5.289937300535185e-06, + "loss": 0.1858, + "step": 19234 + }, + { + "epoch": 0.4867525368828605, + "grad_norm": 12.16765308380127, + "learning_rate": 5.289536452698588e-06, + "loss": 0.1645, + "step": 19235 + }, + { + "epoch": 0.48677784244755423, + "grad_norm": 4.526797294616699, + "learning_rate": 5.289135602994827e-06, + "loss": 0.167, + "step": 19236 + }, + { + "epoch": 0.4868031480122479, + "grad_norm": 3.450587272644043, + "learning_rate": 5.288734751426482e-06, + "loss": 0.1614, + "step": 19237 + }, + { + "epoch": 0.48682845357694154, + "grad_norm": 6.444063663482666, + "learning_rate": 5.288333897996141e-06, + "loss": 0.1982, + "step": 19238 + }, + { + "epoch": 0.48685375914163526, + "grad_norm": 6.991861343383789, + "learning_rate": 5.287933042706389e-06, + "loss": 0.2148, + "step": 19239 + }, + { + "epoch": 0.4868790647063289, + "grad_norm": 4.248439788818359, + "learning_rate": 5.287532185559809e-06, + "loss": 0.1231, + "step": 19240 + }, + { + "epoch": 0.4869043702710226, + "grad_norm": 6.486315727233887, + "learning_rate": 5.287131326558989e-06, + "loss": 0.248, + "step": 19241 + }, + { + "epoch": 0.4869296758357163, + "grad_norm": 7.554478168487549, + "learning_rate": 5.2867304657065115e-06, + "loss": 0.2479, + "step": 19242 + }, + { + "epoch": 0.48695498140040994, + "grad_norm": 4.31505012512207, + "learning_rate": 5.286329603004964e-06, + "loss": 0.1216, + "step": 19243 + }, + { + "epoch": 0.48698028696510365, + "grad_norm": 6.66062068939209, + "learning_rate": 5.285928738456928e-06, + "loss": 0.2077, + "step": 19244 + }, + { + "epoch": 0.4870055925297973, + "grad_norm": 4.73246431350708, + "learning_rate": 5.285527872064993e-06, + "loss": 0.2113, + "step": 19245 + }, + { + "epoch": 0.48703089809449096, + "grad_norm": 5.053986072540283, + "learning_rate": 5.285127003831739e-06, + "loss": 0.155, + "step": 19246 + }, + { + "epoch": 0.48705620365918467, + "grad_norm": 4.669625759124756, + "learning_rate": 5.284726133759757e-06, + "loss": 0.1861, + "step": 19247 + }, + { + "epoch": 0.48708150922387833, + "grad_norm": 7.641376972198486, + "learning_rate": 5.284325261851627e-06, + "loss": 0.1672, + "step": 19248 + }, + { + "epoch": 0.487106814788572, + "grad_norm": 3.9705100059509277, + "learning_rate": 5.283924388109937e-06, + "loss": 0.1235, + "step": 19249 + }, + { + "epoch": 0.4871321203532657, + "grad_norm": 7.407955646514893, + "learning_rate": 5.283523512537272e-06, + "loss": 0.1941, + "step": 19250 + }, + { + "epoch": 0.48715742591795935, + "grad_norm": 4.8256120681762695, + "learning_rate": 5.283122635136217e-06, + "loss": 0.1272, + "step": 19251 + }, + { + "epoch": 0.487182731482653, + "grad_norm": 2.6080939769744873, + "learning_rate": 5.282721755909355e-06, + "loss": 0.1554, + "step": 19252 + }, + { + "epoch": 0.4872080370473467, + "grad_norm": 8.453429222106934, + "learning_rate": 5.282320874859274e-06, + "loss": 0.2134, + "step": 19253 + }, + { + "epoch": 0.4872333426120404, + "grad_norm": 3.2735583782196045, + "learning_rate": 5.281919991988558e-06, + "loss": 0.1222, + "step": 19254 + }, + { + "epoch": 0.4872586481767341, + "grad_norm": 11.27832317352295, + "learning_rate": 5.281519107299793e-06, + "loss": 0.2833, + "step": 19255 + }, + { + "epoch": 0.48728395374142774, + "grad_norm": 13.087156295776367, + "learning_rate": 5.281118220795564e-06, + "loss": 0.1853, + "step": 19256 + }, + { + "epoch": 0.4873092593061214, + "grad_norm": 3.4989678859710693, + "learning_rate": 5.280717332478453e-06, + "loss": 0.1318, + "step": 19257 + }, + { + "epoch": 0.4873345648708151, + "grad_norm": 9.243215560913086, + "learning_rate": 5.280316442351051e-06, + "loss": 0.2197, + "step": 19258 + }, + { + "epoch": 0.48735987043550877, + "grad_norm": 3.0003631114959717, + "learning_rate": 5.27991555041594e-06, + "loss": 0.1427, + "step": 19259 + }, + { + "epoch": 0.4873851760002024, + "grad_norm": 6.242697238922119, + "learning_rate": 5.279514656675704e-06, + "loss": 0.1851, + "step": 19260 + }, + { + "epoch": 0.48741048156489614, + "grad_norm": 3.8350465297698975, + "learning_rate": 5.279113761132931e-06, + "loss": 0.1837, + "step": 19261 + }, + { + "epoch": 0.4874357871295898, + "grad_norm": 5.5622429847717285, + "learning_rate": 5.278712863790205e-06, + "loss": 0.2009, + "step": 19262 + }, + { + "epoch": 0.48746109269428345, + "grad_norm": 5.915272235870361, + "learning_rate": 5.278311964650114e-06, + "loss": 0.1774, + "step": 19263 + }, + { + "epoch": 0.48748639825897716, + "grad_norm": 5.115973472595215, + "learning_rate": 5.277911063715238e-06, + "loss": 0.1928, + "step": 19264 + }, + { + "epoch": 0.4875117038236708, + "grad_norm": 4.285604000091553, + "learning_rate": 5.277510160988166e-06, + "loss": 0.1965, + "step": 19265 + }, + { + "epoch": 0.48753700938836453, + "grad_norm": 3.6271684169769287, + "learning_rate": 5.2771092564714835e-06, + "loss": 0.1461, + "step": 19266 + }, + { + "epoch": 0.4875623149530582, + "grad_norm": 13.534977912902832, + "learning_rate": 5.276708350167774e-06, + "loss": 0.3209, + "step": 19267 + }, + { + "epoch": 0.48758762051775184, + "grad_norm": 7.404343128204346, + "learning_rate": 5.276307442079623e-06, + "loss": 0.2654, + "step": 19268 + }, + { + "epoch": 0.48761292608244555, + "grad_norm": 6.288986682891846, + "learning_rate": 5.275906532209618e-06, + "loss": 0.2301, + "step": 19269 + }, + { + "epoch": 0.4876382316471392, + "grad_norm": 9.067008018493652, + "learning_rate": 5.275505620560344e-06, + "loss": 0.1861, + "step": 19270 + }, + { + "epoch": 0.48766353721183286, + "grad_norm": 3.765688180923462, + "learning_rate": 5.275104707134383e-06, + "loss": 0.1632, + "step": 19271 + }, + { + "epoch": 0.4876888427765266, + "grad_norm": 3.3895158767700195, + "learning_rate": 5.274703791934324e-06, + "loss": 0.1677, + "step": 19272 + }, + { + "epoch": 0.48771414834122023, + "grad_norm": 7.9079155921936035, + "learning_rate": 5.2743028749627526e-06, + "loss": 0.2139, + "step": 19273 + }, + { + "epoch": 0.4877394539059139, + "grad_norm": 3.961132526397705, + "learning_rate": 5.2739019562222525e-06, + "loss": 0.2217, + "step": 19274 + }, + { + "epoch": 0.4877647594706076, + "grad_norm": 6.15878963470459, + "learning_rate": 5.273501035715408e-06, + "loss": 0.145, + "step": 19275 + }, + { + "epoch": 0.48779006503530126, + "grad_norm": 20.80598258972168, + "learning_rate": 5.273100113444808e-06, + "loss": 0.2742, + "step": 19276 + }, + { + "epoch": 0.4878153705999949, + "grad_norm": 4.384189605712891, + "learning_rate": 5.272699189413035e-06, + "loss": 0.186, + "step": 19277 + }, + { + "epoch": 0.4878406761646886, + "grad_norm": 3.0438168048858643, + "learning_rate": 5.272298263622677e-06, + "loss": 0.1677, + "step": 19278 + }, + { + "epoch": 0.4878659817293823, + "grad_norm": 4.947996139526367, + "learning_rate": 5.271897336076317e-06, + "loss": 0.1881, + "step": 19279 + }, + { + "epoch": 0.487891287294076, + "grad_norm": 2.8246984481811523, + "learning_rate": 5.271496406776543e-06, + "loss": 0.1345, + "step": 19280 + }, + { + "epoch": 0.48791659285876965, + "grad_norm": 4.648330211639404, + "learning_rate": 5.271095475725937e-06, + "loss": 0.1916, + "step": 19281 + }, + { + "epoch": 0.4879418984234633, + "grad_norm": 15.943114280700684, + "learning_rate": 5.270694542927089e-06, + "loss": 0.3277, + "step": 19282 + }, + { + "epoch": 0.487967203988157, + "grad_norm": 4.3592987060546875, + "learning_rate": 5.270293608382581e-06, + "loss": 0.1476, + "step": 19283 + }, + { + "epoch": 0.4879925095528507, + "grad_norm": 2.691626787185669, + "learning_rate": 5.269892672095e-06, + "loss": 0.1353, + "step": 19284 + }, + { + "epoch": 0.48801781511754433, + "grad_norm": 3.409712076187134, + "learning_rate": 5.269491734066932e-06, + "loss": 0.1388, + "step": 19285 + }, + { + "epoch": 0.48804312068223804, + "grad_norm": 5.232919216156006, + "learning_rate": 5.26909079430096e-06, + "loss": 0.1628, + "step": 19286 + }, + { + "epoch": 0.4880684262469317, + "grad_norm": 4.054104804992676, + "learning_rate": 5.268689852799673e-06, + "loss": 0.0988, + "step": 19287 + }, + { + "epoch": 0.48809373181162535, + "grad_norm": 7.1498308181762695, + "learning_rate": 5.268288909565654e-06, + "loss": 0.2189, + "step": 19288 + }, + { + "epoch": 0.48811903737631906, + "grad_norm": 2.9990079402923584, + "learning_rate": 5.267887964601492e-06, + "loss": 0.1532, + "step": 19289 + }, + { + "epoch": 0.4881443429410127, + "grad_norm": 4.357763290405273, + "learning_rate": 5.267487017909768e-06, + "loss": 0.1132, + "step": 19290 + }, + { + "epoch": 0.4881696485057064, + "grad_norm": 3.1338698863983154, + "learning_rate": 5.267086069493071e-06, + "loss": 0.1168, + "step": 19291 + }, + { + "epoch": 0.4881949540704001, + "grad_norm": 10.555329322814941, + "learning_rate": 5.266685119353986e-06, + "loss": 0.1782, + "step": 19292 + }, + { + "epoch": 0.48822025963509375, + "grad_norm": 4.336599826812744, + "learning_rate": 5.266284167495097e-06, + "loss": 0.1094, + "step": 19293 + }, + { + "epoch": 0.48824556519978746, + "grad_norm": 3.608440399169922, + "learning_rate": 5.265883213918992e-06, + "loss": 0.1132, + "step": 19294 + }, + { + "epoch": 0.4882708707644811, + "grad_norm": 2.933377504348755, + "learning_rate": 5.265482258628255e-06, + "loss": 0.0533, + "step": 19295 + }, + { + "epoch": 0.48829617632917477, + "grad_norm": 4.4570512771606445, + "learning_rate": 5.265081301625473e-06, + "loss": 0.2412, + "step": 19296 + }, + { + "epoch": 0.4883214818938685, + "grad_norm": 4.446972846984863, + "learning_rate": 5.26468034291323e-06, + "loss": 0.1869, + "step": 19297 + }, + { + "epoch": 0.48834678745856214, + "grad_norm": 3.7196755409240723, + "learning_rate": 5.264279382494114e-06, + "loss": 0.101, + "step": 19298 + }, + { + "epoch": 0.4883720930232558, + "grad_norm": 6.627673149108887, + "learning_rate": 5.263878420370708e-06, + "loss": 0.1735, + "step": 19299 + }, + { + "epoch": 0.4883973985879495, + "grad_norm": 8.496928215026855, + "learning_rate": 5.2634774565456e-06, + "loss": 0.2693, + "step": 19300 + }, + { + "epoch": 0.48842270415264316, + "grad_norm": 5.301297664642334, + "learning_rate": 5.263076491021375e-06, + "loss": 0.1869, + "step": 19301 + }, + { + "epoch": 0.4884480097173368, + "grad_norm": 6.098208427429199, + "learning_rate": 5.262675523800618e-06, + "loss": 0.2003, + "step": 19302 + }, + { + "epoch": 0.48847331528203053, + "grad_norm": 4.381221294403076, + "learning_rate": 5.262274554885917e-06, + "loss": 0.1645, + "step": 19303 + }, + { + "epoch": 0.4884986208467242, + "grad_norm": 8.45553970336914, + "learning_rate": 5.261873584279854e-06, + "loss": 0.2081, + "step": 19304 + }, + { + "epoch": 0.4885239264114179, + "grad_norm": 5.395224571228027, + "learning_rate": 5.261472611985019e-06, + "loss": 0.184, + "step": 19305 + }, + { + "epoch": 0.48854923197611155, + "grad_norm": 5.626465320587158, + "learning_rate": 5.261071638003994e-06, + "loss": 0.145, + "step": 19306 + }, + { + "epoch": 0.4885745375408052, + "grad_norm": 5.186275482177734, + "learning_rate": 5.260670662339368e-06, + "loss": 0.249, + "step": 19307 + }, + { + "epoch": 0.4885998431054989, + "grad_norm": 3.92366099357605, + "learning_rate": 5.260269684993724e-06, + "loss": 0.1517, + "step": 19308 + }, + { + "epoch": 0.4886251486701926, + "grad_norm": 6.300449848175049, + "learning_rate": 5.259868705969651e-06, + "loss": 0.1343, + "step": 19309 + }, + { + "epoch": 0.48865045423488623, + "grad_norm": 10.741199493408203, + "learning_rate": 5.259467725269732e-06, + "loss": 0.2728, + "step": 19310 + }, + { + "epoch": 0.48867575979957995, + "grad_norm": 6.867359638214111, + "learning_rate": 5.259066742896554e-06, + "loss": 0.3301, + "step": 19311 + }, + { + "epoch": 0.4887010653642736, + "grad_norm": 6.3623833656311035, + "learning_rate": 5.258665758852702e-06, + "loss": 0.2436, + "step": 19312 + }, + { + "epoch": 0.48872637092896726, + "grad_norm": 5.630322456359863, + "learning_rate": 5.2582647731407635e-06, + "loss": 0.1611, + "step": 19313 + }, + { + "epoch": 0.48875167649366097, + "grad_norm": 12.374977111816406, + "learning_rate": 5.257863785763324e-06, + "loss": 0.2637, + "step": 19314 + }, + { + "epoch": 0.4887769820583546, + "grad_norm": 3.351109504699707, + "learning_rate": 5.257462796722967e-06, + "loss": 0.0909, + "step": 19315 + }, + { + "epoch": 0.4888022876230483, + "grad_norm": 10.818927764892578, + "learning_rate": 5.257061806022282e-06, + "loss": 0.1818, + "step": 19316 + }, + { + "epoch": 0.488827593187742, + "grad_norm": 2.5567431449890137, + "learning_rate": 5.256660813663853e-06, + "loss": 0.1291, + "step": 19317 + }, + { + "epoch": 0.48885289875243565, + "grad_norm": 7.438841342926025, + "learning_rate": 5.256259819650266e-06, + "loss": 0.1997, + "step": 19318 + }, + { + "epoch": 0.48887820431712936, + "grad_norm": 4.723237991333008, + "learning_rate": 5.255858823984107e-06, + "loss": 0.1716, + "step": 19319 + }, + { + "epoch": 0.488903509881823, + "grad_norm": 18.018590927124023, + "learning_rate": 5.255457826667963e-06, + "loss": 0.2333, + "step": 19320 + }, + { + "epoch": 0.4889288154465167, + "grad_norm": 14.025357246398926, + "learning_rate": 5.255056827704416e-06, + "loss": 0.2351, + "step": 19321 + }, + { + "epoch": 0.4889541210112104, + "grad_norm": 5.538125514984131, + "learning_rate": 5.254655827096059e-06, + "loss": 0.2266, + "step": 19322 + }, + { + "epoch": 0.48897942657590404, + "grad_norm": 4.814952850341797, + "learning_rate": 5.2542548248454704e-06, + "loss": 0.1825, + "step": 19323 + }, + { + "epoch": 0.4890047321405977, + "grad_norm": 6.882216453552246, + "learning_rate": 5.253853820955242e-06, + "loss": 0.1918, + "step": 19324 + }, + { + "epoch": 0.4890300377052914, + "grad_norm": 7.760154724121094, + "learning_rate": 5.253452815427956e-06, + "loss": 0.2235, + "step": 19325 + }, + { + "epoch": 0.48905534326998507, + "grad_norm": 3.7260541915893555, + "learning_rate": 5.253051808266201e-06, + "loss": 0.1738, + "step": 19326 + }, + { + "epoch": 0.4890806488346787, + "grad_norm": 3.7674660682678223, + "learning_rate": 5.252650799472563e-06, + "loss": 0.2154, + "step": 19327 + }, + { + "epoch": 0.48910595439937243, + "grad_norm": 3.3693089485168457, + "learning_rate": 5.252249789049625e-06, + "loss": 0.2272, + "step": 19328 + }, + { + "epoch": 0.4891312599640661, + "grad_norm": 3.3066277503967285, + "learning_rate": 5.251848776999976e-06, + "loss": 0.151, + "step": 19329 + }, + { + "epoch": 0.4891565655287598, + "grad_norm": 3.545708656311035, + "learning_rate": 5.251447763326199e-06, + "loss": 0.2333, + "step": 19330 + }, + { + "epoch": 0.48918187109345346, + "grad_norm": 5.067492485046387, + "learning_rate": 5.251046748030886e-06, + "loss": 0.1153, + "step": 19331 + }, + { + "epoch": 0.4892071766581471, + "grad_norm": 3.3742737770080566, + "learning_rate": 5.250645731116616e-06, + "loss": 0.1322, + "step": 19332 + }, + { + "epoch": 0.4892324822228408, + "grad_norm": 2.531259536743164, + "learning_rate": 5.250244712585979e-06, + "loss": 0.1375, + "step": 19333 + }, + { + "epoch": 0.4892577877875345, + "grad_norm": 2.182061195373535, + "learning_rate": 5.249843692441562e-06, + "loss": 0.1189, + "step": 19334 + }, + { + "epoch": 0.48928309335222814, + "grad_norm": 3.9061553478240967, + "learning_rate": 5.249442670685948e-06, + "loss": 0.1283, + "step": 19335 + }, + { + "epoch": 0.48930839891692185, + "grad_norm": 3.9286372661590576, + "learning_rate": 5.249041647321725e-06, + "loss": 0.1294, + "step": 19336 + }, + { + "epoch": 0.4893337044816155, + "grad_norm": 4.00627326965332, + "learning_rate": 5.248640622351478e-06, + "loss": 0.1532, + "step": 19337 + }, + { + "epoch": 0.48935901004630916, + "grad_norm": 4.607030391693115, + "learning_rate": 5.248239595777795e-06, + "loss": 0.1556, + "step": 19338 + }, + { + "epoch": 0.4893843156110029, + "grad_norm": 4.961697578430176, + "learning_rate": 5.247838567603262e-06, + "loss": 0.1217, + "step": 19339 + }, + { + "epoch": 0.48940962117569653, + "grad_norm": 4.571256160736084, + "learning_rate": 5.247437537830462e-06, + "loss": 0.2368, + "step": 19340 + }, + { + "epoch": 0.4894349267403902, + "grad_norm": 9.312481880187988, + "learning_rate": 5.2470365064619845e-06, + "loss": 0.3047, + "step": 19341 + }, + { + "epoch": 0.4894602323050839, + "grad_norm": 3.329775333404541, + "learning_rate": 5.2466354735004145e-06, + "loss": 0.1362, + "step": 19342 + }, + { + "epoch": 0.48948553786977755, + "grad_norm": 12.895649909973145, + "learning_rate": 5.246234438948338e-06, + "loss": 0.1845, + "step": 19343 + }, + { + "epoch": 0.48951084343447127, + "grad_norm": 5.030027389526367, + "learning_rate": 5.245833402808341e-06, + "loss": 0.1816, + "step": 19344 + }, + { + "epoch": 0.4895361489991649, + "grad_norm": 3.6249659061431885, + "learning_rate": 5.245432365083011e-06, + "loss": 0.1304, + "step": 19345 + }, + { + "epoch": 0.4895614545638586, + "grad_norm": 6.094551086425781, + "learning_rate": 5.245031325774932e-06, + "loss": 0.1432, + "step": 19346 + }, + { + "epoch": 0.4895867601285523, + "grad_norm": 7.276956558227539, + "learning_rate": 5.244630284886693e-06, + "loss": 0.1651, + "step": 19347 + }, + { + "epoch": 0.48961206569324595, + "grad_norm": 4.978119373321533, + "learning_rate": 5.244229242420878e-06, + "loss": 0.2136, + "step": 19348 + }, + { + "epoch": 0.4896373712579396, + "grad_norm": 4.201676368713379, + "learning_rate": 5.243828198380074e-06, + "loss": 0.1606, + "step": 19349 + }, + { + "epoch": 0.4896626768226333, + "grad_norm": 3.7529611587524414, + "learning_rate": 5.243427152766867e-06, + "loss": 0.16, + "step": 19350 + }, + { + "epoch": 0.48968798238732697, + "grad_norm": 6.748845100402832, + "learning_rate": 5.243026105583845e-06, + "loss": 0.2596, + "step": 19351 + }, + { + "epoch": 0.4897132879520206, + "grad_norm": 2.7900876998901367, + "learning_rate": 5.242625056833591e-06, + "loss": 0.1555, + "step": 19352 + }, + { + "epoch": 0.48973859351671434, + "grad_norm": 5.138192176818848, + "learning_rate": 5.242224006518695e-06, + "loss": 0.1833, + "step": 19353 + }, + { + "epoch": 0.489763899081408, + "grad_norm": 4.722990036010742, + "learning_rate": 5.24182295464174e-06, + "loss": 0.188, + "step": 19354 + }, + { + "epoch": 0.48978920464610165, + "grad_norm": 10.44567584991455, + "learning_rate": 5.241421901205313e-06, + "loss": 0.2976, + "step": 19355 + }, + { + "epoch": 0.48981451021079536, + "grad_norm": 9.180667877197266, + "learning_rate": 5.241020846212003e-06, + "loss": 0.2124, + "step": 19356 + }, + { + "epoch": 0.489839815775489, + "grad_norm": 3.9200515747070312, + "learning_rate": 5.240619789664393e-06, + "loss": 0.1571, + "step": 19357 + }, + { + "epoch": 0.48986512134018273, + "grad_norm": 4.396799564361572, + "learning_rate": 5.240218731565072e-06, + "loss": 0.1342, + "step": 19358 + }, + { + "epoch": 0.4898904269048764, + "grad_norm": 7.149773120880127, + "learning_rate": 5.239817671916624e-06, + "loss": 0.2418, + "step": 19359 + }, + { + "epoch": 0.48991573246957004, + "grad_norm": 5.383970737457275, + "learning_rate": 5.239416610721638e-06, + "loss": 0.1508, + "step": 19360 + }, + { + "epoch": 0.48994103803426375, + "grad_norm": 3.4862589836120605, + "learning_rate": 5.239015547982698e-06, + "loss": 0.1217, + "step": 19361 + }, + { + "epoch": 0.4899663435989574, + "grad_norm": 26.892215728759766, + "learning_rate": 5.238614483702391e-06, + "loss": 0.2663, + "step": 19362 + }, + { + "epoch": 0.48999164916365107, + "grad_norm": 4.3565144538879395, + "learning_rate": 5.238213417883302e-06, + "loss": 0.1644, + "step": 19363 + }, + { + "epoch": 0.4900169547283448, + "grad_norm": 4.239785671234131, + "learning_rate": 5.237812350528022e-06, + "loss": 0.1412, + "step": 19364 + }, + { + "epoch": 0.49004226029303843, + "grad_norm": 4.944087028503418, + "learning_rate": 5.237411281639132e-06, + "loss": 0.2104, + "step": 19365 + }, + { + "epoch": 0.4900675658577321, + "grad_norm": 5.09480619430542, + "learning_rate": 5.237010211219221e-06, + "loss": 0.1715, + "step": 19366 + }, + { + "epoch": 0.4900928714224258, + "grad_norm": 3.6359734535217285, + "learning_rate": 5.236609139270876e-06, + "loss": 0.1629, + "step": 19367 + }, + { + "epoch": 0.49011817698711946, + "grad_norm": 5.356917858123779, + "learning_rate": 5.236208065796684e-06, + "loss": 0.1657, + "step": 19368 + }, + { + "epoch": 0.49014348255181317, + "grad_norm": 4.130167007446289, + "learning_rate": 5.235806990799228e-06, + "loss": 0.116, + "step": 19369 + }, + { + "epoch": 0.4901687881165068, + "grad_norm": 4.350894927978516, + "learning_rate": 5.235405914281097e-06, + "loss": 0.1077, + "step": 19370 + }, + { + "epoch": 0.4901940936812005, + "grad_norm": 9.221546173095703, + "learning_rate": 5.235004836244878e-06, + "loss": 0.3438, + "step": 19371 + }, + { + "epoch": 0.4902193992458942, + "grad_norm": 3.38907527923584, + "learning_rate": 5.234603756693156e-06, + "loss": 0.0851, + "step": 19372 + }, + { + "epoch": 0.49024470481058785, + "grad_norm": 3.122985363006592, + "learning_rate": 5.234202675628519e-06, + "loss": 0.2015, + "step": 19373 + }, + { + "epoch": 0.4902700103752815, + "grad_norm": 3.6438467502593994, + "learning_rate": 5.23380159305355e-06, + "loss": 0.156, + "step": 19374 + }, + { + "epoch": 0.4902953159399752, + "grad_norm": 2.9947800636291504, + "learning_rate": 5.233400508970841e-06, + "loss": 0.127, + "step": 19375 + }, + { + "epoch": 0.4903206215046689, + "grad_norm": 8.229227066040039, + "learning_rate": 5.232999423382973e-06, + "loss": 0.1514, + "step": 19376 + }, + { + "epoch": 0.49034592706936253, + "grad_norm": 6.035580635070801, + "learning_rate": 5.232598336292536e-06, + "loss": 0.2532, + "step": 19377 + }, + { + "epoch": 0.49037123263405624, + "grad_norm": 6.307799339294434, + "learning_rate": 5.232197247702115e-06, + "loss": 0.2546, + "step": 19378 + }, + { + "epoch": 0.4903965381987499, + "grad_norm": 5.26942777633667, + "learning_rate": 5.231796157614299e-06, + "loss": 0.2152, + "step": 19379 + }, + { + "epoch": 0.49042184376344355, + "grad_norm": 3.8411030769348145, + "learning_rate": 5.2313950660316715e-06, + "loss": 0.1458, + "step": 19380 + }, + { + "epoch": 0.49044714932813727, + "grad_norm": 4.982546329498291, + "learning_rate": 5.2309939729568196e-06, + "loss": 0.1992, + "step": 19381 + }, + { + "epoch": 0.4904724548928309, + "grad_norm": 3.5613596439361572, + "learning_rate": 5.2305928783923306e-06, + "loss": 0.1854, + "step": 19382 + }, + { + "epoch": 0.49049776045752463, + "grad_norm": 16.156021118164062, + "learning_rate": 5.230191782340792e-06, + "loss": 0.3096, + "step": 19383 + }, + { + "epoch": 0.4905230660222183, + "grad_norm": 2.621119499206543, + "learning_rate": 5.22979068480479e-06, + "loss": 0.1384, + "step": 19384 + }, + { + "epoch": 0.49054837158691195, + "grad_norm": 13.187853813171387, + "learning_rate": 5.229389585786908e-06, + "loss": 0.2923, + "step": 19385 + }, + { + "epoch": 0.49057367715160566, + "grad_norm": 7.954916477203369, + "learning_rate": 5.228988485289738e-06, + "loss": 0.3024, + "step": 19386 + }, + { + "epoch": 0.4905989827162993, + "grad_norm": 5.4895782470703125, + "learning_rate": 5.228587383315862e-06, + "loss": 0.1768, + "step": 19387 + }, + { + "epoch": 0.49062428828099297, + "grad_norm": 3.905609607696533, + "learning_rate": 5.228186279867868e-06, + "loss": 0.1759, + "step": 19388 + }, + { + "epoch": 0.4906495938456867, + "grad_norm": 3.8986868858337402, + "learning_rate": 5.227785174948346e-06, + "loss": 0.0928, + "step": 19389 + }, + { + "epoch": 0.49067489941038034, + "grad_norm": 14.082375526428223, + "learning_rate": 5.2273840685598764e-06, + "loss": 0.2881, + "step": 19390 + }, + { + "epoch": 0.490700204975074, + "grad_norm": 9.576163291931152, + "learning_rate": 5.226982960705052e-06, + "loss": 0.2317, + "step": 19391 + }, + { + "epoch": 0.4907255105397677, + "grad_norm": 4.481232166290283, + "learning_rate": 5.226581851386455e-06, + "loss": 0.1773, + "step": 19392 + }, + { + "epoch": 0.49075081610446136, + "grad_norm": 3.382965564727783, + "learning_rate": 5.2261807406066755e-06, + "loss": 0.1656, + "step": 19393 + }, + { + "epoch": 0.4907761216691551, + "grad_norm": 5.8195672035217285, + "learning_rate": 5.225779628368296e-06, + "loss": 0.2212, + "step": 19394 + }, + { + "epoch": 0.49080142723384873, + "grad_norm": 9.087651252746582, + "learning_rate": 5.225378514673908e-06, + "loss": 0.2162, + "step": 19395 + }, + { + "epoch": 0.4908267327985424, + "grad_norm": 7.67454195022583, + "learning_rate": 5.2249773995260945e-06, + "loss": 0.1685, + "step": 19396 + }, + { + "epoch": 0.4908520383632361, + "grad_norm": 5.457301616668701, + "learning_rate": 5.224576282927445e-06, + "loss": 0.1955, + "step": 19397 + }, + { + "epoch": 0.49087734392792975, + "grad_norm": 10.136857032775879, + "learning_rate": 5.224175164880542e-06, + "loss": 0.1898, + "step": 19398 + }, + { + "epoch": 0.4909026494926234, + "grad_norm": 7.729368686676025, + "learning_rate": 5.223774045387978e-06, + "loss": 0.1823, + "step": 19399 + }, + { + "epoch": 0.4909279550573171, + "grad_norm": 5.0237579345703125, + "learning_rate": 5.223372924452337e-06, + "loss": 0.2184, + "step": 19400 + }, + { + "epoch": 0.4909532606220108, + "grad_norm": 6.323643684387207, + "learning_rate": 5.222971802076203e-06, + "loss": 0.2334, + "step": 19401 + }, + { + "epoch": 0.49097856618670443, + "grad_norm": 4.912126064300537, + "learning_rate": 5.222570678262168e-06, + "loss": 0.1057, + "step": 19402 + }, + { + "epoch": 0.49100387175139815, + "grad_norm": 2.994443893432617, + "learning_rate": 5.222169553012814e-06, + "loss": 0.128, + "step": 19403 + }, + { + "epoch": 0.4910291773160918, + "grad_norm": 13.967985153198242, + "learning_rate": 5.221768426330731e-06, + "loss": 0.1783, + "step": 19404 + }, + { + "epoch": 0.49105448288078546, + "grad_norm": 3.9349277019500732, + "learning_rate": 5.2213672982185034e-06, + "loss": 0.1994, + "step": 19405 + }, + { + "epoch": 0.49107978844547917, + "grad_norm": 3.5036301612854004, + "learning_rate": 5.220966168678721e-06, + "loss": 0.1158, + "step": 19406 + }, + { + "epoch": 0.4911050940101728, + "grad_norm": 5.691927909851074, + "learning_rate": 5.220565037713967e-06, + "loss": 0.2316, + "step": 19407 + }, + { + "epoch": 0.49113039957486654, + "grad_norm": 3.7617275714874268, + "learning_rate": 5.220163905326831e-06, + "loss": 0.152, + "step": 19408 + }, + { + "epoch": 0.4911557051395602, + "grad_norm": 2.730534791946411, + "learning_rate": 5.2197627715199e-06, + "loss": 0.115, + "step": 19409 + }, + { + "epoch": 0.49118101070425385, + "grad_norm": 5.6970415115356445, + "learning_rate": 5.219361636295757e-06, + "loss": 0.1277, + "step": 19410 + }, + { + "epoch": 0.49120631626894756, + "grad_norm": 4.0190558433532715, + "learning_rate": 5.2189604996569955e-06, + "loss": 0.2062, + "step": 19411 + }, + { + "epoch": 0.4912316218336412, + "grad_norm": 6.899020671844482, + "learning_rate": 5.218559361606196e-06, + "loss": 0.1829, + "step": 19412 + }, + { + "epoch": 0.4912569273983349, + "grad_norm": 3.1103272438049316, + "learning_rate": 5.218158222145948e-06, + "loss": 0.1348, + "step": 19413 + }, + { + "epoch": 0.4912822329630286, + "grad_norm": 3.7978932857513428, + "learning_rate": 5.217757081278838e-06, + "loss": 0.1437, + "step": 19414 + }, + { + "epoch": 0.49130753852772224, + "grad_norm": 5.1567606925964355, + "learning_rate": 5.2173559390074545e-06, + "loss": 0.2256, + "step": 19415 + }, + { + "epoch": 0.4913328440924159, + "grad_norm": 3.2749080657958984, + "learning_rate": 5.216954795334381e-06, + "loss": 0.1242, + "step": 19416 + }, + { + "epoch": 0.4913581496571096, + "grad_norm": 8.037725448608398, + "learning_rate": 5.216553650262208e-06, + "loss": 0.2045, + "step": 19417 + }, + { + "epoch": 0.49138345522180327, + "grad_norm": 4.337902069091797, + "learning_rate": 5.21615250379352e-06, + "loss": 0.1837, + "step": 19418 + }, + { + "epoch": 0.4914087607864969, + "grad_norm": 24.122112274169922, + "learning_rate": 5.215751355930904e-06, + "loss": 0.5041, + "step": 19419 + }, + { + "epoch": 0.49143406635119063, + "grad_norm": 4.933720588684082, + "learning_rate": 5.215350206676949e-06, + "loss": 0.1558, + "step": 19420 + }, + { + "epoch": 0.4914593719158843, + "grad_norm": 3.430840253829956, + "learning_rate": 5.2149490560342395e-06, + "loss": 0.1365, + "step": 19421 + }, + { + "epoch": 0.491484677480578, + "grad_norm": 4.003606796264648, + "learning_rate": 5.214547904005365e-06, + "loss": 0.1161, + "step": 19422 + }, + { + "epoch": 0.49150998304527166, + "grad_norm": 17.87809181213379, + "learning_rate": 5.21414675059291e-06, + "loss": 0.1797, + "step": 19423 + }, + { + "epoch": 0.4915352886099653, + "grad_norm": 6.578973770141602, + "learning_rate": 5.213745595799463e-06, + "loss": 0.1593, + "step": 19424 + }, + { + "epoch": 0.491560594174659, + "grad_norm": 3.6698224544525146, + "learning_rate": 5.213344439627609e-06, + "loss": 0.1664, + "step": 19425 + }, + { + "epoch": 0.4915858997393527, + "grad_norm": 3.5904014110565186, + "learning_rate": 5.2129432820799385e-06, + "loss": 0.1989, + "step": 19426 + }, + { + "epoch": 0.49161120530404634, + "grad_norm": 6.024137496948242, + "learning_rate": 5.212542123159035e-06, + "loss": 0.1307, + "step": 19427 + }, + { + "epoch": 0.49163651086874005, + "grad_norm": 3.5972988605499268, + "learning_rate": 5.212140962867487e-06, + "loss": 0.1368, + "step": 19428 + }, + { + "epoch": 0.4916618164334337, + "grad_norm": 8.222588539123535, + "learning_rate": 5.2117398012078825e-06, + "loss": 0.2557, + "step": 19429 + }, + { + "epoch": 0.49168712199812736, + "grad_norm": 4.586849689483643, + "learning_rate": 5.211338638182806e-06, + "loss": 0.1849, + "step": 19430 + }, + { + "epoch": 0.4917124275628211, + "grad_norm": 3.147395133972168, + "learning_rate": 5.210937473794847e-06, + "loss": 0.129, + "step": 19431 + }, + { + "epoch": 0.49173773312751473, + "grad_norm": 7.729252338409424, + "learning_rate": 5.21053630804659e-06, + "loss": 0.2411, + "step": 19432 + }, + { + "epoch": 0.49176303869220844, + "grad_norm": 8.624947547912598, + "learning_rate": 5.2101351409406245e-06, + "loss": 0.2928, + "step": 19433 + }, + { + "epoch": 0.4917883442569021, + "grad_norm": 2.8735194206237793, + "learning_rate": 5.209733972479536e-06, + "loss": 0.1387, + "step": 19434 + }, + { + "epoch": 0.49181364982159576, + "grad_norm": 13.505475044250488, + "learning_rate": 5.209332802665913e-06, + "loss": 0.1869, + "step": 19435 + }, + { + "epoch": 0.49183895538628947, + "grad_norm": 3.6989011764526367, + "learning_rate": 5.208931631502341e-06, + "loss": 0.1834, + "step": 19436 + }, + { + "epoch": 0.4918642609509831, + "grad_norm": 2.5108699798583984, + "learning_rate": 5.208530458991408e-06, + "loss": 0.1052, + "step": 19437 + }, + { + "epoch": 0.4918895665156768, + "grad_norm": 3.0087690353393555, + "learning_rate": 5.208129285135701e-06, + "loss": 0.098, + "step": 19438 + }, + { + "epoch": 0.4919148720803705, + "grad_norm": 6.221653938293457, + "learning_rate": 5.207728109937807e-06, + "loss": 0.1752, + "step": 19439 + }, + { + "epoch": 0.49194017764506415, + "grad_norm": 3.1962969303131104, + "learning_rate": 5.207326933400313e-06, + "loss": 0.1461, + "step": 19440 + }, + { + "epoch": 0.4919654832097578, + "grad_norm": 20.84226417541504, + "learning_rate": 5.206925755525805e-06, + "loss": 0.4046, + "step": 19441 + }, + { + "epoch": 0.4919907887744515, + "grad_norm": 11.889392852783203, + "learning_rate": 5.206524576316873e-06, + "loss": 0.2637, + "step": 19442 + }, + { + "epoch": 0.49201609433914517, + "grad_norm": 7.3468804359436035, + "learning_rate": 5.206123395776102e-06, + "loss": 0.3005, + "step": 19443 + }, + { + "epoch": 0.4920413999038388, + "grad_norm": 4.693867206573486, + "learning_rate": 5.20572221390608e-06, + "loss": 0.2619, + "step": 19444 + }, + { + "epoch": 0.49206670546853254, + "grad_norm": 4.76467752456665, + "learning_rate": 5.205321030709391e-06, + "loss": 0.1809, + "step": 19445 + }, + { + "epoch": 0.4920920110332262, + "grad_norm": 5.889350891113281, + "learning_rate": 5.204919846188628e-06, + "loss": 0.2657, + "step": 19446 + }, + { + "epoch": 0.4921173165979199, + "grad_norm": 3.8207132816314697, + "learning_rate": 5.204518660346374e-06, + "loss": 0.1681, + "step": 19447 + }, + { + "epoch": 0.49214262216261356, + "grad_norm": 5.588210105895996, + "learning_rate": 5.204117473185217e-06, + "loss": 0.1967, + "step": 19448 + }, + { + "epoch": 0.4921679277273072, + "grad_norm": 3.1652965545654297, + "learning_rate": 5.2037162847077435e-06, + "loss": 0.1374, + "step": 19449 + }, + { + "epoch": 0.49219323329200093, + "grad_norm": 6.256294250488281, + "learning_rate": 5.2033150949165436e-06, + "loss": 0.1785, + "step": 19450 + }, + { + "epoch": 0.4922185388566946, + "grad_norm": 5.550131320953369, + "learning_rate": 5.202913903814201e-06, + "loss": 0.1696, + "step": 19451 + }, + { + "epoch": 0.49224384442138824, + "grad_norm": 3.0777156352996826, + "learning_rate": 5.202512711403304e-06, + "loss": 0.109, + "step": 19452 + }, + { + "epoch": 0.49226914998608196, + "grad_norm": 4.576411724090576, + "learning_rate": 5.202111517686442e-06, + "loss": 0.1759, + "step": 19453 + }, + { + "epoch": 0.4922944555507756, + "grad_norm": 15.030004501342773, + "learning_rate": 5.2017103226661995e-06, + "loss": 0.2876, + "step": 19454 + }, + { + "epoch": 0.49231976111546927, + "grad_norm": 13.044941902160645, + "learning_rate": 5.201309126345164e-06, + "loss": 0.296, + "step": 19455 + }, + { + "epoch": 0.492345066680163, + "grad_norm": 7.033721446990967, + "learning_rate": 5.200907928725924e-06, + "loss": 0.2015, + "step": 19456 + }, + { + "epoch": 0.49237037224485664, + "grad_norm": 4.682561874389648, + "learning_rate": 5.200506729811067e-06, + "loss": 0.1135, + "step": 19457 + }, + { + "epoch": 0.49239567780955035, + "grad_norm": 3.791410446166992, + "learning_rate": 5.2001055296031785e-06, + "loss": 0.2057, + "step": 19458 + }, + { + "epoch": 0.492420983374244, + "grad_norm": 5.597320079803467, + "learning_rate": 5.1997043281048466e-06, + "loss": 0.2018, + "step": 19459 + }, + { + "epoch": 0.49244628893893766, + "grad_norm": 11.710938453674316, + "learning_rate": 5.199303125318659e-06, + "loss": 0.2, + "step": 19460 + }, + { + "epoch": 0.49247159450363137, + "grad_norm": 2.9674739837646484, + "learning_rate": 5.1989019212472015e-06, + "loss": 0.1283, + "step": 19461 + }, + { + "epoch": 0.492496900068325, + "grad_norm": 4.049585819244385, + "learning_rate": 5.1985007158930646e-06, + "loss": 0.1466, + "step": 19462 + }, + { + "epoch": 0.4925222056330187, + "grad_norm": 7.535057544708252, + "learning_rate": 5.198099509258831e-06, + "loss": 0.2522, + "step": 19463 + }, + { + "epoch": 0.4925475111977124, + "grad_norm": 7.16480016708374, + "learning_rate": 5.197698301347093e-06, + "loss": 0.1427, + "step": 19464 + }, + { + "epoch": 0.49257281676240605, + "grad_norm": 5.947405815124512, + "learning_rate": 5.197297092160433e-06, + "loss": 0.179, + "step": 19465 + }, + { + "epoch": 0.4925981223270997, + "grad_norm": 3.877256393432617, + "learning_rate": 5.196895881701442e-06, + "loss": 0.1234, + "step": 19466 + }, + { + "epoch": 0.4926234278917934, + "grad_norm": 4.590271949768066, + "learning_rate": 5.196494669972706e-06, + "loss": 0.1479, + "step": 19467 + }, + { + "epoch": 0.4926487334564871, + "grad_norm": 8.304071426391602, + "learning_rate": 5.196093456976814e-06, + "loss": 0.1806, + "step": 19468 + }, + { + "epoch": 0.49267403902118073, + "grad_norm": 3.1419026851654053, + "learning_rate": 5.195692242716349e-06, + "loss": 0.1657, + "step": 19469 + }, + { + "epoch": 0.49269934458587444, + "grad_norm": 5.990776062011719, + "learning_rate": 5.1952910271939026e-06, + "loss": 0.1629, + "step": 19470 + }, + { + "epoch": 0.4927246501505681, + "grad_norm": 6.329867362976074, + "learning_rate": 5.1948898104120605e-06, + "loss": 0.2197, + "step": 19471 + }, + { + "epoch": 0.4927499557152618, + "grad_norm": 4.926955223083496, + "learning_rate": 5.19448859237341e-06, + "loss": 0.1998, + "step": 19472 + }, + { + "epoch": 0.49277526127995547, + "grad_norm": 6.8842034339904785, + "learning_rate": 5.194087373080538e-06, + "loss": 0.2216, + "step": 19473 + }, + { + "epoch": 0.4928005668446491, + "grad_norm": 9.503843307495117, + "learning_rate": 5.1936861525360335e-06, + "loss": 0.2227, + "step": 19474 + }, + { + "epoch": 0.49282587240934284, + "grad_norm": 5.851011276245117, + "learning_rate": 5.193284930742485e-06, + "loss": 0.1664, + "step": 19475 + }, + { + "epoch": 0.4928511779740365, + "grad_norm": 5.476924896240234, + "learning_rate": 5.192883707702475e-06, + "loss": 0.2146, + "step": 19476 + }, + { + "epoch": 0.49287648353873015, + "grad_norm": 5.226301193237305, + "learning_rate": 5.192482483418595e-06, + "loss": 0.167, + "step": 19477 + }, + { + "epoch": 0.49290178910342386, + "grad_norm": 4.698738098144531, + "learning_rate": 5.192081257893431e-06, + "loss": 0.2207, + "step": 19478 + }, + { + "epoch": 0.4929270946681175, + "grad_norm": 4.899001121520996, + "learning_rate": 5.191680031129571e-06, + "loss": 0.1745, + "step": 19479 + }, + { + "epoch": 0.49295240023281117, + "grad_norm": 11.474000930786133, + "learning_rate": 5.191278803129602e-06, + "loss": 0.2545, + "step": 19480 + }, + { + "epoch": 0.4929777057975049, + "grad_norm": 6.654662132263184, + "learning_rate": 5.190877573896113e-06, + "loss": 0.233, + "step": 19481 + }, + { + "epoch": 0.49300301136219854, + "grad_norm": 5.358072280883789, + "learning_rate": 5.190476343431688e-06, + "loss": 0.2032, + "step": 19482 + }, + { + "epoch": 0.4930283169268922, + "grad_norm": 3.830766439437866, + "learning_rate": 5.190075111738916e-06, + "loss": 0.1276, + "step": 19483 + }, + { + "epoch": 0.4930536224915859, + "grad_norm": 4.370286464691162, + "learning_rate": 5.189673878820386e-06, + "loss": 0.1394, + "step": 19484 + }, + { + "epoch": 0.49307892805627956, + "grad_norm": 2.748589038848877, + "learning_rate": 5.189272644678686e-06, + "loss": 0.0861, + "step": 19485 + }, + { + "epoch": 0.4931042336209733, + "grad_norm": 7.17160701751709, + "learning_rate": 5.188871409316399e-06, + "loss": 0.1904, + "step": 19486 + }, + { + "epoch": 0.49312953918566693, + "grad_norm": 4.945876121520996, + "learning_rate": 5.1884701727361175e-06, + "loss": 0.1418, + "step": 19487 + }, + { + "epoch": 0.4931548447503606, + "grad_norm": 5.8055806159973145, + "learning_rate": 5.188068934940426e-06, + "loss": 0.2152, + "step": 19488 + }, + { + "epoch": 0.4931801503150543, + "grad_norm": 8.1985445022583, + "learning_rate": 5.187667695931914e-06, + "loss": 0.143, + "step": 19489 + }, + { + "epoch": 0.49320545587974796, + "grad_norm": 3.331179618835449, + "learning_rate": 5.187266455713168e-06, + "loss": 0.0695, + "step": 19490 + }, + { + "epoch": 0.4932307614444416, + "grad_norm": 6.690763473510742, + "learning_rate": 5.186865214286774e-06, + "loss": 0.1948, + "step": 19491 + }, + { + "epoch": 0.4932560670091353, + "grad_norm": 2.915497303009033, + "learning_rate": 5.186463971655322e-06, + "loss": 0.1007, + "step": 19492 + }, + { + "epoch": 0.493281372573829, + "grad_norm": 5.516712665557861, + "learning_rate": 5.186062727821397e-06, + "loss": 0.2607, + "step": 19493 + }, + { + "epoch": 0.49330667813852264, + "grad_norm": 5.179688930511475, + "learning_rate": 5.18566148278759e-06, + "loss": 0.19, + "step": 19494 + }, + { + "epoch": 0.49333198370321635, + "grad_norm": 3.499358892440796, + "learning_rate": 5.185260236556485e-06, + "loss": 0.1165, + "step": 19495 + }, + { + "epoch": 0.49335728926791, + "grad_norm": 6.089137554168701, + "learning_rate": 5.184858989130672e-06, + "loss": 0.1596, + "step": 19496 + }, + { + "epoch": 0.4933825948326037, + "grad_norm": 4.833954811096191, + "learning_rate": 5.184457740512738e-06, + "loss": 0.2068, + "step": 19497 + }, + { + "epoch": 0.49340790039729737, + "grad_norm": 3.7066335678100586, + "learning_rate": 5.18405649070527e-06, + "loss": 0.1208, + "step": 19498 + }, + { + "epoch": 0.49343320596199103, + "grad_norm": 3.7407774925231934, + "learning_rate": 5.183655239710856e-06, + "loss": 0.1428, + "step": 19499 + }, + { + "epoch": 0.49345851152668474, + "grad_norm": 8.587674140930176, + "learning_rate": 5.183253987532083e-06, + "loss": 0.3531, + "step": 19500 + }, + { + "epoch": 0.4934838170913784, + "grad_norm": 4.45428466796875, + "learning_rate": 5.182852734171541e-06, + "loss": 0.1656, + "step": 19501 + }, + { + "epoch": 0.49350912265607205, + "grad_norm": 5.450944900512695, + "learning_rate": 5.1824514796318135e-06, + "loss": 0.2131, + "step": 19502 + }, + { + "epoch": 0.49353442822076576, + "grad_norm": 6.671774864196777, + "learning_rate": 5.182050223915492e-06, + "loss": 0.2177, + "step": 19503 + }, + { + "epoch": 0.4935597337854594, + "grad_norm": 5.051588535308838, + "learning_rate": 5.181648967025161e-06, + "loss": 0.1979, + "step": 19504 + }, + { + "epoch": 0.4935850393501531, + "grad_norm": 3.9334123134613037, + "learning_rate": 5.18124770896341e-06, + "loss": 0.1622, + "step": 19505 + }, + { + "epoch": 0.4936103449148468, + "grad_norm": 8.721236228942871, + "learning_rate": 5.180846449732827e-06, + "loss": 0.1817, + "step": 19506 + }, + { + "epoch": 0.49363565047954044, + "grad_norm": 3.8166208267211914, + "learning_rate": 5.180445189335998e-06, + "loss": 0.2087, + "step": 19507 + }, + { + "epoch": 0.4936609560442341, + "grad_norm": 4.180087089538574, + "learning_rate": 5.180043927775512e-06, + "loss": 0.1227, + "step": 19508 + }, + { + "epoch": 0.4936862616089278, + "grad_norm": 4.2436909675598145, + "learning_rate": 5.1796426650539564e-06, + "loss": 0.2225, + "step": 19509 + }, + { + "epoch": 0.49371156717362147, + "grad_norm": 5.84730863571167, + "learning_rate": 5.179241401173919e-06, + "loss": 0.1955, + "step": 19510 + }, + { + "epoch": 0.4937368727383152, + "grad_norm": 4.9817328453063965, + "learning_rate": 5.178840136137986e-06, + "loss": 0.1609, + "step": 19511 + }, + { + "epoch": 0.49376217830300884, + "grad_norm": 3.0581579208374023, + "learning_rate": 5.178438869948748e-06, + "loss": 0.141, + "step": 19512 + }, + { + "epoch": 0.4937874838677025, + "grad_norm": 11.53839111328125, + "learning_rate": 5.178037602608789e-06, + "loss": 0.2399, + "step": 19513 + }, + { + "epoch": 0.4938127894323962, + "grad_norm": 4.22177267074585, + "learning_rate": 5.177636334120699e-06, + "loss": 0.1703, + "step": 19514 + }, + { + "epoch": 0.49383809499708986, + "grad_norm": 5.390583515167236, + "learning_rate": 5.177235064487066e-06, + "loss": 0.1839, + "step": 19515 + }, + { + "epoch": 0.4938634005617835, + "grad_norm": 9.740653991699219, + "learning_rate": 5.176833793710477e-06, + "loss": 0.2118, + "step": 19516 + }, + { + "epoch": 0.49388870612647723, + "grad_norm": 4.055544853210449, + "learning_rate": 5.17643252179352e-06, + "loss": 0.1332, + "step": 19517 + }, + { + "epoch": 0.4939140116911709, + "grad_norm": 4.1560845375061035, + "learning_rate": 5.176031248738782e-06, + "loss": 0.1611, + "step": 19518 + }, + { + "epoch": 0.49393931725586454, + "grad_norm": 8.199639320373535, + "learning_rate": 5.175629974548852e-06, + "loss": 0.1528, + "step": 19519 + }, + { + "epoch": 0.49396462282055825, + "grad_norm": 3.2020130157470703, + "learning_rate": 5.175228699226315e-06, + "loss": 0.0998, + "step": 19520 + }, + { + "epoch": 0.4939899283852519, + "grad_norm": 6.940877914428711, + "learning_rate": 5.174827422773763e-06, + "loss": 0.2663, + "step": 19521 + }, + { + "epoch": 0.4940152339499456, + "grad_norm": 5.425394535064697, + "learning_rate": 5.174426145193779e-06, + "loss": 0.2027, + "step": 19522 + }, + { + "epoch": 0.4940405395146393, + "grad_norm": 3.9881765842437744, + "learning_rate": 5.174024866488956e-06, + "loss": 0.1528, + "step": 19523 + }, + { + "epoch": 0.49406584507933293, + "grad_norm": 8.87677001953125, + "learning_rate": 5.173623586661876e-06, + "loss": 0.2603, + "step": 19524 + }, + { + "epoch": 0.49409115064402664, + "grad_norm": 4.975484848022461, + "learning_rate": 5.173222305715132e-06, + "loss": 0.1661, + "step": 19525 + }, + { + "epoch": 0.4941164562087203, + "grad_norm": 5.477534294128418, + "learning_rate": 5.172821023651309e-06, + "loss": 0.1981, + "step": 19526 + }, + { + "epoch": 0.49414176177341396, + "grad_norm": 6.301064491271973, + "learning_rate": 5.172419740472995e-06, + "loss": 0.1486, + "step": 19527 + }, + { + "epoch": 0.49416706733810767, + "grad_norm": 10.764260292053223, + "learning_rate": 5.172018456182779e-06, + "loss": 0.2544, + "step": 19528 + }, + { + "epoch": 0.4941923729028013, + "grad_norm": 10.851469039916992, + "learning_rate": 5.171617170783247e-06, + "loss": 0.2551, + "step": 19529 + }, + { + "epoch": 0.494217678467495, + "grad_norm": 6.688261985778809, + "learning_rate": 5.171215884276989e-06, + "loss": 0.22, + "step": 19530 + }, + { + "epoch": 0.4942429840321887, + "grad_norm": 6.041180610656738, + "learning_rate": 5.17081459666659e-06, + "loss": 0.2011, + "step": 19531 + }, + { + "epoch": 0.49426828959688235, + "grad_norm": 3.0659124851226807, + "learning_rate": 5.1704133079546415e-06, + "loss": 0.1207, + "step": 19532 + }, + { + "epoch": 0.494293595161576, + "grad_norm": 13.259462356567383, + "learning_rate": 5.170012018143728e-06, + "loss": 0.2591, + "step": 19533 + }, + { + "epoch": 0.4943189007262697, + "grad_norm": 4.089879512786865, + "learning_rate": 5.169610727236438e-06, + "loss": 0.1136, + "step": 19534 + }, + { + "epoch": 0.4943442062909634, + "grad_norm": 5.39735221862793, + "learning_rate": 5.169209435235361e-06, + "loss": 0.1549, + "step": 19535 + }, + { + "epoch": 0.4943695118556571, + "grad_norm": 7.45950174331665, + "learning_rate": 5.168808142143082e-06, + "loss": 0.2406, + "step": 19536 + }, + { + "epoch": 0.49439481742035074, + "grad_norm": 4.880335807800293, + "learning_rate": 5.168406847962193e-06, + "loss": 0.206, + "step": 19537 + }, + { + "epoch": 0.4944201229850444, + "grad_norm": 8.599224090576172, + "learning_rate": 5.168005552695279e-06, + "loss": 0.2292, + "step": 19538 + }, + { + "epoch": 0.4944454285497381, + "grad_norm": 7.374225616455078, + "learning_rate": 5.167604256344928e-06, + "loss": 0.2331, + "step": 19539 + }, + { + "epoch": 0.49447073411443176, + "grad_norm": 3.3351895809173584, + "learning_rate": 5.1672029589137285e-06, + "loss": 0.1872, + "step": 19540 + }, + { + "epoch": 0.4944960396791254, + "grad_norm": 10.498422622680664, + "learning_rate": 5.166801660404268e-06, + "loss": 0.2258, + "step": 19541 + }, + { + "epoch": 0.49452134524381913, + "grad_norm": 9.425329208374023, + "learning_rate": 5.166400360819135e-06, + "loss": 0.2877, + "step": 19542 + }, + { + "epoch": 0.4945466508085128, + "grad_norm": 4.028590202331543, + "learning_rate": 5.165999060160918e-06, + "loss": 0.1453, + "step": 19543 + }, + { + "epoch": 0.49457195637320644, + "grad_norm": 9.363327980041504, + "learning_rate": 5.165597758432202e-06, + "loss": 0.2029, + "step": 19544 + }, + { + "epoch": 0.49459726193790016, + "grad_norm": 2.836376428604126, + "learning_rate": 5.1651964556355784e-06, + "loss": 0.1581, + "step": 19545 + }, + { + "epoch": 0.4946225675025938, + "grad_norm": 7.434759140014648, + "learning_rate": 5.1647951517736324e-06, + "loss": 0.1521, + "step": 19546 + }, + { + "epoch": 0.49464787306728747, + "grad_norm": 6.12223482131958, + "learning_rate": 5.164393846848954e-06, + "loss": 0.1917, + "step": 19547 + }, + { + "epoch": 0.4946731786319812, + "grad_norm": 4.761427402496338, + "learning_rate": 5.16399254086413e-06, + "loss": 0.1699, + "step": 19548 + }, + { + "epoch": 0.49469848419667484, + "grad_norm": 7.063229560852051, + "learning_rate": 5.163591233821747e-06, + "loss": 0.2241, + "step": 19549 + }, + { + "epoch": 0.49472378976136855, + "grad_norm": 6.0686845779418945, + "learning_rate": 5.1631899257243976e-06, + "loss": 0.1632, + "step": 19550 + }, + { + "epoch": 0.4947490953260622, + "grad_norm": 4.402788162231445, + "learning_rate": 5.1627886165746646e-06, + "loss": 0.1572, + "step": 19551 + }, + { + "epoch": 0.49477440089075586, + "grad_norm": 8.076889038085938, + "learning_rate": 5.162387306375139e-06, + "loss": 0.2719, + "step": 19552 + }, + { + "epoch": 0.4947997064554496, + "grad_norm": 2.9767942428588867, + "learning_rate": 5.161985995128407e-06, + "loss": 0.1102, + "step": 19553 + }, + { + "epoch": 0.49482501202014323, + "grad_norm": 5.487746238708496, + "learning_rate": 5.161584682837058e-06, + "loss": 0.118, + "step": 19554 + }, + { + "epoch": 0.4948503175848369, + "grad_norm": 4.16441535949707, + "learning_rate": 5.16118336950368e-06, + "loss": 0.1628, + "step": 19555 + }, + { + "epoch": 0.4948756231495306, + "grad_norm": 4.432633876800537, + "learning_rate": 5.160782055130861e-06, + "loss": 0.1495, + "step": 19556 + }, + { + "epoch": 0.49490092871422425, + "grad_norm": 5.424692630767822, + "learning_rate": 5.160380739721186e-06, + "loss": 0.2213, + "step": 19557 + }, + { + "epoch": 0.4949262342789179, + "grad_norm": 5.423238277435303, + "learning_rate": 5.159979423277246e-06, + "loss": 0.1752, + "step": 19558 + }, + { + "epoch": 0.4949515398436116, + "grad_norm": 3.1748740673065186, + "learning_rate": 5.1595781058016305e-06, + "loss": 0.1661, + "step": 19559 + }, + { + "epoch": 0.4949768454083053, + "grad_norm": 6.6899871826171875, + "learning_rate": 5.1591767872969234e-06, + "loss": 0.193, + "step": 19560 + }, + { + "epoch": 0.495002150972999, + "grad_norm": 7.630030155181885, + "learning_rate": 5.158775467765716e-06, + "loss": 0.1759, + "step": 19561 + }, + { + "epoch": 0.49502745653769264, + "grad_norm": 4.421235084533691, + "learning_rate": 5.158374147210594e-06, + "loss": 0.1624, + "step": 19562 + }, + { + "epoch": 0.4950527621023863, + "grad_norm": 15.342049598693848, + "learning_rate": 5.157972825634148e-06, + "loss": 0.1978, + "step": 19563 + }, + { + "epoch": 0.49507806766708, + "grad_norm": 13.22921371459961, + "learning_rate": 5.157571503038963e-06, + "loss": 0.1613, + "step": 19564 + }, + { + "epoch": 0.49510337323177367, + "grad_norm": 3.958437442779541, + "learning_rate": 5.15717017942763e-06, + "loss": 0.1276, + "step": 19565 + }, + { + "epoch": 0.4951286787964673, + "grad_norm": 3.580836057662964, + "learning_rate": 5.156768854802735e-06, + "loss": 0.1307, + "step": 19566 + }, + { + "epoch": 0.49515398436116104, + "grad_norm": 4.503752708435059, + "learning_rate": 5.156367529166868e-06, + "loss": 0.1581, + "step": 19567 + }, + { + "epoch": 0.4951792899258547, + "grad_norm": 7.5545735359191895, + "learning_rate": 5.1559662025226145e-06, + "loss": 0.1494, + "step": 19568 + }, + { + "epoch": 0.49520459549054835, + "grad_norm": 8.640753746032715, + "learning_rate": 5.155564874872564e-06, + "loss": 0.2391, + "step": 19569 + }, + { + "epoch": 0.49522990105524206, + "grad_norm": 6.0084710121154785, + "learning_rate": 5.155163546219306e-06, + "loss": 0.1981, + "step": 19570 + }, + { + "epoch": 0.4952552066199357, + "grad_norm": 5.105957508087158, + "learning_rate": 5.154762216565425e-06, + "loss": 0.1455, + "step": 19571 + }, + { + "epoch": 0.4952805121846294, + "grad_norm": 3.3572182655334473, + "learning_rate": 5.154360885913513e-06, + "loss": 0.1126, + "step": 19572 + }, + { + "epoch": 0.4953058177493231, + "grad_norm": 12.792908668518066, + "learning_rate": 5.153959554266156e-06, + "loss": 0.292, + "step": 19573 + }, + { + "epoch": 0.49533112331401674, + "grad_norm": 9.969039916992188, + "learning_rate": 5.153558221625943e-06, + "loss": 0.2932, + "step": 19574 + }, + { + "epoch": 0.49535642887871045, + "grad_norm": 12.341660499572754, + "learning_rate": 5.153156887995461e-06, + "loss": 0.3429, + "step": 19575 + }, + { + "epoch": 0.4953817344434041, + "grad_norm": 4.295795917510986, + "learning_rate": 5.152755553377297e-06, + "loss": 0.1671, + "step": 19576 + }, + { + "epoch": 0.49540704000809777, + "grad_norm": 5.3898162841796875, + "learning_rate": 5.152354217774045e-06, + "loss": 0.1817, + "step": 19577 + }, + { + "epoch": 0.4954323455727915, + "grad_norm": 4.340837001800537, + "learning_rate": 5.151952881188286e-06, + "loss": 0.1353, + "step": 19578 + }, + { + "epoch": 0.49545765113748513, + "grad_norm": 7.775681972503662, + "learning_rate": 5.151551543622612e-06, + "loss": 0.201, + "step": 19579 + }, + { + "epoch": 0.4954829567021788, + "grad_norm": 8.368644714355469, + "learning_rate": 5.151150205079609e-06, + "loss": 0.2597, + "step": 19580 + }, + { + "epoch": 0.4955082622668725, + "grad_norm": 12.623387336730957, + "learning_rate": 5.150748865561868e-06, + "loss": 0.2946, + "step": 19581 + }, + { + "epoch": 0.49553356783156616, + "grad_norm": 5.53358793258667, + "learning_rate": 5.150347525071975e-06, + "loss": 0.1921, + "step": 19582 + }, + { + "epoch": 0.4955588733962598, + "grad_norm": 4.169496536254883, + "learning_rate": 5.149946183612518e-06, + "loss": 0.1094, + "step": 19583 + }, + { + "epoch": 0.4955841789609535, + "grad_norm": 4.166955947875977, + "learning_rate": 5.149544841186087e-06, + "loss": 0.2108, + "step": 19584 + }, + { + "epoch": 0.4956094845256472, + "grad_norm": 3.7992076873779297, + "learning_rate": 5.149143497795269e-06, + "loss": 0.127, + "step": 19585 + }, + { + "epoch": 0.4956347900903409, + "grad_norm": 3.390923500061035, + "learning_rate": 5.148742153442652e-06, + "loss": 0.1079, + "step": 19586 + }, + { + "epoch": 0.49566009565503455, + "grad_norm": 3.813462257385254, + "learning_rate": 5.148340808130825e-06, + "loss": 0.1218, + "step": 19587 + }, + { + "epoch": 0.4956854012197282, + "grad_norm": 5.641977310180664, + "learning_rate": 5.147939461862375e-06, + "loss": 0.2435, + "step": 19588 + }, + { + "epoch": 0.4957107067844219, + "grad_norm": 2.867553472518921, + "learning_rate": 5.1475381146398904e-06, + "loss": 0.1239, + "step": 19589 + }, + { + "epoch": 0.4957360123491156, + "grad_norm": 13.456292152404785, + "learning_rate": 5.147136766465962e-06, + "loss": 0.2972, + "step": 19590 + }, + { + "epoch": 0.49576131791380923, + "grad_norm": 3.5900871753692627, + "learning_rate": 5.146735417343174e-06, + "loss": 0.098, + "step": 19591 + }, + { + "epoch": 0.49578662347850294, + "grad_norm": 4.998232841491699, + "learning_rate": 5.146334067274118e-06, + "loss": 0.152, + "step": 19592 + }, + { + "epoch": 0.4958119290431966, + "grad_norm": 6.263124942779541, + "learning_rate": 5.145932716261379e-06, + "loss": 0.2132, + "step": 19593 + }, + { + "epoch": 0.49583723460789025, + "grad_norm": 4.529430866241455, + "learning_rate": 5.14553136430755e-06, + "loss": 0.1752, + "step": 19594 + }, + { + "epoch": 0.49586254017258397, + "grad_norm": 9.308891296386719, + "learning_rate": 5.145130011415214e-06, + "loss": 0.2349, + "step": 19595 + }, + { + "epoch": 0.4958878457372776, + "grad_norm": 6.564310550689697, + "learning_rate": 5.144728657586963e-06, + "loss": 0.1718, + "step": 19596 + }, + { + "epoch": 0.4959131513019713, + "grad_norm": 4.139812469482422, + "learning_rate": 5.144327302825381e-06, + "loss": 0.1462, + "step": 19597 + }, + { + "epoch": 0.495938456866665, + "grad_norm": 4.83522891998291, + "learning_rate": 5.143925947133062e-06, + "loss": 0.1561, + "step": 19598 + }, + { + "epoch": 0.49596376243135865, + "grad_norm": 3.564147472381592, + "learning_rate": 5.14352459051259e-06, + "loss": 0.0789, + "step": 19599 + }, + { + "epoch": 0.49598906799605236, + "grad_norm": 3.1927361488342285, + "learning_rate": 5.143123232966555e-06, + "loss": 0.2127, + "step": 19600 + }, + { + "epoch": 0.496014373560746, + "grad_norm": 2.6845059394836426, + "learning_rate": 5.142721874497545e-06, + "loss": 0.1511, + "step": 19601 + }, + { + "epoch": 0.49603967912543967, + "grad_norm": 3.475701093673706, + "learning_rate": 5.142320515108149e-06, + "loss": 0.1572, + "step": 19602 + }, + { + "epoch": 0.4960649846901334, + "grad_norm": 5.464388370513916, + "learning_rate": 5.141919154800953e-06, + "loss": 0.2097, + "step": 19603 + }, + { + "epoch": 0.49609029025482704, + "grad_norm": 3.6020095348358154, + "learning_rate": 5.1415177935785475e-06, + "loss": 0.1487, + "step": 19604 + }, + { + "epoch": 0.4961155958195207, + "grad_norm": 5.965744972229004, + "learning_rate": 5.141116431443519e-06, + "loss": 0.1793, + "step": 19605 + }, + { + "epoch": 0.4961409013842144, + "grad_norm": 11.968893051147461, + "learning_rate": 5.1407150683984585e-06, + "loss": 0.2211, + "step": 19606 + }, + { + "epoch": 0.49616620694890806, + "grad_norm": 5.048425197601318, + "learning_rate": 5.140313704445953e-06, + "loss": 0.2005, + "step": 19607 + }, + { + "epoch": 0.4961915125136017, + "grad_norm": 5.077282905578613, + "learning_rate": 5.1399123395885885e-06, + "loss": 0.2026, + "step": 19608 + }, + { + "epoch": 0.49621681807829543, + "grad_norm": 3.913520097732544, + "learning_rate": 5.139510973828958e-06, + "loss": 0.195, + "step": 19609 + }, + { + "epoch": 0.4962421236429891, + "grad_norm": 11.566804885864258, + "learning_rate": 5.139109607169644e-06, + "loss": 0.2402, + "step": 19610 + }, + { + "epoch": 0.49626742920768274, + "grad_norm": 9.416217803955078, + "learning_rate": 5.1387082396132395e-06, + "loss": 0.2409, + "step": 19611 + }, + { + "epoch": 0.49629273477237645, + "grad_norm": 4.090180397033691, + "learning_rate": 5.138306871162332e-06, + "loss": 0.2038, + "step": 19612 + }, + { + "epoch": 0.4963180403370701, + "grad_norm": 4.987228870391846, + "learning_rate": 5.137905501819508e-06, + "loss": 0.2094, + "step": 19613 + }, + { + "epoch": 0.4963433459017638, + "grad_norm": 2.838451862335205, + "learning_rate": 5.137504131587357e-06, + "loss": 0.1356, + "step": 19614 + }, + { + "epoch": 0.4963686514664575, + "grad_norm": 6.255936145782471, + "learning_rate": 5.137102760468469e-06, + "loss": 0.0998, + "step": 19615 + }, + { + "epoch": 0.49639395703115113, + "grad_norm": 3.9111084938049316, + "learning_rate": 5.13670138846543e-06, + "loss": 0.1374, + "step": 19616 + }, + { + "epoch": 0.49641926259584485, + "grad_norm": 5.020524024963379, + "learning_rate": 5.136300015580828e-06, + "loss": 0.1675, + "step": 19617 + }, + { + "epoch": 0.4964445681605385, + "grad_norm": 6.182888984680176, + "learning_rate": 5.135898641817255e-06, + "loss": 0.1814, + "step": 19618 + }, + { + "epoch": 0.49646987372523216, + "grad_norm": 8.743040084838867, + "learning_rate": 5.135497267177295e-06, + "loss": 0.2584, + "step": 19619 + }, + { + "epoch": 0.49649517928992587, + "grad_norm": 6.621706962585449, + "learning_rate": 5.13509589166354e-06, + "loss": 0.1596, + "step": 19620 + }, + { + "epoch": 0.4965204848546195, + "grad_norm": 6.666318893432617, + "learning_rate": 5.134694515278575e-06, + "loss": 0.2764, + "step": 19621 + }, + { + "epoch": 0.4965457904193132, + "grad_norm": 4.60446310043335, + "learning_rate": 5.13429313802499e-06, + "loss": 0.1376, + "step": 19622 + }, + { + "epoch": 0.4965710959840069, + "grad_norm": 7.953884601593018, + "learning_rate": 5.1338917599053736e-06, + "loss": 0.2061, + "step": 19623 + }, + { + "epoch": 0.49659640154870055, + "grad_norm": 5.136105060577393, + "learning_rate": 5.133490380922315e-06, + "loss": 0.2224, + "step": 19624 + }, + { + "epoch": 0.49662170711339426, + "grad_norm": 10.929593086242676, + "learning_rate": 5.133089001078402e-06, + "loss": 0.2364, + "step": 19625 + }, + { + "epoch": 0.4966470126780879, + "grad_norm": 4.572640895843506, + "learning_rate": 5.132687620376222e-06, + "loss": 0.1637, + "step": 19626 + }, + { + "epoch": 0.4966723182427816, + "grad_norm": 11.417091369628906, + "learning_rate": 5.132286238818366e-06, + "loss": 0.243, + "step": 19627 + }, + { + "epoch": 0.4966976238074753, + "grad_norm": 4.429935932159424, + "learning_rate": 5.131884856407419e-06, + "loss": 0.1674, + "step": 19628 + }, + { + "epoch": 0.49672292937216894, + "grad_norm": 8.928679466247559, + "learning_rate": 5.1314834731459705e-06, + "loss": 0.178, + "step": 19629 + }, + { + "epoch": 0.4967482349368626, + "grad_norm": 5.667329788208008, + "learning_rate": 5.131082089036611e-06, + "loss": 0.1499, + "step": 19630 + }, + { + "epoch": 0.4967735405015563, + "grad_norm": 5.141393184661865, + "learning_rate": 5.130680704081928e-06, + "loss": 0.198, + "step": 19631 + }, + { + "epoch": 0.49679884606624997, + "grad_norm": 8.503334045410156, + "learning_rate": 5.130279318284507e-06, + "loss": 0.2561, + "step": 19632 + }, + { + "epoch": 0.4968241516309436, + "grad_norm": 5.1092729568481445, + "learning_rate": 5.12987793164694e-06, + "loss": 0.1589, + "step": 19633 + }, + { + "epoch": 0.49684945719563733, + "grad_norm": 3.760378837585449, + "learning_rate": 5.129476544171814e-06, + "loss": 0.1917, + "step": 19634 + }, + { + "epoch": 0.496874762760331, + "grad_norm": 3.777648448944092, + "learning_rate": 5.129075155861719e-06, + "loss": 0.1225, + "step": 19635 + }, + { + "epoch": 0.49690006832502465, + "grad_norm": 5.188846111297607, + "learning_rate": 5.128673766719243e-06, + "loss": 0.1754, + "step": 19636 + }, + { + "epoch": 0.49692537388971836, + "grad_norm": 7.183463096618652, + "learning_rate": 5.128272376746972e-06, + "loss": 0.2182, + "step": 19637 + }, + { + "epoch": 0.496950679454412, + "grad_norm": 8.156107902526855, + "learning_rate": 5.1278709859474976e-06, + "loss": 0.2927, + "step": 19638 + }, + { + "epoch": 0.4969759850191057, + "grad_norm": 3.245959520339966, + "learning_rate": 5.127469594323406e-06, + "loss": 0.1751, + "step": 19639 + }, + { + "epoch": 0.4970012905837994, + "grad_norm": 3.3445916175842285, + "learning_rate": 5.127068201877288e-06, + "loss": 0.1938, + "step": 19640 + }, + { + "epoch": 0.49702659614849304, + "grad_norm": 3.1964495182037354, + "learning_rate": 5.126666808611729e-06, + "loss": 0.1523, + "step": 19641 + }, + { + "epoch": 0.49705190171318675, + "grad_norm": 5.789426803588867, + "learning_rate": 5.126265414529319e-06, + "loss": 0.2256, + "step": 19642 + }, + { + "epoch": 0.4970772072778804, + "grad_norm": 3.8095784187316895, + "learning_rate": 5.125864019632649e-06, + "loss": 0.2115, + "step": 19643 + }, + { + "epoch": 0.49710251284257406, + "grad_norm": 2.283229351043701, + "learning_rate": 5.1254626239243035e-06, + "loss": 0.0942, + "step": 19644 + }, + { + "epoch": 0.4971278184072678, + "grad_norm": 3.4473557472229004, + "learning_rate": 5.125061227406874e-06, + "loss": 0.134, + "step": 19645 + }, + { + "epoch": 0.49715312397196143, + "grad_norm": 3.8276023864746094, + "learning_rate": 5.124659830082948e-06, + "loss": 0.176, + "step": 19646 + }, + { + "epoch": 0.4971784295366551, + "grad_norm": 4.006185531616211, + "learning_rate": 5.124258431955113e-06, + "loss": 0.2157, + "step": 19647 + }, + { + "epoch": 0.4972037351013488, + "grad_norm": 5.330420017242432, + "learning_rate": 5.123857033025959e-06, + "loss": 0.2082, + "step": 19648 + }, + { + "epoch": 0.49722904066604245, + "grad_norm": 5.894274711608887, + "learning_rate": 5.123455633298075e-06, + "loss": 0.2357, + "step": 19649 + }, + { + "epoch": 0.49725434623073617, + "grad_norm": 3.4722132682800293, + "learning_rate": 5.123054232774046e-06, + "loss": 0.0802, + "step": 19650 + }, + { + "epoch": 0.4972796517954298, + "grad_norm": 3.6973869800567627, + "learning_rate": 5.122652831456466e-06, + "loss": 0.212, + "step": 19651 + }, + { + "epoch": 0.4973049573601235, + "grad_norm": 5.693116664886475, + "learning_rate": 5.122251429347919e-06, + "loss": 0.1864, + "step": 19652 + }, + { + "epoch": 0.4973302629248172, + "grad_norm": 3.3823204040527344, + "learning_rate": 5.121850026450995e-06, + "loss": 0.1765, + "step": 19653 + }, + { + "epoch": 0.49735556848951085, + "grad_norm": 9.411208152770996, + "learning_rate": 5.1214486227682845e-06, + "loss": 0.2913, + "step": 19654 + }, + { + "epoch": 0.4973808740542045, + "grad_norm": 4.066157817840576, + "learning_rate": 5.121047218302373e-06, + "loss": 0.2488, + "step": 19655 + }, + { + "epoch": 0.4974061796188982, + "grad_norm": 4.543634414672852, + "learning_rate": 5.1206458130558514e-06, + "loss": 0.1881, + "step": 19656 + }, + { + "epoch": 0.49743148518359187, + "grad_norm": 7.3409271240234375, + "learning_rate": 5.120244407031306e-06, + "loss": 0.1357, + "step": 19657 + }, + { + "epoch": 0.4974567907482855, + "grad_norm": 8.596821784973145, + "learning_rate": 5.11984300023133e-06, + "loss": 0.2178, + "step": 19658 + }, + { + "epoch": 0.49748209631297924, + "grad_norm": 5.2890706062316895, + "learning_rate": 5.119441592658505e-06, + "loss": 0.2514, + "step": 19659 + }, + { + "epoch": 0.4975074018776729, + "grad_norm": 5.249733924865723, + "learning_rate": 5.1190401843154254e-06, + "loss": 0.2072, + "step": 19660 + }, + { + "epoch": 0.49753270744236655, + "grad_norm": 3.8511674404144287, + "learning_rate": 5.118638775204677e-06, + "loss": 0.1377, + "step": 19661 + }, + { + "epoch": 0.49755801300706026, + "grad_norm": 3.3051722049713135, + "learning_rate": 5.11823736532885e-06, + "loss": 0.1237, + "step": 19662 + }, + { + "epoch": 0.4975833185717539, + "grad_norm": 5.0519328117370605, + "learning_rate": 5.117835954690532e-06, + "loss": 0.162, + "step": 19663 + }, + { + "epoch": 0.49760862413644763, + "grad_norm": 4.281238555908203, + "learning_rate": 5.117434543292311e-06, + "loss": 0.1655, + "step": 19664 + }, + { + "epoch": 0.4976339297011413, + "grad_norm": 3.6111397743225098, + "learning_rate": 5.117033131136778e-06, + "loss": 0.1705, + "step": 19665 + }, + { + "epoch": 0.49765923526583494, + "grad_norm": 3.3311235904693604, + "learning_rate": 5.116631718226519e-06, + "loss": 0.1205, + "step": 19666 + }, + { + "epoch": 0.49768454083052865, + "grad_norm": 5.276641845703125, + "learning_rate": 5.116230304564125e-06, + "loss": 0.152, + "step": 19667 + }, + { + "epoch": 0.4977098463952223, + "grad_norm": 3.868335008621216, + "learning_rate": 5.115828890152181e-06, + "loss": 0.1916, + "step": 19668 + }, + { + "epoch": 0.49773515195991597, + "grad_norm": 3.526846408843994, + "learning_rate": 5.1154274749932805e-06, + "loss": 0.1585, + "step": 19669 + }, + { + "epoch": 0.4977604575246097, + "grad_norm": 4.316868782043457, + "learning_rate": 5.115026059090008e-06, + "loss": 0.1325, + "step": 19670 + }, + { + "epoch": 0.49778576308930333, + "grad_norm": 3.5763659477233887, + "learning_rate": 5.114624642444955e-06, + "loss": 0.1552, + "step": 19671 + }, + { + "epoch": 0.497811068653997, + "grad_norm": 2.4163506031036377, + "learning_rate": 5.1142232250607085e-06, + "loss": 0.1009, + "step": 19672 + }, + { + "epoch": 0.4978363742186907, + "grad_norm": 7.364658355712891, + "learning_rate": 5.113821806939857e-06, + "loss": 0.2778, + "step": 19673 + }, + { + "epoch": 0.49786167978338436, + "grad_norm": 3.067420482635498, + "learning_rate": 5.113420388084991e-06, + "loss": 0.1087, + "step": 19674 + }, + { + "epoch": 0.497886985348078, + "grad_norm": 6.167553424835205, + "learning_rate": 5.113018968498697e-06, + "loss": 0.1603, + "step": 19675 + }, + { + "epoch": 0.4979122909127717, + "grad_norm": 5.808119297027588, + "learning_rate": 5.112617548183565e-06, + "loss": 0.1654, + "step": 19676 + }, + { + "epoch": 0.4979375964774654, + "grad_norm": 4.44903039932251, + "learning_rate": 5.1122161271421835e-06, + "loss": 0.1912, + "step": 19677 + }, + { + "epoch": 0.4979629020421591, + "grad_norm": 8.114607810974121, + "learning_rate": 5.111814705377142e-06, + "loss": 0.2506, + "step": 19678 + }, + { + "epoch": 0.49798820760685275, + "grad_norm": 12.029061317443848, + "learning_rate": 5.111413282891026e-06, + "loss": 0.1766, + "step": 19679 + }, + { + "epoch": 0.4980135131715464, + "grad_norm": 4.464137554168701, + "learning_rate": 5.1110118596864285e-06, + "loss": 0.1557, + "step": 19680 + }, + { + "epoch": 0.4980388187362401, + "grad_norm": 4.000174045562744, + "learning_rate": 5.110610435765935e-06, + "loss": 0.1176, + "step": 19681 + }, + { + "epoch": 0.4980641243009338, + "grad_norm": 3.551589250564575, + "learning_rate": 5.110209011132137e-06, + "loss": 0.1582, + "step": 19682 + }, + { + "epoch": 0.49808942986562743, + "grad_norm": 5.572209358215332, + "learning_rate": 5.109807585787619e-06, + "loss": 0.1964, + "step": 19683 + }, + { + "epoch": 0.49811473543032114, + "grad_norm": 3.621805191040039, + "learning_rate": 5.1094061597349745e-06, + "loss": 0.0926, + "step": 19684 + }, + { + "epoch": 0.4981400409950148, + "grad_norm": 5.987278938293457, + "learning_rate": 5.109004732976789e-06, + "loss": 0.257, + "step": 19685 + }, + { + "epoch": 0.49816534655970846, + "grad_norm": 3.7948553562164307, + "learning_rate": 5.108603305515652e-06, + "loss": 0.1555, + "step": 19686 + }, + { + "epoch": 0.49819065212440217, + "grad_norm": 9.711640357971191, + "learning_rate": 5.108201877354153e-06, + "loss": 0.1891, + "step": 19687 + }, + { + "epoch": 0.4982159576890958, + "grad_norm": 2.843947410583496, + "learning_rate": 5.10780044849488e-06, + "loss": 0.1731, + "step": 19688 + }, + { + "epoch": 0.49824126325378953, + "grad_norm": 4.340083122253418, + "learning_rate": 5.1073990189404235e-06, + "loss": 0.2028, + "step": 19689 + }, + { + "epoch": 0.4982665688184832, + "grad_norm": 3.304379463195801, + "learning_rate": 5.106997588693369e-06, + "loss": 0.0949, + "step": 19690 + }, + { + "epoch": 0.49829187438317685, + "grad_norm": 9.394783020019531, + "learning_rate": 5.106596157756307e-06, + "loss": 0.1493, + "step": 19691 + }, + { + "epoch": 0.49831717994787056, + "grad_norm": 6.06106424331665, + "learning_rate": 5.106194726131828e-06, + "loss": 0.2208, + "step": 19692 + }, + { + "epoch": 0.4983424855125642, + "grad_norm": 2.540121078491211, + "learning_rate": 5.105793293822516e-06, + "loss": 0.1084, + "step": 19693 + }, + { + "epoch": 0.49836779107725787, + "grad_norm": 22.14769744873047, + "learning_rate": 5.105391860830965e-06, + "loss": 0.3115, + "step": 19694 + }, + { + "epoch": 0.4983930966419516, + "grad_norm": 3.6276981830596924, + "learning_rate": 5.104990427159761e-06, + "loss": 0.1484, + "step": 19695 + }, + { + "epoch": 0.49841840220664524, + "grad_norm": 5.91166877746582, + "learning_rate": 5.104588992811493e-06, + "loss": 0.2323, + "step": 19696 + }, + { + "epoch": 0.4984437077713389, + "grad_norm": 5.591256618499756, + "learning_rate": 5.10418755778875e-06, + "loss": 0.1762, + "step": 19697 + }, + { + "epoch": 0.4984690133360326, + "grad_norm": 3.982743740081787, + "learning_rate": 5.103786122094121e-06, + "loss": 0.1833, + "step": 19698 + }, + { + "epoch": 0.49849431890072626, + "grad_norm": 10.262002944946289, + "learning_rate": 5.103384685730194e-06, + "loss": 0.1359, + "step": 19699 + }, + { + "epoch": 0.4985196244654199, + "grad_norm": 2.968815326690674, + "learning_rate": 5.102983248699559e-06, + "loss": 0.1531, + "step": 19700 + }, + { + "epoch": 0.49854493003011363, + "grad_norm": 8.010916709899902, + "learning_rate": 5.102581811004804e-06, + "loss": 0.2044, + "step": 19701 + }, + { + "epoch": 0.4985702355948073, + "grad_norm": 2.6012825965881348, + "learning_rate": 5.1021803726485185e-06, + "loss": 0.1322, + "step": 19702 + }, + { + "epoch": 0.498595541159501, + "grad_norm": 4.475465774536133, + "learning_rate": 5.10177893363329e-06, + "loss": 0.1166, + "step": 19703 + }, + { + "epoch": 0.49862084672419466, + "grad_norm": 3.1444218158721924, + "learning_rate": 5.101377493961709e-06, + "loss": 0.1765, + "step": 19704 + }, + { + "epoch": 0.4986461522888883, + "grad_norm": 13.322672843933105, + "learning_rate": 5.100976053636362e-06, + "loss": 0.2939, + "step": 19705 + }, + { + "epoch": 0.498671457853582, + "grad_norm": 4.028000831604004, + "learning_rate": 5.10057461265984e-06, + "loss": 0.148, + "step": 19706 + }, + { + "epoch": 0.4986967634182757, + "grad_norm": 7.921897888183594, + "learning_rate": 5.100173171034731e-06, + "loss": 0.1855, + "step": 19707 + }, + { + "epoch": 0.49872206898296934, + "grad_norm": 12.241851806640625, + "learning_rate": 5.099771728763624e-06, + "loss": 0.3721, + "step": 19708 + }, + { + "epoch": 0.49874737454766305, + "grad_norm": 6.280734062194824, + "learning_rate": 5.099370285849108e-06, + "loss": 0.1663, + "step": 19709 + }, + { + "epoch": 0.4987726801123567, + "grad_norm": 5.163592338562012, + "learning_rate": 5.09896884229377e-06, + "loss": 0.1844, + "step": 19710 + }, + { + "epoch": 0.49879798567705036, + "grad_norm": 3.3082518577575684, + "learning_rate": 5.098567398100203e-06, + "loss": 0.0975, + "step": 19711 + }, + { + "epoch": 0.49882329124174407, + "grad_norm": 3.180208206176758, + "learning_rate": 5.098165953270991e-06, + "loss": 0.0906, + "step": 19712 + }, + { + "epoch": 0.4988485968064377, + "grad_norm": 7.380117416381836, + "learning_rate": 5.097764507808726e-06, + "loss": 0.1893, + "step": 19713 + }, + { + "epoch": 0.49887390237113144, + "grad_norm": 4.825465202331543, + "learning_rate": 5.0973630617159955e-06, + "loss": 0.1744, + "step": 19714 + }, + { + "epoch": 0.4988992079358251, + "grad_norm": 6.498161792755127, + "learning_rate": 5.096961614995389e-06, + "loss": 0.1727, + "step": 19715 + }, + { + "epoch": 0.49892451350051875, + "grad_norm": 6.222855091094971, + "learning_rate": 5.0965601676494935e-06, + "loss": 0.1548, + "step": 19716 + }, + { + "epoch": 0.49894981906521246, + "grad_norm": 5.438055515289307, + "learning_rate": 5.0961587196809004e-06, + "loss": 0.1902, + "step": 19717 + }, + { + "epoch": 0.4989751246299061, + "grad_norm": 3.4807896614074707, + "learning_rate": 5.095757271092199e-06, + "loss": 0.135, + "step": 19718 + }, + { + "epoch": 0.4990004301945998, + "grad_norm": 8.401917457580566, + "learning_rate": 5.095355821885975e-06, + "loss": 0.2679, + "step": 19719 + }, + { + "epoch": 0.4990257357592935, + "grad_norm": 3.232677698135376, + "learning_rate": 5.094954372064821e-06, + "loss": 0.1409, + "step": 19720 + }, + { + "epoch": 0.49905104132398714, + "grad_norm": 11.32041072845459, + "learning_rate": 5.0945529216313226e-06, + "loss": 0.2207, + "step": 19721 + }, + { + "epoch": 0.4990763468886808, + "grad_norm": 7.857549667358398, + "learning_rate": 5.09415147058807e-06, + "loss": 0.214, + "step": 19722 + }, + { + "epoch": 0.4991016524533745, + "grad_norm": 5.496176242828369, + "learning_rate": 5.093750018937652e-06, + "loss": 0.1421, + "step": 19723 + }, + { + "epoch": 0.49912695801806817, + "grad_norm": 6.051589012145996, + "learning_rate": 5.093348566682659e-06, + "loss": 0.218, + "step": 19724 + }, + { + "epoch": 0.4991522635827618, + "grad_norm": 4.3571085929870605, + "learning_rate": 5.092947113825677e-06, + "loss": 0.2278, + "step": 19725 + }, + { + "epoch": 0.49917756914745554, + "grad_norm": 17.83492660522461, + "learning_rate": 5.092545660369297e-06, + "loss": 0.2323, + "step": 19726 + }, + { + "epoch": 0.4992028747121492, + "grad_norm": 4.725334167480469, + "learning_rate": 5.092144206316106e-06, + "loss": 0.1148, + "step": 19727 + }, + { + "epoch": 0.4992281802768429, + "grad_norm": 5.644651889801025, + "learning_rate": 5.091742751668695e-06, + "loss": 0.1917, + "step": 19728 + }, + { + "epoch": 0.49925348584153656, + "grad_norm": 21.806230545043945, + "learning_rate": 5.091341296429653e-06, + "loss": 0.351, + "step": 19729 + }, + { + "epoch": 0.4992787914062302, + "grad_norm": 5.804430961608887, + "learning_rate": 5.090939840601566e-06, + "loss": 0.2248, + "step": 19730 + }, + { + "epoch": 0.4993040969709239, + "grad_norm": 4.478235721588135, + "learning_rate": 5.090538384187026e-06, + "loss": 0.1608, + "step": 19731 + }, + { + "epoch": 0.4993294025356176, + "grad_norm": 3.3550283908843994, + "learning_rate": 5.09013692718862e-06, + "loss": 0.1329, + "step": 19732 + }, + { + "epoch": 0.49935470810031124, + "grad_norm": 20.114652633666992, + "learning_rate": 5.08973546960894e-06, + "loss": 0.2233, + "step": 19733 + }, + { + "epoch": 0.49938001366500495, + "grad_norm": 7.970464706420898, + "learning_rate": 5.089334011450569e-06, + "loss": 0.2379, + "step": 19734 + }, + { + "epoch": 0.4994053192296986, + "grad_norm": 9.359029769897461, + "learning_rate": 5.088932552716103e-06, + "loss": 0.2358, + "step": 19735 + }, + { + "epoch": 0.49943062479439226, + "grad_norm": 9.12204647064209, + "learning_rate": 5.0885310934081256e-06, + "loss": 0.2242, + "step": 19736 + }, + { + "epoch": 0.499455930359086, + "grad_norm": 14.486281394958496, + "learning_rate": 5.088129633529229e-06, + "loss": 0.1644, + "step": 19737 + }, + { + "epoch": 0.49948123592377963, + "grad_norm": 6.043694496154785, + "learning_rate": 5.087728173082e-06, + "loss": 0.1344, + "step": 19738 + }, + { + "epoch": 0.4995065414884733, + "grad_norm": 7.930375576019287, + "learning_rate": 5.087326712069027e-06, + "loss": 0.1945, + "step": 19739 + }, + { + "epoch": 0.499531847053167, + "grad_norm": 3.8822884559631348, + "learning_rate": 5.086925250492902e-06, + "loss": 0.1197, + "step": 19740 + }, + { + "epoch": 0.49955715261786066, + "grad_norm": 8.341646194458008, + "learning_rate": 5.086523788356211e-06, + "loss": 0.1306, + "step": 19741 + }, + { + "epoch": 0.49958245818255437, + "grad_norm": 6.002223014831543, + "learning_rate": 5.086122325661547e-06, + "loss": 0.1916, + "step": 19742 + }, + { + "epoch": 0.499607763747248, + "grad_norm": 4.883945465087891, + "learning_rate": 5.085720862411493e-06, + "loss": 0.1845, + "step": 19743 + }, + { + "epoch": 0.4996330693119417, + "grad_norm": 5.582860946655273, + "learning_rate": 5.085319398608643e-06, + "loss": 0.1962, + "step": 19744 + }, + { + "epoch": 0.4996583748766354, + "grad_norm": 24.52635383605957, + "learning_rate": 5.084917934255582e-06, + "loss": 0.2245, + "step": 19745 + }, + { + "epoch": 0.49968368044132905, + "grad_norm": 9.01791763305664, + "learning_rate": 5.084516469354903e-06, + "loss": 0.1767, + "step": 19746 + }, + { + "epoch": 0.4997089860060227, + "grad_norm": 3.131282091140747, + "learning_rate": 5.084115003909192e-06, + "loss": 0.1517, + "step": 19747 + }, + { + "epoch": 0.4997342915707164, + "grad_norm": 10.256656646728516, + "learning_rate": 5.08371353792104e-06, + "loss": 0.3368, + "step": 19748 + }, + { + "epoch": 0.49975959713541007, + "grad_norm": 10.715710639953613, + "learning_rate": 5.083312071393034e-06, + "loss": 0.2112, + "step": 19749 + }, + { + "epoch": 0.49978490270010373, + "grad_norm": 4.013633728027344, + "learning_rate": 5.0829106043277645e-06, + "loss": 0.1832, + "step": 19750 + }, + { + "epoch": 0.49981020826479744, + "grad_norm": 5.4854865074157715, + "learning_rate": 5.082509136727819e-06, + "loss": 0.1676, + "step": 19751 + }, + { + "epoch": 0.4998355138294911, + "grad_norm": 5.306082725524902, + "learning_rate": 5.08210766859579e-06, + "loss": 0.1465, + "step": 19752 + }, + { + "epoch": 0.4998608193941848, + "grad_norm": 4.396441459655762, + "learning_rate": 5.081706199934262e-06, + "loss": 0.1311, + "step": 19753 + }, + { + "epoch": 0.49988612495887846, + "grad_norm": 9.034770011901855, + "learning_rate": 5.081304730745825e-06, + "loss": 0.2197, + "step": 19754 + }, + { + "epoch": 0.4999114305235721, + "grad_norm": 6.213046550750732, + "learning_rate": 5.080903261033071e-06, + "loss": 0.1265, + "step": 19755 + }, + { + "epoch": 0.49993673608826583, + "grad_norm": 9.65051555633545, + "learning_rate": 5.080501790798585e-06, + "loss": 0.1385, + "step": 19756 + }, + { + "epoch": 0.4999620416529595, + "grad_norm": 3.5721030235290527, + "learning_rate": 5.08010032004496e-06, + "loss": 0.1645, + "step": 19757 + }, + { + "epoch": 0.49998734721765314, + "grad_norm": 4.043337821960449, + "learning_rate": 5.07969884877478e-06, + "loss": 0.1799, + "step": 19758 + }, + { + "epoch": 0.5000126527823469, + "grad_norm": 4.567861557006836, + "learning_rate": 5.079297376990639e-06, + "loss": 0.1543, + "step": 19759 + }, + { + "epoch": 0.5000379583470406, + "grad_norm": 2.1698379516601562, + "learning_rate": 5.078895904695124e-06, + "loss": 0.1045, + "step": 19760 + }, + { + "epoch": 0.5000379583470406, + "eval_loss": 0.1833847463130951, + "eval_runtime": 69.8462, + "eval_samples_per_second": 45.729, + "eval_steps_per_second": 5.727, + "step": 19760 + }, + { + "epoch": 0.5000632639117342, + "grad_norm": 3.331338882446289, + "learning_rate": 5.078494431890822e-06, + "loss": 0.1325, + "step": 19761 + }, + { + "epoch": 0.5000885694764279, + "grad_norm": 7.732980251312256, + "learning_rate": 5.078092958580327e-06, + "loss": 0.1695, + "step": 19762 + }, + { + "epoch": 0.5001138750411216, + "grad_norm": 3.817960262298584, + "learning_rate": 5.077691484766223e-06, + "loss": 0.1145, + "step": 19763 + }, + { + "epoch": 0.5001391806058152, + "grad_norm": 4.32136869430542, + "learning_rate": 5.077290010451102e-06, + "loss": 0.1374, + "step": 19764 + }, + { + "epoch": 0.5001644861705089, + "grad_norm": 14.561602592468262, + "learning_rate": 5.076888535637551e-06, + "loss": 0.3064, + "step": 19765 + }, + { + "epoch": 0.5001897917352026, + "grad_norm": 6.005661964416504, + "learning_rate": 5.076487060328161e-06, + "loss": 0.1321, + "step": 19766 + }, + { + "epoch": 0.5002150972998962, + "grad_norm": 4.6862688064575195, + "learning_rate": 5.076085584525521e-06, + "loss": 0.1261, + "step": 19767 + }, + { + "epoch": 0.5002404028645899, + "grad_norm": 5.47272253036499, + "learning_rate": 5.075684108232218e-06, + "loss": 0.2199, + "step": 19768 + }, + { + "epoch": 0.5002657084292836, + "grad_norm": 8.815145492553711, + "learning_rate": 5.0752826314508415e-06, + "loss": 0.1859, + "step": 19769 + }, + { + "epoch": 0.5002910139939772, + "grad_norm": 3.3051397800445557, + "learning_rate": 5.074881154183983e-06, + "loss": 0.1597, + "step": 19770 + }, + { + "epoch": 0.500316319558671, + "grad_norm": 3.5586087703704834, + "learning_rate": 5.07447967643423e-06, + "loss": 0.1361, + "step": 19771 + }, + { + "epoch": 0.5003416251233647, + "grad_norm": 3.6562023162841797, + "learning_rate": 5.07407819820417e-06, + "loss": 0.1686, + "step": 19772 + }, + { + "epoch": 0.5003669306880583, + "grad_norm": 5.781057834625244, + "learning_rate": 5.0736767194963945e-06, + "loss": 0.1912, + "step": 19773 + }, + { + "epoch": 0.500392236252752, + "grad_norm": 8.051580429077148, + "learning_rate": 5.07327524031349e-06, + "loss": 0.3489, + "step": 19774 + }, + { + "epoch": 0.5004175418174457, + "grad_norm": 4.782625198364258, + "learning_rate": 5.072873760658049e-06, + "loss": 0.1617, + "step": 19775 + }, + { + "epoch": 0.5004428473821393, + "grad_norm": 12.950130462646484, + "learning_rate": 5.072472280532658e-06, + "loss": 0.3552, + "step": 19776 + }, + { + "epoch": 0.500468152946833, + "grad_norm": 2.7386863231658936, + "learning_rate": 5.0720707999399065e-06, + "loss": 0.1099, + "step": 19777 + }, + { + "epoch": 0.5004934585115267, + "grad_norm": 4.794193744659424, + "learning_rate": 5.071669318882384e-06, + "loss": 0.1583, + "step": 19778 + }, + { + "epoch": 0.5005187640762203, + "grad_norm": 5.318619251251221, + "learning_rate": 5.07126783736268e-06, + "loss": 0.2471, + "step": 19779 + }, + { + "epoch": 0.500544069640914, + "grad_norm": 5.544264316558838, + "learning_rate": 5.0708663553833816e-06, + "loss": 0.1956, + "step": 19780 + }, + { + "epoch": 0.5005693752056077, + "grad_norm": 7.509154319763184, + "learning_rate": 5.070464872947081e-06, + "loss": 0.1247, + "step": 19781 + }, + { + "epoch": 0.5005946807703013, + "grad_norm": 4.055263996124268, + "learning_rate": 5.070063390056365e-06, + "loss": 0.1696, + "step": 19782 + }, + { + "epoch": 0.500619986334995, + "grad_norm": 5.401088714599609, + "learning_rate": 5.069661906713822e-06, + "loss": 0.1773, + "step": 19783 + }, + { + "epoch": 0.5006452918996888, + "grad_norm": 9.270397186279297, + "learning_rate": 5.069260422922044e-06, + "loss": 0.1947, + "step": 19784 + }, + { + "epoch": 0.5006705974643825, + "grad_norm": 11.387310981750488, + "learning_rate": 5.068858938683617e-06, + "loss": 0.2342, + "step": 19785 + }, + { + "epoch": 0.5006959030290761, + "grad_norm": 3.953395128250122, + "learning_rate": 5.068457454001133e-06, + "loss": 0.1554, + "step": 19786 + }, + { + "epoch": 0.5007212085937698, + "grad_norm": 5.32750129699707, + "learning_rate": 5.068055968877178e-06, + "loss": 0.1042, + "step": 19787 + }, + { + "epoch": 0.5007465141584635, + "grad_norm": 9.97329044342041, + "learning_rate": 5.067654483314344e-06, + "loss": 0.2287, + "step": 19788 + }, + { + "epoch": 0.5007718197231571, + "grad_norm": 4.858685493469238, + "learning_rate": 5.067252997315218e-06, + "loss": 0.1365, + "step": 19789 + }, + { + "epoch": 0.5007971252878508, + "grad_norm": 15.407219886779785, + "learning_rate": 5.06685151088239e-06, + "loss": 0.2768, + "step": 19790 + }, + { + "epoch": 0.5008224308525445, + "grad_norm": 6.269042491912842, + "learning_rate": 5.066450024018449e-06, + "loss": 0.1562, + "step": 19791 + }, + { + "epoch": 0.5008477364172381, + "grad_norm": 3.9349446296691895, + "learning_rate": 5.066048536725984e-06, + "loss": 0.1593, + "step": 19792 + }, + { + "epoch": 0.5008730419819318, + "grad_norm": 3.4362564086914062, + "learning_rate": 5.0656470490075845e-06, + "loss": 0.0985, + "step": 19793 + }, + { + "epoch": 0.5008983475466255, + "grad_norm": 3.1466360092163086, + "learning_rate": 5.065245560865839e-06, + "loss": 0.137, + "step": 19794 + }, + { + "epoch": 0.5009236531113191, + "grad_norm": 5.355208873748779, + "learning_rate": 5.0648440723033386e-06, + "loss": 0.1793, + "step": 19795 + }, + { + "epoch": 0.5009489586760129, + "grad_norm": 4.3309712409973145, + "learning_rate": 5.06444258332267e-06, + "loss": 0.1761, + "step": 19796 + }, + { + "epoch": 0.5009742642407066, + "grad_norm": 3.8424625396728516, + "learning_rate": 5.064041093926423e-06, + "loss": 0.1337, + "step": 19797 + }, + { + "epoch": 0.5009995698054002, + "grad_norm": 3.373878240585327, + "learning_rate": 5.063639604117186e-06, + "loss": 0.095, + "step": 19798 + }, + { + "epoch": 0.5010248753700939, + "grad_norm": 8.684004783630371, + "learning_rate": 5.0632381138975506e-06, + "loss": 0.2265, + "step": 19799 + }, + { + "epoch": 0.5010501809347876, + "grad_norm": 7.753242492675781, + "learning_rate": 5.062836623270103e-06, + "loss": 0.1559, + "step": 19800 + }, + { + "epoch": 0.5010754864994812, + "grad_norm": 5.095597743988037, + "learning_rate": 5.062435132237435e-06, + "loss": 0.1418, + "step": 19801 + }, + { + "epoch": 0.5011007920641749, + "grad_norm": 2.7385215759277344, + "learning_rate": 5.0620336408021335e-06, + "loss": 0.1273, + "step": 19802 + }, + { + "epoch": 0.5011260976288686, + "grad_norm": 4.272070407867432, + "learning_rate": 5.061632148966788e-06, + "loss": 0.0979, + "step": 19803 + }, + { + "epoch": 0.5011514031935622, + "grad_norm": 3.565006732940674, + "learning_rate": 5.06123065673399e-06, + "loss": 0.1207, + "step": 19804 + }, + { + "epoch": 0.5011767087582559, + "grad_norm": 6.681431293487549, + "learning_rate": 5.060829164106326e-06, + "loss": 0.2515, + "step": 19805 + }, + { + "epoch": 0.5012020143229496, + "grad_norm": 3.465951681137085, + "learning_rate": 5.060427671086386e-06, + "loss": 0.1635, + "step": 19806 + }, + { + "epoch": 0.5012273198876432, + "grad_norm": 4.451186656951904, + "learning_rate": 5.060026177676759e-06, + "loss": 0.1307, + "step": 19807 + }, + { + "epoch": 0.501252625452337, + "grad_norm": 6.823739051818848, + "learning_rate": 5.059624683880035e-06, + "loss": 0.1594, + "step": 19808 + }, + { + "epoch": 0.5012779310170307, + "grad_norm": 3.8742542266845703, + "learning_rate": 5.059223189698802e-06, + "loss": 0.1445, + "step": 19809 + }, + { + "epoch": 0.5013032365817244, + "grad_norm": 3.3248977661132812, + "learning_rate": 5.058821695135649e-06, + "loss": 0.1078, + "step": 19810 + }, + { + "epoch": 0.501328542146418, + "grad_norm": 3.4721028804779053, + "learning_rate": 5.0584202001931685e-06, + "loss": 0.1417, + "step": 19811 + }, + { + "epoch": 0.5013538477111117, + "grad_norm": 3.827099084854126, + "learning_rate": 5.058018704873944e-06, + "loss": 0.1375, + "step": 19812 + }, + { + "epoch": 0.5013791532758054, + "grad_norm": 9.564982414245605, + "learning_rate": 5.0576172091805685e-06, + "loss": 0.237, + "step": 19813 + }, + { + "epoch": 0.501404458840499, + "grad_norm": 3.101902961730957, + "learning_rate": 5.05721571311563e-06, + "loss": 0.1086, + "step": 19814 + }, + { + "epoch": 0.5014297644051927, + "grad_norm": 3.078496217727661, + "learning_rate": 5.05681421668172e-06, + "loss": 0.0652, + "step": 19815 + }, + { + "epoch": 0.5014550699698864, + "grad_norm": 8.045494079589844, + "learning_rate": 5.056412719881424e-06, + "loss": 0.1771, + "step": 19816 + }, + { + "epoch": 0.50148037553458, + "grad_norm": 4.436280250549316, + "learning_rate": 5.056011222717333e-06, + "loss": 0.1031, + "step": 19817 + }, + { + "epoch": 0.5015056810992737, + "grad_norm": 9.350034713745117, + "learning_rate": 5.055609725192036e-06, + "loss": 0.2584, + "step": 19818 + }, + { + "epoch": 0.5015309866639674, + "grad_norm": 9.800971984863281, + "learning_rate": 5.055208227308122e-06, + "loss": 0.3348, + "step": 19819 + }, + { + "epoch": 0.501556292228661, + "grad_norm": 3.329615354537964, + "learning_rate": 5.0548067290681815e-06, + "loss": 0.126, + "step": 19820 + }, + { + "epoch": 0.5015815977933548, + "grad_norm": 6.2223687171936035, + "learning_rate": 5.054405230474803e-06, + "loss": 0.1785, + "step": 19821 + }, + { + "epoch": 0.5016069033580485, + "grad_norm": 2.8796327114105225, + "learning_rate": 5.0540037315305724e-06, + "loss": 0.1161, + "step": 19822 + }, + { + "epoch": 0.5016322089227421, + "grad_norm": 8.946219444274902, + "learning_rate": 5.053602232238085e-06, + "loss": 0.2273, + "step": 19823 + }, + { + "epoch": 0.5016575144874358, + "grad_norm": 6.496376037597656, + "learning_rate": 5.0532007325999245e-06, + "loss": 0.1049, + "step": 19824 + }, + { + "epoch": 0.5016828200521295, + "grad_norm": 3.288179397583008, + "learning_rate": 5.052799232618683e-06, + "loss": 0.0989, + "step": 19825 + }, + { + "epoch": 0.5017081256168231, + "grad_norm": 2.4723551273345947, + "learning_rate": 5.052397732296949e-06, + "loss": 0.1143, + "step": 19826 + }, + { + "epoch": 0.5017334311815168, + "grad_norm": 4.544955253601074, + "learning_rate": 5.0519962316373115e-06, + "loss": 0.1412, + "step": 19827 + }, + { + "epoch": 0.5017587367462105, + "grad_norm": 7.257899761199951, + "learning_rate": 5.051594730642362e-06, + "loss": 0.2038, + "step": 19828 + }, + { + "epoch": 0.5017840423109041, + "grad_norm": 6.6743950843811035, + "learning_rate": 5.0511932293146845e-06, + "loss": 0.2267, + "step": 19829 + }, + { + "epoch": 0.5018093478755978, + "grad_norm": 2.905790328979492, + "learning_rate": 5.050791727656874e-06, + "loss": 0.1107, + "step": 19830 + }, + { + "epoch": 0.5018346534402915, + "grad_norm": 5.839561939239502, + "learning_rate": 5.050390225671516e-06, + "loss": 0.1559, + "step": 19831 + }, + { + "epoch": 0.5018599590049851, + "grad_norm": 6.158758163452148, + "learning_rate": 5.049988723361202e-06, + "loss": 0.1434, + "step": 19832 + }, + { + "epoch": 0.5018852645696789, + "grad_norm": 10.105592727661133, + "learning_rate": 5.049587220728518e-06, + "loss": 0.4263, + "step": 19833 + }, + { + "epoch": 0.5019105701343726, + "grad_norm": 12.735252380371094, + "learning_rate": 5.049185717776056e-06, + "loss": 0.1608, + "step": 19834 + }, + { + "epoch": 0.5019358756990662, + "grad_norm": 3.119962692260742, + "learning_rate": 5.048784214506405e-06, + "loss": 0.1604, + "step": 19835 + }, + { + "epoch": 0.5019611812637599, + "grad_norm": 9.459308624267578, + "learning_rate": 5.048382710922153e-06, + "loss": 0.2239, + "step": 19836 + }, + { + "epoch": 0.5019864868284536, + "grad_norm": 8.1657075881958, + "learning_rate": 5.04798120702589e-06, + "loss": 0.1898, + "step": 19837 + }, + { + "epoch": 0.5020117923931473, + "grad_norm": 4.485510349273682, + "learning_rate": 5.047579702820207e-06, + "loss": 0.1712, + "step": 19838 + }, + { + "epoch": 0.5020370979578409, + "grad_norm": 5.789766311645508, + "learning_rate": 5.047178198307689e-06, + "loss": 0.1893, + "step": 19839 + }, + { + "epoch": 0.5020624035225346, + "grad_norm": 2.3355166912078857, + "learning_rate": 5.046776693490929e-06, + "loss": 0.0765, + "step": 19840 + }, + { + "epoch": 0.5020877090872283, + "grad_norm": 4.616065979003906, + "learning_rate": 5.046375188372513e-06, + "loss": 0.1323, + "step": 19841 + }, + { + "epoch": 0.5021130146519219, + "grad_norm": 9.614036560058594, + "learning_rate": 5.045973682955034e-06, + "loss": 0.259, + "step": 19842 + }, + { + "epoch": 0.5021383202166156, + "grad_norm": 5.943368434906006, + "learning_rate": 5.045572177241079e-06, + "loss": 0.1971, + "step": 19843 + }, + { + "epoch": 0.5021636257813094, + "grad_norm": 15.19498062133789, + "learning_rate": 5.045170671233237e-06, + "loss": 0.18, + "step": 19844 + }, + { + "epoch": 0.502188931346003, + "grad_norm": 4.486596584320068, + "learning_rate": 5.044769164934097e-06, + "loss": 0.1195, + "step": 19845 + }, + { + "epoch": 0.5022142369106967, + "grad_norm": 5.013003826141357, + "learning_rate": 5.044367658346251e-06, + "loss": 0.2187, + "step": 19846 + }, + { + "epoch": 0.5022395424753904, + "grad_norm": 4.286325931549072, + "learning_rate": 5.043966151472284e-06, + "loss": 0.134, + "step": 19847 + }, + { + "epoch": 0.502264848040084, + "grad_norm": 9.911590576171875, + "learning_rate": 5.043564644314789e-06, + "loss": 0.2387, + "step": 19848 + }, + { + "epoch": 0.5022901536047777, + "grad_norm": 5.9553937911987305, + "learning_rate": 5.0431631368763525e-06, + "loss": 0.2118, + "step": 19849 + }, + { + "epoch": 0.5023154591694714, + "grad_norm": 8.29754638671875, + "learning_rate": 5.042761629159566e-06, + "loss": 0.1031, + "step": 19850 + }, + { + "epoch": 0.502340764734165, + "grad_norm": 3.223721742630005, + "learning_rate": 5.042360121167017e-06, + "loss": 0.1251, + "step": 19851 + }, + { + "epoch": 0.5023660702988587, + "grad_norm": 4.1085028648376465, + "learning_rate": 5.041958612901295e-06, + "loss": 0.172, + "step": 19852 + }, + { + "epoch": 0.5023913758635524, + "grad_norm": 4.951493263244629, + "learning_rate": 5.04155710436499e-06, + "loss": 0.1369, + "step": 19853 + }, + { + "epoch": 0.502416681428246, + "grad_norm": 7.364076137542725, + "learning_rate": 5.041155595560693e-06, + "loss": 0.1654, + "step": 19854 + }, + { + "epoch": 0.5024419869929397, + "grad_norm": 6.935807228088379, + "learning_rate": 5.040754086490988e-06, + "loss": 0.2502, + "step": 19855 + }, + { + "epoch": 0.5024672925576334, + "grad_norm": 9.746005058288574, + "learning_rate": 5.040352577158469e-06, + "loss": 0.2566, + "step": 19856 + }, + { + "epoch": 0.502492598122327, + "grad_norm": 3.4182069301605225, + "learning_rate": 5.0399510675657245e-06, + "loss": 0.1134, + "step": 19857 + }, + { + "epoch": 0.5025179036870208, + "grad_norm": 13.3544921875, + "learning_rate": 5.039549557715342e-06, + "loss": 0.2789, + "step": 19858 + }, + { + "epoch": 0.5025432092517145, + "grad_norm": 5.208930492401123, + "learning_rate": 5.039148047609913e-06, + "loss": 0.1868, + "step": 19859 + }, + { + "epoch": 0.5025685148164081, + "grad_norm": 6.446686267852783, + "learning_rate": 5.038746537252024e-06, + "loss": 0.2378, + "step": 19860 + }, + { + "epoch": 0.5025938203811018, + "grad_norm": 4.7659525871276855, + "learning_rate": 5.038345026644267e-06, + "loss": 0.1621, + "step": 19861 + }, + { + "epoch": 0.5026191259457955, + "grad_norm": 3.441032886505127, + "learning_rate": 5.037943515789229e-06, + "loss": 0.1217, + "step": 19862 + }, + { + "epoch": 0.5026444315104892, + "grad_norm": 7.61435604095459, + "learning_rate": 5.037542004689501e-06, + "loss": 0.0873, + "step": 19863 + }, + { + "epoch": 0.5026697370751828, + "grad_norm": 9.911581039428711, + "learning_rate": 5.0371404933476704e-06, + "loss": 0.2122, + "step": 19864 + }, + { + "epoch": 0.5026950426398765, + "grad_norm": 4.416650295257568, + "learning_rate": 5.03673898176633e-06, + "loss": 0.1276, + "step": 19865 + }, + { + "epoch": 0.5027203482045702, + "grad_norm": 2.709097146987915, + "learning_rate": 5.036337469948065e-06, + "loss": 0.1157, + "step": 19866 + }, + { + "epoch": 0.5027456537692638, + "grad_norm": 7.635262489318848, + "learning_rate": 5.035935957895467e-06, + "loss": 0.2024, + "step": 19867 + }, + { + "epoch": 0.5027709593339575, + "grad_norm": 5.512397766113281, + "learning_rate": 5.035534445611123e-06, + "loss": 0.1655, + "step": 19868 + }, + { + "epoch": 0.5027962648986513, + "grad_norm": 6.232210636138916, + "learning_rate": 5.0351329330976275e-06, + "loss": 0.2643, + "step": 19869 + }, + { + "epoch": 0.5028215704633449, + "grad_norm": 27.891029357910156, + "learning_rate": 5.034731420357563e-06, + "loss": 0.5178, + "step": 19870 + }, + { + "epoch": 0.5028468760280386, + "grad_norm": 2.887859344482422, + "learning_rate": 5.0343299073935234e-06, + "loss": 0.1153, + "step": 19871 + }, + { + "epoch": 0.5028721815927323, + "grad_norm": 6.187366008758545, + "learning_rate": 5.033928394208097e-06, + "loss": 0.1322, + "step": 19872 + }, + { + "epoch": 0.5028974871574259, + "grad_norm": 3.4904239177703857, + "learning_rate": 5.033526880803872e-06, + "loss": 0.1495, + "step": 19873 + }, + { + "epoch": 0.5029227927221196, + "grad_norm": 3.960970878601074, + "learning_rate": 5.033125367183439e-06, + "loss": 0.1288, + "step": 19874 + }, + { + "epoch": 0.5029480982868133, + "grad_norm": 6.1308159828186035, + "learning_rate": 5.032723853349386e-06, + "loss": 0.218, + "step": 19875 + }, + { + "epoch": 0.5029734038515069, + "grad_norm": 4.243858337402344, + "learning_rate": 5.032322339304303e-06, + "loss": 0.1698, + "step": 19876 + }, + { + "epoch": 0.5029987094162006, + "grad_norm": 5.140676975250244, + "learning_rate": 5.031920825050779e-06, + "loss": 0.1239, + "step": 19877 + }, + { + "epoch": 0.5030240149808943, + "grad_norm": 3.4112777709960938, + "learning_rate": 5.031519310591404e-06, + "loss": 0.1504, + "step": 19878 + }, + { + "epoch": 0.5030493205455879, + "grad_norm": 5.842533588409424, + "learning_rate": 5.031117795928767e-06, + "loss": 0.1651, + "step": 19879 + }, + { + "epoch": 0.5030746261102816, + "grad_norm": 7.100588798522949, + "learning_rate": 5.030716281065456e-06, + "loss": 0.1276, + "step": 19880 + }, + { + "epoch": 0.5030999316749754, + "grad_norm": 10.140803337097168, + "learning_rate": 5.030314766004063e-06, + "loss": 0.1654, + "step": 19881 + }, + { + "epoch": 0.503125237239669, + "grad_norm": 6.836434364318848, + "learning_rate": 5.029913250747175e-06, + "loss": 0.1759, + "step": 19882 + }, + { + "epoch": 0.5031505428043627, + "grad_norm": 3.152240514755249, + "learning_rate": 5.029511735297382e-06, + "loss": 0.1118, + "step": 19883 + }, + { + "epoch": 0.5031758483690564, + "grad_norm": 4.454294681549072, + "learning_rate": 5.029110219657273e-06, + "loss": 0.1518, + "step": 19884 + }, + { + "epoch": 0.50320115393375, + "grad_norm": 17.44158935546875, + "learning_rate": 5.028708703829439e-06, + "loss": 0.2884, + "step": 19885 + }, + { + "epoch": 0.5032264594984437, + "grad_norm": 19.829288482666016, + "learning_rate": 5.028307187816466e-06, + "loss": 0.3281, + "step": 19886 + }, + { + "epoch": 0.5032517650631374, + "grad_norm": 4.024176597595215, + "learning_rate": 5.027905671620946e-06, + "loss": 0.1884, + "step": 19887 + }, + { + "epoch": 0.5032770706278311, + "grad_norm": 3.205538511276245, + "learning_rate": 5.027504155245467e-06, + "loss": 0.0833, + "step": 19888 + }, + { + "epoch": 0.5033023761925247, + "grad_norm": 6.552381992340088, + "learning_rate": 5.027102638692619e-06, + "loss": 0.2999, + "step": 19889 + }, + { + "epoch": 0.5033276817572184, + "grad_norm": 4.530178070068359, + "learning_rate": 5.026701121964993e-06, + "loss": 0.1509, + "step": 19890 + }, + { + "epoch": 0.5033529873219121, + "grad_norm": 3.001927137374878, + "learning_rate": 5.026299605065174e-06, + "loss": 0.0949, + "step": 19891 + }, + { + "epoch": 0.5033782928866057, + "grad_norm": 10.514060020446777, + "learning_rate": 5.025898087995755e-06, + "loss": 0.1606, + "step": 19892 + }, + { + "epoch": 0.5034035984512994, + "grad_norm": 3.053011417388916, + "learning_rate": 5.025496570759324e-06, + "loss": 0.0584, + "step": 19893 + }, + { + "epoch": 0.5034289040159932, + "grad_norm": 4.329679012298584, + "learning_rate": 5.0250950533584696e-06, + "loss": 0.182, + "step": 19894 + }, + { + "epoch": 0.5034542095806868, + "grad_norm": 5.4740986824035645, + "learning_rate": 5.024693535795783e-06, + "loss": 0.1458, + "step": 19895 + }, + { + "epoch": 0.5034795151453805, + "grad_norm": 3.28389310836792, + "learning_rate": 5.024292018073852e-06, + "loss": 0.1567, + "step": 19896 + }, + { + "epoch": 0.5035048207100742, + "grad_norm": 6.6696672439575195, + "learning_rate": 5.023890500195265e-06, + "loss": 0.1083, + "step": 19897 + }, + { + "epoch": 0.5035301262747678, + "grad_norm": 5.296629428863525, + "learning_rate": 5.0234889821626146e-06, + "loss": 0.1962, + "step": 19898 + }, + { + "epoch": 0.5035554318394615, + "grad_norm": 6.503430366516113, + "learning_rate": 5.0230874639784876e-06, + "loss": 0.1771, + "step": 19899 + }, + { + "epoch": 0.5035807374041552, + "grad_norm": 7.156501293182373, + "learning_rate": 5.0226859456454725e-06, + "loss": 0.1527, + "step": 19900 + }, + { + "epoch": 0.5036060429688488, + "grad_norm": 3.4106178283691406, + "learning_rate": 5.022284427166162e-06, + "loss": 0.0748, + "step": 19901 + }, + { + "epoch": 0.5036313485335425, + "grad_norm": 4.138420581817627, + "learning_rate": 5.021882908543142e-06, + "loss": 0.0801, + "step": 19902 + }, + { + "epoch": 0.5036566540982362, + "grad_norm": 6.135233402252197, + "learning_rate": 5.021481389779004e-06, + "loss": 0.2356, + "step": 19903 + }, + { + "epoch": 0.5036819596629298, + "grad_norm": 3.9007887840270996, + "learning_rate": 5.021079870876335e-06, + "loss": 0.1596, + "step": 19904 + }, + { + "epoch": 0.5037072652276235, + "grad_norm": 5.283244609832764, + "learning_rate": 5.0206783518377274e-06, + "loss": 0.1437, + "step": 19905 + }, + { + "epoch": 0.5037325707923173, + "grad_norm": 5.814212799072266, + "learning_rate": 5.020276832665768e-06, + "loss": 0.1921, + "step": 19906 + }, + { + "epoch": 0.5037578763570109, + "grad_norm": 11.58663558959961, + "learning_rate": 5.019875313363047e-06, + "loss": 0.14, + "step": 19907 + }, + { + "epoch": 0.5037831819217046, + "grad_norm": 4.602602005004883, + "learning_rate": 5.019473793932155e-06, + "loss": 0.1991, + "step": 19908 + }, + { + "epoch": 0.5038084874863983, + "grad_norm": 4.802772521972656, + "learning_rate": 5.0190722743756785e-06, + "loss": 0.0989, + "step": 19909 + }, + { + "epoch": 0.5038337930510919, + "grad_norm": 3.2621188163757324, + "learning_rate": 5.01867075469621e-06, + "loss": 0.18, + "step": 19910 + }, + { + "epoch": 0.5038590986157856, + "grad_norm": 4.692431926727295, + "learning_rate": 5.018269234896336e-06, + "loss": 0.2046, + "step": 19911 + }, + { + "epoch": 0.5038844041804793, + "grad_norm": 4.203329086303711, + "learning_rate": 5.017867714978648e-06, + "loss": 0.1775, + "step": 19912 + }, + { + "epoch": 0.503909709745173, + "grad_norm": 13.100271224975586, + "learning_rate": 5.017466194945734e-06, + "loss": 0.2558, + "step": 19913 + }, + { + "epoch": 0.5039350153098666, + "grad_norm": 6.674712657928467, + "learning_rate": 5.017064674800184e-06, + "loss": 0.2301, + "step": 19914 + }, + { + "epoch": 0.5039603208745603, + "grad_norm": 4.661986351013184, + "learning_rate": 5.016663154544587e-06, + "loss": 0.1734, + "step": 19915 + }, + { + "epoch": 0.503985626439254, + "grad_norm": 4.358842849731445, + "learning_rate": 5.016261634181532e-06, + "loss": 0.1597, + "step": 19916 + }, + { + "epoch": 0.5040109320039476, + "grad_norm": 3.6278562545776367, + "learning_rate": 5.0158601137136085e-06, + "loss": 0.124, + "step": 19917 + }, + { + "epoch": 0.5040362375686414, + "grad_norm": 6.461751937866211, + "learning_rate": 5.0154585931434075e-06, + "loss": 0.1827, + "step": 19918 + }, + { + "epoch": 0.5040615431333351, + "grad_norm": 2.339125633239746, + "learning_rate": 5.015057072473516e-06, + "loss": 0.0856, + "step": 19919 + }, + { + "epoch": 0.5040868486980287, + "grad_norm": 3.4511196613311768, + "learning_rate": 5.0146555517065236e-06, + "loss": 0.1287, + "step": 19920 + }, + { + "epoch": 0.5041121542627224, + "grad_norm": 7.468878746032715, + "learning_rate": 5.014254030845022e-06, + "loss": 0.1821, + "step": 19921 + }, + { + "epoch": 0.5041374598274161, + "grad_norm": 6.810162544250488, + "learning_rate": 5.013852509891597e-06, + "loss": 0.1839, + "step": 19922 + }, + { + "epoch": 0.5041627653921097, + "grad_norm": 4.411769390106201, + "learning_rate": 5.0134509888488405e-06, + "loss": 0.1638, + "step": 19923 + }, + { + "epoch": 0.5041880709568034, + "grad_norm": 3.4728612899780273, + "learning_rate": 5.013049467719341e-06, + "loss": 0.1406, + "step": 19924 + }, + { + "epoch": 0.5042133765214971, + "grad_norm": 5.11001443862915, + "learning_rate": 5.012647946505689e-06, + "loss": 0.1816, + "step": 19925 + }, + { + "epoch": 0.5042386820861907, + "grad_norm": 7.2884368896484375, + "learning_rate": 5.012246425210471e-06, + "loss": 0.2965, + "step": 19926 + }, + { + "epoch": 0.5042639876508844, + "grad_norm": 4.434496879577637, + "learning_rate": 5.011844903836279e-06, + "loss": 0.1229, + "step": 19927 + }, + { + "epoch": 0.5042892932155781, + "grad_norm": 3.375345468521118, + "learning_rate": 5.011443382385702e-06, + "loss": 0.1616, + "step": 19928 + }, + { + "epoch": 0.5043145987802717, + "grad_norm": 12.78126049041748, + "learning_rate": 5.011041860861327e-06, + "loss": 0.2728, + "step": 19929 + }, + { + "epoch": 0.5043399043449655, + "grad_norm": 11.751976013183594, + "learning_rate": 5.010640339265748e-06, + "loss": 0.3656, + "step": 19930 + }, + { + "epoch": 0.5043652099096592, + "grad_norm": 4.452877521514893, + "learning_rate": 5.010238817601548e-06, + "loss": 0.1278, + "step": 19931 + }, + { + "epoch": 0.5043905154743528, + "grad_norm": 9.217984199523926, + "learning_rate": 5.009837295871322e-06, + "loss": 0.1639, + "step": 19932 + }, + { + "epoch": 0.5044158210390465, + "grad_norm": 6.983232498168945, + "learning_rate": 5.009435774077655e-06, + "loss": 0.2287, + "step": 19933 + }, + { + "epoch": 0.5044411266037402, + "grad_norm": 7.725198745727539, + "learning_rate": 5.0090342522231415e-06, + "loss": 0.2819, + "step": 19934 + }, + { + "epoch": 0.5044664321684338, + "grad_norm": 4.091191291809082, + "learning_rate": 5.008632730310365e-06, + "loss": 0.1565, + "step": 19935 + }, + { + "epoch": 0.5044917377331275, + "grad_norm": 13.590822219848633, + "learning_rate": 5.00823120834192e-06, + "loss": 0.2381, + "step": 19936 + }, + { + "epoch": 0.5045170432978212, + "grad_norm": 4.923109531402588, + "learning_rate": 5.007829686320391e-06, + "loss": 0.2478, + "step": 19937 + }, + { + "epoch": 0.5045423488625149, + "grad_norm": 4.827347755432129, + "learning_rate": 5.007428164248373e-06, + "loss": 0.1712, + "step": 19938 + }, + { + "epoch": 0.5045676544272085, + "grad_norm": 13.791561126708984, + "learning_rate": 5.007026642128449e-06, + "loss": 0.2258, + "step": 19939 + }, + { + "epoch": 0.5045929599919022, + "grad_norm": 6.390516757965088, + "learning_rate": 5.0066251199632135e-06, + "loss": 0.1759, + "step": 19940 + }, + { + "epoch": 0.504618265556596, + "grad_norm": 5.642109394073486, + "learning_rate": 5.006223597755252e-06, + "loss": 0.1517, + "step": 19941 + }, + { + "epoch": 0.5046435711212895, + "grad_norm": 5.2523417472839355, + "learning_rate": 5.005822075507156e-06, + "loss": 0.1738, + "step": 19942 + }, + { + "epoch": 0.5046688766859833, + "grad_norm": 5.991734981536865, + "learning_rate": 5.005420553221516e-06, + "loss": 0.2043, + "step": 19943 + }, + { + "epoch": 0.504694182250677, + "grad_norm": 16.79959487915039, + "learning_rate": 5.00501903090092e-06, + "loss": 0.192, + "step": 19944 + }, + { + "epoch": 0.5047194878153706, + "grad_norm": 4.571465015411377, + "learning_rate": 5.004617508547956e-06, + "loss": 0.1635, + "step": 19945 + }, + { + "epoch": 0.5047447933800643, + "grad_norm": 3.750929117202759, + "learning_rate": 5.004215986165214e-06, + "loss": 0.1079, + "step": 19946 + }, + { + "epoch": 0.504770098944758, + "grad_norm": 6.154329776763916, + "learning_rate": 5.003814463755286e-06, + "loss": 0.223, + "step": 19947 + }, + { + "epoch": 0.5047954045094516, + "grad_norm": 2.570894956588745, + "learning_rate": 5.003412941320757e-06, + "loss": 0.0909, + "step": 19948 + }, + { + "epoch": 0.5048207100741453, + "grad_norm": 3.684753894805908, + "learning_rate": 5.00301141886422e-06, + "loss": 0.1142, + "step": 19949 + }, + { + "epoch": 0.504846015638839, + "grad_norm": 4.7886834144592285, + "learning_rate": 5.002609896388262e-06, + "loss": 0.1199, + "step": 19950 + }, + { + "epoch": 0.5048713212035326, + "grad_norm": 6.6285719871521, + "learning_rate": 5.002208373895474e-06, + "loss": 0.1841, + "step": 19951 + }, + { + "epoch": 0.5048966267682263, + "grad_norm": 2.586641788482666, + "learning_rate": 5.001806851388445e-06, + "loss": 0.1402, + "step": 19952 + }, + { + "epoch": 0.50492193233292, + "grad_norm": 4.533433437347412, + "learning_rate": 5.001405328869761e-06, + "loss": 0.1431, + "step": 19953 + }, + { + "epoch": 0.5049472378976136, + "grad_norm": 11.407090187072754, + "learning_rate": 5.001003806342019e-06, + "loss": 0.3314, + "step": 19954 + }, + { + "epoch": 0.5049725434623074, + "grad_norm": 13.112309455871582, + "learning_rate": 5.000602283807801e-06, + "loss": 0.2246, + "step": 19955 + }, + { + "epoch": 0.5049978490270011, + "grad_norm": 4.077613353729248, + "learning_rate": 5.000200761269699e-06, + "loss": 0.2243, + "step": 19956 + }, + { + "epoch": 0.5050231545916947, + "grad_norm": 5.66317892074585, + "learning_rate": 4.999799238730303e-06, + "loss": 0.1379, + "step": 19957 + }, + { + "epoch": 0.5050484601563884, + "grad_norm": 4.3661909103393555, + "learning_rate": 4.9993977161922e-06, + "loss": 0.1841, + "step": 19958 + }, + { + "epoch": 0.5050737657210821, + "grad_norm": 7.406411170959473, + "learning_rate": 4.998996193657984e-06, + "loss": 0.214, + "step": 19959 + }, + { + "epoch": 0.5050990712857757, + "grad_norm": 5.544412136077881, + "learning_rate": 4.998594671130239e-06, + "loss": 0.1471, + "step": 19960 + }, + { + "epoch": 0.5051243768504694, + "grad_norm": 12.579130172729492, + "learning_rate": 4.998193148611557e-06, + "loss": 0.3118, + "step": 19961 + }, + { + "epoch": 0.5051496824151631, + "grad_norm": 4.521254062652588, + "learning_rate": 4.997791626104529e-06, + "loss": 0.1603, + "step": 19962 + }, + { + "epoch": 0.5051749879798567, + "grad_norm": 4.626611709594727, + "learning_rate": 4.99739010361174e-06, + "loss": 0.1611, + "step": 19963 + }, + { + "epoch": 0.5052002935445504, + "grad_norm": 4.348204612731934, + "learning_rate": 4.996988581135782e-06, + "loss": 0.1362, + "step": 19964 + }, + { + "epoch": 0.5052255991092441, + "grad_norm": 9.869780540466309, + "learning_rate": 4.996587058679243e-06, + "loss": 0.1784, + "step": 19965 + }, + { + "epoch": 0.5052509046739379, + "grad_norm": 3.0326130390167236, + "learning_rate": 4.996185536244717e-06, + "loss": 0.1759, + "step": 19966 + }, + { + "epoch": 0.5052762102386315, + "grad_norm": 2.4995992183685303, + "learning_rate": 4.995784013834787e-06, + "loss": 0.1247, + "step": 19967 + }, + { + "epoch": 0.5053015158033252, + "grad_norm": 7.801449775695801, + "learning_rate": 4.995382491452045e-06, + "loss": 0.2552, + "step": 19968 + }, + { + "epoch": 0.5053268213680189, + "grad_norm": 4.13981819152832, + "learning_rate": 4.994980969099081e-06, + "loss": 0.1322, + "step": 19969 + }, + { + "epoch": 0.5053521269327125, + "grad_norm": 3.2886719703674316, + "learning_rate": 4.994579446778487e-06, + "loss": 0.1562, + "step": 19970 + }, + { + "epoch": 0.5053774324974062, + "grad_norm": 8.74225902557373, + "learning_rate": 4.994177924492845e-06, + "loss": 0.3242, + "step": 19971 + }, + { + "epoch": 0.5054027380620999, + "grad_norm": 5.1997809410095215, + "learning_rate": 4.993776402244749e-06, + "loss": 0.1575, + "step": 19972 + }, + { + "epoch": 0.5054280436267935, + "grad_norm": 9.626298904418945, + "learning_rate": 4.993374880036788e-06, + "loss": 0.2193, + "step": 19973 + }, + { + "epoch": 0.5054533491914872, + "grad_norm": 4.328737735748291, + "learning_rate": 4.992973357871551e-06, + "loss": 0.1368, + "step": 19974 + }, + { + "epoch": 0.5054786547561809, + "grad_norm": 2.4933993816375732, + "learning_rate": 4.99257183575163e-06, + "loss": 0.1554, + "step": 19975 + }, + { + "epoch": 0.5055039603208745, + "grad_norm": 5.554506301879883, + "learning_rate": 4.99217031367961e-06, + "loss": 0.2537, + "step": 19976 + }, + { + "epoch": 0.5055292658855682, + "grad_norm": 4.28810453414917, + "learning_rate": 4.991768791658081e-06, + "loss": 0.204, + "step": 19977 + }, + { + "epoch": 0.505554571450262, + "grad_norm": 4.571743011474609, + "learning_rate": 4.991367269689635e-06, + "loss": 0.1536, + "step": 19978 + }, + { + "epoch": 0.5055798770149555, + "grad_norm": 4.071448802947998, + "learning_rate": 4.990965747776861e-06, + "loss": 0.1637, + "step": 19979 + }, + { + "epoch": 0.5056051825796493, + "grad_norm": 5.092203140258789, + "learning_rate": 4.9905642259223455e-06, + "loss": 0.1544, + "step": 19980 + }, + { + "epoch": 0.505630488144343, + "grad_norm": 8.871820449829102, + "learning_rate": 4.990162704128679e-06, + "loss": 0.2922, + "step": 19981 + }, + { + "epoch": 0.5056557937090366, + "grad_norm": 6.7093682289123535, + "learning_rate": 4.989761182398452e-06, + "loss": 0.2355, + "step": 19982 + }, + { + "epoch": 0.5056810992737303, + "grad_norm": 6.819281578063965, + "learning_rate": 4.989359660734255e-06, + "loss": 0.1993, + "step": 19983 + }, + { + "epoch": 0.505706404838424, + "grad_norm": 6.669326305389404, + "learning_rate": 4.9889581391386736e-06, + "loss": 0.2093, + "step": 19984 + }, + { + "epoch": 0.5057317104031176, + "grad_norm": 6.3118438720703125, + "learning_rate": 4.988556617614299e-06, + "loss": 0.1459, + "step": 19985 + }, + { + "epoch": 0.5057570159678113, + "grad_norm": 8.10925579071045, + "learning_rate": 4.988155096163723e-06, + "loss": 0.2399, + "step": 19986 + }, + { + "epoch": 0.505782321532505, + "grad_norm": 4.629091262817383, + "learning_rate": 4.98775357478953e-06, + "loss": 0.1777, + "step": 19987 + }, + { + "epoch": 0.5058076270971986, + "grad_norm": 6.961804389953613, + "learning_rate": 4.9873520534943135e-06, + "loss": 0.2113, + "step": 19988 + }, + { + "epoch": 0.5058329326618923, + "grad_norm": 7.0234293937683105, + "learning_rate": 4.9869505322806605e-06, + "loss": 0.1866, + "step": 19989 + }, + { + "epoch": 0.505858238226586, + "grad_norm": 2.88698148727417, + "learning_rate": 4.98654901115116e-06, + "loss": 0.1642, + "step": 19990 + }, + { + "epoch": 0.5058835437912798, + "grad_norm": 5.021082401275635, + "learning_rate": 4.986147490108404e-06, + "loss": 0.1945, + "step": 19991 + }, + { + "epoch": 0.5059088493559734, + "grad_norm": 5.72135066986084, + "learning_rate": 4.985745969154981e-06, + "loss": 0.1724, + "step": 19992 + }, + { + "epoch": 0.5059341549206671, + "grad_norm": 4.494326591491699, + "learning_rate": 4.985344448293478e-06, + "loss": 0.1481, + "step": 19993 + }, + { + "epoch": 0.5059594604853608, + "grad_norm": 7.279801368713379, + "learning_rate": 4.984942927526486e-06, + "loss": 0.1695, + "step": 19994 + }, + { + "epoch": 0.5059847660500544, + "grad_norm": 12.34978199005127, + "learning_rate": 4.984541406856594e-06, + "loss": 0.271, + "step": 19995 + }, + { + "epoch": 0.5060100716147481, + "grad_norm": 5.147336483001709, + "learning_rate": 4.9841398862863915e-06, + "loss": 0.1793, + "step": 19996 + }, + { + "epoch": 0.5060353771794418, + "grad_norm": 3.9962353706359863, + "learning_rate": 4.9837383658184705e-06, + "loss": 0.1883, + "step": 19997 + }, + { + "epoch": 0.5060606827441354, + "grad_norm": 7.1367034912109375, + "learning_rate": 4.983336845455416e-06, + "loss": 0.164, + "step": 19998 + }, + { + "epoch": 0.5060859883088291, + "grad_norm": 3.616950750350952, + "learning_rate": 4.982935325199817e-06, + "loss": 0.1559, + "step": 19999 + }, + { + "epoch": 0.5061112938735228, + "grad_norm": 12.496512413024902, + "learning_rate": 4.982533805054267e-06, + "loss": 0.1999, + "step": 20000 + }, + { + "epoch": 0.5061365994382164, + "grad_norm": 8.683965682983398, + "learning_rate": 4.982132285021355e-06, + "loss": 0.1684, + "step": 20001 + }, + { + "epoch": 0.5061619050029101, + "grad_norm": 5.480543613433838, + "learning_rate": 4.9817307651036664e-06, + "loss": 0.1669, + "step": 20002 + }, + { + "epoch": 0.5061872105676039, + "grad_norm": 3.6973698139190674, + "learning_rate": 4.981329245303792e-06, + "loss": 0.1728, + "step": 20003 + }, + { + "epoch": 0.5062125161322975, + "grad_norm": 12.952139854431152, + "learning_rate": 4.980927725624322e-06, + "loss": 0.3011, + "step": 20004 + }, + { + "epoch": 0.5062378216969912, + "grad_norm": 5.494223594665527, + "learning_rate": 4.980526206067846e-06, + "loss": 0.1989, + "step": 20005 + }, + { + "epoch": 0.5062631272616849, + "grad_norm": 2.661863327026367, + "learning_rate": 4.980124686636955e-06, + "loss": 0.1346, + "step": 20006 + }, + { + "epoch": 0.5062884328263785, + "grad_norm": 5.1014838218688965, + "learning_rate": 4.979723167334234e-06, + "loss": 0.1751, + "step": 20007 + }, + { + "epoch": 0.5063137383910722, + "grad_norm": 7.211507320404053, + "learning_rate": 4.979321648162274e-06, + "loss": 0.2028, + "step": 20008 + }, + { + "epoch": 0.5063390439557659, + "grad_norm": 2.786470890045166, + "learning_rate": 4.9789201291236655e-06, + "loss": 0.1068, + "step": 20009 + }, + { + "epoch": 0.5063643495204595, + "grad_norm": 3.927182197570801, + "learning_rate": 4.978518610221e-06, + "loss": 0.1869, + "step": 20010 + }, + { + "epoch": 0.5063896550851532, + "grad_norm": 14.758479118347168, + "learning_rate": 4.97811709145686e-06, + "loss": 0.2381, + "step": 20011 + }, + { + "epoch": 0.5064149606498469, + "grad_norm": 4.058109760284424, + "learning_rate": 4.9777155728338395e-06, + "loss": 0.1038, + "step": 20012 + }, + { + "epoch": 0.5064402662145405, + "grad_norm": 8.149090766906738, + "learning_rate": 4.977314054354528e-06, + "loss": 0.2326, + "step": 20013 + }, + { + "epoch": 0.5064655717792342, + "grad_norm": 2.312761068344116, + "learning_rate": 4.976912536021515e-06, + "loss": 0.0834, + "step": 20014 + }, + { + "epoch": 0.506490877343928, + "grad_norm": 4.885862827301025, + "learning_rate": 4.976511017837387e-06, + "loss": 0.1535, + "step": 20015 + }, + { + "epoch": 0.5065161829086217, + "grad_norm": 5.711691379547119, + "learning_rate": 4.976109499804736e-06, + "loss": 0.2305, + "step": 20016 + }, + { + "epoch": 0.5065414884733153, + "grad_norm": 4.470598220825195, + "learning_rate": 4.97570798192615e-06, + "loss": 0.1536, + "step": 20017 + }, + { + "epoch": 0.506566794038009, + "grad_norm": 4.067971706390381, + "learning_rate": 4.975306464204218e-06, + "loss": 0.1524, + "step": 20018 + }, + { + "epoch": 0.5065920996027027, + "grad_norm": 9.27643871307373, + "learning_rate": 4.974904946641531e-06, + "loss": 0.218, + "step": 20019 + }, + { + "epoch": 0.5066174051673963, + "grad_norm": 5.193689346313477, + "learning_rate": 4.974503429240678e-06, + "loss": 0.1803, + "step": 20020 + }, + { + "epoch": 0.50664271073209, + "grad_norm": 6.08894681930542, + "learning_rate": 4.974101912004246e-06, + "loss": 0.206, + "step": 20021 + }, + { + "epoch": 0.5066680162967837, + "grad_norm": 3.214744806289673, + "learning_rate": 4.973700394934826e-06, + "loss": 0.1437, + "step": 20022 + }, + { + "epoch": 0.5066933218614773, + "grad_norm": 8.718857765197754, + "learning_rate": 4.97329887803501e-06, + "loss": 0.2078, + "step": 20023 + }, + { + "epoch": 0.506718627426171, + "grad_norm": 5.1076765060424805, + "learning_rate": 4.972897361307382e-06, + "loss": 0.1367, + "step": 20024 + }, + { + "epoch": 0.5067439329908647, + "grad_norm": 3.832547426223755, + "learning_rate": 4.972495844754534e-06, + "loss": 0.1657, + "step": 20025 + }, + { + "epoch": 0.5067692385555583, + "grad_norm": 4.133912086486816, + "learning_rate": 4.972094328379055e-06, + "loss": 0.1585, + "step": 20026 + }, + { + "epoch": 0.506794544120252, + "grad_norm": 3.9244463443756104, + "learning_rate": 4.971692812183534e-06, + "loss": 0.1504, + "step": 20027 + }, + { + "epoch": 0.5068198496849458, + "grad_norm": 7.913037300109863, + "learning_rate": 4.9712912961705636e-06, + "loss": 0.1899, + "step": 20028 + }, + { + "epoch": 0.5068451552496394, + "grad_norm": 3.5109264850616455, + "learning_rate": 4.970889780342728e-06, + "loss": 0.1738, + "step": 20029 + }, + { + "epoch": 0.5068704608143331, + "grad_norm": 3.4911422729492188, + "learning_rate": 4.97048826470262e-06, + "loss": 0.1485, + "step": 20030 + }, + { + "epoch": 0.5068957663790268, + "grad_norm": 3.139237642288208, + "learning_rate": 4.970086749252825e-06, + "loss": 0.148, + "step": 20031 + }, + { + "epoch": 0.5069210719437204, + "grad_norm": 3.82238507270813, + "learning_rate": 4.96968523399594e-06, + "loss": 0.2021, + "step": 20032 + }, + { + "epoch": 0.5069463775084141, + "grad_norm": 5.5781378746032715, + "learning_rate": 4.969283718934545e-06, + "loss": 0.1553, + "step": 20033 + }, + { + "epoch": 0.5069716830731078, + "grad_norm": 3.9430973529815674, + "learning_rate": 4.968882204071234e-06, + "loss": 0.1075, + "step": 20034 + }, + { + "epoch": 0.5069969886378014, + "grad_norm": 3.5197510719299316, + "learning_rate": 4.9684806894085965e-06, + "loss": 0.1631, + "step": 20035 + }, + { + "epoch": 0.5070222942024951, + "grad_norm": 7.53082799911499, + "learning_rate": 4.968079174949223e-06, + "loss": 0.1696, + "step": 20036 + }, + { + "epoch": 0.5070475997671888, + "grad_norm": 4.105118274688721, + "learning_rate": 4.967677660695699e-06, + "loss": 0.1518, + "step": 20037 + }, + { + "epoch": 0.5070729053318824, + "grad_norm": 3.8568568229675293, + "learning_rate": 4.967276146650616e-06, + "loss": 0.1516, + "step": 20038 + }, + { + "epoch": 0.5070982108965761, + "grad_norm": 6.34763765335083, + "learning_rate": 4.966874632816563e-06, + "loss": 0.2766, + "step": 20039 + }, + { + "epoch": 0.5071235164612699, + "grad_norm": 3.610410690307617, + "learning_rate": 4.96647311919613e-06, + "loss": 0.1734, + "step": 20040 + }, + { + "epoch": 0.5071488220259636, + "grad_norm": 8.100044250488281, + "learning_rate": 4.966071605791905e-06, + "loss": 0.2624, + "step": 20041 + }, + { + "epoch": 0.5071741275906572, + "grad_norm": 7.6088175773620605, + "learning_rate": 4.965670092606477e-06, + "loss": 0.1404, + "step": 20042 + }, + { + "epoch": 0.5071994331553509, + "grad_norm": 4.174457550048828, + "learning_rate": 4.965268579642437e-06, + "loss": 0.193, + "step": 20043 + }, + { + "epoch": 0.5072247387200446, + "grad_norm": 4.964757442474365, + "learning_rate": 4.964867066902375e-06, + "loss": 0.2095, + "step": 20044 + }, + { + "epoch": 0.5072500442847382, + "grad_norm": 3.958383798599243, + "learning_rate": 4.964465554388878e-06, + "loss": 0.1843, + "step": 20045 + }, + { + "epoch": 0.5072753498494319, + "grad_norm": 8.914328575134277, + "learning_rate": 4.964064042104534e-06, + "loss": 0.1234, + "step": 20046 + }, + { + "epoch": 0.5073006554141256, + "grad_norm": 7.440594673156738, + "learning_rate": 4.963662530051937e-06, + "loss": 0.2815, + "step": 20047 + }, + { + "epoch": 0.5073259609788192, + "grad_norm": 5.870859622955322, + "learning_rate": 4.963261018233672e-06, + "loss": 0.1332, + "step": 20048 + }, + { + "epoch": 0.5073512665435129, + "grad_norm": 4.571946144104004, + "learning_rate": 4.9628595066523295e-06, + "loss": 0.1669, + "step": 20049 + }, + { + "epoch": 0.5073765721082066, + "grad_norm": 18.96319580078125, + "learning_rate": 4.962457995310502e-06, + "loss": 0.262, + "step": 20050 + }, + { + "epoch": 0.5074018776729002, + "grad_norm": 11.438728332519531, + "learning_rate": 4.962056484210773e-06, + "loss": 0.3486, + "step": 20051 + }, + { + "epoch": 0.507427183237594, + "grad_norm": 3.5846786499023438, + "learning_rate": 4.961654973355735e-06, + "loss": 0.1581, + "step": 20052 + }, + { + "epoch": 0.5074524888022877, + "grad_norm": 4.581009387969971, + "learning_rate": 4.961253462747976e-06, + "loss": 0.1136, + "step": 20053 + }, + { + "epoch": 0.5074777943669813, + "grad_norm": 5.3585357666015625, + "learning_rate": 4.96085195239009e-06, + "loss": 0.1111, + "step": 20054 + }, + { + "epoch": 0.507503099931675, + "grad_norm": 6.281253814697266, + "learning_rate": 4.96045044228466e-06, + "loss": 0.241, + "step": 20055 + }, + { + "epoch": 0.5075284054963687, + "grad_norm": 5.219147682189941, + "learning_rate": 4.960048932434276e-06, + "loss": 0.1673, + "step": 20056 + }, + { + "epoch": 0.5075537110610623, + "grad_norm": 8.263693809509277, + "learning_rate": 4.959647422841531e-06, + "loss": 0.294, + "step": 20057 + }, + { + "epoch": 0.507579016625756, + "grad_norm": 5.933424472808838, + "learning_rate": 4.959245913509012e-06, + "loss": 0.1799, + "step": 20058 + }, + { + "epoch": 0.5076043221904497, + "grad_norm": 2.9750826358795166, + "learning_rate": 4.95884440443931e-06, + "loss": 0.0908, + "step": 20059 + }, + { + "epoch": 0.5076296277551433, + "grad_norm": 11.953577041625977, + "learning_rate": 4.9584428956350105e-06, + "loss": 0.2821, + "step": 20060 + }, + { + "epoch": 0.507654933319837, + "grad_norm": 4.482367992401123, + "learning_rate": 4.958041387098706e-06, + "loss": 0.104, + "step": 20061 + }, + { + "epoch": 0.5076802388845307, + "grad_norm": 3.2739884853363037, + "learning_rate": 4.957639878832984e-06, + "loss": 0.1278, + "step": 20062 + }, + { + "epoch": 0.5077055444492243, + "grad_norm": 3.972012519836426, + "learning_rate": 4.957238370840436e-06, + "loss": 0.1521, + "step": 20063 + }, + { + "epoch": 0.507730850013918, + "grad_norm": 9.760750770568848, + "learning_rate": 4.956836863123649e-06, + "loss": 0.1209, + "step": 20064 + }, + { + "epoch": 0.5077561555786118, + "grad_norm": 4.674233436584473, + "learning_rate": 4.956435355685213e-06, + "loss": 0.1792, + "step": 20065 + }, + { + "epoch": 0.5077814611433055, + "grad_norm": 4.088715553283691, + "learning_rate": 4.956033848527716e-06, + "loss": 0.1271, + "step": 20066 + }, + { + "epoch": 0.5078067667079991, + "grad_norm": 5.106034755706787, + "learning_rate": 4.955632341653752e-06, + "loss": 0.1794, + "step": 20067 + }, + { + "epoch": 0.5078320722726928, + "grad_norm": 4.472594738006592, + "learning_rate": 4.955230835065905e-06, + "loss": 0.1319, + "step": 20068 + }, + { + "epoch": 0.5078573778373865, + "grad_norm": 5.598101615905762, + "learning_rate": 4.954829328766764e-06, + "loss": 0.2469, + "step": 20069 + }, + { + "epoch": 0.5078826834020801, + "grad_norm": 9.997894287109375, + "learning_rate": 4.954427822758922e-06, + "loss": 0.2269, + "step": 20070 + }, + { + "epoch": 0.5079079889667738, + "grad_norm": 4.788760185241699, + "learning_rate": 4.954026317044967e-06, + "loss": 0.1755, + "step": 20071 + }, + { + "epoch": 0.5079332945314675, + "grad_norm": 17.4371395111084, + "learning_rate": 4.953624811627488e-06, + "loss": 0.3285, + "step": 20072 + }, + { + "epoch": 0.5079586000961611, + "grad_norm": 13.93263053894043, + "learning_rate": 4.953223306509073e-06, + "loss": 0.2657, + "step": 20073 + }, + { + "epoch": 0.5079839056608548, + "grad_norm": 10.711832046508789, + "learning_rate": 4.952821801692312e-06, + "loss": 0.1779, + "step": 20074 + }, + { + "epoch": 0.5080092112255485, + "grad_norm": 8.559223175048828, + "learning_rate": 4.952420297179795e-06, + "loss": 0.27, + "step": 20075 + }, + { + "epoch": 0.5080345167902421, + "grad_norm": 7.2261576652526855, + "learning_rate": 4.9520187929741105e-06, + "loss": 0.2056, + "step": 20076 + }, + { + "epoch": 0.5080598223549359, + "grad_norm": 2.7152185440063477, + "learning_rate": 4.951617289077849e-06, + "loss": 0.1217, + "step": 20077 + }, + { + "epoch": 0.5080851279196296, + "grad_norm": 5.9742021560668945, + "learning_rate": 4.951215785493596e-06, + "loss": 0.1869, + "step": 20078 + }, + { + "epoch": 0.5081104334843232, + "grad_norm": 6.856698989868164, + "learning_rate": 4.9508142822239445e-06, + "loss": 0.1557, + "step": 20079 + }, + { + "epoch": 0.5081357390490169, + "grad_norm": 3.7998480796813965, + "learning_rate": 4.9504127792714825e-06, + "loss": 0.1262, + "step": 20080 + }, + { + "epoch": 0.5081610446137106, + "grad_norm": 5.0637664794921875, + "learning_rate": 4.950011276638801e-06, + "loss": 0.1813, + "step": 20081 + }, + { + "epoch": 0.5081863501784042, + "grad_norm": 4.755586624145508, + "learning_rate": 4.949609774328485e-06, + "loss": 0.2115, + "step": 20082 + }, + { + "epoch": 0.5082116557430979, + "grad_norm": 5.671731948852539, + "learning_rate": 4.949208272343127e-06, + "loss": 0.1664, + "step": 20083 + }, + { + "epoch": 0.5082369613077916, + "grad_norm": 6.675322532653809, + "learning_rate": 4.948806770685315e-06, + "loss": 0.1711, + "step": 20084 + }, + { + "epoch": 0.5082622668724852, + "grad_norm": 4.36180305480957, + "learning_rate": 4.948405269357641e-06, + "loss": 0.185, + "step": 20085 + }, + { + "epoch": 0.5082875724371789, + "grad_norm": 13.496471405029297, + "learning_rate": 4.948003768362689e-06, + "loss": 0.107, + "step": 20086 + }, + { + "epoch": 0.5083128780018726, + "grad_norm": 3.8151638507843018, + "learning_rate": 4.947602267703052e-06, + "loss": 0.1572, + "step": 20087 + }, + { + "epoch": 0.5083381835665662, + "grad_norm": 17.065038681030273, + "learning_rate": 4.947200767381318e-06, + "loss": 0.2373, + "step": 20088 + }, + { + "epoch": 0.50836348913126, + "grad_norm": 10.706718444824219, + "learning_rate": 4.946799267400078e-06, + "loss": 0.2602, + "step": 20089 + }, + { + "epoch": 0.5083887946959537, + "grad_norm": 5.957635402679443, + "learning_rate": 4.946397767761918e-06, + "loss": 0.1585, + "step": 20090 + }, + { + "epoch": 0.5084141002606473, + "grad_norm": 9.182896614074707, + "learning_rate": 4.945996268469428e-06, + "loss": 0.3093, + "step": 20091 + }, + { + "epoch": 0.508439405825341, + "grad_norm": 3.307859420776367, + "learning_rate": 4.945594769525199e-06, + "loss": 0.0934, + "step": 20092 + }, + { + "epoch": 0.5084647113900347, + "grad_norm": 15.850785255432129, + "learning_rate": 4.945193270931819e-06, + "loss": 0.1752, + "step": 20093 + }, + { + "epoch": 0.5084900169547284, + "grad_norm": 5.35255765914917, + "learning_rate": 4.94479177269188e-06, + "loss": 0.1302, + "step": 20094 + }, + { + "epoch": 0.508515322519422, + "grad_norm": 8.439652442932129, + "learning_rate": 4.944390274807965e-06, + "loss": 0.2265, + "step": 20095 + }, + { + "epoch": 0.5085406280841157, + "grad_norm": 3.492594003677368, + "learning_rate": 4.943988777282668e-06, + "loss": 0.1431, + "step": 20096 + }, + { + "epoch": 0.5085659336488094, + "grad_norm": 9.799094200134277, + "learning_rate": 4.943587280118577e-06, + "loss": 0.1647, + "step": 20097 + }, + { + "epoch": 0.508591239213503, + "grad_norm": 3.3488993644714355, + "learning_rate": 4.943185783318283e-06, + "loss": 0.1346, + "step": 20098 + }, + { + "epoch": 0.5086165447781967, + "grad_norm": 25.148822784423828, + "learning_rate": 4.9427842868843705e-06, + "loss": 0.2967, + "step": 20099 + }, + { + "epoch": 0.5086418503428904, + "grad_norm": 4.669923305511475, + "learning_rate": 4.942382790819432e-06, + "loss": 0.165, + "step": 20100 + }, + { + "epoch": 0.508667155907584, + "grad_norm": 4.5381903648376465, + "learning_rate": 4.941981295126058e-06, + "loss": 0.1431, + "step": 20101 + }, + { + "epoch": 0.5086924614722778, + "grad_norm": 3.465348958969116, + "learning_rate": 4.941579799806834e-06, + "loss": 0.1916, + "step": 20102 + }, + { + "epoch": 0.5087177670369715, + "grad_norm": 4.1521382331848145, + "learning_rate": 4.941178304864352e-06, + "loss": 0.1435, + "step": 20103 + }, + { + "epoch": 0.5087430726016651, + "grad_norm": 4.412259578704834, + "learning_rate": 4.940776810301199e-06, + "loss": 0.1447, + "step": 20104 + }, + { + "epoch": 0.5087683781663588, + "grad_norm": 3.6155104637145996, + "learning_rate": 4.940375316119966e-06, + "loss": 0.2041, + "step": 20105 + }, + { + "epoch": 0.5087936837310525, + "grad_norm": 7.0461602210998535, + "learning_rate": 4.939973822323242e-06, + "loss": 0.206, + "step": 20106 + }, + { + "epoch": 0.5088189892957461, + "grad_norm": 4.312645435333252, + "learning_rate": 4.939572328913615e-06, + "loss": 0.125, + "step": 20107 + }, + { + "epoch": 0.5088442948604398, + "grad_norm": 5.531414985656738, + "learning_rate": 4.939170835893676e-06, + "loss": 0.1907, + "step": 20108 + }, + { + "epoch": 0.5088696004251335, + "grad_norm": 1.9970791339874268, + "learning_rate": 4.938769343266012e-06, + "loss": 0.0523, + "step": 20109 + }, + { + "epoch": 0.5088949059898271, + "grad_norm": 7.757147789001465, + "learning_rate": 4.9383678510332126e-06, + "loss": 0.1867, + "step": 20110 + }, + { + "epoch": 0.5089202115545208, + "grad_norm": 4.288155555725098, + "learning_rate": 4.937966359197869e-06, + "loss": 0.1463, + "step": 20111 + }, + { + "epoch": 0.5089455171192145, + "grad_norm": 6.249051094055176, + "learning_rate": 4.937564867762567e-06, + "loss": 0.1525, + "step": 20112 + }, + { + "epoch": 0.5089708226839081, + "grad_norm": 2.9088590145111084, + "learning_rate": 4.9371633767298985e-06, + "loss": 0.1326, + "step": 20113 + }, + { + "epoch": 0.5089961282486019, + "grad_norm": 3.324542284011841, + "learning_rate": 4.936761886102451e-06, + "loss": 0.107, + "step": 20114 + }, + { + "epoch": 0.5090214338132956, + "grad_norm": 5.4813232421875, + "learning_rate": 4.9363603958828145e-06, + "loss": 0.1514, + "step": 20115 + }, + { + "epoch": 0.5090467393779892, + "grad_norm": 8.20938777923584, + "learning_rate": 4.93595890607358e-06, + "loss": 0.1745, + "step": 20116 + }, + { + "epoch": 0.5090720449426829, + "grad_norm": 2.6228225231170654, + "learning_rate": 4.935557416677332e-06, + "loss": 0.113, + "step": 20117 + }, + { + "epoch": 0.5090973505073766, + "grad_norm": 4.397796630859375, + "learning_rate": 4.935155927696662e-06, + "loss": 0.1648, + "step": 20118 + }, + { + "epoch": 0.5091226560720703, + "grad_norm": 5.428017616271973, + "learning_rate": 4.934754439134161e-06, + "loss": 0.1226, + "step": 20119 + }, + { + "epoch": 0.5091479616367639, + "grad_norm": 3.7845633029937744, + "learning_rate": 4.934352950992418e-06, + "loss": 0.1433, + "step": 20120 + }, + { + "epoch": 0.5091732672014576, + "grad_norm": 5.574084758758545, + "learning_rate": 4.933951463274018e-06, + "loss": 0.1607, + "step": 20121 + }, + { + "epoch": 0.5091985727661513, + "grad_norm": 7.999923229217529, + "learning_rate": 4.933549975981553e-06, + "loss": 0.1618, + "step": 20122 + }, + { + "epoch": 0.5092238783308449, + "grad_norm": 4.199108123779297, + "learning_rate": 4.933148489117612e-06, + "loss": 0.1454, + "step": 20123 + }, + { + "epoch": 0.5092491838955386, + "grad_norm": 6.082958221435547, + "learning_rate": 4.932747002684782e-06, + "loss": 0.2191, + "step": 20124 + }, + { + "epoch": 0.5092744894602323, + "grad_norm": 10.942471504211426, + "learning_rate": 4.9323455166856585e-06, + "loss": 0.3484, + "step": 20125 + }, + { + "epoch": 0.509299795024926, + "grad_norm": 7.022210597991943, + "learning_rate": 4.9319440311228235e-06, + "loss": 0.1765, + "step": 20126 + }, + { + "epoch": 0.5093251005896197, + "grad_norm": 4.546924591064453, + "learning_rate": 4.9315425459988685e-06, + "loss": 0.133, + "step": 20127 + }, + { + "epoch": 0.5093504061543134, + "grad_norm": 3.1008119583129883, + "learning_rate": 4.931141061316383e-06, + "loss": 0.1352, + "step": 20128 + }, + { + "epoch": 0.509375711719007, + "grad_norm": 3.4920034408569336, + "learning_rate": 4.930739577077958e-06, + "loss": 0.1051, + "step": 20129 + }, + { + "epoch": 0.5094010172837007, + "grad_norm": 3.124664068222046, + "learning_rate": 4.930338093286179e-06, + "loss": 0.1613, + "step": 20130 + }, + { + "epoch": 0.5094263228483944, + "grad_norm": 6.459384918212891, + "learning_rate": 4.929936609943636e-06, + "loss": 0.2069, + "step": 20131 + }, + { + "epoch": 0.509451628413088, + "grad_norm": 13.533038139343262, + "learning_rate": 4.929535127052921e-06, + "loss": 0.2435, + "step": 20132 + }, + { + "epoch": 0.5094769339777817, + "grad_norm": 7.209192752838135, + "learning_rate": 4.929133644616618e-06, + "loss": 0.1713, + "step": 20133 + }, + { + "epoch": 0.5095022395424754, + "grad_norm": 5.5350446701049805, + "learning_rate": 4.9287321626373215e-06, + "loss": 0.2101, + "step": 20134 + }, + { + "epoch": 0.509527545107169, + "grad_norm": 6.171848297119141, + "learning_rate": 4.928330681117617e-06, + "loss": 0.1465, + "step": 20135 + }, + { + "epoch": 0.5095528506718627, + "grad_norm": 3.377854585647583, + "learning_rate": 4.927929200060094e-06, + "loss": 0.1141, + "step": 20136 + }, + { + "epoch": 0.5095781562365564, + "grad_norm": 4.375327110290527, + "learning_rate": 4.927527719467343e-06, + "loss": 0.1437, + "step": 20137 + }, + { + "epoch": 0.50960346180125, + "grad_norm": 10.630765914916992, + "learning_rate": 4.927126239341953e-06, + "loss": 0.1077, + "step": 20138 + }, + { + "epoch": 0.5096287673659438, + "grad_norm": 7.817087650299072, + "learning_rate": 4.926724759686511e-06, + "loss": 0.1825, + "step": 20139 + }, + { + "epoch": 0.5096540729306375, + "grad_norm": 13.05970573425293, + "learning_rate": 4.926323280503607e-06, + "loss": 0.1659, + "step": 20140 + }, + { + "epoch": 0.5096793784953311, + "grad_norm": 7.111354827880859, + "learning_rate": 4.92592180179583e-06, + "loss": 0.1993, + "step": 20141 + }, + { + "epoch": 0.5097046840600248, + "grad_norm": 5.925235748291016, + "learning_rate": 4.925520323565774e-06, + "loss": 0.2068, + "step": 20142 + }, + { + "epoch": 0.5097299896247185, + "grad_norm": 14.944782257080078, + "learning_rate": 4.925118845816019e-06, + "loss": 0.2335, + "step": 20143 + }, + { + "epoch": 0.5097552951894122, + "grad_norm": 4.185845375061035, + "learning_rate": 4.924717368549159e-06, + "loss": 0.1239, + "step": 20144 + }, + { + "epoch": 0.5097806007541058, + "grad_norm": 4.15494966506958, + "learning_rate": 4.9243158917677834e-06, + "loss": 0.1483, + "step": 20145 + }, + { + "epoch": 0.5098059063187995, + "grad_norm": 8.240348815917969, + "learning_rate": 4.92391441547448e-06, + "loss": 0.1749, + "step": 20146 + }, + { + "epoch": 0.5098312118834932, + "grad_norm": 5.674215793609619, + "learning_rate": 4.92351293967184e-06, + "loss": 0.1973, + "step": 20147 + }, + { + "epoch": 0.5098565174481868, + "grad_norm": 7.3280348777771, + "learning_rate": 4.92311146436245e-06, + "loss": 0.2297, + "step": 20148 + }, + { + "epoch": 0.5098818230128805, + "grad_norm": 10.460456848144531, + "learning_rate": 4.922709989548899e-06, + "loss": 0.2056, + "step": 20149 + }, + { + "epoch": 0.5099071285775743, + "grad_norm": 9.888812065124512, + "learning_rate": 4.922308515233778e-06, + "loss": 0.2332, + "step": 20150 + }, + { + "epoch": 0.5099324341422679, + "grad_norm": 10.068729400634766, + "learning_rate": 4.921907041419676e-06, + "loss": 0.2257, + "step": 20151 + }, + { + "epoch": 0.5099577397069616, + "grad_norm": 6.502787113189697, + "learning_rate": 4.921505568109179e-06, + "loss": 0.1749, + "step": 20152 + }, + { + "epoch": 0.5099830452716553, + "grad_norm": 6.406199932098389, + "learning_rate": 4.921104095304878e-06, + "loss": 0.21, + "step": 20153 + }, + { + "epoch": 0.5100083508363489, + "grad_norm": 2.2999536991119385, + "learning_rate": 4.9207026230093615e-06, + "loss": 0.0944, + "step": 20154 + }, + { + "epoch": 0.5100336564010426, + "grad_norm": 6.370014190673828, + "learning_rate": 4.92030115122522e-06, + "loss": 0.1759, + "step": 20155 + }, + { + "epoch": 0.5100589619657363, + "grad_norm": 4.730918884277344, + "learning_rate": 4.9198996799550435e-06, + "loss": 0.1774, + "step": 20156 + }, + { + "epoch": 0.5100842675304299, + "grad_norm": 3.276318073272705, + "learning_rate": 4.919498209201417e-06, + "loss": 0.092, + "step": 20157 + }, + { + "epoch": 0.5101095730951236, + "grad_norm": 3.0532915592193604, + "learning_rate": 4.919096738966931e-06, + "loss": 0.1417, + "step": 20158 + }, + { + "epoch": 0.5101348786598173, + "grad_norm": 6.746906757354736, + "learning_rate": 4.9186952692541765e-06, + "loss": 0.1733, + "step": 20159 + }, + { + "epoch": 0.5101601842245109, + "grad_norm": 3.320305585861206, + "learning_rate": 4.918293800065741e-06, + "loss": 0.0972, + "step": 20160 + }, + { + "epoch": 0.5101854897892046, + "grad_norm": 3.9157731533050537, + "learning_rate": 4.917892331404213e-06, + "loss": 0.1518, + "step": 20161 + }, + { + "epoch": 0.5102107953538983, + "grad_norm": 7.386026859283447, + "learning_rate": 4.917490863272182e-06, + "loss": 0.1443, + "step": 20162 + }, + { + "epoch": 0.510236100918592, + "grad_norm": 4.93474006652832, + "learning_rate": 4.917089395672236e-06, + "loss": 0.2241, + "step": 20163 + }, + { + "epoch": 0.5102614064832857, + "grad_norm": 6.894778251647949, + "learning_rate": 4.916687928606968e-06, + "loss": 0.2258, + "step": 20164 + }, + { + "epoch": 0.5102867120479794, + "grad_norm": 3.2112247943878174, + "learning_rate": 4.916286462078961e-06, + "loss": 0.149, + "step": 20165 + }, + { + "epoch": 0.510312017612673, + "grad_norm": 7.387442111968994, + "learning_rate": 4.9158849960908096e-06, + "loss": 0.2015, + "step": 20166 + }, + { + "epoch": 0.5103373231773667, + "grad_norm": 5.489048004150391, + "learning_rate": 4.9154835306450986e-06, + "loss": 0.2022, + "step": 20167 + }, + { + "epoch": 0.5103626287420604, + "grad_norm": 8.419233322143555, + "learning_rate": 4.9150820657444185e-06, + "loss": 0.1437, + "step": 20168 + }, + { + "epoch": 0.5103879343067541, + "grad_norm": 8.57750415802002, + "learning_rate": 4.91468060139136e-06, + "loss": 0.2311, + "step": 20169 + }, + { + "epoch": 0.5104132398714477, + "grad_norm": 3.207251787185669, + "learning_rate": 4.914279137588509e-06, + "loss": 0.1584, + "step": 20170 + }, + { + "epoch": 0.5104385454361414, + "grad_norm": 3.4137673377990723, + "learning_rate": 4.9138776743384555e-06, + "loss": 0.1581, + "step": 20171 + }, + { + "epoch": 0.5104638510008351, + "grad_norm": 4.469197750091553, + "learning_rate": 4.913476211643789e-06, + "loss": 0.1673, + "step": 20172 + }, + { + "epoch": 0.5104891565655287, + "grad_norm": 4.108676433563232, + "learning_rate": 4.9130747495071e-06, + "loss": 0.1275, + "step": 20173 + }, + { + "epoch": 0.5105144621302224, + "grad_norm": 4.507288932800293, + "learning_rate": 4.9126732879309745e-06, + "loss": 0.1448, + "step": 20174 + }, + { + "epoch": 0.5105397676949162, + "grad_norm": 4.341804027557373, + "learning_rate": 4.912271826918002e-06, + "loss": 0.1377, + "step": 20175 + }, + { + "epoch": 0.5105650732596098, + "grad_norm": 3.3186216354370117, + "learning_rate": 4.911870366470773e-06, + "loss": 0.1172, + "step": 20176 + }, + { + "epoch": 0.5105903788243035, + "grad_norm": 6.343328952789307, + "learning_rate": 4.911468906591874e-06, + "loss": 0.1244, + "step": 20177 + }, + { + "epoch": 0.5106156843889972, + "grad_norm": 7.785675525665283, + "learning_rate": 4.9110674472839e-06, + "loss": 0.1601, + "step": 20178 + }, + { + "epoch": 0.5106409899536908, + "grad_norm": 5.254014492034912, + "learning_rate": 4.9106659885494314e-06, + "loss": 0.1589, + "step": 20179 + }, + { + "epoch": 0.5106662955183845, + "grad_norm": 9.765363693237305, + "learning_rate": 4.9102645303910625e-06, + "loss": 0.1807, + "step": 20180 + }, + { + "epoch": 0.5106916010830782, + "grad_norm": 4.323225498199463, + "learning_rate": 4.90986307281138e-06, + "loss": 0.0906, + "step": 20181 + }, + { + "epoch": 0.5107169066477718, + "grad_norm": 3.4086599349975586, + "learning_rate": 4.909461615812977e-06, + "loss": 0.0665, + "step": 20182 + }, + { + "epoch": 0.5107422122124655, + "grad_norm": 3.1015493869781494, + "learning_rate": 4.909060159398436e-06, + "loss": 0.1338, + "step": 20183 + }, + { + "epoch": 0.5107675177771592, + "grad_norm": 2.5136919021606445, + "learning_rate": 4.9086587035703496e-06, + "loss": 0.0668, + "step": 20184 + }, + { + "epoch": 0.5107928233418528, + "grad_norm": 5.091757297515869, + "learning_rate": 4.908257248331306e-06, + "loss": 0.2692, + "step": 20185 + }, + { + "epoch": 0.5108181289065465, + "grad_norm": 12.040716171264648, + "learning_rate": 4.907855793683895e-06, + "loss": 0.2102, + "step": 20186 + }, + { + "epoch": 0.5108434344712403, + "grad_norm": 7.165836334228516, + "learning_rate": 4.9074543396307055e-06, + "loss": 0.2543, + "step": 20187 + }, + { + "epoch": 0.5108687400359339, + "grad_norm": 3.5056371688842773, + "learning_rate": 4.907052886174325e-06, + "loss": 0.1603, + "step": 20188 + }, + { + "epoch": 0.5108940456006276, + "grad_norm": 6.4118170738220215, + "learning_rate": 4.906651433317343e-06, + "loss": 0.1117, + "step": 20189 + }, + { + "epoch": 0.5109193511653213, + "grad_norm": 3.644549608230591, + "learning_rate": 4.906249981062349e-06, + "loss": 0.1346, + "step": 20190 + }, + { + "epoch": 0.5109446567300149, + "grad_norm": 3.6968722343444824, + "learning_rate": 4.9058485294119315e-06, + "loss": 0.1017, + "step": 20191 + }, + { + "epoch": 0.5109699622947086, + "grad_norm": 10.017186164855957, + "learning_rate": 4.905447078368678e-06, + "loss": 0.2565, + "step": 20192 + }, + { + "epoch": 0.5109952678594023, + "grad_norm": 6.582958698272705, + "learning_rate": 4.905045627935181e-06, + "loss": 0.1673, + "step": 20193 + }, + { + "epoch": 0.511020573424096, + "grad_norm": 3.840730667114258, + "learning_rate": 4.9046441781140254e-06, + "loss": 0.1542, + "step": 20194 + }, + { + "epoch": 0.5110458789887896, + "grad_norm": 3.489356279373169, + "learning_rate": 4.904242728907803e-06, + "loss": 0.1858, + "step": 20195 + }, + { + "epoch": 0.5110711845534833, + "grad_norm": 8.802046775817871, + "learning_rate": 4.9038412803191e-06, + "loss": 0.2002, + "step": 20196 + }, + { + "epoch": 0.511096490118177, + "grad_norm": 3.08544659614563, + "learning_rate": 4.903439832350507e-06, + "loss": 0.155, + "step": 20197 + }, + { + "epoch": 0.5111217956828706, + "grad_norm": 3.934889078140259, + "learning_rate": 4.903038385004612e-06, + "loss": 0.1075, + "step": 20198 + }, + { + "epoch": 0.5111471012475644, + "grad_norm": 3.7886464595794678, + "learning_rate": 4.902636938284005e-06, + "loss": 0.1247, + "step": 20199 + }, + { + "epoch": 0.5111724068122581, + "grad_norm": 9.33775520324707, + "learning_rate": 4.902235492191277e-06, + "loss": 0.1509, + "step": 20200 + }, + { + "epoch": 0.5111977123769517, + "grad_norm": 3.674858808517456, + "learning_rate": 4.901834046729011e-06, + "loss": 0.1068, + "step": 20201 + }, + { + "epoch": 0.5112230179416454, + "grad_norm": 4.848153591156006, + "learning_rate": 4.901432601899799e-06, + "loss": 0.1952, + "step": 20202 + }, + { + "epoch": 0.5112483235063391, + "grad_norm": 5.913776874542236, + "learning_rate": 4.901031157706229e-06, + "loss": 0.1185, + "step": 20203 + }, + { + "epoch": 0.5112736290710327, + "grad_norm": 7.171282768249512, + "learning_rate": 4.900629714150895e-06, + "loss": 0.234, + "step": 20204 + }, + { + "epoch": 0.5112989346357264, + "grad_norm": 3.6428215503692627, + "learning_rate": 4.900228271236377e-06, + "loss": 0.1713, + "step": 20205 + }, + { + "epoch": 0.5113242402004201, + "grad_norm": 3.6412229537963867, + "learning_rate": 4.89982682896527e-06, + "loss": 0.1779, + "step": 20206 + }, + { + "epoch": 0.5113495457651137, + "grad_norm": 4.491466045379639, + "learning_rate": 4.899425387340161e-06, + "loss": 0.1438, + "step": 20207 + }, + { + "epoch": 0.5113748513298074, + "grad_norm": 9.02893352508545, + "learning_rate": 4.899023946363638e-06, + "loss": 0.2842, + "step": 20208 + }, + { + "epoch": 0.5114001568945011, + "grad_norm": 3.2830240726470947, + "learning_rate": 4.898622506038294e-06, + "loss": 0.1416, + "step": 20209 + }, + { + "epoch": 0.5114254624591947, + "grad_norm": 10.022771835327148, + "learning_rate": 4.898221066366712e-06, + "loss": 0.3136, + "step": 20210 + }, + { + "epoch": 0.5114507680238884, + "grad_norm": 5.0247578620910645, + "learning_rate": 4.897819627351482e-06, + "loss": 0.1747, + "step": 20211 + }, + { + "epoch": 0.5114760735885822, + "grad_norm": 6.0932936668396, + "learning_rate": 4.897418188995196e-06, + "loss": 0.1795, + "step": 20212 + }, + { + "epoch": 0.5115013791532758, + "grad_norm": 2.5118212699890137, + "learning_rate": 4.897016751300443e-06, + "loss": 0.0942, + "step": 20213 + }, + { + "epoch": 0.5115266847179695, + "grad_norm": 3.9715418815612793, + "learning_rate": 4.896615314269807e-06, + "loss": 0.1813, + "step": 20214 + }, + { + "epoch": 0.5115519902826632, + "grad_norm": 7.7273850440979, + "learning_rate": 4.896213877905881e-06, + "loss": 0.2122, + "step": 20215 + }, + { + "epoch": 0.5115772958473568, + "grad_norm": 8.511679649353027, + "learning_rate": 4.895812442211251e-06, + "loss": 0.2307, + "step": 20216 + }, + { + "epoch": 0.5116026014120505, + "grad_norm": 7.178110599517822, + "learning_rate": 4.89541100718851e-06, + "loss": 0.2011, + "step": 20217 + }, + { + "epoch": 0.5116279069767442, + "grad_norm": 3.7727584838867188, + "learning_rate": 4.895009572840241e-06, + "loss": 0.144, + "step": 20218 + }, + { + "epoch": 0.5116532125414378, + "grad_norm": 3.2396140098571777, + "learning_rate": 4.894608139169036e-06, + "loss": 0.1758, + "step": 20219 + }, + { + "epoch": 0.5116785181061315, + "grad_norm": 3.44209623336792, + "learning_rate": 4.894206706177485e-06, + "loss": 0.1486, + "step": 20220 + }, + { + "epoch": 0.5117038236708252, + "grad_norm": 6.8284831047058105, + "learning_rate": 4.893805273868174e-06, + "loss": 0.1411, + "step": 20221 + }, + { + "epoch": 0.5117291292355189, + "grad_norm": 4.135682582855225, + "learning_rate": 4.8934038422436945e-06, + "loss": 0.1303, + "step": 20222 + }, + { + "epoch": 0.5117544348002125, + "grad_norm": 3.830108165740967, + "learning_rate": 4.8930024113066335e-06, + "loss": 0.1626, + "step": 20223 + }, + { + "epoch": 0.5117797403649063, + "grad_norm": 2.002025604248047, + "learning_rate": 4.892600981059579e-06, + "loss": 0.0592, + "step": 20224 + }, + { + "epoch": 0.5118050459296, + "grad_norm": 7.322171211242676, + "learning_rate": 4.892199551505121e-06, + "loss": 0.1616, + "step": 20225 + }, + { + "epoch": 0.5118303514942936, + "grad_norm": 4.720302104949951, + "learning_rate": 4.8917981226458475e-06, + "loss": 0.1603, + "step": 20226 + }, + { + "epoch": 0.5118556570589873, + "grad_norm": 6.593996524810791, + "learning_rate": 4.891396694484349e-06, + "loss": 0.1218, + "step": 20227 + }, + { + "epoch": 0.511880962623681, + "grad_norm": 4.912906169891357, + "learning_rate": 4.890995267023212e-06, + "loss": 0.144, + "step": 20228 + }, + { + "epoch": 0.5119062681883746, + "grad_norm": 8.725028038024902, + "learning_rate": 4.890593840265026e-06, + "loss": 0.2106, + "step": 20229 + }, + { + "epoch": 0.5119315737530683, + "grad_norm": 4.904418468475342, + "learning_rate": 4.890192414212382e-06, + "loss": 0.195, + "step": 20230 + }, + { + "epoch": 0.511956879317762, + "grad_norm": 4.738393783569336, + "learning_rate": 4.889790988867867e-06, + "loss": 0.2176, + "step": 20231 + }, + { + "epoch": 0.5119821848824556, + "grad_norm": 3.6662302017211914, + "learning_rate": 4.8893895642340665e-06, + "loss": 0.1096, + "step": 20232 + }, + { + "epoch": 0.5120074904471493, + "grad_norm": 3.8649239540100098, + "learning_rate": 4.888988140313573e-06, + "loss": 0.1537, + "step": 20233 + }, + { + "epoch": 0.512032796011843, + "grad_norm": 4.635588645935059, + "learning_rate": 4.888586717108974e-06, + "loss": 0.1191, + "step": 20234 + }, + { + "epoch": 0.5120581015765366, + "grad_norm": 7.935494899749756, + "learning_rate": 4.888185294622862e-06, + "loss": 0.2365, + "step": 20235 + }, + { + "epoch": 0.5120834071412304, + "grad_norm": 3.399656057357788, + "learning_rate": 4.887783872857819e-06, + "loss": 0.1498, + "step": 20236 + }, + { + "epoch": 0.5121087127059241, + "grad_norm": 8.192974090576172, + "learning_rate": 4.887382451816436e-06, + "loss": 0.1938, + "step": 20237 + }, + { + "epoch": 0.5121340182706177, + "grad_norm": 4.2270588874816895, + "learning_rate": 4.886981031501304e-06, + "loss": 0.1319, + "step": 20238 + }, + { + "epoch": 0.5121593238353114, + "grad_norm": 14.117929458618164, + "learning_rate": 4.886579611915012e-06, + "loss": 0.2651, + "step": 20239 + }, + { + "epoch": 0.5121846294000051, + "grad_norm": 4.278668403625488, + "learning_rate": 4.886178193060145e-06, + "loss": 0.104, + "step": 20240 + }, + { + "epoch": 0.5122099349646987, + "grad_norm": 3.766425609588623, + "learning_rate": 4.885776774939294e-06, + "loss": 0.1764, + "step": 20241 + }, + { + "epoch": 0.5122352405293924, + "grad_norm": 5.582622051239014, + "learning_rate": 4.8853753575550464e-06, + "loss": 0.2307, + "step": 20242 + }, + { + "epoch": 0.5122605460940861, + "grad_norm": 4.863797187805176, + "learning_rate": 4.884973940909993e-06, + "loss": 0.192, + "step": 20243 + }, + { + "epoch": 0.5122858516587797, + "grad_norm": 4.589057922363281, + "learning_rate": 4.884572525006722e-06, + "loss": 0.1903, + "step": 20244 + }, + { + "epoch": 0.5123111572234734, + "grad_norm": 3.373194456100464, + "learning_rate": 4.88417110984782e-06, + "loss": 0.1374, + "step": 20245 + }, + { + "epoch": 0.5123364627881671, + "grad_norm": 4.773448467254639, + "learning_rate": 4.883769695435877e-06, + "loss": 0.134, + "step": 20246 + }, + { + "epoch": 0.5123617683528608, + "grad_norm": 8.16474723815918, + "learning_rate": 4.883368281773483e-06, + "loss": 0.1961, + "step": 20247 + }, + { + "epoch": 0.5123870739175544, + "grad_norm": 12.72130298614502, + "learning_rate": 4.882966868863224e-06, + "loss": 0.2022, + "step": 20248 + }, + { + "epoch": 0.5124123794822482, + "grad_norm": 5.58373498916626, + "learning_rate": 4.88256545670769e-06, + "loss": 0.2584, + "step": 20249 + }, + { + "epoch": 0.5124376850469419, + "grad_norm": 8.680912017822266, + "learning_rate": 4.882164045309469e-06, + "loss": 0.1821, + "step": 20250 + }, + { + "epoch": 0.5124629906116355, + "grad_norm": 8.13988208770752, + "learning_rate": 4.881762634671151e-06, + "loss": 0.1901, + "step": 20251 + }, + { + "epoch": 0.5124882961763292, + "grad_norm": 2.5427629947662354, + "learning_rate": 4.881361224795324e-06, + "loss": 0.134, + "step": 20252 + }, + { + "epoch": 0.5125136017410229, + "grad_norm": 16.805339813232422, + "learning_rate": 4.880959815684576e-06, + "loss": 0.3297, + "step": 20253 + }, + { + "epoch": 0.5125389073057165, + "grad_norm": 5.0537333488464355, + "learning_rate": 4.8805584073414965e-06, + "loss": 0.1524, + "step": 20254 + }, + { + "epoch": 0.5125642128704102, + "grad_norm": 13.195194244384766, + "learning_rate": 4.880156999768673e-06, + "loss": 0.1891, + "step": 20255 + }, + { + "epoch": 0.5125895184351039, + "grad_norm": 3.46315598487854, + "learning_rate": 4.879755592968694e-06, + "loss": 0.1538, + "step": 20256 + }, + { + "epoch": 0.5126148239997975, + "grad_norm": 2.532533884048462, + "learning_rate": 4.879354186944151e-06, + "loss": 0.1065, + "step": 20257 + }, + { + "epoch": 0.5126401295644912, + "grad_norm": 4.4442853927612305, + "learning_rate": 4.878952781697628e-06, + "loss": 0.1833, + "step": 20258 + }, + { + "epoch": 0.5126654351291849, + "grad_norm": 5.778873920440674, + "learning_rate": 4.878551377231717e-06, + "loss": 0.1531, + "step": 20259 + }, + { + "epoch": 0.5126907406938785, + "grad_norm": 2.985349178314209, + "learning_rate": 4.878149973549005e-06, + "loss": 0.0929, + "step": 20260 + }, + { + "epoch": 0.5127160462585723, + "grad_norm": 4.130624771118164, + "learning_rate": 4.877748570652081e-06, + "loss": 0.1838, + "step": 20261 + }, + { + "epoch": 0.512741351823266, + "grad_norm": 2.677651882171631, + "learning_rate": 4.877347168543537e-06, + "loss": 0.1313, + "step": 20262 + }, + { + "epoch": 0.5127666573879596, + "grad_norm": 2.863407611846924, + "learning_rate": 4.876945767225955e-06, + "loss": 0.1442, + "step": 20263 + }, + { + "epoch": 0.5127919629526533, + "grad_norm": 3.2257401943206787, + "learning_rate": 4.876544366701927e-06, + "loss": 0.1577, + "step": 20264 + }, + { + "epoch": 0.512817268517347, + "grad_norm": 4.805784702301025, + "learning_rate": 4.876142966974041e-06, + "loss": 0.1259, + "step": 20265 + }, + { + "epoch": 0.5128425740820406, + "grad_norm": 3.4588894844055176, + "learning_rate": 4.875741568044889e-06, + "loss": 0.1383, + "step": 20266 + }, + { + "epoch": 0.5128678796467343, + "grad_norm": 4.676058769226074, + "learning_rate": 4.875340169917055e-06, + "loss": 0.1517, + "step": 20267 + }, + { + "epoch": 0.512893185211428, + "grad_norm": 5.006338119506836, + "learning_rate": 4.874938772593127e-06, + "loss": 0.1878, + "step": 20268 + }, + { + "epoch": 0.5129184907761216, + "grad_norm": 9.899970054626465, + "learning_rate": 4.874537376075697e-06, + "loss": 0.2462, + "step": 20269 + }, + { + "epoch": 0.5129437963408153, + "grad_norm": 8.892430305480957, + "learning_rate": 4.874135980367354e-06, + "loss": 0.1796, + "step": 20270 + }, + { + "epoch": 0.512969101905509, + "grad_norm": 3.0214056968688965, + "learning_rate": 4.873734585470683e-06, + "loss": 0.114, + "step": 20271 + }, + { + "epoch": 0.5129944074702028, + "grad_norm": 3.1398098468780518, + "learning_rate": 4.873333191388273e-06, + "loss": 0.109, + "step": 20272 + }, + { + "epoch": 0.5130197130348964, + "grad_norm": 12.679734230041504, + "learning_rate": 4.872931798122714e-06, + "loss": 0.1974, + "step": 20273 + }, + { + "epoch": 0.5130450185995901, + "grad_norm": 3.572071075439453, + "learning_rate": 4.872530405676596e-06, + "loss": 0.154, + "step": 20274 + }, + { + "epoch": 0.5130703241642838, + "grad_norm": 10.632678985595703, + "learning_rate": 4.872129014052505e-06, + "loss": 0.1479, + "step": 20275 + }, + { + "epoch": 0.5130956297289774, + "grad_norm": 5.107700347900391, + "learning_rate": 4.871727623253029e-06, + "loss": 0.1438, + "step": 20276 + }, + { + "epoch": 0.5131209352936711, + "grad_norm": 3.969097852706909, + "learning_rate": 4.871326233280758e-06, + "loss": 0.1458, + "step": 20277 + }, + { + "epoch": 0.5131462408583648, + "grad_norm": 3.447772979736328, + "learning_rate": 4.870924844138282e-06, + "loss": 0.1515, + "step": 20278 + }, + { + "epoch": 0.5131715464230584, + "grad_norm": 3.67106556892395, + "learning_rate": 4.870523455828187e-06, + "loss": 0.1324, + "step": 20279 + }, + { + "epoch": 0.5131968519877521, + "grad_norm": 9.140220642089844, + "learning_rate": 4.870122068353061e-06, + "loss": 0.3183, + "step": 20280 + }, + { + "epoch": 0.5132221575524458, + "grad_norm": 7.597912788391113, + "learning_rate": 4.869720681715495e-06, + "loss": 0.1086, + "step": 20281 + }, + { + "epoch": 0.5132474631171394, + "grad_norm": 7.861850261688232, + "learning_rate": 4.869319295918075e-06, + "loss": 0.1609, + "step": 20282 + }, + { + "epoch": 0.5132727686818331, + "grad_norm": 6.870351791381836, + "learning_rate": 4.86891791096339e-06, + "loss": 0.1772, + "step": 20283 + }, + { + "epoch": 0.5132980742465268, + "grad_norm": 5.89741325378418, + "learning_rate": 4.868516526854031e-06, + "loss": 0.2109, + "step": 20284 + }, + { + "epoch": 0.5133233798112204, + "grad_norm": 15.461812973022461, + "learning_rate": 4.868115143592583e-06, + "loss": 0.2197, + "step": 20285 + }, + { + "epoch": 0.5133486853759142, + "grad_norm": 5.54921293258667, + "learning_rate": 4.867713761181636e-06, + "loss": 0.1079, + "step": 20286 + }, + { + "epoch": 0.5133739909406079, + "grad_norm": 2.2438621520996094, + "learning_rate": 4.867312379623778e-06, + "loss": 0.0795, + "step": 20287 + }, + { + "epoch": 0.5133992965053015, + "grad_norm": 9.975822448730469, + "learning_rate": 4.8669109989216e-06, + "loss": 0.3091, + "step": 20288 + }, + { + "epoch": 0.5134246020699952, + "grad_norm": 15.183774948120117, + "learning_rate": 4.866509619077686e-06, + "loss": 0.2963, + "step": 20289 + }, + { + "epoch": 0.5134499076346889, + "grad_norm": 4.268701553344727, + "learning_rate": 4.866108240094627e-06, + "loss": 0.1506, + "step": 20290 + }, + { + "epoch": 0.5134752131993825, + "grad_norm": 3.180978536605835, + "learning_rate": 4.865706861975011e-06, + "loss": 0.1383, + "step": 20291 + }, + { + "epoch": 0.5135005187640762, + "grad_norm": 7.703734397888184, + "learning_rate": 4.865305484721428e-06, + "loss": 0.1471, + "step": 20292 + }, + { + "epoch": 0.5135258243287699, + "grad_norm": 7.236317157745361, + "learning_rate": 4.864904108336463e-06, + "loss": 0.1287, + "step": 20293 + }, + { + "epoch": 0.5135511298934635, + "grad_norm": 6.7070465087890625, + "learning_rate": 4.864502732822707e-06, + "loss": 0.1353, + "step": 20294 + }, + { + "epoch": 0.5135764354581572, + "grad_norm": 12.906685829162598, + "learning_rate": 4.864101358182747e-06, + "loss": 0.2031, + "step": 20295 + }, + { + "epoch": 0.5136017410228509, + "grad_norm": 8.341971397399902, + "learning_rate": 4.863699984419172e-06, + "loss": 0.19, + "step": 20296 + }, + { + "epoch": 0.5136270465875447, + "grad_norm": 3.271859884262085, + "learning_rate": 4.8632986115345725e-06, + "loss": 0.1333, + "step": 20297 + }, + { + "epoch": 0.5136523521522383, + "grad_norm": 6.614962577819824, + "learning_rate": 4.862897239531533e-06, + "loss": 0.2918, + "step": 20298 + }, + { + "epoch": 0.513677657716932, + "grad_norm": 5.6818528175354, + "learning_rate": 4.862495868412643e-06, + "loss": 0.1075, + "step": 20299 + }, + { + "epoch": 0.5137029632816257, + "grad_norm": 7.95797061920166, + "learning_rate": 4.862094498180492e-06, + "loss": 0.1929, + "step": 20300 + }, + { + "epoch": 0.5137282688463193, + "grad_norm": 4.709873199462891, + "learning_rate": 4.8616931288376716e-06, + "loss": 0.1562, + "step": 20301 + }, + { + "epoch": 0.513753574411013, + "grad_norm": 4.002546787261963, + "learning_rate": 4.861291760386762e-06, + "loss": 0.1191, + "step": 20302 + }, + { + "epoch": 0.5137788799757067, + "grad_norm": 3.6201844215393066, + "learning_rate": 4.860890392830358e-06, + "loss": 0.1318, + "step": 20303 + }, + { + "epoch": 0.5138041855404003, + "grad_norm": 14.839312553405762, + "learning_rate": 4.860489026171044e-06, + "loss": 0.2918, + "step": 20304 + }, + { + "epoch": 0.513829491105094, + "grad_norm": 5.67725944519043, + "learning_rate": 4.860087660411413e-06, + "loss": 0.1843, + "step": 20305 + }, + { + "epoch": 0.5138547966697877, + "grad_norm": 7.882839679718018, + "learning_rate": 4.8596862955540495e-06, + "loss": 0.1862, + "step": 20306 + }, + { + "epoch": 0.5138801022344813, + "grad_norm": 4.766428470611572, + "learning_rate": 4.859284931601543e-06, + "loss": 0.1198, + "step": 20307 + }, + { + "epoch": 0.513905407799175, + "grad_norm": 5.376378059387207, + "learning_rate": 4.858883568556482e-06, + "loss": 0.1917, + "step": 20308 + }, + { + "epoch": 0.5139307133638688, + "grad_norm": 3.6294896602630615, + "learning_rate": 4.858482206421454e-06, + "loss": 0.1722, + "step": 20309 + }, + { + "epoch": 0.5139560189285624, + "grad_norm": 3.527932643890381, + "learning_rate": 4.8580808451990485e-06, + "loss": 0.1782, + "step": 20310 + }, + { + "epoch": 0.5139813244932561, + "grad_norm": 5.3799285888671875, + "learning_rate": 4.857679484891853e-06, + "loss": 0.0971, + "step": 20311 + }, + { + "epoch": 0.5140066300579498, + "grad_norm": 3.4859778881073, + "learning_rate": 4.857278125502457e-06, + "loss": 0.1584, + "step": 20312 + }, + { + "epoch": 0.5140319356226434, + "grad_norm": 6.424849510192871, + "learning_rate": 4.856876767033445e-06, + "loss": 0.2063, + "step": 20313 + }, + { + "epoch": 0.5140572411873371, + "grad_norm": 6.147964000701904, + "learning_rate": 4.85647540948741e-06, + "loss": 0.1443, + "step": 20314 + }, + { + "epoch": 0.5140825467520308, + "grad_norm": 6.361465930938721, + "learning_rate": 4.85607405286694e-06, + "loss": 0.2086, + "step": 20315 + }, + { + "epoch": 0.5141078523167244, + "grad_norm": 8.900641441345215, + "learning_rate": 4.8556726971746196e-06, + "loss": 0.2215, + "step": 20316 + }, + { + "epoch": 0.5141331578814181, + "grad_norm": 3.080583095550537, + "learning_rate": 4.855271342413039e-06, + "loss": 0.0975, + "step": 20317 + }, + { + "epoch": 0.5141584634461118, + "grad_norm": 4.131297588348389, + "learning_rate": 4.854869988584786e-06, + "loss": 0.1394, + "step": 20318 + }, + { + "epoch": 0.5141837690108054, + "grad_norm": 7.855400085449219, + "learning_rate": 4.854468635692453e-06, + "loss": 0.1456, + "step": 20319 + }, + { + "epoch": 0.5142090745754991, + "grad_norm": 2.195404052734375, + "learning_rate": 4.854067283738622e-06, + "loss": 0.0733, + "step": 20320 + }, + { + "epoch": 0.5142343801401928, + "grad_norm": 14.409914016723633, + "learning_rate": 4.853665932725883e-06, + "loss": 0.1446, + "step": 20321 + }, + { + "epoch": 0.5142596857048866, + "grad_norm": 17.200170516967773, + "learning_rate": 4.853264582656826e-06, + "loss": 0.2264, + "step": 20322 + }, + { + "epoch": 0.5142849912695802, + "grad_norm": 5.258547306060791, + "learning_rate": 4.852863233534041e-06, + "loss": 0.1258, + "step": 20323 + }, + { + "epoch": 0.5143102968342739, + "grad_norm": 4.51959753036499, + "learning_rate": 4.85246188536011e-06, + "loss": 0.1353, + "step": 20324 + }, + { + "epoch": 0.5143356023989676, + "grad_norm": 6.264570236206055, + "learning_rate": 4.852060538137627e-06, + "loss": 0.1694, + "step": 20325 + }, + { + "epoch": 0.5143609079636612, + "grad_norm": 5.910211563110352, + "learning_rate": 4.8516591918691765e-06, + "loss": 0.2335, + "step": 20326 + }, + { + "epoch": 0.5143862135283549, + "grad_norm": 8.665146827697754, + "learning_rate": 4.851257846557349e-06, + "loss": 0.2237, + "step": 20327 + }, + { + "epoch": 0.5144115190930486, + "grad_norm": 5.520401954650879, + "learning_rate": 4.850856502204733e-06, + "loss": 0.1256, + "step": 20328 + }, + { + "epoch": 0.5144368246577422, + "grad_norm": 6.337146282196045, + "learning_rate": 4.8504551588139145e-06, + "loss": 0.2427, + "step": 20329 + }, + { + "epoch": 0.5144621302224359, + "grad_norm": 5.891402721405029, + "learning_rate": 4.8500538163874825e-06, + "loss": 0.1834, + "step": 20330 + }, + { + "epoch": 0.5144874357871296, + "grad_norm": 4.001949787139893, + "learning_rate": 4.849652474928026e-06, + "loss": 0.1645, + "step": 20331 + }, + { + "epoch": 0.5145127413518232, + "grad_norm": 5.926849365234375, + "learning_rate": 4.849251134438135e-06, + "loss": 0.1734, + "step": 20332 + }, + { + "epoch": 0.514538046916517, + "grad_norm": 5.909029006958008, + "learning_rate": 4.848849794920393e-06, + "loss": 0.2211, + "step": 20333 + }, + { + "epoch": 0.5145633524812107, + "grad_norm": 2.91294264793396, + "learning_rate": 4.84844845637739e-06, + "loss": 0.1171, + "step": 20334 + }, + { + "epoch": 0.5145886580459043, + "grad_norm": 13.018279075622559, + "learning_rate": 4.848047118811716e-06, + "loss": 0.2402, + "step": 20335 + }, + { + "epoch": 0.514613963610598, + "grad_norm": 10.806249618530273, + "learning_rate": 4.847645782225957e-06, + "loss": 0.1839, + "step": 20336 + }, + { + "epoch": 0.5146392691752917, + "grad_norm": 8.441780090332031, + "learning_rate": 4.8472444466227035e-06, + "loss": 0.1943, + "step": 20337 + }, + { + "epoch": 0.5146645747399853, + "grad_norm": 5.175517559051514, + "learning_rate": 4.84684311200454e-06, + "loss": 0.2532, + "step": 20338 + }, + { + "epoch": 0.514689880304679, + "grad_norm": 8.009419441223145, + "learning_rate": 4.846441778374059e-06, + "loss": 0.2769, + "step": 20339 + }, + { + "epoch": 0.5147151858693727, + "grad_norm": 3.283857583999634, + "learning_rate": 4.846040445733845e-06, + "loss": 0.1192, + "step": 20340 + }, + { + "epoch": 0.5147404914340663, + "grad_norm": 7.081120491027832, + "learning_rate": 4.845639114086488e-06, + "loss": 0.236, + "step": 20341 + }, + { + "epoch": 0.51476579699876, + "grad_norm": 4.533167362213135, + "learning_rate": 4.845237783434576e-06, + "loss": 0.1546, + "step": 20342 + }, + { + "epoch": 0.5147911025634537, + "grad_norm": 7.033300876617432, + "learning_rate": 4.844836453780696e-06, + "loss": 0.1644, + "step": 20343 + }, + { + "epoch": 0.5148164081281473, + "grad_norm": 3.8949856758117676, + "learning_rate": 4.844435125127436e-06, + "loss": 0.1372, + "step": 20344 + }, + { + "epoch": 0.514841713692841, + "grad_norm": 7.649746417999268, + "learning_rate": 4.844033797477388e-06, + "loss": 0.194, + "step": 20345 + }, + { + "epoch": 0.5148670192575348, + "grad_norm": 15.880989074707031, + "learning_rate": 4.843632470833135e-06, + "loss": 0.1123, + "step": 20346 + }, + { + "epoch": 0.5148923248222284, + "grad_norm": 8.213465690612793, + "learning_rate": 4.843231145197267e-06, + "loss": 0.1371, + "step": 20347 + }, + { + "epoch": 0.5149176303869221, + "grad_norm": 11.903099060058594, + "learning_rate": 4.842829820572371e-06, + "loss": 0.2839, + "step": 20348 + }, + { + "epoch": 0.5149429359516158, + "grad_norm": 5.373499870300293, + "learning_rate": 4.842428496961037e-06, + "loss": 0.2286, + "step": 20349 + }, + { + "epoch": 0.5149682415163095, + "grad_norm": 5.562467575073242, + "learning_rate": 4.842027174365855e-06, + "loss": 0.2501, + "step": 20350 + }, + { + "epoch": 0.5149935470810031, + "grad_norm": 4.609622001647949, + "learning_rate": 4.841625852789408e-06, + "loss": 0.1532, + "step": 20351 + }, + { + "epoch": 0.5150188526456968, + "grad_norm": 7.454356670379639, + "learning_rate": 4.841224532234285e-06, + "loss": 0.1801, + "step": 20352 + }, + { + "epoch": 0.5150441582103905, + "grad_norm": 9.09681510925293, + "learning_rate": 4.840823212703077e-06, + "loss": 0.3149, + "step": 20353 + }, + { + "epoch": 0.5150694637750841, + "grad_norm": 6.590664863586426, + "learning_rate": 4.840421894198373e-06, + "loss": 0.1006, + "step": 20354 + }, + { + "epoch": 0.5150947693397778, + "grad_norm": 4.321986198425293, + "learning_rate": 4.840020576722756e-06, + "loss": 0.1528, + "step": 20355 + }, + { + "epoch": 0.5151200749044715, + "grad_norm": 3.791839122772217, + "learning_rate": 4.8396192602788155e-06, + "loss": 0.1339, + "step": 20356 + }, + { + "epoch": 0.5151453804691651, + "grad_norm": 9.2490234375, + "learning_rate": 4.839217944869141e-06, + "loss": 0.1865, + "step": 20357 + }, + { + "epoch": 0.5151706860338588, + "grad_norm": 4.900667190551758, + "learning_rate": 4.83881663049632e-06, + "loss": 0.1124, + "step": 20358 + }, + { + "epoch": 0.5151959915985526, + "grad_norm": 2.984457492828369, + "learning_rate": 4.838415317162944e-06, + "loss": 0.104, + "step": 20359 + }, + { + "epoch": 0.5152212971632462, + "grad_norm": 3.4377102851867676, + "learning_rate": 4.838014004871594e-06, + "loss": 0.1276, + "step": 20360 + }, + { + "epoch": 0.5152466027279399, + "grad_norm": 9.8783540725708, + "learning_rate": 4.8376126936248616e-06, + "loss": 0.2018, + "step": 20361 + }, + { + "epoch": 0.5152719082926336, + "grad_norm": 4.948936939239502, + "learning_rate": 4.837211383425335e-06, + "loss": 0.2222, + "step": 20362 + }, + { + "epoch": 0.5152972138573272, + "grad_norm": 4.42838716506958, + "learning_rate": 4.836810074275605e-06, + "loss": 0.1884, + "step": 20363 + }, + { + "epoch": 0.5153225194220209, + "grad_norm": 2.6841163635253906, + "learning_rate": 4.836408766178254e-06, + "loss": 0.1171, + "step": 20364 + }, + { + "epoch": 0.5153478249867146, + "grad_norm": 6.142184734344482, + "learning_rate": 4.836007459135872e-06, + "loss": 0.197, + "step": 20365 + }, + { + "epoch": 0.5153731305514082, + "grad_norm": 3.429656982421875, + "learning_rate": 4.835606153151048e-06, + "loss": 0.0776, + "step": 20366 + }, + { + "epoch": 0.5153984361161019, + "grad_norm": 3.4855408668518066, + "learning_rate": 4.83520484822637e-06, + "loss": 0.133, + "step": 20367 + }, + { + "epoch": 0.5154237416807956, + "grad_norm": 6.361388206481934, + "learning_rate": 4.834803544364424e-06, + "loss": 0.1319, + "step": 20368 + }, + { + "epoch": 0.5154490472454892, + "grad_norm": 3.222451686859131, + "learning_rate": 4.8344022415678e-06, + "loss": 0.1625, + "step": 20369 + }, + { + "epoch": 0.515474352810183, + "grad_norm": 3.780414581298828, + "learning_rate": 4.834000939839084e-06, + "loss": 0.1547, + "step": 20370 + }, + { + "epoch": 0.5154996583748767, + "grad_norm": 7.144022464752197, + "learning_rate": 4.833599639180865e-06, + "loss": 0.197, + "step": 20371 + }, + { + "epoch": 0.5155249639395703, + "grad_norm": 3.462714433670044, + "learning_rate": 4.833198339595734e-06, + "loss": 0.077, + "step": 20372 + }, + { + "epoch": 0.515550269504264, + "grad_norm": 4.436502933502197, + "learning_rate": 4.832797041086273e-06, + "loss": 0.1694, + "step": 20373 + }, + { + "epoch": 0.5155755750689577, + "grad_norm": 8.0274076461792, + "learning_rate": 4.832395743655073e-06, + "loss": 0.1805, + "step": 20374 + }, + { + "epoch": 0.5156008806336514, + "grad_norm": 8.079436302185059, + "learning_rate": 4.831994447304722e-06, + "loss": 0.2114, + "step": 20375 + }, + { + "epoch": 0.515626186198345, + "grad_norm": 4.024594783782959, + "learning_rate": 4.831593152037809e-06, + "loss": 0.0832, + "step": 20376 + }, + { + "epoch": 0.5156514917630387, + "grad_norm": 3.0313005447387695, + "learning_rate": 4.831191857856919e-06, + "loss": 0.1106, + "step": 20377 + }, + { + "epoch": 0.5156767973277324, + "grad_norm": 12.633896827697754, + "learning_rate": 4.830790564764641e-06, + "loss": 0.1389, + "step": 20378 + }, + { + "epoch": 0.515702102892426, + "grad_norm": 4.9620537757873535, + "learning_rate": 4.830389272763563e-06, + "loss": 0.223, + "step": 20379 + }, + { + "epoch": 0.5157274084571197, + "grad_norm": 13.512429237365723, + "learning_rate": 4.829987981856273e-06, + "loss": 0.1702, + "step": 20380 + }, + { + "epoch": 0.5157527140218134, + "grad_norm": 7.002645969390869, + "learning_rate": 4.829586692045362e-06, + "loss": 0.2394, + "step": 20381 + }, + { + "epoch": 0.515778019586507, + "grad_norm": 2.9922611713409424, + "learning_rate": 4.829185403333412e-06, + "loss": 0.092, + "step": 20382 + }, + { + "epoch": 0.5158033251512008, + "grad_norm": 8.041255950927734, + "learning_rate": 4.828784115723012e-06, + "loss": 0.2332, + "step": 20383 + }, + { + "epoch": 0.5158286307158945, + "grad_norm": 2.7990570068359375, + "learning_rate": 4.828382829216754e-06, + "loss": 0.1007, + "step": 20384 + }, + { + "epoch": 0.5158539362805881, + "grad_norm": 4.772973537445068, + "learning_rate": 4.827981543817223e-06, + "loss": 0.1707, + "step": 20385 + }, + { + "epoch": 0.5158792418452818, + "grad_norm": 12.894861221313477, + "learning_rate": 4.827580259527007e-06, + "loss": 0.1813, + "step": 20386 + }, + { + "epoch": 0.5159045474099755, + "grad_norm": 11.320365905761719, + "learning_rate": 4.827178976348692e-06, + "loss": 0.3086, + "step": 20387 + }, + { + "epoch": 0.5159298529746691, + "grad_norm": 3.8887245655059814, + "learning_rate": 4.826777694284869e-06, + "loss": 0.1403, + "step": 20388 + }, + { + "epoch": 0.5159551585393628, + "grad_norm": 5.1030168533325195, + "learning_rate": 4.826376413338124e-06, + "loss": 0.1173, + "step": 20389 + }, + { + "epoch": 0.5159804641040565, + "grad_norm": 7.124150276184082, + "learning_rate": 4.825975133511047e-06, + "loss": 0.1304, + "step": 20390 + }, + { + "epoch": 0.5160057696687501, + "grad_norm": 6.506715774536133, + "learning_rate": 4.825573854806222e-06, + "loss": 0.1848, + "step": 20391 + }, + { + "epoch": 0.5160310752334438, + "grad_norm": 4.313838005065918, + "learning_rate": 4.825172577226239e-06, + "loss": 0.0855, + "step": 20392 + }, + { + "epoch": 0.5160563807981375, + "grad_norm": 6.980526447296143, + "learning_rate": 4.824771300773686e-06, + "loss": 0.1896, + "step": 20393 + }, + { + "epoch": 0.5160816863628311, + "grad_norm": 11.253807067871094, + "learning_rate": 4.824370025451151e-06, + "loss": 0.209, + "step": 20394 + }, + { + "epoch": 0.5161069919275248, + "grad_norm": 2.9746177196502686, + "learning_rate": 4.82396875126122e-06, + "loss": 0.1316, + "step": 20395 + }, + { + "epoch": 0.5161322974922186, + "grad_norm": 7.095687389373779, + "learning_rate": 4.823567478206482e-06, + "loss": 0.2194, + "step": 20396 + }, + { + "epoch": 0.5161576030569122, + "grad_norm": 4.400537490844727, + "learning_rate": 4.823166206289524e-06, + "loss": 0.1077, + "step": 20397 + }, + { + "epoch": 0.5161829086216059, + "grad_norm": 6.932568073272705, + "learning_rate": 4.822764935512935e-06, + "loss": 0.1637, + "step": 20398 + }, + { + "epoch": 0.5162082141862996, + "grad_norm": 5.452171325683594, + "learning_rate": 4.822363665879302e-06, + "loss": 0.1484, + "step": 20399 + }, + { + "epoch": 0.5162335197509933, + "grad_norm": 10.298802375793457, + "learning_rate": 4.821962397391213e-06, + "loss": 0.1603, + "step": 20400 + }, + { + "epoch": 0.5162588253156869, + "grad_norm": 4.24329948425293, + "learning_rate": 4.821561130051254e-06, + "loss": 0.1615, + "step": 20401 + }, + { + "epoch": 0.5162841308803806, + "grad_norm": 7.760888576507568, + "learning_rate": 4.821159863862015e-06, + "loss": 0.3478, + "step": 20402 + }, + { + "epoch": 0.5163094364450743, + "grad_norm": 5.472443580627441, + "learning_rate": 4.820758598826084e-06, + "loss": 0.1602, + "step": 20403 + }, + { + "epoch": 0.5163347420097679, + "grad_norm": 8.508965492248535, + "learning_rate": 4.820357334946045e-06, + "loss": 0.2968, + "step": 20404 + }, + { + "epoch": 0.5163600475744616, + "grad_norm": 6.114377975463867, + "learning_rate": 4.819956072224489e-06, + "loss": 0.1251, + "step": 20405 + }, + { + "epoch": 0.5163853531391553, + "grad_norm": 3.8363146781921387, + "learning_rate": 4.8195548106640024e-06, + "loss": 0.1639, + "step": 20406 + }, + { + "epoch": 0.516410658703849, + "grad_norm": 5.201504230499268, + "learning_rate": 4.819153550267175e-06, + "loss": 0.215, + "step": 20407 + }, + { + "epoch": 0.5164359642685427, + "grad_norm": 3.55901837348938, + "learning_rate": 4.818752291036591e-06, + "loss": 0.1199, + "step": 20408 + }, + { + "epoch": 0.5164612698332364, + "grad_norm": 4.972647190093994, + "learning_rate": 4.81835103297484e-06, + "loss": 0.1552, + "step": 20409 + }, + { + "epoch": 0.51648657539793, + "grad_norm": 9.173348426818848, + "learning_rate": 4.8179497760845095e-06, + "loss": 0.27, + "step": 20410 + }, + { + "epoch": 0.5165118809626237, + "grad_norm": 3.7626988887786865, + "learning_rate": 4.8175485203681864e-06, + "loss": 0.183, + "step": 20411 + }, + { + "epoch": 0.5165371865273174, + "grad_norm": 4.065830707550049, + "learning_rate": 4.817147265828462e-06, + "loss": 0.1461, + "step": 20412 + }, + { + "epoch": 0.516562492092011, + "grad_norm": 3.364077568054199, + "learning_rate": 4.8167460124679175e-06, + "loss": 0.1606, + "step": 20413 + }, + { + "epoch": 0.5165877976567047, + "grad_norm": 4.7539825439453125, + "learning_rate": 4.816344760289145e-06, + "loss": 0.148, + "step": 20414 + }, + { + "epoch": 0.5166131032213984, + "grad_norm": 3.230633497238159, + "learning_rate": 4.81594350929473e-06, + "loss": 0.1396, + "step": 20415 + }, + { + "epoch": 0.516638408786092, + "grad_norm": 4.736554145812988, + "learning_rate": 4.815542259487264e-06, + "loss": 0.1442, + "step": 20416 + }, + { + "epoch": 0.5166637143507857, + "grad_norm": 6.135466575622559, + "learning_rate": 4.8151410108693296e-06, + "loss": 0.262, + "step": 20417 + }, + { + "epoch": 0.5166890199154794, + "grad_norm": 5.4462971687316895, + "learning_rate": 4.814739763443516e-06, + "loss": 0.1713, + "step": 20418 + }, + { + "epoch": 0.516714325480173, + "grad_norm": 3.6446380615234375, + "learning_rate": 4.814338517212411e-06, + "loss": 0.1058, + "step": 20419 + }, + { + "epoch": 0.5167396310448668, + "grad_norm": 3.7523698806762695, + "learning_rate": 4.813937272178606e-06, + "loss": 0.1107, + "step": 20420 + }, + { + "epoch": 0.5167649366095605, + "grad_norm": 7.165191650390625, + "learning_rate": 4.813536028344681e-06, + "loss": 0.2209, + "step": 20421 + }, + { + "epoch": 0.5167902421742541, + "grad_norm": 11.155963897705078, + "learning_rate": 4.813134785713228e-06, + "loss": 0.2109, + "step": 20422 + }, + { + "epoch": 0.5168155477389478, + "grad_norm": 6.149610996246338, + "learning_rate": 4.8127335442868335e-06, + "loss": 0.1976, + "step": 20423 + }, + { + "epoch": 0.5168408533036415, + "grad_norm": 2.293333053588867, + "learning_rate": 4.812332304068088e-06, + "loss": 0.1183, + "step": 20424 + }, + { + "epoch": 0.5168661588683352, + "grad_norm": 2.9336745738983154, + "learning_rate": 4.811931065059575e-06, + "loss": 0.1508, + "step": 20425 + }, + { + "epoch": 0.5168914644330288, + "grad_norm": 7.8737335205078125, + "learning_rate": 4.811529827263884e-06, + "loss": 0.2174, + "step": 20426 + }, + { + "epoch": 0.5169167699977225, + "grad_norm": 3.1311004161834717, + "learning_rate": 4.811128590683602e-06, + "loss": 0.0949, + "step": 20427 + }, + { + "epoch": 0.5169420755624162, + "grad_norm": 3.3959145545959473, + "learning_rate": 4.810727355321316e-06, + "loss": 0.1174, + "step": 20428 + }, + { + "epoch": 0.5169673811271098, + "grad_norm": 6.563007831573486, + "learning_rate": 4.810326121179615e-06, + "loss": 0.19, + "step": 20429 + }, + { + "epoch": 0.5169926866918035, + "grad_norm": 4.187681198120117, + "learning_rate": 4.809924888261085e-06, + "loss": 0.2422, + "step": 20430 + }, + { + "epoch": 0.5170179922564972, + "grad_norm": 4.161891460418701, + "learning_rate": 4.809523656568314e-06, + "loss": 0.1622, + "step": 20431 + }, + { + "epoch": 0.5170432978211909, + "grad_norm": 5.622794151306152, + "learning_rate": 4.8091224261038895e-06, + "loss": 0.2162, + "step": 20432 + }, + { + "epoch": 0.5170686033858846, + "grad_norm": 6.042981147766113, + "learning_rate": 4.808721196870399e-06, + "loss": 0.1638, + "step": 20433 + }, + { + "epoch": 0.5170939089505783, + "grad_norm": 2.6173250675201416, + "learning_rate": 4.808319968870431e-06, + "loss": 0.0837, + "step": 20434 + }, + { + "epoch": 0.5171192145152719, + "grad_norm": 3.469510078430176, + "learning_rate": 4.80791874210657e-06, + "loss": 0.0963, + "step": 20435 + }, + { + "epoch": 0.5171445200799656, + "grad_norm": 2.62056040763855, + "learning_rate": 4.807517516581406e-06, + "loss": 0.0911, + "step": 20436 + }, + { + "epoch": 0.5171698256446593, + "grad_norm": 9.311083793640137, + "learning_rate": 4.807116292297526e-06, + "loss": 0.1824, + "step": 20437 + }, + { + "epoch": 0.5171951312093529, + "grad_norm": 21.050506591796875, + "learning_rate": 4.806715069257518e-06, + "loss": 0.1345, + "step": 20438 + }, + { + "epoch": 0.5172204367740466, + "grad_norm": 3.0032870769500732, + "learning_rate": 4.806313847463967e-06, + "loss": 0.1484, + "step": 20439 + }, + { + "epoch": 0.5172457423387403, + "grad_norm": 3.575232982635498, + "learning_rate": 4.805912626919463e-06, + "loss": 0.1261, + "step": 20440 + }, + { + "epoch": 0.5172710479034339, + "grad_norm": 3.080246925354004, + "learning_rate": 4.8055114076265905e-06, + "loss": 0.1292, + "step": 20441 + }, + { + "epoch": 0.5172963534681276, + "grad_norm": 6.045753479003906, + "learning_rate": 4.8051101895879395e-06, + "loss": 0.1701, + "step": 20442 + }, + { + "epoch": 0.5173216590328213, + "grad_norm": 4.470427513122559, + "learning_rate": 4.8047089728061e-06, + "loss": 0.1543, + "step": 20443 + }, + { + "epoch": 0.517346964597515, + "grad_norm": 4.335832118988037, + "learning_rate": 4.804307757283652e-06, + "loss": 0.1738, + "step": 20444 + }, + { + "epoch": 0.5173722701622087, + "grad_norm": 4.9668869972229, + "learning_rate": 4.803906543023188e-06, + "loss": 0.1311, + "step": 20445 + }, + { + "epoch": 0.5173975757269024, + "grad_norm": 6.310513496398926, + "learning_rate": 4.803505330027293e-06, + "loss": 0.1652, + "step": 20446 + }, + { + "epoch": 0.517422881291596, + "grad_norm": 13.656530380249023, + "learning_rate": 4.803104118298559e-06, + "loss": 0.1525, + "step": 20447 + }, + { + "epoch": 0.5174481868562897, + "grad_norm": 3.830441474914551, + "learning_rate": 4.802702907839568e-06, + "loss": 0.1259, + "step": 20448 + }, + { + "epoch": 0.5174734924209834, + "grad_norm": 9.969521522521973, + "learning_rate": 4.802301698652909e-06, + "loss": 0.159, + "step": 20449 + }, + { + "epoch": 0.5174987979856771, + "grad_norm": 4.545116901397705, + "learning_rate": 4.801900490741169e-06, + "loss": 0.2, + "step": 20450 + }, + { + "epoch": 0.5175241035503707, + "grad_norm": 3.8565285205841064, + "learning_rate": 4.801499284106939e-06, + "loss": 0.1287, + "step": 20451 + }, + { + "epoch": 0.5175494091150644, + "grad_norm": 3.919818162918091, + "learning_rate": 4.801098078752799e-06, + "loss": 0.1792, + "step": 20452 + }, + { + "epoch": 0.5175747146797581, + "grad_norm": 6.927034378051758, + "learning_rate": 4.800696874681342e-06, + "loss": 0.2272, + "step": 20453 + }, + { + "epoch": 0.5176000202444517, + "grad_norm": 8.196860313415527, + "learning_rate": 4.800295671895155e-06, + "loss": 0.2701, + "step": 20454 + }, + { + "epoch": 0.5176253258091454, + "grad_norm": 6.942233085632324, + "learning_rate": 4.799894470396822e-06, + "loss": 0.1803, + "step": 20455 + }, + { + "epoch": 0.5176506313738392, + "grad_norm": 3.975700616836548, + "learning_rate": 4.799493270188935e-06, + "loss": 0.1296, + "step": 20456 + }, + { + "epoch": 0.5176759369385328, + "grad_norm": 3.965400218963623, + "learning_rate": 4.799092071274077e-06, + "loss": 0.0895, + "step": 20457 + }, + { + "epoch": 0.5177012425032265, + "grad_norm": 2.5254733562469482, + "learning_rate": 4.7986908736548375e-06, + "loss": 0.1195, + "step": 20458 + }, + { + "epoch": 0.5177265480679202, + "grad_norm": 4.2090301513671875, + "learning_rate": 4.798289677333801e-06, + "loss": 0.1392, + "step": 20459 + }, + { + "epoch": 0.5177518536326138, + "grad_norm": 12.223481178283691, + "learning_rate": 4.7978884823135595e-06, + "loss": 0.1301, + "step": 20460 + }, + { + "epoch": 0.5177771591973075, + "grad_norm": 5.520936012268066, + "learning_rate": 4.797487288596697e-06, + "loss": 0.1452, + "step": 20461 + }, + { + "epoch": 0.5178024647620012, + "grad_norm": 9.137392044067383, + "learning_rate": 4.797086096185801e-06, + "loss": 0.2741, + "step": 20462 + }, + { + "epoch": 0.5178277703266948, + "grad_norm": 13.368694305419922, + "learning_rate": 4.796684905083458e-06, + "loss": 0.2775, + "step": 20463 + }, + { + "epoch": 0.5178530758913885, + "grad_norm": 3.744269371032715, + "learning_rate": 4.7962837152922565e-06, + "loss": 0.1814, + "step": 20464 + }, + { + "epoch": 0.5178783814560822, + "grad_norm": 8.923718452453613, + "learning_rate": 4.795882526814786e-06, + "loss": 0.2341, + "step": 20465 + }, + { + "epoch": 0.5179036870207758, + "grad_norm": 5.122623443603516, + "learning_rate": 4.7954813396536285e-06, + "loss": 0.1799, + "step": 20466 + }, + { + "epoch": 0.5179289925854695, + "grad_norm": 5.821515083312988, + "learning_rate": 4.795080153811373e-06, + "loss": 0.1351, + "step": 20467 + }, + { + "epoch": 0.5179542981501633, + "grad_norm": 5.784670352935791, + "learning_rate": 4.794678969290609e-06, + "loss": 0.1642, + "step": 20468 + }, + { + "epoch": 0.5179796037148569, + "grad_norm": 3.252246856689453, + "learning_rate": 4.794277786093924e-06, + "loss": 0.1412, + "step": 20469 + }, + { + "epoch": 0.5180049092795506, + "grad_norm": 3.1239609718322754, + "learning_rate": 4.793876604223901e-06, + "loss": 0.1637, + "step": 20470 + }, + { + "epoch": 0.5180302148442443, + "grad_norm": 5.994378089904785, + "learning_rate": 4.793475423683129e-06, + "loss": 0.1731, + "step": 20471 + }, + { + "epoch": 0.5180555204089379, + "grad_norm": 4.994276523590088, + "learning_rate": 4.793074244474195e-06, + "loss": 0.2132, + "step": 20472 + }, + { + "epoch": 0.5180808259736316, + "grad_norm": 2.0101330280303955, + "learning_rate": 4.79267306659969e-06, + "loss": 0.1531, + "step": 20473 + }, + { + "epoch": 0.5181061315383253, + "grad_norm": 3.130720853805542, + "learning_rate": 4.792271890062195e-06, + "loss": 0.1574, + "step": 20474 + }, + { + "epoch": 0.5181314371030189, + "grad_norm": 4.238537788391113, + "learning_rate": 4.791870714864301e-06, + "loss": 0.1521, + "step": 20475 + }, + { + "epoch": 0.5181567426677126, + "grad_norm": 9.846002578735352, + "learning_rate": 4.791469541008593e-06, + "loss": 0.1878, + "step": 20476 + }, + { + "epoch": 0.5181820482324063, + "grad_norm": 12.449324607849121, + "learning_rate": 4.791068368497659e-06, + "loss": 0.2538, + "step": 20477 + }, + { + "epoch": 0.5182073537971, + "grad_norm": 9.125490188598633, + "learning_rate": 4.790667197334089e-06, + "loss": 0.2275, + "step": 20478 + }, + { + "epoch": 0.5182326593617936, + "grad_norm": 6.435585021972656, + "learning_rate": 4.7902660275204655e-06, + "loss": 0.193, + "step": 20479 + }, + { + "epoch": 0.5182579649264873, + "grad_norm": 3.9944474697113037, + "learning_rate": 4.789864859059376e-06, + "loss": 0.0993, + "step": 20480 + }, + { + "epoch": 0.5182832704911811, + "grad_norm": 6.571975231170654, + "learning_rate": 4.789463691953411e-06, + "loss": 0.2836, + "step": 20481 + }, + { + "epoch": 0.5183085760558747, + "grad_norm": 5.848330974578857, + "learning_rate": 4.789062526205156e-06, + "loss": 0.1452, + "step": 20482 + }, + { + "epoch": 0.5183338816205684, + "grad_norm": 4.682497978210449, + "learning_rate": 4.788661361817196e-06, + "loss": 0.1777, + "step": 20483 + }, + { + "epoch": 0.5183591871852621, + "grad_norm": 14.298690795898438, + "learning_rate": 4.788260198792119e-06, + "loss": 0.2336, + "step": 20484 + }, + { + "epoch": 0.5183844927499557, + "grad_norm": 9.047517776489258, + "learning_rate": 4.787859037132514e-06, + "loss": 0.2021, + "step": 20485 + }, + { + "epoch": 0.5184097983146494, + "grad_norm": 7.158298015594482, + "learning_rate": 4.787457876840966e-06, + "loss": 0.1757, + "step": 20486 + }, + { + "epoch": 0.5184351038793431, + "grad_norm": 4.034364700317383, + "learning_rate": 4.787056717920063e-06, + "loss": 0.1948, + "step": 20487 + }, + { + "epoch": 0.5184604094440367, + "grad_norm": 5.167674541473389, + "learning_rate": 4.786655560372392e-06, + "loss": 0.1887, + "step": 20488 + }, + { + "epoch": 0.5184857150087304, + "grad_norm": 3.966512680053711, + "learning_rate": 4.786254404200539e-06, + "loss": 0.142, + "step": 20489 + }, + { + "epoch": 0.5185110205734241, + "grad_norm": 2.9262142181396484, + "learning_rate": 4.78585324940709e-06, + "loss": 0.1178, + "step": 20490 + }, + { + "epoch": 0.5185363261381177, + "grad_norm": 4.0786237716674805, + "learning_rate": 4.7854520959946375e-06, + "loss": 0.1318, + "step": 20491 + }, + { + "epoch": 0.5185616317028114, + "grad_norm": 10.41358470916748, + "learning_rate": 4.785050943965761e-06, + "loss": 0.2045, + "step": 20492 + }, + { + "epoch": 0.5185869372675052, + "grad_norm": 9.314498901367188, + "learning_rate": 4.784649793323052e-06, + "loss": 0.2526, + "step": 20493 + }, + { + "epoch": 0.5186122428321988, + "grad_norm": 4.5059309005737305, + "learning_rate": 4.7842486440690964e-06, + "loss": 0.2006, + "step": 20494 + }, + { + "epoch": 0.5186375483968925, + "grad_norm": 8.789019584655762, + "learning_rate": 4.7838474962064806e-06, + "loss": 0.2681, + "step": 20495 + }, + { + "epoch": 0.5186628539615862, + "grad_norm": 3.9442062377929688, + "learning_rate": 4.783446349737795e-06, + "loss": 0.1694, + "step": 20496 + }, + { + "epoch": 0.5186881595262798, + "grad_norm": 8.793502807617188, + "learning_rate": 4.78304520466562e-06, + "loss": 0.1145, + "step": 20497 + }, + { + "epoch": 0.5187134650909735, + "grad_norm": 7.092657566070557, + "learning_rate": 4.782644060992546e-06, + "loss": 0.1826, + "step": 20498 + }, + { + "epoch": 0.5187387706556672, + "grad_norm": 4.600697994232178, + "learning_rate": 4.782242918721162e-06, + "loss": 0.1361, + "step": 20499 + }, + { + "epoch": 0.5187640762203608, + "grad_norm": 13.501062393188477, + "learning_rate": 4.781841777854054e-06, + "loss": 0.2146, + "step": 20500 + }, + { + "epoch": 0.5187893817850545, + "grad_norm": 8.628097534179688, + "learning_rate": 4.781440638393806e-06, + "loss": 0.2581, + "step": 20501 + }, + { + "epoch": 0.5188146873497482, + "grad_norm": 7.3243207931518555, + "learning_rate": 4.781039500343006e-06, + "loss": 0.1864, + "step": 20502 + }, + { + "epoch": 0.5188399929144419, + "grad_norm": 8.02256965637207, + "learning_rate": 4.780638363704242e-06, + "loss": 0.1281, + "step": 20503 + }, + { + "epoch": 0.5188652984791355, + "grad_norm": 7.722579002380371, + "learning_rate": 4.780237228480102e-06, + "loss": 0.1776, + "step": 20504 + }, + { + "epoch": 0.5188906040438293, + "grad_norm": 4.172252178192139, + "learning_rate": 4.77983609467317e-06, + "loss": 0.1281, + "step": 20505 + }, + { + "epoch": 0.518915909608523, + "grad_norm": 3.1549949645996094, + "learning_rate": 4.779434962286034e-06, + "loss": 0.1631, + "step": 20506 + }, + { + "epoch": 0.5189412151732166, + "grad_norm": 9.945977210998535, + "learning_rate": 4.77903383132128e-06, + "loss": 0.2744, + "step": 20507 + }, + { + "epoch": 0.5189665207379103, + "grad_norm": 7.255550384521484, + "learning_rate": 4.7786327017814965e-06, + "loss": 0.1513, + "step": 20508 + }, + { + "epoch": 0.518991826302604, + "grad_norm": 2.205962896347046, + "learning_rate": 4.7782315736692716e-06, + "loss": 0.0771, + "step": 20509 + }, + { + "epoch": 0.5190171318672976, + "grad_norm": 4.415345668792725, + "learning_rate": 4.777830446987187e-06, + "loss": 0.1145, + "step": 20510 + }, + { + "epoch": 0.5190424374319913, + "grad_norm": 5.930318832397461, + "learning_rate": 4.777429321737834e-06, + "loss": 0.2006, + "step": 20511 + }, + { + "epoch": 0.519067742996685, + "grad_norm": 6.892725467681885, + "learning_rate": 4.777028197923798e-06, + "loss": 0.2678, + "step": 20512 + }, + { + "epoch": 0.5190930485613786, + "grad_norm": 5.696280479431152, + "learning_rate": 4.7766270755476655e-06, + "loss": 0.1381, + "step": 20513 + }, + { + "epoch": 0.5191183541260723, + "grad_norm": 3.451242446899414, + "learning_rate": 4.776225954612023e-06, + "loss": 0.1854, + "step": 20514 + }, + { + "epoch": 0.519143659690766, + "grad_norm": 4.3286213874816895, + "learning_rate": 4.775824835119458e-06, + "loss": 0.1214, + "step": 20515 + }, + { + "epoch": 0.5191689652554596, + "grad_norm": 8.83336067199707, + "learning_rate": 4.775423717072557e-06, + "loss": 0.2702, + "step": 20516 + }, + { + "epoch": 0.5191942708201533, + "grad_norm": 5.765410423278809, + "learning_rate": 4.7750226004739055e-06, + "loss": 0.1445, + "step": 20517 + }, + { + "epoch": 0.5192195763848471, + "grad_norm": 4.482692718505859, + "learning_rate": 4.774621485326094e-06, + "loss": 0.1803, + "step": 20518 + }, + { + "epoch": 0.5192448819495407, + "grad_norm": 28.97693634033203, + "learning_rate": 4.774220371631705e-06, + "loss": 0.1732, + "step": 20519 + }, + { + "epoch": 0.5192701875142344, + "grad_norm": 4.160231113433838, + "learning_rate": 4.773819259393326e-06, + "loss": 0.1817, + "step": 20520 + }, + { + "epoch": 0.5192954930789281, + "grad_norm": 4.372133255004883, + "learning_rate": 4.773418148613545e-06, + "loss": 0.1691, + "step": 20521 + }, + { + "epoch": 0.5193207986436217, + "grad_norm": 5.844147682189941, + "learning_rate": 4.773017039294951e-06, + "loss": 0.1745, + "step": 20522 + }, + { + "epoch": 0.5193461042083154, + "grad_norm": 5.819137096405029, + "learning_rate": 4.772615931440124e-06, + "loss": 0.1512, + "step": 20523 + }, + { + "epoch": 0.5193714097730091, + "grad_norm": 2.4850192070007324, + "learning_rate": 4.772214825051656e-06, + "loss": 0.1243, + "step": 20524 + }, + { + "epoch": 0.5193967153377027, + "grad_norm": 4.3769211769104, + "learning_rate": 4.771813720132132e-06, + "loss": 0.1901, + "step": 20525 + }, + { + "epoch": 0.5194220209023964, + "grad_norm": 3.8806710243225098, + "learning_rate": 4.771412616684141e-06, + "loss": 0.1341, + "step": 20526 + }, + { + "epoch": 0.5194473264670901, + "grad_norm": 5.0355305671691895, + "learning_rate": 4.771011514710265e-06, + "loss": 0.166, + "step": 20527 + }, + { + "epoch": 0.5194726320317838, + "grad_norm": 5.5289692878723145, + "learning_rate": 4.7706104142130936e-06, + "loss": 0.2601, + "step": 20528 + }, + { + "epoch": 0.5194979375964774, + "grad_norm": 7.2228851318359375, + "learning_rate": 4.770209315195212e-06, + "loss": 0.1723, + "step": 20529 + }, + { + "epoch": 0.5195232431611712, + "grad_norm": 2.3662257194519043, + "learning_rate": 4.769808217659209e-06, + "loss": 0.1626, + "step": 20530 + }, + { + "epoch": 0.5195485487258649, + "grad_norm": 20.04590606689453, + "learning_rate": 4.769407121607671e-06, + "loss": 0.158, + "step": 20531 + }, + { + "epoch": 0.5195738542905585, + "grad_norm": 4.920316219329834, + "learning_rate": 4.769006027043182e-06, + "loss": 0.1183, + "step": 20532 + }, + { + "epoch": 0.5195991598552522, + "grad_norm": 4.751803398132324, + "learning_rate": 4.76860493396833e-06, + "loss": 0.1944, + "step": 20533 + }, + { + "epoch": 0.5196244654199459, + "grad_norm": 6.674017906188965, + "learning_rate": 4.768203842385702e-06, + "loss": 0.1929, + "step": 20534 + }, + { + "epoch": 0.5196497709846395, + "grad_norm": 5.453360557556152, + "learning_rate": 4.767802752297887e-06, + "loss": 0.2034, + "step": 20535 + }, + { + "epoch": 0.5196750765493332, + "grad_norm": 2.846703290939331, + "learning_rate": 4.767401663707466e-06, + "loss": 0.1383, + "step": 20536 + }, + { + "epoch": 0.5197003821140269, + "grad_norm": 5.246132850646973, + "learning_rate": 4.767000576617029e-06, + "loss": 0.1275, + "step": 20537 + }, + { + "epoch": 0.5197256876787205, + "grad_norm": 3.7771518230438232, + "learning_rate": 4.766599491029161e-06, + "loss": 0.1614, + "step": 20538 + }, + { + "epoch": 0.5197509932434142, + "grad_norm": 8.068325996398926, + "learning_rate": 4.766198406946451e-06, + "loss": 0.1267, + "step": 20539 + }, + { + "epoch": 0.5197762988081079, + "grad_norm": 8.899853706359863, + "learning_rate": 4.765797324371484e-06, + "loss": 0.1714, + "step": 20540 + }, + { + "epoch": 0.5198016043728015, + "grad_norm": 2.4993748664855957, + "learning_rate": 4.7653962433068455e-06, + "loss": 0.1489, + "step": 20541 + }, + { + "epoch": 0.5198269099374953, + "grad_norm": 3.4446117877960205, + "learning_rate": 4.7649951637551236e-06, + "loss": 0.1644, + "step": 20542 + }, + { + "epoch": 0.519852215502189, + "grad_norm": 7.725543975830078, + "learning_rate": 4.764594085718904e-06, + "loss": 0.176, + "step": 20543 + }, + { + "epoch": 0.5198775210668826, + "grad_norm": 2.6789276599884033, + "learning_rate": 4.7641930092007735e-06, + "loss": 0.1092, + "step": 20544 + }, + { + "epoch": 0.5199028266315763, + "grad_norm": 3.1116859912872314, + "learning_rate": 4.763791934203317e-06, + "loss": 0.1457, + "step": 20545 + }, + { + "epoch": 0.51992813219627, + "grad_norm": 4.471743106842041, + "learning_rate": 4.7633908607291246e-06, + "loss": 0.1263, + "step": 20546 + }, + { + "epoch": 0.5199534377609636, + "grad_norm": 4.686224460601807, + "learning_rate": 4.7629897887807795e-06, + "loss": 0.1088, + "step": 20547 + }, + { + "epoch": 0.5199787433256573, + "grad_norm": 4.115462303161621, + "learning_rate": 4.762588718360869e-06, + "loss": 0.1749, + "step": 20548 + }, + { + "epoch": 0.520004048890351, + "grad_norm": 5.405699729919434, + "learning_rate": 4.762187649471981e-06, + "loss": 0.1592, + "step": 20549 + }, + { + "epoch": 0.5200293544550446, + "grad_norm": 4.654289722442627, + "learning_rate": 4.761786582116699e-06, + "loss": 0.1115, + "step": 20550 + }, + { + "epoch": 0.5200546600197383, + "grad_norm": 13.777634620666504, + "learning_rate": 4.761385516297611e-06, + "loss": 0.3578, + "step": 20551 + }, + { + "epoch": 0.520079965584432, + "grad_norm": 3.4353737831115723, + "learning_rate": 4.760984452017303e-06, + "loss": 0.1489, + "step": 20552 + }, + { + "epoch": 0.5201052711491257, + "grad_norm": 2.8539905548095703, + "learning_rate": 4.760583389278365e-06, + "loss": 0.1304, + "step": 20553 + }, + { + "epoch": 0.5201305767138193, + "grad_norm": 3.413062334060669, + "learning_rate": 4.760182328083377e-06, + "loss": 0.1749, + "step": 20554 + }, + { + "epoch": 0.5201558822785131, + "grad_norm": 5.496903896331787, + "learning_rate": 4.759781268434929e-06, + "loss": 0.2324, + "step": 20555 + }, + { + "epoch": 0.5201811878432068, + "grad_norm": 11.409506797790527, + "learning_rate": 4.759380210335607e-06, + "loss": 0.2915, + "step": 20556 + }, + { + "epoch": 0.5202064934079004, + "grad_norm": 4.75686502456665, + "learning_rate": 4.758979153788e-06, + "loss": 0.1366, + "step": 20557 + }, + { + "epoch": 0.5202317989725941, + "grad_norm": 14.371260643005371, + "learning_rate": 4.7585780987946884e-06, + "loss": 0.3031, + "step": 20558 + }, + { + "epoch": 0.5202571045372878, + "grad_norm": 3.9168007373809814, + "learning_rate": 4.758177045358261e-06, + "loss": 0.1897, + "step": 20559 + }, + { + "epoch": 0.5202824101019814, + "grad_norm": 6.410568714141846, + "learning_rate": 4.757775993481307e-06, + "loss": 0.1732, + "step": 20560 + }, + { + "epoch": 0.5203077156666751, + "grad_norm": 6.7800397872924805, + "learning_rate": 4.757374943166409e-06, + "loss": 0.1762, + "step": 20561 + }, + { + "epoch": 0.5203330212313688, + "grad_norm": 5.353503227233887, + "learning_rate": 4.756973894416157e-06, + "loss": 0.1483, + "step": 20562 + }, + { + "epoch": 0.5203583267960624, + "grad_norm": 4.775875568389893, + "learning_rate": 4.756572847233134e-06, + "loss": 0.1854, + "step": 20563 + }, + { + "epoch": 0.5203836323607561, + "grad_norm": 9.919696807861328, + "learning_rate": 4.756171801619927e-06, + "loss": 0.1584, + "step": 20564 + }, + { + "epoch": 0.5204089379254498, + "grad_norm": 7.217880725860596, + "learning_rate": 4.755770757579123e-06, + "loss": 0.2357, + "step": 20565 + }, + { + "epoch": 0.5204342434901434, + "grad_norm": 7.322799205780029, + "learning_rate": 4.755369715113309e-06, + "loss": 0.1968, + "step": 20566 + }, + { + "epoch": 0.5204595490548372, + "grad_norm": 4.718873023986816, + "learning_rate": 4.754968674225069e-06, + "loss": 0.1585, + "step": 20567 + }, + { + "epoch": 0.5204848546195309, + "grad_norm": 5.671693325042725, + "learning_rate": 4.7545676349169905e-06, + "loss": 0.0955, + "step": 20568 + }, + { + "epoch": 0.5205101601842245, + "grad_norm": 5.246201992034912, + "learning_rate": 4.75416659719166e-06, + "loss": 0.2194, + "step": 20569 + }, + { + "epoch": 0.5205354657489182, + "grad_norm": 4.31923246383667, + "learning_rate": 4.753765561051664e-06, + "loss": 0.1499, + "step": 20570 + }, + { + "epoch": 0.5205607713136119, + "grad_norm": 6.550994873046875, + "learning_rate": 4.753364526499587e-06, + "loss": 0.2627, + "step": 20571 + }, + { + "epoch": 0.5205860768783055, + "grad_norm": 11.727049827575684, + "learning_rate": 4.752963493538016e-06, + "loss": 0.3621, + "step": 20572 + }, + { + "epoch": 0.5206113824429992, + "grad_norm": 5.068269729614258, + "learning_rate": 4.752562462169539e-06, + "loss": 0.1635, + "step": 20573 + }, + { + "epoch": 0.5206366880076929, + "grad_norm": 4.820422172546387, + "learning_rate": 4.75216143239674e-06, + "loss": 0.1688, + "step": 20574 + }, + { + "epoch": 0.5206619935723865, + "grad_norm": 2.91890811920166, + "learning_rate": 4.751760404222206e-06, + "loss": 0.1611, + "step": 20575 + }, + { + "epoch": 0.5206872991370802, + "grad_norm": 5.3365159034729, + "learning_rate": 4.751359377648523e-06, + "loss": 0.1436, + "step": 20576 + }, + { + "epoch": 0.5207126047017739, + "grad_norm": 6.889984130859375, + "learning_rate": 4.7509583526782765e-06, + "loss": 0.1265, + "step": 20577 + }, + { + "epoch": 0.5207379102664675, + "grad_norm": 5.09314489364624, + "learning_rate": 4.750557329314053e-06, + "loss": 0.1815, + "step": 20578 + }, + { + "epoch": 0.5207632158311613, + "grad_norm": 4.793943881988525, + "learning_rate": 4.75015630755844e-06, + "loss": 0.1587, + "step": 20579 + }, + { + "epoch": 0.520788521395855, + "grad_norm": 4.356750011444092, + "learning_rate": 4.749755287414022e-06, + "loss": 0.1721, + "step": 20580 + }, + { + "epoch": 0.5208138269605487, + "grad_norm": 3.352510929107666, + "learning_rate": 4.7493542688833846e-06, + "loss": 0.1171, + "step": 20581 + }, + { + "epoch": 0.5208391325252423, + "grad_norm": 6.89854621887207, + "learning_rate": 4.7489532519691155e-06, + "loss": 0.2252, + "step": 20582 + }, + { + "epoch": 0.520864438089936, + "grad_norm": 3.6292500495910645, + "learning_rate": 4.7485522366738e-06, + "loss": 0.119, + "step": 20583 + }, + { + "epoch": 0.5208897436546297, + "grad_norm": 5.154742240905762, + "learning_rate": 4.748151223000027e-06, + "loss": 0.1755, + "step": 20584 + }, + { + "epoch": 0.5209150492193233, + "grad_norm": 5.247472763061523, + "learning_rate": 4.747750210950377e-06, + "loss": 0.1478, + "step": 20585 + }, + { + "epoch": 0.520940354784017, + "grad_norm": 4.826099395751953, + "learning_rate": 4.747349200527439e-06, + "loss": 0.2648, + "step": 20586 + }, + { + "epoch": 0.5209656603487107, + "grad_norm": 8.18123722076416, + "learning_rate": 4.746948191733799e-06, + "loss": 0.227, + "step": 20587 + }, + { + "epoch": 0.5209909659134043, + "grad_norm": 4.225269794464111, + "learning_rate": 4.746547184572046e-06, + "loss": 0.1128, + "step": 20588 + }, + { + "epoch": 0.521016271478098, + "grad_norm": 23.326904296875, + "learning_rate": 4.74614617904476e-06, + "loss": 0.2186, + "step": 20589 + }, + { + "epoch": 0.5210415770427917, + "grad_norm": 4.355663776397705, + "learning_rate": 4.74574517515453e-06, + "loss": 0.1875, + "step": 20590 + }, + { + "epoch": 0.5210668826074853, + "grad_norm": 2.7303128242492676, + "learning_rate": 4.745344172903943e-06, + "loss": 0.0901, + "step": 20591 + }, + { + "epoch": 0.5210921881721791, + "grad_norm": 8.671128273010254, + "learning_rate": 4.744943172295584e-06, + "loss": 0.1515, + "step": 20592 + }, + { + "epoch": 0.5211174937368728, + "grad_norm": 2.3621914386749268, + "learning_rate": 4.74454217333204e-06, + "loss": 0.1227, + "step": 20593 + }, + { + "epoch": 0.5211427993015664, + "grad_norm": 4.155125617980957, + "learning_rate": 4.744141176015895e-06, + "loss": 0.1679, + "step": 20594 + }, + { + "epoch": 0.5211681048662601, + "grad_norm": 4.283143043518066, + "learning_rate": 4.743740180349735e-06, + "loss": 0.1249, + "step": 20595 + }, + { + "epoch": 0.5211934104309538, + "grad_norm": 3.04829740524292, + "learning_rate": 4.743339186336147e-06, + "loss": 0.1295, + "step": 20596 + }, + { + "epoch": 0.5212187159956474, + "grad_norm": 4.983888149261475, + "learning_rate": 4.74293819397772e-06, + "loss": 0.2308, + "step": 20597 + }, + { + "epoch": 0.5212440215603411, + "grad_norm": 4.920605182647705, + "learning_rate": 4.742537203277034e-06, + "loss": 0.1956, + "step": 20598 + }, + { + "epoch": 0.5212693271250348, + "grad_norm": 3.7529959678649902, + "learning_rate": 4.742136214236678e-06, + "loss": 0.2245, + "step": 20599 + }, + { + "epoch": 0.5212946326897284, + "grad_norm": 5.6556010246276855, + "learning_rate": 4.741735226859238e-06, + "loss": 0.2105, + "step": 20600 + }, + { + "epoch": 0.5213199382544221, + "grad_norm": 16.16956329345703, + "learning_rate": 4.7413342411473e-06, + "loss": 0.2426, + "step": 20601 + }, + { + "epoch": 0.5213452438191158, + "grad_norm": 4.305744647979736, + "learning_rate": 4.740933257103449e-06, + "loss": 0.0974, + "step": 20602 + }, + { + "epoch": 0.5213705493838094, + "grad_norm": 7.213277339935303, + "learning_rate": 4.740532274730271e-06, + "loss": 0.2552, + "step": 20603 + }, + { + "epoch": 0.5213958549485032, + "grad_norm": 2.715479850769043, + "learning_rate": 4.740131294030351e-06, + "loss": 0.1078, + "step": 20604 + }, + { + "epoch": 0.5214211605131969, + "grad_norm": 4.29456901550293, + "learning_rate": 4.739730315006276e-06, + "loss": 0.1888, + "step": 20605 + }, + { + "epoch": 0.5214464660778906, + "grad_norm": 2.9652421474456787, + "learning_rate": 4.739329337660634e-06, + "loss": 0.0976, + "step": 20606 + }, + { + "epoch": 0.5214717716425842, + "grad_norm": 4.023890018463135, + "learning_rate": 4.738928361996008e-06, + "loss": 0.1708, + "step": 20607 + }, + { + "epoch": 0.5214970772072779, + "grad_norm": 3.431992530822754, + "learning_rate": 4.738527388014983e-06, + "loss": 0.1744, + "step": 20608 + }, + { + "epoch": 0.5215223827719716, + "grad_norm": 10.413883209228516, + "learning_rate": 4.738126415720146e-06, + "loss": 0.3162, + "step": 20609 + }, + { + "epoch": 0.5215476883366652, + "grad_norm": 4.827394485473633, + "learning_rate": 4.7377254451140854e-06, + "loss": 0.1236, + "step": 20610 + }, + { + "epoch": 0.5215729939013589, + "grad_norm": 6.398344039916992, + "learning_rate": 4.737324476199383e-06, + "loss": 0.1727, + "step": 20611 + }, + { + "epoch": 0.5215982994660526, + "grad_norm": 6.258484363555908, + "learning_rate": 4.736923508978627e-06, + "loss": 0.2222, + "step": 20612 + }, + { + "epoch": 0.5216236050307462, + "grad_norm": 4.471828460693359, + "learning_rate": 4.736522543454401e-06, + "loss": 0.2132, + "step": 20613 + }, + { + "epoch": 0.5216489105954399, + "grad_norm": 8.419044494628906, + "learning_rate": 4.736121579629292e-06, + "loss": 0.1852, + "step": 20614 + }, + { + "epoch": 0.5216742161601337, + "grad_norm": 4.743781089782715, + "learning_rate": 4.735720617505889e-06, + "loss": 0.179, + "step": 20615 + }, + { + "epoch": 0.5216995217248273, + "grad_norm": 7.493072986602783, + "learning_rate": 4.735319657086771e-06, + "loss": 0.15, + "step": 20616 + }, + { + "epoch": 0.521724827289521, + "grad_norm": 4.4150261878967285, + "learning_rate": 4.734918698374528e-06, + "loss": 0.1527, + "step": 20617 + }, + { + "epoch": 0.5217501328542147, + "grad_norm": 3.268998861312866, + "learning_rate": 4.734517741371745e-06, + "loss": 0.0912, + "step": 20618 + }, + { + "epoch": 0.5217754384189083, + "grad_norm": 3.909477949142456, + "learning_rate": 4.734116786081011e-06, + "loss": 0.1629, + "step": 20619 + }, + { + "epoch": 0.521800743983602, + "grad_norm": 5.963794708251953, + "learning_rate": 4.733715832504904e-06, + "loss": 0.1778, + "step": 20620 + }, + { + "epoch": 0.5218260495482957, + "grad_norm": 6.309122085571289, + "learning_rate": 4.733314880646016e-06, + "loss": 0.1975, + "step": 20621 + }, + { + "epoch": 0.5218513551129893, + "grad_norm": 8.86166763305664, + "learning_rate": 4.73291393050693e-06, + "loss": 0.1835, + "step": 20622 + }, + { + "epoch": 0.521876660677683, + "grad_norm": 3.94449520111084, + "learning_rate": 4.732512982090232e-06, + "loss": 0.1541, + "step": 20623 + }, + { + "epoch": 0.5219019662423767, + "grad_norm": 3.5997061729431152, + "learning_rate": 4.732112035398511e-06, + "loss": 0.1124, + "step": 20624 + }, + { + "epoch": 0.5219272718070703, + "grad_norm": 6.172653675079346, + "learning_rate": 4.731711090434347e-06, + "loss": 0.1712, + "step": 20625 + }, + { + "epoch": 0.521952577371764, + "grad_norm": 3.396441698074341, + "learning_rate": 4.731310147200328e-06, + "loss": 0.1465, + "step": 20626 + }, + { + "epoch": 0.5219778829364577, + "grad_norm": 9.381160736083984, + "learning_rate": 4.730909205699041e-06, + "loss": 0.2042, + "step": 20627 + }, + { + "epoch": 0.5220031885011513, + "grad_norm": 6.323617935180664, + "learning_rate": 4.730508265933072e-06, + "loss": 0.1355, + "step": 20628 + }, + { + "epoch": 0.5220284940658451, + "grad_norm": 6.408665180206299, + "learning_rate": 4.730107327905002e-06, + "loss": 0.181, + "step": 20629 + }, + { + "epoch": 0.5220537996305388, + "grad_norm": 8.654500961303711, + "learning_rate": 4.72970639161742e-06, + "loss": 0.2161, + "step": 20630 + }, + { + "epoch": 0.5220791051952325, + "grad_norm": 9.033754348754883, + "learning_rate": 4.729305457072913e-06, + "loss": 0.2845, + "step": 20631 + }, + { + "epoch": 0.5221044107599261, + "grad_norm": 7.188329696655273, + "learning_rate": 4.728904524274065e-06, + "loss": 0.1425, + "step": 20632 + }, + { + "epoch": 0.5221297163246198, + "grad_norm": 3.1527721881866455, + "learning_rate": 4.7285035932234595e-06, + "loss": 0.1485, + "step": 20633 + }, + { + "epoch": 0.5221550218893135, + "grad_norm": 5.806906223297119, + "learning_rate": 4.728102663923684e-06, + "loss": 0.1615, + "step": 20634 + }, + { + "epoch": 0.5221803274540071, + "grad_norm": 7.930512428283691, + "learning_rate": 4.727701736377324e-06, + "loss": 0.152, + "step": 20635 + }, + { + "epoch": 0.5222056330187008, + "grad_norm": 3.4130640029907227, + "learning_rate": 4.727300810586965e-06, + "loss": 0.1245, + "step": 20636 + }, + { + "epoch": 0.5222309385833945, + "grad_norm": 5.340314865112305, + "learning_rate": 4.726899886555194e-06, + "loss": 0.2316, + "step": 20637 + }, + { + "epoch": 0.5222562441480881, + "grad_norm": 9.842910766601562, + "learning_rate": 4.726498964284593e-06, + "loss": 0.2704, + "step": 20638 + }, + { + "epoch": 0.5222815497127818, + "grad_norm": 8.317850112915039, + "learning_rate": 4.726098043777749e-06, + "loss": 0.2331, + "step": 20639 + }, + { + "epoch": 0.5223068552774756, + "grad_norm": 5.308751583099365, + "learning_rate": 4.725697125037248e-06, + "loss": 0.1064, + "step": 20640 + }, + { + "epoch": 0.5223321608421692, + "grad_norm": 6.303966045379639, + "learning_rate": 4.725296208065677e-06, + "loss": 0.1701, + "step": 20641 + }, + { + "epoch": 0.5223574664068629, + "grad_norm": 4.418209552764893, + "learning_rate": 4.724895292865618e-06, + "loss": 0.147, + "step": 20642 + }, + { + "epoch": 0.5223827719715566, + "grad_norm": 3.436767339706421, + "learning_rate": 4.724494379439658e-06, + "loss": 0.0906, + "step": 20643 + }, + { + "epoch": 0.5224080775362502, + "grad_norm": 3.9070820808410645, + "learning_rate": 4.724093467790382e-06, + "loss": 0.2339, + "step": 20644 + }, + { + "epoch": 0.5224333831009439, + "grad_norm": 4.8810834884643555, + "learning_rate": 4.723692557920376e-06, + "loss": 0.2188, + "step": 20645 + }, + { + "epoch": 0.5224586886656376, + "grad_norm": 3.8105955123901367, + "learning_rate": 4.723291649832228e-06, + "loss": 0.082, + "step": 20646 + }, + { + "epoch": 0.5224839942303312, + "grad_norm": 4.589325904846191, + "learning_rate": 4.722890743528519e-06, + "loss": 0.1966, + "step": 20647 + }, + { + "epoch": 0.5225092997950249, + "grad_norm": 3.8653135299682617, + "learning_rate": 4.722489839011835e-06, + "loss": 0.1844, + "step": 20648 + }, + { + "epoch": 0.5225346053597186, + "grad_norm": 6.549437046051025, + "learning_rate": 4.722088936284763e-06, + "loss": 0.1681, + "step": 20649 + }, + { + "epoch": 0.5225599109244122, + "grad_norm": 3.076859951019287, + "learning_rate": 4.721688035349889e-06, + "loss": 0.1488, + "step": 20650 + }, + { + "epoch": 0.5225852164891059, + "grad_norm": 5.533616542816162, + "learning_rate": 4.721287136209796e-06, + "loss": 0.2468, + "step": 20651 + }, + { + "epoch": 0.5226105220537997, + "grad_norm": 4.589599132537842, + "learning_rate": 4.72088623886707e-06, + "loss": 0.1754, + "step": 20652 + }, + { + "epoch": 0.5226358276184933, + "grad_norm": 2.5541656017303467, + "learning_rate": 4.7204853433242966e-06, + "loss": 0.1001, + "step": 20653 + }, + { + "epoch": 0.522661133183187, + "grad_norm": 3.1902270317077637, + "learning_rate": 4.720084449584063e-06, + "loss": 0.1532, + "step": 20654 + }, + { + "epoch": 0.5226864387478807, + "grad_norm": 5.858097076416016, + "learning_rate": 4.719683557648951e-06, + "loss": 0.1379, + "step": 20655 + }, + { + "epoch": 0.5227117443125744, + "grad_norm": 2.612903356552124, + "learning_rate": 4.7192826675215485e-06, + "loss": 0.0831, + "step": 20656 + }, + { + "epoch": 0.522737049877268, + "grad_norm": 3.5559308528900146, + "learning_rate": 4.718881779204438e-06, + "loss": 0.1724, + "step": 20657 + }, + { + "epoch": 0.5227623554419617, + "grad_norm": 8.522385597229004, + "learning_rate": 4.718480892700209e-06, + "loss": 0.2572, + "step": 20658 + }, + { + "epoch": 0.5227876610066554, + "grad_norm": 7.203927040100098, + "learning_rate": 4.718080008011444e-06, + "loss": 0.2365, + "step": 20659 + }, + { + "epoch": 0.522812966571349, + "grad_norm": 9.411758422851562, + "learning_rate": 4.717679125140727e-06, + "loss": 0.2763, + "step": 20660 + }, + { + "epoch": 0.5228382721360427, + "grad_norm": 2.676042079925537, + "learning_rate": 4.717278244090647e-06, + "loss": 0.0733, + "step": 20661 + }, + { + "epoch": 0.5228635777007364, + "grad_norm": 5.021960258483887, + "learning_rate": 4.716877364863786e-06, + "loss": 0.196, + "step": 20662 + }, + { + "epoch": 0.52288888326543, + "grad_norm": 2.393440008163452, + "learning_rate": 4.71647648746273e-06, + "loss": 0.1562, + "step": 20663 + }, + { + "epoch": 0.5229141888301237, + "grad_norm": 3.413639783859253, + "learning_rate": 4.716075611890065e-06, + "loss": 0.0944, + "step": 20664 + }, + { + "epoch": 0.5229394943948175, + "grad_norm": 6.090205669403076, + "learning_rate": 4.715674738148375e-06, + "loss": 0.1655, + "step": 20665 + }, + { + "epoch": 0.5229647999595111, + "grad_norm": 5.2391886711120605, + "learning_rate": 4.715273866240245e-06, + "loss": 0.1441, + "step": 20666 + }, + { + "epoch": 0.5229901055242048, + "grad_norm": 2.5803792476654053, + "learning_rate": 4.714872996168261e-06, + "loss": 0.1583, + "step": 20667 + }, + { + "epoch": 0.5230154110888985, + "grad_norm": 5.2439799308776855, + "learning_rate": 4.714472127935011e-06, + "loss": 0.2027, + "step": 20668 + }, + { + "epoch": 0.5230407166535921, + "grad_norm": 7.1851887702941895, + "learning_rate": 4.714071261543074e-06, + "loss": 0.1722, + "step": 20669 + }, + { + "epoch": 0.5230660222182858, + "grad_norm": 7.118623733520508, + "learning_rate": 4.7136703969950385e-06, + "loss": 0.1784, + "step": 20670 + }, + { + "epoch": 0.5230913277829795, + "grad_norm": 4.2296881675720215, + "learning_rate": 4.7132695342934885e-06, + "loss": 0.1458, + "step": 20671 + }, + { + "epoch": 0.5231166333476731, + "grad_norm": 3.2864372730255127, + "learning_rate": 4.712868673441013e-06, + "loss": 0.1423, + "step": 20672 + }, + { + "epoch": 0.5231419389123668, + "grad_norm": 6.135664939880371, + "learning_rate": 4.712467814440192e-06, + "loss": 0.1837, + "step": 20673 + }, + { + "epoch": 0.5231672444770605, + "grad_norm": 3.889291286468506, + "learning_rate": 4.712066957293613e-06, + "loss": 0.1706, + "step": 20674 + }, + { + "epoch": 0.5231925500417541, + "grad_norm": 6.884557723999023, + "learning_rate": 4.711666102003859e-06, + "loss": 0.2173, + "step": 20675 + }, + { + "epoch": 0.5232178556064478, + "grad_norm": 6.896823883056641, + "learning_rate": 4.71126524857352e-06, + "loss": 0.1477, + "step": 20676 + }, + { + "epoch": 0.5232431611711416, + "grad_norm": 9.94596004486084, + "learning_rate": 4.710864397005176e-06, + "loss": 0.199, + "step": 20677 + }, + { + "epoch": 0.5232684667358352, + "grad_norm": 3.239002227783203, + "learning_rate": 4.7104635473014125e-06, + "loss": 0.161, + "step": 20678 + }, + { + "epoch": 0.5232937723005289, + "grad_norm": 5.209101676940918, + "learning_rate": 4.7100626994648164e-06, + "loss": 0.1979, + "step": 20679 + }, + { + "epoch": 0.5233190778652226, + "grad_norm": 5.641599655151367, + "learning_rate": 4.709661853497972e-06, + "loss": 0.1414, + "step": 20680 + }, + { + "epoch": 0.5233443834299163, + "grad_norm": 4.997621536254883, + "learning_rate": 4.709261009403467e-06, + "loss": 0.1657, + "step": 20681 + }, + { + "epoch": 0.5233696889946099, + "grad_norm": 3.1644632816314697, + "learning_rate": 4.708860167183881e-06, + "loss": 0.1224, + "step": 20682 + }, + { + "epoch": 0.5233949945593036, + "grad_norm": 4.74357271194458, + "learning_rate": 4.708459326841801e-06, + "loss": 0.1822, + "step": 20683 + }, + { + "epoch": 0.5234203001239973, + "grad_norm": 4.331175327301025, + "learning_rate": 4.708058488379813e-06, + "loss": 0.209, + "step": 20684 + }, + { + "epoch": 0.5234456056886909, + "grad_norm": 8.191243171691895, + "learning_rate": 4.707657651800504e-06, + "loss": 0.2136, + "step": 20685 + }, + { + "epoch": 0.5234709112533846, + "grad_norm": 6.31181001663208, + "learning_rate": 4.7072568171064544e-06, + "loss": 0.312, + "step": 20686 + }, + { + "epoch": 0.5234962168180783, + "grad_norm": 5.1388421058654785, + "learning_rate": 4.7068559843002505e-06, + "loss": 0.1785, + "step": 20687 + }, + { + "epoch": 0.5235215223827719, + "grad_norm": 5.568802833557129, + "learning_rate": 4.706455153384479e-06, + "loss": 0.2021, + "step": 20688 + }, + { + "epoch": 0.5235468279474657, + "grad_norm": 7.739809989929199, + "learning_rate": 4.706054324361723e-06, + "loss": 0.2142, + "step": 20689 + }, + { + "epoch": 0.5235721335121594, + "grad_norm": 6.264529705047607, + "learning_rate": 4.705653497234569e-06, + "loss": 0.2133, + "step": 20690 + }, + { + "epoch": 0.523597439076853, + "grad_norm": 4.53537654876709, + "learning_rate": 4.705252672005599e-06, + "loss": 0.0559, + "step": 20691 + }, + { + "epoch": 0.5236227446415467, + "grad_norm": 5.764822959899902, + "learning_rate": 4.704851848677401e-06, + "loss": 0.2293, + "step": 20692 + }, + { + "epoch": 0.5236480502062404, + "grad_norm": 5.518498420715332, + "learning_rate": 4.704451027252558e-06, + "loss": 0.2013, + "step": 20693 + }, + { + "epoch": 0.523673355770934, + "grad_norm": 2.853771448135376, + "learning_rate": 4.704050207733657e-06, + "loss": 0.1184, + "step": 20694 + }, + { + "epoch": 0.5236986613356277, + "grad_norm": 3.7313807010650635, + "learning_rate": 4.70364939012328e-06, + "loss": 0.1053, + "step": 20695 + }, + { + "epoch": 0.5237239669003214, + "grad_norm": 4.972527503967285, + "learning_rate": 4.703248574424012e-06, + "loss": 0.1192, + "step": 20696 + }, + { + "epoch": 0.523749272465015, + "grad_norm": 16.192163467407227, + "learning_rate": 4.702847760638439e-06, + "loss": 0.4371, + "step": 20697 + }, + { + "epoch": 0.5237745780297087, + "grad_norm": 3.3664186000823975, + "learning_rate": 4.702446948769146e-06, + "loss": 0.1805, + "step": 20698 + }, + { + "epoch": 0.5237998835944024, + "grad_norm": 3.187156915664673, + "learning_rate": 4.702046138818719e-06, + "loss": 0.1309, + "step": 20699 + }, + { + "epoch": 0.523825189159096, + "grad_norm": 6.844372272491455, + "learning_rate": 4.701645330789739e-06, + "loss": 0.3009, + "step": 20700 + }, + { + "epoch": 0.5238504947237898, + "grad_norm": 10.078116416931152, + "learning_rate": 4.701244524684792e-06, + "loss": 0.2591, + "step": 20701 + }, + { + "epoch": 0.5238758002884835, + "grad_norm": 8.17659854888916, + "learning_rate": 4.700843720506464e-06, + "loss": 0.289, + "step": 20702 + }, + { + "epoch": 0.5239011058531771, + "grad_norm": 3.8780345916748047, + "learning_rate": 4.700442918257341e-06, + "loss": 0.1494, + "step": 20703 + }, + { + "epoch": 0.5239264114178708, + "grad_norm": 8.880912780761719, + "learning_rate": 4.700042117940004e-06, + "loss": 0.2105, + "step": 20704 + }, + { + "epoch": 0.5239517169825645, + "grad_norm": 17.51801872253418, + "learning_rate": 4.6996413195570406e-06, + "loss": 0.3396, + "step": 20705 + }, + { + "epoch": 0.5239770225472581, + "grad_norm": 5.798497200012207, + "learning_rate": 4.699240523111034e-06, + "loss": 0.2203, + "step": 20706 + }, + { + "epoch": 0.5240023281119518, + "grad_norm": 3.2078335285186768, + "learning_rate": 4.6988397286045705e-06, + "loss": 0.1805, + "step": 20707 + }, + { + "epoch": 0.5240276336766455, + "grad_norm": 4.846846103668213, + "learning_rate": 4.698438936040232e-06, + "loss": 0.179, + "step": 20708 + }, + { + "epoch": 0.5240529392413392, + "grad_norm": 4.414836406707764, + "learning_rate": 4.698038145420605e-06, + "loss": 0.1282, + "step": 20709 + }, + { + "epoch": 0.5240782448060328, + "grad_norm": 10.59040355682373, + "learning_rate": 4.697637356748273e-06, + "loss": 0.1425, + "step": 20710 + }, + { + "epoch": 0.5241035503707265, + "grad_norm": 3.9011082649230957, + "learning_rate": 4.697236570025822e-06, + "loss": 0.1163, + "step": 20711 + }, + { + "epoch": 0.5241288559354202, + "grad_norm": 7.965259075164795, + "learning_rate": 4.696835785255838e-06, + "loss": 0.1991, + "step": 20712 + }, + { + "epoch": 0.5241541615001138, + "grad_norm": 3.9270951747894287, + "learning_rate": 4.696435002440902e-06, + "loss": 0.1968, + "step": 20713 + }, + { + "epoch": 0.5241794670648076, + "grad_norm": 2.592294931411743, + "learning_rate": 4.6960342215836e-06, + "loss": 0.0751, + "step": 20714 + }, + { + "epoch": 0.5242047726295013, + "grad_norm": 11.386959075927734, + "learning_rate": 4.695633442686516e-06, + "loss": 0.2971, + "step": 20715 + }, + { + "epoch": 0.5242300781941949, + "grad_norm": 2.9997947216033936, + "learning_rate": 4.695232665752239e-06, + "loss": 0.0873, + "step": 20716 + }, + { + "epoch": 0.5242553837588886, + "grad_norm": 4.007367134094238, + "learning_rate": 4.694831890783346e-06, + "loss": 0.1567, + "step": 20717 + }, + { + "epoch": 0.5242806893235823, + "grad_norm": 4.9442596435546875, + "learning_rate": 4.694431117782427e-06, + "loss": 0.1006, + "step": 20718 + }, + { + "epoch": 0.5243059948882759, + "grad_norm": 4.843877792358398, + "learning_rate": 4.6940303467520645e-06, + "loss": 0.1519, + "step": 20719 + }, + { + "epoch": 0.5243313004529696, + "grad_norm": 6.493904113769531, + "learning_rate": 4.693629577694843e-06, + "loss": 0.2282, + "step": 20720 + }, + { + "epoch": 0.5243566060176633, + "grad_norm": 5.577003002166748, + "learning_rate": 4.693228810613349e-06, + "loss": 0.2029, + "step": 20721 + }, + { + "epoch": 0.5243819115823569, + "grad_norm": 10.0951566696167, + "learning_rate": 4.692828045510164e-06, + "loss": 0.2917, + "step": 20722 + }, + { + "epoch": 0.5244072171470506, + "grad_norm": 5.3112335205078125, + "learning_rate": 4.692427282387873e-06, + "loss": 0.1881, + "step": 20723 + }, + { + "epoch": 0.5244325227117443, + "grad_norm": 6.197563171386719, + "learning_rate": 4.692026521249062e-06, + "loss": 0.1509, + "step": 20724 + }, + { + "epoch": 0.5244578282764379, + "grad_norm": 3.319201946258545, + "learning_rate": 4.691625762096316e-06, + "loss": 0.115, + "step": 20725 + }, + { + "epoch": 0.5244831338411317, + "grad_norm": 1.9000712633132935, + "learning_rate": 4.691225004932217e-06, + "loss": 0.084, + "step": 20726 + }, + { + "epoch": 0.5245084394058254, + "grad_norm": 4.70767068862915, + "learning_rate": 4.69082424975935e-06, + "loss": 0.1595, + "step": 20727 + }, + { + "epoch": 0.524533744970519, + "grad_norm": 5.367177963256836, + "learning_rate": 4.6904234965802994e-06, + "loss": 0.1645, + "step": 20728 + }, + { + "epoch": 0.5245590505352127, + "grad_norm": 3.648651599884033, + "learning_rate": 4.690022745397653e-06, + "loss": 0.1981, + "step": 20729 + }, + { + "epoch": 0.5245843560999064, + "grad_norm": 2.9936859607696533, + "learning_rate": 4.689621996213989e-06, + "loss": 0.1579, + "step": 20730 + }, + { + "epoch": 0.5246096616646, + "grad_norm": 6.909911155700684, + "learning_rate": 4.689221249031896e-06, + "loss": 0.2338, + "step": 20731 + }, + { + "epoch": 0.5246349672292937, + "grad_norm": 4.016647815704346, + "learning_rate": 4.688820503853958e-06, + "loss": 0.1622, + "step": 20732 + }, + { + "epoch": 0.5246602727939874, + "grad_norm": 6.309288024902344, + "learning_rate": 4.688419760682757e-06, + "loss": 0.2679, + "step": 20733 + }, + { + "epoch": 0.5246855783586811, + "grad_norm": 4.234281063079834, + "learning_rate": 4.688019019520882e-06, + "loss": 0.1673, + "step": 20734 + }, + { + "epoch": 0.5247108839233747, + "grad_norm": 4.936830997467041, + "learning_rate": 4.687618280370912e-06, + "loss": 0.1442, + "step": 20735 + }, + { + "epoch": 0.5247361894880684, + "grad_norm": 5.675252437591553, + "learning_rate": 4.687217543235434e-06, + "loss": 0.2052, + "step": 20736 + }, + { + "epoch": 0.5247614950527622, + "grad_norm": 4.074235916137695, + "learning_rate": 4.686816808117032e-06, + "loss": 0.155, + "step": 20737 + }, + { + "epoch": 0.5247868006174558, + "grad_norm": 4.092485427856445, + "learning_rate": 4.686416075018292e-06, + "loss": 0.2152, + "step": 20738 + }, + { + "epoch": 0.5248121061821495, + "grad_norm": 7.457757472991943, + "learning_rate": 4.686015343941794e-06, + "loss": 0.2024, + "step": 20739 + }, + { + "epoch": 0.5248374117468432, + "grad_norm": 3.345377206802368, + "learning_rate": 4.685614614890124e-06, + "loss": 0.1748, + "step": 20740 + }, + { + "epoch": 0.5248627173115368, + "grad_norm": 5.561418056488037, + "learning_rate": 4.685213887865868e-06, + "loss": 0.1123, + "step": 20741 + }, + { + "epoch": 0.5248880228762305, + "grad_norm": 4.7475666999816895, + "learning_rate": 4.6848131628716085e-06, + "loss": 0.1785, + "step": 20742 + }, + { + "epoch": 0.5249133284409242, + "grad_norm": 5.977410793304443, + "learning_rate": 4.684412439909933e-06, + "loss": 0.1639, + "step": 20743 + }, + { + "epoch": 0.5249386340056178, + "grad_norm": 3.724409818649292, + "learning_rate": 4.68401171898342e-06, + "loss": 0.179, + "step": 20744 + }, + { + "epoch": 0.5249639395703115, + "grad_norm": 3.3069207668304443, + "learning_rate": 4.683611000094658e-06, + "loss": 0.1314, + "step": 20745 + }, + { + "epoch": 0.5249892451350052, + "grad_norm": 7.925747394561768, + "learning_rate": 4.68321028324623e-06, + "loss": 0.2276, + "step": 20746 + }, + { + "epoch": 0.5250145506996988, + "grad_norm": 2.8722198009490967, + "learning_rate": 4.68280956844072e-06, + "loss": 0.1599, + "step": 20747 + }, + { + "epoch": 0.5250398562643925, + "grad_norm": 13.214605331420898, + "learning_rate": 4.68240885568071e-06, + "loss": 0.2224, + "step": 20748 + }, + { + "epoch": 0.5250651618290862, + "grad_norm": 3.535903215408325, + "learning_rate": 4.682008144968789e-06, + "loss": 0.1685, + "step": 20749 + }, + { + "epoch": 0.5250904673937798, + "grad_norm": 3.1174728870391846, + "learning_rate": 4.681607436307537e-06, + "loss": 0.119, + "step": 20750 + }, + { + "epoch": 0.5251157729584736, + "grad_norm": 13.504334449768066, + "learning_rate": 4.681206729699539e-06, + "loss": 0.3901, + "step": 20751 + }, + { + "epoch": 0.5251410785231673, + "grad_norm": 5.30370569229126, + "learning_rate": 4.680806025147381e-06, + "loss": 0.1711, + "step": 20752 + }, + { + "epoch": 0.5251663840878609, + "grad_norm": 5.3887434005737305, + "learning_rate": 4.680405322653644e-06, + "loss": 0.174, + "step": 20753 + }, + { + "epoch": 0.5251916896525546, + "grad_norm": 5.0641961097717285, + "learning_rate": 4.680004622220914e-06, + "loss": 0.1834, + "step": 20754 + }, + { + "epoch": 0.5252169952172483, + "grad_norm": 3.323878526687622, + "learning_rate": 4.679603923851775e-06, + "loss": 0.1397, + "step": 20755 + }, + { + "epoch": 0.5252423007819419, + "grad_norm": 3.3259127140045166, + "learning_rate": 4.679203227548811e-06, + "loss": 0.1, + "step": 20756 + }, + { + "epoch": 0.5252676063466356, + "grad_norm": 7.667743682861328, + "learning_rate": 4.678802533314606e-06, + "loss": 0.2126, + "step": 20757 + }, + { + "epoch": 0.5252929119113293, + "grad_norm": 5.9657979011535645, + "learning_rate": 4.678401841151742e-06, + "loss": 0.1735, + "step": 20758 + }, + { + "epoch": 0.525318217476023, + "grad_norm": 5.676675796508789, + "learning_rate": 4.6780011510628045e-06, + "loss": 0.1991, + "step": 20759 + }, + { + "epoch": 0.5253435230407166, + "grad_norm": 2.959268808364868, + "learning_rate": 4.677600463050381e-06, + "loss": 0.1425, + "step": 20760 + }, + { + "epoch": 0.5253688286054103, + "grad_norm": 4.157759666442871, + "learning_rate": 4.67719977711705e-06, + "loss": 0.142, + "step": 20761 + }, + { + "epoch": 0.525394134170104, + "grad_norm": 5.305933475494385, + "learning_rate": 4.6767990932653964e-06, + "loss": 0.2014, + "step": 20762 + }, + { + "epoch": 0.5254194397347977, + "grad_norm": 4.8030242919921875, + "learning_rate": 4.676398411498006e-06, + "loss": 0.1947, + "step": 20763 + }, + { + "epoch": 0.5254447452994914, + "grad_norm": 5.007529258728027, + "learning_rate": 4.675997731817462e-06, + "loss": 0.1809, + "step": 20764 + }, + { + "epoch": 0.5254700508641851, + "grad_norm": 6.1330742835998535, + "learning_rate": 4.67559705422635e-06, + "loss": 0.1586, + "step": 20765 + }, + { + "epoch": 0.5254953564288787, + "grad_norm": 5.595857620239258, + "learning_rate": 4.675196378727251e-06, + "loss": 0.1756, + "step": 20766 + }, + { + "epoch": 0.5255206619935724, + "grad_norm": 4.899330139160156, + "learning_rate": 4.67479570532275e-06, + "loss": 0.1383, + "step": 20767 + }, + { + "epoch": 0.5255459675582661, + "grad_norm": 8.372164726257324, + "learning_rate": 4.674395034015431e-06, + "loss": 0.3022, + "step": 20768 + }, + { + "epoch": 0.5255712731229597, + "grad_norm": 4.405276775360107, + "learning_rate": 4.67399436480788e-06, + "loss": 0.1499, + "step": 20769 + }, + { + "epoch": 0.5255965786876534, + "grad_norm": 5.428197860717773, + "learning_rate": 4.6735936977026764e-06, + "loss": 0.1519, + "step": 20770 + }, + { + "epoch": 0.5256218842523471, + "grad_norm": 7.223422050476074, + "learning_rate": 4.673193032702406e-06, + "loss": 0.2811, + "step": 20771 + }, + { + "epoch": 0.5256471898170407, + "grad_norm": 4.0348734855651855, + "learning_rate": 4.6727923698096535e-06, + "loss": 0.1261, + "step": 20772 + }, + { + "epoch": 0.5256724953817344, + "grad_norm": 4.421204566955566, + "learning_rate": 4.672391709027002e-06, + "loss": 0.187, + "step": 20773 + }, + { + "epoch": 0.5256978009464282, + "grad_norm": 5.047882080078125, + "learning_rate": 4.671991050357037e-06, + "loss": 0.1872, + "step": 20774 + }, + { + "epoch": 0.5257231065111218, + "grad_norm": 8.911249160766602, + "learning_rate": 4.671590393802339e-06, + "loss": 0.2266, + "step": 20775 + }, + { + "epoch": 0.5257484120758155, + "grad_norm": 4.326503753662109, + "learning_rate": 4.6711897393654934e-06, + "loss": 0.1306, + "step": 20776 + }, + { + "epoch": 0.5257737176405092, + "grad_norm": 3.1687583923339844, + "learning_rate": 4.670789087049085e-06, + "loss": 0.0812, + "step": 20777 + }, + { + "epoch": 0.5257990232052028, + "grad_norm": 4.243988037109375, + "learning_rate": 4.670388436855697e-06, + "loss": 0.1248, + "step": 20778 + }, + { + "epoch": 0.5258243287698965, + "grad_norm": 8.12830638885498, + "learning_rate": 4.66998778878791e-06, + "loss": 0.2276, + "step": 20779 + }, + { + "epoch": 0.5258496343345902, + "grad_norm": 4.518120288848877, + "learning_rate": 4.669587142848314e-06, + "loss": 0.1991, + "step": 20780 + }, + { + "epoch": 0.5258749398992838, + "grad_norm": 4.679096221923828, + "learning_rate": 4.6691864990394865e-06, + "loss": 0.1465, + "step": 20781 + }, + { + "epoch": 0.5259002454639775, + "grad_norm": 5.711216449737549, + "learning_rate": 4.668785857364015e-06, + "loss": 0.1575, + "step": 20782 + }, + { + "epoch": 0.5259255510286712, + "grad_norm": 5.513045310974121, + "learning_rate": 4.668385217824482e-06, + "loss": 0.1675, + "step": 20783 + }, + { + "epoch": 0.5259508565933649, + "grad_norm": 6.323573112487793, + "learning_rate": 4.6679845804234695e-06, + "loss": 0.2266, + "step": 20784 + }, + { + "epoch": 0.5259761621580585, + "grad_norm": 2.8939640522003174, + "learning_rate": 4.667583945163563e-06, + "loss": 0.0766, + "step": 20785 + }, + { + "epoch": 0.5260014677227522, + "grad_norm": 6.12214994430542, + "learning_rate": 4.667183312047346e-06, + "loss": 0.183, + "step": 20786 + }, + { + "epoch": 0.526026773287446, + "grad_norm": 13.67129898071289, + "learning_rate": 4.6667826810774045e-06, + "loss": 0.2683, + "step": 20787 + }, + { + "epoch": 0.5260520788521396, + "grad_norm": 3.857184410095215, + "learning_rate": 4.666382052256316e-06, + "loss": 0.148, + "step": 20788 + }, + { + "epoch": 0.5260773844168333, + "grad_norm": 12.22836971282959, + "learning_rate": 4.665981425586668e-06, + "loss": 0.2192, + "step": 20789 + }, + { + "epoch": 0.526102689981527, + "grad_norm": 7.852969169616699, + "learning_rate": 4.665580801071045e-06, + "loss": 0.1468, + "step": 20790 + }, + { + "epoch": 0.5261279955462206, + "grad_norm": 14.540464401245117, + "learning_rate": 4.66518017871203e-06, + "loss": 0.1881, + "step": 20791 + }, + { + "epoch": 0.5261533011109143, + "grad_norm": 4.352280616760254, + "learning_rate": 4.664779558512204e-06, + "loss": 0.1962, + "step": 20792 + }, + { + "epoch": 0.526178606675608, + "grad_norm": 7.600583076477051, + "learning_rate": 4.664378940474152e-06, + "loss": 0.2052, + "step": 20793 + }, + { + "epoch": 0.5262039122403016, + "grad_norm": 4.299858093261719, + "learning_rate": 4.6639783246004586e-06, + "loss": 0.1739, + "step": 20794 + }, + { + "epoch": 0.5262292178049953, + "grad_norm": 4.94132137298584, + "learning_rate": 4.663577710893706e-06, + "loss": 0.1851, + "step": 20795 + }, + { + "epoch": 0.526254523369689, + "grad_norm": 4.640835762023926, + "learning_rate": 4.6631770993564795e-06, + "loss": 0.2011, + "step": 20796 + }, + { + "epoch": 0.5262798289343826, + "grad_norm": 4.026935577392578, + "learning_rate": 4.66277648999136e-06, + "loss": 0.154, + "step": 20797 + }, + { + "epoch": 0.5263051344990763, + "grad_norm": 10.648066520690918, + "learning_rate": 4.6623758828009315e-06, + "loss": 0.2534, + "step": 20798 + }, + { + "epoch": 0.52633044006377, + "grad_norm": 4.645899772644043, + "learning_rate": 4.661975277787779e-06, + "loss": 0.2401, + "step": 20799 + }, + { + "epoch": 0.5263557456284637, + "grad_norm": 7.105307579040527, + "learning_rate": 4.6615746749544865e-06, + "loss": 0.2221, + "step": 20800 + }, + { + "epoch": 0.5263810511931574, + "grad_norm": 12.567434310913086, + "learning_rate": 4.661174074303634e-06, + "loss": 0.2509, + "step": 20801 + }, + { + "epoch": 0.5264063567578511, + "grad_norm": 6.513339996337891, + "learning_rate": 4.660773475837808e-06, + "loss": 0.2403, + "step": 20802 + }, + { + "epoch": 0.5264316623225447, + "grad_norm": 7.469949245452881, + "learning_rate": 4.66037287955959e-06, + "loss": 0.1708, + "step": 20803 + }, + { + "epoch": 0.5264569678872384, + "grad_norm": 4.685953617095947, + "learning_rate": 4.659972285471565e-06, + "loss": 0.1627, + "step": 20804 + }, + { + "epoch": 0.5264822734519321, + "grad_norm": 5.625904560089111, + "learning_rate": 4.659571693576314e-06, + "loss": 0.1921, + "step": 20805 + }, + { + "epoch": 0.5265075790166257, + "grad_norm": 6.0966033935546875, + "learning_rate": 4.6591711038764226e-06, + "loss": 0.1642, + "step": 20806 + }, + { + "epoch": 0.5265328845813194, + "grad_norm": 3.8602025508880615, + "learning_rate": 4.658770516374474e-06, + "loss": 0.088, + "step": 20807 + }, + { + "epoch": 0.5265581901460131, + "grad_norm": 5.13859748840332, + "learning_rate": 4.65836993107305e-06, + "loss": 0.1328, + "step": 20808 + }, + { + "epoch": 0.5265834957107068, + "grad_norm": 5.042868137359619, + "learning_rate": 4.657969347974736e-06, + "loss": 0.1555, + "step": 20809 + }, + { + "epoch": 0.5266088012754004, + "grad_norm": 5.421332836151123, + "learning_rate": 4.657568767082113e-06, + "loss": 0.1541, + "step": 20810 + }, + { + "epoch": 0.5266341068400942, + "grad_norm": 2.834264039993286, + "learning_rate": 4.657168188397765e-06, + "loss": 0.1339, + "step": 20811 + }, + { + "epoch": 0.5266594124047879, + "grad_norm": 7.126827716827393, + "learning_rate": 4.656767611924275e-06, + "loss": 0.1657, + "step": 20812 + }, + { + "epoch": 0.5266847179694815, + "grad_norm": 5.759823322296143, + "learning_rate": 4.656367037664229e-06, + "loss": 0.2753, + "step": 20813 + }, + { + "epoch": 0.5267100235341752, + "grad_norm": 4.440873146057129, + "learning_rate": 4.6559664656202075e-06, + "loss": 0.2158, + "step": 20814 + }, + { + "epoch": 0.5267353290988689, + "grad_norm": 6.800079822540283, + "learning_rate": 4.655565895794793e-06, + "loss": 0.1642, + "step": 20815 + }, + { + "epoch": 0.5267606346635625, + "grad_norm": 3.923002243041992, + "learning_rate": 4.65516532819057e-06, + "loss": 0.1655, + "step": 20816 + }, + { + "epoch": 0.5267859402282562, + "grad_norm": 4.780798435211182, + "learning_rate": 4.654764762810122e-06, + "loss": 0.1252, + "step": 20817 + }, + { + "epoch": 0.5268112457929499, + "grad_norm": 6.64735221862793, + "learning_rate": 4.654364199656034e-06, + "loss": 0.2302, + "step": 20818 + }, + { + "epoch": 0.5268365513576435, + "grad_norm": 4.179497718811035, + "learning_rate": 4.653963638730885e-06, + "loss": 0.153, + "step": 20819 + }, + { + "epoch": 0.5268618569223372, + "grad_norm": 6.969608306884766, + "learning_rate": 4.65356308003726e-06, + "loss": 0.3175, + "step": 20820 + }, + { + "epoch": 0.5268871624870309, + "grad_norm": 3.5350658893585205, + "learning_rate": 4.653162523577741e-06, + "loss": 0.1482, + "step": 20821 + }, + { + "epoch": 0.5269124680517245, + "grad_norm": 5.2310662269592285, + "learning_rate": 4.652761969354916e-06, + "loss": 0.158, + "step": 20822 + }, + { + "epoch": 0.5269377736164182, + "grad_norm": 2.660806894302368, + "learning_rate": 4.652361417371362e-06, + "loss": 0.115, + "step": 20823 + }, + { + "epoch": 0.526963079181112, + "grad_norm": 7.139286994934082, + "learning_rate": 4.651960867629664e-06, + "loss": 0.1877, + "step": 20824 + }, + { + "epoch": 0.5269883847458056, + "grad_norm": 3.4067342281341553, + "learning_rate": 4.651560320132405e-06, + "loss": 0.0883, + "step": 20825 + }, + { + "epoch": 0.5270136903104993, + "grad_norm": 4.770041465759277, + "learning_rate": 4.65115977488217e-06, + "loss": 0.1364, + "step": 20826 + }, + { + "epoch": 0.527038995875193, + "grad_norm": 3.344743490219116, + "learning_rate": 4.650759231881543e-06, + "loss": 0.1, + "step": 20827 + }, + { + "epoch": 0.5270643014398866, + "grad_norm": 7.038992881774902, + "learning_rate": 4.6503586911331025e-06, + "loss": 0.2141, + "step": 20828 + }, + { + "epoch": 0.5270896070045803, + "grad_norm": 7.44308614730835, + "learning_rate": 4.649958152639433e-06, + "loss": 0.2745, + "step": 20829 + }, + { + "epoch": 0.527114912569274, + "grad_norm": 5.472863674163818, + "learning_rate": 4.649557616403119e-06, + "loss": 0.1077, + "step": 20830 + }, + { + "epoch": 0.5271402181339676, + "grad_norm": 8.057271957397461, + "learning_rate": 4.649157082426745e-06, + "loss": 0.3162, + "step": 20831 + }, + { + "epoch": 0.5271655236986613, + "grad_norm": 4.520151138305664, + "learning_rate": 4.648756550712889e-06, + "loss": 0.153, + "step": 20832 + }, + { + "epoch": 0.527190829263355, + "grad_norm": 9.507433891296387, + "learning_rate": 4.648356021264137e-06, + "loss": 0.2814, + "step": 20833 + }, + { + "epoch": 0.5272161348280486, + "grad_norm": 14.130122184753418, + "learning_rate": 4.647955494083072e-06, + "loss": 0.2227, + "step": 20834 + }, + { + "epoch": 0.5272414403927423, + "grad_norm": 3.6236624717712402, + "learning_rate": 4.647554969172277e-06, + "loss": 0.1557, + "step": 20835 + }, + { + "epoch": 0.5272667459574361, + "grad_norm": 4.946499824523926, + "learning_rate": 4.647154446534333e-06, + "loss": 0.1569, + "step": 20836 + }, + { + "epoch": 0.5272920515221298, + "grad_norm": 3.0955088138580322, + "learning_rate": 4.646753926171825e-06, + "loss": 0.1224, + "step": 20837 + }, + { + "epoch": 0.5273173570868234, + "grad_norm": 5.364346981048584, + "learning_rate": 4.6463534080873355e-06, + "loss": 0.1849, + "step": 20838 + }, + { + "epoch": 0.5273426626515171, + "grad_norm": 5.948974609375, + "learning_rate": 4.645952892283447e-06, + "loss": 0.1632, + "step": 20839 + }, + { + "epoch": 0.5273679682162108, + "grad_norm": 6.863123893737793, + "learning_rate": 4.645552378762743e-06, + "loss": 0.2318, + "step": 20840 + }, + { + "epoch": 0.5273932737809044, + "grad_norm": 4.9893341064453125, + "learning_rate": 4.645151867527806e-06, + "loss": 0.2415, + "step": 20841 + }, + { + "epoch": 0.5274185793455981, + "grad_norm": 3.6027162075042725, + "learning_rate": 4.644751358581217e-06, + "loss": 0.191, + "step": 20842 + }, + { + "epoch": 0.5274438849102918, + "grad_norm": 10.707375526428223, + "learning_rate": 4.644350851925561e-06, + "loss": 0.206, + "step": 20843 + }, + { + "epoch": 0.5274691904749854, + "grad_norm": 4.162567138671875, + "learning_rate": 4.643950347563422e-06, + "loss": 0.0977, + "step": 20844 + }, + { + "epoch": 0.5274944960396791, + "grad_norm": 7.69300651550293, + "learning_rate": 4.6435498454973786e-06, + "loss": 0.1862, + "step": 20845 + }, + { + "epoch": 0.5275198016043728, + "grad_norm": 4.19931697845459, + "learning_rate": 4.6431493457300164e-06, + "loss": 0.187, + "step": 20846 + }, + { + "epoch": 0.5275451071690664, + "grad_norm": 7.753298282623291, + "learning_rate": 4.6427488482639175e-06, + "loss": 0.219, + "step": 20847 + }, + { + "epoch": 0.5275704127337602, + "grad_norm": 8.2681245803833, + "learning_rate": 4.642348353101665e-06, + "loss": 0.1884, + "step": 20848 + }, + { + "epoch": 0.5275957182984539, + "grad_norm": 5.545825481414795, + "learning_rate": 4.641947860245844e-06, + "loss": 0.1343, + "step": 20849 + }, + { + "epoch": 0.5276210238631475, + "grad_norm": 3.5874619483947754, + "learning_rate": 4.6415473696990314e-06, + "loss": 0.1256, + "step": 20850 + }, + { + "epoch": 0.5276463294278412, + "grad_norm": 11.60482406616211, + "learning_rate": 4.641146881463814e-06, + "loss": 0.2, + "step": 20851 + }, + { + "epoch": 0.5276716349925349, + "grad_norm": 7.692669868469238, + "learning_rate": 4.640746395542774e-06, + "loss": 0.2324, + "step": 20852 + }, + { + "epoch": 0.5276969405572285, + "grad_norm": 7.90623140335083, + "learning_rate": 4.640345911938494e-06, + "loss": 0.1971, + "step": 20853 + }, + { + "epoch": 0.5277222461219222, + "grad_norm": 3.2290968894958496, + "learning_rate": 4.6399454306535555e-06, + "loss": 0.1097, + "step": 20854 + }, + { + "epoch": 0.5277475516866159, + "grad_norm": 5.24573278427124, + "learning_rate": 4.639544951690541e-06, + "loss": 0.1833, + "step": 20855 + }, + { + "epoch": 0.5277728572513095, + "grad_norm": 5.604578495025635, + "learning_rate": 4.639144475052036e-06, + "loss": 0.2088, + "step": 20856 + }, + { + "epoch": 0.5277981628160032, + "grad_norm": 5.984229564666748, + "learning_rate": 4.638744000740622e-06, + "loss": 0.1812, + "step": 20857 + }, + { + "epoch": 0.5278234683806969, + "grad_norm": 4.323091506958008, + "learning_rate": 4.6383435287588785e-06, + "loss": 0.1121, + "step": 20858 + }, + { + "epoch": 0.5278487739453905, + "grad_norm": 5.947713375091553, + "learning_rate": 4.637943059109391e-06, + "loss": 0.2035, + "step": 20859 + }, + { + "epoch": 0.5278740795100842, + "grad_norm": 13.18712329864502, + "learning_rate": 4.637542591794741e-06, + "loss": 0.181, + "step": 20860 + }, + { + "epoch": 0.527899385074778, + "grad_norm": 4.514995574951172, + "learning_rate": 4.63714212681751e-06, + "loss": 0.0925, + "step": 20861 + }, + { + "epoch": 0.5279246906394717, + "grad_norm": 8.203369140625, + "learning_rate": 4.636741664180286e-06, + "loss": 0.1666, + "step": 20862 + }, + { + "epoch": 0.5279499962041653, + "grad_norm": 5.313464641571045, + "learning_rate": 4.6363412038856446e-06, + "loss": 0.168, + "step": 20863 + }, + { + "epoch": 0.527975301768859, + "grad_norm": 4.261733055114746, + "learning_rate": 4.635940745936171e-06, + "loss": 0.1685, + "step": 20864 + }, + { + "epoch": 0.5280006073335527, + "grad_norm": 4.279792308807373, + "learning_rate": 4.6355402903344494e-06, + "loss": 0.1654, + "step": 20865 + }, + { + "epoch": 0.5280259128982463, + "grad_norm": 3.2106635570526123, + "learning_rate": 4.63513983708306e-06, + "loss": 0.0906, + "step": 20866 + }, + { + "epoch": 0.52805121846294, + "grad_norm": 5.0783281326293945, + "learning_rate": 4.634739386184585e-06, + "loss": 0.1827, + "step": 20867 + }, + { + "epoch": 0.5280765240276337, + "grad_norm": 6.887246608734131, + "learning_rate": 4.63433893764161e-06, + "loss": 0.1167, + "step": 20868 + }, + { + "epoch": 0.5281018295923273, + "grad_norm": 3.8280537128448486, + "learning_rate": 4.633938491456712e-06, + "loss": 0.1205, + "step": 20869 + }, + { + "epoch": 0.528127135157021, + "grad_norm": 3.7778618335723877, + "learning_rate": 4.633538047632478e-06, + "loss": 0.214, + "step": 20870 + }, + { + "epoch": 0.5281524407217147, + "grad_norm": 3.4930312633514404, + "learning_rate": 4.633137606171491e-06, + "loss": 0.1833, + "step": 20871 + }, + { + "epoch": 0.5281777462864083, + "grad_norm": 8.763701438903809, + "learning_rate": 4.632737167076329e-06, + "loss": 0.1885, + "step": 20872 + }, + { + "epoch": 0.5282030518511021, + "grad_norm": 3.0058646202087402, + "learning_rate": 4.632336730349576e-06, + "loss": 0.128, + "step": 20873 + }, + { + "epoch": 0.5282283574157958, + "grad_norm": 21.60363006591797, + "learning_rate": 4.631936295993817e-06, + "loss": 0.3728, + "step": 20874 + }, + { + "epoch": 0.5282536629804894, + "grad_norm": 3.3698630332946777, + "learning_rate": 4.631535864011632e-06, + "loss": 0.1208, + "step": 20875 + }, + { + "epoch": 0.5282789685451831, + "grad_norm": 6.072044849395752, + "learning_rate": 4.6311354344056034e-06, + "loss": 0.2036, + "step": 20876 + }, + { + "epoch": 0.5283042741098768, + "grad_norm": 7.1666669845581055, + "learning_rate": 4.630735007178313e-06, + "loss": 0.2296, + "step": 20877 + }, + { + "epoch": 0.5283295796745704, + "grad_norm": 4.926214694976807, + "learning_rate": 4.630334582332343e-06, + "loss": 0.2172, + "step": 20878 + }, + { + "epoch": 0.5283548852392641, + "grad_norm": 5.987180709838867, + "learning_rate": 4.629934159870278e-06, + "loss": 0.1038, + "step": 20879 + }, + { + "epoch": 0.5283801908039578, + "grad_norm": 7.851877212524414, + "learning_rate": 4.629533739794701e-06, + "loss": 0.3053, + "step": 20880 + }, + { + "epoch": 0.5284054963686514, + "grad_norm": 3.3051679134368896, + "learning_rate": 4.629133322108189e-06, + "loss": 0.1046, + "step": 20881 + }, + { + "epoch": 0.5284308019333451, + "grad_norm": 8.922524452209473, + "learning_rate": 4.628732906813328e-06, + "loss": 0.2279, + "step": 20882 + }, + { + "epoch": 0.5284561074980388, + "grad_norm": 2.7135982513427734, + "learning_rate": 4.628332493912699e-06, + "loss": 0.1234, + "step": 20883 + }, + { + "epoch": 0.5284814130627324, + "grad_norm": 3.323458194732666, + "learning_rate": 4.627932083408886e-06, + "loss": 0.1495, + "step": 20884 + }, + { + "epoch": 0.5285067186274262, + "grad_norm": 12.42385196685791, + "learning_rate": 4.6275316753044685e-06, + "loss": 0.2723, + "step": 20885 + }, + { + "epoch": 0.5285320241921199, + "grad_norm": 5.076902866363525, + "learning_rate": 4.6271312696020295e-06, + "loss": 0.1645, + "step": 20886 + }, + { + "epoch": 0.5285573297568136, + "grad_norm": 3.0322835445404053, + "learning_rate": 4.626730866304152e-06, + "loss": 0.128, + "step": 20887 + }, + { + "epoch": 0.5285826353215072, + "grad_norm": 7.439761638641357, + "learning_rate": 4.62633046541342e-06, + "loss": 0.2612, + "step": 20888 + }, + { + "epoch": 0.5286079408862009, + "grad_norm": 2.9256300926208496, + "learning_rate": 4.62593006693241e-06, + "loss": 0.1684, + "step": 20889 + }, + { + "epoch": 0.5286332464508946, + "grad_norm": 3.7765960693359375, + "learning_rate": 4.625529670863709e-06, + "loss": 0.1344, + "step": 20890 + }, + { + "epoch": 0.5286585520155882, + "grad_norm": 6.091593265533447, + "learning_rate": 4.625129277209896e-06, + "loss": 0.1981, + "step": 20891 + }, + { + "epoch": 0.5286838575802819, + "grad_norm": 11.528979301452637, + "learning_rate": 4.624728885973556e-06, + "loss": 0.1526, + "step": 20892 + }, + { + "epoch": 0.5287091631449756, + "grad_norm": 10.861370086669922, + "learning_rate": 4.624328497157271e-06, + "loss": 0.2512, + "step": 20893 + }, + { + "epoch": 0.5287344687096692, + "grad_norm": 4.394033432006836, + "learning_rate": 4.623928110763619e-06, + "loss": 0.1492, + "step": 20894 + }, + { + "epoch": 0.5287597742743629, + "grad_norm": 3.3354439735412598, + "learning_rate": 4.623527726795187e-06, + "loss": 0.1375, + "step": 20895 + }, + { + "epoch": 0.5287850798390566, + "grad_norm": 3.368189573287964, + "learning_rate": 4.623127345254552e-06, + "loss": 0.196, + "step": 20896 + }, + { + "epoch": 0.5288103854037502, + "grad_norm": 3.6045081615448, + "learning_rate": 4.622726966144301e-06, + "loss": 0.1383, + "step": 20897 + }, + { + "epoch": 0.528835690968444, + "grad_norm": 6.4213547706604, + "learning_rate": 4.622326589467012e-06, + "loss": 0.2339, + "step": 20898 + }, + { + "epoch": 0.5288609965331377, + "grad_norm": 4.896344184875488, + "learning_rate": 4.62192621522527e-06, + "loss": 0.1815, + "step": 20899 + }, + { + "epoch": 0.5288863020978313, + "grad_norm": 4.842330455780029, + "learning_rate": 4.621525843421654e-06, + "loss": 0.1887, + "step": 20900 + }, + { + "epoch": 0.528911607662525, + "grad_norm": 2.9653234481811523, + "learning_rate": 4.621125474058747e-06, + "loss": 0.0937, + "step": 20901 + }, + { + "epoch": 0.5289369132272187, + "grad_norm": 3.5326180458068848, + "learning_rate": 4.6207251071391355e-06, + "loss": 0.1177, + "step": 20902 + }, + { + "epoch": 0.5289622187919123, + "grad_norm": 8.16220760345459, + "learning_rate": 4.620324742665393e-06, + "loss": 0.2028, + "step": 20903 + }, + { + "epoch": 0.528987524356606, + "grad_norm": 5.885438919067383, + "learning_rate": 4.6199243806401065e-06, + "loss": 0.1965, + "step": 20904 + }, + { + "epoch": 0.5290128299212997, + "grad_norm": 4.076982021331787, + "learning_rate": 4.619524021065856e-06, + "loss": 0.1898, + "step": 20905 + }, + { + "epoch": 0.5290381354859933, + "grad_norm": 3.613551616668701, + "learning_rate": 4.619123663945228e-06, + "loss": 0.1463, + "step": 20906 + }, + { + "epoch": 0.529063441050687, + "grad_norm": 16.14556121826172, + "learning_rate": 4.618723309280798e-06, + "loss": 0.1572, + "step": 20907 + }, + { + "epoch": 0.5290887466153807, + "grad_norm": 6.972332954406738, + "learning_rate": 4.61832295707515e-06, + "loss": 0.1918, + "step": 20908 + }, + { + "epoch": 0.5291140521800743, + "grad_norm": 3.897416591644287, + "learning_rate": 4.617922607330866e-06, + "loss": 0.166, + "step": 20909 + }, + { + "epoch": 0.5291393577447681, + "grad_norm": 8.94170093536377, + "learning_rate": 4.617522260050531e-06, + "loss": 0.2199, + "step": 20910 + }, + { + "epoch": 0.5291646633094618, + "grad_norm": 3.051708459854126, + "learning_rate": 4.617121915236721e-06, + "loss": 0.087, + "step": 20911 + }, + { + "epoch": 0.5291899688741555, + "grad_norm": 5.659052848815918, + "learning_rate": 4.6167215728920215e-06, + "loss": 0.1338, + "step": 20912 + }, + { + "epoch": 0.5292152744388491, + "grad_norm": 26.0933780670166, + "learning_rate": 4.616321233019013e-06, + "loss": 0.1671, + "step": 20913 + }, + { + "epoch": 0.5292405800035428, + "grad_norm": 3.240212917327881, + "learning_rate": 4.6159208956202765e-06, + "loss": 0.1145, + "step": 20914 + }, + { + "epoch": 0.5292658855682365, + "grad_norm": 5.06203031539917, + "learning_rate": 4.6155205606983975e-06, + "loss": 0.1181, + "step": 20915 + }, + { + "epoch": 0.5292911911329301, + "grad_norm": 3.657919406890869, + "learning_rate": 4.615120228255952e-06, + "loss": 0.1681, + "step": 20916 + }, + { + "epoch": 0.5293164966976238, + "grad_norm": 7.069636821746826, + "learning_rate": 4.6147198982955264e-06, + "loss": 0.1442, + "step": 20917 + }, + { + "epoch": 0.5293418022623175, + "grad_norm": 2.717514753341675, + "learning_rate": 4.614319570819699e-06, + "loss": 0.1691, + "step": 20918 + }, + { + "epoch": 0.5293671078270111, + "grad_norm": 3.2637181282043457, + "learning_rate": 4.613919245831056e-06, + "loss": 0.1087, + "step": 20919 + }, + { + "epoch": 0.5293924133917048, + "grad_norm": 3.900336980819702, + "learning_rate": 4.613518923332173e-06, + "loss": 0.1801, + "step": 20920 + }, + { + "epoch": 0.5294177189563986, + "grad_norm": 6.965845584869385, + "learning_rate": 4.613118603325635e-06, + "loss": 0.0697, + "step": 20921 + }, + { + "epoch": 0.5294430245210922, + "grad_norm": 5.033802509307861, + "learning_rate": 4.612718285814024e-06, + "loss": 0.1313, + "step": 20922 + }, + { + "epoch": 0.5294683300857859, + "grad_norm": 2.968829393386841, + "learning_rate": 4.61231797079992e-06, + "loss": 0.1442, + "step": 20923 + }, + { + "epoch": 0.5294936356504796, + "grad_norm": 6.4281005859375, + "learning_rate": 4.611917658285906e-06, + "loss": 0.2141, + "step": 20924 + }, + { + "epoch": 0.5295189412151732, + "grad_norm": 4.4339189529418945, + "learning_rate": 4.611517348274562e-06, + "loss": 0.1694, + "step": 20925 + }, + { + "epoch": 0.5295442467798669, + "grad_norm": 12.041940689086914, + "learning_rate": 4.61111704076847e-06, + "loss": 0.2446, + "step": 20926 + }, + { + "epoch": 0.5295695523445606, + "grad_norm": 3.573498010635376, + "learning_rate": 4.610716735770213e-06, + "loss": 0.1455, + "step": 20927 + }, + { + "epoch": 0.5295948579092542, + "grad_norm": 5.47130012512207, + "learning_rate": 4.6103164332823715e-06, + "loss": 0.1475, + "step": 20928 + }, + { + "epoch": 0.5296201634739479, + "grad_norm": 7.442149639129639, + "learning_rate": 4.609916133307527e-06, + "loss": 0.1879, + "step": 20929 + }, + { + "epoch": 0.5296454690386416, + "grad_norm": 3.7644596099853516, + "learning_rate": 4.609515835848259e-06, + "loss": 0.1653, + "step": 20930 + }, + { + "epoch": 0.5296707746033352, + "grad_norm": 6.554549694061279, + "learning_rate": 4.609115540907152e-06, + "loss": 0.1891, + "step": 20931 + }, + { + "epoch": 0.5296960801680289, + "grad_norm": 4.189568996429443, + "learning_rate": 4.608715248486787e-06, + "loss": 0.1393, + "step": 20932 + }, + { + "epoch": 0.5297213857327226, + "grad_norm": 8.531988143920898, + "learning_rate": 4.608314958589743e-06, + "loss": 0.185, + "step": 20933 + }, + { + "epoch": 0.5297466912974163, + "grad_norm": 2.940866231918335, + "learning_rate": 4.607914671218604e-06, + "loss": 0.1582, + "step": 20934 + }, + { + "epoch": 0.52977199686211, + "grad_norm": 3.9675581455230713, + "learning_rate": 4.6075143863759495e-06, + "loss": 0.1776, + "step": 20935 + }, + { + "epoch": 0.5297973024268037, + "grad_norm": 2.652435064315796, + "learning_rate": 4.607114104064362e-06, + "loss": 0.1107, + "step": 20936 + }, + { + "epoch": 0.5298226079914974, + "grad_norm": 2.8057680130004883, + "learning_rate": 4.606713824286424e-06, + "loss": 0.1178, + "step": 20937 + }, + { + "epoch": 0.529847913556191, + "grad_norm": 3.23093843460083, + "learning_rate": 4.6063135470447136e-06, + "loss": 0.1149, + "step": 20938 + }, + { + "epoch": 0.5298732191208847, + "grad_norm": 5.850590229034424, + "learning_rate": 4.6059132723418135e-06, + "loss": 0.1913, + "step": 20939 + }, + { + "epoch": 0.5298985246855784, + "grad_norm": 6.438169956207275, + "learning_rate": 4.605513000180305e-06, + "loss": 0.2518, + "step": 20940 + }, + { + "epoch": 0.529923830250272, + "grad_norm": 3.2396061420440674, + "learning_rate": 4.605112730562773e-06, + "loss": 0.109, + "step": 20941 + }, + { + "epoch": 0.5299491358149657, + "grad_norm": 4.493236541748047, + "learning_rate": 4.604712463491794e-06, + "loss": 0.1323, + "step": 20942 + }, + { + "epoch": 0.5299744413796594, + "grad_norm": 5.832598686218262, + "learning_rate": 4.60431219896995e-06, + "loss": 0.2818, + "step": 20943 + }, + { + "epoch": 0.529999746944353, + "grad_norm": 4.156365871429443, + "learning_rate": 4.603911936999823e-06, + "loss": 0.1, + "step": 20944 + }, + { + "epoch": 0.5300250525090467, + "grad_norm": 6.151748180389404, + "learning_rate": 4.603511677583994e-06, + "loss": 0.173, + "step": 20945 + }, + { + "epoch": 0.5300503580737405, + "grad_norm": 5.123058795928955, + "learning_rate": 4.603111420725046e-06, + "loss": 0.1914, + "step": 20946 + }, + { + "epoch": 0.5300756636384341, + "grad_norm": 7.305926322937012, + "learning_rate": 4.602711166425558e-06, + "loss": 0.1645, + "step": 20947 + }, + { + "epoch": 0.5301009692031278, + "grad_norm": 5.548770904541016, + "learning_rate": 4.6023109146881105e-06, + "loss": 0.2278, + "step": 20948 + }, + { + "epoch": 0.5301262747678215, + "grad_norm": 10.789438247680664, + "learning_rate": 4.6019106655152865e-06, + "loss": 0.2076, + "step": 20949 + }, + { + "epoch": 0.5301515803325151, + "grad_norm": 3.7510955333709717, + "learning_rate": 4.601510418909669e-06, + "loss": 0.1527, + "step": 20950 + }, + { + "epoch": 0.5301768858972088, + "grad_norm": 4.7273664474487305, + "learning_rate": 4.601110174873834e-06, + "loss": 0.1869, + "step": 20951 + }, + { + "epoch": 0.5302021914619025, + "grad_norm": 8.191963195800781, + "learning_rate": 4.600709933410366e-06, + "loss": 0.1373, + "step": 20952 + }, + { + "epoch": 0.5302274970265961, + "grad_norm": 3.301427125930786, + "learning_rate": 4.600309694521845e-06, + "loss": 0.0928, + "step": 20953 + }, + { + "epoch": 0.5302528025912898, + "grad_norm": 8.009320259094238, + "learning_rate": 4.599909458210853e-06, + "loss": 0.1677, + "step": 20954 + }, + { + "epoch": 0.5302781081559835, + "grad_norm": 6.371405124664307, + "learning_rate": 4.5995092244799706e-06, + "loss": 0.1402, + "step": 20955 + }, + { + "epoch": 0.5303034137206771, + "grad_norm": 6.872764587402344, + "learning_rate": 4.599108993331779e-06, + "loss": 0.2508, + "step": 20956 + }, + { + "epoch": 0.5303287192853708, + "grad_norm": 4.61428689956665, + "learning_rate": 4.5987087647688575e-06, + "loss": 0.2146, + "step": 20957 + }, + { + "epoch": 0.5303540248500646, + "grad_norm": 9.705631256103516, + "learning_rate": 4.598308538793788e-06, + "loss": 0.2504, + "step": 20958 + }, + { + "epoch": 0.5303793304147582, + "grad_norm": 10.02273178100586, + "learning_rate": 4.5979083154091544e-06, + "loss": 0.1565, + "step": 20959 + }, + { + "epoch": 0.5304046359794519, + "grad_norm": 3.612034320831299, + "learning_rate": 4.597508094617536e-06, + "loss": 0.1442, + "step": 20960 + }, + { + "epoch": 0.5304299415441456, + "grad_norm": 3.5926830768585205, + "learning_rate": 4.59710787642151e-06, + "loss": 0.1861, + "step": 20961 + }, + { + "epoch": 0.5304552471088392, + "grad_norm": 6.643304824829102, + "learning_rate": 4.596707660823661e-06, + "loss": 0.2218, + "step": 20962 + }, + { + "epoch": 0.5304805526735329, + "grad_norm": 4.597764492034912, + "learning_rate": 4.596307447826572e-06, + "loss": 0.2248, + "step": 20963 + }, + { + "epoch": 0.5305058582382266, + "grad_norm": 11.249792098999023, + "learning_rate": 4.595907237432819e-06, + "loss": 0.19, + "step": 20964 + }, + { + "epoch": 0.5305311638029203, + "grad_norm": 5.148031234741211, + "learning_rate": 4.595507029644984e-06, + "loss": 0.2173, + "step": 20965 + }, + { + "epoch": 0.5305564693676139, + "grad_norm": 8.278636932373047, + "learning_rate": 4.595106824465649e-06, + "loss": 0.2046, + "step": 20966 + }, + { + "epoch": 0.5305817749323076, + "grad_norm": 3.4493963718414307, + "learning_rate": 4.594706621897395e-06, + "loss": 0.1345, + "step": 20967 + }, + { + "epoch": 0.5306070804970013, + "grad_norm": 6.065237998962402, + "learning_rate": 4.594306421942806e-06, + "loss": 0.1855, + "step": 20968 + }, + { + "epoch": 0.5306323860616949, + "grad_norm": 3.3324029445648193, + "learning_rate": 4.593906224604456e-06, + "loss": 0.169, + "step": 20969 + }, + { + "epoch": 0.5306576916263887, + "grad_norm": 7.985937118530273, + "learning_rate": 4.59350602988493e-06, + "loss": 0.1755, + "step": 20970 + }, + { + "epoch": 0.5306829971910824, + "grad_norm": 8.81021499633789, + "learning_rate": 4.593105837786807e-06, + "loss": 0.1904, + "step": 20971 + }, + { + "epoch": 0.530708302755776, + "grad_norm": 6.614956855773926, + "learning_rate": 4.592705648312672e-06, + "loss": 0.1809, + "step": 20972 + }, + { + "epoch": 0.5307336083204697, + "grad_norm": 6.737952709197998, + "learning_rate": 4.592305461465099e-06, + "loss": 0.189, + "step": 20973 + }, + { + "epoch": 0.5307589138851634, + "grad_norm": 3.0082249641418457, + "learning_rate": 4.591905277246673e-06, + "loss": 0.112, + "step": 20974 + }, + { + "epoch": 0.530784219449857, + "grad_norm": 4.139017105102539, + "learning_rate": 4.5915050956599746e-06, + "loss": 0.1006, + "step": 20975 + }, + { + "epoch": 0.5308095250145507, + "grad_norm": 6.258696556091309, + "learning_rate": 4.591104916707584e-06, + "loss": 0.1799, + "step": 20976 + }, + { + "epoch": 0.5308348305792444, + "grad_norm": 5.463261127471924, + "learning_rate": 4.590704740392083e-06, + "loss": 0.2277, + "step": 20977 + }, + { + "epoch": 0.530860136143938, + "grad_norm": 7.97145938873291, + "learning_rate": 4.590304566716049e-06, + "loss": 0.1673, + "step": 20978 + }, + { + "epoch": 0.5308854417086317, + "grad_norm": 6.912230491638184, + "learning_rate": 4.589904395682065e-06, + "loss": 0.1401, + "step": 20979 + }, + { + "epoch": 0.5309107472733254, + "grad_norm": 7.843922138214111, + "learning_rate": 4.589504227292712e-06, + "loss": 0.2781, + "step": 20980 + }, + { + "epoch": 0.530936052838019, + "grad_norm": 3.2683537006378174, + "learning_rate": 4.58910406155057e-06, + "loss": 0.1392, + "step": 20981 + }, + { + "epoch": 0.5309613584027127, + "grad_norm": 4.303049087524414, + "learning_rate": 4.588703898458218e-06, + "loss": 0.1652, + "step": 20982 + }, + { + "epoch": 0.5309866639674065, + "grad_norm": 4.864198207855225, + "learning_rate": 4.5883037380182396e-06, + "loss": 0.1483, + "step": 20983 + }, + { + "epoch": 0.5310119695321001, + "grad_norm": 3.312471866607666, + "learning_rate": 4.587903580233212e-06, + "loss": 0.1212, + "step": 20984 + }, + { + "epoch": 0.5310372750967938, + "grad_norm": 3.0492827892303467, + "learning_rate": 4.58750342510572e-06, + "loss": 0.1442, + "step": 20985 + }, + { + "epoch": 0.5310625806614875, + "grad_norm": 4.468806743621826, + "learning_rate": 4.587103272638339e-06, + "loss": 0.118, + "step": 20986 + }, + { + "epoch": 0.5310878862261811, + "grad_norm": 5.799154281616211, + "learning_rate": 4.586703122833654e-06, + "loss": 0.1513, + "step": 20987 + }, + { + "epoch": 0.5311131917908748, + "grad_norm": 11.096697807312012, + "learning_rate": 4.5863029756942425e-06, + "loss": 0.2494, + "step": 20988 + }, + { + "epoch": 0.5311384973555685, + "grad_norm": 4.292302131652832, + "learning_rate": 4.585902831222687e-06, + "loss": 0.114, + "step": 20989 + }, + { + "epoch": 0.5311638029202622, + "grad_norm": 5.537896156311035, + "learning_rate": 4.585502689421568e-06, + "loss": 0.1727, + "step": 20990 + }, + { + "epoch": 0.5311891084849558, + "grad_norm": 8.949372291564941, + "learning_rate": 4.5851025502934635e-06, + "loss": 0.2069, + "step": 20991 + }, + { + "epoch": 0.5312144140496495, + "grad_norm": 3.627990484237671, + "learning_rate": 4.584702413840956e-06, + "loss": 0.1567, + "step": 20992 + }, + { + "epoch": 0.5312397196143432, + "grad_norm": 8.900301933288574, + "learning_rate": 4.584302280066624e-06, + "loss": 0.1774, + "step": 20993 + }, + { + "epoch": 0.5312650251790368, + "grad_norm": 6.147397518157959, + "learning_rate": 4.583902148973053e-06, + "loss": 0.2333, + "step": 20994 + }, + { + "epoch": 0.5312903307437306, + "grad_norm": 4.771459579467773, + "learning_rate": 4.5835020205628165e-06, + "loss": 0.1328, + "step": 20995 + }, + { + "epoch": 0.5313156363084243, + "grad_norm": 4.789041996002197, + "learning_rate": 4.583101894838498e-06, + "loss": 0.1639, + "step": 20996 + }, + { + "epoch": 0.5313409418731179, + "grad_norm": 5.223927021026611, + "learning_rate": 4.582701771802678e-06, + "loss": 0.2236, + "step": 20997 + }, + { + "epoch": 0.5313662474378116, + "grad_norm": 3.3570237159729004, + "learning_rate": 4.582301651457937e-06, + "loss": 0.1233, + "step": 20998 + }, + { + "epoch": 0.5313915530025053, + "grad_norm": 4.155495643615723, + "learning_rate": 4.581901533806857e-06, + "loss": 0.1721, + "step": 20999 + }, + { + "epoch": 0.5314168585671989, + "grad_norm": 18.01993179321289, + "learning_rate": 4.5815014188520134e-06, + "loss": 0.1279, + "step": 21000 + }, + { + "epoch": 0.5314421641318926, + "grad_norm": 3.5064072608947754, + "learning_rate": 4.5811013065959905e-06, + "loss": 0.0972, + "step": 21001 + }, + { + "epoch": 0.5314674696965863, + "grad_norm": 2.1818864345550537, + "learning_rate": 4.580701197041367e-06, + "loss": 0.1187, + "step": 21002 + }, + { + "epoch": 0.5314927752612799, + "grad_norm": 7.236918926239014, + "learning_rate": 4.580301090190725e-06, + "loss": 0.2938, + "step": 21003 + }, + { + "epoch": 0.5315180808259736, + "grad_norm": 4.290793418884277, + "learning_rate": 4.579900986046641e-06, + "loss": 0.166, + "step": 21004 + }, + { + "epoch": 0.5315433863906673, + "grad_norm": 3.7462615966796875, + "learning_rate": 4.579500884611698e-06, + "loss": 0.1478, + "step": 21005 + }, + { + "epoch": 0.5315686919553609, + "grad_norm": 4.224702835083008, + "learning_rate": 4.579100785888475e-06, + "loss": 0.1714, + "step": 21006 + }, + { + "epoch": 0.5315939975200547, + "grad_norm": 5.506321907043457, + "learning_rate": 4.578700689879554e-06, + "loss": 0.1705, + "step": 21007 + }, + { + "epoch": 0.5316193030847484, + "grad_norm": 3.8495357036590576, + "learning_rate": 4.578300596587515e-06, + "loss": 0.1363, + "step": 21008 + }, + { + "epoch": 0.531644608649442, + "grad_norm": 8.781635284423828, + "learning_rate": 4.577900506014934e-06, + "loss": 0.1337, + "step": 21009 + }, + { + "epoch": 0.5316699142141357, + "grad_norm": 5.16171407699585, + "learning_rate": 4.577500418164395e-06, + "loss": 0.2013, + "step": 21010 + }, + { + "epoch": 0.5316952197788294, + "grad_norm": 5.000615119934082, + "learning_rate": 4.577100333038479e-06, + "loss": 0.1373, + "step": 21011 + }, + { + "epoch": 0.531720525343523, + "grad_norm": 12.879948616027832, + "learning_rate": 4.5767002506397636e-06, + "loss": 0.1653, + "step": 21012 + }, + { + "epoch": 0.5317458309082167, + "grad_norm": 6.896048545837402, + "learning_rate": 4.576300170970829e-06, + "loss": 0.1812, + "step": 21013 + }, + { + "epoch": 0.5317711364729104, + "grad_norm": 4.749034404754639, + "learning_rate": 4.575900094034256e-06, + "loss": 0.2068, + "step": 21014 + }, + { + "epoch": 0.5317964420376041, + "grad_norm": 3.2727465629577637, + "learning_rate": 4.575500019832623e-06, + "loss": 0.1448, + "step": 21015 + }, + { + "epoch": 0.5318217476022977, + "grad_norm": 4.643065929412842, + "learning_rate": 4.575099948368513e-06, + "loss": 0.1971, + "step": 21016 + }, + { + "epoch": 0.5318470531669914, + "grad_norm": 2.93839693069458, + "learning_rate": 4.574699879644505e-06, + "loss": 0.0915, + "step": 21017 + }, + { + "epoch": 0.5318723587316851, + "grad_norm": 5.1537089347839355, + "learning_rate": 4.574299813663177e-06, + "loss": 0.2595, + "step": 21018 + }, + { + "epoch": 0.5318976642963787, + "grad_norm": 4.59037971496582, + "learning_rate": 4.57389975042711e-06, + "loss": 0.2079, + "step": 21019 + }, + { + "epoch": 0.5319229698610725, + "grad_norm": 3.3162543773651123, + "learning_rate": 4.573499689938885e-06, + "loss": 0.1909, + "step": 21020 + }, + { + "epoch": 0.5319482754257662, + "grad_norm": 4.843597412109375, + "learning_rate": 4.573099632201082e-06, + "loss": 0.1258, + "step": 21021 + }, + { + "epoch": 0.5319735809904598, + "grad_norm": 11.507831573486328, + "learning_rate": 4.572699577216279e-06, + "loss": 0.1803, + "step": 21022 + }, + { + "epoch": 0.5319988865551535, + "grad_norm": 3.347957134246826, + "learning_rate": 4.572299524987057e-06, + "loss": 0.1512, + "step": 21023 + }, + { + "epoch": 0.5320241921198472, + "grad_norm": 8.237089157104492, + "learning_rate": 4.571899475515995e-06, + "loss": 0.305, + "step": 21024 + }, + { + "epoch": 0.5320494976845408, + "grad_norm": 4.307632923126221, + "learning_rate": 4.571499428805676e-06, + "loss": 0.1636, + "step": 21025 + }, + { + "epoch": 0.5320748032492345, + "grad_norm": 5.765809059143066, + "learning_rate": 4.571099384858676e-06, + "loss": 0.1831, + "step": 21026 + }, + { + "epoch": 0.5321001088139282, + "grad_norm": 5.517488956451416, + "learning_rate": 4.570699343677576e-06, + "loss": 0.1826, + "step": 21027 + }, + { + "epoch": 0.5321254143786218, + "grad_norm": 3.52666974067688, + "learning_rate": 4.570299305264956e-06, + "loss": 0.0831, + "step": 21028 + }, + { + "epoch": 0.5321507199433155, + "grad_norm": 4.137593746185303, + "learning_rate": 4.569899269623396e-06, + "loss": 0.1585, + "step": 21029 + }, + { + "epoch": 0.5321760255080092, + "grad_norm": 4.825419902801514, + "learning_rate": 4.5694992367554765e-06, + "loss": 0.2054, + "step": 21030 + }, + { + "epoch": 0.5322013310727028, + "grad_norm": 5.577576160430908, + "learning_rate": 4.5690992066637756e-06, + "loss": 0.1509, + "step": 21031 + }, + { + "epoch": 0.5322266366373966, + "grad_norm": 5.678623676300049, + "learning_rate": 4.568699179350874e-06, + "loss": 0.1898, + "step": 21032 + }, + { + "epoch": 0.5322519422020903, + "grad_norm": 5.710536003112793, + "learning_rate": 4.56829915481935e-06, + "loss": 0.21, + "step": 21033 + }, + { + "epoch": 0.5322772477667839, + "grad_norm": 4.467354774475098, + "learning_rate": 4.567899133071787e-06, + "loss": 0.2072, + "step": 21034 + }, + { + "epoch": 0.5323025533314776, + "grad_norm": 6.743224620819092, + "learning_rate": 4.5674991141107596e-06, + "loss": 0.1503, + "step": 21035 + }, + { + "epoch": 0.5323278588961713, + "grad_norm": 4.155200481414795, + "learning_rate": 4.567099097938851e-06, + "loss": 0.156, + "step": 21036 + }, + { + "epoch": 0.5323531644608649, + "grad_norm": 7.023738861083984, + "learning_rate": 4.566699084558639e-06, + "loss": 0.1842, + "step": 21037 + }, + { + "epoch": 0.5323784700255586, + "grad_norm": 4.0709919929504395, + "learning_rate": 4.5662990739727065e-06, + "loss": 0.1235, + "step": 21038 + }, + { + "epoch": 0.5324037755902523, + "grad_norm": 5.965339183807373, + "learning_rate": 4.565899066183629e-06, + "loss": 0.2322, + "step": 21039 + }, + { + "epoch": 0.532429081154946, + "grad_norm": 3.3389389514923096, + "learning_rate": 4.565499061193986e-06, + "loss": 0.1167, + "step": 21040 + }, + { + "epoch": 0.5324543867196396, + "grad_norm": 4.067710876464844, + "learning_rate": 4.565099059006362e-06, + "loss": 0.1289, + "step": 21041 + }, + { + "epoch": 0.5324796922843333, + "grad_norm": 19.050649642944336, + "learning_rate": 4.5646990596233305e-06, + "loss": 0.1315, + "step": 21042 + }, + { + "epoch": 0.532504997849027, + "grad_norm": 4.280562400817871, + "learning_rate": 4.5642990630474755e-06, + "loss": 0.0947, + "step": 21043 + }, + { + "epoch": 0.5325303034137207, + "grad_norm": 2.540928363800049, + "learning_rate": 4.5638990692813746e-06, + "loss": 0.1397, + "step": 21044 + }, + { + "epoch": 0.5325556089784144, + "grad_norm": 6.004830837249756, + "learning_rate": 4.563499078327606e-06, + "loss": 0.135, + "step": 21045 + }, + { + "epoch": 0.5325809145431081, + "grad_norm": 8.039777755737305, + "learning_rate": 4.563099090188751e-06, + "loss": 0.2661, + "step": 21046 + }, + { + "epoch": 0.5326062201078017, + "grad_norm": 33.93716812133789, + "learning_rate": 4.562699104867391e-06, + "loss": 0.1269, + "step": 21047 + }, + { + "epoch": 0.5326315256724954, + "grad_norm": 11.048364639282227, + "learning_rate": 4.5622991223661015e-06, + "loss": 0.303, + "step": 21048 + }, + { + "epoch": 0.5326568312371891, + "grad_norm": 7.2010087966918945, + "learning_rate": 4.561899142687463e-06, + "loss": 0.1948, + "step": 21049 + }, + { + "epoch": 0.5326821368018827, + "grad_norm": 3.8071675300598145, + "learning_rate": 4.561499165834056e-06, + "loss": 0.2261, + "step": 21050 + }, + { + "epoch": 0.5327074423665764, + "grad_norm": 6.220592021942139, + "learning_rate": 4.561099191808458e-06, + "loss": 0.2409, + "step": 21051 + }, + { + "epoch": 0.5327327479312701, + "grad_norm": 5.469599723815918, + "learning_rate": 4.560699220613253e-06, + "loss": 0.1779, + "step": 21052 + }, + { + "epoch": 0.5327580534959637, + "grad_norm": 3.205054521560669, + "learning_rate": 4.5602992522510144e-06, + "loss": 0.2162, + "step": 21053 + }, + { + "epoch": 0.5327833590606574, + "grad_norm": 3.626746892929077, + "learning_rate": 4.5598992867243245e-06, + "loss": 0.1669, + "step": 21054 + }, + { + "epoch": 0.5328086646253511, + "grad_norm": 4.899966716766357, + "learning_rate": 4.5594993240357615e-06, + "loss": 0.1336, + "step": 21055 + }, + { + "epoch": 0.5328339701900447, + "grad_norm": 11.718242645263672, + "learning_rate": 4.559099364187907e-06, + "loss": 0.1681, + "step": 21056 + }, + { + "epoch": 0.5328592757547385, + "grad_norm": 4.586093425750732, + "learning_rate": 4.558699407183339e-06, + "loss": 0.2107, + "step": 21057 + }, + { + "epoch": 0.5328845813194322, + "grad_norm": 2.2969255447387695, + "learning_rate": 4.558299453024635e-06, + "loss": 0.1017, + "step": 21058 + }, + { + "epoch": 0.5329098868841258, + "grad_norm": 5.010815143585205, + "learning_rate": 4.5578995017143755e-06, + "loss": 0.1197, + "step": 21059 + }, + { + "epoch": 0.5329351924488195, + "grad_norm": 3.608564853668213, + "learning_rate": 4.557499553255141e-06, + "loss": 0.1785, + "step": 21060 + }, + { + "epoch": 0.5329604980135132, + "grad_norm": 7.330624580383301, + "learning_rate": 4.557099607649509e-06, + "loss": 0.2076, + "step": 21061 + }, + { + "epoch": 0.5329858035782068, + "grad_norm": 2.434018611907959, + "learning_rate": 4.556699664900059e-06, + "loss": 0.1205, + "step": 21062 + }, + { + "epoch": 0.5330111091429005, + "grad_norm": 6.326043605804443, + "learning_rate": 4.5562997250093695e-06, + "loss": 0.2051, + "step": 21063 + }, + { + "epoch": 0.5330364147075942, + "grad_norm": 3.4298152923583984, + "learning_rate": 4.555899787980022e-06, + "loss": 0.1979, + "step": 21064 + }, + { + "epoch": 0.5330617202722879, + "grad_norm": 8.378244400024414, + "learning_rate": 4.555499853814596e-06, + "loss": 0.1994, + "step": 21065 + }, + { + "epoch": 0.5330870258369815, + "grad_norm": 3.612910032272339, + "learning_rate": 4.555099922515665e-06, + "loss": 0.1709, + "step": 21066 + }, + { + "epoch": 0.5331123314016752, + "grad_norm": 5.108317852020264, + "learning_rate": 4.554699994085814e-06, + "loss": 0.2222, + "step": 21067 + }, + { + "epoch": 0.533137636966369, + "grad_norm": 4.110452651977539, + "learning_rate": 4.55430006852762e-06, + "loss": 0.1628, + "step": 21068 + }, + { + "epoch": 0.5331629425310626, + "grad_norm": 4.080435752868652, + "learning_rate": 4.5539001458436615e-06, + "loss": 0.1339, + "step": 21069 + }, + { + "epoch": 0.5331882480957563, + "grad_norm": 10.979314804077148, + "learning_rate": 4.553500226036518e-06, + "loss": 0.2339, + "step": 21070 + }, + { + "epoch": 0.53321355366045, + "grad_norm": 3.4693379402160645, + "learning_rate": 4.553100309108767e-06, + "loss": 0.0927, + "step": 21071 + }, + { + "epoch": 0.5332388592251436, + "grad_norm": 3.347141981124878, + "learning_rate": 4.552700395062991e-06, + "loss": 0.1177, + "step": 21072 + }, + { + "epoch": 0.5332641647898373, + "grad_norm": 9.685956954956055, + "learning_rate": 4.552300483901766e-06, + "loss": 0.1541, + "step": 21073 + }, + { + "epoch": 0.533289470354531, + "grad_norm": 8.583000183105469, + "learning_rate": 4.551900575627672e-06, + "loss": 0.2293, + "step": 21074 + }, + { + "epoch": 0.5333147759192246, + "grad_norm": 4.905858516693115, + "learning_rate": 4.551500670243289e-06, + "loss": 0.2341, + "step": 21075 + }, + { + "epoch": 0.5333400814839183, + "grad_norm": 5.839925765991211, + "learning_rate": 4.551100767751193e-06, + "loss": 0.2179, + "step": 21076 + }, + { + "epoch": 0.533365387048612, + "grad_norm": 4.639800071716309, + "learning_rate": 4.550700868153966e-06, + "loss": 0.1273, + "step": 21077 + }, + { + "epoch": 0.5333906926133056, + "grad_norm": 2.5280442237854004, + "learning_rate": 4.5503009714541855e-06, + "loss": 0.1006, + "step": 21078 + }, + { + "epoch": 0.5334159981779993, + "grad_norm": 7.9921345710754395, + "learning_rate": 4.549901077654429e-06, + "loss": 0.1785, + "step": 21079 + }, + { + "epoch": 0.533441303742693, + "grad_norm": 4.37441349029541, + "learning_rate": 4.549501186757277e-06, + "loss": 0.1772, + "step": 21080 + }, + { + "epoch": 0.5334666093073867, + "grad_norm": 4.954742431640625, + "learning_rate": 4.549101298765309e-06, + "loss": 0.184, + "step": 21081 + }, + { + "epoch": 0.5334919148720804, + "grad_norm": 8.668523788452148, + "learning_rate": 4.548701413681101e-06, + "loss": 0.1524, + "step": 21082 + }, + { + "epoch": 0.5335172204367741, + "grad_norm": 3.577939748764038, + "learning_rate": 4.548301531507238e-06, + "loss": 0.1814, + "step": 21083 + }, + { + "epoch": 0.5335425260014677, + "grad_norm": 3.888000726699829, + "learning_rate": 4.547901652246291e-06, + "loss": 0.1468, + "step": 21084 + }, + { + "epoch": 0.5335678315661614, + "grad_norm": 5.167279243469238, + "learning_rate": 4.547501775900841e-06, + "loss": 0.204, + "step": 21085 + }, + { + "epoch": 0.5335931371308551, + "grad_norm": 4.085925102233887, + "learning_rate": 4.547101902473469e-06, + "loss": 0.212, + "step": 21086 + }, + { + "epoch": 0.5336184426955487, + "grad_norm": 3.0587286949157715, + "learning_rate": 4.5467020319667546e-06, + "loss": 0.1589, + "step": 21087 + }, + { + "epoch": 0.5336437482602424, + "grad_norm": 9.214529991149902, + "learning_rate": 4.546302164383273e-06, + "loss": 0.1864, + "step": 21088 + }, + { + "epoch": 0.5336690538249361, + "grad_norm": 4.102241039276123, + "learning_rate": 4.545902299725604e-06, + "loss": 0.1349, + "step": 21089 + }, + { + "epoch": 0.5336943593896297, + "grad_norm": 8.959553718566895, + "learning_rate": 4.545502437996326e-06, + "loss": 0.2911, + "step": 21090 + }, + { + "epoch": 0.5337196649543234, + "grad_norm": 6.369751930236816, + "learning_rate": 4.545102579198021e-06, + "loss": 0.2895, + "step": 21091 + }, + { + "epoch": 0.5337449705190171, + "grad_norm": 10.500110626220703, + "learning_rate": 4.544702723333262e-06, + "loss": 0.3331, + "step": 21092 + }, + { + "epoch": 0.5337702760837109, + "grad_norm": 7.308701992034912, + "learning_rate": 4.544302870404631e-06, + "loss": 0.1248, + "step": 21093 + }, + { + "epoch": 0.5337955816484045, + "grad_norm": 8.38158130645752, + "learning_rate": 4.543903020414706e-06, + "loss": 0.2212, + "step": 21094 + }, + { + "epoch": 0.5338208872130982, + "grad_norm": 7.843160152435303, + "learning_rate": 4.543503173366066e-06, + "loss": 0.1716, + "step": 21095 + }, + { + "epoch": 0.5338461927777919, + "grad_norm": 2.2118639945983887, + "learning_rate": 4.5431033292612905e-06, + "loss": 0.1183, + "step": 21096 + }, + { + "epoch": 0.5338714983424855, + "grad_norm": 3.9723682403564453, + "learning_rate": 4.542703488102955e-06, + "loss": 0.1335, + "step": 21097 + }, + { + "epoch": 0.5338968039071792, + "grad_norm": 6.024727821350098, + "learning_rate": 4.542303649893639e-06, + "loss": 0.1597, + "step": 21098 + }, + { + "epoch": 0.5339221094718729, + "grad_norm": 6.049588203430176, + "learning_rate": 4.541903814635924e-06, + "loss": 0.1279, + "step": 21099 + }, + { + "epoch": 0.5339474150365665, + "grad_norm": 7.258962154388428, + "learning_rate": 4.541503982332385e-06, + "loss": 0.2272, + "step": 21100 + }, + { + "epoch": 0.5339727206012602, + "grad_norm": 4.193828582763672, + "learning_rate": 4.5411041529856e-06, + "loss": 0.1286, + "step": 21101 + }, + { + "epoch": 0.5339980261659539, + "grad_norm": 3.9481353759765625, + "learning_rate": 4.540704326598151e-06, + "loss": 0.0976, + "step": 21102 + }, + { + "epoch": 0.5340233317306475, + "grad_norm": 3.575244426727295, + "learning_rate": 4.540304503172613e-06, + "loss": 0.162, + "step": 21103 + }, + { + "epoch": 0.5340486372953412, + "grad_norm": 4.730928421020508, + "learning_rate": 4.5399046827115655e-06, + "loss": 0.1169, + "step": 21104 + }, + { + "epoch": 0.534073942860035, + "grad_norm": 8.032793045043945, + "learning_rate": 4.53950486521759e-06, + "loss": 0.2099, + "step": 21105 + }, + { + "epoch": 0.5340992484247286, + "grad_norm": 4.42278528213501, + "learning_rate": 4.539105050693259e-06, + "loss": 0.1172, + "step": 21106 + }, + { + "epoch": 0.5341245539894223, + "grad_norm": 3.3169503211975098, + "learning_rate": 4.538705239141154e-06, + "loss": 0.1666, + "step": 21107 + }, + { + "epoch": 0.534149859554116, + "grad_norm": 5.150866985321045, + "learning_rate": 4.538305430563853e-06, + "loss": 0.1616, + "step": 21108 + }, + { + "epoch": 0.5341751651188096, + "grad_norm": 3.003185510635376, + "learning_rate": 4.537905624963937e-06, + "loss": 0.1561, + "step": 21109 + }, + { + "epoch": 0.5342004706835033, + "grad_norm": 6.977680206298828, + "learning_rate": 4.5375058223439796e-06, + "loss": 0.1223, + "step": 21110 + }, + { + "epoch": 0.534225776248197, + "grad_norm": 7.783045291900635, + "learning_rate": 4.53710602270656e-06, + "loss": 0.1299, + "step": 21111 + }, + { + "epoch": 0.5342510818128906, + "grad_norm": 3.6149072647094727, + "learning_rate": 4.536706226054259e-06, + "loss": 0.1823, + "step": 21112 + }, + { + "epoch": 0.5342763873775843, + "grad_norm": 3.894310474395752, + "learning_rate": 4.536306432389654e-06, + "loss": 0.1099, + "step": 21113 + }, + { + "epoch": 0.534301692942278, + "grad_norm": 6.604903697967529, + "learning_rate": 4.535906641715322e-06, + "loss": 0.2009, + "step": 21114 + }, + { + "epoch": 0.5343269985069716, + "grad_norm": 6.160313606262207, + "learning_rate": 4.535506854033841e-06, + "loss": 0.1757, + "step": 21115 + }, + { + "epoch": 0.5343523040716653, + "grad_norm": 8.821258544921875, + "learning_rate": 4.53510706934779e-06, + "loss": 0.2002, + "step": 21116 + }, + { + "epoch": 0.534377609636359, + "grad_norm": 5.871500015258789, + "learning_rate": 4.534707287659747e-06, + "loss": 0.098, + "step": 21117 + }, + { + "epoch": 0.5344029152010528, + "grad_norm": 8.32354736328125, + "learning_rate": 4.534307508972292e-06, + "loss": 0.214, + "step": 21118 + }, + { + "epoch": 0.5344282207657464, + "grad_norm": 3.481219530105591, + "learning_rate": 4.5339077332879996e-06, + "loss": 0.1676, + "step": 21119 + }, + { + "epoch": 0.5344535263304401, + "grad_norm": 2.9282188415527344, + "learning_rate": 4.533507960609449e-06, + "loss": 0.0917, + "step": 21120 + }, + { + "epoch": 0.5344788318951338, + "grad_norm": 14.697381973266602, + "learning_rate": 4.533108190939219e-06, + "loss": 0.2235, + "step": 21121 + }, + { + "epoch": 0.5345041374598274, + "grad_norm": 3.773014545440674, + "learning_rate": 4.53270842427989e-06, + "loss": 0.1663, + "step": 21122 + }, + { + "epoch": 0.5345294430245211, + "grad_norm": 3.4561710357666016, + "learning_rate": 4.532308660634035e-06, + "loss": 0.1261, + "step": 21123 + }, + { + "epoch": 0.5345547485892148, + "grad_norm": 5.003613471984863, + "learning_rate": 4.531908900004235e-06, + "loss": 0.099, + "step": 21124 + }, + { + "epoch": 0.5345800541539084, + "grad_norm": 32.237525939941406, + "learning_rate": 4.531509142393066e-06, + "loss": 0.2577, + "step": 21125 + }, + { + "epoch": 0.5346053597186021, + "grad_norm": 5.4157233238220215, + "learning_rate": 4.531109387803109e-06, + "loss": 0.1815, + "step": 21126 + }, + { + "epoch": 0.5346306652832958, + "grad_norm": 3.9513068199157715, + "learning_rate": 4.530709636236941e-06, + "loss": 0.1172, + "step": 21127 + }, + { + "epoch": 0.5346559708479894, + "grad_norm": 3.3588383197784424, + "learning_rate": 4.530309887697138e-06, + "loss": 0.1315, + "step": 21128 + }, + { + "epoch": 0.5346812764126831, + "grad_norm": 5.013293743133545, + "learning_rate": 4.5299101421862805e-06, + "loss": 0.2285, + "step": 21129 + }, + { + "epoch": 0.5347065819773769, + "grad_norm": 6.655736923217773, + "learning_rate": 4.529510399706943e-06, + "loss": 0.2078, + "step": 21130 + }, + { + "epoch": 0.5347318875420705, + "grad_norm": 4.420735836029053, + "learning_rate": 4.529110660261707e-06, + "loss": 0.158, + "step": 21131 + }, + { + "epoch": 0.5347571931067642, + "grad_norm": 5.003047943115234, + "learning_rate": 4.528710923853148e-06, + "loss": 0.2133, + "step": 21132 + }, + { + "epoch": 0.5347824986714579, + "grad_norm": 17.41444206237793, + "learning_rate": 4.528311190483845e-06, + "loss": 0.2152, + "step": 21133 + }, + { + "epoch": 0.5348078042361515, + "grad_norm": 4.9078288078308105, + "learning_rate": 4.527911460156374e-06, + "loss": 0.2071, + "step": 21134 + }, + { + "epoch": 0.5348331098008452, + "grad_norm": 2.8669939041137695, + "learning_rate": 4.527511732873316e-06, + "loss": 0.1006, + "step": 21135 + }, + { + "epoch": 0.5348584153655389, + "grad_norm": 3.176557779312134, + "learning_rate": 4.527112008637248e-06, + "loss": 0.1321, + "step": 21136 + }, + { + "epoch": 0.5348837209302325, + "grad_norm": 3.7291793823242188, + "learning_rate": 4.526712287450744e-06, + "loss": 0.1867, + "step": 21137 + }, + { + "epoch": 0.5349090264949262, + "grad_norm": 8.061708450317383, + "learning_rate": 4.526312569316385e-06, + "loss": 0.235, + "step": 21138 + }, + { + "epoch": 0.5349343320596199, + "grad_norm": 5.185778617858887, + "learning_rate": 4.525912854236748e-06, + "loss": 0.1922, + "step": 21139 + }, + { + "epoch": 0.5349596376243135, + "grad_norm": 5.62393856048584, + "learning_rate": 4.525513142214413e-06, + "loss": 0.2023, + "step": 21140 + }, + { + "epoch": 0.5349849431890072, + "grad_norm": 4.615167617797852, + "learning_rate": 4.525113433251953e-06, + "loss": 0.1354, + "step": 21141 + }, + { + "epoch": 0.535010248753701, + "grad_norm": 3.372744083404541, + "learning_rate": 4.5247137273519485e-06, + "loss": 0.1506, + "step": 21142 + }, + { + "epoch": 0.5350355543183947, + "grad_norm": 3.7101266384124756, + "learning_rate": 4.524314024516977e-06, + "loss": 0.1397, + "step": 21143 + }, + { + "epoch": 0.5350608598830883, + "grad_norm": 4.087739944458008, + "learning_rate": 4.523914324749618e-06, + "loss": 0.1679, + "step": 21144 + }, + { + "epoch": 0.535086165447782, + "grad_norm": 11.276155471801758, + "learning_rate": 4.523514628052444e-06, + "loss": 0.152, + "step": 21145 + }, + { + "epoch": 0.5351114710124757, + "grad_norm": 7.001857280731201, + "learning_rate": 4.523114934428036e-06, + "loss": 0.1546, + "step": 21146 + }, + { + "epoch": 0.5351367765771693, + "grad_norm": 5.272627830505371, + "learning_rate": 4.5227152438789714e-06, + "loss": 0.1631, + "step": 21147 + }, + { + "epoch": 0.535162082141863, + "grad_norm": 5.025730609893799, + "learning_rate": 4.522315556407828e-06, + "loss": 0.1828, + "step": 21148 + }, + { + "epoch": 0.5351873877065567, + "grad_norm": 5.751626014709473, + "learning_rate": 4.521915872017184e-06, + "loss": 0.1639, + "step": 21149 + }, + { + "epoch": 0.5352126932712503, + "grad_norm": 3.8439652919769287, + "learning_rate": 4.521516190709613e-06, + "loss": 0.113, + "step": 21150 + }, + { + "epoch": 0.535237998835944, + "grad_norm": 8.881975173950195, + "learning_rate": 4.521116512487696e-06, + "loss": 0.172, + "step": 21151 + }, + { + "epoch": 0.5352633044006377, + "grad_norm": 5.336467266082764, + "learning_rate": 4.520716837354009e-06, + "loss": 0.125, + "step": 21152 + }, + { + "epoch": 0.5352886099653313, + "grad_norm": 4.122864723205566, + "learning_rate": 4.520317165311132e-06, + "loss": 0.1782, + "step": 21153 + }, + { + "epoch": 0.535313915530025, + "grad_norm": 12.679689407348633, + "learning_rate": 4.519917496361639e-06, + "loss": 0.1684, + "step": 21154 + }, + { + "epoch": 0.5353392210947188, + "grad_norm": 4.4832611083984375, + "learning_rate": 4.519517830508108e-06, + "loss": 0.2026, + "step": 21155 + }, + { + "epoch": 0.5353645266594124, + "grad_norm": 7.151193618774414, + "learning_rate": 4.519118167753117e-06, + "loss": 0.2387, + "step": 21156 + }, + { + "epoch": 0.5353898322241061, + "grad_norm": 5.722646236419678, + "learning_rate": 4.518718508099245e-06, + "loss": 0.2434, + "step": 21157 + }, + { + "epoch": 0.5354151377887998, + "grad_norm": 4.8390583992004395, + "learning_rate": 4.518318851549068e-06, + "loss": 0.1193, + "step": 21158 + }, + { + "epoch": 0.5354404433534934, + "grad_norm": 4.060599327087402, + "learning_rate": 4.517919198105162e-06, + "loss": 0.1657, + "step": 21159 + }, + { + "epoch": 0.5354657489181871, + "grad_norm": 6.468907833099365, + "learning_rate": 4.5175195477701065e-06, + "loss": 0.2086, + "step": 21160 + }, + { + "epoch": 0.5354910544828808, + "grad_norm": 3.9528539180755615, + "learning_rate": 4.517119900546476e-06, + "loss": 0.1826, + "step": 21161 + }, + { + "epoch": 0.5355163600475744, + "grad_norm": 3.222897529602051, + "learning_rate": 4.516720256436851e-06, + "loss": 0.1122, + "step": 21162 + }, + { + "epoch": 0.5355416656122681, + "grad_norm": 4.1095499992370605, + "learning_rate": 4.516320615443807e-06, + "loss": 0.1816, + "step": 21163 + }, + { + "epoch": 0.5355669711769618, + "grad_norm": 3.7143707275390625, + "learning_rate": 4.515920977569921e-06, + "loss": 0.1427, + "step": 21164 + }, + { + "epoch": 0.5355922767416554, + "grad_norm": 6.327752590179443, + "learning_rate": 4.5155213428177696e-06, + "loss": 0.2018, + "step": 21165 + }, + { + "epoch": 0.5356175823063491, + "grad_norm": 2.960789442062378, + "learning_rate": 4.5151217111899334e-06, + "loss": 0.0977, + "step": 21166 + }, + { + "epoch": 0.5356428878710429, + "grad_norm": 9.61364459991455, + "learning_rate": 4.514722082688985e-06, + "loss": 0.1922, + "step": 21167 + }, + { + "epoch": 0.5356681934357366, + "grad_norm": 7.791142463684082, + "learning_rate": 4.514322457317504e-06, + "loss": 0.247, + "step": 21168 + }, + { + "epoch": 0.5356934990004302, + "grad_norm": 3.6561226844787598, + "learning_rate": 4.513922835078066e-06, + "loss": 0.1397, + "step": 21169 + }, + { + "epoch": 0.5357188045651239, + "grad_norm": 3.0625803470611572, + "learning_rate": 4.51352321597325e-06, + "loss": 0.1376, + "step": 21170 + }, + { + "epoch": 0.5357441101298176, + "grad_norm": 3.8510489463806152, + "learning_rate": 4.513123600005635e-06, + "loss": 0.1811, + "step": 21171 + }, + { + "epoch": 0.5357694156945112, + "grad_norm": 4.728013515472412, + "learning_rate": 4.512723987177792e-06, + "loss": 0.1211, + "step": 21172 + }, + { + "epoch": 0.5357947212592049, + "grad_norm": 3.1206398010253906, + "learning_rate": 4.512324377492302e-06, + "loss": 0.1851, + "step": 21173 + }, + { + "epoch": 0.5358200268238986, + "grad_norm": 4.061551570892334, + "learning_rate": 4.511924770951741e-06, + "loss": 0.1561, + "step": 21174 + }, + { + "epoch": 0.5358453323885922, + "grad_norm": 2.549842119216919, + "learning_rate": 4.51152516755869e-06, + "loss": 0.1373, + "step": 21175 + }, + { + "epoch": 0.5358706379532859, + "grad_norm": 8.620291709899902, + "learning_rate": 4.511125567315718e-06, + "loss": 0.3172, + "step": 21176 + }, + { + "epoch": 0.5358959435179796, + "grad_norm": 9.115776062011719, + "learning_rate": 4.510725970225408e-06, + "loss": 0.1348, + "step": 21177 + }, + { + "epoch": 0.5359212490826732, + "grad_norm": 4.45172119140625, + "learning_rate": 4.510326376290335e-06, + "loss": 0.1532, + "step": 21178 + }, + { + "epoch": 0.535946554647367, + "grad_norm": 3.3392844200134277, + "learning_rate": 4.509926785513075e-06, + "loss": 0.1707, + "step": 21179 + }, + { + "epoch": 0.5359718602120607, + "grad_norm": 7.3856611251831055, + "learning_rate": 4.509527197896208e-06, + "loss": 0.3184, + "step": 21180 + }, + { + "epoch": 0.5359971657767543, + "grad_norm": 9.990900039672852, + "learning_rate": 4.509127613442309e-06, + "loss": 0.2021, + "step": 21181 + }, + { + "epoch": 0.536022471341448, + "grad_norm": 10.943304061889648, + "learning_rate": 4.508728032153952e-06, + "loss": 0.1951, + "step": 21182 + }, + { + "epoch": 0.5360477769061417, + "grad_norm": 6.15479040145874, + "learning_rate": 4.508328454033718e-06, + "loss": 0.1676, + "step": 21183 + }, + { + "epoch": 0.5360730824708353, + "grad_norm": 12.584589958190918, + "learning_rate": 4.507928879084185e-06, + "loss": 0.3179, + "step": 21184 + }, + { + "epoch": 0.536098388035529, + "grad_norm": 2.9342525005340576, + "learning_rate": 4.507529307307924e-06, + "loss": 0.1526, + "step": 21185 + }, + { + "epoch": 0.5361236936002227, + "grad_norm": 15.42010498046875, + "learning_rate": 4.5071297387075144e-06, + "loss": 0.2295, + "step": 21186 + }, + { + "epoch": 0.5361489991649163, + "grad_norm": 3.000251054763794, + "learning_rate": 4.506730173285535e-06, + "loss": 0.0891, + "step": 21187 + }, + { + "epoch": 0.53617430472961, + "grad_norm": 4.002626419067383, + "learning_rate": 4.506330611044562e-06, + "loss": 0.1191, + "step": 21188 + }, + { + "epoch": 0.5361996102943037, + "grad_norm": 17.874597549438477, + "learning_rate": 4.505931051987169e-06, + "loss": 0.2607, + "step": 21189 + }, + { + "epoch": 0.5362249158589973, + "grad_norm": 3.974571704864502, + "learning_rate": 4.505531496115936e-06, + "loss": 0.1697, + "step": 21190 + }, + { + "epoch": 0.536250221423691, + "grad_norm": 3.244274139404297, + "learning_rate": 4.5051319434334376e-06, + "loss": 0.1672, + "step": 21191 + }, + { + "epoch": 0.5362755269883848, + "grad_norm": 5.677931785583496, + "learning_rate": 4.504732393942251e-06, + "loss": 0.1749, + "step": 21192 + }, + { + "epoch": 0.5363008325530785, + "grad_norm": 11.203449249267578, + "learning_rate": 4.504332847644954e-06, + "loss": 0.0824, + "step": 21193 + }, + { + "epoch": 0.5363261381177721, + "grad_norm": 5.581587791442871, + "learning_rate": 4.503933304544122e-06, + "loss": 0.2388, + "step": 21194 + }, + { + "epoch": 0.5363514436824658, + "grad_norm": 5.625175952911377, + "learning_rate": 4.503533764642331e-06, + "loss": 0.2052, + "step": 21195 + }, + { + "epoch": 0.5363767492471595, + "grad_norm": 3.0149662494659424, + "learning_rate": 4.503134227942159e-06, + "loss": 0.1027, + "step": 21196 + }, + { + "epoch": 0.5364020548118531, + "grad_norm": 14.810346603393555, + "learning_rate": 4.502734694446184e-06, + "loss": 0.1768, + "step": 21197 + }, + { + "epoch": 0.5364273603765468, + "grad_norm": 3.984469413757324, + "learning_rate": 4.502335164156977e-06, + "loss": 0.1808, + "step": 21198 + }, + { + "epoch": 0.5364526659412405, + "grad_norm": 6.757269859313965, + "learning_rate": 4.501935637077119e-06, + "loss": 0.1638, + "step": 21199 + }, + { + "epoch": 0.5364779715059341, + "grad_norm": 2.97263503074646, + "learning_rate": 4.501536113209186e-06, + "loss": 0.1384, + "step": 21200 + }, + { + "epoch": 0.5365032770706278, + "grad_norm": 7.080984115600586, + "learning_rate": 4.5011365925557525e-06, + "loss": 0.2759, + "step": 21201 + }, + { + "epoch": 0.5365285826353215, + "grad_norm": 3.5237040519714355, + "learning_rate": 4.500737075119399e-06, + "loss": 0.1407, + "step": 21202 + }, + { + "epoch": 0.5365538882000151, + "grad_norm": 3.966524600982666, + "learning_rate": 4.500337560902697e-06, + "loss": 0.1413, + "step": 21203 + }, + { + "epoch": 0.5365791937647089, + "grad_norm": 5.858582973480225, + "learning_rate": 4.499938049908225e-06, + "loss": 0.1039, + "step": 21204 + }, + { + "epoch": 0.5366044993294026, + "grad_norm": 4.411685943603516, + "learning_rate": 4.49953854213856e-06, + "loss": 0.1313, + "step": 21205 + }, + { + "epoch": 0.5366298048940962, + "grad_norm": 4.370173931121826, + "learning_rate": 4.499139037596281e-06, + "loss": 0.1128, + "step": 21206 + }, + { + "epoch": 0.5366551104587899, + "grad_norm": 4.965545177459717, + "learning_rate": 4.498739536283958e-06, + "loss": 0.1242, + "step": 21207 + }, + { + "epoch": 0.5366804160234836, + "grad_norm": 4.714936256408691, + "learning_rate": 4.49834003820417e-06, + "loss": 0.1963, + "step": 21208 + }, + { + "epoch": 0.5367057215881772, + "grad_norm": 6.957805156707764, + "learning_rate": 4.497940543359494e-06, + "loss": 0.1142, + "step": 21209 + }, + { + "epoch": 0.5367310271528709, + "grad_norm": 6.078855514526367, + "learning_rate": 4.497541051752506e-06, + "loss": 0.1475, + "step": 21210 + }, + { + "epoch": 0.5367563327175646, + "grad_norm": 7.152219295501709, + "learning_rate": 4.497141563385785e-06, + "loss": 0.1603, + "step": 21211 + }, + { + "epoch": 0.5367816382822582, + "grad_norm": 3.2165815830230713, + "learning_rate": 4.496742078261902e-06, + "loss": 0.1401, + "step": 21212 + }, + { + "epoch": 0.5368069438469519, + "grad_norm": 14.489899635314941, + "learning_rate": 4.496342596383435e-06, + "loss": 0.2979, + "step": 21213 + }, + { + "epoch": 0.5368322494116456, + "grad_norm": 7.969790935516357, + "learning_rate": 4.495943117752963e-06, + "loss": 0.2618, + "step": 21214 + }, + { + "epoch": 0.5368575549763392, + "grad_norm": 3.7591044902801514, + "learning_rate": 4.4955436423730605e-06, + "loss": 0.1727, + "step": 21215 + }, + { + "epoch": 0.536882860541033, + "grad_norm": 3.645111083984375, + "learning_rate": 4.495144170246301e-06, + "loss": 0.1124, + "step": 21216 + }, + { + "epoch": 0.5369081661057267, + "grad_norm": 3.450849771499634, + "learning_rate": 4.494744701375263e-06, + "loss": 0.1434, + "step": 21217 + }, + { + "epoch": 0.5369334716704203, + "grad_norm": 4.958066463470459, + "learning_rate": 4.494345235762524e-06, + "loss": 0.1453, + "step": 21218 + }, + { + "epoch": 0.536958777235114, + "grad_norm": 5.072296619415283, + "learning_rate": 4.493945773410659e-06, + "loss": 0.173, + "step": 21219 + }, + { + "epoch": 0.5369840827998077, + "grad_norm": 10.29787540435791, + "learning_rate": 4.493546314322241e-06, + "loss": 0.3537, + "step": 21220 + }, + { + "epoch": 0.5370093883645014, + "grad_norm": 2.9423329830169678, + "learning_rate": 4.493146858499851e-06, + "loss": 0.0918, + "step": 21221 + }, + { + "epoch": 0.537034693929195, + "grad_norm": 22.940174102783203, + "learning_rate": 4.4927474059460605e-06, + "loss": 0.3599, + "step": 21222 + }, + { + "epoch": 0.5370599994938887, + "grad_norm": 6.144715785980225, + "learning_rate": 4.492347956663448e-06, + "loss": 0.2456, + "step": 21223 + }, + { + "epoch": 0.5370853050585824, + "grad_norm": 3.462484359741211, + "learning_rate": 4.4919485106545914e-06, + "loss": 0.1501, + "step": 21224 + }, + { + "epoch": 0.537110610623276, + "grad_norm": 2.568892002105713, + "learning_rate": 4.491549067922063e-06, + "loss": 0.1529, + "step": 21225 + }, + { + "epoch": 0.5371359161879697, + "grad_norm": 22.8015079498291, + "learning_rate": 4.491149628468439e-06, + "loss": 0.2481, + "step": 21226 + }, + { + "epoch": 0.5371612217526635, + "grad_norm": 3.6048004627227783, + "learning_rate": 4.490750192296296e-06, + "loss": 0.2309, + "step": 21227 + }, + { + "epoch": 0.537186527317357, + "grad_norm": 4.624448299407959, + "learning_rate": 4.490350759408213e-06, + "loss": 0.1485, + "step": 21228 + }, + { + "epoch": 0.5372118328820508, + "grad_norm": 4.480192184448242, + "learning_rate": 4.489951329806762e-06, + "loss": 0.1976, + "step": 21229 + }, + { + "epoch": 0.5372371384467445, + "grad_norm": 4.415889263153076, + "learning_rate": 4.489551903494519e-06, + "loss": 0.1585, + "step": 21230 + }, + { + "epoch": 0.5372624440114381, + "grad_norm": 3.368196725845337, + "learning_rate": 4.48915248047406e-06, + "loss": 0.1505, + "step": 21231 + }, + { + "epoch": 0.5372877495761318, + "grad_norm": 5.740541458129883, + "learning_rate": 4.488753060747964e-06, + "loss": 0.2667, + "step": 21232 + }, + { + "epoch": 0.5373130551408255, + "grad_norm": 5.468518257141113, + "learning_rate": 4.488353644318805e-06, + "loss": 0.1458, + "step": 21233 + }, + { + "epoch": 0.5373383607055191, + "grad_norm": 4.569140434265137, + "learning_rate": 4.487954231189156e-06, + "loss": 0.1216, + "step": 21234 + }, + { + "epoch": 0.5373636662702128, + "grad_norm": 3.260976791381836, + "learning_rate": 4.4875548213615945e-06, + "loss": 0.1416, + "step": 21235 + }, + { + "epoch": 0.5373889718349065, + "grad_norm": 8.510950088500977, + "learning_rate": 4.4871554148386976e-06, + "loss": 0.2054, + "step": 21236 + }, + { + "epoch": 0.5374142773996001, + "grad_norm": 26.218849182128906, + "learning_rate": 4.486756011623042e-06, + "loss": 0.363, + "step": 21237 + }, + { + "epoch": 0.5374395829642938, + "grad_norm": 3.783780336380005, + "learning_rate": 4.486356611717198e-06, + "loss": 0.1447, + "step": 21238 + }, + { + "epoch": 0.5374648885289875, + "grad_norm": 6.495538711547852, + "learning_rate": 4.485957215123747e-06, + "loss": 0.1899, + "step": 21239 + }, + { + "epoch": 0.5374901940936812, + "grad_norm": 11.317148208618164, + "learning_rate": 4.48555782184526e-06, + "loss": 0.1768, + "step": 21240 + }, + { + "epoch": 0.5375154996583749, + "grad_norm": 23.60740852355957, + "learning_rate": 4.485158431884318e-06, + "loss": 0.3687, + "step": 21241 + }, + { + "epoch": 0.5375408052230686, + "grad_norm": 6.236512660980225, + "learning_rate": 4.484759045243492e-06, + "loss": 0.1868, + "step": 21242 + }, + { + "epoch": 0.5375661107877622, + "grad_norm": 4.2278218269348145, + "learning_rate": 4.484359661925358e-06, + "loss": 0.1974, + "step": 21243 + }, + { + "epoch": 0.5375914163524559, + "grad_norm": 5.779454708099365, + "learning_rate": 4.4839602819324935e-06, + "loss": 0.2131, + "step": 21244 + }, + { + "epoch": 0.5376167219171496, + "grad_norm": 5.513617515563965, + "learning_rate": 4.483560905267474e-06, + "loss": 0.182, + "step": 21245 + }, + { + "epoch": 0.5376420274818433, + "grad_norm": 5.346068382263184, + "learning_rate": 4.483161531932874e-06, + "loss": 0.1624, + "step": 21246 + }, + { + "epoch": 0.5376673330465369, + "grad_norm": 4.827329635620117, + "learning_rate": 4.482762161931268e-06, + "loss": 0.1726, + "step": 21247 + }, + { + "epoch": 0.5376926386112306, + "grad_norm": 8.740431785583496, + "learning_rate": 4.4823627952652334e-06, + "loss": 0.1669, + "step": 21248 + }, + { + "epoch": 0.5377179441759243, + "grad_norm": 3.174574136734009, + "learning_rate": 4.481963431937344e-06, + "loss": 0.1119, + "step": 21249 + }, + { + "epoch": 0.5377432497406179, + "grad_norm": 4.488487243652344, + "learning_rate": 4.481564071950177e-06, + "loss": 0.144, + "step": 21250 + }, + { + "epoch": 0.5377685553053116, + "grad_norm": 5.78188419342041, + "learning_rate": 4.481164715306308e-06, + "loss": 0.215, + "step": 21251 + }, + { + "epoch": 0.5377938608700054, + "grad_norm": 4.36788272857666, + "learning_rate": 4.480765362008309e-06, + "loss": 0.1906, + "step": 21252 + }, + { + "epoch": 0.537819166434699, + "grad_norm": 6.2073469161987305, + "learning_rate": 4.480366012058758e-06, + "loss": 0.1748, + "step": 21253 + }, + { + "epoch": 0.5378444719993927, + "grad_norm": 5.035584449768066, + "learning_rate": 4.479966665460229e-06, + "loss": 0.2059, + "step": 21254 + }, + { + "epoch": 0.5378697775640864, + "grad_norm": 4.871366024017334, + "learning_rate": 4.4795673222153015e-06, + "loss": 0.1491, + "step": 21255 + }, + { + "epoch": 0.53789508312878, + "grad_norm": 4.337247371673584, + "learning_rate": 4.479167982326545e-06, + "loss": 0.1352, + "step": 21256 + }, + { + "epoch": 0.5379203886934737, + "grad_norm": 4.199730396270752, + "learning_rate": 4.478768645796538e-06, + "loss": 0.1985, + "step": 21257 + }, + { + "epoch": 0.5379456942581674, + "grad_norm": 16.212949752807617, + "learning_rate": 4.4783693126278545e-06, + "loss": 0.1533, + "step": 21258 + }, + { + "epoch": 0.537970999822861, + "grad_norm": 4.6270365715026855, + "learning_rate": 4.477969982823072e-06, + "loss": 0.1872, + "step": 21259 + }, + { + "epoch": 0.5379963053875547, + "grad_norm": 5.853668212890625, + "learning_rate": 4.477570656384762e-06, + "loss": 0.1943, + "step": 21260 + }, + { + "epoch": 0.5380216109522484, + "grad_norm": 6.129015922546387, + "learning_rate": 4.477171333315502e-06, + "loss": 0.3028, + "step": 21261 + }, + { + "epoch": 0.538046916516942, + "grad_norm": 10.000083923339844, + "learning_rate": 4.4767720136178666e-06, + "loss": 0.156, + "step": 21262 + }, + { + "epoch": 0.5380722220816357, + "grad_norm": 15.134622573852539, + "learning_rate": 4.476372697294432e-06, + "loss": 0.1897, + "step": 21263 + }, + { + "epoch": 0.5380975276463295, + "grad_norm": 3.5359740257263184, + "learning_rate": 4.475973384347773e-06, + "loss": 0.168, + "step": 21264 + }, + { + "epoch": 0.538122833211023, + "grad_norm": 6.2785868644714355, + "learning_rate": 4.4755740747804635e-06, + "loss": 0.2285, + "step": 21265 + }, + { + "epoch": 0.5381481387757168, + "grad_norm": 5.071849346160889, + "learning_rate": 4.475174768595078e-06, + "loss": 0.1836, + "step": 21266 + }, + { + "epoch": 0.5381734443404105, + "grad_norm": 4.52895975112915, + "learning_rate": 4.4747754657941935e-06, + "loss": 0.1918, + "step": 21267 + }, + { + "epoch": 0.5381987499051041, + "grad_norm": 4.964203834533691, + "learning_rate": 4.4743761663803864e-06, + "loss": 0.127, + "step": 21268 + }, + { + "epoch": 0.5382240554697978, + "grad_norm": 14.00129222869873, + "learning_rate": 4.473976870356227e-06, + "loss": 0.1839, + "step": 21269 + }, + { + "epoch": 0.5382493610344915, + "grad_norm": 6.418173313140869, + "learning_rate": 4.473577577724294e-06, + "loss": 0.2223, + "step": 21270 + }, + { + "epoch": 0.5382746665991852, + "grad_norm": 8.129473686218262, + "learning_rate": 4.473178288487161e-06, + "loss": 0.228, + "step": 21271 + }, + { + "epoch": 0.5382999721638788, + "grad_norm": 3.496006488800049, + "learning_rate": 4.472779002647405e-06, + "loss": 0.143, + "step": 21272 + }, + { + "epoch": 0.5383252777285725, + "grad_norm": 5.824723720550537, + "learning_rate": 4.472379720207597e-06, + "loss": 0.2248, + "step": 21273 + }, + { + "epoch": 0.5383505832932662, + "grad_norm": 6.274622917175293, + "learning_rate": 4.471980441170313e-06, + "loss": 0.2017, + "step": 21274 + }, + { + "epoch": 0.5383758888579598, + "grad_norm": 3.649906635284424, + "learning_rate": 4.471581165538132e-06, + "loss": 0.18, + "step": 21275 + }, + { + "epoch": 0.5384011944226536, + "grad_norm": 3.420815944671631, + "learning_rate": 4.471181893313623e-06, + "loss": 0.1439, + "step": 21276 + }, + { + "epoch": 0.5384264999873473, + "grad_norm": 4.530200004577637, + "learning_rate": 4.470782624499366e-06, + "loss": 0.1241, + "step": 21277 + }, + { + "epoch": 0.5384518055520409, + "grad_norm": 2.4204277992248535, + "learning_rate": 4.470383359097931e-06, + "loss": 0.1297, + "step": 21278 + }, + { + "epoch": 0.5384771111167346, + "grad_norm": 5.93000602722168, + "learning_rate": 4.469984097111897e-06, + "loss": 0.2085, + "step": 21279 + }, + { + "epoch": 0.5385024166814283, + "grad_norm": 9.934779167175293, + "learning_rate": 4.4695848385438364e-06, + "loss": 0.3172, + "step": 21280 + }, + { + "epoch": 0.5385277222461219, + "grad_norm": 4.192657947540283, + "learning_rate": 4.469185583396325e-06, + "loss": 0.1319, + "step": 21281 + }, + { + "epoch": 0.5385530278108156, + "grad_norm": 6.106687545776367, + "learning_rate": 4.468786331671937e-06, + "loss": 0.096, + "step": 21282 + }, + { + "epoch": 0.5385783333755093, + "grad_norm": 4.81730318069458, + "learning_rate": 4.468387083373246e-06, + "loss": 0.1393, + "step": 21283 + }, + { + "epoch": 0.5386036389402029, + "grad_norm": 3.329967498779297, + "learning_rate": 4.467987838502828e-06, + "loss": 0.1673, + "step": 21284 + }, + { + "epoch": 0.5386289445048966, + "grad_norm": 11.420934677124023, + "learning_rate": 4.4675885970632575e-06, + "loss": 0.2856, + "step": 21285 + }, + { + "epoch": 0.5386542500695903, + "grad_norm": 3.6352570056915283, + "learning_rate": 4.4671893590571114e-06, + "loss": 0.1737, + "step": 21286 + }, + { + "epoch": 0.5386795556342839, + "grad_norm": 5.776355266571045, + "learning_rate": 4.4667901244869596e-06, + "loss": 0.1987, + "step": 21287 + }, + { + "epoch": 0.5387048611989776, + "grad_norm": 11.856069564819336, + "learning_rate": 4.466390893355379e-06, + "loss": 0.1893, + "step": 21288 + }, + { + "epoch": 0.5387301667636714, + "grad_norm": 4.823091983795166, + "learning_rate": 4.465991665664945e-06, + "loss": 0.1898, + "step": 21289 + }, + { + "epoch": 0.538755472328365, + "grad_norm": 8.472867012023926, + "learning_rate": 4.465592441418233e-06, + "loss": 0.179, + "step": 21290 + }, + { + "epoch": 0.5387807778930587, + "grad_norm": 9.643747329711914, + "learning_rate": 4.4651932206178136e-06, + "loss": 0.2156, + "step": 21291 + }, + { + "epoch": 0.5388060834577524, + "grad_norm": 3.8408236503601074, + "learning_rate": 4.464794003266264e-06, + "loss": 0.1475, + "step": 21292 + }, + { + "epoch": 0.538831389022446, + "grad_norm": 10.502894401550293, + "learning_rate": 4.4643947893661576e-06, + "loss": 0.391, + "step": 21293 + }, + { + "epoch": 0.5388566945871397, + "grad_norm": 1.7547487020492554, + "learning_rate": 4.463995578920073e-06, + "loss": 0.0683, + "step": 21294 + }, + { + "epoch": 0.5388820001518334, + "grad_norm": 6.471179962158203, + "learning_rate": 4.463596371930578e-06, + "loss": 0.1955, + "step": 21295 + }, + { + "epoch": 0.5389073057165271, + "grad_norm": 3.474964141845703, + "learning_rate": 4.463197168400251e-06, + "loss": 0.1494, + "step": 21296 + }, + { + "epoch": 0.5389326112812207, + "grad_norm": 1.918784260749817, + "learning_rate": 4.462797968331664e-06, + "loss": 0.0855, + "step": 21297 + }, + { + "epoch": 0.5389579168459144, + "grad_norm": 7.348869323730469, + "learning_rate": 4.462398771727395e-06, + "loss": 0.2724, + "step": 21298 + }, + { + "epoch": 0.5389832224106081, + "grad_norm": 11.75280475616455, + "learning_rate": 4.4619995785900175e-06, + "loss": 0.1837, + "step": 21299 + }, + { + "epoch": 0.5390085279753017, + "grad_norm": 4.9403276443481445, + "learning_rate": 4.461600388922103e-06, + "loss": 0.1407, + "step": 21300 + }, + { + "epoch": 0.5390338335399955, + "grad_norm": 5.008004665374756, + "learning_rate": 4.461201202726226e-06, + "loss": 0.1589, + "step": 21301 + }, + { + "epoch": 0.5390591391046892, + "grad_norm": 2.394136428833008, + "learning_rate": 4.460802020004964e-06, + "loss": 0.122, + "step": 21302 + }, + { + "epoch": 0.5390844446693828, + "grad_norm": 3.072437047958374, + "learning_rate": 4.46040284076089e-06, + "loss": 0.1277, + "step": 21303 + }, + { + "epoch": 0.5391097502340765, + "grad_norm": 5.377723217010498, + "learning_rate": 4.460003664996577e-06, + "loss": 0.1738, + "step": 21304 + }, + { + "epoch": 0.5391350557987702, + "grad_norm": 3.085757255554199, + "learning_rate": 4.4596044927146e-06, + "loss": 0.1406, + "step": 21305 + }, + { + "epoch": 0.5391603613634638, + "grad_norm": 2.9841291904449463, + "learning_rate": 4.459205323917533e-06, + "loss": 0.0743, + "step": 21306 + }, + { + "epoch": 0.5391856669281575, + "grad_norm": 3.3547768592834473, + "learning_rate": 4.45880615860795e-06, + "loss": 0.1258, + "step": 21307 + }, + { + "epoch": 0.5392109724928512, + "grad_norm": 3.3687376976013184, + "learning_rate": 4.458406996788426e-06, + "loss": 0.1575, + "step": 21308 + }, + { + "epoch": 0.5392362780575448, + "grad_norm": 6.276679515838623, + "learning_rate": 4.458007838461535e-06, + "loss": 0.2095, + "step": 21309 + }, + { + "epoch": 0.5392615836222385, + "grad_norm": 4.451305866241455, + "learning_rate": 4.457608683629849e-06, + "loss": 0.1787, + "step": 21310 + }, + { + "epoch": 0.5392868891869322, + "grad_norm": 4.083111763000488, + "learning_rate": 4.457209532295944e-06, + "loss": 0.1828, + "step": 21311 + }, + { + "epoch": 0.5393121947516258, + "grad_norm": 5.8015336990356445, + "learning_rate": 4.456810384462396e-06, + "loss": 0.145, + "step": 21312 + }, + { + "epoch": 0.5393375003163196, + "grad_norm": 7.98729944229126, + "learning_rate": 4.456411240131775e-06, + "loss": 0.1265, + "step": 21313 + }, + { + "epoch": 0.5393628058810133, + "grad_norm": 8.348658561706543, + "learning_rate": 4.456012099306656e-06, + "loss": 0.174, + "step": 21314 + }, + { + "epoch": 0.5393881114457069, + "grad_norm": 13.653342247009277, + "learning_rate": 4.455612961989614e-06, + "loss": 0.2552, + "step": 21315 + }, + { + "epoch": 0.5394134170104006, + "grad_norm": 4.691012382507324, + "learning_rate": 4.455213828183223e-06, + "loss": 0.1195, + "step": 21316 + }, + { + "epoch": 0.5394387225750943, + "grad_norm": 3.2941653728485107, + "learning_rate": 4.454814697890058e-06, + "loss": 0.1414, + "step": 21317 + }, + { + "epoch": 0.5394640281397879, + "grad_norm": 4.528911113739014, + "learning_rate": 4.454415571112691e-06, + "loss": 0.1629, + "step": 21318 + }, + { + "epoch": 0.5394893337044816, + "grad_norm": 4.301901340484619, + "learning_rate": 4.454016447853696e-06, + "loss": 0.1712, + "step": 21319 + }, + { + "epoch": 0.5395146392691753, + "grad_norm": 11.127214431762695, + "learning_rate": 4.453617328115647e-06, + "loss": 0.1891, + "step": 21320 + }, + { + "epoch": 0.539539944833869, + "grad_norm": 3.323362350463867, + "learning_rate": 4.45321821190112e-06, + "loss": 0.1453, + "step": 21321 + }, + { + "epoch": 0.5395652503985626, + "grad_norm": 4.579803466796875, + "learning_rate": 4.452819099212685e-06, + "loss": 0.1529, + "step": 21322 + }, + { + "epoch": 0.5395905559632563, + "grad_norm": 5.253274917602539, + "learning_rate": 4.452419990052918e-06, + "loss": 0.1384, + "step": 21323 + }, + { + "epoch": 0.53961586152795, + "grad_norm": 4.041285991668701, + "learning_rate": 4.452020884424393e-06, + "loss": 0.1531, + "step": 21324 + }, + { + "epoch": 0.5396411670926436, + "grad_norm": 6.811646461486816, + "learning_rate": 4.451621782329686e-06, + "loss": 0.1986, + "step": 21325 + }, + { + "epoch": 0.5396664726573374, + "grad_norm": 5.006882190704346, + "learning_rate": 4.451222683771365e-06, + "loss": 0.1843, + "step": 21326 + }, + { + "epoch": 0.5396917782220311, + "grad_norm": 4.654274940490723, + "learning_rate": 4.450823588752007e-06, + "loss": 0.2437, + "step": 21327 + }, + { + "epoch": 0.5397170837867247, + "grad_norm": 2.4679477214813232, + "learning_rate": 4.4504244972741854e-06, + "loss": 0.143, + "step": 21328 + }, + { + "epoch": 0.5397423893514184, + "grad_norm": 6.847381114959717, + "learning_rate": 4.450025409340474e-06, + "loss": 0.2774, + "step": 21329 + }, + { + "epoch": 0.5397676949161121, + "grad_norm": 8.028986930847168, + "learning_rate": 4.449626324953449e-06, + "loss": 0.2228, + "step": 21330 + }, + { + "epoch": 0.5397930004808057, + "grad_norm": 3.3169946670532227, + "learning_rate": 4.449227244115679e-06, + "loss": 0.1109, + "step": 21331 + }, + { + "epoch": 0.5398183060454994, + "grad_norm": 5.297420978546143, + "learning_rate": 4.44882816682974e-06, + "loss": 0.1617, + "step": 21332 + }, + { + "epoch": 0.5398436116101931, + "grad_norm": 6.558469772338867, + "learning_rate": 4.4484290930982076e-06, + "loss": 0.2537, + "step": 21333 + }, + { + "epoch": 0.5398689171748867, + "grad_norm": 4.837091445922852, + "learning_rate": 4.4480300229236525e-06, + "loss": 0.1606, + "step": 21334 + }, + { + "epoch": 0.5398942227395804, + "grad_norm": 6.709635257720947, + "learning_rate": 4.447630956308647e-06, + "loss": 0.2673, + "step": 21335 + }, + { + "epoch": 0.5399195283042741, + "grad_norm": 5.420900344848633, + "learning_rate": 4.44723189325577e-06, + "loss": 0.1538, + "step": 21336 + }, + { + "epoch": 0.5399448338689677, + "grad_norm": 12.073785781860352, + "learning_rate": 4.44683283376759e-06, + "loss": 0.2528, + "step": 21337 + }, + { + "epoch": 0.5399701394336615, + "grad_norm": 3.629129409790039, + "learning_rate": 4.446433777846682e-06, + "loss": 0.1338, + "step": 21338 + }, + { + "epoch": 0.5399954449983552, + "grad_norm": 3.726824998855591, + "learning_rate": 4.44603472549562e-06, + "loss": 0.1828, + "step": 21339 + }, + { + "epoch": 0.5400207505630488, + "grad_norm": 8.16276741027832, + "learning_rate": 4.4456356767169775e-06, + "loss": 0.1688, + "step": 21340 + }, + { + "epoch": 0.5400460561277425, + "grad_norm": 16.173229217529297, + "learning_rate": 4.445236631513326e-06, + "loss": 0.3376, + "step": 21341 + }, + { + "epoch": 0.5400713616924362, + "grad_norm": 5.342324733734131, + "learning_rate": 4.44483758988724e-06, + "loss": 0.1655, + "step": 21342 + }, + { + "epoch": 0.5400966672571298, + "grad_norm": 3.3876473903656006, + "learning_rate": 4.444438551841297e-06, + "loss": 0.1394, + "step": 21343 + }, + { + "epoch": 0.5401219728218235, + "grad_norm": 7.988931655883789, + "learning_rate": 4.444039517378062e-06, + "loss": 0.2289, + "step": 21344 + }, + { + "epoch": 0.5401472783865172, + "grad_norm": 4.987969398498535, + "learning_rate": 4.443640486500114e-06, + "loss": 0.1771, + "step": 21345 + }, + { + "epoch": 0.5401725839512108, + "grad_norm": 5.212674617767334, + "learning_rate": 4.443241459210025e-06, + "loss": 0.1317, + "step": 21346 + }, + { + "epoch": 0.5401978895159045, + "grad_norm": 3.0645484924316406, + "learning_rate": 4.44284243551037e-06, + "loss": 0.0993, + "step": 21347 + }, + { + "epoch": 0.5402231950805982, + "grad_norm": 6.519168376922607, + "learning_rate": 4.442443415403718e-06, + "loss": 0.1341, + "step": 21348 + }, + { + "epoch": 0.540248500645292, + "grad_norm": 3.1953697204589844, + "learning_rate": 4.442044398892646e-06, + "loss": 0.1345, + "step": 21349 + }, + { + "epoch": 0.5402738062099856, + "grad_norm": 4.705191135406494, + "learning_rate": 4.441645385979725e-06, + "loss": 0.1461, + "step": 21350 + }, + { + "epoch": 0.5402991117746793, + "grad_norm": 6.362515449523926, + "learning_rate": 4.441246376667529e-06, + "loss": 0.2084, + "step": 21351 + }, + { + "epoch": 0.540324417339373, + "grad_norm": 4.1818389892578125, + "learning_rate": 4.440847370958633e-06, + "loss": 0.121, + "step": 21352 + }, + { + "epoch": 0.5403497229040666, + "grad_norm": 4.02225399017334, + "learning_rate": 4.4404483688556075e-06, + "loss": 0.1967, + "step": 21353 + }, + { + "epoch": 0.5403750284687603, + "grad_norm": 2.6514837741851807, + "learning_rate": 4.440049370361026e-06, + "loss": 0.1166, + "step": 21354 + }, + { + "epoch": 0.540400334033454, + "grad_norm": 2.94093656539917, + "learning_rate": 4.439650375477461e-06, + "loss": 0.1214, + "step": 21355 + }, + { + "epoch": 0.5404256395981476, + "grad_norm": 5.2328057289123535, + "learning_rate": 4.439251384207489e-06, + "loss": 0.2, + "step": 21356 + }, + { + "epoch": 0.5404509451628413, + "grad_norm": 10.201301574707031, + "learning_rate": 4.438852396553679e-06, + "loss": 0.1923, + "step": 21357 + }, + { + "epoch": 0.540476250727535, + "grad_norm": 9.053243637084961, + "learning_rate": 4.438453412518604e-06, + "loss": 0.1785, + "step": 21358 + }, + { + "epoch": 0.5405015562922286, + "grad_norm": 18.659177780151367, + "learning_rate": 4.4380544321048404e-06, + "loss": 0.3195, + "step": 21359 + }, + { + "epoch": 0.5405268618569223, + "grad_norm": 3.1265921592712402, + "learning_rate": 4.43765545531496e-06, + "loss": 0.1424, + "step": 21360 + }, + { + "epoch": 0.540552167421616, + "grad_norm": 3.1287527084350586, + "learning_rate": 4.437256482151534e-06, + "loss": 0.1461, + "step": 21361 + }, + { + "epoch": 0.5405774729863096, + "grad_norm": 17.14580726623535, + "learning_rate": 4.436857512617135e-06, + "loss": 0.1936, + "step": 21362 + }, + { + "epoch": 0.5406027785510034, + "grad_norm": 11.208292007446289, + "learning_rate": 4.436458546714338e-06, + "loss": 0.3112, + "step": 21363 + }, + { + "epoch": 0.5406280841156971, + "grad_norm": 3.7704315185546875, + "learning_rate": 4.436059584445716e-06, + "loss": 0.1592, + "step": 21364 + }, + { + "epoch": 0.5406533896803907, + "grad_norm": 4.0449137687683105, + "learning_rate": 4.435660625813841e-06, + "loss": 0.156, + "step": 21365 + }, + { + "epoch": 0.5406786952450844, + "grad_norm": 12.497983932495117, + "learning_rate": 4.435261670821284e-06, + "loss": 0.2657, + "step": 21366 + }, + { + "epoch": 0.5407040008097781, + "grad_norm": 2.8268003463745117, + "learning_rate": 4.4348627194706205e-06, + "loss": 0.1078, + "step": 21367 + }, + { + "epoch": 0.5407293063744717, + "grad_norm": 5.314012050628662, + "learning_rate": 4.4344637717644214e-06, + "loss": 0.1804, + "step": 21368 + }, + { + "epoch": 0.5407546119391654, + "grad_norm": 3.773184299468994, + "learning_rate": 4.434064827705261e-06, + "loss": 0.1688, + "step": 21369 + }, + { + "epoch": 0.5407799175038591, + "grad_norm": 2.1360020637512207, + "learning_rate": 4.433665887295712e-06, + "loss": 0.0684, + "step": 21370 + }, + { + "epoch": 0.5408052230685527, + "grad_norm": 4.105554103851318, + "learning_rate": 4.433266950538345e-06, + "loss": 0.1042, + "step": 21371 + }, + { + "epoch": 0.5408305286332464, + "grad_norm": 9.026437759399414, + "learning_rate": 4.432868017435734e-06, + "loss": 0.3216, + "step": 21372 + }, + { + "epoch": 0.5408558341979401, + "grad_norm": 17.960002899169922, + "learning_rate": 4.4324690879904506e-06, + "loss": 0.3286, + "step": 21373 + }, + { + "epoch": 0.5408811397626339, + "grad_norm": 5.761617183685303, + "learning_rate": 4.432070162205072e-06, + "loss": 0.1634, + "step": 21374 + }, + { + "epoch": 0.5409064453273275, + "grad_norm": 7.214019298553467, + "learning_rate": 4.431671240082164e-06, + "loss": 0.1484, + "step": 21375 + }, + { + "epoch": 0.5409317508920212, + "grad_norm": 9.561993598937988, + "learning_rate": 4.431272321624304e-06, + "loss": 0.1506, + "step": 21376 + }, + { + "epoch": 0.5409570564567149, + "grad_norm": 7.0255818367004395, + "learning_rate": 4.430873406834062e-06, + "loss": 0.1253, + "step": 21377 + }, + { + "epoch": 0.5409823620214085, + "grad_norm": 3.6266586780548096, + "learning_rate": 4.430474495714014e-06, + "loss": 0.1108, + "step": 21378 + }, + { + "epoch": 0.5410076675861022, + "grad_norm": 6.260762691497803, + "learning_rate": 4.430075588266727e-06, + "loss": 0.1713, + "step": 21379 + }, + { + "epoch": 0.5410329731507959, + "grad_norm": 5.506786823272705, + "learning_rate": 4.429676684494777e-06, + "loss": 0.1727, + "step": 21380 + }, + { + "epoch": 0.5410582787154895, + "grad_norm": 4.577568531036377, + "learning_rate": 4.429277784400736e-06, + "loss": 0.1563, + "step": 21381 + }, + { + "epoch": 0.5410835842801832, + "grad_norm": 6.942277908325195, + "learning_rate": 4.428878887987177e-06, + "loss": 0.1599, + "step": 21382 + }, + { + "epoch": 0.5411088898448769, + "grad_norm": 3.6584692001342773, + "learning_rate": 4.4284799952566734e-06, + "loss": 0.1361, + "step": 21383 + }, + { + "epoch": 0.5411341954095705, + "grad_norm": 4.571739673614502, + "learning_rate": 4.4280811062117945e-06, + "loss": 0.1752, + "step": 21384 + }, + { + "epoch": 0.5411595009742642, + "grad_norm": 4.982329368591309, + "learning_rate": 4.427682220855113e-06, + "loss": 0.1358, + "step": 21385 + }, + { + "epoch": 0.541184806538958, + "grad_norm": 5.419755458831787, + "learning_rate": 4.427283339189203e-06, + "loss": 0.1477, + "step": 21386 + }, + { + "epoch": 0.5412101121036516, + "grad_norm": 4.506187438964844, + "learning_rate": 4.426884461216639e-06, + "loss": 0.154, + "step": 21387 + }, + { + "epoch": 0.5412354176683453, + "grad_norm": 2.6605875492095947, + "learning_rate": 4.426485586939988e-06, + "loss": 0.1307, + "step": 21388 + }, + { + "epoch": 0.541260723233039, + "grad_norm": 3.564990520477295, + "learning_rate": 4.426086716361824e-06, + "loss": 0.1546, + "step": 21389 + }, + { + "epoch": 0.5412860287977326, + "grad_norm": 3.917510509490967, + "learning_rate": 4.425687849484721e-06, + "loss": 0.1457, + "step": 21390 + }, + { + "epoch": 0.5413113343624263, + "grad_norm": 2.4038922786712646, + "learning_rate": 4.42528898631125e-06, + "loss": 0.1473, + "step": 21391 + }, + { + "epoch": 0.54133663992712, + "grad_norm": 3.3406405448913574, + "learning_rate": 4.424890126843985e-06, + "loss": 0.1464, + "step": 21392 + }, + { + "epoch": 0.5413619454918136, + "grad_norm": 5.500699520111084, + "learning_rate": 4.424491271085495e-06, + "loss": 0.1839, + "step": 21393 + }, + { + "epoch": 0.5413872510565073, + "grad_norm": 2.877495765686035, + "learning_rate": 4.424092419038355e-06, + "loss": 0.1228, + "step": 21394 + }, + { + "epoch": 0.541412556621201, + "grad_norm": 7.572392463684082, + "learning_rate": 4.423693570705134e-06, + "loss": 0.2064, + "step": 21395 + }, + { + "epoch": 0.5414378621858946, + "grad_norm": 3.202672004699707, + "learning_rate": 4.423294726088407e-06, + "loss": 0.1063, + "step": 21396 + }, + { + "epoch": 0.5414631677505883, + "grad_norm": 3.8587441444396973, + "learning_rate": 4.422895885190746e-06, + "loss": 0.0924, + "step": 21397 + }, + { + "epoch": 0.541488473315282, + "grad_norm": 4.768784999847412, + "learning_rate": 4.42249704801472e-06, + "loss": 0.2092, + "step": 21398 + }, + { + "epoch": 0.5415137788799758, + "grad_norm": 10.255184173583984, + "learning_rate": 4.4220982145629034e-06, + "loss": 0.2352, + "step": 21399 + }, + { + "epoch": 0.5415390844446694, + "grad_norm": 5.9359002113342285, + "learning_rate": 4.42169938483787e-06, + "loss": 0.1172, + "step": 21400 + }, + { + "epoch": 0.5415643900093631, + "grad_norm": 6.825228214263916, + "learning_rate": 4.421300558842188e-06, + "loss": 0.2434, + "step": 21401 + }, + { + "epoch": 0.5415896955740568, + "grad_norm": 2.5312612056732178, + "learning_rate": 4.420901736578431e-06, + "loss": 0.1003, + "step": 21402 + }, + { + "epoch": 0.5416150011387504, + "grad_norm": 3.888352870941162, + "learning_rate": 4.420502918049171e-06, + "loss": 0.1373, + "step": 21403 + }, + { + "epoch": 0.5416403067034441, + "grad_norm": 3.2996485233306885, + "learning_rate": 4.42010410325698e-06, + "loss": 0.1445, + "step": 21404 + }, + { + "epoch": 0.5416656122681378, + "grad_norm": 4.885852813720703, + "learning_rate": 4.419705292204432e-06, + "loss": 0.1628, + "step": 21405 + }, + { + "epoch": 0.5416909178328314, + "grad_norm": 14.421854019165039, + "learning_rate": 4.419306484894094e-06, + "loss": 0.1337, + "step": 21406 + }, + { + "epoch": 0.5417162233975251, + "grad_norm": 6.5454182624816895, + "learning_rate": 4.41890768132854e-06, + "loss": 0.1627, + "step": 21407 + }, + { + "epoch": 0.5417415289622188, + "grad_norm": 5.706485271453857, + "learning_rate": 4.418508881510343e-06, + "loss": 0.1379, + "step": 21408 + }, + { + "epoch": 0.5417668345269124, + "grad_norm": 5.191816806793213, + "learning_rate": 4.418110085442075e-06, + "loss": 0.1216, + "step": 21409 + }, + { + "epoch": 0.5417921400916061, + "grad_norm": 3.566016435623169, + "learning_rate": 4.4177112931263064e-06, + "loss": 0.1167, + "step": 21410 + }, + { + "epoch": 0.5418174456562999, + "grad_norm": 6.488324165344238, + "learning_rate": 4.417312504565609e-06, + "loss": 0.2106, + "step": 21411 + }, + { + "epoch": 0.5418427512209935, + "grad_norm": 5.996982097625732, + "learning_rate": 4.4169137197625544e-06, + "loss": 0.1276, + "step": 21412 + }, + { + "epoch": 0.5418680567856872, + "grad_norm": 10.648134231567383, + "learning_rate": 4.416514938719715e-06, + "loss": 0.1643, + "step": 21413 + }, + { + "epoch": 0.5418933623503809, + "grad_norm": 11.402153015136719, + "learning_rate": 4.416116161439664e-06, + "loss": 0.4443, + "step": 21414 + }, + { + "epoch": 0.5419186679150745, + "grad_norm": 4.545644283294678, + "learning_rate": 4.415717387924969e-06, + "loss": 0.1605, + "step": 21415 + }, + { + "epoch": 0.5419439734797682, + "grad_norm": 11.996124267578125, + "learning_rate": 4.415318618178205e-06, + "loss": 0.3488, + "step": 21416 + }, + { + "epoch": 0.5419692790444619, + "grad_norm": 7.2701416015625, + "learning_rate": 4.4149198522019414e-06, + "loss": 0.214, + "step": 21417 + }, + { + "epoch": 0.5419945846091555, + "grad_norm": 5.837796211242676, + "learning_rate": 4.414521089998753e-06, + "loss": 0.151, + "step": 21418 + }, + { + "epoch": 0.5420198901738492, + "grad_norm": 5.614413261413574, + "learning_rate": 4.414122331571208e-06, + "loss": 0.1474, + "step": 21419 + }, + { + "epoch": 0.5420451957385429, + "grad_norm": 4.421071529388428, + "learning_rate": 4.413723576921878e-06, + "loss": 0.1363, + "step": 21420 + }, + { + "epoch": 0.5420705013032365, + "grad_norm": 3.427882432937622, + "learning_rate": 4.4133248260533375e-06, + "loss": 0.1206, + "step": 21421 + }, + { + "epoch": 0.5420958068679302, + "grad_norm": 5.3261399269104, + "learning_rate": 4.412926078968156e-06, + "loss": 0.1244, + "step": 21422 + }, + { + "epoch": 0.542121112432624, + "grad_norm": 4.4696784019470215, + "learning_rate": 4.412527335668904e-06, + "loss": 0.1697, + "step": 21423 + }, + { + "epoch": 0.5421464179973177, + "grad_norm": 4.293298244476318, + "learning_rate": 4.4121285961581525e-06, + "loss": 0.1141, + "step": 21424 + }, + { + "epoch": 0.5421717235620113, + "grad_norm": 5.947614669799805, + "learning_rate": 4.411729860438477e-06, + "loss": 0.1963, + "step": 21425 + }, + { + "epoch": 0.542197029126705, + "grad_norm": 3.329833507537842, + "learning_rate": 4.411331128512445e-06, + "loss": 0.1185, + "step": 21426 + }, + { + "epoch": 0.5422223346913987, + "grad_norm": 4.8729352951049805, + "learning_rate": 4.4109324003826295e-06, + "loss": 0.2382, + "step": 21427 + }, + { + "epoch": 0.5422476402560923, + "grad_norm": 3.440286159515381, + "learning_rate": 4.410533676051601e-06, + "loss": 0.1054, + "step": 21428 + }, + { + "epoch": 0.542272945820786, + "grad_norm": 3.590147018432617, + "learning_rate": 4.410134955521931e-06, + "loss": 0.1498, + "step": 21429 + }, + { + "epoch": 0.5422982513854797, + "grad_norm": 4.268812656402588, + "learning_rate": 4.4097362387961905e-06, + "loss": 0.1765, + "step": 21430 + }, + { + "epoch": 0.5423235569501733, + "grad_norm": 3.055429697036743, + "learning_rate": 4.4093375258769535e-06, + "loss": 0.1723, + "step": 21431 + }, + { + "epoch": 0.542348862514867, + "grad_norm": 9.22035026550293, + "learning_rate": 4.408938816766787e-06, + "loss": 0.1661, + "step": 21432 + }, + { + "epoch": 0.5423741680795607, + "grad_norm": 14.019339561462402, + "learning_rate": 4.408540111468264e-06, + "loss": 0.2035, + "step": 21433 + }, + { + "epoch": 0.5423994736442543, + "grad_norm": 10.437195777893066, + "learning_rate": 4.408141409983956e-06, + "loss": 0.1673, + "step": 21434 + }, + { + "epoch": 0.542424779208948, + "grad_norm": 8.217926979064941, + "learning_rate": 4.407742712316434e-06, + "loss": 0.1081, + "step": 21435 + }, + { + "epoch": 0.5424500847736418, + "grad_norm": 8.784281730651855, + "learning_rate": 4.407344018468271e-06, + "loss": 0.2103, + "step": 21436 + }, + { + "epoch": 0.5424753903383354, + "grad_norm": 7.615166664123535, + "learning_rate": 4.406945328442034e-06, + "loss": 0.1897, + "step": 21437 + }, + { + "epoch": 0.5425006959030291, + "grad_norm": 8.541348457336426, + "learning_rate": 4.406546642240295e-06, + "loss": 0.2067, + "step": 21438 + }, + { + "epoch": 0.5425260014677228, + "grad_norm": 1.8777093887329102, + "learning_rate": 4.406147959865628e-06, + "loss": 0.0568, + "step": 21439 + }, + { + "epoch": 0.5425513070324164, + "grad_norm": 3.821300983428955, + "learning_rate": 4.405749281320604e-06, + "loss": 0.0949, + "step": 21440 + }, + { + "epoch": 0.5425766125971101, + "grad_norm": 4.893539905548096, + "learning_rate": 4.40535060660779e-06, + "loss": 0.1876, + "step": 21441 + }, + { + "epoch": 0.5426019181618038, + "grad_norm": 19.873136520385742, + "learning_rate": 4.4049519357297595e-06, + "loss": 0.3301, + "step": 21442 + }, + { + "epoch": 0.5426272237264974, + "grad_norm": 6.191732883453369, + "learning_rate": 4.404553268689084e-06, + "loss": 0.1768, + "step": 21443 + }, + { + "epoch": 0.5426525292911911, + "grad_norm": 3.598776340484619, + "learning_rate": 4.404154605488332e-06, + "loss": 0.0925, + "step": 21444 + }, + { + "epoch": 0.5426778348558848, + "grad_norm": 3.28684663772583, + "learning_rate": 4.403755946130079e-06, + "loss": 0.1549, + "step": 21445 + }, + { + "epoch": 0.5427031404205784, + "grad_norm": 3.9598727226257324, + "learning_rate": 4.403357290616892e-06, + "loss": 0.1874, + "step": 21446 + }, + { + "epoch": 0.5427284459852721, + "grad_norm": 8.318246841430664, + "learning_rate": 4.402958638951342e-06, + "loss": 0.2288, + "step": 21447 + }, + { + "epoch": 0.5427537515499659, + "grad_norm": 4.559868335723877, + "learning_rate": 4.402559991136001e-06, + "loss": 0.1952, + "step": 21448 + }, + { + "epoch": 0.5427790571146596, + "grad_norm": 3.69189715385437, + "learning_rate": 4.402161347173441e-06, + "loss": 0.1232, + "step": 21449 + }, + { + "epoch": 0.5428043626793532, + "grad_norm": 10.034758567810059, + "learning_rate": 4.401762707066229e-06, + "loss": 0.2354, + "step": 21450 + }, + { + "epoch": 0.5428296682440469, + "grad_norm": 5.031869888305664, + "learning_rate": 4.401364070816939e-06, + "loss": 0.1425, + "step": 21451 + }, + { + "epoch": 0.5428549738087406, + "grad_norm": 13.300227165222168, + "learning_rate": 4.4009654384281405e-06, + "loss": 0.2598, + "step": 21452 + }, + { + "epoch": 0.5428802793734342, + "grad_norm": 5.89335298538208, + "learning_rate": 4.400566809902406e-06, + "loss": 0.2816, + "step": 21453 + }, + { + "epoch": 0.5429055849381279, + "grad_norm": 6.739137172698975, + "learning_rate": 4.400168185242302e-06, + "loss": 0.251, + "step": 21454 + }, + { + "epoch": 0.5429308905028216, + "grad_norm": 2.975802421569824, + "learning_rate": 4.399769564450404e-06, + "loss": 0.0986, + "step": 21455 + }, + { + "epoch": 0.5429561960675152, + "grad_norm": 7.501842975616455, + "learning_rate": 4.399370947529279e-06, + "loss": 0.1676, + "step": 21456 + }, + { + "epoch": 0.5429815016322089, + "grad_norm": 4.943583965301514, + "learning_rate": 4.398972334481499e-06, + "loss": 0.1182, + "step": 21457 + }, + { + "epoch": 0.5430068071969026, + "grad_norm": 7.468501567840576, + "learning_rate": 4.398573725309636e-06, + "loss": 0.2061, + "step": 21458 + }, + { + "epoch": 0.5430321127615962, + "grad_norm": 3.065128803253174, + "learning_rate": 4.398175120016258e-06, + "loss": 0.1017, + "step": 21459 + }, + { + "epoch": 0.54305741832629, + "grad_norm": 11.238341331481934, + "learning_rate": 4.397776518603936e-06, + "loss": 0.2542, + "step": 21460 + }, + { + "epoch": 0.5430827238909837, + "grad_norm": 5.649583339691162, + "learning_rate": 4.397377921075242e-06, + "loss": 0.1823, + "step": 21461 + }, + { + "epoch": 0.5431080294556773, + "grad_norm": 4.16081428527832, + "learning_rate": 4.396979327432748e-06, + "loss": 0.153, + "step": 21462 + }, + { + "epoch": 0.543133335020371, + "grad_norm": 4.0172529220581055, + "learning_rate": 4.39658073767902e-06, + "loss": 0.1798, + "step": 21463 + }, + { + "epoch": 0.5431586405850647, + "grad_norm": 4.748747825622559, + "learning_rate": 4.39618215181663e-06, + "loss": 0.1946, + "step": 21464 + }, + { + "epoch": 0.5431839461497583, + "grad_norm": 8.756927490234375, + "learning_rate": 4.395783569848149e-06, + "loss": 0.2267, + "step": 21465 + }, + { + "epoch": 0.543209251714452, + "grad_norm": 4.820216655731201, + "learning_rate": 4.395384991776148e-06, + "loss": 0.1606, + "step": 21466 + }, + { + "epoch": 0.5432345572791457, + "grad_norm": 7.0486063957214355, + "learning_rate": 4.394986417603197e-06, + "loss": 0.1646, + "step": 21467 + }, + { + "epoch": 0.5432598628438393, + "grad_norm": 4.837035179138184, + "learning_rate": 4.394587847331866e-06, + "loss": 0.098, + "step": 21468 + }, + { + "epoch": 0.543285168408533, + "grad_norm": 4.86814546585083, + "learning_rate": 4.394189280964724e-06, + "loss": 0.0984, + "step": 21469 + }, + { + "epoch": 0.5433104739732267, + "grad_norm": 3.593334913253784, + "learning_rate": 4.3937907185043435e-06, + "loss": 0.1686, + "step": 21470 + }, + { + "epoch": 0.5433357795379203, + "grad_norm": 8.728607177734375, + "learning_rate": 4.393392159953296e-06, + "loss": 0.1854, + "step": 21471 + }, + { + "epoch": 0.543361085102614, + "grad_norm": 9.92887020111084, + "learning_rate": 4.392993605314147e-06, + "loss": 0.2145, + "step": 21472 + }, + { + "epoch": 0.5433863906673078, + "grad_norm": 8.594250679016113, + "learning_rate": 4.39259505458947e-06, + "loss": 0.2004, + "step": 21473 + }, + { + "epoch": 0.5434116962320014, + "grad_norm": 4.894895076751709, + "learning_rate": 4.392196507781834e-06, + "loss": 0.1841, + "step": 21474 + }, + { + "epoch": 0.5434370017966951, + "grad_norm": 9.644083976745605, + "learning_rate": 4.391797964893812e-06, + "loss": 0.1666, + "step": 21475 + }, + { + "epoch": 0.5434623073613888, + "grad_norm": 3.379234790802002, + "learning_rate": 4.391399425927969e-06, + "loss": 0.0981, + "step": 21476 + }, + { + "epoch": 0.5434876129260825, + "grad_norm": 3.7431256771087646, + "learning_rate": 4.391000890886879e-06, + "loss": 0.128, + "step": 21477 + }, + { + "epoch": 0.5435129184907761, + "grad_norm": 4.662636756896973, + "learning_rate": 4.39060235977311e-06, + "loss": 0.1073, + "step": 21478 + }, + { + "epoch": 0.5435382240554698, + "grad_norm": 6.489301681518555, + "learning_rate": 4.3902038325892345e-06, + "loss": 0.1643, + "step": 21479 + }, + { + "epoch": 0.5435635296201635, + "grad_norm": 3.9430408477783203, + "learning_rate": 4.389805309337822e-06, + "loss": 0.1139, + "step": 21480 + }, + { + "epoch": 0.5435888351848571, + "grad_norm": 7.922245502471924, + "learning_rate": 4.389406790021439e-06, + "loss": 0.1986, + "step": 21481 + }, + { + "epoch": 0.5436141407495508, + "grad_norm": 7.284010887145996, + "learning_rate": 4.38900827464266e-06, + "loss": 0.2571, + "step": 21482 + }, + { + "epoch": 0.5436394463142445, + "grad_norm": 3.879492998123169, + "learning_rate": 4.388609763204052e-06, + "loss": 0.128, + "step": 21483 + }, + { + "epoch": 0.5436647518789381, + "grad_norm": 4.803304195404053, + "learning_rate": 4.388211255708186e-06, + "loss": 0.1976, + "step": 21484 + }, + { + "epoch": 0.5436900574436319, + "grad_norm": 5.561988353729248, + "learning_rate": 4.387812752157631e-06, + "loss": 0.1924, + "step": 21485 + }, + { + "epoch": 0.5437153630083256, + "grad_norm": 8.998388290405273, + "learning_rate": 4.387414252554959e-06, + "loss": 0.274, + "step": 21486 + }, + { + "epoch": 0.5437406685730192, + "grad_norm": 3.8647756576538086, + "learning_rate": 4.387015756902738e-06, + "loss": 0.1919, + "step": 21487 + }, + { + "epoch": 0.5437659741377129, + "grad_norm": 8.830196380615234, + "learning_rate": 4.386617265203538e-06, + "loss": 0.2283, + "step": 21488 + }, + { + "epoch": 0.5437912797024066, + "grad_norm": 18.007328033447266, + "learning_rate": 4.386218777459931e-06, + "loss": 0.2462, + "step": 21489 + }, + { + "epoch": 0.5438165852671002, + "grad_norm": 10.336349487304688, + "learning_rate": 4.385820293674482e-06, + "loss": 0.132, + "step": 21490 + }, + { + "epoch": 0.5438418908317939, + "grad_norm": 8.98391342163086, + "learning_rate": 4.385421813849765e-06, + "loss": 0.1513, + "step": 21491 + }, + { + "epoch": 0.5438671963964876, + "grad_norm": 3.8539793491363525, + "learning_rate": 4.385023337988348e-06, + "loss": 0.1324, + "step": 21492 + }, + { + "epoch": 0.5438925019611812, + "grad_norm": 10.726730346679688, + "learning_rate": 4.384624866092804e-06, + "loss": 0.2031, + "step": 21493 + }, + { + "epoch": 0.5439178075258749, + "grad_norm": 1.883540391921997, + "learning_rate": 4.384226398165697e-06, + "loss": 0.0753, + "step": 21494 + }, + { + "epoch": 0.5439431130905686, + "grad_norm": 6.320577144622803, + "learning_rate": 4.383827934209599e-06, + "loss": 0.207, + "step": 21495 + }, + { + "epoch": 0.5439684186552622, + "grad_norm": 3.762648105621338, + "learning_rate": 4.3834294742270804e-06, + "loss": 0.1284, + "step": 21496 + }, + { + "epoch": 0.543993724219956, + "grad_norm": 3.1364612579345703, + "learning_rate": 4.383031018220713e-06, + "loss": 0.1329, + "step": 21497 + }, + { + "epoch": 0.5440190297846497, + "grad_norm": 5.260901927947998, + "learning_rate": 4.382632566193061e-06, + "loss": 0.1671, + "step": 21498 + }, + { + "epoch": 0.5440443353493433, + "grad_norm": 5.180279731750488, + "learning_rate": 4.382234118146696e-06, + "loss": 0.1877, + "step": 21499 + }, + { + "epoch": 0.544069640914037, + "grad_norm": 15.94439697265625, + "learning_rate": 4.38183567408419e-06, + "loss": 0.1414, + "step": 21500 + }, + { + "epoch": 0.5440949464787307, + "grad_norm": 5.176083087921143, + "learning_rate": 4.38143723400811e-06, + "loss": 0.1844, + "step": 21501 + }, + { + "epoch": 0.5441202520434244, + "grad_norm": 3.9887454509735107, + "learning_rate": 4.381038797921028e-06, + "loss": 0.1462, + "step": 21502 + }, + { + "epoch": 0.544145557608118, + "grad_norm": 3.957150459289551, + "learning_rate": 4.38064036582551e-06, + "loss": 0.1195, + "step": 21503 + }, + { + "epoch": 0.5441708631728117, + "grad_norm": 9.789467811584473, + "learning_rate": 4.380241937724127e-06, + "loss": 0.1644, + "step": 21504 + }, + { + "epoch": 0.5441961687375054, + "grad_norm": 2.506389856338501, + "learning_rate": 4.379843513619448e-06, + "loss": 0.0763, + "step": 21505 + }, + { + "epoch": 0.544221474302199, + "grad_norm": 11.695792198181152, + "learning_rate": 4.379445093514046e-06, + "loss": 0.3331, + "step": 21506 + }, + { + "epoch": 0.5442467798668927, + "grad_norm": 4.22275447845459, + "learning_rate": 4.379046677410484e-06, + "loss": 0.1743, + "step": 21507 + }, + { + "epoch": 0.5442720854315864, + "grad_norm": 4.485733985900879, + "learning_rate": 4.3786482653113345e-06, + "loss": 0.2191, + "step": 21508 + }, + { + "epoch": 0.54429739099628, + "grad_norm": 4.5170159339904785, + "learning_rate": 4.378249857219167e-06, + "loss": 0.1828, + "step": 21509 + }, + { + "epoch": 0.5443226965609738, + "grad_norm": 5.475816249847412, + "learning_rate": 4.377851453136552e-06, + "loss": 0.1834, + "step": 21510 + }, + { + "epoch": 0.5443480021256675, + "grad_norm": 4.5325117111206055, + "learning_rate": 4.3774530530660566e-06, + "loss": 0.1407, + "step": 21511 + }, + { + "epoch": 0.5443733076903611, + "grad_norm": 5.077061176300049, + "learning_rate": 4.37705465701025e-06, + "loss": 0.237, + "step": 21512 + }, + { + "epoch": 0.5443986132550548, + "grad_norm": 2.732767343521118, + "learning_rate": 4.376656264971702e-06, + "loss": 0.1154, + "step": 21513 + }, + { + "epoch": 0.5444239188197485, + "grad_norm": 6.986331462860107, + "learning_rate": 4.376257876952983e-06, + "loss": 0.2093, + "step": 21514 + }, + { + "epoch": 0.5444492243844421, + "grad_norm": 6.7523040771484375, + "learning_rate": 4.37585949295666e-06, + "loss": 0.227, + "step": 21515 + }, + { + "epoch": 0.5444745299491358, + "grad_norm": 4.671630859375, + "learning_rate": 4.3754611129853046e-06, + "loss": 0.1412, + "step": 21516 + }, + { + "epoch": 0.5444998355138295, + "grad_norm": 3.142544984817505, + "learning_rate": 4.375062737041481e-06, + "loss": 0.129, + "step": 21517 + }, + { + "epoch": 0.5445251410785231, + "grad_norm": 5.9583330154418945, + "learning_rate": 4.374664365127764e-06, + "loss": 0.1956, + "step": 21518 + }, + { + "epoch": 0.5445504466432168, + "grad_norm": 6.050674915313721, + "learning_rate": 4.374265997246719e-06, + "loss": 0.2473, + "step": 21519 + }, + { + "epoch": 0.5445757522079105, + "grad_norm": 5.5225019454956055, + "learning_rate": 4.373867633400918e-06, + "loss": 0.1729, + "step": 21520 + }, + { + "epoch": 0.5446010577726041, + "grad_norm": 3.6198203563690186, + "learning_rate": 4.373469273592927e-06, + "loss": 0.1669, + "step": 21521 + }, + { + "epoch": 0.5446263633372979, + "grad_norm": 8.432963371276855, + "learning_rate": 4.373070917825316e-06, + "loss": 0.2085, + "step": 21522 + }, + { + "epoch": 0.5446516689019916, + "grad_norm": 3.619802713394165, + "learning_rate": 4.372672566100653e-06, + "loss": 0.1466, + "step": 21523 + }, + { + "epoch": 0.5446769744666852, + "grad_norm": 7.694786071777344, + "learning_rate": 4.372274218421511e-06, + "loss": 0.2038, + "step": 21524 + }, + { + "epoch": 0.5447022800313789, + "grad_norm": 8.555319786071777, + "learning_rate": 4.371875874790454e-06, + "loss": 0.2516, + "step": 21525 + }, + { + "epoch": 0.5447275855960726, + "grad_norm": 8.472990989685059, + "learning_rate": 4.371477535210051e-06, + "loss": 0.2808, + "step": 21526 + }, + { + "epoch": 0.5447528911607663, + "grad_norm": 3.854236125946045, + "learning_rate": 4.371079199682874e-06, + "loss": 0.2032, + "step": 21527 + }, + { + "epoch": 0.5447781967254599, + "grad_norm": 5.87940788269043, + "learning_rate": 4.370680868211491e-06, + "loss": 0.1357, + "step": 21528 + }, + { + "epoch": 0.5448035022901536, + "grad_norm": 10.295405387878418, + "learning_rate": 4.370282540798469e-06, + "loss": 0.3798, + "step": 21529 + }, + { + "epoch": 0.5448288078548473, + "grad_norm": 8.545612335205078, + "learning_rate": 4.369884217446377e-06, + "loss": 0.2216, + "step": 21530 + }, + { + "epoch": 0.5448541134195409, + "grad_norm": 9.384252548217773, + "learning_rate": 4.369485898157786e-06, + "loss": 0.2545, + "step": 21531 + }, + { + "epoch": 0.5448794189842346, + "grad_norm": 3.5332765579223633, + "learning_rate": 4.369087582935262e-06, + "loss": 0.1557, + "step": 21532 + }, + { + "epoch": 0.5449047245489284, + "grad_norm": 3.2032575607299805, + "learning_rate": 4.368689271781377e-06, + "loss": 0.1391, + "step": 21533 + }, + { + "epoch": 0.544930030113622, + "grad_norm": 6.740068435668945, + "learning_rate": 4.3682909646986955e-06, + "loss": 0.2709, + "step": 21534 + }, + { + "epoch": 0.5449553356783157, + "grad_norm": 5.666609287261963, + "learning_rate": 4.367892661689788e-06, + "loss": 0.1242, + "step": 21535 + }, + { + "epoch": 0.5449806412430094, + "grad_norm": 4.589119911193848, + "learning_rate": 4.367494362757223e-06, + "loss": 0.1826, + "step": 21536 + }, + { + "epoch": 0.545005946807703, + "grad_norm": 4.750465393066406, + "learning_rate": 4.367096067903571e-06, + "loss": 0.1299, + "step": 21537 + }, + { + "epoch": 0.5450312523723967, + "grad_norm": 3.533459186553955, + "learning_rate": 4.366697777131398e-06, + "loss": 0.1479, + "step": 21538 + }, + { + "epoch": 0.5450565579370904, + "grad_norm": 5.8906450271606445, + "learning_rate": 4.3662994904432715e-06, + "loss": 0.2104, + "step": 21539 + }, + { + "epoch": 0.545081863501784, + "grad_norm": 3.456399917602539, + "learning_rate": 4.365901207841763e-06, + "loss": 0.1034, + "step": 21540 + }, + { + "epoch": 0.5451071690664777, + "grad_norm": 3.466250419616699, + "learning_rate": 4.36550292932944e-06, + "loss": 0.1279, + "step": 21541 + }, + { + "epoch": 0.5451324746311714, + "grad_norm": 6.329626560211182, + "learning_rate": 4.365104654908871e-06, + "loss": 0.1683, + "step": 21542 + }, + { + "epoch": 0.545157780195865, + "grad_norm": 4.4803571701049805, + "learning_rate": 4.364706384582623e-06, + "loss": 0.1604, + "step": 21543 + }, + { + "epoch": 0.5451830857605587, + "grad_norm": 4.714873313903809, + "learning_rate": 4.364308118353265e-06, + "loss": 0.1451, + "step": 21544 + }, + { + "epoch": 0.5452083913252525, + "grad_norm": 2.9738504886627197, + "learning_rate": 4.363909856223366e-06, + "loss": 0.1322, + "step": 21545 + }, + { + "epoch": 0.545233696889946, + "grad_norm": 5.003420829772949, + "learning_rate": 4.363511598195495e-06, + "loss": 0.187, + "step": 21546 + }, + { + "epoch": 0.5452590024546398, + "grad_norm": 16.264427185058594, + "learning_rate": 4.363113344272219e-06, + "loss": 0.2354, + "step": 21547 + }, + { + "epoch": 0.5452843080193335, + "grad_norm": 5.800645351409912, + "learning_rate": 4.362715094456106e-06, + "loss": 0.1524, + "step": 21548 + }, + { + "epoch": 0.5453096135840271, + "grad_norm": 5.022220134735107, + "learning_rate": 4.362316848749724e-06, + "loss": 0.2388, + "step": 21549 + }, + { + "epoch": 0.5453349191487208, + "grad_norm": 3.563380718231201, + "learning_rate": 4.361918607155645e-06, + "loss": 0.1583, + "step": 21550 + }, + { + "epoch": 0.5453602247134145, + "grad_norm": 7.160402297973633, + "learning_rate": 4.361520369676431e-06, + "loss": 0.2241, + "step": 21551 + }, + { + "epoch": 0.5453855302781082, + "grad_norm": 7.359514236450195, + "learning_rate": 4.361122136314654e-06, + "loss": 0.2072, + "step": 21552 + }, + { + "epoch": 0.5454108358428018, + "grad_norm": 6.209575176239014, + "learning_rate": 4.360723907072882e-06, + "loss": 0.2436, + "step": 21553 + }, + { + "epoch": 0.5454361414074955, + "grad_norm": 10.522468566894531, + "learning_rate": 4.360325681953682e-06, + "loss": 0.2444, + "step": 21554 + }, + { + "epoch": 0.5454614469721892, + "grad_norm": 3.4995198249816895, + "learning_rate": 4.359927460959625e-06, + "loss": 0.1714, + "step": 21555 + }, + { + "epoch": 0.5454867525368828, + "grad_norm": 12.855287551879883, + "learning_rate": 4.3595292440932746e-06, + "loss": 0.2353, + "step": 21556 + }, + { + "epoch": 0.5455120581015765, + "grad_norm": 2.866848945617676, + "learning_rate": 4.359131031357201e-06, + "loss": 0.1126, + "step": 21557 + }, + { + "epoch": 0.5455373636662703, + "grad_norm": 12.621706008911133, + "learning_rate": 4.358732822753972e-06, + "loss": 0.2142, + "step": 21558 + }, + { + "epoch": 0.5455626692309639, + "grad_norm": 7.563612461090088, + "learning_rate": 4.3583346182861585e-06, + "loss": 0.2322, + "step": 21559 + }, + { + "epoch": 0.5455879747956576, + "grad_norm": 4.341103553771973, + "learning_rate": 4.357936417956323e-06, + "loss": 0.1664, + "step": 21560 + }, + { + "epoch": 0.5456132803603513, + "grad_norm": 3.5501770973205566, + "learning_rate": 4.357538221767036e-06, + "loss": 0.1393, + "step": 21561 + }, + { + "epoch": 0.5456385859250449, + "grad_norm": 5.817742824554443, + "learning_rate": 4.357140029720865e-06, + "loss": 0.1414, + "step": 21562 + }, + { + "epoch": 0.5456638914897386, + "grad_norm": 9.53933048248291, + "learning_rate": 4.35674184182038e-06, + "loss": 0.2676, + "step": 21563 + }, + { + "epoch": 0.5456891970544323, + "grad_norm": 3.5624938011169434, + "learning_rate": 4.356343658068149e-06, + "loss": 0.1243, + "step": 21564 + }, + { + "epoch": 0.5457145026191259, + "grad_norm": 2.3136796951293945, + "learning_rate": 4.355945478466735e-06, + "loss": 0.0787, + "step": 21565 + }, + { + "epoch": 0.5457398081838196, + "grad_norm": 4.86046838760376, + "learning_rate": 4.355547303018709e-06, + "loss": 0.2285, + "step": 21566 + }, + { + "epoch": 0.5457651137485133, + "grad_norm": 5.546893119812012, + "learning_rate": 4.3551491317266396e-06, + "loss": 0.1889, + "step": 21567 + }, + { + "epoch": 0.5457904193132069, + "grad_norm": 8.254952430725098, + "learning_rate": 4.354750964593093e-06, + "loss": 0.1908, + "step": 21568 + }, + { + "epoch": 0.5458157248779006, + "grad_norm": 6.911044597625732, + "learning_rate": 4.3543528016206375e-06, + "loss": 0.3194, + "step": 21569 + }, + { + "epoch": 0.5458410304425944, + "grad_norm": 3.738673686981201, + "learning_rate": 4.353954642811839e-06, + "loss": 0.1772, + "step": 21570 + }, + { + "epoch": 0.545866336007288, + "grad_norm": 4.203746318817139, + "learning_rate": 4.353556488169269e-06, + "loss": 0.1495, + "step": 21571 + }, + { + "epoch": 0.5458916415719817, + "grad_norm": 2.130098819732666, + "learning_rate": 4.353158337695493e-06, + "loss": 0.0654, + "step": 21572 + }, + { + "epoch": 0.5459169471366754, + "grad_norm": 5.433171272277832, + "learning_rate": 4.3527601913930785e-06, + "loss": 0.2132, + "step": 21573 + }, + { + "epoch": 0.545942252701369, + "grad_norm": 5.735752105712891, + "learning_rate": 4.352362049264593e-06, + "loss": 0.1549, + "step": 21574 + }, + { + "epoch": 0.5459675582660627, + "grad_norm": 5.03126859664917, + "learning_rate": 4.3519639113126035e-06, + "loss": 0.1454, + "step": 21575 + }, + { + "epoch": 0.5459928638307564, + "grad_norm": 4.931187152862549, + "learning_rate": 4.3515657775396775e-06, + "loss": 0.1522, + "step": 21576 + }, + { + "epoch": 0.5460181693954501, + "grad_norm": 11.688602447509766, + "learning_rate": 4.351167647948386e-06, + "loss": 0.2555, + "step": 21577 + }, + { + "epoch": 0.5460434749601437, + "grad_norm": 3.1716856956481934, + "learning_rate": 4.3507695225412914e-06, + "loss": 0.1663, + "step": 21578 + }, + { + "epoch": 0.5460687805248374, + "grad_norm": 6.582497596740723, + "learning_rate": 4.350371401320963e-06, + "loss": 0.1399, + "step": 21579 + }, + { + "epoch": 0.5460940860895311, + "grad_norm": 5.6489644050598145, + "learning_rate": 4.349973284289969e-06, + "loss": 0.2063, + "step": 21580 + }, + { + "epoch": 0.5461193916542247, + "grad_norm": 3.196913003921509, + "learning_rate": 4.349575171450879e-06, + "loss": 0.1666, + "step": 21581 + }, + { + "epoch": 0.5461446972189185, + "grad_norm": 2.9800310134887695, + "learning_rate": 4.349177062806255e-06, + "loss": 0.1611, + "step": 21582 + }, + { + "epoch": 0.5461700027836122, + "grad_norm": 4.211211204528809, + "learning_rate": 4.348778958358667e-06, + "loss": 0.1664, + "step": 21583 + }, + { + "epoch": 0.5461953083483058, + "grad_norm": 3.0979113578796387, + "learning_rate": 4.348380858110682e-06, + "loss": 0.1197, + "step": 21584 + }, + { + "epoch": 0.5462206139129995, + "grad_norm": 8.372798919677734, + "learning_rate": 4.347982762064868e-06, + "loss": 0.1868, + "step": 21585 + }, + { + "epoch": 0.5462459194776932, + "grad_norm": 9.471013069152832, + "learning_rate": 4.347584670223794e-06, + "loss": 0.1284, + "step": 21586 + }, + { + "epoch": 0.5462712250423868, + "grad_norm": 3.333256721496582, + "learning_rate": 4.347186582590023e-06, + "loss": 0.1483, + "step": 21587 + }, + { + "epoch": 0.5462965306070805, + "grad_norm": 5.936685562133789, + "learning_rate": 4.346788499166124e-06, + "loss": 0.2164, + "step": 21588 + }, + { + "epoch": 0.5463218361717742, + "grad_norm": 6.191582202911377, + "learning_rate": 4.346390419954665e-06, + "loss": 0.2262, + "step": 21589 + }, + { + "epoch": 0.5463471417364678, + "grad_norm": 4.1943039894104, + "learning_rate": 4.345992344958214e-06, + "loss": 0.1909, + "step": 21590 + }, + { + "epoch": 0.5463724473011615, + "grad_norm": 3.8146517276763916, + "learning_rate": 4.345594274179335e-06, + "loss": 0.1254, + "step": 21591 + }, + { + "epoch": 0.5463977528658552, + "grad_norm": 2.9169373512268066, + "learning_rate": 4.345196207620597e-06, + "loss": 0.129, + "step": 21592 + }, + { + "epoch": 0.5464230584305488, + "grad_norm": 6.863090515136719, + "learning_rate": 4.344798145284567e-06, + "loss": 0.2293, + "step": 21593 + }, + { + "epoch": 0.5464483639952425, + "grad_norm": 2.732872724533081, + "learning_rate": 4.344400087173813e-06, + "loss": 0.1631, + "step": 21594 + }, + { + "epoch": 0.5464736695599363, + "grad_norm": 7.25972843170166, + "learning_rate": 4.3440020332909e-06, + "loss": 0.2558, + "step": 21595 + }, + { + "epoch": 0.5464989751246299, + "grad_norm": 2.6880431175231934, + "learning_rate": 4.343603983638396e-06, + "loss": 0.1593, + "step": 21596 + }, + { + "epoch": 0.5465242806893236, + "grad_norm": 8.937033653259277, + "learning_rate": 4.343205938218867e-06, + "loss": 0.3223, + "step": 21597 + }, + { + "epoch": 0.5465495862540173, + "grad_norm": 7.506106853485107, + "learning_rate": 4.342807897034882e-06, + "loss": 0.1965, + "step": 21598 + }, + { + "epoch": 0.5465748918187109, + "grad_norm": 5.812214374542236, + "learning_rate": 4.342409860089006e-06, + "loss": 0.1828, + "step": 21599 + }, + { + "epoch": 0.5466001973834046, + "grad_norm": 6.388948440551758, + "learning_rate": 4.342011827383807e-06, + "loss": 0.1926, + "step": 21600 + }, + { + "epoch": 0.5466255029480983, + "grad_norm": 17.369646072387695, + "learning_rate": 4.341613798921851e-06, + "loss": 0.1877, + "step": 21601 + }, + { + "epoch": 0.5466508085127919, + "grad_norm": 3.6400766372680664, + "learning_rate": 4.3412157747057045e-06, + "loss": 0.1207, + "step": 21602 + }, + { + "epoch": 0.5466761140774856, + "grad_norm": 2.7853286266326904, + "learning_rate": 4.340817754737936e-06, + "loss": 0.1465, + "step": 21603 + }, + { + "epoch": 0.5467014196421793, + "grad_norm": 4.996635913848877, + "learning_rate": 4.340419739021111e-06, + "loss": 0.1837, + "step": 21604 + }, + { + "epoch": 0.546726725206873, + "grad_norm": 9.645444869995117, + "learning_rate": 4.340021727557796e-06, + "loss": 0.2025, + "step": 21605 + }, + { + "epoch": 0.5467520307715666, + "grad_norm": 3.9782655239105225, + "learning_rate": 4.339623720350558e-06, + "loss": 0.1398, + "step": 21606 + }, + { + "epoch": 0.5467773363362604, + "grad_norm": 3.6526851654052734, + "learning_rate": 4.339225717401963e-06, + "loss": 0.1735, + "step": 21607 + }, + { + "epoch": 0.5468026419009541, + "grad_norm": 3.5410120487213135, + "learning_rate": 4.338827718714581e-06, + "loss": 0.2082, + "step": 21608 + }, + { + "epoch": 0.5468279474656477, + "grad_norm": 3.699894905090332, + "learning_rate": 4.338429724290974e-06, + "loss": 0.1472, + "step": 21609 + }, + { + "epoch": 0.5468532530303414, + "grad_norm": 14.26901912689209, + "learning_rate": 4.33803173413371e-06, + "loss": 0.3344, + "step": 21610 + }, + { + "epoch": 0.5468785585950351, + "grad_norm": 4.170884609222412, + "learning_rate": 4.337633748245357e-06, + "loss": 0.1821, + "step": 21611 + }, + { + "epoch": 0.5469038641597287, + "grad_norm": 7.1918816566467285, + "learning_rate": 4.3372357666284825e-06, + "loss": 0.2548, + "step": 21612 + }, + { + "epoch": 0.5469291697244224, + "grad_norm": 4.959688186645508, + "learning_rate": 4.3368377892856495e-06, + "loss": 0.1632, + "step": 21613 + }, + { + "epoch": 0.5469544752891161, + "grad_norm": 8.248546600341797, + "learning_rate": 4.336439816219426e-06, + "loss": 0.2107, + "step": 21614 + }, + { + "epoch": 0.5469797808538097, + "grad_norm": 7.727343559265137, + "learning_rate": 4.336041847432379e-06, + "loss": 0.2306, + "step": 21615 + }, + { + "epoch": 0.5470050864185034, + "grad_norm": 4.28857946395874, + "learning_rate": 4.335643882927074e-06, + "loss": 0.1641, + "step": 21616 + }, + { + "epoch": 0.5470303919831971, + "grad_norm": 3.589683771133423, + "learning_rate": 4.335245922706081e-06, + "loss": 0.1161, + "step": 21617 + }, + { + "epoch": 0.5470556975478907, + "grad_norm": 2.3825881481170654, + "learning_rate": 4.33484796677196e-06, + "loss": 0.0826, + "step": 21618 + }, + { + "epoch": 0.5470810031125845, + "grad_norm": 13.25880241394043, + "learning_rate": 4.334450015127281e-06, + "loss": 0.2645, + "step": 21619 + }, + { + "epoch": 0.5471063086772782, + "grad_norm": 5.643587112426758, + "learning_rate": 4.334052067774611e-06, + "loss": 0.1831, + "step": 21620 + }, + { + "epoch": 0.5471316142419718, + "grad_norm": 3.347378969192505, + "learning_rate": 4.333654124716518e-06, + "loss": 0.1119, + "step": 21621 + }, + { + "epoch": 0.5471569198066655, + "grad_norm": 4.077863693237305, + "learning_rate": 4.333256185955562e-06, + "loss": 0.1615, + "step": 21622 + }, + { + "epoch": 0.5471822253713592, + "grad_norm": 18.337575912475586, + "learning_rate": 4.332858251494313e-06, + "loss": 0.1956, + "step": 21623 + }, + { + "epoch": 0.5472075309360528, + "grad_norm": 3.828310012817383, + "learning_rate": 4.332460321335338e-06, + "loss": 0.1332, + "step": 21624 + }, + { + "epoch": 0.5472328365007465, + "grad_norm": 4.385798454284668, + "learning_rate": 4.332062395481203e-06, + "loss": 0.1956, + "step": 21625 + }, + { + "epoch": 0.5472581420654402, + "grad_norm": 8.092792510986328, + "learning_rate": 4.331664473934472e-06, + "loss": 0.2211, + "step": 21626 + }, + { + "epoch": 0.5472834476301338, + "grad_norm": 6.97379732131958, + "learning_rate": 4.331266556697713e-06, + "loss": 0.171, + "step": 21627 + }, + { + "epoch": 0.5473087531948275, + "grad_norm": 4.1406450271606445, + "learning_rate": 4.330868643773491e-06, + "loss": 0.1805, + "step": 21628 + }, + { + "epoch": 0.5473340587595212, + "grad_norm": 3.3593077659606934, + "learning_rate": 4.330470735164373e-06, + "loss": 0.1168, + "step": 21629 + }, + { + "epoch": 0.547359364324215, + "grad_norm": 3.9935617446899414, + "learning_rate": 4.330072830872925e-06, + "loss": 0.2022, + "step": 21630 + }, + { + "epoch": 0.5473846698889085, + "grad_norm": 10.35119342803955, + "learning_rate": 4.329674930901713e-06, + "loss": 0.2117, + "step": 21631 + }, + { + "epoch": 0.5474099754536023, + "grad_norm": 8.645541191101074, + "learning_rate": 4.329277035253303e-06, + "loss": 0.1939, + "step": 21632 + }, + { + "epoch": 0.547435281018296, + "grad_norm": 3.93009877204895, + "learning_rate": 4.3288791439302594e-06, + "loss": 0.2069, + "step": 21633 + }, + { + "epoch": 0.5474605865829896, + "grad_norm": 6.663671016693115, + "learning_rate": 4.32848125693515e-06, + "loss": 0.2737, + "step": 21634 + }, + { + "epoch": 0.5474858921476833, + "grad_norm": 8.452552795410156, + "learning_rate": 4.32808337427054e-06, + "loss": 0.2718, + "step": 21635 + }, + { + "epoch": 0.547511197712377, + "grad_norm": 5.871293067932129, + "learning_rate": 4.327685495938995e-06, + "loss": 0.179, + "step": 21636 + }, + { + "epoch": 0.5475365032770706, + "grad_norm": 3.284233331680298, + "learning_rate": 4.327287621943081e-06, + "loss": 0.1332, + "step": 21637 + }, + { + "epoch": 0.5475618088417643, + "grad_norm": 5.844018459320068, + "learning_rate": 4.326889752285364e-06, + "loss": 0.1523, + "step": 21638 + }, + { + "epoch": 0.547587114406458, + "grad_norm": 4.646574020385742, + "learning_rate": 4.3264918869684115e-06, + "loss": 0.2313, + "step": 21639 + }, + { + "epoch": 0.5476124199711516, + "grad_norm": 3.939777374267578, + "learning_rate": 4.326094025994786e-06, + "loss": 0.1526, + "step": 21640 + }, + { + "epoch": 0.5476377255358453, + "grad_norm": 9.105733871459961, + "learning_rate": 4.325696169367054e-06, + "loss": 0.1817, + "step": 21641 + }, + { + "epoch": 0.547663031100539, + "grad_norm": 27.331174850463867, + "learning_rate": 4.325298317087783e-06, + "loss": 0.4268, + "step": 21642 + }, + { + "epoch": 0.5476883366652326, + "grad_norm": 6.235955715179443, + "learning_rate": 4.3249004691595384e-06, + "loss": 0.2971, + "step": 21643 + }, + { + "epoch": 0.5477136422299264, + "grad_norm": 8.945738792419434, + "learning_rate": 4.324502625584884e-06, + "loss": 0.1825, + "step": 21644 + }, + { + "epoch": 0.5477389477946201, + "grad_norm": 2.95670747756958, + "learning_rate": 4.324104786366387e-06, + "loss": 0.1326, + "step": 21645 + }, + { + "epoch": 0.5477642533593137, + "grad_norm": 4.06323766708374, + "learning_rate": 4.323706951506611e-06, + "loss": 0.1872, + "step": 21646 + }, + { + "epoch": 0.5477895589240074, + "grad_norm": 7.255110263824463, + "learning_rate": 4.323309121008124e-06, + "loss": 0.2031, + "step": 21647 + }, + { + "epoch": 0.5478148644887011, + "grad_norm": 6.196984767913818, + "learning_rate": 4.322911294873493e-06, + "loss": 0.2298, + "step": 21648 + }, + { + "epoch": 0.5478401700533947, + "grad_norm": 7.592553615570068, + "learning_rate": 4.322513473105278e-06, + "loss": 0.2612, + "step": 21649 + }, + { + "epoch": 0.5478654756180884, + "grad_norm": 4.03931188583374, + "learning_rate": 4.322115655706049e-06, + "loss": 0.1913, + "step": 21650 + }, + { + "epoch": 0.5478907811827821, + "grad_norm": 2.762772798538208, + "learning_rate": 4.321717842678368e-06, + "loss": 0.1279, + "step": 21651 + }, + { + "epoch": 0.5479160867474757, + "grad_norm": 5.0717597007751465, + "learning_rate": 4.321320034024806e-06, + "loss": 0.1471, + "step": 21652 + }, + { + "epoch": 0.5479413923121694, + "grad_norm": 2.857201337814331, + "learning_rate": 4.320922229747923e-06, + "loss": 0.1413, + "step": 21653 + }, + { + "epoch": 0.5479666978768631, + "grad_norm": 3.8153634071350098, + "learning_rate": 4.3205244298502855e-06, + "loss": 0.1475, + "step": 21654 + }, + { + "epoch": 0.5479920034415569, + "grad_norm": 7.682955741882324, + "learning_rate": 4.320126634334461e-06, + "loss": 0.1411, + "step": 21655 + }, + { + "epoch": 0.5480173090062505, + "grad_norm": 5.704814910888672, + "learning_rate": 4.3197288432030135e-06, + "loss": 0.1676, + "step": 21656 + }, + { + "epoch": 0.5480426145709442, + "grad_norm": 4.859372615814209, + "learning_rate": 4.319331056458507e-06, + "loss": 0.1328, + "step": 21657 + }, + { + "epoch": 0.5480679201356379, + "grad_norm": 7.091047763824463, + "learning_rate": 4.318933274103507e-06, + "loss": 0.2591, + "step": 21658 + }, + { + "epoch": 0.5480932257003315, + "grad_norm": 3.1625945568084717, + "learning_rate": 4.3185354961405815e-06, + "loss": 0.1335, + "step": 21659 + }, + { + "epoch": 0.5481185312650252, + "grad_norm": 5.929797649383545, + "learning_rate": 4.318137722572293e-06, + "loss": 0.195, + "step": 21660 + }, + { + "epoch": 0.5481438368297189, + "grad_norm": 4.673656940460205, + "learning_rate": 4.317739953401208e-06, + "loss": 0.1842, + "step": 21661 + }, + { + "epoch": 0.5481691423944125, + "grad_norm": 4.718060493469238, + "learning_rate": 4.317342188629891e-06, + "loss": 0.1607, + "step": 21662 + }, + { + "epoch": 0.5481944479591062, + "grad_norm": 4.84006404876709, + "learning_rate": 4.316944428260907e-06, + "loss": 0.1928, + "step": 21663 + }, + { + "epoch": 0.5482197535237999, + "grad_norm": 3.427400827407837, + "learning_rate": 4.316546672296821e-06, + "loss": 0.145, + "step": 21664 + }, + { + "epoch": 0.5482450590884935, + "grad_norm": 5.536862850189209, + "learning_rate": 4.3161489207402006e-06, + "loss": 0.1475, + "step": 21665 + }, + { + "epoch": 0.5482703646531872, + "grad_norm": 4.100809574127197, + "learning_rate": 4.315751173593607e-06, + "loss": 0.1608, + "step": 21666 + }, + { + "epoch": 0.548295670217881, + "grad_norm": 5.477297782897949, + "learning_rate": 4.3153534308596055e-06, + "loss": 0.1277, + "step": 21667 + }, + { + "epoch": 0.5483209757825745, + "grad_norm": 3.279020071029663, + "learning_rate": 4.314955692540763e-06, + "loss": 0.1543, + "step": 21668 + }, + { + "epoch": 0.5483462813472683, + "grad_norm": 2.8621957302093506, + "learning_rate": 4.314557958639643e-06, + "loss": 0.1403, + "step": 21669 + }, + { + "epoch": 0.548371586911962, + "grad_norm": 2.4813148975372314, + "learning_rate": 4.314160229158815e-06, + "loss": 0.1119, + "step": 21670 + }, + { + "epoch": 0.5483968924766556, + "grad_norm": 2.3922834396362305, + "learning_rate": 4.313762504100836e-06, + "loss": 0.094, + "step": 21671 + }, + { + "epoch": 0.5484221980413493, + "grad_norm": 2.516228437423706, + "learning_rate": 4.313364783468277e-06, + "loss": 0.1167, + "step": 21672 + }, + { + "epoch": 0.548447503606043, + "grad_norm": 5.425290584564209, + "learning_rate": 4.312967067263699e-06, + "loss": 0.1803, + "step": 21673 + }, + { + "epoch": 0.5484728091707366, + "grad_norm": 7.09862756729126, + "learning_rate": 4.312569355489672e-06, + "loss": 0.1768, + "step": 21674 + }, + { + "epoch": 0.5484981147354303, + "grad_norm": 15.851243019104004, + "learning_rate": 4.312171648148755e-06, + "loss": 0.1594, + "step": 21675 + }, + { + "epoch": 0.548523420300124, + "grad_norm": 10.594661712646484, + "learning_rate": 4.311773945243515e-06, + "loss": 0.3939, + "step": 21676 + }, + { + "epoch": 0.5485487258648176, + "grad_norm": 6.160877704620361, + "learning_rate": 4.311376246776516e-06, + "loss": 0.1289, + "step": 21677 + }, + { + "epoch": 0.5485740314295113, + "grad_norm": 2.1232433319091797, + "learning_rate": 4.310978552750326e-06, + "loss": 0.0672, + "step": 21678 + }, + { + "epoch": 0.548599336994205, + "grad_norm": 4.623328685760498, + "learning_rate": 4.3105808631675054e-06, + "loss": 0.1323, + "step": 21679 + }, + { + "epoch": 0.5486246425588988, + "grad_norm": 3.9999501705169678, + "learning_rate": 4.310183178030621e-06, + "loss": 0.1774, + "step": 21680 + }, + { + "epoch": 0.5486499481235924, + "grad_norm": 2.9121198654174805, + "learning_rate": 4.309785497342236e-06, + "loss": 0.132, + "step": 21681 + }, + { + "epoch": 0.5486752536882861, + "grad_norm": 4.4789347648620605, + "learning_rate": 4.309387821104916e-06, + "loss": 0.1816, + "step": 21682 + }, + { + "epoch": 0.5487005592529798, + "grad_norm": 4.997365951538086, + "learning_rate": 4.308990149321228e-06, + "loss": 0.223, + "step": 21683 + }, + { + "epoch": 0.5487258648176734, + "grad_norm": 5.041409015655518, + "learning_rate": 4.308592481993732e-06, + "loss": 0.2167, + "step": 21684 + }, + { + "epoch": 0.5487511703823671, + "grad_norm": 3.61171293258667, + "learning_rate": 4.308194819124995e-06, + "loss": 0.1086, + "step": 21685 + }, + { + "epoch": 0.5487764759470608, + "grad_norm": 5.757104396820068, + "learning_rate": 4.307797160717581e-06, + "loss": 0.1779, + "step": 21686 + }, + { + "epoch": 0.5488017815117544, + "grad_norm": 2.7356131076812744, + "learning_rate": 4.3073995067740544e-06, + "loss": 0.098, + "step": 21687 + }, + { + "epoch": 0.5488270870764481, + "grad_norm": 10.415918350219727, + "learning_rate": 4.307001857296979e-06, + "loss": 0.2337, + "step": 21688 + }, + { + "epoch": 0.5488523926411418, + "grad_norm": 4.1281938552856445, + "learning_rate": 4.30660421228892e-06, + "loss": 0.1966, + "step": 21689 + }, + { + "epoch": 0.5488776982058354, + "grad_norm": 9.723165512084961, + "learning_rate": 4.306206571752441e-06, + "loss": 0.2918, + "step": 21690 + }, + { + "epoch": 0.5489030037705291, + "grad_norm": 6.932912826538086, + "learning_rate": 4.305808935690107e-06, + "loss": 0.1758, + "step": 21691 + }, + { + "epoch": 0.5489283093352229, + "grad_norm": 3.258758783340454, + "learning_rate": 4.305411304104483e-06, + "loss": 0.1019, + "step": 21692 + }, + { + "epoch": 0.5489536148999165, + "grad_norm": 3.187934160232544, + "learning_rate": 4.305013676998132e-06, + "loss": 0.1124, + "step": 21693 + }, + { + "epoch": 0.5489789204646102, + "grad_norm": 9.073705673217773, + "learning_rate": 4.304616054373618e-06, + "loss": 0.2019, + "step": 21694 + }, + { + "epoch": 0.5490042260293039, + "grad_norm": 9.338476181030273, + "learning_rate": 4.304218436233505e-06, + "loss": 0.1644, + "step": 21695 + }, + { + "epoch": 0.5490295315939975, + "grad_norm": 14.226385116577148, + "learning_rate": 4.3038208225803604e-06, + "loss": 0.1935, + "step": 21696 + }, + { + "epoch": 0.5490548371586912, + "grad_norm": 6.235128402709961, + "learning_rate": 4.303423213416743e-06, + "loss": 0.2217, + "step": 21697 + }, + { + "epoch": 0.5490801427233849, + "grad_norm": 9.86736011505127, + "learning_rate": 4.30302560874522e-06, + "loss": 0.286, + "step": 21698 + }, + { + "epoch": 0.5491054482880785, + "grad_norm": 14.113930702209473, + "learning_rate": 4.302628008568356e-06, + "loss": 0.2127, + "step": 21699 + }, + { + "epoch": 0.5491307538527722, + "grad_norm": 3.434936046600342, + "learning_rate": 4.302230412888714e-06, + "loss": 0.1513, + "step": 21700 + }, + { + "epoch": 0.5491560594174659, + "grad_norm": 3.848895788192749, + "learning_rate": 4.301832821708859e-06, + "loss": 0.1703, + "step": 21701 + }, + { + "epoch": 0.5491813649821595, + "grad_norm": 13.871561050415039, + "learning_rate": 4.301435235031354e-06, + "loss": 0.2091, + "step": 21702 + }, + { + "epoch": 0.5492066705468532, + "grad_norm": 4.360483646392822, + "learning_rate": 4.3010376528587624e-06, + "loss": 0.1633, + "step": 21703 + }, + { + "epoch": 0.549231976111547, + "grad_norm": 5.855533599853516, + "learning_rate": 4.300640075193649e-06, + "loss": 0.1759, + "step": 21704 + }, + { + "epoch": 0.5492572816762407, + "grad_norm": 6.286734104156494, + "learning_rate": 4.30024250203858e-06, + "loss": 0.18, + "step": 21705 + }, + { + "epoch": 0.5492825872409343, + "grad_norm": 6.45097017288208, + "learning_rate": 4.299844933396114e-06, + "loss": 0.2228, + "step": 21706 + }, + { + "epoch": 0.549307892805628, + "grad_norm": 11.027441024780273, + "learning_rate": 4.299447369268819e-06, + "loss": 0.1661, + "step": 21707 + }, + { + "epoch": 0.5493331983703217, + "grad_norm": 2.5063440799713135, + "learning_rate": 4.299049809659257e-06, + "loss": 0.1312, + "step": 21708 + }, + { + "epoch": 0.5493585039350153, + "grad_norm": 12.365141868591309, + "learning_rate": 4.2986522545699946e-06, + "loss": 0.2594, + "step": 21709 + }, + { + "epoch": 0.549383809499709, + "grad_norm": 4.801034927368164, + "learning_rate": 4.2982547040035914e-06, + "loss": 0.1592, + "step": 21710 + }, + { + "epoch": 0.5494091150644027, + "grad_norm": 8.460542678833008, + "learning_rate": 4.297857157962613e-06, + "loss": 0.2048, + "step": 21711 + }, + { + "epoch": 0.5494344206290963, + "grad_norm": 14.240771293640137, + "learning_rate": 4.297459616449622e-06, + "loss": 0.1947, + "step": 21712 + }, + { + "epoch": 0.54945972619379, + "grad_norm": 6.115180015563965, + "learning_rate": 4.297062079467186e-06, + "loss": 0.2154, + "step": 21713 + }, + { + "epoch": 0.5494850317584837, + "grad_norm": 3.210773229598999, + "learning_rate": 4.296664547017865e-06, + "loss": 0.0652, + "step": 21714 + }, + { + "epoch": 0.5495103373231773, + "grad_norm": 2.8996355533599854, + "learning_rate": 4.296267019104223e-06, + "loss": 0.1062, + "step": 21715 + }, + { + "epoch": 0.549535642887871, + "grad_norm": 3.8473150730133057, + "learning_rate": 4.295869495728825e-06, + "loss": 0.1297, + "step": 21716 + }, + { + "epoch": 0.5495609484525648, + "grad_norm": 3.8774611949920654, + "learning_rate": 4.295471976894232e-06, + "loss": 0.1262, + "step": 21717 + }, + { + "epoch": 0.5495862540172584, + "grad_norm": 3.2608890533447266, + "learning_rate": 4.295074462603011e-06, + "loss": 0.14, + "step": 21718 + }, + { + "epoch": 0.5496115595819521, + "grad_norm": 3.805182456970215, + "learning_rate": 4.294676952857722e-06, + "loss": 0.1374, + "step": 21719 + }, + { + "epoch": 0.5496368651466458, + "grad_norm": 5.674006462097168, + "learning_rate": 4.2942794476609315e-06, + "loss": 0.2432, + "step": 21720 + }, + { + "epoch": 0.5496621707113394, + "grad_norm": 9.43283748626709, + "learning_rate": 4.2938819470152e-06, + "loss": 0.2375, + "step": 21721 + }, + { + "epoch": 0.5496874762760331, + "grad_norm": 6.224647045135498, + "learning_rate": 4.293484450923093e-06, + "loss": 0.1591, + "step": 21722 + }, + { + "epoch": 0.5497127818407268, + "grad_norm": 6.575437068939209, + "learning_rate": 4.293086959387176e-06, + "loss": 0.1851, + "step": 21723 + }, + { + "epoch": 0.5497380874054204, + "grad_norm": 5.334054470062256, + "learning_rate": 4.292689472410007e-06, + "loss": 0.1596, + "step": 21724 + }, + { + "epoch": 0.5497633929701141, + "grad_norm": 3.3147976398468018, + "learning_rate": 4.292291989994152e-06, + "loss": 0.1167, + "step": 21725 + }, + { + "epoch": 0.5497886985348078, + "grad_norm": 3.849459648132324, + "learning_rate": 4.291894512142174e-06, + "loss": 0.1252, + "step": 21726 + }, + { + "epoch": 0.5498140040995014, + "grad_norm": 3.55720853805542, + "learning_rate": 4.291497038856639e-06, + "loss": 0.1309, + "step": 21727 + }, + { + "epoch": 0.5498393096641951, + "grad_norm": 3.325403928756714, + "learning_rate": 4.291099570140106e-06, + "loss": 0.1293, + "step": 21728 + }, + { + "epoch": 0.5498646152288889, + "grad_norm": 3.4217276573181152, + "learning_rate": 4.290702105995139e-06, + "loss": 0.1231, + "step": 21729 + }, + { + "epoch": 0.5498899207935825, + "grad_norm": 5.788854122161865, + "learning_rate": 4.290304646424303e-06, + "loss": 0.1963, + "step": 21730 + }, + { + "epoch": 0.5499152263582762, + "grad_norm": 9.395684242248535, + "learning_rate": 4.289907191430163e-06, + "loss": 0.1575, + "step": 21731 + }, + { + "epoch": 0.5499405319229699, + "grad_norm": 3.8847031593322754, + "learning_rate": 4.289509741015276e-06, + "loss": 0.1529, + "step": 21732 + }, + { + "epoch": 0.5499658374876636, + "grad_norm": 15.713996887207031, + "learning_rate": 4.28911229518221e-06, + "loss": 0.2756, + "step": 21733 + }, + { + "epoch": 0.5499911430523572, + "grad_norm": 3.128129720687866, + "learning_rate": 4.288714853933525e-06, + "loss": 0.1301, + "step": 21734 + }, + { + "epoch": 0.5500164486170509, + "grad_norm": 6.808043003082275, + "learning_rate": 4.288317417271786e-06, + "loss": 0.143, + "step": 21735 + }, + { + "epoch": 0.5500417541817446, + "grad_norm": 4.988117218017578, + "learning_rate": 4.2879199851995585e-06, + "loss": 0.1672, + "step": 21736 + }, + { + "epoch": 0.5500670597464382, + "grad_norm": 3.948777198791504, + "learning_rate": 4.2875225577194e-06, + "loss": 0.1128, + "step": 21737 + }, + { + "epoch": 0.5500923653111319, + "grad_norm": 3.044524669647217, + "learning_rate": 4.287125134833876e-06, + "loss": 0.0903, + "step": 21738 + }, + { + "epoch": 0.5501176708758256, + "grad_norm": 2.3583242893218994, + "learning_rate": 4.286727716545549e-06, + "loss": 0.0584, + "step": 21739 + }, + { + "epoch": 0.5501429764405192, + "grad_norm": 10.531797409057617, + "learning_rate": 4.286330302856985e-06, + "loss": 0.2132, + "step": 21740 + }, + { + "epoch": 0.550168282005213, + "grad_norm": 7.553640365600586, + "learning_rate": 4.285932893770742e-06, + "loss": 0.1565, + "step": 21741 + }, + { + "epoch": 0.5501935875699067, + "grad_norm": 8.403891563415527, + "learning_rate": 4.285535489289384e-06, + "loss": 0.2774, + "step": 21742 + }, + { + "epoch": 0.5502188931346003, + "grad_norm": 8.583338737487793, + "learning_rate": 4.285138089415476e-06, + "loss": 0.1741, + "step": 21743 + }, + { + "epoch": 0.550244198699294, + "grad_norm": 8.968277931213379, + "learning_rate": 4.28474069415158e-06, + "loss": 0.151, + "step": 21744 + }, + { + "epoch": 0.5502695042639877, + "grad_norm": 4.102815628051758, + "learning_rate": 4.284343303500258e-06, + "loss": 0.1756, + "step": 21745 + }, + { + "epoch": 0.5502948098286813, + "grad_norm": 4.447151184082031, + "learning_rate": 4.283945917464072e-06, + "loss": 0.1375, + "step": 21746 + }, + { + "epoch": 0.550320115393375, + "grad_norm": 5.775588512420654, + "learning_rate": 4.283548536045587e-06, + "loss": 0.1151, + "step": 21747 + }, + { + "epoch": 0.5503454209580687, + "grad_norm": 7.489908695220947, + "learning_rate": 4.283151159247363e-06, + "loss": 0.2036, + "step": 21748 + }, + { + "epoch": 0.5503707265227623, + "grad_norm": 5.978006839752197, + "learning_rate": 4.282753787071964e-06, + "loss": 0.1267, + "step": 21749 + }, + { + "epoch": 0.550396032087456, + "grad_norm": 8.695639610290527, + "learning_rate": 4.282356419521954e-06, + "loss": 0.2205, + "step": 21750 + }, + { + "epoch": 0.5504213376521497, + "grad_norm": 8.43703556060791, + "learning_rate": 4.281959056599891e-06, + "loss": 0.1704, + "step": 21751 + }, + { + "epoch": 0.5504466432168433, + "grad_norm": 4.041365623474121, + "learning_rate": 4.281561698308342e-06, + "loss": 0.1788, + "step": 21752 + }, + { + "epoch": 0.550471948781537, + "grad_norm": 4.18111515045166, + "learning_rate": 4.28116434464987e-06, + "loss": 0.1631, + "step": 21753 + }, + { + "epoch": 0.5504972543462308, + "grad_norm": 4.658429145812988, + "learning_rate": 4.280766995627033e-06, + "loss": 0.1796, + "step": 21754 + }, + { + "epoch": 0.5505225599109244, + "grad_norm": 4.323114395141602, + "learning_rate": 4.280369651242395e-06, + "loss": 0.1687, + "step": 21755 + }, + { + "epoch": 0.5505478654756181, + "grad_norm": 4.091150283813477, + "learning_rate": 4.27997231149852e-06, + "loss": 0.1645, + "step": 21756 + }, + { + "epoch": 0.5505731710403118, + "grad_norm": 5.504438400268555, + "learning_rate": 4.27957497639797e-06, + "loss": 0.1935, + "step": 21757 + }, + { + "epoch": 0.5505984766050055, + "grad_norm": 7.799999237060547, + "learning_rate": 4.279177645943309e-06, + "loss": 0.2148, + "step": 21758 + }, + { + "epoch": 0.5506237821696991, + "grad_norm": 22.658288955688477, + "learning_rate": 4.278780320137095e-06, + "loss": 0.233, + "step": 21759 + }, + { + "epoch": 0.5506490877343928, + "grad_norm": 3.6041409969329834, + "learning_rate": 4.278382998981892e-06, + "loss": 0.1316, + "step": 21760 + }, + { + "epoch": 0.5506743932990865, + "grad_norm": 3.784918785095215, + "learning_rate": 4.277985682480263e-06, + "loss": 0.1197, + "step": 21761 + }, + { + "epoch": 0.5506996988637801, + "grad_norm": 4.210082530975342, + "learning_rate": 4.2775883706347736e-06, + "loss": 0.157, + "step": 21762 + }, + { + "epoch": 0.5507250044284738, + "grad_norm": 2.923081159591675, + "learning_rate": 4.2771910634479786e-06, + "loss": 0.1467, + "step": 21763 + }, + { + "epoch": 0.5507503099931675, + "grad_norm": 6.007880210876465, + "learning_rate": 4.276793760922445e-06, + "loss": 0.1575, + "step": 21764 + }, + { + "epoch": 0.5507756155578611, + "grad_norm": 3.9527158737182617, + "learning_rate": 4.276396463060733e-06, + "loss": 0.1876, + "step": 21765 + }, + { + "epoch": 0.5508009211225549, + "grad_norm": 8.950017929077148, + "learning_rate": 4.275999169865406e-06, + "loss": 0.2407, + "step": 21766 + }, + { + "epoch": 0.5508262266872486, + "grad_norm": 2.850245714187622, + "learning_rate": 4.275601881339028e-06, + "loss": 0.1485, + "step": 21767 + }, + { + "epoch": 0.5508515322519422, + "grad_norm": 9.847478866577148, + "learning_rate": 4.275204597484156e-06, + "loss": 0.2425, + "step": 21768 + }, + { + "epoch": 0.5508768378166359, + "grad_norm": 4.558393955230713, + "learning_rate": 4.274807318303355e-06, + "loss": 0.1758, + "step": 21769 + }, + { + "epoch": 0.5509021433813296, + "grad_norm": 8.698664665222168, + "learning_rate": 4.274410043799188e-06, + "loss": 0.2496, + "step": 21770 + }, + { + "epoch": 0.5509274489460232, + "grad_norm": 7.487011432647705, + "learning_rate": 4.274012773974216e-06, + "loss": 0.1843, + "step": 21771 + }, + { + "epoch": 0.5509527545107169, + "grad_norm": 8.992616653442383, + "learning_rate": 4.2736155088309994e-06, + "loss": 0.2843, + "step": 21772 + }, + { + "epoch": 0.5509780600754106, + "grad_norm": 5.552587509155273, + "learning_rate": 4.273218248372101e-06, + "loss": 0.1246, + "step": 21773 + }, + { + "epoch": 0.5510033656401042, + "grad_norm": 5.3514299392700195, + "learning_rate": 4.272820992600084e-06, + "loss": 0.1684, + "step": 21774 + }, + { + "epoch": 0.5510286712047979, + "grad_norm": 4.3459343910217285, + "learning_rate": 4.272423741517507e-06, + "loss": 0.1849, + "step": 21775 + }, + { + "epoch": 0.5510539767694916, + "grad_norm": 12.975500106811523, + "learning_rate": 4.272026495126936e-06, + "loss": 0.2066, + "step": 21776 + }, + { + "epoch": 0.5510792823341852, + "grad_norm": 3.129926919937134, + "learning_rate": 4.2716292534309314e-06, + "loss": 0.1492, + "step": 21777 + }, + { + "epoch": 0.551104587898879, + "grad_norm": 3.493938684463501, + "learning_rate": 4.271232016432053e-06, + "loss": 0.1346, + "step": 21778 + }, + { + "epoch": 0.5511298934635727, + "grad_norm": 13.37118148803711, + "learning_rate": 4.270834784132863e-06, + "loss": 0.2409, + "step": 21779 + }, + { + "epoch": 0.5511551990282663, + "grad_norm": 15.120165824890137, + "learning_rate": 4.2704375565359255e-06, + "loss": 0.264, + "step": 21780 + }, + { + "epoch": 0.55118050459296, + "grad_norm": 10.136897087097168, + "learning_rate": 4.2700403336438e-06, + "loss": 0.214, + "step": 21781 + }, + { + "epoch": 0.5512058101576537, + "grad_norm": 8.851903915405273, + "learning_rate": 4.269643115459048e-06, + "loss": 0.3394, + "step": 21782 + }, + { + "epoch": 0.5512311157223474, + "grad_norm": 3.3427884578704834, + "learning_rate": 4.269245901984231e-06, + "loss": 0.132, + "step": 21783 + }, + { + "epoch": 0.551256421287041, + "grad_norm": 3.8279242515563965, + "learning_rate": 4.268848693221914e-06, + "loss": 0.1938, + "step": 21784 + }, + { + "epoch": 0.5512817268517347, + "grad_norm": 6.219917297363281, + "learning_rate": 4.2684514891746535e-06, + "loss": 0.1974, + "step": 21785 + }, + { + "epoch": 0.5513070324164284, + "grad_norm": 3.3035361766815186, + "learning_rate": 4.268054289845013e-06, + "loss": 0.1145, + "step": 21786 + }, + { + "epoch": 0.551332337981122, + "grad_norm": 4.09775447845459, + "learning_rate": 4.2676570952355545e-06, + "loss": 0.1573, + "step": 21787 + }, + { + "epoch": 0.5513576435458157, + "grad_norm": 10.20113468170166, + "learning_rate": 4.26725990534884e-06, + "loss": 0.1002, + "step": 21788 + }, + { + "epoch": 0.5513829491105094, + "grad_norm": 4.418456077575684, + "learning_rate": 4.266862720187431e-06, + "loss": 0.1471, + "step": 21789 + }, + { + "epoch": 0.551408254675203, + "grad_norm": 4.052206516265869, + "learning_rate": 4.266465539753886e-06, + "loss": 0.1592, + "step": 21790 + }, + { + "epoch": 0.5514335602398968, + "grad_norm": 2.6934432983398438, + "learning_rate": 4.266068364050768e-06, + "loss": 0.1497, + "step": 21791 + }, + { + "epoch": 0.5514588658045905, + "grad_norm": 8.419662475585938, + "learning_rate": 4.26567119308064e-06, + "loss": 0.1719, + "step": 21792 + }, + { + "epoch": 0.5514841713692841, + "grad_norm": 5.203794479370117, + "learning_rate": 4.265274026846062e-06, + "loss": 0.1915, + "step": 21793 + }, + { + "epoch": 0.5515094769339778, + "grad_norm": 14.203680038452148, + "learning_rate": 4.264876865349594e-06, + "loss": 0.2967, + "step": 21794 + }, + { + "epoch": 0.5515347824986715, + "grad_norm": 10.021963119506836, + "learning_rate": 4.264479708593799e-06, + "loss": 0.2662, + "step": 21795 + }, + { + "epoch": 0.5515600880633651, + "grad_norm": 3.418412923812866, + "learning_rate": 4.264082556581237e-06, + "loss": 0.1607, + "step": 21796 + }, + { + "epoch": 0.5515853936280588, + "grad_norm": 3.167898654937744, + "learning_rate": 4.26368540931447e-06, + "loss": 0.1115, + "step": 21797 + }, + { + "epoch": 0.5516106991927525, + "grad_norm": 3.410451889038086, + "learning_rate": 4.26328826679606e-06, + "loss": 0.1588, + "step": 21798 + }, + { + "epoch": 0.5516360047574461, + "grad_norm": 4.126551628112793, + "learning_rate": 4.262891129028566e-06, + "loss": 0.1186, + "step": 21799 + }, + { + "epoch": 0.5516613103221398, + "grad_norm": 4.526174068450928, + "learning_rate": 4.262493996014549e-06, + "loss": 0.1765, + "step": 21800 + }, + { + "epoch": 0.5516866158868335, + "grad_norm": 6.367304801940918, + "learning_rate": 4.262096867756572e-06, + "loss": 0.1647, + "step": 21801 + }, + { + "epoch": 0.5517119214515271, + "grad_norm": 14.149313926696777, + "learning_rate": 4.261699744257196e-06, + "loss": 0.1554, + "step": 21802 + }, + { + "epoch": 0.5517372270162209, + "grad_norm": 7.253072261810303, + "learning_rate": 4.2613026255189795e-06, + "loss": 0.2048, + "step": 21803 + }, + { + "epoch": 0.5517625325809146, + "grad_norm": 3.294901132583618, + "learning_rate": 4.260905511544485e-06, + "loss": 0.1107, + "step": 21804 + }, + { + "epoch": 0.5517878381456082, + "grad_norm": 3.99155855178833, + "learning_rate": 4.2605084023362745e-06, + "loss": 0.1465, + "step": 21805 + }, + { + "epoch": 0.5518131437103019, + "grad_norm": 6.94883918762207, + "learning_rate": 4.260111297896907e-06, + "loss": 0.2372, + "step": 21806 + }, + { + "epoch": 0.5518384492749956, + "grad_norm": 4.064854145050049, + "learning_rate": 4.259714198228944e-06, + "loss": 0.1639, + "step": 21807 + }, + { + "epoch": 0.5518637548396893, + "grad_norm": 5.215665817260742, + "learning_rate": 4.259317103334947e-06, + "loss": 0.1886, + "step": 21808 + }, + { + "epoch": 0.5518890604043829, + "grad_norm": 4.01750373840332, + "learning_rate": 4.258920013217475e-06, + "loss": 0.1686, + "step": 21809 + }, + { + "epoch": 0.5519143659690766, + "grad_norm": 13.988507270812988, + "learning_rate": 4.258522927879091e-06, + "loss": 0.1907, + "step": 21810 + }, + { + "epoch": 0.5519396715337703, + "grad_norm": 3.2650132179260254, + "learning_rate": 4.258125847322356e-06, + "loss": 0.1614, + "step": 21811 + }, + { + "epoch": 0.5519649770984639, + "grad_norm": 5.2596001625061035, + "learning_rate": 4.257728771549828e-06, + "loss": 0.2034, + "step": 21812 + }, + { + "epoch": 0.5519902826631576, + "grad_norm": 3.8602499961853027, + "learning_rate": 4.257331700564068e-06, + "loss": 0.1009, + "step": 21813 + }, + { + "epoch": 0.5520155882278514, + "grad_norm": 12.952890396118164, + "learning_rate": 4.2569346343676385e-06, + "loss": 0.4882, + "step": 21814 + }, + { + "epoch": 0.552040893792545, + "grad_norm": 3.019583225250244, + "learning_rate": 4.256537572963101e-06, + "loss": 0.1189, + "step": 21815 + }, + { + "epoch": 0.5520661993572387, + "grad_norm": 2.9666013717651367, + "learning_rate": 4.256140516353013e-06, + "loss": 0.184, + "step": 21816 + }, + { + "epoch": 0.5520915049219324, + "grad_norm": 7.94882345199585, + "learning_rate": 4.255743464539935e-06, + "loss": 0.1434, + "step": 21817 + }, + { + "epoch": 0.552116810486626, + "grad_norm": 7.628437519073486, + "learning_rate": 4.25534641752643e-06, + "loss": 0.1528, + "step": 21818 + }, + { + "epoch": 0.5521421160513197, + "grad_norm": 4.1200337409973145, + "learning_rate": 4.254949375315058e-06, + "loss": 0.1602, + "step": 21819 + }, + { + "epoch": 0.5521674216160134, + "grad_norm": 10.404942512512207, + "learning_rate": 4.25455233790838e-06, + "loss": 0.213, + "step": 21820 + }, + { + "epoch": 0.552192727180707, + "grad_norm": 5.243541240692139, + "learning_rate": 4.254155305308953e-06, + "loss": 0.1277, + "step": 21821 + }, + { + "epoch": 0.5522180327454007, + "grad_norm": 4.462195873260498, + "learning_rate": 4.253758277519341e-06, + "loss": 0.1147, + "step": 21822 + }, + { + "epoch": 0.5522433383100944, + "grad_norm": 4.966962814331055, + "learning_rate": 4.253361254542102e-06, + "loss": 0.1569, + "step": 21823 + }, + { + "epoch": 0.552268643874788, + "grad_norm": 6.188922882080078, + "learning_rate": 4.2529642363797995e-06, + "loss": 0.1821, + "step": 21824 + }, + { + "epoch": 0.5522939494394817, + "grad_norm": 7.893500804901123, + "learning_rate": 4.25256722303499e-06, + "loss": 0.2011, + "step": 21825 + }, + { + "epoch": 0.5523192550041754, + "grad_norm": 6.242923736572266, + "learning_rate": 4.252170214510236e-06, + "loss": 0.2467, + "step": 21826 + }, + { + "epoch": 0.552344560568869, + "grad_norm": 4.4557929039001465, + "learning_rate": 4.251773210808095e-06, + "loss": 0.229, + "step": 21827 + }, + { + "epoch": 0.5523698661335628, + "grad_norm": 4.083737373352051, + "learning_rate": 4.251376211931132e-06, + "loss": 0.1566, + "step": 21828 + }, + { + "epoch": 0.5523951716982565, + "grad_norm": 8.471528053283691, + "learning_rate": 4.250979217881905e-06, + "loss": 0.1518, + "step": 21829 + }, + { + "epoch": 0.5524204772629501, + "grad_norm": 3.3942012786865234, + "learning_rate": 4.250582228662973e-06, + "loss": 0.1114, + "step": 21830 + }, + { + "epoch": 0.5524457828276438, + "grad_norm": 6.660463333129883, + "learning_rate": 4.250185244276896e-06, + "loss": 0.1311, + "step": 21831 + }, + { + "epoch": 0.5524710883923375, + "grad_norm": 8.978302955627441, + "learning_rate": 4.249788264726235e-06, + "loss": 0.2502, + "step": 21832 + }, + { + "epoch": 0.5524963939570312, + "grad_norm": 17.437349319458008, + "learning_rate": 4.249391290013551e-06, + "loss": 0.2603, + "step": 21833 + }, + { + "epoch": 0.5525216995217248, + "grad_norm": 4.058314323425293, + "learning_rate": 4.248994320141401e-06, + "loss": 0.1687, + "step": 21834 + }, + { + "epoch": 0.5525470050864185, + "grad_norm": 4.669145107269287, + "learning_rate": 4.24859735511235e-06, + "loss": 0.1913, + "step": 21835 + }, + { + "epoch": 0.5525723106511122, + "grad_norm": 7.472837448120117, + "learning_rate": 4.248200394928952e-06, + "loss": 0.1665, + "step": 21836 + }, + { + "epoch": 0.5525976162158058, + "grad_norm": 5.363028049468994, + "learning_rate": 4.247803439593771e-06, + "loss": 0.2239, + "step": 21837 + }, + { + "epoch": 0.5526229217804995, + "grad_norm": 5.06790018081665, + "learning_rate": 4.247406489109366e-06, + "loss": 0.1808, + "step": 21838 + }, + { + "epoch": 0.5526482273451933, + "grad_norm": 7.269323825836182, + "learning_rate": 4.2470095434782955e-06, + "loss": 0.2494, + "step": 21839 + }, + { + "epoch": 0.5526735329098869, + "grad_norm": 6.694103717803955, + "learning_rate": 4.246612602703121e-06, + "loss": 0.2178, + "step": 21840 + }, + { + "epoch": 0.5526988384745806, + "grad_norm": 5.691337585449219, + "learning_rate": 4.246215666786401e-06, + "loss": 0.2066, + "step": 21841 + }, + { + "epoch": 0.5527241440392743, + "grad_norm": 7.465015411376953, + "learning_rate": 4.245818735730698e-06, + "loss": 0.2176, + "step": 21842 + }, + { + "epoch": 0.5527494496039679, + "grad_norm": 2.8474514484405518, + "learning_rate": 4.245421809538567e-06, + "loss": 0.1237, + "step": 21843 + }, + { + "epoch": 0.5527747551686616, + "grad_norm": 4.9044976234436035, + "learning_rate": 4.245024888212572e-06, + "loss": 0.1489, + "step": 21844 + }, + { + "epoch": 0.5528000607333553, + "grad_norm": 6.6997551918029785, + "learning_rate": 4.2446279717552695e-06, + "loss": 0.2081, + "step": 21845 + }, + { + "epoch": 0.5528253662980489, + "grad_norm": 7.566600322723389, + "learning_rate": 4.244231060169224e-06, + "loss": 0.1408, + "step": 21846 + }, + { + "epoch": 0.5528506718627426, + "grad_norm": 3.2295026779174805, + "learning_rate": 4.243834153456989e-06, + "loss": 0.0963, + "step": 21847 + }, + { + "epoch": 0.5528759774274363, + "grad_norm": 6.669830322265625, + "learning_rate": 4.243437251621127e-06, + "loss": 0.1815, + "step": 21848 + }, + { + "epoch": 0.5529012829921299, + "grad_norm": 6.88816499710083, + "learning_rate": 4.243040354664197e-06, + "loss": 0.1988, + "step": 21849 + }, + { + "epoch": 0.5529265885568236, + "grad_norm": 5.556453704833984, + "learning_rate": 4.24264346258876e-06, + "loss": 0.16, + "step": 21850 + }, + { + "epoch": 0.5529518941215174, + "grad_norm": 6.0190019607543945, + "learning_rate": 4.242246575397376e-06, + "loss": 0.1418, + "step": 21851 + }, + { + "epoch": 0.552977199686211, + "grad_norm": 2.7625012397766113, + "learning_rate": 4.2418496930926005e-06, + "loss": 0.1479, + "step": 21852 + }, + { + "epoch": 0.5530025052509047, + "grad_norm": 3.9413721561431885, + "learning_rate": 4.241452815676996e-06, + "loss": 0.163, + "step": 21853 + }, + { + "epoch": 0.5530278108155984, + "grad_norm": 4.19345760345459, + "learning_rate": 4.2410559431531206e-06, + "loss": 0.1674, + "step": 21854 + }, + { + "epoch": 0.553053116380292, + "grad_norm": 6.0762619972229, + "learning_rate": 4.2406590755235364e-06, + "loss": 0.2325, + "step": 21855 + }, + { + "epoch": 0.5530784219449857, + "grad_norm": 4.53391695022583, + "learning_rate": 4.240262212790799e-06, + "loss": 0.1136, + "step": 21856 + }, + { + "epoch": 0.5531037275096794, + "grad_norm": 4.0370049476623535, + "learning_rate": 4.239865354957468e-06, + "loss": 0.1249, + "step": 21857 + }, + { + "epoch": 0.553129033074373, + "grad_norm": 4.04943323135376, + "learning_rate": 4.239468502026106e-06, + "loss": 0.1715, + "step": 21858 + }, + { + "epoch": 0.5531543386390667, + "grad_norm": 4.833566665649414, + "learning_rate": 4.23907165399927e-06, + "loss": 0.1849, + "step": 21859 + }, + { + "epoch": 0.5531796442037604, + "grad_norm": 5.498950481414795, + "learning_rate": 4.238674810879519e-06, + "loss": 0.183, + "step": 21860 + }, + { + "epoch": 0.5532049497684541, + "grad_norm": 2.272803544998169, + "learning_rate": 4.238277972669412e-06, + "loss": 0.1152, + "step": 21861 + }, + { + "epoch": 0.5532302553331477, + "grad_norm": 3.5168638229370117, + "learning_rate": 4.2378811393715095e-06, + "loss": 0.1047, + "step": 21862 + }, + { + "epoch": 0.5532555608978414, + "grad_norm": 6.442623138427734, + "learning_rate": 4.2374843109883686e-06, + "loss": 0.191, + "step": 21863 + }, + { + "epoch": 0.5532808664625352, + "grad_norm": 4.026603698730469, + "learning_rate": 4.237087487522551e-06, + "loss": 0.1667, + "step": 21864 + }, + { + "epoch": 0.5533061720272288, + "grad_norm": 4.3280110359191895, + "learning_rate": 4.236690668976613e-06, + "loss": 0.1375, + "step": 21865 + }, + { + "epoch": 0.5533314775919225, + "grad_norm": 5.500274658203125, + "learning_rate": 4.236293855353116e-06, + "loss": 0.1195, + "step": 21866 + }, + { + "epoch": 0.5533567831566162, + "grad_norm": 22.238338470458984, + "learning_rate": 4.235897046654616e-06, + "loss": 0.2169, + "step": 21867 + }, + { + "epoch": 0.5533820887213098, + "grad_norm": 4.073957443237305, + "learning_rate": 4.235500242883676e-06, + "loss": 0.1547, + "step": 21868 + }, + { + "epoch": 0.5534073942860035, + "grad_norm": 6.525952339172363, + "learning_rate": 4.235103444042852e-06, + "loss": 0.1258, + "step": 21869 + }, + { + "epoch": 0.5534326998506972, + "grad_norm": 10.020710945129395, + "learning_rate": 4.234706650134703e-06, + "loss": 0.169, + "step": 21870 + }, + { + "epoch": 0.5534580054153908, + "grad_norm": 8.725306510925293, + "learning_rate": 4.234309861161789e-06, + "loss": 0.2261, + "step": 21871 + }, + { + "epoch": 0.5534833109800845, + "grad_norm": 10.848392486572266, + "learning_rate": 4.233913077126667e-06, + "loss": 0.174, + "step": 21872 + }, + { + "epoch": 0.5535086165447782, + "grad_norm": 10.501849174499512, + "learning_rate": 4.2335162980319e-06, + "loss": 0.3299, + "step": 21873 + }, + { + "epoch": 0.5535339221094718, + "grad_norm": 7.092007637023926, + "learning_rate": 4.233119523880042e-06, + "loss": 0.1887, + "step": 21874 + }, + { + "epoch": 0.5535592276741655, + "grad_norm": 24.764198303222656, + "learning_rate": 4.232722754673652e-06, + "loss": 0.3269, + "step": 21875 + }, + { + "epoch": 0.5535845332388593, + "grad_norm": 3.1311542987823486, + "learning_rate": 4.2323259904152915e-06, + "loss": 0.1239, + "step": 21876 + }, + { + "epoch": 0.5536098388035529, + "grad_norm": 13.139081954956055, + "learning_rate": 4.23192923110752e-06, + "loss": 0.1772, + "step": 21877 + }, + { + "epoch": 0.5536351443682466, + "grad_norm": 4.266100883483887, + "learning_rate": 4.231532476752892e-06, + "loss": 0.1401, + "step": 21878 + }, + { + "epoch": 0.5536604499329403, + "grad_norm": 5.906731605529785, + "learning_rate": 4.231135727353968e-06, + "loss": 0.1641, + "step": 21879 + }, + { + "epoch": 0.5536857554976339, + "grad_norm": 4.858400344848633, + "learning_rate": 4.230738982913306e-06, + "loss": 0.1942, + "step": 21880 + }, + { + "epoch": 0.5537110610623276, + "grad_norm": 4.033312797546387, + "learning_rate": 4.230342243433468e-06, + "loss": 0.1713, + "step": 21881 + }, + { + "epoch": 0.5537363666270213, + "grad_norm": 4.912965774536133, + "learning_rate": 4.2299455089170075e-06, + "loss": 0.2149, + "step": 21882 + }, + { + "epoch": 0.5537616721917149, + "grad_norm": 7.996722221374512, + "learning_rate": 4.229548779366485e-06, + "loss": 0.1293, + "step": 21883 + }, + { + "epoch": 0.5537869777564086, + "grad_norm": 5.452548503875732, + "learning_rate": 4.229152054784459e-06, + "loss": 0.2308, + "step": 21884 + }, + { + "epoch": 0.5538122833211023, + "grad_norm": 3.578207492828369, + "learning_rate": 4.228755335173488e-06, + "loss": 0.142, + "step": 21885 + }, + { + "epoch": 0.553837588885796, + "grad_norm": 2.6876561641693115, + "learning_rate": 4.228358620536132e-06, + "loss": 0.1287, + "step": 21886 + }, + { + "epoch": 0.5538628944504896, + "grad_norm": 2.997929334640503, + "learning_rate": 4.227961910874947e-06, + "loss": 0.1451, + "step": 21887 + }, + { + "epoch": 0.5538882000151834, + "grad_norm": 5.666210651397705, + "learning_rate": 4.227565206192491e-06, + "loss": 0.1388, + "step": 21888 + }, + { + "epoch": 0.5539135055798771, + "grad_norm": 4.328197956085205, + "learning_rate": 4.227168506491324e-06, + "loss": 0.1778, + "step": 21889 + }, + { + "epoch": 0.5539388111445707, + "grad_norm": 10.869107246398926, + "learning_rate": 4.226771811774005e-06, + "loss": 0.1836, + "step": 21890 + }, + { + "epoch": 0.5539641167092644, + "grad_norm": 6.366755962371826, + "learning_rate": 4.2263751220430895e-06, + "loss": 0.1867, + "step": 21891 + }, + { + "epoch": 0.5539894222739581, + "grad_norm": 4.341354846954346, + "learning_rate": 4.225978437301136e-06, + "loss": 0.1604, + "step": 21892 + }, + { + "epoch": 0.5540147278386517, + "grad_norm": 5.774224281311035, + "learning_rate": 4.225581757550705e-06, + "loss": 0.1484, + "step": 21893 + }, + { + "epoch": 0.5540400334033454, + "grad_norm": 21.022293090820312, + "learning_rate": 4.225185082794352e-06, + "loss": 0.2695, + "step": 21894 + }, + { + "epoch": 0.5540653389680391, + "grad_norm": 6.579110622406006, + "learning_rate": 4.224788413034637e-06, + "loss": 0.1392, + "step": 21895 + }, + { + "epoch": 0.5540906445327327, + "grad_norm": 3.94173002243042, + "learning_rate": 4.224391748274118e-06, + "loss": 0.1698, + "step": 21896 + }, + { + "epoch": 0.5541159500974264, + "grad_norm": 3.2180135250091553, + "learning_rate": 4.2239950885153515e-06, + "loss": 0.1554, + "step": 21897 + }, + { + "epoch": 0.5541412556621201, + "grad_norm": 14.388457298278809, + "learning_rate": 4.223598433760895e-06, + "loss": 0.2192, + "step": 21898 + }, + { + "epoch": 0.5541665612268137, + "grad_norm": 7.422582626342773, + "learning_rate": 4.223201784013311e-06, + "loss": 0.2669, + "step": 21899 + }, + { + "epoch": 0.5541918667915074, + "grad_norm": 7.484182834625244, + "learning_rate": 4.2228051392751525e-06, + "loss": 0.1804, + "step": 21900 + }, + { + "epoch": 0.5542171723562012, + "grad_norm": 2.8448164463043213, + "learning_rate": 4.222408499548978e-06, + "loss": 0.1301, + "step": 21901 + }, + { + "epoch": 0.5542424779208948, + "grad_norm": 10.229552268981934, + "learning_rate": 4.222011864837347e-06, + "loss": 0.1789, + "step": 21902 + }, + { + "epoch": 0.5542677834855885, + "grad_norm": 5.418102741241455, + "learning_rate": 4.221615235142816e-06, + "loss": 0.1997, + "step": 21903 + }, + { + "epoch": 0.5542930890502822, + "grad_norm": 3.6687846183776855, + "learning_rate": 4.221218610467947e-06, + "loss": 0.1849, + "step": 21904 + }, + { + "epoch": 0.5543183946149758, + "grad_norm": 5.286811351776123, + "learning_rate": 4.220821990815291e-06, + "loss": 0.1435, + "step": 21905 + }, + { + "epoch": 0.5543437001796695, + "grad_norm": 6.743001461029053, + "learning_rate": 4.2204253761874094e-06, + "loss": 0.1367, + "step": 21906 + }, + { + "epoch": 0.5543690057443632, + "grad_norm": 5.365236282348633, + "learning_rate": 4.22002876658686e-06, + "loss": 0.21, + "step": 21907 + }, + { + "epoch": 0.5543943113090568, + "grad_norm": 4.8000922203063965, + "learning_rate": 4.219632162016202e-06, + "loss": 0.1438, + "step": 21908 + }, + { + "epoch": 0.5544196168737505, + "grad_norm": 7.4456400871276855, + "learning_rate": 4.219235562477989e-06, + "loss": 0.1566, + "step": 21909 + }, + { + "epoch": 0.5544449224384442, + "grad_norm": 2.791816473007202, + "learning_rate": 4.21883896797478e-06, + "loss": 0.0578, + "step": 21910 + }, + { + "epoch": 0.5544702280031379, + "grad_norm": 3.9836337566375732, + "learning_rate": 4.218442378509133e-06, + "loss": 0.1824, + "step": 21911 + }, + { + "epoch": 0.5544955335678315, + "grad_norm": 5.177333831787109, + "learning_rate": 4.218045794083609e-06, + "loss": 0.2254, + "step": 21912 + }, + { + "epoch": 0.5545208391325253, + "grad_norm": 2.890669107437134, + "learning_rate": 4.21764921470076e-06, + "loss": 0.0964, + "step": 21913 + }, + { + "epoch": 0.554546144697219, + "grad_norm": 5.538484573364258, + "learning_rate": 4.217252640363145e-06, + "loss": 0.1036, + "step": 21914 + }, + { + "epoch": 0.5545714502619126, + "grad_norm": 8.040796279907227, + "learning_rate": 4.216856071073322e-06, + "loss": 0.1802, + "step": 21915 + }, + { + "epoch": 0.5545967558266063, + "grad_norm": 11.583494186401367, + "learning_rate": 4.2164595068338485e-06, + "loss": 0.2866, + "step": 21916 + }, + { + "epoch": 0.5546220613913, + "grad_norm": 27.16400146484375, + "learning_rate": 4.2160629476472845e-06, + "loss": 0.1472, + "step": 21917 + }, + { + "epoch": 0.5546473669559936, + "grad_norm": 3.7533106803894043, + "learning_rate": 4.215666393516182e-06, + "loss": 0.139, + "step": 21918 + }, + { + "epoch": 0.5546726725206873, + "grad_norm": 5.841462135314941, + "learning_rate": 4.215269844443102e-06, + "loss": 0.1435, + "step": 21919 + }, + { + "epoch": 0.554697978085381, + "grad_norm": 3.976499319076538, + "learning_rate": 4.2148733004306004e-06, + "loss": 0.1337, + "step": 21920 + }, + { + "epoch": 0.5547232836500746, + "grad_norm": 3.4063987731933594, + "learning_rate": 4.214476761481237e-06, + "loss": 0.0884, + "step": 21921 + }, + { + "epoch": 0.5547485892147683, + "grad_norm": 3.9894378185272217, + "learning_rate": 4.214080227597564e-06, + "loss": 0.1825, + "step": 21922 + }, + { + "epoch": 0.554773894779462, + "grad_norm": 4.0896525382995605, + "learning_rate": 4.2136836987821425e-06, + "loss": 0.0765, + "step": 21923 + }, + { + "epoch": 0.5547992003441556, + "grad_norm": 8.431768417358398, + "learning_rate": 4.213287175037528e-06, + "loss": 0.1965, + "step": 21924 + }, + { + "epoch": 0.5548245059088494, + "grad_norm": 6.672313690185547, + "learning_rate": 4.212890656366279e-06, + "loss": 0.1786, + "step": 21925 + }, + { + "epoch": 0.5548498114735431, + "grad_norm": 4.4674859046936035, + "learning_rate": 4.212494142770952e-06, + "loss": 0.0944, + "step": 21926 + }, + { + "epoch": 0.5548751170382367, + "grad_norm": 6.587113380432129, + "learning_rate": 4.2120976342541035e-06, + "loss": 0.1109, + "step": 21927 + }, + { + "epoch": 0.5549004226029304, + "grad_norm": 16.408544540405273, + "learning_rate": 4.21170113081829e-06, + "loss": 0.231, + "step": 21928 + }, + { + "epoch": 0.5549257281676241, + "grad_norm": 3.4765684604644775, + "learning_rate": 4.211304632466069e-06, + "loss": 0.1569, + "step": 21929 + }, + { + "epoch": 0.5549510337323177, + "grad_norm": 4.349895477294922, + "learning_rate": 4.2109081392e-06, + "loss": 0.1201, + "step": 21930 + }, + { + "epoch": 0.5549763392970114, + "grad_norm": 10.930920600891113, + "learning_rate": 4.210511651022636e-06, + "loss": 0.2934, + "step": 21931 + }, + { + "epoch": 0.5550016448617051, + "grad_norm": 10.150086402893066, + "learning_rate": 4.210115167936536e-06, + "loss": 0.2855, + "step": 21932 + }, + { + "epoch": 0.5550269504263987, + "grad_norm": 5.65685510635376, + "learning_rate": 4.209718689944255e-06, + "loss": 0.2133, + "step": 21933 + }, + { + "epoch": 0.5550522559910924, + "grad_norm": 6.0760369300842285, + "learning_rate": 4.209322217048353e-06, + "loss": 0.208, + "step": 21934 + }, + { + "epoch": 0.5550775615557861, + "grad_norm": 9.89902400970459, + "learning_rate": 4.208925749251384e-06, + "loss": 0.2173, + "step": 21935 + }, + { + "epoch": 0.5551028671204798, + "grad_norm": 6.709805488586426, + "learning_rate": 4.208529286555905e-06, + "loss": 0.1954, + "step": 21936 + }, + { + "epoch": 0.5551281726851734, + "grad_norm": 6.240172863006592, + "learning_rate": 4.208132828964474e-06, + "loss": 0.133, + "step": 21937 + }, + { + "epoch": 0.5551534782498672, + "grad_norm": 10.241634368896484, + "learning_rate": 4.207736376479647e-06, + "loss": 0.1838, + "step": 21938 + }, + { + "epoch": 0.5551787838145609, + "grad_norm": 9.099593162536621, + "learning_rate": 4.2073399291039824e-06, + "loss": 0.2346, + "step": 21939 + }, + { + "epoch": 0.5552040893792545, + "grad_norm": 6.497070789337158, + "learning_rate": 4.206943486840033e-06, + "loss": 0.2557, + "step": 21940 + }, + { + "epoch": 0.5552293949439482, + "grad_norm": 3.4789881706237793, + "learning_rate": 4.206547049690358e-06, + "loss": 0.1146, + "step": 21941 + }, + { + "epoch": 0.5552547005086419, + "grad_norm": 6.784489631652832, + "learning_rate": 4.206150617657513e-06, + "loss": 0.1642, + "step": 21942 + }, + { + "epoch": 0.5552800060733355, + "grad_norm": 4.485753059387207, + "learning_rate": 4.205754190744058e-06, + "loss": 0.1648, + "step": 21943 + }, + { + "epoch": 0.5553053116380292, + "grad_norm": 6.663567543029785, + "learning_rate": 4.205357768952544e-06, + "loss": 0.173, + "step": 21944 + }, + { + "epoch": 0.5553306172027229, + "grad_norm": 5.520725250244141, + "learning_rate": 4.20496135228553e-06, + "loss": 0.1787, + "step": 21945 + }, + { + "epoch": 0.5553559227674165, + "grad_norm": 6.055046081542969, + "learning_rate": 4.204564940745571e-06, + "loss": 0.2719, + "step": 21946 + }, + { + "epoch": 0.5553812283321102, + "grad_norm": 9.850653648376465, + "learning_rate": 4.204168534335228e-06, + "loss": 0.1735, + "step": 21947 + }, + { + "epoch": 0.5554065338968039, + "grad_norm": 9.04599666595459, + "learning_rate": 4.203772133057053e-06, + "loss": 0.2595, + "step": 21948 + }, + { + "epoch": 0.5554318394614975, + "grad_norm": 5.256377220153809, + "learning_rate": 4.203375736913602e-06, + "loss": 0.2175, + "step": 21949 + }, + { + "epoch": 0.5554571450261913, + "grad_norm": 4.144184112548828, + "learning_rate": 4.2029793459074325e-06, + "loss": 0.1701, + "step": 21950 + }, + { + "epoch": 0.555482450590885, + "grad_norm": 9.635848045349121, + "learning_rate": 4.2025829600411025e-06, + "loss": 0.1954, + "step": 21951 + }, + { + "epoch": 0.5555077561555786, + "grad_norm": 4.670104026794434, + "learning_rate": 4.202186579317167e-06, + "loss": 0.2013, + "step": 21952 + }, + { + "epoch": 0.5555330617202723, + "grad_norm": 3.5985350608825684, + "learning_rate": 4.201790203738179e-06, + "loss": 0.1793, + "step": 21953 + }, + { + "epoch": 0.555558367284966, + "grad_norm": 5.04408597946167, + "learning_rate": 4.2013938333067006e-06, + "loss": 0.152, + "step": 21954 + }, + { + "epoch": 0.5555836728496596, + "grad_norm": 6.577591419219971, + "learning_rate": 4.200997468025284e-06, + "loss": 0.2448, + "step": 21955 + }, + { + "epoch": 0.5556089784143533, + "grad_norm": 9.848609924316406, + "learning_rate": 4.200601107896484e-06, + "loss": 0.2919, + "step": 21956 + }, + { + "epoch": 0.555634283979047, + "grad_norm": 3.3461339473724365, + "learning_rate": 4.200204752922863e-06, + "loss": 0.1265, + "step": 21957 + }, + { + "epoch": 0.5556595895437406, + "grad_norm": 7.672242164611816, + "learning_rate": 4.199808403106969e-06, + "loss": 0.2406, + "step": 21958 + }, + { + "epoch": 0.5556848951084343, + "grad_norm": 3.4398961067199707, + "learning_rate": 4.199412058451363e-06, + "loss": 0.1, + "step": 21959 + }, + { + "epoch": 0.555710200673128, + "grad_norm": 4.029293060302734, + "learning_rate": 4.199015718958599e-06, + "loss": 0.1178, + "step": 21960 + }, + { + "epoch": 0.5557355062378216, + "grad_norm": 11.733412742614746, + "learning_rate": 4.198619384631236e-06, + "loss": 0.3456, + "step": 21961 + }, + { + "epoch": 0.5557608118025154, + "grad_norm": 7.593556880950928, + "learning_rate": 4.1982230554718255e-06, + "loss": 0.2575, + "step": 21962 + }, + { + "epoch": 0.5557861173672091, + "grad_norm": 4.246754169464111, + "learning_rate": 4.197826731482925e-06, + "loss": 0.1358, + "step": 21963 + }, + { + "epoch": 0.5558114229319028, + "grad_norm": 9.213655471801758, + "learning_rate": 4.197430412667091e-06, + "loss": 0.2257, + "step": 21964 + }, + { + "epoch": 0.5558367284965964, + "grad_norm": 6.015042304992676, + "learning_rate": 4.197034099026881e-06, + "loss": 0.2115, + "step": 21965 + }, + { + "epoch": 0.5558620340612901, + "grad_norm": 2.795936107635498, + "learning_rate": 4.196637790564847e-06, + "loss": 0.098, + "step": 21966 + }, + { + "epoch": 0.5558873396259838, + "grad_norm": 2.5393919944763184, + "learning_rate": 4.196241487283546e-06, + "loss": 0.1106, + "step": 21967 + }, + { + "epoch": 0.5559126451906774, + "grad_norm": 5.936884880065918, + "learning_rate": 4.195845189185534e-06, + "loss": 0.213, + "step": 21968 + }, + { + "epoch": 0.5559379507553711, + "grad_norm": 9.313627243041992, + "learning_rate": 4.195448896273367e-06, + "loss": 0.2013, + "step": 21969 + }, + { + "epoch": 0.5559632563200648, + "grad_norm": 5.1107940673828125, + "learning_rate": 4.195052608549603e-06, + "loss": 0.1429, + "step": 21970 + }, + { + "epoch": 0.5559885618847584, + "grad_norm": 5.496922016143799, + "learning_rate": 4.1946563260167924e-06, + "loss": 0.1776, + "step": 21971 + }, + { + "epoch": 0.5560138674494521, + "grad_norm": 7.167486190795898, + "learning_rate": 4.194260048677493e-06, + "loss": 0.1419, + "step": 21972 + }, + { + "epoch": 0.5560391730141458, + "grad_norm": 5.7869672775268555, + "learning_rate": 4.1938637765342615e-06, + "loss": 0.1503, + "step": 21973 + }, + { + "epoch": 0.5560644785788394, + "grad_norm": 11.291240692138672, + "learning_rate": 4.193467509589655e-06, + "loss": 0.2918, + "step": 21974 + }, + { + "epoch": 0.5560897841435332, + "grad_norm": 5.1339111328125, + "learning_rate": 4.193071247846223e-06, + "loss": 0.1302, + "step": 21975 + }, + { + "epoch": 0.5561150897082269, + "grad_norm": 3.767563819885254, + "learning_rate": 4.192674991306525e-06, + "loss": 0.129, + "step": 21976 + }, + { + "epoch": 0.5561403952729205, + "grad_norm": 6.054562568664551, + "learning_rate": 4.192278739973116e-06, + "loss": 0.2102, + "step": 21977 + }, + { + "epoch": 0.5561657008376142, + "grad_norm": 4.469761848449707, + "learning_rate": 4.191882493848552e-06, + "loss": 0.0937, + "step": 21978 + }, + { + "epoch": 0.5561910064023079, + "grad_norm": 8.874411582946777, + "learning_rate": 4.191486252935387e-06, + "loss": 0.2763, + "step": 21979 + }, + { + "epoch": 0.5562163119670015, + "grad_norm": 4.783570766448975, + "learning_rate": 4.191090017236177e-06, + "loss": 0.1485, + "step": 21980 + }, + { + "epoch": 0.5562416175316952, + "grad_norm": 3.7149369716644287, + "learning_rate": 4.190693786753477e-06, + "loss": 0.1588, + "step": 21981 + }, + { + "epoch": 0.5562669230963889, + "grad_norm": 6.13254976272583, + "learning_rate": 4.190297561489842e-06, + "loss": 0.2167, + "step": 21982 + }, + { + "epoch": 0.5562922286610825, + "grad_norm": 3.842853307723999, + "learning_rate": 4.189901341447827e-06, + "loss": 0.1357, + "step": 21983 + }, + { + "epoch": 0.5563175342257762, + "grad_norm": 6.58306360244751, + "learning_rate": 4.189505126629989e-06, + "loss": 0.2657, + "step": 21984 + }, + { + "epoch": 0.55634283979047, + "grad_norm": 4.718478679656982, + "learning_rate": 4.18910891703888e-06, + "loss": 0.1513, + "step": 21985 + }, + { + "epoch": 0.5563681453551635, + "grad_norm": 5.923803806304932, + "learning_rate": 4.188712712677057e-06, + "loss": 0.1398, + "step": 21986 + }, + { + "epoch": 0.5563934509198573, + "grad_norm": 3.108031749725342, + "learning_rate": 4.188316513547076e-06, + "loss": 0.1501, + "step": 21987 + }, + { + "epoch": 0.556418756484551, + "grad_norm": 3.8810956478118896, + "learning_rate": 4.18792031965149e-06, + "loss": 0.1602, + "step": 21988 + }, + { + "epoch": 0.5564440620492447, + "grad_norm": 4.225842475891113, + "learning_rate": 4.187524130992855e-06, + "loss": 0.158, + "step": 21989 + }, + { + "epoch": 0.5564693676139383, + "grad_norm": 2.927025318145752, + "learning_rate": 4.187127947573725e-06, + "loss": 0.1155, + "step": 21990 + }, + { + "epoch": 0.556494673178632, + "grad_norm": 6.440597057342529, + "learning_rate": 4.186731769396655e-06, + "loss": 0.1827, + "step": 21991 + }, + { + "epoch": 0.5565199787433257, + "grad_norm": 2.530125379562378, + "learning_rate": 4.186335596464204e-06, + "loss": 0.0829, + "step": 21992 + }, + { + "epoch": 0.5565452843080193, + "grad_norm": 5.9854936599731445, + "learning_rate": 4.18593942877892e-06, + "loss": 0.2276, + "step": 21993 + }, + { + "epoch": 0.556570589872713, + "grad_norm": 9.665970802307129, + "learning_rate": 4.185543266343362e-06, + "loss": 0.3606, + "step": 21994 + }, + { + "epoch": 0.5565958954374067, + "grad_norm": 2.8787150382995605, + "learning_rate": 4.185147109160085e-06, + "loss": 0.0958, + "step": 21995 + }, + { + "epoch": 0.5566212010021003, + "grad_norm": 3.106213331222534, + "learning_rate": 4.184750957231642e-06, + "loss": 0.1447, + "step": 21996 + }, + { + "epoch": 0.556646506566794, + "grad_norm": 4.126738548278809, + "learning_rate": 4.184354810560589e-06, + "loss": 0.1832, + "step": 21997 + }, + { + "epoch": 0.5566718121314878, + "grad_norm": 3.6813459396362305, + "learning_rate": 4.1839586691494784e-06, + "loss": 0.1584, + "step": 21998 + }, + { + "epoch": 0.5566971176961814, + "grad_norm": 4.3833088874816895, + "learning_rate": 4.183562533000868e-06, + "loss": 0.1634, + "step": 21999 + }, + { + "epoch": 0.5567224232608751, + "grad_norm": 3.0230672359466553, + "learning_rate": 4.18316640211731e-06, + "loss": 0.1208, + "step": 22000 + }, + { + "epoch": 0.5567477288255688, + "grad_norm": 5.334080696105957, + "learning_rate": 4.182770276501362e-06, + "loss": 0.2174, + "step": 22001 + }, + { + "epoch": 0.5567730343902624, + "grad_norm": 6.263031005859375, + "learning_rate": 4.182374156155574e-06, + "loss": 0.143, + "step": 22002 + }, + { + "epoch": 0.5567983399549561, + "grad_norm": 9.838583946228027, + "learning_rate": 4.181978041082504e-06, + "loss": 0.1884, + "step": 22003 + }, + { + "epoch": 0.5568236455196498, + "grad_norm": 2.2763168811798096, + "learning_rate": 4.181581931284705e-06, + "loss": 0.1078, + "step": 22004 + }, + { + "epoch": 0.5568489510843434, + "grad_norm": 3.142101287841797, + "learning_rate": 4.181185826764734e-06, + "loss": 0.119, + "step": 22005 + }, + { + "epoch": 0.5568742566490371, + "grad_norm": 3.539620876312256, + "learning_rate": 4.180789727525141e-06, + "loss": 0.1998, + "step": 22006 + }, + { + "epoch": 0.5568995622137308, + "grad_norm": 5.196666240692139, + "learning_rate": 4.180393633568483e-06, + "loss": 0.1742, + "step": 22007 + }, + { + "epoch": 0.5569248677784244, + "grad_norm": 7.974080562591553, + "learning_rate": 4.179997544897315e-06, + "loss": 0.1444, + "step": 22008 + }, + { + "epoch": 0.5569501733431181, + "grad_norm": 8.543335914611816, + "learning_rate": 4.17960146151419e-06, + "loss": 0.2312, + "step": 22009 + }, + { + "epoch": 0.5569754789078118, + "grad_norm": 12.003788948059082, + "learning_rate": 4.179205383421662e-06, + "loss": 0.3536, + "step": 22010 + }, + { + "epoch": 0.5570007844725055, + "grad_norm": 5.703559875488281, + "learning_rate": 4.178809310622285e-06, + "loss": 0.2113, + "step": 22011 + }, + { + "epoch": 0.5570260900371992, + "grad_norm": 6.109450340270996, + "learning_rate": 4.1784132431186155e-06, + "loss": 0.155, + "step": 22012 + }, + { + "epoch": 0.5570513956018929, + "grad_norm": 4.0996503829956055, + "learning_rate": 4.178017180913206e-06, + "loss": 0.194, + "step": 22013 + }, + { + "epoch": 0.5570767011665866, + "grad_norm": 4.643311977386475, + "learning_rate": 4.1776211240086105e-06, + "loss": 0.216, + "step": 22014 + }, + { + "epoch": 0.5571020067312802, + "grad_norm": 21.480573654174805, + "learning_rate": 4.177225072407385e-06, + "loss": 0.1149, + "step": 22015 + }, + { + "epoch": 0.5571273122959739, + "grad_norm": 4.466264724731445, + "learning_rate": 4.176829026112079e-06, + "loss": 0.1324, + "step": 22016 + }, + { + "epoch": 0.5571526178606676, + "grad_norm": 4.1945905685424805, + "learning_rate": 4.176432985125251e-06, + "loss": 0.1283, + "step": 22017 + }, + { + "epoch": 0.5571779234253612, + "grad_norm": 4.069255352020264, + "learning_rate": 4.176036949449455e-06, + "loss": 0.1727, + "step": 22018 + }, + { + "epoch": 0.5572032289900549, + "grad_norm": 6.237113952636719, + "learning_rate": 4.17564091908724e-06, + "loss": 0.1577, + "step": 22019 + }, + { + "epoch": 0.5572285345547486, + "grad_norm": 6.412297248840332, + "learning_rate": 4.175244894041165e-06, + "loss": 0.2064, + "step": 22020 + }, + { + "epoch": 0.5572538401194422, + "grad_norm": 11.915590286254883, + "learning_rate": 4.174848874313782e-06, + "loss": 0.2538, + "step": 22021 + }, + { + "epoch": 0.557279145684136, + "grad_norm": 3.367124080657959, + "learning_rate": 4.174452859907645e-06, + "loss": 0.1594, + "step": 22022 + }, + { + "epoch": 0.5573044512488297, + "grad_norm": 4.485844612121582, + "learning_rate": 4.1740568508253095e-06, + "loss": 0.1384, + "step": 22023 + }, + { + "epoch": 0.5573297568135233, + "grad_norm": 6.657558917999268, + "learning_rate": 4.173660847069327e-06, + "loss": 0.1394, + "step": 22024 + }, + { + "epoch": 0.557355062378217, + "grad_norm": 3.493025302886963, + "learning_rate": 4.173264848642251e-06, + "loss": 0.1861, + "step": 22025 + }, + { + "epoch": 0.5573803679429107, + "grad_norm": 5.77750825881958, + "learning_rate": 4.172868855546637e-06, + "loss": 0.2296, + "step": 22026 + }, + { + "epoch": 0.5574056735076043, + "grad_norm": 6.146890163421631, + "learning_rate": 4.172472867785039e-06, + "loss": 0.2281, + "step": 22027 + }, + { + "epoch": 0.557430979072298, + "grad_norm": 6.528352737426758, + "learning_rate": 4.172076885360008e-06, + "loss": 0.217, + "step": 22028 + }, + { + "epoch": 0.5574562846369917, + "grad_norm": 2.437518835067749, + "learning_rate": 4.171680908274098e-06, + "loss": 0.1516, + "step": 22029 + }, + { + "epoch": 0.5574815902016853, + "grad_norm": 7.236357688903809, + "learning_rate": 4.1712849365298655e-06, + "loss": 0.1243, + "step": 22030 + }, + { + "epoch": 0.557506895766379, + "grad_norm": 7.117712497711182, + "learning_rate": 4.170888970129861e-06, + "loss": 0.2179, + "step": 22031 + }, + { + "epoch": 0.5575322013310727, + "grad_norm": 5.7955780029296875, + "learning_rate": 4.170493009076642e-06, + "loss": 0.2151, + "step": 22032 + }, + { + "epoch": 0.5575575068957663, + "grad_norm": 4.266805648803711, + "learning_rate": 4.170097053372757e-06, + "loss": 0.1389, + "step": 22033 + }, + { + "epoch": 0.55758281246046, + "grad_norm": 8.696109771728516, + "learning_rate": 4.169701103020762e-06, + "loss": 0.3428, + "step": 22034 + }, + { + "epoch": 0.5576081180251538, + "grad_norm": 3.9039392471313477, + "learning_rate": 4.16930515802321e-06, + "loss": 0.1788, + "step": 22035 + }, + { + "epoch": 0.5576334235898474, + "grad_norm": 4.056995868682861, + "learning_rate": 4.168909218382657e-06, + "loss": 0.1643, + "step": 22036 + }, + { + "epoch": 0.5576587291545411, + "grad_norm": 2.549783945083618, + "learning_rate": 4.168513284101651e-06, + "loss": 0.0872, + "step": 22037 + }, + { + "epoch": 0.5576840347192348, + "grad_norm": 6.537744045257568, + "learning_rate": 4.168117355182748e-06, + "loss": 0.1731, + "step": 22038 + }, + { + "epoch": 0.5577093402839285, + "grad_norm": 4.409100532531738, + "learning_rate": 4.167721431628504e-06, + "loss": 0.1859, + "step": 22039 + }, + { + "epoch": 0.5577346458486221, + "grad_norm": 4.8193817138671875, + "learning_rate": 4.167325513441468e-06, + "loss": 0.1455, + "step": 22040 + }, + { + "epoch": 0.5577599514133158, + "grad_norm": 8.944781303405762, + "learning_rate": 4.166929600624194e-06, + "loss": 0.2028, + "step": 22041 + }, + { + "epoch": 0.5577852569780095, + "grad_norm": 6.7693281173706055, + "learning_rate": 4.166533693179238e-06, + "loss": 0.1431, + "step": 22042 + }, + { + "epoch": 0.5578105625427031, + "grad_norm": 9.14936637878418, + "learning_rate": 4.16613779110915e-06, + "loss": 0.2288, + "step": 22043 + }, + { + "epoch": 0.5578358681073968, + "grad_norm": 7.081029415130615, + "learning_rate": 4.165741894416484e-06, + "loss": 0.2144, + "step": 22044 + }, + { + "epoch": 0.5578611736720905, + "grad_norm": 3.679917812347412, + "learning_rate": 4.165346003103795e-06, + "loss": 0.1002, + "step": 22045 + }, + { + "epoch": 0.5578864792367841, + "grad_norm": 7.209908962249756, + "learning_rate": 4.164950117173633e-06, + "loss": 0.1187, + "step": 22046 + }, + { + "epoch": 0.5579117848014779, + "grad_norm": 3.9508819580078125, + "learning_rate": 4.164554236628552e-06, + "loss": 0.1281, + "step": 22047 + }, + { + "epoch": 0.5579370903661716, + "grad_norm": 3.229072332382202, + "learning_rate": 4.164158361471105e-06, + "loss": 0.1129, + "step": 22048 + }, + { + "epoch": 0.5579623959308652, + "grad_norm": 7.457426071166992, + "learning_rate": 4.163762491703848e-06, + "loss": 0.167, + "step": 22049 + }, + { + "epoch": 0.5579877014955589, + "grad_norm": 11.643607139587402, + "learning_rate": 4.163366627329329e-06, + "loss": 0.3832, + "step": 22050 + }, + { + "epoch": 0.5580130070602526, + "grad_norm": 9.105643272399902, + "learning_rate": 4.162970768350103e-06, + "loss": 0.2931, + "step": 22051 + }, + { + "epoch": 0.5580383126249462, + "grad_norm": 5.895541191101074, + "learning_rate": 4.162574914768722e-06, + "loss": 0.2322, + "step": 22052 + }, + { + "epoch": 0.5580636181896399, + "grad_norm": 3.941542863845825, + "learning_rate": 4.162179066587741e-06, + "loss": 0.0795, + "step": 22053 + }, + { + "epoch": 0.5580889237543336, + "grad_norm": 3.0305209159851074, + "learning_rate": 4.161783223809712e-06, + "loss": 0.0837, + "step": 22054 + }, + { + "epoch": 0.5581142293190272, + "grad_norm": 4.553624153137207, + "learning_rate": 4.161387386437185e-06, + "loss": 0.1548, + "step": 22055 + }, + { + "epoch": 0.5581395348837209, + "grad_norm": 4.180455207824707, + "learning_rate": 4.160991554472716e-06, + "loss": 0.1199, + "step": 22056 + }, + { + "epoch": 0.5581648404484146, + "grad_norm": 5.576743125915527, + "learning_rate": 4.160595727918856e-06, + "loss": 0.1573, + "step": 22057 + }, + { + "epoch": 0.5581901460131082, + "grad_norm": 14.326051712036133, + "learning_rate": 4.16019990677816e-06, + "loss": 0.102, + "step": 22058 + }, + { + "epoch": 0.558215451577802, + "grad_norm": 7.392340183258057, + "learning_rate": 4.159804091053176e-06, + "loss": 0.2421, + "step": 22059 + }, + { + "epoch": 0.5582407571424957, + "grad_norm": 4.603394031524658, + "learning_rate": 4.1594082807464595e-06, + "loss": 0.1629, + "step": 22060 + }, + { + "epoch": 0.5582660627071893, + "grad_norm": 6.286217212677002, + "learning_rate": 4.159012475860562e-06, + "loss": 0.1679, + "step": 22061 + }, + { + "epoch": 0.558291368271883, + "grad_norm": 6.948581218719482, + "learning_rate": 4.15861667639804e-06, + "loss": 0.2284, + "step": 22062 + }, + { + "epoch": 0.5583166738365767, + "grad_norm": 3.0097148418426514, + "learning_rate": 4.1582208823614395e-06, + "loss": 0.1149, + "step": 22063 + }, + { + "epoch": 0.5583419794012704, + "grad_norm": 5.716226100921631, + "learning_rate": 4.157825093753316e-06, + "loss": 0.1345, + "step": 22064 + }, + { + "epoch": 0.558367284965964, + "grad_norm": 2.7117719650268555, + "learning_rate": 4.157429310576222e-06, + "loss": 0.095, + "step": 22065 + }, + { + "epoch": 0.5583925905306577, + "grad_norm": 2.7130308151245117, + "learning_rate": 4.157033532832711e-06, + "loss": 0.1342, + "step": 22066 + }, + { + "epoch": 0.5584178960953514, + "grad_norm": 6.049726963043213, + "learning_rate": 4.156637760525333e-06, + "loss": 0.2035, + "step": 22067 + }, + { + "epoch": 0.558443201660045, + "grad_norm": 2.617119073867798, + "learning_rate": 4.156241993656641e-06, + "loss": 0.0622, + "step": 22068 + }, + { + "epoch": 0.5584685072247387, + "grad_norm": 4.647201061248779, + "learning_rate": 4.1558462322291886e-06, + "loss": 0.107, + "step": 22069 + }, + { + "epoch": 0.5584938127894324, + "grad_norm": 5.876204490661621, + "learning_rate": 4.155450476245526e-06, + "loss": 0.1841, + "step": 22070 + }, + { + "epoch": 0.558519118354126, + "grad_norm": 7.1931376457214355, + "learning_rate": 4.155054725708206e-06, + "loss": 0.1212, + "step": 22071 + }, + { + "epoch": 0.5585444239188198, + "grad_norm": 12.256375312805176, + "learning_rate": 4.154658980619781e-06, + "loss": 0.2501, + "step": 22072 + }, + { + "epoch": 0.5585697294835135, + "grad_norm": 2.522594451904297, + "learning_rate": 4.154263240982803e-06, + "loss": 0.126, + "step": 22073 + }, + { + "epoch": 0.5585950350482071, + "grad_norm": 8.951312065124512, + "learning_rate": 4.153867506799823e-06, + "loss": 0.229, + "step": 22074 + }, + { + "epoch": 0.5586203406129008, + "grad_norm": 2.9607620239257812, + "learning_rate": 4.1534717780733954e-06, + "loss": 0.0709, + "step": 22075 + }, + { + "epoch": 0.5586456461775945, + "grad_norm": 3.074059009552002, + "learning_rate": 4.153076054806072e-06, + "loss": 0.1208, + "step": 22076 + }, + { + "epoch": 0.5586709517422881, + "grad_norm": 4.038477897644043, + "learning_rate": 4.152680337000402e-06, + "loss": 0.1125, + "step": 22077 + }, + { + "epoch": 0.5586962573069818, + "grad_norm": 4.413730144500732, + "learning_rate": 4.1522846246589384e-06, + "loss": 0.1249, + "step": 22078 + }, + { + "epoch": 0.5587215628716755, + "grad_norm": 3.2321724891662598, + "learning_rate": 4.1518889177842334e-06, + "loss": 0.1439, + "step": 22079 + }, + { + "epoch": 0.5587468684363691, + "grad_norm": 5.223609924316406, + "learning_rate": 4.151493216378842e-06, + "loss": 0.2459, + "step": 22080 + }, + { + "epoch": 0.5587721740010628, + "grad_norm": 4.761138439178467, + "learning_rate": 4.15109752044531e-06, + "loss": 0.1566, + "step": 22081 + }, + { + "epoch": 0.5587974795657565, + "grad_norm": 24.389739990234375, + "learning_rate": 4.1507018299861925e-06, + "loss": 0.2418, + "step": 22082 + }, + { + "epoch": 0.5588227851304501, + "grad_norm": 3.3688712120056152, + "learning_rate": 4.150306145004041e-06, + "loss": 0.0881, + "step": 22083 + }, + { + "epoch": 0.5588480906951439, + "grad_norm": 11.092037200927734, + "learning_rate": 4.149910465501407e-06, + "loss": 0.184, + "step": 22084 + }, + { + "epoch": 0.5588733962598376, + "grad_norm": 2.6216201782226562, + "learning_rate": 4.149514791480844e-06, + "loss": 0.1105, + "step": 22085 + }, + { + "epoch": 0.5588987018245312, + "grad_norm": 8.344340324401855, + "learning_rate": 4.1491191229449e-06, + "loss": 0.1749, + "step": 22086 + }, + { + "epoch": 0.5589240073892249, + "grad_norm": 4.645293712615967, + "learning_rate": 4.14872345989613e-06, + "loss": 0.2139, + "step": 22087 + }, + { + "epoch": 0.5589493129539186, + "grad_norm": 5.637674331665039, + "learning_rate": 4.148327802337082e-06, + "loss": 0.1618, + "step": 22088 + }, + { + "epoch": 0.5589746185186122, + "grad_norm": 13.32867431640625, + "learning_rate": 4.147932150270313e-06, + "loss": 0.2176, + "step": 22089 + }, + { + "epoch": 0.5589999240833059, + "grad_norm": 7.913897514343262, + "learning_rate": 4.147536503698368e-06, + "loss": 0.2391, + "step": 22090 + }, + { + "epoch": 0.5590252296479996, + "grad_norm": 5.82213830947876, + "learning_rate": 4.1471408626238015e-06, + "loss": 0.1405, + "step": 22091 + }, + { + "epoch": 0.5590505352126933, + "grad_norm": 3.1505229473114014, + "learning_rate": 4.146745227049166e-06, + "loss": 0.139, + "step": 22092 + }, + { + "epoch": 0.5590758407773869, + "grad_norm": 12.686365127563477, + "learning_rate": 4.146349596977013e-06, + "loss": 0.296, + "step": 22093 + }, + { + "epoch": 0.5591011463420806, + "grad_norm": 5.665440082550049, + "learning_rate": 4.145953972409891e-06, + "loss": 0.1387, + "step": 22094 + }, + { + "epoch": 0.5591264519067743, + "grad_norm": 11.046598434448242, + "learning_rate": 4.145558353350353e-06, + "loss": 0.2186, + "step": 22095 + }, + { + "epoch": 0.559151757471468, + "grad_norm": 4.753676414489746, + "learning_rate": 4.1451627398009495e-06, + "loss": 0.1555, + "step": 22096 + }, + { + "epoch": 0.5591770630361617, + "grad_norm": 3.0059001445770264, + "learning_rate": 4.144767131764235e-06, + "loss": 0.1149, + "step": 22097 + }, + { + "epoch": 0.5592023686008554, + "grad_norm": 2.8530983924865723, + "learning_rate": 4.144371529242756e-06, + "loss": 0.1042, + "step": 22098 + }, + { + "epoch": 0.559227674165549, + "grad_norm": 4.529170036315918, + "learning_rate": 4.143975932239067e-06, + "loss": 0.1534, + "step": 22099 + }, + { + "epoch": 0.5592529797302427, + "grad_norm": 3.543234348297119, + "learning_rate": 4.1435803407557176e-06, + "loss": 0.1776, + "step": 22100 + }, + { + "epoch": 0.5592782852949364, + "grad_norm": 7.632210731506348, + "learning_rate": 4.143184754795259e-06, + "loss": 0.1701, + "step": 22101 + }, + { + "epoch": 0.55930359085963, + "grad_norm": 3.5666251182556152, + "learning_rate": 4.142789174360243e-06, + "loss": 0.1946, + "step": 22102 + }, + { + "epoch": 0.5593288964243237, + "grad_norm": 5.074679374694824, + "learning_rate": 4.142393599453221e-06, + "loss": 0.1635, + "step": 22103 + }, + { + "epoch": 0.5593542019890174, + "grad_norm": 7.968147277832031, + "learning_rate": 4.141998030076741e-06, + "loss": 0.1769, + "step": 22104 + }, + { + "epoch": 0.559379507553711, + "grad_norm": 4.304305076599121, + "learning_rate": 4.141602466233357e-06, + "loss": 0.1957, + "step": 22105 + }, + { + "epoch": 0.5594048131184047, + "grad_norm": 6.310433864593506, + "learning_rate": 4.141206907925619e-06, + "loss": 0.208, + "step": 22106 + }, + { + "epoch": 0.5594301186830984, + "grad_norm": 15.026241302490234, + "learning_rate": 4.140811355156079e-06, + "loss": 0.2114, + "step": 22107 + }, + { + "epoch": 0.559455424247792, + "grad_norm": 12.054695129394531, + "learning_rate": 4.140415807927286e-06, + "loss": 0.1713, + "step": 22108 + }, + { + "epoch": 0.5594807298124858, + "grad_norm": 5.311675071716309, + "learning_rate": 4.140020266241791e-06, + "loss": 0.0977, + "step": 22109 + }, + { + "epoch": 0.5595060353771795, + "grad_norm": 8.302491188049316, + "learning_rate": 4.139624730102145e-06, + "loss": 0.1731, + "step": 22110 + }, + { + "epoch": 0.5595313409418731, + "grad_norm": 4.351250171661377, + "learning_rate": 4.139229199510902e-06, + "loss": 0.1451, + "step": 22111 + }, + { + "epoch": 0.5595566465065668, + "grad_norm": 5.309856414794922, + "learning_rate": 4.138833674470607e-06, + "loss": 0.1653, + "step": 22112 + }, + { + "epoch": 0.5595819520712605, + "grad_norm": 4.5368733406066895, + "learning_rate": 4.138438154983813e-06, + "loss": 0.1439, + "step": 22113 + }, + { + "epoch": 0.5596072576359541, + "grad_norm": 10.715434074401855, + "learning_rate": 4.138042641053072e-06, + "loss": 0.1845, + "step": 22114 + }, + { + "epoch": 0.5596325632006478, + "grad_norm": 4.611672878265381, + "learning_rate": 4.137647132680935e-06, + "loss": 0.1361, + "step": 22115 + }, + { + "epoch": 0.5596578687653415, + "grad_norm": 7.464847564697266, + "learning_rate": 4.13725162986995e-06, + "loss": 0.1515, + "step": 22116 + }, + { + "epoch": 0.5596831743300352, + "grad_norm": 4.048595428466797, + "learning_rate": 4.136856132622668e-06, + "loss": 0.17, + "step": 22117 + }, + { + "epoch": 0.5597084798947288, + "grad_norm": 26.687931060791016, + "learning_rate": 4.136460640941641e-06, + "loss": 0.2126, + "step": 22118 + }, + { + "epoch": 0.5597337854594225, + "grad_norm": 3.7359907627105713, + "learning_rate": 4.136065154829418e-06, + "loss": 0.1453, + "step": 22119 + }, + { + "epoch": 0.5597590910241163, + "grad_norm": 4.992592811584473, + "learning_rate": 4.135669674288552e-06, + "loss": 0.093, + "step": 22120 + }, + { + "epoch": 0.5597843965888099, + "grad_norm": 4.123861312866211, + "learning_rate": 4.13527419932159e-06, + "loss": 0.133, + "step": 22121 + }, + { + "epoch": 0.5598097021535036, + "grad_norm": 7.88748025894165, + "learning_rate": 4.134878729931084e-06, + "loss": 0.1502, + "step": 22122 + }, + { + "epoch": 0.5598350077181973, + "grad_norm": 9.596602439880371, + "learning_rate": 4.134483266119583e-06, + "loss": 0.227, + "step": 22123 + }, + { + "epoch": 0.5598603132828909, + "grad_norm": 3.962998390197754, + "learning_rate": 4.1340878078896415e-06, + "loss": 0.152, + "step": 22124 + }, + { + "epoch": 0.5598856188475846, + "grad_norm": 4.494571208953857, + "learning_rate": 4.133692355243804e-06, + "loss": 0.1481, + "step": 22125 + }, + { + "epoch": 0.5599109244122783, + "grad_norm": 4.813939094543457, + "learning_rate": 4.1332969081846245e-06, + "loss": 0.1761, + "step": 22126 + }, + { + "epoch": 0.5599362299769719, + "grad_norm": 4.439271450042725, + "learning_rate": 4.132901466714653e-06, + "loss": 0.1138, + "step": 22127 + }, + { + "epoch": 0.5599615355416656, + "grad_norm": 5.403788089752197, + "learning_rate": 4.1325060308364364e-06, + "loss": 0.1221, + "step": 22128 + }, + { + "epoch": 0.5599868411063593, + "grad_norm": 5.276343822479248, + "learning_rate": 4.132110600552528e-06, + "loss": 0.1372, + "step": 22129 + }, + { + "epoch": 0.5600121466710529, + "grad_norm": 7.058051586151123, + "learning_rate": 4.131715175865477e-06, + "loss": 0.2103, + "step": 22130 + }, + { + "epoch": 0.5600374522357466, + "grad_norm": 6.341188430786133, + "learning_rate": 4.1313197567778336e-06, + "loss": 0.1664, + "step": 22131 + }, + { + "epoch": 0.5600627578004403, + "grad_norm": 3.801398515701294, + "learning_rate": 4.130924343292146e-06, + "loss": 0.1065, + "step": 22132 + }, + { + "epoch": 0.560088063365134, + "grad_norm": 4.727992534637451, + "learning_rate": 4.130528935410968e-06, + "loss": 0.1727, + "step": 22133 + }, + { + "epoch": 0.5601133689298277, + "grad_norm": 8.280720710754395, + "learning_rate": 4.130133533136847e-06, + "loss": 0.2491, + "step": 22134 + }, + { + "epoch": 0.5601386744945214, + "grad_norm": 3.6647212505340576, + "learning_rate": 4.129738136472331e-06, + "loss": 0.168, + "step": 22135 + }, + { + "epoch": 0.560163980059215, + "grad_norm": 4.5914130210876465, + "learning_rate": 4.1293427454199724e-06, + "loss": 0.1527, + "step": 22136 + }, + { + "epoch": 0.5601892856239087, + "grad_norm": 4.784872055053711, + "learning_rate": 4.12894735998232e-06, + "loss": 0.1794, + "step": 22137 + }, + { + "epoch": 0.5602145911886024, + "grad_norm": 2.612675428390503, + "learning_rate": 4.128551980161927e-06, + "loss": 0.0596, + "step": 22138 + }, + { + "epoch": 0.560239896753296, + "grad_norm": 3.9732491970062256, + "learning_rate": 4.128156605961337e-06, + "loss": 0.121, + "step": 22139 + }, + { + "epoch": 0.5602652023179897, + "grad_norm": 6.0054240226745605, + "learning_rate": 4.127761237383104e-06, + "loss": 0.1788, + "step": 22140 + }, + { + "epoch": 0.5602905078826834, + "grad_norm": 21.06667709350586, + "learning_rate": 4.127365874429776e-06, + "loss": 0.2026, + "step": 22141 + }, + { + "epoch": 0.5603158134473771, + "grad_norm": 3.385106086730957, + "learning_rate": 4.126970517103906e-06, + "loss": 0.1375, + "step": 22142 + }, + { + "epoch": 0.5603411190120707, + "grad_norm": 3.193471908569336, + "learning_rate": 4.126575165408038e-06, + "loss": 0.0865, + "step": 22143 + }, + { + "epoch": 0.5603664245767644, + "grad_norm": 14.246475219726562, + "learning_rate": 4.126179819344725e-06, + "loss": 0.3469, + "step": 22144 + }, + { + "epoch": 0.5603917301414582, + "grad_norm": 7.034088611602783, + "learning_rate": 4.125784478916514e-06, + "loss": 0.2399, + "step": 22145 + }, + { + "epoch": 0.5604170357061518, + "grad_norm": 6.026848316192627, + "learning_rate": 4.12538914412596e-06, + "loss": 0.1822, + "step": 22146 + }, + { + "epoch": 0.5604423412708455, + "grad_norm": 3.441263437271118, + "learning_rate": 4.124993814975606e-06, + "loss": 0.1, + "step": 22147 + }, + { + "epoch": 0.5604676468355392, + "grad_norm": 12.302330017089844, + "learning_rate": 4.1245984914680035e-06, + "loss": 0.1752, + "step": 22148 + }, + { + "epoch": 0.5604929524002328, + "grad_norm": 4.826718330383301, + "learning_rate": 4.124203173605704e-06, + "loss": 0.1087, + "step": 22149 + }, + { + "epoch": 0.5605182579649265, + "grad_norm": 5.8596954345703125, + "learning_rate": 4.1238078613912545e-06, + "loss": 0.2191, + "step": 22150 + }, + { + "epoch": 0.5605435635296202, + "grad_norm": 4.979241371154785, + "learning_rate": 4.123412554827207e-06, + "loss": 0.1785, + "step": 22151 + }, + { + "epoch": 0.5605688690943138, + "grad_norm": 3.78342342376709, + "learning_rate": 4.123017253916107e-06, + "loss": 0.1555, + "step": 22152 + }, + { + "epoch": 0.5605941746590075, + "grad_norm": 3.6271605491638184, + "learning_rate": 4.122621958660506e-06, + "loss": 0.1135, + "step": 22153 + }, + { + "epoch": 0.5606194802237012, + "grad_norm": 5.8369221687316895, + "learning_rate": 4.122226669062953e-06, + "loss": 0.19, + "step": 22154 + }, + { + "epoch": 0.5606447857883948, + "grad_norm": 4.499162673950195, + "learning_rate": 4.121831385125997e-06, + "loss": 0.1009, + "step": 22155 + }, + { + "epoch": 0.5606700913530885, + "grad_norm": 5.890720844268799, + "learning_rate": 4.121436106852187e-06, + "loss": 0.1541, + "step": 22156 + }, + { + "epoch": 0.5606953969177823, + "grad_norm": 8.331056594848633, + "learning_rate": 4.121040834244071e-06, + "loss": 0.1695, + "step": 22157 + }, + { + "epoch": 0.5607207024824759, + "grad_norm": 3.608689069747925, + "learning_rate": 4.1206455673042015e-06, + "loss": 0.1572, + "step": 22158 + }, + { + "epoch": 0.5607460080471696, + "grad_norm": 7.232541561126709, + "learning_rate": 4.120250306035123e-06, + "loss": 0.1734, + "step": 22159 + }, + { + "epoch": 0.5607713136118633, + "grad_norm": 5.458611965179443, + "learning_rate": 4.119855050439388e-06, + "loss": 0.1386, + "step": 22160 + }, + { + "epoch": 0.5607966191765569, + "grad_norm": 7.392058849334717, + "learning_rate": 4.119459800519544e-06, + "loss": 0.2387, + "step": 22161 + }, + { + "epoch": 0.5608219247412506, + "grad_norm": 16.105758666992188, + "learning_rate": 4.119064556278138e-06, + "loss": 0.214, + "step": 22162 + }, + { + "epoch": 0.5608472303059443, + "grad_norm": 9.992155075073242, + "learning_rate": 4.118669317717722e-06, + "loss": 0.2321, + "step": 22163 + }, + { + "epoch": 0.5608725358706379, + "grad_norm": 3.8034961223602295, + "learning_rate": 4.118274084840846e-06, + "loss": 0.1406, + "step": 22164 + }, + { + "epoch": 0.5608978414353316, + "grad_norm": 5.417398452758789, + "learning_rate": 4.117878857650054e-06, + "loss": 0.1423, + "step": 22165 + }, + { + "epoch": 0.5609231470000253, + "grad_norm": 3.202575206756592, + "learning_rate": 4.1174836361478965e-06, + "loss": 0.1193, + "step": 22166 + }, + { + "epoch": 0.560948452564719, + "grad_norm": 3.390833854675293, + "learning_rate": 4.117088420336923e-06, + "loss": 0.1289, + "step": 22167 + }, + { + "epoch": 0.5609737581294126, + "grad_norm": 3.7234911918640137, + "learning_rate": 4.116693210219685e-06, + "loss": 0.1171, + "step": 22168 + }, + { + "epoch": 0.5609990636941063, + "grad_norm": 14.477563858032227, + "learning_rate": 4.116298005798726e-06, + "loss": 0.3163, + "step": 22169 + }, + { + "epoch": 0.5610243692588001, + "grad_norm": 4.5626630783081055, + "learning_rate": 4.115902807076596e-06, + "loss": 0.1455, + "step": 22170 + }, + { + "epoch": 0.5610496748234937, + "grad_norm": 8.81622314453125, + "learning_rate": 4.1155076140558446e-06, + "loss": 0.2257, + "step": 22171 + }, + { + "epoch": 0.5610749803881874, + "grad_norm": 5.875758171081543, + "learning_rate": 4.11511242673902e-06, + "loss": 0.1266, + "step": 22172 + }, + { + "epoch": 0.5611002859528811, + "grad_norm": 7.508269309997559, + "learning_rate": 4.114717245128673e-06, + "loss": 0.2936, + "step": 22173 + }, + { + "epoch": 0.5611255915175747, + "grad_norm": 9.211343765258789, + "learning_rate": 4.114322069227348e-06, + "loss": 0.2344, + "step": 22174 + }, + { + "epoch": 0.5611508970822684, + "grad_norm": 5.378499984741211, + "learning_rate": 4.113926899037595e-06, + "loss": 0.1957, + "step": 22175 + }, + { + "epoch": 0.5611762026469621, + "grad_norm": 5.474784851074219, + "learning_rate": 4.113531734561962e-06, + "loss": 0.1818, + "step": 22176 + }, + { + "epoch": 0.5612015082116557, + "grad_norm": 9.209005355834961, + "learning_rate": 4.1131365758030015e-06, + "loss": 0.2205, + "step": 22177 + }, + { + "epoch": 0.5612268137763494, + "grad_norm": 4.041999340057373, + "learning_rate": 4.112741422763255e-06, + "loss": 0.1789, + "step": 22178 + }, + { + "epoch": 0.5612521193410431, + "grad_norm": 4.253803253173828, + "learning_rate": 4.112346275445274e-06, + "loss": 0.1493, + "step": 22179 + }, + { + "epoch": 0.5612774249057367, + "grad_norm": 4.150609970092773, + "learning_rate": 4.111951133851607e-06, + "loss": 0.1511, + "step": 22180 + }, + { + "epoch": 0.5613027304704304, + "grad_norm": 4.70720100402832, + "learning_rate": 4.111555997984802e-06, + "loss": 0.1778, + "step": 22181 + }, + { + "epoch": 0.5613280360351242, + "grad_norm": 8.662798881530762, + "learning_rate": 4.111160867847409e-06, + "loss": 0.1872, + "step": 22182 + }, + { + "epoch": 0.5613533415998178, + "grad_norm": 6.284822463989258, + "learning_rate": 4.110765743441972e-06, + "loss": 0.203, + "step": 22183 + }, + { + "epoch": 0.5613786471645115, + "grad_norm": 4.363517761230469, + "learning_rate": 4.110370624771042e-06, + "loss": 0.1366, + "step": 22184 + }, + { + "epoch": 0.5614039527292052, + "grad_norm": 6.550875186920166, + "learning_rate": 4.109975511837168e-06, + "loss": 0.1737, + "step": 22185 + }, + { + "epoch": 0.5614292582938988, + "grad_norm": 3.698136806488037, + "learning_rate": 4.109580404642895e-06, + "loss": 0.1193, + "step": 22186 + }, + { + "epoch": 0.5614545638585925, + "grad_norm": 5.206004619598389, + "learning_rate": 4.1091853031907714e-06, + "loss": 0.1357, + "step": 22187 + }, + { + "epoch": 0.5614798694232862, + "grad_norm": 4.1455464363098145, + "learning_rate": 4.108790207483348e-06, + "loss": 0.179, + "step": 22188 + }, + { + "epoch": 0.5615051749879798, + "grad_norm": 3.8409485816955566, + "learning_rate": 4.1083951175231685e-06, + "loss": 0.201, + "step": 22189 + }, + { + "epoch": 0.5615304805526735, + "grad_norm": 3.5437228679656982, + "learning_rate": 4.108000033312785e-06, + "loss": 0.1727, + "step": 22190 + }, + { + "epoch": 0.5615557861173672, + "grad_norm": 7.7666778564453125, + "learning_rate": 4.107604954854743e-06, + "loss": 0.1561, + "step": 22191 + }, + { + "epoch": 0.5615810916820609, + "grad_norm": 6.458738803863525, + "learning_rate": 4.1072098821515896e-06, + "loss": 0.221, + "step": 22192 + }, + { + "epoch": 0.5616063972467545, + "grad_norm": 4.23337459564209, + "learning_rate": 4.106814815205873e-06, + "loss": 0.1679, + "step": 22193 + }, + { + "epoch": 0.5616317028114483, + "grad_norm": 6.431434631347656, + "learning_rate": 4.106419754020143e-06, + "loss": 0.1857, + "step": 22194 + }, + { + "epoch": 0.561657008376142, + "grad_norm": 7.072586536407471, + "learning_rate": 4.106024698596947e-06, + "loss": 0.1535, + "step": 22195 + }, + { + "epoch": 0.5616823139408356, + "grad_norm": 7.087721347808838, + "learning_rate": 4.105629648938829e-06, + "loss": 0.2726, + "step": 22196 + }, + { + "epoch": 0.5617076195055293, + "grad_norm": 5.957972526550293, + "learning_rate": 4.10523460504834e-06, + "loss": 0.1833, + "step": 22197 + }, + { + "epoch": 0.561732925070223, + "grad_norm": 6.494944095611572, + "learning_rate": 4.104839566928025e-06, + "loss": 0.1929, + "step": 22198 + }, + { + "epoch": 0.5617582306349166, + "grad_norm": 11.250679016113281, + "learning_rate": 4.104444534580437e-06, + "loss": 0.1927, + "step": 22199 + }, + { + "epoch": 0.5617835361996103, + "grad_norm": 6.858208656311035, + "learning_rate": 4.104049508008117e-06, + "loss": 0.2366, + "step": 22200 + }, + { + "epoch": 0.561808841764304, + "grad_norm": 2.772155523300171, + "learning_rate": 4.103654487213615e-06, + "loss": 0.126, + "step": 22201 + }, + { + "epoch": 0.5618341473289976, + "grad_norm": 2.4337692260742188, + "learning_rate": 4.103259472199478e-06, + "loss": 0.0984, + "step": 22202 + }, + { + "epoch": 0.5618594528936913, + "grad_norm": 2.9765467643737793, + "learning_rate": 4.102864462968254e-06, + "loss": 0.1388, + "step": 22203 + }, + { + "epoch": 0.561884758458385, + "grad_norm": 1.9812238216400146, + "learning_rate": 4.102469459522493e-06, + "loss": 0.1399, + "step": 22204 + }, + { + "epoch": 0.5619100640230786, + "grad_norm": 2.867063522338867, + "learning_rate": 4.1020744618647364e-06, + "loss": 0.1553, + "step": 22205 + }, + { + "epoch": 0.5619353695877723, + "grad_norm": 11.788814544677734, + "learning_rate": 4.101679469997535e-06, + "loss": 0.2096, + "step": 22206 + }, + { + "epoch": 0.5619606751524661, + "grad_norm": 5.204395294189453, + "learning_rate": 4.101284483923436e-06, + "loss": 0.1585, + "step": 22207 + }, + { + "epoch": 0.5619859807171597, + "grad_norm": 4.20283842086792, + "learning_rate": 4.100889503644987e-06, + "loss": 0.1316, + "step": 22208 + }, + { + "epoch": 0.5620112862818534, + "grad_norm": 6.005425453186035, + "learning_rate": 4.100494529164733e-06, + "loss": 0.1715, + "step": 22209 + }, + { + "epoch": 0.5620365918465471, + "grad_norm": 3.178230047225952, + "learning_rate": 4.100099560485224e-06, + "loss": 0.0727, + "step": 22210 + }, + { + "epoch": 0.5620618974112407, + "grad_norm": 5.458066463470459, + "learning_rate": 4.0997045976090035e-06, + "loss": 0.187, + "step": 22211 + }, + { + "epoch": 0.5620872029759344, + "grad_norm": 3.6892507076263428, + "learning_rate": 4.0993096405386224e-06, + "loss": 0.1829, + "step": 22212 + }, + { + "epoch": 0.5621125085406281, + "grad_norm": 5.7086005210876465, + "learning_rate": 4.098914689276626e-06, + "loss": 0.2085, + "step": 22213 + }, + { + "epoch": 0.5621378141053217, + "grad_norm": 3.3480141162872314, + "learning_rate": 4.09851974382556e-06, + "loss": 0.1364, + "step": 22214 + }, + { + "epoch": 0.5621631196700154, + "grad_norm": 7.48739767074585, + "learning_rate": 4.098124804187973e-06, + "loss": 0.2703, + "step": 22215 + }, + { + "epoch": 0.5621884252347091, + "grad_norm": 6.092892169952393, + "learning_rate": 4.09772987036641e-06, + "loss": 0.1686, + "step": 22216 + }, + { + "epoch": 0.5622137307994027, + "grad_norm": 3.027172565460205, + "learning_rate": 4.097334942363422e-06, + "loss": 0.1258, + "step": 22217 + }, + { + "epoch": 0.5622390363640964, + "grad_norm": 7.451874732971191, + "learning_rate": 4.096940020181551e-06, + "loss": 0.1616, + "step": 22218 + }, + { + "epoch": 0.5622643419287902, + "grad_norm": 4.322079181671143, + "learning_rate": 4.096545103823347e-06, + "loss": 0.1038, + "step": 22219 + }, + { + "epoch": 0.5622896474934839, + "grad_norm": 7.165099620819092, + "learning_rate": 4.096150193291354e-06, + "loss": 0.2116, + "step": 22220 + }, + { + "epoch": 0.5623149530581775, + "grad_norm": 10.021076202392578, + "learning_rate": 4.0957552885881215e-06, + "loss": 0.1375, + "step": 22221 + }, + { + "epoch": 0.5623402586228712, + "grad_norm": 3.316128969192505, + "learning_rate": 4.095360389716195e-06, + "loss": 0.108, + "step": 22222 + }, + { + "epoch": 0.5623655641875649, + "grad_norm": 11.315988540649414, + "learning_rate": 4.09496549667812e-06, + "loss": 0.2727, + "step": 22223 + }, + { + "epoch": 0.5623908697522585, + "grad_norm": 4.594188213348389, + "learning_rate": 4.094570609476445e-06, + "loss": 0.1111, + "step": 22224 + }, + { + "epoch": 0.5624161753169522, + "grad_norm": 5.994927406311035, + "learning_rate": 4.094175728113715e-06, + "loss": 0.2033, + "step": 22225 + }, + { + "epoch": 0.5624414808816459, + "grad_norm": 9.256926536560059, + "learning_rate": 4.093780852592479e-06, + "loss": 0.1835, + "step": 22226 + }, + { + "epoch": 0.5624667864463395, + "grad_norm": 5.319876194000244, + "learning_rate": 4.093385982915281e-06, + "loss": 0.1751, + "step": 22227 + }, + { + "epoch": 0.5624920920110332, + "grad_norm": 5.0219950675964355, + "learning_rate": 4.092991119084667e-06, + "loss": 0.2163, + "step": 22228 + }, + { + "epoch": 0.5625173975757269, + "grad_norm": 6.187098026275635, + "learning_rate": 4.092596261103184e-06, + "loss": 0.1765, + "step": 22229 + }, + { + "epoch": 0.5625427031404205, + "grad_norm": 5.618900299072266, + "learning_rate": 4.092201408973383e-06, + "loss": 0.2067, + "step": 22230 + }, + { + "epoch": 0.5625680087051143, + "grad_norm": 4.1696624755859375, + "learning_rate": 4.091806562697802e-06, + "loss": 0.1441, + "step": 22231 + }, + { + "epoch": 0.562593314269808, + "grad_norm": 6.894949436187744, + "learning_rate": 4.091411722278993e-06, + "loss": 0.1987, + "step": 22232 + }, + { + "epoch": 0.5626186198345016, + "grad_norm": 6.653947353363037, + "learning_rate": 4.0910168877195e-06, + "loss": 0.1165, + "step": 22233 + }, + { + "epoch": 0.5626439253991953, + "grad_norm": 8.067154884338379, + "learning_rate": 4.09062205902187e-06, + "loss": 0.1981, + "step": 22234 + }, + { + "epoch": 0.562669230963889, + "grad_norm": 7.730200290679932, + "learning_rate": 4.0902272361886515e-06, + "loss": 0.263, + "step": 22235 + }, + { + "epoch": 0.5626945365285826, + "grad_norm": 4.365321159362793, + "learning_rate": 4.089832419222386e-06, + "loss": 0.2268, + "step": 22236 + }, + { + "epoch": 0.5627198420932763, + "grad_norm": 6.183210372924805, + "learning_rate": 4.089437608125623e-06, + "loss": 0.1032, + "step": 22237 + }, + { + "epoch": 0.56274514765797, + "grad_norm": 5.631492614746094, + "learning_rate": 4.089042802900906e-06, + "loss": 0.235, + "step": 22238 + }, + { + "epoch": 0.5627704532226636, + "grad_norm": 3.6157455444335938, + "learning_rate": 4.088648003550786e-06, + "loss": 0.1005, + "step": 22239 + }, + { + "epoch": 0.5627957587873573, + "grad_norm": 8.029330253601074, + "learning_rate": 4.088253210077802e-06, + "loss": 0.195, + "step": 22240 + }, + { + "epoch": 0.562821064352051, + "grad_norm": 3.0388150215148926, + "learning_rate": 4.087858422484504e-06, + "loss": 0.0896, + "step": 22241 + }, + { + "epoch": 0.5628463699167446, + "grad_norm": 3.2607476711273193, + "learning_rate": 4.087463640773438e-06, + "loss": 0.1717, + "step": 22242 + }, + { + "epoch": 0.5628716754814383, + "grad_norm": 2.719971179962158, + "learning_rate": 4.0870688649471505e-06, + "loss": 0.1174, + "step": 22243 + }, + { + "epoch": 0.5628969810461321, + "grad_norm": 9.85988712310791, + "learning_rate": 4.086674095008185e-06, + "loss": 0.2139, + "step": 22244 + }, + { + "epoch": 0.5629222866108258, + "grad_norm": 4.213932991027832, + "learning_rate": 4.086279330959087e-06, + "loss": 0.1082, + "step": 22245 + }, + { + "epoch": 0.5629475921755194, + "grad_norm": 4.47893762588501, + "learning_rate": 4.085884572802406e-06, + "loss": 0.1605, + "step": 22246 + }, + { + "epoch": 0.5629728977402131, + "grad_norm": 2.6180062294006348, + "learning_rate": 4.085489820540683e-06, + "loss": 0.1151, + "step": 22247 + }, + { + "epoch": 0.5629982033049068, + "grad_norm": 6.4073028564453125, + "learning_rate": 4.085095074176468e-06, + "loss": 0.1888, + "step": 22248 + }, + { + "epoch": 0.5630235088696004, + "grad_norm": 5.8604512214660645, + "learning_rate": 4.084700333712304e-06, + "loss": 0.1613, + "step": 22249 + }, + { + "epoch": 0.5630488144342941, + "grad_norm": 8.056396484375, + "learning_rate": 4.084305599150737e-06, + "loss": 0.2328, + "step": 22250 + }, + { + "epoch": 0.5630741199989878, + "grad_norm": 4.468459606170654, + "learning_rate": 4.083910870494312e-06, + "loss": 0.1359, + "step": 22251 + }, + { + "epoch": 0.5630994255636814, + "grad_norm": 5.921441555023193, + "learning_rate": 4.083516147745579e-06, + "loss": 0.1766, + "step": 22252 + }, + { + "epoch": 0.5631247311283751, + "grad_norm": 6.0810136795043945, + "learning_rate": 4.0831214309070765e-06, + "loss": 0.1319, + "step": 22253 + }, + { + "epoch": 0.5631500366930688, + "grad_norm": 2.7001779079437256, + "learning_rate": 4.082726719981354e-06, + "loss": 0.1102, + "step": 22254 + }, + { + "epoch": 0.5631753422577624, + "grad_norm": 4.196385860443115, + "learning_rate": 4.082332014970956e-06, + "loss": 0.1096, + "step": 22255 + }, + { + "epoch": 0.5632006478224562, + "grad_norm": 2.254270315170288, + "learning_rate": 4.081937315878427e-06, + "loss": 0.059, + "step": 22256 + }, + { + "epoch": 0.5632259533871499, + "grad_norm": 8.545982360839844, + "learning_rate": 4.081542622706317e-06, + "loss": 0.2299, + "step": 22257 + }, + { + "epoch": 0.5632512589518435, + "grad_norm": 3.468930721282959, + "learning_rate": 4.081147935457165e-06, + "loss": 0.0753, + "step": 22258 + }, + { + "epoch": 0.5632765645165372, + "grad_norm": 4.504996299743652, + "learning_rate": 4.0807532541335195e-06, + "loss": 0.1963, + "step": 22259 + }, + { + "epoch": 0.5633018700812309, + "grad_norm": 5.695262432098389, + "learning_rate": 4.080358578737925e-06, + "loss": 0.1917, + "step": 22260 + }, + { + "epoch": 0.5633271756459245, + "grad_norm": 3.4979135990142822, + "learning_rate": 4.0799639092729294e-06, + "loss": 0.1105, + "step": 22261 + }, + { + "epoch": 0.5633524812106182, + "grad_norm": 5.803600311279297, + "learning_rate": 4.0795692457410725e-06, + "loss": 0.1917, + "step": 22262 + }, + { + "epoch": 0.5633777867753119, + "grad_norm": 3.514847993850708, + "learning_rate": 4.079174588144903e-06, + "loss": 0.1297, + "step": 22263 + }, + { + "epoch": 0.5634030923400055, + "grad_norm": 8.227924346923828, + "learning_rate": 4.078779936486966e-06, + "loss": 0.1883, + "step": 22264 + }, + { + "epoch": 0.5634283979046992, + "grad_norm": 10.02279281616211, + "learning_rate": 4.078385290769804e-06, + "loss": 0.2961, + "step": 22265 + }, + { + "epoch": 0.5634537034693929, + "grad_norm": 9.297335624694824, + "learning_rate": 4.077990650995968e-06, + "loss": 0.3944, + "step": 22266 + }, + { + "epoch": 0.5634790090340865, + "grad_norm": 3.4259746074676514, + "learning_rate": 4.077596017167995e-06, + "loss": 0.1149, + "step": 22267 + }, + { + "epoch": 0.5635043145987803, + "grad_norm": 3.4942994117736816, + "learning_rate": 4.077201389288435e-06, + "loss": 0.1056, + "step": 22268 + }, + { + "epoch": 0.563529620163474, + "grad_norm": 8.140606880187988, + "learning_rate": 4.076806767359831e-06, + "loss": 0.1874, + "step": 22269 + }, + { + "epoch": 0.5635549257281677, + "grad_norm": 3.5126254558563232, + "learning_rate": 4.07641215138473e-06, + "loss": 0.2129, + "step": 22270 + }, + { + "epoch": 0.5635802312928613, + "grad_norm": 3.1478519439697266, + "learning_rate": 4.076017541365673e-06, + "loss": 0.1388, + "step": 22271 + }, + { + "epoch": 0.563605536857555, + "grad_norm": 4.509250164031982, + "learning_rate": 4.075622937305208e-06, + "loss": 0.1838, + "step": 22272 + }, + { + "epoch": 0.5636308424222487, + "grad_norm": 5.409947872161865, + "learning_rate": 4.07522833920588e-06, + "loss": 0.1937, + "step": 22273 + }, + { + "epoch": 0.5636561479869423, + "grad_norm": 3.717946767807007, + "learning_rate": 4.074833747070232e-06, + "loss": 0.108, + "step": 22274 + }, + { + "epoch": 0.563681453551636, + "grad_norm": 5.511497974395752, + "learning_rate": 4.074439160900807e-06, + "loss": 0.2065, + "step": 22275 + }, + { + "epoch": 0.5637067591163297, + "grad_norm": 3.2168734073638916, + "learning_rate": 4.074044580700154e-06, + "loss": 0.1026, + "step": 22276 + }, + { + "epoch": 0.5637320646810233, + "grad_norm": 3.797689914703369, + "learning_rate": 4.073650006470814e-06, + "loss": 0.1435, + "step": 22277 + }, + { + "epoch": 0.563757370245717, + "grad_norm": 4.817898273468018, + "learning_rate": 4.073255438215332e-06, + "loss": 0.1857, + "step": 22278 + }, + { + "epoch": 0.5637826758104107, + "grad_norm": 7.165376663208008, + "learning_rate": 4.072860875936254e-06, + "loss": 0.1675, + "step": 22279 + }, + { + "epoch": 0.5638079813751044, + "grad_norm": 2.8039746284484863, + "learning_rate": 4.072466319636125e-06, + "loss": 0.1569, + "step": 22280 + }, + { + "epoch": 0.5638332869397981, + "grad_norm": 3.6644065380096436, + "learning_rate": 4.072071769317487e-06, + "loss": 0.1643, + "step": 22281 + }, + { + "epoch": 0.5638585925044918, + "grad_norm": 4.833334445953369, + "learning_rate": 4.071677224982885e-06, + "loss": 0.0913, + "step": 22282 + }, + { + "epoch": 0.5638838980691854, + "grad_norm": 4.415833473205566, + "learning_rate": 4.071282686634867e-06, + "loss": 0.1303, + "step": 22283 + }, + { + "epoch": 0.5639092036338791, + "grad_norm": 3.2124626636505127, + "learning_rate": 4.070888154275971e-06, + "loss": 0.11, + "step": 22284 + }, + { + "epoch": 0.5639345091985728, + "grad_norm": 10.549431800842285, + "learning_rate": 4.0704936279087445e-06, + "loss": 0.177, + "step": 22285 + }, + { + "epoch": 0.5639598147632664, + "grad_norm": 6.293665409088135, + "learning_rate": 4.070099107535732e-06, + "loss": 0.3067, + "step": 22286 + }, + { + "epoch": 0.5639851203279601, + "grad_norm": 2.7289958000183105, + "learning_rate": 4.069704593159478e-06, + "loss": 0.1374, + "step": 22287 + }, + { + "epoch": 0.5640104258926538, + "grad_norm": 7.434051036834717, + "learning_rate": 4.069310084782527e-06, + "loss": 0.2411, + "step": 22288 + }, + { + "epoch": 0.5640357314573474, + "grad_norm": 6.015534400939941, + "learning_rate": 4.068915582407421e-06, + "loss": 0.1998, + "step": 22289 + }, + { + "epoch": 0.5640610370220411, + "grad_norm": 3.6382546424865723, + "learning_rate": 4.068521086036705e-06, + "loss": 0.1669, + "step": 22290 + }, + { + "epoch": 0.5640863425867348, + "grad_norm": 9.38701343536377, + "learning_rate": 4.068126595672923e-06, + "loss": 0.2575, + "step": 22291 + }, + { + "epoch": 0.5641116481514284, + "grad_norm": 5.020402431488037, + "learning_rate": 4.067732111318621e-06, + "loss": 0.1808, + "step": 22292 + }, + { + "epoch": 0.5641369537161222, + "grad_norm": 6.340639591217041, + "learning_rate": 4.0673376329763394e-06, + "loss": 0.1638, + "step": 22293 + }, + { + "epoch": 0.5641622592808159, + "grad_norm": 4.015842437744141, + "learning_rate": 4.066943160648625e-06, + "loss": 0.137, + "step": 22294 + }, + { + "epoch": 0.5641875648455096, + "grad_norm": 7.0381879806518555, + "learning_rate": 4.066548694338019e-06, + "loss": 0.1466, + "step": 22295 + }, + { + "epoch": 0.5642128704102032, + "grad_norm": 15.467839241027832, + "learning_rate": 4.0661542340470695e-06, + "loss": 0.2491, + "step": 22296 + }, + { + "epoch": 0.5642381759748969, + "grad_norm": 5.727262020111084, + "learning_rate": 4.065759779778316e-06, + "loss": 0.1988, + "step": 22297 + }, + { + "epoch": 0.5642634815395906, + "grad_norm": 7.6690592765808105, + "learning_rate": 4.065365331534303e-06, + "loss": 0.1996, + "step": 22298 + }, + { + "epoch": 0.5642887871042842, + "grad_norm": 6.4212870597839355, + "learning_rate": 4.064970889317576e-06, + "loss": 0.2079, + "step": 22299 + }, + { + "epoch": 0.5643140926689779, + "grad_norm": 15.84340763092041, + "learning_rate": 4.064576453130679e-06, + "loss": 0.2303, + "step": 22300 + }, + { + "epoch": 0.5643393982336716, + "grad_norm": 6.194307327270508, + "learning_rate": 4.064182022976153e-06, + "loss": 0.189, + "step": 22301 + }, + { + "epoch": 0.5643647037983652, + "grad_norm": 4.789233684539795, + "learning_rate": 4.0637875988565435e-06, + "loss": 0.1654, + "step": 22302 + }, + { + "epoch": 0.5643900093630589, + "grad_norm": 2.8051257133483887, + "learning_rate": 4.063393180774394e-06, + "loss": 0.15, + "step": 22303 + }, + { + "epoch": 0.5644153149277527, + "grad_norm": 15.194445610046387, + "learning_rate": 4.062998768732246e-06, + "loss": 0.2224, + "step": 22304 + }, + { + "epoch": 0.5644406204924463, + "grad_norm": 4.014070987701416, + "learning_rate": 4.062604362732647e-06, + "loss": 0.1376, + "step": 22305 + }, + { + "epoch": 0.56446592605714, + "grad_norm": 3.102783679962158, + "learning_rate": 4.062209962778136e-06, + "loss": 0.1488, + "step": 22306 + }, + { + "epoch": 0.5644912316218337, + "grad_norm": 6.732329845428467, + "learning_rate": 4.06181556887126e-06, + "loss": 0.1768, + "step": 22307 + }, + { + "epoch": 0.5645165371865273, + "grad_norm": 6.736544609069824, + "learning_rate": 4.061421181014561e-06, + "loss": 0.1292, + "step": 22308 + }, + { + "epoch": 0.564541842751221, + "grad_norm": 3.67790150642395, + "learning_rate": 4.06102679921058e-06, + "loss": 0.1728, + "step": 22309 + }, + { + "epoch": 0.5645671483159147, + "grad_norm": 5.978644371032715, + "learning_rate": 4.060632423461866e-06, + "loss": 0.1498, + "step": 22310 + }, + { + "epoch": 0.5645924538806083, + "grad_norm": 4.256902694702148, + "learning_rate": 4.060238053770956e-06, + "loss": 0.184, + "step": 22311 + }, + { + "epoch": 0.564617759445302, + "grad_norm": 5.531790733337402, + "learning_rate": 4.059843690140396e-06, + "loss": 0.152, + "step": 22312 + }, + { + "epoch": 0.5646430650099957, + "grad_norm": 6.4957051277160645, + "learning_rate": 4.05944933257273e-06, + "loss": 0.1889, + "step": 22313 + }, + { + "epoch": 0.5646683705746893, + "grad_norm": 2.63803768157959, + "learning_rate": 4.059054981070502e-06, + "loss": 0.1235, + "step": 22314 + }, + { + "epoch": 0.564693676139383, + "grad_norm": 10.260210037231445, + "learning_rate": 4.058660635636252e-06, + "loss": 0.1893, + "step": 22315 + }, + { + "epoch": 0.5647189817040768, + "grad_norm": 3.0987389087677, + "learning_rate": 4.058266296272523e-06, + "loss": 0.1439, + "step": 22316 + }, + { + "epoch": 0.5647442872687704, + "grad_norm": 11.032663345336914, + "learning_rate": 4.05787196298186e-06, + "loss": 0.2481, + "step": 22317 + }, + { + "epoch": 0.5647695928334641, + "grad_norm": 4.506087779998779, + "learning_rate": 4.057477635766808e-06, + "loss": 0.1383, + "step": 22318 + }, + { + "epoch": 0.5647948983981578, + "grad_norm": 4.243635654449463, + "learning_rate": 4.0570833146299044e-06, + "loss": 0.1298, + "step": 22319 + }, + { + "epoch": 0.5648202039628515, + "grad_norm": 4.006288051605225, + "learning_rate": 4.0566889995736955e-06, + "loss": 0.2158, + "step": 22320 + }, + { + "epoch": 0.5648455095275451, + "grad_norm": 6.385106563568115, + "learning_rate": 4.056294690600724e-06, + "loss": 0.1315, + "step": 22321 + }, + { + "epoch": 0.5648708150922388, + "grad_norm": 5.04811954498291, + "learning_rate": 4.055900387713533e-06, + "loss": 0.1544, + "step": 22322 + }, + { + "epoch": 0.5648961206569325, + "grad_norm": 11.432209968566895, + "learning_rate": 4.055506090914666e-06, + "loss": 0.2019, + "step": 22323 + }, + { + "epoch": 0.5649214262216261, + "grad_norm": 4.443868637084961, + "learning_rate": 4.0551118002066625e-06, + "loss": 0.123, + "step": 22324 + }, + { + "epoch": 0.5649467317863198, + "grad_norm": 6.772681713104248, + "learning_rate": 4.054717515592068e-06, + "loss": 0.1402, + "step": 22325 + }, + { + "epoch": 0.5649720373510135, + "grad_norm": 3.8235225677490234, + "learning_rate": 4.054323237073424e-06, + "loss": 0.1387, + "step": 22326 + }, + { + "epoch": 0.5649973429157071, + "grad_norm": 4.106703758239746, + "learning_rate": 4.053928964653276e-06, + "loss": 0.1343, + "step": 22327 + }, + { + "epoch": 0.5650226484804008, + "grad_norm": 3.6174416542053223, + "learning_rate": 4.053534698334161e-06, + "loss": 0.1248, + "step": 22328 + }, + { + "epoch": 0.5650479540450946, + "grad_norm": 2.9295754432678223, + "learning_rate": 4.053140438118625e-06, + "loss": 0.1279, + "step": 22329 + }, + { + "epoch": 0.5650732596097882, + "grad_norm": 3.7238430976867676, + "learning_rate": 4.052746184009211e-06, + "loss": 0.1175, + "step": 22330 + }, + { + "epoch": 0.5650985651744819, + "grad_norm": 3.6105408668518066, + "learning_rate": 4.052351936008461e-06, + "loss": 0.2049, + "step": 22331 + }, + { + "epoch": 0.5651238707391756, + "grad_norm": 9.417984008789062, + "learning_rate": 4.051957694118916e-06, + "loss": 0.318, + "step": 22332 + }, + { + "epoch": 0.5651491763038692, + "grad_norm": 4.15608549118042, + "learning_rate": 4.051563458343121e-06, + "loss": 0.1978, + "step": 22333 + }, + { + "epoch": 0.5651744818685629, + "grad_norm": 6.218521595001221, + "learning_rate": 4.051169228683616e-06, + "loss": 0.1827, + "step": 22334 + }, + { + "epoch": 0.5651997874332566, + "grad_norm": 5.633917808532715, + "learning_rate": 4.050775005142944e-06, + "loss": 0.1316, + "step": 22335 + }, + { + "epoch": 0.5652250929979502, + "grad_norm": 3.0641229152679443, + "learning_rate": 4.0503807877236474e-06, + "loss": 0.1756, + "step": 22336 + }, + { + "epoch": 0.5652503985626439, + "grad_norm": 6.653814315795898, + "learning_rate": 4.04998657642827e-06, + "loss": 0.1826, + "step": 22337 + }, + { + "epoch": 0.5652757041273376, + "grad_norm": 6.867492198944092, + "learning_rate": 4.04959237125935e-06, + "loss": 0.287, + "step": 22338 + }, + { + "epoch": 0.5653010096920312, + "grad_norm": 4.077813625335693, + "learning_rate": 4.049198172219432e-06, + "loss": 0.1084, + "step": 22339 + }, + { + "epoch": 0.5653263152567249, + "grad_norm": 4.401766777038574, + "learning_rate": 4.048803979311059e-06, + "loss": 0.1361, + "step": 22340 + }, + { + "epoch": 0.5653516208214187, + "grad_norm": 4.533334255218506, + "learning_rate": 4.048409792536774e-06, + "loss": 0.1496, + "step": 22341 + }, + { + "epoch": 0.5653769263861123, + "grad_norm": 4.0091094970703125, + "learning_rate": 4.048015611899115e-06, + "loss": 0.1769, + "step": 22342 + }, + { + "epoch": 0.565402231950806, + "grad_norm": 3.2853446006774902, + "learning_rate": 4.047621437400627e-06, + "loss": 0.2013, + "step": 22343 + }, + { + "epoch": 0.5654275375154997, + "grad_norm": 6.658202648162842, + "learning_rate": 4.047227269043851e-06, + "loss": 0.2571, + "step": 22344 + }, + { + "epoch": 0.5654528430801933, + "grad_norm": 2.7311336994171143, + "learning_rate": 4.04683310683133e-06, + "loss": 0.1152, + "step": 22345 + }, + { + "epoch": 0.565478148644887, + "grad_norm": 6.410682678222656, + "learning_rate": 4.046438950765603e-06, + "loss": 0.221, + "step": 22346 + }, + { + "epoch": 0.5655034542095807, + "grad_norm": 8.131943702697754, + "learning_rate": 4.0460448008492155e-06, + "loss": 0.2061, + "step": 22347 + }, + { + "epoch": 0.5655287597742744, + "grad_norm": 12.800643920898438, + "learning_rate": 4.045650657084707e-06, + "loss": 0.1697, + "step": 22348 + }, + { + "epoch": 0.565554065338968, + "grad_norm": 6.597640514373779, + "learning_rate": 4.045256519474621e-06, + "loss": 0.1432, + "step": 22349 + }, + { + "epoch": 0.5655793709036617, + "grad_norm": 5.314364910125732, + "learning_rate": 4.044862388021496e-06, + "loss": 0.189, + "step": 22350 + }, + { + "epoch": 0.5656046764683554, + "grad_norm": 5.785171031951904, + "learning_rate": 4.044468262727877e-06, + "loss": 0.1719, + "step": 22351 + }, + { + "epoch": 0.565629982033049, + "grad_norm": 2.6826236248016357, + "learning_rate": 4.0440741435963034e-06, + "loss": 0.146, + "step": 22352 + }, + { + "epoch": 0.5656552875977428, + "grad_norm": 4.079137802124023, + "learning_rate": 4.043680030629319e-06, + "loss": 0.1601, + "step": 22353 + }, + { + "epoch": 0.5656805931624365, + "grad_norm": 4.361372470855713, + "learning_rate": 4.043285923829465e-06, + "loss": 0.0911, + "step": 22354 + }, + { + "epoch": 0.5657058987271301, + "grad_norm": 36.876712799072266, + "learning_rate": 4.042891823199281e-06, + "loss": 0.1983, + "step": 22355 + }, + { + "epoch": 0.5657312042918238, + "grad_norm": 7.559956073760986, + "learning_rate": 4.042497728741309e-06, + "loss": 0.2623, + "step": 22356 + }, + { + "epoch": 0.5657565098565175, + "grad_norm": 3.620507001876831, + "learning_rate": 4.042103640458092e-06, + "loss": 0.1083, + "step": 22357 + }, + { + "epoch": 0.5657818154212111, + "grad_norm": 5.514686584472656, + "learning_rate": 4.041709558352172e-06, + "loss": 0.1175, + "step": 22358 + }, + { + "epoch": 0.5658071209859048, + "grad_norm": 8.89763069152832, + "learning_rate": 4.041315482426086e-06, + "loss": 0.1634, + "step": 22359 + }, + { + "epoch": 0.5658324265505985, + "grad_norm": 4.740691184997559, + "learning_rate": 4.040921412682378e-06, + "loss": 0.1089, + "step": 22360 + }, + { + "epoch": 0.5658577321152921, + "grad_norm": 5.84271240234375, + "learning_rate": 4.040527349123591e-06, + "loss": 0.1805, + "step": 22361 + }, + { + "epoch": 0.5658830376799858, + "grad_norm": 5.8487749099731445, + "learning_rate": 4.040133291752264e-06, + "loss": 0.1586, + "step": 22362 + }, + { + "epoch": 0.5659083432446795, + "grad_norm": 5.30709981918335, + "learning_rate": 4.03973924057094e-06, + "loss": 0.168, + "step": 22363 + }, + { + "epoch": 0.5659336488093731, + "grad_norm": 3.666250228881836, + "learning_rate": 4.039345195582158e-06, + "loss": 0.1844, + "step": 22364 + }, + { + "epoch": 0.5659589543740668, + "grad_norm": 8.943915367126465, + "learning_rate": 4.03895115678846e-06, + "loss": 0.3007, + "step": 22365 + }, + { + "epoch": 0.5659842599387606, + "grad_norm": 2.4510722160339355, + "learning_rate": 4.038557124192386e-06, + "loss": 0.1133, + "step": 22366 + }, + { + "epoch": 0.5660095655034542, + "grad_norm": 4.051772117614746, + "learning_rate": 4.038163097796481e-06, + "loss": 0.1406, + "step": 22367 + }, + { + "epoch": 0.5660348710681479, + "grad_norm": 5.688035011291504, + "learning_rate": 4.037769077603282e-06, + "loss": 0.1803, + "step": 22368 + }, + { + "epoch": 0.5660601766328416, + "grad_norm": 4.936009883880615, + "learning_rate": 4.037375063615331e-06, + "loss": 0.1352, + "step": 22369 + }, + { + "epoch": 0.5660854821975352, + "grad_norm": 3.44145131111145, + "learning_rate": 4.036981055835169e-06, + "loss": 0.1584, + "step": 22370 + }, + { + "epoch": 0.5661107877622289, + "grad_norm": 6.37210750579834, + "learning_rate": 4.0365870542653384e-06, + "loss": 0.1695, + "step": 22371 + }, + { + "epoch": 0.5661360933269226, + "grad_norm": 12.470413208007812, + "learning_rate": 4.036193058908377e-06, + "loss": 0.198, + "step": 22372 + }, + { + "epoch": 0.5661613988916163, + "grad_norm": 3.790562629699707, + "learning_rate": 4.035799069766828e-06, + "loss": 0.183, + "step": 22373 + }, + { + "epoch": 0.5661867044563099, + "grad_norm": 3.4434444904327393, + "learning_rate": 4.03540508684323e-06, + "loss": 0.113, + "step": 22374 + }, + { + "epoch": 0.5662120100210036, + "grad_norm": 5.695289611816406, + "learning_rate": 4.0350111101401265e-06, + "loss": 0.1096, + "step": 22375 + }, + { + "epoch": 0.5662373155856973, + "grad_norm": 8.72644329071045, + "learning_rate": 4.034617139660059e-06, + "loss": 0.2799, + "step": 22376 + }, + { + "epoch": 0.5662626211503909, + "grad_norm": 3.900857448577881, + "learning_rate": 4.034223175405563e-06, + "loss": 0.1807, + "step": 22377 + }, + { + "epoch": 0.5662879267150847, + "grad_norm": 9.731032371520996, + "learning_rate": 4.033829217379183e-06, + "loss": 0.2216, + "step": 22378 + }, + { + "epoch": 0.5663132322797784, + "grad_norm": 4.550444602966309, + "learning_rate": 4.033435265583457e-06, + "loss": 0.1683, + "step": 22379 + }, + { + "epoch": 0.566338537844472, + "grad_norm": 7.205100059509277, + "learning_rate": 4.03304132002093e-06, + "loss": 0.1739, + "step": 22380 + }, + { + "epoch": 0.5663638434091657, + "grad_norm": 10.815396308898926, + "learning_rate": 4.032647380694139e-06, + "loss": 0.1725, + "step": 22381 + }, + { + "epoch": 0.5663891489738594, + "grad_norm": 3.106245517730713, + "learning_rate": 4.032253447605624e-06, + "loss": 0.1252, + "step": 22382 + }, + { + "epoch": 0.566414454538553, + "grad_norm": 13.10019302368164, + "learning_rate": 4.031859520757927e-06, + "loss": 0.3796, + "step": 22383 + }, + { + "epoch": 0.5664397601032467, + "grad_norm": 3.673625946044922, + "learning_rate": 4.0314656001535866e-06, + "loss": 0.148, + "step": 22384 + }, + { + "epoch": 0.5664650656679404, + "grad_norm": 3.562365770339966, + "learning_rate": 4.031071685795147e-06, + "loss": 0.143, + "step": 22385 + }, + { + "epoch": 0.566490371232634, + "grad_norm": 9.139474868774414, + "learning_rate": 4.030677777685145e-06, + "loss": 0.1453, + "step": 22386 + }, + { + "epoch": 0.5665156767973277, + "grad_norm": 3.8986361026763916, + "learning_rate": 4.03028387582612e-06, + "loss": 0.1302, + "step": 22387 + }, + { + "epoch": 0.5665409823620214, + "grad_norm": 4.670619487762451, + "learning_rate": 4.029889980220616e-06, + "loss": 0.0869, + "step": 22388 + }, + { + "epoch": 0.566566287926715, + "grad_norm": 11.0419282913208, + "learning_rate": 4.02949609087117e-06, + "loss": 0.2061, + "step": 22389 + }, + { + "epoch": 0.5665915934914088, + "grad_norm": 8.865656852722168, + "learning_rate": 4.029102207780323e-06, + "loss": 0.2331, + "step": 22390 + }, + { + "epoch": 0.5666168990561025, + "grad_norm": 5.282229423522949, + "learning_rate": 4.028708330950616e-06, + "loss": 0.1754, + "step": 22391 + }, + { + "epoch": 0.5666422046207961, + "grad_norm": 3.581270217895508, + "learning_rate": 4.028314460384588e-06, + "loss": 0.121, + "step": 22392 + }, + { + "epoch": 0.5666675101854898, + "grad_norm": 4.834488868713379, + "learning_rate": 4.027920596084779e-06, + "loss": 0.2038, + "step": 22393 + }, + { + "epoch": 0.5666928157501835, + "grad_norm": 6.9885382652282715, + "learning_rate": 4.02752673805373e-06, + "loss": 0.2236, + "step": 22394 + }, + { + "epoch": 0.5667181213148771, + "grad_norm": 4.091000080108643, + "learning_rate": 4.02713288629398e-06, + "loss": 0.1317, + "step": 22395 + }, + { + "epoch": 0.5667434268795708, + "grad_norm": 3.8412418365478516, + "learning_rate": 4.026739040808069e-06, + "loss": 0.1265, + "step": 22396 + }, + { + "epoch": 0.5667687324442645, + "grad_norm": 6.597335338592529, + "learning_rate": 4.026345201598535e-06, + "loss": 0.172, + "step": 22397 + }, + { + "epoch": 0.5667940380089582, + "grad_norm": 3.9486963748931885, + "learning_rate": 4.0259513686679234e-06, + "loss": 0.153, + "step": 22398 + }, + { + "epoch": 0.5668193435736518, + "grad_norm": 5.12875509262085, + "learning_rate": 4.025557542018767e-06, + "loss": 0.1117, + "step": 22399 + }, + { + "epoch": 0.5668446491383455, + "grad_norm": 7.6816816329956055, + "learning_rate": 4.02516372165361e-06, + "loss": 0.2523, + "step": 22400 + }, + { + "epoch": 0.5668699547030392, + "grad_norm": 2.196993350982666, + "learning_rate": 4.02476990757499e-06, + "loss": 0.1055, + "step": 22401 + }, + { + "epoch": 0.5668952602677328, + "grad_norm": 3.1876065731048584, + "learning_rate": 4.02437609978545e-06, + "loss": 0.0944, + "step": 22402 + }, + { + "epoch": 0.5669205658324266, + "grad_norm": 6.141303539276123, + "learning_rate": 4.023982298287525e-06, + "loss": 0.16, + "step": 22403 + }, + { + "epoch": 0.5669458713971203, + "grad_norm": 3.8097054958343506, + "learning_rate": 4.0235885030837565e-06, + "loss": 0.1296, + "step": 22404 + }, + { + "epoch": 0.5669711769618139, + "grad_norm": 8.351461410522461, + "learning_rate": 4.023194714176683e-06, + "loss": 0.169, + "step": 22405 + }, + { + "epoch": 0.5669964825265076, + "grad_norm": 4.717746734619141, + "learning_rate": 4.022800931568847e-06, + "loss": 0.1479, + "step": 22406 + }, + { + "epoch": 0.5670217880912013, + "grad_norm": 3.2110860347747803, + "learning_rate": 4.0224071552627865e-06, + "loss": 0.1308, + "step": 22407 + }, + { + "epoch": 0.5670470936558949, + "grad_norm": 3.3706936836242676, + "learning_rate": 4.022013385261039e-06, + "loss": 0.1452, + "step": 22408 + }, + { + "epoch": 0.5670723992205886, + "grad_norm": 2.924020528793335, + "learning_rate": 4.0216196215661455e-06, + "loss": 0.1014, + "step": 22409 + }, + { + "epoch": 0.5670977047852823, + "grad_norm": 18.370695114135742, + "learning_rate": 4.0212258641806445e-06, + "loss": 0.2762, + "step": 22410 + }, + { + "epoch": 0.5671230103499759, + "grad_norm": 2.9562039375305176, + "learning_rate": 4.020832113107078e-06, + "loss": 0.1437, + "step": 22411 + }, + { + "epoch": 0.5671483159146696, + "grad_norm": 9.086991310119629, + "learning_rate": 4.020438368347981e-06, + "loss": 0.3382, + "step": 22412 + }, + { + "epoch": 0.5671736214793633, + "grad_norm": 4.743170261383057, + "learning_rate": 4.0200446299058954e-06, + "loss": 0.1632, + "step": 22413 + }, + { + "epoch": 0.5671989270440569, + "grad_norm": 2.923879384994507, + "learning_rate": 4.01965089778336e-06, + "loss": 0.1167, + "step": 22414 + }, + { + "epoch": 0.5672242326087507, + "grad_norm": 8.001596450805664, + "learning_rate": 4.019257171982912e-06, + "loss": 0.1921, + "step": 22415 + }, + { + "epoch": 0.5672495381734444, + "grad_norm": 9.667478561401367, + "learning_rate": 4.0188634525070955e-06, + "loss": 0.1565, + "step": 22416 + }, + { + "epoch": 0.567274843738138, + "grad_norm": 20.702281951904297, + "learning_rate": 4.018469739358443e-06, + "loss": 0.2341, + "step": 22417 + }, + { + "epoch": 0.5673001493028317, + "grad_norm": 8.986233711242676, + "learning_rate": 4.018076032539497e-06, + "loss": 0.2723, + "step": 22418 + }, + { + "epoch": 0.5673254548675254, + "grad_norm": 9.922272682189941, + "learning_rate": 4.017682332052798e-06, + "loss": 0.2729, + "step": 22419 + }, + { + "epoch": 0.567350760432219, + "grad_norm": 7.593138217926025, + "learning_rate": 4.017288637900881e-06, + "loss": 0.2333, + "step": 22420 + }, + { + "epoch": 0.5673760659969127, + "grad_norm": 4.349846839904785, + "learning_rate": 4.0168949500862865e-06, + "loss": 0.1823, + "step": 22421 + }, + { + "epoch": 0.5674013715616064, + "grad_norm": 2.9482157230377197, + "learning_rate": 4.016501268611556e-06, + "loss": 0.1179, + "step": 22422 + }, + { + "epoch": 0.5674266771263001, + "grad_norm": 3.0406906604766846, + "learning_rate": 4.016107593479224e-06, + "loss": 0.1109, + "step": 22423 + }, + { + "epoch": 0.5674519826909937, + "grad_norm": 5.992288112640381, + "learning_rate": 4.015713924691831e-06, + "loss": 0.1924, + "step": 22424 + }, + { + "epoch": 0.5674772882556874, + "grad_norm": 20.710403442382812, + "learning_rate": 4.015320262251917e-06, + "loss": 0.1498, + "step": 22425 + }, + { + "epoch": 0.5675025938203812, + "grad_norm": 3.849360466003418, + "learning_rate": 4.0149266061620175e-06, + "loss": 0.1505, + "step": 22426 + }, + { + "epoch": 0.5675278993850748, + "grad_norm": 8.584714889526367, + "learning_rate": 4.014532956424674e-06, + "loss": 0.223, + "step": 22427 + }, + { + "epoch": 0.5675532049497685, + "grad_norm": 4.064741134643555, + "learning_rate": 4.014139313042424e-06, + "loss": 0.164, + "step": 22428 + }, + { + "epoch": 0.5675785105144622, + "grad_norm": 3.941776752471924, + "learning_rate": 4.013745676017808e-06, + "loss": 0.1399, + "step": 22429 + }, + { + "epoch": 0.5676038160791558, + "grad_norm": 6.8871541023254395, + "learning_rate": 4.013352045353361e-06, + "loss": 0.1802, + "step": 22430 + }, + { + "epoch": 0.5676291216438495, + "grad_norm": 3.809131145477295, + "learning_rate": 4.012958421051623e-06, + "loss": 0.1259, + "step": 22431 + }, + { + "epoch": 0.5676544272085432, + "grad_norm": 6.192887783050537, + "learning_rate": 4.012564803115132e-06, + "loss": 0.1217, + "step": 22432 + }, + { + "epoch": 0.5676797327732368, + "grad_norm": 6.639443874359131, + "learning_rate": 4.01217119154643e-06, + "loss": 0.2301, + "step": 22433 + }, + { + "epoch": 0.5677050383379305, + "grad_norm": 5.756592273712158, + "learning_rate": 4.011777586348049e-06, + "loss": 0.1292, + "step": 22434 + }, + { + "epoch": 0.5677303439026242, + "grad_norm": 5.913334846496582, + "learning_rate": 4.011383987522531e-06, + "loss": 0.2295, + "step": 22435 + }, + { + "epoch": 0.5677556494673178, + "grad_norm": 5.541855812072754, + "learning_rate": 4.010990395072414e-06, + "loss": 0.1165, + "step": 22436 + }, + { + "epoch": 0.5677809550320115, + "grad_norm": 2.8772435188293457, + "learning_rate": 4.010596809000235e-06, + "loss": 0.1151, + "step": 22437 + }, + { + "epoch": 0.5678062605967052, + "grad_norm": 4.96352481842041, + "learning_rate": 4.010203229308537e-06, + "loss": 0.2022, + "step": 22438 + }, + { + "epoch": 0.5678315661613988, + "grad_norm": 8.887425422668457, + "learning_rate": 4.009809655999851e-06, + "loss": 0.308, + "step": 22439 + }, + { + "epoch": 0.5678568717260926, + "grad_norm": 3.6279072761535645, + "learning_rate": 4.009416089076719e-06, + "loss": 0.1331, + "step": 22440 + }, + { + "epoch": 0.5678821772907863, + "grad_norm": 3.8770949840545654, + "learning_rate": 4.009022528541677e-06, + "loss": 0.1979, + "step": 22441 + }, + { + "epoch": 0.5679074828554799, + "grad_norm": 5.017320156097412, + "learning_rate": 4.008628974397268e-06, + "loss": 0.1604, + "step": 22442 + }, + { + "epoch": 0.5679327884201736, + "grad_norm": 10.876029968261719, + "learning_rate": 4.008235426646023e-06, + "loss": 0.1341, + "step": 22443 + }, + { + "epoch": 0.5679580939848673, + "grad_norm": 5.263393878936768, + "learning_rate": 4.007841885290484e-06, + "loss": 0.2254, + "step": 22444 + }, + { + "epoch": 0.5679833995495609, + "grad_norm": 6.047552585601807, + "learning_rate": 4.007448350333188e-06, + "loss": 0.1332, + "step": 22445 + }, + { + "epoch": 0.5680087051142546, + "grad_norm": 4.310439586639404, + "learning_rate": 4.0070548217766755e-06, + "loss": 0.1547, + "step": 22446 + }, + { + "epoch": 0.5680340106789483, + "grad_norm": 3.7165095806121826, + "learning_rate": 4.0066612996234785e-06, + "loss": 0.1577, + "step": 22447 + }, + { + "epoch": 0.568059316243642, + "grad_norm": 4.042212963104248, + "learning_rate": 4.0062677838761385e-06, + "loss": 0.1409, + "step": 22448 + }, + { + "epoch": 0.5680846218083356, + "grad_norm": 6.147545337677002, + "learning_rate": 4.005874274537195e-06, + "loss": 0.1894, + "step": 22449 + }, + { + "epoch": 0.5681099273730293, + "grad_norm": 9.877005577087402, + "learning_rate": 4.005480771609181e-06, + "loss": 0.2599, + "step": 22450 + }, + { + "epoch": 0.5681352329377231, + "grad_norm": 5.203434944152832, + "learning_rate": 4.005087275094638e-06, + "loss": 0.2031, + "step": 22451 + }, + { + "epoch": 0.5681605385024167, + "grad_norm": 9.883968353271484, + "learning_rate": 4.004693784996101e-06, + "loss": 0.1509, + "step": 22452 + }, + { + "epoch": 0.5681858440671104, + "grad_norm": 3.585212230682373, + "learning_rate": 4.004300301316109e-06, + "loss": 0.1299, + "step": 22453 + }, + { + "epoch": 0.5682111496318041, + "grad_norm": 5.82613468170166, + "learning_rate": 4.0039068240571985e-06, + "loss": 0.1643, + "step": 22454 + }, + { + "epoch": 0.5682364551964977, + "grad_norm": 9.214275360107422, + "learning_rate": 4.0035133532219094e-06, + "loss": 0.156, + "step": 22455 + }, + { + "epoch": 0.5682617607611914, + "grad_norm": 3.6375906467437744, + "learning_rate": 4.003119888812776e-06, + "loss": 0.1051, + "step": 22456 + }, + { + "epoch": 0.5682870663258851, + "grad_norm": 3.898452043533325, + "learning_rate": 4.002726430832338e-06, + "loss": 0.1605, + "step": 22457 + }, + { + "epoch": 0.5683123718905787, + "grad_norm": 5.902179718017578, + "learning_rate": 4.00233297928313e-06, + "loss": 0.1421, + "step": 22458 + }, + { + "epoch": 0.5683376774552724, + "grad_norm": 3.4586737155914307, + "learning_rate": 4.0019395341676916e-06, + "loss": 0.1396, + "step": 22459 + }, + { + "epoch": 0.5683629830199661, + "grad_norm": 5.4491353034973145, + "learning_rate": 4.001546095488561e-06, + "loss": 0.2255, + "step": 22460 + }, + { + "epoch": 0.5683882885846597, + "grad_norm": 9.318020820617676, + "learning_rate": 4.001152663248272e-06, + "loss": 0.2305, + "step": 22461 + }, + { + "epoch": 0.5684135941493534, + "grad_norm": 4.620377063751221, + "learning_rate": 4.000759237449364e-06, + "loss": 0.1408, + "step": 22462 + }, + { + "epoch": 0.5684388997140472, + "grad_norm": 5.443294525146484, + "learning_rate": 4.000365818094374e-06, + "loss": 0.1416, + "step": 22463 + }, + { + "epoch": 0.5684642052787408, + "grad_norm": 2.974350690841675, + "learning_rate": 3.999972405185841e-06, + "loss": 0.1621, + "step": 22464 + }, + { + "epoch": 0.5684895108434345, + "grad_norm": 5.548929214477539, + "learning_rate": 3.999578998726298e-06, + "loss": 0.1819, + "step": 22465 + }, + { + "epoch": 0.5685148164081282, + "grad_norm": 3.209458112716675, + "learning_rate": 3.999185598718283e-06, + "loss": 0.1223, + "step": 22466 + }, + { + "epoch": 0.5685401219728218, + "grad_norm": 6.50501823425293, + "learning_rate": 3.998792205164335e-06, + "loss": 0.1508, + "step": 22467 + }, + { + "epoch": 0.5685654275375155, + "grad_norm": 8.909507751464844, + "learning_rate": 3.998398818066989e-06, + "loss": 0.1853, + "step": 22468 + }, + { + "epoch": 0.5685907331022092, + "grad_norm": 4.872239589691162, + "learning_rate": 3.998005437428785e-06, + "loss": 0.1364, + "step": 22469 + }, + { + "epoch": 0.5686160386669028, + "grad_norm": 5.2306294441223145, + "learning_rate": 3.997612063252256e-06, + "loss": 0.1753, + "step": 22470 + }, + { + "epoch": 0.5686413442315965, + "grad_norm": 3.2279765605926514, + "learning_rate": 3.99721869553994e-06, + "loss": 0.0786, + "step": 22471 + }, + { + "epoch": 0.5686666497962902, + "grad_norm": 9.335552215576172, + "learning_rate": 3.996825334294374e-06, + "loss": 0.2742, + "step": 22472 + }, + { + "epoch": 0.5686919553609838, + "grad_norm": 4.205495357513428, + "learning_rate": 3.996431979518096e-06, + "loss": 0.1868, + "step": 22473 + }, + { + "epoch": 0.5687172609256775, + "grad_norm": 10.489239692687988, + "learning_rate": 3.996038631213641e-06, + "loss": 0.1928, + "step": 22474 + }, + { + "epoch": 0.5687425664903712, + "grad_norm": 5.403629779815674, + "learning_rate": 3.995645289383545e-06, + "loss": 0.1855, + "step": 22475 + }, + { + "epoch": 0.568767872055065, + "grad_norm": 3.9896366596221924, + "learning_rate": 3.9952519540303455e-06, + "loss": 0.1542, + "step": 22476 + }, + { + "epoch": 0.5687931776197586, + "grad_norm": 6.230152130126953, + "learning_rate": 3.994858625156582e-06, + "loss": 0.1844, + "step": 22477 + }, + { + "epoch": 0.5688184831844523, + "grad_norm": 3.5116360187530518, + "learning_rate": 3.994465302764787e-06, + "loss": 0.1563, + "step": 22478 + }, + { + "epoch": 0.568843788749146, + "grad_norm": 9.945849418640137, + "learning_rate": 3.9940719868574974e-06, + "loss": 0.2734, + "step": 22479 + }, + { + "epoch": 0.5688690943138396, + "grad_norm": 4.834151268005371, + "learning_rate": 3.993678677437251e-06, + "loss": 0.1556, + "step": 22480 + }, + { + "epoch": 0.5688943998785333, + "grad_norm": 3.6884877681732178, + "learning_rate": 3.993285374506583e-06, + "loss": 0.1395, + "step": 22481 + }, + { + "epoch": 0.568919705443227, + "grad_norm": 7.384689807891846, + "learning_rate": 3.992892078068032e-06, + "loss": 0.1852, + "step": 22482 + }, + { + "epoch": 0.5689450110079206, + "grad_norm": 2.4584872722625732, + "learning_rate": 3.9924987881241316e-06, + "loss": 0.0809, + "step": 22483 + }, + { + "epoch": 0.5689703165726143, + "grad_norm": 3.557861089706421, + "learning_rate": 3.992105504677418e-06, + "loss": 0.1396, + "step": 22484 + }, + { + "epoch": 0.568995622137308, + "grad_norm": 4.436300754547119, + "learning_rate": 3.991712227730429e-06, + "loss": 0.1482, + "step": 22485 + }, + { + "epoch": 0.5690209277020016, + "grad_norm": 6.782751083374023, + "learning_rate": 3.991318957285703e-06, + "loss": 0.1832, + "step": 22486 + }, + { + "epoch": 0.5690462332666953, + "grad_norm": 5.210983753204346, + "learning_rate": 3.990925693345771e-06, + "loss": 0.1506, + "step": 22487 + }, + { + "epoch": 0.5690715388313891, + "grad_norm": 6.423707485198975, + "learning_rate": 3.990532435913171e-06, + "loss": 0.1344, + "step": 22488 + }, + { + "epoch": 0.5690968443960827, + "grad_norm": 3.608726978302002, + "learning_rate": 3.99013918499044e-06, + "loss": 0.1073, + "step": 22489 + }, + { + "epoch": 0.5691221499607764, + "grad_norm": 9.495595932006836, + "learning_rate": 3.989745940580113e-06, + "loss": 0.201, + "step": 22490 + }, + { + "epoch": 0.5691474555254701, + "grad_norm": 3.736666202545166, + "learning_rate": 3.989352702684729e-06, + "loss": 0.136, + "step": 22491 + }, + { + "epoch": 0.5691727610901637, + "grad_norm": 3.9162709712982178, + "learning_rate": 3.988959471306819e-06, + "loss": 0.1058, + "step": 22492 + }, + { + "epoch": 0.5691980666548574, + "grad_norm": 3.9339487552642822, + "learning_rate": 3.9885662464489215e-06, + "loss": 0.1112, + "step": 22493 + }, + { + "epoch": 0.5692233722195511, + "grad_norm": 5.600154876708984, + "learning_rate": 3.988173028113572e-06, + "loss": 0.1632, + "step": 22494 + }, + { + "epoch": 0.5692486777842447, + "grad_norm": 6.2759599685668945, + "learning_rate": 3.98777981630331e-06, + "loss": 0.2842, + "step": 22495 + }, + { + "epoch": 0.5692739833489384, + "grad_norm": 4.094555854797363, + "learning_rate": 3.987386611020663e-06, + "loss": 0.1904, + "step": 22496 + }, + { + "epoch": 0.5692992889136321, + "grad_norm": 5.259437561035156, + "learning_rate": 3.986993412268173e-06, + "loss": 0.1809, + "step": 22497 + }, + { + "epoch": 0.5693245944783257, + "grad_norm": 8.010347366333008, + "learning_rate": 3.986600220048374e-06, + "loss": 0.2331, + "step": 22498 + }, + { + "epoch": 0.5693499000430194, + "grad_norm": 4.947433948516846, + "learning_rate": 3.986207034363803e-06, + "loss": 0.178, + "step": 22499 + }, + { + "epoch": 0.5693752056077132, + "grad_norm": 9.407865524291992, + "learning_rate": 3.985813855216993e-06, + "loss": 0.2999, + "step": 22500 + }, + { + "epoch": 0.5694005111724069, + "grad_norm": 3.365253448486328, + "learning_rate": 3.985420682610481e-06, + "loss": 0.1293, + "step": 22501 + }, + { + "epoch": 0.5694258167371005, + "grad_norm": 3.743410348892212, + "learning_rate": 3.985027516546801e-06, + "loss": 0.1497, + "step": 22502 + }, + { + "epoch": 0.5694511223017942, + "grad_norm": 7.635668754577637, + "learning_rate": 3.984634357028491e-06, + "loss": 0.2063, + "step": 22503 + }, + { + "epoch": 0.5694764278664879, + "grad_norm": 3.7109014987945557, + "learning_rate": 3.984241204058086e-06, + "loss": 0.2168, + "step": 22504 + }, + { + "epoch": 0.5695017334311815, + "grad_norm": 5.496908187866211, + "learning_rate": 3.98384805763812e-06, + "loss": 0.1533, + "step": 22505 + }, + { + "epoch": 0.5695270389958752, + "grad_norm": 5.342170715332031, + "learning_rate": 3.983454917771128e-06, + "loss": 0.2546, + "step": 22506 + }, + { + "epoch": 0.5695523445605689, + "grad_norm": 3.2300362586975098, + "learning_rate": 3.983061784459647e-06, + "loss": 0.1163, + "step": 22507 + }, + { + "epoch": 0.5695776501252625, + "grad_norm": 5.077348232269287, + "learning_rate": 3.982668657706211e-06, + "loss": 0.1158, + "step": 22508 + }, + { + "epoch": 0.5696029556899562, + "grad_norm": 10.674895286560059, + "learning_rate": 3.982275537513356e-06, + "loss": 0.1607, + "step": 22509 + }, + { + "epoch": 0.5696282612546499, + "grad_norm": 5.487740516662598, + "learning_rate": 3.9818824238836165e-06, + "loss": 0.1987, + "step": 22510 + }, + { + "epoch": 0.5696535668193435, + "grad_norm": 9.341780662536621, + "learning_rate": 3.981489316819527e-06, + "loss": 0.2426, + "step": 22511 + }, + { + "epoch": 0.5696788723840372, + "grad_norm": 5.0329670906066895, + "learning_rate": 3.981096216323625e-06, + "loss": 0.1794, + "step": 22512 + }, + { + "epoch": 0.569704177948731, + "grad_norm": 5.370420455932617, + "learning_rate": 3.9807031223984426e-06, + "loss": 0.1618, + "step": 22513 + }, + { + "epoch": 0.5697294835134246, + "grad_norm": 5.572803020477295, + "learning_rate": 3.980310035046517e-06, + "loss": 0.1901, + "step": 22514 + }, + { + "epoch": 0.5697547890781183, + "grad_norm": 4.749253273010254, + "learning_rate": 3.979916954270382e-06, + "loss": 0.1573, + "step": 22515 + }, + { + "epoch": 0.569780094642812, + "grad_norm": 4.797060489654541, + "learning_rate": 3.979523880072572e-06, + "loss": 0.1787, + "step": 22516 + }, + { + "epoch": 0.5698054002075056, + "grad_norm": 3.3347792625427246, + "learning_rate": 3.979130812455625e-06, + "loss": 0.1434, + "step": 22517 + }, + { + "epoch": 0.5698307057721993, + "grad_norm": 4.6318583488464355, + "learning_rate": 3.978737751422072e-06, + "loss": 0.1721, + "step": 22518 + }, + { + "epoch": 0.569856011336893, + "grad_norm": 4.783291816711426, + "learning_rate": 3.978344696974449e-06, + "loss": 0.1213, + "step": 22519 + }, + { + "epoch": 0.5698813169015866, + "grad_norm": 17.544687271118164, + "learning_rate": 3.977951649115291e-06, + "loss": 0.1264, + "step": 22520 + }, + { + "epoch": 0.5699066224662803, + "grad_norm": 6.435606956481934, + "learning_rate": 3.977558607847132e-06, + "loss": 0.1494, + "step": 22521 + }, + { + "epoch": 0.569931928030974, + "grad_norm": 1.9288114309310913, + "learning_rate": 3.97716557317251e-06, + "loss": 0.0799, + "step": 22522 + }, + { + "epoch": 0.5699572335956676, + "grad_norm": 4.861406326293945, + "learning_rate": 3.976772545093955e-06, + "loss": 0.1767, + "step": 22523 + }, + { + "epoch": 0.5699825391603613, + "grad_norm": 3.111194133758545, + "learning_rate": 3.976379523614003e-06, + "loss": 0.1499, + "step": 22524 + }, + { + "epoch": 0.5700078447250551, + "grad_norm": 5.82230806350708, + "learning_rate": 3.975986508735189e-06, + "loss": 0.2004, + "step": 22525 + }, + { + "epoch": 0.5700331502897488, + "grad_norm": 11.4000883102417, + "learning_rate": 3.975593500460051e-06, + "loss": 0.329, + "step": 22526 + }, + { + "epoch": 0.5700584558544424, + "grad_norm": 4.138501167297363, + "learning_rate": 3.9752004987911165e-06, + "loss": 0.1001, + "step": 22527 + }, + { + "epoch": 0.5700837614191361, + "grad_norm": 10.326549530029297, + "learning_rate": 3.9748075037309245e-06, + "loss": 0.2913, + "step": 22528 + }, + { + "epoch": 0.5701090669838298, + "grad_norm": 3.2050206661224365, + "learning_rate": 3.974414515282007e-06, + "loss": 0.1122, + "step": 22529 + }, + { + "epoch": 0.5701343725485234, + "grad_norm": 6.544623851776123, + "learning_rate": 3.974021533446902e-06, + "loss": 0.2084, + "step": 22530 + }, + { + "epoch": 0.5701596781132171, + "grad_norm": 4.064074516296387, + "learning_rate": 3.973628558228139e-06, + "loss": 0.1895, + "step": 22531 + }, + { + "epoch": 0.5701849836779108, + "grad_norm": 6.087141036987305, + "learning_rate": 3.973235589628255e-06, + "loss": 0.2013, + "step": 22532 + }, + { + "epoch": 0.5702102892426044, + "grad_norm": 3.352266788482666, + "learning_rate": 3.972842627649784e-06, + "loss": 0.1562, + "step": 22533 + }, + { + "epoch": 0.5702355948072981, + "grad_norm": 2.413516044616699, + "learning_rate": 3.97244967229526e-06, + "loss": 0.0786, + "step": 22534 + }, + { + "epoch": 0.5702609003719918, + "grad_norm": 3.2399895191192627, + "learning_rate": 3.972056723567217e-06, + "loss": 0.1411, + "step": 22535 + }, + { + "epoch": 0.5702862059366854, + "grad_norm": 4.7740654945373535, + "learning_rate": 3.971663781468188e-06, + "loss": 0.1391, + "step": 22536 + }, + { + "epoch": 0.5703115115013792, + "grad_norm": 3.898563861846924, + "learning_rate": 3.971270846000708e-06, + "loss": 0.0827, + "step": 22537 + }, + { + "epoch": 0.5703368170660729, + "grad_norm": 14.121197700500488, + "learning_rate": 3.9708779171673125e-06, + "loss": 0.1047, + "step": 22538 + }, + { + "epoch": 0.5703621226307665, + "grad_norm": 3.548091173171997, + "learning_rate": 3.970484994970533e-06, + "loss": 0.1651, + "step": 22539 + }, + { + "epoch": 0.5703874281954602, + "grad_norm": 3.070633888244629, + "learning_rate": 3.970092079412904e-06, + "loss": 0.1406, + "step": 22540 + }, + { + "epoch": 0.5704127337601539, + "grad_norm": 4.048259258270264, + "learning_rate": 3.96969917049696e-06, + "loss": 0.1771, + "step": 22541 + }, + { + "epoch": 0.5704380393248475, + "grad_norm": 4.079753875732422, + "learning_rate": 3.969306268225234e-06, + "loss": 0.1216, + "step": 22542 + }, + { + "epoch": 0.5704633448895412, + "grad_norm": 8.557254791259766, + "learning_rate": 3.96891337260026e-06, + "loss": 0.1477, + "step": 22543 + }, + { + "epoch": 0.5704886504542349, + "grad_norm": 4.6179938316345215, + "learning_rate": 3.968520483624574e-06, + "loss": 0.213, + "step": 22544 + }, + { + "epoch": 0.5705139560189285, + "grad_norm": 7.492279052734375, + "learning_rate": 3.968127601300705e-06, + "loss": 0.243, + "step": 22545 + }, + { + "epoch": 0.5705392615836222, + "grad_norm": 3.671581268310547, + "learning_rate": 3.967734725631189e-06, + "loss": 0.156, + "step": 22546 + }, + { + "epoch": 0.5705645671483159, + "grad_norm": 2.5501973628997803, + "learning_rate": 3.9673418566185605e-06, + "loss": 0.1087, + "step": 22547 + }, + { + "epoch": 0.5705898727130095, + "grad_norm": 8.404559135437012, + "learning_rate": 3.966948994265355e-06, + "loss": 0.2695, + "step": 22548 + }, + { + "epoch": 0.5706151782777032, + "grad_norm": 3.6501052379608154, + "learning_rate": 3.9665561385741e-06, + "loss": 0.1459, + "step": 22549 + }, + { + "epoch": 0.570640483842397, + "grad_norm": 7.807326793670654, + "learning_rate": 3.966163289547333e-06, + "loss": 0.1384, + "step": 22550 + }, + { + "epoch": 0.5706657894070907, + "grad_norm": 7.935576438903809, + "learning_rate": 3.965770447187585e-06, + "loss": 0.0601, + "step": 22551 + }, + { + "epoch": 0.5706910949717843, + "grad_norm": 6.62143611907959, + "learning_rate": 3.965377611497394e-06, + "loss": 0.2009, + "step": 22552 + }, + { + "epoch": 0.570716400536478, + "grad_norm": 4.966419219970703, + "learning_rate": 3.964984782479289e-06, + "loss": 0.1866, + "step": 22553 + }, + { + "epoch": 0.5707417061011717, + "grad_norm": 7.396583557128906, + "learning_rate": 3.964591960135804e-06, + "loss": 0.1553, + "step": 22554 + }, + { + "epoch": 0.5707670116658653, + "grad_norm": 4.488801002502441, + "learning_rate": 3.964199144469473e-06, + "loss": 0.157, + "step": 22555 + }, + { + "epoch": 0.570792317230559, + "grad_norm": 5.6833720207214355, + "learning_rate": 3.963806335482829e-06, + "loss": 0.1619, + "step": 22556 + }, + { + "epoch": 0.5708176227952527, + "grad_norm": 9.789846420288086, + "learning_rate": 3.963413533178407e-06, + "loss": 0.2007, + "step": 22557 + }, + { + "epoch": 0.5708429283599463, + "grad_norm": 2.5952799320220947, + "learning_rate": 3.963020737558736e-06, + "loss": 0.1271, + "step": 22558 + }, + { + "epoch": 0.57086823392464, + "grad_norm": 11.01672649383545, + "learning_rate": 3.962627948626352e-06, + "loss": 0.3714, + "step": 22559 + }, + { + "epoch": 0.5708935394893337, + "grad_norm": 8.09144115447998, + "learning_rate": 3.962235166383788e-06, + "loss": 0.1622, + "step": 22560 + }, + { + "epoch": 0.5709188450540273, + "grad_norm": 5.640473365783691, + "learning_rate": 3.961842390833577e-06, + "loss": 0.1931, + "step": 22561 + }, + { + "epoch": 0.5709441506187211, + "grad_norm": 4.469233512878418, + "learning_rate": 3.96144962197825e-06, + "loss": 0.1879, + "step": 22562 + }, + { + "epoch": 0.5709694561834148, + "grad_norm": 5.58152961730957, + "learning_rate": 3.961056859820342e-06, + "loss": 0.1512, + "step": 22563 + }, + { + "epoch": 0.5709947617481084, + "grad_norm": 3.2737629413604736, + "learning_rate": 3.960664104362383e-06, + "loss": 0.1276, + "step": 22564 + }, + { + "epoch": 0.5710200673128021, + "grad_norm": 8.382319450378418, + "learning_rate": 3.9602713556069105e-06, + "loss": 0.202, + "step": 22565 + }, + { + "epoch": 0.5710453728774958, + "grad_norm": 4.761744499206543, + "learning_rate": 3.9598786135564534e-06, + "loss": 0.1609, + "step": 22566 + }, + { + "epoch": 0.5710706784421894, + "grad_norm": 4.322354793548584, + "learning_rate": 3.959485878213546e-06, + "loss": 0.1249, + "step": 22567 + }, + { + "epoch": 0.5710959840068831, + "grad_norm": 5.815426826477051, + "learning_rate": 3.959093149580721e-06, + "loss": 0.1093, + "step": 22568 + }, + { + "epoch": 0.5711212895715768, + "grad_norm": 3.2679624557495117, + "learning_rate": 3.958700427660509e-06, + "loss": 0.1425, + "step": 22569 + }, + { + "epoch": 0.5711465951362704, + "grad_norm": 5.668423652648926, + "learning_rate": 3.958307712455446e-06, + "loss": 0.1869, + "step": 22570 + }, + { + "epoch": 0.5711719007009641, + "grad_norm": 6.900320529937744, + "learning_rate": 3.957915003968064e-06, + "loss": 0.1328, + "step": 22571 + }, + { + "epoch": 0.5711972062656578, + "grad_norm": 6.027657985687256, + "learning_rate": 3.957522302200891e-06, + "loss": 0.1539, + "step": 22572 + }, + { + "epoch": 0.5712225118303514, + "grad_norm": 5.4155049324035645, + "learning_rate": 3.957129607156464e-06, + "loss": 0.164, + "step": 22573 + }, + { + "epoch": 0.5712478173950452, + "grad_norm": 7.714632034301758, + "learning_rate": 3.956736918837315e-06, + "loss": 0.1309, + "step": 22574 + }, + { + "epoch": 0.5712731229597389, + "grad_norm": 3.9244279861450195, + "learning_rate": 3.956344237245975e-06, + "loss": 0.1186, + "step": 22575 + }, + { + "epoch": 0.5712984285244326, + "grad_norm": 6.30531120300293, + "learning_rate": 3.955951562384977e-06, + "loss": 0.1758, + "step": 22576 + }, + { + "epoch": 0.5713237340891262, + "grad_norm": 4.062857627868652, + "learning_rate": 3.955558894256853e-06, + "loss": 0.0898, + "step": 22577 + }, + { + "epoch": 0.5713490396538199, + "grad_norm": 3.5972509384155273, + "learning_rate": 3.955166232864135e-06, + "loss": 0.17, + "step": 22578 + }, + { + "epoch": 0.5713743452185136, + "grad_norm": 4.953786373138428, + "learning_rate": 3.954773578209356e-06, + "loss": 0.114, + "step": 22579 + }, + { + "epoch": 0.5713996507832072, + "grad_norm": 7.265356063842773, + "learning_rate": 3.954380930295048e-06, + "loss": 0.1325, + "step": 22580 + }, + { + "epoch": 0.5714249563479009, + "grad_norm": 2.907019853591919, + "learning_rate": 3.953988289123742e-06, + "loss": 0.0853, + "step": 22581 + }, + { + "epoch": 0.5714502619125946, + "grad_norm": 3.6868104934692383, + "learning_rate": 3.9535956546979715e-06, + "loss": 0.1305, + "step": 22582 + }, + { + "epoch": 0.5714755674772882, + "grad_norm": 9.136775016784668, + "learning_rate": 3.95320302702027e-06, + "loss": 0.0948, + "step": 22583 + }, + { + "epoch": 0.5715008730419819, + "grad_norm": 3.2438628673553467, + "learning_rate": 3.952810406093165e-06, + "loss": 0.1569, + "step": 22584 + }, + { + "epoch": 0.5715261786066756, + "grad_norm": 3.2294206619262695, + "learning_rate": 3.952417791919191e-06, + "loss": 0.1341, + "step": 22585 + }, + { + "epoch": 0.5715514841713693, + "grad_norm": 5.030508518218994, + "learning_rate": 3.95202518450088e-06, + "loss": 0.1729, + "step": 22586 + }, + { + "epoch": 0.571576789736063, + "grad_norm": 9.740951538085938, + "learning_rate": 3.951632583840763e-06, + "loss": 0.2286, + "step": 22587 + }, + { + "epoch": 0.5716020953007567, + "grad_norm": 14.748480796813965, + "learning_rate": 3.951239989941374e-06, + "loss": 0.1852, + "step": 22588 + }, + { + "epoch": 0.5716274008654503, + "grad_norm": 3.614069938659668, + "learning_rate": 3.950847402805243e-06, + "loss": 0.1146, + "step": 22589 + }, + { + "epoch": 0.571652706430144, + "grad_norm": 7.488770484924316, + "learning_rate": 3.950454822434901e-06, + "loss": 0.1644, + "step": 22590 + }, + { + "epoch": 0.5716780119948377, + "grad_norm": 9.366765975952148, + "learning_rate": 3.950062248832881e-06, + "loss": 0.2371, + "step": 22591 + }, + { + "epoch": 0.5717033175595313, + "grad_norm": 8.270218849182129, + "learning_rate": 3.949669682001716e-06, + "loss": 0.3129, + "step": 22592 + }, + { + "epoch": 0.571728623124225, + "grad_norm": 4.470119953155518, + "learning_rate": 3.949277121943933e-06, + "loss": 0.1592, + "step": 22593 + }, + { + "epoch": 0.5717539286889187, + "grad_norm": 4.222659111022949, + "learning_rate": 3.9488845686620676e-06, + "loss": 0.1171, + "step": 22594 + }, + { + "epoch": 0.5717792342536123, + "grad_norm": 6.198375225067139, + "learning_rate": 3.948492022158651e-06, + "loss": 0.1584, + "step": 22595 + }, + { + "epoch": 0.571804539818306, + "grad_norm": 11.834942817687988, + "learning_rate": 3.948099482436212e-06, + "loss": 0.2337, + "step": 22596 + }, + { + "epoch": 0.5718298453829997, + "grad_norm": 7.9849162101745605, + "learning_rate": 3.9477069494972854e-06, + "loss": 0.1692, + "step": 22597 + }, + { + "epoch": 0.5718551509476933, + "grad_norm": 3.614290952682495, + "learning_rate": 3.947314423344399e-06, + "loss": 0.1159, + "step": 22598 + }, + { + "epoch": 0.5718804565123871, + "grad_norm": 11.04336929321289, + "learning_rate": 3.946921903980088e-06, + "loss": 0.2308, + "step": 22599 + }, + { + "epoch": 0.5719057620770808, + "grad_norm": 4.557518482208252, + "learning_rate": 3.9465293914068806e-06, + "loss": 0.1498, + "step": 22600 + }, + { + "epoch": 0.5719310676417744, + "grad_norm": 4.8685150146484375, + "learning_rate": 3.94613688562731e-06, + "loss": 0.1938, + "step": 22601 + }, + { + "epoch": 0.5719563732064681, + "grad_norm": 2.796839952468872, + "learning_rate": 3.945744386643907e-06, + "loss": 0.1191, + "step": 22602 + }, + { + "epoch": 0.5719816787711618, + "grad_norm": 17.64530372619629, + "learning_rate": 3.945351894459201e-06, + "loss": 0.2851, + "step": 22603 + }, + { + "epoch": 0.5720069843358555, + "grad_norm": 3.3009772300720215, + "learning_rate": 3.944959409075724e-06, + "loss": 0.1513, + "step": 22604 + }, + { + "epoch": 0.5720322899005491, + "grad_norm": 8.49867057800293, + "learning_rate": 3.94456693049601e-06, + "loss": 0.2073, + "step": 22605 + }, + { + "epoch": 0.5720575954652428, + "grad_norm": 7.536441802978516, + "learning_rate": 3.944174458722586e-06, + "loss": 0.142, + "step": 22606 + }, + { + "epoch": 0.5720829010299365, + "grad_norm": 3.7267284393310547, + "learning_rate": 3.943781993757984e-06, + "loss": 0.1611, + "step": 22607 + }, + { + "epoch": 0.5721082065946301, + "grad_norm": 9.360735893249512, + "learning_rate": 3.943389535604736e-06, + "loss": 0.3333, + "step": 22608 + }, + { + "epoch": 0.5721335121593238, + "grad_norm": 7.8604254722595215, + "learning_rate": 3.942997084265371e-06, + "loss": 0.2286, + "step": 22609 + }, + { + "epoch": 0.5721588177240176, + "grad_norm": 6.20422887802124, + "learning_rate": 3.942604639742424e-06, + "loss": 0.1837, + "step": 22610 + }, + { + "epoch": 0.5721841232887112, + "grad_norm": 10.36926555633545, + "learning_rate": 3.9422122020384216e-06, + "loss": 0.159, + "step": 22611 + }, + { + "epoch": 0.5722094288534049, + "grad_norm": 6.150147914886475, + "learning_rate": 3.9418197711558955e-06, + "loss": 0.1394, + "step": 22612 + }, + { + "epoch": 0.5722347344180986, + "grad_norm": 3.498572826385498, + "learning_rate": 3.941427347097377e-06, + "loss": 0.153, + "step": 22613 + }, + { + "epoch": 0.5722600399827922, + "grad_norm": 9.884267807006836, + "learning_rate": 3.941034929865398e-06, + "loss": 0.3312, + "step": 22614 + }, + { + "epoch": 0.5722853455474859, + "grad_norm": 4.621798992156982, + "learning_rate": 3.9406425194624865e-06, + "loss": 0.2135, + "step": 22615 + }, + { + "epoch": 0.5723106511121796, + "grad_norm": 3.4557135105133057, + "learning_rate": 3.940250115891174e-06, + "loss": 0.1423, + "step": 22616 + }, + { + "epoch": 0.5723359566768732, + "grad_norm": 3.4013054370880127, + "learning_rate": 3.939857719153991e-06, + "loss": 0.1092, + "step": 22617 + }, + { + "epoch": 0.5723612622415669, + "grad_norm": 5.680220127105713, + "learning_rate": 3.939465329253469e-06, + "loss": 0.2048, + "step": 22618 + }, + { + "epoch": 0.5723865678062606, + "grad_norm": 7.185104846954346, + "learning_rate": 3.939072946192139e-06, + "loss": 0.2225, + "step": 22619 + }, + { + "epoch": 0.5724118733709542, + "grad_norm": 6.2373504638671875, + "learning_rate": 3.9386805699725296e-06, + "loss": 0.2186, + "step": 22620 + }, + { + "epoch": 0.5724371789356479, + "grad_norm": 7.604085922241211, + "learning_rate": 3.93828820059717e-06, + "loss": 0.2508, + "step": 22621 + }, + { + "epoch": 0.5724624845003417, + "grad_norm": 5.449272155761719, + "learning_rate": 3.937895838068594e-06, + "loss": 0.127, + "step": 22622 + }, + { + "epoch": 0.5724877900650353, + "grad_norm": 10.157903671264648, + "learning_rate": 3.937503482389331e-06, + "loss": 0.1472, + "step": 22623 + }, + { + "epoch": 0.572513095629729, + "grad_norm": 5.985404014587402, + "learning_rate": 3.9371111335619085e-06, + "loss": 0.1993, + "step": 22624 + }, + { + "epoch": 0.5725384011944227, + "grad_norm": 4.915493488311768, + "learning_rate": 3.936718791588859e-06, + "loss": 0.2152, + "step": 22625 + }, + { + "epoch": 0.5725637067591163, + "grad_norm": 6.404938220977783, + "learning_rate": 3.936326456472713e-06, + "loss": 0.2026, + "step": 22626 + }, + { + "epoch": 0.57258901232381, + "grad_norm": 4.3215651512146, + "learning_rate": 3.935934128216e-06, + "loss": 0.1829, + "step": 22627 + }, + { + "epoch": 0.5726143178885037, + "grad_norm": 3.8407530784606934, + "learning_rate": 3.935541806821249e-06, + "loss": 0.1706, + "step": 22628 + }, + { + "epoch": 0.5726396234531974, + "grad_norm": 10.994391441345215, + "learning_rate": 3.9351494922909924e-06, + "loss": 0.2386, + "step": 22629 + }, + { + "epoch": 0.572664929017891, + "grad_norm": 5.432705402374268, + "learning_rate": 3.934757184627757e-06, + "loss": 0.1555, + "step": 22630 + }, + { + "epoch": 0.5726902345825847, + "grad_norm": 6.591058254241943, + "learning_rate": 3.934364883834074e-06, + "loss": 0.2074, + "step": 22631 + }, + { + "epoch": 0.5727155401472784, + "grad_norm": 3.047585964202881, + "learning_rate": 3.933972589912476e-06, + "loss": 0.192, + "step": 22632 + }, + { + "epoch": 0.572740845711972, + "grad_norm": 2.6635658740997314, + "learning_rate": 3.933580302865489e-06, + "loss": 0.1083, + "step": 22633 + }, + { + "epoch": 0.5727661512766657, + "grad_norm": 5.280353546142578, + "learning_rate": 3.9331880226956434e-06, + "loss": 0.221, + "step": 22634 + }, + { + "epoch": 0.5727914568413595, + "grad_norm": 3.31410813331604, + "learning_rate": 3.93279574940547e-06, + "loss": 0.117, + "step": 22635 + }, + { + "epoch": 0.5728167624060531, + "grad_norm": 4.2365264892578125, + "learning_rate": 3.9324034829975e-06, + "loss": 0.22, + "step": 22636 + }, + { + "epoch": 0.5728420679707468, + "grad_norm": 10.581732749938965, + "learning_rate": 3.93201122347426e-06, + "loss": 0.1297, + "step": 22637 + }, + { + "epoch": 0.5728673735354405, + "grad_norm": 4.267003059387207, + "learning_rate": 3.93161897083828e-06, + "loss": 0.1831, + "step": 22638 + }, + { + "epoch": 0.5728926791001341, + "grad_norm": 2.7633137702941895, + "learning_rate": 3.931226725092092e-06, + "loss": 0.1007, + "step": 22639 + }, + { + "epoch": 0.5729179846648278, + "grad_norm": 4.725002765655518, + "learning_rate": 3.9308344862382234e-06, + "loss": 0.144, + "step": 22640 + }, + { + "epoch": 0.5729432902295215, + "grad_norm": 6.942529678344727, + "learning_rate": 3.930442254279206e-06, + "loss": 0.256, + "step": 22641 + }, + { + "epoch": 0.5729685957942151, + "grad_norm": 4.428470134735107, + "learning_rate": 3.930050029217566e-06, + "loss": 0.1852, + "step": 22642 + }, + { + "epoch": 0.5729939013589088, + "grad_norm": 5.273545265197754, + "learning_rate": 3.929657811055834e-06, + "loss": 0.2579, + "step": 22643 + }, + { + "epoch": 0.5730192069236025, + "grad_norm": 4.530084609985352, + "learning_rate": 3.92926559979654e-06, + "loss": 0.2236, + "step": 22644 + }, + { + "epoch": 0.5730445124882961, + "grad_norm": 6.104257583618164, + "learning_rate": 3.928873395442215e-06, + "loss": 0.1551, + "step": 22645 + }, + { + "epoch": 0.5730698180529898, + "grad_norm": 12.077593803405762, + "learning_rate": 3.928481197995383e-06, + "loss": 0.2432, + "step": 22646 + }, + { + "epoch": 0.5730951236176836, + "grad_norm": 6.6023712158203125, + "learning_rate": 3.928089007458578e-06, + "loss": 0.1896, + "step": 22647 + }, + { + "epoch": 0.5731204291823772, + "grad_norm": 2.046121597290039, + "learning_rate": 3.927696823834326e-06, + "loss": 0.0987, + "step": 22648 + }, + { + "epoch": 0.5731457347470709, + "grad_norm": 5.079741954803467, + "learning_rate": 3.927304647125159e-06, + "loss": 0.1919, + "step": 22649 + }, + { + "epoch": 0.5731710403117646, + "grad_norm": 4.100421905517578, + "learning_rate": 3.926912477333606e-06, + "loss": 0.2116, + "step": 22650 + }, + { + "epoch": 0.5731963458764582, + "grad_norm": 8.612931251525879, + "learning_rate": 3.926520314462195e-06, + "loss": 0.2062, + "step": 22651 + }, + { + "epoch": 0.5732216514411519, + "grad_norm": 4.810433387756348, + "learning_rate": 3.926128158513452e-06, + "loss": 0.1684, + "step": 22652 + }, + { + "epoch": 0.5732469570058456, + "grad_norm": 9.829265594482422, + "learning_rate": 3.925736009489911e-06, + "loss": 0.155, + "step": 22653 + }, + { + "epoch": 0.5732722625705393, + "grad_norm": 3.4083752632141113, + "learning_rate": 3.9253438673940985e-06, + "loss": 0.1249, + "step": 22654 + }, + { + "epoch": 0.5732975681352329, + "grad_norm": 17.467016220092773, + "learning_rate": 3.9249517322285426e-06, + "loss": 0.3134, + "step": 22655 + }, + { + "epoch": 0.5733228736999266, + "grad_norm": 9.273648262023926, + "learning_rate": 3.924559603995774e-06, + "loss": 0.1913, + "step": 22656 + }, + { + "epoch": 0.5733481792646203, + "grad_norm": 6.217184543609619, + "learning_rate": 3.924167482698319e-06, + "loss": 0.202, + "step": 22657 + }, + { + "epoch": 0.5733734848293139, + "grad_norm": 8.161149978637695, + "learning_rate": 3.92377536833871e-06, + "loss": 0.1969, + "step": 22658 + }, + { + "epoch": 0.5733987903940077, + "grad_norm": 2.804205894470215, + "learning_rate": 3.9233832609194715e-06, + "loss": 0.1023, + "step": 22659 + }, + { + "epoch": 0.5734240959587014, + "grad_norm": 3.293060064315796, + "learning_rate": 3.922991160443135e-06, + "loss": 0.1173, + "step": 22660 + }, + { + "epoch": 0.573449401523395, + "grad_norm": 8.436311721801758, + "learning_rate": 3.9225990669122275e-06, + "loss": 0.1445, + "step": 22661 + }, + { + "epoch": 0.5734747070880887, + "grad_norm": 7.185157299041748, + "learning_rate": 3.922206980329278e-06, + "loss": 0.159, + "step": 22662 + }, + { + "epoch": 0.5735000126527824, + "grad_norm": 3.4082367420196533, + "learning_rate": 3.921814900696817e-06, + "loss": 0.1616, + "step": 22663 + }, + { + "epoch": 0.573525318217476, + "grad_norm": 5.816591262817383, + "learning_rate": 3.92142282801737e-06, + "loss": 0.2182, + "step": 22664 + }, + { + "epoch": 0.5735506237821697, + "grad_norm": 5.116008281707764, + "learning_rate": 3.921030762293465e-06, + "loss": 0.2538, + "step": 22665 + }, + { + "epoch": 0.5735759293468634, + "grad_norm": 2.73441219329834, + "learning_rate": 3.920638703527634e-06, + "loss": 0.0964, + "step": 22666 + }, + { + "epoch": 0.573601234911557, + "grad_norm": 5.4671149253845215, + "learning_rate": 3.920246651722404e-06, + "loss": 0.1419, + "step": 22667 + }, + { + "epoch": 0.5736265404762507, + "grad_norm": 4.835817813873291, + "learning_rate": 3.9198546068803e-06, + "loss": 0.1927, + "step": 22668 + }, + { + "epoch": 0.5736518460409444, + "grad_norm": 3.0346109867095947, + "learning_rate": 3.9194625690038536e-06, + "loss": 0.1333, + "step": 22669 + }, + { + "epoch": 0.573677151605638, + "grad_norm": 5.630237102508545, + "learning_rate": 3.919070538095592e-06, + "loss": 0.1521, + "step": 22670 + }, + { + "epoch": 0.5737024571703317, + "grad_norm": 3.4129507541656494, + "learning_rate": 3.918678514158043e-06, + "loss": 0.1429, + "step": 22671 + }, + { + "epoch": 0.5737277627350255, + "grad_norm": 4.48954963684082, + "learning_rate": 3.9182864971937375e-06, + "loss": 0.1245, + "step": 22672 + }, + { + "epoch": 0.5737530682997191, + "grad_norm": 4.057988166809082, + "learning_rate": 3.917894487205199e-06, + "loss": 0.1534, + "step": 22673 + }, + { + "epoch": 0.5737783738644128, + "grad_norm": 4.019667148590088, + "learning_rate": 3.917502484194958e-06, + "loss": 0.172, + "step": 22674 + }, + { + "epoch": 0.5738036794291065, + "grad_norm": 3.0210978984832764, + "learning_rate": 3.917110488165542e-06, + "loss": 0.1462, + "step": 22675 + }, + { + "epoch": 0.5738289849938001, + "grad_norm": 5.219059467315674, + "learning_rate": 3.91671849911948e-06, + "loss": 0.1123, + "step": 22676 + }, + { + "epoch": 0.5738542905584938, + "grad_norm": 5.230623245239258, + "learning_rate": 3.916326517059299e-06, + "loss": 0.1963, + "step": 22677 + }, + { + "epoch": 0.5738795961231875, + "grad_norm": 8.3714017868042, + "learning_rate": 3.915934541987525e-06, + "loss": 0.228, + "step": 22678 + }, + { + "epoch": 0.5739049016878812, + "grad_norm": 5.5052876472473145, + "learning_rate": 3.915542573906688e-06, + "loss": 0.1582, + "step": 22679 + }, + { + "epoch": 0.5739302072525748, + "grad_norm": 8.136663436889648, + "learning_rate": 3.915150612819317e-06, + "loss": 0.2202, + "step": 22680 + }, + { + "epoch": 0.5739555128172685, + "grad_norm": 7.2630228996276855, + "learning_rate": 3.914758658727937e-06, + "loss": 0.2279, + "step": 22681 + }, + { + "epoch": 0.5739808183819622, + "grad_norm": 8.027617454528809, + "learning_rate": 3.914366711635075e-06, + "loss": 0.2241, + "step": 22682 + }, + { + "epoch": 0.5740061239466558, + "grad_norm": 11.3360595703125, + "learning_rate": 3.913974771543261e-06, + "loss": 0.2602, + "step": 22683 + }, + { + "epoch": 0.5740314295113496, + "grad_norm": 4.145894527435303, + "learning_rate": 3.913582838455023e-06, + "loss": 0.0865, + "step": 22684 + }, + { + "epoch": 0.5740567350760433, + "grad_norm": 6.258817195892334, + "learning_rate": 3.913190912372885e-06, + "loss": 0.2189, + "step": 22685 + }, + { + "epoch": 0.5740820406407369, + "grad_norm": 15.525126457214355, + "learning_rate": 3.912798993299378e-06, + "loss": 0.2269, + "step": 22686 + }, + { + "epoch": 0.5741073462054306, + "grad_norm": 5.600537300109863, + "learning_rate": 3.912407081237028e-06, + "loss": 0.1694, + "step": 22687 + }, + { + "epoch": 0.5741326517701243, + "grad_norm": 5.214454174041748, + "learning_rate": 3.9120151761883615e-06, + "loss": 0.1975, + "step": 22688 + }, + { + "epoch": 0.5741579573348179, + "grad_norm": 4.415213584899902, + "learning_rate": 3.911623278155908e-06, + "loss": 0.2081, + "step": 22689 + }, + { + "epoch": 0.5741832628995116, + "grad_norm": 3.818577289581299, + "learning_rate": 3.911231387142194e-06, + "loss": 0.1791, + "step": 22690 + }, + { + "epoch": 0.5742085684642053, + "grad_norm": 3.959900140762329, + "learning_rate": 3.910839503149745e-06, + "loss": 0.1241, + "step": 22691 + }, + { + "epoch": 0.5742338740288989, + "grad_norm": 6.159755229949951, + "learning_rate": 3.9104476261810885e-06, + "loss": 0.1861, + "step": 22692 + }, + { + "epoch": 0.5742591795935926, + "grad_norm": 6.984398365020752, + "learning_rate": 3.910055756238753e-06, + "loss": 0.2941, + "step": 22693 + }, + { + "epoch": 0.5742844851582863, + "grad_norm": 3.3546125888824463, + "learning_rate": 3.9096638933252685e-06, + "loss": 0.1907, + "step": 22694 + }, + { + "epoch": 0.5743097907229799, + "grad_norm": 3.5401360988616943, + "learning_rate": 3.909272037443156e-06, + "loss": 0.1501, + "step": 22695 + }, + { + "epoch": 0.5743350962876737, + "grad_norm": 5.758141994476318, + "learning_rate": 3.908880188594945e-06, + "loss": 0.1118, + "step": 22696 + }, + { + "epoch": 0.5743604018523674, + "grad_norm": 7.614289283752441, + "learning_rate": 3.908488346783162e-06, + "loss": 0.1008, + "step": 22697 + }, + { + "epoch": 0.574385707417061, + "grad_norm": 3.816612958908081, + "learning_rate": 3.908096512010338e-06, + "loss": 0.1806, + "step": 22698 + }, + { + "epoch": 0.5744110129817547, + "grad_norm": 5.680837154388428, + "learning_rate": 3.9077046842789935e-06, + "loss": 0.1432, + "step": 22699 + }, + { + "epoch": 0.5744363185464484, + "grad_norm": 4.340637683868408, + "learning_rate": 3.907312863591659e-06, + "loss": 0.1614, + "step": 22700 + }, + { + "epoch": 0.574461624111142, + "grad_norm": 9.081080436706543, + "learning_rate": 3.906921049950859e-06, + "loss": 0.2733, + "step": 22701 + }, + { + "epoch": 0.5744869296758357, + "grad_norm": 6.835057258605957, + "learning_rate": 3.906529243359126e-06, + "loss": 0.1313, + "step": 22702 + }, + { + "epoch": 0.5745122352405294, + "grad_norm": 3.1649794578552246, + "learning_rate": 3.906137443818979e-06, + "loss": 0.1337, + "step": 22703 + }, + { + "epoch": 0.5745375408052231, + "grad_norm": 4.461863040924072, + "learning_rate": 3.905745651332949e-06, + "loss": 0.1681, + "step": 22704 + }, + { + "epoch": 0.5745628463699167, + "grad_norm": 9.939545631408691, + "learning_rate": 3.905353865903561e-06, + "loss": 0.1859, + "step": 22705 + }, + { + "epoch": 0.5745881519346104, + "grad_norm": 3.9475650787353516, + "learning_rate": 3.904962087533343e-06, + "loss": 0.1824, + "step": 22706 + }, + { + "epoch": 0.5746134574993041, + "grad_norm": 8.4269437789917, + "learning_rate": 3.904570316224822e-06, + "loss": 0.2021, + "step": 22707 + }, + { + "epoch": 0.5746387630639977, + "grad_norm": 8.235201835632324, + "learning_rate": 3.9041785519805216e-06, + "loss": 0.2855, + "step": 22708 + }, + { + "epoch": 0.5746640686286915, + "grad_norm": 4.370299816131592, + "learning_rate": 3.90378679480297e-06, + "loss": 0.1123, + "step": 22709 + }, + { + "epoch": 0.5746893741933852, + "grad_norm": 4.291810035705566, + "learning_rate": 3.903395044694694e-06, + "loss": 0.1589, + "step": 22710 + }, + { + "epoch": 0.5747146797580788, + "grad_norm": 9.36426830291748, + "learning_rate": 3.9030033016582206e-06, + "loss": 0.1926, + "step": 22711 + }, + { + "epoch": 0.5747399853227725, + "grad_norm": 6.498364448547363, + "learning_rate": 3.902611565696073e-06, + "loss": 0.1719, + "step": 22712 + }, + { + "epoch": 0.5747652908874662, + "grad_norm": 6.645338535308838, + "learning_rate": 3.902219836810779e-06, + "loss": 0.2217, + "step": 22713 + }, + { + "epoch": 0.5747905964521598, + "grad_norm": 8.363780975341797, + "learning_rate": 3.901828115004867e-06, + "loss": 0.1597, + "step": 22714 + }, + { + "epoch": 0.5748159020168535, + "grad_norm": 2.6251182556152344, + "learning_rate": 3.9014364002808595e-06, + "loss": 0.0934, + "step": 22715 + }, + { + "epoch": 0.5748412075815472, + "grad_norm": 4.467206954956055, + "learning_rate": 3.901044692641286e-06, + "loss": 0.158, + "step": 22716 + }, + { + "epoch": 0.5748665131462408, + "grad_norm": 6.817826747894287, + "learning_rate": 3.90065299208867e-06, + "loss": 0.2183, + "step": 22717 + }, + { + "epoch": 0.5748918187109345, + "grad_norm": 3.5771689414978027, + "learning_rate": 3.900261298625538e-06, + "loss": 0.1133, + "step": 22718 + }, + { + "epoch": 0.5749171242756282, + "grad_norm": 4.557682991027832, + "learning_rate": 3.899869612254416e-06, + "loss": 0.1714, + "step": 22719 + }, + { + "epoch": 0.5749424298403218, + "grad_norm": 7.701560020446777, + "learning_rate": 3.899477932977831e-06, + "loss": 0.1988, + "step": 22720 + }, + { + "epoch": 0.5749677354050156, + "grad_norm": 3.669252634048462, + "learning_rate": 3.8990862607983084e-06, + "loss": 0.1592, + "step": 22721 + }, + { + "epoch": 0.5749930409697093, + "grad_norm": 3.5070180892944336, + "learning_rate": 3.898694595718373e-06, + "loss": 0.1305, + "step": 22722 + }, + { + "epoch": 0.5750183465344029, + "grad_norm": 7.078171253204346, + "learning_rate": 3.89830293774055e-06, + "loss": 0.1899, + "step": 22723 + }, + { + "epoch": 0.5750436520990966, + "grad_norm": 4.266678333282471, + "learning_rate": 3.897911286867368e-06, + "loss": 0.1086, + "step": 22724 + }, + { + "epoch": 0.5750689576637903, + "grad_norm": 17.812910079956055, + "learning_rate": 3.897519643101353e-06, + "loss": 0.2992, + "step": 22725 + }, + { + "epoch": 0.5750942632284839, + "grad_norm": 7.126338481903076, + "learning_rate": 3.897128006445026e-06, + "loss": 0.1827, + "step": 22726 + }, + { + "epoch": 0.5751195687931776, + "grad_norm": 4.9446024894714355, + "learning_rate": 3.896736376900915e-06, + "loss": 0.1922, + "step": 22727 + }, + { + "epoch": 0.5751448743578713, + "grad_norm": 15.8784818649292, + "learning_rate": 3.896344754471546e-06, + "loss": 0.2218, + "step": 22728 + }, + { + "epoch": 0.5751701799225649, + "grad_norm": 4.097416400909424, + "learning_rate": 3.895953139159447e-06, + "loss": 0.1674, + "step": 22729 + }, + { + "epoch": 0.5751954854872586, + "grad_norm": 10.491619110107422, + "learning_rate": 3.895561530967139e-06, + "loss": 0.1935, + "step": 22730 + }, + { + "epoch": 0.5752207910519523, + "grad_norm": 6.154538154602051, + "learning_rate": 3.895169929897149e-06, + "loss": 0.1402, + "step": 22731 + }, + { + "epoch": 0.575246096616646, + "grad_norm": 7.576605796813965, + "learning_rate": 3.894778335952003e-06, + "loss": 0.2853, + "step": 22732 + }, + { + "epoch": 0.5752714021813397, + "grad_norm": 4.211528778076172, + "learning_rate": 3.894386749134227e-06, + "loss": 0.1078, + "step": 22733 + }, + { + "epoch": 0.5752967077460334, + "grad_norm": 7.014017105102539, + "learning_rate": 3.893995169446344e-06, + "loss": 0.1745, + "step": 22734 + }, + { + "epoch": 0.5753220133107271, + "grad_norm": 8.530733108520508, + "learning_rate": 3.89360359689088e-06, + "loss": 0.1402, + "step": 22735 + }, + { + "epoch": 0.5753473188754207, + "grad_norm": 11.068403244018555, + "learning_rate": 3.89321203147036e-06, + "loss": 0.2218, + "step": 22736 + }, + { + "epoch": 0.5753726244401144, + "grad_norm": 7.392401218414307, + "learning_rate": 3.892820473187311e-06, + "loss": 0.2377, + "step": 22737 + }, + { + "epoch": 0.5753979300048081, + "grad_norm": 2.691659927368164, + "learning_rate": 3.892428922044258e-06, + "loss": 0.0918, + "step": 22738 + }, + { + "epoch": 0.5754232355695017, + "grad_norm": 3.999199628829956, + "learning_rate": 3.892037378043723e-06, + "loss": 0.0721, + "step": 22739 + }, + { + "epoch": 0.5754485411341954, + "grad_norm": 4.450044631958008, + "learning_rate": 3.891645841188233e-06, + "loss": 0.1498, + "step": 22740 + }, + { + "epoch": 0.5754738466988891, + "grad_norm": 6.949442386627197, + "learning_rate": 3.891254311480314e-06, + "loss": 0.2026, + "step": 22741 + }, + { + "epoch": 0.5754991522635827, + "grad_norm": 2.9890329837799072, + "learning_rate": 3.89086278892249e-06, + "loss": 0.1116, + "step": 22742 + }, + { + "epoch": 0.5755244578282764, + "grad_norm": 4.518585205078125, + "learning_rate": 3.890471273517284e-06, + "loss": 0.1448, + "step": 22743 + }, + { + "epoch": 0.5755497633929701, + "grad_norm": 4.1201958656311035, + "learning_rate": 3.8900797652672215e-06, + "loss": 0.1374, + "step": 22744 + }, + { + "epoch": 0.5755750689576637, + "grad_norm": 5.162757396697998, + "learning_rate": 3.88968826417483e-06, + "loss": 0.1393, + "step": 22745 + }, + { + "epoch": 0.5756003745223575, + "grad_norm": 6.114354133605957, + "learning_rate": 3.889296770242632e-06, + "loss": 0.1479, + "step": 22746 + }, + { + "epoch": 0.5756256800870512, + "grad_norm": 3.821168899536133, + "learning_rate": 3.8889052834731525e-06, + "loss": 0.0564, + "step": 22747 + }, + { + "epoch": 0.5756509856517448, + "grad_norm": 6.026469707489014, + "learning_rate": 3.888513803868916e-06, + "loss": 0.2686, + "step": 22748 + }, + { + "epoch": 0.5756762912164385, + "grad_norm": 5.223651885986328, + "learning_rate": 3.888122331432447e-06, + "loss": 0.1451, + "step": 22749 + }, + { + "epoch": 0.5757015967811322, + "grad_norm": 4.257720947265625, + "learning_rate": 3.88773086616627e-06, + "loss": 0.1955, + "step": 22750 + }, + { + "epoch": 0.5757269023458258, + "grad_norm": 5.402416229248047, + "learning_rate": 3.8873394080729124e-06, + "loss": 0.1478, + "step": 22751 + }, + { + "epoch": 0.5757522079105195, + "grad_norm": 6.172515392303467, + "learning_rate": 3.886947957154893e-06, + "loss": 0.0906, + "step": 22752 + }, + { + "epoch": 0.5757775134752132, + "grad_norm": 3.3036904335021973, + "learning_rate": 3.886556513414739e-06, + "loss": 0.1791, + "step": 22753 + }, + { + "epoch": 0.5758028190399068, + "grad_norm": 11.733235359191895, + "learning_rate": 3.886165076854975e-06, + "loss": 0.277, + "step": 22754 + }, + { + "epoch": 0.5758281246046005, + "grad_norm": 5.90920352935791, + "learning_rate": 3.885773647478128e-06, + "loss": 0.2098, + "step": 22755 + }, + { + "epoch": 0.5758534301692942, + "grad_norm": 4.463888168334961, + "learning_rate": 3.885382225286718e-06, + "loss": 0.1411, + "step": 22756 + }, + { + "epoch": 0.575878735733988, + "grad_norm": 4.351029872894287, + "learning_rate": 3.8849908102832686e-06, + "loss": 0.1654, + "step": 22757 + }, + { + "epoch": 0.5759040412986816, + "grad_norm": 4.2727460861206055, + "learning_rate": 3.8845994024703085e-06, + "loss": 0.1581, + "step": 22758 + }, + { + "epoch": 0.5759293468633753, + "grad_norm": 5.26540470123291, + "learning_rate": 3.884208001850357e-06, + "loss": 0.1751, + "step": 22759 + }, + { + "epoch": 0.575954652428069, + "grad_norm": 4.538370609283447, + "learning_rate": 3.883816608425944e-06, + "loss": 0.1536, + "step": 22760 + }, + { + "epoch": 0.5759799579927626, + "grad_norm": 5.5985493659973145, + "learning_rate": 3.883425222199588e-06, + "loss": 0.1413, + "step": 22761 + }, + { + "epoch": 0.5760052635574563, + "grad_norm": 5.783119201660156, + "learning_rate": 3.8830338431738145e-06, + "loss": 0.194, + "step": 22762 + }, + { + "epoch": 0.57603056912215, + "grad_norm": 3.5220203399658203, + "learning_rate": 3.882642471351149e-06, + "loss": 0.1392, + "step": 22763 + }, + { + "epoch": 0.5760558746868436, + "grad_norm": 7.116716384887695, + "learning_rate": 3.882251106734115e-06, + "loss": 0.0797, + "step": 22764 + }, + { + "epoch": 0.5760811802515373, + "grad_norm": 3.7081148624420166, + "learning_rate": 3.881859749325235e-06, + "loss": 0.1615, + "step": 22765 + }, + { + "epoch": 0.576106485816231, + "grad_norm": 7.907839775085449, + "learning_rate": 3.881468399127032e-06, + "loss": 0.2857, + "step": 22766 + }, + { + "epoch": 0.5761317913809246, + "grad_norm": 3.6246283054351807, + "learning_rate": 3.881077056142033e-06, + "loss": 0.1568, + "step": 22767 + }, + { + "epoch": 0.5761570969456183, + "grad_norm": 3.1142489910125732, + "learning_rate": 3.880685720372759e-06, + "loss": 0.1772, + "step": 22768 + }, + { + "epoch": 0.576182402510312, + "grad_norm": 4.337817192077637, + "learning_rate": 3.880294391821736e-06, + "loss": 0.1289, + "step": 22769 + }, + { + "epoch": 0.5762077080750057, + "grad_norm": 7.199014186859131, + "learning_rate": 3.879903070491485e-06, + "loss": 0.2235, + "step": 22770 + }, + { + "epoch": 0.5762330136396994, + "grad_norm": 6.168984889984131, + "learning_rate": 3.879511756384531e-06, + "loss": 0.1879, + "step": 22771 + }, + { + "epoch": 0.5762583192043931, + "grad_norm": 5.539872169494629, + "learning_rate": 3.879120449503397e-06, + "loss": 0.1962, + "step": 22772 + }, + { + "epoch": 0.5762836247690867, + "grad_norm": 9.650232315063477, + "learning_rate": 3.878729149850608e-06, + "loss": 0.2385, + "step": 22773 + }, + { + "epoch": 0.5763089303337804, + "grad_norm": 3.5676584243774414, + "learning_rate": 3.8783378574286844e-06, + "loss": 0.1062, + "step": 22774 + }, + { + "epoch": 0.5763342358984741, + "grad_norm": 4.112771511077881, + "learning_rate": 3.877946572240153e-06, + "loss": 0.1803, + "step": 22775 + }, + { + "epoch": 0.5763595414631677, + "grad_norm": 11.317249298095703, + "learning_rate": 3.8775552942875336e-06, + "loss": 0.2508, + "step": 22776 + }, + { + "epoch": 0.5763848470278614, + "grad_norm": 4.748392105102539, + "learning_rate": 3.877164023573352e-06, + "loss": 0.1731, + "step": 22777 + }, + { + "epoch": 0.5764101525925551, + "grad_norm": 7.508969306945801, + "learning_rate": 3.876772760100132e-06, + "loss": 0.2138, + "step": 22778 + }, + { + "epoch": 0.5764354581572487, + "grad_norm": 2.72216796875, + "learning_rate": 3.876381503870394e-06, + "loss": 0.0847, + "step": 22779 + }, + { + "epoch": 0.5764607637219424, + "grad_norm": 4.864619731903076, + "learning_rate": 3.8759902548866625e-06, + "loss": 0.1229, + "step": 22780 + }, + { + "epoch": 0.5764860692866361, + "grad_norm": 6.4433488845825195, + "learning_rate": 3.875599013151461e-06, + "loss": 0.2322, + "step": 22781 + }, + { + "epoch": 0.5765113748513299, + "grad_norm": 6.794747352600098, + "learning_rate": 3.875207778667314e-06, + "loss": 0.149, + "step": 22782 + }, + { + "epoch": 0.5765366804160235, + "grad_norm": 4.0778069496154785, + "learning_rate": 3.874816551436741e-06, + "loss": 0.2155, + "step": 22783 + }, + { + "epoch": 0.5765619859807172, + "grad_norm": 4.116300582885742, + "learning_rate": 3.874425331462267e-06, + "loss": 0.1568, + "step": 22784 + }, + { + "epoch": 0.5765872915454109, + "grad_norm": 5.223411560058594, + "learning_rate": 3.874034118746415e-06, + "loss": 0.1737, + "step": 22785 + }, + { + "epoch": 0.5766125971101045, + "grad_norm": 6.202563285827637, + "learning_rate": 3.873642913291709e-06, + "loss": 0.1785, + "step": 22786 + }, + { + "epoch": 0.5766379026747982, + "grad_norm": 19.751598358154297, + "learning_rate": 3.8732517151006674e-06, + "loss": 0.2903, + "step": 22787 + }, + { + "epoch": 0.5766632082394919, + "grad_norm": 5.223079681396484, + "learning_rate": 3.872860524175817e-06, + "loss": 0.187, + "step": 22788 + }, + { + "epoch": 0.5766885138041855, + "grad_norm": 7.234426975250244, + "learning_rate": 3.872469340519679e-06, + "loss": 0.2663, + "step": 22789 + }, + { + "epoch": 0.5767138193688792, + "grad_norm": 5.557494163513184, + "learning_rate": 3.872078164134777e-06, + "loss": 0.1604, + "step": 22790 + }, + { + "epoch": 0.5767391249335729, + "grad_norm": 7.189732551574707, + "learning_rate": 3.871686995023635e-06, + "loss": 0.1151, + "step": 22791 + }, + { + "epoch": 0.5767644304982665, + "grad_norm": 2.7272679805755615, + "learning_rate": 3.871295833188772e-06, + "loss": 0.0868, + "step": 22792 + }, + { + "epoch": 0.5767897360629602, + "grad_norm": 29.010719299316406, + "learning_rate": 3.870904678632711e-06, + "loss": 0.2404, + "step": 22793 + }, + { + "epoch": 0.576815041627654, + "grad_norm": 5.130034446716309, + "learning_rate": 3.870513531357977e-06, + "loss": 0.17, + "step": 22794 + }, + { + "epoch": 0.5768403471923476, + "grad_norm": 3.8295187950134277, + "learning_rate": 3.870122391367092e-06, + "loss": 0.1285, + "step": 22795 + }, + { + "epoch": 0.5768656527570413, + "grad_norm": 10.552562713623047, + "learning_rate": 3.869731258662576e-06, + "loss": 0.2629, + "step": 22796 + }, + { + "epoch": 0.576890958321735, + "grad_norm": 4.343571186065674, + "learning_rate": 3.8693401332469535e-06, + "loss": 0.1728, + "step": 22797 + }, + { + "epoch": 0.5769162638864286, + "grad_norm": 4.025177001953125, + "learning_rate": 3.868949015122745e-06, + "loss": 0.1911, + "step": 22798 + }, + { + "epoch": 0.5769415694511223, + "grad_norm": 5.740329265594482, + "learning_rate": 3.868557904292476e-06, + "loss": 0.1511, + "step": 22799 + }, + { + "epoch": 0.576966875015816, + "grad_norm": 3.457016706466675, + "learning_rate": 3.868166800758666e-06, + "loss": 0.1777, + "step": 22800 + }, + { + "epoch": 0.5769921805805096, + "grad_norm": 5.9384965896606445, + "learning_rate": 3.867775704523837e-06, + "loss": 0.1616, + "step": 22801 + }, + { + "epoch": 0.5770174861452033, + "grad_norm": 5.278132438659668, + "learning_rate": 3.867384615590513e-06, + "loss": 0.1748, + "step": 22802 + }, + { + "epoch": 0.577042791709897, + "grad_norm": 9.257139205932617, + "learning_rate": 3.8669935339612136e-06, + "loss": 0.2659, + "step": 22803 + }, + { + "epoch": 0.5770680972745906, + "grad_norm": 3.8916549682617188, + "learning_rate": 3.866602459638463e-06, + "loss": 0.1165, + "step": 22804 + }, + { + "epoch": 0.5770934028392843, + "grad_norm": 3.471419334411621, + "learning_rate": 3.866211392624781e-06, + "loss": 0.1562, + "step": 22805 + }, + { + "epoch": 0.577118708403978, + "grad_norm": 5.688705921173096, + "learning_rate": 3.865820332922693e-06, + "loss": 0.1304, + "step": 22806 + }, + { + "epoch": 0.5771440139686718, + "grad_norm": 3.8771724700927734, + "learning_rate": 3.865429280534717e-06, + "loss": 0.1425, + "step": 22807 + }, + { + "epoch": 0.5771693195333654, + "grad_norm": 15.625449180603027, + "learning_rate": 3.865038235463377e-06, + "loss": 0.256, + "step": 22808 + }, + { + "epoch": 0.5771946250980591, + "grad_norm": 7.236454010009766, + "learning_rate": 3.864647197711195e-06, + "loss": 0.1216, + "step": 22809 + }, + { + "epoch": 0.5772199306627528, + "grad_norm": 13.233369827270508, + "learning_rate": 3.864256167280692e-06, + "loss": 0.2221, + "step": 22810 + }, + { + "epoch": 0.5772452362274464, + "grad_norm": 3.7603888511657715, + "learning_rate": 3.863865144174388e-06, + "loss": 0.1527, + "step": 22811 + }, + { + "epoch": 0.5772705417921401, + "grad_norm": 5.760840892791748, + "learning_rate": 3.863474128394808e-06, + "loss": 0.1814, + "step": 22812 + }, + { + "epoch": 0.5772958473568338, + "grad_norm": 7.604315757751465, + "learning_rate": 3.863083119944472e-06, + "loss": 0.1572, + "step": 22813 + }, + { + "epoch": 0.5773211529215274, + "grad_norm": 7.7448954582214355, + "learning_rate": 3.8626921188259e-06, + "loss": 0.1554, + "step": 22814 + }, + { + "epoch": 0.5773464584862211, + "grad_norm": 32.876338958740234, + "learning_rate": 3.862301125041616e-06, + "loss": 0.2626, + "step": 22815 + }, + { + "epoch": 0.5773717640509148, + "grad_norm": 3.248863697052002, + "learning_rate": 3.861910138594139e-06, + "loss": 0.1356, + "step": 22816 + }, + { + "epoch": 0.5773970696156084, + "grad_norm": 7.9279351234436035, + "learning_rate": 3.861519159485995e-06, + "loss": 0.1986, + "step": 22817 + }, + { + "epoch": 0.5774223751803021, + "grad_norm": 6.841838836669922, + "learning_rate": 3.8611281877197e-06, + "loss": 0.1972, + "step": 22818 + }, + { + "epoch": 0.5774476807449959, + "grad_norm": 5.332899570465088, + "learning_rate": 3.860737223297777e-06, + "loss": 0.1278, + "step": 22819 + }, + { + "epoch": 0.5774729863096895, + "grad_norm": 3.060220241546631, + "learning_rate": 3.8603462662227485e-06, + "loss": 0.092, + "step": 22820 + }, + { + "epoch": 0.5774982918743832, + "grad_norm": 4.413395404815674, + "learning_rate": 3.859955316497134e-06, + "loss": 0.1451, + "step": 22821 + }, + { + "epoch": 0.5775235974390769, + "grad_norm": 10.385092735290527, + "learning_rate": 3.8595643741234585e-06, + "loss": 0.2637, + "step": 22822 + }, + { + "epoch": 0.5775489030037705, + "grad_norm": 3.986015796661377, + "learning_rate": 3.859173439104238e-06, + "loss": 0.1525, + "step": 22823 + }, + { + "epoch": 0.5775742085684642, + "grad_norm": 2.4642646312713623, + "learning_rate": 3.858782511441995e-06, + "loss": 0.1392, + "step": 22824 + }, + { + "epoch": 0.5775995141331579, + "grad_norm": 3.2294251918792725, + "learning_rate": 3.858391591139252e-06, + "loss": 0.1357, + "step": 22825 + }, + { + "epoch": 0.5776248196978515, + "grad_norm": 11.22140884399414, + "learning_rate": 3.85800067819853e-06, + "loss": 0.3141, + "step": 22826 + }, + { + "epoch": 0.5776501252625452, + "grad_norm": 3.221287965774536, + "learning_rate": 3.857609772622349e-06, + "loss": 0.1558, + "step": 22827 + }, + { + "epoch": 0.5776754308272389, + "grad_norm": 2.7141005992889404, + "learning_rate": 3.857218874413229e-06, + "loss": 0.1285, + "step": 22828 + }, + { + "epoch": 0.5777007363919325, + "grad_norm": 3.5384912490844727, + "learning_rate": 3.8568279835736925e-06, + "loss": 0.1672, + "step": 22829 + }, + { + "epoch": 0.5777260419566262, + "grad_norm": 5.246383190155029, + "learning_rate": 3.8564371001062615e-06, + "loss": 0.2546, + "step": 22830 + }, + { + "epoch": 0.57775134752132, + "grad_norm": 1.741361141204834, + "learning_rate": 3.856046224013453e-06, + "loss": 0.073, + "step": 22831 + }, + { + "epoch": 0.5777766530860137, + "grad_norm": 9.606311798095703, + "learning_rate": 3.855655355297789e-06, + "loss": 0.0957, + "step": 22832 + }, + { + "epoch": 0.5778019586507073, + "grad_norm": 5.259280204772949, + "learning_rate": 3.855264493961793e-06, + "loss": 0.2215, + "step": 22833 + }, + { + "epoch": 0.577827264215401, + "grad_norm": 10.735115051269531, + "learning_rate": 3.854873640007981e-06, + "loss": 0.2324, + "step": 22834 + }, + { + "epoch": 0.5778525697800947, + "grad_norm": 4.989270210266113, + "learning_rate": 3.854482793438878e-06, + "loss": 0.1468, + "step": 22835 + }, + { + "epoch": 0.5778778753447883, + "grad_norm": 8.091961860656738, + "learning_rate": 3.854091954257001e-06, + "loss": 0.4193, + "step": 22836 + }, + { + "epoch": 0.577903180909482, + "grad_norm": 6.454986572265625, + "learning_rate": 3.8537011224648725e-06, + "loss": 0.1769, + "step": 22837 + }, + { + "epoch": 0.5779284864741757, + "grad_norm": 7.044318675994873, + "learning_rate": 3.853310298065011e-06, + "loss": 0.1953, + "step": 22838 + }, + { + "epoch": 0.5779537920388693, + "grad_norm": 4.543162822723389, + "learning_rate": 3.852919481059941e-06, + "loss": 0.1227, + "step": 22839 + }, + { + "epoch": 0.577979097603563, + "grad_norm": 3.573512077331543, + "learning_rate": 3.852528671452177e-06, + "loss": 0.2045, + "step": 22840 + }, + { + "epoch": 0.5780044031682567, + "grad_norm": 6.53065824508667, + "learning_rate": 3.852137869244243e-06, + "loss": 0.1634, + "step": 22841 + }, + { + "epoch": 0.5780297087329503, + "grad_norm": 3.850623607635498, + "learning_rate": 3.851747074438657e-06, + "loss": 0.1885, + "step": 22842 + }, + { + "epoch": 0.578055014297644, + "grad_norm": 3.129513740539551, + "learning_rate": 3.851356287037942e-06, + "loss": 0.1435, + "step": 22843 + }, + { + "epoch": 0.5780803198623378, + "grad_norm": 4.628290176391602, + "learning_rate": 3.850965507044619e-06, + "loss": 0.146, + "step": 22844 + }, + { + "epoch": 0.5781056254270314, + "grad_norm": 5.948338985443115, + "learning_rate": 3.850574734461202e-06, + "loss": 0.2108, + "step": 22845 + }, + { + "epoch": 0.5781309309917251, + "grad_norm": 3.0661091804504395, + "learning_rate": 3.850183969290215e-06, + "loss": 0.1672, + "step": 22846 + }, + { + "epoch": 0.5781562365564188, + "grad_norm": 3.648981809616089, + "learning_rate": 3.849793211534179e-06, + "loss": 0.111, + "step": 22847 + }, + { + "epoch": 0.5781815421211124, + "grad_norm": 3.514665365219116, + "learning_rate": 3.849402461195613e-06, + "loss": 0.0943, + "step": 22848 + }, + { + "epoch": 0.5782068476858061, + "grad_norm": 7.394779205322266, + "learning_rate": 3.849011718277036e-06, + "loss": 0.1248, + "step": 22849 + }, + { + "epoch": 0.5782321532504998, + "grad_norm": 3.0383718013763428, + "learning_rate": 3.848620982780967e-06, + "loss": 0.142, + "step": 22850 + }, + { + "epoch": 0.5782574588151934, + "grad_norm": 4.245165824890137, + "learning_rate": 3.8482302547099285e-06, + "loss": 0.1561, + "step": 22851 + }, + { + "epoch": 0.5782827643798871, + "grad_norm": 2.9607200622558594, + "learning_rate": 3.847839534066437e-06, + "loss": 0.1602, + "step": 22852 + }, + { + "epoch": 0.5783080699445808, + "grad_norm": 9.876708030700684, + "learning_rate": 3.847448820853018e-06, + "loss": 0.2953, + "step": 22853 + }, + { + "epoch": 0.5783333755092744, + "grad_norm": 5.728034496307373, + "learning_rate": 3.847058115072184e-06, + "loss": 0.1706, + "step": 22854 + }, + { + "epoch": 0.5783586810739682, + "grad_norm": 3.813559055328369, + "learning_rate": 3.846667416726458e-06, + "loss": 0.1682, + "step": 22855 + }, + { + "epoch": 0.5783839866386619, + "grad_norm": 11.503632545471191, + "learning_rate": 3.84627672581836e-06, + "loss": 0.2139, + "step": 22856 + }, + { + "epoch": 0.5784092922033555, + "grad_norm": 4.178658485412598, + "learning_rate": 3.84588604235041e-06, + "loss": 0.2313, + "step": 22857 + }, + { + "epoch": 0.5784345977680492, + "grad_norm": 6.855123043060303, + "learning_rate": 3.845495366325124e-06, + "loss": 0.2356, + "step": 22858 + }, + { + "epoch": 0.5784599033327429, + "grad_norm": 5.167211055755615, + "learning_rate": 3.845104697745024e-06, + "loss": 0.208, + "step": 22859 + }, + { + "epoch": 0.5784852088974366, + "grad_norm": 6.558145999908447, + "learning_rate": 3.8447140366126304e-06, + "loss": 0.1813, + "step": 22860 + }, + { + "epoch": 0.5785105144621302, + "grad_norm": 6.148786544799805, + "learning_rate": 3.844323382930461e-06, + "loss": 0.1698, + "step": 22861 + }, + { + "epoch": 0.5785358200268239, + "grad_norm": 3.3734381198883057, + "learning_rate": 3.843932736701033e-06, + "loss": 0.1696, + "step": 22862 + }, + { + "epoch": 0.5785611255915176, + "grad_norm": 10.189287185668945, + "learning_rate": 3.8435420979268695e-06, + "loss": 0.2935, + "step": 22863 + }, + { + "epoch": 0.5785864311562112, + "grad_norm": 3.297916889190674, + "learning_rate": 3.843151466610487e-06, + "loss": 0.1233, + "step": 22864 + }, + { + "epoch": 0.5786117367209049, + "grad_norm": 5.5751566886901855, + "learning_rate": 3.842760842754405e-06, + "loss": 0.181, + "step": 22865 + }, + { + "epoch": 0.5786370422855986, + "grad_norm": 5.1773881912231445, + "learning_rate": 3.842370226361145e-06, + "loss": 0.1561, + "step": 22866 + }, + { + "epoch": 0.5786623478502922, + "grad_norm": 9.869997024536133, + "learning_rate": 3.841979617433223e-06, + "loss": 0.1523, + "step": 22867 + }, + { + "epoch": 0.578687653414986, + "grad_norm": 4.901448726654053, + "learning_rate": 3.841589015973159e-06, + "loss": 0.1514, + "step": 22868 + }, + { + "epoch": 0.5787129589796797, + "grad_norm": 4.991194248199463, + "learning_rate": 3.841198421983471e-06, + "loss": 0.1589, + "step": 22869 + }, + { + "epoch": 0.5787382645443733, + "grad_norm": 4.297414302825928, + "learning_rate": 3.8408078354666816e-06, + "loss": 0.1514, + "step": 22870 + }, + { + "epoch": 0.578763570109067, + "grad_norm": 6.947904109954834, + "learning_rate": 3.840417256425305e-06, + "loss": 0.1918, + "step": 22871 + }, + { + "epoch": 0.5787888756737607, + "grad_norm": 7.202373027801514, + "learning_rate": 3.840026684861861e-06, + "loss": 0.2437, + "step": 22872 + }, + { + "epoch": 0.5788141812384543, + "grad_norm": 18.18430519104004, + "learning_rate": 3.83963612077887e-06, + "loss": 0.3049, + "step": 22873 + }, + { + "epoch": 0.578839486803148, + "grad_norm": 3.1535024642944336, + "learning_rate": 3.839245564178849e-06, + "loss": 0.1702, + "step": 22874 + }, + { + "epoch": 0.5788647923678417, + "grad_norm": 5.522505283355713, + "learning_rate": 3.83885501506432e-06, + "loss": 0.2205, + "step": 22875 + }, + { + "epoch": 0.5788900979325353, + "grad_norm": 5.0034871101379395, + "learning_rate": 3.8384644734377966e-06, + "loss": 0.1568, + "step": 22876 + }, + { + "epoch": 0.578915403497229, + "grad_norm": 7.755469799041748, + "learning_rate": 3.8380739393018005e-06, + "loss": 0.1788, + "step": 22877 + }, + { + "epoch": 0.5789407090619227, + "grad_norm": 4.234287261962891, + "learning_rate": 3.8376834126588486e-06, + "loss": 0.1994, + "step": 22878 + }, + { + "epoch": 0.5789660146266163, + "grad_norm": 9.305405616760254, + "learning_rate": 3.837292893511462e-06, + "loss": 0.188, + "step": 22879 + }, + { + "epoch": 0.57899132019131, + "grad_norm": 4.261345863342285, + "learning_rate": 3.836902381862156e-06, + "loss": 0.1391, + "step": 22880 + }, + { + "epoch": 0.5790166257560038, + "grad_norm": 5.880903720855713, + "learning_rate": 3.83651187771345e-06, + "loss": 0.155, + "step": 22881 + }, + { + "epoch": 0.5790419313206974, + "grad_norm": 3.7572410106658936, + "learning_rate": 3.8361213810678625e-06, + "loss": 0.1665, + "step": 22882 + }, + { + "epoch": 0.5790672368853911, + "grad_norm": 8.437463760375977, + "learning_rate": 3.8357308919279135e-06, + "loss": 0.1614, + "step": 22883 + }, + { + "epoch": 0.5790925424500848, + "grad_norm": 4.17843770980835, + "learning_rate": 3.835340410296118e-06, + "loss": 0.1527, + "step": 22884 + }, + { + "epoch": 0.5791178480147785, + "grad_norm": 2.880946159362793, + "learning_rate": 3.834949936174994e-06, + "loss": 0.0919, + "step": 22885 + }, + { + "epoch": 0.5791431535794721, + "grad_norm": 9.80041217803955, + "learning_rate": 3.834559469567063e-06, + "loss": 0.199, + "step": 22886 + }, + { + "epoch": 0.5791684591441658, + "grad_norm": 2.8153953552246094, + "learning_rate": 3.83416901047484e-06, + "loss": 0.1076, + "step": 22887 + }, + { + "epoch": 0.5791937647088595, + "grad_norm": 3.8722102642059326, + "learning_rate": 3.833778558900847e-06, + "loss": 0.0891, + "step": 22888 + }, + { + "epoch": 0.5792190702735531, + "grad_norm": 8.789044380187988, + "learning_rate": 3.8333881148475965e-06, + "loss": 0.1348, + "step": 22889 + }, + { + "epoch": 0.5792443758382468, + "grad_norm": 13.15139389038086, + "learning_rate": 3.832997678317609e-06, + "loss": 0.2825, + "step": 22890 + }, + { + "epoch": 0.5792696814029406, + "grad_norm": 3.5757343769073486, + "learning_rate": 3.832607249313404e-06, + "loss": 0.1551, + "step": 22891 + }, + { + "epoch": 0.5792949869676342, + "grad_norm": 4.879418849945068, + "learning_rate": 3.8322168278374976e-06, + "loss": 0.1292, + "step": 22892 + }, + { + "epoch": 0.5793202925323279, + "grad_norm": 13.496274948120117, + "learning_rate": 3.831826413892406e-06, + "loss": 0.3207, + "step": 22893 + }, + { + "epoch": 0.5793455980970216, + "grad_norm": 3.7732057571411133, + "learning_rate": 3.83143600748065e-06, + "loss": 0.1884, + "step": 22894 + }, + { + "epoch": 0.5793709036617152, + "grad_norm": 4.537570476531982, + "learning_rate": 3.831045608604745e-06, + "loss": 0.2017, + "step": 22895 + }, + { + "epoch": 0.5793962092264089, + "grad_norm": 3.9127142429351807, + "learning_rate": 3.830655217267209e-06, + "loss": 0.1582, + "step": 22896 + }, + { + "epoch": 0.5794215147911026, + "grad_norm": 5.045092582702637, + "learning_rate": 3.830264833470563e-06, + "loss": 0.1849, + "step": 22897 + }, + { + "epoch": 0.5794468203557962, + "grad_norm": 3.8040919303894043, + "learning_rate": 3.829874457217318e-06, + "loss": 0.1424, + "step": 22898 + }, + { + "epoch": 0.5794721259204899, + "grad_norm": 2.8100125789642334, + "learning_rate": 3.829484088509995e-06, + "loss": 0.1518, + "step": 22899 + }, + { + "epoch": 0.5794974314851836, + "grad_norm": 5.493989944458008, + "learning_rate": 3.829093727351112e-06, + "loss": 0.2044, + "step": 22900 + }, + { + "epoch": 0.5795227370498772, + "grad_norm": 4.0130157470703125, + "learning_rate": 3.828703373743188e-06, + "loss": 0.1466, + "step": 22901 + }, + { + "epoch": 0.5795480426145709, + "grad_norm": 8.108905792236328, + "learning_rate": 3.828313027688736e-06, + "loss": 0.3718, + "step": 22902 + }, + { + "epoch": 0.5795733481792646, + "grad_norm": 13.633749961853027, + "learning_rate": 3.827922689190276e-06, + "loss": 0.3674, + "step": 22903 + }, + { + "epoch": 0.5795986537439582, + "grad_norm": 4.7672271728515625, + "learning_rate": 3.827532358250323e-06, + "loss": 0.1244, + "step": 22904 + }, + { + "epoch": 0.579623959308652, + "grad_norm": 7.541160583496094, + "learning_rate": 3.827142034871397e-06, + "loss": 0.1898, + "step": 22905 + }, + { + "epoch": 0.5796492648733457, + "grad_norm": 3.996717691421509, + "learning_rate": 3.8267517190560145e-06, + "loss": 0.1668, + "step": 22906 + }, + { + "epoch": 0.5796745704380393, + "grad_norm": 9.249527931213379, + "learning_rate": 3.826361410806691e-06, + "loss": 0.248, + "step": 22907 + }, + { + "epoch": 0.579699876002733, + "grad_norm": 4.871161937713623, + "learning_rate": 3.825971110125944e-06, + "loss": 0.1338, + "step": 22908 + }, + { + "epoch": 0.5797251815674267, + "grad_norm": 2.401320457458496, + "learning_rate": 3.8255808170162914e-06, + "loss": 0.1345, + "step": 22909 + }, + { + "epoch": 0.5797504871321204, + "grad_norm": 3.8447110652923584, + "learning_rate": 3.8251905314802515e-06, + "loss": 0.1676, + "step": 22910 + }, + { + "epoch": 0.579775792696814, + "grad_norm": 4.955615997314453, + "learning_rate": 3.824800253520337e-06, + "loss": 0.1924, + "step": 22911 + }, + { + "epoch": 0.5798010982615077, + "grad_norm": 8.354183197021484, + "learning_rate": 3.824409983139067e-06, + "loss": 0.1212, + "step": 22912 + }, + { + "epoch": 0.5798264038262014, + "grad_norm": 4.0762176513671875, + "learning_rate": 3.8240197203389586e-06, + "loss": 0.1486, + "step": 22913 + }, + { + "epoch": 0.579851709390895, + "grad_norm": 2.666956901550293, + "learning_rate": 3.823629465122531e-06, + "loss": 0.0925, + "step": 22914 + }, + { + "epoch": 0.5798770149555887, + "grad_norm": 3.505599021911621, + "learning_rate": 3.823239217492295e-06, + "loss": 0.136, + "step": 22915 + }, + { + "epoch": 0.5799023205202825, + "grad_norm": 3.969632625579834, + "learning_rate": 3.822848977450771e-06, + "loss": 0.1917, + "step": 22916 + }, + { + "epoch": 0.5799276260849761, + "grad_norm": 2.948758125305176, + "learning_rate": 3.822458745000474e-06, + "loss": 0.1562, + "step": 22917 + }, + { + "epoch": 0.5799529316496698, + "grad_norm": 3.948995351791382, + "learning_rate": 3.822068520143924e-06, + "loss": 0.1277, + "step": 22918 + }, + { + "epoch": 0.5799782372143635, + "grad_norm": 4.16251802444458, + "learning_rate": 3.8216783028836346e-06, + "loss": 0.1276, + "step": 22919 + }, + { + "epoch": 0.5800035427790571, + "grad_norm": 4.992199420928955, + "learning_rate": 3.821288093222122e-06, + "loss": 0.1895, + "step": 22920 + }, + { + "epoch": 0.5800288483437508, + "grad_norm": 3.150334119796753, + "learning_rate": 3.820897891161903e-06, + "loss": 0.1359, + "step": 22921 + }, + { + "epoch": 0.5800541539084445, + "grad_norm": 2.971863269805908, + "learning_rate": 3.820507696705493e-06, + "loss": 0.1518, + "step": 22922 + }, + { + "epoch": 0.5800794594731381, + "grad_norm": 4.329742908477783, + "learning_rate": 3.820117509855412e-06, + "loss": 0.2584, + "step": 22923 + }, + { + "epoch": 0.5801047650378318, + "grad_norm": 7.032401084899902, + "learning_rate": 3.8197273306141724e-06, + "loss": 0.2037, + "step": 22924 + }, + { + "epoch": 0.5801300706025255, + "grad_norm": 4.413417816162109, + "learning_rate": 3.819337158984292e-06, + "loss": 0.1577, + "step": 22925 + }, + { + "epoch": 0.5801553761672191, + "grad_norm": 4.600811958312988, + "learning_rate": 3.818946994968286e-06, + "loss": 0.206, + "step": 22926 + }, + { + "epoch": 0.5801806817319128, + "grad_norm": 2.9145569801330566, + "learning_rate": 3.8185568385686705e-06, + "loss": 0.0977, + "step": 22927 + }, + { + "epoch": 0.5802059872966066, + "grad_norm": 2.1974523067474365, + "learning_rate": 3.818166689787964e-06, + "loss": 0.1231, + "step": 22928 + }, + { + "epoch": 0.5802312928613002, + "grad_norm": 10.328287124633789, + "learning_rate": 3.81777654862868e-06, + "loss": 0.1907, + "step": 22929 + }, + { + "epoch": 0.5802565984259939, + "grad_norm": 6.947534084320068, + "learning_rate": 3.817386415093334e-06, + "loss": 0.2775, + "step": 22930 + }, + { + "epoch": 0.5802819039906876, + "grad_norm": 10.565882682800293, + "learning_rate": 3.8169962891844436e-06, + "loss": 0.2515, + "step": 22931 + }, + { + "epoch": 0.5803072095553812, + "grad_norm": 5.716169834136963, + "learning_rate": 3.816606170904526e-06, + "loss": 0.1542, + "step": 22932 + }, + { + "epoch": 0.5803325151200749, + "grad_norm": 5.562801361083984, + "learning_rate": 3.816216060256094e-06, + "loss": 0.1317, + "step": 22933 + }, + { + "epoch": 0.5803578206847686, + "grad_norm": 2.638507127761841, + "learning_rate": 3.815825957241663e-06, + "loss": 0.1188, + "step": 22934 + }, + { + "epoch": 0.5803831262494623, + "grad_norm": 3.0409562587738037, + "learning_rate": 3.81543586186375e-06, + "loss": 0.1095, + "step": 22935 + }, + { + "epoch": 0.5804084318141559, + "grad_norm": 6.371481895446777, + "learning_rate": 3.815045774124874e-06, + "loss": 0.1677, + "step": 22936 + }, + { + "epoch": 0.5804337373788496, + "grad_norm": 7.317742347717285, + "learning_rate": 3.8146556940275448e-06, + "loss": 0.1888, + "step": 22937 + }, + { + "epoch": 0.5804590429435433, + "grad_norm": 4.468695163726807, + "learning_rate": 3.81426562157428e-06, + "loss": 0.1334, + "step": 22938 + }, + { + "epoch": 0.5804843485082369, + "grad_norm": 4.9255805015563965, + "learning_rate": 3.813875556767597e-06, + "loss": 0.1684, + "step": 22939 + }, + { + "epoch": 0.5805096540729306, + "grad_norm": 3.0510051250457764, + "learning_rate": 3.8134854996100086e-06, + "loss": 0.1062, + "step": 22940 + }, + { + "epoch": 0.5805349596376244, + "grad_norm": 4.013086795806885, + "learning_rate": 3.813095450104034e-06, + "loss": 0.0959, + "step": 22941 + }, + { + "epoch": 0.580560265202318, + "grad_norm": 5.155231952667236, + "learning_rate": 3.8127054082521842e-06, + "loss": 0.1248, + "step": 22942 + }, + { + "epoch": 0.5805855707670117, + "grad_norm": 7.6439714431762695, + "learning_rate": 3.812315374056976e-06, + "loss": 0.1123, + "step": 22943 + }, + { + "epoch": 0.5806108763317054, + "grad_norm": 4.4868855476379395, + "learning_rate": 3.811925347520925e-06, + "loss": 0.17, + "step": 22944 + }, + { + "epoch": 0.580636181896399, + "grad_norm": 7.681940078735352, + "learning_rate": 3.811535328646549e-06, + "loss": 0.176, + "step": 22945 + }, + { + "epoch": 0.5806614874610927, + "grad_norm": 5.69619083404541, + "learning_rate": 3.811145317436358e-06, + "loss": 0.1802, + "step": 22946 + }, + { + "epoch": 0.5806867930257864, + "grad_norm": 8.673773765563965, + "learning_rate": 3.8107553138928697e-06, + "loss": 0.1383, + "step": 22947 + }, + { + "epoch": 0.58071209859048, + "grad_norm": 4.09375, + "learning_rate": 3.8103653180186006e-06, + "loss": 0.1184, + "step": 22948 + }, + { + "epoch": 0.5807374041551737, + "grad_norm": 8.402647018432617, + "learning_rate": 3.8099753298160634e-06, + "loss": 0.1981, + "step": 22949 + }, + { + "epoch": 0.5807627097198674, + "grad_norm": 5.763619422912598, + "learning_rate": 3.809585349287775e-06, + "loss": 0.1832, + "step": 22950 + }, + { + "epoch": 0.580788015284561, + "grad_norm": 6.209716796875, + "learning_rate": 3.809195376436248e-06, + "loss": 0.1498, + "step": 22951 + }, + { + "epoch": 0.5808133208492547, + "grad_norm": 8.720208168029785, + "learning_rate": 3.8088054112639987e-06, + "loss": 0.1846, + "step": 22952 + }, + { + "epoch": 0.5808386264139485, + "grad_norm": 5.018242359161377, + "learning_rate": 3.8084154537735417e-06, + "loss": 0.1745, + "step": 22953 + }, + { + "epoch": 0.5808639319786421, + "grad_norm": 3.059516429901123, + "learning_rate": 3.8080255039673926e-06, + "loss": 0.104, + "step": 22954 + }, + { + "epoch": 0.5808892375433358, + "grad_norm": 8.38353443145752, + "learning_rate": 3.8076355618480654e-06, + "loss": 0.1655, + "step": 22955 + }, + { + "epoch": 0.5809145431080295, + "grad_norm": 12.0355863571167, + "learning_rate": 3.8072456274180736e-06, + "loss": 0.3231, + "step": 22956 + }, + { + "epoch": 0.5809398486727231, + "grad_norm": 2.830578565597534, + "learning_rate": 3.806855700679932e-06, + "loss": 0.0871, + "step": 22957 + }, + { + "epoch": 0.5809651542374168, + "grad_norm": 5.127814769744873, + "learning_rate": 3.8064657816361596e-06, + "loss": 0.165, + "step": 22958 + }, + { + "epoch": 0.5809904598021105, + "grad_norm": 6.147193431854248, + "learning_rate": 3.806075870289264e-06, + "loss": 0.159, + "step": 22959 + }, + { + "epoch": 0.5810157653668042, + "grad_norm": 3.619490146636963, + "learning_rate": 3.8056859666417638e-06, + "loss": 0.138, + "step": 22960 + }, + { + "epoch": 0.5810410709314978, + "grad_norm": 4.527297496795654, + "learning_rate": 3.8052960706961723e-06, + "loss": 0.2138, + "step": 22961 + }, + { + "epoch": 0.5810663764961915, + "grad_norm": 7.910746097564697, + "learning_rate": 3.8049061824550045e-06, + "loss": 0.1926, + "step": 22962 + }, + { + "epoch": 0.5810916820608852, + "grad_norm": 6.526463508605957, + "learning_rate": 3.8045163019207764e-06, + "loss": 0.1919, + "step": 22963 + }, + { + "epoch": 0.5811169876255788, + "grad_norm": 6.445330619812012, + "learning_rate": 3.804126429095998e-06, + "loss": 0.1628, + "step": 22964 + }, + { + "epoch": 0.5811422931902726, + "grad_norm": 5.405595302581787, + "learning_rate": 3.8037365639831857e-06, + "loss": 0.1576, + "step": 22965 + }, + { + "epoch": 0.5811675987549663, + "grad_norm": 9.545068740844727, + "learning_rate": 3.803346706584854e-06, + "loss": 0.1213, + "step": 22966 + }, + { + "epoch": 0.5811929043196599, + "grad_norm": 4.212750434875488, + "learning_rate": 3.802956856903519e-06, + "loss": 0.1107, + "step": 22967 + }, + { + "epoch": 0.5812182098843536, + "grad_norm": 7.184714317321777, + "learning_rate": 3.8025670149416904e-06, + "loss": 0.1737, + "step": 22968 + }, + { + "epoch": 0.5812435154490473, + "grad_norm": 4.046376705169678, + "learning_rate": 3.802177180701884e-06, + "loss": 0.1574, + "step": 22969 + }, + { + "epoch": 0.5812688210137409, + "grad_norm": 3.6178090572357178, + "learning_rate": 3.8017873541866147e-06, + "loss": 0.1575, + "step": 22970 + }, + { + "epoch": 0.5812941265784346, + "grad_norm": 7.980459690093994, + "learning_rate": 3.801397535398396e-06, + "loss": 0.2568, + "step": 22971 + }, + { + "epoch": 0.5813194321431283, + "grad_norm": 4.077206611633301, + "learning_rate": 3.801007724339743e-06, + "loss": 0.143, + "step": 22972 + }, + { + "epoch": 0.5813447377078219, + "grad_norm": 5.981174945831299, + "learning_rate": 3.8006179210131665e-06, + "loss": 0.2171, + "step": 22973 + }, + { + "epoch": 0.5813700432725156, + "grad_norm": 4.759200572967529, + "learning_rate": 3.800228125421182e-06, + "loss": 0.1176, + "step": 22974 + }, + { + "epoch": 0.5813953488372093, + "grad_norm": 7.280109405517578, + "learning_rate": 3.799838337566304e-06, + "loss": 0.2202, + "step": 22975 + }, + { + "epoch": 0.5814206544019029, + "grad_norm": 4.957618236541748, + "learning_rate": 3.7994485574510453e-06, + "loss": 0.142, + "step": 22976 + }, + { + "epoch": 0.5814459599665966, + "grad_norm": 6.378339767456055, + "learning_rate": 3.7990587850779186e-06, + "loss": 0.2029, + "step": 22977 + }, + { + "epoch": 0.5814712655312904, + "grad_norm": 5.3147478103637695, + "learning_rate": 3.7986690204494387e-06, + "loss": 0.1824, + "step": 22978 + }, + { + "epoch": 0.581496571095984, + "grad_norm": 5.451560020446777, + "learning_rate": 3.79827926356812e-06, + "loss": 0.1329, + "step": 22979 + }, + { + "epoch": 0.5815218766606777, + "grad_norm": 5.115210056304932, + "learning_rate": 3.7978895144364727e-06, + "loss": 0.1393, + "step": 22980 + }, + { + "epoch": 0.5815471822253714, + "grad_norm": 3.9661197662353516, + "learning_rate": 3.797499773057014e-06, + "loss": 0.1134, + "step": 22981 + }, + { + "epoch": 0.581572487790065, + "grad_norm": 3.671844959259033, + "learning_rate": 3.797110039432256e-06, + "loss": 0.1508, + "step": 22982 + }, + { + "epoch": 0.5815977933547587, + "grad_norm": 13.03525161743164, + "learning_rate": 3.7967203135647096e-06, + "loss": 0.1893, + "step": 22983 + }, + { + "epoch": 0.5816230989194524, + "grad_norm": 7.61523962020874, + "learning_rate": 3.796330595456891e-06, + "loss": 0.25, + "step": 22984 + }, + { + "epoch": 0.581648404484146, + "grad_norm": 5.123852729797363, + "learning_rate": 3.795940885111314e-06, + "loss": 0.1287, + "step": 22985 + }, + { + "epoch": 0.5816737100488397, + "grad_norm": 7.494386672973633, + "learning_rate": 3.795551182530488e-06, + "loss": 0.2531, + "step": 22986 + }, + { + "epoch": 0.5816990156135334, + "grad_norm": 6.251695156097412, + "learning_rate": 3.7951614877169285e-06, + "loss": 0.1633, + "step": 22987 + }, + { + "epoch": 0.5817243211782271, + "grad_norm": 2.7485506534576416, + "learning_rate": 3.7947718006731488e-06, + "loss": 0.1678, + "step": 22988 + }, + { + "epoch": 0.5817496267429207, + "grad_norm": 2.5529632568359375, + "learning_rate": 3.7943821214016633e-06, + "loss": 0.056, + "step": 22989 + }, + { + "epoch": 0.5817749323076145, + "grad_norm": 3.6124050617218018, + "learning_rate": 3.793992449904982e-06, + "loss": 0.1138, + "step": 22990 + }, + { + "epoch": 0.5818002378723082, + "grad_norm": 8.603071212768555, + "learning_rate": 3.7936027861856185e-06, + "loss": 0.1843, + "step": 22991 + }, + { + "epoch": 0.5818255434370018, + "grad_norm": 8.111581802368164, + "learning_rate": 3.7932131302460855e-06, + "loss": 0.1299, + "step": 22992 + }, + { + "epoch": 0.5818508490016955, + "grad_norm": 4.232619285583496, + "learning_rate": 3.7928234820888976e-06, + "loss": 0.0968, + "step": 22993 + }, + { + "epoch": 0.5818761545663892, + "grad_norm": 9.797187805175781, + "learning_rate": 3.792433841716568e-06, + "loss": 0.0948, + "step": 22994 + }, + { + "epoch": 0.5819014601310828, + "grad_norm": 7.295114994049072, + "learning_rate": 3.7920442091316057e-06, + "loss": 0.1911, + "step": 22995 + }, + { + "epoch": 0.5819267656957765, + "grad_norm": 4.282329559326172, + "learning_rate": 3.791654584336526e-06, + "loss": 0.1075, + "step": 22996 + }, + { + "epoch": 0.5819520712604702, + "grad_norm": 5.234924793243408, + "learning_rate": 3.7912649673338408e-06, + "loss": 0.211, + "step": 22997 + }, + { + "epoch": 0.5819773768251638, + "grad_norm": 4.728222370147705, + "learning_rate": 3.790875358126066e-06, + "loss": 0.2164, + "step": 22998 + }, + { + "epoch": 0.5820026823898575, + "grad_norm": 3.5417356491088867, + "learning_rate": 3.7904857567157085e-06, + "loss": 0.127, + "step": 22999 + }, + { + "epoch": 0.5820279879545512, + "grad_norm": 5.440755844116211, + "learning_rate": 3.7900961631052825e-06, + "loss": 0.1116, + "step": 23000 + }, + { + "epoch": 0.5820532935192448, + "grad_norm": 3.273668050765991, + "learning_rate": 3.7897065772973017e-06, + "loss": 0.1285, + "step": 23001 + }, + { + "epoch": 0.5820785990839386, + "grad_norm": 5.210967063903809, + "learning_rate": 3.7893169992942785e-06, + "loss": 0.1121, + "step": 23002 + }, + { + "epoch": 0.5821039046486323, + "grad_norm": 3.8149869441986084, + "learning_rate": 3.7889274290987273e-06, + "loss": 0.1733, + "step": 23003 + }, + { + "epoch": 0.5821292102133259, + "grad_norm": 7.160821437835693, + "learning_rate": 3.7885378667131545e-06, + "loss": 0.1569, + "step": 23004 + }, + { + "epoch": 0.5821545157780196, + "grad_norm": 11.173772811889648, + "learning_rate": 3.7881483121400765e-06, + "loss": 0.1335, + "step": 23005 + }, + { + "epoch": 0.5821798213427133, + "grad_norm": 3.581425666809082, + "learning_rate": 3.7877587653820053e-06, + "loss": 0.0741, + "step": 23006 + }, + { + "epoch": 0.5822051269074069, + "grad_norm": 2.9065332412719727, + "learning_rate": 3.7873692264414523e-06, + "loss": 0.1418, + "step": 23007 + }, + { + "epoch": 0.5822304324721006, + "grad_norm": 4.946542263031006, + "learning_rate": 3.7869796953209294e-06, + "loss": 0.1893, + "step": 23008 + }, + { + "epoch": 0.5822557380367943, + "grad_norm": 3.2804036140441895, + "learning_rate": 3.7865901720229485e-06, + "loss": 0.1291, + "step": 23009 + }, + { + "epoch": 0.5822810436014879, + "grad_norm": 4.337144374847412, + "learning_rate": 3.786200656550022e-06, + "loss": 0.1076, + "step": 23010 + }, + { + "epoch": 0.5823063491661816, + "grad_norm": 6.803958892822266, + "learning_rate": 3.7858111489046617e-06, + "loss": 0.2202, + "step": 23011 + }, + { + "epoch": 0.5823316547308753, + "grad_norm": 7.207981109619141, + "learning_rate": 3.785421649089379e-06, + "loss": 0.1609, + "step": 23012 + }, + { + "epoch": 0.582356960295569, + "grad_norm": 5.128511905670166, + "learning_rate": 3.7850321571066874e-06, + "loss": 0.1818, + "step": 23013 + }, + { + "epoch": 0.5823822658602626, + "grad_norm": 6.063985347747803, + "learning_rate": 3.7846426729590958e-06, + "loss": 0.1632, + "step": 23014 + }, + { + "epoch": 0.5824075714249564, + "grad_norm": 6.5115437507629395, + "learning_rate": 3.784253196649118e-06, + "loss": 0.2004, + "step": 23015 + }, + { + "epoch": 0.5824328769896501, + "grad_norm": 6.832531929016113, + "learning_rate": 3.783863728179267e-06, + "loss": 0.2063, + "step": 23016 + }, + { + "epoch": 0.5824581825543437, + "grad_norm": 4.3013176918029785, + "learning_rate": 3.783474267552051e-06, + "loss": 0.1317, + "step": 23017 + }, + { + "epoch": 0.5824834881190374, + "grad_norm": 6.361865043640137, + "learning_rate": 3.783084814769982e-06, + "loss": 0.2336, + "step": 23018 + }, + { + "epoch": 0.5825087936837311, + "grad_norm": 5.416430473327637, + "learning_rate": 3.782695369835574e-06, + "loss": 0.143, + "step": 23019 + }, + { + "epoch": 0.5825340992484247, + "grad_norm": 15.3014554977417, + "learning_rate": 3.7823059327513382e-06, + "loss": 0.1633, + "step": 23020 + }, + { + "epoch": 0.5825594048131184, + "grad_norm": 2.9323885440826416, + "learning_rate": 3.7819165035197835e-06, + "loss": 0.1794, + "step": 23021 + }, + { + "epoch": 0.5825847103778121, + "grad_norm": 5.6201887130737305, + "learning_rate": 3.7815270821434225e-06, + "loss": 0.1579, + "step": 23022 + }, + { + "epoch": 0.5826100159425057, + "grad_norm": 6.724461078643799, + "learning_rate": 3.7811376686247664e-06, + "loss": 0.1981, + "step": 23023 + }, + { + "epoch": 0.5826353215071994, + "grad_norm": 5.7254862785339355, + "learning_rate": 3.7807482629663274e-06, + "loss": 0.1677, + "step": 23024 + }, + { + "epoch": 0.5826606270718931, + "grad_norm": 6.038467884063721, + "learning_rate": 3.7803588651706174e-06, + "loss": 0.1577, + "step": 23025 + }, + { + "epoch": 0.5826859326365867, + "grad_norm": 4.698818206787109, + "learning_rate": 3.7799694752401446e-06, + "loss": 0.1332, + "step": 23026 + }, + { + "epoch": 0.5827112382012805, + "grad_norm": 5.606935977935791, + "learning_rate": 3.779580093177421e-06, + "loss": 0.1744, + "step": 23027 + }, + { + "epoch": 0.5827365437659742, + "grad_norm": 8.100006103515625, + "learning_rate": 3.7791907189849593e-06, + "loss": 0.1706, + "step": 23028 + }, + { + "epoch": 0.5827618493306678, + "grad_norm": 6.629839897155762, + "learning_rate": 3.778801352665271e-06, + "loss": 0.1539, + "step": 23029 + }, + { + "epoch": 0.5827871548953615, + "grad_norm": 4.319783687591553, + "learning_rate": 3.778411994220864e-06, + "loss": 0.1975, + "step": 23030 + }, + { + "epoch": 0.5828124604600552, + "grad_norm": 7.258892059326172, + "learning_rate": 3.778022643654251e-06, + "loss": 0.2107, + "step": 23031 + }, + { + "epoch": 0.5828377660247488, + "grad_norm": 11.613075256347656, + "learning_rate": 3.7776333009679424e-06, + "loss": 0.124, + "step": 23032 + }, + { + "epoch": 0.5828630715894425, + "grad_norm": 4.593194961547852, + "learning_rate": 3.77724396616445e-06, + "loss": 0.1694, + "step": 23033 + }, + { + "epoch": 0.5828883771541362, + "grad_norm": 6.3197760581970215, + "learning_rate": 3.7768546392462842e-06, + "loss": 0.2229, + "step": 23034 + }, + { + "epoch": 0.5829136827188298, + "grad_norm": 4.6143012046813965, + "learning_rate": 3.776465320215954e-06, + "loss": 0.1671, + "step": 23035 + }, + { + "epoch": 0.5829389882835235, + "grad_norm": 8.327286720275879, + "learning_rate": 3.7760760090759723e-06, + "loss": 0.173, + "step": 23036 + }, + { + "epoch": 0.5829642938482172, + "grad_norm": 3.751420259475708, + "learning_rate": 3.7756867058288476e-06, + "loss": 0.0983, + "step": 23037 + }, + { + "epoch": 0.582989599412911, + "grad_norm": 2.9757230281829834, + "learning_rate": 3.7752974104770932e-06, + "loss": 0.1319, + "step": 23038 + }, + { + "epoch": 0.5830149049776046, + "grad_norm": 15.651510238647461, + "learning_rate": 3.7749081230232163e-06, + "loss": 0.3577, + "step": 23039 + }, + { + "epoch": 0.5830402105422983, + "grad_norm": 8.279133796691895, + "learning_rate": 3.774518843469731e-06, + "loss": 0.1631, + "step": 23040 + }, + { + "epoch": 0.583065516106992, + "grad_norm": 7.6013712882995605, + "learning_rate": 3.7741295718191438e-06, + "loss": 0.2445, + "step": 23041 + }, + { + "epoch": 0.5830908216716856, + "grad_norm": 6.009036064147949, + "learning_rate": 3.773740308073969e-06, + "loss": 0.1762, + "step": 23042 + }, + { + "epoch": 0.5831161272363793, + "grad_norm": 2.668569326400757, + "learning_rate": 3.7733510522367136e-06, + "loss": 0.0951, + "step": 23043 + }, + { + "epoch": 0.583141432801073, + "grad_norm": 3.3936705589294434, + "learning_rate": 3.7729618043098888e-06, + "loss": 0.1255, + "step": 23044 + }, + { + "epoch": 0.5831667383657666, + "grad_norm": 7.059070587158203, + "learning_rate": 3.7725725642960047e-06, + "loss": 0.1824, + "step": 23045 + }, + { + "epoch": 0.5831920439304603, + "grad_norm": 8.172459602355957, + "learning_rate": 3.772183332197572e-06, + "loss": 0.2327, + "step": 23046 + }, + { + "epoch": 0.583217349495154, + "grad_norm": 3.6850390434265137, + "learning_rate": 3.7717941080171027e-06, + "loss": 0.1821, + "step": 23047 + }, + { + "epoch": 0.5832426550598476, + "grad_norm": 6.62752103805542, + "learning_rate": 3.7714048917571027e-06, + "loss": 0.1989, + "step": 23048 + }, + { + "epoch": 0.5832679606245413, + "grad_norm": 4.074297904968262, + "learning_rate": 3.771015683420084e-06, + "loss": 0.217, + "step": 23049 + }, + { + "epoch": 0.583293266189235, + "grad_norm": 3.742739677429199, + "learning_rate": 3.7706264830085563e-06, + "loss": 0.2198, + "step": 23050 + }, + { + "epoch": 0.5833185717539286, + "grad_norm": 4.309808254241943, + "learning_rate": 3.7702372905250324e-06, + "loss": 0.1563, + "step": 23051 + }, + { + "epoch": 0.5833438773186224, + "grad_norm": 6.3610734939575195, + "learning_rate": 3.769848105972017e-06, + "loss": 0.1559, + "step": 23052 + }, + { + "epoch": 0.5833691828833161, + "grad_norm": 3.3659887313842773, + "learning_rate": 3.7694589293520223e-06, + "loss": 0.0923, + "step": 23053 + }, + { + "epoch": 0.5833944884480097, + "grad_norm": 2.675748348236084, + "learning_rate": 3.769069760667558e-06, + "loss": 0.139, + "step": 23054 + }, + { + "epoch": 0.5834197940127034, + "grad_norm": 3.2808189392089844, + "learning_rate": 3.768680599921134e-06, + "loss": 0.1108, + "step": 23055 + }, + { + "epoch": 0.5834450995773971, + "grad_norm": 3.985093355178833, + "learning_rate": 3.768291447115262e-06, + "loss": 0.1872, + "step": 23056 + }, + { + "epoch": 0.5834704051420907, + "grad_norm": 5.1267828941345215, + "learning_rate": 3.7679023022524468e-06, + "loss": 0.1769, + "step": 23057 + }, + { + "epoch": 0.5834957107067844, + "grad_norm": 4.711380481719971, + "learning_rate": 3.7675131653352003e-06, + "loss": 0.1119, + "step": 23058 + }, + { + "epoch": 0.5835210162714781, + "grad_norm": 10.664019584655762, + "learning_rate": 3.7671240363660333e-06, + "loss": 0.1519, + "step": 23059 + }, + { + "epoch": 0.5835463218361717, + "grad_norm": 8.323898315429688, + "learning_rate": 3.7667349153474555e-06, + "loss": 0.1098, + "step": 23060 + }, + { + "epoch": 0.5835716274008654, + "grad_norm": 3.700547218322754, + "learning_rate": 3.766345802281972e-06, + "loss": 0.1703, + "step": 23061 + }, + { + "epoch": 0.5835969329655591, + "grad_norm": 5.729534149169922, + "learning_rate": 3.7659566971720955e-06, + "loss": 0.1968, + "step": 23062 + }, + { + "epoch": 0.5836222385302529, + "grad_norm": 7.227751731872559, + "learning_rate": 3.765567600020335e-06, + "loss": 0.1643, + "step": 23063 + }, + { + "epoch": 0.5836475440949465, + "grad_norm": 8.809979438781738, + "learning_rate": 3.7651785108292018e-06, + "loss": 0.2079, + "step": 23064 + }, + { + "epoch": 0.5836728496596402, + "grad_norm": 4.643707752227783, + "learning_rate": 3.7647894296012e-06, + "loss": 0.152, + "step": 23065 + }, + { + "epoch": 0.5836981552243339, + "grad_norm": 7.657382011413574, + "learning_rate": 3.764400356338841e-06, + "loss": 0.1696, + "step": 23066 + }, + { + "epoch": 0.5837234607890275, + "grad_norm": 3.852081775665283, + "learning_rate": 3.764011291044636e-06, + "loss": 0.1486, + "step": 23067 + }, + { + "epoch": 0.5837487663537212, + "grad_norm": 9.08469295501709, + "learning_rate": 3.7636222337210914e-06, + "loss": 0.2096, + "step": 23068 + }, + { + "epoch": 0.5837740719184149, + "grad_norm": 6.353111743927002, + "learning_rate": 3.7632331843707174e-06, + "loss": 0.1244, + "step": 23069 + }, + { + "epoch": 0.5837993774831085, + "grad_norm": 7.4280219078063965, + "learning_rate": 3.7628441429960227e-06, + "loss": 0.2483, + "step": 23070 + }, + { + "epoch": 0.5838246830478022, + "grad_norm": 6.401188373565674, + "learning_rate": 3.7624551095995156e-06, + "loss": 0.1618, + "step": 23071 + }, + { + "epoch": 0.5838499886124959, + "grad_norm": 5.93143367767334, + "learning_rate": 3.7620660841837047e-06, + "loss": 0.1957, + "step": 23072 + }, + { + "epoch": 0.5838752941771895, + "grad_norm": 6.587301254272461, + "learning_rate": 3.7616770667511005e-06, + "loss": 0.1611, + "step": 23073 + }, + { + "epoch": 0.5839005997418832, + "grad_norm": 6.743223190307617, + "learning_rate": 3.7612880573042103e-06, + "loss": 0.1834, + "step": 23074 + }, + { + "epoch": 0.583925905306577, + "grad_norm": 3.486912965774536, + "learning_rate": 3.7608990558455425e-06, + "loss": 0.1763, + "step": 23075 + }, + { + "epoch": 0.5839512108712706, + "grad_norm": 9.306466102600098, + "learning_rate": 3.7605100623776057e-06, + "loss": 0.1991, + "step": 23076 + }, + { + "epoch": 0.5839765164359643, + "grad_norm": 5.540356636047363, + "learning_rate": 3.7601210769029096e-06, + "loss": 0.2116, + "step": 23077 + }, + { + "epoch": 0.584001822000658, + "grad_norm": 8.172102928161621, + "learning_rate": 3.759732099423963e-06, + "loss": 0.2103, + "step": 23078 + }, + { + "epoch": 0.5840271275653516, + "grad_norm": 5.364092826843262, + "learning_rate": 3.7593431299432726e-06, + "loss": 0.1708, + "step": 23079 + }, + { + "epoch": 0.5840524331300453, + "grad_norm": 5.112408638000488, + "learning_rate": 3.7589541684633466e-06, + "loss": 0.2142, + "step": 23080 + }, + { + "epoch": 0.584077738694739, + "grad_norm": 4.601918697357178, + "learning_rate": 3.758565214986695e-06, + "loss": 0.147, + "step": 23081 + }, + { + "epoch": 0.5841030442594326, + "grad_norm": 4.619476318359375, + "learning_rate": 3.758176269515827e-06, + "loss": 0.1489, + "step": 23082 + }, + { + "epoch": 0.5841283498241263, + "grad_norm": 9.712322235107422, + "learning_rate": 3.757787332053247e-06, + "loss": 0.2189, + "step": 23083 + }, + { + "epoch": 0.58415365538882, + "grad_norm": 7.166497230529785, + "learning_rate": 3.757398402601466e-06, + "loss": 0.2215, + "step": 23084 + }, + { + "epoch": 0.5841789609535136, + "grad_norm": 3.20943284034729, + "learning_rate": 3.757009481162991e-06, + "loss": 0.1, + "step": 23085 + }, + { + "epoch": 0.5842042665182073, + "grad_norm": 7.254964351654053, + "learning_rate": 3.7566205677403313e-06, + "loss": 0.1419, + "step": 23086 + }, + { + "epoch": 0.584229572082901, + "grad_norm": 3.647660493850708, + "learning_rate": 3.7562316623359964e-06, + "loss": 0.1643, + "step": 23087 + }, + { + "epoch": 0.5842548776475948, + "grad_norm": 3.1032803058624268, + "learning_rate": 3.755842764952489e-06, + "loss": 0.1726, + "step": 23088 + }, + { + "epoch": 0.5842801832122884, + "grad_norm": 3.332157850265503, + "learning_rate": 3.7554538755923208e-06, + "loss": 0.1291, + "step": 23089 + }, + { + "epoch": 0.5843054887769821, + "grad_norm": 5.12848424911499, + "learning_rate": 3.755064994257999e-06, + "loss": 0.2724, + "step": 23090 + }, + { + "epoch": 0.5843307943416758, + "grad_norm": 9.532504081726074, + "learning_rate": 3.7546761209520337e-06, + "loss": 0.1967, + "step": 23091 + }, + { + "epoch": 0.5843560999063694, + "grad_norm": 3.548475980758667, + "learning_rate": 3.7542872556769282e-06, + "loss": 0.1576, + "step": 23092 + }, + { + "epoch": 0.5843814054710631, + "grad_norm": 7.583741188049316, + "learning_rate": 3.753898398435193e-06, + "loss": 0.2626, + "step": 23093 + }, + { + "epoch": 0.5844067110357568, + "grad_norm": 18.883230209350586, + "learning_rate": 3.7535095492293362e-06, + "loss": 0.2179, + "step": 23094 + }, + { + "epoch": 0.5844320166004504, + "grad_norm": 3.2143619060516357, + "learning_rate": 3.7531207080618637e-06, + "loss": 0.1377, + "step": 23095 + }, + { + "epoch": 0.5844573221651441, + "grad_norm": 5.731229305267334, + "learning_rate": 3.752731874935283e-06, + "loss": 0.1202, + "step": 23096 + }, + { + "epoch": 0.5844826277298378, + "grad_norm": 4.127665042877197, + "learning_rate": 3.752343049852104e-06, + "loss": 0.1574, + "step": 23097 + }, + { + "epoch": 0.5845079332945314, + "grad_norm": 3.7726480960845947, + "learning_rate": 3.751954232814831e-06, + "loss": 0.1395, + "step": 23098 + }, + { + "epoch": 0.5845332388592251, + "grad_norm": 4.71566915512085, + "learning_rate": 3.7515654238259734e-06, + "loss": 0.129, + "step": 23099 + }, + { + "epoch": 0.5845585444239189, + "grad_norm": 4.427397727966309, + "learning_rate": 3.7511766228880385e-06, + "loss": 0.2049, + "step": 23100 + }, + { + "epoch": 0.5845838499886125, + "grad_norm": 5.87749719619751, + "learning_rate": 3.7507878300035345e-06, + "loss": 0.1701, + "step": 23101 + }, + { + "epoch": 0.5846091555533062, + "grad_norm": 6.702324390411377, + "learning_rate": 3.7503990451749656e-06, + "loss": 0.1526, + "step": 23102 + }, + { + "epoch": 0.5846344611179999, + "grad_norm": 5.671322345733643, + "learning_rate": 3.7500102684048407e-06, + "loss": 0.2116, + "step": 23103 + }, + { + "epoch": 0.5846597666826935, + "grad_norm": 5.1490631103515625, + "learning_rate": 3.7496214996956692e-06, + "loss": 0.151, + "step": 23104 + }, + { + "epoch": 0.5846850722473872, + "grad_norm": 9.615224838256836, + "learning_rate": 3.749232739049954e-06, + "loss": 0.1652, + "step": 23105 + }, + { + "epoch": 0.5847103778120809, + "grad_norm": 7.313110828399658, + "learning_rate": 3.748843986470204e-06, + "loss": 0.2079, + "step": 23106 + }, + { + "epoch": 0.5847356833767745, + "grad_norm": 5.471432209014893, + "learning_rate": 3.748455241958927e-06, + "loss": 0.1977, + "step": 23107 + }, + { + "epoch": 0.5847609889414682, + "grad_norm": 4.070727825164795, + "learning_rate": 3.7480665055186295e-06, + "loss": 0.1758, + "step": 23108 + }, + { + "epoch": 0.5847862945061619, + "grad_norm": 5.311241149902344, + "learning_rate": 3.747677777151819e-06, + "loss": 0.1744, + "step": 23109 + }, + { + "epoch": 0.5848116000708555, + "grad_norm": 6.421961784362793, + "learning_rate": 3.747289056861001e-06, + "loss": 0.2255, + "step": 23110 + }, + { + "epoch": 0.5848369056355492, + "grad_norm": 11.649760246276855, + "learning_rate": 3.7469003446486816e-06, + "loss": 0.1943, + "step": 23111 + }, + { + "epoch": 0.584862211200243, + "grad_norm": 15.62912368774414, + "learning_rate": 3.7465116405173698e-06, + "loss": 0.1609, + "step": 23112 + }, + { + "epoch": 0.5848875167649366, + "grad_norm": 5.340634822845459, + "learning_rate": 3.7461229444695724e-06, + "loss": 0.2217, + "step": 23113 + }, + { + "epoch": 0.5849128223296303, + "grad_norm": 4.137301445007324, + "learning_rate": 3.7457342565077934e-06, + "loss": 0.1499, + "step": 23114 + }, + { + "epoch": 0.584938127894324, + "grad_norm": 3.848376989364624, + "learning_rate": 3.745345576634541e-06, + "loss": 0.1734, + "step": 23115 + }, + { + "epoch": 0.5849634334590177, + "grad_norm": 3.88132905960083, + "learning_rate": 3.7449569048523215e-06, + "loss": 0.1837, + "step": 23116 + }, + { + "epoch": 0.5849887390237113, + "grad_norm": 5.6751389503479, + "learning_rate": 3.744568241163643e-06, + "loss": 0.2147, + "step": 23117 + }, + { + "epoch": 0.585014044588405, + "grad_norm": 5.399194717407227, + "learning_rate": 3.744179585571009e-06, + "loss": 0.179, + "step": 23118 + }, + { + "epoch": 0.5850393501530987, + "grad_norm": 4.25284481048584, + "learning_rate": 3.7437909380769267e-06, + "loss": 0.1617, + "step": 23119 + }, + { + "epoch": 0.5850646557177923, + "grad_norm": 13.635709762573242, + "learning_rate": 3.7434022986839034e-06, + "loss": 0.1886, + "step": 23120 + }, + { + "epoch": 0.585089961282486, + "grad_norm": 5.313379764556885, + "learning_rate": 3.743013667394446e-06, + "loss": 0.135, + "step": 23121 + }, + { + "epoch": 0.5851152668471797, + "grad_norm": 4.522863388061523, + "learning_rate": 3.742625044211059e-06, + "loss": 0.1863, + "step": 23122 + }, + { + "epoch": 0.5851405724118733, + "grad_norm": 3.6954848766326904, + "learning_rate": 3.742236429136249e-06, + "loss": 0.1115, + "step": 23123 + }, + { + "epoch": 0.585165877976567, + "grad_norm": 10.162169456481934, + "learning_rate": 3.741847822172522e-06, + "loss": 0.2139, + "step": 23124 + }, + { + "epoch": 0.5851911835412608, + "grad_norm": 3.393620491027832, + "learning_rate": 3.741459223322385e-06, + "loss": 0.1553, + "step": 23125 + }, + { + "epoch": 0.5852164891059544, + "grad_norm": 4.622476100921631, + "learning_rate": 3.741070632588344e-06, + "loss": 0.1511, + "step": 23126 + }, + { + "epoch": 0.5852417946706481, + "grad_norm": 5.547767639160156, + "learning_rate": 3.7406820499729025e-06, + "loss": 0.2233, + "step": 23127 + }, + { + "epoch": 0.5852671002353418, + "grad_norm": 8.527936935424805, + "learning_rate": 3.7402934754785704e-06, + "loss": 0.212, + "step": 23128 + }, + { + "epoch": 0.5852924058000354, + "grad_norm": 2.605137825012207, + "learning_rate": 3.739904909107849e-06, + "loss": 0.1461, + "step": 23129 + }, + { + "epoch": 0.5853177113647291, + "grad_norm": 8.99043083190918, + "learning_rate": 3.739516350863247e-06, + "loss": 0.1561, + "step": 23130 + }, + { + "epoch": 0.5853430169294228, + "grad_norm": 3.0347206592559814, + "learning_rate": 3.739127800747272e-06, + "loss": 0.1705, + "step": 23131 + }, + { + "epoch": 0.5853683224941164, + "grad_norm": 11.89284896850586, + "learning_rate": 3.738739258762425e-06, + "loss": 0.1659, + "step": 23132 + }, + { + "epoch": 0.5853936280588101, + "grad_norm": 8.146085739135742, + "learning_rate": 3.738350724911214e-06, + "loss": 0.1937, + "step": 23133 + }, + { + "epoch": 0.5854189336235038, + "grad_norm": 4.12214994430542, + "learning_rate": 3.737962199196144e-06, + "loss": 0.1346, + "step": 23134 + }, + { + "epoch": 0.5854442391881974, + "grad_norm": 3.2916197776794434, + "learning_rate": 3.7375736816197233e-06, + "loss": 0.1331, + "step": 23135 + }, + { + "epoch": 0.5854695447528911, + "grad_norm": 10.031089782714844, + "learning_rate": 3.7371851721844533e-06, + "loss": 0.2406, + "step": 23136 + }, + { + "epoch": 0.5854948503175849, + "grad_norm": 16.69287109375, + "learning_rate": 3.736796670892841e-06, + "loss": 0.2412, + "step": 23137 + }, + { + "epoch": 0.5855201558822785, + "grad_norm": 3.084019422531128, + "learning_rate": 3.736408177747392e-06, + "loss": 0.093, + "step": 23138 + }, + { + "epoch": 0.5855454614469722, + "grad_norm": 4.252527713775635, + "learning_rate": 3.7360196927506137e-06, + "loss": 0.1317, + "step": 23139 + }, + { + "epoch": 0.5855707670116659, + "grad_norm": 5.868653774261475, + "learning_rate": 3.7356312159050074e-06, + "loss": 0.1636, + "step": 23140 + }, + { + "epoch": 0.5855960725763596, + "grad_norm": 4.430654048919678, + "learning_rate": 3.7352427472130804e-06, + "loss": 0.1527, + "step": 23141 + }, + { + "epoch": 0.5856213781410532, + "grad_norm": 5.2130560874938965, + "learning_rate": 3.7348542866773367e-06, + "loss": 0.1844, + "step": 23142 + }, + { + "epoch": 0.5856466837057469, + "grad_norm": 6.1373090744018555, + "learning_rate": 3.7344658343002833e-06, + "loss": 0.1851, + "step": 23143 + }, + { + "epoch": 0.5856719892704406, + "grad_norm": 3.2216103076934814, + "learning_rate": 3.734077390084426e-06, + "loss": 0.1168, + "step": 23144 + }, + { + "epoch": 0.5856972948351342, + "grad_norm": 11.529311180114746, + "learning_rate": 3.7336889540322663e-06, + "loss": 0.2703, + "step": 23145 + }, + { + "epoch": 0.5857226003998279, + "grad_norm": 10.909513473510742, + "learning_rate": 3.733300526146311e-06, + "loss": 0.3229, + "step": 23146 + }, + { + "epoch": 0.5857479059645216, + "grad_norm": 4.498867511749268, + "learning_rate": 3.732912106429064e-06, + "loss": 0.1812, + "step": 23147 + }, + { + "epoch": 0.5857732115292152, + "grad_norm": 3.4508683681488037, + "learning_rate": 3.7325236948830344e-06, + "loss": 0.1704, + "step": 23148 + }, + { + "epoch": 0.585798517093909, + "grad_norm": 3.5236260890960693, + "learning_rate": 3.732135291510721e-06, + "loss": 0.1182, + "step": 23149 + }, + { + "epoch": 0.5858238226586027, + "grad_norm": 3.1675710678100586, + "learning_rate": 3.731746896314632e-06, + "loss": 0.1562, + "step": 23150 + }, + { + "epoch": 0.5858491282232963, + "grad_norm": 9.14309024810791, + "learning_rate": 3.7313585092972706e-06, + "loss": 0.2607, + "step": 23151 + }, + { + "epoch": 0.58587443378799, + "grad_norm": 4.900345802307129, + "learning_rate": 3.7309701304611436e-06, + "loss": 0.1483, + "step": 23152 + }, + { + "epoch": 0.5858997393526837, + "grad_norm": 5.4186625480651855, + "learning_rate": 3.730581759808754e-06, + "loss": 0.2186, + "step": 23153 + }, + { + "epoch": 0.5859250449173773, + "grad_norm": 4.3868255615234375, + "learning_rate": 3.7301933973426053e-06, + "loss": 0.1783, + "step": 23154 + }, + { + "epoch": 0.585950350482071, + "grad_norm": 5.282652378082275, + "learning_rate": 3.729805043065204e-06, + "loss": 0.1833, + "step": 23155 + }, + { + "epoch": 0.5859756560467647, + "grad_norm": 2.749786376953125, + "learning_rate": 3.7294166969790534e-06, + "loss": 0.1254, + "step": 23156 + }, + { + "epoch": 0.5860009616114583, + "grad_norm": 6.328728199005127, + "learning_rate": 3.7290283590866594e-06, + "loss": 0.2153, + "step": 23157 + }, + { + "epoch": 0.586026267176152, + "grad_norm": 8.750364303588867, + "learning_rate": 3.7286400293905246e-06, + "loss": 0.2643, + "step": 23158 + }, + { + "epoch": 0.5860515727408457, + "grad_norm": 2.2491936683654785, + "learning_rate": 3.7282517078931524e-06, + "loss": 0.1322, + "step": 23159 + }, + { + "epoch": 0.5860768783055393, + "grad_norm": 6.349541664123535, + "learning_rate": 3.727863394597049e-06, + "loss": 0.1993, + "step": 23160 + }, + { + "epoch": 0.586102183870233, + "grad_norm": 5.284341812133789, + "learning_rate": 3.7274750895047173e-06, + "loss": 0.195, + "step": 23161 + }, + { + "epoch": 0.5861274894349268, + "grad_norm": 4.475848197937012, + "learning_rate": 3.727086792618665e-06, + "loss": 0.1711, + "step": 23162 + }, + { + "epoch": 0.5861527949996204, + "grad_norm": 13.667134284973145, + "learning_rate": 3.7266985039413907e-06, + "loss": 0.2175, + "step": 23163 + }, + { + "epoch": 0.5861781005643141, + "grad_norm": 3.697558641433716, + "learning_rate": 3.7263102234754e-06, + "loss": 0.1462, + "step": 23164 + }, + { + "epoch": 0.5862034061290078, + "grad_norm": 4.596431732177734, + "learning_rate": 3.7259219512231994e-06, + "loss": 0.2085, + "step": 23165 + }, + { + "epoch": 0.5862287116937015, + "grad_norm": 3.998727321624756, + "learning_rate": 3.7255336871872918e-06, + "loss": 0.1271, + "step": 23166 + }, + { + "epoch": 0.5862540172583951, + "grad_norm": 10.200180053710938, + "learning_rate": 3.725145431370179e-06, + "loss": 0.3149, + "step": 23167 + }, + { + "epoch": 0.5862793228230888, + "grad_norm": 2.803144931793213, + "learning_rate": 3.7247571837743657e-06, + "loss": 0.1525, + "step": 23168 + }, + { + "epoch": 0.5863046283877825, + "grad_norm": 11.748563766479492, + "learning_rate": 3.7243689444023566e-06, + "loss": 0.1496, + "step": 23169 + }, + { + "epoch": 0.5863299339524761, + "grad_norm": 6.893770694732666, + "learning_rate": 3.723980713256657e-06, + "loss": 0.2552, + "step": 23170 + }, + { + "epoch": 0.5863552395171698, + "grad_norm": 3.3279783725738525, + "learning_rate": 3.723592490339766e-06, + "loss": 0.1365, + "step": 23171 + }, + { + "epoch": 0.5863805450818635, + "grad_norm": 2.830617666244507, + "learning_rate": 3.72320427565419e-06, + "loss": 0.146, + "step": 23172 + }, + { + "epoch": 0.5864058506465571, + "grad_norm": 5.417728424072266, + "learning_rate": 3.722816069202432e-06, + "loss": 0.1805, + "step": 23173 + }, + { + "epoch": 0.5864311562112509, + "grad_norm": 4.9721879959106445, + "learning_rate": 3.7224278709869948e-06, + "loss": 0.2074, + "step": 23174 + }, + { + "epoch": 0.5864564617759446, + "grad_norm": 3.474721670150757, + "learning_rate": 3.7220396810103854e-06, + "loss": 0.1666, + "step": 23175 + }, + { + "epoch": 0.5864817673406382, + "grad_norm": 5.895510196685791, + "learning_rate": 3.7216514992751026e-06, + "loss": 0.1873, + "step": 23176 + }, + { + "epoch": 0.5865070729053319, + "grad_norm": 5.052614212036133, + "learning_rate": 3.7212633257836504e-06, + "loss": 0.1366, + "step": 23177 + }, + { + "epoch": 0.5865323784700256, + "grad_norm": 7.957448482513428, + "learning_rate": 3.720875160538534e-06, + "loss": 0.2532, + "step": 23178 + }, + { + "epoch": 0.5865576840347192, + "grad_norm": 10.120381355285645, + "learning_rate": 3.720487003542258e-06, + "loss": 0.192, + "step": 23179 + }, + { + "epoch": 0.5865829895994129, + "grad_norm": 5.395445823669434, + "learning_rate": 3.720098854797321e-06, + "loss": 0.1617, + "step": 23180 + }, + { + "epoch": 0.5866082951641066, + "grad_norm": 5.539931774139404, + "learning_rate": 3.7197107143062285e-06, + "loss": 0.2719, + "step": 23181 + }, + { + "epoch": 0.5866336007288002, + "grad_norm": 4.761664867401123, + "learning_rate": 3.719322582071484e-06, + "loss": 0.1603, + "step": 23182 + }, + { + "epoch": 0.5866589062934939, + "grad_norm": 4.365353107452393, + "learning_rate": 3.7189344580955893e-06, + "loss": 0.1839, + "step": 23183 + }, + { + "epoch": 0.5866842118581876, + "grad_norm": 4.948244094848633, + "learning_rate": 3.718546342381049e-06, + "loss": 0.1315, + "step": 23184 + }, + { + "epoch": 0.5867095174228812, + "grad_norm": 3.5475895404815674, + "learning_rate": 3.718158234930364e-06, + "loss": 0.1296, + "step": 23185 + }, + { + "epoch": 0.586734822987575, + "grad_norm": 2.308983325958252, + "learning_rate": 3.7177701357460395e-06, + "loss": 0.1097, + "step": 23186 + }, + { + "epoch": 0.5867601285522687, + "grad_norm": 4.2295966148376465, + "learning_rate": 3.7173820448305754e-06, + "loss": 0.1208, + "step": 23187 + }, + { + "epoch": 0.5867854341169623, + "grad_norm": 3.348249673843384, + "learning_rate": 3.7169939621864765e-06, + "loss": 0.1258, + "step": 23188 + }, + { + "epoch": 0.586810739681656, + "grad_norm": 4.422648906707764, + "learning_rate": 3.7166058878162454e-06, + "loss": 0.1832, + "step": 23189 + }, + { + "epoch": 0.5868360452463497, + "grad_norm": 8.908570289611816, + "learning_rate": 3.7162178217223834e-06, + "loss": 0.289, + "step": 23190 + }, + { + "epoch": 0.5868613508110434, + "grad_norm": 5.029210090637207, + "learning_rate": 3.715829763907393e-06, + "loss": 0.2168, + "step": 23191 + }, + { + "epoch": 0.586886656375737, + "grad_norm": 5.092134952545166, + "learning_rate": 3.7154417143737803e-06, + "loss": 0.1166, + "step": 23192 + }, + { + "epoch": 0.5869119619404307, + "grad_norm": 10.665705680847168, + "learning_rate": 3.7150536731240427e-06, + "loss": 0.1543, + "step": 23193 + }, + { + "epoch": 0.5869372675051244, + "grad_norm": 5.731688499450684, + "learning_rate": 3.714665640160685e-06, + "loss": 0.2288, + "step": 23194 + }, + { + "epoch": 0.586962573069818, + "grad_norm": 8.759629249572754, + "learning_rate": 3.71427761548621e-06, + "loss": 0.3865, + "step": 23195 + }, + { + "epoch": 0.5869878786345117, + "grad_norm": 5.261158466339111, + "learning_rate": 3.713889599103119e-06, + "loss": 0.1803, + "step": 23196 + }, + { + "epoch": 0.5870131841992055, + "grad_norm": 11.568338394165039, + "learning_rate": 3.7135015910139172e-06, + "loss": 0.2722, + "step": 23197 + }, + { + "epoch": 0.587038489763899, + "grad_norm": 4.6850714683532715, + "learning_rate": 3.7131135912211014e-06, + "loss": 0.1028, + "step": 23198 + }, + { + "epoch": 0.5870637953285928, + "grad_norm": 4.7261061668396, + "learning_rate": 3.712725599727177e-06, + "loss": 0.2103, + "step": 23199 + }, + { + "epoch": 0.5870891008932865, + "grad_norm": 3.8789737224578857, + "learning_rate": 3.7123376165346464e-06, + "loss": 0.1532, + "step": 23200 + }, + { + "epoch": 0.5871144064579801, + "grad_norm": 4.697049140930176, + "learning_rate": 3.7119496416460125e-06, + "loss": 0.2349, + "step": 23201 + }, + { + "epoch": 0.5871397120226738, + "grad_norm": 5.2637858390808105, + "learning_rate": 3.711561675063774e-06, + "loss": 0.1581, + "step": 23202 + }, + { + "epoch": 0.5871650175873675, + "grad_norm": 5.327578544616699, + "learning_rate": 3.7111737167904343e-06, + "loss": 0.1919, + "step": 23203 + }, + { + "epoch": 0.5871903231520611, + "grad_norm": 4.273344039916992, + "learning_rate": 3.710785766828496e-06, + "loss": 0.1039, + "step": 23204 + }, + { + "epoch": 0.5872156287167548, + "grad_norm": 9.368464469909668, + "learning_rate": 3.71039782518046e-06, + "loss": 0.2011, + "step": 23205 + }, + { + "epoch": 0.5872409342814485, + "grad_norm": 7.527170181274414, + "learning_rate": 3.710009891848831e-06, + "loss": 0.1954, + "step": 23206 + }, + { + "epoch": 0.5872662398461421, + "grad_norm": 8.618435859680176, + "learning_rate": 3.7096219668361057e-06, + "loss": 0.2294, + "step": 23207 + }, + { + "epoch": 0.5872915454108358, + "grad_norm": 3.1116943359375, + "learning_rate": 3.709234050144789e-06, + "loss": 0.1038, + "step": 23208 + }, + { + "epoch": 0.5873168509755295, + "grad_norm": 4.767374515533447, + "learning_rate": 3.7088461417773814e-06, + "loss": 0.1836, + "step": 23209 + }, + { + "epoch": 0.5873421565402231, + "grad_norm": 3.58123517036438, + "learning_rate": 3.708458241736387e-06, + "loss": 0.1286, + "step": 23210 + }, + { + "epoch": 0.5873674621049169, + "grad_norm": 3.8789470195770264, + "learning_rate": 3.7080703500243026e-06, + "loss": 0.0923, + "step": 23211 + }, + { + "epoch": 0.5873927676696106, + "grad_norm": 2.401228427886963, + "learning_rate": 3.707682466643633e-06, + "loss": 0.1302, + "step": 23212 + }, + { + "epoch": 0.5874180732343042, + "grad_norm": 3.2918195724487305, + "learning_rate": 3.7072945915968787e-06, + "loss": 0.1392, + "step": 23213 + }, + { + "epoch": 0.5874433787989979, + "grad_norm": 7.025810718536377, + "learning_rate": 3.706906724886541e-06, + "loss": 0.1768, + "step": 23214 + }, + { + "epoch": 0.5874686843636916, + "grad_norm": 10.017594337463379, + "learning_rate": 3.706518866515122e-06, + "loss": 0.1799, + "step": 23215 + }, + { + "epoch": 0.5874939899283853, + "grad_norm": 9.567022323608398, + "learning_rate": 3.7061310164851227e-06, + "loss": 0.227, + "step": 23216 + }, + { + "epoch": 0.5875192954930789, + "grad_norm": 6.871821880340576, + "learning_rate": 3.7057431747990422e-06, + "loss": 0.2131, + "step": 23217 + }, + { + "epoch": 0.5875446010577726, + "grad_norm": 6.50736141204834, + "learning_rate": 3.7053553414593835e-06, + "loss": 0.1375, + "step": 23218 + }, + { + "epoch": 0.5875699066224663, + "grad_norm": 7.557275295257568, + "learning_rate": 3.7049675164686494e-06, + "loss": 0.2485, + "step": 23219 + }, + { + "epoch": 0.5875952121871599, + "grad_norm": 10.793417930603027, + "learning_rate": 3.704579699829337e-06, + "loss": 0.2005, + "step": 23220 + }, + { + "epoch": 0.5876205177518536, + "grad_norm": 4.329170227050781, + "learning_rate": 3.704191891543949e-06, + "loss": 0.1568, + "step": 23221 + }, + { + "epoch": 0.5876458233165474, + "grad_norm": 4.574324131011963, + "learning_rate": 3.703804091614986e-06, + "loss": 0.1538, + "step": 23222 + }, + { + "epoch": 0.587671128881241, + "grad_norm": 4.057338714599609, + "learning_rate": 3.703416300044952e-06, + "loss": 0.1886, + "step": 23223 + }, + { + "epoch": 0.5876964344459347, + "grad_norm": 3.780292510986328, + "learning_rate": 3.703028516836342e-06, + "loss": 0.1462, + "step": 23224 + }, + { + "epoch": 0.5877217400106284, + "grad_norm": 5.498382568359375, + "learning_rate": 3.702640741991661e-06, + "loss": 0.2475, + "step": 23225 + }, + { + "epoch": 0.587747045575322, + "grad_norm": 3.8573451042175293, + "learning_rate": 3.7022529755134086e-06, + "loss": 0.1766, + "step": 23226 + }, + { + "epoch": 0.5877723511400157, + "grad_norm": 5.435818195343018, + "learning_rate": 3.7018652174040847e-06, + "loss": 0.1894, + "step": 23227 + }, + { + "epoch": 0.5877976567047094, + "grad_norm": 6.1074395179748535, + "learning_rate": 3.7014774676661924e-06, + "loss": 0.1592, + "step": 23228 + }, + { + "epoch": 0.587822962269403, + "grad_norm": 3.604773998260498, + "learning_rate": 3.7010897263022292e-06, + "loss": 0.1298, + "step": 23229 + }, + { + "epoch": 0.5878482678340967, + "grad_norm": 6.289125442504883, + "learning_rate": 3.700701993314696e-06, + "loss": 0.2019, + "step": 23230 + }, + { + "epoch": 0.5878735733987904, + "grad_norm": 6.695587635040283, + "learning_rate": 3.7003142687060943e-06, + "loss": 0.1633, + "step": 23231 + }, + { + "epoch": 0.587898878963484, + "grad_norm": 3.5764834880828857, + "learning_rate": 3.699926552478926e-06, + "loss": 0.1773, + "step": 23232 + }, + { + "epoch": 0.5879241845281777, + "grad_norm": 9.973359107971191, + "learning_rate": 3.699538844635688e-06, + "loss": 0.2729, + "step": 23233 + }, + { + "epoch": 0.5879494900928715, + "grad_norm": 4.685309886932373, + "learning_rate": 3.6991511451788813e-06, + "loss": 0.1919, + "step": 23234 + }, + { + "epoch": 0.587974795657565, + "grad_norm": 4.611046314239502, + "learning_rate": 3.6987634541110073e-06, + "loss": 0.1923, + "step": 23235 + }, + { + "epoch": 0.5880001012222588, + "grad_norm": 5.766176223754883, + "learning_rate": 3.698375771434566e-06, + "loss": 0.1499, + "step": 23236 + }, + { + "epoch": 0.5880254067869525, + "grad_norm": 5.004843235015869, + "learning_rate": 3.697988097152059e-06, + "loss": 0.2107, + "step": 23237 + }, + { + "epoch": 0.5880507123516461, + "grad_norm": 4.6950860023498535, + "learning_rate": 3.697600431265982e-06, + "loss": 0.1572, + "step": 23238 + }, + { + "epoch": 0.5880760179163398, + "grad_norm": 3.483804941177368, + "learning_rate": 3.6972127737788383e-06, + "loss": 0.2024, + "step": 23239 + }, + { + "epoch": 0.5881013234810335, + "grad_norm": 5.677022933959961, + "learning_rate": 3.6968251246931275e-06, + "loss": 0.1004, + "step": 23240 + }, + { + "epoch": 0.5881266290457271, + "grad_norm": 12.671104431152344, + "learning_rate": 3.6964374840113497e-06, + "loss": 0.224, + "step": 23241 + }, + { + "epoch": 0.5881519346104208, + "grad_norm": 9.97740364074707, + "learning_rate": 3.6960498517360034e-06, + "loss": 0.2874, + "step": 23242 + }, + { + "epoch": 0.5881772401751145, + "grad_norm": 6.277263641357422, + "learning_rate": 3.6956622278695885e-06, + "loss": 0.2079, + "step": 23243 + }, + { + "epoch": 0.5882025457398082, + "grad_norm": 5.936186790466309, + "learning_rate": 3.6952746124146055e-06, + "loss": 0.1346, + "step": 23244 + }, + { + "epoch": 0.5882278513045018, + "grad_norm": 5.897562503814697, + "learning_rate": 3.694887005373555e-06, + "loss": 0.183, + "step": 23245 + }, + { + "epoch": 0.5882531568691955, + "grad_norm": 5.1283674240112305, + "learning_rate": 3.6944994067489335e-06, + "loss": 0.1501, + "step": 23246 + }, + { + "epoch": 0.5882784624338893, + "grad_norm": 7.5246148109436035, + "learning_rate": 3.694111816543244e-06, + "loss": 0.1668, + "step": 23247 + }, + { + "epoch": 0.5883037679985829, + "grad_norm": 3.365082025527954, + "learning_rate": 3.6937242347589834e-06, + "loss": 0.1041, + "step": 23248 + }, + { + "epoch": 0.5883290735632766, + "grad_norm": 5.845109462738037, + "learning_rate": 3.693336661398652e-06, + "loss": 0.1939, + "step": 23249 + }, + { + "epoch": 0.5883543791279703, + "grad_norm": 2.070664644241333, + "learning_rate": 3.6929490964647518e-06, + "loss": 0.1326, + "step": 23250 + }, + { + "epoch": 0.5883796846926639, + "grad_norm": 2.8999850749969482, + "learning_rate": 3.692561539959778e-06, + "loss": 0.1312, + "step": 23251 + }, + { + "epoch": 0.5884049902573576, + "grad_norm": 6.214057922363281, + "learning_rate": 3.692173991886231e-06, + "loss": 0.1847, + "step": 23252 + }, + { + "epoch": 0.5884302958220513, + "grad_norm": 4.689565181732178, + "learning_rate": 3.6917864522466106e-06, + "loss": 0.1457, + "step": 23253 + }, + { + "epoch": 0.5884556013867449, + "grad_norm": 8.138073921203613, + "learning_rate": 3.6913989210434186e-06, + "loss": 0.1431, + "step": 23254 + }, + { + "epoch": 0.5884809069514386, + "grad_norm": 2.959627389907837, + "learning_rate": 3.6910113982791494e-06, + "loss": 0.1602, + "step": 23255 + }, + { + "epoch": 0.5885062125161323, + "grad_norm": 5.467467784881592, + "learning_rate": 3.6906238839563042e-06, + "loss": 0.1158, + "step": 23256 + }, + { + "epoch": 0.5885315180808259, + "grad_norm": 7.378113269805908, + "learning_rate": 3.690236378077382e-06, + "loss": 0.1882, + "step": 23257 + }, + { + "epoch": 0.5885568236455196, + "grad_norm": 2.9618568420410156, + "learning_rate": 3.6898488806448814e-06, + "loss": 0.118, + "step": 23258 + }, + { + "epoch": 0.5885821292102134, + "grad_norm": 5.370422840118408, + "learning_rate": 3.6894613916613043e-06, + "loss": 0.1301, + "step": 23259 + }, + { + "epoch": 0.588607434774907, + "grad_norm": 3.939757823944092, + "learning_rate": 3.689073911129144e-06, + "loss": 0.0927, + "step": 23260 + }, + { + "epoch": 0.5886327403396007, + "grad_norm": 6.170202732086182, + "learning_rate": 3.6886864390509024e-06, + "loss": 0.1837, + "step": 23261 + }, + { + "epoch": 0.5886580459042944, + "grad_norm": 7.23192834854126, + "learning_rate": 3.6882989754290786e-06, + "loss": 0.174, + "step": 23262 + }, + { + "epoch": 0.588683351468988, + "grad_norm": 7.991376876831055, + "learning_rate": 3.6879115202661726e-06, + "loss": 0.1401, + "step": 23263 + }, + { + "epoch": 0.5887086570336817, + "grad_norm": 2.088408946990967, + "learning_rate": 3.687524073564679e-06, + "loss": 0.0669, + "step": 23264 + }, + { + "epoch": 0.5887339625983754, + "grad_norm": 3.862661600112915, + "learning_rate": 3.687136635327099e-06, + "loss": 0.1474, + "step": 23265 + }, + { + "epoch": 0.588759268163069, + "grad_norm": 6.127205848693848, + "learning_rate": 3.6867492055559296e-06, + "loss": 0.2134, + "step": 23266 + }, + { + "epoch": 0.5887845737277627, + "grad_norm": 10.3539400100708, + "learning_rate": 3.6863617842536727e-06, + "loss": 0.2192, + "step": 23267 + }, + { + "epoch": 0.5888098792924564, + "grad_norm": 4.508726596832275, + "learning_rate": 3.6859743714228226e-06, + "loss": 0.1083, + "step": 23268 + }, + { + "epoch": 0.5888351848571501, + "grad_norm": 3.6488592624664307, + "learning_rate": 3.685586967065879e-06, + "loss": 0.1587, + "step": 23269 + }, + { + "epoch": 0.5888604904218437, + "grad_norm": 7.8671345710754395, + "learning_rate": 3.6851995711853407e-06, + "loss": 0.1269, + "step": 23270 + }, + { + "epoch": 0.5888857959865375, + "grad_norm": 20.07496452331543, + "learning_rate": 3.684812183783707e-06, + "loss": 0.4404, + "step": 23271 + }, + { + "epoch": 0.5889111015512312, + "grad_norm": 10.184281349182129, + "learning_rate": 3.6844248048634747e-06, + "loss": 0.2504, + "step": 23272 + }, + { + "epoch": 0.5889364071159248, + "grad_norm": 4.452668190002441, + "learning_rate": 3.684037434427141e-06, + "loss": 0.1271, + "step": 23273 + }, + { + "epoch": 0.5889617126806185, + "grad_norm": 3.780235767364502, + "learning_rate": 3.6836500724772062e-06, + "loss": 0.1342, + "step": 23274 + }, + { + "epoch": 0.5889870182453122, + "grad_norm": 7.596745491027832, + "learning_rate": 3.6832627190161664e-06, + "loss": 0.2248, + "step": 23275 + }, + { + "epoch": 0.5890123238100058, + "grad_norm": 7.394479274749756, + "learning_rate": 3.6828753740465205e-06, + "loss": 0.2092, + "step": 23276 + }, + { + "epoch": 0.5890376293746995, + "grad_norm": 3.143220901489258, + "learning_rate": 3.6824880375707672e-06, + "loss": 0.1007, + "step": 23277 + }, + { + "epoch": 0.5890629349393932, + "grad_norm": 5.2146430015563965, + "learning_rate": 3.682100709591403e-06, + "loss": 0.1195, + "step": 23278 + }, + { + "epoch": 0.5890882405040868, + "grad_norm": 18.240203857421875, + "learning_rate": 3.681713390110925e-06, + "loss": 0.1698, + "step": 23279 + }, + { + "epoch": 0.5891135460687805, + "grad_norm": 3.5047481060028076, + "learning_rate": 3.6813260791318327e-06, + "loss": 0.1024, + "step": 23280 + }, + { + "epoch": 0.5891388516334742, + "grad_norm": 5.693580627441406, + "learning_rate": 3.6809387766566253e-06, + "loss": 0.1918, + "step": 23281 + }, + { + "epoch": 0.5891641571981678, + "grad_norm": 7.408618450164795, + "learning_rate": 3.680551482687796e-06, + "loss": 0.2199, + "step": 23282 + }, + { + "epoch": 0.5891894627628615, + "grad_norm": 4.86144495010376, + "learning_rate": 3.6801641972278446e-06, + "loss": 0.1522, + "step": 23283 + }, + { + "epoch": 0.5892147683275553, + "grad_norm": 4.297491073608398, + "learning_rate": 3.6797769202792693e-06, + "loss": 0.1366, + "step": 23284 + }, + { + "epoch": 0.5892400738922489, + "grad_norm": 2.0280113220214844, + "learning_rate": 3.6793896518445695e-06, + "loss": 0.0923, + "step": 23285 + }, + { + "epoch": 0.5892653794569426, + "grad_norm": 4.171382904052734, + "learning_rate": 3.6790023919262375e-06, + "loss": 0.1981, + "step": 23286 + }, + { + "epoch": 0.5892906850216363, + "grad_norm": 4.065860271453857, + "learning_rate": 3.6786151405267733e-06, + "loss": 0.1035, + "step": 23287 + }, + { + "epoch": 0.5893159905863299, + "grad_norm": 20.726905822753906, + "learning_rate": 3.678227897648674e-06, + "loss": 0.1215, + "step": 23288 + }, + { + "epoch": 0.5893412961510236, + "grad_norm": 2.6850192546844482, + "learning_rate": 3.6778406632944375e-06, + "loss": 0.1345, + "step": 23289 + }, + { + "epoch": 0.5893666017157173, + "grad_norm": 8.088287353515625, + "learning_rate": 3.6774534374665626e-06, + "loss": 0.2248, + "step": 23290 + }, + { + "epoch": 0.5893919072804109, + "grad_norm": 10.46562385559082, + "learning_rate": 3.677066220167542e-06, + "loss": 0.2889, + "step": 23291 + }, + { + "epoch": 0.5894172128451046, + "grad_norm": 3.6162545680999756, + "learning_rate": 3.6766790113998763e-06, + "loss": 0.1087, + "step": 23292 + }, + { + "epoch": 0.5894425184097983, + "grad_norm": 6.7642741203308105, + "learning_rate": 3.6762918111660607e-06, + "loss": 0.1524, + "step": 23293 + }, + { + "epoch": 0.589467823974492, + "grad_norm": 10.322428703308105, + "learning_rate": 3.6759046194685953e-06, + "loss": 0.3285, + "step": 23294 + }, + { + "epoch": 0.5894931295391856, + "grad_norm": 2.6817169189453125, + "learning_rate": 3.6755174363099733e-06, + "loss": 0.1026, + "step": 23295 + }, + { + "epoch": 0.5895184351038794, + "grad_norm": 3.20453143119812, + "learning_rate": 3.675130261692692e-06, + "loss": 0.1889, + "step": 23296 + }, + { + "epoch": 0.5895437406685731, + "grad_norm": 4.140936374664307, + "learning_rate": 3.674743095619249e-06, + "loss": 0.1423, + "step": 23297 + }, + { + "epoch": 0.5895690462332667, + "grad_norm": 3.6035289764404297, + "learning_rate": 3.6743559380921444e-06, + "loss": 0.1016, + "step": 23298 + }, + { + "epoch": 0.5895943517979604, + "grad_norm": 5.882976531982422, + "learning_rate": 3.673968789113869e-06, + "loss": 0.1421, + "step": 23299 + }, + { + "epoch": 0.5896196573626541, + "grad_norm": 3.9372246265411377, + "learning_rate": 3.6735816486869223e-06, + "loss": 0.1153, + "step": 23300 + }, + { + "epoch": 0.5896449629273477, + "grad_norm": 3.1214683055877686, + "learning_rate": 3.6731945168138016e-06, + "loss": 0.1133, + "step": 23301 + }, + { + "epoch": 0.5896702684920414, + "grad_norm": 5.551313400268555, + "learning_rate": 3.672807393497002e-06, + "loss": 0.1318, + "step": 23302 + }, + { + "epoch": 0.5896955740567351, + "grad_norm": 4.286294460296631, + "learning_rate": 3.6724202787390217e-06, + "loss": 0.1216, + "step": 23303 + }, + { + "epoch": 0.5897208796214287, + "grad_norm": 4.3836798667907715, + "learning_rate": 3.6720331725423563e-06, + "loss": 0.179, + "step": 23304 + }, + { + "epoch": 0.5897461851861224, + "grad_norm": 3.9612560272216797, + "learning_rate": 3.6716460749095005e-06, + "loss": 0.0909, + "step": 23305 + }, + { + "epoch": 0.5897714907508161, + "grad_norm": 3.0862669944763184, + "learning_rate": 3.671258985842952e-06, + "loss": 0.125, + "step": 23306 + }, + { + "epoch": 0.5897967963155097, + "grad_norm": 2.6509621143341064, + "learning_rate": 3.670871905345208e-06, + "loss": 0.0799, + "step": 23307 + }, + { + "epoch": 0.5898221018802035, + "grad_norm": 7.568215370178223, + "learning_rate": 3.670484833418765e-06, + "loss": 0.2308, + "step": 23308 + }, + { + "epoch": 0.5898474074448972, + "grad_norm": 4.264551162719727, + "learning_rate": 3.6700977700661156e-06, + "loss": 0.1529, + "step": 23309 + }, + { + "epoch": 0.5898727130095908, + "grad_norm": 8.311264991760254, + "learning_rate": 3.6697107152897588e-06, + "loss": 0.2216, + "step": 23310 + }, + { + "epoch": 0.5898980185742845, + "grad_norm": 10.55661678314209, + "learning_rate": 3.66932366909219e-06, + "loss": 0.2735, + "step": 23311 + }, + { + "epoch": 0.5899233241389782, + "grad_norm": 4.953190326690674, + "learning_rate": 3.6689366314759077e-06, + "loss": 0.2132, + "step": 23312 + }, + { + "epoch": 0.5899486297036718, + "grad_norm": 3.056401252746582, + "learning_rate": 3.668549602443403e-06, + "loss": 0.1413, + "step": 23313 + }, + { + "epoch": 0.5899739352683655, + "grad_norm": 4.244626998901367, + "learning_rate": 3.668162581997175e-06, + "loss": 0.1302, + "step": 23314 + }, + { + "epoch": 0.5899992408330592, + "grad_norm": 4.41159200668335, + "learning_rate": 3.6677755701397176e-06, + "loss": 0.1673, + "step": 23315 + }, + { + "epoch": 0.5900245463977528, + "grad_norm": 8.742280006408691, + "learning_rate": 3.667388566873531e-06, + "loss": 0.212, + "step": 23316 + }, + { + "epoch": 0.5900498519624465, + "grad_norm": 13.670334815979004, + "learning_rate": 3.6670015722011055e-06, + "loss": 0.2065, + "step": 23317 + }, + { + "epoch": 0.5900751575271402, + "grad_norm": 3.5107555389404297, + "learning_rate": 3.666614586124938e-06, + "loss": 0.1654, + "step": 23318 + }, + { + "epoch": 0.590100463091834, + "grad_norm": 10.861990928649902, + "learning_rate": 3.6662276086475257e-06, + "loss": 0.277, + "step": 23319 + }, + { + "epoch": 0.5901257686565275, + "grad_norm": 10.577306747436523, + "learning_rate": 3.6658406397713652e-06, + "loss": 0.2484, + "step": 23320 + }, + { + "epoch": 0.5901510742212213, + "grad_norm": 3.89819073677063, + "learning_rate": 3.6654536794989482e-06, + "loss": 0.0871, + "step": 23321 + }, + { + "epoch": 0.590176379785915, + "grad_norm": 4.593378067016602, + "learning_rate": 3.6650667278327723e-06, + "loss": 0.1575, + "step": 23322 + }, + { + "epoch": 0.5902016853506086, + "grad_norm": 8.071640968322754, + "learning_rate": 3.6646797847753322e-06, + "loss": 0.2265, + "step": 23323 + }, + { + "epoch": 0.5902269909153023, + "grad_norm": 8.939289093017578, + "learning_rate": 3.6642928503291247e-06, + "loss": 0.2339, + "step": 23324 + }, + { + "epoch": 0.590252296479996, + "grad_norm": 4.351605415344238, + "learning_rate": 3.6639059244966456e-06, + "loss": 0.1774, + "step": 23325 + }, + { + "epoch": 0.5902776020446896, + "grad_norm": 6.581305027008057, + "learning_rate": 3.663519007280387e-06, + "loss": 0.1964, + "step": 23326 + }, + { + "epoch": 0.5903029076093833, + "grad_norm": 3.0694544315338135, + "learning_rate": 3.6631320986828457e-06, + "loss": 0.1627, + "step": 23327 + }, + { + "epoch": 0.590328213174077, + "grad_norm": 3.6682205200195312, + "learning_rate": 3.6627451987065177e-06, + "loss": 0.1183, + "step": 23328 + }, + { + "epoch": 0.5903535187387706, + "grad_norm": 6.052335262298584, + "learning_rate": 3.662358307353897e-06, + "loss": 0.2159, + "step": 23329 + }, + { + "epoch": 0.5903788243034643, + "grad_norm": 8.594183921813965, + "learning_rate": 3.6619714246274777e-06, + "loss": 0.1411, + "step": 23330 + }, + { + "epoch": 0.590404129868158, + "grad_norm": 7.339802265167236, + "learning_rate": 3.6615845505297563e-06, + "loss": 0.2008, + "step": 23331 + }, + { + "epoch": 0.5904294354328516, + "grad_norm": 6.860138893127441, + "learning_rate": 3.661197685063228e-06, + "loss": 0.2317, + "step": 23332 + }, + { + "epoch": 0.5904547409975454, + "grad_norm": 3.939610481262207, + "learning_rate": 3.6608108282303854e-06, + "loss": 0.1535, + "step": 23333 + }, + { + "epoch": 0.5904800465622391, + "grad_norm": 3.4104015827178955, + "learning_rate": 3.660423980033726e-06, + "loss": 0.1546, + "step": 23334 + }, + { + "epoch": 0.5905053521269327, + "grad_norm": 6.5655341148376465, + "learning_rate": 3.660037140475743e-06, + "loss": 0.1511, + "step": 23335 + }, + { + "epoch": 0.5905306576916264, + "grad_norm": 6.662359714508057, + "learning_rate": 3.65965030955893e-06, + "loss": 0.2408, + "step": 23336 + }, + { + "epoch": 0.5905559632563201, + "grad_norm": 18.89444923400879, + "learning_rate": 3.6592634872857835e-06, + "loss": 0.3187, + "step": 23337 + }, + { + "epoch": 0.5905812688210137, + "grad_norm": 3.005690097808838, + "learning_rate": 3.6588766736587987e-06, + "loss": 0.1393, + "step": 23338 + }, + { + "epoch": 0.5906065743857074, + "grad_norm": 9.365184783935547, + "learning_rate": 3.658489868680467e-06, + "loss": 0.1092, + "step": 23339 + }, + { + "epoch": 0.5906318799504011, + "grad_norm": 9.48589038848877, + "learning_rate": 3.658103072353285e-06, + "loss": 0.221, + "step": 23340 + }, + { + "epoch": 0.5906571855150947, + "grad_norm": 3.1981446743011475, + "learning_rate": 3.657716284679746e-06, + "loss": 0.1321, + "step": 23341 + }, + { + "epoch": 0.5906824910797884, + "grad_norm": 10.173027992248535, + "learning_rate": 3.6573295056623454e-06, + "loss": 0.2142, + "step": 23342 + }, + { + "epoch": 0.5907077966444821, + "grad_norm": 3.658378839492798, + "learning_rate": 3.656942735303579e-06, + "loss": 0.1385, + "step": 23343 + }, + { + "epoch": 0.5907331022091759, + "grad_norm": 3.0723795890808105, + "learning_rate": 3.656555973605937e-06, + "loss": 0.1472, + "step": 23344 + }, + { + "epoch": 0.5907584077738695, + "grad_norm": 4.7227301597595215, + "learning_rate": 3.6561692205719158e-06, + "loss": 0.1458, + "step": 23345 + }, + { + "epoch": 0.5907837133385632, + "grad_norm": 4.420115947723389, + "learning_rate": 3.655782476204009e-06, + "loss": 0.1909, + "step": 23346 + }, + { + "epoch": 0.5908090189032569, + "grad_norm": 9.248921394348145, + "learning_rate": 3.6553957405047135e-06, + "loss": 0.2247, + "step": 23347 + }, + { + "epoch": 0.5908343244679505, + "grad_norm": 4.9202375411987305, + "learning_rate": 3.655009013476518e-06, + "loss": 0.1875, + "step": 23348 + }, + { + "epoch": 0.5908596300326442, + "grad_norm": 9.396102905273438, + "learning_rate": 3.65462229512192e-06, + "loss": 0.198, + "step": 23349 + }, + { + "epoch": 0.5908849355973379, + "grad_norm": 4.413454055786133, + "learning_rate": 3.654235585443412e-06, + "loss": 0.1744, + "step": 23350 + }, + { + "epoch": 0.5909102411620315, + "grad_norm": 6.679113388061523, + "learning_rate": 3.6538488844434903e-06, + "loss": 0.2249, + "step": 23351 + }, + { + "epoch": 0.5909355467267252, + "grad_norm": 2.940483331680298, + "learning_rate": 3.653462192124645e-06, + "loss": 0.1083, + "step": 23352 + }, + { + "epoch": 0.5909608522914189, + "grad_norm": 3.81390118598938, + "learning_rate": 3.653075508489372e-06, + "loss": 0.1439, + "step": 23353 + }, + { + "epoch": 0.5909861578561125, + "grad_norm": 4.276442527770996, + "learning_rate": 3.6526888335401633e-06, + "loss": 0.1337, + "step": 23354 + }, + { + "epoch": 0.5910114634208062, + "grad_norm": 3.2548422813415527, + "learning_rate": 3.6523021672795146e-06, + "loss": 0.1651, + "step": 23355 + }, + { + "epoch": 0.5910367689855, + "grad_norm": 4.9707932472229, + "learning_rate": 3.6519155097099207e-06, + "loss": 0.1442, + "step": 23356 + }, + { + "epoch": 0.5910620745501936, + "grad_norm": 10.036776542663574, + "learning_rate": 3.6515288608338707e-06, + "loss": 0.1402, + "step": 23357 + }, + { + "epoch": 0.5910873801148873, + "grad_norm": 4.741293907165527, + "learning_rate": 3.65114222065386e-06, + "loss": 0.1694, + "step": 23358 + }, + { + "epoch": 0.591112685679581, + "grad_norm": 5.9104323387146, + "learning_rate": 3.6507555891723835e-06, + "loss": 0.2201, + "step": 23359 + }, + { + "epoch": 0.5911379912442746, + "grad_norm": 4.630230903625488, + "learning_rate": 3.650368966391933e-06, + "loss": 0.1702, + "step": 23360 + }, + { + "epoch": 0.5911632968089683, + "grad_norm": 5.8176655769348145, + "learning_rate": 3.6499823523150007e-06, + "loss": 0.1849, + "step": 23361 + }, + { + "epoch": 0.591188602373662, + "grad_norm": 4.309194564819336, + "learning_rate": 3.6495957469440824e-06, + "loss": 0.1896, + "step": 23362 + }, + { + "epoch": 0.5912139079383556, + "grad_norm": 8.471025466918945, + "learning_rate": 3.649209150281669e-06, + "loss": 0.1224, + "step": 23363 + }, + { + "epoch": 0.5912392135030493, + "grad_norm": 4.004399299621582, + "learning_rate": 3.6488225623302543e-06, + "loss": 0.1575, + "step": 23364 + }, + { + "epoch": 0.591264519067743, + "grad_norm": 4.510515213012695, + "learning_rate": 3.6484359830923345e-06, + "loss": 0.1483, + "step": 23365 + }, + { + "epoch": 0.5912898246324366, + "grad_norm": 2.5685250759124756, + "learning_rate": 3.6480494125703964e-06, + "loss": 0.0972, + "step": 23366 + }, + { + "epoch": 0.5913151301971303, + "grad_norm": 5.920331001281738, + "learning_rate": 3.6476628507669366e-06, + "loss": 0.2541, + "step": 23367 + }, + { + "epoch": 0.591340435761824, + "grad_norm": 11.619758605957031, + "learning_rate": 3.6472762976844476e-06, + "loss": 0.1397, + "step": 23368 + }, + { + "epoch": 0.5913657413265176, + "grad_norm": 2.626404047012329, + "learning_rate": 3.6468897533254243e-06, + "loss": 0.103, + "step": 23369 + }, + { + "epoch": 0.5913910468912114, + "grad_norm": 8.145797729492188, + "learning_rate": 3.646503217692355e-06, + "loss": 0.1591, + "step": 23370 + }, + { + "epoch": 0.5914163524559051, + "grad_norm": 3.024944543838501, + "learning_rate": 3.646116690787734e-06, + "loss": 0.1189, + "step": 23371 + }, + { + "epoch": 0.5914416580205988, + "grad_norm": 4.675107479095459, + "learning_rate": 3.645730172614056e-06, + "loss": 0.1654, + "step": 23372 + }, + { + "epoch": 0.5914669635852924, + "grad_norm": 4.408993244171143, + "learning_rate": 3.645343663173814e-06, + "loss": 0.1599, + "step": 23373 + }, + { + "epoch": 0.5914922691499861, + "grad_norm": 3.9374144077301025, + "learning_rate": 3.6449571624694956e-06, + "loss": 0.156, + "step": 23374 + }, + { + "epoch": 0.5915175747146798, + "grad_norm": 5.629241466522217, + "learning_rate": 3.644570670503598e-06, + "loss": 0.1957, + "step": 23375 + }, + { + "epoch": 0.5915428802793734, + "grad_norm": 3.510345220565796, + "learning_rate": 3.6441841872786104e-06, + "loss": 0.1213, + "step": 23376 + }, + { + "epoch": 0.5915681858440671, + "grad_norm": 8.247113227844238, + "learning_rate": 3.6437977127970274e-06, + "loss": 0.2426, + "step": 23377 + }, + { + "epoch": 0.5915934914087608, + "grad_norm": 5.449758052825928, + "learning_rate": 3.643411247061343e-06, + "loss": 0.1184, + "step": 23378 + }, + { + "epoch": 0.5916187969734544, + "grad_norm": 5.047966957092285, + "learning_rate": 3.6430247900740447e-06, + "loss": 0.1313, + "step": 23379 + }, + { + "epoch": 0.5916441025381481, + "grad_norm": 6.176913261413574, + "learning_rate": 3.6426383418376276e-06, + "loss": 0.1247, + "step": 23380 + }, + { + "epoch": 0.5916694081028419, + "grad_norm": 7.449141502380371, + "learning_rate": 3.6422519023545833e-06, + "loss": 0.1949, + "step": 23381 + }, + { + "epoch": 0.5916947136675355, + "grad_norm": 5.287750244140625, + "learning_rate": 3.6418654716274054e-06, + "loss": 0.1951, + "step": 23382 + }, + { + "epoch": 0.5917200192322292, + "grad_norm": 3.416614532470703, + "learning_rate": 3.641479049658583e-06, + "loss": 0.1407, + "step": 23383 + }, + { + "epoch": 0.5917453247969229, + "grad_norm": 6.232067584991455, + "learning_rate": 3.641092636450609e-06, + "loss": 0.2598, + "step": 23384 + }, + { + "epoch": 0.5917706303616165, + "grad_norm": 2.895913600921631, + "learning_rate": 3.640706232005976e-06, + "loss": 0.119, + "step": 23385 + }, + { + "epoch": 0.5917959359263102, + "grad_norm": 5.8274383544921875, + "learning_rate": 3.6403198363271763e-06, + "loss": 0.1605, + "step": 23386 + }, + { + "epoch": 0.5918212414910039, + "grad_norm": 5.6317949295043945, + "learning_rate": 3.6399334494167014e-06, + "loss": 0.2223, + "step": 23387 + }, + { + "epoch": 0.5918465470556975, + "grad_norm": 6.079576015472412, + "learning_rate": 3.639547071277042e-06, + "loss": 0.215, + "step": 23388 + }, + { + "epoch": 0.5918718526203912, + "grad_norm": 5.582981109619141, + "learning_rate": 3.639160701910691e-06, + "loss": 0.2007, + "step": 23389 + }, + { + "epoch": 0.5918971581850849, + "grad_norm": 4.881905555725098, + "learning_rate": 3.6387743413201393e-06, + "loss": 0.1559, + "step": 23390 + }, + { + "epoch": 0.5919224637497785, + "grad_norm": 5.114635467529297, + "learning_rate": 3.638387989507879e-06, + "loss": 0.1591, + "step": 23391 + }, + { + "epoch": 0.5919477693144722, + "grad_norm": 13.514979362487793, + "learning_rate": 3.6380016464764e-06, + "loss": 0.1925, + "step": 23392 + }, + { + "epoch": 0.591973074879166, + "grad_norm": 4.761970043182373, + "learning_rate": 3.6376153122281964e-06, + "loss": 0.1643, + "step": 23393 + }, + { + "epoch": 0.5919983804438596, + "grad_norm": 4.104682922363281, + "learning_rate": 3.637228986765757e-06, + "loss": 0.1328, + "step": 23394 + }, + { + "epoch": 0.5920236860085533, + "grad_norm": 5.580134868621826, + "learning_rate": 3.636842670091576e-06, + "loss": 0.1428, + "step": 23395 + }, + { + "epoch": 0.592048991573247, + "grad_norm": 4.213791370391846, + "learning_rate": 3.636456362208143e-06, + "loss": 0.15, + "step": 23396 + }, + { + "epoch": 0.5920742971379407, + "grad_norm": 5.0057783126831055, + "learning_rate": 3.6360700631179474e-06, + "loss": 0.1668, + "step": 23397 + }, + { + "epoch": 0.5920996027026343, + "grad_norm": 3.320880174636841, + "learning_rate": 3.6356837728234827e-06, + "loss": 0.1487, + "step": 23398 + }, + { + "epoch": 0.592124908267328, + "grad_norm": 5.568515777587891, + "learning_rate": 3.63529749132724e-06, + "loss": 0.1849, + "step": 23399 + }, + { + "epoch": 0.5921502138320217, + "grad_norm": 4.005950450897217, + "learning_rate": 3.6349112186317116e-06, + "loss": 0.1367, + "step": 23400 + }, + { + "epoch": 0.5921755193967153, + "grad_norm": 2.8709943294525146, + "learning_rate": 3.6345249547393846e-06, + "loss": 0.1176, + "step": 23401 + }, + { + "epoch": 0.592200824961409, + "grad_norm": 3.978365421295166, + "learning_rate": 3.6341386996527523e-06, + "loss": 0.1347, + "step": 23402 + }, + { + "epoch": 0.5922261305261027, + "grad_norm": 3.377729892730713, + "learning_rate": 3.633752453374305e-06, + "loss": 0.145, + "step": 23403 + }, + { + "epoch": 0.5922514360907963, + "grad_norm": 3.9034523963928223, + "learning_rate": 3.6333662159065368e-06, + "loss": 0.1579, + "step": 23404 + }, + { + "epoch": 0.59227674165549, + "grad_norm": 4.016858100891113, + "learning_rate": 3.632979987251933e-06, + "loss": 0.0939, + "step": 23405 + }, + { + "epoch": 0.5923020472201838, + "grad_norm": 3.507847547531128, + "learning_rate": 3.6325937674129875e-06, + "loss": 0.16, + "step": 23406 + }, + { + "epoch": 0.5923273527848774, + "grad_norm": 7.305330753326416, + "learning_rate": 3.6322075563921907e-06, + "loss": 0.1837, + "step": 23407 + }, + { + "epoch": 0.5923526583495711, + "grad_norm": 6.403471946716309, + "learning_rate": 3.6318213541920324e-06, + "loss": 0.1182, + "step": 23408 + }, + { + "epoch": 0.5923779639142648, + "grad_norm": 5.8936333656311035, + "learning_rate": 3.6314351608150055e-06, + "loss": 0.1423, + "step": 23409 + }, + { + "epoch": 0.5924032694789584, + "grad_norm": 2.849071979522705, + "learning_rate": 3.631048976263597e-06, + "loss": 0.1223, + "step": 23410 + }, + { + "epoch": 0.5924285750436521, + "grad_norm": 5.096924781799316, + "learning_rate": 3.630662800540299e-06, + "loss": 0.1529, + "step": 23411 + }, + { + "epoch": 0.5924538806083458, + "grad_norm": 7.357241153717041, + "learning_rate": 3.630276633647602e-06, + "loss": 0.2037, + "step": 23412 + }, + { + "epoch": 0.5924791861730394, + "grad_norm": 7.068843364715576, + "learning_rate": 3.629890475587998e-06, + "loss": 0.1264, + "step": 23413 + }, + { + "epoch": 0.5925044917377331, + "grad_norm": 13.930570602416992, + "learning_rate": 3.629504326363974e-06, + "loss": 0.2228, + "step": 23414 + }, + { + "epoch": 0.5925297973024268, + "grad_norm": 8.84471321105957, + "learning_rate": 3.629118185978021e-06, + "loss": 0.1239, + "step": 23415 + }, + { + "epoch": 0.5925551028671204, + "grad_norm": 3.044020652770996, + "learning_rate": 3.62873205443263e-06, + "loss": 0.1079, + "step": 23416 + }, + { + "epoch": 0.5925804084318141, + "grad_norm": 6.004951477050781, + "learning_rate": 3.6283459317302914e-06, + "loss": 0.1735, + "step": 23417 + }, + { + "epoch": 0.5926057139965079, + "grad_norm": 8.61363410949707, + "learning_rate": 3.627959817873495e-06, + "loss": 0.2186, + "step": 23418 + }, + { + "epoch": 0.5926310195612015, + "grad_norm": 4.501504421234131, + "learning_rate": 3.6275737128647306e-06, + "loss": 0.1176, + "step": 23419 + }, + { + "epoch": 0.5926563251258952, + "grad_norm": 4.848011493682861, + "learning_rate": 3.627187616706488e-06, + "loss": 0.1436, + "step": 23420 + }, + { + "epoch": 0.5926816306905889, + "grad_norm": 2.8829941749572754, + "learning_rate": 3.626801529401256e-06, + "loss": 0.103, + "step": 23421 + }, + { + "epoch": 0.5927069362552826, + "grad_norm": 4.849565505981445, + "learning_rate": 3.6264154509515272e-06, + "loss": 0.2547, + "step": 23422 + }, + { + "epoch": 0.5927322418199762, + "grad_norm": 7.169971942901611, + "learning_rate": 3.626029381359789e-06, + "loss": 0.1876, + "step": 23423 + }, + { + "epoch": 0.5927575473846699, + "grad_norm": 6.191318988800049, + "learning_rate": 3.625643320628531e-06, + "loss": 0.16, + "step": 23424 + }, + { + "epoch": 0.5927828529493636, + "grad_norm": 11.028752326965332, + "learning_rate": 3.6252572687602437e-06, + "loss": 0.2156, + "step": 23425 + }, + { + "epoch": 0.5928081585140572, + "grad_norm": 19.276172637939453, + "learning_rate": 3.6248712257574185e-06, + "loss": 0.2112, + "step": 23426 + }, + { + "epoch": 0.5928334640787509, + "grad_norm": 7.3174519538879395, + "learning_rate": 3.6244851916225408e-06, + "loss": 0.2733, + "step": 23427 + }, + { + "epoch": 0.5928587696434446, + "grad_norm": 3.8971550464630127, + "learning_rate": 3.6240991663581026e-06, + "loss": 0.1408, + "step": 23428 + }, + { + "epoch": 0.5928840752081382, + "grad_norm": 5.655721187591553, + "learning_rate": 3.6237131499665924e-06, + "loss": 0.1931, + "step": 23429 + }, + { + "epoch": 0.592909380772832, + "grad_norm": 3.132817506790161, + "learning_rate": 3.6233271424505e-06, + "loss": 0.1594, + "step": 23430 + }, + { + "epoch": 0.5929346863375257, + "grad_norm": 6.087430953979492, + "learning_rate": 3.6229411438123172e-06, + "loss": 0.1515, + "step": 23431 + }, + { + "epoch": 0.5929599919022193, + "grad_norm": 4.0304274559021, + "learning_rate": 3.622555154054529e-06, + "loss": 0.1511, + "step": 23432 + }, + { + "epoch": 0.592985297466913, + "grad_norm": 3.4356443881988525, + "learning_rate": 3.622169173179626e-06, + "loss": 0.1649, + "step": 23433 + }, + { + "epoch": 0.5930106030316067, + "grad_norm": 2.455317974090576, + "learning_rate": 3.6217832011900976e-06, + "loss": 0.1189, + "step": 23434 + }, + { + "epoch": 0.5930359085963003, + "grad_norm": 9.87608814239502, + "learning_rate": 3.6213972380884354e-06, + "loss": 0.2859, + "step": 23435 + }, + { + "epoch": 0.593061214160994, + "grad_norm": 6.241396903991699, + "learning_rate": 3.621011283877124e-06, + "loss": 0.2373, + "step": 23436 + }, + { + "epoch": 0.5930865197256877, + "grad_norm": 5.932445526123047, + "learning_rate": 3.620625338558654e-06, + "loss": 0.1601, + "step": 23437 + }, + { + "epoch": 0.5931118252903813, + "grad_norm": 6.590907573699951, + "learning_rate": 3.6202394021355148e-06, + "loss": 0.2474, + "step": 23438 + }, + { + "epoch": 0.593137130855075, + "grad_norm": 5.357494354248047, + "learning_rate": 3.6198534746101948e-06, + "loss": 0.1789, + "step": 23439 + }, + { + "epoch": 0.5931624364197687, + "grad_norm": 6.683258533477783, + "learning_rate": 3.6194675559851857e-06, + "loss": 0.1424, + "step": 23440 + }, + { + "epoch": 0.5931877419844623, + "grad_norm": 3.0680348873138428, + "learning_rate": 3.6190816462629706e-06, + "loss": 0.0948, + "step": 23441 + }, + { + "epoch": 0.593213047549156, + "grad_norm": 4.136325359344482, + "learning_rate": 3.6186957454460415e-06, + "loss": 0.1543, + "step": 23442 + }, + { + "epoch": 0.5932383531138498, + "grad_norm": 4.387162685394287, + "learning_rate": 3.6183098535368865e-06, + "loss": 0.0823, + "step": 23443 + }, + { + "epoch": 0.5932636586785434, + "grad_norm": 4.933238506317139, + "learning_rate": 3.6179239705379964e-06, + "loss": 0.1927, + "step": 23444 + }, + { + "epoch": 0.5932889642432371, + "grad_norm": 7.499841690063477, + "learning_rate": 3.617538096451856e-06, + "loss": 0.2213, + "step": 23445 + }, + { + "epoch": 0.5933142698079308, + "grad_norm": 4.166747093200684, + "learning_rate": 3.6171522312809544e-06, + "loss": 0.1074, + "step": 23446 + }, + { + "epoch": 0.5933395753726245, + "grad_norm": 6.422246932983398, + "learning_rate": 3.616766375027782e-06, + "loss": 0.2647, + "step": 23447 + }, + { + "epoch": 0.5933648809373181, + "grad_norm": 6.834933757781982, + "learning_rate": 3.616380527694826e-06, + "loss": 0.1752, + "step": 23448 + }, + { + "epoch": 0.5933901865020118, + "grad_norm": 8.947943687438965, + "learning_rate": 3.6159946892845737e-06, + "loss": 0.2326, + "step": 23449 + }, + { + "epoch": 0.5934154920667055, + "grad_norm": 5.2138848304748535, + "learning_rate": 3.6156088597995155e-06, + "loss": 0.1851, + "step": 23450 + }, + { + "epoch": 0.5934407976313991, + "grad_norm": 3.5317208766937256, + "learning_rate": 3.6152230392421366e-06, + "loss": 0.1922, + "step": 23451 + }, + { + "epoch": 0.5934661031960928, + "grad_norm": 3.5088400840759277, + "learning_rate": 3.614837227614927e-06, + "loss": 0.109, + "step": 23452 + }, + { + "epoch": 0.5934914087607865, + "grad_norm": 4.275959014892578, + "learning_rate": 3.6144514249203757e-06, + "loss": 0.1404, + "step": 23453 + }, + { + "epoch": 0.5935167143254801, + "grad_norm": 6.439408779144287, + "learning_rate": 3.614065631160969e-06, + "loss": 0.1885, + "step": 23454 + }, + { + "epoch": 0.5935420198901739, + "grad_norm": 3.331815242767334, + "learning_rate": 3.613679846339195e-06, + "loss": 0.1199, + "step": 23455 + }, + { + "epoch": 0.5935673254548676, + "grad_norm": 4.106883525848389, + "learning_rate": 3.6132940704575414e-06, + "loss": 0.1576, + "step": 23456 + }, + { + "epoch": 0.5935926310195612, + "grad_norm": 3.217848300933838, + "learning_rate": 3.612908303518498e-06, + "loss": 0.1826, + "step": 23457 + }, + { + "epoch": 0.5936179365842549, + "grad_norm": 2.7690842151641846, + "learning_rate": 3.6125225455245494e-06, + "loss": 0.0622, + "step": 23458 + }, + { + "epoch": 0.5936432421489486, + "grad_norm": 15.743728637695312, + "learning_rate": 3.6121367964781846e-06, + "loss": 0.2711, + "step": 23459 + }, + { + "epoch": 0.5936685477136422, + "grad_norm": 4.474565505981445, + "learning_rate": 3.6117510563818914e-06, + "loss": 0.1398, + "step": 23460 + }, + { + "epoch": 0.5936938532783359, + "grad_norm": 5.065019130706787, + "learning_rate": 3.6113653252381575e-06, + "loss": 0.207, + "step": 23461 + }, + { + "epoch": 0.5937191588430296, + "grad_norm": 59.010963439941406, + "learning_rate": 3.6109796030494727e-06, + "loss": 0.1379, + "step": 23462 + }, + { + "epoch": 0.5937444644077232, + "grad_norm": 22.151371002197266, + "learning_rate": 3.610593889818319e-06, + "loss": 0.1587, + "step": 23463 + }, + { + "epoch": 0.5937697699724169, + "grad_norm": 4.883368968963623, + "learning_rate": 3.6102081855471876e-06, + "loss": 0.1847, + "step": 23464 + }, + { + "epoch": 0.5937950755371106, + "grad_norm": 3.241497278213501, + "learning_rate": 3.609822490238565e-06, + "loss": 0.118, + "step": 23465 + }, + { + "epoch": 0.5938203811018042, + "grad_norm": 3.452129602432251, + "learning_rate": 3.6094368038949406e-06, + "loss": 0.103, + "step": 23466 + }, + { + "epoch": 0.593845686666498, + "grad_norm": 7.360969066619873, + "learning_rate": 3.6090511265187978e-06, + "loss": 0.1531, + "step": 23467 + }, + { + "epoch": 0.5938709922311917, + "grad_norm": 4.795736312866211, + "learning_rate": 3.6086654581126246e-06, + "loss": 0.1998, + "step": 23468 + }, + { + "epoch": 0.5938962977958853, + "grad_norm": 12.101784706115723, + "learning_rate": 3.6082797986789096e-06, + "loss": 0.4331, + "step": 23469 + }, + { + "epoch": 0.593921603360579, + "grad_norm": 6.224659442901611, + "learning_rate": 3.6078941482201402e-06, + "loss": 0.2607, + "step": 23470 + }, + { + "epoch": 0.5939469089252727, + "grad_norm": 6.050477981567383, + "learning_rate": 3.6075085067388037e-06, + "loss": 0.162, + "step": 23471 + }, + { + "epoch": 0.5939722144899663, + "grad_norm": 4.0735368728637695, + "learning_rate": 3.607122874237384e-06, + "loss": 0.2007, + "step": 23472 + }, + { + "epoch": 0.59399752005466, + "grad_norm": 5.202553749084473, + "learning_rate": 3.6067372507183694e-06, + "loss": 0.1569, + "step": 23473 + }, + { + "epoch": 0.5940228256193537, + "grad_norm": 5.675119400024414, + "learning_rate": 3.606351636184248e-06, + "loss": 0.1811, + "step": 23474 + }, + { + "epoch": 0.5940481311840474, + "grad_norm": 3.7967689037323, + "learning_rate": 3.6059660306375067e-06, + "loss": 0.0675, + "step": 23475 + }, + { + "epoch": 0.594073436748741, + "grad_norm": 4.688441276550293, + "learning_rate": 3.605580434080629e-06, + "loss": 0.1777, + "step": 23476 + }, + { + "epoch": 0.5940987423134347, + "grad_norm": 3.803612470626831, + "learning_rate": 3.605194846516104e-06, + "loss": 0.1513, + "step": 23477 + }, + { + "epoch": 0.5941240478781284, + "grad_norm": 6.535318374633789, + "learning_rate": 3.6048092679464196e-06, + "loss": 0.2584, + "step": 23478 + }, + { + "epoch": 0.594149353442822, + "grad_norm": 3.9875504970550537, + "learning_rate": 3.60442369837406e-06, + "loss": 0.0865, + "step": 23479 + }, + { + "epoch": 0.5941746590075158, + "grad_norm": 6.801259994506836, + "learning_rate": 3.6040381378015116e-06, + "loss": 0.2545, + "step": 23480 + }, + { + "epoch": 0.5941999645722095, + "grad_norm": 5.927832126617432, + "learning_rate": 3.6036525862312625e-06, + "loss": 0.1717, + "step": 23481 + }, + { + "epoch": 0.5942252701369031, + "grad_norm": 9.171624183654785, + "learning_rate": 3.6032670436657974e-06, + "loss": 0.2966, + "step": 23482 + }, + { + "epoch": 0.5942505757015968, + "grad_norm": 9.301865577697754, + "learning_rate": 3.602881510107603e-06, + "loss": 0.2324, + "step": 23483 + }, + { + "epoch": 0.5942758812662905, + "grad_norm": 7.460535526275635, + "learning_rate": 3.602495985559168e-06, + "loss": 0.1906, + "step": 23484 + }, + { + "epoch": 0.5943011868309841, + "grad_norm": 5.919363498687744, + "learning_rate": 3.6021104700229748e-06, + "loss": 0.0967, + "step": 23485 + }, + { + "epoch": 0.5943264923956778, + "grad_norm": 6.781350135803223, + "learning_rate": 3.6017249635015106e-06, + "loss": 0.2278, + "step": 23486 + }, + { + "epoch": 0.5943517979603715, + "grad_norm": 5.703932285308838, + "learning_rate": 3.6013394659972623e-06, + "loss": 0.182, + "step": 23487 + }, + { + "epoch": 0.5943771035250651, + "grad_norm": 4.2225799560546875, + "learning_rate": 3.6009539775127177e-06, + "loss": 0.1689, + "step": 23488 + }, + { + "epoch": 0.5944024090897588, + "grad_norm": 3.7519514560699463, + "learning_rate": 3.6005684980503584e-06, + "loss": 0.1811, + "step": 23489 + }, + { + "epoch": 0.5944277146544525, + "grad_norm": 8.643875122070312, + "learning_rate": 3.6001830276126727e-06, + "loss": 0.1734, + "step": 23490 + }, + { + "epoch": 0.5944530202191461, + "grad_norm": 2.361262798309326, + "learning_rate": 3.5997975662021458e-06, + "loss": 0.0733, + "step": 23491 + }, + { + "epoch": 0.5944783257838399, + "grad_norm": 4.302562713623047, + "learning_rate": 3.5994121138212645e-06, + "loss": 0.1516, + "step": 23492 + }, + { + "epoch": 0.5945036313485336, + "grad_norm": 5.99908971786499, + "learning_rate": 3.599026670472516e-06, + "loss": 0.2279, + "step": 23493 + }, + { + "epoch": 0.5945289369132272, + "grad_norm": 3.3390913009643555, + "learning_rate": 3.598641236158381e-06, + "loss": 0.1392, + "step": 23494 + }, + { + "epoch": 0.5945542424779209, + "grad_norm": 2.721682548522949, + "learning_rate": 3.598255810881349e-06, + "loss": 0.1708, + "step": 23495 + }, + { + "epoch": 0.5945795480426146, + "grad_norm": 4.530557632446289, + "learning_rate": 3.597870394643904e-06, + "loss": 0.1386, + "step": 23496 + }, + { + "epoch": 0.5946048536073082, + "grad_norm": 4.675607681274414, + "learning_rate": 3.5974849874485346e-06, + "loss": 0.1364, + "step": 23497 + }, + { + "epoch": 0.5946301591720019, + "grad_norm": 2.9069323539733887, + "learning_rate": 3.597099589297721e-06, + "loss": 0.1197, + "step": 23498 + }, + { + "epoch": 0.5946554647366956, + "grad_norm": 6.1080169677734375, + "learning_rate": 3.5967142001939516e-06, + "loss": 0.2579, + "step": 23499 + }, + { + "epoch": 0.5946807703013893, + "grad_norm": 7.168817043304443, + "learning_rate": 3.5963288201397106e-06, + "loss": 0.1812, + "step": 23500 + }, + { + "epoch": 0.5947060758660829, + "grad_norm": 6.552645683288574, + "learning_rate": 3.5959434491374867e-06, + "loss": 0.1556, + "step": 23501 + }, + { + "epoch": 0.5947313814307766, + "grad_norm": 4.692849636077881, + "learning_rate": 3.59555808718976e-06, + "loss": 0.1834, + "step": 23502 + }, + { + "epoch": 0.5947566869954704, + "grad_norm": 4.8995537757873535, + "learning_rate": 3.595172734299018e-06, + "loss": 0.1873, + "step": 23503 + }, + { + "epoch": 0.594781992560164, + "grad_norm": 4.128993511199951, + "learning_rate": 3.594787390467746e-06, + "loss": 0.1713, + "step": 23504 + }, + { + "epoch": 0.5948072981248577, + "grad_norm": 3.7660202980041504, + "learning_rate": 3.5944020556984293e-06, + "loss": 0.1702, + "step": 23505 + }, + { + "epoch": 0.5948326036895514, + "grad_norm": 10.422843933105469, + "learning_rate": 3.5940167299935525e-06, + "loss": 0.2834, + "step": 23506 + }, + { + "epoch": 0.594857909254245, + "grad_norm": 4.05970573425293, + "learning_rate": 3.5936314133555994e-06, + "loss": 0.107, + "step": 23507 + }, + { + "epoch": 0.5948832148189387, + "grad_norm": 3.835287570953369, + "learning_rate": 3.5932461057870567e-06, + "loss": 0.1801, + "step": 23508 + }, + { + "epoch": 0.5949085203836324, + "grad_norm": 8.643096923828125, + "learning_rate": 3.592860807290407e-06, + "loss": 0.163, + "step": 23509 + }, + { + "epoch": 0.594933825948326, + "grad_norm": 5.096972942352295, + "learning_rate": 3.592475517868137e-06, + "loss": 0.1509, + "step": 23510 + }, + { + "epoch": 0.5949591315130197, + "grad_norm": 6.391273021697998, + "learning_rate": 3.5920902375227306e-06, + "loss": 0.2617, + "step": 23511 + }, + { + "epoch": 0.5949844370777134, + "grad_norm": 4.0293145179748535, + "learning_rate": 3.5917049662566716e-06, + "loss": 0.084, + "step": 23512 + }, + { + "epoch": 0.595009742642407, + "grad_norm": 19.803560256958008, + "learning_rate": 3.5913197040724456e-06, + "loss": 0.2358, + "step": 23513 + }, + { + "epoch": 0.5950350482071007, + "grad_norm": 2.8411192893981934, + "learning_rate": 3.590934450972536e-06, + "loss": 0.1486, + "step": 23514 + }, + { + "epoch": 0.5950603537717944, + "grad_norm": 4.245142459869385, + "learning_rate": 3.5905492069594306e-06, + "loss": 0.1648, + "step": 23515 + }, + { + "epoch": 0.595085659336488, + "grad_norm": 2.233731508255005, + "learning_rate": 3.59016397203561e-06, + "loss": 0.0873, + "step": 23516 + }, + { + "epoch": 0.5951109649011818, + "grad_norm": 5.4057793617248535, + "learning_rate": 3.5897787462035582e-06, + "loss": 0.1348, + "step": 23517 + }, + { + "epoch": 0.5951362704658755, + "grad_norm": 4.407087326049805, + "learning_rate": 3.589393529465761e-06, + "loss": 0.2102, + "step": 23518 + }, + { + "epoch": 0.5951615760305691, + "grad_norm": 3.4996273517608643, + "learning_rate": 3.589008321824706e-06, + "loss": 0.1692, + "step": 23519 + }, + { + "epoch": 0.5951868815952628, + "grad_norm": 6.635528087615967, + "learning_rate": 3.5886231232828704e-06, + "loss": 0.1394, + "step": 23520 + }, + { + "epoch": 0.5952121871599565, + "grad_norm": 4.170602798461914, + "learning_rate": 3.588237933842742e-06, + "loss": 0.1694, + "step": 23521 + }, + { + "epoch": 0.5952374927246501, + "grad_norm": 5.202952861785889, + "learning_rate": 3.5878527535068052e-06, + "loss": 0.192, + "step": 23522 + }, + { + "epoch": 0.5952627982893438, + "grad_norm": 2.753822088241577, + "learning_rate": 3.587467582277545e-06, + "loss": 0.1034, + "step": 23523 + }, + { + "epoch": 0.5952881038540375, + "grad_norm": 6.056541919708252, + "learning_rate": 3.5870824201574415e-06, + "loss": 0.2032, + "step": 23524 + }, + { + "epoch": 0.5953134094187312, + "grad_norm": 8.114750862121582, + "learning_rate": 3.5866972671489806e-06, + "loss": 0.1552, + "step": 23525 + }, + { + "epoch": 0.5953387149834248, + "grad_norm": 8.302555084228516, + "learning_rate": 3.586312123254646e-06, + "loss": 0.2382, + "step": 23526 + }, + { + "epoch": 0.5953640205481185, + "grad_norm": 11.558412551879883, + "learning_rate": 3.5859269884769222e-06, + "loss": 0.2613, + "step": 23527 + }, + { + "epoch": 0.5953893261128123, + "grad_norm": 2.6912636756896973, + "learning_rate": 3.585541862818294e-06, + "loss": 0.1315, + "step": 23528 + }, + { + "epoch": 0.5954146316775059, + "grad_norm": 6.963647842407227, + "learning_rate": 3.5851567462812408e-06, + "loss": 0.2449, + "step": 23529 + }, + { + "epoch": 0.5954399372421996, + "grad_norm": 4.45793342590332, + "learning_rate": 3.5847716388682484e-06, + "loss": 0.1435, + "step": 23530 + }, + { + "epoch": 0.5954652428068933, + "grad_norm": 4.217544078826904, + "learning_rate": 3.5843865405818013e-06, + "loss": 0.1576, + "step": 23531 + }, + { + "epoch": 0.5954905483715869, + "grad_norm": 4.69087028503418, + "learning_rate": 3.584001451424383e-06, + "loss": 0.1995, + "step": 23532 + }, + { + "epoch": 0.5955158539362806, + "grad_norm": 3.8005340099334717, + "learning_rate": 3.583616371398475e-06, + "loss": 0.1651, + "step": 23533 + }, + { + "epoch": 0.5955411595009743, + "grad_norm": 16.25648307800293, + "learning_rate": 3.583231300506561e-06, + "loss": 0.1975, + "step": 23534 + }, + { + "epoch": 0.5955664650656679, + "grad_norm": 4.1284637451171875, + "learning_rate": 3.5828462387511256e-06, + "loss": 0.1536, + "step": 23535 + }, + { + "epoch": 0.5955917706303616, + "grad_norm": 13.716992378234863, + "learning_rate": 3.5824611861346503e-06, + "loss": 0.2303, + "step": 23536 + }, + { + "epoch": 0.5956170761950553, + "grad_norm": 3.9134693145751953, + "learning_rate": 3.5820761426596206e-06, + "loss": 0.1493, + "step": 23537 + }, + { + "epoch": 0.5956423817597489, + "grad_norm": 4.741818904876709, + "learning_rate": 3.5816911083285165e-06, + "loss": 0.1375, + "step": 23538 + }, + { + "epoch": 0.5956676873244426, + "grad_norm": 6.137277126312256, + "learning_rate": 3.581306083143824e-06, + "loss": 0.2238, + "step": 23539 + }, + { + "epoch": 0.5956929928891364, + "grad_norm": 9.173930168151855, + "learning_rate": 3.580921067108024e-06, + "loss": 0.197, + "step": 23540 + }, + { + "epoch": 0.59571829845383, + "grad_norm": 6.16721248626709, + "learning_rate": 3.5805360602236004e-06, + "loss": 0.1734, + "step": 23541 + }, + { + "epoch": 0.5957436040185237, + "grad_norm": 4.974643707275391, + "learning_rate": 3.5801510624930365e-06, + "loss": 0.1375, + "step": 23542 + }, + { + "epoch": 0.5957689095832174, + "grad_norm": 6.3727288246154785, + "learning_rate": 3.5797660739188123e-06, + "loss": 0.2086, + "step": 23543 + }, + { + "epoch": 0.595794215147911, + "grad_norm": 2.4955692291259766, + "learning_rate": 3.579381094503413e-06, + "loss": 0.1034, + "step": 23544 + }, + { + "epoch": 0.5958195207126047, + "grad_norm": 4.266746520996094, + "learning_rate": 3.578996124249321e-06, + "loss": 0.0962, + "step": 23545 + }, + { + "epoch": 0.5958448262772984, + "grad_norm": 2.826315402984619, + "learning_rate": 3.5786111631590206e-06, + "loss": 0.0853, + "step": 23546 + }, + { + "epoch": 0.595870131841992, + "grad_norm": 4.690458297729492, + "learning_rate": 3.5782262112349896e-06, + "loss": 0.1779, + "step": 23547 + }, + { + "epoch": 0.5958954374066857, + "grad_norm": 10.351252555847168, + "learning_rate": 3.577841268479714e-06, + "loss": 0.166, + "step": 23548 + }, + { + "epoch": 0.5959207429713794, + "grad_norm": 4.228285312652588, + "learning_rate": 3.5774563348956758e-06, + "loss": 0.2041, + "step": 23549 + }, + { + "epoch": 0.5959460485360731, + "grad_norm": 4.540450572967529, + "learning_rate": 3.5770714104853586e-06, + "loss": 0.1994, + "step": 23550 + }, + { + "epoch": 0.5959713541007667, + "grad_norm": 14.899633407592773, + "learning_rate": 3.5766864952512416e-06, + "loss": 0.3105, + "step": 23551 + }, + { + "epoch": 0.5959966596654604, + "grad_norm": 9.391077995300293, + "learning_rate": 3.576301589195808e-06, + "loss": 0.2507, + "step": 23552 + }, + { + "epoch": 0.5960219652301542, + "grad_norm": 4.91790771484375, + "learning_rate": 3.5759166923215404e-06, + "loss": 0.1962, + "step": 23553 + }, + { + "epoch": 0.5960472707948478, + "grad_norm": 5.296914100646973, + "learning_rate": 3.575531804630924e-06, + "loss": 0.1965, + "step": 23554 + }, + { + "epoch": 0.5960725763595415, + "grad_norm": 3.6728692054748535, + "learning_rate": 3.575146926126436e-06, + "loss": 0.1903, + "step": 23555 + }, + { + "epoch": 0.5960978819242352, + "grad_norm": 3.525869369506836, + "learning_rate": 3.57476205681056e-06, + "loss": 0.1625, + "step": 23556 + }, + { + "epoch": 0.5961231874889288, + "grad_norm": 5.052183151245117, + "learning_rate": 3.574377196685779e-06, + "loss": 0.1752, + "step": 23557 + }, + { + "epoch": 0.5961484930536225, + "grad_norm": 5.360556125640869, + "learning_rate": 3.5739923457545735e-06, + "loss": 0.1832, + "step": 23558 + }, + { + "epoch": 0.5961737986183162, + "grad_norm": 4.570870399475098, + "learning_rate": 3.5736075040194285e-06, + "loss": 0.133, + "step": 23559 + }, + { + "epoch": 0.5961991041830098, + "grad_norm": 3.4779410362243652, + "learning_rate": 3.573222671482821e-06, + "loss": 0.122, + "step": 23560 + }, + { + "epoch": 0.5962244097477035, + "grad_norm": 9.973662376403809, + "learning_rate": 3.572837848147236e-06, + "loss": 0.2625, + "step": 23561 + }, + { + "epoch": 0.5962497153123972, + "grad_norm": 9.833687782287598, + "learning_rate": 3.572453034015153e-06, + "loss": 0.1592, + "step": 23562 + }, + { + "epoch": 0.5962750208770908, + "grad_norm": 18.6918888092041, + "learning_rate": 3.5720682290890574e-06, + "loss": 0.2643, + "step": 23563 + }, + { + "epoch": 0.5963003264417845, + "grad_norm": 4.977566242218018, + "learning_rate": 3.571683433371427e-06, + "loss": 0.151, + "step": 23564 + }, + { + "epoch": 0.5963256320064783, + "grad_norm": 4.642258644104004, + "learning_rate": 3.571298646864743e-06, + "loss": 0.1345, + "step": 23565 + }, + { + "epoch": 0.5963509375711719, + "grad_norm": 4.296823024749756, + "learning_rate": 3.57091386957149e-06, + "loss": 0.1503, + "step": 23566 + }, + { + "epoch": 0.5963762431358656, + "grad_norm": 4.07375431060791, + "learning_rate": 3.5705291014941467e-06, + "loss": 0.1568, + "step": 23567 + }, + { + "epoch": 0.5964015487005593, + "grad_norm": 3.9829108715057373, + "learning_rate": 3.570144342635196e-06, + "loss": 0.2, + "step": 23568 + }, + { + "epoch": 0.5964268542652529, + "grad_norm": 5.928484916687012, + "learning_rate": 3.569759592997119e-06, + "loss": 0.1675, + "step": 23569 + }, + { + "epoch": 0.5964521598299466, + "grad_norm": 4.774477005004883, + "learning_rate": 3.5693748525823948e-06, + "loss": 0.1917, + "step": 23570 + }, + { + "epoch": 0.5964774653946403, + "grad_norm": 9.623761177062988, + "learning_rate": 3.568990121393506e-06, + "loss": 0.1539, + "step": 23571 + }, + { + "epoch": 0.5965027709593339, + "grad_norm": 3.262470006942749, + "learning_rate": 3.568605399432936e-06, + "loss": 0.1139, + "step": 23572 + }, + { + "epoch": 0.5965280765240276, + "grad_norm": 3.706437349319458, + "learning_rate": 3.568220686703162e-06, + "loss": 0.1594, + "step": 23573 + }, + { + "epoch": 0.5965533820887213, + "grad_norm": 11.295928001403809, + "learning_rate": 3.5678359832066656e-06, + "loss": 0.2011, + "step": 23574 + }, + { + "epoch": 0.596578687653415, + "grad_norm": 15.418648719787598, + "learning_rate": 3.5674512889459288e-06, + "loss": 0.1544, + "step": 23575 + }, + { + "epoch": 0.5966039932181086, + "grad_norm": 6.140204906463623, + "learning_rate": 3.5670666039234347e-06, + "loss": 0.1626, + "step": 23576 + }, + { + "epoch": 0.5966292987828024, + "grad_norm": 4.67622709274292, + "learning_rate": 3.5666819281416592e-06, + "loss": 0.1323, + "step": 23577 + }, + { + "epoch": 0.5966546043474961, + "grad_norm": 11.926615715026855, + "learning_rate": 3.5662972616030855e-06, + "loss": 0.2511, + "step": 23578 + }, + { + "epoch": 0.5966799099121897, + "grad_norm": 7.035459041595459, + "learning_rate": 3.5659126043101932e-06, + "loss": 0.1507, + "step": 23579 + }, + { + "epoch": 0.5967052154768834, + "grad_norm": 4.778909206390381, + "learning_rate": 3.565527956265464e-06, + "loss": 0.0465, + "step": 23580 + }, + { + "epoch": 0.5967305210415771, + "grad_norm": 3.977252960205078, + "learning_rate": 3.5651433174713813e-06, + "loss": 0.2018, + "step": 23581 + }, + { + "epoch": 0.5967558266062707, + "grad_norm": 5.60927677154541, + "learning_rate": 3.56475868793042e-06, + "loss": 0.1897, + "step": 23582 + }, + { + "epoch": 0.5967811321709644, + "grad_norm": 7.856189250946045, + "learning_rate": 3.5643740676450624e-06, + "loss": 0.1548, + "step": 23583 + }, + { + "epoch": 0.5968064377356581, + "grad_norm": 4.036411762237549, + "learning_rate": 3.563989456617789e-06, + "loss": 0.1561, + "step": 23584 + }, + { + "epoch": 0.5968317433003517, + "grad_norm": 7.494706630706787, + "learning_rate": 3.5636048548510836e-06, + "loss": 0.1258, + "step": 23585 + }, + { + "epoch": 0.5968570488650454, + "grad_norm": 4.884125232696533, + "learning_rate": 3.5632202623474204e-06, + "loss": 0.1163, + "step": 23586 + }, + { + "epoch": 0.5968823544297391, + "grad_norm": 2.581897020339966, + "learning_rate": 3.5628356791092833e-06, + "loss": 0.0756, + "step": 23587 + }, + { + "epoch": 0.5969076599944327, + "grad_norm": 9.099409103393555, + "learning_rate": 3.5624511051391507e-06, + "loss": 0.1442, + "step": 23588 + }, + { + "epoch": 0.5969329655591264, + "grad_norm": 5.08591890335083, + "learning_rate": 3.5620665404395037e-06, + "loss": 0.2137, + "step": 23589 + }, + { + "epoch": 0.5969582711238202, + "grad_norm": 5.583108901977539, + "learning_rate": 3.561681985012825e-06, + "loss": 0.1671, + "step": 23590 + }, + { + "epoch": 0.5969835766885138, + "grad_norm": 8.933545112609863, + "learning_rate": 3.5612974388615883e-06, + "loss": 0.1744, + "step": 23591 + }, + { + "epoch": 0.5970088822532075, + "grad_norm": 7.469383239746094, + "learning_rate": 3.5609129019882784e-06, + "loss": 0.2339, + "step": 23592 + }, + { + "epoch": 0.5970341878179012, + "grad_norm": 6.374026298522949, + "learning_rate": 3.5605283743953732e-06, + "loss": 0.1598, + "step": 23593 + }, + { + "epoch": 0.5970594933825948, + "grad_norm": 13.197155952453613, + "learning_rate": 3.560143856085353e-06, + "loss": 0.1602, + "step": 23594 + }, + { + "epoch": 0.5970847989472885, + "grad_norm": 4.878120422363281, + "learning_rate": 3.559759347060696e-06, + "loss": 0.1438, + "step": 23595 + }, + { + "epoch": 0.5971101045119822, + "grad_norm": 4.032230377197266, + "learning_rate": 3.5593748473238847e-06, + "loss": 0.1239, + "step": 23596 + }, + { + "epoch": 0.5971354100766758, + "grad_norm": 6.09214973449707, + "learning_rate": 3.558990356877396e-06, + "loss": 0.1192, + "step": 23597 + }, + { + "epoch": 0.5971607156413695, + "grad_norm": 3.136295795440674, + "learning_rate": 3.5586058757237104e-06, + "loss": 0.1314, + "step": 23598 + }, + { + "epoch": 0.5971860212060632, + "grad_norm": 8.357037544250488, + "learning_rate": 3.558221403865308e-06, + "loss": 0.2846, + "step": 23599 + }, + { + "epoch": 0.5972113267707568, + "grad_norm": 4.552150249481201, + "learning_rate": 3.5578369413046677e-06, + "loss": 0.1324, + "step": 23600 + }, + { + "epoch": 0.5972366323354505, + "grad_norm": 4.414459228515625, + "learning_rate": 3.5574524880442675e-06, + "loss": 0.1539, + "step": 23601 + }, + { + "epoch": 0.5972619379001443, + "grad_norm": 3.8462471961975098, + "learning_rate": 3.5570680440865883e-06, + "loss": 0.1259, + "step": 23602 + }, + { + "epoch": 0.597287243464838, + "grad_norm": 8.268929481506348, + "learning_rate": 3.5566836094341107e-06, + "loss": 0.3119, + "step": 23603 + }, + { + "epoch": 0.5973125490295316, + "grad_norm": 9.807512283325195, + "learning_rate": 3.55629918408931e-06, + "loss": 0.1691, + "step": 23604 + }, + { + "epoch": 0.5973378545942253, + "grad_norm": 5.659704208374023, + "learning_rate": 3.5559147680546674e-06, + "loss": 0.1867, + "step": 23605 + }, + { + "epoch": 0.597363160158919, + "grad_norm": 3.8136589527130127, + "learning_rate": 3.555530361332662e-06, + "loss": 0.094, + "step": 23606 + }, + { + "epoch": 0.5973884657236126, + "grad_norm": 5.084554195404053, + "learning_rate": 3.5551459639257744e-06, + "loss": 0.1379, + "step": 23607 + }, + { + "epoch": 0.5974137712883063, + "grad_norm": 4.008883476257324, + "learning_rate": 3.5547615758364796e-06, + "loss": 0.1892, + "step": 23608 + }, + { + "epoch": 0.597439076853, + "grad_norm": 3.8601372241973877, + "learning_rate": 3.554377197067259e-06, + "loss": 0.1443, + "step": 23609 + }, + { + "epoch": 0.5974643824176936, + "grad_norm": 6.721932411193848, + "learning_rate": 3.5539928276205902e-06, + "loss": 0.1407, + "step": 23610 + }, + { + "epoch": 0.5974896879823873, + "grad_norm": 11.477511405944824, + "learning_rate": 3.5536084674989533e-06, + "loss": 0.2273, + "step": 23611 + }, + { + "epoch": 0.597514993547081, + "grad_norm": 3.6073615550994873, + "learning_rate": 3.5532241167048283e-06, + "loss": 0.1431, + "step": 23612 + }, + { + "epoch": 0.5975402991117746, + "grad_norm": 3.930311441421509, + "learning_rate": 3.5528397752406896e-06, + "loss": 0.1345, + "step": 23613 + }, + { + "epoch": 0.5975656046764684, + "grad_norm": 5.402971267700195, + "learning_rate": 3.552455443109018e-06, + "loss": 0.1642, + "step": 23614 + }, + { + "epoch": 0.5975909102411621, + "grad_norm": 4.3971099853515625, + "learning_rate": 3.5520711203122923e-06, + "loss": 0.1576, + "step": 23615 + }, + { + "epoch": 0.5976162158058557, + "grad_norm": 4.499819755554199, + "learning_rate": 3.551686806852992e-06, + "loss": 0.1372, + "step": 23616 + }, + { + "epoch": 0.5976415213705494, + "grad_norm": 3.8861923217773438, + "learning_rate": 3.5513025027335925e-06, + "loss": 0.1399, + "step": 23617 + }, + { + "epoch": 0.5976668269352431, + "grad_norm": 7.340961456298828, + "learning_rate": 3.550918207956574e-06, + "loss": 0.2002, + "step": 23618 + }, + { + "epoch": 0.5976921324999367, + "grad_norm": 10.857114791870117, + "learning_rate": 3.550533922524414e-06, + "loss": 0.2034, + "step": 23619 + }, + { + "epoch": 0.5977174380646304, + "grad_norm": 5.274970531463623, + "learning_rate": 3.5501496464395923e-06, + "loss": 0.1283, + "step": 23620 + }, + { + "epoch": 0.5977427436293241, + "grad_norm": 3.5955066680908203, + "learning_rate": 3.5497653797045855e-06, + "loss": 0.1001, + "step": 23621 + }, + { + "epoch": 0.5977680491940177, + "grad_norm": 6.60120153427124, + "learning_rate": 3.549381122321871e-06, + "loss": 0.2705, + "step": 23622 + }, + { + "epoch": 0.5977933547587114, + "grad_norm": 13.044398307800293, + "learning_rate": 3.5489968742939286e-06, + "loss": 0.1819, + "step": 23623 + }, + { + "epoch": 0.5978186603234051, + "grad_norm": 4.064718723297119, + "learning_rate": 3.5486126356232354e-06, + "loss": 0.1625, + "step": 23624 + }, + { + "epoch": 0.5978439658880987, + "grad_norm": 3.2259933948516846, + "learning_rate": 3.5482284063122702e-06, + "loss": 0.1551, + "step": 23625 + }, + { + "epoch": 0.5978692714527925, + "grad_norm": 8.133465766906738, + "learning_rate": 3.547844186363508e-06, + "loss": 0.2686, + "step": 23626 + }, + { + "epoch": 0.5978945770174862, + "grad_norm": 3.758789300918579, + "learning_rate": 3.5474599757794305e-06, + "loss": 0.179, + "step": 23627 + }, + { + "epoch": 0.5979198825821799, + "grad_norm": 6.574077606201172, + "learning_rate": 3.547075774562512e-06, + "loss": 0.2487, + "step": 23628 + }, + { + "epoch": 0.5979451881468735, + "grad_norm": 3.5891811847686768, + "learning_rate": 3.5466915827152325e-06, + "loss": 0.1075, + "step": 23629 + }, + { + "epoch": 0.5979704937115672, + "grad_norm": 8.696531295776367, + "learning_rate": 3.5463074002400686e-06, + "loss": 0.1863, + "step": 23630 + }, + { + "epoch": 0.5979957992762609, + "grad_norm": 4.052464962005615, + "learning_rate": 3.5459232271394973e-06, + "loss": 0.149, + "step": 23631 + }, + { + "epoch": 0.5980211048409545, + "grad_norm": 2.567434072494507, + "learning_rate": 3.545539063415996e-06, + "loss": 0.1358, + "step": 23632 + }, + { + "epoch": 0.5980464104056482, + "grad_norm": 4.2971649169921875, + "learning_rate": 3.5451549090720433e-06, + "loss": 0.1332, + "step": 23633 + }, + { + "epoch": 0.5980717159703419, + "grad_norm": 5.402487277984619, + "learning_rate": 3.5447707641101183e-06, + "loss": 0.215, + "step": 23634 + }, + { + "epoch": 0.5980970215350355, + "grad_norm": 3.0602104663848877, + "learning_rate": 3.5443866285326928e-06, + "loss": 0.1273, + "step": 23635 + }, + { + "epoch": 0.5981223270997292, + "grad_norm": 5.162906646728516, + "learning_rate": 3.5440025023422473e-06, + "loss": 0.224, + "step": 23636 + }, + { + "epoch": 0.598147632664423, + "grad_norm": 6.25315523147583, + "learning_rate": 3.54361838554126e-06, + "loss": 0.1757, + "step": 23637 + }, + { + "epoch": 0.5981729382291165, + "grad_norm": 9.182601928710938, + "learning_rate": 3.543234278132208e-06, + "loss": 0.2277, + "step": 23638 + }, + { + "epoch": 0.5981982437938103, + "grad_norm": 2.857957363128662, + "learning_rate": 3.5428501801175653e-06, + "loss": 0.1475, + "step": 23639 + }, + { + "epoch": 0.598223549358504, + "grad_norm": 2.6331629753112793, + "learning_rate": 3.5424660914998103e-06, + "loss": 0.0753, + "step": 23640 + }, + { + "epoch": 0.5982488549231976, + "grad_norm": 9.977216720581055, + "learning_rate": 3.542082012281421e-06, + "loss": 0.1888, + "step": 23641 + }, + { + "epoch": 0.5982741604878913, + "grad_norm": 4.804173469543457, + "learning_rate": 3.5416979424648736e-06, + "loss": 0.1649, + "step": 23642 + }, + { + "epoch": 0.598299466052585, + "grad_norm": 8.143853187561035, + "learning_rate": 3.5413138820526465e-06, + "loss": 0.135, + "step": 23643 + }, + { + "epoch": 0.5983247716172786, + "grad_norm": 10.820686340332031, + "learning_rate": 3.540929831047213e-06, + "loss": 0.1766, + "step": 23644 + }, + { + "epoch": 0.5983500771819723, + "grad_norm": 10.136056900024414, + "learning_rate": 3.5405457894510516e-06, + "loss": 0.1645, + "step": 23645 + }, + { + "epoch": 0.598375382746666, + "grad_norm": 4.952520370483398, + "learning_rate": 3.5401617572666393e-06, + "loss": 0.1522, + "step": 23646 + }, + { + "epoch": 0.5984006883113596, + "grad_norm": 7.192953586578369, + "learning_rate": 3.5397777344964535e-06, + "loss": 0.1462, + "step": 23647 + }, + { + "epoch": 0.5984259938760533, + "grad_norm": 4.464796543121338, + "learning_rate": 3.539393721142968e-06, + "loss": 0.1642, + "step": 23648 + }, + { + "epoch": 0.598451299440747, + "grad_norm": 8.4368314743042, + "learning_rate": 3.53900971720866e-06, + "loss": 0.179, + "step": 23649 + }, + { + "epoch": 0.5984766050054406, + "grad_norm": 3.6648807525634766, + "learning_rate": 3.5386257226960076e-06, + "loss": 0.137, + "step": 23650 + }, + { + "epoch": 0.5985019105701344, + "grad_norm": 4.1286940574646, + "learning_rate": 3.5382417376074873e-06, + "loss": 0.1187, + "step": 23651 + }, + { + "epoch": 0.5985272161348281, + "grad_norm": 4.516447067260742, + "learning_rate": 3.537857761945573e-06, + "loss": 0.1723, + "step": 23652 + }, + { + "epoch": 0.5985525216995218, + "grad_norm": 4.192836761474609, + "learning_rate": 3.5374737957127413e-06, + "loss": 0.2199, + "step": 23653 + }, + { + "epoch": 0.5985778272642154, + "grad_norm": 3.0057520866394043, + "learning_rate": 3.5370898389114703e-06, + "loss": 0.1384, + "step": 23654 + }, + { + "epoch": 0.5986031328289091, + "grad_norm": 7.366636753082275, + "learning_rate": 3.5367058915442337e-06, + "loss": 0.1609, + "step": 23655 + }, + { + "epoch": 0.5986284383936028, + "grad_norm": 8.933847427368164, + "learning_rate": 3.5363219536135096e-06, + "loss": 0.1518, + "step": 23656 + }, + { + "epoch": 0.5986537439582964, + "grad_norm": 3.4508626461029053, + "learning_rate": 3.535938025121773e-06, + "loss": 0.0967, + "step": 23657 + }, + { + "epoch": 0.5986790495229901, + "grad_norm": 14.4706449508667, + "learning_rate": 3.5355541060714987e-06, + "loss": 0.2074, + "step": 23658 + }, + { + "epoch": 0.5987043550876838, + "grad_norm": 3.4636776447296143, + "learning_rate": 3.5351701964651637e-06, + "loss": 0.1239, + "step": 23659 + }, + { + "epoch": 0.5987296606523774, + "grad_norm": 4.454617977142334, + "learning_rate": 3.5347862963052443e-06, + "loss": 0.1383, + "step": 23660 + }, + { + "epoch": 0.5987549662170711, + "grad_norm": 4.305019855499268, + "learning_rate": 3.534402405594216e-06, + "loss": 0.1107, + "step": 23661 + }, + { + "epoch": 0.5987802717817649, + "grad_norm": 4.02247953414917, + "learning_rate": 3.5340185243345524e-06, + "loss": 0.1472, + "step": 23662 + }, + { + "epoch": 0.5988055773464585, + "grad_norm": 5.115341663360596, + "learning_rate": 3.5336346525287314e-06, + "loss": 0.1796, + "step": 23663 + }, + { + "epoch": 0.5988308829111522, + "grad_norm": 7.376775741577148, + "learning_rate": 3.533250790179227e-06, + "loss": 0.2523, + "step": 23664 + }, + { + "epoch": 0.5988561884758459, + "grad_norm": 2.267437219619751, + "learning_rate": 3.5328669372885175e-06, + "loss": 0.1067, + "step": 23665 + }, + { + "epoch": 0.5988814940405395, + "grad_norm": 3.978809356689453, + "learning_rate": 3.532483093859075e-06, + "loss": 0.189, + "step": 23666 + }, + { + "epoch": 0.5989067996052332, + "grad_norm": 8.751043319702148, + "learning_rate": 3.532099259893375e-06, + "loss": 0.3077, + "step": 23667 + }, + { + "epoch": 0.5989321051699269, + "grad_norm": 4.1584391593933105, + "learning_rate": 3.5317154353938942e-06, + "loss": 0.1825, + "step": 23668 + }, + { + "epoch": 0.5989574107346205, + "grad_norm": 10.575662612915039, + "learning_rate": 3.53133162036311e-06, + "loss": 0.1307, + "step": 23669 + }, + { + "epoch": 0.5989827162993142, + "grad_norm": 4.307985782623291, + "learning_rate": 3.5309478148034924e-06, + "loss": 0.1638, + "step": 23670 + }, + { + "epoch": 0.5990080218640079, + "grad_norm": 12.713019371032715, + "learning_rate": 3.5305640187175193e-06, + "loss": 0.1908, + "step": 23671 + }, + { + "epoch": 0.5990333274287015, + "grad_norm": 5.9537034034729, + "learning_rate": 3.530180232107665e-06, + "loss": 0.1943, + "step": 23672 + }, + { + "epoch": 0.5990586329933952, + "grad_norm": 5.693894863128662, + "learning_rate": 3.529796454976405e-06, + "loss": 0.1942, + "step": 23673 + }, + { + "epoch": 0.599083938558089, + "grad_norm": 5.997905254364014, + "learning_rate": 3.529412687326217e-06, + "loss": 0.1236, + "step": 23674 + }, + { + "epoch": 0.5991092441227825, + "grad_norm": 3.0880188941955566, + "learning_rate": 3.5290289291595707e-06, + "loss": 0.115, + "step": 23675 + }, + { + "epoch": 0.5991345496874763, + "grad_norm": 5.2542243003845215, + "learning_rate": 3.5286451804789433e-06, + "loss": 0.1709, + "step": 23676 + }, + { + "epoch": 0.59915985525217, + "grad_norm": 14.098751068115234, + "learning_rate": 3.5282614412868087e-06, + "loss": 0.1575, + "step": 23677 + }, + { + "epoch": 0.5991851608168637, + "grad_norm": 3.534573554992676, + "learning_rate": 3.5278777115856456e-06, + "loss": 0.1261, + "step": 23678 + }, + { + "epoch": 0.5992104663815573, + "grad_norm": 5.188008785247803, + "learning_rate": 3.527493991377922e-06, + "loss": 0.1376, + "step": 23679 + }, + { + "epoch": 0.599235771946251, + "grad_norm": 3.6412787437438965, + "learning_rate": 3.527110280666116e-06, + "loss": 0.1318, + "step": 23680 + }, + { + "epoch": 0.5992610775109447, + "grad_norm": 4.856938362121582, + "learning_rate": 3.526726579452704e-06, + "loss": 0.0954, + "step": 23681 + }, + { + "epoch": 0.5992863830756383, + "grad_norm": 3.9111852645874023, + "learning_rate": 3.526342887740157e-06, + "loss": 0.0984, + "step": 23682 + }, + { + "epoch": 0.599311688640332, + "grad_norm": 3.819627046585083, + "learning_rate": 3.52595920553095e-06, + "loss": 0.1019, + "step": 23683 + }, + { + "epoch": 0.5993369942050257, + "grad_norm": 10.60462474822998, + "learning_rate": 3.5255755328275587e-06, + "loss": 0.2019, + "step": 23684 + }, + { + "epoch": 0.5993622997697193, + "grad_norm": 3.9397456645965576, + "learning_rate": 3.525191869632456e-06, + "loss": 0.1288, + "step": 23685 + }, + { + "epoch": 0.599387605334413, + "grad_norm": 6.528722286224365, + "learning_rate": 3.5248082159481157e-06, + "loss": 0.2351, + "step": 23686 + }, + { + "epoch": 0.5994129108991068, + "grad_norm": 9.470988273620605, + "learning_rate": 3.5244245717770146e-06, + "loss": 0.2496, + "step": 23687 + }, + { + "epoch": 0.5994382164638004, + "grad_norm": 12.412271499633789, + "learning_rate": 3.524040937121625e-06, + "loss": 0.1502, + "step": 23688 + }, + { + "epoch": 0.5994635220284941, + "grad_norm": 6.382791519165039, + "learning_rate": 3.5236573119844197e-06, + "loss": 0.1552, + "step": 23689 + }, + { + "epoch": 0.5994888275931878, + "grad_norm": 3.628566265106201, + "learning_rate": 3.5232736963678737e-06, + "loss": 0.2122, + "step": 23690 + }, + { + "epoch": 0.5995141331578814, + "grad_norm": 4.038302898406982, + "learning_rate": 3.5228900902744624e-06, + "loss": 0.1565, + "step": 23691 + }, + { + "epoch": 0.5995394387225751, + "grad_norm": 6.822010040283203, + "learning_rate": 3.5225064937066566e-06, + "loss": 0.1894, + "step": 23692 + }, + { + "epoch": 0.5995647442872688, + "grad_norm": 6.443451404571533, + "learning_rate": 3.5221229066669317e-06, + "loss": 0.1933, + "step": 23693 + }, + { + "epoch": 0.5995900498519624, + "grad_norm": 4.8913092613220215, + "learning_rate": 3.521739329157761e-06, + "loss": 0.1096, + "step": 23694 + }, + { + "epoch": 0.5996153554166561, + "grad_norm": 3.0038528442382812, + "learning_rate": 3.5213557611816185e-06, + "loss": 0.1162, + "step": 23695 + }, + { + "epoch": 0.5996406609813498, + "grad_norm": 2.8675355911254883, + "learning_rate": 3.5209722027409798e-06, + "loss": 0.1071, + "step": 23696 + }, + { + "epoch": 0.5996659665460434, + "grad_norm": 6.74245023727417, + "learning_rate": 3.520588653838315e-06, + "loss": 0.2054, + "step": 23697 + }, + { + "epoch": 0.5996912721107371, + "grad_norm": 4.432939052581787, + "learning_rate": 3.5202051144760972e-06, + "loss": 0.1423, + "step": 23698 + }, + { + "epoch": 0.5997165776754309, + "grad_norm": 9.999747276306152, + "learning_rate": 3.519821584656802e-06, + "loss": 0.1188, + "step": 23699 + }, + { + "epoch": 0.5997418832401245, + "grad_norm": 4.944174289703369, + "learning_rate": 3.5194380643829048e-06, + "loss": 0.0861, + "step": 23700 + }, + { + "epoch": 0.5997671888048182, + "grad_norm": 5.382908821105957, + "learning_rate": 3.5190545536568733e-06, + "loss": 0.1142, + "step": 23701 + }, + { + "epoch": 0.5997924943695119, + "grad_norm": 3.937344551086426, + "learning_rate": 3.518671052481184e-06, + "loss": 0.1143, + "step": 23702 + }, + { + "epoch": 0.5998177999342056, + "grad_norm": 7.862762451171875, + "learning_rate": 3.518287560858309e-06, + "loss": 0.3535, + "step": 23703 + }, + { + "epoch": 0.5998431054988992, + "grad_norm": 3.659841775894165, + "learning_rate": 3.517904078790724e-06, + "loss": 0.1239, + "step": 23704 + }, + { + "epoch": 0.5998684110635929, + "grad_norm": 9.409466743469238, + "learning_rate": 3.517520606280898e-06, + "loss": 0.2487, + "step": 23705 + }, + { + "epoch": 0.5998937166282866, + "grad_norm": 3.2737162113189697, + "learning_rate": 3.5171371433313063e-06, + "loss": 0.165, + "step": 23706 + }, + { + "epoch": 0.5999190221929802, + "grad_norm": 4.080836772918701, + "learning_rate": 3.5167536899444205e-06, + "loss": 0.1843, + "step": 23707 + }, + { + "epoch": 0.5999443277576739, + "grad_norm": 33.859275817871094, + "learning_rate": 3.5163702461227158e-06, + "loss": 0.1884, + "step": 23708 + }, + { + "epoch": 0.5999696333223676, + "grad_norm": 6.851672649383545, + "learning_rate": 3.5159868118686636e-06, + "loss": 0.1565, + "step": 23709 + }, + { + "epoch": 0.5999949388870612, + "grad_norm": 10.549356460571289, + "learning_rate": 3.515603387184735e-06, + "loss": 0.3145, + "step": 23710 + }, + { + "epoch": 0.600020244451755, + "grad_norm": 3.8855152130126953, + "learning_rate": 3.5152199720734037e-06, + "loss": 0.1543, + "step": 23711 + }, + { + "epoch": 0.6000455500164487, + "grad_norm": 4.309970378875732, + "learning_rate": 3.5148365665371443e-06, + "loss": 0.1887, + "step": 23712 + }, + { + "epoch": 0.6000455500164487, + "eval_loss": 0.17702995240688324, + "eval_runtime": 69.858, + "eval_samples_per_second": 45.721, + "eval_steps_per_second": 5.726, + "step": 23712 + }, + { + "epoch": 0.6000708555811423, + "grad_norm": 3.82440185546875, + "learning_rate": 3.5144531705784267e-06, + "loss": 0.1545, + "step": 23713 + }, + { + "epoch": 0.600096161145836, + "grad_norm": 7.611006736755371, + "learning_rate": 3.514069784199724e-06, + "loss": 0.2603, + "step": 23714 + }, + { + "epoch": 0.6001214667105297, + "grad_norm": 12.44852352142334, + "learning_rate": 3.5136864074035102e-06, + "loss": 0.3271, + "step": 23715 + }, + { + "epoch": 0.6001467722752233, + "grad_norm": 3.8889756202697754, + "learning_rate": 3.513303040192255e-06, + "loss": 0.1468, + "step": 23716 + }, + { + "epoch": 0.600172077839917, + "grad_norm": 3.9554035663604736, + "learning_rate": 3.512919682568432e-06, + "loss": 0.1419, + "step": 23717 + }, + { + "epoch": 0.6001973834046107, + "grad_norm": 6.347926139831543, + "learning_rate": 3.5125363345345148e-06, + "loss": 0.1582, + "step": 23718 + }, + { + "epoch": 0.6002226889693043, + "grad_norm": 3.012542486190796, + "learning_rate": 3.5121529960929722e-06, + "loss": 0.1098, + "step": 23719 + }, + { + "epoch": 0.600247994533998, + "grad_norm": 9.079597473144531, + "learning_rate": 3.511769667246279e-06, + "loss": 0.1306, + "step": 23720 + }, + { + "epoch": 0.6002733000986917, + "grad_norm": 12.332294464111328, + "learning_rate": 3.5113863479969058e-06, + "loss": 0.1707, + "step": 23721 + }, + { + "epoch": 0.6002986056633853, + "grad_norm": 3.649574041366577, + "learning_rate": 3.511003038347327e-06, + "loss": 0.0881, + "step": 23722 + }, + { + "epoch": 0.600323911228079, + "grad_norm": 3.825488328933716, + "learning_rate": 3.5106197383000107e-06, + "loss": 0.1484, + "step": 23723 + }, + { + "epoch": 0.6003492167927728, + "grad_norm": 4.729172229766846, + "learning_rate": 3.5102364478574303e-06, + "loss": 0.1787, + "step": 23724 + }, + { + "epoch": 0.6003745223574664, + "grad_norm": 3.2996015548706055, + "learning_rate": 3.509853167022058e-06, + "loss": 0.1602, + "step": 23725 + }, + { + "epoch": 0.6003998279221601, + "grad_norm": 5.258082389831543, + "learning_rate": 3.509469895796366e-06, + "loss": 0.2659, + "step": 23726 + }, + { + "epoch": 0.6004251334868538, + "grad_norm": 8.494038581848145, + "learning_rate": 3.509086634182828e-06, + "loss": 0.2637, + "step": 23727 + }, + { + "epoch": 0.6004504390515474, + "grad_norm": 5.806909084320068, + "learning_rate": 3.50870338218391e-06, + "loss": 0.1014, + "step": 23728 + }, + { + "epoch": 0.6004757446162411, + "grad_norm": 4.750024795532227, + "learning_rate": 3.5083201398020862e-06, + "loss": 0.145, + "step": 23729 + }, + { + "epoch": 0.6005010501809348, + "grad_norm": 7.396256923675537, + "learning_rate": 3.5079369070398284e-06, + "loss": 0.1496, + "step": 23730 + }, + { + "epoch": 0.6005263557456285, + "grad_norm": 14.283984184265137, + "learning_rate": 3.5075536838996103e-06, + "loss": 0.2201, + "step": 23731 + }, + { + "epoch": 0.6005516613103221, + "grad_norm": 3.7451202869415283, + "learning_rate": 3.5071704703838994e-06, + "loss": 0.1554, + "step": 23732 + }, + { + "epoch": 0.6005769668750158, + "grad_norm": 3.5652945041656494, + "learning_rate": 3.5067872664951676e-06, + "loss": 0.1627, + "step": 23733 + }, + { + "epoch": 0.6006022724397095, + "grad_norm": 5.422129154205322, + "learning_rate": 3.506404072235887e-06, + "loss": 0.2212, + "step": 23734 + }, + { + "epoch": 0.6006275780044031, + "grad_norm": 2.6902008056640625, + "learning_rate": 3.506020887608532e-06, + "loss": 0.1024, + "step": 23735 + }, + { + "epoch": 0.6006528835690969, + "grad_norm": 13.74271297454834, + "learning_rate": 3.505637712615567e-06, + "loss": 0.2328, + "step": 23736 + }, + { + "epoch": 0.6006781891337906, + "grad_norm": 4.662304401397705, + "learning_rate": 3.5052545472594666e-06, + "loss": 0.1782, + "step": 23737 + }, + { + "epoch": 0.6007034946984842, + "grad_norm": 6.809993267059326, + "learning_rate": 3.504871391542702e-06, + "loss": 0.2596, + "step": 23738 + }, + { + "epoch": 0.6007288002631779, + "grad_norm": 3.7840945720672607, + "learning_rate": 3.5044882454677444e-06, + "loss": 0.1642, + "step": 23739 + }, + { + "epoch": 0.6007541058278716, + "grad_norm": 53.52122116088867, + "learning_rate": 3.504105109037064e-06, + "loss": 0.3359, + "step": 23740 + }, + { + "epoch": 0.6007794113925652, + "grad_norm": 11.62736988067627, + "learning_rate": 3.5037219822531306e-06, + "loss": 0.2526, + "step": 23741 + }, + { + "epoch": 0.6008047169572589, + "grad_norm": 5.067817211151123, + "learning_rate": 3.5033388651184163e-06, + "loss": 0.1581, + "step": 23742 + }, + { + "epoch": 0.6008300225219526, + "grad_norm": 3.3985354900360107, + "learning_rate": 3.5029557576353913e-06, + "loss": 0.1611, + "step": 23743 + }, + { + "epoch": 0.6008553280866462, + "grad_norm": 4.2678046226501465, + "learning_rate": 3.5025726598065263e-06, + "loss": 0.0962, + "step": 23744 + }, + { + "epoch": 0.6008806336513399, + "grad_norm": 4.40059757232666, + "learning_rate": 3.502189571634292e-06, + "loss": 0.0979, + "step": 23745 + }, + { + "epoch": 0.6009059392160336, + "grad_norm": 4.360746383666992, + "learning_rate": 3.501806493121157e-06, + "loss": 0.1903, + "step": 23746 + }, + { + "epoch": 0.6009312447807272, + "grad_norm": 5.852043151855469, + "learning_rate": 3.501423424269594e-06, + "loss": 0.1835, + "step": 23747 + }, + { + "epoch": 0.600956550345421, + "grad_norm": 5.4398603439331055, + "learning_rate": 3.5010403650820722e-06, + "loss": 0.214, + "step": 23748 + }, + { + "epoch": 0.6009818559101147, + "grad_norm": 8.217690467834473, + "learning_rate": 3.500657315561064e-06, + "loss": 0.2119, + "step": 23749 + }, + { + "epoch": 0.6010071614748083, + "grad_norm": 9.760982513427734, + "learning_rate": 3.500274275709036e-06, + "loss": 0.2582, + "step": 23750 + }, + { + "epoch": 0.601032467039502, + "grad_norm": 5.618177890777588, + "learning_rate": 3.4998912455284605e-06, + "loss": 0.1196, + "step": 23751 + }, + { + "epoch": 0.6010577726041957, + "grad_norm": 7.166172027587891, + "learning_rate": 3.499508225021807e-06, + "loss": 0.1338, + "step": 23752 + }, + { + "epoch": 0.6010830781688893, + "grad_norm": 4.671570301055908, + "learning_rate": 3.4991252141915487e-06, + "loss": 0.2089, + "step": 23753 + }, + { + "epoch": 0.601108383733583, + "grad_norm": 3.847212791442871, + "learning_rate": 3.49874221304015e-06, + "loss": 0.1491, + "step": 23754 + }, + { + "epoch": 0.6011336892982767, + "grad_norm": 12.07954216003418, + "learning_rate": 3.4983592215700833e-06, + "loss": 0.2013, + "step": 23755 + }, + { + "epoch": 0.6011589948629704, + "grad_norm": 3.702265501022339, + "learning_rate": 3.497976239783819e-06, + "loss": 0.1775, + "step": 23756 + }, + { + "epoch": 0.601184300427664, + "grad_norm": 6.519339561462402, + "learning_rate": 3.4975932676838288e-06, + "loss": 0.1784, + "step": 23757 + }, + { + "epoch": 0.6012096059923577, + "grad_norm": 2.859161853790283, + "learning_rate": 3.4972103052725775e-06, + "loss": 0.1438, + "step": 23758 + }, + { + "epoch": 0.6012349115570514, + "grad_norm": 6.57568883895874, + "learning_rate": 3.4968273525525376e-06, + "loss": 0.2091, + "step": 23759 + }, + { + "epoch": 0.601260217121745, + "grad_norm": 4.988070487976074, + "learning_rate": 3.4964444095261783e-06, + "loss": 0.156, + "step": 23760 + }, + { + "epoch": 0.6012855226864388, + "grad_norm": 2.8450353145599365, + "learning_rate": 3.49606147619597e-06, + "loss": 0.1287, + "step": 23761 + }, + { + "epoch": 0.6013108282511325, + "grad_norm": 6.646664142608643, + "learning_rate": 3.495678552564383e-06, + "loss": 0.2074, + "step": 23762 + }, + { + "epoch": 0.6013361338158261, + "grad_norm": 5.337068557739258, + "learning_rate": 3.4952956386338833e-06, + "loss": 0.1851, + "step": 23763 + }, + { + "epoch": 0.6013614393805198, + "grad_norm": 6.675656318664551, + "learning_rate": 3.494912734406942e-06, + "loss": 0.2098, + "step": 23764 + }, + { + "epoch": 0.6013867449452135, + "grad_norm": 3.354407787322998, + "learning_rate": 3.4945298398860287e-06, + "loss": 0.1557, + "step": 23765 + }, + { + "epoch": 0.6014120505099071, + "grad_norm": 7.662546157836914, + "learning_rate": 3.4941469550736147e-06, + "loss": 0.1901, + "step": 23766 + }, + { + "epoch": 0.6014373560746008, + "grad_norm": 5.514279842376709, + "learning_rate": 3.493764079972164e-06, + "loss": 0.1998, + "step": 23767 + }, + { + "epoch": 0.6014626616392945, + "grad_norm": 9.03532600402832, + "learning_rate": 3.4933812145841496e-06, + "loss": 0.1596, + "step": 23768 + }, + { + "epoch": 0.6014879672039881, + "grad_norm": 6.2311296463012695, + "learning_rate": 3.49299835891204e-06, + "loss": 0.1944, + "step": 23769 + }, + { + "epoch": 0.6015132727686818, + "grad_norm": 3.258821487426758, + "learning_rate": 3.4926155129583028e-06, + "loss": 0.1128, + "step": 23770 + }, + { + "epoch": 0.6015385783333755, + "grad_norm": 3.759042739868164, + "learning_rate": 3.4922326767254085e-06, + "loss": 0.1591, + "step": 23771 + }, + { + "epoch": 0.6015638838980691, + "grad_norm": 4.936905384063721, + "learning_rate": 3.491849850215824e-06, + "loss": 0.1508, + "step": 23772 + }, + { + "epoch": 0.6015891894627629, + "grad_norm": 3.5821847915649414, + "learning_rate": 3.4914670334320212e-06, + "loss": 0.129, + "step": 23773 + }, + { + "epoch": 0.6016144950274566, + "grad_norm": 5.919861793518066, + "learning_rate": 3.4910842263764643e-06, + "loss": 0.1113, + "step": 23774 + }, + { + "epoch": 0.6016398005921502, + "grad_norm": 2.8030662536621094, + "learning_rate": 3.4907014290516274e-06, + "loss": 0.1533, + "step": 23775 + }, + { + "epoch": 0.6016651061568439, + "grad_norm": 6.6242828369140625, + "learning_rate": 3.490318641459975e-06, + "loss": 0.1785, + "step": 23776 + }, + { + "epoch": 0.6016904117215376, + "grad_norm": 5.307131290435791, + "learning_rate": 3.489935863603976e-06, + "loss": 0.1491, + "step": 23777 + }, + { + "epoch": 0.6017157172862312, + "grad_norm": 4.896978855133057, + "learning_rate": 3.4895530954861e-06, + "loss": 0.1186, + "step": 23778 + }, + { + "epoch": 0.6017410228509249, + "grad_norm": 4.246869087219238, + "learning_rate": 3.489170337108817e-06, + "loss": 0.1006, + "step": 23779 + }, + { + "epoch": 0.6017663284156186, + "grad_norm": 5.421510696411133, + "learning_rate": 3.4887875884745915e-06, + "loss": 0.1554, + "step": 23780 + }, + { + "epoch": 0.6017916339803123, + "grad_norm": 5.0377116203308105, + "learning_rate": 3.488404849585894e-06, + "loss": 0.1894, + "step": 23781 + }, + { + "epoch": 0.6018169395450059, + "grad_norm": 2.7947840690612793, + "learning_rate": 3.4880221204451926e-06, + "loss": 0.11, + "step": 23782 + }, + { + "epoch": 0.6018422451096996, + "grad_norm": 3.1808524131774902, + "learning_rate": 3.487639401054954e-06, + "loss": 0.1575, + "step": 23783 + }, + { + "epoch": 0.6018675506743933, + "grad_norm": 4.577351093292236, + "learning_rate": 3.487256691417651e-06, + "loss": 0.1733, + "step": 23784 + }, + { + "epoch": 0.601892856239087, + "grad_norm": 4.853430271148682, + "learning_rate": 3.4868739915357456e-06, + "loss": 0.2005, + "step": 23785 + }, + { + "epoch": 0.6019181618037807, + "grad_norm": 2.9829580783843994, + "learning_rate": 3.4864913014117085e-06, + "loss": 0.1316, + "step": 23786 + }, + { + "epoch": 0.6019434673684744, + "grad_norm": 6.932286739349365, + "learning_rate": 3.486108621048008e-06, + "loss": 0.1406, + "step": 23787 + }, + { + "epoch": 0.601968772933168, + "grad_norm": 7.645427703857422, + "learning_rate": 3.485725950447112e-06, + "loss": 0.1871, + "step": 23788 + }, + { + "epoch": 0.6019940784978617, + "grad_norm": 3.3807287216186523, + "learning_rate": 3.4853432896114868e-06, + "loss": 0.1139, + "step": 23789 + }, + { + "epoch": 0.6020193840625554, + "grad_norm": 5.130550861358643, + "learning_rate": 3.4849606385436006e-06, + "loss": 0.2445, + "step": 23790 + }, + { + "epoch": 0.602044689627249, + "grad_norm": 3.000844955444336, + "learning_rate": 3.4845779972459214e-06, + "loss": 0.1094, + "step": 23791 + }, + { + "epoch": 0.6020699951919427, + "grad_norm": 3.444096088409424, + "learning_rate": 3.484195365720917e-06, + "loss": 0.1328, + "step": 23792 + }, + { + "epoch": 0.6020953007566364, + "grad_norm": 2.734164237976074, + "learning_rate": 3.4838127439710565e-06, + "loss": 0.1361, + "step": 23793 + }, + { + "epoch": 0.60212060632133, + "grad_norm": 2.827333688735962, + "learning_rate": 3.483430131998804e-06, + "loss": 0.1343, + "step": 23794 + }, + { + "epoch": 0.6021459118860237, + "grad_norm": 3.181847095489502, + "learning_rate": 3.483047529806628e-06, + "loss": 0.1388, + "step": 23795 + }, + { + "epoch": 0.6021712174507174, + "grad_norm": 6.146670341491699, + "learning_rate": 3.4826649373969966e-06, + "loss": 0.188, + "step": 23796 + }, + { + "epoch": 0.602196523015411, + "grad_norm": 2.9454755783081055, + "learning_rate": 3.4822823547723783e-06, + "loss": 0.1102, + "step": 23797 + }, + { + "epoch": 0.6022218285801048, + "grad_norm": 4.11296272277832, + "learning_rate": 3.4818997819352375e-06, + "loss": 0.1566, + "step": 23798 + }, + { + "epoch": 0.6022471341447985, + "grad_norm": 12.638096809387207, + "learning_rate": 3.481517218888042e-06, + "loss": 0.2999, + "step": 23799 + }, + { + "epoch": 0.6022724397094921, + "grad_norm": 2.8601977825164795, + "learning_rate": 3.481134665633261e-06, + "loss": 0.1285, + "step": 23800 + }, + { + "epoch": 0.6022977452741858, + "grad_norm": 10.609519958496094, + "learning_rate": 3.480752122173359e-06, + "loss": 0.3038, + "step": 23801 + }, + { + "epoch": 0.6023230508388795, + "grad_norm": 4.063299179077148, + "learning_rate": 3.4803695885108045e-06, + "loss": 0.1086, + "step": 23802 + }, + { + "epoch": 0.6023483564035731, + "grad_norm": 7.421655178070068, + "learning_rate": 3.479987064648064e-06, + "loss": 0.2192, + "step": 23803 + }, + { + "epoch": 0.6023736619682668, + "grad_norm": 2.818767547607422, + "learning_rate": 3.479604550587603e-06, + "loss": 0.0936, + "step": 23804 + }, + { + "epoch": 0.6023989675329605, + "grad_norm": 4.013876914978027, + "learning_rate": 3.4792220463318897e-06, + "loss": 0.1449, + "step": 23805 + }, + { + "epoch": 0.6024242730976542, + "grad_norm": 3.592573881149292, + "learning_rate": 3.4788395518833927e-06, + "loss": 0.1093, + "step": 23806 + }, + { + "epoch": 0.6024495786623478, + "grad_norm": 7.859402656555176, + "learning_rate": 3.478457067244574e-06, + "loss": 0.188, + "step": 23807 + }, + { + "epoch": 0.6024748842270415, + "grad_norm": 3.364569664001465, + "learning_rate": 3.478074592417903e-06, + "loss": 0.1852, + "step": 23808 + }, + { + "epoch": 0.6025001897917353, + "grad_norm": 3.654247283935547, + "learning_rate": 3.477692127405846e-06, + "loss": 0.1133, + "step": 23809 + }, + { + "epoch": 0.6025254953564289, + "grad_norm": 5.482690334320068, + "learning_rate": 3.477309672210871e-06, + "loss": 0.1451, + "step": 23810 + }, + { + "epoch": 0.6025508009211226, + "grad_norm": 5.0671186447143555, + "learning_rate": 3.476927226835441e-06, + "loss": 0.1798, + "step": 23811 + }, + { + "epoch": 0.6025761064858163, + "grad_norm": 4.47165584564209, + "learning_rate": 3.476544791282024e-06, + "loss": 0.1714, + "step": 23812 + }, + { + "epoch": 0.6026014120505099, + "grad_norm": 4.039312362670898, + "learning_rate": 3.476162365553086e-06, + "loss": 0.1428, + "step": 23813 + }, + { + "epoch": 0.6026267176152036, + "grad_norm": 5.259130954742432, + "learning_rate": 3.475779949651093e-06, + "loss": 0.1671, + "step": 23814 + }, + { + "epoch": 0.6026520231798973, + "grad_norm": 5.6363139152526855, + "learning_rate": 3.475397543578514e-06, + "loss": 0.1444, + "step": 23815 + }, + { + "epoch": 0.6026773287445909, + "grad_norm": 6.358561992645264, + "learning_rate": 3.475015147337811e-06, + "loss": 0.1348, + "step": 23816 + }, + { + "epoch": 0.6027026343092846, + "grad_norm": 8.350987434387207, + "learning_rate": 3.474632760931451e-06, + "loss": 0.1424, + "step": 23817 + }, + { + "epoch": 0.6027279398739783, + "grad_norm": 3.591844081878662, + "learning_rate": 3.474250384361901e-06, + "loss": 0.1824, + "step": 23818 + }, + { + "epoch": 0.6027532454386719, + "grad_norm": 6.30391788482666, + "learning_rate": 3.4738680176316285e-06, + "loss": 0.1514, + "step": 23819 + }, + { + "epoch": 0.6027785510033656, + "grad_norm": 7.564321041107178, + "learning_rate": 3.4734856607430956e-06, + "loss": 0.1983, + "step": 23820 + }, + { + "epoch": 0.6028038565680593, + "grad_norm": 5.67877721786499, + "learning_rate": 3.473103313698769e-06, + "loss": 0.1555, + "step": 23821 + }, + { + "epoch": 0.602829162132753, + "grad_norm": 6.428097724914551, + "learning_rate": 3.472720976501115e-06, + "loss": 0.2292, + "step": 23822 + }, + { + "epoch": 0.6028544676974467, + "grad_norm": 13.486612319946289, + "learning_rate": 3.4723386491526003e-06, + "loss": 0.1494, + "step": 23823 + }, + { + "epoch": 0.6028797732621404, + "grad_norm": 9.712054252624512, + "learning_rate": 3.4719563316556904e-06, + "loss": 0.3131, + "step": 23824 + }, + { + "epoch": 0.602905078826834, + "grad_norm": 5.149975299835205, + "learning_rate": 3.471574024012848e-06, + "loss": 0.2074, + "step": 23825 + }, + { + "epoch": 0.6029303843915277, + "grad_norm": 7.824385166168213, + "learning_rate": 3.4711917262265417e-06, + "loss": 0.1449, + "step": 23826 + }, + { + "epoch": 0.6029556899562214, + "grad_norm": 3.648913860321045, + "learning_rate": 3.470809438299235e-06, + "loss": 0.1429, + "step": 23827 + }, + { + "epoch": 0.602980995520915, + "grad_norm": 6.66398811340332, + "learning_rate": 3.470427160233395e-06, + "loss": 0.1703, + "step": 23828 + }, + { + "epoch": 0.6030063010856087, + "grad_norm": 5.898207664489746, + "learning_rate": 3.470044892031484e-06, + "loss": 0.1784, + "step": 23829 + }, + { + "epoch": 0.6030316066503024, + "grad_norm": 5.929736614227295, + "learning_rate": 3.46966263369597e-06, + "loss": 0.2744, + "step": 23830 + }, + { + "epoch": 0.6030569122149961, + "grad_norm": 3.9691169261932373, + "learning_rate": 3.4692803852293162e-06, + "loss": 0.1494, + "step": 23831 + }, + { + "epoch": 0.6030822177796897, + "grad_norm": 5.387345790863037, + "learning_rate": 3.4688981466339895e-06, + "loss": 0.1922, + "step": 23832 + }, + { + "epoch": 0.6031075233443834, + "grad_norm": 5.851308345794678, + "learning_rate": 3.4685159179124526e-06, + "loss": 0.2548, + "step": 23833 + }, + { + "epoch": 0.6031328289090772, + "grad_norm": 12.156903266906738, + "learning_rate": 3.4681336990671725e-06, + "loss": 0.3626, + "step": 23834 + }, + { + "epoch": 0.6031581344737708, + "grad_norm": 9.285623550415039, + "learning_rate": 3.4677514901006128e-06, + "loss": 0.2503, + "step": 23835 + }, + { + "epoch": 0.6031834400384645, + "grad_norm": 3.5464115142822266, + "learning_rate": 3.467369291015238e-06, + "loss": 0.1462, + "step": 23836 + }, + { + "epoch": 0.6032087456031582, + "grad_norm": 3.6297078132629395, + "learning_rate": 3.466987101813516e-06, + "loss": 0.1519, + "step": 23837 + }, + { + "epoch": 0.6032340511678518, + "grad_norm": 4.16476583480835, + "learning_rate": 3.466604922497907e-06, + "loss": 0.2037, + "step": 23838 + }, + { + "epoch": 0.6032593567325455, + "grad_norm": 7.438039779663086, + "learning_rate": 3.4662227530708774e-06, + "loss": 0.1913, + "step": 23839 + }, + { + "epoch": 0.6032846622972392, + "grad_norm": 3.435889720916748, + "learning_rate": 3.4658405935348925e-06, + "loss": 0.1524, + "step": 23840 + }, + { + "epoch": 0.6033099678619328, + "grad_norm": 11.201674461364746, + "learning_rate": 3.4654584438924176e-06, + "loss": 0.204, + "step": 23841 + }, + { + "epoch": 0.6033352734266265, + "grad_norm": 3.7092411518096924, + "learning_rate": 3.4650763041459146e-06, + "loss": 0.159, + "step": 23842 + }, + { + "epoch": 0.6033605789913202, + "grad_norm": 8.46837043762207, + "learning_rate": 3.464694174297848e-06, + "loss": 0.1399, + "step": 23843 + }, + { + "epoch": 0.6033858845560138, + "grad_norm": 6.339193344116211, + "learning_rate": 3.464312054350684e-06, + "loss": 0.2209, + "step": 23844 + }, + { + "epoch": 0.6034111901207075, + "grad_norm": 2.4106316566467285, + "learning_rate": 3.4639299443068853e-06, + "loss": 0.0862, + "step": 23845 + }, + { + "epoch": 0.6034364956854013, + "grad_norm": 4.6957526206970215, + "learning_rate": 3.4635478441689186e-06, + "loss": 0.1627, + "step": 23846 + }, + { + "epoch": 0.6034618012500949, + "grad_norm": 3.783747434616089, + "learning_rate": 3.463165753939244e-06, + "loss": 0.1157, + "step": 23847 + }, + { + "epoch": 0.6034871068147886, + "grad_norm": 5.12581729888916, + "learning_rate": 3.462783673620328e-06, + "loss": 0.2176, + "step": 23848 + }, + { + "epoch": 0.6035124123794823, + "grad_norm": 6.394246578216553, + "learning_rate": 3.4624016032146336e-06, + "loss": 0.1983, + "step": 23849 + }, + { + "epoch": 0.6035377179441759, + "grad_norm": 5.735058307647705, + "learning_rate": 3.462019542724628e-06, + "loss": 0.1942, + "step": 23850 + }, + { + "epoch": 0.6035630235088696, + "grad_norm": 4.053769111633301, + "learning_rate": 3.4616374921527697e-06, + "loss": 0.1615, + "step": 23851 + }, + { + "epoch": 0.6035883290735633, + "grad_norm": 7.183166027069092, + "learning_rate": 3.461255451501525e-06, + "loss": 0.187, + "step": 23852 + }, + { + "epoch": 0.6036136346382569, + "grad_norm": 7.330863952636719, + "learning_rate": 3.460873420773358e-06, + "loss": 0.2097, + "step": 23853 + }, + { + "epoch": 0.6036389402029506, + "grad_norm": 4.772772789001465, + "learning_rate": 3.4604913999707328e-06, + "loss": 0.187, + "step": 23854 + }, + { + "epoch": 0.6036642457676443, + "grad_norm": 3.3773159980773926, + "learning_rate": 3.460109389096112e-06, + "loss": 0.1582, + "step": 23855 + }, + { + "epoch": 0.6036895513323379, + "grad_norm": 14.061694145202637, + "learning_rate": 3.459727388151959e-06, + "loss": 0.1692, + "step": 23856 + }, + { + "epoch": 0.6037148568970316, + "grad_norm": 5.685466289520264, + "learning_rate": 3.4593453971407374e-06, + "loss": 0.1344, + "step": 23857 + }, + { + "epoch": 0.6037401624617253, + "grad_norm": 7.092688083648682, + "learning_rate": 3.458963416064911e-06, + "loss": 0.095, + "step": 23858 + }, + { + "epoch": 0.6037654680264191, + "grad_norm": 6.367641925811768, + "learning_rate": 3.458581444926944e-06, + "loss": 0.1397, + "step": 23859 + }, + { + "epoch": 0.6037907735911127, + "grad_norm": 3.5101473331451416, + "learning_rate": 3.4581994837292963e-06, + "loss": 0.1727, + "step": 23860 + }, + { + "epoch": 0.6038160791558064, + "grad_norm": 4.561603546142578, + "learning_rate": 3.4578175324744357e-06, + "loss": 0.1256, + "step": 23861 + }, + { + "epoch": 0.6038413847205001, + "grad_norm": 5.437526226043701, + "learning_rate": 3.4574355911648205e-06, + "loss": 0.2534, + "step": 23862 + }, + { + "epoch": 0.6038666902851937, + "grad_norm": 3.3119516372680664, + "learning_rate": 3.4570536598029182e-06, + "loss": 0.1554, + "step": 23863 + }, + { + "epoch": 0.6038919958498874, + "grad_norm": 5.389770030975342, + "learning_rate": 3.4566717383911895e-06, + "loss": 0.1612, + "step": 23864 + }, + { + "epoch": 0.6039173014145811, + "grad_norm": 6.575539588928223, + "learning_rate": 3.456289826932097e-06, + "loss": 0.2819, + "step": 23865 + }, + { + "epoch": 0.6039426069792747, + "grad_norm": 4.9962615966796875, + "learning_rate": 3.455907925428104e-06, + "loss": 0.191, + "step": 23866 + }, + { + "epoch": 0.6039679125439684, + "grad_norm": 7.387741565704346, + "learning_rate": 3.4555260338816733e-06, + "loss": 0.2328, + "step": 23867 + }, + { + "epoch": 0.6039932181086621, + "grad_norm": 4.936317443847656, + "learning_rate": 3.4551441522952704e-06, + "loss": 0.1614, + "step": 23868 + }, + { + "epoch": 0.6040185236733557, + "grad_norm": 3.9975168704986572, + "learning_rate": 3.454762280671353e-06, + "loss": 0.1274, + "step": 23869 + }, + { + "epoch": 0.6040438292380494, + "grad_norm": 3.634345054626465, + "learning_rate": 3.454380419012387e-06, + "loss": 0.1734, + "step": 23870 + }, + { + "epoch": 0.6040691348027432, + "grad_norm": 3.7335073947906494, + "learning_rate": 3.453998567320834e-06, + "loss": 0.2099, + "step": 23871 + }, + { + "epoch": 0.6040944403674368, + "grad_norm": 4.758062362670898, + "learning_rate": 3.4536167255991583e-06, + "loss": 0.1877, + "step": 23872 + }, + { + "epoch": 0.6041197459321305, + "grad_norm": 6.33427095413208, + "learning_rate": 3.4532348938498185e-06, + "loss": 0.1548, + "step": 23873 + }, + { + "epoch": 0.6041450514968242, + "grad_norm": 7.538880825042725, + "learning_rate": 3.4528530720752797e-06, + "loss": 0.2435, + "step": 23874 + }, + { + "epoch": 0.6041703570615178, + "grad_norm": 3.1206259727478027, + "learning_rate": 3.452471260278003e-06, + "loss": 0.1504, + "step": 23875 + }, + { + "epoch": 0.6041956626262115, + "grad_norm": 12.654583930969238, + "learning_rate": 3.4520894584604514e-06, + "loss": 0.1713, + "step": 23876 + }, + { + "epoch": 0.6042209681909052, + "grad_norm": 13.988015174865723, + "learning_rate": 3.451707666625089e-06, + "loss": 0.3739, + "step": 23877 + }, + { + "epoch": 0.6042462737555988, + "grad_norm": 9.887826919555664, + "learning_rate": 3.4513258847743746e-06, + "loss": 0.26, + "step": 23878 + }, + { + "epoch": 0.6042715793202925, + "grad_norm": 5.967809677124023, + "learning_rate": 3.4509441129107707e-06, + "loss": 0.1873, + "step": 23879 + }, + { + "epoch": 0.6042968848849862, + "grad_norm": 3.107001304626465, + "learning_rate": 3.4505623510367404e-06, + "loss": 0.1362, + "step": 23880 + }, + { + "epoch": 0.6043221904496798, + "grad_norm": 5.189818859100342, + "learning_rate": 3.450180599154747e-06, + "loss": 0.1934, + "step": 23881 + }, + { + "epoch": 0.6043474960143735, + "grad_norm": 14.980142593383789, + "learning_rate": 3.449798857267249e-06, + "loss": 0.2208, + "step": 23882 + }, + { + "epoch": 0.6043728015790673, + "grad_norm": 6.341464042663574, + "learning_rate": 3.4494171253767096e-06, + "loss": 0.1214, + "step": 23883 + }, + { + "epoch": 0.604398107143761, + "grad_norm": 4.214987754821777, + "learning_rate": 3.4490354034855906e-06, + "loss": 0.1605, + "step": 23884 + }, + { + "epoch": 0.6044234127084546, + "grad_norm": 3.5692012310028076, + "learning_rate": 3.448653691596356e-06, + "loss": 0.0976, + "step": 23885 + }, + { + "epoch": 0.6044487182731483, + "grad_norm": 3.288329601287842, + "learning_rate": 3.4482719897114632e-06, + "loss": 0.13, + "step": 23886 + }, + { + "epoch": 0.604474023837842, + "grad_norm": 9.745524406433105, + "learning_rate": 3.447890297833375e-06, + "loss": 0.2095, + "step": 23887 + }, + { + "epoch": 0.6044993294025356, + "grad_norm": 3.4439923763275146, + "learning_rate": 3.4475086159645554e-06, + "loss": 0.1428, + "step": 23888 + }, + { + "epoch": 0.6045246349672293, + "grad_norm": 5.021052360534668, + "learning_rate": 3.447126944107463e-06, + "loss": 0.1356, + "step": 23889 + }, + { + "epoch": 0.604549940531923, + "grad_norm": 5.998986721038818, + "learning_rate": 3.4467452822645607e-06, + "loss": 0.1493, + "step": 23890 + }, + { + "epoch": 0.6045752460966166, + "grad_norm": 7.754758358001709, + "learning_rate": 3.4463636304383088e-06, + "loss": 0.1686, + "step": 23891 + }, + { + "epoch": 0.6046005516613103, + "grad_norm": 3.7739665508270264, + "learning_rate": 3.4459819886311683e-06, + "loss": 0.1528, + "step": 23892 + }, + { + "epoch": 0.604625857226004, + "grad_norm": 2.8303565979003906, + "learning_rate": 3.4456003568456007e-06, + "loss": 0.1081, + "step": 23893 + }, + { + "epoch": 0.6046511627906976, + "grad_norm": 2.7579004764556885, + "learning_rate": 3.445218735084069e-06, + "loss": 0.1476, + "step": 23894 + }, + { + "epoch": 0.6046764683553913, + "grad_norm": 3.193833827972412, + "learning_rate": 3.444837123349032e-06, + "loss": 0.1423, + "step": 23895 + }, + { + "epoch": 0.6047017739200851, + "grad_norm": 7.82515811920166, + "learning_rate": 3.44445552164295e-06, + "loss": 0.2155, + "step": 23896 + }, + { + "epoch": 0.6047270794847787, + "grad_norm": 3.950550079345703, + "learning_rate": 3.4440739299682844e-06, + "loss": 0.1228, + "step": 23897 + }, + { + "epoch": 0.6047523850494724, + "grad_norm": 3.998483657836914, + "learning_rate": 3.4436923483274976e-06, + "loss": 0.1175, + "step": 23898 + }, + { + "epoch": 0.6047776906141661, + "grad_norm": 4.86492395401001, + "learning_rate": 3.4433107767230505e-06, + "loss": 0.1737, + "step": 23899 + }, + { + "epoch": 0.6048029961788597, + "grad_norm": 7.557657718658447, + "learning_rate": 3.442929215157401e-06, + "loss": 0.2457, + "step": 23900 + }, + { + "epoch": 0.6048283017435534, + "grad_norm": 3.6443417072296143, + "learning_rate": 3.4425476636330113e-06, + "loss": 0.116, + "step": 23901 + }, + { + "epoch": 0.6048536073082471, + "grad_norm": 3.9072067737579346, + "learning_rate": 3.4421661221523422e-06, + "loss": 0.0872, + "step": 23902 + }, + { + "epoch": 0.6048789128729407, + "grad_norm": 4.033535003662109, + "learning_rate": 3.441784590717856e-06, + "loss": 0.1556, + "step": 23903 + }, + { + "epoch": 0.6049042184376344, + "grad_norm": 8.992526054382324, + "learning_rate": 3.4414030693320088e-06, + "loss": 0.2245, + "step": 23904 + }, + { + "epoch": 0.6049295240023281, + "grad_norm": 6.775164604187012, + "learning_rate": 3.441021557997264e-06, + "loss": 0.1553, + "step": 23905 + }, + { + "epoch": 0.6049548295670217, + "grad_norm": 8.429767608642578, + "learning_rate": 3.44064005671608e-06, + "loss": 0.1917, + "step": 23906 + }, + { + "epoch": 0.6049801351317154, + "grad_norm": 6.506714344024658, + "learning_rate": 3.4402585654909194e-06, + "loss": 0.1764, + "step": 23907 + }, + { + "epoch": 0.6050054406964092, + "grad_norm": 4.876929759979248, + "learning_rate": 3.4398770843242423e-06, + "loss": 0.1082, + "step": 23908 + }, + { + "epoch": 0.6050307462611029, + "grad_norm": 6.007192134857178, + "learning_rate": 3.4394956132185065e-06, + "loss": 0.1809, + "step": 23909 + }, + { + "epoch": 0.6050560518257965, + "grad_norm": 5.880733489990234, + "learning_rate": 3.439114152176173e-06, + "loss": 0.1375, + "step": 23910 + }, + { + "epoch": 0.6050813573904902, + "grad_norm": 4.238916397094727, + "learning_rate": 3.4387327011997013e-06, + "loss": 0.1294, + "step": 23911 + }, + { + "epoch": 0.6051066629551839, + "grad_norm": 4.616065979003906, + "learning_rate": 3.438351260291555e-06, + "loss": 0.1448, + "step": 23912 + }, + { + "epoch": 0.6051319685198775, + "grad_norm": 3.985302448272705, + "learning_rate": 3.437969829454188e-06, + "loss": 0.1379, + "step": 23913 + }, + { + "epoch": 0.6051572740845712, + "grad_norm": 16.186100006103516, + "learning_rate": 3.4375884086900636e-06, + "loss": 0.3012, + "step": 23914 + }, + { + "epoch": 0.6051825796492649, + "grad_norm": 2.963054656982422, + "learning_rate": 3.4372069980016416e-06, + "loss": 0.1788, + "step": 23915 + }, + { + "epoch": 0.6052078852139585, + "grad_norm": 3.948363780975342, + "learning_rate": 3.436825597391381e-06, + "loss": 0.1614, + "step": 23916 + }, + { + "epoch": 0.6052331907786522, + "grad_norm": 9.763777732849121, + "learning_rate": 3.4364442068617403e-06, + "loss": 0.2609, + "step": 23917 + }, + { + "epoch": 0.6052584963433459, + "grad_norm": 14.961811065673828, + "learning_rate": 3.43606282641518e-06, + "loss": 0.3289, + "step": 23918 + }, + { + "epoch": 0.6052838019080395, + "grad_norm": 6.516406059265137, + "learning_rate": 3.4356814560541615e-06, + "loss": 0.1512, + "step": 23919 + }, + { + "epoch": 0.6053091074727333, + "grad_norm": 3.8876113891601562, + "learning_rate": 3.43530009578114e-06, + "loss": 0.2041, + "step": 23920 + }, + { + "epoch": 0.605334413037427, + "grad_norm": 3.815627336502075, + "learning_rate": 3.4349187455985788e-06, + "loss": 0.1431, + "step": 23921 + }, + { + "epoch": 0.6053597186021206, + "grad_norm": 12.459580421447754, + "learning_rate": 3.4345374055089355e-06, + "loss": 0.1729, + "step": 23922 + }, + { + "epoch": 0.6053850241668143, + "grad_norm": 9.974813461303711, + "learning_rate": 3.4341560755146688e-06, + "loss": 0.3533, + "step": 23923 + }, + { + "epoch": 0.605410329731508, + "grad_norm": 3.5777781009674072, + "learning_rate": 3.433774755618237e-06, + "loss": 0.1332, + "step": 23924 + }, + { + "epoch": 0.6054356352962016, + "grad_norm": 4.458280563354492, + "learning_rate": 3.4333934458221034e-06, + "loss": 0.1584, + "step": 23925 + }, + { + "epoch": 0.6054609408608953, + "grad_norm": 5.95943021774292, + "learning_rate": 3.4330121461287217e-06, + "loss": 0.1977, + "step": 23926 + }, + { + "epoch": 0.605486246425589, + "grad_norm": 3.839956760406494, + "learning_rate": 3.4326308565405532e-06, + "loss": 0.0973, + "step": 23927 + }, + { + "epoch": 0.6055115519902826, + "grad_norm": 9.03928279876709, + "learning_rate": 3.4322495770600573e-06, + "loss": 0.1218, + "step": 23928 + }, + { + "epoch": 0.6055368575549763, + "grad_norm": 5.142502784729004, + "learning_rate": 3.431868307689692e-06, + "loss": 0.2056, + "step": 23929 + }, + { + "epoch": 0.60556216311967, + "grad_norm": 4.67697286605835, + "learning_rate": 3.4314870484319175e-06, + "loss": 0.1542, + "step": 23930 + }, + { + "epoch": 0.6055874686843636, + "grad_norm": 7.6781110763549805, + "learning_rate": 3.4311057992891906e-06, + "loss": 0.2123, + "step": 23931 + }, + { + "epoch": 0.6056127742490574, + "grad_norm": 8.322168350219727, + "learning_rate": 3.4307245602639695e-06, + "loss": 0.1765, + "step": 23932 + }, + { + "epoch": 0.6056380798137511, + "grad_norm": 12.173029899597168, + "learning_rate": 3.4303433313587144e-06, + "loss": 0.2632, + "step": 23933 + }, + { + "epoch": 0.6056633853784448, + "grad_norm": 3.672062635421753, + "learning_rate": 3.429962112575884e-06, + "loss": 0.1353, + "step": 23934 + }, + { + "epoch": 0.6056886909431384, + "grad_norm": 3.567711114883423, + "learning_rate": 3.429580903917935e-06, + "loss": 0.1325, + "step": 23935 + }, + { + "epoch": 0.6057139965078321, + "grad_norm": 4.383408546447754, + "learning_rate": 3.429199705387327e-06, + "loss": 0.1365, + "step": 23936 + }, + { + "epoch": 0.6057393020725258, + "grad_norm": 13.433677673339844, + "learning_rate": 3.428818516986517e-06, + "loss": 0.2987, + "step": 23937 + }, + { + "epoch": 0.6057646076372194, + "grad_norm": 11.06205940246582, + "learning_rate": 3.4284373387179665e-06, + "loss": 0.2017, + "step": 23938 + }, + { + "epoch": 0.6057899132019131, + "grad_norm": 3.707841396331787, + "learning_rate": 3.428056170584129e-06, + "loss": 0.1272, + "step": 23939 + }, + { + "epoch": 0.6058152187666068, + "grad_norm": 5.25099515914917, + "learning_rate": 3.4276750125874654e-06, + "loss": 0.1643, + "step": 23940 + }, + { + "epoch": 0.6058405243313004, + "grad_norm": 8.041343688964844, + "learning_rate": 3.4272938647304325e-06, + "loss": 0.2794, + "step": 23941 + }, + { + "epoch": 0.6058658298959941, + "grad_norm": 6.526706218719482, + "learning_rate": 3.4269127270154896e-06, + "loss": 0.1865, + "step": 23942 + }, + { + "epoch": 0.6058911354606878, + "grad_norm": 20.40842056274414, + "learning_rate": 3.4265315994450956e-06, + "loss": 0.2982, + "step": 23943 + }, + { + "epoch": 0.6059164410253814, + "grad_norm": 5.059661388397217, + "learning_rate": 3.4261504820217044e-06, + "loss": 0.1437, + "step": 23944 + }, + { + "epoch": 0.6059417465900752, + "grad_norm": 5.174960613250732, + "learning_rate": 3.425769374747776e-06, + "loss": 0.1913, + "step": 23945 + }, + { + "epoch": 0.6059670521547689, + "grad_norm": 4.065957546234131, + "learning_rate": 3.4253882776257697e-06, + "loss": 0.1179, + "step": 23946 + }, + { + "epoch": 0.6059923577194625, + "grad_norm": 12.592955589294434, + "learning_rate": 3.4250071906581406e-06, + "loss": 0.1735, + "step": 23947 + }, + { + "epoch": 0.6060176632841562, + "grad_norm": 5.815926551818848, + "learning_rate": 3.4246261138473467e-06, + "loss": 0.196, + "step": 23948 + }, + { + "epoch": 0.6060429688488499, + "grad_norm": 3.4053726196289062, + "learning_rate": 3.4242450471958474e-06, + "loss": 0.0916, + "step": 23949 + }, + { + "epoch": 0.6060682744135435, + "grad_norm": 7.610217094421387, + "learning_rate": 3.423863990706097e-06, + "loss": 0.1457, + "step": 23950 + }, + { + "epoch": 0.6060935799782372, + "grad_norm": 3.4549925327301025, + "learning_rate": 3.4234829443805544e-06, + "loss": 0.1537, + "step": 23951 + }, + { + "epoch": 0.6061188855429309, + "grad_norm": 4.059206485748291, + "learning_rate": 3.42310190822168e-06, + "loss": 0.1149, + "step": 23952 + }, + { + "epoch": 0.6061441911076245, + "grad_norm": 5.914626121520996, + "learning_rate": 3.422720882231925e-06, + "loss": 0.1686, + "step": 23953 + }, + { + "epoch": 0.6061694966723182, + "grad_norm": 2.8693718910217285, + "learning_rate": 3.4223398664137496e-06, + "loss": 0.1083, + "step": 23954 + }, + { + "epoch": 0.6061948022370119, + "grad_norm": 8.340472221374512, + "learning_rate": 3.421958860769612e-06, + "loss": 0.2157, + "step": 23955 + }, + { + "epoch": 0.6062201078017055, + "grad_norm": 3.4608805179595947, + "learning_rate": 3.4215778653019694e-06, + "loss": 0.1019, + "step": 23956 + }, + { + "epoch": 0.6062454133663993, + "grad_norm": 5.912128925323486, + "learning_rate": 3.4211968800132756e-06, + "loss": 0.272, + "step": 23957 + }, + { + "epoch": 0.606270718931093, + "grad_norm": 12.877924919128418, + "learning_rate": 3.42081590490599e-06, + "loss": 0.178, + "step": 23958 + }, + { + "epoch": 0.6062960244957867, + "grad_norm": 5.3948283195495605, + "learning_rate": 3.4204349399825676e-06, + "loss": 0.1355, + "step": 23959 + }, + { + "epoch": 0.6063213300604803, + "grad_norm": 7.254499912261963, + "learning_rate": 3.4200539852454697e-06, + "loss": 0.1546, + "step": 23960 + }, + { + "epoch": 0.606346635625174, + "grad_norm": 8.494592666625977, + "learning_rate": 3.4196730406971467e-06, + "loss": 0.1716, + "step": 23961 + }, + { + "epoch": 0.6063719411898677, + "grad_norm": 4.964973449707031, + "learning_rate": 3.419292106340059e-06, + "loss": 0.1603, + "step": 23962 + }, + { + "epoch": 0.6063972467545613, + "grad_norm": 7.338053226470947, + "learning_rate": 3.4189111821766624e-06, + "loss": 0.1793, + "step": 23963 + }, + { + "epoch": 0.606422552319255, + "grad_norm": 3.3342037200927734, + "learning_rate": 3.4185302682094133e-06, + "loss": 0.1002, + "step": 23964 + }, + { + "epoch": 0.6064478578839487, + "grad_norm": 10.103682518005371, + "learning_rate": 3.4181493644407704e-06, + "loss": 0.208, + "step": 23965 + }, + { + "epoch": 0.6064731634486423, + "grad_norm": 2.352113962173462, + "learning_rate": 3.4177684708731862e-06, + "loss": 0.088, + "step": 23966 + }, + { + "epoch": 0.606498469013336, + "grad_norm": 3.1470754146575928, + "learning_rate": 3.417387587509119e-06, + "loss": 0.132, + "step": 23967 + }, + { + "epoch": 0.6065237745780298, + "grad_norm": 2.4293010234832764, + "learning_rate": 3.4170067143510245e-06, + "loss": 0.0944, + "step": 23968 + }, + { + "epoch": 0.6065490801427234, + "grad_norm": 4.839338779449463, + "learning_rate": 3.416625851401361e-06, + "loss": 0.182, + "step": 23969 + }, + { + "epoch": 0.6065743857074171, + "grad_norm": 9.174317359924316, + "learning_rate": 3.4162449986625813e-06, + "loss": 0.246, + "step": 23970 + }, + { + "epoch": 0.6065996912721108, + "grad_norm": 5.375848293304443, + "learning_rate": 3.415864156137143e-06, + "loss": 0.1692, + "step": 23971 + }, + { + "epoch": 0.6066249968368044, + "grad_norm": 6.103419303894043, + "learning_rate": 3.415483323827502e-06, + "loss": 0.1767, + "step": 23972 + }, + { + "epoch": 0.6066503024014981, + "grad_norm": 3.2014400959014893, + "learning_rate": 3.415102501736115e-06, + "loss": 0.0659, + "step": 23973 + }, + { + "epoch": 0.6066756079661918, + "grad_norm": 6.390614032745361, + "learning_rate": 3.414721689865437e-06, + "loss": 0.2207, + "step": 23974 + }, + { + "epoch": 0.6067009135308854, + "grad_norm": 9.419759750366211, + "learning_rate": 3.4143408882179235e-06, + "loss": 0.2317, + "step": 23975 + }, + { + "epoch": 0.6067262190955791, + "grad_norm": 6.240711688995361, + "learning_rate": 3.4139600967960318e-06, + "loss": 0.1741, + "step": 23976 + }, + { + "epoch": 0.6067515246602728, + "grad_norm": 3.118894577026367, + "learning_rate": 3.4135793156022144e-06, + "loss": 0.0544, + "step": 23977 + }, + { + "epoch": 0.6067768302249664, + "grad_norm": 5.460888385772705, + "learning_rate": 3.413198544638931e-06, + "loss": 0.2123, + "step": 23978 + }, + { + "epoch": 0.6068021357896601, + "grad_norm": 3.5898778438568115, + "learning_rate": 3.4128177839086334e-06, + "loss": 0.1808, + "step": 23979 + }, + { + "epoch": 0.6068274413543538, + "grad_norm": 4.256214618682861, + "learning_rate": 3.4124370334137804e-06, + "loss": 0.1453, + "step": 23980 + }, + { + "epoch": 0.6068527469190474, + "grad_norm": 5.708006858825684, + "learning_rate": 3.4120562931568236e-06, + "loss": 0.1497, + "step": 23981 + }, + { + "epoch": 0.6068780524837412, + "grad_norm": 4.739010334014893, + "learning_rate": 3.411675563140221e-06, + "loss": 0.1456, + "step": 23982 + }, + { + "epoch": 0.6069033580484349, + "grad_norm": 6.278841972351074, + "learning_rate": 3.4112948433664293e-06, + "loss": 0.1543, + "step": 23983 + }, + { + "epoch": 0.6069286636131285, + "grad_norm": 4.736179351806641, + "learning_rate": 3.410914133837899e-06, + "loss": 0.1675, + "step": 23984 + }, + { + "epoch": 0.6069539691778222, + "grad_norm": 3.2924623489379883, + "learning_rate": 3.4105334345570882e-06, + "loss": 0.1099, + "step": 23985 + }, + { + "epoch": 0.6069792747425159, + "grad_norm": 5.29526424407959, + "learning_rate": 3.4101527455264526e-06, + "loss": 0.1185, + "step": 23986 + }, + { + "epoch": 0.6070045803072096, + "grad_norm": 2.9114835262298584, + "learning_rate": 3.409772066748447e-06, + "loss": 0.1296, + "step": 23987 + }, + { + "epoch": 0.6070298858719032, + "grad_norm": 4.470612049102783, + "learning_rate": 3.4093913982255245e-06, + "loss": 0.185, + "step": 23988 + }, + { + "epoch": 0.6070551914365969, + "grad_norm": 26.80891227722168, + "learning_rate": 3.40901073996014e-06, + "loss": 0.4355, + "step": 23989 + }, + { + "epoch": 0.6070804970012906, + "grad_norm": 5.113770484924316, + "learning_rate": 3.40863009195475e-06, + "loss": 0.1824, + "step": 23990 + }, + { + "epoch": 0.6071058025659842, + "grad_norm": 3.686548948287964, + "learning_rate": 3.4082494542118098e-06, + "loss": 0.0947, + "step": 23991 + }, + { + "epoch": 0.6071311081306779, + "grad_norm": 3.4792838096618652, + "learning_rate": 3.4078688267337713e-06, + "loss": 0.1231, + "step": 23992 + }, + { + "epoch": 0.6071564136953717, + "grad_norm": 4.36755895614624, + "learning_rate": 3.4074882095230906e-06, + "loss": 0.1486, + "step": 23993 + }, + { + "epoch": 0.6071817192600653, + "grad_norm": 4.769445896148682, + "learning_rate": 3.407107602582222e-06, + "loss": 0.1322, + "step": 23994 + }, + { + "epoch": 0.607207024824759, + "grad_norm": 4.5099778175354, + "learning_rate": 3.4067270059136203e-06, + "loss": 0.1124, + "step": 23995 + }, + { + "epoch": 0.6072323303894527, + "grad_norm": 3.745892286300659, + "learning_rate": 3.4063464195197414e-06, + "loss": 0.159, + "step": 23996 + }, + { + "epoch": 0.6072576359541463, + "grad_norm": 5.7120795249938965, + "learning_rate": 3.4059658434030364e-06, + "loss": 0.2016, + "step": 23997 + }, + { + "epoch": 0.60728294151884, + "grad_norm": 8.9741792678833, + "learning_rate": 3.4055852775659615e-06, + "loss": 0.2358, + "step": 23998 + }, + { + "epoch": 0.6073082470835337, + "grad_norm": 4.44124698638916, + "learning_rate": 3.4052047220109695e-06, + "loss": 0.18, + "step": 23999 + }, + { + "epoch": 0.6073335526482273, + "grad_norm": 3.3446922302246094, + "learning_rate": 3.404824176740519e-06, + "loss": 0.0729, + "step": 24000 + }, + { + "epoch": 0.607358858212921, + "grad_norm": 4.319263935089111, + "learning_rate": 3.4044436417570575e-06, + "loss": 0.1915, + "step": 24001 + }, + { + "epoch": 0.6073841637776147, + "grad_norm": 3.4076833724975586, + "learning_rate": 3.404063117063042e-06, + "loss": 0.1141, + "step": 24002 + }, + { + "epoch": 0.6074094693423083, + "grad_norm": 2.6743524074554443, + "learning_rate": 3.403682602660928e-06, + "loss": 0.1542, + "step": 24003 + }, + { + "epoch": 0.607434774907002, + "grad_norm": 2.7304511070251465, + "learning_rate": 3.403302098553168e-06, + "loss": 0.1128, + "step": 24004 + }, + { + "epoch": 0.6074600804716958, + "grad_norm": 3.3462774753570557, + "learning_rate": 3.4029216047422152e-06, + "loss": 0.1476, + "step": 24005 + }, + { + "epoch": 0.6074853860363894, + "grad_norm": 4.689367771148682, + "learning_rate": 3.402541121230524e-06, + "loss": 0.1519, + "step": 24006 + }, + { + "epoch": 0.6075106916010831, + "grad_norm": 11.73963737487793, + "learning_rate": 3.4021606480205484e-06, + "loss": 0.2758, + "step": 24007 + }, + { + "epoch": 0.6075359971657768, + "grad_norm": 5.047257900238037, + "learning_rate": 3.401780185114741e-06, + "loss": 0.1581, + "step": 24008 + }, + { + "epoch": 0.6075613027304704, + "grad_norm": 3.366265058517456, + "learning_rate": 3.401399732515557e-06, + "loss": 0.1288, + "step": 24009 + }, + { + "epoch": 0.6075866082951641, + "grad_norm": 9.144834518432617, + "learning_rate": 3.401019290225448e-06, + "loss": 0.1777, + "step": 24010 + }, + { + "epoch": 0.6076119138598578, + "grad_norm": 6.450554847717285, + "learning_rate": 3.4006388582468676e-06, + "loss": 0.227, + "step": 24011 + }, + { + "epoch": 0.6076372194245515, + "grad_norm": 5.883894443511963, + "learning_rate": 3.400258436582269e-06, + "loss": 0.1618, + "step": 24012 + }, + { + "epoch": 0.6076625249892451, + "grad_norm": 5.113560676574707, + "learning_rate": 3.399878025234109e-06, + "loss": 0.1889, + "step": 24013 + }, + { + "epoch": 0.6076878305539388, + "grad_norm": 10.930684089660645, + "learning_rate": 3.399497624204836e-06, + "loss": 0.3106, + "step": 24014 + }, + { + "epoch": 0.6077131361186325, + "grad_norm": 7.581890106201172, + "learning_rate": 3.3991172334969046e-06, + "loss": 0.1776, + "step": 24015 + }, + { + "epoch": 0.6077384416833261, + "grad_norm": 3.081209659576416, + "learning_rate": 3.3987368531127684e-06, + "loss": 0.1483, + "step": 24016 + }, + { + "epoch": 0.6077637472480198, + "grad_norm": 6.902156829833984, + "learning_rate": 3.398356483054881e-06, + "loss": 0.1603, + "step": 24017 + }, + { + "epoch": 0.6077890528127136, + "grad_norm": 4.481870174407959, + "learning_rate": 3.397976123325696e-06, + "loss": 0.1696, + "step": 24018 + }, + { + "epoch": 0.6078143583774072, + "grad_norm": 6.899158000946045, + "learning_rate": 3.3975957739276634e-06, + "loss": 0.2425, + "step": 24019 + }, + { + "epoch": 0.6078396639421009, + "grad_norm": 3.2896947860717773, + "learning_rate": 3.397215434863237e-06, + "loss": 0.1126, + "step": 24020 + }, + { + "epoch": 0.6078649695067946, + "grad_norm": 5.909018039703369, + "learning_rate": 3.396835106134871e-06, + "loss": 0.1433, + "step": 24021 + }, + { + "epoch": 0.6078902750714882, + "grad_norm": 3.914456844329834, + "learning_rate": 3.396454787745019e-06, + "loss": 0.1558, + "step": 24022 + }, + { + "epoch": 0.6079155806361819, + "grad_norm": 6.9165496826171875, + "learning_rate": 3.396074479696129e-06, + "loss": 0.2743, + "step": 24023 + }, + { + "epoch": 0.6079408862008756, + "grad_norm": 4.579986095428467, + "learning_rate": 3.3956941819906575e-06, + "loss": 0.2002, + "step": 24024 + }, + { + "epoch": 0.6079661917655692, + "grad_norm": 9.541641235351562, + "learning_rate": 3.395313894631055e-06, + "loss": 0.2403, + "step": 24025 + }, + { + "epoch": 0.6079914973302629, + "grad_norm": 4.619685649871826, + "learning_rate": 3.3949336176197746e-06, + "loss": 0.1574, + "step": 24026 + }, + { + "epoch": 0.6080168028949566, + "grad_norm": 9.508674621582031, + "learning_rate": 3.394553350959271e-06, + "loss": 0.1631, + "step": 24027 + }, + { + "epoch": 0.6080421084596502, + "grad_norm": 4.226436614990234, + "learning_rate": 3.3941730946519922e-06, + "loss": 0.1358, + "step": 24028 + }, + { + "epoch": 0.6080674140243439, + "grad_norm": 3.5322444438934326, + "learning_rate": 3.3937928487003924e-06, + "loss": 0.2001, + "step": 24029 + }, + { + "epoch": 0.6080927195890377, + "grad_norm": 2.5803892612457275, + "learning_rate": 3.393412613106924e-06, + "loss": 0.0948, + "step": 24030 + }, + { + "epoch": 0.6081180251537313, + "grad_norm": 3.375363349914551, + "learning_rate": 3.393032387874041e-06, + "loss": 0.1137, + "step": 24031 + }, + { + "epoch": 0.608143330718425, + "grad_norm": 4.764966011047363, + "learning_rate": 3.3926521730041905e-06, + "loss": 0.2077, + "step": 24032 + }, + { + "epoch": 0.6081686362831187, + "grad_norm": 3.5438222885131836, + "learning_rate": 3.3922719684998274e-06, + "loss": 0.11, + "step": 24033 + }, + { + "epoch": 0.6081939418478123, + "grad_norm": 4.932129859924316, + "learning_rate": 3.391891774363404e-06, + "loss": 0.172, + "step": 24034 + }, + { + "epoch": 0.608219247412506, + "grad_norm": 8.768139839172363, + "learning_rate": 3.3915115905973704e-06, + "loss": 0.2546, + "step": 24035 + }, + { + "epoch": 0.6082445529771997, + "grad_norm": 4.239508152008057, + "learning_rate": 3.391131417204181e-06, + "loss": 0.0951, + "step": 24036 + }, + { + "epoch": 0.6082698585418934, + "grad_norm": 4.840444564819336, + "learning_rate": 3.3907512541862856e-06, + "loss": 0.1762, + "step": 24037 + }, + { + "epoch": 0.608295164106587, + "grad_norm": 3.655568838119507, + "learning_rate": 3.390371101546135e-06, + "loss": 0.1394, + "step": 24038 + }, + { + "epoch": 0.6083204696712807, + "grad_norm": 3.3374035358428955, + "learning_rate": 3.389990959286182e-06, + "loss": 0.1808, + "step": 24039 + }, + { + "epoch": 0.6083457752359744, + "grad_norm": 2.504859209060669, + "learning_rate": 3.3896108274088786e-06, + "loss": 0.0994, + "step": 24040 + }, + { + "epoch": 0.608371080800668, + "grad_norm": 10.909418106079102, + "learning_rate": 3.389230705916675e-06, + "loss": 0.1592, + "step": 24041 + }, + { + "epoch": 0.6083963863653618, + "grad_norm": 5.37175178527832, + "learning_rate": 3.3888505948120225e-06, + "loss": 0.1749, + "step": 24042 + }, + { + "epoch": 0.6084216919300555, + "grad_norm": 3.773446798324585, + "learning_rate": 3.388470494097372e-06, + "loss": 0.1615, + "step": 24043 + }, + { + "epoch": 0.6084469974947491, + "grad_norm": 11.643998146057129, + "learning_rate": 3.388090403775179e-06, + "loss": 0.2369, + "step": 24044 + }, + { + "epoch": 0.6084723030594428, + "grad_norm": 5.594155311584473, + "learning_rate": 3.387710323847888e-06, + "loss": 0.2059, + "step": 24045 + }, + { + "epoch": 0.6084976086241365, + "grad_norm": 3.887800931930542, + "learning_rate": 3.3873302543179533e-06, + "loss": 0.1206, + "step": 24046 + }, + { + "epoch": 0.6085229141888301, + "grad_norm": 8.506396293640137, + "learning_rate": 3.386950195187826e-06, + "loss": 0.368, + "step": 24047 + }, + { + "epoch": 0.6085482197535238, + "grad_norm": 4.140540599822998, + "learning_rate": 3.386570146459957e-06, + "loss": 0.1569, + "step": 24048 + }, + { + "epoch": 0.6085735253182175, + "grad_norm": 4.1847639083862305, + "learning_rate": 3.3861901081367988e-06, + "loss": 0.174, + "step": 24049 + }, + { + "epoch": 0.6085988308829111, + "grad_norm": 3.5862317085266113, + "learning_rate": 3.385810080220798e-06, + "loss": 0.097, + "step": 24050 + }, + { + "epoch": 0.6086241364476048, + "grad_norm": 4.142673969268799, + "learning_rate": 3.3854300627144086e-06, + "loss": 0.1527, + "step": 24051 + }, + { + "epoch": 0.6086494420122985, + "grad_norm": 2.935187816619873, + "learning_rate": 3.38505005562008e-06, + "loss": 0.087, + "step": 24052 + }, + { + "epoch": 0.6086747475769921, + "grad_norm": 2.7955381870269775, + "learning_rate": 3.3846700589402654e-06, + "loss": 0.1548, + "step": 24053 + }, + { + "epoch": 0.6087000531416858, + "grad_norm": 3.963613986968994, + "learning_rate": 3.3842900726774107e-06, + "loss": 0.1392, + "step": 24054 + }, + { + "epoch": 0.6087253587063796, + "grad_norm": 3.693016767501831, + "learning_rate": 3.383910096833969e-06, + "loss": 0.1303, + "step": 24055 + }, + { + "epoch": 0.6087506642710732, + "grad_norm": 4.444467544555664, + "learning_rate": 3.3835301314123903e-06, + "loss": 0.134, + "step": 24056 + }, + { + "epoch": 0.6087759698357669, + "grad_norm": 4.181612014770508, + "learning_rate": 3.3831501764151255e-06, + "loss": 0.1731, + "step": 24057 + }, + { + "epoch": 0.6088012754004606, + "grad_norm": 3.6594772338867188, + "learning_rate": 3.382770231844626e-06, + "loss": 0.0905, + "step": 24058 + }, + { + "epoch": 0.6088265809651542, + "grad_norm": 5.121716499328613, + "learning_rate": 3.3823902977033383e-06, + "loss": 0.1441, + "step": 24059 + }, + { + "epoch": 0.6088518865298479, + "grad_norm": 30.676958084106445, + "learning_rate": 3.3820103739937147e-06, + "loss": 0.1691, + "step": 24060 + }, + { + "epoch": 0.6088771920945416, + "grad_norm": 5.037759780883789, + "learning_rate": 3.381630460718207e-06, + "loss": 0.19, + "step": 24061 + }, + { + "epoch": 0.6089024976592353, + "grad_norm": 4.5991644859313965, + "learning_rate": 3.3812505578792632e-06, + "loss": 0.1807, + "step": 24062 + }, + { + "epoch": 0.6089278032239289, + "grad_norm": 4.1248040199279785, + "learning_rate": 3.380870665479332e-06, + "loss": 0.1563, + "step": 24063 + }, + { + "epoch": 0.6089531087886226, + "grad_norm": 8.071588516235352, + "learning_rate": 3.380490783520865e-06, + "loss": 0.1507, + "step": 24064 + }, + { + "epoch": 0.6089784143533163, + "grad_norm": 8.687992095947266, + "learning_rate": 3.3801109120063123e-06, + "loss": 0.1953, + "step": 24065 + }, + { + "epoch": 0.60900371991801, + "grad_norm": 8.07063102722168, + "learning_rate": 3.379731050938123e-06, + "loss": 0.2125, + "step": 24066 + }, + { + "epoch": 0.6090290254827037, + "grad_norm": 6.547052383422852, + "learning_rate": 3.3793512003187468e-06, + "loss": 0.1471, + "step": 24067 + }, + { + "epoch": 0.6090543310473974, + "grad_norm": 4.254130840301514, + "learning_rate": 3.3789713601506334e-06, + "loss": 0.2011, + "step": 24068 + }, + { + "epoch": 0.609079636612091, + "grad_norm": 4.539175033569336, + "learning_rate": 3.378591530436231e-06, + "loss": 0.1697, + "step": 24069 + }, + { + "epoch": 0.6091049421767847, + "grad_norm": 12.275846481323242, + "learning_rate": 3.3782117111779903e-06, + "loss": 0.2494, + "step": 24070 + }, + { + "epoch": 0.6091302477414784, + "grad_norm": 7.831207275390625, + "learning_rate": 3.377831902378363e-06, + "loss": 0.2996, + "step": 24071 + }, + { + "epoch": 0.609155553306172, + "grad_norm": 5.5532636642456055, + "learning_rate": 3.3774521040397934e-06, + "loss": 0.1801, + "step": 24072 + }, + { + "epoch": 0.6091808588708657, + "grad_norm": 3.968834638595581, + "learning_rate": 3.377072316164734e-06, + "loss": 0.1733, + "step": 24073 + }, + { + "epoch": 0.6092061644355594, + "grad_norm": 5.038283824920654, + "learning_rate": 3.3766925387556326e-06, + "loss": 0.1386, + "step": 24074 + }, + { + "epoch": 0.609231470000253, + "grad_norm": 3.0503783226013184, + "learning_rate": 3.3763127718149423e-06, + "loss": 0.1152, + "step": 24075 + }, + { + "epoch": 0.6092567755649467, + "grad_norm": 5.600948333740234, + "learning_rate": 3.375933015345106e-06, + "loss": 0.1691, + "step": 24076 + }, + { + "epoch": 0.6092820811296404, + "grad_norm": 9.168216705322266, + "learning_rate": 3.3755532693485758e-06, + "loss": 0.2756, + "step": 24077 + }, + { + "epoch": 0.609307386694334, + "grad_norm": 2.7616326808929443, + "learning_rate": 3.3751735338278e-06, + "loss": 0.1405, + "step": 24078 + }, + { + "epoch": 0.6093326922590278, + "grad_norm": 17.6612548828125, + "learning_rate": 3.374793808785228e-06, + "loss": 0.1977, + "step": 24079 + }, + { + "epoch": 0.6093579978237215, + "grad_norm": 5.2715277671813965, + "learning_rate": 3.3744140942233104e-06, + "loss": 0.1792, + "step": 24080 + }, + { + "epoch": 0.6093833033884151, + "grad_norm": 6.034323215484619, + "learning_rate": 3.3740343901444926e-06, + "loss": 0.1663, + "step": 24081 + }, + { + "epoch": 0.6094086089531088, + "grad_norm": 3.5670766830444336, + "learning_rate": 3.373654696551224e-06, + "loss": 0.1743, + "step": 24082 + }, + { + "epoch": 0.6094339145178025, + "grad_norm": 3.4780642986297607, + "learning_rate": 3.3732750134459534e-06, + "loss": 0.1054, + "step": 24083 + }, + { + "epoch": 0.6094592200824961, + "grad_norm": 2.677189350128174, + "learning_rate": 3.372895340831133e-06, + "loss": 0.087, + "step": 24084 + }, + { + "epoch": 0.6094845256471898, + "grad_norm": 5.022661209106445, + "learning_rate": 3.3725156787092044e-06, + "loss": 0.1628, + "step": 24085 + }, + { + "epoch": 0.6095098312118835, + "grad_norm": 5.907169818878174, + "learning_rate": 3.37213602708262e-06, + "loss": 0.1293, + "step": 24086 + }, + { + "epoch": 0.6095351367765772, + "grad_norm": 7.563804626464844, + "learning_rate": 3.371756385953827e-06, + "loss": 0.1547, + "step": 24087 + }, + { + "epoch": 0.6095604423412708, + "grad_norm": 7.9256591796875, + "learning_rate": 3.3713767553252773e-06, + "loss": 0.1578, + "step": 24088 + }, + { + "epoch": 0.6095857479059645, + "grad_norm": 5.062753677368164, + "learning_rate": 3.370997135199413e-06, + "loss": 0.2057, + "step": 24089 + }, + { + "epoch": 0.6096110534706582, + "grad_norm": 5.701567649841309, + "learning_rate": 3.3706175255786856e-06, + "loss": 0.1178, + "step": 24090 + }, + { + "epoch": 0.6096363590353518, + "grad_norm": 2.52360200881958, + "learning_rate": 3.3702379264655425e-06, + "loss": 0.1054, + "step": 24091 + }, + { + "epoch": 0.6096616646000456, + "grad_norm": 9.616718292236328, + "learning_rate": 3.3698583378624334e-06, + "loss": 0.2364, + "step": 24092 + }, + { + "epoch": 0.6096869701647393, + "grad_norm": 6.884061813354492, + "learning_rate": 3.3694787597718044e-06, + "loss": 0.2216, + "step": 24093 + }, + { + "epoch": 0.6097122757294329, + "grad_norm": 3.2788922786712646, + "learning_rate": 3.3690991921961026e-06, + "loss": 0.0856, + "step": 24094 + }, + { + "epoch": 0.6097375812941266, + "grad_norm": 6.546688556671143, + "learning_rate": 3.368719635137777e-06, + "loss": 0.1566, + "step": 24095 + }, + { + "epoch": 0.6097628868588203, + "grad_norm": 5.187999248504639, + "learning_rate": 3.3683400885992747e-06, + "loss": 0.186, + "step": 24096 + }, + { + "epoch": 0.6097881924235139, + "grad_norm": 3.3297739028930664, + "learning_rate": 3.3679605525830448e-06, + "loss": 0.1627, + "step": 24097 + }, + { + "epoch": 0.6098134979882076, + "grad_norm": 4.274467945098877, + "learning_rate": 3.3675810270915334e-06, + "loss": 0.156, + "step": 24098 + }, + { + "epoch": 0.6098388035529013, + "grad_norm": 3.0600736141204834, + "learning_rate": 3.367201512127187e-06, + "loss": 0.0843, + "step": 24099 + }, + { + "epoch": 0.6098641091175949, + "grad_norm": 4.543168544769287, + "learning_rate": 3.3668220076924554e-06, + "loss": 0.1293, + "step": 24100 + }, + { + "epoch": 0.6098894146822886, + "grad_norm": 4.34115743637085, + "learning_rate": 3.3664425137897837e-06, + "loss": 0.1359, + "step": 24101 + }, + { + "epoch": 0.6099147202469823, + "grad_norm": 5.722896099090576, + "learning_rate": 3.3660630304216223e-06, + "loss": 0.1313, + "step": 24102 + }, + { + "epoch": 0.609940025811676, + "grad_norm": 3.831650733947754, + "learning_rate": 3.3656835575904156e-06, + "loss": 0.1347, + "step": 24103 + }, + { + "epoch": 0.6099653313763697, + "grad_norm": 3.736237049102783, + "learning_rate": 3.3653040952986112e-06, + "loss": 0.1427, + "step": 24104 + }, + { + "epoch": 0.6099906369410634, + "grad_norm": 5.292189598083496, + "learning_rate": 3.364924643548656e-06, + "loss": 0.1881, + "step": 24105 + }, + { + "epoch": 0.610015942505757, + "grad_norm": 6.526352405548096, + "learning_rate": 3.3645452023430005e-06, + "loss": 0.2265, + "step": 24106 + }, + { + "epoch": 0.6100412480704507, + "grad_norm": 3.761190414428711, + "learning_rate": 3.3641657716840858e-06, + "loss": 0.0942, + "step": 24107 + }, + { + "epoch": 0.6100665536351444, + "grad_norm": 8.881844520568848, + "learning_rate": 3.363786351574363e-06, + "loss": 0.1694, + "step": 24108 + }, + { + "epoch": 0.610091859199838, + "grad_norm": 5.533852577209473, + "learning_rate": 3.3634069420162764e-06, + "loss": 0.2451, + "step": 24109 + }, + { + "epoch": 0.6101171647645317, + "grad_norm": 13.132475852966309, + "learning_rate": 3.363027543012275e-06, + "loss": 0.2815, + "step": 24110 + }, + { + "epoch": 0.6101424703292254, + "grad_norm": 2.167118787765503, + "learning_rate": 3.362648154564806e-06, + "loss": 0.0759, + "step": 24111 + }, + { + "epoch": 0.610167775893919, + "grad_norm": 9.262609481811523, + "learning_rate": 3.3622687766763124e-06, + "loss": 0.1011, + "step": 24112 + }, + { + "epoch": 0.6101930814586127, + "grad_norm": 4.3155198097229, + "learning_rate": 3.3618894093492427e-06, + "loss": 0.1227, + "step": 24113 + }, + { + "epoch": 0.6102183870233064, + "grad_norm": 5.342613220214844, + "learning_rate": 3.3615100525860434e-06, + "loss": 0.1194, + "step": 24114 + }, + { + "epoch": 0.6102436925880002, + "grad_norm": 5.122918128967285, + "learning_rate": 3.361130706389164e-06, + "loss": 0.1218, + "step": 24115 + }, + { + "epoch": 0.6102689981526938, + "grad_norm": 7.1661787033081055, + "learning_rate": 3.360751370761045e-06, + "loss": 0.1818, + "step": 24116 + }, + { + "epoch": 0.6102943037173875, + "grad_norm": 4.394237041473389, + "learning_rate": 3.3603720457041357e-06, + "loss": 0.1253, + "step": 24117 + }, + { + "epoch": 0.6103196092820812, + "grad_norm": 5.659962177276611, + "learning_rate": 3.3599927312208812e-06, + "loss": 0.184, + "step": 24118 + }, + { + "epoch": 0.6103449148467748, + "grad_norm": 2.6141393184661865, + "learning_rate": 3.3596134273137314e-06, + "loss": 0.0999, + "step": 24119 + }, + { + "epoch": 0.6103702204114685, + "grad_norm": 12.342864036560059, + "learning_rate": 3.3592341339851277e-06, + "loss": 0.1654, + "step": 24120 + }, + { + "epoch": 0.6103955259761622, + "grad_norm": 7.058557510375977, + "learning_rate": 3.3588548512375173e-06, + "loss": 0.1236, + "step": 24121 + }, + { + "epoch": 0.6104208315408558, + "grad_norm": 3.230839729309082, + "learning_rate": 3.358475579073348e-06, + "loss": 0.1233, + "step": 24122 + }, + { + "epoch": 0.6104461371055495, + "grad_norm": 3.9505131244659424, + "learning_rate": 3.358096317495063e-06, + "loss": 0.1439, + "step": 24123 + }, + { + "epoch": 0.6104714426702432, + "grad_norm": 10.78271770477295, + "learning_rate": 3.357717066505111e-06, + "loss": 0.2016, + "step": 24124 + }, + { + "epoch": 0.6104967482349368, + "grad_norm": 6.354003429412842, + "learning_rate": 3.3573378261059353e-06, + "loss": 0.1677, + "step": 24125 + }, + { + "epoch": 0.6105220537996305, + "grad_norm": 4.972613334655762, + "learning_rate": 3.356958596299983e-06, + "loss": 0.1373, + "step": 24126 + }, + { + "epoch": 0.6105473593643242, + "grad_norm": 3.9249894618988037, + "learning_rate": 3.356579377089698e-06, + "loss": 0.1782, + "step": 24127 + }, + { + "epoch": 0.6105726649290178, + "grad_norm": 3.552900552749634, + "learning_rate": 3.356200168477528e-06, + "loss": 0.1911, + "step": 24128 + }, + { + "epoch": 0.6105979704937116, + "grad_norm": 3.9925057888031006, + "learning_rate": 3.3558209704659174e-06, + "loss": 0.1075, + "step": 24129 + }, + { + "epoch": 0.6106232760584053, + "grad_norm": 5.213320732116699, + "learning_rate": 3.35544178305731e-06, + "loss": 0.1711, + "step": 24130 + }, + { + "epoch": 0.6106485816230989, + "grad_norm": 4.501896858215332, + "learning_rate": 3.355062606254153e-06, + "loss": 0.1171, + "step": 24131 + }, + { + "epoch": 0.6106738871877926, + "grad_norm": 5.955622673034668, + "learning_rate": 3.354683440058891e-06, + "loss": 0.2031, + "step": 24132 + }, + { + "epoch": 0.6106991927524863, + "grad_norm": 5.496095657348633, + "learning_rate": 3.354304284473972e-06, + "loss": 0.1695, + "step": 24133 + }, + { + "epoch": 0.6107244983171799, + "grad_norm": 6.318701267242432, + "learning_rate": 3.353925139501837e-06, + "loss": 0.2674, + "step": 24134 + }, + { + "epoch": 0.6107498038818736, + "grad_norm": 15.379266738891602, + "learning_rate": 3.353546005144931e-06, + "loss": 0.1246, + "step": 24135 + }, + { + "epoch": 0.6107751094465673, + "grad_norm": 7.94549560546875, + "learning_rate": 3.3531668814057016e-06, + "loss": 0.271, + "step": 24136 + }, + { + "epoch": 0.6108004150112609, + "grad_norm": 6.677634239196777, + "learning_rate": 3.352787768286595e-06, + "loss": 0.2297, + "step": 24137 + }, + { + "epoch": 0.6108257205759546, + "grad_norm": 3.037566900253296, + "learning_rate": 3.3524086657900508e-06, + "loss": 0.1294, + "step": 24138 + }, + { + "epoch": 0.6108510261406483, + "grad_norm": 6.137818336486816, + "learning_rate": 3.3520295739185173e-06, + "loss": 0.1811, + "step": 24139 + }, + { + "epoch": 0.6108763317053421, + "grad_norm": 9.092255592346191, + "learning_rate": 3.3516504926744377e-06, + "loss": 0.1443, + "step": 24140 + }, + { + "epoch": 0.6109016372700357, + "grad_norm": 4.114363670349121, + "learning_rate": 3.3512714220602603e-06, + "loss": 0.2279, + "step": 24141 + }, + { + "epoch": 0.6109269428347294, + "grad_norm": 4.145579814910889, + "learning_rate": 3.350892362078424e-06, + "loss": 0.11, + "step": 24142 + }, + { + "epoch": 0.6109522483994231, + "grad_norm": 4.68377685546875, + "learning_rate": 3.3505133127313773e-06, + "loss": 0.1388, + "step": 24143 + }, + { + "epoch": 0.6109775539641167, + "grad_norm": 20.25177764892578, + "learning_rate": 3.3501342740215624e-06, + "loss": 0.2353, + "step": 24144 + }, + { + "epoch": 0.6110028595288104, + "grad_norm": 3.3208091259002686, + "learning_rate": 3.349755245951425e-06, + "loss": 0.115, + "step": 24145 + }, + { + "epoch": 0.6110281650935041, + "grad_norm": 11.568446159362793, + "learning_rate": 3.349376228523411e-06, + "loss": 0.3558, + "step": 24146 + }, + { + "epoch": 0.6110534706581977, + "grad_norm": 4.9877543449401855, + "learning_rate": 3.3489972217399604e-06, + "loss": 0.2521, + "step": 24147 + }, + { + "epoch": 0.6110787762228914, + "grad_norm": 4.675932884216309, + "learning_rate": 3.3486182256035207e-06, + "loss": 0.1195, + "step": 24148 + }, + { + "epoch": 0.6111040817875851, + "grad_norm": 12.715523719787598, + "learning_rate": 3.348239240116533e-06, + "loss": 0.3326, + "step": 24149 + }, + { + "epoch": 0.6111293873522787, + "grad_norm": 3.210552930831909, + "learning_rate": 3.347860265281447e-06, + "loss": 0.1626, + "step": 24150 + }, + { + "epoch": 0.6111546929169724, + "grad_norm": 7.5002546310424805, + "learning_rate": 3.3474813011007003e-06, + "loss": 0.204, + "step": 24151 + }, + { + "epoch": 0.6111799984816662, + "grad_norm": 6.464083194732666, + "learning_rate": 3.3471023475767382e-06, + "loss": 0.1133, + "step": 24152 + }, + { + "epoch": 0.6112053040463598, + "grad_norm": 4.377357482910156, + "learning_rate": 3.3467234047120077e-06, + "loss": 0.1507, + "step": 24153 + }, + { + "epoch": 0.6112306096110535, + "grad_norm": 10.392220497131348, + "learning_rate": 3.3463444725089495e-06, + "loss": 0.1898, + "step": 24154 + }, + { + "epoch": 0.6112559151757472, + "grad_norm": 2.547023296356201, + "learning_rate": 3.345965550970009e-06, + "loss": 0.0925, + "step": 24155 + }, + { + "epoch": 0.6112812207404408, + "grad_norm": 6.026241779327393, + "learning_rate": 3.345586640097629e-06, + "loss": 0.1677, + "step": 24156 + }, + { + "epoch": 0.6113065263051345, + "grad_norm": 4.232524394989014, + "learning_rate": 3.3452077398942516e-06, + "loss": 0.1096, + "step": 24157 + }, + { + "epoch": 0.6113318318698282, + "grad_norm": 5.795337200164795, + "learning_rate": 3.3448288503623217e-06, + "loss": 0.1712, + "step": 24158 + }, + { + "epoch": 0.6113571374345218, + "grad_norm": 4.905531883239746, + "learning_rate": 3.3444499715042855e-06, + "loss": 0.1288, + "step": 24159 + }, + { + "epoch": 0.6113824429992155, + "grad_norm": 3.2786781787872314, + "learning_rate": 3.3440711033225804e-06, + "loss": 0.1212, + "step": 24160 + }, + { + "epoch": 0.6114077485639092, + "grad_norm": 5.5314836502075195, + "learning_rate": 3.3436922458196534e-06, + "loss": 0.1708, + "step": 24161 + }, + { + "epoch": 0.6114330541286028, + "grad_norm": 3.927243232727051, + "learning_rate": 3.343313398997947e-06, + "loss": 0.11, + "step": 24162 + }, + { + "epoch": 0.6114583596932965, + "grad_norm": 2.9239718914031982, + "learning_rate": 3.342934562859904e-06, + "loss": 0.0967, + "step": 24163 + }, + { + "epoch": 0.6114836652579902, + "grad_norm": 2.6098814010620117, + "learning_rate": 3.34255573740797e-06, + "loss": 0.1105, + "step": 24164 + }, + { + "epoch": 0.611508970822684, + "grad_norm": 4.70703649520874, + "learning_rate": 3.342176922644584e-06, + "loss": 0.194, + "step": 24165 + }, + { + "epoch": 0.6115342763873776, + "grad_norm": 11.70808219909668, + "learning_rate": 3.3417981185721903e-06, + "loss": 0.2268, + "step": 24166 + }, + { + "epoch": 0.6115595819520713, + "grad_norm": 8.21100902557373, + "learning_rate": 3.341419325193232e-06, + "loss": 0.2751, + "step": 24167 + }, + { + "epoch": 0.611584887516765, + "grad_norm": 4.022868633270264, + "learning_rate": 3.3410405425101546e-06, + "loss": 0.1948, + "step": 24168 + }, + { + "epoch": 0.6116101930814586, + "grad_norm": 8.733128547668457, + "learning_rate": 3.340661770525395e-06, + "loss": 0.1991, + "step": 24169 + }, + { + "epoch": 0.6116354986461523, + "grad_norm": 3.968733549118042, + "learning_rate": 3.3402830092413994e-06, + "loss": 0.1425, + "step": 24170 + }, + { + "epoch": 0.611660804210846, + "grad_norm": 6.315787315368652, + "learning_rate": 3.33990425866061e-06, + "loss": 0.1764, + "step": 24171 + }, + { + "epoch": 0.6116861097755396, + "grad_norm": 5.248743057250977, + "learning_rate": 3.339525518785472e-06, + "loss": 0.124, + "step": 24172 + }, + { + "epoch": 0.6117114153402333, + "grad_norm": 4.211326599121094, + "learning_rate": 3.3391467896184223e-06, + "loss": 0.0765, + "step": 24173 + }, + { + "epoch": 0.611736720904927, + "grad_norm": 24.219457626342773, + "learning_rate": 3.3387680711619066e-06, + "loss": 0.2114, + "step": 24174 + }, + { + "epoch": 0.6117620264696206, + "grad_norm": 7.756166934967041, + "learning_rate": 3.338389363418366e-06, + "loss": 0.1257, + "step": 24175 + }, + { + "epoch": 0.6117873320343143, + "grad_norm": 5.5540900230407715, + "learning_rate": 3.3380106663902433e-06, + "loss": 0.1912, + "step": 24176 + }, + { + "epoch": 0.6118126375990081, + "grad_norm": 4.5460405349731445, + "learning_rate": 3.3376319800799828e-06, + "loss": 0.1831, + "step": 24177 + }, + { + "epoch": 0.6118379431637017, + "grad_norm": 11.600006103515625, + "learning_rate": 3.3372533044900223e-06, + "loss": 0.2053, + "step": 24178 + }, + { + "epoch": 0.6118632487283954, + "grad_norm": 5.228784561157227, + "learning_rate": 3.3368746396228057e-06, + "loss": 0.1726, + "step": 24179 + }, + { + "epoch": 0.6118885542930891, + "grad_norm": 10.537976264953613, + "learning_rate": 3.3364959854807766e-06, + "loss": 0.1963, + "step": 24180 + }, + { + "epoch": 0.6119138598577827, + "grad_norm": 4.3324971199035645, + "learning_rate": 3.3361173420663758e-06, + "loss": 0.0967, + "step": 24181 + }, + { + "epoch": 0.6119391654224764, + "grad_norm": 3.8611526489257812, + "learning_rate": 3.335738709382044e-06, + "loss": 0.136, + "step": 24182 + }, + { + "epoch": 0.6119644709871701, + "grad_norm": 5.488548278808594, + "learning_rate": 3.3353600874302233e-06, + "loss": 0.1894, + "step": 24183 + }, + { + "epoch": 0.6119897765518637, + "grad_norm": 5.484737873077393, + "learning_rate": 3.3349814762133557e-06, + "loss": 0.1656, + "step": 24184 + }, + { + "epoch": 0.6120150821165574, + "grad_norm": 3.0367698669433594, + "learning_rate": 3.3346028757338822e-06, + "loss": 0.1569, + "step": 24185 + }, + { + "epoch": 0.6120403876812511, + "grad_norm": 6.122278213500977, + "learning_rate": 3.3342242859942465e-06, + "loss": 0.2491, + "step": 24186 + }, + { + "epoch": 0.6120656932459447, + "grad_norm": 2.6048476696014404, + "learning_rate": 3.333845706996889e-06, + "loss": 0.1364, + "step": 24187 + }, + { + "epoch": 0.6120909988106384, + "grad_norm": 2.9680416584014893, + "learning_rate": 3.3334671387442484e-06, + "loss": 0.1439, + "step": 24188 + }, + { + "epoch": 0.6121163043753322, + "grad_norm": 5.751911163330078, + "learning_rate": 3.333088581238769e-06, + "loss": 0.1468, + "step": 24189 + }, + { + "epoch": 0.6121416099400259, + "grad_norm": 2.646711826324463, + "learning_rate": 3.332710034482893e-06, + "loss": 0.1327, + "step": 24190 + }, + { + "epoch": 0.6121669155047195, + "grad_norm": 3.7875266075134277, + "learning_rate": 3.332331498479059e-06, + "loss": 0.1553, + "step": 24191 + }, + { + "epoch": 0.6121922210694132, + "grad_norm": 9.5941162109375, + "learning_rate": 3.3319529732297073e-06, + "loss": 0.2123, + "step": 24192 + }, + { + "epoch": 0.6122175266341069, + "grad_norm": 6.724410533905029, + "learning_rate": 3.331574458737281e-06, + "loss": 0.1309, + "step": 24193 + }, + { + "epoch": 0.6122428321988005, + "grad_norm": 3.3988428115844727, + "learning_rate": 3.3311959550042235e-06, + "loss": 0.1139, + "step": 24194 + }, + { + "epoch": 0.6122681377634942, + "grad_norm": 2.9413936138153076, + "learning_rate": 3.3308174620329703e-06, + "loss": 0.1419, + "step": 24195 + }, + { + "epoch": 0.6122934433281879, + "grad_norm": 3.7701456546783447, + "learning_rate": 3.3304389798259647e-06, + "loss": 0.1581, + "step": 24196 + }, + { + "epoch": 0.6123187488928815, + "grad_norm": 6.465275287628174, + "learning_rate": 3.3300605083856473e-06, + "loss": 0.216, + "step": 24197 + }, + { + "epoch": 0.6123440544575752, + "grad_norm": 4.0942182540893555, + "learning_rate": 3.32968204771446e-06, + "loss": 0.211, + "step": 24198 + }, + { + "epoch": 0.6123693600222689, + "grad_norm": 5.718015670776367, + "learning_rate": 3.3293035978148434e-06, + "loss": 0.2, + "step": 24199 + }, + { + "epoch": 0.6123946655869625, + "grad_norm": 4.000230312347412, + "learning_rate": 3.328925158689236e-06, + "loss": 0.1198, + "step": 24200 + }, + { + "epoch": 0.6124199711516563, + "grad_norm": 4.574774265289307, + "learning_rate": 3.3285467303400785e-06, + "loss": 0.0939, + "step": 24201 + }, + { + "epoch": 0.61244527671635, + "grad_norm": 2.481448173522949, + "learning_rate": 3.3281683127698128e-06, + "loss": 0.1061, + "step": 24202 + }, + { + "epoch": 0.6124705822810436, + "grad_norm": 2.813356876373291, + "learning_rate": 3.3277899059808805e-06, + "loss": 0.1923, + "step": 24203 + }, + { + "epoch": 0.6124958878457373, + "grad_norm": 7.082287788391113, + "learning_rate": 3.3274115099757183e-06, + "loss": 0.234, + "step": 24204 + }, + { + "epoch": 0.612521193410431, + "grad_norm": 6.479879379272461, + "learning_rate": 3.3270331247567676e-06, + "loss": 0.2276, + "step": 24205 + }, + { + "epoch": 0.6125464989751246, + "grad_norm": 4.89414119720459, + "learning_rate": 3.32665475032647e-06, + "loss": 0.1387, + "step": 24206 + }, + { + "epoch": 0.6125718045398183, + "grad_norm": 4.700153827667236, + "learning_rate": 3.326276386687265e-06, + "loss": 0.1822, + "step": 24207 + }, + { + "epoch": 0.612597110104512, + "grad_norm": 4.3509650230407715, + "learning_rate": 3.3258980338415926e-06, + "loss": 0.1933, + "step": 24208 + }, + { + "epoch": 0.6126224156692056, + "grad_norm": 3.6363930702209473, + "learning_rate": 3.325519691791891e-06, + "loss": 0.1559, + "step": 24209 + }, + { + "epoch": 0.6126477212338993, + "grad_norm": 2.7400474548339844, + "learning_rate": 3.3251413605406012e-06, + "loss": 0.1055, + "step": 24210 + }, + { + "epoch": 0.612673026798593, + "grad_norm": 3.2665700912475586, + "learning_rate": 3.3247630400901642e-06, + "loss": 0.1554, + "step": 24211 + }, + { + "epoch": 0.6126983323632866, + "grad_norm": 3.3693923950195312, + "learning_rate": 3.32438473044302e-06, + "loss": 0.1524, + "step": 24212 + }, + { + "epoch": 0.6127236379279803, + "grad_norm": 7.137991428375244, + "learning_rate": 3.3240064316016047e-06, + "loss": 0.2387, + "step": 24213 + }, + { + "epoch": 0.6127489434926741, + "grad_norm": 3.5392069816589355, + "learning_rate": 3.3236281435683616e-06, + "loss": 0.1721, + "step": 24214 + }, + { + "epoch": 0.6127742490573678, + "grad_norm": 3.4746336936950684, + "learning_rate": 3.323249866345728e-06, + "loss": 0.0825, + "step": 24215 + }, + { + "epoch": 0.6127995546220614, + "grad_norm": 5.499587535858154, + "learning_rate": 3.322871599936144e-06, + "loss": 0.1901, + "step": 24216 + }, + { + "epoch": 0.6128248601867551, + "grad_norm": 7.278421401977539, + "learning_rate": 3.3224933443420505e-06, + "loss": 0.1467, + "step": 24217 + }, + { + "epoch": 0.6128501657514488, + "grad_norm": 3.7743453979492188, + "learning_rate": 3.3221150995658834e-06, + "loss": 0.1812, + "step": 24218 + }, + { + "epoch": 0.6128754713161424, + "grad_norm": 5.434112071990967, + "learning_rate": 3.3217368656100845e-06, + "loss": 0.1933, + "step": 24219 + }, + { + "epoch": 0.6129007768808361, + "grad_norm": 8.97232437133789, + "learning_rate": 3.3213586424770923e-06, + "loss": 0.2723, + "step": 24220 + }, + { + "epoch": 0.6129260824455298, + "grad_norm": 4.401416301727295, + "learning_rate": 3.320980430169348e-06, + "loss": 0.1321, + "step": 24221 + }, + { + "epoch": 0.6129513880102234, + "grad_norm": 3.997131109237671, + "learning_rate": 3.3206022286892862e-06, + "loss": 0.1129, + "step": 24222 + }, + { + "epoch": 0.6129766935749171, + "grad_norm": 6.307417392730713, + "learning_rate": 3.3202240380393486e-06, + "loss": 0.1069, + "step": 24223 + }, + { + "epoch": 0.6130019991396108, + "grad_norm": 7.670380592346191, + "learning_rate": 3.319845858221974e-06, + "loss": 0.1268, + "step": 24224 + }, + { + "epoch": 0.6130273047043044, + "grad_norm": 4.424076557159424, + "learning_rate": 3.319467689239602e-06, + "loss": 0.1552, + "step": 24225 + }, + { + "epoch": 0.6130526102689982, + "grad_norm": 11.134879112243652, + "learning_rate": 3.319089531094669e-06, + "loss": 0.3416, + "step": 24226 + }, + { + "epoch": 0.6130779158336919, + "grad_norm": 4.103386878967285, + "learning_rate": 3.3187113837896147e-06, + "loss": 0.2011, + "step": 24227 + }, + { + "epoch": 0.6131032213983855, + "grad_norm": 1.4314393997192383, + "learning_rate": 3.318333247326878e-06, + "loss": 0.0377, + "step": 24228 + }, + { + "epoch": 0.6131285269630792, + "grad_norm": 4.911862850189209, + "learning_rate": 3.3179551217088967e-06, + "loss": 0.1492, + "step": 24229 + }, + { + "epoch": 0.6131538325277729, + "grad_norm": 5.074692249298096, + "learning_rate": 3.317577006938113e-06, + "loss": 0.1573, + "step": 24230 + }, + { + "epoch": 0.6131791380924665, + "grad_norm": 2.705256700515747, + "learning_rate": 3.31719890301696e-06, + "loss": 0.0964, + "step": 24231 + }, + { + "epoch": 0.6132044436571602, + "grad_norm": 11.619054794311523, + "learning_rate": 3.3168208099478783e-06, + "loss": 0.1307, + "step": 24232 + }, + { + "epoch": 0.6132297492218539, + "grad_norm": 3.533219814300537, + "learning_rate": 3.3164427277333054e-06, + "loss": 0.1938, + "step": 24233 + }, + { + "epoch": 0.6132550547865475, + "grad_norm": 3.854658365249634, + "learning_rate": 3.316064656375683e-06, + "loss": 0.1597, + "step": 24234 + }, + { + "epoch": 0.6132803603512412, + "grad_norm": 17.175012588500977, + "learning_rate": 3.3156865958774436e-06, + "loss": 0.3281, + "step": 24235 + }, + { + "epoch": 0.6133056659159349, + "grad_norm": 7.3957014083862305, + "learning_rate": 3.3153085462410284e-06, + "loss": 0.1883, + "step": 24236 + }, + { + "epoch": 0.6133309714806285, + "grad_norm": 4.810924053192139, + "learning_rate": 3.3149305074688755e-06, + "loss": 0.1859, + "step": 24237 + }, + { + "epoch": 0.6133562770453223, + "grad_norm": 7.1645307540893555, + "learning_rate": 3.3145524795634227e-06, + "loss": 0.229, + "step": 24238 + }, + { + "epoch": 0.613381582610016, + "grad_norm": 4.5067620277404785, + "learning_rate": 3.3141744625271076e-06, + "loss": 0.1649, + "step": 24239 + }, + { + "epoch": 0.6134068881747096, + "grad_norm": 7.991616249084473, + "learning_rate": 3.313796456362367e-06, + "loss": 0.1944, + "step": 24240 + }, + { + "epoch": 0.6134321937394033, + "grad_norm": 2.1518445014953613, + "learning_rate": 3.31341846107164e-06, + "loss": 0.1412, + "step": 24241 + }, + { + "epoch": 0.613457499304097, + "grad_norm": 2.9300262928009033, + "learning_rate": 3.3130404766573634e-06, + "loss": 0.1579, + "step": 24242 + }, + { + "epoch": 0.6134828048687907, + "grad_norm": 3.254274845123291, + "learning_rate": 3.312662503121975e-06, + "loss": 0.1765, + "step": 24243 + }, + { + "epoch": 0.6135081104334843, + "grad_norm": 12.082097053527832, + "learning_rate": 3.3122845404679127e-06, + "loss": 0.2959, + "step": 24244 + }, + { + "epoch": 0.613533415998178, + "grad_norm": 5.903003215789795, + "learning_rate": 3.311906588697612e-06, + "loss": 0.1741, + "step": 24245 + }, + { + "epoch": 0.6135587215628717, + "grad_norm": 5.832650184631348, + "learning_rate": 3.3115286478135123e-06, + "loss": 0.1961, + "step": 24246 + }, + { + "epoch": 0.6135840271275653, + "grad_norm": 9.147441864013672, + "learning_rate": 3.3111507178180513e-06, + "loss": 0.2623, + "step": 24247 + }, + { + "epoch": 0.613609332692259, + "grad_norm": 3.5460755825042725, + "learning_rate": 3.310772798713665e-06, + "loss": 0.124, + "step": 24248 + }, + { + "epoch": 0.6136346382569527, + "grad_norm": 3.4291632175445557, + "learning_rate": 3.3103948905027894e-06, + "loss": 0.1763, + "step": 24249 + }, + { + "epoch": 0.6136599438216463, + "grad_norm": 8.292394638061523, + "learning_rate": 3.3100169931878632e-06, + "loss": 0.1552, + "step": 24250 + }, + { + "epoch": 0.6136852493863401, + "grad_norm": 15.908295631408691, + "learning_rate": 3.309639106771323e-06, + "loss": 0.2141, + "step": 24251 + }, + { + "epoch": 0.6137105549510338, + "grad_norm": 6.013322830200195, + "learning_rate": 3.3092612312556077e-06, + "loss": 0.1321, + "step": 24252 + }, + { + "epoch": 0.6137358605157274, + "grad_norm": 3.0914530754089355, + "learning_rate": 3.3088833666431502e-06, + "loss": 0.1033, + "step": 24253 + }, + { + "epoch": 0.6137611660804211, + "grad_norm": 3.8744168281555176, + "learning_rate": 3.3085055129363897e-06, + "loss": 0.1193, + "step": 24254 + }, + { + "epoch": 0.6137864716451148, + "grad_norm": 3.2337558269500732, + "learning_rate": 3.308127670137762e-06, + "loss": 0.1542, + "step": 24255 + }, + { + "epoch": 0.6138117772098084, + "grad_norm": 4.137125492095947, + "learning_rate": 3.3077498382497064e-06, + "loss": 0.1275, + "step": 24256 + }, + { + "epoch": 0.6138370827745021, + "grad_norm": 3.6269032955169678, + "learning_rate": 3.3073720172746554e-06, + "loss": 0.1279, + "step": 24257 + }, + { + "epoch": 0.6138623883391958, + "grad_norm": 6.107946395874023, + "learning_rate": 3.3069942072150474e-06, + "loss": 0.2505, + "step": 24258 + }, + { + "epoch": 0.6138876939038894, + "grad_norm": 4.514678955078125, + "learning_rate": 3.306616408073319e-06, + "loss": 0.111, + "step": 24259 + }, + { + "epoch": 0.6139129994685831, + "grad_norm": 2.5973281860351562, + "learning_rate": 3.3062386198519057e-06, + "loss": 0.1362, + "step": 24260 + }, + { + "epoch": 0.6139383050332768, + "grad_norm": 4.139642238616943, + "learning_rate": 3.305860842553247e-06, + "loss": 0.1966, + "step": 24261 + }, + { + "epoch": 0.6139636105979704, + "grad_norm": 3.6625590324401855, + "learning_rate": 3.3054830761797742e-06, + "loss": 0.1403, + "step": 24262 + }, + { + "epoch": 0.6139889161626642, + "grad_norm": 5.993992805480957, + "learning_rate": 3.3051053207339267e-06, + "loss": 0.1912, + "step": 24263 + }, + { + "epoch": 0.6140142217273579, + "grad_norm": 7.048152923583984, + "learning_rate": 3.3047275762181385e-06, + "loss": 0.1195, + "step": 24264 + }, + { + "epoch": 0.6140395272920515, + "grad_norm": 3.7354214191436768, + "learning_rate": 3.3043498426348495e-06, + "loss": 0.0972, + "step": 24265 + }, + { + "epoch": 0.6140648328567452, + "grad_norm": 3.8740670680999756, + "learning_rate": 3.3039721199864916e-06, + "loss": 0.1341, + "step": 24266 + }, + { + "epoch": 0.6140901384214389, + "grad_norm": 2.9095840454101562, + "learning_rate": 3.3035944082755013e-06, + "loss": 0.1026, + "step": 24267 + }, + { + "epoch": 0.6141154439861326, + "grad_norm": 5.468634605407715, + "learning_rate": 3.303216707504316e-06, + "loss": 0.1126, + "step": 24268 + }, + { + "epoch": 0.6141407495508262, + "grad_norm": 5.719820022583008, + "learning_rate": 3.3028390176753706e-06, + "loss": 0.1726, + "step": 24269 + }, + { + "epoch": 0.6141660551155199, + "grad_norm": 6.884454727172852, + "learning_rate": 3.3024613387911e-06, + "loss": 0.1304, + "step": 24270 + }, + { + "epoch": 0.6141913606802136, + "grad_norm": 2.776193857192993, + "learning_rate": 3.30208367085394e-06, + "loss": 0.1483, + "step": 24271 + }, + { + "epoch": 0.6142166662449072, + "grad_norm": 4.60099458694458, + "learning_rate": 3.3017060138663277e-06, + "loss": 0.1294, + "step": 24272 + }, + { + "epoch": 0.6142419718096009, + "grad_norm": 6.126185894012451, + "learning_rate": 3.3013283678306964e-06, + "loss": 0.1555, + "step": 24273 + }, + { + "epoch": 0.6142672773742947, + "grad_norm": 4.774694442749023, + "learning_rate": 3.300950732749484e-06, + "loss": 0.1498, + "step": 24274 + }, + { + "epoch": 0.6142925829389883, + "grad_norm": 4.150766372680664, + "learning_rate": 3.3005731086251235e-06, + "loss": 0.0827, + "step": 24275 + }, + { + "epoch": 0.614317888503682, + "grad_norm": 3.3081068992614746, + "learning_rate": 3.3001954954600503e-06, + "loss": 0.0864, + "step": 24276 + }, + { + "epoch": 0.6143431940683757, + "grad_norm": 4.25377893447876, + "learning_rate": 3.2998178932567003e-06, + "loss": 0.1295, + "step": 24277 + }, + { + "epoch": 0.6143684996330693, + "grad_norm": 13.086905479431152, + "learning_rate": 3.2994403020175103e-06, + "loss": 0.288, + "step": 24278 + }, + { + "epoch": 0.614393805197763, + "grad_norm": 7.070220947265625, + "learning_rate": 3.2990627217449113e-06, + "loss": 0.1733, + "step": 24279 + }, + { + "epoch": 0.6144191107624567, + "grad_norm": 3.0435140132904053, + "learning_rate": 3.2986851524413404e-06, + "loss": 0.1283, + "step": 24280 + }, + { + "epoch": 0.6144444163271503, + "grad_norm": 12.365808486938477, + "learning_rate": 3.2983075941092326e-06, + "loss": 0.3465, + "step": 24281 + }, + { + "epoch": 0.614469721891844, + "grad_norm": 7.776510715484619, + "learning_rate": 3.2979300467510223e-06, + "loss": 0.1959, + "step": 24282 + }, + { + "epoch": 0.6144950274565377, + "grad_norm": 5.999355316162109, + "learning_rate": 3.2975525103691474e-06, + "loss": 0.1419, + "step": 24283 + }, + { + "epoch": 0.6145203330212313, + "grad_norm": 5.065612316131592, + "learning_rate": 3.297174984966037e-06, + "loss": 0.1911, + "step": 24284 + }, + { + "epoch": 0.614545638585925, + "grad_norm": 4.220525741577148, + "learning_rate": 3.2967974705441287e-06, + "loss": 0.0997, + "step": 24285 + }, + { + "epoch": 0.6145709441506187, + "grad_norm": 5.896828651428223, + "learning_rate": 3.2964199671058563e-06, + "loss": 0.1998, + "step": 24286 + }, + { + "epoch": 0.6145962497153123, + "grad_norm": 15.511750221252441, + "learning_rate": 3.2960424746536566e-06, + "loss": 0.1279, + "step": 24287 + }, + { + "epoch": 0.6146215552800061, + "grad_norm": 7.303651809692383, + "learning_rate": 3.2956649931899603e-06, + "loss": 0.1409, + "step": 24288 + }, + { + "epoch": 0.6146468608446998, + "grad_norm": 2.604137897491455, + "learning_rate": 3.295287522717203e-06, + "loss": 0.0966, + "step": 24289 + }, + { + "epoch": 0.6146721664093934, + "grad_norm": 6.833940029144287, + "learning_rate": 3.29491006323782e-06, + "loss": 0.119, + "step": 24290 + }, + { + "epoch": 0.6146974719740871, + "grad_norm": 4.1740336418151855, + "learning_rate": 3.2945326147542446e-06, + "loss": 0.2, + "step": 24291 + }, + { + "epoch": 0.6147227775387808, + "grad_norm": 5.7977070808410645, + "learning_rate": 3.2941551772689127e-06, + "loss": 0.1622, + "step": 24292 + }, + { + "epoch": 0.6147480831034745, + "grad_norm": 3.0818393230438232, + "learning_rate": 3.2937777507842556e-06, + "loss": 0.1301, + "step": 24293 + }, + { + "epoch": 0.6147733886681681, + "grad_norm": 3.7767045497894287, + "learning_rate": 3.2934003353027077e-06, + "loss": 0.119, + "step": 24294 + }, + { + "epoch": 0.6147986942328618, + "grad_norm": 6.224076271057129, + "learning_rate": 3.2930229308267046e-06, + "loss": 0.1567, + "step": 24295 + }, + { + "epoch": 0.6148239997975555, + "grad_norm": 3.461773633956909, + "learning_rate": 3.2926455373586794e-06, + "loss": 0.1083, + "step": 24296 + }, + { + "epoch": 0.6148493053622491, + "grad_norm": 15.934874534606934, + "learning_rate": 3.292268154901065e-06, + "loss": 0.1989, + "step": 24297 + }, + { + "epoch": 0.6148746109269428, + "grad_norm": 7.6614227294921875, + "learning_rate": 3.2918907834562953e-06, + "loss": 0.1861, + "step": 24298 + }, + { + "epoch": 0.6148999164916366, + "grad_norm": 7.975919246673584, + "learning_rate": 3.291513423026804e-06, + "loss": 0.1627, + "step": 24299 + }, + { + "epoch": 0.6149252220563302, + "grad_norm": 9.74292278289795, + "learning_rate": 3.291136073615027e-06, + "loss": 0.214, + "step": 24300 + }, + { + "epoch": 0.6149505276210239, + "grad_norm": 6.813149452209473, + "learning_rate": 3.290758735223393e-06, + "loss": 0.2358, + "step": 24301 + }, + { + "epoch": 0.6149758331857176, + "grad_norm": 3.293078660964966, + "learning_rate": 3.2903814078543393e-06, + "loss": 0.1039, + "step": 24302 + }, + { + "epoch": 0.6150011387504112, + "grad_norm": 3.3063080310821533, + "learning_rate": 3.290004091510297e-06, + "loss": 0.1818, + "step": 24303 + }, + { + "epoch": 0.6150264443151049, + "grad_norm": 9.172835350036621, + "learning_rate": 3.2896267861937005e-06, + "loss": 0.1701, + "step": 24304 + }, + { + "epoch": 0.6150517498797986, + "grad_norm": 6.359882831573486, + "learning_rate": 3.289249491906984e-06, + "loss": 0.1389, + "step": 24305 + }, + { + "epoch": 0.6150770554444922, + "grad_norm": 2.657599449157715, + "learning_rate": 3.2888722086525774e-06, + "loss": 0.1048, + "step": 24306 + }, + { + "epoch": 0.6151023610091859, + "grad_norm": 3.2759697437286377, + "learning_rate": 3.288494936432916e-06, + "loss": 0.1428, + "step": 24307 + }, + { + "epoch": 0.6151276665738796, + "grad_norm": 3.1355321407318115, + "learning_rate": 3.2881176752504322e-06, + "loss": 0.0716, + "step": 24308 + }, + { + "epoch": 0.6151529721385732, + "grad_norm": 5.154598236083984, + "learning_rate": 3.2877404251075616e-06, + "loss": 0.1842, + "step": 24309 + }, + { + "epoch": 0.6151782777032669, + "grad_norm": 4.762909889221191, + "learning_rate": 3.2873631860067324e-06, + "loss": 0.0771, + "step": 24310 + }, + { + "epoch": 0.6152035832679607, + "grad_norm": 3.156672477722168, + "learning_rate": 3.2869859579503784e-06, + "loss": 0.1165, + "step": 24311 + }, + { + "epoch": 0.6152288888326543, + "grad_norm": 2.7324018478393555, + "learning_rate": 3.286608740940934e-06, + "loss": 0.0893, + "step": 24312 + }, + { + "epoch": 0.615254194397348, + "grad_norm": 5.442138671875, + "learning_rate": 3.2862315349808318e-06, + "loss": 0.1371, + "step": 24313 + }, + { + "epoch": 0.6152794999620417, + "grad_norm": 5.9434733390808105, + "learning_rate": 3.2858543400725045e-06, + "loss": 0.1194, + "step": 24314 + }, + { + "epoch": 0.6153048055267353, + "grad_norm": 2.865082025527954, + "learning_rate": 3.2854771562183823e-06, + "loss": 0.1469, + "step": 24315 + }, + { + "epoch": 0.615330111091429, + "grad_norm": 3.657111406326294, + "learning_rate": 3.2850999834208984e-06, + "loss": 0.131, + "step": 24316 + }, + { + "epoch": 0.6153554166561227, + "grad_norm": 6.2530622482299805, + "learning_rate": 3.284722821682487e-06, + "loss": 0.2011, + "step": 24317 + }, + { + "epoch": 0.6153807222208164, + "grad_norm": 8.645783424377441, + "learning_rate": 3.28434567100558e-06, + "loss": 0.3468, + "step": 24318 + }, + { + "epoch": 0.61540602778551, + "grad_norm": 3.2743172645568848, + "learning_rate": 3.2839685313926067e-06, + "loss": 0.1337, + "step": 24319 + }, + { + "epoch": 0.6154313333502037, + "grad_norm": 5.034653663635254, + "learning_rate": 3.2835914028460014e-06, + "loss": 0.1507, + "step": 24320 + }, + { + "epoch": 0.6154566389148974, + "grad_norm": 4.2987189292907715, + "learning_rate": 3.2832142853681957e-06, + "loss": 0.0889, + "step": 24321 + }, + { + "epoch": 0.615481944479591, + "grad_norm": 3.863032579421997, + "learning_rate": 3.2828371789616243e-06, + "loss": 0.1925, + "step": 24322 + }, + { + "epoch": 0.6155072500442847, + "grad_norm": 6.334584712982178, + "learning_rate": 3.2824600836287137e-06, + "loss": 0.1223, + "step": 24323 + }, + { + "epoch": 0.6155325556089785, + "grad_norm": 3.580153226852417, + "learning_rate": 3.282082999371899e-06, + "loss": 0.0982, + "step": 24324 + }, + { + "epoch": 0.6155578611736721, + "grad_norm": 6.465773105621338, + "learning_rate": 3.2817059261936114e-06, + "loss": 0.1638, + "step": 24325 + }, + { + "epoch": 0.6155831667383658, + "grad_norm": 4.073798179626465, + "learning_rate": 3.281328864096284e-06, + "loss": 0.1558, + "step": 24326 + }, + { + "epoch": 0.6156084723030595, + "grad_norm": 6.415587902069092, + "learning_rate": 3.280951813082346e-06, + "loss": 0.2736, + "step": 24327 + }, + { + "epoch": 0.6156337778677531, + "grad_norm": 4.135483741760254, + "learning_rate": 3.2805747731542297e-06, + "loss": 0.1367, + "step": 24328 + }, + { + "epoch": 0.6156590834324468, + "grad_norm": 7.122622489929199, + "learning_rate": 3.280197744314368e-06, + "loss": 0.1305, + "step": 24329 + }, + { + "epoch": 0.6156843889971405, + "grad_norm": 8.700773239135742, + "learning_rate": 3.27982072656519e-06, + "loss": 0.1469, + "step": 24330 + }, + { + "epoch": 0.6157096945618341, + "grad_norm": 11.673422813415527, + "learning_rate": 3.2794437199091287e-06, + "loss": 0.3667, + "step": 24331 + }, + { + "epoch": 0.6157350001265278, + "grad_norm": 5.751548767089844, + "learning_rate": 3.2790667243486143e-06, + "loss": 0.1866, + "step": 24332 + }, + { + "epoch": 0.6157603056912215, + "grad_norm": 6.96351432800293, + "learning_rate": 3.2786897398860797e-06, + "loss": 0.1985, + "step": 24333 + }, + { + "epoch": 0.6157856112559151, + "grad_norm": 4.188699245452881, + "learning_rate": 3.2783127665239534e-06, + "loss": 0.1427, + "step": 24334 + }, + { + "epoch": 0.6158109168206088, + "grad_norm": 6.271363735198975, + "learning_rate": 3.277935804264668e-06, + "loss": 0.1019, + "step": 24335 + }, + { + "epoch": 0.6158362223853026, + "grad_norm": 13.505722045898438, + "learning_rate": 3.2775588531106563e-06, + "loss": 0.1627, + "step": 24336 + }, + { + "epoch": 0.6158615279499962, + "grad_norm": 3.9572620391845703, + "learning_rate": 3.2771819130643445e-06, + "loss": 0.1774, + "step": 24337 + }, + { + "epoch": 0.6158868335146899, + "grad_norm": 7.10359525680542, + "learning_rate": 3.2768049841281667e-06, + "loss": 0.2163, + "step": 24338 + }, + { + "epoch": 0.6159121390793836, + "grad_norm": 4.070131301879883, + "learning_rate": 3.276428066304553e-06, + "loss": 0.1127, + "step": 24339 + }, + { + "epoch": 0.6159374446440772, + "grad_norm": 6.309701919555664, + "learning_rate": 3.2760511595959354e-06, + "loss": 0.217, + "step": 24340 + }, + { + "epoch": 0.6159627502087709, + "grad_norm": 2.5433132648468018, + "learning_rate": 3.275674264004742e-06, + "loss": 0.1274, + "step": 24341 + }, + { + "epoch": 0.6159880557734646, + "grad_norm": 8.204570770263672, + "learning_rate": 3.2752973795334043e-06, + "loss": 0.2182, + "step": 24342 + }, + { + "epoch": 0.6160133613381583, + "grad_norm": 4.010204792022705, + "learning_rate": 3.274920506184353e-06, + "loss": 0.1629, + "step": 24343 + }, + { + "epoch": 0.6160386669028519, + "grad_norm": 5.359607696533203, + "learning_rate": 3.274543643960021e-06, + "loss": 0.2341, + "step": 24344 + }, + { + "epoch": 0.6160639724675456, + "grad_norm": 4.758331775665283, + "learning_rate": 3.274166792862833e-06, + "loss": 0.1395, + "step": 24345 + }, + { + "epoch": 0.6160892780322393, + "grad_norm": 7.6890130043029785, + "learning_rate": 3.2737899528952235e-06, + "loss": 0.1897, + "step": 24346 + }, + { + "epoch": 0.6161145835969329, + "grad_norm": 7.177427768707275, + "learning_rate": 3.27341312405962e-06, + "loss": 0.1959, + "step": 24347 + }, + { + "epoch": 0.6161398891616267, + "grad_norm": 5.580839157104492, + "learning_rate": 3.273036306358455e-06, + "loss": 0.1468, + "step": 24348 + }, + { + "epoch": 0.6161651947263204, + "grad_norm": 5.390634059906006, + "learning_rate": 3.2726594997941596e-06, + "loss": 0.1767, + "step": 24349 + }, + { + "epoch": 0.616190500291014, + "grad_norm": 9.696014404296875, + "learning_rate": 3.2722827043691598e-06, + "loss": 0.2049, + "step": 24350 + }, + { + "epoch": 0.6162158058557077, + "grad_norm": 4.41063117980957, + "learning_rate": 3.2719059200858873e-06, + "loss": 0.1873, + "step": 24351 + }, + { + "epoch": 0.6162411114204014, + "grad_norm": 3.7767255306243896, + "learning_rate": 3.2715291469467723e-06, + "loss": 0.1846, + "step": 24352 + }, + { + "epoch": 0.616266416985095, + "grad_norm": 16.321557998657227, + "learning_rate": 3.271152384954246e-06, + "loss": 0.1497, + "step": 24353 + }, + { + "epoch": 0.6162917225497887, + "grad_norm": 6.281162261962891, + "learning_rate": 3.2707756341107343e-06, + "loss": 0.0853, + "step": 24354 + }, + { + "epoch": 0.6163170281144824, + "grad_norm": 4.864783763885498, + "learning_rate": 3.2703988944186694e-06, + "loss": 0.1695, + "step": 24355 + }, + { + "epoch": 0.616342333679176, + "grad_norm": 6.055398941040039, + "learning_rate": 3.270022165880481e-06, + "loss": 0.1186, + "step": 24356 + }, + { + "epoch": 0.6163676392438697, + "grad_norm": 6.558795928955078, + "learning_rate": 3.269645448498597e-06, + "loss": 0.1201, + "step": 24357 + }, + { + "epoch": 0.6163929448085634, + "grad_norm": 21.802139282226562, + "learning_rate": 3.269268742275449e-06, + "loss": 0.2188, + "step": 24358 + }, + { + "epoch": 0.616418250373257, + "grad_norm": 3.240382194519043, + "learning_rate": 3.268892047213463e-06, + "loss": 0.1275, + "step": 24359 + }, + { + "epoch": 0.6164435559379507, + "grad_norm": 5.441836833953857, + "learning_rate": 3.2685153633150718e-06, + "loss": 0.1411, + "step": 24360 + }, + { + "epoch": 0.6164688615026445, + "grad_norm": 4.260496139526367, + "learning_rate": 3.2681386905827015e-06, + "loss": 0.1433, + "step": 24361 + }, + { + "epoch": 0.6164941670673381, + "grad_norm": 2.5014805793762207, + "learning_rate": 3.267762029018784e-06, + "loss": 0.0774, + "step": 24362 + }, + { + "epoch": 0.6165194726320318, + "grad_norm": 14.840968132019043, + "learning_rate": 3.267385378625747e-06, + "loss": 0.2175, + "step": 24363 + }, + { + "epoch": 0.6165447781967255, + "grad_norm": 4.849469184875488, + "learning_rate": 3.267008739406018e-06, + "loss": 0.1871, + "step": 24364 + }, + { + "epoch": 0.6165700837614191, + "grad_norm": 3.3545448780059814, + "learning_rate": 3.266632111362028e-06, + "loss": 0.1387, + "step": 24365 + }, + { + "epoch": 0.6165953893261128, + "grad_norm": 6.57682466506958, + "learning_rate": 3.2662554944962043e-06, + "loss": 0.1892, + "step": 24366 + }, + { + "epoch": 0.6166206948908065, + "grad_norm": 4.42071533203125, + "learning_rate": 3.265878888810979e-06, + "loss": 0.124, + "step": 24367 + }, + { + "epoch": 0.6166460004555001, + "grad_norm": 5.589835166931152, + "learning_rate": 3.2655022943087757e-06, + "loss": 0.1399, + "step": 24368 + }, + { + "epoch": 0.6166713060201938, + "grad_norm": 2.800635814666748, + "learning_rate": 3.2651257109920253e-06, + "loss": 0.0492, + "step": 24369 + }, + { + "epoch": 0.6166966115848875, + "grad_norm": 4.470279216766357, + "learning_rate": 3.264749138863157e-06, + "loss": 0.1299, + "step": 24370 + }, + { + "epoch": 0.6167219171495812, + "grad_norm": 8.124045372009277, + "learning_rate": 3.2643725779246004e-06, + "loss": 0.1769, + "step": 24371 + }, + { + "epoch": 0.6167472227142748, + "grad_norm": 5.731405735015869, + "learning_rate": 3.2639960281787807e-06, + "loss": 0.1337, + "step": 24372 + }, + { + "epoch": 0.6167725282789686, + "grad_norm": 3.9333269596099854, + "learning_rate": 3.263619489628127e-06, + "loss": 0.1705, + "step": 24373 + }, + { + "epoch": 0.6167978338436623, + "grad_norm": 4.2099480628967285, + "learning_rate": 3.263242962275069e-06, + "loss": 0.1496, + "step": 24374 + }, + { + "epoch": 0.6168231394083559, + "grad_norm": 6.783939838409424, + "learning_rate": 3.2628664461220356e-06, + "loss": 0.2091, + "step": 24375 + }, + { + "epoch": 0.6168484449730496, + "grad_norm": 4.526996612548828, + "learning_rate": 3.2624899411714517e-06, + "loss": 0.1186, + "step": 24376 + }, + { + "epoch": 0.6168737505377433, + "grad_norm": 6.835444927215576, + "learning_rate": 3.2621134474257465e-06, + "loss": 0.1429, + "step": 24377 + }, + { + "epoch": 0.6168990561024369, + "grad_norm": 5.614286422729492, + "learning_rate": 3.2617369648873486e-06, + "loss": 0.2325, + "step": 24378 + }, + { + "epoch": 0.6169243616671306, + "grad_norm": 6.548288345336914, + "learning_rate": 3.261360493558685e-06, + "loss": 0.1854, + "step": 24379 + }, + { + "epoch": 0.6169496672318243, + "grad_norm": 5.345726490020752, + "learning_rate": 3.260984033442187e-06, + "loss": 0.2147, + "step": 24380 + }, + { + "epoch": 0.6169749727965179, + "grad_norm": 6.361632823944092, + "learning_rate": 3.2606075845402776e-06, + "loss": 0.1357, + "step": 24381 + }, + { + "epoch": 0.6170002783612116, + "grad_norm": 3.01090145111084, + "learning_rate": 3.260231146855386e-06, + "loss": 0.1354, + "step": 24382 + }, + { + "epoch": 0.6170255839259053, + "grad_norm": 4.665452480316162, + "learning_rate": 3.25985472038994e-06, + "loss": 0.1393, + "step": 24383 + }, + { + "epoch": 0.6170508894905989, + "grad_norm": 8.381396293640137, + "learning_rate": 3.2594783051463697e-06, + "loss": 0.1485, + "step": 24384 + }, + { + "epoch": 0.6170761950552927, + "grad_norm": 20.619075775146484, + "learning_rate": 3.2591019011270976e-06, + "loss": 0.2031, + "step": 24385 + }, + { + "epoch": 0.6171015006199864, + "grad_norm": 7.489211559295654, + "learning_rate": 3.258725508334554e-06, + "loss": 0.1624, + "step": 24386 + }, + { + "epoch": 0.61712680618468, + "grad_norm": 4.205234050750732, + "learning_rate": 3.258349126771166e-06, + "loss": 0.1679, + "step": 24387 + }, + { + "epoch": 0.6171521117493737, + "grad_norm": 5.7333526611328125, + "learning_rate": 3.2579727564393596e-06, + "loss": 0.2302, + "step": 24388 + }, + { + "epoch": 0.6171774173140674, + "grad_norm": 4.751302719116211, + "learning_rate": 3.2575963973415642e-06, + "loss": 0.1503, + "step": 24389 + }, + { + "epoch": 0.617202722878761, + "grad_norm": 12.19001293182373, + "learning_rate": 3.257220049480205e-06, + "loss": 0.1546, + "step": 24390 + }, + { + "epoch": 0.6172280284434547, + "grad_norm": 4.052797317504883, + "learning_rate": 3.256843712857709e-06, + "loss": 0.1081, + "step": 24391 + }, + { + "epoch": 0.6172533340081484, + "grad_norm": 2.2073264122009277, + "learning_rate": 3.256467387476504e-06, + "loss": 0.1021, + "step": 24392 + }, + { + "epoch": 0.617278639572842, + "grad_norm": 5.201021194458008, + "learning_rate": 3.256091073339017e-06, + "loss": 0.1974, + "step": 24393 + }, + { + "epoch": 0.6173039451375357, + "grad_norm": 8.389518737792969, + "learning_rate": 3.2557147704476743e-06, + "loss": 0.2956, + "step": 24394 + }, + { + "epoch": 0.6173292507022294, + "grad_norm": 4.691206455230713, + "learning_rate": 3.2553384788049015e-06, + "loss": 0.1918, + "step": 24395 + }, + { + "epoch": 0.6173545562669231, + "grad_norm": 5.926568984985352, + "learning_rate": 3.2549621984131264e-06, + "loss": 0.1913, + "step": 24396 + }, + { + "epoch": 0.6173798618316167, + "grad_norm": 9.686767578125, + "learning_rate": 3.2545859292747777e-06, + "loss": 0.2173, + "step": 24397 + }, + { + "epoch": 0.6174051673963105, + "grad_norm": 10.917706489562988, + "learning_rate": 3.2542096713922776e-06, + "loss": 0.1817, + "step": 24398 + }, + { + "epoch": 0.6174304729610042, + "grad_norm": 5.175333499908447, + "learning_rate": 3.253833424768055e-06, + "loss": 0.1933, + "step": 24399 + }, + { + "epoch": 0.6174557785256978, + "grad_norm": 4.573705196380615, + "learning_rate": 3.2534571894045348e-06, + "loss": 0.103, + "step": 24400 + }, + { + "epoch": 0.6174810840903915, + "grad_norm": 7.688569068908691, + "learning_rate": 3.2530809653041447e-06, + "loss": 0.2978, + "step": 24401 + }, + { + "epoch": 0.6175063896550852, + "grad_norm": 6.230615615844727, + "learning_rate": 3.2527047524693122e-06, + "loss": 0.1453, + "step": 24402 + }, + { + "epoch": 0.6175316952197788, + "grad_norm": 3.853694200515747, + "learning_rate": 3.2523285509024604e-06, + "loss": 0.1031, + "step": 24403 + }, + { + "epoch": 0.6175570007844725, + "grad_norm": 5.746732711791992, + "learning_rate": 3.2519523606060164e-06, + "loss": 0.1555, + "step": 24404 + }, + { + "epoch": 0.6175823063491662, + "grad_norm": 7.636319637298584, + "learning_rate": 3.2515761815824065e-06, + "loss": 0.2265, + "step": 24405 + }, + { + "epoch": 0.6176076119138598, + "grad_norm": 8.734719276428223, + "learning_rate": 3.2512000138340588e-06, + "loss": 0.1943, + "step": 24406 + }, + { + "epoch": 0.6176329174785535, + "grad_norm": 4.832208156585693, + "learning_rate": 3.2508238573633944e-06, + "loss": 0.1949, + "step": 24407 + }, + { + "epoch": 0.6176582230432472, + "grad_norm": 4.8078389167785645, + "learning_rate": 3.250447712172842e-06, + "loss": 0.1499, + "step": 24408 + }, + { + "epoch": 0.6176835286079408, + "grad_norm": 3.6266045570373535, + "learning_rate": 3.2500715782648266e-06, + "loss": 0.1664, + "step": 24409 + }, + { + "epoch": 0.6177088341726346, + "grad_norm": 3.821709156036377, + "learning_rate": 3.249695455641774e-06, + "loss": 0.1712, + "step": 24410 + }, + { + "epoch": 0.6177341397373283, + "grad_norm": 6.182806015014648, + "learning_rate": 3.249319344306112e-06, + "loss": 0.2186, + "step": 24411 + }, + { + "epoch": 0.6177594453020219, + "grad_norm": 4.114181041717529, + "learning_rate": 3.248943244260262e-06, + "loss": 0.1362, + "step": 24412 + }, + { + "epoch": 0.6177847508667156, + "grad_norm": 3.5225038528442383, + "learning_rate": 3.248567155506651e-06, + "loss": 0.1448, + "step": 24413 + }, + { + "epoch": 0.6178100564314093, + "grad_norm": 2.4904494285583496, + "learning_rate": 3.2481910780477067e-06, + "loss": 0.1292, + "step": 24414 + }, + { + "epoch": 0.6178353619961029, + "grad_norm": 7.585994243621826, + "learning_rate": 3.2478150118858508e-06, + "loss": 0.2205, + "step": 24415 + }, + { + "epoch": 0.6178606675607966, + "grad_norm": 11.206377029418945, + "learning_rate": 3.24743895702351e-06, + "loss": 0.2153, + "step": 24416 + }, + { + "epoch": 0.6178859731254903, + "grad_norm": 3.567991256713867, + "learning_rate": 3.247062913463109e-06, + "loss": 0.1218, + "step": 24417 + }, + { + "epoch": 0.6179112786901839, + "grad_norm": 3.5778348445892334, + "learning_rate": 3.246686881207074e-06, + "loss": 0.1485, + "step": 24418 + }, + { + "epoch": 0.6179365842548776, + "grad_norm": 6.061243534088135, + "learning_rate": 3.246310860257828e-06, + "loss": 0.2225, + "step": 24419 + }, + { + "epoch": 0.6179618898195713, + "grad_norm": 3.215682029724121, + "learning_rate": 3.2459348506177984e-06, + "loss": 0.1435, + "step": 24420 + }, + { + "epoch": 0.617987195384265, + "grad_norm": 9.005363464355469, + "learning_rate": 3.245558852289409e-06, + "loss": 0.2091, + "step": 24421 + }, + { + "epoch": 0.6180125009489587, + "grad_norm": 2.600583553314209, + "learning_rate": 3.2451828652750827e-06, + "loss": 0.0986, + "step": 24422 + }, + { + "epoch": 0.6180378065136524, + "grad_norm": 3.9986722469329834, + "learning_rate": 3.2448068895772455e-06, + "loss": 0.1899, + "step": 24423 + }, + { + "epoch": 0.6180631120783461, + "grad_norm": 4.00386905670166, + "learning_rate": 3.2444309251983248e-06, + "loss": 0.1409, + "step": 24424 + }, + { + "epoch": 0.6180884176430397, + "grad_norm": 5.649981498718262, + "learning_rate": 3.2440549721407397e-06, + "loss": 0.2039, + "step": 24425 + }, + { + "epoch": 0.6181137232077334, + "grad_norm": 10.061803817749023, + "learning_rate": 3.2436790304069184e-06, + "loss": 0.1199, + "step": 24426 + }, + { + "epoch": 0.6181390287724271, + "grad_norm": 6.910001754760742, + "learning_rate": 3.2433030999992836e-06, + "loss": 0.0942, + "step": 24427 + }, + { + "epoch": 0.6181643343371207, + "grad_norm": 5.467278003692627, + "learning_rate": 3.242927180920263e-06, + "loss": 0.209, + "step": 24428 + }, + { + "epoch": 0.6181896399018144, + "grad_norm": 7.088507175445557, + "learning_rate": 3.2425512731722753e-06, + "loss": 0.153, + "step": 24429 + }, + { + "epoch": 0.6182149454665081, + "grad_norm": 2.692230224609375, + "learning_rate": 3.242175376757748e-06, + "loss": 0.1128, + "step": 24430 + }, + { + "epoch": 0.6182402510312017, + "grad_norm": 3.0565998554229736, + "learning_rate": 3.2417994916791053e-06, + "loss": 0.0652, + "step": 24431 + }, + { + "epoch": 0.6182655565958954, + "grad_norm": 10.681982040405273, + "learning_rate": 3.241423617938769e-06, + "loss": 0.2609, + "step": 24432 + }, + { + "epoch": 0.6182908621605891, + "grad_norm": 3.2601022720336914, + "learning_rate": 3.2410477555391683e-06, + "loss": 0.1176, + "step": 24433 + }, + { + "epoch": 0.6183161677252828, + "grad_norm": 23.86943817138672, + "learning_rate": 3.2406719044827206e-06, + "loss": 0.2258, + "step": 24434 + }, + { + "epoch": 0.6183414732899765, + "grad_norm": 7.681948661804199, + "learning_rate": 3.2402960647718527e-06, + "loss": 0.2103, + "step": 24435 + }, + { + "epoch": 0.6183667788546702, + "grad_norm": 8.912118911743164, + "learning_rate": 3.239920236408989e-06, + "loss": 0.275, + "step": 24436 + }, + { + "epoch": 0.6183920844193638, + "grad_norm": 5.779416084289551, + "learning_rate": 3.2395444193965532e-06, + "loss": 0.1821, + "step": 24437 + }, + { + "epoch": 0.6184173899840575, + "grad_norm": 3.8411402702331543, + "learning_rate": 3.239168613736966e-06, + "loss": 0.1429, + "step": 24438 + }, + { + "epoch": 0.6184426955487512, + "grad_norm": 3.3542070388793945, + "learning_rate": 3.2387928194326535e-06, + "loss": 0.1375, + "step": 24439 + }, + { + "epoch": 0.6184680011134448, + "grad_norm": 6.839024543762207, + "learning_rate": 3.238417036486038e-06, + "loss": 0.2553, + "step": 24440 + }, + { + "epoch": 0.6184933066781385, + "grad_norm": 5.17770528793335, + "learning_rate": 3.2380412648995443e-06, + "loss": 0.1675, + "step": 24441 + }, + { + "epoch": 0.6185186122428322, + "grad_norm": 8.471494674682617, + "learning_rate": 3.237665504675595e-06, + "loss": 0.2501, + "step": 24442 + }, + { + "epoch": 0.6185439178075258, + "grad_norm": 4.9537577629089355, + "learning_rate": 3.2372897558166123e-06, + "loss": 0.1713, + "step": 24443 + }, + { + "epoch": 0.6185692233722195, + "grad_norm": 3.972339630126953, + "learning_rate": 3.2369140183250194e-06, + "loss": 0.161, + "step": 24444 + }, + { + "epoch": 0.6185945289369132, + "grad_norm": 5.471653461456299, + "learning_rate": 3.2365382922032416e-06, + "loss": 0.1754, + "step": 24445 + }, + { + "epoch": 0.618619834501607, + "grad_norm": 3.6975080966949463, + "learning_rate": 3.2361625774536998e-06, + "loss": 0.129, + "step": 24446 + }, + { + "epoch": 0.6186451400663006, + "grad_norm": 3.9728915691375732, + "learning_rate": 3.235786874078817e-06, + "loss": 0.1458, + "step": 24447 + }, + { + "epoch": 0.6186704456309943, + "grad_norm": 5.914891719818115, + "learning_rate": 3.2354111820810175e-06, + "loss": 0.1671, + "step": 24448 + }, + { + "epoch": 0.618695751195688, + "grad_norm": 8.939818382263184, + "learning_rate": 3.2350355014627217e-06, + "loss": 0.2487, + "step": 24449 + }, + { + "epoch": 0.6187210567603816, + "grad_norm": 4.522641181945801, + "learning_rate": 3.2346598322263544e-06, + "loss": 0.1693, + "step": 24450 + }, + { + "epoch": 0.6187463623250753, + "grad_norm": 5.332146644592285, + "learning_rate": 3.2342841743743382e-06, + "loss": 0.2439, + "step": 24451 + }, + { + "epoch": 0.618771667889769, + "grad_norm": 5.5261969566345215, + "learning_rate": 3.233908527909093e-06, + "loss": 0.0974, + "step": 24452 + }, + { + "epoch": 0.6187969734544626, + "grad_norm": 6.463221073150635, + "learning_rate": 3.233532892833044e-06, + "loss": 0.194, + "step": 24453 + }, + { + "epoch": 0.6188222790191563, + "grad_norm": 3.913151502609253, + "learning_rate": 3.2331572691486125e-06, + "loss": 0.1282, + "step": 24454 + }, + { + "epoch": 0.61884758458385, + "grad_norm": 3.035764455795288, + "learning_rate": 3.232781656858223e-06, + "loss": 0.1227, + "step": 24455 + }, + { + "epoch": 0.6188728901485436, + "grad_norm": 18.93739128112793, + "learning_rate": 3.232406055964294e-06, + "loss": 0.2058, + "step": 24456 + }, + { + "epoch": 0.6188981957132373, + "grad_norm": 4.630344867706299, + "learning_rate": 3.2320304664692494e-06, + "loss": 0.1505, + "step": 24457 + }, + { + "epoch": 0.618923501277931, + "grad_norm": 6.54390811920166, + "learning_rate": 3.2316548883755115e-06, + "loss": 0.1485, + "step": 24458 + }, + { + "epoch": 0.6189488068426247, + "grad_norm": 4.630283355712891, + "learning_rate": 3.2312793216855043e-06, + "loss": 0.1372, + "step": 24459 + }, + { + "epoch": 0.6189741124073184, + "grad_norm": 4.846009254455566, + "learning_rate": 3.2309037664016463e-06, + "loss": 0.1382, + "step": 24460 + }, + { + "epoch": 0.6189994179720121, + "grad_norm": 5.432396411895752, + "learning_rate": 3.23052822252636e-06, + "loss": 0.1486, + "step": 24461 + }, + { + "epoch": 0.6190247235367057, + "grad_norm": 4.184842109680176, + "learning_rate": 3.230152690062068e-06, + "loss": 0.1622, + "step": 24462 + }, + { + "epoch": 0.6190500291013994, + "grad_norm": 11.925101280212402, + "learning_rate": 3.2297771690111928e-06, + "loss": 0.1838, + "step": 24463 + }, + { + "epoch": 0.6190753346660931, + "grad_norm": 3.9105374813079834, + "learning_rate": 3.229401659376157e-06, + "loss": 0.1533, + "step": 24464 + }, + { + "epoch": 0.6191006402307867, + "grad_norm": 3.9673750400543213, + "learning_rate": 3.229026161159379e-06, + "loss": 0.1012, + "step": 24465 + }, + { + "epoch": 0.6191259457954804, + "grad_norm": 5.137026786804199, + "learning_rate": 3.2286506743632807e-06, + "loss": 0.1348, + "step": 24466 + }, + { + "epoch": 0.6191512513601741, + "grad_norm": 8.69135570526123, + "learning_rate": 3.2282751989902862e-06, + "loss": 0.2995, + "step": 24467 + }, + { + "epoch": 0.6191765569248677, + "grad_norm": 6.717315196990967, + "learning_rate": 3.2278997350428167e-06, + "loss": 0.2081, + "step": 24468 + }, + { + "epoch": 0.6192018624895614, + "grad_norm": 3.3067362308502197, + "learning_rate": 3.2275242825232896e-06, + "loss": 0.0999, + "step": 24469 + }, + { + "epoch": 0.6192271680542552, + "grad_norm": 5.284224033355713, + "learning_rate": 3.227148841434129e-06, + "loss": 0.1093, + "step": 24470 + }, + { + "epoch": 0.6192524736189489, + "grad_norm": 4.594944953918457, + "learning_rate": 3.226773411777756e-06, + "loss": 0.149, + "step": 24471 + }, + { + "epoch": 0.6192777791836425, + "grad_norm": 3.4196970462799072, + "learning_rate": 3.2263979935565938e-06, + "loss": 0.1498, + "step": 24472 + }, + { + "epoch": 0.6193030847483362, + "grad_norm": 2.917121648788452, + "learning_rate": 3.226022586773058e-06, + "loss": 0.0918, + "step": 24473 + }, + { + "epoch": 0.6193283903130299, + "grad_norm": 2.7639987468719482, + "learning_rate": 3.225647191429573e-06, + "loss": 0.1259, + "step": 24474 + }, + { + "epoch": 0.6193536958777235, + "grad_norm": 4.909626007080078, + "learning_rate": 3.2252718075285606e-06, + "loss": 0.1072, + "step": 24475 + }, + { + "epoch": 0.6193790014424172, + "grad_norm": 6.079895973205566, + "learning_rate": 3.224896435072439e-06, + "loss": 0.2452, + "step": 24476 + }, + { + "epoch": 0.6194043070071109, + "grad_norm": 5.359281063079834, + "learning_rate": 3.2245210740636306e-06, + "loss": 0.1963, + "step": 24477 + }, + { + "epoch": 0.6194296125718045, + "grad_norm": 3.469008207321167, + "learning_rate": 3.224145724504556e-06, + "loss": 0.1291, + "step": 24478 + }, + { + "epoch": 0.6194549181364982, + "grad_norm": 10.688630104064941, + "learning_rate": 3.2237703863976344e-06, + "loss": 0.2259, + "step": 24479 + }, + { + "epoch": 0.6194802237011919, + "grad_norm": 3.1010890007019043, + "learning_rate": 3.223395059745287e-06, + "loss": 0.1126, + "step": 24480 + }, + { + "epoch": 0.6195055292658855, + "grad_norm": 11.199686050415039, + "learning_rate": 3.2230197445499357e-06, + "loss": 0.1942, + "step": 24481 + }, + { + "epoch": 0.6195308348305792, + "grad_norm": 4.7174553871154785, + "learning_rate": 3.222644440813999e-06, + "loss": 0.1565, + "step": 24482 + }, + { + "epoch": 0.619556140395273, + "grad_norm": 3.627671957015991, + "learning_rate": 3.2222691485398966e-06, + "loss": 0.1293, + "step": 24483 + }, + { + "epoch": 0.6195814459599666, + "grad_norm": 4.394586563110352, + "learning_rate": 3.2218938677300504e-06, + "loss": 0.1883, + "step": 24484 + }, + { + "epoch": 0.6196067515246603, + "grad_norm": 9.34853458404541, + "learning_rate": 3.2215185983868792e-06, + "loss": 0.1832, + "step": 24485 + }, + { + "epoch": 0.619632057089354, + "grad_norm": 3.5865819454193115, + "learning_rate": 3.221143340512806e-06, + "loss": 0.1418, + "step": 24486 + }, + { + "epoch": 0.6196573626540476, + "grad_norm": 2.9353668689727783, + "learning_rate": 3.220768094110247e-06, + "loss": 0.1275, + "step": 24487 + }, + { + "epoch": 0.6196826682187413, + "grad_norm": 4.544619560241699, + "learning_rate": 3.2203928591816227e-06, + "loss": 0.2217, + "step": 24488 + }, + { + "epoch": 0.619707973783435, + "grad_norm": 4.495646953582764, + "learning_rate": 3.2200176357293544e-06, + "loss": 0.1357, + "step": 24489 + }, + { + "epoch": 0.6197332793481286, + "grad_norm": 9.56002140045166, + "learning_rate": 3.2196424237558633e-06, + "loss": 0.1553, + "step": 24490 + }, + { + "epoch": 0.6197585849128223, + "grad_norm": 11.847370147705078, + "learning_rate": 3.2192672232635645e-06, + "loss": 0.2154, + "step": 24491 + }, + { + "epoch": 0.619783890477516, + "grad_norm": 6.23036003112793, + "learning_rate": 3.2188920342548814e-06, + "loss": 0.1282, + "step": 24492 + }, + { + "epoch": 0.6198091960422096, + "grad_norm": 9.063045501708984, + "learning_rate": 3.218516856732231e-06, + "loss": 0.2644, + "step": 24493 + }, + { + "epoch": 0.6198345016069033, + "grad_norm": 11.194060325622559, + "learning_rate": 3.2181416906980343e-06, + "loss": 0.1504, + "step": 24494 + }, + { + "epoch": 0.619859807171597, + "grad_norm": 3.2849717140197754, + "learning_rate": 3.2177665361547128e-06, + "loss": 0.1107, + "step": 24495 + }, + { + "epoch": 0.6198851127362907, + "grad_norm": 5.239661693572998, + "learning_rate": 3.217391393104681e-06, + "loss": 0.2094, + "step": 24496 + }, + { + "epoch": 0.6199104183009844, + "grad_norm": 4.710168838500977, + "learning_rate": 3.217016261550361e-06, + "loss": 0.1565, + "step": 24497 + }, + { + "epoch": 0.6199357238656781, + "grad_norm": 6.607948303222656, + "learning_rate": 3.2166411414941708e-06, + "loss": 0.1857, + "step": 24498 + }, + { + "epoch": 0.6199610294303718, + "grad_norm": 5.966273784637451, + "learning_rate": 3.216266032938532e-06, + "loss": 0.1611, + "step": 24499 + }, + { + "epoch": 0.6199863349950654, + "grad_norm": 4.486362457275391, + "learning_rate": 3.2158909358858603e-06, + "loss": 0.1309, + "step": 24500 + }, + { + "epoch": 0.6200116405597591, + "grad_norm": 2.5018506050109863, + "learning_rate": 3.2155158503385765e-06, + "loss": 0.1172, + "step": 24501 + }, + { + "epoch": 0.6200369461244528, + "grad_norm": 5.927122116088867, + "learning_rate": 3.2151407762990992e-06, + "loss": 0.197, + "step": 24502 + }, + { + "epoch": 0.6200622516891464, + "grad_norm": 6.334866523742676, + "learning_rate": 3.2147657137698472e-06, + "loss": 0.1219, + "step": 24503 + }, + { + "epoch": 0.6200875572538401, + "grad_norm": 4.540116310119629, + "learning_rate": 3.2143906627532383e-06, + "loss": 0.1558, + "step": 24504 + }, + { + "epoch": 0.6201128628185338, + "grad_norm": 8.970999717712402, + "learning_rate": 3.2140156232516917e-06, + "loss": 0.1755, + "step": 24505 + }, + { + "epoch": 0.6201381683832274, + "grad_norm": 4.287654876708984, + "learning_rate": 3.213640595267627e-06, + "loss": 0.1559, + "step": 24506 + }, + { + "epoch": 0.6201634739479212, + "grad_norm": 2.6447153091430664, + "learning_rate": 3.2132655788034604e-06, + "loss": 0.0938, + "step": 24507 + }, + { + "epoch": 0.6201887795126149, + "grad_norm": 3.391409397125244, + "learning_rate": 3.212890573861614e-06, + "loss": 0.1076, + "step": 24508 + }, + { + "epoch": 0.6202140850773085, + "grad_norm": 7.090825080871582, + "learning_rate": 3.212515580444503e-06, + "loss": 0.1736, + "step": 24509 + }, + { + "epoch": 0.6202393906420022, + "grad_norm": 5.241525173187256, + "learning_rate": 3.2121405985545455e-06, + "loss": 0.1246, + "step": 24510 + }, + { + "epoch": 0.6202646962066959, + "grad_norm": 4.00120210647583, + "learning_rate": 3.2117656281941613e-06, + "loss": 0.1087, + "step": 24511 + }, + { + "epoch": 0.6202900017713895, + "grad_norm": 6.951526165008545, + "learning_rate": 3.2113906693657694e-06, + "loss": 0.1434, + "step": 24512 + }, + { + "epoch": 0.6203153073360832, + "grad_norm": 10.947574615478516, + "learning_rate": 3.2110157220717843e-06, + "loss": 0.3822, + "step": 24513 + }, + { + "epoch": 0.6203406129007769, + "grad_norm": 9.234185218811035, + "learning_rate": 3.210640786314626e-06, + "loss": 0.1726, + "step": 24514 + }, + { + "epoch": 0.6203659184654705, + "grad_norm": 3.4263808727264404, + "learning_rate": 3.210265862096713e-06, + "loss": 0.1636, + "step": 24515 + }, + { + "epoch": 0.6203912240301642, + "grad_norm": 8.781567573547363, + "learning_rate": 3.209890949420462e-06, + "loss": 0.2148, + "step": 24516 + }, + { + "epoch": 0.6204165295948579, + "grad_norm": 3.424360990524292, + "learning_rate": 3.2095160482882937e-06, + "loss": 0.1486, + "step": 24517 + }, + { + "epoch": 0.6204418351595515, + "grad_norm": 4.547192096710205, + "learning_rate": 3.209141158702621e-06, + "loss": 0.1257, + "step": 24518 + }, + { + "epoch": 0.6204671407242452, + "grad_norm": 10.18989372253418, + "learning_rate": 3.2087662806658638e-06, + "loss": 0.1065, + "step": 24519 + }, + { + "epoch": 0.620492446288939, + "grad_norm": 8.396581649780273, + "learning_rate": 3.2083914141804397e-06, + "loss": 0.1341, + "step": 24520 + }, + { + "epoch": 0.6205177518536326, + "grad_norm": 19.11144256591797, + "learning_rate": 3.208016559248769e-06, + "loss": 0.381, + "step": 24521 + }, + { + "epoch": 0.6205430574183263, + "grad_norm": 6.060965538024902, + "learning_rate": 3.2076417158732633e-06, + "loss": 0.1907, + "step": 24522 + }, + { + "epoch": 0.62056836298302, + "grad_norm": 7.316516399383545, + "learning_rate": 3.2072668840563424e-06, + "loss": 0.1272, + "step": 24523 + }, + { + "epoch": 0.6205936685477137, + "grad_norm": 3.1113064289093018, + "learning_rate": 3.206892063800424e-06, + "loss": 0.1105, + "step": 24524 + }, + { + "epoch": 0.6206189741124073, + "grad_norm": 3.5709681510925293, + "learning_rate": 3.2065172551079282e-06, + "loss": 0.1495, + "step": 24525 + }, + { + "epoch": 0.620644279677101, + "grad_norm": 3.3186275959014893, + "learning_rate": 3.206142457981266e-06, + "loss": 0.1462, + "step": 24526 + }, + { + "epoch": 0.6206695852417947, + "grad_norm": 8.302050590515137, + "learning_rate": 3.2057676724228574e-06, + "loss": 0.2272, + "step": 24527 + }, + { + "epoch": 0.6206948908064883, + "grad_norm": 3.742361545562744, + "learning_rate": 3.20539289843512e-06, + "loss": 0.1482, + "step": 24528 + }, + { + "epoch": 0.620720196371182, + "grad_norm": 5.846973419189453, + "learning_rate": 3.205018136020469e-06, + "loss": 0.213, + "step": 24529 + }, + { + "epoch": 0.6207455019358757, + "grad_norm": 3.0537188053131104, + "learning_rate": 3.204643385181325e-06, + "loss": 0.1402, + "step": 24530 + }, + { + "epoch": 0.6207708075005693, + "grad_norm": 5.870067119598389, + "learning_rate": 3.2042686459201e-06, + "loss": 0.1964, + "step": 24531 + }, + { + "epoch": 0.620796113065263, + "grad_norm": 6.320271968841553, + "learning_rate": 3.203893918239212e-06, + "loss": 0.2451, + "step": 24532 + }, + { + "epoch": 0.6208214186299568, + "grad_norm": 10.30716609954834, + "learning_rate": 3.2035192021410787e-06, + "loss": 0.1701, + "step": 24533 + }, + { + "epoch": 0.6208467241946504, + "grad_norm": 5.149807453155518, + "learning_rate": 3.2031444976281167e-06, + "loss": 0.1528, + "step": 24534 + }, + { + "epoch": 0.6208720297593441, + "grad_norm": 4.058085918426514, + "learning_rate": 3.20276980470274e-06, + "loss": 0.1382, + "step": 24535 + }, + { + "epoch": 0.6208973353240378, + "grad_norm": 3.0374650955200195, + "learning_rate": 3.2023951233673677e-06, + "loss": 0.1483, + "step": 24536 + }, + { + "epoch": 0.6209226408887314, + "grad_norm": 11.489550590515137, + "learning_rate": 3.2020204536244136e-06, + "loss": 0.2604, + "step": 24537 + }, + { + "epoch": 0.6209479464534251, + "grad_norm": 3.8396804332733154, + "learning_rate": 3.2016457954762957e-06, + "loss": 0.1347, + "step": 24538 + }, + { + "epoch": 0.6209732520181188, + "grad_norm": 3.9661591053009033, + "learning_rate": 3.201271148925431e-06, + "loss": 0.1372, + "step": 24539 + }, + { + "epoch": 0.6209985575828124, + "grad_norm": 3.220613479614258, + "learning_rate": 3.200896513974233e-06, + "loss": 0.0894, + "step": 24540 + }, + { + "epoch": 0.6210238631475061, + "grad_norm": 8.580543518066406, + "learning_rate": 3.200521890625118e-06, + "loss": 0.2257, + "step": 24541 + }, + { + "epoch": 0.6210491687121998, + "grad_norm": 5.783431053161621, + "learning_rate": 3.200147278880502e-06, + "loss": 0.2032, + "step": 24542 + }, + { + "epoch": 0.6210744742768934, + "grad_norm": 12.592087745666504, + "learning_rate": 3.199772678742804e-06, + "loss": 0.2178, + "step": 24543 + }, + { + "epoch": 0.6210997798415872, + "grad_norm": 6.170968055725098, + "learning_rate": 3.1993980902144356e-06, + "loss": 0.2003, + "step": 24544 + }, + { + "epoch": 0.6211250854062809, + "grad_norm": 4.844210147857666, + "learning_rate": 3.1990235132978137e-06, + "loss": 0.1701, + "step": 24545 + }, + { + "epoch": 0.6211503909709745, + "grad_norm": 2.3799667358398438, + "learning_rate": 3.1986489479953543e-06, + "loss": 0.0883, + "step": 24546 + }, + { + "epoch": 0.6211756965356682, + "grad_norm": 4.5699052810668945, + "learning_rate": 3.198274394309472e-06, + "loss": 0.2058, + "step": 24547 + }, + { + "epoch": 0.6212010021003619, + "grad_norm": 5.012622356414795, + "learning_rate": 3.1978998522425854e-06, + "loss": 0.1513, + "step": 24548 + }, + { + "epoch": 0.6212263076650556, + "grad_norm": 3.744609832763672, + "learning_rate": 3.197525321797106e-06, + "loss": 0.1078, + "step": 24549 + }, + { + "epoch": 0.6212516132297492, + "grad_norm": 2.1287310123443604, + "learning_rate": 3.1971508029754496e-06, + "loss": 0.0942, + "step": 24550 + }, + { + "epoch": 0.6212769187944429, + "grad_norm": 8.398316383361816, + "learning_rate": 3.196776295780033e-06, + "loss": 0.1942, + "step": 24551 + }, + { + "epoch": 0.6213022243591366, + "grad_norm": 13.918039321899414, + "learning_rate": 3.1964018002132725e-06, + "loss": 0.2131, + "step": 24552 + }, + { + "epoch": 0.6213275299238302, + "grad_norm": 12.782744407653809, + "learning_rate": 3.196027316277579e-06, + "loss": 0.2348, + "step": 24553 + }, + { + "epoch": 0.6213528354885239, + "grad_norm": 5.03176736831665, + "learning_rate": 3.1956528439753698e-06, + "loss": 0.2598, + "step": 24554 + }, + { + "epoch": 0.6213781410532176, + "grad_norm": 10.171402931213379, + "learning_rate": 3.1952783833090596e-06, + "loss": 0.2876, + "step": 24555 + }, + { + "epoch": 0.6214034466179112, + "grad_norm": 4.6127848625183105, + "learning_rate": 3.194903934281066e-06, + "loss": 0.1525, + "step": 24556 + }, + { + "epoch": 0.621428752182605, + "grad_norm": 4.4995198249816895, + "learning_rate": 3.194529496893799e-06, + "loss": 0.1922, + "step": 24557 + }, + { + "epoch": 0.6214540577472987, + "grad_norm": 6.1461615562438965, + "learning_rate": 3.194155071149675e-06, + "loss": 0.2165, + "step": 24558 + }, + { + "epoch": 0.6214793633119923, + "grad_norm": 4.634564399719238, + "learning_rate": 3.1937806570511094e-06, + "loss": 0.1137, + "step": 24559 + }, + { + "epoch": 0.621504668876686, + "grad_norm": 3.9447619915008545, + "learning_rate": 3.1934062546005174e-06, + "loss": 0.1248, + "step": 24560 + }, + { + "epoch": 0.6215299744413797, + "grad_norm": 18.80605125427246, + "learning_rate": 3.193031863800312e-06, + "loss": 0.23, + "step": 24561 + }, + { + "epoch": 0.6215552800060733, + "grad_norm": 8.423714637756348, + "learning_rate": 3.1926574846529073e-06, + "loss": 0.2566, + "step": 24562 + }, + { + "epoch": 0.621580585570767, + "grad_norm": 5.272049903869629, + "learning_rate": 3.1922831171607194e-06, + "loss": 0.1783, + "step": 24563 + }, + { + "epoch": 0.6216058911354607, + "grad_norm": 7.252739429473877, + "learning_rate": 3.19190876132616e-06, + "loss": 0.1962, + "step": 24564 + }, + { + "epoch": 0.6216311967001543, + "grad_norm": 4.8034820556640625, + "learning_rate": 3.191534417151646e-06, + "loss": 0.2067, + "step": 24565 + }, + { + "epoch": 0.621656502264848, + "grad_norm": 4.128365516662598, + "learning_rate": 3.1911600846395896e-06, + "loss": 0.2005, + "step": 24566 + }, + { + "epoch": 0.6216818078295417, + "grad_norm": 6.174786567687988, + "learning_rate": 3.1907857637924054e-06, + "loss": 0.0914, + "step": 24567 + }, + { + "epoch": 0.6217071133942353, + "grad_norm": 3.8551859855651855, + "learning_rate": 3.190411454612507e-06, + "loss": 0.1633, + "step": 24568 + }, + { + "epoch": 0.6217324189589291, + "grad_norm": 4.1009907722473145, + "learning_rate": 3.190037157102308e-06, + "loss": 0.1593, + "step": 24569 + }, + { + "epoch": 0.6217577245236228, + "grad_norm": 4.204955101013184, + "learning_rate": 3.1896628712642254e-06, + "loss": 0.1198, + "step": 24570 + }, + { + "epoch": 0.6217830300883164, + "grad_norm": 14.619507789611816, + "learning_rate": 3.1892885971006675e-06, + "loss": 0.144, + "step": 24571 + }, + { + "epoch": 0.6218083356530101, + "grad_norm": 3.8784403800964355, + "learning_rate": 3.1889143346140504e-06, + "loss": 0.0868, + "step": 24572 + }, + { + "epoch": 0.6218336412177038, + "grad_norm": 7.101744651794434, + "learning_rate": 3.1885400838067886e-06, + "loss": 0.172, + "step": 24573 + }, + { + "epoch": 0.6218589467823975, + "grad_norm": 15.385993003845215, + "learning_rate": 3.188165844681297e-06, + "loss": 0.1869, + "step": 24574 + }, + { + "epoch": 0.6218842523470911, + "grad_norm": 11.305020332336426, + "learning_rate": 3.187791617239984e-06, + "loss": 0.1983, + "step": 24575 + }, + { + "epoch": 0.6219095579117848, + "grad_norm": 31.83786964416504, + "learning_rate": 3.1874174014852665e-06, + "loss": 0.194, + "step": 24576 + }, + { + "epoch": 0.6219348634764785, + "grad_norm": 3.255438804626465, + "learning_rate": 3.1870431974195566e-06, + "loss": 0.1471, + "step": 24577 + }, + { + "epoch": 0.6219601690411721, + "grad_norm": 8.443055152893066, + "learning_rate": 3.1866690050452697e-06, + "loss": 0.2022, + "step": 24578 + }, + { + "epoch": 0.6219854746058658, + "grad_norm": 21.14679527282715, + "learning_rate": 3.186294824364815e-06, + "loss": 0.1956, + "step": 24579 + }, + { + "epoch": 0.6220107801705596, + "grad_norm": 10.976409912109375, + "learning_rate": 3.1859206553806087e-06, + "loss": 0.2577, + "step": 24580 + }, + { + "epoch": 0.6220360857352532, + "grad_norm": 4.842209815979004, + "learning_rate": 3.1855464980950614e-06, + "loss": 0.1714, + "step": 24581 + }, + { + "epoch": 0.6220613912999469, + "grad_norm": 4.500420093536377, + "learning_rate": 3.185172352510588e-06, + "loss": 0.1515, + "step": 24582 + }, + { + "epoch": 0.6220866968646406, + "grad_norm": 9.055248260498047, + "learning_rate": 3.1847982186296022e-06, + "loss": 0.1975, + "step": 24583 + }, + { + "epoch": 0.6221120024293342, + "grad_norm": 3.0249271392822266, + "learning_rate": 3.184424096454513e-06, + "loss": 0.1395, + "step": 24584 + }, + { + "epoch": 0.6221373079940279, + "grad_norm": 4.6116156578063965, + "learning_rate": 3.184049985987735e-06, + "loss": 0.1646, + "step": 24585 + }, + { + "epoch": 0.6221626135587216, + "grad_norm": 7.448252201080322, + "learning_rate": 3.183675887231681e-06, + "loss": 0.1917, + "step": 24586 + }, + { + "epoch": 0.6221879191234152, + "grad_norm": 3.5305991172790527, + "learning_rate": 3.1833018001887654e-06, + "loss": 0.1487, + "step": 24587 + }, + { + "epoch": 0.6222132246881089, + "grad_norm": 4.032877445220947, + "learning_rate": 3.1829277248613968e-06, + "loss": 0.0592, + "step": 24588 + }, + { + "epoch": 0.6222385302528026, + "grad_norm": 8.800424575805664, + "learning_rate": 3.1825536612519887e-06, + "loss": 0.1945, + "step": 24589 + }, + { + "epoch": 0.6222638358174962, + "grad_norm": 3.7143635749816895, + "learning_rate": 3.1821796093629543e-06, + "loss": 0.1666, + "step": 24590 + }, + { + "epoch": 0.6222891413821899, + "grad_norm": 3.2065253257751465, + "learning_rate": 3.1818055691967064e-06, + "loss": 0.1543, + "step": 24591 + }, + { + "epoch": 0.6223144469468836, + "grad_norm": 19.6021785736084, + "learning_rate": 3.181431540755656e-06, + "loss": 0.318, + "step": 24592 + }, + { + "epoch": 0.6223397525115772, + "grad_norm": 5.388486862182617, + "learning_rate": 3.181057524042215e-06, + "loss": 0.1785, + "step": 24593 + }, + { + "epoch": 0.622365058076271, + "grad_norm": 11.539344787597656, + "learning_rate": 3.180683519058796e-06, + "loss": 0.2259, + "step": 24594 + }, + { + "epoch": 0.6223903636409647, + "grad_norm": 13.633281707763672, + "learning_rate": 3.18030952580781e-06, + "loss": 0.1104, + "step": 24595 + }, + { + "epoch": 0.6224156692056583, + "grad_norm": 4.931896686553955, + "learning_rate": 3.1799355442916695e-06, + "loss": 0.2024, + "step": 24596 + }, + { + "epoch": 0.622440974770352, + "grad_norm": 7.503767013549805, + "learning_rate": 3.179561574512787e-06, + "loss": 0.2161, + "step": 24597 + }, + { + "epoch": 0.6224662803350457, + "grad_norm": 5.359457969665527, + "learning_rate": 3.1791876164735723e-06, + "loss": 0.0951, + "step": 24598 + }, + { + "epoch": 0.6224915858997394, + "grad_norm": 7.190313816070557, + "learning_rate": 3.178813670176437e-06, + "loss": 0.1986, + "step": 24599 + }, + { + "epoch": 0.622516891464433, + "grad_norm": 4.302745342254639, + "learning_rate": 3.1784397356237967e-06, + "loss": 0.1973, + "step": 24600 + }, + { + "epoch": 0.6225421970291267, + "grad_norm": 19.52227210998535, + "learning_rate": 3.1780658128180574e-06, + "loss": 0.3016, + "step": 24601 + }, + { + "epoch": 0.6225675025938204, + "grad_norm": 3.085670232772827, + "learning_rate": 3.1776919017616324e-06, + "loss": 0.0937, + "step": 24602 + }, + { + "epoch": 0.622592808158514, + "grad_norm": 5.545032501220703, + "learning_rate": 3.177318002456934e-06, + "loss": 0.1461, + "step": 24603 + }, + { + "epoch": 0.6226181137232077, + "grad_norm": 13.414341926574707, + "learning_rate": 3.1769441149063716e-06, + "loss": 0.3263, + "step": 24604 + }, + { + "epoch": 0.6226434192879015, + "grad_norm": 4.696619033813477, + "learning_rate": 3.176570239112361e-06, + "loss": 0.0667, + "step": 24605 + }, + { + "epoch": 0.6226687248525951, + "grad_norm": 8.013222694396973, + "learning_rate": 3.176196375077307e-06, + "loss": 0.2797, + "step": 24606 + }, + { + "epoch": 0.6226940304172888, + "grad_norm": 4.382348537445068, + "learning_rate": 3.1758225228036233e-06, + "loss": 0.1295, + "step": 24607 + }, + { + "epoch": 0.6227193359819825, + "grad_norm": 6.30334997177124, + "learning_rate": 3.1754486822937204e-06, + "loss": 0.2022, + "step": 24608 + }, + { + "epoch": 0.6227446415466761, + "grad_norm": 6.088600158691406, + "learning_rate": 3.175074853550012e-06, + "loss": 0.1617, + "step": 24609 + }, + { + "epoch": 0.6227699471113698, + "grad_norm": 5.123129367828369, + "learning_rate": 3.1747010365749044e-06, + "loss": 0.1153, + "step": 24610 + }, + { + "epoch": 0.6227952526760635, + "grad_norm": 4.2306952476501465, + "learning_rate": 3.1743272313708107e-06, + "loss": 0.1563, + "step": 24611 + }, + { + "epoch": 0.6228205582407571, + "grad_norm": 3.8818109035491943, + "learning_rate": 3.1739534379401405e-06, + "loss": 0.1544, + "step": 24612 + }, + { + "epoch": 0.6228458638054508, + "grad_norm": 3.3631153106689453, + "learning_rate": 3.173579656285305e-06, + "loss": 0.1312, + "step": 24613 + }, + { + "epoch": 0.6228711693701445, + "grad_norm": 5.452871799468994, + "learning_rate": 3.1732058864087166e-06, + "loss": 0.091, + "step": 24614 + }, + { + "epoch": 0.6228964749348381, + "grad_norm": 5.619002342224121, + "learning_rate": 3.1728321283127817e-06, + "loss": 0.2112, + "step": 24615 + }, + { + "epoch": 0.6229217804995318, + "grad_norm": 7.609630107879639, + "learning_rate": 3.172458381999912e-06, + "loss": 0.2463, + "step": 24616 + }, + { + "epoch": 0.6229470860642256, + "grad_norm": 2.758354663848877, + "learning_rate": 3.1720846474725186e-06, + "loss": 0.1224, + "step": 24617 + }, + { + "epoch": 0.6229723916289192, + "grad_norm": 3.441007137298584, + "learning_rate": 3.1717109247330137e-06, + "loss": 0.1633, + "step": 24618 + }, + { + "epoch": 0.6229976971936129, + "grad_norm": 8.38829231262207, + "learning_rate": 3.1713372137838026e-06, + "loss": 0.136, + "step": 24619 + }, + { + "epoch": 0.6230230027583066, + "grad_norm": 4.70993185043335, + "learning_rate": 3.1709635146272976e-06, + "loss": 0.1871, + "step": 24620 + }, + { + "epoch": 0.6230483083230002, + "grad_norm": 9.850403785705566, + "learning_rate": 3.1705898272659097e-06, + "loss": 0.2499, + "step": 24621 + }, + { + "epoch": 0.6230736138876939, + "grad_norm": 6.1808953285217285, + "learning_rate": 3.1702161517020467e-06, + "loss": 0.1391, + "step": 24622 + }, + { + "epoch": 0.6230989194523876, + "grad_norm": 4.239736557006836, + "learning_rate": 3.1698424879381205e-06, + "loss": 0.1787, + "step": 24623 + }, + { + "epoch": 0.6231242250170812, + "grad_norm": 4.184318542480469, + "learning_rate": 3.16946883597654e-06, + "loss": 0.1635, + "step": 24624 + }, + { + "epoch": 0.6231495305817749, + "grad_norm": 9.188965797424316, + "learning_rate": 3.1690951958197135e-06, + "loss": 0.126, + "step": 24625 + }, + { + "epoch": 0.6231748361464686, + "grad_norm": 4.226644515991211, + "learning_rate": 3.168721567470051e-06, + "loss": 0.1365, + "step": 24626 + }, + { + "epoch": 0.6232001417111623, + "grad_norm": 4.993544101715088, + "learning_rate": 3.168347950929963e-06, + "loss": 0.1293, + "step": 24627 + }, + { + "epoch": 0.6232254472758559, + "grad_norm": 4.4231977462768555, + "learning_rate": 3.1679743462018597e-06, + "loss": 0.0925, + "step": 24628 + }, + { + "epoch": 0.6232507528405496, + "grad_norm": 2.4139342308044434, + "learning_rate": 3.167600753288147e-06, + "loss": 0.0954, + "step": 24629 + }, + { + "epoch": 0.6232760584052434, + "grad_norm": 6.6858391761779785, + "learning_rate": 3.167227172191236e-06, + "loss": 0.1901, + "step": 24630 + }, + { + "epoch": 0.623301363969937, + "grad_norm": 4.7200212478637695, + "learning_rate": 3.1668536029135387e-06, + "loss": 0.1065, + "step": 24631 + }, + { + "epoch": 0.6233266695346307, + "grad_norm": 4.2182111740112305, + "learning_rate": 3.1664800454574596e-06, + "loss": 0.0869, + "step": 24632 + }, + { + "epoch": 0.6233519750993244, + "grad_norm": 4.270312786102295, + "learning_rate": 3.1661064998254095e-06, + "loss": 0.1051, + "step": 24633 + }, + { + "epoch": 0.623377280664018, + "grad_norm": 13.301067352294922, + "learning_rate": 3.1657329660197972e-06, + "loss": 0.2585, + "step": 24634 + }, + { + "epoch": 0.6234025862287117, + "grad_norm": 21.827016830444336, + "learning_rate": 3.165359444043032e-06, + "loss": 0.2142, + "step": 24635 + }, + { + "epoch": 0.6234278917934054, + "grad_norm": 11.307856559753418, + "learning_rate": 3.164985933897524e-06, + "loss": 0.2807, + "step": 24636 + }, + { + "epoch": 0.623453197358099, + "grad_norm": 3.684976816177368, + "learning_rate": 3.164612435585679e-06, + "loss": 0.1533, + "step": 24637 + }, + { + "epoch": 0.6234785029227927, + "grad_norm": 3.7028989791870117, + "learning_rate": 3.1642389491099066e-06, + "loss": 0.1345, + "step": 24638 + }, + { + "epoch": 0.6235038084874864, + "grad_norm": 8.41342544555664, + "learning_rate": 3.1638654744726162e-06, + "loss": 0.3161, + "step": 24639 + }, + { + "epoch": 0.62352911405218, + "grad_norm": 4.202502727508545, + "learning_rate": 3.1634920116762175e-06, + "loss": 0.1594, + "step": 24640 + }, + { + "epoch": 0.6235544196168737, + "grad_norm": 4.125280380249023, + "learning_rate": 3.1631185607231152e-06, + "loss": 0.1705, + "step": 24641 + }, + { + "epoch": 0.6235797251815675, + "grad_norm": 2.5746781826019287, + "learning_rate": 3.1627451216157196e-06, + "loss": 0.1321, + "step": 24642 + }, + { + "epoch": 0.6236050307462611, + "grad_norm": 4.371912956237793, + "learning_rate": 3.1623716943564387e-06, + "loss": 0.1432, + "step": 24643 + }, + { + "epoch": 0.6236303363109548, + "grad_norm": 4.3672356605529785, + "learning_rate": 3.161998278947681e-06, + "loss": 0.132, + "step": 24644 + }, + { + "epoch": 0.6236556418756485, + "grad_norm": 8.176727294921875, + "learning_rate": 3.161624875391857e-06, + "loss": 0.2365, + "step": 24645 + }, + { + "epoch": 0.6236809474403421, + "grad_norm": 2.7595133781433105, + "learning_rate": 3.16125148369137e-06, + "loss": 0.1139, + "step": 24646 + }, + { + "epoch": 0.6237062530050358, + "grad_norm": 6.991933822631836, + "learning_rate": 3.1608781038486304e-06, + "loss": 0.2148, + "step": 24647 + }, + { + "epoch": 0.6237315585697295, + "grad_norm": 7.538311958312988, + "learning_rate": 3.1605047358660463e-06, + "loss": 0.1824, + "step": 24648 + }, + { + "epoch": 0.6237568641344231, + "grad_norm": 3.124990701675415, + "learning_rate": 3.160131379746025e-06, + "loss": 0.0815, + "step": 24649 + }, + { + "epoch": 0.6237821696991168, + "grad_norm": 7.599677562713623, + "learning_rate": 3.1597580354909735e-06, + "loss": 0.1856, + "step": 24650 + }, + { + "epoch": 0.6238074752638105, + "grad_norm": 8.04258918762207, + "learning_rate": 3.1593847031032998e-06, + "loss": 0.1459, + "step": 24651 + }, + { + "epoch": 0.6238327808285042, + "grad_norm": 12.252242088317871, + "learning_rate": 3.159011382585413e-06, + "loss": 0.1974, + "step": 24652 + }, + { + "epoch": 0.6238580863931978, + "grad_norm": 7.926356315612793, + "learning_rate": 3.158638073939719e-06, + "loss": 0.1814, + "step": 24653 + }, + { + "epoch": 0.6238833919578916, + "grad_norm": 4.835868835449219, + "learning_rate": 3.1582647771686247e-06, + "loss": 0.1631, + "step": 24654 + }, + { + "epoch": 0.6239086975225853, + "grad_norm": 3.7952497005462646, + "learning_rate": 3.15789149227454e-06, + "loss": 0.0668, + "step": 24655 + }, + { + "epoch": 0.6239340030872789, + "grad_norm": 7.18419075012207, + "learning_rate": 3.1575182192598684e-06, + "loss": 0.1053, + "step": 24656 + }, + { + "epoch": 0.6239593086519726, + "grad_norm": 2.1620171070098877, + "learning_rate": 3.15714495812702e-06, + "loss": 0.136, + "step": 24657 + }, + { + "epoch": 0.6239846142166663, + "grad_norm": 7.917428970336914, + "learning_rate": 3.1567717088784022e-06, + "loss": 0.2313, + "step": 24658 + }, + { + "epoch": 0.6240099197813599, + "grad_norm": 7.641258239746094, + "learning_rate": 3.1563984715164187e-06, + "loss": 0.188, + "step": 24659 + }, + { + "epoch": 0.6240352253460536, + "grad_norm": 9.485870361328125, + "learning_rate": 3.1560252460434793e-06, + "loss": 0.1106, + "step": 24660 + }, + { + "epoch": 0.6240605309107473, + "grad_norm": 3.9213900566101074, + "learning_rate": 3.1556520324619893e-06, + "loss": 0.1864, + "step": 24661 + }, + { + "epoch": 0.6240858364754409, + "grad_norm": 3.8466005325317383, + "learning_rate": 3.1552788307743585e-06, + "loss": 0.1291, + "step": 24662 + }, + { + "epoch": 0.6241111420401346, + "grad_norm": 4.234889030456543, + "learning_rate": 3.15490564098299e-06, + "loss": 0.1067, + "step": 24663 + }, + { + "epoch": 0.6241364476048283, + "grad_norm": 5.211258888244629, + "learning_rate": 3.154532463090291e-06, + "loss": 0.2202, + "step": 24664 + }, + { + "epoch": 0.6241617531695219, + "grad_norm": 2.903008460998535, + "learning_rate": 3.154159297098669e-06, + "loss": 0.1217, + "step": 24665 + }, + { + "epoch": 0.6241870587342156, + "grad_norm": 9.059347152709961, + "learning_rate": 3.153786143010531e-06, + "loss": 0.2829, + "step": 24666 + }, + { + "epoch": 0.6242123642989094, + "grad_norm": 7.641873359680176, + "learning_rate": 3.1534130008282838e-06, + "loss": 0.1657, + "step": 24667 + }, + { + "epoch": 0.624237669863603, + "grad_norm": 7.988674163818359, + "learning_rate": 3.1530398705543307e-06, + "loss": 0.2338, + "step": 24668 + }, + { + "epoch": 0.6242629754282967, + "grad_norm": 8.697781562805176, + "learning_rate": 3.1526667521910805e-06, + "loss": 0.1766, + "step": 24669 + }, + { + "epoch": 0.6242882809929904, + "grad_norm": 5.604016304016113, + "learning_rate": 3.1522936457409387e-06, + "loss": 0.1475, + "step": 24670 + }, + { + "epoch": 0.624313586557684, + "grad_norm": 3.9968202114105225, + "learning_rate": 3.1519205512063135e-06, + "loss": 0.1378, + "step": 24671 + }, + { + "epoch": 0.6243388921223777, + "grad_norm": 5.454812049865723, + "learning_rate": 3.1515474685896066e-06, + "loss": 0.2031, + "step": 24672 + }, + { + "epoch": 0.6243641976870714, + "grad_norm": 5.250288486480713, + "learning_rate": 3.1511743978932265e-06, + "loss": 0.1065, + "step": 24673 + }, + { + "epoch": 0.624389503251765, + "grad_norm": 2.212221384048462, + "learning_rate": 3.150801339119579e-06, + "loss": 0.094, + "step": 24674 + }, + { + "epoch": 0.6244148088164587, + "grad_norm": 4.342911243438721, + "learning_rate": 3.1504282922710693e-06, + "loss": 0.1633, + "step": 24675 + }, + { + "epoch": 0.6244401143811524, + "grad_norm": 6.0367560386657715, + "learning_rate": 3.1500552573501053e-06, + "loss": 0.1335, + "step": 24676 + }, + { + "epoch": 0.6244654199458461, + "grad_norm": 7.959967613220215, + "learning_rate": 3.1496822343590893e-06, + "loss": 0.2075, + "step": 24677 + }, + { + "epoch": 0.6244907255105397, + "grad_norm": 5.322554111480713, + "learning_rate": 3.149309223300428e-06, + "loss": 0.1293, + "step": 24678 + }, + { + "epoch": 0.6245160310752335, + "grad_norm": 10.203934669494629, + "learning_rate": 3.1489362241765286e-06, + "loss": 0.3385, + "step": 24679 + }, + { + "epoch": 0.6245413366399272, + "grad_norm": 5.473893642425537, + "learning_rate": 3.148563236989795e-06, + "loss": 0.1895, + "step": 24680 + }, + { + "epoch": 0.6245666422046208, + "grad_norm": 7.182440280914307, + "learning_rate": 3.148190261742632e-06, + "loss": 0.2384, + "step": 24681 + }, + { + "epoch": 0.6245919477693145, + "grad_norm": 4.290975093841553, + "learning_rate": 3.147817298437446e-06, + "loss": 0.1417, + "step": 24682 + }, + { + "epoch": 0.6246172533340082, + "grad_norm": 7.340823173522949, + "learning_rate": 3.147444347076641e-06, + "loss": 0.2284, + "step": 24683 + }, + { + "epoch": 0.6246425588987018, + "grad_norm": 6.2912278175354, + "learning_rate": 3.147071407662624e-06, + "loss": 0.1365, + "step": 24684 + }, + { + "epoch": 0.6246678644633955, + "grad_norm": 10.655426979064941, + "learning_rate": 3.1466984801977986e-06, + "loss": 0.1251, + "step": 24685 + }, + { + "epoch": 0.6246931700280892, + "grad_norm": 8.31446647644043, + "learning_rate": 3.1463255646845685e-06, + "loss": 0.2538, + "step": 24686 + }, + { + "epoch": 0.6247184755927828, + "grad_norm": 7.89574670791626, + "learning_rate": 3.1459526611253403e-06, + "loss": 0.187, + "step": 24687 + }, + { + "epoch": 0.6247437811574765, + "grad_norm": 7.801324367523193, + "learning_rate": 3.1455797695225187e-06, + "loss": 0.2346, + "step": 24688 + }, + { + "epoch": 0.6247690867221702, + "grad_norm": 8.544244766235352, + "learning_rate": 3.1452068898785103e-06, + "loss": 0.2497, + "step": 24689 + }, + { + "epoch": 0.6247943922868638, + "grad_norm": 7.849627494812012, + "learning_rate": 3.1448340221957158e-06, + "loss": 0.2513, + "step": 24690 + }, + { + "epoch": 0.6248196978515576, + "grad_norm": 5.548089027404785, + "learning_rate": 3.1444611664765412e-06, + "loss": 0.1787, + "step": 24691 + }, + { + "epoch": 0.6248450034162513, + "grad_norm": 2.8849244117736816, + "learning_rate": 3.1440883227233914e-06, + "loss": 0.133, + "step": 24692 + }, + { + "epoch": 0.6248703089809449, + "grad_norm": 4.755144119262695, + "learning_rate": 3.1437154909386736e-06, + "loss": 0.1499, + "step": 24693 + }, + { + "epoch": 0.6248956145456386, + "grad_norm": 2.9589314460754395, + "learning_rate": 3.143342671124786e-06, + "loss": 0.1613, + "step": 24694 + }, + { + "epoch": 0.6249209201103323, + "grad_norm": 6.758266925811768, + "learning_rate": 3.1429698632841366e-06, + "loss": 0.2012, + "step": 24695 + }, + { + "epoch": 0.6249462256750259, + "grad_norm": 3.0295510292053223, + "learning_rate": 3.1425970674191295e-06, + "loss": 0.0989, + "step": 24696 + }, + { + "epoch": 0.6249715312397196, + "grad_norm": 4.27859354019165, + "learning_rate": 3.142224283532168e-06, + "loss": 0.0907, + "step": 24697 + }, + { + "epoch": 0.6249968368044133, + "grad_norm": 5.35148811340332, + "learning_rate": 3.1418515116256587e-06, + "loss": 0.1915, + "step": 24698 + }, + { + "epoch": 0.6250221423691069, + "grad_norm": 6.6242356300354, + "learning_rate": 3.141478751702002e-06, + "loss": 0.2392, + "step": 24699 + }, + { + "epoch": 0.6250474479338006, + "grad_norm": 4.125219345092773, + "learning_rate": 3.141106003763602e-06, + "loss": 0.1555, + "step": 24700 + }, + { + "epoch": 0.6250727534984943, + "grad_norm": 11.367960929870605, + "learning_rate": 3.1407332678128644e-06, + "loss": 0.2081, + "step": 24701 + }, + { + "epoch": 0.625098059063188, + "grad_norm": 3.9579050540924072, + "learning_rate": 3.1403605438521932e-06, + "loss": 0.1359, + "step": 24702 + }, + { + "epoch": 0.6251233646278817, + "grad_norm": 8.068143844604492, + "learning_rate": 3.13998783188399e-06, + "loss": 0.1198, + "step": 24703 + }, + { + "epoch": 0.6251486701925754, + "grad_norm": 7.635923385620117, + "learning_rate": 3.139615131910659e-06, + "loss": 0.1738, + "step": 24704 + }, + { + "epoch": 0.6251739757572691, + "grad_norm": 5.085952281951904, + "learning_rate": 3.1392424439346037e-06, + "loss": 0.1586, + "step": 24705 + }, + { + "epoch": 0.6251992813219627, + "grad_norm": 4.839909553527832, + "learning_rate": 3.1388697679582296e-06, + "loss": 0.1271, + "step": 24706 + }, + { + "epoch": 0.6252245868866564, + "grad_norm": 5.544095039367676, + "learning_rate": 3.138497103983936e-06, + "loss": 0.1118, + "step": 24707 + }, + { + "epoch": 0.6252498924513501, + "grad_norm": 2.8597497940063477, + "learning_rate": 3.1381244520141296e-06, + "loss": 0.1526, + "step": 24708 + }, + { + "epoch": 0.6252751980160437, + "grad_norm": 4.1226277351379395, + "learning_rate": 3.137751812051212e-06, + "loss": 0.1221, + "step": 24709 + }, + { + "epoch": 0.6253005035807374, + "grad_norm": 11.859269142150879, + "learning_rate": 3.137379184097586e-06, + "loss": 0.3471, + "step": 24710 + }, + { + "epoch": 0.6253258091454311, + "grad_norm": 4.658679962158203, + "learning_rate": 3.137006568155656e-06, + "loss": 0.1185, + "step": 24711 + }, + { + "epoch": 0.6253511147101247, + "grad_norm": 6.102943420410156, + "learning_rate": 3.136633964227823e-06, + "loss": 0.1566, + "step": 24712 + }, + { + "epoch": 0.6253764202748184, + "grad_norm": 3.6996347904205322, + "learning_rate": 3.1362613723164926e-06, + "loss": 0.1356, + "step": 24713 + }, + { + "epoch": 0.6254017258395121, + "grad_norm": 11.859879493713379, + "learning_rate": 3.1358887924240644e-06, + "loss": 0.1502, + "step": 24714 + }, + { + "epoch": 0.6254270314042057, + "grad_norm": 7.451332092285156, + "learning_rate": 3.1355162245529434e-06, + "loss": 0.1943, + "step": 24715 + }, + { + "epoch": 0.6254523369688995, + "grad_norm": 4.9428935050964355, + "learning_rate": 3.135143668705532e-06, + "loss": 0.1626, + "step": 24716 + }, + { + "epoch": 0.6254776425335932, + "grad_norm": 22.857458114624023, + "learning_rate": 3.134771124884231e-06, + "loss": 0.0954, + "step": 24717 + }, + { + "epoch": 0.6255029480982868, + "grad_norm": 16.52103614807129, + "learning_rate": 3.134398593091444e-06, + "loss": 0.324, + "step": 24718 + }, + { + "epoch": 0.6255282536629805, + "grad_norm": 3.021049976348877, + "learning_rate": 3.1340260733295733e-06, + "loss": 0.1378, + "step": 24719 + }, + { + "epoch": 0.6255535592276742, + "grad_norm": 8.905906677246094, + "learning_rate": 3.1336535656010234e-06, + "loss": 0.2119, + "step": 24720 + }, + { + "epoch": 0.6255788647923678, + "grad_norm": 4.73214864730835, + "learning_rate": 3.133281069908193e-06, + "loss": 0.13, + "step": 24721 + }, + { + "epoch": 0.6256041703570615, + "grad_norm": 3.182685136795044, + "learning_rate": 3.1329085862534853e-06, + "loss": 0.1224, + "step": 24722 + }, + { + "epoch": 0.6256294759217552, + "grad_norm": 13.533859252929688, + "learning_rate": 3.1325361146393028e-06, + "loss": 0.1725, + "step": 24723 + }, + { + "epoch": 0.6256547814864488, + "grad_norm": 4.145050048828125, + "learning_rate": 3.1321636550680496e-06, + "loss": 0.1384, + "step": 24724 + }, + { + "epoch": 0.6256800870511425, + "grad_norm": 18.64989471435547, + "learning_rate": 3.131791207542123e-06, + "loss": 0.203, + "step": 24725 + }, + { + "epoch": 0.6257053926158362, + "grad_norm": 14.784253120422363, + "learning_rate": 3.1314187720639282e-06, + "loss": 0.2422, + "step": 24726 + }, + { + "epoch": 0.62573069818053, + "grad_norm": 3.5098509788513184, + "learning_rate": 3.131046348635866e-06, + "loss": 0.1186, + "step": 24727 + }, + { + "epoch": 0.6257560037452236, + "grad_norm": 3.1401400566101074, + "learning_rate": 3.13067393726034e-06, + "loss": 0.1566, + "step": 24728 + }, + { + "epoch": 0.6257813093099173, + "grad_norm": 3.130838632583618, + "learning_rate": 3.130301537939748e-06, + "loss": 0.1432, + "step": 24729 + }, + { + "epoch": 0.625806614874611, + "grad_norm": 9.536530494689941, + "learning_rate": 3.129929150676494e-06, + "loss": 0.1899, + "step": 24730 + }, + { + "epoch": 0.6258319204393046, + "grad_norm": 2.8136520385742188, + "learning_rate": 3.129556775472979e-06, + "loss": 0.1317, + "step": 24731 + }, + { + "epoch": 0.6258572260039983, + "grad_norm": 11.621530532836914, + "learning_rate": 3.129184412331604e-06, + "loss": 0.1874, + "step": 24732 + }, + { + "epoch": 0.625882531568692, + "grad_norm": 3.6595728397369385, + "learning_rate": 3.1288120612547736e-06, + "loss": 0.1889, + "step": 24733 + }, + { + "epoch": 0.6259078371333856, + "grad_norm": 9.719766616821289, + "learning_rate": 3.128439722244883e-06, + "loss": 0.2716, + "step": 24734 + }, + { + "epoch": 0.6259331426980793, + "grad_norm": 8.549505233764648, + "learning_rate": 3.1280673953043374e-06, + "loss": 0.2317, + "step": 24735 + }, + { + "epoch": 0.625958448262773, + "grad_norm": 2.3566954135894775, + "learning_rate": 3.127695080435536e-06, + "loss": 0.0616, + "step": 24736 + }, + { + "epoch": 0.6259837538274666, + "grad_norm": 4.73421049118042, + "learning_rate": 3.1273227776408834e-06, + "loss": 0.1264, + "step": 24737 + }, + { + "epoch": 0.6260090593921603, + "grad_norm": 6.272212028503418, + "learning_rate": 3.1269504869227764e-06, + "loss": 0.1569, + "step": 24738 + }, + { + "epoch": 0.626034364956854, + "grad_norm": 9.691386222839355, + "learning_rate": 3.126578208283616e-06, + "loss": 0.2212, + "step": 24739 + }, + { + "epoch": 0.6260596705215477, + "grad_norm": 5.236997604370117, + "learning_rate": 3.126205941725806e-06, + "loss": 0.1658, + "step": 24740 + }, + { + "epoch": 0.6260849760862414, + "grad_norm": 3.0439140796661377, + "learning_rate": 3.1258336872517447e-06, + "loss": 0.1036, + "step": 24741 + }, + { + "epoch": 0.6261102816509351, + "grad_norm": 4.651880264282227, + "learning_rate": 3.1254614448638333e-06, + "loss": 0.1104, + "step": 24742 + }, + { + "epoch": 0.6261355872156287, + "grad_norm": 9.012102127075195, + "learning_rate": 3.125089214564473e-06, + "loss": 0.2854, + "step": 24743 + }, + { + "epoch": 0.6261608927803224, + "grad_norm": 9.135421752929688, + "learning_rate": 3.124716996356063e-06, + "loss": 0.1698, + "step": 24744 + }, + { + "epoch": 0.6261861983450161, + "grad_norm": 8.655354499816895, + "learning_rate": 3.1243447902410033e-06, + "loss": 0.2845, + "step": 24745 + }, + { + "epoch": 0.6262115039097097, + "grad_norm": 2.6131808757781982, + "learning_rate": 3.1239725962216984e-06, + "loss": 0.0924, + "step": 24746 + }, + { + "epoch": 0.6262368094744034, + "grad_norm": 8.325614929199219, + "learning_rate": 3.1236004143005418e-06, + "loss": 0.1494, + "step": 24747 + }, + { + "epoch": 0.6262621150390971, + "grad_norm": 7.996261119842529, + "learning_rate": 3.1232282444799377e-06, + "loss": 0.2359, + "step": 24748 + }, + { + "epoch": 0.6262874206037907, + "grad_norm": 3.778848171234131, + "learning_rate": 3.122856086762286e-06, + "loss": 0.1098, + "step": 24749 + }, + { + "epoch": 0.6263127261684844, + "grad_norm": 3.495832681655884, + "learning_rate": 3.1224839411499857e-06, + "loss": 0.1176, + "step": 24750 + }, + { + "epoch": 0.6263380317331781, + "grad_norm": 2.651533603668213, + "learning_rate": 3.1221118076454394e-06, + "loss": 0.0988, + "step": 24751 + }, + { + "epoch": 0.6263633372978717, + "grad_norm": 4.91312313079834, + "learning_rate": 3.121739686251043e-06, + "loss": 0.2334, + "step": 24752 + }, + { + "epoch": 0.6263886428625655, + "grad_norm": 7.732476711273193, + "learning_rate": 3.1213675769691975e-06, + "loss": 0.231, + "step": 24753 + }, + { + "epoch": 0.6264139484272592, + "grad_norm": 6.2426934242248535, + "learning_rate": 3.1209954798023033e-06, + "loss": 0.1786, + "step": 24754 + }, + { + "epoch": 0.6264392539919529, + "grad_norm": 3.4129445552825928, + "learning_rate": 3.120623394752762e-06, + "loss": 0.107, + "step": 24755 + }, + { + "epoch": 0.6264645595566465, + "grad_norm": 3.3368334770202637, + "learning_rate": 3.120251321822968e-06, + "loss": 0.1372, + "step": 24756 + }, + { + "epoch": 0.6264898651213402, + "grad_norm": 7.886623859405518, + "learning_rate": 3.1198792610153237e-06, + "loss": 0.1529, + "step": 24757 + }, + { + "epoch": 0.6265151706860339, + "grad_norm": 3.525336503982544, + "learning_rate": 3.119507212332229e-06, + "loss": 0.1676, + "step": 24758 + }, + { + "epoch": 0.6265404762507275, + "grad_norm": 4.398440361022949, + "learning_rate": 3.1191351757760848e-06, + "loss": 0.1575, + "step": 24759 + }, + { + "epoch": 0.6265657818154212, + "grad_norm": 13.754401206970215, + "learning_rate": 3.118763151349285e-06, + "loss": 0.1264, + "step": 24760 + }, + { + "epoch": 0.6265910873801149, + "grad_norm": 4.63240385055542, + "learning_rate": 3.118391139054232e-06, + "loss": 0.127, + "step": 24761 + }, + { + "epoch": 0.6266163929448085, + "grad_norm": 5.0601959228515625, + "learning_rate": 3.1180191388933244e-06, + "loss": 0.1203, + "step": 24762 + }, + { + "epoch": 0.6266416985095022, + "grad_norm": 9.68779468536377, + "learning_rate": 3.1176471508689615e-06, + "loss": 0.344, + "step": 24763 + }, + { + "epoch": 0.626667004074196, + "grad_norm": 10.724205017089844, + "learning_rate": 3.117275174983544e-06, + "loss": 0.245, + "step": 24764 + }, + { + "epoch": 0.6266923096388896, + "grad_norm": 3.070297956466675, + "learning_rate": 3.116903211239467e-06, + "loss": 0.1387, + "step": 24765 + }, + { + "epoch": 0.6267176152035833, + "grad_norm": 3.8430373668670654, + "learning_rate": 3.1165312596391305e-06, + "loss": 0.1383, + "step": 24766 + }, + { + "epoch": 0.626742920768277, + "grad_norm": 7.328558444976807, + "learning_rate": 3.1161593201849346e-06, + "loss": 0.2161, + "step": 24767 + }, + { + "epoch": 0.6267682263329706, + "grad_norm": 4.836673259735107, + "learning_rate": 3.115787392879277e-06, + "loss": 0.1505, + "step": 24768 + }, + { + "epoch": 0.6267935318976643, + "grad_norm": 3.557096481323242, + "learning_rate": 3.115415477724555e-06, + "loss": 0.0794, + "step": 24769 + }, + { + "epoch": 0.626818837462358, + "grad_norm": 5.105242729187012, + "learning_rate": 3.115043574723169e-06, + "loss": 0.139, + "step": 24770 + }, + { + "epoch": 0.6268441430270516, + "grad_norm": 7.494616985321045, + "learning_rate": 3.114671683877515e-06, + "loss": 0.165, + "step": 24771 + }, + { + "epoch": 0.6268694485917453, + "grad_norm": 3.896707534790039, + "learning_rate": 3.1142998051899925e-06, + "loss": 0.1198, + "step": 24772 + }, + { + "epoch": 0.626894754156439, + "grad_norm": 4.372061252593994, + "learning_rate": 3.113927938663001e-06, + "loss": 0.1775, + "step": 24773 + }, + { + "epoch": 0.6269200597211326, + "grad_norm": 7.1575422286987305, + "learning_rate": 3.1135560842989375e-06, + "loss": 0.1687, + "step": 24774 + }, + { + "epoch": 0.6269453652858263, + "grad_norm": 7.616125583648682, + "learning_rate": 3.113184242100199e-06, + "loss": 0.221, + "step": 24775 + }, + { + "epoch": 0.62697067085052, + "grad_norm": 4.520510673522949, + "learning_rate": 3.1128124120691846e-06, + "loss": 0.1279, + "step": 24776 + }, + { + "epoch": 0.6269959764152137, + "grad_norm": 4.608206272125244, + "learning_rate": 3.1124405942082936e-06, + "loss": 0.1857, + "step": 24777 + }, + { + "epoch": 0.6270212819799074, + "grad_norm": 11.337268829345703, + "learning_rate": 3.1120687885199195e-06, + "loss": 0.1885, + "step": 24778 + }, + { + "epoch": 0.6270465875446011, + "grad_norm": 6.576208114624023, + "learning_rate": 3.1116969950064633e-06, + "loss": 0.1351, + "step": 24779 + }, + { + "epoch": 0.6270718931092948, + "grad_norm": 3.4525508880615234, + "learning_rate": 3.111325213670322e-06, + "loss": 0.1485, + "step": 24780 + }, + { + "epoch": 0.6270971986739884, + "grad_norm": 4.243051052093506, + "learning_rate": 3.110953444513895e-06, + "loss": 0.1341, + "step": 24781 + }, + { + "epoch": 0.6271225042386821, + "grad_norm": 9.084566116333008, + "learning_rate": 3.1105816875395757e-06, + "loss": 0.1893, + "step": 24782 + }, + { + "epoch": 0.6271478098033758, + "grad_norm": 3.013314962387085, + "learning_rate": 3.110209942749764e-06, + "loss": 0.1235, + "step": 24783 + }, + { + "epoch": 0.6271731153680694, + "grad_norm": 4.667498588562012, + "learning_rate": 3.1098382101468562e-06, + "loss": 0.1745, + "step": 24784 + }, + { + "epoch": 0.6271984209327631, + "grad_norm": 3.704442262649536, + "learning_rate": 3.1094664897332505e-06, + "loss": 0.1659, + "step": 24785 + }, + { + "epoch": 0.6272237264974568, + "grad_norm": 3.8747048377990723, + "learning_rate": 3.1090947815113458e-06, + "loss": 0.173, + "step": 24786 + }, + { + "epoch": 0.6272490320621504, + "grad_norm": 6.434680461883545, + "learning_rate": 3.108723085483535e-06, + "loss": 0.158, + "step": 24787 + }, + { + "epoch": 0.6272743376268441, + "grad_norm": 6.342878818511963, + "learning_rate": 3.1083514016522174e-06, + "loss": 0.1567, + "step": 24788 + }, + { + "epoch": 0.6272996431915379, + "grad_norm": 4.979536533355713, + "learning_rate": 3.10797973001979e-06, + "loss": 0.1053, + "step": 24789 + }, + { + "epoch": 0.6273249487562315, + "grad_norm": 3.6220927238464355, + "learning_rate": 3.1076080705886506e-06, + "loss": 0.1345, + "step": 24790 + }, + { + "epoch": 0.6273502543209252, + "grad_norm": 2.9605391025543213, + "learning_rate": 3.107236423361193e-06, + "loss": 0.0658, + "step": 24791 + }, + { + "epoch": 0.6273755598856189, + "grad_norm": 5.325178623199463, + "learning_rate": 3.106864788339816e-06, + "loss": 0.153, + "step": 24792 + }, + { + "epoch": 0.6274008654503125, + "grad_norm": 3.5830984115600586, + "learning_rate": 3.106493165526916e-06, + "loss": 0.0854, + "step": 24793 + }, + { + "epoch": 0.6274261710150062, + "grad_norm": 4.294273853302002, + "learning_rate": 3.1061215549248895e-06, + "loss": 0.1396, + "step": 24794 + }, + { + "epoch": 0.6274514765796999, + "grad_norm": 10.788098335266113, + "learning_rate": 3.105749956536134e-06, + "loss": 0.2857, + "step": 24795 + }, + { + "epoch": 0.6274767821443935, + "grad_norm": 6.554847717285156, + "learning_rate": 3.1053783703630426e-06, + "loss": 0.1272, + "step": 24796 + }, + { + "epoch": 0.6275020877090872, + "grad_norm": 4.323256015777588, + "learning_rate": 3.1050067964080137e-06, + "loss": 0.1544, + "step": 24797 + }, + { + "epoch": 0.6275273932737809, + "grad_norm": 4.826481819152832, + "learning_rate": 3.104635234673445e-06, + "loss": 0.2067, + "step": 24798 + }, + { + "epoch": 0.6275526988384745, + "grad_norm": 23.677106857299805, + "learning_rate": 3.1042636851617307e-06, + "loss": 0.1098, + "step": 24799 + }, + { + "epoch": 0.6275780044031682, + "grad_norm": 5.527745723724365, + "learning_rate": 3.1038921478752663e-06, + "loss": 0.1414, + "step": 24800 + }, + { + "epoch": 0.627603309967862, + "grad_norm": 4.680931091308594, + "learning_rate": 3.10352062281645e-06, + "loss": 0.1493, + "step": 24801 + }, + { + "epoch": 0.6276286155325556, + "grad_norm": 5.644641876220703, + "learning_rate": 3.1031491099876753e-06, + "loss": 0.1655, + "step": 24802 + }, + { + "epoch": 0.6276539210972493, + "grad_norm": 3.011986255645752, + "learning_rate": 3.102777609391339e-06, + "loss": 0.1187, + "step": 24803 + }, + { + "epoch": 0.627679226661943, + "grad_norm": 5.423452377319336, + "learning_rate": 3.1024061210298395e-06, + "loss": 0.1569, + "step": 24804 + }, + { + "epoch": 0.6277045322266367, + "grad_norm": 3.641000747680664, + "learning_rate": 3.102034644905567e-06, + "loss": 0.1874, + "step": 24805 + }, + { + "epoch": 0.6277298377913303, + "grad_norm": 12.265689849853516, + "learning_rate": 3.101663181020921e-06, + "loss": 0.1314, + "step": 24806 + }, + { + "epoch": 0.627755143356024, + "grad_norm": 2.7656869888305664, + "learning_rate": 3.1012917293782957e-06, + "loss": 0.096, + "step": 24807 + }, + { + "epoch": 0.6277804489207177, + "grad_norm": 3.9354751110076904, + "learning_rate": 3.1009202899800887e-06, + "loss": 0.148, + "step": 24808 + }, + { + "epoch": 0.6278057544854113, + "grad_norm": 3.561497926712036, + "learning_rate": 3.1005488628286915e-06, + "loss": 0.1263, + "step": 24809 + }, + { + "epoch": 0.627831060050105, + "grad_norm": 2.8419554233551025, + "learning_rate": 3.1001774479265016e-06, + "loss": 0.0945, + "step": 24810 + }, + { + "epoch": 0.6278563656147987, + "grad_norm": 5.120520114898682, + "learning_rate": 3.099806045275914e-06, + "loss": 0.1898, + "step": 24811 + }, + { + "epoch": 0.6278816711794923, + "grad_norm": 3.2234878540039062, + "learning_rate": 3.0994346548793253e-06, + "loss": 0.0977, + "step": 24812 + }, + { + "epoch": 0.627906976744186, + "grad_norm": 5.238988876342773, + "learning_rate": 3.099063276739127e-06, + "loss": 0.2111, + "step": 24813 + }, + { + "epoch": 0.6279322823088798, + "grad_norm": 20.2308349609375, + "learning_rate": 3.0986919108577163e-06, + "loss": 0.2697, + "step": 24814 + }, + { + "epoch": 0.6279575878735734, + "grad_norm": 4.510051727294922, + "learning_rate": 3.098320557237488e-06, + "loss": 0.1723, + "step": 24815 + }, + { + "epoch": 0.6279828934382671, + "grad_norm": 5.533730983734131, + "learning_rate": 3.0979492158808367e-06, + "loss": 0.1754, + "step": 24816 + }, + { + "epoch": 0.6280081990029608, + "grad_norm": 9.260241508483887, + "learning_rate": 3.0975778867901583e-06, + "loss": 0.2295, + "step": 24817 + }, + { + "epoch": 0.6280335045676544, + "grad_norm": 3.819640636444092, + "learning_rate": 3.097206569967845e-06, + "loss": 0.1609, + "step": 24818 + }, + { + "epoch": 0.6280588101323481, + "grad_norm": 4.279785633087158, + "learning_rate": 3.0968352654162926e-06, + "loss": 0.1303, + "step": 24819 + }, + { + "epoch": 0.6280841156970418, + "grad_norm": 14.240285873413086, + "learning_rate": 3.0964639731378953e-06, + "loss": 0.2423, + "step": 24820 + }, + { + "epoch": 0.6281094212617354, + "grad_norm": 11.180002212524414, + "learning_rate": 3.09609269313505e-06, + "loss": 0.1693, + "step": 24821 + }, + { + "epoch": 0.6281347268264291, + "grad_norm": 5.30468225479126, + "learning_rate": 3.0957214254101464e-06, + "loss": 0.1248, + "step": 24822 + }, + { + "epoch": 0.6281600323911228, + "grad_norm": 6.824957847595215, + "learning_rate": 3.0953501699655807e-06, + "loss": 0.1278, + "step": 24823 + }, + { + "epoch": 0.6281853379558164, + "grad_norm": 2.953770399093628, + "learning_rate": 3.094978926803748e-06, + "loss": 0.1284, + "step": 24824 + }, + { + "epoch": 0.6282106435205101, + "grad_norm": 5.116641044616699, + "learning_rate": 3.0946076959270433e-06, + "loss": 0.1204, + "step": 24825 + }, + { + "epoch": 0.6282359490852039, + "grad_norm": 11.14299488067627, + "learning_rate": 3.0942364773378586e-06, + "loss": 0.1177, + "step": 24826 + }, + { + "epoch": 0.6282612546498975, + "grad_norm": 8.570069313049316, + "learning_rate": 3.0938652710385875e-06, + "loss": 0.1567, + "step": 24827 + }, + { + "epoch": 0.6282865602145912, + "grad_norm": 3.7983646392822266, + "learning_rate": 3.093494077031626e-06, + "loss": 0.1051, + "step": 24828 + }, + { + "epoch": 0.6283118657792849, + "grad_norm": 3.725609540939331, + "learning_rate": 3.0931228953193647e-06, + "loss": 0.1492, + "step": 24829 + }, + { + "epoch": 0.6283371713439786, + "grad_norm": 3.87056303024292, + "learning_rate": 3.0927517259042004e-06, + "loss": 0.1418, + "step": 24830 + }, + { + "epoch": 0.6283624769086722, + "grad_norm": 3.8567168712615967, + "learning_rate": 3.0923805687885255e-06, + "loss": 0.1051, + "step": 24831 + }, + { + "epoch": 0.6283877824733659, + "grad_norm": 5.606129169464111, + "learning_rate": 3.0920094239747327e-06, + "loss": 0.1543, + "step": 24832 + }, + { + "epoch": 0.6284130880380596, + "grad_norm": 2.3846492767333984, + "learning_rate": 3.0916382914652166e-06, + "loss": 0.1044, + "step": 24833 + }, + { + "epoch": 0.6284383936027532, + "grad_norm": 5.005627155303955, + "learning_rate": 3.0912671712623705e-06, + "loss": 0.1512, + "step": 24834 + }, + { + "epoch": 0.6284636991674469, + "grad_norm": 5.164300918579102, + "learning_rate": 3.0908960633685874e-06, + "loss": 0.2, + "step": 24835 + }, + { + "epoch": 0.6284890047321406, + "grad_norm": 3.7158443927764893, + "learning_rate": 3.09052496778626e-06, + "loss": 0.14, + "step": 24836 + }, + { + "epoch": 0.6285143102968342, + "grad_norm": 8.174965858459473, + "learning_rate": 3.0901538845177816e-06, + "loss": 0.2533, + "step": 24837 + }, + { + "epoch": 0.628539615861528, + "grad_norm": 7.017059326171875, + "learning_rate": 3.089782813565545e-06, + "loss": 0.281, + "step": 24838 + }, + { + "epoch": 0.6285649214262217, + "grad_norm": 7.995674133300781, + "learning_rate": 3.089411754931947e-06, + "loss": 0.145, + "step": 24839 + }, + { + "epoch": 0.6285902269909153, + "grad_norm": 4.047539234161377, + "learning_rate": 3.089040708619374e-06, + "loss": 0.1414, + "step": 24840 + }, + { + "epoch": 0.628615532555609, + "grad_norm": 4.502147197723389, + "learning_rate": 3.0886696746302227e-06, + "loss": 0.1637, + "step": 24841 + }, + { + "epoch": 0.6286408381203027, + "grad_norm": 13.075349807739258, + "learning_rate": 3.0882986529668846e-06, + "loss": 0.1941, + "step": 24842 + }, + { + "epoch": 0.6286661436849963, + "grad_norm": 3.2193069458007812, + "learning_rate": 3.087927643631755e-06, + "loss": 0.1298, + "step": 24843 + }, + { + "epoch": 0.62869144924969, + "grad_norm": 11.34328556060791, + "learning_rate": 3.0875566466272223e-06, + "loss": 0.2782, + "step": 24844 + }, + { + "epoch": 0.6287167548143837, + "grad_norm": 2.6966283321380615, + "learning_rate": 3.0871856619556813e-06, + "loss": 0.0958, + "step": 24845 + }, + { + "epoch": 0.6287420603790773, + "grad_norm": 5.109992504119873, + "learning_rate": 3.0868146896195238e-06, + "loss": 0.1147, + "step": 24846 + }, + { + "epoch": 0.628767365943771, + "grad_norm": 4.9893388748168945, + "learning_rate": 3.086443729621143e-06, + "loss": 0.1262, + "step": 24847 + }, + { + "epoch": 0.6287926715084647, + "grad_norm": 3.988018751144409, + "learning_rate": 3.0860727819629323e-06, + "loss": 0.131, + "step": 24848 + }, + { + "epoch": 0.6288179770731583, + "grad_norm": 9.307479858398438, + "learning_rate": 3.0857018466472806e-06, + "loss": 0.1915, + "step": 24849 + }, + { + "epoch": 0.628843282637852, + "grad_norm": 4.226360321044922, + "learning_rate": 3.085330923676581e-06, + "loss": 0.1606, + "step": 24850 + }, + { + "epoch": 0.6288685882025458, + "grad_norm": 6.023375988006592, + "learning_rate": 3.0849600130532265e-06, + "loss": 0.2071, + "step": 24851 + }, + { + "epoch": 0.6288938937672394, + "grad_norm": 2.6311938762664795, + "learning_rate": 3.08458911477961e-06, + "loss": 0.1126, + "step": 24852 + }, + { + "epoch": 0.6289191993319331, + "grad_norm": 3.834602117538452, + "learning_rate": 3.084218228858121e-06, + "loss": 0.1469, + "step": 24853 + }, + { + "epoch": 0.6289445048966268, + "grad_norm": 3.706777811050415, + "learning_rate": 3.083847355291152e-06, + "loss": 0.0982, + "step": 24854 + }, + { + "epoch": 0.6289698104613205, + "grad_norm": 7.535984039306641, + "learning_rate": 3.0834764940810956e-06, + "loss": 0.1196, + "step": 24855 + }, + { + "epoch": 0.6289951160260141, + "grad_norm": 5.394536972045898, + "learning_rate": 3.083105645230342e-06, + "loss": 0.1701, + "step": 24856 + }, + { + "epoch": 0.6290204215907078, + "grad_norm": 4.105871200561523, + "learning_rate": 3.0827348087412845e-06, + "loss": 0.1075, + "step": 24857 + }, + { + "epoch": 0.6290457271554015, + "grad_norm": 2.5624523162841797, + "learning_rate": 3.0823639846163127e-06, + "loss": 0.0834, + "step": 24858 + }, + { + "epoch": 0.6290710327200951, + "grad_norm": 5.147563457489014, + "learning_rate": 3.08199317285782e-06, + "loss": 0.1549, + "step": 24859 + }, + { + "epoch": 0.6290963382847888, + "grad_norm": 5.940659999847412, + "learning_rate": 3.0816223734681954e-06, + "loss": 0.2335, + "step": 24860 + }, + { + "epoch": 0.6291216438494825, + "grad_norm": 4.166769504547119, + "learning_rate": 3.081251586449832e-06, + "loss": 0.1524, + "step": 24861 + }, + { + "epoch": 0.6291469494141761, + "grad_norm": 3.170325994491577, + "learning_rate": 3.0808808118051205e-06, + "loss": 0.1153, + "step": 24862 + }, + { + "epoch": 0.6291722549788699, + "grad_norm": 12.645071983337402, + "learning_rate": 3.0805100495364504e-06, + "loss": 0.3063, + "step": 24863 + }, + { + "epoch": 0.6291975605435636, + "grad_norm": 15.846614837646484, + "learning_rate": 3.0801392996462145e-06, + "loss": 0.147, + "step": 24864 + }, + { + "epoch": 0.6292228661082572, + "grad_norm": 3.5966718196868896, + "learning_rate": 3.0797685621368045e-06, + "loss": 0.1202, + "step": 24865 + }, + { + "epoch": 0.6292481716729509, + "grad_norm": 9.137041091918945, + "learning_rate": 3.079397837010608e-06, + "loss": 0.2321, + "step": 24866 + }, + { + "epoch": 0.6292734772376446, + "grad_norm": 22.601886749267578, + "learning_rate": 3.079027124270018e-06, + "loss": 0.3636, + "step": 24867 + }, + { + "epoch": 0.6292987828023382, + "grad_norm": 3.68293833732605, + "learning_rate": 3.078656423917424e-06, + "loss": 0.1095, + "step": 24868 + }, + { + "epoch": 0.6293240883670319, + "grad_norm": 6.4558892250061035, + "learning_rate": 3.0782857359552175e-06, + "loss": 0.1889, + "step": 24869 + }, + { + "epoch": 0.6293493939317256, + "grad_norm": 18.09516716003418, + "learning_rate": 3.0779150603857915e-06, + "loss": 0.2471, + "step": 24870 + }, + { + "epoch": 0.6293746994964192, + "grad_norm": 4.848782539367676, + "learning_rate": 3.0775443972115314e-06, + "loss": 0.1475, + "step": 24871 + }, + { + "epoch": 0.6294000050611129, + "grad_norm": 5.0815629959106445, + "learning_rate": 3.0771737464348297e-06, + "loss": 0.1027, + "step": 24872 + }, + { + "epoch": 0.6294253106258066, + "grad_norm": 4.427399635314941, + "learning_rate": 3.0768031080580774e-06, + "loss": 0.1981, + "step": 24873 + }, + { + "epoch": 0.6294506161905002, + "grad_norm": 3.225447416305542, + "learning_rate": 3.0764324820836654e-06, + "loss": 0.1181, + "step": 24874 + }, + { + "epoch": 0.629475921755194, + "grad_norm": 9.65118408203125, + "learning_rate": 3.076061868513981e-06, + "loss": 0.2692, + "step": 24875 + }, + { + "epoch": 0.6295012273198877, + "grad_norm": 6.778306007385254, + "learning_rate": 3.075691267351415e-06, + "loss": 0.1469, + "step": 24876 + }, + { + "epoch": 0.6295265328845813, + "grad_norm": 3.8233683109283447, + "learning_rate": 3.0753206785983587e-06, + "loss": 0.1943, + "step": 24877 + }, + { + "epoch": 0.629551838449275, + "grad_norm": 3.0095901489257812, + "learning_rate": 3.0749501022572014e-06, + "loss": 0.1459, + "step": 24878 + }, + { + "epoch": 0.6295771440139687, + "grad_norm": 4.697969436645508, + "learning_rate": 3.0745795383303347e-06, + "loss": 0.1672, + "step": 24879 + }, + { + "epoch": 0.6296024495786623, + "grad_norm": 4.941776275634766, + "learning_rate": 3.0742089868201442e-06, + "loss": 0.1821, + "step": 24880 + }, + { + "epoch": 0.629627755143356, + "grad_norm": 4.205626964569092, + "learning_rate": 3.0738384477290217e-06, + "loss": 0.0904, + "step": 24881 + }, + { + "epoch": 0.6296530607080497, + "grad_norm": 4.637203216552734, + "learning_rate": 3.0734679210593566e-06, + "loss": 0.0937, + "step": 24882 + }, + { + "epoch": 0.6296783662727434, + "grad_norm": 5.290699005126953, + "learning_rate": 3.0730974068135412e-06, + "loss": 0.1574, + "step": 24883 + }, + { + "epoch": 0.629703671837437, + "grad_norm": 4.312522888183594, + "learning_rate": 3.0727269049939597e-06, + "loss": 0.1498, + "step": 24884 + }, + { + "epoch": 0.6297289774021307, + "grad_norm": 3.8480029106140137, + "learning_rate": 3.072356415603004e-06, + "loss": 0.1903, + "step": 24885 + }, + { + "epoch": 0.6297542829668245, + "grad_norm": 6.645788669586182, + "learning_rate": 3.071985938643064e-06, + "loss": 0.2935, + "step": 24886 + }, + { + "epoch": 0.629779588531518, + "grad_norm": 8.246984481811523, + "learning_rate": 3.0716154741165284e-06, + "loss": 0.2134, + "step": 24887 + }, + { + "epoch": 0.6298048940962118, + "grad_norm": 7.9715681076049805, + "learning_rate": 3.0712450220257845e-06, + "loss": 0.1753, + "step": 24888 + }, + { + "epoch": 0.6298301996609055, + "grad_norm": 4.178788661956787, + "learning_rate": 3.070874582373224e-06, + "loss": 0.138, + "step": 24889 + }, + { + "epoch": 0.6298555052255991, + "grad_norm": 4.222712993621826, + "learning_rate": 3.0705041551612336e-06, + "loss": 0.1381, + "step": 24890 + }, + { + "epoch": 0.6298808107902928, + "grad_norm": 4.894155979156494, + "learning_rate": 3.0701337403922026e-06, + "loss": 0.1339, + "step": 24891 + }, + { + "epoch": 0.6299061163549865, + "grad_norm": 5.14033317565918, + "learning_rate": 3.069763338068522e-06, + "loss": 0.1169, + "step": 24892 + }, + { + "epoch": 0.6299314219196801, + "grad_norm": 5.591646194458008, + "learning_rate": 3.069392948192577e-06, + "loss": 0.1794, + "step": 24893 + }, + { + "epoch": 0.6299567274843738, + "grad_norm": 6.782931327819824, + "learning_rate": 3.0690225707667576e-06, + "loss": 0.1967, + "step": 24894 + }, + { + "epoch": 0.6299820330490675, + "grad_norm": 8.770575523376465, + "learning_rate": 3.068652205793452e-06, + "loss": 0.2292, + "step": 24895 + }, + { + "epoch": 0.6300073386137611, + "grad_norm": 3.582646369934082, + "learning_rate": 3.068281853275052e-06, + "loss": 0.1802, + "step": 24896 + }, + { + "epoch": 0.6300326441784548, + "grad_norm": 7.289784908294678, + "learning_rate": 3.0679115132139402e-06, + "loss": 0.1544, + "step": 24897 + }, + { + "epoch": 0.6300579497431485, + "grad_norm": 14.737273216247559, + "learning_rate": 3.0675411856125082e-06, + "loss": 0.1427, + "step": 24898 + }, + { + "epoch": 0.6300832553078421, + "grad_norm": 5.033279895782471, + "learning_rate": 3.0671708704731433e-06, + "loss": 0.2006, + "step": 24899 + }, + { + "epoch": 0.6301085608725359, + "grad_norm": 2.667652130126953, + "learning_rate": 3.0668005677982337e-06, + "loss": 0.1016, + "step": 24900 + }, + { + "epoch": 0.6301338664372296, + "grad_norm": 6.437896251678467, + "learning_rate": 3.0664302775901706e-06, + "loss": 0.1736, + "step": 24901 + }, + { + "epoch": 0.6301591720019232, + "grad_norm": 3.4092044830322266, + "learning_rate": 3.0660599998513363e-06, + "loss": 0.1301, + "step": 24902 + }, + { + "epoch": 0.6301844775666169, + "grad_norm": 3.8985958099365234, + "learning_rate": 3.0656897345841215e-06, + "loss": 0.1645, + "step": 24903 + }, + { + "epoch": 0.6302097831313106, + "grad_norm": 4.203588962554932, + "learning_rate": 3.0653194817909136e-06, + "loss": 0.1273, + "step": 24904 + }, + { + "epoch": 0.6302350886960042, + "grad_norm": 3.8015315532684326, + "learning_rate": 3.0649492414741024e-06, + "loss": 0.1849, + "step": 24905 + }, + { + "epoch": 0.6302603942606979, + "grad_norm": 3.987950086593628, + "learning_rate": 3.064579013636072e-06, + "loss": 0.1335, + "step": 24906 + }, + { + "epoch": 0.6302856998253916, + "grad_norm": 5.2841339111328125, + "learning_rate": 3.064208798279211e-06, + "loss": 0.1837, + "step": 24907 + }, + { + "epoch": 0.6303110053900853, + "grad_norm": 3.079833745956421, + "learning_rate": 3.0638385954059076e-06, + "loss": 0.1413, + "step": 24908 + }, + { + "epoch": 0.6303363109547789, + "grad_norm": 5.249194145202637, + "learning_rate": 3.063468405018551e-06, + "loss": 0.1985, + "step": 24909 + }, + { + "epoch": 0.6303616165194726, + "grad_norm": 5.4666972160339355, + "learning_rate": 3.0630982271195244e-06, + "loss": 0.1694, + "step": 24910 + }, + { + "epoch": 0.6303869220841664, + "grad_norm": 2.456969738006592, + "learning_rate": 3.062728061711217e-06, + "loss": 0.0968, + "step": 24911 + }, + { + "epoch": 0.63041222764886, + "grad_norm": 2.786616563796997, + "learning_rate": 3.062357908796016e-06, + "loss": 0.0746, + "step": 24912 + }, + { + "epoch": 0.6304375332135537, + "grad_norm": 7.039666652679443, + "learning_rate": 3.0619877683763094e-06, + "loss": 0.1939, + "step": 24913 + }, + { + "epoch": 0.6304628387782474, + "grad_norm": 8.509170532226562, + "learning_rate": 3.0616176404544832e-06, + "loss": 0.2056, + "step": 24914 + }, + { + "epoch": 0.630488144342941, + "grad_norm": 4.630248546600342, + "learning_rate": 3.0612475250329233e-06, + "loss": 0.1405, + "step": 24915 + }, + { + "epoch": 0.6305134499076347, + "grad_norm": 6.873341083526611, + "learning_rate": 3.0608774221140185e-06, + "loss": 0.1862, + "step": 24916 + }, + { + "epoch": 0.6305387554723284, + "grad_norm": 12.952173233032227, + "learning_rate": 3.0605073317001534e-06, + "loss": 0.3278, + "step": 24917 + }, + { + "epoch": 0.630564061037022, + "grad_norm": 7.837841987609863, + "learning_rate": 3.060137253793717e-06, + "loss": 0.1442, + "step": 24918 + }, + { + "epoch": 0.6305893666017157, + "grad_norm": 6.794432163238525, + "learning_rate": 3.0597671883970936e-06, + "loss": 0.1352, + "step": 24919 + }, + { + "epoch": 0.6306146721664094, + "grad_norm": 7.048471927642822, + "learning_rate": 3.0593971355126715e-06, + "loss": 0.1615, + "step": 24920 + }, + { + "epoch": 0.630639977731103, + "grad_norm": 5.953210353851318, + "learning_rate": 3.059027095142835e-06, + "loss": 0.239, + "step": 24921 + }, + { + "epoch": 0.6306652832957967, + "grad_norm": 3.8948771953582764, + "learning_rate": 3.058657067289972e-06, + "loss": 0.1362, + "step": 24922 + }, + { + "epoch": 0.6306905888604905, + "grad_norm": 5.1220927238464355, + "learning_rate": 3.0582870519564706e-06, + "loss": 0.1678, + "step": 24923 + }, + { + "epoch": 0.630715894425184, + "grad_norm": 6.308745384216309, + "learning_rate": 3.057917049144713e-06, + "loss": 0.1875, + "step": 24924 + }, + { + "epoch": 0.6307411999898778, + "grad_norm": 5.103799343109131, + "learning_rate": 3.0575470588570866e-06, + "loss": 0.1653, + "step": 24925 + }, + { + "epoch": 0.6307665055545715, + "grad_norm": 8.486910820007324, + "learning_rate": 3.0571770810959776e-06, + "loss": 0.1952, + "step": 24926 + }, + { + "epoch": 0.6307918111192651, + "grad_norm": 5.161083698272705, + "learning_rate": 3.056807115863775e-06, + "loss": 0.156, + "step": 24927 + }, + { + "epoch": 0.6308171166839588, + "grad_norm": 4.278575420379639, + "learning_rate": 3.0564371631628598e-06, + "loss": 0.1471, + "step": 24928 + }, + { + "epoch": 0.6308424222486525, + "grad_norm": 3.3996989727020264, + "learning_rate": 3.0560672229956196e-06, + "loss": 0.1627, + "step": 24929 + }, + { + "epoch": 0.6308677278133461, + "grad_norm": 7.094139575958252, + "learning_rate": 3.0556972953644404e-06, + "loss": 0.1726, + "step": 24930 + }, + { + "epoch": 0.6308930333780398, + "grad_norm": 3.810601234436035, + "learning_rate": 3.055327380271707e-06, + "loss": 0.0838, + "step": 24931 + }, + { + "epoch": 0.6309183389427335, + "grad_norm": 3.184006929397583, + "learning_rate": 3.0549574777198084e-06, + "loss": 0.118, + "step": 24932 + }, + { + "epoch": 0.6309436445074272, + "grad_norm": 5.35361909866333, + "learning_rate": 3.0545875877111247e-06, + "loss": 0.1139, + "step": 24933 + }, + { + "epoch": 0.6309689500721208, + "grad_norm": 4.39752197265625, + "learning_rate": 3.0542177102480443e-06, + "loss": 0.1226, + "step": 24934 + }, + { + "epoch": 0.6309942556368145, + "grad_norm": 6.203634262084961, + "learning_rate": 3.0538478453329514e-06, + "loss": 0.1955, + "step": 24935 + }, + { + "epoch": 0.6310195612015083, + "grad_norm": 3.5285046100616455, + "learning_rate": 3.053477992968234e-06, + "loss": 0.1331, + "step": 24936 + }, + { + "epoch": 0.6310448667662019, + "grad_norm": 5.9944024085998535, + "learning_rate": 3.0531081531562728e-06, + "loss": 0.1287, + "step": 24937 + }, + { + "epoch": 0.6310701723308956, + "grad_norm": 6.029077529907227, + "learning_rate": 3.0527383258994546e-06, + "loss": 0.1182, + "step": 24938 + }, + { + "epoch": 0.6310954778955893, + "grad_norm": 3.566345453262329, + "learning_rate": 3.0523685112001653e-06, + "loss": 0.1227, + "step": 24939 + }, + { + "epoch": 0.6311207834602829, + "grad_norm": 5.162689208984375, + "learning_rate": 3.0519987090607916e-06, + "loss": 0.1156, + "step": 24940 + }, + { + "epoch": 0.6311460890249766, + "grad_norm": 6.811553001403809, + "learning_rate": 3.0516289194837135e-06, + "loss": 0.2322, + "step": 24941 + }, + { + "epoch": 0.6311713945896703, + "grad_norm": 12.619135856628418, + "learning_rate": 3.0512591424713176e-06, + "loss": 0.3052, + "step": 24942 + }, + { + "epoch": 0.6311967001543639, + "grad_norm": 10.122992515563965, + "learning_rate": 3.0508893780259896e-06, + "loss": 0.1771, + "step": 24943 + }, + { + "epoch": 0.6312220057190576, + "grad_norm": 4.483376979827881, + "learning_rate": 3.0505196261501134e-06, + "loss": 0.2105, + "step": 24944 + }, + { + "epoch": 0.6312473112837513, + "grad_norm": 5.347558498382568, + "learning_rate": 3.0501498868460743e-06, + "loss": 0.1418, + "step": 24945 + }, + { + "epoch": 0.6312726168484449, + "grad_norm": 2.691622734069824, + "learning_rate": 3.0497801601162553e-06, + "loss": 0.1114, + "step": 24946 + }, + { + "epoch": 0.6312979224131386, + "grad_norm": 6.110506057739258, + "learning_rate": 3.0494104459630414e-06, + "loss": 0.2117, + "step": 24947 + }, + { + "epoch": 0.6313232279778324, + "grad_norm": 6.6564202308654785, + "learning_rate": 3.0490407443888164e-06, + "loss": 0.2316, + "step": 24948 + }, + { + "epoch": 0.631348533542526, + "grad_norm": 4.351955413818359, + "learning_rate": 3.0486710553959653e-06, + "loss": 0.1514, + "step": 24949 + }, + { + "epoch": 0.6313738391072197, + "grad_norm": 6.564594745635986, + "learning_rate": 3.048301378986872e-06, + "loss": 0.2, + "step": 24950 + }, + { + "epoch": 0.6313991446719134, + "grad_norm": 3.3401410579681396, + "learning_rate": 3.0479317151639187e-06, + "loss": 0.1284, + "step": 24951 + }, + { + "epoch": 0.631424450236607, + "grad_norm": 7.513323783874512, + "learning_rate": 3.047562063929491e-06, + "loss": 0.1709, + "step": 24952 + }, + { + "epoch": 0.6314497558013007, + "grad_norm": 3.2229111194610596, + "learning_rate": 3.047192425285972e-06, + "loss": 0.1518, + "step": 24953 + }, + { + "epoch": 0.6314750613659944, + "grad_norm": 7.648508071899414, + "learning_rate": 3.046822799235748e-06, + "loss": 0.277, + "step": 24954 + }, + { + "epoch": 0.631500366930688, + "grad_norm": 10.490456581115723, + "learning_rate": 3.0464531857811985e-06, + "loss": 0.2943, + "step": 24955 + }, + { + "epoch": 0.6315256724953817, + "grad_norm": 4.55735969543457, + "learning_rate": 3.0460835849247085e-06, + "loss": 0.2038, + "step": 24956 + }, + { + "epoch": 0.6315509780600754, + "grad_norm": 3.8528454303741455, + "learning_rate": 3.045713996668662e-06, + "loss": 0.1721, + "step": 24957 + }, + { + "epoch": 0.6315762836247691, + "grad_norm": 4.540703296661377, + "learning_rate": 3.045344421015445e-06, + "loss": 0.1506, + "step": 24958 + }, + { + "epoch": 0.6316015891894627, + "grad_norm": 4.691853046417236, + "learning_rate": 3.0449748579674364e-06, + "loss": 0.1685, + "step": 24959 + }, + { + "epoch": 0.6316268947541565, + "grad_norm": 3.452976942062378, + "learning_rate": 3.044605307527021e-06, + "loss": 0.1577, + "step": 24960 + }, + { + "epoch": 0.6316522003188502, + "grad_norm": 6.94497013092041, + "learning_rate": 3.0442357696965817e-06, + "loss": 0.195, + "step": 24961 + }, + { + "epoch": 0.6316775058835438, + "grad_norm": 13.042900085449219, + "learning_rate": 3.043866244478505e-06, + "loss": 0.2106, + "step": 24962 + }, + { + "epoch": 0.6317028114482375, + "grad_norm": 6.0577778816223145, + "learning_rate": 3.0434967318751683e-06, + "loss": 0.2383, + "step": 24963 + }, + { + "epoch": 0.6317281170129312, + "grad_norm": 4.730152606964111, + "learning_rate": 3.0431272318889576e-06, + "loss": 0.1355, + "step": 24964 + }, + { + "epoch": 0.6317534225776248, + "grad_norm": 3.3386504650115967, + "learning_rate": 3.0427577445222555e-06, + "loss": 0.1656, + "step": 24965 + }, + { + "epoch": 0.6317787281423185, + "grad_norm": 5.286304950714111, + "learning_rate": 3.042388269777444e-06, + "loss": 0.1547, + "step": 24966 + }, + { + "epoch": 0.6318040337070122, + "grad_norm": 3.1661243438720703, + "learning_rate": 3.042018807656909e-06, + "loss": 0.1482, + "step": 24967 + }, + { + "epoch": 0.6318293392717058, + "grad_norm": 2.8869311809539795, + "learning_rate": 3.0416493581630284e-06, + "loss": 0.1213, + "step": 24968 + }, + { + "epoch": 0.6318546448363995, + "grad_norm": 5.602777481079102, + "learning_rate": 3.041279921298187e-06, + "loss": 0.1653, + "step": 24969 + }, + { + "epoch": 0.6318799504010932, + "grad_norm": 5.240205764770508, + "learning_rate": 3.040910497064766e-06, + "loss": 0.1694, + "step": 24970 + }, + { + "epoch": 0.6319052559657868, + "grad_norm": 4.294070720672607, + "learning_rate": 3.0405410854651517e-06, + "loss": 0.1593, + "step": 24971 + }, + { + "epoch": 0.6319305615304806, + "grad_norm": 10.015176773071289, + "learning_rate": 3.040171686501722e-06, + "loss": 0.2455, + "step": 24972 + }, + { + "epoch": 0.6319558670951743, + "grad_norm": 5.01712703704834, + "learning_rate": 3.0398023001768596e-06, + "loss": 0.1364, + "step": 24973 + }, + { + "epoch": 0.6319811726598679, + "grad_norm": 8.017991065979004, + "learning_rate": 3.039432926492949e-06, + "loss": 0.1286, + "step": 24974 + }, + { + "epoch": 0.6320064782245616, + "grad_norm": 4.346142768859863, + "learning_rate": 3.039063565452369e-06, + "loss": 0.1681, + "step": 24975 + }, + { + "epoch": 0.6320317837892553, + "grad_norm": 10.60305118560791, + "learning_rate": 3.038694217057505e-06, + "loss": 0.2352, + "step": 24976 + }, + { + "epoch": 0.6320570893539489, + "grad_norm": 5.173323154449463, + "learning_rate": 3.0383248813107376e-06, + "loss": 0.157, + "step": 24977 + }, + { + "epoch": 0.6320823949186426, + "grad_norm": 7.847064971923828, + "learning_rate": 3.037955558214446e-06, + "loss": 0.2675, + "step": 24978 + }, + { + "epoch": 0.6321077004833363, + "grad_norm": 10.598108291625977, + "learning_rate": 3.037586247771015e-06, + "loss": 0.3249, + "step": 24979 + }, + { + "epoch": 0.6321330060480299, + "grad_norm": 5.954131603240967, + "learning_rate": 3.0372169499828258e-06, + "loss": 0.1777, + "step": 24980 + }, + { + "epoch": 0.6321583116127236, + "grad_norm": 4.616660118103027, + "learning_rate": 3.0368476648522595e-06, + "loss": 0.1797, + "step": 24981 + }, + { + "epoch": 0.6321836171774173, + "grad_norm": 2.5024337768554688, + "learning_rate": 3.0364783923816964e-06, + "loss": 0.0692, + "step": 24982 + }, + { + "epoch": 0.6322089227421109, + "grad_norm": 10.316702842712402, + "learning_rate": 3.0361091325735185e-06, + "loss": 0.1636, + "step": 24983 + }, + { + "epoch": 0.6322342283068046, + "grad_norm": 4.762701511383057, + "learning_rate": 3.035739885430108e-06, + "loss": 0.1438, + "step": 24984 + }, + { + "epoch": 0.6322595338714984, + "grad_norm": 2.9009408950805664, + "learning_rate": 3.035370650953848e-06, + "loss": 0.1528, + "step": 24985 + }, + { + "epoch": 0.6322848394361921, + "grad_norm": 7.26331901550293, + "learning_rate": 3.035001429147114e-06, + "loss": 0.1206, + "step": 24986 + }, + { + "epoch": 0.6323101450008857, + "grad_norm": 6.190917491912842, + "learning_rate": 3.0346322200122914e-06, + "loss": 0.1783, + "step": 24987 + }, + { + "epoch": 0.6323354505655794, + "grad_norm": 4.177987098693848, + "learning_rate": 3.0342630235517595e-06, + "loss": 0.1678, + "step": 24988 + }, + { + "epoch": 0.6323607561302731, + "grad_norm": 6.413508892059326, + "learning_rate": 3.033893839767902e-06, + "loss": 0.2491, + "step": 24989 + }, + { + "epoch": 0.6323860616949667, + "grad_norm": 3.863966941833496, + "learning_rate": 3.0335246686630946e-06, + "loss": 0.1655, + "step": 24990 + }, + { + "epoch": 0.6324113672596604, + "grad_norm": 4.393311023712158, + "learning_rate": 3.0331555102397215e-06, + "loss": 0.1634, + "step": 24991 + }, + { + "epoch": 0.6324366728243541, + "grad_norm": 3.7186615467071533, + "learning_rate": 3.0327863645001633e-06, + "loss": 0.1695, + "step": 24992 + }, + { + "epoch": 0.6324619783890477, + "grad_norm": 2.6243503093719482, + "learning_rate": 3.0324172314468005e-06, + "loss": 0.1119, + "step": 24993 + }, + { + "epoch": 0.6324872839537414, + "grad_norm": 4.913719654083252, + "learning_rate": 3.032048111082012e-06, + "loss": 0.1331, + "step": 24994 + }, + { + "epoch": 0.6325125895184351, + "grad_norm": 3.059696912765503, + "learning_rate": 3.0316790034081787e-06, + "loss": 0.112, + "step": 24995 + }, + { + "epoch": 0.6325378950831287, + "grad_norm": 5.977302074432373, + "learning_rate": 3.0313099084276814e-06, + "loss": 0.1804, + "step": 24996 + }, + { + "epoch": 0.6325632006478225, + "grad_norm": 6.5086212158203125, + "learning_rate": 3.030940826142901e-06, + "loss": 0.1717, + "step": 24997 + }, + { + "epoch": 0.6325885062125162, + "grad_norm": 6.223195552825928, + "learning_rate": 3.0305717565562176e-06, + "loss": 0.1392, + "step": 24998 + }, + { + "epoch": 0.6326138117772098, + "grad_norm": 9.148526191711426, + "learning_rate": 3.0302026996700094e-06, + "loss": 0.2706, + "step": 24999 + }, + { + "epoch": 0.6326391173419035, + "grad_norm": 2.091587781906128, + "learning_rate": 3.0298336554866577e-06, + "loss": 0.1062, + "step": 25000 + }, + { + "epoch": 0.6326644229065972, + "grad_norm": 5.104007244110107, + "learning_rate": 3.0294646240085427e-06, + "loss": 0.2404, + "step": 25001 + }, + { + "epoch": 0.6326897284712908, + "grad_norm": 14.612221717834473, + "learning_rate": 3.029095605238045e-06, + "loss": 0.2246, + "step": 25002 + }, + { + "epoch": 0.6327150340359845, + "grad_norm": 8.066954612731934, + "learning_rate": 3.0287265991775406e-06, + "loss": 0.2581, + "step": 25003 + }, + { + "epoch": 0.6327403396006782, + "grad_norm": 9.539003372192383, + "learning_rate": 3.028357605829413e-06, + "loss": 0.3001, + "step": 25004 + }, + { + "epoch": 0.6327656451653718, + "grad_norm": 4.525074005126953, + "learning_rate": 3.02798862519604e-06, + "loss": 0.1049, + "step": 25005 + }, + { + "epoch": 0.6327909507300655, + "grad_norm": 3.5106921195983887, + "learning_rate": 3.0276196572798017e-06, + "loss": 0.1274, + "step": 25006 + }, + { + "epoch": 0.6328162562947592, + "grad_norm": 4.6589460372924805, + "learning_rate": 3.0272507020830777e-06, + "loss": 0.1848, + "step": 25007 + }, + { + "epoch": 0.6328415618594528, + "grad_norm": 5.928452968597412, + "learning_rate": 3.0268817596082478e-06, + "loss": 0.1319, + "step": 25008 + }, + { + "epoch": 0.6328668674241466, + "grad_norm": 3.541077136993408, + "learning_rate": 3.0265128298576886e-06, + "loss": 0.1369, + "step": 25009 + }, + { + "epoch": 0.6328921729888403, + "grad_norm": 5.849669933319092, + "learning_rate": 3.026143912833781e-06, + "loss": 0.1523, + "step": 25010 + }, + { + "epoch": 0.632917478553534, + "grad_norm": 5.1871514320373535, + "learning_rate": 3.0257750085389072e-06, + "loss": 0.131, + "step": 25011 + }, + { + "epoch": 0.6329427841182276, + "grad_norm": 9.84637451171875, + "learning_rate": 3.02540611697544e-06, + "loss": 0.2379, + "step": 25012 + }, + { + "epoch": 0.6329680896829213, + "grad_norm": 4.620233058929443, + "learning_rate": 3.0250372381457617e-06, + "loss": 0.1917, + "step": 25013 + }, + { + "epoch": 0.632993395247615, + "grad_norm": 3.977696180343628, + "learning_rate": 3.0246683720522514e-06, + "loss": 0.1681, + "step": 25014 + }, + { + "epoch": 0.6330187008123086, + "grad_norm": 2.9825522899627686, + "learning_rate": 3.024299518697289e-06, + "loss": 0.1277, + "step": 25015 + }, + { + "epoch": 0.6330440063770023, + "grad_norm": 4.56689977645874, + "learning_rate": 3.023930678083249e-06, + "loss": 0.1129, + "step": 25016 + }, + { + "epoch": 0.633069311941696, + "grad_norm": 3.3345541954040527, + "learning_rate": 3.023561850212513e-06, + "loss": 0.1453, + "step": 25017 + }, + { + "epoch": 0.6330946175063896, + "grad_norm": 2.557335615158081, + "learning_rate": 3.0231930350874595e-06, + "loss": 0.1217, + "step": 25018 + }, + { + "epoch": 0.6331199230710833, + "grad_norm": 3.8066463470458984, + "learning_rate": 3.0228242327104653e-06, + "loss": 0.1364, + "step": 25019 + }, + { + "epoch": 0.633145228635777, + "grad_norm": 7.817533016204834, + "learning_rate": 3.0224554430839127e-06, + "loss": 0.2578, + "step": 25020 + }, + { + "epoch": 0.6331705342004706, + "grad_norm": 6.348731517791748, + "learning_rate": 3.0220866662101742e-06, + "loss": 0.1539, + "step": 25021 + }, + { + "epoch": 0.6331958397651644, + "grad_norm": 14.181600570678711, + "learning_rate": 3.021717902091632e-06, + "loss": 0.3451, + "step": 25022 + }, + { + "epoch": 0.6332211453298581, + "grad_norm": 6.165173053741455, + "learning_rate": 3.0213491507306625e-06, + "loss": 0.2054, + "step": 25023 + }, + { + "epoch": 0.6332464508945517, + "grad_norm": 8.09009075164795, + "learning_rate": 3.020980412129646e-06, + "loss": 0.1632, + "step": 25024 + }, + { + "epoch": 0.6332717564592454, + "grad_norm": 2.921581983566284, + "learning_rate": 3.0206116862909573e-06, + "loss": 0.1505, + "step": 25025 + }, + { + "epoch": 0.6332970620239391, + "grad_norm": 13.042069435119629, + "learning_rate": 3.0202429732169745e-06, + "loss": 0.1919, + "step": 25026 + }, + { + "epoch": 0.6333223675886327, + "grad_norm": 6.90048885345459, + "learning_rate": 3.0198742729100775e-06, + "loss": 0.1405, + "step": 25027 + }, + { + "epoch": 0.6333476731533264, + "grad_norm": 15.761674880981445, + "learning_rate": 3.019505585372644e-06, + "loss": 0.2883, + "step": 25028 + }, + { + "epoch": 0.6333729787180201, + "grad_norm": 4.074893951416016, + "learning_rate": 3.01913691060705e-06, + "loss": 0.1749, + "step": 25029 + }, + { + "epoch": 0.6333982842827137, + "grad_norm": 6.653360843658447, + "learning_rate": 3.0187682486156733e-06, + "loss": 0.1748, + "step": 25030 + }, + { + "epoch": 0.6334235898474074, + "grad_norm": 5.528167247772217, + "learning_rate": 3.0183995994008915e-06, + "loss": 0.1361, + "step": 25031 + }, + { + "epoch": 0.6334488954121011, + "grad_norm": 3.0238900184631348, + "learning_rate": 3.0180309629650827e-06, + "loss": 0.1125, + "step": 25032 + }, + { + "epoch": 0.6334742009767947, + "grad_norm": 5.286896705627441, + "learning_rate": 3.017662339310624e-06, + "loss": 0.1297, + "step": 25033 + }, + { + "epoch": 0.6334995065414885, + "grad_norm": 2.51375412940979, + "learning_rate": 3.0172937284398905e-06, + "loss": 0.1188, + "step": 25034 + }, + { + "epoch": 0.6335248121061822, + "grad_norm": 4.478459358215332, + "learning_rate": 3.0169251303552627e-06, + "loss": 0.1808, + "step": 25035 + }, + { + "epoch": 0.6335501176708759, + "grad_norm": 12.944952011108398, + "learning_rate": 3.016556545059114e-06, + "loss": 0.2066, + "step": 25036 + }, + { + "epoch": 0.6335754232355695, + "grad_norm": 2.102484941482544, + "learning_rate": 3.0161879725538245e-06, + "loss": 0.0769, + "step": 25037 + }, + { + "epoch": 0.6336007288002632, + "grad_norm": 4.122659206390381, + "learning_rate": 3.01581941284177e-06, + "loss": 0.1683, + "step": 25038 + }, + { + "epoch": 0.6336260343649569, + "grad_norm": 4.8222455978393555, + "learning_rate": 3.0154508659253256e-06, + "loss": 0.1399, + "step": 25039 + }, + { + "epoch": 0.6336513399296505, + "grad_norm": 5.3827714920043945, + "learning_rate": 3.01508233180687e-06, + "loss": 0.1758, + "step": 25040 + }, + { + "epoch": 0.6336766454943442, + "grad_norm": 10.506234169006348, + "learning_rate": 3.0147138104887787e-06, + "loss": 0.2571, + "step": 25041 + }, + { + "epoch": 0.6337019510590379, + "grad_norm": 6.3106560707092285, + "learning_rate": 3.0143453019734314e-06, + "loss": 0.247, + "step": 25042 + }, + { + "epoch": 0.6337272566237315, + "grad_norm": 9.30534553527832, + "learning_rate": 3.013976806263199e-06, + "loss": 0.266, + "step": 25043 + }, + { + "epoch": 0.6337525621884252, + "grad_norm": 5.923261642456055, + "learning_rate": 3.0136083233604613e-06, + "loss": 0.2624, + "step": 25044 + }, + { + "epoch": 0.633777867753119, + "grad_norm": 9.039791107177734, + "learning_rate": 3.0132398532675945e-06, + "loss": 0.2616, + "step": 25045 + }, + { + "epoch": 0.6338031733178126, + "grad_norm": 10.11161994934082, + "learning_rate": 3.0128713959869756e-06, + "loss": 0.2571, + "step": 25046 + }, + { + "epoch": 0.6338284788825063, + "grad_norm": 7.803793907165527, + "learning_rate": 3.012502951520977e-06, + "loss": 0.06, + "step": 25047 + }, + { + "epoch": 0.6338537844472, + "grad_norm": 11.61837100982666, + "learning_rate": 3.0121345198719775e-06, + "loss": 0.1394, + "step": 25048 + }, + { + "epoch": 0.6338790900118936, + "grad_norm": 7.5432047843933105, + "learning_rate": 3.0117661010423536e-06, + "loss": 0.3147, + "step": 25049 + }, + { + "epoch": 0.6339043955765873, + "grad_norm": 6.837443828582764, + "learning_rate": 3.011397695034479e-06, + "loss": 0.1274, + "step": 25050 + }, + { + "epoch": 0.633929701141281, + "grad_norm": 3.264409303665161, + "learning_rate": 3.011029301850734e-06, + "loss": 0.0848, + "step": 25051 + }, + { + "epoch": 0.6339550067059746, + "grad_norm": 3.5637474060058594, + "learning_rate": 3.010660921493488e-06, + "loss": 0.1661, + "step": 25052 + }, + { + "epoch": 0.6339803122706683, + "grad_norm": 2.5968332290649414, + "learning_rate": 3.0102925539651206e-06, + "loss": 0.1218, + "step": 25053 + }, + { + "epoch": 0.634005617835362, + "grad_norm": 13.227729797363281, + "learning_rate": 3.0099241992680054e-06, + "loss": 0.1458, + "step": 25054 + }, + { + "epoch": 0.6340309234000556, + "grad_norm": 2.133479356765747, + "learning_rate": 3.0095558574045216e-06, + "loss": 0.0615, + "step": 25055 + }, + { + "epoch": 0.6340562289647493, + "grad_norm": 2.7600395679473877, + "learning_rate": 3.0091875283770398e-06, + "loss": 0.1051, + "step": 25056 + }, + { + "epoch": 0.634081534529443, + "grad_norm": 3.378080129623413, + "learning_rate": 3.008819212187937e-06, + "loss": 0.1711, + "step": 25057 + }, + { + "epoch": 0.6341068400941366, + "grad_norm": 7.984818458557129, + "learning_rate": 3.0084509088395887e-06, + "loss": 0.1154, + "step": 25058 + }, + { + "epoch": 0.6341321456588304, + "grad_norm": 5.848668575286865, + "learning_rate": 3.0080826183343712e-06, + "loss": 0.2106, + "step": 25059 + }, + { + "epoch": 0.6341574512235241, + "grad_norm": 6.943432331085205, + "learning_rate": 3.0077143406746583e-06, + "loss": 0.2425, + "step": 25060 + }, + { + "epoch": 0.6341827567882178, + "grad_norm": 3.764378309249878, + "learning_rate": 3.0073460758628237e-06, + "loss": 0.1408, + "step": 25061 + }, + { + "epoch": 0.6342080623529114, + "grad_norm": 2.857496500015259, + "learning_rate": 3.0069778239012455e-06, + "loss": 0.0916, + "step": 25062 + }, + { + "epoch": 0.6342333679176051, + "grad_norm": 5.819619178771973, + "learning_rate": 3.006609584792295e-06, + "loss": 0.1856, + "step": 25063 + }, + { + "epoch": 0.6342586734822988, + "grad_norm": 6.598679065704346, + "learning_rate": 3.006241358538349e-06, + "loss": 0.1786, + "step": 25064 + }, + { + "epoch": 0.6342839790469924, + "grad_norm": 4.318741798400879, + "learning_rate": 3.005873145141782e-06, + "loss": 0.1771, + "step": 25065 + }, + { + "epoch": 0.6343092846116861, + "grad_norm": 4.094130516052246, + "learning_rate": 3.005504944604968e-06, + "loss": 0.1625, + "step": 25066 + }, + { + "epoch": 0.6343345901763798, + "grad_norm": 6.646241664886475, + "learning_rate": 3.0051367569302804e-06, + "loss": 0.1692, + "step": 25067 + }, + { + "epoch": 0.6343598957410734, + "grad_norm": 10.033982276916504, + "learning_rate": 3.0047685821200967e-06, + "loss": 0.1972, + "step": 25068 + }, + { + "epoch": 0.6343852013057671, + "grad_norm": 10.441981315612793, + "learning_rate": 3.0044004201767883e-06, + "loss": 0.1695, + "step": 25069 + }, + { + "epoch": 0.6344105068704609, + "grad_norm": 2.883798599243164, + "learning_rate": 3.0040322711027305e-06, + "loss": 0.0966, + "step": 25070 + }, + { + "epoch": 0.6344358124351545, + "grad_norm": 5.93397855758667, + "learning_rate": 3.003664134900296e-06, + "loss": 0.2257, + "step": 25071 + }, + { + "epoch": 0.6344611179998482, + "grad_norm": 3.41137433052063, + "learning_rate": 3.0032960115718613e-06, + "loss": 0.1218, + "step": 25072 + }, + { + "epoch": 0.6344864235645419, + "grad_norm": 5.554403305053711, + "learning_rate": 3.002927901119801e-06, + "loss": 0.1338, + "step": 25073 + }, + { + "epoch": 0.6345117291292355, + "grad_norm": 10.321584701538086, + "learning_rate": 3.0025598035464843e-06, + "loss": 0.2621, + "step": 25074 + }, + { + "epoch": 0.6345370346939292, + "grad_norm": 3.1571757793426514, + "learning_rate": 3.0021917188542887e-06, + "loss": 0.1477, + "step": 25075 + }, + { + "epoch": 0.6345623402586229, + "grad_norm": 2.090956926345825, + "learning_rate": 3.001823647045587e-06, + "loss": 0.0808, + "step": 25076 + }, + { + "epoch": 0.6345876458233165, + "grad_norm": 6.14084005355835, + "learning_rate": 3.001455588122755e-06, + "loss": 0.1595, + "step": 25077 + }, + { + "epoch": 0.6346129513880102, + "grad_norm": 3.285634756088257, + "learning_rate": 3.001087542088162e-06, + "loss": 0.1816, + "step": 25078 + }, + { + "epoch": 0.6346382569527039, + "grad_norm": 6.8069024085998535, + "learning_rate": 3.0007195089441833e-06, + "loss": 0.1728, + "step": 25079 + }, + { + "epoch": 0.6346635625173975, + "grad_norm": 6.0393781661987305, + "learning_rate": 3.000351488693193e-06, + "loss": 0.1776, + "step": 25080 + }, + { + "epoch": 0.6346888680820912, + "grad_norm": 4.799094200134277, + "learning_rate": 2.999983481337564e-06, + "loss": 0.1855, + "step": 25081 + }, + { + "epoch": 0.634714173646785, + "grad_norm": 3.418898820877075, + "learning_rate": 2.9996154868796713e-06, + "loss": 0.1722, + "step": 25082 + }, + { + "epoch": 0.6347394792114786, + "grad_norm": 3.418609142303467, + "learning_rate": 2.999247505321884e-06, + "loss": 0.1207, + "step": 25083 + }, + { + "epoch": 0.6347647847761723, + "grad_norm": 4.441710948944092, + "learning_rate": 2.9988795366665777e-06, + "loss": 0.2059, + "step": 25084 + }, + { + "epoch": 0.634790090340866, + "grad_norm": 8.021260261535645, + "learning_rate": 2.9985115809161253e-06, + "loss": 0.2204, + "step": 25085 + }, + { + "epoch": 0.6348153959055597, + "grad_norm": 6.323845863342285, + "learning_rate": 2.9981436380729014e-06, + "loss": 0.1671, + "step": 25086 + }, + { + "epoch": 0.6348407014702533, + "grad_norm": 7.222457408905029, + "learning_rate": 2.9977757081392745e-06, + "loss": 0.1361, + "step": 25087 + }, + { + "epoch": 0.634866007034947, + "grad_norm": 6.60552978515625, + "learning_rate": 2.9974077911176193e-06, + "loss": 0.179, + "step": 25088 + }, + { + "epoch": 0.6348913125996407, + "grad_norm": 8.21110725402832, + "learning_rate": 2.9970398870103103e-06, + "loss": 0.1582, + "step": 25089 + }, + { + "epoch": 0.6349166181643343, + "grad_norm": 3.224658727645874, + "learning_rate": 2.9966719958197176e-06, + "loss": 0.1285, + "step": 25090 + }, + { + "epoch": 0.634941923729028, + "grad_norm": 5.365354537963867, + "learning_rate": 2.9963041175482146e-06, + "loss": 0.1053, + "step": 25091 + }, + { + "epoch": 0.6349672292937217, + "grad_norm": 3.7320353984832764, + "learning_rate": 2.995936252198173e-06, + "loss": 0.1041, + "step": 25092 + }, + { + "epoch": 0.6349925348584153, + "grad_norm": 3.0806806087493896, + "learning_rate": 2.995568399771967e-06, + "loss": 0.1238, + "step": 25093 + }, + { + "epoch": 0.635017840423109, + "grad_norm": 3.2571990489959717, + "learning_rate": 2.995200560271967e-06, + "loss": 0.1502, + "step": 25094 + }, + { + "epoch": 0.6350431459878028, + "grad_norm": 3.5455150604248047, + "learning_rate": 2.9948327337005455e-06, + "loss": 0.1867, + "step": 25095 + }, + { + "epoch": 0.6350684515524964, + "grad_norm": 3.9612154960632324, + "learning_rate": 2.9944649200600755e-06, + "loss": 0.1241, + "step": 25096 + }, + { + "epoch": 0.6350937571171901, + "grad_norm": 16.534133911132812, + "learning_rate": 2.9940971193529274e-06, + "loss": 0.1758, + "step": 25097 + }, + { + "epoch": 0.6351190626818838, + "grad_norm": 3.3898210525512695, + "learning_rate": 2.993729331581473e-06, + "loss": 0.1215, + "step": 25098 + }, + { + "epoch": 0.6351443682465774, + "grad_norm": 3.4151899814605713, + "learning_rate": 2.993361556748088e-06, + "loss": 0.1071, + "step": 25099 + }, + { + "epoch": 0.6351696738112711, + "grad_norm": 6.134034156799316, + "learning_rate": 2.992993794855138e-06, + "loss": 0.1477, + "step": 25100 + }, + { + "epoch": 0.6351949793759648, + "grad_norm": 4.433295726776123, + "learning_rate": 2.9926260459049982e-06, + "loss": 0.1794, + "step": 25101 + }, + { + "epoch": 0.6352202849406584, + "grad_norm": 6.50730562210083, + "learning_rate": 2.9922583099000403e-06, + "loss": 0.2709, + "step": 25102 + }, + { + "epoch": 0.6352455905053521, + "grad_norm": 6.94646692276001, + "learning_rate": 2.991890586842634e-06, + "loss": 0.1625, + "step": 25103 + }, + { + "epoch": 0.6352708960700458, + "grad_norm": 4.6379852294921875, + "learning_rate": 2.991522876735154e-06, + "loss": 0.1851, + "step": 25104 + }, + { + "epoch": 0.6352962016347394, + "grad_norm": 4.745038032531738, + "learning_rate": 2.9911551795799677e-06, + "loss": 0.115, + "step": 25105 + }, + { + "epoch": 0.6353215071994331, + "grad_norm": 3.359208345413208, + "learning_rate": 2.990787495379448e-06, + "loss": 0.1532, + "step": 25106 + }, + { + "epoch": 0.6353468127641269, + "grad_norm": 4.308151721954346, + "learning_rate": 2.9904198241359654e-06, + "loss": 0.1685, + "step": 25107 + }, + { + "epoch": 0.6353721183288205, + "grad_norm": 3.0406665802001953, + "learning_rate": 2.9900521658518945e-06, + "loss": 0.0736, + "step": 25108 + }, + { + "epoch": 0.6353974238935142, + "grad_norm": 7.043219566345215, + "learning_rate": 2.9896845205296e-06, + "loss": 0.1772, + "step": 25109 + }, + { + "epoch": 0.6354227294582079, + "grad_norm": 4.211091041564941, + "learning_rate": 2.9893168881714574e-06, + "loss": 0.1495, + "step": 25110 + }, + { + "epoch": 0.6354480350229015, + "grad_norm": 7.459769248962402, + "learning_rate": 2.9889492687798347e-06, + "loss": 0.1734, + "step": 25111 + }, + { + "epoch": 0.6354733405875952, + "grad_norm": 2.99596905708313, + "learning_rate": 2.9885816623571055e-06, + "loss": 0.1032, + "step": 25112 + }, + { + "epoch": 0.6354986461522889, + "grad_norm": 3.554384708404541, + "learning_rate": 2.9882140689056407e-06, + "loss": 0.1699, + "step": 25113 + }, + { + "epoch": 0.6355239517169826, + "grad_norm": 9.77525520324707, + "learning_rate": 2.9878464884278068e-06, + "loss": 0.1625, + "step": 25114 + }, + { + "epoch": 0.6355492572816762, + "grad_norm": 4.0383195877075195, + "learning_rate": 2.987478920925977e-06, + "loss": 0.1241, + "step": 25115 + }, + { + "epoch": 0.6355745628463699, + "grad_norm": 7.467888355255127, + "learning_rate": 2.9871113664025207e-06, + "loss": 0.1522, + "step": 25116 + }, + { + "epoch": 0.6355998684110636, + "grad_norm": 4.330228805541992, + "learning_rate": 2.986743824859811e-06, + "loss": 0.1717, + "step": 25117 + }, + { + "epoch": 0.6356251739757572, + "grad_norm": 5.529805660247803, + "learning_rate": 2.986376296300214e-06, + "loss": 0.2216, + "step": 25118 + }, + { + "epoch": 0.635650479540451, + "grad_norm": 5.665110111236572, + "learning_rate": 2.9860087807261016e-06, + "loss": 0.1842, + "step": 25119 + }, + { + "epoch": 0.6356757851051447, + "grad_norm": 2.7562954425811768, + "learning_rate": 2.9856412781398454e-06, + "loss": 0.0414, + "step": 25120 + }, + { + "epoch": 0.6357010906698383, + "grad_norm": 5.225364685058594, + "learning_rate": 2.985273788543813e-06, + "loss": 0.1196, + "step": 25121 + }, + { + "epoch": 0.635726396234532, + "grad_norm": 7.381961822509766, + "learning_rate": 2.9849063119403753e-06, + "loss": 0.1754, + "step": 25122 + }, + { + "epoch": 0.6357517017992257, + "grad_norm": 3.4044744968414307, + "learning_rate": 2.984538848331902e-06, + "loss": 0.1509, + "step": 25123 + }, + { + "epoch": 0.6357770073639193, + "grad_norm": 10.880960464477539, + "learning_rate": 2.984171397720762e-06, + "loss": 0.183, + "step": 25124 + }, + { + "epoch": 0.635802312928613, + "grad_norm": 5.049607276916504, + "learning_rate": 2.9838039601093262e-06, + "loss": 0.1613, + "step": 25125 + }, + { + "epoch": 0.6358276184933067, + "grad_norm": 3.609678268432617, + "learning_rate": 2.983436535499965e-06, + "loss": 0.1552, + "step": 25126 + }, + { + "epoch": 0.6358529240580003, + "grad_norm": 3.2478950023651123, + "learning_rate": 2.983069123895045e-06, + "loss": 0.1482, + "step": 25127 + }, + { + "epoch": 0.635878229622694, + "grad_norm": 3.44045090675354, + "learning_rate": 2.9827017252969364e-06, + "loss": 0.1612, + "step": 25128 + }, + { + "epoch": 0.6359035351873877, + "grad_norm": 8.669175148010254, + "learning_rate": 2.9823343397080094e-06, + "loss": 0.2076, + "step": 25129 + }, + { + "epoch": 0.6359288407520813, + "grad_norm": 2.978701591491699, + "learning_rate": 2.981966967130635e-06, + "loss": 0.121, + "step": 25130 + }, + { + "epoch": 0.635954146316775, + "grad_norm": 7.686492443084717, + "learning_rate": 2.9815996075671783e-06, + "loss": 0.105, + "step": 25131 + }, + { + "epoch": 0.6359794518814688, + "grad_norm": 6.025970458984375, + "learning_rate": 2.9812322610200107e-06, + "loss": 0.2311, + "step": 25132 + }, + { + "epoch": 0.6360047574461624, + "grad_norm": 6.5254364013671875, + "learning_rate": 2.9808649274915002e-06, + "loss": 0.1771, + "step": 25133 + }, + { + "epoch": 0.6360300630108561, + "grad_norm": 3.3590712547302246, + "learning_rate": 2.9804976069840165e-06, + "loss": 0.1158, + "step": 25134 + }, + { + "epoch": 0.6360553685755498, + "grad_norm": 3.368560552597046, + "learning_rate": 2.9801302994999294e-06, + "loss": 0.1403, + "step": 25135 + }, + { + "epoch": 0.6360806741402434, + "grad_norm": 3.4295239448547363, + "learning_rate": 2.9797630050416048e-06, + "loss": 0.103, + "step": 25136 + }, + { + "epoch": 0.6361059797049371, + "grad_norm": 3.22047758102417, + "learning_rate": 2.9793957236114126e-06, + "loss": 0.0627, + "step": 25137 + }, + { + "epoch": 0.6361312852696308, + "grad_norm": 4.334737300872803, + "learning_rate": 2.9790284552117216e-06, + "loss": 0.1702, + "step": 25138 + }, + { + "epoch": 0.6361565908343245, + "grad_norm": 2.683215379714966, + "learning_rate": 2.978661199844902e-06, + "loss": 0.1107, + "step": 25139 + }, + { + "epoch": 0.6361818963990181, + "grad_norm": 10.211310386657715, + "learning_rate": 2.9782939575133184e-06, + "loss": 0.2564, + "step": 25140 + }, + { + "epoch": 0.6362072019637118, + "grad_norm": 4.11683464050293, + "learning_rate": 2.977926728219341e-06, + "loss": 0.132, + "step": 25141 + }, + { + "epoch": 0.6362325075284055, + "grad_norm": 10.601198196411133, + "learning_rate": 2.9775595119653383e-06, + "loss": 0.1709, + "step": 25142 + }, + { + "epoch": 0.6362578130930991, + "grad_norm": 2.036064386367798, + "learning_rate": 2.9771923087536803e-06, + "loss": 0.1027, + "step": 25143 + }, + { + "epoch": 0.6362831186577929, + "grad_norm": 8.014548301696777, + "learning_rate": 2.9768251185867303e-06, + "loss": 0.2858, + "step": 25144 + }, + { + "epoch": 0.6363084242224866, + "grad_norm": 2.407723903656006, + "learning_rate": 2.9764579414668583e-06, + "loss": 0.0756, + "step": 25145 + }, + { + "epoch": 0.6363337297871802, + "grad_norm": 3.9643383026123047, + "learning_rate": 2.976090777396433e-06, + "loss": 0.1745, + "step": 25146 + }, + { + "epoch": 0.6363590353518739, + "grad_norm": 4.276342868804932, + "learning_rate": 2.9757236263778234e-06, + "loss": 0.1981, + "step": 25147 + }, + { + "epoch": 0.6363843409165676, + "grad_norm": 12.035176277160645, + "learning_rate": 2.9753564884133958e-06, + "loss": 0.1146, + "step": 25148 + }, + { + "epoch": 0.6364096464812612, + "grad_norm": 4.823484420776367, + "learning_rate": 2.9749893635055156e-06, + "loss": 0.2021, + "step": 25149 + }, + { + "epoch": 0.6364349520459549, + "grad_norm": 3.455564498901367, + "learning_rate": 2.9746222516565537e-06, + "loss": 0.0941, + "step": 25150 + }, + { + "epoch": 0.6364602576106486, + "grad_norm": 5.2219367027282715, + "learning_rate": 2.974255152868875e-06, + "loss": 0.1395, + "step": 25151 + }, + { + "epoch": 0.6364855631753422, + "grad_norm": 9.154291152954102, + "learning_rate": 2.973888067144849e-06, + "loss": 0.2618, + "step": 25152 + }, + { + "epoch": 0.6365108687400359, + "grad_norm": 3.4207634925842285, + "learning_rate": 2.9735209944868416e-06, + "loss": 0.1177, + "step": 25153 + }, + { + "epoch": 0.6365361743047296, + "grad_norm": 3.211242437362671, + "learning_rate": 2.9731539348972206e-06, + "loss": 0.1264, + "step": 25154 + }, + { + "epoch": 0.6365614798694232, + "grad_norm": 9.834031105041504, + "learning_rate": 2.972786888378352e-06, + "loss": 0.2267, + "step": 25155 + }, + { + "epoch": 0.636586785434117, + "grad_norm": 29.39936637878418, + "learning_rate": 2.972419854932604e-06, + "loss": 0.1477, + "step": 25156 + }, + { + "epoch": 0.6366120909988107, + "grad_norm": 3.664245128631592, + "learning_rate": 2.972052834562345e-06, + "loss": 0.1236, + "step": 25157 + }, + { + "epoch": 0.6366373965635043, + "grad_norm": 2.8915345668792725, + "learning_rate": 2.9716858272699378e-06, + "loss": 0.1646, + "step": 25158 + }, + { + "epoch": 0.636662702128198, + "grad_norm": 6.32073974609375, + "learning_rate": 2.971318833057752e-06, + "loss": 0.2737, + "step": 25159 + }, + { + "epoch": 0.6366880076928917, + "grad_norm": 3.967155694961548, + "learning_rate": 2.9709518519281532e-06, + "loss": 0.1447, + "step": 25160 + }, + { + "epoch": 0.6367133132575853, + "grad_norm": 5.076584815979004, + "learning_rate": 2.9705848838835103e-06, + "loss": 0.1679, + "step": 25161 + }, + { + "epoch": 0.636738618822279, + "grad_norm": 4.082657814025879, + "learning_rate": 2.9702179289261867e-06, + "loss": 0.2167, + "step": 25162 + }, + { + "epoch": 0.6367639243869727, + "grad_norm": 5.059005260467529, + "learning_rate": 2.9698509870585494e-06, + "loss": 0.1761, + "step": 25163 + }, + { + "epoch": 0.6367892299516664, + "grad_norm": 5.29921817779541, + "learning_rate": 2.969484058282966e-06, + "loss": 0.1876, + "step": 25164 + }, + { + "epoch": 0.63681453551636, + "grad_norm": 3.4746742248535156, + "learning_rate": 2.969117142601804e-06, + "loss": 0.1319, + "step": 25165 + }, + { + "epoch": 0.6368398410810537, + "grad_norm": 7.081943511962891, + "learning_rate": 2.9687502400174258e-06, + "loss": 0.139, + "step": 25166 + }, + { + "epoch": 0.6368651466457474, + "grad_norm": 3.7497923374176025, + "learning_rate": 2.968383350532199e-06, + "loss": 0.1786, + "step": 25167 + }, + { + "epoch": 0.636890452210441, + "grad_norm": 8.6206636428833, + "learning_rate": 2.968016474148491e-06, + "loss": 0.1469, + "step": 25168 + }, + { + "epoch": 0.6369157577751348, + "grad_norm": 3.8022994995117188, + "learning_rate": 2.9676496108686657e-06, + "loss": 0.1553, + "step": 25169 + }, + { + "epoch": 0.6369410633398285, + "grad_norm": 3.638486623764038, + "learning_rate": 2.9672827606950927e-06, + "loss": 0.1457, + "step": 25170 + }, + { + "epoch": 0.6369663689045221, + "grad_norm": 4.718145847320557, + "learning_rate": 2.9669159236301326e-06, + "loss": 0.1577, + "step": 25171 + }, + { + "epoch": 0.6369916744692158, + "grad_norm": 4.9037766456604, + "learning_rate": 2.9665490996761536e-06, + "loss": 0.0917, + "step": 25172 + }, + { + "epoch": 0.6370169800339095, + "grad_norm": 3.7522289752960205, + "learning_rate": 2.966182288835522e-06, + "loss": 0.137, + "step": 25173 + }, + { + "epoch": 0.6370422855986031, + "grad_norm": 2.6495418548583984, + "learning_rate": 2.965815491110604e-06, + "loss": 0.0967, + "step": 25174 + }, + { + "epoch": 0.6370675911632968, + "grad_norm": 3.964813709259033, + "learning_rate": 2.9654487065037613e-06, + "loss": 0.1508, + "step": 25175 + }, + { + "epoch": 0.6370928967279905, + "grad_norm": 5.8618316650390625, + "learning_rate": 2.9650819350173616e-06, + "loss": 0.211, + "step": 25176 + }, + { + "epoch": 0.6371182022926841, + "grad_norm": 7.3009467124938965, + "learning_rate": 2.9647151766537694e-06, + "loss": 0.1903, + "step": 25177 + }, + { + "epoch": 0.6371435078573778, + "grad_norm": 9.361529350280762, + "learning_rate": 2.9643484314153525e-06, + "loss": 0.2118, + "step": 25178 + }, + { + "epoch": 0.6371688134220715, + "grad_norm": 5.43801736831665, + "learning_rate": 2.9639816993044733e-06, + "loss": 0.1003, + "step": 25179 + }, + { + "epoch": 0.6371941189867651, + "grad_norm": 11.16215705871582, + "learning_rate": 2.9636149803234963e-06, + "loss": 0.2378, + "step": 25180 + }, + { + "epoch": 0.6372194245514589, + "grad_norm": 5.380206108093262, + "learning_rate": 2.9632482744747893e-06, + "loss": 0.1706, + "step": 25181 + }, + { + "epoch": 0.6372447301161526, + "grad_norm": 2.0814530849456787, + "learning_rate": 2.962881581760713e-06, + "loss": 0.1238, + "step": 25182 + }, + { + "epoch": 0.6372700356808462, + "grad_norm": 9.295609474182129, + "learning_rate": 2.9625149021836364e-06, + "loss": 0.1641, + "step": 25183 + }, + { + "epoch": 0.6372953412455399, + "grad_norm": 4.565864086151123, + "learning_rate": 2.962148235745922e-06, + "loss": 0.218, + "step": 25184 + }, + { + "epoch": 0.6373206468102336, + "grad_norm": 7.33056640625, + "learning_rate": 2.9617815824499334e-06, + "loss": 0.1597, + "step": 25185 + }, + { + "epoch": 0.6373459523749272, + "grad_norm": 4.203088760375977, + "learning_rate": 2.9614149422980365e-06, + "loss": 0.1565, + "step": 25186 + }, + { + "epoch": 0.6373712579396209, + "grad_norm": 4.067130088806152, + "learning_rate": 2.961048315292595e-06, + "loss": 0.1211, + "step": 25187 + }, + { + "epoch": 0.6373965635043146, + "grad_norm": 4.136349678039551, + "learning_rate": 2.9606817014359758e-06, + "loss": 0.145, + "step": 25188 + }, + { + "epoch": 0.6374218690690083, + "grad_norm": 4.788471698760986, + "learning_rate": 2.9603151007305392e-06, + "loss": 0.1075, + "step": 25189 + }, + { + "epoch": 0.6374471746337019, + "grad_norm": 4.987203598022461, + "learning_rate": 2.959948513178651e-06, + "loss": 0.1597, + "step": 25190 + }, + { + "epoch": 0.6374724801983956, + "grad_norm": 7.630448341369629, + "learning_rate": 2.9595819387826753e-06, + "loss": 0.1565, + "step": 25191 + }, + { + "epoch": 0.6374977857630894, + "grad_norm": 6.793576240539551, + "learning_rate": 2.959215377544978e-06, + "loss": 0.2082, + "step": 25192 + }, + { + "epoch": 0.637523091327783, + "grad_norm": 5.643091201782227, + "learning_rate": 2.9588488294679196e-06, + "loss": 0.1588, + "step": 25193 + }, + { + "epoch": 0.6375483968924767, + "grad_norm": 5.026499271392822, + "learning_rate": 2.9584822945538647e-06, + "loss": 0.1739, + "step": 25194 + }, + { + "epoch": 0.6375737024571704, + "grad_norm": 12.314779281616211, + "learning_rate": 2.9581157728051787e-06, + "loss": 0.2561, + "step": 25195 + }, + { + "epoch": 0.637599008021864, + "grad_norm": 3.878596305847168, + "learning_rate": 2.9577492642242258e-06, + "loss": 0.1196, + "step": 25196 + }, + { + "epoch": 0.6376243135865577, + "grad_norm": 5.270020961761475, + "learning_rate": 2.9573827688133655e-06, + "loss": 0.2053, + "step": 25197 + }, + { + "epoch": 0.6376496191512514, + "grad_norm": 8.438785552978516, + "learning_rate": 2.957016286574964e-06, + "loss": 0.2365, + "step": 25198 + }, + { + "epoch": 0.637674924715945, + "grad_norm": 7.520571231842041, + "learning_rate": 2.9566498175113847e-06, + "loss": 0.2558, + "step": 25199 + }, + { + "epoch": 0.6377002302806387, + "grad_norm": 7.561139106750488, + "learning_rate": 2.956283361624991e-06, + "loss": 0.2267, + "step": 25200 + }, + { + "epoch": 0.6377255358453324, + "grad_norm": 5.651361465454102, + "learning_rate": 2.955916918918147e-06, + "loss": 0.2191, + "step": 25201 + }, + { + "epoch": 0.637750841410026, + "grad_norm": 4.191425323486328, + "learning_rate": 2.955550489393213e-06, + "loss": 0.2156, + "step": 25202 + }, + { + "epoch": 0.6377761469747197, + "grad_norm": 3.3646748065948486, + "learning_rate": 2.955184073052553e-06, + "loss": 0.1261, + "step": 25203 + }, + { + "epoch": 0.6378014525394134, + "grad_norm": 11.189599990844727, + "learning_rate": 2.954817669898531e-06, + "loss": 0.2908, + "step": 25204 + }, + { + "epoch": 0.637826758104107, + "grad_norm": 3.9478468894958496, + "learning_rate": 2.954451279933511e-06, + "loss": 0.151, + "step": 25205 + }, + { + "epoch": 0.6378520636688008, + "grad_norm": 7.579550743103027, + "learning_rate": 2.9540849031598527e-06, + "loss": 0.1673, + "step": 25206 + }, + { + "epoch": 0.6378773692334945, + "grad_norm": 2.501878499984741, + "learning_rate": 2.9537185395799205e-06, + "loss": 0.0918, + "step": 25207 + }, + { + "epoch": 0.6379026747981881, + "grad_norm": 12.97225570678711, + "learning_rate": 2.9533521891960776e-06, + "loss": 0.1993, + "step": 25208 + }, + { + "epoch": 0.6379279803628818, + "grad_norm": 3.2043960094451904, + "learning_rate": 2.9529858520106848e-06, + "loss": 0.1322, + "step": 25209 + }, + { + "epoch": 0.6379532859275755, + "grad_norm": 4.987236499786377, + "learning_rate": 2.952619528026106e-06, + "loss": 0.1837, + "step": 25210 + }, + { + "epoch": 0.6379785914922691, + "grad_norm": 7.9507670402526855, + "learning_rate": 2.9522532172447033e-06, + "loss": 0.3119, + "step": 25211 + }, + { + "epoch": 0.6380038970569628, + "grad_norm": 2.953677177429199, + "learning_rate": 2.9518869196688377e-06, + "loss": 0.1525, + "step": 25212 + }, + { + "epoch": 0.6380292026216565, + "grad_norm": 3.889392137527466, + "learning_rate": 2.9515206353008725e-06, + "loss": 0.154, + "step": 25213 + }, + { + "epoch": 0.6380545081863502, + "grad_norm": 6.538841247558594, + "learning_rate": 2.9511543641431704e-06, + "loss": 0.172, + "step": 25214 + }, + { + "epoch": 0.6380798137510438, + "grad_norm": 3.100527286529541, + "learning_rate": 2.9507881061980927e-06, + "loss": 0.1621, + "step": 25215 + }, + { + "epoch": 0.6381051193157375, + "grad_norm": 4.39622688293457, + "learning_rate": 2.9504218614680007e-06, + "loss": 0.144, + "step": 25216 + }, + { + "epoch": 0.6381304248804313, + "grad_norm": 5.265885829925537, + "learning_rate": 2.9500556299552565e-06, + "loss": 0.1424, + "step": 25217 + }, + { + "epoch": 0.6381557304451249, + "grad_norm": 4.414193153381348, + "learning_rate": 2.9496894116622245e-06, + "loss": 0.1815, + "step": 25218 + }, + { + "epoch": 0.6381810360098186, + "grad_norm": 11.627643585205078, + "learning_rate": 2.949323206591262e-06, + "loss": 0.2112, + "step": 25219 + }, + { + "epoch": 0.6382063415745123, + "grad_norm": 5.778426647186279, + "learning_rate": 2.9489570147447323e-06, + "loss": 0.1918, + "step": 25220 + }, + { + "epoch": 0.6382316471392059, + "grad_norm": 10.830231666564941, + "learning_rate": 2.948590836124998e-06, + "loss": 0.2233, + "step": 25221 + }, + { + "epoch": 0.6382569527038996, + "grad_norm": 7.846356391906738, + "learning_rate": 2.9482246707344194e-06, + "loss": 0.2612, + "step": 25222 + }, + { + "epoch": 0.6382822582685933, + "grad_norm": 3.0772459506988525, + "learning_rate": 2.94785851857536e-06, + "loss": 0.1513, + "step": 25223 + }, + { + "epoch": 0.6383075638332869, + "grad_norm": 4.201402187347412, + "learning_rate": 2.9474923796501776e-06, + "loss": 0.1693, + "step": 25224 + }, + { + "epoch": 0.6383328693979806, + "grad_norm": 18.602853775024414, + "learning_rate": 2.947126253961235e-06, + "loss": 0.2793, + "step": 25225 + }, + { + "epoch": 0.6383581749626743, + "grad_norm": 8.349447250366211, + "learning_rate": 2.946760141510894e-06, + "loss": 0.1517, + "step": 25226 + }, + { + "epoch": 0.6383834805273679, + "grad_norm": 4.676144599914551, + "learning_rate": 2.946394042301516e-06, + "loss": 0.1596, + "step": 25227 + }, + { + "epoch": 0.6384087860920616, + "grad_norm": 4.715611934661865, + "learning_rate": 2.946027956335459e-06, + "loss": 0.1625, + "step": 25228 + }, + { + "epoch": 0.6384340916567554, + "grad_norm": 4.844523906707764, + "learning_rate": 2.945661883615085e-06, + "loss": 0.1655, + "step": 25229 + }, + { + "epoch": 0.638459397221449, + "grad_norm": 2.48203706741333, + "learning_rate": 2.9452958241427565e-06, + "loss": 0.0941, + "step": 25230 + }, + { + "epoch": 0.6384847027861427, + "grad_norm": 9.474119186401367, + "learning_rate": 2.9449297779208323e-06, + "loss": 0.3299, + "step": 25231 + }, + { + "epoch": 0.6385100083508364, + "grad_norm": 7.734554290771484, + "learning_rate": 2.944563744951676e-06, + "loss": 0.196, + "step": 25232 + }, + { + "epoch": 0.63853531391553, + "grad_norm": 6.803040027618408, + "learning_rate": 2.9441977252376442e-06, + "loss": 0.1685, + "step": 25233 + }, + { + "epoch": 0.6385606194802237, + "grad_norm": 4.338736534118652, + "learning_rate": 2.9438317187810984e-06, + "loss": 0.1685, + "step": 25234 + }, + { + "epoch": 0.6385859250449174, + "grad_norm": 7.5161943435668945, + "learning_rate": 2.9434657255844e-06, + "loss": 0.1728, + "step": 25235 + }, + { + "epoch": 0.638611230609611, + "grad_norm": 8.391085624694824, + "learning_rate": 2.943099745649909e-06, + "loss": 0.22, + "step": 25236 + }, + { + "epoch": 0.6386365361743047, + "grad_norm": 11.581876754760742, + "learning_rate": 2.9427337789799842e-06, + "loss": 0.2212, + "step": 25237 + }, + { + "epoch": 0.6386618417389984, + "grad_norm": 8.432158470153809, + "learning_rate": 2.9423678255769866e-06, + "loss": 0.1324, + "step": 25238 + }, + { + "epoch": 0.638687147303692, + "grad_norm": 5.896567344665527, + "learning_rate": 2.942001885443277e-06, + "loss": 0.1641, + "step": 25239 + }, + { + "epoch": 0.6387124528683857, + "grad_norm": 3.6686809062957764, + "learning_rate": 2.941635958581214e-06, + "loss": 0.1656, + "step": 25240 + }, + { + "epoch": 0.6387377584330794, + "grad_norm": 7.681521892547607, + "learning_rate": 2.9412700449931574e-06, + "loss": 0.1844, + "step": 25241 + }, + { + "epoch": 0.6387630639977732, + "grad_norm": 5.841243743896484, + "learning_rate": 2.9409041446814684e-06, + "loss": 0.2016, + "step": 25242 + }, + { + "epoch": 0.6387883695624668, + "grad_norm": 6.014786720275879, + "learning_rate": 2.9405382576485043e-06, + "loss": 0.1765, + "step": 25243 + }, + { + "epoch": 0.6388136751271605, + "grad_norm": 3.9960711002349854, + "learning_rate": 2.9401723838966256e-06, + "loss": 0.1092, + "step": 25244 + }, + { + "epoch": 0.6388389806918542, + "grad_norm": 5.462308406829834, + "learning_rate": 2.939806523428195e-06, + "loss": 0.1815, + "step": 25245 + }, + { + "epoch": 0.6388642862565478, + "grad_norm": 4.746745586395264, + "learning_rate": 2.9394406762455662e-06, + "loss": 0.1419, + "step": 25246 + }, + { + "epoch": 0.6388895918212415, + "grad_norm": 5.151040554046631, + "learning_rate": 2.939074842351101e-06, + "loss": 0.1872, + "step": 25247 + }, + { + "epoch": 0.6389148973859352, + "grad_norm": 12.395895004272461, + "learning_rate": 2.9387090217471592e-06, + "loss": 0.1823, + "step": 25248 + }, + { + "epoch": 0.6389402029506288, + "grad_norm": 5.281143665313721, + "learning_rate": 2.9383432144361014e-06, + "loss": 0.1482, + "step": 25249 + }, + { + "epoch": 0.6389655085153225, + "grad_norm": 3.5336203575134277, + "learning_rate": 2.937977420420283e-06, + "loss": 0.1411, + "step": 25250 + }, + { + "epoch": 0.6389908140800162, + "grad_norm": 3.1282403469085693, + "learning_rate": 2.9376116397020637e-06, + "loss": 0.22, + "step": 25251 + }, + { + "epoch": 0.6390161196447098, + "grad_norm": 4.490747928619385, + "learning_rate": 2.937245872283804e-06, + "loss": 0.1414, + "step": 25252 + }, + { + "epoch": 0.6390414252094035, + "grad_norm": 3.3018176555633545, + "learning_rate": 2.936880118167862e-06, + "loss": 0.1396, + "step": 25253 + }, + { + "epoch": 0.6390667307740973, + "grad_norm": 3.656329393386841, + "learning_rate": 2.9365143773565985e-06, + "loss": 0.184, + "step": 25254 + }, + { + "epoch": 0.6390920363387909, + "grad_norm": 4.113702774047852, + "learning_rate": 2.936148649852367e-06, + "loss": 0.1258, + "step": 25255 + }, + { + "epoch": 0.6391173419034846, + "grad_norm": 15.155390739440918, + "learning_rate": 2.9357829356575296e-06, + "loss": 0.2166, + "step": 25256 + }, + { + "epoch": 0.6391426474681783, + "grad_norm": 8.938060760498047, + "learning_rate": 2.9354172347744447e-06, + "loss": 0.2205, + "step": 25257 + }, + { + "epoch": 0.6391679530328719, + "grad_norm": 8.21872329711914, + "learning_rate": 2.935051547205471e-06, + "loss": 0.1932, + "step": 25258 + }, + { + "epoch": 0.6391932585975656, + "grad_norm": 5.135644435882568, + "learning_rate": 2.934685872952964e-06, + "loss": 0.1223, + "step": 25259 + }, + { + "epoch": 0.6392185641622593, + "grad_norm": 5.110434532165527, + "learning_rate": 2.9343202120192836e-06, + "loss": 0.1955, + "step": 25260 + }, + { + "epoch": 0.6392438697269529, + "grad_norm": 11.537927627563477, + "learning_rate": 2.9339545644067885e-06, + "loss": 0.2054, + "step": 25261 + }, + { + "epoch": 0.6392691752916466, + "grad_norm": 5.250447750091553, + "learning_rate": 2.9335889301178354e-06, + "loss": 0.1438, + "step": 25262 + }, + { + "epoch": 0.6392944808563403, + "grad_norm": 2.888336658477783, + "learning_rate": 2.933223309154786e-06, + "loss": 0.1099, + "step": 25263 + }, + { + "epoch": 0.6393197864210339, + "grad_norm": 8.626023292541504, + "learning_rate": 2.9328577015199915e-06, + "loss": 0.2401, + "step": 25264 + }, + { + "epoch": 0.6393450919857276, + "grad_norm": 3.6495234966278076, + "learning_rate": 2.932492107215814e-06, + "loss": 0.151, + "step": 25265 + }, + { + "epoch": 0.6393703975504214, + "grad_norm": 4.388978004455566, + "learning_rate": 2.9321265262446114e-06, + "loss": 0.1469, + "step": 25266 + }, + { + "epoch": 0.6393957031151151, + "grad_norm": 8.543973922729492, + "learning_rate": 2.9317609586087398e-06, + "loss": 0.172, + "step": 25267 + }, + { + "epoch": 0.6394210086798087, + "grad_norm": 9.792630195617676, + "learning_rate": 2.9313954043105564e-06, + "loss": 0.1225, + "step": 25268 + }, + { + "epoch": 0.6394463142445024, + "grad_norm": 3.9075658321380615, + "learning_rate": 2.9310298633524203e-06, + "loss": 0.1306, + "step": 25269 + }, + { + "epoch": 0.6394716198091961, + "grad_norm": 5.195372104644775, + "learning_rate": 2.930664335736686e-06, + "loss": 0.1848, + "step": 25270 + }, + { + "epoch": 0.6394969253738897, + "grad_norm": 3.180366277694702, + "learning_rate": 2.930298821465715e-06, + "loss": 0.1181, + "step": 25271 + }, + { + "epoch": 0.6395222309385834, + "grad_norm": 3.1786556243896484, + "learning_rate": 2.92993332054186e-06, + "loss": 0.1338, + "step": 25272 + }, + { + "epoch": 0.6395475365032771, + "grad_norm": 5.77309513092041, + "learning_rate": 2.9295678329674803e-06, + "loss": 0.1593, + "step": 25273 + }, + { + "epoch": 0.6395728420679707, + "grad_norm": 3.122786521911621, + "learning_rate": 2.929202358744932e-06, + "loss": 0.1529, + "step": 25274 + }, + { + "epoch": 0.6395981476326644, + "grad_norm": 2.4885194301605225, + "learning_rate": 2.9288368978765725e-06, + "loss": 0.0902, + "step": 25275 + }, + { + "epoch": 0.6396234531973581, + "grad_norm": 4.4266180992126465, + "learning_rate": 2.9284714503647603e-06, + "loss": 0.1754, + "step": 25276 + }, + { + "epoch": 0.6396487587620517, + "grad_norm": 3.3690271377563477, + "learning_rate": 2.928106016211849e-06, + "loss": 0.1792, + "step": 25277 + }, + { + "epoch": 0.6396740643267455, + "grad_norm": 5.512645721435547, + "learning_rate": 2.9277405954201964e-06, + "loss": 0.215, + "step": 25278 + }, + { + "epoch": 0.6396993698914392, + "grad_norm": 4.4763569831848145, + "learning_rate": 2.9273751879921593e-06, + "loss": 0.1093, + "step": 25279 + }, + { + "epoch": 0.6397246754561328, + "grad_norm": 16.098426818847656, + "learning_rate": 2.9270097939300955e-06, + "loss": 0.2099, + "step": 25280 + }, + { + "epoch": 0.6397499810208265, + "grad_norm": 5.8091044425964355, + "learning_rate": 2.9266444132363587e-06, + "loss": 0.2091, + "step": 25281 + }, + { + "epoch": 0.6397752865855202, + "grad_norm": 3.652357816696167, + "learning_rate": 2.926279045913306e-06, + "loss": 0.1429, + "step": 25282 + }, + { + "epoch": 0.6398005921502138, + "grad_norm": 8.77562427520752, + "learning_rate": 2.925913691963294e-06, + "loss": 0.2293, + "step": 25283 + }, + { + "epoch": 0.6398258977149075, + "grad_norm": 3.1659598350524902, + "learning_rate": 2.925548351388679e-06, + "loss": 0.1245, + "step": 25284 + }, + { + "epoch": 0.6398512032796012, + "grad_norm": 4.302043914794922, + "learning_rate": 2.925183024191819e-06, + "loss": 0.1417, + "step": 25285 + }, + { + "epoch": 0.6398765088442948, + "grad_norm": 6.479609489440918, + "learning_rate": 2.9248177103750654e-06, + "loss": 0.2653, + "step": 25286 + }, + { + "epoch": 0.6399018144089885, + "grad_norm": 6.612384796142578, + "learning_rate": 2.9244524099407764e-06, + "loss": 0.2359, + "step": 25287 + }, + { + "epoch": 0.6399271199736822, + "grad_norm": 7.546788692474365, + "learning_rate": 2.924087122891307e-06, + "loss": 0.3256, + "step": 25288 + }, + { + "epoch": 0.6399524255383758, + "grad_norm": 8.975546836853027, + "learning_rate": 2.9237218492290165e-06, + "loss": 0.1564, + "step": 25289 + }, + { + "epoch": 0.6399777311030695, + "grad_norm": 3.013808250427246, + "learning_rate": 2.923356588956255e-06, + "loss": 0.1606, + "step": 25290 + }, + { + "epoch": 0.6400030366677633, + "grad_norm": 5.266204357147217, + "learning_rate": 2.9229913420753812e-06, + "loss": 0.1719, + "step": 25291 + }, + { + "epoch": 0.640028342232457, + "grad_norm": 3.6522789001464844, + "learning_rate": 2.9226261085887498e-06, + "loss": 0.1825, + "step": 25292 + }, + { + "epoch": 0.6400536477971506, + "grad_norm": 6.966940402984619, + "learning_rate": 2.922260888498718e-06, + "loss": 0.1977, + "step": 25293 + }, + { + "epoch": 0.6400789533618443, + "grad_norm": 8.546839714050293, + "learning_rate": 2.9218956818076373e-06, + "loss": 0.1788, + "step": 25294 + }, + { + "epoch": 0.640104258926538, + "grad_norm": 5.358654975891113, + "learning_rate": 2.9215304885178653e-06, + "loss": 0.1235, + "step": 25295 + }, + { + "epoch": 0.6401295644912316, + "grad_norm": 10.724832534790039, + "learning_rate": 2.9211653086317566e-06, + "loss": 0.1904, + "step": 25296 + }, + { + "epoch": 0.6401548700559253, + "grad_norm": 5.032098770141602, + "learning_rate": 2.920800142151665e-06, + "loss": 0.1827, + "step": 25297 + }, + { + "epoch": 0.640180175620619, + "grad_norm": 3.0613255500793457, + "learning_rate": 2.9204349890799487e-06, + "loss": 0.083, + "step": 25298 + }, + { + "epoch": 0.6402054811853126, + "grad_norm": 13.8283052444458, + "learning_rate": 2.920069849418958e-06, + "loss": 0.2101, + "step": 25299 + }, + { + "epoch": 0.6402307867500063, + "grad_norm": 2.9013359546661377, + "learning_rate": 2.9197047231710524e-06, + "loss": 0.1388, + "step": 25300 + }, + { + "epoch": 0.6402560923147, + "grad_norm": 5.345446586608887, + "learning_rate": 2.919339610338582e-06, + "loss": 0.1378, + "step": 25301 + }, + { + "epoch": 0.6402813978793936, + "grad_norm": 4.968343734741211, + "learning_rate": 2.918974510923905e-06, + "loss": 0.1757, + "step": 25302 + }, + { + "epoch": 0.6403067034440874, + "grad_norm": 4.1613640785217285, + "learning_rate": 2.918609424929374e-06, + "loss": 0.1758, + "step": 25303 + }, + { + "epoch": 0.6403320090087811, + "grad_norm": 3.490419864654541, + "learning_rate": 2.918244352357342e-06, + "loss": 0.1173, + "step": 25304 + }, + { + "epoch": 0.6403573145734747, + "grad_norm": 4.742248058319092, + "learning_rate": 2.9178792932101665e-06, + "loss": 0.1296, + "step": 25305 + }, + { + "epoch": 0.6403826201381684, + "grad_norm": 7.9765706062316895, + "learning_rate": 2.917514247490198e-06, + "loss": 0.2357, + "step": 25306 + }, + { + "epoch": 0.6404079257028621, + "grad_norm": 4.087100028991699, + "learning_rate": 2.917149215199797e-06, + "loss": 0.1998, + "step": 25307 + }, + { + "epoch": 0.6404332312675557, + "grad_norm": 9.43095588684082, + "learning_rate": 2.916784196341309e-06, + "loss": 0.2103, + "step": 25308 + }, + { + "epoch": 0.6404585368322494, + "grad_norm": 3.3502821922302246, + "learning_rate": 2.9164191909170945e-06, + "loss": 0.1478, + "step": 25309 + }, + { + "epoch": 0.6404838423969431, + "grad_norm": 2.3566815853118896, + "learning_rate": 2.9160541989295056e-06, + "loss": 0.0923, + "step": 25310 + }, + { + "epoch": 0.6405091479616367, + "grad_norm": 6.661164283752441, + "learning_rate": 2.9156892203808944e-06, + "loss": 0.2166, + "step": 25311 + }, + { + "epoch": 0.6405344535263304, + "grad_norm": 2.5833535194396973, + "learning_rate": 2.9153242552736144e-06, + "loss": 0.1044, + "step": 25312 + }, + { + "epoch": 0.6405597590910241, + "grad_norm": 2.3691534996032715, + "learning_rate": 2.914959303610022e-06, + "loss": 0.1069, + "step": 25313 + }, + { + "epoch": 0.6405850646557177, + "grad_norm": 5.9161481857299805, + "learning_rate": 2.9145943653924704e-06, + "loss": 0.1317, + "step": 25314 + }, + { + "epoch": 0.6406103702204115, + "grad_norm": 7.9149699211120605, + "learning_rate": 2.9142294406233083e-06, + "loss": 0.1876, + "step": 25315 + }, + { + "epoch": 0.6406356757851052, + "grad_norm": 4.28411865234375, + "learning_rate": 2.913864529304895e-06, + "loss": 0.1422, + "step": 25316 + }, + { + "epoch": 0.6406609813497989, + "grad_norm": 4.70418643951416, + "learning_rate": 2.913499631439582e-06, + "loss": 0.1147, + "step": 25317 + }, + { + "epoch": 0.6406862869144925, + "grad_norm": 3.693411350250244, + "learning_rate": 2.913134747029721e-06, + "loss": 0.1576, + "step": 25318 + }, + { + "epoch": 0.6407115924791862, + "grad_norm": 9.011866569519043, + "learning_rate": 2.9127698760776647e-06, + "loss": 0.2356, + "step": 25319 + }, + { + "epoch": 0.6407368980438799, + "grad_norm": 4.14543342590332, + "learning_rate": 2.91240501858577e-06, + "loss": 0.1003, + "step": 25320 + }, + { + "epoch": 0.6407622036085735, + "grad_norm": 5.388458251953125, + "learning_rate": 2.9120401745563842e-06, + "loss": 0.1253, + "step": 25321 + }, + { + "epoch": 0.6407875091732672, + "grad_norm": 8.389166831970215, + "learning_rate": 2.911675343991864e-06, + "loss": 0.2121, + "step": 25322 + }, + { + "epoch": 0.6408128147379609, + "grad_norm": 10.991575241088867, + "learning_rate": 2.91131052689456e-06, + "loss": 0.1616, + "step": 25323 + }, + { + "epoch": 0.6408381203026545, + "grad_norm": 7.701730251312256, + "learning_rate": 2.9109457232668292e-06, + "loss": 0.2575, + "step": 25324 + }, + { + "epoch": 0.6408634258673482, + "grad_norm": 4.019200801849365, + "learning_rate": 2.9105809331110167e-06, + "loss": 0.1625, + "step": 25325 + }, + { + "epoch": 0.640888731432042, + "grad_norm": 3.6482040882110596, + "learning_rate": 2.9102161564294815e-06, + "loss": 0.1035, + "step": 25326 + }, + { + "epoch": 0.6409140369967355, + "grad_norm": 4.166442394256592, + "learning_rate": 2.909851393224571e-06, + "loss": 0.1654, + "step": 25327 + }, + { + "epoch": 0.6409393425614293, + "grad_norm": 4.032931327819824, + "learning_rate": 2.9094866434986426e-06, + "loss": 0.1264, + "step": 25328 + }, + { + "epoch": 0.640964648126123, + "grad_norm": 5.023046970367432, + "learning_rate": 2.909121907254046e-06, + "loss": 0.1704, + "step": 25329 + }, + { + "epoch": 0.6409899536908166, + "grad_norm": 4.855186939239502, + "learning_rate": 2.908757184493133e-06, + "loss": 0.1732, + "step": 25330 + }, + { + "epoch": 0.6410152592555103, + "grad_norm": 8.243108749389648, + "learning_rate": 2.9083924752182544e-06, + "loss": 0.2677, + "step": 25331 + }, + { + "epoch": 0.641040564820204, + "grad_norm": 14.902918815612793, + "learning_rate": 2.9080277794317653e-06, + "loss": 0.2449, + "step": 25332 + }, + { + "epoch": 0.6410658703848976, + "grad_norm": 6.287721633911133, + "learning_rate": 2.907663097136016e-06, + "loss": 0.1713, + "step": 25333 + }, + { + "epoch": 0.6410911759495913, + "grad_norm": 3.8429665565490723, + "learning_rate": 2.907298428333358e-06, + "loss": 0.1645, + "step": 25334 + }, + { + "epoch": 0.641116481514285, + "grad_norm": 3.0801451206207275, + "learning_rate": 2.9069337730261416e-06, + "loss": 0.1446, + "step": 25335 + }, + { + "epoch": 0.6411417870789786, + "grad_norm": 6.094855308532715, + "learning_rate": 2.906569131216721e-06, + "loss": 0.1931, + "step": 25336 + }, + { + "epoch": 0.6411670926436723, + "grad_norm": 6.219547271728516, + "learning_rate": 2.9062045029074474e-06, + "loss": 0.2009, + "step": 25337 + }, + { + "epoch": 0.641192398208366, + "grad_norm": 3.9962902069091797, + "learning_rate": 2.9058398881006716e-06, + "loss": 0.127, + "step": 25338 + }, + { + "epoch": 0.6412177037730596, + "grad_norm": 4.850126266479492, + "learning_rate": 2.9054752867987422e-06, + "loss": 0.1457, + "step": 25339 + }, + { + "epoch": 0.6412430093377534, + "grad_norm": 4.607994556427002, + "learning_rate": 2.9051106990040157e-06, + "loss": 0.1629, + "step": 25340 + }, + { + "epoch": 0.6412683149024471, + "grad_norm": 4.002162456512451, + "learning_rate": 2.9047461247188404e-06, + "loss": 0.1058, + "step": 25341 + }, + { + "epoch": 0.6412936204671408, + "grad_norm": 3.6938955783843994, + "learning_rate": 2.9043815639455677e-06, + "loss": 0.149, + "step": 25342 + }, + { + "epoch": 0.6413189260318344, + "grad_norm": 3.8786821365356445, + "learning_rate": 2.904017016686547e-06, + "loss": 0.1427, + "step": 25343 + }, + { + "epoch": 0.6413442315965281, + "grad_norm": 8.265294075012207, + "learning_rate": 2.9036524829441313e-06, + "loss": 0.1874, + "step": 25344 + }, + { + "epoch": 0.6413695371612218, + "grad_norm": 8.302521705627441, + "learning_rate": 2.9032879627206717e-06, + "loss": 0.2135, + "step": 25345 + }, + { + "epoch": 0.6413948427259154, + "grad_norm": 31.269166946411133, + "learning_rate": 2.9029234560185184e-06, + "loss": 0.1891, + "step": 25346 + }, + { + "epoch": 0.6414201482906091, + "grad_norm": 3.7500555515289307, + "learning_rate": 2.9025589628400207e-06, + "loss": 0.1409, + "step": 25347 + }, + { + "epoch": 0.6414454538553028, + "grad_norm": 9.722098350524902, + "learning_rate": 2.9021944831875284e-06, + "loss": 0.2303, + "step": 25348 + }, + { + "epoch": 0.6414707594199964, + "grad_norm": 3.9106650352478027, + "learning_rate": 2.9018300170633956e-06, + "loss": 0.1632, + "step": 25349 + }, + { + "epoch": 0.6414960649846901, + "grad_norm": 4.079425811767578, + "learning_rate": 2.901465564469969e-06, + "loss": 0.1375, + "step": 25350 + }, + { + "epoch": 0.6415213705493839, + "grad_norm": 4.8232645988464355, + "learning_rate": 2.9011011254096046e-06, + "loss": 0.1329, + "step": 25351 + }, + { + "epoch": 0.6415466761140775, + "grad_norm": 8.370387077331543, + "learning_rate": 2.900736699884644e-06, + "loss": 0.2424, + "step": 25352 + }, + { + "epoch": 0.6415719816787712, + "grad_norm": 6.41209077835083, + "learning_rate": 2.900372287897445e-06, + "loss": 0.1497, + "step": 25353 + }, + { + "epoch": 0.6415972872434649, + "grad_norm": 4.638080596923828, + "learning_rate": 2.9000078894503512e-06, + "loss": 0.145, + "step": 25354 + }, + { + "epoch": 0.6416225928081585, + "grad_norm": 4.491482257843018, + "learning_rate": 2.8996435045457217e-06, + "loss": 0.1049, + "step": 25355 + }, + { + "epoch": 0.6416478983728522, + "grad_norm": 6.033860683441162, + "learning_rate": 2.899279133185895e-06, + "loss": 0.1476, + "step": 25356 + }, + { + "epoch": 0.6416732039375459, + "grad_norm": 6.536964416503906, + "learning_rate": 2.898914775373229e-06, + "loss": 0.2423, + "step": 25357 + }, + { + "epoch": 0.6416985095022395, + "grad_norm": 4.659947872161865, + "learning_rate": 2.898550431110069e-06, + "loss": 0.1656, + "step": 25358 + }, + { + "epoch": 0.6417238150669332, + "grad_norm": 4.957310199737549, + "learning_rate": 2.898186100398768e-06, + "loss": 0.1465, + "step": 25359 + }, + { + "epoch": 0.6417491206316269, + "grad_norm": 3.975522518157959, + "learning_rate": 2.8978217832416737e-06, + "loss": 0.1196, + "step": 25360 + }, + { + "epoch": 0.6417744261963205, + "grad_norm": 6.864668369293213, + "learning_rate": 2.897457479641136e-06, + "loss": 0.1551, + "step": 25361 + }, + { + "epoch": 0.6417997317610142, + "grad_norm": 16.958099365234375, + "learning_rate": 2.897093189599502e-06, + "loss": 0.1879, + "step": 25362 + }, + { + "epoch": 0.641825037325708, + "grad_norm": 5.334515571594238, + "learning_rate": 2.8967289131191245e-06, + "loss": 0.1698, + "step": 25363 + }, + { + "epoch": 0.6418503428904015, + "grad_norm": 7.2003560066223145, + "learning_rate": 2.8963646502023513e-06, + "loss": 0.1415, + "step": 25364 + }, + { + "epoch": 0.6418756484550953, + "grad_norm": 24.242719650268555, + "learning_rate": 2.8960004008515308e-06, + "loss": 0.2084, + "step": 25365 + }, + { + "epoch": 0.641900954019789, + "grad_norm": 2.8567700386047363, + "learning_rate": 2.8956361650690112e-06, + "loss": 0.1407, + "step": 25366 + }, + { + "epoch": 0.6419262595844826, + "grad_norm": 2.454496145248413, + "learning_rate": 2.8952719428571437e-06, + "loss": 0.078, + "step": 25367 + }, + { + "epoch": 0.6419515651491763, + "grad_norm": 3.8496882915496826, + "learning_rate": 2.8949077342182765e-06, + "loss": 0.1523, + "step": 25368 + }, + { + "epoch": 0.64197687071387, + "grad_norm": 4.065607070922852, + "learning_rate": 2.894543539154757e-06, + "loss": 0.1761, + "step": 25369 + }, + { + "epoch": 0.6420021762785637, + "grad_norm": 2.122929811477661, + "learning_rate": 2.894179357668933e-06, + "loss": 0.0673, + "step": 25370 + }, + { + "epoch": 0.6420274818432573, + "grad_norm": 7.373312950134277, + "learning_rate": 2.893815189763157e-06, + "loss": 0.216, + "step": 25371 + }, + { + "epoch": 0.642052787407951, + "grad_norm": 4.902294158935547, + "learning_rate": 2.8934510354397743e-06, + "loss": 0.1906, + "step": 25372 + }, + { + "epoch": 0.6420780929726447, + "grad_norm": 4.153875827789307, + "learning_rate": 2.893086894701134e-06, + "loss": 0.149, + "step": 25373 + }, + { + "epoch": 0.6421033985373383, + "grad_norm": 3.996241807937622, + "learning_rate": 2.892722767549585e-06, + "loss": 0.1016, + "step": 25374 + }, + { + "epoch": 0.642128704102032, + "grad_norm": 5.37410306930542, + "learning_rate": 2.8923586539874726e-06, + "loss": 0.1624, + "step": 25375 + }, + { + "epoch": 0.6421540096667258, + "grad_norm": 6.7792582511901855, + "learning_rate": 2.891994554017149e-06, + "loss": 0.1568, + "step": 25376 + }, + { + "epoch": 0.6421793152314194, + "grad_norm": 5.694199085235596, + "learning_rate": 2.8916304676409613e-06, + "loss": 0.1681, + "step": 25377 + }, + { + "epoch": 0.6422046207961131, + "grad_norm": 2.8119025230407715, + "learning_rate": 2.8912663948612555e-06, + "loss": 0.1387, + "step": 25378 + }, + { + "epoch": 0.6422299263608068, + "grad_norm": 4.855063438415527, + "learning_rate": 2.890902335680379e-06, + "loss": 0.1496, + "step": 25379 + }, + { + "epoch": 0.6422552319255004, + "grad_norm": 8.13062858581543, + "learning_rate": 2.890538290100683e-06, + "loss": 0.1967, + "step": 25380 + }, + { + "epoch": 0.6422805374901941, + "grad_norm": 2.9402334690093994, + "learning_rate": 2.8901742581245108e-06, + "loss": 0.0323, + "step": 25381 + }, + { + "epoch": 0.6423058430548878, + "grad_norm": 11.59958553314209, + "learning_rate": 2.889810239754217e-06, + "loss": 0.2508, + "step": 25382 + }, + { + "epoch": 0.6423311486195814, + "grad_norm": 5.282135486602783, + "learning_rate": 2.88944623499214e-06, + "loss": 0.1661, + "step": 25383 + }, + { + "epoch": 0.6423564541842751, + "grad_norm": 8.0189208984375, + "learning_rate": 2.8890822438406337e-06, + "loss": 0.1656, + "step": 25384 + }, + { + "epoch": 0.6423817597489688, + "grad_norm": 8.166110038757324, + "learning_rate": 2.888718266302041e-06, + "loss": 0.1883, + "step": 25385 + }, + { + "epoch": 0.6424070653136624, + "grad_norm": 5.314916610717773, + "learning_rate": 2.888354302378716e-06, + "loss": 0.1917, + "step": 25386 + }, + { + "epoch": 0.6424323708783561, + "grad_norm": 19.557723999023438, + "learning_rate": 2.8879903520729967e-06, + "loss": 0.2194, + "step": 25387 + }, + { + "epoch": 0.6424576764430499, + "grad_norm": 8.448135375976562, + "learning_rate": 2.8876264153872375e-06, + "loss": 0.2597, + "step": 25388 + }, + { + "epoch": 0.6424829820077435, + "grad_norm": 5.789601802825928, + "learning_rate": 2.8872624923237802e-06, + "loss": 0.1549, + "step": 25389 + }, + { + "epoch": 0.6425082875724372, + "grad_norm": 5.460739612579346, + "learning_rate": 2.886898582884976e-06, + "loss": 0.1813, + "step": 25390 + }, + { + "epoch": 0.6425335931371309, + "grad_norm": 4.775365829467773, + "learning_rate": 2.8865346870731705e-06, + "loss": 0.1503, + "step": 25391 + }, + { + "epoch": 0.6425588987018245, + "grad_norm": 4.032472610473633, + "learning_rate": 2.886170804890709e-06, + "loss": 0.2031, + "step": 25392 + }, + { + "epoch": 0.6425842042665182, + "grad_norm": 3.6900455951690674, + "learning_rate": 2.8858069363399365e-06, + "loss": 0.1439, + "step": 25393 + }, + { + "epoch": 0.6426095098312119, + "grad_norm": 3.8527042865753174, + "learning_rate": 2.885443081423204e-06, + "loss": 0.1653, + "step": 25394 + }, + { + "epoch": 0.6426348153959056, + "grad_norm": 4.1682820320129395, + "learning_rate": 2.885079240142856e-06, + "loss": 0.2242, + "step": 25395 + }, + { + "epoch": 0.6426601209605992, + "grad_norm": 7.648548603057861, + "learning_rate": 2.884715412501239e-06, + "loss": 0.1392, + "step": 25396 + }, + { + "epoch": 0.6426854265252929, + "grad_norm": 8.97048282623291, + "learning_rate": 2.884351598500696e-06, + "loss": 0.1249, + "step": 25397 + }, + { + "epoch": 0.6427107320899866, + "grad_norm": 4.697266578674316, + "learning_rate": 2.8839877981435783e-06, + "loss": 0.1408, + "step": 25398 + }, + { + "epoch": 0.6427360376546802, + "grad_norm": 16.540056228637695, + "learning_rate": 2.88362401143223e-06, + "loss": 0.1799, + "step": 25399 + }, + { + "epoch": 0.642761343219374, + "grad_norm": 7.114573001861572, + "learning_rate": 2.8832602383689963e-06, + "loss": 0.1345, + "step": 25400 + }, + { + "epoch": 0.6427866487840677, + "grad_norm": 4.230997085571289, + "learning_rate": 2.8828964789562236e-06, + "loss": 0.1735, + "step": 25401 + }, + { + "epoch": 0.6428119543487613, + "grad_norm": 5.140445232391357, + "learning_rate": 2.8825327331962562e-06, + "loss": 0.1602, + "step": 25402 + }, + { + "epoch": 0.642837259913455, + "grad_norm": 4.309201240539551, + "learning_rate": 2.8821690010914425e-06, + "loss": 0.1514, + "step": 25403 + }, + { + "epoch": 0.6428625654781487, + "grad_norm": 7.5569682121276855, + "learning_rate": 2.881805282644128e-06, + "loss": 0.1615, + "step": 25404 + }, + { + "epoch": 0.6428878710428423, + "grad_norm": 7.629348278045654, + "learning_rate": 2.881441577856656e-06, + "loss": 0.1352, + "step": 25405 + }, + { + "epoch": 0.642913176607536, + "grad_norm": 7.575538635253906, + "learning_rate": 2.8810778867313717e-06, + "loss": 0.1331, + "step": 25406 + }, + { + "epoch": 0.6429384821722297, + "grad_norm": 7.650596618652344, + "learning_rate": 2.880714209270624e-06, + "loss": 0.1642, + "step": 25407 + }, + { + "epoch": 0.6429637877369233, + "grad_norm": 5.757096290588379, + "learning_rate": 2.8803505454767568e-06, + "loss": 0.1258, + "step": 25408 + }, + { + "epoch": 0.642989093301617, + "grad_norm": 9.897137641906738, + "learning_rate": 2.8799868953521137e-06, + "loss": 0.2145, + "step": 25409 + }, + { + "epoch": 0.6430143988663107, + "grad_norm": 8.468682289123535, + "learning_rate": 2.8796232588990393e-06, + "loss": 0.1536, + "step": 25410 + }, + { + "epoch": 0.6430397044310043, + "grad_norm": 4.151230812072754, + "learning_rate": 2.879259636119882e-06, + "loss": 0.1132, + "step": 25411 + }, + { + "epoch": 0.643065009995698, + "grad_norm": 3.5206923484802246, + "learning_rate": 2.878896027016983e-06, + "loss": 0.1409, + "step": 25412 + }, + { + "epoch": 0.6430903155603918, + "grad_norm": 3.8298654556274414, + "learning_rate": 2.8785324315926923e-06, + "loss": 0.1254, + "step": 25413 + }, + { + "epoch": 0.6431156211250854, + "grad_norm": 4.438422679901123, + "learning_rate": 2.8781688498493477e-06, + "loss": 0.163, + "step": 25414 + }, + { + "epoch": 0.6431409266897791, + "grad_norm": 4.25853157043457, + "learning_rate": 2.8778052817893e-06, + "loss": 0.1891, + "step": 25415 + }, + { + "epoch": 0.6431662322544728, + "grad_norm": 2.859670400619507, + "learning_rate": 2.8774417274148886e-06, + "loss": 0.1486, + "step": 25416 + }, + { + "epoch": 0.6431915378191664, + "grad_norm": 5.684825420379639, + "learning_rate": 2.8770781867284647e-06, + "loss": 0.2283, + "step": 25417 + }, + { + "epoch": 0.6432168433838601, + "grad_norm": 2.687591075897217, + "learning_rate": 2.876714659732365e-06, + "loss": 0.1653, + "step": 25418 + }, + { + "epoch": 0.6432421489485538, + "grad_norm": 5.054439544677734, + "learning_rate": 2.8763511464289397e-06, + "loss": 0.2178, + "step": 25419 + }, + { + "epoch": 0.6432674545132475, + "grad_norm": 4.561239242553711, + "learning_rate": 2.8759876468205284e-06, + "loss": 0.1877, + "step": 25420 + }, + { + "epoch": 0.6432927600779411, + "grad_norm": 3.2348294258117676, + "learning_rate": 2.8756241609094813e-06, + "loss": 0.1399, + "step": 25421 + }, + { + "epoch": 0.6433180656426348, + "grad_norm": 6.289491176605225, + "learning_rate": 2.8752606886981355e-06, + "loss": 0.085, + "step": 25422 + }, + { + "epoch": 0.6433433712073285, + "grad_norm": 3.120788335800171, + "learning_rate": 2.87489723018884e-06, + "loss": 0.1262, + "step": 25423 + }, + { + "epoch": 0.6433686767720221, + "grad_norm": 5.598243236541748, + "learning_rate": 2.874533785383935e-06, + "loss": 0.1848, + "step": 25424 + }, + { + "epoch": 0.6433939823367159, + "grad_norm": 4.260875701904297, + "learning_rate": 2.874170354285768e-06, + "loss": 0.2087, + "step": 25425 + }, + { + "epoch": 0.6434192879014096, + "grad_norm": 5.7776780128479, + "learning_rate": 2.8738069368966814e-06, + "loss": 0.2013, + "step": 25426 + }, + { + "epoch": 0.6434445934661032, + "grad_norm": 4.067024230957031, + "learning_rate": 2.873443533219018e-06, + "loss": 0.1076, + "step": 25427 + }, + { + "epoch": 0.6434698990307969, + "grad_norm": 6.48283052444458, + "learning_rate": 2.87308014325512e-06, + "loss": 0.1662, + "step": 25428 + }, + { + "epoch": 0.6434952045954906, + "grad_norm": 10.907962799072266, + "learning_rate": 2.872716767007335e-06, + "loss": 0.2115, + "step": 25429 + }, + { + "epoch": 0.6435205101601842, + "grad_norm": 5.122966289520264, + "learning_rate": 2.8723534044780033e-06, + "loss": 0.1891, + "step": 25430 + }, + { + "epoch": 0.6435458157248779, + "grad_norm": 13.091971397399902, + "learning_rate": 2.871990055669468e-06, + "loss": 0.1819, + "step": 25431 + }, + { + "epoch": 0.6435711212895716, + "grad_norm": 4.905466079711914, + "learning_rate": 2.8716267205840742e-06, + "loss": 0.141, + "step": 25432 + }, + { + "epoch": 0.6435964268542652, + "grad_norm": 7.490243911743164, + "learning_rate": 2.871263399224161e-06, + "loss": 0.0712, + "step": 25433 + }, + { + "epoch": 0.6436217324189589, + "grad_norm": 3.9151179790496826, + "learning_rate": 2.8709000915920764e-06, + "loss": 0.1167, + "step": 25434 + }, + { + "epoch": 0.6436470379836526, + "grad_norm": 5.152523994445801, + "learning_rate": 2.870536797690161e-06, + "loss": 0.124, + "step": 25435 + }, + { + "epoch": 0.6436723435483462, + "grad_norm": 4.632593631744385, + "learning_rate": 2.870173517520758e-06, + "loss": 0.1805, + "step": 25436 + }, + { + "epoch": 0.64369764911304, + "grad_norm": 2.8113601207733154, + "learning_rate": 2.8698102510862077e-06, + "loss": 0.1609, + "step": 25437 + }, + { + "epoch": 0.6437229546777337, + "grad_norm": 6.611842155456543, + "learning_rate": 2.8694469983888563e-06, + "loss": 0.2811, + "step": 25438 + }, + { + "epoch": 0.6437482602424273, + "grad_norm": 3.101433515548706, + "learning_rate": 2.8690837594310456e-06, + "loss": 0.1142, + "step": 25439 + }, + { + "epoch": 0.643773565807121, + "grad_norm": 2.7887134552001953, + "learning_rate": 2.868720534215117e-06, + "loss": 0.107, + "step": 25440 + }, + { + "epoch": 0.6437988713718147, + "grad_norm": 6.477145195007324, + "learning_rate": 2.8683573227434115e-06, + "loss": 0.1923, + "step": 25441 + }, + { + "epoch": 0.6438241769365083, + "grad_norm": 4.99484920501709, + "learning_rate": 2.867994125018275e-06, + "loss": 0.1595, + "step": 25442 + }, + { + "epoch": 0.643849482501202, + "grad_norm": 4.0405755043029785, + "learning_rate": 2.8676309410420456e-06, + "loss": 0.1835, + "step": 25443 + }, + { + "epoch": 0.6438747880658957, + "grad_norm": 8.079691886901855, + "learning_rate": 2.8672677708170715e-06, + "loss": 0.1928, + "step": 25444 + }, + { + "epoch": 0.6439000936305894, + "grad_norm": 6.226831912994385, + "learning_rate": 2.866904614345687e-06, + "loss": 0.1592, + "step": 25445 + }, + { + "epoch": 0.643925399195283, + "grad_norm": 4.770963191986084, + "learning_rate": 2.8665414716302397e-06, + "loss": 0.1477, + "step": 25446 + }, + { + "epoch": 0.6439507047599767, + "grad_norm": 6.419914722442627, + "learning_rate": 2.8661783426730682e-06, + "loss": 0.1849, + "step": 25447 + }, + { + "epoch": 0.6439760103246704, + "grad_norm": 4.051797389984131, + "learning_rate": 2.8658152274765187e-06, + "loss": 0.1258, + "step": 25448 + }, + { + "epoch": 0.644001315889364, + "grad_norm": 3.939232587814331, + "learning_rate": 2.865452126042927e-06, + "loss": 0.1004, + "step": 25449 + }, + { + "epoch": 0.6440266214540578, + "grad_norm": 4.994705677032471, + "learning_rate": 2.865089038374639e-06, + "loss": 0.1595, + "step": 25450 + }, + { + "epoch": 0.6440519270187515, + "grad_norm": 4.494208812713623, + "learning_rate": 2.864725964473992e-06, + "loss": 0.1202, + "step": 25451 + }, + { + "epoch": 0.6440772325834451, + "grad_norm": 5.971782684326172, + "learning_rate": 2.864362904343334e-06, + "loss": 0.0893, + "step": 25452 + }, + { + "epoch": 0.6441025381481388, + "grad_norm": 3.500196933746338, + "learning_rate": 2.863999857984999e-06, + "loss": 0.1291, + "step": 25453 + }, + { + "epoch": 0.6441278437128325, + "grad_norm": 6.511157035827637, + "learning_rate": 2.8636368254013336e-06, + "loss": 0.2699, + "step": 25454 + }, + { + "epoch": 0.6441531492775261, + "grad_norm": 3.0461206436157227, + "learning_rate": 2.8632738065946754e-06, + "loss": 0.1004, + "step": 25455 + }, + { + "epoch": 0.6441784548422198, + "grad_norm": 3.716440439224243, + "learning_rate": 2.862910801567368e-06, + "loss": 0.1169, + "step": 25456 + }, + { + "epoch": 0.6442037604069135, + "grad_norm": 3.4315757751464844, + "learning_rate": 2.862547810321752e-06, + "loss": 0.1415, + "step": 25457 + }, + { + "epoch": 0.6442290659716071, + "grad_norm": 4.5266242027282715, + "learning_rate": 2.862184832860167e-06, + "loss": 0.126, + "step": 25458 + }, + { + "epoch": 0.6442543715363008, + "grad_norm": 5.028354167938232, + "learning_rate": 2.8618218691849546e-06, + "loss": 0.1547, + "step": 25459 + }, + { + "epoch": 0.6442796771009945, + "grad_norm": 7.371913433074951, + "learning_rate": 2.861458919298453e-06, + "loss": 0.1864, + "step": 25460 + }, + { + "epoch": 0.6443049826656881, + "grad_norm": 5.314137935638428, + "learning_rate": 2.861095983203008e-06, + "loss": 0.1287, + "step": 25461 + }, + { + "epoch": 0.6443302882303819, + "grad_norm": 2.9175894260406494, + "learning_rate": 2.8607330609009564e-06, + "loss": 0.1773, + "step": 25462 + }, + { + "epoch": 0.6443555937950756, + "grad_norm": 8.892988204956055, + "learning_rate": 2.8603701523946393e-06, + "loss": 0.1525, + "step": 25463 + }, + { + "epoch": 0.6443808993597692, + "grad_norm": 14.341285705566406, + "learning_rate": 2.8600072576863945e-06, + "loss": 0.223, + "step": 25464 + }, + { + "epoch": 0.6444062049244629, + "grad_norm": 2.788008451461792, + "learning_rate": 2.8596443767785683e-06, + "loss": 0.1621, + "step": 25465 + }, + { + "epoch": 0.6444315104891566, + "grad_norm": 6.042851448059082, + "learning_rate": 2.859281509673496e-06, + "loss": 0.1743, + "step": 25466 + }, + { + "epoch": 0.6444568160538502, + "grad_norm": 10.568939208984375, + "learning_rate": 2.85891865637352e-06, + "loss": 0.2321, + "step": 25467 + }, + { + "epoch": 0.6444821216185439, + "grad_norm": 6.089118957519531, + "learning_rate": 2.858555816880977e-06, + "loss": 0.2146, + "step": 25468 + }, + { + "epoch": 0.6445074271832376, + "grad_norm": 4.620340824127197, + "learning_rate": 2.858192991198211e-06, + "loss": 0.1685, + "step": 25469 + }, + { + "epoch": 0.6445327327479313, + "grad_norm": 3.5002572536468506, + "learning_rate": 2.85783017932756e-06, + "loss": 0.1718, + "step": 25470 + }, + { + "epoch": 0.6445580383126249, + "grad_norm": 5.016852378845215, + "learning_rate": 2.857467381271364e-06, + "loss": 0.1671, + "step": 25471 + }, + { + "epoch": 0.6445833438773186, + "grad_norm": 4.917625904083252, + "learning_rate": 2.85710459703196e-06, + "loss": 0.1306, + "step": 25472 + }, + { + "epoch": 0.6446086494420123, + "grad_norm": 5.129439353942871, + "learning_rate": 2.856741826611692e-06, + "loss": 0.1054, + "step": 25473 + }, + { + "epoch": 0.644633955006706, + "grad_norm": 5.186293125152588, + "learning_rate": 2.8563790700128973e-06, + "loss": 0.1255, + "step": 25474 + }, + { + "epoch": 0.6446592605713997, + "grad_norm": 3.414872169494629, + "learning_rate": 2.856016327237915e-06, + "loss": 0.1279, + "step": 25475 + }, + { + "epoch": 0.6446845661360934, + "grad_norm": 8.264636039733887, + "learning_rate": 2.855653598289083e-06, + "loss": 0.1971, + "step": 25476 + }, + { + "epoch": 0.644709871700787, + "grad_norm": 4.773152828216553, + "learning_rate": 2.8552908831687443e-06, + "loss": 0.0915, + "step": 25477 + }, + { + "epoch": 0.6447351772654807, + "grad_norm": 9.49422836303711, + "learning_rate": 2.854928181879233e-06, + "loss": 0.1609, + "step": 25478 + }, + { + "epoch": 0.6447604828301744, + "grad_norm": 5.474697589874268, + "learning_rate": 2.854565494422895e-06, + "loss": 0.2062, + "step": 25479 + }, + { + "epoch": 0.644785788394868, + "grad_norm": 16.492046356201172, + "learning_rate": 2.854202820802061e-06, + "loss": 0.3188, + "step": 25480 + }, + { + "epoch": 0.6448110939595617, + "grad_norm": 7.504924297332764, + "learning_rate": 2.8538401610190764e-06, + "loss": 0.186, + "step": 25481 + }, + { + "epoch": 0.6448363995242554, + "grad_norm": 3.2090511322021484, + "learning_rate": 2.853477515076275e-06, + "loss": 0.1463, + "step": 25482 + }, + { + "epoch": 0.644861705088949, + "grad_norm": 3.9174087047576904, + "learning_rate": 2.853114882976002e-06, + "loss": 0.1577, + "step": 25483 + }, + { + "epoch": 0.6448870106536427, + "grad_norm": 7.549218654632568, + "learning_rate": 2.8527522647205875e-06, + "loss": 0.2005, + "step": 25484 + }, + { + "epoch": 0.6449123162183364, + "grad_norm": 6.050161361694336, + "learning_rate": 2.852389660312376e-06, + "loss": 0.1491, + "step": 25485 + }, + { + "epoch": 0.64493762178303, + "grad_norm": 3.6899006366729736, + "learning_rate": 2.8520270697537045e-06, + "loss": 0.0722, + "step": 25486 + }, + { + "epoch": 0.6449629273477238, + "grad_norm": 4.320812225341797, + "learning_rate": 2.8516644930469095e-06, + "loss": 0.0819, + "step": 25487 + }, + { + "epoch": 0.6449882329124175, + "grad_norm": 4.468209266662598, + "learning_rate": 2.851301930194331e-06, + "loss": 0.1537, + "step": 25488 + }, + { + "epoch": 0.6450135384771111, + "grad_norm": 3.2214465141296387, + "learning_rate": 2.850939381198308e-06, + "loss": 0.174, + "step": 25489 + }, + { + "epoch": 0.6450388440418048, + "grad_norm": 4.025468349456787, + "learning_rate": 2.8505768460611773e-06, + "loss": 0.1461, + "step": 25490 + }, + { + "epoch": 0.6450641496064985, + "grad_norm": 11.808756828308105, + "learning_rate": 2.8502143247852738e-06, + "loss": 0.2273, + "step": 25491 + }, + { + "epoch": 0.6450894551711921, + "grad_norm": 8.671775817871094, + "learning_rate": 2.8498518173729406e-06, + "loss": 0.1725, + "step": 25492 + }, + { + "epoch": 0.6451147607358858, + "grad_norm": 5.186283588409424, + "learning_rate": 2.849489323826513e-06, + "loss": 0.1314, + "step": 25493 + }, + { + "epoch": 0.6451400663005795, + "grad_norm": 8.221528053283691, + "learning_rate": 2.849126844148329e-06, + "loss": 0.2079, + "step": 25494 + }, + { + "epoch": 0.6451653718652731, + "grad_norm": 12.762043952941895, + "learning_rate": 2.848764378340723e-06, + "loss": 0.1532, + "step": 25495 + }, + { + "epoch": 0.6451906774299668, + "grad_norm": 3.3544740676879883, + "learning_rate": 2.848401926406038e-06, + "loss": 0.1323, + "step": 25496 + }, + { + "epoch": 0.6452159829946605, + "grad_norm": 14.761317253112793, + "learning_rate": 2.848039488346609e-06, + "loss": 0.2226, + "step": 25497 + }, + { + "epoch": 0.6452412885593543, + "grad_norm": 5.435059070587158, + "learning_rate": 2.8476770641647726e-06, + "loss": 0.1441, + "step": 25498 + }, + { + "epoch": 0.6452665941240479, + "grad_norm": 8.331523895263672, + "learning_rate": 2.8473146538628648e-06, + "loss": 0.157, + "step": 25499 + }, + { + "epoch": 0.6452918996887416, + "grad_norm": 4.555671691894531, + "learning_rate": 2.846952257443226e-06, + "loss": 0.2377, + "step": 25500 + }, + { + "epoch": 0.6453172052534353, + "grad_norm": 5.539039611816406, + "learning_rate": 2.846589874908192e-06, + "loss": 0.171, + "step": 25501 + }, + { + "epoch": 0.6453425108181289, + "grad_norm": 3.9761455059051514, + "learning_rate": 2.8462275062600992e-06, + "loss": 0.2252, + "step": 25502 + }, + { + "epoch": 0.6453678163828226, + "grad_norm": 4.898163795471191, + "learning_rate": 2.8458651515012826e-06, + "loss": 0.1325, + "step": 25503 + }, + { + "epoch": 0.6453931219475163, + "grad_norm": 3.581052780151367, + "learning_rate": 2.8455028106340824e-06, + "loss": 0.134, + "step": 25504 + }, + { + "epoch": 0.6454184275122099, + "grad_norm": 4.8057026863098145, + "learning_rate": 2.8451404836608344e-06, + "loss": 0.1657, + "step": 25505 + }, + { + "epoch": 0.6454437330769036, + "grad_norm": 5.395286560058594, + "learning_rate": 2.8447781705838747e-06, + "loss": 0.0912, + "step": 25506 + }, + { + "epoch": 0.6454690386415973, + "grad_norm": 3.987558126449585, + "learning_rate": 2.844415871405537e-06, + "loss": 0.1177, + "step": 25507 + }, + { + "epoch": 0.6454943442062909, + "grad_norm": 5.084777355194092, + "learning_rate": 2.8440535861281633e-06, + "loss": 0.1612, + "step": 25508 + }, + { + "epoch": 0.6455196497709846, + "grad_norm": 3.6421549320220947, + "learning_rate": 2.8436913147540847e-06, + "loss": 0.1646, + "step": 25509 + }, + { + "epoch": 0.6455449553356783, + "grad_norm": 4.043055534362793, + "learning_rate": 2.843329057285643e-06, + "loss": 0.1033, + "step": 25510 + }, + { + "epoch": 0.645570260900372, + "grad_norm": 16.219039916992188, + "learning_rate": 2.8429668137251677e-06, + "loss": 0.3076, + "step": 25511 + }, + { + "epoch": 0.6455955664650657, + "grad_norm": 3.94856858253479, + "learning_rate": 2.842604584075e-06, + "loss": 0.1157, + "step": 25512 + }, + { + "epoch": 0.6456208720297594, + "grad_norm": 4.436614036560059, + "learning_rate": 2.8422423683374724e-06, + "loss": 0.1707, + "step": 25513 + }, + { + "epoch": 0.645646177594453, + "grad_norm": 6.800817012786865, + "learning_rate": 2.8418801665149254e-06, + "loss": 0.1812, + "step": 25514 + }, + { + "epoch": 0.6456714831591467, + "grad_norm": 10.180391311645508, + "learning_rate": 2.8415179786096884e-06, + "loss": 0.2166, + "step": 25515 + }, + { + "epoch": 0.6456967887238404, + "grad_norm": 7.956510543823242, + "learning_rate": 2.841155804624103e-06, + "loss": 0.2537, + "step": 25516 + }, + { + "epoch": 0.645722094288534, + "grad_norm": 2.8173739910125732, + "learning_rate": 2.840793644560501e-06, + "loss": 0.1397, + "step": 25517 + }, + { + "epoch": 0.6457473998532277, + "grad_norm": 5.3921799659729, + "learning_rate": 2.8404314984212177e-06, + "loss": 0.1528, + "step": 25518 + }, + { + "epoch": 0.6457727054179214, + "grad_norm": 6.792978763580322, + "learning_rate": 2.8400693662085926e-06, + "loss": 0.135, + "step": 25519 + }, + { + "epoch": 0.645798010982615, + "grad_norm": 4.199524879455566, + "learning_rate": 2.8397072479249577e-06, + "loss": 0.1704, + "step": 25520 + }, + { + "epoch": 0.6458233165473087, + "grad_norm": 5.1225409507751465, + "learning_rate": 2.8393451435726494e-06, + "loss": 0.1899, + "step": 25521 + }, + { + "epoch": 0.6458486221120024, + "grad_norm": 4.369232177734375, + "learning_rate": 2.8389830531539996e-06, + "loss": 0.0945, + "step": 25522 + }, + { + "epoch": 0.6458739276766962, + "grad_norm": 6.3327717781066895, + "learning_rate": 2.8386209766713508e-06, + "loss": 0.2014, + "step": 25523 + }, + { + "epoch": 0.6458992332413898, + "grad_norm": 3.6875391006469727, + "learning_rate": 2.838258914127029e-06, + "loss": 0.1932, + "step": 25524 + }, + { + "epoch": 0.6459245388060835, + "grad_norm": 5.938290119171143, + "learning_rate": 2.8378968655233752e-06, + "loss": 0.1744, + "step": 25525 + }, + { + "epoch": 0.6459498443707772, + "grad_norm": 9.53917407989502, + "learning_rate": 2.8375348308627203e-06, + "loss": 0.2236, + "step": 25526 + }, + { + "epoch": 0.6459751499354708, + "grad_norm": 3.092543125152588, + "learning_rate": 2.8371728101474057e-06, + "loss": 0.1514, + "step": 25527 + }, + { + "epoch": 0.6460004555001645, + "grad_norm": 6.520809173583984, + "learning_rate": 2.836810803379757e-06, + "loss": 0.175, + "step": 25528 + }, + { + "epoch": 0.6460257610648582, + "grad_norm": 4.864506244659424, + "learning_rate": 2.8364488105621147e-06, + "loss": 0.1953, + "step": 25529 + }, + { + "epoch": 0.6460510666295518, + "grad_norm": 4.391780853271484, + "learning_rate": 2.836086831696809e-06, + "loss": 0.1615, + "step": 25530 + }, + { + "epoch": 0.6460763721942455, + "grad_norm": 14.158374786376953, + "learning_rate": 2.8357248667861804e-06, + "loss": 0.1318, + "step": 25531 + }, + { + "epoch": 0.6461016777589392, + "grad_norm": 7.57552433013916, + "learning_rate": 2.8353629158325587e-06, + "loss": 0.1718, + "step": 25532 + }, + { + "epoch": 0.6461269833236328, + "grad_norm": 4.227193832397461, + "learning_rate": 2.8350009788382788e-06, + "loss": 0.1679, + "step": 25533 + }, + { + "epoch": 0.6461522888883265, + "grad_norm": 3.848222017288208, + "learning_rate": 2.834639055805673e-06, + "loss": 0.1935, + "step": 25534 + }, + { + "epoch": 0.6461775944530203, + "grad_norm": 3.3913450241088867, + "learning_rate": 2.8342771467370788e-06, + "loss": 0.2074, + "step": 25535 + }, + { + "epoch": 0.6462029000177139, + "grad_norm": 3.1429243087768555, + "learning_rate": 2.8339152516348288e-06, + "loss": 0.1498, + "step": 25536 + }, + { + "epoch": 0.6462282055824076, + "grad_norm": 4.35188627243042, + "learning_rate": 2.8335533705012562e-06, + "loss": 0.1413, + "step": 25537 + }, + { + "epoch": 0.6462535111471013, + "grad_norm": 4.949275970458984, + "learning_rate": 2.8331915033386938e-06, + "loss": 0.1647, + "step": 25538 + }, + { + "epoch": 0.6462788167117949, + "grad_norm": 2.7111799716949463, + "learning_rate": 2.832829650149478e-06, + "loss": 0.1048, + "step": 25539 + }, + { + "epoch": 0.6463041222764886, + "grad_norm": 34.82127380371094, + "learning_rate": 2.832467810935938e-06, + "loss": 0.1361, + "step": 25540 + }, + { + "epoch": 0.6463294278411823, + "grad_norm": 7.942224502563477, + "learning_rate": 2.832105985700414e-06, + "loss": 0.1766, + "step": 25541 + }, + { + "epoch": 0.6463547334058759, + "grad_norm": 6.915010929107666, + "learning_rate": 2.8317441744452313e-06, + "loss": 0.1444, + "step": 25542 + }, + { + "epoch": 0.6463800389705696, + "grad_norm": 19.41713523864746, + "learning_rate": 2.8313823771727295e-06, + "loss": 0.1677, + "step": 25543 + }, + { + "epoch": 0.6464053445352633, + "grad_norm": 2.417144775390625, + "learning_rate": 2.8310205938852385e-06, + "loss": 0.0815, + "step": 25544 + }, + { + "epoch": 0.6464306500999569, + "grad_norm": 4.430877685546875, + "learning_rate": 2.830658824585093e-06, + "loss": 0.1129, + "step": 25545 + }, + { + "epoch": 0.6464559556646506, + "grad_norm": 2.434408664703369, + "learning_rate": 2.8302970692746234e-06, + "loss": 0.1258, + "step": 25546 + }, + { + "epoch": 0.6464812612293444, + "grad_norm": 6.002383232116699, + "learning_rate": 2.829935327956166e-06, + "loss": 0.1636, + "step": 25547 + }, + { + "epoch": 0.6465065667940381, + "grad_norm": 8.171008110046387, + "learning_rate": 2.8295736006320517e-06, + "loss": 0.2096, + "step": 25548 + }, + { + "epoch": 0.6465318723587317, + "grad_norm": 7.732564449310303, + "learning_rate": 2.8292118873046138e-06, + "loss": 0.258, + "step": 25549 + }, + { + "epoch": 0.6465571779234254, + "grad_norm": 7.383909225463867, + "learning_rate": 2.828850187976183e-06, + "loss": 0.1334, + "step": 25550 + }, + { + "epoch": 0.6465824834881191, + "grad_norm": 3.5221099853515625, + "learning_rate": 2.828488502649095e-06, + "loss": 0.1461, + "step": 25551 + }, + { + "epoch": 0.6466077890528127, + "grad_norm": 13.524662971496582, + "learning_rate": 2.828126831325681e-06, + "loss": 0.2746, + "step": 25552 + }, + { + "epoch": 0.6466330946175064, + "grad_norm": 6.145768165588379, + "learning_rate": 2.827765174008271e-06, + "loss": 0.1978, + "step": 25553 + }, + { + "epoch": 0.6466584001822001, + "grad_norm": 3.319213390350342, + "learning_rate": 2.827403530699203e-06, + "loss": 0.145, + "step": 25554 + }, + { + "epoch": 0.6466837057468937, + "grad_norm": 6.588183403015137, + "learning_rate": 2.827041901400802e-06, + "loss": 0.1076, + "step": 25555 + }, + { + "epoch": 0.6467090113115874, + "grad_norm": 4.478092193603516, + "learning_rate": 2.826680286115405e-06, + "loss": 0.1502, + "step": 25556 + }, + { + "epoch": 0.6467343168762811, + "grad_norm": 4.179916858673096, + "learning_rate": 2.826318684845342e-06, + "loss": 0.2019, + "step": 25557 + }, + { + "epoch": 0.6467596224409747, + "grad_norm": 4.588439464569092, + "learning_rate": 2.8259570975929475e-06, + "loss": 0.1327, + "step": 25558 + }, + { + "epoch": 0.6467849280056684, + "grad_norm": 9.230799674987793, + "learning_rate": 2.825595524360548e-06, + "loss": 0.2667, + "step": 25559 + }, + { + "epoch": 0.6468102335703622, + "grad_norm": 5.715040683746338, + "learning_rate": 2.8252339651504813e-06, + "loss": 0.1535, + "step": 25560 + }, + { + "epoch": 0.6468355391350558, + "grad_norm": 8.59518814086914, + "learning_rate": 2.8248724199650736e-06, + "loss": 0.1614, + "step": 25561 + }, + { + "epoch": 0.6468608446997495, + "grad_norm": 3.016693115234375, + "learning_rate": 2.8245108888066607e-06, + "loss": 0.1095, + "step": 25562 + }, + { + "epoch": 0.6468861502644432, + "grad_norm": 4.234543800354004, + "learning_rate": 2.8241493716775724e-06, + "loss": 0.1099, + "step": 25563 + }, + { + "epoch": 0.6469114558291368, + "grad_norm": 9.724302291870117, + "learning_rate": 2.8237878685801405e-06, + "loss": 0.1777, + "step": 25564 + }, + { + "epoch": 0.6469367613938305, + "grad_norm": 5.509744644165039, + "learning_rate": 2.823426379516694e-06, + "loss": 0.1968, + "step": 25565 + }, + { + "epoch": 0.6469620669585242, + "grad_norm": 4.193905353546143, + "learning_rate": 2.8230649044895676e-06, + "loss": 0.1333, + "step": 25566 + }, + { + "epoch": 0.6469873725232178, + "grad_norm": 2.7013649940490723, + "learning_rate": 2.8227034435010903e-06, + "loss": 0.0981, + "step": 25567 + }, + { + "epoch": 0.6470126780879115, + "grad_norm": 6.045373439788818, + "learning_rate": 2.822341996553594e-06, + "loss": 0.1536, + "step": 25568 + }, + { + "epoch": 0.6470379836526052, + "grad_norm": 7.653750419616699, + "learning_rate": 2.8219805636494068e-06, + "loss": 0.2016, + "step": 25569 + }, + { + "epoch": 0.6470632892172988, + "grad_norm": 3.297781467437744, + "learning_rate": 2.8216191447908647e-06, + "loss": 0.1057, + "step": 25570 + }, + { + "epoch": 0.6470885947819925, + "grad_norm": 3.288212776184082, + "learning_rate": 2.821257739980294e-06, + "loss": 0.1268, + "step": 25571 + }, + { + "epoch": 0.6471139003466863, + "grad_norm": 3.313847780227661, + "learning_rate": 2.820896349220028e-06, + "loss": 0.125, + "step": 25572 + }, + { + "epoch": 0.64713920591138, + "grad_norm": 5.027537822723389, + "learning_rate": 2.820534972512394e-06, + "loss": 0.1563, + "step": 25573 + }, + { + "epoch": 0.6471645114760736, + "grad_norm": 3.4312727451324463, + "learning_rate": 2.8201736098597266e-06, + "loss": 0.1418, + "step": 25574 + }, + { + "epoch": 0.6471898170407673, + "grad_norm": 4.025542736053467, + "learning_rate": 2.8198122612643535e-06, + "loss": 0.1708, + "step": 25575 + }, + { + "epoch": 0.647215122605461, + "grad_norm": 4.871729850769043, + "learning_rate": 2.8194509267286062e-06, + "loss": 0.0968, + "step": 25576 + }, + { + "epoch": 0.6472404281701546, + "grad_norm": 7.272544860839844, + "learning_rate": 2.8190896062548117e-06, + "loss": 0.2049, + "step": 25577 + }, + { + "epoch": 0.6472657337348483, + "grad_norm": 3.530397653579712, + "learning_rate": 2.8187282998453046e-06, + "loss": 0.1291, + "step": 25578 + }, + { + "epoch": 0.647291039299542, + "grad_norm": 6.453320026397705, + "learning_rate": 2.818367007502414e-06, + "loss": 0.1927, + "step": 25579 + }, + { + "epoch": 0.6473163448642356, + "grad_norm": 5.802269458770752, + "learning_rate": 2.818005729228468e-06, + "loss": 0.208, + "step": 25580 + }, + { + "epoch": 0.6473416504289293, + "grad_norm": 5.625732421875, + "learning_rate": 2.8176444650257974e-06, + "loss": 0.1111, + "step": 25581 + }, + { + "epoch": 0.647366955993623, + "grad_norm": 5.490891456604004, + "learning_rate": 2.817283214896729e-06, + "loss": 0.1529, + "step": 25582 + }, + { + "epoch": 0.6473922615583166, + "grad_norm": 8.61095142364502, + "learning_rate": 2.8169219788435974e-06, + "loss": 0.2006, + "step": 25583 + }, + { + "epoch": 0.6474175671230104, + "grad_norm": 6.440751075744629, + "learning_rate": 2.8165607568687282e-06, + "loss": 0.1454, + "step": 25584 + }, + { + "epoch": 0.6474428726877041, + "grad_norm": 5.107702255249023, + "learning_rate": 2.8161995489744566e-06, + "loss": 0.1376, + "step": 25585 + }, + { + "epoch": 0.6474681782523977, + "grad_norm": 3.9436631202697754, + "learning_rate": 2.815838355163104e-06, + "loss": 0.1216, + "step": 25586 + }, + { + "epoch": 0.6474934838170914, + "grad_norm": 5.176059246063232, + "learning_rate": 2.815477175437005e-06, + "loss": 0.1701, + "step": 25587 + }, + { + "epoch": 0.6475187893817851, + "grad_norm": 5.909534454345703, + "learning_rate": 2.815116009798485e-06, + "loss": 0.1539, + "step": 25588 + }, + { + "epoch": 0.6475440949464787, + "grad_norm": 7.517429351806641, + "learning_rate": 2.8147548582498807e-06, + "loss": 0.1265, + "step": 25589 + }, + { + "epoch": 0.6475694005111724, + "grad_norm": 14.272675514221191, + "learning_rate": 2.8143937207935113e-06, + "loss": 0.3316, + "step": 25590 + }, + { + "epoch": 0.6475947060758661, + "grad_norm": 4.458251476287842, + "learning_rate": 2.8140325974317116e-06, + "loss": 0.1677, + "step": 25591 + }, + { + "epoch": 0.6476200116405597, + "grad_norm": 5.012546062469482, + "learning_rate": 2.813671488166808e-06, + "loss": 0.2036, + "step": 25592 + }, + { + "epoch": 0.6476453172052534, + "grad_norm": 5.804945468902588, + "learning_rate": 2.8133103930011318e-06, + "loss": 0.1965, + "step": 25593 + }, + { + "epoch": 0.6476706227699471, + "grad_norm": 4.633667945861816, + "learning_rate": 2.81294931193701e-06, + "loss": 0.1867, + "step": 25594 + }, + { + "epoch": 0.6476959283346407, + "grad_norm": 3.435220956802368, + "learning_rate": 2.812588244976772e-06, + "loss": 0.1099, + "step": 25595 + }, + { + "epoch": 0.6477212338993344, + "grad_norm": 4.2751784324646, + "learning_rate": 2.812227192122743e-06, + "loss": 0.1485, + "step": 25596 + }, + { + "epoch": 0.6477465394640282, + "grad_norm": 8.39976692199707, + "learning_rate": 2.811866153377256e-06, + "loss": 0.1752, + "step": 25597 + }, + { + "epoch": 0.6477718450287219, + "grad_norm": 3.757970094680786, + "learning_rate": 2.8115051287426366e-06, + "loss": 0.1545, + "step": 25598 + }, + { + "epoch": 0.6477971505934155, + "grad_norm": 2.8191325664520264, + "learning_rate": 2.811144118221214e-06, + "loss": 0.1623, + "step": 25599 + }, + { + "epoch": 0.6478224561581092, + "grad_norm": 2.6252429485321045, + "learning_rate": 2.8107831218153136e-06, + "loss": 0.1242, + "step": 25600 + }, + { + "epoch": 0.6478477617228029, + "grad_norm": 7.397663116455078, + "learning_rate": 2.810422139527268e-06, + "loss": 0.1761, + "step": 25601 + }, + { + "epoch": 0.6478730672874965, + "grad_norm": 7.407832145690918, + "learning_rate": 2.8100611713594027e-06, + "loss": 0.1661, + "step": 25602 + }, + { + "epoch": 0.6478983728521902, + "grad_norm": 3.4248528480529785, + "learning_rate": 2.8097002173140453e-06, + "loss": 0.1406, + "step": 25603 + }, + { + "epoch": 0.6479236784168839, + "grad_norm": 2.988109827041626, + "learning_rate": 2.8093392773935213e-06, + "loss": 0.1615, + "step": 25604 + }, + { + "epoch": 0.6479489839815775, + "grad_norm": 5.093240261077881, + "learning_rate": 2.8089783516001636e-06, + "loss": 0.1289, + "step": 25605 + }, + { + "epoch": 0.6479742895462712, + "grad_norm": 6.686444282531738, + "learning_rate": 2.8086174399362966e-06, + "loss": 0.215, + "step": 25606 + }, + { + "epoch": 0.6479995951109649, + "grad_norm": 3.9893651008605957, + "learning_rate": 2.8082565424042486e-06, + "loss": 0.1456, + "step": 25607 + }, + { + "epoch": 0.6480249006756585, + "grad_norm": 9.714189529418945, + "learning_rate": 2.8078956590063454e-06, + "loss": 0.1065, + "step": 25608 + }, + { + "epoch": 0.6480502062403523, + "grad_norm": 5.455635070800781, + "learning_rate": 2.8075347897449148e-06, + "loss": 0.1487, + "step": 25609 + }, + { + "epoch": 0.648075511805046, + "grad_norm": 8.159995079040527, + "learning_rate": 2.8071739346222856e-06, + "loss": 0.2164, + "step": 25610 + }, + { + "epoch": 0.6481008173697396, + "grad_norm": 5.275363922119141, + "learning_rate": 2.8068130936407833e-06, + "loss": 0.1599, + "step": 25611 + }, + { + "epoch": 0.6481261229344333, + "grad_norm": 6.417982578277588, + "learning_rate": 2.806452266802736e-06, + "loss": 0.1328, + "step": 25612 + }, + { + "epoch": 0.648151428499127, + "grad_norm": 3.141836643218994, + "learning_rate": 2.806091454110468e-06, + "loss": 0.1402, + "step": 25613 + }, + { + "epoch": 0.6481767340638206, + "grad_norm": 4.350602149963379, + "learning_rate": 2.8057306555663106e-06, + "loss": 0.1728, + "step": 25614 + }, + { + "epoch": 0.6482020396285143, + "grad_norm": 6.423550605773926, + "learning_rate": 2.805369871172585e-06, + "loss": 0.1777, + "step": 25615 + }, + { + "epoch": 0.648227345193208, + "grad_norm": 3.885707139968872, + "learning_rate": 2.805009100931626e-06, + "loss": 0.1419, + "step": 25616 + }, + { + "epoch": 0.6482526507579016, + "grad_norm": 3.6477742195129395, + "learning_rate": 2.80464834484575e-06, + "loss": 0.1092, + "step": 25617 + }, + { + "epoch": 0.6482779563225953, + "grad_norm": 5.1572465896606445, + "learning_rate": 2.804287602917291e-06, + "loss": 0.1658, + "step": 25618 + }, + { + "epoch": 0.648303261887289, + "grad_norm": 2.413475751876831, + "learning_rate": 2.8039268751485706e-06, + "loss": 0.1201, + "step": 25619 + }, + { + "epoch": 0.6483285674519826, + "grad_norm": 12.938870429992676, + "learning_rate": 2.803566161541922e-06, + "loss": 0.213, + "step": 25620 + }, + { + "epoch": 0.6483538730166764, + "grad_norm": 4.5735907554626465, + "learning_rate": 2.8032054620996617e-06, + "loss": 0.2121, + "step": 25621 + }, + { + "epoch": 0.6483791785813701, + "grad_norm": 9.8622465133667, + "learning_rate": 2.802844776824123e-06, + "loss": 0.1309, + "step": 25622 + }, + { + "epoch": 0.6484044841460637, + "grad_norm": 3.8702433109283447, + "learning_rate": 2.8024841057176276e-06, + "loss": 0.1322, + "step": 25623 + }, + { + "epoch": 0.6484297897107574, + "grad_norm": 9.57811164855957, + "learning_rate": 2.8021234487825057e-06, + "loss": 0.242, + "step": 25624 + }, + { + "epoch": 0.6484550952754511, + "grad_norm": 10.2205228805542, + "learning_rate": 2.8017628060210805e-06, + "loss": 0.2316, + "step": 25625 + }, + { + "epoch": 0.6484804008401448, + "grad_norm": 3.3590621948242188, + "learning_rate": 2.801402177435678e-06, + "loss": 0.1443, + "step": 25626 + }, + { + "epoch": 0.6485057064048384, + "grad_norm": 5.615594863891602, + "learning_rate": 2.8010415630286226e-06, + "loss": 0.1278, + "step": 25627 + }, + { + "epoch": 0.6485310119695321, + "grad_norm": 5.4789605140686035, + "learning_rate": 2.8006809628022426e-06, + "loss": 0.1609, + "step": 25628 + }, + { + "epoch": 0.6485563175342258, + "grad_norm": 3.6193673610687256, + "learning_rate": 2.8003203767588614e-06, + "loss": 0.096, + "step": 25629 + }, + { + "epoch": 0.6485816230989194, + "grad_norm": 7.748725891113281, + "learning_rate": 2.7999598049008057e-06, + "loss": 0.2181, + "step": 25630 + }, + { + "epoch": 0.6486069286636131, + "grad_norm": 4.582974433898926, + "learning_rate": 2.799599247230398e-06, + "loss": 0.2, + "step": 25631 + }, + { + "epoch": 0.6486322342283068, + "grad_norm": 3.6207423210144043, + "learning_rate": 2.7992387037499673e-06, + "loss": 0.1502, + "step": 25632 + }, + { + "epoch": 0.6486575397930004, + "grad_norm": 10.827105522155762, + "learning_rate": 2.798878174461837e-06, + "loss": 0.23, + "step": 25633 + }, + { + "epoch": 0.6486828453576942, + "grad_norm": 3.2732808589935303, + "learning_rate": 2.7985176593683316e-06, + "loss": 0.1801, + "step": 25634 + }, + { + "epoch": 0.6487081509223879, + "grad_norm": 3.7290406227111816, + "learning_rate": 2.798157158471775e-06, + "loss": 0.1902, + "step": 25635 + }, + { + "epoch": 0.6487334564870815, + "grad_norm": 4.228762626647949, + "learning_rate": 2.7977966717744943e-06, + "loss": 0.1569, + "step": 25636 + }, + { + "epoch": 0.6487587620517752, + "grad_norm": 3.552318572998047, + "learning_rate": 2.7974361992788134e-06, + "loss": 0.1275, + "step": 25637 + }, + { + "epoch": 0.6487840676164689, + "grad_norm": 2.4210548400878906, + "learning_rate": 2.7970757409870576e-06, + "loss": 0.1236, + "step": 25638 + }, + { + "epoch": 0.6488093731811625, + "grad_norm": 6.204902172088623, + "learning_rate": 2.79671529690155e-06, + "loss": 0.2219, + "step": 25639 + }, + { + "epoch": 0.6488346787458562, + "grad_norm": 5.29154634475708, + "learning_rate": 2.7963548670246134e-06, + "loss": 0.1134, + "step": 25640 + }, + { + "epoch": 0.6488599843105499, + "grad_norm": 2.6828932762145996, + "learning_rate": 2.7959944513585765e-06, + "loss": 0.1647, + "step": 25641 + }, + { + "epoch": 0.6488852898752435, + "grad_norm": 3.3812928199768066, + "learning_rate": 2.795634049905761e-06, + "loss": 0.1371, + "step": 25642 + }, + { + "epoch": 0.6489105954399372, + "grad_norm": 4.610067367553711, + "learning_rate": 2.7952736626684916e-06, + "loss": 0.17, + "step": 25643 + }, + { + "epoch": 0.6489359010046309, + "grad_norm": 4.508160591125488, + "learning_rate": 2.794913289649091e-06, + "loss": 0.1324, + "step": 25644 + }, + { + "epoch": 0.6489612065693245, + "grad_norm": 9.096700668334961, + "learning_rate": 2.7945529308498858e-06, + "loss": 0.1268, + "step": 25645 + }, + { + "epoch": 0.6489865121340183, + "grad_norm": 4.399219512939453, + "learning_rate": 2.7941925862731968e-06, + "loss": 0.1282, + "step": 25646 + }, + { + "epoch": 0.649011817698712, + "grad_norm": 8.778666496276855, + "learning_rate": 2.7938322559213534e-06, + "loss": 0.3093, + "step": 25647 + }, + { + "epoch": 0.6490371232634056, + "grad_norm": 21.12805938720703, + "learning_rate": 2.7934719397966713e-06, + "loss": 0.1883, + "step": 25648 + }, + { + "epoch": 0.6490624288280993, + "grad_norm": 6.302126884460449, + "learning_rate": 2.79311163790148e-06, + "loss": 0.1709, + "step": 25649 + }, + { + "epoch": 0.649087734392793, + "grad_norm": 5.726040363311768, + "learning_rate": 2.7927513502380998e-06, + "loss": 0.1714, + "step": 25650 + }, + { + "epoch": 0.6491130399574867, + "grad_norm": 6.611475944519043, + "learning_rate": 2.7923910768088593e-06, + "loss": 0.1764, + "step": 25651 + }, + { + "epoch": 0.6491383455221803, + "grad_norm": 3.145660400390625, + "learning_rate": 2.7920308176160737e-06, + "loss": 0.109, + "step": 25652 + }, + { + "epoch": 0.649163651086874, + "grad_norm": 5.686239242553711, + "learning_rate": 2.791670572662073e-06, + "loss": 0.2804, + "step": 25653 + }, + { + "epoch": 0.6491889566515677, + "grad_norm": 6.88193941116333, + "learning_rate": 2.7913103419491762e-06, + "loss": 0.1332, + "step": 25654 + }, + { + "epoch": 0.6492142622162613, + "grad_norm": 9.84480094909668, + "learning_rate": 2.7909501254797123e-06, + "loss": 0.1286, + "step": 25655 + }, + { + "epoch": 0.649239567780955, + "grad_norm": 4.107435703277588, + "learning_rate": 2.7905899232559954e-06, + "loss": 0.1597, + "step": 25656 + }, + { + "epoch": 0.6492648733456488, + "grad_norm": 4.333511829376221, + "learning_rate": 2.790229735280355e-06, + "loss": 0.153, + "step": 25657 + }, + { + "epoch": 0.6492901789103424, + "grad_norm": 4.791441440582275, + "learning_rate": 2.789869561555111e-06, + "loss": 0.1009, + "step": 25658 + }, + { + "epoch": 0.6493154844750361, + "grad_norm": 3.2894515991210938, + "learning_rate": 2.7895094020825884e-06, + "loss": 0.1433, + "step": 25659 + }, + { + "epoch": 0.6493407900397298, + "grad_norm": 10.16649055480957, + "learning_rate": 2.789149256865109e-06, + "loss": 0.2173, + "step": 25660 + }, + { + "epoch": 0.6493660956044234, + "grad_norm": 4.093959808349609, + "learning_rate": 2.7887891259049943e-06, + "loss": 0.1229, + "step": 25661 + }, + { + "epoch": 0.6493914011691171, + "grad_norm": 5.184407711029053, + "learning_rate": 2.788429009204565e-06, + "loss": 0.1813, + "step": 25662 + }, + { + "epoch": 0.6494167067338108, + "grad_norm": 3.351835250854492, + "learning_rate": 2.788068906766148e-06, + "loss": 0.1904, + "step": 25663 + }, + { + "epoch": 0.6494420122985044, + "grad_norm": 2.935478687286377, + "learning_rate": 2.787708818592063e-06, + "loss": 0.1477, + "step": 25664 + }, + { + "epoch": 0.6494673178631981, + "grad_norm": 4.53697395324707, + "learning_rate": 2.7873487446846324e-06, + "loss": 0.1911, + "step": 25665 + }, + { + "epoch": 0.6494926234278918, + "grad_norm": 3.6299262046813965, + "learning_rate": 2.786988685046179e-06, + "loss": 0.1386, + "step": 25666 + }, + { + "epoch": 0.6495179289925854, + "grad_norm": 4.783541679382324, + "learning_rate": 2.786628639679021e-06, + "loss": 0.224, + "step": 25667 + }, + { + "epoch": 0.6495432345572791, + "grad_norm": 4.1838274002075195, + "learning_rate": 2.7862686085854852e-06, + "loss": 0.2022, + "step": 25668 + }, + { + "epoch": 0.6495685401219728, + "grad_norm": 4.465581893920898, + "learning_rate": 2.785908591767892e-06, + "loss": 0.1797, + "step": 25669 + }, + { + "epoch": 0.6495938456866664, + "grad_norm": 6.006769180297852, + "learning_rate": 2.7855485892285615e-06, + "loss": 0.1494, + "step": 25670 + }, + { + "epoch": 0.6496191512513602, + "grad_norm": 8.704866409301758, + "learning_rate": 2.785188600969815e-06, + "loss": 0.2372, + "step": 25671 + }, + { + "epoch": 0.6496444568160539, + "grad_norm": 13.579673767089844, + "learning_rate": 2.784828626993976e-06, + "loss": 0.2647, + "step": 25672 + }, + { + "epoch": 0.6496697623807475, + "grad_norm": 5.903601169586182, + "learning_rate": 2.7844686673033667e-06, + "loss": 0.2281, + "step": 25673 + }, + { + "epoch": 0.6496950679454412, + "grad_norm": 5.273555278778076, + "learning_rate": 2.784108721900306e-06, + "loss": 0.1884, + "step": 25674 + }, + { + "epoch": 0.6497203735101349, + "grad_norm": 6.908062934875488, + "learning_rate": 2.783748790787114e-06, + "loss": 0.2124, + "step": 25675 + }, + { + "epoch": 0.6497456790748286, + "grad_norm": 4.7094597816467285, + "learning_rate": 2.7833888739661165e-06, + "loss": 0.1245, + "step": 25676 + }, + { + "epoch": 0.6497709846395222, + "grad_norm": 3.293583393096924, + "learning_rate": 2.7830289714396288e-06, + "loss": 0.1626, + "step": 25677 + }, + { + "epoch": 0.6497962902042159, + "grad_norm": 4.160540580749512, + "learning_rate": 2.782669083209979e-06, + "loss": 0.1668, + "step": 25678 + }, + { + "epoch": 0.6498215957689096, + "grad_norm": 13.212625503540039, + "learning_rate": 2.78230920927948e-06, + "loss": 0.2108, + "step": 25679 + }, + { + "epoch": 0.6498469013336032, + "grad_norm": 4.28712272644043, + "learning_rate": 2.7819493496504592e-06, + "loss": 0.1464, + "step": 25680 + }, + { + "epoch": 0.649872206898297, + "grad_norm": 6.440428733825684, + "learning_rate": 2.7815895043252317e-06, + "loss": 0.1365, + "step": 25681 + }, + { + "epoch": 0.6498975124629907, + "grad_norm": 5.255563735961914, + "learning_rate": 2.781229673306125e-06, + "loss": 0.1427, + "step": 25682 + }, + { + "epoch": 0.6499228180276843, + "grad_norm": 3.464775800704956, + "learning_rate": 2.7808698565954513e-06, + "loss": 0.1396, + "step": 25683 + }, + { + "epoch": 0.649948123592378, + "grad_norm": 5.040940284729004, + "learning_rate": 2.7805100541955364e-06, + "loss": 0.207, + "step": 25684 + }, + { + "epoch": 0.6499734291570717, + "grad_norm": 3.3771703243255615, + "learning_rate": 2.7801502661086988e-06, + "loss": 0.1145, + "step": 25685 + }, + { + "epoch": 0.6499987347217653, + "grad_norm": 3.5816080570220947, + "learning_rate": 2.779790492337262e-06, + "loss": 0.1495, + "step": 25686 + }, + { + "epoch": 0.650024040286459, + "grad_norm": 5.714028835296631, + "learning_rate": 2.77943073288354e-06, + "loss": 0.1092, + "step": 25687 + }, + { + "epoch": 0.6500493458511527, + "grad_norm": 3.862131357192993, + "learning_rate": 2.779070987749858e-06, + "loss": 0.18, + "step": 25688 + }, + { + "epoch": 0.6500746514158463, + "grad_norm": 5.45449686050415, + "learning_rate": 2.7787112569385323e-06, + "loss": 0.1734, + "step": 25689 + }, + { + "epoch": 0.65009995698054, + "grad_norm": 5.234785079956055, + "learning_rate": 2.7783515404518856e-06, + "loss": 0.1461, + "step": 25690 + }, + { + "epoch": 0.6501252625452337, + "grad_norm": 7.231147289276123, + "learning_rate": 2.7779918382922375e-06, + "loss": 0.2114, + "step": 25691 + }, + { + "epoch": 0.6501505681099273, + "grad_norm": 5.011271953582764, + "learning_rate": 2.7776321504619064e-06, + "loss": 0.1066, + "step": 25692 + }, + { + "epoch": 0.650175873674621, + "grad_norm": 13.167655944824219, + "learning_rate": 2.777272476963212e-06, + "loss": 0.1932, + "step": 25693 + }, + { + "epoch": 0.6502011792393148, + "grad_norm": 3.502004384994507, + "learning_rate": 2.7769128177984724e-06, + "loss": 0.0671, + "step": 25694 + }, + { + "epoch": 0.6502264848040084, + "grad_norm": 2.8244128227233887, + "learning_rate": 2.77655317297001e-06, + "loss": 0.1734, + "step": 25695 + }, + { + "epoch": 0.6502517903687021, + "grad_norm": 7.530946254730225, + "learning_rate": 2.776193542480143e-06, + "loss": 0.2439, + "step": 25696 + }, + { + "epoch": 0.6502770959333958, + "grad_norm": 5.209872722625732, + "learning_rate": 2.7758339263311897e-06, + "loss": 0.1107, + "step": 25697 + }, + { + "epoch": 0.6503024014980894, + "grad_norm": 4.1048054695129395, + "learning_rate": 2.775474324525468e-06, + "loss": 0.1814, + "step": 25698 + }, + { + "epoch": 0.6503277070627831, + "grad_norm": 4.509184837341309, + "learning_rate": 2.775114737065301e-06, + "loss": 0.0871, + "step": 25699 + }, + { + "epoch": 0.6503530126274768, + "grad_norm": 4.179596424102783, + "learning_rate": 2.7747551639530046e-06, + "loss": 0.1773, + "step": 25700 + }, + { + "epoch": 0.6503783181921705, + "grad_norm": 7.6347503662109375, + "learning_rate": 2.774395605190898e-06, + "loss": 0.2373, + "step": 25701 + }, + { + "epoch": 0.6504036237568641, + "grad_norm": 4.0790581703186035, + "learning_rate": 2.774036060781299e-06, + "loss": 0.152, + "step": 25702 + }, + { + "epoch": 0.6504289293215578, + "grad_norm": 7.844985485076904, + "learning_rate": 2.7736765307265285e-06, + "loss": 0.2528, + "step": 25703 + }, + { + "epoch": 0.6504542348862515, + "grad_norm": 23.296445846557617, + "learning_rate": 2.7733170150289048e-06, + "loss": 0.2395, + "step": 25704 + }, + { + "epoch": 0.6504795404509451, + "grad_norm": 4.85202169418335, + "learning_rate": 2.7729575136907443e-06, + "loss": 0.1133, + "step": 25705 + }, + { + "epoch": 0.6505048460156388, + "grad_norm": 3.8964364528656006, + "learning_rate": 2.7725980267143653e-06, + "loss": 0.172, + "step": 25706 + }, + { + "epoch": 0.6505301515803326, + "grad_norm": 3.2492268085479736, + "learning_rate": 2.7722385541020893e-06, + "loss": 0.1492, + "step": 25707 + }, + { + "epoch": 0.6505554571450262, + "grad_norm": 4.2113542556762695, + "learning_rate": 2.7718790958562323e-06, + "loss": 0.1388, + "step": 25708 + }, + { + "epoch": 0.6505807627097199, + "grad_norm": 3.85636043548584, + "learning_rate": 2.771519651979113e-06, + "loss": 0.1427, + "step": 25709 + }, + { + "epoch": 0.6506060682744136, + "grad_norm": 5.504160404205322, + "learning_rate": 2.771160222473046e-06, + "loss": 0.1838, + "step": 25710 + }, + { + "epoch": 0.6506313738391072, + "grad_norm": 6.5075483322143555, + "learning_rate": 2.7708008073403546e-06, + "loss": 0.1573, + "step": 25711 + }, + { + "epoch": 0.6506566794038009, + "grad_norm": 5.403885841369629, + "learning_rate": 2.770441406583353e-06, + "loss": 0.2217, + "step": 25712 + }, + { + "epoch": 0.6506819849684946, + "grad_norm": 10.515783309936523, + "learning_rate": 2.770082020204363e-06, + "loss": 0.2146, + "step": 25713 + }, + { + "epoch": 0.6507072905331882, + "grad_norm": 31.286231994628906, + "learning_rate": 2.7697226482056955e-06, + "loss": 0.1372, + "step": 25714 + }, + { + "epoch": 0.6507325960978819, + "grad_norm": 6.177124977111816, + "learning_rate": 2.769363290589674e-06, + "loss": 0.1912, + "step": 25715 + }, + { + "epoch": 0.6507579016625756, + "grad_norm": 4.3360700607299805, + "learning_rate": 2.769003947358612e-06, + "loss": 0.1597, + "step": 25716 + }, + { + "epoch": 0.6507832072272692, + "grad_norm": 5.210893630981445, + "learning_rate": 2.768644618514831e-06, + "loss": 0.1008, + "step": 25717 + }, + { + "epoch": 0.650808512791963, + "grad_norm": 8.884003639221191, + "learning_rate": 2.768285304060643e-06, + "loss": 0.1624, + "step": 25718 + }, + { + "epoch": 0.6508338183566567, + "grad_norm": 3.16219162940979, + "learning_rate": 2.76792600399837e-06, + "loss": 0.1319, + "step": 25719 + }, + { + "epoch": 0.6508591239213503, + "grad_norm": 3.3739383220672607, + "learning_rate": 2.7675667183303244e-06, + "loss": 0.1657, + "step": 25720 + }, + { + "epoch": 0.650884429486044, + "grad_norm": 6.3865251541137695, + "learning_rate": 2.7672074470588285e-06, + "loss": 0.1625, + "step": 25721 + }, + { + "epoch": 0.6509097350507377, + "grad_norm": 9.38456916809082, + "learning_rate": 2.7668481901861965e-06, + "loss": 0.1829, + "step": 25722 + }, + { + "epoch": 0.6509350406154313, + "grad_norm": 6.882224082946777, + "learning_rate": 2.7664889477147445e-06, + "loss": 0.2181, + "step": 25723 + }, + { + "epoch": 0.650960346180125, + "grad_norm": 4.16355037689209, + "learning_rate": 2.76612971964679e-06, + "loss": 0.1571, + "step": 25724 + }, + { + "epoch": 0.6509856517448187, + "grad_norm": 3.6895456314086914, + "learning_rate": 2.7657705059846473e-06, + "loss": 0.1194, + "step": 25725 + }, + { + "epoch": 0.6510109573095124, + "grad_norm": 5.941012859344482, + "learning_rate": 2.7654113067306374e-06, + "loss": 0.1281, + "step": 25726 + }, + { + "epoch": 0.651036262874206, + "grad_norm": 6.219649314880371, + "learning_rate": 2.765052121887074e-06, + "loss": 0.1635, + "step": 25727 + }, + { + "epoch": 0.6510615684388997, + "grad_norm": 6.654707431793213, + "learning_rate": 2.7646929514562737e-06, + "loss": 0.206, + "step": 25728 + }, + { + "epoch": 0.6510868740035934, + "grad_norm": 3.8125057220458984, + "learning_rate": 2.7643337954405514e-06, + "loss": 0.1289, + "step": 25729 + }, + { + "epoch": 0.651112179568287, + "grad_norm": 19.050668716430664, + "learning_rate": 2.7639746538422273e-06, + "loss": 0.2276, + "step": 25730 + }, + { + "epoch": 0.6511374851329808, + "grad_norm": 9.860639572143555, + "learning_rate": 2.763615526663612e-06, + "loss": 0.1995, + "step": 25731 + }, + { + "epoch": 0.6511627906976745, + "grad_norm": 5.191473960876465, + "learning_rate": 2.763256413907026e-06, + "loss": 0.2001, + "step": 25732 + }, + { + "epoch": 0.6511880962623681, + "grad_norm": 4.037835597991943, + "learning_rate": 2.762897315574781e-06, + "loss": 0.1295, + "step": 25733 + }, + { + "epoch": 0.6512134018270618, + "grad_norm": 7.977625846862793, + "learning_rate": 2.7625382316691964e-06, + "loss": 0.2015, + "step": 25734 + }, + { + "epoch": 0.6512387073917555, + "grad_norm": 12.421126365661621, + "learning_rate": 2.7621791621925873e-06, + "loss": 0.2305, + "step": 25735 + }, + { + "epoch": 0.6512640129564491, + "grad_norm": 7.587339878082275, + "learning_rate": 2.7618201071472683e-06, + "loss": 0.1687, + "step": 25736 + }, + { + "epoch": 0.6512893185211428, + "grad_norm": 13.066628456115723, + "learning_rate": 2.7614610665355533e-06, + "loss": 0.2788, + "step": 25737 + }, + { + "epoch": 0.6513146240858365, + "grad_norm": 11.66067886352539, + "learning_rate": 2.7611020403597615e-06, + "loss": 0.184, + "step": 25738 + }, + { + "epoch": 0.6513399296505301, + "grad_norm": 4.569878578186035, + "learning_rate": 2.7607430286222057e-06, + "loss": 0.1025, + "step": 25739 + }, + { + "epoch": 0.6513652352152238, + "grad_norm": 4.031129360198975, + "learning_rate": 2.7603840313252023e-06, + "loss": 0.178, + "step": 25740 + }, + { + "epoch": 0.6513905407799175, + "grad_norm": 6.861173152923584, + "learning_rate": 2.7600250484710635e-06, + "loss": 0.2833, + "step": 25741 + }, + { + "epoch": 0.6514158463446111, + "grad_norm": 3.6705849170684814, + "learning_rate": 2.7596660800621076e-06, + "loss": 0.169, + "step": 25742 + }, + { + "epoch": 0.6514411519093048, + "grad_norm": 5.463837146759033, + "learning_rate": 2.7593071261006476e-06, + "loss": 0.1248, + "step": 25743 + }, + { + "epoch": 0.6514664574739986, + "grad_norm": 6.284722805023193, + "learning_rate": 2.7589481865890026e-06, + "loss": 0.1909, + "step": 25744 + }, + { + "epoch": 0.6514917630386922, + "grad_norm": 5.059778213500977, + "learning_rate": 2.7585892615294803e-06, + "loss": 0.1743, + "step": 25745 + }, + { + "epoch": 0.6515170686033859, + "grad_norm": 4.140372276306152, + "learning_rate": 2.7582303509244e-06, + "loss": 0.1745, + "step": 25746 + }, + { + "epoch": 0.6515423741680796, + "grad_norm": 3.840900182723999, + "learning_rate": 2.7578714547760743e-06, + "loss": 0.1469, + "step": 25747 + }, + { + "epoch": 0.6515676797327732, + "grad_norm": 4.7356367111206055, + "learning_rate": 2.7575125730868218e-06, + "loss": 0.1522, + "step": 25748 + }, + { + "epoch": 0.6515929852974669, + "grad_norm": 4.44301176071167, + "learning_rate": 2.75715370585895e-06, + "loss": 0.1594, + "step": 25749 + }, + { + "epoch": 0.6516182908621606, + "grad_norm": 7.363624572753906, + "learning_rate": 2.7567948530947786e-06, + "loss": 0.1702, + "step": 25750 + }, + { + "epoch": 0.6516435964268542, + "grad_norm": 3.465028762817383, + "learning_rate": 2.7564360147966196e-06, + "loss": 0.089, + "step": 25751 + }, + { + "epoch": 0.6516689019915479, + "grad_norm": 3.3662984371185303, + "learning_rate": 2.7560771909667854e-06, + "loss": 0.137, + "step": 25752 + }, + { + "epoch": 0.6516942075562416, + "grad_norm": 3.309026002883911, + "learning_rate": 2.7557183816075943e-06, + "loss": 0.1062, + "step": 25753 + }, + { + "epoch": 0.6517195131209353, + "grad_norm": 4.454338550567627, + "learning_rate": 2.755359586721358e-06, + "loss": 0.1377, + "step": 25754 + }, + { + "epoch": 0.651744818685629, + "grad_norm": 11.779751777648926, + "learning_rate": 2.7550008063103893e-06, + "loss": 0.2499, + "step": 25755 + }, + { + "epoch": 0.6517701242503227, + "grad_norm": 7.046524524688721, + "learning_rate": 2.754642040377002e-06, + "loss": 0.1842, + "step": 25756 + }, + { + "epoch": 0.6517954298150164, + "grad_norm": 6.115281581878662, + "learning_rate": 2.7542832889235115e-06, + "loss": 0.1867, + "step": 25757 + }, + { + "epoch": 0.65182073537971, + "grad_norm": 5.919625282287598, + "learning_rate": 2.7539245519522313e-06, + "loss": 0.1515, + "step": 25758 + }, + { + "epoch": 0.6518460409444037, + "grad_norm": 7.666752815246582, + "learning_rate": 2.753565829465473e-06, + "loss": 0.1929, + "step": 25759 + }, + { + "epoch": 0.6518713465090974, + "grad_norm": 4.771744728088379, + "learning_rate": 2.7532071214655497e-06, + "loss": 0.1754, + "step": 25760 + }, + { + "epoch": 0.651896652073791, + "grad_norm": 19.115243911743164, + "learning_rate": 2.752848427954779e-06, + "loss": 0.2737, + "step": 25761 + }, + { + "epoch": 0.6519219576384847, + "grad_norm": 12.817566871643066, + "learning_rate": 2.7524897489354674e-06, + "loss": 0.2835, + "step": 25762 + }, + { + "epoch": 0.6519472632031784, + "grad_norm": 6.480655193328857, + "learning_rate": 2.7521310844099326e-06, + "loss": 0.1347, + "step": 25763 + }, + { + "epoch": 0.651972568767872, + "grad_norm": 3.893122434616089, + "learning_rate": 2.751772434380485e-06, + "loss": 0.1204, + "step": 25764 + }, + { + "epoch": 0.6519978743325657, + "grad_norm": 4.530497074127197, + "learning_rate": 2.7514137988494405e-06, + "loss": 0.1057, + "step": 25765 + }, + { + "epoch": 0.6520231798972594, + "grad_norm": 7.144412517547607, + "learning_rate": 2.75105517781911e-06, + "loss": 0.1814, + "step": 25766 + }, + { + "epoch": 0.652048485461953, + "grad_norm": 5.007693290710449, + "learning_rate": 2.7506965712918064e-06, + "loss": 0.1457, + "step": 25767 + }, + { + "epoch": 0.6520737910266468, + "grad_norm": 3.09771990776062, + "learning_rate": 2.7503379792698405e-06, + "loss": 0.1273, + "step": 25768 + }, + { + "epoch": 0.6520990965913405, + "grad_norm": 4.139881610870361, + "learning_rate": 2.7499794017555276e-06, + "loss": 0.1309, + "step": 25769 + }, + { + "epoch": 0.6521244021560341, + "grad_norm": 7.869548320770264, + "learning_rate": 2.7496208387511796e-06, + "loss": 0.2327, + "step": 25770 + }, + { + "epoch": 0.6521497077207278, + "grad_norm": 4.346001148223877, + "learning_rate": 2.749262290259108e-06, + "loss": 0.0961, + "step": 25771 + }, + { + "epoch": 0.6521750132854215, + "grad_norm": 4.634462833404541, + "learning_rate": 2.7489037562816233e-06, + "loss": 0.1838, + "step": 25772 + }, + { + "epoch": 0.6522003188501151, + "grad_norm": 12.601767539978027, + "learning_rate": 2.748545236821042e-06, + "loss": 0.1927, + "step": 25773 + }, + { + "epoch": 0.6522256244148088, + "grad_norm": 5.574536323547363, + "learning_rate": 2.7481867318796717e-06, + "loss": 0.1458, + "step": 25774 + }, + { + "epoch": 0.6522509299795025, + "grad_norm": 7.230153560638428, + "learning_rate": 2.74782824145983e-06, + "loss": 0.2226, + "step": 25775 + }, + { + "epoch": 0.6522762355441961, + "grad_norm": 5.692723751068115, + "learning_rate": 2.7474697655638206e-06, + "loss": 0.2107, + "step": 25776 + }, + { + "epoch": 0.6523015411088898, + "grad_norm": 5.836701393127441, + "learning_rate": 2.7471113041939623e-06, + "loss": 0.1101, + "step": 25777 + }, + { + "epoch": 0.6523268466735835, + "grad_norm": 4.9185471534729, + "learning_rate": 2.7467528573525636e-06, + "loss": 0.1563, + "step": 25778 + }, + { + "epoch": 0.6523521522382772, + "grad_norm": 4.965437412261963, + "learning_rate": 2.746394425041937e-06, + "loss": 0.1601, + "step": 25779 + }, + { + "epoch": 0.6523774578029709, + "grad_norm": 14.693989753723145, + "learning_rate": 2.7460360072643915e-06, + "loss": 0.1489, + "step": 25780 + }, + { + "epoch": 0.6524027633676646, + "grad_norm": 2.9324827194213867, + "learning_rate": 2.745677604022242e-06, + "loss": 0.0755, + "step": 25781 + }, + { + "epoch": 0.6524280689323583, + "grad_norm": 3.369560480117798, + "learning_rate": 2.745319215317799e-06, + "loss": 0.1332, + "step": 25782 + }, + { + "epoch": 0.6524533744970519, + "grad_norm": 9.755454063415527, + "learning_rate": 2.744960841153373e-06, + "loss": 0.3021, + "step": 25783 + }, + { + "epoch": 0.6524786800617456, + "grad_norm": 12.770834922790527, + "learning_rate": 2.7446024815312726e-06, + "loss": 0.2151, + "step": 25784 + }, + { + "epoch": 0.6525039856264393, + "grad_norm": 4.2411980628967285, + "learning_rate": 2.744244136453813e-06, + "loss": 0.1978, + "step": 25785 + }, + { + "epoch": 0.6525292911911329, + "grad_norm": 8.08303165435791, + "learning_rate": 2.743885805923304e-06, + "loss": 0.1617, + "step": 25786 + }, + { + "epoch": 0.6525545967558266, + "grad_norm": 3.7109718322753906, + "learning_rate": 2.7435274899420537e-06, + "loss": 0.1292, + "step": 25787 + }, + { + "epoch": 0.6525799023205203, + "grad_norm": 2.97812819480896, + "learning_rate": 2.743169188512379e-06, + "loss": 0.1589, + "step": 25788 + }, + { + "epoch": 0.6526052078852139, + "grad_norm": 4.804131507873535, + "learning_rate": 2.7428109016365824e-06, + "loss": 0.1165, + "step": 25789 + }, + { + "epoch": 0.6526305134499076, + "grad_norm": 2.665510892868042, + "learning_rate": 2.7424526293169796e-06, + "loss": 0.1391, + "step": 25790 + }, + { + "epoch": 0.6526558190146013, + "grad_norm": 6.635622978210449, + "learning_rate": 2.742094371555879e-06, + "loss": 0.2344, + "step": 25791 + }, + { + "epoch": 0.652681124579295, + "grad_norm": 4.870622158050537, + "learning_rate": 2.7417361283555947e-06, + "loss": 0.1579, + "step": 25792 + }, + { + "epoch": 0.6527064301439887, + "grad_norm": 5.139866828918457, + "learning_rate": 2.7413778997184306e-06, + "loss": 0.1724, + "step": 25793 + }, + { + "epoch": 0.6527317357086824, + "grad_norm": 5.593037128448486, + "learning_rate": 2.7410196856467027e-06, + "loss": 0.1186, + "step": 25794 + }, + { + "epoch": 0.652757041273376, + "grad_norm": 3.8964552879333496, + "learning_rate": 2.7406614861427163e-06, + "loss": 0.1482, + "step": 25795 + }, + { + "epoch": 0.6527823468380697, + "grad_norm": 13.359302520751953, + "learning_rate": 2.7403033012087854e-06, + "loss": 0.1306, + "step": 25796 + }, + { + "epoch": 0.6528076524027634, + "grad_norm": 5.4441399574279785, + "learning_rate": 2.739945130847218e-06, + "loss": 0.1496, + "step": 25797 + }, + { + "epoch": 0.652832957967457, + "grad_norm": 5.933767318725586, + "learning_rate": 2.7395869750603243e-06, + "loss": 0.1637, + "step": 25798 + }, + { + "epoch": 0.6528582635321507, + "grad_norm": 3.88596510887146, + "learning_rate": 2.7392288338504124e-06, + "loss": 0.1507, + "step": 25799 + }, + { + "epoch": 0.6528835690968444, + "grad_norm": 4.187816143035889, + "learning_rate": 2.7388707072197943e-06, + "loss": 0.1758, + "step": 25800 + }, + { + "epoch": 0.652908874661538, + "grad_norm": 5.744651794433594, + "learning_rate": 2.738512595170779e-06, + "loss": 0.1653, + "step": 25801 + }, + { + "epoch": 0.6529341802262317, + "grad_norm": 3.9765148162841797, + "learning_rate": 2.738154497705675e-06, + "loss": 0.1681, + "step": 25802 + }, + { + "epoch": 0.6529594857909254, + "grad_norm": 3.239166498184204, + "learning_rate": 2.73779641482679e-06, + "loss": 0.1528, + "step": 25803 + }, + { + "epoch": 0.6529847913556192, + "grad_norm": 16.938383102416992, + "learning_rate": 2.737438346536437e-06, + "loss": 0.249, + "step": 25804 + }, + { + "epoch": 0.6530100969203128, + "grad_norm": 5.11165189743042, + "learning_rate": 2.737080292836921e-06, + "loss": 0.1878, + "step": 25805 + }, + { + "epoch": 0.6530354024850065, + "grad_norm": 4.190622806549072, + "learning_rate": 2.7367222537305584e-06, + "loss": 0.1654, + "step": 25806 + }, + { + "epoch": 0.6530607080497002, + "grad_norm": 7.550195693969727, + "learning_rate": 2.736364229219648e-06, + "loss": 0.2295, + "step": 25807 + }, + { + "epoch": 0.6530860136143938, + "grad_norm": 8.945103645324707, + "learning_rate": 2.7360062193065056e-06, + "loss": 0.1853, + "step": 25808 + }, + { + "epoch": 0.6531113191790875, + "grad_norm": 4.468514442443848, + "learning_rate": 2.7356482239934377e-06, + "loss": 0.1696, + "step": 25809 + }, + { + "epoch": 0.6531366247437812, + "grad_norm": 6.562890529632568, + "learning_rate": 2.7352902432827546e-06, + "loss": 0.118, + "step": 25810 + }, + { + "epoch": 0.6531619303084748, + "grad_norm": 5.576529026031494, + "learning_rate": 2.73493227717676e-06, + "loss": 0.1214, + "step": 25811 + }, + { + "epoch": 0.6531872358731685, + "grad_norm": 2.8981595039367676, + "learning_rate": 2.7345743256777686e-06, + "loss": 0.1295, + "step": 25812 + }, + { + "epoch": 0.6532125414378622, + "grad_norm": 3.7632896900177, + "learning_rate": 2.7342163887880856e-06, + "loss": 0.1235, + "step": 25813 + }, + { + "epoch": 0.6532378470025558, + "grad_norm": 8.22849178314209, + "learning_rate": 2.73385846651002e-06, + "loss": 0.1936, + "step": 25814 + }, + { + "epoch": 0.6532631525672495, + "grad_norm": 5.252602577209473, + "learning_rate": 2.733500558845879e-06, + "loss": 0.1605, + "step": 25815 + }, + { + "epoch": 0.6532884581319433, + "grad_norm": 11.478116989135742, + "learning_rate": 2.7331426657979696e-06, + "loss": 0.2629, + "step": 25816 + }, + { + "epoch": 0.6533137636966369, + "grad_norm": 6.67824649810791, + "learning_rate": 2.7327847873686036e-06, + "loss": 0.1819, + "step": 25817 + }, + { + "epoch": 0.6533390692613306, + "grad_norm": 3.143040657043457, + "learning_rate": 2.7324269235600847e-06, + "loss": 0.0738, + "step": 25818 + }, + { + "epoch": 0.6533643748260243, + "grad_norm": 3.9423882961273193, + "learning_rate": 2.732069074374727e-06, + "loss": 0.1285, + "step": 25819 + }, + { + "epoch": 0.6533896803907179, + "grad_norm": 8.68952465057373, + "learning_rate": 2.7317112398148294e-06, + "loss": 0.1926, + "step": 25820 + }, + { + "epoch": 0.6534149859554116, + "grad_norm": 5.738175868988037, + "learning_rate": 2.7313534198827065e-06, + "loss": 0.1939, + "step": 25821 + }, + { + "epoch": 0.6534402915201053, + "grad_norm": 4.445847988128662, + "learning_rate": 2.7309956145806616e-06, + "loss": 0.1723, + "step": 25822 + }, + { + "epoch": 0.6534655970847989, + "grad_norm": 5.0381646156311035, + "learning_rate": 2.7306378239110075e-06, + "loss": 0.1537, + "step": 25823 + }, + { + "epoch": 0.6534909026494926, + "grad_norm": 2.69880747795105, + "learning_rate": 2.730280047876044e-06, + "loss": 0.1316, + "step": 25824 + }, + { + "epoch": 0.6535162082141863, + "grad_norm": 8.517385482788086, + "learning_rate": 2.7299222864780845e-06, + "loss": 0.2367, + "step": 25825 + }, + { + "epoch": 0.6535415137788799, + "grad_norm": 2.088261127471924, + "learning_rate": 2.7295645397194316e-06, + "loss": 0.1009, + "step": 25826 + }, + { + "epoch": 0.6535668193435736, + "grad_norm": 4.167106628417969, + "learning_rate": 2.729206807602397e-06, + "loss": 0.0925, + "step": 25827 + }, + { + "epoch": 0.6535921249082673, + "grad_norm": 5.098735809326172, + "learning_rate": 2.728849090129285e-06, + "loss": 0.1531, + "step": 25828 + }, + { + "epoch": 0.6536174304729611, + "grad_norm": 4.584334373474121, + "learning_rate": 2.7284913873024026e-06, + "loss": 0.1616, + "step": 25829 + }, + { + "epoch": 0.6536427360376547, + "grad_norm": 3.4357807636260986, + "learning_rate": 2.728133699124055e-06, + "loss": 0.1473, + "step": 25830 + }, + { + "epoch": 0.6536680416023484, + "grad_norm": 4.346035480499268, + "learning_rate": 2.7277760255965523e-06, + "loss": 0.1178, + "step": 25831 + }, + { + "epoch": 0.6536933471670421, + "grad_norm": 6.118780612945557, + "learning_rate": 2.7274183667222e-06, + "loss": 0.1856, + "step": 25832 + }, + { + "epoch": 0.6537186527317357, + "grad_norm": 3.397254228591919, + "learning_rate": 2.727060722503303e-06, + "loss": 0.1353, + "step": 25833 + }, + { + "epoch": 0.6537439582964294, + "grad_norm": 14.950594902038574, + "learning_rate": 2.7267030929421677e-06, + "loss": 0.2454, + "step": 25834 + }, + { + "epoch": 0.6537692638611231, + "grad_norm": 5.393117904663086, + "learning_rate": 2.7263454780411023e-06, + "loss": 0.1543, + "step": 25835 + }, + { + "epoch": 0.6537945694258167, + "grad_norm": 6.742541790008545, + "learning_rate": 2.7259878778024127e-06, + "loss": 0.1996, + "step": 25836 + }, + { + "epoch": 0.6538198749905104, + "grad_norm": 2.921461343765259, + "learning_rate": 2.725630292228404e-06, + "loss": 0.094, + "step": 25837 + }, + { + "epoch": 0.6538451805552041, + "grad_norm": 4.146360874176025, + "learning_rate": 2.7252727213213804e-06, + "loss": 0.1404, + "step": 25838 + }, + { + "epoch": 0.6538704861198977, + "grad_norm": 9.423287391662598, + "learning_rate": 2.7249151650836515e-06, + "loss": 0.1278, + "step": 25839 + }, + { + "epoch": 0.6538957916845914, + "grad_norm": 11.214649200439453, + "learning_rate": 2.724557623517522e-06, + "loss": 0.153, + "step": 25840 + }, + { + "epoch": 0.6539210972492852, + "grad_norm": 4.134604454040527, + "learning_rate": 2.7242000966252967e-06, + "loss": 0.1211, + "step": 25841 + }, + { + "epoch": 0.6539464028139788, + "grad_norm": 3.6875340938568115, + "learning_rate": 2.7238425844092808e-06, + "loss": 0.1001, + "step": 25842 + }, + { + "epoch": 0.6539717083786725, + "grad_norm": 2.5620200634002686, + "learning_rate": 2.7234850868717817e-06, + "loss": 0.108, + "step": 25843 + }, + { + "epoch": 0.6539970139433662, + "grad_norm": 5.744858264923096, + "learning_rate": 2.723127604015104e-06, + "loss": 0.1958, + "step": 25844 + }, + { + "epoch": 0.6540223195080598, + "grad_norm": 2.6937472820281982, + "learning_rate": 2.7227701358415524e-06, + "loss": 0.0838, + "step": 25845 + }, + { + "epoch": 0.6540476250727535, + "grad_norm": 4.45752477645874, + "learning_rate": 2.7224126823534334e-06, + "loss": 0.1941, + "step": 25846 + }, + { + "epoch": 0.6540729306374472, + "grad_norm": 5.386330604553223, + "learning_rate": 2.722055243553049e-06, + "loss": 0.1823, + "step": 25847 + }, + { + "epoch": 0.6540982362021408, + "grad_norm": 3.94134259223938, + "learning_rate": 2.7216978194427085e-06, + "loss": 0.1206, + "step": 25848 + }, + { + "epoch": 0.6541235417668345, + "grad_norm": 3.915234088897705, + "learning_rate": 2.721340410024713e-06, + "loss": 0.1504, + "step": 25849 + }, + { + "epoch": 0.6541488473315282, + "grad_norm": 7.301313877105713, + "learning_rate": 2.7209830153013732e-06, + "loss": 0.194, + "step": 25850 + }, + { + "epoch": 0.6541741528962218, + "grad_norm": 10.89565372467041, + "learning_rate": 2.720625635274986e-06, + "loss": 0.1847, + "step": 25851 + }, + { + "epoch": 0.6541994584609155, + "grad_norm": 6.4748616218566895, + "learning_rate": 2.720268269947862e-06, + "loss": 0.188, + "step": 25852 + }, + { + "epoch": 0.6542247640256093, + "grad_norm": 3.8827240467071533, + "learning_rate": 2.7199109193223027e-06, + "loss": 0.1408, + "step": 25853 + }, + { + "epoch": 0.654250069590303, + "grad_norm": 4.6899590492248535, + "learning_rate": 2.719553583400617e-06, + "loss": 0.1535, + "step": 25854 + }, + { + "epoch": 0.6542753751549966, + "grad_norm": 5.766194820404053, + "learning_rate": 2.719196262185102e-06, + "loss": 0.2304, + "step": 25855 + }, + { + "epoch": 0.6543006807196903, + "grad_norm": 2.177398204803467, + "learning_rate": 2.7188389556780686e-06, + "loss": 0.1036, + "step": 25856 + }, + { + "epoch": 0.654325986284384, + "grad_norm": 30.15359115600586, + "learning_rate": 2.7184816638818163e-06, + "loss": 0.1818, + "step": 25857 + }, + { + "epoch": 0.6543512918490776, + "grad_norm": 11.965214729309082, + "learning_rate": 2.718124386798655e-06, + "loss": 0.1575, + "step": 25858 + }, + { + "epoch": 0.6543765974137713, + "grad_norm": 11.154727935791016, + "learning_rate": 2.7177671244308812e-06, + "loss": 0.2433, + "step": 25859 + }, + { + "epoch": 0.654401902978465, + "grad_norm": 8.255058288574219, + "learning_rate": 2.7174098767808046e-06, + "loss": 0.1864, + "step": 25860 + }, + { + "epoch": 0.6544272085431586, + "grad_norm": 3.543257474899292, + "learning_rate": 2.717052643850725e-06, + "loss": 0.1341, + "step": 25861 + }, + { + "epoch": 0.6544525141078523, + "grad_norm": 10.878521919250488, + "learning_rate": 2.7166954256429514e-06, + "loss": 0.1563, + "step": 25862 + }, + { + "epoch": 0.654477819672546, + "grad_norm": 4.623835563659668, + "learning_rate": 2.7163382221597827e-06, + "loss": 0.1586, + "step": 25863 + }, + { + "epoch": 0.6545031252372396, + "grad_norm": 3.35363507270813, + "learning_rate": 2.715981033403525e-06, + "loss": 0.1437, + "step": 25864 + }, + { + "epoch": 0.6545284308019333, + "grad_norm": 9.494733810424805, + "learning_rate": 2.7156238593764783e-06, + "loss": 0.1905, + "step": 25865 + }, + { + "epoch": 0.6545537363666271, + "grad_norm": 3.4578123092651367, + "learning_rate": 2.715266700080951e-06, + "loss": 0.1437, + "step": 25866 + }, + { + "epoch": 0.6545790419313207, + "grad_norm": 5.266630172729492, + "learning_rate": 2.7149095555192427e-06, + "loss": 0.2089, + "step": 25867 + }, + { + "epoch": 0.6546043474960144, + "grad_norm": 7.099021911621094, + "learning_rate": 2.7145524256936585e-06, + "loss": 0.2149, + "step": 25868 + }, + { + "epoch": 0.6546296530607081, + "grad_norm": 23.4113712310791, + "learning_rate": 2.714195310606498e-06, + "loss": 0.2584, + "step": 25869 + }, + { + "epoch": 0.6546549586254017, + "grad_norm": 11.145234107971191, + "learning_rate": 2.7138382102600693e-06, + "loss": 0.1966, + "step": 25870 + }, + { + "epoch": 0.6546802641900954, + "grad_norm": 5.270648002624512, + "learning_rate": 2.7134811246566717e-06, + "loss": 0.1551, + "step": 25871 + }, + { + "epoch": 0.6547055697547891, + "grad_norm": 4.739552974700928, + "learning_rate": 2.7131240537986104e-06, + "loss": 0.1592, + "step": 25872 + }, + { + "epoch": 0.6547308753194827, + "grad_norm": 5.377246856689453, + "learning_rate": 2.7127669976881856e-06, + "loss": 0.1669, + "step": 25873 + }, + { + "epoch": 0.6547561808841764, + "grad_norm": 6.07297945022583, + "learning_rate": 2.7124099563276994e-06, + "loss": 0.1583, + "step": 25874 + }, + { + "epoch": 0.6547814864488701, + "grad_norm": 8.458623886108398, + "learning_rate": 2.7120529297194576e-06, + "loss": 0.1812, + "step": 25875 + }, + { + "epoch": 0.6548067920135637, + "grad_norm": 5.560353755950928, + "learning_rate": 2.7116959178657614e-06, + "loss": 0.1827, + "step": 25876 + }, + { + "epoch": 0.6548320975782574, + "grad_norm": 4.825232982635498, + "learning_rate": 2.7113389207689123e-06, + "loss": 0.1282, + "step": 25877 + }, + { + "epoch": 0.6548574031429512, + "grad_norm": 3.7606191635131836, + "learning_rate": 2.710981938431211e-06, + "loss": 0.1019, + "step": 25878 + }, + { + "epoch": 0.6548827087076448, + "grad_norm": 13.520539283752441, + "learning_rate": 2.710624970854963e-06, + "loss": 0.1225, + "step": 25879 + }, + { + "epoch": 0.6549080142723385, + "grad_norm": 9.424524307250977, + "learning_rate": 2.7102680180424668e-06, + "loss": 0.2158, + "step": 25880 + }, + { + "epoch": 0.6549333198370322, + "grad_norm": 5.413572788238525, + "learning_rate": 2.7099110799960304e-06, + "loss": 0.1744, + "step": 25881 + }, + { + "epoch": 0.6549586254017259, + "grad_norm": 3.338848114013672, + "learning_rate": 2.7095541567179472e-06, + "loss": 0.1937, + "step": 25882 + }, + { + "epoch": 0.6549839309664195, + "grad_norm": 4.261775970458984, + "learning_rate": 2.7091972482105254e-06, + "loss": 0.1244, + "step": 25883 + }, + { + "epoch": 0.6550092365311132, + "grad_norm": 4.984918594360352, + "learning_rate": 2.708840354476062e-06, + "loss": 0.1351, + "step": 25884 + }, + { + "epoch": 0.6550345420958069, + "grad_norm": 5.657817363739014, + "learning_rate": 2.7084834755168653e-06, + "loss": 0.2157, + "step": 25885 + }, + { + "epoch": 0.6550598476605005, + "grad_norm": 2.3952744007110596, + "learning_rate": 2.708126611335229e-06, + "loss": 0.1075, + "step": 25886 + }, + { + "epoch": 0.6550851532251942, + "grad_norm": 5.804569721221924, + "learning_rate": 2.707769761933459e-06, + "loss": 0.1505, + "step": 25887 + }, + { + "epoch": 0.6551104587898879, + "grad_norm": 10.595519065856934, + "learning_rate": 2.7074129273138538e-06, + "loss": 0.1199, + "step": 25888 + }, + { + "epoch": 0.6551357643545815, + "grad_norm": 10.536874771118164, + "learning_rate": 2.7070561074787206e-06, + "loss": 0.1925, + "step": 25889 + }, + { + "epoch": 0.6551610699192753, + "grad_norm": 9.331235885620117, + "learning_rate": 2.7066993024303513e-06, + "loss": 0.2812, + "step": 25890 + }, + { + "epoch": 0.655186375483969, + "grad_norm": 4.156769752502441, + "learning_rate": 2.706342512171054e-06, + "loss": 0.1653, + "step": 25891 + }, + { + "epoch": 0.6552116810486626, + "grad_norm": 3.801799774169922, + "learning_rate": 2.705985736703125e-06, + "loss": 0.1334, + "step": 25892 + }, + { + "epoch": 0.6552369866133563, + "grad_norm": 10.304028511047363, + "learning_rate": 2.705628976028869e-06, + "loss": 0.2427, + "step": 25893 + }, + { + "epoch": 0.65526229217805, + "grad_norm": 3.7734925746917725, + "learning_rate": 2.705272230150585e-06, + "loss": 0.1483, + "step": 25894 + }, + { + "epoch": 0.6552875977427436, + "grad_norm": 4.707178115844727, + "learning_rate": 2.704915499070574e-06, + "loss": 0.169, + "step": 25895 + }, + { + "epoch": 0.6553129033074373, + "grad_norm": 7.095508575439453, + "learning_rate": 2.704558782791134e-06, + "loss": 0.2023, + "step": 25896 + }, + { + "epoch": 0.655338208872131, + "grad_norm": 4.67740535736084, + "learning_rate": 2.704202081314569e-06, + "loss": 0.1469, + "step": 25897 + }, + { + "epoch": 0.6553635144368246, + "grad_norm": 4.195132255554199, + "learning_rate": 2.703845394643178e-06, + "loss": 0.1697, + "step": 25898 + }, + { + "epoch": 0.6553888200015183, + "grad_norm": 3.529447555541992, + "learning_rate": 2.703488722779261e-06, + "loss": 0.0803, + "step": 25899 + }, + { + "epoch": 0.655414125566212, + "grad_norm": 10.275225639343262, + "learning_rate": 2.703132065725118e-06, + "loss": 0.1622, + "step": 25900 + }, + { + "epoch": 0.6554394311309056, + "grad_norm": 4.529771327972412, + "learning_rate": 2.7027754234830473e-06, + "loss": 0.1545, + "step": 25901 + }, + { + "epoch": 0.6554647366955993, + "grad_norm": 4.043361186981201, + "learning_rate": 2.702418796055352e-06, + "loss": 0.111, + "step": 25902 + }, + { + "epoch": 0.6554900422602931, + "grad_norm": 3.1972973346710205, + "learning_rate": 2.7020621834443307e-06, + "loss": 0.1517, + "step": 25903 + }, + { + "epoch": 0.6555153478249867, + "grad_norm": 4.7163848876953125, + "learning_rate": 2.7017055856522833e-06, + "loss": 0.143, + "step": 25904 + }, + { + "epoch": 0.6555406533896804, + "grad_norm": 11.139660835266113, + "learning_rate": 2.701349002681507e-06, + "loss": 0.3164, + "step": 25905 + }, + { + "epoch": 0.6555659589543741, + "grad_norm": 11.04342269897461, + "learning_rate": 2.7009924345343045e-06, + "loss": 0.2515, + "step": 25906 + }, + { + "epoch": 0.6555912645190678, + "grad_norm": 2.2618985176086426, + "learning_rate": 2.700635881212975e-06, + "loss": 0.0945, + "step": 25907 + }, + { + "epoch": 0.6556165700837614, + "grad_norm": 7.069095611572266, + "learning_rate": 2.700279342719817e-06, + "loss": 0.1348, + "step": 25908 + }, + { + "epoch": 0.6556418756484551, + "grad_norm": 2.5910754203796387, + "learning_rate": 2.6999228190571274e-06, + "loss": 0.1369, + "step": 25909 + }, + { + "epoch": 0.6556671812131488, + "grad_norm": 3.2483794689178467, + "learning_rate": 2.6995663102272097e-06, + "loss": 0.1779, + "step": 25910 + }, + { + "epoch": 0.6556924867778424, + "grad_norm": 11.661528587341309, + "learning_rate": 2.699209816232361e-06, + "loss": 0.191, + "step": 25911 + }, + { + "epoch": 0.6557177923425361, + "grad_norm": 5.605495929718018, + "learning_rate": 2.6988533370748804e-06, + "loss": 0.1744, + "step": 25912 + }, + { + "epoch": 0.6557430979072298, + "grad_norm": 4.980030059814453, + "learning_rate": 2.6984968727570647e-06, + "loss": 0.1582, + "step": 25913 + }, + { + "epoch": 0.6557684034719234, + "grad_norm": 3.975844144821167, + "learning_rate": 2.6981404232812165e-06, + "loss": 0.0854, + "step": 25914 + }, + { + "epoch": 0.6557937090366172, + "grad_norm": 7.878364562988281, + "learning_rate": 2.6977839886496304e-06, + "loss": 0.1717, + "step": 25915 + }, + { + "epoch": 0.6558190146013109, + "grad_norm": 19.884931564331055, + "learning_rate": 2.6974275688646113e-06, + "loss": 0.2175, + "step": 25916 + }, + { + "epoch": 0.6558443201660045, + "grad_norm": 6.814374923706055, + "learning_rate": 2.6970711639284486e-06, + "loss": 0.1954, + "step": 25917 + }, + { + "epoch": 0.6558696257306982, + "grad_norm": 6.15566873550415, + "learning_rate": 2.6967147738434484e-06, + "loss": 0.2197, + "step": 25918 + }, + { + "epoch": 0.6558949312953919, + "grad_norm": 4.180330276489258, + "learning_rate": 2.6963583986119044e-06, + "loss": 0.1122, + "step": 25919 + }, + { + "epoch": 0.6559202368600855, + "grad_norm": 2.8366949558258057, + "learning_rate": 2.6960020382361194e-06, + "loss": 0.1032, + "step": 25920 + }, + { + "epoch": 0.6559455424247792, + "grad_norm": 4.6380438804626465, + "learning_rate": 2.6956456927183862e-06, + "loss": 0.1773, + "step": 25921 + }, + { + "epoch": 0.6559708479894729, + "grad_norm": 5.9812445640563965, + "learning_rate": 2.6952893620610067e-06, + "loss": 0.1965, + "step": 25922 + }, + { + "epoch": 0.6559961535541665, + "grad_norm": 2.981717109680176, + "learning_rate": 2.694933046266275e-06, + "loss": 0.1365, + "step": 25923 + }, + { + "epoch": 0.6560214591188602, + "grad_norm": 6.858768463134766, + "learning_rate": 2.6945767453364934e-06, + "loss": 0.1423, + "step": 25924 + }, + { + "epoch": 0.6560467646835539, + "grad_norm": 6.8231635093688965, + "learning_rate": 2.6942204592739577e-06, + "loss": 0.1607, + "step": 25925 + }, + { + "epoch": 0.6560720702482475, + "grad_norm": 2.1079745292663574, + "learning_rate": 2.6938641880809656e-06, + "loss": 0.1261, + "step": 25926 + }, + { + "epoch": 0.6560973758129413, + "grad_norm": 4.773428916931152, + "learning_rate": 2.6935079317598125e-06, + "loss": 0.1676, + "step": 25927 + }, + { + "epoch": 0.656122681377635, + "grad_norm": 4.321195602416992, + "learning_rate": 2.6931516903127993e-06, + "loss": 0.1245, + "step": 25928 + }, + { + "epoch": 0.6561479869423286, + "grad_norm": 8.700505256652832, + "learning_rate": 2.6927954637422215e-06, + "loss": 0.2853, + "step": 25929 + }, + { + "epoch": 0.6561732925070223, + "grad_norm": 2.851092576980591, + "learning_rate": 2.692439252050377e-06, + "loss": 0.0933, + "step": 25930 + }, + { + "epoch": 0.656198598071716, + "grad_norm": 5.6254377365112305, + "learning_rate": 2.6920830552395626e-06, + "loss": 0.1219, + "step": 25931 + }, + { + "epoch": 0.6562239036364097, + "grad_norm": 5.275176048278809, + "learning_rate": 2.6917268733120727e-06, + "loss": 0.132, + "step": 25932 + }, + { + "epoch": 0.6562492092011033, + "grad_norm": 12.7395601272583, + "learning_rate": 2.6913707062702095e-06, + "loss": 0.2142, + "step": 25933 + }, + { + "epoch": 0.656274514765797, + "grad_norm": 9.421454429626465, + "learning_rate": 2.691014554116267e-06, + "loss": 0.2488, + "step": 25934 + }, + { + "epoch": 0.6562998203304907, + "grad_norm": 4.215628147125244, + "learning_rate": 2.690658416852542e-06, + "loss": 0.1476, + "step": 25935 + }, + { + "epoch": 0.6563251258951843, + "grad_norm": 6.067811965942383, + "learning_rate": 2.690302294481329e-06, + "loss": 0.121, + "step": 25936 + }, + { + "epoch": 0.656350431459878, + "grad_norm": 9.979762077331543, + "learning_rate": 2.689946187004929e-06, + "loss": 0.185, + "step": 25937 + }, + { + "epoch": 0.6563757370245717, + "grad_norm": 2.760798931121826, + "learning_rate": 2.689590094425637e-06, + "loss": 0.1188, + "step": 25938 + }, + { + "epoch": 0.6564010425892653, + "grad_norm": 3.6375746726989746, + "learning_rate": 2.6892340167457475e-06, + "loss": 0.0896, + "step": 25939 + }, + { + "epoch": 0.6564263481539591, + "grad_norm": 3.5927717685699463, + "learning_rate": 2.688877953967557e-06, + "loss": 0.159, + "step": 25940 + }, + { + "epoch": 0.6564516537186528, + "grad_norm": 8.957730293273926, + "learning_rate": 2.6885219060933644e-06, + "loss": 0.1686, + "step": 25941 + }, + { + "epoch": 0.6564769592833464, + "grad_norm": 6.573859214782715, + "learning_rate": 2.6881658731254643e-06, + "loss": 0.1462, + "step": 25942 + }, + { + "epoch": 0.6565022648480401, + "grad_norm": 3.588068723678589, + "learning_rate": 2.6878098550661526e-06, + "loss": 0.1459, + "step": 25943 + }, + { + "epoch": 0.6565275704127338, + "grad_norm": 7.939755916595459, + "learning_rate": 2.6874538519177218e-06, + "loss": 0.2214, + "step": 25944 + }, + { + "epoch": 0.6565528759774274, + "grad_norm": 4.693664073944092, + "learning_rate": 2.687097863682474e-06, + "loss": 0.1589, + "step": 25945 + }, + { + "epoch": 0.6565781815421211, + "grad_norm": 4.356703758239746, + "learning_rate": 2.6867418903626997e-06, + "loss": 0.152, + "step": 25946 + }, + { + "epoch": 0.6566034871068148, + "grad_norm": 10.600202560424805, + "learning_rate": 2.6863859319607007e-06, + "loss": 0.1627, + "step": 25947 + }, + { + "epoch": 0.6566287926715084, + "grad_norm": 7.877719402313232, + "learning_rate": 2.6860299884787643e-06, + "loss": 0.259, + "step": 25948 + }, + { + "epoch": 0.6566540982362021, + "grad_norm": 4.282338619232178, + "learning_rate": 2.685674059919192e-06, + "loss": 0.1381, + "step": 25949 + }, + { + "epoch": 0.6566794038008958, + "grad_norm": 15.110639572143555, + "learning_rate": 2.685318146284275e-06, + "loss": 0.2077, + "step": 25950 + }, + { + "epoch": 0.6567047093655894, + "grad_norm": 5.405916213989258, + "learning_rate": 2.6849622475763154e-06, + "loss": 0.2227, + "step": 25951 + }, + { + "epoch": 0.6567300149302832, + "grad_norm": 26.897445678710938, + "learning_rate": 2.6846063637975992e-06, + "loss": 0.2131, + "step": 25952 + }, + { + "epoch": 0.6567553204949769, + "grad_norm": 4.0477495193481445, + "learning_rate": 2.6842504949504278e-06, + "loss": 0.0934, + "step": 25953 + }, + { + "epoch": 0.6567806260596705, + "grad_norm": 5.2256364822387695, + "learning_rate": 2.6838946410370926e-06, + "loss": 0.252, + "step": 25954 + }, + { + "epoch": 0.6568059316243642, + "grad_norm": 13.945844650268555, + "learning_rate": 2.6835388020598917e-06, + "loss": 0.1348, + "step": 25955 + }, + { + "epoch": 0.6568312371890579, + "grad_norm": 4.715379238128662, + "learning_rate": 2.683182978021118e-06, + "loss": 0.1265, + "step": 25956 + }, + { + "epoch": 0.6568565427537516, + "grad_norm": 2.5372025966644287, + "learning_rate": 2.6828271689230667e-06, + "loss": 0.092, + "step": 25957 + }, + { + "epoch": 0.6568818483184452, + "grad_norm": 7.107848644256592, + "learning_rate": 2.682471374768032e-06, + "loss": 0.2967, + "step": 25958 + }, + { + "epoch": 0.6569071538831389, + "grad_norm": 5.880013942718506, + "learning_rate": 2.6821155955583057e-06, + "loss": 0.1469, + "step": 25959 + }, + { + "epoch": 0.6569324594478326, + "grad_norm": 7.110099792480469, + "learning_rate": 2.6817598312961875e-06, + "loss": 0.2457, + "step": 25960 + }, + { + "epoch": 0.6569577650125262, + "grad_norm": 5.595662593841553, + "learning_rate": 2.6814040819839683e-06, + "loss": 0.1519, + "step": 25961 + }, + { + "epoch": 0.6569830705772199, + "grad_norm": 5.0095624923706055, + "learning_rate": 2.6810483476239435e-06, + "loss": 0.2175, + "step": 25962 + }, + { + "epoch": 0.6570083761419137, + "grad_norm": 3.427962303161621, + "learning_rate": 2.6806926282184044e-06, + "loss": 0.0862, + "step": 25963 + }, + { + "epoch": 0.6570336817066073, + "grad_norm": 3.901460647583008, + "learning_rate": 2.6803369237696486e-06, + "loss": 0.1297, + "step": 25964 + }, + { + "epoch": 0.657058987271301, + "grad_norm": 3.18326735496521, + "learning_rate": 2.6799812342799686e-06, + "loss": 0.1229, + "step": 25965 + }, + { + "epoch": 0.6570842928359947, + "grad_norm": 2.2626185417175293, + "learning_rate": 2.6796255597516585e-06, + "loss": 0.1094, + "step": 25966 + }, + { + "epoch": 0.6571095984006883, + "grad_norm": 5.930083274841309, + "learning_rate": 2.679269900187009e-06, + "loss": 0.1532, + "step": 25967 + }, + { + "epoch": 0.657134903965382, + "grad_norm": 6.2888569831848145, + "learning_rate": 2.6789142555883186e-06, + "loss": 0.1313, + "step": 25968 + }, + { + "epoch": 0.6571602095300757, + "grad_norm": 4.350869655609131, + "learning_rate": 2.678558625957879e-06, + "loss": 0.1481, + "step": 25969 + }, + { + "epoch": 0.6571855150947693, + "grad_norm": 12.183967590332031, + "learning_rate": 2.6782030112979816e-06, + "loss": 0.2707, + "step": 25970 + }, + { + "epoch": 0.657210820659463, + "grad_norm": 2.6238183975219727, + "learning_rate": 2.6778474116109197e-06, + "loss": 0.1098, + "step": 25971 + }, + { + "epoch": 0.6572361262241567, + "grad_norm": 3.83508038520813, + "learning_rate": 2.6774918268989904e-06, + "loss": 0.1456, + "step": 25972 + }, + { + "epoch": 0.6572614317888503, + "grad_norm": 4.197513103485107, + "learning_rate": 2.6771362571644844e-06, + "loss": 0.1258, + "step": 25973 + }, + { + "epoch": 0.657286737353544, + "grad_norm": 3.1041975021362305, + "learning_rate": 2.6767807024096938e-06, + "loss": 0.1457, + "step": 25974 + }, + { + "epoch": 0.6573120429182377, + "grad_norm": 7.336091995239258, + "learning_rate": 2.676425162636911e-06, + "loss": 0.1679, + "step": 25975 + }, + { + "epoch": 0.6573373484829313, + "grad_norm": 6.517194747924805, + "learning_rate": 2.6760696378484314e-06, + "loss": 0.1092, + "step": 25976 + }, + { + "epoch": 0.6573626540476251, + "grad_norm": 4.8292012214660645, + "learning_rate": 2.675714128046545e-06, + "loss": 0.2166, + "step": 25977 + }, + { + "epoch": 0.6573879596123188, + "grad_norm": 5.405315399169922, + "learning_rate": 2.675358633233549e-06, + "loss": 0.1678, + "step": 25978 + }, + { + "epoch": 0.6574132651770124, + "grad_norm": 8.477924346923828, + "learning_rate": 2.6750031534117293e-06, + "loss": 0.2443, + "step": 25979 + }, + { + "epoch": 0.6574385707417061, + "grad_norm": 7.023583889007568, + "learning_rate": 2.6746476885833837e-06, + "loss": 0.2244, + "step": 25980 + }, + { + "epoch": 0.6574638763063998, + "grad_norm": 4.255876541137695, + "learning_rate": 2.6742922387508007e-06, + "loss": 0.1292, + "step": 25981 + }, + { + "epoch": 0.6574891818710935, + "grad_norm": 5.744422912597656, + "learning_rate": 2.673936803916279e-06, + "loss": 0.2347, + "step": 25982 + }, + { + "epoch": 0.6575144874357871, + "grad_norm": 3.7836861610412598, + "learning_rate": 2.673581384082101e-06, + "loss": 0.1009, + "step": 25983 + }, + { + "epoch": 0.6575397930004808, + "grad_norm": 5.649081707000732, + "learning_rate": 2.673225979250567e-06, + "loss": 0.1211, + "step": 25984 + }, + { + "epoch": 0.6575650985651745, + "grad_norm": 3.84155011177063, + "learning_rate": 2.672870589423965e-06, + "loss": 0.1847, + "step": 25985 + }, + { + "epoch": 0.6575904041298681, + "grad_norm": 11.477486610412598, + "learning_rate": 2.6725152146045886e-06, + "loss": 0.2248, + "step": 25986 + }, + { + "epoch": 0.6576157096945618, + "grad_norm": 5.059811115264893, + "learning_rate": 2.672159854794726e-06, + "loss": 0.1134, + "step": 25987 + }, + { + "epoch": 0.6576410152592556, + "grad_norm": 3.0569920539855957, + "learning_rate": 2.6718045099966737e-06, + "loss": 0.1025, + "step": 25988 + }, + { + "epoch": 0.6576663208239492, + "grad_norm": 4.956247329711914, + "learning_rate": 2.6714491802127205e-06, + "loss": 0.1671, + "step": 25989 + }, + { + "epoch": 0.6576916263886429, + "grad_norm": 6.334832191467285, + "learning_rate": 2.6710938654451575e-06, + "loss": 0.1839, + "step": 25990 + }, + { + "epoch": 0.6577169319533366, + "grad_norm": 16.37739372253418, + "learning_rate": 2.670738565696278e-06, + "loss": 0.1482, + "step": 25991 + }, + { + "epoch": 0.6577422375180302, + "grad_norm": 4.469442844390869, + "learning_rate": 2.6703832809683727e-06, + "loss": 0.125, + "step": 25992 + }, + { + "epoch": 0.6577675430827239, + "grad_norm": 4.677667140960693, + "learning_rate": 2.670028011263732e-06, + "loss": 0.1377, + "step": 25993 + }, + { + "epoch": 0.6577928486474176, + "grad_norm": 8.077446937561035, + "learning_rate": 2.6696727565846453e-06, + "loss": 0.189, + "step": 25994 + }, + { + "epoch": 0.6578181542121112, + "grad_norm": 5.856277942657471, + "learning_rate": 2.6693175169334096e-06, + "loss": 0.1606, + "step": 25995 + }, + { + "epoch": 0.6578434597768049, + "grad_norm": 7.040138244628906, + "learning_rate": 2.6689622923123076e-06, + "loss": 0.1933, + "step": 25996 + }, + { + "epoch": 0.6578687653414986, + "grad_norm": 2.7346692085266113, + "learning_rate": 2.6686070827236366e-06, + "loss": 0.1547, + "step": 25997 + }, + { + "epoch": 0.6578940709061922, + "grad_norm": 8.169177055358887, + "learning_rate": 2.6682518881696818e-06, + "loss": 0.231, + "step": 25998 + }, + { + "epoch": 0.6579193764708859, + "grad_norm": 10.829364776611328, + "learning_rate": 2.6678967086527395e-06, + "loss": 0.1547, + "step": 25999 + }, + { + "epoch": 0.6579446820355797, + "grad_norm": 4.048505783081055, + "learning_rate": 2.667541544175097e-06, + "loss": 0.1545, + "step": 26000 + }, + { + "epoch": 0.6579699876002733, + "grad_norm": 7.111184597015381, + "learning_rate": 2.6671863947390465e-06, + "loss": 0.1728, + "step": 26001 + }, + { + "epoch": 0.657995293164967, + "grad_norm": 1.1069443225860596, + "learning_rate": 2.6668312603468743e-06, + "loss": 0.0275, + "step": 26002 + }, + { + "epoch": 0.6580205987296607, + "grad_norm": 5.505546569824219, + "learning_rate": 2.6664761410008754e-06, + "loss": 0.1893, + "step": 26003 + }, + { + "epoch": 0.6580459042943543, + "grad_norm": 3.666166067123413, + "learning_rate": 2.6661210367033375e-06, + "loss": 0.092, + "step": 26004 + }, + { + "epoch": 0.658071209859048, + "grad_norm": 3.1990575790405273, + "learning_rate": 2.665765947456551e-06, + "loss": 0.1194, + "step": 26005 + }, + { + "epoch": 0.6580965154237417, + "grad_norm": 18.200824737548828, + "learning_rate": 2.6654108732628037e-06, + "loss": 0.5868, + "step": 26006 + }, + { + "epoch": 0.6581218209884353, + "grad_norm": 5.532507419586182, + "learning_rate": 2.6650558141243895e-06, + "loss": 0.1947, + "step": 26007 + }, + { + "epoch": 0.658147126553129, + "grad_norm": 2.8213491439819336, + "learning_rate": 2.664700770043594e-06, + "loss": 0.0933, + "step": 26008 + }, + { + "epoch": 0.6581724321178227, + "grad_norm": 6.211226940155029, + "learning_rate": 2.6643457410227126e-06, + "loss": 0.1059, + "step": 26009 + }, + { + "epoch": 0.6581977376825164, + "grad_norm": 3.2671079635620117, + "learning_rate": 2.6639907270640274e-06, + "loss": 0.1097, + "step": 26010 + }, + { + "epoch": 0.65822304324721, + "grad_norm": 17.59583854675293, + "learning_rate": 2.6636357281698333e-06, + "loss": 0.2554, + "step": 26011 + }, + { + "epoch": 0.6582483488119037, + "grad_norm": 4.344581127166748, + "learning_rate": 2.663280744342418e-06, + "loss": 0.1103, + "step": 26012 + }, + { + "epoch": 0.6582736543765975, + "grad_norm": 5.515442371368408, + "learning_rate": 2.66292577558407e-06, + "loss": 0.2145, + "step": 26013 + }, + { + "epoch": 0.6582989599412911, + "grad_norm": 2.8386878967285156, + "learning_rate": 2.6625708218970774e-06, + "loss": 0.1203, + "step": 26014 + }, + { + "epoch": 0.6583242655059848, + "grad_norm": 5.206294059753418, + "learning_rate": 2.662215883283733e-06, + "loss": 0.1547, + "step": 26015 + }, + { + "epoch": 0.6583495710706785, + "grad_norm": 3.512005567550659, + "learning_rate": 2.661860959746323e-06, + "loss": 0.1283, + "step": 26016 + }, + { + "epoch": 0.6583748766353721, + "grad_norm": 3.585930585861206, + "learning_rate": 2.6615060512871364e-06, + "loss": 0.1332, + "step": 26017 + }, + { + "epoch": 0.6584001822000658, + "grad_norm": 2.81592059135437, + "learning_rate": 2.6611511579084613e-06, + "loss": 0.1312, + "step": 26018 + }, + { + "epoch": 0.6584254877647595, + "grad_norm": 3.6603927612304688, + "learning_rate": 2.6607962796125884e-06, + "loss": 0.122, + "step": 26019 + }, + { + "epoch": 0.6584507933294531, + "grad_norm": 5.5101189613342285, + "learning_rate": 2.6604414164018056e-06, + "loss": 0.1447, + "step": 26020 + }, + { + "epoch": 0.6584760988941468, + "grad_norm": 3.0467276573181152, + "learning_rate": 2.6600865682783995e-06, + "loss": 0.1318, + "step": 26021 + }, + { + "epoch": 0.6585014044588405, + "grad_norm": 2.929922580718994, + "learning_rate": 2.6597317352446627e-06, + "loss": 0.087, + "step": 26022 + }, + { + "epoch": 0.6585267100235341, + "grad_norm": 5.327943325042725, + "learning_rate": 2.6593769173028773e-06, + "loss": 0.2279, + "step": 26023 + }, + { + "epoch": 0.6585520155882278, + "grad_norm": 30.1019344329834, + "learning_rate": 2.6590221144553364e-06, + "loss": 0.1572, + "step": 26024 + }, + { + "epoch": 0.6585773211529216, + "grad_norm": 9.741949081420898, + "learning_rate": 2.658667326704325e-06, + "loss": 0.2262, + "step": 26025 + }, + { + "epoch": 0.6586026267176152, + "grad_norm": 3.2568745613098145, + "learning_rate": 2.6583125540521358e-06, + "loss": 0.0918, + "step": 26026 + }, + { + "epoch": 0.6586279322823089, + "grad_norm": 4.469940662384033, + "learning_rate": 2.65795779650105e-06, + "loss": 0.1559, + "step": 26027 + }, + { + "epoch": 0.6586532378470026, + "grad_norm": 3.7441720962524414, + "learning_rate": 2.6576030540533606e-06, + "loss": 0.1528, + "step": 26028 + }, + { + "epoch": 0.6586785434116962, + "grad_norm": 3.6962947845458984, + "learning_rate": 2.6572483267113513e-06, + "loss": 0.1563, + "step": 26029 + }, + { + "epoch": 0.6587038489763899, + "grad_norm": 4.535633087158203, + "learning_rate": 2.6568936144773135e-06, + "loss": 0.1583, + "step": 26030 + }, + { + "epoch": 0.6587291545410836, + "grad_norm": 3.2659754753112793, + "learning_rate": 2.656538917353533e-06, + "loss": 0.0898, + "step": 26031 + }, + { + "epoch": 0.6587544601057772, + "grad_norm": 3.3983213901519775, + "learning_rate": 2.656184235342297e-06, + "loss": 0.1224, + "step": 26032 + }, + { + "epoch": 0.6587797656704709, + "grad_norm": 11.084739685058594, + "learning_rate": 2.6558295684458914e-06, + "loss": 0.2506, + "step": 26033 + }, + { + "epoch": 0.6588050712351646, + "grad_norm": 4.113532543182373, + "learning_rate": 2.655474916666606e-06, + "loss": 0.1101, + "step": 26034 + }, + { + "epoch": 0.6588303767998583, + "grad_norm": 4.213202953338623, + "learning_rate": 2.6551202800067276e-06, + "loss": 0.1065, + "step": 26035 + }, + { + "epoch": 0.6588556823645519, + "grad_norm": 4.976364612579346, + "learning_rate": 2.6547656584685423e-06, + "loss": 0.1573, + "step": 26036 + }, + { + "epoch": 0.6588809879292457, + "grad_norm": 8.12497329711914, + "learning_rate": 2.6544110520543344e-06, + "loss": 0.1512, + "step": 26037 + }, + { + "epoch": 0.6589062934939394, + "grad_norm": 3.7622745037078857, + "learning_rate": 2.6540564607663956e-06, + "loss": 0.1054, + "step": 26038 + }, + { + "epoch": 0.658931599058633, + "grad_norm": 6.208695888519287, + "learning_rate": 2.6537018846070107e-06, + "loss": 0.1612, + "step": 26039 + }, + { + "epoch": 0.6589569046233267, + "grad_norm": 3.6167044639587402, + "learning_rate": 2.653347323578466e-06, + "loss": 0.1558, + "step": 26040 + }, + { + "epoch": 0.6589822101880204, + "grad_norm": 5.770856857299805, + "learning_rate": 2.652992777683045e-06, + "loss": 0.1124, + "step": 26041 + }, + { + "epoch": 0.659007515752714, + "grad_norm": 11.009906768798828, + "learning_rate": 2.6526382469230396e-06, + "loss": 0.1421, + "step": 26042 + }, + { + "epoch": 0.6590328213174077, + "grad_norm": 4.9708404541015625, + "learning_rate": 2.652283731300733e-06, + "loss": 0.1498, + "step": 26043 + }, + { + "epoch": 0.6590581268821014, + "grad_norm": 4.315059185028076, + "learning_rate": 2.6519292308184125e-06, + "loss": 0.1108, + "step": 26044 + }, + { + "epoch": 0.659083432446795, + "grad_norm": 4.889052867889404, + "learning_rate": 2.6515747454783614e-06, + "loss": 0.2003, + "step": 26045 + }, + { + "epoch": 0.6591087380114887, + "grad_norm": 3.675955057144165, + "learning_rate": 2.6512202752828697e-06, + "loss": 0.1094, + "step": 26046 + }, + { + "epoch": 0.6591340435761824, + "grad_norm": 9.460188865661621, + "learning_rate": 2.650865820234222e-06, + "loss": 0.1573, + "step": 26047 + }, + { + "epoch": 0.659159349140876, + "grad_norm": 12.055368423461914, + "learning_rate": 2.6505113803347034e-06, + "loss": 0.2308, + "step": 26048 + }, + { + "epoch": 0.6591846547055698, + "grad_norm": 7.412526607513428, + "learning_rate": 2.650156955586598e-06, + "loss": 0.1235, + "step": 26049 + }, + { + "epoch": 0.6592099602702635, + "grad_norm": 7.328890800476074, + "learning_rate": 2.649802545992195e-06, + "loss": 0.221, + "step": 26050 + }, + { + "epoch": 0.6592352658349571, + "grad_norm": 4.744038105010986, + "learning_rate": 2.6494481515537786e-06, + "loss": 0.2103, + "step": 26051 + }, + { + "epoch": 0.6592605713996508, + "grad_norm": 8.292641639709473, + "learning_rate": 2.6490937722736315e-06, + "loss": 0.1887, + "step": 26052 + }, + { + "epoch": 0.6592858769643445, + "grad_norm": 3.4384357929229736, + "learning_rate": 2.648739408154045e-06, + "loss": 0.0976, + "step": 26053 + }, + { + "epoch": 0.6593111825290381, + "grad_norm": 4.982139587402344, + "learning_rate": 2.6483850591972977e-06, + "loss": 0.2026, + "step": 26054 + }, + { + "epoch": 0.6593364880937318, + "grad_norm": 5.09091854095459, + "learning_rate": 2.648030725405679e-06, + "loss": 0.1638, + "step": 26055 + }, + { + "epoch": 0.6593617936584255, + "grad_norm": 3.608002185821533, + "learning_rate": 2.6476764067814707e-06, + "loss": 0.1323, + "step": 26056 + }, + { + "epoch": 0.6593870992231191, + "grad_norm": 5.849506378173828, + "learning_rate": 2.6473221033269635e-06, + "loss": 0.1039, + "step": 26057 + }, + { + "epoch": 0.6594124047878128, + "grad_norm": 4.52525520324707, + "learning_rate": 2.6469678150444354e-06, + "loss": 0.165, + "step": 26058 + }, + { + "epoch": 0.6594377103525065, + "grad_norm": 9.578113555908203, + "learning_rate": 2.6466135419361748e-06, + "loss": 0.2495, + "step": 26059 + }, + { + "epoch": 0.6594630159172002, + "grad_norm": 4.790928840637207, + "learning_rate": 2.646259284004465e-06, + "loss": 0.1863, + "step": 26060 + }, + { + "epoch": 0.6594883214818938, + "grad_norm": 6.758969306945801, + "learning_rate": 2.645905041251593e-06, + "loss": 0.1804, + "step": 26061 + }, + { + "epoch": 0.6595136270465876, + "grad_norm": 5.215539932250977, + "learning_rate": 2.645550813679841e-06, + "loss": 0.2003, + "step": 26062 + }, + { + "epoch": 0.6595389326112813, + "grad_norm": 12.120585441589355, + "learning_rate": 2.6451966012914936e-06, + "loss": 0.2436, + "step": 26063 + }, + { + "epoch": 0.6595642381759749, + "grad_norm": 2.850243330001831, + "learning_rate": 2.6448424040888336e-06, + "loss": 0.0924, + "step": 26064 + }, + { + "epoch": 0.6595895437406686, + "grad_norm": 7.305052757263184, + "learning_rate": 2.644488222074149e-06, + "loss": 0.1997, + "step": 26065 + }, + { + "epoch": 0.6596148493053623, + "grad_norm": 6.6871771812438965, + "learning_rate": 2.644134055249722e-06, + "loss": 0.1438, + "step": 26066 + }, + { + "epoch": 0.6596401548700559, + "grad_norm": 3.9491567611694336, + "learning_rate": 2.6437799036178356e-06, + "loss": 0.1281, + "step": 26067 + }, + { + "epoch": 0.6596654604347496, + "grad_norm": 3.466667652130127, + "learning_rate": 2.643425767180773e-06, + "loss": 0.1104, + "step": 26068 + }, + { + "epoch": 0.6596907659994433, + "grad_norm": 8.144372940063477, + "learning_rate": 2.6430716459408208e-06, + "loss": 0.1622, + "step": 26069 + }, + { + "epoch": 0.6597160715641369, + "grad_norm": 4.045384407043457, + "learning_rate": 2.642717539900261e-06, + "loss": 0.1119, + "step": 26070 + }, + { + "epoch": 0.6597413771288306, + "grad_norm": 3.927569627761841, + "learning_rate": 2.6423634490613775e-06, + "loss": 0.1376, + "step": 26071 + }, + { + "epoch": 0.6597666826935243, + "grad_norm": 2.485940456390381, + "learning_rate": 2.6420093734264517e-06, + "loss": 0.1098, + "step": 26072 + }, + { + "epoch": 0.6597919882582179, + "grad_norm": 9.01683521270752, + "learning_rate": 2.6416553129977707e-06, + "loss": 0.1966, + "step": 26073 + }, + { + "epoch": 0.6598172938229117, + "grad_norm": 3.0995419025421143, + "learning_rate": 2.6413012677776157e-06, + "loss": 0.0869, + "step": 26074 + }, + { + "epoch": 0.6598425993876054, + "grad_norm": 2.956089735031128, + "learning_rate": 2.64094723776827e-06, + "loss": 0.1043, + "step": 26075 + }, + { + "epoch": 0.659867904952299, + "grad_norm": 7.634111404418945, + "learning_rate": 2.6405932229720156e-06, + "loss": 0.2607, + "step": 26076 + }, + { + "epoch": 0.6598932105169927, + "grad_norm": 2.7604148387908936, + "learning_rate": 2.6402392233911376e-06, + "loss": 0.0885, + "step": 26077 + }, + { + "epoch": 0.6599185160816864, + "grad_norm": 10.42448616027832, + "learning_rate": 2.6398852390279185e-06, + "loss": 0.3763, + "step": 26078 + }, + { + "epoch": 0.65994382164638, + "grad_norm": 5.571813106536865, + "learning_rate": 2.6395312698846396e-06, + "loss": 0.216, + "step": 26079 + }, + { + "epoch": 0.6599691272110737, + "grad_norm": 22.687786102294922, + "learning_rate": 2.639177315963586e-06, + "loss": 0.2367, + "step": 26080 + }, + { + "epoch": 0.6599944327757674, + "grad_norm": 7.614700794219971, + "learning_rate": 2.6388233772670356e-06, + "loss": 0.1414, + "step": 26081 + }, + { + "epoch": 0.660019738340461, + "grad_norm": 7.290584087371826, + "learning_rate": 2.6384694537972764e-06, + "loss": 0.2613, + "step": 26082 + }, + { + "epoch": 0.6600450439051547, + "grad_norm": 3.5726261138916016, + "learning_rate": 2.6381155455565864e-06, + "loss": 0.1088, + "step": 26083 + }, + { + "epoch": 0.6600703494698484, + "grad_norm": 3.6872918605804443, + "learning_rate": 2.6377616525472538e-06, + "loss": 0.1428, + "step": 26084 + }, + { + "epoch": 0.6600956550345422, + "grad_norm": 3.7687435150146484, + "learning_rate": 2.6374077747715534e-06, + "loss": 0.1043, + "step": 26085 + }, + { + "epoch": 0.6601209605992358, + "grad_norm": 4.096907138824463, + "learning_rate": 2.6370539122317717e-06, + "loss": 0.1808, + "step": 26086 + }, + { + "epoch": 0.6601462661639295, + "grad_norm": 3.657902717590332, + "learning_rate": 2.6367000649301886e-06, + "loss": 0.094, + "step": 26087 + }, + { + "epoch": 0.6601715717286232, + "grad_norm": 4.213609218597412, + "learning_rate": 2.636346232869091e-06, + "loss": 0.2055, + "step": 26088 + }, + { + "epoch": 0.6601968772933168, + "grad_norm": 2.7147059440612793, + "learning_rate": 2.635992416050753e-06, + "loss": 0.0695, + "step": 26089 + }, + { + "epoch": 0.6602221828580105, + "grad_norm": 8.568058013916016, + "learning_rate": 2.6356386144774614e-06, + "loss": 0.2456, + "step": 26090 + }, + { + "epoch": 0.6602474884227042, + "grad_norm": 7.875781059265137, + "learning_rate": 2.6352848281514952e-06, + "loss": 0.1648, + "step": 26091 + }, + { + "epoch": 0.6602727939873978, + "grad_norm": 3.425147294998169, + "learning_rate": 2.634931057075142e-06, + "loss": 0.1164, + "step": 26092 + }, + { + "epoch": 0.6602980995520915, + "grad_norm": 10.589984893798828, + "learning_rate": 2.6345773012506738e-06, + "loss": 0.149, + "step": 26093 + }, + { + "epoch": 0.6603234051167852, + "grad_norm": 8.894943237304688, + "learning_rate": 2.634223560680378e-06, + "loss": 0.1679, + "step": 26094 + }, + { + "epoch": 0.6603487106814788, + "grad_norm": 4.081900119781494, + "learning_rate": 2.633869835366533e-06, + "loss": 0.1254, + "step": 26095 + }, + { + "epoch": 0.6603740162461725, + "grad_norm": 4.29135274887085, + "learning_rate": 2.6335161253114233e-06, + "loss": 0.128, + "step": 26096 + }, + { + "epoch": 0.6603993218108662, + "grad_norm": 4.4837422370910645, + "learning_rate": 2.6331624305173274e-06, + "loss": 0.147, + "step": 26097 + }, + { + "epoch": 0.6604246273755598, + "grad_norm": 3.955993413925171, + "learning_rate": 2.6328087509865276e-06, + "loss": 0.1565, + "step": 26098 + }, + { + "epoch": 0.6604499329402536, + "grad_norm": 6.226291179656982, + "learning_rate": 2.6324550867213016e-06, + "loss": 0.1732, + "step": 26099 + }, + { + "epoch": 0.6604752385049473, + "grad_norm": 3.6169004440307617, + "learning_rate": 2.632101437723934e-06, + "loss": 0.1461, + "step": 26100 + }, + { + "epoch": 0.6605005440696409, + "grad_norm": 14.677692413330078, + "learning_rate": 2.6317478039967037e-06, + "loss": 0.28, + "step": 26101 + }, + { + "epoch": 0.6605258496343346, + "grad_norm": 3.405794143676758, + "learning_rate": 2.6313941855418913e-06, + "loss": 0.1162, + "step": 26102 + }, + { + "epoch": 0.6605511551990283, + "grad_norm": 5.352750301361084, + "learning_rate": 2.6310405823617757e-06, + "loss": 0.1233, + "step": 26103 + }, + { + "epoch": 0.6605764607637219, + "grad_norm": 9.602747917175293, + "learning_rate": 2.63068699445864e-06, + "loss": 0.2174, + "step": 26104 + }, + { + "epoch": 0.6606017663284156, + "grad_norm": 4.465034484863281, + "learning_rate": 2.630333421834764e-06, + "loss": 0.153, + "step": 26105 + }, + { + "epoch": 0.6606270718931093, + "grad_norm": 3.2199907302856445, + "learning_rate": 2.6299798644924264e-06, + "loss": 0.1083, + "step": 26106 + }, + { + "epoch": 0.6606523774578029, + "grad_norm": 7.289176940917969, + "learning_rate": 2.629626322433908e-06, + "loss": 0.197, + "step": 26107 + }, + { + "epoch": 0.6606776830224966, + "grad_norm": 8.6890869140625, + "learning_rate": 2.629272795661486e-06, + "loss": 0.1772, + "step": 26108 + }, + { + "epoch": 0.6607029885871903, + "grad_norm": 3.2141003608703613, + "learning_rate": 2.6289192841774458e-06, + "loss": 0.0715, + "step": 26109 + }, + { + "epoch": 0.660728294151884, + "grad_norm": 5.524870872497559, + "learning_rate": 2.628565787984063e-06, + "loss": 0.2396, + "step": 26110 + }, + { + "epoch": 0.6607535997165777, + "grad_norm": 13.545793533325195, + "learning_rate": 2.628212307083619e-06, + "loss": 0.1463, + "step": 26111 + }, + { + "epoch": 0.6607789052812714, + "grad_norm": 4.82465124130249, + "learning_rate": 2.62785884147839e-06, + "loss": 0.1562, + "step": 26112 + }, + { + "epoch": 0.6608042108459651, + "grad_norm": 6.973353385925293, + "learning_rate": 2.6275053911706606e-06, + "loss": 0.1638, + "step": 26113 + }, + { + "epoch": 0.6608295164106587, + "grad_norm": 3.947946310043335, + "learning_rate": 2.627151956162708e-06, + "loss": 0.1403, + "step": 26114 + }, + { + "epoch": 0.6608548219753524, + "grad_norm": 5.523355960845947, + "learning_rate": 2.6267985364568106e-06, + "loss": 0.1315, + "step": 26115 + }, + { + "epoch": 0.6608801275400461, + "grad_norm": 3.9154107570648193, + "learning_rate": 2.626445132055246e-06, + "loss": 0.155, + "step": 26116 + }, + { + "epoch": 0.6609054331047397, + "grad_norm": 7.821004867553711, + "learning_rate": 2.6260917429602974e-06, + "loss": 0.0939, + "step": 26117 + }, + { + "epoch": 0.6609307386694334, + "grad_norm": 11.635112762451172, + "learning_rate": 2.6257383691742388e-06, + "loss": 0.1211, + "step": 26118 + }, + { + "epoch": 0.6609560442341271, + "grad_norm": 6.271129608154297, + "learning_rate": 2.625385010699356e-06, + "loss": 0.1916, + "step": 26119 + }, + { + "epoch": 0.6609813497988207, + "grad_norm": 2.8169021606445312, + "learning_rate": 2.6250316675379206e-06, + "loss": 0.1147, + "step": 26120 + }, + { + "epoch": 0.6610066553635144, + "grad_norm": 12.823683738708496, + "learning_rate": 2.6246783396922156e-06, + "loss": 0.2137, + "step": 26121 + }, + { + "epoch": 0.6610319609282082, + "grad_norm": 8.495119094848633, + "learning_rate": 2.624325027164516e-06, + "loss": 0.2222, + "step": 26122 + }, + { + "epoch": 0.6610572664929018, + "grad_norm": 4.584484100341797, + "learning_rate": 2.6239717299571066e-06, + "loss": 0.1811, + "step": 26123 + }, + { + "epoch": 0.6610825720575955, + "grad_norm": 2.764047145843506, + "learning_rate": 2.623618448072257e-06, + "loss": 0.115, + "step": 26124 + }, + { + "epoch": 0.6611078776222892, + "grad_norm": 3.5723178386688232, + "learning_rate": 2.6232651815122522e-06, + "loss": 0.1809, + "step": 26125 + }, + { + "epoch": 0.6611331831869828, + "grad_norm": 3.682769775390625, + "learning_rate": 2.6229119302793665e-06, + "loss": 0.1109, + "step": 26126 + }, + { + "epoch": 0.6611584887516765, + "grad_norm": 3.833599805831909, + "learning_rate": 2.622558694375882e-06, + "loss": 0.1158, + "step": 26127 + }, + { + "epoch": 0.6611837943163702, + "grad_norm": 6.163389682769775, + "learning_rate": 2.622205473804073e-06, + "loss": 0.2309, + "step": 26128 + }, + { + "epoch": 0.6612090998810638, + "grad_norm": 5.557536602020264, + "learning_rate": 2.6218522685662194e-06, + "loss": 0.1406, + "step": 26129 + }, + { + "epoch": 0.6612344054457575, + "grad_norm": 3.670638084411621, + "learning_rate": 2.621499078664596e-06, + "loss": 0.123, + "step": 26130 + }, + { + "epoch": 0.6612597110104512, + "grad_norm": 5.8306684494018555, + "learning_rate": 2.621145904101484e-06, + "loss": 0.1927, + "step": 26131 + }, + { + "epoch": 0.6612850165751448, + "grad_norm": 3.0179262161254883, + "learning_rate": 2.6207927448791604e-06, + "loss": 0.1478, + "step": 26132 + }, + { + "epoch": 0.6613103221398385, + "grad_norm": 6.795024871826172, + "learning_rate": 2.620439600999902e-06, + "loss": 0.1412, + "step": 26133 + }, + { + "epoch": 0.6613356277045322, + "grad_norm": 24.803325653076172, + "learning_rate": 2.6200864724659856e-06, + "loss": 0.319, + "step": 26134 + }, + { + "epoch": 0.6613609332692258, + "grad_norm": 6.372720241546631, + "learning_rate": 2.619733359279687e-06, + "loss": 0.1676, + "step": 26135 + }, + { + "epoch": 0.6613862388339196, + "grad_norm": 5.0382981300354, + "learning_rate": 2.619380261443287e-06, + "loss": 0.1565, + "step": 26136 + }, + { + "epoch": 0.6614115443986133, + "grad_norm": 4.318363189697266, + "learning_rate": 2.6190271789590606e-06, + "loss": 0.2377, + "step": 26137 + }, + { + "epoch": 0.661436849963307, + "grad_norm": 9.645708084106445, + "learning_rate": 2.6186741118292853e-06, + "loss": 0.2485, + "step": 26138 + }, + { + "epoch": 0.6614621555280006, + "grad_norm": 9.224261283874512, + "learning_rate": 2.618321060056236e-06, + "loss": 0.2077, + "step": 26139 + }, + { + "epoch": 0.6614874610926943, + "grad_norm": 4.092829704284668, + "learning_rate": 2.6179680236421923e-06, + "loss": 0.1401, + "step": 26140 + }, + { + "epoch": 0.661512766657388, + "grad_norm": 3.7391624450683594, + "learning_rate": 2.6176150025894305e-06, + "loss": 0.1169, + "step": 26141 + }, + { + "epoch": 0.6615380722220816, + "grad_norm": 4.216085910797119, + "learning_rate": 2.617261996900226e-06, + "loss": 0.1211, + "step": 26142 + }, + { + "epoch": 0.6615633777867753, + "grad_norm": 6.766092300415039, + "learning_rate": 2.616909006576854e-06, + "loss": 0.1304, + "step": 26143 + }, + { + "epoch": 0.661588683351469, + "grad_norm": 5.033226490020752, + "learning_rate": 2.616556031621594e-06, + "loss": 0.1539, + "step": 26144 + }, + { + "epoch": 0.6616139889161626, + "grad_norm": 7.0202860832214355, + "learning_rate": 2.6162030720367216e-06, + "loss": 0.3068, + "step": 26145 + }, + { + "epoch": 0.6616392944808563, + "grad_norm": 5.459376335144043, + "learning_rate": 2.615850127824512e-06, + "loss": 0.1948, + "step": 26146 + }, + { + "epoch": 0.66166460004555, + "grad_norm": 4.967047214508057, + "learning_rate": 2.615497198987239e-06, + "loss": 0.1634, + "step": 26147 + }, + { + "epoch": 0.6616899056102437, + "grad_norm": 4.498692035675049, + "learning_rate": 2.615144285527183e-06, + "loss": 0.1715, + "step": 26148 + }, + { + "epoch": 0.6617152111749374, + "grad_norm": 3.4459688663482666, + "learning_rate": 2.6147913874466156e-06, + "loss": 0.1153, + "step": 26149 + }, + { + "epoch": 0.6617405167396311, + "grad_norm": 7.028139591217041, + "learning_rate": 2.6144385047478194e-06, + "loss": 0.1212, + "step": 26150 + }, + { + "epoch": 0.6617658223043247, + "grad_norm": 4.513515949249268, + "learning_rate": 2.6140856374330612e-06, + "loss": 0.0961, + "step": 26151 + }, + { + "epoch": 0.6617911278690184, + "grad_norm": 5.007126331329346, + "learning_rate": 2.6137327855046236e-06, + "loss": 0.1213, + "step": 26152 + }, + { + "epoch": 0.6618164334337121, + "grad_norm": 5.761888027191162, + "learning_rate": 2.6133799489647772e-06, + "loss": 0.1401, + "step": 26153 + }, + { + "epoch": 0.6618417389984057, + "grad_norm": 6.005646705627441, + "learning_rate": 2.6130271278158033e-06, + "loss": 0.2298, + "step": 26154 + }, + { + "epoch": 0.6618670445630994, + "grad_norm": 3.324159860610962, + "learning_rate": 2.6126743220599693e-06, + "loss": 0.1457, + "step": 26155 + }, + { + "epoch": 0.6618923501277931, + "grad_norm": 7.907513618469238, + "learning_rate": 2.6123215316995563e-06, + "loss": 0.1488, + "step": 26156 + }, + { + "epoch": 0.6619176556924867, + "grad_norm": 3.9529547691345215, + "learning_rate": 2.611968756736837e-06, + "loss": 0.1073, + "step": 26157 + }, + { + "epoch": 0.6619429612571804, + "grad_norm": 9.912654876708984, + "learning_rate": 2.6116159971740873e-06, + "loss": 0.413, + "step": 26158 + }, + { + "epoch": 0.6619682668218742, + "grad_norm": 4.469034194946289, + "learning_rate": 2.611263253013583e-06, + "loss": 0.1377, + "step": 26159 + }, + { + "epoch": 0.6619935723865678, + "grad_norm": 7.644143581390381, + "learning_rate": 2.6109105242575966e-06, + "loss": 0.1786, + "step": 26160 + }, + { + "epoch": 0.6620188779512615, + "grad_norm": 4.527966022491455, + "learning_rate": 2.6105578109084023e-06, + "loss": 0.1388, + "step": 26161 + }, + { + "epoch": 0.6620441835159552, + "grad_norm": 4.0861945152282715, + "learning_rate": 2.610205112968278e-06, + "loss": 0.1458, + "step": 26162 + }, + { + "epoch": 0.6620694890806489, + "grad_norm": 6.645299434661865, + "learning_rate": 2.609852430439497e-06, + "loss": 0.1299, + "step": 26163 + }, + { + "epoch": 0.6620947946453425, + "grad_norm": 5.902703285217285, + "learning_rate": 2.6094997633243326e-06, + "loss": 0.1324, + "step": 26164 + }, + { + "epoch": 0.6621201002100362, + "grad_norm": 3.7389931678771973, + "learning_rate": 2.6091471116250595e-06, + "loss": 0.154, + "step": 26165 + }, + { + "epoch": 0.6621454057747299, + "grad_norm": 4.303620338439941, + "learning_rate": 2.60879447534395e-06, + "loss": 0.2086, + "step": 26166 + }, + { + "epoch": 0.6621707113394235, + "grad_norm": 4.169969081878662, + "learning_rate": 2.608441854483282e-06, + "loss": 0.1379, + "step": 26167 + }, + { + "epoch": 0.6621960169041172, + "grad_norm": 5.16085958480835, + "learning_rate": 2.608089249045328e-06, + "loss": 0.1776, + "step": 26168 + }, + { + "epoch": 0.6622213224688109, + "grad_norm": 2.90301513671875, + "learning_rate": 2.607736659032361e-06, + "loss": 0.1421, + "step": 26169 + }, + { + "epoch": 0.6622466280335045, + "grad_norm": 7.71525239944458, + "learning_rate": 2.6073840844466537e-06, + "loss": 0.2066, + "step": 26170 + }, + { + "epoch": 0.6622719335981982, + "grad_norm": 3.994006872177124, + "learning_rate": 2.6070315252904833e-06, + "loss": 0.1223, + "step": 26171 + }, + { + "epoch": 0.662297239162892, + "grad_norm": 3.4702296257019043, + "learning_rate": 2.6066789815661215e-06, + "loss": 0.1009, + "step": 26172 + }, + { + "epoch": 0.6623225447275856, + "grad_norm": 3.395568370819092, + "learning_rate": 2.6063264532758414e-06, + "loss": 0.1446, + "step": 26173 + }, + { + "epoch": 0.6623478502922793, + "grad_norm": 3.848316192626953, + "learning_rate": 2.605973940421915e-06, + "loss": 0.1456, + "step": 26174 + }, + { + "epoch": 0.662373155856973, + "grad_norm": 5.126437664031982, + "learning_rate": 2.6056214430066196e-06, + "loss": 0.1776, + "step": 26175 + }, + { + "epoch": 0.6623984614216666, + "grad_norm": 3.924621105194092, + "learning_rate": 2.6052689610322256e-06, + "loss": 0.1118, + "step": 26176 + }, + { + "epoch": 0.6624237669863603, + "grad_norm": 4.906793117523193, + "learning_rate": 2.6049164945010074e-06, + "loss": 0.1268, + "step": 26177 + }, + { + "epoch": 0.662449072551054, + "grad_norm": 24.386123657226562, + "learning_rate": 2.6045640434152347e-06, + "loss": 0.4024, + "step": 26178 + }, + { + "epoch": 0.6624743781157476, + "grad_norm": 2.3274271488189697, + "learning_rate": 2.604211607777185e-06, + "loss": 0.1174, + "step": 26179 + }, + { + "epoch": 0.6624996836804413, + "grad_norm": 7.569928169250488, + "learning_rate": 2.603859187589127e-06, + "loss": 0.2113, + "step": 26180 + }, + { + "epoch": 0.662524989245135, + "grad_norm": 12.542852401733398, + "learning_rate": 2.603506782853339e-06, + "loss": 0.3933, + "step": 26181 + }, + { + "epoch": 0.6625502948098286, + "grad_norm": 5.695400238037109, + "learning_rate": 2.6031543935720864e-06, + "loss": 0.1601, + "step": 26182 + }, + { + "epoch": 0.6625756003745223, + "grad_norm": 5.509397983551025, + "learning_rate": 2.6028020197476473e-06, + "loss": 0.1606, + "step": 26183 + }, + { + "epoch": 0.6626009059392161, + "grad_norm": 3.19639253616333, + "learning_rate": 2.6024496613822896e-06, + "loss": 0.1886, + "step": 26184 + }, + { + "epoch": 0.6626262115039097, + "grad_norm": 6.2523980140686035, + "learning_rate": 2.6020973184782927e-06, + "loss": 0.1177, + "step": 26185 + }, + { + "epoch": 0.6626515170686034, + "grad_norm": 5.261237144470215, + "learning_rate": 2.60174499103792e-06, + "loss": 0.173, + "step": 26186 + }, + { + "epoch": 0.6626768226332971, + "grad_norm": 3.0026612281799316, + "learning_rate": 2.601392679063449e-06, + "loss": 0.1254, + "step": 26187 + }, + { + "epoch": 0.6627021281979908, + "grad_norm": 3.530726671218872, + "learning_rate": 2.6010403825571494e-06, + "loss": 0.1181, + "step": 26188 + }, + { + "epoch": 0.6627274337626844, + "grad_norm": 6.655735492706299, + "learning_rate": 2.600688101521296e-06, + "loss": 0.1607, + "step": 26189 + }, + { + "epoch": 0.6627527393273781, + "grad_norm": 5.185587406158447, + "learning_rate": 2.6003358359581577e-06, + "loss": 0.1963, + "step": 26190 + }, + { + "epoch": 0.6627780448920718, + "grad_norm": 4.189192771911621, + "learning_rate": 2.5999835858700085e-06, + "loss": 0.111, + "step": 26191 + }, + { + "epoch": 0.6628033504567654, + "grad_norm": 6.155755996704102, + "learning_rate": 2.5996313512591175e-06, + "loss": 0.1674, + "step": 26192 + }, + { + "epoch": 0.6628286560214591, + "grad_norm": 4.2991766929626465, + "learning_rate": 2.5992791321277566e-06, + "loss": 0.141, + "step": 26193 + }, + { + "epoch": 0.6628539615861528, + "grad_norm": 2.9033706188201904, + "learning_rate": 2.5989269284781993e-06, + "loss": 0.1017, + "step": 26194 + }, + { + "epoch": 0.6628792671508464, + "grad_norm": 4.0976104736328125, + "learning_rate": 2.598574740312716e-06, + "loss": 0.1506, + "step": 26195 + }, + { + "epoch": 0.6629045727155402, + "grad_norm": 7.212170600891113, + "learning_rate": 2.598222567633577e-06, + "loss": 0.2143, + "step": 26196 + }, + { + "epoch": 0.6629298782802339, + "grad_norm": 6.762791633605957, + "learning_rate": 2.5978704104430526e-06, + "loss": 0.2284, + "step": 26197 + }, + { + "epoch": 0.6629551838449275, + "grad_norm": 5.236374378204346, + "learning_rate": 2.5975182687434166e-06, + "loss": 0.1384, + "step": 26198 + }, + { + "epoch": 0.6629804894096212, + "grad_norm": 3.4387378692626953, + "learning_rate": 2.597166142536939e-06, + "loss": 0.1014, + "step": 26199 + }, + { + "epoch": 0.6630057949743149, + "grad_norm": 10.843996047973633, + "learning_rate": 2.596814031825889e-06, + "loss": 0.2646, + "step": 26200 + }, + { + "epoch": 0.6630311005390085, + "grad_norm": 4.216529846191406, + "learning_rate": 2.5964619366125366e-06, + "loss": 0.1415, + "step": 26201 + }, + { + "epoch": 0.6630564061037022, + "grad_norm": 4.174003601074219, + "learning_rate": 2.596109856899156e-06, + "loss": 0.201, + "step": 26202 + }, + { + "epoch": 0.6630817116683959, + "grad_norm": 7.892714977264404, + "learning_rate": 2.595757792688016e-06, + "loss": 0.1368, + "step": 26203 + }, + { + "epoch": 0.6631070172330895, + "grad_norm": 6.738846778869629, + "learning_rate": 2.595405743981387e-06, + "loss": 0.2025, + "step": 26204 + }, + { + "epoch": 0.6631323227977832, + "grad_norm": 5.431300640106201, + "learning_rate": 2.595053710781537e-06, + "loss": 0.1154, + "step": 26205 + }, + { + "epoch": 0.6631576283624769, + "grad_norm": 10.420145034790039, + "learning_rate": 2.5947016930907397e-06, + "loss": 0.1468, + "step": 26206 + }, + { + "epoch": 0.6631829339271705, + "grad_norm": 2.321051597595215, + "learning_rate": 2.5943496909112646e-06, + "loss": 0.1022, + "step": 26207 + }, + { + "epoch": 0.6632082394918642, + "grad_norm": 4.715545654296875, + "learning_rate": 2.59399770424538e-06, + "loss": 0.1557, + "step": 26208 + }, + { + "epoch": 0.663233545056558, + "grad_norm": 3.354326009750366, + "learning_rate": 2.593645733095356e-06, + "loss": 0.0496, + "step": 26209 + }, + { + "epoch": 0.6632588506212516, + "grad_norm": 4.454140663146973, + "learning_rate": 2.593293777463464e-06, + "loss": 0.1603, + "step": 26210 + }, + { + "epoch": 0.6632841561859453, + "grad_norm": 2.8857297897338867, + "learning_rate": 2.5929418373519712e-06, + "loss": 0.0875, + "step": 26211 + }, + { + "epoch": 0.663309461750639, + "grad_norm": 4.062780380249023, + "learning_rate": 2.5925899127631515e-06, + "loss": 0.1242, + "step": 26212 + }, + { + "epoch": 0.6633347673153327, + "grad_norm": 3.5336966514587402, + "learning_rate": 2.5922380036992687e-06, + "loss": 0.0798, + "step": 26213 + }, + { + "epoch": 0.6633600728800263, + "grad_norm": 4.559921741485596, + "learning_rate": 2.591886110162597e-06, + "loss": 0.0876, + "step": 26214 + }, + { + "epoch": 0.66338537844472, + "grad_norm": 7.194551467895508, + "learning_rate": 2.591534232155402e-06, + "loss": 0.157, + "step": 26215 + }, + { + "epoch": 0.6634106840094137, + "grad_norm": 7.19595193862915, + "learning_rate": 2.591182369679959e-06, + "loss": 0.2593, + "step": 26216 + }, + { + "epoch": 0.6634359895741073, + "grad_norm": 4.596404075622559, + "learning_rate": 2.590830522738528e-06, + "loss": 0.1532, + "step": 26217 + }, + { + "epoch": 0.663461295138801, + "grad_norm": 4.875361442565918, + "learning_rate": 2.5904786913333853e-06, + "loss": 0.1215, + "step": 26218 + }, + { + "epoch": 0.6634866007034947, + "grad_norm": 2.6930041313171387, + "learning_rate": 2.5901268754667975e-06, + "loss": 0.0987, + "step": 26219 + }, + { + "epoch": 0.6635119062681883, + "grad_norm": 5.698309898376465, + "learning_rate": 2.5897750751410323e-06, + "loss": 0.2167, + "step": 26220 + }, + { + "epoch": 0.6635372118328821, + "grad_norm": 7.872657299041748, + "learning_rate": 2.589423290358358e-06, + "loss": 0.2278, + "step": 26221 + }, + { + "epoch": 0.6635625173975758, + "grad_norm": 7.9070725440979, + "learning_rate": 2.589071521121047e-06, + "loss": 0.2314, + "step": 26222 + }, + { + "epoch": 0.6635878229622694, + "grad_norm": 6.325987815856934, + "learning_rate": 2.5887197674313648e-06, + "loss": 0.1777, + "step": 26223 + }, + { + "epoch": 0.6636131285269631, + "grad_norm": 6.215543270111084, + "learning_rate": 2.588368029291578e-06, + "loss": 0.1162, + "step": 26224 + }, + { + "epoch": 0.6636384340916568, + "grad_norm": 3.1053240299224854, + "learning_rate": 2.5880163067039595e-06, + "loss": 0.1811, + "step": 26225 + }, + { + "epoch": 0.6636637396563504, + "grad_norm": 14.605510711669922, + "learning_rate": 2.587664599670775e-06, + "loss": 0.2382, + "step": 26226 + }, + { + "epoch": 0.6636890452210441, + "grad_norm": 5.854811191558838, + "learning_rate": 2.587312908194293e-06, + "loss": 0.1952, + "step": 26227 + }, + { + "epoch": 0.6637143507857378, + "grad_norm": 11.857048988342285, + "learning_rate": 2.586961232276779e-06, + "loss": 0.3066, + "step": 26228 + }, + { + "epoch": 0.6637396563504314, + "grad_norm": 8.44057846069336, + "learning_rate": 2.586609571920507e-06, + "loss": 0.1685, + "step": 26229 + }, + { + "epoch": 0.6637649619151251, + "grad_norm": 6.329578399658203, + "learning_rate": 2.5862579271277373e-06, + "loss": 0.1973, + "step": 26230 + }, + { + "epoch": 0.6637902674798188, + "grad_norm": 3.979236125946045, + "learning_rate": 2.5859062979007433e-06, + "loss": 0.1596, + "step": 26231 + }, + { + "epoch": 0.6638155730445124, + "grad_norm": 2.9289350509643555, + "learning_rate": 2.5855546842417883e-06, + "loss": 0.1267, + "step": 26232 + }, + { + "epoch": 0.6638408786092062, + "grad_norm": 7.019031047821045, + "learning_rate": 2.585203086153144e-06, + "loss": 0.1697, + "step": 26233 + }, + { + "epoch": 0.6638661841738999, + "grad_norm": 7.21956205368042, + "learning_rate": 2.584851503637076e-06, + "loss": 0.1742, + "step": 26234 + }, + { + "epoch": 0.6638914897385935, + "grad_norm": 6.050748825073242, + "learning_rate": 2.5844999366958513e-06, + "loss": 0.2539, + "step": 26235 + }, + { + "epoch": 0.6639167953032872, + "grad_norm": 4.309685707092285, + "learning_rate": 2.584148385331735e-06, + "loss": 0.1576, + "step": 26236 + }, + { + "epoch": 0.6639421008679809, + "grad_norm": 4.039413928985596, + "learning_rate": 2.583796849546999e-06, + "loss": 0.1223, + "step": 26237 + }, + { + "epoch": 0.6639674064326746, + "grad_norm": 4.527225494384766, + "learning_rate": 2.5834453293439065e-06, + "loss": 0.1484, + "step": 26238 + }, + { + "epoch": 0.6639927119973682, + "grad_norm": 5.075899600982666, + "learning_rate": 2.583093824724727e-06, + "loss": 0.1972, + "step": 26239 + }, + { + "epoch": 0.6640180175620619, + "grad_norm": 14.796207427978516, + "learning_rate": 2.5827423356917225e-06, + "loss": 0.2858, + "step": 26240 + }, + { + "epoch": 0.6640433231267556, + "grad_norm": 3.6762821674346924, + "learning_rate": 2.582390862247166e-06, + "loss": 0.1209, + "step": 26241 + }, + { + "epoch": 0.6640686286914492, + "grad_norm": 6.861730098724365, + "learning_rate": 2.5820394043933204e-06, + "loss": 0.1835, + "step": 26242 + }, + { + "epoch": 0.6640939342561429, + "grad_norm": 3.5842816829681396, + "learning_rate": 2.5816879621324535e-06, + "loss": 0.1468, + "step": 26243 + }, + { + "epoch": 0.6641192398208366, + "grad_norm": 9.908251762390137, + "learning_rate": 2.581336535466829e-06, + "loss": 0.4054, + "step": 26244 + }, + { + "epoch": 0.6641445453855302, + "grad_norm": 6.247669219970703, + "learning_rate": 2.5809851243987172e-06, + "loss": 0.1721, + "step": 26245 + }, + { + "epoch": 0.664169850950224, + "grad_norm": 3.8045270442962646, + "learning_rate": 2.580633728930381e-06, + "loss": 0.1408, + "step": 26246 + }, + { + "epoch": 0.6641951565149177, + "grad_norm": 8.915538787841797, + "learning_rate": 2.5802823490640907e-06, + "loss": 0.3153, + "step": 26247 + }, + { + "epoch": 0.6642204620796113, + "grad_norm": 11.889398574829102, + "learning_rate": 2.5799309848021057e-06, + "loss": 0.1608, + "step": 26248 + }, + { + "epoch": 0.664245767644305, + "grad_norm": 4.771116733551025, + "learning_rate": 2.579579636146698e-06, + "loss": 0.1573, + "step": 26249 + }, + { + "epoch": 0.6642710732089987, + "grad_norm": 2.971219539642334, + "learning_rate": 2.579228303100131e-06, + "loss": 0.1265, + "step": 26250 + }, + { + "epoch": 0.6642963787736923, + "grad_norm": 23.842464447021484, + "learning_rate": 2.57887698566467e-06, + "loss": 0.1822, + "step": 26251 + }, + { + "epoch": 0.664321684338386, + "grad_norm": 6.727644920349121, + "learning_rate": 2.578525683842579e-06, + "loss": 0.1625, + "step": 26252 + }, + { + "epoch": 0.6643469899030797, + "grad_norm": 6.167111873626709, + "learning_rate": 2.578174397636127e-06, + "loss": 0.1708, + "step": 26253 + }, + { + "epoch": 0.6643722954677733, + "grad_norm": 6.749136447906494, + "learning_rate": 2.577823127047579e-06, + "loss": 0.1218, + "step": 26254 + }, + { + "epoch": 0.664397601032467, + "grad_norm": 6.002925395965576, + "learning_rate": 2.577471872079197e-06, + "loss": 0.2135, + "step": 26255 + }, + { + "epoch": 0.6644229065971607, + "grad_norm": 8.592286109924316, + "learning_rate": 2.577120632733251e-06, + "loss": 0.1946, + "step": 26256 + }, + { + "epoch": 0.6644482121618543, + "grad_norm": 2.7104814052581787, + "learning_rate": 2.5767694090120003e-06, + "loss": 0.1222, + "step": 26257 + }, + { + "epoch": 0.6644735177265481, + "grad_norm": 3.134644031524658, + "learning_rate": 2.5764182009177153e-06, + "loss": 0.1339, + "step": 26258 + }, + { + "epoch": 0.6644988232912418, + "grad_norm": 3.664229154586792, + "learning_rate": 2.5760670084526556e-06, + "loss": 0.1745, + "step": 26259 + }, + { + "epoch": 0.6645241288559354, + "grad_norm": 3.3586857318878174, + "learning_rate": 2.5757158316190935e-06, + "loss": 0.1119, + "step": 26260 + }, + { + "epoch": 0.6645494344206291, + "grad_norm": 5.850732803344727, + "learning_rate": 2.5753646704192846e-06, + "loss": 0.158, + "step": 26261 + }, + { + "epoch": 0.6645747399853228, + "grad_norm": 4.332173824310303, + "learning_rate": 2.5750135248554997e-06, + "loss": 0.1638, + "step": 26262 + }, + { + "epoch": 0.6646000455500164, + "grad_norm": 6.398566722869873, + "learning_rate": 2.5746623949299997e-06, + "loss": 0.1968, + "step": 26263 + }, + { + "epoch": 0.6646253511147101, + "grad_norm": 5.001123905181885, + "learning_rate": 2.574311280645053e-06, + "loss": 0.2189, + "step": 26264 + }, + { + "epoch": 0.6646506566794038, + "grad_norm": 3.0898489952087402, + "learning_rate": 2.573960182002922e-06, + "loss": 0.1631, + "step": 26265 + }, + { + "epoch": 0.6646759622440975, + "grad_norm": 2.933943033218384, + "learning_rate": 2.5736090990058702e-06, + "loss": 0.0996, + "step": 26266 + }, + { + "epoch": 0.6647012678087911, + "grad_norm": 3.8577511310577393, + "learning_rate": 2.5732580316561595e-06, + "loss": 0.1379, + "step": 26267 + }, + { + "epoch": 0.6647265733734848, + "grad_norm": 3.869324207305908, + "learning_rate": 2.572906979956059e-06, + "loss": 0.1439, + "step": 26268 + }, + { + "epoch": 0.6647518789381786, + "grad_norm": 8.020236015319824, + "learning_rate": 2.5725559439078297e-06, + "loss": 0.136, + "step": 26269 + }, + { + "epoch": 0.6647771845028722, + "grad_norm": 9.092336654663086, + "learning_rate": 2.572204923513735e-06, + "loss": 0.2135, + "step": 26270 + }, + { + "epoch": 0.6648024900675659, + "grad_norm": 13.05438232421875, + "learning_rate": 2.5718539187760383e-06, + "loss": 0.2377, + "step": 26271 + }, + { + "epoch": 0.6648277956322596, + "grad_norm": 8.779373168945312, + "learning_rate": 2.571502929697005e-06, + "loss": 0.1472, + "step": 26272 + }, + { + "epoch": 0.6648531011969532, + "grad_norm": 8.600433349609375, + "learning_rate": 2.5711519562788977e-06, + "loss": 0.2334, + "step": 26273 + }, + { + "epoch": 0.6648784067616469, + "grad_norm": 4.629825592041016, + "learning_rate": 2.57080099852398e-06, + "loss": 0.1018, + "step": 26274 + }, + { + "epoch": 0.6649037123263406, + "grad_norm": 4.082996368408203, + "learning_rate": 2.570450056434513e-06, + "loss": 0.1258, + "step": 26275 + }, + { + "epoch": 0.6649290178910342, + "grad_norm": 8.577727317810059, + "learning_rate": 2.5700991300127633e-06, + "loss": 0.2974, + "step": 26276 + }, + { + "epoch": 0.6649543234557279, + "grad_norm": 3.7904703617095947, + "learning_rate": 2.5697482192609923e-06, + "loss": 0.1234, + "step": 26277 + }, + { + "epoch": 0.6649796290204216, + "grad_norm": 3.398665189743042, + "learning_rate": 2.5693973241814628e-06, + "loss": 0.1115, + "step": 26278 + }, + { + "epoch": 0.6650049345851152, + "grad_norm": 3.4483261108398438, + "learning_rate": 2.5690464447764364e-06, + "loss": 0.1447, + "step": 26279 + }, + { + "epoch": 0.6650302401498089, + "grad_norm": 2.7719244956970215, + "learning_rate": 2.5686955810481784e-06, + "loss": 0.1318, + "step": 26280 + }, + { + "epoch": 0.6650555457145026, + "grad_norm": 2.8110623359680176, + "learning_rate": 2.568344732998951e-06, + "loss": 0.1351, + "step": 26281 + }, + { + "epoch": 0.6650808512791963, + "grad_norm": 2.6011734008789062, + "learning_rate": 2.5679939006310164e-06, + "loss": 0.124, + "step": 26282 + }, + { + "epoch": 0.66510615684389, + "grad_norm": 4.995747089385986, + "learning_rate": 2.5676430839466337e-06, + "loss": 0.12, + "step": 26283 + }, + { + "epoch": 0.6651314624085837, + "grad_norm": 4.507065296173096, + "learning_rate": 2.5672922829480714e-06, + "loss": 0.1885, + "step": 26284 + }, + { + "epoch": 0.6651567679732773, + "grad_norm": 2.6162571907043457, + "learning_rate": 2.5669414976375884e-06, + "loss": 0.112, + "step": 26285 + }, + { + "epoch": 0.665182073537971, + "grad_norm": 5.780365943908691, + "learning_rate": 2.5665907280174446e-06, + "loss": 0.218, + "step": 26286 + }, + { + "epoch": 0.6652073791026647, + "grad_norm": 4.570653438568115, + "learning_rate": 2.5662399740899084e-06, + "loss": 0.1565, + "step": 26287 + }, + { + "epoch": 0.6652326846673583, + "grad_norm": 4.424610137939453, + "learning_rate": 2.5658892358572345e-06, + "loss": 0.1502, + "step": 26288 + }, + { + "epoch": 0.665257990232052, + "grad_norm": 4.50053596496582, + "learning_rate": 2.5655385133216905e-06, + "loss": 0.1282, + "step": 26289 + }, + { + "epoch": 0.6652832957967457, + "grad_norm": 6.785054683685303, + "learning_rate": 2.565187806485533e-06, + "loss": 0.2046, + "step": 26290 + }, + { + "epoch": 0.6653086013614394, + "grad_norm": 5.755895137786865, + "learning_rate": 2.5648371153510304e-06, + "loss": 0.1166, + "step": 26291 + }, + { + "epoch": 0.665333906926133, + "grad_norm": 3.0346999168395996, + "learning_rate": 2.564486439920436e-06, + "loss": 0.1498, + "step": 26292 + }, + { + "epoch": 0.6653592124908267, + "grad_norm": 7.504487037658691, + "learning_rate": 2.5641357801960186e-06, + "loss": 0.1721, + "step": 26293 + }, + { + "epoch": 0.6653845180555205, + "grad_norm": 5.290187835693359, + "learning_rate": 2.5637851361800337e-06, + "loss": 0.1098, + "step": 26294 + }, + { + "epoch": 0.6654098236202141, + "grad_norm": 9.270572662353516, + "learning_rate": 2.563434507874749e-06, + "loss": 0.1575, + "step": 26295 + }, + { + "epoch": 0.6654351291849078, + "grad_norm": 6.141510963439941, + "learning_rate": 2.5630838952824188e-06, + "loss": 0.1167, + "step": 26296 + }, + { + "epoch": 0.6654604347496015, + "grad_norm": 4.796929836273193, + "learning_rate": 2.562733298405309e-06, + "loss": 0.1575, + "step": 26297 + }, + { + "epoch": 0.6654857403142951, + "grad_norm": 3.6494476795196533, + "learning_rate": 2.5623827172456765e-06, + "loss": 0.1226, + "step": 26298 + }, + { + "epoch": 0.6655110458789888, + "grad_norm": 14.893733024597168, + "learning_rate": 2.5620321518057867e-06, + "loss": 0.3538, + "step": 26299 + }, + { + "epoch": 0.6655363514436825, + "grad_norm": 2.4205141067504883, + "learning_rate": 2.561681602087898e-06, + "loss": 0.087, + "step": 26300 + }, + { + "epoch": 0.6655616570083761, + "grad_norm": 2.3951070308685303, + "learning_rate": 2.5613310680942705e-06, + "loss": 0.1027, + "step": 26301 + }, + { + "epoch": 0.6655869625730698, + "grad_norm": 12.714308738708496, + "learning_rate": 2.5609805498271643e-06, + "loss": 0.2867, + "step": 26302 + }, + { + "epoch": 0.6656122681377635, + "grad_norm": 3.7576067447662354, + "learning_rate": 2.5606300472888424e-06, + "loss": 0.1833, + "step": 26303 + }, + { + "epoch": 0.6656375737024571, + "grad_norm": 2.8649728298187256, + "learning_rate": 2.5602795604815635e-06, + "loss": 0.0757, + "step": 26304 + }, + { + "epoch": 0.6656628792671508, + "grad_norm": 4.099978923797607, + "learning_rate": 2.559929089407588e-06, + "loss": 0.1678, + "step": 26305 + }, + { + "epoch": 0.6656881848318446, + "grad_norm": 6.538674354553223, + "learning_rate": 2.559578634069173e-06, + "loss": 0.1301, + "step": 26306 + }, + { + "epoch": 0.6657134903965382, + "grad_norm": 4.988570213317871, + "learning_rate": 2.559228194468585e-06, + "loss": 0.1942, + "step": 26307 + }, + { + "epoch": 0.6657387959612319, + "grad_norm": 12.646766662597656, + "learning_rate": 2.5588777706080792e-06, + "loss": 0.1612, + "step": 26308 + }, + { + "epoch": 0.6657641015259256, + "grad_norm": 2.595712900161743, + "learning_rate": 2.558527362489917e-06, + "loss": 0.1215, + "step": 26309 + }, + { + "epoch": 0.6657894070906192, + "grad_norm": 4.207233905792236, + "learning_rate": 2.558176970116356e-06, + "loss": 0.1301, + "step": 26310 + }, + { + "epoch": 0.6658147126553129, + "grad_norm": 8.811513900756836, + "learning_rate": 2.557826593489659e-06, + "loss": 0.2209, + "step": 26311 + }, + { + "epoch": 0.6658400182200066, + "grad_norm": 13.412796020507812, + "learning_rate": 2.557476232612084e-06, + "loss": 0.171, + "step": 26312 + }, + { + "epoch": 0.6658653237847002, + "grad_norm": 4.749171733856201, + "learning_rate": 2.5571258874858907e-06, + "loss": 0.2412, + "step": 26313 + }, + { + "epoch": 0.6658906293493939, + "grad_norm": 5.320592403411865, + "learning_rate": 2.556775558113338e-06, + "loss": 0.1653, + "step": 26314 + }, + { + "epoch": 0.6659159349140876, + "grad_norm": 5.003674030303955, + "learning_rate": 2.5564252444966835e-06, + "loss": 0.1913, + "step": 26315 + }, + { + "epoch": 0.6659412404787813, + "grad_norm": 4.486452579498291, + "learning_rate": 2.5560749466381897e-06, + "loss": 0.128, + "step": 26316 + }, + { + "epoch": 0.6659665460434749, + "grad_norm": 6.4618096351623535, + "learning_rate": 2.5557246645401123e-06, + "loss": 0.21, + "step": 26317 + }, + { + "epoch": 0.6659918516081687, + "grad_norm": 4.184409141540527, + "learning_rate": 2.555374398204716e-06, + "loss": 0.0953, + "step": 26318 + }, + { + "epoch": 0.6660171571728624, + "grad_norm": 7.122406482696533, + "learning_rate": 2.5550241476342508e-06, + "loss": 0.2095, + "step": 26319 + }, + { + "epoch": 0.666042462737556, + "grad_norm": 3.6422183513641357, + "learning_rate": 2.5546739128309826e-06, + "loss": 0.1943, + "step": 26320 + }, + { + "epoch": 0.6660677683022497, + "grad_norm": 5.9502339363098145, + "learning_rate": 2.554323693797165e-06, + "loss": 0.1705, + "step": 26321 + }, + { + "epoch": 0.6660930738669434, + "grad_norm": 5.640101909637451, + "learning_rate": 2.5539734905350633e-06, + "loss": 0.1294, + "step": 26322 + }, + { + "epoch": 0.666118379431637, + "grad_norm": 5.298834800720215, + "learning_rate": 2.553623303046928e-06, + "loss": 0.1342, + "step": 26323 + }, + { + "epoch": 0.6661436849963307, + "grad_norm": 3.758840799331665, + "learning_rate": 2.553273131335022e-06, + "loss": 0.0995, + "step": 26324 + }, + { + "epoch": 0.6661689905610244, + "grad_norm": 2.828388214111328, + "learning_rate": 2.5529229754016018e-06, + "loss": 0.0943, + "step": 26325 + }, + { + "epoch": 0.666194296125718, + "grad_norm": 5.990391731262207, + "learning_rate": 2.5525728352489287e-06, + "loss": 0.2358, + "step": 26326 + }, + { + "epoch": 0.6662196016904117, + "grad_norm": 4.378051280975342, + "learning_rate": 2.5522227108792552e-06, + "loss": 0.1777, + "step": 26327 + }, + { + "epoch": 0.6662449072551054, + "grad_norm": 5.370312213897705, + "learning_rate": 2.5518726022948437e-06, + "loss": 0.1598, + "step": 26328 + }, + { + "epoch": 0.666270212819799, + "grad_norm": 4.696080207824707, + "learning_rate": 2.551522509497948e-06, + "loss": 0.113, + "step": 26329 + }, + { + "epoch": 0.6662955183844927, + "grad_norm": 3.183199882507324, + "learning_rate": 2.5511724324908304e-06, + "loss": 0.1157, + "step": 26330 + }, + { + "epoch": 0.6663208239491865, + "grad_norm": 12.078948020935059, + "learning_rate": 2.5508223712757462e-06, + "loss": 0.2499, + "step": 26331 + }, + { + "epoch": 0.6663461295138801, + "grad_norm": 6.538997173309326, + "learning_rate": 2.5504723258549534e-06, + "loss": 0.211, + "step": 26332 + }, + { + "epoch": 0.6663714350785738, + "grad_norm": 7.041845321655273, + "learning_rate": 2.550122296230707e-06, + "loss": 0.228, + "step": 26333 + }, + { + "epoch": 0.6663967406432675, + "grad_norm": 2.4825706481933594, + "learning_rate": 2.5497722824052673e-06, + "loss": 0.0587, + "step": 26334 + }, + { + "epoch": 0.6664220462079611, + "grad_norm": 6.146083354949951, + "learning_rate": 2.5494222843808913e-06, + "loss": 0.1458, + "step": 26335 + }, + { + "epoch": 0.6664473517726548, + "grad_norm": 4.350661754608154, + "learning_rate": 2.549072302159835e-06, + "loss": 0.136, + "step": 26336 + }, + { + "epoch": 0.6664726573373485, + "grad_norm": 4.545306205749512, + "learning_rate": 2.548722335744353e-06, + "loss": 0.132, + "step": 26337 + }, + { + "epoch": 0.6664979629020421, + "grad_norm": 3.9975225925445557, + "learning_rate": 2.548372385136707e-06, + "loss": 0.1786, + "step": 26338 + }, + { + "epoch": 0.6665232684667358, + "grad_norm": 3.6353797912597656, + "learning_rate": 2.5480224503391515e-06, + "loss": 0.0772, + "step": 26339 + }, + { + "epoch": 0.6665485740314295, + "grad_norm": 3.996534585952759, + "learning_rate": 2.547672531353943e-06, + "loss": 0.1966, + "step": 26340 + }, + { + "epoch": 0.6665738795961232, + "grad_norm": 6.190954685211182, + "learning_rate": 2.5473226281833387e-06, + "loss": 0.1698, + "step": 26341 + }, + { + "epoch": 0.6665991851608168, + "grad_norm": 2.822744846343994, + "learning_rate": 2.546972740829592e-06, + "loss": 0.1241, + "step": 26342 + }, + { + "epoch": 0.6666244907255106, + "grad_norm": 3.8506407737731934, + "learning_rate": 2.5466228692949633e-06, + "loss": 0.1492, + "step": 26343 + }, + { + "epoch": 0.6666497962902043, + "grad_norm": 2.9879064559936523, + "learning_rate": 2.5462730135817083e-06, + "loss": 0.0975, + "step": 26344 + }, + { + "epoch": 0.6666751018548979, + "grad_norm": 3.512986183166504, + "learning_rate": 2.545923173692082e-06, + "loss": 0.075, + "step": 26345 + }, + { + "epoch": 0.6667004074195916, + "grad_norm": 5.602177143096924, + "learning_rate": 2.5455733496283384e-06, + "loss": 0.2453, + "step": 26346 + }, + { + "epoch": 0.6667257129842853, + "grad_norm": 8.149504661560059, + "learning_rate": 2.5452235413927374e-06, + "loss": 0.2104, + "step": 26347 + }, + { + "epoch": 0.6667510185489789, + "grad_norm": 5.34820556640625, + "learning_rate": 2.544873748987534e-06, + "loss": 0.1502, + "step": 26348 + }, + { + "epoch": 0.6667763241136726, + "grad_norm": 3.868596076965332, + "learning_rate": 2.5445239724149823e-06, + "loss": 0.1394, + "step": 26349 + }, + { + "epoch": 0.6668016296783663, + "grad_norm": 4.1723151206970215, + "learning_rate": 2.544174211677337e-06, + "loss": 0.153, + "step": 26350 + }, + { + "epoch": 0.6668269352430599, + "grad_norm": 3.909318208694458, + "learning_rate": 2.5438244667768577e-06, + "loss": 0.171, + "step": 26351 + }, + { + "epoch": 0.6668522408077536, + "grad_norm": 4.680474758148193, + "learning_rate": 2.5434747377157954e-06, + "loss": 0.1232, + "step": 26352 + }, + { + "epoch": 0.6668775463724473, + "grad_norm": 3.4659602642059326, + "learning_rate": 2.5431250244964113e-06, + "loss": 0.1569, + "step": 26353 + }, + { + "epoch": 0.6669028519371409, + "grad_norm": 6.276535987854004, + "learning_rate": 2.5427753271209533e-06, + "loss": 0.1536, + "step": 26354 + }, + { + "epoch": 0.6669281575018347, + "grad_norm": 3.7547309398651123, + "learning_rate": 2.542425645591682e-06, + "loss": 0.1817, + "step": 26355 + }, + { + "epoch": 0.6669534630665284, + "grad_norm": 10.70132064819336, + "learning_rate": 2.5420759799108484e-06, + "loss": 0.1691, + "step": 26356 + }, + { + "epoch": 0.666978768631222, + "grad_norm": 7.403083801269531, + "learning_rate": 2.541726330080714e-06, + "loss": 0.2057, + "step": 26357 + }, + { + "epoch": 0.6670040741959157, + "grad_norm": 6.424764633178711, + "learning_rate": 2.5413766961035245e-06, + "loss": 0.1773, + "step": 26358 + }, + { + "epoch": 0.6670293797606094, + "grad_norm": 9.096709251403809, + "learning_rate": 2.5410270779815415e-06, + "loss": 0.2566, + "step": 26359 + }, + { + "epoch": 0.667054685325303, + "grad_norm": 3.1236069202423096, + "learning_rate": 2.540677475717016e-06, + "loss": 0.1116, + "step": 26360 + }, + { + "epoch": 0.6670799908899967, + "grad_norm": 11.053292274475098, + "learning_rate": 2.5403278893122065e-06, + "loss": 0.3526, + "step": 26361 + }, + { + "epoch": 0.6671052964546904, + "grad_norm": 7.8294196128845215, + "learning_rate": 2.539978318769364e-06, + "loss": 0.1918, + "step": 26362 + }, + { + "epoch": 0.667130602019384, + "grad_norm": 8.408609390258789, + "learning_rate": 2.5396287640907434e-06, + "loss": 0.1557, + "step": 26363 + }, + { + "epoch": 0.6671559075840777, + "grad_norm": 4.556580543518066, + "learning_rate": 2.539279225278598e-06, + "loss": 0.1317, + "step": 26364 + }, + { + "epoch": 0.6671812131487714, + "grad_norm": 6.280195236206055, + "learning_rate": 2.5389297023351844e-06, + "loss": 0.1319, + "step": 26365 + }, + { + "epoch": 0.667206518713465, + "grad_norm": 7.596756458282471, + "learning_rate": 2.538580195262756e-06, + "loss": 0.1889, + "step": 26366 + }, + { + "epoch": 0.6672318242781587, + "grad_norm": 3.015345335006714, + "learning_rate": 2.5382307040635657e-06, + "loss": 0.1392, + "step": 26367 + }, + { + "epoch": 0.6672571298428525, + "grad_norm": 8.783825874328613, + "learning_rate": 2.537881228739866e-06, + "loss": 0.281, + "step": 26368 + }, + { + "epoch": 0.6672824354075462, + "grad_norm": 7.316697120666504, + "learning_rate": 2.537531769293914e-06, + "loss": 0.2657, + "step": 26369 + }, + { + "epoch": 0.6673077409722398, + "grad_norm": 4.485616683959961, + "learning_rate": 2.5371823257279615e-06, + "loss": 0.141, + "step": 26370 + }, + { + "epoch": 0.6673330465369335, + "grad_norm": 11.348413467407227, + "learning_rate": 2.5368328980442627e-06, + "loss": 0.1908, + "step": 26371 + }, + { + "epoch": 0.6673583521016272, + "grad_norm": 3.073560953140259, + "learning_rate": 2.5364834862450693e-06, + "loss": 0.0996, + "step": 26372 + }, + { + "epoch": 0.6673836576663208, + "grad_norm": 3.5692014694213867, + "learning_rate": 2.536134090332635e-06, + "loss": 0.169, + "step": 26373 + }, + { + "epoch": 0.6674089632310145, + "grad_norm": 6.572314739227295, + "learning_rate": 2.5357847103092146e-06, + "loss": 0.2469, + "step": 26374 + }, + { + "epoch": 0.6674342687957082, + "grad_norm": 7.057077884674072, + "learning_rate": 2.5354353461770604e-06, + "loss": 0.225, + "step": 26375 + }, + { + "epoch": 0.6674595743604018, + "grad_norm": 9.418415069580078, + "learning_rate": 2.5350859979384253e-06, + "loss": 0.2242, + "step": 26376 + }, + { + "epoch": 0.6674848799250955, + "grad_norm": 11.707533836364746, + "learning_rate": 2.53473666559556e-06, + "loss": 0.244, + "step": 26377 + }, + { + "epoch": 0.6675101854897892, + "grad_norm": 3.1994106769561768, + "learning_rate": 2.5343873491507215e-06, + "loss": 0.1658, + "step": 26378 + }, + { + "epoch": 0.6675354910544828, + "grad_norm": 8.15149974822998, + "learning_rate": 2.53403804860616e-06, + "loss": 0.3409, + "step": 26379 + }, + { + "epoch": 0.6675607966191766, + "grad_norm": 5.603311061859131, + "learning_rate": 2.533688763964129e-06, + "loss": 0.2159, + "step": 26380 + }, + { + "epoch": 0.6675861021838703, + "grad_norm": 4.059167385101318, + "learning_rate": 2.533339495226878e-06, + "loss": 0.149, + "step": 26381 + }, + { + "epoch": 0.6676114077485639, + "grad_norm": 3.198230028152466, + "learning_rate": 2.532990242396664e-06, + "loss": 0.1292, + "step": 26382 + }, + { + "epoch": 0.6676367133132576, + "grad_norm": 2.4328064918518066, + "learning_rate": 2.532641005475735e-06, + "loss": 0.0766, + "step": 26383 + }, + { + "epoch": 0.6676620188779513, + "grad_norm": 3.3816890716552734, + "learning_rate": 2.5322917844663485e-06, + "loss": 0.138, + "step": 26384 + }, + { + "epoch": 0.6676873244426449, + "grad_norm": 2.719442844390869, + "learning_rate": 2.5319425793707504e-06, + "loss": 0.1237, + "step": 26385 + }, + { + "epoch": 0.6677126300073386, + "grad_norm": 10.111587524414062, + "learning_rate": 2.531593390191197e-06, + "loss": 0.2098, + "step": 26386 + }, + { + "epoch": 0.6677379355720323, + "grad_norm": 2.5308761596679688, + "learning_rate": 2.5312442169299366e-06, + "loss": 0.1059, + "step": 26387 + }, + { + "epoch": 0.6677632411367259, + "grad_norm": 9.739313125610352, + "learning_rate": 2.5308950595892268e-06, + "loss": 0.1584, + "step": 26388 + }, + { + "epoch": 0.6677885467014196, + "grad_norm": 5.855799198150635, + "learning_rate": 2.530545918171312e-06, + "loss": 0.2157, + "step": 26389 + }, + { + "epoch": 0.6678138522661133, + "grad_norm": 12.314041137695312, + "learning_rate": 2.5301967926784486e-06, + "loss": 0.3005, + "step": 26390 + }, + { + "epoch": 0.6678391578308069, + "grad_norm": 2.4380815029144287, + "learning_rate": 2.5298476831128855e-06, + "loss": 0.1351, + "step": 26391 + }, + { + "epoch": 0.6678644633955007, + "grad_norm": 3.444780111312866, + "learning_rate": 2.5294985894768765e-06, + "loss": 0.1189, + "step": 26392 + }, + { + "epoch": 0.6678897689601944, + "grad_norm": 4.315231800079346, + "learning_rate": 2.5291495117726717e-06, + "loss": 0.1952, + "step": 26393 + }, + { + "epoch": 0.6679150745248881, + "grad_norm": 4.537519454956055, + "learning_rate": 2.528800450002522e-06, + "loss": 0.1339, + "step": 26394 + }, + { + "epoch": 0.6679403800895817, + "grad_norm": 5.341558933258057, + "learning_rate": 2.5284514041686768e-06, + "loss": 0.2199, + "step": 26395 + }, + { + "epoch": 0.6679656856542754, + "grad_norm": 5.546751499176025, + "learning_rate": 2.528102374273391e-06, + "loss": 0.2304, + "step": 26396 + }, + { + "epoch": 0.6679909912189691, + "grad_norm": 7.366486072540283, + "learning_rate": 2.5277533603189125e-06, + "loss": 0.1221, + "step": 26397 + }, + { + "epoch": 0.6680162967836627, + "grad_norm": 4.070568084716797, + "learning_rate": 2.527404362307493e-06, + "loss": 0.1505, + "step": 26398 + }, + { + "epoch": 0.6680416023483564, + "grad_norm": 4.906704425811768, + "learning_rate": 2.5270553802413826e-06, + "loss": 0.1199, + "step": 26399 + }, + { + "epoch": 0.6680669079130501, + "grad_norm": 2.8588688373565674, + "learning_rate": 2.5267064141228304e-06, + "loss": 0.1698, + "step": 26400 + }, + { + "epoch": 0.6680922134777437, + "grad_norm": 3.89088773727417, + "learning_rate": 2.5263574639540907e-06, + "loss": 0.1131, + "step": 26401 + }, + { + "epoch": 0.6681175190424374, + "grad_norm": 3.3828463554382324, + "learning_rate": 2.5260085297374107e-06, + "loss": 0.1409, + "step": 26402 + }, + { + "epoch": 0.6681428246071311, + "grad_norm": 6.107140064239502, + "learning_rate": 2.525659611475042e-06, + "loss": 0.1897, + "step": 26403 + }, + { + "epoch": 0.6681681301718247, + "grad_norm": 6.6350884437561035, + "learning_rate": 2.5253107091692325e-06, + "loss": 0.1986, + "step": 26404 + }, + { + "epoch": 0.6681934357365185, + "grad_norm": 6.3708815574646, + "learning_rate": 2.5249618228222362e-06, + "loss": 0.1801, + "step": 26405 + }, + { + "epoch": 0.6682187413012122, + "grad_norm": 4.031842231750488, + "learning_rate": 2.5246129524362994e-06, + "loss": 0.1524, + "step": 26406 + }, + { + "epoch": 0.6682440468659058, + "grad_norm": 2.66528058052063, + "learning_rate": 2.5242640980136746e-06, + "loss": 0.1044, + "step": 26407 + }, + { + "epoch": 0.6682693524305995, + "grad_norm": 7.01729154586792, + "learning_rate": 2.5239152595566074e-06, + "loss": 0.0963, + "step": 26408 + }, + { + "epoch": 0.6682946579952932, + "grad_norm": 4.2638421058654785, + "learning_rate": 2.523566437067352e-06, + "loss": 0.1644, + "step": 26409 + }, + { + "epoch": 0.6683199635599868, + "grad_norm": 3.196305990219116, + "learning_rate": 2.523217630548156e-06, + "loss": 0.1285, + "step": 26410 + }, + { + "epoch": 0.6683452691246805, + "grad_norm": 5.569238185882568, + "learning_rate": 2.522868840001269e-06, + "loss": 0.2044, + "step": 26411 + }, + { + "epoch": 0.6683705746893742, + "grad_norm": 7.177145481109619, + "learning_rate": 2.522520065428938e-06, + "loss": 0.2361, + "step": 26412 + }, + { + "epoch": 0.6683958802540678, + "grad_norm": 4.766232967376709, + "learning_rate": 2.5221713068334165e-06, + "loss": 0.241, + "step": 26413 + }, + { + "epoch": 0.6684211858187615, + "grad_norm": 4.150908470153809, + "learning_rate": 2.521822564216949e-06, + "loss": 0.1633, + "step": 26414 + }, + { + "epoch": 0.6684464913834552, + "grad_norm": 12.978041648864746, + "learning_rate": 2.5214738375817903e-06, + "loss": 0.1939, + "step": 26415 + }, + { + "epoch": 0.6684717969481488, + "grad_norm": 4.040217876434326, + "learning_rate": 2.521125126930183e-06, + "loss": 0.1597, + "step": 26416 + }, + { + "epoch": 0.6684971025128426, + "grad_norm": 5.543449401855469, + "learning_rate": 2.5207764322643803e-06, + "loss": 0.1833, + "step": 26417 + }, + { + "epoch": 0.6685224080775363, + "grad_norm": 5.116867542266846, + "learning_rate": 2.520427753586627e-06, + "loss": 0.2176, + "step": 26418 + }, + { + "epoch": 0.66854771364223, + "grad_norm": 4.259449481964111, + "learning_rate": 2.520079090899178e-06, + "loss": 0.13, + "step": 26419 + }, + { + "epoch": 0.6685730192069236, + "grad_norm": 2.6494996547698975, + "learning_rate": 2.519730444204273e-06, + "loss": 0.1295, + "step": 26420 + }, + { + "epoch": 0.6685983247716173, + "grad_norm": 6.136088848114014, + "learning_rate": 2.5193818135041675e-06, + "loss": 0.1497, + "step": 26421 + }, + { + "epoch": 0.668623630336311, + "grad_norm": 4.344625473022461, + "learning_rate": 2.519033198801105e-06, + "loss": 0.1552, + "step": 26422 + }, + { + "epoch": 0.6686489359010046, + "grad_norm": 3.182790994644165, + "learning_rate": 2.5186846000973403e-06, + "loss": 0.15, + "step": 26423 + }, + { + "epoch": 0.6686742414656983, + "grad_norm": 5.042574405670166, + "learning_rate": 2.518336017395113e-06, + "loss": 0.1822, + "step": 26424 + }, + { + "epoch": 0.668699547030392, + "grad_norm": 7.330850124359131, + "learning_rate": 2.5179874506966773e-06, + "loss": 0.2574, + "step": 26425 + }, + { + "epoch": 0.6687248525950856, + "grad_norm": 2.620551109313965, + "learning_rate": 2.5176389000042783e-06, + "loss": 0.0873, + "step": 26426 + }, + { + "epoch": 0.6687501581597793, + "grad_norm": 8.177824020385742, + "learning_rate": 2.5172903653201625e-06, + "loss": 0.2273, + "step": 26427 + }, + { + "epoch": 0.668775463724473, + "grad_norm": 6.045597076416016, + "learning_rate": 2.5169418466465813e-06, + "loss": 0.1891, + "step": 26428 + }, + { + "epoch": 0.6688007692891667, + "grad_norm": 6.875011444091797, + "learning_rate": 2.51659334398578e-06, + "loss": 0.2093, + "step": 26429 + }, + { + "epoch": 0.6688260748538604, + "grad_norm": 4.7813029289245605, + "learning_rate": 2.5162448573400068e-06, + "loss": 0.0828, + "step": 26430 + }, + { + "epoch": 0.6688513804185541, + "grad_norm": 5.898963451385498, + "learning_rate": 2.515896386711506e-06, + "loss": 0.202, + "step": 26431 + }, + { + "epoch": 0.6688766859832477, + "grad_norm": 6.156646728515625, + "learning_rate": 2.51554793210253e-06, + "loss": 0.2255, + "step": 26432 + }, + { + "epoch": 0.6689019915479414, + "grad_norm": 4.0340189933776855, + "learning_rate": 2.5151994935153224e-06, + "loss": 0.1427, + "step": 26433 + }, + { + "epoch": 0.6689272971126351, + "grad_norm": 3.3113741874694824, + "learning_rate": 2.514851070952132e-06, + "loss": 0.168, + "step": 26434 + }, + { + "epoch": 0.6689526026773287, + "grad_norm": 5.738648414611816, + "learning_rate": 2.514502664415202e-06, + "loss": 0.1031, + "step": 26435 + }, + { + "epoch": 0.6689779082420224, + "grad_norm": 2.7682435512542725, + "learning_rate": 2.5141542739067844e-06, + "loss": 0.0809, + "step": 26436 + }, + { + "epoch": 0.6690032138067161, + "grad_norm": 5.476468563079834, + "learning_rate": 2.5138058994291233e-06, + "loss": 0.2223, + "step": 26437 + }, + { + "epoch": 0.6690285193714097, + "grad_norm": 3.6458184719085693, + "learning_rate": 2.513457540984465e-06, + "loss": 0.1514, + "step": 26438 + }, + { + "epoch": 0.6690538249361034, + "grad_norm": 5.19697904586792, + "learning_rate": 2.5131091985750556e-06, + "loss": 0.1195, + "step": 26439 + }, + { + "epoch": 0.6690791305007971, + "grad_norm": 9.084080696105957, + "learning_rate": 2.5127608722031437e-06, + "loss": 0.2162, + "step": 26440 + }, + { + "epoch": 0.6691044360654907, + "grad_norm": 3.9394376277923584, + "learning_rate": 2.512412561870974e-06, + "loss": 0.1795, + "step": 26441 + }, + { + "epoch": 0.6691297416301845, + "grad_norm": 3.958909034729004, + "learning_rate": 2.5120642675807937e-06, + "loss": 0.0729, + "step": 26442 + }, + { + "epoch": 0.6691550471948782, + "grad_norm": 10.37289047241211, + "learning_rate": 2.511715989334845e-06, + "loss": 0.1939, + "step": 26443 + }, + { + "epoch": 0.6691803527595719, + "grad_norm": 4.536801338195801, + "learning_rate": 2.51136772713538e-06, + "loss": 0.1696, + "step": 26444 + }, + { + "epoch": 0.6692056583242655, + "grad_norm": 5.14393949508667, + "learning_rate": 2.51101948098464e-06, + "loss": 0.151, + "step": 26445 + }, + { + "epoch": 0.6692309638889592, + "grad_norm": 6.087637424468994, + "learning_rate": 2.510671250884875e-06, + "loss": 0.1336, + "step": 26446 + }, + { + "epoch": 0.6692562694536529, + "grad_norm": 3.301358938217163, + "learning_rate": 2.510323036838325e-06, + "loss": 0.1556, + "step": 26447 + }, + { + "epoch": 0.6692815750183465, + "grad_norm": 5.710696220397949, + "learning_rate": 2.509974838847241e-06, + "loss": 0.1425, + "step": 26448 + }, + { + "epoch": 0.6693068805830402, + "grad_norm": 3.8250014781951904, + "learning_rate": 2.509626656913863e-06, + "loss": 0.1327, + "step": 26449 + }, + { + "epoch": 0.6693321861477339, + "grad_norm": 4.264941692352295, + "learning_rate": 2.5092784910404444e-06, + "loss": 0.1659, + "step": 26450 + }, + { + "epoch": 0.6693574917124275, + "grad_norm": 3.2746798992156982, + "learning_rate": 2.508930341229221e-06, + "loss": 0.1353, + "step": 26451 + }, + { + "epoch": 0.6693827972771212, + "grad_norm": 5.242785453796387, + "learning_rate": 2.5085822074824445e-06, + "loss": 0.1946, + "step": 26452 + }, + { + "epoch": 0.669408102841815, + "grad_norm": 3.53901743888855, + "learning_rate": 2.508234089802356e-06, + "loss": 0.1492, + "step": 26453 + }, + { + "epoch": 0.6694334084065086, + "grad_norm": 5.522928714752197, + "learning_rate": 2.5078859881912064e-06, + "loss": 0.1113, + "step": 26454 + }, + { + "epoch": 0.6694587139712023, + "grad_norm": 2.6120526790618896, + "learning_rate": 2.5075379026512326e-06, + "loss": 0.1079, + "step": 26455 + }, + { + "epoch": 0.669484019535896, + "grad_norm": 5.023499965667725, + "learning_rate": 2.5071898331846843e-06, + "loss": 0.1847, + "step": 26456 + }, + { + "epoch": 0.6695093251005896, + "grad_norm": 6.372343063354492, + "learning_rate": 2.506841779793806e-06, + "loss": 0.2326, + "step": 26457 + }, + { + "epoch": 0.6695346306652833, + "grad_norm": 3.424879789352417, + "learning_rate": 2.506493742480839e-06, + "loss": 0.0837, + "step": 26458 + }, + { + "epoch": 0.669559936229977, + "grad_norm": 5.398819923400879, + "learning_rate": 2.506145721248032e-06, + "loss": 0.1699, + "step": 26459 + }, + { + "epoch": 0.6695852417946706, + "grad_norm": 4.78560209274292, + "learning_rate": 2.5057977160976277e-06, + "loss": 0.1521, + "step": 26460 + }, + { + "epoch": 0.6696105473593643, + "grad_norm": 4.992733955383301, + "learning_rate": 2.50544972703187e-06, + "loss": 0.1673, + "step": 26461 + }, + { + "epoch": 0.669635852924058, + "grad_norm": 4.631638526916504, + "learning_rate": 2.5051017540530003e-06, + "loss": 0.1392, + "step": 26462 + }, + { + "epoch": 0.6696611584887516, + "grad_norm": 4.03119421005249, + "learning_rate": 2.5047537971632696e-06, + "loss": 0.145, + "step": 26463 + }, + { + "epoch": 0.6696864640534453, + "grad_norm": 7.833138942718506, + "learning_rate": 2.5044058563649143e-06, + "loss": 0.2054, + "step": 26464 + }, + { + "epoch": 0.669711769618139, + "grad_norm": 7.653567314147949, + "learning_rate": 2.504057931660182e-06, + "loss": 0.2281, + "step": 26465 + }, + { + "epoch": 0.6697370751828327, + "grad_norm": 2.4514412879943848, + "learning_rate": 2.5037100230513153e-06, + "loss": 0.1328, + "step": 26466 + }, + { + "epoch": 0.6697623807475264, + "grad_norm": 5.221940994262695, + "learning_rate": 2.503362130540559e-06, + "loss": 0.2044, + "step": 26467 + }, + { + "epoch": 0.6697876863122201, + "grad_norm": 7.067721843719482, + "learning_rate": 2.5030142541301566e-06, + "loss": 0.2102, + "step": 26468 + }, + { + "epoch": 0.6698129918769138, + "grad_norm": 4.71354866027832, + "learning_rate": 2.5026663938223506e-06, + "loss": 0.1425, + "step": 26469 + }, + { + "epoch": 0.6698382974416074, + "grad_norm": 4.528562545776367, + "learning_rate": 2.5023185496193827e-06, + "loss": 0.1152, + "step": 26470 + }, + { + "epoch": 0.6698636030063011, + "grad_norm": 4.046977519989014, + "learning_rate": 2.5019707215235e-06, + "loss": 0.148, + "step": 26471 + }, + { + "epoch": 0.6698889085709948, + "grad_norm": 4.180495738983154, + "learning_rate": 2.5016229095369425e-06, + "loss": 0.1262, + "step": 26472 + }, + { + "epoch": 0.6699142141356884, + "grad_norm": 8.59404182434082, + "learning_rate": 2.501275113661955e-06, + "loss": 0.2047, + "step": 26473 + }, + { + "epoch": 0.6699395197003821, + "grad_norm": 4.100700378417969, + "learning_rate": 2.5009273339007776e-06, + "loss": 0.164, + "step": 26474 + }, + { + "epoch": 0.6699648252650758, + "grad_norm": 4.392152786254883, + "learning_rate": 2.5005795702556566e-06, + "loss": 0.1259, + "step": 26475 + }, + { + "epoch": 0.6699901308297694, + "grad_norm": 7.407740592956543, + "learning_rate": 2.500231822728833e-06, + "loss": 0.1148, + "step": 26476 + }, + { + "epoch": 0.6700154363944631, + "grad_norm": 11.00946044921875, + "learning_rate": 2.4998840913225497e-06, + "loss": 0.1861, + "step": 26477 + }, + { + "epoch": 0.6700407419591569, + "grad_norm": 8.61085033416748, + "learning_rate": 2.4995363760390467e-06, + "loss": 0.1523, + "step": 26478 + }, + { + "epoch": 0.6700660475238505, + "grad_norm": 6.657398700714111, + "learning_rate": 2.49918867688057e-06, + "loss": 0.1584, + "step": 26479 + }, + { + "epoch": 0.6700913530885442, + "grad_norm": 3.0192153453826904, + "learning_rate": 2.498840993849359e-06, + "loss": 0.1214, + "step": 26480 + }, + { + "epoch": 0.6701166586532379, + "grad_norm": 3.4650301933288574, + "learning_rate": 2.498493326947661e-06, + "loss": 0.1198, + "step": 26481 + }, + { + "epoch": 0.6701419642179315, + "grad_norm": 4.692121982574463, + "learning_rate": 2.49814567617771e-06, + "loss": 0.1716, + "step": 26482 + }, + { + "epoch": 0.6701672697826252, + "grad_norm": 2.9307124614715576, + "learning_rate": 2.4977980415417543e-06, + "loss": 0.1006, + "step": 26483 + }, + { + "epoch": 0.6701925753473189, + "grad_norm": 3.938302755355835, + "learning_rate": 2.4974504230420337e-06, + "loss": 0.0753, + "step": 26484 + }, + { + "epoch": 0.6702178809120125, + "grad_norm": 3.5316965579986572, + "learning_rate": 2.4971028206807894e-06, + "loss": 0.0594, + "step": 26485 + }, + { + "epoch": 0.6702431864767062, + "grad_norm": 4.501029014587402, + "learning_rate": 2.496755234460262e-06, + "loss": 0.1531, + "step": 26486 + }, + { + "epoch": 0.6702684920413999, + "grad_norm": 9.895593643188477, + "learning_rate": 2.496407664382696e-06, + "loss": 0.2809, + "step": 26487 + }, + { + "epoch": 0.6702937976060935, + "grad_norm": 3.2026848793029785, + "learning_rate": 2.496060110450331e-06, + "loss": 0.1757, + "step": 26488 + }, + { + "epoch": 0.6703191031707872, + "grad_norm": 8.520330429077148, + "learning_rate": 2.4957125726654074e-06, + "loss": 0.2655, + "step": 26489 + }, + { + "epoch": 0.670344408735481, + "grad_norm": 4.358343124389648, + "learning_rate": 2.495365051030169e-06, + "loss": 0.1396, + "step": 26490 + }, + { + "epoch": 0.6703697143001746, + "grad_norm": 8.998560905456543, + "learning_rate": 2.495017545546856e-06, + "loss": 0.1997, + "step": 26491 + }, + { + "epoch": 0.6703950198648683, + "grad_norm": 4.784887790679932, + "learning_rate": 2.4946700562177085e-06, + "loss": 0.1415, + "step": 26492 + }, + { + "epoch": 0.670420325429562, + "grad_norm": 4.410022258758545, + "learning_rate": 2.494322583044966e-06, + "loss": 0.1586, + "step": 26493 + }, + { + "epoch": 0.6704456309942556, + "grad_norm": 3.87035870552063, + "learning_rate": 2.4939751260308747e-06, + "loss": 0.1543, + "step": 26494 + }, + { + "epoch": 0.6704709365589493, + "grad_norm": 6.697814464569092, + "learning_rate": 2.4936276851776682e-06, + "loss": 0.1876, + "step": 26495 + }, + { + "epoch": 0.670496242123643, + "grad_norm": 6.7695183753967285, + "learning_rate": 2.4932802604875923e-06, + "loss": 0.1635, + "step": 26496 + }, + { + "epoch": 0.6705215476883367, + "grad_norm": 8.124785423278809, + "learning_rate": 2.492932851962884e-06, + "loss": 0.143, + "step": 26497 + }, + { + "epoch": 0.6705468532530303, + "grad_norm": 4.1453423500061035, + "learning_rate": 2.4925854596057865e-06, + "loss": 0.146, + "step": 26498 + }, + { + "epoch": 0.670572158817724, + "grad_norm": 3.2807095050811768, + "learning_rate": 2.4922380834185395e-06, + "loss": 0.133, + "step": 26499 + }, + { + "epoch": 0.6705974643824177, + "grad_norm": 7.143640518188477, + "learning_rate": 2.4918907234033822e-06, + "loss": 0.194, + "step": 26500 + }, + { + "epoch": 0.6706227699471113, + "grad_norm": 6.929810523986816, + "learning_rate": 2.4915433795625537e-06, + "loss": 0.1819, + "step": 26501 + }, + { + "epoch": 0.670648075511805, + "grad_norm": 5.40300178527832, + "learning_rate": 2.4911960518982976e-06, + "loss": 0.1112, + "step": 26502 + }, + { + "epoch": 0.6706733810764988, + "grad_norm": 2.7312822341918945, + "learning_rate": 2.4908487404128508e-06, + "loss": 0.1115, + "step": 26503 + }, + { + "epoch": 0.6706986866411924, + "grad_norm": 3.042353868484497, + "learning_rate": 2.4905014451084534e-06, + "loss": 0.1136, + "step": 26504 + }, + { + "epoch": 0.6707239922058861, + "grad_norm": 4.393357276916504, + "learning_rate": 2.490154165987344e-06, + "loss": 0.1523, + "step": 26505 + }, + { + "epoch": 0.6707492977705798, + "grad_norm": 2.612064838409424, + "learning_rate": 2.4898069030517656e-06, + "loss": 0.1385, + "step": 26506 + }, + { + "epoch": 0.6707746033352734, + "grad_norm": 5.532361030578613, + "learning_rate": 2.489459656303955e-06, + "loss": 0.1282, + "step": 26507 + }, + { + "epoch": 0.6707999088999671, + "grad_norm": 4.336806774139404, + "learning_rate": 2.489112425746153e-06, + "loss": 0.2056, + "step": 26508 + }, + { + "epoch": 0.6708252144646608, + "grad_norm": 6.674292087554932, + "learning_rate": 2.4887652113805955e-06, + "loss": 0.2063, + "step": 26509 + }, + { + "epoch": 0.6708505200293544, + "grad_norm": 4.400782108306885, + "learning_rate": 2.488418013209526e-06, + "loss": 0.1901, + "step": 26510 + }, + { + "epoch": 0.6708758255940481, + "grad_norm": 3.027993679046631, + "learning_rate": 2.4880708312351814e-06, + "loss": 0.1404, + "step": 26511 + }, + { + "epoch": 0.6709011311587418, + "grad_norm": 2.477468729019165, + "learning_rate": 2.487723665459801e-06, + "loss": 0.1127, + "step": 26512 + }, + { + "epoch": 0.6709264367234354, + "grad_norm": 2.8003604412078857, + "learning_rate": 2.4873765158856205e-06, + "loss": 0.1301, + "step": 26513 + }, + { + "epoch": 0.6709517422881291, + "grad_norm": 7.456841468811035, + "learning_rate": 2.4870293825148844e-06, + "loss": 0.2145, + "step": 26514 + }, + { + "epoch": 0.6709770478528229, + "grad_norm": 3.4366588592529297, + "learning_rate": 2.4866822653498278e-06, + "loss": 0.1281, + "step": 26515 + }, + { + "epoch": 0.6710023534175165, + "grad_norm": 11.100992202758789, + "learning_rate": 2.4863351643926903e-06, + "loss": 0.3124, + "step": 26516 + }, + { + "epoch": 0.6710276589822102, + "grad_norm": 10.740840911865234, + "learning_rate": 2.4859880796457077e-06, + "loss": 0.1584, + "step": 26517 + }, + { + "epoch": 0.6710529645469039, + "grad_norm": 11.783367156982422, + "learning_rate": 2.4856410111111216e-06, + "loss": 0.2309, + "step": 26518 + }, + { + "epoch": 0.6710782701115975, + "grad_norm": 2.8425910472869873, + "learning_rate": 2.4852939587911695e-06, + "loss": 0.1173, + "step": 26519 + }, + { + "epoch": 0.6711035756762912, + "grad_norm": 6.615452766418457, + "learning_rate": 2.4849469226880875e-06, + "loss": 0.139, + "step": 26520 + }, + { + "epoch": 0.6711288812409849, + "grad_norm": 4.021440505981445, + "learning_rate": 2.4845999028041178e-06, + "loss": 0.1013, + "step": 26521 + }, + { + "epoch": 0.6711541868056786, + "grad_norm": 4.656002998352051, + "learning_rate": 2.484252899141492e-06, + "loss": 0.1275, + "step": 26522 + }, + { + "epoch": 0.6711794923703722, + "grad_norm": 2.9053027629852295, + "learning_rate": 2.4839059117024533e-06, + "loss": 0.1146, + "step": 26523 + }, + { + "epoch": 0.6712047979350659, + "grad_norm": 4.748262405395508, + "learning_rate": 2.483558940489235e-06, + "loss": 0.1527, + "step": 26524 + }, + { + "epoch": 0.6712301034997596, + "grad_norm": 3.451983690261841, + "learning_rate": 2.4832119855040815e-06, + "loss": 0.1273, + "step": 26525 + }, + { + "epoch": 0.6712554090644532, + "grad_norm": 3.7815496921539307, + "learning_rate": 2.4828650467492214e-06, + "loss": 0.0922, + "step": 26526 + }, + { + "epoch": 0.671280714629147, + "grad_norm": 4.532093048095703, + "learning_rate": 2.4825181242268986e-06, + "loss": 0.1835, + "step": 26527 + }, + { + "epoch": 0.6713060201938407, + "grad_norm": 4.659524440765381, + "learning_rate": 2.482171217939346e-06, + "loss": 0.1741, + "step": 26528 + }, + { + "epoch": 0.6713313257585343, + "grad_norm": 3.017937660217285, + "learning_rate": 2.4818243278888064e-06, + "loss": 0.1452, + "step": 26529 + }, + { + "epoch": 0.671356631323228, + "grad_norm": 2.371950149536133, + "learning_rate": 2.48147745407751e-06, + "loss": 0.0978, + "step": 26530 + }, + { + "epoch": 0.6713819368879217, + "grad_norm": 17.356672286987305, + "learning_rate": 2.4811305965076987e-06, + "loss": 0.2331, + "step": 26531 + }, + { + "epoch": 0.6714072424526153, + "grad_norm": 5.6522345542907715, + "learning_rate": 2.480783755181606e-06, + "loss": 0.1444, + "step": 26532 + }, + { + "epoch": 0.671432548017309, + "grad_norm": 12.898286819458008, + "learning_rate": 2.480436930101472e-06, + "loss": 0.1975, + "step": 26533 + }, + { + "epoch": 0.6714578535820027, + "grad_norm": 6.407020568847656, + "learning_rate": 2.4800901212695316e-06, + "loss": 0.191, + "step": 26534 + }, + { + "epoch": 0.6714831591466963, + "grad_norm": 4.32875919342041, + "learning_rate": 2.4797433286880214e-06, + "loss": 0.0819, + "step": 26535 + }, + { + "epoch": 0.67150846471139, + "grad_norm": 2.590768337249756, + "learning_rate": 2.479396552359175e-06, + "loss": 0.1025, + "step": 26536 + }, + { + "epoch": 0.6715337702760837, + "grad_norm": 6.680306434631348, + "learning_rate": 2.479049792285234e-06, + "loss": 0.1633, + "step": 26537 + }, + { + "epoch": 0.6715590758407773, + "grad_norm": 3.5999460220336914, + "learning_rate": 2.4787030484684314e-06, + "loss": 0.1218, + "step": 26538 + }, + { + "epoch": 0.671584381405471, + "grad_norm": 7.125119209289551, + "learning_rate": 2.4783563209110045e-06, + "loss": 0.2208, + "step": 26539 + }, + { + "epoch": 0.6716096869701648, + "grad_norm": 3.586043357849121, + "learning_rate": 2.4780096096151867e-06, + "loss": 0.1246, + "step": 26540 + }, + { + "epoch": 0.6716349925348584, + "grad_norm": 6.530416488647461, + "learning_rate": 2.477662914583217e-06, + "loss": 0.0973, + "step": 26541 + }, + { + "epoch": 0.6716602980995521, + "grad_norm": 9.048698425292969, + "learning_rate": 2.477316235817331e-06, + "loss": 0.2247, + "step": 26542 + }, + { + "epoch": 0.6716856036642458, + "grad_norm": 9.156543731689453, + "learning_rate": 2.4769695733197624e-06, + "loss": 0.1623, + "step": 26543 + }, + { + "epoch": 0.6717109092289394, + "grad_norm": 13.624733924865723, + "learning_rate": 2.4766229270927463e-06, + "loss": 0.2956, + "step": 26544 + }, + { + "epoch": 0.6717362147936331, + "grad_norm": 6.813737869262695, + "learning_rate": 2.4762762971385214e-06, + "loss": 0.1912, + "step": 26545 + }, + { + "epoch": 0.6717615203583268, + "grad_norm": 9.247900009155273, + "learning_rate": 2.475929683459321e-06, + "loss": 0.1505, + "step": 26546 + }, + { + "epoch": 0.6717868259230205, + "grad_norm": 3.75923490524292, + "learning_rate": 2.4755830860573805e-06, + "loss": 0.1286, + "step": 26547 + }, + { + "epoch": 0.6718121314877141, + "grad_norm": 2.9954211711883545, + "learning_rate": 2.4752365049349354e-06, + "loss": 0.1126, + "step": 26548 + }, + { + "epoch": 0.6718374370524078, + "grad_norm": 19.328678131103516, + "learning_rate": 2.474889940094218e-06, + "loss": 0.1572, + "step": 26549 + }, + { + "epoch": 0.6718627426171015, + "grad_norm": 6.405698299407959, + "learning_rate": 2.4745433915374684e-06, + "loss": 0.1676, + "step": 26550 + }, + { + "epoch": 0.6718880481817951, + "grad_norm": 8.694706916809082, + "learning_rate": 2.4741968592669184e-06, + "loss": 0.1658, + "step": 26551 + }, + { + "epoch": 0.6719133537464889, + "grad_norm": 4.204126358032227, + "learning_rate": 2.4738503432848027e-06, + "loss": 0.1261, + "step": 26552 + }, + { + "epoch": 0.6719386593111826, + "grad_norm": 5.21896505355835, + "learning_rate": 2.4735038435933544e-06, + "loss": 0.0769, + "step": 26553 + }, + { + "epoch": 0.6719639648758762, + "grad_norm": 3.047034978866577, + "learning_rate": 2.4731573601948123e-06, + "loss": 0.0727, + "step": 26554 + }, + { + "epoch": 0.6719892704405699, + "grad_norm": 3.154618978500366, + "learning_rate": 2.4728108930914057e-06, + "loss": 0.1388, + "step": 26555 + }, + { + "epoch": 0.6720145760052636, + "grad_norm": 4.240954399108887, + "learning_rate": 2.472464442285376e-06, + "loss": 0.1853, + "step": 26556 + }, + { + "epoch": 0.6720398815699572, + "grad_norm": 3.9342217445373535, + "learning_rate": 2.4721180077789487e-06, + "loss": 0.151, + "step": 26557 + }, + { + "epoch": 0.6720651871346509, + "grad_norm": 2.7410993576049805, + "learning_rate": 2.4717715895743645e-06, + "loss": 0.1107, + "step": 26558 + }, + { + "epoch": 0.6720904926993446, + "grad_norm": 3.934967279434204, + "learning_rate": 2.4714251876738526e-06, + "loss": 0.1711, + "step": 26559 + }, + { + "epoch": 0.6721157982640382, + "grad_norm": 3.3310492038726807, + "learning_rate": 2.471078802079653e-06, + "loss": 0.1651, + "step": 26560 + }, + { + "epoch": 0.6721411038287319, + "grad_norm": 9.998459815979004, + "learning_rate": 2.470732432793993e-06, + "loss": 0.2161, + "step": 26561 + }, + { + "epoch": 0.6721664093934256, + "grad_norm": 5.678789138793945, + "learning_rate": 2.47038607981911e-06, + "loss": 0.0958, + "step": 26562 + }, + { + "epoch": 0.6721917149581192, + "grad_norm": 9.025115966796875, + "learning_rate": 2.4700397431572355e-06, + "loss": 0.2354, + "step": 26563 + }, + { + "epoch": 0.672217020522813, + "grad_norm": 6.3161163330078125, + "learning_rate": 2.469693422810605e-06, + "loss": 0.1492, + "step": 26564 + }, + { + "epoch": 0.6722423260875067, + "grad_norm": 7.881618499755859, + "learning_rate": 2.4693471187814517e-06, + "loss": 0.2253, + "step": 26565 + }, + { + "epoch": 0.6722676316522003, + "grad_norm": 3.649949312210083, + "learning_rate": 2.4690008310720077e-06, + "loss": 0.1949, + "step": 26566 + }, + { + "epoch": 0.672292937216894, + "grad_norm": 3.469083547592163, + "learning_rate": 2.4686545596845045e-06, + "loss": 0.1102, + "step": 26567 + }, + { + "epoch": 0.6723182427815877, + "grad_norm": 11.282535552978516, + "learning_rate": 2.46830830462118e-06, + "loss": 0.2484, + "step": 26568 + }, + { + "epoch": 0.6723435483462813, + "grad_norm": 6.583005905151367, + "learning_rate": 2.4679620658842633e-06, + "loss": 0.2174, + "step": 26569 + }, + { + "epoch": 0.672368853910975, + "grad_norm": 3.408022165298462, + "learning_rate": 2.4676158434759888e-06, + "loss": 0.1317, + "step": 26570 + }, + { + "epoch": 0.6723941594756687, + "grad_norm": 6.534818649291992, + "learning_rate": 2.4672696373985865e-06, + "loss": 0.1481, + "step": 26571 + }, + { + "epoch": 0.6724194650403624, + "grad_norm": 3.664729118347168, + "learning_rate": 2.4669234476542932e-06, + "loss": 0.0889, + "step": 26572 + }, + { + "epoch": 0.672444770605056, + "grad_norm": 5.904546737670898, + "learning_rate": 2.466577274245339e-06, + "loss": 0.2259, + "step": 26573 + }, + { + "epoch": 0.6724700761697497, + "grad_norm": 3.7382795810699463, + "learning_rate": 2.4662311171739576e-06, + "loss": 0.1443, + "step": 26574 + }, + { + "epoch": 0.6724953817344435, + "grad_norm": 4.152133464813232, + "learning_rate": 2.4658849764423783e-06, + "loss": 0.1303, + "step": 26575 + }, + { + "epoch": 0.672520687299137, + "grad_norm": 5.301712989807129, + "learning_rate": 2.465538852052837e-06, + "loss": 0.1197, + "step": 26576 + }, + { + "epoch": 0.6725459928638308, + "grad_norm": 4.041003227233887, + "learning_rate": 2.465192744007564e-06, + "loss": 0.1094, + "step": 26577 + }, + { + "epoch": 0.6725712984285245, + "grad_norm": 11.277667045593262, + "learning_rate": 2.4648466523087917e-06, + "loss": 0.3818, + "step": 26578 + }, + { + "epoch": 0.6725966039932181, + "grad_norm": 18.11174201965332, + "learning_rate": 2.4645005769587513e-06, + "loss": 0.1826, + "step": 26579 + }, + { + "epoch": 0.6726219095579118, + "grad_norm": 6.841716766357422, + "learning_rate": 2.4641545179596736e-06, + "loss": 0.1737, + "step": 26580 + }, + { + "epoch": 0.6726472151226055, + "grad_norm": 8.902107238769531, + "learning_rate": 2.4638084753137935e-06, + "loss": 0.1476, + "step": 26581 + }, + { + "epoch": 0.6726725206872991, + "grad_norm": 2.7223691940307617, + "learning_rate": 2.463462449023341e-06, + "loss": 0.1256, + "step": 26582 + }, + { + "epoch": 0.6726978262519928, + "grad_norm": 3.72096586227417, + "learning_rate": 2.4631164390905476e-06, + "loss": 0.1551, + "step": 26583 + }, + { + "epoch": 0.6727231318166865, + "grad_norm": 6.8054070472717285, + "learning_rate": 2.462770445517642e-06, + "loss": 0.2281, + "step": 26584 + }, + { + "epoch": 0.6727484373813801, + "grad_norm": 4.825521945953369, + "learning_rate": 2.46242446830686e-06, + "loss": 0.1428, + "step": 26585 + }, + { + "epoch": 0.6727737429460738, + "grad_norm": 9.39824390411377, + "learning_rate": 2.4620785074604282e-06, + "loss": 0.1232, + "step": 26586 + }, + { + "epoch": 0.6727990485107676, + "grad_norm": 5.158327102661133, + "learning_rate": 2.4617325629805845e-06, + "loss": 0.1475, + "step": 26587 + }, + { + "epoch": 0.6728243540754612, + "grad_norm": 7.742016315460205, + "learning_rate": 2.4613866348695507e-06, + "loss": 0.2144, + "step": 26588 + }, + { + "epoch": 0.6728496596401549, + "grad_norm": 7.426056861877441, + "learning_rate": 2.4610407231295645e-06, + "loss": 0.2031, + "step": 26589 + }, + { + "epoch": 0.6728749652048486, + "grad_norm": 3.446845293045044, + "learning_rate": 2.4606948277628527e-06, + "loss": 0.0991, + "step": 26590 + }, + { + "epoch": 0.6729002707695422, + "grad_norm": 5.6620001792907715, + "learning_rate": 2.4603489487716513e-06, + "loss": 0.2189, + "step": 26591 + }, + { + "epoch": 0.6729255763342359, + "grad_norm": 4.604732513427734, + "learning_rate": 2.4600030861581835e-06, + "loss": 0.1832, + "step": 26592 + }, + { + "epoch": 0.6729508818989296, + "grad_norm": 4.834774971008301, + "learning_rate": 2.4596572399246855e-06, + "loss": 0.1389, + "step": 26593 + }, + { + "epoch": 0.6729761874636232, + "grad_norm": 5.031821250915527, + "learning_rate": 2.4593114100733827e-06, + "loss": 0.1385, + "step": 26594 + }, + { + "epoch": 0.6730014930283169, + "grad_norm": 5.24610710144043, + "learning_rate": 2.45896559660651e-06, + "loss": 0.1499, + "step": 26595 + }, + { + "epoch": 0.6730267985930106, + "grad_norm": 3.8262786865234375, + "learning_rate": 2.4586197995262963e-06, + "loss": 0.115, + "step": 26596 + }, + { + "epoch": 0.6730521041577043, + "grad_norm": 3.9437644481658936, + "learning_rate": 2.45827401883497e-06, + "loss": 0.1593, + "step": 26597 + }, + { + "epoch": 0.6730774097223979, + "grad_norm": 6.964317798614502, + "learning_rate": 2.4579282545347603e-06, + "loss": 0.1095, + "step": 26598 + }, + { + "epoch": 0.6731027152870916, + "grad_norm": 2.7070069313049316, + "learning_rate": 2.4575825066279003e-06, + "loss": 0.0627, + "step": 26599 + }, + { + "epoch": 0.6731280208517854, + "grad_norm": 3.4832308292388916, + "learning_rate": 2.4572367751166174e-06, + "loss": 0.1325, + "step": 26600 + }, + { + "epoch": 0.673153326416479, + "grad_norm": 8.194575309753418, + "learning_rate": 2.456891060003142e-06, + "loss": 0.1507, + "step": 26601 + }, + { + "epoch": 0.6731786319811727, + "grad_norm": 4.7677435874938965, + "learning_rate": 2.456545361289701e-06, + "loss": 0.0985, + "step": 26602 + }, + { + "epoch": 0.6732039375458664, + "grad_norm": 10.193145751953125, + "learning_rate": 2.4561996789785274e-06, + "loss": 0.1607, + "step": 26603 + }, + { + "epoch": 0.67322924311056, + "grad_norm": 7.37497091293335, + "learning_rate": 2.4558540130718496e-06, + "loss": 0.165, + "step": 26604 + }, + { + "epoch": 0.6732545486752537, + "grad_norm": 2.302147150039673, + "learning_rate": 2.455508363571896e-06, + "loss": 0.1199, + "step": 26605 + }, + { + "epoch": 0.6732798542399474, + "grad_norm": 3.50477933883667, + "learning_rate": 2.455162730480895e-06, + "loss": 0.1484, + "step": 26606 + }, + { + "epoch": 0.673305159804641, + "grad_norm": 10.215848922729492, + "learning_rate": 2.4548171138010745e-06, + "loss": 0.1182, + "step": 26607 + }, + { + "epoch": 0.6733304653693347, + "grad_norm": 5.052558422088623, + "learning_rate": 2.454471513534667e-06, + "loss": 0.1928, + "step": 26608 + }, + { + "epoch": 0.6733557709340284, + "grad_norm": 21.732580184936523, + "learning_rate": 2.4541259296838993e-06, + "loss": 0.3128, + "step": 26609 + }, + { + "epoch": 0.673381076498722, + "grad_norm": 9.043892860412598, + "learning_rate": 2.453780362251e-06, + "loss": 0.2783, + "step": 26610 + }, + { + "epoch": 0.6734063820634157, + "grad_norm": 4.322805881500244, + "learning_rate": 2.4534348112381952e-06, + "loss": 0.1067, + "step": 26611 + }, + { + "epoch": 0.6734316876281095, + "grad_norm": 9.601418495178223, + "learning_rate": 2.4530892766477175e-06, + "loss": 0.2306, + "step": 26612 + }, + { + "epoch": 0.673456993192803, + "grad_norm": 4.409798622131348, + "learning_rate": 2.4527437584817936e-06, + "loss": 0.1747, + "step": 26613 + }, + { + "epoch": 0.6734822987574968, + "grad_norm": 10.231675148010254, + "learning_rate": 2.4523982567426514e-06, + "loss": 0.3195, + "step": 26614 + }, + { + "epoch": 0.6735076043221905, + "grad_norm": 3.3773043155670166, + "learning_rate": 2.452052771432517e-06, + "loss": 0.1416, + "step": 26615 + }, + { + "epoch": 0.6735329098868841, + "grad_norm": 6.809606552124023, + "learning_rate": 2.451707302553622e-06, + "loss": 0.1976, + "step": 26616 + }, + { + "epoch": 0.6735582154515778, + "grad_norm": 7.6094746589660645, + "learning_rate": 2.451361850108191e-06, + "loss": 0.1915, + "step": 26617 + }, + { + "epoch": 0.6735835210162715, + "grad_norm": 6.68955135345459, + "learning_rate": 2.451016414098457e-06, + "loss": 0.244, + "step": 26618 + }, + { + "epoch": 0.6736088265809651, + "grad_norm": 11.700713157653809, + "learning_rate": 2.45067099452664e-06, + "loss": 0.135, + "step": 26619 + }, + { + "epoch": 0.6736341321456588, + "grad_norm": 5.680869102478027, + "learning_rate": 2.4503255913949734e-06, + "loss": 0.1876, + "step": 26620 + }, + { + "epoch": 0.6736594377103525, + "grad_norm": 14.311046600341797, + "learning_rate": 2.4499802047056814e-06, + "loss": 0.2606, + "step": 26621 + }, + { + "epoch": 0.6736847432750461, + "grad_norm": 7.527491569519043, + "learning_rate": 2.449634834460996e-06, + "loss": 0.1368, + "step": 26622 + }, + { + "epoch": 0.6737100488397398, + "grad_norm": 2.583629608154297, + "learning_rate": 2.449289480663138e-06, + "loss": 0.0894, + "step": 26623 + }, + { + "epoch": 0.6737353544044336, + "grad_norm": 3.798457145690918, + "learning_rate": 2.448944143314339e-06, + "loss": 0.1145, + "step": 26624 + }, + { + "epoch": 0.6737606599691273, + "grad_norm": 2.8275909423828125, + "learning_rate": 2.4485988224168227e-06, + "loss": 0.0978, + "step": 26625 + }, + { + "epoch": 0.6737859655338209, + "grad_norm": 3.8144731521606445, + "learning_rate": 2.4482535179728205e-06, + "loss": 0.1594, + "step": 26626 + }, + { + "epoch": 0.6738112710985146, + "grad_norm": 5.593514442443848, + "learning_rate": 2.4479082299845564e-06, + "loss": 0.2296, + "step": 26627 + }, + { + "epoch": 0.6738365766632083, + "grad_norm": 7.83863639831543, + "learning_rate": 2.447562958454257e-06, + "loss": 0.2784, + "step": 26628 + }, + { + "epoch": 0.6738618822279019, + "grad_norm": 3.673762083053589, + "learning_rate": 2.4472177033841477e-06, + "loss": 0.1337, + "step": 26629 + }, + { + "epoch": 0.6738871877925956, + "grad_norm": 4.854014873504639, + "learning_rate": 2.4468724647764585e-06, + "loss": 0.1412, + "step": 26630 + }, + { + "epoch": 0.6739124933572893, + "grad_norm": 4.4937567710876465, + "learning_rate": 2.446527242633414e-06, + "loss": 0.155, + "step": 26631 + }, + { + "epoch": 0.6739377989219829, + "grad_norm": 2.2728219032287598, + "learning_rate": 2.4461820369572404e-06, + "loss": 0.0861, + "step": 26632 + }, + { + "epoch": 0.6739631044866766, + "grad_norm": 4.470441818237305, + "learning_rate": 2.4458368477501636e-06, + "loss": 0.1408, + "step": 26633 + }, + { + "epoch": 0.6739884100513703, + "grad_norm": 7.056522369384766, + "learning_rate": 2.4454916750144077e-06, + "loss": 0.1789, + "step": 26634 + }, + { + "epoch": 0.6740137156160639, + "grad_norm": 5.72726583480835, + "learning_rate": 2.4451465187522034e-06, + "loss": 0.1396, + "step": 26635 + }, + { + "epoch": 0.6740390211807576, + "grad_norm": 3.7260797023773193, + "learning_rate": 2.444801378965773e-06, + "loss": 0.1561, + "step": 26636 + }, + { + "epoch": 0.6740643267454514, + "grad_norm": 6.784953594207764, + "learning_rate": 2.4444562556573436e-06, + "loss": 0.2205, + "step": 26637 + }, + { + "epoch": 0.674089632310145, + "grad_norm": 6.678181171417236, + "learning_rate": 2.444111148829139e-06, + "loss": 0.1637, + "step": 26638 + }, + { + "epoch": 0.6741149378748387, + "grad_norm": 5.0346879959106445, + "learning_rate": 2.443766058483388e-06, + "loss": 0.1288, + "step": 26639 + }, + { + "epoch": 0.6741402434395324, + "grad_norm": 6.720261573791504, + "learning_rate": 2.443420984622314e-06, + "loss": 0.1786, + "step": 26640 + }, + { + "epoch": 0.674165549004226, + "grad_norm": 9.04273796081543, + "learning_rate": 2.443075927248142e-06, + "loss": 0.2358, + "step": 26641 + }, + { + "epoch": 0.6741908545689197, + "grad_norm": 6.259130477905273, + "learning_rate": 2.4427308863630967e-06, + "loss": 0.1774, + "step": 26642 + }, + { + "epoch": 0.6742161601336134, + "grad_norm": 6.798236846923828, + "learning_rate": 2.442385861969405e-06, + "loss": 0.1575, + "step": 26643 + }, + { + "epoch": 0.674241465698307, + "grad_norm": 8.095775604248047, + "learning_rate": 2.442040854069292e-06, + "loss": 0.2201, + "step": 26644 + }, + { + "epoch": 0.6742667712630007, + "grad_norm": 5.716771125793457, + "learning_rate": 2.4416958626649817e-06, + "loss": 0.1403, + "step": 26645 + }, + { + "epoch": 0.6742920768276944, + "grad_norm": 5.010725021362305, + "learning_rate": 2.4413508877586966e-06, + "loss": 0.1708, + "step": 26646 + }, + { + "epoch": 0.674317382392388, + "grad_norm": 7.254390716552734, + "learning_rate": 2.4410059293526662e-06, + "loss": 0.1952, + "step": 26647 + }, + { + "epoch": 0.6743426879570817, + "grad_norm": 5.321431636810303, + "learning_rate": 2.44066098744911e-06, + "loss": 0.2096, + "step": 26648 + }, + { + "epoch": 0.6743679935217755, + "grad_norm": 2.784496545791626, + "learning_rate": 2.4403160620502593e-06, + "loss": 0.0566, + "step": 26649 + }, + { + "epoch": 0.6743932990864692, + "grad_norm": 2.4257590770721436, + "learning_rate": 2.4399711531583297e-06, + "loss": 0.1044, + "step": 26650 + }, + { + "epoch": 0.6744186046511628, + "grad_norm": 5.691678524017334, + "learning_rate": 2.4396262607755526e-06, + "loss": 0.2928, + "step": 26651 + }, + { + "epoch": 0.6744439102158565, + "grad_norm": 3.910815477371216, + "learning_rate": 2.4392813849041474e-06, + "loss": 0.1567, + "step": 26652 + }, + { + "epoch": 0.6744692157805502, + "grad_norm": 5.848740577697754, + "learning_rate": 2.4389365255463438e-06, + "loss": 0.157, + "step": 26653 + }, + { + "epoch": 0.6744945213452438, + "grad_norm": 4.899695873260498, + "learning_rate": 2.4385916827043582e-06, + "loss": 0.179, + "step": 26654 + }, + { + "epoch": 0.6745198269099375, + "grad_norm": 3.705125093460083, + "learning_rate": 2.4382468563804206e-06, + "loss": 0.0748, + "step": 26655 + }, + { + "epoch": 0.6745451324746312, + "grad_norm": 2.8859336376190186, + "learning_rate": 2.4379020465767507e-06, + "loss": 0.0967, + "step": 26656 + }, + { + "epoch": 0.6745704380393248, + "grad_norm": 4.6572747230529785, + "learning_rate": 2.4375572532955777e-06, + "loss": 0.1372, + "step": 26657 + }, + { + "epoch": 0.6745957436040185, + "grad_norm": 6.801220417022705, + "learning_rate": 2.437212476539117e-06, + "loss": 0.146, + "step": 26658 + }, + { + "epoch": 0.6746210491687122, + "grad_norm": 8.037541389465332, + "learning_rate": 2.4368677163095987e-06, + "loss": 0.1245, + "step": 26659 + }, + { + "epoch": 0.6746463547334058, + "grad_norm": 3.2768428325653076, + "learning_rate": 2.4365229726092414e-06, + "loss": 0.1214, + "step": 26660 + }, + { + "epoch": 0.6746716602980996, + "grad_norm": 3.7339751720428467, + "learning_rate": 2.436178245440273e-06, + "loss": 0.1086, + "step": 26661 + }, + { + "epoch": 0.6746969658627933, + "grad_norm": 2.7175586223602295, + "learning_rate": 2.4358335348049137e-06, + "loss": 0.1538, + "step": 26662 + }, + { + "epoch": 0.6747222714274869, + "grad_norm": 11.43887996673584, + "learning_rate": 2.435488840705388e-06, + "loss": 0.112, + "step": 26663 + }, + { + "epoch": 0.6747475769921806, + "grad_norm": 4.213311672210693, + "learning_rate": 2.4351441631439164e-06, + "loss": 0.1665, + "step": 26664 + }, + { + "epoch": 0.6747728825568743, + "grad_norm": 3.489412307739258, + "learning_rate": 2.4347995021227216e-06, + "loss": 0.1149, + "step": 26665 + }, + { + "epoch": 0.6747981881215679, + "grad_norm": 4.697272777557373, + "learning_rate": 2.43445485764403e-06, + "loss": 0.123, + "step": 26666 + }, + { + "epoch": 0.6748234936862616, + "grad_norm": 5.97944450378418, + "learning_rate": 2.4341102297100616e-06, + "loss": 0.1756, + "step": 26667 + }, + { + "epoch": 0.6748487992509553, + "grad_norm": 4.240460395812988, + "learning_rate": 2.4337656183230384e-06, + "loss": 0.1697, + "step": 26668 + }, + { + "epoch": 0.6748741048156489, + "grad_norm": 7.477450370788574, + "learning_rate": 2.4334210234851826e-06, + "loss": 0.1798, + "step": 26669 + }, + { + "epoch": 0.6748994103803426, + "grad_norm": 3.6649374961853027, + "learning_rate": 2.4330764451987182e-06, + "loss": 0.1483, + "step": 26670 + }, + { + "epoch": 0.6749247159450363, + "grad_norm": 3.330702304840088, + "learning_rate": 2.4327318834658666e-06, + "loss": 0.159, + "step": 26671 + }, + { + "epoch": 0.6749500215097299, + "grad_norm": 4.547091007232666, + "learning_rate": 2.432387338288849e-06, + "loss": 0.1453, + "step": 26672 + }, + { + "epoch": 0.6749753270744236, + "grad_norm": 5.919703483581543, + "learning_rate": 2.4320428096698865e-06, + "loss": 0.1328, + "step": 26673 + }, + { + "epoch": 0.6750006326391174, + "grad_norm": 7.070103645324707, + "learning_rate": 2.431698297611204e-06, + "loss": 0.0984, + "step": 26674 + }, + { + "epoch": 0.6750259382038111, + "grad_norm": 8.354108810424805, + "learning_rate": 2.4313538021150216e-06, + "loss": 0.2689, + "step": 26675 + }, + { + "epoch": 0.6750512437685047, + "grad_norm": 9.548493385314941, + "learning_rate": 2.4310093231835607e-06, + "loss": 0.1404, + "step": 26676 + }, + { + "epoch": 0.6750765493331984, + "grad_norm": 8.446728706359863, + "learning_rate": 2.430664860819041e-06, + "loss": 0.1566, + "step": 26677 + }, + { + "epoch": 0.6751018548978921, + "grad_norm": 11.926093101501465, + "learning_rate": 2.430320415023687e-06, + "loss": 0.3377, + "step": 26678 + }, + { + "epoch": 0.6751271604625857, + "grad_norm": 3.2957661151885986, + "learning_rate": 2.429975985799719e-06, + "loss": 0.1578, + "step": 26679 + }, + { + "epoch": 0.6751524660272794, + "grad_norm": 4.132641315460205, + "learning_rate": 2.4296315731493582e-06, + "loss": 0.1699, + "step": 26680 + }, + { + "epoch": 0.6751777715919731, + "grad_norm": 5.812594890594482, + "learning_rate": 2.429287177074823e-06, + "loss": 0.2289, + "step": 26681 + }, + { + "epoch": 0.6752030771566667, + "grad_norm": 6.1214728355407715, + "learning_rate": 2.4289427975783387e-06, + "loss": 0.1181, + "step": 26682 + }, + { + "epoch": 0.6752283827213604, + "grad_norm": 7.230535984039307, + "learning_rate": 2.428598434662122e-06, + "loss": 0.2121, + "step": 26683 + }, + { + "epoch": 0.6752536882860541, + "grad_norm": 3.8689475059509277, + "learning_rate": 2.4282540883283993e-06, + "loss": 0.1614, + "step": 26684 + }, + { + "epoch": 0.6752789938507477, + "grad_norm": 4.631110668182373, + "learning_rate": 2.4279097585793838e-06, + "loss": 0.1392, + "step": 26685 + }, + { + "epoch": 0.6753042994154415, + "grad_norm": 3.4027974605560303, + "learning_rate": 2.427565445417302e-06, + "loss": 0.1113, + "step": 26686 + }, + { + "epoch": 0.6753296049801352, + "grad_norm": 5.630282878875732, + "learning_rate": 2.42722114884437e-06, + "loss": 0.1839, + "step": 26687 + }, + { + "epoch": 0.6753549105448288, + "grad_norm": 3.454664707183838, + "learning_rate": 2.426876868862814e-06, + "loss": 0.0879, + "step": 26688 + }, + { + "epoch": 0.6753802161095225, + "grad_norm": 25.282957077026367, + "learning_rate": 2.426532605474847e-06, + "loss": 0.2795, + "step": 26689 + }, + { + "epoch": 0.6754055216742162, + "grad_norm": 4.365524768829346, + "learning_rate": 2.4261883586826947e-06, + "loss": 0.168, + "step": 26690 + }, + { + "epoch": 0.6754308272389098, + "grad_norm": 3.50795316696167, + "learning_rate": 2.4258441284885742e-06, + "loss": 0.1256, + "step": 26691 + }, + { + "epoch": 0.6754561328036035, + "grad_norm": 4.160403251647949, + "learning_rate": 2.4254999148947038e-06, + "loss": 0.1016, + "step": 26692 + }, + { + "epoch": 0.6754814383682972, + "grad_norm": 6.8006696701049805, + "learning_rate": 2.4251557179033084e-06, + "loss": 0.1356, + "step": 26693 + }, + { + "epoch": 0.6755067439329908, + "grad_norm": 11.216170310974121, + "learning_rate": 2.424811537516605e-06, + "loss": 0.2541, + "step": 26694 + }, + { + "epoch": 0.6755320494976845, + "grad_norm": 7.898958683013916, + "learning_rate": 2.424467373736812e-06, + "loss": 0.1579, + "step": 26695 + }, + { + "epoch": 0.6755573550623782, + "grad_norm": 5.257318019866943, + "learning_rate": 2.4241232265661483e-06, + "loss": 0.1548, + "step": 26696 + }, + { + "epoch": 0.6755826606270718, + "grad_norm": 12.192774772644043, + "learning_rate": 2.423779096006837e-06, + "loss": 0.1339, + "step": 26697 + }, + { + "epoch": 0.6756079661917656, + "grad_norm": 4.3565754890441895, + "learning_rate": 2.423434982061095e-06, + "loss": 0.1747, + "step": 26698 + }, + { + "epoch": 0.6756332717564593, + "grad_norm": 5.059800148010254, + "learning_rate": 2.4230908847311407e-06, + "loss": 0.1109, + "step": 26699 + }, + { + "epoch": 0.675658577321153, + "grad_norm": 8.565918922424316, + "learning_rate": 2.422746804019193e-06, + "loss": 0.2411, + "step": 26700 + }, + { + "epoch": 0.6756838828858466, + "grad_norm": 7.922020435333252, + "learning_rate": 2.4224027399274737e-06, + "loss": 0.1951, + "step": 26701 + }, + { + "epoch": 0.6757091884505403, + "grad_norm": 5.961182117462158, + "learning_rate": 2.4220586924581992e-06, + "loss": 0.1859, + "step": 26702 + }, + { + "epoch": 0.675734494015234, + "grad_norm": 3.225625514984131, + "learning_rate": 2.4217146616135883e-06, + "loss": 0.0816, + "step": 26703 + }, + { + "epoch": 0.6757597995799276, + "grad_norm": 5.803229808807373, + "learning_rate": 2.421370647395859e-06, + "loss": 0.1914, + "step": 26704 + }, + { + "epoch": 0.6757851051446213, + "grad_norm": 4.946777820587158, + "learning_rate": 2.421026649807232e-06, + "loss": 0.1692, + "step": 26705 + }, + { + "epoch": 0.675810410709315, + "grad_norm": 19.76201629638672, + "learning_rate": 2.4206826688499245e-06, + "loss": 0.2646, + "step": 26706 + }, + { + "epoch": 0.6758357162740086, + "grad_norm": 3.7552146911621094, + "learning_rate": 2.4203387045261546e-06, + "loss": 0.1067, + "step": 26707 + }, + { + "epoch": 0.6758610218387023, + "grad_norm": 7.160656452178955, + "learning_rate": 2.419994756838139e-06, + "loss": 0.2355, + "step": 26708 + }, + { + "epoch": 0.675886327403396, + "grad_norm": 3.571617603302002, + "learning_rate": 2.419650825788099e-06, + "loss": 0.1187, + "step": 26709 + }, + { + "epoch": 0.6759116329680896, + "grad_norm": 9.564867973327637, + "learning_rate": 2.4193069113782507e-06, + "loss": 0.2324, + "step": 26710 + }, + { + "epoch": 0.6759369385327834, + "grad_norm": 4.071293830871582, + "learning_rate": 2.418963013610812e-06, + "loss": 0.1602, + "step": 26711 + }, + { + "epoch": 0.6759622440974771, + "grad_norm": 6.355681419372559, + "learning_rate": 2.4186191324879994e-06, + "loss": 0.197, + "step": 26712 + }, + { + "epoch": 0.6759875496621707, + "grad_norm": 4.633683204650879, + "learning_rate": 2.4182752680120332e-06, + "loss": 0.1643, + "step": 26713 + }, + { + "epoch": 0.6760128552268644, + "grad_norm": 10.60779094696045, + "learning_rate": 2.4179314201851277e-06, + "loss": 0.2071, + "step": 26714 + }, + { + "epoch": 0.6760381607915581, + "grad_norm": 4.945084571838379, + "learning_rate": 2.4175875890095064e-06, + "loss": 0.1874, + "step": 26715 + }, + { + "epoch": 0.6760634663562517, + "grad_norm": 3.3233721256256104, + "learning_rate": 2.4172437744873778e-06, + "loss": 0.1032, + "step": 26716 + }, + { + "epoch": 0.6760887719209454, + "grad_norm": 3.8918466567993164, + "learning_rate": 2.4168999766209662e-06, + "loss": 0.1426, + "step": 26717 + }, + { + "epoch": 0.6761140774856391, + "grad_norm": 6.510178089141846, + "learning_rate": 2.4165561954124863e-06, + "loss": 0.1732, + "step": 26718 + }, + { + "epoch": 0.6761393830503327, + "grad_norm": 3.150909423828125, + "learning_rate": 2.4162124308641543e-06, + "loss": 0.1353, + "step": 26719 + }, + { + "epoch": 0.6761646886150264, + "grad_norm": 5.5687642097473145, + "learning_rate": 2.4158686829781864e-06, + "loss": 0.146, + "step": 26720 + }, + { + "epoch": 0.6761899941797201, + "grad_norm": 5.426323413848877, + "learning_rate": 2.415524951756802e-06, + "loss": 0.1649, + "step": 26721 + }, + { + "epoch": 0.6762152997444137, + "grad_norm": 4.736030101776123, + "learning_rate": 2.415181237202217e-06, + "loss": 0.1786, + "step": 26722 + }, + { + "epoch": 0.6762406053091075, + "grad_norm": 6.226245880126953, + "learning_rate": 2.414837539316645e-06, + "loss": 0.159, + "step": 26723 + }, + { + "epoch": 0.6762659108738012, + "grad_norm": 6.751681804656982, + "learning_rate": 2.414493858102307e-06, + "loss": 0.2564, + "step": 26724 + }, + { + "epoch": 0.6762912164384949, + "grad_norm": 9.397997856140137, + "learning_rate": 2.414150193561417e-06, + "loss": 0.3199, + "step": 26725 + }, + { + "epoch": 0.6763165220031885, + "grad_norm": 3.2079663276672363, + "learning_rate": 2.4138065456961916e-06, + "loss": 0.1404, + "step": 26726 + }, + { + "epoch": 0.6763418275678822, + "grad_norm": 3.6484763622283936, + "learning_rate": 2.413462914508845e-06, + "loss": 0.129, + "step": 26727 + }, + { + "epoch": 0.6763671331325759, + "grad_norm": 4.533079624176025, + "learning_rate": 2.4131193000015983e-06, + "loss": 0.1229, + "step": 26728 + }, + { + "epoch": 0.6763924386972695, + "grad_norm": 7.567124366760254, + "learning_rate": 2.4127757021766607e-06, + "loss": 0.1619, + "step": 26729 + }, + { + "epoch": 0.6764177442619632, + "grad_norm": 3.772935152053833, + "learning_rate": 2.412432121036253e-06, + "loss": 0.1281, + "step": 26730 + }, + { + "epoch": 0.6764430498266569, + "grad_norm": 3.7769079208374023, + "learning_rate": 2.412088556582587e-06, + "loss": 0.1475, + "step": 26731 + }, + { + "epoch": 0.6764683553913505, + "grad_norm": 4.914737701416016, + "learning_rate": 2.4117450088178843e-06, + "loss": 0.1943, + "step": 26732 + }, + { + "epoch": 0.6764936609560442, + "grad_norm": 5.886623382568359, + "learning_rate": 2.4114014777443536e-06, + "loss": 0.1457, + "step": 26733 + }, + { + "epoch": 0.676518966520738, + "grad_norm": 4.763119220733643, + "learning_rate": 2.4110579633642146e-06, + "loss": 0.1383, + "step": 26734 + }, + { + "epoch": 0.6765442720854316, + "grad_norm": 7.189183235168457, + "learning_rate": 2.4107144656796795e-06, + "loss": 0.1239, + "step": 26735 + }, + { + "epoch": 0.6765695776501253, + "grad_norm": 5.680478572845459, + "learning_rate": 2.410370984692967e-06, + "loss": 0.1902, + "step": 26736 + }, + { + "epoch": 0.676594883214819, + "grad_norm": 5.267545223236084, + "learning_rate": 2.41002752040629e-06, + "loss": 0.1715, + "step": 26737 + }, + { + "epoch": 0.6766201887795126, + "grad_norm": 3.3762450218200684, + "learning_rate": 2.409684072821864e-06, + "loss": 0.1331, + "step": 26738 + }, + { + "epoch": 0.6766454943442063, + "grad_norm": 5.03316068649292, + "learning_rate": 2.4093406419419014e-06, + "loss": 0.1598, + "step": 26739 + }, + { + "epoch": 0.6766707999089, + "grad_norm": 3.1456263065338135, + "learning_rate": 2.4089972277686213e-06, + "loss": 0.1088, + "step": 26740 + }, + { + "epoch": 0.6766961054735936, + "grad_norm": 5.772674083709717, + "learning_rate": 2.408653830304236e-06, + "loss": 0.1858, + "step": 26741 + }, + { + "epoch": 0.6767214110382873, + "grad_norm": 7.755287170410156, + "learning_rate": 2.40831044955096e-06, + "loss": 0.172, + "step": 26742 + }, + { + "epoch": 0.676746716602981, + "grad_norm": 4.974852085113525, + "learning_rate": 2.4079670855110065e-06, + "loss": 0.1725, + "step": 26743 + }, + { + "epoch": 0.6767720221676746, + "grad_norm": 5.083485126495361, + "learning_rate": 2.4076237381865924e-06, + "loss": 0.1241, + "step": 26744 + }, + { + "epoch": 0.6767973277323683, + "grad_norm": 6.13073205947876, + "learning_rate": 2.4072804075799313e-06, + "loss": 0.1086, + "step": 26745 + }, + { + "epoch": 0.676822633297062, + "grad_norm": 5.881518840789795, + "learning_rate": 2.406937093693236e-06, + "loss": 0.1729, + "step": 26746 + }, + { + "epoch": 0.6768479388617556, + "grad_norm": 5.129526138305664, + "learning_rate": 2.406593796528719e-06, + "loss": 0.0742, + "step": 26747 + }, + { + "epoch": 0.6768732444264494, + "grad_norm": 4.577265739440918, + "learning_rate": 2.4062505160885986e-06, + "loss": 0.1766, + "step": 26748 + }, + { + "epoch": 0.6768985499911431, + "grad_norm": 4.4442853927612305, + "learning_rate": 2.4059072523750854e-06, + "loss": 0.1691, + "step": 26749 + }, + { + "epoch": 0.6769238555558367, + "grad_norm": 4.931939125061035, + "learning_rate": 2.4055640053903946e-06, + "loss": 0.1007, + "step": 26750 + }, + { + "epoch": 0.6769491611205304, + "grad_norm": 4.85032844543457, + "learning_rate": 2.405220775136737e-06, + "loss": 0.1569, + "step": 26751 + }, + { + "epoch": 0.6769744666852241, + "grad_norm": 10.136133193969727, + "learning_rate": 2.404877561616329e-06, + "loss": 0.3906, + "step": 26752 + }, + { + "epoch": 0.6769997722499178, + "grad_norm": 5.90037727355957, + "learning_rate": 2.4045343648313833e-06, + "loss": 0.1785, + "step": 26753 + }, + { + "epoch": 0.6770250778146114, + "grad_norm": 5.482159614562988, + "learning_rate": 2.404191184784111e-06, + "loss": 0.1987, + "step": 26754 + }, + { + "epoch": 0.6770503833793051, + "grad_norm": 5.531128406524658, + "learning_rate": 2.4038480214767307e-06, + "loss": 0.1672, + "step": 26755 + }, + { + "epoch": 0.6770756889439988, + "grad_norm": 6.7615966796875, + "learning_rate": 2.4035048749114477e-06, + "loss": 0.2397, + "step": 26756 + }, + { + "epoch": 0.6771009945086924, + "grad_norm": 11.115748405456543, + "learning_rate": 2.4031617450904802e-06, + "loss": 0.1597, + "step": 26757 + }, + { + "epoch": 0.6771263000733861, + "grad_norm": 4.520188808441162, + "learning_rate": 2.4028186320160385e-06, + "loss": 0.1778, + "step": 26758 + }, + { + "epoch": 0.6771516056380799, + "grad_norm": 4.7040276527404785, + "learning_rate": 2.4024755356903396e-06, + "loss": 0.1341, + "step": 26759 + }, + { + "epoch": 0.6771769112027735, + "grad_norm": 15.734889030456543, + "learning_rate": 2.4021324561155893e-06, + "loss": 0.224, + "step": 26760 + }, + { + "epoch": 0.6772022167674672, + "grad_norm": 6.485650539398193, + "learning_rate": 2.401789393294005e-06, + "loss": 0.1841, + "step": 26761 + }, + { + "epoch": 0.6772275223321609, + "grad_norm": 7.218155860900879, + "learning_rate": 2.401446347227796e-06, + "loss": 0.1968, + "step": 26762 + }, + { + "epoch": 0.6772528278968545, + "grad_norm": 5.367030143737793, + "learning_rate": 2.40110331791918e-06, + "loss": 0.1511, + "step": 26763 + }, + { + "epoch": 0.6772781334615482, + "grad_norm": 4.699265003204346, + "learning_rate": 2.4007603053703616e-06, + "loss": 0.1543, + "step": 26764 + }, + { + "epoch": 0.6773034390262419, + "grad_norm": 3.30405592918396, + "learning_rate": 2.400417309583558e-06, + "loss": 0.1488, + "step": 26765 + }, + { + "epoch": 0.6773287445909355, + "grad_norm": 4.338425159454346, + "learning_rate": 2.400074330560978e-06, + "loss": 0.138, + "step": 26766 + }, + { + "epoch": 0.6773540501556292, + "grad_norm": 7.407212257385254, + "learning_rate": 2.399731368304837e-06, + "loss": 0.1898, + "step": 26767 + }, + { + "epoch": 0.6773793557203229, + "grad_norm": 6.9416937828063965, + "learning_rate": 2.399388422817344e-06, + "loss": 0.1137, + "step": 26768 + }, + { + "epoch": 0.6774046612850165, + "grad_norm": 4.373800754547119, + "learning_rate": 2.3990454941007125e-06, + "loss": 0.1642, + "step": 26769 + }, + { + "epoch": 0.6774299668497102, + "grad_norm": 3.7143778800964355, + "learning_rate": 2.3987025821571503e-06, + "loss": 0.0838, + "step": 26770 + }, + { + "epoch": 0.677455272414404, + "grad_norm": 3.5359551906585693, + "learning_rate": 2.398359686988873e-06, + "loss": 0.1679, + "step": 26771 + }, + { + "epoch": 0.6774805779790976, + "grad_norm": 6.103450775146484, + "learning_rate": 2.3980168085980905e-06, + "loss": 0.1445, + "step": 26772 + }, + { + "epoch": 0.6775058835437913, + "grad_norm": 4.084659099578857, + "learning_rate": 2.3976739469870133e-06, + "loss": 0.1368, + "step": 26773 + }, + { + "epoch": 0.677531189108485, + "grad_norm": 3.6737301349639893, + "learning_rate": 2.3973311021578504e-06, + "loss": 0.1371, + "step": 26774 + }, + { + "epoch": 0.6775564946731786, + "grad_norm": 4.977571964263916, + "learning_rate": 2.3969882741128177e-06, + "loss": 0.1084, + "step": 26775 + }, + { + "epoch": 0.6775818002378723, + "grad_norm": 4.429288387298584, + "learning_rate": 2.396645462854123e-06, + "loss": 0.1412, + "step": 26776 + }, + { + "epoch": 0.677607105802566, + "grad_norm": 4.508619785308838, + "learning_rate": 2.396302668383978e-06, + "loss": 0.0761, + "step": 26777 + }, + { + "epoch": 0.6776324113672597, + "grad_norm": 12.735271453857422, + "learning_rate": 2.3959598907045904e-06, + "loss": 0.294, + "step": 26778 + }, + { + "epoch": 0.6776577169319533, + "grad_norm": 7.320587158203125, + "learning_rate": 2.395617129818175e-06, + "loss": 0.2416, + "step": 26779 + }, + { + "epoch": 0.677683022496647, + "grad_norm": 4.3145036697387695, + "learning_rate": 2.3952743857269405e-06, + "loss": 0.1508, + "step": 26780 + }, + { + "epoch": 0.6777083280613407, + "grad_norm": 10.477779388427734, + "learning_rate": 2.394931658433097e-06, + "loss": 0.2656, + "step": 26781 + }, + { + "epoch": 0.6777336336260343, + "grad_norm": 5.109189987182617, + "learning_rate": 2.394588947938854e-06, + "loss": 0.2052, + "step": 26782 + }, + { + "epoch": 0.677758939190728, + "grad_norm": 5.730345726013184, + "learning_rate": 2.394246254246421e-06, + "loss": 0.1336, + "step": 26783 + }, + { + "epoch": 0.6777842447554218, + "grad_norm": 7.787380218505859, + "learning_rate": 2.393903577358011e-06, + "loss": 0.1516, + "step": 26784 + }, + { + "epoch": 0.6778095503201154, + "grad_norm": 5.2338032722473145, + "learning_rate": 2.393560917275832e-06, + "loss": 0.1456, + "step": 26785 + }, + { + "epoch": 0.6778348558848091, + "grad_norm": 5.41779899597168, + "learning_rate": 2.3932182740020937e-06, + "loss": 0.1814, + "step": 26786 + }, + { + "epoch": 0.6778601614495028, + "grad_norm": 3.2695088386535645, + "learning_rate": 2.392875647539004e-06, + "loss": 0.1751, + "step": 26787 + }, + { + "epoch": 0.6778854670141964, + "grad_norm": 5.360292434692383, + "learning_rate": 2.3925330378887763e-06, + "loss": 0.0653, + "step": 26788 + }, + { + "epoch": 0.6779107725788901, + "grad_norm": 4.164179801940918, + "learning_rate": 2.392190445053616e-06, + "loss": 0.1057, + "step": 26789 + }, + { + "epoch": 0.6779360781435838, + "grad_norm": 6.47352933883667, + "learning_rate": 2.3918478690357385e-06, + "loss": 0.2247, + "step": 26790 + }, + { + "epoch": 0.6779613837082774, + "grad_norm": 4.220105171203613, + "learning_rate": 2.3915053098373446e-06, + "loss": 0.1115, + "step": 26791 + }, + { + "epoch": 0.6779866892729711, + "grad_norm": 4.970240592956543, + "learning_rate": 2.39116276746065e-06, + "loss": 0.1062, + "step": 26792 + }, + { + "epoch": 0.6780119948376648, + "grad_norm": 3.284022808074951, + "learning_rate": 2.3908202419078598e-06, + "loss": 0.1754, + "step": 26793 + }, + { + "epoch": 0.6780373004023584, + "grad_norm": 5.514501571655273, + "learning_rate": 2.390477733181188e-06, + "loss": 0.1662, + "step": 26794 + }, + { + "epoch": 0.6780626059670521, + "grad_norm": 7.254446983337402, + "learning_rate": 2.390135241282836e-06, + "loss": 0.2102, + "step": 26795 + }, + { + "epoch": 0.6780879115317459, + "grad_norm": 3.8965656757354736, + "learning_rate": 2.3897927662150182e-06, + "loss": 0.1461, + "step": 26796 + }, + { + "epoch": 0.6781132170964395, + "grad_norm": 3.9593513011932373, + "learning_rate": 2.3894503079799396e-06, + "loss": 0.0881, + "step": 26797 + }, + { + "epoch": 0.6781385226611332, + "grad_norm": 7.484162330627441, + "learning_rate": 2.3891078665798123e-06, + "loss": 0.1572, + "step": 26798 + }, + { + "epoch": 0.6781638282258269, + "grad_norm": 2.901675224304199, + "learning_rate": 2.3887654420168425e-06, + "loss": 0.0439, + "step": 26799 + }, + { + "epoch": 0.6781891337905205, + "grad_norm": 9.413288116455078, + "learning_rate": 2.388423034293239e-06, + "loss": 0.143, + "step": 26800 + }, + { + "epoch": 0.6782144393552142, + "grad_norm": 4.20642614364624, + "learning_rate": 2.3880806434112076e-06, + "loss": 0.1107, + "step": 26801 + }, + { + "epoch": 0.6782397449199079, + "grad_norm": 6.401621341705322, + "learning_rate": 2.3877382693729594e-06, + "loss": 0.1243, + "step": 26802 + }, + { + "epoch": 0.6782650504846016, + "grad_norm": 7.213315963745117, + "learning_rate": 2.3873959121807024e-06, + "loss": 0.15, + "step": 26803 + }, + { + "epoch": 0.6782903560492952, + "grad_norm": 3.5116989612579346, + "learning_rate": 2.387053571836642e-06, + "loss": 0.1608, + "step": 26804 + }, + { + "epoch": 0.6783156616139889, + "grad_norm": 9.791084289550781, + "learning_rate": 2.386711248342986e-06, + "loss": 0.1548, + "step": 26805 + }, + { + "epoch": 0.6783409671786826, + "grad_norm": 3.9008357524871826, + "learning_rate": 2.3863689417019447e-06, + "loss": 0.1264, + "step": 26806 + }, + { + "epoch": 0.6783662727433762, + "grad_norm": 7.27360200881958, + "learning_rate": 2.386026651915724e-06, + "loss": 0.104, + "step": 26807 + }, + { + "epoch": 0.67839157830807, + "grad_norm": 4.689818382263184, + "learning_rate": 2.385684378986531e-06, + "loss": 0.1666, + "step": 26808 + }, + { + "epoch": 0.6784168838727637, + "grad_norm": 12.015725135803223, + "learning_rate": 2.3853421229165714e-06, + "loss": 0.1907, + "step": 26809 + }, + { + "epoch": 0.6784421894374573, + "grad_norm": 4.663641452789307, + "learning_rate": 2.3849998837080557e-06, + "loss": 0.1399, + "step": 26810 + }, + { + "epoch": 0.678467495002151, + "grad_norm": 4.672616481781006, + "learning_rate": 2.3846576613631898e-06, + "loss": 0.2328, + "step": 26811 + }, + { + "epoch": 0.6784928005668447, + "grad_norm": 8.43437671661377, + "learning_rate": 2.3843154558841796e-06, + "loss": 0.2319, + "step": 26812 + }, + { + "epoch": 0.6785181061315383, + "grad_norm": 3.8368632793426514, + "learning_rate": 2.383973267273233e-06, + "loss": 0.17, + "step": 26813 + }, + { + "epoch": 0.678543411696232, + "grad_norm": 4.17686128616333, + "learning_rate": 2.3836310955325544e-06, + "loss": 0.1218, + "step": 26814 + }, + { + "epoch": 0.6785687172609257, + "grad_norm": 7.195891857147217, + "learning_rate": 2.383288940664353e-06, + "loss": 0.2915, + "step": 26815 + }, + { + "epoch": 0.6785940228256193, + "grad_norm": 6.786688327789307, + "learning_rate": 2.382946802670836e-06, + "loss": 0.2404, + "step": 26816 + }, + { + "epoch": 0.678619328390313, + "grad_norm": 5.9117960929870605, + "learning_rate": 2.382604681554207e-06, + "loss": 0.1562, + "step": 26817 + }, + { + "epoch": 0.6786446339550067, + "grad_norm": 7.5705647468566895, + "learning_rate": 2.3822625773166723e-06, + "loss": 0.293, + "step": 26818 + }, + { + "epoch": 0.6786699395197003, + "grad_norm": 9.917549133300781, + "learning_rate": 2.381920489960441e-06, + "loss": 0.2309, + "step": 26819 + }, + { + "epoch": 0.678695245084394, + "grad_norm": 3.9613943099975586, + "learning_rate": 2.3815784194877156e-06, + "loss": 0.1562, + "step": 26820 + }, + { + "epoch": 0.6787205506490878, + "grad_norm": 4.564631462097168, + "learning_rate": 2.3812363659007076e-06, + "loss": 0.1083, + "step": 26821 + }, + { + "epoch": 0.6787458562137814, + "grad_norm": 3.667485475540161, + "learning_rate": 2.380894329201615e-06, + "loss": 0.1407, + "step": 26822 + }, + { + "epoch": 0.6787711617784751, + "grad_norm": 3.5047552585601807, + "learning_rate": 2.3805523093926504e-06, + "loss": 0.1076, + "step": 26823 + }, + { + "epoch": 0.6787964673431688, + "grad_norm": 16.70398712158203, + "learning_rate": 2.3802103064760147e-06, + "loss": 0.1951, + "step": 26824 + }, + { + "epoch": 0.6788217729078624, + "grad_norm": 3.851077079772949, + "learning_rate": 2.3798683204539187e-06, + "loss": 0.0809, + "step": 26825 + }, + { + "epoch": 0.6788470784725561, + "grad_norm": 2.450040102005005, + "learning_rate": 2.379526351328561e-06, + "loss": 0.0878, + "step": 26826 + }, + { + "epoch": 0.6788723840372498, + "grad_norm": 8.590103149414062, + "learning_rate": 2.3791843991021525e-06, + "loss": 0.2617, + "step": 26827 + }, + { + "epoch": 0.6788976896019435, + "grad_norm": 2.2932024002075195, + "learning_rate": 2.3788424637768935e-06, + "loss": 0.1133, + "step": 26828 + }, + { + "epoch": 0.6789229951666371, + "grad_norm": 13.164138793945312, + "learning_rate": 2.378500545354995e-06, + "loss": 0.2619, + "step": 26829 + }, + { + "epoch": 0.6789483007313308, + "grad_norm": 3.217712163925171, + "learning_rate": 2.378158643838658e-06, + "loss": 0.1027, + "step": 26830 + }, + { + "epoch": 0.6789736062960245, + "grad_norm": 3.105283498764038, + "learning_rate": 2.3778167592300883e-06, + "loss": 0.1399, + "step": 26831 + }, + { + "epoch": 0.6789989118607181, + "grad_norm": 3.1332435607910156, + "learning_rate": 2.377474891531489e-06, + "loss": 0.0977, + "step": 26832 + }, + { + "epoch": 0.6790242174254119, + "grad_norm": 3.7012939453125, + "learning_rate": 2.377133040745068e-06, + "loss": 0.1107, + "step": 26833 + }, + { + "epoch": 0.6790495229901056, + "grad_norm": 6.1870436668396, + "learning_rate": 2.3767912068730283e-06, + "loss": 0.1913, + "step": 26834 + }, + { + "epoch": 0.6790748285547992, + "grad_norm": 6.605329990386963, + "learning_rate": 2.376449389917574e-06, + "loss": 0.1665, + "step": 26835 + }, + { + "epoch": 0.6791001341194929, + "grad_norm": 5.620166778564453, + "learning_rate": 2.376107589880908e-06, + "loss": 0.1378, + "step": 26836 + }, + { + "epoch": 0.6791254396841866, + "grad_norm": 8.097090721130371, + "learning_rate": 2.375765806765237e-06, + "loss": 0.1599, + "step": 26837 + }, + { + "epoch": 0.6791507452488802, + "grad_norm": 8.465163230895996, + "learning_rate": 2.375424040572765e-06, + "loss": 0.2789, + "step": 26838 + }, + { + "epoch": 0.6791760508135739, + "grad_norm": 15.378264427185059, + "learning_rate": 2.375082291305695e-06, + "loss": 0.3435, + "step": 26839 + }, + { + "epoch": 0.6792013563782676, + "grad_norm": 7.633852481842041, + "learning_rate": 2.3747405589662315e-06, + "loss": 0.2723, + "step": 26840 + }, + { + "epoch": 0.6792266619429612, + "grad_norm": 4.254097938537598, + "learning_rate": 2.374398843556575e-06, + "loss": 0.2108, + "step": 26841 + }, + { + "epoch": 0.6792519675076549, + "grad_norm": 3.6207151412963867, + "learning_rate": 2.3740571450789347e-06, + "loss": 0.1504, + "step": 26842 + }, + { + "epoch": 0.6792772730723486, + "grad_norm": 7.656393527984619, + "learning_rate": 2.373715463535511e-06, + "loss": 0.1883, + "step": 26843 + }, + { + "epoch": 0.6793025786370422, + "grad_norm": 4.2729268074035645, + "learning_rate": 2.373373798928507e-06, + "loss": 0.151, + "step": 26844 + }, + { + "epoch": 0.679327884201736, + "grad_norm": 9.587242126464844, + "learning_rate": 2.373032151260126e-06, + "loss": 0.345, + "step": 26845 + }, + { + "epoch": 0.6793531897664297, + "grad_norm": 5.249607563018799, + "learning_rate": 2.3726905205325733e-06, + "loss": 0.2033, + "step": 26846 + }, + { + "epoch": 0.6793784953311233, + "grad_norm": 10.53967571258545, + "learning_rate": 2.372348906748051e-06, + "loss": 0.3095, + "step": 26847 + }, + { + "epoch": 0.679403800895817, + "grad_norm": 4.295559406280518, + "learning_rate": 2.3720073099087622e-06, + "loss": 0.1647, + "step": 26848 + }, + { + "epoch": 0.6794291064605107, + "grad_norm": 15.117453575134277, + "learning_rate": 2.3716657300169064e-06, + "loss": 0.1873, + "step": 26849 + }, + { + "epoch": 0.6794544120252043, + "grad_norm": 4.611058235168457, + "learning_rate": 2.3713241670746924e-06, + "loss": 0.2271, + "step": 26850 + }, + { + "epoch": 0.679479717589898, + "grad_norm": 6.78534460067749, + "learning_rate": 2.3709826210843173e-06, + "loss": 0.1487, + "step": 26851 + }, + { + "epoch": 0.6795050231545917, + "grad_norm": 7.513059616088867, + "learning_rate": 2.3706410920479906e-06, + "loss": 0.1539, + "step": 26852 + }, + { + "epoch": 0.6795303287192854, + "grad_norm": 5.934995651245117, + "learning_rate": 2.370299579967906e-06, + "loss": 0.1395, + "step": 26853 + }, + { + "epoch": 0.679555634283979, + "grad_norm": 8.077929496765137, + "learning_rate": 2.369958084846273e-06, + "loss": 0.1876, + "step": 26854 + }, + { + "epoch": 0.6795809398486727, + "grad_norm": 2.5377230644226074, + "learning_rate": 2.3696166066852882e-06, + "loss": 0.1095, + "step": 26855 + }, + { + "epoch": 0.6796062454133664, + "grad_norm": 4.698808193206787, + "learning_rate": 2.369275145487161e-06, + "loss": 0.1663, + "step": 26856 + }, + { + "epoch": 0.67963155097806, + "grad_norm": 3.418919324874878, + "learning_rate": 2.3689337012540845e-06, + "loss": 0.138, + "step": 26857 + }, + { + "epoch": 0.6796568565427538, + "grad_norm": 3.1877734661102295, + "learning_rate": 2.3685922739882675e-06, + "loss": 0.1573, + "step": 26858 + }, + { + "epoch": 0.6796821621074475, + "grad_norm": 3.890454053878784, + "learning_rate": 2.368250863691908e-06, + "loss": 0.1248, + "step": 26859 + }, + { + "epoch": 0.6797074676721411, + "grad_norm": 7.143097877502441, + "learning_rate": 2.367909470367213e-06, + "loss": 0.1398, + "step": 26860 + }, + { + "epoch": 0.6797327732368348, + "grad_norm": 13.344642639160156, + "learning_rate": 2.3675680940163758e-06, + "loss": 0.149, + "step": 26861 + }, + { + "epoch": 0.6797580788015285, + "grad_norm": 3.8947980403900146, + "learning_rate": 2.3672267346416043e-06, + "loss": 0.1178, + "step": 26862 + }, + { + "epoch": 0.6797833843662221, + "grad_norm": 5.72817850112915, + "learning_rate": 2.3668853922450963e-06, + "loss": 0.115, + "step": 26863 + }, + { + "epoch": 0.6798086899309158, + "grad_norm": 3.9235117435455322, + "learning_rate": 2.366544066829056e-06, + "loss": 0.113, + "step": 26864 + }, + { + "epoch": 0.6798339954956095, + "grad_norm": 5.723841190338135, + "learning_rate": 2.3662027583956838e-06, + "loss": 0.1707, + "step": 26865 + }, + { + "epoch": 0.6798593010603031, + "grad_norm": 2.1543760299682617, + "learning_rate": 2.3658614669471796e-06, + "loss": 0.1135, + "step": 26866 + }, + { + "epoch": 0.6798846066249968, + "grad_norm": 13.986956596374512, + "learning_rate": 2.3655201924857446e-06, + "loss": 0.3398, + "step": 26867 + }, + { + "epoch": 0.6799099121896905, + "grad_norm": 8.255699157714844, + "learning_rate": 2.3651789350135785e-06, + "loss": 0.1883, + "step": 26868 + }, + { + "epoch": 0.6799352177543841, + "grad_norm": 6.18666934967041, + "learning_rate": 2.364837694532885e-06, + "loss": 0.1472, + "step": 26869 + }, + { + "epoch": 0.6799605233190779, + "grad_norm": 5.02912712097168, + "learning_rate": 2.364496471045863e-06, + "loss": 0.1301, + "step": 26870 + }, + { + "epoch": 0.6799858288837716, + "grad_norm": 2.806866407394409, + "learning_rate": 2.3641552645547127e-06, + "loss": 0.0862, + "step": 26871 + }, + { + "epoch": 0.6800111344484652, + "grad_norm": 4.217636585235596, + "learning_rate": 2.3638140750616336e-06, + "loss": 0.1307, + "step": 26872 + }, + { + "epoch": 0.6800364400131589, + "grad_norm": 10.634016036987305, + "learning_rate": 2.3634729025688284e-06, + "loss": 0.2645, + "step": 26873 + }, + { + "epoch": 0.6800617455778526, + "grad_norm": 7.368973255157471, + "learning_rate": 2.3631317470784966e-06, + "loss": 0.2483, + "step": 26874 + }, + { + "epoch": 0.6800870511425462, + "grad_norm": 14.490778923034668, + "learning_rate": 2.362790608592837e-06, + "loss": 0.2084, + "step": 26875 + }, + { + "epoch": 0.6801123567072399, + "grad_norm": 4.9780378341674805, + "learning_rate": 2.3624494871140493e-06, + "loss": 0.2116, + "step": 26876 + }, + { + "epoch": 0.6801376622719336, + "grad_norm": 3.13031005859375, + "learning_rate": 2.3621083826443353e-06, + "loss": 0.1044, + "step": 26877 + }, + { + "epoch": 0.6801629678366272, + "grad_norm": 10.772438049316406, + "learning_rate": 2.3617672951858937e-06, + "loss": 0.2525, + "step": 26878 + }, + { + "epoch": 0.6801882734013209, + "grad_norm": 2.955246925354004, + "learning_rate": 2.361426224740924e-06, + "loss": 0.1304, + "step": 26879 + }, + { + "epoch": 0.6802135789660146, + "grad_norm": 3.772526502609253, + "learning_rate": 2.3610851713116245e-06, + "loss": 0.1724, + "step": 26880 + }, + { + "epoch": 0.6802388845307084, + "grad_norm": 2.519350290298462, + "learning_rate": 2.360744134900197e-06, + "loss": 0.0754, + "step": 26881 + }, + { + "epoch": 0.680264190095402, + "grad_norm": 2.288029432296753, + "learning_rate": 2.360403115508838e-06, + "loss": 0.1036, + "step": 26882 + }, + { + "epoch": 0.6802894956600957, + "grad_norm": 15.340229034423828, + "learning_rate": 2.360062113139752e-06, + "loss": 0.1715, + "step": 26883 + }, + { + "epoch": 0.6803148012247894, + "grad_norm": 5.034215450286865, + "learning_rate": 2.359721127795131e-06, + "loss": 0.2221, + "step": 26884 + }, + { + "epoch": 0.680340106789483, + "grad_norm": 3.689549446105957, + "learning_rate": 2.3593801594771793e-06, + "loss": 0.196, + "step": 26885 + }, + { + "epoch": 0.6803654123541767, + "grad_norm": 3.977879524230957, + "learning_rate": 2.3590392081880914e-06, + "loss": 0.1597, + "step": 26886 + }, + { + "epoch": 0.6803907179188704, + "grad_norm": 2.814631938934326, + "learning_rate": 2.358698273930072e-06, + "loss": 0.0996, + "step": 26887 + }, + { + "epoch": 0.680416023483564, + "grad_norm": 3.136869430541992, + "learning_rate": 2.358357356705313e-06, + "loss": 0.1649, + "step": 26888 + }, + { + "epoch": 0.6804413290482577, + "grad_norm": 5.769415378570557, + "learning_rate": 2.3580164565160172e-06, + "loss": 0.1992, + "step": 26889 + }, + { + "epoch": 0.6804666346129514, + "grad_norm": 2.924182891845703, + "learning_rate": 2.357675573364381e-06, + "loss": 0.1277, + "step": 26890 + }, + { + "epoch": 0.680491940177645, + "grad_norm": 5.268206596374512, + "learning_rate": 2.357334707252606e-06, + "loss": 0.1433, + "step": 26891 + }, + { + "epoch": 0.6805172457423387, + "grad_norm": 7.3543925285339355, + "learning_rate": 2.356993858182885e-06, + "loss": 0.1659, + "step": 26892 + }, + { + "epoch": 0.6805425513070325, + "grad_norm": 1.7660199403762817, + "learning_rate": 2.3566530261574205e-06, + "loss": 0.085, + "step": 26893 + }, + { + "epoch": 0.680567856871726, + "grad_norm": 10.245593070983887, + "learning_rate": 2.356312211178407e-06, + "loss": 0.2256, + "step": 26894 + }, + { + "epoch": 0.6805931624364198, + "grad_norm": 5.02879524230957, + "learning_rate": 2.3559714132480468e-06, + "loss": 0.1317, + "step": 26895 + }, + { + "epoch": 0.6806184680011135, + "grad_norm": 3.524667978286743, + "learning_rate": 2.355630632368534e-06, + "loss": 0.1067, + "step": 26896 + }, + { + "epoch": 0.6806437735658071, + "grad_norm": 4.4854021072387695, + "learning_rate": 2.3552898685420683e-06, + "loss": 0.1806, + "step": 26897 + }, + { + "epoch": 0.6806690791305008, + "grad_norm": 6.41200590133667, + "learning_rate": 2.354949121770846e-06, + "loss": 0.1994, + "step": 26898 + }, + { + "epoch": 0.6806943846951945, + "grad_norm": 5.562307357788086, + "learning_rate": 2.3546083920570627e-06, + "loss": 0.1305, + "step": 26899 + }, + { + "epoch": 0.6807196902598881, + "grad_norm": 3.7195770740509033, + "learning_rate": 2.3542676794029203e-06, + "loss": 0.1264, + "step": 26900 + }, + { + "epoch": 0.6807449958245818, + "grad_norm": 5.194451808929443, + "learning_rate": 2.353926983810613e-06, + "loss": 0.1709, + "step": 26901 + }, + { + "epoch": 0.6807703013892755, + "grad_norm": 3.7874228954315186, + "learning_rate": 2.353586305282338e-06, + "loss": 0.0977, + "step": 26902 + }, + { + "epoch": 0.6807956069539691, + "grad_norm": 3.7339766025543213, + "learning_rate": 2.353245643820291e-06, + "loss": 0.1719, + "step": 26903 + }, + { + "epoch": 0.6808209125186628, + "grad_norm": 4.3168230056762695, + "learning_rate": 2.352904999426673e-06, + "loss": 0.0995, + "step": 26904 + }, + { + "epoch": 0.6808462180833565, + "grad_norm": 6.6979546546936035, + "learning_rate": 2.3525643721036778e-06, + "loss": 0.1644, + "step": 26905 + }, + { + "epoch": 0.6808715236480503, + "grad_norm": 4.040266036987305, + "learning_rate": 2.352223761853503e-06, + "loss": 0.1608, + "step": 26906 + }, + { + "epoch": 0.6808968292127439, + "grad_norm": 8.364062309265137, + "learning_rate": 2.3518831686783415e-06, + "loss": 0.1686, + "step": 26907 + }, + { + "epoch": 0.6809221347774376, + "grad_norm": 7.968179225921631, + "learning_rate": 2.351542592580396e-06, + "loss": 0.1585, + "step": 26908 + }, + { + "epoch": 0.6809474403421313, + "grad_norm": 2.9016435146331787, + "learning_rate": 2.351202033561859e-06, + "loss": 0.091, + "step": 26909 + }, + { + "epoch": 0.6809727459068249, + "grad_norm": 2.196690082550049, + "learning_rate": 2.3508614916249273e-06, + "loss": 0.0979, + "step": 26910 + }, + { + "epoch": 0.6809980514715186, + "grad_norm": 8.128071784973145, + "learning_rate": 2.3505209667717955e-06, + "loss": 0.1509, + "step": 26911 + }, + { + "epoch": 0.6810233570362123, + "grad_norm": 4.403853893280029, + "learning_rate": 2.3501804590046632e-06, + "loss": 0.0831, + "step": 26912 + }, + { + "epoch": 0.6810486626009059, + "grad_norm": 8.471231460571289, + "learning_rate": 2.349839968325724e-06, + "loss": 0.2439, + "step": 26913 + }, + { + "epoch": 0.6810739681655996, + "grad_norm": 3.441879987716675, + "learning_rate": 2.3494994947371742e-06, + "loss": 0.1425, + "step": 26914 + }, + { + "epoch": 0.6810992737302933, + "grad_norm": 5.514955520629883, + "learning_rate": 2.3491590382412068e-06, + "loss": 0.1826, + "step": 26915 + }, + { + "epoch": 0.6811245792949869, + "grad_norm": 5.72833251953125, + "learning_rate": 2.3488185988400214e-06, + "loss": 0.2298, + "step": 26916 + }, + { + "epoch": 0.6811498848596806, + "grad_norm": 14.372336387634277, + "learning_rate": 2.34847817653581e-06, + "loss": 0.3193, + "step": 26917 + }, + { + "epoch": 0.6811751904243744, + "grad_norm": 5.701099872589111, + "learning_rate": 2.3481377713307735e-06, + "loss": 0.2165, + "step": 26918 + }, + { + "epoch": 0.681200495989068, + "grad_norm": 3.0955731868743896, + "learning_rate": 2.3477973832270994e-06, + "loss": 0.1384, + "step": 26919 + }, + { + "epoch": 0.6812258015537617, + "grad_norm": 8.286544799804688, + "learning_rate": 2.3474570122269886e-06, + "loss": 0.1659, + "step": 26920 + }, + { + "epoch": 0.6812511071184554, + "grad_norm": 2.8435468673706055, + "learning_rate": 2.347116658332632e-06, + "loss": 0.1398, + "step": 26921 + }, + { + "epoch": 0.681276412683149, + "grad_norm": 5.418069362640381, + "learning_rate": 2.346776321546231e-06, + "loss": 0.1395, + "step": 26922 + }, + { + "epoch": 0.6813017182478427, + "grad_norm": 4.853427410125732, + "learning_rate": 2.3464360018699716e-06, + "loss": 0.0957, + "step": 26923 + }, + { + "epoch": 0.6813270238125364, + "grad_norm": 7.792752742767334, + "learning_rate": 2.3460956993060546e-06, + "loss": 0.1471, + "step": 26924 + }, + { + "epoch": 0.68135232937723, + "grad_norm": 7.677557945251465, + "learning_rate": 2.3457554138566734e-06, + "loss": 0.1386, + "step": 26925 + }, + { + "epoch": 0.6813776349419237, + "grad_norm": 4.388801097869873, + "learning_rate": 2.3454151455240193e-06, + "loss": 0.0994, + "step": 26926 + }, + { + "epoch": 0.6814029405066174, + "grad_norm": 3.2352564334869385, + "learning_rate": 2.3450748943102914e-06, + "loss": 0.1533, + "step": 26927 + }, + { + "epoch": 0.681428246071311, + "grad_norm": 5.16180419921875, + "learning_rate": 2.3447346602176812e-06, + "loss": 0.1682, + "step": 26928 + }, + { + "epoch": 0.6814535516360047, + "grad_norm": 6.978886604309082, + "learning_rate": 2.3443944432483837e-06, + "loss": 0.1767, + "step": 26929 + }, + { + "epoch": 0.6814788572006985, + "grad_norm": 3.0725138187408447, + "learning_rate": 2.3440542434045902e-06, + "loss": 0.1165, + "step": 26930 + }, + { + "epoch": 0.6815041627653922, + "grad_norm": 9.033098220825195, + "learning_rate": 2.3437140606884996e-06, + "loss": 0.1601, + "step": 26931 + }, + { + "epoch": 0.6815294683300858, + "grad_norm": 3.8231329917907715, + "learning_rate": 2.3433738951023023e-06, + "loss": 0.1258, + "step": 26932 + }, + { + "epoch": 0.6815547738947795, + "grad_norm": 4.867788791656494, + "learning_rate": 2.3430337466481925e-06, + "loss": 0.1692, + "step": 26933 + }, + { + "epoch": 0.6815800794594732, + "grad_norm": 2.8843255043029785, + "learning_rate": 2.342693615328362e-06, + "loss": 0.1265, + "step": 26934 + }, + { + "epoch": 0.6816053850241668, + "grad_norm": 6.668726921081543, + "learning_rate": 2.34235350114501e-06, + "loss": 0.2065, + "step": 26935 + }, + { + "epoch": 0.6816306905888605, + "grad_norm": 5.345111846923828, + "learning_rate": 2.342013404100322e-06, + "loss": 0.1788, + "step": 26936 + }, + { + "epoch": 0.6816559961535542, + "grad_norm": 2.521587610244751, + "learning_rate": 2.3416733241964966e-06, + "loss": 0.0825, + "step": 26937 + }, + { + "epoch": 0.6816813017182478, + "grad_norm": 4.09241247177124, + "learning_rate": 2.341333261435724e-06, + "loss": 0.1303, + "step": 26938 + }, + { + "epoch": 0.6817066072829415, + "grad_norm": 3.2268242835998535, + "learning_rate": 2.340993215820201e-06, + "loss": 0.1129, + "step": 26939 + }, + { + "epoch": 0.6817319128476352, + "grad_norm": 5.304986953735352, + "learning_rate": 2.340653187352117e-06, + "loss": 0.1676, + "step": 26940 + }, + { + "epoch": 0.6817572184123288, + "grad_norm": 3.7862062454223633, + "learning_rate": 2.340313176033667e-06, + "loss": 0.1388, + "step": 26941 + }, + { + "epoch": 0.6817825239770225, + "grad_norm": 7.890143394470215, + "learning_rate": 2.3399731818670403e-06, + "loss": 0.2592, + "step": 26942 + }, + { + "epoch": 0.6818078295417163, + "grad_norm": 5.507801532745361, + "learning_rate": 2.3396332048544337e-06, + "loss": 0.1445, + "step": 26943 + }, + { + "epoch": 0.6818331351064099, + "grad_norm": 5.98775577545166, + "learning_rate": 2.339293244998037e-06, + "loss": 0.2033, + "step": 26944 + }, + { + "epoch": 0.6818584406711036, + "grad_norm": 2.766530752182007, + "learning_rate": 2.3389533023000442e-06, + "loss": 0.1002, + "step": 26945 + }, + { + "epoch": 0.6818837462357973, + "grad_norm": 13.744133949279785, + "learning_rate": 2.3386133767626445e-06, + "loss": 0.1731, + "step": 26946 + }, + { + "epoch": 0.6819090518004909, + "grad_norm": 4.346869468688965, + "learning_rate": 2.338273468388034e-06, + "loss": 0.1028, + "step": 26947 + }, + { + "epoch": 0.6819343573651846, + "grad_norm": 4.6390204429626465, + "learning_rate": 2.3379335771784e-06, + "loss": 0.1737, + "step": 26948 + }, + { + "epoch": 0.6819596629298783, + "grad_norm": 4.617502212524414, + "learning_rate": 2.337593703135942e-06, + "loss": 0.148, + "step": 26949 + }, + { + "epoch": 0.6819849684945719, + "grad_norm": 8.818504333496094, + "learning_rate": 2.3372538462628425e-06, + "loss": 0.2753, + "step": 26950 + }, + { + "epoch": 0.6820102740592656, + "grad_norm": 6.834496974945068, + "learning_rate": 2.3369140065613e-06, + "loss": 0.1521, + "step": 26951 + }, + { + "epoch": 0.6820355796239593, + "grad_norm": 8.273619651794434, + "learning_rate": 2.3365741840335037e-06, + "loss": 0.1562, + "step": 26952 + }, + { + "epoch": 0.6820608851886529, + "grad_norm": 11.943321228027344, + "learning_rate": 2.336234378681645e-06, + "loss": 0.1594, + "step": 26953 + }, + { + "epoch": 0.6820861907533466, + "grad_norm": 4.416179180145264, + "learning_rate": 2.3358945905079127e-06, + "loss": 0.1698, + "step": 26954 + }, + { + "epoch": 0.6821114963180404, + "grad_norm": 2.985042095184326, + "learning_rate": 2.3355548195145033e-06, + "loss": 0.1158, + "step": 26955 + }, + { + "epoch": 0.6821368018827341, + "grad_norm": 7.806783199310303, + "learning_rate": 2.3352150657036053e-06, + "loss": 0.272, + "step": 26956 + }, + { + "epoch": 0.6821621074474277, + "grad_norm": 8.290847778320312, + "learning_rate": 2.3348753290774074e-06, + "loss": 0.258, + "step": 26957 + }, + { + "epoch": 0.6821874130121214, + "grad_norm": 3.9551727771759033, + "learning_rate": 2.334535609638105e-06, + "loss": 0.1709, + "step": 26958 + }, + { + "epoch": 0.6822127185768151, + "grad_norm": 4.069233417510986, + "learning_rate": 2.334195907387886e-06, + "loss": 0.1494, + "step": 26959 + }, + { + "epoch": 0.6822380241415087, + "grad_norm": 3.1537888050079346, + "learning_rate": 2.3338562223289423e-06, + "loss": 0.1297, + "step": 26960 + }, + { + "epoch": 0.6822633297062024, + "grad_norm": 3.9132981300354004, + "learning_rate": 2.333516554463462e-06, + "loss": 0.1533, + "step": 26961 + }, + { + "epoch": 0.6822886352708961, + "grad_norm": 4.175604820251465, + "learning_rate": 2.3331769037936408e-06, + "loss": 0.1146, + "step": 26962 + }, + { + "epoch": 0.6823139408355897, + "grad_norm": 8.754108428955078, + "learning_rate": 2.3328372703216623e-06, + "loss": 0.2423, + "step": 26963 + }, + { + "epoch": 0.6823392464002834, + "grad_norm": 7.599188804626465, + "learning_rate": 2.332497654049722e-06, + "loss": 0.2297, + "step": 26964 + }, + { + "epoch": 0.6823645519649771, + "grad_norm": 2.701284646987915, + "learning_rate": 2.3321580549800057e-06, + "loss": 0.0889, + "step": 26965 + }, + { + "epoch": 0.6823898575296707, + "grad_norm": 5.944904327392578, + "learning_rate": 2.3318184731147097e-06, + "loss": 0.1905, + "step": 26966 + }, + { + "epoch": 0.6824151630943645, + "grad_norm": 10.986051559448242, + "learning_rate": 2.331478908456017e-06, + "loss": 0.2007, + "step": 26967 + }, + { + "epoch": 0.6824404686590582, + "grad_norm": 4.559438705444336, + "learning_rate": 2.331139361006122e-06, + "loss": 0.1351, + "step": 26968 + }, + { + "epoch": 0.6824657742237518, + "grad_norm": 3.516486644744873, + "learning_rate": 2.330799830767211e-06, + "loss": 0.1017, + "step": 26969 + }, + { + "epoch": 0.6824910797884455, + "grad_norm": 2.912761926651001, + "learning_rate": 2.330460317741477e-06, + "loss": 0.1115, + "step": 26970 + }, + { + "epoch": 0.6825163853531392, + "grad_norm": 4.183879852294922, + "learning_rate": 2.3301208219311077e-06, + "loss": 0.1474, + "step": 26971 + }, + { + "epoch": 0.6825416909178328, + "grad_norm": 3.6305389404296875, + "learning_rate": 2.3297813433382926e-06, + "loss": 0.1374, + "step": 26972 + }, + { + "epoch": 0.6825669964825265, + "grad_norm": 4.9825758934021, + "learning_rate": 2.3294418819652193e-06, + "loss": 0.1666, + "step": 26973 + }, + { + "epoch": 0.6825923020472202, + "grad_norm": 4.889565467834473, + "learning_rate": 2.32910243781408e-06, + "loss": 0.1737, + "step": 26974 + }, + { + "epoch": 0.6826176076119138, + "grad_norm": 2.739708185195923, + "learning_rate": 2.3287630108870628e-06, + "loss": 0.145, + "step": 26975 + }, + { + "epoch": 0.6826429131766075, + "grad_norm": 10.661178588867188, + "learning_rate": 2.328423601186356e-06, + "loss": 0.2335, + "step": 26976 + }, + { + "epoch": 0.6826682187413012, + "grad_norm": 2.4466288089752197, + "learning_rate": 2.3280842087141463e-06, + "loss": 0.1009, + "step": 26977 + }, + { + "epoch": 0.6826935243059948, + "grad_norm": 8.354085922241211, + "learning_rate": 2.3277448334726267e-06, + "loss": 0.2015, + "step": 26978 + }, + { + "epoch": 0.6827188298706885, + "grad_norm": 4.389764308929443, + "learning_rate": 2.3274054754639818e-06, + "loss": 0.1491, + "step": 26979 + }, + { + "epoch": 0.6827441354353823, + "grad_norm": 5.3246049880981445, + "learning_rate": 2.327066134690405e-06, + "loss": 0.1351, + "step": 26980 + }, + { + "epoch": 0.682769441000076, + "grad_norm": 5.633238792419434, + "learning_rate": 2.326726811154078e-06, + "loss": 0.1371, + "step": 26981 + }, + { + "epoch": 0.6827947465647696, + "grad_norm": 4.32926082611084, + "learning_rate": 2.326387504857195e-06, + "loss": 0.108, + "step": 26982 + }, + { + "epoch": 0.6828200521294633, + "grad_norm": 5.347936153411865, + "learning_rate": 2.326048215801941e-06, + "loss": 0.1347, + "step": 26983 + }, + { + "epoch": 0.682845357694157, + "grad_norm": 8.493562698364258, + "learning_rate": 2.3257089439905046e-06, + "loss": 0.2179, + "step": 26984 + }, + { + "epoch": 0.6828706632588506, + "grad_norm": 6.342261791229248, + "learning_rate": 2.3253696894250723e-06, + "loss": 0.2042, + "step": 26985 + }, + { + "epoch": 0.6828959688235443, + "grad_norm": 3.7204227447509766, + "learning_rate": 2.325030452107835e-06, + "loss": 0.1631, + "step": 26986 + }, + { + "epoch": 0.682921274388238, + "grad_norm": 5.94539737701416, + "learning_rate": 2.3246912320409785e-06, + "loss": 0.191, + "step": 26987 + }, + { + "epoch": 0.6829465799529316, + "grad_norm": 7.036629676818848, + "learning_rate": 2.3243520292266915e-06, + "loss": 0.1431, + "step": 26988 + }, + { + "epoch": 0.6829718855176253, + "grad_norm": 5.736843585968018, + "learning_rate": 2.3240128436671594e-06, + "loss": 0.2434, + "step": 26989 + }, + { + "epoch": 0.682997191082319, + "grad_norm": 4.428461074829102, + "learning_rate": 2.3236736753645695e-06, + "loss": 0.1734, + "step": 26990 + }, + { + "epoch": 0.6830224966470126, + "grad_norm": 2.637385129928589, + "learning_rate": 2.323334524321112e-06, + "loss": 0.1094, + "step": 26991 + }, + { + "epoch": 0.6830478022117064, + "grad_norm": 4.156236171722412, + "learning_rate": 2.32299539053897e-06, + "loss": 0.1607, + "step": 26992 + }, + { + "epoch": 0.6830731077764001, + "grad_norm": 5.204747676849365, + "learning_rate": 2.3226562740203375e-06, + "loss": 0.1983, + "step": 26993 + }, + { + "epoch": 0.6830984133410937, + "grad_norm": 5.641448974609375, + "learning_rate": 2.3223171747673915e-06, + "loss": 0.123, + "step": 26994 + }, + { + "epoch": 0.6831237189057874, + "grad_norm": 2.3149149417877197, + "learning_rate": 2.321978092782327e-06, + "loss": 0.1185, + "step": 26995 + }, + { + "epoch": 0.6831490244704811, + "grad_norm": 6.090348720550537, + "learning_rate": 2.321639028067325e-06, + "loss": 0.1838, + "step": 26996 + }, + { + "epoch": 0.6831743300351747, + "grad_norm": 5.9221906661987305, + "learning_rate": 2.321299980624579e-06, + "loss": 0.2054, + "step": 26997 + }, + { + "epoch": 0.6831996355998684, + "grad_norm": 6.992166042327881, + "learning_rate": 2.3209609504562673e-06, + "loss": 0.2296, + "step": 26998 + }, + { + "epoch": 0.6832249411645621, + "grad_norm": 5.137491226196289, + "learning_rate": 2.3206219375645815e-06, + "loss": 0.1558, + "step": 26999 + }, + { + "epoch": 0.6832502467292557, + "grad_norm": 2.8933939933776855, + "learning_rate": 2.3202829419517054e-06, + "loss": 0.0864, + "step": 27000 + }, + { + "epoch": 0.6832755522939494, + "grad_norm": 5.8072075843811035, + "learning_rate": 2.3199439636198273e-06, + "loss": 0.1241, + "step": 27001 + }, + { + "epoch": 0.6833008578586431, + "grad_norm": 8.418840408325195, + "learning_rate": 2.3196050025711326e-06, + "loss": 0.1585, + "step": 27002 + }, + { + "epoch": 0.6833261634233367, + "grad_norm": 4.251806259155273, + "learning_rate": 2.319266058807806e-06, + "loss": 0.1642, + "step": 27003 + }, + { + "epoch": 0.6833514689880305, + "grad_norm": 4.162633895874023, + "learning_rate": 2.3189271323320324e-06, + "loss": 0.1735, + "step": 27004 + }, + { + "epoch": 0.6833767745527242, + "grad_norm": 5.977972507476807, + "learning_rate": 2.3185882231460012e-06, + "loss": 0.1965, + "step": 27005 + }, + { + "epoch": 0.6834020801174178, + "grad_norm": 4.032973289489746, + "learning_rate": 2.318249331251895e-06, + "loss": 0.1537, + "step": 27006 + }, + { + "epoch": 0.6834273856821115, + "grad_norm": 4.906004428863525, + "learning_rate": 2.317910456651901e-06, + "loss": 0.2488, + "step": 27007 + }, + { + "epoch": 0.6834526912468052, + "grad_norm": 3.5317084789276123, + "learning_rate": 2.3175715993482005e-06, + "loss": 0.1277, + "step": 27008 + }, + { + "epoch": 0.6834779968114989, + "grad_norm": 2.768486499786377, + "learning_rate": 2.3172327593429843e-06, + "loss": 0.1339, + "step": 27009 + }, + { + "epoch": 0.6835033023761925, + "grad_norm": 5.899149417877197, + "learning_rate": 2.3168939366384352e-06, + "loss": 0.1485, + "step": 27010 + }, + { + "epoch": 0.6835286079408862, + "grad_norm": 4.295309543609619, + "learning_rate": 2.316555131236738e-06, + "loss": 0.1548, + "step": 27011 + }, + { + "epoch": 0.6835539135055799, + "grad_norm": 4.359819412231445, + "learning_rate": 2.316216343140075e-06, + "loss": 0.1314, + "step": 27012 + }, + { + "epoch": 0.6835792190702735, + "grad_norm": 2.876161575317383, + "learning_rate": 2.315877572350636e-06, + "loss": 0.1168, + "step": 27013 + }, + { + "epoch": 0.6836045246349672, + "grad_norm": 5.45400333404541, + "learning_rate": 2.3155388188706034e-06, + "loss": 0.1845, + "step": 27014 + }, + { + "epoch": 0.683629830199661, + "grad_norm": 6.800889492034912, + "learning_rate": 2.3152000827021615e-06, + "loss": 0.1707, + "step": 27015 + }, + { + "epoch": 0.6836551357643545, + "grad_norm": 4.54041051864624, + "learning_rate": 2.3148613638474927e-06, + "loss": 0.1801, + "step": 27016 + }, + { + "epoch": 0.6836804413290483, + "grad_norm": 4.167757511138916, + "learning_rate": 2.3145226623087857e-06, + "loss": 0.1368, + "step": 27017 + }, + { + "epoch": 0.683705746893742, + "grad_norm": 2.491485595703125, + "learning_rate": 2.3141839780882226e-06, + "loss": 0.1136, + "step": 27018 + }, + { + "epoch": 0.6837310524584356, + "grad_norm": 5.550346374511719, + "learning_rate": 2.313845311187987e-06, + "loss": 0.1478, + "step": 27019 + }, + { + "epoch": 0.6837563580231293, + "grad_norm": 2.6571969985961914, + "learning_rate": 2.3135066616102634e-06, + "loss": 0.1265, + "step": 27020 + }, + { + "epoch": 0.683781663587823, + "grad_norm": 11.182520866394043, + "learning_rate": 2.313168029357234e-06, + "loss": 0.1899, + "step": 27021 + }, + { + "epoch": 0.6838069691525166, + "grad_norm": 6.783858299255371, + "learning_rate": 2.312829414431086e-06, + "loss": 0.1779, + "step": 27022 + }, + { + "epoch": 0.6838322747172103, + "grad_norm": 10.890823364257812, + "learning_rate": 2.3124908168339993e-06, + "loss": 0.2154, + "step": 27023 + }, + { + "epoch": 0.683857580281904, + "grad_norm": 17.051406860351562, + "learning_rate": 2.312152236568163e-06, + "loss": 0.358, + "step": 27024 + }, + { + "epoch": 0.6838828858465976, + "grad_norm": 5.550621032714844, + "learning_rate": 2.3118136736357537e-06, + "loss": 0.1924, + "step": 27025 + }, + { + "epoch": 0.6839081914112913, + "grad_norm": 17.79801368713379, + "learning_rate": 2.311475128038959e-06, + "loss": 0.2681, + "step": 27026 + }, + { + "epoch": 0.683933496975985, + "grad_norm": 14.46274471282959, + "learning_rate": 2.31113659977996e-06, + "loss": 0.3114, + "step": 27027 + }, + { + "epoch": 0.6839588025406786, + "grad_norm": 9.864331245422363, + "learning_rate": 2.3107980888609444e-06, + "loss": 0.2736, + "step": 27028 + }, + { + "epoch": 0.6839841081053724, + "grad_norm": 4.6985602378845215, + "learning_rate": 2.310459595284088e-06, + "loss": 0.1924, + "step": 27029 + }, + { + "epoch": 0.6840094136700661, + "grad_norm": 2.6926703453063965, + "learning_rate": 2.3101211190515794e-06, + "loss": 0.1391, + "step": 27030 + }, + { + "epoch": 0.6840347192347597, + "grad_norm": 4.468730926513672, + "learning_rate": 2.309782660165597e-06, + "loss": 0.1477, + "step": 27031 + }, + { + "epoch": 0.6840600247994534, + "grad_norm": 5.349316596984863, + "learning_rate": 2.309444218628328e-06, + "loss": 0.1185, + "step": 27032 + }, + { + "epoch": 0.6840853303641471, + "grad_norm": 3.9835805892944336, + "learning_rate": 2.3091057944419527e-06, + "loss": 0.152, + "step": 27033 + }, + { + "epoch": 0.6841106359288408, + "grad_norm": 4.996362209320068, + "learning_rate": 2.3087673876086535e-06, + "loss": 0.1687, + "step": 27034 + }, + { + "epoch": 0.6841359414935344, + "grad_norm": 2.9005017280578613, + "learning_rate": 2.3084289981306114e-06, + "loss": 0.1342, + "step": 27035 + }, + { + "epoch": 0.6841612470582281, + "grad_norm": 5.917377471923828, + "learning_rate": 2.308090626010011e-06, + "loss": 0.1954, + "step": 27036 + }, + { + "epoch": 0.6841865526229218, + "grad_norm": 4.32041072845459, + "learning_rate": 2.307752271249034e-06, + "loss": 0.1759, + "step": 27037 + }, + { + "epoch": 0.6842118581876154, + "grad_norm": 4.649971961975098, + "learning_rate": 2.3074139338498618e-06, + "loss": 0.1869, + "step": 27038 + }, + { + "epoch": 0.6842371637523091, + "grad_norm": 4.927950382232666, + "learning_rate": 2.3070756138146744e-06, + "loss": 0.1193, + "step": 27039 + }, + { + "epoch": 0.6842624693170029, + "grad_norm": 5.150026798248291, + "learning_rate": 2.3067373111456577e-06, + "loss": 0.1484, + "step": 27040 + }, + { + "epoch": 0.6842877748816965, + "grad_norm": 5.81095552444458, + "learning_rate": 2.30639902584499e-06, + "loss": 0.2236, + "step": 27041 + }, + { + "epoch": 0.6843130804463902, + "grad_norm": 2.5757060050964355, + "learning_rate": 2.3060607579148554e-06, + "loss": 0.1053, + "step": 27042 + }, + { + "epoch": 0.6843383860110839, + "grad_norm": 6.269152641296387, + "learning_rate": 2.3057225073574308e-06, + "loss": 0.1302, + "step": 27043 + }, + { + "epoch": 0.6843636915757775, + "grad_norm": 5.6192121505737305, + "learning_rate": 2.305384274174903e-06, + "loss": 0.1123, + "step": 27044 + }, + { + "epoch": 0.6843889971404712, + "grad_norm": 8.855633735656738, + "learning_rate": 2.3050460583694507e-06, + "loss": 0.1544, + "step": 27045 + }, + { + "epoch": 0.6844143027051649, + "grad_norm": 7.109942436218262, + "learning_rate": 2.304707859943255e-06, + "loss": 0.1793, + "step": 27046 + }, + { + "epoch": 0.6844396082698585, + "grad_norm": 12.466828346252441, + "learning_rate": 2.3043696788984964e-06, + "loss": 0.1792, + "step": 27047 + }, + { + "epoch": 0.6844649138345522, + "grad_norm": 14.679265975952148, + "learning_rate": 2.3040315152373555e-06, + "loss": 0.2446, + "step": 27048 + }, + { + "epoch": 0.6844902193992459, + "grad_norm": 3.2704198360443115, + "learning_rate": 2.303693368962015e-06, + "loss": 0.1677, + "step": 27049 + }, + { + "epoch": 0.6845155249639395, + "grad_norm": 2.8828117847442627, + "learning_rate": 2.3033552400746546e-06, + "loss": 0.1158, + "step": 27050 + }, + { + "epoch": 0.6845408305286332, + "grad_norm": 3.0968949794769287, + "learning_rate": 2.303017128577455e-06, + "loss": 0.0758, + "step": 27051 + }, + { + "epoch": 0.684566136093327, + "grad_norm": 8.158712387084961, + "learning_rate": 2.3026790344725935e-06, + "loss": 0.1896, + "step": 27052 + }, + { + "epoch": 0.6845914416580205, + "grad_norm": 3.0579004287719727, + "learning_rate": 2.3023409577622557e-06, + "loss": 0.1044, + "step": 27053 + }, + { + "epoch": 0.6846167472227143, + "grad_norm": 3.1127195358276367, + "learning_rate": 2.3020028984486177e-06, + "loss": 0.1512, + "step": 27054 + }, + { + "epoch": 0.684642052787408, + "grad_norm": 6.1135430335998535, + "learning_rate": 2.3016648565338644e-06, + "loss": 0.1966, + "step": 27055 + }, + { + "epoch": 0.6846673583521016, + "grad_norm": 4.600766181945801, + "learning_rate": 2.3013268320201685e-06, + "loss": 0.1246, + "step": 27056 + }, + { + "epoch": 0.6846926639167953, + "grad_norm": 2.7994327545166016, + "learning_rate": 2.300988824909716e-06, + "loss": 0.1321, + "step": 27057 + }, + { + "epoch": 0.684717969481489, + "grad_norm": 5.629315376281738, + "learning_rate": 2.300650835204683e-06, + "loss": 0.1855, + "step": 27058 + }, + { + "epoch": 0.6847432750461827, + "grad_norm": 3.4934773445129395, + "learning_rate": 2.300312862907254e-06, + "loss": 0.1533, + "step": 27059 + }, + { + "epoch": 0.6847685806108763, + "grad_norm": 23.256126403808594, + "learning_rate": 2.2999749080196016e-06, + "loss": 0.1985, + "step": 27060 + }, + { + "epoch": 0.68479388617557, + "grad_norm": 2.81381893157959, + "learning_rate": 2.299636970543911e-06, + "loss": 0.142, + "step": 27061 + }, + { + "epoch": 0.6848191917402637, + "grad_norm": 5.489384174346924, + "learning_rate": 2.2992990504823564e-06, + "loss": 0.1492, + "step": 27062 + }, + { + "epoch": 0.6848444973049573, + "grad_norm": 7.451601982116699, + "learning_rate": 2.2989611478371245e-06, + "loss": 0.2368, + "step": 27063 + }, + { + "epoch": 0.684869802869651, + "grad_norm": 6.347337245941162, + "learning_rate": 2.2986232626103853e-06, + "loss": 0.2267, + "step": 27064 + }, + { + "epoch": 0.6848951084343448, + "grad_norm": 3.1359829902648926, + "learning_rate": 2.298285394804325e-06, + "loss": 0.1139, + "step": 27065 + }, + { + "epoch": 0.6849204139990384, + "grad_norm": 6.310713291168213, + "learning_rate": 2.297947544421117e-06, + "loss": 0.1338, + "step": 27066 + }, + { + "epoch": 0.6849457195637321, + "grad_norm": 7.349937438964844, + "learning_rate": 2.2976097114629446e-06, + "loss": 0.2528, + "step": 27067 + }, + { + "epoch": 0.6849710251284258, + "grad_norm": 8.238834381103516, + "learning_rate": 2.2972718959319847e-06, + "loss": 0.2131, + "step": 27068 + }, + { + "epoch": 0.6849963306931194, + "grad_norm": 5.547798156738281, + "learning_rate": 2.2969340978304157e-06, + "loss": 0.1107, + "step": 27069 + }, + { + "epoch": 0.6850216362578131, + "grad_norm": 6.5512237548828125, + "learning_rate": 2.2965963171604135e-06, + "loss": 0.2042, + "step": 27070 + }, + { + "epoch": 0.6850469418225068, + "grad_norm": 5.433924198150635, + "learning_rate": 2.296258553924161e-06, + "loss": 0.175, + "step": 27071 + }, + { + "epoch": 0.6850722473872004, + "grad_norm": 3.997342824935913, + "learning_rate": 2.2959208081238345e-06, + "loss": 0.177, + "step": 27072 + }, + { + "epoch": 0.6850975529518941, + "grad_norm": 7.219945430755615, + "learning_rate": 2.2955830797616108e-06, + "loss": 0.1614, + "step": 27073 + }, + { + "epoch": 0.6851228585165878, + "grad_norm": 4.726474285125732, + "learning_rate": 2.295245368839669e-06, + "loss": 0.1538, + "step": 27074 + }, + { + "epoch": 0.6851481640812814, + "grad_norm": 5.474461078643799, + "learning_rate": 2.2949076753601858e-06, + "loss": 0.1207, + "step": 27075 + }, + { + "epoch": 0.6851734696459751, + "grad_norm": 6.394834041595459, + "learning_rate": 2.294569999325341e-06, + "loss": 0.2131, + "step": 27076 + }, + { + "epoch": 0.6851987752106689, + "grad_norm": 2.2990920543670654, + "learning_rate": 2.294232340737311e-06, + "loss": 0.1266, + "step": 27077 + }, + { + "epoch": 0.6852240807753625, + "grad_norm": 4.203860759735107, + "learning_rate": 2.293894699598273e-06, + "loss": 0.1554, + "step": 27078 + }, + { + "epoch": 0.6852493863400562, + "grad_norm": 4.004570007324219, + "learning_rate": 2.293557075910403e-06, + "loss": 0.1324, + "step": 27079 + }, + { + "epoch": 0.6852746919047499, + "grad_norm": 2.5325095653533936, + "learning_rate": 2.2932194696758824e-06, + "loss": 0.1184, + "step": 27080 + }, + { + "epoch": 0.6852999974694435, + "grad_norm": 4.943798065185547, + "learning_rate": 2.2928818808968855e-06, + "loss": 0.1205, + "step": 27081 + }, + { + "epoch": 0.6853253030341372, + "grad_norm": 4.557312965393066, + "learning_rate": 2.29254430957559e-06, + "loss": 0.1351, + "step": 27082 + }, + { + "epoch": 0.6853506085988309, + "grad_norm": 8.67115592956543, + "learning_rate": 2.2922067557141706e-06, + "loss": 0.162, + "step": 27083 + }, + { + "epoch": 0.6853759141635246, + "grad_norm": 7.234519958496094, + "learning_rate": 2.2918692193148077e-06, + "loss": 0.1616, + "step": 27084 + }, + { + "epoch": 0.6854012197282182, + "grad_norm": 4.145187854766846, + "learning_rate": 2.291531700379675e-06, + "loss": 0.1392, + "step": 27085 + }, + { + "epoch": 0.6854265252929119, + "grad_norm": 6.414153099060059, + "learning_rate": 2.2911941989109544e-06, + "loss": 0.1654, + "step": 27086 + }, + { + "epoch": 0.6854518308576056, + "grad_norm": 8.165456771850586, + "learning_rate": 2.290856714910815e-06, + "loss": 0.1733, + "step": 27087 + }, + { + "epoch": 0.6854771364222992, + "grad_norm": 5.159430027008057, + "learning_rate": 2.2905192483814383e-06, + "loss": 0.1727, + "step": 27088 + }, + { + "epoch": 0.685502441986993, + "grad_norm": 4.460439682006836, + "learning_rate": 2.2901817993249962e-06, + "loss": 0.0886, + "step": 27089 + }, + { + "epoch": 0.6855277475516867, + "grad_norm": 4.84293270111084, + "learning_rate": 2.2898443677436727e-06, + "loss": 0.1103, + "step": 27090 + }, + { + "epoch": 0.6855530531163803, + "grad_norm": 5.414794921875, + "learning_rate": 2.2895069536396344e-06, + "loss": 0.1042, + "step": 27091 + }, + { + "epoch": 0.685578358681074, + "grad_norm": 2.548152446746826, + "learning_rate": 2.2891695570150637e-06, + "loss": 0.1024, + "step": 27092 + }, + { + "epoch": 0.6856036642457677, + "grad_norm": 4.362817287445068, + "learning_rate": 2.2888321778721315e-06, + "loss": 0.1255, + "step": 27093 + }, + { + "epoch": 0.6856289698104613, + "grad_norm": 7.173773765563965, + "learning_rate": 2.288494816213021e-06, + "loss": 0.1928, + "step": 27094 + }, + { + "epoch": 0.685654275375155, + "grad_norm": 7.560651779174805, + "learning_rate": 2.2881574720398993e-06, + "loss": 0.1362, + "step": 27095 + }, + { + "epoch": 0.6856795809398487, + "grad_norm": 6.660416603088379, + "learning_rate": 2.287820145354947e-06, + "loss": 0.2668, + "step": 27096 + }, + { + "epoch": 0.6857048865045423, + "grad_norm": 7.211523532867432, + "learning_rate": 2.2874828361603363e-06, + "loss": 0.1972, + "step": 27097 + }, + { + "epoch": 0.685730192069236, + "grad_norm": 8.817689895629883, + "learning_rate": 2.2871455444582462e-06, + "loss": 0.0905, + "step": 27098 + }, + { + "epoch": 0.6857554976339297, + "grad_norm": 8.36717700958252, + "learning_rate": 2.2868082702508497e-06, + "loss": 0.1719, + "step": 27099 + }, + { + "epoch": 0.6857808031986233, + "grad_norm": 11.121293067932129, + "learning_rate": 2.286471013540322e-06, + "loss": 0.3014, + "step": 27100 + }, + { + "epoch": 0.685806108763317, + "grad_norm": 4.128114223480225, + "learning_rate": 2.2861337743288363e-06, + "loss": 0.1473, + "step": 27101 + }, + { + "epoch": 0.6858314143280108, + "grad_norm": 3.7306642532348633, + "learning_rate": 2.285796552618571e-06, + "loss": 0.1317, + "step": 27102 + }, + { + "epoch": 0.6858567198927044, + "grad_norm": 7.869626998901367, + "learning_rate": 2.285459348411699e-06, + "loss": 0.2481, + "step": 27103 + }, + { + "epoch": 0.6858820254573981, + "grad_norm": 3.3780150413513184, + "learning_rate": 2.285122161710394e-06, + "loss": 0.1494, + "step": 27104 + }, + { + "epoch": 0.6859073310220918, + "grad_norm": 3.2529115676879883, + "learning_rate": 2.284784992516832e-06, + "loss": 0.1508, + "step": 27105 + }, + { + "epoch": 0.6859326365867854, + "grad_norm": 3.95505690574646, + "learning_rate": 2.284447840833184e-06, + "loss": 0.0753, + "step": 27106 + }, + { + "epoch": 0.6859579421514791, + "grad_norm": 4.168478965759277, + "learning_rate": 2.2841107066616284e-06, + "loss": 0.1445, + "step": 27107 + }, + { + "epoch": 0.6859832477161728, + "grad_norm": 3.346968412399292, + "learning_rate": 2.2837735900043383e-06, + "loss": 0.1509, + "step": 27108 + }, + { + "epoch": 0.6860085532808665, + "grad_norm": 9.8988037109375, + "learning_rate": 2.283436490863487e-06, + "loss": 0.2242, + "step": 27109 + }, + { + "epoch": 0.6860338588455601, + "grad_norm": 6.401421546936035, + "learning_rate": 2.283099409241247e-06, + "loss": 0.1402, + "step": 27110 + }, + { + "epoch": 0.6860591644102538, + "grad_norm": 4.131908893585205, + "learning_rate": 2.2827623451397944e-06, + "loss": 0.154, + "step": 27111 + }, + { + "epoch": 0.6860844699749475, + "grad_norm": 4.971485137939453, + "learning_rate": 2.2824252985613025e-06, + "loss": 0.0782, + "step": 27112 + }, + { + "epoch": 0.6861097755396411, + "grad_norm": 6.61527681350708, + "learning_rate": 2.2820882695079443e-06, + "loss": 0.1988, + "step": 27113 + }, + { + "epoch": 0.6861350811043349, + "grad_norm": 7.866573333740234, + "learning_rate": 2.281751257981891e-06, + "loss": 0.266, + "step": 27114 + }, + { + "epoch": 0.6861603866690286, + "grad_norm": 3.991647481918335, + "learning_rate": 2.281414263985321e-06, + "loss": 0.12, + "step": 27115 + }, + { + "epoch": 0.6861856922337222, + "grad_norm": 2.9987261295318604, + "learning_rate": 2.281077287520404e-06, + "loss": 0.1541, + "step": 27116 + }, + { + "epoch": 0.6862109977984159, + "grad_norm": 6.028596878051758, + "learning_rate": 2.280740328589314e-06, + "loss": 0.1344, + "step": 27117 + }, + { + "epoch": 0.6862363033631096, + "grad_norm": 4.281765460968018, + "learning_rate": 2.280403387194222e-06, + "loss": 0.1315, + "step": 27118 + }, + { + "epoch": 0.6862616089278032, + "grad_norm": 2.483978033065796, + "learning_rate": 2.280066463337304e-06, + "loss": 0.0883, + "step": 27119 + }, + { + "epoch": 0.6862869144924969, + "grad_norm": 6.69005823135376, + "learning_rate": 2.2797295570207306e-06, + "loss": 0.1084, + "step": 27120 + }, + { + "epoch": 0.6863122200571906, + "grad_norm": 3.022216320037842, + "learning_rate": 2.2793926682466783e-06, + "loss": 0.1307, + "step": 27121 + }, + { + "epoch": 0.6863375256218842, + "grad_norm": 6.810140609741211, + "learning_rate": 2.2790557970173133e-06, + "loss": 0.1799, + "step": 27122 + }, + { + "epoch": 0.6863628311865779, + "grad_norm": 3.6606452465057373, + "learning_rate": 2.2787189433348127e-06, + "loss": 0.1251, + "step": 27123 + }, + { + "epoch": 0.6863881367512716, + "grad_norm": 3.671184539794922, + "learning_rate": 2.2783821072013457e-06, + "loss": 0.119, + "step": 27124 + }, + { + "epoch": 0.6864134423159652, + "grad_norm": 5.384502410888672, + "learning_rate": 2.27804528861909e-06, + "loss": 0.1577, + "step": 27125 + }, + { + "epoch": 0.686438747880659, + "grad_norm": 4.668947219848633, + "learning_rate": 2.2777084875902106e-06, + "loss": 0.1659, + "step": 27126 + }, + { + "epoch": 0.6864640534453527, + "grad_norm": 3.3607232570648193, + "learning_rate": 2.277371704116884e-06, + "loss": 0.1041, + "step": 27127 + }, + { + "epoch": 0.6864893590100463, + "grad_norm": 8.751778602600098, + "learning_rate": 2.2770349382012795e-06, + "loss": 0.2068, + "step": 27128 + }, + { + "epoch": 0.68651466457474, + "grad_norm": 5.8995256423950195, + "learning_rate": 2.276698189845572e-06, + "loss": 0.0741, + "step": 27129 + }, + { + "epoch": 0.6865399701394337, + "grad_norm": 6.503143787384033, + "learning_rate": 2.2763614590519307e-06, + "loss": 0.2555, + "step": 27130 + }, + { + "epoch": 0.6865652757041273, + "grad_norm": 3.755189895629883, + "learning_rate": 2.276024745822528e-06, + "loss": 0.1731, + "step": 27131 + }, + { + "epoch": 0.686590581268821, + "grad_norm": 3.493396520614624, + "learning_rate": 2.2756880501595354e-06, + "loss": 0.1408, + "step": 27132 + }, + { + "epoch": 0.6866158868335147, + "grad_norm": 12.25879192352295, + "learning_rate": 2.2753513720651216e-06, + "loss": 0.1483, + "step": 27133 + }, + { + "epoch": 0.6866411923982083, + "grad_norm": 6.303104877471924, + "learning_rate": 2.2750147115414615e-06, + "loss": 0.157, + "step": 27134 + }, + { + "epoch": 0.686666497962902, + "grad_norm": 20.77918815612793, + "learning_rate": 2.2746780685907255e-06, + "loss": 0.3194, + "step": 27135 + }, + { + "epoch": 0.6866918035275957, + "grad_norm": 7.424892902374268, + "learning_rate": 2.2743414432150833e-06, + "loss": 0.246, + "step": 27136 + }, + { + "epoch": 0.6867171090922894, + "grad_norm": 2.879611015319824, + "learning_rate": 2.2740048354167042e-06, + "loss": 0.0982, + "step": 27137 + }, + { + "epoch": 0.686742414656983, + "grad_norm": 3.42563533782959, + "learning_rate": 2.2736682451977625e-06, + "loss": 0.1219, + "step": 27138 + }, + { + "epoch": 0.6867677202216768, + "grad_norm": 3.914071559906006, + "learning_rate": 2.2733316725604275e-06, + "loss": 0.1617, + "step": 27139 + }, + { + "epoch": 0.6867930257863705, + "grad_norm": 6.192950248718262, + "learning_rate": 2.2729951175068692e-06, + "loss": 0.1459, + "step": 27140 + }, + { + "epoch": 0.6868183313510641, + "grad_norm": 10.586448669433594, + "learning_rate": 2.2726585800392565e-06, + "loss": 0.2933, + "step": 27141 + }, + { + "epoch": 0.6868436369157578, + "grad_norm": 7.0044426918029785, + "learning_rate": 2.272322060159763e-06, + "loss": 0.1675, + "step": 27142 + }, + { + "epoch": 0.6868689424804515, + "grad_norm": 2.8139524459838867, + "learning_rate": 2.2719855578705568e-06, + "loss": 0.1265, + "step": 27143 + }, + { + "epoch": 0.6868942480451451, + "grad_norm": 6.027334690093994, + "learning_rate": 2.271649073173808e-06, + "loss": 0.1861, + "step": 27144 + }, + { + "epoch": 0.6869195536098388, + "grad_norm": 8.789095878601074, + "learning_rate": 2.271312606071686e-06, + "loss": 0.2651, + "step": 27145 + }, + { + "epoch": 0.6869448591745325, + "grad_norm": 12.012818336486816, + "learning_rate": 2.270976156566363e-06, + "loss": 0.2259, + "step": 27146 + }, + { + "epoch": 0.6869701647392261, + "grad_norm": 3.6935300827026367, + "learning_rate": 2.2706397246600064e-06, + "loss": 0.1732, + "step": 27147 + }, + { + "epoch": 0.6869954703039198, + "grad_norm": 4.255508899688721, + "learning_rate": 2.2703033103547874e-06, + "loss": 0.1412, + "step": 27148 + }, + { + "epoch": 0.6870207758686135, + "grad_norm": 7.292172908782959, + "learning_rate": 2.2699669136528723e-06, + "loss": 0.2622, + "step": 27149 + }, + { + "epoch": 0.6870460814333071, + "grad_norm": 4.547997951507568, + "learning_rate": 2.2696305345564344e-06, + "loss": 0.1193, + "step": 27150 + }, + { + "epoch": 0.6870713869980009, + "grad_norm": 2.911092758178711, + "learning_rate": 2.2692941730676398e-06, + "loss": 0.0841, + "step": 27151 + }, + { + "epoch": 0.6870966925626946, + "grad_norm": 4.806450366973877, + "learning_rate": 2.2689578291886626e-06, + "loss": 0.1916, + "step": 27152 + }, + { + "epoch": 0.6871219981273882, + "grad_norm": 6.838373184204102, + "learning_rate": 2.268621502921665e-06, + "loss": 0.1541, + "step": 27153 + }, + { + "epoch": 0.6871473036920819, + "grad_norm": 5.0962934494018555, + "learning_rate": 2.268285194268821e-06, + "loss": 0.2255, + "step": 27154 + }, + { + "epoch": 0.6871726092567756, + "grad_norm": 5.2482686042785645, + "learning_rate": 2.2679489032322954e-06, + "loss": 0.2007, + "step": 27155 + }, + { + "epoch": 0.6871979148214692, + "grad_norm": 4.515805721282959, + "learning_rate": 2.267612629814263e-06, + "loss": 0.1725, + "step": 27156 + }, + { + "epoch": 0.6872232203861629, + "grad_norm": 6.709964752197266, + "learning_rate": 2.267276374016885e-06, + "loss": 0.1522, + "step": 27157 + }, + { + "epoch": 0.6872485259508566, + "grad_norm": 5.188234329223633, + "learning_rate": 2.2669401358423356e-06, + "loss": 0.1687, + "step": 27158 + }, + { + "epoch": 0.6872738315155502, + "grad_norm": 6.828253746032715, + "learning_rate": 2.26660391529278e-06, + "loss": 0.2173, + "step": 27159 + }, + { + "epoch": 0.6872991370802439, + "grad_norm": 3.7536208629608154, + "learning_rate": 2.266267712370386e-06, + "loss": 0.1129, + "step": 27160 + }, + { + "epoch": 0.6873244426449376, + "grad_norm": 5.894629955291748, + "learning_rate": 2.2659315270773247e-06, + "loss": 0.1348, + "step": 27161 + }, + { + "epoch": 0.6873497482096314, + "grad_norm": 3.8458590507507324, + "learning_rate": 2.265595359415762e-06, + "loss": 0.1139, + "step": 27162 + }, + { + "epoch": 0.687375053774325, + "grad_norm": 7.419076442718506, + "learning_rate": 2.265259209387867e-06, + "loss": 0.262, + "step": 27163 + }, + { + "epoch": 0.6874003593390187, + "grad_norm": 5.428102016448975, + "learning_rate": 2.264923076995804e-06, + "loss": 0.132, + "step": 27164 + }, + { + "epoch": 0.6874256649037124, + "grad_norm": 3.443230152130127, + "learning_rate": 2.2645869622417456e-06, + "loss": 0.1697, + "step": 27165 + }, + { + "epoch": 0.687450970468406, + "grad_norm": 10.919090270996094, + "learning_rate": 2.264250865127857e-06, + "loss": 0.1757, + "step": 27166 + }, + { + "epoch": 0.6874762760330997, + "grad_norm": 5.938161849975586, + "learning_rate": 2.263914785656306e-06, + "loss": 0.2136, + "step": 27167 + }, + { + "epoch": 0.6875015815977934, + "grad_norm": 5.585134029388428, + "learning_rate": 2.2635787238292573e-06, + "loss": 0.1575, + "step": 27168 + }, + { + "epoch": 0.687526887162487, + "grad_norm": 6.442564010620117, + "learning_rate": 2.2632426796488837e-06, + "loss": 0.2389, + "step": 27169 + }, + { + "epoch": 0.6875521927271807, + "grad_norm": 9.720352172851562, + "learning_rate": 2.262906653117346e-06, + "loss": 0.1906, + "step": 27170 + }, + { + "epoch": 0.6875774982918744, + "grad_norm": 5.0497517585754395, + "learning_rate": 2.262570644236816e-06, + "loss": 0.1542, + "step": 27171 + }, + { + "epoch": 0.687602803856568, + "grad_norm": 5.6241912841796875, + "learning_rate": 2.2622346530094562e-06, + "loss": 0.163, + "step": 27172 + }, + { + "epoch": 0.6876281094212617, + "grad_norm": 3.1021249294281006, + "learning_rate": 2.2618986794374386e-06, + "loss": 0.1136, + "step": 27173 + }, + { + "epoch": 0.6876534149859554, + "grad_norm": 5.066715717315674, + "learning_rate": 2.2615627235229264e-06, + "loss": 0.1529, + "step": 27174 + }, + { + "epoch": 0.687678720550649, + "grad_norm": 3.1606688499450684, + "learning_rate": 2.2612267852680873e-06, + "loss": 0.1527, + "step": 27175 + }, + { + "epoch": 0.6877040261153428, + "grad_norm": 24.014516830444336, + "learning_rate": 2.260890864675085e-06, + "loss": 0.3617, + "step": 27176 + }, + { + "epoch": 0.6877293316800365, + "grad_norm": 3.5526885986328125, + "learning_rate": 2.2605549617460903e-06, + "loss": 0.1004, + "step": 27177 + }, + { + "epoch": 0.6877546372447301, + "grad_norm": 7.129955768585205, + "learning_rate": 2.2602190764832667e-06, + "loss": 0.2188, + "step": 27178 + }, + { + "epoch": 0.6877799428094238, + "grad_norm": 15.848855018615723, + "learning_rate": 2.2598832088887805e-06, + "loss": 0.1273, + "step": 27179 + }, + { + "epoch": 0.6878052483741175, + "grad_norm": 8.043691635131836, + "learning_rate": 2.2595473589647966e-06, + "loss": 0.15, + "step": 27180 + }, + { + "epoch": 0.6878305539388111, + "grad_norm": 8.841994285583496, + "learning_rate": 2.259211526713483e-06, + "loss": 0.213, + "step": 27181 + }, + { + "epoch": 0.6878558595035048, + "grad_norm": 3.5282764434814453, + "learning_rate": 2.258875712137003e-06, + "loss": 0.1248, + "step": 27182 + }, + { + "epoch": 0.6878811650681985, + "grad_norm": 9.402301788330078, + "learning_rate": 2.2585399152375276e-06, + "loss": 0.1807, + "step": 27183 + }, + { + "epoch": 0.6879064706328921, + "grad_norm": 4.316825866699219, + "learning_rate": 2.2582041360172143e-06, + "loss": 0.1719, + "step": 27184 + }, + { + "epoch": 0.6879317761975858, + "grad_norm": 6.061113357543945, + "learning_rate": 2.2578683744782342e-06, + "loss": 0.1574, + "step": 27185 + }, + { + "epoch": 0.6879570817622795, + "grad_norm": 3.117135763168335, + "learning_rate": 2.257532630622749e-06, + "loss": 0.1078, + "step": 27186 + }, + { + "epoch": 0.6879823873269733, + "grad_norm": 12.860426902770996, + "learning_rate": 2.25719690445293e-06, + "loss": 0.2283, + "step": 27187 + }, + { + "epoch": 0.6880076928916669, + "grad_norm": 6.714566230773926, + "learning_rate": 2.2568611959709343e-06, + "loss": 0.1762, + "step": 27188 + }, + { + "epoch": 0.6880329984563606, + "grad_norm": 6.124453544616699, + "learning_rate": 2.2565255051789315e-06, + "loss": 0.1758, + "step": 27189 + }, + { + "epoch": 0.6880583040210543, + "grad_norm": 5.598601341247559, + "learning_rate": 2.256189832079086e-06, + "loss": 0.1535, + "step": 27190 + }, + { + "epoch": 0.6880836095857479, + "grad_norm": 4.876774787902832, + "learning_rate": 2.255854176673562e-06, + "loss": 0.1288, + "step": 27191 + }, + { + "epoch": 0.6881089151504416, + "grad_norm": 4.959921360015869, + "learning_rate": 2.255518538964522e-06, + "loss": 0.1315, + "step": 27192 + }, + { + "epoch": 0.6881342207151353, + "grad_norm": 2.429046630859375, + "learning_rate": 2.255182918954134e-06, + "loss": 0.1576, + "step": 27193 + }, + { + "epoch": 0.6881595262798289, + "grad_norm": 5.619110584259033, + "learning_rate": 2.2548473166445604e-06, + "loss": 0.1902, + "step": 27194 + }, + { + "epoch": 0.6881848318445226, + "grad_norm": 2.8137965202331543, + "learning_rate": 2.254511732037965e-06, + "loss": 0.0954, + "step": 27195 + }, + { + "epoch": 0.6882101374092163, + "grad_norm": 4.748501300811768, + "learning_rate": 2.2541761651365157e-06, + "loss": 0.0915, + "step": 27196 + }, + { + "epoch": 0.6882354429739099, + "grad_norm": 3.3775148391723633, + "learning_rate": 2.2538406159423703e-06, + "loss": 0.1352, + "step": 27197 + }, + { + "epoch": 0.6882607485386036, + "grad_norm": 5.906122207641602, + "learning_rate": 2.253505084457697e-06, + "loss": 0.1436, + "step": 27198 + }, + { + "epoch": 0.6882860541032974, + "grad_norm": 7.8175950050354, + "learning_rate": 2.253169570684658e-06, + "loss": 0.1384, + "step": 27199 + }, + { + "epoch": 0.688311359667991, + "grad_norm": 4.95550537109375, + "learning_rate": 2.2528340746254203e-06, + "loss": 0.102, + "step": 27200 + }, + { + "epoch": 0.6883366652326847, + "grad_norm": 5.940861225128174, + "learning_rate": 2.252498596282141e-06, + "loss": 0.1316, + "step": 27201 + }, + { + "epoch": 0.6883619707973784, + "grad_norm": 3.8837623596191406, + "learning_rate": 2.252163135656989e-06, + "loss": 0.1903, + "step": 27202 + }, + { + "epoch": 0.688387276362072, + "grad_norm": 3.7619168758392334, + "learning_rate": 2.2518276927521244e-06, + "loss": 0.1344, + "step": 27203 + }, + { + "epoch": 0.6884125819267657, + "grad_norm": 17.808807373046875, + "learning_rate": 2.251492267569713e-06, + "loss": 0.1481, + "step": 27204 + }, + { + "epoch": 0.6884378874914594, + "grad_norm": 5.272881984710693, + "learning_rate": 2.251156860111917e-06, + "loss": 0.1307, + "step": 27205 + }, + { + "epoch": 0.688463193056153, + "grad_norm": 5.299466609954834, + "learning_rate": 2.2508214703808996e-06, + "loss": 0.1965, + "step": 27206 + }, + { + "epoch": 0.6884884986208467, + "grad_norm": 15.398529052734375, + "learning_rate": 2.2504860983788207e-06, + "loss": 0.2324, + "step": 27207 + }, + { + "epoch": 0.6885138041855404, + "grad_norm": 6.489873886108398, + "learning_rate": 2.2501507441078473e-06, + "loss": 0.1495, + "step": 27208 + }, + { + "epoch": 0.688539109750234, + "grad_norm": 8.22400951385498, + "learning_rate": 2.2498154075701406e-06, + "loss": 0.2362, + "step": 27209 + }, + { + "epoch": 0.6885644153149277, + "grad_norm": 3.109287977218628, + "learning_rate": 2.2494800887678625e-06, + "loss": 0.173, + "step": 27210 + }, + { + "epoch": 0.6885897208796214, + "grad_norm": 5.623716831207275, + "learning_rate": 2.2491447877031734e-06, + "loss": 0.1883, + "step": 27211 + }, + { + "epoch": 0.6886150264443152, + "grad_norm": 11.138623237609863, + "learning_rate": 2.2488095043782406e-06, + "loss": 0.2199, + "step": 27212 + }, + { + "epoch": 0.6886403320090088, + "grad_norm": 4.93300724029541, + "learning_rate": 2.2484742387952213e-06, + "loss": 0.1164, + "step": 27213 + }, + { + "epoch": 0.6886656375737025, + "grad_norm": 8.19900894165039, + "learning_rate": 2.2481389909562836e-06, + "loss": 0.2218, + "step": 27214 + }, + { + "epoch": 0.6886909431383962, + "grad_norm": 4.155552387237549, + "learning_rate": 2.247803760863582e-06, + "loss": 0.153, + "step": 27215 + }, + { + "epoch": 0.6887162487030898, + "grad_norm": 2.08080792427063, + "learning_rate": 2.2474685485192836e-06, + "loss": 0.1135, + "step": 27216 + }, + { + "epoch": 0.6887415542677835, + "grad_norm": 4.369790077209473, + "learning_rate": 2.2471333539255485e-06, + "loss": 0.1095, + "step": 27217 + }, + { + "epoch": 0.6887668598324772, + "grad_norm": 5.480363845825195, + "learning_rate": 2.246798177084539e-06, + "loss": 0.1564, + "step": 27218 + }, + { + "epoch": 0.6887921653971708, + "grad_norm": 3.665393590927124, + "learning_rate": 2.246463017998413e-06, + "loss": 0.1232, + "step": 27219 + }, + { + "epoch": 0.6888174709618645, + "grad_norm": 6.797251224517822, + "learning_rate": 2.246127876669337e-06, + "loss": 0.2196, + "step": 27220 + }, + { + "epoch": 0.6888427765265582, + "grad_norm": 3.472505569458008, + "learning_rate": 2.2457927530994705e-06, + "loss": 0.1186, + "step": 27221 + }, + { + "epoch": 0.6888680820912518, + "grad_norm": 3.681126117706299, + "learning_rate": 2.2454576472909737e-06, + "loss": 0.1452, + "step": 27222 + }, + { + "epoch": 0.6888933876559455, + "grad_norm": 9.279999732971191, + "learning_rate": 2.245122559246007e-06, + "loss": 0.1042, + "step": 27223 + }, + { + "epoch": 0.6889186932206393, + "grad_norm": 5.7649827003479, + "learning_rate": 2.2447874889667335e-06, + "loss": 0.1723, + "step": 27224 + }, + { + "epoch": 0.6889439987853329, + "grad_norm": 3.306781530380249, + "learning_rate": 2.2444524364553138e-06, + "loss": 0.1046, + "step": 27225 + }, + { + "epoch": 0.6889693043500266, + "grad_norm": 4.248063564300537, + "learning_rate": 2.2441174017139054e-06, + "loss": 0.1372, + "step": 27226 + }, + { + "epoch": 0.6889946099147203, + "grad_norm": 3.5181262493133545, + "learning_rate": 2.243782384744675e-06, + "loss": 0.1545, + "step": 27227 + }, + { + "epoch": 0.6890199154794139, + "grad_norm": 6.67273473739624, + "learning_rate": 2.243447385549776e-06, + "loss": 0.1608, + "step": 27228 + }, + { + "epoch": 0.6890452210441076, + "grad_norm": 4.138065814971924, + "learning_rate": 2.243112404131373e-06, + "loss": 0.1972, + "step": 27229 + }, + { + "epoch": 0.6890705266088013, + "grad_norm": 7.146559715270996, + "learning_rate": 2.242777440491624e-06, + "loss": 0.1546, + "step": 27230 + }, + { + "epoch": 0.6890958321734949, + "grad_norm": 4.931081771850586, + "learning_rate": 2.242442494632694e-06, + "loss": 0.1908, + "step": 27231 + }, + { + "epoch": 0.6891211377381886, + "grad_norm": 2.846062421798706, + "learning_rate": 2.242107566556736e-06, + "loss": 0.1142, + "step": 27232 + }, + { + "epoch": 0.6891464433028823, + "grad_norm": 3.746638059616089, + "learning_rate": 2.2417726562659148e-06, + "loss": 0.1822, + "step": 27233 + }, + { + "epoch": 0.6891717488675759, + "grad_norm": 11.89242935180664, + "learning_rate": 2.241437763762387e-06, + "loss": 0.275, + "step": 27234 + }, + { + "epoch": 0.6891970544322696, + "grad_norm": 4.171057224273682, + "learning_rate": 2.2411028890483154e-06, + "loss": 0.1078, + "step": 27235 + }, + { + "epoch": 0.6892223599969634, + "grad_norm": 12.213454246520996, + "learning_rate": 2.2407680321258584e-06, + "loss": 0.2702, + "step": 27236 + }, + { + "epoch": 0.6892476655616571, + "grad_norm": 4.941961288452148, + "learning_rate": 2.2404331929971758e-06, + "loss": 0.1684, + "step": 27237 + }, + { + "epoch": 0.6892729711263507, + "grad_norm": 7.1725544929504395, + "learning_rate": 2.2400983716644233e-06, + "loss": 0.1831, + "step": 27238 + }, + { + "epoch": 0.6892982766910444, + "grad_norm": 6.643290996551514, + "learning_rate": 2.239763568129765e-06, + "loss": 0.1958, + "step": 27239 + }, + { + "epoch": 0.6893235822557381, + "grad_norm": 22.816247940063477, + "learning_rate": 2.2394287823953587e-06, + "loss": 0.2297, + "step": 27240 + }, + { + "epoch": 0.6893488878204317, + "grad_norm": 3.4869492053985596, + "learning_rate": 2.239094014463362e-06, + "loss": 0.1247, + "step": 27241 + }, + { + "epoch": 0.6893741933851254, + "grad_norm": 7.229165077209473, + "learning_rate": 2.238759264335933e-06, + "loss": 0.0711, + "step": 27242 + }, + { + "epoch": 0.6893994989498191, + "grad_norm": 4.99435567855835, + "learning_rate": 2.238424532015233e-06, + "loss": 0.1499, + "step": 27243 + }, + { + "epoch": 0.6894248045145127, + "grad_norm": 14.51115894317627, + "learning_rate": 2.2380898175034205e-06, + "loss": 0.1816, + "step": 27244 + }, + { + "epoch": 0.6894501100792064, + "grad_norm": 5.297892093658447, + "learning_rate": 2.2377551208026525e-06, + "loss": 0.1756, + "step": 27245 + }, + { + "epoch": 0.6894754156439001, + "grad_norm": 7.336750507354736, + "learning_rate": 2.2374204419150864e-06, + "loss": 0.1699, + "step": 27246 + }, + { + "epoch": 0.6895007212085937, + "grad_norm": 5.926135063171387, + "learning_rate": 2.237085780842883e-06, + "loss": 0.1457, + "step": 27247 + }, + { + "epoch": 0.6895260267732874, + "grad_norm": 4.340628623962402, + "learning_rate": 2.2367511375882e-06, + "loss": 0.1946, + "step": 27248 + }, + { + "epoch": 0.6895513323379812, + "grad_norm": 4.911425590515137, + "learning_rate": 2.236416512153195e-06, + "loss": 0.1481, + "step": 27249 + }, + { + "epoch": 0.6895766379026748, + "grad_norm": 16.320322036743164, + "learning_rate": 2.236081904540024e-06, + "loss": 0.1881, + "step": 27250 + }, + { + "epoch": 0.6896019434673685, + "grad_norm": 3.6904547214508057, + "learning_rate": 2.2357473147508485e-06, + "loss": 0.1084, + "step": 27251 + }, + { + "epoch": 0.6896272490320622, + "grad_norm": 4.28340482711792, + "learning_rate": 2.235412742787824e-06, + "loss": 0.118, + "step": 27252 + }, + { + "epoch": 0.6896525545967558, + "grad_norm": 3.511504650115967, + "learning_rate": 2.235078188653109e-06, + "loss": 0.1314, + "step": 27253 + }, + { + "epoch": 0.6896778601614495, + "grad_norm": 8.150757789611816, + "learning_rate": 2.23474365234886e-06, + "loss": 0.1924, + "step": 27254 + }, + { + "epoch": 0.6897031657261432, + "grad_norm": 4.202824592590332, + "learning_rate": 2.234409133877233e-06, + "loss": 0.1459, + "step": 27255 + }, + { + "epoch": 0.6897284712908368, + "grad_norm": 5.823590278625488, + "learning_rate": 2.2340746332403893e-06, + "loss": 0.2439, + "step": 27256 + }, + { + "epoch": 0.6897537768555305, + "grad_norm": 9.750605583190918, + "learning_rate": 2.233740150440482e-06, + "loss": 0.242, + "step": 27257 + }, + { + "epoch": 0.6897790824202242, + "grad_norm": 9.797451972961426, + "learning_rate": 2.2334056854796733e-06, + "loss": 0.2564, + "step": 27258 + }, + { + "epoch": 0.6898043879849178, + "grad_norm": 2.73091983795166, + "learning_rate": 2.2330712383601134e-06, + "loss": 0.1386, + "step": 27259 + }, + { + "epoch": 0.6898296935496115, + "grad_norm": 6.01235818862915, + "learning_rate": 2.2327368090839637e-06, + "loss": 0.1583, + "step": 27260 + }, + { + "epoch": 0.6898549991143053, + "grad_norm": 3.5724687576293945, + "learning_rate": 2.2324023976533776e-06, + "loss": 0.1441, + "step": 27261 + }, + { + "epoch": 0.6898803046789989, + "grad_norm": 3.347142219543457, + "learning_rate": 2.2320680040705178e-06, + "loss": 0.0856, + "step": 27262 + }, + { + "epoch": 0.6899056102436926, + "grad_norm": 4.236282825469971, + "learning_rate": 2.231733628337533e-06, + "loss": 0.1643, + "step": 27263 + }, + { + "epoch": 0.6899309158083863, + "grad_norm": 4.56522798538208, + "learning_rate": 2.2313992704565847e-06, + "loss": 0.1311, + "step": 27264 + }, + { + "epoch": 0.68995622137308, + "grad_norm": 9.689067840576172, + "learning_rate": 2.2310649304298253e-06, + "loss": 0.1852, + "step": 27265 + }, + { + "epoch": 0.6899815269377736, + "grad_norm": 3.147257089614868, + "learning_rate": 2.2307306082594154e-06, + "loss": 0.1217, + "step": 27266 + }, + { + "epoch": 0.6900068325024673, + "grad_norm": 4.629646301269531, + "learning_rate": 2.2303963039475085e-06, + "loss": 0.1824, + "step": 27267 + }, + { + "epoch": 0.690032138067161, + "grad_norm": 4.376158714294434, + "learning_rate": 2.2300620174962607e-06, + "loss": 0.1286, + "step": 27268 + }, + { + "epoch": 0.6900574436318546, + "grad_norm": 3.1830270290374756, + "learning_rate": 2.2297277489078256e-06, + "loss": 0.0576, + "step": 27269 + }, + { + "epoch": 0.6900827491965483, + "grad_norm": 6.304062843322754, + "learning_rate": 2.2293934981843628e-06, + "loss": 0.2139, + "step": 27270 + }, + { + "epoch": 0.690108054761242, + "grad_norm": 4.637383460998535, + "learning_rate": 2.229059265328026e-06, + "loss": 0.1843, + "step": 27271 + }, + { + "epoch": 0.6901333603259356, + "grad_norm": 8.370247840881348, + "learning_rate": 2.2287250503409714e-06, + "loss": 0.1313, + "step": 27272 + }, + { + "epoch": 0.6901586658906294, + "grad_norm": 10.82120132446289, + "learning_rate": 2.2283908532253505e-06, + "loss": 0.1818, + "step": 27273 + }, + { + "epoch": 0.6901839714553231, + "grad_norm": 8.744301795959473, + "learning_rate": 2.2280566739833237e-06, + "loss": 0.17, + "step": 27274 + }, + { + "epoch": 0.6902092770200167, + "grad_norm": 2.605898857116699, + "learning_rate": 2.2277225126170442e-06, + "loss": 0.1293, + "step": 27275 + }, + { + "epoch": 0.6902345825847104, + "grad_norm": 9.20953369140625, + "learning_rate": 2.2273883691286657e-06, + "loss": 0.2935, + "step": 27276 + }, + { + "epoch": 0.6902598881494041, + "grad_norm": 5.949041366577148, + "learning_rate": 2.227054243520343e-06, + "loss": 0.1648, + "step": 27277 + }, + { + "epoch": 0.6902851937140977, + "grad_norm": 3.9486701488494873, + "learning_rate": 2.2267201357942324e-06, + "loss": 0.1505, + "step": 27278 + }, + { + "epoch": 0.6903104992787914, + "grad_norm": 10.63485336303711, + "learning_rate": 2.226386045952489e-06, + "loss": 0.2698, + "step": 27279 + }, + { + "epoch": 0.6903358048434851, + "grad_norm": 7.827602863311768, + "learning_rate": 2.226051973997265e-06, + "loss": 0.0816, + "step": 27280 + }, + { + "epoch": 0.6903611104081787, + "grad_norm": 5.265103340148926, + "learning_rate": 2.2257179199307164e-06, + "loss": 0.1869, + "step": 27281 + }, + { + "epoch": 0.6903864159728724, + "grad_norm": 2.8068106174468994, + "learning_rate": 2.2253838837549948e-06, + "loss": 0.1295, + "step": 27282 + }, + { + "epoch": 0.6904117215375661, + "grad_norm": 5.934465408325195, + "learning_rate": 2.2250498654722584e-06, + "loss": 0.1965, + "step": 27283 + }, + { + "epoch": 0.6904370271022597, + "grad_norm": 6.677485466003418, + "learning_rate": 2.224715865084659e-06, + "loss": 0.1218, + "step": 27284 + }, + { + "epoch": 0.6904623326669534, + "grad_norm": 4.5205464363098145, + "learning_rate": 2.2243818825943507e-06, + "loss": 0.1853, + "step": 27285 + }, + { + "epoch": 0.6904876382316472, + "grad_norm": 15.910167694091797, + "learning_rate": 2.224047918003486e-06, + "loss": 0.2437, + "step": 27286 + }, + { + "epoch": 0.6905129437963408, + "grad_norm": 5.794347286224365, + "learning_rate": 2.2237139713142212e-06, + "loss": 0.1969, + "step": 27287 + }, + { + "epoch": 0.6905382493610345, + "grad_norm": 3.2336721420288086, + "learning_rate": 2.2233800425287074e-06, + "loss": 0.1423, + "step": 27288 + }, + { + "epoch": 0.6905635549257282, + "grad_norm": 7.255937099456787, + "learning_rate": 2.2230461316491024e-06, + "loss": 0.1439, + "step": 27289 + }, + { + "epoch": 0.6905888604904219, + "grad_norm": 2.5731475353240967, + "learning_rate": 2.2227122386775527e-06, + "loss": 0.0748, + "step": 27290 + }, + { + "epoch": 0.6906141660551155, + "grad_norm": 3.61301326751709, + "learning_rate": 2.222378363616217e-06, + "loss": 0.1768, + "step": 27291 + }, + { + "epoch": 0.6906394716198092, + "grad_norm": 3.8247151374816895, + "learning_rate": 2.2220445064672445e-06, + "loss": 0.1803, + "step": 27292 + }, + { + "epoch": 0.6906647771845029, + "grad_norm": 3.1325860023498535, + "learning_rate": 2.2217106672327944e-06, + "loss": 0.0979, + "step": 27293 + }, + { + "epoch": 0.6906900827491965, + "grad_norm": 5.240941047668457, + "learning_rate": 2.2213768459150114e-06, + "loss": 0.0899, + "step": 27294 + }, + { + "epoch": 0.6907153883138902, + "grad_norm": 5.571028232574463, + "learning_rate": 2.2210430425160545e-06, + "loss": 0.1506, + "step": 27295 + }, + { + "epoch": 0.690740693878584, + "grad_norm": 5.589646816253662, + "learning_rate": 2.220709257038072e-06, + "loss": 0.1625, + "step": 27296 + }, + { + "epoch": 0.6907659994432775, + "grad_norm": 6.485332012176514, + "learning_rate": 2.220375489483222e-06, + "loss": 0.2164, + "step": 27297 + }, + { + "epoch": 0.6907913050079713, + "grad_norm": 3.6645736694335938, + "learning_rate": 2.22004173985365e-06, + "loss": 0.1332, + "step": 27298 + }, + { + "epoch": 0.690816610572665, + "grad_norm": 17.631614685058594, + "learning_rate": 2.2197080081515134e-06, + "loss": 0.2317, + "step": 27299 + }, + { + "epoch": 0.6908419161373586, + "grad_norm": 3.7346549034118652, + "learning_rate": 2.219374294378961e-06, + "loss": 0.1131, + "step": 27300 + }, + { + "epoch": 0.6908672217020523, + "grad_norm": 5.8032073974609375, + "learning_rate": 2.219040598538148e-06, + "loss": 0.1529, + "step": 27301 + }, + { + "epoch": 0.690892527266746, + "grad_norm": 3.9332916736602783, + "learning_rate": 2.218706920631225e-06, + "loss": 0.1483, + "step": 27302 + }, + { + "epoch": 0.6909178328314396, + "grad_norm": 3.910522222518921, + "learning_rate": 2.218373260660344e-06, + "loss": 0.143, + "step": 27303 + }, + { + "epoch": 0.6909431383961333, + "grad_norm": 11.999456405639648, + "learning_rate": 2.218039618627654e-06, + "loss": 0.2577, + "step": 27304 + }, + { + "epoch": 0.690968443960827, + "grad_norm": 2.5018699169158936, + "learning_rate": 2.217705994535312e-06, + "loss": 0.0586, + "step": 27305 + }, + { + "epoch": 0.6909937495255206, + "grad_norm": 2.650202989578247, + "learning_rate": 2.217372388385465e-06, + "loss": 0.0793, + "step": 27306 + }, + { + "epoch": 0.6910190550902143, + "grad_norm": 4.797503471374512, + "learning_rate": 2.217038800180267e-06, + "loss": 0.1919, + "step": 27307 + }, + { + "epoch": 0.691044360654908, + "grad_norm": 5.793024063110352, + "learning_rate": 2.2167052299218663e-06, + "loss": 0.1293, + "step": 27308 + }, + { + "epoch": 0.6910696662196016, + "grad_norm": 15.014131546020508, + "learning_rate": 2.2163716776124172e-06, + "loss": 0.2066, + "step": 27309 + }, + { + "epoch": 0.6910949717842954, + "grad_norm": 7.630949020385742, + "learning_rate": 2.2160381432540697e-06, + "loss": 0.1982, + "step": 27310 + }, + { + "epoch": 0.6911202773489891, + "grad_norm": 6.856374263763428, + "learning_rate": 2.2157046268489734e-06, + "loss": 0.2187, + "step": 27311 + }, + { + "epoch": 0.6911455829136827, + "grad_norm": 12.354126930236816, + "learning_rate": 2.2153711283992815e-06, + "loss": 0.2624, + "step": 27312 + }, + { + "epoch": 0.6911708884783764, + "grad_norm": 5.648841857910156, + "learning_rate": 2.21503764790714e-06, + "loss": 0.1263, + "step": 27313 + }, + { + "epoch": 0.6911961940430701, + "grad_norm": 4.361225605010986, + "learning_rate": 2.2147041853747053e-06, + "loss": 0.1472, + "step": 27314 + }, + { + "epoch": 0.6912214996077638, + "grad_norm": 7.092191696166992, + "learning_rate": 2.214370740804125e-06, + "loss": 0.1888, + "step": 27315 + }, + { + "epoch": 0.6912468051724574, + "grad_norm": 6.353710174560547, + "learning_rate": 2.2140373141975496e-06, + "loss": 0.1917, + "step": 27316 + }, + { + "epoch": 0.6912721107371511, + "grad_norm": 6.371941089630127, + "learning_rate": 2.2137039055571276e-06, + "loss": 0.2272, + "step": 27317 + }, + { + "epoch": 0.6912974163018448, + "grad_norm": 4.0978779792785645, + "learning_rate": 2.2133705148850125e-06, + "loss": 0.0488, + "step": 27318 + }, + { + "epoch": 0.6913227218665384, + "grad_norm": 3.6998448371887207, + "learning_rate": 2.213037142183353e-06, + "loss": 0.1197, + "step": 27319 + }, + { + "epoch": 0.6913480274312321, + "grad_norm": 10.483528137207031, + "learning_rate": 2.212703787454298e-06, + "loss": 0.15, + "step": 27320 + }, + { + "epoch": 0.6913733329959258, + "grad_norm": 2.9365627765655518, + "learning_rate": 2.212370450699996e-06, + "loss": 0.1033, + "step": 27321 + }, + { + "epoch": 0.6913986385606194, + "grad_norm": 2.211946487426758, + "learning_rate": 2.212037131922601e-06, + "loss": 0.126, + "step": 27322 + }, + { + "epoch": 0.6914239441253132, + "grad_norm": 8.597665786743164, + "learning_rate": 2.211703831124257e-06, + "loss": 0.169, + "step": 27323 + }, + { + "epoch": 0.6914492496900069, + "grad_norm": 5.018462181091309, + "learning_rate": 2.2113705483071206e-06, + "loss": 0.1479, + "step": 27324 + }, + { + "epoch": 0.6914745552547005, + "grad_norm": 6.737653732299805, + "learning_rate": 2.2110372834733334e-06, + "loss": 0.1886, + "step": 27325 + }, + { + "epoch": 0.6914998608193942, + "grad_norm": 3.6088263988494873, + "learning_rate": 2.2107040366250494e-06, + "loss": 0.163, + "step": 27326 + }, + { + "epoch": 0.6915251663840879, + "grad_norm": 4.117448329925537, + "learning_rate": 2.2103708077644148e-06, + "loss": 0.1787, + "step": 27327 + }, + { + "epoch": 0.6915504719487815, + "grad_norm": 9.013700485229492, + "learning_rate": 2.2100375968935834e-06, + "loss": 0.2019, + "step": 27328 + }, + { + "epoch": 0.6915757775134752, + "grad_norm": 6.996546745300293, + "learning_rate": 2.2097044040146965e-06, + "loss": 0.1774, + "step": 27329 + }, + { + "epoch": 0.6916010830781689, + "grad_norm": 7.209330081939697, + "learning_rate": 2.209371229129909e-06, + "loss": 0.1802, + "step": 27330 + }, + { + "epoch": 0.6916263886428625, + "grad_norm": 3.027087450027466, + "learning_rate": 2.209038072241366e-06, + "loss": 0.134, + "step": 27331 + }, + { + "epoch": 0.6916516942075562, + "grad_norm": 3.622170925140381, + "learning_rate": 2.2087049333512185e-06, + "loss": 0.109, + "step": 27332 + }, + { + "epoch": 0.69167699977225, + "grad_norm": 4.521775245666504, + "learning_rate": 2.2083718124616136e-06, + "loss": 0.0856, + "step": 27333 + }, + { + "epoch": 0.6917023053369435, + "grad_norm": 3.4430935382843018, + "learning_rate": 2.2080387095746996e-06, + "loss": 0.0843, + "step": 27334 + }, + { + "epoch": 0.6917276109016373, + "grad_norm": 3.555166244506836, + "learning_rate": 2.207705624692623e-06, + "loss": 0.1544, + "step": 27335 + }, + { + "epoch": 0.691752916466331, + "grad_norm": 12.403935432434082, + "learning_rate": 2.207372557817535e-06, + "loss": 0.3818, + "step": 27336 + }, + { + "epoch": 0.6917782220310246, + "grad_norm": 6.200455665588379, + "learning_rate": 2.2070395089515826e-06, + "loss": 0.1619, + "step": 27337 + }, + { + "epoch": 0.6918035275957183, + "grad_norm": 3.458631992340088, + "learning_rate": 2.206706478096913e-06, + "loss": 0.1114, + "step": 27338 + }, + { + "epoch": 0.691828833160412, + "grad_norm": 5.0760393142700195, + "learning_rate": 2.206373465255673e-06, + "loss": 0.1908, + "step": 27339 + }, + { + "epoch": 0.6918541387251057, + "grad_norm": 4.153987407684326, + "learning_rate": 2.206040470430009e-06, + "loss": 0.1842, + "step": 27340 + }, + { + "epoch": 0.6918794442897993, + "grad_norm": 2.3448901176452637, + "learning_rate": 2.2057074936220723e-06, + "loss": 0.0811, + "step": 27341 + }, + { + "epoch": 0.691904749854493, + "grad_norm": 6.315034866333008, + "learning_rate": 2.2053745348340084e-06, + "loss": 0.2115, + "step": 27342 + }, + { + "epoch": 0.6919300554191867, + "grad_norm": 5.034677505493164, + "learning_rate": 2.205041594067964e-06, + "loss": 0.1563, + "step": 27343 + }, + { + "epoch": 0.6919553609838803, + "grad_norm": 4.744436264038086, + "learning_rate": 2.2047086713260853e-06, + "loss": 0.1408, + "step": 27344 + }, + { + "epoch": 0.691980666548574, + "grad_norm": 19.872989654541016, + "learning_rate": 2.2043757666105214e-06, + "loss": 0.1712, + "step": 27345 + }, + { + "epoch": 0.6920059721132678, + "grad_norm": 3.5358002185821533, + "learning_rate": 2.204042879923418e-06, + "loss": 0.1155, + "step": 27346 + }, + { + "epoch": 0.6920312776779614, + "grad_norm": 4.593898296356201, + "learning_rate": 2.203710011266923e-06, + "loss": 0.1018, + "step": 27347 + }, + { + "epoch": 0.6920565832426551, + "grad_norm": 4.868710517883301, + "learning_rate": 2.2033771606431793e-06, + "loss": 0.1531, + "step": 27348 + }, + { + "epoch": 0.6920818888073488, + "grad_norm": 3.666445016860962, + "learning_rate": 2.2030443280543386e-06, + "loss": 0.1413, + "step": 27349 + }, + { + "epoch": 0.6921071943720424, + "grad_norm": 3.024672269821167, + "learning_rate": 2.202711513502544e-06, + "loss": 0.1339, + "step": 27350 + }, + { + "epoch": 0.6921324999367361, + "grad_norm": 4.189652442932129, + "learning_rate": 2.202378716989943e-06, + "loss": 0.1198, + "step": 27351 + }, + { + "epoch": 0.6921578055014298, + "grad_norm": 9.527962684631348, + "learning_rate": 2.202045938518679e-06, + "loss": 0.1901, + "step": 27352 + }, + { + "epoch": 0.6921831110661234, + "grad_norm": 3.6079471111297607, + "learning_rate": 2.2017131780909025e-06, + "loss": 0.2186, + "step": 27353 + }, + { + "epoch": 0.6922084166308171, + "grad_norm": 5.029184341430664, + "learning_rate": 2.2013804357087553e-06, + "loss": 0.1482, + "step": 27354 + }, + { + "epoch": 0.6922337221955108, + "grad_norm": 4.221623420715332, + "learning_rate": 2.2010477113743887e-06, + "loss": 0.1565, + "step": 27355 + }, + { + "epoch": 0.6922590277602044, + "grad_norm": 3.84613037109375, + "learning_rate": 2.200715005089941e-06, + "loss": 0.1089, + "step": 27356 + }, + { + "epoch": 0.6922843333248981, + "grad_norm": 8.382537841796875, + "learning_rate": 2.200382316857564e-06, + "loss": 0.2635, + "step": 27357 + }, + { + "epoch": 0.6923096388895918, + "grad_norm": 8.11877155303955, + "learning_rate": 2.200049646679398e-06, + "loss": 0.2138, + "step": 27358 + }, + { + "epoch": 0.6923349444542855, + "grad_norm": 5.2177815437316895, + "learning_rate": 2.1997169945575953e-06, + "loss": 0.1688, + "step": 27359 + }, + { + "epoch": 0.6923602500189792, + "grad_norm": 4.328207969665527, + "learning_rate": 2.1993843604942925e-06, + "loss": 0.158, + "step": 27360 + }, + { + "epoch": 0.6923855555836729, + "grad_norm": 6.492273807525635, + "learning_rate": 2.199051744491641e-06, + "loss": 0.2062, + "step": 27361 + }, + { + "epoch": 0.6924108611483665, + "grad_norm": 3.5724477767944336, + "learning_rate": 2.1987191465517823e-06, + "loss": 0.1282, + "step": 27362 + }, + { + "epoch": 0.6924361667130602, + "grad_norm": 2.723372459411621, + "learning_rate": 2.1983865666768646e-06, + "loss": 0.1458, + "step": 27363 + }, + { + "epoch": 0.6924614722777539, + "grad_norm": 4.807811260223389, + "learning_rate": 2.19805400486903e-06, + "loss": 0.1404, + "step": 27364 + }, + { + "epoch": 0.6924867778424476, + "grad_norm": 4.177711009979248, + "learning_rate": 2.197721461130425e-06, + "loss": 0.132, + "step": 27365 + }, + { + "epoch": 0.6925120834071412, + "grad_norm": 5.454464435577393, + "learning_rate": 2.197388935463192e-06, + "loss": 0.108, + "step": 27366 + }, + { + "epoch": 0.6925373889718349, + "grad_norm": 8.309633255004883, + "learning_rate": 2.1970564278694756e-06, + "loss": 0.179, + "step": 27367 + }, + { + "epoch": 0.6925626945365286, + "grad_norm": 3.8376243114471436, + "learning_rate": 2.196723938351422e-06, + "loss": 0.1566, + "step": 27368 + }, + { + "epoch": 0.6925880001012222, + "grad_norm": 4.055815696716309, + "learning_rate": 2.1963914669111745e-06, + "loss": 0.1664, + "step": 27369 + }, + { + "epoch": 0.692613305665916, + "grad_norm": 5.458028316497803, + "learning_rate": 2.1960590135508775e-06, + "loss": 0.0973, + "step": 27370 + }, + { + "epoch": 0.6926386112306097, + "grad_norm": 3.78269362449646, + "learning_rate": 2.195726578272672e-06, + "loss": 0.1127, + "step": 27371 + }, + { + "epoch": 0.6926639167953033, + "grad_norm": 7.375550270080566, + "learning_rate": 2.1953941610787055e-06, + "loss": 0.1798, + "step": 27372 + }, + { + "epoch": 0.692689222359997, + "grad_norm": 12.426806449890137, + "learning_rate": 2.195061761971121e-06, + "loss": 0.2424, + "step": 27373 + }, + { + "epoch": 0.6927145279246907, + "grad_norm": 3.760582447052002, + "learning_rate": 2.1947293809520614e-06, + "loss": 0.1106, + "step": 27374 + }, + { + "epoch": 0.6927398334893843, + "grad_norm": 4.907205104827881, + "learning_rate": 2.194397018023668e-06, + "loss": 0.1704, + "step": 27375 + }, + { + "epoch": 0.692765139054078, + "grad_norm": 4.3949995040893555, + "learning_rate": 2.1940646731880887e-06, + "loss": 0.1372, + "step": 27376 + }, + { + "epoch": 0.6927904446187717, + "grad_norm": 6.242656707763672, + "learning_rate": 2.1937323464474643e-06, + "loss": 0.1964, + "step": 27377 + }, + { + "epoch": 0.6928157501834653, + "grad_norm": 5.737272262573242, + "learning_rate": 2.1934000378039377e-06, + "loss": 0.1599, + "step": 27378 + }, + { + "epoch": 0.692841055748159, + "grad_norm": 4.9526448249816895, + "learning_rate": 2.19306774725965e-06, + "loss": 0.1874, + "step": 27379 + }, + { + "epoch": 0.6928663613128527, + "grad_norm": 6.599137783050537, + "learning_rate": 2.192735474816749e-06, + "loss": 0.1054, + "step": 27380 + }, + { + "epoch": 0.6928916668775463, + "grad_norm": 8.288629531860352, + "learning_rate": 2.1924032204773744e-06, + "loss": 0.1851, + "step": 27381 + }, + { + "epoch": 0.69291697244224, + "grad_norm": 3.387964963912964, + "learning_rate": 2.192070984243669e-06, + "loss": 0.099, + "step": 27382 + }, + { + "epoch": 0.6929422780069338, + "grad_norm": 4.172055721282959, + "learning_rate": 2.1917387661177735e-06, + "loss": 0.1833, + "step": 27383 + }, + { + "epoch": 0.6929675835716274, + "grad_norm": 4.202319145202637, + "learning_rate": 2.1914065661018343e-06, + "loss": 0.1327, + "step": 27384 + }, + { + "epoch": 0.6929928891363211, + "grad_norm": 4.783987522125244, + "learning_rate": 2.1910743841979896e-06, + "loss": 0.1772, + "step": 27385 + }, + { + "epoch": 0.6930181947010148, + "grad_norm": 8.583017349243164, + "learning_rate": 2.1907422204083884e-06, + "loss": 0.2303, + "step": 27386 + }, + { + "epoch": 0.6930435002657084, + "grad_norm": 7.956045627593994, + "learning_rate": 2.1904100747351638e-06, + "loss": 0.1299, + "step": 27387 + }, + { + "epoch": 0.6930688058304021, + "grad_norm": 4.014437675476074, + "learning_rate": 2.1900779471804635e-06, + "loss": 0.1701, + "step": 27388 + }, + { + "epoch": 0.6930941113950958, + "grad_norm": 4.355175495147705, + "learning_rate": 2.1897458377464263e-06, + "loss": 0.1885, + "step": 27389 + }, + { + "epoch": 0.6931194169597894, + "grad_norm": 12.574634552001953, + "learning_rate": 2.189413746435199e-06, + "loss": 0.3387, + "step": 27390 + }, + { + "epoch": 0.6931447225244831, + "grad_norm": 5.1870598793029785, + "learning_rate": 2.189081673248916e-06, + "loss": 0.123, + "step": 27391 + }, + { + "epoch": 0.6931700280891768, + "grad_norm": 3.363539934158325, + "learning_rate": 2.1887496181897245e-06, + "loss": 0.1072, + "step": 27392 + }, + { + "epoch": 0.6931953336538705, + "grad_norm": 3.997130870819092, + "learning_rate": 2.1884175812597636e-06, + "loss": 0.1796, + "step": 27393 + }, + { + "epoch": 0.6932206392185641, + "grad_norm": 5.662525177001953, + "learning_rate": 2.1880855624611724e-06, + "loss": 0.1717, + "step": 27394 + }, + { + "epoch": 0.6932459447832579, + "grad_norm": 3.978405714035034, + "learning_rate": 2.187753561796097e-06, + "loss": 0.1345, + "step": 27395 + }, + { + "epoch": 0.6932712503479516, + "grad_norm": 8.880208015441895, + "learning_rate": 2.1874215792666752e-06, + "loss": 0.142, + "step": 27396 + }, + { + "epoch": 0.6932965559126452, + "grad_norm": 4.756007671356201, + "learning_rate": 2.1870896148750486e-06, + "loss": 0.1983, + "step": 27397 + }, + { + "epoch": 0.6933218614773389, + "grad_norm": 6.227919578552246, + "learning_rate": 2.1867576686233567e-06, + "loss": 0.1727, + "step": 27398 + }, + { + "epoch": 0.6933471670420326, + "grad_norm": 9.267666816711426, + "learning_rate": 2.186425740513742e-06, + "loss": 0.1886, + "step": 27399 + }, + { + "epoch": 0.6933724726067262, + "grad_norm": 5.940617084503174, + "learning_rate": 2.186093830548345e-06, + "loss": 0.2259, + "step": 27400 + }, + { + "epoch": 0.6933977781714199, + "grad_norm": 3.681666851043701, + "learning_rate": 2.1857619387293048e-06, + "loss": 0.1283, + "step": 27401 + }, + { + "epoch": 0.6934230837361136, + "grad_norm": 3.32999324798584, + "learning_rate": 2.185430065058761e-06, + "loss": 0.1307, + "step": 27402 + }, + { + "epoch": 0.6934483893008072, + "grad_norm": 3.287119150161743, + "learning_rate": 2.1850982095388597e-06, + "loss": 0.1742, + "step": 27403 + }, + { + "epoch": 0.6934736948655009, + "grad_norm": 5.117229461669922, + "learning_rate": 2.1847663721717315e-06, + "loss": 0.1592, + "step": 27404 + }, + { + "epoch": 0.6934990004301946, + "grad_norm": 5.250895023345947, + "learning_rate": 2.1844345529595233e-06, + "loss": 0.1486, + "step": 27405 + }, + { + "epoch": 0.6935243059948882, + "grad_norm": 6.825982570648193, + "learning_rate": 2.184102751904372e-06, + "loss": 0.2775, + "step": 27406 + }, + { + "epoch": 0.693549611559582, + "grad_norm": 4.620809078216553, + "learning_rate": 2.1837709690084193e-06, + "loss": 0.139, + "step": 27407 + }, + { + "epoch": 0.6935749171242757, + "grad_norm": 4.890329360961914, + "learning_rate": 2.183439204273804e-06, + "loss": 0.122, + "step": 27408 + }, + { + "epoch": 0.6936002226889693, + "grad_norm": 4.888276100158691, + "learning_rate": 2.1831074577026652e-06, + "loss": 0.1317, + "step": 27409 + }, + { + "epoch": 0.693625528253663, + "grad_norm": 3.6461799144744873, + "learning_rate": 2.18277572929714e-06, + "loss": 0.1235, + "step": 27410 + }, + { + "epoch": 0.6936508338183567, + "grad_norm": 13.927373886108398, + "learning_rate": 2.1824440190593734e-06, + "loss": 0.2161, + "step": 27411 + }, + { + "epoch": 0.6936761393830503, + "grad_norm": 3.031137704849243, + "learning_rate": 2.1821123269915e-06, + "loss": 0.1088, + "step": 27412 + }, + { + "epoch": 0.693701444947744, + "grad_norm": 5.913970470428467, + "learning_rate": 2.1817806530956605e-06, + "loss": 0.1756, + "step": 27413 + }, + { + "epoch": 0.6937267505124377, + "grad_norm": 4.807081699371338, + "learning_rate": 2.181448997373991e-06, + "loss": 0.1399, + "step": 27414 + }, + { + "epoch": 0.6937520560771313, + "grad_norm": 10.185195922851562, + "learning_rate": 2.181117359828635e-06, + "loss": 0.2672, + "step": 27415 + }, + { + "epoch": 0.693777361641825, + "grad_norm": 4.0473313331604, + "learning_rate": 2.1807857404617273e-06, + "loss": 0.1648, + "step": 27416 + }, + { + "epoch": 0.6938026672065187, + "grad_norm": 11.682513236999512, + "learning_rate": 2.180454139275411e-06, + "loss": 0.181, + "step": 27417 + }, + { + "epoch": 0.6938279727712124, + "grad_norm": 2.3710436820983887, + "learning_rate": 2.1801225562718177e-06, + "loss": 0.0587, + "step": 27418 + }, + { + "epoch": 0.693853278335906, + "grad_norm": 10.716487884521484, + "learning_rate": 2.179790991453092e-06, + "loss": 0.1249, + "step": 27419 + }, + { + "epoch": 0.6938785839005998, + "grad_norm": 7.80556583404541, + "learning_rate": 2.179459444821367e-06, + "loss": 0.1504, + "step": 27420 + }, + { + "epoch": 0.6939038894652935, + "grad_norm": 5.965040683746338, + "learning_rate": 2.1791279163787877e-06, + "loss": 0.1397, + "step": 27421 + }, + { + "epoch": 0.6939291950299871, + "grad_norm": 3.451643943786621, + "learning_rate": 2.1787964061274836e-06, + "loss": 0.1307, + "step": 27422 + }, + { + "epoch": 0.6939545005946808, + "grad_norm": 5.877071380615234, + "learning_rate": 2.1784649140695986e-06, + "loss": 0.162, + "step": 27423 + }, + { + "epoch": 0.6939798061593745, + "grad_norm": 3.052159547805786, + "learning_rate": 2.1781334402072687e-06, + "loss": 0.1122, + "step": 27424 + }, + { + "epoch": 0.6940051117240681, + "grad_norm": 4.761368274688721, + "learning_rate": 2.1778019845426315e-06, + "loss": 0.1706, + "step": 27425 + }, + { + "epoch": 0.6940304172887618, + "grad_norm": 5.044463157653809, + "learning_rate": 2.177470547077822e-06, + "loss": 0.1648, + "step": 27426 + }, + { + "epoch": 0.6940557228534555, + "grad_norm": 3.4541046619415283, + "learning_rate": 2.1771391278149817e-06, + "loss": 0.0852, + "step": 27427 + }, + { + "epoch": 0.6940810284181491, + "grad_norm": 3.4036757946014404, + "learning_rate": 2.1768077267562466e-06, + "loss": 0.1834, + "step": 27428 + }, + { + "epoch": 0.6941063339828428, + "grad_norm": 6.796820640563965, + "learning_rate": 2.1764763439037512e-06, + "loss": 0.1296, + "step": 27429 + }, + { + "epoch": 0.6941316395475365, + "grad_norm": 7.132484436035156, + "learning_rate": 2.1761449792596368e-06, + "loss": 0.1416, + "step": 27430 + }, + { + "epoch": 0.6941569451122301, + "grad_norm": 3.7655978202819824, + "learning_rate": 2.1758136328260374e-06, + "loss": 0.1133, + "step": 27431 + }, + { + "epoch": 0.6941822506769239, + "grad_norm": 4.427060604095459, + "learning_rate": 2.175482304605091e-06, + "loss": 0.1266, + "step": 27432 + }, + { + "epoch": 0.6942075562416176, + "grad_norm": 3.849463939666748, + "learning_rate": 2.1751509945989326e-06, + "loss": 0.1818, + "step": 27433 + }, + { + "epoch": 0.6942328618063112, + "grad_norm": 5.275895595550537, + "learning_rate": 2.174819702809703e-06, + "loss": 0.1874, + "step": 27434 + }, + { + "epoch": 0.6942581673710049, + "grad_norm": 7.3048996925354, + "learning_rate": 2.174488429239532e-06, + "loss": 0.1561, + "step": 27435 + }, + { + "epoch": 0.6942834729356986, + "grad_norm": 3.5283210277557373, + "learning_rate": 2.174157173890561e-06, + "loss": 0.1216, + "step": 27436 + }, + { + "epoch": 0.6943087785003922, + "grad_norm": 10.76406192779541, + "learning_rate": 2.173825936764923e-06, + "loss": 0.1044, + "step": 27437 + }, + { + "epoch": 0.6943340840650859, + "grad_norm": 4.8009514808654785, + "learning_rate": 2.173494717864758e-06, + "loss": 0.1005, + "step": 27438 + }, + { + "epoch": 0.6943593896297796, + "grad_norm": 3.6699776649475098, + "learning_rate": 2.1731635171922e-06, + "loss": 0.1218, + "step": 27439 + }, + { + "epoch": 0.6943846951944732, + "grad_norm": 5.878757476806641, + "learning_rate": 2.1728323347493844e-06, + "loss": 0.2412, + "step": 27440 + }, + { + "epoch": 0.6944100007591669, + "grad_norm": 2.9227380752563477, + "learning_rate": 2.172501170538445e-06, + "loss": 0.0487, + "step": 27441 + }, + { + "epoch": 0.6944353063238606, + "grad_norm": 7.366187572479248, + "learning_rate": 2.1721700245615217e-06, + "loss": 0.1565, + "step": 27442 + }, + { + "epoch": 0.6944606118885543, + "grad_norm": 12.418654441833496, + "learning_rate": 2.1718388968207484e-06, + "loss": 0.3173, + "step": 27443 + }, + { + "epoch": 0.694485917453248, + "grad_norm": 14.238470077514648, + "learning_rate": 2.1715077873182594e-06, + "loss": 0.2326, + "step": 27444 + }, + { + "epoch": 0.6945112230179417, + "grad_norm": 2.86515736579895, + "learning_rate": 2.1711766960561886e-06, + "loss": 0.148, + "step": 27445 + }, + { + "epoch": 0.6945365285826354, + "grad_norm": 4.48704719543457, + "learning_rate": 2.1708456230366753e-06, + "loss": 0.1493, + "step": 27446 + }, + { + "epoch": 0.694561834147329, + "grad_norm": 3.0732979774475098, + "learning_rate": 2.1705145682618506e-06, + "loss": 0.1102, + "step": 27447 + }, + { + "epoch": 0.6945871397120227, + "grad_norm": 6.414071559906006, + "learning_rate": 2.170183531733854e-06, + "loss": 0.1212, + "step": 27448 + }, + { + "epoch": 0.6946124452767164, + "grad_norm": 4.427931785583496, + "learning_rate": 2.169852513454815e-06, + "loss": 0.1514, + "step": 27449 + }, + { + "epoch": 0.69463775084141, + "grad_norm": 4.468139171600342, + "learning_rate": 2.1695215134268717e-06, + "loss": 0.1601, + "step": 27450 + }, + { + "epoch": 0.6946630564061037, + "grad_norm": 3.056058406829834, + "learning_rate": 2.1691905316521573e-06, + "loss": 0.0839, + "step": 27451 + }, + { + "epoch": 0.6946883619707974, + "grad_norm": 2.8976540565490723, + "learning_rate": 2.168859568132807e-06, + "loss": 0.1113, + "step": 27452 + }, + { + "epoch": 0.694713667535491, + "grad_norm": 3.175541639328003, + "learning_rate": 2.168528622870953e-06, + "loss": 0.076, + "step": 27453 + }, + { + "epoch": 0.6947389731001847, + "grad_norm": 5.662281513214111, + "learning_rate": 2.1681976958687327e-06, + "loss": 0.1442, + "step": 27454 + }, + { + "epoch": 0.6947642786648784, + "grad_norm": 5.4146575927734375, + "learning_rate": 2.1678667871282793e-06, + "loss": 0.1222, + "step": 27455 + }, + { + "epoch": 0.694789584229572, + "grad_norm": 13.716228485107422, + "learning_rate": 2.1675358966517256e-06, + "loss": 0.2156, + "step": 27456 + }, + { + "epoch": 0.6948148897942658, + "grad_norm": 6.287960529327393, + "learning_rate": 2.1672050244412044e-06, + "loss": 0.1615, + "step": 27457 + }, + { + "epoch": 0.6948401953589595, + "grad_norm": 14.846665382385254, + "learning_rate": 2.1668741704988533e-06, + "loss": 0.1735, + "step": 27458 + }, + { + "epoch": 0.6948655009236531, + "grad_norm": 9.170289039611816, + "learning_rate": 2.1665433348268035e-06, + "loss": 0.2186, + "step": 27459 + }, + { + "epoch": 0.6948908064883468, + "grad_norm": 3.9721152782440186, + "learning_rate": 2.1662125174271863e-06, + "loss": 0.1566, + "step": 27460 + }, + { + "epoch": 0.6949161120530405, + "grad_norm": 8.767011642456055, + "learning_rate": 2.165881718302141e-06, + "loss": 0.1819, + "step": 27461 + }, + { + "epoch": 0.6949414176177341, + "grad_norm": 3.457423448562622, + "learning_rate": 2.165550937453795e-06, + "loss": 0.0973, + "step": 27462 + }, + { + "epoch": 0.6949667231824278, + "grad_norm": 5.14355993270874, + "learning_rate": 2.1652201748842843e-06, + "loss": 0.2021, + "step": 27463 + }, + { + "epoch": 0.6949920287471215, + "grad_norm": 15.030399322509766, + "learning_rate": 2.1648894305957397e-06, + "loss": 0.178, + "step": 27464 + }, + { + "epoch": 0.6950173343118151, + "grad_norm": 3.549417495727539, + "learning_rate": 2.1645587045903e-06, + "loss": 0.1536, + "step": 27465 + }, + { + "epoch": 0.6950426398765088, + "grad_norm": 7.051757335662842, + "learning_rate": 2.1642279968700895e-06, + "loss": 0.232, + "step": 27466 + }, + { + "epoch": 0.6950679454412025, + "grad_norm": 5.296755313873291, + "learning_rate": 2.1638973074372475e-06, + "loss": 0.1744, + "step": 27467 + }, + { + "epoch": 0.6950932510058963, + "grad_norm": 4.94528341293335, + "learning_rate": 2.1635666362939022e-06, + "loss": 0.1565, + "step": 27468 + }, + { + "epoch": 0.6951185565705899, + "grad_norm": 18.147808074951172, + "learning_rate": 2.1632359834421894e-06, + "loss": 0.2855, + "step": 27469 + }, + { + "epoch": 0.6951438621352836, + "grad_norm": 3.071641683578491, + "learning_rate": 2.162905348884241e-06, + "loss": 0.1181, + "step": 27470 + }, + { + "epoch": 0.6951691676999773, + "grad_norm": 4.322681427001953, + "learning_rate": 2.1625747326221873e-06, + "loss": 0.2154, + "step": 27471 + }, + { + "epoch": 0.6951944732646709, + "grad_norm": 6.2676568031311035, + "learning_rate": 2.1622441346581603e-06, + "loss": 0.2182, + "step": 27472 + }, + { + "epoch": 0.6952197788293646, + "grad_norm": 5.533329010009766, + "learning_rate": 2.1619135549942945e-06, + "loss": 0.2642, + "step": 27473 + }, + { + "epoch": 0.6952450843940583, + "grad_norm": 3.5439693927764893, + "learning_rate": 2.1615829936327203e-06, + "loss": 0.1055, + "step": 27474 + }, + { + "epoch": 0.6952703899587519, + "grad_norm": 5.42024564743042, + "learning_rate": 2.1612524505755695e-06, + "loss": 0.1749, + "step": 27475 + }, + { + "epoch": 0.6952956955234456, + "grad_norm": 7.406887531280518, + "learning_rate": 2.1609219258249713e-06, + "loss": 0.1117, + "step": 27476 + }, + { + "epoch": 0.6953210010881393, + "grad_norm": 5.156291484832764, + "learning_rate": 2.1605914193830618e-06, + "loss": 0.1589, + "step": 27477 + }, + { + "epoch": 0.6953463066528329, + "grad_norm": 6.863901138305664, + "learning_rate": 2.16026093125197e-06, + "loss": 0.182, + "step": 27478 + }, + { + "epoch": 0.6953716122175266, + "grad_norm": 5.995182037353516, + "learning_rate": 2.1599304614338275e-06, + "loss": 0.1277, + "step": 27479 + }, + { + "epoch": 0.6953969177822203, + "grad_norm": 4.23392915725708, + "learning_rate": 2.159600009930763e-06, + "loss": 0.1411, + "step": 27480 + }, + { + "epoch": 0.695422223346914, + "grad_norm": 4.293314456939697, + "learning_rate": 2.1592695767449112e-06, + "loss": 0.1454, + "step": 27481 + }, + { + "epoch": 0.6954475289116077, + "grad_norm": 7.8399529457092285, + "learning_rate": 2.158939161878402e-06, + "loss": 0.157, + "step": 27482 + }, + { + "epoch": 0.6954728344763014, + "grad_norm": 4.286160945892334, + "learning_rate": 2.1586087653333654e-06, + "loss": 0.1619, + "step": 27483 + }, + { + "epoch": 0.695498140040995, + "grad_norm": 3.3104958534240723, + "learning_rate": 2.1582783871119305e-06, + "loss": 0.0998, + "step": 27484 + }, + { + "epoch": 0.6955234456056887, + "grad_norm": 3.651728630065918, + "learning_rate": 2.157948027216232e-06, + "loss": 0.1382, + "step": 27485 + }, + { + "epoch": 0.6955487511703824, + "grad_norm": 22.985187530517578, + "learning_rate": 2.157617685648397e-06, + "loss": 0.4049, + "step": 27486 + }, + { + "epoch": 0.695574056735076, + "grad_norm": 5.546099662780762, + "learning_rate": 2.157287362410557e-06, + "loss": 0.1226, + "step": 27487 + }, + { + "epoch": 0.6955993622997697, + "grad_norm": 10.246232986450195, + "learning_rate": 2.1569570575048425e-06, + "loss": 0.2486, + "step": 27488 + }, + { + "epoch": 0.6956246678644634, + "grad_norm": 22.472824096679688, + "learning_rate": 2.1566267709333818e-06, + "loss": 0.205, + "step": 27489 + }, + { + "epoch": 0.695649973429157, + "grad_norm": 4.418075084686279, + "learning_rate": 2.1562965026983073e-06, + "loss": 0.1732, + "step": 27490 + }, + { + "epoch": 0.6956752789938507, + "grad_norm": 3.2737903594970703, + "learning_rate": 2.1559662528017457e-06, + "loss": 0.121, + "step": 27491 + }, + { + "epoch": 0.6957005845585444, + "grad_norm": 3.585223913192749, + "learning_rate": 2.1556360212458325e-06, + "loss": 0.1313, + "step": 27492 + }, + { + "epoch": 0.6957258901232382, + "grad_norm": 2.8764278888702393, + "learning_rate": 2.15530580803269e-06, + "loss": 0.08, + "step": 27493 + }, + { + "epoch": 0.6957511956879318, + "grad_norm": 7.6478424072265625, + "learning_rate": 2.1549756131644528e-06, + "loss": 0.1522, + "step": 27494 + }, + { + "epoch": 0.6957765012526255, + "grad_norm": 8.311806678771973, + "learning_rate": 2.154645436643247e-06, + "loss": 0.2469, + "step": 27495 + }, + { + "epoch": 0.6958018068173192, + "grad_norm": 6.005948066711426, + "learning_rate": 2.1543152784712074e-06, + "loss": 0.1562, + "step": 27496 + }, + { + "epoch": 0.6958271123820128, + "grad_norm": 4.217858791351318, + "learning_rate": 2.153985138650455e-06, + "loss": 0.1922, + "step": 27497 + }, + { + "epoch": 0.6958524179467065, + "grad_norm": 2.6308605670928955, + "learning_rate": 2.1536550171831256e-06, + "loss": 0.1462, + "step": 27498 + }, + { + "epoch": 0.6958777235114002, + "grad_norm": 3.027019500732422, + "learning_rate": 2.153324914071343e-06, + "loss": 0.1296, + "step": 27499 + }, + { + "epoch": 0.6959030290760938, + "grad_norm": 2.8120899200439453, + "learning_rate": 2.1529948293172424e-06, + "loss": 0.1372, + "step": 27500 + }, + { + "epoch": 0.6959283346407875, + "grad_norm": 3.731153726577759, + "learning_rate": 2.1526647629229445e-06, + "loss": 0.1603, + "step": 27501 + }, + { + "epoch": 0.6959536402054812, + "grad_norm": 3.057055950164795, + "learning_rate": 2.1523347148905845e-06, + "loss": 0.1191, + "step": 27502 + }, + { + "epoch": 0.6959789457701748, + "grad_norm": 7.082411289215088, + "learning_rate": 2.152004685222286e-06, + "loss": 0.1859, + "step": 27503 + }, + { + "epoch": 0.6960042513348685, + "grad_norm": 2.944227457046509, + "learning_rate": 2.1516746739201814e-06, + "loss": 0.1443, + "step": 27504 + }, + { + "epoch": 0.6960295568995623, + "grad_norm": 4.687591552734375, + "learning_rate": 2.151344680986397e-06, + "loss": 0.143, + "step": 27505 + }, + { + "epoch": 0.6960548624642559, + "grad_norm": 3.927469253540039, + "learning_rate": 2.1510147064230608e-06, + "loss": 0.085, + "step": 27506 + }, + { + "epoch": 0.6960801680289496, + "grad_norm": 6.571967124938965, + "learning_rate": 2.150684750232299e-06, + "loss": 0.2227, + "step": 27507 + }, + { + "epoch": 0.6961054735936433, + "grad_norm": 11.616191864013672, + "learning_rate": 2.150354812416243e-06, + "loss": 0.2511, + "step": 27508 + }, + { + "epoch": 0.6961307791583369, + "grad_norm": 6.823018550872803, + "learning_rate": 2.150024892977019e-06, + "loss": 0.1459, + "step": 27509 + }, + { + "epoch": 0.6961560847230306, + "grad_norm": 5.466637134552002, + "learning_rate": 2.1496949919167544e-06, + "loss": 0.191, + "step": 27510 + }, + { + "epoch": 0.6961813902877243, + "grad_norm": 3.600407123565674, + "learning_rate": 2.1493651092375746e-06, + "loss": 0.0892, + "step": 27511 + }, + { + "epoch": 0.6962066958524179, + "grad_norm": 6.320221424102783, + "learning_rate": 2.1490352449416106e-06, + "loss": 0.2141, + "step": 27512 + }, + { + "epoch": 0.6962320014171116, + "grad_norm": 5.115489959716797, + "learning_rate": 2.148705399030988e-06, + "loss": 0.1851, + "step": 27513 + }, + { + "epoch": 0.6962573069818053, + "grad_norm": 6.38310432434082, + "learning_rate": 2.148375571507834e-06, + "loss": 0.1298, + "step": 27514 + }, + { + "epoch": 0.6962826125464989, + "grad_norm": 10.970799446105957, + "learning_rate": 2.1480457623742757e-06, + "loss": 0.1647, + "step": 27515 + }, + { + "epoch": 0.6963079181111926, + "grad_norm": 15.990516662597656, + "learning_rate": 2.1477159716324374e-06, + "loss": 0.2604, + "step": 27516 + }, + { + "epoch": 0.6963332236758863, + "grad_norm": 5.34983491897583, + "learning_rate": 2.1473861992844506e-06, + "loss": 0.1472, + "step": 27517 + }, + { + "epoch": 0.69635852924058, + "grad_norm": 3.995192050933838, + "learning_rate": 2.1470564453324393e-06, + "loss": 0.1248, + "step": 27518 + }, + { + "epoch": 0.6963838348052737, + "grad_norm": 3.9446427822113037, + "learning_rate": 2.1467267097785304e-06, + "loss": 0.1634, + "step": 27519 + }, + { + "epoch": 0.6964091403699674, + "grad_norm": 4.680680751800537, + "learning_rate": 2.1463969926248486e-06, + "loss": 0.1531, + "step": 27520 + }, + { + "epoch": 0.6964344459346611, + "grad_norm": 3.571636438369751, + "learning_rate": 2.1460672938735234e-06, + "loss": 0.1135, + "step": 27521 + }, + { + "epoch": 0.6964597514993547, + "grad_norm": 3.94272518157959, + "learning_rate": 2.145737613526677e-06, + "loss": 0.1444, + "step": 27522 + }, + { + "epoch": 0.6964850570640484, + "grad_norm": 3.269284248352051, + "learning_rate": 2.1454079515864424e-06, + "loss": 0.0943, + "step": 27523 + }, + { + "epoch": 0.6965103626287421, + "grad_norm": 11.231241226196289, + "learning_rate": 2.1450783080549365e-06, + "loss": 0.3275, + "step": 27524 + }, + { + "epoch": 0.6965356681934357, + "grad_norm": 5.419910430908203, + "learning_rate": 2.144748682934292e-06, + "loss": 0.1403, + "step": 27525 + }, + { + "epoch": 0.6965609737581294, + "grad_norm": 7.063163757324219, + "learning_rate": 2.14441907622663e-06, + "loss": 0.1531, + "step": 27526 + }, + { + "epoch": 0.6965862793228231, + "grad_norm": 3.8882927894592285, + "learning_rate": 2.1440894879340817e-06, + "loss": 0.0864, + "step": 27527 + }, + { + "epoch": 0.6966115848875167, + "grad_norm": 2.331381320953369, + "learning_rate": 2.1437599180587652e-06, + "loss": 0.0937, + "step": 27528 + }, + { + "epoch": 0.6966368904522104, + "grad_norm": 15.613931655883789, + "learning_rate": 2.143430366602812e-06, + "loss": 0.4109, + "step": 27529 + }, + { + "epoch": 0.6966621960169042, + "grad_norm": 5.049010753631592, + "learning_rate": 2.143100833568343e-06, + "loss": 0.1231, + "step": 27530 + }, + { + "epoch": 0.6966875015815978, + "grad_norm": 5.077874183654785, + "learning_rate": 2.142771318957489e-06, + "loss": 0.1106, + "step": 27531 + }, + { + "epoch": 0.6967128071462915, + "grad_norm": 2.1021459102630615, + "learning_rate": 2.1424418227723677e-06, + "loss": 0.1159, + "step": 27532 + }, + { + "epoch": 0.6967381127109852, + "grad_norm": 4.074644565582275, + "learning_rate": 2.1421123450151094e-06, + "loss": 0.1146, + "step": 27533 + }, + { + "epoch": 0.6967634182756788, + "grad_norm": 8.17179012298584, + "learning_rate": 2.141782885687835e-06, + "loss": 0.2149, + "step": 27534 + }, + { + "epoch": 0.6967887238403725, + "grad_norm": 2.928699016571045, + "learning_rate": 2.1414534447926728e-06, + "loss": 0.0981, + "step": 27535 + }, + { + "epoch": 0.6968140294050662, + "grad_norm": 9.513311386108398, + "learning_rate": 2.1411240223317458e-06, + "loss": 0.1999, + "step": 27536 + }, + { + "epoch": 0.6968393349697598, + "grad_norm": 6.2905049324035645, + "learning_rate": 2.1407946183071783e-06, + "loss": 0.1211, + "step": 27537 + }, + { + "epoch": 0.6968646405344535, + "grad_norm": 2.2070810794830322, + "learning_rate": 2.1404652327210925e-06, + "loss": 0.1198, + "step": 27538 + }, + { + "epoch": 0.6968899460991472, + "grad_norm": 2.4080326557159424, + "learning_rate": 2.140135865575616e-06, + "loss": 0.0994, + "step": 27539 + }, + { + "epoch": 0.6969152516638408, + "grad_norm": 12.574975967407227, + "learning_rate": 2.1398065168728714e-06, + "loss": 0.2537, + "step": 27540 + }, + { + "epoch": 0.6969405572285345, + "grad_norm": 9.47998046875, + "learning_rate": 2.139477186614983e-06, + "loss": 0.1508, + "step": 27541 + }, + { + "epoch": 0.6969658627932283, + "grad_norm": 6.665768623352051, + "learning_rate": 2.1391478748040717e-06, + "loss": 0.1453, + "step": 27542 + }, + { + "epoch": 0.6969911683579219, + "grad_norm": 8.726054191589355, + "learning_rate": 2.1388185814422657e-06, + "loss": 0.2554, + "step": 27543 + }, + { + "epoch": 0.6970164739226156, + "grad_norm": 7.680868148803711, + "learning_rate": 2.138489306531687e-06, + "loss": 0.1656, + "step": 27544 + }, + { + "epoch": 0.6970417794873093, + "grad_norm": 2.9661953449249268, + "learning_rate": 2.1381600500744574e-06, + "loss": 0.1, + "step": 27545 + }, + { + "epoch": 0.697067085052003, + "grad_norm": 4.857685089111328, + "learning_rate": 2.137830812072702e-06, + "loss": 0.1807, + "step": 27546 + }, + { + "epoch": 0.6970923906166966, + "grad_norm": 3.5511162281036377, + "learning_rate": 2.1375015925285415e-06, + "loss": 0.1321, + "step": 27547 + }, + { + "epoch": 0.6971176961813903, + "grad_norm": 3.431468963623047, + "learning_rate": 2.1371723914441027e-06, + "loss": 0.0985, + "step": 27548 + }, + { + "epoch": 0.697143001746084, + "grad_norm": 4.473067760467529, + "learning_rate": 2.1368432088215064e-06, + "loss": 0.1944, + "step": 27549 + }, + { + "epoch": 0.6971683073107776, + "grad_norm": 8.008729934692383, + "learning_rate": 2.1365140446628756e-06, + "loss": 0.2113, + "step": 27550 + }, + { + "epoch": 0.6971936128754713, + "grad_norm": 5.560958385467529, + "learning_rate": 2.136184898970331e-06, + "loss": 0.1602, + "step": 27551 + }, + { + "epoch": 0.697218918440165, + "grad_norm": 5.746326446533203, + "learning_rate": 2.135855771745999e-06, + "loss": 0.1109, + "step": 27552 + }, + { + "epoch": 0.6972442240048586, + "grad_norm": 3.5189690589904785, + "learning_rate": 2.1355266629920014e-06, + "loss": 0.1254, + "step": 27553 + }, + { + "epoch": 0.6972695295695523, + "grad_norm": 3.999953031539917, + "learning_rate": 2.1351975727104586e-06, + "loss": 0.1421, + "step": 27554 + }, + { + "epoch": 0.6972948351342461, + "grad_norm": 3.5119285583496094, + "learning_rate": 2.1348685009034925e-06, + "loss": 0.0855, + "step": 27555 + }, + { + "epoch": 0.6973201406989397, + "grad_norm": 4.169480323791504, + "learning_rate": 2.134539447573228e-06, + "loss": 0.1377, + "step": 27556 + }, + { + "epoch": 0.6973454462636334, + "grad_norm": 3.5832455158233643, + "learning_rate": 2.1342104127217834e-06, + "loss": 0.1495, + "step": 27557 + }, + { + "epoch": 0.6973707518283271, + "grad_norm": 2.86824893951416, + "learning_rate": 2.1338813963512865e-06, + "loss": 0.1013, + "step": 27558 + }, + { + "epoch": 0.6973960573930207, + "grad_norm": 12.959683418273926, + "learning_rate": 2.1335523984638523e-06, + "loss": 0.1644, + "step": 27559 + }, + { + "epoch": 0.6974213629577144, + "grad_norm": 4.396105766296387, + "learning_rate": 2.133223419061607e-06, + "loss": 0.1338, + "step": 27560 + }, + { + "epoch": 0.6974466685224081, + "grad_norm": 3.329648017883301, + "learning_rate": 2.132894458146669e-06, + "loss": 0.0715, + "step": 27561 + }, + { + "epoch": 0.6974719740871017, + "grad_norm": 12.090801239013672, + "learning_rate": 2.1325655157211644e-06, + "loss": 0.151, + "step": 27562 + }, + { + "epoch": 0.6974972796517954, + "grad_norm": 8.315361022949219, + "learning_rate": 2.132236591787208e-06, + "loss": 0.2357, + "step": 27563 + }, + { + "epoch": 0.6975225852164891, + "grad_norm": 5.324070930480957, + "learning_rate": 2.131907686346926e-06, + "loss": 0.14, + "step": 27564 + }, + { + "epoch": 0.6975478907811827, + "grad_norm": 3.203617811203003, + "learning_rate": 2.1315787994024367e-06, + "loss": 0.1209, + "step": 27565 + }, + { + "epoch": 0.6975731963458764, + "grad_norm": 4.678009510040283, + "learning_rate": 2.1312499309558636e-06, + "loss": 0.0965, + "step": 27566 + }, + { + "epoch": 0.6975985019105702, + "grad_norm": 7.349016189575195, + "learning_rate": 2.130921081009326e-06, + "loss": 0.2605, + "step": 27567 + }, + { + "epoch": 0.6976238074752638, + "grad_norm": 3.1534852981567383, + "learning_rate": 2.1305922495649444e-06, + "loss": 0.1276, + "step": 27568 + }, + { + "epoch": 0.6976491130399575, + "grad_norm": 4.919293403625488, + "learning_rate": 2.130263436624838e-06, + "loss": 0.162, + "step": 27569 + }, + { + "epoch": 0.6976744186046512, + "grad_norm": 5.763008117675781, + "learning_rate": 2.129934642191131e-06, + "loss": 0.141, + "step": 27570 + }, + { + "epoch": 0.6976997241693449, + "grad_norm": 12.698646545410156, + "learning_rate": 2.1296058662659415e-06, + "loss": 0.2793, + "step": 27571 + }, + { + "epoch": 0.6977250297340385, + "grad_norm": 14.293646812438965, + "learning_rate": 2.1292771088513897e-06, + "loss": 0.3626, + "step": 27572 + }, + { + "epoch": 0.6977503352987322, + "grad_norm": 9.462432861328125, + "learning_rate": 2.128948369949596e-06, + "loss": 0.2045, + "step": 27573 + }, + { + "epoch": 0.6977756408634259, + "grad_norm": 15.614653587341309, + "learning_rate": 2.1286196495626783e-06, + "loss": 0.3218, + "step": 27574 + }, + { + "epoch": 0.6978009464281195, + "grad_norm": 4.67990255355835, + "learning_rate": 2.1282909476927598e-06, + "loss": 0.1516, + "step": 27575 + }, + { + "epoch": 0.6978262519928132, + "grad_norm": 8.498825073242188, + "learning_rate": 2.1279622643419596e-06, + "loss": 0.2482, + "step": 27576 + }, + { + "epoch": 0.6978515575575069, + "grad_norm": 5.192583084106445, + "learning_rate": 2.127633599512396e-06, + "loss": 0.1379, + "step": 27577 + }, + { + "epoch": 0.6978768631222005, + "grad_norm": 4.680853366851807, + "learning_rate": 2.127304953206187e-06, + "loss": 0.1316, + "step": 27578 + }, + { + "epoch": 0.6979021686868943, + "grad_norm": 6.131274700164795, + "learning_rate": 2.126976325425456e-06, + "loss": 0.1118, + "step": 27579 + }, + { + "epoch": 0.697927474251588, + "grad_norm": 3.794783592224121, + "learning_rate": 2.1266477161723204e-06, + "loss": 0.1309, + "step": 27580 + }, + { + "epoch": 0.6979527798162816, + "grad_norm": 7.209654331207275, + "learning_rate": 2.126319125448899e-06, + "loss": 0.1849, + "step": 27581 + }, + { + "epoch": 0.6979780853809753, + "grad_norm": 8.406261444091797, + "learning_rate": 2.1259905532573095e-06, + "loss": 0.1756, + "step": 27582 + }, + { + "epoch": 0.698003390945669, + "grad_norm": 4.8089165687561035, + "learning_rate": 2.125661999599674e-06, + "loss": 0.1648, + "step": 27583 + }, + { + "epoch": 0.6980286965103626, + "grad_norm": 8.546879768371582, + "learning_rate": 2.125333464478109e-06, + "loss": 0.371, + "step": 27584 + }, + { + "epoch": 0.6980540020750563, + "grad_norm": 3.6700923442840576, + "learning_rate": 2.1250049478947347e-06, + "loss": 0.1013, + "step": 27585 + }, + { + "epoch": 0.69807930763975, + "grad_norm": 12.64027214050293, + "learning_rate": 2.1246764498516664e-06, + "loss": 0.2798, + "step": 27586 + }, + { + "epoch": 0.6981046132044436, + "grad_norm": 10.150601387023926, + "learning_rate": 2.124347970351027e-06, + "loss": 0.2073, + "step": 27587 + }, + { + "epoch": 0.6981299187691373, + "grad_norm": 7.200306415557861, + "learning_rate": 2.1240195093949306e-06, + "loss": 0.2577, + "step": 27588 + }, + { + "epoch": 0.698155224333831, + "grad_norm": 10.581904411315918, + "learning_rate": 2.123691066985501e-06, + "loss": 0.1949, + "step": 27589 + }, + { + "epoch": 0.6981805298985246, + "grad_norm": 3.9543910026550293, + "learning_rate": 2.123362643124849e-06, + "loss": 0.1376, + "step": 27590 + }, + { + "epoch": 0.6982058354632183, + "grad_norm": 4.020447254180908, + "learning_rate": 2.123034237815098e-06, + "loss": 0.1976, + "step": 27591 + }, + { + "epoch": 0.6982311410279121, + "grad_norm": 8.170334815979004, + "learning_rate": 2.122705851058362e-06, + "loss": 0.1554, + "step": 27592 + }, + { + "epoch": 0.6982564465926057, + "grad_norm": 3.9948203563690186, + "learning_rate": 2.122377482856765e-06, + "loss": 0.1189, + "step": 27593 + }, + { + "epoch": 0.6982817521572994, + "grad_norm": 21.866600036621094, + "learning_rate": 2.122049133212416e-06, + "loss": 0.1622, + "step": 27594 + }, + { + "epoch": 0.6983070577219931, + "grad_norm": 5.674008846282959, + "learning_rate": 2.1217208021274393e-06, + "loss": 0.2224, + "step": 27595 + }, + { + "epoch": 0.6983323632866868, + "grad_norm": 4.61241340637207, + "learning_rate": 2.121392489603947e-06, + "loss": 0.1624, + "step": 27596 + }, + { + "epoch": 0.6983576688513804, + "grad_norm": 11.796738624572754, + "learning_rate": 2.121064195644062e-06, + "loss": 0.1671, + "step": 27597 + }, + { + "epoch": 0.6983829744160741, + "grad_norm": 5.368581295013428, + "learning_rate": 2.1207359202498977e-06, + "loss": 0.1811, + "step": 27598 + }, + { + "epoch": 0.6984082799807678, + "grad_norm": 2.970767021179199, + "learning_rate": 2.120407663423573e-06, + "loss": 0.1236, + "step": 27599 + }, + { + "epoch": 0.6984335855454614, + "grad_norm": 9.856359481811523, + "learning_rate": 2.120079425167202e-06, + "loss": 0.2796, + "step": 27600 + }, + { + "epoch": 0.6984588911101551, + "grad_norm": 9.3799467086792, + "learning_rate": 2.1197512054829023e-06, + "loss": 0.1544, + "step": 27601 + }, + { + "epoch": 0.6984841966748488, + "grad_norm": 10.15410041809082, + "learning_rate": 2.1194230043727928e-06, + "loss": 0.2412, + "step": 27602 + }, + { + "epoch": 0.6985095022395424, + "grad_norm": 3.201524019241333, + "learning_rate": 2.1190948218389886e-06, + "loss": 0.0988, + "step": 27603 + }, + { + "epoch": 0.6985348078042362, + "grad_norm": 5.3015971183776855, + "learning_rate": 2.1187666578836057e-06, + "loss": 0.1907, + "step": 27604 + }, + { + "epoch": 0.6985601133689299, + "grad_norm": 3.351844072341919, + "learning_rate": 2.118438512508759e-06, + "loss": 0.1262, + "step": 27605 + }, + { + "epoch": 0.6985854189336235, + "grad_norm": 12.377457618713379, + "learning_rate": 2.1181103857165687e-06, + "loss": 0.2495, + "step": 27606 + }, + { + "epoch": 0.6986107244983172, + "grad_norm": 3.7525548934936523, + "learning_rate": 2.117782277509148e-06, + "loss": 0.1108, + "step": 27607 + }, + { + "epoch": 0.6986360300630109, + "grad_norm": 6.412662506103516, + "learning_rate": 2.1174541878886134e-06, + "loss": 0.2461, + "step": 27608 + }, + { + "epoch": 0.6986613356277045, + "grad_norm": 3.6610140800476074, + "learning_rate": 2.1171261168570782e-06, + "loss": 0.1679, + "step": 27609 + }, + { + "epoch": 0.6986866411923982, + "grad_norm": 4.214834690093994, + "learning_rate": 2.116798064416663e-06, + "loss": 0.1044, + "step": 27610 + }, + { + "epoch": 0.6987119467570919, + "grad_norm": 3.0762412548065186, + "learning_rate": 2.11647003056948e-06, + "loss": 0.1439, + "step": 27611 + }, + { + "epoch": 0.6987372523217855, + "grad_norm": 3.100855588912964, + "learning_rate": 2.116142015317646e-06, + "loss": 0.1757, + "step": 27612 + }, + { + "epoch": 0.6987625578864792, + "grad_norm": 2.9829893112182617, + "learning_rate": 2.1158140186632738e-06, + "loss": 0.1343, + "step": 27613 + }, + { + "epoch": 0.6987878634511729, + "grad_norm": 4.710530757904053, + "learning_rate": 2.115486040608482e-06, + "loss": 0.1234, + "step": 27614 + }, + { + "epoch": 0.6988131690158665, + "grad_norm": 4.365464210510254, + "learning_rate": 2.1151580811553848e-06, + "loss": 0.1644, + "step": 27615 + }, + { + "epoch": 0.6988384745805603, + "grad_norm": 4.5556960105896, + "learning_rate": 2.1148301403060962e-06, + "loss": 0.1615, + "step": 27616 + }, + { + "epoch": 0.698863780145254, + "grad_norm": 5.580996036529541, + "learning_rate": 2.1145022180627296e-06, + "loss": 0.2038, + "step": 27617 + }, + { + "epoch": 0.6988890857099476, + "grad_norm": 3.6780507564544678, + "learning_rate": 2.114174314427403e-06, + "loss": 0.14, + "step": 27618 + }, + { + "epoch": 0.6989143912746413, + "grad_norm": 4.149040222167969, + "learning_rate": 2.1138464294022277e-06, + "loss": 0.1126, + "step": 27619 + }, + { + "epoch": 0.698939696839335, + "grad_norm": 3.240119695663452, + "learning_rate": 2.1135185629893234e-06, + "loss": 0.1534, + "step": 27620 + }, + { + "epoch": 0.6989650024040287, + "grad_norm": 2.996013879776001, + "learning_rate": 2.113190715190797e-06, + "loss": 0.1418, + "step": 27621 + }, + { + "epoch": 0.6989903079687223, + "grad_norm": 7.186823844909668, + "learning_rate": 2.1128628860087693e-06, + "loss": 0.1458, + "step": 27622 + }, + { + "epoch": 0.699015613533416, + "grad_norm": 3.0761194229125977, + "learning_rate": 2.11253507544535e-06, + "loss": 0.0973, + "step": 27623 + }, + { + "epoch": 0.6990409190981097, + "grad_norm": 3.7712061405181885, + "learning_rate": 2.112207283502658e-06, + "loss": 0.1829, + "step": 27624 + }, + { + "epoch": 0.6990662246628033, + "grad_norm": 8.011920928955078, + "learning_rate": 2.1118795101828004e-06, + "loss": 0.2479, + "step": 27625 + }, + { + "epoch": 0.699091530227497, + "grad_norm": 9.478416442871094, + "learning_rate": 2.1115517554878967e-06, + "loss": 0.1575, + "step": 27626 + }, + { + "epoch": 0.6991168357921907, + "grad_norm": 9.9962158203125, + "learning_rate": 2.1112240194200566e-06, + "loss": 0.3067, + "step": 27627 + }, + { + "epoch": 0.6991421413568844, + "grad_norm": 5.716446399688721, + "learning_rate": 2.1108963019813986e-06, + "loss": 0.1008, + "step": 27628 + }, + { + "epoch": 0.6991674469215781, + "grad_norm": 3.3062615394592285, + "learning_rate": 2.1105686031740304e-06, + "loss": 0.1073, + "step": 27629 + }, + { + "epoch": 0.6991927524862718, + "grad_norm": 8.15294361114502, + "learning_rate": 2.1102409230000687e-06, + "loss": 0.2837, + "step": 27630 + }, + { + "epoch": 0.6992180580509654, + "grad_norm": 4.149469375610352, + "learning_rate": 2.1099132614616262e-06, + "loss": 0.1501, + "step": 27631 + }, + { + "epoch": 0.6992433636156591, + "grad_norm": 5.584080696105957, + "learning_rate": 2.109585618560813e-06, + "loss": 0.1476, + "step": 27632 + }, + { + "epoch": 0.6992686691803528, + "grad_norm": 5.967167377471924, + "learning_rate": 2.109257994299747e-06, + "loss": 0.143, + "step": 27633 + }, + { + "epoch": 0.6992939747450464, + "grad_norm": 5.7902512550354, + "learning_rate": 2.1089303886805384e-06, + "loss": 0.1756, + "step": 27634 + }, + { + "epoch": 0.6993192803097401, + "grad_norm": 5.429149150848389, + "learning_rate": 2.1086028017052997e-06, + "loss": 0.2072, + "step": 27635 + }, + { + "epoch": 0.6993445858744338, + "grad_norm": 2.2642815113067627, + "learning_rate": 2.108275233376142e-06, + "loss": 0.056, + "step": 27636 + }, + { + "epoch": 0.6993698914391274, + "grad_norm": 9.2726411819458, + "learning_rate": 2.107947683695183e-06, + "loss": 0.2273, + "step": 27637 + }, + { + "epoch": 0.6993951970038211, + "grad_norm": 4.242557525634766, + "learning_rate": 2.107620152664528e-06, + "loss": 0.1726, + "step": 27638 + }, + { + "epoch": 0.6994205025685148, + "grad_norm": 4.2042341232299805, + "learning_rate": 2.1072926402862943e-06, + "loss": 0.1997, + "step": 27639 + }, + { + "epoch": 0.6994458081332084, + "grad_norm": 3.6231894493103027, + "learning_rate": 2.1069651465625903e-06, + "loss": 0.1756, + "step": 27640 + }, + { + "epoch": 0.6994711136979022, + "grad_norm": 6.56373405456543, + "learning_rate": 2.1066376714955315e-06, + "loss": 0.163, + "step": 27641 + }, + { + "epoch": 0.6994964192625959, + "grad_norm": 3.8471720218658447, + "learning_rate": 2.1063102150872287e-06, + "loss": 0.1316, + "step": 27642 + }, + { + "epoch": 0.6995217248272895, + "grad_norm": 19.330442428588867, + "learning_rate": 2.1059827773397927e-06, + "loss": 0.3158, + "step": 27643 + }, + { + "epoch": 0.6995470303919832, + "grad_norm": 2.5652577877044678, + "learning_rate": 2.105655358255334e-06, + "loss": 0.118, + "step": 27644 + }, + { + "epoch": 0.6995723359566769, + "grad_norm": 15.182904243469238, + "learning_rate": 2.105327957835967e-06, + "loss": 0.1535, + "step": 27645 + }, + { + "epoch": 0.6995976415213705, + "grad_norm": 2.212641954421997, + "learning_rate": 2.1050005760838018e-06, + "loss": 0.0812, + "step": 27646 + }, + { + "epoch": 0.6996229470860642, + "grad_norm": 2.8936359882354736, + "learning_rate": 2.1046732130009488e-06, + "loss": 0.0683, + "step": 27647 + }, + { + "epoch": 0.6996482526507579, + "grad_norm": 19.21117401123047, + "learning_rate": 2.1043458685895187e-06, + "loss": 0.2421, + "step": 27648 + }, + { + "epoch": 0.6996735582154516, + "grad_norm": 7.475083827972412, + "learning_rate": 2.1040185428516245e-06, + "loss": 0.1894, + "step": 27649 + }, + { + "epoch": 0.6996988637801452, + "grad_norm": 3.986818552017212, + "learning_rate": 2.1036912357893745e-06, + "loss": 0.12, + "step": 27650 + }, + { + "epoch": 0.6997241693448389, + "grad_norm": 2.3485491275787354, + "learning_rate": 2.103363947404885e-06, + "loss": 0.0703, + "step": 27651 + }, + { + "epoch": 0.6997494749095327, + "grad_norm": 6.08728551864624, + "learning_rate": 2.1030366777002586e-06, + "loss": 0.2017, + "step": 27652 + }, + { + "epoch": 0.6997747804742263, + "grad_norm": 6.755975246429443, + "learning_rate": 2.1027094266776114e-06, + "loss": 0.1279, + "step": 27653 + }, + { + "epoch": 0.69980008603892, + "grad_norm": 4.14231014251709, + "learning_rate": 2.102382194339051e-06, + "loss": 0.1126, + "step": 27654 + }, + { + "epoch": 0.6998253916036137, + "grad_norm": 8.824233055114746, + "learning_rate": 2.102054980686692e-06, + "loss": 0.1481, + "step": 27655 + }, + { + "epoch": 0.6998506971683073, + "grad_norm": 3.6093790531158447, + "learning_rate": 2.1017277857226377e-06, + "loss": 0.1194, + "step": 27656 + }, + { + "epoch": 0.699876002733001, + "grad_norm": 5.01884651184082, + "learning_rate": 2.1014006094490036e-06, + "loss": 0.1817, + "step": 27657 + }, + { + "epoch": 0.6999013082976947, + "grad_norm": 4.580370903015137, + "learning_rate": 2.1010734518678975e-06, + "loss": 0.1985, + "step": 27658 + }, + { + "epoch": 0.6999266138623883, + "grad_norm": 4.500734329223633, + "learning_rate": 2.1007463129814303e-06, + "loss": 0.1468, + "step": 27659 + }, + { + "epoch": 0.699951919427082, + "grad_norm": 5.41287899017334, + "learning_rate": 2.1004191927917086e-06, + "loss": 0.1451, + "step": 27660 + }, + { + "epoch": 0.6999772249917757, + "grad_norm": 3.6486613750457764, + "learning_rate": 2.100092091300846e-06, + "loss": 0.1455, + "step": 27661 + }, + { + "epoch": 0.7000025305564693, + "grad_norm": 5.512413501739502, + "learning_rate": 2.09976500851095e-06, + "loss": 0.1234, + "step": 27662 + }, + { + "epoch": 0.700027836121163, + "grad_norm": 4.928278923034668, + "learning_rate": 2.099437944424128e-06, + "loss": 0.1498, + "step": 27663 + }, + { + "epoch": 0.7000531416858568, + "grad_norm": 6.094090938568115, + "learning_rate": 2.099110899042493e-06, + "loss": 0.2182, + "step": 27664 + }, + { + "epoch": 0.7000531416858568, + "eval_loss": 0.1694859266281128, + "eval_runtime": 69.8621, + "eval_samples_per_second": 45.719, + "eval_steps_per_second": 5.726, + "step": 27664 + }, + { + "epoch": 0.7000784472505504, + "grad_norm": 4.3385748863220215, + "learning_rate": 2.0987838723681524e-06, + "loss": 0.1278, + "step": 27665 + }, + { + "epoch": 0.7001037528152441, + "grad_norm": 2.5526552200317383, + "learning_rate": 2.098456864403215e-06, + "loss": 0.1129, + "step": 27666 + }, + { + "epoch": 0.7001290583799378, + "grad_norm": 3.5533525943756104, + "learning_rate": 2.0981298751497875e-06, + "loss": 0.1859, + "step": 27667 + }, + { + "epoch": 0.7001543639446314, + "grad_norm": 9.93692684173584, + "learning_rate": 2.0978029046099844e-06, + "loss": 0.1912, + "step": 27668 + }, + { + "epoch": 0.7001796695093251, + "grad_norm": 6.298692226409912, + "learning_rate": 2.0974759527859066e-06, + "loss": 0.1977, + "step": 27669 + }, + { + "epoch": 0.7002049750740188, + "grad_norm": 16.032442092895508, + "learning_rate": 2.097149019679669e-06, + "loss": 0.1568, + "step": 27670 + }, + { + "epoch": 0.7002302806387124, + "grad_norm": 3.067455768585205, + "learning_rate": 2.096822105293375e-06, + "loss": 0.0895, + "step": 27671 + }, + { + "epoch": 0.7002555862034061, + "grad_norm": 5.5779643058776855, + "learning_rate": 2.0964952096291376e-06, + "loss": 0.1453, + "step": 27672 + }, + { + "epoch": 0.7002808917680998, + "grad_norm": 3.8703534603118896, + "learning_rate": 2.096168332689062e-06, + "loss": 0.1092, + "step": 27673 + }, + { + "epoch": 0.7003061973327935, + "grad_norm": 3.55534291267395, + "learning_rate": 2.0958414744752564e-06, + "loss": 0.1485, + "step": 27674 + }, + { + "epoch": 0.7003315028974871, + "grad_norm": 5.158123016357422, + "learning_rate": 2.0955146349898275e-06, + "loss": 0.1159, + "step": 27675 + }, + { + "epoch": 0.7003568084621808, + "grad_norm": 4.2834086418151855, + "learning_rate": 2.0951878142348864e-06, + "loss": 0.1871, + "step": 27676 + }, + { + "epoch": 0.7003821140268746, + "grad_norm": 6.392881870269775, + "learning_rate": 2.0948610122125384e-06, + "loss": 0.1391, + "step": 27677 + }, + { + "epoch": 0.7004074195915682, + "grad_norm": 4.447035789489746, + "learning_rate": 2.0945342289248915e-06, + "loss": 0.1172, + "step": 27678 + }, + { + "epoch": 0.7004327251562619, + "grad_norm": 12.799285888671875, + "learning_rate": 2.094207464374051e-06, + "loss": 0.2729, + "step": 27679 + }, + { + "epoch": 0.7004580307209556, + "grad_norm": 7.765085220336914, + "learning_rate": 2.093880718562128e-06, + "loss": 0.2007, + "step": 27680 + }, + { + "epoch": 0.7004833362856492, + "grad_norm": 5.817584991455078, + "learning_rate": 2.0935539914912273e-06, + "loss": 0.1439, + "step": 27681 + }, + { + "epoch": 0.7005086418503429, + "grad_norm": 7.015757083892822, + "learning_rate": 2.093227283163457e-06, + "loss": 0.1308, + "step": 27682 + }, + { + "epoch": 0.7005339474150366, + "grad_norm": 5.5595927238464355, + "learning_rate": 2.092900593580921e-06, + "loss": 0.1773, + "step": 27683 + }, + { + "epoch": 0.7005592529797302, + "grad_norm": 3.6570162773132324, + "learning_rate": 2.0925739227457296e-06, + "loss": 0.1473, + "step": 27684 + }, + { + "epoch": 0.7005845585444239, + "grad_norm": 4.6168107986450195, + "learning_rate": 2.092247270659989e-06, + "loss": 0.1545, + "step": 27685 + }, + { + "epoch": 0.7006098641091176, + "grad_norm": 17.31300926208496, + "learning_rate": 2.0919206373258045e-06, + "loss": 0.1907, + "step": 27686 + }, + { + "epoch": 0.7006351696738112, + "grad_norm": 6.417909145355225, + "learning_rate": 2.091594022745281e-06, + "loss": 0.197, + "step": 27687 + }, + { + "epoch": 0.7006604752385049, + "grad_norm": 4.822574138641357, + "learning_rate": 2.0912674269205287e-06, + "loss": 0.1601, + "step": 27688 + }, + { + "epoch": 0.7006857808031987, + "grad_norm": 5.367356777191162, + "learning_rate": 2.0909408498536517e-06, + "loss": 0.1691, + "step": 27689 + }, + { + "epoch": 0.7007110863678923, + "grad_norm": 5.787850379943848, + "learning_rate": 2.0906142915467557e-06, + "loss": 0.2131, + "step": 27690 + }, + { + "epoch": 0.700736391932586, + "grad_norm": 3.8637583255767822, + "learning_rate": 2.0902877520019454e-06, + "loss": 0.1873, + "step": 27691 + }, + { + "epoch": 0.7007616974972797, + "grad_norm": 5.419188022613525, + "learning_rate": 2.0899612312213297e-06, + "loss": 0.1842, + "step": 27692 + }, + { + "epoch": 0.7007870030619733, + "grad_norm": 17.454559326171875, + "learning_rate": 2.089634729207013e-06, + "loss": 0.3397, + "step": 27693 + }, + { + "epoch": 0.700812308626667, + "grad_norm": 3.6828548908233643, + "learning_rate": 2.0893082459610984e-06, + "loss": 0.091, + "step": 27694 + }, + { + "epoch": 0.7008376141913607, + "grad_norm": 5.968650817871094, + "learning_rate": 2.0889817814856973e-06, + "loss": 0.1812, + "step": 27695 + }, + { + "epoch": 0.7008629197560543, + "grad_norm": 5.097570896148682, + "learning_rate": 2.0886553357829078e-06, + "loss": 0.1425, + "step": 27696 + }, + { + "epoch": 0.700888225320748, + "grad_norm": 5.339632987976074, + "learning_rate": 2.0883289088548403e-06, + "loss": 0.117, + "step": 27697 + }, + { + "epoch": 0.7009135308854417, + "grad_norm": 5.672551155090332, + "learning_rate": 2.088002500703596e-06, + "loss": 0.139, + "step": 27698 + }, + { + "epoch": 0.7009388364501354, + "grad_norm": 3.283097505569458, + "learning_rate": 2.0876761113312854e-06, + "loss": 0.1148, + "step": 27699 + }, + { + "epoch": 0.700964142014829, + "grad_norm": 7.622166633605957, + "learning_rate": 2.087349740740006e-06, + "loss": 0.1698, + "step": 27700 + }, + { + "epoch": 0.7009894475795228, + "grad_norm": 1.8870354890823364, + "learning_rate": 2.087023388931869e-06, + "loss": 0.0747, + "step": 27701 + }, + { + "epoch": 0.7010147531442165, + "grad_norm": 8.206964492797852, + "learning_rate": 2.086697055908974e-06, + "loss": 0.2001, + "step": 27702 + }, + { + "epoch": 0.7010400587089101, + "grad_norm": 3.7068309783935547, + "learning_rate": 2.08637074167343e-06, + "loss": 0.1321, + "step": 27703 + }, + { + "epoch": 0.7010653642736038, + "grad_norm": 3.8536951541900635, + "learning_rate": 2.0860444462273382e-06, + "loss": 0.1441, + "step": 27704 + }, + { + "epoch": 0.7010906698382975, + "grad_norm": 6.246753692626953, + "learning_rate": 2.0857181695728047e-06, + "loss": 0.2043, + "step": 27705 + }, + { + "epoch": 0.7011159754029911, + "grad_norm": 5.174533843994141, + "learning_rate": 2.08539191171193e-06, + "loss": 0.0974, + "step": 27706 + }, + { + "epoch": 0.7011412809676848, + "grad_norm": 5.246518611907959, + "learning_rate": 2.085065672646823e-06, + "loss": 0.174, + "step": 27707 + }, + { + "epoch": 0.7011665865323785, + "grad_norm": 8.976808547973633, + "learning_rate": 2.0847394523795848e-06, + "loss": 0.3574, + "step": 27708 + }, + { + "epoch": 0.7011918920970721, + "grad_norm": 6.69264030456543, + "learning_rate": 2.08441325091232e-06, + "loss": 0.2419, + "step": 27709 + }, + { + "epoch": 0.7012171976617658, + "grad_norm": 6.728362083435059, + "learning_rate": 2.0840870682471297e-06, + "loss": 0.2083, + "step": 27710 + }, + { + "epoch": 0.7012425032264595, + "grad_norm": 2.4369266033172607, + "learning_rate": 2.0837609043861216e-06, + "loss": 0.1036, + "step": 27711 + }, + { + "epoch": 0.7012678087911531, + "grad_norm": 7.149991512298584, + "learning_rate": 2.083434759331397e-06, + "loss": 0.2275, + "step": 27712 + }, + { + "epoch": 0.7012931143558468, + "grad_norm": 6.709836483001709, + "learning_rate": 2.0831086330850594e-06, + "loss": 0.151, + "step": 27713 + }, + { + "epoch": 0.7013184199205406, + "grad_norm": 4.38371467590332, + "learning_rate": 2.082782525649209e-06, + "loss": 0.1704, + "step": 27714 + }, + { + "epoch": 0.7013437254852342, + "grad_norm": 4.28048038482666, + "learning_rate": 2.0824564370259537e-06, + "loss": 0.1304, + "step": 27715 + }, + { + "epoch": 0.7013690310499279, + "grad_norm": 9.484339714050293, + "learning_rate": 2.082130367217394e-06, + "loss": 0.2622, + "step": 27716 + }, + { + "epoch": 0.7013943366146216, + "grad_norm": 5.907866954803467, + "learning_rate": 2.0818043162256327e-06, + "loss": 0.1461, + "step": 27717 + }, + { + "epoch": 0.7014196421793152, + "grad_norm": 8.33736801147461, + "learning_rate": 2.0814782840527713e-06, + "loss": 0.2222, + "step": 27718 + }, + { + "epoch": 0.7014449477440089, + "grad_norm": 2.907083749771118, + "learning_rate": 2.081152270700915e-06, + "loss": 0.1196, + "step": 27719 + }, + { + "epoch": 0.7014702533087026, + "grad_norm": 3.6180484294891357, + "learning_rate": 2.080826276172165e-06, + "loss": 0.111, + "step": 27720 + }, + { + "epoch": 0.7014955588733962, + "grad_norm": 6.90065860748291, + "learning_rate": 2.080500300468623e-06, + "loss": 0.2457, + "step": 27721 + }, + { + "epoch": 0.7015208644380899, + "grad_norm": 7.104518890380859, + "learning_rate": 2.0801743435923918e-06, + "loss": 0.2135, + "step": 27722 + }, + { + "epoch": 0.7015461700027836, + "grad_norm": 15.823494911193848, + "learning_rate": 2.079848405545571e-06, + "loss": 0.2527, + "step": 27723 + }, + { + "epoch": 0.7015714755674773, + "grad_norm": 3.3868589401245117, + "learning_rate": 2.0795224863302664e-06, + "loss": 0.1265, + "step": 27724 + }, + { + "epoch": 0.7015967811321709, + "grad_norm": 5.109666347503662, + "learning_rate": 2.0791965859485762e-06, + "loss": 0.1645, + "step": 27725 + }, + { + "epoch": 0.7016220866968647, + "grad_norm": 11.063187599182129, + "learning_rate": 2.078870704402608e-06, + "loss": 0.2313, + "step": 27726 + }, + { + "epoch": 0.7016473922615584, + "grad_norm": 5.7328596115112305, + "learning_rate": 2.0785448416944552e-06, + "loss": 0.1792, + "step": 27727 + }, + { + "epoch": 0.701672697826252, + "grad_norm": 5.406207084655762, + "learning_rate": 2.0782189978262256e-06, + "loss": 0.1167, + "step": 27728 + }, + { + "epoch": 0.7016980033909457, + "grad_norm": 7.536263942718506, + "learning_rate": 2.0778931728000157e-06, + "loss": 0.2402, + "step": 27729 + }, + { + "epoch": 0.7017233089556394, + "grad_norm": 5.571792125701904, + "learning_rate": 2.0775673666179336e-06, + "loss": 0.2311, + "step": 27730 + }, + { + "epoch": 0.701748614520333, + "grad_norm": 2.921891450881958, + "learning_rate": 2.0772415792820717e-06, + "loss": 0.1061, + "step": 27731 + }, + { + "epoch": 0.7017739200850267, + "grad_norm": 7.791992664337158, + "learning_rate": 2.076915810794537e-06, + "loss": 0.1427, + "step": 27732 + }, + { + "epoch": 0.7017992256497204, + "grad_norm": 9.88974666595459, + "learning_rate": 2.076590061157427e-06, + "loss": 0.1944, + "step": 27733 + }, + { + "epoch": 0.701824531214414, + "grad_norm": 3.5855276584625244, + "learning_rate": 2.0762643303728473e-06, + "loss": 0.1204, + "step": 27734 + }, + { + "epoch": 0.7018498367791077, + "grad_norm": 4.431595325469971, + "learning_rate": 2.0759386184428915e-06, + "loss": 0.2018, + "step": 27735 + }, + { + "epoch": 0.7018751423438014, + "grad_norm": 3.8699967861175537, + "learning_rate": 2.0756129253696654e-06, + "loss": 0.1395, + "step": 27736 + }, + { + "epoch": 0.701900447908495, + "grad_norm": 7.019991397857666, + "learning_rate": 2.0752872511552657e-06, + "loss": 0.1743, + "step": 27737 + }, + { + "epoch": 0.7019257534731888, + "grad_norm": 3.4416770935058594, + "learning_rate": 2.0749615958017965e-06, + "loss": 0.1402, + "step": 27738 + }, + { + "epoch": 0.7019510590378825, + "grad_norm": 4.030529499053955, + "learning_rate": 2.074635959311356e-06, + "loss": 0.1184, + "step": 27739 + }, + { + "epoch": 0.7019763646025761, + "grad_norm": 3.6774423122406006, + "learning_rate": 2.0743103416860437e-06, + "loss": 0.1132, + "step": 27740 + }, + { + "epoch": 0.7020016701672698, + "grad_norm": 2.710771322250366, + "learning_rate": 2.073984742927958e-06, + "loss": 0.1085, + "step": 27741 + }, + { + "epoch": 0.7020269757319635, + "grad_norm": 5.922791957855225, + "learning_rate": 2.0736591630392023e-06, + "loss": 0.1435, + "step": 27742 + }, + { + "epoch": 0.7020522812966571, + "grad_norm": 4.354959964752197, + "learning_rate": 2.073333602021874e-06, + "loss": 0.1794, + "step": 27743 + }, + { + "epoch": 0.7020775868613508, + "grad_norm": 5.087342262268066, + "learning_rate": 2.0730080598780733e-06, + "loss": 0.1843, + "step": 27744 + }, + { + "epoch": 0.7021028924260445, + "grad_norm": 4.509782314300537, + "learning_rate": 2.0726825366098974e-06, + "loss": 0.1955, + "step": 27745 + }, + { + "epoch": 0.7021281979907381, + "grad_norm": 16.172637939453125, + "learning_rate": 2.072357032219449e-06, + "loss": 0.3238, + "step": 27746 + }, + { + "epoch": 0.7021535035554318, + "grad_norm": 9.307528495788574, + "learning_rate": 2.0720315467088252e-06, + "loss": 0.1618, + "step": 27747 + }, + { + "epoch": 0.7021788091201255, + "grad_norm": 15.48363971710205, + "learning_rate": 2.071706080080126e-06, + "loss": 0.2962, + "step": 27748 + }, + { + "epoch": 0.7022041146848192, + "grad_norm": 12.816140174865723, + "learning_rate": 2.0713806323354473e-06, + "loss": 0.3562, + "step": 27749 + }, + { + "epoch": 0.7022294202495128, + "grad_norm": 5.313265800476074, + "learning_rate": 2.071055203476892e-06, + "loss": 0.164, + "step": 27750 + }, + { + "epoch": 0.7022547258142066, + "grad_norm": 4.183597087860107, + "learning_rate": 2.070729793506557e-06, + "loss": 0.1659, + "step": 27751 + }, + { + "epoch": 0.7022800313789003, + "grad_norm": 5.430893421173096, + "learning_rate": 2.0704044024265406e-06, + "loss": 0.1309, + "step": 27752 + }, + { + "epoch": 0.7023053369435939, + "grad_norm": 10.411087036132812, + "learning_rate": 2.070079030238941e-06, + "loss": 0.1708, + "step": 27753 + }, + { + "epoch": 0.7023306425082876, + "grad_norm": 9.352980613708496, + "learning_rate": 2.069753676945855e-06, + "loss": 0.2384, + "step": 27754 + }, + { + "epoch": 0.7023559480729813, + "grad_norm": 8.02923583984375, + "learning_rate": 2.069428342549385e-06, + "loss": 0.0993, + "step": 27755 + }, + { + "epoch": 0.7023812536376749, + "grad_norm": 4.479547500610352, + "learning_rate": 2.0691030270516253e-06, + "loss": 0.107, + "step": 27756 + }, + { + "epoch": 0.7024065592023686, + "grad_norm": 7.437075614929199, + "learning_rate": 2.0687777304546757e-06, + "loss": 0.2675, + "step": 27757 + }, + { + "epoch": 0.7024318647670623, + "grad_norm": 3.40690279006958, + "learning_rate": 2.0684524527606314e-06, + "loss": 0.1244, + "step": 27758 + }, + { + "epoch": 0.7024571703317559, + "grad_norm": 4.482593536376953, + "learning_rate": 2.0681271939715942e-06, + "loss": 0.1651, + "step": 27759 + }, + { + "epoch": 0.7024824758964496, + "grad_norm": 2.920773983001709, + "learning_rate": 2.067801954089657e-06, + "loss": 0.1197, + "step": 27760 + }, + { + "epoch": 0.7025077814611433, + "grad_norm": 8.461207389831543, + "learning_rate": 2.067476733116923e-06, + "loss": 0.222, + "step": 27761 + }, + { + "epoch": 0.7025330870258369, + "grad_norm": 4.8993659019470215, + "learning_rate": 2.067151531055483e-06, + "loss": 0.1481, + "step": 27762 + }, + { + "epoch": 0.7025583925905307, + "grad_norm": 5.823221206665039, + "learning_rate": 2.0668263479074385e-06, + "loss": 0.1209, + "step": 27763 + }, + { + "epoch": 0.7025836981552244, + "grad_norm": 7.2726287841796875, + "learning_rate": 2.066501183674884e-06, + "loss": 0.1994, + "step": 27764 + }, + { + "epoch": 0.702609003719918, + "grad_norm": 4.841038227081299, + "learning_rate": 2.0661760383599207e-06, + "loss": 0.0815, + "step": 27765 + }, + { + "epoch": 0.7026343092846117, + "grad_norm": 6.7272748947143555, + "learning_rate": 2.065850911964639e-06, + "loss": 0.2304, + "step": 27766 + }, + { + "epoch": 0.7026596148493054, + "grad_norm": 3.8970694541931152, + "learning_rate": 2.065525804491141e-06, + "loss": 0.1432, + "step": 27767 + }, + { + "epoch": 0.702684920413999, + "grad_norm": 5.131679058074951, + "learning_rate": 2.065200715941519e-06, + "loss": 0.1994, + "step": 27768 + }, + { + "epoch": 0.7027102259786927, + "grad_norm": 6.80843448638916, + "learning_rate": 2.0648756463178736e-06, + "loss": 0.2433, + "step": 27769 + }, + { + "epoch": 0.7027355315433864, + "grad_norm": 9.223221778869629, + "learning_rate": 2.0645505956222996e-06, + "loss": 0.0868, + "step": 27770 + }, + { + "epoch": 0.70276083710808, + "grad_norm": 4.185217380523682, + "learning_rate": 2.0642255638568926e-06, + "loss": 0.17, + "step": 27771 + }, + { + "epoch": 0.7027861426727737, + "grad_norm": 8.239978790283203, + "learning_rate": 2.063900551023747e-06, + "loss": 0.1769, + "step": 27772 + }, + { + "epoch": 0.7028114482374674, + "grad_norm": 7.7908616065979, + "learning_rate": 2.063575557124962e-06, + "loss": 0.1833, + "step": 27773 + }, + { + "epoch": 0.702836753802161, + "grad_norm": 6.101670265197754, + "learning_rate": 2.0632505821626323e-06, + "loss": 0.1428, + "step": 27774 + }, + { + "epoch": 0.7028620593668548, + "grad_norm": 2.9345338344573975, + "learning_rate": 2.0629256261388534e-06, + "loss": 0.1391, + "step": 27775 + }, + { + "epoch": 0.7028873649315485, + "grad_norm": 4.261826038360596, + "learning_rate": 2.0626006890557193e-06, + "loss": 0.1554, + "step": 27776 + }, + { + "epoch": 0.7029126704962422, + "grad_norm": 4.074701309204102, + "learning_rate": 2.062275770915328e-06, + "loss": 0.1388, + "step": 27777 + }, + { + "epoch": 0.7029379760609358, + "grad_norm": 9.870380401611328, + "learning_rate": 2.0619508717197744e-06, + "loss": 0.2903, + "step": 27778 + }, + { + "epoch": 0.7029632816256295, + "grad_norm": 5.929875373840332, + "learning_rate": 2.061625991471153e-06, + "loss": 0.1734, + "step": 27779 + }, + { + "epoch": 0.7029885871903232, + "grad_norm": 3.3050904273986816, + "learning_rate": 2.061301130171559e-06, + "loss": 0.1149, + "step": 27780 + }, + { + "epoch": 0.7030138927550168, + "grad_norm": 3.610154390335083, + "learning_rate": 2.0609762878230855e-06, + "loss": 0.1571, + "step": 27781 + }, + { + "epoch": 0.7030391983197105, + "grad_norm": 2.6575987339019775, + "learning_rate": 2.060651464427831e-06, + "loss": 0.1112, + "step": 27782 + }, + { + "epoch": 0.7030645038844042, + "grad_norm": 3.929176092147827, + "learning_rate": 2.0603266599878884e-06, + "loss": 0.1767, + "step": 27783 + }, + { + "epoch": 0.7030898094490978, + "grad_norm": 7.4560699462890625, + "learning_rate": 2.0600018745053526e-06, + "loss": 0.2752, + "step": 27784 + }, + { + "epoch": 0.7031151150137915, + "grad_norm": 9.854084968566895, + "learning_rate": 2.059677107982316e-06, + "loss": 0.2509, + "step": 27785 + }, + { + "epoch": 0.7031404205784852, + "grad_norm": 3.8978614807128906, + "learning_rate": 2.059352360420876e-06, + "loss": 0.2112, + "step": 27786 + }, + { + "epoch": 0.7031657261431788, + "grad_norm": 4.683980941772461, + "learning_rate": 2.0590276318231264e-06, + "loss": 0.149, + "step": 27787 + }, + { + "epoch": 0.7031910317078726, + "grad_norm": 4.141844272613525, + "learning_rate": 2.0587029221911598e-06, + "loss": 0.1239, + "step": 27788 + }, + { + "epoch": 0.7032163372725663, + "grad_norm": 3.1537387371063232, + "learning_rate": 2.0583782315270696e-06, + "loss": 0.1585, + "step": 27789 + }, + { + "epoch": 0.7032416428372599, + "grad_norm": 4.872953414916992, + "learning_rate": 2.0580535598329527e-06, + "loss": 0.1343, + "step": 27790 + }, + { + "epoch": 0.7032669484019536, + "grad_norm": 10.55229377746582, + "learning_rate": 2.0577289071108992e-06, + "loss": 0.2541, + "step": 27791 + }, + { + "epoch": 0.7032922539666473, + "grad_norm": 5.328071117401123, + "learning_rate": 2.0574042733630084e-06, + "loss": 0.1578, + "step": 27792 + }, + { + "epoch": 0.7033175595313409, + "grad_norm": 10.966142654418945, + "learning_rate": 2.0570796585913665e-06, + "loss": 0.2099, + "step": 27793 + }, + { + "epoch": 0.7033428650960346, + "grad_norm": 2.6719579696655273, + "learning_rate": 2.0567550627980725e-06, + "loss": 0.1377, + "step": 27794 + }, + { + "epoch": 0.7033681706607283, + "grad_norm": 3.4101381301879883, + "learning_rate": 2.0564304859852156e-06, + "loss": 0.1368, + "step": 27795 + }, + { + "epoch": 0.7033934762254219, + "grad_norm": 35.21539306640625, + "learning_rate": 2.0561059281548944e-06, + "loss": 0.2059, + "step": 27796 + }, + { + "epoch": 0.7034187817901156, + "grad_norm": 3.602919101715088, + "learning_rate": 2.055781389309195e-06, + "loss": 0.1414, + "step": 27797 + }, + { + "epoch": 0.7034440873548093, + "grad_norm": 3.3964221477508545, + "learning_rate": 2.055456869450216e-06, + "loss": 0.1504, + "step": 27798 + }, + { + "epoch": 0.703469392919503, + "grad_norm": 5.107086658477783, + "learning_rate": 2.0551323685800457e-06, + "loss": 0.1482, + "step": 27799 + }, + { + "epoch": 0.7034946984841967, + "grad_norm": 3.808326244354248, + "learning_rate": 2.0548078867007802e-06, + "loss": 0.1616, + "step": 27800 + }, + { + "epoch": 0.7035200040488904, + "grad_norm": 13.533973693847656, + "learning_rate": 2.0544834238145116e-06, + "loss": 0.3603, + "step": 27801 + }, + { + "epoch": 0.7035453096135841, + "grad_norm": 3.8748786449432373, + "learning_rate": 2.0541589799233313e-06, + "loss": 0.1427, + "step": 27802 + }, + { + "epoch": 0.7035706151782777, + "grad_norm": 7.368249416351318, + "learning_rate": 2.05383455502933e-06, + "loss": 0.1941, + "step": 27803 + }, + { + "epoch": 0.7035959207429714, + "grad_norm": 9.168248176574707, + "learning_rate": 2.053510149134603e-06, + "loss": 0.1618, + "step": 27804 + }, + { + "epoch": 0.7036212263076651, + "grad_norm": 9.259687423706055, + "learning_rate": 2.0531857622412417e-06, + "loss": 0.1919, + "step": 27805 + }, + { + "epoch": 0.7036465318723587, + "grad_norm": 5.056869983673096, + "learning_rate": 2.0528613943513364e-06, + "loss": 0.2144, + "step": 27806 + }, + { + "epoch": 0.7036718374370524, + "grad_norm": 5.965600967407227, + "learning_rate": 2.05253704546698e-06, + "loss": 0.1749, + "step": 27807 + }, + { + "epoch": 0.7036971430017461, + "grad_norm": 5.6486711502075195, + "learning_rate": 2.0522127155902626e-06, + "loss": 0.1281, + "step": 27808 + }, + { + "epoch": 0.7037224485664397, + "grad_norm": 5.090156078338623, + "learning_rate": 2.051888404723279e-06, + "loss": 0.1339, + "step": 27809 + }, + { + "epoch": 0.7037477541311334, + "grad_norm": 11.073982238769531, + "learning_rate": 2.051564112868118e-06, + "loss": 0.1662, + "step": 27810 + }, + { + "epoch": 0.7037730596958272, + "grad_norm": 5.772462844848633, + "learning_rate": 2.0512398400268717e-06, + "loss": 0.1253, + "step": 27811 + }, + { + "epoch": 0.7037983652605208, + "grad_norm": 4.232749938964844, + "learning_rate": 2.05091558620163e-06, + "loss": 0.0935, + "step": 27812 + }, + { + "epoch": 0.7038236708252145, + "grad_norm": 2.2248618602752686, + "learning_rate": 2.0505913513944865e-06, + "loss": 0.0797, + "step": 27813 + }, + { + "epoch": 0.7038489763899082, + "grad_norm": 3.444835662841797, + "learning_rate": 2.0502671356075306e-06, + "loss": 0.1419, + "step": 27814 + }, + { + "epoch": 0.7038742819546018, + "grad_norm": 4.936405658721924, + "learning_rate": 2.0499429388428534e-06, + "loss": 0.2283, + "step": 27815 + }, + { + "epoch": 0.7038995875192955, + "grad_norm": 5.048618316650391, + "learning_rate": 2.049618761102543e-06, + "loss": 0.1749, + "step": 27816 + }, + { + "epoch": 0.7039248930839892, + "grad_norm": 3.234152317047119, + "learning_rate": 2.049294602388695e-06, + "loss": 0.1273, + "step": 27817 + }, + { + "epoch": 0.7039501986486828, + "grad_norm": 3.9761769771575928, + "learning_rate": 2.048970462703397e-06, + "loss": 0.1398, + "step": 27818 + }, + { + "epoch": 0.7039755042133765, + "grad_norm": 4.8435540199279785, + "learning_rate": 2.0486463420487395e-06, + "loss": 0.1557, + "step": 27819 + }, + { + "epoch": 0.7040008097780702, + "grad_norm": 2.6986019611358643, + "learning_rate": 2.048322240426811e-06, + "loss": 0.1017, + "step": 27820 + }, + { + "epoch": 0.7040261153427638, + "grad_norm": 3.0774266719818115, + "learning_rate": 2.047998157839705e-06, + "loss": 0.1224, + "step": 27821 + }, + { + "epoch": 0.7040514209074575, + "grad_norm": 7.998197078704834, + "learning_rate": 2.0476740942895084e-06, + "loss": 0.2109, + "step": 27822 + }, + { + "epoch": 0.7040767264721512, + "grad_norm": 3.4419875144958496, + "learning_rate": 2.0473500497783153e-06, + "loss": 0.0793, + "step": 27823 + }, + { + "epoch": 0.7041020320368448, + "grad_norm": 6.42389440536499, + "learning_rate": 2.0470260243082095e-06, + "loss": 0.2173, + "step": 27824 + }, + { + "epoch": 0.7041273376015386, + "grad_norm": 5.097662448883057, + "learning_rate": 2.046702017881285e-06, + "loss": 0.2029, + "step": 27825 + }, + { + "epoch": 0.7041526431662323, + "grad_norm": 3.1727406978607178, + "learning_rate": 2.0463780304996285e-06, + "loss": 0.1534, + "step": 27826 + }, + { + "epoch": 0.704177948730926, + "grad_norm": 3.6262874603271484, + "learning_rate": 2.0460540621653343e-06, + "loss": 0.13, + "step": 27827 + }, + { + "epoch": 0.7042032542956196, + "grad_norm": 5.979920387268066, + "learning_rate": 2.0457301128804837e-06, + "loss": 0.198, + "step": 27828 + }, + { + "epoch": 0.7042285598603133, + "grad_norm": 4.135293960571289, + "learning_rate": 2.0454061826471723e-06, + "loss": 0.1396, + "step": 27829 + }, + { + "epoch": 0.704253865425007, + "grad_norm": 6.5768656730651855, + "learning_rate": 2.0450822714674845e-06, + "loss": 0.1527, + "step": 27830 + }, + { + "epoch": 0.7042791709897006, + "grad_norm": 2.894392967224121, + "learning_rate": 2.044758379343514e-06, + "loss": 0.1028, + "step": 27831 + }, + { + "epoch": 0.7043044765543943, + "grad_norm": 7.799522876739502, + "learning_rate": 2.044434506277347e-06, + "loss": 0.1617, + "step": 27832 + }, + { + "epoch": 0.704329782119088, + "grad_norm": 6.288241863250732, + "learning_rate": 2.0441106522710714e-06, + "loss": 0.1631, + "step": 27833 + }, + { + "epoch": 0.7043550876837816, + "grad_norm": 3.8729703426361084, + "learning_rate": 2.043786817326775e-06, + "loss": 0.1162, + "step": 27834 + }, + { + "epoch": 0.7043803932484753, + "grad_norm": 8.77402114868164, + "learning_rate": 2.0434630014465494e-06, + "loss": 0.2463, + "step": 27835 + }, + { + "epoch": 0.7044056988131691, + "grad_norm": 8.594136238098145, + "learning_rate": 2.0431392046324807e-06, + "loss": 0.2521, + "step": 27836 + }, + { + "epoch": 0.7044310043778627, + "grad_norm": 5.721569061279297, + "learning_rate": 2.042815426886657e-06, + "loss": 0.116, + "step": 27837 + }, + { + "epoch": 0.7044563099425564, + "grad_norm": 4.559759616851807, + "learning_rate": 2.042491668211167e-06, + "loss": 0.1303, + "step": 27838 + }, + { + "epoch": 0.7044816155072501, + "grad_norm": 4.720442771911621, + "learning_rate": 2.0421679286080966e-06, + "loss": 0.169, + "step": 27839 + }, + { + "epoch": 0.7045069210719437, + "grad_norm": 3.771183490753174, + "learning_rate": 2.0418442080795363e-06, + "loss": 0.1886, + "step": 27840 + }, + { + "epoch": 0.7045322266366374, + "grad_norm": 3.1673460006713867, + "learning_rate": 2.041520506627572e-06, + "loss": 0.1412, + "step": 27841 + }, + { + "epoch": 0.7045575322013311, + "grad_norm": 9.790431022644043, + "learning_rate": 2.041196824254293e-06, + "loss": 0.2246, + "step": 27842 + }, + { + "epoch": 0.7045828377660247, + "grad_norm": 3.3653180599212646, + "learning_rate": 2.040873160961782e-06, + "loss": 0.1373, + "step": 27843 + }, + { + "epoch": 0.7046081433307184, + "grad_norm": 4.502347946166992, + "learning_rate": 2.040549516752132e-06, + "loss": 0.0892, + "step": 27844 + }, + { + "epoch": 0.7046334488954121, + "grad_norm": 6.043869972229004, + "learning_rate": 2.040225891627428e-06, + "loss": 0.0949, + "step": 27845 + }, + { + "epoch": 0.7046587544601057, + "grad_norm": 3.6164612770080566, + "learning_rate": 2.0399022855897567e-06, + "loss": 0.1014, + "step": 27846 + }, + { + "epoch": 0.7046840600247994, + "grad_norm": 8.94705867767334, + "learning_rate": 2.0395786986412026e-06, + "loss": 0.2407, + "step": 27847 + }, + { + "epoch": 0.7047093655894932, + "grad_norm": 3.4938933849334717, + "learning_rate": 2.039255130783857e-06, + "loss": 0.1844, + "step": 27848 + }, + { + "epoch": 0.7047346711541868, + "grad_norm": 4.0903096199035645, + "learning_rate": 2.038931582019804e-06, + "loss": 0.1738, + "step": 27849 + }, + { + "epoch": 0.7047599767188805, + "grad_norm": 6.848198890686035, + "learning_rate": 2.0386080523511306e-06, + "loss": 0.2004, + "step": 27850 + }, + { + "epoch": 0.7047852822835742, + "grad_norm": 4.659912586212158, + "learning_rate": 2.0382845417799214e-06, + "loss": 0.1991, + "step": 27851 + }, + { + "epoch": 0.7048105878482679, + "grad_norm": 9.27218246459961, + "learning_rate": 2.037961050308265e-06, + "loss": 0.1942, + "step": 27852 + }, + { + "epoch": 0.7048358934129615, + "grad_norm": 7.874774932861328, + "learning_rate": 2.037637577938246e-06, + "loss": 0.2402, + "step": 27853 + }, + { + "epoch": 0.7048611989776552, + "grad_norm": 6.388512134552002, + "learning_rate": 2.0373141246719554e-06, + "loss": 0.1955, + "step": 27854 + }, + { + "epoch": 0.7048865045423489, + "grad_norm": 3.737391471862793, + "learning_rate": 2.03699069051147e-06, + "loss": 0.161, + "step": 27855 + }, + { + "epoch": 0.7049118101070425, + "grad_norm": 5.012686252593994, + "learning_rate": 2.036667275458883e-06, + "loss": 0.1924, + "step": 27856 + }, + { + "epoch": 0.7049371156717362, + "grad_norm": 4.348344326019287, + "learning_rate": 2.0363438795162754e-06, + "loss": 0.1582, + "step": 27857 + }, + { + "epoch": 0.7049624212364299, + "grad_norm": 2.738093614578247, + "learning_rate": 2.036020502685739e-06, + "loss": 0.1487, + "step": 27858 + }, + { + "epoch": 0.7049877268011235, + "grad_norm": 7.805115699768066, + "learning_rate": 2.035697144969351e-06, + "loss": 0.1383, + "step": 27859 + }, + { + "epoch": 0.7050130323658172, + "grad_norm": 3.197387218475342, + "learning_rate": 2.0353738063692023e-06, + "loss": 0.1001, + "step": 27860 + }, + { + "epoch": 0.705038337930511, + "grad_norm": 3.867310047149658, + "learning_rate": 2.035050486887375e-06, + "loss": 0.181, + "step": 27861 + }, + { + "epoch": 0.7050636434952046, + "grad_norm": 6.041245937347412, + "learning_rate": 2.0347271865259587e-06, + "loss": 0.1676, + "step": 27862 + }, + { + "epoch": 0.7050889490598983, + "grad_norm": 4.5581817626953125, + "learning_rate": 2.0344039052870324e-06, + "loss": 0.2087, + "step": 27863 + }, + { + "epoch": 0.705114254624592, + "grad_norm": 2.2759711742401123, + "learning_rate": 2.034080643172685e-06, + "loss": 0.0864, + "step": 27864 + }, + { + "epoch": 0.7051395601892856, + "grad_norm": 6.340203762054443, + "learning_rate": 2.0337574001849996e-06, + "loss": 0.1721, + "step": 27865 + }, + { + "epoch": 0.7051648657539793, + "grad_norm": 4.004816055297852, + "learning_rate": 2.0334341763260596e-06, + "loss": 0.1618, + "step": 27866 + }, + { + "epoch": 0.705190171318673, + "grad_norm": 2.853811502456665, + "learning_rate": 2.0331109715979525e-06, + "loss": 0.1135, + "step": 27867 + }, + { + "epoch": 0.7052154768833666, + "grad_norm": 9.27247428894043, + "learning_rate": 2.0327877860027607e-06, + "loss": 0.2218, + "step": 27868 + }, + { + "epoch": 0.7052407824480603, + "grad_norm": 3.748425006866455, + "learning_rate": 2.0324646195425683e-06, + "loss": 0.1189, + "step": 27869 + }, + { + "epoch": 0.705266088012754, + "grad_norm": 6.536166191101074, + "learning_rate": 2.0321414722194583e-06, + "loss": 0.1767, + "step": 27870 + }, + { + "epoch": 0.7052913935774476, + "grad_norm": 3.3192219734191895, + "learning_rate": 2.0318183440355176e-06, + "loss": 0.0673, + "step": 27871 + }, + { + "epoch": 0.7053166991421413, + "grad_norm": 8.227410316467285, + "learning_rate": 2.031495234992828e-06, + "loss": 0.1842, + "step": 27872 + }, + { + "epoch": 0.7053420047068351, + "grad_norm": 3.4960572719573975, + "learning_rate": 2.0311721450934737e-06, + "loss": 0.1018, + "step": 27873 + }, + { + "epoch": 0.7053673102715287, + "grad_norm": 3.9526708126068115, + "learning_rate": 2.0308490743395357e-06, + "loss": 0.065, + "step": 27874 + }, + { + "epoch": 0.7053926158362224, + "grad_norm": 6.676056385040283, + "learning_rate": 2.030526022733102e-06, + "loss": 0.2291, + "step": 27875 + }, + { + "epoch": 0.7054179214009161, + "grad_norm": 2.8565473556518555, + "learning_rate": 2.030202990276254e-06, + "loss": 0.0903, + "step": 27876 + }, + { + "epoch": 0.7054432269656097, + "grad_norm": 9.637939453125, + "learning_rate": 2.0298799769710738e-06, + "loss": 0.2999, + "step": 27877 + }, + { + "epoch": 0.7054685325303034, + "grad_norm": 3.487819194793701, + "learning_rate": 2.0295569828196433e-06, + "loss": 0.1154, + "step": 27878 + }, + { + "epoch": 0.7054938380949971, + "grad_norm": 8.85430908203125, + "learning_rate": 2.029234007824049e-06, + "loss": 0.262, + "step": 27879 + }, + { + "epoch": 0.7055191436596908, + "grad_norm": 6.580816268920898, + "learning_rate": 2.0289110519863725e-06, + "loss": 0.1374, + "step": 27880 + }, + { + "epoch": 0.7055444492243844, + "grad_norm": 2.0777482986450195, + "learning_rate": 2.0285881153086958e-06, + "loss": 0.0734, + "step": 27881 + }, + { + "epoch": 0.7055697547890781, + "grad_norm": 3.2210640907287598, + "learning_rate": 2.0282651977930994e-06, + "loss": 0.1515, + "step": 27882 + }, + { + "epoch": 0.7055950603537718, + "grad_norm": 4.970378398895264, + "learning_rate": 2.0279422994416704e-06, + "loss": 0.1392, + "step": 27883 + }, + { + "epoch": 0.7056203659184654, + "grad_norm": 2.9350414276123047, + "learning_rate": 2.0276194202564888e-06, + "loss": 0.0789, + "step": 27884 + }, + { + "epoch": 0.7056456714831592, + "grad_norm": 5.736359596252441, + "learning_rate": 2.027296560239636e-06, + "loss": 0.163, + "step": 27885 + }, + { + "epoch": 0.7056709770478529, + "grad_norm": 6.137356281280518, + "learning_rate": 2.026973719393193e-06, + "loss": 0.2287, + "step": 27886 + }, + { + "epoch": 0.7056962826125465, + "grad_norm": 8.963432312011719, + "learning_rate": 2.0266508977192454e-06, + "loss": 0.1938, + "step": 27887 + }, + { + "epoch": 0.7057215881772402, + "grad_norm": 5.1636247634887695, + "learning_rate": 2.026328095219871e-06, + "loss": 0.1294, + "step": 27888 + }, + { + "epoch": 0.7057468937419339, + "grad_norm": 4.765200138092041, + "learning_rate": 2.026005311897158e-06, + "loss": 0.1192, + "step": 27889 + }, + { + "epoch": 0.7057721993066275, + "grad_norm": 5.084554672241211, + "learning_rate": 2.025682547753179e-06, + "loss": 0.1605, + "step": 27890 + }, + { + "epoch": 0.7057975048713212, + "grad_norm": 9.127758026123047, + "learning_rate": 2.0253598027900223e-06, + "loss": 0.2081, + "step": 27891 + }, + { + "epoch": 0.7058228104360149, + "grad_norm": 8.997464179992676, + "learning_rate": 2.0250370770097665e-06, + "loss": 0.2624, + "step": 27892 + }, + { + "epoch": 0.7058481160007085, + "grad_norm": 4.2935872077941895, + "learning_rate": 2.024714370414494e-06, + "loss": 0.1719, + "step": 27893 + }, + { + "epoch": 0.7058734215654022, + "grad_norm": 6.588008880615234, + "learning_rate": 2.0243916830062825e-06, + "loss": 0.2341, + "step": 27894 + }, + { + "epoch": 0.7058987271300959, + "grad_norm": 6.032194137573242, + "learning_rate": 2.024069014787218e-06, + "loss": 0.1829, + "step": 27895 + }, + { + "epoch": 0.7059240326947895, + "grad_norm": 5.20427942276001, + "learning_rate": 2.023746365759378e-06, + "loss": 0.1908, + "step": 27896 + }, + { + "epoch": 0.7059493382594833, + "grad_norm": 11.941927909851074, + "learning_rate": 2.0234237359248424e-06, + "loss": 0.1762, + "step": 27897 + }, + { + "epoch": 0.705974643824177, + "grad_norm": 4.860570907592773, + "learning_rate": 2.0231011252856956e-06, + "loss": 0.1504, + "step": 27898 + }, + { + "epoch": 0.7059999493888706, + "grad_norm": 8.219060897827148, + "learning_rate": 2.0227785338440157e-06, + "loss": 0.1554, + "step": 27899 + }, + { + "epoch": 0.7060252549535643, + "grad_norm": 4.46465539932251, + "learning_rate": 2.022455961601884e-06, + "loss": 0.116, + "step": 27900 + }, + { + "epoch": 0.706050560518258, + "grad_norm": 7.23829460144043, + "learning_rate": 2.0221334085613776e-06, + "loss": 0.2064, + "step": 27901 + }, + { + "epoch": 0.7060758660829516, + "grad_norm": 6.077619552612305, + "learning_rate": 2.021810874724582e-06, + "loss": 0.1683, + "step": 27902 + }, + { + "epoch": 0.7061011716476453, + "grad_norm": 7.296797275543213, + "learning_rate": 2.021488360093571e-06, + "loss": 0.1779, + "step": 27903 + }, + { + "epoch": 0.706126477212339, + "grad_norm": 5.860857009887695, + "learning_rate": 2.021165864670429e-06, + "loss": 0.222, + "step": 27904 + }, + { + "epoch": 0.7061517827770327, + "grad_norm": 5.821264266967773, + "learning_rate": 2.020843388457233e-06, + "loss": 0.1932, + "step": 27905 + }, + { + "epoch": 0.7061770883417263, + "grad_norm": 4.963200092315674, + "learning_rate": 2.020520931456065e-06, + "loss": 0.1166, + "step": 27906 + }, + { + "epoch": 0.70620239390642, + "grad_norm": 4.161007881164551, + "learning_rate": 2.020198493669004e-06, + "loss": 0.1619, + "step": 27907 + }, + { + "epoch": 0.7062276994711137, + "grad_norm": 4.73189115524292, + "learning_rate": 2.0198760750981284e-06, + "loss": 0.1148, + "step": 27908 + }, + { + "epoch": 0.7062530050358073, + "grad_norm": 3.930262327194214, + "learning_rate": 2.0195536757455155e-06, + "loss": 0.1172, + "step": 27909 + }, + { + "epoch": 0.7062783106005011, + "grad_norm": 5.841150760650635, + "learning_rate": 2.0192312956132482e-06, + "loss": 0.2135, + "step": 27910 + }, + { + "epoch": 0.7063036161651948, + "grad_norm": 5.7925214767456055, + "learning_rate": 2.018908934703404e-06, + "loss": 0.1767, + "step": 27911 + }, + { + "epoch": 0.7063289217298884, + "grad_norm": 2.385551929473877, + "learning_rate": 2.018586593018062e-06, + "loss": 0.1105, + "step": 27912 + }, + { + "epoch": 0.7063542272945821, + "grad_norm": 4.825668811798096, + "learning_rate": 2.018264270559298e-06, + "loss": 0.1603, + "step": 27913 + }, + { + "epoch": 0.7063795328592758, + "grad_norm": 8.283939361572266, + "learning_rate": 2.0179419673291956e-06, + "loss": 0.1326, + "step": 27914 + }, + { + "epoch": 0.7064048384239694, + "grad_norm": 8.822197914123535, + "learning_rate": 2.01761968332983e-06, + "loss": 0.2987, + "step": 27915 + }, + { + "epoch": 0.7064301439886631, + "grad_norm": 3.827831983566284, + "learning_rate": 2.017297418563281e-06, + "loss": 0.1383, + "step": 27916 + }, + { + "epoch": 0.7064554495533568, + "grad_norm": 4.149530410766602, + "learning_rate": 2.016975173031624e-06, + "loss": 0.1309, + "step": 27917 + }, + { + "epoch": 0.7064807551180504, + "grad_norm": 3.4324088096618652, + "learning_rate": 2.016652946736941e-06, + "loss": 0.1334, + "step": 27918 + }, + { + "epoch": 0.7065060606827441, + "grad_norm": 8.037810325622559, + "learning_rate": 2.0163307396813066e-06, + "loss": 0.2264, + "step": 27919 + }, + { + "epoch": 0.7065313662474378, + "grad_norm": 3.8588852882385254, + "learning_rate": 2.016008551866803e-06, + "loss": 0.1027, + "step": 27920 + }, + { + "epoch": 0.7065566718121314, + "grad_norm": 4.585756778717041, + "learning_rate": 2.0156863832955026e-06, + "loss": 0.1922, + "step": 27921 + }, + { + "epoch": 0.7065819773768252, + "grad_norm": 3.4343135356903076, + "learning_rate": 2.0153642339694868e-06, + "loss": 0.1233, + "step": 27922 + }, + { + "epoch": 0.7066072829415189, + "grad_norm": 5.313186168670654, + "learning_rate": 2.0150421038908323e-06, + "loss": 0.1613, + "step": 27923 + }, + { + "epoch": 0.7066325885062125, + "grad_norm": 4.955333709716797, + "learning_rate": 2.0147199930616157e-06, + "loss": 0.1605, + "step": 27924 + }, + { + "epoch": 0.7066578940709062, + "grad_norm": 2.8989551067352295, + "learning_rate": 2.0143979014839134e-06, + "loss": 0.1172, + "step": 27925 + }, + { + "epoch": 0.7066831996355999, + "grad_norm": 7.256010055541992, + "learning_rate": 2.0140758291598044e-06, + "loss": 0.1728, + "step": 27926 + }, + { + "epoch": 0.7067085052002935, + "grad_norm": 4.750572681427002, + "learning_rate": 2.0137537760913658e-06, + "loss": 0.1715, + "step": 27927 + }, + { + "epoch": 0.7067338107649872, + "grad_norm": 2.9097557067871094, + "learning_rate": 2.0134317422806714e-06, + "loss": 0.1079, + "step": 27928 + }, + { + "epoch": 0.7067591163296809, + "grad_norm": 3.6402032375335693, + "learning_rate": 2.013109727729804e-06, + "loss": 0.1519, + "step": 27929 + }, + { + "epoch": 0.7067844218943746, + "grad_norm": 8.719747543334961, + "learning_rate": 2.0127877324408324e-06, + "loss": 0.3432, + "step": 27930 + }, + { + "epoch": 0.7068097274590682, + "grad_norm": 12.203248023986816, + "learning_rate": 2.0124657564158395e-06, + "loss": 0.2589, + "step": 27931 + }, + { + "epoch": 0.7068350330237619, + "grad_norm": 2.338747262954712, + "learning_rate": 2.0121437996568964e-06, + "loss": 0.1248, + "step": 27932 + }, + { + "epoch": 0.7068603385884557, + "grad_norm": 5.662905216217041, + "learning_rate": 2.011821862166087e-06, + "loss": 0.16, + "step": 27933 + }, + { + "epoch": 0.7068856441531493, + "grad_norm": 4.806484222412109, + "learning_rate": 2.011499943945478e-06, + "loss": 0.1587, + "step": 27934 + }, + { + "epoch": 0.706910949717843, + "grad_norm": 3.7901058197021484, + "learning_rate": 2.011178044997152e-06, + "loss": 0.1681, + "step": 27935 + }, + { + "epoch": 0.7069362552825367, + "grad_norm": 5.047335624694824, + "learning_rate": 2.010856165323181e-06, + "loss": 0.1466, + "step": 27936 + }, + { + "epoch": 0.7069615608472303, + "grad_norm": 3.657963514328003, + "learning_rate": 2.0105343049256465e-06, + "loss": 0.1385, + "step": 27937 + }, + { + "epoch": 0.706986866411924, + "grad_norm": 8.657569885253906, + "learning_rate": 2.010212463806616e-06, + "loss": 0.3452, + "step": 27938 + }, + { + "epoch": 0.7070121719766177, + "grad_norm": 7.244904041290283, + "learning_rate": 2.009890641968172e-06, + "loss": 0.2071, + "step": 27939 + }, + { + "epoch": 0.7070374775413113, + "grad_norm": 4.647828102111816, + "learning_rate": 2.0095688394123843e-06, + "loss": 0.1295, + "step": 27940 + }, + { + "epoch": 0.707062783106005, + "grad_norm": 5.082821369171143, + "learning_rate": 2.009247056141333e-06, + "loss": 0.1235, + "step": 27941 + }, + { + "epoch": 0.7070880886706987, + "grad_norm": 6.176759719848633, + "learning_rate": 2.0089252921570914e-06, + "loss": 0.1718, + "step": 27942 + }, + { + "epoch": 0.7071133942353923, + "grad_norm": 4.032001495361328, + "learning_rate": 2.0086035474617333e-06, + "loss": 0.1305, + "step": 27943 + }, + { + "epoch": 0.707138699800086, + "grad_norm": 2.7439935207366943, + "learning_rate": 2.0082818220573338e-06, + "loss": 0.1159, + "step": 27944 + }, + { + "epoch": 0.7071640053647797, + "grad_norm": 14.105517387390137, + "learning_rate": 2.0079601159459693e-06, + "loss": 0.2485, + "step": 27945 + }, + { + "epoch": 0.7071893109294733, + "grad_norm": 5.737936019897461, + "learning_rate": 2.0076384291297134e-06, + "loss": 0.1857, + "step": 27946 + }, + { + "epoch": 0.7072146164941671, + "grad_norm": 4.260636806488037, + "learning_rate": 2.0073167616106417e-06, + "loss": 0.1589, + "step": 27947 + }, + { + "epoch": 0.7072399220588608, + "grad_norm": 2.3063066005706787, + "learning_rate": 2.0069951133908244e-06, + "loss": 0.0961, + "step": 27948 + }, + { + "epoch": 0.7072652276235544, + "grad_norm": 5.201650619506836, + "learning_rate": 2.0066734844723417e-06, + "loss": 0.1822, + "step": 27949 + }, + { + "epoch": 0.7072905331882481, + "grad_norm": 3.546466112136841, + "learning_rate": 2.006351874857264e-06, + "loss": 0.1495, + "step": 27950 + }, + { + "epoch": 0.7073158387529418, + "grad_norm": 4.23759126663208, + "learning_rate": 2.006030284547667e-06, + "loss": 0.1266, + "step": 27951 + }, + { + "epoch": 0.7073411443176354, + "grad_norm": 4.989538669586182, + "learning_rate": 2.0057087135456216e-06, + "loss": 0.143, + "step": 27952 + }, + { + "epoch": 0.7073664498823291, + "grad_norm": 5.301618576049805, + "learning_rate": 2.0053871618532055e-06, + "loss": 0.2341, + "step": 27953 + }, + { + "epoch": 0.7073917554470228, + "grad_norm": 7.2545671463012695, + "learning_rate": 2.005065629472491e-06, + "loss": 0.157, + "step": 27954 + }, + { + "epoch": 0.7074170610117165, + "grad_norm": 5.859360218048096, + "learning_rate": 2.0047441164055505e-06, + "loss": 0.1526, + "step": 27955 + }, + { + "epoch": 0.7074423665764101, + "grad_norm": 6.199337005615234, + "learning_rate": 2.0044226226544565e-06, + "loss": 0.197, + "step": 27956 + }, + { + "epoch": 0.7074676721411038, + "grad_norm": 3.584434747695923, + "learning_rate": 2.0041011482212856e-06, + "loss": 0.1487, + "step": 27957 + }, + { + "epoch": 0.7074929777057976, + "grad_norm": 5.565082550048828, + "learning_rate": 2.0037796931081095e-06, + "loss": 0.1501, + "step": 27958 + }, + { + "epoch": 0.7075182832704912, + "grad_norm": 5.642792701721191, + "learning_rate": 2.0034582573169987e-06, + "loss": 0.1474, + "step": 27959 + }, + { + "epoch": 0.7075435888351849, + "grad_norm": 7.95189094543457, + "learning_rate": 2.003136840850031e-06, + "loss": 0.1979, + "step": 27960 + }, + { + "epoch": 0.7075688943998786, + "grad_norm": 5.987236022949219, + "learning_rate": 2.002815443709274e-06, + "loss": 0.1051, + "step": 27961 + }, + { + "epoch": 0.7075941999645722, + "grad_norm": 2.667781114578247, + "learning_rate": 2.002494065896804e-06, + "loss": 0.1235, + "step": 27962 + }, + { + "epoch": 0.7076195055292659, + "grad_norm": 2.3948540687561035, + "learning_rate": 2.0021727074146902e-06, + "loss": 0.1002, + "step": 27963 + }, + { + "epoch": 0.7076448110939596, + "grad_norm": 2.860227346420288, + "learning_rate": 2.001851368265011e-06, + "loss": 0.1727, + "step": 27964 + }, + { + "epoch": 0.7076701166586532, + "grad_norm": 4.9166717529296875, + "learning_rate": 2.001530048449831e-06, + "loss": 0.1759, + "step": 27965 + }, + { + "epoch": 0.7076954222233469, + "grad_norm": 4.15158224105835, + "learning_rate": 2.001208747971227e-06, + "loss": 0.1543, + "step": 27966 + }, + { + "epoch": 0.7077207277880406, + "grad_norm": 5.8437910079956055, + "learning_rate": 2.000887466831269e-06, + "loss": 0.1934, + "step": 27967 + }, + { + "epoch": 0.7077460333527342, + "grad_norm": 3.682029962539673, + "learning_rate": 2.0005662050320327e-06, + "loss": 0.1362, + "step": 27968 + }, + { + "epoch": 0.7077713389174279, + "grad_norm": 5.415942668914795, + "learning_rate": 2.0002449625755845e-06, + "loss": 0.2008, + "step": 27969 + }, + { + "epoch": 0.7077966444821217, + "grad_norm": 4.210886001586914, + "learning_rate": 1.999923739463999e-06, + "loss": 0.1548, + "step": 27970 + }, + { + "epoch": 0.7078219500468153, + "grad_norm": 3.715961456298828, + "learning_rate": 1.9996025356993467e-06, + "loss": 0.0898, + "step": 27971 + }, + { + "epoch": 0.707847255611509, + "grad_norm": 6.348690032958984, + "learning_rate": 1.9992813512837004e-06, + "loss": 0.1201, + "step": 27972 + }, + { + "epoch": 0.7078725611762027, + "grad_norm": 18.685848236083984, + "learning_rate": 1.998960186219131e-06, + "loss": 0.2044, + "step": 27973 + }, + { + "epoch": 0.7078978667408963, + "grad_norm": 6.0301618576049805, + "learning_rate": 1.998639040507709e-06, + "loss": 0.1748, + "step": 27974 + }, + { + "epoch": 0.70792317230559, + "grad_norm": 11.133834838867188, + "learning_rate": 1.9983179141515037e-06, + "loss": 0.1784, + "step": 27975 + }, + { + "epoch": 0.7079484778702837, + "grad_norm": 5.659734725952148, + "learning_rate": 1.9979968071525897e-06, + "loss": 0.1233, + "step": 27976 + }, + { + "epoch": 0.7079737834349773, + "grad_norm": 11.857282638549805, + "learning_rate": 1.9976757195130356e-06, + "loss": 0.202, + "step": 27977 + }, + { + "epoch": 0.707999088999671, + "grad_norm": 4.3898091316223145, + "learning_rate": 1.9973546512349124e-06, + "loss": 0.1354, + "step": 27978 + }, + { + "epoch": 0.7080243945643647, + "grad_norm": 5.303311347961426, + "learning_rate": 1.9970336023202892e-06, + "loss": 0.1551, + "step": 27979 + }, + { + "epoch": 0.7080497001290584, + "grad_norm": 8.860577583312988, + "learning_rate": 1.996712572771239e-06, + "loss": 0.1575, + "step": 27980 + }, + { + "epoch": 0.708075005693752, + "grad_norm": 6.702137470245361, + "learning_rate": 1.996391562589831e-06, + "loss": 0.2293, + "step": 27981 + }, + { + "epoch": 0.7081003112584457, + "grad_norm": 4.572624683380127, + "learning_rate": 1.9960705717781348e-06, + "loss": 0.1816, + "step": 27982 + }, + { + "epoch": 0.7081256168231395, + "grad_norm": 3.5516796112060547, + "learning_rate": 1.995749600338219e-06, + "loss": 0.1682, + "step": 27983 + }, + { + "epoch": 0.7081509223878331, + "grad_norm": 4.220641613006592, + "learning_rate": 1.9954286482721573e-06, + "loss": 0.1534, + "step": 27984 + }, + { + "epoch": 0.7081762279525268, + "grad_norm": 3.3612966537475586, + "learning_rate": 1.995107715582017e-06, + "loss": 0.1099, + "step": 27985 + }, + { + "epoch": 0.7082015335172205, + "grad_norm": 10.83304500579834, + "learning_rate": 1.9947868022698686e-06, + "loss": 0.2029, + "step": 27986 + }, + { + "epoch": 0.7082268390819141, + "grad_norm": 10.357394218444824, + "learning_rate": 1.9944659083377804e-06, + "loss": 0.1715, + "step": 27987 + }, + { + "epoch": 0.7082521446466078, + "grad_norm": 4.790441989898682, + "learning_rate": 1.994145033787821e-06, + "loss": 0.2073, + "step": 27988 + }, + { + "epoch": 0.7082774502113015, + "grad_norm": 4.392515659332275, + "learning_rate": 1.993824178622063e-06, + "loss": 0.1036, + "step": 27989 + }, + { + "epoch": 0.7083027557759951, + "grad_norm": 9.04992961883545, + "learning_rate": 1.993503342842574e-06, + "loss": 0.2001, + "step": 27990 + }, + { + "epoch": 0.7083280613406888, + "grad_norm": 5.517974376678467, + "learning_rate": 1.993182526451422e-06, + "loss": 0.1676, + "step": 27991 + }, + { + "epoch": 0.7083533669053825, + "grad_norm": 4.078855991363525, + "learning_rate": 1.9928617294506753e-06, + "loss": 0.0984, + "step": 27992 + }, + { + "epoch": 0.7083786724700761, + "grad_norm": 11.152726173400879, + "learning_rate": 1.9925409518424056e-06, + "loss": 0.1904, + "step": 27993 + }, + { + "epoch": 0.7084039780347698, + "grad_norm": 3.906541347503662, + "learning_rate": 1.9922201936286776e-06, + "loss": 0.1389, + "step": 27994 + }, + { + "epoch": 0.7084292835994636, + "grad_norm": 3.401331901550293, + "learning_rate": 1.9918994548115667e-06, + "loss": 0.1652, + "step": 27995 + }, + { + "epoch": 0.7084545891641572, + "grad_norm": 2.7327260971069336, + "learning_rate": 1.991578735393132e-06, + "loss": 0.118, + "step": 27996 + }, + { + "epoch": 0.7084798947288509, + "grad_norm": 4.588804721832275, + "learning_rate": 1.991258035375449e-06, + "loss": 0.1694, + "step": 27997 + }, + { + "epoch": 0.7085052002935446, + "grad_norm": 12.014724731445312, + "learning_rate": 1.990937354760581e-06, + "loss": 0.1402, + "step": 27998 + }, + { + "epoch": 0.7085305058582382, + "grad_norm": 4.407705307006836, + "learning_rate": 1.9906166935506023e-06, + "loss": 0.1793, + "step": 27999 + }, + { + "epoch": 0.7085558114229319, + "grad_norm": 4.700758934020996, + "learning_rate": 1.9902960517475726e-06, + "loss": 0.1559, + "step": 28000 + }, + { + "epoch": 0.7085811169876256, + "grad_norm": 14.559316635131836, + "learning_rate": 1.989975429353566e-06, + "loss": 0.256, + "step": 28001 + }, + { + "epoch": 0.7086064225523192, + "grad_norm": 4.40890645980835, + "learning_rate": 1.989654826370646e-06, + "loss": 0.1417, + "step": 28002 + }, + { + "epoch": 0.7086317281170129, + "grad_norm": 3.701092481613159, + "learning_rate": 1.9893342428008837e-06, + "loss": 0.1307, + "step": 28003 + }, + { + "epoch": 0.7086570336817066, + "grad_norm": 4.9928388595581055, + "learning_rate": 1.9890136786463452e-06, + "loss": 0.1242, + "step": 28004 + }, + { + "epoch": 0.7086823392464002, + "grad_norm": 12.097468376159668, + "learning_rate": 1.9886931339090977e-06, + "loss": 0.2241, + "step": 28005 + }, + { + "epoch": 0.7087076448110939, + "grad_norm": 7.570127964019775, + "learning_rate": 1.9883726085912053e-06, + "loss": 0.1615, + "step": 28006 + }, + { + "epoch": 0.7087329503757877, + "grad_norm": 2.982261896133423, + "learning_rate": 1.9880521026947404e-06, + "loss": 0.1125, + "step": 28007 + }, + { + "epoch": 0.7087582559404814, + "grad_norm": 8.545490264892578, + "learning_rate": 1.9877316162217667e-06, + "loss": 0.1297, + "step": 28008 + }, + { + "epoch": 0.708783561505175, + "grad_norm": 4.4367876052856445, + "learning_rate": 1.9874111491743515e-06, + "loss": 0.1438, + "step": 28009 + }, + { + "epoch": 0.7088088670698687, + "grad_norm": 3.3900630474090576, + "learning_rate": 1.9870907015545597e-06, + "loss": 0.149, + "step": 28010 + }, + { + "epoch": 0.7088341726345624, + "grad_norm": 5.684544086456299, + "learning_rate": 1.9867702733644616e-06, + "loss": 0.1494, + "step": 28011 + }, + { + "epoch": 0.708859478199256, + "grad_norm": 4.9131388664245605, + "learning_rate": 1.9864498646061213e-06, + "loss": 0.149, + "step": 28012 + }, + { + "epoch": 0.7088847837639497, + "grad_norm": 13.668066024780273, + "learning_rate": 1.9861294752816052e-06, + "loss": 0.3268, + "step": 28013 + }, + { + "epoch": 0.7089100893286434, + "grad_norm": 2.844280481338501, + "learning_rate": 1.9858091053929797e-06, + "loss": 0.1223, + "step": 28014 + }, + { + "epoch": 0.708935394893337, + "grad_norm": 4.45431661605835, + "learning_rate": 1.9854887549423084e-06, + "loss": 0.175, + "step": 28015 + }, + { + "epoch": 0.7089607004580307, + "grad_norm": 3.3282883167266846, + "learning_rate": 1.9851684239316617e-06, + "loss": 0.1322, + "step": 28016 + }, + { + "epoch": 0.7089860060227244, + "grad_norm": 2.149763822555542, + "learning_rate": 1.9848481123631023e-06, + "loss": 0.068, + "step": 28017 + }, + { + "epoch": 0.709011311587418, + "grad_norm": 3.756781578063965, + "learning_rate": 1.9845278202386968e-06, + "loss": 0.149, + "step": 28018 + }, + { + "epoch": 0.7090366171521117, + "grad_norm": 4.185764789581299, + "learning_rate": 1.984207547560509e-06, + "loss": 0.1614, + "step": 28019 + }, + { + "epoch": 0.7090619227168055, + "grad_norm": 3.6009018421173096, + "learning_rate": 1.9838872943306076e-06, + "loss": 0.1738, + "step": 28020 + }, + { + "epoch": 0.7090872282814991, + "grad_norm": 5.2635626792907715, + "learning_rate": 1.983567060551056e-06, + "loss": 0.1529, + "step": 28021 + }, + { + "epoch": 0.7091125338461928, + "grad_norm": 5.173868656158447, + "learning_rate": 1.983246846223919e-06, + "loss": 0.1305, + "step": 28022 + }, + { + "epoch": 0.7091378394108865, + "grad_norm": 3.28640079498291, + "learning_rate": 1.9829266513512602e-06, + "loss": 0.1393, + "step": 28023 + }, + { + "epoch": 0.7091631449755801, + "grad_norm": 4.858062744140625, + "learning_rate": 1.9826064759351483e-06, + "loss": 0.1036, + "step": 28024 + }, + { + "epoch": 0.7091884505402738, + "grad_norm": 4.186280727386475, + "learning_rate": 1.982286319977644e-06, + "loss": 0.1326, + "step": 28025 + }, + { + "epoch": 0.7092137561049675, + "grad_norm": 4.117435455322266, + "learning_rate": 1.9819661834808175e-06, + "loss": 0.1464, + "step": 28026 + }, + { + "epoch": 0.7092390616696611, + "grad_norm": 4.589709758758545, + "learning_rate": 1.981646066446726e-06, + "loss": 0.1543, + "step": 28027 + }, + { + "epoch": 0.7092643672343548, + "grad_norm": 2.865607500076294, + "learning_rate": 1.981325968877439e-06, + "loss": 0.1406, + "step": 28028 + }, + { + "epoch": 0.7092896727990485, + "grad_norm": 6.595954895019531, + "learning_rate": 1.981005890775018e-06, + "loss": 0.158, + "step": 28029 + }, + { + "epoch": 0.7093149783637421, + "grad_norm": 4.026723384857178, + "learning_rate": 1.980685832141532e-06, + "loss": 0.0939, + "step": 28030 + }, + { + "epoch": 0.7093402839284358, + "grad_norm": 7.432167053222656, + "learning_rate": 1.9803657929790377e-06, + "loss": 0.1467, + "step": 28031 + }, + { + "epoch": 0.7093655894931296, + "grad_norm": 2.2643179893493652, + "learning_rate": 1.9800457732896044e-06, + "loss": 0.1042, + "step": 28032 + }, + { + "epoch": 0.7093908950578233, + "grad_norm": 6.38594388961792, + "learning_rate": 1.9797257730752924e-06, + "loss": 0.2327, + "step": 28033 + }, + { + "epoch": 0.7094162006225169, + "grad_norm": 3.3831851482391357, + "learning_rate": 1.979405792338169e-06, + "loss": 0.1105, + "step": 28034 + }, + { + "epoch": 0.7094415061872106, + "grad_norm": 5.937807083129883, + "learning_rate": 1.9790858310802955e-06, + "loss": 0.1761, + "step": 28035 + }, + { + "epoch": 0.7094668117519043, + "grad_norm": 9.05337142944336, + "learning_rate": 1.978765889303736e-06, + "loss": 0.2381, + "step": 28036 + }, + { + "epoch": 0.7094921173165979, + "grad_norm": 3.106163263320923, + "learning_rate": 1.9784459670105515e-06, + "loss": 0.1288, + "step": 28037 + }, + { + "epoch": 0.7095174228812916, + "grad_norm": 7.117246627807617, + "learning_rate": 1.9781260642028083e-06, + "loss": 0.2268, + "step": 28038 + }, + { + "epoch": 0.7095427284459853, + "grad_norm": 8.156558990478516, + "learning_rate": 1.9778061808825684e-06, + "loss": 0.2742, + "step": 28039 + }, + { + "epoch": 0.7095680340106789, + "grad_norm": 3.5113258361816406, + "learning_rate": 1.977486317051894e-06, + "loss": 0.1012, + "step": 28040 + }, + { + "epoch": 0.7095933395753726, + "grad_norm": 3.254676580429077, + "learning_rate": 1.977166472712848e-06, + "loss": 0.0951, + "step": 28041 + }, + { + "epoch": 0.7096186451400663, + "grad_norm": 6.632185459136963, + "learning_rate": 1.9768466478674915e-06, + "loss": 0.1505, + "step": 28042 + }, + { + "epoch": 0.7096439507047599, + "grad_norm": 4.069085121154785, + "learning_rate": 1.97652684251789e-06, + "loss": 0.1424, + "step": 28043 + }, + { + "epoch": 0.7096692562694537, + "grad_norm": 2.6248011589050293, + "learning_rate": 1.976207056666105e-06, + "loss": 0.0948, + "step": 28044 + }, + { + "epoch": 0.7096945618341474, + "grad_norm": 3.4423627853393555, + "learning_rate": 1.9758872903141983e-06, + "loss": 0.1749, + "step": 28045 + }, + { + "epoch": 0.709719867398841, + "grad_norm": 5.346168518066406, + "learning_rate": 1.9755675434642303e-06, + "loss": 0.1681, + "step": 28046 + }, + { + "epoch": 0.7097451729635347, + "grad_norm": 6.553206920623779, + "learning_rate": 1.9752478161182657e-06, + "loss": 0.2556, + "step": 28047 + }, + { + "epoch": 0.7097704785282284, + "grad_norm": 4.050290107727051, + "learning_rate": 1.9749281082783658e-06, + "loss": 0.1016, + "step": 28048 + }, + { + "epoch": 0.709795784092922, + "grad_norm": 8.776251792907715, + "learning_rate": 1.9746084199465914e-06, + "loss": 0.2064, + "step": 28049 + }, + { + "epoch": 0.7098210896576157, + "grad_norm": 2.8832056522369385, + "learning_rate": 1.9742887511250033e-06, + "loss": 0.1222, + "step": 28050 + }, + { + "epoch": 0.7098463952223094, + "grad_norm": 6.240474224090576, + "learning_rate": 1.9739691018156653e-06, + "loss": 0.1928, + "step": 28051 + }, + { + "epoch": 0.709871700787003, + "grad_norm": 3.811521291732788, + "learning_rate": 1.9736494720206383e-06, + "loss": 0.0902, + "step": 28052 + }, + { + "epoch": 0.7098970063516967, + "grad_norm": 5.712108135223389, + "learning_rate": 1.9733298617419827e-06, + "loss": 0.1317, + "step": 28053 + }, + { + "epoch": 0.7099223119163904, + "grad_norm": 11.116617202758789, + "learning_rate": 1.9730102709817584e-06, + "loss": 0.2389, + "step": 28054 + }, + { + "epoch": 0.709947617481084, + "grad_norm": 3.7211666107177734, + "learning_rate": 1.9726906997420292e-06, + "loss": 0.1505, + "step": 28055 + }, + { + "epoch": 0.7099729230457777, + "grad_norm": 7.204358100891113, + "learning_rate": 1.9723711480248527e-06, + "loss": 0.1785, + "step": 28056 + }, + { + "epoch": 0.7099982286104715, + "grad_norm": 3.0387818813323975, + "learning_rate": 1.972051615832295e-06, + "loss": 0.1234, + "step": 28057 + }, + { + "epoch": 0.7100235341751652, + "grad_norm": 3.150205135345459, + "learning_rate": 1.97173210316641e-06, + "loss": 0.1369, + "step": 28058 + }, + { + "epoch": 0.7100488397398588, + "grad_norm": 6.995205402374268, + "learning_rate": 1.971412610029263e-06, + "loss": 0.0806, + "step": 28059 + }, + { + "epoch": 0.7100741453045525, + "grad_norm": 8.834919929504395, + "learning_rate": 1.9710931364229106e-06, + "loss": 0.1311, + "step": 28060 + }, + { + "epoch": 0.7100994508692462, + "grad_norm": 3.6864302158355713, + "learning_rate": 1.9707736823494187e-06, + "loss": 0.2035, + "step": 28061 + }, + { + "epoch": 0.7101247564339398, + "grad_norm": 6.385347366333008, + "learning_rate": 1.9704542478108407e-06, + "loss": 0.2191, + "step": 28062 + }, + { + "epoch": 0.7101500619986335, + "grad_norm": 4.778120994567871, + "learning_rate": 1.970134832809242e-06, + "loss": 0.1439, + "step": 28063 + }, + { + "epoch": 0.7101753675633272, + "grad_norm": 3.865574598312378, + "learning_rate": 1.969815437346677e-06, + "loss": 0.1694, + "step": 28064 + }, + { + "epoch": 0.7102006731280208, + "grad_norm": 4.251047611236572, + "learning_rate": 1.969496061425213e-06, + "loss": 0.1527, + "step": 28065 + }, + { + "epoch": 0.7102259786927145, + "grad_norm": 4.498361110687256, + "learning_rate": 1.9691767050469017e-06, + "loss": 0.1331, + "step": 28066 + }, + { + "epoch": 0.7102512842574082, + "grad_norm": 5.354372024536133, + "learning_rate": 1.9688573682138075e-06, + "loss": 0.1085, + "step": 28067 + }, + { + "epoch": 0.7102765898221018, + "grad_norm": 5.982569694519043, + "learning_rate": 1.9685380509279865e-06, + "loss": 0.1908, + "step": 28068 + }, + { + "epoch": 0.7103018953867956, + "grad_norm": 9.882390022277832, + "learning_rate": 1.968218753191502e-06, + "loss": 0.2293, + "step": 28069 + }, + { + "epoch": 0.7103272009514893, + "grad_norm": 3.768519401550293, + "learning_rate": 1.96789947500641e-06, + "loss": 0.1364, + "step": 28070 + }, + { + "epoch": 0.7103525065161829, + "grad_norm": 12.933791160583496, + "learning_rate": 1.967580216374771e-06, + "loss": 0.1589, + "step": 28071 + }, + { + "epoch": 0.7103778120808766, + "grad_norm": 3.4128644466400146, + "learning_rate": 1.9672609772986426e-06, + "loss": 0.1097, + "step": 28072 + }, + { + "epoch": 0.7104031176455703, + "grad_norm": 4.449057579040527, + "learning_rate": 1.9669417577800825e-06, + "loss": 0.0465, + "step": 28073 + }, + { + "epoch": 0.7104284232102639, + "grad_norm": 4.192065238952637, + "learning_rate": 1.966622557821153e-06, + "loss": 0.1684, + "step": 28074 + }, + { + "epoch": 0.7104537287749576, + "grad_norm": 4.983339786529541, + "learning_rate": 1.96630337742391e-06, + "loss": 0.2048, + "step": 28075 + }, + { + "epoch": 0.7104790343396513, + "grad_norm": 4.126811504364014, + "learning_rate": 1.965984216590412e-06, + "loss": 0.1694, + "step": 28076 + }, + { + "epoch": 0.7105043399043449, + "grad_norm": 25.33601188659668, + "learning_rate": 1.965665075322716e-06, + "loss": 0.4053, + "step": 28077 + }, + { + "epoch": 0.7105296454690386, + "grad_norm": 6.346760272979736, + "learning_rate": 1.9653459536228832e-06, + "loss": 0.1358, + "step": 28078 + }, + { + "epoch": 0.7105549510337323, + "grad_norm": 7.127954006195068, + "learning_rate": 1.9650268514929693e-06, + "loss": 0.2195, + "step": 28079 + }, + { + "epoch": 0.7105802565984259, + "grad_norm": 5.537595748901367, + "learning_rate": 1.964707768935033e-06, + "loss": 0.1215, + "step": 28080 + }, + { + "epoch": 0.7106055621631197, + "grad_norm": 7.170895576477051, + "learning_rate": 1.96438870595113e-06, + "loss": 0.2207, + "step": 28081 + }, + { + "epoch": 0.7106308677278134, + "grad_norm": 4.03692626953125, + "learning_rate": 1.9640696625433215e-06, + "loss": 0.1357, + "step": 28082 + }, + { + "epoch": 0.7106561732925071, + "grad_norm": 6.179864883422852, + "learning_rate": 1.963750638713663e-06, + "loss": 0.17, + "step": 28083 + }, + { + "epoch": 0.7106814788572007, + "grad_norm": 9.316373825073242, + "learning_rate": 1.9634316344642116e-06, + "loss": 0.1596, + "step": 28084 + }, + { + "epoch": 0.7107067844218944, + "grad_norm": 5.276551723480225, + "learning_rate": 1.963112649797023e-06, + "loss": 0.1639, + "step": 28085 + }, + { + "epoch": 0.7107320899865881, + "grad_norm": 7.1754841804504395, + "learning_rate": 1.962793684714158e-06, + "loss": 0.1688, + "step": 28086 + }, + { + "epoch": 0.7107573955512817, + "grad_norm": 4.0573906898498535, + "learning_rate": 1.9624747392176695e-06, + "loss": 0.1537, + "step": 28087 + }, + { + "epoch": 0.7107827011159754, + "grad_norm": 3.7219743728637695, + "learning_rate": 1.9621558133096203e-06, + "loss": 0.1609, + "step": 28088 + }, + { + "epoch": 0.7108080066806691, + "grad_norm": 4.68245267868042, + "learning_rate": 1.9618369069920596e-06, + "loss": 0.1488, + "step": 28089 + }, + { + "epoch": 0.7108333122453627, + "grad_norm": 8.410045623779297, + "learning_rate": 1.961518020267049e-06, + "loss": 0.2509, + "step": 28090 + }, + { + "epoch": 0.7108586178100564, + "grad_norm": 5.750421524047852, + "learning_rate": 1.961199153136642e-06, + "loss": 0.1925, + "step": 28091 + }, + { + "epoch": 0.7108839233747501, + "grad_norm": 4.796241283416748, + "learning_rate": 1.9608803056028996e-06, + "loss": 0.1784, + "step": 28092 + }, + { + "epoch": 0.7109092289394437, + "grad_norm": 8.927290916442871, + "learning_rate": 1.9605614776678718e-06, + "loss": 0.2536, + "step": 28093 + }, + { + "epoch": 0.7109345345041375, + "grad_norm": 4.917364120483398, + "learning_rate": 1.960242669333619e-06, + "loss": 0.1398, + "step": 28094 + }, + { + "epoch": 0.7109598400688312, + "grad_norm": 6.710599422454834, + "learning_rate": 1.959923880602194e-06, + "loss": 0.1523, + "step": 28095 + }, + { + "epoch": 0.7109851456335248, + "grad_norm": 6.920588493347168, + "learning_rate": 1.959605111475658e-06, + "loss": 0.1986, + "step": 28096 + }, + { + "epoch": 0.7110104511982185, + "grad_norm": 12.03288459777832, + "learning_rate": 1.9592863619560596e-06, + "loss": 0.235, + "step": 28097 + }, + { + "epoch": 0.7110357567629122, + "grad_norm": 4.545444965362549, + "learning_rate": 1.95896763204546e-06, + "loss": 0.1431, + "step": 28098 + }, + { + "epoch": 0.7110610623276058, + "grad_norm": 3.4528746604919434, + "learning_rate": 1.9586489217459113e-06, + "loss": 0.1721, + "step": 28099 + }, + { + "epoch": 0.7110863678922995, + "grad_norm": 3.7392616271972656, + "learning_rate": 1.9583302310594694e-06, + "loss": 0.1592, + "step": 28100 + }, + { + "epoch": 0.7111116734569932, + "grad_norm": 3.572845220565796, + "learning_rate": 1.9580115599881917e-06, + "loss": 0.1096, + "step": 28101 + }, + { + "epoch": 0.7111369790216868, + "grad_norm": 6.412714004516602, + "learning_rate": 1.9576929085341305e-06, + "loss": 0.1456, + "step": 28102 + }, + { + "epoch": 0.7111622845863805, + "grad_norm": 4.913451671600342, + "learning_rate": 1.957374276699343e-06, + "loss": 0.1132, + "step": 28103 + }, + { + "epoch": 0.7111875901510742, + "grad_norm": 4.922143459320068, + "learning_rate": 1.9570556644858805e-06, + "loss": 0.2148, + "step": 28104 + }, + { + "epoch": 0.7112128957157678, + "grad_norm": 5.061337947845459, + "learning_rate": 1.956737071895802e-06, + "loss": 0.1248, + "step": 28105 + }, + { + "epoch": 0.7112382012804616, + "grad_norm": 2.919052839279175, + "learning_rate": 1.9564184989311603e-06, + "loss": 0.1632, + "step": 28106 + }, + { + "epoch": 0.7112635068451553, + "grad_norm": 7.94319486618042, + "learning_rate": 1.956099945594009e-06, + "loss": 0.1911, + "step": 28107 + }, + { + "epoch": 0.711288812409849, + "grad_norm": 2.7141783237457275, + "learning_rate": 1.955781411886401e-06, + "loss": 0.1084, + "step": 28108 + }, + { + "epoch": 0.7113141179745426, + "grad_norm": 12.335338592529297, + "learning_rate": 1.9554628978103947e-06, + "loss": 0.2146, + "step": 28109 + }, + { + "epoch": 0.7113394235392363, + "grad_norm": 3.464703321456909, + "learning_rate": 1.9551444033680417e-06, + "loss": 0.1157, + "step": 28110 + }, + { + "epoch": 0.71136472910393, + "grad_norm": 3.695173501968384, + "learning_rate": 1.9548259285613956e-06, + "loss": 0.1223, + "step": 28111 + }, + { + "epoch": 0.7113900346686236, + "grad_norm": 7.612624168395996, + "learning_rate": 1.9545074733925096e-06, + "loss": 0.1762, + "step": 28112 + }, + { + "epoch": 0.7114153402333173, + "grad_norm": 5.021661281585693, + "learning_rate": 1.9541890378634398e-06, + "loss": 0.16, + "step": 28113 + }, + { + "epoch": 0.711440645798011, + "grad_norm": 4.896007537841797, + "learning_rate": 1.9538706219762386e-06, + "loss": 0.1538, + "step": 28114 + }, + { + "epoch": 0.7114659513627046, + "grad_norm": 4.753316402435303, + "learning_rate": 1.953552225732959e-06, + "loss": 0.1409, + "step": 28115 + }, + { + "epoch": 0.7114912569273983, + "grad_norm": 4.857699394226074, + "learning_rate": 1.9532338491356524e-06, + "loss": 0.2015, + "step": 28116 + }, + { + "epoch": 0.711516562492092, + "grad_norm": 11.011815071105957, + "learning_rate": 1.952915492186376e-06, + "loss": 0.2459, + "step": 28117 + }, + { + "epoch": 0.7115418680567857, + "grad_norm": 5.799821376800537, + "learning_rate": 1.952597154887181e-06, + "loss": 0.1851, + "step": 28118 + }, + { + "epoch": 0.7115671736214794, + "grad_norm": 3.5874133110046387, + "learning_rate": 1.9522788372401196e-06, + "loss": 0.1881, + "step": 28119 + }, + { + "epoch": 0.7115924791861731, + "grad_norm": 3.657210111618042, + "learning_rate": 1.9519605392472436e-06, + "loss": 0.1701, + "step": 28120 + }, + { + "epoch": 0.7116177847508667, + "grad_norm": 3.162121534347534, + "learning_rate": 1.9516422609106083e-06, + "loss": 0.1404, + "step": 28121 + }, + { + "epoch": 0.7116430903155604, + "grad_norm": 7.542196273803711, + "learning_rate": 1.951324002232264e-06, + "loss": 0.1562, + "step": 28122 + }, + { + "epoch": 0.7116683958802541, + "grad_norm": 3.4682013988494873, + "learning_rate": 1.951005763214267e-06, + "loss": 0.1323, + "step": 28123 + }, + { + "epoch": 0.7116937014449477, + "grad_norm": 7.711488723754883, + "learning_rate": 1.9506875438586636e-06, + "loss": 0.2406, + "step": 28124 + }, + { + "epoch": 0.7117190070096414, + "grad_norm": 22.516014099121094, + "learning_rate": 1.9503693441675107e-06, + "loss": 0.259, + "step": 28125 + }, + { + "epoch": 0.7117443125743351, + "grad_norm": 5.479599475860596, + "learning_rate": 1.950051164142859e-06, + "loss": 0.1844, + "step": 28126 + }, + { + "epoch": 0.7117696181390287, + "grad_norm": 3.1890554428100586, + "learning_rate": 1.9497330037867595e-06, + "loss": 0.1038, + "step": 28127 + }, + { + "epoch": 0.7117949237037224, + "grad_norm": 10.149327278137207, + "learning_rate": 1.9494148631012625e-06, + "loss": 0.1482, + "step": 28128 + }, + { + "epoch": 0.7118202292684161, + "grad_norm": 4.024691104888916, + "learning_rate": 1.949096742088424e-06, + "loss": 0.1475, + "step": 28129 + }, + { + "epoch": 0.7118455348331097, + "grad_norm": 4.967852592468262, + "learning_rate": 1.9487786407502924e-06, + "loss": 0.1316, + "step": 28130 + }, + { + "epoch": 0.7118708403978035, + "grad_norm": 3.981795310974121, + "learning_rate": 1.9484605590889184e-06, + "loss": 0.1096, + "step": 28131 + }, + { + "epoch": 0.7118961459624972, + "grad_norm": 15.307342529296875, + "learning_rate": 1.9481424971063568e-06, + "loss": 0.1703, + "step": 28132 + }, + { + "epoch": 0.7119214515271908, + "grad_norm": 5.222228050231934, + "learning_rate": 1.947824454804656e-06, + "loss": 0.1563, + "step": 28133 + }, + { + "epoch": 0.7119467570918845, + "grad_norm": 3.1530396938323975, + "learning_rate": 1.947506432185868e-06, + "loss": 0.1362, + "step": 28134 + }, + { + "epoch": 0.7119720626565782, + "grad_norm": 6.154956817626953, + "learning_rate": 1.9471884292520406e-06, + "loss": 0.1951, + "step": 28135 + }, + { + "epoch": 0.7119973682212719, + "grad_norm": 4.547211647033691, + "learning_rate": 1.9468704460052314e-06, + "loss": 0.1739, + "step": 28136 + }, + { + "epoch": 0.7120226737859655, + "grad_norm": 3.5518574714660645, + "learning_rate": 1.9465524824474828e-06, + "loss": 0.102, + "step": 28137 + }, + { + "epoch": 0.7120479793506592, + "grad_norm": 4.081811904907227, + "learning_rate": 1.9462345385808514e-06, + "loss": 0.1263, + "step": 28138 + }, + { + "epoch": 0.7120732849153529, + "grad_norm": 3.1198079586029053, + "learning_rate": 1.9459166144073832e-06, + "loss": 0.1385, + "step": 28139 + }, + { + "epoch": 0.7120985904800465, + "grad_norm": 5.813267707824707, + "learning_rate": 1.9455987099291345e-06, + "loss": 0.1558, + "step": 28140 + }, + { + "epoch": 0.7121238960447402, + "grad_norm": 4.869424819946289, + "learning_rate": 1.9452808251481482e-06, + "loss": 0.1897, + "step": 28141 + }, + { + "epoch": 0.712149201609434, + "grad_norm": 6.375072479248047, + "learning_rate": 1.9449629600664793e-06, + "loss": 0.2098, + "step": 28142 + }, + { + "epoch": 0.7121745071741276, + "grad_norm": 3.771116256713867, + "learning_rate": 1.9446451146861737e-06, + "loss": 0.1308, + "step": 28143 + }, + { + "epoch": 0.7121998127388213, + "grad_norm": 4.618716716766357, + "learning_rate": 1.9443272890092856e-06, + "loss": 0.1551, + "step": 28144 + }, + { + "epoch": 0.712225118303515, + "grad_norm": 6.179762840270996, + "learning_rate": 1.9440094830378625e-06, + "loss": 0.2345, + "step": 28145 + }, + { + "epoch": 0.7122504238682086, + "grad_norm": 7.18770170211792, + "learning_rate": 1.943691696773954e-06, + "loss": 0.1871, + "step": 28146 + }, + { + "epoch": 0.7122757294329023, + "grad_norm": 10.478750228881836, + "learning_rate": 1.943373930219607e-06, + "loss": 0.2681, + "step": 28147 + }, + { + "epoch": 0.712301034997596, + "grad_norm": 5.60552978515625, + "learning_rate": 1.943056183376875e-06, + "loss": 0.1381, + "step": 28148 + }, + { + "epoch": 0.7123263405622896, + "grad_norm": 8.384209632873535, + "learning_rate": 1.942738456247805e-06, + "loss": 0.1508, + "step": 28149 + }, + { + "epoch": 0.7123516461269833, + "grad_norm": 4.720183372497559, + "learning_rate": 1.942420748834446e-06, + "loss": 0.1963, + "step": 28150 + }, + { + "epoch": 0.712376951691677, + "grad_norm": 7.616551399230957, + "learning_rate": 1.9421030611388448e-06, + "loss": 0.1878, + "step": 28151 + }, + { + "epoch": 0.7124022572563706, + "grad_norm": 4.930202484130859, + "learning_rate": 1.9417853931630543e-06, + "loss": 0.067, + "step": 28152 + }, + { + "epoch": 0.7124275628210643, + "grad_norm": 2.296816825866699, + "learning_rate": 1.941467744909119e-06, + "loss": 0.1399, + "step": 28153 + }, + { + "epoch": 0.712452868385758, + "grad_norm": 3.533781051635742, + "learning_rate": 1.9411501163790926e-06, + "loss": 0.1449, + "step": 28154 + }, + { + "epoch": 0.7124781739504517, + "grad_norm": 4.833205699920654, + "learning_rate": 1.9408325075750168e-06, + "loss": 0.1336, + "step": 28155 + }, + { + "epoch": 0.7125034795151454, + "grad_norm": 7.218750476837158, + "learning_rate": 1.9405149184989448e-06, + "loss": 0.2535, + "step": 28156 + }, + { + "epoch": 0.7125287850798391, + "grad_norm": 4.363346576690674, + "learning_rate": 1.940197349152923e-06, + "loss": 0.1304, + "step": 28157 + }, + { + "epoch": 0.7125540906445327, + "grad_norm": 5.4342570304870605, + "learning_rate": 1.9398797995389996e-06, + "loss": 0.1851, + "step": 28158 + }, + { + "epoch": 0.7125793962092264, + "grad_norm": 4.2290472984313965, + "learning_rate": 1.9395622696592198e-06, + "loss": 0.1595, + "step": 28159 + }, + { + "epoch": 0.7126047017739201, + "grad_norm": 4.20151948928833, + "learning_rate": 1.9392447595156362e-06, + "loss": 0.1777, + "step": 28160 + }, + { + "epoch": 0.7126300073386138, + "grad_norm": 11.213654518127441, + "learning_rate": 1.9389272691102933e-06, + "loss": 0.2301, + "step": 28161 + }, + { + "epoch": 0.7126553129033074, + "grad_norm": 6.0715651512146, + "learning_rate": 1.9386097984452375e-06, + "loss": 0.2399, + "step": 28162 + }, + { + "epoch": 0.7126806184680011, + "grad_norm": 10.171701431274414, + "learning_rate": 1.9382923475225207e-06, + "loss": 0.3218, + "step": 28163 + }, + { + "epoch": 0.7127059240326948, + "grad_norm": 7.113612651824951, + "learning_rate": 1.9379749163441838e-06, + "loss": 0.2191, + "step": 28164 + }, + { + "epoch": 0.7127312295973884, + "grad_norm": 2.440734624862671, + "learning_rate": 1.9376575049122787e-06, + "loss": 0.0787, + "step": 28165 + }, + { + "epoch": 0.7127565351620821, + "grad_norm": 5.113902568817139, + "learning_rate": 1.9373401132288495e-06, + "loss": 0.1089, + "step": 28166 + }, + { + "epoch": 0.7127818407267759, + "grad_norm": 5.784403324127197, + "learning_rate": 1.937022741295947e-06, + "loss": 0.1557, + "step": 28167 + }, + { + "epoch": 0.7128071462914695, + "grad_norm": 3.7961552143096924, + "learning_rate": 1.936705389115612e-06, + "loss": 0.1462, + "step": 28168 + }, + { + "epoch": 0.7128324518561632, + "grad_norm": 6.392780780792236, + "learning_rate": 1.9363880566898957e-06, + "loss": 0.2453, + "step": 28169 + }, + { + "epoch": 0.7128577574208569, + "grad_norm": 4.418550968170166, + "learning_rate": 1.936070744020841e-06, + "loss": 0.1096, + "step": 28170 + }, + { + "epoch": 0.7128830629855505, + "grad_norm": 4.59968900680542, + "learning_rate": 1.9357534511105e-06, + "loss": 0.1492, + "step": 28171 + }, + { + "epoch": 0.7129083685502442, + "grad_norm": 4.157537460327148, + "learning_rate": 1.9354361779609114e-06, + "loss": 0.11, + "step": 28172 + }, + { + "epoch": 0.7129336741149379, + "grad_norm": 5.422491073608398, + "learning_rate": 1.935118924574127e-06, + "loss": 0.0858, + "step": 28173 + }, + { + "epoch": 0.7129589796796315, + "grad_norm": 7.481982707977295, + "learning_rate": 1.9348016909521884e-06, + "loss": 0.0886, + "step": 28174 + }, + { + "epoch": 0.7129842852443252, + "grad_norm": 17.603302001953125, + "learning_rate": 1.934484477097145e-06, + "loss": 0.2418, + "step": 28175 + }, + { + "epoch": 0.7130095908090189, + "grad_norm": 5.57342529296875, + "learning_rate": 1.9341672830110416e-06, + "loss": 0.1182, + "step": 28176 + }, + { + "epoch": 0.7130348963737125, + "grad_norm": 4.6304612159729, + "learning_rate": 1.9338501086959224e-06, + "loss": 0.1304, + "step": 28177 + }, + { + "epoch": 0.7130602019384062, + "grad_norm": 3.709664821624756, + "learning_rate": 1.9335329541538323e-06, + "loss": 0.0976, + "step": 28178 + }, + { + "epoch": 0.7130855075031, + "grad_norm": 16.787492752075195, + "learning_rate": 1.933215819386819e-06, + "loss": 0.2336, + "step": 28179 + }, + { + "epoch": 0.7131108130677936, + "grad_norm": 9.291462898254395, + "learning_rate": 1.9328987043969265e-06, + "loss": 0.302, + "step": 28180 + }, + { + "epoch": 0.7131361186324873, + "grad_norm": 8.514784812927246, + "learning_rate": 1.9325816091862003e-06, + "loss": 0.2131, + "step": 28181 + }, + { + "epoch": 0.713161424197181, + "grad_norm": 5.321095943450928, + "learning_rate": 1.9322645337566826e-06, + "loss": 0.1627, + "step": 28182 + }, + { + "epoch": 0.7131867297618746, + "grad_norm": 5.182331085205078, + "learning_rate": 1.931947478110422e-06, + "loss": 0.1876, + "step": 28183 + }, + { + "epoch": 0.7132120353265683, + "grad_norm": 4.389275550842285, + "learning_rate": 1.9316304422494613e-06, + "loss": 0.1661, + "step": 28184 + }, + { + "epoch": 0.713237340891262, + "grad_norm": 3.875260829925537, + "learning_rate": 1.9313134261758456e-06, + "loss": 0.1287, + "step": 28185 + }, + { + "epoch": 0.7132626464559557, + "grad_norm": 5.871897220611572, + "learning_rate": 1.930996429891617e-06, + "loss": 0.0794, + "step": 28186 + }, + { + "epoch": 0.7132879520206493, + "grad_norm": 6.600575923919678, + "learning_rate": 1.930679453398823e-06, + "loss": 0.1943, + "step": 28187 + }, + { + "epoch": 0.713313257585343, + "grad_norm": 6.371025085449219, + "learning_rate": 1.930362496699506e-06, + "loss": 0.0738, + "step": 28188 + }, + { + "epoch": 0.7133385631500367, + "grad_norm": 4.408493518829346, + "learning_rate": 1.9300455597957107e-06, + "loss": 0.1011, + "step": 28189 + }, + { + "epoch": 0.7133638687147303, + "grad_norm": 3.0489511489868164, + "learning_rate": 1.9297286426894783e-06, + "loss": 0.1147, + "step": 28190 + }, + { + "epoch": 0.713389174279424, + "grad_norm": 9.584400177001953, + "learning_rate": 1.929411745382857e-06, + "loss": 0.1847, + "step": 28191 + }, + { + "epoch": 0.7134144798441178, + "grad_norm": 10.169596672058105, + "learning_rate": 1.9290948678778882e-06, + "loss": 0.1945, + "step": 28192 + }, + { + "epoch": 0.7134397854088114, + "grad_norm": 5.502511501312256, + "learning_rate": 1.928778010176615e-06, + "loss": 0.208, + "step": 28193 + }, + { + "epoch": 0.7134650909735051, + "grad_norm": 6.527406692504883, + "learning_rate": 1.9284611722810815e-06, + "loss": 0.2009, + "step": 28194 + }, + { + "epoch": 0.7134903965381988, + "grad_norm": 3.337801694869995, + "learning_rate": 1.928144354193329e-06, + "loss": 0.1414, + "step": 28195 + }, + { + "epoch": 0.7135157021028924, + "grad_norm": 7.246627330780029, + "learning_rate": 1.927827555915403e-06, + "loss": 0.2118, + "step": 28196 + }, + { + "epoch": 0.7135410076675861, + "grad_norm": 3.7561163902282715, + "learning_rate": 1.9275107774493444e-06, + "loss": 0.104, + "step": 28197 + }, + { + "epoch": 0.7135663132322798, + "grad_norm": 6.444891452789307, + "learning_rate": 1.927194018797201e-06, + "loss": 0.2182, + "step": 28198 + }, + { + "epoch": 0.7135916187969734, + "grad_norm": 7.158185958862305, + "learning_rate": 1.926877279961008e-06, + "loss": 0.1735, + "step": 28199 + }, + { + "epoch": 0.7136169243616671, + "grad_norm": 6.161708354949951, + "learning_rate": 1.9265605609428133e-06, + "loss": 0.1944, + "step": 28200 + }, + { + "epoch": 0.7136422299263608, + "grad_norm": 3.428290367126465, + "learning_rate": 1.9262438617446568e-06, + "loss": 0.148, + "step": 28201 + }, + { + "epoch": 0.7136675354910544, + "grad_norm": 6.3162841796875, + "learning_rate": 1.925927182368585e-06, + "loss": 0.1963, + "step": 28202 + }, + { + "epoch": 0.7136928410557482, + "grad_norm": 9.90152645111084, + "learning_rate": 1.9256105228166334e-06, + "loss": 0.2175, + "step": 28203 + }, + { + "epoch": 0.7137181466204419, + "grad_norm": 11.071553230285645, + "learning_rate": 1.925293883090849e-06, + "loss": 0.1919, + "step": 28204 + }, + { + "epoch": 0.7137434521851355, + "grad_norm": 11.988381385803223, + "learning_rate": 1.9249772631932713e-06, + "loss": 0.2341, + "step": 28205 + }, + { + "epoch": 0.7137687577498292, + "grad_norm": 4.922354698181152, + "learning_rate": 1.9246606631259446e-06, + "loss": 0.2541, + "step": 28206 + }, + { + "epoch": 0.7137940633145229, + "grad_norm": 4.520717144012451, + "learning_rate": 1.9243440828909092e-06, + "loss": 0.1415, + "step": 28207 + }, + { + "epoch": 0.7138193688792165, + "grad_norm": 6.8345255851745605, + "learning_rate": 1.924027522490207e-06, + "loss": 0.1872, + "step": 28208 + }, + { + "epoch": 0.7138446744439102, + "grad_norm": 3.037522554397583, + "learning_rate": 1.9237109819258776e-06, + "loss": 0.1208, + "step": 28209 + }, + { + "epoch": 0.7138699800086039, + "grad_norm": 8.349685668945312, + "learning_rate": 1.9233944611999648e-06, + "loss": 0.1798, + "step": 28210 + }, + { + "epoch": 0.7138952855732976, + "grad_norm": 3.814526081085205, + "learning_rate": 1.92307796031451e-06, + "loss": 0.1255, + "step": 28211 + }, + { + "epoch": 0.7139205911379912, + "grad_norm": 3.371783494949341, + "learning_rate": 1.922761479271552e-06, + "loss": 0.1311, + "step": 28212 + }, + { + "epoch": 0.7139458967026849, + "grad_norm": 3.9135847091674805, + "learning_rate": 1.922445018073131e-06, + "loss": 0.172, + "step": 28213 + }, + { + "epoch": 0.7139712022673786, + "grad_norm": 5.512298583984375, + "learning_rate": 1.9221285767212923e-06, + "loss": 0.1964, + "step": 28214 + }, + { + "epoch": 0.7139965078320722, + "grad_norm": 5.015666484832764, + "learning_rate": 1.9218121552180734e-06, + "loss": 0.1832, + "step": 28215 + }, + { + "epoch": 0.714021813396766, + "grad_norm": 3.6373534202575684, + "learning_rate": 1.9214957535655155e-06, + "loss": 0.1139, + "step": 28216 + }, + { + "epoch": 0.7140471189614597, + "grad_norm": 4.979115962982178, + "learning_rate": 1.921179371765657e-06, + "loss": 0.1699, + "step": 28217 + }, + { + "epoch": 0.7140724245261533, + "grad_norm": 9.235906600952148, + "learning_rate": 1.9208630098205416e-06, + "loss": 0.2338, + "step": 28218 + }, + { + "epoch": 0.714097730090847, + "grad_norm": 3.980558395385742, + "learning_rate": 1.9205466677322075e-06, + "loss": 0.1225, + "step": 28219 + }, + { + "epoch": 0.7141230356555407, + "grad_norm": 4.166125297546387, + "learning_rate": 1.920230345502696e-06, + "loss": 0.1621, + "step": 28220 + }, + { + "epoch": 0.7141483412202343, + "grad_norm": 6.337276935577393, + "learning_rate": 1.9199140431340458e-06, + "loss": 0.1647, + "step": 28221 + }, + { + "epoch": 0.714173646784928, + "grad_norm": 3.551279306411743, + "learning_rate": 1.919597760628295e-06, + "loss": 0.1557, + "step": 28222 + }, + { + "epoch": 0.7141989523496217, + "grad_norm": 5.0492777824401855, + "learning_rate": 1.9192814979874868e-06, + "loss": 0.2214, + "step": 28223 + }, + { + "epoch": 0.7142242579143153, + "grad_norm": 3.2554566860198975, + "learning_rate": 1.9189652552136594e-06, + "loss": 0.113, + "step": 28224 + }, + { + "epoch": 0.714249563479009, + "grad_norm": 2.3817484378814697, + "learning_rate": 1.9186490323088525e-06, + "loss": 0.1065, + "step": 28225 + }, + { + "epoch": 0.7142748690437027, + "grad_norm": 5.618300914764404, + "learning_rate": 1.9183328292751026e-06, + "loss": 0.1738, + "step": 28226 + }, + { + "epoch": 0.7143001746083963, + "grad_norm": 12.456403732299805, + "learning_rate": 1.918016646114453e-06, + "loss": 0.1872, + "step": 28227 + }, + { + "epoch": 0.71432548017309, + "grad_norm": 2.8386898040771484, + "learning_rate": 1.9177004828289388e-06, + "loss": 0.1371, + "step": 28228 + }, + { + "epoch": 0.7143507857377838, + "grad_norm": 5.634754657745361, + "learning_rate": 1.9173843394206045e-06, + "loss": 0.2331, + "step": 28229 + }, + { + "epoch": 0.7143760913024774, + "grad_norm": 5.742089748382568, + "learning_rate": 1.9170682158914812e-06, + "loss": 0.1512, + "step": 28230 + }, + { + "epoch": 0.7144013968671711, + "grad_norm": 4.600972652435303, + "learning_rate": 1.9167521122436134e-06, + "loss": 0.1637, + "step": 28231 + }, + { + "epoch": 0.7144267024318648, + "grad_norm": 6.282110691070557, + "learning_rate": 1.9164360284790363e-06, + "loss": 0.1593, + "step": 28232 + }, + { + "epoch": 0.7144520079965584, + "grad_norm": 6.15587043762207, + "learning_rate": 1.916119964599793e-06, + "loss": 0.1508, + "step": 28233 + }, + { + "epoch": 0.7144773135612521, + "grad_norm": 4.700222969055176, + "learning_rate": 1.915803920607915e-06, + "loss": 0.167, + "step": 28234 + }, + { + "epoch": 0.7145026191259458, + "grad_norm": 4.189427375793457, + "learning_rate": 1.915487896505445e-06, + "loss": 0.1382, + "step": 28235 + }, + { + "epoch": 0.7145279246906395, + "grad_norm": 6.28739070892334, + "learning_rate": 1.915171892294419e-06, + "loss": 0.1235, + "step": 28236 + }, + { + "epoch": 0.7145532302553331, + "grad_norm": 5.78730583190918, + "learning_rate": 1.914855907976877e-06, + "loss": 0.0959, + "step": 28237 + }, + { + "epoch": 0.7145785358200268, + "grad_norm": 3.1329925060272217, + "learning_rate": 1.914539943554855e-06, + "loss": 0.1387, + "step": 28238 + }, + { + "epoch": 0.7146038413847206, + "grad_norm": 6.448723793029785, + "learning_rate": 1.9142239990303913e-06, + "loss": 0.1076, + "step": 28239 + }, + { + "epoch": 0.7146291469494142, + "grad_norm": 6.289495944976807, + "learning_rate": 1.913908074405521e-06, + "loss": 0.1583, + "step": 28240 + }, + { + "epoch": 0.7146544525141079, + "grad_norm": 8.323836326599121, + "learning_rate": 1.9135921696822855e-06, + "loss": 0.1456, + "step": 28241 + }, + { + "epoch": 0.7146797580788016, + "grad_norm": 3.8348708152770996, + "learning_rate": 1.91327628486272e-06, + "loss": 0.1957, + "step": 28242 + }, + { + "epoch": 0.7147050636434952, + "grad_norm": 3.835707187652588, + "learning_rate": 1.912960419948862e-06, + "loss": 0.1047, + "step": 28243 + }, + { + "epoch": 0.7147303692081889, + "grad_norm": 5.072199821472168, + "learning_rate": 1.9126445749427454e-06, + "loss": 0.1488, + "step": 28244 + }, + { + "epoch": 0.7147556747728826, + "grad_norm": 5.90158224105835, + "learning_rate": 1.9123287498464115e-06, + "loss": 0.1705, + "step": 28245 + }, + { + "epoch": 0.7147809803375762, + "grad_norm": 6.949391841888428, + "learning_rate": 1.9120129446618957e-06, + "loss": 0.1421, + "step": 28246 + }, + { + "epoch": 0.7148062859022699, + "grad_norm": 4.389175891876221, + "learning_rate": 1.9116971593912337e-06, + "loss": 0.1839, + "step": 28247 + }, + { + "epoch": 0.7148315914669636, + "grad_norm": 7.894745826721191, + "learning_rate": 1.9113813940364623e-06, + "loss": 0.1402, + "step": 28248 + }, + { + "epoch": 0.7148568970316572, + "grad_norm": 13.61465835571289, + "learning_rate": 1.911065648599616e-06, + "loss": 0.272, + "step": 28249 + }, + { + "epoch": 0.7148822025963509, + "grad_norm": 7.544378280639648, + "learning_rate": 1.910749923082734e-06, + "loss": 0.1614, + "step": 28250 + }, + { + "epoch": 0.7149075081610446, + "grad_norm": 2.9030911922454834, + "learning_rate": 1.910434217487852e-06, + "loss": 0.0804, + "step": 28251 + }, + { + "epoch": 0.7149328137257382, + "grad_norm": 13.806740760803223, + "learning_rate": 1.9101185318170045e-06, + "loss": 0.2004, + "step": 28252 + }, + { + "epoch": 0.714958119290432, + "grad_norm": 4.365433216094971, + "learning_rate": 1.909802866072226e-06, + "loss": 0.1864, + "step": 28253 + }, + { + "epoch": 0.7149834248551257, + "grad_norm": 4.436699390411377, + "learning_rate": 1.9094872202555565e-06, + "loss": 0.1592, + "step": 28254 + }, + { + "epoch": 0.7150087304198193, + "grad_norm": 2.6960678100585938, + "learning_rate": 1.909171594369028e-06, + "loss": 0.109, + "step": 28255 + }, + { + "epoch": 0.715034035984513, + "grad_norm": 4.059113502502441, + "learning_rate": 1.9088559884146773e-06, + "loss": 0.186, + "step": 28256 + }, + { + "epoch": 0.7150593415492067, + "grad_norm": 2.4608891010284424, + "learning_rate": 1.908540402394538e-06, + "loss": 0.1121, + "step": 28257 + }, + { + "epoch": 0.7150846471139003, + "grad_norm": 13.748725891113281, + "learning_rate": 1.908224836310648e-06, + "loss": 0.3178, + "step": 28258 + }, + { + "epoch": 0.715109952678594, + "grad_norm": 2.9158737659454346, + "learning_rate": 1.9079092901650393e-06, + "loss": 0.1073, + "step": 28259 + }, + { + "epoch": 0.7151352582432877, + "grad_norm": 4.229659557342529, + "learning_rate": 1.907593763959752e-06, + "loss": 0.0782, + "step": 28260 + }, + { + "epoch": 0.7151605638079813, + "grad_norm": 3.7356557846069336, + "learning_rate": 1.9072782576968135e-06, + "loss": 0.1071, + "step": 28261 + }, + { + "epoch": 0.715185869372675, + "grad_norm": 2.0312795639038086, + "learning_rate": 1.906962771378264e-06, + "loss": 0.0554, + "step": 28262 + }, + { + "epoch": 0.7152111749373687, + "grad_norm": 6.710771083831787, + "learning_rate": 1.9066473050061347e-06, + "loss": 0.2097, + "step": 28263 + }, + { + "epoch": 0.7152364805020625, + "grad_norm": 5.795202732086182, + "learning_rate": 1.9063318585824648e-06, + "loss": 0.1902, + "step": 28264 + }, + { + "epoch": 0.7152617860667561, + "grad_norm": 5.986335277557373, + "learning_rate": 1.9060164321092816e-06, + "loss": 0.1292, + "step": 28265 + }, + { + "epoch": 0.7152870916314498, + "grad_norm": 4.69256591796875, + "learning_rate": 1.9057010255886248e-06, + "loss": 0.1468, + "step": 28266 + }, + { + "epoch": 0.7153123971961435, + "grad_norm": 7.538695335388184, + "learning_rate": 1.905385639022525e-06, + "loss": 0.1934, + "step": 28267 + }, + { + "epoch": 0.7153377027608371, + "grad_norm": 3.3782806396484375, + "learning_rate": 1.9050702724130193e-06, + "loss": 0.1575, + "step": 28268 + }, + { + "epoch": 0.7153630083255308, + "grad_norm": 5.864090442657471, + "learning_rate": 1.9047549257621395e-06, + "loss": 0.2324, + "step": 28269 + }, + { + "epoch": 0.7153883138902245, + "grad_norm": 4.244979381561279, + "learning_rate": 1.9044395990719193e-06, + "loss": 0.1944, + "step": 28270 + }, + { + "epoch": 0.7154136194549181, + "grad_norm": 11.611136436462402, + "learning_rate": 1.9041242923443908e-06, + "loss": 0.1395, + "step": 28271 + }, + { + "epoch": 0.7154389250196118, + "grad_norm": 5.272716045379639, + "learning_rate": 1.903809005581591e-06, + "loss": 0.1967, + "step": 28272 + }, + { + "epoch": 0.7154642305843055, + "grad_norm": 10.860039710998535, + "learning_rate": 1.9034937387855507e-06, + "loss": 0.1494, + "step": 28273 + }, + { + "epoch": 0.7154895361489991, + "grad_norm": 5.814798355102539, + "learning_rate": 1.9031784919583029e-06, + "loss": 0.1867, + "step": 28274 + }, + { + "epoch": 0.7155148417136928, + "grad_norm": 3.5020411014556885, + "learning_rate": 1.9028632651018797e-06, + "loss": 0.1422, + "step": 28275 + }, + { + "epoch": 0.7155401472783866, + "grad_norm": 5.870186805725098, + "learning_rate": 1.9025480582183164e-06, + "loss": 0.1446, + "step": 28276 + }, + { + "epoch": 0.7155654528430802, + "grad_norm": 3.7210476398468018, + "learning_rate": 1.902232871309645e-06, + "loss": 0.1289, + "step": 28277 + }, + { + "epoch": 0.7155907584077739, + "grad_norm": 3.3096721172332764, + "learning_rate": 1.9019177043778974e-06, + "loss": 0.1329, + "step": 28278 + }, + { + "epoch": 0.7156160639724676, + "grad_norm": 7.48056697845459, + "learning_rate": 1.9016025574251063e-06, + "loss": 0.2056, + "step": 28279 + }, + { + "epoch": 0.7156413695371612, + "grad_norm": 2.7244486808776855, + "learning_rate": 1.9012874304533018e-06, + "loss": 0.0801, + "step": 28280 + }, + { + "epoch": 0.7156666751018549, + "grad_norm": 22.553373336791992, + "learning_rate": 1.9009723234645205e-06, + "loss": 0.4686, + "step": 28281 + }, + { + "epoch": 0.7156919806665486, + "grad_norm": 15.593057632446289, + "learning_rate": 1.9006572364607922e-06, + "loss": 0.2351, + "step": 28282 + }, + { + "epoch": 0.7157172862312422, + "grad_norm": 3.7710044384002686, + "learning_rate": 1.9003421694441487e-06, + "loss": 0.1499, + "step": 28283 + }, + { + "epoch": 0.7157425917959359, + "grad_norm": 8.119682312011719, + "learning_rate": 1.90002712241662e-06, + "loss": 0.1741, + "step": 28284 + }, + { + "epoch": 0.7157678973606296, + "grad_norm": 3.9611711502075195, + "learning_rate": 1.8997120953802417e-06, + "loss": 0.168, + "step": 28285 + }, + { + "epoch": 0.7157932029253232, + "grad_norm": 4.632025718688965, + "learning_rate": 1.899397088337044e-06, + "loss": 0.1956, + "step": 28286 + }, + { + "epoch": 0.7158185084900169, + "grad_norm": 3.3107471466064453, + "learning_rate": 1.899082101289057e-06, + "loss": 0.1263, + "step": 28287 + }, + { + "epoch": 0.7158438140547106, + "grad_norm": 5.129080772399902, + "learning_rate": 1.8987671342383107e-06, + "loss": 0.1716, + "step": 28288 + }, + { + "epoch": 0.7158691196194044, + "grad_norm": 2.3353261947631836, + "learning_rate": 1.8984521871868406e-06, + "loss": 0.0745, + "step": 28289 + }, + { + "epoch": 0.715894425184098, + "grad_norm": 3.814462423324585, + "learning_rate": 1.8981372601366738e-06, + "loss": 0.1359, + "step": 28290 + }, + { + "epoch": 0.7159197307487917, + "grad_norm": 3.4497203826904297, + "learning_rate": 1.897822353089846e-06, + "loss": 0.1525, + "step": 28291 + }, + { + "epoch": 0.7159450363134854, + "grad_norm": 9.13117504119873, + "learning_rate": 1.897507466048381e-06, + "loss": 0.2029, + "step": 28292 + }, + { + "epoch": 0.715970341878179, + "grad_norm": 2.5705857276916504, + "learning_rate": 1.8971925990143159e-06, + "loss": 0.1005, + "step": 28293 + }, + { + "epoch": 0.7159956474428727, + "grad_norm": 4.377396106719971, + "learning_rate": 1.896877751989676e-06, + "loss": 0.1258, + "step": 28294 + }, + { + "epoch": 0.7160209530075664, + "grad_norm": 4.564663410186768, + "learning_rate": 1.8965629249764976e-06, + "loss": 0.1992, + "step": 28295 + }, + { + "epoch": 0.71604625857226, + "grad_norm": 6.153204441070557, + "learning_rate": 1.896248117976805e-06, + "loss": 0.0938, + "step": 28296 + }, + { + "epoch": 0.7160715641369537, + "grad_norm": 6.576352596282959, + "learning_rate": 1.895933330992632e-06, + "loss": 0.152, + "step": 28297 + }, + { + "epoch": 0.7160968697016474, + "grad_norm": 5.563997745513916, + "learning_rate": 1.8956185640260061e-06, + "loss": 0.1557, + "step": 28298 + }, + { + "epoch": 0.716122175266341, + "grad_norm": 4.536087512969971, + "learning_rate": 1.8953038170789617e-06, + "loss": 0.1979, + "step": 28299 + }, + { + "epoch": 0.7161474808310347, + "grad_norm": 3.8357865810394287, + "learning_rate": 1.8949890901535223e-06, + "loss": 0.1224, + "step": 28300 + }, + { + "epoch": 0.7161727863957285, + "grad_norm": 3.257484197616577, + "learning_rate": 1.894674383251723e-06, + "loss": 0.1027, + "step": 28301 + }, + { + "epoch": 0.7161980919604221, + "grad_norm": 3.1523075103759766, + "learning_rate": 1.8943596963755883e-06, + "loss": 0.0909, + "step": 28302 + }, + { + "epoch": 0.7162233975251158, + "grad_norm": 2.685335159301758, + "learning_rate": 1.8940450295271524e-06, + "loss": 0.1013, + "step": 28303 + }, + { + "epoch": 0.7162487030898095, + "grad_norm": 10.840226173400879, + "learning_rate": 1.8937303827084425e-06, + "loss": 0.1823, + "step": 28304 + }, + { + "epoch": 0.7162740086545031, + "grad_norm": 13.928844451904297, + "learning_rate": 1.8934157559214872e-06, + "loss": 0.1486, + "step": 28305 + }, + { + "epoch": 0.7162993142191968, + "grad_norm": 4.062271595001221, + "learning_rate": 1.893101149168316e-06, + "loss": 0.1467, + "step": 28306 + }, + { + "epoch": 0.7163246197838905, + "grad_norm": 4.67542028427124, + "learning_rate": 1.8927865624509555e-06, + "loss": 0.178, + "step": 28307 + }, + { + "epoch": 0.7163499253485841, + "grad_norm": 5.207855224609375, + "learning_rate": 1.8924719957714383e-06, + "loss": 0.1478, + "step": 28308 + }, + { + "epoch": 0.7163752309132778, + "grad_norm": 8.13459587097168, + "learning_rate": 1.8921574491317917e-06, + "loss": 0.1367, + "step": 28309 + }, + { + "epoch": 0.7164005364779715, + "grad_norm": 6.4811224937438965, + "learning_rate": 1.891842922534043e-06, + "loss": 0.1698, + "step": 28310 + }, + { + "epoch": 0.7164258420426651, + "grad_norm": 4.5382161140441895, + "learning_rate": 1.8915284159802194e-06, + "loss": 0.1071, + "step": 28311 + }, + { + "epoch": 0.7164511476073588, + "grad_norm": 15.943755149841309, + "learning_rate": 1.891213929472353e-06, + "loss": 0.2349, + "step": 28312 + }, + { + "epoch": 0.7164764531720526, + "grad_norm": 4.138294696807861, + "learning_rate": 1.8908994630124694e-06, + "loss": 0.1263, + "step": 28313 + }, + { + "epoch": 0.7165017587367463, + "grad_norm": 6.968314170837402, + "learning_rate": 1.8905850166025968e-06, + "loss": 0.1635, + "step": 28314 + }, + { + "epoch": 0.7165270643014399, + "grad_norm": 6.634676933288574, + "learning_rate": 1.8902705902447616e-06, + "loss": 0.1219, + "step": 28315 + }, + { + "epoch": 0.7165523698661336, + "grad_norm": 3.829477071762085, + "learning_rate": 1.8899561839409946e-06, + "loss": 0.1001, + "step": 28316 + }, + { + "epoch": 0.7165776754308273, + "grad_norm": 6.720589637756348, + "learning_rate": 1.8896417976933212e-06, + "loss": 0.1447, + "step": 28317 + }, + { + "epoch": 0.7166029809955209, + "grad_norm": 5.6892876625061035, + "learning_rate": 1.8893274315037696e-06, + "loss": 0.1174, + "step": 28318 + }, + { + "epoch": 0.7166282865602146, + "grad_norm": 14.768128395080566, + "learning_rate": 1.8890130853743654e-06, + "loss": 0.1725, + "step": 28319 + }, + { + "epoch": 0.7166535921249083, + "grad_norm": 8.376444816589355, + "learning_rate": 1.8886987593071382e-06, + "loss": 0.2092, + "step": 28320 + }, + { + "epoch": 0.7166788976896019, + "grad_norm": 5.372442245483398, + "learning_rate": 1.888384453304114e-06, + "loss": 0.1713, + "step": 28321 + }, + { + "epoch": 0.7167042032542956, + "grad_norm": 3.8380069732666016, + "learning_rate": 1.8880701673673201e-06, + "loss": 0.1452, + "step": 28322 + }, + { + "epoch": 0.7167295088189893, + "grad_norm": 2.6317341327667236, + "learning_rate": 1.8877559014987812e-06, + "loss": 0.067, + "step": 28323 + }, + { + "epoch": 0.7167548143836829, + "grad_norm": 3.6670453548431396, + "learning_rate": 1.8874416557005265e-06, + "loss": 0.1712, + "step": 28324 + }, + { + "epoch": 0.7167801199483766, + "grad_norm": 7.702012062072754, + "learning_rate": 1.8871274299745806e-06, + "loss": 0.1656, + "step": 28325 + }, + { + "epoch": 0.7168054255130704, + "grad_norm": 6.492568016052246, + "learning_rate": 1.8868132243229743e-06, + "loss": 0.1898, + "step": 28326 + }, + { + "epoch": 0.716830731077764, + "grad_norm": 6.549969673156738, + "learning_rate": 1.8864990387477268e-06, + "loss": 0.1671, + "step": 28327 + }, + { + "epoch": 0.7168560366424577, + "grad_norm": 4.18451452255249, + "learning_rate": 1.8861848732508693e-06, + "loss": 0.1644, + "step": 28328 + }, + { + "epoch": 0.7168813422071514, + "grad_norm": 5.853819847106934, + "learning_rate": 1.8858707278344247e-06, + "loss": 0.1606, + "step": 28329 + }, + { + "epoch": 0.716906647771845, + "grad_norm": 5.295463562011719, + "learning_rate": 1.885556602500424e-06, + "loss": 0.2039, + "step": 28330 + }, + { + "epoch": 0.7169319533365387, + "grad_norm": 4.894177436828613, + "learning_rate": 1.885242497250886e-06, + "loss": 0.1858, + "step": 28331 + }, + { + "epoch": 0.7169572589012324, + "grad_norm": 7.1456098556518555, + "learning_rate": 1.8849284120878414e-06, + "loss": 0.2464, + "step": 28332 + }, + { + "epoch": 0.716982564465926, + "grad_norm": 5.577531337738037, + "learning_rate": 1.8846143470133143e-06, + "loss": 0.1936, + "step": 28333 + }, + { + "epoch": 0.7170078700306197, + "grad_norm": 7.771136283874512, + "learning_rate": 1.8843003020293276e-06, + "loss": 0.1425, + "step": 28334 + }, + { + "epoch": 0.7170331755953134, + "grad_norm": 3.849289655685425, + "learning_rate": 1.883986277137911e-06, + "loss": 0.1578, + "step": 28335 + }, + { + "epoch": 0.717058481160007, + "grad_norm": 3.96742582321167, + "learning_rate": 1.8836722723410867e-06, + "loss": 0.154, + "step": 28336 + }, + { + "epoch": 0.7170837867247007, + "grad_norm": 4.867325782775879, + "learning_rate": 1.8833582876408806e-06, + "loss": 0.131, + "step": 28337 + }, + { + "epoch": 0.7171090922893945, + "grad_norm": 4.132641792297363, + "learning_rate": 1.8830443230393152e-06, + "loss": 0.1486, + "step": 28338 + }, + { + "epoch": 0.7171343978540882, + "grad_norm": 6.059070587158203, + "learning_rate": 1.8827303785384193e-06, + "loss": 0.154, + "step": 28339 + }, + { + "epoch": 0.7171597034187818, + "grad_norm": 5.656655311584473, + "learning_rate": 1.882416454140215e-06, + "loss": 0.186, + "step": 28340 + }, + { + "epoch": 0.7171850089834755, + "grad_norm": 5.021988391876221, + "learning_rate": 1.8821025498467271e-06, + "loss": 0.1825, + "step": 28341 + }, + { + "epoch": 0.7172103145481692, + "grad_norm": 2.7650651931762695, + "learning_rate": 1.8817886656599782e-06, + "loss": 0.1375, + "step": 28342 + }, + { + "epoch": 0.7172356201128628, + "grad_norm": 4.426564693450928, + "learning_rate": 1.8814748015819962e-06, + "loss": 0.157, + "step": 28343 + }, + { + "epoch": 0.7172609256775565, + "grad_norm": 9.721358299255371, + "learning_rate": 1.8811609576148026e-06, + "loss": 0.2749, + "step": 28344 + }, + { + "epoch": 0.7172862312422502, + "grad_norm": 3.164597272872925, + "learning_rate": 1.8808471337604218e-06, + "loss": 0.0827, + "step": 28345 + }, + { + "epoch": 0.7173115368069438, + "grad_norm": 4.952771186828613, + "learning_rate": 1.8805333300208762e-06, + "loss": 0.1952, + "step": 28346 + }, + { + "epoch": 0.7173368423716375, + "grad_norm": 6.1882500648498535, + "learning_rate": 1.8802195463981925e-06, + "loss": 0.121, + "step": 28347 + }, + { + "epoch": 0.7173621479363312, + "grad_norm": 6.573483943939209, + "learning_rate": 1.8799057828943929e-06, + "loss": 0.1642, + "step": 28348 + }, + { + "epoch": 0.7173874535010248, + "grad_norm": 4.597225666046143, + "learning_rate": 1.8795920395115003e-06, + "loss": 0.122, + "step": 28349 + }, + { + "epoch": 0.7174127590657186, + "grad_norm": 2.6298906803131104, + "learning_rate": 1.8792783162515367e-06, + "loss": 0.0863, + "step": 28350 + }, + { + "epoch": 0.7174380646304123, + "grad_norm": 4.808533191680908, + "learning_rate": 1.8789646131165278e-06, + "loss": 0.1533, + "step": 28351 + }, + { + "epoch": 0.7174633701951059, + "grad_norm": 9.240838050842285, + "learning_rate": 1.878650930108496e-06, + "loss": 0.3417, + "step": 28352 + }, + { + "epoch": 0.7174886757597996, + "grad_norm": 5.248817443847656, + "learning_rate": 1.8783372672294636e-06, + "loss": 0.1078, + "step": 28353 + }, + { + "epoch": 0.7175139813244933, + "grad_norm": 4.886508464813232, + "learning_rate": 1.878023624481452e-06, + "loss": 0.166, + "step": 28354 + }, + { + "epoch": 0.7175392868891869, + "grad_norm": 7.653672218322754, + "learning_rate": 1.8777100018664868e-06, + "loss": 0.252, + "step": 28355 + }, + { + "epoch": 0.7175645924538806, + "grad_norm": 4.2181220054626465, + "learning_rate": 1.8773963993865873e-06, + "loss": 0.1614, + "step": 28356 + }, + { + "epoch": 0.7175898980185743, + "grad_norm": 6.458077907562256, + "learning_rate": 1.8770828170437817e-06, + "loss": 0.1649, + "step": 28357 + }, + { + "epoch": 0.7176152035832679, + "grad_norm": 4.581498622894287, + "learning_rate": 1.876769254840084e-06, + "loss": 0.1401, + "step": 28358 + }, + { + "epoch": 0.7176405091479616, + "grad_norm": 8.697737693786621, + "learning_rate": 1.8764557127775218e-06, + "loss": 0.3426, + "step": 28359 + }, + { + "epoch": 0.7176658147126553, + "grad_norm": 5.014634609222412, + "learning_rate": 1.8761421908581145e-06, + "loss": 0.1746, + "step": 28360 + }, + { + "epoch": 0.7176911202773489, + "grad_norm": 2.7681725025177, + "learning_rate": 1.8758286890838883e-06, + "loss": 0.1352, + "step": 28361 + }, + { + "epoch": 0.7177164258420426, + "grad_norm": 6.697761535644531, + "learning_rate": 1.875515207456859e-06, + "loss": 0.1239, + "step": 28362 + }, + { + "epoch": 0.7177417314067364, + "grad_norm": 3.2871055603027344, + "learning_rate": 1.8752017459790516e-06, + "loss": 0.1137, + "step": 28363 + }, + { + "epoch": 0.7177670369714301, + "grad_norm": 6.063668727874756, + "learning_rate": 1.8748883046524874e-06, + "loss": 0.1717, + "step": 28364 + }, + { + "epoch": 0.7177923425361237, + "grad_norm": 5.670388698577881, + "learning_rate": 1.8745748834791855e-06, + "loss": 0.2252, + "step": 28365 + }, + { + "epoch": 0.7178176481008174, + "grad_norm": 4.590519428253174, + "learning_rate": 1.8742614824611705e-06, + "loss": 0.1858, + "step": 28366 + }, + { + "epoch": 0.7178429536655111, + "grad_norm": 5.8719482421875, + "learning_rate": 1.8739481016004617e-06, + "loss": 0.1534, + "step": 28367 + }, + { + "epoch": 0.7178682592302047, + "grad_norm": 3.928915023803711, + "learning_rate": 1.8736347408990801e-06, + "loss": 0.1446, + "step": 28368 + }, + { + "epoch": 0.7178935647948984, + "grad_norm": 3.291722536087036, + "learning_rate": 1.873321400359045e-06, + "loss": 0.1411, + "step": 28369 + }, + { + "epoch": 0.7179188703595921, + "grad_norm": 6.615652561187744, + "learning_rate": 1.873008079982382e-06, + "loss": 0.187, + "step": 28370 + }, + { + "epoch": 0.7179441759242857, + "grad_norm": 4.273684501647949, + "learning_rate": 1.8726947797711048e-06, + "loss": 0.1244, + "step": 28371 + }, + { + "epoch": 0.7179694814889794, + "grad_norm": 7.391451835632324, + "learning_rate": 1.8723814997272387e-06, + "loss": 0.2147, + "step": 28372 + }, + { + "epoch": 0.7179947870536731, + "grad_norm": 8.379504203796387, + "learning_rate": 1.8720682398528005e-06, + "loss": 0.2002, + "step": 28373 + }, + { + "epoch": 0.7180200926183667, + "grad_norm": 8.628983497619629, + "learning_rate": 1.8717550001498165e-06, + "loss": 0.1648, + "step": 28374 + }, + { + "epoch": 0.7180453981830605, + "grad_norm": 4.6636552810668945, + "learning_rate": 1.8714417806202989e-06, + "loss": 0.137, + "step": 28375 + }, + { + "epoch": 0.7180707037477542, + "grad_norm": 6.958341598510742, + "learning_rate": 1.8711285812662733e-06, + "loss": 0.1557, + "step": 28376 + }, + { + "epoch": 0.7180960093124478, + "grad_norm": 12.060344696044922, + "learning_rate": 1.8708154020897551e-06, + "loss": 0.2692, + "step": 28377 + }, + { + "epoch": 0.7181213148771415, + "grad_norm": 3.0362343788146973, + "learning_rate": 1.8705022430927689e-06, + "loss": 0.127, + "step": 28378 + }, + { + "epoch": 0.7181466204418352, + "grad_norm": 4.248507499694824, + "learning_rate": 1.8701891042773306e-06, + "loss": 0.1885, + "step": 28379 + }, + { + "epoch": 0.7181719260065288, + "grad_norm": 9.630485534667969, + "learning_rate": 1.8698759856454612e-06, + "loss": 0.248, + "step": 28380 + }, + { + "epoch": 0.7181972315712225, + "grad_norm": 5.794442176818848, + "learning_rate": 1.8695628871991777e-06, + "loss": 0.1987, + "step": 28381 + }, + { + "epoch": 0.7182225371359162, + "grad_norm": 12.899496078491211, + "learning_rate": 1.869249808940502e-06, + "loss": 0.2198, + "step": 28382 + }, + { + "epoch": 0.7182478427006098, + "grad_norm": 7.119398593902588, + "learning_rate": 1.8689367508714524e-06, + "loss": 0.2849, + "step": 28383 + }, + { + "epoch": 0.7182731482653035, + "grad_norm": 3.804441213607788, + "learning_rate": 1.8686237129940472e-06, + "loss": 0.1589, + "step": 28384 + }, + { + "epoch": 0.7182984538299972, + "grad_norm": 3.3197124004364014, + "learning_rate": 1.8683106953103037e-06, + "loss": 0.165, + "step": 28385 + }, + { + "epoch": 0.7183237593946908, + "grad_norm": 5.209563255310059, + "learning_rate": 1.867997697822243e-06, + "loss": 0.1336, + "step": 28386 + }, + { + "epoch": 0.7183490649593846, + "grad_norm": 6.018011093139648, + "learning_rate": 1.8676847205318816e-06, + "loss": 0.2567, + "step": 28387 + }, + { + "epoch": 0.7183743705240783, + "grad_norm": 4.835184097290039, + "learning_rate": 1.8673717634412413e-06, + "loss": 0.1974, + "step": 28388 + }, + { + "epoch": 0.7183996760887719, + "grad_norm": 9.809652328491211, + "learning_rate": 1.8670588265523354e-06, + "loss": 0.1925, + "step": 28389 + }, + { + "epoch": 0.7184249816534656, + "grad_norm": 17.949787139892578, + "learning_rate": 1.8667459098671854e-06, + "loss": 0.2799, + "step": 28390 + }, + { + "epoch": 0.7184502872181593, + "grad_norm": 3.696338653564453, + "learning_rate": 1.8664330133878084e-06, + "loss": 0.1814, + "step": 28391 + }, + { + "epoch": 0.718475592782853, + "grad_norm": 5.169393539428711, + "learning_rate": 1.866120137116222e-06, + "loss": 0.1696, + "step": 28392 + }, + { + "epoch": 0.7185008983475466, + "grad_norm": 14.635518074035645, + "learning_rate": 1.865807281054442e-06, + "loss": 0.2126, + "step": 28393 + }, + { + "epoch": 0.7185262039122403, + "grad_norm": 5.460235595703125, + "learning_rate": 1.8654944452044899e-06, + "loss": 0.1514, + "step": 28394 + }, + { + "epoch": 0.718551509476934, + "grad_norm": 5.437661170959473, + "learning_rate": 1.8651816295683806e-06, + "loss": 0.1984, + "step": 28395 + }, + { + "epoch": 0.7185768150416276, + "grad_norm": 4.874887943267822, + "learning_rate": 1.8648688341481308e-06, + "loss": 0.1871, + "step": 28396 + }, + { + "epoch": 0.7186021206063213, + "grad_norm": 7.228704929351807, + "learning_rate": 1.8645560589457595e-06, + "loss": 0.168, + "step": 28397 + }, + { + "epoch": 0.718627426171015, + "grad_norm": 3.939836263656616, + "learning_rate": 1.864243303963284e-06, + "loss": 0.1682, + "step": 28398 + }, + { + "epoch": 0.7186527317357086, + "grad_norm": 3.451172113418579, + "learning_rate": 1.8639305692027192e-06, + "loss": 0.1193, + "step": 28399 + }, + { + "epoch": 0.7186780373004024, + "grad_norm": 7.134212493896484, + "learning_rate": 1.8636178546660815e-06, + "loss": 0.183, + "step": 28400 + }, + { + "epoch": 0.7187033428650961, + "grad_norm": 11.80572509765625, + "learning_rate": 1.8633051603553925e-06, + "loss": 0.3867, + "step": 28401 + }, + { + "epoch": 0.7187286484297897, + "grad_norm": 4.337023735046387, + "learning_rate": 1.8629924862726612e-06, + "loss": 0.0889, + "step": 28402 + }, + { + "epoch": 0.7187539539944834, + "grad_norm": 8.179454803466797, + "learning_rate": 1.8626798324199097e-06, + "loss": 0.1729, + "step": 28403 + }, + { + "epoch": 0.7187792595591771, + "grad_norm": 4.1907758712768555, + "learning_rate": 1.862367198799151e-06, + "loss": 0.1393, + "step": 28404 + }, + { + "epoch": 0.7188045651238707, + "grad_norm": 11.472085952758789, + "learning_rate": 1.8620545854124056e-06, + "loss": 0.2178, + "step": 28405 + }, + { + "epoch": 0.7188298706885644, + "grad_norm": 4.363588333129883, + "learning_rate": 1.861741992261683e-06, + "loss": 0.1372, + "step": 28406 + }, + { + "epoch": 0.7188551762532581, + "grad_norm": 5.0393967628479, + "learning_rate": 1.8614294193490046e-06, + "loss": 0.1657, + "step": 28407 + }, + { + "epoch": 0.7188804818179517, + "grad_norm": 14.604937553405762, + "learning_rate": 1.861116866676382e-06, + "loss": 0.198, + "step": 28408 + }, + { + "epoch": 0.7189057873826454, + "grad_norm": 3.0310351848602295, + "learning_rate": 1.8608043342458349e-06, + "loss": 0.1235, + "step": 28409 + }, + { + "epoch": 0.7189310929473391, + "grad_norm": 2.8051929473876953, + "learning_rate": 1.8604918220593766e-06, + "loss": 0.0657, + "step": 28410 + }, + { + "epoch": 0.7189563985120327, + "grad_norm": 4.693206310272217, + "learning_rate": 1.8601793301190225e-06, + "loss": 0.139, + "step": 28411 + }, + { + "epoch": 0.7189817040767265, + "grad_norm": 4.318929195404053, + "learning_rate": 1.8598668584267858e-06, + "loss": 0.1358, + "step": 28412 + }, + { + "epoch": 0.7190070096414202, + "grad_norm": 3.6267662048339844, + "learning_rate": 1.8595544069846855e-06, + "loss": 0.1156, + "step": 28413 + }, + { + "epoch": 0.7190323152061138, + "grad_norm": 2.891923666000366, + "learning_rate": 1.859241975794735e-06, + "loss": 0.136, + "step": 28414 + }, + { + "epoch": 0.7190576207708075, + "grad_norm": 14.245699882507324, + "learning_rate": 1.8589295648589484e-06, + "loss": 0.2578, + "step": 28415 + }, + { + "epoch": 0.7190829263355012, + "grad_norm": 6.338452339172363, + "learning_rate": 1.8586171741793397e-06, + "loss": 0.1941, + "step": 28416 + }, + { + "epoch": 0.7191082319001949, + "grad_norm": 5.673150062561035, + "learning_rate": 1.8583048037579255e-06, + "loss": 0.1735, + "step": 28417 + }, + { + "epoch": 0.7191335374648885, + "grad_norm": 5.026712417602539, + "learning_rate": 1.8579924535967191e-06, + "loss": 0.164, + "step": 28418 + }, + { + "epoch": 0.7191588430295822, + "grad_norm": 4.952262878417969, + "learning_rate": 1.8576801236977354e-06, + "loss": 0.1143, + "step": 28419 + }, + { + "epoch": 0.7191841485942759, + "grad_norm": 4.7224273681640625, + "learning_rate": 1.8573678140629863e-06, + "loss": 0.1253, + "step": 28420 + }, + { + "epoch": 0.7192094541589695, + "grad_norm": 4.3426690101623535, + "learning_rate": 1.8570555246944893e-06, + "loss": 0.1492, + "step": 28421 + }, + { + "epoch": 0.7192347597236632, + "grad_norm": 4.526956558227539, + "learning_rate": 1.8567432555942565e-06, + "loss": 0.1853, + "step": 28422 + }, + { + "epoch": 0.719260065288357, + "grad_norm": 16.94331932067871, + "learning_rate": 1.8564310067643015e-06, + "loss": 0.2456, + "step": 28423 + }, + { + "epoch": 0.7192853708530506, + "grad_norm": 4.202381134033203, + "learning_rate": 1.856118778206637e-06, + "loss": 0.1698, + "step": 28424 + }, + { + "epoch": 0.7193106764177443, + "grad_norm": 6.924922943115234, + "learning_rate": 1.8558065699232791e-06, + "loss": 0.252, + "step": 28425 + }, + { + "epoch": 0.719335981982438, + "grad_norm": 11.021299362182617, + "learning_rate": 1.8554943819162397e-06, + "loss": 0.2798, + "step": 28426 + }, + { + "epoch": 0.7193612875471316, + "grad_norm": 4.4058122634887695, + "learning_rate": 1.8551822141875326e-06, + "loss": 0.193, + "step": 28427 + }, + { + "epoch": 0.7193865931118253, + "grad_norm": 11.474566459655762, + "learning_rate": 1.85487006673917e-06, + "loss": 0.2454, + "step": 28428 + }, + { + "epoch": 0.719411898676519, + "grad_norm": 5.518731594085693, + "learning_rate": 1.8545579395731633e-06, + "loss": 0.2054, + "step": 28429 + }, + { + "epoch": 0.7194372042412126, + "grad_norm": 11.122907638549805, + "learning_rate": 1.8542458326915298e-06, + "loss": 0.2905, + "step": 28430 + }, + { + "epoch": 0.7194625098059063, + "grad_norm": 3.2303624153137207, + "learning_rate": 1.8539337460962775e-06, + "loss": 0.1249, + "step": 28431 + }, + { + "epoch": 0.7194878153706, + "grad_norm": 6.184597492218018, + "learning_rate": 1.8536216797894246e-06, + "loss": 0.1655, + "step": 28432 + }, + { + "epoch": 0.7195131209352936, + "grad_norm": 2.9510226249694824, + "learning_rate": 1.8533096337729773e-06, + "loss": 0.1344, + "step": 28433 + }, + { + "epoch": 0.7195384264999873, + "grad_norm": 2.941756010055542, + "learning_rate": 1.852997608048952e-06, + "loss": 0.1164, + "step": 28434 + }, + { + "epoch": 0.719563732064681, + "grad_norm": 4.904983997344971, + "learning_rate": 1.8526856026193585e-06, + "loss": 0.1812, + "step": 28435 + }, + { + "epoch": 0.7195890376293747, + "grad_norm": 5.428506374359131, + "learning_rate": 1.8523736174862134e-06, + "loss": 0.1126, + "step": 28436 + }, + { + "epoch": 0.7196143431940684, + "grad_norm": 10.143088340759277, + "learning_rate": 1.8520616526515217e-06, + "loss": 0.1334, + "step": 28437 + }, + { + "epoch": 0.7196396487587621, + "grad_norm": 4.653232574462891, + "learning_rate": 1.8517497081173008e-06, + "loss": 0.1439, + "step": 28438 + }, + { + "epoch": 0.7196649543234557, + "grad_norm": 7.0871806144714355, + "learning_rate": 1.8514377838855585e-06, + "loss": 0.1887, + "step": 28439 + }, + { + "epoch": 0.7196902598881494, + "grad_norm": 5.698578834533691, + "learning_rate": 1.85112587995831e-06, + "loss": 0.1857, + "step": 28440 + }, + { + "epoch": 0.7197155654528431, + "grad_norm": 7.16355037689209, + "learning_rate": 1.850813996337565e-06, + "loss": 0.2535, + "step": 28441 + }, + { + "epoch": 0.7197408710175368, + "grad_norm": 6.1701884269714355, + "learning_rate": 1.8505021330253347e-06, + "loss": 0.1511, + "step": 28442 + }, + { + "epoch": 0.7197661765822304, + "grad_norm": 2.813535213470459, + "learning_rate": 1.8501902900236284e-06, + "loss": 0.1227, + "step": 28443 + }, + { + "epoch": 0.7197914821469241, + "grad_norm": 3.069441556930542, + "learning_rate": 1.8498784673344611e-06, + "loss": 0.1283, + "step": 28444 + }, + { + "epoch": 0.7198167877116178, + "grad_norm": 3.656043767929077, + "learning_rate": 1.8495666649598415e-06, + "loss": 0.1228, + "step": 28445 + }, + { + "epoch": 0.7198420932763114, + "grad_norm": 10.69615364074707, + "learning_rate": 1.8492548829017803e-06, + "loss": 0.2745, + "step": 28446 + }, + { + "epoch": 0.7198673988410051, + "grad_norm": 3.0946106910705566, + "learning_rate": 1.8489431211622867e-06, + "loss": 0.1534, + "step": 28447 + }, + { + "epoch": 0.7198927044056989, + "grad_norm": 3.38957142829895, + "learning_rate": 1.8486313797433747e-06, + "loss": 0.0894, + "step": 28448 + }, + { + "epoch": 0.7199180099703925, + "grad_norm": 3.7890355587005615, + "learning_rate": 1.8483196586470526e-06, + "loss": 0.1249, + "step": 28449 + }, + { + "epoch": 0.7199433155350862, + "grad_norm": 2.9815542697906494, + "learning_rate": 1.8480079578753307e-06, + "loss": 0.1031, + "step": 28450 + }, + { + "epoch": 0.7199686210997799, + "grad_norm": 7.168084144592285, + "learning_rate": 1.847696277430217e-06, + "loss": 0.1675, + "step": 28451 + }, + { + "epoch": 0.7199939266644735, + "grad_norm": 3.2105801105499268, + "learning_rate": 1.847384617313726e-06, + "loss": 0.1134, + "step": 28452 + }, + { + "epoch": 0.7200192322291672, + "grad_norm": 4.427044868469238, + "learning_rate": 1.847072977527865e-06, + "loss": 0.1631, + "step": 28453 + }, + { + "epoch": 0.7200445377938609, + "grad_norm": 6.157121658325195, + "learning_rate": 1.8467613580746436e-06, + "loss": 0.2027, + "step": 28454 + }, + { + "epoch": 0.7200698433585545, + "grad_norm": 6.454799652099609, + "learning_rate": 1.846449758956072e-06, + "loss": 0.1402, + "step": 28455 + }, + { + "epoch": 0.7200951489232482, + "grad_norm": 3.611921787261963, + "learning_rate": 1.8461381801741574e-06, + "loss": 0.1304, + "step": 28456 + }, + { + "epoch": 0.7201204544879419, + "grad_norm": 4.62498140335083, + "learning_rate": 1.8458266217309129e-06, + "loss": 0.1407, + "step": 28457 + }, + { + "epoch": 0.7201457600526355, + "grad_norm": 12.359807968139648, + "learning_rate": 1.8455150836283453e-06, + "loss": 0.3178, + "step": 28458 + }, + { + "epoch": 0.7201710656173292, + "grad_norm": 4.497908592224121, + "learning_rate": 1.8452035658684646e-06, + "loss": 0.1326, + "step": 28459 + }, + { + "epoch": 0.720196371182023, + "grad_norm": 3.5586936473846436, + "learning_rate": 1.8448920684532778e-06, + "loss": 0.1241, + "step": 28460 + }, + { + "epoch": 0.7202216767467166, + "grad_norm": 10.356684684753418, + "learning_rate": 1.8445805913847964e-06, + "loss": 0.2988, + "step": 28461 + }, + { + "epoch": 0.7202469823114103, + "grad_norm": 3.9876527786254883, + "learning_rate": 1.844269134665026e-06, + "loss": 0.1462, + "step": 28462 + }, + { + "epoch": 0.720272287876104, + "grad_norm": 5.348513126373291, + "learning_rate": 1.8439576982959805e-06, + "loss": 0.1211, + "step": 28463 + }, + { + "epoch": 0.7202975934407976, + "grad_norm": 5.95869255065918, + "learning_rate": 1.8436462822796619e-06, + "loss": 0.176, + "step": 28464 + }, + { + "epoch": 0.7203228990054913, + "grad_norm": 11.816481590270996, + "learning_rate": 1.8433348866180822e-06, + "loss": 0.2236, + "step": 28465 + }, + { + "epoch": 0.720348204570185, + "grad_norm": 3.241025924682617, + "learning_rate": 1.8430235113132472e-06, + "loss": 0.1459, + "step": 28466 + }, + { + "epoch": 0.7203735101348787, + "grad_norm": 8.060383796691895, + "learning_rate": 1.8427121563671701e-06, + "loss": 0.211, + "step": 28467 + }, + { + "epoch": 0.7203988156995723, + "grad_norm": 7.01523494720459, + "learning_rate": 1.8424008217818513e-06, + "loss": 0.2272, + "step": 28468 + }, + { + "epoch": 0.720424121264266, + "grad_norm": 8.48310661315918, + "learning_rate": 1.8420895075593038e-06, + "loss": 0.1618, + "step": 28469 + }, + { + "epoch": 0.7204494268289597, + "grad_norm": 5.286013126373291, + "learning_rate": 1.8417782137015321e-06, + "loss": 0.1647, + "step": 28470 + }, + { + "epoch": 0.7204747323936533, + "grad_norm": 4.742404937744141, + "learning_rate": 1.8414669402105468e-06, + "loss": 0.1234, + "step": 28471 + }, + { + "epoch": 0.720500037958347, + "grad_norm": 3.5956270694732666, + "learning_rate": 1.841155687088354e-06, + "loss": 0.1651, + "step": 28472 + }, + { + "epoch": 0.7205253435230408, + "grad_norm": 6.360150337219238, + "learning_rate": 1.8408444543369602e-06, + "loss": 0.2028, + "step": 28473 + }, + { + "epoch": 0.7205506490877344, + "grad_norm": 4.738397598266602, + "learning_rate": 1.8405332419583716e-06, + "loss": 0.154, + "step": 28474 + }, + { + "epoch": 0.7205759546524281, + "grad_norm": 3.7055587768554688, + "learning_rate": 1.8402220499545975e-06, + "loss": 0.1913, + "step": 28475 + }, + { + "epoch": 0.7206012602171218, + "grad_norm": 9.438624382019043, + "learning_rate": 1.8399108783276442e-06, + "loss": 0.3124, + "step": 28476 + }, + { + "epoch": 0.7206265657818154, + "grad_norm": 13.604743003845215, + "learning_rate": 1.8395997270795174e-06, + "loss": 0.203, + "step": 28477 + }, + { + "epoch": 0.7206518713465091, + "grad_norm": 4.425403118133545, + "learning_rate": 1.8392885962122225e-06, + "loss": 0.1686, + "step": 28478 + }, + { + "epoch": 0.7206771769112028, + "grad_norm": 7.81725549697876, + "learning_rate": 1.8389774857277693e-06, + "loss": 0.1923, + "step": 28479 + }, + { + "epoch": 0.7207024824758964, + "grad_norm": 4.949837684631348, + "learning_rate": 1.8386663956281624e-06, + "loss": 0.1213, + "step": 28480 + }, + { + "epoch": 0.7207277880405901, + "grad_norm": 7.6554951667785645, + "learning_rate": 1.838355325915408e-06, + "loss": 0.2038, + "step": 28481 + }, + { + "epoch": 0.7207530936052838, + "grad_norm": 3.5309150218963623, + "learning_rate": 1.8380442765915097e-06, + "loss": 0.1581, + "step": 28482 + }, + { + "epoch": 0.7207783991699774, + "grad_norm": 3.6785924434661865, + "learning_rate": 1.8377332476584781e-06, + "loss": 0.1224, + "step": 28483 + }, + { + "epoch": 0.7208037047346711, + "grad_norm": 12.473527908325195, + "learning_rate": 1.8374222391183162e-06, + "loss": 0.245, + "step": 28484 + }, + { + "epoch": 0.7208290102993649, + "grad_norm": 5.771941184997559, + "learning_rate": 1.8371112509730298e-06, + "loss": 0.15, + "step": 28485 + }, + { + "epoch": 0.7208543158640585, + "grad_norm": 7.700422763824463, + "learning_rate": 1.8368002832246245e-06, + "loss": 0.1772, + "step": 28486 + }, + { + "epoch": 0.7208796214287522, + "grad_norm": 5.258686542510986, + "learning_rate": 1.8364893358751045e-06, + "loss": 0.1704, + "step": 28487 + }, + { + "epoch": 0.7209049269934459, + "grad_norm": 5.1427836418151855, + "learning_rate": 1.8361784089264778e-06, + "loss": 0.1014, + "step": 28488 + }, + { + "epoch": 0.7209302325581395, + "grad_norm": 10.50079345703125, + "learning_rate": 1.8358675023807482e-06, + "loss": 0.2409, + "step": 28489 + }, + { + "epoch": 0.7209555381228332, + "grad_norm": 7.50250768661499, + "learning_rate": 1.8355566162399202e-06, + "loss": 0.1396, + "step": 28490 + }, + { + "epoch": 0.7209808436875269, + "grad_norm": 4.2772722244262695, + "learning_rate": 1.8352457505059978e-06, + "loss": 0.1246, + "step": 28491 + }, + { + "epoch": 0.7210061492522206, + "grad_norm": 4.336503505706787, + "learning_rate": 1.8349349051809884e-06, + "loss": 0.1102, + "step": 28492 + }, + { + "epoch": 0.7210314548169142, + "grad_norm": 3.933609962463379, + "learning_rate": 1.834624080266893e-06, + "loss": 0.1785, + "step": 28493 + }, + { + "epoch": 0.7210567603816079, + "grad_norm": 4.683567523956299, + "learning_rate": 1.8343132757657217e-06, + "loss": 0.1763, + "step": 28494 + }, + { + "epoch": 0.7210820659463016, + "grad_norm": 9.374969482421875, + "learning_rate": 1.8340024916794718e-06, + "loss": 0.1297, + "step": 28495 + }, + { + "epoch": 0.7211073715109952, + "grad_norm": 12.992776870727539, + "learning_rate": 1.8336917280101523e-06, + "loss": 0.2623, + "step": 28496 + }, + { + "epoch": 0.721132677075689, + "grad_norm": 2.788222551345825, + "learning_rate": 1.8333809847597644e-06, + "loss": 0.1295, + "step": 28497 + }, + { + "epoch": 0.7211579826403827, + "grad_norm": 2.3717050552368164, + "learning_rate": 1.833070261930317e-06, + "loss": 0.0926, + "step": 28498 + }, + { + "epoch": 0.7211832882050763, + "grad_norm": 7.731057643890381, + "learning_rate": 1.8327595595238073e-06, + "loss": 0.127, + "step": 28499 + }, + { + "epoch": 0.72120859376977, + "grad_norm": 8.551441192626953, + "learning_rate": 1.8324488775422434e-06, + "loss": 0.146, + "step": 28500 + }, + { + "epoch": 0.7212338993344637, + "grad_norm": 4.438869476318359, + "learning_rate": 1.8321382159876256e-06, + "loss": 0.1408, + "step": 28501 + }, + { + "epoch": 0.7212592048991573, + "grad_norm": 3.4833359718322754, + "learning_rate": 1.831827574861963e-06, + "loss": 0.1649, + "step": 28502 + }, + { + "epoch": 0.721284510463851, + "grad_norm": 7.597642421722412, + "learning_rate": 1.8315169541672511e-06, + "loss": 0.1925, + "step": 28503 + }, + { + "epoch": 0.7213098160285447, + "grad_norm": 9.256031036376953, + "learning_rate": 1.8312063539054992e-06, + "loss": 0.1299, + "step": 28504 + }, + { + "epoch": 0.7213351215932383, + "grad_norm": 2.178306818008423, + "learning_rate": 1.830895774078706e-06, + "loss": 0.081, + "step": 28505 + }, + { + "epoch": 0.721360427157932, + "grad_norm": 3.2468347549438477, + "learning_rate": 1.8305852146888782e-06, + "loss": 0.0994, + "step": 28506 + }, + { + "epoch": 0.7213857327226257, + "grad_norm": 3.7366061210632324, + "learning_rate": 1.8302746757380169e-06, + "loss": 0.1649, + "step": 28507 + }, + { + "epoch": 0.7214110382873193, + "grad_norm": 2.704352855682373, + "learning_rate": 1.8299641572281246e-06, + "loss": 0.1351, + "step": 28508 + }, + { + "epoch": 0.721436343852013, + "grad_norm": 2.630826950073242, + "learning_rate": 1.8296536591612024e-06, + "loss": 0.1307, + "step": 28509 + }, + { + "epoch": 0.7214616494167068, + "grad_norm": 9.849867820739746, + "learning_rate": 1.8293431815392553e-06, + "loss": 0.3204, + "step": 28510 + }, + { + "epoch": 0.7214869549814004, + "grad_norm": 7.9093098640441895, + "learning_rate": 1.8290327243642842e-06, + "loss": 0.2224, + "step": 28511 + }, + { + "epoch": 0.7215122605460941, + "grad_norm": 4.238132953643799, + "learning_rate": 1.8287222876382914e-06, + "loss": 0.1202, + "step": 28512 + }, + { + "epoch": 0.7215375661107878, + "grad_norm": 6.4348978996276855, + "learning_rate": 1.8284118713632788e-06, + "loss": 0.1304, + "step": 28513 + }, + { + "epoch": 0.7215628716754814, + "grad_norm": 2.3878214359283447, + "learning_rate": 1.8281014755412463e-06, + "loss": 0.0962, + "step": 28514 + }, + { + "epoch": 0.7215881772401751, + "grad_norm": 6.483441352844238, + "learning_rate": 1.8277911001741989e-06, + "loss": 0.1847, + "step": 28515 + }, + { + "epoch": 0.7216134828048688, + "grad_norm": 3.443251371383667, + "learning_rate": 1.8274807452641362e-06, + "loss": 0.1423, + "step": 28516 + }, + { + "epoch": 0.7216387883695624, + "grad_norm": 8.004011154174805, + "learning_rate": 1.827170410813061e-06, + "loss": 0.1708, + "step": 28517 + }, + { + "epoch": 0.7216640939342561, + "grad_norm": 6.132866859436035, + "learning_rate": 1.8268600968229716e-06, + "loss": 0.1822, + "step": 28518 + }, + { + "epoch": 0.7216893994989498, + "grad_norm": 30.959434509277344, + "learning_rate": 1.8265498032958723e-06, + "loss": 0.1459, + "step": 28519 + }, + { + "epoch": 0.7217147050636435, + "grad_norm": 4.888284683227539, + "learning_rate": 1.8262395302337637e-06, + "loss": 0.0988, + "step": 28520 + }, + { + "epoch": 0.7217400106283371, + "grad_norm": 4.182897090911865, + "learning_rate": 1.8259292776386457e-06, + "loss": 0.1915, + "step": 28521 + }, + { + "epoch": 0.7217653161930309, + "grad_norm": 7.7468695640563965, + "learning_rate": 1.8256190455125177e-06, + "loss": 0.227, + "step": 28522 + }, + { + "epoch": 0.7217906217577246, + "grad_norm": 5.4320878982543945, + "learning_rate": 1.8253088338573838e-06, + "loss": 0.1926, + "step": 28523 + }, + { + "epoch": 0.7218159273224182, + "grad_norm": 4.542702674865723, + "learning_rate": 1.824998642675241e-06, + "loss": 0.1257, + "step": 28524 + }, + { + "epoch": 0.7218412328871119, + "grad_norm": 7.5004754066467285, + "learning_rate": 1.8246884719680946e-06, + "loss": 0.2415, + "step": 28525 + }, + { + "epoch": 0.7218665384518056, + "grad_norm": 6.987526893615723, + "learning_rate": 1.8243783217379386e-06, + "loss": 0.1457, + "step": 28526 + }, + { + "epoch": 0.7218918440164992, + "grad_norm": 3.3198235034942627, + "learning_rate": 1.824068191986777e-06, + "loss": 0.1187, + "step": 28527 + }, + { + "epoch": 0.7219171495811929, + "grad_norm": 5.983216285705566, + "learning_rate": 1.8237580827166079e-06, + "loss": 0.1288, + "step": 28528 + }, + { + "epoch": 0.7219424551458866, + "grad_norm": 4.188172340393066, + "learning_rate": 1.8234479939294352e-06, + "loss": 0.13, + "step": 28529 + }, + { + "epoch": 0.7219677607105802, + "grad_norm": 4.045888423919678, + "learning_rate": 1.8231379256272518e-06, + "loss": 0.1196, + "step": 28530 + }, + { + "epoch": 0.7219930662752739, + "grad_norm": 5.287557125091553, + "learning_rate": 1.8228278778120634e-06, + "loss": 0.1624, + "step": 28531 + }, + { + "epoch": 0.7220183718399676, + "grad_norm": 9.707844734191895, + "learning_rate": 1.8225178504858647e-06, + "loss": 0.2894, + "step": 28532 + }, + { + "epoch": 0.7220436774046612, + "grad_norm": 3.446795701980591, + "learning_rate": 1.8222078436506607e-06, + "loss": 0.1771, + "step": 28533 + }, + { + "epoch": 0.722068982969355, + "grad_norm": 2.9013149738311768, + "learning_rate": 1.8218978573084445e-06, + "loss": 0.0911, + "step": 28534 + }, + { + "epoch": 0.7220942885340487, + "grad_norm": 5.335407733917236, + "learning_rate": 1.821587891461219e-06, + "loss": 0.1556, + "step": 28535 + }, + { + "epoch": 0.7221195940987423, + "grad_norm": 4.536136627197266, + "learning_rate": 1.8212779461109802e-06, + "loss": 0.1746, + "step": 28536 + }, + { + "epoch": 0.722144899663436, + "grad_norm": 4.41999626159668, + "learning_rate": 1.8209680212597303e-06, + "loss": 0.1863, + "step": 28537 + }, + { + "epoch": 0.7221702052281297, + "grad_norm": 3.70393443107605, + "learning_rate": 1.8206581169094668e-06, + "loss": 0.1648, + "step": 28538 + }, + { + "epoch": 0.7221955107928233, + "grad_norm": 3.2685585021972656, + "learning_rate": 1.8203482330621874e-06, + "loss": 0.1006, + "step": 28539 + }, + { + "epoch": 0.722220816357517, + "grad_norm": 3.3836958408355713, + "learning_rate": 1.8200383697198904e-06, + "loss": 0.147, + "step": 28540 + }, + { + "epoch": 0.7222461219222107, + "grad_norm": 4.986032962799072, + "learning_rate": 1.8197285268845733e-06, + "loss": 0.1694, + "step": 28541 + }, + { + "epoch": 0.7222714274869043, + "grad_norm": 5.9615478515625, + "learning_rate": 1.8194187045582368e-06, + "loss": 0.1577, + "step": 28542 + }, + { + "epoch": 0.722296733051598, + "grad_norm": 3.459221839904785, + "learning_rate": 1.8191089027428777e-06, + "loss": 0.1722, + "step": 28543 + }, + { + "epoch": 0.7223220386162917, + "grad_norm": 9.24765682220459, + "learning_rate": 1.8187991214404933e-06, + "loss": 0.2151, + "step": 28544 + }, + { + "epoch": 0.7223473441809855, + "grad_norm": 8.332695960998535, + "learning_rate": 1.8184893606530795e-06, + "loss": 0.2446, + "step": 28545 + }, + { + "epoch": 0.722372649745679, + "grad_norm": 3.1486308574676514, + "learning_rate": 1.8181796203826384e-06, + "loss": 0.1351, + "step": 28546 + }, + { + "epoch": 0.7223979553103728, + "grad_norm": 2.3015522956848145, + "learning_rate": 1.8178699006311652e-06, + "loss": 0.1087, + "step": 28547 + }, + { + "epoch": 0.7224232608750665, + "grad_norm": 6.943406581878662, + "learning_rate": 1.8175602014006567e-06, + "loss": 0.2594, + "step": 28548 + }, + { + "epoch": 0.7224485664397601, + "grad_norm": 5.801580429077148, + "learning_rate": 1.8172505226931087e-06, + "loss": 0.1455, + "step": 28549 + }, + { + "epoch": 0.7224738720044538, + "grad_norm": 5.213698387145996, + "learning_rate": 1.816940864510522e-06, + "loss": 0.159, + "step": 28550 + }, + { + "epoch": 0.7224991775691475, + "grad_norm": 3.351597309112549, + "learning_rate": 1.8166312268548914e-06, + "loss": 0.1388, + "step": 28551 + }, + { + "epoch": 0.7225244831338411, + "grad_norm": 4.181617259979248, + "learning_rate": 1.8163216097282137e-06, + "loss": 0.1485, + "step": 28552 + }, + { + "epoch": 0.7225497886985348, + "grad_norm": 6.261523246765137, + "learning_rate": 1.8160120131324843e-06, + "loss": 0.1134, + "step": 28553 + }, + { + "epoch": 0.7225750942632285, + "grad_norm": 3.646728038787842, + "learning_rate": 1.8157024370697029e-06, + "loss": 0.1263, + "step": 28554 + }, + { + "epoch": 0.7226003998279221, + "grad_norm": 8.52981185913086, + "learning_rate": 1.8153928815418643e-06, + "loss": 0.1526, + "step": 28555 + }, + { + "epoch": 0.7226257053926158, + "grad_norm": 4.4615478515625, + "learning_rate": 1.8150833465509644e-06, + "loss": 0.1844, + "step": 28556 + }, + { + "epoch": 0.7226510109573095, + "grad_norm": 3.9177908897399902, + "learning_rate": 1.8147738320989977e-06, + "loss": 0.1203, + "step": 28557 + }, + { + "epoch": 0.7226763165220031, + "grad_norm": 2.600287437438965, + "learning_rate": 1.814464338187964e-06, + "loss": 0.136, + "step": 28558 + }, + { + "epoch": 0.7227016220866969, + "grad_norm": 4.503363609313965, + "learning_rate": 1.8141548648198554e-06, + "loss": 0.155, + "step": 28559 + }, + { + "epoch": 0.7227269276513906, + "grad_norm": 6.374213218688965, + "learning_rate": 1.8138454119966725e-06, + "loss": 0.1744, + "step": 28560 + }, + { + "epoch": 0.7227522332160842, + "grad_norm": 10.352349281311035, + "learning_rate": 1.813535979720405e-06, + "loss": 0.207, + "step": 28561 + }, + { + "epoch": 0.7227775387807779, + "grad_norm": 3.1473145484924316, + "learning_rate": 1.8132265679930527e-06, + "loss": 0.0877, + "step": 28562 + }, + { + "epoch": 0.7228028443454716, + "grad_norm": 2.0048041343688965, + "learning_rate": 1.8129171768166082e-06, + "loss": 0.0936, + "step": 28563 + }, + { + "epoch": 0.7228281499101652, + "grad_norm": 4.455023765563965, + "learning_rate": 1.8126078061930707e-06, + "loss": 0.2185, + "step": 28564 + }, + { + "epoch": 0.7228534554748589, + "grad_norm": 3.1356353759765625, + "learning_rate": 1.81229845612443e-06, + "loss": 0.1107, + "step": 28565 + }, + { + "epoch": 0.7228787610395526, + "grad_norm": 2.6089673042297363, + "learning_rate": 1.8119891266126849e-06, + "loss": 0.1263, + "step": 28566 + }, + { + "epoch": 0.7229040666042462, + "grad_norm": 3.8822758197784424, + "learning_rate": 1.8116798176598265e-06, + "loss": 0.1502, + "step": 28567 + }, + { + "epoch": 0.7229293721689399, + "grad_norm": 5.576426982879639, + "learning_rate": 1.8113705292678545e-06, + "loss": 0.1914, + "step": 28568 + }, + { + "epoch": 0.7229546777336336, + "grad_norm": 3.828425407409668, + "learning_rate": 1.8110612614387606e-06, + "loss": 0.1603, + "step": 28569 + }, + { + "epoch": 0.7229799832983274, + "grad_norm": 4.273329257965088, + "learning_rate": 1.8107520141745395e-06, + "loss": 0.1405, + "step": 28570 + }, + { + "epoch": 0.723005288863021, + "grad_norm": 3.7339560985565186, + "learning_rate": 1.810442787477185e-06, + "loss": 0.1807, + "step": 28571 + }, + { + "epoch": 0.7230305944277147, + "grad_norm": 4.23921012878418, + "learning_rate": 1.8101335813486904e-06, + "loss": 0.1328, + "step": 28572 + }, + { + "epoch": 0.7230558999924084, + "grad_norm": 6.375322341918945, + "learning_rate": 1.8098243957910526e-06, + "loss": 0.1838, + "step": 28573 + }, + { + "epoch": 0.723081205557102, + "grad_norm": 5.251835346221924, + "learning_rate": 1.8095152308062635e-06, + "loss": 0.0685, + "step": 28574 + }, + { + "epoch": 0.7231065111217957, + "grad_norm": 7.2895941734313965, + "learning_rate": 1.8092060863963179e-06, + "loss": 0.181, + "step": 28575 + }, + { + "epoch": 0.7231318166864894, + "grad_norm": 3.259963035583496, + "learning_rate": 1.8088969625632063e-06, + "loss": 0.1258, + "step": 28576 + }, + { + "epoch": 0.723157122251183, + "grad_norm": 3.8977162837982178, + "learning_rate": 1.8085878593089284e-06, + "loss": 0.0956, + "step": 28577 + }, + { + "epoch": 0.7231824278158767, + "grad_norm": 5.525108337402344, + "learning_rate": 1.8082787766354704e-06, + "loss": 0.2006, + "step": 28578 + }, + { + "epoch": 0.7232077333805704, + "grad_norm": 3.73214054107666, + "learning_rate": 1.8079697145448306e-06, + "loss": 0.1571, + "step": 28579 + }, + { + "epoch": 0.723233038945264, + "grad_norm": 4.352193832397461, + "learning_rate": 1.8076606730389979e-06, + "loss": 0.1328, + "step": 28580 + }, + { + "epoch": 0.7232583445099577, + "grad_norm": 7.086665153503418, + "learning_rate": 1.8073516521199702e-06, + "loss": 0.1544, + "step": 28581 + }, + { + "epoch": 0.7232836500746515, + "grad_norm": 3.8343405723571777, + "learning_rate": 1.8070426517897378e-06, + "loss": 0.102, + "step": 28582 + }, + { + "epoch": 0.723308955639345, + "grad_norm": 3.06093430519104, + "learning_rate": 1.8067336720502932e-06, + "loss": 0.1357, + "step": 28583 + }, + { + "epoch": 0.7233342612040388, + "grad_norm": 8.78647232055664, + "learning_rate": 1.8064247129036273e-06, + "loss": 0.2092, + "step": 28584 + }, + { + "epoch": 0.7233595667687325, + "grad_norm": 7.685041427612305, + "learning_rate": 1.806115774351736e-06, + "loss": 0.1957, + "step": 28585 + }, + { + "epoch": 0.7233848723334261, + "grad_norm": 5.4328293800354, + "learning_rate": 1.8058068563966103e-06, + "loss": 0.2132, + "step": 28586 + }, + { + "epoch": 0.7234101778981198, + "grad_norm": 4.7867817878723145, + "learning_rate": 1.8054979590402417e-06, + "loss": 0.1222, + "step": 28587 + }, + { + "epoch": 0.7234354834628135, + "grad_norm": 2.895434617996216, + "learning_rate": 1.8051890822846214e-06, + "loss": 0.1461, + "step": 28588 + }, + { + "epoch": 0.7234607890275071, + "grad_norm": 5.52997350692749, + "learning_rate": 1.8048802261317434e-06, + "loss": 0.1806, + "step": 28589 + }, + { + "epoch": 0.7234860945922008, + "grad_norm": 13.047873497009277, + "learning_rate": 1.804571390583597e-06, + "loss": 0.2934, + "step": 28590 + }, + { + "epoch": 0.7235114001568945, + "grad_norm": 3.2030651569366455, + "learning_rate": 1.8042625756421788e-06, + "loss": 0.1072, + "step": 28591 + }, + { + "epoch": 0.7235367057215881, + "grad_norm": 3.407486915588379, + "learning_rate": 1.8039537813094738e-06, + "loss": 0.0862, + "step": 28592 + }, + { + "epoch": 0.7235620112862818, + "grad_norm": 3.6976566314697266, + "learning_rate": 1.8036450075874774e-06, + "loss": 0.0904, + "step": 28593 + }, + { + "epoch": 0.7235873168509755, + "grad_norm": 4.538573265075684, + "learning_rate": 1.8033362544781784e-06, + "loss": 0.1385, + "step": 28594 + }, + { + "epoch": 0.7236126224156693, + "grad_norm": 3.582286834716797, + "learning_rate": 1.803027521983573e-06, + "loss": 0.1648, + "step": 28595 + }, + { + "epoch": 0.7236379279803629, + "grad_norm": 2.956348180770874, + "learning_rate": 1.8027188101056442e-06, + "loss": 0.1174, + "step": 28596 + }, + { + "epoch": 0.7236632335450566, + "grad_norm": 7.734994411468506, + "learning_rate": 1.8024101188463894e-06, + "loss": 0.2652, + "step": 28597 + }, + { + "epoch": 0.7236885391097503, + "grad_norm": 4.328552722930908, + "learning_rate": 1.802101448207797e-06, + "loss": 0.1135, + "step": 28598 + }, + { + "epoch": 0.7237138446744439, + "grad_norm": 6.195249080657959, + "learning_rate": 1.8017927981918555e-06, + "loss": 0.2002, + "step": 28599 + }, + { + "epoch": 0.7237391502391376, + "grad_norm": 6.430355548858643, + "learning_rate": 1.8014841688005592e-06, + "loss": 0.1391, + "step": 28600 + }, + { + "epoch": 0.7237644558038313, + "grad_norm": 4.226507186889648, + "learning_rate": 1.801175560035896e-06, + "loss": 0.1368, + "step": 28601 + }, + { + "epoch": 0.7237897613685249, + "grad_norm": 7.389667987823486, + "learning_rate": 1.8008669718998572e-06, + "loss": 0.1432, + "step": 28602 + }, + { + "epoch": 0.7238150669332186, + "grad_norm": 7.427626609802246, + "learning_rate": 1.8005584043944303e-06, + "loss": 0.2044, + "step": 28603 + }, + { + "epoch": 0.7238403724979123, + "grad_norm": 12.345739364624023, + "learning_rate": 1.8002498575216087e-06, + "loss": 0.2195, + "step": 28604 + }, + { + "epoch": 0.7238656780626059, + "grad_norm": 4.292882442474365, + "learning_rate": 1.7999413312833808e-06, + "loss": 0.1635, + "step": 28605 + }, + { + "epoch": 0.7238909836272996, + "grad_norm": 7.3383378982543945, + "learning_rate": 1.7996328256817358e-06, + "loss": 0.1269, + "step": 28606 + }, + { + "epoch": 0.7239162891919934, + "grad_norm": 4.443477630615234, + "learning_rate": 1.7993243407186622e-06, + "loss": 0.1536, + "step": 28607 + }, + { + "epoch": 0.723941594756687, + "grad_norm": 10.029160499572754, + "learning_rate": 1.7990158763961535e-06, + "loss": 0.1917, + "step": 28608 + }, + { + "epoch": 0.7239669003213807, + "grad_norm": 6.132066249847412, + "learning_rate": 1.7987074327161925e-06, + "loss": 0.1645, + "step": 28609 + }, + { + "epoch": 0.7239922058860744, + "grad_norm": 2.9445149898529053, + "learning_rate": 1.7983990096807734e-06, + "loss": 0.0824, + "step": 28610 + }, + { + "epoch": 0.724017511450768, + "grad_norm": 3.8238744735717773, + "learning_rate": 1.7980906072918824e-06, + "loss": 0.1175, + "step": 28611 + }, + { + "epoch": 0.7240428170154617, + "grad_norm": 10.684366226196289, + "learning_rate": 1.797782225551511e-06, + "loss": 0.1665, + "step": 28612 + }, + { + "epoch": 0.7240681225801554, + "grad_norm": 4.300736904144287, + "learning_rate": 1.7974738644616457e-06, + "loss": 0.1608, + "step": 28613 + }, + { + "epoch": 0.724093428144849, + "grad_norm": 2.3686468601226807, + "learning_rate": 1.7971655240242764e-06, + "loss": 0.1313, + "step": 28614 + }, + { + "epoch": 0.7241187337095427, + "grad_norm": 4.8757219314575195, + "learning_rate": 1.796857204241389e-06, + "loss": 0.1041, + "step": 28615 + }, + { + "epoch": 0.7241440392742364, + "grad_norm": 6.551374435424805, + "learning_rate": 1.7965489051149753e-06, + "loss": 0.167, + "step": 28616 + }, + { + "epoch": 0.72416934483893, + "grad_norm": 5.936295032501221, + "learning_rate": 1.7962406266470223e-06, + "loss": 0.1708, + "step": 28617 + }, + { + "epoch": 0.7241946504036237, + "grad_norm": 7.947498321533203, + "learning_rate": 1.795932368839517e-06, + "loss": 0.2062, + "step": 28618 + }, + { + "epoch": 0.7242199559683175, + "grad_norm": 2.9372313022613525, + "learning_rate": 1.795624131694446e-06, + "loss": 0.0882, + "step": 28619 + }, + { + "epoch": 0.7242452615330112, + "grad_norm": 7.826504230499268, + "learning_rate": 1.795315915213801e-06, + "loss": 0.1347, + "step": 28620 + }, + { + "epoch": 0.7242705670977048, + "grad_norm": 5.994015216827393, + "learning_rate": 1.7950077193995657e-06, + "loss": 0.1746, + "step": 28621 + }, + { + "epoch": 0.7242958726623985, + "grad_norm": 5.579618453979492, + "learning_rate": 1.794699544253733e-06, + "loss": 0.1313, + "step": 28622 + }, + { + "epoch": 0.7243211782270922, + "grad_norm": 4.185733795166016, + "learning_rate": 1.7943913897782832e-06, + "loss": 0.1745, + "step": 28623 + }, + { + "epoch": 0.7243464837917858, + "grad_norm": 5.58916711807251, + "learning_rate": 1.7940832559752086e-06, + "loss": 0.1486, + "step": 28624 + }, + { + "epoch": 0.7243717893564795, + "grad_norm": 6.372666358947754, + "learning_rate": 1.793775142846495e-06, + "loss": 0.1394, + "step": 28625 + }, + { + "epoch": 0.7243970949211732, + "grad_norm": 5.398963451385498, + "learning_rate": 1.7934670503941293e-06, + "loss": 0.1145, + "step": 28626 + }, + { + "epoch": 0.7244224004858668, + "grad_norm": 3.321396827697754, + "learning_rate": 1.793158978620096e-06, + "loss": 0.107, + "step": 28627 + }, + { + "epoch": 0.7244477060505605, + "grad_norm": 3.643507242202759, + "learning_rate": 1.7928509275263857e-06, + "loss": 0.1661, + "step": 28628 + }, + { + "epoch": 0.7244730116152542, + "grad_norm": 3.9611010551452637, + "learning_rate": 1.7925428971149833e-06, + "loss": 0.1445, + "step": 28629 + }, + { + "epoch": 0.7244983171799478, + "grad_norm": 9.740272521972656, + "learning_rate": 1.7922348873878748e-06, + "loss": 0.1867, + "step": 28630 + }, + { + "epoch": 0.7245236227446415, + "grad_norm": 4.438622951507568, + "learning_rate": 1.7919268983470456e-06, + "loss": 0.1493, + "step": 28631 + }, + { + "epoch": 0.7245489283093353, + "grad_norm": 8.345865249633789, + "learning_rate": 1.791618929994484e-06, + "loss": 0.2028, + "step": 28632 + }, + { + "epoch": 0.7245742338740289, + "grad_norm": 10.156182289123535, + "learning_rate": 1.7913109823321762e-06, + "loss": 0.2337, + "step": 28633 + }, + { + "epoch": 0.7245995394387226, + "grad_norm": 4.838545322418213, + "learning_rate": 1.791003055362105e-06, + "loss": 0.1519, + "step": 28634 + }, + { + "epoch": 0.7246248450034163, + "grad_norm": 5.964797019958496, + "learning_rate": 1.7906951490862611e-06, + "loss": 0.1269, + "step": 28635 + }, + { + "epoch": 0.7246501505681099, + "grad_norm": 5.134415149688721, + "learning_rate": 1.7903872635066239e-06, + "loss": 0.1679, + "step": 28636 + }, + { + "epoch": 0.7246754561328036, + "grad_norm": 6.733438491821289, + "learning_rate": 1.790079398625184e-06, + "loss": 0.1828, + "step": 28637 + }, + { + "epoch": 0.7247007616974973, + "grad_norm": 3.2782280445098877, + "learning_rate": 1.7897715544439238e-06, + "loss": 0.1482, + "step": 28638 + }, + { + "epoch": 0.7247260672621909, + "grad_norm": 11.182462692260742, + "learning_rate": 1.7894637309648323e-06, + "loss": 0.214, + "step": 28639 + }, + { + "epoch": 0.7247513728268846, + "grad_norm": 9.615509033203125, + "learning_rate": 1.789155928189889e-06, + "loss": 0.1757, + "step": 28640 + }, + { + "epoch": 0.7247766783915783, + "grad_norm": 3.203760862350464, + "learning_rate": 1.7888481461210832e-06, + "loss": 0.1083, + "step": 28641 + }, + { + "epoch": 0.7248019839562719, + "grad_norm": 7.166743278503418, + "learning_rate": 1.7885403847603972e-06, + "loss": 0.1541, + "step": 28642 + }, + { + "epoch": 0.7248272895209656, + "grad_norm": 7.7438063621521, + "learning_rate": 1.7882326441098186e-06, + "loss": 0.0813, + "step": 28643 + }, + { + "epoch": 0.7248525950856594, + "grad_norm": 4.865283966064453, + "learning_rate": 1.7879249241713298e-06, + "loss": 0.098, + "step": 28644 + }, + { + "epoch": 0.724877900650353, + "grad_norm": 4.012600898742676, + "learning_rate": 1.787617224946916e-06, + "loss": 0.1218, + "step": 28645 + }, + { + "epoch": 0.7249032062150467, + "grad_norm": 7.870745658874512, + "learning_rate": 1.7873095464385593e-06, + "loss": 0.2442, + "step": 28646 + }, + { + "epoch": 0.7249285117797404, + "grad_norm": 7.94211483001709, + "learning_rate": 1.7870018886482477e-06, + "loss": 0.1726, + "step": 28647 + }, + { + "epoch": 0.7249538173444341, + "grad_norm": 5.923593997955322, + "learning_rate": 1.7866942515779627e-06, + "loss": 0.1987, + "step": 28648 + }, + { + "epoch": 0.7249791229091277, + "grad_norm": 3.5903961658477783, + "learning_rate": 1.7863866352296894e-06, + "loss": 0.1334, + "step": 28649 + }, + { + "epoch": 0.7250044284738214, + "grad_norm": 2.883424758911133, + "learning_rate": 1.7860790396054084e-06, + "loss": 0.1126, + "step": 28650 + }, + { + "epoch": 0.7250297340385151, + "grad_norm": 10.768028259277344, + "learning_rate": 1.7857714647071083e-06, + "loss": 0.1807, + "step": 28651 + }, + { + "epoch": 0.7250550396032087, + "grad_norm": 6.53812313079834, + "learning_rate": 1.78546391053677e-06, + "loss": 0.1686, + "step": 28652 + }, + { + "epoch": 0.7250803451679024, + "grad_norm": 11.276952743530273, + "learning_rate": 1.7851563770963764e-06, + "loss": 0.2199, + "step": 28653 + }, + { + "epoch": 0.7251056507325961, + "grad_norm": 4.4145188331604, + "learning_rate": 1.7848488643879102e-06, + "loss": 0.1171, + "step": 28654 + }, + { + "epoch": 0.7251309562972897, + "grad_norm": 5.71908712387085, + "learning_rate": 1.7845413724133575e-06, + "loss": 0.1918, + "step": 28655 + }, + { + "epoch": 0.7251562618619835, + "grad_norm": 18.865596771240234, + "learning_rate": 1.784233901174699e-06, + "loss": 0.085, + "step": 28656 + }, + { + "epoch": 0.7251815674266772, + "grad_norm": 10.914033889770508, + "learning_rate": 1.7839264506739178e-06, + "loss": 0.1542, + "step": 28657 + }, + { + "epoch": 0.7252068729913708, + "grad_norm": 4.295619964599609, + "learning_rate": 1.783619020912995e-06, + "loss": 0.1455, + "step": 28658 + }, + { + "epoch": 0.7252321785560645, + "grad_norm": 8.685193061828613, + "learning_rate": 1.783311611893917e-06, + "loss": 0.193, + "step": 28659 + }, + { + "epoch": 0.7252574841207582, + "grad_norm": 4.89422082901001, + "learning_rate": 1.7830042236186634e-06, + "loss": 0.1451, + "step": 28660 + }, + { + "epoch": 0.7252827896854518, + "grad_norm": 5.971798896789551, + "learning_rate": 1.7826968560892177e-06, + "loss": 0.1648, + "step": 28661 + }, + { + "epoch": 0.7253080952501455, + "grad_norm": 3.2149691581726074, + "learning_rate": 1.782389509307561e-06, + "loss": 0.0965, + "step": 28662 + }, + { + "epoch": 0.7253334008148392, + "grad_norm": 3.384722948074341, + "learning_rate": 1.7820821832756746e-06, + "loss": 0.1501, + "step": 28663 + }, + { + "epoch": 0.7253587063795328, + "grad_norm": 6.937658309936523, + "learning_rate": 1.7817748779955429e-06, + "loss": 0.2068, + "step": 28664 + }, + { + "epoch": 0.7253840119442265, + "grad_norm": 4.19320821762085, + "learning_rate": 1.781467593469145e-06, + "loss": 0.1454, + "step": 28665 + }, + { + "epoch": 0.7254093175089202, + "grad_norm": 4.070904731750488, + "learning_rate": 1.7811603296984676e-06, + "loss": 0.1685, + "step": 28666 + }, + { + "epoch": 0.7254346230736138, + "grad_norm": 3.5507283210754395, + "learning_rate": 1.7808530866854845e-06, + "loss": 0.1334, + "step": 28667 + }, + { + "epoch": 0.7254599286383075, + "grad_norm": 9.306869506835938, + "learning_rate": 1.7805458644321828e-06, + "loss": 0.0888, + "step": 28668 + }, + { + "epoch": 0.7254852342030013, + "grad_norm": 7.893295764923096, + "learning_rate": 1.7802386629405405e-06, + "loss": 0.2413, + "step": 28669 + }, + { + "epoch": 0.7255105397676949, + "grad_norm": 4.669315338134766, + "learning_rate": 1.7799314822125436e-06, + "loss": 0.1547, + "step": 28670 + }, + { + "epoch": 0.7255358453323886, + "grad_norm": 8.902542114257812, + "learning_rate": 1.7796243222501659e-06, + "loss": 0.1049, + "step": 28671 + }, + { + "epoch": 0.7255611508970823, + "grad_norm": 2.514234781265259, + "learning_rate": 1.7793171830553934e-06, + "loss": 0.0741, + "step": 28672 + }, + { + "epoch": 0.725586456461776, + "grad_norm": 2.9575603008270264, + "learning_rate": 1.7790100646302045e-06, + "loss": 0.0986, + "step": 28673 + }, + { + "epoch": 0.7256117620264696, + "grad_norm": 5.166785717010498, + "learning_rate": 1.7787029669765815e-06, + "loss": 0.0671, + "step": 28674 + }, + { + "epoch": 0.7256370675911633, + "grad_norm": 6.744928359985352, + "learning_rate": 1.7783958900965036e-06, + "loss": 0.135, + "step": 28675 + }, + { + "epoch": 0.725662373155857, + "grad_norm": 3.1233267784118652, + "learning_rate": 1.7780888339919522e-06, + "loss": 0.1279, + "step": 28676 + }, + { + "epoch": 0.7256876787205506, + "grad_norm": 2.1148107051849365, + "learning_rate": 1.7777817986649043e-06, + "loss": 0.0678, + "step": 28677 + }, + { + "epoch": 0.7257129842852443, + "grad_norm": 9.484763145446777, + "learning_rate": 1.777474784117344e-06, + "loss": 0.2646, + "step": 28678 + }, + { + "epoch": 0.725738289849938, + "grad_norm": 12.951456069946289, + "learning_rate": 1.7771677903512497e-06, + "loss": 0.1746, + "step": 28679 + }, + { + "epoch": 0.7257635954146316, + "grad_norm": 4.080517768859863, + "learning_rate": 1.7768608173686003e-06, + "loss": 0.1502, + "step": 28680 + }, + { + "epoch": 0.7257889009793254, + "grad_norm": 8.304837226867676, + "learning_rate": 1.7765538651713748e-06, + "loss": 0.1335, + "step": 28681 + }, + { + "epoch": 0.7258142065440191, + "grad_norm": 9.395856857299805, + "learning_rate": 1.7762469337615552e-06, + "loss": 0.1938, + "step": 28682 + }, + { + "epoch": 0.7258395121087127, + "grad_norm": 3.7906494140625, + "learning_rate": 1.7759400231411195e-06, + "loss": 0.1136, + "step": 28683 + }, + { + "epoch": 0.7258648176734064, + "grad_norm": 12.274155616760254, + "learning_rate": 1.775633133312047e-06, + "loss": 0.3373, + "step": 28684 + }, + { + "epoch": 0.7258901232381001, + "grad_norm": 8.17784595489502, + "learning_rate": 1.7753262642763148e-06, + "loss": 0.2729, + "step": 28685 + }, + { + "epoch": 0.7259154288027937, + "grad_norm": 3.2532498836517334, + "learning_rate": 1.7750194160359052e-06, + "loss": 0.0895, + "step": 28686 + }, + { + "epoch": 0.7259407343674874, + "grad_norm": 6.425692558288574, + "learning_rate": 1.7747125885927956e-06, + "loss": 0.0991, + "step": 28687 + }, + { + "epoch": 0.7259660399321811, + "grad_norm": 4.402575969696045, + "learning_rate": 1.7744057819489647e-06, + "loss": 0.1706, + "step": 28688 + }, + { + "epoch": 0.7259913454968747, + "grad_norm": 2.8027913570404053, + "learning_rate": 1.7740989961063893e-06, + "loss": 0.0892, + "step": 28689 + }, + { + "epoch": 0.7260166510615684, + "grad_norm": 4.96645975112915, + "learning_rate": 1.7737922310670509e-06, + "loss": 0.1479, + "step": 28690 + }, + { + "epoch": 0.7260419566262621, + "grad_norm": 4.162985324859619, + "learning_rate": 1.7734854868329266e-06, + "loss": 0.1722, + "step": 28691 + }, + { + "epoch": 0.7260672621909557, + "grad_norm": 4.580988883972168, + "learning_rate": 1.7731787634059939e-06, + "loss": 0.1279, + "step": 28692 + }, + { + "epoch": 0.7260925677556495, + "grad_norm": 3.6482813358306885, + "learning_rate": 1.772872060788231e-06, + "loss": 0.1489, + "step": 28693 + }, + { + "epoch": 0.7261178733203432, + "grad_norm": 8.59655475616455, + "learning_rate": 1.7725653789816144e-06, + "loss": 0.2571, + "step": 28694 + }, + { + "epoch": 0.7261431788850368, + "grad_norm": 9.05123519897461, + "learning_rate": 1.7722587179881252e-06, + "loss": 0.2198, + "step": 28695 + }, + { + "epoch": 0.7261684844497305, + "grad_norm": 14.064669609069824, + "learning_rate": 1.771952077809737e-06, + "loss": 0.1592, + "step": 28696 + }, + { + "epoch": 0.7261937900144242, + "grad_norm": 5.786345481872559, + "learning_rate": 1.7716454584484326e-06, + "loss": 0.1774, + "step": 28697 + }, + { + "epoch": 0.7262190955791179, + "grad_norm": 19.39841651916504, + "learning_rate": 1.7713388599061837e-06, + "loss": 0.3369, + "step": 28698 + }, + { + "epoch": 0.7262444011438115, + "grad_norm": 3.4757511615753174, + "learning_rate": 1.7710322821849707e-06, + "loss": 0.1017, + "step": 28699 + }, + { + "epoch": 0.7262697067085052, + "grad_norm": 3.8572099208831787, + "learning_rate": 1.7707257252867688e-06, + "loss": 0.1303, + "step": 28700 + }, + { + "epoch": 0.7262950122731989, + "grad_norm": 5.116209506988525, + "learning_rate": 1.770419189213559e-06, + "loss": 0.1217, + "step": 28701 + }, + { + "epoch": 0.7263203178378925, + "grad_norm": 3.6595118045806885, + "learning_rate": 1.770112673967312e-06, + "loss": 0.1487, + "step": 28702 + }, + { + "epoch": 0.7263456234025862, + "grad_norm": 8.70981502532959, + "learning_rate": 1.7698061795500088e-06, + "loss": 0.197, + "step": 28703 + }, + { + "epoch": 0.72637092896728, + "grad_norm": 3.721588373184204, + "learning_rate": 1.7694997059636233e-06, + "loss": 0.1175, + "step": 28704 + }, + { + "epoch": 0.7263962345319736, + "grad_norm": 7.948284149169922, + "learning_rate": 1.769193253210137e-06, + "loss": 0.1673, + "step": 28705 + }, + { + "epoch": 0.7264215400966673, + "grad_norm": 6.739424705505371, + "learning_rate": 1.7688868212915183e-06, + "loss": 0.1324, + "step": 28706 + }, + { + "epoch": 0.726446845661361, + "grad_norm": 3.20180344581604, + "learning_rate": 1.76858041020975e-06, + "loss": 0.0931, + "step": 28707 + }, + { + "epoch": 0.7264721512260546, + "grad_norm": 4.127442359924316, + "learning_rate": 1.768274019966803e-06, + "loss": 0.1168, + "step": 28708 + }, + { + "epoch": 0.7264974567907483, + "grad_norm": 6.443770408630371, + "learning_rate": 1.7679676505646576e-06, + "loss": 0.1267, + "step": 28709 + }, + { + "epoch": 0.726522762355442, + "grad_norm": 13.379404067993164, + "learning_rate": 1.7676613020052879e-06, + "loss": 0.305, + "step": 28710 + }, + { + "epoch": 0.7265480679201356, + "grad_norm": 13.7677001953125, + "learning_rate": 1.7673549742906693e-06, + "loss": 0.1746, + "step": 28711 + }, + { + "epoch": 0.7265733734848293, + "grad_norm": 11.397192001342773, + "learning_rate": 1.767048667422775e-06, + "loss": 0.1057, + "step": 28712 + }, + { + "epoch": 0.726598679049523, + "grad_norm": 4.205082893371582, + "learning_rate": 1.7667423814035844e-06, + "loss": 0.1554, + "step": 28713 + }, + { + "epoch": 0.7266239846142166, + "grad_norm": 15.633722305297852, + "learning_rate": 1.7664361162350708e-06, + "loss": 0.1981, + "step": 28714 + }, + { + "epoch": 0.7266492901789103, + "grad_norm": 6.133051872253418, + "learning_rate": 1.7661298719192094e-06, + "loss": 0.1523, + "step": 28715 + }, + { + "epoch": 0.726674595743604, + "grad_norm": 7.515293598175049, + "learning_rate": 1.7658236484579726e-06, + "loss": 0.1917, + "step": 28716 + }, + { + "epoch": 0.7266999013082976, + "grad_norm": 5.073048114776611, + "learning_rate": 1.7655174458533397e-06, + "loss": 0.1389, + "step": 28717 + }, + { + "epoch": 0.7267252068729914, + "grad_norm": 7.32879114151001, + "learning_rate": 1.7652112641072833e-06, + "loss": 0.1279, + "step": 28718 + }, + { + "epoch": 0.7267505124376851, + "grad_norm": 5.251989841461182, + "learning_rate": 1.7649051032217774e-06, + "loss": 0.1172, + "step": 28719 + }, + { + "epoch": 0.7267758180023787, + "grad_norm": 6.274834156036377, + "learning_rate": 1.7645989631987965e-06, + "loss": 0.1904, + "step": 28720 + }, + { + "epoch": 0.7268011235670724, + "grad_norm": 2.6026968955993652, + "learning_rate": 1.764292844040314e-06, + "loss": 0.1281, + "step": 28721 + }, + { + "epoch": 0.7268264291317661, + "grad_norm": 6.3788580894470215, + "learning_rate": 1.7639867457483062e-06, + "loss": 0.1083, + "step": 28722 + }, + { + "epoch": 0.7268517346964598, + "grad_norm": 3.2254443168640137, + "learning_rate": 1.763680668324746e-06, + "loss": 0.1343, + "step": 28723 + }, + { + "epoch": 0.7268770402611534, + "grad_norm": 3.680360794067383, + "learning_rate": 1.7633746117716072e-06, + "loss": 0.115, + "step": 28724 + }, + { + "epoch": 0.7269023458258471, + "grad_norm": 9.354243278503418, + "learning_rate": 1.7630685760908623e-06, + "loss": 0.215, + "step": 28725 + }, + { + "epoch": 0.7269276513905408, + "grad_norm": 3.701528787612915, + "learning_rate": 1.7627625612844867e-06, + "loss": 0.1923, + "step": 28726 + }, + { + "epoch": 0.7269529569552344, + "grad_norm": 5.76593542098999, + "learning_rate": 1.7624565673544525e-06, + "loss": 0.213, + "step": 28727 + }, + { + "epoch": 0.7269782625199281, + "grad_norm": 4.800079345703125, + "learning_rate": 1.7621505943027367e-06, + "loss": 0.1992, + "step": 28728 + }, + { + "epoch": 0.7270035680846219, + "grad_norm": 4.920255184173584, + "learning_rate": 1.7618446421313057e-06, + "loss": 0.1387, + "step": 28729 + }, + { + "epoch": 0.7270288736493155, + "grad_norm": 4.286064624786377, + "learning_rate": 1.7615387108421384e-06, + "loss": 0.1357, + "step": 28730 + }, + { + "epoch": 0.7270541792140092, + "grad_norm": 7.680095672607422, + "learning_rate": 1.7612328004372036e-06, + "loss": 0.3153, + "step": 28731 + }, + { + "epoch": 0.7270794847787029, + "grad_norm": 3.6908416748046875, + "learning_rate": 1.7609269109184795e-06, + "loss": 0.1585, + "step": 28732 + }, + { + "epoch": 0.7271047903433965, + "grad_norm": 7.059730052947998, + "learning_rate": 1.7606210422879317e-06, + "loss": 0.1332, + "step": 28733 + }, + { + "epoch": 0.7271300959080902, + "grad_norm": 2.555316925048828, + "learning_rate": 1.7603151945475377e-06, + "loss": 0.0982, + "step": 28734 + }, + { + "epoch": 0.7271554014727839, + "grad_norm": 6.552447319030762, + "learning_rate": 1.7600093676992675e-06, + "loss": 0.1308, + "step": 28735 + }, + { + "epoch": 0.7271807070374775, + "grad_norm": 3.3186144828796387, + "learning_rate": 1.759703561745097e-06, + "loss": 0.1409, + "step": 28736 + }, + { + "epoch": 0.7272060126021712, + "grad_norm": 3.5548129081726074, + "learning_rate": 1.7593977766869924e-06, + "loss": 0.0894, + "step": 28737 + }, + { + "epoch": 0.7272313181668649, + "grad_norm": 8.359736442565918, + "learning_rate": 1.7590920125269301e-06, + "loss": 0.0935, + "step": 28738 + }, + { + "epoch": 0.7272566237315585, + "grad_norm": 6.63504695892334, + "learning_rate": 1.758786269266879e-06, + "loss": 0.1625, + "step": 28739 + }, + { + "epoch": 0.7272819292962522, + "grad_norm": 9.763606071472168, + "learning_rate": 1.758480546908814e-06, + "loss": 0.1792, + "step": 28740 + }, + { + "epoch": 0.727307234860946, + "grad_norm": 4.491715431213379, + "learning_rate": 1.758174845454705e-06, + "loss": 0.0923, + "step": 28741 + }, + { + "epoch": 0.7273325404256396, + "grad_norm": 3.811098098754883, + "learning_rate": 1.7578691649065233e-06, + "loss": 0.1821, + "step": 28742 + }, + { + "epoch": 0.7273578459903333, + "grad_norm": 8.75447940826416, + "learning_rate": 1.7575635052662387e-06, + "loss": 0.1983, + "step": 28743 + }, + { + "epoch": 0.727383151555027, + "grad_norm": 4.367561340332031, + "learning_rate": 1.7572578665358258e-06, + "loss": 0.1279, + "step": 28744 + }, + { + "epoch": 0.7274084571197206, + "grad_norm": 7.272752285003662, + "learning_rate": 1.7569522487172536e-06, + "loss": 0.207, + "step": 28745 + }, + { + "epoch": 0.7274337626844143, + "grad_norm": 7.140667915344238, + "learning_rate": 1.756646651812493e-06, + "loss": 0.215, + "step": 28746 + }, + { + "epoch": 0.727459068249108, + "grad_norm": 5.945793151855469, + "learning_rate": 1.756341075823515e-06, + "loss": 0.1462, + "step": 28747 + }, + { + "epoch": 0.7274843738138017, + "grad_norm": 6.2522687911987305, + "learning_rate": 1.7560355207522878e-06, + "loss": 0.2124, + "step": 28748 + }, + { + "epoch": 0.7275096793784953, + "grad_norm": 5.137555122375488, + "learning_rate": 1.7557299866007866e-06, + "loss": 0.1458, + "step": 28749 + }, + { + "epoch": 0.727534984943189, + "grad_norm": 4.594358921051025, + "learning_rate": 1.7554244733709785e-06, + "loss": 0.1497, + "step": 28750 + }, + { + "epoch": 0.7275602905078827, + "grad_norm": 5.709801197052002, + "learning_rate": 1.755118981064835e-06, + "loss": 0.14, + "step": 28751 + }, + { + "epoch": 0.7275855960725763, + "grad_norm": 2.9926345348358154, + "learning_rate": 1.754813509684324e-06, + "loss": 0.0869, + "step": 28752 + }, + { + "epoch": 0.72761090163727, + "grad_norm": 6.730188846588135, + "learning_rate": 1.7545080592314179e-06, + "loss": 0.1247, + "step": 28753 + }, + { + "epoch": 0.7276362072019638, + "grad_norm": 5.399718761444092, + "learning_rate": 1.754202629708086e-06, + "loss": 0.1904, + "step": 28754 + }, + { + "epoch": 0.7276615127666574, + "grad_norm": 4.304823398590088, + "learning_rate": 1.753897221116298e-06, + "loss": 0.1596, + "step": 28755 + }, + { + "epoch": 0.7276868183313511, + "grad_norm": 6.093876838684082, + "learning_rate": 1.753591833458021e-06, + "loss": 0.1394, + "step": 28756 + }, + { + "epoch": 0.7277121238960448, + "grad_norm": 4.489894866943359, + "learning_rate": 1.7532864667352278e-06, + "loss": 0.1533, + "step": 28757 + }, + { + "epoch": 0.7277374294607384, + "grad_norm": 3.274442434310913, + "learning_rate": 1.7529811209498866e-06, + "loss": 0.0854, + "step": 28758 + }, + { + "epoch": 0.7277627350254321, + "grad_norm": 5.643126487731934, + "learning_rate": 1.7526757961039664e-06, + "loss": 0.1027, + "step": 28759 + }, + { + "epoch": 0.7277880405901258, + "grad_norm": 3.1877219676971436, + "learning_rate": 1.752370492199434e-06, + "loss": 0.1315, + "step": 28760 + }, + { + "epoch": 0.7278133461548194, + "grad_norm": 8.718740463256836, + "learning_rate": 1.7520652092382617e-06, + "loss": 0.2408, + "step": 28761 + }, + { + "epoch": 0.7278386517195131, + "grad_norm": 4.721105098724365, + "learning_rate": 1.751759947222415e-06, + "loss": 0.1377, + "step": 28762 + }, + { + "epoch": 0.7278639572842068, + "grad_norm": 3.89898943901062, + "learning_rate": 1.7514547061538679e-06, + "loss": 0.1095, + "step": 28763 + }, + { + "epoch": 0.7278892628489004, + "grad_norm": 6.348513603210449, + "learning_rate": 1.7511494860345812e-06, + "loss": 0.1824, + "step": 28764 + }, + { + "epoch": 0.7279145684135941, + "grad_norm": 7.762603759765625, + "learning_rate": 1.7508442868665293e-06, + "loss": 0.1912, + "step": 28765 + }, + { + "epoch": 0.7279398739782879, + "grad_norm": 4.622990131378174, + "learning_rate": 1.7505391086516764e-06, + "loss": 0.113, + "step": 28766 + }, + { + "epoch": 0.7279651795429815, + "grad_norm": 4.155416011810303, + "learning_rate": 1.7502339513919958e-06, + "loss": 0.1854, + "step": 28767 + }, + { + "epoch": 0.7279904851076752, + "grad_norm": 4.316722393035889, + "learning_rate": 1.7499288150894484e-06, + "loss": 0.0856, + "step": 28768 + }, + { + "epoch": 0.7280157906723689, + "grad_norm": 6.969884872436523, + "learning_rate": 1.749623699746007e-06, + "loss": 0.2653, + "step": 28769 + }, + { + "epoch": 0.7280410962370625, + "grad_norm": 5.105056285858154, + "learning_rate": 1.7493186053636363e-06, + "loss": 0.2431, + "step": 28770 + }, + { + "epoch": 0.7280664018017562, + "grad_norm": 6.248472213745117, + "learning_rate": 1.7490135319443068e-06, + "loss": 0.2108, + "step": 28771 + }, + { + "epoch": 0.7280917073664499, + "grad_norm": 5.60966157913208, + "learning_rate": 1.7487084794899845e-06, + "loss": 0.1022, + "step": 28772 + }, + { + "epoch": 0.7281170129311435, + "grad_norm": 4.547845363616943, + "learning_rate": 1.7484034480026369e-06, + "loss": 0.1411, + "step": 28773 + }, + { + "epoch": 0.7281423184958372, + "grad_norm": 2.8508994579315186, + "learning_rate": 1.7480984374842297e-06, + "loss": 0.1608, + "step": 28774 + }, + { + "epoch": 0.7281676240605309, + "grad_norm": 7.584099769592285, + "learning_rate": 1.7477934479367298e-06, + "loss": 0.2058, + "step": 28775 + }, + { + "epoch": 0.7281929296252246, + "grad_norm": 5.99953556060791, + "learning_rate": 1.747488479362106e-06, + "loss": 0.2073, + "step": 28776 + }, + { + "epoch": 0.7282182351899182, + "grad_norm": 7.067856311798096, + "learning_rate": 1.7471835317623242e-06, + "loss": 0.1053, + "step": 28777 + }, + { + "epoch": 0.728243540754612, + "grad_norm": 5.796852111816406, + "learning_rate": 1.746878605139351e-06, + "loss": 0.1511, + "step": 28778 + }, + { + "epoch": 0.7282688463193057, + "grad_norm": 7.3942179679870605, + "learning_rate": 1.746573699495151e-06, + "loss": 0.1912, + "step": 28779 + }, + { + "epoch": 0.7282941518839993, + "grad_norm": 8.271928787231445, + "learning_rate": 1.7462688148316937e-06, + "loss": 0.1885, + "step": 28780 + }, + { + "epoch": 0.728319457448693, + "grad_norm": 2.6200175285339355, + "learning_rate": 1.745963951150943e-06, + "loss": 0.1038, + "step": 28781 + }, + { + "epoch": 0.7283447630133867, + "grad_norm": 6.339847564697266, + "learning_rate": 1.7456591084548658e-06, + "loss": 0.1624, + "step": 28782 + }, + { + "epoch": 0.7283700685780803, + "grad_norm": 6.000720500946045, + "learning_rate": 1.7453542867454259e-06, + "loss": 0.1358, + "step": 28783 + }, + { + "epoch": 0.728395374142774, + "grad_norm": 13.30705738067627, + "learning_rate": 1.7450494860245925e-06, + "loss": 0.2843, + "step": 28784 + }, + { + "epoch": 0.7284206797074677, + "grad_norm": 5.105820178985596, + "learning_rate": 1.7447447062943295e-06, + "loss": 0.1773, + "step": 28785 + }, + { + "epoch": 0.7284459852721613, + "grad_norm": 7.330257892608643, + "learning_rate": 1.7444399475566026e-06, + "loss": 0.1669, + "step": 28786 + }, + { + "epoch": 0.728471290836855, + "grad_norm": 5.259461879730225, + "learning_rate": 1.7441352098133751e-06, + "loss": 0.1452, + "step": 28787 + }, + { + "epoch": 0.7284965964015487, + "grad_norm": 4.608006477355957, + "learning_rate": 1.7438304930666157e-06, + "loss": 0.1602, + "step": 28788 + }, + { + "epoch": 0.7285219019662423, + "grad_norm": 4.663075923919678, + "learning_rate": 1.7435257973182879e-06, + "loss": 0.1417, + "step": 28789 + }, + { + "epoch": 0.728547207530936, + "grad_norm": 3.015868902206421, + "learning_rate": 1.743221122570356e-06, + "loss": 0.1128, + "step": 28790 + }, + { + "epoch": 0.7285725130956298, + "grad_norm": 7.631837844848633, + "learning_rate": 1.742916468824784e-06, + "loss": 0.2134, + "step": 28791 + }, + { + "epoch": 0.7285978186603234, + "grad_norm": 6.880988121032715, + "learning_rate": 1.7426118360835393e-06, + "loss": 0.1912, + "step": 28792 + }, + { + "epoch": 0.7286231242250171, + "grad_norm": 3.8216047286987305, + "learning_rate": 1.7423072243485833e-06, + "loss": 0.1722, + "step": 28793 + }, + { + "epoch": 0.7286484297897108, + "grad_norm": 3.8165676593780518, + "learning_rate": 1.7420026336218854e-06, + "loss": 0.1164, + "step": 28794 + }, + { + "epoch": 0.7286737353544044, + "grad_norm": 2.168926954269409, + "learning_rate": 1.741698063905403e-06, + "loss": 0.0502, + "step": 28795 + }, + { + "epoch": 0.7286990409190981, + "grad_norm": 5.566900730133057, + "learning_rate": 1.7413935152011058e-06, + "loss": 0.2405, + "step": 28796 + }, + { + "epoch": 0.7287243464837918, + "grad_norm": 4.217597007751465, + "learning_rate": 1.7410889875109533e-06, + "loss": 0.1457, + "step": 28797 + }, + { + "epoch": 0.7287496520484854, + "grad_norm": 4.490774631500244, + "learning_rate": 1.7407844808369157e-06, + "loss": 0.1, + "step": 28798 + }, + { + "epoch": 0.7287749576131791, + "grad_norm": 6.200645923614502, + "learning_rate": 1.7404799951809492e-06, + "loss": 0.1335, + "step": 28799 + }, + { + "epoch": 0.7288002631778728, + "grad_norm": 3.24191951751709, + "learning_rate": 1.7401755305450223e-06, + "loss": 0.116, + "step": 28800 + }, + { + "epoch": 0.7288255687425665, + "grad_norm": 15.47911262512207, + "learning_rate": 1.7398710869310959e-06, + "loss": 0.1703, + "step": 28801 + }, + { + "epoch": 0.7288508743072601, + "grad_norm": 5.0483078956604, + "learning_rate": 1.7395666643411358e-06, + "loss": 0.2018, + "step": 28802 + }, + { + "epoch": 0.7288761798719539, + "grad_norm": 6.42771577835083, + "learning_rate": 1.7392622627771033e-06, + "loss": 0.1603, + "step": 28803 + }, + { + "epoch": 0.7289014854366476, + "grad_norm": 4.68543004989624, + "learning_rate": 1.7389578822409626e-06, + "loss": 0.1342, + "step": 28804 + }, + { + "epoch": 0.7289267910013412, + "grad_norm": 4.321267604827881, + "learning_rate": 1.7386535227346757e-06, + "loss": 0.1523, + "step": 28805 + }, + { + "epoch": 0.7289520965660349, + "grad_norm": 5.8483781814575195, + "learning_rate": 1.7383491842602041e-06, + "loss": 0.1521, + "step": 28806 + }, + { + "epoch": 0.7289774021307286, + "grad_norm": 4.483974933624268, + "learning_rate": 1.7380448668195133e-06, + "loss": 0.0863, + "step": 28807 + }, + { + "epoch": 0.7290027076954222, + "grad_norm": 3.7650835514068604, + "learning_rate": 1.7377405704145645e-06, + "loss": 0.1529, + "step": 28808 + }, + { + "epoch": 0.7290280132601159, + "grad_norm": 5.173338890075684, + "learning_rate": 1.73743629504732e-06, + "loss": 0.1836, + "step": 28809 + }, + { + "epoch": 0.7290533188248096, + "grad_norm": 4.133236885070801, + "learning_rate": 1.73713204071974e-06, + "loss": 0.1408, + "step": 28810 + }, + { + "epoch": 0.7290786243895032, + "grad_norm": 5.2658915519714355, + "learning_rate": 1.7368278074337901e-06, + "loss": 0.2277, + "step": 28811 + }, + { + "epoch": 0.7291039299541969, + "grad_norm": 4.675495624542236, + "learning_rate": 1.736523595191431e-06, + "loss": 0.1199, + "step": 28812 + }, + { + "epoch": 0.7291292355188906, + "grad_norm": 6.862715721130371, + "learning_rate": 1.7362194039946246e-06, + "loss": 0.1617, + "step": 28813 + }, + { + "epoch": 0.7291545410835842, + "grad_norm": 3.504366397857666, + "learning_rate": 1.73591523384533e-06, + "loss": 0.1043, + "step": 28814 + }, + { + "epoch": 0.729179846648278, + "grad_norm": 2.82348895072937, + "learning_rate": 1.7356110847455126e-06, + "loss": 0.1387, + "step": 28815 + }, + { + "epoch": 0.7292051522129717, + "grad_norm": 5.6747283935546875, + "learning_rate": 1.735306956697132e-06, + "loss": 0.1763, + "step": 28816 + }, + { + "epoch": 0.7292304577776653, + "grad_norm": 8.15311336517334, + "learning_rate": 1.7350028497021492e-06, + "loss": 0.2159, + "step": 28817 + }, + { + "epoch": 0.729255763342359, + "grad_norm": 5.87973690032959, + "learning_rate": 1.7346987637625246e-06, + "loss": 0.1596, + "step": 28818 + }, + { + "epoch": 0.7292810689070527, + "grad_norm": 3.890589714050293, + "learning_rate": 1.734394698880222e-06, + "loss": 0.0892, + "step": 28819 + }, + { + "epoch": 0.7293063744717463, + "grad_norm": 2.2440357208251953, + "learning_rate": 1.7340906550572e-06, + "loss": 0.0605, + "step": 28820 + }, + { + "epoch": 0.72933168003644, + "grad_norm": 4.039965629577637, + "learning_rate": 1.7337866322954206e-06, + "loss": 0.126, + "step": 28821 + }, + { + "epoch": 0.7293569856011337, + "grad_norm": 3.7014310359954834, + "learning_rate": 1.7334826305968415e-06, + "loss": 0.1494, + "step": 28822 + }, + { + "epoch": 0.7293822911658273, + "grad_norm": 3.8817062377929688, + "learning_rate": 1.7331786499634269e-06, + "loss": 0.1091, + "step": 28823 + }, + { + "epoch": 0.729407596730521, + "grad_norm": 4.920564651489258, + "learning_rate": 1.7328746903971338e-06, + "loss": 0.2002, + "step": 28824 + }, + { + "epoch": 0.7294329022952147, + "grad_norm": 6.046759128570557, + "learning_rate": 1.7325707518999279e-06, + "loss": 0.1081, + "step": 28825 + }, + { + "epoch": 0.7294582078599084, + "grad_norm": 4.299274444580078, + "learning_rate": 1.732266834473762e-06, + "loss": 0.1415, + "step": 28826 + }, + { + "epoch": 0.729483513424602, + "grad_norm": 4.3137078285217285, + "learning_rate": 1.7319629381206005e-06, + "loss": 0.1325, + "step": 28827 + }, + { + "epoch": 0.7295088189892958, + "grad_norm": 6.828363418579102, + "learning_rate": 1.7316590628424012e-06, + "loss": 0.1553, + "step": 28828 + }, + { + "epoch": 0.7295341245539895, + "grad_norm": 7.948879718780518, + "learning_rate": 1.7313552086411273e-06, + "loss": 0.221, + "step": 28829 + }, + { + "epoch": 0.7295594301186831, + "grad_norm": 4.3333516120910645, + "learning_rate": 1.731051375518733e-06, + "loss": 0.1243, + "step": 28830 + }, + { + "epoch": 0.7295847356833768, + "grad_norm": 4.785793781280518, + "learning_rate": 1.7307475634771814e-06, + "loss": 0.1803, + "step": 28831 + }, + { + "epoch": 0.7296100412480705, + "grad_norm": 3.4625706672668457, + "learning_rate": 1.730443772518431e-06, + "loss": 0.1532, + "step": 28832 + }, + { + "epoch": 0.7296353468127641, + "grad_norm": 15.826107025146484, + "learning_rate": 1.7301400026444405e-06, + "loss": 0.1964, + "step": 28833 + }, + { + "epoch": 0.7296606523774578, + "grad_norm": 7.583080291748047, + "learning_rate": 1.7298362538571667e-06, + "loss": 0.1968, + "step": 28834 + }, + { + "epoch": 0.7296859579421515, + "grad_norm": 4.908603191375732, + "learning_rate": 1.7295325261585728e-06, + "loss": 0.166, + "step": 28835 + }, + { + "epoch": 0.7297112635068451, + "grad_norm": 8.202238082885742, + "learning_rate": 1.7292288195506152e-06, + "loss": 0.0783, + "step": 28836 + }, + { + "epoch": 0.7297365690715388, + "grad_norm": 4.917007923126221, + "learning_rate": 1.728925134035251e-06, + "loss": 0.1366, + "step": 28837 + }, + { + "epoch": 0.7297618746362325, + "grad_norm": 4.190078258514404, + "learning_rate": 1.7286214696144416e-06, + "loss": 0.1487, + "step": 28838 + }, + { + "epoch": 0.7297871802009261, + "grad_norm": 8.024892807006836, + "learning_rate": 1.7283178262901434e-06, + "loss": 0.1464, + "step": 28839 + }, + { + "epoch": 0.7298124857656199, + "grad_norm": 6.005756855010986, + "learning_rate": 1.7280142040643155e-06, + "loss": 0.1384, + "step": 28840 + }, + { + "epoch": 0.7298377913303136, + "grad_norm": 6.182353973388672, + "learning_rate": 1.7277106029389141e-06, + "loss": 0.2079, + "step": 28841 + }, + { + "epoch": 0.7298630968950072, + "grad_norm": 4.2402448654174805, + "learning_rate": 1.7274070229159008e-06, + "loss": 0.1843, + "step": 28842 + }, + { + "epoch": 0.7298884024597009, + "grad_norm": 5.720839977264404, + "learning_rate": 1.7271034639972279e-06, + "loss": 0.1328, + "step": 28843 + }, + { + "epoch": 0.7299137080243946, + "grad_norm": 5.7395806312561035, + "learning_rate": 1.7267999261848578e-06, + "loss": 0.2023, + "step": 28844 + }, + { + "epoch": 0.7299390135890882, + "grad_norm": 6.119706153869629, + "learning_rate": 1.726496409480744e-06, + "loss": 0.102, + "step": 28845 + }, + { + "epoch": 0.7299643191537819, + "grad_norm": 5.527900695800781, + "learning_rate": 1.7261929138868476e-06, + "loss": 0.1769, + "step": 28846 + }, + { + "epoch": 0.7299896247184756, + "grad_norm": 5.78071928024292, + "learning_rate": 1.7258894394051246e-06, + "loss": 0.1605, + "step": 28847 + }, + { + "epoch": 0.7300149302831692, + "grad_norm": 5.420530796051025, + "learning_rate": 1.7255859860375312e-06, + "loss": 0.147, + "step": 28848 + }, + { + "epoch": 0.7300402358478629, + "grad_norm": 3.42020583152771, + "learning_rate": 1.7252825537860234e-06, + "loss": 0.1505, + "step": 28849 + }, + { + "epoch": 0.7300655414125566, + "grad_norm": 8.875781059265137, + "learning_rate": 1.724979142652561e-06, + "loss": 0.2695, + "step": 28850 + }, + { + "epoch": 0.7300908469772504, + "grad_norm": 6.992748260498047, + "learning_rate": 1.7246757526390983e-06, + "loss": 0.2054, + "step": 28851 + }, + { + "epoch": 0.730116152541944, + "grad_norm": 8.593810081481934, + "learning_rate": 1.7243723837475928e-06, + "loss": 0.2645, + "step": 28852 + }, + { + "epoch": 0.7301414581066377, + "grad_norm": 5.788391590118408, + "learning_rate": 1.7240690359799989e-06, + "loss": 0.169, + "step": 28853 + }, + { + "epoch": 0.7301667636713314, + "grad_norm": 3.550736427307129, + "learning_rate": 1.7237657093382759e-06, + "loss": 0.1198, + "step": 28854 + }, + { + "epoch": 0.730192069236025, + "grad_norm": 12.326050758361816, + "learning_rate": 1.723462403824377e-06, + "loss": 0.1061, + "step": 28855 + }, + { + "epoch": 0.7302173748007187, + "grad_norm": 6.9887237548828125, + "learning_rate": 1.7231591194402624e-06, + "loss": 0.1801, + "step": 28856 + }, + { + "epoch": 0.7302426803654124, + "grad_norm": 5.778570652008057, + "learning_rate": 1.7228558561878823e-06, + "loss": 0.1678, + "step": 28857 + }, + { + "epoch": 0.730267985930106, + "grad_norm": 7.500250816345215, + "learning_rate": 1.7225526140691961e-06, + "loss": 0.223, + "step": 28858 + }, + { + "epoch": 0.7302932914947997, + "grad_norm": 3.524169683456421, + "learning_rate": 1.7222493930861595e-06, + "loss": 0.1234, + "step": 28859 + }, + { + "epoch": 0.7303185970594934, + "grad_norm": 3.3964641094207764, + "learning_rate": 1.721946193240726e-06, + "loss": 0.1393, + "step": 28860 + }, + { + "epoch": 0.730343902624187, + "grad_norm": 3.171915292739868, + "learning_rate": 1.7216430145348506e-06, + "loss": 0.0977, + "step": 28861 + }, + { + "epoch": 0.7303692081888807, + "grad_norm": 3.5868077278137207, + "learning_rate": 1.7213398569704908e-06, + "loss": 0.1143, + "step": 28862 + }, + { + "epoch": 0.7303945137535744, + "grad_norm": 2.97904634475708, + "learning_rate": 1.7210367205496008e-06, + "loss": 0.1606, + "step": 28863 + }, + { + "epoch": 0.730419819318268, + "grad_norm": 4.2429094314575195, + "learning_rate": 1.7207336052741347e-06, + "loss": 0.1731, + "step": 28864 + }, + { + "epoch": 0.7304451248829618, + "grad_norm": 7.540177345275879, + "learning_rate": 1.7204305111460462e-06, + "loss": 0.1182, + "step": 28865 + }, + { + "epoch": 0.7304704304476555, + "grad_norm": 6.233612060546875, + "learning_rate": 1.7201274381672927e-06, + "loss": 0.1905, + "step": 28866 + }, + { + "epoch": 0.7304957360123491, + "grad_norm": 8.531608581542969, + "learning_rate": 1.7198243863398273e-06, + "loss": 0.2306, + "step": 28867 + }, + { + "epoch": 0.7305210415770428, + "grad_norm": 3.1999337673187256, + "learning_rate": 1.7195213556656026e-06, + "loss": 0.1051, + "step": 28868 + }, + { + "epoch": 0.7305463471417365, + "grad_norm": 5.635498523712158, + "learning_rate": 1.7192183461465783e-06, + "loss": 0.1453, + "step": 28869 + }, + { + "epoch": 0.7305716527064301, + "grad_norm": 11.303657531738281, + "learning_rate": 1.7189153577847007e-06, + "loss": 0.2637, + "step": 28870 + }, + { + "epoch": 0.7305969582711238, + "grad_norm": 5.109752178192139, + "learning_rate": 1.7186123905819296e-06, + "loss": 0.1717, + "step": 28871 + }, + { + "epoch": 0.7306222638358175, + "grad_norm": 4.378902435302734, + "learning_rate": 1.7183094445402148e-06, + "loss": 0.1589, + "step": 28872 + }, + { + "epoch": 0.7306475694005111, + "grad_norm": 3.984956741333008, + "learning_rate": 1.7180065196615153e-06, + "loss": 0.1026, + "step": 28873 + }, + { + "epoch": 0.7306728749652048, + "grad_norm": 6.333889484405518, + "learning_rate": 1.7177036159477784e-06, + "loss": 0.1798, + "step": 28874 + }, + { + "epoch": 0.7306981805298985, + "grad_norm": 27.840421676635742, + "learning_rate": 1.7174007334009618e-06, + "loss": 0.2481, + "step": 28875 + }, + { + "epoch": 0.7307234860945923, + "grad_norm": 5.284395694732666, + "learning_rate": 1.7170978720230153e-06, + "loss": 0.1049, + "step": 28876 + }, + { + "epoch": 0.7307487916592859, + "grad_norm": 2.7267494201660156, + "learning_rate": 1.7167950318158961e-06, + "loss": 0.1374, + "step": 28877 + }, + { + "epoch": 0.7307740972239796, + "grad_norm": 5.8889641761779785, + "learning_rate": 1.7164922127815547e-06, + "loss": 0.2274, + "step": 28878 + }, + { + "epoch": 0.7307994027886733, + "grad_norm": 3.597174882888794, + "learning_rate": 1.7161894149219444e-06, + "loss": 0.1407, + "step": 28879 + }, + { + "epoch": 0.7308247083533669, + "grad_norm": 3.318964958190918, + "learning_rate": 1.7158866382390155e-06, + "loss": 0.2014, + "step": 28880 + }, + { + "epoch": 0.7308500139180606, + "grad_norm": 6.6378254890441895, + "learning_rate": 1.715583882734725e-06, + "loss": 0.1765, + "step": 28881 + }, + { + "epoch": 0.7308753194827543, + "grad_norm": 6.187117576599121, + "learning_rate": 1.7152811484110233e-06, + "loss": 0.1689, + "step": 28882 + }, + { + "epoch": 0.7309006250474479, + "grad_norm": 5.959133625030518, + "learning_rate": 1.7149784352698616e-06, + "loss": 0.2299, + "step": 28883 + }, + { + "epoch": 0.7309259306121416, + "grad_norm": 4.198153972625732, + "learning_rate": 1.7146757433131922e-06, + "loss": 0.0706, + "step": 28884 + }, + { + "epoch": 0.7309512361768353, + "grad_norm": 5.358619689941406, + "learning_rate": 1.714373072542969e-06, + "loss": 0.151, + "step": 28885 + }, + { + "epoch": 0.7309765417415289, + "grad_norm": 10.354249954223633, + "learning_rate": 1.7140704229611428e-06, + "loss": 0.1219, + "step": 28886 + }, + { + "epoch": 0.7310018473062226, + "grad_norm": 6.243140697479248, + "learning_rate": 1.713767794569665e-06, + "loss": 0.166, + "step": 28887 + }, + { + "epoch": 0.7310271528709164, + "grad_norm": 3.868267774581909, + "learning_rate": 1.713465187370486e-06, + "loss": 0.0818, + "step": 28888 + }, + { + "epoch": 0.73105245843561, + "grad_norm": 3.186365842819214, + "learning_rate": 1.71316260136556e-06, + "loss": 0.099, + "step": 28889 + }, + { + "epoch": 0.7310777640003037, + "grad_norm": 11.60267162322998, + "learning_rate": 1.7128600365568376e-06, + "loss": 0.182, + "step": 28890 + }, + { + "epoch": 0.7311030695649974, + "grad_norm": 10.317564010620117, + "learning_rate": 1.712557492946269e-06, + "loss": 0.1774, + "step": 28891 + }, + { + "epoch": 0.731128375129691, + "grad_norm": 2.2909770011901855, + "learning_rate": 1.7122549705358038e-06, + "loss": 0.0217, + "step": 28892 + }, + { + "epoch": 0.7311536806943847, + "grad_norm": 5.811168670654297, + "learning_rate": 1.7119524693273964e-06, + "loss": 0.1346, + "step": 28893 + }, + { + "epoch": 0.7311789862590784, + "grad_norm": 5.567002773284912, + "learning_rate": 1.711649989322996e-06, + "loss": 0.2046, + "step": 28894 + }, + { + "epoch": 0.731204291823772, + "grad_norm": 18.1608829498291, + "learning_rate": 1.7113475305245536e-06, + "loss": 0.1909, + "step": 28895 + }, + { + "epoch": 0.7312295973884657, + "grad_norm": 6.039984703063965, + "learning_rate": 1.7110450929340183e-06, + "loss": 0.1451, + "step": 28896 + }, + { + "epoch": 0.7312549029531594, + "grad_norm": 3.7244834899902344, + "learning_rate": 1.7107426765533402e-06, + "loss": 0.1552, + "step": 28897 + }, + { + "epoch": 0.731280208517853, + "grad_norm": 3.768967390060425, + "learning_rate": 1.7104402813844728e-06, + "loss": 0.154, + "step": 28898 + }, + { + "epoch": 0.7313055140825467, + "grad_norm": 5.244327068328857, + "learning_rate": 1.7101379074293622e-06, + "loss": 0.1514, + "step": 28899 + }, + { + "epoch": 0.7313308196472404, + "grad_norm": 4.8466949462890625, + "learning_rate": 1.7098355546899631e-06, + "loss": 0.1212, + "step": 28900 + }, + { + "epoch": 0.731356125211934, + "grad_norm": 3.19602632522583, + "learning_rate": 1.70953322316822e-06, + "loss": 0.0883, + "step": 28901 + }, + { + "epoch": 0.7313814307766278, + "grad_norm": 19.090723037719727, + "learning_rate": 1.709230912866086e-06, + "loss": 0.4662, + "step": 28902 + }, + { + "epoch": 0.7314067363413215, + "grad_norm": 4.79900598526001, + "learning_rate": 1.7089286237855085e-06, + "loss": 0.118, + "step": 28903 + }, + { + "epoch": 0.7314320419060152, + "grad_norm": 8.489778518676758, + "learning_rate": 1.708626355928441e-06, + "loss": 0.1999, + "step": 28904 + }, + { + "epoch": 0.7314573474707088, + "grad_norm": 7.149003028869629, + "learning_rate": 1.7083241092968271e-06, + "loss": 0.1017, + "step": 28905 + }, + { + "epoch": 0.7314826530354025, + "grad_norm": 6.0252885818481445, + "learning_rate": 1.70802188389262e-06, + "loss": 0.1704, + "step": 28906 + }, + { + "epoch": 0.7315079586000962, + "grad_norm": 3.7402358055114746, + "learning_rate": 1.7077196797177659e-06, + "loss": 0.1046, + "step": 28907 + }, + { + "epoch": 0.7315332641647898, + "grad_norm": 7.034589767456055, + "learning_rate": 1.7074174967742168e-06, + "loss": 0.2318, + "step": 28908 + }, + { + "epoch": 0.7315585697294835, + "grad_norm": 5.837944030761719, + "learning_rate": 1.7071153350639203e-06, + "loss": 0.1625, + "step": 28909 + }, + { + "epoch": 0.7315838752941772, + "grad_norm": 3.590008497238159, + "learning_rate": 1.706813194588824e-06, + "loss": 0.1256, + "step": 28910 + }, + { + "epoch": 0.7316091808588708, + "grad_norm": 3.686725616455078, + "learning_rate": 1.706511075350875e-06, + "loss": 0.0801, + "step": 28911 + }, + { + "epoch": 0.7316344864235645, + "grad_norm": 3.1382455825805664, + "learning_rate": 1.7062089773520251e-06, + "loss": 0.1014, + "step": 28912 + }, + { + "epoch": 0.7316597919882583, + "grad_norm": 7.104796886444092, + "learning_rate": 1.7059069005942214e-06, + "loss": 0.2354, + "step": 28913 + }, + { + "epoch": 0.7316850975529519, + "grad_norm": 6.590740203857422, + "learning_rate": 1.7056048450794105e-06, + "loss": 0.1538, + "step": 28914 + }, + { + "epoch": 0.7317104031176456, + "grad_norm": 4.499457836151123, + "learning_rate": 1.70530281080954e-06, + "loss": 0.1538, + "step": 28915 + }, + { + "epoch": 0.7317357086823393, + "grad_norm": 3.28995680809021, + "learning_rate": 1.7050007977865607e-06, + "loss": 0.0992, + "step": 28916 + }, + { + "epoch": 0.7317610142470329, + "grad_norm": 4.240317344665527, + "learning_rate": 1.7046988060124176e-06, + "loss": 0.13, + "step": 28917 + }, + { + "epoch": 0.7317863198117266, + "grad_norm": 3.817166566848755, + "learning_rate": 1.7043968354890595e-06, + "loss": 0.1403, + "step": 28918 + }, + { + "epoch": 0.7318116253764203, + "grad_norm": 5.472995758056641, + "learning_rate": 1.7040948862184314e-06, + "loss": 0.1665, + "step": 28919 + }, + { + "epoch": 0.7318369309411139, + "grad_norm": 2.597858190536499, + "learning_rate": 1.7037929582024843e-06, + "loss": 0.099, + "step": 28920 + }, + { + "epoch": 0.7318622365058076, + "grad_norm": 3.784214735031128, + "learning_rate": 1.7034910514431625e-06, + "loss": 0.1134, + "step": 28921 + }, + { + "epoch": 0.7318875420705013, + "grad_norm": 14.144221305847168, + "learning_rate": 1.7031891659424144e-06, + "loss": 0.2387, + "step": 28922 + }, + { + "epoch": 0.7319128476351949, + "grad_norm": 3.8560359477996826, + "learning_rate": 1.702887301702184e-06, + "loss": 0.1749, + "step": 28923 + }, + { + "epoch": 0.7319381531998886, + "grad_norm": 8.495205879211426, + "learning_rate": 1.7025854587244217e-06, + "loss": 0.2461, + "step": 28924 + }, + { + "epoch": 0.7319634587645824, + "grad_norm": 4.8570122718811035, + "learning_rate": 1.7022836370110729e-06, + "loss": 0.1357, + "step": 28925 + }, + { + "epoch": 0.731988764329276, + "grad_norm": 4.911900043487549, + "learning_rate": 1.7019818365640834e-06, + "loss": 0.1702, + "step": 28926 + }, + { + "epoch": 0.7320140698939697, + "grad_norm": 5.49872350692749, + "learning_rate": 1.7016800573853993e-06, + "loss": 0.1177, + "step": 28927 + }, + { + "epoch": 0.7320393754586634, + "grad_norm": 10.937467575073242, + "learning_rate": 1.7013782994769652e-06, + "loss": 0.1229, + "step": 28928 + }, + { + "epoch": 0.7320646810233571, + "grad_norm": 2.3734116554260254, + "learning_rate": 1.701076562840731e-06, + "loss": 0.0625, + "step": 28929 + }, + { + "epoch": 0.7320899865880507, + "grad_norm": 3.6127443313598633, + "learning_rate": 1.7007748474786384e-06, + "loss": 0.1668, + "step": 28930 + }, + { + "epoch": 0.7321152921527444, + "grad_norm": 7.510410785675049, + "learning_rate": 1.7004731533926383e-06, + "loss": 0.1477, + "step": 28931 + }, + { + "epoch": 0.7321405977174381, + "grad_norm": 5.9137725830078125, + "learning_rate": 1.7001714805846697e-06, + "loss": 0.0667, + "step": 28932 + }, + { + "epoch": 0.7321659032821317, + "grad_norm": 7.506125450134277, + "learning_rate": 1.6998698290566834e-06, + "loss": 0.0762, + "step": 28933 + }, + { + "epoch": 0.7321912088468254, + "grad_norm": 3.3479554653167725, + "learning_rate": 1.6995681988106212e-06, + "loss": 0.1073, + "step": 28934 + }, + { + "epoch": 0.7322165144115191, + "grad_norm": 6.753853797912598, + "learning_rate": 1.6992665898484329e-06, + "loss": 0.158, + "step": 28935 + }, + { + "epoch": 0.7322418199762127, + "grad_norm": 6.452465057373047, + "learning_rate": 1.6989650021720566e-06, + "loss": 0.1566, + "step": 28936 + }, + { + "epoch": 0.7322671255409064, + "grad_norm": 4.575882434844971, + "learning_rate": 1.698663435783443e-06, + "loss": 0.1837, + "step": 28937 + }, + { + "epoch": 0.7322924311056002, + "grad_norm": 3.82539439201355, + "learning_rate": 1.6983618906845334e-06, + "loss": 0.1189, + "step": 28938 + }, + { + "epoch": 0.7323177366702938, + "grad_norm": 2.060539722442627, + "learning_rate": 1.6980603668772772e-06, + "loss": 0.072, + "step": 28939 + }, + { + "epoch": 0.7323430422349875, + "grad_norm": 15.38890266418457, + "learning_rate": 1.6977588643636117e-06, + "loss": 0.1197, + "step": 28940 + }, + { + "epoch": 0.7323683477996812, + "grad_norm": 9.434470176696777, + "learning_rate": 1.6974573831454865e-06, + "loss": 0.1709, + "step": 28941 + }, + { + "epoch": 0.7323936533643748, + "grad_norm": 14.12297248840332, + "learning_rate": 1.6971559232248431e-06, + "loss": 0.146, + "step": 28942 + }, + { + "epoch": 0.7324189589290685, + "grad_norm": 6.561502933502197, + "learning_rate": 1.6968544846036284e-06, + "loss": 0.1852, + "step": 28943 + }, + { + "epoch": 0.7324442644937622, + "grad_norm": 4.665760040283203, + "learning_rate": 1.696553067283785e-06, + "loss": 0.0873, + "step": 28944 + }, + { + "epoch": 0.7324695700584558, + "grad_norm": 4.619599342346191, + "learning_rate": 1.6962516712672555e-06, + "loss": 0.1477, + "step": 28945 + }, + { + "epoch": 0.7324948756231495, + "grad_norm": 5.355713844299316, + "learning_rate": 1.6959502965559832e-06, + "loss": 0.1701, + "step": 28946 + }, + { + "epoch": 0.7325201811878432, + "grad_norm": 12.625826835632324, + "learning_rate": 1.6956489431519146e-06, + "loss": 0.2954, + "step": 28947 + }, + { + "epoch": 0.7325454867525368, + "grad_norm": 5.172431945800781, + "learning_rate": 1.6953476110569917e-06, + "loss": 0.1675, + "step": 28948 + }, + { + "epoch": 0.7325707923172305, + "grad_norm": 5.242257595062256, + "learning_rate": 1.6950463002731566e-06, + "loss": 0.1989, + "step": 28949 + }, + { + "epoch": 0.7325960978819243, + "grad_norm": 7.264626502990723, + "learning_rate": 1.6947450108023516e-06, + "loss": 0.1116, + "step": 28950 + }, + { + "epoch": 0.7326214034466179, + "grad_norm": 4.3682475090026855, + "learning_rate": 1.6944437426465232e-06, + "loss": 0.1076, + "step": 28951 + }, + { + "epoch": 0.7326467090113116, + "grad_norm": 5.411273956298828, + "learning_rate": 1.6941424958076126e-06, + "loss": 0.118, + "step": 28952 + }, + { + "epoch": 0.7326720145760053, + "grad_norm": 3.739145040512085, + "learning_rate": 1.693841270287561e-06, + "loss": 0.1404, + "step": 28953 + }, + { + "epoch": 0.732697320140699, + "grad_norm": 3.0164902210235596, + "learning_rate": 1.6935400660883128e-06, + "loss": 0.1505, + "step": 28954 + }, + { + "epoch": 0.7327226257053926, + "grad_norm": 6.603300094604492, + "learning_rate": 1.693238883211808e-06, + "loss": 0.1619, + "step": 28955 + }, + { + "epoch": 0.7327479312700863, + "grad_norm": 7.6244988441467285, + "learning_rate": 1.6929377216599918e-06, + "loss": 0.1845, + "step": 28956 + }, + { + "epoch": 0.73277323683478, + "grad_norm": 3.54622220993042, + "learning_rate": 1.6926365814348057e-06, + "loss": 0.1429, + "step": 28957 + }, + { + "epoch": 0.7327985423994736, + "grad_norm": 9.243268966674805, + "learning_rate": 1.6923354625381905e-06, + "loss": 0.2615, + "step": 28958 + }, + { + "epoch": 0.7328238479641673, + "grad_norm": 15.126418113708496, + "learning_rate": 1.6920343649720872e-06, + "loss": 0.223, + "step": 28959 + }, + { + "epoch": 0.732849153528861, + "grad_norm": 5.158374786376953, + "learning_rate": 1.6917332887384409e-06, + "loss": 0.1425, + "step": 28960 + }, + { + "epoch": 0.7328744590935546, + "grad_norm": 5.422163963317871, + "learning_rate": 1.6914322338391902e-06, + "loss": 0.1437, + "step": 28961 + }, + { + "epoch": 0.7328997646582484, + "grad_norm": 5.438709735870361, + "learning_rate": 1.6911312002762787e-06, + "loss": 0.1851, + "step": 28962 + }, + { + "epoch": 0.7329250702229421, + "grad_norm": 4.487581729888916, + "learning_rate": 1.6908301880516442e-06, + "loss": 0.1159, + "step": 28963 + }, + { + "epoch": 0.7329503757876357, + "grad_norm": 2.72133731842041, + "learning_rate": 1.6905291971672328e-06, + "loss": 0.1409, + "step": 28964 + }, + { + "epoch": 0.7329756813523294, + "grad_norm": 4.7681965827941895, + "learning_rate": 1.6902282276249804e-06, + "loss": 0.1599, + "step": 28965 + }, + { + "epoch": 0.7330009869170231, + "grad_norm": 8.087752342224121, + "learning_rate": 1.6899272794268346e-06, + "loss": 0.1775, + "step": 28966 + }, + { + "epoch": 0.7330262924817167, + "grad_norm": 4.836437225341797, + "learning_rate": 1.6896263525747281e-06, + "loss": 0.1778, + "step": 28967 + }, + { + "epoch": 0.7330515980464104, + "grad_norm": 4.467281341552734, + "learning_rate": 1.6893254470706071e-06, + "loss": 0.1562, + "step": 28968 + }, + { + "epoch": 0.7330769036111041, + "grad_norm": 4.279979228973389, + "learning_rate": 1.6890245629164092e-06, + "loss": 0.1666, + "step": 28969 + }, + { + "epoch": 0.7331022091757977, + "grad_norm": 5.593630790710449, + "learning_rate": 1.6887237001140795e-06, + "loss": 0.1498, + "step": 28970 + }, + { + "epoch": 0.7331275147404914, + "grad_norm": 9.55538272857666, + "learning_rate": 1.688422858665551e-06, + "loss": 0.1865, + "step": 28971 + }, + { + "epoch": 0.7331528203051851, + "grad_norm": 5.439630508422852, + "learning_rate": 1.6881220385727693e-06, + "loss": 0.1066, + "step": 28972 + }, + { + "epoch": 0.7331781258698787, + "grad_norm": 5.113855361938477, + "learning_rate": 1.687821239837671e-06, + "loss": 0.1471, + "step": 28973 + }, + { + "epoch": 0.7332034314345725, + "grad_norm": 2.9742236137390137, + "learning_rate": 1.6875204624621982e-06, + "loss": 0.1293, + "step": 28974 + }, + { + "epoch": 0.7332287369992662, + "grad_norm": 3.5484533309936523, + "learning_rate": 1.687219706448291e-06, + "loss": 0.1, + "step": 28975 + }, + { + "epoch": 0.7332540425639598, + "grad_norm": 5.634853363037109, + "learning_rate": 1.6869189717978868e-06, + "loss": 0.1753, + "step": 28976 + }, + { + "epoch": 0.7332793481286535, + "grad_norm": 3.9307408332824707, + "learning_rate": 1.6866182585129249e-06, + "loss": 0.1527, + "step": 28977 + }, + { + "epoch": 0.7333046536933472, + "grad_norm": 10.919657707214355, + "learning_rate": 1.6863175665953463e-06, + "loss": 0.2255, + "step": 28978 + }, + { + "epoch": 0.7333299592580409, + "grad_norm": 3.454256057739258, + "learning_rate": 1.68601689604709e-06, + "loss": 0.1181, + "step": 28979 + }, + { + "epoch": 0.7333552648227345, + "grad_norm": 13.353326797485352, + "learning_rate": 1.6857162468700943e-06, + "loss": 0.2528, + "step": 28980 + }, + { + "epoch": 0.7333805703874282, + "grad_norm": 4.343837261199951, + "learning_rate": 1.685415619066298e-06, + "loss": 0.1799, + "step": 28981 + }, + { + "epoch": 0.7334058759521219, + "grad_norm": 3.5343899726867676, + "learning_rate": 1.6851150126376375e-06, + "loss": 0.1382, + "step": 28982 + }, + { + "epoch": 0.7334311815168155, + "grad_norm": 7.468355655670166, + "learning_rate": 1.6848144275860562e-06, + "loss": 0.1934, + "step": 28983 + }, + { + "epoch": 0.7334564870815092, + "grad_norm": 6.617434978485107, + "learning_rate": 1.6845138639134894e-06, + "loss": 0.21, + "step": 28984 + }, + { + "epoch": 0.733481792646203, + "grad_norm": 9.270092964172363, + "learning_rate": 1.684213321621876e-06, + "loss": 0.2327, + "step": 28985 + }, + { + "epoch": 0.7335070982108965, + "grad_norm": 3.367776870727539, + "learning_rate": 1.6839128007131528e-06, + "loss": 0.1319, + "step": 28986 + }, + { + "epoch": 0.7335324037755903, + "grad_norm": 4.812433242797852, + "learning_rate": 1.6836123011892607e-06, + "loss": 0.1952, + "step": 28987 + }, + { + "epoch": 0.733557709340284, + "grad_norm": 6.8836188316345215, + "learning_rate": 1.6833118230521356e-06, + "loss": 0.215, + "step": 28988 + }, + { + "epoch": 0.7335830149049776, + "grad_norm": 4.965027809143066, + "learning_rate": 1.6830113663037161e-06, + "loss": 0.1301, + "step": 28989 + }, + { + "epoch": 0.7336083204696713, + "grad_norm": 4.664783954620361, + "learning_rate": 1.6827109309459372e-06, + "loss": 0.1476, + "step": 28990 + }, + { + "epoch": 0.733633626034365, + "grad_norm": 6.130831241607666, + "learning_rate": 1.6824105169807403e-06, + "loss": 0.169, + "step": 28991 + }, + { + "epoch": 0.7336589315990586, + "grad_norm": 4.255283355712891, + "learning_rate": 1.6821101244100608e-06, + "loss": 0.175, + "step": 28992 + }, + { + "epoch": 0.7336842371637523, + "grad_norm": 6.678061008453369, + "learning_rate": 1.6818097532358363e-06, + "loss": 0.1289, + "step": 28993 + }, + { + "epoch": 0.733709542728446, + "grad_norm": 4.202919006347656, + "learning_rate": 1.681509403460001e-06, + "loss": 0.1494, + "step": 28994 + }, + { + "epoch": 0.7337348482931396, + "grad_norm": 4.004926681518555, + "learning_rate": 1.6812090750844967e-06, + "loss": 0.1517, + "step": 28995 + }, + { + "epoch": 0.7337601538578333, + "grad_norm": 5.182713508605957, + "learning_rate": 1.6809087681112558e-06, + "loss": 0.2213, + "step": 28996 + }, + { + "epoch": 0.733785459422527, + "grad_norm": 4.1967315673828125, + "learning_rate": 1.6806084825422202e-06, + "loss": 0.1149, + "step": 28997 + }, + { + "epoch": 0.7338107649872206, + "grad_norm": 3.12062931060791, + "learning_rate": 1.6803082183793195e-06, + "loss": 0.1347, + "step": 28998 + }, + { + "epoch": 0.7338360705519144, + "grad_norm": 15.127655982971191, + "learning_rate": 1.6800079756244958e-06, + "loss": 0.1651, + "step": 28999 + }, + { + "epoch": 0.7338613761166081, + "grad_norm": 3.89846134185791, + "learning_rate": 1.6797077542796807e-06, + "loss": 0.0971, + "step": 29000 + }, + { + "epoch": 0.7338866816813017, + "grad_norm": 4.138210773468018, + "learning_rate": 1.6794075543468164e-06, + "loss": 0.1715, + "step": 29001 + }, + { + "epoch": 0.7339119872459954, + "grad_norm": 15.981246948242188, + "learning_rate": 1.679107375827832e-06, + "loss": 0.2938, + "step": 29002 + }, + { + "epoch": 0.7339372928106891, + "grad_norm": 6.677768230438232, + "learning_rate": 1.6788072187246679e-06, + "loss": 0.1212, + "step": 29003 + }, + { + "epoch": 0.7339625983753828, + "grad_norm": 6.062252998352051, + "learning_rate": 1.6785070830392564e-06, + "loss": 0.181, + "step": 29004 + }, + { + "epoch": 0.7339879039400764, + "grad_norm": 5.011796474456787, + "learning_rate": 1.6782069687735365e-06, + "loss": 0.1126, + "step": 29005 + }, + { + "epoch": 0.7340132095047701, + "grad_norm": 5.943152904510498, + "learning_rate": 1.677906875929442e-06, + "loss": 0.1969, + "step": 29006 + }, + { + "epoch": 0.7340385150694638, + "grad_norm": 3.400575876235962, + "learning_rate": 1.6776068045089084e-06, + "loss": 0.1501, + "step": 29007 + }, + { + "epoch": 0.7340638206341574, + "grad_norm": 5.331707000732422, + "learning_rate": 1.6773067545138688e-06, + "loss": 0.1627, + "step": 29008 + }, + { + "epoch": 0.7340891261988511, + "grad_norm": 4.822449207305908, + "learning_rate": 1.6770067259462613e-06, + "loss": 0.2398, + "step": 29009 + }, + { + "epoch": 0.7341144317635449, + "grad_norm": 7.622990131378174, + "learning_rate": 1.6767067188080194e-06, + "loss": 0.1913, + "step": 29010 + }, + { + "epoch": 0.7341397373282385, + "grad_norm": 2.9363908767700195, + "learning_rate": 1.6764067331010774e-06, + "loss": 0.1311, + "step": 29011 + }, + { + "epoch": 0.7341650428929322, + "grad_norm": 7.1039886474609375, + "learning_rate": 1.6761067688273707e-06, + "loss": 0.2121, + "step": 29012 + }, + { + "epoch": 0.7341903484576259, + "grad_norm": 11.220132827758789, + "learning_rate": 1.6758068259888311e-06, + "loss": 0.1682, + "step": 29013 + }, + { + "epoch": 0.7342156540223195, + "grad_norm": 4.2027082443237305, + "learning_rate": 1.6755069045873967e-06, + "loss": 0.1034, + "step": 29014 + }, + { + "epoch": 0.7342409595870132, + "grad_norm": 3.1241419315338135, + "learning_rate": 1.6752070046249996e-06, + "loss": 0.1219, + "step": 29015 + }, + { + "epoch": 0.7342662651517069, + "grad_norm": 4.911497592926025, + "learning_rate": 1.6749071261035748e-06, + "loss": 0.1879, + "step": 29016 + }, + { + "epoch": 0.7342915707164005, + "grad_norm": 2.7126522064208984, + "learning_rate": 1.6746072690250531e-06, + "loss": 0.1319, + "step": 29017 + }, + { + "epoch": 0.7343168762810942, + "grad_norm": 5.629266262054443, + "learning_rate": 1.6743074333913722e-06, + "loss": 0.1942, + "step": 29018 + }, + { + "epoch": 0.7343421818457879, + "grad_norm": 4.492693901062012, + "learning_rate": 1.6740076192044646e-06, + "loss": 0.1524, + "step": 29019 + }, + { + "epoch": 0.7343674874104815, + "grad_norm": 4.268478870391846, + "learning_rate": 1.6737078264662626e-06, + "loss": 0.1101, + "step": 29020 + }, + { + "epoch": 0.7343927929751752, + "grad_norm": 3.6904664039611816, + "learning_rate": 1.673408055178699e-06, + "loss": 0.1352, + "step": 29021 + }, + { + "epoch": 0.734418098539869, + "grad_norm": 5.862769603729248, + "learning_rate": 1.673108305343709e-06, + "loss": 0.1276, + "step": 29022 + }, + { + "epoch": 0.7344434041045625, + "grad_norm": 4.007191181182861, + "learning_rate": 1.6728085769632252e-06, + "loss": 0.1198, + "step": 29023 + }, + { + "epoch": 0.7344687096692563, + "grad_norm": 3.5087733268737793, + "learning_rate": 1.67250887003918e-06, + "loss": 0.1719, + "step": 29024 + }, + { + "epoch": 0.73449401523395, + "grad_norm": 6.81642484664917, + "learning_rate": 1.6722091845735045e-06, + "loss": 0.1407, + "step": 29025 + }, + { + "epoch": 0.7345193207986436, + "grad_norm": 7.366705894470215, + "learning_rate": 1.671909520568134e-06, + "loss": 0.2138, + "step": 29026 + }, + { + "epoch": 0.7345446263633373, + "grad_norm": 2.7773563861846924, + "learning_rate": 1.671609878024999e-06, + "loss": 0.1058, + "step": 29027 + }, + { + "epoch": 0.734569931928031, + "grad_norm": 7.192803859710693, + "learning_rate": 1.671310256946036e-06, + "loss": 0.2012, + "step": 29028 + }, + { + "epoch": 0.7345952374927246, + "grad_norm": 7.131478786468506, + "learning_rate": 1.6710106573331702e-06, + "loss": 0.1135, + "step": 29029 + }, + { + "epoch": 0.7346205430574183, + "grad_norm": 6.181118011474609, + "learning_rate": 1.6707110791883397e-06, + "loss": 0.0732, + "step": 29030 + }, + { + "epoch": 0.734645848622112, + "grad_norm": 7.735554218292236, + "learning_rate": 1.670411522513472e-06, + "loss": 0.1827, + "step": 29031 + }, + { + "epoch": 0.7346711541868057, + "grad_norm": 9.521318435668945, + "learning_rate": 1.6701119873105044e-06, + "loss": 0.1799, + "step": 29032 + }, + { + "epoch": 0.7346964597514993, + "grad_norm": 2.5060551166534424, + "learning_rate": 1.669812473581362e-06, + "loss": 0.1291, + "step": 29033 + }, + { + "epoch": 0.734721765316193, + "grad_norm": 7.648020267486572, + "learning_rate": 1.669512981327981e-06, + "loss": 0.2078, + "step": 29034 + }, + { + "epoch": 0.7347470708808868, + "grad_norm": 5.341867923736572, + "learning_rate": 1.66921351055229e-06, + "loss": 0.1147, + "step": 29035 + }, + { + "epoch": 0.7347723764455804, + "grad_norm": 4.354780673980713, + "learning_rate": 1.6689140612562227e-06, + "loss": 0.1105, + "step": 29036 + }, + { + "epoch": 0.7347976820102741, + "grad_norm": 2.6379010677337646, + "learning_rate": 1.668614633441709e-06, + "loss": 0.1453, + "step": 29037 + }, + { + "epoch": 0.7348229875749678, + "grad_norm": 5.566742897033691, + "learning_rate": 1.6683152271106802e-06, + "loss": 0.1926, + "step": 29038 + }, + { + "epoch": 0.7348482931396614, + "grad_norm": 7.323096752166748, + "learning_rate": 1.6680158422650667e-06, + "loss": 0.2119, + "step": 29039 + }, + { + "epoch": 0.7348735987043551, + "grad_norm": 12.050180435180664, + "learning_rate": 1.6677164789067973e-06, + "loss": 0.2309, + "step": 29040 + }, + { + "epoch": 0.7348989042690488, + "grad_norm": 8.564044952392578, + "learning_rate": 1.6674171370378062e-06, + "loss": 0.212, + "step": 29041 + }, + { + "epoch": 0.7349242098337424, + "grad_norm": 5.462238788604736, + "learning_rate": 1.6671178166600222e-06, + "loss": 0.1593, + "step": 29042 + }, + { + "epoch": 0.7349495153984361, + "grad_norm": 5.401593208312988, + "learning_rate": 1.6668185177753755e-06, + "loss": 0.1647, + "step": 29043 + }, + { + "epoch": 0.7349748209631298, + "grad_norm": 4.434237003326416, + "learning_rate": 1.6665192403857945e-06, + "loss": 0.1301, + "step": 29044 + }, + { + "epoch": 0.7350001265278234, + "grad_norm": 3.1304328441619873, + "learning_rate": 1.666219984493212e-06, + "loss": 0.1613, + "step": 29045 + }, + { + "epoch": 0.7350254320925171, + "grad_norm": 5.896962642669678, + "learning_rate": 1.6659207500995572e-06, + "loss": 0.2076, + "step": 29046 + }, + { + "epoch": 0.7350507376572109, + "grad_norm": 4.182879447937012, + "learning_rate": 1.6656215372067591e-06, + "loss": 0.2058, + "step": 29047 + }, + { + "epoch": 0.7350760432219045, + "grad_norm": 3.4187116622924805, + "learning_rate": 1.665322345816746e-06, + "loss": 0.0895, + "step": 29048 + }, + { + "epoch": 0.7351013487865982, + "grad_norm": 3.333817720413208, + "learning_rate": 1.6650231759314506e-06, + "loss": 0.0841, + "step": 29049 + }, + { + "epoch": 0.7351266543512919, + "grad_norm": 21.964496612548828, + "learning_rate": 1.6647240275528e-06, + "loss": 0.1718, + "step": 29050 + }, + { + "epoch": 0.7351519599159855, + "grad_norm": 18.830720901489258, + "learning_rate": 1.664424900682724e-06, + "loss": 0.1389, + "step": 29051 + }, + { + "epoch": 0.7351772654806792, + "grad_norm": 5.951946258544922, + "learning_rate": 1.6641257953231494e-06, + "loss": 0.2194, + "step": 29052 + }, + { + "epoch": 0.7352025710453729, + "grad_norm": 3.5390357971191406, + "learning_rate": 1.6638267114760087e-06, + "loss": 0.1334, + "step": 29053 + }, + { + "epoch": 0.7352278766100665, + "grad_norm": 6.164599895477295, + "learning_rate": 1.6635276491432295e-06, + "loss": 0.1706, + "step": 29054 + }, + { + "epoch": 0.7352531821747602, + "grad_norm": 8.639124870300293, + "learning_rate": 1.663228608326739e-06, + "loss": 0.2018, + "step": 29055 + }, + { + "epoch": 0.7352784877394539, + "grad_norm": 4.595317840576172, + "learning_rate": 1.6629295890284659e-06, + "loss": 0.1697, + "step": 29056 + }, + { + "epoch": 0.7353037933041476, + "grad_norm": 8.518277168273926, + "learning_rate": 1.6626305912503398e-06, + "loss": 0.2159, + "step": 29057 + }, + { + "epoch": 0.7353290988688412, + "grad_norm": 3.481034278869629, + "learning_rate": 1.662331614994287e-06, + "loss": 0.1372, + "step": 29058 + }, + { + "epoch": 0.735354404433535, + "grad_norm": 4.323614597320557, + "learning_rate": 1.66203266026224e-06, + "loss": 0.1505, + "step": 29059 + }, + { + "epoch": 0.7353797099982287, + "grad_norm": 3.0690207481384277, + "learning_rate": 1.6617337270561208e-06, + "loss": 0.1096, + "step": 29060 + }, + { + "epoch": 0.7354050155629223, + "grad_norm": 17.454833984375, + "learning_rate": 1.6614348153778604e-06, + "loss": 0.2095, + "step": 29061 + }, + { + "epoch": 0.735430321127616, + "grad_norm": 4.9555253982543945, + "learning_rate": 1.6611359252293846e-06, + "loss": 0.1014, + "step": 29062 + }, + { + "epoch": 0.7354556266923097, + "grad_norm": 7.8900837898254395, + "learning_rate": 1.660837056612626e-06, + "loss": 0.1946, + "step": 29063 + }, + { + "epoch": 0.7354809322570033, + "grad_norm": 6.586294651031494, + "learning_rate": 1.6605382095295043e-06, + "loss": 0.1384, + "step": 29064 + }, + { + "epoch": 0.735506237821697, + "grad_norm": 4.9017720222473145, + "learning_rate": 1.660239383981952e-06, + "loss": 0.1664, + "step": 29065 + }, + { + "epoch": 0.7355315433863907, + "grad_norm": 7.214956283569336, + "learning_rate": 1.6599405799718948e-06, + "loss": 0.163, + "step": 29066 + }, + { + "epoch": 0.7355568489510843, + "grad_norm": 2.0606963634490967, + "learning_rate": 1.6596417975012596e-06, + "loss": 0.0695, + "step": 29067 + }, + { + "epoch": 0.735582154515778, + "grad_norm": 4.571021556854248, + "learning_rate": 1.6593430365719715e-06, + "loss": 0.1457, + "step": 29068 + }, + { + "epoch": 0.7356074600804717, + "grad_norm": 3.2558345794677734, + "learning_rate": 1.65904429718596e-06, + "loss": 0.1629, + "step": 29069 + }, + { + "epoch": 0.7356327656451653, + "grad_norm": 4.763192653656006, + "learning_rate": 1.6587455793451506e-06, + "loss": 0.1386, + "step": 29070 + }, + { + "epoch": 0.735658071209859, + "grad_norm": 6.073489665985107, + "learning_rate": 1.6584468830514683e-06, + "loss": 0.143, + "step": 29071 + }, + { + "epoch": 0.7356833767745528, + "grad_norm": 4.633375644683838, + "learning_rate": 1.6581482083068413e-06, + "loss": 0.132, + "step": 29072 + }, + { + "epoch": 0.7357086823392464, + "grad_norm": 4.5937347412109375, + "learning_rate": 1.6578495551131957e-06, + "loss": 0.1387, + "step": 29073 + }, + { + "epoch": 0.7357339879039401, + "grad_norm": 7.924753665924072, + "learning_rate": 1.6575509234724557e-06, + "loss": 0.2157, + "step": 29074 + }, + { + "epoch": 0.7357592934686338, + "grad_norm": 6.113481044769287, + "learning_rate": 1.657252313386547e-06, + "loss": 0.2374, + "step": 29075 + }, + { + "epoch": 0.7357845990333274, + "grad_norm": 5.202059745788574, + "learning_rate": 1.6569537248573996e-06, + "loss": 0.1451, + "step": 29076 + }, + { + "epoch": 0.7358099045980211, + "grad_norm": 9.295553207397461, + "learning_rate": 1.6566551578869322e-06, + "loss": 0.2834, + "step": 29077 + }, + { + "epoch": 0.7358352101627148, + "grad_norm": 3.484062433242798, + "learning_rate": 1.6563566124770757e-06, + "loss": 0.1525, + "step": 29078 + }, + { + "epoch": 0.7358605157274084, + "grad_norm": 10.855634689331055, + "learning_rate": 1.6560580886297518e-06, + "loss": 0.2014, + "step": 29079 + }, + { + "epoch": 0.7358858212921021, + "grad_norm": 6.2923784255981445, + "learning_rate": 1.655759586346889e-06, + "loss": 0.1649, + "step": 29080 + }, + { + "epoch": 0.7359111268567958, + "grad_norm": 5.542651176452637, + "learning_rate": 1.6554611056304103e-06, + "loss": 0.17, + "step": 29081 + }, + { + "epoch": 0.7359364324214895, + "grad_norm": 2.8212759494781494, + "learning_rate": 1.6551626464822407e-06, + "loss": 0.1042, + "step": 29082 + }, + { + "epoch": 0.7359617379861831, + "grad_norm": 3.361696481704712, + "learning_rate": 1.6548642089043038e-06, + "loss": 0.1108, + "step": 29083 + }, + { + "epoch": 0.7359870435508769, + "grad_norm": 4.288145542144775, + "learning_rate": 1.6545657928985264e-06, + "loss": 0.1633, + "step": 29084 + }, + { + "epoch": 0.7360123491155706, + "grad_norm": 3.4943349361419678, + "learning_rate": 1.6542673984668323e-06, + "loss": 0.0988, + "step": 29085 + }, + { + "epoch": 0.7360376546802642, + "grad_norm": 10.312204360961914, + "learning_rate": 1.6539690256111452e-06, + "loss": 0.3151, + "step": 29086 + }, + { + "epoch": 0.7360629602449579, + "grad_norm": 6.558053493499756, + "learning_rate": 1.653670674333388e-06, + "loss": 0.1332, + "step": 29087 + }, + { + "epoch": 0.7360882658096516, + "grad_norm": 3.9575233459472656, + "learning_rate": 1.6533723446354877e-06, + "loss": 0.1414, + "step": 29088 + }, + { + "epoch": 0.7361135713743452, + "grad_norm": 2.540846586227417, + "learning_rate": 1.6530740365193654e-06, + "loss": 0.0935, + "step": 29089 + }, + { + "epoch": 0.7361388769390389, + "grad_norm": 2.9536824226379395, + "learning_rate": 1.652775749986949e-06, + "loss": 0.0955, + "step": 29090 + }, + { + "epoch": 0.7361641825037326, + "grad_norm": 4.839438438415527, + "learning_rate": 1.6524774850401553e-06, + "loss": 0.1545, + "step": 29091 + }, + { + "epoch": 0.7361894880684262, + "grad_norm": 3.4506289958953857, + "learning_rate": 1.6521792416809141e-06, + "loss": 0.0887, + "step": 29092 + }, + { + "epoch": 0.7362147936331199, + "grad_norm": 5.836716651916504, + "learning_rate": 1.6518810199111441e-06, + "loss": 0.2021, + "step": 29093 + }, + { + "epoch": 0.7362400991978136, + "grad_norm": 4.2143025398254395, + "learning_rate": 1.6515828197327744e-06, + "loss": 0.1449, + "step": 29094 + }, + { + "epoch": 0.7362654047625072, + "grad_norm": 7.946381092071533, + "learning_rate": 1.6512846411477207e-06, + "loss": 0.1531, + "step": 29095 + }, + { + "epoch": 0.736290710327201, + "grad_norm": 14.313111305236816, + "learning_rate": 1.650986484157911e-06, + "loss": 0.2167, + "step": 29096 + }, + { + "epoch": 0.7363160158918947, + "grad_norm": 6.142868995666504, + "learning_rate": 1.6506883487652663e-06, + "loss": 0.1567, + "step": 29097 + }, + { + "epoch": 0.7363413214565883, + "grad_norm": 7.89741849899292, + "learning_rate": 1.65039023497171e-06, + "loss": 0.1152, + "step": 29098 + }, + { + "epoch": 0.736366627021282, + "grad_norm": 8.188140869140625, + "learning_rate": 1.6500921427791616e-06, + "loss": 0.2648, + "step": 29099 + }, + { + "epoch": 0.7363919325859757, + "grad_norm": 3.299236536026001, + "learning_rate": 1.6497940721895477e-06, + "loss": 0.0907, + "step": 29100 + }, + { + "epoch": 0.7364172381506693, + "grad_norm": 7.253750801086426, + "learning_rate": 1.649496023204789e-06, + "loss": 0.2149, + "step": 29101 + }, + { + "epoch": 0.736442543715363, + "grad_norm": 10.43338680267334, + "learning_rate": 1.6491979958268056e-06, + "loss": 0.2058, + "step": 29102 + }, + { + "epoch": 0.7364678492800567, + "grad_norm": 3.636669158935547, + "learning_rate": 1.6488999900575237e-06, + "loss": 0.1387, + "step": 29103 + }, + { + "epoch": 0.7364931548447503, + "grad_norm": 3.9369890689849854, + "learning_rate": 1.6486020058988595e-06, + "loss": 0.1608, + "step": 29104 + }, + { + "epoch": 0.736518460409444, + "grad_norm": 4.634882926940918, + "learning_rate": 1.6483040433527392e-06, + "loss": 0.1621, + "step": 29105 + }, + { + "epoch": 0.7365437659741377, + "grad_norm": 6.826625823974609, + "learning_rate": 1.6480061024210813e-06, + "loss": 0.1729, + "step": 29106 + }, + { + "epoch": 0.7365690715388314, + "grad_norm": 2.6735682487487793, + "learning_rate": 1.6477081831058116e-06, + "loss": 0.0766, + "step": 29107 + }, + { + "epoch": 0.736594377103525, + "grad_norm": 3.966627836227417, + "learning_rate": 1.6474102854088448e-06, + "loss": 0.1321, + "step": 29108 + }, + { + "epoch": 0.7366196826682188, + "grad_norm": 4.094501495361328, + "learning_rate": 1.6471124093321073e-06, + "loss": 0.1947, + "step": 29109 + }, + { + "epoch": 0.7366449882329125, + "grad_norm": 2.4975290298461914, + "learning_rate": 1.6468145548775167e-06, + "loss": 0.1006, + "step": 29110 + }, + { + "epoch": 0.7366702937976061, + "grad_norm": 6.046950817108154, + "learning_rate": 1.6465167220469963e-06, + "loss": 0.1251, + "step": 29111 + }, + { + "epoch": 0.7366955993622998, + "grad_norm": 6.333138942718506, + "learning_rate": 1.6462189108424664e-06, + "loss": 0.1719, + "step": 29112 + }, + { + "epoch": 0.7367209049269935, + "grad_norm": 8.446619033813477, + "learning_rate": 1.6459211212658466e-06, + "loss": 0.2075, + "step": 29113 + }, + { + "epoch": 0.7367462104916871, + "grad_norm": 8.474602699279785, + "learning_rate": 1.6456233533190558e-06, + "loss": 0.1839, + "step": 29114 + }, + { + "epoch": 0.7367715160563808, + "grad_norm": 3.468114137649536, + "learning_rate": 1.645325607004018e-06, + "loss": 0.151, + "step": 29115 + }, + { + "epoch": 0.7367968216210745, + "grad_norm": 4.194108009338379, + "learning_rate": 1.6450278823226512e-06, + "loss": 0.1386, + "step": 29116 + }, + { + "epoch": 0.7368221271857681, + "grad_norm": 9.364513397216797, + "learning_rate": 1.6447301792768756e-06, + "loss": 0.2822, + "step": 29117 + }, + { + "epoch": 0.7368474327504618, + "grad_norm": 3.884561538696289, + "learning_rate": 1.644432497868609e-06, + "loss": 0.1413, + "step": 29118 + }, + { + "epoch": 0.7368727383151555, + "grad_norm": 5.5271124839782715, + "learning_rate": 1.6441348380997746e-06, + "loss": 0.1399, + "step": 29119 + }, + { + "epoch": 0.7368980438798491, + "grad_norm": 2.394007921218872, + "learning_rate": 1.6438371999722907e-06, + "loss": 0.0794, + "step": 29120 + }, + { + "epoch": 0.7369233494445429, + "grad_norm": 2.9044337272644043, + "learning_rate": 1.6435395834880768e-06, + "loss": 0.0968, + "step": 29121 + }, + { + "epoch": 0.7369486550092366, + "grad_norm": 7.726348400115967, + "learning_rate": 1.6432419886490496e-06, + "loss": 0.1811, + "step": 29122 + }, + { + "epoch": 0.7369739605739302, + "grad_norm": 7.356654644012451, + "learning_rate": 1.6429444154571323e-06, + "loss": 0.2228, + "step": 29123 + }, + { + "epoch": 0.7369992661386239, + "grad_norm": 14.044183731079102, + "learning_rate": 1.6426468639142418e-06, + "loss": 0.1574, + "step": 29124 + }, + { + "epoch": 0.7370245717033176, + "grad_norm": 5.704319953918457, + "learning_rate": 1.6423493340222972e-06, + "loss": 0.1483, + "step": 29125 + }, + { + "epoch": 0.7370498772680112, + "grad_norm": 3.027155637741089, + "learning_rate": 1.6420518257832158e-06, + "loss": 0.1671, + "step": 29126 + }, + { + "epoch": 0.7370751828327049, + "grad_norm": 7.3831787109375, + "learning_rate": 1.6417543391989189e-06, + "loss": 0.1726, + "step": 29127 + }, + { + "epoch": 0.7371004883973986, + "grad_norm": 2.605872631072998, + "learning_rate": 1.6414568742713232e-06, + "loss": 0.0568, + "step": 29128 + }, + { + "epoch": 0.7371257939620922, + "grad_norm": 2.936617851257324, + "learning_rate": 1.6411594310023481e-06, + "loss": 0.1166, + "step": 29129 + }, + { + "epoch": 0.7371510995267859, + "grad_norm": 3.3357982635498047, + "learning_rate": 1.6408620093939093e-06, + "loss": 0.1548, + "step": 29130 + }, + { + "epoch": 0.7371764050914796, + "grad_norm": 7.028811931610107, + "learning_rate": 1.6405646094479278e-06, + "loss": 0.1816, + "step": 29131 + }, + { + "epoch": 0.7372017106561733, + "grad_norm": 5.493898868560791, + "learning_rate": 1.6402672311663204e-06, + "loss": 0.2232, + "step": 29132 + }, + { + "epoch": 0.737227016220867, + "grad_norm": 14.005766868591309, + "learning_rate": 1.639969874551003e-06, + "loss": 0.1649, + "step": 29133 + }, + { + "epoch": 0.7372523217855607, + "grad_norm": 3.6034328937530518, + "learning_rate": 1.6396725396038982e-06, + "loss": 0.1222, + "step": 29134 + }, + { + "epoch": 0.7372776273502544, + "grad_norm": 7.323826789855957, + "learning_rate": 1.6393752263269169e-06, + "loss": 0.2439, + "step": 29135 + }, + { + "epoch": 0.737302932914948, + "grad_norm": 10.898417472839355, + "learning_rate": 1.6390779347219816e-06, + "loss": 0.178, + "step": 29136 + }, + { + "epoch": 0.7373282384796417, + "grad_norm": 2.882089138031006, + "learning_rate": 1.6387806647910054e-06, + "loss": 0.1078, + "step": 29137 + }, + { + "epoch": 0.7373535440443354, + "grad_norm": 8.108588218688965, + "learning_rate": 1.638483416535911e-06, + "loss": 0.2309, + "step": 29138 + }, + { + "epoch": 0.737378849609029, + "grad_norm": 6.378810405731201, + "learning_rate": 1.6381861899586082e-06, + "loss": 0.1197, + "step": 29139 + }, + { + "epoch": 0.7374041551737227, + "grad_norm": 9.024246215820312, + "learning_rate": 1.6378889850610192e-06, + "loss": 0.1635, + "step": 29140 + }, + { + "epoch": 0.7374294607384164, + "grad_norm": 12.057975769042969, + "learning_rate": 1.6375918018450564e-06, + "loss": 0.1847, + "step": 29141 + }, + { + "epoch": 0.73745476630311, + "grad_norm": 5.017542839050293, + "learning_rate": 1.6372946403126428e-06, + "loss": 0.1808, + "step": 29142 + }, + { + "epoch": 0.7374800718678037, + "grad_norm": 6.402581691741943, + "learning_rate": 1.6369975004656868e-06, + "loss": 0.2209, + "step": 29143 + }, + { + "epoch": 0.7375053774324974, + "grad_norm": 7.234936237335205, + "learning_rate": 1.6367003823061101e-06, + "loss": 0.2336, + "step": 29144 + }, + { + "epoch": 0.737530682997191, + "grad_norm": 5.157118797302246, + "learning_rate": 1.636403285835826e-06, + "loss": 0.1733, + "step": 29145 + }, + { + "epoch": 0.7375559885618848, + "grad_norm": 3.942706823348999, + "learning_rate": 1.6361062110567521e-06, + "loss": 0.1943, + "step": 29146 + }, + { + "epoch": 0.7375812941265785, + "grad_norm": 9.363911628723145, + "learning_rate": 1.635809157970804e-06, + "loss": 0.2917, + "step": 29147 + }, + { + "epoch": 0.7376065996912721, + "grad_norm": 3.0910024642944336, + "learning_rate": 1.6355121265798967e-06, + "loss": 0.1829, + "step": 29148 + }, + { + "epoch": 0.7376319052559658, + "grad_norm": 4.633071422576904, + "learning_rate": 1.6352151168859442e-06, + "loss": 0.2164, + "step": 29149 + }, + { + "epoch": 0.7376572108206595, + "grad_norm": 10.151025772094727, + "learning_rate": 1.6349181288908655e-06, + "loss": 0.1495, + "step": 29150 + }, + { + "epoch": 0.7376825163853531, + "grad_norm": 3.531567335128784, + "learning_rate": 1.6346211625965734e-06, + "loss": 0.1059, + "step": 29151 + }, + { + "epoch": 0.7377078219500468, + "grad_norm": 7.579751014709473, + "learning_rate": 1.6343242180049835e-06, + "loss": 0.2303, + "step": 29152 + }, + { + "epoch": 0.7377331275147405, + "grad_norm": 4.738554954528809, + "learning_rate": 1.6340272951180092e-06, + "loss": 0.1825, + "step": 29153 + }, + { + "epoch": 0.7377584330794341, + "grad_norm": 4.02273416519165, + "learning_rate": 1.6337303939375682e-06, + "loss": 0.1462, + "step": 29154 + }, + { + "epoch": 0.7377837386441278, + "grad_norm": 4.234594821929932, + "learning_rate": 1.633433514465574e-06, + "loss": 0.1412, + "step": 29155 + }, + { + "epoch": 0.7378090442088215, + "grad_norm": 6.704031944274902, + "learning_rate": 1.6331366567039404e-06, + "loss": 0.2297, + "step": 29156 + }, + { + "epoch": 0.7378343497735151, + "grad_norm": 3.6841180324554443, + "learning_rate": 1.632839820654581e-06, + "loss": 0.1536, + "step": 29157 + }, + { + "epoch": 0.7378596553382089, + "grad_norm": 6.476461887359619, + "learning_rate": 1.6325430063194125e-06, + "loss": 0.1204, + "step": 29158 + }, + { + "epoch": 0.7378849609029026, + "grad_norm": 9.470731735229492, + "learning_rate": 1.632246213700348e-06, + "loss": 0.2071, + "step": 29159 + }, + { + "epoch": 0.7379102664675963, + "grad_norm": 4.008752822875977, + "learning_rate": 1.631949442799301e-06, + "loss": 0.1242, + "step": 29160 + }, + { + "epoch": 0.7379355720322899, + "grad_norm": 9.331660270690918, + "learning_rate": 1.6316526936181858e-06, + "loss": 0.1218, + "step": 29161 + }, + { + "epoch": 0.7379608775969836, + "grad_norm": 11.999459266662598, + "learning_rate": 1.631355966158914e-06, + "loss": 0.2582, + "step": 29162 + }, + { + "epoch": 0.7379861831616773, + "grad_norm": 15.997225761413574, + "learning_rate": 1.6310592604234027e-06, + "loss": 0.2432, + "step": 29163 + }, + { + "epoch": 0.7380114887263709, + "grad_norm": 4.073899745941162, + "learning_rate": 1.630762576413562e-06, + "loss": 0.1591, + "step": 29164 + }, + { + "epoch": 0.7380367942910646, + "grad_norm": 4.262120723724365, + "learning_rate": 1.6304659141313101e-06, + "loss": 0.1225, + "step": 29165 + }, + { + "epoch": 0.7380620998557583, + "grad_norm": 7.323433876037598, + "learning_rate": 1.6301692735785535e-06, + "loss": 0.2541, + "step": 29166 + }, + { + "epoch": 0.7380874054204519, + "grad_norm": 3.148075580596924, + "learning_rate": 1.6298726547572097e-06, + "loss": 0.1597, + "step": 29167 + }, + { + "epoch": 0.7381127109851456, + "grad_norm": 7.657674789428711, + "learning_rate": 1.6295760576691887e-06, + "loss": 0.1543, + "step": 29168 + }, + { + "epoch": 0.7381380165498393, + "grad_norm": 5.449223041534424, + "learning_rate": 1.6292794823164082e-06, + "loss": 0.1473, + "step": 29169 + }, + { + "epoch": 0.738163322114533, + "grad_norm": 18.787879943847656, + "learning_rate": 1.6289829287007735e-06, + "loss": 0.1229, + "step": 29170 + }, + { + "epoch": 0.7381886276792267, + "grad_norm": 8.054549217224121, + "learning_rate": 1.6286863968242028e-06, + "loss": 0.2069, + "step": 29171 + }, + { + "epoch": 0.7382139332439204, + "grad_norm": 6.325369834899902, + "learning_rate": 1.628389886688605e-06, + "loss": 0.1812, + "step": 29172 + }, + { + "epoch": 0.738239238808614, + "grad_norm": 5.741118907928467, + "learning_rate": 1.6280933982958969e-06, + "loss": 0.1616, + "step": 29173 + }, + { + "epoch": 0.7382645443733077, + "grad_norm": 4.030803203582764, + "learning_rate": 1.6277969316479841e-06, + "loss": 0.0988, + "step": 29174 + }, + { + "epoch": 0.7382898499380014, + "grad_norm": 4.895702838897705, + "learning_rate": 1.6275004867467832e-06, + "loss": 0.1555, + "step": 29175 + }, + { + "epoch": 0.738315155502695, + "grad_norm": 4.7324724197387695, + "learning_rate": 1.6272040635942022e-06, + "loss": 0.1809, + "step": 29176 + }, + { + "epoch": 0.7383404610673887, + "grad_norm": 5.761960029602051, + "learning_rate": 1.626907662192157e-06, + "loss": 0.1582, + "step": 29177 + }, + { + "epoch": 0.7383657666320824, + "grad_norm": 6.02268648147583, + "learning_rate": 1.6266112825425562e-06, + "loss": 0.1677, + "step": 29178 + }, + { + "epoch": 0.738391072196776, + "grad_norm": 5.519427299499512, + "learning_rate": 1.6263149246473125e-06, + "loss": 0.1041, + "step": 29179 + }, + { + "epoch": 0.7384163777614697, + "grad_norm": 3.454787015914917, + "learning_rate": 1.626018588508334e-06, + "loss": 0.1006, + "step": 29180 + }, + { + "epoch": 0.7384416833261634, + "grad_norm": 4.83918571472168, + "learning_rate": 1.6257222741275357e-06, + "loss": 0.1458, + "step": 29181 + }, + { + "epoch": 0.738466988890857, + "grad_norm": 14.701075553894043, + "learning_rate": 1.6254259815068273e-06, + "loss": 0.2246, + "step": 29182 + }, + { + "epoch": 0.7384922944555508, + "grad_norm": 2.6568899154663086, + "learning_rate": 1.625129710648119e-06, + "loss": 0.1431, + "step": 29183 + }, + { + "epoch": 0.7385176000202445, + "grad_norm": 5.624055862426758, + "learning_rate": 1.6248334615533196e-06, + "loss": 0.2171, + "step": 29184 + }, + { + "epoch": 0.7385429055849382, + "grad_norm": 12.42212963104248, + "learning_rate": 1.6245372342243426e-06, + "loss": 0.1654, + "step": 29185 + }, + { + "epoch": 0.7385682111496318, + "grad_norm": 17.028179168701172, + "learning_rate": 1.6242410286630976e-06, + "loss": 0.3083, + "step": 29186 + }, + { + "epoch": 0.7385935167143255, + "grad_norm": 3.960253953933716, + "learning_rate": 1.623944844871494e-06, + "loss": 0.1059, + "step": 29187 + }, + { + "epoch": 0.7386188222790192, + "grad_norm": 6.248212814331055, + "learning_rate": 1.6236486828514426e-06, + "loss": 0.1649, + "step": 29188 + }, + { + "epoch": 0.7386441278437128, + "grad_norm": 3.069943428039551, + "learning_rate": 1.6233525426048508e-06, + "loss": 0.159, + "step": 29189 + }, + { + "epoch": 0.7386694334084065, + "grad_norm": 4.146209716796875, + "learning_rate": 1.6230564241336316e-06, + "loss": 0.1622, + "step": 29190 + }, + { + "epoch": 0.7386947389731002, + "grad_norm": 6.058735370635986, + "learning_rate": 1.622760327439694e-06, + "loss": 0.1186, + "step": 29191 + }, + { + "epoch": 0.7387200445377938, + "grad_norm": 8.721562385559082, + "learning_rate": 1.6224642525249462e-06, + "loss": 0.176, + "step": 29192 + }, + { + "epoch": 0.7387453501024875, + "grad_norm": 4.090834140777588, + "learning_rate": 1.6221681993912975e-06, + "loss": 0.1511, + "step": 29193 + }, + { + "epoch": 0.7387706556671813, + "grad_norm": 6.291717529296875, + "learning_rate": 1.6218721680406586e-06, + "loss": 0.2582, + "step": 29194 + }, + { + "epoch": 0.7387959612318749, + "grad_norm": 3.4918875694274902, + "learning_rate": 1.621576158474938e-06, + "loss": 0.1056, + "step": 29195 + }, + { + "epoch": 0.7388212667965686, + "grad_norm": 10.241922378540039, + "learning_rate": 1.6212801706960441e-06, + "loss": 0.2692, + "step": 29196 + }, + { + "epoch": 0.7388465723612623, + "grad_norm": 3.351470470428467, + "learning_rate": 1.6209842047058843e-06, + "loss": 0.1278, + "step": 29197 + }, + { + "epoch": 0.7388718779259559, + "grad_norm": 2.735252857208252, + "learning_rate": 1.6206882605063707e-06, + "loss": 0.1204, + "step": 29198 + }, + { + "epoch": 0.7388971834906496, + "grad_norm": 16.4307804107666, + "learning_rate": 1.620392338099408e-06, + "loss": 0.1875, + "step": 29199 + }, + { + "epoch": 0.7389224890553433, + "grad_norm": 8.086297988891602, + "learning_rate": 1.62009643748691e-06, + "loss": 0.2414, + "step": 29200 + }, + { + "epoch": 0.7389477946200369, + "grad_norm": 3.0124270915985107, + "learning_rate": 1.6198005586707777e-06, + "loss": 0.1638, + "step": 29201 + }, + { + "epoch": 0.7389731001847306, + "grad_norm": 2.4847118854522705, + "learning_rate": 1.6195047016529247e-06, + "loss": 0.1326, + "step": 29202 + }, + { + "epoch": 0.7389984057494243, + "grad_norm": 3.6141295433044434, + "learning_rate": 1.6192088664352552e-06, + "loss": 0.133, + "step": 29203 + }, + { + "epoch": 0.7390237113141179, + "grad_norm": 6.8870697021484375, + "learning_rate": 1.6189130530196822e-06, + "loss": 0.1941, + "step": 29204 + }, + { + "epoch": 0.7390490168788116, + "grad_norm": 6.712528228759766, + "learning_rate": 1.618617261408107e-06, + "loss": 0.156, + "step": 29205 + }, + { + "epoch": 0.7390743224435053, + "grad_norm": 16.142724990844727, + "learning_rate": 1.6183214916024416e-06, + "loss": 0.199, + "step": 29206 + }, + { + "epoch": 0.739099628008199, + "grad_norm": 5.1846818923950195, + "learning_rate": 1.6180257436045904e-06, + "loss": 0.1509, + "step": 29207 + }, + { + "epoch": 0.7391249335728927, + "grad_norm": 13.534135818481445, + "learning_rate": 1.6177300174164634e-06, + "loss": 0.2631, + "step": 29208 + }, + { + "epoch": 0.7391502391375864, + "grad_norm": 4.022157192230225, + "learning_rate": 1.6174343130399666e-06, + "loss": 0.1183, + "step": 29209 + }, + { + "epoch": 0.7391755447022801, + "grad_norm": 3.4899673461914062, + "learning_rate": 1.6171386304770065e-06, + "loss": 0.1224, + "step": 29210 + }, + { + "epoch": 0.7392008502669737, + "grad_norm": 4.729909420013428, + "learning_rate": 1.6168429697294885e-06, + "loss": 0.1819, + "step": 29211 + }, + { + "epoch": 0.7392261558316674, + "grad_norm": 4.456071376800537, + "learning_rate": 1.6165473307993224e-06, + "loss": 0.2041, + "step": 29212 + }, + { + "epoch": 0.7392514613963611, + "grad_norm": 4.575217247009277, + "learning_rate": 1.6162517136884137e-06, + "loss": 0.1253, + "step": 29213 + }, + { + "epoch": 0.7392767669610547, + "grad_norm": 4.297523021697998, + "learning_rate": 1.6159561183986676e-06, + "loss": 0.1849, + "step": 29214 + }, + { + "epoch": 0.7393020725257484, + "grad_norm": 3.6269690990448, + "learning_rate": 1.61566054493199e-06, + "loss": 0.0825, + "step": 29215 + }, + { + "epoch": 0.7393273780904421, + "grad_norm": 9.561797142028809, + "learning_rate": 1.6153649932902892e-06, + "loss": 0.2633, + "step": 29216 + }, + { + "epoch": 0.7393526836551357, + "grad_norm": 6.939586162567139, + "learning_rate": 1.6150694634754704e-06, + "loss": 0.2381, + "step": 29217 + }, + { + "epoch": 0.7393779892198294, + "grad_norm": 4.917571067810059, + "learning_rate": 1.6147739554894388e-06, + "loss": 0.1956, + "step": 29218 + }, + { + "epoch": 0.7394032947845232, + "grad_norm": 9.852492332458496, + "learning_rate": 1.6144784693340998e-06, + "loss": 0.1473, + "step": 29219 + }, + { + "epoch": 0.7394286003492168, + "grad_norm": 7.256412506103516, + "learning_rate": 1.6141830050113583e-06, + "loss": 0.181, + "step": 29220 + }, + { + "epoch": 0.7394539059139105, + "grad_norm": 3.989315986633301, + "learning_rate": 1.6138875625231222e-06, + "loss": 0.1392, + "step": 29221 + }, + { + "epoch": 0.7394792114786042, + "grad_norm": 7.097565174102783, + "learning_rate": 1.6135921418712959e-06, + "loss": 0.1934, + "step": 29222 + }, + { + "epoch": 0.7395045170432978, + "grad_norm": 3.728377342224121, + "learning_rate": 1.613296743057783e-06, + "loss": 0.1078, + "step": 29223 + }, + { + "epoch": 0.7395298226079915, + "grad_norm": 5.655606746673584, + "learning_rate": 1.6130013660844885e-06, + "loss": 0.249, + "step": 29224 + }, + { + "epoch": 0.7395551281726852, + "grad_norm": 7.892080307006836, + "learning_rate": 1.6127060109533193e-06, + "loss": 0.2323, + "step": 29225 + }, + { + "epoch": 0.7395804337373788, + "grad_norm": 5.910373687744141, + "learning_rate": 1.612410677666179e-06, + "loss": 0.1919, + "step": 29226 + }, + { + "epoch": 0.7396057393020725, + "grad_norm": 3.233736991882324, + "learning_rate": 1.6121153662249728e-06, + "loss": 0.1246, + "step": 29227 + }, + { + "epoch": 0.7396310448667662, + "grad_norm": 10.903295516967773, + "learning_rate": 1.6118200766316021e-06, + "loss": 0.1285, + "step": 29228 + }, + { + "epoch": 0.7396563504314598, + "grad_norm": 3.8219475746154785, + "learning_rate": 1.6115248088879753e-06, + "loss": 0.1007, + "step": 29229 + }, + { + "epoch": 0.7396816559961535, + "grad_norm": 5.999175548553467, + "learning_rate": 1.6112295629959928e-06, + "loss": 0.1505, + "step": 29230 + }, + { + "epoch": 0.7397069615608473, + "grad_norm": 3.506392478942871, + "learning_rate": 1.6109343389575643e-06, + "loss": 0.0732, + "step": 29231 + }, + { + "epoch": 0.7397322671255409, + "grad_norm": 6.911091327667236, + "learning_rate": 1.610639136774586e-06, + "loss": 0.1746, + "step": 29232 + }, + { + "epoch": 0.7397575726902346, + "grad_norm": 5.2260661125183105, + "learning_rate": 1.6103439564489675e-06, + "loss": 0.1851, + "step": 29233 + }, + { + "epoch": 0.7397828782549283, + "grad_norm": 4.220390319824219, + "learning_rate": 1.6100487979826085e-06, + "loss": 0.1307, + "step": 29234 + }, + { + "epoch": 0.739808183819622, + "grad_norm": 5.514540672302246, + "learning_rate": 1.609753661377418e-06, + "loss": 0.1288, + "step": 29235 + }, + { + "epoch": 0.7398334893843156, + "grad_norm": 5.461320877075195, + "learning_rate": 1.6094585466352914e-06, + "loss": 0.1224, + "step": 29236 + }, + { + "epoch": 0.7398587949490093, + "grad_norm": 2.9799983501434326, + "learning_rate": 1.609163453758138e-06, + "loss": 0.1058, + "step": 29237 + }, + { + "epoch": 0.739884100513703, + "grad_norm": 6.196725368499756, + "learning_rate": 1.6088683827478569e-06, + "loss": 0.154, + "step": 29238 + }, + { + "epoch": 0.7399094060783966, + "grad_norm": 10.084338188171387, + "learning_rate": 1.6085733336063548e-06, + "loss": 0.181, + "step": 29239 + }, + { + "epoch": 0.7399347116430903, + "grad_norm": 4.278904914855957, + "learning_rate": 1.6082783063355322e-06, + "loss": 0.1647, + "step": 29240 + }, + { + "epoch": 0.739960017207784, + "grad_norm": 4.230588912963867, + "learning_rate": 1.607983300937292e-06, + "loss": 0.1727, + "step": 29241 + }, + { + "epoch": 0.7399853227724776, + "grad_norm": 3.17610764503479, + "learning_rate": 1.6076883174135344e-06, + "loss": 0.1121, + "step": 29242 + }, + { + "epoch": 0.7400106283371714, + "grad_norm": 4.398932933807373, + "learning_rate": 1.6073933557661659e-06, + "loss": 0.2072, + "step": 29243 + }, + { + "epoch": 0.7400359339018651, + "grad_norm": 9.943184852600098, + "learning_rate": 1.6070984159970864e-06, + "loss": 0.191, + "step": 29244 + }, + { + "epoch": 0.7400612394665587, + "grad_norm": 9.268298149108887, + "learning_rate": 1.606803498108198e-06, + "loss": 0.2426, + "step": 29245 + }, + { + "epoch": 0.7400865450312524, + "grad_norm": 3.2774970531463623, + "learning_rate": 1.606508602101403e-06, + "loss": 0.1283, + "step": 29246 + }, + { + "epoch": 0.7401118505959461, + "grad_norm": 3.853654623031616, + "learning_rate": 1.606213727978601e-06, + "loss": 0.061, + "step": 29247 + }, + { + "epoch": 0.7401371561606397, + "grad_norm": 5.777159214019775, + "learning_rate": 1.6059188757416971e-06, + "loss": 0.1879, + "step": 29248 + }, + { + "epoch": 0.7401624617253334, + "grad_norm": 8.063641548156738, + "learning_rate": 1.6056240453925909e-06, + "loss": 0.2382, + "step": 29249 + }, + { + "epoch": 0.7401877672900271, + "grad_norm": 17.833526611328125, + "learning_rate": 1.6053292369331841e-06, + "loss": 0.1975, + "step": 29250 + }, + { + "epoch": 0.7402130728547207, + "grad_norm": 2.501946449279785, + "learning_rate": 1.6050344503653758e-06, + "loss": 0.1256, + "step": 29251 + }, + { + "epoch": 0.7402383784194144, + "grad_norm": 4.975449562072754, + "learning_rate": 1.6047396856910702e-06, + "loss": 0.1138, + "step": 29252 + }, + { + "epoch": 0.7402636839841081, + "grad_norm": 4.696154594421387, + "learning_rate": 1.6044449429121673e-06, + "loss": 0.1424, + "step": 29253 + }, + { + "epoch": 0.7402889895488017, + "grad_norm": 4.897338390350342, + "learning_rate": 1.6041502220305665e-06, + "loss": 0.1772, + "step": 29254 + }, + { + "epoch": 0.7403142951134954, + "grad_norm": 11.414130210876465, + "learning_rate": 1.6038555230481684e-06, + "loss": 0.1829, + "step": 29255 + }, + { + "epoch": 0.7403396006781892, + "grad_norm": 4.167088508605957, + "learning_rate": 1.6035608459668756e-06, + "loss": 0.1315, + "step": 29256 + }, + { + "epoch": 0.7403649062428828, + "grad_norm": 3.2948060035705566, + "learning_rate": 1.6032661907885872e-06, + "loss": 0.1749, + "step": 29257 + }, + { + "epoch": 0.7403902118075765, + "grad_norm": 13.718461990356445, + "learning_rate": 1.6029715575152033e-06, + "loss": 0.1974, + "step": 29258 + }, + { + "epoch": 0.7404155173722702, + "grad_norm": 4.669907569885254, + "learning_rate": 1.6026769461486225e-06, + "loss": 0.1442, + "step": 29259 + }, + { + "epoch": 0.7404408229369639, + "grad_norm": 6.7522735595703125, + "learning_rate": 1.602382356690747e-06, + "loss": 0.221, + "step": 29260 + }, + { + "epoch": 0.7404661285016575, + "grad_norm": 7.647233963012695, + "learning_rate": 1.602087789143475e-06, + "loss": 0.1701, + "step": 29261 + }, + { + "epoch": 0.7404914340663512, + "grad_norm": 1.96534264087677, + "learning_rate": 1.6017932435087098e-06, + "loss": 0.0751, + "step": 29262 + }, + { + "epoch": 0.7405167396310449, + "grad_norm": 3.383049249649048, + "learning_rate": 1.601498719788344e-06, + "loss": 0.1501, + "step": 29263 + }, + { + "epoch": 0.7405420451957385, + "grad_norm": 4.9634480476379395, + "learning_rate": 1.601204217984283e-06, + "loss": 0.1896, + "step": 29264 + }, + { + "epoch": 0.7405673507604322, + "grad_norm": 11.248287200927734, + "learning_rate": 1.6009097380984218e-06, + "loss": 0.2119, + "step": 29265 + }, + { + "epoch": 0.7405926563251259, + "grad_norm": 5.458430767059326, + "learning_rate": 1.6006152801326652e-06, + "loss": 0.1733, + "step": 29266 + }, + { + "epoch": 0.7406179618898195, + "grad_norm": 7.079986095428467, + "learning_rate": 1.6003208440889045e-06, + "loss": 0.1703, + "step": 29267 + }, + { + "epoch": 0.7406432674545133, + "grad_norm": 4.946303367614746, + "learning_rate": 1.6000264299690444e-06, + "loss": 0.1396, + "step": 29268 + }, + { + "epoch": 0.740668573019207, + "grad_norm": 3.6583683490753174, + "learning_rate": 1.5997320377749792e-06, + "loss": 0.1047, + "step": 29269 + }, + { + "epoch": 0.7406938785839006, + "grad_norm": 5.609848499298096, + "learning_rate": 1.5994376675086133e-06, + "loss": 0.1789, + "step": 29270 + }, + { + "epoch": 0.7407191841485943, + "grad_norm": 4.161911964416504, + "learning_rate": 1.599143319171838e-06, + "loss": 0.2002, + "step": 29271 + }, + { + "epoch": 0.740744489713288, + "grad_norm": 6.6456451416015625, + "learning_rate": 1.5988489927665562e-06, + "loss": 0.1998, + "step": 29272 + }, + { + "epoch": 0.7407697952779816, + "grad_norm": 6.0936102867126465, + "learning_rate": 1.5985546882946652e-06, + "loss": 0.1458, + "step": 29273 + }, + { + "epoch": 0.7407951008426753, + "grad_norm": 4.748101711273193, + "learning_rate": 1.5982604057580598e-06, + "loss": 0.1513, + "step": 29274 + }, + { + "epoch": 0.740820406407369, + "grad_norm": 3.9467806816101074, + "learning_rate": 1.5979661451586426e-06, + "loss": 0.1769, + "step": 29275 + }, + { + "epoch": 0.7408457119720626, + "grad_norm": 3.9823434352874756, + "learning_rate": 1.5976719064983088e-06, + "loss": 0.15, + "step": 29276 + }, + { + "epoch": 0.7408710175367563, + "grad_norm": 9.641054153442383, + "learning_rate": 1.5973776897789556e-06, + "loss": 0.2212, + "step": 29277 + }, + { + "epoch": 0.74089632310145, + "grad_norm": 4.450103282928467, + "learning_rate": 1.5970834950024793e-06, + "loss": 0.1434, + "step": 29278 + }, + { + "epoch": 0.7409216286661436, + "grad_norm": 2.9150216579437256, + "learning_rate": 1.5967893221707804e-06, + "loss": 0.1387, + "step": 29279 + }, + { + "epoch": 0.7409469342308374, + "grad_norm": 8.714078903198242, + "learning_rate": 1.596495171285754e-06, + "loss": 0.2019, + "step": 29280 + }, + { + "epoch": 0.7409722397955311, + "grad_norm": 13.452736854553223, + "learning_rate": 1.596201042349297e-06, + "loss": 0.223, + "step": 29281 + }, + { + "epoch": 0.7409975453602247, + "grad_norm": 4.701728820800781, + "learning_rate": 1.5959069353633044e-06, + "loss": 0.1245, + "step": 29282 + }, + { + "epoch": 0.7410228509249184, + "grad_norm": 2.8254473209381104, + "learning_rate": 1.595612850329677e-06, + "loss": 0.1117, + "step": 29283 + }, + { + "epoch": 0.7410481564896121, + "grad_norm": 4.587645053863525, + "learning_rate": 1.5953187872503089e-06, + "loss": 0.1152, + "step": 29284 + }, + { + "epoch": 0.7410734620543057, + "grad_norm": 10.28488826751709, + "learning_rate": 1.5950247461270968e-06, + "loss": 0.1136, + "step": 29285 + }, + { + "epoch": 0.7410987676189994, + "grad_norm": 3.8932390213012695, + "learning_rate": 1.5947307269619345e-06, + "loss": 0.1546, + "step": 29286 + }, + { + "epoch": 0.7411240731836931, + "grad_norm": 5.8057098388671875, + "learning_rate": 1.5944367297567226e-06, + "loss": 0.1665, + "step": 29287 + }, + { + "epoch": 0.7411493787483868, + "grad_norm": 3.155763864517212, + "learning_rate": 1.5941427545133543e-06, + "loss": 0.1517, + "step": 29288 + }, + { + "epoch": 0.7411746843130804, + "grad_norm": 5.329409599304199, + "learning_rate": 1.593848801233726e-06, + "loss": 0.1503, + "step": 29289 + }, + { + "epoch": 0.7411999898777741, + "grad_norm": 3.597158193588257, + "learning_rate": 1.5935548699197318e-06, + "loss": 0.0998, + "step": 29290 + }, + { + "epoch": 0.7412252954424678, + "grad_norm": 4.203065872192383, + "learning_rate": 1.5932609605732697e-06, + "loss": 0.1541, + "step": 29291 + }, + { + "epoch": 0.7412506010071614, + "grad_norm": 4.420538425445557, + "learning_rate": 1.5929670731962327e-06, + "loss": 0.1765, + "step": 29292 + }, + { + "epoch": 0.7412759065718552, + "grad_norm": 13.627721786499023, + "learning_rate": 1.5926732077905205e-06, + "loss": 0.2662, + "step": 29293 + }, + { + "epoch": 0.7413012121365489, + "grad_norm": 7.285507678985596, + "learning_rate": 1.5923793643580214e-06, + "loss": 0.1895, + "step": 29294 + }, + { + "epoch": 0.7413265177012425, + "grad_norm": 5.5548295974731445, + "learning_rate": 1.592085542900636e-06, + "loss": 0.1854, + "step": 29295 + }, + { + "epoch": 0.7413518232659362, + "grad_norm": 3.3433616161346436, + "learning_rate": 1.5917917434202551e-06, + "loss": 0.1299, + "step": 29296 + }, + { + "epoch": 0.7413771288306299, + "grad_norm": 10.894082069396973, + "learning_rate": 1.591497965918779e-06, + "loss": 0.2255, + "step": 29297 + }, + { + "epoch": 0.7414024343953235, + "grad_norm": 5.769517421722412, + "learning_rate": 1.5912042103980946e-06, + "loss": 0.1908, + "step": 29298 + }, + { + "epoch": 0.7414277399600172, + "grad_norm": 4.486095428466797, + "learning_rate": 1.590910476860102e-06, + "loss": 0.1003, + "step": 29299 + }, + { + "epoch": 0.7414530455247109, + "grad_norm": 6.559948921203613, + "learning_rate": 1.5906167653066922e-06, + "loss": 0.1839, + "step": 29300 + }, + { + "epoch": 0.7414783510894045, + "grad_norm": 4.128360748291016, + "learning_rate": 1.5903230757397636e-06, + "loss": 0.1129, + "step": 29301 + }, + { + "epoch": 0.7415036566540982, + "grad_norm": 12.22547721862793, + "learning_rate": 1.5900294081612045e-06, + "loss": 0.2292, + "step": 29302 + }, + { + "epoch": 0.7415289622187919, + "grad_norm": 4.689143657684326, + "learning_rate": 1.5897357625729127e-06, + "loss": 0.1585, + "step": 29303 + }, + { + "epoch": 0.7415542677834855, + "grad_norm": 2.581094980239868, + "learning_rate": 1.5894421389767806e-06, + "loss": 0.0672, + "step": 29304 + }, + { + "epoch": 0.7415795733481793, + "grad_norm": 2.83728289604187, + "learning_rate": 1.5891485373747e-06, + "loss": 0.1165, + "step": 29305 + }, + { + "epoch": 0.741604878912873, + "grad_norm": 4.790109634399414, + "learning_rate": 1.5888549577685675e-06, + "loss": 0.1707, + "step": 29306 + }, + { + "epoch": 0.7416301844775666, + "grad_norm": 4.562650680541992, + "learning_rate": 1.5885614001602757e-06, + "loss": 0.1765, + "step": 29307 + }, + { + "epoch": 0.7416554900422603, + "grad_norm": 6.018860340118408, + "learning_rate": 1.5882678645517158e-06, + "loss": 0.2071, + "step": 29308 + }, + { + "epoch": 0.741680795606954, + "grad_norm": 10.650554656982422, + "learning_rate": 1.587974350944781e-06, + "loss": 0.1685, + "step": 29309 + }, + { + "epoch": 0.7417061011716476, + "grad_norm": 7.416994571685791, + "learning_rate": 1.5876808593413678e-06, + "loss": 0.223, + "step": 29310 + }, + { + "epoch": 0.7417314067363413, + "grad_norm": 7.460676670074463, + "learning_rate": 1.5873873897433628e-06, + "loss": 0.1687, + "step": 29311 + }, + { + "epoch": 0.741756712301035, + "grad_norm": 3.3315985202789307, + "learning_rate": 1.5870939421526631e-06, + "loss": 0.1541, + "step": 29312 + }, + { + "epoch": 0.7417820178657287, + "grad_norm": 4.298335552215576, + "learning_rate": 1.5868005165711587e-06, + "loss": 0.1616, + "step": 29313 + }, + { + "epoch": 0.7418073234304223, + "grad_norm": 5.040641784667969, + "learning_rate": 1.5865071130007437e-06, + "loss": 0.203, + "step": 29314 + }, + { + "epoch": 0.741832628995116, + "grad_norm": 3.814113140106201, + "learning_rate": 1.58621373144331e-06, + "loss": 0.155, + "step": 29315 + }, + { + "epoch": 0.7418579345598098, + "grad_norm": 4.851405620574951, + "learning_rate": 1.5859203719007488e-06, + "loss": 0.1596, + "step": 29316 + }, + { + "epoch": 0.7418832401245034, + "grad_norm": 5.134699821472168, + "learning_rate": 1.5856270343749503e-06, + "loss": 0.1782, + "step": 29317 + }, + { + "epoch": 0.7419085456891971, + "grad_norm": 5.212613582611084, + "learning_rate": 1.5853337188678098e-06, + "loss": 0.1897, + "step": 29318 + }, + { + "epoch": 0.7419338512538908, + "grad_norm": 10.416481018066406, + "learning_rate": 1.5850404253812168e-06, + "loss": 0.1618, + "step": 29319 + }, + { + "epoch": 0.7419591568185844, + "grad_norm": 5.4044108390808105, + "learning_rate": 1.5847471539170628e-06, + "loss": 0.2169, + "step": 29320 + }, + { + "epoch": 0.7419844623832781, + "grad_norm": 4.222704887390137, + "learning_rate": 1.5844539044772373e-06, + "loss": 0.1741, + "step": 29321 + }, + { + "epoch": 0.7420097679479718, + "grad_norm": 2.661139488220215, + "learning_rate": 1.5841606770636352e-06, + "loss": 0.1476, + "step": 29322 + }, + { + "epoch": 0.7420350735126654, + "grad_norm": 6.368844509124756, + "learning_rate": 1.5838674716781454e-06, + "loss": 0.1852, + "step": 29323 + }, + { + "epoch": 0.7420603790773591, + "grad_norm": 3.862365961074829, + "learning_rate": 1.583574288322659e-06, + "loss": 0.1326, + "step": 29324 + }, + { + "epoch": 0.7420856846420528, + "grad_norm": 8.731586456298828, + "learning_rate": 1.5832811269990644e-06, + "loss": 0.1334, + "step": 29325 + }, + { + "epoch": 0.7421109902067464, + "grad_norm": 7.129767894744873, + "learning_rate": 1.5829879877092563e-06, + "loss": 0.2504, + "step": 29326 + }, + { + "epoch": 0.7421362957714401, + "grad_norm": 8.799612998962402, + "learning_rate": 1.5826948704551215e-06, + "loss": 0.2129, + "step": 29327 + }, + { + "epoch": 0.7421616013361338, + "grad_norm": 5.044579982757568, + "learning_rate": 1.5824017752385546e-06, + "loss": 0.1455, + "step": 29328 + }, + { + "epoch": 0.7421869069008274, + "grad_norm": 3.4590187072753906, + "learning_rate": 1.58210870206144e-06, + "loss": 0.1302, + "step": 29329 + }, + { + "epoch": 0.7422122124655212, + "grad_norm": 5.353789329528809, + "learning_rate": 1.5818156509256722e-06, + "loss": 0.1242, + "step": 29330 + }, + { + "epoch": 0.7422375180302149, + "grad_norm": 6.950189113616943, + "learning_rate": 1.5815226218331397e-06, + "loss": 0.1914, + "step": 29331 + }, + { + "epoch": 0.7422628235949085, + "grad_norm": 15.481926918029785, + "learning_rate": 1.5812296147857315e-06, + "loss": 0.1789, + "step": 29332 + }, + { + "epoch": 0.7422881291596022, + "grad_norm": 3.2269961833953857, + "learning_rate": 1.5809366297853363e-06, + "loss": 0.0938, + "step": 29333 + }, + { + "epoch": 0.7423134347242959, + "grad_norm": 3.4075262546539307, + "learning_rate": 1.5806436668338459e-06, + "loss": 0.1061, + "step": 29334 + }, + { + "epoch": 0.7423387402889895, + "grad_norm": 2.281646251678467, + "learning_rate": 1.5803507259331491e-06, + "loss": 0.1022, + "step": 29335 + }, + { + "epoch": 0.7423640458536832, + "grad_norm": 5.607653617858887, + "learning_rate": 1.5800578070851324e-06, + "loss": 0.1474, + "step": 29336 + }, + { + "epoch": 0.7423893514183769, + "grad_norm": 5.619332790374756, + "learning_rate": 1.579764910291688e-06, + "loss": 0.1971, + "step": 29337 + }, + { + "epoch": 0.7424146569830706, + "grad_norm": 5.336666584014893, + "learning_rate": 1.5794720355547038e-06, + "loss": 0.1546, + "step": 29338 + }, + { + "epoch": 0.7424399625477642, + "grad_norm": 4.599379062652588, + "learning_rate": 1.5791791828760683e-06, + "loss": 0.1085, + "step": 29339 + }, + { + "epoch": 0.7424652681124579, + "grad_norm": 3.916254997253418, + "learning_rate": 1.5788863522576681e-06, + "loss": 0.1216, + "step": 29340 + }, + { + "epoch": 0.7424905736771517, + "grad_norm": 4.755918502807617, + "learning_rate": 1.5785935437013973e-06, + "loss": 0.1261, + "step": 29341 + }, + { + "epoch": 0.7425158792418453, + "grad_norm": 15.258679389953613, + "learning_rate": 1.5783007572091363e-06, + "loss": 0.1462, + "step": 29342 + }, + { + "epoch": 0.742541184806539, + "grad_norm": 6.073012351989746, + "learning_rate": 1.578007992782779e-06, + "loss": 0.1312, + "step": 29343 + }, + { + "epoch": 0.7425664903712327, + "grad_norm": 3.3708131313323975, + "learning_rate": 1.5777152504242104e-06, + "loss": 0.1014, + "step": 29344 + }, + { + "epoch": 0.7425917959359263, + "grad_norm": 3.5788846015930176, + "learning_rate": 1.5774225301353208e-06, + "loss": 0.1391, + "step": 29345 + }, + { + "epoch": 0.74261710150062, + "grad_norm": 5.68996524810791, + "learning_rate": 1.5771298319179967e-06, + "loss": 0.1938, + "step": 29346 + }, + { + "epoch": 0.7426424070653137, + "grad_norm": 8.028483390808105, + "learning_rate": 1.5768371557741257e-06, + "loss": 0.1503, + "step": 29347 + }, + { + "epoch": 0.7426677126300073, + "grad_norm": 9.579778671264648, + "learning_rate": 1.5765445017055935e-06, + "loss": 0.2645, + "step": 29348 + }, + { + "epoch": 0.742693018194701, + "grad_norm": 3.01902437210083, + "learning_rate": 1.5762518697142904e-06, + "loss": 0.1073, + "step": 29349 + }, + { + "epoch": 0.7427183237593947, + "grad_norm": 6.488060474395752, + "learning_rate": 1.5759592598021022e-06, + "loss": 0.1509, + "step": 29350 + }, + { + "epoch": 0.7427436293240883, + "grad_norm": 6.717645645141602, + "learning_rate": 1.5756666719709163e-06, + "loss": 0.1749, + "step": 29351 + }, + { + "epoch": 0.742768934888782, + "grad_norm": 7.935858249664307, + "learning_rate": 1.5753741062226169e-06, + "loss": 0.2328, + "step": 29352 + }, + { + "epoch": 0.7427942404534758, + "grad_norm": 6.46938943862915, + "learning_rate": 1.5750815625590949e-06, + "loss": 0.1266, + "step": 29353 + }, + { + "epoch": 0.7428195460181694, + "grad_norm": 5.378567218780518, + "learning_rate": 1.5747890409822342e-06, + "loss": 0.1376, + "step": 29354 + }, + { + "epoch": 0.7428448515828631, + "grad_norm": 4.611441612243652, + "learning_rate": 1.5744965414939222e-06, + "loss": 0.1204, + "step": 29355 + }, + { + "epoch": 0.7428701571475568, + "grad_norm": 6.284250736236572, + "learning_rate": 1.5742040640960432e-06, + "loss": 0.116, + "step": 29356 + }, + { + "epoch": 0.7428954627122504, + "grad_norm": 11.49879264831543, + "learning_rate": 1.5739116087904866e-06, + "loss": 0.2234, + "step": 29357 + }, + { + "epoch": 0.7429207682769441, + "grad_norm": 3.7605626583099365, + "learning_rate": 1.5736191755791368e-06, + "loss": 0.1624, + "step": 29358 + }, + { + "epoch": 0.7429460738416378, + "grad_norm": 4.453096866607666, + "learning_rate": 1.5733267644638794e-06, + "loss": 0.1498, + "step": 29359 + }, + { + "epoch": 0.7429713794063314, + "grad_norm": 5.016353130340576, + "learning_rate": 1.5730343754465988e-06, + "loss": 0.1652, + "step": 29360 + }, + { + "epoch": 0.7429966849710251, + "grad_norm": 4.4580841064453125, + "learning_rate": 1.5727420085291833e-06, + "loss": 0.0987, + "step": 29361 + }, + { + "epoch": 0.7430219905357188, + "grad_norm": 6.318997383117676, + "learning_rate": 1.572449663713518e-06, + "loss": 0.1306, + "step": 29362 + }, + { + "epoch": 0.7430472961004125, + "grad_norm": 5.0234527587890625, + "learning_rate": 1.5721573410014862e-06, + "loss": 0.1984, + "step": 29363 + }, + { + "epoch": 0.7430726016651061, + "grad_norm": 5.1293182373046875, + "learning_rate": 1.5718650403949731e-06, + "loss": 0.2156, + "step": 29364 + }, + { + "epoch": 0.7430979072297998, + "grad_norm": 4.961126327514648, + "learning_rate": 1.571572761895866e-06, + "loss": 0.1294, + "step": 29365 + }, + { + "epoch": 0.7431232127944936, + "grad_norm": 9.181136131286621, + "learning_rate": 1.5712805055060482e-06, + "loss": 0.2246, + "step": 29366 + }, + { + "epoch": 0.7431485183591872, + "grad_norm": 5.728286266326904, + "learning_rate": 1.5709882712274033e-06, + "loss": 0.1677, + "step": 29367 + }, + { + "epoch": 0.7431738239238809, + "grad_norm": 3.9172253608703613, + "learning_rate": 1.5706960590618203e-06, + "loss": 0.0956, + "step": 29368 + }, + { + "epoch": 0.7431991294885746, + "grad_norm": 9.092268943786621, + "learning_rate": 1.5704038690111779e-06, + "loss": 0.1753, + "step": 29369 + }, + { + "epoch": 0.7432244350532682, + "grad_norm": 5.391906261444092, + "learning_rate": 1.5701117010773638e-06, + "loss": 0.2142, + "step": 29370 + }, + { + "epoch": 0.7432497406179619, + "grad_norm": 6.682440757751465, + "learning_rate": 1.5698195552622603e-06, + "loss": 0.1092, + "step": 29371 + }, + { + "epoch": 0.7432750461826556, + "grad_norm": 4.7439045906066895, + "learning_rate": 1.569527431567755e-06, + "loss": 0.1183, + "step": 29372 + }, + { + "epoch": 0.7433003517473492, + "grad_norm": 4.188993453979492, + "learning_rate": 1.5692353299957263e-06, + "loss": 0.151, + "step": 29373 + }, + { + "epoch": 0.7433256573120429, + "grad_norm": 4.885365962982178, + "learning_rate": 1.5689432505480628e-06, + "loss": 0.178, + "step": 29374 + }, + { + "epoch": 0.7433509628767366, + "grad_norm": 4.6771464347839355, + "learning_rate": 1.568651193226644e-06, + "loss": 0.1248, + "step": 29375 + }, + { + "epoch": 0.7433762684414302, + "grad_norm": 4.19761323928833, + "learning_rate": 1.5683591580333584e-06, + "loss": 0.158, + "step": 29376 + }, + { + "epoch": 0.7434015740061239, + "grad_norm": 5.188240051269531, + "learning_rate": 1.5680671449700834e-06, + "loss": 0.1512, + "step": 29377 + }, + { + "epoch": 0.7434268795708177, + "grad_norm": 4.529953956604004, + "learning_rate": 1.5677751540387066e-06, + "loss": 0.1003, + "step": 29378 + }, + { + "epoch": 0.7434521851355113, + "grad_norm": 11.508111000061035, + "learning_rate": 1.5674831852411076e-06, + "loss": 0.1809, + "step": 29379 + }, + { + "epoch": 0.743477490700205, + "grad_norm": 4.4686760902404785, + "learning_rate": 1.5671912385791732e-06, + "loss": 0.2015, + "step": 29380 + }, + { + "epoch": 0.7435027962648987, + "grad_norm": 3.899953603744507, + "learning_rate": 1.5668993140547834e-06, + "loss": 0.1111, + "step": 29381 + }, + { + "epoch": 0.7435281018295923, + "grad_norm": 3.0697102546691895, + "learning_rate": 1.566607411669821e-06, + "loss": 0.1462, + "step": 29382 + }, + { + "epoch": 0.743553407394286, + "grad_norm": 11.91336441040039, + "learning_rate": 1.566315531426168e-06, + "loss": 0.2722, + "step": 29383 + }, + { + "epoch": 0.7435787129589797, + "grad_norm": 6.9477152824401855, + "learning_rate": 1.5660236733257084e-06, + "loss": 0.1077, + "step": 29384 + }, + { + "epoch": 0.7436040185236733, + "grad_norm": 5.999029636383057, + "learning_rate": 1.5657318373703235e-06, + "loss": 0.1215, + "step": 29385 + }, + { + "epoch": 0.743629324088367, + "grad_norm": 5.990442276000977, + "learning_rate": 1.5654400235618955e-06, + "loss": 0.1901, + "step": 29386 + }, + { + "epoch": 0.7436546296530607, + "grad_norm": 2.841862201690674, + "learning_rate": 1.5651482319023038e-06, + "loss": 0.1264, + "step": 29387 + }, + { + "epoch": 0.7436799352177543, + "grad_norm": 3.2934019565582275, + "learning_rate": 1.5648564623934338e-06, + "loss": 0.1259, + "step": 29388 + }, + { + "epoch": 0.743705240782448, + "grad_norm": 9.90969467163086, + "learning_rate": 1.5645647150371657e-06, + "loss": 0.2014, + "step": 29389 + }, + { + "epoch": 0.7437305463471418, + "grad_norm": 4.711175918579102, + "learning_rate": 1.564272989835381e-06, + "loss": 0.1511, + "step": 29390 + }, + { + "epoch": 0.7437558519118355, + "grad_norm": 4.466011047363281, + "learning_rate": 1.5639812867899589e-06, + "loss": 0.1605, + "step": 29391 + }, + { + "epoch": 0.7437811574765291, + "grad_norm": 7.5854387283325195, + "learning_rate": 1.5636896059027839e-06, + "loss": 0.1832, + "step": 29392 + }, + { + "epoch": 0.7438064630412228, + "grad_norm": 2.637843608856201, + "learning_rate": 1.5633979471757354e-06, + "loss": 0.0848, + "step": 29393 + }, + { + "epoch": 0.7438317686059165, + "grad_norm": 3.8576841354370117, + "learning_rate": 1.5631063106106942e-06, + "loss": 0.1159, + "step": 29394 + }, + { + "epoch": 0.7438570741706101, + "grad_norm": 3.401540756225586, + "learning_rate": 1.5628146962095407e-06, + "loss": 0.088, + "step": 29395 + }, + { + "epoch": 0.7438823797353038, + "grad_norm": 6.053379535675049, + "learning_rate": 1.5625231039741551e-06, + "loss": 0.1595, + "step": 29396 + }, + { + "epoch": 0.7439076852999975, + "grad_norm": 5.068321704864502, + "learning_rate": 1.5622315339064199e-06, + "loss": 0.158, + "step": 29397 + }, + { + "epoch": 0.7439329908646911, + "grad_norm": 2.5327391624450684, + "learning_rate": 1.5619399860082141e-06, + "loss": 0.0823, + "step": 29398 + }, + { + "epoch": 0.7439582964293848, + "grad_norm": 5.75402307510376, + "learning_rate": 1.5616484602814174e-06, + "loss": 0.1605, + "step": 29399 + }, + { + "epoch": 0.7439836019940785, + "grad_norm": 4.056788444519043, + "learning_rate": 1.5613569567279091e-06, + "loss": 0.1398, + "step": 29400 + }, + { + "epoch": 0.7440089075587721, + "grad_norm": 6.3877668380737305, + "learning_rate": 1.5610654753495713e-06, + "loss": 0.2208, + "step": 29401 + }, + { + "epoch": 0.7440342131234658, + "grad_norm": 5.372419357299805, + "learning_rate": 1.5607740161482809e-06, + "loss": 0.1798, + "step": 29402 + }, + { + "epoch": 0.7440595186881596, + "grad_norm": 3.6812002658843994, + "learning_rate": 1.5604825791259226e-06, + "loss": 0.0607, + "step": 29403 + }, + { + "epoch": 0.7440848242528532, + "grad_norm": 6.1642327308654785, + "learning_rate": 1.5601911642843693e-06, + "loss": 0.1752, + "step": 29404 + }, + { + "epoch": 0.7441101298175469, + "grad_norm": 4.042042255401611, + "learning_rate": 1.5598997716255048e-06, + "loss": 0.1756, + "step": 29405 + }, + { + "epoch": 0.7441354353822406, + "grad_norm": 3.0196053981781006, + "learning_rate": 1.5596084011512053e-06, + "loss": 0.1167, + "step": 29406 + }, + { + "epoch": 0.7441607409469342, + "grad_norm": 7.429080963134766, + "learning_rate": 1.5593170528633545e-06, + "loss": 0.1775, + "step": 29407 + }, + { + "epoch": 0.7441860465116279, + "grad_norm": 6.359391212463379, + "learning_rate": 1.5590257267638242e-06, + "loss": 0.1257, + "step": 29408 + }, + { + "epoch": 0.7442113520763216, + "grad_norm": 5.553393840789795, + "learning_rate": 1.5587344228544993e-06, + "loss": 0.2118, + "step": 29409 + }, + { + "epoch": 0.7442366576410152, + "grad_norm": 6.89915657043457, + "learning_rate": 1.5584431411372535e-06, + "loss": 0.2171, + "step": 29410 + }, + { + "epoch": 0.7442619632057089, + "grad_norm": 4.630540370941162, + "learning_rate": 1.5581518816139701e-06, + "loss": 0.1486, + "step": 29411 + }, + { + "epoch": 0.7442872687704026, + "grad_norm": 15.072530746459961, + "learning_rate": 1.5578606442865247e-06, + "loss": 0.2357, + "step": 29412 + }, + { + "epoch": 0.7443125743350962, + "grad_norm": 4.698672294616699, + "learning_rate": 1.5575694291567956e-06, + "loss": 0.1874, + "step": 29413 + }, + { + "epoch": 0.74433787989979, + "grad_norm": 4.194629192352295, + "learning_rate": 1.5572782362266598e-06, + "loss": 0.146, + "step": 29414 + }, + { + "epoch": 0.7443631854644837, + "grad_norm": 21.263330459594727, + "learning_rate": 1.5569870654979974e-06, + "loss": 0.2095, + "step": 29415 + }, + { + "epoch": 0.7443884910291774, + "grad_norm": 4.18384313583374, + "learning_rate": 1.5566959169726846e-06, + "loss": 0.1028, + "step": 29416 + }, + { + "epoch": 0.744413796593871, + "grad_norm": 3.6972999572753906, + "learning_rate": 1.5564047906525998e-06, + "loss": 0.1827, + "step": 29417 + }, + { + "epoch": 0.7444391021585647, + "grad_norm": 2.8785555362701416, + "learning_rate": 1.5561136865396188e-06, + "loss": 0.1058, + "step": 29418 + }, + { + "epoch": 0.7444644077232584, + "grad_norm": 4.387005805969238, + "learning_rate": 1.5558226046356206e-06, + "loss": 0.1318, + "step": 29419 + }, + { + "epoch": 0.744489713287952, + "grad_norm": 3.0000603199005127, + "learning_rate": 1.5555315449424824e-06, + "loss": 0.1349, + "step": 29420 + }, + { + "epoch": 0.7445150188526457, + "grad_norm": 7.587525844573975, + "learning_rate": 1.5552405074620809e-06, + "loss": 0.1597, + "step": 29421 + }, + { + "epoch": 0.7445403244173394, + "grad_norm": 3.2316231727600098, + "learning_rate": 1.5549494921962922e-06, + "loss": 0.0741, + "step": 29422 + }, + { + "epoch": 0.744565629982033, + "grad_norm": 7.545353412628174, + "learning_rate": 1.5546584991469916e-06, + "loss": 0.2124, + "step": 29423 + }, + { + "epoch": 0.7445909355467267, + "grad_norm": 15.91473388671875, + "learning_rate": 1.5543675283160592e-06, + "loss": 0.2891, + "step": 29424 + }, + { + "epoch": 0.7446162411114204, + "grad_norm": 5.319016933441162, + "learning_rate": 1.5540765797053692e-06, + "loss": 0.18, + "step": 29425 + }, + { + "epoch": 0.744641546676114, + "grad_norm": 3.6253137588500977, + "learning_rate": 1.5537856533167988e-06, + "loss": 0.0913, + "step": 29426 + }, + { + "epoch": 0.7446668522408078, + "grad_norm": 3.2942914962768555, + "learning_rate": 1.5534947491522223e-06, + "loss": 0.1161, + "step": 29427 + }, + { + "epoch": 0.7446921578055015, + "grad_norm": 3.4883415699005127, + "learning_rate": 1.5532038672135185e-06, + "loss": 0.168, + "step": 29428 + }, + { + "epoch": 0.7447174633701951, + "grad_norm": 8.63691234588623, + "learning_rate": 1.5529130075025616e-06, + "loss": 0.2203, + "step": 29429 + }, + { + "epoch": 0.7447427689348888, + "grad_norm": 14.13589859008789, + "learning_rate": 1.552622170021228e-06, + "loss": 0.2747, + "step": 29430 + }, + { + "epoch": 0.7447680744995825, + "grad_norm": 3.8644304275512695, + "learning_rate": 1.5523313547713908e-06, + "loss": 0.1324, + "step": 29431 + }, + { + "epoch": 0.7447933800642761, + "grad_norm": 7.092749118804932, + "learning_rate": 1.552040561754929e-06, + "loss": 0.1399, + "step": 29432 + }, + { + "epoch": 0.7448186856289698, + "grad_norm": 6.757281303405762, + "learning_rate": 1.5517497909737144e-06, + "loss": 0.1017, + "step": 29433 + }, + { + "epoch": 0.7448439911936635, + "grad_norm": 6.822300434112549, + "learning_rate": 1.5514590424296278e-06, + "loss": 0.1182, + "step": 29434 + }, + { + "epoch": 0.7448692967583571, + "grad_norm": 11.719368934631348, + "learning_rate": 1.5511683161245366e-06, + "loss": 0.2943, + "step": 29435 + }, + { + "epoch": 0.7448946023230508, + "grad_norm": 3.544536828994751, + "learning_rate": 1.5508776120603214e-06, + "loss": 0.1043, + "step": 29436 + }, + { + "epoch": 0.7449199078877445, + "grad_norm": 6.683980464935303, + "learning_rate": 1.5505869302388528e-06, + "loss": 0.1679, + "step": 29437 + }, + { + "epoch": 0.7449452134524381, + "grad_norm": 4.906636714935303, + "learning_rate": 1.5502962706620111e-06, + "loss": 0.2164, + "step": 29438 + }, + { + "epoch": 0.7449705190171318, + "grad_norm": 10.820119857788086, + "learning_rate": 1.5500056333316638e-06, + "loss": 0.2499, + "step": 29439 + }, + { + "epoch": 0.7449958245818256, + "grad_norm": 9.900632858276367, + "learning_rate": 1.5497150182496894e-06, + "loss": 0.1553, + "step": 29440 + }, + { + "epoch": 0.7450211301465193, + "grad_norm": 3.640784740447998, + "learning_rate": 1.5494244254179603e-06, + "loss": 0.1847, + "step": 29441 + }, + { + "epoch": 0.7450464357112129, + "grad_norm": 7.82934045791626, + "learning_rate": 1.5491338548383522e-06, + "loss": 0.1438, + "step": 29442 + }, + { + "epoch": 0.7450717412759066, + "grad_norm": 5.368147850036621, + "learning_rate": 1.5488433065127383e-06, + "loss": 0.1233, + "step": 29443 + }, + { + "epoch": 0.7450970468406003, + "grad_norm": 4.627825736999512, + "learning_rate": 1.5485527804429918e-06, + "loss": 0.1533, + "step": 29444 + }, + { + "epoch": 0.7451223524052939, + "grad_norm": 7.484165191650391, + "learning_rate": 1.5482622766309851e-06, + "loss": 0.2218, + "step": 29445 + }, + { + "epoch": 0.7451476579699876, + "grad_norm": 2.8806657791137695, + "learning_rate": 1.5479717950785944e-06, + "loss": 0.1239, + "step": 29446 + }, + { + "epoch": 0.7451729635346813, + "grad_norm": 4.64077091217041, + "learning_rate": 1.5476813357876913e-06, + "loss": 0.0915, + "step": 29447 + }, + { + "epoch": 0.7451982690993749, + "grad_norm": 2.476438045501709, + "learning_rate": 1.5473908987601494e-06, + "loss": 0.0927, + "step": 29448 + }, + { + "epoch": 0.7452235746640686, + "grad_norm": 3.3771796226501465, + "learning_rate": 1.5471004839978393e-06, + "loss": 0.1052, + "step": 29449 + }, + { + "epoch": 0.7452488802287623, + "grad_norm": 4.344098091125488, + "learning_rate": 1.5468100915026374e-06, + "loss": 0.1184, + "step": 29450 + }, + { + "epoch": 0.745274185793456, + "grad_norm": 3.2352402210235596, + "learning_rate": 1.5465197212764154e-06, + "loss": 0.1517, + "step": 29451 + }, + { + "epoch": 0.7452994913581497, + "grad_norm": 5.009670734405518, + "learning_rate": 1.5462293733210448e-06, + "loss": 0.1173, + "step": 29452 + }, + { + "epoch": 0.7453247969228434, + "grad_norm": 3.5058705806732178, + "learning_rate": 1.545939047638399e-06, + "loss": 0.1516, + "step": 29453 + }, + { + "epoch": 0.745350102487537, + "grad_norm": 8.848838806152344, + "learning_rate": 1.5456487442303481e-06, + "loss": 0.2034, + "step": 29454 + }, + { + "epoch": 0.7453754080522307, + "grad_norm": 3.751328945159912, + "learning_rate": 1.5453584630987673e-06, + "loss": 0.204, + "step": 29455 + }, + { + "epoch": 0.7454007136169244, + "grad_norm": 3.704258918762207, + "learning_rate": 1.5450682042455272e-06, + "loss": 0.1378, + "step": 29456 + }, + { + "epoch": 0.745426019181618, + "grad_norm": 3.5004091262817383, + "learning_rate": 1.5447779676725e-06, + "loss": 0.1566, + "step": 29457 + }, + { + "epoch": 0.7454513247463117, + "grad_norm": 3.70351505279541, + "learning_rate": 1.5444877533815544e-06, + "loss": 0.1418, + "step": 29458 + }, + { + "epoch": 0.7454766303110054, + "grad_norm": 4.938738822937012, + "learning_rate": 1.544197561374567e-06, + "loss": 0.2128, + "step": 29459 + }, + { + "epoch": 0.745501935875699, + "grad_norm": 5.952212333679199, + "learning_rate": 1.5439073916534065e-06, + "loss": 0.1779, + "step": 29460 + }, + { + "epoch": 0.7455272414403927, + "grad_norm": 7.00926399230957, + "learning_rate": 1.5436172442199444e-06, + "loss": 0.1636, + "step": 29461 + }, + { + "epoch": 0.7455525470050864, + "grad_norm": 3.3302433490753174, + "learning_rate": 1.5433271190760495e-06, + "loss": 0.145, + "step": 29462 + }, + { + "epoch": 0.74557785256978, + "grad_norm": 4.506109714508057, + "learning_rate": 1.5430370162235975e-06, + "loss": 0.156, + "step": 29463 + }, + { + "epoch": 0.7456031581344738, + "grad_norm": 3.6925132274627686, + "learning_rate": 1.5427469356644548e-06, + "loss": 0.1261, + "step": 29464 + }, + { + "epoch": 0.7456284636991675, + "grad_norm": 3.5890049934387207, + "learning_rate": 1.5424568774004973e-06, + "loss": 0.1833, + "step": 29465 + }, + { + "epoch": 0.7456537692638612, + "grad_norm": 4.359155178070068, + "learning_rate": 1.5421668414335883e-06, + "loss": 0.1804, + "step": 29466 + }, + { + "epoch": 0.7456790748285548, + "grad_norm": 5.473465442657471, + "learning_rate": 1.5418768277656048e-06, + "loss": 0.0984, + "step": 29467 + }, + { + "epoch": 0.7457043803932485, + "grad_norm": 14.272629737854004, + "learning_rate": 1.5415868363984122e-06, + "loss": 0.2235, + "step": 29468 + }, + { + "epoch": 0.7457296859579422, + "grad_norm": 2.57584547996521, + "learning_rate": 1.5412968673338863e-06, + "loss": 0.1187, + "step": 29469 + }, + { + "epoch": 0.7457549915226358, + "grad_norm": 4.336395740509033, + "learning_rate": 1.5410069205738899e-06, + "loss": 0.2171, + "step": 29470 + }, + { + "epoch": 0.7457802970873295, + "grad_norm": 18.602272033691406, + "learning_rate": 1.5407169961202983e-06, + "loss": 0.1114, + "step": 29471 + }, + { + "epoch": 0.7458056026520232, + "grad_norm": 5.308350563049316, + "learning_rate": 1.5404270939749777e-06, + "loss": 0.1557, + "step": 29472 + }, + { + "epoch": 0.7458309082167168, + "grad_norm": 3.998076915740967, + "learning_rate": 1.5401372141398009e-06, + "loss": 0.1685, + "step": 29473 + }, + { + "epoch": 0.7458562137814105, + "grad_norm": 9.710197448730469, + "learning_rate": 1.5398473566166354e-06, + "loss": 0.1567, + "step": 29474 + }, + { + "epoch": 0.7458815193461042, + "grad_norm": 10.25917911529541, + "learning_rate": 1.539557521407351e-06, + "loss": 0.2411, + "step": 29475 + }, + { + "epoch": 0.7459068249107978, + "grad_norm": 8.719474792480469, + "learning_rate": 1.5392677085138146e-06, + "loss": 0.2577, + "step": 29476 + }, + { + "epoch": 0.7459321304754916, + "grad_norm": 5.458456993103027, + "learning_rate": 1.5389779179378989e-06, + "loss": 0.1094, + "step": 29477 + }, + { + "epoch": 0.7459574360401853, + "grad_norm": 5.300292491912842, + "learning_rate": 1.5386881496814705e-06, + "loss": 0.2217, + "step": 29478 + }, + { + "epoch": 0.7459827416048789, + "grad_norm": 4.27801513671875, + "learning_rate": 1.5383984037463984e-06, + "loss": 0.1081, + "step": 29479 + }, + { + "epoch": 0.7460080471695726, + "grad_norm": 6.4672136306762695, + "learning_rate": 1.5381086801345513e-06, + "loss": 0.1888, + "step": 29480 + }, + { + "epoch": 0.7460333527342663, + "grad_norm": 3.880347967147827, + "learning_rate": 1.5378189788477954e-06, + "loss": 0.1236, + "step": 29481 + }, + { + "epoch": 0.7460586582989599, + "grad_norm": 12.222207069396973, + "learning_rate": 1.5375292998880026e-06, + "loss": 0.2792, + "step": 29482 + }, + { + "epoch": 0.7460839638636536, + "grad_norm": 3.848106861114502, + "learning_rate": 1.5372396432570391e-06, + "loss": 0.1212, + "step": 29483 + }, + { + "epoch": 0.7461092694283473, + "grad_norm": 5.71555757522583, + "learning_rate": 1.5369500089567735e-06, + "loss": 0.136, + "step": 29484 + }, + { + "epoch": 0.7461345749930409, + "grad_norm": 4.2928690910339355, + "learning_rate": 1.5366603969890714e-06, + "loss": 0.1537, + "step": 29485 + }, + { + "epoch": 0.7461598805577346, + "grad_norm": 4.687582492828369, + "learning_rate": 1.5363708073558037e-06, + "loss": 0.1557, + "step": 29486 + }, + { + "epoch": 0.7461851861224283, + "grad_norm": 9.625781059265137, + "learning_rate": 1.5360812400588365e-06, + "loss": 0.1799, + "step": 29487 + }, + { + "epoch": 0.746210491687122, + "grad_norm": 6.026362895965576, + "learning_rate": 1.5357916951000367e-06, + "loss": 0.1074, + "step": 29488 + }, + { + "epoch": 0.7462357972518157, + "grad_norm": 4.889228820800781, + "learning_rate": 1.5355021724812701e-06, + "loss": 0.0848, + "step": 29489 + }, + { + "epoch": 0.7462611028165094, + "grad_norm": 5.467813968658447, + "learning_rate": 1.535212672204408e-06, + "loss": 0.1855, + "step": 29490 + }, + { + "epoch": 0.7462864083812031, + "grad_norm": 4.570385456085205, + "learning_rate": 1.5349231942713144e-06, + "loss": 0.1648, + "step": 29491 + }, + { + "epoch": 0.7463117139458967, + "grad_norm": 6.245254993438721, + "learning_rate": 1.5346337386838562e-06, + "loss": 0.15, + "step": 29492 + }, + { + "epoch": 0.7463370195105904, + "grad_norm": 4.378813743591309, + "learning_rate": 1.5343443054438994e-06, + "loss": 0.1922, + "step": 29493 + }, + { + "epoch": 0.7463623250752841, + "grad_norm": 2.8806495666503906, + "learning_rate": 1.534054894553313e-06, + "loss": 0.0999, + "step": 29494 + }, + { + "epoch": 0.7463876306399777, + "grad_norm": 8.036920547485352, + "learning_rate": 1.53376550601396e-06, + "loss": 0.1756, + "step": 29495 + }, + { + "epoch": 0.7464129362046714, + "grad_norm": 9.606334686279297, + "learning_rate": 1.5334761398277115e-06, + "loss": 0.1964, + "step": 29496 + }, + { + "epoch": 0.7464382417693651, + "grad_norm": 7.904059410095215, + "learning_rate": 1.533186795996428e-06, + "loss": 0.2358, + "step": 29497 + }, + { + "epoch": 0.7464635473340587, + "grad_norm": 5.778831958770752, + "learning_rate": 1.5328974745219793e-06, + "loss": 0.1571, + "step": 29498 + }, + { + "epoch": 0.7464888528987524, + "grad_norm": 3.9306998252868652, + "learning_rate": 1.5326081754062284e-06, + "loss": 0.0754, + "step": 29499 + }, + { + "epoch": 0.7465141584634462, + "grad_norm": 4.083611011505127, + "learning_rate": 1.532318898651045e-06, + "loss": 0.149, + "step": 29500 + }, + { + "epoch": 0.7465394640281398, + "grad_norm": 11.16334342956543, + "learning_rate": 1.5320296442582894e-06, + "loss": 0.2436, + "step": 29501 + }, + { + "epoch": 0.7465647695928335, + "grad_norm": 15.755000114440918, + "learning_rate": 1.5317404122298307e-06, + "loss": 0.3688, + "step": 29502 + }, + { + "epoch": 0.7465900751575272, + "grad_norm": 5.0667619705200195, + "learning_rate": 1.5314512025675316e-06, + "loss": 0.2021, + "step": 29503 + }, + { + "epoch": 0.7466153807222208, + "grad_norm": 13.570685386657715, + "learning_rate": 1.5311620152732615e-06, + "loss": 0.2423, + "step": 29504 + }, + { + "epoch": 0.7466406862869145, + "grad_norm": 3.6334011554718018, + "learning_rate": 1.5308728503488795e-06, + "loss": 0.1332, + "step": 29505 + }, + { + "epoch": 0.7466659918516082, + "grad_norm": 3.9716315269470215, + "learning_rate": 1.5305837077962543e-06, + "loss": 0.1524, + "step": 29506 + }, + { + "epoch": 0.7466912974163018, + "grad_norm": 4.415289878845215, + "learning_rate": 1.5302945876172498e-06, + "loss": 0.1187, + "step": 29507 + }, + { + "epoch": 0.7467166029809955, + "grad_norm": 19.792816162109375, + "learning_rate": 1.5300054898137285e-06, + "loss": 0.228, + "step": 29508 + }, + { + "epoch": 0.7467419085456892, + "grad_norm": 5.8519158363342285, + "learning_rate": 1.5297164143875576e-06, + "loss": 0.1535, + "step": 29509 + }, + { + "epoch": 0.7467672141103828, + "grad_norm": 3.8153460025787354, + "learning_rate": 1.5294273613406002e-06, + "loss": 0.0972, + "step": 29510 + }, + { + "epoch": 0.7467925196750765, + "grad_norm": 3.4986767768859863, + "learning_rate": 1.5291383306747204e-06, + "loss": 0.146, + "step": 29511 + }, + { + "epoch": 0.7468178252397702, + "grad_norm": 5.696165084838867, + "learning_rate": 1.52884932239178e-06, + "loss": 0.1199, + "step": 29512 + }, + { + "epoch": 0.7468431308044639, + "grad_norm": 3.6130287647247314, + "learning_rate": 1.5285603364936463e-06, + "loss": 0.1063, + "step": 29513 + }, + { + "epoch": 0.7468684363691576, + "grad_norm": 8.156623840332031, + "learning_rate": 1.5282713729821808e-06, + "loss": 0.2294, + "step": 29514 + }, + { + "epoch": 0.7468937419338513, + "grad_norm": 3.8802618980407715, + "learning_rate": 1.527982431859248e-06, + "loss": 0.1157, + "step": 29515 + }, + { + "epoch": 0.7469190474985449, + "grad_norm": 16.501609802246094, + "learning_rate": 1.527693513126709e-06, + "loss": 0.2925, + "step": 29516 + }, + { + "epoch": 0.7469443530632386, + "grad_norm": 5.0480451583862305, + "learning_rate": 1.5274046167864299e-06, + "loss": 0.1527, + "step": 29517 + }, + { + "epoch": 0.7469696586279323, + "grad_norm": 9.540436744689941, + "learning_rate": 1.5271157428402727e-06, + "loss": 0.1928, + "step": 29518 + }, + { + "epoch": 0.746994964192626, + "grad_norm": 7.797705173492432, + "learning_rate": 1.5268268912901003e-06, + "loss": 0.2082, + "step": 29519 + }, + { + "epoch": 0.7470202697573196, + "grad_norm": 6.431729793548584, + "learning_rate": 1.5265380621377734e-06, + "loss": 0.2543, + "step": 29520 + }, + { + "epoch": 0.7470455753220133, + "grad_norm": 3.6884102821350098, + "learning_rate": 1.5262492553851583e-06, + "loss": 0.1953, + "step": 29521 + }, + { + "epoch": 0.747070880886707, + "grad_norm": 6.400592803955078, + "learning_rate": 1.5259604710341146e-06, + "loss": 0.2114, + "step": 29522 + }, + { + "epoch": 0.7470961864514006, + "grad_norm": 4.941245079040527, + "learning_rate": 1.5256717090865065e-06, + "loss": 0.1104, + "step": 29523 + }, + { + "epoch": 0.7471214920160943, + "grad_norm": 2.974313259124756, + "learning_rate": 1.5253829695441935e-06, + "loss": 0.0847, + "step": 29524 + }, + { + "epoch": 0.7471467975807881, + "grad_norm": 5.989361763000488, + "learning_rate": 1.5250942524090412e-06, + "loss": 0.132, + "step": 29525 + }, + { + "epoch": 0.7471721031454817, + "grad_norm": 2.4192802906036377, + "learning_rate": 1.5248055576829097e-06, + "loss": 0.0697, + "step": 29526 + }, + { + "epoch": 0.7471974087101754, + "grad_norm": 3.9740703105926514, + "learning_rate": 1.52451688536766e-06, + "loss": 0.1343, + "step": 29527 + }, + { + "epoch": 0.7472227142748691, + "grad_norm": 9.98819637298584, + "learning_rate": 1.5242282354651532e-06, + "loss": 0.2178, + "step": 29528 + }, + { + "epoch": 0.7472480198395627, + "grad_norm": 4.604913234710693, + "learning_rate": 1.523939607977254e-06, + "loss": 0.1462, + "step": 29529 + }, + { + "epoch": 0.7472733254042564, + "grad_norm": 3.0028934478759766, + "learning_rate": 1.5236510029058198e-06, + "loss": 0.1519, + "step": 29530 + }, + { + "epoch": 0.7472986309689501, + "grad_norm": 3.6388707160949707, + "learning_rate": 1.5233624202527165e-06, + "loss": 0.1832, + "step": 29531 + }, + { + "epoch": 0.7473239365336437, + "grad_norm": 4.295225620269775, + "learning_rate": 1.523073860019799e-06, + "loss": 0.112, + "step": 29532 + }, + { + "epoch": 0.7473492420983374, + "grad_norm": 5.781239032745361, + "learning_rate": 1.522785322208934e-06, + "loss": 0.1757, + "step": 29533 + }, + { + "epoch": 0.7473745476630311, + "grad_norm": 3.6148476600646973, + "learning_rate": 1.522496806821977e-06, + "loss": 0.1528, + "step": 29534 + }, + { + "epoch": 0.7473998532277247, + "grad_norm": 12.764922142028809, + "learning_rate": 1.5222083138607951e-06, + "loss": 0.2583, + "step": 29535 + }, + { + "epoch": 0.7474251587924184, + "grad_norm": 3.209602117538452, + "learning_rate": 1.5219198433272413e-06, + "loss": 0.1414, + "step": 29536 + }, + { + "epoch": 0.7474504643571122, + "grad_norm": 8.220304489135742, + "learning_rate": 1.521631395223181e-06, + "loss": 0.1046, + "step": 29537 + }, + { + "epoch": 0.7474757699218058, + "grad_norm": 5.309502601623535, + "learning_rate": 1.5213429695504728e-06, + "loss": 0.1637, + "step": 29538 + }, + { + "epoch": 0.7475010754864995, + "grad_norm": 3.6367013454437256, + "learning_rate": 1.5210545663109755e-06, + "loss": 0.1124, + "step": 29539 + }, + { + "epoch": 0.7475263810511932, + "grad_norm": 8.574827194213867, + "learning_rate": 1.5207661855065515e-06, + "loss": 0.1073, + "step": 29540 + }, + { + "epoch": 0.7475516866158868, + "grad_norm": 8.256157875061035, + "learning_rate": 1.5204778271390591e-06, + "loss": 0.2533, + "step": 29541 + }, + { + "epoch": 0.7475769921805805, + "grad_norm": 4.056384563446045, + "learning_rate": 1.5201894912103583e-06, + "loss": 0.0559, + "step": 29542 + }, + { + "epoch": 0.7476022977452742, + "grad_norm": 10.163457870483398, + "learning_rate": 1.519901177722306e-06, + "loss": 0.2608, + "step": 29543 + }, + { + "epoch": 0.7476276033099679, + "grad_norm": 2.772918224334717, + "learning_rate": 1.5196128866767672e-06, + "loss": 0.1009, + "step": 29544 + }, + { + "epoch": 0.7476529088746615, + "grad_norm": 7.9528093338012695, + "learning_rate": 1.5193246180755945e-06, + "loss": 0.1056, + "step": 29545 + }, + { + "epoch": 0.7476782144393552, + "grad_norm": 5.7014312744140625, + "learning_rate": 1.5190363719206514e-06, + "loss": 0.1363, + "step": 29546 + }, + { + "epoch": 0.7477035200040489, + "grad_norm": 1.738740086555481, + "learning_rate": 1.5187481482137934e-06, + "loss": 0.096, + "step": 29547 + }, + { + "epoch": 0.7477288255687425, + "grad_norm": 2.575923204421997, + "learning_rate": 1.5184599469568829e-06, + "loss": 0.1121, + "step": 29548 + }, + { + "epoch": 0.7477541311334363, + "grad_norm": 3.7143776416778564, + "learning_rate": 1.518171768151776e-06, + "loss": 0.1132, + "step": 29549 + }, + { + "epoch": 0.74777943669813, + "grad_norm": 17.062097549438477, + "learning_rate": 1.517883611800332e-06, + "loss": 0.2042, + "step": 29550 + }, + { + "epoch": 0.7478047422628236, + "grad_norm": 7.7592644691467285, + "learning_rate": 1.5175954779044077e-06, + "loss": 0.2342, + "step": 29551 + }, + { + "epoch": 0.7478300478275173, + "grad_norm": 5.445190906524658, + "learning_rate": 1.5173073664658634e-06, + "loss": 0.1829, + "step": 29552 + }, + { + "epoch": 0.747855353392211, + "grad_norm": 2.4370944499969482, + "learning_rate": 1.5170192774865567e-06, + "loss": 0.1114, + "step": 29553 + }, + { + "epoch": 0.7478806589569046, + "grad_norm": 3.2930333614349365, + "learning_rate": 1.516731210968344e-06, + "loss": 0.1348, + "step": 29554 + }, + { + "epoch": 0.7479059645215983, + "grad_norm": 8.5201416015625, + "learning_rate": 1.516443166913083e-06, + "loss": 0.1699, + "step": 29555 + }, + { + "epoch": 0.747931270086292, + "grad_norm": 4.903225421905518, + "learning_rate": 1.5161551453226332e-06, + "loss": 0.1447, + "step": 29556 + }, + { + "epoch": 0.7479565756509856, + "grad_norm": 3.0570425987243652, + "learning_rate": 1.5158671461988505e-06, + "loss": 0.0953, + "step": 29557 + }, + { + "epoch": 0.7479818812156793, + "grad_norm": 3.9864866733551025, + "learning_rate": 1.5155791695435928e-06, + "loss": 0.1255, + "step": 29558 + }, + { + "epoch": 0.748007186780373, + "grad_norm": 5.486127853393555, + "learning_rate": 1.5152912153587156e-06, + "loss": 0.1579, + "step": 29559 + }, + { + "epoch": 0.7480324923450666, + "grad_norm": 6.123118877410889, + "learning_rate": 1.5150032836460782e-06, + "loss": 0.2336, + "step": 29560 + }, + { + "epoch": 0.7480577979097603, + "grad_norm": 3.5131518840789795, + "learning_rate": 1.5147153744075348e-06, + "loss": 0.1262, + "step": 29561 + }, + { + "epoch": 0.7480831034744541, + "grad_norm": 6.777065753936768, + "learning_rate": 1.5144274876449471e-06, + "loss": 0.1448, + "step": 29562 + }, + { + "epoch": 0.7481084090391477, + "grad_norm": 4.612610340118408, + "learning_rate": 1.5141396233601653e-06, + "loss": 0.1373, + "step": 29563 + }, + { + "epoch": 0.7481337146038414, + "grad_norm": 8.622709274291992, + "learning_rate": 1.5138517815550496e-06, + "loss": 0.366, + "step": 29564 + }, + { + "epoch": 0.7481590201685351, + "grad_norm": 3.3504130840301514, + "learning_rate": 1.5135639622314557e-06, + "loss": 0.1033, + "step": 29565 + }, + { + "epoch": 0.7481843257332287, + "grad_norm": 3.3374826908111572, + "learning_rate": 1.5132761653912392e-06, + "loss": 0.1385, + "step": 29566 + }, + { + "epoch": 0.7482096312979224, + "grad_norm": 5.37874174118042, + "learning_rate": 1.5129883910362542e-06, + "loss": 0.1568, + "step": 29567 + }, + { + "epoch": 0.7482349368626161, + "grad_norm": 3.817765951156616, + "learning_rate": 1.5127006391683607e-06, + "loss": 0.1189, + "step": 29568 + }, + { + "epoch": 0.7482602424273098, + "grad_norm": 2.535917043685913, + "learning_rate": 1.5124129097894115e-06, + "loss": 0.0724, + "step": 29569 + }, + { + "epoch": 0.7482855479920034, + "grad_norm": 4.045372486114502, + "learning_rate": 1.512125202901261e-06, + "loss": 0.1212, + "step": 29570 + }, + { + "epoch": 0.7483108535566971, + "grad_norm": 5.78710412979126, + "learning_rate": 1.5118375185057683e-06, + "loss": 0.1614, + "step": 29571 + }, + { + "epoch": 0.7483361591213908, + "grad_norm": 8.467110633850098, + "learning_rate": 1.5115498566047865e-06, + "loss": 0.2548, + "step": 29572 + }, + { + "epoch": 0.7483614646860844, + "grad_norm": 5.534446716308594, + "learning_rate": 1.5112622172001707e-06, + "loss": 0.1299, + "step": 29573 + }, + { + "epoch": 0.7483867702507782, + "grad_norm": 5.277481555938721, + "learning_rate": 1.510974600293774e-06, + "loss": 0.1382, + "step": 29574 + }, + { + "epoch": 0.7484120758154719, + "grad_norm": 3.5569958686828613, + "learning_rate": 1.5106870058874574e-06, + "loss": 0.0791, + "step": 29575 + }, + { + "epoch": 0.7484373813801655, + "grad_norm": 8.311897277832031, + "learning_rate": 1.5103994339830668e-06, + "loss": 0.1963, + "step": 29576 + }, + { + "epoch": 0.7484626869448592, + "grad_norm": 5.2137908935546875, + "learning_rate": 1.5101118845824631e-06, + "loss": 0.1364, + "step": 29577 + }, + { + "epoch": 0.7484879925095529, + "grad_norm": 8.368191719055176, + "learning_rate": 1.5098243576874978e-06, + "loss": 0.2397, + "step": 29578 + }, + { + "epoch": 0.7485132980742465, + "grad_norm": 3.5070886611938477, + "learning_rate": 1.5095368533000283e-06, + "loss": 0.1331, + "step": 29579 + }, + { + "epoch": 0.7485386036389402, + "grad_norm": 14.732061386108398, + "learning_rate": 1.509249371421903e-06, + "loss": 0.198, + "step": 29580 + }, + { + "epoch": 0.7485639092036339, + "grad_norm": 5.252042293548584, + "learning_rate": 1.5089619120549815e-06, + "loss": 0.1292, + "step": 29581 + }, + { + "epoch": 0.7485892147683275, + "grad_norm": 3.759526014328003, + "learning_rate": 1.5086744752011135e-06, + "loss": 0.123, + "step": 29582 + }, + { + "epoch": 0.7486145203330212, + "grad_norm": 5.241724491119385, + "learning_rate": 1.508387060862156e-06, + "loss": 0.1614, + "step": 29583 + }, + { + "epoch": 0.7486398258977149, + "grad_norm": 5.170552730560303, + "learning_rate": 1.5080996690399602e-06, + "loss": 0.1754, + "step": 29584 + }, + { + "epoch": 0.7486651314624085, + "grad_norm": 4.777431964874268, + "learning_rate": 1.5078122997363808e-06, + "loss": 0.1499, + "step": 29585 + }, + { + "epoch": 0.7486904370271023, + "grad_norm": 8.68416976928711, + "learning_rate": 1.5075249529532687e-06, + "loss": 0.1886, + "step": 29586 + }, + { + "epoch": 0.748715742591796, + "grad_norm": 10.734652519226074, + "learning_rate": 1.5072376286924796e-06, + "loss": 0.2421, + "step": 29587 + }, + { + "epoch": 0.7487410481564896, + "grad_norm": 11.45879077911377, + "learning_rate": 1.5069503269558656e-06, + "loss": 0.2009, + "step": 29588 + }, + { + "epoch": 0.7487663537211833, + "grad_norm": 11.107423782348633, + "learning_rate": 1.5066630477452793e-06, + "loss": 0.2004, + "step": 29589 + }, + { + "epoch": 0.748791659285877, + "grad_norm": 4.1704421043396, + "learning_rate": 1.5063757910625714e-06, + "loss": 0.1609, + "step": 29590 + }, + { + "epoch": 0.7488169648505706, + "grad_norm": 13.972870826721191, + "learning_rate": 1.5060885569095978e-06, + "loss": 0.2798, + "step": 29591 + }, + { + "epoch": 0.7488422704152643, + "grad_norm": 7.963703632354736, + "learning_rate": 1.5058013452882093e-06, + "loss": 0.2287, + "step": 29592 + }, + { + "epoch": 0.748867575979958, + "grad_norm": 3.7000646591186523, + "learning_rate": 1.5055141562002578e-06, + "loss": 0.1772, + "step": 29593 + }, + { + "epoch": 0.7488928815446517, + "grad_norm": 4.173059940338135, + "learning_rate": 1.5052269896475941e-06, + "loss": 0.1516, + "step": 29594 + }, + { + "epoch": 0.7489181871093453, + "grad_norm": 4.010608196258545, + "learning_rate": 1.504939845632073e-06, + "loss": 0.1908, + "step": 29595 + }, + { + "epoch": 0.748943492674039, + "grad_norm": 6.166836261749268, + "learning_rate": 1.5046527241555447e-06, + "loss": 0.1676, + "step": 29596 + }, + { + "epoch": 0.7489687982387327, + "grad_norm": 6.552699565887451, + "learning_rate": 1.5043656252198608e-06, + "loss": 0.1643, + "step": 29597 + }, + { + "epoch": 0.7489941038034263, + "grad_norm": 6.496250629425049, + "learning_rate": 1.504078548826871e-06, + "loss": 0.129, + "step": 29598 + }, + { + "epoch": 0.7490194093681201, + "grad_norm": 3.6169161796569824, + "learning_rate": 1.50379149497843e-06, + "loss": 0.1892, + "step": 29599 + }, + { + "epoch": 0.7490447149328138, + "grad_norm": 5.260301113128662, + "learning_rate": 1.5035044636763874e-06, + "loss": 0.1477, + "step": 29600 + }, + { + "epoch": 0.7490700204975074, + "grad_norm": 3.7675602436065674, + "learning_rate": 1.5032174549225926e-06, + "loss": 0.1845, + "step": 29601 + }, + { + "epoch": 0.7490953260622011, + "grad_norm": 3.5837888717651367, + "learning_rate": 1.5029304687189006e-06, + "loss": 0.1194, + "step": 29602 + }, + { + "epoch": 0.7491206316268948, + "grad_norm": 4.695613384246826, + "learning_rate": 1.5026435050671568e-06, + "loss": 0.1795, + "step": 29603 + }, + { + "epoch": 0.7491459371915884, + "grad_norm": 8.680525779724121, + "learning_rate": 1.502356563969216e-06, + "loss": 0.1905, + "step": 29604 + }, + { + "epoch": 0.7491712427562821, + "grad_norm": 7.899075031280518, + "learning_rate": 1.5020696454269257e-06, + "loss": 0.196, + "step": 29605 + }, + { + "epoch": 0.7491965483209758, + "grad_norm": 7.150291442871094, + "learning_rate": 1.5017827494421406e-06, + "loss": 0.2097, + "step": 29606 + }, + { + "epoch": 0.7492218538856694, + "grad_norm": 3.52032470703125, + "learning_rate": 1.5014958760167048e-06, + "loss": 0.1116, + "step": 29607 + }, + { + "epoch": 0.7492471594503631, + "grad_norm": 4.074526309967041, + "learning_rate": 1.5012090251524724e-06, + "loss": 0.13, + "step": 29608 + }, + { + "epoch": 0.7492724650150568, + "grad_norm": 7.172560214996338, + "learning_rate": 1.5009221968512911e-06, + "loss": 0.1411, + "step": 29609 + }, + { + "epoch": 0.7492977705797504, + "grad_norm": 5.704288959503174, + "learning_rate": 1.5006353911150146e-06, + "loss": 0.2008, + "step": 29610 + }, + { + "epoch": 0.7493230761444442, + "grad_norm": 4.258218288421631, + "learning_rate": 1.5003486079454864e-06, + "loss": 0.1842, + "step": 29611 + }, + { + "epoch": 0.7493483817091379, + "grad_norm": 3.7858469486236572, + "learning_rate": 1.500061847344561e-06, + "loss": 0.1395, + "step": 29612 + }, + { + "epoch": 0.7493736872738315, + "grad_norm": 7.361434459686279, + "learning_rate": 1.4997751093140839e-06, + "loss": 0.1761, + "step": 29613 + }, + { + "epoch": 0.7493989928385252, + "grad_norm": 9.435210227966309, + "learning_rate": 1.4994883938559075e-06, + "loss": 0.1964, + "step": 29614 + }, + { + "epoch": 0.7494242984032189, + "grad_norm": 5.366546154022217, + "learning_rate": 1.499201700971879e-06, + "loss": 0.1479, + "step": 29615 + }, + { + "epoch": 0.7494496039679125, + "grad_norm": 4.321893692016602, + "learning_rate": 1.4989150306638477e-06, + "loss": 0.1072, + "step": 29616 + }, + { + "epoch": 0.7494749095326062, + "grad_norm": 5.449960231781006, + "learning_rate": 1.498628382933661e-06, + "loss": 0.1949, + "step": 29617 + }, + { + "epoch": 0.7495002150972999, + "grad_norm": 9.882135391235352, + "learning_rate": 1.49834175778317e-06, + "loss": 0.1608, + "step": 29618 + }, + { + "epoch": 0.7495255206619936, + "grad_norm": 5.494376182556152, + "learning_rate": 1.4980551552142214e-06, + "loss": 0.2082, + "step": 29619 + }, + { + "epoch": 0.7495508262266872, + "grad_norm": 11.57978630065918, + "learning_rate": 1.4977685752286637e-06, + "loss": 0.359, + "step": 29620 + }, + { + "epoch": 0.7495761317913809, + "grad_norm": 3.470369338989258, + "learning_rate": 1.4974820178283433e-06, + "loss": 0.1155, + "step": 29621 + }, + { + "epoch": 0.7496014373560747, + "grad_norm": 5.258523941040039, + "learning_rate": 1.497195483015112e-06, + "loss": 0.1084, + "step": 29622 + }, + { + "epoch": 0.7496267429207683, + "grad_norm": 3.984882354736328, + "learning_rate": 1.4969089707908147e-06, + "loss": 0.154, + "step": 29623 + }, + { + "epoch": 0.749652048485462, + "grad_norm": 3.6043670177459717, + "learning_rate": 1.4966224811573005e-06, + "loss": 0.169, + "step": 29624 + }, + { + "epoch": 0.7496773540501557, + "grad_norm": 7.8197550773620605, + "learning_rate": 1.496336014116414e-06, + "loss": 0.1206, + "step": 29625 + }, + { + "epoch": 0.7497026596148493, + "grad_norm": 3.709376096725464, + "learning_rate": 1.4960495696700067e-06, + "loss": 0.0904, + "step": 29626 + }, + { + "epoch": 0.749727965179543, + "grad_norm": 10.263506889343262, + "learning_rate": 1.495763147819924e-06, + "loss": 0.1258, + "step": 29627 + }, + { + "epoch": 0.7497532707442367, + "grad_norm": 8.573168754577637, + "learning_rate": 1.4954767485680134e-06, + "loss": 0.1478, + "step": 29628 + }, + { + "epoch": 0.7497785763089303, + "grad_norm": 6.16483736038208, + "learning_rate": 1.4951903719161203e-06, + "loss": 0.1742, + "step": 29629 + }, + { + "epoch": 0.749803881873624, + "grad_norm": 4.776147842407227, + "learning_rate": 1.4949040178660918e-06, + "loss": 0.1247, + "step": 29630 + }, + { + "epoch": 0.7498291874383177, + "grad_norm": 10.9325532913208, + "learning_rate": 1.4946176864197764e-06, + "loss": 0.1945, + "step": 29631 + }, + { + "epoch": 0.7498544930030113, + "grad_norm": 3.3801846504211426, + "learning_rate": 1.4943313775790197e-06, + "loss": 0.1436, + "step": 29632 + }, + { + "epoch": 0.749879798567705, + "grad_norm": 3.0532264709472656, + "learning_rate": 1.4940450913456678e-06, + "loss": 0.1084, + "step": 29633 + }, + { + "epoch": 0.7499051041323987, + "grad_norm": 5.8373003005981445, + "learning_rate": 1.4937588277215654e-06, + "loss": 0.1993, + "step": 29634 + }, + { + "epoch": 0.7499304096970923, + "grad_norm": 4.782833576202393, + "learning_rate": 1.493472586708562e-06, + "loss": 0.1767, + "step": 29635 + }, + { + "epoch": 0.7499557152617861, + "grad_norm": 4.1698431968688965, + "learning_rate": 1.4931863683084996e-06, + "loss": 0.0889, + "step": 29636 + }, + { + "epoch": 0.7499810208264798, + "grad_norm": 7.663900852203369, + "learning_rate": 1.4929001725232295e-06, + "loss": 0.2022, + "step": 29637 + }, + { + "epoch": 0.7500063263911734, + "grad_norm": 8.852215766906738, + "learning_rate": 1.4926139993545902e-06, + "loss": 0.1285, + "step": 29638 + }, + { + "epoch": 0.7500316319558671, + "grad_norm": 6.192089080810547, + "learning_rate": 1.4923278488044324e-06, + "loss": 0.1801, + "step": 29639 + }, + { + "epoch": 0.7500569375205608, + "grad_norm": 5.435768127441406, + "learning_rate": 1.4920417208745986e-06, + "loss": 0.1773, + "step": 29640 + }, + { + "epoch": 0.7500822430852544, + "grad_norm": 3.4616992473602295, + "learning_rate": 1.4917556155669383e-06, + "loss": 0.1417, + "step": 29641 + }, + { + "epoch": 0.7501075486499481, + "grad_norm": 6.4595160484313965, + "learning_rate": 1.49146953288329e-06, + "loss": 0.097, + "step": 29642 + }, + { + "epoch": 0.7501328542146418, + "grad_norm": 3.303501844406128, + "learning_rate": 1.4911834728255043e-06, + "loss": 0.1016, + "step": 29643 + }, + { + "epoch": 0.7501581597793354, + "grad_norm": 6.641231536865234, + "learning_rate": 1.4908974353954213e-06, + "loss": 0.1801, + "step": 29644 + }, + { + "epoch": 0.7501834653440291, + "grad_norm": 2.886384963989258, + "learning_rate": 1.4906114205948902e-06, + "loss": 0.1184, + "step": 29645 + }, + { + "epoch": 0.7502087709087228, + "grad_norm": 7.5770697593688965, + "learning_rate": 1.490325428425753e-06, + "loss": 0.26, + "step": 29646 + }, + { + "epoch": 0.7502340764734166, + "grad_norm": 4.813151836395264, + "learning_rate": 1.4900394588898542e-06, + "loss": 0.1284, + "step": 29647 + }, + { + "epoch": 0.7502593820381102, + "grad_norm": 5.698099136352539, + "learning_rate": 1.4897535119890367e-06, + "loss": 0.1434, + "step": 29648 + }, + { + "epoch": 0.7502846876028039, + "grad_norm": 3.8808436393737793, + "learning_rate": 1.489467587725147e-06, + "loss": 0.1512, + "step": 29649 + }, + { + "epoch": 0.7503099931674976, + "grad_norm": 3.0702717304229736, + "learning_rate": 1.489181686100028e-06, + "loss": 0.1495, + "step": 29650 + }, + { + "epoch": 0.7503352987321912, + "grad_norm": 6.286980152130127, + "learning_rate": 1.4888958071155234e-06, + "loss": 0.1913, + "step": 29651 + }, + { + "epoch": 0.7503606042968849, + "grad_norm": 7.327483177185059, + "learning_rate": 1.4886099507734746e-06, + "loss": 0.3083, + "step": 29652 + }, + { + "epoch": 0.7503859098615786, + "grad_norm": 6.0407633781433105, + "learning_rate": 1.488324117075729e-06, + "loss": 0.1728, + "step": 29653 + }, + { + "epoch": 0.7504112154262722, + "grad_norm": 2.8437106609344482, + "learning_rate": 1.4880383060241282e-06, + "loss": 0.1161, + "step": 29654 + }, + { + "epoch": 0.7504365209909659, + "grad_norm": 3.4232120513916016, + "learning_rate": 1.487752517620515e-06, + "loss": 0.1886, + "step": 29655 + }, + { + "epoch": 0.7504618265556596, + "grad_norm": 4.132141590118408, + "learning_rate": 1.4874667518667302e-06, + "loss": 0.077, + "step": 29656 + }, + { + "epoch": 0.7504871321203532, + "grad_norm": 3.9853646755218506, + "learning_rate": 1.487181008764621e-06, + "loss": 0.0875, + "step": 29657 + }, + { + "epoch": 0.7505124376850469, + "grad_norm": 3.1917099952697754, + "learning_rate": 1.4868952883160282e-06, + "loss": 0.1358, + "step": 29658 + }, + { + "epoch": 0.7505377432497407, + "grad_norm": 7.264317989349365, + "learning_rate": 1.486609590522794e-06, + "loss": 0.1739, + "step": 29659 + }, + { + "epoch": 0.7505630488144343, + "grad_norm": 6.2363786697387695, + "learning_rate": 1.4863239153867615e-06, + "loss": 0.1495, + "step": 29660 + }, + { + "epoch": 0.750588354379128, + "grad_norm": 31.360668182373047, + "learning_rate": 1.4860382629097703e-06, + "loss": 0.3067, + "step": 29661 + }, + { + "epoch": 0.7506136599438217, + "grad_norm": 10.183403015136719, + "learning_rate": 1.4857526330936662e-06, + "loss": 0.2252, + "step": 29662 + }, + { + "epoch": 0.7506389655085153, + "grad_norm": 3.428619623184204, + "learning_rate": 1.4854670259402897e-06, + "loss": 0.106, + "step": 29663 + }, + { + "epoch": 0.750664271073209, + "grad_norm": 6.933789253234863, + "learning_rate": 1.485181441451483e-06, + "loss": 0.1603, + "step": 29664 + }, + { + "epoch": 0.7506895766379027, + "grad_norm": 19.78420066833496, + "learning_rate": 1.4848958796290858e-06, + "loss": 0.1571, + "step": 29665 + }, + { + "epoch": 0.7507148822025963, + "grad_norm": 4.569066524505615, + "learning_rate": 1.4846103404749424e-06, + "loss": 0.1765, + "step": 29666 + }, + { + "epoch": 0.75074018776729, + "grad_norm": 3.0288987159729004, + "learning_rate": 1.484324823990892e-06, + "loss": 0.1371, + "step": 29667 + }, + { + "epoch": 0.7507654933319837, + "grad_norm": 3.1129117012023926, + "learning_rate": 1.4840393301787793e-06, + "loss": 0.1286, + "step": 29668 + }, + { + "epoch": 0.7507907988966773, + "grad_norm": 3.242238759994507, + "learning_rate": 1.4837538590404404e-06, + "loss": 0.1299, + "step": 29669 + }, + { + "epoch": 0.750816104461371, + "grad_norm": 3.534623861312866, + "learning_rate": 1.4834684105777197e-06, + "loss": 0.1238, + "step": 29670 + }, + { + "epoch": 0.7508414100260647, + "grad_norm": 5.0984015464782715, + "learning_rate": 1.4831829847924562e-06, + "loss": 0.1319, + "step": 29671 + }, + { + "epoch": 0.7508667155907585, + "grad_norm": 6.966787338256836, + "learning_rate": 1.4828975816864944e-06, + "loss": 0.181, + "step": 29672 + }, + { + "epoch": 0.7508920211554521, + "grad_norm": 3.251934289932251, + "learning_rate": 1.4826122012616683e-06, + "loss": 0.1588, + "step": 29673 + }, + { + "epoch": 0.7509173267201458, + "grad_norm": 4.714582443237305, + "learning_rate": 1.482326843519824e-06, + "loss": 0.1498, + "step": 29674 + }, + { + "epoch": 0.7509426322848395, + "grad_norm": 7.260040283203125, + "learning_rate": 1.4820415084627976e-06, + "loss": 0.1399, + "step": 29675 + }, + { + "epoch": 0.7509679378495331, + "grad_norm": 5.6630377769470215, + "learning_rate": 1.4817561960924325e-06, + "loss": 0.1324, + "step": 29676 + }, + { + "epoch": 0.7509932434142268, + "grad_norm": 3.554292678833008, + "learning_rate": 1.4814709064105675e-06, + "loss": 0.1203, + "step": 29677 + }, + { + "epoch": 0.7510185489789205, + "grad_norm": 5.6209821701049805, + "learning_rate": 1.4811856394190427e-06, + "loss": 0.1599, + "step": 29678 + }, + { + "epoch": 0.7510438545436141, + "grad_norm": 18.163043975830078, + "learning_rate": 1.4809003951196948e-06, + "loss": 0.2224, + "step": 29679 + }, + { + "epoch": 0.7510691601083078, + "grad_norm": 5.9341020584106445, + "learning_rate": 1.4806151735143675e-06, + "loss": 0.1809, + "step": 29680 + }, + { + "epoch": 0.7510944656730015, + "grad_norm": 3.2037129402160645, + "learning_rate": 1.4803299746048983e-06, + "loss": 0.1201, + "step": 29681 + }, + { + "epoch": 0.7511197712376951, + "grad_norm": 6.181591987609863, + "learning_rate": 1.4800447983931265e-06, + "loss": 0.21, + "step": 29682 + }, + { + "epoch": 0.7511450768023888, + "grad_norm": 5.029101848602295, + "learning_rate": 1.4797596448808892e-06, + "loss": 0.1345, + "step": 29683 + }, + { + "epoch": 0.7511703823670826, + "grad_norm": 3.4663844108581543, + "learning_rate": 1.479474514070029e-06, + "loss": 0.1335, + "step": 29684 + }, + { + "epoch": 0.7511956879317762, + "grad_norm": 5.894455432891846, + "learning_rate": 1.4791894059623823e-06, + "loss": 0.1581, + "step": 29685 + }, + { + "epoch": 0.7512209934964699, + "grad_norm": 4.487310886383057, + "learning_rate": 1.4789043205597886e-06, + "loss": 0.1427, + "step": 29686 + }, + { + "epoch": 0.7512462990611636, + "grad_norm": 6.638962745666504, + "learning_rate": 1.4786192578640856e-06, + "loss": 0.1488, + "step": 29687 + }, + { + "epoch": 0.7512716046258572, + "grad_norm": 5.888668060302734, + "learning_rate": 1.478334217877111e-06, + "loss": 0.2138, + "step": 29688 + }, + { + "epoch": 0.7512969101905509, + "grad_norm": 8.031478881835938, + "learning_rate": 1.4780492006007052e-06, + "loss": 0.2033, + "step": 29689 + }, + { + "epoch": 0.7513222157552446, + "grad_norm": 3.276125907897949, + "learning_rate": 1.4777642060367047e-06, + "loss": 0.1748, + "step": 29690 + }, + { + "epoch": 0.7513475213199382, + "grad_norm": 4.253077983856201, + "learning_rate": 1.477479234186948e-06, + "loss": 0.0989, + "step": 29691 + }, + { + "epoch": 0.7513728268846319, + "grad_norm": 4.801670074462891, + "learning_rate": 1.4771942850532711e-06, + "loss": 0.1182, + "step": 29692 + }, + { + "epoch": 0.7513981324493256, + "grad_norm": 3.1896817684173584, + "learning_rate": 1.4769093586375143e-06, + "loss": 0.1231, + "step": 29693 + }, + { + "epoch": 0.7514234380140192, + "grad_norm": 2.1891114711761475, + "learning_rate": 1.4766244549415137e-06, + "loss": 0.1185, + "step": 29694 + }, + { + "epoch": 0.7514487435787129, + "grad_norm": 5.105544567108154, + "learning_rate": 1.4763395739671067e-06, + "loss": 0.1216, + "step": 29695 + }, + { + "epoch": 0.7514740491434067, + "grad_norm": 4.1952714920043945, + "learning_rate": 1.4760547157161281e-06, + "loss": 0.1088, + "step": 29696 + }, + { + "epoch": 0.7514993547081004, + "grad_norm": 4.641656875610352, + "learning_rate": 1.4757698801904197e-06, + "loss": 0.1628, + "step": 29697 + }, + { + "epoch": 0.751524660272794, + "grad_norm": 3.7545225620269775, + "learning_rate": 1.4754850673918136e-06, + "loss": 0.1298, + "step": 29698 + }, + { + "epoch": 0.7515499658374877, + "grad_norm": 10.968610763549805, + "learning_rate": 1.475200277322152e-06, + "loss": 0.1369, + "step": 29699 + }, + { + "epoch": 0.7515752714021814, + "grad_norm": 3.242788314819336, + "learning_rate": 1.4749155099832645e-06, + "loss": 0.1399, + "step": 29700 + }, + { + "epoch": 0.751600576966875, + "grad_norm": 4.6262383460998535, + "learning_rate": 1.4746307653769938e-06, + "loss": 0.0963, + "step": 29701 + }, + { + "epoch": 0.7516258825315687, + "grad_norm": 4.456644535064697, + "learning_rate": 1.474346043505171e-06, + "loss": 0.17, + "step": 29702 + }, + { + "epoch": 0.7516511880962624, + "grad_norm": 3.4363503456115723, + "learning_rate": 1.4740613443696384e-06, + "loss": 0.1564, + "step": 29703 + }, + { + "epoch": 0.751676493660956, + "grad_norm": 2.877054452896118, + "learning_rate": 1.4737766679722248e-06, + "loss": 0.166, + "step": 29704 + }, + { + "epoch": 0.7517017992256497, + "grad_norm": 3.009998083114624, + "learning_rate": 1.4734920143147718e-06, + "loss": 0.0899, + "step": 29705 + }, + { + "epoch": 0.7517271047903434, + "grad_norm": 4.0145745277404785, + "learning_rate": 1.4732073833991106e-06, + "loss": 0.1358, + "step": 29706 + }, + { + "epoch": 0.751752410355037, + "grad_norm": 5.155594825744629, + "learning_rate": 1.4729227752270826e-06, + "loss": 0.1448, + "step": 29707 + }, + { + "epoch": 0.7517777159197307, + "grad_norm": 3.0599935054779053, + "learning_rate": 1.472638189800516e-06, + "loss": 0.1121, + "step": 29708 + }, + { + "epoch": 0.7518030214844245, + "grad_norm": 3.2501916885375977, + "learning_rate": 1.4723536271212508e-06, + "loss": 0.144, + "step": 29709 + }, + { + "epoch": 0.7518283270491181, + "grad_norm": 3.324010133743286, + "learning_rate": 1.4720690871911197e-06, + "loss": 0.1622, + "step": 29710 + }, + { + "epoch": 0.7518536326138118, + "grad_norm": 4.188576698303223, + "learning_rate": 1.4717845700119605e-06, + "loss": 0.0865, + "step": 29711 + }, + { + "epoch": 0.7518789381785055, + "grad_norm": 3.4816322326660156, + "learning_rate": 1.4715000755856064e-06, + "loss": 0.0886, + "step": 29712 + }, + { + "epoch": 0.7519042437431991, + "grad_norm": 5.4031500816345215, + "learning_rate": 1.4712156039138913e-06, + "loss": 0.1569, + "step": 29713 + }, + { + "epoch": 0.7519295493078928, + "grad_norm": 6.775277137756348, + "learning_rate": 1.4709311549986511e-06, + "loss": 0.171, + "step": 29714 + }, + { + "epoch": 0.7519548548725865, + "grad_norm": 4.436426639556885, + "learning_rate": 1.4706467288417176e-06, + "loss": 0.1714, + "step": 29715 + }, + { + "epoch": 0.7519801604372801, + "grad_norm": 4.685898303985596, + "learning_rate": 1.4703623254449284e-06, + "loss": 0.1574, + "step": 29716 + }, + { + "epoch": 0.7520054660019738, + "grad_norm": 4.6628265380859375, + "learning_rate": 1.470077944810116e-06, + "loss": 0.1999, + "step": 29717 + }, + { + "epoch": 0.7520307715666675, + "grad_norm": 4.235029697418213, + "learning_rate": 1.4697935869391144e-06, + "loss": 0.1647, + "step": 29718 + }, + { + "epoch": 0.7520560771313611, + "grad_norm": 6.196065425872803, + "learning_rate": 1.4695092518337556e-06, + "loss": 0.191, + "step": 29719 + }, + { + "epoch": 0.7520813826960548, + "grad_norm": 5.014163970947266, + "learning_rate": 1.4692249394958764e-06, + "loss": 0.0864, + "step": 29720 + }, + { + "epoch": 0.7521066882607486, + "grad_norm": 10.93543815612793, + "learning_rate": 1.468940649927309e-06, + "loss": 0.0925, + "step": 29721 + }, + { + "epoch": 0.7521319938254423, + "grad_norm": 3.7386224269866943, + "learning_rate": 1.4686563831298867e-06, + "loss": 0.139, + "step": 29722 + }, + { + "epoch": 0.7521572993901359, + "grad_norm": 15.792906761169434, + "learning_rate": 1.4683721391054406e-06, + "loss": 0.1969, + "step": 29723 + }, + { + "epoch": 0.7521826049548296, + "grad_norm": 8.822587966918945, + "learning_rate": 1.4680879178558072e-06, + "loss": 0.2802, + "step": 29724 + }, + { + "epoch": 0.7522079105195233, + "grad_norm": 7.004777431488037, + "learning_rate": 1.4678037193828176e-06, + "loss": 0.1699, + "step": 29725 + }, + { + "epoch": 0.7522332160842169, + "grad_norm": 4.134443283081055, + "learning_rate": 1.4675195436883055e-06, + "loss": 0.1614, + "step": 29726 + }, + { + "epoch": 0.7522585216489106, + "grad_norm": 3.8694849014282227, + "learning_rate": 1.4672353907741004e-06, + "loss": 0.1258, + "step": 29727 + }, + { + "epoch": 0.7522838272136043, + "grad_norm": 11.640888214111328, + "learning_rate": 1.4669512606420394e-06, + "loss": 0.2348, + "step": 29728 + }, + { + "epoch": 0.7523091327782979, + "grad_norm": 5.2098846435546875, + "learning_rate": 1.4666671532939503e-06, + "loss": 0.1323, + "step": 29729 + }, + { + "epoch": 0.7523344383429916, + "grad_norm": 2.8138363361358643, + "learning_rate": 1.4663830687316711e-06, + "loss": 0.1063, + "step": 29730 + }, + { + "epoch": 0.7523597439076853, + "grad_norm": 6.089120388031006, + "learning_rate": 1.466099006957027e-06, + "loss": 0.2436, + "step": 29731 + }, + { + "epoch": 0.7523850494723789, + "grad_norm": 4.164048194885254, + "learning_rate": 1.4658149679718542e-06, + "loss": 0.1435, + "step": 29732 + }, + { + "epoch": 0.7524103550370727, + "grad_norm": 4.338105201721191, + "learning_rate": 1.465530951777982e-06, + "loss": 0.1861, + "step": 29733 + }, + { + "epoch": 0.7524356606017664, + "grad_norm": 6.511112213134766, + "learning_rate": 1.4652469583772467e-06, + "loss": 0.1945, + "step": 29734 + }, + { + "epoch": 0.75246096616646, + "grad_norm": 4.098392486572266, + "learning_rate": 1.4649629877714728e-06, + "loss": 0.1357, + "step": 29735 + }, + { + "epoch": 0.7524862717311537, + "grad_norm": 4.33353328704834, + "learning_rate": 1.4646790399624967e-06, + "loss": 0.1503, + "step": 29736 + }, + { + "epoch": 0.7525115772958474, + "grad_norm": 12.60603141784668, + "learning_rate": 1.4643951149521457e-06, + "loss": 0.2634, + "step": 29737 + }, + { + "epoch": 0.752536882860541, + "grad_norm": 4.6421732902526855, + "learning_rate": 1.4641112127422568e-06, + "loss": 0.1213, + "step": 29738 + }, + { + "epoch": 0.7525621884252347, + "grad_norm": 4.694929122924805, + "learning_rate": 1.4638273333346536e-06, + "loss": 0.146, + "step": 29739 + }, + { + "epoch": 0.7525874939899284, + "grad_norm": 6.171581745147705, + "learning_rate": 1.4635434767311713e-06, + "loss": 0.173, + "step": 29740 + }, + { + "epoch": 0.752612799554622, + "grad_norm": 5.960891246795654, + "learning_rate": 1.4632596429336382e-06, + "loss": 0.1631, + "step": 29741 + }, + { + "epoch": 0.7526381051193157, + "grad_norm": 5.183147430419922, + "learning_rate": 1.4629758319438875e-06, + "loss": 0.1857, + "step": 29742 + }, + { + "epoch": 0.7526634106840094, + "grad_norm": 3.8668174743652344, + "learning_rate": 1.462692043763747e-06, + "loss": 0.1336, + "step": 29743 + }, + { + "epoch": 0.752688716248703, + "grad_norm": 4.278192520141602, + "learning_rate": 1.4624082783950483e-06, + "loss": 0.1316, + "step": 29744 + }, + { + "epoch": 0.7527140218133967, + "grad_norm": 3.2090649604797363, + "learning_rate": 1.4621245358396202e-06, + "loss": 0.1022, + "step": 29745 + }, + { + "epoch": 0.7527393273780905, + "grad_norm": 3.3075032234191895, + "learning_rate": 1.4618408160992915e-06, + "loss": 0.1647, + "step": 29746 + }, + { + "epoch": 0.7527646329427842, + "grad_norm": 2.39172625541687, + "learning_rate": 1.4615571191758943e-06, + "loss": 0.1027, + "step": 29747 + }, + { + "epoch": 0.7527899385074778, + "grad_norm": 7.2507829666137695, + "learning_rate": 1.4612734450712573e-06, + "loss": 0.1667, + "step": 29748 + }, + { + "epoch": 0.7528152440721715, + "grad_norm": 4.038926124572754, + "learning_rate": 1.4609897937872098e-06, + "loss": 0.1004, + "step": 29749 + }, + { + "epoch": 0.7528405496368652, + "grad_norm": 3.6826412677764893, + "learning_rate": 1.4607061653255794e-06, + "loss": 0.1614, + "step": 29750 + }, + { + "epoch": 0.7528658552015588, + "grad_norm": 5.636130332946777, + "learning_rate": 1.4604225596881978e-06, + "loss": 0.1786, + "step": 29751 + }, + { + "epoch": 0.7528911607662525, + "grad_norm": 15.54118824005127, + "learning_rate": 1.4601389768768925e-06, + "loss": 0.3542, + "step": 29752 + }, + { + "epoch": 0.7529164663309462, + "grad_norm": 3.647869348526001, + "learning_rate": 1.4598554168934932e-06, + "loss": 0.1634, + "step": 29753 + }, + { + "epoch": 0.7529417718956398, + "grad_norm": 7.3219170570373535, + "learning_rate": 1.4595718797398256e-06, + "loss": 0.22, + "step": 29754 + }, + { + "epoch": 0.7529670774603335, + "grad_norm": 3.539262533187866, + "learning_rate": 1.459288365417722e-06, + "loss": 0.1208, + "step": 29755 + }, + { + "epoch": 0.7529923830250272, + "grad_norm": 6.014451503753662, + "learning_rate": 1.4590048739290091e-06, + "loss": 0.1261, + "step": 29756 + }, + { + "epoch": 0.7530176885897208, + "grad_norm": 6.453554153442383, + "learning_rate": 1.4587214052755156e-06, + "loss": 0.1436, + "step": 29757 + }, + { + "epoch": 0.7530429941544146, + "grad_norm": 5.238865852355957, + "learning_rate": 1.458437959459067e-06, + "loss": 0.2508, + "step": 29758 + }, + { + "epoch": 0.7530682997191083, + "grad_norm": 3.2383768558502197, + "learning_rate": 1.4581545364814947e-06, + "loss": 0.1623, + "step": 29759 + }, + { + "epoch": 0.7530936052838019, + "grad_norm": 2.99688458442688, + "learning_rate": 1.4578711363446253e-06, + "loss": 0.1031, + "step": 29760 + }, + { + "epoch": 0.7531189108484956, + "grad_norm": 4.813741683959961, + "learning_rate": 1.4575877590502858e-06, + "loss": 0.1888, + "step": 29761 + }, + { + "epoch": 0.7531442164131893, + "grad_norm": 4.940508842468262, + "learning_rate": 1.4573044046003027e-06, + "loss": 0.207, + "step": 29762 + }, + { + "epoch": 0.7531695219778829, + "grad_norm": 9.448816299438477, + "learning_rate": 1.4570210729965057e-06, + "loss": 0.1335, + "step": 29763 + }, + { + "epoch": 0.7531948275425766, + "grad_norm": 3.6436753273010254, + "learning_rate": 1.4567377642407193e-06, + "loss": 0.1863, + "step": 29764 + }, + { + "epoch": 0.7532201331072703, + "grad_norm": 5.805212020874023, + "learning_rate": 1.4564544783347751e-06, + "loss": 0.1637, + "step": 29765 + }, + { + "epoch": 0.7532454386719639, + "grad_norm": 6.130183696746826, + "learning_rate": 1.4561712152804935e-06, + "loss": 0.2367, + "step": 29766 + }, + { + "epoch": 0.7532707442366576, + "grad_norm": 4.036630630493164, + "learning_rate": 1.4558879750797062e-06, + "loss": 0.0949, + "step": 29767 + }, + { + "epoch": 0.7532960498013513, + "grad_norm": 9.014519691467285, + "learning_rate": 1.4556047577342368e-06, + "loss": 0.212, + "step": 29768 + }, + { + "epoch": 0.7533213553660449, + "grad_norm": 11.787650108337402, + "learning_rate": 1.4553215632459162e-06, + "loss": 0.3593, + "step": 29769 + }, + { + "epoch": 0.7533466609307387, + "grad_norm": 4.848377227783203, + "learning_rate": 1.4550383916165638e-06, + "loss": 0.1059, + "step": 29770 + }, + { + "epoch": 0.7533719664954324, + "grad_norm": 6.420457363128662, + "learning_rate": 1.4547552428480111e-06, + "loss": 0.2466, + "step": 29771 + }, + { + "epoch": 0.753397272060126, + "grad_norm": 5.736705780029297, + "learning_rate": 1.4544721169420828e-06, + "loss": 0.134, + "step": 29772 + }, + { + "epoch": 0.7534225776248197, + "grad_norm": 3.300950050354004, + "learning_rate": 1.4541890139006021e-06, + "loss": 0.1532, + "step": 29773 + }, + { + "epoch": 0.7534478831895134, + "grad_norm": 14.072225570678711, + "learning_rate": 1.4539059337253992e-06, + "loss": 0.2059, + "step": 29774 + }, + { + "epoch": 0.7534731887542071, + "grad_norm": 3.736222982406616, + "learning_rate": 1.453622876418297e-06, + "loss": 0.1074, + "step": 29775 + }, + { + "epoch": 0.7534984943189007, + "grad_norm": 6.415761470794678, + "learning_rate": 1.4533398419811207e-06, + "loss": 0.1618, + "step": 29776 + }, + { + "epoch": 0.7535237998835944, + "grad_norm": 2.4482192993164062, + "learning_rate": 1.453056830415695e-06, + "loss": 0.0812, + "step": 29777 + }, + { + "epoch": 0.7535491054482881, + "grad_norm": 3.9740097522735596, + "learning_rate": 1.4527738417238474e-06, + "loss": 0.1254, + "step": 29778 + }, + { + "epoch": 0.7535744110129817, + "grad_norm": 6.1736321449279785, + "learning_rate": 1.4524908759074013e-06, + "loss": 0.1159, + "step": 29779 + }, + { + "epoch": 0.7535997165776754, + "grad_norm": 2.7019894123077393, + "learning_rate": 1.452207932968182e-06, + "loss": 0.1163, + "step": 29780 + }, + { + "epoch": 0.7536250221423691, + "grad_norm": 5.5481791496276855, + "learning_rate": 1.451925012908012e-06, + "loss": 0.1402, + "step": 29781 + }, + { + "epoch": 0.7536503277070628, + "grad_norm": 4.356203079223633, + "learning_rate": 1.4516421157287209e-06, + "loss": 0.1864, + "step": 29782 + }, + { + "epoch": 0.7536756332717565, + "grad_norm": 2.866447687149048, + "learning_rate": 1.4513592414321264e-06, + "loss": 0.1259, + "step": 29783 + }, + { + "epoch": 0.7537009388364502, + "grad_norm": 3.760329484939575, + "learning_rate": 1.451076390020058e-06, + "loss": 0.1307, + "step": 29784 + }, + { + "epoch": 0.7537262444011438, + "grad_norm": 3.207066535949707, + "learning_rate": 1.4507935614943358e-06, + "loss": 0.1051, + "step": 29785 + }, + { + "epoch": 0.7537515499658375, + "grad_norm": 6.1717305183410645, + "learning_rate": 1.4505107558567877e-06, + "loss": 0.1216, + "step": 29786 + }, + { + "epoch": 0.7537768555305312, + "grad_norm": 4.018970012664795, + "learning_rate": 1.450227973109235e-06, + "loss": 0.1571, + "step": 29787 + }, + { + "epoch": 0.7538021610952248, + "grad_norm": 13.536808967590332, + "learning_rate": 1.4499452132535024e-06, + "loss": 0.2375, + "step": 29788 + }, + { + "epoch": 0.7538274666599185, + "grad_norm": 3.033256769180298, + "learning_rate": 1.4496624762914108e-06, + "loss": 0.142, + "step": 29789 + }, + { + "epoch": 0.7538527722246122, + "grad_norm": 5.26246452331543, + "learning_rate": 1.4493797622247868e-06, + "loss": 0.1104, + "step": 29790 + }, + { + "epoch": 0.7538780777893058, + "grad_norm": 6.144263744354248, + "learning_rate": 1.449097071055453e-06, + "loss": 0.1202, + "step": 29791 + }, + { + "epoch": 0.7539033833539995, + "grad_norm": 5.422791957855225, + "learning_rate": 1.4488144027852308e-06, + "loss": 0.1479, + "step": 29792 + }, + { + "epoch": 0.7539286889186932, + "grad_norm": 3.9726345539093018, + "learning_rate": 1.448531757415943e-06, + "loss": 0.1413, + "step": 29793 + }, + { + "epoch": 0.7539539944833868, + "grad_norm": 8.73442268371582, + "learning_rate": 1.448249134949415e-06, + "loss": 0.2661, + "step": 29794 + }, + { + "epoch": 0.7539793000480806, + "grad_norm": 4.6966400146484375, + "learning_rate": 1.4479665353874655e-06, + "loss": 0.1132, + "step": 29795 + }, + { + "epoch": 0.7540046056127743, + "grad_norm": 7.6388678550720215, + "learning_rate": 1.4476839587319224e-06, + "loss": 0.1649, + "step": 29796 + }, + { + "epoch": 0.7540299111774679, + "grad_norm": 4.309582233428955, + "learning_rate": 1.4474014049846014e-06, + "loss": 0.1473, + "step": 29797 + }, + { + "epoch": 0.7540552167421616, + "grad_norm": 5.974908351898193, + "learning_rate": 1.4471188741473297e-06, + "loss": 0.249, + "step": 29798 + }, + { + "epoch": 0.7540805223068553, + "grad_norm": 2.844954252243042, + "learning_rate": 1.4468363662219276e-06, + "loss": 0.1395, + "step": 29799 + }, + { + "epoch": 0.754105827871549, + "grad_norm": 9.900921821594238, + "learning_rate": 1.4465538812102164e-06, + "loss": 0.2621, + "step": 29800 + }, + { + "epoch": 0.7541311334362426, + "grad_norm": 4.088669300079346, + "learning_rate": 1.446271419114017e-06, + "loss": 0.1084, + "step": 29801 + }, + { + "epoch": 0.7541564390009363, + "grad_norm": 5.839839458465576, + "learning_rate": 1.445988979935154e-06, + "loss": 0.079, + "step": 29802 + }, + { + "epoch": 0.75418174456563, + "grad_norm": 7.817545413970947, + "learning_rate": 1.4457065636754463e-06, + "loss": 0.2496, + "step": 29803 + }, + { + "epoch": 0.7542070501303236, + "grad_norm": 3.2231833934783936, + "learning_rate": 1.445424170336715e-06, + "loss": 0.0724, + "step": 29804 + }, + { + "epoch": 0.7542323556950173, + "grad_norm": 8.085527420043945, + "learning_rate": 1.4451417999207829e-06, + "loss": 0.2037, + "step": 29805 + }, + { + "epoch": 0.754257661259711, + "grad_norm": 6.6591691970825195, + "learning_rate": 1.4448594524294707e-06, + "loss": 0.1857, + "step": 29806 + }, + { + "epoch": 0.7542829668244047, + "grad_norm": 4.231852054595947, + "learning_rate": 1.4445771278645981e-06, + "loss": 0.1868, + "step": 29807 + }, + { + "epoch": 0.7543082723890984, + "grad_norm": 9.326865196228027, + "learning_rate": 1.4442948262279848e-06, + "loss": 0.2819, + "step": 29808 + }, + { + "epoch": 0.7543335779537921, + "grad_norm": 4.6414690017700195, + "learning_rate": 1.4440125475214557e-06, + "loss": 0.1271, + "step": 29809 + }, + { + "epoch": 0.7543588835184857, + "grad_norm": 3.4725279808044434, + "learning_rate": 1.4437302917468255e-06, + "loss": 0.0581, + "step": 29810 + }, + { + "epoch": 0.7543841890831794, + "grad_norm": 3.6476023197174072, + "learning_rate": 1.4434480589059185e-06, + "loss": 0.1331, + "step": 29811 + }, + { + "epoch": 0.7544094946478731, + "grad_norm": 15.045894622802734, + "learning_rate": 1.4431658490005517e-06, + "loss": 0.2835, + "step": 29812 + }, + { + "epoch": 0.7544348002125667, + "grad_norm": 2.9692955017089844, + "learning_rate": 1.4428836620325498e-06, + "loss": 0.1067, + "step": 29813 + }, + { + "epoch": 0.7544601057772604, + "grad_norm": 5.704436302185059, + "learning_rate": 1.4426014980037268e-06, + "loss": 0.2162, + "step": 29814 + }, + { + "epoch": 0.7544854113419541, + "grad_norm": 5.981710433959961, + "learning_rate": 1.442319356915906e-06, + "loss": 0.1391, + "step": 29815 + }, + { + "epoch": 0.7545107169066477, + "grad_norm": 5.093532085418701, + "learning_rate": 1.4420372387709047e-06, + "loss": 0.0941, + "step": 29816 + }, + { + "epoch": 0.7545360224713414, + "grad_norm": 2.929659128189087, + "learning_rate": 1.4417551435705452e-06, + "loss": 0.0964, + "step": 29817 + }, + { + "epoch": 0.7545613280360352, + "grad_norm": 4.167766571044922, + "learning_rate": 1.4414730713166448e-06, + "loss": 0.1627, + "step": 29818 + }, + { + "epoch": 0.7545866336007288, + "grad_norm": 3.4575562477111816, + "learning_rate": 1.441191022011022e-06, + "loss": 0.1341, + "step": 29819 + }, + { + "epoch": 0.7546119391654225, + "grad_norm": 4.64453125, + "learning_rate": 1.4409089956554957e-06, + "loss": 0.1151, + "step": 29820 + }, + { + "epoch": 0.7546372447301162, + "grad_norm": 6.247417449951172, + "learning_rate": 1.440626992251886e-06, + "loss": 0.1553, + "step": 29821 + }, + { + "epoch": 0.7546625502948098, + "grad_norm": 4.756271839141846, + "learning_rate": 1.4403450118020111e-06, + "loss": 0.1172, + "step": 29822 + }, + { + "epoch": 0.7546878558595035, + "grad_norm": 4.788540363311768, + "learning_rate": 1.4400630543076887e-06, + "loss": 0.2089, + "step": 29823 + }, + { + "epoch": 0.7547131614241972, + "grad_norm": 5.068286418914795, + "learning_rate": 1.4397811197707369e-06, + "loss": 0.1288, + "step": 29824 + }, + { + "epoch": 0.7547384669888909, + "grad_norm": 3.223848581314087, + "learning_rate": 1.4394992081929749e-06, + "loss": 0.1353, + "step": 29825 + }, + { + "epoch": 0.7547637725535845, + "grad_norm": 5.273033618927002, + "learning_rate": 1.4392173195762189e-06, + "loss": 0.1427, + "step": 29826 + }, + { + "epoch": 0.7547890781182782, + "grad_norm": 4.166407108306885, + "learning_rate": 1.4389354539222917e-06, + "loss": 0.1528, + "step": 29827 + }, + { + "epoch": 0.7548143836829719, + "grad_norm": 3.233743906021118, + "learning_rate": 1.4386536112330036e-06, + "loss": 0.1074, + "step": 29828 + }, + { + "epoch": 0.7548396892476655, + "grad_norm": 3.922417402267456, + "learning_rate": 1.4383717915101774e-06, + "loss": 0.1477, + "step": 29829 + }, + { + "epoch": 0.7548649948123592, + "grad_norm": 3.6290090084075928, + "learning_rate": 1.4380899947556294e-06, + "loss": 0.1361, + "step": 29830 + }, + { + "epoch": 0.754890300377053, + "grad_norm": 9.753605842590332, + "learning_rate": 1.4378082209711764e-06, + "loss": 0.1867, + "step": 29831 + }, + { + "epoch": 0.7549156059417466, + "grad_norm": 9.237122535705566, + "learning_rate": 1.437526470158634e-06, + "loss": 0.1337, + "step": 29832 + }, + { + "epoch": 0.7549409115064403, + "grad_norm": 3.4961299896240234, + "learning_rate": 1.4372447423198221e-06, + "loss": 0.0976, + "step": 29833 + }, + { + "epoch": 0.754966217071134, + "grad_norm": 3.924607992172241, + "learning_rate": 1.4369630374565563e-06, + "loss": 0.139, + "step": 29834 + }, + { + "epoch": 0.7549915226358276, + "grad_norm": 3.4837589263916016, + "learning_rate": 1.4366813555706533e-06, + "loss": 0.172, + "step": 29835 + }, + { + "epoch": 0.7550168282005213, + "grad_norm": 3.111177921295166, + "learning_rate": 1.4363996966639287e-06, + "loss": 0.1026, + "step": 29836 + }, + { + "epoch": 0.755042133765215, + "grad_norm": 9.662426948547363, + "learning_rate": 1.4361180607381985e-06, + "loss": 0.2223, + "step": 29837 + }, + { + "epoch": 0.7550674393299086, + "grad_norm": 4.725998401641846, + "learning_rate": 1.4358364477952818e-06, + "loss": 0.146, + "step": 29838 + }, + { + "epoch": 0.7550927448946023, + "grad_norm": 2.9989051818847656, + "learning_rate": 1.435554857836991e-06, + "loss": 0.1169, + "step": 29839 + }, + { + "epoch": 0.755118050459296, + "grad_norm": 4.24421501159668, + "learning_rate": 1.4352732908651473e-06, + "loss": 0.0714, + "step": 29840 + }, + { + "epoch": 0.7551433560239896, + "grad_norm": 5.981006622314453, + "learning_rate": 1.4349917468815594e-06, + "loss": 0.2137, + "step": 29841 + }, + { + "epoch": 0.7551686615886833, + "grad_norm": 8.293758392333984, + "learning_rate": 1.4347102258880485e-06, + "loss": 0.1677, + "step": 29842 + }, + { + "epoch": 0.755193967153377, + "grad_norm": 5.123244762420654, + "learning_rate": 1.4344287278864267e-06, + "loss": 0.1382, + "step": 29843 + }, + { + "epoch": 0.7552192727180707, + "grad_norm": 5.216912269592285, + "learning_rate": 1.4341472528785134e-06, + "loss": 0.147, + "step": 29844 + }, + { + "epoch": 0.7552445782827644, + "grad_norm": 4.497297763824463, + "learning_rate": 1.4338658008661183e-06, + "loss": 0.155, + "step": 29845 + }, + { + "epoch": 0.7552698838474581, + "grad_norm": 3.8086469173431396, + "learning_rate": 1.433584371851061e-06, + "loss": 0.1636, + "step": 29846 + }, + { + "epoch": 0.7552951894121517, + "grad_norm": 9.826725006103516, + "learning_rate": 1.4333029658351533e-06, + "loss": 0.2083, + "step": 29847 + }, + { + "epoch": 0.7553204949768454, + "grad_norm": 4.760557174682617, + "learning_rate": 1.4330215828202127e-06, + "loss": 0.1311, + "step": 29848 + }, + { + "epoch": 0.7553458005415391, + "grad_norm": 4.196944713592529, + "learning_rate": 1.4327402228080523e-06, + "loss": 0.161, + "step": 29849 + }, + { + "epoch": 0.7553711061062328, + "grad_norm": 7.259440898895264, + "learning_rate": 1.4324588858004867e-06, + "loss": 0.233, + "step": 29850 + }, + { + "epoch": 0.7553964116709264, + "grad_norm": 5.830629825592041, + "learning_rate": 1.4321775717993286e-06, + "loss": 0.2189, + "step": 29851 + }, + { + "epoch": 0.7554217172356201, + "grad_norm": 5.091988563537598, + "learning_rate": 1.4318962808063947e-06, + "loss": 0.1484, + "step": 29852 + }, + { + "epoch": 0.7554470228003138, + "grad_norm": 12.823915481567383, + "learning_rate": 1.4316150128234984e-06, + "loss": 0.2504, + "step": 29853 + }, + { + "epoch": 0.7554723283650074, + "grad_norm": 3.9497344493865967, + "learning_rate": 1.4313337678524537e-06, + "loss": 0.1182, + "step": 29854 + }, + { + "epoch": 0.7554976339297012, + "grad_norm": 3.611243724822998, + "learning_rate": 1.4310525458950714e-06, + "loss": 0.1456, + "step": 29855 + }, + { + "epoch": 0.7555229394943949, + "grad_norm": 8.286820411682129, + "learning_rate": 1.4307713469531693e-06, + "loss": 0.3197, + "step": 29856 + }, + { + "epoch": 0.7555482450590885, + "grad_norm": 3.9644389152526855, + "learning_rate": 1.4304901710285585e-06, + "loss": 0.1689, + "step": 29857 + }, + { + "epoch": 0.7555735506237822, + "grad_norm": 8.551043510437012, + "learning_rate": 1.4302090181230527e-06, + "loss": 0.156, + "step": 29858 + }, + { + "epoch": 0.7555988561884759, + "grad_norm": 6.736334800720215, + "learning_rate": 1.429927888238463e-06, + "loss": 0.1614, + "step": 29859 + }, + { + "epoch": 0.7556241617531695, + "grad_norm": 16.366411209106445, + "learning_rate": 1.4296467813766067e-06, + "loss": 0.2471, + "step": 29860 + }, + { + "epoch": 0.7556494673178632, + "grad_norm": 3.7882068157196045, + "learning_rate": 1.4293656975392938e-06, + "loss": 0.1138, + "step": 29861 + }, + { + "epoch": 0.7556747728825569, + "grad_norm": 5.537919998168945, + "learning_rate": 1.4290846367283373e-06, + "loss": 0.1527, + "step": 29862 + }, + { + "epoch": 0.7557000784472505, + "grad_norm": 4.327662467956543, + "learning_rate": 1.4288035989455479e-06, + "loss": 0.0986, + "step": 29863 + }, + { + "epoch": 0.7557253840119442, + "grad_norm": 4.757096290588379, + "learning_rate": 1.4285225841927419e-06, + "loss": 0.126, + "step": 29864 + }, + { + "epoch": 0.7557506895766379, + "grad_norm": 8.761695861816406, + "learning_rate": 1.4282415924717292e-06, + "loss": 0.156, + "step": 29865 + }, + { + "epoch": 0.7557759951413315, + "grad_norm": 4.128692626953125, + "learning_rate": 1.427960623784322e-06, + "loss": 0.1311, + "step": 29866 + }, + { + "epoch": 0.7558013007060252, + "grad_norm": 6.610514163970947, + "learning_rate": 1.427679678132332e-06, + "loss": 0.1654, + "step": 29867 + }, + { + "epoch": 0.755826606270719, + "grad_norm": 2.853579044342041, + "learning_rate": 1.42739875551757e-06, + "loss": 0.1557, + "step": 29868 + }, + { + "epoch": 0.7558519118354126, + "grad_norm": 9.738733291625977, + "learning_rate": 1.4271178559418502e-06, + "loss": 0.1662, + "step": 29869 + }, + { + "epoch": 0.7558772174001063, + "grad_norm": 6.651728630065918, + "learning_rate": 1.4268369794069814e-06, + "loss": 0.166, + "step": 29870 + }, + { + "epoch": 0.7559025229648, + "grad_norm": 5.711907386779785, + "learning_rate": 1.4265561259147793e-06, + "loss": 0.1579, + "step": 29871 + }, + { + "epoch": 0.7559278285294936, + "grad_norm": 8.320366859436035, + "learning_rate": 1.426275295467049e-06, + "loss": 0.2307, + "step": 29872 + }, + { + "epoch": 0.7559531340941873, + "grad_norm": 5.09189510345459, + "learning_rate": 1.4259944880656062e-06, + "loss": 0.1764, + "step": 29873 + }, + { + "epoch": 0.755978439658881, + "grad_norm": 4.880305290222168, + "learning_rate": 1.4257137037122582e-06, + "loss": 0.1283, + "step": 29874 + }, + { + "epoch": 0.7560037452235747, + "grad_norm": 4.205445289611816, + "learning_rate": 1.4254329424088215e-06, + "loss": 0.1164, + "step": 29875 + }, + { + "epoch": 0.7560290507882683, + "grad_norm": 9.283581733703613, + "learning_rate": 1.4251522041570992e-06, + "loss": 0.2071, + "step": 29876 + }, + { + "epoch": 0.756054356352962, + "grad_norm": 11.139556884765625, + "learning_rate": 1.4248714889589065e-06, + "loss": 0.2966, + "step": 29877 + }, + { + "epoch": 0.7560796619176557, + "grad_norm": 6.0520172119140625, + "learning_rate": 1.4245907968160516e-06, + "loss": 0.1581, + "step": 29878 + }, + { + "epoch": 0.7561049674823493, + "grad_norm": 3.3736350536346436, + "learning_rate": 1.4243101277303472e-06, + "loss": 0.1273, + "step": 29879 + }, + { + "epoch": 0.7561302730470431, + "grad_norm": 4.857706069946289, + "learning_rate": 1.424029481703601e-06, + "loss": 0.1357, + "step": 29880 + }, + { + "epoch": 0.7561555786117368, + "grad_norm": 6.427282333374023, + "learning_rate": 1.423748858737624e-06, + "loss": 0.137, + "step": 29881 + }, + { + "epoch": 0.7561808841764304, + "grad_norm": 12.412290573120117, + "learning_rate": 1.4234682588342242e-06, + "loss": 0.417, + "step": 29882 + }, + { + "epoch": 0.7562061897411241, + "grad_norm": 4.004201889038086, + "learning_rate": 1.4231876819952128e-06, + "loss": 0.113, + "step": 29883 + }, + { + "epoch": 0.7562314953058178, + "grad_norm": 4.866579055786133, + "learning_rate": 1.4229071282223995e-06, + "loss": 0.2018, + "step": 29884 + }, + { + "epoch": 0.7562568008705114, + "grad_norm": 4.006656169891357, + "learning_rate": 1.422626597517593e-06, + "loss": 0.1292, + "step": 29885 + }, + { + "epoch": 0.7562821064352051, + "grad_norm": 5.287026405334473, + "learning_rate": 1.4223460898825998e-06, + "loss": 0.2013, + "step": 29886 + }, + { + "epoch": 0.7563074119998988, + "grad_norm": 4.355219841003418, + "learning_rate": 1.4220656053192327e-06, + "loss": 0.1792, + "step": 29887 + }, + { + "epoch": 0.7563327175645924, + "grad_norm": 4.907159805297852, + "learning_rate": 1.4217851438292996e-06, + "loss": 0.1869, + "step": 29888 + }, + { + "epoch": 0.7563580231292861, + "grad_norm": 2.414533853530884, + "learning_rate": 1.4215047054146081e-06, + "loss": 0.0998, + "step": 29889 + }, + { + "epoch": 0.7563833286939798, + "grad_norm": 4.2118144035339355, + "learning_rate": 1.4212242900769657e-06, + "loss": 0.1731, + "step": 29890 + }, + { + "epoch": 0.7564086342586734, + "grad_norm": 4.415256023406982, + "learning_rate": 1.4209438978181838e-06, + "loss": 0.1508, + "step": 29891 + }, + { + "epoch": 0.7564339398233672, + "grad_norm": 5.563910961151123, + "learning_rate": 1.4206635286400688e-06, + "loss": 0.1234, + "step": 29892 + }, + { + "epoch": 0.7564592453880609, + "grad_norm": 8.92656135559082, + "learning_rate": 1.4203831825444291e-06, + "loss": 0.1882, + "step": 29893 + }, + { + "epoch": 0.7564845509527545, + "grad_norm": 6.199501037597656, + "learning_rate": 1.4201028595330724e-06, + "loss": 0.1672, + "step": 29894 + }, + { + "epoch": 0.7565098565174482, + "grad_norm": 6.296017646789551, + "learning_rate": 1.419822559607804e-06, + "loss": 0.228, + "step": 29895 + }, + { + "epoch": 0.7565351620821419, + "grad_norm": 5.324446678161621, + "learning_rate": 1.4195422827704364e-06, + "loss": 0.1446, + "step": 29896 + }, + { + "epoch": 0.7565604676468355, + "grad_norm": 3.922276735305786, + "learning_rate": 1.419262029022775e-06, + "loss": 0.1545, + "step": 29897 + }, + { + "epoch": 0.7565857732115292, + "grad_norm": 4.612618923187256, + "learning_rate": 1.418981798366626e-06, + "loss": 0.1498, + "step": 29898 + }, + { + "epoch": 0.7566110787762229, + "grad_norm": 3.588773727416992, + "learning_rate": 1.4187015908037954e-06, + "loss": 0.1369, + "step": 29899 + }, + { + "epoch": 0.7566363843409165, + "grad_norm": 4.735684871673584, + "learning_rate": 1.418421406336094e-06, + "loss": 0.0902, + "step": 29900 + }, + { + "epoch": 0.7566616899056102, + "grad_norm": 3.8689372539520264, + "learning_rate": 1.4181412449653253e-06, + "loss": 0.0989, + "step": 29901 + }, + { + "epoch": 0.7566869954703039, + "grad_norm": 7.610074043273926, + "learning_rate": 1.4178611066933007e-06, + "loss": 0.0659, + "step": 29902 + }, + { + "epoch": 0.7567123010349976, + "grad_norm": 3.4974780082702637, + "learning_rate": 1.4175809915218203e-06, + "loss": 0.1472, + "step": 29903 + }, + { + "epoch": 0.7567376065996912, + "grad_norm": 4.454966068267822, + "learning_rate": 1.4173008994526948e-06, + "loss": 0.1369, + "step": 29904 + }, + { + "epoch": 0.756762912164385, + "grad_norm": 3.1023831367492676, + "learning_rate": 1.417020830487728e-06, + "loss": 0.0855, + "step": 29905 + }, + { + "epoch": 0.7567882177290787, + "grad_norm": 4.480108737945557, + "learning_rate": 1.4167407846287306e-06, + "loss": 0.133, + "step": 29906 + }, + { + "epoch": 0.7568135232937723, + "grad_norm": 5.182100296020508, + "learning_rate": 1.4164607618775022e-06, + "loss": 0.1311, + "step": 29907 + }, + { + "epoch": 0.756838828858466, + "grad_norm": 2.8783068656921387, + "learning_rate": 1.4161807622358538e-06, + "loss": 0.1262, + "step": 29908 + }, + { + "epoch": 0.7568641344231597, + "grad_norm": 3.7446060180664062, + "learning_rate": 1.4159007857055868e-06, + "loss": 0.1694, + "step": 29909 + }, + { + "epoch": 0.7568894399878533, + "grad_norm": 6.940948963165283, + "learning_rate": 1.4156208322885124e-06, + "loss": 0.1904, + "step": 29910 + }, + { + "epoch": 0.756914745552547, + "grad_norm": 5.012892723083496, + "learning_rate": 1.415340901986429e-06, + "loss": 0.1008, + "step": 29911 + }, + { + "epoch": 0.7569400511172407, + "grad_norm": 4.473285675048828, + "learning_rate": 1.4150609948011472e-06, + "loss": 0.1118, + "step": 29912 + }, + { + "epoch": 0.7569653566819343, + "grad_norm": 4.247766494750977, + "learning_rate": 1.414781110734469e-06, + "loss": 0.1551, + "step": 29913 + }, + { + "epoch": 0.756990662246628, + "grad_norm": 21.345706939697266, + "learning_rate": 1.4145012497882016e-06, + "loss": 0.6, + "step": 29914 + }, + { + "epoch": 0.7570159678113217, + "grad_norm": 11.4441556930542, + "learning_rate": 1.4142214119641484e-06, + "loss": 0.1039, + "step": 29915 + }, + { + "epoch": 0.7570412733760153, + "grad_norm": 7.1144118309021, + "learning_rate": 1.4139415972641146e-06, + "loss": 0.1625, + "step": 29916 + }, + { + "epoch": 0.7570665789407091, + "grad_norm": 4.651956081390381, + "learning_rate": 1.4136618056899032e-06, + "loss": 0.1166, + "step": 29917 + }, + { + "epoch": 0.7570918845054028, + "grad_norm": 2.3256847858428955, + "learning_rate": 1.413382037243321e-06, + "loss": 0.0835, + "step": 29918 + }, + { + "epoch": 0.7571171900700964, + "grad_norm": 4.040797710418701, + "learning_rate": 1.4131022919261706e-06, + "loss": 0.1033, + "step": 29919 + }, + { + "epoch": 0.7571424956347901, + "grad_norm": 2.7973005771636963, + "learning_rate": 1.4128225697402565e-06, + "loss": 0.0893, + "step": 29920 + }, + { + "epoch": 0.7571678011994838, + "grad_norm": 7.011063098907471, + "learning_rate": 1.4125428706873822e-06, + "loss": 0.2197, + "step": 29921 + }, + { + "epoch": 0.7571931067641774, + "grad_norm": 5.199127674102783, + "learning_rate": 1.4122631947693504e-06, + "loss": 0.1677, + "step": 29922 + }, + { + "epoch": 0.7572184123288711, + "grad_norm": 7.659801006317139, + "learning_rate": 1.4119835419879675e-06, + "loss": 0.216, + "step": 29923 + }, + { + "epoch": 0.7572437178935648, + "grad_norm": 4.680127143859863, + "learning_rate": 1.4117039123450348e-06, + "loss": 0.1762, + "step": 29924 + }, + { + "epoch": 0.7572690234582584, + "grad_norm": 4.489566802978516, + "learning_rate": 1.4114243058423566e-06, + "loss": 0.1332, + "step": 29925 + }, + { + "epoch": 0.7572943290229521, + "grad_norm": 3.9121041297912598, + "learning_rate": 1.4111447224817336e-06, + "loss": 0.1083, + "step": 29926 + }, + { + "epoch": 0.7573196345876458, + "grad_norm": 10.635054588317871, + "learning_rate": 1.4108651622649728e-06, + "loss": 0.1513, + "step": 29927 + }, + { + "epoch": 0.7573449401523396, + "grad_norm": 12.971567153930664, + "learning_rate": 1.4105856251938743e-06, + "loss": 0.1617, + "step": 29928 + }, + { + "epoch": 0.7573702457170332, + "grad_norm": 4.887187957763672, + "learning_rate": 1.4103061112702422e-06, + "loss": 0.1309, + "step": 29929 + }, + { + "epoch": 0.7573955512817269, + "grad_norm": 2.234626293182373, + "learning_rate": 1.4100266204958756e-06, + "loss": 0.078, + "step": 29930 + }, + { + "epoch": 0.7574208568464206, + "grad_norm": 3.1586618423461914, + "learning_rate": 1.409747152872582e-06, + "loss": 0.1028, + "step": 29931 + }, + { + "epoch": 0.7574461624111142, + "grad_norm": 16.500425338745117, + "learning_rate": 1.4094677084021592e-06, + "loss": 0.2789, + "step": 29932 + }, + { + "epoch": 0.7574714679758079, + "grad_norm": 16.97606086730957, + "learning_rate": 1.4091882870864143e-06, + "loss": 0.1878, + "step": 29933 + }, + { + "epoch": 0.7574967735405016, + "grad_norm": 5.696041107177734, + "learning_rate": 1.4089088889271434e-06, + "loss": 0.1362, + "step": 29934 + }, + { + "epoch": 0.7575220791051952, + "grad_norm": 3.032454490661621, + "learning_rate": 1.4086295139261524e-06, + "loss": 0.0794, + "step": 29935 + }, + { + "epoch": 0.7575473846698889, + "grad_norm": 5.998237133026123, + "learning_rate": 1.4083501620852403e-06, + "loss": 0.1891, + "step": 29936 + }, + { + "epoch": 0.7575726902345826, + "grad_norm": 5.970853805541992, + "learning_rate": 1.4080708334062132e-06, + "loss": 0.1845, + "step": 29937 + }, + { + "epoch": 0.7575979957992762, + "grad_norm": 6.393842697143555, + "learning_rate": 1.4077915278908654e-06, + "loss": 0.2234, + "step": 29938 + }, + { + "epoch": 0.7576233013639699, + "grad_norm": 6.791913986206055, + "learning_rate": 1.4075122455410039e-06, + "loss": 0.1921, + "step": 29939 + }, + { + "epoch": 0.7576486069286636, + "grad_norm": 5.537514686584473, + "learning_rate": 1.407232986358426e-06, + "loss": 0.1668, + "step": 29940 + }, + { + "epoch": 0.7576739124933572, + "grad_norm": 7.048067092895508, + "learning_rate": 1.406953750344937e-06, + "loss": 0.2338, + "step": 29941 + }, + { + "epoch": 0.757699218058051, + "grad_norm": 11.490169525146484, + "learning_rate": 1.4066745375023321e-06, + "loss": 0.3107, + "step": 29942 + }, + { + "epoch": 0.7577245236227447, + "grad_norm": 13.208366394042969, + "learning_rate": 1.4063953478324165e-06, + "loss": 0.191, + "step": 29943 + }, + { + "epoch": 0.7577498291874383, + "grad_norm": 7.603349685668945, + "learning_rate": 1.4061161813369873e-06, + "loss": 0.2074, + "step": 29944 + }, + { + "epoch": 0.757775134752132, + "grad_norm": 3.595923900604248, + "learning_rate": 1.405837038017847e-06, + "loss": 0.1207, + "step": 29945 + }, + { + "epoch": 0.7578004403168257, + "grad_norm": 4.79440450668335, + "learning_rate": 1.4055579178767958e-06, + "loss": 0.2126, + "step": 29946 + }, + { + "epoch": 0.7578257458815193, + "grad_norm": 3.217165231704712, + "learning_rate": 1.4052788209156332e-06, + "loss": 0.1203, + "step": 29947 + }, + { + "epoch": 0.757851051446213, + "grad_norm": 17.76023292541504, + "learning_rate": 1.404999747136157e-06, + "loss": 0.1327, + "step": 29948 + }, + { + "epoch": 0.7578763570109067, + "grad_norm": 6.728463649749756, + "learning_rate": 1.40472069654017e-06, + "loss": 0.1706, + "step": 29949 + }, + { + "epoch": 0.7579016625756003, + "grad_norm": 4.304161548614502, + "learning_rate": 1.4044416691294704e-06, + "loss": 0.1484, + "step": 29950 + }, + { + "epoch": 0.757926968140294, + "grad_norm": 5.3824567794799805, + "learning_rate": 1.4041626649058582e-06, + "loss": 0.1304, + "step": 29951 + }, + { + "epoch": 0.7579522737049877, + "grad_norm": 6.899636745452881, + "learning_rate": 1.4038836838711317e-06, + "loss": 0.1141, + "step": 29952 + }, + { + "epoch": 0.7579775792696815, + "grad_norm": 3.8009374141693115, + "learning_rate": 1.4036047260270892e-06, + "loss": 0.1177, + "step": 29953 + }, + { + "epoch": 0.7580028848343751, + "grad_norm": 8.518956184387207, + "learning_rate": 1.403325791375532e-06, + "loss": 0.2703, + "step": 29954 + }, + { + "epoch": 0.7580281903990688, + "grad_norm": 5.739091396331787, + "learning_rate": 1.403046879918258e-06, + "loss": 0.1626, + "step": 29955 + }, + { + "epoch": 0.7580534959637625, + "grad_norm": 5.433147430419922, + "learning_rate": 1.4027679916570652e-06, + "loss": 0.1324, + "step": 29956 + }, + { + "epoch": 0.7580788015284561, + "grad_norm": 6.564366817474365, + "learning_rate": 1.4024891265937513e-06, + "loss": 0.109, + "step": 29957 + }, + { + "epoch": 0.7581041070931498, + "grad_norm": 3.9944093227386475, + "learning_rate": 1.4022102847301171e-06, + "loss": 0.1347, + "step": 29958 + }, + { + "epoch": 0.7581294126578435, + "grad_norm": 13.678069114685059, + "learning_rate": 1.40193146606796e-06, + "loss": 0.2338, + "step": 29959 + }, + { + "epoch": 0.7581547182225371, + "grad_norm": 3.14087176322937, + "learning_rate": 1.4016526706090772e-06, + "loss": 0.0662, + "step": 29960 + }, + { + "epoch": 0.7581800237872308, + "grad_norm": 4.3077216148376465, + "learning_rate": 1.4013738983552655e-06, + "loss": 0.1299, + "step": 29961 + }, + { + "epoch": 0.7582053293519245, + "grad_norm": 6.975414276123047, + "learning_rate": 1.4010951493083258e-06, + "loss": 0.2166, + "step": 29962 + }, + { + "epoch": 0.7582306349166181, + "grad_norm": 4.237207889556885, + "learning_rate": 1.400816423470054e-06, + "loss": 0.1376, + "step": 29963 + }, + { + "epoch": 0.7582559404813118, + "grad_norm": 5.433119297027588, + "learning_rate": 1.4005377208422472e-06, + "loss": 0.1974, + "step": 29964 + }, + { + "epoch": 0.7582812460460056, + "grad_norm": 5.130471706390381, + "learning_rate": 1.400259041426702e-06, + "loss": 0.1622, + "step": 29965 + }, + { + "epoch": 0.7583065516106992, + "grad_norm": 16.340930938720703, + "learning_rate": 1.399980385225218e-06, + "loss": 0.2327, + "step": 29966 + }, + { + "epoch": 0.7583318571753929, + "grad_norm": 2.8319172859191895, + "learning_rate": 1.3997017522395895e-06, + "loss": 0.1052, + "step": 29967 + }, + { + "epoch": 0.7583571627400866, + "grad_norm": 10.136950492858887, + "learning_rate": 1.3994231424716176e-06, + "loss": 0.2515, + "step": 29968 + }, + { + "epoch": 0.7583824683047802, + "grad_norm": 17.328763961791992, + "learning_rate": 1.3991445559230931e-06, + "loss": 0.2297, + "step": 29969 + }, + { + "epoch": 0.7584077738694739, + "grad_norm": 3.3117916584014893, + "learning_rate": 1.3988659925958171e-06, + "loss": 0.1129, + "step": 29970 + }, + { + "epoch": 0.7584330794341676, + "grad_norm": 4.0636138916015625, + "learning_rate": 1.3985874524915826e-06, + "loss": 0.1472, + "step": 29971 + }, + { + "epoch": 0.7584583849988612, + "grad_norm": 3.4996888637542725, + "learning_rate": 1.3983089356121909e-06, + "loss": 0.149, + "step": 29972 + }, + { + "epoch": 0.7584836905635549, + "grad_norm": 6.276858806610107, + "learning_rate": 1.398030441959432e-06, + "loss": 0.1905, + "step": 29973 + }, + { + "epoch": 0.7585089961282486, + "grad_norm": 7.056417465209961, + "learning_rate": 1.3977519715351062e-06, + "loss": 0.185, + "step": 29974 + }, + { + "epoch": 0.7585343016929422, + "grad_norm": 5.30228853225708, + "learning_rate": 1.3974735243410064e-06, + "loss": 0.1082, + "step": 29975 + }, + { + "epoch": 0.7585596072576359, + "grad_norm": 7.269708156585693, + "learning_rate": 1.3971951003789314e-06, + "loss": 0.1114, + "step": 29976 + }, + { + "epoch": 0.7585849128223296, + "grad_norm": 7.360497951507568, + "learning_rate": 1.396916699650675e-06, + "loss": 0.2063, + "step": 29977 + }, + { + "epoch": 0.7586102183870234, + "grad_norm": 4.507488250732422, + "learning_rate": 1.396638322158032e-06, + "loss": 0.1466, + "step": 29978 + }, + { + "epoch": 0.758635523951717, + "grad_norm": 4.052594184875488, + "learning_rate": 1.3963599679027989e-06, + "loss": 0.1422, + "step": 29979 + }, + { + "epoch": 0.7586608295164107, + "grad_norm": 3.2866437435150146, + "learning_rate": 1.396081636886768e-06, + "loss": 0.0967, + "step": 29980 + }, + { + "epoch": 0.7586861350811044, + "grad_norm": 2.99078369140625, + "learning_rate": 1.3958033291117384e-06, + "loss": 0.118, + "step": 29981 + }, + { + "epoch": 0.758711440645798, + "grad_norm": 6.815042972564697, + "learning_rate": 1.395525044579502e-06, + "loss": 0.1187, + "step": 29982 + }, + { + "epoch": 0.7587367462104917, + "grad_norm": 3.6632001399993896, + "learning_rate": 1.3952467832918547e-06, + "loss": 0.1742, + "step": 29983 + }, + { + "epoch": 0.7587620517751854, + "grad_norm": 11.310025215148926, + "learning_rate": 1.3949685452505885e-06, + "loss": 0.2686, + "step": 29984 + }, + { + "epoch": 0.758787357339879, + "grad_norm": 3.925114393234253, + "learning_rate": 1.394690330457501e-06, + "loss": 0.0741, + "step": 29985 + }, + { + "epoch": 0.7588126629045727, + "grad_norm": 4.90409517288208, + "learning_rate": 1.394412138914385e-06, + "loss": 0.1113, + "step": 29986 + }, + { + "epoch": 0.7588379684692664, + "grad_norm": 3.0750505924224854, + "learning_rate": 1.3941339706230346e-06, + "loss": 0.0721, + "step": 29987 + }, + { + "epoch": 0.75886327403396, + "grad_norm": 3.3044545650482178, + "learning_rate": 1.3938558255852418e-06, + "loss": 0.2185, + "step": 29988 + }, + { + "epoch": 0.7588885795986537, + "grad_norm": 10.8883695602417, + "learning_rate": 1.3935777038028036e-06, + "loss": 0.383, + "step": 29989 + }, + { + "epoch": 0.7589138851633475, + "grad_norm": 6.035935878753662, + "learning_rate": 1.3932996052775121e-06, + "loss": 0.192, + "step": 29990 + }, + { + "epoch": 0.7589391907280411, + "grad_norm": 2.273221015930176, + "learning_rate": 1.3930215300111599e-06, + "loss": 0.0961, + "step": 29991 + }, + { + "epoch": 0.7589644962927348, + "grad_norm": 3.697892904281616, + "learning_rate": 1.3927434780055398e-06, + "loss": 0.1369, + "step": 29992 + }, + { + "epoch": 0.7589898018574285, + "grad_norm": 14.865239143371582, + "learning_rate": 1.3924654492624478e-06, + "loss": 0.2359, + "step": 29993 + }, + { + "epoch": 0.7590151074221221, + "grad_norm": 5.522985458374023, + "learning_rate": 1.3921874437836747e-06, + "loss": 0.1293, + "step": 29994 + }, + { + "epoch": 0.7590404129868158, + "grad_norm": 2.880223274230957, + "learning_rate": 1.3919094615710138e-06, + "loss": 0.0926, + "step": 29995 + }, + { + "epoch": 0.7590657185515095, + "grad_norm": 6.328181266784668, + "learning_rate": 1.3916315026262562e-06, + "loss": 0.2399, + "step": 29996 + }, + { + "epoch": 0.7590910241162031, + "grad_norm": 3.8755173683166504, + "learning_rate": 1.3913535669511973e-06, + "loss": 0.1215, + "step": 29997 + }, + { + "epoch": 0.7591163296808968, + "grad_norm": 13.003067970275879, + "learning_rate": 1.3910756545476262e-06, + "loss": 0.1797, + "step": 29998 + }, + { + "epoch": 0.7591416352455905, + "grad_norm": 3.570439338684082, + "learning_rate": 1.3907977654173405e-06, + "loss": 0.1098, + "step": 29999 + }, + { + "epoch": 0.7591669408102841, + "grad_norm": 6.750303268432617, + "learning_rate": 1.3905198995621249e-06, + "loss": 0.2089, + "step": 30000 + }, + { + "epoch": 0.7591922463749778, + "grad_norm": 3.5318691730499268, + "learning_rate": 1.390242056983777e-06, + "loss": 0.1005, + "step": 30001 + }, + { + "epoch": 0.7592175519396716, + "grad_norm": 7.21565055847168, + "learning_rate": 1.389964237684085e-06, + "loss": 0.2458, + "step": 30002 + }, + { + "epoch": 0.7592428575043653, + "grad_norm": 6.46641731262207, + "learning_rate": 1.3896864416648453e-06, + "loss": 0.2256, + "step": 30003 + }, + { + "epoch": 0.7592681630690589, + "grad_norm": 5.717955112457275, + "learning_rate": 1.389408668927843e-06, + "loss": 0.1904, + "step": 30004 + }, + { + "epoch": 0.7592934686337526, + "grad_norm": 2.6669206619262695, + "learning_rate": 1.3891309194748742e-06, + "loss": 0.0661, + "step": 30005 + }, + { + "epoch": 0.7593187741984463, + "grad_norm": 5.400457382202148, + "learning_rate": 1.3888531933077282e-06, + "loss": 0.1613, + "step": 30006 + }, + { + "epoch": 0.7593440797631399, + "grad_norm": 7.129833221435547, + "learning_rate": 1.3885754904281946e-06, + "loss": 0.2117, + "step": 30007 + }, + { + "epoch": 0.7593693853278336, + "grad_norm": 12.70679759979248, + "learning_rate": 1.388297810838068e-06, + "loss": 0.1603, + "step": 30008 + }, + { + "epoch": 0.7593946908925273, + "grad_norm": 5.5156660079956055, + "learning_rate": 1.3880201545391364e-06, + "loss": 0.1265, + "step": 30009 + }, + { + "epoch": 0.7594199964572209, + "grad_norm": 2.638759136199951, + "learning_rate": 1.3877425215331914e-06, + "loss": 0.1306, + "step": 30010 + }, + { + "epoch": 0.7594453020219146, + "grad_norm": 5.649722099304199, + "learning_rate": 1.3874649118220206e-06, + "loss": 0.1424, + "step": 30011 + }, + { + "epoch": 0.7594706075866083, + "grad_norm": 7.015536785125732, + "learning_rate": 1.3871873254074191e-06, + "loss": 0.1619, + "step": 30012 + }, + { + "epoch": 0.7594959131513019, + "grad_norm": 8.125307083129883, + "learning_rate": 1.3869097622911742e-06, + "loss": 0.159, + "step": 30013 + }, + { + "epoch": 0.7595212187159956, + "grad_norm": 7.999654293060303, + "learning_rate": 1.3866322224750762e-06, + "loss": 0.2447, + "step": 30014 + }, + { + "epoch": 0.7595465242806894, + "grad_norm": 10.472103118896484, + "learning_rate": 1.3863547059609128e-06, + "loss": 0.2314, + "step": 30015 + }, + { + "epoch": 0.759571829845383, + "grad_norm": 13.870168685913086, + "learning_rate": 1.3860772127504795e-06, + "loss": 0.1926, + "step": 30016 + }, + { + "epoch": 0.7595971354100767, + "grad_norm": 6.3492326736450195, + "learning_rate": 1.3857997428455583e-06, + "loss": 0.197, + "step": 30017 + }, + { + "epoch": 0.7596224409747704, + "grad_norm": 11.03415298461914, + "learning_rate": 1.3855222962479442e-06, + "loss": 0.3528, + "step": 30018 + }, + { + "epoch": 0.759647746539464, + "grad_norm": 6.872442722320557, + "learning_rate": 1.385244872959423e-06, + "loss": 0.2052, + "step": 30019 + }, + { + "epoch": 0.7596730521041577, + "grad_norm": 2.091762065887451, + "learning_rate": 1.3849674729817864e-06, + "loss": 0.0569, + "step": 30020 + }, + { + "epoch": 0.7596983576688514, + "grad_norm": 2.7901813983917236, + "learning_rate": 1.3846900963168224e-06, + "loss": 0.0992, + "step": 30021 + }, + { + "epoch": 0.759723663233545, + "grad_norm": 4.104284286499023, + "learning_rate": 1.3844127429663196e-06, + "loss": 0.0959, + "step": 30022 + }, + { + "epoch": 0.7597489687982387, + "grad_norm": 3.266301393508911, + "learning_rate": 1.3841354129320644e-06, + "loss": 0.0616, + "step": 30023 + }, + { + "epoch": 0.7597742743629324, + "grad_norm": 6.2620344161987305, + "learning_rate": 1.3838581062158497e-06, + "loss": 0.1384, + "step": 30024 + }, + { + "epoch": 0.759799579927626, + "grad_norm": 16.77057456970215, + "learning_rate": 1.3835808228194608e-06, + "loss": 0.3442, + "step": 30025 + }, + { + "epoch": 0.7598248854923197, + "grad_norm": 5.358852863311768, + "learning_rate": 1.3833035627446861e-06, + "loss": 0.1354, + "step": 30026 + }, + { + "epoch": 0.7598501910570135, + "grad_norm": 4.382082462310791, + "learning_rate": 1.383026325993313e-06, + "loss": 0.1513, + "step": 30027 + }, + { + "epoch": 0.7598754966217071, + "grad_norm": 9.26698112487793, + "learning_rate": 1.3827491125671322e-06, + "loss": 0.1927, + "step": 30028 + }, + { + "epoch": 0.7599008021864008, + "grad_norm": 4.595071792602539, + "learning_rate": 1.3824719224679274e-06, + "loss": 0.1298, + "step": 30029 + }, + { + "epoch": 0.7599261077510945, + "grad_norm": 10.768871307373047, + "learning_rate": 1.3821947556974912e-06, + "loss": 0.2858, + "step": 30030 + }, + { + "epoch": 0.7599514133157882, + "grad_norm": 6.969090938568115, + "learning_rate": 1.3819176122576056e-06, + "loss": 0.161, + "step": 30031 + }, + { + "epoch": 0.7599767188804818, + "grad_norm": 1.6379501819610596, + "learning_rate": 1.3816404921500614e-06, + "loss": 0.0584, + "step": 30032 + }, + { + "epoch": 0.7600020244451755, + "grad_norm": 4.451643466949463, + "learning_rate": 1.3813633953766448e-06, + "loss": 0.1253, + "step": 30033 + }, + { + "epoch": 0.7600273300098692, + "grad_norm": 6.831229209899902, + "learning_rate": 1.3810863219391423e-06, + "loss": 0.1835, + "step": 30034 + }, + { + "epoch": 0.7600526355745628, + "grad_norm": 8.21273422241211, + "learning_rate": 1.3808092718393396e-06, + "loss": 0.2024, + "step": 30035 + }, + { + "epoch": 0.7600779411392565, + "grad_norm": 4.333217620849609, + "learning_rate": 1.380532245079026e-06, + "loss": 0.1339, + "step": 30036 + }, + { + "epoch": 0.7601032467039502, + "grad_norm": 5.976053237915039, + "learning_rate": 1.3802552416599868e-06, + "loss": 0.1487, + "step": 30037 + }, + { + "epoch": 0.7601285522686438, + "grad_norm": 11.071904182434082, + "learning_rate": 1.379978261584007e-06, + "loss": 0.1578, + "step": 30038 + }, + { + "epoch": 0.7601538578333376, + "grad_norm": 9.495376586914062, + "learning_rate": 1.3797013048528751e-06, + "loss": 0.1142, + "step": 30039 + }, + { + "epoch": 0.7601791633980313, + "grad_norm": 4.810251712799072, + "learning_rate": 1.3794243714683765e-06, + "loss": 0.1308, + "step": 30040 + }, + { + "epoch": 0.7602044689627249, + "grad_norm": 3.3005294799804688, + "learning_rate": 1.379147461432296e-06, + "loss": 0.1052, + "step": 30041 + }, + { + "epoch": 0.7602297745274186, + "grad_norm": 6.707903861999512, + "learning_rate": 1.3788705747464188e-06, + "loss": 0.1978, + "step": 30042 + }, + { + "epoch": 0.7602550800921123, + "grad_norm": 8.149085998535156, + "learning_rate": 1.3785937114125347e-06, + "loss": 0.1847, + "step": 30043 + }, + { + "epoch": 0.7602803856568059, + "grad_norm": 4.726919174194336, + "learning_rate": 1.3783168714324236e-06, + "loss": 0.1242, + "step": 30044 + }, + { + "epoch": 0.7603056912214996, + "grad_norm": 6.0618181228637695, + "learning_rate": 1.3780400548078742e-06, + "loss": 0.11, + "step": 30045 + }, + { + "epoch": 0.7603309967861933, + "grad_norm": 12.036791801452637, + "learning_rate": 1.3777632615406694e-06, + "loss": 0.2612, + "step": 30046 + }, + { + "epoch": 0.7603563023508869, + "grad_norm": 5.616870403289795, + "learning_rate": 1.3774864916325987e-06, + "loss": 0.1145, + "step": 30047 + }, + { + "epoch": 0.7603816079155806, + "grad_norm": 11.315939903259277, + "learning_rate": 1.3772097450854404e-06, + "loss": 0.2217, + "step": 30048 + }, + { + "epoch": 0.7604069134802743, + "grad_norm": 4.31645393371582, + "learning_rate": 1.376933021900984e-06, + "loss": 0.1145, + "step": 30049 + }, + { + "epoch": 0.7604322190449679, + "grad_norm": 6.402920246124268, + "learning_rate": 1.376656322081012e-06, + "loss": 0.2062, + "step": 30050 + }, + { + "epoch": 0.7604575246096617, + "grad_norm": 5.813189506530762, + "learning_rate": 1.3763796456273099e-06, + "loss": 0.1924, + "step": 30051 + }, + { + "epoch": 0.7604828301743554, + "grad_norm": 9.515312194824219, + "learning_rate": 1.376102992541662e-06, + "loss": 0.2345, + "step": 30052 + }, + { + "epoch": 0.760508135739049, + "grad_norm": 13.871101379394531, + "learning_rate": 1.3758263628258518e-06, + "loss": 0.2304, + "step": 30053 + }, + { + "epoch": 0.7605334413037427, + "grad_norm": 3.485180616378784, + "learning_rate": 1.3755497564816611e-06, + "loss": 0.127, + "step": 30054 + }, + { + "epoch": 0.7605587468684364, + "grad_norm": 2.750824451446533, + "learning_rate": 1.3752731735108782e-06, + "loss": 0.1059, + "step": 30055 + }, + { + "epoch": 0.7605840524331301, + "grad_norm": 5.358645915985107, + "learning_rate": 1.3749966139152838e-06, + "loss": 0.1747, + "step": 30056 + }, + { + "epoch": 0.7606093579978237, + "grad_norm": 3.441131353378296, + "learning_rate": 1.3747200776966618e-06, + "loss": 0.1144, + "step": 30057 + }, + { + "epoch": 0.7606346635625174, + "grad_norm": 3.8037784099578857, + "learning_rate": 1.3744435648567943e-06, + "loss": 0.1499, + "step": 30058 + }, + { + "epoch": 0.7606599691272111, + "grad_norm": 3.830737352371216, + "learning_rate": 1.3741670753974679e-06, + "loss": 0.1059, + "step": 30059 + }, + { + "epoch": 0.7606852746919047, + "grad_norm": 19.34886932373047, + "learning_rate": 1.3738906093204618e-06, + "loss": 0.2029, + "step": 30060 + }, + { + "epoch": 0.7607105802565984, + "grad_norm": 7.460481643676758, + "learning_rate": 1.3736141666275631e-06, + "loss": 0.226, + "step": 30061 + }, + { + "epoch": 0.7607358858212921, + "grad_norm": 2.9145452976226807, + "learning_rate": 1.3733377473205495e-06, + "loss": 0.1446, + "step": 30062 + }, + { + "epoch": 0.7607611913859857, + "grad_norm": 6.802550315856934, + "learning_rate": 1.3730613514012077e-06, + "loss": 0.2371, + "step": 30063 + }, + { + "epoch": 0.7607864969506795, + "grad_norm": 4.2596211433410645, + "learning_rate": 1.3727849788713188e-06, + "loss": 0.1789, + "step": 30064 + }, + { + "epoch": 0.7608118025153732, + "grad_norm": 4.8691725730896, + "learning_rate": 1.3725086297326645e-06, + "loss": 0.144, + "step": 30065 + }, + { + "epoch": 0.7608371080800668, + "grad_norm": 5.167383670806885, + "learning_rate": 1.3722323039870256e-06, + "loss": 0.1442, + "step": 30066 + }, + { + "epoch": 0.7608624136447605, + "grad_norm": 7.327727317810059, + "learning_rate": 1.3719560016361877e-06, + "loss": 0.2201, + "step": 30067 + }, + { + "epoch": 0.7608877192094542, + "grad_norm": 8.60284423828125, + "learning_rate": 1.3716797226819306e-06, + "loss": 0.3216, + "step": 30068 + }, + { + "epoch": 0.7609130247741478, + "grad_norm": 5.067413806915283, + "learning_rate": 1.3714034671260356e-06, + "loss": 0.1253, + "step": 30069 + }, + { + "epoch": 0.7609383303388415, + "grad_norm": 9.266792297363281, + "learning_rate": 1.3711272349702836e-06, + "loss": 0.2682, + "step": 30070 + }, + { + "epoch": 0.7609636359035352, + "grad_norm": 4.034411430358887, + "learning_rate": 1.3708510262164587e-06, + "loss": 0.1583, + "step": 30071 + }, + { + "epoch": 0.7609889414682288, + "grad_norm": 3.8825013637542725, + "learning_rate": 1.37057484086634e-06, + "loss": 0.1461, + "step": 30072 + }, + { + "epoch": 0.7610142470329225, + "grad_norm": 4.715259075164795, + "learning_rate": 1.3702986789217077e-06, + "loss": 0.168, + "step": 30073 + }, + { + "epoch": 0.7610395525976162, + "grad_norm": 4.082854270935059, + "learning_rate": 1.370022540384347e-06, + "loss": 0.0688, + "step": 30074 + }, + { + "epoch": 0.7610648581623098, + "grad_norm": 4.08089017868042, + "learning_rate": 1.3697464252560328e-06, + "loss": 0.1473, + "step": 30075 + }, + { + "epoch": 0.7610901637270036, + "grad_norm": 6.174797058105469, + "learning_rate": 1.3694703335385501e-06, + "loss": 0.1654, + "step": 30076 + }, + { + "epoch": 0.7611154692916973, + "grad_norm": 8.391603469848633, + "learning_rate": 1.3691942652336765e-06, + "loss": 0.2392, + "step": 30077 + }, + { + "epoch": 0.7611407748563909, + "grad_norm": 4.481688022613525, + "learning_rate": 1.3689182203431967e-06, + "loss": 0.1467, + "step": 30078 + }, + { + "epoch": 0.7611660804210846, + "grad_norm": 12.825135231018066, + "learning_rate": 1.3686421988688848e-06, + "loss": 0.2341, + "step": 30079 + }, + { + "epoch": 0.7611913859857783, + "grad_norm": 5.071383953094482, + "learning_rate": 1.3683662008125254e-06, + "loss": 0.1266, + "step": 30080 + }, + { + "epoch": 0.761216691550472, + "grad_norm": 4.233888626098633, + "learning_rate": 1.3680902261758955e-06, + "loss": 0.0728, + "step": 30081 + }, + { + "epoch": 0.7612419971151656, + "grad_norm": 6.383020877838135, + "learning_rate": 1.367814274960777e-06, + "loss": 0.1757, + "step": 30082 + }, + { + "epoch": 0.7612673026798593, + "grad_norm": 5.860448360443115, + "learning_rate": 1.367538347168949e-06, + "loss": 0.1608, + "step": 30083 + }, + { + "epoch": 0.761292608244553, + "grad_norm": 5.785380840301514, + "learning_rate": 1.3672624428021908e-06, + "loss": 0.2365, + "step": 30084 + }, + { + "epoch": 0.7613179138092466, + "grad_norm": 2.9976096153259277, + "learning_rate": 1.3669865618622796e-06, + "loss": 0.1093, + "step": 30085 + }, + { + "epoch": 0.7613432193739403, + "grad_norm": 5.031485557556152, + "learning_rate": 1.3667107043509976e-06, + "loss": 0.1393, + "step": 30086 + }, + { + "epoch": 0.761368524938634, + "grad_norm": 6.955157279968262, + "learning_rate": 1.3664348702701224e-06, + "loss": 0.1332, + "step": 30087 + }, + { + "epoch": 0.7613938305033277, + "grad_norm": 9.030234336853027, + "learning_rate": 1.3661590596214335e-06, + "loss": 0.2835, + "step": 30088 + }, + { + "epoch": 0.7614191360680214, + "grad_norm": 3.3554527759552, + "learning_rate": 1.3658832724067067e-06, + "loss": 0.1453, + "step": 30089 + }, + { + "epoch": 0.7614444416327151, + "grad_norm": 3.974539041519165, + "learning_rate": 1.3656075086277243e-06, + "loss": 0.1044, + "step": 30090 + }, + { + "epoch": 0.7614697471974087, + "grad_norm": 4.312560558319092, + "learning_rate": 1.3653317682862632e-06, + "loss": 0.1057, + "step": 30091 + }, + { + "epoch": 0.7614950527621024, + "grad_norm": 3.507458448410034, + "learning_rate": 1.3650560513841015e-06, + "loss": 0.0905, + "step": 30092 + }, + { + "epoch": 0.7615203583267961, + "grad_norm": 7.247068405151367, + "learning_rate": 1.364780357923015e-06, + "loss": 0.149, + "step": 30093 + }, + { + "epoch": 0.7615456638914897, + "grad_norm": 2.896653652191162, + "learning_rate": 1.3645046879047864e-06, + "loss": 0.0875, + "step": 30094 + }, + { + "epoch": 0.7615709694561834, + "grad_norm": 5.313344478607178, + "learning_rate": 1.3642290413311899e-06, + "loss": 0.1474, + "step": 30095 + }, + { + "epoch": 0.7615962750208771, + "grad_norm": 26.62557601928711, + "learning_rate": 1.3639534182040043e-06, + "loss": 0.2423, + "step": 30096 + }, + { + "epoch": 0.7616215805855707, + "grad_norm": 9.15718936920166, + "learning_rate": 1.3636778185250054e-06, + "loss": 0.212, + "step": 30097 + }, + { + "epoch": 0.7616468861502644, + "grad_norm": 6.478486061096191, + "learning_rate": 1.363402242295973e-06, + "loss": 0.1012, + "step": 30098 + }, + { + "epoch": 0.7616721917149581, + "grad_norm": 3.41287899017334, + "learning_rate": 1.3631266895186834e-06, + "loss": 0.1191, + "step": 30099 + }, + { + "epoch": 0.7616974972796517, + "grad_norm": 4.494869232177734, + "learning_rate": 1.3628511601949128e-06, + "loss": 0.1568, + "step": 30100 + }, + { + "epoch": 0.7617228028443455, + "grad_norm": 2.9569101333618164, + "learning_rate": 1.362575654326439e-06, + "loss": 0.1314, + "step": 30101 + }, + { + "epoch": 0.7617481084090392, + "grad_norm": 4.983511924743652, + "learning_rate": 1.3623001719150363e-06, + "loss": 0.1126, + "step": 30102 + }, + { + "epoch": 0.7617734139737328, + "grad_norm": 3.891007423400879, + "learning_rate": 1.3620247129624853e-06, + "loss": 0.1076, + "step": 30103 + }, + { + "epoch": 0.7617987195384265, + "grad_norm": 4.920602321624756, + "learning_rate": 1.361749277470558e-06, + "loss": 0.142, + "step": 30104 + }, + { + "epoch": 0.7618240251031202, + "grad_norm": 7.659127235412598, + "learning_rate": 1.3614738654410365e-06, + "loss": 0.1558, + "step": 30105 + }, + { + "epoch": 0.7618493306678139, + "grad_norm": 4.882556915283203, + "learning_rate": 1.3611984768756897e-06, + "loss": 0.1598, + "step": 30106 + }, + { + "epoch": 0.7618746362325075, + "grad_norm": 3.8194034099578857, + "learning_rate": 1.3609231117762983e-06, + "loss": 0.1186, + "step": 30107 + }, + { + "epoch": 0.7618999417972012, + "grad_norm": 6.108173370361328, + "learning_rate": 1.3606477701446358e-06, + "loss": 0.1841, + "step": 30108 + }, + { + "epoch": 0.7619252473618949, + "grad_norm": 4.396997928619385, + "learning_rate": 1.3603724519824818e-06, + "loss": 0.1021, + "step": 30109 + }, + { + "epoch": 0.7619505529265885, + "grad_norm": 8.63775634765625, + "learning_rate": 1.360097157291606e-06, + "loss": 0.2287, + "step": 30110 + }, + { + "epoch": 0.7619758584912822, + "grad_norm": 3.6015167236328125, + "learning_rate": 1.3598218860737877e-06, + "loss": 0.1457, + "step": 30111 + }, + { + "epoch": 0.762001164055976, + "grad_norm": 5.188172340393066, + "learning_rate": 1.3595466383307999e-06, + "loss": 0.182, + "step": 30112 + }, + { + "epoch": 0.7620264696206696, + "grad_norm": 17.518604278564453, + "learning_rate": 1.3592714140644197e-06, + "loss": 0.2074, + "step": 30113 + }, + { + "epoch": 0.7620517751853633, + "grad_norm": 4.262502193450928, + "learning_rate": 1.3589962132764211e-06, + "loss": 0.1746, + "step": 30114 + }, + { + "epoch": 0.762077080750057, + "grad_norm": 3.3336102962493896, + "learning_rate": 1.3587210359685788e-06, + "loss": 0.1648, + "step": 30115 + }, + { + "epoch": 0.7621023863147506, + "grad_norm": 4.073802471160889, + "learning_rate": 1.3584458821426655e-06, + "loss": 0.095, + "step": 30116 + }, + { + "epoch": 0.7621276918794443, + "grad_norm": 8.613723754882812, + "learning_rate": 1.358170751800459e-06, + "loss": 0.1619, + "step": 30117 + }, + { + "epoch": 0.762152997444138, + "grad_norm": 13.960658073425293, + "learning_rate": 1.3578956449437313e-06, + "loss": 0.2434, + "step": 30118 + }, + { + "epoch": 0.7621783030088316, + "grad_norm": 4.531970500946045, + "learning_rate": 1.3576205615742577e-06, + "loss": 0.1076, + "step": 30119 + }, + { + "epoch": 0.7622036085735253, + "grad_norm": 4.915998935699463, + "learning_rate": 1.3573455016938097e-06, + "loss": 0.1065, + "step": 30120 + }, + { + "epoch": 0.762228914138219, + "grad_norm": 5.295907497406006, + "learning_rate": 1.3570704653041649e-06, + "loss": 0.1595, + "step": 30121 + }, + { + "epoch": 0.7622542197029126, + "grad_norm": 8.225865364074707, + "learning_rate": 1.3567954524070947e-06, + "loss": 0.1645, + "step": 30122 + }, + { + "epoch": 0.7622795252676063, + "grad_norm": 6.2118916511535645, + "learning_rate": 1.3565204630043733e-06, + "loss": 0.1554, + "step": 30123 + }, + { + "epoch": 0.7623048308323, + "grad_norm": 15.336209297180176, + "learning_rate": 1.3562454970977718e-06, + "loss": 0.1997, + "step": 30124 + }, + { + "epoch": 0.7623301363969937, + "grad_norm": 4.201761722564697, + "learning_rate": 1.3559705546890672e-06, + "loss": 0.1612, + "step": 30125 + }, + { + "epoch": 0.7623554419616874, + "grad_norm": 23.990955352783203, + "learning_rate": 1.3556956357800305e-06, + "loss": 0.3061, + "step": 30126 + }, + { + "epoch": 0.7623807475263811, + "grad_norm": 2.689000368118286, + "learning_rate": 1.3554207403724345e-06, + "loss": 0.1265, + "step": 30127 + }, + { + "epoch": 0.7624060530910747, + "grad_norm": 3.0419671535491943, + "learning_rate": 1.3551458684680523e-06, + "loss": 0.0943, + "step": 30128 + }, + { + "epoch": 0.7624313586557684, + "grad_norm": 5.030782699584961, + "learning_rate": 1.3548710200686548e-06, + "loss": 0.2193, + "step": 30129 + }, + { + "epoch": 0.7624566642204621, + "grad_norm": 4.2611188888549805, + "learning_rate": 1.3545961951760179e-06, + "loss": 0.1629, + "step": 30130 + }, + { + "epoch": 0.7624819697851558, + "grad_norm": 9.635579109191895, + "learning_rate": 1.354321393791912e-06, + "loss": 0.1948, + "step": 30131 + }, + { + "epoch": 0.7625072753498494, + "grad_norm": 8.335221290588379, + "learning_rate": 1.3540466159181086e-06, + "loss": 0.145, + "step": 30132 + }, + { + "epoch": 0.7625325809145431, + "grad_norm": 5.954234600067139, + "learning_rate": 1.35377186155638e-06, + "loss": 0.1504, + "step": 30133 + }, + { + "epoch": 0.7625578864792368, + "grad_norm": 2.907682418823242, + "learning_rate": 1.3534971307084992e-06, + "loss": 0.1122, + "step": 30134 + }, + { + "epoch": 0.7625831920439304, + "grad_norm": 13.682119369506836, + "learning_rate": 1.3532224233762352e-06, + "loss": 0.1553, + "step": 30135 + }, + { + "epoch": 0.7626084976086241, + "grad_norm": 4.576609134674072, + "learning_rate": 1.3529477395613654e-06, + "loss": 0.1369, + "step": 30136 + }, + { + "epoch": 0.7626338031733179, + "grad_norm": 3.15212345123291, + "learning_rate": 1.352673079265654e-06, + "loss": 0.1272, + "step": 30137 + }, + { + "epoch": 0.7626591087380115, + "grad_norm": 5.95966911315918, + "learning_rate": 1.3523984424908766e-06, + "loss": 0.0844, + "step": 30138 + }, + { + "epoch": 0.7626844143027052, + "grad_norm": 8.879777908325195, + "learning_rate": 1.3521238292388018e-06, + "loss": 0.2097, + "step": 30139 + }, + { + "epoch": 0.7627097198673989, + "grad_norm": 21.258329391479492, + "learning_rate": 1.3518492395112048e-06, + "loss": 0.3407, + "step": 30140 + }, + { + "epoch": 0.7627350254320925, + "grad_norm": 3.6939821243286133, + "learning_rate": 1.3515746733098506e-06, + "loss": 0.1204, + "step": 30141 + }, + { + "epoch": 0.7627603309967862, + "grad_norm": 4.58354377746582, + "learning_rate": 1.3513001306365136e-06, + "loss": 0.1695, + "step": 30142 + }, + { + "epoch": 0.7627856365614799, + "grad_norm": 3.8387529850006104, + "learning_rate": 1.3510256114929627e-06, + "loss": 0.1456, + "step": 30143 + }, + { + "epoch": 0.7628109421261735, + "grad_norm": 3.294260263442993, + "learning_rate": 1.3507511158809717e-06, + "loss": 0.1082, + "step": 30144 + }, + { + "epoch": 0.7628362476908672, + "grad_norm": 4.256945610046387, + "learning_rate": 1.3504766438023042e-06, + "loss": 0.1298, + "step": 30145 + }, + { + "epoch": 0.7628615532555609, + "grad_norm": 5.5507683753967285, + "learning_rate": 1.3502021952587364e-06, + "loss": 0.1288, + "step": 30146 + }, + { + "epoch": 0.7628868588202545, + "grad_norm": 3.1777501106262207, + "learning_rate": 1.3499277702520335e-06, + "loss": 0.0846, + "step": 30147 + }, + { + "epoch": 0.7629121643849482, + "grad_norm": 14.80928897857666, + "learning_rate": 1.3496533687839692e-06, + "loss": 0.1521, + "step": 30148 + }, + { + "epoch": 0.762937469949642, + "grad_norm": 9.853348731994629, + "learning_rate": 1.3493789908563115e-06, + "loss": 0.1494, + "step": 30149 + }, + { + "epoch": 0.7629627755143356, + "grad_norm": 3.692044258117676, + "learning_rate": 1.3491046364708294e-06, + "loss": 0.1571, + "step": 30150 + }, + { + "epoch": 0.7629880810790293, + "grad_norm": 5.226554870605469, + "learning_rate": 1.348830305629291e-06, + "loss": 0.1637, + "step": 30151 + }, + { + "epoch": 0.763013386643723, + "grad_norm": 7.622040748596191, + "learning_rate": 1.3485559983334683e-06, + "loss": 0.1424, + "step": 30152 + }, + { + "epoch": 0.7630386922084166, + "grad_norm": 5.455571174621582, + "learning_rate": 1.3482817145851291e-06, + "loss": 0.154, + "step": 30153 + }, + { + "epoch": 0.7630639977731103, + "grad_norm": 9.198280334472656, + "learning_rate": 1.3480074543860412e-06, + "loss": 0.2574, + "step": 30154 + }, + { + "epoch": 0.763089303337804, + "grad_norm": 4.627713680267334, + "learning_rate": 1.3477332177379742e-06, + "loss": 0.0683, + "step": 30155 + }, + { + "epoch": 0.7631146089024976, + "grad_norm": 10.632755279541016, + "learning_rate": 1.3474590046426945e-06, + "loss": 0.2474, + "step": 30156 + }, + { + "epoch": 0.7631399144671913, + "grad_norm": 4.633130073547363, + "learning_rate": 1.347184815101974e-06, + "loss": 0.1909, + "step": 30157 + }, + { + "epoch": 0.763165220031885, + "grad_norm": 6.880388259887695, + "learning_rate": 1.3469106491175794e-06, + "loss": 0.1675, + "step": 30158 + }, + { + "epoch": 0.7631905255965787, + "grad_norm": 6.133847713470459, + "learning_rate": 1.3466365066912784e-06, + "loss": 0.1335, + "step": 30159 + }, + { + "epoch": 0.7632158311612723, + "grad_norm": 3.43493390083313, + "learning_rate": 1.3463623878248373e-06, + "loss": 0.1061, + "step": 30160 + }, + { + "epoch": 0.763241136725966, + "grad_norm": 10.01072883605957, + "learning_rate": 1.346088292520027e-06, + "loss": 0.1974, + "step": 30161 + }, + { + "epoch": 0.7632664422906598, + "grad_norm": 3.8513355255126953, + "learning_rate": 1.3458142207786135e-06, + "loss": 0.084, + "step": 30162 + }, + { + "epoch": 0.7632917478553534, + "grad_norm": 3.208162784576416, + "learning_rate": 1.3455401726023648e-06, + "loss": 0.1208, + "step": 30163 + }, + { + "epoch": 0.7633170534200471, + "grad_norm": 4.470770835876465, + "learning_rate": 1.3452661479930457e-06, + "loss": 0.1138, + "step": 30164 + }, + { + "epoch": 0.7633423589847408, + "grad_norm": 4.096888542175293, + "learning_rate": 1.3449921469524275e-06, + "loss": 0.1496, + "step": 30165 + }, + { + "epoch": 0.7633676645494344, + "grad_norm": 4.867549419403076, + "learning_rate": 1.344718169482273e-06, + "loss": 0.1477, + "step": 30166 + }, + { + "epoch": 0.7633929701141281, + "grad_norm": 6.356838226318359, + "learning_rate": 1.3444442155843547e-06, + "loss": 0.1877, + "step": 30167 + }, + { + "epoch": 0.7634182756788218, + "grad_norm": 6.067610740661621, + "learning_rate": 1.3441702852604322e-06, + "loss": 0.1618, + "step": 30168 + }, + { + "epoch": 0.7634435812435154, + "grad_norm": 4.214019775390625, + "learning_rate": 1.3438963785122772e-06, + "loss": 0.1121, + "step": 30169 + }, + { + "epoch": 0.7634688868082091, + "grad_norm": 5.065121650695801, + "learning_rate": 1.3436224953416528e-06, + "loss": 0.1813, + "step": 30170 + }, + { + "epoch": 0.7634941923729028, + "grad_norm": 4.248684883117676, + "learning_rate": 1.3433486357503296e-06, + "loss": 0.1615, + "step": 30171 + }, + { + "epoch": 0.7635194979375964, + "grad_norm": 5.602365970611572, + "learning_rate": 1.3430747997400678e-06, + "loss": 0.1288, + "step": 30172 + }, + { + "epoch": 0.7635448035022901, + "grad_norm": 5.2550249099731445, + "learning_rate": 1.3428009873126386e-06, + "loss": 0.1005, + "step": 30173 + }, + { + "epoch": 0.7635701090669839, + "grad_norm": 4.803101539611816, + "learning_rate": 1.3425271984698035e-06, + "loss": 0.1184, + "step": 30174 + }, + { + "epoch": 0.7635954146316775, + "grad_norm": 4.902946949005127, + "learning_rate": 1.342253433213333e-06, + "loss": 0.0983, + "step": 30175 + }, + { + "epoch": 0.7636207201963712, + "grad_norm": 3.6076693534851074, + "learning_rate": 1.3419796915449867e-06, + "loss": 0.1878, + "step": 30176 + }, + { + "epoch": 0.7636460257610649, + "grad_norm": 7.346487045288086, + "learning_rate": 1.341705973466535e-06, + "loss": 0.2519, + "step": 30177 + }, + { + "epoch": 0.7636713313257585, + "grad_norm": 9.234368324279785, + "learning_rate": 1.341432278979739e-06, + "loss": 0.1699, + "step": 30178 + }, + { + "epoch": 0.7636966368904522, + "grad_norm": 2.568598985671997, + "learning_rate": 1.341158608086367e-06, + "loss": 0.1331, + "step": 30179 + }, + { + "epoch": 0.7637219424551459, + "grad_norm": 3.0661747455596924, + "learning_rate": 1.340884960788183e-06, + "loss": 0.097, + "step": 30180 + }, + { + "epoch": 0.7637472480198395, + "grad_norm": 4.048964500427246, + "learning_rate": 1.3406113370869512e-06, + "loss": 0.1334, + "step": 30181 + }, + { + "epoch": 0.7637725535845332, + "grad_norm": 6.623339653015137, + "learning_rate": 1.3403377369844344e-06, + "loss": 0.2327, + "step": 30182 + }, + { + "epoch": 0.7637978591492269, + "grad_norm": 6.7876811027526855, + "learning_rate": 1.3400641604824006e-06, + "loss": 0.1767, + "step": 30183 + }, + { + "epoch": 0.7638231647139206, + "grad_norm": 5.373947620391846, + "learning_rate": 1.3397906075826123e-06, + "loss": 0.1366, + "step": 30184 + }, + { + "epoch": 0.7638484702786142, + "grad_norm": 6.382381439208984, + "learning_rate": 1.3395170782868333e-06, + "loss": 0.1822, + "step": 30185 + }, + { + "epoch": 0.763873775843308, + "grad_norm": 5.686990737915039, + "learning_rate": 1.3392435725968278e-06, + "loss": 0.1937, + "step": 30186 + }, + { + "epoch": 0.7638990814080017, + "grad_norm": 4.492466926574707, + "learning_rate": 1.3389700905143577e-06, + "loss": 0.1545, + "step": 30187 + }, + { + "epoch": 0.7639243869726953, + "grad_norm": 3.8437788486480713, + "learning_rate": 1.3386966320411904e-06, + "loss": 0.1382, + "step": 30188 + }, + { + "epoch": 0.763949692537389, + "grad_norm": 3.7162275314331055, + "learning_rate": 1.3384231971790873e-06, + "loss": 0.1304, + "step": 30189 + }, + { + "epoch": 0.7639749981020827, + "grad_norm": 3.8412859439849854, + "learning_rate": 1.338149785929812e-06, + "loss": 0.1119, + "step": 30190 + }, + { + "epoch": 0.7640003036667763, + "grad_norm": 3.5077812671661377, + "learning_rate": 1.337876398295126e-06, + "loss": 0.0983, + "step": 30191 + }, + { + "epoch": 0.76402560923147, + "grad_norm": 3.3324949741363525, + "learning_rate": 1.3376030342767954e-06, + "loss": 0.1428, + "step": 30192 + }, + { + "epoch": 0.7640509147961637, + "grad_norm": 18.883129119873047, + "learning_rate": 1.3373296938765818e-06, + "loss": 0.1433, + "step": 30193 + }, + { + "epoch": 0.7640762203608573, + "grad_norm": 3.818438768386841, + "learning_rate": 1.3370563770962474e-06, + "loss": 0.1126, + "step": 30194 + }, + { + "epoch": 0.764101525925551, + "grad_norm": 2.712475538253784, + "learning_rate": 1.3367830839375534e-06, + "loss": 0.1195, + "step": 30195 + }, + { + "epoch": 0.7641268314902447, + "grad_norm": 4.97144079208374, + "learning_rate": 1.336509814402266e-06, + "loss": 0.1349, + "step": 30196 + }, + { + "epoch": 0.7641521370549383, + "grad_norm": 4.179315090179443, + "learning_rate": 1.3362365684921447e-06, + "loss": 0.1255, + "step": 30197 + }, + { + "epoch": 0.764177442619632, + "grad_norm": 2.275294780731201, + "learning_rate": 1.3359633462089527e-06, + "loss": 0.0476, + "step": 30198 + }, + { + "epoch": 0.7642027481843258, + "grad_norm": 4.702985763549805, + "learning_rate": 1.33569014755445e-06, + "loss": 0.1326, + "step": 30199 + }, + { + "epoch": 0.7642280537490194, + "grad_norm": 4.422224998474121, + "learning_rate": 1.3354169725304011e-06, + "loss": 0.1726, + "step": 30200 + }, + { + "epoch": 0.7642533593137131, + "grad_norm": 5.421978950500488, + "learning_rate": 1.3351438211385652e-06, + "loss": 0.1441, + "step": 30201 + }, + { + "epoch": 0.7642786648784068, + "grad_norm": 13.909818649291992, + "learning_rate": 1.334870693380708e-06, + "loss": 0.2907, + "step": 30202 + }, + { + "epoch": 0.7643039704431004, + "grad_norm": 3.4193127155303955, + "learning_rate": 1.334597589258585e-06, + "loss": 0.1968, + "step": 30203 + }, + { + "epoch": 0.7643292760077941, + "grad_norm": 6.8312883377075195, + "learning_rate": 1.334324508773962e-06, + "loss": 0.1762, + "step": 30204 + }, + { + "epoch": 0.7643545815724878, + "grad_norm": 6.329726219177246, + "learning_rate": 1.334051451928597e-06, + "loss": 0.1841, + "step": 30205 + }, + { + "epoch": 0.7643798871371814, + "grad_norm": 4.4510908126831055, + "learning_rate": 1.3337784187242548e-06, + "loss": 0.1616, + "step": 30206 + }, + { + "epoch": 0.7644051927018751, + "grad_norm": 3.1850574016571045, + "learning_rate": 1.3335054091626903e-06, + "loss": 0.0813, + "step": 30207 + }, + { + "epoch": 0.7644304982665688, + "grad_norm": 5.3694939613342285, + "learning_rate": 1.3332324232456694e-06, + "loss": 0.2063, + "step": 30208 + }, + { + "epoch": 0.7644558038312625, + "grad_norm": 5.608915328979492, + "learning_rate": 1.3329594609749492e-06, + "loss": 0.1717, + "step": 30209 + }, + { + "epoch": 0.7644811093959561, + "grad_norm": 7.233730316162109, + "learning_rate": 1.3326865223522928e-06, + "loss": 0.2048, + "step": 30210 + }, + { + "epoch": 0.7645064149606499, + "grad_norm": 6.288007736206055, + "learning_rate": 1.3324136073794585e-06, + "loss": 0.1755, + "step": 30211 + }, + { + "epoch": 0.7645317205253436, + "grad_norm": 6.187211513519287, + "learning_rate": 1.3321407160582066e-06, + "loss": 0.2515, + "step": 30212 + }, + { + "epoch": 0.7645570260900372, + "grad_norm": 4.652383804321289, + "learning_rate": 1.3318678483902975e-06, + "loss": 0.1278, + "step": 30213 + }, + { + "epoch": 0.7645823316547309, + "grad_norm": 6.719488620758057, + "learning_rate": 1.3315950043774882e-06, + "loss": 0.1687, + "step": 30214 + }, + { + "epoch": 0.7646076372194246, + "grad_norm": 7.888631820678711, + "learning_rate": 1.3313221840215422e-06, + "loss": 0.2497, + "step": 30215 + }, + { + "epoch": 0.7646329427841182, + "grad_norm": 3.145418167114258, + "learning_rate": 1.331049387324217e-06, + "loss": 0.1803, + "step": 30216 + }, + { + "epoch": 0.7646582483488119, + "grad_norm": 3.231973886489868, + "learning_rate": 1.3307766142872713e-06, + "loss": 0.1536, + "step": 30217 + }, + { + "epoch": 0.7646835539135056, + "grad_norm": 3.49798846244812, + "learning_rate": 1.3305038649124642e-06, + "loss": 0.1096, + "step": 30218 + }, + { + "epoch": 0.7647088594781992, + "grad_norm": 5.372463226318359, + "learning_rate": 1.3302311392015555e-06, + "loss": 0.1626, + "step": 30219 + }, + { + "epoch": 0.7647341650428929, + "grad_norm": 8.913844108581543, + "learning_rate": 1.329958437156304e-06, + "loss": 0.1808, + "step": 30220 + }, + { + "epoch": 0.7647594706075866, + "grad_norm": 4.599330902099609, + "learning_rate": 1.3296857587784683e-06, + "loss": 0.0788, + "step": 30221 + }, + { + "epoch": 0.7647847761722802, + "grad_norm": 8.927407264709473, + "learning_rate": 1.3294131040698044e-06, + "loss": 0.1151, + "step": 30222 + }, + { + "epoch": 0.764810081736974, + "grad_norm": 7.36534309387207, + "learning_rate": 1.3291404730320745e-06, + "loss": 0.1715, + "step": 30223 + }, + { + "epoch": 0.7648353873016677, + "grad_norm": 12.687572479248047, + "learning_rate": 1.3288678656670345e-06, + "loss": 0.2382, + "step": 30224 + }, + { + "epoch": 0.7648606928663613, + "grad_norm": 3.892967939376831, + "learning_rate": 1.3285952819764424e-06, + "loss": 0.1584, + "step": 30225 + }, + { + "epoch": 0.764885998431055, + "grad_norm": 2.964460849761963, + "learning_rate": 1.3283227219620554e-06, + "loss": 0.1032, + "step": 30226 + }, + { + "epoch": 0.7649113039957487, + "grad_norm": 5.979194641113281, + "learning_rate": 1.3280501856256339e-06, + "loss": 0.1307, + "step": 30227 + }, + { + "epoch": 0.7649366095604423, + "grad_norm": 3.7849347591400146, + "learning_rate": 1.3277776729689334e-06, + "loss": 0.1507, + "step": 30228 + }, + { + "epoch": 0.764961915125136, + "grad_norm": 3.888026237487793, + "learning_rate": 1.3275051839937114e-06, + "loss": 0.1679, + "step": 30229 + }, + { + "epoch": 0.7649872206898297, + "grad_norm": 2.5601882934570312, + "learning_rate": 1.3272327187017243e-06, + "loss": 0.1137, + "step": 30230 + }, + { + "epoch": 0.7650125262545233, + "grad_norm": 2.7374963760375977, + "learning_rate": 1.3269602770947315e-06, + "loss": 0.1258, + "step": 30231 + }, + { + "epoch": 0.765037831819217, + "grad_norm": 5.265979766845703, + "learning_rate": 1.326687859174487e-06, + "loss": 0.163, + "step": 30232 + }, + { + "epoch": 0.7650631373839107, + "grad_norm": 2.9199132919311523, + "learning_rate": 1.3264154649427524e-06, + "loss": 0.118, + "step": 30233 + }, + { + "epoch": 0.7650884429486045, + "grad_norm": 12.038946151733398, + "learning_rate": 1.3261430944012782e-06, + "loss": 0.1457, + "step": 30234 + }, + { + "epoch": 0.765113748513298, + "grad_norm": 6.747903347015381, + "learning_rate": 1.3258707475518252e-06, + "loss": 0.1849, + "step": 30235 + }, + { + "epoch": 0.7651390540779918, + "grad_norm": 6.117752552032471, + "learning_rate": 1.3255984243961468e-06, + "loss": 0.1067, + "step": 30236 + }, + { + "epoch": 0.7651643596426855, + "grad_norm": 5.938873291015625, + "learning_rate": 1.3253261249360033e-06, + "loss": 0.2023, + "step": 30237 + }, + { + "epoch": 0.7651896652073791, + "grad_norm": 8.130380630493164, + "learning_rate": 1.3250538491731458e-06, + "loss": 0.187, + "step": 30238 + }, + { + "epoch": 0.7652149707720728, + "grad_norm": 1.954384207725525, + "learning_rate": 1.3247815971093332e-06, + "loss": 0.058, + "step": 30239 + }, + { + "epoch": 0.7652402763367665, + "grad_norm": 3.829789161682129, + "learning_rate": 1.3245093687463206e-06, + "loss": 0.1358, + "step": 30240 + }, + { + "epoch": 0.7652655819014601, + "grad_norm": 3.9011921882629395, + "learning_rate": 1.324237164085862e-06, + "loss": 0.0696, + "step": 30241 + }, + { + "epoch": 0.7652908874661538, + "grad_norm": 6.869804859161377, + "learning_rate": 1.3239649831297158e-06, + "loss": 0.2242, + "step": 30242 + }, + { + "epoch": 0.7653161930308475, + "grad_norm": 5.953314781188965, + "learning_rate": 1.3236928258796355e-06, + "loss": 0.1383, + "step": 30243 + }, + { + "epoch": 0.7653414985955411, + "grad_norm": 4.975370407104492, + "learning_rate": 1.3234206923373766e-06, + "loss": 0.1329, + "step": 30244 + }, + { + "epoch": 0.7653668041602348, + "grad_norm": 9.176390647888184, + "learning_rate": 1.323148582504692e-06, + "loss": 0.183, + "step": 30245 + }, + { + "epoch": 0.7653921097249285, + "grad_norm": 8.914770126342773, + "learning_rate": 1.3228764963833402e-06, + "loss": 0.2022, + "step": 30246 + }, + { + "epoch": 0.7654174152896221, + "grad_norm": 13.58055305480957, + "learning_rate": 1.3226044339750731e-06, + "loss": 0.1605, + "step": 30247 + }, + { + "epoch": 0.7654427208543159, + "grad_norm": 22.974660873413086, + "learning_rate": 1.3223323952816464e-06, + "loss": 0.2395, + "step": 30248 + }, + { + "epoch": 0.7654680264190096, + "grad_norm": 8.846654891967773, + "learning_rate": 1.3220603803048127e-06, + "loss": 0.2071, + "step": 30249 + }, + { + "epoch": 0.7654933319837032, + "grad_norm": 3.1924262046813965, + "learning_rate": 1.32178838904633e-06, + "loss": 0.091, + "step": 30250 + }, + { + "epoch": 0.7655186375483969, + "grad_norm": 3.241584539413452, + "learning_rate": 1.3215164215079467e-06, + "loss": 0.0812, + "step": 30251 + }, + { + "epoch": 0.7655439431130906, + "grad_norm": 7.430827617645264, + "learning_rate": 1.3212444776914218e-06, + "loss": 0.1323, + "step": 30252 + }, + { + "epoch": 0.7655692486777842, + "grad_norm": 2.8600075244903564, + "learning_rate": 1.320972557598505e-06, + "loss": 0.1712, + "step": 30253 + }, + { + "epoch": 0.7655945542424779, + "grad_norm": 3.3888792991638184, + "learning_rate": 1.3207006612309536e-06, + "loss": 0.1118, + "step": 30254 + }, + { + "epoch": 0.7656198598071716, + "grad_norm": 8.298808097839355, + "learning_rate": 1.3204287885905193e-06, + "loss": 0.2253, + "step": 30255 + }, + { + "epoch": 0.7656451653718652, + "grad_norm": 10.651803016662598, + "learning_rate": 1.3201569396789548e-06, + "loss": 0.3399, + "step": 30256 + }, + { + "epoch": 0.7656704709365589, + "grad_norm": 4.0446062088012695, + "learning_rate": 1.3198851144980124e-06, + "loss": 0.1536, + "step": 30257 + }, + { + "epoch": 0.7656957765012526, + "grad_norm": 11.066262245178223, + "learning_rate": 1.3196133130494482e-06, + "loss": 0.1767, + "step": 30258 + }, + { + "epoch": 0.7657210820659464, + "grad_norm": 6.542158126831055, + "learning_rate": 1.319341535335013e-06, + "loss": 0.1474, + "step": 30259 + }, + { + "epoch": 0.76574638763064, + "grad_norm": 8.082609176635742, + "learning_rate": 1.3190697813564596e-06, + "loss": 0.1895, + "step": 30260 + }, + { + "epoch": 0.7657716931953337, + "grad_norm": 9.812895774841309, + "learning_rate": 1.3187980511155385e-06, + "loss": 0.1926, + "step": 30261 + }, + { + "epoch": 0.7657969987600274, + "grad_norm": 5.39713716506958, + "learning_rate": 1.3185263446140062e-06, + "loss": 0.1269, + "step": 30262 + }, + { + "epoch": 0.765822304324721, + "grad_norm": 6.186629772186279, + "learning_rate": 1.3182546618536107e-06, + "loss": 0.1394, + "step": 30263 + }, + { + "epoch": 0.7658476098894147, + "grad_norm": 4.962878704071045, + "learning_rate": 1.317983002836109e-06, + "loss": 0.1288, + "step": 30264 + }, + { + "epoch": 0.7658729154541084, + "grad_norm": 8.99129581451416, + "learning_rate": 1.3177113675632474e-06, + "loss": 0.1889, + "step": 30265 + }, + { + "epoch": 0.765898221018802, + "grad_norm": 3.4977216720581055, + "learning_rate": 1.3174397560367812e-06, + "loss": 0.0739, + "step": 30266 + }, + { + "epoch": 0.7659235265834957, + "grad_norm": 3.9491801261901855, + "learning_rate": 1.3171681682584603e-06, + "loss": 0.1583, + "step": 30267 + }, + { + "epoch": 0.7659488321481894, + "grad_norm": 2.5909488201141357, + "learning_rate": 1.3168966042300391e-06, + "loss": 0.1021, + "step": 30268 + }, + { + "epoch": 0.765974137712883, + "grad_norm": 4.277388095855713, + "learning_rate": 1.316625063953264e-06, + "loss": 0.1259, + "step": 30269 + }, + { + "epoch": 0.7659994432775767, + "grad_norm": 12.087370872497559, + "learning_rate": 1.31635354742989e-06, + "loss": 0.1605, + "step": 30270 + }, + { + "epoch": 0.7660247488422705, + "grad_norm": 5.642641067504883, + "learning_rate": 1.3160820546616665e-06, + "loss": 0.1424, + "step": 30271 + }, + { + "epoch": 0.766050054406964, + "grad_norm": 4.5812201499938965, + "learning_rate": 1.315810585650345e-06, + "loss": 0.1758, + "step": 30272 + }, + { + "epoch": 0.7660753599716578, + "grad_norm": 8.078925132751465, + "learning_rate": 1.3155391403976742e-06, + "loss": 0.1895, + "step": 30273 + }, + { + "epoch": 0.7661006655363515, + "grad_norm": 5.181488037109375, + "learning_rate": 1.3152677189054074e-06, + "loss": 0.1364, + "step": 30274 + }, + { + "epoch": 0.7661259711010451, + "grad_norm": 3.6719515323638916, + "learning_rate": 1.3149963211752937e-06, + "loss": 0.1989, + "step": 30275 + }, + { + "epoch": 0.7661512766657388, + "grad_norm": 23.4056339263916, + "learning_rate": 1.3147249472090817e-06, + "loss": 0.2118, + "step": 30276 + }, + { + "epoch": 0.7661765822304325, + "grad_norm": 2.0775887966156006, + "learning_rate": 1.3144535970085264e-06, + "loss": 0.1019, + "step": 30277 + }, + { + "epoch": 0.7662018877951261, + "grad_norm": 6.651355266571045, + "learning_rate": 1.314182270575371e-06, + "loss": 0.1422, + "step": 30278 + }, + { + "epoch": 0.7662271933598198, + "grad_norm": 6.670481204986572, + "learning_rate": 1.3139109679113699e-06, + "loss": 0.1318, + "step": 30279 + }, + { + "epoch": 0.7662524989245135, + "grad_norm": 19.578798294067383, + "learning_rate": 1.3136396890182702e-06, + "loss": 0.3939, + "step": 30280 + }, + { + "epoch": 0.7662778044892071, + "grad_norm": 5.315197467803955, + "learning_rate": 1.3133684338978252e-06, + "loss": 0.1798, + "step": 30281 + }, + { + "epoch": 0.7663031100539008, + "grad_norm": 3.7348151206970215, + "learning_rate": 1.3130972025517784e-06, + "loss": 0.183, + "step": 30282 + }, + { + "epoch": 0.7663284156185945, + "grad_norm": 9.04285717010498, + "learning_rate": 1.3128259949818834e-06, + "loss": 0.2464, + "step": 30283 + }, + { + "epoch": 0.7663537211832882, + "grad_norm": 4.313426971435547, + "learning_rate": 1.3125548111898862e-06, + "loss": 0.1946, + "step": 30284 + }, + { + "epoch": 0.7663790267479819, + "grad_norm": 5.421430587768555, + "learning_rate": 1.312283651177539e-06, + "loss": 0.1375, + "step": 30285 + }, + { + "epoch": 0.7664043323126756, + "grad_norm": 11.501136779785156, + "learning_rate": 1.3120125149465879e-06, + "loss": 0.2518, + "step": 30286 + }, + { + "epoch": 0.7664296378773693, + "grad_norm": 3.772355318069458, + "learning_rate": 1.3117414024987823e-06, + "loss": 0.1832, + "step": 30287 + }, + { + "epoch": 0.7664549434420629, + "grad_norm": 3.8457391262054443, + "learning_rate": 1.3114703138358687e-06, + "loss": 0.1176, + "step": 30288 + }, + { + "epoch": 0.7664802490067566, + "grad_norm": 2.4593427181243896, + "learning_rate": 1.3111992489595986e-06, + "loss": 0.0907, + "step": 30289 + }, + { + "epoch": 0.7665055545714503, + "grad_norm": 5.538488388061523, + "learning_rate": 1.3109282078717184e-06, + "loss": 0.1587, + "step": 30290 + }, + { + "epoch": 0.7665308601361439, + "grad_norm": 14.906296730041504, + "learning_rate": 1.3106571905739761e-06, + "loss": 0.257, + "step": 30291 + }, + { + "epoch": 0.7665561657008376, + "grad_norm": 4.328212261199951, + "learning_rate": 1.3103861970681175e-06, + "loss": 0.1346, + "step": 30292 + }, + { + "epoch": 0.7665814712655313, + "grad_norm": 3.276731014251709, + "learning_rate": 1.3101152273558936e-06, + "loss": 0.1417, + "step": 30293 + }, + { + "epoch": 0.7666067768302249, + "grad_norm": 10.393706321716309, + "learning_rate": 1.3098442814390489e-06, + "loss": 0.1983, + "step": 30294 + }, + { + "epoch": 0.7666320823949186, + "grad_norm": 11.86452579498291, + "learning_rate": 1.309573359319335e-06, + "loss": 0.2369, + "step": 30295 + }, + { + "epoch": 0.7666573879596124, + "grad_norm": 4.685863971710205, + "learning_rate": 1.3093024609984927e-06, + "loss": 0.136, + "step": 30296 + }, + { + "epoch": 0.766682693524306, + "grad_norm": 3.4527316093444824, + "learning_rate": 1.3090315864782743e-06, + "loss": 0.136, + "step": 30297 + }, + { + "epoch": 0.7667079990889997, + "grad_norm": 3.5318520069122314, + "learning_rate": 1.3087607357604243e-06, + "loss": 0.0856, + "step": 30298 + }, + { + "epoch": 0.7667333046536934, + "grad_norm": 6.067449569702148, + "learning_rate": 1.3084899088466896e-06, + "loss": 0.2264, + "step": 30299 + }, + { + "epoch": 0.766758610218387, + "grad_norm": 9.087884902954102, + "learning_rate": 1.3082191057388156e-06, + "loss": 0.2408, + "step": 30300 + }, + { + "epoch": 0.7667839157830807, + "grad_norm": 3.2983696460723877, + "learning_rate": 1.3079483264385513e-06, + "loss": 0.145, + "step": 30301 + }, + { + "epoch": 0.7668092213477744, + "grad_norm": 4.960566520690918, + "learning_rate": 1.3076775709476413e-06, + "loss": 0.1217, + "step": 30302 + }, + { + "epoch": 0.766834526912468, + "grad_norm": 4.588864803314209, + "learning_rate": 1.307406839267832e-06, + "loss": 0.2146, + "step": 30303 + }, + { + "epoch": 0.7668598324771617, + "grad_norm": 13.383580207824707, + "learning_rate": 1.3071361314008673e-06, + "loss": 0.164, + "step": 30304 + }, + { + "epoch": 0.7668851380418554, + "grad_norm": 2.8830316066741943, + "learning_rate": 1.3068654473484965e-06, + "loss": 0.1224, + "step": 30305 + }, + { + "epoch": 0.766910443606549, + "grad_norm": 4.02808952331543, + "learning_rate": 1.3065947871124635e-06, + "loss": 0.1124, + "step": 30306 + }, + { + "epoch": 0.7669357491712427, + "grad_norm": 12.675555229187012, + "learning_rate": 1.3063241506945124e-06, + "loss": 0.256, + "step": 30307 + }, + { + "epoch": 0.7669610547359365, + "grad_norm": 4.019012928009033, + "learning_rate": 1.3060535380963925e-06, + "loss": 0.0458, + "step": 30308 + }, + { + "epoch": 0.76698636030063, + "grad_norm": 10.7344970703125, + "learning_rate": 1.3057829493198432e-06, + "loss": 0.2037, + "step": 30309 + }, + { + "epoch": 0.7670116658653238, + "grad_norm": 7.6329803466796875, + "learning_rate": 1.305512384366614e-06, + "loss": 0.2387, + "step": 30310 + }, + { + "epoch": 0.7670369714300175, + "grad_norm": 5.9681315422058105, + "learning_rate": 1.3052418432384462e-06, + "loss": 0.1257, + "step": 30311 + }, + { + "epoch": 0.7670622769947112, + "grad_norm": 3.609027624130249, + "learning_rate": 1.30497132593709e-06, + "loss": 0.1593, + "step": 30312 + }, + { + "epoch": 0.7670875825594048, + "grad_norm": 5.163947105407715, + "learning_rate": 1.3047008324642836e-06, + "loss": 0.1414, + "step": 30313 + }, + { + "epoch": 0.7671128881240985, + "grad_norm": 4.511806488037109, + "learning_rate": 1.3044303628217752e-06, + "loss": 0.174, + "step": 30314 + }, + { + "epoch": 0.7671381936887922, + "grad_norm": 6.362374305725098, + "learning_rate": 1.3041599170113067e-06, + "loss": 0.1441, + "step": 30315 + }, + { + "epoch": 0.7671634992534858, + "grad_norm": 5.499713897705078, + "learning_rate": 1.3038894950346248e-06, + "loss": 0.1237, + "step": 30316 + }, + { + "epoch": 0.7671888048181795, + "grad_norm": 11.509282112121582, + "learning_rate": 1.3036190968934725e-06, + "loss": 0.1916, + "step": 30317 + }, + { + "epoch": 0.7672141103828732, + "grad_norm": 8.909075736999512, + "learning_rate": 1.3033487225895925e-06, + "loss": 0.2408, + "step": 30318 + }, + { + "epoch": 0.7672394159475668, + "grad_norm": 6.495845794677734, + "learning_rate": 1.3030783721247275e-06, + "loss": 0.1677, + "step": 30319 + }, + { + "epoch": 0.7672647215122606, + "grad_norm": 11.499312400817871, + "learning_rate": 1.3028080455006243e-06, + "loss": 0.2195, + "step": 30320 + }, + { + "epoch": 0.7672900270769543, + "grad_norm": 4.507751941680908, + "learning_rate": 1.302537742719024e-06, + "loss": 0.1624, + "step": 30321 + }, + { + "epoch": 0.7673153326416479, + "grad_norm": 8.49246597290039, + "learning_rate": 1.3022674637816702e-06, + "loss": 0.1236, + "step": 30322 + }, + { + "epoch": 0.7673406382063416, + "grad_norm": 6.887607097625732, + "learning_rate": 1.3019972086903037e-06, + "loss": 0.1771, + "step": 30323 + }, + { + "epoch": 0.7673659437710353, + "grad_norm": 7.242849349975586, + "learning_rate": 1.3017269774466713e-06, + "loss": 0.1851, + "step": 30324 + }, + { + "epoch": 0.7673912493357289, + "grad_norm": 4.5848588943481445, + "learning_rate": 1.3014567700525133e-06, + "loss": 0.134, + "step": 30325 + }, + { + "epoch": 0.7674165549004226, + "grad_norm": 5.061717987060547, + "learning_rate": 1.3011865865095729e-06, + "loss": 0.1327, + "step": 30326 + }, + { + "epoch": 0.7674418604651163, + "grad_norm": 9.03139591217041, + "learning_rate": 1.3009164268195905e-06, + "loss": 0.2122, + "step": 30327 + }, + { + "epoch": 0.7674671660298099, + "grad_norm": 6.7528605461120605, + "learning_rate": 1.300646290984312e-06, + "loss": 0.1399, + "step": 30328 + }, + { + "epoch": 0.7674924715945036, + "grad_norm": 3.85258412361145, + "learning_rate": 1.3003761790054769e-06, + "loss": 0.1638, + "step": 30329 + }, + { + "epoch": 0.7675177771591973, + "grad_norm": 3.3717358112335205, + "learning_rate": 1.3001060908848279e-06, + "loss": 0.111, + "step": 30330 + }, + { + "epoch": 0.7675430827238909, + "grad_norm": 6.9219889640808105, + "learning_rate": 1.299836026624105e-06, + "loss": 0.1842, + "step": 30331 + }, + { + "epoch": 0.7675683882885846, + "grad_norm": 10.471700668334961, + "learning_rate": 1.2995659862250525e-06, + "loss": 0.1751, + "step": 30332 + }, + { + "epoch": 0.7675936938532784, + "grad_norm": 6.240484714508057, + "learning_rate": 1.2992959696894108e-06, + "loss": 0.1645, + "step": 30333 + }, + { + "epoch": 0.767618999417972, + "grad_norm": 3.968764543533325, + "learning_rate": 1.2990259770189207e-06, + "loss": 0.1343, + "step": 30334 + }, + { + "epoch": 0.7676443049826657, + "grad_norm": 2.8327951431274414, + "learning_rate": 1.2987560082153239e-06, + "loss": 0.1147, + "step": 30335 + }, + { + "epoch": 0.7676696105473594, + "grad_norm": 3.8760459423065186, + "learning_rate": 1.2984860632803597e-06, + "loss": 0.1317, + "step": 30336 + }, + { + "epoch": 0.7676949161120531, + "grad_norm": 4.992576599121094, + "learning_rate": 1.298216142215772e-06, + "loss": 0.123, + "step": 30337 + }, + { + "epoch": 0.7677202216767467, + "grad_norm": 3.046046495437622, + "learning_rate": 1.2979462450232977e-06, + "loss": 0.1048, + "step": 30338 + }, + { + "epoch": 0.7677455272414404, + "grad_norm": 7.372764587402344, + "learning_rate": 1.2976763717046832e-06, + "loss": 0.1375, + "step": 30339 + }, + { + "epoch": 0.7677708328061341, + "grad_norm": 3.599925994873047, + "learning_rate": 1.2974065222616616e-06, + "loss": 0.1452, + "step": 30340 + }, + { + "epoch": 0.7677961383708277, + "grad_norm": 2.7631568908691406, + "learning_rate": 1.2971366966959781e-06, + "loss": 0.1194, + "step": 30341 + }, + { + "epoch": 0.7678214439355214, + "grad_norm": 5.836793422698975, + "learning_rate": 1.29686689500937e-06, + "loss": 0.1453, + "step": 30342 + }, + { + "epoch": 0.7678467495002151, + "grad_norm": 5.102407932281494, + "learning_rate": 1.2965971172035812e-06, + "loss": 0.1312, + "step": 30343 + }, + { + "epoch": 0.7678720550649087, + "grad_norm": 2.1121456623077393, + "learning_rate": 1.2963273632803457e-06, + "loss": 0.0537, + "step": 30344 + }, + { + "epoch": 0.7678973606296025, + "grad_norm": 8.57352352142334, + "learning_rate": 1.2960576332414081e-06, + "loss": 0.2068, + "step": 30345 + }, + { + "epoch": 0.7679226661942962, + "grad_norm": 6.865825653076172, + "learning_rate": 1.2957879270885038e-06, + "loss": 0.1983, + "step": 30346 + }, + { + "epoch": 0.7679479717589898, + "grad_norm": 6.485713005065918, + "learning_rate": 1.2955182448233773e-06, + "loss": 0.2071, + "step": 30347 + }, + { + "epoch": 0.7679732773236835, + "grad_norm": 2.8605144023895264, + "learning_rate": 1.2952485864477615e-06, + "loss": 0.1429, + "step": 30348 + }, + { + "epoch": 0.7679985828883772, + "grad_norm": 2.816957712173462, + "learning_rate": 1.2949789519634e-06, + "loss": 0.063, + "step": 30349 + }, + { + "epoch": 0.7680238884530708, + "grad_norm": 8.0701904296875, + "learning_rate": 1.2947093413720285e-06, + "loss": 0.2013, + "step": 30350 + }, + { + "epoch": 0.7680491940177645, + "grad_norm": 5.008937358856201, + "learning_rate": 1.294439754675389e-06, + "loss": 0.2031, + "step": 30351 + }, + { + "epoch": 0.7680744995824582, + "grad_norm": 5.796294689178467, + "learning_rate": 1.2941701918752182e-06, + "loss": 0.1885, + "step": 30352 + }, + { + "epoch": 0.7680998051471518, + "grad_norm": 4.916988372802734, + "learning_rate": 1.293900652973254e-06, + "loss": 0.0979, + "step": 30353 + }, + { + "epoch": 0.7681251107118455, + "grad_norm": 3.7672505378723145, + "learning_rate": 1.293631137971234e-06, + "loss": 0.0774, + "step": 30354 + }, + { + "epoch": 0.7681504162765392, + "grad_norm": 5.565078258514404, + "learning_rate": 1.2933616468708987e-06, + "loss": 0.2096, + "step": 30355 + }, + { + "epoch": 0.7681757218412328, + "grad_norm": 7.044012069702148, + "learning_rate": 1.2930921796739848e-06, + "loss": 0.1876, + "step": 30356 + }, + { + "epoch": 0.7682010274059266, + "grad_norm": 4.029428482055664, + "learning_rate": 1.2928227363822294e-06, + "loss": 0.1443, + "step": 30357 + }, + { + "epoch": 0.7682263329706203, + "grad_norm": 8.198561668395996, + "learning_rate": 1.2925533169973698e-06, + "loss": 0.2547, + "step": 30358 + }, + { + "epoch": 0.7682516385353139, + "grad_norm": 2.9014782905578613, + "learning_rate": 1.2922839215211452e-06, + "loss": 0.1512, + "step": 30359 + }, + { + "epoch": 0.7682769441000076, + "grad_norm": 13.211695671081543, + "learning_rate": 1.2920145499552923e-06, + "loss": 0.1337, + "step": 30360 + }, + { + "epoch": 0.7683022496647013, + "grad_norm": 7.0229105949401855, + "learning_rate": 1.2917452023015475e-06, + "loss": 0.2473, + "step": 30361 + }, + { + "epoch": 0.768327555229395, + "grad_norm": 12.499717712402344, + "learning_rate": 1.2914758785616483e-06, + "loss": 0.1699, + "step": 30362 + }, + { + "epoch": 0.7683528607940886, + "grad_norm": 9.780220031738281, + "learning_rate": 1.2912065787373296e-06, + "loss": 0.1406, + "step": 30363 + }, + { + "epoch": 0.7683781663587823, + "grad_norm": 3.0167758464813232, + "learning_rate": 1.2909373028303311e-06, + "loss": 0.1243, + "step": 30364 + }, + { + "epoch": 0.768403471923476, + "grad_norm": 4.263697147369385, + "learning_rate": 1.2906680508423886e-06, + "loss": 0.1371, + "step": 30365 + }, + { + "epoch": 0.7684287774881696, + "grad_norm": 6.701180458068848, + "learning_rate": 1.290398822775238e-06, + "loss": 0.1299, + "step": 30366 + }, + { + "epoch": 0.7684540830528633, + "grad_norm": 4.228411674499512, + "learning_rate": 1.2901296186306129e-06, + "loss": 0.1749, + "step": 30367 + }, + { + "epoch": 0.768479388617557, + "grad_norm": 7.518797874450684, + "learning_rate": 1.2898604384102543e-06, + "loss": 0.2187, + "step": 30368 + }, + { + "epoch": 0.7685046941822506, + "grad_norm": 4.44949197769165, + "learning_rate": 1.2895912821158934e-06, + "loss": 0.1758, + "step": 30369 + }, + { + "epoch": 0.7685299997469444, + "grad_norm": 13.966647148132324, + "learning_rate": 1.2893221497492714e-06, + "loss": 0.28, + "step": 30370 + }, + { + "epoch": 0.7685553053116381, + "grad_norm": 3.216356039047241, + "learning_rate": 1.2890530413121178e-06, + "loss": 0.1317, + "step": 30371 + }, + { + "epoch": 0.7685806108763317, + "grad_norm": 5.263867378234863, + "learning_rate": 1.2887839568061715e-06, + "loss": 0.2063, + "step": 30372 + }, + { + "epoch": 0.7686059164410254, + "grad_norm": 4.278242588043213, + "learning_rate": 1.2885148962331666e-06, + "loss": 0.1487, + "step": 30373 + }, + { + "epoch": 0.7686312220057191, + "grad_norm": 2.806932210922241, + "learning_rate": 1.288245859594841e-06, + "loss": 0.0842, + "step": 30374 + }, + { + "epoch": 0.7686565275704127, + "grad_norm": 3.9151504039764404, + "learning_rate": 1.2879768468929248e-06, + "loss": 0.1395, + "step": 30375 + }, + { + "epoch": 0.7686818331351064, + "grad_norm": 3.986112117767334, + "learning_rate": 1.2877078581291574e-06, + "loss": 0.1079, + "step": 30376 + }, + { + "epoch": 0.7687071386998001, + "grad_norm": 5.0105204582214355, + "learning_rate": 1.2874388933052695e-06, + "loss": 0.0895, + "step": 30377 + }, + { + "epoch": 0.7687324442644937, + "grad_norm": 6.061265468597412, + "learning_rate": 1.2871699524230002e-06, + "loss": 0.1658, + "step": 30378 + }, + { + "epoch": 0.7687577498291874, + "grad_norm": 3.594627618789673, + "learning_rate": 1.2869010354840789e-06, + "loss": 0.089, + "step": 30379 + }, + { + "epoch": 0.7687830553938811, + "grad_norm": 12.358709335327148, + "learning_rate": 1.2866321424902434e-06, + "loss": 0.3549, + "step": 30380 + }, + { + "epoch": 0.7688083609585747, + "grad_norm": 7.260146141052246, + "learning_rate": 1.2863632734432245e-06, + "loss": 0.1097, + "step": 30381 + }, + { + "epoch": 0.7688336665232685, + "grad_norm": 3.559490919113159, + "learning_rate": 1.2860944283447602e-06, + "loss": 0.1279, + "step": 30382 + }, + { + "epoch": 0.7688589720879622, + "grad_norm": 4.251082420349121, + "learning_rate": 1.2858256071965818e-06, + "loss": 0.1274, + "step": 30383 + }, + { + "epoch": 0.7688842776526558, + "grad_norm": 3.7506754398345947, + "learning_rate": 1.2855568100004234e-06, + "loss": 0.1667, + "step": 30384 + }, + { + "epoch": 0.7689095832173495, + "grad_norm": 5.153692245483398, + "learning_rate": 1.2852880367580168e-06, + "loss": 0.1565, + "step": 30385 + }, + { + "epoch": 0.7689348887820432, + "grad_norm": 7.856713771820068, + "learning_rate": 1.2850192874710977e-06, + "loss": 0.1844, + "step": 30386 + }, + { + "epoch": 0.7689601943467369, + "grad_norm": 4.405162334442139, + "learning_rate": 1.2847505621413986e-06, + "loss": 0.1378, + "step": 30387 + }, + { + "epoch": 0.7689854999114305, + "grad_norm": 4.348901748657227, + "learning_rate": 1.2844818607706515e-06, + "loss": 0.0977, + "step": 30388 + }, + { + "epoch": 0.7690108054761242, + "grad_norm": 5.513906955718994, + "learning_rate": 1.2842131833605887e-06, + "loss": 0.1703, + "step": 30389 + }, + { + "epoch": 0.7690361110408179, + "grad_norm": 5.767192363739014, + "learning_rate": 1.2839445299129454e-06, + "loss": 0.2223, + "step": 30390 + }, + { + "epoch": 0.7690614166055115, + "grad_norm": 3.952369451522827, + "learning_rate": 1.2836759004294524e-06, + "loss": 0.1087, + "step": 30391 + }, + { + "epoch": 0.7690867221702052, + "grad_norm": 16.92173194885254, + "learning_rate": 1.2834072949118427e-06, + "loss": 0.2149, + "step": 30392 + }, + { + "epoch": 0.769112027734899, + "grad_norm": 5.942454814910889, + "learning_rate": 1.2831387133618472e-06, + "loss": 0.1639, + "step": 30393 + }, + { + "epoch": 0.7691373332995926, + "grad_norm": 10.994675636291504, + "learning_rate": 1.2828701557811978e-06, + "loss": 0.2447, + "step": 30394 + }, + { + "epoch": 0.7691626388642863, + "grad_norm": 7.075862407684326, + "learning_rate": 1.2826016221716287e-06, + "loss": 0.1778, + "step": 30395 + }, + { + "epoch": 0.76918794442898, + "grad_norm": 4.632918357849121, + "learning_rate": 1.2823331125348698e-06, + "loss": 0.1185, + "step": 30396 + }, + { + "epoch": 0.7692132499936736, + "grad_norm": 8.1500244140625, + "learning_rate": 1.2820646268726538e-06, + "loss": 0.1011, + "step": 30397 + }, + { + "epoch": 0.7692385555583673, + "grad_norm": 6.073404312133789, + "learning_rate": 1.2817961651867095e-06, + "loss": 0.1686, + "step": 30398 + }, + { + "epoch": 0.769263861123061, + "grad_norm": 9.287879943847656, + "learning_rate": 1.2815277274787718e-06, + "loss": 0.1949, + "step": 30399 + }, + { + "epoch": 0.7692891666877546, + "grad_norm": 5.129347324371338, + "learning_rate": 1.2812593137505703e-06, + "loss": 0.1819, + "step": 30400 + }, + { + "epoch": 0.7693144722524483, + "grad_norm": 2.9555327892303467, + "learning_rate": 1.2809909240038354e-06, + "loss": 0.1342, + "step": 30401 + }, + { + "epoch": 0.769339777817142, + "grad_norm": 6.994063377380371, + "learning_rate": 1.280722558240297e-06, + "loss": 0.2394, + "step": 30402 + }, + { + "epoch": 0.7693650833818356, + "grad_norm": 4.1302618980407715, + "learning_rate": 1.2804542164616884e-06, + "loss": 0.1698, + "step": 30403 + }, + { + "epoch": 0.7693903889465293, + "grad_norm": 3.792430877685547, + "learning_rate": 1.2801858986697368e-06, + "loss": 0.1294, + "step": 30404 + }, + { + "epoch": 0.769415694511223, + "grad_norm": 3.7900378704071045, + "learning_rate": 1.2799176048661777e-06, + "loss": 0.1349, + "step": 30405 + }, + { + "epoch": 0.7694410000759166, + "grad_norm": 9.26735782623291, + "learning_rate": 1.2796493350527351e-06, + "loss": 0.1861, + "step": 30406 + }, + { + "epoch": 0.7694663056406104, + "grad_norm": 5.138757705688477, + "learning_rate": 1.2793810892311431e-06, + "loss": 0.1188, + "step": 30407 + }, + { + "epoch": 0.7694916112053041, + "grad_norm": 11.385697364807129, + "learning_rate": 1.2791128674031294e-06, + "loss": 0.1053, + "step": 30408 + }, + { + "epoch": 0.7695169167699977, + "grad_norm": 6.650351524353027, + "learning_rate": 1.278844669570427e-06, + "loss": 0.146, + "step": 30409 + }, + { + "epoch": 0.7695422223346914, + "grad_norm": 5.469400882720947, + "learning_rate": 1.2785764957347607e-06, + "loss": 0.1772, + "step": 30410 + }, + { + "epoch": 0.7695675278993851, + "grad_norm": 4.71298885345459, + "learning_rate": 1.2783083458978634e-06, + "loss": 0.1674, + "step": 30411 + }, + { + "epoch": 0.7695928334640787, + "grad_norm": 4.637451648712158, + "learning_rate": 1.2780402200614623e-06, + "loss": 0.1004, + "step": 30412 + }, + { + "epoch": 0.7696181390287724, + "grad_norm": 4.129265308380127, + "learning_rate": 1.2777721182272883e-06, + "loss": 0.1104, + "step": 30413 + }, + { + "epoch": 0.7696434445934661, + "grad_norm": 12.092121124267578, + "learning_rate": 1.27750404039707e-06, + "loss": 0.2015, + "step": 30414 + }, + { + "epoch": 0.7696687501581598, + "grad_norm": 3.9592649936676025, + "learning_rate": 1.277235986572536e-06, + "loss": 0.1475, + "step": 30415 + }, + { + "epoch": 0.7696940557228534, + "grad_norm": 4.0727033615112305, + "learning_rate": 1.2769679567554122e-06, + "loss": 0.1279, + "step": 30416 + }, + { + "epoch": 0.7697193612875471, + "grad_norm": 7.019477844238281, + "learning_rate": 1.2766999509474316e-06, + "loss": 0.1991, + "step": 30417 + }, + { + "epoch": 0.7697446668522409, + "grad_norm": 5.631906509399414, + "learning_rate": 1.2764319691503208e-06, + "loss": 0.1576, + "step": 30418 + }, + { + "epoch": 0.7697699724169345, + "grad_norm": 3.5789942741394043, + "learning_rate": 1.276164011365807e-06, + "loss": 0.1118, + "step": 30419 + }, + { + "epoch": 0.7697952779816282, + "grad_norm": 9.005593299865723, + "learning_rate": 1.2758960775956185e-06, + "loss": 0.1266, + "step": 30420 + }, + { + "epoch": 0.7698205835463219, + "grad_norm": 9.779619216918945, + "learning_rate": 1.2756281678414829e-06, + "loss": 0.206, + "step": 30421 + }, + { + "epoch": 0.7698458891110155, + "grad_norm": 3.5323970317840576, + "learning_rate": 1.275360282105129e-06, + "loss": 0.1269, + "step": 30422 + }, + { + "epoch": 0.7698711946757092, + "grad_norm": 4.698821544647217, + "learning_rate": 1.2750924203882847e-06, + "loss": 0.1615, + "step": 30423 + }, + { + "epoch": 0.7698965002404029, + "grad_norm": 4.043470859527588, + "learning_rate": 1.2748245826926759e-06, + "loss": 0.1227, + "step": 30424 + }, + { + "epoch": 0.7699218058050965, + "grad_norm": 8.575542449951172, + "learning_rate": 1.2745567690200294e-06, + "loss": 0.1702, + "step": 30425 + }, + { + "epoch": 0.7699471113697902, + "grad_norm": 7.548775672912598, + "learning_rate": 1.2742889793720746e-06, + "loss": 0.2334, + "step": 30426 + }, + { + "epoch": 0.7699724169344839, + "grad_norm": 3.7086408138275146, + "learning_rate": 1.274021213750537e-06, + "loss": 0.1164, + "step": 30427 + }, + { + "epoch": 0.7699977224991775, + "grad_norm": 6.908718109130859, + "learning_rate": 1.2737534721571436e-06, + "loss": 0.1291, + "step": 30428 + }, + { + "epoch": 0.7700230280638712, + "grad_norm": 4.706844329833984, + "learning_rate": 1.273485754593619e-06, + "loss": 0.1341, + "step": 30429 + }, + { + "epoch": 0.770048333628565, + "grad_norm": 18.85335922241211, + "learning_rate": 1.273218061061694e-06, + "loss": 0.2835, + "step": 30430 + }, + { + "epoch": 0.7700736391932586, + "grad_norm": 6.140340805053711, + "learning_rate": 1.272950391563092e-06, + "loss": 0.2168, + "step": 30431 + }, + { + "epoch": 0.7700989447579523, + "grad_norm": 7.478151798248291, + "learning_rate": 1.2726827460995395e-06, + "loss": 0.2015, + "step": 30432 + }, + { + "epoch": 0.770124250322646, + "grad_norm": 3.4946963787078857, + "learning_rate": 1.2724151246727611e-06, + "loss": 0.1176, + "step": 30433 + }, + { + "epoch": 0.7701495558873396, + "grad_norm": 8.082100868225098, + "learning_rate": 1.2721475272844858e-06, + "loss": 0.1554, + "step": 30434 + }, + { + "epoch": 0.7701748614520333, + "grad_norm": 6.074869155883789, + "learning_rate": 1.2718799539364363e-06, + "loss": 0.1602, + "step": 30435 + }, + { + "epoch": 0.770200167016727, + "grad_norm": 11.632883071899414, + "learning_rate": 1.2716124046303424e-06, + "loss": 0.1392, + "step": 30436 + }, + { + "epoch": 0.7702254725814206, + "grad_norm": 3.5060460567474365, + "learning_rate": 1.2713448793679234e-06, + "loss": 0.1065, + "step": 30437 + }, + { + "epoch": 0.7702507781461143, + "grad_norm": 7.296962261199951, + "learning_rate": 1.2710773781509094e-06, + "loss": 0.1161, + "step": 30438 + }, + { + "epoch": 0.770276083710808, + "grad_norm": 5.239333629608154, + "learning_rate": 1.270809900981022e-06, + "loss": 0.1597, + "step": 30439 + }, + { + "epoch": 0.7703013892755017, + "grad_norm": 8.014747619628906, + "learning_rate": 1.2705424478599916e-06, + "loss": 0.1911, + "step": 30440 + }, + { + "epoch": 0.7703266948401953, + "grad_norm": 4.0888214111328125, + "learning_rate": 1.2702750187895357e-06, + "loss": 0.1666, + "step": 30441 + }, + { + "epoch": 0.770352000404889, + "grad_norm": 4.712299823760986, + "learning_rate": 1.2700076137713835e-06, + "loss": 0.133, + "step": 30442 + }, + { + "epoch": 0.7703773059695828, + "grad_norm": 6.307245254516602, + "learning_rate": 1.2697402328072573e-06, + "loss": 0.1698, + "step": 30443 + }, + { + "epoch": 0.7704026115342764, + "grad_norm": 3.070516347885132, + "learning_rate": 1.269472875898884e-06, + "loss": 0.1097, + "step": 30444 + }, + { + "epoch": 0.7704279170989701, + "grad_norm": 8.715882301330566, + "learning_rate": 1.269205543047986e-06, + "loss": 0.2235, + "step": 30445 + }, + { + "epoch": 0.7704532226636638, + "grad_norm": 5.318577766418457, + "learning_rate": 1.2689382342562878e-06, + "loss": 0.123, + "step": 30446 + }, + { + "epoch": 0.7704785282283574, + "grad_norm": 2.894517660140991, + "learning_rate": 1.2686709495255128e-06, + "loss": 0.138, + "step": 30447 + }, + { + "epoch": 0.7705038337930511, + "grad_norm": 4.643084526062012, + "learning_rate": 1.268403688857383e-06, + "loss": 0.1784, + "step": 30448 + }, + { + "epoch": 0.7705291393577448, + "grad_norm": 3.5291810035705566, + "learning_rate": 1.2681364522536255e-06, + "loss": 0.127, + "step": 30449 + }, + { + "epoch": 0.7705544449224384, + "grad_norm": 5.953487873077393, + "learning_rate": 1.2678692397159615e-06, + "loss": 0.1871, + "step": 30450 + }, + { + "epoch": 0.7705797504871321, + "grad_norm": 6.352899551391602, + "learning_rate": 1.2676020512461146e-06, + "loss": 0.175, + "step": 30451 + }, + { + "epoch": 0.7706050560518258, + "grad_norm": 3.5702760219573975, + "learning_rate": 1.267334886845807e-06, + "loss": 0.1434, + "step": 30452 + }, + { + "epoch": 0.7706303616165194, + "grad_norm": 4.372364521026611, + "learning_rate": 1.267067746516763e-06, + "loss": 0.105, + "step": 30453 + }, + { + "epoch": 0.7706556671812131, + "grad_norm": 3.3729658126831055, + "learning_rate": 1.266800630260705e-06, + "loss": 0.1222, + "step": 30454 + }, + { + "epoch": 0.7706809727459069, + "grad_norm": 24.1984806060791, + "learning_rate": 1.2665335380793553e-06, + "loss": 0.2393, + "step": 30455 + }, + { + "epoch": 0.7707062783106005, + "grad_norm": 4.954209804534912, + "learning_rate": 1.2662664699744348e-06, + "loss": 0.1145, + "step": 30456 + }, + { + "epoch": 0.7707315838752942, + "grad_norm": 3.4309327602386475, + "learning_rate": 1.2659994259476687e-06, + "loss": 0.1416, + "step": 30457 + }, + { + "epoch": 0.7707568894399879, + "grad_norm": 5.5294694900512695, + "learning_rate": 1.265732406000778e-06, + "loss": 0.1109, + "step": 30458 + }, + { + "epoch": 0.7707821950046815, + "grad_norm": 5.548385143280029, + "learning_rate": 1.2654654101354847e-06, + "loss": 0.2008, + "step": 30459 + }, + { + "epoch": 0.7708075005693752, + "grad_norm": 7.256731033325195, + "learning_rate": 1.2651984383535082e-06, + "loss": 0.1927, + "step": 30460 + }, + { + "epoch": 0.7708328061340689, + "grad_norm": 16.82146644592285, + "learning_rate": 1.264931490656574e-06, + "loss": 0.319, + "step": 30461 + }, + { + "epoch": 0.7708581116987625, + "grad_norm": 9.418212890625, + "learning_rate": 1.2646645670464014e-06, + "loss": 0.1467, + "step": 30462 + }, + { + "epoch": 0.7708834172634562, + "grad_norm": 15.189519882202148, + "learning_rate": 1.2643976675247128e-06, + "loss": 0.3078, + "step": 30463 + }, + { + "epoch": 0.7709087228281499, + "grad_norm": 2.642172336578369, + "learning_rate": 1.2641307920932272e-06, + "loss": 0.099, + "step": 30464 + }, + { + "epoch": 0.7709340283928436, + "grad_norm": 3.2735934257507324, + "learning_rate": 1.2638639407536685e-06, + "loss": 0.1477, + "step": 30465 + }, + { + "epoch": 0.7709593339575372, + "grad_norm": 7.591652870178223, + "learning_rate": 1.2635971135077546e-06, + "loss": 0.226, + "step": 30466 + }, + { + "epoch": 0.770984639522231, + "grad_norm": 6.31932258605957, + "learning_rate": 1.2633303103572108e-06, + "loss": 0.1466, + "step": 30467 + }, + { + "epoch": 0.7710099450869247, + "grad_norm": 1.7613669633865356, + "learning_rate": 1.2630635313037526e-06, + "loss": 0.0536, + "step": 30468 + }, + { + "epoch": 0.7710352506516183, + "grad_norm": 3.4656100273132324, + "learning_rate": 1.2627967763491033e-06, + "loss": 0.1071, + "step": 30469 + }, + { + "epoch": 0.771060556216312, + "grad_norm": 8.49679946899414, + "learning_rate": 1.2625300454949813e-06, + "loss": 0.3001, + "step": 30470 + }, + { + "epoch": 0.7710858617810057, + "grad_norm": 3.8242156505584717, + "learning_rate": 1.2622633387431104e-06, + "loss": 0.1336, + "step": 30471 + }, + { + "epoch": 0.7711111673456993, + "grad_norm": 5.25098180770874, + "learning_rate": 1.2619966560952051e-06, + "loss": 0.1674, + "step": 30472 + }, + { + "epoch": 0.771136472910393, + "grad_norm": 4.723112106323242, + "learning_rate": 1.2617299975529895e-06, + "loss": 0.2231, + "step": 30473 + }, + { + "epoch": 0.7711617784750867, + "grad_norm": 5.668691635131836, + "learning_rate": 1.2614633631181805e-06, + "loss": 0.1242, + "step": 30474 + }, + { + "epoch": 0.7711870840397803, + "grad_norm": 3.065394639968872, + "learning_rate": 1.2611967527925018e-06, + "loss": 0.1463, + "step": 30475 + }, + { + "epoch": 0.771212389604474, + "grad_norm": 4.42879581451416, + "learning_rate": 1.260930166577667e-06, + "loss": 0.1262, + "step": 30476 + }, + { + "epoch": 0.7712376951691677, + "grad_norm": 3.3933143615722656, + "learning_rate": 1.2606636044753989e-06, + "loss": 0.0912, + "step": 30477 + }, + { + "epoch": 0.7712630007338613, + "grad_norm": 5.161322116851807, + "learning_rate": 1.260397066487416e-06, + "loss": 0.1788, + "step": 30478 + }, + { + "epoch": 0.771288306298555, + "grad_norm": 3.349820852279663, + "learning_rate": 1.2601305526154356e-06, + "loss": 0.1442, + "step": 30479 + }, + { + "epoch": 0.7713136118632488, + "grad_norm": 7.097954750061035, + "learning_rate": 1.2598640628611785e-06, + "loss": 0.1466, + "step": 30480 + }, + { + "epoch": 0.7713389174279424, + "grad_norm": 22.179210662841797, + "learning_rate": 1.2595975972263624e-06, + "loss": 0.3298, + "step": 30481 + }, + { + "epoch": 0.7713642229926361, + "grad_norm": 2.9586246013641357, + "learning_rate": 1.2593311557127064e-06, + "loss": 0.1342, + "step": 30482 + }, + { + "epoch": 0.7713895285573298, + "grad_norm": 14.805835723876953, + "learning_rate": 1.2590647383219256e-06, + "loss": 0.2291, + "step": 30483 + }, + { + "epoch": 0.7714148341220234, + "grad_norm": 1.9319915771484375, + "learning_rate": 1.258798345055744e-06, + "loss": 0.0597, + "step": 30484 + }, + { + "epoch": 0.7714401396867171, + "grad_norm": 7.767515659332275, + "learning_rate": 1.2585319759158725e-06, + "loss": 0.2829, + "step": 30485 + }, + { + "epoch": 0.7714654452514108, + "grad_norm": 4.812302589416504, + "learning_rate": 1.2582656309040342e-06, + "loss": 0.1455, + "step": 30486 + }, + { + "epoch": 0.7714907508161044, + "grad_norm": 5.06662130355835, + "learning_rate": 1.257999310021943e-06, + "loss": 0.1581, + "step": 30487 + }, + { + "epoch": 0.7715160563807981, + "grad_norm": 4.9417643547058105, + "learning_rate": 1.25773301327132e-06, + "loss": 0.1749, + "step": 30488 + }, + { + "epoch": 0.7715413619454918, + "grad_norm": 4.452963829040527, + "learning_rate": 1.2574667406538805e-06, + "loss": 0.1249, + "step": 30489 + }, + { + "epoch": 0.7715666675101855, + "grad_norm": 4.57193660736084, + "learning_rate": 1.2572004921713415e-06, + "loss": 0.1376, + "step": 30490 + }, + { + "epoch": 0.7715919730748791, + "grad_norm": 6.877295970916748, + "learning_rate": 1.2569342678254193e-06, + "loss": 0.2691, + "step": 30491 + }, + { + "epoch": 0.7716172786395729, + "grad_norm": 8.108146667480469, + "learning_rate": 1.2566680676178333e-06, + "loss": 0.2136, + "step": 30492 + }, + { + "epoch": 0.7716425842042666, + "grad_norm": 6.974926948547363, + "learning_rate": 1.2564018915502984e-06, + "loss": 0.2909, + "step": 30493 + }, + { + "epoch": 0.7716678897689602, + "grad_norm": 4.36012077331543, + "learning_rate": 1.256135739624531e-06, + "loss": 0.1479, + "step": 30494 + }, + { + "epoch": 0.7716931953336539, + "grad_norm": 3.640836477279663, + "learning_rate": 1.2558696118422469e-06, + "loss": 0.0884, + "step": 30495 + }, + { + "epoch": 0.7717185008983476, + "grad_norm": 2.639263153076172, + "learning_rate": 1.2556035082051642e-06, + "loss": 0.086, + "step": 30496 + }, + { + "epoch": 0.7717438064630412, + "grad_norm": 4.561995983123779, + "learning_rate": 1.2553374287149971e-06, + "loss": 0.1416, + "step": 30497 + }, + { + "epoch": 0.7717691120277349, + "grad_norm": 4.242506504058838, + "learning_rate": 1.2550713733734647e-06, + "loss": 0.1164, + "step": 30498 + }, + { + "epoch": 0.7717944175924286, + "grad_norm": 2.8205578327178955, + "learning_rate": 1.254805342182278e-06, + "loss": 0.1192, + "step": 30499 + }, + { + "epoch": 0.7718197231571222, + "grad_norm": 5.097662925720215, + "learning_rate": 1.2545393351431563e-06, + "loss": 0.1383, + "step": 30500 + }, + { + "epoch": 0.7718450287218159, + "grad_norm": 13.960257530212402, + "learning_rate": 1.2542733522578122e-06, + "loss": 0.1841, + "step": 30501 + }, + { + "epoch": 0.7718703342865096, + "grad_norm": 2.8414859771728516, + "learning_rate": 1.2540073935279652e-06, + "loss": 0.0926, + "step": 30502 + }, + { + "epoch": 0.7718956398512032, + "grad_norm": 6.924637317657471, + "learning_rate": 1.2537414589553248e-06, + "loss": 0.2122, + "step": 30503 + }, + { + "epoch": 0.771920945415897, + "grad_norm": 4.1647491455078125, + "learning_rate": 1.2534755485416106e-06, + "loss": 0.1404, + "step": 30504 + }, + { + "epoch": 0.7719462509805907, + "grad_norm": 5.768135070800781, + "learning_rate": 1.2532096622885359e-06, + "loss": 0.1862, + "step": 30505 + }, + { + "epoch": 0.7719715565452843, + "grad_norm": 3.6401984691619873, + "learning_rate": 1.2529438001978146e-06, + "loss": 0.1536, + "step": 30506 + }, + { + "epoch": 0.771996862109978, + "grad_norm": 5.715487480163574, + "learning_rate": 1.2526779622711604e-06, + "loss": 0.2288, + "step": 30507 + }, + { + "epoch": 0.7720221676746717, + "grad_norm": 5.4610395431518555, + "learning_rate": 1.2524121485102902e-06, + "loss": 0.1451, + "step": 30508 + }, + { + "epoch": 0.7720474732393653, + "grad_norm": 6.0590996742248535, + "learning_rate": 1.2521463589169175e-06, + "loss": 0.1389, + "step": 30509 + }, + { + "epoch": 0.772072778804059, + "grad_norm": 4.175468921661377, + "learning_rate": 1.251880593492754e-06, + "loss": 0.1304, + "step": 30510 + }, + { + "epoch": 0.7720980843687527, + "grad_norm": 5.160970211029053, + "learning_rate": 1.251614852239517e-06, + "loss": 0.1297, + "step": 30511 + }, + { + "epoch": 0.7721233899334463, + "grad_norm": 6.461897373199463, + "learning_rate": 1.2513491351589184e-06, + "loss": 0.1929, + "step": 30512 + }, + { + "epoch": 0.77214869549814, + "grad_norm": 4.569157123565674, + "learning_rate": 1.2510834422526719e-06, + "loss": 0.1929, + "step": 30513 + }, + { + "epoch": 0.7721740010628337, + "grad_norm": 4.478067874908447, + "learning_rate": 1.2508177735224897e-06, + "loss": 0.1522, + "step": 30514 + }, + { + "epoch": 0.7721993066275274, + "grad_norm": 2.6468257904052734, + "learning_rate": 1.250552128970089e-06, + "loss": 0.0633, + "step": 30515 + }, + { + "epoch": 0.772224612192221, + "grad_norm": 8.236774444580078, + "learning_rate": 1.2502865085971772e-06, + "loss": 0.2566, + "step": 30516 + }, + { + "epoch": 0.7722499177569148, + "grad_norm": 5.518159866333008, + "learning_rate": 1.2500209124054719e-06, + "loss": 0.1416, + "step": 30517 + }, + { + "epoch": 0.7722752233216085, + "grad_norm": 5.114090919494629, + "learning_rate": 1.2497553403966821e-06, + "loss": 0.1247, + "step": 30518 + }, + { + "epoch": 0.7723005288863021, + "grad_norm": 6.458996772766113, + "learning_rate": 1.2494897925725247e-06, + "loss": 0.2041, + "step": 30519 + }, + { + "epoch": 0.7723258344509958, + "grad_norm": 6.102158546447754, + "learning_rate": 1.2492242689347095e-06, + "loss": 0.142, + "step": 30520 + }, + { + "epoch": 0.7723511400156895, + "grad_norm": 6.631597518920898, + "learning_rate": 1.2489587694849497e-06, + "loss": 0.1738, + "step": 30521 + }, + { + "epoch": 0.7723764455803831, + "grad_norm": 3.2914443016052246, + "learning_rate": 1.2486932942249553e-06, + "loss": 0.1357, + "step": 30522 + }, + { + "epoch": 0.7724017511450768, + "grad_norm": 13.346402168273926, + "learning_rate": 1.2484278431564418e-06, + "loss": 0.1753, + "step": 30523 + }, + { + "epoch": 0.7724270567097705, + "grad_norm": 6.900234699249268, + "learning_rate": 1.248162416281119e-06, + "loss": 0.1488, + "step": 30524 + }, + { + "epoch": 0.7724523622744641, + "grad_norm": 4.259031295776367, + "learning_rate": 1.247897013600699e-06, + "loss": 0.12, + "step": 30525 + }, + { + "epoch": 0.7724776678391578, + "grad_norm": 6.060410022735596, + "learning_rate": 1.2476316351168916e-06, + "loss": 0.1764, + "step": 30526 + }, + { + "epoch": 0.7725029734038515, + "grad_norm": 4.32501745223999, + "learning_rate": 1.2473662808314113e-06, + "loss": 0.154, + "step": 30527 + }, + { + "epoch": 0.7725282789685451, + "grad_norm": 4.5314202308654785, + "learning_rate": 1.247100950745968e-06, + "loss": 0.1703, + "step": 30528 + }, + { + "epoch": 0.7725535845332389, + "grad_norm": 6.855892658233643, + "learning_rate": 1.2468356448622725e-06, + "loss": 0.2134, + "step": 30529 + }, + { + "epoch": 0.7725788900979326, + "grad_norm": 11.278340339660645, + "learning_rate": 1.2465703631820342e-06, + "loss": 0.213, + "step": 30530 + }, + { + "epoch": 0.7726041956626262, + "grad_norm": 7.36344575881958, + "learning_rate": 1.246305105706967e-06, + "loss": 0.1925, + "step": 30531 + }, + { + "epoch": 0.7726295012273199, + "grad_norm": 3.85857892036438, + "learning_rate": 1.24603987243878e-06, + "loss": 0.1559, + "step": 30532 + }, + { + "epoch": 0.7726548067920136, + "grad_norm": 4.292304039001465, + "learning_rate": 1.2457746633791835e-06, + "loss": 0.1743, + "step": 30533 + }, + { + "epoch": 0.7726801123567072, + "grad_norm": 4.401745796203613, + "learning_rate": 1.2455094785298865e-06, + "loss": 0.1659, + "step": 30534 + }, + { + "epoch": 0.7727054179214009, + "grad_norm": 8.738689422607422, + "learning_rate": 1.2452443178926015e-06, + "loss": 0.2245, + "step": 30535 + }, + { + "epoch": 0.7727307234860946, + "grad_norm": 7.871179103851318, + "learning_rate": 1.2449791814690383e-06, + "loss": 0.2602, + "step": 30536 + }, + { + "epoch": 0.7727560290507882, + "grad_norm": 6.285489082336426, + "learning_rate": 1.244714069260905e-06, + "loss": 0.1794, + "step": 30537 + }, + { + "epoch": 0.7727813346154819, + "grad_norm": 6.046276092529297, + "learning_rate": 1.2444489812699113e-06, + "loss": 0.1748, + "step": 30538 + }, + { + "epoch": 0.7728066401801756, + "grad_norm": 2.4622859954833984, + "learning_rate": 1.2441839174977687e-06, + "loss": 0.0754, + "step": 30539 + }, + { + "epoch": 0.7728319457448692, + "grad_norm": 18.13501739501953, + "learning_rate": 1.2439188779461852e-06, + "loss": 0.3349, + "step": 30540 + }, + { + "epoch": 0.772857251309563, + "grad_norm": 8.433881759643555, + "learning_rate": 1.2436538626168687e-06, + "loss": 0.1267, + "step": 30541 + }, + { + "epoch": 0.7728825568742567, + "grad_norm": 3.5358927249908447, + "learning_rate": 1.2433888715115334e-06, + "loss": 0.1221, + "step": 30542 + }, + { + "epoch": 0.7729078624389504, + "grad_norm": 10.545465469360352, + "learning_rate": 1.2431239046318805e-06, + "loss": 0.2282, + "step": 30543 + }, + { + "epoch": 0.772933168003644, + "grad_norm": 5.5746235847473145, + "learning_rate": 1.242858961979625e-06, + "loss": 0.1721, + "step": 30544 + }, + { + "epoch": 0.7729584735683377, + "grad_norm": 7.4118452072143555, + "learning_rate": 1.2425940435564715e-06, + "loss": 0.1308, + "step": 30545 + }, + { + "epoch": 0.7729837791330314, + "grad_norm": 7.6986236572265625, + "learning_rate": 1.2423291493641326e-06, + "loss": 0.2449, + "step": 30546 + }, + { + "epoch": 0.773009084697725, + "grad_norm": 5.327886581420898, + "learning_rate": 1.2420642794043113e-06, + "loss": 0.1674, + "step": 30547 + }, + { + "epoch": 0.7730343902624187, + "grad_norm": 2.73583722114563, + "learning_rate": 1.2417994336787204e-06, + "loss": 0.12, + "step": 30548 + }, + { + "epoch": 0.7730596958271124, + "grad_norm": 4.23179817199707, + "learning_rate": 1.2415346121890638e-06, + "loss": 0.2012, + "step": 30549 + }, + { + "epoch": 0.773085001391806, + "grad_norm": 6.453001022338867, + "learning_rate": 1.2412698149370527e-06, + "loss": 0.2516, + "step": 30550 + }, + { + "epoch": 0.7731103069564997, + "grad_norm": 4.372674465179443, + "learning_rate": 1.241005041924394e-06, + "loss": 0.079, + "step": 30551 + }, + { + "epoch": 0.7731356125211934, + "grad_norm": 8.43966293334961, + "learning_rate": 1.2407402931527946e-06, + "loss": 0.1325, + "step": 30552 + }, + { + "epoch": 0.773160918085887, + "grad_norm": 5.547205924987793, + "learning_rate": 1.2404755686239595e-06, + "loss": 0.1562, + "step": 30553 + }, + { + "epoch": 0.7731862236505808, + "grad_norm": 7.866710662841797, + "learning_rate": 1.2402108683396003e-06, + "loss": 0.2481, + "step": 30554 + }, + { + "epoch": 0.7732115292152745, + "grad_norm": 4.949284076690674, + "learning_rate": 1.2399461923014216e-06, + "loss": 0.1584, + "step": 30555 + }, + { + "epoch": 0.7732368347799681, + "grad_norm": 5.251112461090088, + "learning_rate": 1.2396815405111307e-06, + "loss": 0.1778, + "step": 30556 + }, + { + "epoch": 0.7732621403446618, + "grad_norm": 3.122605800628662, + "learning_rate": 1.239416912970432e-06, + "loss": 0.1131, + "step": 30557 + }, + { + "epoch": 0.7732874459093555, + "grad_norm": 7.408638954162598, + "learning_rate": 1.239152309681036e-06, + "loss": 0.1831, + "step": 30558 + }, + { + "epoch": 0.7733127514740491, + "grad_norm": 8.248616218566895, + "learning_rate": 1.2388877306446472e-06, + "loss": 0.0994, + "step": 30559 + }, + { + "epoch": 0.7733380570387428, + "grad_norm": 3.4318623542785645, + "learning_rate": 1.238623175862972e-06, + "loss": 0.1333, + "step": 30560 + }, + { + "epoch": 0.7733633626034365, + "grad_norm": 7.911496162414551, + "learning_rate": 1.2383586453377145e-06, + "loss": 0.2053, + "step": 30561 + }, + { + "epoch": 0.7733886681681301, + "grad_norm": 4.4852423667907715, + "learning_rate": 1.2380941390705835e-06, + "loss": 0.161, + "step": 30562 + }, + { + "epoch": 0.7734139737328238, + "grad_norm": 8.981082916259766, + "learning_rate": 1.237829657063284e-06, + "loss": 0.204, + "step": 30563 + }, + { + "epoch": 0.7734392792975175, + "grad_norm": 4.649898052215576, + "learning_rate": 1.237565199317522e-06, + "loss": 0.119, + "step": 30564 + }, + { + "epoch": 0.7734645848622111, + "grad_norm": 4.6939897537231445, + "learning_rate": 1.2373007658349995e-06, + "loss": 0.1756, + "step": 30565 + }, + { + "epoch": 0.7734898904269049, + "grad_norm": 4.119524002075195, + "learning_rate": 1.2370363566174266e-06, + "loss": 0.1354, + "step": 30566 + }, + { + "epoch": 0.7735151959915986, + "grad_norm": 4.521692752838135, + "learning_rate": 1.2367719716665061e-06, + "loss": 0.2228, + "step": 30567 + }, + { + "epoch": 0.7735405015562923, + "grad_norm": 8.639957427978516, + "learning_rate": 1.236507610983943e-06, + "loss": 0.1897, + "step": 30568 + }, + { + "epoch": 0.7735658071209859, + "grad_norm": 11.82202434539795, + "learning_rate": 1.2362432745714425e-06, + "loss": 0.1251, + "step": 30569 + }, + { + "epoch": 0.7735911126856796, + "grad_norm": 2.8627846240997314, + "learning_rate": 1.2359789624307072e-06, + "loss": 0.1201, + "step": 30570 + }, + { + "epoch": 0.7736164182503733, + "grad_norm": 3.9725804328918457, + "learning_rate": 1.2357146745634452e-06, + "loss": 0.1375, + "step": 30571 + }, + { + "epoch": 0.7736417238150669, + "grad_norm": 5.798803329467773, + "learning_rate": 1.2354504109713572e-06, + "loss": 0.1424, + "step": 30572 + }, + { + "epoch": 0.7736670293797606, + "grad_norm": 6.221388816833496, + "learning_rate": 1.2351861716561526e-06, + "loss": 0.1977, + "step": 30573 + }, + { + "epoch": 0.7736923349444543, + "grad_norm": 4.334599018096924, + "learning_rate": 1.2349219566195286e-06, + "loss": 0.1436, + "step": 30574 + }, + { + "epoch": 0.7737176405091479, + "grad_norm": 12.170700073242188, + "learning_rate": 1.2346577658631948e-06, + "loss": 0.2151, + "step": 30575 + }, + { + "epoch": 0.7737429460738416, + "grad_norm": 3.915858507156372, + "learning_rate": 1.2343935993888505e-06, + "loss": 0.1439, + "step": 30576 + }, + { + "epoch": 0.7737682516385354, + "grad_norm": 4.273952484130859, + "learning_rate": 1.2341294571982049e-06, + "loss": 0.1861, + "step": 30577 + }, + { + "epoch": 0.773793557203229, + "grad_norm": 4.131535530090332, + "learning_rate": 1.2338653392929545e-06, + "loss": 0.1292, + "step": 30578 + }, + { + "epoch": 0.7738188627679227, + "grad_norm": 4.094571113586426, + "learning_rate": 1.2336012456748075e-06, + "loss": 0.0932, + "step": 30579 + }, + { + "epoch": 0.7738441683326164, + "grad_norm": 4.006283283233643, + "learning_rate": 1.2333371763454644e-06, + "loss": 0.1699, + "step": 30580 + }, + { + "epoch": 0.77386947389731, + "grad_norm": 7.438612937927246, + "learning_rate": 1.2330731313066318e-06, + "loss": 0.2463, + "step": 30581 + }, + { + "epoch": 0.7738947794620037, + "grad_norm": 6.662460803985596, + "learning_rate": 1.232809110560007e-06, + "loss": 0.1571, + "step": 30582 + }, + { + "epoch": 0.7739200850266974, + "grad_norm": 10.62632942199707, + "learning_rate": 1.2325451141072969e-06, + "loss": 0.2094, + "step": 30583 + }, + { + "epoch": 0.773945390591391, + "grad_norm": 5.487163066864014, + "learning_rate": 1.2322811419502013e-06, + "loss": 0.2095, + "step": 30584 + }, + { + "epoch": 0.7739706961560847, + "grad_norm": 7.0834808349609375, + "learning_rate": 1.2320171940904251e-06, + "loss": 0.1603, + "step": 30585 + }, + { + "epoch": 0.7739960017207784, + "grad_norm": 7.378721714019775, + "learning_rate": 1.2317532705296693e-06, + "loss": 0.1828, + "step": 30586 + }, + { + "epoch": 0.774021307285472, + "grad_norm": 4.187375545501709, + "learning_rate": 1.2314893712696352e-06, + "loss": 0.1116, + "step": 30587 + }, + { + "epoch": 0.7740466128501657, + "grad_norm": 6.352911472320557, + "learning_rate": 1.231225496312024e-06, + "loss": 0.199, + "step": 30588 + }, + { + "epoch": 0.7740719184148595, + "grad_norm": 3.7850863933563232, + "learning_rate": 1.23096164565854e-06, + "loss": 0.0988, + "step": 30589 + }, + { + "epoch": 0.774097223979553, + "grad_norm": 11.730731964111328, + "learning_rate": 1.230697819310883e-06, + "loss": 0.222, + "step": 30590 + }, + { + "epoch": 0.7741225295442468, + "grad_norm": 10.663056373596191, + "learning_rate": 1.230434017270755e-06, + "loss": 0.2149, + "step": 30591 + }, + { + "epoch": 0.7741478351089405, + "grad_norm": 5.27897834777832, + "learning_rate": 1.2301702395398552e-06, + "loss": 0.1475, + "step": 30592 + }, + { + "epoch": 0.7741731406736342, + "grad_norm": 5.46060848236084, + "learning_rate": 1.2299064861198879e-06, + "loss": 0.1863, + "step": 30593 + }, + { + "epoch": 0.7741984462383278, + "grad_norm": 5.903159141540527, + "learning_rate": 1.2296427570125524e-06, + "loss": 0.243, + "step": 30594 + }, + { + "epoch": 0.7742237518030215, + "grad_norm": 4.503651142120361, + "learning_rate": 1.2293790522195493e-06, + "loss": 0.0764, + "step": 30595 + }, + { + "epoch": 0.7742490573677152, + "grad_norm": 5.083974838256836, + "learning_rate": 1.229115371742578e-06, + "loss": 0.1573, + "step": 30596 + }, + { + "epoch": 0.7742743629324088, + "grad_norm": 2.7840776443481445, + "learning_rate": 1.2288517155833413e-06, + "loss": 0.1561, + "step": 30597 + }, + { + "epoch": 0.7742996684971025, + "grad_norm": 3.7786223888397217, + "learning_rate": 1.2285880837435387e-06, + "loss": 0.1392, + "step": 30598 + }, + { + "epoch": 0.7743249740617962, + "grad_norm": 5.063749313354492, + "learning_rate": 1.2283244762248697e-06, + "loss": 0.1478, + "step": 30599 + }, + { + "epoch": 0.7743502796264898, + "grad_norm": 3.5616018772125244, + "learning_rate": 1.2280608930290349e-06, + "loss": 0.1345, + "step": 30600 + }, + { + "epoch": 0.7743755851911835, + "grad_norm": 4.6102752685546875, + "learning_rate": 1.2277973341577321e-06, + "loss": 0.1744, + "step": 30601 + }, + { + "epoch": 0.7744008907558773, + "grad_norm": 4.322021484375, + "learning_rate": 1.227533799612664e-06, + "loss": 0.1286, + "step": 30602 + }, + { + "epoch": 0.7744261963205709, + "grad_norm": 5.269777774810791, + "learning_rate": 1.227270289395529e-06, + "loss": 0.0911, + "step": 30603 + }, + { + "epoch": 0.7744515018852646, + "grad_norm": 4.6927337646484375, + "learning_rate": 1.2270068035080263e-06, + "loss": 0.1533, + "step": 30604 + }, + { + "epoch": 0.7744768074499583, + "grad_norm": 5.326253890991211, + "learning_rate": 1.226743341951853e-06, + "loss": 0.104, + "step": 30605 + }, + { + "epoch": 0.7745021130146519, + "grad_norm": 4.156152248382568, + "learning_rate": 1.2264799047287113e-06, + "loss": 0.1444, + "step": 30606 + }, + { + "epoch": 0.7745274185793456, + "grad_norm": 5.864054203033447, + "learning_rate": 1.226216491840298e-06, + "loss": 0.1589, + "step": 30607 + }, + { + "epoch": 0.7745527241440393, + "grad_norm": 9.448378562927246, + "learning_rate": 1.2259531032883148e-06, + "loss": 0.2814, + "step": 30608 + }, + { + "epoch": 0.7745780297087329, + "grad_norm": 3.360642671585083, + "learning_rate": 1.2256897390744554e-06, + "loss": 0.1404, + "step": 30609 + }, + { + "epoch": 0.7746033352734266, + "grad_norm": 3.3692846298217773, + "learning_rate": 1.2254263992004223e-06, + "loss": 0.1447, + "step": 30610 + }, + { + "epoch": 0.7746286408381203, + "grad_norm": 11.82229232788086, + "learning_rate": 1.2251630836679106e-06, + "loss": 0.1326, + "step": 30611 + }, + { + "epoch": 0.7746539464028139, + "grad_norm": 7.347080230712891, + "learning_rate": 1.2248997924786232e-06, + "loss": 0.1704, + "step": 30612 + }, + { + "epoch": 0.7746792519675076, + "grad_norm": 5.5994720458984375, + "learning_rate": 1.2246365256342518e-06, + "loss": 0.1487, + "step": 30613 + }, + { + "epoch": 0.7747045575322014, + "grad_norm": 3.6136679649353027, + "learning_rate": 1.2243732831364986e-06, + "loss": 0.1332, + "step": 30614 + }, + { + "epoch": 0.774729863096895, + "grad_norm": 9.280830383300781, + "learning_rate": 1.224110064987058e-06, + "loss": 0.3291, + "step": 30615 + }, + { + "epoch": 0.7747551686615887, + "grad_norm": 6.485655784606934, + "learning_rate": 1.223846871187631e-06, + "loss": 0.1633, + "step": 30616 + }, + { + "epoch": 0.7747804742262824, + "grad_norm": 9.244888305664062, + "learning_rate": 1.2235837017399128e-06, + "loss": 0.2258, + "step": 30617 + }, + { + "epoch": 0.7748057797909761, + "grad_norm": 4.199799537658691, + "learning_rate": 1.2233205566456013e-06, + "loss": 0.0916, + "step": 30618 + }, + { + "epoch": 0.7748310853556697, + "grad_norm": 20.403039932250977, + "learning_rate": 1.2230574359063912e-06, + "loss": 0.2444, + "step": 30619 + }, + { + "epoch": 0.7748563909203634, + "grad_norm": 3.7606546878814697, + "learning_rate": 1.2227943395239827e-06, + "loss": 0.1723, + "step": 30620 + }, + { + "epoch": 0.7748816964850571, + "grad_norm": 4.81885290145874, + "learning_rate": 1.222531267500071e-06, + "loss": 0.1376, + "step": 30621 + }, + { + "epoch": 0.7749070020497507, + "grad_norm": 3.509021520614624, + "learning_rate": 1.2222682198363522e-06, + "loss": 0.1218, + "step": 30622 + }, + { + "epoch": 0.7749323076144444, + "grad_norm": 12.602768898010254, + "learning_rate": 1.222005196534522e-06, + "loss": 0.1741, + "step": 30623 + }, + { + "epoch": 0.7749576131791381, + "grad_norm": 2.5510284900665283, + "learning_rate": 1.2217421975962785e-06, + "loss": 0.0897, + "step": 30624 + }, + { + "epoch": 0.7749829187438317, + "grad_norm": 5.758612155914307, + "learning_rate": 1.221479223023317e-06, + "loss": 0.1536, + "step": 30625 + }, + { + "epoch": 0.7750082243085255, + "grad_norm": 3.5108680725097656, + "learning_rate": 1.2212162728173332e-06, + "loss": 0.1562, + "step": 30626 + }, + { + "epoch": 0.7750335298732192, + "grad_norm": 6.166781902313232, + "learning_rate": 1.2209533469800228e-06, + "loss": 0.1351, + "step": 30627 + }, + { + "epoch": 0.7750588354379128, + "grad_norm": 3.841503143310547, + "learning_rate": 1.2206904455130796e-06, + "loss": 0.1411, + "step": 30628 + }, + { + "epoch": 0.7750841410026065, + "grad_norm": 5.775191307067871, + "learning_rate": 1.220427568418202e-06, + "loss": 0.1649, + "step": 30629 + }, + { + "epoch": 0.7751094465673002, + "grad_norm": 2.6365854740142822, + "learning_rate": 1.2201647156970842e-06, + "loss": 0.1465, + "step": 30630 + }, + { + "epoch": 0.7751347521319938, + "grad_norm": 4.825596809387207, + "learning_rate": 1.2199018873514212e-06, + "loss": 0.1361, + "step": 30631 + }, + { + "epoch": 0.7751600576966875, + "grad_norm": 5.289078712463379, + "learning_rate": 1.2196390833829065e-06, + "loss": 0.1475, + "step": 30632 + }, + { + "epoch": 0.7751853632613812, + "grad_norm": 4.704672336578369, + "learning_rate": 1.219376303793237e-06, + "loss": 0.1486, + "step": 30633 + }, + { + "epoch": 0.7752106688260748, + "grad_norm": 4.3660664558410645, + "learning_rate": 1.2191135485841066e-06, + "loss": 0.1254, + "step": 30634 + }, + { + "epoch": 0.7752359743907685, + "grad_norm": 8.4290132522583, + "learning_rate": 1.21885081775721e-06, + "loss": 0.1979, + "step": 30635 + }, + { + "epoch": 0.7752612799554622, + "grad_norm": 7.4388556480407715, + "learning_rate": 1.2185881113142395e-06, + "loss": 0.1518, + "step": 30636 + }, + { + "epoch": 0.7752865855201558, + "grad_norm": 4.278407096862793, + "learning_rate": 1.2183254292568918e-06, + "loss": 0.1814, + "step": 30637 + }, + { + "epoch": 0.7753118910848495, + "grad_norm": 4.9262919425964355, + "learning_rate": 1.2180627715868593e-06, + "loss": 0.2095, + "step": 30638 + }, + { + "epoch": 0.7753371966495433, + "grad_norm": 5.3577375411987305, + "learning_rate": 1.2178001383058391e-06, + "loss": 0.164, + "step": 30639 + }, + { + "epoch": 0.7753625022142369, + "grad_norm": 7.426272869110107, + "learning_rate": 1.2175375294155189e-06, + "loss": 0.1585, + "step": 30640 + }, + { + "epoch": 0.7753878077789306, + "grad_norm": 2.7393624782562256, + "learning_rate": 1.2172749449175975e-06, + "loss": 0.1365, + "step": 30641 + }, + { + "epoch": 0.7754131133436243, + "grad_norm": 6.674220561981201, + "learning_rate": 1.2170123848137649e-06, + "loss": 0.1768, + "step": 30642 + }, + { + "epoch": 0.775438418908318, + "grad_norm": 5.978909492492676, + "learning_rate": 1.2167498491057188e-06, + "loss": 0.1486, + "step": 30643 + }, + { + "epoch": 0.7754637244730116, + "grad_norm": 4.159574508666992, + "learning_rate": 1.2164873377951459e-06, + "loss": 0.1195, + "step": 30644 + }, + { + "epoch": 0.7754890300377053, + "grad_norm": 2.7455027103424072, + "learning_rate": 1.2162248508837442e-06, + "loss": 0.0955, + "step": 30645 + }, + { + "epoch": 0.775514335602399, + "grad_norm": 8.202539443969727, + "learning_rate": 1.215962388373203e-06, + "loss": 0.2266, + "step": 30646 + }, + { + "epoch": 0.7755396411670926, + "grad_norm": 5.697546005249023, + "learning_rate": 1.2156999502652178e-06, + "loss": 0.1852, + "step": 30647 + }, + { + "epoch": 0.7755649467317863, + "grad_norm": 6.765842437744141, + "learning_rate": 1.2154375365614796e-06, + "loss": 0.0684, + "step": 30648 + }, + { + "epoch": 0.77559025229648, + "grad_norm": 5.227213382720947, + "learning_rate": 1.2151751472636807e-06, + "loss": 0.1841, + "step": 30649 + }, + { + "epoch": 0.7756155578611736, + "grad_norm": 7.433592796325684, + "learning_rate": 1.2149127823735123e-06, + "loss": 0.1976, + "step": 30650 + }, + { + "epoch": 0.7756408634258674, + "grad_norm": 10.975950241088867, + "learning_rate": 1.2146504418926686e-06, + "loss": 0.2374, + "step": 30651 + }, + { + "epoch": 0.7756661689905611, + "grad_norm": 5.920916557312012, + "learning_rate": 1.2143881258228396e-06, + "loss": 0.1563, + "step": 30652 + }, + { + "epoch": 0.7756914745552547, + "grad_norm": 4.710175037384033, + "learning_rate": 1.214125834165718e-06, + "loss": 0.2204, + "step": 30653 + }, + { + "epoch": 0.7757167801199484, + "grad_norm": 4.3412251472473145, + "learning_rate": 1.2138635669229941e-06, + "loss": 0.1345, + "step": 30654 + }, + { + "epoch": 0.7757420856846421, + "grad_norm": 3.951974868774414, + "learning_rate": 1.2136013240963585e-06, + "loss": 0.131, + "step": 30655 + }, + { + "epoch": 0.7757673912493357, + "grad_norm": 3.655750274658203, + "learning_rate": 1.2133391056875055e-06, + "loss": 0.1046, + "step": 30656 + }, + { + "epoch": 0.7757926968140294, + "grad_norm": 14.318041801452637, + "learning_rate": 1.213076911698124e-06, + "loss": 0.1446, + "step": 30657 + }, + { + "epoch": 0.7758180023787231, + "grad_norm": 4.53321647644043, + "learning_rate": 1.2128147421299047e-06, + "loss": 0.1258, + "step": 30658 + }, + { + "epoch": 0.7758433079434167, + "grad_norm": 11.386712074279785, + "learning_rate": 1.2125525969845376e-06, + "loss": 0.3244, + "step": 30659 + }, + { + "epoch": 0.7758686135081104, + "grad_norm": 12.762519836425781, + "learning_rate": 1.2122904762637154e-06, + "loss": 0.2126, + "step": 30660 + }, + { + "epoch": 0.7758939190728041, + "grad_norm": 2.4380080699920654, + "learning_rate": 1.212028379969128e-06, + "loss": 0.1155, + "step": 30661 + }, + { + "epoch": 0.7759192246374977, + "grad_norm": 6.03756856918335, + "learning_rate": 1.2117663081024645e-06, + "loss": 0.1787, + "step": 30662 + }, + { + "epoch": 0.7759445302021915, + "grad_norm": 5.941618919372559, + "learning_rate": 1.2115042606654138e-06, + "loss": 0.1469, + "step": 30663 + }, + { + "epoch": 0.7759698357668852, + "grad_norm": 6.721217155456543, + "learning_rate": 1.2112422376596694e-06, + "loss": 0.1846, + "step": 30664 + }, + { + "epoch": 0.7759951413315788, + "grad_norm": 7.8377861976623535, + "learning_rate": 1.2109802390869186e-06, + "loss": 0.1455, + "step": 30665 + }, + { + "epoch": 0.7760204468962725, + "grad_norm": 6.8997392654418945, + "learning_rate": 1.2107182649488513e-06, + "loss": 0.1299, + "step": 30666 + }, + { + "epoch": 0.7760457524609662, + "grad_norm": 11.404433250427246, + "learning_rate": 1.210456315247156e-06, + "loss": 0.2964, + "step": 30667 + }, + { + "epoch": 0.7760710580256598, + "grad_norm": 4.252996444702148, + "learning_rate": 1.2101943899835244e-06, + "loss": 0.1064, + "step": 30668 + }, + { + "epoch": 0.7760963635903535, + "grad_norm": 7.104998588562012, + "learning_rate": 1.2099324891596425e-06, + "loss": 0.1477, + "step": 30669 + }, + { + "epoch": 0.7761216691550472, + "grad_norm": 8.697662353515625, + "learning_rate": 1.209670612777204e-06, + "loss": 0.2255, + "step": 30670 + }, + { + "epoch": 0.7761469747197409, + "grad_norm": 3.742111921310425, + "learning_rate": 1.2094087608378918e-06, + "loss": 0.1736, + "step": 30671 + }, + { + "epoch": 0.7761722802844345, + "grad_norm": 3.6492743492126465, + "learning_rate": 1.2091469333433991e-06, + "loss": 0.1297, + "step": 30672 + }, + { + "epoch": 0.7761975858491282, + "grad_norm": 4.656661510467529, + "learning_rate": 1.2088851302954108e-06, + "loss": 0.1008, + "step": 30673 + }, + { + "epoch": 0.776222891413822, + "grad_norm": 4.1791181564331055, + "learning_rate": 1.2086233516956204e-06, + "loss": 0.1273, + "step": 30674 + }, + { + "epoch": 0.7762481969785155, + "grad_norm": 3.4554529190063477, + "learning_rate": 1.2083615975457097e-06, + "loss": 0.1533, + "step": 30675 + }, + { + "epoch": 0.7762735025432093, + "grad_norm": 9.757412910461426, + "learning_rate": 1.208099867847371e-06, + "loss": 0.196, + "step": 30676 + }, + { + "epoch": 0.776298808107903, + "grad_norm": 3.9353485107421875, + "learning_rate": 1.2078381626022894e-06, + "loss": 0.1146, + "step": 30677 + }, + { + "epoch": 0.7763241136725966, + "grad_norm": 5.395279884338379, + "learning_rate": 1.207576481812156e-06, + "loss": 0.1462, + "step": 30678 + }, + { + "epoch": 0.7763494192372903, + "grad_norm": 9.851056098937988, + "learning_rate": 1.207314825478656e-06, + "loss": 0.2173, + "step": 30679 + }, + { + "epoch": 0.776374724801984, + "grad_norm": 6.214598178863525, + "learning_rate": 1.2070531936034773e-06, + "loss": 0.1452, + "step": 30680 + }, + { + "epoch": 0.7764000303666776, + "grad_norm": 6.005332946777344, + "learning_rate": 1.2067915861883056e-06, + "loss": 0.1572, + "step": 30681 + }, + { + "epoch": 0.7764253359313713, + "grad_norm": 3.145110845565796, + "learning_rate": 1.2065300032348309e-06, + "loss": 0.126, + "step": 30682 + }, + { + "epoch": 0.776450641496065, + "grad_norm": 2.9912338256835938, + "learning_rate": 1.2062684447447388e-06, + "loss": 0.1126, + "step": 30683 + }, + { + "epoch": 0.7764759470607586, + "grad_norm": 4.1973876953125, + "learning_rate": 1.2060069107197153e-06, + "loss": 0.1419, + "step": 30684 + }, + { + "epoch": 0.7765012526254523, + "grad_norm": 6.682620048522949, + "learning_rate": 1.2057454011614479e-06, + "loss": 0.2142, + "step": 30685 + }, + { + "epoch": 0.776526558190146, + "grad_norm": 5.51235294342041, + "learning_rate": 1.205483916071621e-06, + "loss": 0.1528, + "step": 30686 + }, + { + "epoch": 0.7765518637548396, + "grad_norm": 8.682327270507812, + "learning_rate": 1.2052224554519238e-06, + "loss": 0.2107, + "step": 30687 + }, + { + "epoch": 0.7765771693195334, + "grad_norm": 6.638630390167236, + "learning_rate": 1.2049610193040413e-06, + "loss": 0.1564, + "step": 30688 + }, + { + "epoch": 0.7766024748842271, + "grad_norm": 3.7473559379577637, + "learning_rate": 1.2046996076296591e-06, + "loss": 0.2229, + "step": 30689 + }, + { + "epoch": 0.7766277804489207, + "grad_norm": 6.938689708709717, + "learning_rate": 1.2044382204304617e-06, + "loss": 0.2085, + "step": 30690 + }, + { + "epoch": 0.7766530860136144, + "grad_norm": 3.252002000808716, + "learning_rate": 1.2041768577081376e-06, + "loss": 0.1054, + "step": 30691 + }, + { + "epoch": 0.7766783915783081, + "grad_norm": 12.343043327331543, + "learning_rate": 1.2039155194643709e-06, + "loss": 0.1386, + "step": 30692 + }, + { + "epoch": 0.7767036971430017, + "grad_norm": 3.676248550415039, + "learning_rate": 1.2036542057008471e-06, + "loss": 0.1307, + "step": 30693 + }, + { + "epoch": 0.7767290027076954, + "grad_norm": 4.011347770690918, + "learning_rate": 1.2033929164192499e-06, + "loss": 0.1662, + "step": 30694 + }, + { + "epoch": 0.7767543082723891, + "grad_norm": 3.8721399307250977, + "learning_rate": 1.2031316516212666e-06, + "loss": 0.1921, + "step": 30695 + }, + { + "epoch": 0.7767796138370828, + "grad_norm": 6.45068359375, + "learning_rate": 1.202870411308581e-06, + "loss": 0.132, + "step": 30696 + }, + { + "epoch": 0.7768049194017764, + "grad_norm": 4.90690803527832, + "learning_rate": 1.2026091954828777e-06, + "loss": 0.1306, + "step": 30697 + }, + { + "epoch": 0.7768302249664701, + "grad_norm": 5.31131649017334, + "learning_rate": 1.2023480041458402e-06, + "loss": 0.1544, + "step": 30698 + }, + { + "epoch": 0.7768555305311639, + "grad_norm": 5.554655075073242, + "learning_rate": 1.2020868372991556e-06, + "loss": 0.1178, + "step": 30699 + }, + { + "epoch": 0.7768808360958575, + "grad_norm": 3.2879796028137207, + "learning_rate": 1.2018256949445045e-06, + "loss": 0.0998, + "step": 30700 + }, + { + "epoch": 0.7769061416605512, + "grad_norm": 7.888761043548584, + "learning_rate": 1.2015645770835765e-06, + "loss": 0.1602, + "step": 30701 + }, + { + "epoch": 0.7769314472252449, + "grad_norm": 6.141322135925293, + "learning_rate": 1.2013034837180488e-06, + "loss": 0.1769, + "step": 30702 + }, + { + "epoch": 0.7769567527899385, + "grad_norm": 5.395065784454346, + "learning_rate": 1.2010424148496098e-06, + "loss": 0.2624, + "step": 30703 + }, + { + "epoch": 0.7769820583546322, + "grad_norm": 11.861734390258789, + "learning_rate": 1.2007813704799399e-06, + "loss": 0.2603, + "step": 30704 + }, + { + "epoch": 0.7770073639193259, + "grad_norm": 4.936829090118408, + "learning_rate": 1.2005203506107272e-06, + "loss": 0.097, + "step": 30705 + }, + { + "epoch": 0.7770326694840195, + "grad_norm": 5.266484260559082, + "learning_rate": 1.2002593552436493e-06, + "loss": 0.1627, + "step": 30706 + }, + { + "epoch": 0.7770579750487132, + "grad_norm": 9.611189842224121, + "learning_rate": 1.1999983843803936e-06, + "loss": 0.1212, + "step": 30707 + }, + { + "epoch": 0.7770832806134069, + "grad_norm": 17.754953384399414, + "learning_rate": 1.1997374380226395e-06, + "loss": 0.398, + "step": 30708 + }, + { + "epoch": 0.7771085861781005, + "grad_norm": 2.910102128982544, + "learning_rate": 1.199476516172075e-06, + "loss": 0.1393, + "step": 30709 + }, + { + "epoch": 0.7771338917427942, + "grad_norm": 4.962179183959961, + "learning_rate": 1.1992156188303767e-06, + "loss": 0.1443, + "step": 30710 + }, + { + "epoch": 0.777159197307488, + "grad_norm": 7.1927170753479, + "learning_rate": 1.1989547459992311e-06, + "loss": 0.2071, + "step": 30711 + }, + { + "epoch": 0.7771845028721815, + "grad_norm": 4.252915859222412, + "learning_rate": 1.1986938976803191e-06, + "loss": 0.1066, + "step": 30712 + }, + { + "epoch": 0.7772098084368753, + "grad_norm": 4.567996025085449, + "learning_rate": 1.198433073875322e-06, + "loss": 0.1689, + "step": 30713 + }, + { + "epoch": 0.777235114001569, + "grad_norm": 4.811074733734131, + "learning_rate": 1.1981722745859243e-06, + "loss": 0.2141, + "step": 30714 + }, + { + "epoch": 0.7772604195662626, + "grad_norm": 5.911964416503906, + "learning_rate": 1.1979114998138065e-06, + "loss": 0.2369, + "step": 30715 + }, + { + "epoch": 0.7772857251309563, + "grad_norm": 14.25410270690918, + "learning_rate": 1.1976507495606498e-06, + "loss": 0.2134, + "step": 30716 + }, + { + "epoch": 0.77731103069565, + "grad_norm": 12.265928268432617, + "learning_rate": 1.1973900238281349e-06, + "loss": 0.0878, + "step": 30717 + }, + { + "epoch": 0.7773363362603436, + "grad_norm": 3.3391597270965576, + "learning_rate": 1.197129322617946e-06, + "loss": 0.1199, + "step": 30718 + }, + { + "epoch": 0.7773616418250373, + "grad_norm": 2.7411439418792725, + "learning_rate": 1.1968686459317625e-06, + "loss": 0.0906, + "step": 30719 + }, + { + "epoch": 0.777386947389731, + "grad_norm": 4.586977958679199, + "learning_rate": 1.1966079937712654e-06, + "loss": 0.127, + "step": 30720 + }, + { + "epoch": 0.7774122529544247, + "grad_norm": 5.70996618270874, + "learning_rate": 1.1963473661381347e-06, + "loss": 0.1677, + "step": 30721 + }, + { + "epoch": 0.7774375585191183, + "grad_norm": 6.611991882324219, + "learning_rate": 1.196086763034054e-06, + "loss": 0.209, + "step": 30722 + }, + { + "epoch": 0.777462864083812, + "grad_norm": 3.3586959838867188, + "learning_rate": 1.1958261844607016e-06, + "loss": 0.1189, + "step": 30723 + }, + { + "epoch": 0.7774881696485058, + "grad_norm": 7.834190845489502, + "learning_rate": 1.195565630419759e-06, + "loss": 0.2002, + "step": 30724 + }, + { + "epoch": 0.7775134752131994, + "grad_norm": 11.55223274230957, + "learning_rate": 1.1953051009129047e-06, + "loss": 0.259, + "step": 30725 + }, + { + "epoch": 0.7775387807778931, + "grad_norm": 5.83276891708374, + "learning_rate": 1.195044595941821e-06, + "loss": 0.1753, + "step": 30726 + }, + { + "epoch": 0.7775640863425868, + "grad_norm": 9.058501243591309, + "learning_rate": 1.1947841155081868e-06, + "loss": 0.259, + "step": 30727 + }, + { + "epoch": 0.7775893919072804, + "grad_norm": 4.66285514831543, + "learning_rate": 1.1945236596136827e-06, + "loss": 0.1502, + "step": 30728 + }, + { + "epoch": 0.7776146974719741, + "grad_norm": 6.604369163513184, + "learning_rate": 1.194263228259986e-06, + "loss": 0.1484, + "step": 30729 + }, + { + "epoch": 0.7776400030366678, + "grad_norm": 5.364692687988281, + "learning_rate": 1.1940028214487787e-06, + "loss": 0.1682, + "step": 30730 + }, + { + "epoch": 0.7776653086013614, + "grad_norm": 4.478705406188965, + "learning_rate": 1.1937424391817398e-06, + "loss": 0.1848, + "step": 30731 + }, + { + "epoch": 0.7776906141660551, + "grad_norm": 5.895469665527344, + "learning_rate": 1.1934820814605475e-06, + "loss": 0.1143, + "step": 30732 + }, + { + "epoch": 0.7777159197307488, + "grad_norm": 4.6604485511779785, + "learning_rate": 1.1932217482868803e-06, + "loss": 0.1443, + "step": 30733 + }, + { + "epoch": 0.7777412252954424, + "grad_norm": 8.808603286743164, + "learning_rate": 1.192961439662419e-06, + "loss": 0.2401, + "step": 30734 + }, + { + "epoch": 0.7777665308601361, + "grad_norm": 7.934029579162598, + "learning_rate": 1.1927011555888396e-06, + "loss": 0.172, + "step": 30735 + }, + { + "epoch": 0.7777918364248299, + "grad_norm": 4.239518165588379, + "learning_rate": 1.1924408960678252e-06, + "loss": 0.1696, + "step": 30736 + }, + { + "epoch": 0.7778171419895235, + "grad_norm": 5.890034198760986, + "learning_rate": 1.1921806611010483e-06, + "loss": 0.2004, + "step": 30737 + }, + { + "epoch": 0.7778424475542172, + "grad_norm": 7.795114994049072, + "learning_rate": 1.1919204506901921e-06, + "loss": 0.2117, + "step": 30738 + }, + { + "epoch": 0.7778677531189109, + "grad_norm": 4.9523539543151855, + "learning_rate": 1.1916602648369319e-06, + "loss": 0.1493, + "step": 30739 + }, + { + "epoch": 0.7778930586836045, + "grad_norm": 4.31355619430542, + "learning_rate": 1.1914001035429462e-06, + "loss": 0.1171, + "step": 30740 + }, + { + "epoch": 0.7779183642482982, + "grad_norm": 4.608513832092285, + "learning_rate": 1.1911399668099116e-06, + "loss": 0.2041, + "step": 30741 + }, + { + "epoch": 0.7779436698129919, + "grad_norm": 5.077084064483643, + "learning_rate": 1.1908798546395085e-06, + "loss": 0.1598, + "step": 30742 + }, + { + "epoch": 0.7779689753776855, + "grad_norm": 8.615941047668457, + "learning_rate": 1.1906197670334124e-06, + "loss": 0.2044, + "step": 30743 + }, + { + "epoch": 0.7779942809423792, + "grad_norm": 9.4210786819458, + "learning_rate": 1.1903597039932991e-06, + "loss": 0.1354, + "step": 30744 + }, + { + "epoch": 0.7780195865070729, + "grad_norm": 2.5640594959259033, + "learning_rate": 1.190099665520849e-06, + "loss": 0.1597, + "step": 30745 + }, + { + "epoch": 0.7780448920717666, + "grad_norm": 3.2544610500335693, + "learning_rate": 1.189839651617738e-06, + "loss": 0.131, + "step": 30746 + }, + { + "epoch": 0.7780701976364602, + "grad_norm": 9.243898391723633, + "learning_rate": 1.189579662285642e-06, + "loss": 0.1379, + "step": 30747 + }, + { + "epoch": 0.778095503201154, + "grad_norm": 4.327416896820068, + "learning_rate": 1.1893196975262361e-06, + "loss": 0.1748, + "step": 30748 + }, + { + "epoch": 0.7781208087658477, + "grad_norm": 7.4621262550354, + "learning_rate": 1.1890597573412022e-06, + "loss": 0.1287, + "step": 30749 + }, + { + "epoch": 0.7781461143305413, + "grad_norm": 5.181390762329102, + "learning_rate": 1.1887998417322095e-06, + "loss": 0.1205, + "step": 30750 + }, + { + "epoch": 0.778171419895235, + "grad_norm": 3.5837037563323975, + "learning_rate": 1.1885399507009399e-06, + "loss": 0.1309, + "step": 30751 + }, + { + "epoch": 0.7781967254599287, + "grad_norm": 4.856287002563477, + "learning_rate": 1.1882800842490648e-06, + "loss": 0.1209, + "step": 30752 + }, + { + "epoch": 0.7782220310246223, + "grad_norm": 6.311016082763672, + "learning_rate": 1.1880202423782644e-06, + "loss": 0.1692, + "step": 30753 + }, + { + "epoch": 0.778247336589316, + "grad_norm": 4.846467971801758, + "learning_rate": 1.1877604250902124e-06, + "loss": 0.1357, + "step": 30754 + }, + { + "epoch": 0.7782726421540097, + "grad_norm": 6.624057292938232, + "learning_rate": 1.1875006323865846e-06, + "loss": 0.1604, + "step": 30755 + }, + { + "epoch": 0.7782979477187033, + "grad_norm": 4.210023880004883, + "learning_rate": 1.1872408642690537e-06, + "loss": 0.1068, + "step": 30756 + }, + { + "epoch": 0.778323253283397, + "grad_norm": 12.175179481506348, + "learning_rate": 1.1869811207392994e-06, + "loss": 0.1967, + "step": 30757 + }, + { + "epoch": 0.7783485588480907, + "grad_norm": 4.2574639320373535, + "learning_rate": 1.1867214017989941e-06, + "loss": 0.1451, + "step": 30758 + }, + { + "epoch": 0.7783738644127843, + "grad_norm": 3.9708714485168457, + "learning_rate": 1.1864617074498136e-06, + "loss": 0.1557, + "step": 30759 + }, + { + "epoch": 0.778399169977478, + "grad_norm": 4.296503067016602, + "learning_rate": 1.1862020376934308e-06, + "loss": 0.1847, + "step": 30760 + }, + { + "epoch": 0.7784244755421718, + "grad_norm": 3.4346139430999756, + "learning_rate": 1.1859423925315227e-06, + "loss": 0.1222, + "step": 30761 + }, + { + "epoch": 0.7784497811068654, + "grad_norm": 2.9426825046539307, + "learning_rate": 1.1856827719657626e-06, + "loss": 0.1065, + "step": 30762 + }, + { + "epoch": 0.7784750866715591, + "grad_norm": 4.306545734405518, + "learning_rate": 1.1854231759978252e-06, + "loss": 0.1344, + "step": 30763 + }, + { + "epoch": 0.7785003922362528, + "grad_norm": 7.281080722808838, + "learning_rate": 1.1851636046293823e-06, + "loss": 0.1853, + "step": 30764 + }, + { + "epoch": 0.7785256978009464, + "grad_norm": 27.144973754882812, + "learning_rate": 1.1849040578621113e-06, + "loss": 0.3699, + "step": 30765 + }, + { + "epoch": 0.7785510033656401, + "grad_norm": 3.413816452026367, + "learning_rate": 1.1846445356976842e-06, + "loss": 0.0912, + "step": 30766 + }, + { + "epoch": 0.7785763089303338, + "grad_norm": 6.026965141296387, + "learning_rate": 1.1843850381377752e-06, + "loss": 0.1952, + "step": 30767 + }, + { + "epoch": 0.7786016144950274, + "grad_norm": 13.247769355773926, + "learning_rate": 1.1841255651840555e-06, + "loss": 0.1147, + "step": 30768 + }, + { + "epoch": 0.7786269200597211, + "grad_norm": 8.51147174835205, + "learning_rate": 1.1838661168382021e-06, + "loss": 0.1926, + "step": 30769 + }, + { + "epoch": 0.7786522256244148, + "grad_norm": 13.428900718688965, + "learning_rate": 1.1836066931018858e-06, + "loss": 0.2635, + "step": 30770 + }, + { + "epoch": 0.7786775311891084, + "grad_norm": 7.06602144241333, + "learning_rate": 1.1833472939767803e-06, + "loss": 0.1433, + "step": 30771 + }, + { + "epoch": 0.7787028367538021, + "grad_norm": 4.071608543395996, + "learning_rate": 1.1830879194645568e-06, + "loss": 0.1893, + "step": 30772 + }, + { + "epoch": 0.7787281423184959, + "grad_norm": 5.9256672859191895, + "learning_rate": 1.1828285695668907e-06, + "loss": 0.1612, + "step": 30773 + }, + { + "epoch": 0.7787534478831896, + "grad_norm": 5.900686264038086, + "learning_rate": 1.1825692442854526e-06, + "loss": 0.1665, + "step": 30774 + }, + { + "epoch": 0.7787787534478832, + "grad_norm": 5.673165321350098, + "learning_rate": 1.1823099436219143e-06, + "loss": 0.1917, + "step": 30775 + }, + { + "epoch": 0.7788040590125769, + "grad_norm": 5.250746726989746, + "learning_rate": 1.182050667577952e-06, + "loss": 0.2076, + "step": 30776 + }, + { + "epoch": 0.7788293645772706, + "grad_norm": 2.909627676010132, + "learning_rate": 1.1817914161552318e-06, + "loss": 0.1305, + "step": 30777 + }, + { + "epoch": 0.7788546701419642, + "grad_norm": 6.009337425231934, + "learning_rate": 1.1815321893554299e-06, + "loss": 0.1867, + "step": 30778 + }, + { + "epoch": 0.7788799757066579, + "grad_norm": 8.794660568237305, + "learning_rate": 1.1812729871802153e-06, + "loss": 0.1437, + "step": 30779 + }, + { + "epoch": 0.7789052812713516, + "grad_norm": 11.34207820892334, + "learning_rate": 1.1810138096312634e-06, + "loss": 0.2831, + "step": 30780 + }, + { + "epoch": 0.7789305868360452, + "grad_norm": 8.393757820129395, + "learning_rate": 1.1807546567102408e-06, + "loss": 0.223, + "step": 30781 + }, + { + "epoch": 0.7789558924007389, + "grad_norm": 3.9099652767181396, + "learning_rate": 1.1804955284188218e-06, + "loss": 0.1572, + "step": 30782 + }, + { + "epoch": 0.7789811979654326, + "grad_norm": 5.324121475219727, + "learning_rate": 1.1802364247586757e-06, + "loss": 0.1983, + "step": 30783 + }, + { + "epoch": 0.7790065035301262, + "grad_norm": 3.6216509342193604, + "learning_rate": 1.1799773457314767e-06, + "loss": 0.1649, + "step": 30784 + }, + { + "epoch": 0.77903180909482, + "grad_norm": 5.274791240692139, + "learning_rate": 1.1797182913388905e-06, + "loss": 0.1497, + "step": 30785 + }, + { + "epoch": 0.7790571146595137, + "grad_norm": 5.65487813949585, + "learning_rate": 1.1794592615825922e-06, + "loss": 0.2007, + "step": 30786 + }, + { + "epoch": 0.7790824202242073, + "grad_norm": 13.478641510009766, + "learning_rate": 1.1792002564642485e-06, + "loss": 0.1249, + "step": 30787 + }, + { + "epoch": 0.779107725788901, + "grad_norm": 3.7635605335235596, + "learning_rate": 1.1789412759855334e-06, + "loss": 0.1345, + "step": 30788 + }, + { + "epoch": 0.7791330313535947, + "grad_norm": 6.1343865394592285, + "learning_rate": 1.1786823201481156e-06, + "loss": 0.1615, + "step": 30789 + }, + { + "epoch": 0.7791583369182883, + "grad_norm": 9.762418746948242, + "learning_rate": 1.178423388953664e-06, + "loss": 0.286, + "step": 30790 + }, + { + "epoch": 0.779183642482982, + "grad_norm": 5.229928016662598, + "learning_rate": 1.1781644824038479e-06, + "loss": 0.1071, + "step": 30791 + }, + { + "epoch": 0.7792089480476757, + "grad_norm": 7.9908647537231445, + "learning_rate": 1.1779056005003398e-06, + "loss": 0.2315, + "step": 30792 + }, + { + "epoch": 0.7792342536123693, + "grad_norm": 3.4525084495544434, + "learning_rate": 1.1776467432448073e-06, + "loss": 0.1749, + "step": 30793 + }, + { + "epoch": 0.779259559177063, + "grad_norm": 6.944336414337158, + "learning_rate": 1.1773879106389203e-06, + "loss": 0.1712, + "step": 30794 + }, + { + "epoch": 0.7792848647417567, + "grad_norm": 3.6058599948883057, + "learning_rate": 1.1771291026843457e-06, + "loss": 0.1211, + "step": 30795 + }, + { + "epoch": 0.7793101703064503, + "grad_norm": 9.950101852416992, + "learning_rate": 1.1768703193827557e-06, + "loss": 0.1352, + "step": 30796 + }, + { + "epoch": 0.779335475871144, + "grad_norm": 3.990147590637207, + "learning_rate": 1.1766115607358187e-06, + "loss": 0.1751, + "step": 30797 + }, + { + "epoch": 0.7793607814358378, + "grad_norm": 5.206136226654053, + "learning_rate": 1.1763528267452019e-06, + "loss": 0.1267, + "step": 30798 + }, + { + "epoch": 0.7793860870005315, + "grad_norm": 7.308191299438477, + "learning_rate": 1.1760941174125728e-06, + "loss": 0.2124, + "step": 30799 + }, + { + "epoch": 0.7794113925652251, + "grad_norm": 3.5832929611206055, + "learning_rate": 1.1758354327396031e-06, + "loss": 0.1669, + "step": 30800 + }, + { + "epoch": 0.7794366981299188, + "grad_norm": 12.914328575134277, + "learning_rate": 1.1755767727279598e-06, + "loss": 0.1997, + "step": 30801 + }, + { + "epoch": 0.7794620036946125, + "grad_norm": 6.534108638763428, + "learning_rate": 1.1753181373793099e-06, + "loss": 0.2413, + "step": 30802 + }, + { + "epoch": 0.7794873092593061, + "grad_norm": 4.336752891540527, + "learning_rate": 1.1750595266953225e-06, + "loss": 0.0847, + "step": 30803 + }, + { + "epoch": 0.7795126148239998, + "grad_norm": 5.523990154266357, + "learning_rate": 1.1748009406776628e-06, + "loss": 0.2026, + "step": 30804 + }, + { + "epoch": 0.7795379203886935, + "grad_norm": 9.975321769714355, + "learning_rate": 1.174542379328002e-06, + "loss": 0.204, + "step": 30805 + }, + { + "epoch": 0.7795632259533871, + "grad_norm": 9.81297492980957, + "learning_rate": 1.174283842648004e-06, + "loss": 0.2563, + "step": 30806 + }, + { + "epoch": 0.7795885315180808, + "grad_norm": 6.5576300621032715, + "learning_rate": 1.1740253306393412e-06, + "loss": 0.2166, + "step": 30807 + }, + { + "epoch": 0.7796138370827745, + "grad_norm": 10.460617065429688, + "learning_rate": 1.1737668433036742e-06, + "loss": 0.1337, + "step": 30808 + }, + { + "epoch": 0.7796391426474681, + "grad_norm": 8.887516975402832, + "learning_rate": 1.1735083806426744e-06, + "loss": 0.1856, + "step": 30809 + }, + { + "epoch": 0.7796644482121619, + "grad_norm": 5.063439846038818, + "learning_rate": 1.1732499426580063e-06, + "loss": 0.1411, + "step": 30810 + }, + { + "epoch": 0.7796897537768556, + "grad_norm": 6.015508651733398, + "learning_rate": 1.17299152935134e-06, + "loss": 0.183, + "step": 30811 + }, + { + "epoch": 0.7797150593415492, + "grad_norm": 5.0297322273254395, + "learning_rate": 1.1727331407243364e-06, + "loss": 0.1999, + "step": 30812 + }, + { + "epoch": 0.7797403649062429, + "grad_norm": 7.31734561920166, + "learning_rate": 1.1724747767786666e-06, + "loss": 0.1843, + "step": 30813 + }, + { + "epoch": 0.7797656704709366, + "grad_norm": 9.932707786560059, + "learning_rate": 1.1722164375159934e-06, + "loss": 0.1708, + "step": 30814 + }, + { + "epoch": 0.7797909760356302, + "grad_norm": 5.0505781173706055, + "learning_rate": 1.1719581229379872e-06, + "loss": 0.1058, + "step": 30815 + }, + { + "epoch": 0.7798162816003239, + "grad_norm": 10.904544830322266, + "learning_rate": 1.1716998330463086e-06, + "loss": 0.2921, + "step": 30816 + }, + { + "epoch": 0.7798415871650176, + "grad_norm": 5.2490620613098145, + "learning_rate": 1.1714415678426267e-06, + "loss": 0.1905, + "step": 30817 + }, + { + "epoch": 0.7798668927297112, + "grad_norm": 3.3024306297302246, + "learning_rate": 1.1711833273286044e-06, + "loss": 0.1048, + "step": 30818 + }, + { + "epoch": 0.7798921982944049, + "grad_norm": 2.565045118331909, + "learning_rate": 1.1709251115059101e-06, + "loss": 0.0993, + "step": 30819 + }, + { + "epoch": 0.7799175038590986, + "grad_norm": 4.278376579284668, + "learning_rate": 1.1706669203762077e-06, + "loss": 0.1638, + "step": 30820 + }, + { + "epoch": 0.7799428094237922, + "grad_norm": 4.6342997550964355, + "learning_rate": 1.1704087539411617e-06, + "loss": 0.1473, + "step": 30821 + }, + { + "epoch": 0.779968114988486, + "grad_norm": 16.340560913085938, + "learning_rate": 1.1701506122024365e-06, + "loss": 0.1676, + "step": 30822 + }, + { + "epoch": 0.7799934205531797, + "grad_norm": 5.04256010055542, + "learning_rate": 1.1698924951616986e-06, + "loss": 0.1721, + "step": 30823 + }, + { + "epoch": 0.7800187261178734, + "grad_norm": 4.397945404052734, + "learning_rate": 1.1696344028206113e-06, + "loss": 0.2283, + "step": 30824 + }, + { + "epoch": 0.780044031682567, + "grad_norm": 6.42116117477417, + "learning_rate": 1.1693763351808396e-06, + "loss": 0.1301, + "step": 30825 + }, + { + "epoch": 0.7800693372472607, + "grad_norm": 2.460131883621216, + "learning_rate": 1.1691182922440459e-06, + "loss": 0.0763, + "step": 30826 + }, + { + "epoch": 0.7800946428119544, + "grad_norm": 5.411459922790527, + "learning_rate": 1.168860274011897e-06, + "loss": 0.1401, + "step": 30827 + }, + { + "epoch": 0.780119948376648, + "grad_norm": 4.9640212059021, + "learning_rate": 1.1686022804860558e-06, + "loss": 0.144, + "step": 30828 + }, + { + "epoch": 0.7801452539413417, + "grad_norm": 7.669331073760986, + "learning_rate": 1.1683443116681853e-06, + "loss": 0.1567, + "step": 30829 + }, + { + "epoch": 0.7801705595060354, + "grad_norm": 3.8503317832946777, + "learning_rate": 1.1680863675599486e-06, + "loss": 0.196, + "step": 30830 + }, + { + "epoch": 0.780195865070729, + "grad_norm": 6.380222320556641, + "learning_rate": 1.1678284481630114e-06, + "loss": 0.1762, + "step": 30831 + }, + { + "epoch": 0.7802211706354227, + "grad_norm": 4.765458106994629, + "learning_rate": 1.167570553479036e-06, + "loss": 0.1825, + "step": 30832 + }, + { + "epoch": 0.7802464762001164, + "grad_norm": 5.394248008728027, + "learning_rate": 1.167312683509685e-06, + "loss": 0.1663, + "step": 30833 + }, + { + "epoch": 0.78027178176481, + "grad_norm": 3.246711015701294, + "learning_rate": 1.167054838256621e-06, + "loss": 0.1549, + "step": 30834 + }, + { + "epoch": 0.7802970873295038, + "grad_norm": 4.048409461975098, + "learning_rate": 1.1667970177215066e-06, + "loss": 0.1303, + "step": 30835 + }, + { + "epoch": 0.7803223928941975, + "grad_norm": 4.932337760925293, + "learning_rate": 1.1665392219060068e-06, + "loss": 0.1619, + "step": 30836 + }, + { + "epoch": 0.7803476984588911, + "grad_norm": 4.081485748291016, + "learning_rate": 1.166281450811782e-06, + "loss": 0.1068, + "step": 30837 + }, + { + "epoch": 0.7803730040235848, + "grad_norm": 3.6666033267974854, + "learning_rate": 1.1660237044404954e-06, + "loss": 0.1408, + "step": 30838 + }, + { + "epoch": 0.7803983095882785, + "grad_norm": 4.9116926193237305, + "learning_rate": 1.1657659827938066e-06, + "loss": 0.2084, + "step": 30839 + }, + { + "epoch": 0.7804236151529721, + "grad_norm": 7.375484943389893, + "learning_rate": 1.1655082858733818e-06, + "loss": 0.2051, + "step": 30840 + }, + { + "epoch": 0.7804489207176658, + "grad_norm": 3.927201509475708, + "learning_rate": 1.1652506136808794e-06, + "loss": 0.1395, + "step": 30841 + }, + { + "epoch": 0.7804742262823595, + "grad_norm": 4.369362831115723, + "learning_rate": 1.1649929662179648e-06, + "loss": 0.1321, + "step": 30842 + }, + { + "epoch": 0.7804995318470531, + "grad_norm": 4.1484055519104, + "learning_rate": 1.1647353434862952e-06, + "loss": 0.1459, + "step": 30843 + }, + { + "epoch": 0.7805248374117468, + "grad_norm": 6.990333080291748, + "learning_rate": 1.164477745487535e-06, + "loss": 0.0918, + "step": 30844 + }, + { + "epoch": 0.7805501429764405, + "grad_norm": 3.3960771560668945, + "learning_rate": 1.1642201722233426e-06, + "loss": 0.1034, + "step": 30845 + }, + { + "epoch": 0.7805754485411341, + "grad_norm": 4.004838466644287, + "learning_rate": 1.1639626236953838e-06, + "loss": 0.1135, + "step": 30846 + }, + { + "epoch": 0.7806007541058279, + "grad_norm": 4.835702896118164, + "learning_rate": 1.163705099905314e-06, + "loss": 0.1689, + "step": 30847 + }, + { + "epoch": 0.7806260596705216, + "grad_norm": 8.591712951660156, + "learning_rate": 1.1634476008547973e-06, + "loss": 0.1555, + "step": 30848 + }, + { + "epoch": 0.7806513652352153, + "grad_norm": 11.303857803344727, + "learning_rate": 1.163190126545492e-06, + "loss": 0.2095, + "step": 30849 + }, + { + "epoch": 0.7806766707999089, + "grad_norm": 7.9746174812316895, + "learning_rate": 1.1629326769790617e-06, + "loss": 0.1537, + "step": 30850 + }, + { + "epoch": 0.7807019763646026, + "grad_norm": 8.157201766967773, + "learning_rate": 1.1626752521571643e-06, + "loss": 0.1981, + "step": 30851 + }, + { + "epoch": 0.7807272819292963, + "grad_norm": 3.8646676540374756, + "learning_rate": 1.1624178520814611e-06, + "loss": 0.1088, + "step": 30852 + }, + { + "epoch": 0.7807525874939899, + "grad_norm": 4.797389030456543, + "learning_rate": 1.1621604767536093e-06, + "loss": 0.1557, + "step": 30853 + }, + { + "epoch": 0.7807778930586836, + "grad_norm": 7.280797004699707, + "learning_rate": 1.1619031261752722e-06, + "loss": 0.1008, + "step": 30854 + }, + { + "epoch": 0.7808031986233773, + "grad_norm": 5.804296016693115, + "learning_rate": 1.1616458003481085e-06, + "loss": 0.1357, + "step": 30855 + }, + { + "epoch": 0.7808285041880709, + "grad_norm": 6.323599815368652, + "learning_rate": 1.1613884992737762e-06, + "loss": 0.1274, + "step": 30856 + }, + { + "epoch": 0.7808538097527646, + "grad_norm": 2.717792510986328, + "learning_rate": 1.1611312229539346e-06, + "loss": 0.1201, + "step": 30857 + }, + { + "epoch": 0.7808791153174583, + "grad_norm": 6.127384185791016, + "learning_rate": 1.1608739713902445e-06, + "loss": 0.1935, + "step": 30858 + }, + { + "epoch": 0.780904420882152, + "grad_norm": 4.704147815704346, + "learning_rate": 1.1606167445843648e-06, + "loss": 0.1123, + "step": 30859 + }, + { + "epoch": 0.7809297264468457, + "grad_norm": 12.210066795349121, + "learning_rate": 1.1603595425379527e-06, + "loss": 0.2468, + "step": 30860 + }, + { + "epoch": 0.7809550320115394, + "grad_norm": 4.487546443939209, + "learning_rate": 1.1601023652526684e-06, + "loss": 0.1717, + "step": 30861 + }, + { + "epoch": 0.780980337576233, + "grad_norm": 3.1950325965881348, + "learning_rate": 1.159845212730168e-06, + "loss": 0.1276, + "step": 30862 + }, + { + "epoch": 0.7810056431409267, + "grad_norm": 4.39141845703125, + "learning_rate": 1.1595880849721125e-06, + "loss": 0.119, + "step": 30863 + }, + { + "epoch": 0.7810309487056204, + "grad_norm": 6.419125080108643, + "learning_rate": 1.1593309819801595e-06, + "loss": 0.1868, + "step": 30864 + }, + { + "epoch": 0.781056254270314, + "grad_norm": 5.640473365783691, + "learning_rate": 1.1590739037559667e-06, + "loss": 0.1032, + "step": 30865 + }, + { + "epoch": 0.7810815598350077, + "grad_norm": 4.092022895812988, + "learning_rate": 1.1588168503011905e-06, + "loss": 0.1347, + "step": 30866 + }, + { + "epoch": 0.7811068653997014, + "grad_norm": 4.77345085144043, + "learning_rate": 1.1585598216174909e-06, + "loss": 0.1328, + "step": 30867 + }, + { + "epoch": 0.781132170964395, + "grad_norm": 4.007627010345459, + "learning_rate": 1.158302817706524e-06, + "loss": 0.1396, + "step": 30868 + }, + { + "epoch": 0.7811574765290887, + "grad_norm": 5.713175296783447, + "learning_rate": 1.158045838569949e-06, + "loss": 0.224, + "step": 30869 + }, + { + "epoch": 0.7811827820937824, + "grad_norm": 7.839700222015381, + "learning_rate": 1.1577888842094192e-06, + "loss": 0.2279, + "step": 30870 + }, + { + "epoch": 0.781208087658476, + "grad_norm": 7.33299446105957, + "learning_rate": 1.1575319546265962e-06, + "loss": 0.2474, + "step": 30871 + }, + { + "epoch": 0.7812333932231698, + "grad_norm": 5.9899139404296875, + "learning_rate": 1.1572750498231328e-06, + "loss": 0.2027, + "step": 30872 + }, + { + "epoch": 0.7812586987878635, + "grad_norm": 5.076332092285156, + "learning_rate": 1.157018169800691e-06, + "loss": 0.1249, + "step": 30873 + }, + { + "epoch": 0.7812840043525572, + "grad_norm": 3.0314528942108154, + "learning_rate": 1.1567613145609213e-06, + "loss": 0.0766, + "step": 30874 + }, + { + "epoch": 0.7813093099172508, + "grad_norm": 5.755103588104248, + "learning_rate": 1.1565044841054846e-06, + "loss": 0.1796, + "step": 30875 + }, + { + "epoch": 0.7813346154819445, + "grad_norm": 3.075141429901123, + "learning_rate": 1.1562476784360337e-06, + "loss": 0.1499, + "step": 30876 + }, + { + "epoch": 0.7813599210466382, + "grad_norm": 8.461410522460938, + "learning_rate": 1.15599089755423e-06, + "loss": 0.1955, + "step": 30877 + }, + { + "epoch": 0.7813852266113318, + "grad_norm": 2.029599666595459, + "learning_rate": 1.1557341414617228e-06, + "loss": 0.0572, + "step": 30878 + }, + { + "epoch": 0.7814105321760255, + "grad_norm": 3.2571840286254883, + "learning_rate": 1.1554774101601724e-06, + "loss": 0.1213, + "step": 30879 + }, + { + "epoch": 0.7814358377407192, + "grad_norm": 3.8726184368133545, + "learning_rate": 1.155220703651232e-06, + "loss": 0.1424, + "step": 30880 + }, + { + "epoch": 0.7814611433054128, + "grad_norm": 3.9061520099639893, + "learning_rate": 1.1549640219365588e-06, + "loss": 0.1352, + "step": 30881 + }, + { + "epoch": 0.7814864488701065, + "grad_norm": 4.106184959411621, + "learning_rate": 1.1547073650178075e-06, + "loss": 0.1116, + "step": 30882 + }, + { + "epoch": 0.7815117544348003, + "grad_norm": 3.7900021076202393, + "learning_rate": 1.1544507328966336e-06, + "loss": 0.1033, + "step": 30883 + }, + { + "epoch": 0.7815370599994939, + "grad_norm": 13.172739028930664, + "learning_rate": 1.1541941255746903e-06, + "loss": 0.1998, + "step": 30884 + }, + { + "epoch": 0.7815623655641876, + "grad_norm": 3.5341861248016357, + "learning_rate": 1.1539375430536348e-06, + "loss": 0.1243, + "step": 30885 + }, + { + "epoch": 0.7815876711288813, + "grad_norm": 3.145000457763672, + "learning_rate": 1.1536809853351206e-06, + "loss": 0.0835, + "step": 30886 + }, + { + "epoch": 0.7816129766935749, + "grad_norm": 4.8831024169921875, + "learning_rate": 1.1534244524208022e-06, + "loss": 0.1588, + "step": 30887 + }, + { + "epoch": 0.7816382822582686, + "grad_norm": 8.25165843963623, + "learning_rate": 1.1531679443123339e-06, + "loss": 0.2539, + "step": 30888 + }, + { + "epoch": 0.7816635878229623, + "grad_norm": 7.385580062866211, + "learning_rate": 1.152911461011369e-06, + "loss": 0.1646, + "step": 30889 + }, + { + "epoch": 0.7816888933876559, + "grad_norm": 8.492366790771484, + "learning_rate": 1.1526550025195633e-06, + "loss": 0.223, + "step": 30890 + }, + { + "epoch": 0.7817141989523496, + "grad_norm": 5.421797275543213, + "learning_rate": 1.1523985688385703e-06, + "loss": 0.1503, + "step": 30891 + }, + { + "epoch": 0.7817395045170433, + "grad_norm": 4.8185811042785645, + "learning_rate": 1.1521421599700434e-06, + "loss": 0.1121, + "step": 30892 + }, + { + "epoch": 0.7817648100817369, + "grad_norm": 6.562611103057861, + "learning_rate": 1.1518857759156338e-06, + "loss": 0.2121, + "step": 30893 + }, + { + "epoch": 0.7817901156464306, + "grad_norm": 5.575319290161133, + "learning_rate": 1.1516294166769992e-06, + "loss": 0.1116, + "step": 30894 + }, + { + "epoch": 0.7818154212111244, + "grad_norm": 3.986971616744995, + "learning_rate": 1.15137308225579e-06, + "loss": 0.1774, + "step": 30895 + }, + { + "epoch": 0.781840726775818, + "grad_norm": 9.458428382873535, + "learning_rate": 1.1511167726536603e-06, + "loss": 0.1718, + "step": 30896 + }, + { + "epoch": 0.7818660323405117, + "grad_norm": 5.539618492126465, + "learning_rate": 1.1508604878722612e-06, + "loss": 0.17, + "step": 30897 + }, + { + "epoch": 0.7818913379052054, + "grad_norm": 4.168415069580078, + "learning_rate": 1.1506042279132479e-06, + "loss": 0.0781, + "step": 30898 + }, + { + "epoch": 0.781916643469899, + "grad_norm": 5.466595649719238, + "learning_rate": 1.1503479927782723e-06, + "loss": 0.2074, + "step": 30899 + }, + { + "epoch": 0.7819419490345927, + "grad_norm": 4.094520092010498, + "learning_rate": 1.150091782468986e-06, + "loss": 0.132, + "step": 30900 + }, + { + "epoch": 0.7819672545992864, + "grad_norm": 3.3611950874328613, + "learning_rate": 1.1498355969870407e-06, + "loss": 0.0848, + "step": 30901 + }, + { + "epoch": 0.7819925601639801, + "grad_norm": 6.468541145324707, + "learning_rate": 1.1495794363340907e-06, + "loss": 0.1272, + "step": 30902 + }, + { + "epoch": 0.7820178657286737, + "grad_norm": 11.187296867370605, + "learning_rate": 1.149323300511785e-06, + "loss": 0.1915, + "step": 30903 + }, + { + "epoch": 0.7820431712933674, + "grad_norm": 2.456733226776123, + "learning_rate": 1.1490671895217797e-06, + "loss": 0.1241, + "step": 30904 + }, + { + "epoch": 0.7820684768580611, + "grad_norm": 5.90950870513916, + "learning_rate": 1.1488111033657211e-06, + "loss": 0.1983, + "step": 30905 + }, + { + "epoch": 0.7820937824227547, + "grad_norm": 6.2754998207092285, + "learning_rate": 1.148555042045265e-06, + "loss": 0.1574, + "step": 30906 + }, + { + "epoch": 0.7821190879874484, + "grad_norm": 6.266725063323975, + "learning_rate": 1.1482990055620597e-06, + "loss": 0.1516, + "step": 30907 + }, + { + "epoch": 0.7821443935521422, + "grad_norm": 6.737030982971191, + "learning_rate": 1.1480429939177607e-06, + "loss": 0.1994, + "step": 30908 + }, + { + "epoch": 0.7821696991168358, + "grad_norm": 3.7567899227142334, + "learning_rate": 1.1477870071140124e-06, + "loss": 0.1054, + "step": 30909 + }, + { + "epoch": 0.7821950046815295, + "grad_norm": 6.834616184234619, + "learning_rate": 1.1475310451524707e-06, + "loss": 0.13, + "step": 30910 + }, + { + "epoch": 0.7822203102462232, + "grad_norm": 2.6813106536865234, + "learning_rate": 1.1472751080347833e-06, + "loss": 0.0701, + "step": 30911 + }, + { + "epoch": 0.7822456158109168, + "grad_norm": 8.408685684204102, + "learning_rate": 1.1470191957626054e-06, + "loss": 0.1129, + "step": 30912 + }, + { + "epoch": 0.7822709213756105, + "grad_norm": 7.231362819671631, + "learning_rate": 1.1467633083375806e-06, + "loss": 0.1581, + "step": 30913 + }, + { + "epoch": 0.7822962269403042, + "grad_norm": 6.108741283416748, + "learning_rate": 1.146507445761364e-06, + "loss": 0.2196, + "step": 30914 + }, + { + "epoch": 0.7823215325049978, + "grad_norm": 3.861284017562866, + "learning_rate": 1.1462516080356022e-06, + "loss": 0.1197, + "step": 30915 + }, + { + "epoch": 0.7823468380696915, + "grad_norm": 5.508904457092285, + "learning_rate": 1.1459957951619493e-06, + "loss": 0.1678, + "step": 30916 + }, + { + "epoch": 0.7823721436343852, + "grad_norm": 4.500393390655518, + "learning_rate": 1.1457400071420522e-06, + "loss": 0.0884, + "step": 30917 + }, + { + "epoch": 0.7823974491990788, + "grad_norm": 5.442266941070557, + "learning_rate": 1.1454842439775605e-06, + "loss": 0.1548, + "step": 30918 + }, + { + "epoch": 0.7824227547637725, + "grad_norm": 4.916528224945068, + "learning_rate": 1.1452285056701246e-06, + "loss": 0.1469, + "step": 30919 + }, + { + "epoch": 0.7824480603284663, + "grad_norm": 5.195476055145264, + "learning_rate": 1.144972792221391e-06, + "loss": 0.1681, + "step": 30920 + }, + { + "epoch": 0.7824733658931599, + "grad_norm": 8.431530952453613, + "learning_rate": 1.1447171036330124e-06, + "loss": 0.2296, + "step": 30921 + }, + { + "epoch": 0.7824986714578536, + "grad_norm": 4.500288486480713, + "learning_rate": 1.1444614399066361e-06, + "loss": 0.1222, + "step": 30922 + }, + { + "epoch": 0.7825239770225473, + "grad_norm": 6.248860836029053, + "learning_rate": 1.1442058010439106e-06, + "loss": 0.0913, + "step": 30923 + }, + { + "epoch": 0.7825492825872409, + "grad_norm": 2.5715181827545166, + "learning_rate": 1.1439501870464837e-06, + "loss": 0.1033, + "step": 30924 + }, + { + "epoch": 0.7825745881519346, + "grad_norm": 5.3745951652526855, + "learning_rate": 1.1436945979160057e-06, + "loss": 0.1046, + "step": 30925 + }, + { + "epoch": 0.7825998937166283, + "grad_norm": 10.967752456665039, + "learning_rate": 1.143439033654124e-06, + "loss": 0.2706, + "step": 30926 + }, + { + "epoch": 0.782625199281322, + "grad_norm": 8.608013153076172, + "learning_rate": 1.1431834942624864e-06, + "loss": 0.164, + "step": 30927 + }, + { + "epoch": 0.7826505048460156, + "grad_norm": 8.693022727966309, + "learning_rate": 1.14292797974274e-06, + "loss": 0.2182, + "step": 30928 + }, + { + "epoch": 0.7826758104107093, + "grad_norm": 13.16859245300293, + "learning_rate": 1.1426724900965353e-06, + "loss": 0.1547, + "step": 30929 + }, + { + "epoch": 0.782701115975403, + "grad_norm": 10.054245948791504, + "learning_rate": 1.1424170253255174e-06, + "loss": 0.2213, + "step": 30930 + }, + { + "epoch": 0.7827264215400966, + "grad_norm": 4.077448844909668, + "learning_rate": 1.142161585431335e-06, + "loss": 0.1976, + "step": 30931 + }, + { + "epoch": 0.7827517271047904, + "grad_norm": 3.485635757446289, + "learning_rate": 1.1419061704156337e-06, + "loss": 0.1041, + "step": 30932 + }, + { + "epoch": 0.7827770326694841, + "grad_norm": 3.466895580291748, + "learning_rate": 1.1416507802800625e-06, + "loss": 0.0969, + "step": 30933 + }, + { + "epoch": 0.7828023382341777, + "grad_norm": 4.426273822784424, + "learning_rate": 1.141395415026267e-06, + "loss": 0.1604, + "step": 30934 + }, + { + "epoch": 0.7828276437988714, + "grad_norm": 3.9238359928131104, + "learning_rate": 1.1411400746558977e-06, + "loss": 0.1667, + "step": 30935 + }, + { + "epoch": 0.7828529493635651, + "grad_norm": 2.398613452911377, + "learning_rate": 1.1408847591705946e-06, + "loss": 0.1043, + "step": 30936 + }, + { + "epoch": 0.7828782549282587, + "grad_norm": 2.7759690284729004, + "learning_rate": 1.14062946857201e-06, + "loss": 0.1047, + "step": 30937 + }, + { + "epoch": 0.7829035604929524, + "grad_norm": 4.6615095138549805, + "learning_rate": 1.1403742028617859e-06, + "loss": 0.1665, + "step": 30938 + }, + { + "epoch": 0.7829288660576461, + "grad_norm": 3.732522487640381, + "learning_rate": 1.1401189620415741e-06, + "loss": 0.1435, + "step": 30939 + }, + { + "epoch": 0.7829541716223397, + "grad_norm": 6.448108673095703, + "learning_rate": 1.1398637461130142e-06, + "loss": 0.2202, + "step": 30940 + }, + { + "epoch": 0.7829794771870334, + "grad_norm": 3.119889259338379, + "learning_rate": 1.1396085550777564e-06, + "loss": 0.1008, + "step": 30941 + }, + { + "epoch": 0.7830047827517271, + "grad_norm": 3.6731066703796387, + "learning_rate": 1.139353388937443e-06, + "loss": 0.138, + "step": 30942 + }, + { + "epoch": 0.7830300883164207, + "grad_norm": 4.632686614990234, + "learning_rate": 1.1390982476937252e-06, + "loss": 0.1191, + "step": 30943 + }, + { + "epoch": 0.7830553938811144, + "grad_norm": 3.1220037937164307, + "learning_rate": 1.1388431313482412e-06, + "loss": 0.1184, + "step": 30944 + }, + { + "epoch": 0.7830806994458082, + "grad_norm": 6.451980113983154, + "learning_rate": 1.1385880399026412e-06, + "loss": 0.1849, + "step": 30945 + }, + { + "epoch": 0.7831060050105018, + "grad_norm": 8.312043190002441, + "learning_rate": 1.138332973358568e-06, + "loss": 0.2033, + "step": 30946 + }, + { + "epoch": 0.7831313105751955, + "grad_norm": 3.9765090942382812, + "learning_rate": 1.1380779317176665e-06, + "loss": 0.1293, + "step": 30947 + }, + { + "epoch": 0.7831566161398892, + "grad_norm": 3.626668691635132, + "learning_rate": 1.1378229149815833e-06, + "loss": 0.1091, + "step": 30948 + }, + { + "epoch": 0.7831819217045828, + "grad_norm": 4.077714920043945, + "learning_rate": 1.1375679231519614e-06, + "loss": 0.0976, + "step": 30949 + }, + { + "epoch": 0.7832072272692765, + "grad_norm": 4.624450206756592, + "learning_rate": 1.1373129562304458e-06, + "loss": 0.1311, + "step": 30950 + }, + { + "epoch": 0.7832325328339702, + "grad_norm": 8.285904884338379, + "learning_rate": 1.1370580142186787e-06, + "loss": 0.1942, + "step": 30951 + }, + { + "epoch": 0.7832578383986639, + "grad_norm": 5.247751235961914, + "learning_rate": 1.136803097118308e-06, + "loss": 0.168, + "step": 30952 + }, + { + "epoch": 0.7832831439633575, + "grad_norm": 3.523550271987915, + "learning_rate": 1.136548204930975e-06, + "loss": 0.1235, + "step": 30953 + }, + { + "epoch": 0.7833084495280512, + "grad_norm": 4.236613750457764, + "learning_rate": 1.136293337658324e-06, + "loss": 0.1303, + "step": 30954 + }, + { + "epoch": 0.7833337550927449, + "grad_norm": 4.520158767700195, + "learning_rate": 1.136038495301997e-06, + "loss": 0.1598, + "step": 30955 + }, + { + "epoch": 0.7833590606574385, + "grad_norm": 23.247661590576172, + "learning_rate": 1.1357836778636405e-06, + "loss": 0.3807, + "step": 30956 + }, + { + "epoch": 0.7833843662221323, + "grad_norm": 4.246890544891357, + "learning_rate": 1.1355288853448964e-06, + "loss": 0.1651, + "step": 30957 + }, + { + "epoch": 0.783409671786826, + "grad_norm": 4.091636657714844, + "learning_rate": 1.135274117747408e-06, + "loss": 0.0963, + "step": 30958 + }, + { + "epoch": 0.7834349773515196, + "grad_norm": 12.695673942565918, + "learning_rate": 1.135019375072816e-06, + "loss": 0.2196, + "step": 30959 + }, + { + "epoch": 0.7834602829162133, + "grad_norm": 4.605062007904053, + "learning_rate": 1.1347646573227667e-06, + "loss": 0.1752, + "step": 30960 + }, + { + "epoch": 0.783485588480907, + "grad_norm": 5.001620769500732, + "learning_rate": 1.134509964498901e-06, + "loss": 0.2215, + "step": 30961 + }, + { + "epoch": 0.7835108940456006, + "grad_norm": 9.234393119812012, + "learning_rate": 1.1342552966028619e-06, + "loss": 0.1621, + "step": 30962 + }, + { + "epoch": 0.7835361996102943, + "grad_norm": 4.381284236907959, + "learning_rate": 1.134000653636289e-06, + "loss": 0.1666, + "step": 30963 + }, + { + "epoch": 0.783561505174988, + "grad_norm": 4.587862968444824, + "learning_rate": 1.133746035600829e-06, + "loss": 0.1436, + "step": 30964 + }, + { + "epoch": 0.7835868107396816, + "grad_norm": 9.636401176452637, + "learning_rate": 1.1334914424981213e-06, + "loss": 0.1973, + "step": 30965 + }, + { + "epoch": 0.7836121163043753, + "grad_norm": 11.379311561584473, + "learning_rate": 1.1332368743298083e-06, + "loss": 0.2398, + "step": 30966 + }, + { + "epoch": 0.783637421869069, + "grad_norm": 3.9481918811798096, + "learning_rate": 1.1329823310975303e-06, + "loss": 0.0921, + "step": 30967 + }, + { + "epoch": 0.7836627274337626, + "grad_norm": 3.5065252780914307, + "learning_rate": 1.1327278128029305e-06, + "loss": 0.1118, + "step": 30968 + }, + { + "epoch": 0.7836880329984564, + "grad_norm": 1.9450198411941528, + "learning_rate": 1.132473319447649e-06, + "loss": 0.0866, + "step": 30969 + }, + { + "epoch": 0.7837133385631501, + "grad_norm": 5.339666366577148, + "learning_rate": 1.13221885103333e-06, + "loss": 0.1988, + "step": 30970 + }, + { + "epoch": 0.7837386441278437, + "grad_norm": 5.346200466156006, + "learning_rate": 1.1319644075616094e-06, + "loss": 0.2144, + "step": 30971 + }, + { + "epoch": 0.7837639496925374, + "grad_norm": 11.705438613891602, + "learning_rate": 1.131709989034132e-06, + "loss": 0.3107, + "step": 30972 + }, + { + "epoch": 0.7837892552572311, + "grad_norm": 4.679895401000977, + "learning_rate": 1.131455595452538e-06, + "loss": 0.13, + "step": 30973 + }, + { + "epoch": 0.7838145608219247, + "grad_norm": 7.967580318450928, + "learning_rate": 1.1312012268184668e-06, + "loss": 0.2002, + "step": 30974 + }, + { + "epoch": 0.7838398663866184, + "grad_norm": 2.9124038219451904, + "learning_rate": 1.130946883133558e-06, + "loss": 0.1199, + "step": 30975 + }, + { + "epoch": 0.7838651719513121, + "grad_norm": 6.655810832977295, + "learning_rate": 1.1306925643994543e-06, + "loss": 0.1587, + "step": 30976 + }, + { + "epoch": 0.7838904775160058, + "grad_norm": 4.151324272155762, + "learning_rate": 1.1304382706177946e-06, + "loss": 0.174, + "step": 30977 + }, + { + "epoch": 0.7839157830806994, + "grad_norm": 4.08595609664917, + "learning_rate": 1.1301840017902172e-06, + "loss": 0.0865, + "step": 30978 + }, + { + "epoch": 0.7839410886453931, + "grad_norm": 5.848142147064209, + "learning_rate": 1.129929757918365e-06, + "loss": 0.191, + "step": 30979 + }, + { + "epoch": 0.7839663942100868, + "grad_norm": 2.233607769012451, + "learning_rate": 1.1296755390038754e-06, + "loss": 0.1052, + "step": 30980 + }, + { + "epoch": 0.7839916997747804, + "grad_norm": 14.741573333740234, + "learning_rate": 1.1294213450483888e-06, + "loss": 0.2822, + "step": 30981 + }, + { + "epoch": 0.7840170053394742, + "grad_norm": 4.440541744232178, + "learning_rate": 1.1291671760535427e-06, + "loss": 0.1468, + "step": 30982 + }, + { + "epoch": 0.7840423109041679, + "grad_norm": 3.822016954421997, + "learning_rate": 1.1289130320209795e-06, + "loss": 0.1574, + "step": 30983 + }, + { + "epoch": 0.7840676164688615, + "grad_norm": 9.13054084777832, + "learning_rate": 1.1286589129523339e-06, + "loss": 0.1859, + "step": 30984 + }, + { + "epoch": 0.7840929220335552, + "grad_norm": 3.8918936252593994, + "learning_rate": 1.1284048188492481e-06, + "loss": 0.1253, + "step": 30985 + }, + { + "epoch": 0.7841182275982489, + "grad_norm": 4.435562610626221, + "learning_rate": 1.128150749713358e-06, + "loss": 0.1163, + "step": 30986 + }, + { + "epoch": 0.7841435331629425, + "grad_norm": 5.635768413543701, + "learning_rate": 1.1278967055463052e-06, + "loss": 0.1314, + "step": 30987 + }, + { + "epoch": 0.7841688387276362, + "grad_norm": 4.430227756500244, + "learning_rate": 1.1276426863497258e-06, + "loss": 0.1319, + "step": 30988 + }, + { + "epoch": 0.7841941442923299, + "grad_norm": 4.8174729347229, + "learning_rate": 1.1273886921252587e-06, + "loss": 0.1187, + "step": 30989 + }, + { + "epoch": 0.7842194498570235, + "grad_norm": 4.181949138641357, + "learning_rate": 1.1271347228745399e-06, + "loss": 0.1261, + "step": 30990 + }, + { + "epoch": 0.7842447554217172, + "grad_norm": 3.772141456604004, + "learning_rate": 1.126880778599211e-06, + "loss": 0.1148, + "step": 30991 + }, + { + "epoch": 0.7842700609864109, + "grad_norm": 4.699199676513672, + "learning_rate": 1.1266268593009066e-06, + "loss": 0.129, + "step": 30992 + }, + { + "epoch": 0.7842953665511045, + "grad_norm": 3.3780694007873535, + "learning_rate": 1.1263729649812655e-06, + "loss": 0.1358, + "step": 30993 + }, + { + "epoch": 0.7843206721157983, + "grad_norm": 3.930495023727417, + "learning_rate": 1.1261190956419233e-06, + "loss": 0.1652, + "step": 30994 + }, + { + "epoch": 0.784345977680492, + "grad_norm": 2.8510305881500244, + "learning_rate": 1.1258652512845191e-06, + "loss": 0.1132, + "step": 30995 + }, + { + "epoch": 0.7843712832451856, + "grad_norm": 9.69997501373291, + "learning_rate": 1.1256114319106904e-06, + "loss": 0.1911, + "step": 30996 + }, + { + "epoch": 0.7843965888098793, + "grad_norm": 2.533954620361328, + "learning_rate": 1.125357637522072e-06, + "loss": 0.1189, + "step": 30997 + }, + { + "epoch": 0.784421894374573, + "grad_norm": 5.4233012199401855, + "learning_rate": 1.1251038681203002e-06, + "loss": 0.1776, + "step": 30998 + }, + { + "epoch": 0.7844471999392666, + "grad_norm": 2.8313755989074707, + "learning_rate": 1.1248501237070148e-06, + "loss": 0.0542, + "step": 30999 + }, + { + "epoch": 0.7844725055039603, + "grad_norm": 3.4255106449127197, + "learning_rate": 1.124596404283848e-06, + "loss": 0.1385, + "step": 31000 + }, + { + "epoch": 0.784497811068654, + "grad_norm": 12.742751121520996, + "learning_rate": 1.1243427098524413e-06, + "loss": 0.2054, + "step": 31001 + }, + { + "epoch": 0.7845231166333477, + "grad_norm": 5.650260925292969, + "learning_rate": 1.1240890404144244e-06, + "loss": 0.1733, + "step": 31002 + }, + { + "epoch": 0.7845484221980413, + "grad_norm": 5.850780487060547, + "learning_rate": 1.1238353959714376e-06, + "loss": 0.1386, + "step": 31003 + }, + { + "epoch": 0.784573727762735, + "grad_norm": 4.5057291984558105, + "learning_rate": 1.123581776525116e-06, + "loss": 0.2139, + "step": 31004 + }, + { + "epoch": 0.7845990333274288, + "grad_norm": 4.861270904541016, + "learning_rate": 1.1233281820770942e-06, + "loss": 0.1408, + "step": 31005 + }, + { + "epoch": 0.7846243388921224, + "grad_norm": 9.611738204956055, + "learning_rate": 1.123074612629006e-06, + "loss": 0.2795, + "step": 31006 + }, + { + "epoch": 0.7846496444568161, + "grad_norm": 8.569293022155762, + "learning_rate": 1.1228210681824903e-06, + "loss": 0.2393, + "step": 31007 + }, + { + "epoch": 0.7846749500215098, + "grad_norm": 5.8335137367248535, + "learning_rate": 1.1225675487391801e-06, + "loss": 0.1775, + "step": 31008 + }, + { + "epoch": 0.7847002555862034, + "grad_norm": 4.238364219665527, + "learning_rate": 1.1223140543007094e-06, + "loss": 0.2036, + "step": 31009 + }, + { + "epoch": 0.7847255611508971, + "grad_norm": 3.4260942935943604, + "learning_rate": 1.1220605848687165e-06, + "loss": 0.1372, + "step": 31010 + }, + { + "epoch": 0.7847508667155908, + "grad_norm": 4.413149833679199, + "learning_rate": 1.121807140444831e-06, + "loss": 0.0657, + "step": 31011 + }, + { + "epoch": 0.7847761722802844, + "grad_norm": 5.468246936798096, + "learning_rate": 1.121553721030691e-06, + "loss": 0.1453, + "step": 31012 + }, + { + "epoch": 0.7848014778449781, + "grad_norm": 14.030463218688965, + "learning_rate": 1.1213003266279283e-06, + "loss": 0.1733, + "step": 31013 + }, + { + "epoch": 0.7848267834096718, + "grad_norm": 10.268375396728516, + "learning_rate": 1.121046957238181e-06, + "loss": 0.2666, + "step": 31014 + }, + { + "epoch": 0.7848520889743654, + "grad_norm": 3.9858083724975586, + "learning_rate": 1.1207936128630781e-06, + "loss": 0.089, + "step": 31015 + }, + { + "epoch": 0.7848773945390591, + "grad_norm": 5.7222981452941895, + "learning_rate": 1.1205402935042563e-06, + "loss": 0.1473, + "step": 31016 + }, + { + "epoch": 0.7849027001037528, + "grad_norm": 3.6074025630950928, + "learning_rate": 1.1202869991633474e-06, + "loss": 0.1423, + "step": 31017 + }, + { + "epoch": 0.7849280056684464, + "grad_norm": 5.629821300506592, + "learning_rate": 1.1200337298419888e-06, + "loss": 0.204, + "step": 31018 + }, + { + "epoch": 0.7849533112331402, + "grad_norm": 3.0771944522857666, + "learning_rate": 1.1197804855418081e-06, + "loss": 0.0899, + "step": 31019 + }, + { + "epoch": 0.7849786167978339, + "grad_norm": 3.0217432975769043, + "learning_rate": 1.1195272662644429e-06, + "loss": 0.0727, + "step": 31020 + }, + { + "epoch": 0.7850039223625275, + "grad_norm": 3.202056884765625, + "learning_rate": 1.119274072011523e-06, + "loss": 0.1275, + "step": 31021 + }, + { + "epoch": 0.7850292279272212, + "grad_norm": 3.2668497562408447, + "learning_rate": 1.1190209027846837e-06, + "loss": 0.088, + "step": 31022 + }, + { + "epoch": 0.7850545334919149, + "grad_norm": 4.397121906280518, + "learning_rate": 1.1187677585855572e-06, + "loss": 0.1498, + "step": 31023 + }, + { + "epoch": 0.7850798390566085, + "grad_norm": 6.305406093597412, + "learning_rate": 1.1185146394157752e-06, + "loss": 0.1907, + "step": 31024 + }, + { + "epoch": 0.7851051446213022, + "grad_norm": 8.090299606323242, + "learning_rate": 1.118261545276969e-06, + "loss": 0.2523, + "step": 31025 + }, + { + "epoch": 0.7851304501859959, + "grad_norm": 5.11118221282959, + "learning_rate": 1.1180084761707739e-06, + "loss": 0.1077, + "step": 31026 + }, + { + "epoch": 0.7851557557506895, + "grad_norm": 2.9389524459838867, + "learning_rate": 1.1177554320988193e-06, + "loss": 0.0728, + "step": 31027 + }, + { + "epoch": 0.7851810613153832, + "grad_norm": 4.446177959442139, + "learning_rate": 1.1175024130627383e-06, + "loss": 0.1354, + "step": 31028 + }, + { + "epoch": 0.785206366880077, + "grad_norm": 3.5636441707611084, + "learning_rate": 1.1172494190641597e-06, + "loss": 0.1571, + "step": 31029 + }, + { + "epoch": 0.7852316724447707, + "grad_norm": 5.208403587341309, + "learning_rate": 1.1169964501047198e-06, + "loss": 0.2136, + "step": 31030 + }, + { + "epoch": 0.7852569780094643, + "grad_norm": 4.729639530181885, + "learning_rate": 1.116743506186047e-06, + "loss": 0.1184, + "step": 31031 + }, + { + "epoch": 0.785282283574158, + "grad_norm": 7.566263198852539, + "learning_rate": 1.1164905873097732e-06, + "loss": 0.1688, + "step": 31032 + }, + { + "epoch": 0.7853075891388517, + "grad_norm": 5.945032119750977, + "learning_rate": 1.1162376934775276e-06, + "loss": 0.1718, + "step": 31033 + }, + { + "epoch": 0.7853328947035453, + "grad_norm": 4.006727695465088, + "learning_rate": 1.1159848246909445e-06, + "loss": 0.1252, + "step": 31034 + }, + { + "epoch": 0.785358200268239, + "grad_norm": 2.974144458770752, + "learning_rate": 1.115731980951652e-06, + "loss": 0.1149, + "step": 31035 + }, + { + "epoch": 0.7853835058329327, + "grad_norm": 5.5529937744140625, + "learning_rate": 1.115479162261282e-06, + "loss": 0.0811, + "step": 31036 + }, + { + "epoch": 0.7854088113976263, + "grad_norm": 3.942187547683716, + "learning_rate": 1.1152263686214626e-06, + "loss": 0.1101, + "step": 31037 + }, + { + "epoch": 0.78543411696232, + "grad_norm": 2.361065149307251, + "learning_rate": 1.1149736000338274e-06, + "loss": 0.0627, + "step": 31038 + }, + { + "epoch": 0.7854594225270137, + "grad_norm": 4.661205768585205, + "learning_rate": 1.114720856500005e-06, + "loss": 0.1117, + "step": 31039 + }, + { + "epoch": 0.7854847280917073, + "grad_norm": 5.523738861083984, + "learning_rate": 1.1144681380216249e-06, + "loss": 0.1626, + "step": 31040 + }, + { + "epoch": 0.785510033656401, + "grad_norm": 3.100862979888916, + "learning_rate": 1.1142154446003168e-06, + "loss": 0.0947, + "step": 31041 + }, + { + "epoch": 0.7855353392210948, + "grad_norm": 5.2708420753479, + "learning_rate": 1.1139627762377091e-06, + "loss": 0.144, + "step": 31042 + }, + { + "epoch": 0.7855606447857884, + "grad_norm": 2.499605655670166, + "learning_rate": 1.113710132935434e-06, + "loss": 0.0375, + "step": 31043 + }, + { + "epoch": 0.7855859503504821, + "grad_norm": 15.378581047058105, + "learning_rate": 1.1134575146951183e-06, + "loss": 0.1497, + "step": 31044 + }, + { + "epoch": 0.7856112559151758, + "grad_norm": 4.4632954597473145, + "learning_rate": 1.1132049215183943e-06, + "loss": 0.1196, + "step": 31045 + }, + { + "epoch": 0.7856365614798694, + "grad_norm": 5.462235450744629, + "learning_rate": 1.1129523534068865e-06, + "loss": 0.1995, + "step": 31046 + }, + { + "epoch": 0.7856618670445631, + "grad_norm": 3.9763176441192627, + "learning_rate": 1.1126998103622272e-06, + "loss": 0.1314, + "step": 31047 + }, + { + "epoch": 0.7856871726092568, + "grad_norm": 3.494124174118042, + "learning_rate": 1.1124472923860424e-06, + "loss": 0.1472, + "step": 31048 + }, + { + "epoch": 0.7857124781739504, + "grad_norm": 8.589776039123535, + "learning_rate": 1.1121947994799642e-06, + "loss": 0.2444, + "step": 31049 + }, + { + "epoch": 0.7857377837386441, + "grad_norm": 4.844642639160156, + "learning_rate": 1.1119423316456162e-06, + "loss": 0.14, + "step": 31050 + }, + { + "epoch": 0.7857630893033378, + "grad_norm": 3.5403201580047607, + "learning_rate": 1.11168988888463e-06, + "loss": 0.1485, + "step": 31051 + }, + { + "epoch": 0.7857883948680314, + "grad_norm": 7.561738967895508, + "learning_rate": 1.1114374711986315e-06, + "loss": 0.1951, + "step": 31052 + }, + { + "epoch": 0.7858137004327251, + "grad_norm": 6.275701999664307, + "learning_rate": 1.1111850785892502e-06, + "loss": 0.179, + "step": 31053 + }, + { + "epoch": 0.7858390059974188, + "grad_norm": 3.728652238845825, + "learning_rate": 1.1109327110581137e-06, + "loss": 0.1235, + "step": 31054 + }, + { + "epoch": 0.7858643115621126, + "grad_norm": 13.837447166442871, + "learning_rate": 1.1106803686068478e-06, + "loss": 0.3289, + "step": 31055 + }, + { + "epoch": 0.7858896171268062, + "grad_norm": 5.87838888168335, + "learning_rate": 1.1104280512370803e-06, + "loss": 0.1514, + "step": 31056 + }, + { + "epoch": 0.7859149226914999, + "grad_norm": 12.488808631896973, + "learning_rate": 1.1101757589504397e-06, + "loss": 0.2544, + "step": 31057 + }, + { + "epoch": 0.7859402282561936, + "grad_norm": 6.919739246368408, + "learning_rate": 1.1099234917485517e-06, + "loss": 0.1113, + "step": 31058 + }, + { + "epoch": 0.7859655338208872, + "grad_norm": 4.865778923034668, + "learning_rate": 1.1096712496330442e-06, + "loss": 0.1617, + "step": 31059 + }, + { + "epoch": 0.7859908393855809, + "grad_norm": 7.251163482666016, + "learning_rate": 1.1094190326055414e-06, + "loss": 0.1244, + "step": 31060 + }, + { + "epoch": 0.7860161449502746, + "grad_norm": 3.597583293914795, + "learning_rate": 1.1091668406676732e-06, + "loss": 0.1667, + "step": 31061 + }, + { + "epoch": 0.7860414505149682, + "grad_norm": 8.073445320129395, + "learning_rate": 1.1089146738210638e-06, + "loss": 0.2184, + "step": 31062 + }, + { + "epoch": 0.7860667560796619, + "grad_norm": 4.688511371612549, + "learning_rate": 1.10866253206734e-06, + "loss": 0.1407, + "step": 31063 + }, + { + "epoch": 0.7860920616443556, + "grad_norm": 3.427414655685425, + "learning_rate": 1.1084104154081266e-06, + "loss": 0.1229, + "step": 31064 + }, + { + "epoch": 0.7861173672090492, + "grad_norm": 5.880698204040527, + "learning_rate": 1.1081583238450517e-06, + "loss": 0.1389, + "step": 31065 + }, + { + "epoch": 0.786142672773743, + "grad_norm": 4.9375410079956055, + "learning_rate": 1.10790625737974e-06, + "loss": 0.1617, + "step": 31066 + }, + { + "epoch": 0.7861679783384367, + "grad_norm": 8.476361274719238, + "learning_rate": 1.1076542160138165e-06, + "loss": 0.1807, + "step": 31067 + }, + { + "epoch": 0.7861932839031303, + "grad_norm": 31.585311889648438, + "learning_rate": 1.1074021997489076e-06, + "loss": 0.2661, + "step": 31068 + }, + { + "epoch": 0.786218589467824, + "grad_norm": 2.39105224609375, + "learning_rate": 1.107150208586636e-06, + "loss": 0.1001, + "step": 31069 + }, + { + "epoch": 0.7862438950325177, + "grad_norm": 4.246090888977051, + "learning_rate": 1.10689824252863e-06, + "loss": 0.161, + "step": 31070 + }, + { + "epoch": 0.7862692005972113, + "grad_norm": 6.2365803718566895, + "learning_rate": 1.1066463015765127e-06, + "loss": 0.1628, + "step": 31071 + }, + { + "epoch": 0.786294506161905, + "grad_norm": 5.544473171234131, + "learning_rate": 1.10639438573191e-06, + "loss": 0.1507, + "step": 31072 + }, + { + "epoch": 0.7863198117265987, + "grad_norm": 4.883054256439209, + "learning_rate": 1.106142494996444e-06, + "loss": 0.1614, + "step": 31073 + }, + { + "epoch": 0.7863451172912923, + "grad_norm": 3.9675180912017822, + "learning_rate": 1.1058906293717414e-06, + "loss": 0.1195, + "step": 31074 + }, + { + "epoch": 0.786370422855986, + "grad_norm": 9.93370246887207, + "learning_rate": 1.105638788859425e-06, + "loss": 0.2704, + "step": 31075 + }, + { + "epoch": 0.7863957284206797, + "grad_norm": 3.7363154888153076, + "learning_rate": 1.1053869734611223e-06, + "loss": 0.1376, + "step": 31076 + }, + { + "epoch": 0.7864210339853733, + "grad_norm": 5.3504767417907715, + "learning_rate": 1.105135183178452e-06, + "loss": 0.1086, + "step": 31077 + }, + { + "epoch": 0.786446339550067, + "grad_norm": 9.622364044189453, + "learning_rate": 1.1048834180130413e-06, + "loss": 0.2441, + "step": 31078 + }, + { + "epoch": 0.7864716451147608, + "grad_norm": 3.673926591873169, + "learning_rate": 1.1046316779665122e-06, + "loss": 0.0966, + "step": 31079 + }, + { + "epoch": 0.7864969506794545, + "grad_norm": 4.595556259155273, + "learning_rate": 1.1043799630404912e-06, + "loss": 0.1319, + "step": 31080 + }, + { + "epoch": 0.7865222562441481, + "grad_norm": 10.250204086303711, + "learning_rate": 1.104128273236596e-06, + "loss": 0.1752, + "step": 31081 + }, + { + "epoch": 0.7865475618088418, + "grad_norm": 4.17288875579834, + "learning_rate": 1.103876608556455e-06, + "loss": 0.1768, + "step": 31082 + }, + { + "epoch": 0.7865728673735355, + "grad_norm": 14.872076988220215, + "learning_rate": 1.103624969001687e-06, + "loss": 0.1401, + "step": 31083 + }, + { + "epoch": 0.7865981729382291, + "grad_norm": 14.325316429138184, + "learning_rate": 1.1033733545739184e-06, + "loss": 0.2371, + "step": 31084 + }, + { + "epoch": 0.7866234785029228, + "grad_norm": 6.613048076629639, + "learning_rate": 1.1031217652747706e-06, + "loss": 0.2051, + "step": 31085 + }, + { + "epoch": 0.7866487840676165, + "grad_norm": 7.397062301635742, + "learning_rate": 1.102870201105865e-06, + "loss": 0.2108, + "step": 31086 + }, + { + "epoch": 0.7866740896323101, + "grad_norm": 8.536559104919434, + "learning_rate": 1.102618662068824e-06, + "loss": 0.1891, + "step": 31087 + }, + { + "epoch": 0.7866993951970038, + "grad_norm": 6.468826770782471, + "learning_rate": 1.1023671481652708e-06, + "loss": 0.0999, + "step": 31088 + }, + { + "epoch": 0.7867247007616975, + "grad_norm": 8.800790786743164, + "learning_rate": 1.1021156593968275e-06, + "loss": 0.2007, + "step": 31089 + }, + { + "epoch": 0.7867500063263911, + "grad_norm": 3.4233264923095703, + "learning_rate": 1.101864195765115e-06, + "loss": 0.1055, + "step": 31090 + }, + { + "epoch": 0.7867753118910848, + "grad_norm": 3.194530725479126, + "learning_rate": 1.1016127572717533e-06, + "loss": 0.0905, + "step": 31091 + }, + { + "epoch": 0.7868006174557786, + "grad_norm": 10.673026084899902, + "learning_rate": 1.101361343918368e-06, + "loss": 0.2135, + "step": 31092 + }, + { + "epoch": 0.7868259230204722, + "grad_norm": 7.9253621101379395, + "learning_rate": 1.1011099557065779e-06, + "loss": 0.2145, + "step": 31093 + }, + { + "epoch": 0.7868512285851659, + "grad_norm": 6.3947978019714355, + "learning_rate": 1.1008585926380043e-06, + "loss": 0.1525, + "step": 31094 + }, + { + "epoch": 0.7868765341498596, + "grad_norm": 3.561488389968872, + "learning_rate": 1.1006072547142683e-06, + "loss": 0.157, + "step": 31095 + }, + { + "epoch": 0.7869018397145532, + "grad_norm": 6.665791988372803, + "learning_rate": 1.1003559419369898e-06, + "loss": 0.2288, + "step": 31096 + }, + { + "epoch": 0.7869271452792469, + "grad_norm": 9.06185245513916, + "learning_rate": 1.1001046543077914e-06, + "loss": 0.1754, + "step": 31097 + }, + { + "epoch": 0.7869524508439406, + "grad_norm": 5.642940521240234, + "learning_rate": 1.0998533918282928e-06, + "loss": 0.2162, + "step": 31098 + }, + { + "epoch": 0.7869777564086342, + "grad_norm": 4.106991767883301, + "learning_rate": 1.0996021545001139e-06, + "loss": 0.1515, + "step": 31099 + }, + { + "epoch": 0.7870030619733279, + "grad_norm": 5.428935527801514, + "learning_rate": 1.0993509423248743e-06, + "loss": 0.1541, + "step": 31100 + }, + { + "epoch": 0.7870283675380216, + "grad_norm": 3.4701426029205322, + "learning_rate": 1.099099755304196e-06, + "loss": 0.1032, + "step": 31101 + }, + { + "epoch": 0.7870536731027152, + "grad_norm": 3.530465841293335, + "learning_rate": 1.0988485934396975e-06, + "loss": 0.1407, + "step": 31102 + }, + { + "epoch": 0.787078978667409, + "grad_norm": 7.696791648864746, + "learning_rate": 1.0985974567329993e-06, + "loss": 0.272, + "step": 31103 + }, + { + "epoch": 0.7871042842321027, + "grad_norm": 4.857187747955322, + "learning_rate": 1.0983463451857186e-06, + "loss": 0.1946, + "step": 31104 + }, + { + "epoch": 0.7871295897967964, + "grad_norm": 3.572767972946167, + "learning_rate": 1.098095258799478e-06, + "loss": 0.1179, + "step": 31105 + }, + { + "epoch": 0.78715489536149, + "grad_norm": 5.752684116363525, + "learning_rate": 1.0978441975758941e-06, + "loss": 0.1478, + "step": 31106 + }, + { + "epoch": 0.7871802009261837, + "grad_norm": 3.33577036857605, + "learning_rate": 1.0975931615165897e-06, + "loss": 0.1293, + "step": 31107 + }, + { + "epoch": 0.7872055064908774, + "grad_norm": 3.7036352157592773, + "learning_rate": 1.0973421506231785e-06, + "loss": 0.1334, + "step": 31108 + }, + { + "epoch": 0.787230812055571, + "grad_norm": 5.977309703826904, + "learning_rate": 1.0970911648972831e-06, + "loss": 0.151, + "step": 31109 + }, + { + "epoch": 0.7872561176202647, + "grad_norm": 8.000542640686035, + "learning_rate": 1.096840204340519e-06, + "loss": 0.2276, + "step": 31110 + }, + { + "epoch": 0.7872814231849584, + "grad_norm": 5.926831245422363, + "learning_rate": 1.0965892689545098e-06, + "loss": 0.1928, + "step": 31111 + }, + { + "epoch": 0.787306728749652, + "grad_norm": 4.353896617889404, + "learning_rate": 1.0963383587408671e-06, + "loss": 0.1067, + "step": 31112 + }, + { + "epoch": 0.7873320343143457, + "grad_norm": 4.081740856170654, + "learning_rate": 1.0960874737012139e-06, + "loss": 0.1103, + "step": 31113 + }, + { + "epoch": 0.7873573398790394, + "grad_norm": 14.150334358215332, + "learning_rate": 1.0958366138371651e-06, + "loss": 0.1891, + "step": 31114 + }, + { + "epoch": 0.787382645443733, + "grad_norm": 19.496732711791992, + "learning_rate": 1.0955857791503404e-06, + "loss": 0.1543, + "step": 31115 + }, + { + "epoch": 0.7874079510084268, + "grad_norm": 8.872725486755371, + "learning_rate": 1.0953349696423576e-06, + "loss": 0.3328, + "step": 31116 + }, + { + "epoch": 0.7874332565731205, + "grad_norm": 7.728531360626221, + "learning_rate": 1.0950841853148324e-06, + "loss": 0.1433, + "step": 31117 + }, + { + "epoch": 0.7874585621378141, + "grad_norm": 10.712478637695312, + "learning_rate": 1.0948334261693822e-06, + "loss": 0.3826, + "step": 31118 + }, + { + "epoch": 0.7874838677025078, + "grad_norm": 7.320733070373535, + "learning_rate": 1.0945826922076264e-06, + "loss": 0.1646, + "step": 31119 + }, + { + "epoch": 0.7875091732672015, + "grad_norm": 7.666868209838867, + "learning_rate": 1.09433198343118e-06, + "loss": 0.1495, + "step": 31120 + }, + { + "epoch": 0.7875344788318951, + "grad_norm": 3.9158287048339844, + "learning_rate": 1.09408129984166e-06, + "loss": 0.0659, + "step": 31121 + }, + { + "epoch": 0.7875597843965888, + "grad_norm": 6.02115535736084, + "learning_rate": 1.093830641440682e-06, + "loss": 0.1413, + "step": 31122 + }, + { + "epoch": 0.7875850899612825, + "grad_norm": 3.8619909286499023, + "learning_rate": 1.0935800082298647e-06, + "loss": 0.1399, + "step": 31123 + }, + { + "epoch": 0.7876103955259761, + "grad_norm": 7.891280651092529, + "learning_rate": 1.0933294002108236e-06, + "loss": 0.2068, + "step": 31124 + }, + { + "epoch": 0.7876357010906698, + "grad_norm": 7.634411334991455, + "learning_rate": 1.0930788173851741e-06, + "loss": 0.2186, + "step": 31125 + }, + { + "epoch": 0.7876610066553635, + "grad_norm": 4.658688545227051, + "learning_rate": 1.0928282597545326e-06, + "loss": 0.1832, + "step": 31126 + }, + { + "epoch": 0.7876863122200571, + "grad_norm": 4.090487003326416, + "learning_rate": 1.0925777273205136e-06, + "loss": 0.103, + "step": 31127 + }, + { + "epoch": 0.7877116177847509, + "grad_norm": 5.420191287994385, + "learning_rate": 1.0923272200847357e-06, + "loss": 0.2288, + "step": 31128 + }, + { + "epoch": 0.7877369233494446, + "grad_norm": 7.078511714935303, + "learning_rate": 1.0920767380488124e-06, + "loss": 0.2465, + "step": 31129 + }, + { + "epoch": 0.7877622289141383, + "grad_norm": 5.630694389343262, + "learning_rate": 1.0918262812143593e-06, + "loss": 0.1821, + "step": 31130 + }, + { + "epoch": 0.7877875344788319, + "grad_norm": 3.7183804512023926, + "learning_rate": 1.0915758495829897e-06, + "loss": 0.0859, + "step": 31131 + }, + { + "epoch": 0.7878128400435256, + "grad_norm": 5.061739921569824, + "learning_rate": 1.0913254431563226e-06, + "loss": 0.1779, + "step": 31132 + }, + { + "epoch": 0.7878381456082193, + "grad_norm": 4.502719879150391, + "learning_rate": 1.0910750619359706e-06, + "loss": 0.1994, + "step": 31133 + }, + { + "epoch": 0.7878634511729129, + "grad_norm": 1.66263747215271, + "learning_rate": 1.0908247059235477e-06, + "loss": 0.0456, + "step": 31134 + }, + { + "epoch": 0.7878887567376066, + "grad_norm": 2.123756170272827, + "learning_rate": 1.0905743751206682e-06, + "loss": 0.115, + "step": 31135 + }, + { + "epoch": 0.7879140623023003, + "grad_norm": 5.226870536804199, + "learning_rate": 1.0903240695289485e-06, + "loss": 0.1864, + "step": 31136 + }, + { + "epoch": 0.7879393678669939, + "grad_norm": 7.736822605133057, + "learning_rate": 1.0900737891500007e-06, + "loss": 0.183, + "step": 31137 + }, + { + "epoch": 0.7879646734316876, + "grad_norm": 5.291627883911133, + "learning_rate": 1.0898235339854424e-06, + "loss": 0.1652, + "step": 31138 + }, + { + "epoch": 0.7879899789963813, + "grad_norm": 3.361276865005493, + "learning_rate": 1.0895733040368816e-06, + "loss": 0.0961, + "step": 31139 + }, + { + "epoch": 0.788015284561075, + "grad_norm": 8.225361824035645, + "learning_rate": 1.0893230993059368e-06, + "loss": 0.2069, + "step": 31140 + }, + { + "epoch": 0.7880405901257687, + "grad_norm": 6.518982410430908, + "learning_rate": 1.0890729197942185e-06, + "loss": 0.1341, + "step": 31141 + }, + { + "epoch": 0.7880658956904624, + "grad_norm": 3.4164106845855713, + "learning_rate": 1.088822765503344e-06, + "loss": 0.1142, + "step": 31142 + }, + { + "epoch": 0.788091201255156, + "grad_norm": 5.809328079223633, + "learning_rate": 1.0885726364349214e-06, + "loss": 0.1655, + "step": 31143 + }, + { + "epoch": 0.7881165068198497, + "grad_norm": 2.782802104949951, + "learning_rate": 1.0883225325905677e-06, + "loss": 0.1055, + "step": 31144 + }, + { + "epoch": 0.7881418123845434, + "grad_norm": 3.0939886569976807, + "learning_rate": 1.088072453971893e-06, + "loss": 0.1167, + "step": 31145 + }, + { + "epoch": 0.788167117949237, + "grad_norm": 4.750271320343018, + "learning_rate": 1.0878224005805137e-06, + "loss": 0.1625, + "step": 31146 + }, + { + "epoch": 0.7881924235139307, + "grad_norm": 3.617154598236084, + "learning_rate": 1.0875723724180382e-06, + "loss": 0.107, + "step": 31147 + }, + { + "epoch": 0.7882177290786244, + "grad_norm": 2.749624013900757, + "learning_rate": 1.0873223694860812e-06, + "loss": 0.1397, + "step": 31148 + }, + { + "epoch": 0.788243034643318, + "grad_norm": 3.646787405014038, + "learning_rate": 1.0870723917862535e-06, + "loss": 0.1, + "step": 31149 + }, + { + "epoch": 0.7882683402080117, + "grad_norm": 5.443840980529785, + "learning_rate": 1.0868224393201692e-06, + "loss": 0.2075, + "step": 31150 + }, + { + "epoch": 0.7882936457727054, + "grad_norm": 6.842897891998291, + "learning_rate": 1.0865725120894394e-06, + "loss": 0.1453, + "step": 31151 + }, + { + "epoch": 0.788318951337399, + "grad_norm": 3.966947555541992, + "learning_rate": 1.0863226100956753e-06, + "loss": 0.1066, + "step": 31152 + }, + { + "epoch": 0.7883442569020928, + "grad_norm": 2.840913772583008, + "learning_rate": 1.0860727333404886e-06, + "loss": 0.1151, + "step": 31153 + }, + { + "epoch": 0.7883695624667865, + "grad_norm": 3.897129774093628, + "learning_rate": 1.0858228818254897e-06, + "loss": 0.1851, + "step": 31154 + }, + { + "epoch": 0.7883948680314801, + "grad_norm": 6.8325300216674805, + "learning_rate": 1.085573055552292e-06, + "loss": 0.2348, + "step": 31155 + }, + { + "epoch": 0.7884201735961738, + "grad_norm": 4.165457248687744, + "learning_rate": 1.0853232545225057e-06, + "loss": 0.0772, + "step": 31156 + }, + { + "epoch": 0.7884454791608675, + "grad_norm": 3.304741382598877, + "learning_rate": 1.0850734787377415e-06, + "loss": 0.1696, + "step": 31157 + }, + { + "epoch": 0.7884707847255612, + "grad_norm": 5.461760997772217, + "learning_rate": 1.0848237281996089e-06, + "loss": 0.1556, + "step": 31158 + }, + { + "epoch": 0.7884960902902548, + "grad_norm": 9.325780868530273, + "learning_rate": 1.0845740029097207e-06, + "loss": 0.281, + "step": 31159 + }, + { + "epoch": 0.7885213958549485, + "grad_norm": 7.629783630371094, + "learning_rate": 1.084324302869687e-06, + "loss": 0.2277, + "step": 31160 + }, + { + "epoch": 0.7885467014196422, + "grad_norm": 6.2518391609191895, + "learning_rate": 1.0840746280811176e-06, + "loss": 0.1415, + "step": 31161 + }, + { + "epoch": 0.7885720069843358, + "grad_norm": 8.429323196411133, + "learning_rate": 1.0838249785456206e-06, + "loss": 0.1204, + "step": 31162 + }, + { + "epoch": 0.7885973125490295, + "grad_norm": 6.656256675720215, + "learning_rate": 1.0835753542648097e-06, + "loss": 0.2034, + "step": 31163 + }, + { + "epoch": 0.7886226181137233, + "grad_norm": 4.761111736297607, + "learning_rate": 1.0833257552402927e-06, + "loss": 0.1581, + "step": 31164 + }, + { + "epoch": 0.7886479236784169, + "grad_norm": 7.452926158905029, + "learning_rate": 1.083076181473679e-06, + "loss": 0.2119, + "step": 31165 + }, + { + "epoch": 0.7886732292431106, + "grad_norm": 5.585641384124756, + "learning_rate": 1.0828266329665771e-06, + "loss": 0.1589, + "step": 31166 + }, + { + "epoch": 0.7886985348078043, + "grad_norm": 3.6086509227752686, + "learning_rate": 1.0825771097205994e-06, + "loss": 0.1415, + "step": 31167 + }, + { + "epoch": 0.7887238403724979, + "grad_norm": 2.612858533859253, + "learning_rate": 1.0823276117373533e-06, + "loss": 0.0963, + "step": 31168 + }, + { + "epoch": 0.7887491459371916, + "grad_norm": 11.85982894897461, + "learning_rate": 1.082078139018447e-06, + "loss": 0.2544, + "step": 31169 + }, + { + "epoch": 0.7887744515018853, + "grad_norm": 4.297908782958984, + "learning_rate": 1.081828691565489e-06, + "loss": 0.1382, + "step": 31170 + }, + { + "epoch": 0.7887997570665789, + "grad_norm": 8.699752807617188, + "learning_rate": 1.0815792693800902e-06, + "loss": 0.2338, + "step": 31171 + }, + { + "epoch": 0.7888250626312726, + "grad_norm": 6.5933518409729, + "learning_rate": 1.0813298724638565e-06, + "loss": 0.1188, + "step": 31172 + }, + { + "epoch": 0.7888503681959663, + "grad_norm": 5.356137752532959, + "learning_rate": 1.0810805008184006e-06, + "loss": 0.1321, + "step": 31173 + }, + { + "epoch": 0.7888756737606599, + "grad_norm": 5.335243225097656, + "learning_rate": 1.0808311544453242e-06, + "loss": 0.2245, + "step": 31174 + }, + { + "epoch": 0.7889009793253536, + "grad_norm": 3.4113235473632812, + "learning_rate": 1.0805818333462403e-06, + "loss": 0.093, + "step": 31175 + }, + { + "epoch": 0.7889262848900473, + "grad_norm": 3.8924546241760254, + "learning_rate": 1.080332537522754e-06, + "loss": 0.1597, + "step": 31176 + }, + { + "epoch": 0.788951590454741, + "grad_norm": 3.7526743412017822, + "learning_rate": 1.0800832669764766e-06, + "loss": 0.0447, + "step": 31177 + }, + { + "epoch": 0.7889768960194347, + "grad_norm": 4.260007381439209, + "learning_rate": 1.0798340217090103e-06, + "loss": 0.1137, + "step": 31178 + }, + { + "epoch": 0.7890022015841284, + "grad_norm": 4.979395389556885, + "learning_rate": 1.0795848017219669e-06, + "loss": 0.1904, + "step": 31179 + }, + { + "epoch": 0.789027507148822, + "grad_norm": 3.7496964931488037, + "learning_rate": 1.0793356070169513e-06, + "loss": 0.1548, + "step": 31180 + }, + { + "epoch": 0.7890528127135157, + "grad_norm": 4.253621578216553, + "learning_rate": 1.07908643759557e-06, + "loss": 0.1492, + "step": 31181 + }, + { + "epoch": 0.7890781182782094, + "grad_norm": 6.5801568031311035, + "learning_rate": 1.0788372934594322e-06, + "loss": 0.1887, + "step": 31182 + }, + { + "epoch": 0.7891034238429031, + "grad_norm": 2.970360517501831, + "learning_rate": 1.0785881746101428e-06, + "loss": 0.1272, + "step": 31183 + }, + { + "epoch": 0.7891287294075967, + "grad_norm": 4.908862590789795, + "learning_rate": 1.0783390810493095e-06, + "loss": 0.1259, + "step": 31184 + }, + { + "epoch": 0.7891540349722904, + "grad_norm": 3.4605419635772705, + "learning_rate": 1.0780900127785364e-06, + "loss": 0.1414, + "step": 31185 + }, + { + "epoch": 0.7891793405369841, + "grad_norm": 4.028273105621338, + "learning_rate": 1.077840969799433e-06, + "loss": 0.1486, + "step": 31186 + }, + { + "epoch": 0.7892046461016777, + "grad_norm": 5.651762962341309, + "learning_rate": 1.077591952113603e-06, + "loss": 0.2059, + "step": 31187 + }, + { + "epoch": 0.7892299516663714, + "grad_norm": 10.906808853149414, + "learning_rate": 1.077342959722653e-06, + "loss": 0.2357, + "step": 31188 + }, + { + "epoch": 0.7892552572310652, + "grad_norm": 4.945157527923584, + "learning_rate": 1.0770939926281876e-06, + "loss": 0.1627, + "step": 31189 + }, + { + "epoch": 0.7892805627957588, + "grad_norm": 8.204375267028809, + "learning_rate": 1.0768450508318145e-06, + "loss": 0.2012, + "step": 31190 + }, + { + "epoch": 0.7893058683604525, + "grad_norm": 3.9969840049743652, + "learning_rate": 1.0765961343351378e-06, + "loss": 0.1909, + "step": 31191 + }, + { + "epoch": 0.7893311739251462, + "grad_norm": 5.1660614013671875, + "learning_rate": 1.0763472431397631e-06, + "loss": 0.1511, + "step": 31192 + }, + { + "epoch": 0.7893564794898398, + "grad_norm": 2.9040234088897705, + "learning_rate": 1.076098377247294e-06, + "loss": 0.1415, + "step": 31193 + }, + { + "epoch": 0.7893817850545335, + "grad_norm": 3.046403169631958, + "learning_rate": 1.0758495366593369e-06, + "loss": 0.1147, + "step": 31194 + }, + { + "epoch": 0.7894070906192272, + "grad_norm": 4.532692909240723, + "learning_rate": 1.075600721377497e-06, + "loss": 0.1603, + "step": 31195 + }, + { + "epoch": 0.7894323961839208, + "grad_norm": 30.05088233947754, + "learning_rate": 1.0753519314033778e-06, + "loss": 0.206, + "step": 31196 + }, + { + "epoch": 0.7894577017486145, + "grad_norm": 4.953620433807373, + "learning_rate": 1.075103166738583e-06, + "loss": 0.1523, + "step": 31197 + }, + { + "epoch": 0.7894830073133082, + "grad_norm": 3.510833978652954, + "learning_rate": 1.0748544273847184e-06, + "loss": 0.1241, + "step": 31198 + }, + { + "epoch": 0.7895083128780018, + "grad_norm": 11.954872131347656, + "learning_rate": 1.074605713343388e-06, + "loss": 0.2771, + "step": 31199 + }, + { + "epoch": 0.7895336184426955, + "grad_norm": 3.4644925594329834, + "learning_rate": 1.0743570246161949e-06, + "loss": 0.1345, + "step": 31200 + }, + { + "epoch": 0.7895589240073893, + "grad_norm": 5.126461982727051, + "learning_rate": 1.074108361204742e-06, + "loss": 0.1514, + "step": 31201 + }, + { + "epoch": 0.7895842295720829, + "grad_norm": 6.258893013000488, + "learning_rate": 1.0738597231106357e-06, + "loss": 0.1788, + "step": 31202 + }, + { + "epoch": 0.7896095351367766, + "grad_norm": 4.908411979675293, + "learning_rate": 1.0736111103354757e-06, + "loss": 0.2013, + "step": 31203 + }, + { + "epoch": 0.7896348407014703, + "grad_norm": 7.397884368896484, + "learning_rate": 1.0733625228808702e-06, + "loss": 0.1949, + "step": 31204 + }, + { + "epoch": 0.7896601462661639, + "grad_norm": 4.237577438354492, + "learning_rate": 1.0731139607484171e-06, + "loss": 0.1113, + "step": 31205 + }, + { + "epoch": 0.7896854518308576, + "grad_norm": 7.9993391036987305, + "learning_rate": 1.0728654239397225e-06, + "loss": 0.1642, + "step": 31206 + }, + { + "epoch": 0.7897107573955513, + "grad_norm": 6.1215691566467285, + "learning_rate": 1.0726169124563867e-06, + "loss": 0.1926, + "step": 31207 + }, + { + "epoch": 0.789736062960245, + "grad_norm": 4.9728217124938965, + "learning_rate": 1.0723684263000173e-06, + "loss": 0.13, + "step": 31208 + }, + { + "epoch": 0.7897613685249386, + "grad_norm": 3.3953158855438232, + "learning_rate": 1.0721199654722098e-06, + "loss": 0.0792, + "step": 31209 + }, + { + "epoch": 0.7897866740896323, + "grad_norm": 7.167513370513916, + "learning_rate": 1.071871529974572e-06, + "loss": 0.1845, + "step": 31210 + }, + { + "epoch": 0.789811979654326, + "grad_norm": 8.472530364990234, + "learning_rate": 1.071623119808704e-06, + "loss": 0.1606, + "step": 31211 + }, + { + "epoch": 0.7898372852190196, + "grad_norm": 9.346219062805176, + "learning_rate": 1.071374734976206e-06, + "loss": 0.1833, + "step": 31212 + }, + { + "epoch": 0.7898625907837133, + "grad_norm": 6.145358562469482, + "learning_rate": 1.0711263754786832e-06, + "loss": 0.1685, + "step": 31213 + }, + { + "epoch": 0.7898878963484071, + "grad_norm": 3.7311291694641113, + "learning_rate": 1.0708780413177355e-06, + "loss": 0.1325, + "step": 31214 + }, + { + "epoch": 0.7899132019131007, + "grad_norm": 2.874312400817871, + "learning_rate": 1.0706297324949644e-06, + "loss": 0.1202, + "step": 31215 + }, + { + "epoch": 0.7899385074777944, + "grad_norm": 7.115776062011719, + "learning_rate": 1.0703814490119695e-06, + "loss": 0.1503, + "step": 31216 + }, + { + "epoch": 0.7899638130424881, + "grad_norm": 3.1744675636291504, + "learning_rate": 1.070133190870357e-06, + "loss": 0.0804, + "step": 31217 + }, + { + "epoch": 0.7899891186071817, + "grad_norm": 4.423865795135498, + "learning_rate": 1.0698849580717219e-06, + "loss": 0.2128, + "step": 31218 + }, + { + "epoch": 0.7900144241718754, + "grad_norm": 3.1569316387176514, + "learning_rate": 1.0696367506176685e-06, + "loss": 0.1547, + "step": 31219 + }, + { + "epoch": 0.7900397297365691, + "grad_norm": 2.6722731590270996, + "learning_rate": 1.0693885685097954e-06, + "loss": 0.1502, + "step": 31220 + }, + { + "epoch": 0.7900650353012627, + "grad_norm": 9.904237747192383, + "learning_rate": 1.0691404117497072e-06, + "loss": 0.2183, + "step": 31221 + }, + { + "epoch": 0.7900903408659564, + "grad_norm": 8.570364952087402, + "learning_rate": 1.0688922803389985e-06, + "loss": 0.1263, + "step": 31222 + }, + { + "epoch": 0.7901156464306501, + "grad_norm": 6.6647047996521, + "learning_rate": 1.0686441742792735e-06, + "loss": 0.1492, + "step": 31223 + }, + { + "epoch": 0.7901409519953437, + "grad_norm": 5.16624641418457, + "learning_rate": 1.0683960935721294e-06, + "loss": 0.1817, + "step": 31224 + }, + { + "epoch": 0.7901662575600374, + "grad_norm": 4.6010026931762695, + "learning_rate": 1.0681480382191695e-06, + "loss": 0.133, + "step": 31225 + }, + { + "epoch": 0.7901915631247312, + "grad_norm": 9.344975471496582, + "learning_rate": 1.0679000082219908e-06, + "loss": 0.2314, + "step": 31226 + }, + { + "epoch": 0.7902168686894248, + "grad_norm": 2.936701774597168, + "learning_rate": 1.0676520035821941e-06, + "loss": 0.0846, + "step": 31227 + }, + { + "epoch": 0.7902421742541185, + "grad_norm": 6.442620754241943, + "learning_rate": 1.0674040243013762e-06, + "loss": 0.1146, + "step": 31228 + }, + { + "epoch": 0.7902674798188122, + "grad_norm": 19.13218879699707, + "learning_rate": 1.06715607038114e-06, + "loss": 0.2202, + "step": 31229 + }, + { + "epoch": 0.7902927853835058, + "grad_norm": 40.92717742919922, + "learning_rate": 1.0669081418230826e-06, + "loss": 0.2831, + "step": 31230 + }, + { + "epoch": 0.7903180909481995, + "grad_norm": 7.821783065795898, + "learning_rate": 1.0666602386288032e-06, + "loss": 0.2166, + "step": 31231 + }, + { + "epoch": 0.7903433965128932, + "grad_norm": 8.866422653198242, + "learning_rate": 1.0664123607998988e-06, + "loss": 0.201, + "step": 31232 + }, + { + "epoch": 0.7903687020775869, + "grad_norm": 5.610972881317139, + "learning_rate": 1.0661645083379707e-06, + "loss": 0.1781, + "step": 31233 + }, + { + "epoch": 0.7903940076422805, + "grad_norm": 3.3722498416900635, + "learning_rate": 1.0659166812446147e-06, + "loss": 0.1851, + "step": 31234 + }, + { + "epoch": 0.7904193132069742, + "grad_norm": 3.6525301933288574, + "learning_rate": 1.0656688795214325e-06, + "loss": 0.1202, + "step": 31235 + }, + { + "epoch": 0.7904446187716679, + "grad_norm": 4.511303424835205, + "learning_rate": 1.065421103170018e-06, + "loss": 0.0794, + "step": 31236 + }, + { + "epoch": 0.7904699243363615, + "grad_norm": 3.4995579719543457, + "learning_rate": 1.0651733521919716e-06, + "loss": 0.1569, + "step": 31237 + }, + { + "epoch": 0.7904952299010553, + "grad_norm": 8.746633529663086, + "learning_rate": 1.0649256265888903e-06, + "loss": 0.155, + "step": 31238 + }, + { + "epoch": 0.790520535465749, + "grad_norm": 4.135122299194336, + "learning_rate": 1.0646779263623718e-06, + "loss": 0.1447, + "step": 31239 + }, + { + "epoch": 0.7905458410304426, + "grad_norm": 7.754749298095703, + "learning_rate": 1.064430251514012e-06, + "loss": 0.1915, + "step": 31240 + }, + { + "epoch": 0.7905711465951363, + "grad_norm": 4.260627269744873, + "learning_rate": 1.0641826020454104e-06, + "loss": 0.1604, + "step": 31241 + }, + { + "epoch": 0.79059645215983, + "grad_norm": 3.069934844970703, + "learning_rate": 1.0639349779581637e-06, + "loss": 0.1374, + "step": 31242 + }, + { + "epoch": 0.7906217577245236, + "grad_norm": 2.543348789215088, + "learning_rate": 1.0636873792538665e-06, + "loss": 0.1004, + "step": 31243 + }, + { + "epoch": 0.7906470632892173, + "grad_norm": 5.196134567260742, + "learning_rate": 1.0634398059341189e-06, + "loss": 0.1933, + "step": 31244 + }, + { + "epoch": 0.790672368853911, + "grad_norm": 6.718375205993652, + "learning_rate": 1.0631922580005155e-06, + "loss": 0.2351, + "step": 31245 + }, + { + "epoch": 0.7906976744186046, + "grad_norm": 3.8303699493408203, + "learning_rate": 1.0629447354546523e-06, + "loss": 0.1408, + "step": 31246 + }, + { + "epoch": 0.7907229799832983, + "grad_norm": 11.945141792297363, + "learning_rate": 1.0626972382981255e-06, + "loss": 0.1982, + "step": 31247 + }, + { + "epoch": 0.790748285547992, + "grad_norm": 5.390608787536621, + "learning_rate": 1.0624497665325346e-06, + "loss": 0.0906, + "step": 31248 + }, + { + "epoch": 0.7907735911126856, + "grad_norm": 6.158827304840088, + "learning_rate": 1.0622023201594695e-06, + "loss": 0.247, + "step": 31249 + }, + { + "epoch": 0.7907988966773793, + "grad_norm": 13.102127075195312, + "learning_rate": 1.0619548991805312e-06, + "loss": 0.1637, + "step": 31250 + }, + { + "epoch": 0.7908242022420731, + "grad_norm": 3.3672773838043213, + "learning_rate": 1.0617075035973113e-06, + "loss": 0.1395, + "step": 31251 + }, + { + "epoch": 0.7908495078067667, + "grad_norm": 3.3392961025238037, + "learning_rate": 1.0614601334114099e-06, + "loss": 0.1232, + "step": 31252 + }, + { + "epoch": 0.7908748133714604, + "grad_norm": 3.4285359382629395, + "learning_rate": 1.0612127886244167e-06, + "loss": 0.1114, + "step": 31253 + }, + { + "epoch": 0.7909001189361541, + "grad_norm": 3.6133389472961426, + "learning_rate": 1.0609654692379312e-06, + "loss": 0.1679, + "step": 31254 + }, + { + "epoch": 0.7909254245008477, + "grad_norm": 2.484145402908325, + "learning_rate": 1.060718175253545e-06, + "loss": 0.1263, + "step": 31255 + }, + { + "epoch": 0.7909507300655414, + "grad_norm": 7.567420482635498, + "learning_rate": 1.0604709066728563e-06, + "loss": 0.1717, + "step": 31256 + }, + { + "epoch": 0.7909760356302351, + "grad_norm": 2.8694705963134766, + "learning_rate": 1.0602236634974578e-06, + "loss": 0.1566, + "step": 31257 + }, + { + "epoch": 0.7910013411949288, + "grad_norm": 6.7408833503723145, + "learning_rate": 1.059976445728944e-06, + "loss": 0.1433, + "step": 31258 + }, + { + "epoch": 0.7910266467596224, + "grad_norm": 4.3587236404418945, + "learning_rate": 1.0597292533689073e-06, + "loss": 0.0948, + "step": 31259 + }, + { + "epoch": 0.7910519523243161, + "grad_norm": 6.844422817230225, + "learning_rate": 1.0594820864189459e-06, + "loss": 0.2224, + "step": 31260 + }, + { + "epoch": 0.7910772578890098, + "grad_norm": 6.855225086212158, + "learning_rate": 1.0592349448806511e-06, + "loss": 0.2429, + "step": 31261 + }, + { + "epoch": 0.7911025634537034, + "grad_norm": 3.6377692222595215, + "learning_rate": 1.0589878287556172e-06, + "loss": 0.1493, + "step": 31262 + }, + { + "epoch": 0.7911278690183972, + "grad_norm": 3.2503113746643066, + "learning_rate": 1.0587407380454362e-06, + "loss": 0.1841, + "step": 31263 + }, + { + "epoch": 0.7911531745830909, + "grad_norm": 4.212448596954346, + "learning_rate": 1.0584936727517047e-06, + "loss": 0.1316, + "step": 31264 + }, + { + "epoch": 0.7911784801477845, + "grad_norm": 7.419907093048096, + "learning_rate": 1.0582466328760138e-06, + "loss": 0.1846, + "step": 31265 + }, + { + "epoch": 0.7912037857124782, + "grad_norm": 3.643734931945801, + "learning_rate": 1.0579996184199576e-06, + "loss": 0.0992, + "step": 31266 + }, + { + "epoch": 0.7912290912771719, + "grad_norm": 7.6436848640441895, + "learning_rate": 1.057752629385127e-06, + "loss": 0.1468, + "step": 31267 + }, + { + "epoch": 0.7912543968418655, + "grad_norm": 2.8601937294006348, + "learning_rate": 1.057505665773118e-06, + "loss": 0.1284, + "step": 31268 + }, + { + "epoch": 0.7912797024065592, + "grad_norm": 9.56242561340332, + "learning_rate": 1.0572587275855218e-06, + "loss": 0.1797, + "step": 31269 + }, + { + "epoch": 0.7913050079712529, + "grad_norm": 8.460898399353027, + "learning_rate": 1.05701181482393e-06, + "loss": 0.1959, + "step": 31270 + }, + { + "epoch": 0.7913303135359465, + "grad_norm": 5.8728508949279785, + "learning_rate": 1.0567649274899344e-06, + "loss": 0.1143, + "step": 31271 + }, + { + "epoch": 0.7913556191006402, + "grad_norm": 9.244479179382324, + "learning_rate": 1.0565180655851299e-06, + "loss": 0.1813, + "step": 31272 + }, + { + "epoch": 0.7913809246653339, + "grad_norm": 5.58710241317749, + "learning_rate": 1.0562712291111067e-06, + "loss": 0.1279, + "step": 31273 + }, + { + "epoch": 0.7914062302300275, + "grad_norm": 2.3927645683288574, + "learning_rate": 1.0560244180694567e-06, + "loss": 0.1545, + "step": 31274 + }, + { + "epoch": 0.7914315357947213, + "grad_norm": 7.186556816101074, + "learning_rate": 1.055777632461772e-06, + "loss": 0.1658, + "step": 31275 + }, + { + "epoch": 0.791456841359415, + "grad_norm": 6.410217761993408, + "learning_rate": 1.0555308722896417e-06, + "loss": 0.1549, + "step": 31276 + }, + { + "epoch": 0.7914821469241086, + "grad_norm": 5.206584930419922, + "learning_rate": 1.0552841375546607e-06, + "loss": 0.1434, + "step": 31277 + }, + { + "epoch": 0.7915074524888023, + "grad_norm": 3.98641037940979, + "learning_rate": 1.0550374282584176e-06, + "loss": 0.119, + "step": 31278 + }, + { + "epoch": 0.791532758053496, + "grad_norm": 5.218606948852539, + "learning_rate": 1.0547907444025064e-06, + "loss": 0.138, + "step": 31279 + }, + { + "epoch": 0.7915580636181896, + "grad_norm": 3.420830249786377, + "learning_rate": 1.054544085988513e-06, + "loss": 0.1321, + "step": 31280 + }, + { + "epoch": 0.7915833691828833, + "grad_norm": 5.1292619705200195, + "learning_rate": 1.0542974530180327e-06, + "loss": 0.1173, + "step": 31281 + }, + { + "epoch": 0.791608674747577, + "grad_norm": 3.2782461643218994, + "learning_rate": 1.0540508454926525e-06, + "loss": 0.1384, + "step": 31282 + }, + { + "epoch": 0.7916339803122706, + "grad_norm": 4.530471324920654, + "learning_rate": 1.0538042634139673e-06, + "loss": 0.1442, + "step": 31283 + }, + { + "epoch": 0.7916592858769643, + "grad_norm": 8.013724327087402, + "learning_rate": 1.0535577067835617e-06, + "loss": 0.2334, + "step": 31284 + }, + { + "epoch": 0.791684591441658, + "grad_norm": 3.8059804439544678, + "learning_rate": 1.0533111756030296e-06, + "loss": 0.1483, + "step": 31285 + }, + { + "epoch": 0.7917098970063517, + "grad_norm": 2.5177438259124756, + "learning_rate": 1.0530646698739582e-06, + "loss": 0.0845, + "step": 31286 + }, + { + "epoch": 0.7917352025710453, + "grad_norm": 5.764990329742432, + "learning_rate": 1.05281818959794e-06, + "loss": 0.1338, + "step": 31287 + }, + { + "epoch": 0.7917605081357391, + "grad_norm": 6.0054497718811035, + "learning_rate": 1.052571734776563e-06, + "loss": 0.0913, + "step": 31288 + }, + { + "epoch": 0.7917858137004328, + "grad_norm": 7.3199992179870605, + "learning_rate": 1.0523253054114169e-06, + "loss": 0.1609, + "step": 31289 + }, + { + "epoch": 0.7918111192651264, + "grad_norm": 5.765547752380371, + "learning_rate": 1.0520789015040889e-06, + "loss": 0.1562, + "step": 31290 + }, + { + "epoch": 0.7918364248298201, + "grad_norm": 7.952437400817871, + "learning_rate": 1.051832523056171e-06, + "loss": 0.2008, + "step": 31291 + }, + { + "epoch": 0.7918617303945138, + "grad_norm": 3.159536361694336, + "learning_rate": 1.051586170069251e-06, + "loss": 0.131, + "step": 31292 + }, + { + "epoch": 0.7918870359592074, + "grad_norm": 5.2572126388549805, + "learning_rate": 1.0513398425449167e-06, + "loss": 0.131, + "step": 31293 + }, + { + "epoch": 0.7919123415239011, + "grad_norm": 8.039531707763672, + "learning_rate": 1.0510935404847566e-06, + "loss": 0.2727, + "step": 31294 + }, + { + "epoch": 0.7919376470885948, + "grad_norm": 2.633530616760254, + "learning_rate": 1.0508472638903606e-06, + "loss": 0.1363, + "step": 31295 + }, + { + "epoch": 0.7919629526532884, + "grad_norm": 4.467040538787842, + "learning_rate": 1.0506010127633165e-06, + "loss": 0.144, + "step": 31296 + }, + { + "epoch": 0.7919882582179821, + "grad_norm": 16.736417770385742, + "learning_rate": 1.0503547871052112e-06, + "loss": 0.25, + "step": 31297 + }, + { + "epoch": 0.7920135637826758, + "grad_norm": 3.928328275680542, + "learning_rate": 1.0501085869176324e-06, + "loss": 0.0753, + "step": 31298 + }, + { + "epoch": 0.7920388693473694, + "grad_norm": 6.167977333068848, + "learning_rate": 1.0498624122021695e-06, + "loss": 0.2029, + "step": 31299 + }, + { + "epoch": 0.7920641749120632, + "grad_norm": 3.2109997272491455, + "learning_rate": 1.0496162629604096e-06, + "loss": 0.175, + "step": 31300 + }, + { + "epoch": 0.7920894804767569, + "grad_norm": 4.109986782073975, + "learning_rate": 1.0493701391939393e-06, + "loss": 0.1541, + "step": 31301 + }, + { + "epoch": 0.7921147860414505, + "grad_norm": 3.9759361743927, + "learning_rate": 1.0491240409043464e-06, + "loss": 0.1376, + "step": 31302 + }, + { + "epoch": 0.7921400916061442, + "grad_norm": 5.393924713134766, + "learning_rate": 1.0488779680932155e-06, + "loss": 0.1957, + "step": 31303 + }, + { + "epoch": 0.7921653971708379, + "grad_norm": 3.104008436203003, + "learning_rate": 1.0486319207621376e-06, + "loss": 0.0974, + "step": 31304 + }, + { + "epoch": 0.7921907027355315, + "grad_norm": 4.179101467132568, + "learning_rate": 1.0483858989126971e-06, + "loss": 0.1464, + "step": 31305 + }, + { + "epoch": 0.7922160083002252, + "grad_norm": 4.721056938171387, + "learning_rate": 1.0481399025464812e-06, + "loss": 0.1413, + "step": 31306 + }, + { + "epoch": 0.7922413138649189, + "grad_norm": 7.913430213928223, + "learning_rate": 1.0478939316650749e-06, + "loss": 0.2272, + "step": 31307 + }, + { + "epoch": 0.7922666194296125, + "grad_norm": 2.7988641262054443, + "learning_rate": 1.0476479862700662e-06, + "loss": 0.0762, + "step": 31308 + }, + { + "epoch": 0.7922919249943062, + "grad_norm": 10.307657241821289, + "learning_rate": 1.0474020663630392e-06, + "loss": 0.3095, + "step": 31309 + }, + { + "epoch": 0.7923172305589999, + "grad_norm": 8.679051399230957, + "learning_rate": 1.0471561719455842e-06, + "loss": 0.1092, + "step": 31310 + }, + { + "epoch": 0.7923425361236937, + "grad_norm": 8.072295188903809, + "learning_rate": 1.0469103030192807e-06, + "loss": 0.1914, + "step": 31311 + }, + { + "epoch": 0.7923678416883873, + "grad_norm": 4.227572441101074, + "learning_rate": 1.0466644595857184e-06, + "loss": 0.2033, + "step": 31312 + }, + { + "epoch": 0.792393147253081, + "grad_norm": 4.055327415466309, + "learning_rate": 1.0464186416464805e-06, + "loss": 0.1583, + "step": 31313 + }, + { + "epoch": 0.7924184528177747, + "grad_norm": 10.234979629516602, + "learning_rate": 1.0461728492031565e-06, + "loss": 0.1558, + "step": 31314 + }, + { + "epoch": 0.7924437583824683, + "grad_norm": 4.695422172546387, + "learning_rate": 1.0459270822573252e-06, + "loss": 0.1482, + "step": 31315 + }, + { + "epoch": 0.792469063947162, + "grad_norm": 10.187323570251465, + "learning_rate": 1.0456813408105759e-06, + "loss": 0.1513, + "step": 31316 + }, + { + "epoch": 0.7924943695118557, + "grad_norm": 3.22304630279541, + "learning_rate": 1.045435624864491e-06, + "loss": 0.1152, + "step": 31317 + }, + { + "epoch": 0.7925196750765493, + "grad_norm": 5.372409820556641, + "learning_rate": 1.0451899344206574e-06, + "loss": 0.1335, + "step": 31318 + }, + { + "epoch": 0.792544980641243, + "grad_norm": 7.755469799041748, + "learning_rate": 1.0449442694806583e-06, + "loss": 0.1769, + "step": 31319 + }, + { + "epoch": 0.7925702862059367, + "grad_norm": 3.249476671218872, + "learning_rate": 1.0446986300460776e-06, + "loss": 0.1253, + "step": 31320 + }, + { + "epoch": 0.7925955917706303, + "grad_norm": 4.148756504058838, + "learning_rate": 1.044453016118499e-06, + "loss": 0.144, + "step": 31321 + }, + { + "epoch": 0.792620897335324, + "grad_norm": 3.8435211181640625, + "learning_rate": 1.0442074276995078e-06, + "loss": 0.176, + "step": 31322 + }, + { + "epoch": 0.7926462029000177, + "grad_norm": 2.915443181991577, + "learning_rate": 1.0439618647906875e-06, + "loss": 0.1016, + "step": 31323 + }, + { + "epoch": 0.7926715084647113, + "grad_norm": 3.545642852783203, + "learning_rate": 1.0437163273936208e-06, + "loss": 0.1022, + "step": 31324 + }, + { + "epoch": 0.7926968140294051, + "grad_norm": 3.823603630065918, + "learning_rate": 1.0434708155098905e-06, + "loss": 0.1442, + "step": 31325 + }, + { + "epoch": 0.7927221195940988, + "grad_norm": 9.237403869628906, + "learning_rate": 1.0432253291410826e-06, + "loss": 0.1995, + "step": 31326 + }, + { + "epoch": 0.7927474251587924, + "grad_norm": 7.547929286956787, + "learning_rate": 1.0429798682887781e-06, + "loss": 0.2137, + "step": 31327 + }, + { + "epoch": 0.7927727307234861, + "grad_norm": 3.6869699954986572, + "learning_rate": 1.0427344329545601e-06, + "loss": 0.1196, + "step": 31328 + }, + { + "epoch": 0.7927980362881798, + "grad_norm": 2.602780342102051, + "learning_rate": 1.0424890231400108e-06, + "loss": 0.1074, + "step": 31329 + }, + { + "epoch": 0.7928233418528734, + "grad_norm": 7.272562503814697, + "learning_rate": 1.042243638846715e-06, + "loss": 0.1228, + "step": 31330 + }, + { + "epoch": 0.7928486474175671, + "grad_norm": 4.414247035980225, + "learning_rate": 1.0419982800762536e-06, + "loss": 0.132, + "step": 31331 + }, + { + "epoch": 0.7928739529822608, + "grad_norm": 3.9725122451782227, + "learning_rate": 1.041752946830209e-06, + "loss": 0.14, + "step": 31332 + }, + { + "epoch": 0.7928992585469544, + "grad_norm": 5.2606306076049805, + "learning_rate": 1.0415076391101636e-06, + "loss": 0.2207, + "step": 31333 + }, + { + "epoch": 0.7929245641116481, + "grad_norm": 3.620880365371704, + "learning_rate": 1.0412623569176976e-06, + "loss": 0.1286, + "step": 31334 + }, + { + "epoch": 0.7929498696763418, + "grad_norm": 4.587439060211182, + "learning_rate": 1.0410171002543956e-06, + "loss": 0.1275, + "step": 31335 + }, + { + "epoch": 0.7929751752410356, + "grad_norm": 7.431933879852295, + "learning_rate": 1.0407718691218382e-06, + "loss": 0.2285, + "step": 31336 + }, + { + "epoch": 0.7930004808057292, + "grad_norm": 5.451905250549316, + "learning_rate": 1.040526663521606e-06, + "loss": 0.172, + "step": 31337 + }, + { + "epoch": 0.7930257863704229, + "grad_norm": 3.9600749015808105, + "learning_rate": 1.0402814834552799e-06, + "loss": 0.1611, + "step": 31338 + }, + { + "epoch": 0.7930510919351166, + "grad_norm": 21.346567153930664, + "learning_rate": 1.0400363289244431e-06, + "loss": 0.136, + "step": 31339 + }, + { + "epoch": 0.7930763974998102, + "grad_norm": 5.171509742736816, + "learning_rate": 1.0397911999306742e-06, + "loss": 0.1875, + "step": 31340 + }, + { + "epoch": 0.7931017030645039, + "grad_norm": 5.307552337646484, + "learning_rate": 1.0395460964755576e-06, + "loss": 0.1195, + "step": 31341 + }, + { + "epoch": 0.7931270086291976, + "grad_norm": 2.8135690689086914, + "learning_rate": 1.039301018560669e-06, + "loss": 0.0726, + "step": 31342 + }, + { + "epoch": 0.7931523141938912, + "grad_norm": 3.8475234508514404, + "learning_rate": 1.0390559661875927e-06, + "loss": 0.1607, + "step": 31343 + }, + { + "epoch": 0.7931776197585849, + "grad_norm": 5.223592758178711, + "learning_rate": 1.0388109393579065e-06, + "loss": 0.1439, + "step": 31344 + }, + { + "epoch": 0.7932029253232786, + "grad_norm": 3.5048251152038574, + "learning_rate": 1.0385659380731944e-06, + "loss": 0.1105, + "step": 31345 + }, + { + "epoch": 0.7932282308879722, + "grad_norm": 6.942633628845215, + "learning_rate": 1.0383209623350304e-06, + "loss": 0.2287, + "step": 31346 + }, + { + "epoch": 0.7932535364526659, + "grad_norm": 9.173393249511719, + "learning_rate": 1.0380760121449996e-06, + "loss": 0.1969, + "step": 31347 + }, + { + "epoch": 0.7932788420173597, + "grad_norm": 4.410013198852539, + "learning_rate": 1.0378310875046782e-06, + "loss": 0.165, + "step": 31348 + }, + { + "epoch": 0.7933041475820533, + "grad_norm": 4.731255531311035, + "learning_rate": 1.0375861884156496e-06, + "loss": 0.126, + "step": 31349 + }, + { + "epoch": 0.793329453146747, + "grad_norm": 5.435334205627441, + "learning_rate": 1.037341314879488e-06, + "loss": 0.1814, + "step": 31350 + }, + { + "epoch": 0.7933547587114407, + "grad_norm": 4.852704048156738, + "learning_rate": 1.0370964668977767e-06, + "loss": 0.1568, + "step": 31351 + }, + { + "epoch": 0.7933800642761343, + "grad_norm": 2.444333791732788, + "learning_rate": 1.0368516444720916e-06, + "loss": 0.1023, + "step": 31352 + }, + { + "epoch": 0.793405369840828, + "grad_norm": 10.233875274658203, + "learning_rate": 1.0366068476040142e-06, + "loss": 0.2299, + "step": 31353 + }, + { + "epoch": 0.7934306754055217, + "grad_norm": 12.394524574279785, + "learning_rate": 1.0363620762951222e-06, + "loss": 0.2561, + "step": 31354 + }, + { + "epoch": 0.7934559809702153, + "grad_norm": 7.227599143981934, + "learning_rate": 1.0361173305469934e-06, + "loss": 0.179, + "step": 31355 + }, + { + "epoch": 0.793481286534909, + "grad_norm": 10.05097770690918, + "learning_rate": 1.035872610361206e-06, + "loss": 0.2257, + "step": 31356 + }, + { + "epoch": 0.7935065920996027, + "grad_norm": 3.899348735809326, + "learning_rate": 1.0356279157393395e-06, + "loss": 0.1455, + "step": 31357 + }, + { + "epoch": 0.7935318976642963, + "grad_norm": 3.1235451698303223, + "learning_rate": 1.0353832466829716e-06, + "loss": 0.0829, + "step": 31358 + }, + { + "epoch": 0.79355720322899, + "grad_norm": 3.3048226833343506, + "learning_rate": 1.0351386031936794e-06, + "loss": 0.0792, + "step": 31359 + }, + { + "epoch": 0.7935825087936837, + "grad_norm": 4.959121227264404, + "learning_rate": 1.034893985273041e-06, + "loss": 0.1316, + "step": 31360 + }, + { + "epoch": 0.7936078143583775, + "grad_norm": 5.887768745422363, + "learning_rate": 1.0346493929226326e-06, + "loss": 0.1635, + "step": 31361 + }, + { + "epoch": 0.7936331199230711, + "grad_norm": 12.062264442443848, + "learning_rate": 1.0344048261440332e-06, + "loss": 0.3538, + "step": 31362 + }, + { + "epoch": 0.7936584254877648, + "grad_norm": 6.595832824707031, + "learning_rate": 1.0341602849388199e-06, + "loss": 0.1895, + "step": 31363 + }, + { + "epoch": 0.7936837310524585, + "grad_norm": 2.9475836753845215, + "learning_rate": 1.0339157693085693e-06, + "loss": 0.1424, + "step": 31364 + }, + { + "epoch": 0.7937090366171521, + "grad_norm": 4.855649471282959, + "learning_rate": 1.033671279254857e-06, + "loss": 0.132, + "step": 31365 + }, + { + "epoch": 0.7937343421818458, + "grad_norm": 8.972264289855957, + "learning_rate": 1.033426814779262e-06, + "loss": 0.1867, + "step": 31366 + }, + { + "epoch": 0.7937596477465395, + "grad_norm": 5.818846702575684, + "learning_rate": 1.0331823758833593e-06, + "loss": 0.1584, + "step": 31367 + }, + { + "epoch": 0.7937849533112331, + "grad_norm": 7.366373538970947, + "learning_rate": 1.0329379625687258e-06, + "loss": 0.2204, + "step": 31368 + }, + { + "epoch": 0.7938102588759268, + "grad_norm": 10.230955123901367, + "learning_rate": 1.032693574836936e-06, + "loss": 0.1137, + "step": 31369 + }, + { + "epoch": 0.7938355644406205, + "grad_norm": 3.554039478302002, + "learning_rate": 1.0324492126895692e-06, + "loss": 0.1347, + "step": 31370 + }, + { + "epoch": 0.7938608700053141, + "grad_norm": 13.254803657531738, + "learning_rate": 1.0322048761281971e-06, + "loss": 0.1819, + "step": 31371 + }, + { + "epoch": 0.7938861755700078, + "grad_norm": 3.657585859298706, + "learning_rate": 1.0319605651544012e-06, + "loss": 0.1033, + "step": 31372 + }, + { + "epoch": 0.7939114811347016, + "grad_norm": 12.622703552246094, + "learning_rate": 1.0317162797697506e-06, + "loss": 0.2164, + "step": 31373 + }, + { + "epoch": 0.7939367866993952, + "grad_norm": 5.183627605438232, + "learning_rate": 1.031472019975825e-06, + "loss": 0.1237, + "step": 31374 + }, + { + "epoch": 0.7939620922640889, + "grad_norm": 4.910340309143066, + "learning_rate": 1.0312277857741964e-06, + "loss": 0.1672, + "step": 31375 + }, + { + "epoch": 0.7939873978287826, + "grad_norm": 4.122905254364014, + "learning_rate": 1.0309835771664445e-06, + "loss": 0.1238, + "step": 31376 + }, + { + "epoch": 0.7940127033934762, + "grad_norm": 7.534056663513184, + "learning_rate": 1.0307393941541387e-06, + "loss": 0.1213, + "step": 31377 + }, + { + "epoch": 0.7940380089581699, + "grad_norm": 4.386380672454834, + "learning_rate": 1.030495236738857e-06, + "loss": 0.1445, + "step": 31378 + }, + { + "epoch": 0.7940633145228636, + "grad_norm": 6.900784015655518, + "learning_rate": 1.0302511049221726e-06, + "loss": 0.111, + "step": 31379 + }, + { + "epoch": 0.7940886200875572, + "grad_norm": 3.433133840560913, + "learning_rate": 1.0300069987056626e-06, + "loss": 0.1327, + "step": 31380 + }, + { + "epoch": 0.7941139256522509, + "grad_norm": 4.235433578491211, + "learning_rate": 1.0297629180908964e-06, + "loss": 0.1566, + "step": 31381 + }, + { + "epoch": 0.7941392312169446, + "grad_norm": 6.651475429534912, + "learning_rate": 1.0295188630794517e-06, + "loss": 0.144, + "step": 31382 + }, + { + "epoch": 0.7941645367816382, + "grad_norm": 6.762019634246826, + "learning_rate": 1.0292748336729002e-06, + "loss": 0.1866, + "step": 31383 + }, + { + "epoch": 0.7941898423463319, + "grad_norm": 3.7831220626831055, + "learning_rate": 1.029030829872818e-06, + "loss": 0.1406, + "step": 31384 + }, + { + "epoch": 0.7942151479110257, + "grad_norm": 6.586048603057861, + "learning_rate": 1.0287868516807776e-06, + "loss": 0.1842, + "step": 31385 + }, + { + "epoch": 0.7942404534757194, + "grad_norm": 4.739274501800537, + "learning_rate": 1.0285428990983514e-06, + "loss": 0.1418, + "step": 31386 + }, + { + "epoch": 0.794265759040413, + "grad_norm": 2.297999382019043, + "learning_rate": 1.028298972127114e-06, + "loss": 0.106, + "step": 31387 + }, + { + "epoch": 0.7942910646051067, + "grad_norm": 6.16899299621582, + "learning_rate": 1.028055070768636e-06, + "loss": 0.2139, + "step": 31388 + }, + { + "epoch": 0.7943163701698004, + "grad_norm": 5.098977565765381, + "learning_rate": 1.0278111950244934e-06, + "loss": 0.1758, + "step": 31389 + }, + { + "epoch": 0.794341675734494, + "grad_norm": 3.464329242706299, + "learning_rate": 1.0275673448962575e-06, + "loss": 0.1105, + "step": 31390 + }, + { + "epoch": 0.7943669812991877, + "grad_norm": 10.888971328735352, + "learning_rate": 1.0273235203855015e-06, + "loss": 0.2109, + "step": 31391 + }, + { + "epoch": 0.7943922868638814, + "grad_norm": 7.6591973304748535, + "learning_rate": 1.027079721493795e-06, + "loss": 0.1515, + "step": 31392 + }, + { + "epoch": 0.794417592428575, + "grad_norm": 17.249645233154297, + "learning_rate": 1.0268359482227142e-06, + "loss": 0.2818, + "step": 31393 + }, + { + "epoch": 0.7944428979932687, + "grad_norm": 41.322776794433594, + "learning_rate": 1.0265922005738287e-06, + "loss": 0.1569, + "step": 31394 + }, + { + "epoch": 0.7944682035579624, + "grad_norm": 9.196001052856445, + "learning_rate": 1.0263484785487117e-06, + "loss": 0.1317, + "step": 31395 + }, + { + "epoch": 0.794493509122656, + "grad_norm": 6.870634078979492, + "learning_rate": 1.0261047821489323e-06, + "loss": 0.2442, + "step": 31396 + }, + { + "epoch": 0.7945188146873498, + "grad_norm": 5.376550197601318, + "learning_rate": 1.0258611113760653e-06, + "loss": 0.1332, + "step": 31397 + }, + { + "epoch": 0.7945441202520435, + "grad_norm": 3.3625288009643555, + "learning_rate": 1.0256174662316809e-06, + "loss": 0.1572, + "step": 31398 + }, + { + "epoch": 0.7945694258167371, + "grad_norm": 3.663404941558838, + "learning_rate": 1.0253738467173502e-06, + "loss": 0.1513, + "step": 31399 + }, + { + "epoch": 0.7945947313814308, + "grad_norm": 3.9647369384765625, + "learning_rate": 1.025130252834643e-06, + "loss": 0.1322, + "step": 31400 + }, + { + "epoch": 0.7946200369461245, + "grad_norm": 3.3177542686462402, + "learning_rate": 1.0248866845851324e-06, + "loss": 0.1556, + "step": 31401 + }, + { + "epoch": 0.7946453425108181, + "grad_norm": 3.790271282196045, + "learning_rate": 1.0246431419703885e-06, + "loss": 0.1491, + "step": 31402 + }, + { + "epoch": 0.7946706480755118, + "grad_norm": 4.820389747619629, + "learning_rate": 1.0243996249919812e-06, + "loss": 0.1357, + "step": 31403 + }, + { + "epoch": 0.7946959536402055, + "grad_norm": 3.885326385498047, + "learning_rate": 1.0241561336514804e-06, + "loss": 0.1108, + "step": 31404 + }, + { + "epoch": 0.7947212592048991, + "grad_norm": 6.7625837326049805, + "learning_rate": 1.023912667950458e-06, + "loss": 0.1757, + "step": 31405 + }, + { + "epoch": 0.7947465647695928, + "grad_norm": 3.1294662952423096, + "learning_rate": 1.023669227890482e-06, + "loss": 0.17, + "step": 31406 + }, + { + "epoch": 0.7947718703342865, + "grad_norm": 4.742919445037842, + "learning_rate": 1.0234258134731263e-06, + "loss": 0.1598, + "step": 31407 + }, + { + "epoch": 0.7947971758989801, + "grad_norm": 5.864124298095703, + "learning_rate": 1.023182424699955e-06, + "loss": 0.1505, + "step": 31408 + }, + { + "epoch": 0.7948224814636738, + "grad_norm": 3.882922649383545, + "learning_rate": 1.022939061572542e-06, + "loss": 0.076, + "step": 31409 + }, + { + "epoch": 0.7948477870283676, + "grad_norm": 7.400614261627197, + "learning_rate": 1.0226957240924533e-06, + "loss": 0.1644, + "step": 31410 + }, + { + "epoch": 0.7948730925930612, + "grad_norm": 5.519037246704102, + "learning_rate": 1.0224524122612634e-06, + "loss": 0.2315, + "step": 31411 + }, + { + "epoch": 0.7948983981577549, + "grad_norm": 9.372393608093262, + "learning_rate": 1.022209126080535e-06, + "loss": 0.1449, + "step": 31412 + }, + { + "epoch": 0.7949237037224486, + "grad_norm": 4.184232234954834, + "learning_rate": 1.0219658655518416e-06, + "loss": 0.1216, + "step": 31413 + }, + { + "epoch": 0.7949490092871423, + "grad_norm": 5.804106712341309, + "learning_rate": 1.0217226306767503e-06, + "loss": 0.1513, + "step": 31414 + }, + { + "epoch": 0.7949743148518359, + "grad_norm": 6.337441444396973, + "learning_rate": 1.021479421456828e-06, + "loss": 0.2128, + "step": 31415 + }, + { + "epoch": 0.7949996204165296, + "grad_norm": 4.668231964111328, + "learning_rate": 1.0212362378936463e-06, + "loss": 0.1684, + "step": 31416 + }, + { + "epoch": 0.7950249259812233, + "grad_norm": 3.9534432888031006, + "learning_rate": 1.0209930799887723e-06, + "loss": 0.1606, + "step": 31417 + }, + { + "epoch": 0.7950502315459169, + "grad_norm": 6.349226474761963, + "learning_rate": 1.0207499477437738e-06, + "loss": 0.1755, + "step": 31418 + }, + { + "epoch": 0.7950755371106106, + "grad_norm": 4.766130447387695, + "learning_rate": 1.020506841160217e-06, + "loss": 0.1942, + "step": 31419 + }, + { + "epoch": 0.7951008426753043, + "grad_norm": 1.8731509447097778, + "learning_rate": 1.020263760239673e-06, + "loss": 0.0778, + "step": 31420 + }, + { + "epoch": 0.7951261482399979, + "grad_norm": 7.332515716552734, + "learning_rate": 1.0200207049837073e-06, + "loss": 0.2322, + "step": 31421 + }, + { + "epoch": 0.7951514538046917, + "grad_norm": 14.974369049072266, + "learning_rate": 1.019777675393888e-06, + "loss": 0.2249, + "step": 31422 + }, + { + "epoch": 0.7951767593693854, + "grad_norm": 4.708183288574219, + "learning_rate": 1.0195346714717813e-06, + "loss": 0.0893, + "step": 31423 + }, + { + "epoch": 0.795202064934079, + "grad_norm": 9.01559829711914, + "learning_rate": 1.0192916932189573e-06, + "loss": 0.213, + "step": 31424 + }, + { + "epoch": 0.7952273704987727, + "grad_norm": 6.203590393066406, + "learning_rate": 1.0190487406369776e-06, + "loss": 0.1826, + "step": 31425 + }, + { + "epoch": 0.7952526760634664, + "grad_norm": 4.230481147766113, + "learning_rate": 1.0188058137274138e-06, + "loss": 0.1756, + "step": 31426 + }, + { + "epoch": 0.79527798162816, + "grad_norm": 4.831717491149902, + "learning_rate": 1.0185629124918295e-06, + "loss": 0.1315, + "step": 31427 + }, + { + "epoch": 0.7953032871928537, + "grad_norm": 2.9376888275146484, + "learning_rate": 1.0183200369317935e-06, + "loss": 0.0918, + "step": 31428 + }, + { + "epoch": 0.7953285927575474, + "grad_norm": 5.0288825035095215, + "learning_rate": 1.018077187048871e-06, + "loss": 0.1022, + "step": 31429 + }, + { + "epoch": 0.795353898322241, + "grad_norm": 16.913745880126953, + "learning_rate": 1.017834362844628e-06, + "loss": 0.3098, + "step": 31430 + }, + { + "epoch": 0.7953792038869347, + "grad_norm": 5.214209079742432, + "learning_rate": 1.0175915643206292e-06, + "loss": 0.1154, + "step": 31431 + }, + { + "epoch": 0.7954045094516284, + "grad_norm": 2.84989333152771, + "learning_rate": 1.017348791478443e-06, + "loss": 0.0824, + "step": 31432 + }, + { + "epoch": 0.795429815016322, + "grad_norm": 2.810640573501587, + "learning_rate": 1.0171060443196335e-06, + "loss": 0.0954, + "step": 31433 + }, + { + "epoch": 0.7954551205810158, + "grad_norm": 5.409352779388428, + "learning_rate": 1.0168633228457663e-06, + "loss": 0.1423, + "step": 31434 + }, + { + "epoch": 0.7954804261457095, + "grad_norm": 3.188687801361084, + "learning_rate": 1.0166206270584056e-06, + "loss": 0.1722, + "step": 31435 + }, + { + "epoch": 0.7955057317104031, + "grad_norm": 5.452060222625732, + "learning_rate": 1.016377956959118e-06, + "loss": 0.1509, + "step": 31436 + }, + { + "epoch": 0.7955310372750968, + "grad_norm": 2.6044986248016357, + "learning_rate": 1.0161353125494672e-06, + "loss": 0.0827, + "step": 31437 + }, + { + "epoch": 0.7955563428397905, + "grad_norm": 6.080687522888184, + "learning_rate": 1.0158926938310214e-06, + "loss": 0.1428, + "step": 31438 + }, + { + "epoch": 0.7955816484044842, + "grad_norm": 15.75100326538086, + "learning_rate": 1.0156501008053398e-06, + "loss": 0.1459, + "step": 31439 + }, + { + "epoch": 0.7956069539691778, + "grad_norm": 5.927457809448242, + "learning_rate": 1.015407533473991e-06, + "loss": 0.1269, + "step": 31440 + }, + { + "epoch": 0.7956322595338715, + "grad_norm": 6.9492926597595215, + "learning_rate": 1.015164991838536e-06, + "loss": 0.1267, + "step": 31441 + }, + { + "epoch": 0.7956575650985652, + "grad_norm": 4.515990257263184, + "learning_rate": 1.0149224759005437e-06, + "loss": 0.1236, + "step": 31442 + }, + { + "epoch": 0.7956828706632588, + "grad_norm": 3.3610141277313232, + "learning_rate": 1.0146799856615725e-06, + "loss": 0.1017, + "step": 31443 + }, + { + "epoch": 0.7957081762279525, + "grad_norm": 3.5706660747528076, + "learning_rate": 1.01443752112319e-06, + "loss": 0.0643, + "step": 31444 + }, + { + "epoch": 0.7957334817926462, + "grad_norm": 24.667856216430664, + "learning_rate": 1.0141950822869584e-06, + "loss": 0.1562, + "step": 31445 + }, + { + "epoch": 0.7957587873573398, + "grad_norm": 6.344918727874756, + "learning_rate": 1.0139526691544405e-06, + "loss": 0.1701, + "step": 31446 + }, + { + "epoch": 0.7957840929220336, + "grad_norm": 5.592540740966797, + "learning_rate": 1.013710281727201e-06, + "loss": 0.1478, + "step": 31447 + }, + { + "epoch": 0.7958093984867273, + "grad_norm": 8.177952766418457, + "learning_rate": 1.0134679200068032e-06, + "loss": 0.2073, + "step": 31448 + }, + { + "epoch": 0.7958347040514209, + "grad_norm": 3.8220584392547607, + "learning_rate": 1.0132255839948085e-06, + "loss": 0.1034, + "step": 31449 + }, + { + "epoch": 0.7958600096161146, + "grad_norm": 2.672746181488037, + "learning_rate": 1.0129832736927792e-06, + "loss": 0.0647, + "step": 31450 + }, + { + "epoch": 0.7958853151808083, + "grad_norm": 4.476351261138916, + "learning_rate": 1.0127409891022805e-06, + "loss": 0.133, + "step": 31451 + }, + { + "epoch": 0.7959106207455019, + "grad_norm": 8.147647857666016, + "learning_rate": 1.012498730224874e-06, + "loss": 0.2238, + "step": 31452 + }, + { + "epoch": 0.7959359263101956, + "grad_norm": 7.508499622344971, + "learning_rate": 1.0122564970621206e-06, + "loss": 0.2031, + "step": 31453 + }, + { + "epoch": 0.7959612318748893, + "grad_norm": 4.786701202392578, + "learning_rate": 1.0120142896155827e-06, + "loss": 0.1631, + "step": 31454 + }, + { + "epoch": 0.7959865374395829, + "grad_norm": 4.539454936981201, + "learning_rate": 1.0117721078868248e-06, + "loss": 0.1665, + "step": 31455 + }, + { + "epoch": 0.7960118430042766, + "grad_norm": 5.258768081665039, + "learning_rate": 1.0115299518774047e-06, + "loss": 0.1048, + "step": 31456 + }, + { + "epoch": 0.7960371485689703, + "grad_norm": 2.8654375076293945, + "learning_rate": 1.011287821588887e-06, + "loss": 0.1243, + "step": 31457 + }, + { + "epoch": 0.7960624541336639, + "grad_norm": 7.235049247741699, + "learning_rate": 1.011045717022831e-06, + "loss": 0.2488, + "step": 31458 + }, + { + "epoch": 0.7960877596983577, + "grad_norm": 3.1623013019561768, + "learning_rate": 1.0108036381808e-06, + "loss": 0.1113, + "step": 31459 + }, + { + "epoch": 0.7961130652630514, + "grad_norm": 4.593142509460449, + "learning_rate": 1.0105615850643546e-06, + "loss": 0.1601, + "step": 31460 + }, + { + "epoch": 0.796138370827745, + "grad_norm": 2.7435591220855713, + "learning_rate": 1.0103195576750558e-06, + "loss": 0.0978, + "step": 31461 + }, + { + "epoch": 0.7961636763924387, + "grad_norm": 11.331985473632812, + "learning_rate": 1.0100775560144626e-06, + "loss": 0.1864, + "step": 31462 + }, + { + "epoch": 0.7961889819571324, + "grad_norm": 2.531722068786621, + "learning_rate": 1.009835580084138e-06, + "loss": 0.0901, + "step": 31463 + }, + { + "epoch": 0.7962142875218261, + "grad_norm": 7.1203765869140625, + "learning_rate": 1.0095936298856419e-06, + "loss": 0.207, + "step": 31464 + }, + { + "epoch": 0.7962395930865197, + "grad_norm": 8.30348014831543, + "learning_rate": 1.0093517054205337e-06, + "loss": 0.2319, + "step": 31465 + }, + { + "epoch": 0.7962648986512134, + "grad_norm": 3.068723201751709, + "learning_rate": 1.0091098066903731e-06, + "loss": 0.1432, + "step": 31466 + }, + { + "epoch": 0.7962902042159071, + "grad_norm": 5.117106914520264, + "learning_rate": 1.0088679336967227e-06, + "loss": 0.1834, + "step": 31467 + }, + { + "epoch": 0.7963155097806007, + "grad_norm": 28.132368087768555, + "learning_rate": 1.0086260864411385e-06, + "loss": 0.2696, + "step": 31468 + }, + { + "epoch": 0.7963408153452944, + "grad_norm": 7.2294921875, + "learning_rate": 1.0083842649251851e-06, + "loss": 0.2064, + "step": 31469 + }, + { + "epoch": 0.7963661209099882, + "grad_norm": 5.387002944946289, + "learning_rate": 1.0081424691504167e-06, + "loss": 0.1327, + "step": 31470 + }, + { + "epoch": 0.7963914264746818, + "grad_norm": 4.3444037437438965, + "learning_rate": 1.0079006991183966e-06, + "loss": 0.1442, + "step": 31471 + }, + { + "epoch": 0.7964167320393755, + "grad_norm": 6.731440544128418, + "learning_rate": 1.0076589548306815e-06, + "loss": 0.1589, + "step": 31472 + }, + { + "epoch": 0.7964420376040692, + "grad_norm": 3.9788477420806885, + "learning_rate": 1.0074172362888318e-06, + "loss": 0.145, + "step": 31473 + }, + { + "epoch": 0.7964673431687628, + "grad_norm": 7.90368127822876, + "learning_rate": 1.0071755434944036e-06, + "loss": 0.1435, + "step": 31474 + }, + { + "epoch": 0.7964926487334565, + "grad_norm": 5.604005813598633, + "learning_rate": 1.0069338764489596e-06, + "loss": 0.1582, + "step": 31475 + }, + { + "epoch": 0.7965179542981502, + "grad_norm": 15.916162490844727, + "learning_rate": 1.006692235154056e-06, + "loss": 0.2621, + "step": 31476 + }, + { + "epoch": 0.7965432598628438, + "grad_norm": 3.027709484100342, + "learning_rate": 1.0064506196112517e-06, + "loss": 0.0664, + "step": 31477 + }, + { + "epoch": 0.7965685654275375, + "grad_norm": 4.512119293212891, + "learning_rate": 1.0062090298221027e-06, + "loss": 0.1301, + "step": 31478 + }, + { + "epoch": 0.7965938709922312, + "grad_norm": 4.318511962890625, + "learning_rate": 1.00596746578817e-06, + "loss": 0.1028, + "step": 31479 + }, + { + "epoch": 0.7966191765569248, + "grad_norm": 4.9739508628845215, + "learning_rate": 1.0057259275110104e-06, + "loss": 0.1685, + "step": 31480 + }, + { + "epoch": 0.7966444821216185, + "grad_norm": 5.101956844329834, + "learning_rate": 1.00548441499218e-06, + "loss": 0.1401, + "step": 31481 + }, + { + "epoch": 0.7966697876863122, + "grad_norm": 5.9658966064453125, + "learning_rate": 1.0052429282332405e-06, + "loss": 0.1948, + "step": 31482 + }, + { + "epoch": 0.7966950932510058, + "grad_norm": 13.291341781616211, + "learning_rate": 1.0050014672357433e-06, + "loss": 0.3546, + "step": 31483 + }, + { + "epoch": 0.7967203988156996, + "grad_norm": 5.25853157043457, + "learning_rate": 1.0047600320012495e-06, + "loss": 0.1409, + "step": 31484 + }, + { + "epoch": 0.7967457043803933, + "grad_norm": 2.707599639892578, + "learning_rate": 1.0045186225313146e-06, + "loss": 0.1006, + "step": 31485 + }, + { + "epoch": 0.7967710099450869, + "grad_norm": 4.321122169494629, + "learning_rate": 1.004277238827498e-06, + "loss": 0.1398, + "step": 31486 + }, + { + "epoch": 0.7967963155097806, + "grad_norm": 6.161791801452637, + "learning_rate": 1.0040358808913514e-06, + "loss": 0.2049, + "step": 31487 + }, + { + "epoch": 0.7968216210744743, + "grad_norm": 6.7959489822387695, + "learning_rate": 1.003794548724436e-06, + "loss": 0.2181, + "step": 31488 + }, + { + "epoch": 0.796846926639168, + "grad_norm": 4.255718231201172, + "learning_rate": 1.0035532423283045e-06, + "loss": 0.0905, + "step": 31489 + }, + { + "epoch": 0.7968722322038616, + "grad_norm": 3.1145639419555664, + "learning_rate": 1.0033119617045161e-06, + "loss": 0.0926, + "step": 31490 + }, + { + "epoch": 0.7968975377685553, + "grad_norm": 6.973881244659424, + "learning_rate": 1.0030707068546253e-06, + "loss": 0.2106, + "step": 31491 + }, + { + "epoch": 0.796922843333249, + "grad_norm": 4.533496856689453, + "learning_rate": 1.0028294777801883e-06, + "loss": 0.1353, + "step": 31492 + }, + { + "epoch": 0.7969481488979426, + "grad_norm": 8.58774471282959, + "learning_rate": 1.0025882744827592e-06, + "loss": 0.1441, + "step": 31493 + }, + { + "epoch": 0.7969734544626363, + "grad_norm": 4.341276168823242, + "learning_rate": 1.0023470969638954e-06, + "loss": 0.1678, + "step": 31494 + }, + { + "epoch": 0.79699876002733, + "grad_norm": 3.777268886566162, + "learning_rate": 1.0021059452251526e-06, + "loss": 0.0955, + "step": 31495 + }, + { + "epoch": 0.7970240655920237, + "grad_norm": 3.9314448833465576, + "learning_rate": 1.001864819268084e-06, + "loss": 0.1577, + "step": 31496 + }, + { + "epoch": 0.7970493711567174, + "grad_norm": 8.110062599182129, + "learning_rate": 1.0016237190942445e-06, + "loss": 0.1529, + "step": 31497 + }, + { + "epoch": 0.7970746767214111, + "grad_norm": 4.756086349487305, + "learning_rate": 1.0013826447051911e-06, + "loss": 0.2099, + "step": 31498 + }, + { + "epoch": 0.7970999822861047, + "grad_norm": 5.504360198974609, + "learning_rate": 1.0011415961024774e-06, + "loss": 0.1752, + "step": 31499 + }, + { + "epoch": 0.7971252878507984, + "grad_norm": 3.8330109119415283, + "learning_rate": 1.0009005732876576e-06, + "loss": 0.1622, + "step": 31500 + }, + { + "epoch": 0.7971505934154921, + "grad_norm": 6.261066436767578, + "learning_rate": 1.0006595762622851e-06, + "loss": 0.1687, + "step": 31501 + }, + { + "epoch": 0.7971758989801857, + "grad_norm": 3.3170669078826904, + "learning_rate": 1.0004186050279157e-06, + "loss": 0.1027, + "step": 31502 + }, + { + "epoch": 0.7972012045448794, + "grad_norm": 4.861363410949707, + "learning_rate": 1.0001776595861029e-06, + "loss": 0.1651, + "step": 31503 + }, + { + "epoch": 0.7972265101095731, + "grad_norm": 5.8740010261535645, + "learning_rate": 9.999367399384008e-07, + "loss": 0.1705, + "step": 31504 + }, + { + "epoch": 0.7972518156742667, + "grad_norm": 11.220748901367188, + "learning_rate": 9.99695846086361e-07, + "loss": 0.1692, + "step": 31505 + }, + { + "epoch": 0.7972771212389604, + "grad_norm": 4.261307716369629, + "learning_rate": 9.9945497803154e-07, + "loss": 0.0987, + "step": 31506 + }, + { + "epoch": 0.7973024268036542, + "grad_norm": 6.2419257164001465, + "learning_rate": 9.992141357754898e-07, + "loss": 0.1703, + "step": 31507 + }, + { + "epoch": 0.7973277323683478, + "grad_norm": 6.005768775939941, + "learning_rate": 9.989733193197632e-07, + "loss": 0.1187, + "step": 31508 + }, + { + "epoch": 0.7973530379330415, + "grad_norm": 3.3371403217315674, + "learning_rate": 9.987325286659133e-07, + "loss": 0.1546, + "step": 31509 + }, + { + "epoch": 0.7973783434977352, + "grad_norm": 14.5555419921875, + "learning_rate": 9.984917638154917e-07, + "loss": 0.1935, + "step": 31510 + }, + { + "epoch": 0.7974036490624288, + "grad_norm": 5.501105785369873, + "learning_rate": 9.982510247700538e-07, + "loss": 0.1322, + "step": 31511 + }, + { + "epoch": 0.7974289546271225, + "grad_norm": 10.174880981445312, + "learning_rate": 9.980103115311496e-07, + "loss": 0.3449, + "step": 31512 + }, + { + "epoch": 0.7974542601918162, + "grad_norm": 5.278215408325195, + "learning_rate": 9.977696241003348e-07, + "loss": 0.2176, + "step": 31513 + }, + { + "epoch": 0.7974795657565099, + "grad_norm": 4.939691066741943, + "learning_rate": 9.975289624791568e-07, + "loss": 0.1723, + "step": 31514 + }, + { + "epoch": 0.7975048713212035, + "grad_norm": 5.903703212738037, + "learning_rate": 9.972883266691712e-07, + "loss": 0.1689, + "step": 31515 + }, + { + "epoch": 0.7975301768858972, + "grad_norm": 5.863248825073242, + "learning_rate": 9.970477166719273e-07, + "loss": 0.1478, + "step": 31516 + }, + { + "epoch": 0.7975554824505909, + "grad_norm": 4.66816520690918, + "learning_rate": 9.96807132488981e-07, + "loss": 0.0958, + "step": 31517 + }, + { + "epoch": 0.7975807880152845, + "grad_norm": 1.8157366514205933, + "learning_rate": 9.965665741218777e-07, + "loss": 0.0514, + "step": 31518 + }, + { + "epoch": 0.7976060935799782, + "grad_norm": 10.338152885437012, + "learning_rate": 9.963260415721737e-07, + "loss": 0.3164, + "step": 31519 + }, + { + "epoch": 0.797631399144672, + "grad_norm": 5.836186408996582, + "learning_rate": 9.960855348414166e-07, + "loss": 0.1424, + "step": 31520 + }, + { + "epoch": 0.7976567047093656, + "grad_norm": 3.7230725288391113, + "learning_rate": 9.958450539311604e-07, + "loss": 0.125, + "step": 31521 + }, + { + "epoch": 0.7976820102740593, + "grad_norm": 8.243568420410156, + "learning_rate": 9.956045988429553e-07, + "loss": 0.1793, + "step": 31522 + }, + { + "epoch": 0.797707315838753, + "grad_norm": 5.388853073120117, + "learning_rate": 9.953641695783507e-07, + "loss": 0.2268, + "step": 31523 + }, + { + "epoch": 0.7977326214034466, + "grad_norm": 3.848369836807251, + "learning_rate": 9.951237661388963e-07, + "loss": 0.1906, + "step": 31524 + }, + { + "epoch": 0.7977579269681403, + "grad_norm": 6.725439071655273, + "learning_rate": 9.94883388526145e-07, + "loss": 0.1853, + "step": 31525 + }, + { + "epoch": 0.797783232532834, + "grad_norm": 2.465005874633789, + "learning_rate": 9.94643036741646e-07, + "loss": 0.0934, + "step": 31526 + }, + { + "epoch": 0.7978085380975276, + "grad_norm": 5.132780075073242, + "learning_rate": 9.944027107869487e-07, + "loss": 0.123, + "step": 31527 + }, + { + "epoch": 0.7978338436622213, + "grad_norm": 7.575007438659668, + "learning_rate": 9.941624106636022e-07, + "loss": 0.1518, + "step": 31528 + }, + { + "epoch": 0.797859149226915, + "grad_norm": 7.636898040771484, + "learning_rate": 9.939221363731582e-07, + "loss": 0.1141, + "step": 31529 + }, + { + "epoch": 0.7978844547916086, + "grad_norm": 4.800072193145752, + "learning_rate": 9.936818879171655e-07, + "loss": 0.1366, + "step": 31530 + }, + { + "epoch": 0.7979097603563023, + "grad_norm": 3.8954732418060303, + "learning_rate": 9.93441665297173e-07, + "loss": 0.1069, + "step": 31531 + }, + { + "epoch": 0.7979350659209961, + "grad_norm": 1.7848436832427979, + "learning_rate": 9.932014685147285e-07, + "loss": 0.0572, + "step": 31532 + }, + { + "epoch": 0.7979603714856897, + "grad_norm": 8.296564102172852, + "learning_rate": 9.929612975713836e-07, + "loss": 0.1114, + "step": 31533 + }, + { + "epoch": 0.7979856770503834, + "grad_norm": 6.97672176361084, + "learning_rate": 9.927211524686859e-07, + "loss": 0.2019, + "step": 31534 + }, + { + "epoch": 0.7980109826150771, + "grad_norm": 3.0359079837799072, + "learning_rate": 9.924810332081848e-07, + "loss": 0.165, + "step": 31535 + }, + { + "epoch": 0.7980362881797707, + "grad_norm": 7.116236686706543, + "learning_rate": 9.922409397914273e-07, + "loss": 0.1592, + "step": 31536 + }, + { + "epoch": 0.7980615937444644, + "grad_norm": 4.8544816970825195, + "learning_rate": 9.920008722199615e-07, + "loss": 0.1435, + "step": 31537 + }, + { + "epoch": 0.7980868993091581, + "grad_norm": 3.061781406402588, + "learning_rate": 9.917608304953374e-07, + "loss": 0.0892, + "step": 31538 + }, + { + "epoch": 0.7981122048738517, + "grad_norm": 7.5022430419921875, + "learning_rate": 9.915208146191024e-07, + "loss": 0.2646, + "step": 31539 + }, + { + "epoch": 0.7981375104385454, + "grad_norm": 8.290185928344727, + "learning_rate": 9.91280824592804e-07, + "loss": 0.1863, + "step": 31540 + }, + { + "epoch": 0.7981628160032391, + "grad_norm": 3.5433566570281982, + "learning_rate": 9.910408604179889e-07, + "loss": 0.1236, + "step": 31541 + }, + { + "epoch": 0.7981881215679328, + "grad_norm": 5.475596904754639, + "learning_rate": 9.90800922096206e-07, + "loss": 0.1767, + "step": 31542 + }, + { + "epoch": 0.7982134271326264, + "grad_norm": 4.657783508300781, + "learning_rate": 9.90561009629002e-07, + "loss": 0.2056, + "step": 31543 + }, + { + "epoch": 0.7982387326973202, + "grad_norm": 17.74468231201172, + "learning_rate": 9.903211230179261e-07, + "loss": 0.2841, + "step": 31544 + }, + { + "epoch": 0.7982640382620139, + "grad_norm": 4.387576580047607, + "learning_rate": 9.900812622645213e-07, + "loss": 0.1653, + "step": 31545 + }, + { + "epoch": 0.7982893438267075, + "grad_norm": 4.56361722946167, + "learning_rate": 9.898414273703373e-07, + "loss": 0.1579, + "step": 31546 + }, + { + "epoch": 0.7983146493914012, + "grad_norm": 4.277544975280762, + "learning_rate": 9.89601618336919e-07, + "loss": 0.1629, + "step": 31547 + }, + { + "epoch": 0.7983399549560949, + "grad_norm": 11.52855396270752, + "learning_rate": 9.893618351658164e-07, + "loss": 0.3167, + "step": 31548 + }, + { + "epoch": 0.7983652605207885, + "grad_norm": 6.385105609893799, + "learning_rate": 9.89122077858571e-07, + "loss": 0.1463, + "step": 31549 + }, + { + "epoch": 0.7983905660854822, + "grad_norm": 2.993191957473755, + "learning_rate": 9.888823464167318e-07, + "loss": 0.1356, + "step": 31550 + }, + { + "epoch": 0.7984158716501759, + "grad_norm": 5.436706066131592, + "learning_rate": 9.886426408418436e-07, + "loss": 0.1108, + "step": 31551 + }, + { + "epoch": 0.7984411772148695, + "grad_norm": 5.990143299102783, + "learning_rate": 9.884029611354552e-07, + "loss": 0.1562, + "step": 31552 + }, + { + "epoch": 0.7984664827795632, + "grad_norm": 4.181573390960693, + "learning_rate": 9.881633072991076e-07, + "loss": 0.1478, + "step": 31553 + }, + { + "epoch": 0.7984917883442569, + "grad_norm": 7.818352699279785, + "learning_rate": 9.879236793343494e-07, + "loss": 0.2383, + "step": 31554 + }, + { + "epoch": 0.7985170939089505, + "grad_norm": 3.9158055782318115, + "learning_rate": 9.876840772427237e-07, + "loss": 0.1359, + "step": 31555 + }, + { + "epoch": 0.7985423994736442, + "grad_norm": 6.82849645614624, + "learning_rate": 9.874445010257788e-07, + "loss": 0.2827, + "step": 31556 + }, + { + "epoch": 0.798567705038338, + "grad_norm": 3.9907352924346924, + "learning_rate": 9.872049506850578e-07, + "loss": 0.1117, + "step": 31557 + }, + { + "epoch": 0.7985930106030316, + "grad_norm": 4.912889003753662, + "learning_rate": 9.869654262221056e-07, + "loss": 0.1318, + "step": 31558 + }, + { + "epoch": 0.7986183161677253, + "grad_norm": 5.340142726898193, + "learning_rate": 9.867259276384656e-07, + "loss": 0.1998, + "step": 31559 + }, + { + "epoch": 0.798643621732419, + "grad_norm": 5.973484992980957, + "learning_rate": 9.86486454935685e-07, + "loss": 0.1784, + "step": 31560 + }, + { + "epoch": 0.7986689272971126, + "grad_norm": 5.1898345947265625, + "learning_rate": 9.862470081153064e-07, + "loss": 0.1633, + "step": 31561 + }, + { + "epoch": 0.7986942328618063, + "grad_norm": 6.810214519500732, + "learning_rate": 9.860075871788744e-07, + "loss": 0.2011, + "step": 31562 + }, + { + "epoch": 0.7987195384265, + "grad_norm": 3.698854684829712, + "learning_rate": 9.857681921279312e-07, + "loss": 0.1353, + "step": 31563 + }, + { + "epoch": 0.7987448439911936, + "grad_norm": 8.666046142578125, + "learning_rate": 9.855288229640237e-07, + "loss": 0.1949, + "step": 31564 + }, + { + "epoch": 0.7987701495558873, + "grad_norm": 4.341826438903809, + "learning_rate": 9.85289479688694e-07, + "loss": 0.1698, + "step": 31565 + }, + { + "epoch": 0.798795455120581, + "grad_norm": 5.446795463562012, + "learning_rate": 9.850501623034863e-07, + "loss": 0.1418, + "step": 31566 + }, + { + "epoch": 0.7988207606852747, + "grad_norm": 4.81652307510376, + "learning_rate": 9.848108708099423e-07, + "loss": 0.1374, + "step": 31567 + }, + { + "epoch": 0.7988460662499683, + "grad_norm": 3.757035255432129, + "learning_rate": 9.845716052096056e-07, + "loss": 0.1584, + "step": 31568 + }, + { + "epoch": 0.7988713718146621, + "grad_norm": 4.617419242858887, + "learning_rate": 9.843323655040204e-07, + "loss": 0.1265, + "step": 31569 + }, + { + "epoch": 0.7988966773793558, + "grad_norm": 4.175110816955566, + "learning_rate": 9.840931516947295e-07, + "loss": 0.1534, + "step": 31570 + }, + { + "epoch": 0.7989219829440494, + "grad_norm": 2.479313611984253, + "learning_rate": 9.838539637832744e-07, + "loss": 0.0998, + "step": 31571 + }, + { + "epoch": 0.7989472885087431, + "grad_norm": 2.9796624183654785, + "learning_rate": 9.836148017711966e-07, + "loss": 0.1182, + "step": 31572 + }, + { + "epoch": 0.7989725940734368, + "grad_norm": 4.362384796142578, + "learning_rate": 9.833756656600412e-07, + "loss": 0.2093, + "step": 31573 + }, + { + "epoch": 0.7989978996381304, + "grad_norm": 8.363358497619629, + "learning_rate": 9.831365554513478e-07, + "loss": 0.2144, + "step": 31574 + }, + { + "epoch": 0.7990232052028241, + "grad_norm": 4.223649501800537, + "learning_rate": 9.82897471146662e-07, + "loss": 0.1855, + "step": 31575 + }, + { + "epoch": 0.7990485107675178, + "grad_norm": 3.0541341304779053, + "learning_rate": 9.826584127475208e-07, + "loss": 0.1057, + "step": 31576 + }, + { + "epoch": 0.7990738163322114, + "grad_norm": 4.982504367828369, + "learning_rate": 9.824193802554693e-07, + "loss": 0.1532, + "step": 31577 + }, + { + "epoch": 0.7990991218969051, + "grad_norm": 5.805586338043213, + "learning_rate": 9.82180373672047e-07, + "loss": 0.1812, + "step": 31578 + }, + { + "epoch": 0.7991244274615988, + "grad_norm": 2.4234440326690674, + "learning_rate": 9.81941392998798e-07, + "loss": 0.0811, + "step": 31579 + }, + { + "epoch": 0.7991497330262924, + "grad_norm": 3.959247350692749, + "learning_rate": 9.817024382372597e-07, + "loss": 0.1612, + "step": 31580 + }, + { + "epoch": 0.7991750385909862, + "grad_norm": 5.000866889953613, + "learning_rate": 9.814635093889757e-07, + "loss": 0.1612, + "step": 31581 + }, + { + "epoch": 0.7992003441556799, + "grad_norm": 9.808241844177246, + "learning_rate": 9.812246064554847e-07, + "loss": 0.2089, + "step": 31582 + }, + { + "epoch": 0.7992256497203735, + "grad_norm": 5.940023899078369, + "learning_rate": 9.809857294383317e-07, + "loss": 0.2166, + "step": 31583 + }, + { + "epoch": 0.7992509552850672, + "grad_norm": 4.294122695922852, + "learning_rate": 9.807468783390512e-07, + "loss": 0.1405, + "step": 31584 + }, + { + "epoch": 0.7992762608497609, + "grad_norm": 15.791243553161621, + "learning_rate": 9.805080531591876e-07, + "loss": 0.3257, + "step": 31585 + }, + { + "epoch": 0.7993015664144545, + "grad_norm": 16.29755973815918, + "learning_rate": 9.802692539002788e-07, + "loss": 0.304, + "step": 31586 + }, + { + "epoch": 0.7993268719791482, + "grad_norm": 4.516705513000488, + "learning_rate": 9.800304805638673e-07, + "loss": 0.149, + "step": 31587 + }, + { + "epoch": 0.7993521775438419, + "grad_norm": 5.266415596008301, + "learning_rate": 9.79791733151491e-07, + "loss": 0.1445, + "step": 31588 + }, + { + "epoch": 0.7993774831085355, + "grad_norm": 5.209798812866211, + "learning_rate": 9.795530116646907e-07, + "loss": 0.1517, + "step": 31589 + }, + { + "epoch": 0.7994027886732292, + "grad_norm": 2.4474470615386963, + "learning_rate": 9.793143161050034e-07, + "loss": 0.089, + "step": 31590 + }, + { + "epoch": 0.7994280942379229, + "grad_norm": 2.9477291107177734, + "learning_rate": 9.790756464739715e-07, + "loss": 0.1499, + "step": 31591 + }, + { + "epoch": 0.7994533998026166, + "grad_norm": 7.381414890289307, + "learning_rate": 9.788370027731332e-07, + "loss": 0.1246, + "step": 31592 + }, + { + "epoch": 0.7994787053673102, + "grad_norm": 7.522336959838867, + "learning_rate": 9.785983850040266e-07, + "loss": 0.1826, + "step": 31593 + }, + { + "epoch": 0.799504010932004, + "grad_norm": 2.713207244873047, + "learning_rate": 9.78359793168191e-07, + "loss": 0.1156, + "step": 31594 + }, + { + "epoch": 0.7995293164966977, + "grad_norm": 6.73528528213501, + "learning_rate": 9.781212272671635e-07, + "loss": 0.1555, + "step": 31595 + }, + { + "epoch": 0.7995546220613913, + "grad_norm": 9.626261711120605, + "learning_rate": 9.77882687302486e-07, + "loss": 0.1464, + "step": 31596 + }, + { + "epoch": 0.799579927626085, + "grad_norm": 30.454126358032227, + "learning_rate": 9.776441732756946e-07, + "loss": 0.1905, + "step": 31597 + }, + { + "epoch": 0.7996052331907787, + "grad_norm": 9.50187873840332, + "learning_rate": 9.77405685188328e-07, + "loss": 0.189, + "step": 31598 + }, + { + "epoch": 0.7996305387554723, + "grad_norm": 4.128303050994873, + "learning_rate": 9.771672230419221e-07, + "loss": 0.1698, + "step": 31599 + }, + { + "epoch": 0.799655844320166, + "grad_norm": 3.949537515640259, + "learning_rate": 9.76928786838018e-07, + "loss": 0.1572, + "step": 31600 + }, + { + "epoch": 0.7996811498848597, + "grad_norm": 4.150111675262451, + "learning_rate": 9.766903765781516e-07, + "loss": 0.1491, + "step": 31601 + }, + { + "epoch": 0.7997064554495533, + "grad_norm": 3.7659153938293457, + "learning_rate": 9.764519922638605e-07, + "loss": 0.1844, + "step": 31602 + }, + { + "epoch": 0.799731761014247, + "grad_norm": 3.82971453666687, + "learning_rate": 9.76213633896681e-07, + "loss": 0.1012, + "step": 31603 + }, + { + "epoch": 0.7997570665789407, + "grad_norm": 7.161352634429932, + "learning_rate": 9.759753014781526e-07, + "loss": 0.1234, + "step": 31604 + }, + { + "epoch": 0.7997823721436343, + "grad_norm": 7.650611877441406, + "learning_rate": 9.75736995009811e-07, + "loss": 0.244, + "step": 31605 + }, + { + "epoch": 0.7998076777083281, + "grad_norm": 5.101109504699707, + "learning_rate": 9.754987144931926e-07, + "loss": 0.1589, + "step": 31606 + }, + { + "epoch": 0.7998329832730218, + "grad_norm": 11.063485145568848, + "learning_rate": 9.752604599298338e-07, + "loss": 0.243, + "step": 31607 + }, + { + "epoch": 0.7998582888377154, + "grad_norm": 3.7115790843963623, + "learning_rate": 9.75022231321272e-07, + "loss": 0.0781, + "step": 31608 + }, + { + "epoch": 0.7998835944024091, + "grad_norm": 18.252635955810547, + "learning_rate": 9.747840286690424e-07, + "loss": 0.2213, + "step": 31609 + }, + { + "epoch": 0.7999088999671028, + "grad_norm": 11.703238487243652, + "learning_rate": 9.745458519746848e-07, + "loss": 0.2407, + "step": 31610 + }, + { + "epoch": 0.7999342055317964, + "grad_norm": 24.63250732421875, + "learning_rate": 9.74307701239729e-07, + "loss": 0.2223, + "step": 31611 + }, + { + "epoch": 0.7999595110964901, + "grad_norm": 4.948445796966553, + "learning_rate": 9.740695764657155e-07, + "loss": 0.1377, + "step": 31612 + }, + { + "epoch": 0.7999848166611838, + "grad_norm": 4.29609489440918, + "learning_rate": 9.73831477654177e-07, + "loss": 0.1382, + "step": 31613 + }, + { + "epoch": 0.8000101222258774, + "grad_norm": 4.678988933563232, + "learning_rate": 9.73593404806653e-07, + "loss": 0.1275, + "step": 31614 + }, + { + "epoch": 0.8000354277905711, + "grad_norm": 16.309450149536133, + "learning_rate": 9.733553579246735e-07, + "loss": 0.2175, + "step": 31615 + }, + { + "epoch": 0.8000607333552648, + "grad_norm": 7.532461643218994, + "learning_rate": 9.731173370097779e-07, + "loss": 0.1281, + "step": 31616 + }, + { + "epoch": 0.8000607333552648, + "eval_loss": 0.16452720761299133, + "eval_runtime": 69.8509, + "eval_samples_per_second": 45.726, + "eval_steps_per_second": 5.726, + "step": 31616 + }, + { + "epoch": 0.8000860389199586, + "grad_norm": 5.445400238037109, + "learning_rate": 9.728793420634979e-07, + "loss": 0.1147, + "step": 31617 + }, + { + "epoch": 0.8001113444846522, + "grad_norm": 5.448487281799316, + "learning_rate": 9.726413730873713e-07, + "loss": 0.2429, + "step": 31618 + }, + { + "epoch": 0.8001366500493459, + "grad_norm": 4.9946465492248535, + "learning_rate": 9.724034300829317e-07, + "loss": 0.1442, + "step": 31619 + }, + { + "epoch": 0.8001619556140396, + "grad_norm": 6.056802272796631, + "learning_rate": 9.721655130517128e-07, + "loss": 0.1114, + "step": 31620 + }, + { + "epoch": 0.8001872611787332, + "grad_norm": 3.555853843688965, + "learning_rate": 9.719276219952494e-07, + "loss": 0.1534, + "step": 31621 + }, + { + "epoch": 0.8002125667434269, + "grad_norm": 3.9779751300811768, + "learning_rate": 9.71689756915074e-07, + "loss": 0.1038, + "step": 31622 + }, + { + "epoch": 0.8002378723081206, + "grad_norm": 6.710205554962158, + "learning_rate": 9.714519178127236e-07, + "loss": 0.155, + "step": 31623 + }, + { + "epoch": 0.8002631778728142, + "grad_norm": 4.066625118255615, + "learning_rate": 9.712141046897305e-07, + "loss": 0.1563, + "step": 31624 + }, + { + "epoch": 0.8002884834375079, + "grad_norm": 3.546992063522339, + "learning_rate": 9.709763175476278e-07, + "loss": 0.1167, + "step": 31625 + }, + { + "epoch": 0.8003137890022016, + "grad_norm": 8.549901008605957, + "learning_rate": 9.707385563879485e-07, + "loss": 0.1318, + "step": 31626 + }, + { + "epoch": 0.8003390945668952, + "grad_norm": 4.863997936248779, + "learning_rate": 9.705008212122275e-07, + "loss": 0.1519, + "step": 31627 + }, + { + "epoch": 0.8003644001315889, + "grad_norm": 3.6996004581451416, + "learning_rate": 9.702631120219973e-07, + "loss": 0.1303, + "step": 31628 + }, + { + "epoch": 0.8003897056962826, + "grad_norm": 6.668740272521973, + "learning_rate": 9.700254288187911e-07, + "loss": 0.1915, + "step": 31629 + }, + { + "epoch": 0.8004150112609763, + "grad_norm": 2.9713993072509766, + "learning_rate": 9.6978777160414e-07, + "loss": 0.0567, + "step": 31630 + }, + { + "epoch": 0.80044031682567, + "grad_norm": 6.274544715881348, + "learning_rate": 9.695501403795787e-07, + "loss": 0.2284, + "step": 31631 + }, + { + "epoch": 0.8004656223903637, + "grad_norm": 4.18430757522583, + "learning_rate": 9.693125351466392e-07, + "loss": 0.1333, + "step": 31632 + }, + { + "epoch": 0.8004909279550573, + "grad_norm": 3.6191022396087646, + "learning_rate": 9.690749559068534e-07, + "loss": 0.1064, + "step": 31633 + }, + { + "epoch": 0.800516233519751, + "grad_norm": 4.573666095733643, + "learning_rate": 9.68837402661752e-07, + "loss": 0.1221, + "step": 31634 + }, + { + "epoch": 0.8005415390844447, + "grad_norm": 4.340502738952637, + "learning_rate": 9.685998754128695e-07, + "loss": 0.135, + "step": 31635 + }, + { + "epoch": 0.8005668446491383, + "grad_norm": 5.2893524169921875, + "learning_rate": 9.683623741617371e-07, + "loss": 0.1692, + "step": 31636 + }, + { + "epoch": 0.800592150213832, + "grad_norm": 3.6857988834381104, + "learning_rate": 9.681248989098851e-07, + "loss": 0.1216, + "step": 31637 + }, + { + "epoch": 0.8006174557785257, + "grad_norm": 5.140899181365967, + "learning_rate": 9.678874496588447e-07, + "loss": 0.1365, + "step": 31638 + }, + { + "epoch": 0.8006427613432193, + "grad_norm": 4.326773643493652, + "learning_rate": 9.67650026410149e-07, + "loss": 0.1638, + "step": 31639 + }, + { + "epoch": 0.800668066907913, + "grad_norm": 7.6453471183776855, + "learning_rate": 9.674126291653275e-07, + "loss": 0.1118, + "step": 31640 + }, + { + "epoch": 0.8006933724726067, + "grad_norm": 3.455777406692505, + "learning_rate": 9.671752579259141e-07, + "loss": 0.1412, + "step": 31641 + }, + { + "epoch": 0.8007186780373005, + "grad_norm": 3.8300259113311768, + "learning_rate": 9.669379126934346e-07, + "loss": 0.1635, + "step": 31642 + }, + { + "epoch": 0.8007439836019941, + "grad_norm": 7.850599765777588, + "learning_rate": 9.667005934694235e-07, + "loss": 0.1625, + "step": 31643 + }, + { + "epoch": 0.8007692891666878, + "grad_norm": 5.249715328216553, + "learning_rate": 9.66463300255409e-07, + "loss": 0.2275, + "step": 31644 + }, + { + "epoch": 0.8007945947313815, + "grad_norm": 29.209518432617188, + "learning_rate": 9.66226033052925e-07, + "loss": 0.2761, + "step": 31645 + }, + { + "epoch": 0.8008199002960751, + "grad_norm": 9.022242546081543, + "learning_rate": 9.659887918634963e-07, + "loss": 0.134, + "step": 31646 + }, + { + "epoch": 0.8008452058607688, + "grad_norm": 8.851005554199219, + "learning_rate": 9.657515766886567e-07, + "loss": 0.1968, + "step": 31647 + }, + { + "epoch": 0.8008705114254625, + "grad_norm": 3.6074047088623047, + "learning_rate": 9.655143875299338e-07, + "loss": 0.1394, + "step": 31648 + }, + { + "epoch": 0.8008958169901561, + "grad_norm": 3.2022504806518555, + "learning_rate": 9.652772243888591e-07, + "loss": 0.1222, + "step": 31649 + }, + { + "epoch": 0.8009211225548498, + "grad_norm": 7.911282062530518, + "learning_rate": 9.650400872669614e-07, + "loss": 0.1599, + "step": 31650 + }, + { + "epoch": 0.8009464281195435, + "grad_norm": 8.023681640625, + "learning_rate": 9.64802976165769e-07, + "loss": 0.2407, + "step": 31651 + }, + { + "epoch": 0.8009717336842371, + "grad_norm": 6.576676368713379, + "learning_rate": 9.645658910868123e-07, + "loss": 0.1188, + "step": 31652 + }, + { + "epoch": 0.8009970392489308, + "grad_norm": 4.692429542541504, + "learning_rate": 9.643288320316174e-07, + "loss": 0.1584, + "step": 31653 + }, + { + "epoch": 0.8010223448136246, + "grad_norm": 7.114645957946777, + "learning_rate": 9.640917990017174e-07, + "loss": 0.1898, + "step": 31654 + }, + { + "epoch": 0.8010476503783182, + "grad_norm": 4.530991554260254, + "learning_rate": 9.638547919986379e-07, + "loss": 0.1846, + "step": 31655 + }, + { + "epoch": 0.8010729559430119, + "grad_norm": 4.221885681152344, + "learning_rate": 9.636178110239087e-07, + "loss": 0.1817, + "step": 31656 + }, + { + "epoch": 0.8010982615077056, + "grad_norm": 4.818173885345459, + "learning_rate": 9.633808560790559e-07, + "loss": 0.1821, + "step": 31657 + }, + { + "epoch": 0.8011235670723992, + "grad_norm": 4.905941486358643, + "learning_rate": 9.631439271656112e-07, + "loss": 0.1876, + "step": 31658 + }, + { + "epoch": 0.8011488726370929, + "grad_norm": 5.099379539489746, + "learning_rate": 9.629070242850986e-07, + "loss": 0.1912, + "step": 31659 + }, + { + "epoch": 0.8011741782017866, + "grad_norm": 4.2577362060546875, + "learning_rate": 9.626701474390482e-07, + "loss": 0.1816, + "step": 31660 + }, + { + "epoch": 0.8011994837664802, + "grad_norm": 4.924464702606201, + "learning_rate": 9.624332966289863e-07, + "loss": 0.1949, + "step": 31661 + }, + { + "epoch": 0.8012247893311739, + "grad_norm": 5.374330520629883, + "learning_rate": 9.621964718564419e-07, + "loss": 0.1599, + "step": 31662 + }, + { + "epoch": 0.8012500948958676, + "grad_norm": 8.055148124694824, + "learning_rate": 9.619596731229413e-07, + "loss": 0.0948, + "step": 31663 + }, + { + "epoch": 0.8012754004605612, + "grad_norm": 5.341044902801514, + "learning_rate": 9.61722900430012e-07, + "loss": 0.1794, + "step": 31664 + }, + { + "epoch": 0.8013007060252549, + "grad_norm": 6.271853923797607, + "learning_rate": 9.614861537791791e-07, + "loss": 0.1136, + "step": 31665 + }, + { + "epoch": 0.8013260115899487, + "grad_norm": 3.3239362239837646, + "learning_rate": 9.612494331719723e-07, + "loss": 0.1331, + "step": 31666 + }, + { + "epoch": 0.8013513171546423, + "grad_norm": 3.89579439163208, + "learning_rate": 9.610127386099167e-07, + "loss": 0.194, + "step": 31667 + }, + { + "epoch": 0.801376622719336, + "grad_norm": 2.92726993560791, + "learning_rate": 9.60776070094538e-07, + "loss": 0.1219, + "step": 31668 + }, + { + "epoch": 0.8014019282840297, + "grad_norm": 3.7073886394500732, + "learning_rate": 9.60539427627362e-07, + "loss": 0.0975, + "step": 31669 + }, + { + "epoch": 0.8014272338487234, + "grad_norm": 3.8281102180480957, + "learning_rate": 9.603028112099173e-07, + "loss": 0.1424, + "step": 31670 + }, + { + "epoch": 0.801452539413417, + "grad_norm": 4.512866973876953, + "learning_rate": 9.600662208437272e-07, + "loss": 0.1001, + "step": 31671 + }, + { + "epoch": 0.8014778449781107, + "grad_norm": 5.879659652709961, + "learning_rate": 9.59829656530321e-07, + "loss": 0.1904, + "step": 31672 + }, + { + "epoch": 0.8015031505428044, + "grad_norm": 6.220860004425049, + "learning_rate": 9.595931182712192e-07, + "loss": 0.1485, + "step": 31673 + }, + { + "epoch": 0.801528456107498, + "grad_norm": 4.165074348449707, + "learning_rate": 9.593566060679511e-07, + "loss": 0.1627, + "step": 31674 + }, + { + "epoch": 0.8015537616721917, + "grad_norm": 9.915849685668945, + "learning_rate": 9.59120119922039e-07, + "loss": 0.1594, + "step": 31675 + }, + { + "epoch": 0.8015790672368854, + "grad_norm": 10.446608543395996, + "learning_rate": 9.588836598350127e-07, + "loss": 0.2738, + "step": 31676 + }, + { + "epoch": 0.801604372801579, + "grad_norm": 2.6799070835113525, + "learning_rate": 9.586472258083912e-07, + "loss": 0.1155, + "step": 31677 + }, + { + "epoch": 0.8016296783662727, + "grad_norm": 7.135974884033203, + "learning_rate": 9.58410817843703e-07, + "loss": 0.129, + "step": 31678 + }, + { + "epoch": 0.8016549839309665, + "grad_norm": 9.823040008544922, + "learning_rate": 9.58174435942472e-07, + "loss": 0.2371, + "step": 31679 + }, + { + "epoch": 0.8016802894956601, + "grad_norm": 2.872047185897827, + "learning_rate": 9.579380801062222e-07, + "loss": 0.1204, + "step": 31680 + }, + { + "epoch": 0.8017055950603538, + "grad_norm": 9.189289093017578, + "learning_rate": 9.577017503364766e-07, + "loss": 0.1198, + "step": 31681 + }, + { + "epoch": 0.8017309006250475, + "grad_norm": 10.419943809509277, + "learning_rate": 9.574654466347616e-07, + "loss": 0.2132, + "step": 31682 + }, + { + "epoch": 0.8017562061897411, + "grad_norm": 6.537143707275391, + "learning_rate": 9.572291690025997e-07, + "loss": 0.192, + "step": 31683 + }, + { + "epoch": 0.8017815117544348, + "grad_norm": 9.575791358947754, + "learning_rate": 9.569929174415144e-07, + "loss": 0.1506, + "step": 31684 + }, + { + "epoch": 0.8018068173191285, + "grad_norm": 4.896678447723389, + "learning_rate": 9.567566919530302e-07, + "loss": 0.1117, + "step": 31685 + }, + { + "epoch": 0.8018321228838221, + "grad_norm": 8.19587230682373, + "learning_rate": 9.5652049253867e-07, + "loss": 0.1373, + "step": 31686 + }, + { + "epoch": 0.8018574284485158, + "grad_norm": 7.749110221862793, + "learning_rate": 9.562843191999576e-07, + "loss": 0.1461, + "step": 31687 + }, + { + "epoch": 0.8018827340132095, + "grad_norm": 6.314036846160889, + "learning_rate": 9.56048171938414e-07, + "loss": 0.1754, + "step": 31688 + }, + { + "epoch": 0.8019080395779031, + "grad_norm": 3.8369362354278564, + "learning_rate": 9.558120507555658e-07, + "loss": 0.0652, + "step": 31689 + }, + { + "epoch": 0.8019333451425968, + "grad_norm": 4.419997215270996, + "learning_rate": 9.55575955652931e-07, + "loss": 0.1779, + "step": 31690 + }, + { + "epoch": 0.8019586507072906, + "grad_norm": 3.863600730895996, + "learning_rate": 9.553398866320357e-07, + "loss": 0.1216, + "step": 31691 + }, + { + "epoch": 0.8019839562719842, + "grad_norm": 4.438034534454346, + "learning_rate": 9.551038436944e-07, + "loss": 0.1472, + "step": 31692 + }, + { + "epoch": 0.8020092618366779, + "grad_norm": 7.593376636505127, + "learning_rate": 9.548678268415485e-07, + "loss": 0.1094, + "step": 31693 + }, + { + "epoch": 0.8020345674013716, + "grad_norm": 10.36800479888916, + "learning_rate": 9.546318360750018e-07, + "loss": 0.2933, + "step": 31694 + }, + { + "epoch": 0.8020598729660653, + "grad_norm": 2.9660000801086426, + "learning_rate": 9.54395871396282e-07, + "loss": 0.0909, + "step": 31695 + }, + { + "epoch": 0.8020851785307589, + "grad_norm": 3.7921857833862305, + "learning_rate": 9.54159932806909e-07, + "loss": 0.1464, + "step": 31696 + }, + { + "epoch": 0.8021104840954526, + "grad_norm": 4.187495231628418, + "learning_rate": 9.539240203084076e-07, + "loss": 0.1845, + "step": 31697 + }, + { + "epoch": 0.8021357896601463, + "grad_norm": 6.600532054901123, + "learning_rate": 9.53688133902298e-07, + "loss": 0.1127, + "step": 31698 + }, + { + "epoch": 0.8021610952248399, + "grad_norm": 26.00465202331543, + "learning_rate": 9.534522735901003e-07, + "loss": 0.3034, + "step": 31699 + }, + { + "epoch": 0.8021864007895336, + "grad_norm": 3.999324321746826, + "learning_rate": 9.532164393733351e-07, + "loss": 0.1568, + "step": 31700 + }, + { + "epoch": 0.8022117063542273, + "grad_norm": 12.940898895263672, + "learning_rate": 9.529806312535256e-07, + "loss": 0.1253, + "step": 31701 + }, + { + "epoch": 0.8022370119189209, + "grad_norm": 3.936488389968872, + "learning_rate": 9.5274484923219e-07, + "loss": 0.0878, + "step": 31702 + }, + { + "epoch": 0.8022623174836147, + "grad_norm": 6.426478862762451, + "learning_rate": 9.525090933108528e-07, + "loss": 0.1719, + "step": 31703 + }, + { + "epoch": 0.8022876230483084, + "grad_norm": 9.19847583770752, + "learning_rate": 9.522733634910286e-07, + "loss": 0.2312, + "step": 31704 + }, + { + "epoch": 0.802312928613002, + "grad_norm": 2.6265604496002197, + "learning_rate": 9.52037659774242e-07, + "loss": 0.0856, + "step": 31705 + }, + { + "epoch": 0.8023382341776957, + "grad_norm": 5.01954984664917, + "learning_rate": 9.518019821620112e-07, + "loss": 0.1509, + "step": 31706 + }, + { + "epoch": 0.8023635397423894, + "grad_norm": 3.5817322731018066, + "learning_rate": 9.51566330655857e-07, + "loss": 0.1077, + "step": 31707 + }, + { + "epoch": 0.802388845307083, + "grad_norm": 4.006507873535156, + "learning_rate": 9.513307052572967e-07, + "loss": 0.1259, + "step": 31708 + }, + { + "epoch": 0.8024141508717767, + "grad_norm": 13.21672534942627, + "learning_rate": 9.51095105967853e-07, + "loss": 0.2536, + "step": 31709 + }, + { + "epoch": 0.8024394564364704, + "grad_norm": 6.009016990661621, + "learning_rate": 9.50859532789044e-07, + "loss": 0.2276, + "step": 31710 + }, + { + "epoch": 0.802464762001164, + "grad_norm": 4.479111671447754, + "learning_rate": 9.506239857223887e-07, + "loss": 0.1641, + "step": 31711 + }, + { + "epoch": 0.8024900675658577, + "grad_norm": 3.116177797317505, + "learning_rate": 9.503884647694045e-07, + "loss": 0.1086, + "step": 31712 + }, + { + "epoch": 0.8025153731305514, + "grad_norm": 9.833563804626465, + "learning_rate": 9.501529699316131e-07, + "loss": 0.1791, + "step": 31713 + }, + { + "epoch": 0.802540678695245, + "grad_norm": 5.609777927398682, + "learning_rate": 9.499175012105321e-07, + "loss": 0.1406, + "step": 31714 + }, + { + "epoch": 0.8025659842599387, + "grad_norm": 2.869166374206543, + "learning_rate": 9.496820586076783e-07, + "loss": 0.1046, + "step": 31715 + }, + { + "epoch": 0.8025912898246325, + "grad_norm": 3.9126787185668945, + "learning_rate": 9.49446642124574e-07, + "loss": 0.1337, + "step": 31716 + }, + { + "epoch": 0.8026165953893261, + "grad_norm": 4.958477020263672, + "learning_rate": 9.492112517627327e-07, + "loss": 0.1486, + "step": 31717 + }, + { + "epoch": 0.8026419009540198, + "grad_norm": 2.762704849243164, + "learning_rate": 9.489758875236754e-07, + "loss": 0.1061, + "step": 31718 + }, + { + "epoch": 0.8026672065187135, + "grad_norm": 2.843291759490967, + "learning_rate": 9.48740549408918e-07, + "loss": 0.0994, + "step": 31719 + }, + { + "epoch": 0.8026925120834072, + "grad_norm": 11.006455421447754, + "learning_rate": 9.485052374199815e-07, + "loss": 0.2315, + "step": 31720 + }, + { + "epoch": 0.8027178176481008, + "grad_norm": 4.2455973625183105, + "learning_rate": 9.482699515583788e-07, + "loss": 0.0779, + "step": 31721 + }, + { + "epoch": 0.8027431232127945, + "grad_norm": 5.065713882446289, + "learning_rate": 9.480346918256306e-07, + "loss": 0.1636, + "step": 31722 + }, + { + "epoch": 0.8027684287774882, + "grad_norm": 2.6150007247924805, + "learning_rate": 9.477994582232519e-07, + "loss": 0.0791, + "step": 31723 + }, + { + "epoch": 0.8027937343421818, + "grad_norm": 8.429019927978516, + "learning_rate": 9.475642507527616e-07, + "loss": 0.1854, + "step": 31724 + }, + { + "epoch": 0.8028190399068755, + "grad_norm": 3.440397262573242, + "learning_rate": 9.473290694156761e-07, + "loss": 0.1153, + "step": 31725 + }, + { + "epoch": 0.8028443454715692, + "grad_norm": 3.8342630863189697, + "learning_rate": 9.47093914213511e-07, + "loss": 0.1276, + "step": 31726 + }, + { + "epoch": 0.8028696510362628, + "grad_norm": 8.222236633300781, + "learning_rate": 9.468587851477829e-07, + "loss": 0.2028, + "step": 31727 + }, + { + "epoch": 0.8028949566009566, + "grad_norm": 2.7041068077087402, + "learning_rate": 9.466236822200092e-07, + "loss": 0.1495, + "step": 31728 + }, + { + "epoch": 0.8029202621656503, + "grad_norm": 6.394277572631836, + "learning_rate": 9.463886054317056e-07, + "loss": 0.1782, + "step": 31729 + }, + { + "epoch": 0.8029455677303439, + "grad_norm": 5.6463236808776855, + "learning_rate": 9.461535547843876e-07, + "loss": 0.1304, + "step": 31730 + }, + { + "epoch": 0.8029708732950376, + "grad_norm": 5.656013011932373, + "learning_rate": 9.459185302795703e-07, + "loss": 0.198, + "step": 31731 + }, + { + "epoch": 0.8029961788597313, + "grad_norm": 4.8766584396362305, + "learning_rate": 9.456835319187712e-07, + "loss": 0.157, + "step": 31732 + }, + { + "epoch": 0.8030214844244249, + "grad_norm": 4.645712852478027, + "learning_rate": 9.454485597035052e-07, + "loss": 0.1887, + "step": 31733 + }, + { + "epoch": 0.8030467899891186, + "grad_norm": 6.57641077041626, + "learning_rate": 9.452136136352869e-07, + "loss": 0.1959, + "step": 31734 + }, + { + "epoch": 0.8030720955538123, + "grad_norm": 2.9532299041748047, + "learning_rate": 9.449786937156308e-07, + "loss": 0.1503, + "step": 31735 + }, + { + "epoch": 0.8030974011185059, + "grad_norm": 3.685870409011841, + "learning_rate": 9.447437999460541e-07, + "loss": 0.1561, + "step": 31736 + }, + { + "epoch": 0.8031227066831996, + "grad_norm": 5.471680641174316, + "learning_rate": 9.445089323280699e-07, + "loss": 0.1909, + "step": 31737 + }, + { + "epoch": 0.8031480122478933, + "grad_norm": 5.749226093292236, + "learning_rate": 9.442740908631936e-07, + "loss": 0.1451, + "step": 31738 + }, + { + "epoch": 0.8031733178125869, + "grad_norm": 4.445631504058838, + "learning_rate": 9.440392755529382e-07, + "loss": 0.1272, + "step": 31739 + }, + { + "epoch": 0.8031986233772807, + "grad_norm": 4.382827281951904, + "learning_rate": 9.438044863988199e-07, + "loss": 0.1591, + "step": 31740 + }, + { + "epoch": 0.8032239289419744, + "grad_norm": 3.312859058380127, + "learning_rate": 9.435697234023517e-07, + "loss": 0.1136, + "step": 31741 + }, + { + "epoch": 0.803249234506668, + "grad_norm": 3.471078634262085, + "learning_rate": 9.433349865650482e-07, + "loss": 0.108, + "step": 31742 + }, + { + "epoch": 0.8032745400713617, + "grad_norm": 2.694986343383789, + "learning_rate": 9.431002758884227e-07, + "loss": 0.1122, + "step": 31743 + }, + { + "epoch": 0.8032998456360554, + "grad_norm": 3.6261494159698486, + "learning_rate": 9.428655913739876e-07, + "loss": 0.0994, + "step": 31744 + }, + { + "epoch": 0.8033251512007491, + "grad_norm": 5.253265380859375, + "learning_rate": 9.426309330232592e-07, + "loss": 0.154, + "step": 31745 + }, + { + "epoch": 0.8033504567654427, + "grad_norm": 3.427760601043701, + "learning_rate": 9.423963008377473e-07, + "loss": 0.1682, + "step": 31746 + }, + { + "epoch": 0.8033757623301364, + "grad_norm": 34.659446716308594, + "learning_rate": 9.421616948189699e-07, + "loss": 0.3237, + "step": 31747 + }, + { + "epoch": 0.8034010678948301, + "grad_norm": 7.986575603485107, + "learning_rate": 9.419271149684345e-07, + "loss": 0.175, + "step": 31748 + }, + { + "epoch": 0.8034263734595237, + "grad_norm": 13.088848114013672, + "learning_rate": 9.416925612876571e-07, + "loss": 0.2229, + "step": 31749 + }, + { + "epoch": 0.8034516790242174, + "grad_norm": 6.065808296203613, + "learning_rate": 9.414580337781481e-07, + "loss": 0.1814, + "step": 31750 + }, + { + "epoch": 0.8034769845889111, + "grad_norm": 5.528632164001465, + "learning_rate": 9.412235324414242e-07, + "loss": 0.0962, + "step": 31751 + }, + { + "epoch": 0.8035022901536047, + "grad_norm": 4.2091474533081055, + "learning_rate": 9.409890572789926e-07, + "loss": 0.1047, + "step": 31752 + }, + { + "epoch": 0.8035275957182985, + "grad_norm": 5.761909484863281, + "learning_rate": 9.40754608292368e-07, + "loss": 0.1024, + "step": 31753 + }, + { + "epoch": 0.8035529012829922, + "grad_norm": 4.769225120544434, + "learning_rate": 9.405201854830609e-07, + "loss": 0.0591, + "step": 31754 + }, + { + "epoch": 0.8035782068476858, + "grad_norm": 4.415748596191406, + "learning_rate": 9.402857888525852e-07, + "loss": 0.1319, + "step": 31755 + }, + { + "epoch": 0.8036035124123795, + "grad_norm": 5.512302398681641, + "learning_rate": 9.400514184024518e-07, + "loss": 0.1633, + "step": 31756 + }, + { + "epoch": 0.8036288179770732, + "grad_norm": 2.82660174369812, + "learning_rate": 9.398170741341712e-07, + "loss": 0.0894, + "step": 31757 + }, + { + "epoch": 0.8036541235417668, + "grad_norm": 4.2062225341796875, + "learning_rate": 9.395827560492532e-07, + "loss": 0.1641, + "step": 31758 + }, + { + "epoch": 0.8036794291064605, + "grad_norm": 10.09726619720459, + "learning_rate": 9.393484641492129e-07, + "loss": 0.276, + "step": 31759 + }, + { + "epoch": 0.8037047346711542, + "grad_norm": 4.3421525955200195, + "learning_rate": 9.391141984355584e-07, + "loss": 0.111, + "step": 31760 + }, + { + "epoch": 0.8037300402358478, + "grad_norm": 7.005589485168457, + "learning_rate": 9.388799589098013e-07, + "loss": 0.2114, + "step": 31761 + }, + { + "epoch": 0.8037553458005415, + "grad_norm": 4.09368371963501, + "learning_rate": 9.386457455734504e-07, + "loss": 0.1847, + "step": 31762 + }, + { + "epoch": 0.8037806513652352, + "grad_norm": 5.319433212280273, + "learning_rate": 9.384115584280196e-07, + "loss": 0.1847, + "step": 31763 + }, + { + "epoch": 0.8038059569299288, + "grad_norm": 4.340500831604004, + "learning_rate": 9.381773974750163e-07, + "loss": 0.0986, + "step": 31764 + }, + { + "epoch": 0.8038312624946226, + "grad_norm": 32.81684112548828, + "learning_rate": 9.379432627159518e-07, + "loss": 0.2257, + "step": 31765 + }, + { + "epoch": 0.8038565680593163, + "grad_norm": 5.207732677459717, + "learning_rate": 9.377091541523342e-07, + "loss": 0.0484, + "step": 31766 + }, + { + "epoch": 0.8038818736240099, + "grad_norm": 5.070196628570557, + "learning_rate": 9.374750717856762e-07, + "loss": 0.1189, + "step": 31767 + }, + { + "epoch": 0.8039071791887036, + "grad_norm": 9.28237533569336, + "learning_rate": 9.372410156174855e-07, + "loss": 0.2444, + "step": 31768 + }, + { + "epoch": 0.8039324847533973, + "grad_norm": 3.6779263019561768, + "learning_rate": 9.37006985649272e-07, + "loss": 0.1808, + "step": 31769 + }, + { + "epoch": 0.803957790318091, + "grad_norm": 3.748420476913452, + "learning_rate": 9.367729818825433e-07, + "loss": 0.0888, + "step": 31770 + }, + { + "epoch": 0.8039830958827846, + "grad_norm": 4.899333953857422, + "learning_rate": 9.365390043188105e-07, + "loss": 0.1781, + "step": 31771 + }, + { + "epoch": 0.8040084014474783, + "grad_norm": 10.734230995178223, + "learning_rate": 9.363050529595824e-07, + "loss": 0.1278, + "step": 31772 + }, + { + "epoch": 0.804033707012172, + "grad_norm": 2.650505781173706, + "learning_rate": 9.360711278063672e-07, + "loss": 0.1001, + "step": 31773 + }, + { + "epoch": 0.8040590125768656, + "grad_norm": 16.91623306274414, + "learning_rate": 9.358372288606732e-07, + "loss": 0.3418, + "step": 31774 + }, + { + "epoch": 0.8040843181415593, + "grad_norm": 3.4129936695098877, + "learning_rate": 9.356033561240074e-07, + "loss": 0.1081, + "step": 31775 + }, + { + "epoch": 0.804109623706253, + "grad_norm": 10.337733268737793, + "learning_rate": 9.35369509597881e-07, + "loss": 0.1632, + "step": 31776 + }, + { + "epoch": 0.8041349292709467, + "grad_norm": 9.693079948425293, + "learning_rate": 9.351356892837993e-07, + "loss": 0.2772, + "step": 31777 + }, + { + "epoch": 0.8041602348356404, + "grad_norm": 4.865390777587891, + "learning_rate": 9.349018951832739e-07, + "loss": 0.1168, + "step": 31778 + }, + { + "epoch": 0.8041855404003341, + "grad_norm": 13.957297325134277, + "learning_rate": 9.346681272978075e-07, + "loss": 0.3654, + "step": 31779 + }, + { + "epoch": 0.8042108459650277, + "grad_norm": 21.3631591796875, + "learning_rate": 9.344343856289118e-07, + "loss": 0.2368, + "step": 31780 + }, + { + "epoch": 0.8042361515297214, + "grad_norm": 11.260846138000488, + "learning_rate": 9.342006701780909e-07, + "loss": 0.2802, + "step": 31781 + }, + { + "epoch": 0.8042614570944151, + "grad_norm": 2.5731019973754883, + "learning_rate": 9.33966980946856e-07, + "loss": 0.0879, + "step": 31782 + }, + { + "epoch": 0.8042867626591087, + "grad_norm": 4.831247329711914, + "learning_rate": 9.337333179367092e-07, + "loss": 0.1499, + "step": 31783 + }, + { + "epoch": 0.8043120682238024, + "grad_norm": 4.186457633972168, + "learning_rate": 9.334996811491609e-07, + "loss": 0.1464, + "step": 31784 + }, + { + "epoch": 0.8043373737884961, + "grad_norm": 9.203869819641113, + "learning_rate": 9.332660705857161e-07, + "loss": 0.2442, + "step": 31785 + }, + { + "epoch": 0.8043626793531897, + "grad_norm": 10.375617980957031, + "learning_rate": 9.33032486247884e-07, + "loss": 0.1783, + "step": 31786 + }, + { + "epoch": 0.8043879849178834, + "grad_norm": 3.6492559909820557, + "learning_rate": 9.327989281371663e-07, + "loss": 0.1556, + "step": 31787 + }, + { + "epoch": 0.8044132904825771, + "grad_norm": 11.094487190246582, + "learning_rate": 9.325653962550735e-07, + "loss": 0.2342, + "step": 31788 + }, + { + "epoch": 0.8044385960472707, + "grad_norm": 4.474420547485352, + "learning_rate": 9.323318906031081e-07, + "loss": 0.1562, + "step": 31789 + }, + { + "epoch": 0.8044639016119645, + "grad_norm": 5.6382622718811035, + "learning_rate": 9.320984111827791e-07, + "loss": 0.1584, + "step": 31790 + }, + { + "epoch": 0.8044892071766582, + "grad_norm": 11.902019500732422, + "learning_rate": 9.318649579955907e-07, + "loss": 0.2729, + "step": 31791 + }, + { + "epoch": 0.8045145127413518, + "grad_norm": 14.358850479125977, + "learning_rate": 9.316315310430485e-07, + "loss": 0.1906, + "step": 31792 + }, + { + "epoch": 0.8045398183060455, + "grad_norm": 6.501995086669922, + "learning_rate": 9.313981303266568e-07, + "loss": 0.1859, + "step": 31793 + }, + { + "epoch": 0.8045651238707392, + "grad_norm": 3.141078233718872, + "learning_rate": 9.311647558479226e-07, + "loss": 0.1123, + "step": 31794 + }, + { + "epoch": 0.8045904294354328, + "grad_norm": 4.401946544647217, + "learning_rate": 9.309314076083498e-07, + "loss": 0.1086, + "step": 31795 + }, + { + "epoch": 0.8046157350001265, + "grad_norm": 7.427495002746582, + "learning_rate": 9.306980856094439e-07, + "loss": 0.1673, + "step": 31796 + }, + { + "epoch": 0.8046410405648202, + "grad_norm": 4.181035995483398, + "learning_rate": 9.304647898527081e-07, + "loss": 0.0733, + "step": 31797 + }, + { + "epoch": 0.8046663461295139, + "grad_norm": 2.796196937561035, + "learning_rate": 9.302315203396483e-07, + "loss": 0.0621, + "step": 31798 + }, + { + "epoch": 0.8046916516942075, + "grad_norm": 6.175727844238281, + "learning_rate": 9.299982770717692e-07, + "loss": 0.1515, + "step": 31799 + }, + { + "epoch": 0.8047169572589012, + "grad_norm": 4.85963249206543, + "learning_rate": 9.297650600505737e-07, + "loss": 0.1448, + "step": 31800 + }, + { + "epoch": 0.804742262823595, + "grad_norm": 8.33522891998291, + "learning_rate": 9.295318692775662e-07, + "loss": 0.176, + "step": 31801 + }, + { + "epoch": 0.8047675683882886, + "grad_norm": 4.409915447235107, + "learning_rate": 9.292987047542496e-07, + "loss": 0.153, + "step": 31802 + }, + { + "epoch": 0.8047928739529823, + "grad_norm": 7.100093364715576, + "learning_rate": 9.290655664821296e-07, + "loss": 0.2561, + "step": 31803 + }, + { + "epoch": 0.804818179517676, + "grad_norm": 7.887964725494385, + "learning_rate": 9.28832454462708e-07, + "loss": 0.1756, + "step": 31804 + }, + { + "epoch": 0.8048434850823696, + "grad_norm": 9.862756729125977, + "learning_rate": 9.285993686974892e-07, + "loss": 0.1587, + "step": 31805 + }, + { + "epoch": 0.8048687906470633, + "grad_norm": 2.9105963706970215, + "learning_rate": 9.283663091879741e-07, + "loss": 0.1079, + "step": 31806 + }, + { + "epoch": 0.804894096211757, + "grad_norm": 10.313804626464844, + "learning_rate": 9.281332759356688e-07, + "loss": 0.2148, + "step": 31807 + }, + { + "epoch": 0.8049194017764506, + "grad_norm": 4.910078048706055, + "learning_rate": 9.279002689420735e-07, + "loss": 0.109, + "step": 31808 + }, + { + "epoch": 0.8049447073411443, + "grad_norm": 9.480881690979004, + "learning_rate": 9.27667288208694e-07, + "loss": 0.1351, + "step": 31809 + }, + { + "epoch": 0.804970012905838, + "grad_norm": 3.0189461708068848, + "learning_rate": 9.274343337370284e-07, + "loss": 0.1172, + "step": 31810 + }, + { + "epoch": 0.8049953184705316, + "grad_norm": 5.554135322570801, + "learning_rate": 9.272014055285821e-07, + "loss": 0.1236, + "step": 31811 + }, + { + "epoch": 0.8050206240352253, + "grad_norm": 3.2384254932403564, + "learning_rate": 9.269685035848553e-07, + "loss": 0.074, + "step": 31812 + }, + { + "epoch": 0.805045929599919, + "grad_norm": 5.58303689956665, + "learning_rate": 9.267356279073536e-07, + "loss": 0.2418, + "step": 31813 + }, + { + "epoch": 0.8050712351646127, + "grad_norm": 4.853208541870117, + "learning_rate": 9.265027784975739e-07, + "loss": 0.1259, + "step": 31814 + }, + { + "epoch": 0.8050965407293064, + "grad_norm": 3.3087494373321533, + "learning_rate": 9.262699553570209e-07, + "loss": 0.1327, + "step": 31815 + }, + { + "epoch": 0.8051218462940001, + "grad_norm": 5.929274559020996, + "learning_rate": 9.260371584871941e-07, + "loss": 0.1718, + "step": 31816 + }, + { + "epoch": 0.8051471518586937, + "grad_norm": 11.557089805603027, + "learning_rate": 9.258043878895984e-07, + "loss": 0.2011, + "step": 31817 + }, + { + "epoch": 0.8051724574233874, + "grad_norm": 5.274712085723877, + "learning_rate": 9.255716435657292e-07, + "loss": 0.1898, + "step": 31818 + }, + { + "epoch": 0.8051977629880811, + "grad_norm": 8.610247611999512, + "learning_rate": 9.253389255170925e-07, + "loss": 0.1386, + "step": 31819 + }, + { + "epoch": 0.8052230685527747, + "grad_norm": 4.0677289962768555, + "learning_rate": 9.251062337451855e-07, + "loss": 0.1461, + "step": 31820 + }, + { + "epoch": 0.8052483741174684, + "grad_norm": 4.1167402267456055, + "learning_rate": 9.24873568251512e-07, + "loss": 0.1357, + "step": 31821 + }, + { + "epoch": 0.8052736796821621, + "grad_norm": 2.8623154163360596, + "learning_rate": 9.246409290375707e-07, + "loss": 0.1516, + "step": 31822 + }, + { + "epoch": 0.8052989852468558, + "grad_norm": 7.532939910888672, + "learning_rate": 9.244083161048622e-07, + "loss": 0.1377, + "step": 31823 + }, + { + "epoch": 0.8053242908115494, + "grad_norm": 6.402317523956299, + "learning_rate": 9.241757294548848e-07, + "loss": 0.1839, + "step": 31824 + }, + { + "epoch": 0.8053495963762431, + "grad_norm": 3.203409194946289, + "learning_rate": 9.23943169089141e-07, + "loss": 0.1024, + "step": 31825 + }, + { + "epoch": 0.8053749019409369, + "grad_norm": 4.463983535766602, + "learning_rate": 9.237106350091301e-07, + "loss": 0.1417, + "step": 31826 + }, + { + "epoch": 0.8054002075056305, + "grad_norm": 3.151974678039551, + "learning_rate": 9.234781272163506e-07, + "loss": 0.1399, + "step": 31827 + }, + { + "epoch": 0.8054255130703242, + "grad_norm": 2.948920249938965, + "learning_rate": 9.232456457123024e-07, + "loss": 0.0943, + "step": 31828 + }, + { + "epoch": 0.8054508186350179, + "grad_norm": 3.671323537826538, + "learning_rate": 9.230131904984835e-07, + "loss": 0.1497, + "step": 31829 + }, + { + "epoch": 0.8054761241997115, + "grad_norm": 8.965524673461914, + "learning_rate": 9.227807615763956e-07, + "loss": 0.1459, + "step": 31830 + }, + { + "epoch": 0.8055014297644052, + "grad_norm": 16.208953857421875, + "learning_rate": 9.225483589475359e-07, + "loss": 0.1221, + "step": 31831 + }, + { + "epoch": 0.8055267353290989, + "grad_norm": 4.913142681121826, + "learning_rate": 9.223159826134037e-07, + "loss": 0.1448, + "step": 31832 + }, + { + "epoch": 0.8055520408937925, + "grad_norm": 5.358636856079102, + "learning_rate": 9.220836325754956e-07, + "loss": 0.21, + "step": 31833 + }, + { + "epoch": 0.8055773464584862, + "grad_norm": 5.277106761932373, + "learning_rate": 9.218513088353132e-07, + "loss": 0.1937, + "step": 31834 + }, + { + "epoch": 0.8056026520231799, + "grad_norm": 5.801593780517578, + "learning_rate": 9.216190113943529e-07, + "loss": 0.1905, + "step": 31835 + }, + { + "epoch": 0.8056279575878735, + "grad_norm": 13.466933250427246, + "learning_rate": 9.213867402541127e-07, + "loss": 0.3167, + "step": 31836 + }, + { + "epoch": 0.8056532631525672, + "grad_norm": 4.885253429412842, + "learning_rate": 9.211544954160901e-07, + "loss": 0.1715, + "step": 31837 + }, + { + "epoch": 0.805678568717261, + "grad_norm": 5.36627721786499, + "learning_rate": 9.209222768817843e-07, + "loss": 0.1867, + "step": 31838 + }, + { + "epoch": 0.8057038742819546, + "grad_norm": 6.30899715423584, + "learning_rate": 9.206900846526917e-07, + "loss": 0.2164, + "step": 31839 + }, + { + "epoch": 0.8057291798466483, + "grad_norm": 3.669764757156372, + "learning_rate": 9.204579187303104e-07, + "loss": 0.0933, + "step": 31840 + }, + { + "epoch": 0.805754485411342, + "grad_norm": 7.3464789390563965, + "learning_rate": 9.202257791161356e-07, + "loss": 0.158, + "step": 31841 + }, + { + "epoch": 0.8057797909760356, + "grad_norm": 4.336236476898193, + "learning_rate": 9.199936658116676e-07, + "loss": 0.1271, + "step": 31842 + }, + { + "epoch": 0.8058050965407293, + "grad_norm": 10.411444664001465, + "learning_rate": 9.197615788183995e-07, + "loss": 0.1633, + "step": 31843 + }, + { + "epoch": 0.805830402105423, + "grad_norm": 6.596353530883789, + "learning_rate": 9.195295181378333e-07, + "loss": 0.1381, + "step": 31844 + }, + { + "epoch": 0.8058557076701166, + "grad_norm": 2.564471483230591, + "learning_rate": 9.192974837714591e-07, + "loss": 0.0911, + "step": 31845 + }, + { + "epoch": 0.8058810132348103, + "grad_norm": 13.572502136230469, + "learning_rate": 9.190654757207778e-07, + "loss": 0.233, + "step": 31846 + }, + { + "epoch": 0.805906318799504, + "grad_norm": 12.370476722717285, + "learning_rate": 9.188334939872828e-07, + "loss": 0.188, + "step": 31847 + }, + { + "epoch": 0.8059316243641977, + "grad_norm": 5.3854217529296875, + "learning_rate": 9.186015385724739e-07, + "loss": 0.114, + "step": 31848 + }, + { + "epoch": 0.8059569299288913, + "grad_norm": 3.380995512008667, + "learning_rate": 9.183696094778421e-07, + "loss": 0.1758, + "step": 31849 + }, + { + "epoch": 0.805982235493585, + "grad_norm": 6.282153129577637, + "learning_rate": 9.181377067048869e-07, + "loss": 0.181, + "step": 31850 + }, + { + "epoch": 0.8060075410582788, + "grad_norm": 4.705039024353027, + "learning_rate": 9.179058302551008e-07, + "loss": 0.1057, + "step": 31851 + }, + { + "epoch": 0.8060328466229724, + "grad_norm": 2.997765064239502, + "learning_rate": 9.176739801299823e-07, + "loss": 0.1407, + "step": 31852 + }, + { + "epoch": 0.8060581521876661, + "grad_norm": 4.8053131103515625, + "learning_rate": 9.174421563310243e-07, + "loss": 0.1576, + "step": 31853 + }, + { + "epoch": 0.8060834577523598, + "grad_norm": 5.1691389083862305, + "learning_rate": 9.172103588597231e-07, + "loss": 0.1722, + "step": 31854 + }, + { + "epoch": 0.8061087633170534, + "grad_norm": 6.076071739196777, + "learning_rate": 9.169785877175713e-07, + "loss": 0.1609, + "step": 31855 + }, + { + "epoch": 0.8061340688817471, + "grad_norm": 6.18838357925415, + "learning_rate": 9.167468429060661e-07, + "loss": 0.12, + "step": 31856 + }, + { + "epoch": 0.8061593744464408, + "grad_norm": 9.310709953308105, + "learning_rate": 9.165151244267012e-07, + "loss": 0.231, + "step": 31857 + }, + { + "epoch": 0.8061846800111344, + "grad_norm": 3.500202178955078, + "learning_rate": 9.162834322809705e-07, + "loss": 0.1557, + "step": 31858 + }, + { + "epoch": 0.8062099855758281, + "grad_norm": 7.045668601989746, + "learning_rate": 9.160517664703689e-07, + "loss": 0.189, + "step": 31859 + }, + { + "epoch": 0.8062352911405218, + "grad_norm": 3.2038583755493164, + "learning_rate": 9.158201269963879e-07, + "loss": 0.1155, + "step": 31860 + }, + { + "epoch": 0.8062605967052154, + "grad_norm": 3.908604383468628, + "learning_rate": 9.155885138605247e-07, + "loss": 0.1372, + "step": 31861 + }, + { + "epoch": 0.8062859022699091, + "grad_norm": 5.734895706176758, + "learning_rate": 9.153569270642709e-07, + "loss": 0.1152, + "step": 31862 + }, + { + "epoch": 0.8063112078346029, + "grad_norm": 7.05855131149292, + "learning_rate": 9.151253666091214e-07, + "loss": 0.2064, + "step": 31863 + }, + { + "epoch": 0.8063365133992965, + "grad_norm": 4.593573570251465, + "learning_rate": 9.148938324965667e-07, + "loss": 0.1627, + "step": 31864 + }, + { + "epoch": 0.8063618189639902, + "grad_norm": 3.95414400100708, + "learning_rate": 9.146623247281033e-07, + "loss": 0.1396, + "step": 31865 + }, + { + "epoch": 0.8063871245286839, + "grad_norm": 2.446032762527466, + "learning_rate": 9.144308433052223e-07, + "loss": 0.0846, + "step": 31866 + }, + { + "epoch": 0.8064124300933775, + "grad_norm": 9.434962272644043, + "learning_rate": 9.14199388229417e-07, + "loss": 0.3314, + "step": 31867 + }, + { + "epoch": 0.8064377356580712, + "grad_norm": 4.059262752532959, + "learning_rate": 9.139679595021789e-07, + "loss": 0.1276, + "step": 31868 + }, + { + "epoch": 0.8064630412227649, + "grad_norm": 24.664691925048828, + "learning_rate": 9.137365571250023e-07, + "loss": 0.2019, + "step": 31869 + }, + { + "epoch": 0.8064883467874585, + "grad_norm": 5.288822174072266, + "learning_rate": 9.135051810993783e-07, + "loss": 0.1212, + "step": 31870 + }, + { + "epoch": 0.8065136523521522, + "grad_norm": 2.5832955837249756, + "learning_rate": 9.132738314267997e-07, + "loss": 0.0781, + "step": 31871 + }, + { + "epoch": 0.8065389579168459, + "grad_norm": 2.4411516189575195, + "learning_rate": 9.130425081087563e-07, + "loss": 0.1101, + "step": 31872 + }, + { + "epoch": 0.8065642634815396, + "grad_norm": 2.616224527359009, + "learning_rate": 9.128112111467429e-07, + "loss": 0.0611, + "step": 31873 + }, + { + "epoch": 0.8065895690462332, + "grad_norm": 2.486633539199829, + "learning_rate": 9.125799405422486e-07, + "loss": 0.1059, + "step": 31874 + }, + { + "epoch": 0.806614874610927, + "grad_norm": 4.954821586608887, + "learning_rate": 9.123486962967687e-07, + "loss": 0.1399, + "step": 31875 + }, + { + "epoch": 0.8066401801756207, + "grad_norm": 4.023221015930176, + "learning_rate": 9.121174784117887e-07, + "loss": 0.1349, + "step": 31876 + }, + { + "epoch": 0.8066654857403143, + "grad_norm": 5.7388434410095215, + "learning_rate": 9.118862868888045e-07, + "loss": 0.1381, + "step": 31877 + }, + { + "epoch": 0.806690791305008, + "grad_norm": 3.6928961277008057, + "learning_rate": 9.116551217293034e-07, + "loss": 0.1558, + "step": 31878 + }, + { + "epoch": 0.8067160968697017, + "grad_norm": 8.467795372009277, + "learning_rate": 9.114239829347804e-07, + "loss": 0.2038, + "step": 31879 + }, + { + "epoch": 0.8067414024343953, + "grad_norm": 2.8647985458374023, + "learning_rate": 9.111928705067213e-07, + "loss": 0.107, + "step": 31880 + }, + { + "epoch": 0.806766707999089, + "grad_norm": 6.017947673797607, + "learning_rate": 9.1096178444662e-07, + "loss": 0.1423, + "step": 31881 + }, + { + "epoch": 0.8067920135637827, + "grad_norm": 6.540981292724609, + "learning_rate": 9.107307247559644e-07, + "loss": 0.1482, + "step": 31882 + }, + { + "epoch": 0.8068173191284763, + "grad_norm": 7.078677654266357, + "learning_rate": 9.104996914362474e-07, + "loss": 0.1868, + "step": 31883 + }, + { + "epoch": 0.80684262469317, + "grad_norm": 8.522852897644043, + "learning_rate": 9.102686844889563e-07, + "loss": 0.1646, + "step": 31884 + }, + { + "epoch": 0.8068679302578637, + "grad_norm": 3.4023172855377197, + "learning_rate": 9.100377039155822e-07, + "loss": 0.0924, + "step": 31885 + }, + { + "epoch": 0.8068932358225573, + "grad_norm": 5.7559733390808105, + "learning_rate": 9.098067497176138e-07, + "loss": 0.1505, + "step": 31886 + }, + { + "epoch": 0.806918541387251, + "grad_norm": 3.4319865703582764, + "learning_rate": 9.0957582189654e-07, + "loss": 0.1531, + "step": 31887 + }, + { + "epoch": 0.8069438469519448, + "grad_norm": 5.579957008361816, + "learning_rate": 9.093449204538518e-07, + "loss": 0.1515, + "step": 31888 + }, + { + "epoch": 0.8069691525166384, + "grad_norm": 6.503880500793457, + "learning_rate": 9.091140453910374e-07, + "loss": 0.2309, + "step": 31889 + }, + { + "epoch": 0.8069944580813321, + "grad_norm": 5.177606582641602, + "learning_rate": 9.088831967095857e-07, + "loss": 0.1344, + "step": 31890 + }, + { + "epoch": 0.8070197636460258, + "grad_norm": 3.6270389556884766, + "learning_rate": 9.086523744109838e-07, + "loss": 0.1711, + "step": 31891 + }, + { + "epoch": 0.8070450692107194, + "grad_norm": 3.8130810260772705, + "learning_rate": 9.084215784967232e-07, + "loss": 0.1597, + "step": 31892 + }, + { + "epoch": 0.8070703747754131, + "grad_norm": 4.148355007171631, + "learning_rate": 9.081908089682906e-07, + "loss": 0.1485, + "step": 31893 + }, + { + "epoch": 0.8070956803401068, + "grad_norm": 3.8993914127349854, + "learning_rate": 9.079600658271743e-07, + "loss": 0.1414, + "step": 31894 + }, + { + "epoch": 0.8071209859048004, + "grad_norm": 8.225811958312988, + "learning_rate": 9.077293490748613e-07, + "loss": 0.236, + "step": 31895 + }, + { + "epoch": 0.8071462914694941, + "grad_norm": 3.371943712234497, + "learning_rate": 9.074986587128415e-07, + "loss": 0.0949, + "step": 31896 + }, + { + "epoch": 0.8071715970341878, + "grad_norm": 4.872303009033203, + "learning_rate": 9.072679947426016e-07, + "loss": 0.1983, + "step": 31897 + }, + { + "epoch": 0.8071969025988815, + "grad_norm": 4.1583967208862305, + "learning_rate": 9.070373571656294e-07, + "loss": 0.1884, + "step": 31898 + }, + { + "epoch": 0.8072222081635752, + "grad_norm": 3.952594757080078, + "learning_rate": 9.068067459834101e-07, + "loss": 0.1383, + "step": 31899 + }, + { + "epoch": 0.8072475137282689, + "grad_norm": 5.189072132110596, + "learning_rate": 9.065761611974344e-07, + "loss": 0.1385, + "step": 31900 + }, + { + "epoch": 0.8072728192929626, + "grad_norm": 4.2668776512146, + "learning_rate": 9.063456028091877e-07, + "loss": 0.1142, + "step": 31901 + }, + { + "epoch": 0.8072981248576562, + "grad_norm": 3.176910638809204, + "learning_rate": 9.061150708201561e-07, + "loss": 0.145, + "step": 31902 + }, + { + "epoch": 0.8073234304223499, + "grad_norm": 8.554661750793457, + "learning_rate": 9.05884565231826e-07, + "loss": 0.187, + "step": 31903 + }, + { + "epoch": 0.8073487359870436, + "grad_norm": 5.981935977935791, + "learning_rate": 9.056540860456864e-07, + "loss": 0.1732, + "step": 31904 + }, + { + "epoch": 0.8073740415517372, + "grad_norm": 3.5684990882873535, + "learning_rate": 9.054236332632205e-07, + "loss": 0.1239, + "step": 31905 + }, + { + "epoch": 0.8073993471164309, + "grad_norm": 6.516280174255371, + "learning_rate": 9.051932068859182e-07, + "loss": 0.1743, + "step": 31906 + }, + { + "epoch": 0.8074246526811246, + "grad_norm": 3.3629283905029297, + "learning_rate": 9.04962806915261e-07, + "loss": 0.1025, + "step": 31907 + }, + { + "epoch": 0.8074499582458182, + "grad_norm": 3.7266948223114014, + "learning_rate": 9.04732433352738e-07, + "loss": 0.1373, + "step": 31908 + }, + { + "epoch": 0.8074752638105119, + "grad_norm": 4.418233394622803, + "learning_rate": 9.045020861998327e-07, + "loss": 0.1413, + "step": 31909 + }, + { + "epoch": 0.8075005693752056, + "grad_norm": 13.712077140808105, + "learning_rate": 9.042717654580335e-07, + "loss": 0.3132, + "step": 31910 + }, + { + "epoch": 0.8075258749398992, + "grad_norm": 6.157784938812256, + "learning_rate": 9.04041471128822e-07, + "loss": 0.1344, + "step": 31911 + }, + { + "epoch": 0.807551180504593, + "grad_norm": 6.392029285430908, + "learning_rate": 9.038112032136864e-07, + "loss": 0.1212, + "step": 31912 + }, + { + "epoch": 0.8075764860692867, + "grad_norm": 6.915478229522705, + "learning_rate": 9.035809617141101e-07, + "loss": 0.1718, + "step": 31913 + }, + { + "epoch": 0.8076017916339803, + "grad_norm": 5.629636764526367, + "learning_rate": 9.033507466315783e-07, + "loss": 0.1345, + "step": 31914 + }, + { + "epoch": 0.807627097198674, + "grad_norm": 4.882153511047363, + "learning_rate": 9.031205579675744e-07, + "loss": 0.1779, + "step": 31915 + }, + { + "epoch": 0.8076524027633677, + "grad_norm": 8.905204772949219, + "learning_rate": 9.028903957235846e-07, + "loss": 0.3069, + "step": 31916 + }, + { + "epoch": 0.8076777083280613, + "grad_norm": 12.963830947875977, + "learning_rate": 9.02660259901093e-07, + "loss": 0.2701, + "step": 31917 + }, + { + "epoch": 0.807703013892755, + "grad_norm": 7.64139986038208, + "learning_rate": 9.02430150501582e-07, + "loss": 0.209, + "step": 31918 + }, + { + "epoch": 0.8077283194574487, + "grad_norm": 5.213499546051025, + "learning_rate": 9.02200067526538e-07, + "loss": 0.1915, + "step": 31919 + }, + { + "epoch": 0.8077536250221423, + "grad_norm": 4.616445541381836, + "learning_rate": 9.019700109774438e-07, + "loss": 0.1499, + "step": 31920 + }, + { + "epoch": 0.807778930586836, + "grad_norm": 7.230495452880859, + "learning_rate": 9.01739980855782e-07, + "loss": 0.2266, + "step": 31921 + }, + { + "epoch": 0.8078042361515297, + "grad_norm": 4.147047996520996, + "learning_rate": 9.015099771630359e-07, + "loss": 0.1293, + "step": 31922 + }, + { + "epoch": 0.8078295417162233, + "grad_norm": 2.715787410736084, + "learning_rate": 9.012799999006922e-07, + "loss": 0.0753, + "step": 31923 + }, + { + "epoch": 0.807854847280917, + "grad_norm": 11.886674880981445, + "learning_rate": 9.010500490702285e-07, + "loss": 0.2, + "step": 31924 + }, + { + "epoch": 0.8078801528456108, + "grad_norm": 2.539846420288086, + "learning_rate": 9.008201246731319e-07, + "loss": 0.0943, + "step": 31925 + }, + { + "epoch": 0.8079054584103045, + "grad_norm": 5.010170936584473, + "learning_rate": 9.005902267108829e-07, + "loss": 0.1316, + "step": 31926 + }, + { + "epoch": 0.8079307639749981, + "grad_norm": 5.422889709472656, + "learning_rate": 9.00360355184966e-07, + "loss": 0.1721, + "step": 31927 + }, + { + "epoch": 0.8079560695396918, + "grad_norm": 7.90978479385376, + "learning_rate": 9.001305100968621e-07, + "loss": 0.1289, + "step": 31928 + }, + { + "epoch": 0.8079813751043855, + "grad_norm": 5.877165794372559, + "learning_rate": 8.999006914480546e-07, + "loss": 0.1686, + "step": 31929 + }, + { + "epoch": 0.8080066806690791, + "grad_norm": 7.001636505126953, + "learning_rate": 8.996708992400232e-07, + "loss": 0.1752, + "step": 31930 + }, + { + "epoch": 0.8080319862337728, + "grad_norm": 4.799196720123291, + "learning_rate": 8.99441133474253e-07, + "loss": 0.1769, + "step": 31931 + }, + { + "epoch": 0.8080572917984665, + "grad_norm": 8.97797679901123, + "learning_rate": 8.99211394152224e-07, + "loss": 0.2323, + "step": 31932 + }, + { + "epoch": 0.8080825973631601, + "grad_norm": 8.538626670837402, + "learning_rate": 8.989816812754182e-07, + "loss": 0.1738, + "step": 31933 + }, + { + "epoch": 0.8081079029278538, + "grad_norm": 3.95005464553833, + "learning_rate": 8.98751994845315e-07, + "loss": 0.1311, + "step": 31934 + }, + { + "epoch": 0.8081332084925476, + "grad_norm": 11.833341598510742, + "learning_rate": 8.985223348633987e-07, + "loss": 0.3903, + "step": 31935 + }, + { + "epoch": 0.8081585140572412, + "grad_norm": 2.4636266231536865, + "learning_rate": 8.982927013311477e-07, + "loss": 0.0454, + "step": 31936 + }, + { + "epoch": 0.8081838196219349, + "grad_norm": 3.470690965652466, + "learning_rate": 8.980630942500468e-07, + "loss": 0.1273, + "step": 31937 + }, + { + "epoch": 0.8082091251866286, + "grad_norm": 5.976766586303711, + "learning_rate": 8.97833513621571e-07, + "loss": 0.2006, + "step": 31938 + }, + { + "epoch": 0.8082344307513222, + "grad_norm": 5.293757915496826, + "learning_rate": 8.976039594472058e-07, + "loss": 0.1983, + "step": 31939 + }, + { + "epoch": 0.8082597363160159, + "grad_norm": 3.6458516120910645, + "learning_rate": 8.973744317284278e-07, + "loss": 0.1539, + "step": 31940 + }, + { + "epoch": 0.8082850418807096, + "grad_norm": 4.582215309143066, + "learning_rate": 8.971449304667213e-07, + "loss": 0.0991, + "step": 31941 + }, + { + "epoch": 0.8083103474454032, + "grad_norm": 3.4600977897644043, + "learning_rate": 8.969154556635618e-07, + "loss": 0.1407, + "step": 31942 + }, + { + "epoch": 0.8083356530100969, + "grad_norm": 7.021618366241455, + "learning_rate": 8.966860073204325e-07, + "loss": 0.2176, + "step": 31943 + }, + { + "epoch": 0.8083609585747906, + "grad_norm": 5.846954822540283, + "learning_rate": 8.964565854388118e-07, + "loss": 0.1501, + "step": 31944 + }, + { + "epoch": 0.8083862641394842, + "grad_norm": 9.609358787536621, + "learning_rate": 8.962271900201791e-07, + "loss": 0.1604, + "step": 31945 + }, + { + "epoch": 0.8084115697041779, + "grad_norm": 4.398643493652344, + "learning_rate": 8.959978210660125e-07, + "loss": 0.1447, + "step": 31946 + }, + { + "epoch": 0.8084368752688716, + "grad_norm": 15.70601749420166, + "learning_rate": 8.957684785777943e-07, + "loss": 0.3087, + "step": 31947 + }, + { + "epoch": 0.8084621808335652, + "grad_norm": 4.478134632110596, + "learning_rate": 8.955391625570015e-07, + "loss": 0.1136, + "step": 31948 + }, + { + "epoch": 0.808487486398259, + "grad_norm": 3.7720141410827637, + "learning_rate": 8.953098730051119e-07, + "loss": 0.0877, + "step": 31949 + }, + { + "epoch": 0.8085127919629527, + "grad_norm": 2.3520655632019043, + "learning_rate": 8.950806099236081e-07, + "loss": 0.1203, + "step": 31950 + }, + { + "epoch": 0.8085380975276464, + "grad_norm": 4.713550567626953, + "learning_rate": 8.94851373313963e-07, + "loss": 0.1197, + "step": 31951 + }, + { + "epoch": 0.80856340309234, + "grad_norm": 5.935903072357178, + "learning_rate": 8.946221631776591e-07, + "loss": 0.1797, + "step": 31952 + }, + { + "epoch": 0.8085887086570337, + "grad_norm": 6.308651924133301, + "learning_rate": 8.943929795161721e-07, + "loss": 0.1507, + "step": 31953 + }, + { + "epoch": 0.8086140142217274, + "grad_norm": 3.499203681945801, + "learning_rate": 8.94163822330984e-07, + "loss": 0.1255, + "step": 31954 + }, + { + "epoch": 0.808639319786421, + "grad_norm": 12.970004081726074, + "learning_rate": 8.939346916235664e-07, + "loss": 0.1234, + "step": 31955 + }, + { + "epoch": 0.8086646253511147, + "grad_norm": 5.3199462890625, + "learning_rate": 8.93705587395402e-07, + "loss": 0.1209, + "step": 31956 + }, + { + "epoch": 0.8086899309158084, + "grad_norm": 5.289933204650879, + "learning_rate": 8.934765096479648e-07, + "loss": 0.1298, + "step": 31957 + }, + { + "epoch": 0.808715236480502, + "grad_norm": 14.7066650390625, + "learning_rate": 8.93247458382735e-07, + "loss": 0.2587, + "step": 31958 + }, + { + "epoch": 0.8087405420451957, + "grad_norm": 4.801178455352783, + "learning_rate": 8.930184336011887e-07, + "loss": 0.1393, + "step": 31959 + }, + { + "epoch": 0.8087658476098895, + "grad_norm": 8.577709197998047, + "learning_rate": 8.927894353048022e-07, + "loss": 0.1146, + "step": 31960 + }, + { + "epoch": 0.808791153174583, + "grad_norm": 3.854874610900879, + "learning_rate": 8.925604634950519e-07, + "loss": 0.1425, + "step": 31961 + }, + { + "epoch": 0.8088164587392768, + "grad_norm": 4.1059417724609375, + "learning_rate": 8.92331518173416e-07, + "loss": 0.132, + "step": 31962 + }, + { + "epoch": 0.8088417643039705, + "grad_norm": 3.853602409362793, + "learning_rate": 8.921025993413702e-07, + "loss": 0.1709, + "step": 31963 + }, + { + "epoch": 0.8088670698686641, + "grad_norm": 2.6793761253356934, + "learning_rate": 8.918737070003902e-07, + "loss": 0.146, + "step": 31964 + }, + { + "epoch": 0.8088923754333578, + "grad_norm": 4.063533306121826, + "learning_rate": 8.916448411519518e-07, + "loss": 0.149, + "step": 31965 + }, + { + "epoch": 0.8089176809980515, + "grad_norm": 7.863947868347168, + "learning_rate": 8.914160017975326e-07, + "loss": 0.1537, + "step": 31966 + }, + { + "epoch": 0.8089429865627451, + "grad_norm": 5.063335418701172, + "learning_rate": 8.91187188938607e-07, + "loss": 0.1618, + "step": 31967 + }, + { + "epoch": 0.8089682921274388, + "grad_norm": 9.227167129516602, + "learning_rate": 8.909584025766516e-07, + "loss": 0.2373, + "step": 31968 + }, + { + "epoch": 0.8089935976921325, + "grad_norm": 7.640912055969238, + "learning_rate": 8.907296427131395e-07, + "loss": 0.2134, + "step": 31969 + }, + { + "epoch": 0.8090189032568261, + "grad_norm": 9.853119850158691, + "learning_rate": 8.905009093495487e-07, + "loss": 0.1702, + "step": 31970 + }, + { + "epoch": 0.8090442088215198, + "grad_norm": 2.3290610313415527, + "learning_rate": 8.902722024873534e-07, + "loss": 0.085, + "step": 31971 + }, + { + "epoch": 0.8090695143862136, + "grad_norm": 3.9491138458251953, + "learning_rate": 8.900435221280279e-07, + "loss": 0.1235, + "step": 31972 + }, + { + "epoch": 0.8090948199509072, + "grad_norm": 10.240049362182617, + "learning_rate": 8.898148682730462e-07, + "loss": 0.1149, + "step": 31973 + }, + { + "epoch": 0.8091201255156009, + "grad_norm": 6.458018779754639, + "learning_rate": 8.895862409238854e-07, + "loss": 0.1977, + "step": 31974 + }, + { + "epoch": 0.8091454310802946, + "grad_norm": 5.082042217254639, + "learning_rate": 8.893576400820181e-07, + "loss": 0.1585, + "step": 31975 + }, + { + "epoch": 0.8091707366449883, + "grad_norm": 2.8723349571228027, + "learning_rate": 8.891290657489182e-07, + "loss": 0.0995, + "step": 31976 + }, + { + "epoch": 0.8091960422096819, + "grad_norm": 6.2025651931762695, + "learning_rate": 8.889005179260596e-07, + "loss": 0.1684, + "step": 31977 + }, + { + "epoch": 0.8092213477743756, + "grad_norm": 8.269430160522461, + "learning_rate": 8.886719966149177e-07, + "loss": 0.1501, + "step": 31978 + }, + { + "epoch": 0.8092466533390693, + "grad_norm": 4.359435081481934, + "learning_rate": 8.884435018169657e-07, + "loss": 0.164, + "step": 31979 + }, + { + "epoch": 0.8092719589037629, + "grad_norm": 6.940304756164551, + "learning_rate": 8.882150335336754e-07, + "loss": 0.1177, + "step": 31980 + }, + { + "epoch": 0.8092972644684566, + "grad_norm": 4.028171539306641, + "learning_rate": 8.879865917665242e-07, + "loss": 0.1537, + "step": 31981 + }, + { + "epoch": 0.8093225700331503, + "grad_norm": 16.941917419433594, + "learning_rate": 8.877581765169796e-07, + "loss": 0.253, + "step": 31982 + }, + { + "epoch": 0.8093478755978439, + "grad_norm": 10.24547004699707, + "learning_rate": 8.875297877865191e-07, + "loss": 0.1789, + "step": 31983 + }, + { + "epoch": 0.8093731811625376, + "grad_norm": 5.47783899307251, + "learning_rate": 8.873014255766127e-07, + "loss": 0.0851, + "step": 31984 + }, + { + "epoch": 0.8093984867272314, + "grad_norm": 4.649972438812256, + "learning_rate": 8.87073089888737e-07, + "loss": 0.1056, + "step": 31985 + }, + { + "epoch": 0.809423792291925, + "grad_norm": 4.963540077209473, + "learning_rate": 8.868447807243591e-07, + "loss": 0.1244, + "step": 31986 + }, + { + "epoch": 0.8094490978566187, + "grad_norm": 3.7440600395202637, + "learning_rate": 8.866164980849556e-07, + "loss": 0.1327, + "step": 31987 + }, + { + "epoch": 0.8094744034213124, + "grad_norm": 9.090296745300293, + "learning_rate": 8.86388241971996e-07, + "loss": 0.1428, + "step": 31988 + }, + { + "epoch": 0.809499708986006, + "grad_norm": 4.359461307525635, + "learning_rate": 8.861600123869557e-07, + "loss": 0.1335, + "step": 31989 + }, + { + "epoch": 0.8095250145506997, + "grad_norm": 5.022555828094482, + "learning_rate": 8.859318093313018e-07, + "loss": 0.135, + "step": 31990 + }, + { + "epoch": 0.8095503201153934, + "grad_norm": 13.434157371520996, + "learning_rate": 8.857036328065099e-07, + "loss": 0.2226, + "step": 31991 + }, + { + "epoch": 0.809575625680087, + "grad_norm": 11.283056259155273, + "learning_rate": 8.854754828140482e-07, + "loss": 0.3414, + "step": 31992 + }, + { + "epoch": 0.8096009312447807, + "grad_norm": 4.954095363616943, + "learning_rate": 8.852473593553917e-07, + "loss": 0.117, + "step": 31993 + }, + { + "epoch": 0.8096262368094744, + "grad_norm": 3.6051104068756104, + "learning_rate": 8.850192624320092e-07, + "loss": 0.1378, + "step": 31994 + }, + { + "epoch": 0.809651542374168, + "grad_norm": 4.502486228942871, + "learning_rate": 8.847911920453716e-07, + "loss": 0.1383, + "step": 31995 + }, + { + "epoch": 0.8096768479388617, + "grad_norm": 3.406569242477417, + "learning_rate": 8.845631481969497e-07, + "loss": 0.1173, + "step": 31996 + }, + { + "epoch": 0.8097021535035555, + "grad_norm": 2.6791746616363525, + "learning_rate": 8.843351308882153e-07, + "loss": 0.107, + "step": 31997 + }, + { + "epoch": 0.8097274590682491, + "grad_norm": 2.8475699424743652, + "learning_rate": 8.84107140120638e-07, + "loss": 0.0808, + "step": 31998 + }, + { + "epoch": 0.8097527646329428, + "grad_norm": 3.404697895050049, + "learning_rate": 8.838791758956883e-07, + "loss": 0.1584, + "step": 31999 + }, + { + "epoch": 0.8097780701976365, + "grad_norm": 3.1850357055664062, + "learning_rate": 8.836512382148349e-07, + "loss": 0.1, + "step": 32000 + }, + { + "epoch": 0.8098033757623302, + "grad_norm": 2.8709425926208496, + "learning_rate": 8.834233270795501e-07, + "loss": 0.1143, + "step": 32001 + }, + { + "epoch": 0.8098286813270238, + "grad_norm": 3.8233110904693604, + "learning_rate": 8.831954424913026e-07, + "loss": 0.1696, + "step": 32002 + }, + { + "epoch": 0.8098539868917175, + "grad_norm": 6.822518825531006, + "learning_rate": 8.829675844515617e-07, + "loss": 0.1277, + "step": 32003 + }, + { + "epoch": 0.8098792924564112, + "grad_norm": 3.541116714477539, + "learning_rate": 8.827397529617959e-07, + "loss": 0.1425, + "step": 32004 + }, + { + "epoch": 0.8099045980211048, + "grad_norm": 4.652806758880615, + "learning_rate": 8.825119480234773e-07, + "loss": 0.1722, + "step": 32005 + }, + { + "epoch": 0.8099299035857985, + "grad_norm": 3.845304250717163, + "learning_rate": 8.822841696380724e-07, + "loss": 0.1532, + "step": 32006 + }, + { + "epoch": 0.8099552091504922, + "grad_norm": 8.125201225280762, + "learning_rate": 8.820564178070518e-07, + "loss": 0.2198, + "step": 32007 + }, + { + "epoch": 0.8099805147151858, + "grad_norm": 6.616430282592773, + "learning_rate": 8.818286925318826e-07, + "loss": 0.1921, + "step": 32008 + }, + { + "epoch": 0.8100058202798796, + "grad_norm": 4.316554069519043, + "learning_rate": 8.816009938140329e-07, + "loss": 0.1273, + "step": 32009 + }, + { + "epoch": 0.8100311258445733, + "grad_norm": 8.354452133178711, + "learning_rate": 8.813733216549741e-07, + "loss": 0.1491, + "step": 32010 + }, + { + "epoch": 0.8100564314092669, + "grad_norm": 12.310651779174805, + "learning_rate": 8.81145676056171e-07, + "loss": 0.2209, + "step": 32011 + }, + { + "epoch": 0.8100817369739606, + "grad_norm": 10.291215896606445, + "learning_rate": 8.809180570190961e-07, + "loss": 0.2443, + "step": 32012 + }, + { + "epoch": 0.8101070425386543, + "grad_norm": 4.251443862915039, + "learning_rate": 8.806904645452119e-07, + "loss": 0.0972, + "step": 32013 + }, + { + "epoch": 0.8101323481033479, + "grad_norm": 8.020133972167969, + "learning_rate": 8.804628986359898e-07, + "loss": 0.1988, + "step": 32014 + }, + { + "epoch": 0.8101576536680416, + "grad_norm": 12.784994125366211, + "learning_rate": 8.802353592928947e-07, + "loss": 0.1873, + "step": 32015 + }, + { + "epoch": 0.8101829592327353, + "grad_norm": 38.77863311767578, + "learning_rate": 8.800078465173978e-07, + "loss": 0.3063, + "step": 32016 + }, + { + "epoch": 0.8102082647974289, + "grad_norm": 7.096940517425537, + "learning_rate": 8.797803603109617e-07, + "loss": 0.1645, + "step": 32017 + }, + { + "epoch": 0.8102335703621226, + "grad_norm": 3.4549219608306885, + "learning_rate": 8.795529006750569e-07, + "loss": 0.0975, + "step": 32018 + }, + { + "epoch": 0.8102588759268163, + "grad_norm": 19.875032424926758, + "learning_rate": 8.793254676111479e-07, + "loss": 0.2426, + "step": 32019 + }, + { + "epoch": 0.8102841814915099, + "grad_norm": 9.543030738830566, + "learning_rate": 8.790980611207045e-07, + "loss": 0.1735, + "step": 32020 + }, + { + "epoch": 0.8103094870562036, + "grad_norm": 3.016981363296509, + "learning_rate": 8.788706812051889e-07, + "loss": 0.0804, + "step": 32021 + }, + { + "epoch": 0.8103347926208974, + "grad_norm": 4.073151111602783, + "learning_rate": 8.786433278660711e-07, + "loss": 0.1296, + "step": 32022 + }, + { + "epoch": 0.810360098185591, + "grad_norm": 4.2763991355896, + "learning_rate": 8.784160011048149e-07, + "loss": 0.1275, + "step": 32023 + }, + { + "epoch": 0.8103854037502847, + "grad_norm": 2.371433734893799, + "learning_rate": 8.781887009228879e-07, + "loss": 0.0828, + "step": 32024 + }, + { + "epoch": 0.8104107093149784, + "grad_norm": 5.630947113037109, + "learning_rate": 8.779614273217551e-07, + "loss": 0.1578, + "step": 32025 + }, + { + "epoch": 0.8104360148796721, + "grad_norm": 4.432959079742432, + "learning_rate": 8.777341803028827e-07, + "loss": 0.0815, + "step": 32026 + }, + { + "epoch": 0.8104613204443657, + "grad_norm": 6.230525493621826, + "learning_rate": 8.775069598677343e-07, + "loss": 0.1414, + "step": 32027 + }, + { + "epoch": 0.8104866260090594, + "grad_norm": 7.206298828125, + "learning_rate": 8.772797660177784e-07, + "loss": 0.1553, + "step": 32028 + }, + { + "epoch": 0.8105119315737531, + "grad_norm": 7.889499187469482, + "learning_rate": 8.770525987544781e-07, + "loss": 0.2211, + "step": 32029 + }, + { + "epoch": 0.8105372371384467, + "grad_norm": 3.059499979019165, + "learning_rate": 8.768254580792984e-07, + "loss": 0.1155, + "step": 32030 + }, + { + "epoch": 0.8105625427031404, + "grad_norm": 6.343642711639404, + "learning_rate": 8.765983439937032e-07, + "loss": 0.1392, + "step": 32031 + }, + { + "epoch": 0.8105878482678341, + "grad_norm": 10.726945877075195, + "learning_rate": 8.7637125649916e-07, + "loss": 0.1725, + "step": 32032 + }, + { + "epoch": 0.8106131538325277, + "grad_norm": 4.640940189361572, + "learning_rate": 8.761441955971311e-07, + "loss": 0.163, + "step": 32033 + }, + { + "epoch": 0.8106384593972215, + "grad_norm": 5.292604923248291, + "learning_rate": 8.75917161289081e-07, + "loss": 0.1655, + "step": 32034 + }, + { + "epoch": 0.8106637649619152, + "grad_norm": 6.353142261505127, + "learning_rate": 8.756901535764744e-07, + "loss": 0.2363, + "step": 32035 + }, + { + "epoch": 0.8106890705266088, + "grad_norm": 3.0367610454559326, + "learning_rate": 8.754631724607732e-07, + "loss": 0.1697, + "step": 32036 + }, + { + "epoch": 0.8107143760913025, + "grad_norm": 3.0623350143432617, + "learning_rate": 8.752362179434442e-07, + "loss": 0.1062, + "step": 32037 + }, + { + "epoch": 0.8107396816559962, + "grad_norm": 5.342051029205322, + "learning_rate": 8.750092900259494e-07, + "loss": 0.1935, + "step": 32038 + }, + { + "epoch": 0.8107649872206898, + "grad_norm": 6.1930036544799805, + "learning_rate": 8.747823887097529e-07, + "loss": 0.2446, + "step": 32039 + }, + { + "epoch": 0.8107902927853835, + "grad_norm": 4.917163372039795, + "learning_rate": 8.745555139963158e-07, + "loss": 0.1225, + "step": 32040 + }, + { + "epoch": 0.8108155983500772, + "grad_norm": 22.576801300048828, + "learning_rate": 8.743286658871042e-07, + "loss": 0.2406, + "step": 32041 + }, + { + "epoch": 0.8108409039147708, + "grad_norm": 8.501879692077637, + "learning_rate": 8.741018443835791e-07, + "loss": 0.2157, + "step": 32042 + }, + { + "epoch": 0.8108662094794645, + "grad_norm": 3.7714967727661133, + "learning_rate": 8.738750494872045e-07, + "loss": 0.174, + "step": 32043 + }, + { + "epoch": 0.8108915150441582, + "grad_norm": 3.74412202835083, + "learning_rate": 8.736482811994407e-07, + "loss": 0.1191, + "step": 32044 + }, + { + "epoch": 0.8109168206088518, + "grad_norm": 2.7918989658355713, + "learning_rate": 8.734215395217521e-07, + "loss": 0.0547, + "step": 32045 + }, + { + "epoch": 0.8109421261735456, + "grad_norm": 4.514986515045166, + "learning_rate": 8.731948244555999e-07, + "loss": 0.1931, + "step": 32046 + }, + { + "epoch": 0.8109674317382393, + "grad_norm": 4.9550018310546875, + "learning_rate": 8.729681360024489e-07, + "loss": 0.1232, + "step": 32047 + }, + { + "epoch": 0.8109927373029329, + "grad_norm": 20.297216415405273, + "learning_rate": 8.727414741637564e-07, + "loss": 0.2245, + "step": 32048 + }, + { + "epoch": 0.8110180428676266, + "grad_norm": 3.871777296066284, + "learning_rate": 8.725148389409877e-07, + "loss": 0.1511, + "step": 32049 + }, + { + "epoch": 0.8110433484323203, + "grad_norm": 9.89858627319336, + "learning_rate": 8.722882303356012e-07, + "loss": 0.2043, + "step": 32050 + }, + { + "epoch": 0.8110686539970139, + "grad_norm": 2.1290955543518066, + "learning_rate": 8.720616483490635e-07, + "loss": 0.105, + "step": 32051 + }, + { + "epoch": 0.8110939595617076, + "grad_norm": 6.344664096832275, + "learning_rate": 8.718350929828289e-07, + "loss": 0.094, + "step": 32052 + }, + { + "epoch": 0.8111192651264013, + "grad_norm": 4.644992828369141, + "learning_rate": 8.716085642383637e-07, + "loss": 0.1441, + "step": 32053 + }, + { + "epoch": 0.811144570691095, + "grad_norm": 10.504321098327637, + "learning_rate": 8.713820621171254e-07, + "loss": 0.1563, + "step": 32054 + }, + { + "epoch": 0.8111698762557886, + "grad_norm": 8.307171821594238, + "learning_rate": 8.711555866205779e-07, + "loss": 0.1583, + "step": 32055 + }, + { + "epoch": 0.8111951818204823, + "grad_norm": 3.542686939239502, + "learning_rate": 8.709291377501799e-07, + "loss": 0.0696, + "step": 32056 + }, + { + "epoch": 0.811220487385176, + "grad_norm": 3.544557571411133, + "learning_rate": 8.707027155073916e-07, + "loss": 0.0702, + "step": 32057 + }, + { + "epoch": 0.8112457929498696, + "grad_norm": 4.412369728088379, + "learning_rate": 8.704763198936722e-07, + "loss": 0.1473, + "step": 32058 + }, + { + "epoch": 0.8112710985145634, + "grad_norm": 3.800230026245117, + "learning_rate": 8.70249950910484e-07, + "loss": 0.1273, + "step": 32059 + }, + { + "epoch": 0.8112964040792571, + "grad_norm": 6.797645092010498, + "learning_rate": 8.700236085592856e-07, + "loss": 0.2191, + "step": 32060 + }, + { + "epoch": 0.8113217096439507, + "grad_norm": 5.606902122497559, + "learning_rate": 8.697972928415371e-07, + "loss": 0.192, + "step": 32061 + }, + { + "epoch": 0.8113470152086444, + "grad_norm": 3.2525546550750732, + "learning_rate": 8.695710037586957e-07, + "loss": 0.1252, + "step": 32062 + }, + { + "epoch": 0.8113723207733381, + "grad_norm": 4.620065212249756, + "learning_rate": 8.693447413122247e-07, + "loss": 0.1301, + "step": 32063 + }, + { + "epoch": 0.8113976263380317, + "grad_norm": 3.668164014816284, + "learning_rate": 8.691185055035805e-07, + "loss": 0.1303, + "step": 32064 + }, + { + "epoch": 0.8114229319027254, + "grad_norm": 4.279835224151611, + "learning_rate": 8.688922963342227e-07, + "loss": 0.1121, + "step": 32065 + }, + { + "epoch": 0.8114482374674191, + "grad_norm": 7.3982744216918945, + "learning_rate": 8.686661138056101e-07, + "loss": 0.1424, + "step": 32066 + }, + { + "epoch": 0.8114735430321127, + "grad_norm": 3.579190492630005, + "learning_rate": 8.684399579191999e-07, + "loss": 0.1492, + "step": 32067 + }, + { + "epoch": 0.8114988485968064, + "grad_norm": 5.852692604064941, + "learning_rate": 8.682138286764535e-07, + "loss": 0.1654, + "step": 32068 + }, + { + "epoch": 0.8115241541615001, + "grad_norm": 5.9018096923828125, + "learning_rate": 8.679877260788272e-07, + "loss": 0.1944, + "step": 32069 + }, + { + "epoch": 0.8115494597261937, + "grad_norm": 5.741379737854004, + "learning_rate": 8.677616501277797e-07, + "loss": 0.2128, + "step": 32070 + }, + { + "epoch": 0.8115747652908875, + "grad_norm": 7.895753383636475, + "learning_rate": 8.67535600824767e-07, + "loss": 0.1672, + "step": 32071 + }, + { + "epoch": 0.8116000708555812, + "grad_norm": 6.75937032699585, + "learning_rate": 8.673095781712504e-07, + "loss": 0.2092, + "step": 32072 + }, + { + "epoch": 0.8116253764202748, + "grad_norm": 5.437322616577148, + "learning_rate": 8.670835821686851e-07, + "loss": 0.1786, + "step": 32073 + }, + { + "epoch": 0.8116506819849685, + "grad_norm": 9.512162208557129, + "learning_rate": 8.668576128185291e-07, + "loss": 0.1845, + "step": 32074 + }, + { + "epoch": 0.8116759875496622, + "grad_norm": 5.4617695808410645, + "learning_rate": 8.666316701222383e-07, + "loss": 0.1436, + "step": 32075 + }, + { + "epoch": 0.8117012931143558, + "grad_norm": 3.2885844707489014, + "learning_rate": 8.664057540812721e-07, + "loss": 0.1164, + "step": 32076 + }, + { + "epoch": 0.8117265986790495, + "grad_norm": 6.756059169769287, + "learning_rate": 8.661798646970853e-07, + "loss": 0.1474, + "step": 32077 + }, + { + "epoch": 0.8117519042437432, + "grad_norm": 4.007461071014404, + "learning_rate": 8.659540019711376e-07, + "loss": 0.1074, + "step": 32078 + }, + { + "epoch": 0.8117772098084369, + "grad_norm": 3.418736696243286, + "learning_rate": 8.657281659048816e-07, + "loss": 0.1003, + "step": 32079 + }, + { + "epoch": 0.8118025153731305, + "grad_norm": 13.804001808166504, + "learning_rate": 8.655023564997766e-07, + "loss": 0.1362, + "step": 32080 + }, + { + "epoch": 0.8118278209378242, + "grad_norm": 7.296337127685547, + "learning_rate": 8.652765737572771e-07, + "loss": 0.1579, + "step": 32081 + }, + { + "epoch": 0.811853126502518, + "grad_norm": 3.4237782955169678, + "learning_rate": 8.650508176788419e-07, + "loss": 0.1352, + "step": 32082 + }, + { + "epoch": 0.8118784320672116, + "grad_norm": 4.988608360290527, + "learning_rate": 8.648250882659221e-07, + "loss": 0.17, + "step": 32083 + }, + { + "epoch": 0.8119037376319053, + "grad_norm": 3.619203805923462, + "learning_rate": 8.645993855199775e-07, + "loss": 0.1098, + "step": 32084 + }, + { + "epoch": 0.811929043196599, + "grad_norm": 4.6026506423950195, + "learning_rate": 8.643737094424615e-07, + "loss": 0.1207, + "step": 32085 + }, + { + "epoch": 0.8119543487612926, + "grad_norm": 2.7788405418395996, + "learning_rate": 8.641480600348306e-07, + "loss": 0.0616, + "step": 32086 + }, + { + "epoch": 0.8119796543259863, + "grad_norm": 5.454744815826416, + "learning_rate": 8.6392243729854e-07, + "loss": 0.0919, + "step": 32087 + }, + { + "epoch": 0.81200495989068, + "grad_norm": 3.2508692741394043, + "learning_rate": 8.636968412350444e-07, + "loss": 0.1368, + "step": 32088 + }, + { + "epoch": 0.8120302654553736, + "grad_norm": 5.685380458831787, + "learning_rate": 8.634712718457966e-07, + "loss": 0.1509, + "step": 32089 + }, + { + "epoch": 0.8120555710200673, + "grad_norm": 3.4601988792419434, + "learning_rate": 8.632457291322549e-07, + "loss": 0.1234, + "step": 32090 + }, + { + "epoch": 0.812080876584761, + "grad_norm": 5.609701633453369, + "learning_rate": 8.63020213095872e-07, + "loss": 0.1261, + "step": 32091 + }, + { + "epoch": 0.8121061821494546, + "grad_norm": 3.9602608680725098, + "learning_rate": 8.627947237381018e-07, + "loss": 0.1511, + "step": 32092 + }, + { + "epoch": 0.8121314877141483, + "grad_norm": 3.161154270172119, + "learning_rate": 8.625692610603992e-07, + "loss": 0.1132, + "step": 32093 + }, + { + "epoch": 0.812156793278842, + "grad_norm": 7.957238674163818, + "learning_rate": 8.623438250642164e-07, + "loss": 0.2188, + "step": 32094 + }, + { + "epoch": 0.8121820988435356, + "grad_norm": 5.449774265289307, + "learning_rate": 8.621184157510099e-07, + "loss": 0.1504, + "step": 32095 + }, + { + "epoch": 0.8122074044082294, + "grad_norm": 2.0069830417633057, + "learning_rate": 8.618930331222319e-07, + "loss": 0.0804, + "step": 32096 + }, + { + "epoch": 0.8122327099729231, + "grad_norm": 3.3644583225250244, + "learning_rate": 8.616676771793365e-07, + "loss": 0.0871, + "step": 32097 + }, + { + "epoch": 0.8122580155376167, + "grad_norm": 6.755473613739014, + "learning_rate": 8.614423479237744e-07, + "loss": 0.1659, + "step": 32098 + }, + { + "epoch": 0.8122833211023104, + "grad_norm": 3.61582612991333, + "learning_rate": 8.612170453570024e-07, + "loss": 0.1065, + "step": 32099 + }, + { + "epoch": 0.8123086266670041, + "grad_norm": 9.16088581085205, + "learning_rate": 8.60991769480472e-07, + "loss": 0.1622, + "step": 32100 + }, + { + "epoch": 0.8123339322316977, + "grad_norm": 2.6511614322662354, + "learning_rate": 8.607665202956355e-07, + "loss": 0.1307, + "step": 32101 + }, + { + "epoch": 0.8123592377963914, + "grad_norm": 3.788548231124878, + "learning_rate": 8.605412978039445e-07, + "loss": 0.113, + "step": 32102 + }, + { + "epoch": 0.8123845433610851, + "grad_norm": 5.343179702758789, + "learning_rate": 8.603161020068545e-07, + "loss": 0.2418, + "step": 32103 + }, + { + "epoch": 0.8124098489257788, + "grad_norm": 7.480183124542236, + "learning_rate": 8.600909329058155e-07, + "loss": 0.2445, + "step": 32104 + }, + { + "epoch": 0.8124351544904724, + "grad_norm": 6.62051248550415, + "learning_rate": 8.598657905022794e-07, + "loss": 0.1219, + "step": 32105 + }, + { + "epoch": 0.8124604600551661, + "grad_norm": 12.483644485473633, + "learning_rate": 8.596406747976982e-07, + "loss": 0.2657, + "step": 32106 + }, + { + "epoch": 0.8124857656198599, + "grad_norm": 3.8323142528533936, + "learning_rate": 8.594155857935254e-07, + "loss": 0.1398, + "step": 32107 + }, + { + "epoch": 0.8125110711845535, + "grad_norm": 5.328544616699219, + "learning_rate": 8.591905234912095e-07, + "loss": 0.1299, + "step": 32108 + }, + { + "epoch": 0.8125363767492472, + "grad_norm": 5.752325057983398, + "learning_rate": 8.589654878922066e-07, + "loss": 0.1913, + "step": 32109 + }, + { + "epoch": 0.8125616823139409, + "grad_norm": 3.56138277053833, + "learning_rate": 8.587404789979626e-07, + "loss": 0.1516, + "step": 32110 + }, + { + "epoch": 0.8125869878786345, + "grad_norm": 7.574598789215088, + "learning_rate": 8.585154968099319e-07, + "loss": 0.1422, + "step": 32111 + }, + { + "epoch": 0.8126122934433282, + "grad_norm": 4.436532497406006, + "learning_rate": 8.582905413295628e-07, + "loss": 0.1161, + "step": 32112 + }, + { + "epoch": 0.8126375990080219, + "grad_norm": 2.6922292709350586, + "learning_rate": 8.580656125583103e-07, + "loss": 0.1342, + "step": 32113 + }, + { + "epoch": 0.8126629045727155, + "grad_norm": 3.3009960651397705, + "learning_rate": 8.578407104976199e-07, + "loss": 0.1112, + "step": 32114 + }, + { + "epoch": 0.8126882101374092, + "grad_norm": 3.765316963195801, + "learning_rate": 8.576158351489456e-07, + "loss": 0.1869, + "step": 32115 + }, + { + "epoch": 0.8127135157021029, + "grad_norm": 5.271150588989258, + "learning_rate": 8.573909865137348e-07, + "loss": 0.1516, + "step": 32116 + }, + { + "epoch": 0.8127388212667965, + "grad_norm": 7.542306900024414, + "learning_rate": 8.571661645934414e-07, + "loss": 0.1812, + "step": 32117 + }, + { + "epoch": 0.8127641268314902, + "grad_norm": 10.620342254638672, + "learning_rate": 8.569413693895107e-07, + "loss": 0.1751, + "step": 32118 + }, + { + "epoch": 0.812789432396184, + "grad_norm": 5.280855178833008, + "learning_rate": 8.56716600903395e-07, + "loss": 0.1316, + "step": 32119 + }, + { + "epoch": 0.8128147379608776, + "grad_norm": 10.415006637573242, + "learning_rate": 8.564918591365434e-07, + "loss": 0.1007, + "step": 32120 + }, + { + "epoch": 0.8128400435255713, + "grad_norm": 8.94446849822998, + "learning_rate": 8.562671440904041e-07, + "loss": 0.2017, + "step": 32121 + }, + { + "epoch": 0.812865349090265, + "grad_norm": 8.658416748046875, + "learning_rate": 8.560424557664282e-07, + "loss": 0.2191, + "step": 32122 + }, + { + "epoch": 0.8128906546549586, + "grad_norm": 2.882535934448242, + "learning_rate": 8.558177941660639e-07, + "loss": 0.0703, + "step": 32123 + }, + { + "epoch": 0.8129159602196523, + "grad_norm": 13.697598457336426, + "learning_rate": 8.555931592907596e-07, + "loss": 0.1772, + "step": 32124 + }, + { + "epoch": 0.812941265784346, + "grad_norm": 10.459362030029297, + "learning_rate": 8.553685511419629e-07, + "loss": 0.1642, + "step": 32125 + }, + { + "epoch": 0.8129665713490396, + "grad_norm": 9.1380033493042, + "learning_rate": 8.55143969721125e-07, + "loss": 0.1245, + "step": 32126 + }, + { + "epoch": 0.8129918769137333, + "grad_norm": 7.618360996246338, + "learning_rate": 8.549194150296919e-07, + "loss": 0.2333, + "step": 32127 + }, + { + "epoch": 0.813017182478427, + "grad_norm": 4.654727935791016, + "learning_rate": 8.546948870691129e-07, + "loss": 0.1552, + "step": 32128 + }, + { + "epoch": 0.8130424880431207, + "grad_norm": 9.974520683288574, + "learning_rate": 8.544703858408343e-07, + "loss": 0.2356, + "step": 32129 + }, + { + "epoch": 0.8130677936078143, + "grad_norm": 6.709883213043213, + "learning_rate": 8.542459113463065e-07, + "loss": 0.1891, + "step": 32130 + }, + { + "epoch": 0.813093099172508, + "grad_norm": 8.375913619995117, + "learning_rate": 8.540214635869753e-07, + "loss": 0.1588, + "step": 32131 + }, + { + "epoch": 0.8131184047372018, + "grad_norm": 9.363343238830566, + "learning_rate": 8.537970425642883e-07, + "loss": 0.268, + "step": 32132 + }, + { + "epoch": 0.8131437103018954, + "grad_norm": 4.893599033355713, + "learning_rate": 8.535726482796919e-07, + "loss": 0.1435, + "step": 32133 + }, + { + "epoch": 0.8131690158665891, + "grad_norm": 11.247591972351074, + "learning_rate": 8.533482807346354e-07, + "loss": 0.181, + "step": 32134 + }, + { + "epoch": 0.8131943214312828, + "grad_norm": 3.9881367683410645, + "learning_rate": 8.531239399305646e-07, + "loss": 0.1443, + "step": 32135 + }, + { + "epoch": 0.8132196269959764, + "grad_norm": 12.215205192565918, + "learning_rate": 8.528996258689265e-07, + "loss": 0.1915, + "step": 32136 + }, + { + "epoch": 0.8132449325606701, + "grad_norm": 13.657204627990723, + "learning_rate": 8.526753385511655e-07, + "loss": 0.1571, + "step": 32137 + }, + { + "epoch": 0.8132702381253638, + "grad_norm": 6.063762187957764, + "learning_rate": 8.52451077978731e-07, + "loss": 0.1611, + "step": 32138 + }, + { + "epoch": 0.8132955436900574, + "grad_norm": 5.011387825012207, + "learning_rate": 8.522268441530667e-07, + "loss": 0.1941, + "step": 32139 + }, + { + "epoch": 0.8133208492547511, + "grad_norm": 6.685145854949951, + "learning_rate": 8.520026370756229e-07, + "loss": 0.1431, + "step": 32140 + }, + { + "epoch": 0.8133461548194448, + "grad_norm": 7.86321496963501, + "learning_rate": 8.517784567478399e-07, + "loss": 0.1525, + "step": 32141 + }, + { + "epoch": 0.8133714603841384, + "grad_norm": 8.368541717529297, + "learning_rate": 8.51554303171167e-07, + "loss": 0.1607, + "step": 32142 + }, + { + "epoch": 0.8133967659488321, + "grad_norm": 3.8880908489227295, + "learning_rate": 8.513301763470477e-07, + "loss": 0.1599, + "step": 32143 + }, + { + "epoch": 0.8134220715135259, + "grad_norm": 3.1630868911743164, + "learning_rate": 8.511060762769308e-07, + "loss": 0.1175, + "step": 32144 + }, + { + "epoch": 0.8134473770782195, + "grad_norm": 5.82043981552124, + "learning_rate": 8.508820029622561e-07, + "loss": 0.1479, + "step": 32145 + }, + { + "epoch": 0.8134726826429132, + "grad_norm": 3.870142936706543, + "learning_rate": 8.506579564044737e-07, + "loss": 0.1167, + "step": 32146 + }, + { + "epoch": 0.8134979882076069, + "grad_norm": 8.92172908782959, + "learning_rate": 8.504339366050257e-07, + "loss": 0.3323, + "step": 32147 + }, + { + "epoch": 0.8135232937723005, + "grad_norm": 7.348924160003662, + "learning_rate": 8.502099435653571e-07, + "loss": 0.231, + "step": 32148 + }, + { + "epoch": 0.8135485993369942, + "grad_norm": 2.868565559387207, + "learning_rate": 8.499859772869124e-07, + "loss": 0.1153, + "step": 32149 + }, + { + "epoch": 0.8135739049016879, + "grad_norm": 4.9104323387146, + "learning_rate": 8.497620377711363e-07, + "loss": 0.1081, + "step": 32150 + }, + { + "epoch": 0.8135992104663815, + "grad_norm": 5.1647491455078125, + "learning_rate": 8.495381250194734e-07, + "loss": 0.1664, + "step": 32151 + }, + { + "epoch": 0.8136245160310752, + "grad_norm": 2.5758376121520996, + "learning_rate": 8.493142390333658e-07, + "loss": 0.0801, + "step": 32152 + }, + { + "epoch": 0.8136498215957689, + "grad_norm": 8.18918228149414, + "learning_rate": 8.490903798142603e-07, + "loss": 0.2416, + "step": 32153 + }, + { + "epoch": 0.8136751271604626, + "grad_norm": 3.946239709854126, + "learning_rate": 8.488665473635982e-07, + "loss": 0.1221, + "step": 32154 + }, + { + "epoch": 0.8137004327251562, + "grad_norm": 2.760741710662842, + "learning_rate": 8.486427416828236e-07, + "loss": 0.104, + "step": 32155 + }, + { + "epoch": 0.81372573828985, + "grad_norm": 4.626658916473389, + "learning_rate": 8.484189627733785e-07, + "loss": 0.0998, + "step": 32156 + }, + { + "epoch": 0.8137510438545437, + "grad_norm": 4.083251953125, + "learning_rate": 8.481952106367098e-07, + "loss": 0.1306, + "step": 32157 + }, + { + "epoch": 0.8137763494192373, + "grad_norm": 6.796449661254883, + "learning_rate": 8.479714852742555e-07, + "loss": 0.2234, + "step": 32158 + }, + { + "epoch": 0.813801654983931, + "grad_norm": 3.7719950675964355, + "learning_rate": 8.477477866874618e-07, + "loss": 0.1153, + "step": 32159 + }, + { + "epoch": 0.8138269605486247, + "grad_norm": 3.92488956451416, + "learning_rate": 8.475241148777691e-07, + "loss": 0.1314, + "step": 32160 + }, + { + "epoch": 0.8138522661133183, + "grad_norm": 5.946700572967529, + "learning_rate": 8.473004698466225e-07, + "loss": 0.1756, + "step": 32161 + }, + { + "epoch": 0.813877571678012, + "grad_norm": 3.1647520065307617, + "learning_rate": 8.470768515954625e-07, + "loss": 0.1305, + "step": 32162 + }, + { + "epoch": 0.8139028772427057, + "grad_norm": 6.002222061157227, + "learning_rate": 8.468532601257312e-07, + "loss": 0.2338, + "step": 32163 + }, + { + "epoch": 0.8139281828073993, + "grad_norm": 2.835737705230713, + "learning_rate": 8.466296954388697e-07, + "loss": 0.0932, + "step": 32164 + }, + { + "epoch": 0.813953488372093, + "grad_norm": 6.789999485015869, + "learning_rate": 8.464061575363219e-07, + "loss": 0.1683, + "step": 32165 + }, + { + "epoch": 0.8139787939367867, + "grad_norm": 3.7101972103118896, + "learning_rate": 8.461826464195278e-07, + "loss": 0.1821, + "step": 32166 + }, + { + "epoch": 0.8140040995014803, + "grad_norm": 4.345196723937988, + "learning_rate": 8.459591620899299e-07, + "loss": 0.1608, + "step": 32167 + }, + { + "epoch": 0.814029405066174, + "grad_norm": 2.709075450897217, + "learning_rate": 8.457357045489667e-07, + "loss": 0.0787, + "step": 32168 + }, + { + "epoch": 0.8140547106308678, + "grad_norm": 10.27637767791748, + "learning_rate": 8.455122737980825e-07, + "loss": 0.2317, + "step": 32169 + }, + { + "epoch": 0.8140800161955614, + "grad_norm": 3.924875259399414, + "learning_rate": 8.452888698387174e-07, + "loss": 0.0846, + "step": 32170 + }, + { + "epoch": 0.8141053217602551, + "grad_norm": 8.627227783203125, + "learning_rate": 8.450654926723112e-07, + "loss": 0.2313, + "step": 32171 + }, + { + "epoch": 0.8141306273249488, + "grad_norm": 5.7251763343811035, + "learning_rate": 8.448421423003034e-07, + "loss": 0.1359, + "step": 32172 + }, + { + "epoch": 0.8141559328896424, + "grad_norm": 2.1352121829986572, + "learning_rate": 8.446188187241366e-07, + "loss": 0.1014, + "step": 32173 + }, + { + "epoch": 0.8141812384543361, + "grad_norm": 5.013205051422119, + "learning_rate": 8.443955219452499e-07, + "loss": 0.1762, + "step": 32174 + }, + { + "epoch": 0.8142065440190298, + "grad_norm": 5.1216959953308105, + "learning_rate": 8.441722519650852e-07, + "loss": 0.1491, + "step": 32175 + }, + { + "epoch": 0.8142318495837234, + "grad_norm": 4.644170761108398, + "learning_rate": 8.439490087850782e-07, + "loss": 0.1585, + "step": 32176 + }, + { + "epoch": 0.8142571551484171, + "grad_norm": 4.436767578125, + "learning_rate": 8.437257924066722e-07, + "loss": 0.208, + "step": 32177 + }, + { + "epoch": 0.8142824607131108, + "grad_norm": 5.775941848754883, + "learning_rate": 8.435026028313053e-07, + "loss": 0.2046, + "step": 32178 + }, + { + "epoch": 0.8143077662778044, + "grad_norm": 4.736403942108154, + "learning_rate": 8.432794400604172e-07, + "loss": 0.1521, + "step": 32179 + }, + { + "epoch": 0.8143330718424981, + "grad_norm": 3.743692636489868, + "learning_rate": 8.430563040954454e-07, + "loss": 0.125, + "step": 32180 + }, + { + "epoch": 0.8143583774071919, + "grad_norm": 5.842630863189697, + "learning_rate": 8.428331949378315e-07, + "loss": 0.0866, + "step": 32181 + }, + { + "epoch": 0.8143836829718856, + "grad_norm": 14.619629859924316, + "learning_rate": 8.426101125890135e-07, + "loss": 0.2291, + "step": 32182 + }, + { + "epoch": 0.8144089885365792, + "grad_norm": 6.376585960388184, + "learning_rate": 8.423870570504278e-07, + "loss": 0.1738, + "step": 32183 + }, + { + "epoch": 0.8144342941012729, + "grad_norm": 19.037349700927734, + "learning_rate": 8.421640283235172e-07, + "loss": 0.2044, + "step": 32184 + }, + { + "epoch": 0.8144595996659666, + "grad_norm": 2.170337438583374, + "learning_rate": 8.41941026409715e-07, + "loss": 0.1253, + "step": 32185 + }, + { + "epoch": 0.8144849052306602, + "grad_norm": 8.229013442993164, + "learning_rate": 8.417180513104628e-07, + "loss": 0.2716, + "step": 32186 + }, + { + "epoch": 0.8145102107953539, + "grad_norm": 4.4356207847595215, + "learning_rate": 8.414951030271961e-07, + "loss": 0.1321, + "step": 32187 + }, + { + "epoch": 0.8145355163600476, + "grad_norm": 2.5365753173828125, + "learning_rate": 8.412721815613561e-07, + "loss": 0.1176, + "step": 32188 + }, + { + "epoch": 0.8145608219247412, + "grad_norm": 2.7233214378356934, + "learning_rate": 8.41049286914376e-07, + "loss": 0.0986, + "step": 32189 + }, + { + "epoch": 0.8145861274894349, + "grad_norm": 3.243316888809204, + "learning_rate": 8.408264190876969e-07, + "loss": 0.1153, + "step": 32190 + }, + { + "epoch": 0.8146114330541286, + "grad_norm": 4.952225208282471, + "learning_rate": 8.406035780827532e-07, + "loss": 0.1176, + "step": 32191 + }, + { + "epoch": 0.8146367386188222, + "grad_norm": 6.994563579559326, + "learning_rate": 8.403807639009847e-07, + "loss": 0.1314, + "step": 32192 + }, + { + "epoch": 0.814662044183516, + "grad_norm": 8.075920104980469, + "learning_rate": 8.401579765438266e-07, + "loss": 0.1887, + "step": 32193 + }, + { + "epoch": 0.8146873497482097, + "grad_norm": 5.3825273513793945, + "learning_rate": 8.399352160127161e-07, + "loss": 0.2079, + "step": 32194 + }, + { + "epoch": 0.8147126553129033, + "grad_norm": 9.037209510803223, + "learning_rate": 8.39712482309088e-07, + "loss": 0.2942, + "step": 32195 + }, + { + "epoch": 0.814737960877597, + "grad_norm": 4.450008392333984, + "learning_rate": 8.394897754343817e-07, + "loss": 0.12, + "step": 32196 + }, + { + "epoch": 0.8147632664422907, + "grad_norm": 5.361696720123291, + "learning_rate": 8.392670953900322e-07, + "loss": 0.1668, + "step": 32197 + }, + { + "epoch": 0.8147885720069843, + "grad_norm": 5.644356727600098, + "learning_rate": 8.390444421774752e-07, + "loss": 0.1628, + "step": 32198 + }, + { + "epoch": 0.814813877571678, + "grad_norm": 2.655007839202881, + "learning_rate": 8.388218157981448e-07, + "loss": 0.0871, + "step": 32199 + }, + { + "epoch": 0.8148391831363717, + "grad_norm": 4.2468695640563965, + "learning_rate": 8.3859921625348e-07, + "loss": 0.2062, + "step": 32200 + }, + { + "epoch": 0.8148644887010653, + "grad_norm": 4.3145880699157715, + "learning_rate": 8.383766435449153e-07, + "loss": 0.1248, + "step": 32201 + }, + { + "epoch": 0.814889794265759, + "grad_norm": 2.2924728393554688, + "learning_rate": 8.38154097673885e-07, + "loss": 0.1018, + "step": 32202 + }, + { + "epoch": 0.8149150998304527, + "grad_norm": 6.536116123199463, + "learning_rate": 8.379315786418235e-07, + "loss": 0.2257, + "step": 32203 + }, + { + "epoch": 0.8149404053951463, + "grad_norm": 2.615976572036743, + "learning_rate": 8.37709086450168e-07, + "loss": 0.1149, + "step": 32204 + }, + { + "epoch": 0.81496571095984, + "grad_norm": 7.016548156738281, + "learning_rate": 8.37486621100353e-07, + "loss": 0.1301, + "step": 32205 + }, + { + "epoch": 0.8149910165245338, + "grad_norm": 4.279562473297119, + "learning_rate": 8.372641825938121e-07, + "loss": 0.1603, + "step": 32206 + }, + { + "epoch": 0.8150163220892275, + "grad_norm": 5.289058208465576, + "learning_rate": 8.370417709319789e-07, + "loss": 0.1511, + "step": 32207 + }, + { + "epoch": 0.8150416276539211, + "grad_norm": 6.272588729858398, + "learning_rate": 8.368193861162904e-07, + "loss": 0.1335, + "step": 32208 + }, + { + "epoch": 0.8150669332186148, + "grad_norm": 8.574828147888184, + "learning_rate": 8.365970281481794e-07, + "loss": 0.2792, + "step": 32209 + }, + { + "epoch": 0.8150922387833085, + "grad_norm": 8.420350074768066, + "learning_rate": 8.363746970290792e-07, + "loss": 0.1077, + "step": 32210 + }, + { + "epoch": 0.8151175443480021, + "grad_norm": 1.926737904548645, + "learning_rate": 8.361523927604231e-07, + "loss": 0.0983, + "step": 32211 + }, + { + "epoch": 0.8151428499126958, + "grad_norm": 7.275491714477539, + "learning_rate": 8.359301153436472e-07, + "loss": 0.1288, + "step": 32212 + }, + { + "epoch": 0.8151681554773895, + "grad_norm": 5.038490295410156, + "learning_rate": 8.35707864780183e-07, + "loss": 0.1247, + "step": 32213 + }, + { + "epoch": 0.8151934610420831, + "grad_norm": 18.7508602142334, + "learning_rate": 8.354856410714634e-07, + "loss": 0.2112, + "step": 32214 + }, + { + "epoch": 0.8152187666067768, + "grad_norm": 7.545313835144043, + "learning_rate": 8.352634442189245e-07, + "loss": 0.1901, + "step": 32215 + }, + { + "epoch": 0.8152440721714705, + "grad_norm": 3.002347230911255, + "learning_rate": 8.350412742239944e-07, + "loss": 0.0604, + "step": 32216 + }, + { + "epoch": 0.8152693777361641, + "grad_norm": 5.04942512512207, + "learning_rate": 8.348191310881093e-07, + "loss": 0.1469, + "step": 32217 + }, + { + "epoch": 0.8152946833008579, + "grad_norm": 4.101979732513428, + "learning_rate": 8.345970148126997e-07, + "loss": 0.164, + "step": 32218 + }, + { + "epoch": 0.8153199888655516, + "grad_norm": 3.1080260276794434, + "learning_rate": 8.343749253992023e-07, + "loss": 0.138, + "step": 32219 + }, + { + "epoch": 0.8153452944302452, + "grad_norm": 2.165844678878784, + "learning_rate": 8.341528628490431e-07, + "loss": 0.0701, + "step": 32220 + }, + { + "epoch": 0.8153705999949389, + "grad_norm": 2.922847270965576, + "learning_rate": 8.339308271636582e-07, + "loss": 0.1206, + "step": 32221 + }, + { + "epoch": 0.8153959055596326, + "grad_norm": 4.866054058074951, + "learning_rate": 8.337088183444775e-07, + "loss": 0.1667, + "step": 32222 + }, + { + "epoch": 0.8154212111243262, + "grad_norm": 3.0738236904144287, + "learning_rate": 8.334868363929361e-07, + "loss": 0.1239, + "step": 32223 + }, + { + "epoch": 0.8154465166890199, + "grad_norm": 4.999967098236084, + "learning_rate": 8.33264881310461e-07, + "loss": 0.0947, + "step": 32224 + }, + { + "epoch": 0.8154718222537136, + "grad_norm": 2.929338216781616, + "learning_rate": 8.330429530984863e-07, + "loss": 0.1143, + "step": 32225 + }, + { + "epoch": 0.8154971278184072, + "grad_norm": 5.67173957824707, + "learning_rate": 8.328210517584412e-07, + "loss": 0.2296, + "step": 32226 + }, + { + "epoch": 0.8155224333831009, + "grad_norm": 3.9651038646698, + "learning_rate": 8.325991772917596e-07, + "loss": 0.1081, + "step": 32227 + }, + { + "epoch": 0.8155477389477946, + "grad_norm": 9.01963996887207, + "learning_rate": 8.323773296998705e-07, + "loss": 0.2408, + "step": 32228 + }, + { + "epoch": 0.8155730445124882, + "grad_norm": 5.905527114868164, + "learning_rate": 8.321555089842049e-07, + "loss": 0.1786, + "step": 32229 + }, + { + "epoch": 0.815598350077182, + "grad_norm": 3.753866672515869, + "learning_rate": 8.319337151461915e-07, + "loss": 0.1124, + "step": 32230 + }, + { + "epoch": 0.8156236556418757, + "grad_norm": 6.9999098777771, + "learning_rate": 8.317119481872638e-07, + "loss": 0.1614, + "step": 32231 + }, + { + "epoch": 0.8156489612065694, + "grad_norm": 4.582718849182129, + "learning_rate": 8.314902081088505e-07, + "loss": 0.1311, + "step": 32232 + }, + { + "epoch": 0.815674266771263, + "grad_norm": 9.217338562011719, + "learning_rate": 8.312684949123812e-07, + "loss": 0.1688, + "step": 32233 + }, + { + "epoch": 0.8156995723359567, + "grad_norm": 4.598434925079346, + "learning_rate": 8.310468085992851e-07, + "loss": 0.0934, + "step": 32234 + }, + { + "epoch": 0.8157248779006504, + "grad_norm": 5.1896843910217285, + "learning_rate": 8.308251491709934e-07, + "loss": 0.1743, + "step": 32235 + }, + { + "epoch": 0.815750183465344, + "grad_norm": 6.725835800170898, + "learning_rate": 8.30603516628935e-07, + "loss": 0.1743, + "step": 32236 + }, + { + "epoch": 0.8157754890300377, + "grad_norm": 3.8764095306396484, + "learning_rate": 8.303819109745392e-07, + "loss": 0.1457, + "step": 32237 + }, + { + "epoch": 0.8158007945947314, + "grad_norm": 12.665904998779297, + "learning_rate": 8.301603322092333e-07, + "loss": 0.1437, + "step": 32238 + }, + { + "epoch": 0.815826100159425, + "grad_norm": 7.001617908477783, + "learning_rate": 8.299387803344489e-07, + "loss": 0.2163, + "step": 32239 + }, + { + "epoch": 0.8158514057241187, + "grad_norm": 6.171182632446289, + "learning_rate": 8.297172553516136e-07, + "loss": 0.2031, + "step": 32240 + }, + { + "epoch": 0.8158767112888125, + "grad_norm": 4.2852091789245605, + "learning_rate": 8.294957572621565e-07, + "loss": 0.1271, + "step": 32241 + }, + { + "epoch": 0.815902016853506, + "grad_norm": 3.6880605220794678, + "learning_rate": 8.292742860675052e-07, + "loss": 0.112, + "step": 32242 + }, + { + "epoch": 0.8159273224181998, + "grad_norm": 3.0862016677856445, + "learning_rate": 8.290528417690868e-07, + "loss": 0.0841, + "step": 32243 + }, + { + "epoch": 0.8159526279828935, + "grad_norm": 4.645529270172119, + "learning_rate": 8.28831424368332e-07, + "loss": 0.1517, + "step": 32244 + }, + { + "epoch": 0.8159779335475871, + "grad_norm": 3.5704643726348877, + "learning_rate": 8.286100338666675e-07, + "loss": 0.0946, + "step": 32245 + }, + { + "epoch": 0.8160032391122808, + "grad_norm": 5.451972007751465, + "learning_rate": 8.283886702655209e-07, + "loss": 0.1728, + "step": 32246 + }, + { + "epoch": 0.8160285446769745, + "grad_norm": 7.210794448852539, + "learning_rate": 8.281673335663188e-07, + "loss": 0.1745, + "step": 32247 + }, + { + "epoch": 0.8160538502416681, + "grad_norm": 8.130577087402344, + "learning_rate": 8.279460237704906e-07, + "loss": 0.104, + "step": 32248 + }, + { + "epoch": 0.8160791558063618, + "grad_norm": 6.760757923126221, + "learning_rate": 8.277247408794614e-07, + "loss": 0.1933, + "step": 32249 + }, + { + "epoch": 0.8161044613710555, + "grad_norm": 3.9093470573425293, + "learning_rate": 8.275034848946617e-07, + "loss": 0.1442, + "step": 32250 + }, + { + "epoch": 0.8161297669357491, + "grad_norm": 11.208239555358887, + "learning_rate": 8.272822558175131e-07, + "loss": 0.2102, + "step": 32251 + }, + { + "epoch": 0.8161550725004428, + "grad_norm": 6.914847373962402, + "learning_rate": 8.270610536494467e-07, + "loss": 0.2218, + "step": 32252 + }, + { + "epoch": 0.8161803780651365, + "grad_norm": 5.370234966278076, + "learning_rate": 8.268398783918857e-07, + "loss": 0.0783, + "step": 32253 + }, + { + "epoch": 0.8162056836298301, + "grad_norm": 4.334593772888184, + "learning_rate": 8.266187300462608e-07, + "loss": 0.1087, + "step": 32254 + }, + { + "epoch": 0.8162309891945239, + "grad_norm": 4.366912841796875, + "learning_rate": 8.263976086139924e-07, + "loss": 0.1644, + "step": 32255 + }, + { + "epoch": 0.8162562947592176, + "grad_norm": 6.351032257080078, + "learning_rate": 8.261765140965111e-07, + "loss": 0.1854, + "step": 32256 + }, + { + "epoch": 0.8162816003239113, + "grad_norm": 2.6395576000213623, + "learning_rate": 8.259554464952402e-07, + "loss": 0.1031, + "step": 32257 + }, + { + "epoch": 0.8163069058886049, + "grad_norm": 7.170285701751709, + "learning_rate": 8.257344058116068e-07, + "loss": 0.1548, + "step": 32258 + }, + { + "epoch": 0.8163322114532986, + "grad_norm": 5.780579090118408, + "learning_rate": 8.255133920470365e-07, + "loss": 0.1759, + "step": 32259 + }, + { + "epoch": 0.8163575170179923, + "grad_norm": 10.965954780578613, + "learning_rate": 8.252924052029532e-07, + "loss": 0.1238, + "step": 32260 + }, + { + "epoch": 0.8163828225826859, + "grad_norm": 5.88855504989624, + "learning_rate": 8.250714452807818e-07, + "loss": 0.1603, + "step": 32261 + }, + { + "epoch": 0.8164081281473796, + "grad_norm": 2.471021890640259, + "learning_rate": 8.248505122819489e-07, + "loss": 0.0821, + "step": 32262 + }, + { + "epoch": 0.8164334337120733, + "grad_norm": 2.631948471069336, + "learning_rate": 8.246296062078784e-07, + "loss": 0.0738, + "step": 32263 + }, + { + "epoch": 0.8164587392767669, + "grad_norm": 6.537906169891357, + "learning_rate": 8.244087270599948e-07, + "loss": 0.1442, + "step": 32264 + }, + { + "epoch": 0.8164840448414606, + "grad_norm": 7.443497657775879, + "learning_rate": 8.241878748397219e-07, + "loss": 0.2223, + "step": 32265 + }, + { + "epoch": 0.8165093504061544, + "grad_norm": 8.093385696411133, + "learning_rate": 8.239670495484853e-07, + "loss": 0.2209, + "step": 32266 + }, + { + "epoch": 0.816534655970848, + "grad_norm": 3.644644021987915, + "learning_rate": 8.237462511877086e-07, + "loss": 0.0845, + "step": 32267 + }, + { + "epoch": 0.8165599615355417, + "grad_norm": 3.370802402496338, + "learning_rate": 8.23525479758816e-07, + "loss": 0.1521, + "step": 32268 + }, + { + "epoch": 0.8165852671002354, + "grad_norm": 10.810738563537598, + "learning_rate": 8.2330473526323e-07, + "loss": 0.2007, + "step": 32269 + }, + { + "epoch": 0.816610572664929, + "grad_norm": 4.555289268493652, + "learning_rate": 8.230840177023742e-07, + "loss": 0.1631, + "step": 32270 + }, + { + "epoch": 0.8166358782296227, + "grad_norm": 4.577892303466797, + "learning_rate": 8.228633270776732e-07, + "loss": 0.1566, + "step": 32271 + }, + { + "epoch": 0.8166611837943164, + "grad_norm": 7.026400566101074, + "learning_rate": 8.226426633905499e-07, + "loss": 0.1833, + "step": 32272 + }, + { + "epoch": 0.81668648935901, + "grad_norm": 8.885687828063965, + "learning_rate": 8.224220266424271e-07, + "loss": 0.2132, + "step": 32273 + }, + { + "epoch": 0.8167117949237037, + "grad_norm": 15.602885246276855, + "learning_rate": 8.222014168347259e-07, + "loss": 0.2949, + "step": 32274 + }, + { + "epoch": 0.8167371004883974, + "grad_norm": 3.679546356201172, + "learning_rate": 8.219808339688722e-07, + "loss": 0.1946, + "step": 32275 + }, + { + "epoch": 0.816762406053091, + "grad_norm": 6.866323947906494, + "learning_rate": 8.217602780462869e-07, + "loss": 0.1315, + "step": 32276 + }, + { + "epoch": 0.8167877116177847, + "grad_norm": 5.193754196166992, + "learning_rate": 8.215397490683924e-07, + "loss": 0.1953, + "step": 32277 + }, + { + "epoch": 0.8168130171824785, + "grad_norm": 3.3718738555908203, + "learning_rate": 8.2131924703661e-07, + "loss": 0.1113, + "step": 32278 + }, + { + "epoch": 0.816838322747172, + "grad_norm": 8.144469261169434, + "learning_rate": 8.210987719523627e-07, + "loss": 0.1801, + "step": 32279 + }, + { + "epoch": 0.8168636283118658, + "grad_norm": 4.8080034255981445, + "learning_rate": 8.208783238170715e-07, + "loss": 0.141, + "step": 32280 + }, + { + "epoch": 0.8168889338765595, + "grad_norm": 4.14624547958374, + "learning_rate": 8.206579026321609e-07, + "loss": 0.1356, + "step": 32281 + }, + { + "epoch": 0.8169142394412531, + "grad_norm": 10.160313606262207, + "learning_rate": 8.204375083990479e-07, + "loss": 0.1622, + "step": 32282 + }, + { + "epoch": 0.8169395450059468, + "grad_norm": 4.94622802734375, + "learning_rate": 8.202171411191573e-07, + "loss": 0.1402, + "step": 32283 + }, + { + "epoch": 0.8169648505706405, + "grad_norm": 8.416333198547363, + "learning_rate": 8.199968007939069e-07, + "loss": 0.1411, + "step": 32284 + }, + { + "epoch": 0.8169901561353342, + "grad_norm": 4.397089958190918, + "learning_rate": 8.197764874247227e-07, + "loss": 0.1247, + "step": 32285 + }, + { + "epoch": 0.8170154617000278, + "grad_norm": 8.92953109741211, + "learning_rate": 8.195562010130198e-07, + "loss": 0.2745, + "step": 32286 + }, + { + "epoch": 0.8170407672647215, + "grad_norm": 4.011376857757568, + "learning_rate": 8.193359415602221e-07, + "loss": 0.116, + "step": 32287 + }, + { + "epoch": 0.8170660728294152, + "grad_norm": 5.315272808074951, + "learning_rate": 8.191157090677482e-07, + "loss": 0.2004, + "step": 32288 + }, + { + "epoch": 0.8170913783941088, + "grad_norm": 4.45121955871582, + "learning_rate": 8.188955035370211e-07, + "loss": 0.1171, + "step": 32289 + }, + { + "epoch": 0.8171166839588025, + "grad_norm": 3.5320239067077637, + "learning_rate": 8.186753249694585e-07, + "loss": 0.1595, + "step": 32290 + }, + { + "epoch": 0.8171419895234963, + "grad_norm": 3.7837114334106445, + "learning_rate": 8.184551733664814e-07, + "loss": 0.1403, + "step": 32291 + }, + { + "epoch": 0.8171672950881899, + "grad_norm": 3.403346538543701, + "learning_rate": 8.182350487295077e-07, + "loss": 0.1187, + "step": 32292 + }, + { + "epoch": 0.8171926006528836, + "grad_norm": 3.510093927383423, + "learning_rate": 8.180149510599594e-07, + "loss": 0.1269, + "step": 32293 + }, + { + "epoch": 0.8172179062175773, + "grad_norm": 4.192257881164551, + "learning_rate": 8.17794880359255e-07, + "loss": 0.1487, + "step": 32294 + }, + { + "epoch": 0.8172432117822709, + "grad_norm": 7.083704471588135, + "learning_rate": 8.175748366288133e-07, + "loss": 0.2586, + "step": 32295 + }, + { + "epoch": 0.8172685173469646, + "grad_norm": 5.5204668045043945, + "learning_rate": 8.173548198700526e-07, + "loss": 0.1903, + "step": 32296 + }, + { + "epoch": 0.8172938229116583, + "grad_norm": 6.990556240081787, + "learning_rate": 8.171348300843934e-07, + "loss": 0.1833, + "step": 32297 + }, + { + "epoch": 0.8173191284763519, + "grad_norm": 6.022512912750244, + "learning_rate": 8.169148672732536e-07, + "loss": 0.1767, + "step": 32298 + }, + { + "epoch": 0.8173444340410456, + "grad_norm": 2.281967878341675, + "learning_rate": 8.166949314380518e-07, + "loss": 0.0962, + "step": 32299 + }, + { + "epoch": 0.8173697396057393, + "grad_norm": 8.193862915039062, + "learning_rate": 8.164750225802065e-07, + "loss": 0.2297, + "step": 32300 + }, + { + "epoch": 0.8173950451704329, + "grad_norm": 8.1686372756958, + "learning_rate": 8.162551407011343e-07, + "loss": 0.2023, + "step": 32301 + }, + { + "epoch": 0.8174203507351266, + "grad_norm": 9.00067138671875, + "learning_rate": 8.160352858022558e-07, + "loss": 0.2191, + "step": 32302 + }, + { + "epoch": 0.8174456562998204, + "grad_norm": 7.188614368438721, + "learning_rate": 8.15815457884987e-07, + "loss": 0.1425, + "step": 32303 + }, + { + "epoch": 0.817470961864514, + "grad_norm": 4.449063301086426, + "learning_rate": 8.155956569507462e-07, + "loss": 0.1985, + "step": 32304 + }, + { + "epoch": 0.8174962674292077, + "grad_norm": 2.7703189849853516, + "learning_rate": 8.153758830009495e-07, + "loss": 0.1374, + "step": 32305 + }, + { + "epoch": 0.8175215729939014, + "grad_norm": 4.537900924682617, + "learning_rate": 8.151561360370164e-07, + "loss": 0.1837, + "step": 32306 + }, + { + "epoch": 0.817546878558595, + "grad_norm": 5.765334606170654, + "learning_rate": 8.149364160603629e-07, + "loss": 0.1296, + "step": 32307 + }, + { + "epoch": 0.8175721841232887, + "grad_norm": 3.6661453247070312, + "learning_rate": 8.147167230724063e-07, + "loss": 0.1393, + "step": 32308 + }, + { + "epoch": 0.8175974896879824, + "grad_norm": 4.209107398986816, + "learning_rate": 8.144970570745614e-07, + "loss": 0.1462, + "step": 32309 + }, + { + "epoch": 0.8176227952526761, + "grad_norm": 6.300745964050293, + "learning_rate": 8.142774180682483e-07, + "loss": 0.1023, + "step": 32310 + }, + { + "epoch": 0.8176481008173697, + "grad_norm": 3.142552137374878, + "learning_rate": 8.140578060548798e-07, + "loss": 0.1422, + "step": 32311 + }, + { + "epoch": 0.8176734063820634, + "grad_norm": 5.1930460929870605, + "learning_rate": 8.138382210358764e-07, + "loss": 0.1291, + "step": 32312 + }, + { + "epoch": 0.8176987119467571, + "grad_norm": 11.20733642578125, + "learning_rate": 8.136186630126491e-07, + "loss": 0.137, + "step": 32313 + }, + { + "epoch": 0.8177240175114507, + "grad_norm": 3.1358842849731445, + "learning_rate": 8.133991319866175e-07, + "loss": 0.1276, + "step": 32314 + }, + { + "epoch": 0.8177493230761445, + "grad_norm": 3.1901090145111084, + "learning_rate": 8.131796279591953e-07, + "loss": 0.0934, + "step": 32315 + }, + { + "epoch": 0.8177746286408382, + "grad_norm": 9.4716157913208, + "learning_rate": 8.129601509318008e-07, + "loss": 0.1702, + "step": 32316 + }, + { + "epoch": 0.8177999342055318, + "grad_norm": 5.010025501251221, + "learning_rate": 8.127407009058452e-07, + "loss": 0.1432, + "step": 32317 + }, + { + "epoch": 0.8178252397702255, + "grad_norm": 5.515341758728027, + "learning_rate": 8.125212778827474e-07, + "loss": 0.157, + "step": 32318 + }, + { + "epoch": 0.8178505453349192, + "grad_norm": 3.270487070083618, + "learning_rate": 8.123018818639195e-07, + "loss": 0.1281, + "step": 32319 + }, + { + "epoch": 0.8178758508996128, + "grad_norm": 7.6403889656066895, + "learning_rate": 8.120825128507786e-07, + "loss": 0.1823, + "step": 32320 + }, + { + "epoch": 0.8179011564643065, + "grad_norm": 7.9246063232421875, + "learning_rate": 8.118631708447389e-07, + "loss": 0.2563, + "step": 32321 + }, + { + "epoch": 0.8179264620290002, + "grad_norm": 5.365675926208496, + "learning_rate": 8.116438558472151e-07, + "loss": 0.1949, + "step": 32322 + }, + { + "epoch": 0.8179517675936938, + "grad_norm": 4.227482318878174, + "learning_rate": 8.114245678596188e-07, + "loss": 0.1949, + "step": 32323 + }, + { + "epoch": 0.8179770731583875, + "grad_norm": 7.166293621063232, + "learning_rate": 8.112053068833681e-07, + "loss": 0.1593, + "step": 32324 + }, + { + "epoch": 0.8180023787230812, + "grad_norm": 4.883143901824951, + "learning_rate": 8.109860729198754e-07, + "loss": 0.1751, + "step": 32325 + }, + { + "epoch": 0.8180276842877748, + "grad_norm": 3.7220046520233154, + "learning_rate": 8.107668659705537e-07, + "loss": 0.1089, + "step": 32326 + }, + { + "epoch": 0.8180529898524685, + "grad_norm": 9.827070236206055, + "learning_rate": 8.105476860368178e-07, + "loss": 0.252, + "step": 32327 + }, + { + "epoch": 0.8180782954171623, + "grad_norm": 4.293376445770264, + "learning_rate": 8.103285331200788e-07, + "loss": 0.112, + "step": 32328 + }, + { + "epoch": 0.8181036009818559, + "grad_norm": 7.89233922958374, + "learning_rate": 8.101094072217536e-07, + "loss": 0.2134, + "step": 32329 + }, + { + "epoch": 0.8181289065465496, + "grad_norm": 3.938710927963257, + "learning_rate": 8.09890308343253e-07, + "loss": 0.1314, + "step": 32330 + }, + { + "epoch": 0.8181542121112433, + "grad_norm": 9.075867652893066, + "learning_rate": 8.096712364859905e-07, + "loss": 0.1086, + "step": 32331 + }, + { + "epoch": 0.8181795176759369, + "grad_norm": 4.804733753204346, + "learning_rate": 8.094521916513776e-07, + "loss": 0.1755, + "step": 32332 + }, + { + "epoch": 0.8182048232406306, + "grad_norm": 12.126521110534668, + "learning_rate": 8.092331738408294e-07, + "loss": 0.209, + "step": 32333 + }, + { + "epoch": 0.8182301288053243, + "grad_norm": 5.450274467468262, + "learning_rate": 8.090141830557563e-07, + "loss": 0.1838, + "step": 32334 + }, + { + "epoch": 0.818255434370018, + "grad_norm": 5.991183280944824, + "learning_rate": 8.087952192975718e-07, + "loss": 0.1324, + "step": 32335 + }, + { + "epoch": 0.8182807399347116, + "grad_norm": 15.872820854187012, + "learning_rate": 8.085762825676857e-07, + "loss": 0.1729, + "step": 32336 + }, + { + "epoch": 0.8183060454994053, + "grad_norm": 5.240220546722412, + "learning_rate": 8.083573728675126e-07, + "loss": 0.1455, + "step": 32337 + }, + { + "epoch": 0.818331351064099, + "grad_norm": 9.161044120788574, + "learning_rate": 8.081384901984634e-07, + "loss": 0.1981, + "step": 32338 + }, + { + "epoch": 0.8183566566287926, + "grad_norm": 5.6316118240356445, + "learning_rate": 8.079196345619494e-07, + "loss": 0.1591, + "step": 32339 + }, + { + "epoch": 0.8183819621934864, + "grad_norm": 3.8313703536987305, + "learning_rate": 8.077008059593805e-07, + "loss": 0.0892, + "step": 32340 + }, + { + "epoch": 0.8184072677581801, + "grad_norm": 4.172909259796143, + "learning_rate": 8.074820043921705e-07, + "loss": 0.1427, + "step": 32341 + }, + { + "epoch": 0.8184325733228737, + "grad_norm": 3.739047050476074, + "learning_rate": 8.072632298617283e-07, + "loss": 0.1725, + "step": 32342 + }, + { + "epoch": 0.8184578788875674, + "grad_norm": 7.45034646987915, + "learning_rate": 8.070444823694673e-07, + "loss": 0.1732, + "step": 32343 + }, + { + "epoch": 0.8184831844522611, + "grad_norm": 7.398759841918945, + "learning_rate": 8.068257619167946e-07, + "loss": 0.2196, + "step": 32344 + }, + { + "epoch": 0.8185084900169547, + "grad_norm": 9.679229736328125, + "learning_rate": 8.066070685051236e-07, + "loss": 0.1772, + "step": 32345 + }, + { + "epoch": 0.8185337955816484, + "grad_norm": 3.1719253063201904, + "learning_rate": 8.063884021358626e-07, + "loss": 0.0892, + "step": 32346 + }, + { + "epoch": 0.8185591011463421, + "grad_norm": 4.581364154815674, + "learning_rate": 8.061697628104248e-07, + "loss": 0.1786, + "step": 32347 + }, + { + "epoch": 0.8185844067110357, + "grad_norm": 2.9239895343780518, + "learning_rate": 8.059511505302159e-07, + "loss": 0.0623, + "step": 32348 + }, + { + "epoch": 0.8186097122757294, + "grad_norm": 11.80958080291748, + "learning_rate": 8.057325652966491e-07, + "loss": 0.2701, + "step": 32349 + }, + { + "epoch": 0.8186350178404231, + "grad_norm": 4.493706226348877, + "learning_rate": 8.055140071111317e-07, + "loss": 0.1398, + "step": 32350 + }, + { + "epoch": 0.8186603234051167, + "grad_norm": 6.020735263824463, + "learning_rate": 8.052954759750764e-07, + "loss": 0.1639, + "step": 32351 + }, + { + "epoch": 0.8186856289698105, + "grad_norm": 7.246132850646973, + "learning_rate": 8.050769718898882e-07, + "loss": 0.2329, + "step": 32352 + }, + { + "epoch": 0.8187109345345042, + "grad_norm": 4.594481468200684, + "learning_rate": 8.048584948569793e-07, + "loss": 0.1058, + "step": 32353 + }, + { + "epoch": 0.8187362400991978, + "grad_norm": 6.841049671173096, + "learning_rate": 8.046400448777575e-07, + "loss": 0.1097, + "step": 32354 + }, + { + "epoch": 0.8187615456638915, + "grad_norm": 2.884758949279785, + "learning_rate": 8.044216219536305e-07, + "loss": 0.1466, + "step": 32355 + }, + { + "epoch": 0.8187868512285852, + "grad_norm": 8.247381210327148, + "learning_rate": 8.042032260860094e-07, + "loss": 0.1834, + "step": 32356 + }, + { + "epoch": 0.8188121567932788, + "grad_norm": 3.610499382019043, + "learning_rate": 8.03984857276301e-07, + "loss": 0.1396, + "step": 32357 + }, + { + "epoch": 0.8188374623579725, + "grad_norm": 4.355157375335693, + "learning_rate": 8.037665155259139e-07, + "loss": 0.1808, + "step": 32358 + }, + { + "epoch": 0.8188627679226662, + "grad_norm": 5.323948860168457, + "learning_rate": 8.035482008362544e-07, + "loss": 0.12, + "step": 32359 + }, + { + "epoch": 0.8188880734873599, + "grad_norm": 12.601491928100586, + "learning_rate": 8.033299132087336e-07, + "loss": 0.3449, + "step": 32360 + }, + { + "epoch": 0.8189133790520535, + "grad_norm": 3.0963432788848877, + "learning_rate": 8.031116526447575e-07, + "loss": 0.1502, + "step": 32361 + }, + { + "epoch": 0.8189386846167472, + "grad_norm": 3.9674298763275146, + "learning_rate": 8.028934191457333e-07, + "loss": 0.166, + "step": 32362 + }, + { + "epoch": 0.818963990181441, + "grad_norm": 5.253147602081299, + "learning_rate": 8.026752127130677e-07, + "loss": 0.1024, + "step": 32363 + }, + { + "epoch": 0.8189892957461345, + "grad_norm": 6.229822635650635, + "learning_rate": 8.024570333481702e-07, + "loss": 0.1831, + "step": 32364 + }, + { + "epoch": 0.8190146013108283, + "grad_norm": 3.4123589992523193, + "learning_rate": 8.022388810524467e-07, + "loss": 0.176, + "step": 32365 + }, + { + "epoch": 0.819039906875522, + "grad_norm": 15.671483993530273, + "learning_rate": 8.020207558273036e-07, + "loss": 0.2146, + "step": 32366 + }, + { + "epoch": 0.8190652124402156, + "grad_norm": 7.221224308013916, + "learning_rate": 8.018026576741466e-07, + "loss": 0.1696, + "step": 32367 + }, + { + "epoch": 0.8190905180049093, + "grad_norm": 4.544060230255127, + "learning_rate": 8.015845865943845e-07, + "loss": 0.2041, + "step": 32368 + }, + { + "epoch": 0.819115823569603, + "grad_norm": 5.281745433807373, + "learning_rate": 8.013665425894224e-07, + "loss": 0.2096, + "step": 32369 + }, + { + "epoch": 0.8191411291342966, + "grad_norm": 3.3476099967956543, + "learning_rate": 8.011485256606666e-07, + "loss": 0.1069, + "step": 32370 + }, + { + "epoch": 0.8191664346989903, + "grad_norm": 7.7058000564575195, + "learning_rate": 8.009305358095215e-07, + "loss": 0.1944, + "step": 32371 + }, + { + "epoch": 0.819191740263684, + "grad_norm": 10.385209083557129, + "learning_rate": 8.007125730373954e-07, + "loss": 0.1894, + "step": 32372 + }, + { + "epoch": 0.8192170458283776, + "grad_norm": 3.8463072776794434, + "learning_rate": 8.004946373456929e-07, + "loss": 0.1079, + "step": 32373 + }, + { + "epoch": 0.8192423513930713, + "grad_norm": 5.070578098297119, + "learning_rate": 8.002767287358193e-07, + "loss": 0.1392, + "step": 32374 + }, + { + "epoch": 0.819267656957765, + "grad_norm": 4.911562442779541, + "learning_rate": 8.000588472091791e-07, + "loss": 0.1805, + "step": 32375 + }, + { + "epoch": 0.8192929625224586, + "grad_norm": 10.017623901367188, + "learning_rate": 7.998409927671785e-07, + "loss": 0.2133, + "step": 32376 + }, + { + "epoch": 0.8193182680871524, + "grad_norm": 2.371504068374634, + "learning_rate": 7.996231654112219e-07, + "loss": 0.0424, + "step": 32377 + }, + { + "epoch": 0.8193435736518461, + "grad_norm": 6.1025919914245605, + "learning_rate": 7.994053651427158e-07, + "loss": 0.201, + "step": 32378 + }, + { + "epoch": 0.8193688792165397, + "grad_norm": 5.679238319396973, + "learning_rate": 7.99187591963061e-07, + "loss": 0.2472, + "step": 32379 + }, + { + "epoch": 0.8193941847812334, + "grad_norm": 5.8455610275268555, + "learning_rate": 7.98969845873665e-07, + "loss": 0.1998, + "step": 32380 + }, + { + "epoch": 0.8194194903459271, + "grad_norm": 2.3944613933563232, + "learning_rate": 7.987521268759302e-07, + "loss": 0.0949, + "step": 32381 + }, + { + "epoch": 0.8194447959106207, + "grad_norm": 3.5440878868103027, + "learning_rate": 7.985344349712632e-07, + "loss": 0.1912, + "step": 32382 + }, + { + "epoch": 0.8194701014753144, + "grad_norm": 5.757390975952148, + "learning_rate": 7.983167701610645e-07, + "loss": 0.1588, + "step": 32383 + }, + { + "epoch": 0.8194954070400081, + "grad_norm": 3.9217941761016846, + "learning_rate": 7.980991324467402e-07, + "loss": 0.094, + "step": 32384 + }, + { + "epoch": 0.8195207126047018, + "grad_norm": 7.863511562347412, + "learning_rate": 7.978815218296926e-07, + "loss": 0.1317, + "step": 32385 + }, + { + "epoch": 0.8195460181693954, + "grad_norm": 5.1298346519470215, + "learning_rate": 7.976639383113249e-07, + "loss": 0.0941, + "step": 32386 + }, + { + "epoch": 0.8195713237340891, + "grad_norm": 6.043312072753906, + "learning_rate": 7.974463818930416e-07, + "loss": 0.1645, + "step": 32387 + }, + { + "epoch": 0.8195966292987829, + "grad_norm": 4.088781833648682, + "learning_rate": 7.97228852576245e-07, + "loss": 0.1471, + "step": 32388 + }, + { + "epoch": 0.8196219348634765, + "grad_norm": 1.7414227724075317, + "learning_rate": 7.970113503623372e-07, + "loss": 0.0758, + "step": 32389 + }, + { + "epoch": 0.8196472404281702, + "grad_norm": 5.361381530761719, + "learning_rate": 7.96793875252721e-07, + "loss": 0.1894, + "step": 32390 + }, + { + "epoch": 0.8196725459928639, + "grad_norm": 3.455432415008545, + "learning_rate": 7.965764272488014e-07, + "loss": 0.1016, + "step": 32391 + }, + { + "epoch": 0.8196978515575575, + "grad_norm": 5.839425563812256, + "learning_rate": 7.963590063519755e-07, + "loss": 0.1347, + "step": 32392 + }, + { + "epoch": 0.8197231571222512, + "grad_norm": 6.794301986694336, + "learning_rate": 7.961416125636501e-07, + "loss": 0.1409, + "step": 32393 + }, + { + "epoch": 0.8197484626869449, + "grad_norm": 2.7816641330718994, + "learning_rate": 7.959242458852245e-07, + "loss": 0.1225, + "step": 32394 + }, + { + "epoch": 0.8197737682516385, + "grad_norm": 2.3172829151153564, + "learning_rate": 7.957069063181022e-07, + "loss": 0.1163, + "step": 32395 + }, + { + "epoch": 0.8197990738163322, + "grad_norm": 3.217822790145874, + "learning_rate": 7.954895938636842e-07, + "loss": 0.0854, + "step": 32396 + }, + { + "epoch": 0.8198243793810259, + "grad_norm": 9.703155517578125, + "learning_rate": 7.952723085233721e-07, + "loss": 0.1379, + "step": 32397 + }, + { + "epoch": 0.8198496849457195, + "grad_norm": 6.428854942321777, + "learning_rate": 7.950550502985648e-07, + "loss": 0.1206, + "step": 32398 + }, + { + "epoch": 0.8198749905104132, + "grad_norm": 3.4363443851470947, + "learning_rate": 7.94837819190667e-07, + "loss": 0.0975, + "step": 32399 + }, + { + "epoch": 0.819900296075107, + "grad_norm": 9.302190780639648, + "learning_rate": 7.946206152010777e-07, + "loss": 0.1543, + "step": 32400 + }, + { + "epoch": 0.8199256016398005, + "grad_norm": 9.900079727172852, + "learning_rate": 7.944034383311983e-07, + "loss": 0.1321, + "step": 32401 + }, + { + "epoch": 0.8199509072044943, + "grad_norm": 4.389939308166504, + "learning_rate": 7.941862885824269e-07, + "loss": 0.1192, + "step": 32402 + }, + { + "epoch": 0.819976212769188, + "grad_norm": 7.510507583618164, + "learning_rate": 7.939691659561671e-07, + "loss": 0.1745, + "step": 32403 + }, + { + "epoch": 0.8200015183338816, + "grad_norm": 7.524941444396973, + "learning_rate": 7.937520704538181e-07, + "loss": 0.1368, + "step": 32404 + }, + { + "epoch": 0.8200268238985753, + "grad_norm": 18.704090118408203, + "learning_rate": 7.935350020767795e-07, + "loss": 0.3448, + "step": 32405 + }, + { + "epoch": 0.820052129463269, + "grad_norm": 5.079885482788086, + "learning_rate": 7.933179608264496e-07, + "loss": 0.0801, + "step": 32406 + }, + { + "epoch": 0.8200774350279626, + "grad_norm": 5.552639007568359, + "learning_rate": 7.931009467042312e-07, + "loss": 0.1921, + "step": 32407 + }, + { + "epoch": 0.8201027405926563, + "grad_norm": 2.6583120822906494, + "learning_rate": 7.928839597115212e-07, + "loss": 0.0699, + "step": 32408 + }, + { + "epoch": 0.82012804615735, + "grad_norm": 4.287960529327393, + "learning_rate": 7.926669998497222e-07, + "loss": 0.0919, + "step": 32409 + }, + { + "epoch": 0.8201533517220436, + "grad_norm": 4.274679660797119, + "learning_rate": 7.924500671202284e-07, + "loss": 0.1468, + "step": 32410 + }, + { + "epoch": 0.8201786572867373, + "grad_norm": 7.4334869384765625, + "learning_rate": 7.922331615244433e-07, + "loss": 0.2114, + "step": 32411 + }, + { + "epoch": 0.820203962851431, + "grad_norm": 7.748987674713135, + "learning_rate": 7.920162830637635e-07, + "loss": 0.1269, + "step": 32412 + }, + { + "epoch": 0.8202292684161248, + "grad_norm": 5.685537815093994, + "learning_rate": 7.917994317395877e-07, + "loss": 0.2022, + "step": 32413 + }, + { + "epoch": 0.8202545739808184, + "grad_norm": 4.771835803985596, + "learning_rate": 7.915826075533133e-07, + "loss": 0.1563, + "step": 32414 + }, + { + "epoch": 0.8202798795455121, + "grad_norm": 4.349499225616455, + "learning_rate": 7.913658105063415e-07, + "loss": 0.1946, + "step": 32415 + }, + { + "epoch": 0.8203051851102058, + "grad_norm": 2.728175163269043, + "learning_rate": 7.911490406000688e-07, + "loss": 0.1019, + "step": 32416 + }, + { + "epoch": 0.8203304906748994, + "grad_norm": 2.7233848571777344, + "learning_rate": 7.909322978358914e-07, + "loss": 0.1224, + "step": 32417 + }, + { + "epoch": 0.8203557962395931, + "grad_norm": 5.604745388031006, + "learning_rate": 7.907155822152102e-07, + "loss": 0.1184, + "step": 32418 + }, + { + "epoch": 0.8203811018042868, + "grad_norm": 10.366060256958008, + "learning_rate": 7.904988937394209e-07, + "loss": 0.187, + "step": 32419 + }, + { + "epoch": 0.8204064073689804, + "grad_norm": 4.307159423828125, + "learning_rate": 7.902822324099213e-07, + "loss": 0.1259, + "step": 32420 + }, + { + "epoch": 0.8204317129336741, + "grad_norm": 5.365438938140869, + "learning_rate": 7.900655982281075e-07, + "loss": 0.1795, + "step": 32421 + }, + { + "epoch": 0.8204570184983678, + "grad_norm": 15.135173797607422, + "learning_rate": 7.8984899119538e-07, + "loss": 0.1166, + "step": 32422 + }, + { + "epoch": 0.8204823240630614, + "grad_norm": 9.771246910095215, + "learning_rate": 7.896324113131304e-07, + "loss": 0.1366, + "step": 32423 + }, + { + "epoch": 0.8205076296277551, + "grad_norm": 3.0130438804626465, + "learning_rate": 7.894158585827599e-07, + "loss": 0.1187, + "step": 32424 + }, + { + "epoch": 0.8205329351924489, + "grad_norm": 5.546752452850342, + "learning_rate": 7.891993330056619e-07, + "loss": 0.1811, + "step": 32425 + }, + { + "epoch": 0.8205582407571425, + "grad_norm": 2.03291916847229, + "learning_rate": 7.889828345832363e-07, + "loss": 0.0631, + "step": 32426 + }, + { + "epoch": 0.8205835463218362, + "grad_norm": 7.133358955383301, + "learning_rate": 7.887663633168746e-07, + "loss": 0.1929, + "step": 32427 + }, + { + "epoch": 0.8206088518865299, + "grad_norm": 2.160539150238037, + "learning_rate": 7.885499192079771e-07, + "loss": 0.105, + "step": 32428 + }, + { + "epoch": 0.8206341574512235, + "grad_norm": 5.138381481170654, + "learning_rate": 7.883335022579364e-07, + "loss": 0.1641, + "step": 32429 + }, + { + "epoch": 0.8206594630159172, + "grad_norm": 6.775759696960449, + "learning_rate": 7.881171124681503e-07, + "loss": 0.0948, + "step": 32430 + }, + { + "epoch": 0.8206847685806109, + "grad_norm": 3.990429639816284, + "learning_rate": 7.879007498400138e-07, + "loss": 0.1486, + "step": 32431 + }, + { + "epoch": 0.8207100741453045, + "grad_norm": 6.713110446929932, + "learning_rate": 7.876844143749219e-07, + "loss": 0.2302, + "step": 32432 + }, + { + "epoch": 0.8207353797099982, + "grad_norm": 5.403286457061768, + "learning_rate": 7.874681060742679e-07, + "loss": 0.1775, + "step": 32433 + }, + { + "epoch": 0.8207606852746919, + "grad_norm": 15.899718284606934, + "learning_rate": 7.872518249394501e-07, + "loss": 0.2036, + "step": 32434 + }, + { + "epoch": 0.8207859908393855, + "grad_norm": 3.816443920135498, + "learning_rate": 7.870355709718619e-07, + "loss": 0.147, + "step": 32435 + }, + { + "epoch": 0.8208112964040792, + "grad_norm": 3.127531051635742, + "learning_rate": 7.868193441728972e-07, + "loss": 0.0886, + "step": 32436 + }, + { + "epoch": 0.820836601968773, + "grad_norm": 5.482348442077637, + "learning_rate": 7.866031445439498e-07, + "loss": 0.1584, + "step": 32437 + }, + { + "epoch": 0.8208619075334667, + "grad_norm": 6.367761611938477, + "learning_rate": 7.863869720864165e-07, + "loss": 0.18, + "step": 32438 + }, + { + "epoch": 0.8208872130981603, + "grad_norm": 4.2718682289123535, + "learning_rate": 7.861708268016893e-07, + "loss": 0.0913, + "step": 32439 + }, + { + "epoch": 0.820912518662854, + "grad_norm": 4.93881893157959, + "learning_rate": 7.859547086911629e-07, + "loss": 0.0963, + "step": 32440 + }, + { + "epoch": 0.8209378242275477, + "grad_norm": 4.750095844268799, + "learning_rate": 7.857386177562293e-07, + "loss": 0.2015, + "step": 32441 + }, + { + "epoch": 0.8209631297922413, + "grad_norm": 9.44472599029541, + "learning_rate": 7.855225539982847e-07, + "loss": 0.2252, + "step": 32442 + }, + { + "epoch": 0.820988435356935, + "grad_norm": 4.504540920257568, + "learning_rate": 7.853065174187214e-07, + "loss": 0.1208, + "step": 32443 + }, + { + "epoch": 0.8210137409216287, + "grad_norm": 3.9881458282470703, + "learning_rate": 7.850905080189324e-07, + "loss": 0.1753, + "step": 32444 + }, + { + "epoch": 0.8210390464863223, + "grad_norm": 4.341951847076416, + "learning_rate": 7.848745258003094e-07, + "loss": 0.139, + "step": 32445 + }, + { + "epoch": 0.821064352051016, + "grad_norm": 14.711532592773438, + "learning_rate": 7.846585707642474e-07, + "loss": 0.2666, + "step": 32446 + }, + { + "epoch": 0.8210896576157097, + "grad_norm": 4.077212810516357, + "learning_rate": 7.844426429121382e-07, + "loss": 0.1322, + "step": 32447 + }, + { + "epoch": 0.8211149631804033, + "grad_norm": 13.48660945892334, + "learning_rate": 7.842267422453737e-07, + "loss": 0.2216, + "step": 32448 + }, + { + "epoch": 0.821140268745097, + "grad_norm": 4.88023042678833, + "learning_rate": 7.840108687653485e-07, + "loss": 0.1679, + "step": 32449 + }, + { + "epoch": 0.8211655743097908, + "grad_norm": 4.1996612548828125, + "learning_rate": 7.837950224734508e-07, + "loss": 0.1589, + "step": 32450 + }, + { + "epoch": 0.8211908798744844, + "grad_norm": 9.595829963684082, + "learning_rate": 7.83579203371076e-07, + "loss": 0.2088, + "step": 32451 + }, + { + "epoch": 0.8212161854391781, + "grad_norm": 4.819659233093262, + "learning_rate": 7.833634114596133e-07, + "loss": 0.1408, + "step": 32452 + }, + { + "epoch": 0.8212414910038718, + "grad_norm": 6.289936542510986, + "learning_rate": 7.831476467404581e-07, + "loss": 0.1125, + "step": 32453 + }, + { + "epoch": 0.8212667965685654, + "grad_norm": 3.100085735321045, + "learning_rate": 7.829319092149967e-07, + "loss": 0.1058, + "step": 32454 + }, + { + "epoch": 0.8212921021332591, + "grad_norm": 4.577432632446289, + "learning_rate": 7.827161988846243e-07, + "loss": 0.1595, + "step": 32455 + }, + { + "epoch": 0.8213174076979528, + "grad_norm": 3.802185535430908, + "learning_rate": 7.825005157507293e-07, + "loss": 0.146, + "step": 32456 + }, + { + "epoch": 0.8213427132626464, + "grad_norm": 6.367279529571533, + "learning_rate": 7.822848598147065e-07, + "loss": 0.1627, + "step": 32457 + }, + { + "epoch": 0.8213680188273401, + "grad_norm": 5.598029613494873, + "learning_rate": 7.82069231077942e-07, + "loss": 0.2013, + "step": 32458 + }, + { + "epoch": 0.8213933243920338, + "grad_norm": 5.372909069061279, + "learning_rate": 7.818536295418294e-07, + "loss": 0.2007, + "step": 32459 + }, + { + "epoch": 0.8214186299567274, + "grad_norm": 5.589481353759766, + "learning_rate": 7.816380552077568e-07, + "loss": 0.173, + "step": 32460 + }, + { + "epoch": 0.8214439355214211, + "grad_norm": 4.307451248168945, + "learning_rate": 7.81422508077117e-07, + "loss": 0.1759, + "step": 32461 + }, + { + "epoch": 0.8214692410861149, + "grad_norm": 5.257175445556641, + "learning_rate": 7.812069881512985e-07, + "loss": 0.1904, + "step": 32462 + }, + { + "epoch": 0.8214945466508086, + "grad_norm": 3.4499599933624268, + "learning_rate": 7.809914954316916e-07, + "loss": 0.0933, + "step": 32463 + }, + { + "epoch": 0.8215198522155022, + "grad_norm": 7.075733661651611, + "learning_rate": 7.807760299196843e-07, + "loss": 0.2728, + "step": 32464 + }, + { + "epoch": 0.8215451577801959, + "grad_norm": 21.852840423583984, + "learning_rate": 7.805605916166692e-07, + "loss": 0.1702, + "step": 32465 + }, + { + "epoch": 0.8215704633448896, + "grad_norm": 3.845308780670166, + "learning_rate": 7.803451805240336e-07, + "loss": 0.1043, + "step": 32466 + }, + { + "epoch": 0.8215957689095832, + "grad_norm": 6.0873870849609375, + "learning_rate": 7.80129796643167e-07, + "loss": 0.2315, + "step": 32467 + }, + { + "epoch": 0.8216210744742769, + "grad_norm": 5.083275318145752, + "learning_rate": 7.799144399754577e-07, + "loss": 0.1647, + "step": 32468 + }, + { + "epoch": 0.8216463800389706, + "grad_norm": 3.0337626934051514, + "learning_rate": 7.796991105222962e-07, + "loss": 0.1515, + "step": 32469 + }, + { + "epoch": 0.8216716856036642, + "grad_norm": 6.989724159240723, + "learning_rate": 7.794838082850697e-07, + "loss": 0.1481, + "step": 32470 + }, + { + "epoch": 0.8216969911683579, + "grad_norm": 4.764156341552734, + "learning_rate": 7.792685332651673e-07, + "loss": 0.1492, + "step": 32471 + }, + { + "epoch": 0.8217222967330516, + "grad_norm": 4.016089916229248, + "learning_rate": 7.79053285463976e-07, + "loss": 0.1345, + "step": 32472 + }, + { + "epoch": 0.8217476022977452, + "grad_norm": 3.173069477081299, + "learning_rate": 7.788380648828858e-07, + "loss": 0.1573, + "step": 32473 + }, + { + "epoch": 0.821772907862439, + "grad_norm": 3.047492027282715, + "learning_rate": 7.786228715232835e-07, + "loss": 0.1347, + "step": 32474 + }, + { + "epoch": 0.8217982134271327, + "grad_norm": 7.490676403045654, + "learning_rate": 7.784077053865574e-07, + "loss": 0.1386, + "step": 32475 + }, + { + "epoch": 0.8218235189918263, + "grad_norm": 3.572819232940674, + "learning_rate": 7.781925664740947e-07, + "loss": 0.1732, + "step": 32476 + }, + { + "epoch": 0.82184882455652, + "grad_norm": 9.064040184020996, + "learning_rate": 7.779774547872815e-07, + "loss": 0.1649, + "step": 32477 + }, + { + "epoch": 0.8218741301212137, + "grad_norm": 5.73020601272583, + "learning_rate": 7.777623703275078e-07, + "loss": 0.1636, + "step": 32478 + }, + { + "epoch": 0.8218994356859073, + "grad_norm": 3.621027708053589, + "learning_rate": 7.775473130961592e-07, + "loss": 0.1371, + "step": 32479 + }, + { + "epoch": 0.821924741250601, + "grad_norm": 7.924816608428955, + "learning_rate": 7.773322830946223e-07, + "loss": 0.1988, + "step": 32480 + }, + { + "epoch": 0.8219500468152947, + "grad_norm": 3.1423726081848145, + "learning_rate": 7.771172803242827e-07, + "loss": 0.1546, + "step": 32481 + }, + { + "epoch": 0.8219753523799883, + "grad_norm": 5.729771614074707, + "learning_rate": 7.769023047865298e-07, + "loss": 0.2217, + "step": 32482 + }, + { + "epoch": 0.822000657944682, + "grad_norm": 4.506470680236816, + "learning_rate": 7.76687356482747e-07, + "loss": 0.1937, + "step": 32483 + }, + { + "epoch": 0.8220259635093757, + "grad_norm": 3.606137275695801, + "learning_rate": 7.764724354143244e-07, + "loss": 0.099, + "step": 32484 + }, + { + "epoch": 0.8220512690740693, + "grad_norm": 8.671111106872559, + "learning_rate": 7.762575415826429e-07, + "loss": 0.2427, + "step": 32485 + }, + { + "epoch": 0.822076574638763, + "grad_norm": 8.870783805847168, + "learning_rate": 7.76042674989092e-07, + "loss": 0.2395, + "step": 32486 + }, + { + "epoch": 0.8221018802034568, + "grad_norm": 3.0094921588897705, + "learning_rate": 7.758278356350551e-07, + "loss": 0.1178, + "step": 32487 + }, + { + "epoch": 0.8221271857681505, + "grad_norm": 8.34299373626709, + "learning_rate": 7.756130235219211e-07, + "loss": 0.1591, + "step": 32488 + }, + { + "epoch": 0.8221524913328441, + "grad_norm": 2.7443675994873047, + "learning_rate": 7.753982386510706e-07, + "loss": 0.095, + "step": 32489 + }, + { + "epoch": 0.8221777968975378, + "grad_norm": 4.0306525230407715, + "learning_rate": 7.751834810238917e-07, + "loss": 0.1704, + "step": 32490 + }, + { + "epoch": 0.8222031024622315, + "grad_norm": 7.887998580932617, + "learning_rate": 7.74968750641768e-07, + "loss": 0.2445, + "step": 32491 + }, + { + "epoch": 0.8222284080269251, + "grad_norm": 11.885265350341797, + "learning_rate": 7.74754047506086e-07, + "loss": 0.193, + "step": 32492 + }, + { + "epoch": 0.8222537135916188, + "grad_norm": 5.469719886779785, + "learning_rate": 7.745393716182293e-07, + "loss": 0.202, + "step": 32493 + }, + { + "epoch": 0.8222790191563125, + "grad_norm": 3.2865872383117676, + "learning_rate": 7.743247229795819e-07, + "loss": 0.0946, + "step": 32494 + }, + { + "epoch": 0.8223043247210061, + "grad_norm": 6.518470287322998, + "learning_rate": 7.74110101591527e-07, + "loss": 0.1365, + "step": 32495 + }, + { + "epoch": 0.8223296302856998, + "grad_norm": 5.295617580413818, + "learning_rate": 7.738955074554516e-07, + "loss": 0.2378, + "step": 32496 + }, + { + "epoch": 0.8223549358503935, + "grad_norm": 3.666165828704834, + "learning_rate": 7.736809405727374e-07, + "loss": 0.1451, + "step": 32497 + }, + { + "epoch": 0.8223802414150871, + "grad_norm": 11.351486206054688, + "learning_rate": 7.734664009447695e-07, + "loss": 0.2899, + "step": 32498 + }, + { + "epoch": 0.8224055469797809, + "grad_norm": 8.16349983215332, + "learning_rate": 7.73251888572929e-07, + "loss": 0.1956, + "step": 32499 + }, + { + "epoch": 0.8224308525444746, + "grad_norm": 5.998510360717773, + "learning_rate": 7.73037403458602e-07, + "loss": 0.152, + "step": 32500 + }, + { + "epoch": 0.8224561581091682, + "grad_norm": 2.6593358516693115, + "learning_rate": 7.728229456031705e-07, + "loss": 0.1098, + "step": 32501 + }, + { + "epoch": 0.8224814636738619, + "grad_norm": 6.018312931060791, + "learning_rate": 7.726085150080171e-07, + "loss": 0.132, + "step": 32502 + }, + { + "epoch": 0.8225067692385556, + "grad_norm": 4.215897083282471, + "learning_rate": 7.723941116745242e-07, + "loss": 0.1506, + "step": 32503 + }, + { + "epoch": 0.8225320748032492, + "grad_norm": 3.300595283508301, + "learning_rate": 7.721797356040766e-07, + "loss": 0.1235, + "step": 32504 + }, + { + "epoch": 0.8225573803679429, + "grad_norm": 5.8635053634643555, + "learning_rate": 7.719653867980554e-07, + "loss": 0.1336, + "step": 32505 + }, + { + "epoch": 0.8225826859326366, + "grad_norm": 5.925957202911377, + "learning_rate": 7.717510652578431e-07, + "loss": 0.2163, + "step": 32506 + }, + { + "epoch": 0.8226079914973302, + "grad_norm": 8.265605926513672, + "learning_rate": 7.715367709848215e-07, + "loss": 0.2193, + "step": 32507 + }, + { + "epoch": 0.8226332970620239, + "grad_norm": 5.5883612632751465, + "learning_rate": 7.713225039803712e-07, + "loss": 0.1838, + "step": 32508 + }, + { + "epoch": 0.8226586026267176, + "grad_norm": 3.649061441421509, + "learning_rate": 7.711082642458773e-07, + "loss": 0.1087, + "step": 32509 + }, + { + "epoch": 0.8226839081914112, + "grad_norm": 9.361984252929688, + "learning_rate": 7.70894051782719e-07, + "loss": 0.116, + "step": 32510 + }, + { + "epoch": 0.822709213756105, + "grad_norm": 3.3185179233551025, + "learning_rate": 7.706798665922783e-07, + "loss": 0.1458, + "step": 32511 + }, + { + "epoch": 0.8227345193207987, + "grad_norm": 7.470983028411865, + "learning_rate": 7.704657086759353e-07, + "loss": 0.1729, + "step": 32512 + }, + { + "epoch": 0.8227598248854924, + "grad_norm": 6.284575939178467, + "learning_rate": 7.702515780350739e-07, + "loss": 0.1375, + "step": 32513 + }, + { + "epoch": 0.822785130450186, + "grad_norm": 9.628928184509277, + "learning_rate": 7.700374746710715e-07, + "loss": 0.1432, + "step": 32514 + }, + { + "epoch": 0.8228104360148797, + "grad_norm": 5.39077091217041, + "learning_rate": 7.698233985853132e-07, + "loss": 0.107, + "step": 32515 + }, + { + "epoch": 0.8228357415795734, + "grad_norm": 6.420918941497803, + "learning_rate": 7.696093497791745e-07, + "loss": 0.1341, + "step": 32516 + }, + { + "epoch": 0.822861047144267, + "grad_norm": 3.473376750946045, + "learning_rate": 7.693953282540395e-07, + "loss": 0.1068, + "step": 32517 + }, + { + "epoch": 0.8228863527089607, + "grad_norm": 3.601074695587158, + "learning_rate": 7.691813340112858e-07, + "loss": 0.0842, + "step": 32518 + }, + { + "epoch": 0.8229116582736544, + "grad_norm": 6.439310550689697, + "learning_rate": 7.689673670522974e-07, + "loss": 0.192, + "step": 32519 + }, + { + "epoch": 0.822936963838348, + "grad_norm": 10.465335845947266, + "learning_rate": 7.687534273784486e-07, + "loss": 0.2266, + "step": 32520 + }, + { + "epoch": 0.8229622694030417, + "grad_norm": 3.5910143852233887, + "learning_rate": 7.685395149911229e-07, + "loss": 0.126, + "step": 32521 + }, + { + "epoch": 0.8229875749677354, + "grad_norm": 3.559539318084717, + "learning_rate": 7.68325629891698e-07, + "loss": 0.1401, + "step": 32522 + }, + { + "epoch": 0.823012880532429, + "grad_norm": 3.691594123840332, + "learning_rate": 7.681117720815551e-07, + "loss": 0.1231, + "step": 32523 + }, + { + "epoch": 0.8230381860971228, + "grad_norm": 8.581884384155273, + "learning_rate": 7.678979415620718e-07, + "loss": 0.1745, + "step": 32524 + }, + { + "epoch": 0.8230634916618165, + "grad_norm": 4.133457183837891, + "learning_rate": 7.676841383346278e-07, + "loss": 0.1884, + "step": 32525 + }, + { + "epoch": 0.8230887972265101, + "grad_norm": 7.195197105407715, + "learning_rate": 7.674703624006002e-07, + "loss": 0.2238, + "step": 32526 + }, + { + "epoch": 0.8231141027912038, + "grad_norm": 8.48582935333252, + "learning_rate": 7.672566137613702e-07, + "loss": 0.196, + "step": 32527 + }, + { + "epoch": 0.8231394083558975, + "grad_norm": 4.690694332122803, + "learning_rate": 7.670428924183148e-07, + "loss": 0.0745, + "step": 32528 + }, + { + "epoch": 0.8231647139205911, + "grad_norm": 8.207304954528809, + "learning_rate": 7.668291983728127e-07, + "loss": 0.1706, + "step": 32529 + }, + { + "epoch": 0.8231900194852848, + "grad_norm": 5.497509002685547, + "learning_rate": 7.666155316262403e-07, + "loss": 0.1325, + "step": 32530 + }, + { + "epoch": 0.8232153250499785, + "grad_norm": 3.097177267074585, + "learning_rate": 7.664018921799782e-07, + "loss": 0.125, + "step": 32531 + }, + { + "epoch": 0.8232406306146721, + "grad_norm": 4.289323329925537, + "learning_rate": 7.661882800354026e-07, + "loss": 0.0968, + "step": 32532 + }, + { + "epoch": 0.8232659361793658, + "grad_norm": 4.950519561767578, + "learning_rate": 7.659746951938912e-07, + "loss": 0.134, + "step": 32533 + }, + { + "epoch": 0.8232912417440595, + "grad_norm": 9.683707237243652, + "learning_rate": 7.657611376568213e-07, + "loss": 0.155, + "step": 32534 + }, + { + "epoch": 0.8233165473087531, + "grad_norm": 7.90742826461792, + "learning_rate": 7.655476074255691e-07, + "loss": 0.1597, + "step": 32535 + }, + { + "epoch": 0.8233418528734469, + "grad_norm": 4.347143650054932, + "learning_rate": 7.653341045015139e-07, + "loss": 0.1479, + "step": 32536 + }, + { + "epoch": 0.8233671584381406, + "grad_norm": 5.415891647338867, + "learning_rate": 7.651206288860314e-07, + "loss": 0.1831, + "step": 32537 + }, + { + "epoch": 0.8233924640028342, + "grad_norm": 7.274389266967773, + "learning_rate": 7.649071805804987e-07, + "loss": 0.2328, + "step": 32538 + }, + { + "epoch": 0.8234177695675279, + "grad_norm": 3.72304368019104, + "learning_rate": 7.646937595862897e-07, + "loss": 0.1305, + "step": 32539 + }, + { + "epoch": 0.8234430751322216, + "grad_norm": 8.278097152709961, + "learning_rate": 7.644803659047845e-07, + "loss": 0.1602, + "step": 32540 + }, + { + "epoch": 0.8234683806969153, + "grad_norm": 5.764975070953369, + "learning_rate": 7.642669995373575e-07, + "loss": 0.2278, + "step": 32541 + }, + { + "epoch": 0.8234936862616089, + "grad_norm": 12.36032485961914, + "learning_rate": 7.640536604853849e-07, + "loss": 0.2163, + "step": 32542 + }, + { + "epoch": 0.8235189918263026, + "grad_norm": 3.560422420501709, + "learning_rate": 7.63840348750241e-07, + "loss": 0.1194, + "step": 32543 + }, + { + "epoch": 0.8235442973909963, + "grad_norm": 5.105190753936768, + "learning_rate": 7.636270643333038e-07, + "loss": 0.1902, + "step": 32544 + }, + { + "epoch": 0.8235696029556899, + "grad_norm": 2.7381019592285156, + "learning_rate": 7.634138072359465e-07, + "loss": 0.1012, + "step": 32545 + }, + { + "epoch": 0.8235949085203836, + "grad_norm": 8.137588500976562, + "learning_rate": 7.63200577459548e-07, + "loss": 0.1816, + "step": 32546 + }, + { + "epoch": 0.8236202140850774, + "grad_norm": 4.319565773010254, + "learning_rate": 7.629873750054784e-07, + "loss": 0.1521, + "step": 32547 + }, + { + "epoch": 0.823645519649771, + "grad_norm": 4.171066761016846, + "learning_rate": 7.627741998751159e-07, + "loss": 0.2267, + "step": 32548 + }, + { + "epoch": 0.8236708252144647, + "grad_norm": 3.849529266357422, + "learning_rate": 7.625610520698334e-07, + "loss": 0.1441, + "step": 32549 + }, + { + "epoch": 0.8236961307791584, + "grad_norm": 3.207362413406372, + "learning_rate": 7.623479315910093e-07, + "loss": 0.1323, + "step": 32550 + }, + { + "epoch": 0.823721436343852, + "grad_norm": 2.74198055267334, + "learning_rate": 7.621348384400124e-07, + "loss": 0.1316, + "step": 32551 + }, + { + "epoch": 0.8237467419085457, + "grad_norm": 4.355624675750732, + "learning_rate": 7.619217726182204e-07, + "loss": 0.1765, + "step": 32552 + }, + { + "epoch": 0.8237720474732394, + "grad_norm": 4.228490352630615, + "learning_rate": 7.617087341270058e-07, + "loss": 0.183, + "step": 32553 + }, + { + "epoch": 0.823797353037933, + "grad_norm": 6.88065767288208, + "learning_rate": 7.614957229677455e-07, + "loss": 0.2211, + "step": 32554 + }, + { + "epoch": 0.8238226586026267, + "grad_norm": 8.799464225769043, + "learning_rate": 7.612827391418088e-07, + "loss": 0.2735, + "step": 32555 + }, + { + "epoch": 0.8238479641673204, + "grad_norm": 4.165823459625244, + "learning_rate": 7.61069782650572e-07, + "loss": 0.151, + "step": 32556 + }, + { + "epoch": 0.823873269732014, + "grad_norm": 3.8537142276763916, + "learning_rate": 7.608568534954063e-07, + "loss": 0.1459, + "step": 32557 + }, + { + "epoch": 0.8238985752967077, + "grad_norm": 3.147706985473633, + "learning_rate": 7.606439516776881e-07, + "loss": 0.181, + "step": 32558 + }, + { + "epoch": 0.8239238808614014, + "grad_norm": 7.782618522644043, + "learning_rate": 7.604310771987883e-07, + "loss": 0.1642, + "step": 32559 + }, + { + "epoch": 0.823949186426095, + "grad_norm": 3.8759679794311523, + "learning_rate": 7.602182300600796e-07, + "loss": 0.1331, + "step": 32560 + }, + { + "epoch": 0.8239744919907888, + "grad_norm": 4.5840983390808105, + "learning_rate": 7.600054102629346e-07, + "loss": 0.1168, + "step": 32561 + }, + { + "epoch": 0.8239997975554825, + "grad_norm": 10.278362274169922, + "learning_rate": 7.597926178087251e-07, + "loss": 0.2303, + "step": 32562 + }, + { + "epoch": 0.8240251031201761, + "grad_norm": 6.121805191040039, + "learning_rate": 7.595798526988258e-07, + "loss": 0.1836, + "step": 32563 + }, + { + "epoch": 0.8240504086848698, + "grad_norm": 4.85589075088501, + "learning_rate": 7.593671149346072e-07, + "loss": 0.2028, + "step": 32564 + }, + { + "epoch": 0.8240757142495635, + "grad_norm": 3.420152425765991, + "learning_rate": 7.591544045174409e-07, + "loss": 0.1129, + "step": 32565 + }, + { + "epoch": 0.8241010198142572, + "grad_norm": 12.517441749572754, + "learning_rate": 7.589417214486977e-07, + "loss": 0.2454, + "step": 32566 + }, + { + "epoch": 0.8241263253789508, + "grad_norm": 5.157712936401367, + "learning_rate": 7.587290657297519e-07, + "loss": 0.1242, + "step": 32567 + }, + { + "epoch": 0.8241516309436445, + "grad_norm": 7.942773818969727, + "learning_rate": 7.585164373619735e-07, + "loss": 0.1723, + "step": 32568 + }, + { + "epoch": 0.8241769365083382, + "grad_norm": 10.29056453704834, + "learning_rate": 7.583038363467332e-07, + "loss": 0.1693, + "step": 32569 + }, + { + "epoch": 0.8242022420730318, + "grad_norm": 5.481783390045166, + "learning_rate": 7.58091262685402e-07, + "loss": 0.1782, + "step": 32570 + }, + { + "epoch": 0.8242275476377255, + "grad_norm": 8.157181739807129, + "learning_rate": 7.578787163793516e-07, + "loss": 0.1315, + "step": 32571 + }, + { + "epoch": 0.8242528532024193, + "grad_norm": 2.8975799083709717, + "learning_rate": 7.576661974299526e-07, + "loss": 0.1085, + "step": 32572 + }, + { + "epoch": 0.8242781587671129, + "grad_norm": 4.147947788238525, + "learning_rate": 7.57453705838575e-07, + "loss": 0.1658, + "step": 32573 + }, + { + "epoch": 0.8243034643318066, + "grad_norm": 6.647599220275879, + "learning_rate": 7.572412416065883e-07, + "loss": 0.1939, + "step": 32574 + }, + { + "epoch": 0.8243287698965003, + "grad_norm": 7.535408973693848, + "learning_rate": 7.570288047353647e-07, + "loss": 0.1787, + "step": 32575 + }, + { + "epoch": 0.8243540754611939, + "grad_norm": 4.115249156951904, + "learning_rate": 7.568163952262719e-07, + "loss": 0.1571, + "step": 32576 + }, + { + "epoch": 0.8243793810258876, + "grad_norm": 5.220215320587158, + "learning_rate": 7.566040130806834e-07, + "loss": 0.231, + "step": 32577 + }, + { + "epoch": 0.8244046865905813, + "grad_norm": 8.397391319274902, + "learning_rate": 7.563916582999642e-07, + "loss": 0.2532, + "step": 32578 + }, + { + "epoch": 0.8244299921552749, + "grad_norm": 3.3927290439605713, + "learning_rate": 7.561793308854865e-07, + "loss": 0.1408, + "step": 32579 + }, + { + "epoch": 0.8244552977199686, + "grad_norm": 3.370666980743408, + "learning_rate": 7.559670308386174e-07, + "loss": 0.072, + "step": 32580 + }, + { + "epoch": 0.8244806032846623, + "grad_norm": 5.033115863800049, + "learning_rate": 7.557547581607306e-07, + "loss": 0.1647, + "step": 32581 + }, + { + "epoch": 0.8245059088493559, + "grad_norm": 3.5956501960754395, + "learning_rate": 7.55542512853189e-07, + "loss": 0.146, + "step": 32582 + }, + { + "epoch": 0.8245312144140496, + "grad_norm": 8.713809967041016, + "learning_rate": 7.553302949173658e-07, + "loss": 0.2231, + "step": 32583 + }, + { + "epoch": 0.8245565199787434, + "grad_norm": 7.5054612159729, + "learning_rate": 7.551181043546263e-07, + "loss": 0.158, + "step": 32584 + }, + { + "epoch": 0.824581825543437, + "grad_norm": 5.174101829528809, + "learning_rate": 7.549059411663434e-07, + "loss": 0.1739, + "step": 32585 + }, + { + "epoch": 0.8246071311081307, + "grad_norm": 3.2384560108184814, + "learning_rate": 7.546938053538794e-07, + "loss": 0.1455, + "step": 32586 + }, + { + "epoch": 0.8246324366728244, + "grad_norm": 8.467872619628906, + "learning_rate": 7.544816969186069e-07, + "loss": 0.1725, + "step": 32587 + }, + { + "epoch": 0.824657742237518, + "grad_norm": 2.580498456954956, + "learning_rate": 7.542696158618912e-07, + "loss": 0.1149, + "step": 32588 + }, + { + "epoch": 0.8246830478022117, + "grad_norm": 15.708063125610352, + "learning_rate": 7.54057562185102e-07, + "loss": 0.295, + "step": 32589 + }, + { + "epoch": 0.8247083533669054, + "grad_norm": 3.0082221031188965, + "learning_rate": 7.538455358896058e-07, + "loss": 0.0659, + "step": 32590 + }, + { + "epoch": 0.8247336589315991, + "grad_norm": 6.537162780761719, + "learning_rate": 7.5363353697677e-07, + "loss": 0.1712, + "step": 32591 + }, + { + "epoch": 0.8247589644962927, + "grad_norm": 8.478059768676758, + "learning_rate": 7.534215654479616e-07, + "loss": 0.2151, + "step": 32592 + }, + { + "epoch": 0.8247842700609864, + "grad_norm": 6.294717788696289, + "learning_rate": 7.532096213045465e-07, + "loss": 0.1417, + "step": 32593 + }, + { + "epoch": 0.8248095756256801, + "grad_norm": 8.688569068908691, + "learning_rate": 7.529977045478937e-07, + "loss": 0.1321, + "step": 32594 + }, + { + "epoch": 0.8248348811903737, + "grad_norm": 7.285594940185547, + "learning_rate": 7.527858151793682e-07, + "loss": 0.1732, + "step": 32595 + }, + { + "epoch": 0.8248601867550674, + "grad_norm": 5.445669651031494, + "learning_rate": 7.525739532003373e-07, + "loss": 0.198, + "step": 32596 + }, + { + "epoch": 0.8248854923197612, + "grad_norm": 4.902139663696289, + "learning_rate": 7.523621186121661e-07, + "loss": 0.1455, + "step": 32597 + }, + { + "epoch": 0.8249107978844548, + "grad_norm": 6.5052080154418945, + "learning_rate": 7.521503114162221e-07, + "loss": 0.1729, + "step": 32598 + }, + { + "epoch": 0.8249361034491485, + "grad_norm": 6.125598907470703, + "learning_rate": 7.519385316138711e-07, + "loss": 0.1188, + "step": 32599 + }, + { + "epoch": 0.8249614090138422, + "grad_norm": 5.578277587890625, + "learning_rate": 7.51726779206478e-07, + "loss": 0.1923, + "step": 32600 + }, + { + "epoch": 0.8249867145785358, + "grad_norm": 4.809469699859619, + "learning_rate": 7.515150541954075e-07, + "loss": 0.1488, + "step": 32601 + }, + { + "epoch": 0.8250120201432295, + "grad_norm": 2.842639446258545, + "learning_rate": 7.513033565820272e-07, + "loss": 0.091, + "step": 32602 + }, + { + "epoch": 0.8250373257079232, + "grad_norm": 3.8910675048828125, + "learning_rate": 7.510916863677015e-07, + "loss": 0.1022, + "step": 32603 + }, + { + "epoch": 0.8250626312726168, + "grad_norm": 8.794760704040527, + "learning_rate": 7.508800435537949e-07, + "loss": 0.2566, + "step": 32604 + }, + { + "epoch": 0.8250879368373105, + "grad_norm": 3.910687208175659, + "learning_rate": 7.506684281416716e-07, + "loss": 0.2055, + "step": 32605 + }, + { + "epoch": 0.8251132424020042, + "grad_norm": 5.107390880584717, + "learning_rate": 7.504568401326979e-07, + "loss": 0.2248, + "step": 32606 + }, + { + "epoch": 0.8251385479666978, + "grad_norm": 5.803221702575684, + "learning_rate": 7.502452795282378e-07, + "loss": 0.0904, + "step": 32607 + }, + { + "epoch": 0.8251638535313915, + "grad_norm": 5.2087836265563965, + "learning_rate": 7.500337463296553e-07, + "loss": 0.1221, + "step": 32608 + }, + { + "epoch": 0.8251891590960853, + "grad_norm": 4.399259090423584, + "learning_rate": 7.498222405383138e-07, + "loss": 0.1156, + "step": 32609 + }, + { + "epoch": 0.8252144646607789, + "grad_norm": 12.027740478515625, + "learning_rate": 7.496107621555793e-07, + "loss": 0.2233, + "step": 32610 + }, + { + "epoch": 0.8252397702254726, + "grad_norm": 5.286689281463623, + "learning_rate": 7.493993111828124e-07, + "loss": 0.1798, + "step": 32611 + }, + { + "epoch": 0.8252650757901663, + "grad_norm": 6.122295379638672, + "learning_rate": 7.491878876213815e-07, + "loss": 0.1319, + "step": 32612 + }, + { + "epoch": 0.8252903813548599, + "grad_norm": 3.053534746170044, + "learning_rate": 7.489764914726449e-07, + "loss": 0.1025, + "step": 32613 + }, + { + "epoch": 0.8253156869195536, + "grad_norm": 2.254232883453369, + "learning_rate": 7.487651227379694e-07, + "loss": 0.1393, + "step": 32614 + }, + { + "epoch": 0.8253409924842473, + "grad_norm": 6.28594970703125, + "learning_rate": 7.485537814187155e-07, + "loss": 0.1466, + "step": 32615 + }, + { + "epoch": 0.825366298048941, + "grad_norm": 2.1524672508239746, + "learning_rate": 7.483424675162498e-07, + "loss": 0.0795, + "step": 32616 + }, + { + "epoch": 0.8253916036136346, + "grad_norm": 4.452322483062744, + "learning_rate": 7.481311810319308e-07, + "loss": 0.1481, + "step": 32617 + }, + { + "epoch": 0.8254169091783283, + "grad_norm": 14.021273612976074, + "learning_rate": 7.479199219671235e-07, + "loss": 0.3968, + "step": 32618 + }, + { + "epoch": 0.825442214743022, + "grad_norm": 4.591229438781738, + "learning_rate": 7.477086903231895e-07, + "loss": 0.1363, + "step": 32619 + }, + { + "epoch": 0.8254675203077156, + "grad_norm": 8.7699613571167, + "learning_rate": 7.474974861014906e-07, + "loss": 0.1941, + "step": 32620 + }, + { + "epoch": 0.8254928258724094, + "grad_norm": 13.243733406066895, + "learning_rate": 7.4728630930339e-07, + "loss": 0.255, + "step": 32621 + }, + { + "epoch": 0.8255181314371031, + "grad_norm": 4.9783711433410645, + "learning_rate": 7.470751599302495e-07, + "loss": 0.1332, + "step": 32622 + }, + { + "epoch": 0.8255434370017967, + "grad_norm": 5.568615436553955, + "learning_rate": 7.468640379834297e-07, + "loss": 0.1988, + "step": 32623 + }, + { + "epoch": 0.8255687425664904, + "grad_norm": 3.157560348510742, + "learning_rate": 7.466529434642916e-07, + "loss": 0.1374, + "step": 32624 + }, + { + "epoch": 0.8255940481311841, + "grad_norm": 6.714371204376221, + "learning_rate": 7.464418763741988e-07, + "loss": 0.1227, + "step": 32625 + }, + { + "epoch": 0.8256193536958777, + "grad_norm": 6.113460063934326, + "learning_rate": 7.46230836714511e-07, + "loss": 0.1822, + "step": 32626 + }, + { + "epoch": 0.8256446592605714, + "grad_norm": 8.858428955078125, + "learning_rate": 7.46019824486589e-07, + "loss": 0.1816, + "step": 32627 + }, + { + "epoch": 0.8256699648252651, + "grad_norm": 2.457723379135132, + "learning_rate": 7.458088396917929e-07, + "loss": 0.0976, + "step": 32628 + }, + { + "epoch": 0.8256952703899587, + "grad_norm": 6.235799789428711, + "learning_rate": 7.455978823314858e-07, + "loss": 0.1282, + "step": 32629 + }, + { + "epoch": 0.8257205759546524, + "grad_norm": 6.608529567718506, + "learning_rate": 7.453869524070262e-07, + "loss": 0.0883, + "step": 32630 + }, + { + "epoch": 0.8257458815193461, + "grad_norm": 14.741695404052734, + "learning_rate": 7.451760499197747e-07, + "loss": 0.2063, + "step": 32631 + }, + { + "epoch": 0.8257711870840397, + "grad_norm": 4.306990146636963, + "learning_rate": 7.449651748710901e-07, + "loss": 0.1716, + "step": 32632 + }, + { + "epoch": 0.8257964926487334, + "grad_norm": 10.96881103515625, + "learning_rate": 7.447543272623348e-07, + "loss": 0.2148, + "step": 32633 + }, + { + "epoch": 0.8258217982134272, + "grad_norm": 4.601404666900635, + "learning_rate": 7.445435070948676e-07, + "loss": 0.1535, + "step": 32634 + }, + { + "epoch": 0.8258471037781208, + "grad_norm": 4.82015323638916, + "learning_rate": 7.443327143700474e-07, + "loss": 0.2037, + "step": 32635 + }, + { + "epoch": 0.8258724093428145, + "grad_norm": 7.365521430969238, + "learning_rate": 7.441219490892332e-07, + "loss": 0.2074, + "step": 32636 + }, + { + "epoch": 0.8258977149075082, + "grad_norm": 6.676237106323242, + "learning_rate": 7.439112112537855e-07, + "loss": 0.18, + "step": 32637 + }, + { + "epoch": 0.8259230204722018, + "grad_norm": 3.420217752456665, + "learning_rate": 7.437005008650633e-07, + "loss": 0.1356, + "step": 32638 + }, + { + "epoch": 0.8259483260368955, + "grad_norm": 8.049139022827148, + "learning_rate": 7.434898179244243e-07, + "loss": 0.1932, + "step": 32639 + }, + { + "epoch": 0.8259736316015892, + "grad_norm": 5.037966728210449, + "learning_rate": 7.432791624332269e-07, + "loss": 0.1401, + "step": 32640 + }, + { + "epoch": 0.8259989371662829, + "grad_norm": 9.51724624633789, + "learning_rate": 7.430685343928312e-07, + "loss": 0.1614, + "step": 32641 + }, + { + "epoch": 0.8260242427309765, + "grad_norm": 5.722326755523682, + "learning_rate": 7.428579338045938e-07, + "loss": 0.0862, + "step": 32642 + }, + { + "epoch": 0.8260495482956702, + "grad_norm": 5.33737325668335, + "learning_rate": 7.426473606698758e-07, + "loss": 0.186, + "step": 32643 + }, + { + "epoch": 0.826074853860364, + "grad_norm": 8.518115997314453, + "learning_rate": 7.424368149900307e-07, + "loss": 0.2271, + "step": 32644 + }, + { + "epoch": 0.8261001594250575, + "grad_norm": 2.9257571697235107, + "learning_rate": 7.422262967664201e-07, + "loss": 0.1128, + "step": 32645 + }, + { + "epoch": 0.8261254649897513, + "grad_norm": 3.8513522148132324, + "learning_rate": 7.420158060004001e-07, + "loss": 0.1301, + "step": 32646 + }, + { + "epoch": 0.826150770554445, + "grad_norm": 4.382762432098389, + "learning_rate": 7.418053426933275e-07, + "loss": 0.1227, + "step": 32647 + }, + { + "epoch": 0.8261760761191386, + "grad_norm": 4.005163669586182, + "learning_rate": 7.415949068465594e-07, + "loss": 0.1312, + "step": 32648 + }, + { + "epoch": 0.8262013816838323, + "grad_norm": 4.26499080657959, + "learning_rate": 7.413844984614549e-07, + "loss": 0.1272, + "step": 32649 + }, + { + "epoch": 0.826226687248526, + "grad_norm": 3.778151273727417, + "learning_rate": 7.411741175393694e-07, + "loss": 0.1445, + "step": 32650 + }, + { + "epoch": 0.8262519928132196, + "grad_norm": 2.958766460418701, + "learning_rate": 7.409637640816592e-07, + "loss": 0.1299, + "step": 32651 + }, + { + "epoch": 0.8262772983779133, + "grad_norm": 3.8424267768859863, + "learning_rate": 7.407534380896819e-07, + "loss": 0.1415, + "step": 32652 + }, + { + "epoch": 0.826302603942607, + "grad_norm": 3.497905731201172, + "learning_rate": 7.405431395647939e-07, + "loss": 0.1353, + "step": 32653 + }, + { + "epoch": 0.8263279095073006, + "grad_norm": 4.374156475067139, + "learning_rate": 7.403328685083505e-07, + "loss": 0.1056, + "step": 32654 + }, + { + "epoch": 0.8263532150719943, + "grad_norm": 5.169819355010986, + "learning_rate": 7.40122624921707e-07, + "loss": 0.0715, + "step": 32655 + }, + { + "epoch": 0.826378520636688, + "grad_norm": 9.123540878295898, + "learning_rate": 7.399124088062226e-07, + "loss": 0.2159, + "step": 32656 + }, + { + "epoch": 0.8264038262013816, + "grad_norm": 4.9185380935668945, + "learning_rate": 7.397022201632487e-07, + "loss": 0.1598, + "step": 32657 + }, + { + "epoch": 0.8264291317660754, + "grad_norm": 3.410564422607422, + "learning_rate": 7.394920589941435e-07, + "loss": 0.1176, + "step": 32658 + }, + { + "epoch": 0.8264544373307691, + "grad_norm": 2.6851747035980225, + "learning_rate": 7.392819253002603e-07, + "loss": 0.1363, + "step": 32659 + }, + { + "epoch": 0.8264797428954627, + "grad_norm": 6.062051773071289, + "learning_rate": 7.390718190829577e-07, + "loss": 0.1505, + "step": 32660 + }, + { + "epoch": 0.8265050484601564, + "grad_norm": 8.376067161560059, + "learning_rate": 7.38861740343586e-07, + "loss": 0.2159, + "step": 32661 + }, + { + "epoch": 0.8265303540248501, + "grad_norm": 8.60551929473877, + "learning_rate": 7.386516890835038e-07, + "loss": 0.1794, + "step": 32662 + }, + { + "epoch": 0.8265556595895437, + "grad_norm": 5.301724910736084, + "learning_rate": 7.384416653040633e-07, + "loss": 0.1329, + "step": 32663 + }, + { + "epoch": 0.8265809651542374, + "grad_norm": 3.0968661308288574, + "learning_rate": 7.382316690066199e-07, + "loss": 0.1187, + "step": 32664 + }, + { + "epoch": 0.8266062707189311, + "grad_norm": 4.066795349121094, + "learning_rate": 7.380217001925288e-07, + "loss": 0.1782, + "step": 32665 + }, + { + "epoch": 0.8266315762836247, + "grad_norm": 3.594291925430298, + "learning_rate": 7.378117588631423e-07, + "loss": 0.1118, + "step": 32666 + }, + { + "epoch": 0.8266568818483184, + "grad_norm": 8.024338722229004, + "learning_rate": 7.376018450198142e-07, + "loss": 0.177, + "step": 32667 + }, + { + "epoch": 0.8266821874130121, + "grad_norm": 8.4561128616333, + "learning_rate": 7.373919586638994e-07, + "loss": 0.1477, + "step": 32668 + }, + { + "epoch": 0.8267074929777058, + "grad_norm": 6.489500999450684, + "learning_rate": 7.371820997967516e-07, + "loss": 0.1583, + "step": 32669 + }, + { + "epoch": 0.8267327985423994, + "grad_norm": 3.030163049697876, + "learning_rate": 7.369722684197233e-07, + "loss": 0.1313, + "step": 32670 + }, + { + "epoch": 0.8267581041070932, + "grad_norm": 10.043477058410645, + "learning_rate": 7.367624645341665e-07, + "loss": 0.2423, + "step": 32671 + }, + { + "epoch": 0.8267834096717869, + "grad_norm": 6.793388366699219, + "learning_rate": 7.36552688141437e-07, + "loss": 0.1629, + "step": 32672 + }, + { + "epoch": 0.8268087152364805, + "grad_norm": 3.3475828170776367, + "learning_rate": 7.363429392428851e-07, + "loss": 0.1284, + "step": 32673 + }, + { + "epoch": 0.8268340208011742, + "grad_norm": 8.350741386413574, + "learning_rate": 7.361332178398661e-07, + "loss": 0.1513, + "step": 32674 + }, + { + "epoch": 0.8268593263658679, + "grad_norm": 5.74709939956665, + "learning_rate": 7.359235239337292e-07, + "loss": 0.1893, + "step": 32675 + }, + { + "epoch": 0.8268846319305615, + "grad_norm": 7.848998546600342, + "learning_rate": 7.357138575258293e-07, + "loss": 0.1871, + "step": 32676 + }, + { + "epoch": 0.8269099374952552, + "grad_norm": 4.4501166343688965, + "learning_rate": 7.355042186175177e-07, + "loss": 0.1474, + "step": 32677 + }, + { + "epoch": 0.8269352430599489, + "grad_norm": 5.510499477386475, + "learning_rate": 7.352946072101458e-07, + "loss": 0.1168, + "step": 32678 + }, + { + "epoch": 0.8269605486246425, + "grad_norm": 4.724584102630615, + "learning_rate": 7.350850233050649e-07, + "loss": 0.1428, + "step": 32679 + }, + { + "epoch": 0.8269858541893362, + "grad_norm": 3.625122308731079, + "learning_rate": 7.348754669036279e-07, + "loss": 0.1456, + "step": 32680 + }, + { + "epoch": 0.82701115975403, + "grad_norm": 2.7946159839630127, + "learning_rate": 7.346659380071864e-07, + "loss": 0.0778, + "step": 32681 + }, + { + "epoch": 0.8270364653187235, + "grad_norm": 4.094173908233643, + "learning_rate": 7.344564366170904e-07, + "loss": 0.1388, + "step": 32682 + }, + { + "epoch": 0.8270617708834173, + "grad_norm": 4.542484760284424, + "learning_rate": 7.342469627346915e-07, + "loss": 0.1435, + "step": 32683 + }, + { + "epoch": 0.827087076448111, + "grad_norm": 3.4257659912109375, + "learning_rate": 7.340375163613389e-07, + "loss": 0.1256, + "step": 32684 + }, + { + "epoch": 0.8271123820128046, + "grad_norm": 2.9508254528045654, + "learning_rate": 7.338280974983864e-07, + "loss": 0.0733, + "step": 32685 + }, + { + "epoch": 0.8271376875774983, + "grad_norm": 8.823681831359863, + "learning_rate": 7.336187061471817e-07, + "loss": 0.2172, + "step": 32686 + }, + { + "epoch": 0.827162993142192, + "grad_norm": 8.011445999145508, + "learning_rate": 7.334093423090788e-07, + "loss": 0.1742, + "step": 32687 + }, + { + "epoch": 0.8271882987068856, + "grad_norm": 3.871856689453125, + "learning_rate": 7.332000059854227e-07, + "loss": 0.0871, + "step": 32688 + }, + { + "epoch": 0.8272136042715793, + "grad_norm": 4.719619274139404, + "learning_rate": 7.329906971775675e-07, + "loss": 0.1816, + "step": 32689 + }, + { + "epoch": 0.827238909836273, + "grad_norm": 11.23343276977539, + "learning_rate": 7.327814158868607e-07, + "loss": 0.191, + "step": 32690 + }, + { + "epoch": 0.8272642154009666, + "grad_norm": 7.458145618438721, + "learning_rate": 7.325721621146548e-07, + "loss": 0.1919, + "step": 32691 + }, + { + "epoch": 0.8272895209656603, + "grad_norm": 9.202420234680176, + "learning_rate": 7.323629358622947e-07, + "loss": 0.1873, + "step": 32692 + }, + { + "epoch": 0.827314826530354, + "grad_norm": 3.631779909133911, + "learning_rate": 7.321537371311338e-07, + "loss": 0.1118, + "step": 32693 + }, + { + "epoch": 0.8273401320950478, + "grad_norm": 19.469247817993164, + "learning_rate": 7.319445659225177e-07, + "loss": 0.2652, + "step": 32694 + }, + { + "epoch": 0.8273654376597414, + "grad_norm": 2.6186509132385254, + "learning_rate": 7.317354222377993e-07, + "loss": 0.0853, + "step": 32695 + }, + { + "epoch": 0.8273907432244351, + "grad_norm": 3.094128131866455, + "learning_rate": 7.315263060783245e-07, + "loss": 0.1028, + "step": 32696 + }, + { + "epoch": 0.8274160487891288, + "grad_norm": 7.017611026763916, + "learning_rate": 7.313172174454425e-07, + "loss": 0.1074, + "step": 32697 + }, + { + "epoch": 0.8274413543538224, + "grad_norm": 4.854513645172119, + "learning_rate": 7.311081563405009e-07, + "loss": 0.0982, + "step": 32698 + }, + { + "epoch": 0.8274666599185161, + "grad_norm": 3.483611583709717, + "learning_rate": 7.308991227648498e-07, + "loss": 0.1619, + "step": 32699 + }, + { + "epoch": 0.8274919654832098, + "grad_norm": 4.002252578735352, + "learning_rate": 7.30690116719836e-07, + "loss": 0.181, + "step": 32700 + }, + { + "epoch": 0.8275172710479034, + "grad_norm": 2.878312349319458, + "learning_rate": 7.304811382068078e-07, + "loss": 0.0634, + "step": 32701 + }, + { + "epoch": 0.8275425766125971, + "grad_norm": 5.5753607749938965, + "learning_rate": 7.302721872271113e-07, + "loss": 0.1114, + "step": 32702 + }, + { + "epoch": 0.8275678821772908, + "grad_norm": 4.957672595977783, + "learning_rate": 7.300632637820965e-07, + "loss": 0.144, + "step": 32703 + }, + { + "epoch": 0.8275931877419844, + "grad_norm": 3.570596933364868, + "learning_rate": 7.29854367873109e-07, + "loss": 0.0938, + "step": 32704 + }, + { + "epoch": 0.8276184933066781, + "grad_norm": 6.445470333099365, + "learning_rate": 7.296454995014968e-07, + "loss": 0.1978, + "step": 32705 + }, + { + "epoch": 0.8276437988713718, + "grad_norm": 9.280322074890137, + "learning_rate": 7.294366586686053e-07, + "loss": 0.1847, + "step": 32706 + }, + { + "epoch": 0.8276691044360655, + "grad_norm": 5.636059761047363, + "learning_rate": 7.292278453757834e-07, + "loss": 0.1242, + "step": 32707 + }, + { + "epoch": 0.8276944100007592, + "grad_norm": 2.5074267387390137, + "learning_rate": 7.290190596243763e-07, + "loss": 0.0977, + "step": 32708 + }, + { + "epoch": 0.8277197155654529, + "grad_norm": 5.058594703674316, + "learning_rate": 7.288103014157311e-07, + "loss": 0.1628, + "step": 32709 + }, + { + "epoch": 0.8277450211301465, + "grad_norm": 7.5585527420043945, + "learning_rate": 7.286015707511928e-07, + "loss": 0.226, + "step": 32710 + }, + { + "epoch": 0.8277703266948402, + "grad_norm": 3.9988701343536377, + "learning_rate": 7.283928676321095e-07, + "loss": 0.1694, + "step": 32711 + }, + { + "epoch": 0.8277956322595339, + "grad_norm": 4.323723316192627, + "learning_rate": 7.28184192059826e-07, + "loss": 0.1953, + "step": 32712 + }, + { + "epoch": 0.8278209378242275, + "grad_norm": 8.712244987487793, + "learning_rate": 7.279755440356883e-07, + "loss": 0.2011, + "step": 32713 + }, + { + "epoch": 0.8278462433889212, + "grad_norm": 3.6758382320404053, + "learning_rate": 7.277669235610413e-07, + "loss": 0.1861, + "step": 32714 + }, + { + "epoch": 0.8278715489536149, + "grad_norm": 5.823757171630859, + "learning_rate": 7.275583306372297e-07, + "loss": 0.1416, + "step": 32715 + }, + { + "epoch": 0.8278968545183085, + "grad_norm": 7.339056968688965, + "learning_rate": 7.273497652656008e-07, + "loss": 0.1703, + "step": 32716 + }, + { + "epoch": 0.8279221600830022, + "grad_norm": 9.68095874786377, + "learning_rate": 7.271412274474971e-07, + "loss": 0.2252, + "step": 32717 + }, + { + "epoch": 0.827947465647696, + "grad_norm": 2.2917635440826416, + "learning_rate": 7.269327171842671e-07, + "loss": 0.1069, + "step": 32718 + }, + { + "epoch": 0.8279727712123897, + "grad_norm": 9.298830032348633, + "learning_rate": 7.267242344772513e-07, + "loss": 0.2141, + "step": 32719 + }, + { + "epoch": 0.8279980767770833, + "grad_norm": 6.380732536315918, + "learning_rate": 7.265157793277971e-07, + "loss": 0.1432, + "step": 32720 + }, + { + "epoch": 0.828023382341777, + "grad_norm": 4.577319145202637, + "learning_rate": 7.263073517372465e-07, + "loss": 0.1823, + "step": 32721 + }, + { + "epoch": 0.8280486879064707, + "grad_norm": 5.162574768066406, + "learning_rate": 7.260989517069466e-07, + "loss": 0.2204, + "step": 32722 + }, + { + "epoch": 0.8280739934711643, + "grad_norm": 4.578628063201904, + "learning_rate": 7.258905792382381e-07, + "loss": 0.1754, + "step": 32723 + }, + { + "epoch": 0.828099299035858, + "grad_norm": 5.846034526824951, + "learning_rate": 7.256822343324671e-07, + "loss": 0.1833, + "step": 32724 + }, + { + "epoch": 0.8281246046005517, + "grad_norm": 5.476309299468994, + "learning_rate": 7.254739169909747e-07, + "loss": 0.1268, + "step": 32725 + }, + { + "epoch": 0.8281499101652453, + "grad_norm": 5.146862030029297, + "learning_rate": 7.252656272151076e-07, + "loss": 0.1399, + "step": 32726 + }, + { + "epoch": 0.828175215729939, + "grad_norm": 7.046669960021973, + "learning_rate": 7.25057365006207e-07, + "loss": 0.1536, + "step": 32727 + }, + { + "epoch": 0.8282005212946327, + "grad_norm": 4.777374267578125, + "learning_rate": 7.248491303656163e-07, + "loss": 0.1366, + "step": 32728 + }, + { + "epoch": 0.8282258268593263, + "grad_norm": 3.737086772918701, + "learning_rate": 7.246409232946772e-07, + "loss": 0.0802, + "step": 32729 + }, + { + "epoch": 0.82825113242402, + "grad_norm": 5.1358489990234375, + "learning_rate": 7.244327437947346e-07, + "loss": 0.1792, + "step": 32730 + }, + { + "epoch": 0.8282764379887138, + "grad_norm": 9.506308555603027, + "learning_rate": 7.242245918671298e-07, + "loss": 0.308, + "step": 32731 + }, + { + "epoch": 0.8283017435534074, + "grad_norm": 7.284581184387207, + "learning_rate": 7.240164675132055e-07, + "loss": 0.1798, + "step": 32732 + }, + { + "epoch": 0.8283270491181011, + "grad_norm": 5.520865440368652, + "learning_rate": 7.238083707343024e-07, + "loss": 0.1554, + "step": 32733 + }, + { + "epoch": 0.8283523546827948, + "grad_norm": 3.8237810134887695, + "learning_rate": 7.236003015317644e-07, + "loss": 0.1314, + "step": 32734 + }, + { + "epoch": 0.8283776602474884, + "grad_norm": 3.8738980293273926, + "learning_rate": 7.233922599069332e-07, + "loss": 0.1935, + "step": 32735 + }, + { + "epoch": 0.8284029658121821, + "grad_norm": 3.38055157661438, + "learning_rate": 7.231842458611493e-07, + "loss": 0.0994, + "step": 32736 + }, + { + "epoch": 0.8284282713768758, + "grad_norm": 7.277002334594727, + "learning_rate": 7.22976259395754e-07, + "loss": 0.1622, + "step": 32737 + }, + { + "epoch": 0.8284535769415694, + "grad_norm": 3.8001010417938232, + "learning_rate": 7.227683005120901e-07, + "loss": 0.1223, + "step": 32738 + }, + { + "epoch": 0.8284788825062631, + "grad_norm": 5.82688045501709, + "learning_rate": 7.22560369211498e-07, + "loss": 0.1756, + "step": 32739 + }, + { + "epoch": 0.8285041880709568, + "grad_norm": 15.776061058044434, + "learning_rate": 7.223524654953179e-07, + "loss": 0.2312, + "step": 32740 + }, + { + "epoch": 0.8285294936356504, + "grad_norm": 7.163781642913818, + "learning_rate": 7.221445893648915e-07, + "loss": 0.2059, + "step": 32741 + }, + { + "epoch": 0.8285547992003441, + "grad_norm": 4.052754878997803, + "learning_rate": 7.219367408215577e-07, + "loss": 0.1045, + "step": 32742 + }, + { + "epoch": 0.8285801047650379, + "grad_norm": 4.037940979003906, + "learning_rate": 7.217289198666594e-07, + "loss": 0.1172, + "step": 32743 + }, + { + "epoch": 0.8286054103297316, + "grad_norm": 4.41098165512085, + "learning_rate": 7.215211265015349e-07, + "loss": 0.1362, + "step": 32744 + }, + { + "epoch": 0.8286307158944252, + "grad_norm": 3.356895685195923, + "learning_rate": 7.213133607275252e-07, + "loss": 0.0871, + "step": 32745 + }, + { + "epoch": 0.8286560214591189, + "grad_norm": 4.076895713806152, + "learning_rate": 7.21105622545969e-07, + "loss": 0.1642, + "step": 32746 + }, + { + "epoch": 0.8286813270238126, + "grad_norm": 8.426464080810547, + "learning_rate": 7.208979119582072e-07, + "loss": 0.1643, + "step": 32747 + }, + { + "epoch": 0.8287066325885062, + "grad_norm": 6.518740177154541, + "learning_rate": 7.206902289655776e-07, + "loss": 0.1362, + "step": 32748 + }, + { + "epoch": 0.8287319381531999, + "grad_norm": 7.992455005645752, + "learning_rate": 7.204825735694232e-07, + "loss": 0.2279, + "step": 32749 + }, + { + "epoch": 0.8287572437178936, + "grad_norm": 13.377339363098145, + "learning_rate": 7.202749457710784e-07, + "loss": 0.2145, + "step": 32750 + }, + { + "epoch": 0.8287825492825872, + "grad_norm": 5.152218341827393, + "learning_rate": 7.200673455718854e-07, + "loss": 0.1292, + "step": 32751 + }, + { + "epoch": 0.8288078548472809, + "grad_norm": 3.5987820625305176, + "learning_rate": 7.19859772973181e-07, + "loss": 0.118, + "step": 32752 + }, + { + "epoch": 0.8288331604119746, + "grad_norm": 3.376298189163208, + "learning_rate": 7.196522279763074e-07, + "loss": 0.1153, + "step": 32753 + }, + { + "epoch": 0.8288584659766682, + "grad_norm": 3.383775472640991, + "learning_rate": 7.194447105825974e-07, + "loss": 0.1557, + "step": 32754 + }, + { + "epoch": 0.828883771541362, + "grad_norm": 8.094182968139648, + "learning_rate": 7.192372207933934e-07, + "loss": 0.2553, + "step": 32755 + }, + { + "epoch": 0.8289090771060557, + "grad_norm": 4.433096885681152, + "learning_rate": 7.190297586100314e-07, + "loss": 0.1103, + "step": 32756 + }, + { + "epoch": 0.8289343826707493, + "grad_norm": 4.324475288391113, + "learning_rate": 7.188223240338516e-07, + "loss": 0.1502, + "step": 32757 + }, + { + "epoch": 0.828959688235443, + "grad_norm": 8.726614952087402, + "learning_rate": 7.186149170661899e-07, + "loss": 0.195, + "step": 32758 + }, + { + "epoch": 0.8289849938001367, + "grad_norm": 6.015135288238525, + "learning_rate": 7.184075377083843e-07, + "loss": 0.1087, + "step": 32759 + }, + { + "epoch": 0.8290102993648303, + "grad_norm": 9.083977699279785, + "learning_rate": 7.182001859617704e-07, + "loss": 0.2474, + "step": 32760 + }, + { + "epoch": 0.829035604929524, + "grad_norm": 3.1383395195007324, + "learning_rate": 7.179928618276888e-07, + "loss": 0.1109, + "step": 32761 + }, + { + "epoch": 0.8290609104942177, + "grad_norm": 3.9755964279174805, + "learning_rate": 7.177855653074744e-07, + "loss": 0.1124, + "step": 32762 + }, + { + "epoch": 0.8290862160589113, + "grad_norm": 16.94219207763672, + "learning_rate": 7.175782964024647e-07, + "loss": 0.1736, + "step": 32763 + }, + { + "epoch": 0.829111521623605, + "grad_norm": 8.211335182189941, + "learning_rate": 7.173710551139945e-07, + "loss": 0.1013, + "step": 32764 + }, + { + "epoch": 0.8291368271882987, + "grad_norm": 7.210249900817871, + "learning_rate": 7.171638414434029e-07, + "loss": 0.1677, + "step": 32765 + }, + { + "epoch": 0.8291621327529923, + "grad_norm": 3.458792209625244, + "learning_rate": 7.169566553920255e-07, + "loss": 0.0827, + "step": 32766 + }, + { + "epoch": 0.829187438317686, + "grad_norm": 10.957992553710938, + "learning_rate": 7.167494969611971e-07, + "loss": 0.1522, + "step": 32767 + }, + { + "epoch": 0.8292127438823798, + "grad_norm": 7.066674709320068, + "learning_rate": 7.165423661522552e-07, + "loss": 0.1705, + "step": 32768 + }, + { + "epoch": 0.8292380494470735, + "grad_norm": 4.518743991851807, + "learning_rate": 7.163352629665332e-07, + "loss": 0.1565, + "step": 32769 + }, + { + "epoch": 0.8292633550117671, + "grad_norm": 3.293834924697876, + "learning_rate": 7.161281874053694e-07, + "loss": 0.1064, + "step": 32770 + }, + { + "epoch": 0.8292886605764608, + "grad_norm": 4.865928649902344, + "learning_rate": 7.159211394700982e-07, + "loss": 0.1553, + "step": 32771 + }, + { + "epoch": 0.8293139661411545, + "grad_norm": 2.8680386543273926, + "learning_rate": 7.157141191620548e-07, + "loss": 0.0612, + "step": 32772 + }, + { + "epoch": 0.8293392717058481, + "grad_norm": 3.7761032581329346, + "learning_rate": 7.155071264825724e-07, + "loss": 0.122, + "step": 32773 + }, + { + "epoch": 0.8293645772705418, + "grad_norm": 3.7136542797088623, + "learning_rate": 7.153001614329896e-07, + "loss": 0.1335, + "step": 32774 + }, + { + "epoch": 0.8293898828352355, + "grad_norm": 5.080739498138428, + "learning_rate": 7.150932240146386e-07, + "loss": 0.1791, + "step": 32775 + }, + { + "epoch": 0.8294151883999291, + "grad_norm": 5.049568176269531, + "learning_rate": 7.148863142288543e-07, + "loss": 0.1073, + "step": 32776 + }, + { + "epoch": 0.8294404939646228, + "grad_norm": 4.497770309448242, + "learning_rate": 7.146794320769695e-07, + "loss": 0.1224, + "step": 32777 + }, + { + "epoch": 0.8294657995293165, + "grad_norm": 5.681506156921387, + "learning_rate": 7.144725775603218e-07, + "loss": 0.2413, + "step": 32778 + }, + { + "epoch": 0.8294911050940101, + "grad_norm": 12.28420639038086, + "learning_rate": 7.142657506802419e-07, + "loss": 0.1549, + "step": 32779 + }, + { + "epoch": 0.8295164106587039, + "grad_norm": 4.2874755859375, + "learning_rate": 7.14058951438067e-07, + "loss": 0.167, + "step": 32780 + }, + { + "epoch": 0.8295417162233976, + "grad_norm": 4.550515651702881, + "learning_rate": 7.138521798351266e-07, + "loss": 0.0915, + "step": 32781 + }, + { + "epoch": 0.8295670217880912, + "grad_norm": 3.2507739067077637, + "learning_rate": 7.136454358727573e-07, + "loss": 0.1207, + "step": 32782 + }, + { + "epoch": 0.8295923273527849, + "grad_norm": 14.920080184936523, + "learning_rate": 7.134387195522901e-07, + "loss": 0.1421, + "step": 32783 + }, + { + "epoch": 0.8296176329174786, + "grad_norm": 4.245420932769775, + "learning_rate": 7.132320308750617e-07, + "loss": 0.0891, + "step": 32784 + }, + { + "epoch": 0.8296429384821722, + "grad_norm": 2.9895217418670654, + "learning_rate": 7.130253698424e-07, + "loss": 0.1341, + "step": 32785 + }, + { + "epoch": 0.8296682440468659, + "grad_norm": 5.104100704193115, + "learning_rate": 7.128187364556415e-07, + "loss": 0.1396, + "step": 32786 + }, + { + "epoch": 0.8296935496115596, + "grad_norm": 5.642199516296387, + "learning_rate": 7.126121307161166e-07, + "loss": 0.1985, + "step": 32787 + }, + { + "epoch": 0.8297188551762532, + "grad_norm": 4.40507698059082, + "learning_rate": 7.124055526251606e-07, + "loss": 0.1331, + "step": 32788 + }, + { + "epoch": 0.8297441607409469, + "grad_norm": 6.470493316650391, + "learning_rate": 7.121990021841014e-07, + "loss": 0.1933, + "step": 32789 + }, + { + "epoch": 0.8297694663056406, + "grad_norm": 4.755819320678711, + "learning_rate": 7.11992479394274e-07, + "loss": 0.1317, + "step": 32790 + }, + { + "epoch": 0.8297947718703342, + "grad_norm": 5.177320957183838, + "learning_rate": 7.117859842570091e-07, + "loss": 0.1727, + "step": 32791 + }, + { + "epoch": 0.829820077435028, + "grad_norm": 3.8579885959625244, + "learning_rate": 7.115795167736389e-07, + "loss": 0.0991, + "step": 32792 + }, + { + "epoch": 0.8298453829997217, + "grad_norm": 8.310078620910645, + "learning_rate": 7.113730769454952e-07, + "loss": 0.1597, + "step": 32793 + }, + { + "epoch": 0.8298706885644153, + "grad_norm": 3.2374939918518066, + "learning_rate": 7.111666647739091e-07, + "loss": 0.1177, + "step": 32794 + }, + { + "epoch": 0.829895994129109, + "grad_norm": 5.976776123046875, + "learning_rate": 7.109602802602111e-07, + "loss": 0.1238, + "step": 32795 + }, + { + "epoch": 0.8299212996938027, + "grad_norm": 6.881141662597656, + "learning_rate": 7.107539234057315e-07, + "loss": 0.2299, + "step": 32796 + }, + { + "epoch": 0.8299466052584964, + "grad_norm": 3.7087173461914062, + "learning_rate": 7.105475942118029e-07, + "loss": 0.1359, + "step": 32797 + }, + { + "epoch": 0.82997191082319, + "grad_norm": 7.11887788772583, + "learning_rate": 7.103412926797548e-07, + "loss": 0.084, + "step": 32798 + }, + { + "epoch": 0.8299972163878837, + "grad_norm": 6.634285926818848, + "learning_rate": 7.10135018810918e-07, + "loss": 0.1997, + "step": 32799 + }, + { + "epoch": 0.8300225219525774, + "grad_norm": 5.979771137237549, + "learning_rate": 7.099287726066212e-07, + "loss": 0.1755, + "step": 32800 + }, + { + "epoch": 0.830047827517271, + "grad_norm": 3.589136838912964, + "learning_rate": 7.097225540681968e-07, + "loss": 0.1105, + "step": 32801 + }, + { + "epoch": 0.8300731330819647, + "grad_norm": 8.132043838500977, + "learning_rate": 7.095163631969737e-07, + "loss": 0.2953, + "step": 32802 + }, + { + "epoch": 0.8300984386466584, + "grad_norm": 8.358922004699707, + "learning_rate": 7.093101999942814e-07, + "loss": 0.21, + "step": 32803 + }, + { + "epoch": 0.830123744211352, + "grad_norm": 4.782200813293457, + "learning_rate": 7.091040644614489e-07, + "loss": 0.1409, + "step": 32804 + }, + { + "epoch": 0.8301490497760458, + "grad_norm": 5.494961261749268, + "learning_rate": 7.088979565998067e-07, + "loss": 0.1669, + "step": 32805 + }, + { + "epoch": 0.8301743553407395, + "grad_norm": 4.0375847816467285, + "learning_rate": 7.086918764106837e-07, + "loss": 0.1646, + "step": 32806 + }, + { + "epoch": 0.8301996609054331, + "grad_norm": 3.516166925430298, + "learning_rate": 7.084858238954078e-07, + "loss": 0.1001, + "step": 32807 + }, + { + "epoch": 0.8302249664701268, + "grad_norm": 6.098161220550537, + "learning_rate": 7.082797990553081e-07, + "loss": 0.1669, + "step": 32808 + }, + { + "epoch": 0.8302502720348205, + "grad_norm": 3.494504928588867, + "learning_rate": 7.08073801891715e-07, + "loss": 0.1227, + "step": 32809 + }, + { + "epoch": 0.8302755775995141, + "grad_norm": 6.361706733703613, + "learning_rate": 7.078678324059552e-07, + "loss": 0.1889, + "step": 32810 + }, + { + "epoch": 0.8303008831642078, + "grad_norm": 5.328095436096191, + "learning_rate": 7.076618905993571e-07, + "loss": 0.1676, + "step": 32811 + }, + { + "epoch": 0.8303261887289015, + "grad_norm": 4.63118839263916, + "learning_rate": 7.074559764732486e-07, + "loss": 0.1028, + "step": 32812 + }, + { + "epoch": 0.8303514942935951, + "grad_norm": 4.3627610206604, + "learning_rate": 7.072500900289586e-07, + "loss": 0.1371, + "step": 32813 + }, + { + "epoch": 0.8303767998582888, + "grad_norm": 3.6594762802124023, + "learning_rate": 7.070442312678133e-07, + "loss": 0.1225, + "step": 32814 + }, + { + "epoch": 0.8304021054229825, + "grad_norm": 8.530472755432129, + "learning_rate": 7.068384001911433e-07, + "loss": 0.1624, + "step": 32815 + }, + { + "epoch": 0.8304274109876761, + "grad_norm": 2.5981650352478027, + "learning_rate": 7.066325968002719e-07, + "loss": 0.065, + "step": 32816 + }, + { + "epoch": 0.8304527165523699, + "grad_norm": 3.828704833984375, + "learning_rate": 7.064268210965292e-07, + "loss": 0.1473, + "step": 32817 + }, + { + "epoch": 0.8304780221170636, + "grad_norm": 7.609044075012207, + "learning_rate": 7.062210730812402e-07, + "loss": 0.1712, + "step": 32818 + }, + { + "epoch": 0.8305033276817572, + "grad_norm": 6.7799787521362305, + "learning_rate": 7.060153527557351e-07, + "loss": 0.1692, + "step": 32819 + }, + { + "epoch": 0.8305286332464509, + "grad_norm": 4.462606430053711, + "learning_rate": 7.05809660121336e-07, + "loss": 0.113, + "step": 32820 + }, + { + "epoch": 0.8305539388111446, + "grad_norm": 3.2953240871429443, + "learning_rate": 7.056039951793725e-07, + "loss": 0.1093, + "step": 32821 + }, + { + "epoch": 0.8305792443758383, + "grad_norm": 8.571351051330566, + "learning_rate": 7.053983579311691e-07, + "loss": 0.1857, + "step": 32822 + }, + { + "epoch": 0.8306045499405319, + "grad_norm": 3.06282901763916, + "learning_rate": 7.051927483780536e-07, + "loss": 0.0889, + "step": 32823 + }, + { + "epoch": 0.8306298555052256, + "grad_norm": 8.080039978027344, + "learning_rate": 7.049871665213515e-07, + "loss": 0.2322, + "step": 32824 + }, + { + "epoch": 0.8306551610699193, + "grad_norm": 6.12567663192749, + "learning_rate": 7.047816123623886e-07, + "loss": 0.1625, + "step": 32825 + }, + { + "epoch": 0.8306804666346129, + "grad_norm": 20.871700286865234, + "learning_rate": 7.045760859024897e-07, + "loss": 0.28, + "step": 32826 + }, + { + "epoch": 0.8307057721993066, + "grad_norm": 3.7691328525543213, + "learning_rate": 7.043705871429796e-07, + "loss": 0.1019, + "step": 32827 + }, + { + "epoch": 0.8307310777640003, + "grad_norm": 7.514382839202881, + "learning_rate": 7.041651160851853e-07, + "loss": 0.2459, + "step": 32828 + }, + { + "epoch": 0.830756383328694, + "grad_norm": 12.512482643127441, + "learning_rate": 7.039596727304315e-07, + "loss": 0.1661, + "step": 32829 + }, + { + "epoch": 0.8307816888933877, + "grad_norm": 5.377340793609619, + "learning_rate": 7.037542570800421e-07, + "loss": 0.1464, + "step": 32830 + }, + { + "epoch": 0.8308069944580814, + "grad_norm": 6.976626396179199, + "learning_rate": 7.03548869135342e-07, + "loss": 0.1462, + "step": 32831 + }, + { + "epoch": 0.830832300022775, + "grad_norm": 2.741286039352417, + "learning_rate": 7.033435088976565e-07, + "loss": 0.0851, + "step": 32832 + }, + { + "epoch": 0.8308576055874687, + "grad_norm": 5.295950412750244, + "learning_rate": 7.031381763683098e-07, + "loss": 0.1286, + "step": 32833 + }, + { + "epoch": 0.8308829111521624, + "grad_norm": 5.6278581619262695, + "learning_rate": 7.029328715486256e-07, + "loss": 0.137, + "step": 32834 + }, + { + "epoch": 0.830908216716856, + "grad_norm": 4.936188697814941, + "learning_rate": 7.02727594439927e-07, + "loss": 0.1369, + "step": 32835 + }, + { + "epoch": 0.8309335222815497, + "grad_norm": 5.074601650238037, + "learning_rate": 7.025223450435398e-07, + "loss": 0.1465, + "step": 32836 + }, + { + "epoch": 0.8309588278462434, + "grad_norm": 6.75889778137207, + "learning_rate": 7.023171233607867e-07, + "loss": 0.1387, + "step": 32837 + }, + { + "epoch": 0.830984133410937, + "grad_norm": 9.597851753234863, + "learning_rate": 7.021119293929912e-07, + "loss": 0.1979, + "step": 32838 + }, + { + "epoch": 0.8310094389756307, + "grad_norm": 3.600496530532837, + "learning_rate": 7.01906763141475e-07, + "loss": 0.1136, + "step": 32839 + }, + { + "epoch": 0.8310347445403244, + "grad_norm": 5.712307453155518, + "learning_rate": 7.017016246075637e-07, + "loss": 0.1706, + "step": 32840 + }, + { + "epoch": 0.831060050105018, + "grad_norm": 6.829759120941162, + "learning_rate": 7.014965137925795e-07, + "loss": 0.1221, + "step": 32841 + }, + { + "epoch": 0.8310853556697118, + "grad_norm": 3.608531951904297, + "learning_rate": 7.012914306978441e-07, + "loss": 0.1058, + "step": 32842 + }, + { + "epoch": 0.8311106612344055, + "grad_norm": 4.966806411743164, + "learning_rate": 7.0108637532468e-07, + "loss": 0.1939, + "step": 32843 + }, + { + "epoch": 0.8311359667990991, + "grad_norm": 12.14083194732666, + "learning_rate": 7.008813476744108e-07, + "loss": 0.3363, + "step": 32844 + }, + { + "epoch": 0.8311612723637928, + "grad_norm": 6.976815223693848, + "learning_rate": 7.006763477483575e-07, + "loss": 0.2277, + "step": 32845 + }, + { + "epoch": 0.8311865779284865, + "grad_norm": 5.734113693237305, + "learning_rate": 7.004713755478443e-07, + "loss": 0.1305, + "step": 32846 + }, + { + "epoch": 0.8312118834931802, + "grad_norm": 5.843276500701904, + "learning_rate": 7.002664310741897e-07, + "loss": 0.2401, + "step": 32847 + }, + { + "epoch": 0.8312371890578738, + "grad_norm": 4.615109920501709, + "learning_rate": 7.000615143287182e-07, + "loss": 0.1198, + "step": 32848 + }, + { + "epoch": 0.8312624946225675, + "grad_norm": 4.626810550689697, + "learning_rate": 6.998566253127487e-07, + "loss": 0.1499, + "step": 32849 + }, + { + "epoch": 0.8312878001872612, + "grad_norm": 8.834156036376953, + "learning_rate": 6.996517640276068e-07, + "loss": 0.1272, + "step": 32850 + }, + { + "epoch": 0.8313131057519548, + "grad_norm": 5.118104457855225, + "learning_rate": 6.994469304746076e-07, + "loss": 0.1302, + "step": 32851 + }, + { + "epoch": 0.8313384113166485, + "grad_norm": 2.3180973529815674, + "learning_rate": 6.992421246550768e-07, + "loss": 0.0749, + "step": 32852 + }, + { + "epoch": 0.8313637168813423, + "grad_norm": 7.965301036834717, + "learning_rate": 6.990373465703337e-07, + "loss": 0.1548, + "step": 32853 + }, + { + "epoch": 0.8313890224460359, + "grad_norm": 3.159734010696411, + "learning_rate": 6.988325962216974e-07, + "loss": 0.0906, + "step": 32854 + }, + { + "epoch": 0.8314143280107296, + "grad_norm": 5.083574295043945, + "learning_rate": 6.986278736104907e-07, + "loss": 0.1589, + "step": 32855 + }, + { + "epoch": 0.8314396335754233, + "grad_norm": 5.863588809967041, + "learning_rate": 6.98423178738033e-07, + "loss": 0.174, + "step": 32856 + }, + { + "epoch": 0.8314649391401169, + "grad_norm": 5.420376300811768, + "learning_rate": 6.982185116056439e-07, + "loss": 0.1223, + "step": 32857 + }, + { + "epoch": 0.8314902447048106, + "grad_norm": 2.851532220840454, + "learning_rate": 6.980138722146423e-07, + "loss": 0.134, + "step": 32858 + }, + { + "epoch": 0.8315155502695043, + "grad_norm": 6.876371383666992, + "learning_rate": 6.9780926056635e-07, + "loss": 0.1421, + "step": 32859 + }, + { + "epoch": 0.8315408558341979, + "grad_norm": 5.568724155426025, + "learning_rate": 6.976046766620853e-07, + "loss": 0.1477, + "step": 32860 + }, + { + "epoch": 0.8315661613988916, + "grad_norm": 5.517532825469971, + "learning_rate": 6.974001205031683e-07, + "loss": 0.1337, + "step": 32861 + }, + { + "epoch": 0.8315914669635853, + "grad_norm": 4.504976272583008, + "learning_rate": 6.971955920909162e-07, + "loss": 0.1553, + "step": 32862 + }, + { + "epoch": 0.8316167725282789, + "grad_norm": 7.692318439483643, + "learning_rate": 6.969910914266515e-07, + "loss": 0.109, + "step": 32863 + }, + { + "epoch": 0.8316420780929726, + "grad_norm": 6.303054332733154, + "learning_rate": 6.967866185116884e-07, + "loss": 0.1319, + "step": 32864 + }, + { + "epoch": 0.8316673836576663, + "grad_norm": 6.271784782409668, + "learning_rate": 6.965821733473493e-07, + "loss": 0.134, + "step": 32865 + }, + { + "epoch": 0.83169268922236, + "grad_norm": 4.871923923492432, + "learning_rate": 6.963777559349505e-07, + "loss": 0.0948, + "step": 32866 + }, + { + "epoch": 0.8317179947870537, + "grad_norm": 4.551386833190918, + "learning_rate": 6.961733662758119e-07, + "loss": 0.1112, + "step": 32867 + }, + { + "epoch": 0.8317433003517474, + "grad_norm": 6.199370861053467, + "learning_rate": 6.959690043712503e-07, + "loss": 0.1874, + "step": 32868 + }, + { + "epoch": 0.831768605916441, + "grad_norm": 4.327569961547852, + "learning_rate": 6.957646702225845e-07, + "loss": 0.1903, + "step": 32869 + }, + { + "epoch": 0.8317939114811347, + "grad_norm": 5.330966472625732, + "learning_rate": 6.955603638311304e-07, + "loss": 0.1328, + "step": 32870 + }, + { + "epoch": 0.8318192170458284, + "grad_norm": 2.6449825763702393, + "learning_rate": 6.953560851982077e-07, + "loss": 0.1042, + "step": 32871 + }, + { + "epoch": 0.8318445226105221, + "grad_norm": 9.328880310058594, + "learning_rate": 6.951518343251329e-07, + "loss": 0.1477, + "step": 32872 + }, + { + "epoch": 0.8318698281752157, + "grad_norm": 5.266305446624756, + "learning_rate": 6.949476112132236e-07, + "loss": 0.1093, + "step": 32873 + }, + { + "epoch": 0.8318951337399094, + "grad_norm": 11.975540161132812, + "learning_rate": 6.94743415863795e-07, + "loss": 0.2427, + "step": 32874 + }, + { + "epoch": 0.8319204393046031, + "grad_norm": 6.685951232910156, + "learning_rate": 6.94539248278166e-07, + "loss": 0.2294, + "step": 32875 + }, + { + "epoch": 0.8319457448692967, + "grad_norm": 4.126872539520264, + "learning_rate": 6.943351084576516e-07, + "loss": 0.1182, + "step": 32876 + }, + { + "epoch": 0.8319710504339904, + "grad_norm": 4.116754531860352, + "learning_rate": 6.94130996403572e-07, + "loss": 0.1456, + "step": 32877 + }, + { + "epoch": 0.8319963559986842, + "grad_norm": 4.880913257598877, + "learning_rate": 6.939269121172377e-07, + "loss": 0.1001, + "step": 32878 + }, + { + "epoch": 0.8320216615633778, + "grad_norm": 4.522476673126221, + "learning_rate": 6.937228555999692e-07, + "loss": 0.1574, + "step": 32879 + }, + { + "epoch": 0.8320469671280715, + "grad_norm": 7.5152482986450195, + "learning_rate": 6.935188268530807e-07, + "loss": 0.1709, + "step": 32880 + }, + { + "epoch": 0.8320722726927652, + "grad_norm": 4.5730156898498535, + "learning_rate": 6.933148258778888e-07, + "loss": 0.1258, + "step": 32881 + }, + { + "epoch": 0.8320975782574588, + "grad_norm": 5.367543697357178, + "learning_rate": 6.931108526757068e-07, + "loss": 0.1059, + "step": 32882 + }, + { + "epoch": 0.8321228838221525, + "grad_norm": 9.619481086730957, + "learning_rate": 6.929069072478534e-07, + "loss": 0.2521, + "step": 32883 + }, + { + "epoch": 0.8321481893868462, + "grad_norm": 5.378543853759766, + "learning_rate": 6.927029895956416e-07, + "loss": 0.131, + "step": 32884 + }, + { + "epoch": 0.8321734949515398, + "grad_norm": 5.392687797546387, + "learning_rate": 6.924990997203862e-07, + "loss": 0.1024, + "step": 32885 + }, + { + "epoch": 0.8321988005162335, + "grad_norm": 8.68095874786377, + "learning_rate": 6.922952376234038e-07, + "loss": 0.1987, + "step": 32886 + }, + { + "epoch": 0.8322241060809272, + "grad_norm": 4.686917304992676, + "learning_rate": 6.920914033060083e-07, + "loss": 0.0835, + "step": 32887 + }, + { + "epoch": 0.8322494116456208, + "grad_norm": 3.2180657386779785, + "learning_rate": 6.918875967695138e-07, + "loss": 0.1378, + "step": 32888 + }, + { + "epoch": 0.8322747172103145, + "grad_norm": 13.58864688873291, + "learning_rate": 6.916838180152341e-07, + "loss": 0.1719, + "step": 32889 + }, + { + "epoch": 0.8323000227750083, + "grad_norm": 4.195044040679932, + "learning_rate": 6.91480067044486e-07, + "loss": 0.1627, + "step": 32890 + }, + { + "epoch": 0.8323253283397019, + "grad_norm": 9.748847961425781, + "learning_rate": 6.91276343858579e-07, + "loss": 0.2282, + "step": 32891 + }, + { + "epoch": 0.8323506339043956, + "grad_norm": 9.033097267150879, + "learning_rate": 6.910726484588309e-07, + "loss": 0.203, + "step": 32892 + }, + { + "epoch": 0.8323759394690893, + "grad_norm": 5.627938270568848, + "learning_rate": 6.908689808465524e-07, + "loss": 0.1976, + "step": 32893 + }, + { + "epoch": 0.8324012450337829, + "grad_norm": 2.7116799354553223, + "learning_rate": 6.906653410230608e-07, + "loss": 0.0647, + "step": 32894 + }, + { + "epoch": 0.8324265505984766, + "grad_norm": 6.959934234619141, + "learning_rate": 6.904617289896643e-07, + "loss": 0.1404, + "step": 32895 + }, + { + "epoch": 0.8324518561631703, + "grad_norm": 4.92326545715332, + "learning_rate": 6.902581447476797e-07, + "loss": 0.1533, + "step": 32896 + }, + { + "epoch": 0.832477161727864, + "grad_norm": 3.127007484436035, + "learning_rate": 6.900545882984177e-07, + "loss": 0.12, + "step": 32897 + }, + { + "epoch": 0.8325024672925576, + "grad_norm": 5.8302083015441895, + "learning_rate": 6.898510596431929e-07, + "loss": 0.1487, + "step": 32898 + }, + { + "epoch": 0.8325277728572513, + "grad_norm": 3.974893569946289, + "learning_rate": 6.896475587833168e-07, + "loss": 0.1798, + "step": 32899 + }, + { + "epoch": 0.832553078421945, + "grad_norm": 1.4200278520584106, + "learning_rate": 6.894440857201018e-07, + "loss": 0.0475, + "step": 32900 + }, + { + "epoch": 0.8325783839866386, + "grad_norm": 6.6117682456970215, + "learning_rate": 6.892406404548591e-07, + "loss": 0.236, + "step": 32901 + }, + { + "epoch": 0.8326036895513323, + "grad_norm": 4.06826639175415, + "learning_rate": 6.890372229889025e-07, + "loss": 0.1447, + "step": 32902 + }, + { + "epoch": 0.8326289951160261, + "grad_norm": 3.7177505493164062, + "learning_rate": 6.888338333235434e-07, + "loss": 0.0852, + "step": 32903 + }, + { + "epoch": 0.8326543006807197, + "grad_norm": 5.478402614593506, + "learning_rate": 6.886304714600928e-07, + "loss": 0.1075, + "step": 32904 + }, + { + "epoch": 0.8326796062454134, + "grad_norm": 4.185826778411865, + "learning_rate": 6.884271373998608e-07, + "loss": 0.123, + "step": 32905 + }, + { + "epoch": 0.8327049118101071, + "grad_norm": 3.319944143295288, + "learning_rate": 6.882238311441619e-07, + "loss": 0.1374, + "step": 32906 + }, + { + "epoch": 0.8327302173748007, + "grad_norm": 9.288111686706543, + "learning_rate": 6.88020552694304e-07, + "loss": 0.1779, + "step": 32907 + }, + { + "epoch": 0.8327555229394944, + "grad_norm": 2.4086689949035645, + "learning_rate": 6.878173020516015e-07, + "loss": 0.0812, + "step": 32908 + }, + { + "epoch": 0.8327808285041881, + "grad_norm": 6.14495325088501, + "learning_rate": 6.876140792173614e-07, + "loss": 0.1799, + "step": 32909 + }, + { + "epoch": 0.8328061340688817, + "grad_norm": 3.464664936065674, + "learning_rate": 6.874108841928967e-07, + "loss": 0.1416, + "step": 32910 + }, + { + "epoch": 0.8328314396335754, + "grad_norm": 17.549747467041016, + "learning_rate": 6.872077169795172e-07, + "loss": 0.1723, + "step": 32911 + }, + { + "epoch": 0.8328567451982691, + "grad_norm": 2.816866874694824, + "learning_rate": 6.870045775785327e-07, + "loss": 0.1319, + "step": 32912 + }, + { + "epoch": 0.8328820507629627, + "grad_norm": 9.297528266906738, + "learning_rate": 6.86801465991252e-07, + "loss": 0.3011, + "step": 32913 + }, + { + "epoch": 0.8329073563276564, + "grad_norm": 4.7065043449401855, + "learning_rate": 6.865983822189881e-07, + "loss": 0.0966, + "step": 32914 + }, + { + "epoch": 0.8329326618923502, + "grad_norm": 6.53458309173584, + "learning_rate": 6.863953262630485e-07, + "loss": 0.1644, + "step": 32915 + }, + { + "epoch": 0.8329579674570438, + "grad_norm": 4.630284309387207, + "learning_rate": 6.861922981247432e-07, + "loss": 0.1345, + "step": 32916 + }, + { + "epoch": 0.8329832730217375, + "grad_norm": 4.271894931793213, + "learning_rate": 6.859892978053812e-07, + "loss": 0.2096, + "step": 32917 + }, + { + "epoch": 0.8330085785864312, + "grad_norm": 4.162238121032715, + "learning_rate": 6.857863253062707e-07, + "loss": 0.1416, + "step": 32918 + }, + { + "epoch": 0.8330338841511248, + "grad_norm": 12.449917793273926, + "learning_rate": 6.855833806287226e-07, + "loss": 0.1584, + "step": 32919 + }, + { + "epoch": 0.8330591897158185, + "grad_norm": 6.116547107696533, + "learning_rate": 6.853804637740441e-07, + "loss": 0.2374, + "step": 32920 + }, + { + "epoch": 0.8330844952805122, + "grad_norm": 6.625833034515381, + "learning_rate": 6.851775747435463e-07, + "loss": 0.1283, + "step": 32921 + }, + { + "epoch": 0.8331098008452058, + "grad_norm": 4.178149223327637, + "learning_rate": 6.849747135385331e-07, + "loss": 0.173, + "step": 32922 + }, + { + "epoch": 0.8331351064098995, + "grad_norm": 2.15458607673645, + "learning_rate": 6.84771880160317e-07, + "loss": 0.0538, + "step": 32923 + }, + { + "epoch": 0.8331604119745932, + "grad_norm": 3.8010919094085693, + "learning_rate": 6.845690746102035e-07, + "loss": 0.0901, + "step": 32924 + }, + { + "epoch": 0.8331857175392869, + "grad_norm": 3.5308380126953125, + "learning_rate": 6.843662968895032e-07, + "loss": 0.1736, + "step": 32925 + }, + { + "epoch": 0.8332110231039805, + "grad_norm": 7.7541327476501465, + "learning_rate": 6.841635469995195e-07, + "loss": 0.1198, + "step": 32926 + }, + { + "epoch": 0.8332363286686743, + "grad_norm": 5.281675815582275, + "learning_rate": 6.83960824941563e-07, + "loss": 0.1544, + "step": 32927 + }, + { + "epoch": 0.833261634233368, + "grad_norm": 5.629590034484863, + "learning_rate": 6.8375813071694e-07, + "loss": 0.1348, + "step": 32928 + }, + { + "epoch": 0.8332869397980616, + "grad_norm": 3.275836944580078, + "learning_rate": 6.835554643269588e-07, + "loss": 0.1216, + "step": 32929 + }, + { + "epoch": 0.8333122453627553, + "grad_norm": 2.9852466583251953, + "learning_rate": 6.833528257729249e-07, + "loss": 0.0764, + "step": 32930 + }, + { + "epoch": 0.833337550927449, + "grad_norm": 3.2057693004608154, + "learning_rate": 6.831502150561464e-07, + "loss": 0.1523, + "step": 32931 + }, + { + "epoch": 0.8333628564921426, + "grad_norm": 5.629481315612793, + "learning_rate": 6.829476321779277e-07, + "loss": 0.1623, + "step": 32932 + }, + { + "epoch": 0.8333881620568363, + "grad_norm": 3.017190456390381, + "learning_rate": 6.827450771395783e-07, + "loss": 0.1076, + "step": 32933 + }, + { + "epoch": 0.83341346762153, + "grad_norm": 2.6904518604278564, + "learning_rate": 6.825425499424021e-07, + "loss": 0.0836, + "step": 32934 + }, + { + "epoch": 0.8334387731862236, + "grad_norm": 8.012834548950195, + "learning_rate": 6.823400505877064e-07, + "loss": 0.3481, + "step": 32935 + }, + { + "epoch": 0.8334640787509173, + "grad_norm": 3.781895637512207, + "learning_rate": 6.821375790767953e-07, + "loss": 0.1234, + "step": 32936 + }, + { + "epoch": 0.833489384315611, + "grad_norm": 3.443066358566284, + "learning_rate": 6.819351354109765e-07, + "loss": 0.1336, + "step": 32937 + }, + { + "epoch": 0.8335146898803046, + "grad_norm": 4.764944076538086, + "learning_rate": 6.817327195915552e-07, + "loss": 0.1627, + "step": 32938 + }, + { + "epoch": 0.8335399954449983, + "grad_norm": 9.33891487121582, + "learning_rate": 6.815303316198362e-07, + "loss": 0.246, + "step": 32939 + }, + { + "epoch": 0.8335653010096921, + "grad_norm": 4.73783540725708, + "learning_rate": 6.813279714971238e-07, + "loss": 0.1005, + "step": 32940 + }, + { + "epoch": 0.8335906065743857, + "grad_norm": 3.259537696838379, + "learning_rate": 6.811256392247251e-07, + "loss": 0.1234, + "step": 32941 + }, + { + "epoch": 0.8336159121390794, + "grad_norm": 5.5460734367370605, + "learning_rate": 6.80923334803944e-07, + "loss": 0.1591, + "step": 32942 + }, + { + "epoch": 0.8336412177037731, + "grad_norm": 4.4389119148254395, + "learning_rate": 6.807210582360845e-07, + "loss": 0.1146, + "step": 32943 + }, + { + "epoch": 0.8336665232684667, + "grad_norm": 3.316047430038452, + "learning_rate": 6.805188095224508e-07, + "loss": 0.1249, + "step": 32944 + }, + { + "epoch": 0.8336918288331604, + "grad_norm": 9.499932289123535, + "learning_rate": 6.803165886643482e-07, + "loss": 0.3562, + "step": 32945 + }, + { + "epoch": 0.8337171343978541, + "grad_norm": 7.10349702835083, + "learning_rate": 6.801143956630812e-07, + "loss": 0.1774, + "step": 32946 + }, + { + "epoch": 0.8337424399625477, + "grad_norm": 2.46881103515625, + "learning_rate": 6.799122305199524e-07, + "loss": 0.0802, + "step": 32947 + }, + { + "epoch": 0.8337677455272414, + "grad_norm": 4.137027263641357, + "learning_rate": 6.79710093236266e-07, + "loss": 0.122, + "step": 32948 + }, + { + "epoch": 0.8337930510919351, + "grad_norm": 5.394595146179199, + "learning_rate": 6.795079838133245e-07, + "loss": 0.1813, + "step": 32949 + }, + { + "epoch": 0.8338183566566288, + "grad_norm": 3.295806884765625, + "learning_rate": 6.793059022524334e-07, + "loss": 0.1279, + "step": 32950 + }, + { + "epoch": 0.8338436622213224, + "grad_norm": 10.646050453186035, + "learning_rate": 6.79103848554894e-07, + "loss": 0.1733, + "step": 32951 + }, + { + "epoch": 0.8338689677860162, + "grad_norm": 4.187431335449219, + "learning_rate": 6.789018227220123e-07, + "loss": 0.1638, + "step": 32952 + }, + { + "epoch": 0.8338942733507099, + "grad_norm": 11.313720703125, + "learning_rate": 6.786998247550868e-07, + "loss": 0.1982, + "step": 32953 + }, + { + "epoch": 0.8339195789154035, + "grad_norm": 3.282604932785034, + "learning_rate": 6.784978546554233e-07, + "loss": 0.1347, + "step": 32954 + }, + { + "epoch": 0.8339448844800972, + "grad_norm": 5.346893787384033, + "learning_rate": 6.782959124243221e-07, + "loss": 0.1239, + "step": 32955 + }, + { + "epoch": 0.8339701900447909, + "grad_norm": 3.417731523513794, + "learning_rate": 6.780939980630886e-07, + "loss": 0.1214, + "step": 32956 + }, + { + "epoch": 0.8339954956094845, + "grad_norm": 6.604846477508545, + "learning_rate": 6.778921115730214e-07, + "loss": 0.1104, + "step": 32957 + }, + { + "epoch": 0.8340208011741782, + "grad_norm": 7.047262191772461, + "learning_rate": 6.776902529554247e-07, + "loss": 0.1983, + "step": 32958 + }, + { + "epoch": 0.8340461067388719, + "grad_norm": 5.714243412017822, + "learning_rate": 6.774884222115985e-07, + "loss": 0.1358, + "step": 32959 + }, + { + "epoch": 0.8340714123035655, + "grad_norm": 31.10926628112793, + "learning_rate": 6.772866193428462e-07, + "loss": 0.2505, + "step": 32960 + }, + { + "epoch": 0.8340967178682592, + "grad_norm": 3.0488929748535156, + "learning_rate": 6.770848443504691e-07, + "loss": 0.1174, + "step": 32961 + }, + { + "epoch": 0.8341220234329529, + "grad_norm": 4.72215461730957, + "learning_rate": 6.768830972357671e-07, + "loss": 0.1576, + "step": 32962 + }, + { + "epoch": 0.8341473289976465, + "grad_norm": 4.5507025718688965, + "learning_rate": 6.766813780000409e-07, + "loss": 0.1108, + "step": 32963 + }, + { + "epoch": 0.8341726345623403, + "grad_norm": 10.679557800292969, + "learning_rate": 6.764796866445939e-07, + "loss": 0.1836, + "step": 32964 + }, + { + "epoch": 0.834197940127034, + "grad_norm": 4.299063682556152, + "learning_rate": 6.762780231707245e-07, + "loss": 0.1746, + "step": 32965 + }, + { + "epoch": 0.8342232456917276, + "grad_norm": 8.082483291625977, + "learning_rate": 6.760763875797344e-07, + "loss": 0.184, + "step": 32966 + }, + { + "epoch": 0.8342485512564213, + "grad_norm": 4.717154502868652, + "learning_rate": 6.758747798729215e-07, + "loss": 0.2182, + "step": 32967 + }, + { + "epoch": 0.834273856821115, + "grad_norm": 6.097846031188965, + "learning_rate": 6.756732000515897e-07, + "loss": 0.1742, + "step": 32968 + }, + { + "epoch": 0.8342991623858086, + "grad_norm": 6.041519641876221, + "learning_rate": 6.754716481170364e-07, + "loss": 0.1084, + "step": 32969 + }, + { + "epoch": 0.8343244679505023, + "grad_norm": 3.914154529571533, + "learning_rate": 6.752701240705622e-07, + "loss": 0.1234, + "step": 32970 + }, + { + "epoch": 0.834349773515196, + "grad_norm": 5.3086018562316895, + "learning_rate": 6.750686279134661e-07, + "loss": 0.1934, + "step": 32971 + }, + { + "epoch": 0.8343750790798896, + "grad_norm": 4.167913913726807, + "learning_rate": 6.74867159647048e-07, + "loss": 0.1044, + "step": 32972 + }, + { + "epoch": 0.8344003846445833, + "grad_norm": 2.753296136856079, + "learning_rate": 6.746657192726075e-07, + "loss": 0.0814, + "step": 32973 + }, + { + "epoch": 0.834425690209277, + "grad_norm": 5.54371452331543, + "learning_rate": 6.744643067914436e-07, + "loss": 0.2002, + "step": 32974 + }, + { + "epoch": 0.8344509957739707, + "grad_norm": 8.790234565734863, + "learning_rate": 6.742629222048541e-07, + "loss": 0.2581, + "step": 32975 + }, + { + "epoch": 0.8344763013386644, + "grad_norm": 13.4249849319458, + "learning_rate": 6.740615655141375e-07, + "loss": 0.163, + "step": 32976 + }, + { + "epoch": 0.8345016069033581, + "grad_norm": 4.91820764541626, + "learning_rate": 6.738602367205943e-07, + "loss": 0.179, + "step": 32977 + }, + { + "epoch": 0.8345269124680518, + "grad_norm": 8.09768009185791, + "learning_rate": 6.736589358255219e-07, + "loss": 0.1731, + "step": 32978 + }, + { + "epoch": 0.8345522180327454, + "grad_norm": 3.2329049110412598, + "learning_rate": 6.734576628302181e-07, + "loss": 0.1315, + "step": 32979 + }, + { + "epoch": 0.8345775235974391, + "grad_norm": 5.405261516571045, + "learning_rate": 6.732564177359802e-07, + "loss": 0.143, + "step": 32980 + }, + { + "epoch": 0.8346028291621328, + "grad_norm": 4.682194709777832, + "learning_rate": 6.730552005441077e-07, + "loss": 0.124, + "step": 32981 + }, + { + "epoch": 0.8346281347268264, + "grad_norm": 4.011557579040527, + "learning_rate": 6.728540112558956e-07, + "loss": 0.1542, + "step": 32982 + }, + { + "epoch": 0.8346534402915201, + "grad_norm": 10.604634284973145, + "learning_rate": 6.726528498726465e-07, + "loss": 0.207, + "step": 32983 + }, + { + "epoch": 0.8346787458562138, + "grad_norm": 8.640320777893066, + "learning_rate": 6.724517163956512e-07, + "loss": 0.2533, + "step": 32984 + }, + { + "epoch": 0.8347040514209074, + "grad_norm": 2.9108152389526367, + "learning_rate": 6.722506108262112e-07, + "loss": 0.0901, + "step": 32985 + }, + { + "epoch": 0.8347293569856011, + "grad_norm": 4.295221328735352, + "learning_rate": 6.720495331656208e-07, + "loss": 0.1713, + "step": 32986 + }, + { + "epoch": 0.8347546625502948, + "grad_norm": 6.469956874847412, + "learning_rate": 6.718484834151801e-07, + "loss": 0.1716, + "step": 32987 + }, + { + "epoch": 0.8347799681149884, + "grad_norm": 4.699426651000977, + "learning_rate": 6.716474615761809e-07, + "loss": 0.1722, + "step": 32988 + }, + { + "epoch": 0.8348052736796822, + "grad_norm": 4.582459926605225, + "learning_rate": 6.714464676499232e-07, + "loss": 0.1261, + "step": 32989 + }, + { + "epoch": 0.8348305792443759, + "grad_norm": 2.946300506591797, + "learning_rate": 6.712455016377018e-07, + "loss": 0.0943, + "step": 32990 + }, + { + "epoch": 0.8348558848090695, + "grad_norm": 9.861358642578125, + "learning_rate": 6.710445635408141e-07, + "loss": 0.231, + "step": 32991 + }, + { + "epoch": 0.8348811903737632, + "grad_norm": 7.210701942443848, + "learning_rate": 6.708436533605534e-07, + "loss": 0.1882, + "step": 32992 + }, + { + "epoch": 0.8349064959384569, + "grad_norm": 3.185763359069824, + "learning_rate": 6.706427710982172e-07, + "loss": 0.0773, + "step": 32993 + }, + { + "epoch": 0.8349318015031505, + "grad_norm": 11.435039520263672, + "learning_rate": 6.704419167550997e-07, + "loss": 0.2175, + "step": 32994 + }, + { + "epoch": 0.8349571070678442, + "grad_norm": 5.594193458557129, + "learning_rate": 6.70241090332498e-07, + "loss": 0.104, + "step": 32995 + }, + { + "epoch": 0.8349824126325379, + "grad_norm": 6.440280437469482, + "learning_rate": 6.700402918317062e-07, + "loss": 0.177, + "step": 32996 + }, + { + "epoch": 0.8350077181972315, + "grad_norm": 4.59022855758667, + "learning_rate": 6.698395212540188e-07, + "loss": 0.1623, + "step": 32997 + }, + { + "epoch": 0.8350330237619252, + "grad_norm": 6.333189010620117, + "learning_rate": 6.696387786007297e-07, + "loss": 0.1453, + "step": 32998 + }, + { + "epoch": 0.8350583293266189, + "grad_norm": 5.417144298553467, + "learning_rate": 6.694380638731363e-07, + "loss": 0.1277, + "step": 32999 + }, + { + "epoch": 0.8350836348913127, + "grad_norm": 7.841486930847168, + "learning_rate": 6.692373770725307e-07, + "loss": 0.1574, + "step": 33000 + }, + { + "epoch": 0.8351089404560063, + "grad_norm": 3.8866472244262695, + "learning_rate": 6.690367182002077e-07, + "loss": 0.1094, + "step": 33001 + }, + { + "epoch": 0.8351342460207, + "grad_norm": 5.776155471801758, + "learning_rate": 6.688360872574617e-07, + "loss": 0.1474, + "step": 33002 + }, + { + "epoch": 0.8351595515853937, + "grad_norm": 2.384021520614624, + "learning_rate": 6.68635484245585e-07, + "loss": 0.0987, + "step": 33003 + }, + { + "epoch": 0.8351848571500873, + "grad_norm": 4.593901634216309, + "learning_rate": 6.684349091658732e-07, + "loss": 0.1119, + "step": 33004 + }, + { + "epoch": 0.835210162714781, + "grad_norm": 6.789805889129639, + "learning_rate": 6.682343620196186e-07, + "loss": 0.2014, + "step": 33005 + }, + { + "epoch": 0.8352354682794747, + "grad_norm": 7.841700553894043, + "learning_rate": 6.680338428081156e-07, + "loss": 0.1806, + "step": 33006 + }, + { + "epoch": 0.8352607738441683, + "grad_norm": 5.7553253173828125, + "learning_rate": 6.678333515326546e-07, + "loss": 0.1734, + "step": 33007 + }, + { + "epoch": 0.835286079408862, + "grad_norm": 5.913937568664551, + "learning_rate": 6.676328881945321e-07, + "loss": 0.1629, + "step": 33008 + }, + { + "epoch": 0.8353113849735557, + "grad_norm": 2.879664659500122, + "learning_rate": 6.674324527950394e-07, + "loss": 0.1442, + "step": 33009 + }, + { + "epoch": 0.8353366905382493, + "grad_norm": 3.5168042182922363, + "learning_rate": 6.672320453354686e-07, + "loss": 0.1181, + "step": 33010 + }, + { + "epoch": 0.835361996102943, + "grad_norm": 3.1969456672668457, + "learning_rate": 6.670316658171111e-07, + "loss": 0.1317, + "step": 33011 + }, + { + "epoch": 0.8353873016676368, + "grad_norm": 7.619912147521973, + "learning_rate": 6.668313142412619e-07, + "loss": 0.1606, + "step": 33012 + }, + { + "epoch": 0.8354126072323304, + "grad_norm": 9.146563529968262, + "learning_rate": 6.6663099060921e-07, + "loss": 0.2751, + "step": 33013 + }, + { + "epoch": 0.8354379127970241, + "grad_norm": 8.034692764282227, + "learning_rate": 6.664306949222515e-07, + "loss": 0.2023, + "step": 33014 + }, + { + "epoch": 0.8354632183617178, + "grad_norm": 4.058093547821045, + "learning_rate": 6.662304271816727e-07, + "loss": 0.1385, + "step": 33015 + }, + { + "epoch": 0.8354885239264114, + "grad_norm": 4.5027289390563965, + "learning_rate": 6.660301873887693e-07, + "loss": 0.1267, + "step": 33016 + }, + { + "epoch": 0.8355138294911051, + "grad_norm": 6.459183216094971, + "learning_rate": 6.658299755448294e-07, + "loss": 0.1542, + "step": 33017 + }, + { + "epoch": 0.8355391350557988, + "grad_norm": 6.889886379241943, + "learning_rate": 6.656297916511484e-07, + "loss": 0.1683, + "step": 33018 + }, + { + "epoch": 0.8355644406204924, + "grad_norm": 3.8124277591705322, + "learning_rate": 6.654296357090117e-07, + "loss": 0.127, + "step": 33019 + }, + { + "epoch": 0.8355897461851861, + "grad_norm": 9.255681991577148, + "learning_rate": 6.652295077197146e-07, + "loss": 0.2163, + "step": 33020 + }, + { + "epoch": 0.8356150517498798, + "grad_norm": 3.898564100265503, + "learning_rate": 6.650294076845454e-07, + "loss": 0.1668, + "step": 33021 + }, + { + "epoch": 0.8356403573145734, + "grad_norm": 4.121057033538818, + "learning_rate": 6.648293356047969e-07, + "loss": 0.1357, + "step": 33022 + }, + { + "epoch": 0.8356656628792671, + "grad_norm": 4.958065986633301, + "learning_rate": 6.646292914817554e-07, + "loss": 0.2233, + "step": 33023 + }, + { + "epoch": 0.8356909684439608, + "grad_norm": 2.010777473449707, + "learning_rate": 6.644292753167142e-07, + "loss": 0.0688, + "step": 33024 + }, + { + "epoch": 0.8357162740086546, + "grad_norm": 9.077250480651855, + "learning_rate": 6.642292871109613e-07, + "loss": 0.1675, + "step": 33025 + }, + { + "epoch": 0.8357415795733482, + "grad_norm": 3.1775643825531006, + "learning_rate": 6.640293268657883e-07, + "loss": 0.1218, + "step": 33026 + }, + { + "epoch": 0.8357668851380419, + "grad_norm": 9.199827194213867, + "learning_rate": 6.638293945824831e-07, + "loss": 0.142, + "step": 33027 + }, + { + "epoch": 0.8357921907027356, + "grad_norm": 5.542570114135742, + "learning_rate": 6.636294902623358e-07, + "loss": 0.1544, + "step": 33028 + }, + { + "epoch": 0.8358174962674292, + "grad_norm": 8.233210563659668, + "learning_rate": 6.634296139066349e-07, + "loss": 0.1819, + "step": 33029 + }, + { + "epoch": 0.8358428018321229, + "grad_norm": 3.6090104579925537, + "learning_rate": 6.632297655166697e-07, + "loss": 0.1213, + "step": 33030 + }, + { + "epoch": 0.8358681073968166, + "grad_norm": 6.58845853805542, + "learning_rate": 6.6302994509373e-07, + "loss": 0.1672, + "step": 33031 + }, + { + "epoch": 0.8358934129615102, + "grad_norm": 4.372124671936035, + "learning_rate": 6.628301526391029e-07, + "loss": 0.1029, + "step": 33032 + }, + { + "epoch": 0.8359187185262039, + "grad_norm": 12.15670108795166, + "learning_rate": 6.626303881540774e-07, + "loss": 0.24, + "step": 33033 + }, + { + "epoch": 0.8359440240908976, + "grad_norm": 8.297283172607422, + "learning_rate": 6.624306516399409e-07, + "loss": 0.1541, + "step": 33034 + }, + { + "epoch": 0.8359693296555912, + "grad_norm": 6.066819667816162, + "learning_rate": 6.622309430979834e-07, + "loss": 0.2022, + "step": 33035 + }, + { + "epoch": 0.8359946352202849, + "grad_norm": 2.919834613800049, + "learning_rate": 6.620312625294911e-07, + "loss": 0.125, + "step": 33036 + }, + { + "epoch": 0.8360199407849787, + "grad_norm": 4.387702941894531, + "learning_rate": 6.61831609935753e-07, + "loss": 0.1265, + "step": 33037 + }, + { + "epoch": 0.8360452463496723, + "grad_norm": 6.66472053527832, + "learning_rate": 6.616319853180541e-07, + "loss": 0.1533, + "step": 33038 + }, + { + "epoch": 0.836070551914366, + "grad_norm": 4.013862609863281, + "learning_rate": 6.614323886776852e-07, + "loss": 0.1502, + "step": 33039 + }, + { + "epoch": 0.8360958574790597, + "grad_norm": 3.6938557624816895, + "learning_rate": 6.612328200159313e-07, + "loss": 0.1303, + "step": 33040 + }, + { + "epoch": 0.8361211630437533, + "grad_norm": 4.375833511352539, + "learning_rate": 6.610332793340801e-07, + "loss": 0.1516, + "step": 33041 + }, + { + "epoch": 0.836146468608447, + "grad_norm": 2.959744691848755, + "learning_rate": 6.608337666334175e-07, + "loss": 0.1042, + "step": 33042 + }, + { + "epoch": 0.8361717741731407, + "grad_norm": 3.90958833694458, + "learning_rate": 6.606342819152312e-07, + "loss": 0.125, + "step": 33043 + }, + { + "epoch": 0.8361970797378343, + "grad_norm": 7.149258613586426, + "learning_rate": 6.604348251808079e-07, + "loss": 0.1589, + "step": 33044 + }, + { + "epoch": 0.836222385302528, + "grad_norm": 9.803492546081543, + "learning_rate": 6.602353964314323e-07, + "loss": 0.181, + "step": 33045 + }, + { + "epoch": 0.8362476908672217, + "grad_norm": 5.3762383460998535, + "learning_rate": 6.60035995668391e-07, + "loss": 0.1591, + "step": 33046 + }, + { + "epoch": 0.8362729964319153, + "grad_norm": 6.389449596405029, + "learning_rate": 6.598366228929714e-07, + "loss": 0.2062, + "step": 33047 + }, + { + "epoch": 0.836298301996609, + "grad_norm": 8.721358299255371, + "learning_rate": 6.596372781064564e-07, + "loss": 0.1754, + "step": 33048 + }, + { + "epoch": 0.8363236075613028, + "grad_norm": 12.288134574890137, + "learning_rate": 6.59437961310136e-07, + "loss": 0.1726, + "step": 33049 + }, + { + "epoch": 0.8363489131259964, + "grad_norm": 4.257725238800049, + "learning_rate": 6.592386725052902e-07, + "loss": 0.123, + "step": 33050 + }, + { + "epoch": 0.8363742186906901, + "grad_norm": 2.6935715675354004, + "learning_rate": 6.59039411693208e-07, + "loss": 0.1235, + "step": 33051 + }, + { + "epoch": 0.8363995242553838, + "grad_norm": 8.702033996582031, + "learning_rate": 6.588401788751719e-07, + "loss": 0.2843, + "step": 33052 + }, + { + "epoch": 0.8364248298200775, + "grad_norm": 8.277567863464355, + "learning_rate": 6.586409740524707e-07, + "loss": 0.1986, + "step": 33053 + }, + { + "epoch": 0.8364501353847711, + "grad_norm": 3.199392080307007, + "learning_rate": 6.584417972263834e-07, + "loss": 0.1448, + "step": 33054 + }, + { + "epoch": 0.8364754409494648, + "grad_norm": 3.5737991333007812, + "learning_rate": 6.582426483981991e-07, + "loss": 0.1207, + "step": 33055 + }, + { + "epoch": 0.8365007465141585, + "grad_norm": 7.8635640144348145, + "learning_rate": 6.580435275691987e-07, + "loss": 0.2005, + "step": 33056 + }, + { + "epoch": 0.8365260520788521, + "grad_norm": 10.354738235473633, + "learning_rate": 6.578444347406692e-07, + "loss": 0.2002, + "step": 33057 + }, + { + "epoch": 0.8365513576435458, + "grad_norm": 7.739030838012695, + "learning_rate": 6.576453699138935e-07, + "loss": 0.1668, + "step": 33058 + }, + { + "epoch": 0.8365766632082395, + "grad_norm": 8.615966796875, + "learning_rate": 6.574463330901548e-07, + "loss": 0.1711, + "step": 33059 + }, + { + "epoch": 0.8366019687729331, + "grad_norm": 3.2708442211151123, + "learning_rate": 6.572473242707367e-07, + "loss": 0.1067, + "step": 33060 + }, + { + "epoch": 0.8366272743376268, + "grad_norm": 4.1561994552612305, + "learning_rate": 6.570483434569219e-07, + "loss": 0.1055, + "step": 33061 + }, + { + "epoch": 0.8366525799023206, + "grad_norm": 9.827592849731445, + "learning_rate": 6.568493906499956e-07, + "loss": 0.2381, + "step": 33062 + }, + { + "epoch": 0.8366778854670142, + "grad_norm": 12.025432586669922, + "learning_rate": 6.566504658512401e-07, + "loss": 0.1961, + "step": 33063 + }, + { + "epoch": 0.8367031910317079, + "grad_norm": 6.841067790985107, + "learning_rate": 6.564515690619372e-07, + "loss": 0.1632, + "step": 33064 + }, + { + "epoch": 0.8367284965964016, + "grad_norm": 5.931737899780273, + "learning_rate": 6.562527002833696e-07, + "loss": 0.1701, + "step": 33065 + }, + { + "epoch": 0.8367538021610952, + "grad_norm": 4.205190658569336, + "learning_rate": 6.560538595168208e-07, + "loss": 0.1462, + "step": 33066 + }, + { + "epoch": 0.8367791077257889, + "grad_norm": 7.494565486907959, + "learning_rate": 6.558550467635732e-07, + "loss": 0.2293, + "step": 33067 + }, + { + "epoch": 0.8368044132904826, + "grad_norm": 6.056880950927734, + "learning_rate": 6.556562620249079e-07, + "loss": 0.1627, + "step": 33068 + }, + { + "epoch": 0.8368297188551762, + "grad_norm": 6.519411563873291, + "learning_rate": 6.554575053021067e-07, + "loss": 0.1963, + "step": 33069 + }, + { + "epoch": 0.8368550244198699, + "grad_norm": 13.549803733825684, + "learning_rate": 6.552587765964524e-07, + "loss": 0.1936, + "step": 33070 + }, + { + "epoch": 0.8368803299845636, + "grad_norm": 5.235801696777344, + "learning_rate": 6.550600759092262e-07, + "loss": 0.1923, + "step": 33071 + }, + { + "epoch": 0.8369056355492572, + "grad_norm": 8.338471412658691, + "learning_rate": 6.5486140324171e-07, + "loss": 0.2786, + "step": 33072 + }, + { + "epoch": 0.8369309411139509, + "grad_norm": 15.1234130859375, + "learning_rate": 6.546627585951826e-07, + "loss": 0.157, + "step": 33073 + }, + { + "epoch": 0.8369562466786447, + "grad_norm": 4.661815166473389, + "learning_rate": 6.54464141970928e-07, + "loss": 0.1804, + "step": 33074 + }, + { + "epoch": 0.8369815522433383, + "grad_norm": 6.213162422180176, + "learning_rate": 6.542655533702258e-07, + "loss": 0.2431, + "step": 33075 + }, + { + "epoch": 0.837006857808032, + "grad_norm": 5.232974052429199, + "learning_rate": 6.540669927943566e-07, + "loss": 0.2006, + "step": 33076 + }, + { + "epoch": 0.8370321633727257, + "grad_norm": 4.698724269866943, + "learning_rate": 6.538684602446e-07, + "loss": 0.1638, + "step": 33077 + }, + { + "epoch": 0.8370574689374194, + "grad_norm": 4.123678207397461, + "learning_rate": 6.536699557222382e-07, + "loss": 0.1246, + "step": 33078 + }, + { + "epoch": 0.837082774502113, + "grad_norm": 2.3374831676483154, + "learning_rate": 6.534714792285496e-07, + "loss": 0.067, + "step": 33079 + }, + { + "epoch": 0.8371080800668067, + "grad_norm": 5.710273265838623, + "learning_rate": 6.532730307648166e-07, + "loss": 0.1577, + "step": 33080 + }, + { + "epoch": 0.8371333856315004, + "grad_norm": 5.750087261199951, + "learning_rate": 6.530746103323154e-07, + "loss": 0.1794, + "step": 33081 + }, + { + "epoch": 0.837158691196194, + "grad_norm": 5.7635602951049805, + "learning_rate": 6.528762179323289e-07, + "loss": 0.2311, + "step": 33082 + }, + { + "epoch": 0.8371839967608877, + "grad_norm": 4.044984817504883, + "learning_rate": 6.526778535661332e-07, + "loss": 0.1449, + "step": 33083 + }, + { + "epoch": 0.8372093023255814, + "grad_norm": 4.018775939941406, + "learning_rate": 6.524795172350124e-07, + "loss": 0.1352, + "step": 33084 + }, + { + "epoch": 0.837234607890275, + "grad_norm": 10.92078971862793, + "learning_rate": 6.522812089402397e-07, + "loss": 0.2244, + "step": 33085 + }, + { + "epoch": 0.8372599134549688, + "grad_norm": 7.3842902183532715, + "learning_rate": 6.520829286830982e-07, + "loss": 0.1924, + "step": 33086 + }, + { + "epoch": 0.8372852190196625, + "grad_norm": 11.280930519104004, + "learning_rate": 6.518846764648646e-07, + "loss": 0.1975, + "step": 33087 + }, + { + "epoch": 0.8373105245843561, + "grad_norm": 4.231365203857422, + "learning_rate": 6.516864522868171e-07, + "loss": 0.1332, + "step": 33088 + }, + { + "epoch": 0.8373358301490498, + "grad_norm": 3.9466304779052734, + "learning_rate": 6.514882561502361e-07, + "loss": 0.0799, + "step": 33089 + }, + { + "epoch": 0.8373611357137435, + "grad_norm": 4.263354301452637, + "learning_rate": 6.512900880563983e-07, + "loss": 0.1402, + "step": 33090 + }, + { + "epoch": 0.8373864412784371, + "grad_norm": 7.40064811706543, + "learning_rate": 6.510919480065814e-07, + "loss": 0.1305, + "step": 33091 + }, + { + "epoch": 0.8374117468431308, + "grad_norm": 9.192695617675781, + "learning_rate": 6.508938360020628e-07, + "loss": 0.2426, + "step": 33092 + }, + { + "epoch": 0.8374370524078245, + "grad_norm": 10.393148422241211, + "learning_rate": 6.506957520441221e-07, + "loss": 0.1311, + "step": 33093 + }, + { + "epoch": 0.8374623579725181, + "grad_norm": 5.267917156219482, + "learning_rate": 6.504976961340353e-07, + "loss": 0.1441, + "step": 33094 + }, + { + "epoch": 0.8374876635372118, + "grad_norm": 7.0509033203125, + "learning_rate": 6.5029966827308e-07, + "loss": 0.2129, + "step": 33095 + }, + { + "epoch": 0.8375129691019055, + "grad_norm": 3.684971332550049, + "learning_rate": 6.501016684625316e-07, + "loss": 0.1075, + "step": 33096 + }, + { + "epoch": 0.8375382746665991, + "grad_norm": 4.2852864265441895, + "learning_rate": 6.499036967036709e-07, + "loss": 0.0866, + "step": 33097 + }, + { + "epoch": 0.8375635802312928, + "grad_norm": 2.9793925285339355, + "learning_rate": 6.497057529977696e-07, + "loss": 0.1174, + "step": 33098 + }, + { + "epoch": 0.8375888857959866, + "grad_norm": 3.79841947555542, + "learning_rate": 6.495078373461083e-07, + "loss": 0.135, + "step": 33099 + }, + { + "epoch": 0.8376141913606802, + "grad_norm": 2.8000359535217285, + "learning_rate": 6.4930994974996e-07, + "loss": 0.1243, + "step": 33100 + }, + { + "epoch": 0.8376394969253739, + "grad_norm": 6.6760663986206055, + "learning_rate": 6.49112090210604e-07, + "loss": 0.2158, + "step": 33101 + }, + { + "epoch": 0.8376648024900676, + "grad_norm": 5.783229351043701, + "learning_rate": 6.489142587293145e-07, + "loss": 0.1634, + "step": 33102 + }, + { + "epoch": 0.8376901080547613, + "grad_norm": 3.6322295665740967, + "learning_rate": 6.487164553073683e-07, + "loss": 0.1217, + "step": 33103 + }, + { + "epoch": 0.8377154136194549, + "grad_norm": 3.022542953491211, + "learning_rate": 6.485186799460386e-07, + "loss": 0.1084, + "step": 33104 + }, + { + "epoch": 0.8377407191841486, + "grad_norm": 4.7384185791015625, + "learning_rate": 6.483209326466034e-07, + "loss": 0.1295, + "step": 33105 + }, + { + "epoch": 0.8377660247488423, + "grad_norm": 5.238552093505859, + "learning_rate": 6.481232134103371e-07, + "loss": 0.2062, + "step": 33106 + }, + { + "epoch": 0.8377913303135359, + "grad_norm": 6.325050354003906, + "learning_rate": 6.47925522238515e-07, + "loss": 0.2324, + "step": 33107 + }, + { + "epoch": 0.8378166358782296, + "grad_norm": 3.330549478530884, + "learning_rate": 6.477278591324104e-07, + "loss": 0.1145, + "step": 33108 + }, + { + "epoch": 0.8378419414429233, + "grad_norm": 4.744285583496094, + "learning_rate": 6.475302240933007e-07, + "loss": 0.1405, + "step": 33109 + }, + { + "epoch": 0.8378672470076169, + "grad_norm": 7.2832770347595215, + "learning_rate": 6.473326171224575e-07, + "loss": 0.1752, + "step": 33110 + }, + { + "epoch": 0.8378925525723107, + "grad_norm": 4.133504867553711, + "learning_rate": 6.471350382211589e-07, + "loss": 0.1405, + "step": 33111 + }, + { + "epoch": 0.8379178581370044, + "grad_norm": 4.3232316970825195, + "learning_rate": 6.469374873906742e-07, + "loss": 0.1636, + "step": 33112 + }, + { + "epoch": 0.837943163701698, + "grad_norm": 6.009197235107422, + "learning_rate": 6.467399646322814e-07, + "loss": 0.1466, + "step": 33113 + }, + { + "epoch": 0.8379684692663917, + "grad_norm": 4.745760440826416, + "learning_rate": 6.465424699472516e-07, + "loss": 0.0938, + "step": 33114 + }, + { + "epoch": 0.8379937748310854, + "grad_norm": 4.428581237792969, + "learning_rate": 6.463450033368618e-07, + "loss": 0.1556, + "step": 33115 + }, + { + "epoch": 0.838019080395779, + "grad_norm": 6.653114318847656, + "learning_rate": 6.461475648023808e-07, + "loss": 0.1461, + "step": 33116 + }, + { + "epoch": 0.8380443859604727, + "grad_norm": 4.1156206130981445, + "learning_rate": 6.459501543450858e-07, + "loss": 0.1575, + "step": 33117 + }, + { + "epoch": 0.8380696915251664, + "grad_norm": 5.77618408203125, + "learning_rate": 6.457527719662482e-07, + "loss": 0.1554, + "step": 33118 + }, + { + "epoch": 0.83809499708986, + "grad_norm": 16.4350643157959, + "learning_rate": 6.455554176671408e-07, + "loss": 0.1927, + "step": 33119 + }, + { + "epoch": 0.8381203026545537, + "grad_norm": 3.359671115875244, + "learning_rate": 6.453580914490354e-07, + "loss": 0.1483, + "step": 33120 + }, + { + "epoch": 0.8381456082192474, + "grad_norm": 4.11307430267334, + "learning_rate": 6.45160793313207e-07, + "loss": 0.0959, + "step": 33121 + }, + { + "epoch": 0.838170913783941, + "grad_norm": 3.005337715148926, + "learning_rate": 6.449635232609264e-07, + "loss": 0.1081, + "step": 33122 + }, + { + "epoch": 0.8381962193486348, + "grad_norm": 5.8917460441589355, + "learning_rate": 6.447662812934652e-07, + "loss": 0.1365, + "step": 33123 + }, + { + "epoch": 0.8382215249133285, + "grad_norm": 13.358458518981934, + "learning_rate": 6.44569067412098e-07, + "loss": 0.3438, + "step": 33124 + }, + { + "epoch": 0.8382468304780221, + "grad_norm": 9.219975471496582, + "learning_rate": 6.443718816180927e-07, + "loss": 0.2765, + "step": 33125 + }, + { + "epoch": 0.8382721360427158, + "grad_norm": 2.566760301589966, + "learning_rate": 6.441747239127239e-07, + "loss": 0.1129, + "step": 33126 + }, + { + "epoch": 0.8382974416074095, + "grad_norm": 7.292384147644043, + "learning_rate": 6.43977594297261e-07, + "loss": 0.1794, + "step": 33127 + }, + { + "epoch": 0.8383227471721032, + "grad_norm": 3.4404215812683105, + "learning_rate": 6.437804927729785e-07, + "loss": 0.1091, + "step": 33128 + }, + { + "epoch": 0.8383480527367968, + "grad_norm": 10.127876281738281, + "learning_rate": 6.435834193411428e-07, + "loss": 0.1368, + "step": 33129 + }, + { + "epoch": 0.8383733583014905, + "grad_norm": 18.903770446777344, + "learning_rate": 6.433863740030289e-07, + "loss": 0.1268, + "step": 33130 + }, + { + "epoch": 0.8383986638661842, + "grad_norm": 10.099961280822754, + "learning_rate": 6.431893567599046e-07, + "loss": 0.1596, + "step": 33131 + }, + { + "epoch": 0.8384239694308778, + "grad_norm": 3.8223791122436523, + "learning_rate": 6.429923676130423e-07, + "loss": 0.1619, + "step": 33132 + }, + { + "epoch": 0.8384492749955715, + "grad_norm": 3.2251946926116943, + "learning_rate": 6.42795406563712e-07, + "loss": 0.1623, + "step": 33133 + }, + { + "epoch": 0.8384745805602652, + "grad_norm": 3.585049867630005, + "learning_rate": 6.425984736131841e-07, + "loss": 0.0867, + "step": 33134 + }, + { + "epoch": 0.8384998861249588, + "grad_norm": 7.895902156829834, + "learning_rate": 6.424015687627267e-07, + "loss": 0.2511, + "step": 33135 + }, + { + "epoch": 0.8385251916896526, + "grad_norm": 5.097933292388916, + "learning_rate": 6.422046920136121e-07, + "loss": 0.1523, + "step": 33136 + }, + { + "epoch": 0.8385504972543463, + "grad_norm": 7.145791530609131, + "learning_rate": 6.42007843367109e-07, + "loss": 0.1921, + "step": 33137 + }, + { + "epoch": 0.8385758028190399, + "grad_norm": 3.9805550575256348, + "learning_rate": 6.418110228244862e-07, + "loss": 0.0999, + "step": 33138 + }, + { + "epoch": 0.8386011083837336, + "grad_norm": 8.808371543884277, + "learning_rate": 6.416142303870126e-07, + "loss": 0.1683, + "step": 33139 + }, + { + "epoch": 0.8386264139484273, + "grad_norm": 4.229995250701904, + "learning_rate": 6.414174660559597e-07, + "loss": 0.117, + "step": 33140 + }, + { + "epoch": 0.8386517195131209, + "grad_norm": 2.512219190597534, + "learning_rate": 6.412207298325929e-07, + "loss": 0.1276, + "step": 33141 + }, + { + "epoch": 0.8386770250778146, + "grad_norm": 4.742786407470703, + "learning_rate": 6.410240217181857e-07, + "loss": 0.1802, + "step": 33142 + }, + { + "epoch": 0.8387023306425083, + "grad_norm": 5.700887680053711, + "learning_rate": 6.40827341714001e-07, + "loss": 0.1893, + "step": 33143 + }, + { + "epoch": 0.8387276362072019, + "grad_norm": 11.190869331359863, + "learning_rate": 6.406306898213111e-07, + "loss": 0.2379, + "step": 33144 + }, + { + "epoch": 0.8387529417718956, + "grad_norm": 3.665470838546753, + "learning_rate": 6.404340660413832e-07, + "loss": 0.1097, + "step": 33145 + }, + { + "epoch": 0.8387782473365893, + "grad_norm": 6.36947774887085, + "learning_rate": 6.402374703754849e-07, + "loss": 0.1077, + "step": 33146 + }, + { + "epoch": 0.838803552901283, + "grad_norm": 6.131938934326172, + "learning_rate": 6.400409028248833e-07, + "loss": 0.1608, + "step": 33147 + }, + { + "epoch": 0.8388288584659767, + "grad_norm": 2.852479934692383, + "learning_rate": 6.39844363390848e-07, + "loss": 0.1084, + "step": 33148 + }, + { + "epoch": 0.8388541640306704, + "grad_norm": 4.010242938995361, + "learning_rate": 6.396478520746452e-07, + "loss": 0.1144, + "step": 33149 + }, + { + "epoch": 0.838879469595364, + "grad_norm": 2.567112684249878, + "learning_rate": 6.394513688775422e-07, + "loss": 0.1199, + "step": 33150 + }, + { + "epoch": 0.8389047751600577, + "grad_norm": 4.312685966491699, + "learning_rate": 6.392549138008053e-07, + "loss": 0.1396, + "step": 33151 + }, + { + "epoch": 0.8389300807247514, + "grad_norm": 3.770498275756836, + "learning_rate": 6.390584868457033e-07, + "loss": 0.164, + "step": 33152 + }, + { + "epoch": 0.8389553862894451, + "grad_norm": 5.447597026824951, + "learning_rate": 6.388620880135015e-07, + "loss": 0.1877, + "step": 33153 + }, + { + "epoch": 0.8389806918541387, + "grad_norm": 5.496747970581055, + "learning_rate": 6.386657173054666e-07, + "loss": 0.1558, + "step": 33154 + }, + { + "epoch": 0.8390059974188324, + "grad_norm": 4.782785892486572, + "learning_rate": 6.384693747228671e-07, + "loss": 0.2374, + "step": 33155 + }, + { + "epoch": 0.8390313029835261, + "grad_norm": 8.903059005737305, + "learning_rate": 6.382730602669646e-07, + "loss": 0.2162, + "step": 33156 + }, + { + "epoch": 0.8390566085482197, + "grad_norm": 17.02143096923828, + "learning_rate": 6.380767739390292e-07, + "loss": 0.1419, + "step": 33157 + }, + { + "epoch": 0.8390819141129134, + "grad_norm": 7.442655563354492, + "learning_rate": 6.378805157403245e-07, + "loss": 0.1722, + "step": 33158 + }, + { + "epoch": 0.8391072196776072, + "grad_norm": 3.9861652851104736, + "learning_rate": 6.376842856721189e-07, + "loss": 0.1683, + "step": 33159 + }, + { + "epoch": 0.8391325252423008, + "grad_norm": 14.697222709655762, + "learning_rate": 6.37488083735674e-07, + "loss": 0.1698, + "step": 33160 + }, + { + "epoch": 0.8391578308069945, + "grad_norm": 2.9642550945281982, + "learning_rate": 6.372919099322578e-07, + "loss": 0.073, + "step": 33161 + }, + { + "epoch": 0.8391831363716882, + "grad_norm": 4.822568893432617, + "learning_rate": 6.370957642631331e-07, + "loss": 0.1452, + "step": 33162 + }, + { + "epoch": 0.8392084419363818, + "grad_norm": 7.259010314941406, + "learning_rate": 6.36899646729568e-07, + "loss": 0.2008, + "step": 33163 + }, + { + "epoch": 0.8392337475010755, + "grad_norm": 4.156551361083984, + "learning_rate": 6.36703557332825e-07, + "loss": 0.1406, + "step": 33164 + }, + { + "epoch": 0.8392590530657692, + "grad_norm": 7.9380784034729, + "learning_rate": 6.365074960741691e-07, + "loss": 0.2181, + "step": 33165 + }, + { + "epoch": 0.8392843586304628, + "grad_norm": 7.012577533721924, + "learning_rate": 6.363114629548639e-07, + "loss": 0.197, + "step": 33166 + }, + { + "epoch": 0.8393096641951565, + "grad_norm": 4.5041327476501465, + "learning_rate": 6.361154579761753e-07, + "loss": 0.1211, + "step": 33167 + }, + { + "epoch": 0.8393349697598502, + "grad_norm": 2.3082213401794434, + "learning_rate": 6.359194811393665e-07, + "loss": 0.0712, + "step": 33168 + }, + { + "epoch": 0.8393602753245438, + "grad_norm": 6.315946578979492, + "learning_rate": 6.357235324457011e-07, + "loss": 0.1495, + "step": 33169 + }, + { + "epoch": 0.8393855808892375, + "grad_norm": 3.1765546798706055, + "learning_rate": 6.355276118964415e-07, + "loss": 0.1534, + "step": 33170 + }, + { + "epoch": 0.8394108864539312, + "grad_norm": 6.433343410491943, + "learning_rate": 6.353317194928538e-07, + "loss": 0.1569, + "step": 33171 + }, + { + "epoch": 0.8394361920186248, + "grad_norm": 6.8005876541137695, + "learning_rate": 6.351358552362002e-07, + "loss": 0.1848, + "step": 33172 + }, + { + "epoch": 0.8394614975833186, + "grad_norm": 6.885213851928711, + "learning_rate": 6.349400191277433e-07, + "loss": 0.233, + "step": 33173 + }, + { + "epoch": 0.8394868031480123, + "grad_norm": 9.073263168334961, + "learning_rate": 6.347442111687452e-07, + "loss": 0.2818, + "step": 33174 + }, + { + "epoch": 0.8395121087127059, + "grad_norm": 5.426300525665283, + "learning_rate": 6.345484313604705e-07, + "loss": 0.134, + "step": 33175 + }, + { + "epoch": 0.8395374142773996, + "grad_norm": 3.1128005981445312, + "learning_rate": 6.34352679704181e-07, + "loss": 0.1159, + "step": 33176 + }, + { + "epoch": 0.8395627198420933, + "grad_norm": 5.771440029144287, + "learning_rate": 6.34156956201139e-07, + "loss": 0.2168, + "step": 33177 + }, + { + "epoch": 0.8395880254067869, + "grad_norm": 6.340664386749268, + "learning_rate": 6.339612608526057e-07, + "loss": 0.208, + "step": 33178 + }, + { + "epoch": 0.8396133309714806, + "grad_norm": 3.8808298110961914, + "learning_rate": 6.337655936598458e-07, + "loss": 0.1255, + "step": 33179 + }, + { + "epoch": 0.8396386365361743, + "grad_norm": 23.335674285888672, + "learning_rate": 6.335699546241187e-07, + "loss": 0.2818, + "step": 33180 + }, + { + "epoch": 0.839663942100868, + "grad_norm": 6.622403621673584, + "learning_rate": 6.333743437466866e-07, + "loss": 0.1525, + "step": 33181 + }, + { + "epoch": 0.8396892476655616, + "grad_norm": 9.745826721191406, + "learning_rate": 6.331787610288115e-07, + "loss": 0.2471, + "step": 33182 + }, + { + "epoch": 0.8397145532302553, + "grad_norm": 10.368910789489746, + "learning_rate": 6.329832064717529e-07, + "loss": 0.2419, + "step": 33183 + }, + { + "epoch": 0.8397398587949491, + "grad_norm": 2.2323482036590576, + "learning_rate": 6.327876800767746e-07, + "loss": 0.0784, + "step": 33184 + }, + { + "epoch": 0.8397651643596427, + "grad_norm": 2.7103192806243896, + "learning_rate": 6.325921818451353e-07, + "loss": 0.1076, + "step": 33185 + }, + { + "epoch": 0.8397904699243364, + "grad_norm": 7.164542198181152, + "learning_rate": 6.323967117780982e-07, + "loss": 0.1906, + "step": 33186 + }, + { + "epoch": 0.8398157754890301, + "grad_norm": 5.116508960723877, + "learning_rate": 6.322012698769203e-07, + "loss": 0.1358, + "step": 33187 + }, + { + "epoch": 0.8398410810537237, + "grad_norm": 3.5612943172454834, + "learning_rate": 6.320058561428655e-07, + "loss": 0.1058, + "step": 33188 + }, + { + "epoch": 0.8398663866184174, + "grad_norm": 3.8273184299468994, + "learning_rate": 6.318104705771905e-07, + "loss": 0.1436, + "step": 33189 + }, + { + "epoch": 0.8398916921831111, + "grad_norm": 3.241011619567871, + "learning_rate": 6.316151131811599e-07, + "loss": 0.1311, + "step": 33190 + }, + { + "epoch": 0.8399169977478047, + "grad_norm": 15.197944641113281, + "learning_rate": 6.314197839560288e-07, + "loss": 0.2166, + "step": 33191 + }, + { + "epoch": 0.8399423033124984, + "grad_norm": 3.12705135345459, + "learning_rate": 6.3122448290306e-07, + "loss": 0.1159, + "step": 33192 + }, + { + "epoch": 0.8399676088771921, + "grad_norm": 9.83411693572998, + "learning_rate": 6.310292100235105e-07, + "loss": 0.1949, + "step": 33193 + }, + { + "epoch": 0.8399929144418857, + "grad_norm": 4.026790142059326, + "learning_rate": 6.308339653186429e-07, + "loss": 0.1224, + "step": 33194 + }, + { + "epoch": 0.8400182200065794, + "grad_norm": 3.9629127979278564, + "learning_rate": 6.306387487897125e-07, + "loss": 0.1472, + "step": 33195 + }, + { + "epoch": 0.8400435255712732, + "grad_norm": 2.2936899662017822, + "learning_rate": 6.30443560437981e-07, + "loss": 0.091, + "step": 33196 + }, + { + "epoch": 0.8400688311359668, + "grad_norm": 6.889955520629883, + "learning_rate": 6.302484002647052e-07, + "loss": 0.2105, + "step": 33197 + }, + { + "epoch": 0.8400941367006605, + "grad_norm": 7.448108673095703, + "learning_rate": 6.300532682711457e-07, + "loss": 0.0945, + "step": 33198 + }, + { + "epoch": 0.8401194422653542, + "grad_norm": 6.842141628265381, + "learning_rate": 6.2985816445856e-07, + "loss": 0.198, + "step": 33199 + }, + { + "epoch": 0.8401447478300478, + "grad_norm": 3.133046865463257, + "learning_rate": 6.296630888282058e-07, + "loss": 0.1677, + "step": 33200 + }, + { + "epoch": 0.8401700533947415, + "grad_norm": 11.62089729309082, + "learning_rate": 6.294680413813403e-07, + "loss": 0.1565, + "step": 33201 + }, + { + "epoch": 0.8401953589594352, + "grad_norm": 7.081183433532715, + "learning_rate": 6.292730221192239e-07, + "loss": 0.1842, + "step": 33202 + }, + { + "epoch": 0.8402206645241288, + "grad_norm": 4.335243225097656, + "learning_rate": 6.290780310431121e-07, + "loss": 0.0702, + "step": 33203 + }, + { + "epoch": 0.8402459700888225, + "grad_norm": 2.3108742237091064, + "learning_rate": 6.288830681542635e-07, + "loss": 0.1283, + "step": 33204 + }, + { + "epoch": 0.8402712756535162, + "grad_norm": 7.586331367492676, + "learning_rate": 6.286881334539341e-07, + "loss": 0.1933, + "step": 33205 + }, + { + "epoch": 0.8402965812182099, + "grad_norm": 4.641305446624756, + "learning_rate": 6.284932269433824e-07, + "loss": 0.1615, + "step": 33206 + }, + { + "epoch": 0.8403218867829035, + "grad_norm": 3.0695385932922363, + "learning_rate": 6.282983486238648e-07, + "loss": 0.1036, + "step": 33207 + }, + { + "epoch": 0.8403471923475972, + "grad_norm": 8.517070770263672, + "learning_rate": 6.281034984966378e-07, + "loss": 0.1522, + "step": 33208 + }, + { + "epoch": 0.840372497912291, + "grad_norm": 13.814787864685059, + "learning_rate": 6.279086765629583e-07, + "loss": 0.2806, + "step": 33209 + }, + { + "epoch": 0.8403978034769846, + "grad_norm": 5.925771713256836, + "learning_rate": 6.277138828240814e-07, + "loss": 0.2327, + "step": 33210 + }, + { + "epoch": 0.8404231090416783, + "grad_norm": 5.809326171875, + "learning_rate": 6.275191172812656e-07, + "loss": 0.1402, + "step": 33211 + }, + { + "epoch": 0.840448414606372, + "grad_norm": 2.856870174407959, + "learning_rate": 6.273243799357653e-07, + "loss": 0.0701, + "step": 33212 + }, + { + "epoch": 0.8404737201710656, + "grad_norm": 4.734572410583496, + "learning_rate": 6.271296707888369e-07, + "loss": 0.0823, + "step": 33213 + }, + { + "epoch": 0.8404990257357593, + "grad_norm": 3.366145133972168, + "learning_rate": 6.269349898417348e-07, + "loss": 0.1004, + "step": 33214 + }, + { + "epoch": 0.840524331300453, + "grad_norm": 10.58758544921875, + "learning_rate": 6.267403370957165e-07, + "loss": 0.2493, + "step": 33215 + }, + { + "epoch": 0.8405496368651466, + "grad_norm": 9.9642972946167, + "learning_rate": 6.265457125520353e-07, + "loss": 0.314, + "step": 33216 + }, + { + "epoch": 0.8405749424298403, + "grad_norm": 3.8563807010650635, + "learning_rate": 6.26351116211949e-07, + "loss": 0.1335, + "step": 33217 + }, + { + "epoch": 0.840600247994534, + "grad_norm": 7.738297462463379, + "learning_rate": 6.26156548076709e-07, + "loss": 0.1979, + "step": 33218 + }, + { + "epoch": 0.8406255535592276, + "grad_norm": 3.848788022994995, + "learning_rate": 6.259620081475731e-07, + "loss": 0.1346, + "step": 33219 + }, + { + "epoch": 0.8406508591239213, + "grad_norm": 9.139726638793945, + "learning_rate": 6.257674964257931e-07, + "loss": 0.2809, + "step": 33220 + }, + { + "epoch": 0.8406761646886151, + "grad_norm": 6.436675548553467, + "learning_rate": 6.25573012912627e-07, + "loss": 0.146, + "step": 33221 + }, + { + "epoch": 0.8407014702533087, + "grad_norm": 3.66268253326416, + "learning_rate": 6.253785576093252e-07, + "loss": 0.1658, + "step": 33222 + }, + { + "epoch": 0.8407267758180024, + "grad_norm": 4.145594120025635, + "learning_rate": 6.251841305171436e-07, + "loss": 0.1046, + "step": 33223 + }, + { + "epoch": 0.8407520813826961, + "grad_norm": 9.6482515335083, + "learning_rate": 6.249897316373354e-07, + "loss": 0.1841, + "step": 33224 + }, + { + "epoch": 0.8407773869473897, + "grad_norm": 2.65551495552063, + "learning_rate": 6.247953609711566e-07, + "loss": 0.1042, + "step": 33225 + }, + { + "epoch": 0.8408026925120834, + "grad_norm": 5.445186138153076, + "learning_rate": 6.246010185198564e-07, + "loss": 0.1741, + "step": 33226 + }, + { + "epoch": 0.8408279980767771, + "grad_norm": 6.418207168579102, + "learning_rate": 6.244067042846913e-07, + "loss": 0.171, + "step": 33227 + }, + { + "epoch": 0.8408533036414707, + "grad_norm": 3.1796836853027344, + "learning_rate": 6.242124182669124e-07, + "loss": 0.1459, + "step": 33228 + }, + { + "epoch": 0.8408786092061644, + "grad_norm": 5.800294399261475, + "learning_rate": 6.240181604677748e-07, + "loss": 0.2049, + "step": 33229 + }, + { + "epoch": 0.8409039147708581, + "grad_norm": 3.1835339069366455, + "learning_rate": 6.238239308885302e-07, + "loss": 0.1358, + "step": 33230 + }, + { + "epoch": 0.8409292203355518, + "grad_norm": 3.7502145767211914, + "learning_rate": 6.236297295304311e-07, + "loss": 0.1394, + "step": 33231 + }, + { + "epoch": 0.8409545259002454, + "grad_norm": 8.180017471313477, + "learning_rate": 6.234355563947286e-07, + "loss": 0.1722, + "step": 33232 + }, + { + "epoch": 0.8409798314649392, + "grad_norm": 3.429060697555542, + "learning_rate": 6.232414114826774e-07, + "loss": 0.1556, + "step": 33233 + }, + { + "epoch": 0.8410051370296329, + "grad_norm": 4.964030742645264, + "learning_rate": 6.230472947955274e-07, + "loss": 0.1426, + "step": 33234 + }, + { + "epoch": 0.8410304425943265, + "grad_norm": 7.423256874084473, + "learning_rate": 6.228532063345321e-07, + "loss": 0.2507, + "step": 33235 + }, + { + "epoch": 0.8410557481590202, + "grad_norm": 5.296574592590332, + "learning_rate": 6.226591461009407e-07, + "loss": 0.1832, + "step": 33236 + }, + { + "epoch": 0.8410810537237139, + "grad_norm": 4.124168872833252, + "learning_rate": 6.224651140960075e-07, + "loss": 0.1622, + "step": 33237 + }, + { + "epoch": 0.8411063592884075, + "grad_norm": 4.669921875, + "learning_rate": 6.222711103209828e-07, + "loss": 0.2032, + "step": 33238 + }, + { + "epoch": 0.8411316648531012, + "grad_norm": 3.6108014583587646, + "learning_rate": 6.220771347771165e-07, + "loss": 0.077, + "step": 33239 + }, + { + "epoch": 0.8411569704177949, + "grad_norm": 4.101688385009766, + "learning_rate": 6.218831874656612e-07, + "loss": 0.1219, + "step": 33240 + }, + { + "epoch": 0.8411822759824885, + "grad_norm": 4.546113967895508, + "learning_rate": 6.216892683878655e-07, + "loss": 0.1567, + "step": 33241 + }, + { + "epoch": 0.8412075815471822, + "grad_norm": 7.59535551071167, + "learning_rate": 6.214953775449822e-07, + "loss": 0.1518, + "step": 33242 + }, + { + "epoch": 0.8412328871118759, + "grad_norm": 3.8008105754852295, + "learning_rate": 6.213015149382612e-07, + "loss": 0.1062, + "step": 33243 + }, + { + "epoch": 0.8412581926765695, + "grad_norm": 5.323215007781982, + "learning_rate": 6.21107680568952e-07, + "loss": 0.1872, + "step": 33244 + }, + { + "epoch": 0.8412834982412633, + "grad_norm": 3.359637975692749, + "learning_rate": 6.209138744383036e-07, + "loss": 0.1267, + "step": 33245 + }, + { + "epoch": 0.841308803805957, + "grad_norm": 4.223579406738281, + "learning_rate": 6.207200965475679e-07, + "loss": 0.1288, + "step": 33246 + }, + { + "epoch": 0.8413341093706506, + "grad_norm": 4.42753791809082, + "learning_rate": 6.205263468979944e-07, + "loss": 0.124, + "step": 33247 + }, + { + "epoch": 0.8413594149353443, + "grad_norm": 10.61846923828125, + "learning_rate": 6.203326254908315e-07, + "loss": 0.2377, + "step": 33248 + }, + { + "epoch": 0.841384720500038, + "grad_norm": 4.811460494995117, + "learning_rate": 6.201389323273277e-07, + "loss": 0.0887, + "step": 33249 + }, + { + "epoch": 0.8414100260647316, + "grad_norm": 5.0267133712768555, + "learning_rate": 6.199452674087341e-07, + "loss": 0.174, + "step": 33250 + }, + { + "epoch": 0.8414353316294253, + "grad_norm": 11.560467720031738, + "learning_rate": 6.197516307362977e-07, + "loss": 0.2473, + "step": 33251 + }, + { + "epoch": 0.841460637194119, + "grad_norm": 8.496062278747559, + "learning_rate": 6.195580223112707e-07, + "loss": 0.1481, + "step": 33252 + }, + { + "epoch": 0.8414859427588126, + "grad_norm": 3.840949296951294, + "learning_rate": 6.193644421348966e-07, + "loss": 0.1029, + "step": 33253 + }, + { + "epoch": 0.8415112483235063, + "grad_norm": 4.830947399139404, + "learning_rate": 6.191708902084276e-07, + "loss": 0.1935, + "step": 33254 + }, + { + "epoch": 0.8415365538882, + "grad_norm": 4.020766735076904, + "learning_rate": 6.1897736653311e-07, + "loss": 0.1797, + "step": 33255 + }, + { + "epoch": 0.8415618594528937, + "grad_norm": 4.31740140914917, + "learning_rate": 6.18783871110194e-07, + "loss": 0.1096, + "step": 33256 + }, + { + "epoch": 0.8415871650175873, + "grad_norm": 10.430132865905762, + "learning_rate": 6.18590403940924e-07, + "loss": 0.1605, + "step": 33257 + }, + { + "epoch": 0.8416124705822811, + "grad_norm": 3.927243232727051, + "learning_rate": 6.183969650265503e-07, + "loss": 0.1547, + "step": 33258 + }, + { + "epoch": 0.8416377761469748, + "grad_norm": 5.48454475402832, + "learning_rate": 6.18203554368319e-07, + "loss": 0.1885, + "step": 33259 + }, + { + "epoch": 0.8416630817116684, + "grad_norm": 6.143592357635498, + "learning_rate": 6.180101719674786e-07, + "loss": 0.1732, + "step": 33260 + }, + { + "epoch": 0.8416883872763621, + "grad_norm": 10.708296775817871, + "learning_rate": 6.178168178252753e-07, + "loss": 0.3109, + "step": 33261 + }, + { + "epoch": 0.8417136928410558, + "grad_norm": 4.291240692138672, + "learning_rate": 6.176234919429564e-07, + "loss": 0.1542, + "step": 33262 + }, + { + "epoch": 0.8417389984057494, + "grad_norm": 3.236602783203125, + "learning_rate": 6.174301943217675e-07, + "loss": 0.101, + "step": 33263 + }, + { + "epoch": 0.8417643039704431, + "grad_norm": 18.915496826171875, + "learning_rate": 6.172369249629567e-07, + "loss": 0.1141, + "step": 33264 + }, + { + "epoch": 0.8417896095351368, + "grad_norm": 3.5286483764648438, + "learning_rate": 6.170436838677702e-07, + "loss": 0.1648, + "step": 33265 + }, + { + "epoch": 0.8418149150998304, + "grad_norm": 3.201246976852417, + "learning_rate": 6.168504710374535e-07, + "loss": 0.1298, + "step": 33266 + }, + { + "epoch": 0.8418402206645241, + "grad_norm": 3.9614601135253906, + "learning_rate": 6.166572864732523e-07, + "loss": 0.1221, + "step": 33267 + }, + { + "epoch": 0.8418655262292178, + "grad_norm": 5.556789875030518, + "learning_rate": 6.164641301764123e-07, + "loss": 0.115, + "step": 33268 + }, + { + "epoch": 0.8418908317939114, + "grad_norm": 5.456541061401367, + "learning_rate": 6.16271002148181e-07, + "loss": 0.146, + "step": 33269 + }, + { + "epoch": 0.8419161373586052, + "grad_norm": 4.378267765045166, + "learning_rate": 6.160779023898022e-07, + "loss": 0.1095, + "step": 33270 + }, + { + "epoch": 0.8419414429232989, + "grad_norm": 6.544042110443115, + "learning_rate": 6.158848309025217e-07, + "loss": 0.1293, + "step": 33271 + }, + { + "epoch": 0.8419667484879925, + "grad_norm": 4.9462056159973145, + "learning_rate": 6.156917876875834e-07, + "loss": 0.1134, + "step": 33272 + }, + { + "epoch": 0.8419920540526862, + "grad_norm": 6.077031135559082, + "learning_rate": 6.154987727462342e-07, + "loss": 0.1499, + "step": 33273 + }, + { + "epoch": 0.8420173596173799, + "grad_norm": 7.984675884246826, + "learning_rate": 6.15305786079718e-07, + "loss": 0.2702, + "step": 33274 + }, + { + "epoch": 0.8420426651820735, + "grad_norm": 3.4603817462921143, + "learning_rate": 6.151128276892787e-07, + "loss": 0.1375, + "step": 33275 + }, + { + "epoch": 0.8420679707467672, + "grad_norm": 6.399308681488037, + "learning_rate": 6.149198975761606e-07, + "loss": 0.1814, + "step": 33276 + }, + { + "epoch": 0.8420932763114609, + "grad_norm": 6.138081073760986, + "learning_rate": 6.147269957416091e-07, + "loss": 0.1594, + "step": 33277 + }, + { + "epoch": 0.8421185818761545, + "grad_norm": 4.868260860443115, + "learning_rate": 6.145341221868678e-07, + "loss": 0.1742, + "step": 33278 + }, + { + "epoch": 0.8421438874408482, + "grad_norm": 4.550537109375, + "learning_rate": 6.143412769131796e-07, + "loss": 0.1218, + "step": 33279 + }, + { + "epoch": 0.8421691930055419, + "grad_norm": 3.340686798095703, + "learning_rate": 6.141484599217884e-07, + "loss": 0.118, + "step": 33280 + }, + { + "epoch": 0.8421944985702357, + "grad_norm": 4.742544174194336, + "learning_rate": 6.139556712139383e-07, + "loss": 0.1573, + "step": 33281 + }, + { + "epoch": 0.8422198041349293, + "grad_norm": 13.060912132263184, + "learning_rate": 6.137629107908716e-07, + "loss": 0.1603, + "step": 33282 + }, + { + "epoch": 0.842245109699623, + "grad_norm": 3.8888885974884033, + "learning_rate": 6.135701786538339e-07, + "loss": 0.1363, + "step": 33283 + }, + { + "epoch": 0.8422704152643167, + "grad_norm": 3.6513428688049316, + "learning_rate": 6.133774748040644e-07, + "loss": 0.09, + "step": 33284 + }, + { + "epoch": 0.8422957208290103, + "grad_norm": 3.7095749378204346, + "learning_rate": 6.131847992428086e-07, + "loss": 0.1188, + "step": 33285 + }, + { + "epoch": 0.842321026393704, + "grad_norm": 7.1323723793029785, + "learning_rate": 6.129921519713067e-07, + "loss": 0.1601, + "step": 33286 + }, + { + "epoch": 0.8423463319583977, + "grad_norm": 5.468276023864746, + "learning_rate": 6.127995329908043e-07, + "loss": 0.1993, + "step": 33287 + }, + { + "epoch": 0.8423716375230913, + "grad_norm": 11.582983016967773, + "learning_rate": 6.126069423025399e-07, + "loss": 0.193, + "step": 33288 + }, + { + "epoch": 0.842396943087785, + "grad_norm": 4.82747745513916, + "learning_rate": 6.124143799077581e-07, + "loss": 0.2037, + "step": 33289 + }, + { + "epoch": 0.8424222486524787, + "grad_norm": 4.766054630279541, + "learning_rate": 6.122218458076989e-07, + "loss": 0.1463, + "step": 33290 + }, + { + "epoch": 0.8424475542171723, + "grad_norm": 6.846982479095459, + "learning_rate": 6.120293400036059e-07, + "loss": 0.0888, + "step": 33291 + }, + { + "epoch": 0.842472859781866, + "grad_norm": 8.268267631530762, + "learning_rate": 6.118368624967192e-07, + "loss": 0.1393, + "step": 33292 + }, + { + "epoch": 0.8424981653465597, + "grad_norm": 8.067121505737305, + "learning_rate": 6.116444132882804e-07, + "loss": 0.158, + "step": 33293 + }, + { + "epoch": 0.8425234709112533, + "grad_norm": 3.2878096103668213, + "learning_rate": 6.114519923795304e-07, + "loss": 0.124, + "step": 33294 + }, + { + "epoch": 0.8425487764759471, + "grad_norm": 7.62668514251709, + "learning_rate": 6.112595997717097e-07, + "loss": 0.1981, + "step": 33295 + }, + { + "epoch": 0.8425740820406408, + "grad_norm": 3.126912832260132, + "learning_rate": 6.110672354660597e-07, + "loss": 0.0869, + "step": 33296 + }, + { + "epoch": 0.8425993876053344, + "grad_norm": 5.888967037200928, + "learning_rate": 6.108748994638214e-07, + "loss": 0.1749, + "step": 33297 + }, + { + "epoch": 0.8426246931700281, + "grad_norm": 3.5734853744506836, + "learning_rate": 6.10682591766234e-07, + "loss": 0.1132, + "step": 33298 + }, + { + "epoch": 0.8426499987347218, + "grad_norm": 12.807136535644531, + "learning_rate": 6.10490312374537e-07, + "loss": 0.2191, + "step": 33299 + }, + { + "epoch": 0.8426753042994154, + "grad_norm": 6.140774250030518, + "learning_rate": 6.102980612899728e-07, + "loss": 0.1609, + "step": 33300 + }, + { + "epoch": 0.8427006098641091, + "grad_norm": 4.152536869049072, + "learning_rate": 6.101058385137798e-07, + "loss": 0.1271, + "step": 33301 + }, + { + "epoch": 0.8427259154288028, + "grad_norm": 4.625846862792969, + "learning_rate": 6.099136440471975e-07, + "loss": 0.1571, + "step": 33302 + }, + { + "epoch": 0.8427512209934964, + "grad_norm": 14.72586441040039, + "learning_rate": 6.097214778914645e-07, + "loss": 0.1867, + "step": 33303 + }, + { + "epoch": 0.8427765265581901, + "grad_norm": 3.7109878063201904, + "learning_rate": 6.09529340047822e-07, + "loss": 0.1096, + "step": 33304 + }, + { + "epoch": 0.8428018321228838, + "grad_norm": 9.39777946472168, + "learning_rate": 6.093372305175082e-07, + "loss": 0.2057, + "step": 33305 + }, + { + "epoch": 0.8428271376875774, + "grad_norm": 16.908004760742188, + "learning_rate": 6.09145149301762e-07, + "loss": 0.2798, + "step": 33306 + }, + { + "epoch": 0.8428524432522712, + "grad_norm": 14.302165985107422, + "learning_rate": 6.089530964018209e-07, + "loss": 0.4542, + "step": 33307 + }, + { + "epoch": 0.8428777488169649, + "grad_norm": 2.9711222648620605, + "learning_rate": 6.087610718189251e-07, + "loss": 0.0596, + "step": 33308 + }, + { + "epoch": 0.8429030543816586, + "grad_norm": 5.218575477600098, + "learning_rate": 6.085690755543128e-07, + "loss": 0.1875, + "step": 33309 + }, + { + "epoch": 0.8429283599463522, + "grad_norm": 5.079248905181885, + "learning_rate": 6.083771076092215e-07, + "loss": 0.0972, + "step": 33310 + }, + { + "epoch": 0.8429536655110459, + "grad_norm": 3.8505046367645264, + "learning_rate": 6.081851679848882e-07, + "loss": 0.138, + "step": 33311 + }, + { + "epoch": 0.8429789710757396, + "grad_norm": 8.383707046508789, + "learning_rate": 6.079932566825531e-07, + "loss": 0.1524, + "step": 33312 + }, + { + "epoch": 0.8430042766404332, + "grad_norm": 7.878736972808838, + "learning_rate": 6.07801373703451e-07, + "loss": 0.1976, + "step": 33313 + }, + { + "epoch": 0.8430295822051269, + "grad_norm": 3.7765774726867676, + "learning_rate": 6.076095190488234e-07, + "loss": 0.0924, + "step": 33314 + }, + { + "epoch": 0.8430548877698206, + "grad_norm": 4.70686674118042, + "learning_rate": 6.074176927199027e-07, + "loss": 0.1965, + "step": 33315 + }, + { + "epoch": 0.8430801933345142, + "grad_norm": 3.2843596935272217, + "learning_rate": 6.072258947179294e-07, + "loss": 0.108, + "step": 33316 + }, + { + "epoch": 0.8431054988992079, + "grad_norm": 3.5593369007110596, + "learning_rate": 6.07034125044138e-07, + "loss": 0.2005, + "step": 33317 + }, + { + "epoch": 0.8431308044639017, + "grad_norm": 6.752484321594238, + "learning_rate": 6.068423836997683e-07, + "loss": 0.1564, + "step": 33318 + }, + { + "epoch": 0.8431561100285953, + "grad_norm": 5.354559421539307, + "learning_rate": 6.066506706860531e-07, + "loss": 0.1685, + "step": 33319 + }, + { + "epoch": 0.843181415593289, + "grad_norm": 4.036661624908447, + "learning_rate": 6.064589860042314e-07, + "loss": 0.0717, + "step": 33320 + }, + { + "epoch": 0.8432067211579827, + "grad_norm": 4.644503593444824, + "learning_rate": 6.062673296555371e-07, + "loss": 0.1079, + "step": 33321 + }, + { + "epoch": 0.8432320267226763, + "grad_norm": 5.485169887542725, + "learning_rate": 6.0607570164121e-07, + "loss": 0.1624, + "step": 33322 + }, + { + "epoch": 0.84325733228737, + "grad_norm": 3.089552402496338, + "learning_rate": 6.058841019624811e-07, + "loss": 0.1444, + "step": 33323 + }, + { + "epoch": 0.8432826378520637, + "grad_norm": 3.96445369720459, + "learning_rate": 6.056925306205897e-07, + "loss": 0.1123, + "step": 33324 + }, + { + "epoch": 0.8433079434167573, + "grad_norm": 7.2822465896606445, + "learning_rate": 6.055009876167694e-07, + "loss": 0.1266, + "step": 33325 + }, + { + "epoch": 0.843333248981451, + "grad_norm": 4.908518314361572, + "learning_rate": 6.053094729522547e-07, + "loss": 0.1279, + "step": 33326 + }, + { + "epoch": 0.8433585545461447, + "grad_norm": 7.815921306610107, + "learning_rate": 6.05117986628283e-07, + "loss": 0.1398, + "step": 33327 + }, + { + "epoch": 0.8433838601108383, + "grad_norm": 7.462522983551025, + "learning_rate": 6.049265286460876e-07, + "loss": 0.2004, + "step": 33328 + }, + { + "epoch": 0.843409165675532, + "grad_norm": 7.609262466430664, + "learning_rate": 6.047350990069034e-07, + "loss": 0.2094, + "step": 33329 + }, + { + "epoch": 0.8434344712402257, + "grad_norm": 3.504709005355835, + "learning_rate": 6.045436977119645e-07, + "loss": 0.1218, + "step": 33330 + }, + { + "epoch": 0.8434597768049193, + "grad_norm": 4.523012638092041, + "learning_rate": 6.043523247625072e-07, + "loss": 0.1177, + "step": 33331 + }, + { + "epoch": 0.8434850823696131, + "grad_norm": 5.881079196929932, + "learning_rate": 6.041609801597619e-07, + "loss": 0.1403, + "step": 33332 + }, + { + "epoch": 0.8435103879343068, + "grad_norm": 5.017167568206787, + "learning_rate": 6.039696639049663e-07, + "loss": 0.1602, + "step": 33333 + }, + { + "epoch": 0.8435356934990005, + "grad_norm": 3.6368205547332764, + "learning_rate": 6.037783759993516e-07, + "loss": 0.166, + "step": 33334 + }, + { + "epoch": 0.8435609990636941, + "grad_norm": 3.494852304458618, + "learning_rate": 6.035871164441531e-07, + "loss": 0.098, + "step": 33335 + }, + { + "epoch": 0.8435863046283878, + "grad_norm": 4.158619403839111, + "learning_rate": 6.033958852406035e-07, + "loss": 0.1208, + "step": 33336 + }, + { + "epoch": 0.8436116101930815, + "grad_norm": 3.3745434284210205, + "learning_rate": 6.032046823899362e-07, + "loss": 0.0996, + "step": 33337 + }, + { + "epoch": 0.8436369157577751, + "grad_norm": 11.790714263916016, + "learning_rate": 6.03013507893383e-07, + "loss": 0.1856, + "step": 33338 + }, + { + "epoch": 0.8436622213224688, + "grad_norm": 6.363205909729004, + "learning_rate": 6.028223617521789e-07, + "loss": 0.2229, + "step": 33339 + }, + { + "epoch": 0.8436875268871625, + "grad_norm": 4.991033554077148, + "learning_rate": 6.026312439675553e-07, + "loss": 0.1955, + "step": 33340 + }, + { + "epoch": 0.8437128324518561, + "grad_norm": 3.5645751953125, + "learning_rate": 6.024401545407449e-07, + "loss": 0.1588, + "step": 33341 + }, + { + "epoch": 0.8437381380165498, + "grad_norm": 4.752762794494629, + "learning_rate": 6.022490934729786e-07, + "loss": 0.1321, + "step": 33342 + }, + { + "epoch": 0.8437634435812436, + "grad_norm": 3.452540636062622, + "learning_rate": 6.020580607654908e-07, + "loss": 0.0993, + "step": 33343 + }, + { + "epoch": 0.8437887491459372, + "grad_norm": 10.995731353759766, + "learning_rate": 6.018670564195117e-07, + "loss": 0.3226, + "step": 33344 + }, + { + "epoch": 0.8438140547106309, + "grad_norm": 4.633841037750244, + "learning_rate": 6.016760804362764e-07, + "loss": 0.1411, + "step": 33345 + }, + { + "epoch": 0.8438393602753246, + "grad_norm": 8.501996994018555, + "learning_rate": 6.014851328170113e-07, + "loss": 0.2755, + "step": 33346 + }, + { + "epoch": 0.8438646658400182, + "grad_norm": 4.146435260772705, + "learning_rate": 6.012942135629513e-07, + "loss": 0.1432, + "step": 33347 + }, + { + "epoch": 0.8438899714047119, + "grad_norm": 2.4970474243164062, + "learning_rate": 6.01103322675326e-07, + "loss": 0.0964, + "step": 33348 + }, + { + "epoch": 0.8439152769694056, + "grad_norm": 5.362372875213623, + "learning_rate": 6.00912460155369e-07, + "loss": 0.1907, + "step": 33349 + }, + { + "epoch": 0.8439405825340992, + "grad_norm": 5.817885875701904, + "learning_rate": 6.007216260043075e-07, + "loss": 0.192, + "step": 33350 + }, + { + "epoch": 0.8439658880987929, + "grad_norm": 8.402375221252441, + "learning_rate": 6.005308202233744e-07, + "loss": 0.1486, + "step": 33351 + }, + { + "epoch": 0.8439911936634866, + "grad_norm": 5.635504722595215, + "learning_rate": 6.003400428138001e-07, + "loss": 0.1405, + "step": 33352 + }, + { + "epoch": 0.8440164992281802, + "grad_norm": 14.222466468811035, + "learning_rate": 6.001492937768144e-07, + "loss": 0.4234, + "step": 33353 + }, + { + "epoch": 0.8440418047928739, + "grad_norm": 6.981997013092041, + "learning_rate": 5.999585731136465e-07, + "loss": 0.2228, + "step": 33354 + }, + { + "epoch": 0.8440671103575677, + "grad_norm": 7.148564338684082, + "learning_rate": 5.997678808255281e-07, + "loss": 0.1862, + "step": 33355 + }, + { + "epoch": 0.8440924159222613, + "grad_norm": 6.936892032623291, + "learning_rate": 5.995772169136882e-07, + "loss": 0.254, + "step": 33356 + }, + { + "epoch": 0.844117721486955, + "grad_norm": 3.706468105316162, + "learning_rate": 5.993865813793553e-07, + "loss": 0.106, + "step": 33357 + }, + { + "epoch": 0.8441430270516487, + "grad_norm": 5.639493465423584, + "learning_rate": 5.991959742237607e-07, + "loss": 0.1694, + "step": 33358 + }, + { + "epoch": 0.8441683326163424, + "grad_norm": 10.23963737487793, + "learning_rate": 5.990053954481328e-07, + "loss": 0.1994, + "step": 33359 + }, + { + "epoch": 0.844193638181036, + "grad_norm": 4.09389066696167, + "learning_rate": 5.988148450537001e-07, + "loss": 0.1592, + "step": 33360 + }, + { + "epoch": 0.8442189437457297, + "grad_norm": 3.9125237464904785, + "learning_rate": 5.986243230416911e-07, + "loss": 0.1635, + "step": 33361 + }, + { + "epoch": 0.8442442493104234, + "grad_norm": 5.818260669708252, + "learning_rate": 5.984338294133374e-07, + "loss": 0.2128, + "step": 33362 + }, + { + "epoch": 0.844269554875117, + "grad_norm": 4.850165367126465, + "learning_rate": 5.982433641698626e-07, + "loss": 0.1187, + "step": 33363 + }, + { + "epoch": 0.8442948604398107, + "grad_norm": 8.59761905670166, + "learning_rate": 5.980529273124991e-07, + "loss": 0.1767, + "step": 33364 + }, + { + "epoch": 0.8443201660045044, + "grad_norm": 5.583163738250732, + "learning_rate": 5.978625188424719e-07, + "loss": 0.1805, + "step": 33365 + }, + { + "epoch": 0.844345471569198, + "grad_norm": 4.033047199249268, + "learning_rate": 5.976721387610118e-07, + "loss": 0.1075, + "step": 33366 + }, + { + "epoch": 0.8443707771338917, + "grad_norm": 5.528784275054932, + "learning_rate": 5.97481787069345e-07, + "loss": 0.1385, + "step": 33367 + }, + { + "epoch": 0.8443960826985855, + "grad_norm": 4.096749782562256, + "learning_rate": 5.972914637686994e-07, + "loss": 0.124, + "step": 33368 + }, + { + "epoch": 0.8444213882632791, + "grad_norm": 2.517285108566284, + "learning_rate": 5.971011688603013e-07, + "loss": 0.0901, + "step": 33369 + }, + { + "epoch": 0.8444466938279728, + "grad_norm": 7.68949556350708, + "learning_rate": 5.969109023453795e-07, + "loss": 0.2323, + "step": 33370 + }, + { + "epoch": 0.8444719993926665, + "grad_norm": 8.342642784118652, + "learning_rate": 5.967206642251599e-07, + "loss": 0.1262, + "step": 33371 + }, + { + "epoch": 0.8444973049573601, + "grad_norm": 3.061401844024658, + "learning_rate": 5.965304545008704e-07, + "loss": 0.1152, + "step": 33372 + }, + { + "epoch": 0.8445226105220538, + "grad_norm": 9.670679092407227, + "learning_rate": 5.963402731737356e-07, + "loss": 0.1993, + "step": 33373 + }, + { + "epoch": 0.8445479160867475, + "grad_norm": 5.056920528411865, + "learning_rate": 5.961501202449843e-07, + "loss": 0.1788, + "step": 33374 + }, + { + "epoch": 0.8445732216514411, + "grad_norm": 4.057365417480469, + "learning_rate": 5.959599957158413e-07, + "loss": 0.1065, + "step": 33375 + }, + { + "epoch": 0.8445985272161348, + "grad_norm": 3.3595595359802246, + "learning_rate": 5.957698995875333e-07, + "loss": 0.1625, + "step": 33376 + }, + { + "epoch": 0.8446238327808285, + "grad_norm": 2.9453182220458984, + "learning_rate": 5.955798318612854e-07, + "loss": 0.0993, + "step": 33377 + }, + { + "epoch": 0.8446491383455221, + "grad_norm": 6.122512340545654, + "learning_rate": 5.953897925383246e-07, + "loss": 0.21, + "step": 33378 + }, + { + "epoch": 0.8446744439102158, + "grad_norm": 5.709115505218506, + "learning_rate": 5.951997816198751e-07, + "loss": 0.2163, + "step": 33379 + }, + { + "epoch": 0.8446997494749096, + "grad_norm": 3.274075508117676, + "learning_rate": 5.950097991071635e-07, + "loss": 0.1153, + "step": 33380 + }, + { + "epoch": 0.8447250550396032, + "grad_norm": 2.807434320449829, + "learning_rate": 5.948198450014131e-07, + "loss": 0.0533, + "step": 33381 + }, + { + "epoch": 0.8447503606042969, + "grad_norm": 3.590747356414795, + "learning_rate": 5.946299193038513e-07, + "loss": 0.1113, + "step": 33382 + }, + { + "epoch": 0.8447756661689906, + "grad_norm": 3.783949613571167, + "learning_rate": 5.944400220157016e-07, + "loss": 0.1046, + "step": 33383 + }, + { + "epoch": 0.8448009717336843, + "grad_norm": 2.51908278465271, + "learning_rate": 5.942501531381889e-07, + "loss": 0.116, + "step": 33384 + }, + { + "epoch": 0.8448262772983779, + "grad_norm": 4.74321985244751, + "learning_rate": 5.940603126725359e-07, + "loss": 0.0904, + "step": 33385 + }, + { + "epoch": 0.8448515828630716, + "grad_norm": 7.867771148681641, + "learning_rate": 5.938705006199696e-07, + "loss": 0.1168, + "step": 33386 + }, + { + "epoch": 0.8448768884277653, + "grad_norm": 4.911917209625244, + "learning_rate": 5.936807169817133e-07, + "loss": 0.1759, + "step": 33387 + }, + { + "epoch": 0.8449021939924589, + "grad_norm": 9.41014289855957, + "learning_rate": 5.93490961758989e-07, + "loss": 0.265, + "step": 33388 + }, + { + "epoch": 0.8449274995571526, + "grad_norm": 3.5398995876312256, + "learning_rate": 5.933012349530237e-07, + "loss": 0.1187, + "step": 33389 + }, + { + "epoch": 0.8449528051218463, + "grad_norm": 6.644439697265625, + "learning_rate": 5.931115365650375e-07, + "loss": 0.1875, + "step": 33390 + }, + { + "epoch": 0.8449781106865399, + "grad_norm": 8.534076690673828, + "learning_rate": 5.929218665962555e-07, + "loss": 0.1006, + "step": 33391 + }, + { + "epoch": 0.8450034162512337, + "grad_norm": 4.014994144439697, + "learning_rate": 5.927322250479001e-07, + "loss": 0.1053, + "step": 33392 + }, + { + "epoch": 0.8450287218159274, + "grad_norm": 5.016941547393799, + "learning_rate": 5.925426119211969e-07, + "loss": 0.1613, + "step": 33393 + }, + { + "epoch": 0.845054027380621, + "grad_norm": 26.7143497467041, + "learning_rate": 5.923530272173644e-07, + "loss": 0.3091, + "step": 33394 + }, + { + "epoch": 0.8450793329453147, + "grad_norm": 5.362892150878906, + "learning_rate": 5.921634709376284e-07, + "loss": 0.2001, + "step": 33395 + }, + { + "epoch": 0.8451046385100084, + "grad_norm": 4.316532611846924, + "learning_rate": 5.919739430832088e-07, + "loss": 0.1174, + "step": 33396 + }, + { + "epoch": 0.845129944074702, + "grad_norm": 3.243581771850586, + "learning_rate": 5.917844436553305e-07, + "loss": 0.1305, + "step": 33397 + }, + { + "epoch": 0.8451552496393957, + "grad_norm": 4.210685729980469, + "learning_rate": 5.915949726552145e-07, + "loss": 0.098, + "step": 33398 + }, + { + "epoch": 0.8451805552040894, + "grad_norm": 3.048449754714966, + "learning_rate": 5.914055300840826e-07, + "loss": 0.1031, + "step": 33399 + }, + { + "epoch": 0.845205860768783, + "grad_norm": 7.94312858581543, + "learning_rate": 5.91216115943155e-07, + "loss": 0.2255, + "step": 33400 + }, + { + "epoch": 0.8452311663334767, + "grad_norm": 4.125621318817139, + "learning_rate": 5.910267302336559e-07, + "loss": 0.1919, + "step": 33401 + }, + { + "epoch": 0.8452564718981704, + "grad_norm": 23.856794357299805, + "learning_rate": 5.908373729568056e-07, + "loss": 0.2329, + "step": 33402 + }, + { + "epoch": 0.845281777462864, + "grad_norm": 7.831423282623291, + "learning_rate": 5.906480441138246e-07, + "loss": 0.2959, + "step": 33403 + }, + { + "epoch": 0.8453070830275577, + "grad_norm": 5.5894622802734375, + "learning_rate": 5.904587437059328e-07, + "loss": 0.1009, + "step": 33404 + }, + { + "epoch": 0.8453323885922515, + "grad_norm": 5.074152946472168, + "learning_rate": 5.902694717343537e-07, + "loss": 0.1276, + "step": 33405 + }, + { + "epoch": 0.8453576941569451, + "grad_norm": 6.649587631225586, + "learning_rate": 5.900802282003065e-07, + "loss": 0.166, + "step": 33406 + }, + { + "epoch": 0.8453829997216388, + "grad_norm": 5.0164794921875, + "learning_rate": 5.898910131050112e-07, + "loss": 0.1702, + "step": 33407 + }, + { + "epoch": 0.8454083052863325, + "grad_norm": 5.115906715393066, + "learning_rate": 5.897018264496879e-07, + "loss": 0.1176, + "step": 33408 + }, + { + "epoch": 0.8454336108510262, + "grad_norm": 17.666231155395508, + "learning_rate": 5.895126682355579e-07, + "loss": 0.1528, + "step": 33409 + }, + { + "epoch": 0.8454589164157198, + "grad_norm": 3.7180826663970947, + "learning_rate": 5.893235384638402e-07, + "loss": 0.166, + "step": 33410 + }, + { + "epoch": 0.8454842219804135, + "grad_norm": 3.8692188262939453, + "learning_rate": 5.891344371357549e-07, + "loss": 0.1434, + "step": 33411 + }, + { + "epoch": 0.8455095275451072, + "grad_norm": 5.13124418258667, + "learning_rate": 5.8894536425252e-07, + "loss": 0.1705, + "step": 33412 + }, + { + "epoch": 0.8455348331098008, + "grad_norm": 5.987432479858398, + "learning_rate": 5.887563198153567e-07, + "loss": 0.2024, + "step": 33413 + }, + { + "epoch": 0.8455601386744945, + "grad_norm": 8.001404762268066, + "learning_rate": 5.885673038254835e-07, + "loss": 0.218, + "step": 33414 + }, + { + "epoch": 0.8455854442391882, + "grad_norm": 4.497072219848633, + "learning_rate": 5.88378316284119e-07, + "loss": 0.1359, + "step": 33415 + }, + { + "epoch": 0.8456107498038818, + "grad_norm": 6.327641010284424, + "learning_rate": 5.881893571924824e-07, + "loss": 0.1601, + "step": 33416 + }, + { + "epoch": 0.8456360553685756, + "grad_norm": 5.846954822540283, + "learning_rate": 5.880004265517908e-07, + "loss": 0.171, + "step": 33417 + }, + { + "epoch": 0.8456613609332693, + "grad_norm": 15.55117130279541, + "learning_rate": 5.878115243632648e-07, + "loss": 0.1393, + "step": 33418 + }, + { + "epoch": 0.8456866664979629, + "grad_norm": 3.5189478397369385, + "learning_rate": 5.876226506281208e-07, + "loss": 0.1305, + "step": 33419 + }, + { + "epoch": 0.8457119720626566, + "grad_norm": 5.164839744567871, + "learning_rate": 5.874338053475792e-07, + "loss": 0.1104, + "step": 33420 + }, + { + "epoch": 0.8457372776273503, + "grad_norm": 5.0708842277526855, + "learning_rate": 5.872449885228543e-07, + "loss": 0.1655, + "step": 33421 + }, + { + "epoch": 0.8457625831920439, + "grad_norm": 8.913704872131348, + "learning_rate": 5.870562001551666e-07, + "loss": 0.1559, + "step": 33422 + }, + { + "epoch": 0.8457878887567376, + "grad_norm": 5.355454921722412, + "learning_rate": 5.868674402457313e-07, + "loss": 0.174, + "step": 33423 + }, + { + "epoch": 0.8458131943214313, + "grad_norm": 3.1752655506134033, + "learning_rate": 5.866787087957699e-07, + "loss": 0.1279, + "step": 33424 + }, + { + "epoch": 0.8458384998861249, + "grad_norm": 5.5853705406188965, + "learning_rate": 5.864900058064937e-07, + "loss": 0.1691, + "step": 33425 + }, + { + "epoch": 0.8458638054508186, + "grad_norm": 13.00903034210205, + "learning_rate": 5.863013312791238e-07, + "loss": 0.2041, + "step": 33426 + }, + { + "epoch": 0.8458891110155123, + "grad_norm": 4.916927814483643, + "learning_rate": 5.861126852148746e-07, + "loss": 0.2338, + "step": 33427 + }, + { + "epoch": 0.8459144165802059, + "grad_norm": 9.657734870910645, + "learning_rate": 5.859240676149652e-07, + "loss": 0.2083, + "step": 33428 + }, + { + "epoch": 0.8459397221448997, + "grad_norm": 11.578267097473145, + "learning_rate": 5.85735478480609e-07, + "loss": 0.1995, + "step": 33429 + }, + { + "epoch": 0.8459650277095934, + "grad_norm": 7.813265800476074, + "learning_rate": 5.855469178130247e-07, + "loss": 0.2009, + "step": 33430 + }, + { + "epoch": 0.845990333274287, + "grad_norm": 3.468843698501587, + "learning_rate": 5.853583856134253e-07, + "loss": 0.1309, + "step": 33431 + }, + { + "epoch": 0.8460156388389807, + "grad_norm": 2.8009419441223145, + "learning_rate": 5.851698818830304e-07, + "loss": 0.1125, + "step": 33432 + }, + { + "epoch": 0.8460409444036744, + "grad_norm": 5.411497592926025, + "learning_rate": 5.84981406623053e-07, + "loss": 0.1863, + "step": 33433 + }, + { + "epoch": 0.846066249968368, + "grad_norm": 15.182536125183105, + "learning_rate": 5.847929598347102e-07, + "loss": 0.1286, + "step": 33434 + }, + { + "epoch": 0.8460915555330617, + "grad_norm": 6.758751392364502, + "learning_rate": 5.846045415192142e-07, + "loss": 0.1891, + "step": 33435 + }, + { + "epoch": 0.8461168610977554, + "grad_norm": 8.26193904876709, + "learning_rate": 5.844161516777841e-07, + "loss": 0.2393, + "step": 33436 + }, + { + "epoch": 0.8461421666624491, + "grad_norm": 33.081356048583984, + "learning_rate": 5.842277903116323e-07, + "loss": 0.2194, + "step": 33437 + }, + { + "epoch": 0.8461674722271427, + "grad_norm": 9.157306671142578, + "learning_rate": 5.840394574219744e-07, + "loss": 0.2995, + "step": 33438 + }, + { + "epoch": 0.8461927777918364, + "grad_norm": 4.194432735443115, + "learning_rate": 5.838511530100232e-07, + "loss": 0.1547, + "step": 33439 + }, + { + "epoch": 0.8462180833565301, + "grad_norm": 4.61625337600708, + "learning_rate": 5.836628770769959e-07, + "loss": 0.1604, + "step": 33440 + }, + { + "epoch": 0.8462433889212237, + "grad_norm": 2.387500286102295, + "learning_rate": 5.834746296241051e-07, + "loss": 0.1096, + "step": 33441 + }, + { + "epoch": 0.8462686944859175, + "grad_norm": 5.220970153808594, + "learning_rate": 5.832864106525649e-07, + "loss": 0.1111, + "step": 33442 + }, + { + "epoch": 0.8462940000506112, + "grad_norm": 3.5385687351226807, + "learning_rate": 5.830982201635877e-07, + "loss": 0.1649, + "step": 33443 + }, + { + "epoch": 0.8463193056153048, + "grad_norm": 6.420440673828125, + "learning_rate": 5.829100581583897e-07, + "loss": 0.1671, + "step": 33444 + }, + { + "epoch": 0.8463446111799985, + "grad_norm": 3.933907985687256, + "learning_rate": 5.827219246381833e-07, + "loss": 0.1134, + "step": 33445 + }, + { + "epoch": 0.8463699167446922, + "grad_norm": 4.672604084014893, + "learning_rate": 5.825338196041813e-07, + "loss": 0.1878, + "step": 33446 + }, + { + "epoch": 0.8463952223093858, + "grad_norm": 5.519821643829346, + "learning_rate": 5.823457430575968e-07, + "loss": 0.1622, + "step": 33447 + }, + { + "epoch": 0.8464205278740795, + "grad_norm": 4.747750282287598, + "learning_rate": 5.821576949996421e-07, + "loss": 0.0991, + "step": 33448 + }, + { + "epoch": 0.8464458334387732, + "grad_norm": 5.364999771118164, + "learning_rate": 5.819696754315318e-07, + "loss": 0.169, + "step": 33449 + }, + { + "epoch": 0.8464711390034668, + "grad_norm": 5.514078617095947, + "learning_rate": 5.81781684354476e-07, + "loss": 0.1375, + "step": 33450 + }, + { + "epoch": 0.8464964445681605, + "grad_norm": 3.6610331535339355, + "learning_rate": 5.815937217696904e-07, + "loss": 0.1963, + "step": 33451 + }, + { + "epoch": 0.8465217501328542, + "grad_norm": 5.39054012298584, + "learning_rate": 5.81405787678383e-07, + "loss": 0.1212, + "step": 33452 + }, + { + "epoch": 0.8465470556975478, + "grad_norm": 5.224359512329102, + "learning_rate": 5.812178820817688e-07, + "loss": 0.1379, + "step": 33453 + }, + { + "epoch": 0.8465723612622416, + "grad_norm": 5.808054447174072, + "learning_rate": 5.810300049810574e-07, + "loss": 0.1767, + "step": 33454 + }, + { + "epoch": 0.8465976668269353, + "grad_norm": 6.997715950012207, + "learning_rate": 5.808421563774635e-07, + "loss": 0.2192, + "step": 33455 + }, + { + "epoch": 0.8466229723916289, + "grad_norm": 11.382062911987305, + "learning_rate": 5.806543362721945e-07, + "loss": 0.2836, + "step": 33456 + }, + { + "epoch": 0.8466482779563226, + "grad_norm": 11.2273588180542, + "learning_rate": 5.804665446664648e-07, + "loss": 0.2048, + "step": 33457 + }, + { + "epoch": 0.8466735835210163, + "grad_norm": 15.32251262664795, + "learning_rate": 5.80278781561483e-07, + "loss": 0.2306, + "step": 33458 + }, + { + "epoch": 0.8466988890857099, + "grad_norm": 12.023449897766113, + "learning_rate": 5.800910469584636e-07, + "loss": 0.2181, + "step": 33459 + }, + { + "epoch": 0.8467241946504036, + "grad_norm": 6.902078628540039, + "learning_rate": 5.799033408586124e-07, + "loss": 0.2269, + "step": 33460 + }, + { + "epoch": 0.8467495002150973, + "grad_norm": 8.41650104522705, + "learning_rate": 5.797156632631435e-07, + "loss": 0.2003, + "step": 33461 + }, + { + "epoch": 0.846774805779791, + "grad_norm": 3.929231643676758, + "learning_rate": 5.795280141732651e-07, + "loss": 0.1012, + "step": 33462 + }, + { + "epoch": 0.8468001113444846, + "grad_norm": 2.960759162902832, + "learning_rate": 5.793403935901893e-07, + "loss": 0.0912, + "step": 33463 + }, + { + "epoch": 0.8468254169091783, + "grad_norm": 4.078141212463379, + "learning_rate": 5.791528015151254e-07, + "loss": 0.1004, + "step": 33464 + }, + { + "epoch": 0.846850722473872, + "grad_norm": 8.23147201538086, + "learning_rate": 5.789652379492822e-07, + "loss": 0.1791, + "step": 33465 + }, + { + "epoch": 0.8468760280385657, + "grad_norm": 3.62546968460083, + "learning_rate": 5.787777028938691e-07, + "loss": 0.1047, + "step": 33466 + }, + { + "epoch": 0.8469013336032594, + "grad_norm": 3.3231937885284424, + "learning_rate": 5.785901963500967e-07, + "loss": 0.1289, + "step": 33467 + }, + { + "epoch": 0.8469266391679531, + "grad_norm": 4.352868556976318, + "learning_rate": 5.784027183191742e-07, + "loss": 0.174, + "step": 33468 + }, + { + "epoch": 0.8469519447326467, + "grad_norm": 3.766716957092285, + "learning_rate": 5.782152688023096e-07, + "loss": 0.1236, + "step": 33469 + }, + { + "epoch": 0.8469772502973404, + "grad_norm": 4.541948318481445, + "learning_rate": 5.780278478007117e-07, + "loss": 0.1906, + "step": 33470 + }, + { + "epoch": 0.8470025558620341, + "grad_norm": 12.192912101745605, + "learning_rate": 5.778404553155903e-07, + "loss": 0.1944, + "step": 33471 + }, + { + "epoch": 0.8470278614267277, + "grad_norm": 3.4231019020080566, + "learning_rate": 5.776530913481532e-07, + "loss": 0.0983, + "step": 33472 + }, + { + "epoch": 0.8470531669914214, + "grad_norm": 2.866213321685791, + "learning_rate": 5.774657558996089e-07, + "loss": 0.1362, + "step": 33473 + }, + { + "epoch": 0.8470784725561151, + "grad_norm": 7.284589767456055, + "learning_rate": 5.772784489711652e-07, + "loss": 0.1427, + "step": 33474 + }, + { + "epoch": 0.8471037781208087, + "grad_norm": 3.882256507873535, + "learning_rate": 5.770911705640292e-07, + "loss": 0.1593, + "step": 33475 + }, + { + "epoch": 0.8471290836855024, + "grad_norm": 3.8139259815216064, + "learning_rate": 5.769039206794108e-07, + "loss": 0.1031, + "step": 33476 + }, + { + "epoch": 0.8471543892501961, + "grad_norm": 5.7295823097229, + "learning_rate": 5.767166993185158e-07, + "loss": 0.1352, + "step": 33477 + }, + { + "epoch": 0.8471796948148897, + "grad_norm": 5.025876045227051, + "learning_rate": 5.765295064825521e-07, + "loss": 0.0769, + "step": 33478 + }, + { + "epoch": 0.8472050003795835, + "grad_norm": 3.6948587894439697, + "learning_rate": 5.76342342172726e-07, + "loss": 0.1643, + "step": 33479 + }, + { + "epoch": 0.8472303059442772, + "grad_norm": 5.405813694000244, + "learning_rate": 5.761552063902459e-07, + "loss": 0.1014, + "step": 33480 + }, + { + "epoch": 0.8472556115089708, + "grad_norm": 3.959578514099121, + "learning_rate": 5.759680991363182e-07, + "loss": 0.1344, + "step": 33481 + }, + { + "epoch": 0.8472809170736645, + "grad_norm": 4.346790313720703, + "learning_rate": 5.757810204121494e-07, + "loss": 0.2061, + "step": 33482 + }, + { + "epoch": 0.8473062226383582, + "grad_norm": 8.192277908325195, + "learning_rate": 5.755939702189444e-07, + "loss": 0.2908, + "step": 33483 + }, + { + "epoch": 0.8473315282030518, + "grad_norm": 8.092179298400879, + "learning_rate": 5.754069485579123e-07, + "loss": 0.2475, + "step": 33484 + }, + { + "epoch": 0.8473568337677455, + "grad_norm": 4.878805637359619, + "learning_rate": 5.752199554302567e-07, + "loss": 0.1811, + "step": 33485 + }, + { + "epoch": 0.8473821393324392, + "grad_norm": 6.513771057128906, + "learning_rate": 5.750329908371865e-07, + "loss": 0.1627, + "step": 33486 + }, + { + "epoch": 0.8474074448971329, + "grad_norm": 3.1300461292266846, + "learning_rate": 5.748460547799034e-07, + "loss": 0.1954, + "step": 33487 + }, + { + "epoch": 0.8474327504618265, + "grad_norm": 6.414143085479736, + "learning_rate": 5.746591472596158e-07, + "loss": 0.2116, + "step": 33488 + }, + { + "epoch": 0.8474580560265202, + "grad_norm": 4.706970691680908, + "learning_rate": 5.744722682775272e-07, + "loss": 0.182, + "step": 33489 + }, + { + "epoch": 0.847483361591214, + "grad_norm": 2.8533923625946045, + "learning_rate": 5.742854178348461e-07, + "loss": 0.1035, + "step": 33490 + }, + { + "epoch": 0.8475086671559076, + "grad_norm": 2.052988052368164, + "learning_rate": 5.740985959327727e-07, + "loss": 0.0788, + "step": 33491 + }, + { + "epoch": 0.8475339727206013, + "grad_norm": 5.289855480194092, + "learning_rate": 5.739118025725155e-07, + "loss": 0.1877, + "step": 33492 + }, + { + "epoch": 0.847559278285295, + "grad_norm": 7.308587074279785, + "learning_rate": 5.737250377552766e-07, + "loss": 0.1648, + "step": 33493 + }, + { + "epoch": 0.8475845838499886, + "grad_norm": 10.725226402282715, + "learning_rate": 5.735383014822627e-07, + "loss": 0.3029, + "step": 33494 + }, + { + "epoch": 0.8476098894146823, + "grad_norm": 6.080618381500244, + "learning_rate": 5.733515937546763e-07, + "loss": 0.1508, + "step": 33495 + }, + { + "epoch": 0.847635194979376, + "grad_norm": 3.2180728912353516, + "learning_rate": 5.731649145737228e-07, + "loss": 0.1634, + "step": 33496 + }, + { + "epoch": 0.8476605005440696, + "grad_norm": 2.790113687515259, + "learning_rate": 5.729782639406039e-07, + "loss": 0.1369, + "step": 33497 + }, + { + "epoch": 0.8476858061087633, + "grad_norm": 7.140230655670166, + "learning_rate": 5.727916418565261e-07, + "loss": 0.2144, + "step": 33498 + }, + { + "epoch": 0.847711111673457, + "grad_norm": 17.41697120666504, + "learning_rate": 5.726050483226908e-07, + "loss": 0.1657, + "step": 33499 + }, + { + "epoch": 0.8477364172381506, + "grad_norm": 4.057847023010254, + "learning_rate": 5.724184833403029e-07, + "loss": 0.1778, + "step": 33500 + }, + { + "epoch": 0.8477617228028443, + "grad_norm": 7.0226149559021, + "learning_rate": 5.722319469105642e-07, + "loss": 0.2153, + "step": 33501 + }, + { + "epoch": 0.847787028367538, + "grad_norm": 18.8970947265625, + "learning_rate": 5.720454390346763e-07, + "loss": 0.1867, + "step": 33502 + }, + { + "epoch": 0.8478123339322317, + "grad_norm": 3.8096961975097656, + "learning_rate": 5.718589597138458e-07, + "loss": 0.2024, + "step": 33503 + }, + { + "epoch": 0.8478376394969254, + "grad_norm": 5.008179664611816, + "learning_rate": 5.716725089492726e-07, + "loss": 0.0953, + "step": 33504 + }, + { + "epoch": 0.8478629450616191, + "grad_norm": 4.010308265686035, + "learning_rate": 5.714860867421596e-07, + "loss": 0.0719, + "step": 33505 + }, + { + "epoch": 0.8478882506263127, + "grad_norm": 14.061481475830078, + "learning_rate": 5.712996930937087e-07, + "loss": 0.3664, + "step": 33506 + }, + { + "epoch": 0.8479135561910064, + "grad_norm": 3.4722344875335693, + "learning_rate": 5.711133280051223e-07, + "loss": 0.0784, + "step": 33507 + }, + { + "epoch": 0.8479388617557001, + "grad_norm": 13.253653526306152, + "learning_rate": 5.709269914776028e-07, + "loss": 0.2455, + "step": 33508 + }, + { + "epoch": 0.8479641673203937, + "grad_norm": 7.179503917694092, + "learning_rate": 5.707406835123514e-07, + "loss": 0.1767, + "step": 33509 + }, + { + "epoch": 0.8479894728850874, + "grad_norm": 7.8923659324646, + "learning_rate": 5.705544041105682e-07, + "loss": 0.1434, + "step": 33510 + }, + { + "epoch": 0.8480147784497811, + "grad_norm": 3.7595181465148926, + "learning_rate": 5.703681532734567e-07, + "loss": 0.189, + "step": 33511 + }, + { + "epoch": 0.8480400840144748, + "grad_norm": 7.696050643920898, + "learning_rate": 5.701819310022172e-07, + "loss": 0.1689, + "step": 33512 + }, + { + "epoch": 0.8480653895791684, + "grad_norm": 6.475733280181885, + "learning_rate": 5.699957372980503e-07, + "loss": 0.2203, + "step": 33513 + }, + { + "epoch": 0.8480906951438621, + "grad_norm": 10.907672882080078, + "learning_rate": 5.698095721621555e-07, + "loss": 0.2867, + "step": 33514 + }, + { + "epoch": 0.8481160007085559, + "grad_norm": 3.055830240249634, + "learning_rate": 5.696234355957359e-07, + "loss": 0.0864, + "step": 33515 + }, + { + "epoch": 0.8481413062732495, + "grad_norm": 4.447235584259033, + "learning_rate": 5.694373275999898e-07, + "loss": 0.1321, + "step": 33516 + }, + { + "epoch": 0.8481666118379432, + "grad_norm": 8.656424522399902, + "learning_rate": 5.692512481761204e-07, + "loss": 0.1386, + "step": 33517 + }, + { + "epoch": 0.8481919174026369, + "grad_norm": 8.188615798950195, + "learning_rate": 5.690651973253231e-07, + "loss": 0.1672, + "step": 33518 + }, + { + "epoch": 0.8482172229673305, + "grad_norm": 5.284108638763428, + "learning_rate": 5.688791750488015e-07, + "loss": 0.188, + "step": 33519 + }, + { + "epoch": 0.8482425285320242, + "grad_norm": 2.532057762145996, + "learning_rate": 5.686931813477525e-07, + "loss": 0.083, + "step": 33520 + }, + { + "epoch": 0.8482678340967179, + "grad_norm": 4.200380802154541, + "learning_rate": 5.68507216223379e-07, + "loss": 0.166, + "step": 33521 + }, + { + "epoch": 0.8482931396614115, + "grad_norm": 8.623412132263184, + "learning_rate": 5.683212796768761e-07, + "loss": 0.2309, + "step": 33522 + }, + { + "epoch": 0.8483184452261052, + "grad_norm": 6.333704948425293, + "learning_rate": 5.681353717094462e-07, + "loss": 0.2048, + "step": 33523 + }, + { + "epoch": 0.8483437507907989, + "grad_norm": 5.645310878753662, + "learning_rate": 5.679494923222856e-07, + "loss": 0.1863, + "step": 33524 + }, + { + "epoch": 0.8483690563554925, + "grad_norm": 6.337932586669922, + "learning_rate": 5.677636415165955e-07, + "loss": 0.1669, + "step": 33525 + }, + { + "epoch": 0.8483943619201862, + "grad_norm": 5.242722034454346, + "learning_rate": 5.675778192935732e-07, + "loss": 0.1343, + "step": 33526 + }, + { + "epoch": 0.84841966748488, + "grad_norm": 2.8941526412963867, + "learning_rate": 5.673920256544168e-07, + "loss": 0.138, + "step": 33527 + }, + { + "epoch": 0.8484449730495736, + "grad_norm": 8.531753540039062, + "learning_rate": 5.672062606003247e-07, + "loss": 0.2552, + "step": 33528 + }, + { + "epoch": 0.8484702786142673, + "grad_norm": 5.060338973999023, + "learning_rate": 5.670205241324944e-07, + "loss": 0.162, + "step": 33529 + }, + { + "epoch": 0.848495584178961, + "grad_norm": 7.33607292175293, + "learning_rate": 5.668348162521248e-07, + "loss": 0.2358, + "step": 33530 + }, + { + "epoch": 0.8485208897436546, + "grad_norm": 7.342169284820557, + "learning_rate": 5.666491369604127e-07, + "loss": 0.1227, + "step": 33531 + }, + { + "epoch": 0.8485461953083483, + "grad_norm": 5.80905294418335, + "learning_rate": 5.664634862585555e-07, + "loss": 0.1624, + "step": 33532 + }, + { + "epoch": 0.848571500873042, + "grad_norm": 3.889179229736328, + "learning_rate": 5.662778641477501e-07, + "loss": 0.1196, + "step": 33533 + }, + { + "epoch": 0.8485968064377356, + "grad_norm": 4.200127124786377, + "learning_rate": 5.660922706291949e-07, + "loss": 0.1091, + "step": 33534 + }, + { + "epoch": 0.8486221120024293, + "grad_norm": 12.206060409545898, + "learning_rate": 5.659067057040857e-07, + "loss": 0.1981, + "step": 33535 + }, + { + "epoch": 0.848647417567123, + "grad_norm": 3.6827552318573, + "learning_rate": 5.657211693736192e-07, + "loss": 0.1299, + "step": 33536 + }, + { + "epoch": 0.8486727231318167, + "grad_norm": 7.36631441116333, + "learning_rate": 5.655356616389912e-07, + "loss": 0.1958, + "step": 33537 + }, + { + "epoch": 0.8486980286965103, + "grad_norm": 7.756683826446533, + "learning_rate": 5.653501825013996e-07, + "loss": 0.1408, + "step": 33538 + }, + { + "epoch": 0.848723334261204, + "grad_norm": 4.646448135375977, + "learning_rate": 5.651647319620401e-07, + "loss": 0.1533, + "step": 33539 + }, + { + "epoch": 0.8487486398258978, + "grad_norm": 4.047907829284668, + "learning_rate": 5.649793100221085e-07, + "loss": 0.15, + "step": 33540 + }, + { + "epoch": 0.8487739453905914, + "grad_norm": 2.897251605987549, + "learning_rate": 5.647939166827987e-07, + "loss": 0.1173, + "step": 33541 + }, + { + "epoch": 0.8487992509552851, + "grad_norm": 3.833904266357422, + "learning_rate": 5.646085519453093e-07, + "loss": 0.1511, + "step": 33542 + }, + { + "epoch": 0.8488245565199788, + "grad_norm": 4.215787410736084, + "learning_rate": 5.644232158108342e-07, + "loss": 0.1107, + "step": 33543 + }, + { + "epoch": 0.8488498620846724, + "grad_norm": 8.82027816772461, + "learning_rate": 5.642379082805693e-07, + "loss": 0.1317, + "step": 33544 + }, + { + "epoch": 0.8488751676493661, + "grad_norm": 5.5914387702941895, + "learning_rate": 5.640526293557075e-07, + "loss": 0.2112, + "step": 33545 + }, + { + "epoch": 0.8489004732140598, + "grad_norm": 7.463094711303711, + "learning_rate": 5.63867379037446e-07, + "loss": 0.2012, + "step": 33546 + }, + { + "epoch": 0.8489257787787534, + "grad_norm": 3.6953983306884766, + "learning_rate": 5.63682157326978e-07, + "loss": 0.1113, + "step": 33547 + }, + { + "epoch": 0.8489510843434471, + "grad_norm": 4.501133441925049, + "learning_rate": 5.634969642255001e-07, + "loss": 0.1866, + "step": 33548 + }, + { + "epoch": 0.8489763899081408, + "grad_norm": 6.046019554138184, + "learning_rate": 5.633117997342031e-07, + "loss": 0.1234, + "step": 33549 + }, + { + "epoch": 0.8490016954728344, + "grad_norm": 5.744309425354004, + "learning_rate": 5.631266638542843e-07, + "loss": 0.2023, + "step": 33550 + }, + { + "epoch": 0.8490270010375282, + "grad_norm": 7.005058765411377, + "learning_rate": 5.629415565869351e-07, + "loss": 0.1891, + "step": 33551 + }, + { + "epoch": 0.8490523066022219, + "grad_norm": 8.918737411499023, + "learning_rate": 5.627564779333522e-07, + "loss": 0.2074, + "step": 33552 + }, + { + "epoch": 0.8490776121669155, + "grad_norm": 6.724200248718262, + "learning_rate": 5.625714278947258e-07, + "loss": 0.2318, + "step": 33553 + }, + { + "epoch": 0.8491029177316092, + "grad_norm": 8.822661399841309, + "learning_rate": 5.623864064722517e-07, + "loss": 0.1937, + "step": 33554 + }, + { + "epoch": 0.8491282232963029, + "grad_norm": 6.687267303466797, + "learning_rate": 5.622014136671211e-07, + "loss": 0.1316, + "step": 33555 + }, + { + "epoch": 0.8491535288609965, + "grad_norm": 19.03431510925293, + "learning_rate": 5.620164494805308e-07, + "loss": 0.1593, + "step": 33556 + }, + { + "epoch": 0.8491788344256902, + "grad_norm": 5.229859352111816, + "learning_rate": 5.618315139136682e-07, + "loss": 0.1445, + "step": 33557 + }, + { + "epoch": 0.8492041399903839, + "grad_norm": 13.091062545776367, + "learning_rate": 5.616466069677296e-07, + "loss": 0.1777, + "step": 33558 + }, + { + "epoch": 0.8492294455550775, + "grad_norm": 4.503966331481934, + "learning_rate": 5.614617286439067e-07, + "loss": 0.1175, + "step": 33559 + }, + { + "epoch": 0.8492547511197712, + "grad_norm": 5.163528919219971, + "learning_rate": 5.612768789433904e-07, + "loss": 0.2242, + "step": 33560 + }, + { + "epoch": 0.8492800566844649, + "grad_norm": 7.517011642456055, + "learning_rate": 5.610920578673746e-07, + "loss": 0.1432, + "step": 33561 + }, + { + "epoch": 0.8493053622491585, + "grad_norm": 9.062679290771484, + "learning_rate": 5.609072654170505e-07, + "loss": 0.2572, + "step": 33562 + }, + { + "epoch": 0.8493306678138522, + "grad_norm": 3.5528275966644287, + "learning_rate": 5.607225015936097e-07, + "loss": 0.0923, + "step": 33563 + }, + { + "epoch": 0.849355973378546, + "grad_norm": 6.415464401245117, + "learning_rate": 5.605377663982431e-07, + "loss": 0.1559, + "step": 33564 + }, + { + "epoch": 0.8493812789432397, + "grad_norm": 6.11686897277832, + "learning_rate": 5.603530598321444e-07, + "loss": 0.1879, + "step": 33565 + }, + { + "epoch": 0.8494065845079333, + "grad_norm": 19.858510971069336, + "learning_rate": 5.60168381896501e-07, + "loss": 0.3345, + "step": 33566 + }, + { + "epoch": 0.849431890072627, + "grad_norm": 6.708944320678711, + "learning_rate": 5.599837325925067e-07, + "loss": 0.1594, + "step": 33567 + }, + { + "epoch": 0.8494571956373207, + "grad_norm": 2.3743677139282227, + "learning_rate": 5.597991119213503e-07, + "loss": 0.0677, + "step": 33568 + }, + { + "epoch": 0.8494825012020143, + "grad_norm": 4.705096244812012, + "learning_rate": 5.596145198842245e-07, + "loss": 0.0686, + "step": 33569 + }, + { + "epoch": 0.849507806766708, + "grad_norm": 20.937740325927734, + "learning_rate": 5.594299564823191e-07, + "loss": 0.1718, + "step": 33570 + }, + { + "epoch": 0.8495331123314017, + "grad_norm": 3.347891092300415, + "learning_rate": 5.592454217168236e-07, + "loss": 0.1489, + "step": 33571 + }, + { + "epoch": 0.8495584178960953, + "grad_norm": 2.4745917320251465, + "learning_rate": 5.59060915588927e-07, + "loss": 0.0661, + "step": 33572 + }, + { + "epoch": 0.849583723460789, + "grad_norm": 5.882146835327148, + "learning_rate": 5.588764380998218e-07, + "loss": 0.2021, + "step": 33573 + }, + { + "epoch": 0.8496090290254827, + "grad_norm": 3.2096710205078125, + "learning_rate": 5.586919892506964e-07, + "loss": 0.1236, + "step": 33574 + }, + { + "epoch": 0.8496343345901763, + "grad_norm": 4.435856819152832, + "learning_rate": 5.585075690427405e-07, + "loss": 0.086, + "step": 33575 + }, + { + "epoch": 0.84965964015487, + "grad_norm": 3.260392427444458, + "learning_rate": 5.583231774771419e-07, + "loss": 0.1379, + "step": 33576 + }, + { + "epoch": 0.8496849457195638, + "grad_norm": 4.881534576416016, + "learning_rate": 5.58138814555092e-07, + "loss": 0.1654, + "step": 33577 + }, + { + "epoch": 0.8497102512842574, + "grad_norm": 8.227416038513184, + "learning_rate": 5.57954480277777e-07, + "loss": 0.1986, + "step": 33578 + }, + { + "epoch": 0.8497355568489511, + "grad_norm": 3.5940616130828857, + "learning_rate": 5.577701746463898e-07, + "loss": 0.1561, + "step": 33579 + }, + { + "epoch": 0.8497608624136448, + "grad_norm": 1.923506498336792, + "learning_rate": 5.575858976621146e-07, + "loss": 0.0636, + "step": 33580 + }, + { + "epoch": 0.8497861679783384, + "grad_norm": 3.325300693511963, + "learning_rate": 5.574016493261425e-07, + "loss": 0.0979, + "step": 33581 + }, + { + "epoch": 0.8498114735430321, + "grad_norm": 10.007744789123535, + "learning_rate": 5.572174296396593e-07, + "loss": 0.2239, + "step": 33582 + }, + { + "epoch": 0.8498367791077258, + "grad_norm": 4.063438892364502, + "learning_rate": 5.570332386038569e-07, + "loss": 0.1381, + "step": 33583 + }, + { + "epoch": 0.8498620846724194, + "grad_norm": 10.202832221984863, + "learning_rate": 5.568490762199186e-07, + "loss": 0.1676, + "step": 33584 + }, + { + "epoch": 0.8498873902371131, + "grad_norm": 6.8309102058410645, + "learning_rate": 5.566649424890352e-07, + "loss": 0.0942, + "step": 33585 + }, + { + "epoch": 0.8499126958018068, + "grad_norm": 3.678433656692505, + "learning_rate": 5.564808374123932e-07, + "loss": 0.0851, + "step": 33586 + }, + { + "epoch": 0.8499380013665004, + "grad_norm": 2.6136693954467773, + "learning_rate": 5.56296760991179e-07, + "loss": 0.0878, + "step": 33587 + }, + { + "epoch": 0.8499633069311942, + "grad_norm": 4.055866241455078, + "learning_rate": 5.561127132265804e-07, + "loss": 0.1374, + "step": 33588 + }, + { + "epoch": 0.8499886124958879, + "grad_norm": 4.026828765869141, + "learning_rate": 5.559286941197845e-07, + "loss": 0.1047, + "step": 33589 + }, + { + "epoch": 0.8500139180605816, + "grad_norm": 4.799544811248779, + "learning_rate": 5.557447036719782e-07, + "loss": 0.1512, + "step": 33590 + }, + { + "epoch": 0.8500392236252752, + "grad_norm": 6.166427135467529, + "learning_rate": 5.555607418843461e-07, + "loss": 0.2654, + "step": 33591 + }, + { + "epoch": 0.8500645291899689, + "grad_norm": 7.928749084472656, + "learning_rate": 5.553768087580774e-07, + "loss": 0.1701, + "step": 33592 + }, + { + "epoch": 0.8500898347546626, + "grad_norm": 7.280150890350342, + "learning_rate": 5.551929042943572e-07, + "loss": 0.1599, + "step": 33593 + }, + { + "epoch": 0.8501151403193562, + "grad_norm": 3.3113372325897217, + "learning_rate": 5.55009028494371e-07, + "loss": 0.1363, + "step": 33594 + }, + { + "epoch": 0.8501404458840499, + "grad_norm": 3.0061898231506348, + "learning_rate": 5.548251813593036e-07, + "loss": 0.0837, + "step": 33595 + }, + { + "epoch": 0.8501657514487436, + "grad_norm": 5.704747676849365, + "learning_rate": 5.546413628903435e-07, + "loss": 0.137, + "step": 33596 + }, + { + "epoch": 0.8501910570134372, + "grad_norm": 4.301886081695557, + "learning_rate": 5.544575730886726e-07, + "loss": 0.1263, + "step": 33597 + }, + { + "epoch": 0.8502163625781309, + "grad_norm": 5.114437580108643, + "learning_rate": 5.542738119554791e-07, + "loss": 0.1533, + "step": 33598 + }, + { + "epoch": 0.8502416681428246, + "grad_norm": 2.8088269233703613, + "learning_rate": 5.540900794919458e-07, + "loss": 0.0579, + "step": 33599 + }, + { + "epoch": 0.8502669737075182, + "grad_norm": 3.8879406452178955, + "learning_rate": 5.539063756992591e-07, + "loss": 0.1584, + "step": 33600 + }, + { + "epoch": 0.850292279272212, + "grad_norm": 3.405653476715088, + "learning_rate": 5.537227005786033e-07, + "loss": 0.1438, + "step": 33601 + }, + { + "epoch": 0.8503175848369057, + "grad_norm": 4.306232929229736, + "learning_rate": 5.53539054131163e-07, + "loss": 0.1148, + "step": 33602 + }, + { + "epoch": 0.8503428904015993, + "grad_norm": 7.120090007781982, + "learning_rate": 5.533554363581212e-07, + "loss": 0.1637, + "step": 33603 + }, + { + "epoch": 0.850368195966293, + "grad_norm": 3.5602424144744873, + "learning_rate": 5.53171847260664e-07, + "loss": 0.1327, + "step": 33604 + }, + { + "epoch": 0.8503935015309867, + "grad_norm": 5.4306488037109375, + "learning_rate": 5.529882868399744e-07, + "loss": 0.1721, + "step": 33605 + }, + { + "epoch": 0.8504188070956803, + "grad_norm": 6.075951099395752, + "learning_rate": 5.52804755097236e-07, + "loss": 0.1418, + "step": 33606 + }, + { + "epoch": 0.850444112660374, + "grad_norm": 6.49399471282959, + "learning_rate": 5.526212520336316e-07, + "loss": 0.1815, + "step": 33607 + }, + { + "epoch": 0.8504694182250677, + "grad_norm": 4.868690490722656, + "learning_rate": 5.524377776503464e-07, + "loss": 0.1383, + "step": 33608 + }, + { + "epoch": 0.8504947237897613, + "grad_norm": 5.424450874328613, + "learning_rate": 5.522543319485629e-07, + "loss": 0.1696, + "step": 33609 + }, + { + "epoch": 0.850520029354455, + "grad_norm": 4.373137950897217, + "learning_rate": 5.520709149294634e-07, + "loss": 0.1509, + "step": 33610 + }, + { + "epoch": 0.8505453349191487, + "grad_norm": 2.279665946960449, + "learning_rate": 5.518875265942303e-07, + "loss": 0.0803, + "step": 33611 + }, + { + "epoch": 0.8505706404838423, + "grad_norm": 7.056861400604248, + "learning_rate": 5.517041669440487e-07, + "loss": 0.0888, + "step": 33612 + }, + { + "epoch": 0.8505959460485361, + "grad_norm": 9.28199577331543, + "learning_rate": 5.515208359800989e-07, + "loss": 0.1741, + "step": 33613 + }, + { + "epoch": 0.8506212516132298, + "grad_norm": 5.306771278381348, + "learning_rate": 5.513375337035637e-07, + "loss": 0.0987, + "step": 33614 + }, + { + "epoch": 0.8506465571779235, + "grad_norm": 3.783515214920044, + "learning_rate": 5.511542601156244e-07, + "loss": 0.1207, + "step": 33615 + }, + { + "epoch": 0.8506718627426171, + "grad_norm": 5.226097583770752, + "learning_rate": 5.509710152174652e-07, + "loss": 0.1854, + "step": 33616 + }, + { + "epoch": 0.8506971683073108, + "grad_norm": 7.844667911529541, + "learning_rate": 5.507877990102656e-07, + "loss": 0.2036, + "step": 33617 + }, + { + "epoch": 0.8507224738720045, + "grad_norm": 5.098901271820068, + "learning_rate": 5.506046114952085e-07, + "loss": 0.1101, + "step": 33618 + }, + { + "epoch": 0.8507477794366981, + "grad_norm": 5.363772392272949, + "learning_rate": 5.50421452673473e-07, + "loss": 0.1919, + "step": 33619 + }, + { + "epoch": 0.8507730850013918, + "grad_norm": 3.5295934677124023, + "learning_rate": 5.502383225462432e-07, + "loss": 0.0987, + "step": 33620 + }, + { + "epoch": 0.8507983905660855, + "grad_norm": 4.6555328369140625, + "learning_rate": 5.500552211146987e-07, + "loss": 0.2373, + "step": 33621 + }, + { + "epoch": 0.8508236961307791, + "grad_norm": 7.5800347328186035, + "learning_rate": 5.498721483800195e-07, + "loss": 0.1514, + "step": 33622 + }, + { + "epoch": 0.8508490016954728, + "grad_norm": 6.759122848510742, + "learning_rate": 5.496891043433888e-07, + "loss": 0.1693, + "step": 33623 + }, + { + "epoch": 0.8508743072601666, + "grad_norm": 3.013010025024414, + "learning_rate": 5.495060890059834e-07, + "loss": 0.1208, + "step": 33624 + }, + { + "epoch": 0.8508996128248602, + "grad_norm": 5.9739766120910645, + "learning_rate": 5.49323102368986e-07, + "loss": 0.1778, + "step": 33625 + }, + { + "epoch": 0.8509249183895539, + "grad_norm": 4.7573981285095215, + "learning_rate": 5.491401444335753e-07, + "loss": 0.1298, + "step": 33626 + }, + { + "epoch": 0.8509502239542476, + "grad_norm": 7.816723346710205, + "learning_rate": 5.489572152009337e-07, + "loss": 0.1678, + "step": 33627 + }, + { + "epoch": 0.8509755295189412, + "grad_norm": 5.468236446380615, + "learning_rate": 5.487743146722374e-07, + "loss": 0.176, + "step": 33628 + }, + { + "epoch": 0.8510008350836349, + "grad_norm": 6.754892826080322, + "learning_rate": 5.485914428486683e-07, + "loss": 0.1864, + "step": 33629 + }, + { + "epoch": 0.8510261406483286, + "grad_norm": 6.449840068817139, + "learning_rate": 5.484085997314042e-07, + "loss": 0.2218, + "step": 33630 + }, + { + "epoch": 0.8510514462130222, + "grad_norm": 5.248301029205322, + "learning_rate": 5.482257853216266e-07, + "loss": 0.1492, + "step": 33631 + }, + { + "epoch": 0.8510767517777159, + "grad_norm": 6.17575216293335, + "learning_rate": 5.480429996205111e-07, + "loss": 0.1474, + "step": 33632 + }, + { + "epoch": 0.8511020573424096, + "grad_norm": 3.466381549835205, + "learning_rate": 5.478602426292396e-07, + "loss": 0.1131, + "step": 33633 + }, + { + "epoch": 0.8511273629071032, + "grad_norm": 4.442469596862793, + "learning_rate": 5.476775143489877e-07, + "loss": 0.1508, + "step": 33634 + }, + { + "epoch": 0.8511526684717969, + "grad_norm": 4.203145503997803, + "learning_rate": 5.474948147809361e-07, + "loss": 0.1292, + "step": 33635 + }, + { + "epoch": 0.8511779740364906, + "grad_norm": 3.1617698669433594, + "learning_rate": 5.473121439262624e-07, + "loss": 0.0907, + "step": 33636 + }, + { + "epoch": 0.8512032796011842, + "grad_norm": 5.907866477966309, + "learning_rate": 5.471295017861444e-07, + "loss": 0.1449, + "step": 33637 + }, + { + "epoch": 0.851228585165878, + "grad_norm": 5.165882110595703, + "learning_rate": 5.46946888361759e-07, + "loss": 0.165, + "step": 33638 + }, + { + "epoch": 0.8512538907305717, + "grad_norm": 3.5463643074035645, + "learning_rate": 5.467643036542864e-07, + "loss": 0.1187, + "step": 33639 + }, + { + "epoch": 0.8512791962952654, + "grad_norm": 3.3520925045013428, + "learning_rate": 5.465817476649015e-07, + "loss": 0.1344, + "step": 33640 + }, + { + "epoch": 0.851304501859959, + "grad_norm": 4.257792949676514, + "learning_rate": 5.463992203947827e-07, + "loss": 0.1468, + "step": 33641 + }, + { + "epoch": 0.8513298074246527, + "grad_norm": 5.906492710113525, + "learning_rate": 5.462167218451064e-07, + "loss": 0.1868, + "step": 33642 + }, + { + "epoch": 0.8513551129893464, + "grad_norm": 5.754331111907959, + "learning_rate": 5.460342520170503e-07, + "loss": 0.1865, + "step": 33643 + }, + { + "epoch": 0.85138041855404, + "grad_norm": 5.966792106628418, + "learning_rate": 5.458518109117916e-07, + "loss": 0.1697, + "step": 33644 + }, + { + "epoch": 0.8514057241187337, + "grad_norm": 2.036578416824341, + "learning_rate": 5.456693985305056e-07, + "loss": 0.114, + "step": 33645 + }, + { + "epoch": 0.8514310296834274, + "grad_norm": 17.615787506103516, + "learning_rate": 5.45487014874368e-07, + "loss": 0.1471, + "step": 33646 + }, + { + "epoch": 0.851456335248121, + "grad_norm": 2.5626962184906006, + "learning_rate": 5.453046599445572e-07, + "loss": 0.1042, + "step": 33647 + }, + { + "epoch": 0.8514816408128147, + "grad_norm": 2.4740054607391357, + "learning_rate": 5.451223337422479e-07, + "loss": 0.0904, + "step": 33648 + }, + { + "epoch": 0.8515069463775085, + "grad_norm": 3.4593915939331055, + "learning_rate": 5.449400362686158e-07, + "loss": 0.1115, + "step": 33649 + }, + { + "epoch": 0.8515322519422021, + "grad_norm": 3.5379831790924072, + "learning_rate": 5.447577675248372e-07, + "loss": 0.1261, + "step": 33650 + }, + { + "epoch": 0.8515575575068958, + "grad_norm": 8.519693374633789, + "learning_rate": 5.445755275120856e-07, + "loss": 0.1829, + "step": 33651 + }, + { + "epoch": 0.8515828630715895, + "grad_norm": 6.178302764892578, + "learning_rate": 5.44393316231539e-07, + "loss": 0.2402, + "step": 33652 + }, + { + "epoch": 0.8516081686362831, + "grad_norm": 5.8208794593811035, + "learning_rate": 5.442111336843698e-07, + "loss": 0.1523, + "step": 33653 + }, + { + "epoch": 0.8516334742009768, + "grad_norm": 3.86730694770813, + "learning_rate": 5.440289798717563e-07, + "loss": 0.1476, + "step": 33654 + }, + { + "epoch": 0.8516587797656705, + "grad_norm": 4.813765525817871, + "learning_rate": 5.438468547948689e-07, + "loss": 0.1484, + "step": 33655 + }, + { + "epoch": 0.8516840853303641, + "grad_norm": 8.579389572143555, + "learning_rate": 5.436647584548854e-07, + "loss": 0.2467, + "step": 33656 + }, + { + "epoch": 0.8517093908950578, + "grad_norm": 8.662615776062012, + "learning_rate": 5.434826908529778e-07, + "loss": 0.1485, + "step": 33657 + }, + { + "epoch": 0.8517346964597515, + "grad_norm": 9.13259506225586, + "learning_rate": 5.433006519903234e-07, + "loss": 0.2519, + "step": 33658 + }, + { + "epoch": 0.8517600020244451, + "grad_norm": 7.134689807891846, + "learning_rate": 5.431186418680923e-07, + "loss": 0.136, + "step": 33659 + }, + { + "epoch": 0.8517853075891388, + "grad_norm": 6.227250099182129, + "learning_rate": 5.429366604874608e-07, + "loss": 0.1509, + "step": 33660 + }, + { + "epoch": 0.8518106131538326, + "grad_norm": 6.348431587219238, + "learning_rate": 5.427547078496003e-07, + "loss": 0.1424, + "step": 33661 + }, + { + "epoch": 0.8518359187185262, + "grad_norm": 3.0738110542297363, + "learning_rate": 5.425727839556883e-07, + "loss": 0.0794, + "step": 33662 + }, + { + "epoch": 0.8518612242832199, + "grad_norm": 7.735090732574463, + "learning_rate": 5.42390888806893e-07, + "loss": 0.2128, + "step": 33663 + }, + { + "epoch": 0.8518865298479136, + "grad_norm": 4.289638996124268, + "learning_rate": 5.422090224043908e-07, + "loss": 0.0936, + "step": 33664 + }, + { + "epoch": 0.8519118354126072, + "grad_norm": 5.86553430557251, + "learning_rate": 5.420271847493519e-07, + "loss": 0.1052, + "step": 33665 + }, + { + "epoch": 0.8519371409773009, + "grad_norm": 13.389758110046387, + "learning_rate": 5.41845375842952e-07, + "loss": 0.2403, + "step": 33666 + }, + { + "epoch": 0.8519624465419946, + "grad_norm": 5.977199554443359, + "learning_rate": 5.416635956863619e-07, + "loss": 0.1672, + "step": 33667 + }, + { + "epoch": 0.8519877521066883, + "grad_norm": 20.173370361328125, + "learning_rate": 5.414818442807534e-07, + "loss": 0.2386, + "step": 33668 + }, + { + "epoch": 0.8520130576713819, + "grad_norm": 4.106274127960205, + "learning_rate": 5.41300121627299e-07, + "loss": 0.1364, + "step": 33669 + }, + { + "epoch": 0.8520383632360756, + "grad_norm": 7.174745082855225, + "learning_rate": 5.411184277271714e-07, + "loss": 0.1479, + "step": 33670 + }, + { + "epoch": 0.8520636688007693, + "grad_norm": 4.181479454040527, + "learning_rate": 5.409367625815415e-07, + "loss": 0.1153, + "step": 33671 + }, + { + "epoch": 0.8520889743654629, + "grad_norm": 4.286567211151123, + "learning_rate": 5.407551261915811e-07, + "loss": 0.1398, + "step": 33672 + }, + { + "epoch": 0.8521142799301566, + "grad_norm": 8.769265174865723, + "learning_rate": 5.405735185584599e-07, + "loss": 0.2542, + "step": 33673 + }, + { + "epoch": 0.8521395854948504, + "grad_norm": 4.776700973510742, + "learning_rate": 5.403919396833523e-07, + "loss": 0.1033, + "step": 33674 + }, + { + "epoch": 0.852164891059544, + "grad_norm": 9.501921653747559, + "learning_rate": 5.402103895674271e-07, + "loss": 0.2519, + "step": 33675 + }, + { + "epoch": 0.8521901966242377, + "grad_norm": 10.247429847717285, + "learning_rate": 5.400288682118555e-07, + "loss": 0.2943, + "step": 33676 + }, + { + "epoch": 0.8522155021889314, + "grad_norm": 3.8919622898101807, + "learning_rate": 5.398473756178074e-07, + "loss": 0.0933, + "step": 33677 + }, + { + "epoch": 0.852240807753625, + "grad_norm": 3.6860454082489014, + "learning_rate": 5.39665911786455e-07, + "loss": 0.0989, + "step": 33678 + }, + { + "epoch": 0.8522661133183187, + "grad_norm": 4.9224853515625, + "learning_rate": 5.394844767189672e-07, + "loss": 0.1714, + "step": 33679 + }, + { + "epoch": 0.8522914188830124, + "grad_norm": 3.7918026447296143, + "learning_rate": 5.393030704165142e-07, + "loss": 0.1348, + "step": 33680 + }, + { + "epoch": 0.852316724447706, + "grad_norm": 5.212862968444824, + "learning_rate": 5.391216928802661e-07, + "loss": 0.1102, + "step": 33681 + }, + { + "epoch": 0.8523420300123997, + "grad_norm": 3.8382010459899902, + "learning_rate": 5.38940344111391e-07, + "loss": 0.1133, + "step": 33682 + }, + { + "epoch": 0.8523673355770934, + "grad_norm": 4.937816143035889, + "learning_rate": 5.387590241110613e-07, + "loss": 0.0869, + "step": 33683 + }, + { + "epoch": 0.852392641141787, + "grad_norm": 3.545570135116577, + "learning_rate": 5.385777328804448e-07, + "loss": 0.1389, + "step": 33684 + }, + { + "epoch": 0.8524179467064807, + "grad_norm": 5.435822010040283, + "learning_rate": 5.383964704207107e-07, + "loss": 0.2906, + "step": 33685 + }, + { + "epoch": 0.8524432522711745, + "grad_norm": 4.354265213012695, + "learning_rate": 5.382152367330262e-07, + "loss": 0.1507, + "step": 33686 + }, + { + "epoch": 0.8524685578358681, + "grad_norm": 4.0855913162231445, + "learning_rate": 5.380340318185634e-07, + "loss": 0.1457, + "step": 33687 + }, + { + "epoch": 0.8524938634005618, + "grad_norm": 3.2095539569854736, + "learning_rate": 5.378528556784878e-07, + "loss": 0.1201, + "step": 33688 + }, + { + "epoch": 0.8525191689652555, + "grad_norm": 6.960980415344238, + "learning_rate": 5.37671708313971e-07, + "loss": 0.2006, + "step": 33689 + }, + { + "epoch": 0.8525444745299491, + "grad_norm": 7.676526069641113, + "learning_rate": 5.374905897261779e-07, + "loss": 0.2062, + "step": 33690 + }, + { + "epoch": 0.8525697800946428, + "grad_norm": 5.163314342498779, + "learning_rate": 5.373094999162781e-07, + "loss": 0.1577, + "step": 33691 + }, + { + "epoch": 0.8525950856593365, + "grad_norm": 4.367490291595459, + "learning_rate": 5.371284388854386e-07, + "loss": 0.1588, + "step": 33692 + }, + { + "epoch": 0.8526203912240302, + "grad_norm": 7.575099945068359, + "learning_rate": 5.369474066348296e-07, + "loss": 0.1963, + "step": 33693 + }, + { + "epoch": 0.8526456967887238, + "grad_norm": 7.715795040130615, + "learning_rate": 5.367664031656144e-07, + "loss": 0.1894, + "step": 33694 + }, + { + "epoch": 0.8526710023534175, + "grad_norm": 3.0480802059173584, + "learning_rate": 5.36585428478964e-07, + "loss": 0.1277, + "step": 33695 + }, + { + "epoch": 0.8526963079181112, + "grad_norm": 7.217062950134277, + "learning_rate": 5.364044825760422e-07, + "loss": 0.1899, + "step": 33696 + }, + { + "epoch": 0.8527216134828048, + "grad_norm": 4.130765438079834, + "learning_rate": 5.362235654580189e-07, + "loss": 0.1494, + "step": 33697 + }, + { + "epoch": 0.8527469190474986, + "grad_norm": 5.114288330078125, + "learning_rate": 5.360426771260596e-07, + "loss": 0.1408, + "step": 33698 + }, + { + "epoch": 0.8527722246121923, + "grad_norm": 9.045584678649902, + "learning_rate": 5.358618175813307e-07, + "loss": 0.2117, + "step": 33699 + }, + { + "epoch": 0.8527975301768859, + "grad_norm": 3.2214083671569824, + "learning_rate": 5.356809868249974e-07, + "loss": 0.1373, + "step": 33700 + }, + { + "epoch": 0.8528228357415796, + "grad_norm": 2.9134795665740967, + "learning_rate": 5.355001848582281e-07, + "loss": 0.0908, + "step": 33701 + }, + { + "epoch": 0.8528481413062733, + "grad_norm": 7.686528205871582, + "learning_rate": 5.35319411682188e-07, + "loss": 0.1955, + "step": 33702 + }, + { + "epoch": 0.8528734468709669, + "grad_norm": 4.108429908752441, + "learning_rate": 5.351386672980418e-07, + "loss": 0.152, + "step": 33703 + }, + { + "epoch": 0.8528987524356606, + "grad_norm": 5.775064468383789, + "learning_rate": 5.349579517069553e-07, + "loss": 0.0992, + "step": 33704 + }, + { + "epoch": 0.8529240580003543, + "grad_norm": 6.613089084625244, + "learning_rate": 5.347772649100951e-07, + "loss": 0.198, + "step": 33705 + }, + { + "epoch": 0.8529493635650479, + "grad_norm": 6.62906551361084, + "learning_rate": 5.345966069086261e-07, + "loss": 0.1647, + "step": 33706 + }, + { + "epoch": 0.8529746691297416, + "grad_norm": 5.022942543029785, + "learning_rate": 5.344159777037128e-07, + "loss": 0.1172, + "step": 33707 + }, + { + "epoch": 0.8529999746944353, + "grad_norm": 4.565126895904541, + "learning_rate": 5.342353772965203e-07, + "loss": 0.1689, + "step": 33708 + }, + { + "epoch": 0.8530252802591289, + "grad_norm": 5.878276348114014, + "learning_rate": 5.340548056882117e-07, + "loss": 0.1264, + "step": 33709 + }, + { + "epoch": 0.8530505858238226, + "grad_norm": 4.1482110023498535, + "learning_rate": 5.338742628799543e-07, + "loss": 0.1054, + "step": 33710 + }, + { + "epoch": 0.8530758913885164, + "grad_norm": 10.028802871704102, + "learning_rate": 5.33693748872911e-07, + "loss": 0.1366, + "step": 33711 + }, + { + "epoch": 0.85310119695321, + "grad_norm": 2.591142177581787, + "learning_rate": 5.335132636682461e-07, + "loss": 0.1481, + "step": 33712 + }, + { + "epoch": 0.8531265025179037, + "grad_norm": 6.370871543884277, + "learning_rate": 5.333328072671217e-07, + "loss": 0.1995, + "step": 33713 + }, + { + "epoch": 0.8531518080825974, + "grad_norm": 12.2783842086792, + "learning_rate": 5.33152379670705e-07, + "loss": 0.3375, + "step": 33714 + }, + { + "epoch": 0.853177113647291, + "grad_norm": 4.872662544250488, + "learning_rate": 5.32971980880157e-07, + "loss": 0.1507, + "step": 33715 + }, + { + "epoch": 0.8532024192119847, + "grad_norm": 3.3957979679107666, + "learning_rate": 5.327916108966419e-07, + "loss": 0.1217, + "step": 33716 + }, + { + "epoch": 0.8532277247766784, + "grad_norm": 2.436640977859497, + "learning_rate": 5.32611269721322e-07, + "loss": 0.0643, + "step": 33717 + }, + { + "epoch": 0.8532530303413721, + "grad_norm": 7.201639652252197, + "learning_rate": 5.324309573553621e-07, + "loss": 0.2389, + "step": 33718 + }, + { + "epoch": 0.8532783359060657, + "grad_norm": 2.4742321968078613, + "learning_rate": 5.322506737999222e-07, + "loss": 0.0695, + "step": 33719 + }, + { + "epoch": 0.8533036414707594, + "grad_norm": 5.755227565765381, + "learning_rate": 5.320704190561693e-07, + "loss": 0.1463, + "step": 33720 + }, + { + "epoch": 0.8533289470354531, + "grad_norm": 3.073857545852661, + "learning_rate": 5.318901931252613e-07, + "loss": 0.1029, + "step": 33721 + }, + { + "epoch": 0.8533542526001467, + "grad_norm": 6.95205545425415, + "learning_rate": 5.317099960083627e-07, + "loss": 0.1878, + "step": 33722 + }, + { + "epoch": 0.8533795581648405, + "grad_norm": 6.27427864074707, + "learning_rate": 5.315298277066344e-07, + "loss": 0.1325, + "step": 33723 + }, + { + "epoch": 0.8534048637295342, + "grad_norm": 3.029754161834717, + "learning_rate": 5.313496882212415e-07, + "loss": 0.1664, + "step": 33724 + }, + { + "epoch": 0.8534301692942278, + "grad_norm": 5.600599765777588, + "learning_rate": 5.31169577553341e-07, + "loss": 0.1701, + "step": 33725 + }, + { + "epoch": 0.8534554748589215, + "grad_norm": 10.600615501403809, + "learning_rate": 5.309894957040973e-07, + "loss": 0.0946, + "step": 33726 + }, + { + "epoch": 0.8534807804236152, + "grad_norm": 12.207962036132812, + "learning_rate": 5.308094426746701e-07, + "loss": 0.2313, + "step": 33727 + }, + { + "epoch": 0.8535060859883088, + "grad_norm": 4.032952308654785, + "learning_rate": 5.306294184662225e-07, + "loss": 0.1339, + "step": 33728 + }, + { + "epoch": 0.8535313915530025, + "grad_norm": 10.004382133483887, + "learning_rate": 5.304494230799146e-07, + "loss": 0.1996, + "step": 33729 + }, + { + "epoch": 0.8535566971176962, + "grad_norm": 3.9592714309692383, + "learning_rate": 5.30269456516907e-07, + "loss": 0.1533, + "step": 33730 + }, + { + "epoch": 0.8535820026823898, + "grad_norm": 3.6783015727996826, + "learning_rate": 5.300895187783594e-07, + "loss": 0.0606, + "step": 33731 + }, + { + "epoch": 0.8536073082470835, + "grad_norm": 9.011605262756348, + "learning_rate": 5.299096098654338e-07, + "loss": 0.3566, + "step": 33732 + }, + { + "epoch": 0.8536326138117772, + "grad_norm": 12.717167854309082, + "learning_rate": 5.297297297792892e-07, + "loss": 0.2135, + "step": 33733 + }, + { + "epoch": 0.8536579193764708, + "grad_norm": 3.4269094467163086, + "learning_rate": 5.295498785210868e-07, + "loss": 0.1334, + "step": 33734 + }, + { + "epoch": 0.8536832249411646, + "grad_norm": 4.433855056762695, + "learning_rate": 5.293700560919851e-07, + "loss": 0.1895, + "step": 33735 + }, + { + "epoch": 0.8537085305058583, + "grad_norm": 5.84534215927124, + "learning_rate": 5.291902624931433e-07, + "loss": 0.163, + "step": 33736 + }, + { + "epoch": 0.8537338360705519, + "grad_norm": 4.073398590087891, + "learning_rate": 5.290104977257232e-07, + "loss": 0.1208, + "step": 33737 + }, + { + "epoch": 0.8537591416352456, + "grad_norm": 6.079331874847412, + "learning_rate": 5.288307617908827e-07, + "loss": 0.1356, + "step": 33738 + }, + { + "epoch": 0.8537844471999393, + "grad_norm": 7.2659382820129395, + "learning_rate": 5.286510546897811e-07, + "loss": 0.205, + "step": 33739 + }, + { + "epoch": 0.8538097527646329, + "grad_norm": 4.835339069366455, + "learning_rate": 5.284713764235755e-07, + "loss": 0.0937, + "step": 33740 + }, + { + "epoch": 0.8538350583293266, + "grad_norm": 6.9925761222839355, + "learning_rate": 5.282917269934274e-07, + "loss": 0.1715, + "step": 33741 + }, + { + "epoch": 0.8538603638940203, + "grad_norm": 4.745967388153076, + "learning_rate": 5.281121064004946e-07, + "loss": 0.1267, + "step": 33742 + }, + { + "epoch": 0.853885669458714, + "grad_norm": 3.666654109954834, + "learning_rate": 5.279325146459347e-07, + "loss": 0.1373, + "step": 33743 + }, + { + "epoch": 0.8539109750234076, + "grad_norm": 12.880736351013184, + "learning_rate": 5.277529517309049e-07, + "loss": 0.3303, + "step": 33744 + }, + { + "epoch": 0.8539362805881013, + "grad_norm": 2.8808517456054688, + "learning_rate": 5.275734176565656e-07, + "loss": 0.1183, + "step": 33745 + }, + { + "epoch": 0.853961586152795, + "grad_norm": 7.69525146484375, + "learning_rate": 5.273939124240735e-07, + "loss": 0.2258, + "step": 33746 + }, + { + "epoch": 0.8539868917174886, + "grad_norm": 3.318148374557495, + "learning_rate": 5.27214436034586e-07, + "loss": 0.1117, + "step": 33747 + }, + { + "epoch": 0.8540121972821824, + "grad_norm": 5.535315990447998, + "learning_rate": 5.270349884892601e-07, + "loss": 0.2108, + "step": 33748 + }, + { + "epoch": 0.8540375028468761, + "grad_norm": 4.619641304016113, + "learning_rate": 5.268555697892536e-07, + "loss": 0.1805, + "step": 33749 + }, + { + "epoch": 0.8540628084115697, + "grad_norm": 5.6766486167907715, + "learning_rate": 5.266761799357229e-07, + "loss": 0.1059, + "step": 33750 + }, + { + "epoch": 0.8540881139762634, + "grad_norm": 2.6549482345581055, + "learning_rate": 5.264968189298275e-07, + "loss": 0.098, + "step": 33751 + }, + { + "epoch": 0.8541134195409571, + "grad_norm": 6.234856605529785, + "learning_rate": 5.263174867727199e-07, + "loss": 0.1432, + "step": 33752 + }, + { + "epoch": 0.8541387251056507, + "grad_norm": 2.9460830688476562, + "learning_rate": 5.261381834655599e-07, + "loss": 0.1081, + "step": 33753 + }, + { + "epoch": 0.8541640306703444, + "grad_norm": 3.501718282699585, + "learning_rate": 5.259589090095013e-07, + "loss": 0.1381, + "step": 33754 + }, + { + "epoch": 0.8541893362350381, + "grad_norm": 8.356037139892578, + "learning_rate": 5.257796634057033e-07, + "loss": 0.1954, + "step": 33755 + }, + { + "epoch": 0.8542146417997317, + "grad_norm": 2.2881195545196533, + "learning_rate": 5.256004466553183e-07, + "loss": 0.0724, + "step": 33756 + }, + { + "epoch": 0.8542399473644254, + "grad_norm": 4.95217227935791, + "learning_rate": 5.254212587595048e-07, + "loss": 0.1397, + "step": 33757 + }, + { + "epoch": 0.8542652529291191, + "grad_norm": 14.15433120727539, + "learning_rate": 5.252420997194158e-07, + "loss": 0.2505, + "step": 33758 + }, + { + "epoch": 0.8542905584938127, + "grad_norm": 3.126140594482422, + "learning_rate": 5.250629695362103e-07, + "loss": 0.1199, + "step": 33759 + }, + { + "epoch": 0.8543158640585065, + "grad_norm": 4.50132417678833, + "learning_rate": 5.248838682110396e-07, + "loss": 0.1907, + "step": 33760 + }, + { + "epoch": 0.8543411696232002, + "grad_norm": 4.4902801513671875, + "learning_rate": 5.247047957450613e-07, + "loss": 0.1186, + "step": 33761 + }, + { + "epoch": 0.8543664751878938, + "grad_norm": 3.650014877319336, + "learning_rate": 5.245257521394281e-07, + "loss": 0.0804, + "step": 33762 + }, + { + "epoch": 0.8543917807525875, + "grad_norm": 4.967767238616943, + "learning_rate": 5.243467373952971e-07, + "loss": 0.1428, + "step": 33763 + }, + { + "epoch": 0.8544170863172812, + "grad_norm": 2.434566020965576, + "learning_rate": 5.24167751513821e-07, + "loss": 0.0931, + "step": 33764 + }, + { + "epoch": 0.8544423918819748, + "grad_norm": 6.212021350860596, + "learning_rate": 5.239887944961547e-07, + "loss": 0.0942, + "step": 33765 + }, + { + "epoch": 0.8544676974466685, + "grad_norm": 4.892856121063232, + "learning_rate": 5.238098663434521e-07, + "loss": 0.1457, + "step": 33766 + }, + { + "epoch": 0.8544930030113622, + "grad_norm": 11.348348617553711, + "learning_rate": 5.236309670568662e-07, + "loss": 0.1997, + "step": 33767 + }, + { + "epoch": 0.8545183085760559, + "grad_norm": 13.249898910522461, + "learning_rate": 5.234520966375523e-07, + "loss": 0.1812, + "step": 33768 + }, + { + "epoch": 0.8545436141407495, + "grad_norm": 5.438148498535156, + "learning_rate": 5.232732550866631e-07, + "loss": 0.1684, + "step": 33769 + }, + { + "epoch": 0.8545689197054432, + "grad_norm": 2.8957998752593994, + "learning_rate": 5.230944424053525e-07, + "loss": 0.073, + "step": 33770 + }, + { + "epoch": 0.854594225270137, + "grad_norm": 5.12048864364624, + "learning_rate": 5.229156585947714e-07, + "loss": 0.0938, + "step": 33771 + }, + { + "epoch": 0.8546195308348306, + "grad_norm": 7.051474094390869, + "learning_rate": 5.227369036560759e-07, + "loss": 0.1722, + "step": 33772 + }, + { + "epoch": 0.8546448363995243, + "grad_norm": 4.35404109954834, + "learning_rate": 5.22558177590417e-07, + "loss": 0.1476, + "step": 33773 + }, + { + "epoch": 0.854670141964218, + "grad_norm": 5.954955577850342, + "learning_rate": 5.223794803989474e-07, + "loss": 0.1629, + "step": 33774 + }, + { + "epoch": 0.8546954475289116, + "grad_norm": 3.0559780597686768, + "learning_rate": 5.222008120828187e-07, + "loss": 0.1151, + "step": 33775 + }, + { + "epoch": 0.8547207530936053, + "grad_norm": 3.4704086780548096, + "learning_rate": 5.220221726431852e-07, + "loss": 0.0767, + "step": 33776 + }, + { + "epoch": 0.854746058658299, + "grad_norm": 5.517242431640625, + "learning_rate": 5.218435620811979e-07, + "loss": 0.1639, + "step": 33777 + }, + { + "epoch": 0.8547713642229926, + "grad_norm": 3.941519260406494, + "learning_rate": 5.216649803980078e-07, + "loss": 0.0918, + "step": 33778 + }, + { + "epoch": 0.8547966697876863, + "grad_norm": 5.618740081787109, + "learning_rate": 5.214864275947667e-07, + "loss": 0.1023, + "step": 33779 + }, + { + "epoch": 0.85482197535238, + "grad_norm": 5.389063358306885, + "learning_rate": 5.213079036726277e-07, + "loss": 0.0748, + "step": 33780 + }, + { + "epoch": 0.8548472809170736, + "grad_norm": 8.19672679901123, + "learning_rate": 5.211294086327396e-07, + "loss": 0.1994, + "step": 33781 + }, + { + "epoch": 0.8548725864817673, + "grad_norm": 12.79333209991455, + "learning_rate": 5.209509424762571e-07, + "loss": 0.1491, + "step": 33782 + }, + { + "epoch": 0.854897892046461, + "grad_norm": 3.9650542736053467, + "learning_rate": 5.207725052043266e-07, + "loss": 0.1408, + "step": 33783 + }, + { + "epoch": 0.8549231976111547, + "grad_norm": 16.546518325805664, + "learning_rate": 5.205940968181022e-07, + "loss": 0.3415, + "step": 33784 + }, + { + "epoch": 0.8549485031758484, + "grad_norm": 5.0403828620910645, + "learning_rate": 5.204157173187319e-07, + "loss": 0.0886, + "step": 33785 + }, + { + "epoch": 0.8549738087405421, + "grad_norm": 8.304587364196777, + "learning_rate": 5.202373667073695e-07, + "loss": 0.2659, + "step": 33786 + }, + { + "epoch": 0.8549991143052357, + "grad_norm": 6.186941623687744, + "learning_rate": 5.200590449851611e-07, + "loss": 0.1914, + "step": 33787 + }, + { + "epoch": 0.8550244198699294, + "grad_norm": 6.142770767211914, + "learning_rate": 5.198807521532595e-07, + "loss": 0.1292, + "step": 33788 + }, + { + "epoch": 0.8550497254346231, + "grad_norm": 12.090301513671875, + "learning_rate": 5.197024882128127e-07, + "loss": 0.1915, + "step": 33789 + }, + { + "epoch": 0.8550750309993167, + "grad_norm": 3.5373799800872803, + "learning_rate": 5.195242531649731e-07, + "loss": 0.0844, + "step": 33790 + }, + { + "epoch": 0.8551003365640104, + "grad_norm": 3.894819736480713, + "learning_rate": 5.193460470108863e-07, + "loss": 0.1252, + "step": 33791 + }, + { + "epoch": 0.8551256421287041, + "grad_norm": 6.225409507751465, + "learning_rate": 5.191678697517044e-07, + "loss": 0.2325, + "step": 33792 + }, + { + "epoch": 0.8551509476933977, + "grad_norm": 6.142963886260986, + "learning_rate": 5.189897213885753e-07, + "loss": 0.1909, + "step": 33793 + }, + { + "epoch": 0.8551762532580914, + "grad_norm": 6.990109443664551, + "learning_rate": 5.188116019226469e-07, + "loss": 0.1723, + "step": 33794 + }, + { + "epoch": 0.8552015588227851, + "grad_norm": 4.75965690612793, + "learning_rate": 5.1863351135507e-07, + "loss": 0.1252, + "step": 33795 + }, + { + "epoch": 0.8552268643874789, + "grad_norm": 7.09650993347168, + "learning_rate": 5.184554496869914e-07, + "loss": 0.2094, + "step": 33796 + }, + { + "epoch": 0.8552521699521725, + "grad_norm": 8.251564979553223, + "learning_rate": 5.182774169195609e-07, + "loss": 0.1298, + "step": 33797 + }, + { + "epoch": 0.8552774755168662, + "grad_norm": 6.696227073669434, + "learning_rate": 5.180994130539241e-07, + "loss": 0.1537, + "step": 33798 + }, + { + "epoch": 0.8553027810815599, + "grad_norm": 5.999637603759766, + "learning_rate": 5.179214380912317e-07, + "loss": 0.1476, + "step": 33799 + }, + { + "epoch": 0.8553280866462535, + "grad_norm": 3.9042515754699707, + "learning_rate": 5.177434920326302e-07, + "loss": 0.138, + "step": 33800 + }, + { + "epoch": 0.8553533922109472, + "grad_norm": 3.2699007987976074, + "learning_rate": 5.175655748792669e-07, + "loss": 0.1305, + "step": 33801 + }, + { + "epoch": 0.8553786977756409, + "grad_norm": 3.3169009685516357, + "learning_rate": 5.173876866322885e-07, + "loss": 0.0966, + "step": 33802 + }, + { + "epoch": 0.8554040033403345, + "grad_norm": 3.4113359451293945, + "learning_rate": 5.172098272928445e-07, + "loss": 0.1381, + "step": 33803 + }, + { + "epoch": 0.8554293089050282, + "grad_norm": 5.716681003570557, + "learning_rate": 5.170319968620802e-07, + "loss": 0.1833, + "step": 33804 + }, + { + "epoch": 0.8554546144697219, + "grad_norm": 9.552239418029785, + "learning_rate": 5.168541953411427e-07, + "loss": 0.3389, + "step": 33805 + }, + { + "epoch": 0.8554799200344155, + "grad_norm": 5.485404014587402, + "learning_rate": 5.166764227311771e-07, + "loss": 0.1454, + "step": 33806 + }, + { + "epoch": 0.8555052255991092, + "grad_norm": 6.498714447021484, + "learning_rate": 5.164986790333326e-07, + "loss": 0.0972, + "step": 33807 + }, + { + "epoch": 0.855530531163803, + "grad_norm": 12.771025657653809, + "learning_rate": 5.163209642487544e-07, + "loss": 0.1489, + "step": 33808 + }, + { + "epoch": 0.8555558367284966, + "grad_norm": 3.1463913917541504, + "learning_rate": 5.16143278378588e-07, + "loss": 0.1543, + "step": 33809 + }, + { + "epoch": 0.8555811422931903, + "grad_norm": 5.338664531707764, + "learning_rate": 5.159656214239789e-07, + "loss": 0.1043, + "step": 33810 + }, + { + "epoch": 0.855606447857884, + "grad_norm": 6.4427289962768555, + "learning_rate": 5.157879933860738e-07, + "loss": 0.2125, + "step": 33811 + }, + { + "epoch": 0.8556317534225776, + "grad_norm": 3.3419291973114014, + "learning_rate": 5.156103942660184e-07, + "loss": 0.1228, + "step": 33812 + }, + { + "epoch": 0.8556570589872713, + "grad_norm": 5.40591287612915, + "learning_rate": 5.15432824064957e-07, + "loss": 0.1565, + "step": 33813 + }, + { + "epoch": 0.855682364551965, + "grad_norm": 4.873366832733154, + "learning_rate": 5.15255282784034e-07, + "loss": 0.1625, + "step": 33814 + }, + { + "epoch": 0.8557076701166586, + "grad_norm": 7.109188556671143, + "learning_rate": 5.15077770424397e-07, + "loss": 0.2293, + "step": 33815 + }, + { + "epoch": 0.8557329756813523, + "grad_norm": 5.418639659881592, + "learning_rate": 5.149002869871878e-07, + "loss": 0.1274, + "step": 33816 + }, + { + "epoch": 0.855758281246046, + "grad_norm": 2.797407627105713, + "learning_rate": 5.147228324735543e-07, + "loss": 0.1523, + "step": 33817 + }, + { + "epoch": 0.8557835868107396, + "grad_norm": 5.4033074378967285, + "learning_rate": 5.145454068846373e-07, + "loss": 0.1361, + "step": 33818 + }, + { + "epoch": 0.8558088923754333, + "grad_norm": 4.071355819702148, + "learning_rate": 5.143680102215831e-07, + "loss": 0.1438, + "step": 33819 + }, + { + "epoch": 0.855834197940127, + "grad_norm": 1.9503705501556396, + "learning_rate": 5.141906424855358e-07, + "loss": 0.0533, + "step": 33820 + }, + { + "epoch": 0.8558595035048208, + "grad_norm": 11.86392879486084, + "learning_rate": 5.140133036776385e-07, + "loss": 0.285, + "step": 33821 + }, + { + "epoch": 0.8558848090695144, + "grad_norm": 12.90089225769043, + "learning_rate": 5.138359937990334e-07, + "loss": 0.203, + "step": 33822 + }, + { + "epoch": 0.8559101146342081, + "grad_norm": 3.426097869873047, + "learning_rate": 5.136587128508669e-07, + "loss": 0.1537, + "step": 33823 + }, + { + "epoch": 0.8559354201989018, + "grad_norm": 6.393925189971924, + "learning_rate": 5.134814608342809e-07, + "loss": 0.1165, + "step": 33824 + }, + { + "epoch": 0.8559607257635954, + "grad_norm": 5.2926716804504395, + "learning_rate": 5.133042377504171e-07, + "loss": 0.1182, + "step": 33825 + }, + { + "epoch": 0.8559860313282891, + "grad_norm": 9.56941032409668, + "learning_rate": 5.13127043600421e-07, + "loss": 0.1788, + "step": 33826 + }, + { + "epoch": 0.8560113368929828, + "grad_norm": 3.70859694480896, + "learning_rate": 5.129498783854342e-07, + "loss": 0.2014, + "step": 33827 + }, + { + "epoch": 0.8560366424576764, + "grad_norm": 11.81067180633545, + "learning_rate": 5.127727421065987e-07, + "loss": 0.2401, + "step": 33828 + }, + { + "epoch": 0.8560619480223701, + "grad_norm": 6.57184362411499, + "learning_rate": 5.125956347650557e-07, + "loss": 0.1813, + "step": 33829 + }, + { + "epoch": 0.8560872535870638, + "grad_norm": 4.338666915893555, + "learning_rate": 5.124185563619511e-07, + "loss": 0.1676, + "step": 33830 + }, + { + "epoch": 0.8561125591517574, + "grad_norm": 6.588891983032227, + "learning_rate": 5.122415068984227e-07, + "loss": 0.1702, + "step": 33831 + }, + { + "epoch": 0.8561378647164511, + "grad_norm": 5.027072429656982, + "learning_rate": 5.120644863756147e-07, + "loss": 0.1415, + "step": 33832 + }, + { + "epoch": 0.8561631702811449, + "grad_norm": 4.649480819702148, + "learning_rate": 5.118874947946667e-07, + "loss": 0.1301, + "step": 33833 + }, + { + "epoch": 0.8561884758458385, + "grad_norm": 8.398974418640137, + "learning_rate": 5.117105321567228e-07, + "loss": 0.2396, + "step": 33834 + }, + { + "epoch": 0.8562137814105322, + "grad_norm": 3.845141887664795, + "learning_rate": 5.115335984629227e-07, + "loss": 0.0874, + "step": 33835 + }, + { + "epoch": 0.8562390869752259, + "grad_norm": 4.756869792938232, + "learning_rate": 5.113566937144076e-07, + "loss": 0.1188, + "step": 33836 + }, + { + "epoch": 0.8562643925399195, + "grad_norm": 3.9635164737701416, + "learning_rate": 5.111798179123173e-07, + "loss": 0.157, + "step": 33837 + }, + { + "epoch": 0.8562896981046132, + "grad_norm": 25.937528610229492, + "learning_rate": 5.110029710577946e-07, + "loss": 0.1712, + "step": 33838 + }, + { + "epoch": 0.8563150036693069, + "grad_norm": 7.170994758605957, + "learning_rate": 5.108261531519782e-07, + "loss": 0.1939, + "step": 33839 + }, + { + "epoch": 0.8563403092340005, + "grad_norm": 9.255570411682129, + "learning_rate": 5.106493641960092e-07, + "loss": 0.1449, + "step": 33840 + }, + { + "epoch": 0.8563656147986942, + "grad_norm": 9.993232727050781, + "learning_rate": 5.104726041910263e-07, + "loss": 0.176, + "step": 33841 + }, + { + "epoch": 0.8563909203633879, + "grad_norm": 6.738809585571289, + "learning_rate": 5.102958731381713e-07, + "loss": 0.2055, + "step": 33842 + }, + { + "epoch": 0.8564162259280815, + "grad_norm": 3.060637950897217, + "learning_rate": 5.101191710385833e-07, + "loss": 0.0901, + "step": 33843 + }, + { + "epoch": 0.8564415314927752, + "grad_norm": 5.04326868057251, + "learning_rate": 5.099424978934014e-07, + "loss": 0.1612, + "step": 33844 + }, + { + "epoch": 0.856466837057469, + "grad_norm": 3.00005841255188, + "learning_rate": 5.097658537037642e-07, + "loss": 0.1136, + "step": 33845 + }, + { + "epoch": 0.8564921426221627, + "grad_norm": 1.526546835899353, + "learning_rate": 5.095892384708129e-07, + "loss": 0.0377, + "step": 33846 + }, + { + "epoch": 0.8565174481868563, + "grad_norm": 8.541061401367188, + "learning_rate": 5.094126521956838e-07, + "loss": 0.1366, + "step": 33847 + }, + { + "epoch": 0.85654275375155, + "grad_norm": 7.7579474449157715, + "learning_rate": 5.092360948795194e-07, + "loss": 0.2143, + "step": 33848 + }, + { + "epoch": 0.8565680593162437, + "grad_norm": 2.6800968647003174, + "learning_rate": 5.090595665234544e-07, + "loss": 0.1019, + "step": 33849 + }, + { + "epoch": 0.8565933648809373, + "grad_norm": 3.6343905925750732, + "learning_rate": 5.088830671286299e-07, + "loss": 0.1329, + "step": 33850 + }, + { + "epoch": 0.856618670445631, + "grad_norm": 2.779024362564087, + "learning_rate": 5.087065966961829e-07, + "loss": 0.1268, + "step": 33851 + }, + { + "epoch": 0.8566439760103247, + "grad_norm": 7.137139320373535, + "learning_rate": 5.085301552272514e-07, + "loss": 0.1501, + "step": 33852 + }, + { + "epoch": 0.8566692815750183, + "grad_norm": 3.819108486175537, + "learning_rate": 5.083537427229729e-07, + "loss": 0.1588, + "step": 33853 + }, + { + "epoch": 0.856694587139712, + "grad_norm": 6.320102691650391, + "learning_rate": 5.081773591844863e-07, + "loss": 0.1905, + "step": 33854 + }, + { + "epoch": 0.8567198927044057, + "grad_norm": 2.7975680828094482, + "learning_rate": 5.080010046129286e-07, + "loss": 0.1107, + "step": 33855 + }, + { + "epoch": 0.8567451982690993, + "grad_norm": 3.8369076251983643, + "learning_rate": 5.078246790094354e-07, + "loss": 0.0745, + "step": 33856 + }, + { + "epoch": 0.856770503833793, + "grad_norm": 4.001652717590332, + "learning_rate": 5.076483823751477e-07, + "loss": 0.124, + "step": 33857 + }, + { + "epoch": 0.8567958093984868, + "grad_norm": 6.153153896331787, + "learning_rate": 5.074721147111977e-07, + "loss": 0.1816, + "step": 33858 + }, + { + "epoch": 0.8568211149631804, + "grad_norm": 3.971531867980957, + "learning_rate": 5.072958760187252e-07, + "loss": 0.1244, + "step": 33859 + }, + { + "epoch": 0.8568464205278741, + "grad_norm": 3.4885432720184326, + "learning_rate": 5.071196662988654e-07, + "loss": 0.1103, + "step": 33860 + }, + { + "epoch": 0.8568717260925678, + "grad_norm": 11.487103462219238, + "learning_rate": 5.069434855527566e-07, + "loss": 0.3198, + "step": 33861 + }, + { + "epoch": 0.8568970316572614, + "grad_norm": 2.6270599365234375, + "learning_rate": 5.067673337815315e-07, + "loss": 0.0902, + "step": 33862 + }, + { + "epoch": 0.8569223372219551, + "grad_norm": 3.189069986343384, + "learning_rate": 5.065912109863297e-07, + "loss": 0.1002, + "step": 33863 + }, + { + "epoch": 0.8569476427866488, + "grad_norm": 9.479070663452148, + "learning_rate": 5.064151171682836e-07, + "loss": 0.1855, + "step": 33864 + }, + { + "epoch": 0.8569729483513424, + "grad_norm": 4.14376974105835, + "learning_rate": 5.062390523285332e-07, + "loss": 0.1279, + "step": 33865 + }, + { + "epoch": 0.8569982539160361, + "grad_norm": 9.612548828125, + "learning_rate": 5.06063016468209e-07, + "loss": 0.2255, + "step": 33866 + }, + { + "epoch": 0.8570235594807298, + "grad_norm": 10.06474781036377, + "learning_rate": 5.058870095884488e-07, + "loss": 0.3078, + "step": 33867 + }, + { + "epoch": 0.8570488650454234, + "grad_norm": 3.511307716369629, + "learning_rate": 5.057110316903874e-07, + "loss": 0.1326, + "step": 33868 + }, + { + "epoch": 0.8570741706101171, + "grad_norm": 4.68428897857666, + "learning_rate": 5.055350827751598e-07, + "loss": 0.1259, + "step": 33869 + }, + { + "epoch": 0.8570994761748109, + "grad_norm": 7.771031856536865, + "learning_rate": 5.053591628439009e-07, + "loss": 0.1779, + "step": 33870 + }, + { + "epoch": 0.8571247817395046, + "grad_norm": 3.973727226257324, + "learning_rate": 5.05183271897744e-07, + "loss": 0.1057, + "step": 33871 + }, + { + "epoch": 0.8571500873041982, + "grad_norm": 8.912028312683105, + "learning_rate": 5.050074099378238e-07, + "loss": 0.1767, + "step": 33872 + }, + { + "epoch": 0.8571753928688919, + "grad_norm": 5.16851806640625, + "learning_rate": 5.048315769652757e-07, + "loss": 0.2165, + "step": 33873 + }, + { + "epoch": 0.8572006984335856, + "grad_norm": 11.510013580322266, + "learning_rate": 5.046557729812319e-07, + "loss": 0.2611, + "step": 33874 + }, + { + "epoch": 0.8572260039982792, + "grad_norm": 7.117006778717041, + "learning_rate": 5.044799979868275e-07, + "loss": 0.1244, + "step": 33875 + }, + { + "epoch": 0.8572513095629729, + "grad_norm": 10.305700302124023, + "learning_rate": 5.043042519831937e-07, + "loss": 0.1206, + "step": 33876 + }, + { + "epoch": 0.8572766151276666, + "grad_norm": 7.205849647521973, + "learning_rate": 5.04128534971467e-07, + "loss": 0.2057, + "step": 33877 + }, + { + "epoch": 0.8573019206923602, + "grad_norm": 4.845475196838379, + "learning_rate": 5.039528469527793e-07, + "loss": 0.1592, + "step": 33878 + }, + { + "epoch": 0.8573272262570539, + "grad_norm": 4.854754447937012, + "learning_rate": 5.037771879282627e-07, + "loss": 0.1869, + "step": 33879 + }, + { + "epoch": 0.8573525318217476, + "grad_norm": 5.354983329772949, + "learning_rate": 5.0360155789905e-07, + "loss": 0.1132, + "step": 33880 + }, + { + "epoch": 0.8573778373864412, + "grad_norm": 5.892663478851318, + "learning_rate": 5.034259568662758e-07, + "loss": 0.1189, + "step": 33881 + }, + { + "epoch": 0.857403142951135, + "grad_norm": 4.424102306365967, + "learning_rate": 5.032503848310704e-07, + "loss": 0.1819, + "step": 33882 + }, + { + "epoch": 0.8574284485158287, + "grad_norm": 8.508835792541504, + "learning_rate": 5.030748417945674e-07, + "loss": 0.2092, + "step": 33883 + }, + { + "epoch": 0.8574537540805223, + "grad_norm": 10.473600387573242, + "learning_rate": 5.02899327757897e-07, + "loss": 0.27, + "step": 33884 + }, + { + "epoch": 0.857479059645216, + "grad_norm": 11.826814651489258, + "learning_rate": 5.027238427221937e-07, + "loss": 0.1509, + "step": 33885 + }, + { + "epoch": 0.8575043652099097, + "grad_norm": 6.05308723449707, + "learning_rate": 5.025483866885872e-07, + "loss": 0.1758, + "step": 33886 + }, + { + "epoch": 0.8575296707746033, + "grad_norm": 4.3788676261901855, + "learning_rate": 5.023729596582099e-07, + "loss": 0.1272, + "step": 33887 + }, + { + "epoch": 0.857554976339297, + "grad_norm": 3.0113167762756348, + "learning_rate": 5.021975616321928e-07, + "loss": 0.0697, + "step": 33888 + }, + { + "epoch": 0.8575802819039907, + "grad_norm": 5.211981296539307, + "learning_rate": 5.020221926116658e-07, + "loss": 0.1016, + "step": 33889 + }, + { + "epoch": 0.8576055874686843, + "grad_norm": 6.774073600769043, + "learning_rate": 5.018468525977621e-07, + "loss": 0.1801, + "step": 33890 + }, + { + "epoch": 0.857630893033378, + "grad_norm": 2.938490152359009, + "learning_rate": 5.016715415916102e-07, + "loss": 0.0966, + "step": 33891 + }, + { + "epoch": 0.8576561985980717, + "grad_norm": 3.3117456436157227, + "learning_rate": 5.01496259594344e-07, + "loss": 0.1012, + "step": 33892 + }, + { + "epoch": 0.8576815041627653, + "grad_norm": 5.184020519256592, + "learning_rate": 5.013210066070895e-07, + "loss": 0.1398, + "step": 33893 + }, + { + "epoch": 0.857706809727459, + "grad_norm": 3.9585044384002686, + "learning_rate": 5.011457826309802e-07, + "loss": 0.1194, + "step": 33894 + }, + { + "epoch": 0.8577321152921528, + "grad_norm": 3.1794583797454834, + "learning_rate": 5.009705876671433e-07, + "loss": 0.1408, + "step": 33895 + }, + { + "epoch": 0.8577574208568465, + "grad_norm": 8.768206596374512, + "learning_rate": 5.007954217167127e-07, + "loss": 0.147, + "step": 33896 + }, + { + "epoch": 0.8577827264215401, + "grad_norm": 7.458739280700684, + "learning_rate": 5.006202847808134e-07, + "loss": 0.2245, + "step": 33897 + }, + { + "epoch": 0.8578080319862338, + "grad_norm": 2.6873934268951416, + "learning_rate": 5.004451768605784e-07, + "loss": 0.0585, + "step": 33898 + }, + { + "epoch": 0.8578333375509275, + "grad_norm": 12.434285163879395, + "learning_rate": 5.002700979571339e-07, + "loss": 0.1679, + "step": 33899 + }, + { + "epoch": 0.8578586431156211, + "grad_norm": 5.680150508880615, + "learning_rate": 5.000950480716116e-07, + "loss": 0.1588, + "step": 33900 + }, + { + "epoch": 0.8578839486803148, + "grad_norm": 8.72564697265625, + "learning_rate": 4.999200272051391e-07, + "loss": 0.1833, + "step": 33901 + }, + { + "epoch": 0.8579092542450085, + "grad_norm": 2.6622025966644287, + "learning_rate": 4.997450353588457e-07, + "loss": 0.0676, + "step": 33902 + }, + { + "epoch": 0.8579345598097021, + "grad_norm": 6.881225109100342, + "learning_rate": 4.995700725338581e-07, + "loss": 0.1583, + "step": 33903 + }, + { + "epoch": 0.8579598653743958, + "grad_norm": 5.960070610046387, + "learning_rate": 4.993951387313073e-07, + "loss": 0.1893, + "step": 33904 + }, + { + "epoch": 0.8579851709390895, + "grad_norm": 4.332469940185547, + "learning_rate": 4.9922023395232e-07, + "loss": 0.1144, + "step": 33905 + }, + { + "epoch": 0.8580104765037831, + "grad_norm": 11.067296028137207, + "learning_rate": 4.990453581980242e-07, + "loss": 0.1972, + "step": 33906 + }, + { + "epoch": 0.8580357820684769, + "grad_norm": 3.3732919692993164, + "learning_rate": 4.988705114695463e-07, + "loss": 0.1153, + "step": 33907 + }, + { + "epoch": 0.8580610876331706, + "grad_norm": 3.573906183242798, + "learning_rate": 4.986956937680166e-07, + "loss": 0.1002, + "step": 33908 + }, + { + "epoch": 0.8580863931978642, + "grad_norm": 4.363917827606201, + "learning_rate": 4.985209050945605e-07, + "loss": 0.0811, + "step": 33909 + }, + { + "epoch": 0.8581116987625579, + "grad_norm": 8.242341041564941, + "learning_rate": 4.983461454503064e-07, + "loss": 0.2106, + "step": 33910 + }, + { + "epoch": 0.8581370043272516, + "grad_norm": 4.419079780578613, + "learning_rate": 4.981714148363792e-07, + "loss": 0.1246, + "step": 33911 + }, + { + "epoch": 0.8581623098919452, + "grad_norm": 4.20374059677124, + "learning_rate": 4.979967132539088e-07, + "loss": 0.1325, + "step": 33912 + }, + { + "epoch": 0.8581876154566389, + "grad_norm": 12.369032859802246, + "learning_rate": 4.978220407040196e-07, + "loss": 0.2854, + "step": 33913 + }, + { + "epoch": 0.8582129210213326, + "grad_norm": 12.326667785644531, + "learning_rate": 4.976473971878388e-07, + "loss": 0.1742, + "step": 33914 + }, + { + "epoch": 0.8582382265860262, + "grad_norm": 8.550482749938965, + "learning_rate": 4.974727827064918e-07, + "loss": 0.167, + "step": 33915 + }, + { + "epoch": 0.8582635321507199, + "grad_norm": 4.201893329620361, + "learning_rate": 4.972981972611052e-07, + "loss": 0.0939, + "step": 33916 + }, + { + "epoch": 0.8582888377154136, + "grad_norm": 3.381800651550293, + "learning_rate": 4.971236408528052e-07, + "loss": 0.1126, + "step": 33917 + }, + { + "epoch": 0.8583141432801072, + "grad_norm": 3.2418510913848877, + "learning_rate": 4.969491134827175e-07, + "loss": 0.1075, + "step": 33918 + }, + { + "epoch": 0.858339448844801, + "grad_norm": 4.776286602020264, + "learning_rate": 4.967746151519675e-07, + "loss": 0.1076, + "step": 33919 + }, + { + "epoch": 0.8583647544094947, + "grad_norm": 3.5693600177764893, + "learning_rate": 4.966001458616792e-07, + "loss": 0.1483, + "step": 33920 + }, + { + "epoch": 0.8583900599741883, + "grad_norm": 5.139520645141602, + "learning_rate": 4.9642570561298e-07, + "loss": 0.1705, + "step": 33921 + }, + { + "epoch": 0.858415365538882, + "grad_norm": 7.550915718078613, + "learning_rate": 4.96251294406993e-07, + "loss": 0.2665, + "step": 33922 + }, + { + "epoch": 0.8584406711035757, + "grad_norm": 10.38811206817627, + "learning_rate": 4.96076912244845e-07, + "loss": 0.2278, + "step": 33923 + }, + { + "epoch": 0.8584659766682694, + "grad_norm": 3.1627607345581055, + "learning_rate": 4.959025591276579e-07, + "loss": 0.0829, + "step": 33924 + }, + { + "epoch": 0.858491282232963, + "grad_norm": 14.59555435180664, + "learning_rate": 4.957282350565579e-07, + "loss": 0.2775, + "step": 33925 + }, + { + "epoch": 0.8585165877976567, + "grad_norm": 3.8543059825897217, + "learning_rate": 4.955539400326676e-07, + "loss": 0.1628, + "step": 33926 + }, + { + "epoch": 0.8585418933623504, + "grad_norm": 15.18975830078125, + "learning_rate": 4.953796740571138e-07, + "loss": 0.2044, + "step": 33927 + }, + { + "epoch": 0.858567198927044, + "grad_norm": 2.767583131790161, + "learning_rate": 4.952054371310172e-07, + "loss": 0.0954, + "step": 33928 + }, + { + "epoch": 0.8585925044917377, + "grad_norm": 4.795040607452393, + "learning_rate": 4.950312292555032e-07, + "loss": 0.1764, + "step": 33929 + }, + { + "epoch": 0.8586178100564315, + "grad_norm": 6.3224053382873535, + "learning_rate": 4.948570504316941e-07, + "loss": 0.1344, + "step": 33930 + }, + { + "epoch": 0.858643115621125, + "grad_norm": 4.295458793640137, + "learning_rate": 4.946829006607152e-07, + "loss": 0.1233, + "step": 33931 + }, + { + "epoch": 0.8586684211858188, + "grad_norm": 3.6842846870422363, + "learning_rate": 4.945087799436876e-07, + "loss": 0.1007, + "step": 33932 + }, + { + "epoch": 0.8586937267505125, + "grad_norm": 5.57232666015625, + "learning_rate": 4.943346882817351e-07, + "loss": 0.1479, + "step": 33933 + }, + { + "epoch": 0.8587190323152061, + "grad_norm": 10.462981224060059, + "learning_rate": 4.94160625675979e-07, + "loss": 0.2082, + "step": 33934 + }, + { + "epoch": 0.8587443378798998, + "grad_norm": 3.4169533252716064, + "learning_rate": 4.939865921275438e-07, + "loss": 0.1229, + "step": 33935 + }, + { + "epoch": 0.8587696434445935, + "grad_norm": 10.702105522155762, + "learning_rate": 4.938125876375511e-07, + "loss": 0.1729, + "step": 33936 + }, + { + "epoch": 0.8587949490092871, + "grad_norm": 3.0373713970184326, + "learning_rate": 4.93638612207123e-07, + "loss": 0.1368, + "step": 33937 + }, + { + "epoch": 0.8588202545739808, + "grad_norm": 4.1716508865356445, + "learning_rate": 4.934646658373798e-07, + "loss": 0.1683, + "step": 33938 + }, + { + "epoch": 0.8588455601386745, + "grad_norm": 2.792095184326172, + "learning_rate": 4.932907485294458e-07, + "loss": 0.1206, + "step": 33939 + }, + { + "epoch": 0.8588708657033681, + "grad_norm": 3.209244728088379, + "learning_rate": 4.931168602844417e-07, + "loss": 0.1327, + "step": 33940 + }, + { + "epoch": 0.8588961712680618, + "grad_norm": 3.784496545791626, + "learning_rate": 4.929430011034886e-07, + "loss": 0.1104, + "step": 33941 + }, + { + "epoch": 0.8589214768327555, + "grad_norm": 3.599447011947632, + "learning_rate": 4.927691709877075e-07, + "loss": 0.1766, + "step": 33942 + }, + { + "epoch": 0.8589467823974491, + "grad_norm": 6.306333541870117, + "learning_rate": 4.925953699382186e-07, + "loss": 0.1269, + "step": 33943 + }, + { + "epoch": 0.8589720879621429, + "grad_norm": 8.035932540893555, + "learning_rate": 4.924215979561447e-07, + "loss": 0.225, + "step": 33944 + }, + { + "epoch": 0.8589973935268366, + "grad_norm": 3.9500935077667236, + "learning_rate": 4.922478550426052e-07, + "loss": 0.1346, + "step": 33945 + }, + { + "epoch": 0.8590226990915302, + "grad_norm": 3.5033464431762695, + "learning_rate": 4.920741411987213e-07, + "loss": 0.1254, + "step": 33946 + }, + { + "epoch": 0.8590480046562239, + "grad_norm": 3.3430771827697754, + "learning_rate": 4.919004564256114e-07, + "loss": 0.1166, + "step": 33947 + }, + { + "epoch": 0.8590733102209176, + "grad_norm": 5.517850399017334, + "learning_rate": 4.91726800724398e-07, + "loss": 0.1029, + "step": 33948 + }, + { + "epoch": 0.8590986157856113, + "grad_norm": 5.585555076599121, + "learning_rate": 4.915531740961998e-07, + "loss": 0.1684, + "step": 33949 + }, + { + "epoch": 0.8591239213503049, + "grad_norm": 3.8438799381256104, + "learning_rate": 4.913795765421364e-07, + "loss": 0.0844, + "step": 33950 + }, + { + "epoch": 0.8591492269149986, + "grad_norm": 3.7297065258026123, + "learning_rate": 4.912060080633269e-07, + "loss": 0.0881, + "step": 33951 + }, + { + "epoch": 0.8591745324796923, + "grad_norm": 3.6548383235931396, + "learning_rate": 4.910324686608919e-07, + "loss": 0.0962, + "step": 33952 + }, + { + "epoch": 0.8591998380443859, + "grad_norm": 4.818630695343018, + "learning_rate": 4.908589583359485e-07, + "loss": 0.1811, + "step": 33953 + }, + { + "epoch": 0.8592251436090796, + "grad_norm": 10.23046875, + "learning_rate": 4.906854770896191e-07, + "loss": 0.2014, + "step": 33954 + }, + { + "epoch": 0.8592504491737734, + "grad_norm": 7.069571495056152, + "learning_rate": 4.905120249230189e-07, + "loss": 0.196, + "step": 33955 + }, + { + "epoch": 0.859275754738467, + "grad_norm": 5.118689060211182, + "learning_rate": 4.90338601837268e-07, + "loss": 0.1074, + "step": 33956 + }, + { + "epoch": 0.8593010603031607, + "grad_norm": 3.3500430583953857, + "learning_rate": 4.90165207833484e-07, + "loss": 0.1197, + "step": 33957 + }, + { + "epoch": 0.8593263658678544, + "grad_norm": 11.759033203125, + "learning_rate": 4.899918429127876e-07, + "loss": 0.1921, + "step": 33958 + }, + { + "epoch": 0.859351671432548, + "grad_norm": 6.510275363922119, + "learning_rate": 4.898185070762929e-07, + "loss": 0.1833, + "step": 33959 + }, + { + "epoch": 0.8593769769972417, + "grad_norm": 4.263457775115967, + "learning_rate": 4.8964520032512e-07, + "loss": 0.0957, + "step": 33960 + }, + { + "epoch": 0.8594022825619354, + "grad_norm": 6.139072895050049, + "learning_rate": 4.894719226603861e-07, + "loss": 0.1277, + "step": 33961 + }, + { + "epoch": 0.859427588126629, + "grad_norm": 11.437047958374023, + "learning_rate": 4.892986740832095e-07, + "loss": 0.3324, + "step": 33962 + }, + { + "epoch": 0.8594528936913227, + "grad_norm": 5.593608856201172, + "learning_rate": 4.891254545947066e-07, + "loss": 0.1322, + "step": 33963 + }, + { + "epoch": 0.8594781992560164, + "grad_norm": 3.15674090385437, + "learning_rate": 4.889522641959948e-07, + "loss": 0.0864, + "step": 33964 + }, + { + "epoch": 0.85950350482071, + "grad_norm": 2.9218297004699707, + "learning_rate": 4.887791028881894e-07, + "loss": 0.0529, + "step": 33965 + }, + { + "epoch": 0.8595288103854037, + "grad_norm": 7.003808975219727, + "learning_rate": 4.886059706724094e-07, + "loss": 0.2019, + "step": 33966 + }, + { + "epoch": 0.8595541159500975, + "grad_norm": 4.456206798553467, + "learning_rate": 4.884328675497707e-07, + "loss": 0.1247, + "step": 33967 + }, + { + "epoch": 0.859579421514791, + "grad_norm": 3.137512445449829, + "learning_rate": 4.88259793521389e-07, + "loss": 0.1331, + "step": 33968 + }, + { + "epoch": 0.8596047270794848, + "grad_norm": 5.7403411865234375, + "learning_rate": 4.880867485883795e-07, + "loss": 0.1773, + "step": 33969 + }, + { + "epoch": 0.8596300326441785, + "grad_norm": 4.553853511810303, + "learning_rate": 4.879137327518607e-07, + "loss": 0.157, + "step": 33970 + }, + { + "epoch": 0.8596553382088721, + "grad_norm": 12.126320838928223, + "learning_rate": 4.877407460129463e-07, + "loss": 0.1846, + "step": 33971 + }, + { + "epoch": 0.8596806437735658, + "grad_norm": 5.934375762939453, + "learning_rate": 4.875677883727531e-07, + "loss": 0.1192, + "step": 33972 + }, + { + "epoch": 0.8597059493382595, + "grad_norm": 3.8772974014282227, + "learning_rate": 4.873948598323952e-07, + "loss": 0.1119, + "step": 33973 + }, + { + "epoch": 0.8597312549029532, + "grad_norm": 6.558979511260986, + "learning_rate": 4.872219603929878e-07, + "loss": 0.1795, + "step": 33974 + }, + { + "epoch": 0.8597565604676468, + "grad_norm": 3.8640341758728027, + "learning_rate": 4.870490900556479e-07, + "loss": 0.1328, + "step": 33975 + }, + { + "epoch": 0.8597818660323405, + "grad_norm": 1.7677799463272095, + "learning_rate": 4.868762488214884e-07, + "loss": 0.058, + "step": 33976 + }, + { + "epoch": 0.8598071715970342, + "grad_norm": 5.309058666229248, + "learning_rate": 4.867034366916245e-07, + "loss": 0.1724, + "step": 33977 + }, + { + "epoch": 0.8598324771617278, + "grad_norm": 3.0113296508789062, + "learning_rate": 4.865306536671699e-07, + "loss": 0.0764, + "step": 33978 + }, + { + "epoch": 0.8598577827264215, + "grad_norm": 3.036679983139038, + "learning_rate": 4.863578997492407e-07, + "loss": 0.1124, + "step": 33979 + }, + { + "epoch": 0.8598830882911153, + "grad_norm": 4.947098731994629, + "learning_rate": 4.861851749389496e-07, + "loss": 0.1538, + "step": 33980 + }, + { + "epoch": 0.8599083938558089, + "grad_norm": 12.339156150817871, + "learning_rate": 4.860124792374105e-07, + "loss": 0.151, + "step": 33981 + }, + { + "epoch": 0.8599336994205026, + "grad_norm": 2.6197383403778076, + "learning_rate": 4.858398126457364e-07, + "loss": 0.1084, + "step": 33982 + }, + { + "epoch": 0.8599590049851963, + "grad_norm": 3.1800119876861572, + "learning_rate": 4.856671751650427e-07, + "loss": 0.0917, + "step": 33983 + }, + { + "epoch": 0.8599843105498899, + "grad_norm": 5.254793643951416, + "learning_rate": 4.854945667964406e-07, + "loss": 0.1032, + "step": 33984 + }, + { + "epoch": 0.8600096161145836, + "grad_norm": 3.8650174140930176, + "learning_rate": 4.853219875410459e-07, + "loss": 0.1164, + "step": 33985 + }, + { + "epoch": 0.8600349216792773, + "grad_norm": 8.948046684265137, + "learning_rate": 4.851494373999682e-07, + "loss": 0.2158, + "step": 33986 + }, + { + "epoch": 0.8600602272439709, + "grad_norm": 9.064193725585938, + "learning_rate": 4.849769163743228e-07, + "loss": 0.0974, + "step": 33987 + }, + { + "epoch": 0.8600855328086646, + "grad_norm": 4.534427642822266, + "learning_rate": 4.848044244652206e-07, + "loss": 0.1636, + "step": 33988 + }, + { + "epoch": 0.8601108383733583, + "grad_norm": 5.303296089172363, + "learning_rate": 4.846319616737771e-07, + "loss": 0.1616, + "step": 33989 + }, + { + "epoch": 0.8601361439380519, + "grad_norm": 4.791931629180908, + "learning_rate": 4.844595280011e-07, + "loss": 0.1534, + "step": 33990 + }, + { + "epoch": 0.8601614495027456, + "grad_norm": 3.3581650257110596, + "learning_rate": 4.842871234483038e-07, + "loss": 0.1491, + "step": 33991 + }, + { + "epoch": 0.8601867550674394, + "grad_norm": 5.237252235412598, + "learning_rate": 4.841147480164999e-07, + "loss": 0.1963, + "step": 33992 + }, + { + "epoch": 0.860212060632133, + "grad_norm": 7.558454513549805, + "learning_rate": 4.839424017068011e-07, + "loss": 0.2138, + "step": 33993 + }, + { + "epoch": 0.8602373661968267, + "grad_norm": 4.610511779785156, + "learning_rate": 4.837700845203164e-07, + "loss": 0.1457, + "step": 33994 + }, + { + "epoch": 0.8602626717615204, + "grad_norm": 2.571995258331299, + "learning_rate": 4.835977964581589e-07, + "loss": 0.0659, + "step": 33995 + }, + { + "epoch": 0.860287977326214, + "grad_norm": 6.660301208496094, + "learning_rate": 4.83425537521438e-07, + "loss": 0.156, + "step": 33996 + }, + { + "epoch": 0.8603132828909077, + "grad_norm": 8.637617111206055, + "learning_rate": 4.832533077112672e-07, + "loss": 0.2156, + "step": 33997 + }, + { + "epoch": 0.8603385884556014, + "grad_norm": 3.914857864379883, + "learning_rate": 4.83081107028755e-07, + "loss": 0.1393, + "step": 33998 + }, + { + "epoch": 0.8603638940202951, + "grad_norm": 14.475582122802734, + "learning_rate": 4.829089354750133e-07, + "loss": 0.1157, + "step": 33999 + }, + { + "epoch": 0.8603891995849887, + "grad_norm": 5.815397262573242, + "learning_rate": 4.827367930511512e-07, + "loss": 0.1107, + "step": 34000 + }, + { + "epoch": 0.8604145051496824, + "grad_norm": 2.599199056625366, + "learning_rate": 4.825646797582778e-07, + "loss": 0.1157, + "step": 34001 + }, + { + "epoch": 0.8604398107143761, + "grad_norm": 4.408386707305908, + "learning_rate": 4.823925955975062e-07, + "loss": 0.1392, + "step": 34002 + }, + { + "epoch": 0.8604651162790697, + "grad_norm": 3.5582973957061768, + "learning_rate": 4.822205405699443e-07, + "loss": 0.0993, + "step": 34003 + }, + { + "epoch": 0.8604904218437635, + "grad_norm": 7.0374603271484375, + "learning_rate": 4.820485146767012e-07, + "loss": 0.2095, + "step": 34004 + }, + { + "epoch": 0.8605157274084572, + "grad_norm": 5.120654582977295, + "learning_rate": 4.818765179188867e-07, + "loss": 0.0891, + "step": 34005 + }, + { + "epoch": 0.8605410329731508, + "grad_norm": 4.5924882888793945, + "learning_rate": 4.817045502976103e-07, + "loss": 0.1926, + "step": 34006 + }, + { + "epoch": 0.8605663385378445, + "grad_norm": 6.295872211456299, + "learning_rate": 4.815326118139813e-07, + "loss": 0.0967, + "step": 34007 + }, + { + "epoch": 0.8605916441025382, + "grad_norm": 5.711030960083008, + "learning_rate": 4.813607024691075e-07, + "loss": 0.1454, + "step": 34008 + }, + { + "epoch": 0.8606169496672318, + "grad_norm": 3.3939735889434814, + "learning_rate": 4.811888222640976e-07, + "loss": 0.1039, + "step": 34009 + }, + { + "epoch": 0.8606422552319255, + "grad_norm": 2.8624963760375977, + "learning_rate": 4.810169712000612e-07, + "loss": 0.1052, + "step": 34010 + }, + { + "epoch": 0.8606675607966192, + "grad_norm": 4.044198513031006, + "learning_rate": 4.808451492781053e-07, + "loss": 0.1363, + "step": 34011 + }, + { + "epoch": 0.8606928663613128, + "grad_norm": 3.019364595413208, + "learning_rate": 4.806733564993388e-07, + "loss": 0.1115, + "step": 34012 + }, + { + "epoch": 0.8607181719260065, + "grad_norm": 3.6369168758392334, + "learning_rate": 4.805015928648682e-07, + "loss": 0.1318, + "step": 34013 + }, + { + "epoch": 0.8607434774907002, + "grad_norm": 3.938629388809204, + "learning_rate": 4.803298583758032e-07, + "loss": 0.0956, + "step": 34014 + }, + { + "epoch": 0.8607687830553938, + "grad_norm": 6.237715244293213, + "learning_rate": 4.8015815303325e-07, + "loss": 0.2077, + "step": 34015 + }, + { + "epoch": 0.8607940886200875, + "grad_norm": 4.4504876136779785, + "learning_rate": 4.799864768383162e-07, + "loss": 0.1519, + "step": 34016 + }, + { + "epoch": 0.8608193941847813, + "grad_norm": 6.038105010986328, + "learning_rate": 4.798148297921079e-07, + "loss": 0.1611, + "step": 34017 + }, + { + "epoch": 0.8608446997494749, + "grad_norm": 5.970058917999268, + "learning_rate": 4.796432118957339e-07, + "loss": 0.1331, + "step": 34018 + }, + { + "epoch": 0.8608700053141686, + "grad_norm": 2.990745782852173, + "learning_rate": 4.794716231502988e-07, + "loss": 0.1063, + "step": 34019 + }, + { + "epoch": 0.8608953108788623, + "grad_norm": 10.12330436706543, + "learning_rate": 4.793000635569122e-07, + "loss": 0.2932, + "step": 34020 + }, + { + "epoch": 0.8609206164435559, + "grad_norm": 5.634709358215332, + "learning_rate": 4.791285331166773e-07, + "loss": 0.2382, + "step": 34021 + }, + { + "epoch": 0.8609459220082496, + "grad_norm": 5.477733135223389, + "learning_rate": 4.789570318307018e-07, + "loss": 0.1798, + "step": 34022 + }, + { + "epoch": 0.8609712275729433, + "grad_norm": 7.700556755065918, + "learning_rate": 4.787855597000913e-07, + "loss": 0.2177, + "step": 34023 + }, + { + "epoch": 0.860996533137637, + "grad_norm": 3.978576898574829, + "learning_rate": 4.786141167259528e-07, + "loss": 0.1451, + "step": 34024 + }, + { + "epoch": 0.8610218387023306, + "grad_norm": 4.437729835510254, + "learning_rate": 4.784427029093891e-07, + "loss": 0.1212, + "step": 34025 + }, + { + "epoch": 0.8610471442670243, + "grad_norm": 3.968766450881958, + "learning_rate": 4.782713182515086e-07, + "loss": 0.1415, + "step": 34026 + }, + { + "epoch": 0.861072449831718, + "grad_norm": 2.634490966796875, + "learning_rate": 4.780999627534149e-07, + "loss": 0.0953, + "step": 34027 + }, + { + "epoch": 0.8610977553964116, + "grad_norm": 5.867713451385498, + "learning_rate": 4.779286364162127e-07, + "loss": 0.1207, + "step": 34028 + }, + { + "epoch": 0.8611230609611054, + "grad_norm": 4.081149101257324, + "learning_rate": 4.777573392410084e-07, + "loss": 0.1115, + "step": 34029 + }, + { + "epoch": 0.8611483665257991, + "grad_norm": 2.2550666332244873, + "learning_rate": 4.775860712289054e-07, + "loss": 0.0904, + "step": 34030 + }, + { + "epoch": 0.8611736720904927, + "grad_norm": 8.443804740905762, + "learning_rate": 4.774148323810091e-07, + "loss": 0.164, + "step": 34031 + }, + { + "epoch": 0.8611989776551864, + "grad_norm": 3.4512038230895996, + "learning_rate": 4.772436226984223e-07, + "loss": 0.102, + "step": 34032 + }, + { + "epoch": 0.8612242832198801, + "grad_norm": 2.762540578842163, + "learning_rate": 4.770724421822509e-07, + "loss": 0.1329, + "step": 34033 + }, + { + "epoch": 0.8612495887845737, + "grad_norm": 3.4029924869537354, + "learning_rate": 4.769012908335979e-07, + "loss": 0.0958, + "step": 34034 + }, + { + "epoch": 0.8612748943492674, + "grad_norm": 6.413918972015381, + "learning_rate": 4.767301686535675e-07, + "loss": 0.1573, + "step": 34035 + }, + { + "epoch": 0.8613001999139611, + "grad_norm": 6.284219264984131, + "learning_rate": 4.7655907564326197e-07, + "loss": 0.1766, + "step": 34036 + }, + { + "epoch": 0.8613255054786547, + "grad_norm": 4.620509624481201, + "learning_rate": 4.763880118037861e-07, + "loss": 0.1604, + "step": 34037 + }, + { + "epoch": 0.8613508110433484, + "grad_norm": 7.621874809265137, + "learning_rate": 4.762169771362424e-07, + "loss": 0.3431, + "step": 34038 + }, + { + "epoch": 0.8613761166080421, + "grad_norm": 4.799839496612549, + "learning_rate": 4.7604597164173426e-07, + "loss": 0.1386, + "step": 34039 + }, + { + "epoch": 0.8614014221727357, + "grad_norm": 5.9721760749816895, + "learning_rate": 4.758749953213626e-07, + "loss": 0.1387, + "step": 34040 + }, + { + "epoch": 0.8614267277374295, + "grad_norm": 3.8472728729248047, + "learning_rate": 4.7570404817623317e-07, + "loss": 0.124, + "step": 34041 + }, + { + "epoch": 0.8614520333021232, + "grad_norm": 2.9165780544281006, + "learning_rate": 4.7553313020744673e-07, + "loss": 0.1076, + "step": 34042 + }, + { + "epoch": 0.8614773388668168, + "grad_norm": 2.77884840965271, + "learning_rate": 4.753622414161052e-07, + "loss": 0.1498, + "step": 34043 + }, + { + "epoch": 0.8615026444315105, + "grad_norm": 3.889378786087036, + "learning_rate": 4.7519138180330994e-07, + "loss": 0.1433, + "step": 34044 + }, + { + "epoch": 0.8615279499962042, + "grad_norm": 4.90452241897583, + "learning_rate": 4.75020551370165e-07, + "loss": 0.149, + "step": 34045 + }, + { + "epoch": 0.8615532555608978, + "grad_norm": 1.9535832405090332, + "learning_rate": 4.748497501177707e-07, + "loss": 0.0709, + "step": 34046 + }, + { + "epoch": 0.8615785611255915, + "grad_norm": 12.48361587524414, + "learning_rate": 4.746789780472283e-07, + "loss": 0.2198, + "step": 34047 + }, + { + "epoch": 0.8616038666902852, + "grad_norm": 18.999452590942383, + "learning_rate": 4.7450823515963864e-07, + "loss": 0.2781, + "step": 34048 + }, + { + "epoch": 0.8616291722549788, + "grad_norm": 6.972115516662598, + "learning_rate": 4.743375214561047e-07, + "loss": 0.1746, + "step": 34049 + }, + { + "epoch": 0.8616544778196725, + "grad_norm": 4.423905372619629, + "learning_rate": 4.74166836937725e-07, + "loss": 0.1712, + "step": 34050 + }, + { + "epoch": 0.8616797833843662, + "grad_norm": 4.0148115158081055, + "learning_rate": 4.739961816056038e-07, + "loss": 0.0922, + "step": 34051 + }, + { + "epoch": 0.86170508894906, + "grad_norm": 3.5466816425323486, + "learning_rate": 4.738255554608373e-07, + "loss": 0.0944, + "step": 34052 + }, + { + "epoch": 0.8617303945137536, + "grad_norm": 13.413110733032227, + "learning_rate": 4.736549585045286e-07, + "loss": 0.2455, + "step": 34053 + }, + { + "epoch": 0.8617557000784473, + "grad_norm": 5.081300735473633, + "learning_rate": 4.734843907377773e-07, + "loss": 0.1877, + "step": 34054 + }, + { + "epoch": 0.861781005643141, + "grad_norm": 4.488731384277344, + "learning_rate": 4.733138521616831e-07, + "loss": 0.0751, + "step": 34055 + }, + { + "epoch": 0.8618063112078346, + "grad_norm": 22.133142471313477, + "learning_rate": 4.7314334277734465e-07, + "loss": 0.1508, + "step": 34056 + }, + { + "epoch": 0.8618316167725283, + "grad_norm": 3.1425867080688477, + "learning_rate": 4.729728625858643e-07, + "loss": 0.1717, + "step": 34057 + }, + { + "epoch": 0.861856922337222, + "grad_norm": 4.124494552612305, + "learning_rate": 4.728024115883395e-07, + "loss": 0.1269, + "step": 34058 + }, + { + "epoch": 0.8618822279019156, + "grad_norm": 3.708994150161743, + "learning_rate": 4.726319897858689e-07, + "loss": 0.0874, + "step": 34059 + }, + { + "epoch": 0.8619075334666093, + "grad_norm": 2.467881917953491, + "learning_rate": 4.724615971795532e-07, + "loss": 0.1062, + "step": 34060 + }, + { + "epoch": 0.861932839031303, + "grad_norm": 5.882587432861328, + "learning_rate": 4.7229123377049047e-07, + "loss": 0.2059, + "step": 34061 + }, + { + "epoch": 0.8619581445959966, + "grad_norm": 5.71962308883667, + "learning_rate": 4.721208995597798e-07, + "loss": 0.1386, + "step": 34062 + }, + { + "epoch": 0.8619834501606903, + "grad_norm": 5.636353015899658, + "learning_rate": 4.7195059454851754e-07, + "loss": 0.0904, + "step": 34063 + }, + { + "epoch": 0.862008755725384, + "grad_norm": 2.8299560546875, + "learning_rate": 4.7178031873780614e-07, + "loss": 0.08, + "step": 34064 + }, + { + "epoch": 0.8620340612900776, + "grad_norm": 2.620750665664673, + "learning_rate": 4.7161007212873924e-07, + "loss": 0.0809, + "step": 34065 + }, + { + "epoch": 0.8620593668547714, + "grad_norm": 4.866796016693115, + "learning_rate": 4.714398547224169e-07, + "loss": 0.1425, + "step": 34066 + }, + { + "epoch": 0.8620846724194651, + "grad_norm": 4.048267841339111, + "learning_rate": 4.712696665199362e-07, + "loss": 0.1026, + "step": 34067 + }, + { + "epoch": 0.8621099779841587, + "grad_norm": 8.005804061889648, + "learning_rate": 4.7109950752239676e-07, + "loss": 0.1632, + "step": 34068 + }, + { + "epoch": 0.8621352835488524, + "grad_norm": 4.402580738067627, + "learning_rate": 4.709293777308926e-07, + "loss": 0.1412, + "step": 34069 + }, + { + "epoch": 0.8621605891135461, + "grad_norm": 3.9439759254455566, + "learning_rate": 4.7075927714652303e-07, + "loss": 0.0732, + "step": 34070 + }, + { + "epoch": 0.8621858946782397, + "grad_norm": 3.5688815116882324, + "learning_rate": 4.705892057703837e-07, + "loss": 0.1721, + "step": 34071 + }, + { + "epoch": 0.8622112002429334, + "grad_norm": 3.75164794921875, + "learning_rate": 4.7041916360357263e-07, + "loss": 0.1078, + "step": 34072 + }, + { + "epoch": 0.8622365058076271, + "grad_norm": 2.8383731842041016, + "learning_rate": 4.702491506471857e-07, + "loss": 0.1121, + "step": 34073 + }, + { + "epoch": 0.8622618113723207, + "grad_norm": 2.7389116287231445, + "learning_rate": 4.700791669023197e-07, + "loss": 0.1047, + "step": 34074 + }, + { + "epoch": 0.8622871169370144, + "grad_norm": 6.067688465118408, + "learning_rate": 4.699092123700694e-07, + "loss": 0.1981, + "step": 34075 + }, + { + "epoch": 0.8623124225017081, + "grad_norm": 3.9071028232574463, + "learning_rate": 4.6973928705153325e-07, + "loss": 0.1421, + "step": 34076 + }, + { + "epoch": 0.8623377280664019, + "grad_norm": 6.430709362030029, + "learning_rate": 4.695693909478055e-07, + "loss": 0.2099, + "step": 34077 + }, + { + "epoch": 0.8623630336310955, + "grad_norm": 9.08060359954834, + "learning_rate": 4.693995240599819e-07, + "loss": 0.2378, + "step": 34078 + }, + { + "epoch": 0.8623883391957892, + "grad_norm": 6.9463324546813965, + "learning_rate": 4.692296863891571e-07, + "loss": 0.1145, + "step": 34079 + }, + { + "epoch": 0.8624136447604829, + "grad_norm": 3.6432697772979736, + "learning_rate": 4.690598779364286e-07, + "loss": 0.0897, + "step": 34080 + }, + { + "epoch": 0.8624389503251765, + "grad_norm": 11.326950073242188, + "learning_rate": 4.688900987028888e-07, + "loss": 0.2137, + "step": 34081 + }, + { + "epoch": 0.8624642558898702, + "grad_norm": 3.7757890224456787, + "learning_rate": 4.6872034868963577e-07, + "loss": 0.1471, + "step": 34082 + }, + { + "epoch": 0.8624895614545639, + "grad_norm": 5.95598840713501, + "learning_rate": 4.685506278977603e-07, + "loss": 0.1354, + "step": 34083 + }, + { + "epoch": 0.8625148670192575, + "grad_norm": 3.029768943786621, + "learning_rate": 4.6838093632836034e-07, + "loss": 0.1067, + "step": 34084 + }, + { + "epoch": 0.8625401725839512, + "grad_norm": 4.014187812805176, + "learning_rate": 4.682112739825284e-07, + "loss": 0.1604, + "step": 34085 + }, + { + "epoch": 0.8625654781486449, + "grad_norm": 3.4637374877929688, + "learning_rate": 4.6804164086135915e-07, + "loss": 0.1027, + "step": 34086 + }, + { + "epoch": 0.8625907837133385, + "grad_norm": 10.528718948364258, + "learning_rate": 4.678720369659456e-07, + "loss": 0.2786, + "step": 34087 + }, + { + "epoch": 0.8626160892780322, + "grad_norm": 5.309118270874023, + "learning_rate": 4.67702462297383e-07, + "loss": 0.1546, + "step": 34088 + }, + { + "epoch": 0.862641394842726, + "grad_norm": 6.1125311851501465, + "learning_rate": 4.675329168567638e-07, + "loss": 0.1847, + "step": 34089 + }, + { + "epoch": 0.8626667004074196, + "grad_norm": 3.6487998962402344, + "learning_rate": 4.6736340064518105e-07, + "loss": 0.1403, + "step": 34090 + }, + { + "epoch": 0.8626920059721133, + "grad_norm": 3.7073938846588135, + "learning_rate": 4.671939136637299e-07, + "loss": 0.156, + "step": 34091 + }, + { + "epoch": 0.862717311536807, + "grad_norm": 4.641069412231445, + "learning_rate": 4.670244559135012e-07, + "loss": 0.1535, + "step": 34092 + }, + { + "epoch": 0.8627426171015006, + "grad_norm": 3.882897138595581, + "learning_rate": 4.6685502739558906e-07, + "loss": 0.1388, + "step": 34093 + }, + { + "epoch": 0.8627679226661943, + "grad_norm": 5.310646057128906, + "learning_rate": 4.6668562811108485e-07, + "loss": 0.1719, + "step": 34094 + }, + { + "epoch": 0.862793228230888, + "grad_norm": 8.746822357177734, + "learning_rate": 4.665162580610838e-07, + "loss": 0.3013, + "step": 34095 + }, + { + "epoch": 0.8628185337955816, + "grad_norm": 4.308267593383789, + "learning_rate": 4.663469172466739e-07, + "loss": 0.1253, + "step": 34096 + }, + { + "epoch": 0.8628438393602753, + "grad_norm": 8.246435165405273, + "learning_rate": 4.661776056689504e-07, + "loss": 0.2382, + "step": 34097 + }, + { + "epoch": 0.862869144924969, + "grad_norm": 17.621437072753906, + "learning_rate": 4.6600832332900304e-07, + "loss": 0.3484, + "step": 34098 + }, + { + "epoch": 0.8628944504896626, + "grad_norm": 3.2590880393981934, + "learning_rate": 4.658390702279264e-07, + "loss": 0.0739, + "step": 34099 + }, + { + "epoch": 0.8629197560543563, + "grad_norm": 5.871425628662109, + "learning_rate": 4.6566984636680855e-07, + "loss": 0.1743, + "step": 34100 + }, + { + "epoch": 0.86294506161905, + "grad_norm": 10.026954650878906, + "learning_rate": 4.655006517467436e-07, + "loss": 0.1987, + "step": 34101 + }, + { + "epoch": 0.8629703671837438, + "grad_norm": 6.889703750610352, + "learning_rate": 4.653314863688202e-07, + "loss": 0.1146, + "step": 34102 + }, + { + "epoch": 0.8629956727484374, + "grad_norm": 4.019931316375732, + "learning_rate": 4.6516235023413125e-07, + "loss": 0.1467, + "step": 34103 + }, + { + "epoch": 0.8630209783131311, + "grad_norm": 1.7615594863891602, + "learning_rate": 4.6499324334376705e-07, + "loss": 0.054, + "step": 34104 + }, + { + "epoch": 0.8630462838778248, + "grad_norm": 6.789101600646973, + "learning_rate": 4.648241656988178e-07, + "loss": 0.1856, + "step": 34105 + }, + { + "epoch": 0.8630715894425184, + "grad_norm": 3.6570048332214355, + "learning_rate": 4.646551173003733e-07, + "loss": 0.1321, + "step": 34106 + }, + { + "epoch": 0.8630968950072121, + "grad_norm": 9.242696762084961, + "learning_rate": 4.6448609814952475e-07, + "loss": 0.1633, + "step": 34107 + }, + { + "epoch": 0.8631222005719058, + "grad_norm": 7.5770721435546875, + "learning_rate": 4.6431710824736185e-07, + "loss": 0.1512, + "step": 34108 + }, + { + "epoch": 0.8631475061365994, + "grad_norm": 4.367612838745117, + "learning_rate": 4.641481475949744e-07, + "loss": 0.1965, + "step": 34109 + }, + { + "epoch": 0.8631728117012931, + "grad_norm": 4.696913242340088, + "learning_rate": 4.6397921619345085e-07, + "loss": 0.0698, + "step": 34110 + }, + { + "epoch": 0.8631981172659868, + "grad_norm": 11.29161262512207, + "learning_rate": 4.638103140438821e-07, + "loss": 0.2472, + "step": 34111 + }, + { + "epoch": 0.8632234228306804, + "grad_norm": 8.155329704284668, + "learning_rate": 4.636414411473572e-07, + "loss": 0.222, + "step": 34112 + }, + { + "epoch": 0.8632487283953741, + "grad_norm": 3.7478432655334473, + "learning_rate": 4.634725975049648e-07, + "loss": 0.1651, + "step": 34113 + }, + { + "epoch": 0.8632740339600679, + "grad_norm": 7.033439636230469, + "learning_rate": 4.633037831177928e-07, + "loss": 0.205, + "step": 34114 + }, + { + "epoch": 0.8632993395247615, + "grad_norm": 6.195833206176758, + "learning_rate": 4.6313499798693205e-07, + "loss": 0.232, + "step": 34115 + }, + { + "epoch": 0.8633246450894552, + "grad_norm": 6.958590030670166, + "learning_rate": 4.6296624211346895e-07, + "loss": 0.1761, + "step": 34116 + }, + { + "epoch": 0.8633499506541489, + "grad_norm": 7.127033233642578, + "learning_rate": 4.6279751549849316e-07, + "loss": 0.217, + "step": 34117 + }, + { + "epoch": 0.8633752562188425, + "grad_norm": 8.383719444274902, + "learning_rate": 4.6262881814309156e-07, + "loss": 0.2427, + "step": 34118 + }, + { + "epoch": 0.8634005617835362, + "grad_norm": 6.3371076583862305, + "learning_rate": 4.6246015004835334e-07, + "loss": 0.2232, + "step": 34119 + }, + { + "epoch": 0.8634258673482299, + "grad_norm": 3.6495771408081055, + "learning_rate": 4.622915112153653e-07, + "loss": 0.1645, + "step": 34120 + }, + { + "epoch": 0.8634511729129235, + "grad_norm": 5.642259120941162, + "learning_rate": 4.6212290164521554e-07, + "loss": 0.1715, + "step": 34121 + }, + { + "epoch": 0.8634764784776172, + "grad_norm": 4.28476095199585, + "learning_rate": 4.61954321338991e-07, + "loss": 0.1103, + "step": 34122 + }, + { + "epoch": 0.8635017840423109, + "grad_norm": 4.651391506195068, + "learning_rate": 4.6178577029777794e-07, + "loss": 0.199, + "step": 34123 + }, + { + "epoch": 0.8635270896070045, + "grad_norm": 6.252852916717529, + "learning_rate": 4.6161724852266496e-07, + "loss": 0.1793, + "step": 34124 + }, + { + "epoch": 0.8635523951716982, + "grad_norm": 4.35299825668335, + "learning_rate": 4.614487560147374e-07, + "loss": 0.1069, + "step": 34125 + }, + { + "epoch": 0.863577700736392, + "grad_norm": 7.3949995040893555, + "learning_rate": 4.612802927750848e-07, + "loss": 0.2005, + "step": 34126 + }, + { + "epoch": 0.8636030063010857, + "grad_norm": 2.949052572250366, + "learning_rate": 4.6111185880478913e-07, + "loss": 0.1163, + "step": 34127 + }, + { + "epoch": 0.8636283118657793, + "grad_norm": 8.653278350830078, + "learning_rate": 4.609434541049396e-07, + "loss": 0.2748, + "step": 34128 + }, + { + "epoch": 0.863653617430473, + "grad_norm": 4.637491226196289, + "learning_rate": 4.6077507867662073e-07, + "loss": 0.1424, + "step": 34129 + }, + { + "epoch": 0.8636789229951667, + "grad_norm": 7.167888164520264, + "learning_rate": 4.606067325209207e-07, + "loss": 0.1762, + "step": 34130 + }, + { + "epoch": 0.8637042285598603, + "grad_norm": 11.399942398071289, + "learning_rate": 4.604384156389213e-07, + "loss": 0.1795, + "step": 34131 + }, + { + "epoch": 0.863729534124554, + "grad_norm": 4.163374423980713, + "learning_rate": 4.6027012803171177e-07, + "loss": 0.1472, + "step": 34132 + }, + { + "epoch": 0.8637548396892477, + "grad_norm": 8.295563697814941, + "learning_rate": 4.6010186970037454e-07, + "loss": 0.1702, + "step": 34133 + }, + { + "epoch": 0.8637801452539413, + "grad_norm": 22.108678817749023, + "learning_rate": 4.5993364064599644e-07, + "loss": 0.249, + "step": 34134 + }, + { + "epoch": 0.863805450818635, + "grad_norm": 5.555710792541504, + "learning_rate": 4.5976544086966225e-07, + "loss": 0.1047, + "step": 34135 + }, + { + "epoch": 0.8638307563833287, + "grad_norm": 5.602630615234375, + "learning_rate": 4.5959727037245607e-07, + "loss": 0.2094, + "step": 34136 + }, + { + "epoch": 0.8638560619480223, + "grad_norm": 12.871169090270996, + "learning_rate": 4.5942912915546146e-07, + "loss": 0.1989, + "step": 34137 + }, + { + "epoch": 0.863881367512716, + "grad_norm": 3.946708917617798, + "learning_rate": 4.592610172197648e-07, + "loss": 0.0918, + "step": 34138 + }, + { + "epoch": 0.8639066730774098, + "grad_norm": 3.5937442779541016, + "learning_rate": 4.590929345664491e-07, + "loss": 0.1371, + "step": 34139 + }, + { + "epoch": 0.8639319786421034, + "grad_norm": 4.653204917907715, + "learning_rate": 4.5892488119659794e-07, + "loss": 0.134, + "step": 34140 + }, + { + "epoch": 0.8639572842067971, + "grad_norm": 4.955855369567871, + "learning_rate": 4.5875685711129546e-07, + "loss": 0.1796, + "step": 34141 + }, + { + "epoch": 0.8639825897714908, + "grad_norm": 8.870443344116211, + "learning_rate": 4.585888623116258e-07, + "loss": 0.2581, + "step": 34142 + }, + { + "epoch": 0.8640078953361844, + "grad_norm": 3.9181041717529297, + "learning_rate": 4.58420896798672e-07, + "loss": 0.1067, + "step": 34143 + }, + { + "epoch": 0.8640332009008781, + "grad_norm": 3.116844415664673, + "learning_rate": 4.5825296057351643e-07, + "loss": 0.0987, + "step": 34144 + }, + { + "epoch": 0.8640585064655718, + "grad_norm": 3.446958065032959, + "learning_rate": 4.580850536372422e-07, + "loss": 0.0841, + "step": 34145 + }, + { + "epoch": 0.8640838120302654, + "grad_norm": 5.13999605178833, + "learning_rate": 4.5791717599093345e-07, + "loss": 0.1763, + "step": 34146 + }, + { + "epoch": 0.8641091175949591, + "grad_norm": 11.58008098602295, + "learning_rate": 4.5774932763567205e-07, + "loss": 0.1641, + "step": 34147 + }, + { + "epoch": 0.8641344231596528, + "grad_norm": 3.865964412689209, + "learning_rate": 4.5758150857254045e-07, + "loss": 0.1134, + "step": 34148 + }, + { + "epoch": 0.8641597287243464, + "grad_norm": 3.303838014602661, + "learning_rate": 4.5741371880262063e-07, + "loss": 0.0915, + "step": 34149 + }, + { + "epoch": 0.8641850342890401, + "grad_norm": 3.7479562759399414, + "learning_rate": 4.5724595832699394e-07, + "loss": 0.1692, + "step": 34150 + }, + { + "epoch": 0.8642103398537339, + "grad_norm": 5.816227912902832, + "learning_rate": 4.570782271467433e-07, + "loss": 0.193, + "step": 34151 + }, + { + "epoch": 0.8642356454184276, + "grad_norm": 3.5372862815856934, + "learning_rate": 4.5691052526295077e-07, + "loss": 0.1272, + "step": 34152 + }, + { + "epoch": 0.8642609509831212, + "grad_norm": 4.1771955490112305, + "learning_rate": 4.56742852676697e-07, + "loss": 0.137, + "step": 34153 + }, + { + "epoch": 0.8642862565478149, + "grad_norm": 3.284553289413452, + "learning_rate": 4.5657520938906295e-07, + "loss": 0.1515, + "step": 34154 + }, + { + "epoch": 0.8643115621125086, + "grad_norm": 5.701802730560303, + "learning_rate": 4.5640759540113046e-07, + "loss": 0.0923, + "step": 34155 + }, + { + "epoch": 0.8643368676772022, + "grad_norm": 15.133066177368164, + "learning_rate": 4.5624001071397974e-07, + "loss": 0.2489, + "step": 34156 + }, + { + "epoch": 0.8643621732418959, + "grad_norm": 2.780499219894409, + "learning_rate": 4.5607245532869383e-07, + "loss": 0.135, + "step": 34157 + }, + { + "epoch": 0.8643874788065896, + "grad_norm": 7.31313419342041, + "learning_rate": 4.559049292463491e-07, + "loss": 0.149, + "step": 34158 + }, + { + "epoch": 0.8644127843712832, + "grad_norm": 7.154414653778076, + "learning_rate": 4.5573743246802973e-07, + "loss": 0.2211, + "step": 34159 + }, + { + "epoch": 0.8644380899359769, + "grad_norm": 3.5104410648345947, + "learning_rate": 4.555699649948131e-07, + "loss": 0.1517, + "step": 34160 + }, + { + "epoch": 0.8644633955006706, + "grad_norm": 5.042224407196045, + "learning_rate": 4.554025268277823e-07, + "loss": 0.1973, + "step": 34161 + }, + { + "epoch": 0.8644887010653642, + "grad_norm": 3.687007427215576, + "learning_rate": 4.5523511796801367e-07, + "loss": 0.0906, + "step": 34162 + }, + { + "epoch": 0.864514006630058, + "grad_norm": 4.581283092498779, + "learning_rate": 4.5506773841658913e-07, + "loss": 0.2015, + "step": 34163 + }, + { + "epoch": 0.8645393121947517, + "grad_norm": 3.883873224258423, + "learning_rate": 4.5490038817458615e-07, + "loss": 0.1165, + "step": 34164 + }, + { + "epoch": 0.8645646177594453, + "grad_norm": 4.74439001083374, + "learning_rate": 4.5473306724308607e-07, + "loss": 0.1684, + "step": 34165 + }, + { + "epoch": 0.864589923324139, + "grad_norm": 6.20462703704834, + "learning_rate": 4.545657756231675e-07, + "loss": 0.1182, + "step": 34166 + }, + { + "epoch": 0.8646152288888327, + "grad_norm": 21.838302612304688, + "learning_rate": 4.5439851331590845e-07, + "loss": 0.4852, + "step": 34167 + }, + { + "epoch": 0.8646405344535263, + "grad_norm": 8.73637580871582, + "learning_rate": 4.542312803223864e-07, + "loss": 0.2001, + "step": 34168 + }, + { + "epoch": 0.86466584001822, + "grad_norm": 3.047722101211548, + "learning_rate": 4.5406407664368267e-07, + "loss": 0.1156, + "step": 34169 + }, + { + "epoch": 0.8646911455829137, + "grad_norm": 3.652606248855591, + "learning_rate": 4.5389690228087427e-07, + "loss": 0.1869, + "step": 34170 + }, + { + "epoch": 0.8647164511476073, + "grad_norm": 9.447626113891602, + "learning_rate": 4.5372975723503856e-07, + "loss": 0.1799, + "step": 34171 + }, + { + "epoch": 0.864741756712301, + "grad_norm": 3.265137195587158, + "learning_rate": 4.535626415072536e-07, + "loss": 0.1191, + "step": 34172 + }, + { + "epoch": 0.8647670622769947, + "grad_norm": 4.17595100402832, + "learning_rate": 4.53395555098598e-07, + "loss": 0.1549, + "step": 34173 + }, + { + "epoch": 0.8647923678416883, + "grad_norm": 7.44306755065918, + "learning_rate": 4.5322849801014925e-07, + "loss": 0.0917, + "step": 34174 + }, + { + "epoch": 0.864817673406382, + "grad_norm": 6.276196002960205, + "learning_rate": 4.530614702429836e-07, + "loss": 0.2076, + "step": 34175 + }, + { + "epoch": 0.8648429789710758, + "grad_norm": 4.9517741203308105, + "learning_rate": 4.528944717981787e-07, + "loss": 0.1538, + "step": 34176 + }, + { + "epoch": 0.8648682845357694, + "grad_norm": 5.478902816772461, + "learning_rate": 4.5272750267681077e-07, + "loss": 0.2134, + "step": 34177 + }, + { + "epoch": 0.8648935901004631, + "grad_norm": 8.637674331665039, + "learning_rate": 4.5256056287995844e-07, + "loss": 0.2236, + "step": 34178 + }, + { + "epoch": 0.8649188956651568, + "grad_norm": 6.485352993011475, + "learning_rate": 4.5239365240869695e-07, + "loss": 0.1843, + "step": 34179 + }, + { + "epoch": 0.8649442012298505, + "grad_norm": 4.557475566864014, + "learning_rate": 4.522267712641032e-07, + "loss": 0.1672, + "step": 34180 + }, + { + "epoch": 0.8649695067945441, + "grad_norm": 17.18805503845215, + "learning_rate": 4.5205991944725193e-07, + "loss": 0.2251, + "step": 34181 + }, + { + "epoch": 0.8649948123592378, + "grad_norm": 4.91842794418335, + "learning_rate": 4.518930969592211e-07, + "loss": 0.1573, + "step": 34182 + }, + { + "epoch": 0.8650201179239315, + "grad_norm": 5.59220027923584, + "learning_rate": 4.517263038010855e-07, + "loss": 0.1821, + "step": 34183 + }, + { + "epoch": 0.8650454234886251, + "grad_norm": 3.612488269805908, + "learning_rate": 4.515595399739209e-07, + "loss": 0.142, + "step": 34184 + }, + { + "epoch": 0.8650707290533188, + "grad_norm": 2.5797598361968994, + "learning_rate": 4.513928054788019e-07, + "loss": 0.0999, + "step": 34185 + }, + { + "epoch": 0.8650960346180125, + "grad_norm": 5.63800048828125, + "learning_rate": 4.5122610031680614e-07, + "loss": 0.1361, + "step": 34186 + }, + { + "epoch": 0.8651213401827061, + "grad_norm": 5.346882343292236, + "learning_rate": 4.510594244890054e-07, + "loss": 0.1738, + "step": 34187 + }, + { + "epoch": 0.8651466457473999, + "grad_norm": 8.675131797790527, + "learning_rate": 4.5089277799647836e-07, + "loss": 0.2939, + "step": 34188 + }, + { + "epoch": 0.8651719513120936, + "grad_norm": 4.804677486419678, + "learning_rate": 4.5072616084029576e-07, + "loss": 0.1285, + "step": 34189 + }, + { + "epoch": 0.8651972568767872, + "grad_norm": 5.912990570068359, + "learning_rate": 4.505595730215351e-07, + "loss": 0.1231, + "step": 34190 + }, + { + "epoch": 0.8652225624414809, + "grad_norm": 6.916429042816162, + "learning_rate": 4.5039301454126827e-07, + "loss": 0.1631, + "step": 34191 + }, + { + "epoch": 0.8652478680061746, + "grad_norm": 7.440866470336914, + "learning_rate": 4.5022648540057277e-07, + "loss": 0.2259, + "step": 34192 + }, + { + "epoch": 0.8652731735708682, + "grad_norm": 16.18834114074707, + "learning_rate": 4.5005998560051835e-07, + "loss": 0.2332, + "step": 34193 + }, + { + "epoch": 0.8652984791355619, + "grad_norm": 5.828418254852295, + "learning_rate": 4.4989351514218126e-07, + "loss": 0.1907, + "step": 34194 + }, + { + "epoch": 0.8653237847002556, + "grad_norm": 8.344725608825684, + "learning_rate": 4.4972707402663407e-07, + "loss": 0.2184, + "step": 34195 + }, + { + "epoch": 0.8653490902649492, + "grad_norm": 2.313370704650879, + "learning_rate": 4.4956066225495197e-07, + "loss": 0.1142, + "step": 34196 + }, + { + "epoch": 0.8653743958296429, + "grad_norm": 6.248490333557129, + "learning_rate": 4.4939427982820524e-07, + "loss": 0.1235, + "step": 34197 + }, + { + "epoch": 0.8653997013943366, + "grad_norm": 12.600064277648926, + "learning_rate": 4.492279267474692e-07, + "loss": 0.2359, + "step": 34198 + }, + { + "epoch": 0.8654250069590302, + "grad_norm": 7.191946029663086, + "learning_rate": 4.4906160301381454e-07, + "loss": 0.1248, + "step": 34199 + }, + { + "epoch": 0.865450312523724, + "grad_norm": 4.427860736846924, + "learning_rate": 4.488953086283165e-07, + "loss": 0.1573, + "step": 34200 + }, + { + "epoch": 0.8654756180884177, + "grad_norm": 4.183664321899414, + "learning_rate": 4.487290435920455e-07, + "loss": 0.1148, + "step": 34201 + }, + { + "epoch": 0.8655009236531113, + "grad_norm": 2.619657516479492, + "learning_rate": 4.4856280790607443e-07, + "loss": 0.1209, + "step": 34202 + }, + { + "epoch": 0.865526229217805, + "grad_norm": 5.624715328216553, + "learning_rate": 4.483966015714747e-07, + "loss": 0.1749, + "step": 34203 + }, + { + "epoch": 0.8655515347824987, + "grad_norm": 4.673989295959473, + "learning_rate": 4.4823042458931877e-07, + "loss": 0.1412, + "step": 34204 + }, + { + "epoch": 0.8655768403471924, + "grad_norm": 10.729124069213867, + "learning_rate": 4.4806427696067857e-07, + "loss": 0.1415, + "step": 34205 + }, + { + "epoch": 0.865602145911886, + "grad_norm": 3.4009790420532227, + "learning_rate": 4.4789815868662547e-07, + "loss": 0.1065, + "step": 34206 + }, + { + "epoch": 0.8656274514765797, + "grad_norm": 11.728931427001953, + "learning_rate": 4.477320697682297e-07, + "loss": 0.1878, + "step": 34207 + }, + { + "epoch": 0.8656527570412734, + "grad_norm": 10.954955101013184, + "learning_rate": 4.4756601020656265e-07, + "loss": 0.2666, + "step": 34208 + }, + { + "epoch": 0.865678062605967, + "grad_norm": 5.4907355308532715, + "learning_rate": 4.4739998000269624e-07, + "loss": 0.1942, + "step": 34209 + }, + { + "epoch": 0.8657033681706607, + "grad_norm": 4.8474249839782715, + "learning_rate": 4.4723397915770075e-07, + "loss": 0.1356, + "step": 34210 + }, + { + "epoch": 0.8657286737353544, + "grad_norm": 2.508528709411621, + "learning_rate": 4.4706800767264635e-07, + "loss": 0.0797, + "step": 34211 + }, + { + "epoch": 0.865753979300048, + "grad_norm": 10.3671293258667, + "learning_rate": 4.4690206554860226e-07, + "loss": 0.1968, + "step": 34212 + }, + { + "epoch": 0.8657792848647418, + "grad_norm": 9.010126113891602, + "learning_rate": 4.4673615278664093e-07, + "loss": 0.124, + "step": 34213 + }, + { + "epoch": 0.8658045904294355, + "grad_norm": 4.7718706130981445, + "learning_rate": 4.4657026938783154e-07, + "loss": 0.2173, + "step": 34214 + }, + { + "epoch": 0.8658298959941291, + "grad_norm": 7.072229862213135, + "learning_rate": 4.4640441535324265e-07, + "loss": 0.212, + "step": 34215 + }, + { + "epoch": 0.8658552015588228, + "grad_norm": 9.639825820922852, + "learning_rate": 4.462385906839445e-07, + "loss": 0.1129, + "step": 34216 + }, + { + "epoch": 0.8658805071235165, + "grad_norm": 3.426664352416992, + "learning_rate": 4.4607279538100733e-07, + "loss": 0.0977, + "step": 34217 + }, + { + "epoch": 0.8659058126882101, + "grad_norm": 7.951048851013184, + "learning_rate": 4.459070294454987e-07, + "loss": 0.1875, + "step": 34218 + }, + { + "epoch": 0.8659311182529038, + "grad_norm": 5.176088333129883, + "learning_rate": 4.457412928784899e-07, + "loss": 0.1568, + "step": 34219 + }, + { + "epoch": 0.8659564238175975, + "grad_norm": 4.613676071166992, + "learning_rate": 4.4557558568104675e-07, + "loss": 0.1353, + "step": 34220 + }, + { + "epoch": 0.8659817293822911, + "grad_norm": 7.170991897583008, + "learning_rate": 4.4540990785424066e-07, + "loss": 0.1938, + "step": 34221 + }, + { + "epoch": 0.8660070349469848, + "grad_norm": 8.848520278930664, + "learning_rate": 4.4524425939913794e-07, + "loss": 0.194, + "step": 34222 + }, + { + "epoch": 0.8660323405116785, + "grad_norm": 9.60133171081543, + "learning_rate": 4.450786403168095e-07, + "loss": 0.2406, + "step": 34223 + }, + { + "epoch": 0.8660576460763721, + "grad_norm": 3.920592784881592, + "learning_rate": 4.4491305060831935e-07, + "loss": 0.0689, + "step": 34224 + }, + { + "epoch": 0.8660829516410659, + "grad_norm": 5.062427520751953, + "learning_rate": 4.4474749027473896e-07, + "loss": 0.1381, + "step": 34225 + }, + { + "epoch": 0.8661082572057596, + "grad_norm": 4.68870210647583, + "learning_rate": 4.4458195931713357e-07, + "loss": 0.147, + "step": 34226 + }, + { + "epoch": 0.8661335627704532, + "grad_norm": 7.764029502868652, + "learning_rate": 4.4441645773657394e-07, + "loss": 0.1951, + "step": 34227 + }, + { + "epoch": 0.8661588683351469, + "grad_norm": 4.083644390106201, + "learning_rate": 4.442509855341226e-07, + "loss": 0.1227, + "step": 34228 + }, + { + "epoch": 0.8661841738998406, + "grad_norm": 2.9273152351379395, + "learning_rate": 4.4408554271085094e-07, + "loss": 0.1205, + "step": 34229 + }, + { + "epoch": 0.8662094794645343, + "grad_norm": 4.772316932678223, + "learning_rate": 4.4392012926782304e-07, + "loss": 0.1907, + "step": 34230 + }, + { + "epoch": 0.8662347850292279, + "grad_norm": 4.383895397186279, + "learning_rate": 4.437547452061075e-07, + "loss": 0.1515, + "step": 34231 + }, + { + "epoch": 0.8662600905939216, + "grad_norm": 7.897090911865234, + "learning_rate": 4.435893905267702e-07, + "loss": 0.1642, + "step": 34232 + }, + { + "epoch": 0.8662853961586153, + "grad_norm": 16.61149787902832, + "learning_rate": 4.434240652308769e-07, + "loss": 0.2595, + "step": 34233 + }, + { + "epoch": 0.8663107017233089, + "grad_norm": 7.5468831062316895, + "learning_rate": 4.432587693194945e-07, + "loss": 0.1569, + "step": 34234 + }, + { + "epoch": 0.8663360072880026, + "grad_norm": 4.2323994636535645, + "learning_rate": 4.430935027936878e-07, + "loss": 0.0808, + "step": 34235 + }, + { + "epoch": 0.8663613128526964, + "grad_norm": 8.769800186157227, + "learning_rate": 4.429282656545247e-07, + "loss": 0.2391, + "step": 34236 + }, + { + "epoch": 0.86638661841739, + "grad_norm": 8.59826374053955, + "learning_rate": 4.4276305790306894e-07, + "loss": 0.1873, + "step": 34237 + }, + { + "epoch": 0.8664119239820837, + "grad_norm": 3.6601498126983643, + "learning_rate": 4.425978795403868e-07, + "loss": 0.115, + "step": 34238 + }, + { + "epoch": 0.8664372295467774, + "grad_norm": 13.778573036193848, + "learning_rate": 4.4243273056754244e-07, + "loss": 0.2191, + "step": 34239 + }, + { + "epoch": 0.866462535111471, + "grad_norm": 9.98975658416748, + "learning_rate": 4.422676109856022e-07, + "loss": 0.2915, + "step": 34240 + }, + { + "epoch": 0.8664878406761647, + "grad_norm": 3.2541894912719727, + "learning_rate": 4.4210252079563086e-07, + "loss": 0.0951, + "step": 34241 + }, + { + "epoch": 0.8665131462408584, + "grad_norm": 3.776466131210327, + "learning_rate": 4.4193745999869197e-07, + "loss": 0.108, + "step": 34242 + }, + { + "epoch": 0.866538451805552, + "grad_norm": 4.705722808837891, + "learning_rate": 4.4177242859585023e-07, + "loss": 0.092, + "step": 34243 + }, + { + "epoch": 0.8665637573702457, + "grad_norm": 2.8023858070373535, + "learning_rate": 4.4160742658817034e-07, + "loss": 0.1071, + "step": 34244 + }, + { + "epoch": 0.8665890629349394, + "grad_norm": 4.741113662719727, + "learning_rate": 4.4144245397671646e-07, + "loss": 0.107, + "step": 34245 + }, + { + "epoch": 0.866614368499633, + "grad_norm": 5.9244384765625, + "learning_rate": 4.4127751076255275e-07, + "loss": 0.1526, + "step": 34246 + }, + { + "epoch": 0.8666396740643267, + "grad_norm": 5.369262218475342, + "learning_rate": 4.411125969467411e-07, + "loss": 0.2462, + "step": 34247 + }, + { + "epoch": 0.8666649796290204, + "grad_norm": 4.07111120223999, + "learning_rate": 4.409477125303474e-07, + "loss": 0.1134, + "step": 34248 + }, + { + "epoch": 0.866690285193714, + "grad_norm": 6.553962230682373, + "learning_rate": 4.407828575144335e-07, + "loss": 0.1522, + "step": 34249 + }, + { + "epoch": 0.8667155907584078, + "grad_norm": 2.8879034519195557, + "learning_rate": 4.40618031900063e-07, + "loss": 0.0971, + "step": 34250 + }, + { + "epoch": 0.8667408963231015, + "grad_norm": 3.379803419113159, + "learning_rate": 4.404532356882979e-07, + "loss": 0.0943, + "step": 34251 + }, + { + "epoch": 0.8667662018877951, + "grad_norm": 3.445551872253418, + "learning_rate": 4.4028846888020285e-07, + "loss": 0.1362, + "step": 34252 + }, + { + "epoch": 0.8667915074524888, + "grad_norm": 7.492027759552002, + "learning_rate": 4.401237314768381e-07, + "loss": 0.176, + "step": 34253 + }, + { + "epoch": 0.8668168130171825, + "grad_norm": 3.7751121520996094, + "learning_rate": 4.399590234792689e-07, + "loss": 0.1168, + "step": 34254 + }, + { + "epoch": 0.8668421185818762, + "grad_norm": 4.450180530548096, + "learning_rate": 4.397943448885539e-07, + "loss": 0.1562, + "step": 34255 + }, + { + "epoch": 0.8668674241465698, + "grad_norm": 5.378521919250488, + "learning_rate": 4.3962969570575773e-07, + "loss": 0.0853, + "step": 34256 + }, + { + "epoch": 0.8668927297112635, + "grad_norm": 3.7656137943267822, + "learning_rate": 4.394650759319408e-07, + "loss": 0.1552, + "step": 34257 + }, + { + "epoch": 0.8669180352759572, + "grad_norm": 5.509526252746582, + "learning_rate": 4.393004855681671e-07, + "loss": 0.1497, + "step": 34258 + }, + { + "epoch": 0.8669433408406508, + "grad_norm": 5.275940418243408, + "learning_rate": 4.3913592461549415e-07, + "loss": 0.1367, + "step": 34259 + }, + { + "epoch": 0.8669686464053445, + "grad_norm": 9.748787879943848, + "learning_rate": 4.389713930749867e-07, + "loss": 0.2098, + "step": 34260 + }, + { + "epoch": 0.8669939519700383, + "grad_norm": 4.600467681884766, + "learning_rate": 4.388068909477039e-07, + "loss": 0.1695, + "step": 34261 + }, + { + "epoch": 0.8670192575347319, + "grad_norm": 4.120469570159912, + "learning_rate": 4.38642418234706e-07, + "loss": 0.1628, + "step": 34262 + }, + { + "epoch": 0.8670445630994256, + "grad_norm": 6.0053558349609375, + "learning_rate": 4.3847797493705545e-07, + "loss": 0.1615, + "step": 34263 + }, + { + "epoch": 0.8670698686641193, + "grad_norm": 6.378247261047363, + "learning_rate": 4.38313561055812e-07, + "loss": 0.1333, + "step": 34264 + }, + { + "epoch": 0.8670951742288129, + "grad_norm": 6.101004600524902, + "learning_rate": 4.381491765920365e-07, + "loss": 0.1167, + "step": 34265 + }, + { + "epoch": 0.8671204797935066, + "grad_norm": 10.58652400970459, + "learning_rate": 4.379848215467869e-07, + "loss": 0.2782, + "step": 34266 + }, + { + "epoch": 0.8671457853582003, + "grad_norm": 5.65329122543335, + "learning_rate": 4.378204959211252e-07, + "loss": 0.2243, + "step": 34267 + }, + { + "epoch": 0.8671710909228939, + "grad_norm": 3.756431818008423, + "learning_rate": 4.376561997161111e-07, + "loss": 0.1505, + "step": 34268 + }, + { + "epoch": 0.8671963964875876, + "grad_norm": 6.917391300201416, + "learning_rate": 4.3749193293280314e-07, + "loss": 0.1214, + "step": 34269 + }, + { + "epoch": 0.8672217020522813, + "grad_norm": 3.238389730453491, + "learning_rate": 4.373276955722605e-07, + "loss": 0.159, + "step": 34270 + }, + { + "epoch": 0.8672470076169749, + "grad_norm": 5.3488850593566895, + "learning_rate": 4.3716348763554404e-07, + "loss": 0.2046, + "step": 34271 + }, + { + "epoch": 0.8672723131816686, + "grad_norm": 8.275528907775879, + "learning_rate": 4.3699930912371016e-07, + "loss": 0.149, + "step": 34272 + }, + { + "epoch": 0.8672976187463624, + "grad_norm": 3.0071935653686523, + "learning_rate": 4.3683516003782013e-07, + "loss": 0.1065, + "step": 34273 + }, + { + "epoch": 0.867322924311056, + "grad_norm": 4.586676597595215, + "learning_rate": 4.3667104037892984e-07, + "loss": 0.1349, + "step": 34274 + }, + { + "epoch": 0.8673482298757497, + "grad_norm": 13.810015678405762, + "learning_rate": 4.365069501481001e-07, + "loss": 0.2082, + "step": 34275 + }, + { + "epoch": 0.8673735354404434, + "grad_norm": 5.74670934677124, + "learning_rate": 4.3634288934638835e-07, + "loss": 0.208, + "step": 34276 + }, + { + "epoch": 0.867398841005137, + "grad_norm": 5.2134575843811035, + "learning_rate": 4.361788579748527e-07, + "loss": 0.1589, + "step": 34277 + }, + { + "epoch": 0.8674241465698307, + "grad_norm": 7.501006603240967, + "learning_rate": 4.36014856034549e-07, + "loss": 0.1779, + "step": 34278 + }, + { + "epoch": 0.8674494521345244, + "grad_norm": 4.572887897491455, + "learning_rate": 4.358508835265379e-07, + "loss": 0.1959, + "step": 34279 + }, + { + "epoch": 0.8674747576992181, + "grad_norm": 3.613049030303955, + "learning_rate": 4.356869404518754e-07, + "loss": 0.1526, + "step": 34280 + }, + { + "epoch": 0.8675000632639117, + "grad_norm": 3.091111898422241, + "learning_rate": 4.3552302681161897e-07, + "loss": 0.0736, + "step": 34281 + }, + { + "epoch": 0.8675253688286054, + "grad_norm": 15.682960510253906, + "learning_rate": 4.353591426068243e-07, + "loss": 0.2102, + "step": 34282 + }, + { + "epoch": 0.8675506743932991, + "grad_norm": 5.803504943847656, + "learning_rate": 4.351952878385507e-07, + "loss": 0.1575, + "step": 34283 + }, + { + "epoch": 0.8675759799579927, + "grad_norm": 8.459779739379883, + "learning_rate": 4.350314625078522e-07, + "loss": 0.1782, + "step": 34284 + }, + { + "epoch": 0.8676012855226864, + "grad_norm": 5.620453834533691, + "learning_rate": 4.348676666157886e-07, + "loss": 0.1676, + "step": 34285 + }, + { + "epoch": 0.8676265910873802, + "grad_norm": 8.746134757995605, + "learning_rate": 4.347039001634129e-07, + "loss": 0.2477, + "step": 34286 + }, + { + "epoch": 0.8676518966520738, + "grad_norm": 6.195923805236816, + "learning_rate": 4.3454016315178315e-07, + "loss": 0.2015, + "step": 34287 + }, + { + "epoch": 0.8676772022167675, + "grad_norm": 3.7430858612060547, + "learning_rate": 4.343764555819535e-07, + "loss": 0.1205, + "step": 34288 + }, + { + "epoch": 0.8677025077814612, + "grad_norm": 5.19795036315918, + "learning_rate": 4.3421277745498257e-07, + "loss": 0.1208, + "step": 34289 + }, + { + "epoch": 0.8677278133461548, + "grad_norm": 4.486844539642334, + "learning_rate": 4.3404912877192284e-07, + "loss": 0.1492, + "step": 34290 + }, + { + "epoch": 0.8677531189108485, + "grad_norm": 4.927515029907227, + "learning_rate": 4.3388550953383123e-07, + "loss": 0.1484, + "step": 34291 + }, + { + "epoch": 0.8677784244755422, + "grad_norm": 11.29870319366455, + "learning_rate": 4.33721919741763e-07, + "loss": 0.3055, + "step": 34292 + }, + { + "epoch": 0.8678037300402358, + "grad_norm": 3.502748489379883, + "learning_rate": 4.3355835939677183e-07, + "loss": 0.104, + "step": 34293 + }, + { + "epoch": 0.8678290356049295, + "grad_norm": 5.381980895996094, + "learning_rate": 4.33394828499914e-07, + "loss": 0.159, + "step": 34294 + }, + { + "epoch": 0.8678543411696232, + "grad_norm": 5.415595531463623, + "learning_rate": 4.332313270522437e-07, + "loss": 0.2665, + "step": 34295 + }, + { + "epoch": 0.8678796467343168, + "grad_norm": 3.687494993209839, + "learning_rate": 4.330678550548151e-07, + "loss": 0.1185, + "step": 34296 + }, + { + "epoch": 0.8679049522990105, + "grad_norm": 11.020536422729492, + "learning_rate": 4.329044125086812e-07, + "loss": 0.2443, + "step": 34297 + }, + { + "epoch": 0.8679302578637043, + "grad_norm": 4.950397491455078, + "learning_rate": 4.327409994148996e-07, + "loss": 0.1223, + "step": 34298 + }, + { + "epoch": 0.8679555634283979, + "grad_norm": 4.931707859039307, + "learning_rate": 4.325776157745193e-07, + "loss": 0.1551, + "step": 34299 + }, + { + "epoch": 0.8679808689930916, + "grad_norm": 4.508802890777588, + "learning_rate": 4.324142615885979e-07, + "loss": 0.1852, + "step": 34300 + }, + { + "epoch": 0.8680061745577853, + "grad_norm": 4.759444713592529, + "learning_rate": 4.3225093685818565e-07, + "loss": 0.2512, + "step": 34301 + }, + { + "epoch": 0.8680314801224789, + "grad_norm": 5.308977127075195, + "learning_rate": 4.3208764158433946e-07, + "loss": 0.1607, + "step": 34302 + }, + { + "epoch": 0.8680567856871726, + "grad_norm": 8.239519119262695, + "learning_rate": 4.3192437576810907e-07, + "loss": 0.2422, + "step": 34303 + }, + { + "epoch": 0.8680820912518663, + "grad_norm": 7.4527411460876465, + "learning_rate": 4.317611394105492e-07, + "loss": 0.2024, + "step": 34304 + }, + { + "epoch": 0.8681073968165599, + "grad_norm": 4.163379192352295, + "learning_rate": 4.315979325127107e-07, + "loss": 0.1409, + "step": 34305 + }, + { + "epoch": 0.8681327023812536, + "grad_norm": 6.461480140686035, + "learning_rate": 4.314347550756487e-07, + "loss": 0.1983, + "step": 34306 + }, + { + "epoch": 0.8681580079459473, + "grad_norm": 30.074575424194336, + "learning_rate": 4.3127160710041414e-07, + "loss": 0.1739, + "step": 34307 + }, + { + "epoch": 0.868183313510641, + "grad_norm": 10.785392761230469, + "learning_rate": 4.31108488588059e-07, + "loss": 0.1791, + "step": 34308 + }, + { + "epoch": 0.8682086190753346, + "grad_norm": 3.5491418838500977, + "learning_rate": 4.3094539953963454e-07, + "loss": 0.1286, + "step": 34309 + }, + { + "epoch": 0.8682339246400284, + "grad_norm": 4.87531852722168, + "learning_rate": 4.307823399561939e-07, + "loss": 0.1576, + "step": 34310 + }, + { + "epoch": 0.8682592302047221, + "grad_norm": 5.212474346160889, + "learning_rate": 4.306193098387884e-07, + "loss": 0.1865, + "step": 34311 + }, + { + "epoch": 0.8682845357694157, + "grad_norm": 3.1361591815948486, + "learning_rate": 4.304563091884689e-07, + "loss": 0.1246, + "step": 34312 + }, + { + "epoch": 0.8683098413341094, + "grad_norm": 8.449899673461914, + "learning_rate": 4.302933380062857e-07, + "loss": 0.2211, + "step": 34313 + }, + { + "epoch": 0.8683351468988031, + "grad_norm": 3.6190409660339355, + "learning_rate": 4.301303962932918e-07, + "loss": 0.1223, + "step": 34314 + }, + { + "epoch": 0.8683604524634967, + "grad_norm": 6.697991371154785, + "learning_rate": 4.299674840505358e-07, + "loss": 0.1407, + "step": 34315 + }, + { + "epoch": 0.8683857580281904, + "grad_norm": 3.995789051055908, + "learning_rate": 4.298046012790713e-07, + "loss": 0.1349, + "step": 34316 + }, + { + "epoch": 0.8684110635928841, + "grad_norm": 4.921628952026367, + "learning_rate": 4.296417479799447e-07, + "loss": 0.1562, + "step": 34317 + }, + { + "epoch": 0.8684363691575777, + "grad_norm": 7.245220184326172, + "learning_rate": 4.2947892415420967e-07, + "loss": 0.1746, + "step": 34318 + }, + { + "epoch": 0.8684616747222714, + "grad_norm": 5.060595989227295, + "learning_rate": 4.2931612980291473e-07, + "loss": 0.2091, + "step": 34319 + }, + { + "epoch": 0.8684869802869651, + "grad_norm": 4.527845859527588, + "learning_rate": 4.2915336492710956e-07, + "loss": 0.1492, + "step": 34320 + }, + { + "epoch": 0.8685122858516587, + "grad_norm": 5.416604995727539, + "learning_rate": 4.2899062952784344e-07, + "loss": 0.1969, + "step": 34321 + }, + { + "epoch": 0.8685375914163525, + "grad_norm": 4.220667362213135, + "learning_rate": 4.2882792360616763e-07, + "loss": 0.1659, + "step": 34322 + }, + { + "epoch": 0.8685628969810462, + "grad_norm": 22.0429744720459, + "learning_rate": 4.2866524716312976e-07, + "loss": 0.3526, + "step": 34323 + }, + { + "epoch": 0.8685882025457398, + "grad_norm": 5.211034297943115, + "learning_rate": 4.2850260019978e-07, + "loss": 0.087, + "step": 34324 + }, + { + "epoch": 0.8686135081104335, + "grad_norm": 3.7282955646514893, + "learning_rate": 4.2833998271716524e-07, + "loss": 0.1215, + "step": 34325 + }, + { + "epoch": 0.8686388136751272, + "grad_norm": 3.0338852405548096, + "learning_rate": 4.2817739471633643e-07, + "loss": 0.1088, + "step": 34326 + }, + { + "epoch": 0.8686641192398208, + "grad_norm": 5.091892242431641, + "learning_rate": 4.280148361983411e-07, + "loss": 0.1726, + "step": 34327 + }, + { + "epoch": 0.8686894248045145, + "grad_norm": 5.550611972808838, + "learning_rate": 4.278523071642271e-07, + "loss": 0.1214, + "step": 34328 + }, + { + "epoch": 0.8687147303692082, + "grad_norm": 4.986627101898193, + "learning_rate": 4.276898076150443e-07, + "loss": 0.1949, + "step": 34329 + }, + { + "epoch": 0.8687400359339018, + "grad_norm": 5.838424205780029, + "learning_rate": 4.2752733755183793e-07, + "loss": 0.2126, + "step": 34330 + }, + { + "epoch": 0.8687653414985955, + "grad_norm": 3.4102158546447754, + "learning_rate": 4.273648969756583e-07, + "loss": 0.0932, + "step": 34331 + }, + { + "epoch": 0.8687906470632892, + "grad_norm": 3.4098222255706787, + "learning_rate": 4.272024858875512e-07, + "loss": 0.1096, + "step": 34332 + }, + { + "epoch": 0.868815952627983, + "grad_norm": 6.169798374176025, + "learning_rate": 4.270401042885658e-07, + "loss": 0.149, + "step": 34333 + }, + { + "epoch": 0.8688412581926765, + "grad_norm": 1.547078013420105, + "learning_rate": 4.268777521797468e-07, + "loss": 0.0714, + "step": 34334 + }, + { + "epoch": 0.8688665637573703, + "grad_norm": 9.166105270385742, + "learning_rate": 4.267154295621434e-07, + "loss": 0.1796, + "step": 34335 + }, + { + "epoch": 0.868891869322064, + "grad_norm": 3.7104058265686035, + "learning_rate": 4.265531364368003e-07, + "loss": 0.1638, + "step": 34336 + }, + { + "epoch": 0.8689171748867576, + "grad_norm": 15.136665344238281, + "learning_rate": 4.263908728047661e-07, + "loss": 0.2069, + "step": 34337 + }, + { + "epoch": 0.8689424804514513, + "grad_norm": 4.067274570465088, + "learning_rate": 4.262286386670872e-07, + "loss": 0.1419, + "step": 34338 + }, + { + "epoch": 0.868967786016145, + "grad_norm": 4.0490007400512695, + "learning_rate": 4.260664340248083e-07, + "loss": 0.1729, + "step": 34339 + }, + { + "epoch": 0.8689930915808386, + "grad_norm": 9.96597671508789, + "learning_rate": 4.2590425887897526e-07, + "loss": 0.2664, + "step": 34340 + }, + { + "epoch": 0.8690183971455323, + "grad_norm": 8.674325942993164, + "learning_rate": 4.257421132306361e-07, + "loss": 0.2039, + "step": 34341 + }, + { + "epoch": 0.869043702710226, + "grad_norm": 5.361672878265381, + "learning_rate": 4.2557999708083497e-07, + "loss": 0.1292, + "step": 34342 + }, + { + "epoch": 0.8690690082749196, + "grad_norm": 3.362264633178711, + "learning_rate": 4.254179104306172e-07, + "loss": 0.1318, + "step": 34343 + }, + { + "epoch": 0.8690943138396133, + "grad_norm": 4.373054027557373, + "learning_rate": 4.2525585328102804e-07, + "loss": 0.1308, + "step": 34344 + }, + { + "epoch": 0.869119619404307, + "grad_norm": 5.283135414123535, + "learning_rate": 4.250938256331133e-07, + "loss": 0.1509, + "step": 34345 + }, + { + "epoch": 0.8691449249690006, + "grad_norm": 7.020432949066162, + "learning_rate": 4.249318274879177e-07, + "loss": 0.154, + "step": 34346 + }, + { + "epoch": 0.8691702305336944, + "grad_norm": 3.5701169967651367, + "learning_rate": 4.24769858846486e-07, + "loss": 0.1781, + "step": 34347 + }, + { + "epoch": 0.8691955360983881, + "grad_norm": 4.077038764953613, + "learning_rate": 4.2460791970986127e-07, + "loss": 0.1083, + "step": 34348 + }, + { + "epoch": 0.8692208416630817, + "grad_norm": 3.119852066040039, + "learning_rate": 4.244460100790898e-07, + "loss": 0.1099, + "step": 34349 + }, + { + "epoch": 0.8692461472277754, + "grad_norm": 6.08248233795166, + "learning_rate": 4.242841299552153e-07, + "loss": 0.1332, + "step": 34350 + }, + { + "epoch": 0.8692714527924691, + "grad_norm": 2.4681596755981445, + "learning_rate": 4.2412227933928076e-07, + "loss": 0.0597, + "step": 34351 + }, + { + "epoch": 0.8692967583571627, + "grad_norm": 6.6441497802734375, + "learning_rate": 4.2396045823232977e-07, + "loss": 0.1463, + "step": 34352 + }, + { + "epoch": 0.8693220639218564, + "grad_norm": 4.360763072967529, + "learning_rate": 4.237986666354077e-07, + "loss": 0.1439, + "step": 34353 + }, + { + "epoch": 0.8693473694865501, + "grad_norm": 7.828336238861084, + "learning_rate": 4.236369045495564e-07, + "loss": 0.1609, + "step": 34354 + }, + { + "epoch": 0.8693726750512437, + "grad_norm": 5.192187786102295, + "learning_rate": 4.234751719758201e-07, + "loss": 0.2002, + "step": 34355 + }, + { + "epoch": 0.8693979806159374, + "grad_norm": 3.417985200881958, + "learning_rate": 4.2331346891524015e-07, + "loss": 0.1002, + "step": 34356 + }, + { + "epoch": 0.8694232861806311, + "grad_norm": 6.763514995574951, + "learning_rate": 4.231517953688602e-07, + "loss": 0.1629, + "step": 34357 + }, + { + "epoch": 0.8694485917453249, + "grad_norm": 11.235130310058594, + "learning_rate": 4.2299015133772323e-07, + "loss": 0.2092, + "step": 34358 + }, + { + "epoch": 0.8694738973100185, + "grad_norm": 4.530478477478027, + "learning_rate": 4.228285368228713e-07, + "loss": 0.1828, + "step": 34359 + }, + { + "epoch": 0.8694992028747122, + "grad_norm": 2.9313368797302246, + "learning_rate": 4.226669518253479e-07, + "loss": 0.1411, + "step": 34360 + }, + { + "epoch": 0.8695245084394059, + "grad_norm": 2.429931402206421, + "learning_rate": 4.2250539634619223e-07, + "loss": 0.102, + "step": 34361 + }, + { + "epoch": 0.8695498140040995, + "grad_norm": 5.14362096786499, + "learning_rate": 4.2234387038644853e-07, + "loss": 0.1227, + "step": 34362 + }, + { + "epoch": 0.8695751195687932, + "grad_norm": 6.015547752380371, + "learning_rate": 4.22182373947157e-07, + "loss": 0.1164, + "step": 34363 + }, + { + "epoch": 0.8696004251334869, + "grad_norm": 2.3069863319396973, + "learning_rate": 4.2202090702936184e-07, + "loss": 0.0717, + "step": 34364 + }, + { + "epoch": 0.8696257306981805, + "grad_norm": 9.141419410705566, + "learning_rate": 4.2185946963409995e-07, + "loss": 0.2646, + "step": 34365 + }, + { + "epoch": 0.8696510362628742, + "grad_norm": 4.647495269775391, + "learning_rate": 4.2169806176241555e-07, + "loss": 0.1875, + "step": 34366 + }, + { + "epoch": 0.8696763418275679, + "grad_norm": 3.053211212158203, + "learning_rate": 4.2153668341534836e-07, + "loss": 0.1097, + "step": 34367 + }, + { + "epoch": 0.8697016473922615, + "grad_norm": 4.00811243057251, + "learning_rate": 4.213753345939403e-07, + "loss": 0.2108, + "step": 34368 + }, + { + "epoch": 0.8697269529569552, + "grad_norm": 4.475110054016113, + "learning_rate": 4.2121401529923054e-07, + "loss": 0.1109, + "step": 34369 + }, + { + "epoch": 0.869752258521649, + "grad_norm": 9.226137161254883, + "learning_rate": 4.210527255322605e-07, + "loss": 0.1953, + "step": 34370 + }, + { + "epoch": 0.8697775640863425, + "grad_norm": 7.820468425750732, + "learning_rate": 4.208914652940682e-07, + "loss": 0.1225, + "step": 34371 + }, + { + "epoch": 0.8698028696510363, + "grad_norm": 3.586238384246826, + "learning_rate": 4.2073023458569616e-07, + "loss": 0.1268, + "step": 34372 + }, + { + "epoch": 0.86982817521573, + "grad_norm": 4.267014503479004, + "learning_rate": 4.2056903340818354e-07, + "loss": 0.1857, + "step": 34373 + }, + { + "epoch": 0.8698534807804236, + "grad_norm": 5.092010974884033, + "learning_rate": 4.204078617625684e-07, + "loss": 0.1757, + "step": 34374 + }, + { + "epoch": 0.8698787863451173, + "grad_norm": 5.192823886871338, + "learning_rate": 4.2024671964989104e-07, + "loss": 0.1392, + "step": 34375 + }, + { + "epoch": 0.869904091909811, + "grad_norm": 7.036532402038574, + "learning_rate": 4.2008560707119116e-07, + "loss": 0.1408, + "step": 34376 + }, + { + "epoch": 0.8699293974745046, + "grad_norm": 4.407326698303223, + "learning_rate": 4.1992452402750736e-07, + "loss": 0.1537, + "step": 34377 + }, + { + "epoch": 0.8699547030391983, + "grad_norm": 3.1294288635253906, + "learning_rate": 4.197634705198783e-07, + "loss": 0.1256, + "step": 34378 + }, + { + "epoch": 0.869980008603892, + "grad_norm": 5.170928955078125, + "learning_rate": 4.19602446549342e-07, + "loss": 0.1604, + "step": 34379 + }, + { + "epoch": 0.8700053141685856, + "grad_norm": 6.443737983703613, + "learning_rate": 4.194414521169382e-07, + "loss": 0.1943, + "step": 34380 + }, + { + "epoch": 0.8700306197332793, + "grad_norm": 5.081818580627441, + "learning_rate": 4.192804872237044e-07, + "loss": 0.1267, + "step": 34381 + }, + { + "epoch": 0.870055925297973, + "grad_norm": 4.767275810241699, + "learning_rate": 4.1911955187067864e-07, + "loss": 0.1118, + "step": 34382 + }, + { + "epoch": 0.8700812308626668, + "grad_norm": 7.14106559753418, + "learning_rate": 4.18958646058899e-07, + "loss": 0.2069, + "step": 34383 + }, + { + "epoch": 0.8701065364273604, + "grad_norm": 6.501396179199219, + "learning_rate": 4.187977697894019e-07, + "loss": 0.1408, + "step": 34384 + }, + { + "epoch": 0.8701318419920541, + "grad_norm": 2.8226823806762695, + "learning_rate": 4.1863692306322646e-07, + "loss": 0.0955, + "step": 34385 + }, + { + "epoch": 0.8701571475567478, + "grad_norm": 9.201742172241211, + "learning_rate": 4.1847610588140965e-07, + "loss": 0.2277, + "step": 34386 + }, + { + "epoch": 0.8701824531214414, + "grad_norm": 10.716880798339844, + "learning_rate": 4.1831531824498783e-07, + "loss": 0.2421, + "step": 34387 + }, + { + "epoch": 0.8702077586861351, + "grad_norm": 6.870457649230957, + "learning_rate": 4.1815456015499744e-07, + "loss": 0.1387, + "step": 34388 + }, + { + "epoch": 0.8702330642508288, + "grad_norm": 3.6260673999786377, + "learning_rate": 4.179938316124771e-07, + "loss": 0.17, + "step": 34389 + }, + { + "epoch": 0.8702583698155224, + "grad_norm": 4.5757622718811035, + "learning_rate": 4.178331326184615e-07, + "loss": 0.122, + "step": 34390 + }, + { + "epoch": 0.8702836753802161, + "grad_norm": 2.6402347087860107, + "learning_rate": 4.1767246317398924e-07, + "loss": 0.1153, + "step": 34391 + }, + { + "epoch": 0.8703089809449098, + "grad_norm": 4.159045219421387, + "learning_rate": 4.1751182328009286e-07, + "loss": 0.1852, + "step": 34392 + }, + { + "epoch": 0.8703342865096034, + "grad_norm": 2.6557395458221436, + "learning_rate": 4.1735121293781155e-07, + "loss": 0.1267, + "step": 34393 + }, + { + "epoch": 0.8703595920742971, + "grad_norm": 2.775580883026123, + "learning_rate": 4.171906321481789e-07, + "loss": 0.0959, + "step": 34394 + }, + { + "epoch": 0.8703848976389909, + "grad_norm": 4.980576038360596, + "learning_rate": 4.17030080912233e-07, + "loss": 0.1339, + "step": 34395 + }, + { + "epoch": 0.8704102032036845, + "grad_norm": 13.126380920410156, + "learning_rate": 4.1686955923100625e-07, + "loss": 0.2383, + "step": 34396 + }, + { + "epoch": 0.8704355087683782, + "grad_norm": 4.860947608947754, + "learning_rate": 4.1670906710553573e-07, + "loss": 0.1046, + "step": 34397 + }, + { + "epoch": 0.8704608143330719, + "grad_norm": 3.555853843688965, + "learning_rate": 4.1654860453685496e-07, + "loss": 0.1394, + "step": 34398 + }, + { + "epoch": 0.8704861198977655, + "grad_norm": 3.16735577583313, + "learning_rate": 4.1638817152600044e-07, + "loss": 0.1635, + "step": 34399 + }, + { + "epoch": 0.8705114254624592, + "grad_norm": 4.0026750564575195, + "learning_rate": 4.1622776807400624e-07, + "loss": 0.1476, + "step": 34400 + }, + { + "epoch": 0.8705367310271529, + "grad_norm": 5.6642632484436035, + "learning_rate": 4.1606739418190654e-07, + "loss": 0.1323, + "step": 34401 + }, + { + "epoch": 0.8705620365918465, + "grad_norm": 10.930658340454102, + "learning_rate": 4.159070498507345e-07, + "loss": 0.1439, + "step": 34402 + }, + { + "epoch": 0.8705873421565402, + "grad_norm": 4.638245582580566, + "learning_rate": 4.157467350815259e-07, + "loss": 0.1253, + "step": 34403 + }, + { + "epoch": 0.8706126477212339, + "grad_norm": 3.654489755630493, + "learning_rate": 4.155864498753143e-07, + "loss": 0.1247, + "step": 34404 + }, + { + "epoch": 0.8706379532859275, + "grad_norm": 4.88487434387207, + "learning_rate": 4.154261942331328e-07, + "loss": 0.1912, + "step": 34405 + }, + { + "epoch": 0.8706632588506212, + "grad_norm": 3.66758131980896, + "learning_rate": 4.1526596815601396e-07, + "loss": 0.0941, + "step": 34406 + }, + { + "epoch": 0.870688564415315, + "grad_norm": 4.784699440002441, + "learning_rate": 4.15105771644993e-07, + "loss": 0.1422, + "step": 34407 + }, + { + "epoch": 0.8707138699800087, + "grad_norm": 3.8089394569396973, + "learning_rate": 4.149456047011019e-07, + "loss": 0.1278, + "step": 34408 + }, + { + "epoch": 0.8707391755447023, + "grad_norm": 4.256869316101074, + "learning_rate": 4.147854673253737e-07, + "loss": 0.1325, + "step": 34409 + }, + { + "epoch": 0.870764481109396, + "grad_norm": 4.96550989151001, + "learning_rate": 4.146253595188404e-07, + "loss": 0.0697, + "step": 34410 + }, + { + "epoch": 0.8707897866740897, + "grad_norm": 6.63831090927124, + "learning_rate": 4.1446528128253616e-07, + "loss": 0.1519, + "step": 34411 + }, + { + "epoch": 0.8708150922387833, + "grad_norm": 5.5710883140563965, + "learning_rate": 4.1430523261749234e-07, + "loss": 0.1784, + "step": 34412 + }, + { + "epoch": 0.870840397803477, + "grad_norm": 4.061346054077148, + "learning_rate": 4.141452135247409e-07, + "loss": 0.1692, + "step": 34413 + }, + { + "epoch": 0.8708657033681707, + "grad_norm": 5.275717258453369, + "learning_rate": 4.139852240053144e-07, + "loss": 0.143, + "step": 34414 + }, + { + "epoch": 0.8708910089328643, + "grad_norm": 3.4945595264434814, + "learning_rate": 4.138252640602425e-07, + "loss": 0.1271, + "step": 34415 + }, + { + "epoch": 0.870916314497558, + "grad_norm": 6.04438591003418, + "learning_rate": 4.1366533369055995e-07, + "loss": 0.1818, + "step": 34416 + }, + { + "epoch": 0.8709416200622517, + "grad_norm": 3.9139606952667236, + "learning_rate": 4.1350543289729594e-07, + "loss": 0.122, + "step": 34417 + }, + { + "epoch": 0.8709669256269453, + "grad_norm": 4.0409111976623535, + "learning_rate": 4.1334556168148245e-07, + "loss": 0.0728, + "step": 34418 + }, + { + "epoch": 0.870992231191639, + "grad_norm": 5.047842979431152, + "learning_rate": 4.131857200441497e-07, + "loss": 0.1614, + "step": 34419 + }, + { + "epoch": 0.8710175367563328, + "grad_norm": 6.606374740600586, + "learning_rate": 4.130259079863297e-07, + "loss": 0.1756, + "step": 34420 + }, + { + "epoch": 0.8710428423210264, + "grad_norm": 18.743310928344727, + "learning_rate": 4.128661255090516e-07, + "loss": 0.2023, + "step": 34421 + }, + { + "epoch": 0.8710681478857201, + "grad_norm": 9.598726272583008, + "learning_rate": 4.127063726133479e-07, + "loss": 0.184, + "step": 34422 + }, + { + "epoch": 0.8710934534504138, + "grad_norm": 2.7745282649993896, + "learning_rate": 4.125466493002461e-07, + "loss": 0.0723, + "step": 34423 + }, + { + "epoch": 0.8711187590151074, + "grad_norm": 6.007101535797119, + "learning_rate": 4.123869555707788e-07, + "loss": 0.1771, + "step": 34424 + }, + { + "epoch": 0.8711440645798011, + "grad_norm": 25.0976505279541, + "learning_rate": 4.122272914259734e-07, + "loss": 0.2312, + "step": 34425 + }, + { + "epoch": 0.8711693701444948, + "grad_norm": 4.945123195648193, + "learning_rate": 4.1206765686686244e-07, + "loss": 0.1384, + "step": 34426 + }, + { + "epoch": 0.8711946757091884, + "grad_norm": 8.460718154907227, + "learning_rate": 4.119080518944718e-07, + "loss": 0.2251, + "step": 34427 + }, + { + "epoch": 0.8712199812738821, + "grad_norm": 4.092447280883789, + "learning_rate": 4.1174847650983394e-07, + "loss": 0.1304, + "step": 34428 + }, + { + "epoch": 0.8712452868385758, + "grad_norm": 3.8038320541381836, + "learning_rate": 4.1158893071397585e-07, + "loss": 0.1533, + "step": 34429 + }, + { + "epoch": 0.8712705924032694, + "grad_norm": 3.42946457862854, + "learning_rate": 4.1142941450792894e-07, + "loss": 0.1595, + "step": 34430 + }, + { + "epoch": 0.8712958979679631, + "grad_norm": 2.553515911102295, + "learning_rate": 4.1126992789271847e-07, + "loss": 0.0734, + "step": 34431 + }, + { + "epoch": 0.8713212035326569, + "grad_norm": 4.013779640197754, + "learning_rate": 4.111104708693753e-07, + "loss": 0.1848, + "step": 34432 + }, + { + "epoch": 0.8713465090973505, + "grad_norm": 3.3287694454193115, + "learning_rate": 4.109510434389263e-07, + "loss": 0.1421, + "step": 34433 + }, + { + "epoch": 0.8713718146620442, + "grad_norm": 4.296736240386963, + "learning_rate": 4.107916456024019e-07, + "loss": 0.1142, + "step": 34434 + }, + { + "epoch": 0.8713971202267379, + "grad_norm": 5.663447380065918, + "learning_rate": 4.106322773608279e-07, + "loss": 0.1104, + "step": 34435 + }, + { + "epoch": 0.8714224257914316, + "grad_norm": 7.506625175476074, + "learning_rate": 4.104729387152329e-07, + "loss": 0.1561, + "step": 34436 + }, + { + "epoch": 0.8714477313561252, + "grad_norm": 3.063288450241089, + "learning_rate": 4.103136296666427e-07, + "loss": 0.1482, + "step": 34437 + }, + { + "epoch": 0.8714730369208189, + "grad_norm": 5.5618672370910645, + "learning_rate": 4.1015435021608764e-07, + "loss": 0.1264, + "step": 34438 + }, + { + "epoch": 0.8714983424855126, + "grad_norm": 4.173346519470215, + "learning_rate": 4.0999510036459353e-07, + "loss": 0.1089, + "step": 34439 + }, + { + "epoch": 0.8715236480502062, + "grad_norm": 3.1463239192962646, + "learning_rate": 4.0983588011318685e-07, + "loss": 0.1223, + "step": 34440 + }, + { + "epoch": 0.8715489536148999, + "grad_norm": 2.323451519012451, + "learning_rate": 4.096766894628945e-07, + "loss": 0.132, + "step": 34441 + }, + { + "epoch": 0.8715742591795936, + "grad_norm": 6.609075546264648, + "learning_rate": 4.095175284147429e-07, + "loss": 0.1944, + "step": 34442 + }, + { + "epoch": 0.8715995647442872, + "grad_norm": 5.803037643432617, + "learning_rate": 4.0935839696975956e-07, + "loss": 0.1397, + "step": 34443 + }, + { + "epoch": 0.871624870308981, + "grad_norm": 4.067050457000732, + "learning_rate": 4.0919929512897026e-07, + "loss": 0.1352, + "step": 34444 + }, + { + "epoch": 0.8716501758736747, + "grad_norm": 4.1730499267578125, + "learning_rate": 4.090402228934004e-07, + "loss": 0.1329, + "step": 34445 + }, + { + "epoch": 0.8716754814383683, + "grad_norm": 13.38473892211914, + "learning_rate": 4.088811802640752e-07, + "loss": 0.1491, + "step": 34446 + }, + { + "epoch": 0.871700787003062, + "grad_norm": 6.913060188293457, + "learning_rate": 4.087221672420227e-07, + "loss": 0.0826, + "step": 34447 + }, + { + "epoch": 0.8717260925677557, + "grad_norm": 7.500682830810547, + "learning_rate": 4.0856318382826667e-07, + "loss": 0.2139, + "step": 34448 + }, + { + "epoch": 0.8717513981324493, + "grad_norm": 8.947760581970215, + "learning_rate": 4.0840423002383225e-07, + "loss": 0.2067, + "step": 34449 + }, + { + "epoch": 0.871776703697143, + "grad_norm": 4.437291622161865, + "learning_rate": 4.082453058297442e-07, + "loss": 0.1151, + "step": 34450 + }, + { + "epoch": 0.8718020092618367, + "grad_norm": 3.3633310794830322, + "learning_rate": 4.0808641124702895e-07, + "loss": 0.1184, + "step": 34451 + }, + { + "epoch": 0.8718273148265303, + "grad_norm": 3.3979296684265137, + "learning_rate": 4.0792754627671017e-07, + "loss": 0.1215, + "step": 34452 + }, + { + "epoch": 0.871852620391224, + "grad_norm": 3.0373668670654297, + "learning_rate": 4.077687109198125e-07, + "loss": 0.0798, + "step": 34453 + }, + { + "epoch": 0.8718779259559177, + "grad_norm": 3.0953149795532227, + "learning_rate": 4.0760990517735965e-07, + "loss": 0.1017, + "step": 34454 + }, + { + "epoch": 0.8719032315206113, + "grad_norm": 4.757818698883057, + "learning_rate": 4.074511290503763e-07, + "loss": 0.1198, + "step": 34455 + }, + { + "epoch": 0.871928537085305, + "grad_norm": 6.926548004150391, + "learning_rate": 4.072923825398861e-07, + "loss": 0.1835, + "step": 34456 + }, + { + "epoch": 0.8719538426499988, + "grad_norm": 12.137382507324219, + "learning_rate": 4.0713366564691493e-07, + "loss": 0.2359, + "step": 34457 + }, + { + "epoch": 0.8719791482146924, + "grad_norm": 7.66885232925415, + "learning_rate": 4.069749783724825e-07, + "loss": 0.2062, + "step": 34458 + }, + { + "epoch": 0.8720044537793861, + "grad_norm": 4.1688337326049805, + "learning_rate": 4.0681632071761455e-07, + "loss": 0.1048, + "step": 34459 + }, + { + "epoch": 0.8720297593440798, + "grad_norm": 10.567078590393066, + "learning_rate": 4.066576926833332e-07, + "loss": 0.2659, + "step": 34460 + }, + { + "epoch": 0.8720550649087735, + "grad_norm": 3.7905113697052, + "learning_rate": 4.064990942706637e-07, + "loss": 0.1812, + "step": 34461 + }, + { + "epoch": 0.8720803704734671, + "grad_norm": 3.530524253845215, + "learning_rate": 4.063405254806252e-07, + "loss": 0.1704, + "step": 34462 + }, + { + "epoch": 0.8721056760381608, + "grad_norm": 5.075612545013428, + "learning_rate": 4.0618198631424356e-07, + "loss": 0.1164, + "step": 34463 + }, + { + "epoch": 0.8721309816028545, + "grad_norm": 10.879084587097168, + "learning_rate": 4.06023476772538e-07, + "loss": 0.2804, + "step": 34464 + }, + { + "epoch": 0.8721562871675481, + "grad_norm": 7.19422721862793, + "learning_rate": 4.0586499685653425e-07, + "loss": 0.2304, + "step": 34465 + }, + { + "epoch": 0.8721815927322418, + "grad_norm": 10.038615226745605, + "learning_rate": 4.0570654656725216e-07, + "loss": 0.1971, + "step": 34466 + }, + { + "epoch": 0.8722068982969355, + "grad_norm": 8.089150428771973, + "learning_rate": 4.055481259057137e-07, + "loss": 0.2307, + "step": 34467 + }, + { + "epoch": 0.8722322038616291, + "grad_norm": 6.007005214691162, + "learning_rate": 4.053897348729413e-07, + "loss": 0.1443, + "step": 34468 + }, + { + "epoch": 0.8722575094263229, + "grad_norm": 9.771880149841309, + "learning_rate": 4.0523137346995425e-07, + "loss": 0.1465, + "step": 34469 + }, + { + "epoch": 0.8722828149910166, + "grad_norm": 6.224104881286621, + "learning_rate": 4.050730416977766e-07, + "loss": 0.1311, + "step": 34470 + }, + { + "epoch": 0.8723081205557102, + "grad_norm": 3.7103195190429688, + "learning_rate": 4.0491473955742823e-07, + "loss": 0.1243, + "step": 34471 + }, + { + "epoch": 0.8723334261204039, + "grad_norm": 3.277088165283203, + "learning_rate": 4.0475646704993045e-07, + "loss": 0.1027, + "step": 34472 + }, + { + "epoch": 0.8723587316850976, + "grad_norm": 3.5841448307037354, + "learning_rate": 4.045982241763019e-07, + "loss": 0.0949, + "step": 34473 + }, + { + "epoch": 0.8723840372497912, + "grad_norm": 5.94383430480957, + "learning_rate": 4.044400109375657e-07, + "loss": 0.1482, + "step": 34474 + }, + { + "epoch": 0.8724093428144849, + "grad_norm": 6.413949012756348, + "learning_rate": 4.04281827334741e-07, + "loss": 0.1305, + "step": 34475 + }, + { + "epoch": 0.8724346483791786, + "grad_norm": 3.624889373779297, + "learning_rate": 4.041236733688475e-07, + "loss": 0.0982, + "step": 34476 + }, + { + "epoch": 0.8724599539438722, + "grad_norm": 4.8069562911987305, + "learning_rate": 4.039655490409056e-07, + "loss": 0.1131, + "step": 34477 + }, + { + "epoch": 0.8724852595085659, + "grad_norm": 2.7286264896392822, + "learning_rate": 4.0380745435193493e-07, + "loss": 0.1132, + "step": 34478 + }, + { + "epoch": 0.8725105650732596, + "grad_norm": 4.414830684661865, + "learning_rate": 4.0364938930295526e-07, + "loss": 0.1323, + "step": 34479 + }, + { + "epoch": 0.8725358706379532, + "grad_norm": 12.53730297088623, + "learning_rate": 4.034913538949858e-07, + "loss": 0.1529, + "step": 34480 + }, + { + "epoch": 0.872561176202647, + "grad_norm": 11.393115997314453, + "learning_rate": 4.033333481290447e-07, + "loss": 0.2077, + "step": 34481 + }, + { + "epoch": 0.8725864817673407, + "grad_norm": 3.788478136062622, + "learning_rate": 4.031753720061526e-07, + "loss": 0.0911, + "step": 34482 + }, + { + "epoch": 0.8726117873320343, + "grad_norm": 4.144033432006836, + "learning_rate": 4.030174255273278e-07, + "loss": 0.1479, + "step": 34483 + }, + { + "epoch": 0.872637092896728, + "grad_norm": 10.5440673828125, + "learning_rate": 4.0285950869358823e-07, + "loss": 0.1962, + "step": 34484 + }, + { + "epoch": 0.8726623984614217, + "grad_norm": 2.939577102661133, + "learning_rate": 4.027016215059515e-07, + "loss": 0.1115, + "step": 34485 + }, + { + "epoch": 0.8726877040261154, + "grad_norm": 13.746190071105957, + "learning_rate": 4.0254376396543835e-07, + "loss": 0.2026, + "step": 34486 + }, + { + "epoch": 0.872713009590809, + "grad_norm": 2.82527756690979, + "learning_rate": 4.0238593607306365e-07, + "loss": 0.1423, + "step": 34487 + }, + { + "epoch": 0.8727383151555027, + "grad_norm": 3.930248260498047, + "learning_rate": 4.022281378298493e-07, + "loss": 0.1243, + "step": 34488 + }, + { + "epoch": 0.8727636207201964, + "grad_norm": 4.825323581695557, + "learning_rate": 4.020703692368083e-07, + "loss": 0.1993, + "step": 34489 + }, + { + "epoch": 0.87278892628489, + "grad_norm": 4.195510387420654, + "learning_rate": 4.019126302949611e-07, + "loss": 0.119, + "step": 34490 + }, + { + "epoch": 0.8728142318495837, + "grad_norm": 5.799691200256348, + "learning_rate": 4.017549210053229e-07, + "loss": 0.1544, + "step": 34491 + }, + { + "epoch": 0.8728395374142774, + "grad_norm": 9.329157829284668, + "learning_rate": 4.0159724136891397e-07, + "loss": 0.2144, + "step": 34492 + }, + { + "epoch": 0.872864842978971, + "grad_norm": 6.745730876922607, + "learning_rate": 4.014395913867469e-07, + "loss": 0.1191, + "step": 34493 + }, + { + "epoch": 0.8728901485436648, + "grad_norm": 2.7795982360839844, + "learning_rate": 4.0128197105984204e-07, + "loss": 0.1203, + "step": 34494 + }, + { + "epoch": 0.8729154541083585, + "grad_norm": 3.680365800857544, + "learning_rate": 4.0112438038921284e-07, + "loss": 0.0492, + "step": 34495 + }, + { + "epoch": 0.8729407596730521, + "grad_norm": 10.37890911102295, + "learning_rate": 4.009668193758781e-07, + "loss": 0.2088, + "step": 34496 + }, + { + "epoch": 0.8729660652377458, + "grad_norm": 12.504958152770996, + "learning_rate": 4.0080928802085306e-07, + "loss": 0.1343, + "step": 34497 + }, + { + "epoch": 0.8729913708024395, + "grad_norm": 3.4601171016693115, + "learning_rate": 4.0065178632515355e-07, + "loss": 0.1749, + "step": 34498 + }, + { + "epoch": 0.8730166763671331, + "grad_norm": 4.722984790802002, + "learning_rate": 4.0049431428979436e-07, + "loss": 0.1348, + "step": 34499 + }, + { + "epoch": 0.8730419819318268, + "grad_norm": 4.35054874420166, + "learning_rate": 4.0033687191579183e-07, + "loss": 0.1418, + "step": 34500 + }, + { + "epoch": 0.8730672874965205, + "grad_norm": 8.634561538696289, + "learning_rate": 4.0017945920416133e-07, + "loss": 0.1869, + "step": 34501 + }, + { + "epoch": 0.8730925930612141, + "grad_norm": 3.944474935531616, + "learning_rate": 4.0002207615591814e-07, + "loss": 0.1191, + "step": 34502 + }, + { + "epoch": 0.8731178986259078, + "grad_norm": 4.734923839569092, + "learning_rate": 3.99864722772077e-07, + "loss": 0.1885, + "step": 34503 + }, + { + "epoch": 0.8731432041906015, + "grad_norm": 5.401975631713867, + "learning_rate": 3.997073990536515e-07, + "loss": 0.1184, + "step": 34504 + }, + { + "epoch": 0.8731685097552951, + "grad_norm": 4.48639440536499, + "learning_rate": 3.9955010500165924e-07, + "loss": 0.1896, + "step": 34505 + }, + { + "epoch": 0.8731938153199889, + "grad_norm": 8.737348556518555, + "learning_rate": 3.9939284061711105e-07, + "loss": 0.3385, + "step": 34506 + }, + { + "epoch": 0.8732191208846826, + "grad_norm": 4.874267578125, + "learning_rate": 3.9923560590102326e-07, + "loss": 0.1792, + "step": 34507 + }, + { + "epoch": 0.8732444264493762, + "grad_norm": 4.5641961097717285, + "learning_rate": 3.990784008544085e-07, + "loss": 0.1377, + "step": 34508 + }, + { + "epoch": 0.8732697320140699, + "grad_norm": 7.073354244232178, + "learning_rate": 3.9892122547828205e-07, + "loss": 0.1524, + "step": 34509 + }, + { + "epoch": 0.8732950375787636, + "grad_norm": 8.838704109191895, + "learning_rate": 3.9876407977365637e-07, + "loss": 0.1428, + "step": 34510 + }, + { + "epoch": 0.8733203431434573, + "grad_norm": 3.0932695865631104, + "learning_rate": 3.9860696374154573e-07, + "loss": 0.1246, + "step": 34511 + }, + { + "epoch": 0.8733456487081509, + "grad_norm": 3.774712562561035, + "learning_rate": 3.98449877382962e-07, + "loss": 0.1024, + "step": 34512 + }, + { + "epoch": 0.8733709542728446, + "grad_norm": 3.565981388092041, + "learning_rate": 3.982928206989195e-07, + "loss": 0.1166, + "step": 34513 + }, + { + "epoch": 0.8733962598375383, + "grad_norm": 6.208371639251709, + "learning_rate": 3.981357936904312e-07, + "loss": 0.2375, + "step": 34514 + }, + { + "epoch": 0.8734215654022319, + "grad_norm": 5.057070732116699, + "learning_rate": 3.9797879635850854e-07, + "loss": 0.2439, + "step": 34515 + }, + { + "epoch": 0.8734468709669256, + "grad_norm": 3.572362184524536, + "learning_rate": 3.9782182870416407e-07, + "loss": 0.169, + "step": 34516 + }, + { + "epoch": 0.8734721765316193, + "grad_norm": 8.221643447875977, + "learning_rate": 3.976648907284114e-07, + "loss": 0.1459, + "step": 34517 + }, + { + "epoch": 0.873497482096313, + "grad_norm": 4.289177894592285, + "learning_rate": 3.975079824322609e-07, + "loss": 0.1065, + "step": 34518 + }, + { + "epoch": 0.8735227876610067, + "grad_norm": 4.520820140838623, + "learning_rate": 3.9735110381672714e-07, + "loss": 0.1346, + "step": 34519 + }, + { + "epoch": 0.8735480932257004, + "grad_norm": 7.2115983963012695, + "learning_rate": 3.971942548828178e-07, + "loss": 0.1734, + "step": 34520 + }, + { + "epoch": 0.873573398790394, + "grad_norm": 5.4441962242126465, + "learning_rate": 3.9703743563154816e-07, + "loss": 0.1065, + "step": 34521 + }, + { + "epoch": 0.8735987043550877, + "grad_norm": 6.0527424812316895, + "learning_rate": 3.9688064606392627e-07, + "loss": 0.1047, + "step": 34522 + }, + { + "epoch": 0.8736240099197814, + "grad_norm": 3.5321707725524902, + "learning_rate": 3.9672388618096693e-07, + "loss": 0.1014, + "step": 34523 + }, + { + "epoch": 0.873649315484475, + "grad_norm": 5.603646755218506, + "learning_rate": 3.96567155983677e-07, + "loss": 0.1603, + "step": 34524 + }, + { + "epoch": 0.8736746210491687, + "grad_norm": 1.5253229141235352, + "learning_rate": 3.9641045547307023e-07, + "loss": 0.0506, + "step": 34525 + }, + { + "epoch": 0.8736999266138624, + "grad_norm": 5.237452983856201, + "learning_rate": 3.962537846501563e-07, + "loss": 0.1308, + "step": 34526 + }, + { + "epoch": 0.873725232178556, + "grad_norm": 6.3719563484191895, + "learning_rate": 3.960971435159439e-07, + "loss": 0.108, + "step": 34527 + }, + { + "epoch": 0.8737505377432497, + "grad_norm": 4.239894866943359, + "learning_rate": 3.95940532071446e-07, + "loss": 0.2061, + "step": 34528 + }, + { + "epoch": 0.8737758433079434, + "grad_norm": 6.9104108810424805, + "learning_rate": 3.957839503176708e-07, + "loss": 0.196, + "step": 34529 + }, + { + "epoch": 0.873801148872637, + "grad_norm": 3.4866881370544434, + "learning_rate": 3.9562739825562913e-07, + "loss": 0.1411, + "step": 34530 + }, + { + "epoch": 0.8738264544373308, + "grad_norm": 4.577617168426514, + "learning_rate": 3.954708758863285e-07, + "loss": 0.2255, + "step": 34531 + }, + { + "epoch": 0.8738517600020245, + "grad_norm": 6.325052738189697, + "learning_rate": 3.953143832107803e-07, + "loss": 0.1244, + "step": 34532 + }, + { + "epoch": 0.8738770655667181, + "grad_norm": 6.39427375793457, + "learning_rate": 3.9515792022999377e-07, + "loss": 0.1848, + "step": 34533 + }, + { + "epoch": 0.8739023711314118, + "grad_norm": 6.728989124298096, + "learning_rate": 3.9500148694497645e-07, + "loss": 0.194, + "step": 34534 + }, + { + "epoch": 0.8739276766961055, + "grad_norm": 7.480659484863281, + "learning_rate": 3.94845083356738e-07, + "loss": 0.2162, + "step": 34535 + }, + { + "epoch": 0.8739529822607992, + "grad_norm": 2.4801061153411865, + "learning_rate": 3.946887094662882e-07, + "loss": 0.0831, + "step": 34536 + }, + { + "epoch": 0.8739782878254928, + "grad_norm": 3.835143566131592, + "learning_rate": 3.9453236527463246e-07, + "loss": 0.0933, + "step": 34537 + }, + { + "epoch": 0.8740035933901865, + "grad_norm": 4.664552211761475, + "learning_rate": 3.943760507827815e-07, + "loss": 0.153, + "step": 34538 + }, + { + "epoch": 0.8740288989548802, + "grad_norm": 5.824062824249268, + "learning_rate": 3.9421976599174237e-07, + "loss": 0.1369, + "step": 34539 + }, + { + "epoch": 0.8740542045195738, + "grad_norm": 18.80248260498047, + "learning_rate": 3.940635109025237e-07, + "loss": 0.2332, + "step": 34540 + }, + { + "epoch": 0.8740795100842675, + "grad_norm": 4.755673408508301, + "learning_rate": 3.9390728551613297e-07, + "loss": 0.1933, + "step": 34541 + }, + { + "epoch": 0.8741048156489613, + "grad_norm": 3.7240090370178223, + "learning_rate": 3.937510898335772e-07, + "loss": 0.1035, + "step": 34542 + }, + { + "epoch": 0.8741301212136549, + "grad_norm": 4.630345821380615, + "learning_rate": 3.935949238558628e-07, + "loss": 0.1561, + "step": 34543 + }, + { + "epoch": 0.8741554267783486, + "grad_norm": 17.781232833862305, + "learning_rate": 3.93438787583999e-07, + "loss": 0.2441, + "step": 34544 + }, + { + "epoch": 0.8741807323430423, + "grad_norm": 3.6124086380004883, + "learning_rate": 3.9328268101899156e-07, + "loss": 0.1643, + "step": 34545 + }, + { + "epoch": 0.8742060379077359, + "grad_norm": 8.14188289642334, + "learning_rate": 3.93126604161847e-07, + "loss": 0.167, + "step": 34546 + }, + { + "epoch": 0.8742313434724296, + "grad_norm": 8.1126070022583, + "learning_rate": 3.929705570135711e-07, + "loss": 0.187, + "step": 34547 + }, + { + "epoch": 0.8742566490371233, + "grad_norm": 3.087010145187378, + "learning_rate": 3.9281453957517204e-07, + "loss": 0.1091, + "step": 34548 + }, + { + "epoch": 0.8742819546018169, + "grad_norm": 5.067557334899902, + "learning_rate": 3.9265855184765447e-07, + "loss": 0.0957, + "step": 34549 + }, + { + "epoch": 0.8743072601665106, + "grad_norm": 5.863192558288574, + "learning_rate": 3.9250259383202647e-07, + "loss": 0.1842, + "step": 34550 + }, + { + "epoch": 0.8743325657312043, + "grad_norm": 4.015271186828613, + "learning_rate": 3.9234666552929015e-07, + "loss": 0.156, + "step": 34551 + }, + { + "epoch": 0.8743578712958979, + "grad_norm": 13.91739273071289, + "learning_rate": 3.92190766940454e-07, + "loss": 0.3041, + "step": 34552 + }, + { + "epoch": 0.8743831768605916, + "grad_norm": 7.711175441741943, + "learning_rate": 3.920348980665228e-07, + "loss": 0.1445, + "step": 34553 + }, + { + "epoch": 0.8744084824252853, + "grad_norm": 6.349114418029785, + "learning_rate": 3.918790589085014e-07, + "loss": 0.1652, + "step": 34554 + }, + { + "epoch": 0.874433787989979, + "grad_norm": 3.492431879043579, + "learning_rate": 3.9172324946739336e-07, + "loss": 0.1319, + "step": 34555 + }, + { + "epoch": 0.8744590935546727, + "grad_norm": 13.440603256225586, + "learning_rate": 3.9156746974420625e-07, + "loss": 0.159, + "step": 34556 + }, + { + "epoch": 0.8744843991193664, + "grad_norm": 4.221472263336182, + "learning_rate": 3.9141171973994307e-07, + "loss": 0.1624, + "step": 34557 + }, + { + "epoch": 0.87450970468406, + "grad_norm": 4.508223056793213, + "learning_rate": 3.9125599945560866e-07, + "loss": 0.1705, + "step": 34558 + }, + { + "epoch": 0.8745350102487537, + "grad_norm": 8.491782188415527, + "learning_rate": 3.9110030889220607e-07, + "loss": 0.1103, + "step": 34559 + }, + { + "epoch": 0.8745603158134474, + "grad_norm": 7.034165859222412, + "learning_rate": 3.909446480507412e-07, + "loss": 0.2498, + "step": 34560 + }, + { + "epoch": 0.874585621378141, + "grad_norm": 3.0019848346710205, + "learning_rate": 3.9078901693221704e-07, + "loss": 0.07, + "step": 34561 + }, + { + "epoch": 0.8746109269428347, + "grad_norm": 9.467743873596191, + "learning_rate": 3.906334155376357e-07, + "loss": 0.2756, + "step": 34562 + }, + { + "epoch": 0.8746362325075284, + "grad_norm": 2.8869850635528564, + "learning_rate": 3.9047784386800405e-07, + "loss": 0.1571, + "step": 34563 + }, + { + "epoch": 0.8746615380722221, + "grad_norm": 5.1289286613464355, + "learning_rate": 3.9032230192432184e-07, + "loss": 0.1646, + "step": 34564 + }, + { + "epoch": 0.8746868436369157, + "grad_norm": 2.7037124633789062, + "learning_rate": 3.901667897075945e-07, + "loss": 0.0803, + "step": 34565 + }, + { + "epoch": 0.8747121492016094, + "grad_norm": 6.170302391052246, + "learning_rate": 3.900113072188233e-07, + "loss": 0.1873, + "step": 34566 + }, + { + "epoch": 0.8747374547663032, + "grad_norm": 11.18740463256836, + "learning_rate": 3.898558544590131e-07, + "loss": 0.1052, + "step": 34567 + }, + { + "epoch": 0.8747627603309968, + "grad_norm": 4.763828754425049, + "learning_rate": 3.8970043142916304e-07, + "loss": 0.2101, + "step": 34568 + }, + { + "epoch": 0.8747880658956905, + "grad_norm": 5.905595779418945, + "learning_rate": 3.895450381302784e-07, + "loss": 0.1386, + "step": 34569 + }, + { + "epoch": 0.8748133714603842, + "grad_norm": 3.8419036865234375, + "learning_rate": 3.893896745633591e-07, + "loss": 0.1297, + "step": 34570 + }, + { + "epoch": 0.8748386770250778, + "grad_norm": 3.9279234409332275, + "learning_rate": 3.892343407294097e-07, + "loss": 0.1735, + "step": 34571 + }, + { + "epoch": 0.8748639825897715, + "grad_norm": 2.9547553062438965, + "learning_rate": 3.890790366294295e-07, + "loss": 0.1178, + "step": 34572 + }, + { + "epoch": 0.8748892881544652, + "grad_norm": 8.483807563781738, + "learning_rate": 3.889237622644215e-07, + "loss": 0.188, + "step": 34573 + }, + { + "epoch": 0.8749145937191588, + "grad_norm": 7.088768005371094, + "learning_rate": 3.887685176353856e-07, + "loss": 0.1476, + "step": 34574 + }, + { + "epoch": 0.8749398992838525, + "grad_norm": 3.7176973819732666, + "learning_rate": 3.886133027433242e-07, + "loss": 0.1145, + "step": 34575 + }, + { + "epoch": 0.8749652048485462, + "grad_norm": 3.9904627799987793, + "learning_rate": 3.8845811758923823e-07, + "loss": 0.1329, + "step": 34576 + }, + { + "epoch": 0.8749905104132398, + "grad_norm": 2.2871670722961426, + "learning_rate": 3.883029621741274e-07, + "loss": 0.0196, + "step": 34577 + }, + { + "epoch": 0.8750158159779335, + "grad_norm": 5.766270160675049, + "learning_rate": 3.881478364989927e-07, + "loss": 0.0966, + "step": 34578 + }, + { + "epoch": 0.8750411215426273, + "grad_norm": 4.0487847328186035, + "learning_rate": 3.8799274056483535e-07, + "loss": 0.1915, + "step": 34579 + }, + { + "epoch": 0.8750664271073209, + "grad_norm": 4.410615921020508, + "learning_rate": 3.878376743726553e-07, + "loss": 0.1061, + "step": 34580 + }, + { + "epoch": 0.8750917326720146, + "grad_norm": 6.224915027618408, + "learning_rate": 3.876826379234516e-07, + "loss": 0.2008, + "step": 34581 + }, + { + "epoch": 0.8751170382367083, + "grad_norm": 6.949882984161377, + "learning_rate": 3.8752763121822413e-07, + "loss": 0.1432, + "step": 34582 + }, + { + "epoch": 0.8751423438014019, + "grad_norm": 5.5653181076049805, + "learning_rate": 3.873726542579737e-07, + "loss": 0.1843, + "step": 34583 + }, + { + "epoch": 0.8751676493660956, + "grad_norm": 5.6658477783203125, + "learning_rate": 3.872177070436989e-07, + "loss": 0.1343, + "step": 34584 + }, + { + "epoch": 0.8751929549307893, + "grad_norm": 5.001789093017578, + "learning_rate": 3.8706278957639896e-07, + "loss": 0.1303, + "step": 34585 + }, + { + "epoch": 0.8752182604954829, + "grad_norm": 6.421923637390137, + "learning_rate": 3.869079018570726e-07, + "loss": 0.191, + "step": 34586 + }, + { + "epoch": 0.8752435660601766, + "grad_norm": 8.413517951965332, + "learning_rate": 3.8675304388671953e-07, + "loss": 0.1523, + "step": 34587 + }, + { + "epoch": 0.8752688716248703, + "grad_norm": 5.789592742919922, + "learning_rate": 3.865982156663384e-07, + "loss": 0.2772, + "step": 34588 + }, + { + "epoch": 0.875294177189564, + "grad_norm": 4.6480536460876465, + "learning_rate": 3.864434171969267e-07, + "loss": 0.0963, + "step": 34589 + }, + { + "epoch": 0.8753194827542576, + "grad_norm": 10.089725494384766, + "learning_rate": 3.8628864847948367e-07, + "loss": 0.2717, + "step": 34590 + }, + { + "epoch": 0.8753447883189514, + "grad_norm": 3.00030255317688, + "learning_rate": 3.8613390951500574e-07, + "loss": 0.0839, + "step": 34591 + }, + { + "epoch": 0.8753700938836451, + "grad_norm": 7.668893814086914, + "learning_rate": 3.8597920030449323e-07, + "loss": 0.1568, + "step": 34592 + }, + { + "epoch": 0.8753953994483387, + "grad_norm": 3.1984612941741943, + "learning_rate": 3.858245208489414e-07, + "loss": 0.1148, + "step": 34593 + }, + { + "epoch": 0.8754207050130324, + "grad_norm": 2.958303689956665, + "learning_rate": 3.856698711493506e-07, + "loss": 0.1161, + "step": 34594 + }, + { + "epoch": 0.8754460105777261, + "grad_norm": 5.214315891265869, + "learning_rate": 3.8551525120671506e-07, + "loss": 0.1958, + "step": 34595 + }, + { + "epoch": 0.8754713161424197, + "grad_norm": 2.3818819522857666, + "learning_rate": 3.8536066102203394e-07, + "loss": 0.095, + "step": 34596 + }, + { + "epoch": 0.8754966217071134, + "grad_norm": 4.79706335067749, + "learning_rate": 3.852061005963026e-07, + "loss": 0.1419, + "step": 34597 + }, + { + "epoch": 0.8755219272718071, + "grad_norm": 11.607760429382324, + "learning_rate": 3.8505156993052075e-07, + "loss": 0.2464, + "step": 34598 + }, + { + "epoch": 0.8755472328365007, + "grad_norm": 5.845696449279785, + "learning_rate": 3.8489706902568037e-07, + "loss": 0.0789, + "step": 34599 + }, + { + "epoch": 0.8755725384011944, + "grad_norm": 3.451152801513672, + "learning_rate": 3.847425978827818e-07, + "loss": 0.0821, + "step": 34600 + }, + { + "epoch": 0.8755978439658881, + "grad_norm": 3.3441622257232666, + "learning_rate": 3.845881565028192e-07, + "loss": 0.119, + "step": 34601 + }, + { + "epoch": 0.8756231495305817, + "grad_norm": 5.38539981842041, + "learning_rate": 3.8443374488678906e-07, + "loss": 0.164, + "step": 34602 + }, + { + "epoch": 0.8756484550952754, + "grad_norm": 27.94254493713379, + "learning_rate": 3.842793630356878e-07, + "loss": 0.2161, + "step": 34603 + }, + { + "epoch": 0.8756737606599692, + "grad_norm": 3.7586991786956787, + "learning_rate": 3.8412501095051014e-07, + "loss": 0.1262, + "step": 34604 + }, + { + "epoch": 0.8756990662246628, + "grad_norm": 7.196725368499756, + "learning_rate": 3.8397068863225083e-07, + "loss": 0.1343, + "step": 34605 + }, + { + "epoch": 0.8757243717893565, + "grad_norm": 9.828399658203125, + "learning_rate": 3.8381639608190635e-07, + "loss": 0.3089, + "step": 34606 + }, + { + "epoch": 0.8757496773540502, + "grad_norm": 12.625191688537598, + "learning_rate": 3.8366213330047196e-07, + "loss": 0.0902, + "step": 34607 + }, + { + "epoch": 0.8757749829187438, + "grad_norm": 4.382643222808838, + "learning_rate": 3.8350790028894134e-07, + "loss": 0.1079, + "step": 34608 + }, + { + "epoch": 0.8758002884834375, + "grad_norm": 4.050898551940918, + "learning_rate": 3.833536970483087e-07, + "loss": 0.162, + "step": 34609 + }, + { + "epoch": 0.8758255940481312, + "grad_norm": 13.095555305480957, + "learning_rate": 3.8319952357957044e-07, + "loss": 0.2443, + "step": 34610 + }, + { + "epoch": 0.8758508996128248, + "grad_norm": 2.1768639087677, + "learning_rate": 3.8304537988371916e-07, + "loss": 0.1071, + "step": 34611 + }, + { + "epoch": 0.8758762051775185, + "grad_norm": 5.0495734214782715, + "learning_rate": 3.828912659617501e-07, + "loss": 0.2, + "step": 34612 + }, + { + "epoch": 0.8759015107422122, + "grad_norm": 8.437828063964844, + "learning_rate": 3.827371818146547e-07, + "loss": 0.1225, + "step": 34613 + }, + { + "epoch": 0.8759268163069059, + "grad_norm": 5.323512554168701, + "learning_rate": 3.8258312744342994e-07, + "loss": 0.1326, + "step": 34614 + }, + { + "epoch": 0.8759521218715995, + "grad_norm": 3.7414608001708984, + "learning_rate": 3.824291028490673e-07, + "loss": 0.1313, + "step": 34615 + }, + { + "epoch": 0.8759774274362933, + "grad_norm": 8.096822738647461, + "learning_rate": 3.822751080325604e-07, + "loss": 0.2804, + "step": 34616 + }, + { + "epoch": 0.876002733000987, + "grad_norm": 4.487400054931641, + "learning_rate": 3.8212114299490176e-07, + "loss": 0.1879, + "step": 34617 + }, + { + "epoch": 0.8760280385656806, + "grad_norm": 4.547844409942627, + "learning_rate": 3.8196720773708564e-07, + "loss": 0.1533, + "step": 34618 + }, + { + "epoch": 0.8760533441303743, + "grad_norm": 3.8371078968048096, + "learning_rate": 3.8181330226010393e-07, + "loss": 0.1115, + "step": 34619 + }, + { + "epoch": 0.876078649695068, + "grad_norm": 4.9890007972717285, + "learning_rate": 3.8165942656494924e-07, + "loss": 0.1063, + "step": 34620 + }, + { + "epoch": 0.8761039552597616, + "grad_norm": 3.659605026245117, + "learning_rate": 3.8150558065261357e-07, + "loss": 0.1195, + "step": 34621 + }, + { + "epoch": 0.8761292608244553, + "grad_norm": 3.7415895462036133, + "learning_rate": 3.8135176452408883e-07, + "loss": 0.1268, + "step": 34622 + }, + { + "epoch": 0.876154566389149, + "grad_norm": 5.077136039733887, + "learning_rate": 3.8119797818036817e-07, + "loss": 0.1484, + "step": 34623 + }, + { + "epoch": 0.8761798719538426, + "grad_norm": 3.2323365211486816, + "learning_rate": 3.8104422162244194e-07, + "loss": 0.0939, + "step": 34624 + }, + { + "epoch": 0.8762051775185363, + "grad_norm": 12.421055793762207, + "learning_rate": 3.808904948513037e-07, + "loss": 0.2617, + "step": 34625 + }, + { + "epoch": 0.87623048308323, + "grad_norm": 3.0601589679718018, + "learning_rate": 3.807367978679416e-07, + "loss": 0.1389, + "step": 34626 + }, + { + "epoch": 0.8762557886479236, + "grad_norm": 3.607827663421631, + "learning_rate": 3.8058313067334986e-07, + "loss": 0.1547, + "step": 34627 + }, + { + "epoch": 0.8762810942126174, + "grad_norm": 3.5149009227752686, + "learning_rate": 3.8042949326851705e-07, + "loss": 0.1539, + "step": 34628 + }, + { + "epoch": 0.8763063997773111, + "grad_norm": 8.182927131652832, + "learning_rate": 3.8027588565443694e-07, + "loss": 0.1367, + "step": 34629 + }, + { + "epoch": 0.8763317053420047, + "grad_norm": 4.575840950012207, + "learning_rate": 3.8012230783209705e-07, + "loss": 0.1399, + "step": 34630 + }, + { + "epoch": 0.8763570109066984, + "grad_norm": 11.928768157958984, + "learning_rate": 3.7996875980248926e-07, + "loss": 0.1986, + "step": 34631 + }, + { + "epoch": 0.8763823164713921, + "grad_norm": 6.865693092346191, + "learning_rate": 3.7981524156660286e-07, + "loss": 0.1208, + "step": 34632 + }, + { + "epoch": 0.8764076220360857, + "grad_norm": 2.995358467102051, + "learning_rate": 3.796617531254304e-07, + "loss": 0.1257, + "step": 34633 + }, + { + "epoch": 0.8764329276007794, + "grad_norm": 2.7320241928100586, + "learning_rate": 3.795082944799583e-07, + "loss": 0.1316, + "step": 34634 + }, + { + "epoch": 0.8764582331654731, + "grad_norm": 3.1846179962158203, + "learning_rate": 3.793548656311785e-07, + "loss": 0.1227, + "step": 34635 + }, + { + "epoch": 0.8764835387301667, + "grad_norm": 4.298444747924805, + "learning_rate": 3.7920146658007863e-07, + "loss": 0.1095, + "step": 34636 + }, + { + "epoch": 0.8765088442948604, + "grad_norm": 4.234645843505859, + "learning_rate": 3.7904809732764947e-07, + "loss": 0.1533, + "step": 34637 + }, + { + "epoch": 0.8765341498595541, + "grad_norm": 5.652884483337402, + "learning_rate": 3.788947578748803e-07, + "loss": 0.1283, + "step": 34638 + }, + { + "epoch": 0.8765594554242478, + "grad_norm": 3.428260087966919, + "learning_rate": 3.787414482227586e-07, + "loss": 0.0831, + "step": 34639 + }, + { + "epoch": 0.8765847609889414, + "grad_norm": 2.405888080596924, + "learning_rate": 3.7858816837227366e-07, + "loss": 0.1251, + "step": 34640 + }, + { + "epoch": 0.8766100665536352, + "grad_norm": 4.115677833557129, + "learning_rate": 3.78434918324414e-07, + "loss": 0.1617, + "step": 34641 + }, + { + "epoch": 0.8766353721183289, + "grad_norm": 8.506336212158203, + "learning_rate": 3.782816980801685e-07, + "loss": 0.1846, + "step": 34642 + }, + { + "epoch": 0.8766606776830225, + "grad_norm": 3.21247935295105, + "learning_rate": 3.781285076405239e-07, + "loss": 0.1831, + "step": 34643 + }, + { + "epoch": 0.8766859832477162, + "grad_norm": 4.970163822174072, + "learning_rate": 3.7797534700646843e-07, + "loss": 0.1837, + "step": 34644 + }, + { + "epoch": 0.8767112888124099, + "grad_norm": 6.675630569458008, + "learning_rate": 3.7782221617899127e-07, + "loss": 0.1231, + "step": 34645 + }, + { + "epoch": 0.8767365943771035, + "grad_norm": 4.2913126945495605, + "learning_rate": 3.7766911515907833e-07, + "loss": 0.0984, + "step": 34646 + }, + { + "epoch": 0.8767618999417972, + "grad_norm": 5.360395431518555, + "learning_rate": 3.775160439477177e-07, + "loss": 0.1802, + "step": 34647 + }, + { + "epoch": 0.8767872055064909, + "grad_norm": 3.200139284133911, + "learning_rate": 3.773630025458963e-07, + "loss": 0.126, + "step": 34648 + }, + { + "epoch": 0.8768125110711845, + "grad_norm": 4.1819891929626465, + "learning_rate": 3.772099909546001e-07, + "loss": 0.1089, + "step": 34649 + }, + { + "epoch": 0.8768378166358782, + "grad_norm": 4.882070541381836, + "learning_rate": 3.7705700917481767e-07, + "loss": 0.1522, + "step": 34650 + }, + { + "epoch": 0.8768631222005719, + "grad_norm": 2.5710809230804443, + "learning_rate": 3.7690405720753445e-07, + "loss": 0.1352, + "step": 34651 + }, + { + "epoch": 0.8768884277652655, + "grad_norm": 16.18390655517578, + "learning_rate": 3.767511350537373e-07, + "loss": 0.2169, + "step": 34652 + }, + { + "epoch": 0.8769137333299593, + "grad_norm": 7.368236064910889, + "learning_rate": 3.7659824271441104e-07, + "loss": 0.2088, + "step": 34653 + }, + { + "epoch": 0.876939038894653, + "grad_norm": 5.522877216339111, + "learning_rate": 3.7644538019054375e-07, + "loss": 0.1892, + "step": 34654 + }, + { + "epoch": 0.8769643444593466, + "grad_norm": 6.909284591674805, + "learning_rate": 3.762925474831192e-07, + "loss": 0.1483, + "step": 34655 + }, + { + "epoch": 0.8769896500240403, + "grad_norm": 4.999801158905029, + "learning_rate": 3.761397445931253e-07, + "loss": 0.1175, + "step": 34656 + }, + { + "epoch": 0.877014955588734, + "grad_norm": 5.35866641998291, + "learning_rate": 3.7598697152154475e-07, + "loss": 0.1654, + "step": 34657 + }, + { + "epoch": 0.8770402611534276, + "grad_norm": 4.3158159255981445, + "learning_rate": 3.75834228269365e-07, + "loss": 0.1423, + "step": 34658 + }, + { + "epoch": 0.8770655667181213, + "grad_norm": 4.154253959655762, + "learning_rate": 3.756815148375692e-07, + "loss": 0.1333, + "step": 34659 + }, + { + "epoch": 0.877090872282815, + "grad_norm": 4.379216194152832, + "learning_rate": 3.755288312271449e-07, + "loss": 0.1316, + "step": 34660 + }, + { + "epoch": 0.8771161778475086, + "grad_norm": 9.79621696472168, + "learning_rate": 3.7537617743907294e-07, + "loss": 0.2117, + "step": 34661 + }, + { + "epoch": 0.8771414834122023, + "grad_norm": 3.5699868202209473, + "learning_rate": 3.752235534743409e-07, + "loss": 0.0895, + "step": 34662 + }, + { + "epoch": 0.877166788976896, + "grad_norm": 2.982649326324463, + "learning_rate": 3.750709593339308e-07, + "loss": 0.1118, + "step": 34663 + }, + { + "epoch": 0.8771920945415898, + "grad_norm": 7.212841033935547, + "learning_rate": 3.749183950188295e-07, + "loss": 0.1205, + "step": 34664 + }, + { + "epoch": 0.8772174001062834, + "grad_norm": 4.5509819984436035, + "learning_rate": 3.747658605300175e-07, + "loss": 0.1676, + "step": 34665 + }, + { + "epoch": 0.8772427056709771, + "grad_norm": 4.58762264251709, + "learning_rate": 3.7461335586848113e-07, + "loss": 0.1214, + "step": 34666 + }, + { + "epoch": 0.8772680112356708, + "grad_norm": 4.9506964683532715, + "learning_rate": 3.7446088103520184e-07, + "loss": 0.1613, + "step": 34667 + }, + { + "epoch": 0.8772933168003644, + "grad_norm": 8.67823314666748, + "learning_rate": 3.7430843603116495e-07, + "loss": 0.1229, + "step": 34668 + }, + { + "epoch": 0.8773186223650581, + "grad_norm": 8.315762519836426, + "learning_rate": 3.741560208573525e-07, + "loss": 0.1767, + "step": 34669 + }, + { + "epoch": 0.8773439279297518, + "grad_norm": 7.399193286895752, + "learning_rate": 3.7400363551474694e-07, + "loss": 0.148, + "step": 34670 + }, + { + "epoch": 0.8773692334944454, + "grad_norm": 22.85226058959961, + "learning_rate": 3.738512800043309e-07, + "loss": 0.2585, + "step": 34671 + }, + { + "epoch": 0.8773945390591391, + "grad_norm": 4.841699123382568, + "learning_rate": 3.736989543270886e-07, + "loss": 0.183, + "step": 34672 + }, + { + "epoch": 0.8774198446238328, + "grad_norm": 5.050178050994873, + "learning_rate": 3.7354665848400083e-07, + "loss": 0.1715, + "step": 34673 + }, + { + "epoch": 0.8774451501885264, + "grad_norm": 4.497339725494385, + "learning_rate": 3.7339439247605023e-07, + "loss": 0.168, + "step": 34674 + }, + { + "epoch": 0.8774704557532201, + "grad_norm": 3.485820770263672, + "learning_rate": 3.7324215630421825e-07, + "loss": 0.0813, + "step": 34675 + }, + { + "epoch": 0.8774957613179138, + "grad_norm": 2.6269924640655518, + "learning_rate": 3.730899499694862e-07, + "loss": 0.1506, + "step": 34676 + }, + { + "epoch": 0.8775210668826074, + "grad_norm": 4.633882999420166, + "learning_rate": 3.729377734728373e-07, + "loss": 0.1634, + "step": 34677 + }, + { + "epoch": 0.8775463724473012, + "grad_norm": 9.749228477478027, + "learning_rate": 3.7278562681525244e-07, + "loss": 0.2287, + "step": 34678 + }, + { + "epoch": 0.8775716780119949, + "grad_norm": 8.042923927307129, + "learning_rate": 3.7263350999771184e-07, + "loss": 0.241, + "step": 34679 + }, + { + "epoch": 0.8775969835766885, + "grad_norm": 6.500609874725342, + "learning_rate": 3.724814230211965e-07, + "loss": 0.1742, + "step": 34680 + }, + { + "epoch": 0.8776222891413822, + "grad_norm": 5.165983200073242, + "learning_rate": 3.7232936588668777e-07, + "loss": 0.2146, + "step": 34681 + }, + { + "epoch": 0.8776475947060759, + "grad_norm": 8.846787452697754, + "learning_rate": 3.7217733859516714e-07, + "loss": 0.2499, + "step": 34682 + }, + { + "epoch": 0.8776729002707695, + "grad_norm": 7.486447811126709, + "learning_rate": 3.7202534114761324e-07, + "loss": 0.1203, + "step": 34683 + }, + { + "epoch": 0.8776982058354632, + "grad_norm": 4.753330707550049, + "learning_rate": 3.718733735450064e-07, + "loss": 0.0582, + "step": 34684 + }, + { + "epoch": 0.8777235114001569, + "grad_norm": 4.868987083435059, + "learning_rate": 3.7172143578832865e-07, + "loss": 0.1651, + "step": 34685 + }, + { + "epoch": 0.8777488169648505, + "grad_norm": 5.490060806274414, + "learning_rate": 3.7156952787855746e-07, + "loss": 0.172, + "step": 34686 + }, + { + "epoch": 0.8777741225295442, + "grad_norm": 6.6913886070251465, + "learning_rate": 3.714176498166744e-07, + "loss": 0.186, + "step": 34687 + }, + { + "epoch": 0.8777994280942379, + "grad_norm": 6.547604084014893, + "learning_rate": 3.712658016036563e-07, + "loss": 0.2367, + "step": 34688 + }, + { + "epoch": 0.8778247336589315, + "grad_norm": 3.401172161102295, + "learning_rate": 3.711139832404853e-07, + "loss": 0.1241, + "step": 34689 + }, + { + "epoch": 0.8778500392236253, + "grad_norm": 3.7862703800201416, + "learning_rate": 3.709621947281383e-07, + "loss": 0.0668, + "step": 34690 + }, + { + "epoch": 0.877875344788319, + "grad_norm": 4.909913539886475, + "learning_rate": 3.708104360675968e-07, + "loss": 0.153, + "step": 34691 + }, + { + "epoch": 0.8779006503530127, + "grad_norm": 5.442347049713135, + "learning_rate": 3.706587072598361e-07, + "loss": 0.2561, + "step": 34692 + }, + { + "epoch": 0.8779259559177063, + "grad_norm": 7.015289306640625, + "learning_rate": 3.705070083058371e-07, + "loss": 0.1872, + "step": 34693 + }, + { + "epoch": 0.8779512614824, + "grad_norm": 5.34039306640625, + "learning_rate": 3.7035533920657616e-07, + "loss": 0.1124, + "step": 34694 + }, + { + "epoch": 0.8779765670470937, + "grad_norm": 3.9343044757843018, + "learning_rate": 3.702036999630343e-07, + "loss": 0.0865, + "step": 34695 + }, + { + "epoch": 0.8780018726117873, + "grad_norm": 6.988046646118164, + "learning_rate": 3.700520905761856e-07, + "loss": 0.1668, + "step": 34696 + }, + { + "epoch": 0.878027178176481, + "grad_norm": 3.844292163848877, + "learning_rate": 3.6990051104701054e-07, + "loss": 0.1481, + "step": 34697 + }, + { + "epoch": 0.8780524837411747, + "grad_norm": 2.6551547050476074, + "learning_rate": 3.697489613764854e-07, + "loss": 0.0981, + "step": 34698 + }, + { + "epoch": 0.8780777893058683, + "grad_norm": 4.646564960479736, + "learning_rate": 3.695974415655884e-07, + "loss": 0.1565, + "step": 34699 + }, + { + "epoch": 0.878103094870562, + "grad_norm": 17.372793197631836, + "learning_rate": 3.6944595161529653e-07, + "loss": 0.3001, + "step": 34700 + }, + { + "epoch": 0.8781284004352558, + "grad_norm": 3.9254016876220703, + "learning_rate": 3.692944915265861e-07, + "loss": 0.0956, + "step": 34701 + }, + { + "epoch": 0.8781537059999494, + "grad_norm": 6.721383571624756, + "learning_rate": 3.6914306130043377e-07, + "loss": 0.0996, + "step": 34702 + }, + { + "epoch": 0.8781790115646431, + "grad_norm": 3.76141357421875, + "learning_rate": 3.689916609378169e-07, + "loss": 0.1001, + "step": 34703 + }, + { + "epoch": 0.8782043171293368, + "grad_norm": 3.407717704772949, + "learning_rate": 3.688402904397109e-07, + "loss": 0.1076, + "step": 34704 + }, + { + "epoch": 0.8782296226940304, + "grad_norm": 10.181867599487305, + "learning_rate": 3.686889498070928e-07, + "loss": 0.2148, + "step": 34705 + }, + { + "epoch": 0.8782549282587241, + "grad_norm": 9.463896751403809, + "learning_rate": 3.6853763904093843e-07, + "loss": 0.258, + "step": 34706 + }, + { + "epoch": 0.8782802338234178, + "grad_norm": 2.7565598487854004, + "learning_rate": 3.6838635814222255e-07, + "loss": 0.1602, + "step": 34707 + }, + { + "epoch": 0.8783055393881114, + "grad_norm": 3.606327772140503, + "learning_rate": 3.682351071119217e-07, + "loss": 0.1055, + "step": 34708 + }, + { + "epoch": 0.8783308449528051, + "grad_norm": 5.360023498535156, + "learning_rate": 3.6808388595101165e-07, + "loss": 0.15, + "step": 34709 + }, + { + "epoch": 0.8783561505174988, + "grad_norm": 6.859926223754883, + "learning_rate": 3.6793269466046665e-07, + "loss": 0.1581, + "step": 34710 + }, + { + "epoch": 0.8783814560821924, + "grad_norm": 4.350323677062988, + "learning_rate": 3.6778153324126154e-07, + "loss": 0.1045, + "step": 34711 + }, + { + "epoch": 0.8784067616468861, + "grad_norm": 6.959863662719727, + "learning_rate": 3.6763040169437214e-07, + "loss": 0.1613, + "step": 34712 + }, + { + "epoch": 0.8784320672115798, + "grad_norm": 2.400355339050293, + "learning_rate": 3.674793000207727e-07, + "loss": 0.1094, + "step": 34713 + }, + { + "epoch": 0.8784573727762734, + "grad_norm": 3.803312301635742, + "learning_rate": 3.67328228221438e-07, + "loss": 0.115, + "step": 34714 + }, + { + "epoch": 0.8784826783409672, + "grad_norm": 4.727200984954834, + "learning_rate": 3.671771862973406e-07, + "loss": 0.1634, + "step": 34715 + }, + { + "epoch": 0.8785079839056609, + "grad_norm": 4.173731803894043, + "learning_rate": 3.670261742494563e-07, + "loss": 0.1221, + "step": 34716 + }, + { + "epoch": 0.8785332894703546, + "grad_norm": 8.114814758300781, + "learning_rate": 3.6687519207875886e-07, + "loss": 0.1476, + "step": 34717 + }, + { + "epoch": 0.8785585950350482, + "grad_norm": 8.80781078338623, + "learning_rate": 3.667242397862214e-07, + "loss": 0.1635, + "step": 34718 + }, + { + "epoch": 0.8785839005997419, + "grad_norm": 3.8836019039154053, + "learning_rate": 3.665733173728164e-07, + "loss": 0.1378, + "step": 34719 + }, + { + "epoch": 0.8786092061644356, + "grad_norm": 7.466248989105225, + "learning_rate": 3.6642242483951926e-07, + "loss": 0.1705, + "step": 34720 + }, + { + "epoch": 0.8786345117291292, + "grad_norm": 3.639596700668335, + "learning_rate": 3.6627156218730086e-07, + "loss": 0.1045, + "step": 34721 + }, + { + "epoch": 0.8786598172938229, + "grad_norm": 4.094355583190918, + "learning_rate": 3.661207294171376e-07, + "loss": 0.094, + "step": 34722 + }, + { + "epoch": 0.8786851228585166, + "grad_norm": 5.066620826721191, + "learning_rate": 3.6596992652999766e-07, + "loss": 0.1679, + "step": 34723 + }, + { + "epoch": 0.8787104284232102, + "grad_norm": 9.500694274902344, + "learning_rate": 3.6581915352685627e-07, + "loss": 0.1641, + "step": 34724 + }, + { + "epoch": 0.8787357339879039, + "grad_norm": 14.712787628173828, + "learning_rate": 3.656684104086844e-07, + "loss": 0.2131, + "step": 34725 + }, + { + "epoch": 0.8787610395525977, + "grad_norm": 3.7245118618011475, + "learning_rate": 3.655176971764568e-07, + "loss": 0.0991, + "step": 34726 + }, + { + "epoch": 0.8787863451172913, + "grad_norm": 4.524343967437744, + "learning_rate": 3.653670138311416e-07, + "loss": 0.1189, + "step": 34727 + }, + { + "epoch": 0.878811650681985, + "grad_norm": 4.882424831390381, + "learning_rate": 3.65216360373713e-07, + "loss": 0.0832, + "step": 34728 + }, + { + "epoch": 0.8788369562466787, + "grad_norm": 3.5290369987487793, + "learning_rate": 3.650657368051419e-07, + "loss": 0.1564, + "step": 34729 + }, + { + "epoch": 0.8788622618113723, + "grad_norm": 5.294898509979248, + "learning_rate": 3.6491514312639973e-07, + "loss": 0.1433, + "step": 34730 + }, + { + "epoch": 0.878887567376066, + "grad_norm": 3.9817769527435303, + "learning_rate": 3.64764579338458e-07, + "loss": 0.1381, + "step": 34731 + }, + { + "epoch": 0.8789128729407597, + "grad_norm": 4.615243911743164, + "learning_rate": 3.646140454422875e-07, + "loss": 0.1682, + "step": 34732 + }, + { + "epoch": 0.8789381785054533, + "grad_norm": 2.977139711380005, + "learning_rate": 3.644635414388586e-07, + "loss": 0.1156, + "step": 34733 + }, + { + "epoch": 0.878963484070147, + "grad_norm": 8.519417762756348, + "learning_rate": 3.643130673291412e-07, + "loss": 0.1307, + "step": 34734 + }, + { + "epoch": 0.8789887896348407, + "grad_norm": 4.175856113433838, + "learning_rate": 3.6416262311410713e-07, + "loss": 0.1786, + "step": 34735 + }, + { + "epoch": 0.8790140951995343, + "grad_norm": 3.676426887512207, + "learning_rate": 3.6401220879472623e-07, + "loss": 0.0872, + "step": 34736 + }, + { + "epoch": 0.879039400764228, + "grad_norm": 2.713810682296753, + "learning_rate": 3.638618243719677e-07, + "loss": 0.1021, + "step": 34737 + }, + { + "epoch": 0.8790647063289218, + "grad_norm": 4.159943580627441, + "learning_rate": 3.637114698468019e-07, + "loss": 0.1281, + "step": 34738 + }, + { + "epoch": 0.8790900118936154, + "grad_norm": 3.108811616897583, + "learning_rate": 3.635611452201987e-07, + "loss": 0.1224, + "step": 34739 + }, + { + "epoch": 0.8791153174583091, + "grad_norm": 3.962630271911621, + "learning_rate": 3.634108504931272e-07, + "loss": 0.1408, + "step": 34740 + }, + { + "epoch": 0.8791406230230028, + "grad_norm": 10.96523666381836, + "learning_rate": 3.632605856665572e-07, + "loss": 0.1673, + "step": 34741 + }, + { + "epoch": 0.8791659285876965, + "grad_norm": 4.489306449890137, + "learning_rate": 3.631103507414557e-07, + "loss": 0.1184, + "step": 34742 + }, + { + "epoch": 0.8791912341523901, + "grad_norm": 6.716671943664551, + "learning_rate": 3.6296014571879424e-07, + "loss": 0.1385, + "step": 34743 + }, + { + "epoch": 0.8792165397170838, + "grad_norm": 7.018754482269287, + "learning_rate": 3.6280997059954024e-07, + "loss": 0.1408, + "step": 34744 + }, + { + "epoch": 0.8792418452817775, + "grad_norm": 3.6559855937957764, + "learning_rate": 3.6265982538466195e-07, + "loss": 0.1429, + "step": 34745 + }, + { + "epoch": 0.8792671508464711, + "grad_norm": 6.62874174118042, + "learning_rate": 3.6250971007512683e-07, + "loss": 0.1723, + "step": 34746 + }, + { + "epoch": 0.8792924564111648, + "grad_norm": 11.050382614135742, + "learning_rate": 3.623596246719052e-07, + "loss": 0.2005, + "step": 34747 + }, + { + "epoch": 0.8793177619758585, + "grad_norm": 8.133139610290527, + "learning_rate": 3.622095691759636e-07, + "loss": 0.1586, + "step": 34748 + }, + { + "epoch": 0.8793430675405521, + "grad_norm": 9.752557754516602, + "learning_rate": 3.6205954358826955e-07, + "loss": 0.2981, + "step": 34749 + }, + { + "epoch": 0.8793683731052458, + "grad_norm": 8.387613296508789, + "learning_rate": 3.6190954790979003e-07, + "loss": 0.1476, + "step": 34750 + }, + { + "epoch": 0.8793936786699396, + "grad_norm": 3.9680659770965576, + "learning_rate": 3.617595821414943e-07, + "loss": 0.0968, + "step": 34751 + }, + { + "epoch": 0.8794189842346332, + "grad_norm": 4.660216808319092, + "learning_rate": 3.616096462843471e-07, + "loss": 0.1046, + "step": 34752 + }, + { + "epoch": 0.8794442897993269, + "grad_norm": 7.9724650382995605, + "learning_rate": 3.614597403393183e-07, + "loss": 0.171, + "step": 34753 + }, + { + "epoch": 0.8794695953640206, + "grad_norm": 3.8808727264404297, + "learning_rate": 3.613098643073709e-07, + "loss": 0.1388, + "step": 34754 + }, + { + "epoch": 0.8794949009287142, + "grad_norm": 4.8662872314453125, + "learning_rate": 3.6116001818947423e-07, + "loss": 0.169, + "step": 34755 + }, + { + "epoch": 0.8795202064934079, + "grad_norm": 4.042889595031738, + "learning_rate": 3.6101020198659297e-07, + "loss": 0.1383, + "step": 34756 + }, + { + "epoch": 0.8795455120581016, + "grad_norm": 5.580979824066162, + "learning_rate": 3.608604156996953e-07, + "loss": 0.1345, + "step": 34757 + }, + { + "epoch": 0.8795708176227952, + "grad_norm": 5.6739301681518555, + "learning_rate": 3.607106593297449e-07, + "loss": 0.1209, + "step": 34758 + }, + { + "epoch": 0.8795961231874889, + "grad_norm": 9.046797752380371, + "learning_rate": 3.6056093287770876e-07, + "loss": 0.1808, + "step": 34759 + }, + { + "epoch": 0.8796214287521826, + "grad_norm": 23.191631317138672, + "learning_rate": 3.604112363445522e-07, + "loss": 0.1823, + "step": 34760 + }, + { + "epoch": 0.8796467343168762, + "grad_norm": 18.808855056762695, + "learning_rate": 3.602615697312406e-07, + "loss": 0.1887, + "step": 34761 + }, + { + "epoch": 0.87967203988157, + "grad_norm": 7.064826965332031, + "learning_rate": 3.601119330387387e-07, + "loss": 0.1735, + "step": 34762 + }, + { + "epoch": 0.8796973454462637, + "grad_norm": 3.0511441230773926, + "learning_rate": 3.599623262680119e-07, + "loss": 0.1256, + "step": 34763 + }, + { + "epoch": 0.8797226510109573, + "grad_norm": 6.89718770980835, + "learning_rate": 3.5981274942002544e-07, + "loss": 0.1566, + "step": 34764 + }, + { + "epoch": 0.879747956575651, + "grad_norm": 6.0822529792785645, + "learning_rate": 3.596632024957425e-07, + "loss": 0.1703, + "step": 34765 + }, + { + "epoch": 0.8797732621403447, + "grad_norm": 3.5077435970306396, + "learning_rate": 3.5951368549612953e-07, + "loss": 0.0964, + "step": 34766 + }, + { + "epoch": 0.8797985677050384, + "grad_norm": 7.291057586669922, + "learning_rate": 3.593641984221491e-07, + "loss": 0.157, + "step": 34767 + }, + { + "epoch": 0.879823873269732, + "grad_norm": 4.318066120147705, + "learning_rate": 3.59214741274766e-07, + "loss": 0.1924, + "step": 34768 + }, + { + "epoch": 0.8798491788344257, + "grad_norm": 4.338094234466553, + "learning_rate": 3.5906531405494273e-07, + "loss": 0.1288, + "step": 34769 + }, + { + "epoch": 0.8798744843991194, + "grad_norm": 6.825384140014648, + "learning_rate": 3.589159167636458e-07, + "loss": 0.2102, + "step": 34770 + }, + { + "epoch": 0.879899789963813, + "grad_norm": 4.225398063659668, + "learning_rate": 3.58766549401835e-07, + "loss": 0.0974, + "step": 34771 + }, + { + "epoch": 0.8799250955285067, + "grad_norm": 3.80191707611084, + "learning_rate": 3.5861721197047674e-07, + "loss": 0.1425, + "step": 34772 + }, + { + "epoch": 0.8799504010932004, + "grad_norm": 4.892887115478516, + "learning_rate": 3.584679044705314e-07, + "loss": 0.1658, + "step": 34773 + }, + { + "epoch": 0.879975706657894, + "grad_norm": 12.124369621276855, + "learning_rate": 3.5831862690296427e-07, + "loss": 0.1945, + "step": 34774 + }, + { + "epoch": 0.8800010122225878, + "grad_norm": 6.816154956817627, + "learning_rate": 3.581693792687363e-07, + "loss": 0.1866, + "step": 34775 + }, + { + "epoch": 0.8800263177872815, + "grad_norm": 13.518561363220215, + "learning_rate": 3.5802016156881113e-07, + "loss": 0.2237, + "step": 34776 + }, + { + "epoch": 0.8800516233519751, + "grad_norm": 5.049192428588867, + "learning_rate": 3.578709738041497e-07, + "loss": 0.1242, + "step": 34777 + }, + { + "epoch": 0.8800769289166688, + "grad_norm": 7.298155307769775, + "learning_rate": 3.577218159757151e-07, + "loss": 0.1888, + "step": 34778 + }, + { + "epoch": 0.8801022344813625, + "grad_norm": 4.849765300750732, + "learning_rate": 3.575726880844693e-07, + "loss": 0.1736, + "step": 34779 + }, + { + "epoch": 0.8801275400460561, + "grad_norm": 34.708740234375, + "learning_rate": 3.5742359013137384e-07, + "loss": 0.288, + "step": 34780 + }, + { + "epoch": 0.8801528456107498, + "grad_norm": 3.0676567554473877, + "learning_rate": 3.57274522117389e-07, + "loss": 0.1146, + "step": 34781 + }, + { + "epoch": 0.8801781511754435, + "grad_norm": 3.4062142372131348, + "learning_rate": 3.5712548404347846e-07, + "loss": 0.0937, + "step": 34782 + }, + { + "epoch": 0.8802034567401371, + "grad_norm": 2.232356309890747, + "learning_rate": 3.5697647591060037e-07, + "loss": 0.108, + "step": 34783 + }, + { + "epoch": 0.8802287623048308, + "grad_norm": 5.122617721557617, + "learning_rate": 3.568274977197195e-07, + "loss": 0.1927, + "step": 34784 + }, + { + "epoch": 0.8802540678695245, + "grad_norm": 5.915834903717041, + "learning_rate": 3.566785494717928e-07, + "loss": 0.1181, + "step": 34785 + }, + { + "epoch": 0.8802793734342181, + "grad_norm": 5.1534624099731445, + "learning_rate": 3.5652963116778295e-07, + "loss": 0.133, + "step": 34786 + }, + { + "epoch": 0.8803046789989118, + "grad_norm": 4.298131465911865, + "learning_rate": 3.563807428086502e-07, + "loss": 0.1325, + "step": 34787 + }, + { + "epoch": 0.8803299845636056, + "grad_norm": 4.321763515472412, + "learning_rate": 3.5623188439535386e-07, + "loss": 0.1642, + "step": 34788 + }, + { + "epoch": 0.8803552901282992, + "grad_norm": 8.404102325439453, + "learning_rate": 3.560830559288536e-07, + "loss": 0.2197, + "step": 34789 + }, + { + "epoch": 0.8803805956929929, + "grad_norm": 9.13877010345459, + "learning_rate": 3.55934257410111e-07, + "loss": 0.1849, + "step": 34790 + }, + { + "epoch": 0.8804059012576866, + "grad_norm": 3.501185417175293, + "learning_rate": 3.5578548884008413e-07, + "loss": 0.1594, + "step": 34791 + }, + { + "epoch": 0.8804312068223803, + "grad_norm": 6.972533226013184, + "learning_rate": 3.556367502197328e-07, + "loss": 0.1415, + "step": 34792 + }, + { + "epoch": 0.8804565123870739, + "grad_norm": 3.606414556503296, + "learning_rate": 3.55488041550015e-07, + "loss": 0.0946, + "step": 34793 + }, + { + "epoch": 0.8804818179517676, + "grad_norm": 6.153117656707764, + "learning_rate": 3.553393628318924e-07, + "loss": 0.1466, + "step": 34794 + }, + { + "epoch": 0.8805071235164613, + "grad_norm": 4.519932746887207, + "learning_rate": 3.551907140663219e-07, + "loss": 0.1128, + "step": 34795 + }, + { + "epoch": 0.8805324290811549, + "grad_norm": 2.3689069747924805, + "learning_rate": 3.5504209525426157e-07, + "loss": 0.0964, + "step": 34796 + }, + { + "epoch": 0.8805577346458486, + "grad_norm": 4.501352787017822, + "learning_rate": 3.548935063966724e-07, + "loss": 0.1205, + "step": 34797 + }, + { + "epoch": 0.8805830402105423, + "grad_norm": 4.919566631317139, + "learning_rate": 3.547449474945097e-07, + "loss": 0.1505, + "step": 34798 + }, + { + "epoch": 0.880608345775236, + "grad_norm": 25.004276275634766, + "learning_rate": 3.545964185487333e-07, + "loss": 0.1452, + "step": 34799 + }, + { + "epoch": 0.8806336513399297, + "grad_norm": 4.261805057525635, + "learning_rate": 3.5444791956029956e-07, + "loss": 0.173, + "step": 34800 + }, + { + "epoch": 0.8806589569046234, + "grad_norm": 13.562569618225098, + "learning_rate": 3.542994505301689e-07, + "loss": 0.3035, + "step": 34801 + }, + { + "epoch": 0.880684262469317, + "grad_norm": 8.406228065490723, + "learning_rate": 3.54151011459295e-07, + "loss": 0.2172, + "step": 34802 + }, + { + "epoch": 0.8807095680340107, + "grad_norm": 7.162978649139404, + "learning_rate": 3.5400260234863816e-07, + "loss": 0.242, + "step": 34803 + }, + { + "epoch": 0.8807348735987044, + "grad_norm": 2.8728270530700684, + "learning_rate": 3.538542231991532e-07, + "loss": 0.1041, + "step": 34804 + }, + { + "epoch": 0.880760179163398, + "grad_norm": 3.826277017593384, + "learning_rate": 3.5370587401179877e-07, + "loss": 0.1459, + "step": 34805 + }, + { + "epoch": 0.8807854847280917, + "grad_norm": 7.0861430168151855, + "learning_rate": 3.535575547875314e-07, + "loss": 0.1535, + "step": 34806 + }, + { + "epoch": 0.8808107902927854, + "grad_norm": 5.146228790283203, + "learning_rate": 3.5340926552730693e-07, + "loss": 0.1244, + "step": 34807 + }, + { + "epoch": 0.880836095857479, + "grad_norm": 6.1832990646362305, + "learning_rate": 3.532610062320807e-07, + "loss": 0.2068, + "step": 34808 + }, + { + "epoch": 0.8808614014221727, + "grad_norm": 2.4321606159210205, + "learning_rate": 3.531127769028109e-07, + "loss": 0.1296, + "step": 34809 + }, + { + "epoch": 0.8808867069868664, + "grad_norm": 3.7654905319213867, + "learning_rate": 3.529645775404522e-07, + "loss": 0.1247, + "step": 34810 + }, + { + "epoch": 0.88091201255156, + "grad_norm": 3.2404918670654297, + "learning_rate": 3.528164081459606e-07, + "loss": 0.1255, + "step": 34811 + }, + { + "epoch": 0.8809373181162538, + "grad_norm": 13.725220680236816, + "learning_rate": 3.5266826872029094e-07, + "loss": 0.1737, + "step": 34812 + }, + { + "epoch": 0.8809626236809475, + "grad_norm": 3.929523468017578, + "learning_rate": 3.525201592644001e-07, + "loss": 0.1236, + "step": 34813 + }, + { + "epoch": 0.8809879292456411, + "grad_norm": 5.602436542510986, + "learning_rate": 3.523720797792418e-07, + "loss": 0.1361, + "step": 34814 + }, + { + "epoch": 0.8810132348103348, + "grad_norm": 5.908054828643799, + "learning_rate": 3.52224030265772e-07, + "loss": 0.1358, + "step": 34815 + }, + { + "epoch": 0.8810385403750285, + "grad_norm": 8.62160873413086, + "learning_rate": 3.5207601072494325e-07, + "loss": 0.226, + "step": 34816 + }, + { + "epoch": 0.8810638459397221, + "grad_norm": 11.209787368774414, + "learning_rate": 3.5192802115771306e-07, + "loss": 0.1769, + "step": 34817 + }, + { + "epoch": 0.8810891515044158, + "grad_norm": 13.394140243530273, + "learning_rate": 3.5178006156503466e-07, + "loss": 0.1573, + "step": 34818 + }, + { + "epoch": 0.8811144570691095, + "grad_norm": 8.13814926147461, + "learning_rate": 3.516321319478616e-07, + "loss": 0.2074, + "step": 34819 + }, + { + "epoch": 0.8811397626338032, + "grad_norm": 4.684924602508545, + "learning_rate": 3.5148423230714767e-07, + "loss": 0.1352, + "step": 34820 + }, + { + "epoch": 0.8811650681984968, + "grad_norm": 3.6414132118225098, + "learning_rate": 3.5133636264384764e-07, + "loss": 0.1415, + "step": 34821 + }, + { + "epoch": 0.8811903737631905, + "grad_norm": 3.844860553741455, + "learning_rate": 3.5118852295891514e-07, + "loss": 0.1444, + "step": 34822 + }, + { + "epoch": 0.8812156793278842, + "grad_norm": 2.5026586055755615, + "learning_rate": 3.510407132533028e-07, + "loss": 0.0722, + "step": 34823 + }, + { + "epoch": 0.8812409848925778, + "grad_norm": 4.103017807006836, + "learning_rate": 3.508929335279637e-07, + "loss": 0.1763, + "step": 34824 + }, + { + "epoch": 0.8812662904572716, + "grad_norm": 4.912778377532959, + "learning_rate": 3.507451837838516e-07, + "loss": 0.0831, + "step": 34825 + }, + { + "epoch": 0.8812915960219653, + "grad_norm": 3.9044699668884277, + "learning_rate": 3.505974640219195e-07, + "loss": 0.0962, + "step": 34826 + }, + { + "epoch": 0.8813169015866589, + "grad_norm": 4.468260765075684, + "learning_rate": 3.5044977424311846e-07, + "loss": 0.1645, + "step": 34827 + }, + { + "epoch": 0.8813422071513526, + "grad_norm": 2.5066184997558594, + "learning_rate": 3.503021144484037e-07, + "loss": 0.1118, + "step": 34828 + }, + { + "epoch": 0.8813675127160463, + "grad_norm": 15.18973159790039, + "learning_rate": 3.5015448463872394e-07, + "loss": 0.1846, + "step": 34829 + }, + { + "epoch": 0.8813928182807399, + "grad_norm": 3.7100460529327393, + "learning_rate": 3.5000688481503344e-07, + "loss": 0.1544, + "step": 34830 + }, + { + "epoch": 0.8814181238454336, + "grad_norm": 5.681760787963867, + "learning_rate": 3.4985931497828307e-07, + "loss": 0.131, + "step": 34831 + }, + { + "epoch": 0.8814434294101273, + "grad_norm": 6.615987300872803, + "learning_rate": 3.4971177512942653e-07, + "loss": 0.1313, + "step": 34832 + }, + { + "epoch": 0.8814687349748209, + "grad_norm": 4.3697919845581055, + "learning_rate": 3.4956426526941257e-07, + "loss": 0.1565, + "step": 34833 + }, + { + "epoch": 0.8814940405395146, + "grad_norm": 6.1744537353515625, + "learning_rate": 3.4941678539919366e-07, + "loss": 0.1956, + "step": 34834 + }, + { + "epoch": 0.8815193461042083, + "grad_norm": 6.747343063354492, + "learning_rate": 3.4926933551972076e-07, + "loss": 0.2212, + "step": 34835 + }, + { + "epoch": 0.881544651668902, + "grad_norm": 6.62954568862915, + "learning_rate": 3.491219156319459e-07, + "loss": 0.1611, + "step": 34836 + }, + { + "epoch": 0.8815699572335957, + "grad_norm": 4.396608829498291, + "learning_rate": 3.489745257368171e-07, + "loss": 0.1574, + "step": 34837 + }, + { + "epoch": 0.8815952627982894, + "grad_norm": 3.113058567047119, + "learning_rate": 3.4882716583528764e-07, + "loss": 0.1449, + "step": 34838 + }, + { + "epoch": 0.881620568362983, + "grad_norm": 10.137587547302246, + "learning_rate": 3.4867983592830555e-07, + "loss": 0.2058, + "step": 34839 + }, + { + "epoch": 0.8816458739276767, + "grad_norm": 3.304837465286255, + "learning_rate": 3.485325360168229e-07, + "loss": 0.1577, + "step": 34840 + }, + { + "epoch": 0.8816711794923704, + "grad_norm": 6.933337688446045, + "learning_rate": 3.4838526610178836e-07, + "loss": 0.1963, + "step": 34841 + }, + { + "epoch": 0.881696485057064, + "grad_norm": 4.767486572265625, + "learning_rate": 3.4823802618415225e-07, + "loss": 0.1156, + "step": 34842 + }, + { + "epoch": 0.8817217906217577, + "grad_norm": 1.9129509925842285, + "learning_rate": 3.4809081626486274e-07, + "loss": 0.0558, + "step": 34843 + }, + { + "epoch": 0.8817470961864514, + "grad_norm": 6.033011436462402, + "learning_rate": 3.4794363634487125e-07, + "loss": 0.1662, + "step": 34844 + }, + { + "epoch": 0.8817724017511451, + "grad_norm": 5.420641899108887, + "learning_rate": 3.47796486425126e-07, + "loss": 0.134, + "step": 34845 + }, + { + "epoch": 0.8817977073158387, + "grad_norm": 3.8850202560424805, + "learning_rate": 3.4764936650657557e-07, + "loss": 0.1084, + "step": 34846 + }, + { + "epoch": 0.8818230128805324, + "grad_norm": 6.552439212799072, + "learning_rate": 3.475022765901687e-07, + "loss": 0.2095, + "step": 34847 + }, + { + "epoch": 0.8818483184452262, + "grad_norm": 9.58527946472168, + "learning_rate": 3.4735521667685457e-07, + "loss": 0.2933, + "step": 34848 + }, + { + "epoch": 0.8818736240099198, + "grad_norm": 8.104004859924316, + "learning_rate": 3.472081867675808e-07, + "loss": 0.2557, + "step": 34849 + }, + { + "epoch": 0.8818989295746135, + "grad_norm": 4.734345436096191, + "learning_rate": 3.470611868632967e-07, + "loss": 0.1686, + "step": 34850 + }, + { + "epoch": 0.8819242351393072, + "grad_norm": 7.596429347991943, + "learning_rate": 3.469142169649481e-07, + "loss": 0.1917, + "step": 34851 + }, + { + "epoch": 0.8819495407040008, + "grad_norm": 3.774873733520508, + "learning_rate": 3.4676727707348535e-07, + "loss": 0.1291, + "step": 34852 + }, + { + "epoch": 0.8819748462686945, + "grad_norm": 7.064738750457764, + "learning_rate": 3.4662036718985494e-07, + "loss": 0.154, + "step": 34853 + }, + { + "epoch": 0.8820001518333882, + "grad_norm": 4.272376537322998, + "learning_rate": 3.464734873150039e-07, + "loss": 0.1696, + "step": 34854 + }, + { + "epoch": 0.8820254573980818, + "grad_norm": 9.828848838806152, + "learning_rate": 3.463266374498797e-07, + "loss": 0.1786, + "step": 34855 + }, + { + "epoch": 0.8820507629627755, + "grad_norm": 3.138791561126709, + "learning_rate": 3.46179817595429e-07, + "loss": 0.1699, + "step": 34856 + }, + { + "epoch": 0.8820760685274692, + "grad_norm": 6.8991594314575195, + "learning_rate": 3.4603302775259974e-07, + "loss": 0.1882, + "step": 34857 + }, + { + "epoch": 0.8821013740921628, + "grad_norm": 3.6622986793518066, + "learning_rate": 3.4588626792233623e-07, + "loss": 0.1653, + "step": 34858 + }, + { + "epoch": 0.8821266796568565, + "grad_norm": 3.6949210166931152, + "learning_rate": 3.457395381055889e-07, + "loss": 0.1509, + "step": 34859 + }, + { + "epoch": 0.8821519852215502, + "grad_norm": 8.174640655517578, + "learning_rate": 3.455928383032997e-07, + "loss": 0.1861, + "step": 34860 + }, + { + "epoch": 0.8821772907862439, + "grad_norm": 8.662793159484863, + "learning_rate": 3.4544616851641675e-07, + "loss": 0.1714, + "step": 34861 + }, + { + "epoch": 0.8822025963509376, + "grad_norm": 4.3840532302856445, + "learning_rate": 3.4529952874588544e-07, + "loss": 0.1731, + "step": 34862 + }, + { + "epoch": 0.8822279019156313, + "grad_norm": 3.424654245376587, + "learning_rate": 3.451529189926528e-07, + "loss": 0.1285, + "step": 34863 + }, + { + "epoch": 0.8822532074803249, + "grad_norm": 13.715753555297852, + "learning_rate": 3.450063392576619e-07, + "loss": 0.1796, + "step": 34864 + }, + { + "epoch": 0.8822785130450186, + "grad_norm": 3.463239908218384, + "learning_rate": 3.4485978954185985e-07, + "loss": 0.1017, + "step": 34865 + }, + { + "epoch": 0.8823038186097123, + "grad_norm": 5.383951187133789, + "learning_rate": 3.447132698461902e-07, + "loss": 0.1941, + "step": 34866 + }, + { + "epoch": 0.8823291241744059, + "grad_norm": 9.231616020202637, + "learning_rate": 3.445667801716007e-07, + "loss": 0.1807, + "step": 34867 + }, + { + "epoch": 0.8823544297390996, + "grad_norm": 2.9774229526519775, + "learning_rate": 3.444203205190322e-07, + "loss": 0.09, + "step": 34868 + }, + { + "epoch": 0.8823797353037933, + "grad_norm": 4.286726951599121, + "learning_rate": 3.442738908894322e-07, + "loss": 0.1004, + "step": 34869 + }, + { + "epoch": 0.882405040868487, + "grad_norm": 10.411763191223145, + "learning_rate": 3.441274912837428e-07, + "loss": 0.199, + "step": 34870 + }, + { + "epoch": 0.8824303464331806, + "grad_norm": 22.629003524780273, + "learning_rate": 3.439811217029093e-07, + "loss": 0.2894, + "step": 34871 + }, + { + "epoch": 0.8824556519978743, + "grad_norm": 4.442920684814453, + "learning_rate": 3.4383478214787656e-07, + "loss": 0.2125, + "step": 34872 + }, + { + "epoch": 0.8824809575625681, + "grad_norm": 4.81837272644043, + "learning_rate": 3.436884726195866e-07, + "loss": 0.1283, + "step": 34873 + }, + { + "epoch": 0.8825062631272617, + "grad_norm": 5.977842330932617, + "learning_rate": 3.4354219311898254e-07, + "loss": 0.1977, + "step": 34874 + }, + { + "epoch": 0.8825315686919554, + "grad_norm": 5.544350624084473, + "learning_rate": 3.433959436470097e-07, + "loss": 0.1201, + "step": 34875 + }, + { + "epoch": 0.8825568742566491, + "grad_norm": 4.4213690757751465, + "learning_rate": 3.4324972420461067e-07, + "loss": 0.1703, + "step": 34876 + }, + { + "epoch": 0.8825821798213427, + "grad_norm": 4.561467170715332, + "learning_rate": 3.431035347927275e-07, + "loss": 0.1834, + "step": 34877 + }, + { + "epoch": 0.8826074853860364, + "grad_norm": 4.311497211456299, + "learning_rate": 3.4295737541230277e-07, + "loss": 0.0912, + "step": 34878 + }, + { + "epoch": 0.8826327909507301, + "grad_norm": 3.2186837196350098, + "learning_rate": 3.4281124606428017e-07, + "loss": 0.1478, + "step": 34879 + }, + { + "epoch": 0.8826580965154237, + "grad_norm": 8.237051010131836, + "learning_rate": 3.426651467496017e-07, + "loss": 0.2215, + "step": 34880 + }, + { + "epoch": 0.8826834020801174, + "grad_norm": 9.660292625427246, + "learning_rate": 3.425190774692094e-07, + "loss": 0.249, + "step": 34881 + }, + { + "epoch": 0.8827087076448111, + "grad_norm": 9.686827659606934, + "learning_rate": 3.423730382240453e-07, + "loss": 0.1178, + "step": 34882 + }, + { + "epoch": 0.8827340132095047, + "grad_norm": 2.9022622108459473, + "learning_rate": 3.422270290150498e-07, + "loss": 0.0721, + "step": 34883 + }, + { + "epoch": 0.8827593187741984, + "grad_norm": 3.9521234035491943, + "learning_rate": 3.4208104984316703e-07, + "loss": 0.1448, + "step": 34884 + }, + { + "epoch": 0.8827846243388922, + "grad_norm": 4.320973873138428, + "learning_rate": 3.4193510070933743e-07, + "loss": 0.1729, + "step": 34885 + }, + { + "epoch": 0.8828099299035858, + "grad_norm": 5.045836925506592, + "learning_rate": 3.4178918161450137e-07, + "loss": 0.128, + "step": 34886 + }, + { + "epoch": 0.8828352354682795, + "grad_norm": 2.3371825218200684, + "learning_rate": 3.4164329255959974e-07, + "loss": 0.0735, + "step": 34887 + }, + { + "epoch": 0.8828605410329732, + "grad_norm": 3.2951529026031494, + "learning_rate": 3.4149743354557454e-07, + "loss": 0.0818, + "step": 34888 + }, + { + "epoch": 0.8828858465976668, + "grad_norm": 4.759065628051758, + "learning_rate": 3.4135160457336615e-07, + "loss": 0.1486, + "step": 34889 + }, + { + "epoch": 0.8829111521623605, + "grad_norm": 6.018577575683594, + "learning_rate": 3.412058056439149e-07, + "loss": 0.1204, + "step": 34890 + }, + { + "epoch": 0.8829364577270542, + "grad_norm": 3.6388020515441895, + "learning_rate": 3.410600367581596e-07, + "loss": 0.1337, + "step": 34891 + }, + { + "epoch": 0.8829617632917478, + "grad_norm": 4.211447238922119, + "learning_rate": 3.409142979170427e-07, + "loss": 0.1888, + "step": 34892 + }, + { + "epoch": 0.8829870688564415, + "grad_norm": 4.115050792694092, + "learning_rate": 3.407685891215018e-07, + "loss": 0.1342, + "step": 34893 + }, + { + "epoch": 0.8830123744211352, + "grad_norm": 6.010741233825684, + "learning_rate": 3.40622910372479e-07, + "loss": 0.1141, + "step": 34894 + }, + { + "epoch": 0.8830376799858289, + "grad_norm": 4.366916179656982, + "learning_rate": 3.4047726167091066e-07, + "loss": 0.1418, + "step": 34895 + }, + { + "epoch": 0.8830629855505225, + "grad_norm": 5.684911251068115, + "learning_rate": 3.403316430177389e-07, + "loss": 0.193, + "step": 34896 + }, + { + "epoch": 0.8830882911152163, + "grad_norm": 6.925741195678711, + "learning_rate": 3.4018605441390075e-07, + "loss": 0.1345, + "step": 34897 + }, + { + "epoch": 0.88311359667991, + "grad_norm": 4.5222649574279785, + "learning_rate": 3.400404958603376e-07, + "loss": 0.1144, + "step": 34898 + }, + { + "epoch": 0.8831389022446036, + "grad_norm": 4.040517330169678, + "learning_rate": 3.398949673579843e-07, + "loss": 0.1663, + "step": 34899 + }, + { + "epoch": 0.8831642078092973, + "grad_norm": 4.928262710571289, + "learning_rate": 3.3974946890778293e-07, + "loss": 0.0951, + "step": 34900 + }, + { + "epoch": 0.883189513373991, + "grad_norm": 5.992428302764893, + "learning_rate": 3.3960400051066934e-07, + "loss": 0.2241, + "step": 34901 + }, + { + "epoch": 0.8832148189386846, + "grad_norm": 16.164581298828125, + "learning_rate": 3.394585621675839e-07, + "loss": 0.2608, + "step": 34902 + }, + { + "epoch": 0.8832401245033783, + "grad_norm": 3.831207513809204, + "learning_rate": 3.393131538794625e-07, + "loss": 0.1394, + "step": 34903 + }, + { + "epoch": 0.883265430068072, + "grad_norm": 7.956082820892334, + "learning_rate": 3.391677756472439e-07, + "loss": 0.1698, + "step": 34904 + }, + { + "epoch": 0.8832907356327656, + "grad_norm": 3.7419469356536865, + "learning_rate": 3.390224274718645e-07, + "loss": 0.0708, + "step": 34905 + }, + { + "epoch": 0.8833160411974593, + "grad_norm": 2.8462331295013428, + "learning_rate": 3.3887710935426366e-07, + "loss": 0.1092, + "step": 34906 + }, + { + "epoch": 0.883341346762153, + "grad_norm": 4.7286882400512695, + "learning_rate": 3.387318212953772e-07, + "loss": 0.2088, + "step": 34907 + }, + { + "epoch": 0.8833666523268466, + "grad_norm": 5.162442207336426, + "learning_rate": 3.385865632961416e-07, + "loss": 0.1638, + "step": 34908 + }, + { + "epoch": 0.8833919578915403, + "grad_norm": 5.813676834106445, + "learning_rate": 3.3844133535749447e-07, + "loss": 0.104, + "step": 34909 + }, + { + "epoch": 0.8834172634562341, + "grad_norm": 5.784619331359863, + "learning_rate": 3.3829613748037173e-07, + "loss": 0.1819, + "step": 34910 + }, + { + "epoch": 0.8834425690209277, + "grad_norm": 5.664491653442383, + "learning_rate": 3.3815096966571037e-07, + "loss": 0.1153, + "step": 34911 + }, + { + "epoch": 0.8834678745856214, + "grad_norm": 3.9150357246398926, + "learning_rate": 3.3800583191444635e-07, + "loss": 0.0847, + "step": 34912 + }, + { + "epoch": 0.8834931801503151, + "grad_norm": 19.535980224609375, + "learning_rate": 3.378607242275156e-07, + "loss": 0.2685, + "step": 34913 + }, + { + "epoch": 0.8835184857150087, + "grad_norm": 3.7907462120056152, + "learning_rate": 3.377156466058529e-07, + "loss": 0.1569, + "step": 34914 + }, + { + "epoch": 0.8835437912797024, + "grad_norm": 5.322576522827148, + "learning_rate": 3.3757059905039526e-07, + "loss": 0.1878, + "step": 34915 + }, + { + "epoch": 0.8835690968443961, + "grad_norm": 5.428295612335205, + "learning_rate": 3.374255815620781e-07, + "loss": 0.1672, + "step": 34916 + }, + { + "epoch": 0.8835944024090897, + "grad_norm": 3.526092052459717, + "learning_rate": 3.3728059414183556e-07, + "loss": 0.1459, + "step": 34917 + }, + { + "epoch": 0.8836197079737834, + "grad_norm": 4.760556221008301, + "learning_rate": 3.3713563679060257e-07, + "loss": 0.1266, + "step": 34918 + }, + { + "epoch": 0.8836450135384771, + "grad_norm": 12.978242874145508, + "learning_rate": 3.36990709509315e-07, + "loss": 0.1522, + "step": 34919 + }, + { + "epoch": 0.8836703191031708, + "grad_norm": 6.02364444732666, + "learning_rate": 3.3684581229890766e-07, + "loss": 0.1486, + "step": 34920 + }, + { + "epoch": 0.8836956246678644, + "grad_norm": 4.901271343231201, + "learning_rate": 3.367009451603137e-07, + "loss": 0.114, + "step": 34921 + }, + { + "epoch": 0.8837209302325582, + "grad_norm": 4.788824081420898, + "learning_rate": 3.3655610809446737e-07, + "loss": 0.1086, + "step": 34922 + }, + { + "epoch": 0.8837462357972519, + "grad_norm": 6.73603630065918, + "learning_rate": 3.3641130110230404e-07, + "loss": 0.1967, + "step": 34923 + }, + { + "epoch": 0.8837715413619455, + "grad_norm": 2.027472734451294, + "learning_rate": 3.362665241847557e-07, + "loss": 0.0704, + "step": 34924 + }, + { + "epoch": 0.8837968469266392, + "grad_norm": 6.3513898849487305, + "learning_rate": 3.361217773427589e-07, + "loss": 0.1137, + "step": 34925 + }, + { + "epoch": 0.8838221524913329, + "grad_norm": 2.9409830570220947, + "learning_rate": 3.359770605772433e-07, + "loss": 0.1255, + "step": 34926 + }, + { + "epoch": 0.8838474580560265, + "grad_norm": 3.8355066776275635, + "learning_rate": 3.3583237388914503e-07, + "loss": 0.0582, + "step": 34927 + }, + { + "epoch": 0.8838727636207202, + "grad_norm": 3.000021457672119, + "learning_rate": 3.3568771727939545e-07, + "loss": 0.1006, + "step": 34928 + }, + { + "epoch": 0.8838980691854139, + "grad_norm": 4.943216323852539, + "learning_rate": 3.355430907489293e-07, + "loss": 0.127, + "step": 34929 + }, + { + "epoch": 0.8839233747501075, + "grad_norm": 3.731114149093628, + "learning_rate": 3.353984942986771e-07, + "loss": 0.1184, + "step": 34930 + }, + { + "epoch": 0.8839486803148012, + "grad_norm": 3.0835635662078857, + "learning_rate": 3.35253927929573e-07, + "loss": 0.1381, + "step": 34931 + }, + { + "epoch": 0.8839739858794949, + "grad_norm": 26.105133056640625, + "learning_rate": 3.351093916425474e-07, + "loss": 0.2338, + "step": 34932 + }, + { + "epoch": 0.8839992914441885, + "grad_norm": 4.231087684631348, + "learning_rate": 3.3496488543853454e-07, + "loss": 0.0833, + "step": 34933 + }, + { + "epoch": 0.8840245970088823, + "grad_norm": 4.057953357696533, + "learning_rate": 3.3482040931846594e-07, + "loss": 0.1513, + "step": 34934 + }, + { + "epoch": 0.884049902573576, + "grad_norm": 4.323000907897949, + "learning_rate": 3.3467596328327243e-07, + "loss": 0.1268, + "step": 34935 + }, + { + "epoch": 0.8840752081382696, + "grad_norm": 6.267159938812256, + "learning_rate": 3.3453154733388447e-07, + "loss": 0.1683, + "step": 34936 + }, + { + "epoch": 0.8841005137029633, + "grad_norm": 5.907644748687744, + "learning_rate": 3.343871614712363e-07, + "loss": 0.1219, + "step": 34937 + }, + { + "epoch": 0.884125819267657, + "grad_norm": 5.121336936950684, + "learning_rate": 3.3424280569625713e-07, + "loss": 0.1638, + "step": 34938 + }, + { + "epoch": 0.8841511248323506, + "grad_norm": 9.887413024902344, + "learning_rate": 3.340984800098779e-07, + "loss": 0.2867, + "step": 34939 + }, + { + "epoch": 0.8841764303970443, + "grad_norm": 3.5732882022857666, + "learning_rate": 3.339541844130301e-07, + "loss": 0.1261, + "step": 34940 + }, + { + "epoch": 0.884201735961738, + "grad_norm": 5.367650508880615, + "learning_rate": 3.3380991890664296e-07, + "loss": 0.1615, + "step": 34941 + }, + { + "epoch": 0.8842270415264316, + "grad_norm": 6.086524963378906, + "learning_rate": 3.3366568349164853e-07, + "loss": 0.1375, + "step": 34942 + }, + { + "epoch": 0.8842523470911253, + "grad_norm": 3.264167547225952, + "learning_rate": 3.335214781689755e-07, + "loss": 0.1089, + "step": 34943 + }, + { + "epoch": 0.884277652655819, + "grad_norm": 27.352062225341797, + "learning_rate": 3.333773029395548e-07, + "loss": 0.1657, + "step": 34944 + }, + { + "epoch": 0.8843029582205126, + "grad_norm": 5.256677627563477, + "learning_rate": 3.332331578043152e-07, + "loss": 0.141, + "step": 34945 + }, + { + "epoch": 0.8843282637852063, + "grad_norm": 8.01439380645752, + "learning_rate": 3.330890427641875e-07, + "loss": 0.1724, + "step": 34946 + }, + { + "epoch": 0.8843535693499001, + "grad_norm": 5.159247875213623, + "learning_rate": 3.329449578201005e-07, + "loss": 0.1755, + "step": 34947 + }, + { + "epoch": 0.8843788749145938, + "grad_norm": 7.412784099578857, + "learning_rate": 3.328009029729834e-07, + "loss": 0.2011, + "step": 34948 + }, + { + "epoch": 0.8844041804792874, + "grad_norm": 5.219930648803711, + "learning_rate": 3.3265687822376437e-07, + "loss": 0.1555, + "step": 34949 + }, + { + "epoch": 0.8844294860439811, + "grad_norm": 4.014688968658447, + "learning_rate": 3.3251288357337373e-07, + "loss": 0.1694, + "step": 34950 + }, + { + "epoch": 0.8844547916086748, + "grad_norm": 4.134154319763184, + "learning_rate": 3.323689190227392e-07, + "loss": 0.1161, + "step": 34951 + }, + { + "epoch": 0.8844800971733684, + "grad_norm": 6.55381441116333, + "learning_rate": 3.3222498457278927e-07, + "loss": 0.1494, + "step": 34952 + }, + { + "epoch": 0.8845054027380621, + "grad_norm": 5.89924430847168, + "learning_rate": 3.320810802244512e-07, + "loss": 0.1599, + "step": 34953 + }, + { + "epoch": 0.8845307083027558, + "grad_norm": 4.803645133972168, + "learning_rate": 3.3193720597865463e-07, + "loss": 0.19, + "step": 34954 + }, + { + "epoch": 0.8845560138674494, + "grad_norm": 3.8772878646850586, + "learning_rate": 3.3179336183632613e-07, + "loss": 0.1332, + "step": 34955 + }, + { + "epoch": 0.8845813194321431, + "grad_norm": 7.05109977722168, + "learning_rate": 3.3164954779839555e-07, + "loss": 0.1529, + "step": 34956 + }, + { + "epoch": 0.8846066249968368, + "grad_norm": 3.586641788482666, + "learning_rate": 3.31505763865787e-07, + "loss": 0.1281, + "step": 34957 + }, + { + "epoch": 0.8846319305615304, + "grad_norm": 6.6681671142578125, + "learning_rate": 3.3136201003943045e-07, + "loss": 0.1839, + "step": 34958 + }, + { + "epoch": 0.8846572361262242, + "grad_norm": 17.223764419555664, + "learning_rate": 3.3121828632025066e-07, + "loss": 0.225, + "step": 34959 + }, + { + "epoch": 0.8846825416909179, + "grad_norm": 4.954179286956787, + "learning_rate": 3.3107459270917743e-07, + "loss": 0.0968, + "step": 34960 + }, + { + "epoch": 0.8847078472556115, + "grad_norm": 4.839372158050537, + "learning_rate": 3.3093092920713385e-07, + "loss": 0.16, + "step": 34961 + }, + { + "epoch": 0.8847331528203052, + "grad_norm": 12.250088691711426, + "learning_rate": 3.307872958150493e-07, + "loss": 0.2962, + "step": 34962 + }, + { + "epoch": 0.8847584583849989, + "grad_norm": 5.732039451599121, + "learning_rate": 3.3064369253384797e-07, + "loss": 0.1544, + "step": 34963 + }, + { + "epoch": 0.8847837639496925, + "grad_norm": 5.985909461975098, + "learning_rate": 3.305001193644586e-07, + "loss": 0.149, + "step": 34964 + }, + { + "epoch": 0.8848090695143862, + "grad_norm": 5.095943450927734, + "learning_rate": 3.303565763078037e-07, + "loss": 0.1419, + "step": 34965 + }, + { + "epoch": 0.8848343750790799, + "grad_norm": 4.7734599113464355, + "learning_rate": 3.302130633648115e-07, + "loss": 0.139, + "step": 34966 + }, + { + "epoch": 0.8848596806437735, + "grad_norm": 6.619324684143066, + "learning_rate": 3.300695805364063e-07, + "loss": 0.2218, + "step": 34967 + }, + { + "epoch": 0.8848849862084672, + "grad_norm": 5.146393775939941, + "learning_rate": 3.299261278235133e-07, + "loss": 0.1652, + "step": 34968 + }, + { + "epoch": 0.8849102917731609, + "grad_norm": 8.817646980285645, + "learning_rate": 3.2978270522705856e-07, + "loss": 0.1418, + "step": 34969 + }, + { + "epoch": 0.8849355973378545, + "grad_norm": 6.069820880889893, + "learning_rate": 3.2963931274796633e-07, + "loss": 0.1467, + "step": 34970 + }, + { + "epoch": 0.8849609029025483, + "grad_norm": 5.233625411987305, + "learning_rate": 3.294959503871614e-07, + "loss": 0.1712, + "step": 34971 + }, + { + "epoch": 0.884986208467242, + "grad_norm": 5.858588695526123, + "learning_rate": 3.2935261814556805e-07, + "loss": 0.0754, + "step": 34972 + }, + { + "epoch": 0.8850115140319357, + "grad_norm": 4.30979061126709, + "learning_rate": 3.292093160241111e-07, + "loss": 0.0768, + "step": 34973 + }, + { + "epoch": 0.8850368195966293, + "grad_norm": 4.191421031951904, + "learning_rate": 3.2906604402371477e-07, + "loss": 0.0944, + "step": 34974 + }, + { + "epoch": 0.885062125161323, + "grad_norm": 3.178628921508789, + "learning_rate": 3.289228021453028e-07, + "loss": 0.1399, + "step": 34975 + }, + { + "epoch": 0.8850874307260167, + "grad_norm": 4.486985206604004, + "learning_rate": 3.287795903897972e-07, + "loss": 0.1615, + "step": 34976 + }, + { + "epoch": 0.8851127362907103, + "grad_norm": 3.3715150356292725, + "learning_rate": 3.286364087581245e-07, + "loss": 0.0868, + "step": 34977 + }, + { + "epoch": 0.885138041855404, + "grad_norm": 2.9600915908813477, + "learning_rate": 3.2849325725120674e-07, + "loss": 0.1038, + "step": 34978 + }, + { + "epoch": 0.8851633474200977, + "grad_norm": 16.864402770996094, + "learning_rate": 3.28350135869967e-07, + "loss": 0.2441, + "step": 34979 + }, + { + "epoch": 0.8851886529847913, + "grad_norm": 9.530464172363281, + "learning_rate": 3.2820704461532737e-07, + "loss": 0.1293, + "step": 34980 + }, + { + "epoch": 0.885213958549485, + "grad_norm": 8.997346878051758, + "learning_rate": 3.280639834882121e-07, + "loss": 0.2011, + "step": 34981 + }, + { + "epoch": 0.8852392641141787, + "grad_norm": 4.038866996765137, + "learning_rate": 3.279209524895433e-07, + "loss": 0.1336, + "step": 34982 + }, + { + "epoch": 0.8852645696788723, + "grad_norm": 6.930581569671631, + "learning_rate": 3.2777795162024286e-07, + "loss": 0.1453, + "step": 34983 + }, + { + "epoch": 0.8852898752435661, + "grad_norm": 4.895951747894287, + "learning_rate": 3.276349808812329e-07, + "loss": 0.1046, + "step": 34984 + }, + { + "epoch": 0.8853151808082598, + "grad_norm": 3.755657911300659, + "learning_rate": 3.274920402734366e-07, + "loss": 0.1437, + "step": 34985 + }, + { + "epoch": 0.8853404863729534, + "grad_norm": 4.752073764801025, + "learning_rate": 3.273491297977743e-07, + "loss": 0.1534, + "step": 34986 + }, + { + "epoch": 0.8853657919376471, + "grad_norm": 3.4152448177337646, + "learning_rate": 3.2720624945516975e-07, + "loss": 0.1212, + "step": 34987 + }, + { + "epoch": 0.8853910975023408, + "grad_norm": 7.380476474761963, + "learning_rate": 3.270633992465411e-07, + "loss": 0.1564, + "step": 34988 + }, + { + "epoch": 0.8854164030670344, + "grad_norm": 8.069256782531738, + "learning_rate": 3.269205791728125e-07, + "loss": 0.2077, + "step": 34989 + }, + { + "epoch": 0.8854417086317281, + "grad_norm": 4.139225959777832, + "learning_rate": 3.2677778923490277e-07, + "loss": 0.1108, + "step": 34990 + }, + { + "epoch": 0.8854670141964218, + "grad_norm": 5.186816215515137, + "learning_rate": 3.2663502943373506e-07, + "loss": 0.1495, + "step": 34991 + }, + { + "epoch": 0.8854923197611154, + "grad_norm": 8.43006706237793, + "learning_rate": 3.264922997702275e-07, + "loss": 0.1538, + "step": 34992 + }, + { + "epoch": 0.8855176253258091, + "grad_norm": 4.398190498352051, + "learning_rate": 3.263496002453026e-07, + "loss": 0.1414, + "step": 34993 + }, + { + "epoch": 0.8855429308905028, + "grad_norm": 2.306016206741333, + "learning_rate": 3.262069308598798e-07, + "loss": 0.0776, + "step": 34994 + }, + { + "epoch": 0.8855682364551964, + "grad_norm": 3.8839781284332275, + "learning_rate": 3.2606429161487927e-07, + "loss": 0.0966, + "step": 34995 + }, + { + "epoch": 0.8855935420198902, + "grad_norm": 2.652750015258789, + "learning_rate": 3.259216825112194e-07, + "loss": 0.1236, + "step": 34996 + }, + { + "epoch": 0.8856188475845839, + "grad_norm": 3.4285922050476074, + "learning_rate": 3.2577910354982256e-07, + "loss": 0.1101, + "step": 34997 + }, + { + "epoch": 0.8856441531492776, + "grad_norm": 5.745582103729248, + "learning_rate": 3.256365547316065e-07, + "loss": 0.2205, + "step": 34998 + }, + { + "epoch": 0.8856694587139712, + "grad_norm": 4.132096767425537, + "learning_rate": 3.2549403605748983e-07, + "loss": 0.0624, + "step": 34999 + }, + { + "epoch": 0.8856947642786649, + "grad_norm": 5.784506797790527, + "learning_rate": 3.253515475283936e-07, + "loss": 0.1318, + "step": 35000 + }, + { + "epoch": 0.8857200698433586, + "grad_norm": 2.6180920600891113, + "learning_rate": 3.2520908914523583e-07, + "loss": 0.0891, + "step": 35001 + }, + { + "epoch": 0.8857453754080522, + "grad_norm": 6.390848636627197, + "learning_rate": 3.250666609089348e-07, + "loss": 0.1515, + "step": 35002 + }, + { + "epoch": 0.8857706809727459, + "grad_norm": 7.042761325836182, + "learning_rate": 3.2492426282040856e-07, + "loss": 0.2838, + "step": 35003 + }, + { + "epoch": 0.8857959865374396, + "grad_norm": 3.0964691638946533, + "learning_rate": 3.2478189488057753e-07, + "loss": 0.1037, + "step": 35004 + }, + { + "epoch": 0.8858212921021332, + "grad_norm": 3.378687858581543, + "learning_rate": 3.246395570903571e-07, + "loss": 0.1331, + "step": 35005 + }, + { + "epoch": 0.8858465976668269, + "grad_norm": 6.891361713409424, + "learning_rate": 3.2449724945066706e-07, + "loss": 0.1284, + "step": 35006 + }, + { + "epoch": 0.8858719032315207, + "grad_norm": 3.958439826965332, + "learning_rate": 3.243549719624239e-07, + "loss": 0.1091, + "step": 35007 + }, + { + "epoch": 0.8858972087962143, + "grad_norm": 7.759354114532471, + "learning_rate": 3.242127246265464e-07, + "loss": 0.1768, + "step": 35008 + }, + { + "epoch": 0.885922514360908, + "grad_norm": 9.336212158203125, + "learning_rate": 3.240705074439515e-07, + "loss": 0.1781, + "step": 35009 + }, + { + "epoch": 0.8859478199256017, + "grad_norm": 4.565147876739502, + "learning_rate": 3.2392832041555575e-07, + "loss": 0.205, + "step": 35010 + }, + { + "epoch": 0.8859731254902953, + "grad_norm": 5.675488471984863, + "learning_rate": 3.2378616354227623e-07, + "loss": 0.2264, + "step": 35011 + }, + { + "epoch": 0.885998431054989, + "grad_norm": 17.182613372802734, + "learning_rate": 3.2364403682503045e-07, + "loss": 0.3612, + "step": 35012 + }, + { + "epoch": 0.8860237366196827, + "grad_norm": 4.007944107055664, + "learning_rate": 3.235019402647344e-07, + "loss": 0.1581, + "step": 35013 + }, + { + "epoch": 0.8860490421843763, + "grad_norm": 12.806842803955078, + "learning_rate": 3.233598738623045e-07, + "loss": 0.1082, + "step": 35014 + }, + { + "epoch": 0.88607434774907, + "grad_norm": 4.165965557098389, + "learning_rate": 3.2321783761865565e-07, + "loss": 0.1038, + "step": 35015 + }, + { + "epoch": 0.8860996533137637, + "grad_norm": 9.320003509521484, + "learning_rate": 3.23075831534706e-07, + "loss": 0.2209, + "step": 35016 + }, + { + "epoch": 0.8861249588784573, + "grad_norm": 12.135149955749512, + "learning_rate": 3.229338556113704e-07, + "loss": 0.2184, + "step": 35017 + }, + { + "epoch": 0.886150264443151, + "grad_norm": 3.632094621658325, + "learning_rate": 3.2279190984956466e-07, + "loss": 0.1591, + "step": 35018 + }, + { + "epoch": 0.8861755700078447, + "grad_norm": 5.250797748565674, + "learning_rate": 3.226499942502026e-07, + "loss": 0.1111, + "step": 35019 + }, + { + "epoch": 0.8862008755725383, + "grad_norm": 3.1217031478881836, + "learning_rate": 3.225081088142018e-07, + "loss": 0.1416, + "step": 35020 + }, + { + "epoch": 0.8862261811372321, + "grad_norm": 4.576469898223877, + "learning_rate": 3.2236625354247543e-07, + "loss": 0.2151, + "step": 35021 + }, + { + "epoch": 0.8862514867019258, + "grad_norm": 16.641551971435547, + "learning_rate": 3.222244284359399e-07, + "loss": 0.249, + "step": 35022 + }, + { + "epoch": 0.8862767922666195, + "grad_norm": 3.1832666397094727, + "learning_rate": 3.220826334955079e-07, + "loss": 0.1167, + "step": 35023 + }, + { + "epoch": 0.8863020978313131, + "grad_norm": 17.05522346496582, + "learning_rate": 3.219408687220954e-07, + "loss": 0.3127, + "step": 35024 + }, + { + "epoch": 0.8863274033960068, + "grad_norm": 4.250668525695801, + "learning_rate": 3.2179913411661655e-07, + "loss": 0.1835, + "step": 35025 + }, + { + "epoch": 0.8863527089607005, + "grad_norm": 6.882791996002197, + "learning_rate": 3.2165742967998405e-07, + "loss": 0.1263, + "step": 35026 + }, + { + "epoch": 0.8863780145253941, + "grad_norm": 1.5003012418746948, + "learning_rate": 3.215157554131126e-07, + "loss": 0.044, + "step": 35027 + }, + { + "epoch": 0.8864033200900878, + "grad_norm": 4.109914779663086, + "learning_rate": 3.213741113169161e-07, + "loss": 0.1321, + "step": 35028 + }, + { + "epoch": 0.8864286256547815, + "grad_norm": 6.818849563598633, + "learning_rate": 3.212324973923076e-07, + "loss": 0.204, + "step": 35029 + }, + { + "epoch": 0.8864539312194751, + "grad_norm": 6.169492244720459, + "learning_rate": 3.210909136401996e-07, + "loss": 0.1659, + "step": 35030 + }, + { + "epoch": 0.8864792367841688, + "grad_norm": 7.696061611175537, + "learning_rate": 3.209493600615077e-07, + "loss": 0.2113, + "step": 35031 + }, + { + "epoch": 0.8865045423488626, + "grad_norm": 5.5128607749938965, + "learning_rate": 3.2080783665714156e-07, + "loss": 0.2484, + "step": 35032 + }, + { + "epoch": 0.8865298479135562, + "grad_norm": 3.3278989791870117, + "learning_rate": 3.2066634342801607e-07, + "loss": 0.0972, + "step": 35033 + }, + { + "epoch": 0.8865551534782499, + "grad_norm": 3.2259490489959717, + "learning_rate": 3.205248803750416e-07, + "loss": 0.1254, + "step": 35034 + }, + { + "epoch": 0.8865804590429436, + "grad_norm": 7.339263439178467, + "learning_rate": 3.203834474991335e-07, + "loss": 0.1781, + "step": 35035 + }, + { + "epoch": 0.8866057646076372, + "grad_norm": 13.956650733947754, + "learning_rate": 3.202420448012011e-07, + "loss": 0.2451, + "step": 35036 + }, + { + "epoch": 0.8866310701723309, + "grad_norm": 6.712392330169678, + "learning_rate": 3.201006722821576e-07, + "loss": 0.1461, + "step": 35037 + }, + { + "epoch": 0.8866563757370246, + "grad_norm": 4.324559211730957, + "learning_rate": 3.1995932994291323e-07, + "loss": 0.0882, + "step": 35038 + }, + { + "epoch": 0.8866816813017182, + "grad_norm": 11.457255363464355, + "learning_rate": 3.198180177843818e-07, + "loss": 0.1827, + "step": 35039 + }, + { + "epoch": 0.8867069868664119, + "grad_norm": 9.567534446716309, + "learning_rate": 3.1967673580747315e-07, + "loss": 0.1855, + "step": 35040 + }, + { + "epoch": 0.8867322924311056, + "grad_norm": 8.112895011901855, + "learning_rate": 3.1953548401309877e-07, + "loss": 0.2343, + "step": 35041 + }, + { + "epoch": 0.8867575979957992, + "grad_norm": 3.7246274948120117, + "learning_rate": 3.1939426240216897e-07, + "loss": 0.1461, + "step": 35042 + }, + { + "epoch": 0.8867829035604929, + "grad_norm": 4.76456356048584, + "learning_rate": 3.192530709755953e-07, + "loss": 0.1715, + "step": 35043 + }, + { + "epoch": 0.8868082091251867, + "grad_norm": 4.917178153991699, + "learning_rate": 3.191119097342876e-07, + "loss": 0.1992, + "step": 35044 + }, + { + "epoch": 0.8868335146898803, + "grad_norm": 5.964117050170898, + "learning_rate": 3.189707786791568e-07, + "loss": 0.1625, + "step": 35045 + }, + { + "epoch": 0.886858820254574, + "grad_norm": 2.4887590408325195, + "learning_rate": 3.188296778111122e-07, + "loss": 0.1032, + "step": 35046 + }, + { + "epoch": 0.8868841258192677, + "grad_norm": 5.85966682434082, + "learning_rate": 3.186886071310646e-07, + "loss": 0.2118, + "step": 35047 + }, + { + "epoch": 0.8869094313839614, + "grad_norm": 5.6028666496276855, + "learning_rate": 3.1854756663992345e-07, + "loss": 0.1916, + "step": 35048 + }, + { + "epoch": 0.886934736948655, + "grad_norm": 3.653923273086548, + "learning_rate": 3.184065563385985e-07, + "loss": 0.1435, + "step": 35049 + }, + { + "epoch": 0.8869600425133487, + "grad_norm": 5.592606067657471, + "learning_rate": 3.182655762279979e-07, + "loss": 0.1174, + "step": 35050 + }, + { + "epoch": 0.8869853480780424, + "grad_norm": 3.065786123275757, + "learning_rate": 3.181246263090326e-07, + "loss": 0.1249, + "step": 35051 + }, + { + "epoch": 0.887010653642736, + "grad_norm": 3.147109031677246, + "learning_rate": 3.179837065826102e-07, + "loss": 0.1398, + "step": 35052 + }, + { + "epoch": 0.8870359592074297, + "grad_norm": 2.6956584453582764, + "learning_rate": 3.1784281704964057e-07, + "loss": 0.118, + "step": 35053 + }, + { + "epoch": 0.8870612647721234, + "grad_norm": 5.822865009307861, + "learning_rate": 3.1770195771103074e-07, + "loss": 0.1836, + "step": 35054 + }, + { + "epoch": 0.887086570336817, + "grad_norm": 3.2708799839019775, + "learning_rate": 3.175611285676905e-07, + "loss": 0.1197, + "step": 35055 + }, + { + "epoch": 0.8871118759015107, + "grad_norm": 7.41957426071167, + "learning_rate": 3.1742032962052805e-07, + "loss": 0.2372, + "step": 35056 + }, + { + "epoch": 0.8871371814662045, + "grad_norm": 5.891051292419434, + "learning_rate": 3.1727956087045043e-07, + "loss": 0.1536, + "step": 35057 + }, + { + "epoch": 0.8871624870308981, + "grad_norm": 6.476376533508301, + "learning_rate": 3.1713882231836523e-07, + "loss": 0.2073, + "step": 35058 + }, + { + "epoch": 0.8871877925955918, + "grad_norm": 6.614351272583008, + "learning_rate": 3.169981139651812e-07, + "loss": 0.1278, + "step": 35059 + }, + { + "epoch": 0.8872130981602855, + "grad_norm": 21.0399169921875, + "learning_rate": 3.168574358118054e-07, + "loss": 0.2337, + "step": 35060 + }, + { + "epoch": 0.8872384037249791, + "grad_norm": 5.413099765777588, + "learning_rate": 3.167167878591443e-07, + "loss": 0.1609, + "step": 35061 + }, + { + "epoch": 0.8872637092896728, + "grad_norm": 2.715843915939331, + "learning_rate": 3.1657617010810716e-07, + "loss": 0.1076, + "step": 35062 + }, + { + "epoch": 0.8872890148543665, + "grad_norm": 4.7993245124816895, + "learning_rate": 3.1643558255959714e-07, + "loss": 0.1683, + "step": 35063 + }, + { + "epoch": 0.8873143204190601, + "grad_norm": 4.397403240203857, + "learning_rate": 3.162950252145236e-07, + "loss": 0.1435, + "step": 35064 + }, + { + "epoch": 0.8873396259837538, + "grad_norm": 2.5363829135894775, + "learning_rate": 3.1615449807379186e-07, + "loss": 0.0931, + "step": 35065 + }, + { + "epoch": 0.8873649315484475, + "grad_norm": 2.74660062789917, + "learning_rate": 3.160140011383095e-07, + "loss": 0.0924, + "step": 35066 + }, + { + "epoch": 0.8873902371131411, + "grad_norm": 3.961852788925171, + "learning_rate": 3.1587353440898027e-07, + "loss": 0.0993, + "step": 35067 + }, + { + "epoch": 0.8874155426778348, + "grad_norm": 3.4666121006011963, + "learning_rate": 3.1573309788671236e-07, + "loss": 0.1284, + "step": 35068 + }, + { + "epoch": 0.8874408482425286, + "grad_norm": 8.692319869995117, + "learning_rate": 3.1559269157240946e-07, + "loss": 0.1618, + "step": 35069 + }, + { + "epoch": 0.8874661538072222, + "grad_norm": 4.228426933288574, + "learning_rate": 3.154523154669792e-07, + "loss": 0.1665, + "step": 35070 + }, + { + "epoch": 0.8874914593719159, + "grad_norm": 6.403584957122803, + "learning_rate": 3.153119695713241e-07, + "loss": 0.1849, + "step": 35071 + }, + { + "epoch": 0.8875167649366096, + "grad_norm": 8.149649620056152, + "learning_rate": 3.151716538863514e-07, + "loss": 0.1826, + "step": 35072 + }, + { + "epoch": 0.8875420705013032, + "grad_norm": 24.32855796813965, + "learning_rate": 3.150313684129647e-07, + "loss": 0.2055, + "step": 35073 + }, + { + "epoch": 0.8875673760659969, + "grad_norm": 19.676395416259766, + "learning_rate": 3.148911131520699e-07, + "loss": 0.2306, + "step": 35074 + }, + { + "epoch": 0.8875926816306906, + "grad_norm": 5.415283679962158, + "learning_rate": 3.147508881045708e-07, + "loss": 0.1071, + "step": 35075 + }, + { + "epoch": 0.8876179871953843, + "grad_norm": 3.607412576675415, + "learning_rate": 3.1461069327137163e-07, + "loss": 0.1373, + "step": 35076 + }, + { + "epoch": 0.8876432927600779, + "grad_norm": 4.851208686828613, + "learning_rate": 3.144705286533761e-07, + "loss": 0.1652, + "step": 35077 + }, + { + "epoch": 0.8876685983247716, + "grad_norm": 4.341643810272217, + "learning_rate": 3.1433039425148915e-07, + "loss": 0.1276, + "step": 35078 + }, + { + "epoch": 0.8876939038894653, + "grad_norm": 3.9730381965637207, + "learning_rate": 3.141902900666138e-07, + "loss": 0.1667, + "step": 35079 + }, + { + "epoch": 0.8877192094541589, + "grad_norm": 6.291376113891602, + "learning_rate": 3.1405021609965393e-07, + "loss": 0.1071, + "step": 35080 + }, + { + "epoch": 0.8877445150188527, + "grad_norm": 3.6340529918670654, + "learning_rate": 3.1391017235151145e-07, + "loss": 0.1301, + "step": 35081 + }, + { + "epoch": 0.8877698205835464, + "grad_norm": 5.877433776855469, + "learning_rate": 3.1377015882309127e-07, + "loss": 0.1403, + "step": 35082 + }, + { + "epoch": 0.88779512614824, + "grad_norm": 5.259612560272217, + "learning_rate": 3.1363017551529597e-07, + "loss": 0.2175, + "step": 35083 + }, + { + "epoch": 0.8878204317129337, + "grad_norm": 3.077071189880371, + "learning_rate": 3.134902224290276e-07, + "loss": 0.1364, + "step": 35084 + }, + { + "epoch": 0.8878457372776274, + "grad_norm": 2.4489452838897705, + "learning_rate": 3.1335029956518883e-07, + "loss": 0.0737, + "step": 35085 + }, + { + "epoch": 0.887871042842321, + "grad_norm": 8.534289360046387, + "learning_rate": 3.132104069246822e-07, + "loss": 0.2107, + "step": 35086 + }, + { + "epoch": 0.8878963484070147, + "grad_norm": 3.729064464569092, + "learning_rate": 3.1307054450841035e-07, + "loss": 0.1363, + "step": 35087 + }, + { + "epoch": 0.8879216539717084, + "grad_norm": 20.42022132873535, + "learning_rate": 3.129307123172748e-07, + "loss": 0.374, + "step": 35088 + }, + { + "epoch": 0.887946959536402, + "grad_norm": 2.8773930072784424, + "learning_rate": 3.1279091035217703e-07, + "loss": 0.0988, + "step": 35089 + }, + { + "epoch": 0.8879722651010957, + "grad_norm": 5.297637939453125, + "learning_rate": 3.12651138614018e-07, + "loss": 0.1524, + "step": 35090 + }, + { + "epoch": 0.8879975706657894, + "grad_norm": 8.017130851745605, + "learning_rate": 3.1251139710370025e-07, + "loss": 0.1731, + "step": 35091 + }, + { + "epoch": 0.888022876230483, + "grad_norm": 5.82135009765625, + "learning_rate": 3.1237168582212484e-07, + "loss": 0.1337, + "step": 35092 + }, + { + "epoch": 0.8880481817951767, + "grad_norm": 6.342034816741943, + "learning_rate": 3.1223200477019266e-07, + "loss": 0.1315, + "step": 35093 + }, + { + "epoch": 0.8880734873598705, + "grad_norm": 3.2339162826538086, + "learning_rate": 3.1209235394880353e-07, + "loss": 0.1149, + "step": 35094 + }, + { + "epoch": 0.8880987929245641, + "grad_norm": 4.620632648468018, + "learning_rate": 3.1195273335885954e-07, + "loss": 0.1461, + "step": 35095 + }, + { + "epoch": 0.8881240984892578, + "grad_norm": 7.915709018707275, + "learning_rate": 3.1181314300125944e-07, + "loss": 0.1964, + "step": 35096 + }, + { + "epoch": 0.8881494040539515, + "grad_norm": 5.193230628967285, + "learning_rate": 3.1167358287690574e-07, + "loss": 0.1435, + "step": 35097 + }, + { + "epoch": 0.8881747096186451, + "grad_norm": 2.220526933670044, + "learning_rate": 3.1153405298669557e-07, + "loss": 0.0834, + "step": 35098 + }, + { + "epoch": 0.8882000151833388, + "grad_norm": 3.461169958114624, + "learning_rate": 3.11394553331531e-07, + "loss": 0.0933, + "step": 35099 + }, + { + "epoch": 0.8882253207480325, + "grad_norm": 5.561848163604736, + "learning_rate": 3.112550839123102e-07, + "loss": 0.2074, + "step": 35100 + }, + { + "epoch": 0.8882506263127262, + "grad_norm": 11.786589622497559, + "learning_rate": 3.111156447299346e-07, + "loss": 0.2414, + "step": 35101 + }, + { + "epoch": 0.8882759318774198, + "grad_norm": 3.207695245742798, + "learning_rate": 3.1097623578530076e-07, + "loss": 0.0944, + "step": 35102 + }, + { + "epoch": 0.8883012374421135, + "grad_norm": 9.373062133789062, + "learning_rate": 3.108368570793091e-07, + "loss": 0.1238, + "step": 35103 + }, + { + "epoch": 0.8883265430068072, + "grad_norm": 2.6287715435028076, + "learning_rate": 3.1069750861285773e-07, + "loss": 0.1288, + "step": 35104 + }, + { + "epoch": 0.8883518485715008, + "grad_norm": 6.622469902038574, + "learning_rate": 3.105581903868471e-07, + "loss": 0.1142, + "step": 35105 + }, + { + "epoch": 0.8883771541361946, + "grad_norm": 4.784310340881348, + "learning_rate": 3.1041890240217366e-07, + "loss": 0.2596, + "step": 35106 + }, + { + "epoch": 0.8884024597008883, + "grad_norm": 3.086625099182129, + "learning_rate": 3.102796446597367e-07, + "loss": 0.1212, + "step": 35107 + }, + { + "epoch": 0.8884277652655819, + "grad_norm": 6.199852466583252, + "learning_rate": 3.101404171604333e-07, + "loss": 0.1404, + "step": 35108 + }, + { + "epoch": 0.8884530708302756, + "grad_norm": 3.2780394554138184, + "learning_rate": 3.100012199051627e-07, + "loss": 0.1168, + "step": 35109 + }, + { + "epoch": 0.8884783763949693, + "grad_norm": 11.676977157592773, + "learning_rate": 3.098620528948221e-07, + "loss": 0.2738, + "step": 35110 + }, + { + "epoch": 0.8885036819596629, + "grad_norm": 2.99277400970459, + "learning_rate": 3.097229161303084e-07, + "loss": 0.1564, + "step": 35111 + }, + { + "epoch": 0.8885289875243566, + "grad_norm": 2.480755567550659, + "learning_rate": 3.095838096125181e-07, + "loss": 0.0816, + "step": 35112 + }, + { + "epoch": 0.8885542930890503, + "grad_norm": 12.868913650512695, + "learning_rate": 3.094447333423506e-07, + "loss": 0.174, + "step": 35113 + }, + { + "epoch": 0.8885795986537439, + "grad_norm": 6.054758548736572, + "learning_rate": 3.093056873207012e-07, + "loss": 0.1213, + "step": 35114 + }, + { + "epoch": 0.8886049042184376, + "grad_norm": 4.858206272125244, + "learning_rate": 3.091666715484665e-07, + "loss": 0.2017, + "step": 35115 + }, + { + "epoch": 0.8886302097831313, + "grad_norm": 5.637606143951416, + "learning_rate": 3.09027686026544e-07, + "loss": 0.1548, + "step": 35116 + }, + { + "epoch": 0.8886555153478249, + "grad_norm": 13.296555519104004, + "learning_rate": 3.088887307558286e-07, + "loss": 0.2187, + "step": 35117 + }, + { + "epoch": 0.8886808209125187, + "grad_norm": 5.426533222198486, + "learning_rate": 3.0874980573721734e-07, + "loss": 0.1617, + "step": 35118 + }, + { + "epoch": 0.8887061264772124, + "grad_norm": 5.247125625610352, + "learning_rate": 3.086109109716068e-07, + "loss": 0.2043, + "step": 35119 + }, + { + "epoch": 0.888731432041906, + "grad_norm": 8.80107593536377, + "learning_rate": 3.0847204645989116e-07, + "loss": 0.2023, + "step": 35120 + }, + { + "epoch": 0.8887567376065997, + "grad_norm": 6.711934566497803, + "learning_rate": 3.0833321220296584e-07, + "loss": 0.1595, + "step": 35121 + }, + { + "epoch": 0.8887820431712934, + "grad_norm": 5.331569671630859, + "learning_rate": 3.0819440820172795e-07, + "loss": 0.1244, + "step": 35122 + }, + { + "epoch": 0.888807348735987, + "grad_norm": 4.462014198303223, + "learning_rate": 3.0805563445707175e-07, + "loss": 0.1186, + "step": 35123 + }, + { + "epoch": 0.8888326543006807, + "grad_norm": 5.6912150382995605, + "learning_rate": 3.079168909698921e-07, + "loss": 0.1761, + "step": 35124 + }, + { + "epoch": 0.8888579598653744, + "grad_norm": 4.949007034301758, + "learning_rate": 3.077781777410821e-07, + "loss": 0.156, + "step": 35125 + }, + { + "epoch": 0.8888832654300681, + "grad_norm": 4.823922157287598, + "learning_rate": 3.0763949477153943e-07, + "loss": 0.1746, + "step": 35126 + }, + { + "epoch": 0.8889085709947617, + "grad_norm": 3.27714467048645, + "learning_rate": 3.075008420621556e-07, + "loss": 0.1139, + "step": 35127 + }, + { + "epoch": 0.8889338765594554, + "grad_norm": 11.375202178955078, + "learning_rate": 3.0736221961382707e-07, + "loss": 0.3331, + "step": 35128 + }, + { + "epoch": 0.8889591821241491, + "grad_norm": 4.171541213989258, + "learning_rate": 3.0722362742744596e-07, + "loss": 0.1229, + "step": 35129 + }, + { + "epoch": 0.8889844876888428, + "grad_norm": 3.43408203125, + "learning_rate": 3.07085065503907e-07, + "loss": 0.0575, + "step": 35130 + }, + { + "epoch": 0.8890097932535365, + "grad_norm": 9.273725509643555, + "learning_rate": 3.06946533844103e-07, + "loss": 0.1745, + "step": 35131 + }, + { + "epoch": 0.8890350988182302, + "grad_norm": 5.492764472961426, + "learning_rate": 3.068080324489292e-07, + "loss": 0.2215, + "step": 35132 + }, + { + "epoch": 0.8890604043829238, + "grad_norm": 7.162621021270752, + "learning_rate": 3.066695613192761e-07, + "loss": 0.1615, + "step": 35133 + }, + { + "epoch": 0.8890857099476175, + "grad_norm": 10.734040260314941, + "learning_rate": 3.065311204560384e-07, + "loss": 0.2166, + "step": 35134 + }, + { + "epoch": 0.8891110155123112, + "grad_norm": 4.842103004455566, + "learning_rate": 3.063927098601077e-07, + "loss": 0.126, + "step": 35135 + }, + { + "epoch": 0.8891363210770048, + "grad_norm": 12.49724006652832, + "learning_rate": 3.062543295323783e-07, + "loss": 0.2483, + "step": 35136 + }, + { + "epoch": 0.8891616266416985, + "grad_norm": 2.4885618686676025, + "learning_rate": 3.0611597947374173e-07, + "loss": 0.1069, + "step": 35137 + }, + { + "epoch": 0.8891869322063922, + "grad_norm": 7.987578392028809, + "learning_rate": 3.0597765968508996e-07, + "loss": 0.2241, + "step": 35138 + }, + { + "epoch": 0.8892122377710858, + "grad_norm": 4.0967936515808105, + "learning_rate": 3.0583937016731456e-07, + "loss": 0.1228, + "step": 35139 + }, + { + "epoch": 0.8892375433357795, + "grad_norm": 5.72102165222168, + "learning_rate": 3.0570111092130815e-07, + "loss": 0.1103, + "step": 35140 + }, + { + "epoch": 0.8892628489004732, + "grad_norm": 3.95862078666687, + "learning_rate": 3.0556288194796223e-07, + "loss": 0.1712, + "step": 35141 + }, + { + "epoch": 0.8892881544651668, + "grad_norm": 8.048595428466797, + "learning_rate": 3.054246832481683e-07, + "loss": 0.1445, + "step": 35142 + }, + { + "epoch": 0.8893134600298606, + "grad_norm": 23.946889877319336, + "learning_rate": 3.0528651482281623e-07, + "loss": 0.2184, + "step": 35143 + }, + { + "epoch": 0.8893387655945543, + "grad_norm": 8.247886657714844, + "learning_rate": 3.0514837667279916e-07, + "loss": 0.216, + "step": 35144 + }, + { + "epoch": 0.8893640711592479, + "grad_norm": 2.983548164367676, + "learning_rate": 3.05010268799007e-07, + "loss": 0.1004, + "step": 35145 + }, + { + "epoch": 0.8893893767239416, + "grad_norm": 5.768892765045166, + "learning_rate": 3.0487219120233003e-07, + "loss": 0.152, + "step": 35146 + }, + { + "epoch": 0.8894146822886353, + "grad_norm": 4.551220893859863, + "learning_rate": 3.047341438836593e-07, + "loss": 0.1411, + "step": 35147 + }, + { + "epoch": 0.8894399878533289, + "grad_norm": 4.800671100616455, + "learning_rate": 3.045961268438835e-07, + "loss": 0.1834, + "step": 35148 + }, + { + "epoch": 0.8894652934180226, + "grad_norm": 2.8423526287078857, + "learning_rate": 3.0445814008389474e-07, + "loss": 0.0915, + "step": 35149 + }, + { + "epoch": 0.8894905989827163, + "grad_norm": 3.7340023517608643, + "learning_rate": 3.043201836045817e-07, + "loss": 0.1613, + "step": 35150 + }, + { + "epoch": 0.88951590454741, + "grad_norm": 6.628333568572998, + "learning_rate": 3.0418225740683484e-07, + "loss": 0.1624, + "step": 35151 + }, + { + "epoch": 0.8895412101121036, + "grad_norm": 3.2056407928466797, + "learning_rate": 3.040443614915423e-07, + "loss": 0.1243, + "step": 35152 + }, + { + "epoch": 0.8895665156767973, + "grad_norm": 5.0210442543029785, + "learning_rate": 3.03906495859595e-07, + "loss": 0.1595, + "step": 35153 + }, + { + "epoch": 0.889591821241491, + "grad_norm": 10.766881942749023, + "learning_rate": 3.037686605118806e-07, + "loss": 0.1815, + "step": 35154 + }, + { + "epoch": 0.8896171268061847, + "grad_norm": 5.29286527633667, + "learning_rate": 3.0363085544928894e-07, + "loss": 0.171, + "step": 35155 + }, + { + "epoch": 0.8896424323708784, + "grad_norm": 5.002786636352539, + "learning_rate": 3.0349308067270767e-07, + "loss": 0.1913, + "step": 35156 + }, + { + "epoch": 0.8896677379355721, + "grad_norm": 3.5083000659942627, + "learning_rate": 3.0335533618302657e-07, + "loss": 0.1388, + "step": 35157 + }, + { + "epoch": 0.8896930435002657, + "grad_norm": 5.139405727386475, + "learning_rate": 3.032176219811322e-07, + "loss": 0.1957, + "step": 35158 + }, + { + "epoch": 0.8897183490649594, + "grad_norm": 9.084046363830566, + "learning_rate": 3.030799380679156e-07, + "loss": 0.1305, + "step": 35159 + }, + { + "epoch": 0.8897436546296531, + "grad_norm": 3.2972919940948486, + "learning_rate": 3.029422844442609e-07, + "loss": 0.0968, + "step": 35160 + }, + { + "epoch": 0.8897689601943467, + "grad_norm": 4.175347805023193, + "learning_rate": 3.028046611110591e-07, + "loss": 0.1528, + "step": 35161 + }, + { + "epoch": 0.8897942657590404, + "grad_norm": 2.851649522781372, + "learning_rate": 3.0266706806919513e-07, + "loss": 0.1139, + "step": 35162 + }, + { + "epoch": 0.8898195713237341, + "grad_norm": 4.752560615539551, + "learning_rate": 3.025295053195587e-07, + "loss": 0.1609, + "step": 35163 + }, + { + "epoch": 0.8898448768884277, + "grad_norm": 3.9715120792388916, + "learning_rate": 3.0239197286303425e-07, + "loss": 0.1467, + "step": 35164 + }, + { + "epoch": 0.8898701824531214, + "grad_norm": 3.5773868560791016, + "learning_rate": 3.022544707005115e-07, + "loss": 0.1175, + "step": 35165 + }, + { + "epoch": 0.8898954880178152, + "grad_norm": 4.973088264465332, + "learning_rate": 3.021169988328743e-07, + "loss": 0.1023, + "step": 35166 + }, + { + "epoch": 0.8899207935825088, + "grad_norm": 4.297118186950684, + "learning_rate": 3.019795572610118e-07, + "loss": 0.1656, + "step": 35167 + }, + { + "epoch": 0.8899460991472025, + "grad_norm": 5.651900768280029, + "learning_rate": 3.018421459858095e-07, + "loss": 0.2258, + "step": 35168 + }, + { + "epoch": 0.8899714047118962, + "grad_norm": 3.164823532104492, + "learning_rate": 3.0170476500815283e-07, + "loss": 0.1361, + "step": 35169 + }, + { + "epoch": 0.8899967102765898, + "grad_norm": 6.395077228546143, + "learning_rate": 3.015674143289277e-07, + "loss": 0.1866, + "step": 35170 + }, + { + "epoch": 0.8900220158412835, + "grad_norm": 4.276622772216797, + "learning_rate": 3.014300939490211e-07, + "loss": 0.1454, + "step": 35171 + }, + { + "epoch": 0.8900473214059772, + "grad_norm": 6.85483455657959, + "learning_rate": 3.0129280386931804e-07, + "loss": 0.1793, + "step": 35172 + }, + { + "epoch": 0.8900726269706708, + "grad_norm": 6.598362445831299, + "learning_rate": 3.0115554409070327e-07, + "loss": 0.0771, + "step": 35173 + }, + { + "epoch": 0.8900979325353645, + "grad_norm": 6.515666961669922, + "learning_rate": 3.0101831461406215e-07, + "loss": 0.1972, + "step": 35174 + }, + { + "epoch": 0.8901232381000582, + "grad_norm": 7.504544734954834, + "learning_rate": 3.008811154402796e-07, + "loss": 0.2245, + "step": 35175 + }, + { + "epoch": 0.8901485436647518, + "grad_norm": 4.7614569664001465, + "learning_rate": 3.007439465702411e-07, + "loss": 0.1239, + "step": 35176 + }, + { + "epoch": 0.8901738492294455, + "grad_norm": 7.158110618591309, + "learning_rate": 3.006068080048308e-07, + "loss": 0.1902, + "step": 35177 + }, + { + "epoch": 0.8901991547941392, + "grad_norm": 3.892347574234009, + "learning_rate": 3.00469699744933e-07, + "loss": 0.1494, + "step": 35178 + }, + { + "epoch": 0.890224460358833, + "grad_norm": 3.0825703144073486, + "learning_rate": 3.003326217914315e-07, + "loss": 0.1103, + "step": 35179 + }, + { + "epoch": 0.8902497659235266, + "grad_norm": 6.342960834503174, + "learning_rate": 3.001955741452112e-07, + "loss": 0.2092, + "step": 35180 + }, + { + "epoch": 0.8902750714882203, + "grad_norm": 5.175938129425049, + "learning_rate": 3.000585568071551e-07, + "loss": 0.1237, + "step": 35181 + }, + { + "epoch": 0.890300377052914, + "grad_norm": 2.637899398803711, + "learning_rate": 2.999215697781477e-07, + "loss": 0.0899, + "step": 35182 + }, + { + "epoch": 0.8903256826176076, + "grad_norm": 3.467047691345215, + "learning_rate": 2.997846130590709e-07, + "loss": 0.1801, + "step": 35183 + }, + { + "epoch": 0.8903509881823013, + "grad_norm": 4.145505428314209, + "learning_rate": 2.996476866508097e-07, + "loss": 0.1219, + "step": 35184 + }, + { + "epoch": 0.890376293746995, + "grad_norm": 14.381620407104492, + "learning_rate": 2.995107905542455e-07, + "loss": 0.1815, + "step": 35185 + }, + { + "epoch": 0.8904015993116886, + "grad_norm": 3.8129944801330566, + "learning_rate": 2.993739247702626e-07, + "loss": 0.1152, + "step": 35186 + }, + { + "epoch": 0.8904269048763823, + "grad_norm": 3.1757688522338867, + "learning_rate": 2.9923708929974204e-07, + "loss": 0.176, + "step": 35187 + }, + { + "epoch": 0.890452210441076, + "grad_norm": 4.326307773590088, + "learning_rate": 2.9910028414356753e-07, + "loss": 0.1407, + "step": 35188 + }, + { + "epoch": 0.8904775160057696, + "grad_norm": 15.546445846557617, + "learning_rate": 2.989635093026205e-07, + "loss": 0.2164, + "step": 35189 + }, + { + "epoch": 0.8905028215704633, + "grad_norm": 3.926072359085083, + "learning_rate": 2.988267647777843e-07, + "loss": 0.1254, + "step": 35190 + }, + { + "epoch": 0.890528127135157, + "grad_norm": 2.488433599472046, + "learning_rate": 2.986900505699386e-07, + "loss": 0.1186, + "step": 35191 + }, + { + "epoch": 0.8905534326998507, + "grad_norm": 5.763726711273193, + "learning_rate": 2.9855336667996726e-07, + "loss": 0.0869, + "step": 35192 + }, + { + "epoch": 0.8905787382645444, + "grad_norm": 2.917661428451538, + "learning_rate": 2.9841671310874954e-07, + "loss": 0.0693, + "step": 35193 + }, + { + "epoch": 0.8906040438292381, + "grad_norm": 3.5841429233551025, + "learning_rate": 2.982800898571697e-07, + "loss": 0.1163, + "step": 35194 + }, + { + "epoch": 0.8906293493939317, + "grad_norm": 2.9032185077667236, + "learning_rate": 2.9814349692610546e-07, + "loss": 0.0984, + "step": 35195 + }, + { + "epoch": 0.8906546549586254, + "grad_norm": 7.868292808532715, + "learning_rate": 2.9800693431643933e-07, + "loss": 0.1409, + "step": 35196 + }, + { + "epoch": 0.8906799605233191, + "grad_norm": 3.09222674369812, + "learning_rate": 2.9787040202905184e-07, + "loss": 0.0807, + "step": 35197 + }, + { + "epoch": 0.8907052660880127, + "grad_norm": 3.178610324859619, + "learning_rate": 2.977339000648244e-07, + "loss": 0.1085, + "step": 35198 + }, + { + "epoch": 0.8907305716527064, + "grad_norm": 14.728790283203125, + "learning_rate": 2.9759742842463525e-07, + "loss": 0.296, + "step": 35199 + }, + { + "epoch": 0.8907558772174001, + "grad_norm": 3.180251121520996, + "learning_rate": 2.9746098710936646e-07, + "loss": 0.1043, + "step": 35200 + }, + { + "epoch": 0.8907811827820937, + "grad_norm": 4.297488212585449, + "learning_rate": 2.973245761198967e-07, + "loss": 0.1201, + "step": 35201 + }, + { + "epoch": 0.8908064883467874, + "grad_norm": 5.792663097381592, + "learning_rate": 2.971881954571054e-07, + "loss": 0.2271, + "step": 35202 + }, + { + "epoch": 0.8908317939114812, + "grad_norm": 6.206775188446045, + "learning_rate": 2.970518451218735e-07, + "loss": 0.1465, + "step": 35203 + }, + { + "epoch": 0.8908570994761749, + "grad_norm": 4.55366849899292, + "learning_rate": 2.96915525115079e-07, + "loss": 0.1483, + "step": 35204 + }, + { + "epoch": 0.8908824050408685, + "grad_norm": 4.542717456817627, + "learning_rate": 2.9677923543760143e-07, + "loss": 0.1581, + "step": 35205 + }, + { + "epoch": 0.8909077106055622, + "grad_norm": 3.6048643589019775, + "learning_rate": 2.966429760903194e-07, + "loss": 0.122, + "step": 35206 + }, + { + "epoch": 0.8909330161702559, + "grad_norm": 5.2325921058654785, + "learning_rate": 2.965067470741123e-07, + "loss": 0.0951, + "step": 35207 + }, + { + "epoch": 0.8909583217349495, + "grad_norm": 4.10088586807251, + "learning_rate": 2.963705483898588e-07, + "loss": 0.1536, + "step": 35208 + }, + { + "epoch": 0.8909836272996432, + "grad_norm": 4.587348937988281, + "learning_rate": 2.96234380038436e-07, + "loss": 0.1289, + "step": 35209 + }, + { + "epoch": 0.8910089328643369, + "grad_norm": 4.298257350921631, + "learning_rate": 2.960982420207226e-07, + "loss": 0.0685, + "step": 35210 + }, + { + "epoch": 0.8910342384290305, + "grad_norm": 9.756422996520996, + "learning_rate": 2.959621343375968e-07, + "loss": 0.2812, + "step": 35211 + }, + { + "epoch": 0.8910595439937242, + "grad_norm": 10.438224792480469, + "learning_rate": 2.9582605698993684e-07, + "loss": 0.0959, + "step": 35212 + }, + { + "epoch": 0.8910848495584179, + "grad_norm": 6.593736171722412, + "learning_rate": 2.956900099786192e-07, + "loss": 0.1754, + "step": 35213 + }, + { + "epoch": 0.8911101551231115, + "grad_norm": 2.2169363498687744, + "learning_rate": 2.9555399330452095e-07, + "loss": 0.0541, + "step": 35214 + }, + { + "epoch": 0.8911354606878052, + "grad_norm": 3.088953733444214, + "learning_rate": 2.954180069685203e-07, + "loss": 0.132, + "step": 35215 + }, + { + "epoch": 0.891160766252499, + "grad_norm": 3.0287656784057617, + "learning_rate": 2.9528205097149433e-07, + "loss": 0.1366, + "step": 35216 + }, + { + "epoch": 0.8911860718171926, + "grad_norm": 4.980032920837402, + "learning_rate": 2.9514612531431896e-07, + "loss": 0.0988, + "step": 35217 + }, + { + "epoch": 0.8912113773818863, + "grad_norm": 4.314696788787842, + "learning_rate": 2.950102299978702e-07, + "loss": 0.1967, + "step": 35218 + }, + { + "epoch": 0.89123668294658, + "grad_norm": 8.251224517822266, + "learning_rate": 2.948743650230262e-07, + "loss": 0.2113, + "step": 35219 + }, + { + "epoch": 0.8912619885112736, + "grad_norm": 3.7463572025299072, + "learning_rate": 2.9473853039066194e-07, + "loss": 0.0733, + "step": 35220 + }, + { + "epoch": 0.8912872940759673, + "grad_norm": 5.595637321472168, + "learning_rate": 2.946027261016543e-07, + "loss": 0.1298, + "step": 35221 + }, + { + "epoch": 0.891312599640661, + "grad_norm": 6.652679443359375, + "learning_rate": 2.944669521568777e-07, + "loss": 0.1392, + "step": 35222 + }, + { + "epoch": 0.8913379052053546, + "grad_norm": 11.818794250488281, + "learning_rate": 2.9433120855720866e-07, + "loss": 0.2204, + "step": 35223 + }, + { + "epoch": 0.8913632107700483, + "grad_norm": 6.262123107910156, + "learning_rate": 2.9419549530352144e-07, + "loss": 0.2115, + "step": 35224 + }, + { + "epoch": 0.891388516334742, + "grad_norm": 5.625524520874023, + "learning_rate": 2.9405981239669426e-07, + "loss": 0.134, + "step": 35225 + }, + { + "epoch": 0.8914138218994356, + "grad_norm": 16.905155181884766, + "learning_rate": 2.9392415983759803e-07, + "loss": 0.1501, + "step": 35226 + }, + { + "epoch": 0.8914391274641293, + "grad_norm": 3.116581439971924, + "learning_rate": 2.937885376271099e-07, + "loss": 0.1045, + "step": 35227 + }, + { + "epoch": 0.8914644330288231, + "grad_norm": 9.941717147827148, + "learning_rate": 2.9365294576610417e-07, + "loss": 0.1823, + "step": 35228 + }, + { + "epoch": 0.8914897385935168, + "grad_norm": 3.571610927581787, + "learning_rate": 2.935173842554562e-07, + "loss": 0.0996, + "step": 35229 + }, + { + "epoch": 0.8915150441582104, + "grad_norm": 4.7718186378479, + "learning_rate": 2.933818530960375e-07, + "loss": 0.1443, + "step": 35230 + }, + { + "epoch": 0.8915403497229041, + "grad_norm": 2.989208936691284, + "learning_rate": 2.932463522887247e-07, + "loss": 0.0913, + "step": 35231 + }, + { + "epoch": 0.8915656552875978, + "grad_norm": 4.495036602020264, + "learning_rate": 2.9311088183439027e-07, + "loss": 0.1514, + "step": 35232 + }, + { + "epoch": 0.8915909608522914, + "grad_norm": 4.6971821784973145, + "learning_rate": 2.9297544173390815e-07, + "loss": 0.1398, + "step": 35233 + }, + { + "epoch": 0.8916162664169851, + "grad_norm": 16.307065963745117, + "learning_rate": 2.928400319881519e-07, + "loss": 0.2404, + "step": 35234 + }, + { + "epoch": 0.8916415719816788, + "grad_norm": 5.553781509399414, + "learning_rate": 2.927046525979954e-07, + "loss": 0.2168, + "step": 35235 + }, + { + "epoch": 0.8916668775463724, + "grad_norm": 4.7955732345581055, + "learning_rate": 2.925693035643101e-07, + "loss": 0.1666, + "step": 35236 + }, + { + "epoch": 0.8916921831110661, + "grad_norm": 6.191775798797607, + "learning_rate": 2.924339848879698e-07, + "loss": 0.1633, + "step": 35237 + }, + { + "epoch": 0.8917174886757598, + "grad_norm": 5.86035680770874, + "learning_rate": 2.922986965698482e-07, + "loss": 0.1064, + "step": 35238 + }, + { + "epoch": 0.8917427942404534, + "grad_norm": 3.789616346359253, + "learning_rate": 2.9216343861081464e-07, + "loss": 0.09, + "step": 35239 + }, + { + "epoch": 0.8917680998051472, + "grad_norm": 10.513357162475586, + "learning_rate": 2.9202821101174504e-07, + "loss": 0.1531, + "step": 35240 + }, + { + "epoch": 0.8917934053698409, + "grad_norm": 3.6903936862945557, + "learning_rate": 2.918930137735082e-07, + "loss": 0.1249, + "step": 35241 + }, + { + "epoch": 0.8918187109345345, + "grad_norm": 5.939504146575928, + "learning_rate": 2.917578468969784e-07, + "loss": 0.2415, + "step": 35242 + }, + { + "epoch": 0.8918440164992282, + "grad_norm": 17.64042854309082, + "learning_rate": 2.9162271038302713e-07, + "loss": 0.1015, + "step": 35243 + }, + { + "epoch": 0.8918693220639219, + "grad_norm": 5.305317401885986, + "learning_rate": 2.914876042325243e-07, + "loss": 0.1933, + "step": 35244 + }, + { + "epoch": 0.8918946276286155, + "grad_norm": 4.342663288116455, + "learning_rate": 2.91352528446342e-07, + "loss": 0.1296, + "step": 35245 + }, + { + "epoch": 0.8919199331933092, + "grad_norm": 4.312283515930176, + "learning_rate": 2.912174830253517e-07, + "loss": 0.1263, + "step": 35246 + }, + { + "epoch": 0.8919452387580029, + "grad_norm": 5.31408166885376, + "learning_rate": 2.910824679704238e-07, + "loss": 0.1839, + "step": 35247 + }, + { + "epoch": 0.8919705443226965, + "grad_norm": 10.009861946105957, + "learning_rate": 2.909474832824294e-07, + "loss": 0.1446, + "step": 35248 + }, + { + "epoch": 0.8919958498873902, + "grad_norm": 5.212399959564209, + "learning_rate": 2.908125289622382e-07, + "loss": 0.1586, + "step": 35249 + }, + { + "epoch": 0.8920211554520839, + "grad_norm": 3.0162625312805176, + "learning_rate": 2.9067760501072185e-07, + "loss": 0.1101, + "step": 35250 + }, + { + "epoch": 0.8920464610167775, + "grad_norm": 3.8736603260040283, + "learning_rate": 2.90542711428749e-07, + "loss": 0.1133, + "step": 35251 + }, + { + "epoch": 0.8920717665814712, + "grad_norm": 4.196028232574463, + "learning_rate": 2.904078482171907e-07, + "loss": 0.1525, + "step": 35252 + }, + { + "epoch": 0.892097072146165, + "grad_norm": 4.977777004241943, + "learning_rate": 2.902730153769151e-07, + "loss": 0.1361, + "step": 35253 + }, + { + "epoch": 0.8921223777108587, + "grad_norm": 12.502552032470703, + "learning_rate": 2.901382129087937e-07, + "loss": 0.2322, + "step": 35254 + }, + { + "epoch": 0.8921476832755523, + "grad_norm": 7.287696838378906, + "learning_rate": 2.900034408136937e-07, + "loss": 0.1862, + "step": 35255 + }, + { + "epoch": 0.892172988840246, + "grad_norm": 4.203623294830322, + "learning_rate": 2.898686990924876e-07, + "loss": 0.144, + "step": 35256 + }, + { + "epoch": 0.8921982944049397, + "grad_norm": 4.816180229187012, + "learning_rate": 2.897339877460398e-07, + "loss": 0.1645, + "step": 35257 + }, + { + "epoch": 0.8922235999696333, + "grad_norm": 5.384674549102783, + "learning_rate": 2.8959930677522284e-07, + "loss": 0.2108, + "step": 35258 + }, + { + "epoch": 0.892248905534327, + "grad_norm": 9.120481491088867, + "learning_rate": 2.8946465618090337e-07, + "loss": 0.2601, + "step": 35259 + }, + { + "epoch": 0.8922742110990207, + "grad_norm": 4.525816917419434, + "learning_rate": 2.8933003596395005e-07, + "loss": 0.1418, + "step": 35260 + }, + { + "epoch": 0.8922995166637143, + "grad_norm": 8.684534072875977, + "learning_rate": 2.8919544612523056e-07, + "loss": 0.2443, + "step": 35261 + }, + { + "epoch": 0.892324822228408, + "grad_norm": 5.3850178718566895, + "learning_rate": 2.890608866656136e-07, + "loss": 0.188, + "step": 35262 + }, + { + "epoch": 0.8923501277931017, + "grad_norm": 6.076155662536621, + "learning_rate": 2.8892635758596744e-07, + "loss": 0.149, + "step": 35263 + }, + { + "epoch": 0.8923754333577953, + "grad_norm": 2.5066561698913574, + "learning_rate": 2.887918588871574e-07, + "loss": 0.1191, + "step": 35264 + }, + { + "epoch": 0.8924007389224891, + "grad_norm": 5.787660598754883, + "learning_rate": 2.886573905700535e-07, + "loss": 0.1611, + "step": 35265 + }, + { + "epoch": 0.8924260444871828, + "grad_norm": 10.23869514465332, + "learning_rate": 2.885229526355215e-07, + "loss": 0.2075, + "step": 35266 + }, + { + "epoch": 0.8924513500518764, + "grad_norm": 3.4907569885253906, + "learning_rate": 2.883885450844287e-07, + "loss": 0.1287, + "step": 35267 + }, + { + "epoch": 0.8924766556165701, + "grad_norm": 3.669461965560913, + "learning_rate": 2.8825416791764096e-07, + "loss": 0.1766, + "step": 35268 + }, + { + "epoch": 0.8925019611812638, + "grad_norm": 2.434847831726074, + "learning_rate": 2.88119821136027e-07, + "loss": 0.1139, + "step": 35269 + }, + { + "epoch": 0.8925272667459574, + "grad_norm": 4.623717308044434, + "learning_rate": 2.879855047404506e-07, + "loss": 0.1632, + "step": 35270 + }, + { + "epoch": 0.8925525723106511, + "grad_norm": 10.551194190979004, + "learning_rate": 2.8785121873178003e-07, + "loss": 0.1818, + "step": 35271 + }, + { + "epoch": 0.8925778778753448, + "grad_norm": 2.4963228702545166, + "learning_rate": 2.877169631108795e-07, + "loss": 0.0991, + "step": 35272 + }, + { + "epoch": 0.8926031834400384, + "grad_norm": 2.7852768898010254, + "learning_rate": 2.875827378786167e-07, + "loss": 0.1097, + "step": 35273 + }, + { + "epoch": 0.8926284890047321, + "grad_norm": 9.022340774536133, + "learning_rate": 2.8744854303585533e-07, + "loss": 0.2375, + "step": 35274 + }, + { + "epoch": 0.8926537945694258, + "grad_norm": 7.865203380584717, + "learning_rate": 2.8731437858346255e-07, + "loss": 0.2375, + "step": 35275 + }, + { + "epoch": 0.8926791001341194, + "grad_norm": 2.069902181625366, + "learning_rate": 2.871802445223015e-07, + "loss": 0.0818, + "step": 35276 + }, + { + "epoch": 0.8927044056988132, + "grad_norm": 7.568229675292969, + "learning_rate": 2.8704614085323936e-07, + "loss": 0.1926, + "step": 35277 + }, + { + "epoch": 0.8927297112635069, + "grad_norm": 3.215454578399658, + "learning_rate": 2.869120675771403e-07, + "loss": 0.1215, + "step": 35278 + }, + { + "epoch": 0.8927550168282006, + "grad_norm": 7.240524768829346, + "learning_rate": 2.8677802469486815e-07, + "loss": 0.148, + "step": 35279 + }, + { + "epoch": 0.8927803223928942, + "grad_norm": 2.596830368041992, + "learning_rate": 2.866440122072872e-07, + "loss": 0.1276, + "step": 35280 + }, + { + "epoch": 0.8928056279575879, + "grad_norm": 3.934995651245117, + "learning_rate": 2.8651003011526344e-07, + "loss": 0.1258, + "step": 35281 + }, + { + "epoch": 0.8928309335222816, + "grad_norm": 11.292889595031738, + "learning_rate": 2.86376078419659e-07, + "loss": 0.2241, + "step": 35282 + }, + { + "epoch": 0.8928562390869752, + "grad_norm": 6.48604679107666, + "learning_rate": 2.862421571213392e-07, + "loss": 0.1761, + "step": 35283 + }, + { + "epoch": 0.8928815446516689, + "grad_norm": 10.497106552124023, + "learning_rate": 2.861082662211656e-07, + "loss": 0.1879, + "step": 35284 + }, + { + "epoch": 0.8929068502163626, + "grad_norm": 3.2939751148223877, + "learning_rate": 2.859744057200037e-07, + "loss": 0.146, + "step": 35285 + }, + { + "epoch": 0.8929321557810562, + "grad_norm": 15.69563102722168, + "learning_rate": 2.8584057561871606e-07, + "loss": 0.1449, + "step": 35286 + }, + { + "epoch": 0.8929574613457499, + "grad_norm": 15.631874084472656, + "learning_rate": 2.8570677591816586e-07, + "loss": 0.2064, + "step": 35287 + }, + { + "epoch": 0.8929827669104436, + "grad_norm": 2.0756008625030518, + "learning_rate": 2.8557300661921473e-07, + "loss": 0.0735, + "step": 35288 + }, + { + "epoch": 0.8930080724751372, + "grad_norm": 6.742038249969482, + "learning_rate": 2.8543926772272687e-07, + "loss": 0.2167, + "step": 35289 + }, + { + "epoch": 0.893033378039831, + "grad_norm": 2.7765414714813232, + "learning_rate": 2.853055592295645e-07, + "loss": 0.0972, + "step": 35290 + }, + { + "epoch": 0.8930586836045247, + "grad_norm": 7.66790771484375, + "learning_rate": 2.851718811405896e-07, + "loss": 0.2013, + "step": 35291 + }, + { + "epoch": 0.8930839891692183, + "grad_norm": 4.780061721801758, + "learning_rate": 2.850382334566637e-07, + "loss": 0.145, + "step": 35292 + }, + { + "epoch": 0.893109294733912, + "grad_norm": 4.013307094573975, + "learning_rate": 2.849046161786495e-07, + "loss": 0.1176, + "step": 35293 + }, + { + "epoch": 0.8931346002986057, + "grad_norm": 15.886075019836426, + "learning_rate": 2.8477102930740853e-07, + "loss": 0.4386, + "step": 35294 + }, + { + "epoch": 0.8931599058632993, + "grad_norm": 1.7866593599319458, + "learning_rate": 2.846374728438012e-07, + "loss": 0.0688, + "step": 35295 + }, + { + "epoch": 0.893185211427993, + "grad_norm": 5.166271209716797, + "learning_rate": 2.845039467886912e-07, + "loss": 0.1534, + "step": 35296 + }, + { + "epoch": 0.8932105169926867, + "grad_norm": 5.486405849456787, + "learning_rate": 2.843704511429368e-07, + "loss": 0.146, + "step": 35297 + }, + { + "epoch": 0.8932358225573803, + "grad_norm": 4.812537670135498, + "learning_rate": 2.842369859074007e-07, + "loss": 0.1983, + "step": 35298 + }, + { + "epoch": 0.893261128122074, + "grad_norm": 7.304324626922607, + "learning_rate": 2.841035510829426e-07, + "loss": 0.2022, + "step": 35299 + }, + { + "epoch": 0.8932864336867677, + "grad_norm": 4.612548828125, + "learning_rate": 2.839701466704248e-07, + "loss": 0.0708, + "step": 35300 + }, + { + "epoch": 0.8933117392514613, + "grad_norm": 2.9808106422424316, + "learning_rate": 2.8383677267070474e-07, + "loss": 0.1045, + "step": 35301 + }, + { + "epoch": 0.8933370448161551, + "grad_norm": 6.058272838592529, + "learning_rate": 2.837034290846452e-07, + "loss": 0.0847, + "step": 35302 + }, + { + "epoch": 0.8933623503808488, + "grad_norm": 12.225075721740723, + "learning_rate": 2.835701159131038e-07, + "loss": 0.2271, + "step": 35303 + }, + { + "epoch": 0.8933876559455424, + "grad_norm": 3.4907288551330566, + "learning_rate": 2.834368331569426e-07, + "loss": 0.0812, + "step": 35304 + }, + { + "epoch": 0.8934129615102361, + "grad_norm": 9.796993255615234, + "learning_rate": 2.8330358081701925e-07, + "loss": 0.2442, + "step": 35305 + }, + { + "epoch": 0.8934382670749298, + "grad_norm": 4.980538845062256, + "learning_rate": 2.8317035889419367e-07, + "loss": 0.1843, + "step": 35306 + }, + { + "epoch": 0.8934635726396235, + "grad_norm": 5.7225728034973145, + "learning_rate": 2.8303716738932505e-07, + "loss": 0.1428, + "step": 35307 + }, + { + "epoch": 0.8934888782043171, + "grad_norm": 3.1445302963256836, + "learning_rate": 2.829040063032723e-07, + "loss": 0.12, + "step": 35308 + }, + { + "epoch": 0.8935141837690108, + "grad_norm": 2.8931610584259033, + "learning_rate": 2.8277087563689464e-07, + "loss": 0.1055, + "step": 35309 + }, + { + "epoch": 0.8935394893337045, + "grad_norm": 6.425625324249268, + "learning_rate": 2.826377753910503e-07, + "loss": 0.1224, + "step": 35310 + }, + { + "epoch": 0.8935647948983981, + "grad_norm": 14.300745964050293, + "learning_rate": 2.825047055665964e-07, + "loss": 0.1353, + "step": 35311 + }, + { + "epoch": 0.8935901004630918, + "grad_norm": 4.757081031799316, + "learning_rate": 2.823716661643927e-07, + "loss": 0.176, + "step": 35312 + }, + { + "epoch": 0.8936154060277856, + "grad_norm": 4.657378196716309, + "learning_rate": 2.82238657185297e-07, + "loss": 0.1287, + "step": 35313 + }, + { + "epoch": 0.8936407115924792, + "grad_norm": 6.058340549468994, + "learning_rate": 2.821056786301662e-07, + "loss": 0.2414, + "step": 35314 + }, + { + "epoch": 0.8936660171571729, + "grad_norm": 4.637221336364746, + "learning_rate": 2.8197273049985763e-07, + "loss": 0.115, + "step": 35315 + }, + { + "epoch": 0.8936913227218666, + "grad_norm": 3.9228811264038086, + "learning_rate": 2.818398127952304e-07, + "loss": 0.1265, + "step": 35316 + }, + { + "epoch": 0.8937166282865602, + "grad_norm": 2.9818313121795654, + "learning_rate": 2.817069255171401e-07, + "loss": 0.1424, + "step": 35317 + }, + { + "epoch": 0.8937419338512539, + "grad_norm": 8.666625022888184, + "learning_rate": 2.8157406866644477e-07, + "loss": 0.1682, + "step": 35318 + }, + { + "epoch": 0.8937672394159476, + "grad_norm": 11.078463554382324, + "learning_rate": 2.814412422439994e-07, + "loss": 0.1598, + "step": 35319 + }, + { + "epoch": 0.8937925449806412, + "grad_norm": 7.17153787612915, + "learning_rate": 2.8130844625066267e-07, + "loss": 0.1672, + "step": 35320 + }, + { + "epoch": 0.8938178505453349, + "grad_norm": 3.214327573776245, + "learning_rate": 2.811756806872895e-07, + "loss": 0.138, + "step": 35321 + }, + { + "epoch": 0.8938431561100286, + "grad_norm": 7.8106255531311035, + "learning_rate": 2.81042945554737e-07, + "loss": 0.2543, + "step": 35322 + }, + { + "epoch": 0.8938684616747222, + "grad_norm": 3.8368561267852783, + "learning_rate": 2.809102408538611e-07, + "loss": 0.0641, + "step": 35323 + }, + { + "epoch": 0.8938937672394159, + "grad_norm": 4.4426350593566895, + "learning_rate": 2.8077756658551615e-07, + "loss": 0.1379, + "step": 35324 + }, + { + "epoch": 0.8939190728041096, + "grad_norm": 6.5985894203186035, + "learning_rate": 2.806449227505592e-07, + "loss": 0.1026, + "step": 35325 + }, + { + "epoch": 0.8939443783688032, + "grad_norm": 5.8810038566589355, + "learning_rate": 2.8051230934984576e-07, + "loss": 0.0892, + "step": 35326 + }, + { + "epoch": 0.893969683933497, + "grad_norm": 6.74128532409668, + "learning_rate": 2.803797263842306e-07, + "loss": 0.2445, + "step": 35327 + }, + { + "epoch": 0.8939949894981907, + "grad_norm": 20.910367965698242, + "learning_rate": 2.8024717385456757e-07, + "loss": 0.2914, + "step": 35328 + }, + { + "epoch": 0.8940202950628843, + "grad_norm": 6.0690717697143555, + "learning_rate": 2.801146517617137e-07, + "loss": 0.149, + "step": 35329 + }, + { + "epoch": 0.894045600627578, + "grad_norm": 5.428921222686768, + "learning_rate": 2.799821601065217e-07, + "loss": 0.1556, + "step": 35330 + }, + { + "epoch": 0.8940709061922717, + "grad_norm": 4.861629009246826, + "learning_rate": 2.7984969888984805e-07, + "loss": 0.1772, + "step": 35331 + }, + { + "epoch": 0.8940962117569654, + "grad_norm": 11.690269470214844, + "learning_rate": 2.7971726811254427e-07, + "loss": 0.1879, + "step": 35332 + }, + { + "epoch": 0.894121517321659, + "grad_norm": 8.306360244750977, + "learning_rate": 2.7958486777546644e-07, + "loss": 0.1701, + "step": 35333 + }, + { + "epoch": 0.8941468228863527, + "grad_norm": 27.758834838867188, + "learning_rate": 2.7945249787946715e-07, + "loss": 0.1281, + "step": 35334 + }, + { + "epoch": 0.8941721284510464, + "grad_norm": 2.9534504413604736, + "learning_rate": 2.7932015842540184e-07, + "loss": 0.1409, + "step": 35335 + }, + { + "epoch": 0.89419743401574, + "grad_norm": 6.881349086761475, + "learning_rate": 2.791878494141215e-07, + "loss": 0.0873, + "step": 35336 + }, + { + "epoch": 0.8942227395804337, + "grad_norm": 5.011064052581787, + "learning_rate": 2.7905557084648093e-07, + "loss": 0.1936, + "step": 35337 + }, + { + "epoch": 0.8942480451451275, + "grad_norm": 2.992717742919922, + "learning_rate": 2.7892332272333235e-07, + "loss": 0.0781, + "step": 35338 + }, + { + "epoch": 0.8942733507098211, + "grad_norm": 6.326992988586426, + "learning_rate": 2.7879110504553e-07, + "loss": 0.1813, + "step": 35339 + }, + { + "epoch": 0.8942986562745148, + "grad_norm": 7.921408653259277, + "learning_rate": 2.786589178139254e-07, + "loss": 0.1838, + "step": 35340 + }, + { + "epoch": 0.8943239618392085, + "grad_norm": 3.8385441303253174, + "learning_rate": 2.785267610293713e-07, + "loss": 0.1491, + "step": 35341 + }, + { + "epoch": 0.8943492674039021, + "grad_norm": 3.7985658645629883, + "learning_rate": 2.783946346927191e-07, + "loss": 0.1681, + "step": 35342 + }, + { + "epoch": 0.8943745729685958, + "grad_norm": 4.751648902893066, + "learning_rate": 2.782625388048221e-07, + "loss": 0.1216, + "step": 35343 + }, + { + "epoch": 0.8943998785332895, + "grad_norm": 8.07270336151123, + "learning_rate": 2.781304733665319e-07, + "loss": 0.2067, + "step": 35344 + }, + { + "epoch": 0.8944251840979831, + "grad_norm": 8.970952987670898, + "learning_rate": 2.779984383786999e-07, + "loss": 0.2227, + "step": 35345 + }, + { + "epoch": 0.8944504896626768, + "grad_norm": 10.309661865234375, + "learning_rate": 2.778664338421766e-07, + "loss": 0.1926, + "step": 35346 + }, + { + "epoch": 0.8944757952273705, + "grad_norm": 2.733543634414673, + "learning_rate": 2.777344597578152e-07, + "loss": 0.1272, + "step": 35347 + }, + { + "epoch": 0.8945011007920641, + "grad_norm": 5.488131523132324, + "learning_rate": 2.7760251612646617e-07, + "loss": 0.1633, + "step": 35348 + }, + { + "epoch": 0.8945264063567578, + "grad_norm": 5.992837905883789, + "learning_rate": 2.774706029489799e-07, + "loss": 0.1734, + "step": 35349 + }, + { + "epoch": 0.8945517119214516, + "grad_norm": 11.179187774658203, + "learning_rate": 2.773387202262068e-07, + "loss": 0.1358, + "step": 35350 + }, + { + "epoch": 0.8945770174861452, + "grad_norm": 5.481825828552246, + "learning_rate": 2.772068679589979e-07, + "loss": 0.1399, + "step": 35351 + }, + { + "epoch": 0.8946023230508389, + "grad_norm": 5.61925745010376, + "learning_rate": 2.770750461482036e-07, + "loss": 0.1508, + "step": 35352 + }, + { + "epoch": 0.8946276286155326, + "grad_norm": 3.9506444931030273, + "learning_rate": 2.7694325479467434e-07, + "loss": 0.0939, + "step": 35353 + }, + { + "epoch": 0.8946529341802262, + "grad_norm": 6.838635444641113, + "learning_rate": 2.768114938992589e-07, + "loss": 0.2132, + "step": 35354 + }, + { + "epoch": 0.8946782397449199, + "grad_norm": 4.137366771697998, + "learning_rate": 2.766797634628071e-07, + "loss": 0.2338, + "step": 35355 + }, + { + "epoch": 0.8947035453096136, + "grad_norm": 7.573379993438721, + "learning_rate": 2.765480634861695e-07, + "loss": 0.1403, + "step": 35356 + }, + { + "epoch": 0.8947288508743073, + "grad_norm": 2.839022636413574, + "learning_rate": 2.764163939701947e-07, + "loss": 0.0888, + "step": 35357 + }, + { + "epoch": 0.8947541564390009, + "grad_norm": 2.8585364818573, + "learning_rate": 2.7628475491573216e-07, + "loss": 0.1292, + "step": 35358 + }, + { + "epoch": 0.8947794620036946, + "grad_norm": 5.804972171783447, + "learning_rate": 2.7615314632362945e-07, + "loss": 0.1294, + "step": 35359 + }, + { + "epoch": 0.8948047675683883, + "grad_norm": 8.746838569641113, + "learning_rate": 2.7602156819473756e-07, + "loss": 0.2848, + "step": 35360 + }, + { + "epoch": 0.8948300731330819, + "grad_norm": 10.274772644042969, + "learning_rate": 2.758900205299031e-07, + "loss": 0.1893, + "step": 35361 + }, + { + "epoch": 0.8948553786977756, + "grad_norm": 6.503615856170654, + "learning_rate": 2.7575850332997645e-07, + "loss": 0.1884, + "step": 35362 + }, + { + "epoch": 0.8948806842624694, + "grad_norm": 4.787239074707031, + "learning_rate": 2.7562701659580304e-07, + "loss": 0.1395, + "step": 35363 + }, + { + "epoch": 0.894905989827163, + "grad_norm": 6.443232536315918, + "learning_rate": 2.7549556032823277e-07, + "loss": 0.157, + "step": 35364 + }, + { + "epoch": 0.8949312953918567, + "grad_norm": 5.9850382804870605, + "learning_rate": 2.753641345281122e-07, + "loss": 0.2349, + "step": 35365 + }, + { + "epoch": 0.8949566009565504, + "grad_norm": 4.359370708465576, + "learning_rate": 2.7523273919629114e-07, + "loss": 0.0773, + "step": 35366 + }, + { + "epoch": 0.894981906521244, + "grad_norm": 3.895179510116577, + "learning_rate": 2.751013743336134e-07, + "loss": 0.1111, + "step": 35367 + }, + { + "epoch": 0.8950072120859377, + "grad_norm": 4.24107551574707, + "learning_rate": 2.7497003994092884e-07, + "loss": 0.1086, + "step": 35368 + }, + { + "epoch": 0.8950325176506314, + "grad_norm": 2.349459409713745, + "learning_rate": 2.74838736019083e-07, + "loss": 0.1123, + "step": 35369 + }, + { + "epoch": 0.895057823215325, + "grad_norm": 4.517581939697266, + "learning_rate": 2.747074625689239e-07, + "loss": 0.1481, + "step": 35370 + }, + { + "epoch": 0.8950831287800187, + "grad_norm": 3.0050954818725586, + "learning_rate": 2.745762195912971e-07, + "loss": 0.1017, + "step": 35371 + }, + { + "epoch": 0.8951084343447124, + "grad_norm": 9.530824661254883, + "learning_rate": 2.744450070870497e-07, + "loss": 0.2247, + "step": 35372 + }, + { + "epoch": 0.895133739909406, + "grad_norm": 5.039942741394043, + "learning_rate": 2.743138250570265e-07, + "loss": 0.1514, + "step": 35373 + }, + { + "epoch": 0.8951590454740997, + "grad_norm": 7.001497745513916, + "learning_rate": 2.741826735020753e-07, + "loss": 0.2621, + "step": 35374 + }, + { + "epoch": 0.8951843510387935, + "grad_norm": 3.220569133758545, + "learning_rate": 2.740515524230408e-07, + "loss": 0.1041, + "step": 35375 + }, + { + "epoch": 0.8952096566034871, + "grad_norm": 2.6396102905273438, + "learning_rate": 2.739204618207686e-07, + "loss": 0.1429, + "step": 35376 + }, + { + "epoch": 0.8952349621681808, + "grad_norm": 5.214089393615723, + "learning_rate": 2.7378940169610344e-07, + "loss": 0.1426, + "step": 35377 + }, + { + "epoch": 0.8952602677328745, + "grad_norm": 7.944273948669434, + "learning_rate": 2.736583720498925e-07, + "loss": 0.1933, + "step": 35378 + }, + { + "epoch": 0.8952855732975681, + "grad_norm": 8.38927173614502, + "learning_rate": 2.7352737288297905e-07, + "loss": 0.2038, + "step": 35379 + }, + { + "epoch": 0.8953108788622618, + "grad_norm": 10.54992389678955, + "learning_rate": 2.7339640419620897e-07, + "loss": 0.2071, + "step": 35380 + }, + { + "epoch": 0.8953361844269555, + "grad_norm": 5.005740642547607, + "learning_rate": 2.732654659904255e-07, + "loss": 0.1488, + "step": 35381 + }, + { + "epoch": 0.8953614899916492, + "grad_norm": 3.268481969833374, + "learning_rate": 2.731345582664735e-07, + "loss": 0.0704, + "step": 35382 + }, + { + "epoch": 0.8953867955563428, + "grad_norm": 5.353668212890625, + "learning_rate": 2.7300368102519847e-07, + "loss": 0.1234, + "step": 35383 + }, + { + "epoch": 0.8954121011210365, + "grad_norm": 4.310808181762695, + "learning_rate": 2.72872834267443e-07, + "loss": 0.1584, + "step": 35384 + }, + { + "epoch": 0.8954374066857302, + "grad_norm": 4.299734592437744, + "learning_rate": 2.7274201799405156e-07, + "loss": 0.1563, + "step": 35385 + }, + { + "epoch": 0.8954627122504238, + "grad_norm": 6.3610453605651855, + "learning_rate": 2.726112322058666e-07, + "loss": 0.1977, + "step": 35386 + }, + { + "epoch": 0.8954880178151176, + "grad_norm": 4.164609909057617, + "learning_rate": 2.724804769037331e-07, + "loss": 0.1789, + "step": 35387 + }, + { + "epoch": 0.8955133233798113, + "grad_norm": 2.995391845703125, + "learning_rate": 2.723497520884938e-07, + "loss": 0.1109, + "step": 35388 + }, + { + "epoch": 0.8955386289445049, + "grad_norm": 4.739150047302246, + "learning_rate": 2.7221905776099124e-07, + "loss": 0.0855, + "step": 35389 + }, + { + "epoch": 0.8955639345091986, + "grad_norm": 2.8005316257476807, + "learning_rate": 2.720883939220681e-07, + "loss": 0.1019, + "step": 35390 + }, + { + "epoch": 0.8955892400738923, + "grad_norm": 7.4145121574401855, + "learning_rate": 2.7195776057256816e-07, + "loss": 0.1874, + "step": 35391 + }, + { + "epoch": 0.8956145456385859, + "grad_norm": 21.011856079101562, + "learning_rate": 2.718271577133325e-07, + "loss": 0.1878, + "step": 35392 + }, + { + "epoch": 0.8956398512032796, + "grad_norm": 6.745549201965332, + "learning_rate": 2.716965853452053e-07, + "loss": 0.1303, + "step": 35393 + }, + { + "epoch": 0.8956651567679733, + "grad_norm": 3.6671414375305176, + "learning_rate": 2.7156604346902595e-07, + "loss": 0.0834, + "step": 35394 + }, + { + "epoch": 0.8956904623326669, + "grad_norm": 4.951930046081543, + "learning_rate": 2.714355320856382e-07, + "loss": 0.2214, + "step": 35395 + }, + { + "epoch": 0.8957157678973606, + "grad_norm": 3.7978787422180176, + "learning_rate": 2.7130505119588257e-07, + "loss": 0.1375, + "step": 35396 + }, + { + "epoch": 0.8957410734620543, + "grad_norm": 2.768914222717285, + "learning_rate": 2.711746008006022e-07, + "loss": 0.0991, + "step": 35397 + }, + { + "epoch": 0.8957663790267479, + "grad_norm": 4.73993444442749, + "learning_rate": 2.710441809006359e-07, + "loss": 0.1208, + "step": 35398 + }, + { + "epoch": 0.8957916845914417, + "grad_norm": 3.4489874839782715, + "learning_rate": 2.7091379149682683e-07, + "loss": 0.1606, + "step": 35399 + }, + { + "epoch": 0.8958169901561354, + "grad_norm": 6.122307777404785, + "learning_rate": 2.707834325900144e-07, + "loss": 0.1707, + "step": 35400 + }, + { + "epoch": 0.895842295720829, + "grad_norm": 2.8222110271453857, + "learning_rate": 2.7065310418104116e-07, + "loss": 0.0922, + "step": 35401 + }, + { + "epoch": 0.8958676012855227, + "grad_norm": 3.0181267261505127, + "learning_rate": 2.705228062707449e-07, + "loss": 0.1182, + "step": 35402 + }, + { + "epoch": 0.8958929068502164, + "grad_norm": 4.2007341384887695, + "learning_rate": 2.7039253885996817e-07, + "loss": 0.1625, + "step": 35403 + }, + { + "epoch": 0.89591821241491, + "grad_norm": 6.7077531814575195, + "learning_rate": 2.7026230194954926e-07, + "loss": 0.2788, + "step": 35404 + }, + { + "epoch": 0.8959435179796037, + "grad_norm": 3.347005605697632, + "learning_rate": 2.701320955403297e-07, + "loss": 0.1283, + "step": 35405 + }, + { + "epoch": 0.8959688235442974, + "grad_norm": 2.558454990386963, + "learning_rate": 2.700019196331488e-07, + "loss": 0.0951, + "step": 35406 + }, + { + "epoch": 0.8959941291089911, + "grad_norm": 5.57507848739624, + "learning_rate": 2.698717742288454e-07, + "loss": 0.1286, + "step": 35407 + }, + { + "epoch": 0.8960194346736847, + "grad_norm": 6.146429538726807, + "learning_rate": 2.697416593282592e-07, + "loss": 0.164, + "step": 35408 + }, + { + "epoch": 0.8960447402383784, + "grad_norm": 8.839991569519043, + "learning_rate": 2.6961157493222814e-07, + "loss": 0.1954, + "step": 35409 + }, + { + "epoch": 0.8960700458030721, + "grad_norm": 9.251346588134766, + "learning_rate": 2.69481521041593e-07, + "loss": 0.2137, + "step": 35410 + }, + { + "epoch": 0.8960953513677657, + "grad_norm": 5.856932163238525, + "learning_rate": 2.6935149765719147e-07, + "loss": 0.1573, + "step": 35411 + }, + { + "epoch": 0.8961206569324595, + "grad_norm": 2.940523386001587, + "learning_rate": 2.692215047798624e-07, + "loss": 0.128, + "step": 35412 + }, + { + "epoch": 0.8961459624971532, + "grad_norm": 12.127787590026855, + "learning_rate": 2.690915424104429e-07, + "loss": 0.1423, + "step": 35413 + }, + { + "epoch": 0.8961712680618468, + "grad_norm": 3.5926642417907715, + "learning_rate": 2.689616105497728e-07, + "loss": 0.1538, + "step": 35414 + }, + { + "epoch": 0.8961965736265405, + "grad_norm": 3.5564231872558594, + "learning_rate": 2.6883170919868975e-07, + "loss": 0.1913, + "step": 35415 + }, + { + "epoch": 0.8962218791912342, + "grad_norm": 7.80712890625, + "learning_rate": 2.687018383580303e-07, + "loss": 0.2043, + "step": 35416 + }, + { + "epoch": 0.8962471847559278, + "grad_norm": 4.933343410491943, + "learning_rate": 2.685719980286322e-07, + "loss": 0.1079, + "step": 35417 + }, + { + "epoch": 0.8962724903206215, + "grad_norm": 4.12880277633667, + "learning_rate": 2.684421882113336e-07, + "loss": 0.1128, + "step": 35418 + }, + { + "epoch": 0.8962977958853152, + "grad_norm": 5.462893962860107, + "learning_rate": 2.6831240890697106e-07, + "loss": 0.1778, + "step": 35419 + }, + { + "epoch": 0.8963231014500088, + "grad_norm": 11.633796691894531, + "learning_rate": 2.681826601163823e-07, + "loss": 0.259, + "step": 35420 + }, + { + "epoch": 0.8963484070147025, + "grad_norm": 3.2303316593170166, + "learning_rate": 2.680529418404021e-07, + "loss": 0.1408, + "step": 35421 + }, + { + "epoch": 0.8963737125793962, + "grad_norm": 3.289133310317993, + "learning_rate": 2.679232540798693e-07, + "loss": 0.0634, + "step": 35422 + }, + { + "epoch": 0.8963990181440898, + "grad_norm": 3.87886118888855, + "learning_rate": 2.6779359683561824e-07, + "loss": 0.1062, + "step": 35423 + }, + { + "epoch": 0.8964243237087836, + "grad_norm": 4.632144451141357, + "learning_rate": 2.6766397010848767e-07, + "loss": 0.1092, + "step": 35424 + }, + { + "epoch": 0.8964496292734773, + "grad_norm": 4.733870983123779, + "learning_rate": 2.6753437389931023e-07, + "loss": 0.1337, + "step": 35425 + }, + { + "epoch": 0.8964749348381709, + "grad_norm": 6.824088096618652, + "learning_rate": 2.674048082089242e-07, + "loss": 0.1803, + "step": 35426 + }, + { + "epoch": 0.8965002404028646, + "grad_norm": 3.603142261505127, + "learning_rate": 2.672752730381634e-07, + "loss": 0.1058, + "step": 35427 + }, + { + "epoch": 0.8965255459675583, + "grad_norm": 3.908867359161377, + "learning_rate": 2.6714576838786534e-07, + "loss": 0.0541, + "step": 35428 + }, + { + "epoch": 0.8965508515322519, + "grad_norm": 5.411487102508545, + "learning_rate": 2.6701629425886277e-07, + "loss": 0.1373, + "step": 35429 + }, + { + "epoch": 0.8965761570969456, + "grad_norm": 4.061260223388672, + "learning_rate": 2.6688685065199227e-07, + "loss": 0.1061, + "step": 35430 + }, + { + "epoch": 0.8966014626616393, + "grad_norm": 5.870485305786133, + "learning_rate": 2.66757437568087e-07, + "loss": 0.1348, + "step": 35431 + }, + { + "epoch": 0.8966267682263329, + "grad_norm": 4.817892074584961, + "learning_rate": 2.6662805500798417e-07, + "loss": 0.124, + "step": 35432 + }, + { + "epoch": 0.8966520737910266, + "grad_norm": 3.725783348083496, + "learning_rate": 2.6649870297251523e-07, + "loss": 0.1262, + "step": 35433 + }, + { + "epoch": 0.8966773793557203, + "grad_norm": 3.0012030601501465, + "learning_rate": 2.663693814625168e-07, + "loss": 0.0694, + "step": 35434 + }, + { + "epoch": 0.896702684920414, + "grad_norm": 4.58880615234375, + "learning_rate": 2.6624009047882094e-07, + "loss": 0.1596, + "step": 35435 + }, + { + "epoch": 0.8967279904851077, + "grad_norm": 7.007414817810059, + "learning_rate": 2.66110830022262e-07, + "loss": 0.1399, + "step": 35436 + }, + { + "epoch": 0.8967532960498014, + "grad_norm": 4.7832536697387695, + "learning_rate": 2.6598160009367436e-07, + "loss": 0.1472, + "step": 35437 + }, + { + "epoch": 0.8967786016144951, + "grad_norm": 6.936694145202637, + "learning_rate": 2.6585240069389065e-07, + "loss": 0.1993, + "step": 35438 + }, + { + "epoch": 0.8968039071791887, + "grad_norm": 8.372532844543457, + "learning_rate": 2.6572323182374404e-07, + "loss": 0.1536, + "step": 35439 + }, + { + "epoch": 0.8968292127438824, + "grad_norm": 3.1685354709625244, + "learning_rate": 2.6559409348406727e-07, + "loss": 0.1235, + "step": 35440 + }, + { + "epoch": 0.8968545183085761, + "grad_norm": 4.393326759338379, + "learning_rate": 2.6546498567569414e-07, + "loss": 0.1758, + "step": 35441 + }, + { + "epoch": 0.8968798238732697, + "grad_norm": 6.987244606018066, + "learning_rate": 2.6533590839945613e-07, + "loss": 0.0963, + "step": 35442 + }, + { + "epoch": 0.8969051294379634, + "grad_norm": 4.055020809173584, + "learning_rate": 2.6520686165618703e-07, + "loss": 0.0789, + "step": 35443 + }, + { + "epoch": 0.8969304350026571, + "grad_norm": 8.934395790100098, + "learning_rate": 2.6507784544671677e-07, + "loss": 0.191, + "step": 35444 + }, + { + "epoch": 0.8969557405673507, + "grad_norm": 5.070243835449219, + "learning_rate": 2.649488597718797e-07, + "loss": 0.1272, + "step": 35445 + }, + { + "epoch": 0.8969810461320444, + "grad_norm": 3.334408760070801, + "learning_rate": 2.6481990463250675e-07, + "loss": 0.1301, + "step": 35446 + }, + { + "epoch": 0.8970063516967381, + "grad_norm": 6.885415554046631, + "learning_rate": 2.646909800294289e-07, + "loss": 0.1563, + "step": 35447 + }, + { + "epoch": 0.8970316572614317, + "grad_norm": 8.014158248901367, + "learning_rate": 2.645620859634779e-07, + "loss": 0.1444, + "step": 35448 + }, + { + "epoch": 0.8970569628261255, + "grad_norm": 5.475250244140625, + "learning_rate": 2.644332224354856e-07, + "loss": 0.2219, + "step": 35449 + }, + { + "epoch": 0.8970822683908192, + "grad_norm": 3.079771041870117, + "learning_rate": 2.643043894462827e-07, + "loss": 0.1381, + "step": 35450 + }, + { + "epoch": 0.8971075739555128, + "grad_norm": 5.799921989440918, + "learning_rate": 2.6417558699669997e-07, + "loss": 0.1402, + "step": 35451 + }, + { + "epoch": 0.8971328795202065, + "grad_norm": 4.438066482543945, + "learning_rate": 2.6404681508756693e-07, + "loss": 0.1056, + "step": 35452 + }, + { + "epoch": 0.8971581850849002, + "grad_norm": 2.7077796459198, + "learning_rate": 2.639180737197161e-07, + "loss": 0.0724, + "step": 35453 + }, + { + "epoch": 0.8971834906495938, + "grad_norm": 2.8103346824645996, + "learning_rate": 2.637893628939764e-07, + "loss": 0.107, + "step": 35454 + }, + { + "epoch": 0.8972087962142875, + "grad_norm": 5.225799083709717, + "learning_rate": 2.6366068261117816e-07, + "loss": 0.1606, + "step": 35455 + }, + { + "epoch": 0.8972341017789812, + "grad_norm": 6.537059307098389, + "learning_rate": 2.635320328721502e-07, + "loss": 0.1527, + "step": 35456 + }, + { + "epoch": 0.8972594073436748, + "grad_norm": 5.808399200439453, + "learning_rate": 2.6340341367772417e-07, + "loss": 0.1241, + "step": 35457 + }, + { + "epoch": 0.8972847129083685, + "grad_norm": 4.873178482055664, + "learning_rate": 2.632748250287276e-07, + "loss": 0.1619, + "step": 35458 + }, + { + "epoch": 0.8973100184730622, + "grad_norm": 8.835505485534668, + "learning_rate": 2.631462669259921e-07, + "loss": 0.2255, + "step": 35459 + }, + { + "epoch": 0.897335324037756, + "grad_norm": 6.7884016036987305, + "learning_rate": 2.630177393703437e-07, + "loss": 0.2453, + "step": 35460 + }, + { + "epoch": 0.8973606296024496, + "grad_norm": 5.447484016418457, + "learning_rate": 2.6288924236261337e-07, + "loss": 0.1353, + "step": 35461 + }, + { + "epoch": 0.8973859351671433, + "grad_norm": 11.446261405944824, + "learning_rate": 2.627607759036288e-07, + "loss": 0.2206, + "step": 35462 + }, + { + "epoch": 0.897411240731837, + "grad_norm": 14.261744499206543, + "learning_rate": 2.626323399942199e-07, + "loss": 0.2771, + "step": 35463 + }, + { + "epoch": 0.8974365462965306, + "grad_norm": 3.7584760189056396, + "learning_rate": 2.625039346352126e-07, + "loss": 0.096, + "step": 35464 + }, + { + "epoch": 0.8974618518612243, + "grad_norm": 2.1956708431243896, + "learning_rate": 2.6237555982743635e-07, + "loss": 0.0904, + "step": 35465 + }, + { + "epoch": 0.897487157425918, + "grad_norm": 4.544480800628662, + "learning_rate": 2.622472155717193e-07, + "loss": 0.109, + "step": 35466 + }, + { + "epoch": 0.8975124629906116, + "grad_norm": 4.309284210205078, + "learning_rate": 2.62118901868888e-07, + "loss": 0.1793, + "step": 35467 + }, + { + "epoch": 0.8975377685553053, + "grad_norm": 4.4459614753723145, + "learning_rate": 2.6199061871977127e-07, + "loss": 0.1338, + "step": 35468 + }, + { + "epoch": 0.897563074119999, + "grad_norm": 3.8818552494049072, + "learning_rate": 2.6186236612519565e-07, + "loss": 0.1392, + "step": 35469 + }, + { + "epoch": 0.8975883796846926, + "grad_norm": 3.346857786178589, + "learning_rate": 2.617341440859883e-07, + "loss": 0.1709, + "step": 35470 + }, + { + "epoch": 0.8976136852493863, + "grad_norm": 8.260749816894531, + "learning_rate": 2.616059526029752e-07, + "loss": 0.0894, + "step": 35471 + }, + { + "epoch": 0.89763899081408, + "grad_norm": 15.906898498535156, + "learning_rate": 2.614777916769845e-07, + "loss": 0.1402, + "step": 35472 + }, + { + "epoch": 0.8976642963787737, + "grad_norm": 3.1078591346740723, + "learning_rate": 2.6134966130884233e-07, + "loss": 0.1178, + "step": 35473 + }, + { + "epoch": 0.8976896019434674, + "grad_norm": 11.80271053314209, + "learning_rate": 2.6122156149937516e-07, + "loss": 0.278, + "step": 35474 + }, + { + "epoch": 0.8977149075081611, + "grad_norm": 9.123278617858887, + "learning_rate": 2.610934922494074e-07, + "loss": 0.246, + "step": 35475 + }, + { + "epoch": 0.8977402130728547, + "grad_norm": 5.412580490112305, + "learning_rate": 2.6096545355976666e-07, + "loss": 0.1468, + "step": 35476 + }, + { + "epoch": 0.8977655186375484, + "grad_norm": 3.346302032470703, + "learning_rate": 2.60837445431279e-07, + "loss": 0.1653, + "step": 35477 + }, + { + "epoch": 0.8977908242022421, + "grad_norm": 7.87949275970459, + "learning_rate": 2.607094678647687e-07, + "loss": 0.1693, + "step": 35478 + }, + { + "epoch": 0.8978161297669357, + "grad_norm": 3.849370241165161, + "learning_rate": 2.605815208610607e-07, + "loss": 0.1188, + "step": 35479 + }, + { + "epoch": 0.8978414353316294, + "grad_norm": 3.7993314266204834, + "learning_rate": 2.6045360442098155e-07, + "loss": 0.1203, + "step": 35480 + }, + { + "epoch": 0.8978667408963231, + "grad_norm": 10.424320220947266, + "learning_rate": 2.6032571854535505e-07, + "loss": 0.196, + "step": 35481 + }, + { + "epoch": 0.8978920464610167, + "grad_norm": 3.559335708618164, + "learning_rate": 2.6019786323500716e-07, + "loss": 0.0949, + "step": 35482 + }, + { + "epoch": 0.8979173520257104, + "grad_norm": 10.3532075881958, + "learning_rate": 2.600700384907606e-07, + "loss": 0.2015, + "step": 35483 + }, + { + "epoch": 0.8979426575904041, + "grad_norm": 7.221095561981201, + "learning_rate": 2.599422443134414e-07, + "loss": 0.2599, + "step": 35484 + }, + { + "epoch": 0.8979679631550979, + "grad_norm": 4.945962429046631, + "learning_rate": 2.598144807038727e-07, + "loss": 0.1978, + "step": 35485 + }, + { + "epoch": 0.8979932687197915, + "grad_norm": 7.081312656402588, + "learning_rate": 2.5968674766287896e-07, + "loss": 0.1497, + "step": 35486 + }, + { + "epoch": 0.8980185742844852, + "grad_norm": 3.6181540489196777, + "learning_rate": 2.595590451912827e-07, + "loss": 0.1231, + "step": 35487 + }, + { + "epoch": 0.8980438798491789, + "grad_norm": 3.7299513816833496, + "learning_rate": 2.59431373289909e-07, + "loss": 0.1371, + "step": 35488 + }, + { + "epoch": 0.8980691854138725, + "grad_norm": 15.006853103637695, + "learning_rate": 2.593037319595798e-07, + "loss": 0.1907, + "step": 35489 + }, + { + "epoch": 0.8980944909785662, + "grad_norm": 7.342495918273926, + "learning_rate": 2.591761212011207e-07, + "loss": 0.2029, + "step": 35490 + }, + { + "epoch": 0.8981197965432599, + "grad_norm": 3.854254961013794, + "learning_rate": 2.590485410153515e-07, + "loss": 0.1145, + "step": 35491 + }, + { + "epoch": 0.8981451021079535, + "grad_norm": 12.236380577087402, + "learning_rate": 2.5892099140309714e-07, + "loss": 0.0729, + "step": 35492 + }, + { + "epoch": 0.8981704076726472, + "grad_norm": 4.189363956451416, + "learning_rate": 2.5879347236517914e-07, + "loss": 0.1711, + "step": 35493 + }, + { + "epoch": 0.8981957132373409, + "grad_norm": 4.598147869110107, + "learning_rate": 2.5866598390242023e-07, + "loss": 0.1815, + "step": 35494 + }, + { + "epoch": 0.8982210188020345, + "grad_norm": 3.673161268234253, + "learning_rate": 2.5853852601564145e-07, + "loss": 0.1231, + "step": 35495 + }, + { + "epoch": 0.8982463243667282, + "grad_norm": 5.278921127319336, + "learning_rate": 2.584110987056671e-07, + "loss": 0.1897, + "step": 35496 + }, + { + "epoch": 0.898271629931422, + "grad_norm": 3.127835988998413, + "learning_rate": 2.5828370197331697e-07, + "loss": 0.1006, + "step": 35497 + }, + { + "epoch": 0.8982969354961156, + "grad_norm": 4.201643943786621, + "learning_rate": 2.581563358194128e-07, + "loss": 0.1672, + "step": 35498 + }, + { + "epoch": 0.8983222410608093, + "grad_norm": 11.75771427154541, + "learning_rate": 2.580290002447772e-07, + "loss": 0.2773, + "step": 35499 + }, + { + "epoch": 0.898347546625503, + "grad_norm": 6.572328567504883, + "learning_rate": 2.5790169525023e-07, + "loss": 0.189, + "step": 35500 + }, + { + "epoch": 0.8983728521901966, + "grad_norm": 6.014374256134033, + "learning_rate": 2.577744208365929e-07, + "loss": 0.1037, + "step": 35501 + }, + { + "epoch": 0.8983981577548903, + "grad_norm": 16.027732849121094, + "learning_rate": 2.576471770046862e-07, + "loss": 0.3152, + "step": 35502 + }, + { + "epoch": 0.898423463319584, + "grad_norm": 6.216676712036133, + "learning_rate": 2.575199637553316e-07, + "loss": 0.1334, + "step": 35503 + }, + { + "epoch": 0.8984487688842776, + "grad_norm": 4.6817193031311035, + "learning_rate": 2.573927810893473e-07, + "loss": 0.1616, + "step": 35504 + }, + { + "epoch": 0.8984740744489713, + "grad_norm": 2.934649705886841, + "learning_rate": 2.572656290075554e-07, + "loss": 0.1067, + "step": 35505 + }, + { + "epoch": 0.898499380013665, + "grad_norm": 5.723008632659912, + "learning_rate": 2.571385075107746e-07, + "loss": 0.1911, + "step": 35506 + }, + { + "epoch": 0.8985246855783586, + "grad_norm": 4.947291851043701, + "learning_rate": 2.5701141659982664e-07, + "loss": 0.1515, + "step": 35507 + }, + { + "epoch": 0.8985499911430523, + "grad_norm": 3.9402031898498535, + "learning_rate": 2.568843562755285e-07, + "loss": 0.1492, + "step": 35508 + }, + { + "epoch": 0.898575296707746, + "grad_norm": 8.553339958190918, + "learning_rate": 2.567573265387019e-07, + "loss": 0.235, + "step": 35509 + }, + { + "epoch": 0.8986006022724398, + "grad_norm": 9.637630462646484, + "learning_rate": 2.566303273901638e-07, + "loss": 0.2531, + "step": 35510 + }, + { + "epoch": 0.8986259078371334, + "grad_norm": 6.16955041885376, + "learning_rate": 2.565033588307353e-07, + "loss": 0.1541, + "step": 35511 + }, + { + "epoch": 0.8986512134018271, + "grad_norm": 5.012109756469727, + "learning_rate": 2.5637642086123404e-07, + "loss": 0.144, + "step": 35512 + }, + { + "epoch": 0.8986765189665208, + "grad_norm": 2.7457923889160156, + "learning_rate": 2.562495134824794e-07, + "loss": 0.0957, + "step": 35513 + }, + { + "epoch": 0.8987018245312144, + "grad_norm": 8.420550346374512, + "learning_rate": 2.561226366952879e-07, + "loss": 0.1221, + "step": 35514 + }, + { + "epoch": 0.8987271300959081, + "grad_norm": 7.1836256980896, + "learning_rate": 2.5599579050048005e-07, + "loss": 0.1231, + "step": 35515 + }, + { + "epoch": 0.8987524356606018, + "grad_norm": 4.1659064292907715, + "learning_rate": 2.5586897489887296e-07, + "loss": 0.1238, + "step": 35516 + }, + { + "epoch": 0.8987777412252954, + "grad_norm": 5.748912334442139, + "learning_rate": 2.557421898912843e-07, + "loss": 0.1702, + "step": 35517 + }, + { + "epoch": 0.8988030467899891, + "grad_norm": 3.7132372856140137, + "learning_rate": 2.556154354785312e-07, + "loss": 0.1481, + "step": 35518 + }, + { + "epoch": 0.8988283523546828, + "grad_norm": 2.7188353538513184, + "learning_rate": 2.5548871166143183e-07, + "loss": 0.0775, + "step": 35519 + }, + { + "epoch": 0.8988536579193764, + "grad_norm": 9.670207023620605, + "learning_rate": 2.55362018440804e-07, + "loss": 0.2511, + "step": 35520 + }, + { + "epoch": 0.8988789634840701, + "grad_norm": 4.20150899887085, + "learning_rate": 2.552353558174636e-07, + "loss": 0.1825, + "step": 35521 + }, + { + "epoch": 0.8989042690487639, + "grad_norm": 6.045112609863281, + "learning_rate": 2.551087237922267e-07, + "loss": 0.1782, + "step": 35522 + }, + { + "epoch": 0.8989295746134575, + "grad_norm": 3.6761598587036133, + "learning_rate": 2.5498212236591214e-07, + "loss": 0.1328, + "step": 35523 + }, + { + "epoch": 0.8989548801781512, + "grad_norm": 4.543006896972656, + "learning_rate": 2.5485555153933526e-07, + "loss": 0.1142, + "step": 35524 + }, + { + "epoch": 0.8989801857428449, + "grad_norm": 10.720539093017578, + "learning_rate": 2.5472901131331216e-07, + "loss": 0.1338, + "step": 35525 + }, + { + "epoch": 0.8990054913075385, + "grad_norm": 4.629303455352783, + "learning_rate": 2.546025016886583e-07, + "loss": 0.1799, + "step": 35526 + }, + { + "epoch": 0.8990307968722322, + "grad_norm": 4.036567687988281, + "learning_rate": 2.5447602266619073e-07, + "loss": 0.1665, + "step": 35527 + }, + { + "epoch": 0.8990561024369259, + "grad_norm": 8.292533874511719, + "learning_rate": 2.543495742467245e-07, + "loss": 0.1896, + "step": 35528 + }, + { + "epoch": 0.8990814080016195, + "grad_norm": 6.996102809906006, + "learning_rate": 2.542231564310754e-07, + "loss": 0.1605, + "step": 35529 + }, + { + "epoch": 0.8991067135663132, + "grad_norm": 4.382844924926758, + "learning_rate": 2.5409676922005854e-07, + "loss": 0.1291, + "step": 35530 + }, + { + "epoch": 0.8991320191310069, + "grad_norm": 3.923847198486328, + "learning_rate": 2.539704126144876e-07, + "loss": 0.0984, + "step": 35531 + }, + { + "epoch": 0.8991573246957005, + "grad_norm": 13.54673957824707, + "learning_rate": 2.538440866151798e-07, + "loss": 0.1778, + "step": 35532 + }, + { + "epoch": 0.8991826302603942, + "grad_norm": 6.3824896812438965, + "learning_rate": 2.537177912229477e-07, + "loss": 0.2132, + "step": 35533 + }, + { + "epoch": 0.899207935825088, + "grad_norm": 11.535712242126465, + "learning_rate": 2.5359152643860797e-07, + "loss": 0.2538, + "step": 35534 + }, + { + "epoch": 0.8992332413897817, + "grad_norm": 4.187595367431641, + "learning_rate": 2.534652922629721e-07, + "loss": 0.161, + "step": 35535 + }, + { + "epoch": 0.8992585469544753, + "grad_norm": 3.702410936355591, + "learning_rate": 2.5333908869685675e-07, + "loss": 0.1592, + "step": 35536 + }, + { + "epoch": 0.899283852519169, + "grad_norm": 5.456546306610107, + "learning_rate": 2.5321291574107396e-07, + "loss": 0.2283, + "step": 35537 + }, + { + "epoch": 0.8993091580838627, + "grad_norm": 6.611532688140869, + "learning_rate": 2.5308677339643926e-07, + "loss": 0.2023, + "step": 35538 + }, + { + "epoch": 0.8993344636485563, + "grad_norm": 2.83791446685791, + "learning_rate": 2.5296066166376363e-07, + "loss": 0.1059, + "step": 35539 + }, + { + "epoch": 0.89935976921325, + "grad_norm": 3.3213138580322266, + "learning_rate": 2.5283458054386256e-07, + "loss": 0.1711, + "step": 35540 + }, + { + "epoch": 0.8993850747779437, + "grad_norm": 7.123454570770264, + "learning_rate": 2.52708530037547e-07, + "loss": 0.1655, + "step": 35541 + }, + { + "epoch": 0.8994103803426373, + "grad_norm": 7.768111705780029, + "learning_rate": 2.525825101456325e-07, + "loss": 0.1457, + "step": 35542 + }, + { + "epoch": 0.899435685907331, + "grad_norm": 3.373042583465576, + "learning_rate": 2.524565208689295e-07, + "loss": 0.1227, + "step": 35543 + }, + { + "epoch": 0.8994609914720247, + "grad_norm": 3.837996006011963, + "learning_rate": 2.523305622082517e-07, + "loss": 0.0899, + "step": 35544 + }, + { + "epoch": 0.8994862970367183, + "grad_norm": 7.139357089996338, + "learning_rate": 2.5220463416441023e-07, + "loss": 0.2505, + "step": 35545 + }, + { + "epoch": 0.899511602601412, + "grad_norm": 2.9378135204315186, + "learning_rate": 2.520787367382188e-07, + "loss": 0.1052, + "step": 35546 + }, + { + "epoch": 0.8995369081661058, + "grad_norm": 8.758072853088379, + "learning_rate": 2.519528699304885e-07, + "loss": 0.1858, + "step": 35547 + }, + { + "epoch": 0.8995622137307994, + "grad_norm": 5.937519073486328, + "learning_rate": 2.5182703374203086e-07, + "loss": 0.184, + "step": 35548 + }, + { + "epoch": 0.8995875192954931, + "grad_norm": 7.010380744934082, + "learning_rate": 2.517012281736564e-07, + "loss": 0.1814, + "step": 35549 + }, + { + "epoch": 0.8996128248601868, + "grad_norm": 3.5749077796936035, + "learning_rate": 2.515754532261783e-07, + "loss": 0.1273, + "step": 35550 + }, + { + "epoch": 0.8996381304248804, + "grad_norm": 4.8489298820495605, + "learning_rate": 2.5144970890040697e-07, + "loss": 0.1778, + "step": 35551 + }, + { + "epoch": 0.8996634359895741, + "grad_norm": 4.7627668380737305, + "learning_rate": 2.51323995197153e-07, + "loss": 0.1593, + "step": 35552 + }, + { + "epoch": 0.8996887415542678, + "grad_norm": 5.633224964141846, + "learning_rate": 2.511983121172268e-07, + "loss": 0.1447, + "step": 35553 + }, + { + "epoch": 0.8997140471189614, + "grad_norm": 19.81220054626465, + "learning_rate": 2.510726596614399e-07, + "loss": 0.2132, + "step": 35554 + }, + { + "epoch": 0.8997393526836551, + "grad_norm": 9.161941528320312, + "learning_rate": 2.5094703783060226e-07, + "loss": 0.1929, + "step": 35555 + }, + { + "epoch": 0.8997646582483488, + "grad_norm": 6.587760925292969, + "learning_rate": 2.508214466255232e-07, + "loss": 0.225, + "step": 35556 + }, + { + "epoch": 0.8997899638130424, + "grad_norm": 1.8560670614242554, + "learning_rate": 2.5069588604701376e-07, + "loss": 0.0699, + "step": 35557 + }, + { + "epoch": 0.8998152693777361, + "grad_norm": 3.0925779342651367, + "learning_rate": 2.505703560958822e-07, + "loss": 0.122, + "step": 35558 + }, + { + "epoch": 0.8998405749424299, + "grad_norm": 6.477499961853027, + "learning_rate": 2.5044485677294006e-07, + "loss": 0.1437, + "step": 35559 + }, + { + "epoch": 0.8998658805071235, + "grad_norm": 2.9860825538635254, + "learning_rate": 2.50319388078995e-07, + "loss": 0.1222, + "step": 35560 + }, + { + "epoch": 0.8998911860718172, + "grad_norm": 3.8046181201934814, + "learning_rate": 2.5019395001485704e-07, + "loss": 0.1831, + "step": 35561 + }, + { + "epoch": 0.8999164916365109, + "grad_norm": 4.787990093231201, + "learning_rate": 2.500685425813337e-07, + "loss": 0.105, + "step": 35562 + }, + { + "epoch": 0.8999417972012046, + "grad_norm": 3.839587688446045, + "learning_rate": 2.499431657792356e-07, + "loss": 0.1572, + "step": 35563 + }, + { + "epoch": 0.8999671027658982, + "grad_norm": 15.137746810913086, + "learning_rate": 2.4981781960936977e-07, + "loss": 0.1757, + "step": 35564 + }, + { + "epoch": 0.8999924083305919, + "grad_norm": 6.411746978759766, + "learning_rate": 2.4969250407254677e-07, + "loss": 0.2374, + "step": 35565 + }, + { + "epoch": 0.9000177138952856, + "grad_norm": 3.9548423290252686, + "learning_rate": 2.495672191695719e-07, + "loss": 0.1734, + "step": 35566 + }, + { + "epoch": 0.9000430194599792, + "grad_norm": 3.0661962032318115, + "learning_rate": 2.494419649012547e-07, + "loss": 0.1065, + "step": 35567 + }, + { + "epoch": 0.9000683250246729, + "grad_norm": 5.783027172088623, + "learning_rate": 2.4931674126840224e-07, + "loss": 0.1463, + "step": 35568 + }, + { + "epoch": 0.9000683250246729, + "eval_loss": 0.16171902418136597, + "eval_runtime": 69.8555, + "eval_samples_per_second": 45.723, + "eval_steps_per_second": 5.726, + "step": 35568 + }, + { + "epoch": 0.9000936305893666, + "grad_norm": 3.471054792404175, + "learning_rate": 2.491915482718238e-07, + "loss": 0.0688, + "step": 35569 + }, + { + "epoch": 0.9001189361540602, + "grad_norm": 5.655165672302246, + "learning_rate": 2.490663859123238e-07, + "loss": 0.1692, + "step": 35570 + }, + { + "epoch": 0.900144241718754, + "grad_norm": 13.989602088928223, + "learning_rate": 2.489412541907116e-07, + "loss": 0.2192, + "step": 35571 + }, + { + "epoch": 0.9001695472834477, + "grad_norm": 5.285205841064453, + "learning_rate": 2.488161531077926e-07, + "loss": 0.1995, + "step": 35572 + }, + { + "epoch": 0.9001948528481413, + "grad_norm": 5.293964385986328, + "learning_rate": 2.486910826643757e-07, + "loss": 0.1328, + "step": 35573 + }, + { + "epoch": 0.900220158412835, + "grad_norm": 9.33894157409668, + "learning_rate": 2.4856604286126575e-07, + "loss": 0.1437, + "step": 35574 + }, + { + "epoch": 0.9002454639775287, + "grad_norm": 10.775613784790039, + "learning_rate": 2.4844103369926986e-07, + "loss": 0.1801, + "step": 35575 + }, + { + "epoch": 0.9002707695422223, + "grad_norm": 4.450692653656006, + "learning_rate": 2.4831605517919297e-07, + "loss": 0.0919, + "step": 35576 + }, + { + "epoch": 0.900296075106916, + "grad_norm": 6.152518272399902, + "learning_rate": 2.4819110730184225e-07, + "loss": 0.184, + "step": 35577 + }, + { + "epoch": 0.9003213806716097, + "grad_norm": 6.993545055389404, + "learning_rate": 2.480661900680237e-07, + "loss": 0.2046, + "step": 35578 + }, + { + "epoch": 0.9003466862363033, + "grad_norm": 6.9826741218566895, + "learning_rate": 2.4794130347854217e-07, + "loss": 0.1802, + "step": 35579 + }, + { + "epoch": 0.900371991800997, + "grad_norm": 2.2661449909210205, + "learning_rate": 2.4781644753420264e-07, + "loss": 0.0811, + "step": 35580 + }, + { + "epoch": 0.9003972973656907, + "grad_norm": 2.6630730628967285, + "learning_rate": 2.4769162223581167e-07, + "loss": 0.1454, + "step": 35581 + }, + { + "epoch": 0.9004226029303843, + "grad_norm": 5.395758628845215, + "learning_rate": 2.475668275841736e-07, + "loss": 0.169, + "step": 35582 + }, + { + "epoch": 0.900447908495078, + "grad_norm": 8.45239543914795, + "learning_rate": 2.474420635800928e-07, + "loss": 0.2025, + "step": 35583 + }, + { + "epoch": 0.9004732140597718, + "grad_norm": 6.399075031280518, + "learning_rate": 2.473173302243731e-07, + "loss": 0.1436, + "step": 35584 + }, + { + "epoch": 0.9004985196244654, + "grad_norm": 7.583449363708496, + "learning_rate": 2.47192627517821e-07, + "loss": 0.0978, + "step": 35585 + }, + { + "epoch": 0.9005238251891591, + "grad_norm": 8.192292213439941, + "learning_rate": 2.470679554612393e-07, + "loss": 0.2144, + "step": 35586 + }, + { + "epoch": 0.9005491307538528, + "grad_norm": 9.969550132751465, + "learning_rate": 2.4694331405543224e-07, + "loss": 0.2016, + "step": 35587 + }, + { + "epoch": 0.9005744363185465, + "grad_norm": 5.091001510620117, + "learning_rate": 2.4681870330120374e-07, + "loss": 0.1696, + "step": 35588 + }, + { + "epoch": 0.9005997418832401, + "grad_norm": 4.6218976974487305, + "learning_rate": 2.4669412319935637e-07, + "loss": 0.1402, + "step": 35589 + }, + { + "epoch": 0.9006250474479338, + "grad_norm": 4.288212776184082, + "learning_rate": 2.4656957375069566e-07, + "loss": 0.13, + "step": 35590 + }, + { + "epoch": 0.9006503530126275, + "grad_norm": 3.0207245349884033, + "learning_rate": 2.464450549560227e-07, + "loss": 0.1263, + "step": 35591 + }, + { + "epoch": 0.9006756585773211, + "grad_norm": 3.465336799621582, + "learning_rate": 2.463205668161422e-07, + "loss": 0.1292, + "step": 35592 + }, + { + "epoch": 0.9007009641420148, + "grad_norm": 8.605428695678711, + "learning_rate": 2.461961093318549e-07, + "loss": 0.1149, + "step": 35593 + }, + { + "epoch": 0.9007262697067085, + "grad_norm": 5.791228294372559, + "learning_rate": 2.4607168250396554e-07, + "loss": 0.1284, + "step": 35594 + }, + { + "epoch": 0.9007515752714021, + "grad_norm": 5.450122356414795, + "learning_rate": 2.459472863332746e-07, + "loss": 0.1599, + "step": 35595 + }, + { + "epoch": 0.9007768808360959, + "grad_norm": 2.7963836193084717, + "learning_rate": 2.45822920820587e-07, + "loss": 0.1281, + "step": 35596 + }, + { + "epoch": 0.9008021864007896, + "grad_norm": 3.8020312786102295, + "learning_rate": 2.456985859667016e-07, + "loss": 0.1084, + "step": 35597 + }, + { + "epoch": 0.9008274919654832, + "grad_norm": 2.800060749053955, + "learning_rate": 2.4557428177242217e-07, + "loss": 0.1027, + "step": 35598 + }, + { + "epoch": 0.9008527975301769, + "grad_norm": 3.53226637840271, + "learning_rate": 2.4545000823854914e-07, + "loss": 0.1407, + "step": 35599 + }, + { + "epoch": 0.9008781030948706, + "grad_norm": 5.001906394958496, + "learning_rate": 2.453257653658864e-07, + "loss": 0.1412, + "step": 35600 + }, + { + "epoch": 0.9009034086595642, + "grad_norm": 5.072487831115723, + "learning_rate": 2.452015531552315e-07, + "loss": 0.1396, + "step": 35601 + }, + { + "epoch": 0.9009287142242579, + "grad_norm": 9.615306854248047, + "learning_rate": 2.4507737160738786e-07, + "loss": 0.2032, + "step": 35602 + }, + { + "epoch": 0.9009540197889516, + "grad_norm": 6.71140193939209, + "learning_rate": 2.449532207231559e-07, + "loss": 0.1972, + "step": 35603 + }, + { + "epoch": 0.9009793253536452, + "grad_norm": 3.922497034072876, + "learning_rate": 2.4482910050333596e-07, + "loss": 0.1656, + "step": 35604 + }, + { + "epoch": 0.9010046309183389, + "grad_norm": 5.114433288574219, + "learning_rate": 2.4470501094872865e-07, + "loss": 0.1041, + "step": 35605 + }, + { + "epoch": 0.9010299364830326, + "grad_norm": 7.413272380828857, + "learning_rate": 2.44580952060135e-07, + "loss": 0.1769, + "step": 35606 + }, + { + "epoch": 0.9010552420477262, + "grad_norm": 5.025491237640381, + "learning_rate": 2.444569238383526e-07, + "loss": 0.1418, + "step": 35607 + }, + { + "epoch": 0.90108054761242, + "grad_norm": 2.7210044860839844, + "learning_rate": 2.4433292628418426e-07, + "loss": 0.1178, + "step": 35608 + }, + { + "epoch": 0.9011058531771137, + "grad_norm": 4.256420612335205, + "learning_rate": 2.442089593984276e-07, + "loss": 0.1354, + "step": 35609 + }, + { + "epoch": 0.9011311587418073, + "grad_norm": 7.551236629486084, + "learning_rate": 2.440850231818831e-07, + "loss": 0.2517, + "step": 35610 + }, + { + "epoch": 0.901156464306501, + "grad_norm": 3.4041249752044678, + "learning_rate": 2.4396111763534904e-07, + "loss": 0.0907, + "step": 35611 + }, + { + "epoch": 0.9011817698711947, + "grad_norm": 6.262688159942627, + "learning_rate": 2.438372427596258e-07, + "loss": 0.1575, + "step": 35612 + }, + { + "epoch": 0.9012070754358884, + "grad_norm": 3.4994893074035645, + "learning_rate": 2.4371339855551125e-07, + "loss": 0.1497, + "step": 35613 + }, + { + "epoch": 0.901232381000582, + "grad_norm": 4.304073810577393, + "learning_rate": 2.43589585023804e-07, + "loss": 0.2048, + "step": 35614 + }, + { + "epoch": 0.9012576865652757, + "grad_norm": 3.925229072570801, + "learning_rate": 2.4346580216530305e-07, + "loss": 0.0838, + "step": 35615 + }, + { + "epoch": 0.9012829921299694, + "grad_norm": 4.302244186401367, + "learning_rate": 2.433420499808059e-07, + "loss": 0.0983, + "step": 35616 + }, + { + "epoch": 0.901308297694663, + "grad_norm": 2.518612861633301, + "learning_rate": 2.4321832847111147e-07, + "loss": 0.0984, + "step": 35617 + }, + { + "epoch": 0.9013336032593567, + "grad_norm": 5.316781997680664, + "learning_rate": 2.430946376370169e-07, + "loss": 0.2395, + "step": 35618 + }, + { + "epoch": 0.9013589088240505, + "grad_norm": 11.728362083435059, + "learning_rate": 2.4297097747932097e-07, + "loss": 0.2705, + "step": 35619 + }, + { + "epoch": 0.901384214388744, + "grad_norm": 10.35212516784668, + "learning_rate": 2.428473479988191e-07, + "loss": 0.167, + "step": 35620 + }, + { + "epoch": 0.9014095199534378, + "grad_norm": 5.787930488586426, + "learning_rate": 2.4272374919631025e-07, + "loss": 0.1572, + "step": 35621 + }, + { + "epoch": 0.9014348255181315, + "grad_norm": 9.389348030090332, + "learning_rate": 2.4260018107259145e-07, + "loss": 0.2177, + "step": 35622 + }, + { + "epoch": 0.9014601310828251, + "grad_norm": 10.803485870361328, + "learning_rate": 2.4247664362845926e-07, + "loss": 0.1523, + "step": 35623 + }, + { + "epoch": 0.9014854366475188, + "grad_norm": 14.90022087097168, + "learning_rate": 2.423531368647092e-07, + "loss": 0.1483, + "step": 35624 + }, + { + "epoch": 0.9015107422122125, + "grad_norm": 3.9809062480926514, + "learning_rate": 2.422296607821395e-07, + "loss": 0.1338, + "step": 35625 + }, + { + "epoch": 0.9015360477769061, + "grad_norm": 6.167342662811279, + "learning_rate": 2.421062153815451e-07, + "loss": 0.0929, + "step": 35626 + }, + { + "epoch": 0.9015613533415998, + "grad_norm": 8.542737007141113, + "learning_rate": 2.419828006637237e-07, + "loss": 0.1577, + "step": 35627 + }, + { + "epoch": 0.9015866589062935, + "grad_norm": 4.8411970138549805, + "learning_rate": 2.418594166294691e-07, + "loss": 0.1335, + "step": 35628 + }, + { + "epoch": 0.9016119644709871, + "grad_norm": 4.757746696472168, + "learning_rate": 2.41736063279579e-07, + "loss": 0.1202, + "step": 35629 + }, + { + "epoch": 0.9016372700356808, + "grad_norm": 5.186241149902344, + "learning_rate": 2.4161274061484665e-07, + "loss": 0.0922, + "step": 35630 + }, + { + "epoch": 0.9016625756003745, + "grad_norm": 4.279841423034668, + "learning_rate": 2.4148944863607027e-07, + "loss": 0.1454, + "step": 35631 + }, + { + "epoch": 0.9016878811650682, + "grad_norm": 4.3469672203063965, + "learning_rate": 2.4136618734404205e-07, + "loss": 0.1311, + "step": 35632 + }, + { + "epoch": 0.9017131867297619, + "grad_norm": 11.141952514648438, + "learning_rate": 2.412429567395591e-07, + "loss": 0.2146, + "step": 35633 + }, + { + "epoch": 0.9017384922944556, + "grad_norm": 9.933209419250488, + "learning_rate": 2.411197568234136e-07, + "loss": 0.2285, + "step": 35634 + }, + { + "epoch": 0.9017637978591492, + "grad_norm": 6.952983856201172, + "learning_rate": 2.4099658759640374e-07, + "loss": 0.2199, + "step": 35635 + }, + { + "epoch": 0.9017891034238429, + "grad_norm": 3.5773308277130127, + "learning_rate": 2.4087344905931955e-07, + "loss": 0.1028, + "step": 35636 + }, + { + "epoch": 0.9018144089885366, + "grad_norm": 5.256894111633301, + "learning_rate": 2.407503412129586e-07, + "loss": 0.156, + "step": 35637 + }, + { + "epoch": 0.9018397145532303, + "grad_norm": 3.333355665206909, + "learning_rate": 2.40627264058112e-07, + "loss": 0.108, + "step": 35638 + }, + { + "epoch": 0.9018650201179239, + "grad_norm": 4.764780044555664, + "learning_rate": 2.405042175955763e-07, + "loss": 0.1787, + "step": 35639 + }, + { + "epoch": 0.9018903256826176, + "grad_norm": 10.724242210388184, + "learning_rate": 2.403812018261431e-07, + "loss": 0.2891, + "step": 35640 + }, + { + "epoch": 0.9019156312473113, + "grad_norm": 5.474549770355225, + "learning_rate": 2.4025821675060624e-07, + "loss": 0.1503, + "step": 35641 + }, + { + "epoch": 0.9019409368120049, + "grad_norm": 9.141227722167969, + "learning_rate": 2.4013526236975894e-07, + "loss": 0.1906, + "step": 35642 + }, + { + "epoch": 0.9019662423766986, + "grad_norm": 6.075222969055176, + "learning_rate": 2.400123386843928e-07, + "loss": 0.1411, + "step": 35643 + }, + { + "epoch": 0.9019915479413924, + "grad_norm": 3.8904826641082764, + "learning_rate": 2.3988944569530273e-07, + "loss": 0.1475, + "step": 35644 + }, + { + "epoch": 0.902016853506086, + "grad_norm": 7.849964141845703, + "learning_rate": 2.397665834032803e-07, + "loss": 0.215, + "step": 35645 + }, + { + "epoch": 0.9020421590707797, + "grad_norm": 3.9788217544555664, + "learning_rate": 2.3964375180911715e-07, + "loss": 0.094, + "step": 35646 + }, + { + "epoch": 0.9020674646354734, + "grad_norm": 16.752517700195312, + "learning_rate": 2.3952095091360595e-07, + "loss": 0.2535, + "step": 35647 + }, + { + "epoch": 0.902092770200167, + "grad_norm": 2.4159984588623047, + "learning_rate": 2.393981807175388e-07, + "loss": 0.0893, + "step": 35648 + }, + { + "epoch": 0.9021180757648607, + "grad_norm": 3.984154462814331, + "learning_rate": 2.392754412217074e-07, + "loss": 0.134, + "step": 35649 + }, + { + "epoch": 0.9021433813295544, + "grad_norm": 12.71560287475586, + "learning_rate": 2.391527324269033e-07, + "loss": 0.1671, + "step": 35650 + }, + { + "epoch": 0.902168686894248, + "grad_norm": 3.2127349376678467, + "learning_rate": 2.3903005433391635e-07, + "loss": 0.0779, + "step": 35651 + }, + { + "epoch": 0.9021939924589417, + "grad_norm": 9.68676471710205, + "learning_rate": 2.389074069435404e-07, + "loss": 0.1668, + "step": 35652 + }, + { + "epoch": 0.9022192980236354, + "grad_norm": 5.039771556854248, + "learning_rate": 2.387847902565643e-07, + "loss": 0.1358, + "step": 35653 + }, + { + "epoch": 0.902244603588329, + "grad_norm": 6.4910383224487305, + "learning_rate": 2.3866220427378016e-07, + "loss": 0.1509, + "step": 35654 + }, + { + "epoch": 0.9022699091530227, + "grad_norm": 5.312354564666748, + "learning_rate": 2.385396489959768e-07, + "loss": 0.1854, + "step": 35655 + }, + { + "epoch": 0.9022952147177165, + "grad_norm": 6.864474296569824, + "learning_rate": 2.3841712442394582e-07, + "loss": 0.1177, + "step": 35656 + }, + { + "epoch": 0.90232052028241, + "grad_norm": 6.127755165100098, + "learning_rate": 2.3829463055847769e-07, + "loss": 0.1781, + "step": 35657 + }, + { + "epoch": 0.9023458258471038, + "grad_norm": 4.452311992645264, + "learning_rate": 2.381721674003612e-07, + "loss": 0.154, + "step": 35658 + }, + { + "epoch": 0.9023711314117975, + "grad_norm": 3.59521746635437, + "learning_rate": 2.380497349503863e-07, + "loss": 0.1035, + "step": 35659 + }, + { + "epoch": 0.9023964369764911, + "grad_norm": 3.874163866043091, + "learning_rate": 2.3792733320934348e-07, + "loss": 0.0883, + "step": 35660 + }, + { + "epoch": 0.9024217425411848, + "grad_norm": 6.925803184509277, + "learning_rate": 2.3780496217802097e-07, + "loss": 0.1293, + "step": 35661 + }, + { + "epoch": 0.9024470481058785, + "grad_norm": 2.9510653018951416, + "learning_rate": 2.3768262185720925e-07, + "loss": 0.0896, + "step": 35662 + }, + { + "epoch": 0.9024723536705722, + "grad_norm": 2.4380791187286377, + "learning_rate": 2.375603122476955e-07, + "loss": 0.1201, + "step": 35663 + }, + { + "epoch": 0.9024976592352658, + "grad_norm": 6.344302654266357, + "learning_rate": 2.3743803335027015e-07, + "loss": 0.1437, + "step": 35664 + }, + { + "epoch": 0.9025229647999595, + "grad_norm": 6.564198017120361, + "learning_rate": 2.3731578516571985e-07, + "loss": 0.2076, + "step": 35665 + }, + { + "epoch": 0.9025482703646532, + "grad_norm": 3.8075034618377686, + "learning_rate": 2.3719356769483615e-07, + "loss": 0.1649, + "step": 35666 + }, + { + "epoch": 0.9025735759293468, + "grad_norm": 4.190709590911865, + "learning_rate": 2.3707138093840342e-07, + "loss": 0.14, + "step": 35667 + }, + { + "epoch": 0.9025988814940406, + "grad_norm": 7.68266487121582, + "learning_rate": 2.3694922489721216e-07, + "loss": 0.1202, + "step": 35668 + }, + { + "epoch": 0.9026241870587343, + "grad_norm": 11.705852508544922, + "learning_rate": 2.3682709957204896e-07, + "loss": 0.1788, + "step": 35669 + }, + { + "epoch": 0.9026494926234279, + "grad_norm": 5.24973201751709, + "learning_rate": 2.367050049637021e-07, + "loss": 0.1972, + "step": 35670 + }, + { + "epoch": 0.9026747981881216, + "grad_norm": 6.8435821533203125, + "learning_rate": 2.3658294107295865e-07, + "loss": 0.1734, + "step": 35671 + }, + { + "epoch": 0.9027001037528153, + "grad_norm": 6.082881450653076, + "learning_rate": 2.3646090790060582e-07, + "loss": 0.1862, + "step": 35672 + }, + { + "epoch": 0.9027254093175089, + "grad_norm": 5.654457092285156, + "learning_rate": 2.363389054474302e-07, + "loss": 0.1438, + "step": 35673 + }, + { + "epoch": 0.9027507148822026, + "grad_norm": 3.5222527980804443, + "learning_rate": 2.362169337142184e-07, + "loss": 0.1002, + "step": 35674 + }, + { + "epoch": 0.9027760204468963, + "grad_norm": 3.1024043560028076, + "learning_rate": 2.3609499270175806e-07, + "loss": 0.1178, + "step": 35675 + }, + { + "epoch": 0.9028013260115899, + "grad_norm": 5.952941417694092, + "learning_rate": 2.3597308241083528e-07, + "loss": 0.1831, + "step": 35676 + }, + { + "epoch": 0.9028266315762836, + "grad_norm": 5.329117774963379, + "learning_rate": 2.3585120284223607e-07, + "loss": 0.1322, + "step": 35677 + }, + { + "epoch": 0.9028519371409773, + "grad_norm": 3.702338457107544, + "learning_rate": 2.357293539967448e-07, + "loss": 0.1955, + "step": 35678 + }, + { + "epoch": 0.9028772427056709, + "grad_norm": 3.9410552978515625, + "learning_rate": 2.356075358751503e-07, + "loss": 0.1136, + "step": 35679 + }, + { + "epoch": 0.9029025482703646, + "grad_norm": 5.697874546051025, + "learning_rate": 2.354857484782358e-07, + "loss": 0.1693, + "step": 35680 + }, + { + "epoch": 0.9029278538350584, + "grad_norm": 4.6204423904418945, + "learning_rate": 2.3536399180678793e-07, + "loss": 0.1434, + "step": 35681 + }, + { + "epoch": 0.902953159399752, + "grad_norm": 2.851318836212158, + "learning_rate": 2.352422658615905e-07, + "loss": 0.1177, + "step": 35682 + }, + { + "epoch": 0.9029784649644457, + "grad_norm": 7.571464538574219, + "learning_rate": 2.3512057064343008e-07, + "loss": 0.1701, + "step": 35683 + }, + { + "epoch": 0.9030037705291394, + "grad_norm": 4.858543872833252, + "learning_rate": 2.3499890615309051e-07, + "loss": 0.1529, + "step": 35684 + }, + { + "epoch": 0.903029076093833, + "grad_norm": 3.366968870162964, + "learning_rate": 2.3487727239135672e-07, + "loss": 0.0982, + "step": 35685 + }, + { + "epoch": 0.9030543816585267, + "grad_norm": 4.979660987854004, + "learning_rate": 2.347556693590125e-07, + "loss": 0.1787, + "step": 35686 + }, + { + "epoch": 0.9030796872232204, + "grad_norm": 11.884001731872559, + "learning_rate": 2.3463409705684336e-07, + "loss": 0.2111, + "step": 35687 + }, + { + "epoch": 0.903104992787914, + "grad_norm": 7.50963830947876, + "learning_rate": 2.3451255548563255e-07, + "loss": 0.1371, + "step": 35688 + }, + { + "epoch": 0.9031302983526077, + "grad_norm": 8.876958847045898, + "learning_rate": 2.3439104464616334e-07, + "loss": 0.2867, + "step": 35689 + }, + { + "epoch": 0.9031556039173014, + "grad_norm": 3.3365235328674316, + "learning_rate": 2.3426956453921956e-07, + "loss": 0.1149, + "step": 35690 + }, + { + "epoch": 0.9031809094819951, + "grad_norm": 4.597618579864502, + "learning_rate": 2.3414811516558555e-07, + "loss": 0.1559, + "step": 35691 + }, + { + "epoch": 0.9032062150466887, + "grad_norm": 6.192517280578613, + "learning_rate": 2.3402669652604292e-07, + "loss": 0.2036, + "step": 35692 + }, + { + "epoch": 0.9032315206113825, + "grad_norm": 4.51060152053833, + "learning_rate": 2.3390530862137716e-07, + "loss": 0.1222, + "step": 35693 + }, + { + "epoch": 0.9032568261760762, + "grad_norm": 3.0536019802093506, + "learning_rate": 2.337839514523682e-07, + "loss": 0.1351, + "step": 35694 + }, + { + "epoch": 0.9032821317407698, + "grad_norm": 3.7316431999206543, + "learning_rate": 2.336626250198004e-07, + "loss": 0.0691, + "step": 35695 + }, + { + "epoch": 0.9033074373054635, + "grad_norm": 6.171183109283447, + "learning_rate": 2.3354132932445485e-07, + "loss": 0.1555, + "step": 35696 + }, + { + "epoch": 0.9033327428701572, + "grad_norm": 7.04386043548584, + "learning_rate": 2.334200643671164e-07, + "loss": 0.1781, + "step": 35697 + }, + { + "epoch": 0.9033580484348508, + "grad_norm": 4.758702754974365, + "learning_rate": 2.332988301485639e-07, + "loss": 0.1013, + "step": 35698 + }, + { + "epoch": 0.9033833539995445, + "grad_norm": 2.5943760871887207, + "learning_rate": 2.331776266695812e-07, + "loss": 0.0849, + "step": 35699 + }, + { + "epoch": 0.9034086595642382, + "grad_norm": 3.691323757171631, + "learning_rate": 2.330564539309488e-07, + "loss": 0.0906, + "step": 35700 + }, + { + "epoch": 0.9034339651289318, + "grad_norm": 12.084887504577637, + "learning_rate": 2.3293531193344876e-07, + "loss": 0.2785, + "step": 35701 + }, + { + "epoch": 0.9034592706936255, + "grad_norm": 5.756986141204834, + "learning_rate": 2.328142006778622e-07, + "loss": 0.182, + "step": 35702 + }, + { + "epoch": 0.9034845762583192, + "grad_norm": 5.275855541229248, + "learning_rate": 2.326931201649707e-07, + "loss": 0.1172, + "step": 35703 + }, + { + "epoch": 0.9035098818230128, + "grad_norm": 22.220844268798828, + "learning_rate": 2.3257207039555363e-07, + "loss": 0.2215, + "step": 35704 + }, + { + "epoch": 0.9035351873877066, + "grad_norm": 6.298348426818848, + "learning_rate": 2.324510513703926e-07, + "loss": 0.0948, + "step": 35705 + }, + { + "epoch": 0.9035604929524003, + "grad_norm": 3.2829701900482178, + "learning_rate": 2.3233006309026807e-07, + "loss": 0.1301, + "step": 35706 + }, + { + "epoch": 0.9035857985170939, + "grad_norm": 4.7148356437683105, + "learning_rate": 2.3220910555596055e-07, + "loss": 0.1208, + "step": 35707 + }, + { + "epoch": 0.9036111040817876, + "grad_norm": 2.726875066757202, + "learning_rate": 2.320881787682494e-07, + "loss": 0.085, + "step": 35708 + }, + { + "epoch": 0.9036364096464813, + "grad_norm": 6.599402904510498, + "learning_rate": 2.3196728272791403e-07, + "loss": 0.1296, + "step": 35709 + }, + { + "epoch": 0.9036617152111749, + "grad_norm": 2.6377766132354736, + "learning_rate": 2.3184641743573598e-07, + "loss": 0.1209, + "step": 35710 + }, + { + "epoch": 0.9036870207758686, + "grad_norm": 20.504833221435547, + "learning_rate": 2.3172558289249192e-07, + "loss": 0.2314, + "step": 35711 + }, + { + "epoch": 0.9037123263405623, + "grad_norm": 4.468304634094238, + "learning_rate": 2.316047790989634e-07, + "loss": 0.1615, + "step": 35712 + }, + { + "epoch": 0.9037376319052559, + "grad_norm": 4.010875225067139, + "learning_rate": 2.3148400605592813e-07, + "loss": 0.1853, + "step": 35713 + }, + { + "epoch": 0.9037629374699496, + "grad_norm": 4.595442295074463, + "learning_rate": 2.3136326376416608e-07, + "loss": 0.1087, + "step": 35714 + }, + { + "epoch": 0.9037882430346433, + "grad_norm": 3.3458993434906006, + "learning_rate": 2.3124255222445546e-07, + "loss": 0.0583, + "step": 35715 + }, + { + "epoch": 0.903813548599337, + "grad_norm": 5.709259510040283, + "learning_rate": 2.3112187143757458e-07, + "loss": 0.198, + "step": 35716 + }, + { + "epoch": 0.9038388541640306, + "grad_norm": 4.973989963531494, + "learning_rate": 2.3100122140430113e-07, + "loss": 0.1808, + "step": 35717 + }, + { + "epoch": 0.9038641597287244, + "grad_norm": 4.580521106719971, + "learning_rate": 2.3088060212541397e-07, + "loss": 0.1771, + "step": 35718 + }, + { + "epoch": 0.9038894652934181, + "grad_norm": 5.4016499519348145, + "learning_rate": 2.3076001360169076e-07, + "loss": 0.1404, + "step": 35719 + }, + { + "epoch": 0.9039147708581117, + "grad_norm": 10.226631164550781, + "learning_rate": 2.3063945583390924e-07, + "loss": 0.2618, + "step": 35720 + }, + { + "epoch": 0.9039400764228054, + "grad_norm": 7.059026718139648, + "learning_rate": 2.3051892882284598e-07, + "loss": 0.2424, + "step": 35721 + }, + { + "epoch": 0.9039653819874991, + "grad_norm": 6.193572044372559, + "learning_rate": 2.3039843256927986e-07, + "loss": 0.1631, + "step": 35722 + }, + { + "epoch": 0.9039906875521927, + "grad_norm": 2.7059664726257324, + "learning_rate": 2.3027796707398577e-07, + "loss": 0.085, + "step": 35723 + }, + { + "epoch": 0.9040159931168864, + "grad_norm": 6.7522759437561035, + "learning_rate": 2.3015753233774363e-07, + "loss": 0.1506, + "step": 35724 + }, + { + "epoch": 0.9040412986815801, + "grad_norm": 6.255101203918457, + "learning_rate": 2.3003712836132676e-07, + "loss": 0.1277, + "step": 35725 + }, + { + "epoch": 0.9040666042462737, + "grad_norm": 6.458212375640869, + "learning_rate": 2.2991675514551393e-07, + "loss": 0.1893, + "step": 35726 + }, + { + "epoch": 0.9040919098109674, + "grad_norm": 9.739206314086914, + "learning_rate": 2.2979641269108067e-07, + "loss": 0.1717, + "step": 35727 + }, + { + "epoch": 0.9041172153756611, + "grad_norm": 5.148951530456543, + "learning_rate": 2.29676100998803e-07, + "loss": 0.1412, + "step": 35728 + }, + { + "epoch": 0.9041425209403547, + "grad_norm": 4.363027572631836, + "learning_rate": 2.2955582006945641e-07, + "loss": 0.1077, + "step": 35729 + }, + { + "epoch": 0.9041678265050485, + "grad_norm": 4.0573601722717285, + "learning_rate": 2.29435569903817e-07, + "loss": 0.074, + "step": 35730 + }, + { + "epoch": 0.9041931320697422, + "grad_norm": 4.157938003540039, + "learning_rate": 2.2931535050266073e-07, + "loss": 0.122, + "step": 35731 + }, + { + "epoch": 0.9042184376344358, + "grad_norm": 6.826954364776611, + "learning_rate": 2.291951618667615e-07, + "loss": 0.1675, + "step": 35732 + }, + { + "epoch": 0.9042437431991295, + "grad_norm": 7.737119197845459, + "learning_rate": 2.2907500399689586e-07, + "loss": 0.1237, + "step": 35733 + }, + { + "epoch": 0.9042690487638232, + "grad_norm": 6.897768020629883, + "learning_rate": 2.2895487689383822e-07, + "loss": 0.164, + "step": 35734 + }, + { + "epoch": 0.9042943543285168, + "grad_norm": 9.422616958618164, + "learning_rate": 2.2883478055836295e-07, + "loss": 0.1946, + "step": 35735 + }, + { + "epoch": 0.9043196598932105, + "grad_norm": 6.595586776733398, + "learning_rate": 2.2871471499124387e-07, + "loss": 0.1888, + "step": 35736 + }, + { + "epoch": 0.9043449654579042, + "grad_norm": 5.494203090667725, + "learning_rate": 2.285946801932576e-07, + "loss": 0.1661, + "step": 35737 + }, + { + "epoch": 0.9043702710225978, + "grad_norm": 4.413028240203857, + "learning_rate": 2.2847467616517516e-07, + "loss": 0.1468, + "step": 35738 + }, + { + "epoch": 0.9043955765872915, + "grad_norm": 4.136274337768555, + "learning_rate": 2.2835470290777316e-07, + "loss": 0.1116, + "step": 35739 + }, + { + "epoch": 0.9044208821519852, + "grad_norm": 5.191440582275391, + "learning_rate": 2.2823476042182268e-07, + "loss": 0.1534, + "step": 35740 + }, + { + "epoch": 0.904446187716679, + "grad_norm": 11.340258598327637, + "learning_rate": 2.2811484870810086e-07, + "loss": 0.2994, + "step": 35741 + }, + { + "epoch": 0.9044714932813726, + "grad_norm": 5.542250633239746, + "learning_rate": 2.279949677673765e-07, + "loss": 0.1406, + "step": 35742 + }, + { + "epoch": 0.9044967988460663, + "grad_norm": 3.283766031265259, + "learning_rate": 2.2787511760042623e-07, + "loss": 0.0938, + "step": 35743 + }, + { + "epoch": 0.90452210441076, + "grad_norm": 2.779035806655884, + "learning_rate": 2.2775529820802056e-07, + "loss": 0.0833, + "step": 35744 + }, + { + "epoch": 0.9045474099754536, + "grad_norm": 12.776963233947754, + "learning_rate": 2.2763550959093384e-07, + "loss": 0.3154, + "step": 35745 + }, + { + "epoch": 0.9045727155401473, + "grad_norm": 5.7018327713012695, + "learning_rate": 2.2751575174993823e-07, + "loss": 0.1384, + "step": 35746 + }, + { + "epoch": 0.904598021104841, + "grad_norm": 19.924482345581055, + "learning_rate": 2.273960246858059e-07, + "loss": 0.1797, + "step": 35747 + }, + { + "epoch": 0.9046233266695346, + "grad_norm": 2.982689619064331, + "learning_rate": 2.2727632839930846e-07, + "loss": 0.134, + "step": 35748 + }, + { + "epoch": 0.9046486322342283, + "grad_norm": 6.176702976226807, + "learning_rate": 2.2715666289121807e-07, + "loss": 0.2238, + "step": 35749 + }, + { + "epoch": 0.904673937798922, + "grad_norm": 4.853897571563721, + "learning_rate": 2.2703702816230688e-07, + "loss": 0.1303, + "step": 35750 + }, + { + "epoch": 0.9046992433636156, + "grad_norm": 12.834521293640137, + "learning_rate": 2.269174242133465e-07, + "loss": 0.3577, + "step": 35751 + }, + { + "epoch": 0.9047245489283093, + "grad_norm": 4.827120780944824, + "learning_rate": 2.267978510451063e-07, + "loss": 0.1169, + "step": 35752 + }, + { + "epoch": 0.904749854493003, + "grad_norm": 5.924982070922852, + "learning_rate": 2.2667830865836016e-07, + "loss": 0.1837, + "step": 35753 + }, + { + "epoch": 0.9047751600576966, + "grad_norm": 2.9482877254486084, + "learning_rate": 2.2655879705387684e-07, + "loss": 0.098, + "step": 35754 + }, + { + "epoch": 0.9048004656223904, + "grad_norm": 3.013807773590088, + "learning_rate": 2.2643931623242908e-07, + "loss": 0.0882, + "step": 35755 + }, + { + "epoch": 0.9048257711870841, + "grad_norm": 9.091170310974121, + "learning_rate": 2.263198661947852e-07, + "loss": 0.1657, + "step": 35756 + }, + { + "epoch": 0.9048510767517777, + "grad_norm": 6.071404457092285, + "learning_rate": 2.2620044694171728e-07, + "loss": 0.1191, + "step": 35757 + }, + { + "epoch": 0.9048763823164714, + "grad_norm": 4.3824639320373535, + "learning_rate": 2.260810584739942e-07, + "loss": 0.0844, + "step": 35758 + }, + { + "epoch": 0.9049016878811651, + "grad_norm": 7.655600547790527, + "learning_rate": 2.25961700792387e-07, + "loss": 0.1952, + "step": 35759 + }, + { + "epoch": 0.9049269934458587, + "grad_norm": 14.915338516235352, + "learning_rate": 2.2584237389766338e-07, + "loss": 0.1248, + "step": 35760 + }, + { + "epoch": 0.9049522990105524, + "grad_norm": 2.568873405456543, + "learning_rate": 2.2572307779059554e-07, + "loss": 0.0476, + "step": 35761 + }, + { + "epoch": 0.9049776045752461, + "grad_norm": 17.120479583740234, + "learning_rate": 2.2560381247195063e-07, + "loss": 0.2203, + "step": 35762 + }, + { + "epoch": 0.9050029101399397, + "grad_norm": 5.733577728271484, + "learning_rate": 2.2548457794249966e-07, + "loss": 0.1419, + "step": 35763 + }, + { + "epoch": 0.9050282157046334, + "grad_norm": 6.710039138793945, + "learning_rate": 2.2536537420300986e-07, + "loss": 0.2347, + "step": 35764 + }, + { + "epoch": 0.9050535212693271, + "grad_norm": 4.073612689971924, + "learning_rate": 2.2524620125425e-07, + "loss": 0.0899, + "step": 35765 + }, + { + "epoch": 0.9050788268340209, + "grad_norm": 5.174503326416016, + "learning_rate": 2.2512705909699007e-07, + "loss": 0.0868, + "step": 35766 + }, + { + "epoch": 0.9051041323987145, + "grad_norm": 9.540915489196777, + "learning_rate": 2.2500794773199663e-07, + "loss": 0.1861, + "step": 35767 + }, + { + "epoch": 0.9051294379634082, + "grad_norm": 3.773386240005493, + "learning_rate": 2.2488886716004022e-07, + "loss": 0.1476, + "step": 35768 + }, + { + "epoch": 0.9051547435281019, + "grad_norm": 4.181631565093994, + "learning_rate": 2.2476981738188575e-07, + "loss": 0.1254, + "step": 35769 + }, + { + "epoch": 0.9051800490927955, + "grad_norm": 2.662001132965088, + "learning_rate": 2.2465079839830372e-07, + "loss": 0.1682, + "step": 35770 + }, + { + "epoch": 0.9052053546574892, + "grad_norm": 4.2422308921813965, + "learning_rate": 2.245318102100591e-07, + "loss": 0.1519, + "step": 35771 + }, + { + "epoch": 0.9052306602221829, + "grad_norm": 7.5651421546936035, + "learning_rate": 2.2441285281792236e-07, + "loss": 0.111, + "step": 35772 + }, + { + "epoch": 0.9052559657868765, + "grad_norm": 10.364087104797363, + "learning_rate": 2.2429392622265678e-07, + "loss": 0.2893, + "step": 35773 + }, + { + "epoch": 0.9052812713515702, + "grad_norm": 5.105452537536621, + "learning_rate": 2.2417503042503286e-07, + "loss": 0.1735, + "step": 35774 + }, + { + "epoch": 0.9053065769162639, + "grad_norm": 3.605011224746704, + "learning_rate": 2.2405616542581444e-07, + "loss": 0.057, + "step": 35775 + }, + { + "epoch": 0.9053318824809575, + "grad_norm": 12.455310821533203, + "learning_rate": 2.2393733122577032e-07, + "loss": 0.1623, + "step": 35776 + }, + { + "epoch": 0.9053571880456512, + "grad_norm": 7.174878120422363, + "learning_rate": 2.2381852782566604e-07, + "loss": 0.1899, + "step": 35777 + }, + { + "epoch": 0.905382493610345, + "grad_norm": 3.087350606918335, + "learning_rate": 2.2369975522626763e-07, + "loss": 0.1228, + "step": 35778 + }, + { + "epoch": 0.9054077991750386, + "grad_norm": 8.37316608428955, + "learning_rate": 2.2358101342834005e-07, + "loss": 0.1737, + "step": 35779 + }, + { + "epoch": 0.9054331047397323, + "grad_norm": 4.1045966148376465, + "learning_rate": 2.2346230243265099e-07, + "loss": 0.0985, + "step": 35780 + }, + { + "epoch": 0.905458410304426, + "grad_norm": 6.642115592956543, + "learning_rate": 2.2334362223996485e-07, + "loss": 0.1442, + "step": 35781 + }, + { + "epoch": 0.9054837158691196, + "grad_norm": 2.9779109954833984, + "learning_rate": 2.2322497285104716e-07, + "loss": 0.0991, + "step": 35782 + }, + { + "epoch": 0.9055090214338133, + "grad_norm": 18.43082046508789, + "learning_rate": 2.2310635426666283e-07, + "loss": 0.2093, + "step": 35783 + }, + { + "epoch": 0.905534326998507, + "grad_norm": 7.253335475921631, + "learning_rate": 2.2298776648757737e-07, + "loss": 0.1316, + "step": 35784 + }, + { + "epoch": 0.9055596325632006, + "grad_norm": 6.720456600189209, + "learning_rate": 2.2286920951455515e-07, + "loss": 0.3009, + "step": 35785 + }, + { + "epoch": 0.9055849381278943, + "grad_norm": 5.123356342315674, + "learning_rate": 2.2275068334836114e-07, + "loss": 0.1319, + "step": 35786 + }, + { + "epoch": 0.905610243692588, + "grad_norm": 6.366270542144775, + "learning_rate": 2.226321879897586e-07, + "loss": 0.1633, + "step": 35787 + }, + { + "epoch": 0.9056355492572816, + "grad_norm": 4.518945693969727, + "learning_rate": 2.225137234395125e-07, + "loss": 0.1652, + "step": 35788 + }, + { + "epoch": 0.9056608548219753, + "grad_norm": 2.861283540725708, + "learning_rate": 2.223952896983872e-07, + "loss": 0.0966, + "step": 35789 + }, + { + "epoch": 0.905686160386669, + "grad_norm": 2.406965494155884, + "learning_rate": 2.2227688676714597e-07, + "loss": 0.0889, + "step": 35790 + }, + { + "epoch": 0.9057114659513628, + "grad_norm": 4.738333225250244, + "learning_rate": 2.2215851464655159e-07, + "loss": 0.0895, + "step": 35791 + }, + { + "epoch": 0.9057367715160564, + "grad_norm": 6.200948715209961, + "learning_rate": 2.2204017333736895e-07, + "loss": 0.1581, + "step": 35792 + }, + { + "epoch": 0.9057620770807501, + "grad_norm": 3.1067512035369873, + "learning_rate": 2.219218628403602e-07, + "loss": 0.1262, + "step": 35793 + }, + { + "epoch": 0.9057873826454438, + "grad_norm": 4.543381690979004, + "learning_rate": 2.2180358315628924e-07, + "loss": 0.1615, + "step": 35794 + }, + { + "epoch": 0.9058126882101374, + "grad_norm": 4.613600730895996, + "learning_rate": 2.2168533428591765e-07, + "loss": 0.1522, + "step": 35795 + }, + { + "epoch": 0.9058379937748311, + "grad_norm": 5.613620281219482, + "learning_rate": 2.215671162300076e-07, + "loss": 0.1903, + "step": 35796 + }, + { + "epoch": 0.9058632993395248, + "grad_norm": 7.469920635223389, + "learning_rate": 2.2144892898932346e-07, + "loss": 0.2498, + "step": 35797 + }, + { + "epoch": 0.9058886049042184, + "grad_norm": 8.220782279968262, + "learning_rate": 2.2133077256462576e-07, + "loss": 0.2314, + "step": 35798 + }, + { + "epoch": 0.9059139104689121, + "grad_norm": 4.0554986000061035, + "learning_rate": 2.2121264695667776e-07, + "loss": 0.071, + "step": 35799 + }, + { + "epoch": 0.9059392160336058, + "grad_norm": 2.6083364486694336, + "learning_rate": 2.2109455216623943e-07, + "loss": 0.1029, + "step": 35800 + }, + { + "epoch": 0.9059645215982994, + "grad_norm": 7.788351058959961, + "learning_rate": 2.2097648819407402e-07, + "loss": 0.2347, + "step": 35801 + }, + { + "epoch": 0.9059898271629931, + "grad_norm": 6.2712554931640625, + "learning_rate": 2.2085845504094207e-07, + "loss": 0.262, + "step": 35802 + }, + { + "epoch": 0.9060151327276869, + "grad_norm": 9.785070419311523, + "learning_rate": 2.207404527076057e-07, + "loss": 0.1468, + "step": 35803 + }, + { + "epoch": 0.9060404382923805, + "grad_norm": 14.143866539001465, + "learning_rate": 2.2062248119482433e-07, + "loss": 0.2167, + "step": 35804 + }, + { + "epoch": 0.9060657438570742, + "grad_norm": 4.695792198181152, + "learning_rate": 2.2050454050335957e-07, + "loss": 0.1286, + "step": 35805 + }, + { + "epoch": 0.9060910494217679, + "grad_norm": 5.529694080352783, + "learning_rate": 2.2038663063397193e-07, + "loss": 0.1746, + "step": 35806 + }, + { + "epoch": 0.9061163549864615, + "grad_norm": 7.109774589538574, + "learning_rate": 2.202687515874219e-07, + "loss": 0.2385, + "step": 35807 + }, + { + "epoch": 0.9061416605511552, + "grad_norm": 3.524608850479126, + "learning_rate": 2.2015090336446998e-07, + "loss": 0.0923, + "step": 35808 + }, + { + "epoch": 0.9061669661158489, + "grad_norm": 3.154634952545166, + "learning_rate": 2.2003308596587557e-07, + "loss": 0.139, + "step": 35809 + }, + { + "epoch": 0.9061922716805425, + "grad_norm": 7.695786952972412, + "learning_rate": 2.199152993923981e-07, + "loss": 0.1604, + "step": 35810 + }, + { + "epoch": 0.9062175772452362, + "grad_norm": 3.3887321949005127, + "learning_rate": 2.1979754364479854e-07, + "loss": 0.1016, + "step": 35811 + }, + { + "epoch": 0.9062428828099299, + "grad_norm": 3.8357012271881104, + "learning_rate": 2.1967981872383525e-07, + "loss": 0.1035, + "step": 35812 + }, + { + "epoch": 0.9062681883746235, + "grad_norm": 4.906546592712402, + "learning_rate": 2.195621246302676e-07, + "loss": 0.1176, + "step": 35813 + }, + { + "epoch": 0.9062934939393172, + "grad_norm": 7.211453914642334, + "learning_rate": 2.1944446136485388e-07, + "loss": 0.1838, + "step": 35814 + }, + { + "epoch": 0.906318799504011, + "grad_norm": 5.706734657287598, + "learning_rate": 2.19326828928354e-07, + "loss": 0.1583, + "step": 35815 + }, + { + "epoch": 0.9063441050687046, + "grad_norm": 5.437981128692627, + "learning_rate": 2.1920922732152684e-07, + "loss": 0.1705, + "step": 35816 + }, + { + "epoch": 0.9063694106333983, + "grad_norm": 6.042692184448242, + "learning_rate": 2.1909165654512953e-07, + "loss": 0.1703, + "step": 35817 + }, + { + "epoch": 0.906394716198092, + "grad_norm": 3.8099820613861084, + "learning_rate": 2.1897411659991986e-07, + "loss": 0.1386, + "step": 35818 + }, + { + "epoch": 0.9064200217627857, + "grad_norm": 1.9510488510131836, + "learning_rate": 2.1885660748665827e-07, + "loss": 0.0614, + "step": 35819 + }, + { + "epoch": 0.9064453273274793, + "grad_norm": 6.144752025604248, + "learning_rate": 2.187391292061003e-07, + "loss": 0.1756, + "step": 35820 + }, + { + "epoch": 0.906470632892173, + "grad_norm": 3.3336687088012695, + "learning_rate": 2.1862168175900477e-07, + "loss": 0.1386, + "step": 35821 + }, + { + "epoch": 0.9064959384568667, + "grad_norm": 6.079041481018066, + "learning_rate": 2.1850426514612831e-07, + "loss": 0.1942, + "step": 35822 + }, + { + "epoch": 0.9065212440215603, + "grad_norm": 7.603668212890625, + "learning_rate": 2.183868793682281e-07, + "loss": 0.1515, + "step": 35823 + }, + { + "epoch": 0.906546549586254, + "grad_norm": 5.390112400054932, + "learning_rate": 2.1826952442606186e-07, + "loss": 0.1876, + "step": 35824 + }, + { + "epoch": 0.9065718551509477, + "grad_norm": 4.085090160369873, + "learning_rate": 2.181522003203862e-07, + "loss": 0.1057, + "step": 35825 + }, + { + "epoch": 0.9065971607156413, + "grad_norm": 6.885649681091309, + "learning_rate": 2.1803490705195717e-07, + "loss": 0.1946, + "step": 35826 + }, + { + "epoch": 0.906622466280335, + "grad_norm": 4.98066520690918, + "learning_rate": 2.1791764462153087e-07, + "loss": 0.1568, + "step": 35827 + }, + { + "epoch": 0.9066477718450288, + "grad_norm": 7.887575149536133, + "learning_rate": 2.17800413029865e-07, + "loss": 0.2429, + "step": 35828 + }, + { + "epoch": 0.9066730774097224, + "grad_norm": 25.005590438842773, + "learning_rate": 2.1768321227771395e-07, + "loss": 0.2509, + "step": 35829 + }, + { + "epoch": 0.9066983829744161, + "grad_norm": 7.088161945343018, + "learning_rate": 2.1756604236583602e-07, + "loss": 0.2571, + "step": 35830 + }, + { + "epoch": 0.9067236885391098, + "grad_norm": 4.895096302032471, + "learning_rate": 2.1744890329498335e-07, + "loss": 0.1168, + "step": 35831 + }, + { + "epoch": 0.9067489941038034, + "grad_norm": 2.862562417984009, + "learning_rate": 2.1733179506591372e-07, + "loss": 0.1264, + "step": 35832 + }, + { + "epoch": 0.9067742996684971, + "grad_norm": 10.99692153930664, + "learning_rate": 2.1721471767938097e-07, + "loss": 0.2156, + "step": 35833 + }, + { + "epoch": 0.9067996052331908, + "grad_norm": 7.3818182945251465, + "learning_rate": 2.1709767113614223e-07, + "loss": 0.2136, + "step": 35834 + }, + { + "epoch": 0.9068249107978844, + "grad_norm": 4.533392429351807, + "learning_rate": 2.1698065543694967e-07, + "loss": 0.0934, + "step": 35835 + }, + { + "epoch": 0.9068502163625781, + "grad_norm": 3.6180014610290527, + "learning_rate": 2.1686367058255998e-07, + "loss": 0.1134, + "step": 35836 + }, + { + "epoch": 0.9068755219272718, + "grad_norm": 2.978397846221924, + "learning_rate": 2.167467165737258e-07, + "loss": 0.1268, + "step": 35837 + }, + { + "epoch": 0.9069008274919654, + "grad_norm": 3.850740909576416, + "learning_rate": 2.1662979341120384e-07, + "loss": 0.1197, + "step": 35838 + }, + { + "epoch": 0.9069261330566591, + "grad_norm": 3.8950741291046143, + "learning_rate": 2.1651290109574507e-07, + "loss": 0.1629, + "step": 35839 + }, + { + "epoch": 0.9069514386213529, + "grad_norm": 11.52203369140625, + "learning_rate": 2.1639603962810562e-07, + "loss": 0.2792, + "step": 35840 + }, + { + "epoch": 0.9069767441860465, + "grad_norm": 2.969344139099121, + "learning_rate": 2.1627920900903765e-07, + "loss": 0.1177, + "step": 35841 + }, + { + "epoch": 0.9070020497507402, + "grad_norm": 6.0458083152771, + "learning_rate": 2.161624092392961e-07, + "loss": 0.1854, + "step": 35842 + }, + { + "epoch": 0.9070273553154339, + "grad_norm": 4.8024163246154785, + "learning_rate": 2.1604564031963259e-07, + "loss": 0.2146, + "step": 35843 + }, + { + "epoch": 0.9070526608801276, + "grad_norm": 6.169695854187012, + "learning_rate": 2.1592890225080155e-07, + "loss": 0.1725, + "step": 35844 + }, + { + "epoch": 0.9070779664448212, + "grad_norm": 3.418975830078125, + "learning_rate": 2.1581219503355399e-07, + "loss": 0.1354, + "step": 35845 + }, + { + "epoch": 0.9071032720095149, + "grad_norm": 4.3017473220825195, + "learning_rate": 2.1569551866864437e-07, + "loss": 0.1293, + "step": 35846 + }, + { + "epoch": 0.9071285775742086, + "grad_norm": 3.113990068435669, + "learning_rate": 2.155788731568248e-07, + "loss": 0.0989, + "step": 35847 + }, + { + "epoch": 0.9071538831389022, + "grad_norm": 4.025158882141113, + "learning_rate": 2.1546225849884694e-07, + "loss": 0.1928, + "step": 35848 + }, + { + "epoch": 0.9071791887035959, + "grad_norm": 3.627040147781372, + "learning_rate": 2.1534567469546298e-07, + "loss": 0.1648, + "step": 35849 + }, + { + "epoch": 0.9072044942682896, + "grad_norm": 2.467639923095703, + "learning_rate": 2.1522912174742395e-07, + "loss": 0.1051, + "step": 35850 + }, + { + "epoch": 0.9072297998329832, + "grad_norm": 3.905090808868408, + "learning_rate": 2.151125996554826e-07, + "loss": 0.121, + "step": 35851 + }, + { + "epoch": 0.907255105397677, + "grad_norm": 4.31350564956665, + "learning_rate": 2.1499610842039055e-07, + "loss": 0.141, + "step": 35852 + }, + { + "epoch": 0.9072804109623707, + "grad_norm": 5.03609561920166, + "learning_rate": 2.148796480428983e-07, + "loss": 0.1328, + "step": 35853 + }, + { + "epoch": 0.9073057165270643, + "grad_norm": 3.3692920207977295, + "learning_rate": 2.1476321852375692e-07, + "loss": 0.1499, + "step": 35854 + }, + { + "epoch": 0.907331022091758, + "grad_norm": 4.360565662384033, + "learning_rate": 2.146468198637175e-07, + "loss": 0.1404, + "step": 35855 + }, + { + "epoch": 0.9073563276564517, + "grad_norm": 4.295725345611572, + "learning_rate": 2.1453045206353052e-07, + "loss": 0.1271, + "step": 35856 + }, + { + "epoch": 0.9073816332211453, + "grad_norm": 6.188342571258545, + "learning_rate": 2.1441411512394704e-07, + "loss": 0.2102, + "step": 35857 + }, + { + "epoch": 0.907406938785839, + "grad_norm": 11.41801929473877, + "learning_rate": 2.1429780904571596e-07, + "loss": 0.1558, + "step": 35858 + }, + { + "epoch": 0.9074322443505327, + "grad_norm": 3.055215358734131, + "learning_rate": 2.1418153382958828e-07, + "loss": 0.1168, + "step": 35859 + }, + { + "epoch": 0.9074575499152263, + "grad_norm": 2.9309005737304688, + "learning_rate": 2.1406528947631343e-07, + "loss": 0.1386, + "step": 35860 + }, + { + "epoch": 0.90748285547992, + "grad_norm": 6.891590595245361, + "learning_rate": 2.139490759866425e-07, + "loss": 0.1249, + "step": 35861 + }, + { + "epoch": 0.9075081610446137, + "grad_norm": 5.753643035888672, + "learning_rate": 2.1383289336132206e-07, + "loss": 0.1777, + "step": 35862 + }, + { + "epoch": 0.9075334666093073, + "grad_norm": 2.632314443588257, + "learning_rate": 2.1371674160110434e-07, + "loss": 0.1022, + "step": 35863 + }, + { + "epoch": 0.907558772174001, + "grad_norm": 5.2395853996276855, + "learning_rate": 2.1360062070673592e-07, + "loss": 0.1327, + "step": 35864 + }, + { + "epoch": 0.9075840777386948, + "grad_norm": 4.1994948387146, + "learning_rate": 2.134845306789679e-07, + "loss": 0.1032, + "step": 35865 + }, + { + "epoch": 0.9076093833033884, + "grad_norm": 6.911266803741455, + "learning_rate": 2.133684715185469e-07, + "loss": 0.1317, + "step": 35866 + }, + { + "epoch": 0.9076346888680821, + "grad_norm": 10.91960334777832, + "learning_rate": 2.1325244322622286e-07, + "loss": 0.3193, + "step": 35867 + }, + { + "epoch": 0.9076599944327758, + "grad_norm": 3.5751218795776367, + "learning_rate": 2.131364458027424e-07, + "loss": 0.1376, + "step": 35868 + }, + { + "epoch": 0.9076852999974695, + "grad_norm": 13.756814002990723, + "learning_rate": 2.1302047924885606e-07, + "loss": 0.3474, + "step": 35869 + }, + { + "epoch": 0.9077106055621631, + "grad_norm": 6.363429546356201, + "learning_rate": 2.129045435653093e-07, + "loss": 0.2269, + "step": 35870 + }, + { + "epoch": 0.9077359111268568, + "grad_norm": 8.622369766235352, + "learning_rate": 2.127886387528505e-07, + "loss": 0.1546, + "step": 35871 + }, + { + "epoch": 0.9077612166915505, + "grad_norm": 5.0269269943237305, + "learning_rate": 2.126727648122273e-07, + "loss": 0.1318, + "step": 35872 + }, + { + "epoch": 0.9077865222562441, + "grad_norm": 3.3535027503967285, + "learning_rate": 2.1255692174418752e-07, + "loss": 0.1467, + "step": 35873 + }, + { + "epoch": 0.9078118278209378, + "grad_norm": 4.153156757354736, + "learning_rate": 2.1244110954947717e-07, + "loss": 0.164, + "step": 35874 + }, + { + "epoch": 0.9078371333856315, + "grad_norm": 6.126096248626709, + "learning_rate": 2.12325328228844e-07, + "loss": 0.1444, + "step": 35875 + }, + { + "epoch": 0.9078624389503251, + "grad_norm": 6.287711143493652, + "learning_rate": 2.1220957778303354e-07, + "loss": 0.1593, + "step": 35876 + }, + { + "epoch": 0.9078877445150189, + "grad_norm": 5.205140113830566, + "learning_rate": 2.120938582127935e-07, + "loss": 0.1868, + "step": 35877 + }, + { + "epoch": 0.9079130500797126, + "grad_norm": 3.9691779613494873, + "learning_rate": 2.119781695188694e-07, + "loss": 0.1018, + "step": 35878 + }, + { + "epoch": 0.9079383556444062, + "grad_norm": 4.534868240356445, + "learning_rate": 2.1186251170200732e-07, + "loss": 0.196, + "step": 35879 + }, + { + "epoch": 0.9079636612090999, + "grad_norm": 37.284942626953125, + "learning_rate": 2.1174688476295336e-07, + "loss": 0.1346, + "step": 35880 + }, + { + "epoch": 0.9079889667737936, + "grad_norm": 5.305908203125, + "learning_rate": 2.1163128870245298e-07, + "loss": 0.1247, + "step": 35881 + }, + { + "epoch": 0.9080142723384872, + "grad_norm": 5.989198684692383, + "learning_rate": 2.1151572352125171e-07, + "loss": 0.1584, + "step": 35882 + }, + { + "epoch": 0.9080395779031809, + "grad_norm": 24.34897232055664, + "learning_rate": 2.1140018922009508e-07, + "loss": 0.2662, + "step": 35883 + }, + { + "epoch": 0.9080648834678746, + "grad_norm": 3.1935794353485107, + "learning_rate": 2.112846857997275e-07, + "loss": 0.0784, + "step": 35884 + }, + { + "epoch": 0.9080901890325682, + "grad_norm": 9.89758014678955, + "learning_rate": 2.111692132608939e-07, + "loss": 0.2101, + "step": 35885 + }, + { + "epoch": 0.9081154945972619, + "grad_norm": 7.052767276763916, + "learning_rate": 2.1105377160433982e-07, + "loss": 0.1709, + "step": 35886 + }, + { + "epoch": 0.9081408001619556, + "grad_norm": 3.8620803356170654, + "learning_rate": 2.109383608308091e-07, + "loss": 0.12, + "step": 35887 + }, + { + "epoch": 0.9081661057266492, + "grad_norm": 3.933525562286377, + "learning_rate": 2.1082298094104613e-07, + "loss": 0.1049, + "step": 35888 + }, + { + "epoch": 0.908191411291343, + "grad_norm": 8.39476490020752, + "learning_rate": 2.1070763193579424e-07, + "loss": 0.1583, + "step": 35889 + }, + { + "epoch": 0.9082167168560367, + "grad_norm": 3.0963985919952393, + "learning_rate": 2.1059231381579893e-07, + "loss": 0.087, + "step": 35890 + }, + { + "epoch": 0.9082420224207303, + "grad_norm": 4.4371795654296875, + "learning_rate": 2.1047702658180237e-07, + "loss": 0.1008, + "step": 35891 + }, + { + "epoch": 0.908267327985424, + "grad_norm": 3.4862008094787598, + "learning_rate": 2.1036177023454896e-07, + "loss": 0.1441, + "step": 35892 + }, + { + "epoch": 0.9082926335501177, + "grad_norm": 4.5882697105407715, + "learning_rate": 2.102465447747809e-07, + "loss": 0.1314, + "step": 35893 + }, + { + "epoch": 0.9083179391148114, + "grad_norm": 5.571619033813477, + "learning_rate": 2.1013135020324205e-07, + "loss": 0.1742, + "step": 35894 + }, + { + "epoch": 0.908343244679505, + "grad_norm": 6.8570170402526855, + "learning_rate": 2.1001618652067513e-07, + "loss": 0.1721, + "step": 35895 + }, + { + "epoch": 0.9083685502441987, + "grad_norm": 4.778554916381836, + "learning_rate": 2.0990105372782344e-07, + "loss": 0.1217, + "step": 35896 + }, + { + "epoch": 0.9083938558088924, + "grad_norm": 3.698674440383911, + "learning_rate": 2.0978595182542805e-07, + "loss": 0.1452, + "step": 35897 + }, + { + "epoch": 0.908419161373586, + "grad_norm": 3.5503575801849365, + "learning_rate": 2.0967088081423227e-07, + "loss": 0.1463, + "step": 35898 + }, + { + "epoch": 0.9084444669382797, + "grad_norm": 8.233503341674805, + "learning_rate": 2.095558406949777e-07, + "loss": 0.1907, + "step": 35899 + }, + { + "epoch": 0.9084697725029734, + "grad_norm": 3.5661873817443848, + "learning_rate": 2.0944083146840766e-07, + "loss": 0.1122, + "step": 35900 + }, + { + "epoch": 0.908495078067667, + "grad_norm": 4.631572246551514, + "learning_rate": 2.09325853135261e-07, + "loss": 0.1078, + "step": 35901 + }, + { + "epoch": 0.9085203836323608, + "grad_norm": 2.3109703063964844, + "learning_rate": 2.0921090569628155e-07, + "loss": 0.0961, + "step": 35902 + }, + { + "epoch": 0.9085456891970545, + "grad_norm": 12.016160011291504, + "learning_rate": 2.090959891522093e-07, + "loss": 0.2572, + "step": 35903 + }, + { + "epoch": 0.9085709947617481, + "grad_norm": 7.238286972045898, + "learning_rate": 2.089811035037864e-07, + "loss": 0.1295, + "step": 35904 + }, + { + "epoch": 0.9085963003264418, + "grad_norm": 3.7536513805389404, + "learning_rate": 2.0886624875175287e-07, + "loss": 0.1728, + "step": 35905 + }, + { + "epoch": 0.9086216058911355, + "grad_norm": 5.093599319458008, + "learning_rate": 2.0875142489684975e-07, + "loss": 0.112, + "step": 35906 + }, + { + "epoch": 0.9086469114558291, + "grad_norm": 8.386590957641602, + "learning_rate": 2.086366319398181e-07, + "loss": 0.2201, + "step": 35907 + }, + { + "epoch": 0.9086722170205228, + "grad_norm": 3.311950445175171, + "learning_rate": 2.0852186988139622e-07, + "loss": 0.0717, + "step": 35908 + }, + { + "epoch": 0.9086975225852165, + "grad_norm": 4.209378242492676, + "learning_rate": 2.0840713872232632e-07, + "loss": 0.1332, + "step": 35909 + }, + { + "epoch": 0.9087228281499101, + "grad_norm": 8.71977710723877, + "learning_rate": 2.082924384633478e-07, + "loss": 0.2452, + "step": 35910 + }, + { + "epoch": 0.9087481337146038, + "grad_norm": 2.948488712310791, + "learning_rate": 2.081777691051995e-07, + "loss": 0.1099, + "step": 35911 + }, + { + "epoch": 0.9087734392792975, + "grad_norm": 4.1153998374938965, + "learning_rate": 2.0806313064862082e-07, + "loss": 0.1309, + "step": 35912 + }, + { + "epoch": 0.9087987448439911, + "grad_norm": 5.283528804779053, + "learning_rate": 2.0794852309435287e-07, + "loss": 0.198, + "step": 35913 + }, + { + "epoch": 0.9088240504086849, + "grad_norm": 3.6313111782073975, + "learning_rate": 2.0783394644313283e-07, + "loss": 0.1496, + "step": 35914 + }, + { + "epoch": 0.9088493559733786, + "grad_norm": 6.779613494873047, + "learning_rate": 2.0771940069570062e-07, + "loss": 0.1221, + "step": 35915 + }, + { + "epoch": 0.9088746615380722, + "grad_norm": 8.238201141357422, + "learning_rate": 2.0760488585279403e-07, + "loss": 0.257, + "step": 35916 + }, + { + "epoch": 0.9088999671027659, + "grad_norm": 4.787725925445557, + "learning_rate": 2.0749040191515246e-07, + "loss": 0.167, + "step": 35917 + }, + { + "epoch": 0.9089252726674596, + "grad_norm": 8.658509254455566, + "learning_rate": 2.073759488835142e-07, + "loss": 0.1972, + "step": 35918 + }, + { + "epoch": 0.9089505782321533, + "grad_norm": 4.166623115539551, + "learning_rate": 2.0726152675861644e-07, + "loss": 0.1369, + "step": 35919 + }, + { + "epoch": 0.9089758837968469, + "grad_norm": 7.469614505767822, + "learning_rate": 2.0714713554119691e-07, + "loss": 0.2321, + "step": 35920 + }, + { + "epoch": 0.9090011893615406, + "grad_norm": 5.367514133453369, + "learning_rate": 2.0703277523199505e-07, + "loss": 0.1159, + "step": 35921 + }, + { + "epoch": 0.9090264949262343, + "grad_norm": 4.4403791427612305, + "learning_rate": 2.0691844583174691e-07, + "loss": 0.0953, + "step": 35922 + }, + { + "epoch": 0.9090518004909279, + "grad_norm": 2.269523859024048, + "learning_rate": 2.0680414734119025e-07, + "loss": 0.1015, + "step": 35923 + }, + { + "epoch": 0.9090771060556216, + "grad_norm": 3.3153839111328125, + "learning_rate": 2.066898797610617e-07, + "loss": 0.1161, + "step": 35924 + }, + { + "epoch": 0.9091024116203154, + "grad_norm": 3.4169468879699707, + "learning_rate": 2.06575643092099e-07, + "loss": 0.1925, + "step": 35925 + }, + { + "epoch": 0.909127717185009, + "grad_norm": 3.8023672103881836, + "learning_rate": 2.064614373350371e-07, + "loss": 0.1355, + "step": 35926 + }, + { + "epoch": 0.9091530227497027, + "grad_norm": 4.709335803985596, + "learning_rate": 2.0634726249061542e-07, + "loss": 0.1312, + "step": 35927 + }, + { + "epoch": 0.9091783283143964, + "grad_norm": 6.8870015144348145, + "learning_rate": 2.0623311855956785e-07, + "loss": 0.0817, + "step": 35928 + }, + { + "epoch": 0.90920363387909, + "grad_norm": 5.287585735321045, + "learning_rate": 2.0611900554263097e-07, + "loss": 0.1449, + "step": 35929 + }, + { + "epoch": 0.9092289394437837, + "grad_norm": 3.581655502319336, + "learning_rate": 2.060049234405409e-07, + "loss": 0.0858, + "step": 35930 + }, + { + "epoch": 0.9092542450084774, + "grad_norm": 3.9614357948303223, + "learning_rate": 2.0589087225403425e-07, + "loss": 0.1519, + "step": 35931 + }, + { + "epoch": 0.909279550573171, + "grad_norm": 4.427906513214111, + "learning_rate": 2.0577685198384435e-07, + "loss": 0.1551, + "step": 35932 + }, + { + "epoch": 0.9093048561378647, + "grad_norm": 8.43345832824707, + "learning_rate": 2.0566286263070834e-07, + "loss": 0.1731, + "step": 35933 + }, + { + "epoch": 0.9093301617025584, + "grad_norm": 4.434643745422363, + "learning_rate": 2.0554890419536066e-07, + "loss": 0.088, + "step": 35934 + }, + { + "epoch": 0.909355467267252, + "grad_norm": 8.436354637145996, + "learning_rate": 2.0543497667853573e-07, + "loss": 0.2639, + "step": 35935 + }, + { + "epoch": 0.9093807728319457, + "grad_norm": 32.63789749145508, + "learning_rate": 2.0532108008096906e-07, + "loss": 0.2768, + "step": 35936 + }, + { + "epoch": 0.9094060783966395, + "grad_norm": 7.438688278198242, + "learning_rate": 2.0520721440339563e-07, + "loss": 0.1155, + "step": 35937 + }, + { + "epoch": 0.909431383961333, + "grad_norm": 3.731801986694336, + "learning_rate": 2.0509337964654818e-07, + "loss": 0.1461, + "step": 35938 + }, + { + "epoch": 0.9094566895260268, + "grad_norm": 5.9409942626953125, + "learning_rate": 2.0497957581116168e-07, + "loss": 0.2001, + "step": 35939 + }, + { + "epoch": 0.9094819950907205, + "grad_norm": 9.333868980407715, + "learning_rate": 2.0486580289797e-07, + "loss": 0.1613, + "step": 35940 + }, + { + "epoch": 0.9095073006554141, + "grad_norm": 12.866974830627441, + "learning_rate": 2.0475206090770693e-07, + "loss": 0.1913, + "step": 35941 + }, + { + "epoch": 0.9095326062201078, + "grad_norm": 4.031939506530762, + "learning_rate": 2.0463834984110587e-07, + "loss": 0.1377, + "step": 35942 + }, + { + "epoch": 0.9095579117848015, + "grad_norm": 6.850221633911133, + "learning_rate": 2.0452466969889951e-07, + "loss": 0.1128, + "step": 35943 + }, + { + "epoch": 0.9095832173494951, + "grad_norm": 3.7752959728240967, + "learning_rate": 2.0441102048182282e-07, + "loss": 0.0993, + "step": 35944 + }, + { + "epoch": 0.9096085229141888, + "grad_norm": 3.3264105319976807, + "learning_rate": 2.0429740219060634e-07, + "loss": 0.0666, + "step": 35945 + }, + { + "epoch": 0.9096338284788825, + "grad_norm": 5.590333938598633, + "learning_rate": 2.0418381482598448e-07, + "loss": 0.1488, + "step": 35946 + }, + { + "epoch": 0.9096591340435762, + "grad_norm": 2.4607462882995605, + "learning_rate": 2.0407025838868831e-07, + "loss": 0.0926, + "step": 35947 + }, + { + "epoch": 0.9096844396082698, + "grad_norm": 7.160665035247803, + "learning_rate": 2.0395673287945116e-07, + "loss": 0.1809, + "step": 35948 + }, + { + "epoch": 0.9097097451729635, + "grad_norm": 8.59669017791748, + "learning_rate": 2.038432382990052e-07, + "loss": 0.2274, + "step": 35949 + }, + { + "epoch": 0.9097350507376573, + "grad_norm": 3.564549446105957, + "learning_rate": 2.0372977464808207e-07, + "loss": 0.1551, + "step": 35950 + }, + { + "epoch": 0.9097603563023509, + "grad_norm": 4.73503303527832, + "learning_rate": 2.0361634192741286e-07, + "loss": 0.1394, + "step": 35951 + }, + { + "epoch": 0.9097856618670446, + "grad_norm": 4.145654678344727, + "learning_rate": 2.035029401377303e-07, + "loss": 0.136, + "step": 35952 + }, + { + "epoch": 0.9098109674317383, + "grad_norm": 8.322514533996582, + "learning_rate": 2.033895692797655e-07, + "loss": 0.1903, + "step": 35953 + }, + { + "epoch": 0.9098362729964319, + "grad_norm": 6.791055202484131, + "learning_rate": 2.032762293542484e-07, + "loss": 0.1443, + "step": 35954 + }, + { + "epoch": 0.9098615785611256, + "grad_norm": 5.002954959869385, + "learning_rate": 2.031629203619101e-07, + "loss": 0.1569, + "step": 35955 + }, + { + "epoch": 0.9098868841258193, + "grad_norm": 10.256406784057617, + "learning_rate": 2.0304964230348278e-07, + "loss": 0.2, + "step": 35956 + }, + { + "epoch": 0.9099121896905129, + "grad_norm": 5.032251358032227, + "learning_rate": 2.0293639517969533e-07, + "loss": 0.098, + "step": 35957 + }, + { + "epoch": 0.9099374952552066, + "grad_norm": 12.835201263427734, + "learning_rate": 2.028231789912799e-07, + "loss": 0.0915, + "step": 35958 + }, + { + "epoch": 0.9099628008199003, + "grad_norm": 4.958995342254639, + "learning_rate": 2.0270999373896427e-07, + "loss": 0.1211, + "step": 35959 + }, + { + "epoch": 0.9099881063845939, + "grad_norm": 8.190694808959961, + "learning_rate": 2.0259683942348007e-07, + "loss": 0.1415, + "step": 35960 + }, + { + "epoch": 0.9100134119492876, + "grad_norm": 7.616290092468262, + "learning_rate": 2.0248371604555616e-07, + "loss": 0.1607, + "step": 35961 + }, + { + "epoch": 0.9100387175139814, + "grad_norm": 5.045427322387695, + "learning_rate": 2.0237062360592307e-07, + "loss": 0.1799, + "step": 35962 + }, + { + "epoch": 0.910064023078675, + "grad_norm": 12.773816108703613, + "learning_rate": 2.0225756210530855e-07, + "loss": 0.3159, + "step": 35963 + }, + { + "epoch": 0.9100893286433687, + "grad_norm": 4.1109819412231445, + "learning_rate": 2.0214453154444314e-07, + "loss": 0.159, + "step": 35964 + }, + { + "epoch": 0.9101146342080624, + "grad_norm": 4.2336931228637695, + "learning_rate": 2.0203153192405512e-07, + "loss": 0.1186, + "step": 35965 + }, + { + "epoch": 0.910139939772756, + "grad_norm": 4.82828950881958, + "learning_rate": 2.0191856324487336e-07, + "loss": 0.1251, + "step": 35966 + }, + { + "epoch": 0.9101652453374497, + "grad_norm": 2.9496090412139893, + "learning_rate": 2.0180562550762562e-07, + "loss": 0.09, + "step": 35967 + }, + { + "epoch": 0.9101905509021434, + "grad_norm": 9.245214462280273, + "learning_rate": 2.016927187130413e-07, + "loss": 0.2323, + "step": 35968 + }, + { + "epoch": 0.910215856466837, + "grad_norm": 4.345813751220703, + "learning_rate": 2.0157984286184818e-07, + "loss": 0.123, + "step": 35969 + }, + { + "epoch": 0.9102411620315307, + "grad_norm": 4.984318733215332, + "learning_rate": 2.0146699795477342e-07, + "loss": 0.1371, + "step": 35970 + }, + { + "epoch": 0.9102664675962244, + "grad_norm": 5.275160312652588, + "learning_rate": 2.0135418399254648e-07, + "loss": 0.1284, + "step": 35971 + }, + { + "epoch": 0.9102917731609181, + "grad_norm": 8.893109321594238, + "learning_rate": 2.0124140097589285e-07, + "loss": 0.2133, + "step": 35972 + }, + { + "epoch": 0.9103170787256117, + "grad_norm": 4.672013759613037, + "learning_rate": 2.011286489055414e-07, + "loss": 0.1644, + "step": 35973 + }, + { + "epoch": 0.9103423842903055, + "grad_norm": 4.733782768249512, + "learning_rate": 2.0101592778221767e-07, + "loss": 0.197, + "step": 35974 + }, + { + "epoch": 0.9103676898549992, + "grad_norm": 3.666123390197754, + "learning_rate": 2.0090323760665108e-07, + "loss": 0.1184, + "step": 35975 + }, + { + "epoch": 0.9103929954196928, + "grad_norm": 6.70263147354126, + "learning_rate": 2.0079057837956551e-07, + "loss": 0.1758, + "step": 35976 + }, + { + "epoch": 0.9104183009843865, + "grad_norm": 3.1930043697357178, + "learning_rate": 2.006779501016892e-07, + "loss": 0.1433, + "step": 35977 + }, + { + "epoch": 0.9104436065490802, + "grad_norm": 18.365943908691406, + "learning_rate": 2.005653527737478e-07, + "loss": 0.2781, + "step": 35978 + }, + { + "epoch": 0.9104689121137738, + "grad_norm": 4.730130195617676, + "learning_rate": 2.0045278639646838e-07, + "loss": 0.1386, + "step": 35979 + }, + { + "epoch": 0.9104942176784675, + "grad_norm": 4.8538103103637695, + "learning_rate": 2.0034025097057597e-07, + "loss": 0.1821, + "step": 35980 + }, + { + "epoch": 0.9105195232431612, + "grad_norm": 3.8304402828216553, + "learning_rate": 2.0022774649679612e-07, + "loss": 0.1221, + "step": 35981 + }, + { + "epoch": 0.9105448288078548, + "grad_norm": 7.851187229156494, + "learning_rate": 2.0011527297585432e-07, + "loss": 0.1764, + "step": 35982 + }, + { + "epoch": 0.9105701343725485, + "grad_norm": 8.334360122680664, + "learning_rate": 2.0000283040847723e-07, + "loss": 0.1809, + "step": 35983 + }, + { + "epoch": 0.9105954399372422, + "grad_norm": 12.296612739562988, + "learning_rate": 1.9989041879538872e-07, + "loss": 0.1889, + "step": 35984 + }, + { + "epoch": 0.9106207455019358, + "grad_norm": 8.920489311218262, + "learning_rate": 1.997780381373138e-07, + "loss": 0.2194, + "step": 35985 + }, + { + "epoch": 0.9106460510666295, + "grad_norm": 3.727581739425659, + "learning_rate": 1.9966568843497735e-07, + "loss": 0.1734, + "step": 35986 + }, + { + "epoch": 0.9106713566313233, + "grad_norm": 5.116584300994873, + "learning_rate": 1.995533696891039e-07, + "loss": 0.1255, + "step": 35987 + }, + { + "epoch": 0.9106966621960169, + "grad_norm": 2.9215660095214844, + "learning_rate": 1.9944108190041778e-07, + "loss": 0.0739, + "step": 35988 + }, + { + "epoch": 0.9107219677607106, + "grad_norm": 3.826808214187622, + "learning_rate": 1.9932882506964403e-07, + "loss": 0.128, + "step": 35989 + }, + { + "epoch": 0.9107472733254043, + "grad_norm": 7.987088680267334, + "learning_rate": 1.992165991975048e-07, + "loss": 0.203, + "step": 35990 + }, + { + "epoch": 0.9107725788900979, + "grad_norm": 7.300983428955078, + "learning_rate": 1.991044042847251e-07, + "loss": 0.1373, + "step": 35991 + }, + { + "epoch": 0.9107978844547916, + "grad_norm": 4.261294364929199, + "learning_rate": 1.989922403320277e-07, + "loss": 0.1239, + "step": 35992 + }, + { + "epoch": 0.9108231900194853, + "grad_norm": 3.127779483795166, + "learning_rate": 1.9888010734013697e-07, + "loss": 0.1085, + "step": 35993 + }, + { + "epoch": 0.9108484955841789, + "grad_norm": 6.911563873291016, + "learning_rate": 1.9876800530977402e-07, + "loss": 0.187, + "step": 35994 + }, + { + "epoch": 0.9108738011488726, + "grad_norm": 4.577665328979492, + "learning_rate": 1.986559342416644e-07, + "loss": 0.1131, + "step": 35995 + }, + { + "epoch": 0.9108991067135663, + "grad_norm": 14.101289749145508, + "learning_rate": 1.9854389413652918e-07, + "loss": 0.2224, + "step": 35996 + }, + { + "epoch": 0.91092441227826, + "grad_norm": 3.6601574420928955, + "learning_rate": 1.9843188499509115e-07, + "loss": 0.0943, + "step": 35997 + }, + { + "epoch": 0.9109497178429536, + "grad_norm": 4.138665199279785, + "learning_rate": 1.9831990681807246e-07, + "loss": 0.1167, + "step": 35998 + }, + { + "epoch": 0.9109750234076474, + "grad_norm": 5.376229286193848, + "learning_rate": 1.9820795960619586e-07, + "loss": 0.1547, + "step": 35999 + }, + { + "epoch": 0.9110003289723411, + "grad_norm": 23.02190589904785, + "learning_rate": 1.9809604336018308e-07, + "loss": 0.2412, + "step": 36000 + }, + { + "epoch": 0.9110256345370347, + "grad_norm": 12.444771766662598, + "learning_rate": 1.9798415808075511e-07, + "loss": 0.2096, + "step": 36001 + }, + { + "epoch": 0.9110509401017284, + "grad_norm": 3.652637243270874, + "learning_rate": 1.978723037686353e-07, + "loss": 0.1572, + "step": 36002 + }, + { + "epoch": 0.9110762456664221, + "grad_norm": 5.93792200088501, + "learning_rate": 1.9776048042454198e-07, + "loss": 0.1395, + "step": 36003 + }, + { + "epoch": 0.9111015512311157, + "grad_norm": 2.794490337371826, + "learning_rate": 1.9764868804919957e-07, + "loss": 0.0718, + "step": 36004 + }, + { + "epoch": 0.9111268567958094, + "grad_norm": 4.070071220397949, + "learning_rate": 1.9753692664332637e-07, + "loss": 0.0797, + "step": 36005 + }, + { + "epoch": 0.9111521623605031, + "grad_norm": 7.41241979598999, + "learning_rate": 1.974251962076451e-07, + "loss": 0.1173, + "step": 36006 + }, + { + "epoch": 0.9111774679251967, + "grad_norm": 3.093275785446167, + "learning_rate": 1.973134967428747e-07, + "loss": 0.121, + "step": 36007 + }, + { + "epoch": 0.9112027734898904, + "grad_norm": 4.152102947235107, + "learning_rate": 1.972018282497362e-07, + "loss": 0.0912, + "step": 36008 + }, + { + "epoch": 0.9112280790545841, + "grad_norm": 3.2948620319366455, + "learning_rate": 1.9709019072894965e-07, + "loss": 0.1094, + "step": 36009 + }, + { + "epoch": 0.9112533846192777, + "grad_norm": 9.376638412475586, + "learning_rate": 1.969785841812355e-07, + "loss": 0.1803, + "step": 36010 + }, + { + "epoch": 0.9112786901839715, + "grad_norm": 4.527166366577148, + "learning_rate": 1.9686700860731321e-07, + "loss": 0.1402, + "step": 36011 + }, + { + "epoch": 0.9113039957486652, + "grad_norm": 2.609133005142212, + "learning_rate": 1.9675546400790169e-07, + "loss": 0.1287, + "step": 36012 + }, + { + "epoch": 0.9113293013133588, + "grad_norm": 6.422154903411865, + "learning_rate": 1.966439503837203e-07, + "loss": 0.1714, + "step": 36013 + }, + { + "epoch": 0.9113546068780525, + "grad_norm": 9.545083999633789, + "learning_rate": 1.9653246773548962e-07, + "loss": 0.1725, + "step": 36014 + }, + { + "epoch": 0.9113799124427462, + "grad_norm": 6.4221343994140625, + "learning_rate": 1.9642101606392683e-07, + "loss": 0.1403, + "step": 36015 + }, + { + "epoch": 0.9114052180074398, + "grad_norm": 7.533758640289307, + "learning_rate": 1.963095953697519e-07, + "loss": 0.1397, + "step": 36016 + }, + { + "epoch": 0.9114305235721335, + "grad_norm": 5.937515735626221, + "learning_rate": 1.9619820565368208e-07, + "loss": 0.1615, + "step": 36017 + }, + { + "epoch": 0.9114558291368272, + "grad_norm": 9.250566482543945, + "learning_rate": 1.9608684691643732e-07, + "loss": 0.1494, + "step": 36018 + }, + { + "epoch": 0.9114811347015208, + "grad_norm": 5.466437816619873, + "learning_rate": 1.9597551915873425e-07, + "loss": 0.2134, + "step": 36019 + }, + { + "epoch": 0.9115064402662145, + "grad_norm": 3.5659873485565186, + "learning_rate": 1.9586422238129177e-07, + "loss": 0.1225, + "step": 36020 + }, + { + "epoch": 0.9115317458309082, + "grad_norm": 9.672863960266113, + "learning_rate": 1.9575295658482706e-07, + "loss": 0.197, + "step": 36021 + }, + { + "epoch": 0.911557051395602, + "grad_norm": 6.6775970458984375, + "learning_rate": 1.9564172177005792e-07, + "loss": 0.1808, + "step": 36022 + }, + { + "epoch": 0.9115823569602955, + "grad_norm": 6.241336345672607, + "learning_rate": 1.9553051793770205e-07, + "loss": 0.1754, + "step": 36023 + }, + { + "epoch": 0.9116076625249893, + "grad_norm": 5.913383960723877, + "learning_rate": 1.954193450884756e-07, + "loss": 0.1866, + "step": 36024 + }, + { + "epoch": 0.911632968089683, + "grad_norm": 12.198740005493164, + "learning_rate": 1.9530820322309573e-07, + "loss": 0.2877, + "step": 36025 + }, + { + "epoch": 0.9116582736543766, + "grad_norm": 5.214931488037109, + "learning_rate": 1.9519709234228023e-07, + "loss": 0.1447, + "step": 36026 + }, + { + "epoch": 0.9116835792190703, + "grad_norm": 7.751839637756348, + "learning_rate": 1.9508601244674518e-07, + "loss": 0.2276, + "step": 36027 + }, + { + "epoch": 0.911708884783764, + "grad_norm": 9.673824310302734, + "learning_rate": 1.9497496353720612e-07, + "loss": 0.2455, + "step": 36028 + }, + { + "epoch": 0.9117341903484576, + "grad_norm": 5.658588409423828, + "learning_rate": 1.9486394561438026e-07, + "loss": 0.1564, + "step": 36029 + }, + { + "epoch": 0.9117594959131513, + "grad_norm": 4.347466468811035, + "learning_rate": 1.9475295867898204e-07, + "loss": 0.1335, + "step": 36030 + }, + { + "epoch": 0.911784801477845, + "grad_norm": 4.025167942047119, + "learning_rate": 1.946420027317286e-07, + "loss": 0.1602, + "step": 36031 + }, + { + "epoch": 0.9118101070425386, + "grad_norm": 5.232156276702881, + "learning_rate": 1.9453107777333502e-07, + "loss": 0.2014, + "step": 36032 + }, + { + "epoch": 0.9118354126072323, + "grad_norm": 5.056332111358643, + "learning_rate": 1.9442018380451734e-07, + "loss": 0.0973, + "step": 36033 + }, + { + "epoch": 0.911860718171926, + "grad_norm": 5.331042289733887, + "learning_rate": 1.9430932082598886e-07, + "loss": 0.1733, + "step": 36034 + }, + { + "epoch": 0.9118860237366196, + "grad_norm": 6.576821327209473, + "learning_rate": 1.9419848883846627e-07, + "loss": 0.1352, + "step": 36035 + }, + { + "epoch": 0.9119113293013134, + "grad_norm": 7.611196517944336, + "learning_rate": 1.9408768784266285e-07, + "loss": 0.2216, + "step": 36036 + }, + { + "epoch": 0.9119366348660071, + "grad_norm": 2.669955015182495, + "learning_rate": 1.9397691783929584e-07, + "loss": 0.1347, + "step": 36037 + }, + { + "epoch": 0.9119619404307007, + "grad_norm": 2.5414507389068604, + "learning_rate": 1.9386617882907578e-07, + "loss": 0.1397, + "step": 36038 + }, + { + "epoch": 0.9119872459953944, + "grad_norm": 6.065342903137207, + "learning_rate": 1.9375547081271984e-07, + "loss": 0.1488, + "step": 36039 + }, + { + "epoch": 0.9120125515600881, + "grad_norm": 5.111926078796387, + "learning_rate": 1.936447937909397e-07, + "loss": 0.1778, + "step": 36040 + }, + { + "epoch": 0.9120378571247817, + "grad_norm": 6.20379638671875, + "learning_rate": 1.9353414776445147e-07, + "loss": 0.1321, + "step": 36041 + }, + { + "epoch": 0.9120631626894754, + "grad_norm": 3.7706243991851807, + "learning_rate": 1.9342353273396675e-07, + "loss": 0.1178, + "step": 36042 + }, + { + "epoch": 0.9120884682541691, + "grad_norm": 4.548691272735596, + "learning_rate": 1.933129487002e-07, + "loss": 0.1643, + "step": 36043 + }, + { + "epoch": 0.9121137738188627, + "grad_norm": 4.809169769287109, + "learning_rate": 1.9320239566386346e-07, + "loss": 0.1599, + "step": 36044 + }, + { + "epoch": 0.9121390793835564, + "grad_norm": 7.7113776206970215, + "learning_rate": 1.9309187362567095e-07, + "loss": 0.1642, + "step": 36045 + }, + { + "epoch": 0.9121643849482501, + "grad_norm": 3.5528361797332764, + "learning_rate": 1.929813825863347e-07, + "loss": 0.1254, + "step": 36046 + }, + { + "epoch": 0.9121896905129439, + "grad_norm": 9.965842247009277, + "learning_rate": 1.9287092254656748e-07, + "loss": 0.1, + "step": 36047 + }, + { + "epoch": 0.9122149960776375, + "grad_norm": 10.230778694152832, + "learning_rate": 1.9276049350708093e-07, + "loss": 0.2156, + "step": 36048 + }, + { + "epoch": 0.9122403016423312, + "grad_norm": 5.318641185760498, + "learning_rate": 1.926500954685878e-07, + "loss": 0.1044, + "step": 36049 + }, + { + "epoch": 0.9122656072070249, + "grad_norm": 5.013527870178223, + "learning_rate": 1.925397284318009e-07, + "loss": 0.2227, + "step": 36050 + }, + { + "epoch": 0.9122909127717185, + "grad_norm": 5.704229831695557, + "learning_rate": 1.924293923974302e-07, + "loss": 0.1716, + "step": 36051 + }, + { + "epoch": 0.9123162183364122, + "grad_norm": 4.595317840576172, + "learning_rate": 1.9231908736618787e-07, + "loss": 0.0839, + "step": 36052 + }, + { + "epoch": 0.9123415239011059, + "grad_norm": 3.805166244506836, + "learning_rate": 1.9220881333878616e-07, + "loss": 0.1005, + "step": 36053 + }, + { + "epoch": 0.9123668294657995, + "grad_norm": 3.5911948680877686, + "learning_rate": 1.9209857031593504e-07, + "loss": 0.0981, + "step": 36054 + }, + { + "epoch": 0.9123921350304932, + "grad_norm": 5.634438514709473, + "learning_rate": 1.9198835829834617e-07, + "loss": 0.1151, + "step": 36055 + }, + { + "epoch": 0.9124174405951869, + "grad_norm": 2.348573684692383, + "learning_rate": 1.9187817728673008e-07, + "loss": 0.0667, + "step": 36056 + }, + { + "epoch": 0.9124427461598805, + "grad_norm": 5.154500961303711, + "learning_rate": 1.9176802728179676e-07, + "loss": 0.1499, + "step": 36057 + }, + { + "epoch": 0.9124680517245742, + "grad_norm": 5.794395446777344, + "learning_rate": 1.9165790828425734e-07, + "loss": 0.1089, + "step": 36058 + }, + { + "epoch": 0.912493357289268, + "grad_norm": 3.0832102298736572, + "learning_rate": 1.9154782029482178e-07, + "loss": 0.1177, + "step": 36059 + }, + { + "epoch": 0.9125186628539615, + "grad_norm": 4.7550482749938965, + "learning_rate": 1.9143776331420005e-07, + "loss": 0.1697, + "step": 36060 + }, + { + "epoch": 0.9125439684186553, + "grad_norm": 3.5036489963531494, + "learning_rate": 1.9132773734310105e-07, + "loss": 0.1289, + "step": 36061 + }, + { + "epoch": 0.912569273983349, + "grad_norm": 3.8033483028411865, + "learning_rate": 1.9121774238223535e-07, + "loss": 0.1381, + "step": 36062 + }, + { + "epoch": 0.9125945795480426, + "grad_norm": 7.777943134307861, + "learning_rate": 1.9110777843231175e-07, + "loss": 0.1313, + "step": 36063 + }, + { + "epoch": 0.9126198851127363, + "grad_norm": 7.796865940093994, + "learning_rate": 1.9099784549404033e-07, + "loss": 0.1662, + "step": 36064 + }, + { + "epoch": 0.91264519067743, + "grad_norm": 3.9295032024383545, + "learning_rate": 1.9088794356812823e-07, + "loss": 0.0919, + "step": 36065 + }, + { + "epoch": 0.9126704962421236, + "grad_norm": 5.803186416625977, + "learning_rate": 1.9077807265528604e-07, + "loss": 0.1673, + "step": 36066 + }, + { + "epoch": 0.9126958018068173, + "grad_norm": 5.075010776519775, + "learning_rate": 1.9066823275622037e-07, + "loss": 0.1791, + "step": 36067 + }, + { + "epoch": 0.912721107371511, + "grad_norm": 6.384272575378418, + "learning_rate": 1.9055842387164236e-07, + "loss": 0.1455, + "step": 36068 + }, + { + "epoch": 0.9127464129362046, + "grad_norm": 2.837223768234253, + "learning_rate": 1.90448646002257e-07, + "loss": 0.1135, + "step": 36069 + }, + { + "epoch": 0.9127717185008983, + "grad_norm": 2.9012293815612793, + "learning_rate": 1.9033889914877424e-07, + "loss": 0.0797, + "step": 36070 + }, + { + "epoch": 0.912797024065592, + "grad_norm": 7.50635290145874, + "learning_rate": 1.9022918331190078e-07, + "loss": 0.206, + "step": 36071 + }, + { + "epoch": 0.9128223296302856, + "grad_norm": 9.808455467224121, + "learning_rate": 1.9011949849234602e-07, + "loss": 0.2034, + "step": 36072 + }, + { + "epoch": 0.9128476351949794, + "grad_norm": 3.0448145866394043, + "learning_rate": 1.900098446908144e-07, + "loss": 0.1047, + "step": 36073 + }, + { + "epoch": 0.9128729407596731, + "grad_norm": 13.103087425231934, + "learning_rate": 1.8990022190801483e-07, + "loss": 0.2169, + "step": 36074 + }, + { + "epoch": 0.9128982463243668, + "grad_norm": 3.7741665840148926, + "learning_rate": 1.8979063014465392e-07, + "loss": 0.0841, + "step": 36075 + }, + { + "epoch": 0.9129235518890604, + "grad_norm": 3.5196423530578613, + "learning_rate": 1.8968106940143893e-07, + "loss": 0.13, + "step": 36076 + }, + { + "epoch": 0.9129488574537541, + "grad_norm": 6.492487907409668, + "learning_rate": 1.8957153967907594e-07, + "loss": 0.163, + "step": 36077 + }, + { + "epoch": 0.9129741630184478, + "grad_norm": 13.697149276733398, + "learning_rate": 1.894620409782716e-07, + "loss": 0.192, + "step": 36078 + }, + { + "epoch": 0.9129994685831414, + "grad_norm": 6.9228434562683105, + "learning_rate": 1.8935257329973034e-07, + "loss": 0.1021, + "step": 36079 + }, + { + "epoch": 0.9130247741478351, + "grad_norm": 4.158051013946533, + "learning_rate": 1.8924313664416105e-07, + "loss": 0.1583, + "step": 36080 + }, + { + "epoch": 0.9130500797125288, + "grad_norm": 3.752002716064453, + "learning_rate": 1.8913373101226706e-07, + "loss": 0.1008, + "step": 36081 + }, + { + "epoch": 0.9130753852772224, + "grad_norm": 5.555182933807373, + "learning_rate": 1.8902435640475502e-07, + "loss": 0.0876, + "step": 36082 + }, + { + "epoch": 0.9131006908419161, + "grad_norm": 2.8167917728424072, + "learning_rate": 1.8891501282232938e-07, + "loss": 0.1073, + "step": 36083 + }, + { + "epoch": 0.9131259964066099, + "grad_norm": 5.746669769287109, + "learning_rate": 1.8880570026569677e-07, + "loss": 0.2047, + "step": 36084 + }, + { + "epoch": 0.9131513019713035, + "grad_norm": 3.030613899230957, + "learning_rate": 1.886964187355611e-07, + "loss": 0.134, + "step": 36085 + }, + { + "epoch": 0.9131766075359972, + "grad_norm": 4.889721870422363, + "learning_rate": 1.8858716823262735e-07, + "loss": 0.1778, + "step": 36086 + }, + { + "epoch": 0.9132019131006909, + "grad_norm": 4.268346309661865, + "learning_rate": 1.8847794875759995e-07, + "loss": 0.1398, + "step": 36087 + }, + { + "epoch": 0.9132272186653845, + "grad_norm": 7.53858757019043, + "learning_rate": 1.8836876031118224e-07, + "loss": 0.1323, + "step": 36088 + }, + { + "epoch": 0.9132525242300782, + "grad_norm": 9.270638465881348, + "learning_rate": 1.8825960289408029e-07, + "loss": 0.1475, + "step": 36089 + }, + { + "epoch": 0.9132778297947719, + "grad_norm": 5.419600009918213, + "learning_rate": 1.8815047650699692e-07, + "loss": 0.1778, + "step": 36090 + }, + { + "epoch": 0.9133031353594655, + "grad_norm": 7.180213451385498, + "learning_rate": 1.880413811506365e-07, + "loss": 0.1994, + "step": 36091 + }, + { + "epoch": 0.9133284409241592, + "grad_norm": 5.679788112640381, + "learning_rate": 1.8793231682570135e-07, + "loss": 0.154, + "step": 36092 + }, + { + "epoch": 0.9133537464888529, + "grad_norm": 4.252047538757324, + "learning_rate": 1.8782328353289582e-07, + "loss": 0.1591, + "step": 36093 + }, + { + "epoch": 0.9133790520535465, + "grad_norm": 3.2380807399749756, + "learning_rate": 1.8771428127292325e-07, + "loss": 0.1055, + "step": 36094 + }, + { + "epoch": 0.9134043576182402, + "grad_norm": 13.441035270690918, + "learning_rate": 1.8760531004648586e-07, + "loss": 0.2734, + "step": 36095 + }, + { + "epoch": 0.913429663182934, + "grad_norm": 5.614413261413574, + "learning_rate": 1.8749636985428587e-07, + "loss": 0.2194, + "step": 36096 + }, + { + "epoch": 0.9134549687476275, + "grad_norm": 11.722540855407715, + "learning_rate": 1.8738746069702774e-07, + "loss": 0.3781, + "step": 36097 + }, + { + "epoch": 0.9134802743123213, + "grad_norm": 3.643317699432373, + "learning_rate": 1.8727858257541144e-07, + "loss": 0.1567, + "step": 36098 + }, + { + "epoch": 0.913505579877015, + "grad_norm": 17.715633392333984, + "learning_rate": 1.8716973549014193e-07, + "loss": 0.1445, + "step": 36099 + }, + { + "epoch": 0.9135308854417087, + "grad_norm": 5.170905590057373, + "learning_rate": 1.8706091944191817e-07, + "loss": 0.1458, + "step": 36100 + }, + { + "epoch": 0.9135561910064023, + "grad_norm": 6.162478923797607, + "learning_rate": 1.86952134431444e-07, + "loss": 0.1233, + "step": 36101 + }, + { + "epoch": 0.913581496571096, + "grad_norm": 4.924139976501465, + "learning_rate": 1.868433804594194e-07, + "loss": 0.1691, + "step": 36102 + }, + { + "epoch": 0.9136068021357897, + "grad_norm": 10.769272804260254, + "learning_rate": 1.8673465752654775e-07, + "loss": 0.2575, + "step": 36103 + }, + { + "epoch": 0.9136321077004833, + "grad_norm": 4.637797832489014, + "learning_rate": 1.8662596563352786e-07, + "loss": 0.1451, + "step": 36104 + }, + { + "epoch": 0.913657413265177, + "grad_norm": 11.142152786254883, + "learning_rate": 1.8651730478106255e-07, + "loss": 0.1945, + "step": 36105 + }, + { + "epoch": 0.9136827188298707, + "grad_norm": 3.4402918815612793, + "learning_rate": 1.8640867496985126e-07, + "loss": 0.1306, + "step": 36106 + }, + { + "epoch": 0.9137080243945643, + "grad_norm": 3.841858386993408, + "learning_rate": 1.863000762005951e-07, + "loss": 0.1956, + "step": 36107 + }, + { + "epoch": 0.913733329959258, + "grad_norm": 3.171865940093994, + "learning_rate": 1.8619150847399457e-07, + "loss": 0.0632, + "step": 36108 + }, + { + "epoch": 0.9137586355239518, + "grad_norm": 5.09720516204834, + "learning_rate": 1.8608297179074974e-07, + "loss": 0.1695, + "step": 36109 + }, + { + "epoch": 0.9137839410886454, + "grad_norm": 5.783636569976807, + "learning_rate": 1.8597446615155946e-07, + "loss": 0.1752, + "step": 36110 + }, + { + "epoch": 0.9138092466533391, + "grad_norm": 4.204446315765381, + "learning_rate": 1.8586599155712536e-07, + "loss": 0.1648, + "step": 36111 + }, + { + "epoch": 0.9138345522180328, + "grad_norm": 12.380123138427734, + "learning_rate": 1.8575754800814583e-07, + "loss": 0.2387, + "step": 36112 + }, + { + "epoch": 0.9138598577827264, + "grad_norm": 11.030540466308594, + "learning_rate": 1.8564913550532028e-07, + "loss": 0.1905, + "step": 36113 + }, + { + "epoch": 0.9138851633474201, + "grad_norm": 11.556685447692871, + "learning_rate": 1.8554075404934757e-07, + "loss": 0.1566, + "step": 36114 + }, + { + "epoch": 0.9139104689121138, + "grad_norm": 19.00456428527832, + "learning_rate": 1.854324036409272e-07, + "loss": 0.2527, + "step": 36115 + }, + { + "epoch": 0.9139357744768074, + "grad_norm": 2.5888991355895996, + "learning_rate": 1.8532408428075742e-07, + "loss": 0.0764, + "step": 36116 + }, + { + "epoch": 0.9139610800415011, + "grad_norm": 2.67475962638855, + "learning_rate": 1.8521579596953777e-07, + "loss": 0.1245, + "step": 36117 + }, + { + "epoch": 0.9139863856061948, + "grad_norm": 8.505048751831055, + "learning_rate": 1.851075387079654e-07, + "loss": 0.1476, + "step": 36118 + }, + { + "epoch": 0.9140116911708884, + "grad_norm": 9.031219482421875, + "learning_rate": 1.8499931249673807e-07, + "loss": 0.1243, + "step": 36119 + }, + { + "epoch": 0.9140369967355821, + "grad_norm": 3.931386709213257, + "learning_rate": 1.8489111733655585e-07, + "loss": 0.1499, + "step": 36120 + }, + { + "epoch": 0.9140623023002759, + "grad_norm": 5.428196907043457, + "learning_rate": 1.8478295322811424e-07, + "loss": 0.1528, + "step": 36121 + }, + { + "epoch": 0.9140876078649695, + "grad_norm": 3.743896245956421, + "learning_rate": 1.8467482017211213e-07, + "loss": 0.0883, + "step": 36122 + }, + { + "epoch": 0.9141129134296632, + "grad_norm": 3.0969502925872803, + "learning_rate": 1.8456671816924565e-07, + "loss": 0.1759, + "step": 36123 + }, + { + "epoch": 0.9141382189943569, + "grad_norm": 4.016297340393066, + "learning_rate": 1.8445864722021368e-07, + "loss": 0.135, + "step": 36124 + }, + { + "epoch": 0.9141635245590506, + "grad_norm": 5.056234836578369, + "learning_rate": 1.8435060732571176e-07, + "loss": 0.1136, + "step": 36125 + }, + { + "epoch": 0.9141888301237442, + "grad_norm": 2.457623243331909, + "learning_rate": 1.8424259848643711e-07, + "loss": 0.0825, + "step": 36126 + }, + { + "epoch": 0.9142141356884379, + "grad_norm": 4.399343967437744, + "learning_rate": 1.841346207030853e-07, + "loss": 0.1564, + "step": 36127 + }, + { + "epoch": 0.9142394412531316, + "grad_norm": 6.731215000152588, + "learning_rate": 1.8402667397635464e-07, + "loss": 0.128, + "step": 36128 + }, + { + "epoch": 0.9142647468178252, + "grad_norm": 4.062864303588867, + "learning_rate": 1.8391875830693905e-07, + "loss": 0.1926, + "step": 36129 + }, + { + "epoch": 0.9142900523825189, + "grad_norm": 2.511924982070923, + "learning_rate": 1.8381087369553686e-07, + "loss": 0.1014, + "step": 36130 + }, + { + "epoch": 0.9143153579472126, + "grad_norm": 5.895667552947998, + "learning_rate": 1.8370302014284137e-07, + "loss": 0.1515, + "step": 36131 + }, + { + "epoch": 0.9143406635119062, + "grad_norm": 5.448169231414795, + "learning_rate": 1.835951976495498e-07, + "loss": 0.1204, + "step": 36132 + }, + { + "epoch": 0.9143659690766, + "grad_norm": 9.418082237243652, + "learning_rate": 1.834874062163561e-07, + "loss": 0.2864, + "step": 36133 + }, + { + "epoch": 0.9143912746412937, + "grad_norm": 5.650360584259033, + "learning_rate": 1.833796458439574e-07, + "loss": 0.1005, + "step": 36134 + }, + { + "epoch": 0.9144165802059873, + "grad_norm": 9.653732299804688, + "learning_rate": 1.83271916533046e-07, + "loss": 0.1486, + "step": 36135 + }, + { + "epoch": 0.914441885770681, + "grad_norm": 7.83432149887085, + "learning_rate": 1.8316421828431908e-07, + "loss": 0.2291, + "step": 36136 + }, + { + "epoch": 0.9144671913353747, + "grad_norm": 9.805694580078125, + "learning_rate": 1.8305655109846941e-07, + "loss": 0.1408, + "step": 36137 + }, + { + "epoch": 0.9144924969000683, + "grad_norm": 7.087967395782471, + "learning_rate": 1.829489149761926e-07, + "loss": 0.199, + "step": 36138 + }, + { + "epoch": 0.914517802464762, + "grad_norm": 4.892001152038574, + "learning_rate": 1.8284130991818194e-07, + "loss": 0.1439, + "step": 36139 + }, + { + "epoch": 0.9145431080294557, + "grad_norm": 6.540905475616455, + "learning_rate": 1.827337359251319e-07, + "loss": 0.1434, + "step": 36140 + }, + { + "epoch": 0.9145684135941493, + "grad_norm": 5.235376358032227, + "learning_rate": 1.8262619299773577e-07, + "loss": 0.1399, + "step": 36141 + }, + { + "epoch": 0.914593719158843, + "grad_norm": 11.88533878326416, + "learning_rate": 1.8251868113668693e-07, + "loss": 0.4655, + "step": 36142 + }, + { + "epoch": 0.9146190247235367, + "grad_norm": 3.235316753387451, + "learning_rate": 1.8241120034267924e-07, + "loss": 0.1093, + "step": 36143 + }, + { + "epoch": 0.9146443302882303, + "grad_norm": 4.83397912979126, + "learning_rate": 1.8230375061640547e-07, + "loss": 0.1257, + "step": 36144 + }, + { + "epoch": 0.914669635852924, + "grad_norm": 6.810108661651611, + "learning_rate": 1.82196331958559e-07, + "loss": 0.1663, + "step": 36145 + }, + { + "epoch": 0.9146949414176178, + "grad_norm": 3.968130588531494, + "learning_rate": 1.820889443698315e-07, + "loss": 0.1496, + "step": 36146 + }, + { + "epoch": 0.9147202469823114, + "grad_norm": 5.349133014678955, + "learning_rate": 1.8198158785091678e-07, + "loss": 0.1771, + "step": 36147 + }, + { + "epoch": 0.9147455525470051, + "grad_norm": 2.8633861541748047, + "learning_rate": 1.8187426240250662e-07, + "loss": 0.0816, + "step": 36148 + }, + { + "epoch": 0.9147708581116988, + "grad_norm": 3.087759494781494, + "learning_rate": 1.817669680252926e-07, + "loss": 0.0984, + "step": 36149 + }, + { + "epoch": 0.9147961636763925, + "grad_norm": 3.9992268085479736, + "learning_rate": 1.8165970471996698e-07, + "loss": 0.207, + "step": 36150 + }, + { + "epoch": 0.9148214692410861, + "grad_norm": 5.317045211791992, + "learning_rate": 1.815524724872225e-07, + "loss": 0.1241, + "step": 36151 + }, + { + "epoch": 0.9148467748057798, + "grad_norm": 3.5362207889556885, + "learning_rate": 1.8144527132774982e-07, + "loss": 0.0955, + "step": 36152 + }, + { + "epoch": 0.9148720803704735, + "grad_norm": 3.817897081375122, + "learning_rate": 1.8133810124223994e-07, + "loss": 0.1067, + "step": 36153 + }, + { + "epoch": 0.9148973859351671, + "grad_norm": 3.808079957962036, + "learning_rate": 1.8123096223138347e-07, + "loss": 0.1697, + "step": 36154 + }, + { + "epoch": 0.9149226914998608, + "grad_norm": 6.809124946594238, + "learning_rate": 1.8112385429587322e-07, + "loss": 0.1595, + "step": 36155 + }, + { + "epoch": 0.9149479970645545, + "grad_norm": 4.707089424133301, + "learning_rate": 1.8101677743639857e-07, + "loss": 0.1389, + "step": 36156 + }, + { + "epoch": 0.9149733026292481, + "grad_norm": 7.114553928375244, + "learning_rate": 1.8090973165365067e-07, + "loss": 0.2354, + "step": 36157 + }, + { + "epoch": 0.9149986081939419, + "grad_norm": 3.8544137477874756, + "learning_rate": 1.808027169483184e-07, + "loss": 0.1314, + "step": 36158 + }, + { + "epoch": 0.9150239137586356, + "grad_norm": 6.7160749435424805, + "learning_rate": 1.8069573332109402e-07, + "loss": 0.1801, + "step": 36159 + }, + { + "epoch": 0.9150492193233292, + "grad_norm": 3.938687562942505, + "learning_rate": 1.8058878077266584e-07, + "loss": 0.1521, + "step": 36160 + }, + { + "epoch": 0.9150745248880229, + "grad_norm": 13.076776504516602, + "learning_rate": 1.8048185930372498e-07, + "loss": 0.2117, + "step": 36161 + }, + { + "epoch": 0.9150998304527166, + "grad_norm": 10.412674903869629, + "learning_rate": 1.803749689149592e-07, + "loss": 0.1626, + "step": 36162 + }, + { + "epoch": 0.9151251360174102, + "grad_norm": 10.636237144470215, + "learning_rate": 1.802681096070591e-07, + "loss": 0.2271, + "step": 36163 + }, + { + "epoch": 0.9151504415821039, + "grad_norm": 7.572487831115723, + "learning_rate": 1.80161281380713e-07, + "loss": 0.2377, + "step": 36164 + }, + { + "epoch": 0.9151757471467976, + "grad_norm": 4.068891525268555, + "learning_rate": 1.8005448423661142e-07, + "loss": 0.1631, + "step": 36165 + }, + { + "epoch": 0.9152010527114912, + "grad_norm": 3.7521679401397705, + "learning_rate": 1.7994771817544054e-07, + "loss": 0.1367, + "step": 36166 + }, + { + "epoch": 0.9152263582761849, + "grad_norm": 4.165274143218994, + "learning_rate": 1.798409831978909e-07, + "loss": 0.1447, + "step": 36167 + }, + { + "epoch": 0.9152516638408786, + "grad_norm": 3.5899815559387207, + "learning_rate": 1.7973427930465027e-07, + "loss": 0.1493, + "step": 36168 + }, + { + "epoch": 0.9152769694055722, + "grad_norm": 4.519300937652588, + "learning_rate": 1.7962760649640588e-07, + "loss": 0.1303, + "step": 36169 + }, + { + "epoch": 0.915302274970266, + "grad_norm": 5.051185131072998, + "learning_rate": 1.795209647738466e-07, + "loss": 0.1196, + "step": 36170 + }, + { + "epoch": 0.9153275805349597, + "grad_norm": 2.869818925857544, + "learning_rate": 1.7941435413766028e-07, + "loss": 0.0722, + "step": 36171 + }, + { + "epoch": 0.9153528860996533, + "grad_norm": 5.590930461883545, + "learning_rate": 1.7930777458853465e-07, + "loss": 0.2233, + "step": 36172 + }, + { + "epoch": 0.915378191664347, + "grad_norm": 2.2733688354492188, + "learning_rate": 1.7920122612715528e-07, + "loss": 0.1006, + "step": 36173 + }, + { + "epoch": 0.9154034972290407, + "grad_norm": 4.227181911468506, + "learning_rate": 1.7909470875421108e-07, + "loss": 0.1154, + "step": 36174 + }, + { + "epoch": 0.9154288027937344, + "grad_norm": 6.662059307098389, + "learning_rate": 1.789882224703887e-07, + "loss": 0.1654, + "step": 36175 + }, + { + "epoch": 0.915454108358428, + "grad_norm": 7.359897136688232, + "learning_rate": 1.7888176727637375e-07, + "loss": 0.2051, + "step": 36176 + }, + { + "epoch": 0.9154794139231217, + "grad_norm": 12.941262245178223, + "learning_rate": 1.787753431728534e-07, + "loss": 0.2728, + "step": 36177 + }, + { + "epoch": 0.9155047194878154, + "grad_norm": 3.2888684272766113, + "learning_rate": 1.7866895016051544e-07, + "loss": 0.1022, + "step": 36178 + }, + { + "epoch": 0.915530025052509, + "grad_norm": 5.810947895050049, + "learning_rate": 1.7856258824004323e-07, + "loss": 0.202, + "step": 36179 + }, + { + "epoch": 0.9155553306172027, + "grad_norm": 2.8733739852905273, + "learning_rate": 1.7845625741212457e-07, + "loss": 0.0982, + "step": 36180 + }, + { + "epoch": 0.9155806361818964, + "grad_norm": 3.067930221557617, + "learning_rate": 1.783499576774439e-07, + "loss": 0.1018, + "step": 36181 + }, + { + "epoch": 0.91560594174659, + "grad_norm": 9.775391578674316, + "learning_rate": 1.7824368903668844e-07, + "loss": 0.2957, + "step": 36182 + }, + { + "epoch": 0.9156312473112838, + "grad_norm": 2.8144378662109375, + "learning_rate": 1.7813745149054206e-07, + "loss": 0.1249, + "step": 36183 + }, + { + "epoch": 0.9156565528759775, + "grad_norm": 3.8981401920318604, + "learning_rate": 1.7803124503969038e-07, + "loss": 0.1786, + "step": 36184 + }, + { + "epoch": 0.9156818584406711, + "grad_norm": 6.068109035491943, + "learning_rate": 1.779250696848178e-07, + "loss": 0.1736, + "step": 36185 + }, + { + "epoch": 0.9157071640053648, + "grad_norm": 3.478145122528076, + "learning_rate": 1.778189254266105e-07, + "loss": 0.1429, + "step": 36186 + }, + { + "epoch": 0.9157324695700585, + "grad_norm": 9.50204849243164, + "learning_rate": 1.7771281226575122e-07, + "loss": 0.1409, + "step": 36187 + }, + { + "epoch": 0.9157577751347521, + "grad_norm": 6.791365146636963, + "learning_rate": 1.7760673020292552e-07, + "loss": 0.2203, + "step": 36188 + }, + { + "epoch": 0.9157830806994458, + "grad_norm": 5.538915634155273, + "learning_rate": 1.775006792388162e-07, + "loss": 0.1558, + "step": 36189 + }, + { + "epoch": 0.9158083862641395, + "grad_norm": 12.820510864257812, + "learning_rate": 1.7739465937410826e-07, + "loss": 0.1594, + "step": 36190 + }, + { + "epoch": 0.9158336918288331, + "grad_norm": 2.4202678203582764, + "learning_rate": 1.772886706094845e-07, + "loss": 0.0894, + "step": 36191 + }, + { + "epoch": 0.9158589973935268, + "grad_norm": 3.5943310260772705, + "learning_rate": 1.771827129456305e-07, + "loss": 0.0732, + "step": 36192 + }, + { + "epoch": 0.9158843029582205, + "grad_norm": 8.559642791748047, + "learning_rate": 1.7707678638322678e-07, + "loss": 0.1832, + "step": 36193 + }, + { + "epoch": 0.9159096085229141, + "grad_norm": 4.4478654861450195, + "learning_rate": 1.769708909229584e-07, + "loss": 0.1483, + "step": 36194 + }, + { + "epoch": 0.9159349140876079, + "grad_norm": 6.038867950439453, + "learning_rate": 1.7686502656550697e-07, + "loss": 0.1581, + "step": 36195 + }, + { + "epoch": 0.9159602196523016, + "grad_norm": 11.728199005126953, + "learning_rate": 1.76759193311557e-07, + "loss": 0.1297, + "step": 36196 + }, + { + "epoch": 0.9159855252169952, + "grad_norm": 5.244138240814209, + "learning_rate": 1.7665339116178848e-07, + "loss": 0.1641, + "step": 36197 + }, + { + "epoch": 0.9160108307816889, + "grad_norm": 3.067586898803711, + "learning_rate": 1.7654762011688586e-07, + "loss": 0.0597, + "step": 36198 + }, + { + "epoch": 0.9160361363463826, + "grad_norm": 13.833661079406738, + "learning_rate": 1.7644188017753027e-07, + "loss": 0.132, + "step": 36199 + }, + { + "epoch": 0.9160614419110762, + "grad_norm": 14.894757270812988, + "learning_rate": 1.7633617134440395e-07, + "loss": 0.2109, + "step": 36200 + }, + { + "epoch": 0.9160867474757699, + "grad_norm": 3.9545421600341797, + "learning_rate": 1.7623049361818746e-07, + "loss": 0.1689, + "step": 36201 + }, + { + "epoch": 0.9161120530404636, + "grad_norm": 8.035274505615234, + "learning_rate": 1.7612484699956412e-07, + "loss": 0.2103, + "step": 36202 + }, + { + "epoch": 0.9161373586051573, + "grad_norm": 3.6537063121795654, + "learning_rate": 1.7601923148921396e-07, + "loss": 0.1619, + "step": 36203 + }, + { + "epoch": 0.9161626641698509, + "grad_norm": 9.544347763061523, + "learning_rate": 1.759136470878181e-07, + "loss": 0.1568, + "step": 36204 + }, + { + "epoch": 0.9161879697345446, + "grad_norm": 3.3609540462493896, + "learning_rate": 1.758080937960588e-07, + "loss": 0.0855, + "step": 36205 + }, + { + "epoch": 0.9162132752992383, + "grad_norm": 3.6745200157165527, + "learning_rate": 1.7570257161461492e-07, + "loss": 0.1115, + "step": 36206 + }, + { + "epoch": 0.916238580863932, + "grad_norm": 5.916826248168945, + "learning_rate": 1.7559708054416812e-07, + "loss": 0.1815, + "step": 36207 + }, + { + "epoch": 0.9162638864286257, + "grad_norm": 4.913269519805908, + "learning_rate": 1.7549162058539794e-07, + "loss": 0.1723, + "step": 36208 + }, + { + "epoch": 0.9162891919933194, + "grad_norm": 5.54319953918457, + "learning_rate": 1.7538619173898596e-07, + "loss": 0.121, + "step": 36209 + }, + { + "epoch": 0.916314497558013, + "grad_norm": 7.719936847686768, + "learning_rate": 1.752807940056095e-07, + "loss": 0.1796, + "step": 36210 + }, + { + "epoch": 0.9163398031227067, + "grad_norm": 5.29634952545166, + "learning_rate": 1.7517542738595071e-07, + "loss": 0.1292, + "step": 36211 + }, + { + "epoch": 0.9163651086874004, + "grad_norm": 5.219048023223877, + "learning_rate": 1.7507009188068748e-07, + "loss": 0.1725, + "step": 36212 + }, + { + "epoch": 0.916390414252094, + "grad_norm": 5.097596645355225, + "learning_rate": 1.7496478749049973e-07, + "loss": 0.2347, + "step": 36213 + }, + { + "epoch": 0.9164157198167877, + "grad_norm": 6.449128150939941, + "learning_rate": 1.748595142160675e-07, + "loss": 0.1625, + "step": 36214 + }, + { + "epoch": 0.9164410253814814, + "grad_norm": 5.572441101074219, + "learning_rate": 1.7475427205806806e-07, + "loss": 0.1764, + "step": 36215 + }, + { + "epoch": 0.916466330946175, + "grad_norm": 4.716376304626465, + "learning_rate": 1.7464906101718026e-07, + "loss": 0.1469, + "step": 36216 + }, + { + "epoch": 0.9164916365108687, + "grad_norm": 6.808838367462158, + "learning_rate": 1.745438810940836e-07, + "loss": 0.2088, + "step": 36217 + }, + { + "epoch": 0.9165169420755624, + "grad_norm": 5.343908786773682, + "learning_rate": 1.7443873228945585e-07, + "loss": 0.1523, + "step": 36218 + }, + { + "epoch": 0.916542247640256, + "grad_norm": 3.6724343299865723, + "learning_rate": 1.7433361460397534e-07, + "loss": 0.0849, + "step": 36219 + }, + { + "epoch": 0.9165675532049498, + "grad_norm": 9.841693878173828, + "learning_rate": 1.742285280383188e-07, + "loss": 0.1755, + "step": 36220 + }, + { + "epoch": 0.9165928587696435, + "grad_norm": 5.0407304763793945, + "learning_rate": 1.7412347259316565e-07, + "loss": 0.171, + "step": 36221 + }, + { + "epoch": 0.9166181643343371, + "grad_norm": 6.578604221343994, + "learning_rate": 1.7401844826919255e-07, + "loss": 0.0964, + "step": 36222 + }, + { + "epoch": 0.9166434698990308, + "grad_norm": 5.697086334228516, + "learning_rate": 1.739134550670768e-07, + "loss": 0.1201, + "step": 36223 + }, + { + "epoch": 0.9166687754637245, + "grad_norm": 9.102412223815918, + "learning_rate": 1.7380849298749502e-07, + "loss": 0.2145, + "step": 36224 + }, + { + "epoch": 0.9166940810284181, + "grad_norm": 2.7863802909851074, + "learning_rate": 1.737035620311245e-07, + "loss": 0.0807, + "step": 36225 + }, + { + "epoch": 0.9167193865931118, + "grad_norm": 3.1853318214416504, + "learning_rate": 1.7359866219864242e-07, + "loss": 0.1382, + "step": 36226 + }, + { + "epoch": 0.9167446921578055, + "grad_norm": 6.264216423034668, + "learning_rate": 1.7349379349072493e-07, + "loss": 0.1959, + "step": 36227 + }, + { + "epoch": 0.9167699977224992, + "grad_norm": 6.0035080909729, + "learning_rate": 1.7338895590804705e-07, + "loss": 0.224, + "step": 36228 + }, + { + "epoch": 0.9167953032871928, + "grad_norm": 5.896356105804443, + "learning_rate": 1.7328414945128713e-07, + "loss": 0.1333, + "step": 36229 + }, + { + "epoch": 0.9168206088518865, + "grad_norm": 5.327425479888916, + "learning_rate": 1.7317937412111962e-07, + "loss": 0.1274, + "step": 36230 + }, + { + "epoch": 0.9168459144165803, + "grad_norm": 3.945128917694092, + "learning_rate": 1.730746299182201e-07, + "loss": 0.2016, + "step": 36231 + }, + { + "epoch": 0.9168712199812739, + "grad_norm": 19.030414581298828, + "learning_rate": 1.729699168432647e-07, + "loss": 0.3022, + "step": 36232 + }, + { + "epoch": 0.9168965255459676, + "grad_norm": 7.045691013336182, + "learning_rate": 1.728652348969284e-07, + "loss": 0.1929, + "step": 36233 + }, + { + "epoch": 0.9169218311106613, + "grad_norm": 3.502502679824829, + "learning_rate": 1.7276058407988627e-07, + "loss": 0.1172, + "step": 36234 + }, + { + "epoch": 0.9169471366753549, + "grad_norm": 2.995192289352417, + "learning_rate": 1.7265596439281273e-07, + "loss": 0.1084, + "step": 36235 + }, + { + "epoch": 0.9169724422400486, + "grad_norm": 4.277467727661133, + "learning_rate": 1.7255137583638392e-07, + "loss": 0.1294, + "step": 36236 + }, + { + "epoch": 0.9169977478047423, + "grad_norm": 4.81903600692749, + "learning_rate": 1.7244681841127265e-07, + "loss": 0.1695, + "step": 36237 + }, + { + "epoch": 0.9170230533694359, + "grad_norm": 3.905878782272339, + "learning_rate": 1.723422921181539e-07, + "loss": 0.178, + "step": 36238 + }, + { + "epoch": 0.9170483589341296, + "grad_norm": 8.39857292175293, + "learning_rate": 1.7223779695770104e-07, + "loss": 0.313, + "step": 36239 + }, + { + "epoch": 0.9170736644988233, + "grad_norm": 3.6153805255889893, + "learning_rate": 1.721333329305902e-07, + "loss": 0.1844, + "step": 36240 + }, + { + "epoch": 0.9170989700635169, + "grad_norm": 2.674333095550537, + "learning_rate": 1.7202890003749195e-07, + "loss": 0.1203, + "step": 36241 + }, + { + "epoch": 0.9171242756282106, + "grad_norm": 9.691618919372559, + "learning_rate": 1.7192449827908243e-07, + "loss": 0.2303, + "step": 36242 + }, + { + "epoch": 0.9171495811929044, + "grad_norm": 7.072446823120117, + "learning_rate": 1.7182012765603273e-07, + "loss": 0.1676, + "step": 36243 + }, + { + "epoch": 0.917174886757598, + "grad_norm": 6.845853328704834, + "learning_rate": 1.7171578816901735e-07, + "loss": 0.1114, + "step": 36244 + }, + { + "epoch": 0.9172001923222917, + "grad_norm": 4.8564772605896, + "learning_rate": 1.7161147981870851e-07, + "loss": 0.1393, + "step": 36245 + }, + { + "epoch": 0.9172254978869854, + "grad_norm": 4.741999626159668, + "learning_rate": 1.715072026057796e-07, + "loss": 0.1528, + "step": 36246 + }, + { + "epoch": 0.917250803451679, + "grad_norm": 2.813232421875, + "learning_rate": 1.7140295653090166e-07, + "loss": 0.1111, + "step": 36247 + }, + { + "epoch": 0.9172761090163727, + "grad_norm": 2.9551780223846436, + "learning_rate": 1.7129874159474867e-07, + "loss": 0.0839, + "step": 36248 + }, + { + "epoch": 0.9173014145810664, + "grad_norm": 3.8224475383758545, + "learning_rate": 1.7119455779799178e-07, + "loss": 0.1113, + "step": 36249 + }, + { + "epoch": 0.91732672014576, + "grad_norm": 2.9871981143951416, + "learning_rate": 1.7109040514130314e-07, + "loss": 0.0729, + "step": 36250 + }, + { + "epoch": 0.9173520257104537, + "grad_norm": 5.141506671905518, + "learning_rate": 1.709862836253534e-07, + "loss": 0.1286, + "step": 36251 + }, + { + "epoch": 0.9173773312751474, + "grad_norm": 4.994833946228027, + "learning_rate": 1.708821932508159e-07, + "loss": 0.1596, + "step": 36252 + }, + { + "epoch": 0.9174026368398411, + "grad_norm": 3.6786904335021973, + "learning_rate": 1.7077813401836008e-07, + "loss": 0.1516, + "step": 36253 + }, + { + "epoch": 0.9174279424045347, + "grad_norm": 5.929256916046143, + "learning_rate": 1.706741059286582e-07, + "loss": 0.1655, + "step": 36254 + }, + { + "epoch": 0.9174532479692284, + "grad_norm": 4.503864765167236, + "learning_rate": 1.7057010898238026e-07, + "loss": 0.1311, + "step": 36255 + }, + { + "epoch": 0.9174785535339222, + "grad_norm": 7.50771951675415, + "learning_rate": 1.7046614318019795e-07, + "loss": 0.2273, + "step": 36256 + }, + { + "epoch": 0.9175038590986158, + "grad_norm": 3.0300650596618652, + "learning_rate": 1.7036220852278073e-07, + "loss": 0.0933, + "step": 36257 + }, + { + "epoch": 0.9175291646633095, + "grad_norm": 2.9622905254364014, + "learning_rate": 1.702583050107992e-07, + "loss": 0.0824, + "step": 36258 + }, + { + "epoch": 0.9175544702280032, + "grad_norm": 23.541120529174805, + "learning_rate": 1.7015443264492338e-07, + "loss": 0.1433, + "step": 36259 + }, + { + "epoch": 0.9175797757926968, + "grad_norm": 6.567699432373047, + "learning_rate": 1.7005059142582325e-07, + "loss": 0.1657, + "step": 36260 + }, + { + "epoch": 0.9176050813573905, + "grad_norm": 3.443624496459961, + "learning_rate": 1.6994678135416886e-07, + "loss": 0.1352, + "step": 36261 + }, + { + "epoch": 0.9176303869220842, + "grad_norm": 3.6234774589538574, + "learning_rate": 1.6984300243062912e-07, + "loss": 0.106, + "step": 36262 + }, + { + "epoch": 0.9176556924867778, + "grad_norm": 3.002800941467285, + "learning_rate": 1.697392546558735e-07, + "loss": 0.1187, + "step": 36263 + }, + { + "epoch": 0.9176809980514715, + "grad_norm": 3.4221999645233154, + "learning_rate": 1.6963553803056975e-07, + "loss": 0.113, + "step": 36264 + }, + { + "epoch": 0.9177063036161652, + "grad_norm": 4.480385780334473, + "learning_rate": 1.6953185255538905e-07, + "loss": 0.1067, + "step": 36265 + }, + { + "epoch": 0.9177316091808588, + "grad_norm": 5.239086627960205, + "learning_rate": 1.694281982309981e-07, + "loss": 0.1388, + "step": 36266 + }, + { + "epoch": 0.9177569147455525, + "grad_norm": 11.036428451538086, + "learning_rate": 1.6932457505806743e-07, + "loss": 0.1631, + "step": 36267 + }, + { + "epoch": 0.9177822203102463, + "grad_norm": 3.372572898864746, + "learning_rate": 1.6922098303726264e-07, + "loss": 0.1682, + "step": 36268 + }, + { + "epoch": 0.9178075258749399, + "grad_norm": 5.864551544189453, + "learning_rate": 1.6911742216925374e-07, + "loss": 0.1217, + "step": 36269 + }, + { + "epoch": 0.9178328314396336, + "grad_norm": 1.8653805255889893, + "learning_rate": 1.69013892454708e-07, + "loss": 0.0696, + "step": 36270 + }, + { + "epoch": 0.9178581370043273, + "grad_norm": 8.602097511291504, + "learning_rate": 1.689103938942932e-07, + "loss": 0.0933, + "step": 36271 + }, + { + "epoch": 0.9178834425690209, + "grad_norm": 8.625515937805176, + "learning_rate": 1.6880692648867603e-07, + "loss": 0.2675, + "step": 36272 + }, + { + "epoch": 0.9179087481337146, + "grad_norm": 10.112747192382812, + "learning_rate": 1.6870349023852482e-07, + "loss": 0.2143, + "step": 36273 + }, + { + "epoch": 0.9179340536984083, + "grad_norm": 2.6181728839874268, + "learning_rate": 1.6860008514450576e-07, + "loss": 0.0779, + "step": 36274 + }, + { + "epoch": 0.9179593592631019, + "grad_norm": 7.934445858001709, + "learning_rate": 1.6849671120728716e-07, + "loss": 0.1255, + "step": 36275 + }, + { + "epoch": 0.9179846648277956, + "grad_norm": 7.118378162384033, + "learning_rate": 1.6839336842753295e-07, + "loss": 0.2373, + "step": 36276 + }, + { + "epoch": 0.9180099703924893, + "grad_norm": 15.94844913482666, + "learning_rate": 1.6829005680591204e-07, + "loss": 0.1496, + "step": 36277 + }, + { + "epoch": 0.918035275957183, + "grad_norm": 5.9966349601745605, + "learning_rate": 1.681867763430889e-07, + "loss": 0.1178, + "step": 36278 + }, + { + "epoch": 0.9180605815218766, + "grad_norm": 6.631586074829102, + "learning_rate": 1.680835270397313e-07, + "loss": 0.1008, + "step": 36279 + }, + { + "epoch": 0.9180858870865704, + "grad_norm": 4.362093925476074, + "learning_rate": 1.679803088965043e-07, + "loss": 0.118, + "step": 36280 + }, + { + "epoch": 0.9181111926512641, + "grad_norm": 6.6810808181762695, + "learning_rate": 1.6787712191407346e-07, + "loss": 0.1676, + "step": 36281 + }, + { + "epoch": 0.9181364982159577, + "grad_norm": 3.935194492340088, + "learning_rate": 1.677739660931038e-07, + "loss": 0.1202, + "step": 36282 + }, + { + "epoch": 0.9181618037806514, + "grad_norm": 5.706119537353516, + "learning_rate": 1.6767084143426093e-07, + "loss": 0.2606, + "step": 36283 + }, + { + "epoch": 0.9181871093453451, + "grad_norm": 6.966766834259033, + "learning_rate": 1.675677479382104e-07, + "loss": 0.1962, + "step": 36284 + }, + { + "epoch": 0.9182124149100387, + "grad_norm": 3.560770273208618, + "learning_rate": 1.674646856056167e-07, + "loss": 0.142, + "step": 36285 + }, + { + "epoch": 0.9182377204747324, + "grad_norm": 6.718571186065674, + "learning_rate": 1.6736165443714314e-07, + "loss": 0.1828, + "step": 36286 + }, + { + "epoch": 0.9182630260394261, + "grad_norm": 5.489368915557861, + "learning_rate": 1.6725865443345645e-07, + "loss": 0.1556, + "step": 36287 + }, + { + "epoch": 0.9182883316041197, + "grad_norm": 12.3068265914917, + "learning_rate": 1.6715568559521944e-07, + "loss": 0.2329, + "step": 36288 + }, + { + "epoch": 0.9183136371688134, + "grad_norm": 4.868321418762207, + "learning_rate": 1.6705274792309655e-07, + "loss": 0.1157, + "step": 36289 + }, + { + "epoch": 0.9183389427335071, + "grad_norm": 3.5989112854003906, + "learning_rate": 1.6694984141775173e-07, + "loss": 0.1053, + "step": 36290 + }, + { + "epoch": 0.9183642482982007, + "grad_norm": 21.081546783447266, + "learning_rate": 1.668469660798472e-07, + "loss": 0.1381, + "step": 36291 + }, + { + "epoch": 0.9183895538628944, + "grad_norm": 5.661078453063965, + "learning_rate": 1.667441219100485e-07, + "loss": 0.1866, + "step": 36292 + }, + { + "epoch": 0.9184148594275882, + "grad_norm": 11.107693672180176, + "learning_rate": 1.6664130890901797e-07, + "loss": 0.1532, + "step": 36293 + }, + { + "epoch": 0.9184401649922818, + "grad_norm": 8.409526824951172, + "learning_rate": 1.6653852707741835e-07, + "loss": 0.2234, + "step": 36294 + }, + { + "epoch": 0.9184654705569755, + "grad_norm": 15.100358963012695, + "learning_rate": 1.6643577641591248e-07, + "loss": 0.1737, + "step": 36295 + }, + { + "epoch": 0.9184907761216692, + "grad_norm": 3.2628018856048584, + "learning_rate": 1.6633305692516367e-07, + "loss": 0.1224, + "step": 36296 + }, + { + "epoch": 0.9185160816863628, + "grad_norm": 5.3231306076049805, + "learning_rate": 1.6623036860583307e-07, + "loss": 0.1883, + "step": 36297 + }, + { + "epoch": 0.9185413872510565, + "grad_norm": 5.685168266296387, + "learning_rate": 1.6612771145858464e-07, + "loss": 0.1313, + "step": 36298 + }, + { + "epoch": 0.9185666928157502, + "grad_norm": 7.530094623565674, + "learning_rate": 1.6602508548407837e-07, + "loss": 0.2211, + "step": 36299 + }, + { + "epoch": 0.9185919983804438, + "grad_norm": 18.113277435302734, + "learning_rate": 1.6592249068297818e-07, + "loss": 0.1718, + "step": 36300 + }, + { + "epoch": 0.9186173039451375, + "grad_norm": 5.119908332824707, + "learning_rate": 1.6581992705594352e-07, + "loss": 0.1656, + "step": 36301 + }, + { + "epoch": 0.9186426095098312, + "grad_norm": 7.030173301696777, + "learning_rate": 1.6571739460363835e-07, + "loss": 0.1198, + "step": 36302 + }, + { + "epoch": 0.9186679150745249, + "grad_norm": 6.383265495300293, + "learning_rate": 1.65614893326721e-07, + "loss": 0.1724, + "step": 36303 + }, + { + "epoch": 0.9186932206392185, + "grad_norm": 3.750900983810425, + "learning_rate": 1.655124232258548e-07, + "loss": 0.1214, + "step": 36304 + }, + { + "epoch": 0.9187185262039123, + "grad_norm": 3.865849733352661, + "learning_rate": 1.654099843016993e-07, + "loss": 0.1539, + "step": 36305 + }, + { + "epoch": 0.918743831768606, + "grad_norm": 4.954841613769531, + "learning_rate": 1.6530757655491615e-07, + "loss": 0.149, + "step": 36306 + }, + { + "epoch": 0.9187691373332996, + "grad_norm": 5.5163421630859375, + "learning_rate": 1.6520519998616423e-07, + "loss": 0.143, + "step": 36307 + }, + { + "epoch": 0.9187944428979933, + "grad_norm": 9.221813201904297, + "learning_rate": 1.651028545961053e-07, + "loss": 0.1604, + "step": 36308 + }, + { + "epoch": 0.918819748462687, + "grad_norm": 5.175289630889893, + "learning_rate": 1.6500054038539825e-07, + "loss": 0.1497, + "step": 36309 + }, + { + "epoch": 0.9188450540273806, + "grad_norm": 3.835114002227783, + "learning_rate": 1.6489825735470365e-07, + "loss": 0.1052, + "step": 36310 + }, + { + "epoch": 0.9188703595920743, + "grad_norm": 7.740011215209961, + "learning_rate": 1.64796005504681e-07, + "loss": 0.2045, + "step": 36311 + }, + { + "epoch": 0.918895665156768, + "grad_norm": 6.777438163757324, + "learning_rate": 1.646937848359892e-07, + "loss": 0.1846, + "step": 36312 + }, + { + "epoch": 0.9189209707214616, + "grad_norm": 5.639612197875977, + "learning_rate": 1.6459159534928715e-07, + "loss": 0.1182, + "step": 36313 + }, + { + "epoch": 0.9189462762861553, + "grad_norm": 6.079726696014404, + "learning_rate": 1.6448943704523546e-07, + "loss": 0.1607, + "step": 36314 + }, + { + "epoch": 0.918971581850849, + "grad_norm": 5.028695106506348, + "learning_rate": 1.6438730992449137e-07, + "loss": 0.0959, + "step": 36315 + }, + { + "epoch": 0.9189968874155426, + "grad_norm": 3.8738133907318115, + "learning_rate": 1.6428521398771434e-07, + "loss": 0.1012, + "step": 36316 + }, + { + "epoch": 0.9190221929802364, + "grad_norm": 4.969846248626709, + "learning_rate": 1.6418314923556166e-07, + "loss": 0.1924, + "step": 36317 + }, + { + "epoch": 0.9190474985449301, + "grad_norm": 8.193768501281738, + "learning_rate": 1.6408111566869333e-07, + "loss": 0.2489, + "step": 36318 + }, + { + "epoch": 0.9190728041096237, + "grad_norm": 5.6014580726623535, + "learning_rate": 1.6397911328776605e-07, + "loss": 0.1962, + "step": 36319 + }, + { + "epoch": 0.9190981096743174, + "grad_norm": 8.930194854736328, + "learning_rate": 1.6387714209343818e-07, + "loss": 0.2512, + "step": 36320 + }, + { + "epoch": 0.9191234152390111, + "grad_norm": 5.065461158752441, + "learning_rate": 1.6377520208636643e-07, + "loss": 0.1409, + "step": 36321 + }, + { + "epoch": 0.9191487208037047, + "grad_norm": 6.982222557067871, + "learning_rate": 1.636732932672086e-07, + "loss": 0.1435, + "step": 36322 + }, + { + "epoch": 0.9191740263683984, + "grad_norm": 4.42060661315918, + "learning_rate": 1.635714156366225e-07, + "loss": 0.1682, + "step": 36323 + }, + { + "epoch": 0.9191993319330921, + "grad_norm": 3.1938648223876953, + "learning_rate": 1.634695691952648e-07, + "loss": 0.1127, + "step": 36324 + }, + { + "epoch": 0.9192246374977857, + "grad_norm": 6.00982666015625, + "learning_rate": 1.6336775394379224e-07, + "loss": 0.1908, + "step": 36325 + }, + { + "epoch": 0.9192499430624794, + "grad_norm": 11.421899795532227, + "learning_rate": 1.6326596988286093e-07, + "loss": 0.2163, + "step": 36326 + }, + { + "epoch": 0.9192752486271731, + "grad_norm": 4.395956516265869, + "learning_rate": 1.6316421701312812e-07, + "loss": 0.1479, + "step": 36327 + }, + { + "epoch": 0.9193005541918667, + "grad_norm": 5.496463298797607, + "learning_rate": 1.6306249533524942e-07, + "loss": 0.1228, + "step": 36328 + }, + { + "epoch": 0.9193258597565604, + "grad_norm": 2.3033132553100586, + "learning_rate": 1.6296080484988097e-07, + "loss": 0.0631, + "step": 36329 + }, + { + "epoch": 0.9193511653212542, + "grad_norm": 9.69626235961914, + "learning_rate": 1.628591455576778e-07, + "loss": 0.2669, + "step": 36330 + }, + { + "epoch": 0.9193764708859479, + "grad_norm": 15.096550941467285, + "learning_rate": 1.6275751745929713e-07, + "loss": 0.2415, + "step": 36331 + }, + { + "epoch": 0.9194017764506415, + "grad_norm": 8.480945587158203, + "learning_rate": 1.626559205553929e-07, + "loss": 0.2043, + "step": 36332 + }, + { + "epoch": 0.9194270820153352, + "grad_norm": 9.042550086975098, + "learning_rate": 1.6255435484662185e-07, + "loss": 0.2808, + "step": 36333 + }, + { + "epoch": 0.9194523875800289, + "grad_norm": 7.465267658233643, + "learning_rate": 1.6245282033363674e-07, + "loss": 0.1719, + "step": 36334 + }, + { + "epoch": 0.9194776931447225, + "grad_norm": 3.2296884059906006, + "learning_rate": 1.6235131701709427e-07, + "loss": 0.111, + "step": 36335 + }, + { + "epoch": 0.9195029987094162, + "grad_norm": 6.085543632507324, + "learning_rate": 1.6224984489764727e-07, + "loss": 0.2059, + "step": 36336 + }, + { + "epoch": 0.9195283042741099, + "grad_norm": 3.9761781692504883, + "learning_rate": 1.6214840397595243e-07, + "loss": 0.1078, + "step": 36337 + }, + { + "epoch": 0.9195536098388035, + "grad_norm": 6.268431186676025, + "learning_rate": 1.6204699425266201e-07, + "loss": 0.1963, + "step": 36338 + }, + { + "epoch": 0.9195789154034972, + "grad_norm": 7.127255916595459, + "learning_rate": 1.619456157284305e-07, + "loss": 0.1224, + "step": 36339 + }, + { + "epoch": 0.9196042209681909, + "grad_norm": 2.316979169845581, + "learning_rate": 1.618442684039112e-07, + "loss": 0.0779, + "step": 36340 + }, + { + "epoch": 0.9196295265328845, + "grad_norm": 3.496364116668701, + "learning_rate": 1.617429522797592e-07, + "loss": 0.1663, + "step": 36341 + }, + { + "epoch": 0.9196548320975783, + "grad_norm": 8.673962593078613, + "learning_rate": 1.6164166735662678e-07, + "loss": 0.213, + "step": 36342 + }, + { + "epoch": 0.919680137662272, + "grad_norm": 4.400679588317871, + "learning_rate": 1.6154041363516726e-07, + "loss": 0.1668, + "step": 36343 + }, + { + "epoch": 0.9197054432269656, + "grad_norm": 11.102005004882812, + "learning_rate": 1.6143919111603345e-07, + "loss": 0.2279, + "step": 36344 + }, + { + "epoch": 0.9197307487916593, + "grad_norm": 7.614776611328125, + "learning_rate": 1.6133799979987818e-07, + "loss": 0.2194, + "step": 36345 + }, + { + "epoch": 0.919756054356353, + "grad_norm": 11.727404594421387, + "learning_rate": 1.6123683968735427e-07, + "loss": 0.3183, + "step": 36346 + }, + { + "epoch": 0.9197813599210466, + "grad_norm": 4.37457799911499, + "learning_rate": 1.6113571077911393e-07, + "loss": 0.1539, + "step": 36347 + }, + { + "epoch": 0.9198066654857403, + "grad_norm": 5.679252624511719, + "learning_rate": 1.6103461307580949e-07, + "loss": 0.1769, + "step": 36348 + }, + { + "epoch": 0.919831971050434, + "grad_norm": 4.911110877990723, + "learning_rate": 1.6093354657809202e-07, + "loss": 0.1367, + "step": 36349 + }, + { + "epoch": 0.9198572766151276, + "grad_norm": 4.025790691375732, + "learning_rate": 1.6083251128661437e-07, + "loss": 0.127, + "step": 36350 + }, + { + "epoch": 0.9198825821798213, + "grad_norm": 9.573577880859375, + "learning_rate": 1.6073150720202768e-07, + "loss": 0.2037, + "step": 36351 + }, + { + "epoch": 0.919907887744515, + "grad_norm": 7.017332553863525, + "learning_rate": 1.6063053432498365e-07, + "loss": 0.1756, + "step": 36352 + }, + { + "epoch": 0.9199331933092086, + "grad_norm": 3.904583692550659, + "learning_rate": 1.6052959265613234e-07, + "loss": 0.1689, + "step": 36353 + }, + { + "epoch": 0.9199584988739024, + "grad_norm": 8.182167053222656, + "learning_rate": 1.6042868219612595e-07, + "loss": 0.1421, + "step": 36354 + }, + { + "epoch": 0.9199838044385961, + "grad_norm": 16.470998764038086, + "learning_rate": 1.6032780294561456e-07, + "loss": 0.2127, + "step": 36355 + }, + { + "epoch": 0.9200091100032898, + "grad_norm": 5.7478251457214355, + "learning_rate": 1.602269549052493e-07, + "loss": 0.1585, + "step": 36356 + }, + { + "epoch": 0.9200344155679834, + "grad_norm": 2.89729642868042, + "learning_rate": 1.6012613807567968e-07, + "loss": 0.1195, + "step": 36357 + }, + { + "epoch": 0.9200597211326771, + "grad_norm": 20.611997604370117, + "learning_rate": 1.6002535245755678e-07, + "loss": 0.1566, + "step": 36358 + }, + { + "epoch": 0.9200850266973708, + "grad_norm": 6.454907417297363, + "learning_rate": 1.599245980515296e-07, + "loss": 0.1719, + "step": 36359 + }, + { + "epoch": 0.9201103322620644, + "grad_norm": 5.36272668838501, + "learning_rate": 1.5982387485824923e-07, + "loss": 0.2223, + "step": 36360 + }, + { + "epoch": 0.9201356378267581, + "grad_norm": 3.10579252243042, + "learning_rate": 1.5972318287836298e-07, + "loss": 0.1416, + "step": 36361 + }, + { + "epoch": 0.9201609433914518, + "grad_norm": 4.288428783416748, + "learning_rate": 1.596225221125225e-07, + "loss": 0.1422, + "step": 36362 + }, + { + "epoch": 0.9201862489561454, + "grad_norm": 6.2449235916137695, + "learning_rate": 1.5952189256137563e-07, + "loss": 0.1764, + "step": 36363 + }, + { + "epoch": 0.9202115545208391, + "grad_norm": 6.6511640548706055, + "learning_rate": 1.594212942255724e-07, + "loss": 0.1594, + "step": 36364 + }, + { + "epoch": 0.9202368600855328, + "grad_norm": 3.815577268600464, + "learning_rate": 1.5932072710576007e-07, + "loss": 0.1766, + "step": 36365 + }, + { + "epoch": 0.9202621656502264, + "grad_norm": 6.823818683624268, + "learning_rate": 1.5922019120258815e-07, + "loss": 0.1023, + "step": 36366 + }, + { + "epoch": 0.9202874712149202, + "grad_norm": 8.733349800109863, + "learning_rate": 1.591196865167044e-07, + "loss": 0.1661, + "step": 36367 + }, + { + "epoch": 0.9203127767796139, + "grad_norm": 5.479583263397217, + "learning_rate": 1.5901921304875835e-07, + "loss": 0.0721, + "step": 36368 + }, + { + "epoch": 0.9203380823443075, + "grad_norm": 2.990596294403076, + "learning_rate": 1.5891877079939556e-07, + "loss": 0.0931, + "step": 36369 + }, + { + "epoch": 0.9203633879090012, + "grad_norm": 3.255577325820923, + "learning_rate": 1.5881835976926608e-07, + "loss": 0.094, + "step": 36370 + }, + { + "epoch": 0.9203886934736949, + "grad_norm": 9.243651390075684, + "learning_rate": 1.587179799590155e-07, + "loss": 0.1806, + "step": 36371 + }, + { + "epoch": 0.9204139990383885, + "grad_norm": 4.282793045043945, + "learning_rate": 1.5861763136929332e-07, + "loss": 0.1258, + "step": 36372 + }, + { + "epoch": 0.9204393046030822, + "grad_norm": 7.167558670043945, + "learning_rate": 1.5851731400074511e-07, + "loss": 0.1818, + "step": 36373 + }, + { + "epoch": 0.9204646101677759, + "grad_norm": 5.877996921539307, + "learning_rate": 1.5841702785401814e-07, + "loss": 0.1194, + "step": 36374 + }, + { + "epoch": 0.9204899157324695, + "grad_norm": 18.577024459838867, + "learning_rate": 1.583167729297591e-07, + "loss": 0.3551, + "step": 36375 + }, + { + "epoch": 0.9205152212971632, + "grad_norm": 3.179781436920166, + "learning_rate": 1.5821654922861363e-07, + "loss": 0.1342, + "step": 36376 + }, + { + "epoch": 0.920540526861857, + "grad_norm": 33.50278854370117, + "learning_rate": 1.5811635675123006e-07, + "loss": 0.4201, + "step": 36377 + }, + { + "epoch": 0.9205658324265505, + "grad_norm": 6.422593593597412, + "learning_rate": 1.5801619549825342e-07, + "loss": 0.1675, + "step": 36378 + }, + { + "epoch": 0.9205911379912443, + "grad_norm": 7.764311790466309, + "learning_rate": 1.579160654703299e-07, + "loss": 0.1648, + "step": 36379 + }, + { + "epoch": 0.920616443555938, + "grad_norm": 10.524518013000488, + "learning_rate": 1.5781596666810396e-07, + "loss": 0.2401, + "step": 36380 + }, + { + "epoch": 0.9206417491206317, + "grad_norm": 5.897976875305176, + "learning_rate": 1.5771589909222284e-07, + "loss": 0.1036, + "step": 36381 + }, + { + "epoch": 0.9206670546853253, + "grad_norm": 3.4205422401428223, + "learning_rate": 1.5761586274333108e-07, + "loss": 0.1348, + "step": 36382 + }, + { + "epoch": 0.920692360250019, + "grad_norm": 4.18190336227417, + "learning_rate": 1.575158576220742e-07, + "loss": 0.1133, + "step": 36383 + }, + { + "epoch": 0.9207176658147127, + "grad_norm": 3.4270880222320557, + "learning_rate": 1.5741588372909566e-07, + "loss": 0.0952, + "step": 36384 + }, + { + "epoch": 0.9207429713794063, + "grad_norm": 8.966344833374023, + "learning_rate": 1.573159410650421e-07, + "loss": 0.1743, + "step": 36385 + }, + { + "epoch": 0.9207682769441, + "grad_norm": 3.1797149181365967, + "learning_rate": 1.572160296305575e-07, + "loss": 0.1475, + "step": 36386 + }, + { + "epoch": 0.9207935825087937, + "grad_norm": 2.5417287349700928, + "learning_rate": 1.5711614942628572e-07, + "loss": 0.1058, + "step": 36387 + }, + { + "epoch": 0.9208188880734873, + "grad_norm": 5.21601676940918, + "learning_rate": 1.570163004528702e-07, + "loss": 0.1262, + "step": 36388 + }, + { + "epoch": 0.920844193638181, + "grad_norm": 6.939866065979004, + "learning_rate": 1.569164827109565e-07, + "loss": 0.1449, + "step": 36389 + }, + { + "epoch": 0.9208694992028748, + "grad_norm": 2.8061821460723877, + "learning_rate": 1.56816696201188e-07, + "loss": 0.0804, + "step": 36390 + }, + { + "epoch": 0.9208948047675684, + "grad_norm": 2.3918747901916504, + "learning_rate": 1.5671694092420697e-07, + "loss": 0.0646, + "step": 36391 + }, + { + "epoch": 0.9209201103322621, + "grad_norm": 10.499825477600098, + "learning_rate": 1.5661721688065734e-07, + "loss": 0.2117, + "step": 36392 + }, + { + "epoch": 0.9209454158969558, + "grad_norm": 5.553675174713135, + "learning_rate": 1.5651752407118247e-07, + "loss": 0.1431, + "step": 36393 + }, + { + "epoch": 0.9209707214616494, + "grad_norm": 4.167248249053955, + "learning_rate": 1.564178624964252e-07, + "loss": 0.1568, + "step": 36394 + }, + { + "epoch": 0.9209960270263431, + "grad_norm": 4.408097267150879, + "learning_rate": 1.563182321570289e-07, + "loss": 0.136, + "step": 36395 + }, + { + "epoch": 0.9210213325910368, + "grad_norm": 3.181974411010742, + "learning_rate": 1.5621863305363415e-07, + "loss": 0.1095, + "step": 36396 + }, + { + "epoch": 0.9210466381557304, + "grad_norm": 8.189308166503906, + "learning_rate": 1.5611906518688492e-07, + "loss": 0.1839, + "step": 36397 + }, + { + "epoch": 0.9210719437204241, + "grad_norm": 5.845071315765381, + "learning_rate": 1.560195285574223e-07, + "loss": 0.1407, + "step": 36398 + }, + { + "epoch": 0.9210972492851178, + "grad_norm": 6.370555877685547, + "learning_rate": 1.5592002316589028e-07, + "loss": 0.1276, + "step": 36399 + }, + { + "epoch": 0.9211225548498114, + "grad_norm": 6.469235420227051, + "learning_rate": 1.558205490129272e-07, + "loss": 0.1571, + "step": 36400 + }, + { + "epoch": 0.9211478604145051, + "grad_norm": 3.734424352645874, + "learning_rate": 1.5572110609917756e-07, + "loss": 0.1065, + "step": 36401 + }, + { + "epoch": 0.9211731659791988, + "grad_norm": 7.369433879852295, + "learning_rate": 1.556216944252803e-07, + "loss": 0.1726, + "step": 36402 + }, + { + "epoch": 0.9211984715438924, + "grad_norm": 12.111023902893066, + "learning_rate": 1.555223139918788e-07, + "loss": 0.2197, + "step": 36403 + }, + { + "epoch": 0.9212237771085862, + "grad_norm": 4.212679862976074, + "learning_rate": 1.554229647996114e-07, + "loss": 0.1348, + "step": 36404 + }, + { + "epoch": 0.9212490826732799, + "grad_norm": 4.4019951820373535, + "learning_rate": 1.553236468491215e-07, + "loss": 0.1186, + "step": 36405 + }, + { + "epoch": 0.9212743882379736, + "grad_norm": 5.26168966293335, + "learning_rate": 1.5522436014104747e-07, + "loss": 0.1366, + "step": 36406 + }, + { + "epoch": 0.9212996938026672, + "grad_norm": 6.355850696563721, + "learning_rate": 1.5512510467602993e-07, + "loss": 0.2237, + "step": 36407 + }, + { + "epoch": 0.9213249993673609, + "grad_norm": 5.176004409790039, + "learning_rate": 1.5502588045471e-07, + "loss": 0.198, + "step": 36408 + }, + { + "epoch": 0.9213503049320546, + "grad_norm": 5.744977951049805, + "learning_rate": 1.5492668747772722e-07, + "loss": 0.1538, + "step": 36409 + }, + { + "epoch": 0.9213756104967482, + "grad_norm": 6.81850528717041, + "learning_rate": 1.5482752574572046e-07, + "loss": 0.1753, + "step": 36410 + }, + { + "epoch": 0.9214009160614419, + "grad_norm": 2.8793349266052246, + "learning_rate": 1.5472839525932982e-07, + "loss": 0.1076, + "step": 36411 + }, + { + "epoch": 0.9214262216261356, + "grad_norm": 6.800764560699463, + "learning_rate": 1.5462929601919474e-07, + "loss": 0.2533, + "step": 36412 + }, + { + "epoch": 0.9214515271908292, + "grad_norm": 7.59127950668335, + "learning_rate": 1.5453022802595362e-07, + "loss": 0.2248, + "step": 36413 + }, + { + "epoch": 0.921476832755523, + "grad_norm": 4.461881160736084, + "learning_rate": 1.5443119128024598e-07, + "loss": 0.1524, + "step": 36414 + }, + { + "epoch": 0.9215021383202167, + "grad_norm": 6.583540439605713, + "learning_rate": 1.5433218578270957e-07, + "loss": 0.1572, + "step": 36415 + }, + { + "epoch": 0.9215274438849103, + "grad_norm": 3.7020912170410156, + "learning_rate": 1.5423321153398453e-07, + "loss": 0.1594, + "step": 36416 + }, + { + "epoch": 0.921552749449604, + "grad_norm": 4.767840385437012, + "learning_rate": 1.5413426853470804e-07, + "loss": 0.1775, + "step": 36417 + }, + { + "epoch": 0.9215780550142977, + "grad_norm": 3.0128188133239746, + "learning_rate": 1.5403535678551796e-07, + "loss": 0.1055, + "step": 36418 + }, + { + "epoch": 0.9216033605789913, + "grad_norm": 3.81382417678833, + "learning_rate": 1.5393647628705154e-07, + "loss": 0.1453, + "step": 36419 + }, + { + "epoch": 0.921628666143685, + "grad_norm": 4.443301677703857, + "learning_rate": 1.5383762703994885e-07, + "loss": 0.1627, + "step": 36420 + }, + { + "epoch": 0.9216539717083787, + "grad_norm": 8.6975736618042, + "learning_rate": 1.5373880904484496e-07, + "loss": 0.2251, + "step": 36421 + }, + { + "epoch": 0.9216792772730723, + "grad_norm": 7.229187965393066, + "learning_rate": 1.5364002230237874e-07, + "loss": 0.2525, + "step": 36422 + }, + { + "epoch": 0.921704582837766, + "grad_norm": 3.546687602996826, + "learning_rate": 1.5354126681318527e-07, + "loss": 0.1345, + "step": 36423 + }, + { + "epoch": 0.9217298884024597, + "grad_norm": 4.4737443923950195, + "learning_rate": 1.5344254257790404e-07, + "loss": 0.1755, + "step": 36424 + }, + { + "epoch": 0.9217551939671533, + "grad_norm": 5.223985195159912, + "learning_rate": 1.5334384959716898e-07, + "loss": 0.1448, + "step": 36425 + }, + { + "epoch": 0.921780499531847, + "grad_norm": 3.4002723693847656, + "learning_rate": 1.5324518787161957e-07, + "loss": 0.1016, + "step": 36426 + }, + { + "epoch": 0.9218058050965408, + "grad_norm": 3.582996368408203, + "learning_rate": 1.5314655740188867e-07, + "loss": 0.1627, + "step": 36427 + }, + { + "epoch": 0.9218311106612344, + "grad_norm": 5.545085906982422, + "learning_rate": 1.5304795818861517e-07, + "loss": 0.1965, + "step": 36428 + }, + { + "epoch": 0.9218564162259281, + "grad_norm": 4.419061660766602, + "learning_rate": 1.5294939023243305e-07, + "loss": 0.1877, + "step": 36429 + }, + { + "epoch": 0.9218817217906218, + "grad_norm": 5.71834135055542, + "learning_rate": 1.5285085353397954e-07, + "loss": 0.1396, + "step": 36430 + }, + { + "epoch": 0.9219070273553155, + "grad_norm": 5.833682060241699, + "learning_rate": 1.527523480938886e-07, + "loss": 0.1309, + "step": 36431 + }, + { + "epoch": 0.9219323329200091, + "grad_norm": 4.291039943695068, + "learning_rate": 1.526538739127964e-07, + "loss": 0.1208, + "step": 36432 + }, + { + "epoch": 0.9219576384847028, + "grad_norm": 2.9119410514831543, + "learning_rate": 1.525554309913374e-07, + "loss": 0.1507, + "step": 36433 + }, + { + "epoch": 0.9219829440493965, + "grad_norm": 2.46226167678833, + "learning_rate": 1.5245701933014723e-07, + "loss": 0.0823, + "step": 36434 + }, + { + "epoch": 0.9220082496140901, + "grad_norm": 7.080073833465576, + "learning_rate": 1.5235863892985925e-07, + "loss": 0.1596, + "step": 36435 + }, + { + "epoch": 0.9220335551787838, + "grad_norm": 4.058895587921143, + "learning_rate": 1.5226028979110908e-07, + "loss": 0.1091, + "step": 36436 + }, + { + "epoch": 0.9220588607434775, + "grad_norm": 7.100313663482666, + "learning_rate": 1.5216197191453119e-07, + "loss": 0.1733, + "step": 36437 + }, + { + "epoch": 0.9220841663081711, + "grad_norm": 3.670045852661133, + "learning_rate": 1.5206368530075787e-07, + "loss": 0.1154, + "step": 36438 + }, + { + "epoch": 0.9221094718728648, + "grad_norm": 7.288546562194824, + "learning_rate": 1.5196542995042475e-07, + "loss": 0.1597, + "step": 36439 + }, + { + "epoch": 0.9221347774375586, + "grad_norm": 3.696469306945801, + "learning_rate": 1.5186720586416458e-07, + "loss": 0.1555, + "step": 36440 + }, + { + "epoch": 0.9221600830022522, + "grad_norm": 3.1502370834350586, + "learning_rate": 1.5176901304261138e-07, + "loss": 0.1183, + "step": 36441 + }, + { + "epoch": 0.9221853885669459, + "grad_norm": 6.767353057861328, + "learning_rate": 1.516708514863968e-07, + "loss": 0.1812, + "step": 36442 + }, + { + "epoch": 0.9222106941316396, + "grad_norm": 6.353091716766357, + "learning_rate": 1.5157272119615652e-07, + "loss": 0.1153, + "step": 36443 + }, + { + "epoch": 0.9222359996963332, + "grad_norm": 3.972626209259033, + "learning_rate": 1.5147462217252108e-07, + "loss": 0.1081, + "step": 36444 + }, + { + "epoch": 0.9222613052610269, + "grad_norm": 4.727380752563477, + "learning_rate": 1.5137655441612388e-07, + "loss": 0.1318, + "step": 36445 + }, + { + "epoch": 0.9222866108257206, + "grad_norm": 4.558829307556152, + "learning_rate": 1.5127851792759717e-07, + "loss": 0.054, + "step": 36446 + }, + { + "epoch": 0.9223119163904142, + "grad_norm": 8.22262954711914, + "learning_rate": 1.5118051270757383e-07, + "loss": 0.2576, + "step": 36447 + }, + { + "epoch": 0.9223372219551079, + "grad_norm": 5.05235481262207, + "learning_rate": 1.5108253875668555e-07, + "loss": 0.1554, + "step": 36448 + }, + { + "epoch": 0.9223625275198016, + "grad_norm": 6.462240219116211, + "learning_rate": 1.5098459607556404e-07, + "loss": 0.1772, + "step": 36449 + }, + { + "epoch": 0.9223878330844952, + "grad_norm": 3.018467903137207, + "learning_rate": 1.508866846648399e-07, + "loss": 0.1098, + "step": 36450 + }, + { + "epoch": 0.922413138649189, + "grad_norm": 5.330791473388672, + "learning_rate": 1.5078880452514655e-07, + "loss": 0.1319, + "step": 36451 + }, + { + "epoch": 0.9224384442138827, + "grad_norm": 3.113163709640503, + "learning_rate": 1.50690955657114e-07, + "loss": 0.0726, + "step": 36452 + }, + { + "epoch": 0.9224637497785763, + "grad_norm": 8.662521362304688, + "learning_rate": 1.5059313806137344e-07, + "loss": 0.2973, + "step": 36453 + }, + { + "epoch": 0.92248905534327, + "grad_norm": 4.726759910583496, + "learning_rate": 1.504953517385549e-07, + "loss": 0.1234, + "step": 36454 + }, + { + "epoch": 0.9225143609079637, + "grad_norm": 10.277534484863281, + "learning_rate": 1.5039759668929065e-07, + "loss": 0.1912, + "step": 36455 + }, + { + "epoch": 0.9225396664726573, + "grad_norm": 2.9017999172210693, + "learning_rate": 1.5029987291420968e-07, + "loss": 0.112, + "step": 36456 + }, + { + "epoch": 0.922564972037351, + "grad_norm": 5.5059614181518555, + "learning_rate": 1.5020218041394307e-07, + "loss": 0.1859, + "step": 36457 + }, + { + "epoch": 0.9225902776020447, + "grad_norm": 3.6101529598236084, + "learning_rate": 1.501045191891204e-07, + "loss": 0.1661, + "step": 36458 + }, + { + "epoch": 0.9226155831667384, + "grad_norm": 8.84647274017334, + "learning_rate": 1.5000688924037167e-07, + "loss": 0.2304, + "step": 36459 + }, + { + "epoch": 0.922640888731432, + "grad_norm": 5.87532901763916, + "learning_rate": 1.4990929056832582e-07, + "loss": 0.1571, + "step": 36460 + }, + { + "epoch": 0.9226661942961257, + "grad_norm": 14.086434364318848, + "learning_rate": 1.4981172317361348e-07, + "loss": 0.278, + "step": 36461 + }, + { + "epoch": 0.9226914998608194, + "grad_norm": 4.549661159515381, + "learning_rate": 1.4971418705686246e-07, + "loss": 0.1316, + "step": 36462 + }, + { + "epoch": 0.922716805425513, + "grad_norm": 6.710269451141357, + "learning_rate": 1.4961668221870284e-07, + "loss": 0.2579, + "step": 36463 + }, + { + "epoch": 0.9227421109902068, + "grad_norm": 2.795855760574341, + "learning_rate": 1.49519208659763e-07, + "loss": 0.0743, + "step": 36464 + }, + { + "epoch": 0.9227674165549005, + "grad_norm": 5.337362766265869, + "learning_rate": 1.494217663806713e-07, + "loss": 0.14, + "step": 36465 + }, + { + "epoch": 0.9227927221195941, + "grad_norm": 5.472471237182617, + "learning_rate": 1.4932435538205613e-07, + "loss": 0.1069, + "step": 36466 + }, + { + "epoch": 0.9228180276842878, + "grad_norm": 3.7718162536621094, + "learning_rate": 1.4922697566454648e-07, + "loss": 0.167, + "step": 36467 + }, + { + "epoch": 0.9228433332489815, + "grad_norm": 7.0517778396606445, + "learning_rate": 1.4912962722876955e-07, + "loss": 0.1782, + "step": 36468 + }, + { + "epoch": 0.9228686388136751, + "grad_norm": 5.114485740661621, + "learning_rate": 1.4903231007535324e-07, + "loss": 0.1108, + "step": 36469 + }, + { + "epoch": 0.9228939443783688, + "grad_norm": 8.691564559936523, + "learning_rate": 1.489350242049259e-07, + "loss": 0.134, + "step": 36470 + }, + { + "epoch": 0.9229192499430625, + "grad_norm": 1.9842101335525513, + "learning_rate": 1.4883776961811314e-07, + "loss": 0.0339, + "step": 36471 + }, + { + "epoch": 0.9229445555077561, + "grad_norm": 10.855369567871094, + "learning_rate": 1.4874054631554335e-07, + "loss": 0.1926, + "step": 36472 + }, + { + "epoch": 0.9229698610724498, + "grad_norm": 4.449813365936279, + "learning_rate": 1.4864335429784326e-07, + "loss": 0.121, + "step": 36473 + }, + { + "epoch": 0.9229951666371435, + "grad_norm": 6.641556739807129, + "learning_rate": 1.4854619356564127e-07, + "loss": 0.173, + "step": 36474 + }, + { + "epoch": 0.9230204722018371, + "grad_norm": 6.393235683441162, + "learning_rate": 1.4844906411956071e-07, + "loss": 0.1499, + "step": 36475 + }, + { + "epoch": 0.9230457777665309, + "grad_norm": 7.5776143074035645, + "learning_rate": 1.483519659602306e-07, + "loss": 0.0947, + "step": 36476 + }, + { + "epoch": 0.9230710833312246, + "grad_norm": 8.190263748168945, + "learning_rate": 1.482548990882754e-07, + "loss": 0.1272, + "step": 36477 + }, + { + "epoch": 0.9230963888959182, + "grad_norm": 5.797933578491211, + "learning_rate": 1.4815786350432294e-07, + "loss": 0.1296, + "step": 36478 + }, + { + "epoch": 0.9231216944606119, + "grad_norm": 4.188247203826904, + "learning_rate": 1.4806085920899716e-07, + "loss": 0.1861, + "step": 36479 + }, + { + "epoch": 0.9231470000253056, + "grad_norm": 4.769295692443848, + "learning_rate": 1.4796388620292424e-07, + "loss": 0.1723, + "step": 36480 + }, + { + "epoch": 0.9231723055899992, + "grad_norm": 3.839698076248169, + "learning_rate": 1.4786694448672977e-07, + "loss": 0.13, + "step": 36481 + }, + { + "epoch": 0.9231976111546929, + "grad_norm": 3.140714645385742, + "learning_rate": 1.4777003406103941e-07, + "loss": 0.0933, + "step": 36482 + }, + { + "epoch": 0.9232229167193866, + "grad_norm": 6.096192359924316, + "learning_rate": 1.4767315492647705e-07, + "loss": 0.1591, + "step": 36483 + }, + { + "epoch": 0.9232482222840803, + "grad_norm": 5.546947479248047, + "learning_rate": 1.4757630708366777e-07, + "loss": 0.1898, + "step": 36484 + }, + { + "epoch": 0.9232735278487739, + "grad_norm": 3.147026777267456, + "learning_rate": 1.4747949053323606e-07, + "loss": 0.0877, + "step": 36485 + }, + { + "epoch": 0.9232988334134676, + "grad_norm": 7.028110504150391, + "learning_rate": 1.47382705275807e-07, + "loss": 0.1598, + "step": 36486 + }, + { + "epoch": 0.9233241389781613, + "grad_norm": 2.577650308609009, + "learning_rate": 1.4728595131200453e-07, + "loss": 0.0783, + "step": 36487 + }, + { + "epoch": 0.923349444542855, + "grad_norm": 3.3717827796936035, + "learning_rate": 1.47189228642452e-07, + "loss": 0.1529, + "step": 36488 + }, + { + "epoch": 0.9233747501075487, + "grad_norm": 15.244909286499023, + "learning_rate": 1.4709253726777284e-07, + "loss": 0.1859, + "step": 36489 + }, + { + "epoch": 0.9234000556722424, + "grad_norm": 24.508541107177734, + "learning_rate": 1.4699587718859155e-07, + "loss": 0.3227, + "step": 36490 + }, + { + "epoch": 0.923425361236936, + "grad_norm": 2.8931610584259033, + "learning_rate": 1.468992484055315e-07, + "loss": 0.1115, + "step": 36491 + }, + { + "epoch": 0.9234506668016297, + "grad_norm": 4.615972995758057, + "learning_rate": 1.4680265091921553e-07, + "loss": 0.151, + "step": 36492 + }, + { + "epoch": 0.9234759723663234, + "grad_norm": 5.02287483215332, + "learning_rate": 1.4670608473026538e-07, + "loss": 0.1165, + "step": 36493 + }, + { + "epoch": 0.923501277931017, + "grad_norm": 12.286344528198242, + "learning_rate": 1.466095498393061e-07, + "loss": 0.1216, + "step": 36494 + }, + { + "epoch": 0.9235265834957107, + "grad_norm": 14.778249740600586, + "learning_rate": 1.4651304624695883e-07, + "loss": 0.1853, + "step": 36495 + }, + { + "epoch": 0.9235518890604044, + "grad_norm": 4.81071662902832, + "learning_rate": 1.4641657395384589e-07, + "loss": 0.172, + "step": 36496 + }, + { + "epoch": 0.923577194625098, + "grad_norm": 6.17301607131958, + "learning_rate": 1.4632013296058955e-07, + "loss": 0.1497, + "step": 36497 + }, + { + "epoch": 0.9236025001897917, + "grad_norm": 4.529250144958496, + "learning_rate": 1.4622372326781154e-07, + "loss": 0.1941, + "step": 36498 + }, + { + "epoch": 0.9236278057544854, + "grad_norm": 5.217350959777832, + "learning_rate": 1.4612734487613467e-07, + "loss": 0.1497, + "step": 36499 + }, + { + "epoch": 0.923653111319179, + "grad_norm": 3.84879469871521, + "learning_rate": 1.4603099778617846e-07, + "loss": 0.1385, + "step": 36500 + }, + { + "epoch": 0.9236784168838728, + "grad_norm": 6.2075514793396, + "learning_rate": 1.4593468199856687e-07, + "loss": 0.1514, + "step": 36501 + }, + { + "epoch": 0.9237037224485665, + "grad_norm": 7.5534491539001465, + "learning_rate": 1.4583839751391881e-07, + "loss": 0.2264, + "step": 36502 + }, + { + "epoch": 0.9237290280132601, + "grad_norm": 7.6516032218933105, + "learning_rate": 1.4574214433285605e-07, + "loss": 0.2183, + "step": 36503 + }, + { + "epoch": 0.9237543335779538, + "grad_norm": 4.271608352661133, + "learning_rate": 1.456459224559986e-07, + "loss": 0.1204, + "step": 36504 + }, + { + "epoch": 0.9237796391426475, + "grad_norm": 2.3215906620025635, + "learning_rate": 1.4554973188396936e-07, + "loss": 0.0562, + "step": 36505 + }, + { + "epoch": 0.9238049447073411, + "grad_norm": 3.846010208129883, + "learning_rate": 1.4545357261738557e-07, + "loss": 0.1562, + "step": 36506 + }, + { + "epoch": 0.9238302502720348, + "grad_norm": 2.978315591812134, + "learning_rate": 1.4535744465686896e-07, + "loss": 0.1032, + "step": 36507 + }, + { + "epoch": 0.9238555558367285, + "grad_norm": 5.9662885665893555, + "learning_rate": 1.4526134800303848e-07, + "loss": 0.1266, + "step": 36508 + }, + { + "epoch": 0.9238808614014222, + "grad_norm": 4.760798931121826, + "learning_rate": 1.4516528265651587e-07, + "loss": 0.1606, + "step": 36509 + }, + { + "epoch": 0.9239061669661158, + "grad_norm": 3.6737208366394043, + "learning_rate": 1.4506924861791838e-07, + "loss": 0.131, + "step": 36510 + }, + { + "epoch": 0.9239314725308095, + "grad_norm": 6.43068790435791, + "learning_rate": 1.4497324588786666e-07, + "loss": 0.1568, + "step": 36511 + }, + { + "epoch": 0.9239567780955033, + "grad_norm": 6.279842376708984, + "learning_rate": 1.4487727446697852e-07, + "loss": 0.1279, + "step": 36512 + }, + { + "epoch": 0.9239820836601969, + "grad_norm": 3.4546780586242676, + "learning_rate": 1.447813343558746e-07, + "loss": 0.1288, + "step": 36513 + }, + { + "epoch": 0.9240073892248906, + "grad_norm": 8.349380493164062, + "learning_rate": 1.446854255551722e-07, + "loss": 0.1688, + "step": 36514 + }, + { + "epoch": 0.9240326947895843, + "grad_norm": 3.6807613372802734, + "learning_rate": 1.4458954806549074e-07, + "loss": 0.0926, + "step": 36515 + }, + { + "epoch": 0.9240580003542779, + "grad_norm": 7.816120624542236, + "learning_rate": 1.444937018874476e-07, + "loss": 0.2336, + "step": 36516 + }, + { + "epoch": 0.9240833059189716, + "grad_norm": 3.5492167472839355, + "learning_rate": 1.4439788702166168e-07, + "loss": 0.0863, + "step": 36517 + }, + { + "epoch": 0.9241086114836653, + "grad_norm": 4.3288187980651855, + "learning_rate": 1.4430210346875083e-07, + "loss": 0.171, + "step": 36518 + }, + { + "epoch": 0.9241339170483589, + "grad_norm": 2.9379217624664307, + "learning_rate": 1.4420635122933236e-07, + "loss": 0.1122, + "step": 36519 + }, + { + "epoch": 0.9241592226130526, + "grad_norm": 5.938360214233398, + "learning_rate": 1.441106303040235e-07, + "loss": 0.1713, + "step": 36520 + }, + { + "epoch": 0.9241845281777463, + "grad_norm": 8.847565650939941, + "learning_rate": 1.440149406934427e-07, + "loss": 0.221, + "step": 36521 + }, + { + "epoch": 0.9242098337424399, + "grad_norm": 5.7040815353393555, + "learning_rate": 1.439192823982055e-07, + "loss": 0.146, + "step": 36522 + }, + { + "epoch": 0.9242351393071336, + "grad_norm": 5.6774115562438965, + "learning_rate": 1.438236554189304e-07, + "loss": 0.1381, + "step": 36523 + }, + { + "epoch": 0.9242604448718273, + "grad_norm": 7.692997455596924, + "learning_rate": 1.4372805975623184e-07, + "loss": 0.1366, + "step": 36524 + }, + { + "epoch": 0.924285750436521, + "grad_norm": 3.0767033100128174, + "learning_rate": 1.4363249541072876e-07, + "loss": 0.0528, + "step": 36525 + }, + { + "epoch": 0.9243110560012147, + "grad_norm": 5.391119956970215, + "learning_rate": 1.4353696238303628e-07, + "loss": 0.2292, + "step": 36526 + }, + { + "epoch": 0.9243363615659084, + "grad_norm": 5.769766807556152, + "learning_rate": 1.4344146067377053e-07, + "loss": 0.1896, + "step": 36527 + }, + { + "epoch": 0.924361667130602, + "grad_norm": 2.8359827995300293, + "learning_rate": 1.4334599028354768e-07, + "loss": 0.0772, + "step": 36528 + }, + { + "epoch": 0.9243869726952957, + "grad_norm": 9.41512393951416, + "learning_rate": 1.4325055121298225e-07, + "loss": 0.1629, + "step": 36529 + }, + { + "epoch": 0.9244122782599894, + "grad_norm": 8.223554611206055, + "learning_rate": 1.4315514346269098e-07, + "loss": 0.1686, + "step": 36530 + }, + { + "epoch": 0.924437583824683, + "grad_norm": 5.995806694030762, + "learning_rate": 1.4305976703328894e-07, + "loss": 0.1464, + "step": 36531 + }, + { + "epoch": 0.9244628893893767, + "grad_norm": 8.48144245147705, + "learning_rate": 1.4296442192539117e-07, + "loss": 0.2129, + "step": 36532 + }, + { + "epoch": 0.9244881949540704, + "grad_norm": 3.8188798427581787, + "learning_rate": 1.4286910813961162e-07, + "loss": 0.1138, + "step": 36533 + }, + { + "epoch": 0.9245135005187641, + "grad_norm": 8.779336929321289, + "learning_rate": 1.4277382567656651e-07, + "loss": 0.2278, + "step": 36534 + }, + { + "epoch": 0.9245388060834577, + "grad_norm": 10.898545265197754, + "learning_rate": 1.426785745368686e-07, + "loss": 0.1557, + "step": 36535 + }, + { + "epoch": 0.9245641116481514, + "grad_norm": 25.77806282043457, + "learning_rate": 1.4258335472113417e-07, + "loss": 0.1982, + "step": 36536 + }, + { + "epoch": 0.9245894172128452, + "grad_norm": 4.151055335998535, + "learning_rate": 1.4248816622997542e-07, + "loss": 0.1074, + "step": 36537 + }, + { + "epoch": 0.9246147227775388, + "grad_norm": 5.953478813171387, + "learning_rate": 1.423930090640069e-07, + "loss": 0.1109, + "step": 36538 + }, + { + "epoch": 0.9246400283422325, + "grad_norm": 5.344935417175293, + "learning_rate": 1.4229788322384198e-07, + "loss": 0.1758, + "step": 36539 + }, + { + "epoch": 0.9246653339069262, + "grad_norm": 4.560805320739746, + "learning_rate": 1.4220278871009574e-07, + "loss": 0.1267, + "step": 36540 + }, + { + "epoch": 0.9246906394716198, + "grad_norm": 2.415064573287964, + "learning_rate": 1.4210772552337883e-07, + "loss": 0.0933, + "step": 36541 + }, + { + "epoch": 0.9247159450363135, + "grad_norm": 2.8388237953186035, + "learning_rate": 1.420126936643057e-07, + "loss": 0.1055, + "step": 36542 + }, + { + "epoch": 0.9247412506010072, + "grad_norm": 4.437760353088379, + "learning_rate": 1.4191769313348925e-07, + "loss": 0.1292, + "step": 36543 + }, + { + "epoch": 0.9247665561657008, + "grad_norm": 7.252880096435547, + "learning_rate": 1.4182272393154173e-07, + "loss": 0.1619, + "step": 36544 + }, + { + "epoch": 0.9247918617303945, + "grad_norm": 7.43529748916626, + "learning_rate": 1.4172778605907655e-07, + "loss": 0.1989, + "step": 36545 + }, + { + "epoch": 0.9248171672950882, + "grad_norm": 4.664463043212891, + "learning_rate": 1.4163287951670436e-07, + "loss": 0.1899, + "step": 36546 + }, + { + "epoch": 0.9248424728597818, + "grad_norm": 3.7808728218078613, + "learning_rate": 1.4153800430503795e-07, + "loss": 0.1132, + "step": 36547 + }, + { + "epoch": 0.9248677784244755, + "grad_norm": 6.3451409339904785, + "learning_rate": 1.4144316042468964e-07, + "loss": 0.1806, + "step": 36548 + }, + { + "epoch": 0.9248930839891693, + "grad_norm": 7.390312671661377, + "learning_rate": 1.413483478762706e-07, + "loss": 0.186, + "step": 36549 + }, + { + "epoch": 0.9249183895538629, + "grad_norm": 14.740375518798828, + "learning_rate": 1.412535666603926e-07, + "loss": 0.1469, + "step": 36550 + }, + { + "epoch": 0.9249436951185566, + "grad_norm": 2.6872611045837402, + "learning_rate": 1.4115881677766563e-07, + "loss": 0.1155, + "step": 36551 + }, + { + "epoch": 0.9249690006832503, + "grad_norm": 3.248289108276367, + "learning_rate": 1.4106409822870203e-07, + "loss": 0.0957, + "step": 36552 + }, + { + "epoch": 0.9249943062479439, + "grad_norm": 4.035518169403076, + "learning_rate": 1.4096941101411298e-07, + "loss": 0.1271, + "step": 36553 + }, + { + "epoch": 0.9250196118126376, + "grad_norm": 3.350208282470703, + "learning_rate": 1.4087475513450744e-07, + "loss": 0.1147, + "step": 36554 + }, + { + "epoch": 0.9250449173773313, + "grad_norm": 6.440815448760986, + "learning_rate": 1.4078013059049712e-07, + "loss": 0.2026, + "step": 36555 + }, + { + "epoch": 0.9250702229420249, + "grad_norm": 5.617153644561768, + "learning_rate": 1.4068553738269153e-07, + "loss": 0.1085, + "step": 36556 + }, + { + "epoch": 0.9250955285067186, + "grad_norm": 4.413753986358643, + "learning_rate": 1.4059097551170132e-07, + "loss": 0.083, + "step": 36557 + }, + { + "epoch": 0.9251208340714123, + "grad_norm": 4.63779354095459, + "learning_rate": 1.4049644497813598e-07, + "loss": 0.1617, + "step": 36558 + }, + { + "epoch": 0.925146139636106, + "grad_norm": 2.7116239070892334, + "learning_rate": 1.4040194578260558e-07, + "loss": 0.0962, + "step": 36559 + }, + { + "epoch": 0.9251714452007996, + "grad_norm": 9.13560962677002, + "learning_rate": 1.4030747792571796e-07, + "loss": 0.2217, + "step": 36560 + }, + { + "epoch": 0.9251967507654933, + "grad_norm": 3.018686532974243, + "learning_rate": 1.4021304140808433e-07, + "loss": 0.0932, + "step": 36561 + }, + { + "epoch": 0.9252220563301871, + "grad_norm": 4.869842529296875, + "learning_rate": 1.401186362303125e-07, + "loss": 0.1368, + "step": 36562 + }, + { + "epoch": 0.9252473618948807, + "grad_norm": 5.257539749145508, + "learning_rate": 1.400242623930115e-07, + "loss": 0.1807, + "step": 36563 + }, + { + "epoch": 0.9252726674595744, + "grad_norm": 13.606873512268066, + "learning_rate": 1.3992991989679016e-07, + "loss": 0.2168, + "step": 36564 + }, + { + "epoch": 0.9252979730242681, + "grad_norm": 5.5806379318237305, + "learning_rate": 1.3983560874225644e-07, + "loss": 0.1701, + "step": 36565 + }, + { + "epoch": 0.9253232785889617, + "grad_norm": 6.243022918701172, + "learning_rate": 1.3974132893001868e-07, + "loss": 0.2073, + "step": 36566 + }, + { + "epoch": 0.9253485841536554, + "grad_norm": 4.286108493804932, + "learning_rate": 1.3964708046068587e-07, + "loss": 0.1491, + "step": 36567 + }, + { + "epoch": 0.9253738897183491, + "grad_norm": 4.66654109954834, + "learning_rate": 1.3955286333486416e-07, + "loss": 0.1668, + "step": 36568 + }, + { + "epoch": 0.9253991952830427, + "grad_norm": 5.443683624267578, + "learning_rate": 1.3945867755316257e-07, + "loss": 0.1618, + "step": 36569 + }, + { + "epoch": 0.9254245008477364, + "grad_norm": 5.709414005279541, + "learning_rate": 1.393645231161872e-07, + "loss": 0.189, + "step": 36570 + }, + { + "epoch": 0.9254498064124301, + "grad_norm": 3.642642021179199, + "learning_rate": 1.392704000245465e-07, + "loss": 0.1077, + "step": 36571 + }, + { + "epoch": 0.9254751119771237, + "grad_norm": 4.50031852722168, + "learning_rate": 1.3917630827884609e-07, + "loss": 0.2061, + "step": 36572 + }, + { + "epoch": 0.9255004175418174, + "grad_norm": 12.773468971252441, + "learning_rate": 1.3908224787969382e-07, + "loss": 0.248, + "step": 36573 + }, + { + "epoch": 0.9255257231065112, + "grad_norm": 3.2725470066070557, + "learning_rate": 1.3898821882769587e-07, + "loss": 0.1217, + "step": 36574 + }, + { + "epoch": 0.9255510286712048, + "grad_norm": 8.59449291229248, + "learning_rate": 1.3889422112345895e-07, + "loss": 0.236, + "step": 36575 + }, + { + "epoch": 0.9255763342358985, + "grad_norm": 4.078223705291748, + "learning_rate": 1.3880025476758874e-07, + "loss": 0.0828, + "step": 36576 + }, + { + "epoch": 0.9256016398005922, + "grad_norm": 3.797863483428955, + "learning_rate": 1.3870631976069193e-07, + "loss": 0.1026, + "step": 36577 + }, + { + "epoch": 0.9256269453652858, + "grad_norm": 3.548032522201538, + "learning_rate": 1.3861241610337306e-07, + "loss": 0.1893, + "step": 36578 + }, + { + "epoch": 0.9256522509299795, + "grad_norm": 8.355337142944336, + "learning_rate": 1.3851854379623885e-07, + "loss": 0.1855, + "step": 36579 + }, + { + "epoch": 0.9256775564946732, + "grad_norm": 6.697329044342041, + "learning_rate": 1.3842470283989384e-07, + "loss": 0.2354, + "step": 36580 + }, + { + "epoch": 0.9257028620593668, + "grad_norm": 5.323194980621338, + "learning_rate": 1.383308932349442e-07, + "loss": 0.1332, + "step": 36581 + }, + { + "epoch": 0.9257281676240605, + "grad_norm": 4.16990327835083, + "learning_rate": 1.3823711498199387e-07, + "loss": 0.1048, + "step": 36582 + }, + { + "epoch": 0.9257534731887542, + "grad_norm": 3.6285927295684814, + "learning_rate": 1.3814336808164797e-07, + "loss": 0.1375, + "step": 36583 + }, + { + "epoch": 0.9257787787534478, + "grad_norm": 3.3729488849639893, + "learning_rate": 1.3804965253451152e-07, + "loss": 0.1363, + "step": 36584 + }, + { + "epoch": 0.9258040843181415, + "grad_norm": 3.703645706176758, + "learning_rate": 1.379559683411885e-07, + "loss": 0.1039, + "step": 36585 + }, + { + "epoch": 0.9258293898828353, + "grad_norm": 3.411363363265991, + "learning_rate": 1.378623155022829e-07, + "loss": 0.1264, + "step": 36586 + }, + { + "epoch": 0.925854695447529, + "grad_norm": 4.6159563064575195, + "learning_rate": 1.377686940183981e-07, + "loss": 0.1751, + "step": 36587 + }, + { + "epoch": 0.9258800010122226, + "grad_norm": 4.880702972412109, + "learning_rate": 1.3767510389013917e-07, + "loss": 0.1448, + "step": 36588 + }, + { + "epoch": 0.9259053065769163, + "grad_norm": 4.253951549530029, + "learning_rate": 1.3758154511810896e-07, + "loss": 0.0948, + "step": 36589 + }, + { + "epoch": 0.92593061214161, + "grad_norm": 19.890642166137695, + "learning_rate": 1.3748801770291143e-07, + "loss": 0.1934, + "step": 36590 + }, + { + "epoch": 0.9259559177063036, + "grad_norm": 8.597095489501953, + "learning_rate": 1.3739452164514778e-07, + "loss": 0.2214, + "step": 36591 + }, + { + "epoch": 0.9259812232709973, + "grad_norm": 5.136999607086182, + "learning_rate": 1.3730105694542362e-07, + "loss": 0.1589, + "step": 36592 + }, + { + "epoch": 0.926006528835691, + "grad_norm": 4.710944175720215, + "learning_rate": 1.3720762360434014e-07, + "loss": 0.1849, + "step": 36593 + }, + { + "epoch": 0.9260318344003846, + "grad_norm": 4.815024375915527, + "learning_rate": 1.371142216225002e-07, + "loss": 0.1299, + "step": 36594 + }, + { + "epoch": 0.9260571399650783, + "grad_norm": 13.43857192993164, + "learning_rate": 1.3702085100050555e-07, + "loss": 0.1976, + "step": 36595 + }, + { + "epoch": 0.926082445529772, + "grad_norm": 6.62998104095459, + "learning_rate": 1.3692751173895903e-07, + "loss": 0.1258, + "step": 36596 + }, + { + "epoch": 0.9261077510944656, + "grad_norm": 4.370581150054932, + "learning_rate": 1.3683420383846236e-07, + "loss": 0.1155, + "step": 36597 + }, + { + "epoch": 0.9261330566591593, + "grad_norm": 26.02253532409668, + "learning_rate": 1.3674092729961786e-07, + "loss": 0.4426, + "step": 36598 + }, + { + "epoch": 0.9261583622238531, + "grad_norm": 6.5391926765441895, + "learning_rate": 1.3664768212302558e-07, + "loss": 0.1539, + "step": 36599 + }, + { + "epoch": 0.9261836677885467, + "grad_norm": 3.5738730430603027, + "learning_rate": 1.365544683092884e-07, + "loss": 0.1524, + "step": 36600 + }, + { + "epoch": 0.9262089733532404, + "grad_norm": 6.468020915985107, + "learning_rate": 1.3646128585900641e-07, + "loss": 0.1011, + "step": 36601 + }, + { + "epoch": 0.9262342789179341, + "grad_norm": 2.385838747024536, + "learning_rate": 1.3636813477278188e-07, + "loss": 0.0891, + "step": 36602 + }, + { + "epoch": 0.9262595844826277, + "grad_norm": 4.5545334815979, + "learning_rate": 1.3627501505121322e-07, + "loss": 0.194, + "step": 36603 + }, + { + "epoch": 0.9262848900473214, + "grad_norm": 6.7006964683532715, + "learning_rate": 1.3618192669490325e-07, + "loss": 0.1099, + "step": 36604 + }, + { + "epoch": 0.9263101956120151, + "grad_norm": 3.8359947204589844, + "learning_rate": 1.3608886970445046e-07, + "loss": 0.1602, + "step": 36605 + }, + { + "epoch": 0.9263355011767087, + "grad_norm": 8.357687950134277, + "learning_rate": 1.3599584408045708e-07, + "loss": 0.1669, + "step": 36606 + }, + { + "epoch": 0.9263608067414024, + "grad_norm": 13.212384223937988, + "learning_rate": 1.3590284982352043e-07, + "loss": 0.2964, + "step": 36607 + }, + { + "epoch": 0.9263861123060961, + "grad_norm": 3.9624524116516113, + "learning_rate": 1.358098869342417e-07, + "loss": 0.103, + "step": 36608 + }, + { + "epoch": 0.9264114178707897, + "grad_norm": 5.472667217254639, + "learning_rate": 1.3571695541322038e-07, + "loss": 0.1605, + "step": 36609 + }, + { + "epoch": 0.9264367234354834, + "grad_norm": 8.028595924377441, + "learning_rate": 1.3562405526105549e-07, + "loss": 0.2068, + "step": 36610 + }, + { + "epoch": 0.9264620290001772, + "grad_norm": 3.4280014038085938, + "learning_rate": 1.3553118647834652e-07, + "loss": 0.1027, + "step": 36611 + }, + { + "epoch": 0.9264873345648709, + "grad_norm": 4.2998552322387695, + "learning_rate": 1.3543834906569243e-07, + "loss": 0.1654, + "step": 36612 + }, + { + "epoch": 0.9265126401295645, + "grad_norm": 12.6254243850708, + "learning_rate": 1.3534554302369107e-07, + "loss": 0.3251, + "step": 36613 + }, + { + "epoch": 0.9265379456942582, + "grad_norm": 2.1781699657440186, + "learning_rate": 1.3525276835294087e-07, + "loss": 0.1133, + "step": 36614 + }, + { + "epoch": 0.9265632512589519, + "grad_norm": 4.611076831817627, + "learning_rate": 1.3516002505404135e-07, + "loss": 0.1297, + "step": 36615 + }, + { + "epoch": 0.9265885568236455, + "grad_norm": 8.912151336669922, + "learning_rate": 1.3506731312758982e-07, + "loss": 0.1399, + "step": 36616 + }, + { + "epoch": 0.9266138623883392, + "grad_norm": 6.468831539154053, + "learning_rate": 1.3497463257418465e-07, + "loss": 0.0864, + "step": 36617 + }, + { + "epoch": 0.9266391679530329, + "grad_norm": 6.911097526550293, + "learning_rate": 1.3488198339442205e-07, + "loss": 0.1843, + "step": 36618 + }, + { + "epoch": 0.9266644735177265, + "grad_norm": 5.922338008880615, + "learning_rate": 1.3478936558890098e-07, + "loss": 0.1696, + "step": 36619 + }, + { + "epoch": 0.9266897790824202, + "grad_norm": 5.184762001037598, + "learning_rate": 1.3469677915821876e-07, + "loss": 0.1262, + "step": 36620 + }, + { + "epoch": 0.9267150846471139, + "grad_norm": 7.821054935455322, + "learning_rate": 1.3460422410297157e-07, + "loss": 0.1486, + "step": 36621 + }, + { + "epoch": 0.9267403902118075, + "grad_norm": 2.9657230377197266, + "learning_rate": 1.3451170042375672e-07, + "loss": 0.1137, + "step": 36622 + }, + { + "epoch": 0.9267656957765013, + "grad_norm": 4.169596195220947, + "learning_rate": 1.3441920812117094e-07, + "loss": 0.1816, + "step": 36623 + }, + { + "epoch": 0.926791001341195, + "grad_norm": 7.079051971435547, + "learning_rate": 1.3432674719581095e-07, + "loss": 0.2169, + "step": 36624 + }, + { + "epoch": 0.9268163069058886, + "grad_norm": 4.636221885681152, + "learning_rate": 1.3423431764827243e-07, + "loss": 0.1483, + "step": 36625 + }, + { + "epoch": 0.9268416124705823, + "grad_norm": 6.628503799438477, + "learning_rate": 1.34141919479151e-07, + "loss": 0.1823, + "step": 36626 + }, + { + "epoch": 0.926866918035276, + "grad_norm": 3.7230982780456543, + "learning_rate": 1.3404955268904397e-07, + "loss": 0.2033, + "step": 36627 + }, + { + "epoch": 0.9268922235999696, + "grad_norm": 8.587998390197754, + "learning_rate": 1.3395721727854584e-07, + "loss": 0.2386, + "step": 36628 + }, + { + "epoch": 0.9269175291646633, + "grad_norm": 4.2170939445495605, + "learning_rate": 1.3386491324825336e-07, + "loss": 0.1552, + "step": 36629 + }, + { + "epoch": 0.926942834729357, + "grad_norm": 12.69224739074707, + "learning_rate": 1.3377264059875995e-07, + "loss": 0.2385, + "step": 36630 + }, + { + "epoch": 0.9269681402940506, + "grad_norm": 10.422758102416992, + "learning_rate": 1.336803993306618e-07, + "loss": 0.2851, + "step": 36631 + }, + { + "epoch": 0.9269934458587443, + "grad_norm": 5.416399002075195, + "learning_rate": 1.3358818944455343e-07, + "loss": 0.1188, + "step": 36632 + }, + { + "epoch": 0.927018751423438, + "grad_norm": 6.911777496337891, + "learning_rate": 1.3349601094103047e-07, + "loss": 0.2132, + "step": 36633 + }, + { + "epoch": 0.9270440569881316, + "grad_norm": 6.802029132843018, + "learning_rate": 1.3340386382068527e-07, + "loss": 0.1535, + "step": 36634 + }, + { + "epoch": 0.9270693625528253, + "grad_norm": 3.3584866523742676, + "learning_rate": 1.333117480841145e-07, + "loss": 0.069, + "step": 36635 + }, + { + "epoch": 0.9270946681175191, + "grad_norm": 5.718977928161621, + "learning_rate": 1.3321966373190997e-07, + "loss": 0.1128, + "step": 36636 + }, + { + "epoch": 0.9271199736822128, + "grad_norm": 11.129425048828125, + "learning_rate": 1.331276107646673e-07, + "loss": 0.2256, + "step": 36637 + }, + { + "epoch": 0.9271452792469064, + "grad_norm": 7.4228291511535645, + "learning_rate": 1.3303558918297878e-07, + "loss": 0.1423, + "step": 36638 + }, + { + "epoch": 0.9271705848116001, + "grad_norm": 5.06007719039917, + "learning_rate": 1.3294359898743837e-07, + "loss": 0.0818, + "step": 36639 + }, + { + "epoch": 0.9271958903762938, + "grad_norm": 4.075861930847168, + "learning_rate": 1.3285164017864006e-07, + "loss": 0.1338, + "step": 36640 + }, + { + "epoch": 0.9272211959409874, + "grad_norm": 3.5172226428985596, + "learning_rate": 1.3275971275717503e-07, + "loss": 0.1406, + "step": 36641 + }, + { + "epoch": 0.9272465015056811, + "grad_norm": 20.223251342773438, + "learning_rate": 1.3266781672363838e-07, + "loss": 0.1622, + "step": 36642 + }, + { + "epoch": 0.9272718070703748, + "grad_norm": 3.649785041809082, + "learning_rate": 1.3257595207862127e-07, + "loss": 0.1566, + "step": 36643 + }, + { + "epoch": 0.9272971126350684, + "grad_norm": 4.664155006408691, + "learning_rate": 1.3248411882271605e-07, + "loss": 0.1494, + "step": 36644 + }, + { + "epoch": 0.9273224181997621, + "grad_norm": 7.066163539886475, + "learning_rate": 1.3239231695651555e-07, + "loss": 0.2535, + "step": 36645 + }, + { + "epoch": 0.9273477237644558, + "grad_norm": 7.953025817871094, + "learning_rate": 1.3230054648061153e-07, + "loss": 0.2125, + "step": 36646 + }, + { + "epoch": 0.9273730293291494, + "grad_norm": 10.737834930419922, + "learning_rate": 1.3220880739559626e-07, + "loss": 0.2888, + "step": 36647 + }, + { + "epoch": 0.9273983348938432, + "grad_norm": 7.85424280166626, + "learning_rate": 1.3211709970206043e-07, + "loss": 0.2183, + "step": 36648 + }, + { + "epoch": 0.9274236404585369, + "grad_norm": 3.5878210067749023, + "learning_rate": 1.3202542340059575e-07, + "loss": 0.1328, + "step": 36649 + }, + { + "epoch": 0.9274489460232305, + "grad_norm": 9.594837188720703, + "learning_rate": 1.31933778491794e-07, + "loss": 0.194, + "step": 36650 + }, + { + "epoch": 0.9274742515879242, + "grad_norm": 6.893601894378662, + "learning_rate": 1.318421649762458e-07, + "loss": 0.2095, + "step": 36651 + }, + { + "epoch": 0.9274995571526179, + "grad_norm": 4.595518589019775, + "learning_rate": 1.3175058285454178e-07, + "loss": 0.1119, + "step": 36652 + }, + { + "epoch": 0.9275248627173115, + "grad_norm": 7.867453098297119, + "learning_rate": 1.3165903212727206e-07, + "loss": 0.2363, + "step": 36653 + }, + { + "epoch": 0.9275501682820052, + "grad_norm": 4.398746490478516, + "learning_rate": 1.315675127950289e-07, + "loss": 0.1341, + "step": 36654 + }, + { + "epoch": 0.9275754738466989, + "grad_norm": 3.3305585384368896, + "learning_rate": 1.3147602485840072e-07, + "loss": 0.1266, + "step": 36655 + }, + { + "epoch": 0.9276007794113925, + "grad_norm": 20.037981033325195, + "learning_rate": 1.3138456831797763e-07, + "loss": 0.2537, + "step": 36656 + }, + { + "epoch": 0.9276260849760862, + "grad_norm": 3.9168224334716797, + "learning_rate": 1.3129314317435027e-07, + "loss": 0.1189, + "step": 36657 + }, + { + "epoch": 0.9276513905407799, + "grad_norm": 7.530641078948975, + "learning_rate": 1.312017494281076e-07, + "loss": 0.2127, + "step": 36658 + }, + { + "epoch": 0.9276766961054735, + "grad_norm": 8.435304641723633, + "learning_rate": 1.3111038707983914e-07, + "loss": 0.1712, + "step": 36659 + }, + { + "epoch": 0.9277020016701673, + "grad_norm": 4.689388275146484, + "learning_rate": 1.3101905613013444e-07, + "loss": 0.1687, + "step": 36660 + }, + { + "epoch": 0.927727307234861, + "grad_norm": 3.9208827018737793, + "learning_rate": 1.309277565795819e-07, + "loss": 0.095, + "step": 36661 + }, + { + "epoch": 0.9277526127995547, + "grad_norm": 5.3115668296813965, + "learning_rate": 1.308364884287705e-07, + "loss": 0.1573, + "step": 36662 + }, + { + "epoch": 0.9277779183642483, + "grad_norm": 4.402132987976074, + "learning_rate": 1.3074525167828867e-07, + "loss": 0.1555, + "step": 36663 + }, + { + "epoch": 0.927803223928942, + "grad_norm": 2.472590684890747, + "learning_rate": 1.3065404632872592e-07, + "loss": 0.1313, + "step": 36664 + }, + { + "epoch": 0.9278285294936357, + "grad_norm": 5.876377105712891, + "learning_rate": 1.305628723806679e-07, + "loss": 0.1333, + "step": 36665 + }, + { + "epoch": 0.9278538350583293, + "grad_norm": 4.379269123077393, + "learning_rate": 1.3047172983470523e-07, + "loss": 0.1376, + "step": 36666 + }, + { + "epoch": 0.927879140623023, + "grad_norm": 2.507209062576294, + "learning_rate": 1.303806186914247e-07, + "loss": 0.103, + "step": 36667 + }, + { + "epoch": 0.9279044461877167, + "grad_norm": 4.454846382141113, + "learning_rate": 1.3028953895141306e-07, + "loss": 0.1334, + "step": 36668 + }, + { + "epoch": 0.9279297517524103, + "grad_norm": 4.858768463134766, + "learning_rate": 1.3019849061525868e-07, + "loss": 0.1262, + "step": 36669 + }, + { + "epoch": 0.927955057317104, + "grad_norm": 5.592861652374268, + "learning_rate": 1.3010747368354837e-07, + "loss": 0.1465, + "step": 36670 + }, + { + "epoch": 0.9279803628817977, + "grad_norm": 3.0082666873931885, + "learning_rate": 1.300164881568694e-07, + "loss": 0.1006, + "step": 36671 + }, + { + "epoch": 0.9280056684464913, + "grad_norm": 6.766413688659668, + "learning_rate": 1.2992553403580744e-07, + "loss": 0.1048, + "step": 36672 + }, + { + "epoch": 0.9280309740111851, + "grad_norm": 3.3649260997772217, + "learning_rate": 1.2983461132095033e-07, + "loss": 0.1224, + "step": 36673 + }, + { + "epoch": 0.9280562795758788, + "grad_norm": 7.495610237121582, + "learning_rate": 1.297437200128837e-07, + "loss": 0.175, + "step": 36674 + }, + { + "epoch": 0.9280815851405724, + "grad_norm": 4.529567241668701, + "learning_rate": 1.2965286011219436e-07, + "loss": 0.1022, + "step": 36675 + }, + { + "epoch": 0.9281068907052661, + "grad_norm": 4.736093521118164, + "learning_rate": 1.2956203161946679e-07, + "loss": 0.1674, + "step": 36676 + }, + { + "epoch": 0.9281321962699598, + "grad_norm": 5.794295787811279, + "learning_rate": 1.2947123453528886e-07, + "loss": 0.1208, + "step": 36677 + }, + { + "epoch": 0.9281575018346534, + "grad_norm": 6.136066436767578, + "learning_rate": 1.2938046886024403e-07, + "loss": 0.1945, + "step": 36678 + }, + { + "epoch": 0.9281828073993471, + "grad_norm": 3.3491127490997314, + "learning_rate": 1.2928973459491955e-07, + "loss": 0.0784, + "step": 36679 + }, + { + "epoch": 0.9282081129640408, + "grad_norm": 1.1015634536743164, + "learning_rate": 1.2919903173989834e-07, + "loss": 0.0257, + "step": 36680 + }, + { + "epoch": 0.9282334185287344, + "grad_norm": 4.166835784912109, + "learning_rate": 1.2910836029576713e-07, + "loss": 0.0834, + "step": 36681 + }, + { + "epoch": 0.9282587240934281, + "grad_norm": 9.32676887512207, + "learning_rate": 1.2901772026311044e-07, + "loss": 0.2646, + "step": 36682 + }, + { + "epoch": 0.9282840296581218, + "grad_norm": 6.1926493644714355, + "learning_rate": 1.2892711164251227e-07, + "loss": 0.1359, + "step": 36683 + }, + { + "epoch": 0.9283093352228154, + "grad_norm": 2.6748480796813965, + "learning_rate": 1.2883653443455602e-07, + "loss": 0.0865, + "step": 36684 + }, + { + "epoch": 0.9283346407875092, + "grad_norm": 10.287498474121094, + "learning_rate": 1.287459886398279e-07, + "loss": 0.2725, + "step": 36685 + }, + { + "epoch": 0.9283599463522029, + "grad_norm": 6.685785293579102, + "learning_rate": 1.286554742589108e-07, + "loss": 0.2366, + "step": 36686 + }, + { + "epoch": 0.9283852519168965, + "grad_norm": 3.1964786052703857, + "learning_rate": 1.2856499129238808e-07, + "loss": 0.1124, + "step": 36687 + }, + { + "epoch": 0.9284105574815902, + "grad_norm": 4.895288467407227, + "learning_rate": 1.2847453974084322e-07, + "loss": 0.1273, + "step": 36688 + }, + { + "epoch": 0.9284358630462839, + "grad_norm": 2.60634183883667, + "learning_rate": 1.2838411960486075e-07, + "loss": 0.0791, + "step": 36689 + }, + { + "epoch": 0.9284611686109776, + "grad_norm": 4.12844705581665, + "learning_rate": 1.282937308850224e-07, + "loss": 0.1449, + "step": 36690 + }, + { + "epoch": 0.9284864741756712, + "grad_norm": 7.828564167022705, + "learning_rate": 1.2820337358191158e-07, + "loss": 0.0912, + "step": 36691 + }, + { + "epoch": 0.9285117797403649, + "grad_norm": 6.627377986907959, + "learning_rate": 1.2811304769611067e-07, + "loss": 0.2096, + "step": 36692 + }, + { + "epoch": 0.9285370853050586, + "grad_norm": 6.635859489440918, + "learning_rate": 1.2802275322820302e-07, + "loss": 0.1377, + "step": 36693 + }, + { + "epoch": 0.9285623908697522, + "grad_norm": 9.389440536499023, + "learning_rate": 1.2793249017876986e-07, + "loss": 0.1584, + "step": 36694 + }, + { + "epoch": 0.9285876964344459, + "grad_norm": 2.7957897186279297, + "learning_rate": 1.2784225854839462e-07, + "loss": 0.0661, + "step": 36695 + }, + { + "epoch": 0.9286130019991397, + "grad_norm": 4.188013553619385, + "learning_rate": 1.2775205833765737e-07, + "loss": 0.1138, + "step": 36696 + }, + { + "epoch": 0.9286383075638333, + "grad_norm": 3.508561611175537, + "learning_rate": 1.2766188954714153e-07, + "loss": 0.1526, + "step": 36697 + }, + { + "epoch": 0.928663613128527, + "grad_norm": 4.529232501983643, + "learning_rate": 1.2757175217742778e-07, + "loss": 0.1314, + "step": 36698 + }, + { + "epoch": 0.9286889186932207, + "grad_norm": 4.07060432434082, + "learning_rate": 1.274816462290973e-07, + "loss": 0.1442, + "step": 36699 + }, + { + "epoch": 0.9287142242579143, + "grad_norm": 6.4419169425964355, + "learning_rate": 1.2739157170273076e-07, + "loss": 0.1067, + "step": 36700 + }, + { + "epoch": 0.928739529822608, + "grad_norm": 6.334893226623535, + "learning_rate": 1.2730152859891044e-07, + "loss": 0.182, + "step": 36701 + }, + { + "epoch": 0.9287648353873017, + "grad_norm": 3.273552417755127, + "learning_rate": 1.2721151691821588e-07, + "loss": 0.0831, + "step": 36702 + }, + { + "epoch": 0.9287901409519953, + "grad_norm": 12.276182174682617, + "learning_rate": 1.2712153666122772e-07, + "loss": 0.2444, + "step": 36703 + }, + { + "epoch": 0.928815446516689, + "grad_norm": 8.671395301818848, + "learning_rate": 1.2703158782852666e-07, + "loss": 0.1122, + "step": 36704 + }, + { + "epoch": 0.9288407520813827, + "grad_norm": 7.629980564117432, + "learning_rate": 1.2694167042069217e-07, + "loss": 0.2471, + "step": 36705 + }, + { + "epoch": 0.9288660576460763, + "grad_norm": 5.948880195617676, + "learning_rate": 1.2685178443830436e-07, + "loss": 0.1923, + "step": 36706 + }, + { + "epoch": 0.92889136321077, + "grad_norm": 6.237016677856445, + "learning_rate": 1.267619298819428e-07, + "loss": 0.1123, + "step": 36707 + }, + { + "epoch": 0.9289166687754637, + "grad_norm": 10.495077133178711, + "learning_rate": 1.2667210675218754e-07, + "loss": 0.2536, + "step": 36708 + }, + { + "epoch": 0.9289419743401574, + "grad_norm": 15.805266380310059, + "learning_rate": 1.2658231504961704e-07, + "loss": 0.2114, + "step": 36709 + }, + { + "epoch": 0.9289672799048511, + "grad_norm": 3.4431660175323486, + "learning_rate": 1.264925547748108e-07, + "loss": 0.1215, + "step": 36710 + }, + { + "epoch": 0.9289925854695448, + "grad_norm": 3.982208251953125, + "learning_rate": 1.2640282592834673e-07, + "loss": 0.1172, + "step": 36711 + }, + { + "epoch": 0.9290178910342384, + "grad_norm": 3.3134207725524902, + "learning_rate": 1.2631312851080601e-07, + "loss": 0.1242, + "step": 36712 + }, + { + "epoch": 0.9290431965989321, + "grad_norm": 7.8722453117370605, + "learning_rate": 1.2622346252276373e-07, + "loss": 0.3043, + "step": 36713 + }, + { + "epoch": 0.9290685021636258, + "grad_norm": 8.860477447509766, + "learning_rate": 1.2613382796480056e-07, + "loss": 0.1872, + "step": 36714 + }, + { + "epoch": 0.9290938077283195, + "grad_norm": 3.512636184692383, + "learning_rate": 1.2604422483749324e-07, + "loss": 0.1432, + "step": 36715 + }, + { + "epoch": 0.9291191132930131, + "grad_norm": 3.077472686767578, + "learning_rate": 1.259546531414202e-07, + "loss": 0.1148, + "step": 36716 + }, + { + "epoch": 0.9291444188577068, + "grad_norm": 6.707839012145996, + "learning_rate": 1.258651128771593e-07, + "loss": 0.2185, + "step": 36717 + }, + { + "epoch": 0.9291697244224005, + "grad_norm": 8.971723556518555, + "learning_rate": 1.2577560404528788e-07, + "loss": 0.2098, + "step": 36718 + }, + { + "epoch": 0.9291950299870941, + "grad_norm": 2.69061017036438, + "learning_rate": 1.2568612664638213e-07, + "loss": 0.1004, + "step": 36719 + }, + { + "epoch": 0.9292203355517878, + "grad_norm": 4.153936862945557, + "learning_rate": 1.2559668068101993e-07, + "loss": 0.1404, + "step": 36720 + }, + { + "epoch": 0.9292456411164816, + "grad_norm": 3.6558268070220947, + "learning_rate": 1.2550726614977859e-07, + "loss": 0.0746, + "step": 36721 + }, + { + "epoch": 0.9292709466811752, + "grad_norm": 4.074892520904541, + "learning_rate": 1.2541788305323376e-07, + "loss": 0.1544, + "step": 36722 + }, + { + "epoch": 0.9292962522458689, + "grad_norm": 2.91237473487854, + "learning_rate": 1.253285313919622e-07, + "loss": 0.1333, + "step": 36723 + }, + { + "epoch": 0.9293215578105626, + "grad_norm": 7.722021579742432, + "learning_rate": 1.252392111665407e-07, + "loss": 0.2057, + "step": 36724 + }, + { + "epoch": 0.9293468633752562, + "grad_norm": 17.30730628967285, + "learning_rate": 1.251499223775443e-07, + "loss": 0.1958, + "step": 36725 + }, + { + "epoch": 0.9293721689399499, + "grad_norm": 4.7215399742126465, + "learning_rate": 1.250606650255498e-07, + "loss": 0.1612, + "step": 36726 + }, + { + "epoch": 0.9293974745046436, + "grad_norm": 5.218877792358398, + "learning_rate": 1.249714391111312e-07, + "loss": 0.1813, + "step": 36727 + }, + { + "epoch": 0.9294227800693372, + "grad_norm": 5.286922454833984, + "learning_rate": 1.2488224463486576e-07, + "loss": 0.1892, + "step": 36728 + }, + { + "epoch": 0.9294480856340309, + "grad_norm": 4.7263407707214355, + "learning_rate": 1.2479308159732806e-07, + "loss": 0.1267, + "step": 36729 + }, + { + "epoch": 0.9294733911987246, + "grad_norm": 7.418140411376953, + "learning_rate": 1.2470394999909262e-07, + "loss": 0.2834, + "step": 36730 + }, + { + "epoch": 0.9294986967634182, + "grad_norm": 4.082749843597412, + "learning_rate": 1.2461484984073403e-07, + "loss": 0.1533, + "step": 36731 + }, + { + "epoch": 0.9295240023281119, + "grad_norm": 6.1738691329956055, + "learning_rate": 1.2452578112282786e-07, + "loss": 0.2222, + "step": 36732 + }, + { + "epoch": 0.9295493078928057, + "grad_norm": 3.848890781402588, + "learning_rate": 1.2443674384594817e-07, + "loss": 0.107, + "step": 36733 + }, + { + "epoch": 0.9295746134574993, + "grad_norm": 4.618699073791504, + "learning_rate": 1.243477380106689e-07, + "loss": 0.1181, + "step": 36734 + }, + { + "epoch": 0.929599919022193, + "grad_norm": 7.902722358703613, + "learning_rate": 1.2425876361756406e-07, + "loss": 0.1525, + "step": 36735 + }, + { + "epoch": 0.9296252245868867, + "grad_norm": 4.578348636627197, + "learning_rate": 1.2416982066720763e-07, + "loss": 0.1668, + "step": 36736 + }, + { + "epoch": 0.9296505301515803, + "grad_norm": 9.720568656921387, + "learning_rate": 1.24080909160173e-07, + "loss": 0.1079, + "step": 36737 + }, + { + "epoch": 0.929675835716274, + "grad_norm": 4.409139156341553, + "learning_rate": 1.2399202909703312e-07, + "loss": 0.1745, + "step": 36738 + }, + { + "epoch": 0.9297011412809677, + "grad_norm": 3.282526969909668, + "learning_rate": 1.2390318047836248e-07, + "loss": 0.1224, + "step": 36739 + }, + { + "epoch": 0.9297264468456614, + "grad_norm": 3.3876054286956787, + "learning_rate": 1.238143633047323e-07, + "loss": 0.1306, + "step": 36740 + }, + { + "epoch": 0.929751752410355, + "grad_norm": 3.0449812412261963, + "learning_rate": 1.237255775767171e-07, + "loss": 0.0985, + "step": 36741 + }, + { + "epoch": 0.9297770579750487, + "grad_norm": 4.592080593109131, + "learning_rate": 1.2363682329488756e-07, + "loss": 0.1149, + "step": 36742 + }, + { + "epoch": 0.9298023635397424, + "grad_norm": 7.199704647064209, + "learning_rate": 1.2354810045981825e-07, + "loss": 0.1604, + "step": 36743 + }, + { + "epoch": 0.929827669104436, + "grad_norm": 2.2817647457122803, + "learning_rate": 1.2345940907207922e-07, + "loss": 0.0932, + "step": 36744 + }, + { + "epoch": 0.9298529746691298, + "grad_norm": 3.030613422393799, + "learning_rate": 1.2337074913224388e-07, + "loss": 0.1227, + "step": 36745 + }, + { + "epoch": 0.9298782802338235, + "grad_norm": 9.840764045715332, + "learning_rate": 1.2328212064088296e-07, + "loss": 0.2668, + "step": 36746 + }, + { + "epoch": 0.9299035857985171, + "grad_norm": 5.357252597808838, + "learning_rate": 1.2319352359856928e-07, + "loss": 0.1453, + "step": 36747 + }, + { + "epoch": 0.9299288913632108, + "grad_norm": 2.8924508094787598, + "learning_rate": 1.2310495800587298e-07, + "loss": 0.0619, + "step": 36748 + }, + { + "epoch": 0.9299541969279045, + "grad_norm": 3.9864540100097656, + "learning_rate": 1.2301642386336577e-07, + "loss": 0.1185, + "step": 36749 + }, + { + "epoch": 0.9299795024925981, + "grad_norm": 8.593026161193848, + "learning_rate": 1.2292792117161777e-07, + "loss": 0.2031, + "step": 36750 + }, + { + "epoch": 0.9300048080572918, + "grad_norm": 8.604214668273926, + "learning_rate": 1.2283944993120133e-07, + "loss": 0.1961, + "step": 36751 + }, + { + "epoch": 0.9300301136219855, + "grad_norm": 5.106979846954346, + "learning_rate": 1.2275101014268598e-07, + "loss": 0.1505, + "step": 36752 + }, + { + "epoch": 0.9300554191866791, + "grad_norm": 4.720349311828613, + "learning_rate": 1.226626018066418e-07, + "loss": 0.1714, + "step": 36753 + }, + { + "epoch": 0.9300807247513728, + "grad_norm": 5.580519676208496, + "learning_rate": 1.225742249236389e-07, + "loss": 0.1277, + "step": 36754 + }, + { + "epoch": 0.9301060303160665, + "grad_norm": 4.1291022300720215, + "learning_rate": 1.2248587949424795e-07, + "loss": 0.125, + "step": 36755 + }, + { + "epoch": 0.9301313358807601, + "grad_norm": 3.5053372383117676, + "learning_rate": 1.2239756551903846e-07, + "loss": 0.089, + "step": 36756 + }, + { + "epoch": 0.9301566414454538, + "grad_norm": 5.219625949859619, + "learning_rate": 1.2230928299858004e-07, + "loss": 0.189, + "step": 36757 + }, + { + "epoch": 0.9301819470101476, + "grad_norm": 3.990185260772705, + "learning_rate": 1.2222103193344047e-07, + "loss": 0.1482, + "step": 36758 + }, + { + "epoch": 0.9302072525748412, + "grad_norm": 2.7046937942504883, + "learning_rate": 1.2213281232419104e-07, + "loss": 0.081, + "step": 36759 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 7.764378547668457, + "learning_rate": 1.2204462417140018e-07, + "loss": 0.1682, + "step": 36760 + }, + { + "epoch": 0.9302578637042286, + "grad_norm": 6.373410701751709, + "learning_rate": 1.2195646747563572e-07, + "loss": 0.2516, + "step": 36761 + }, + { + "epoch": 0.9302831692689222, + "grad_norm": 5.320797443389893, + "learning_rate": 1.2186834223746612e-07, + "loss": 0.1405, + "step": 36762 + }, + { + "epoch": 0.9303084748336159, + "grad_norm": 3.249255895614624, + "learning_rate": 1.217802484574604e-07, + "loss": 0.1136, + "step": 36763 + }, + { + "epoch": 0.9303337803983096, + "grad_norm": 11.280205726623535, + "learning_rate": 1.2169218613618695e-07, + "loss": 0.1378, + "step": 36764 + }, + { + "epoch": 0.9303590859630033, + "grad_norm": 3.6753146648406982, + "learning_rate": 1.216041552742131e-07, + "loss": 0.1129, + "step": 36765 + }, + { + "epoch": 0.9303843915276969, + "grad_norm": 5.04848575592041, + "learning_rate": 1.215161558721062e-07, + "loss": 0.1701, + "step": 36766 + }, + { + "epoch": 0.9304096970923906, + "grad_norm": 7.604717254638672, + "learning_rate": 1.214281879304341e-07, + "loss": 0.1718, + "step": 36767 + }, + { + "epoch": 0.9304350026570843, + "grad_norm": 3.316939115524292, + "learning_rate": 1.2134025144976413e-07, + "loss": 0.095, + "step": 36768 + }, + { + "epoch": 0.9304603082217779, + "grad_norm": 3.2242441177368164, + "learning_rate": 1.212523464306631e-07, + "loss": 0.1676, + "step": 36769 + }, + { + "epoch": 0.9304856137864717, + "grad_norm": 7.333570957183838, + "learning_rate": 1.2116447287369937e-07, + "loss": 0.2857, + "step": 36770 + }, + { + "epoch": 0.9305109193511654, + "grad_norm": 7.530404090881348, + "learning_rate": 1.2107663077943755e-07, + "loss": 0.2311, + "step": 36771 + }, + { + "epoch": 0.930536224915859, + "grad_norm": 6.846957683563232, + "learning_rate": 1.2098882014844494e-07, + "loss": 0.1351, + "step": 36772 + }, + { + "epoch": 0.9305615304805527, + "grad_norm": 4.18967342376709, + "learning_rate": 1.2090104098128774e-07, + "loss": 0.1234, + "step": 36773 + }, + { + "epoch": 0.9305868360452464, + "grad_norm": 10.826327323913574, + "learning_rate": 1.208132932785333e-07, + "loss": 0.1763, + "step": 36774 + }, + { + "epoch": 0.93061214160994, + "grad_norm": 2.6237027645111084, + "learning_rate": 1.2072557704074506e-07, + "loss": 0.0984, + "step": 36775 + }, + { + "epoch": 0.9306374471746337, + "grad_norm": 3.109452247619629, + "learning_rate": 1.206378922684903e-07, + "loss": 0.1331, + "step": 36776 + }, + { + "epoch": 0.9306627527393274, + "grad_norm": 4.646137714385986, + "learning_rate": 1.2055023896233363e-07, + "loss": 0.234, + "step": 36777 + }, + { + "epoch": 0.930688058304021, + "grad_norm": 9.380280494689941, + "learning_rate": 1.204626171228418e-07, + "loss": 0.1938, + "step": 36778 + }, + { + "epoch": 0.9307133638687147, + "grad_norm": 21.267263412475586, + "learning_rate": 1.2037502675057877e-07, + "loss": 0.175, + "step": 36779 + }, + { + "epoch": 0.9307386694334084, + "grad_norm": 9.723111152648926, + "learning_rate": 1.2028746784610912e-07, + "loss": 0.3027, + "step": 36780 + }, + { + "epoch": 0.930763974998102, + "grad_norm": 13.660683631896973, + "learning_rate": 1.2019994040999793e-07, + "loss": 0.1864, + "step": 36781 + }, + { + "epoch": 0.9307892805627958, + "grad_norm": 3.657977342605591, + "learning_rate": 1.2011244444280977e-07, + "loss": 0.105, + "step": 36782 + }, + { + "epoch": 0.9308145861274895, + "grad_norm": 8.536944389343262, + "learning_rate": 1.2002497994510864e-07, + "loss": 0.1703, + "step": 36783 + }, + { + "epoch": 0.9308398916921831, + "grad_norm": 16.628908157348633, + "learning_rate": 1.1993754691745851e-07, + "loss": 0.2419, + "step": 36784 + }, + { + "epoch": 0.9308651972568768, + "grad_norm": 3.78530216217041, + "learning_rate": 1.198501453604234e-07, + "loss": 0.0886, + "step": 36785 + }, + { + "epoch": 0.9308905028215705, + "grad_norm": 3.2375731468200684, + "learning_rate": 1.1976277527456725e-07, + "loss": 0.1329, + "step": 36786 + }, + { + "epoch": 0.9309158083862641, + "grad_norm": 2.9258315563201904, + "learning_rate": 1.1967543666045301e-07, + "loss": 0.0881, + "step": 36787 + }, + { + "epoch": 0.9309411139509578, + "grad_norm": 8.006627082824707, + "learning_rate": 1.1958812951864406e-07, + "loss": 0.1653, + "step": 36788 + }, + { + "epoch": 0.9309664195156515, + "grad_norm": 5.797707557678223, + "learning_rate": 1.1950085384970388e-07, + "loss": 0.1571, + "step": 36789 + }, + { + "epoch": 0.9309917250803452, + "grad_norm": 5.62337064743042, + "learning_rate": 1.194136096541937e-07, + "loss": 0.1528, + "step": 36790 + }, + { + "epoch": 0.9310170306450388, + "grad_norm": 5.47350549697876, + "learning_rate": 1.1932639693267857e-07, + "loss": 0.1252, + "step": 36791 + }, + { + "epoch": 0.9310423362097325, + "grad_norm": 5.026790618896484, + "learning_rate": 1.1923921568571916e-07, + "loss": 0.163, + "step": 36792 + }, + { + "epoch": 0.9310676417744262, + "grad_norm": 8.00271987915039, + "learning_rate": 1.1915206591387784e-07, + "loss": 0.1228, + "step": 36793 + }, + { + "epoch": 0.9310929473391198, + "grad_norm": 6.054623603820801, + "learning_rate": 1.1906494761771692e-07, + "loss": 0.1632, + "step": 36794 + }, + { + "epoch": 0.9311182529038136, + "grad_norm": 2.3565497398376465, + "learning_rate": 1.1897786079779871e-07, + "loss": 0.0477, + "step": 36795 + }, + { + "epoch": 0.9311435584685073, + "grad_norm": 6.078603744506836, + "learning_rate": 1.188908054546839e-07, + "loss": 0.0775, + "step": 36796 + }, + { + "epoch": 0.9311688640332009, + "grad_norm": 4.91921854019165, + "learning_rate": 1.1880378158893479e-07, + "loss": 0.1749, + "step": 36797 + }, + { + "epoch": 0.9311941695978946, + "grad_norm": 2.3924717903137207, + "learning_rate": 1.1871678920111151e-07, + "loss": 0.111, + "step": 36798 + }, + { + "epoch": 0.9312194751625883, + "grad_norm": 5.122100830078125, + "learning_rate": 1.1862982829177639e-07, + "loss": 0.1293, + "step": 36799 + }, + { + "epoch": 0.9312447807272819, + "grad_norm": 3.954845428466797, + "learning_rate": 1.185428988614884e-07, + "loss": 0.1072, + "step": 36800 + }, + { + "epoch": 0.9312700862919756, + "grad_norm": 6.626687526702881, + "learning_rate": 1.1845600091081044e-07, + "loss": 0.1719, + "step": 36801 + }, + { + "epoch": 0.9312953918566693, + "grad_norm": 15.965494155883789, + "learning_rate": 1.1836913444030096e-07, + "loss": 0.1467, + "step": 36802 + }, + { + "epoch": 0.9313206974213629, + "grad_norm": 5.339414596557617, + "learning_rate": 1.1828229945052117e-07, + "loss": 0.1558, + "step": 36803 + }, + { + "epoch": 0.9313460029860566, + "grad_norm": 4.621650695800781, + "learning_rate": 1.1819549594203062e-07, + "loss": 0.1904, + "step": 36804 + }, + { + "epoch": 0.9313713085507503, + "grad_norm": 6.089974403381348, + "learning_rate": 1.1810872391538997e-07, + "loss": 0.157, + "step": 36805 + }, + { + "epoch": 0.9313966141154439, + "grad_norm": 4.78349494934082, + "learning_rate": 1.180219833711571e-07, + "loss": 0.1795, + "step": 36806 + }, + { + "epoch": 0.9314219196801377, + "grad_norm": 7.566211700439453, + "learning_rate": 1.1793527430989271e-07, + "loss": 0.2365, + "step": 36807 + }, + { + "epoch": 0.9314472252448314, + "grad_norm": 8.5846529006958, + "learning_rate": 1.178485967321552e-07, + "loss": 0.2053, + "step": 36808 + }, + { + "epoch": 0.931472530809525, + "grad_norm": 11.249059677124023, + "learning_rate": 1.1776195063850415e-07, + "loss": 0.3271, + "step": 36809 + }, + { + "epoch": 0.9314978363742187, + "grad_norm": 3.266677141189575, + "learning_rate": 1.1767533602949855e-07, + "loss": 0.0943, + "step": 36810 + }, + { + "epoch": 0.9315231419389124, + "grad_norm": 6.80979061126709, + "learning_rate": 1.1758875290569627e-07, + "loss": 0.1326, + "step": 36811 + }, + { + "epoch": 0.931548447503606, + "grad_norm": 5.190313816070557, + "learning_rate": 1.1750220126765577e-07, + "loss": 0.1519, + "step": 36812 + }, + { + "epoch": 0.9315737530682997, + "grad_norm": 3.761848211288452, + "learning_rate": 1.1741568111593548e-07, + "loss": 0.1538, + "step": 36813 + }, + { + "epoch": 0.9315990586329934, + "grad_norm": 5.0047221183776855, + "learning_rate": 1.1732919245109331e-07, + "loss": 0.1667, + "step": 36814 + }, + { + "epoch": 0.931624364197687, + "grad_norm": 7.647187232971191, + "learning_rate": 1.1724273527368712e-07, + "loss": 0.2136, + "step": 36815 + }, + { + "epoch": 0.9316496697623807, + "grad_norm": 3.354017972946167, + "learning_rate": 1.1715630958427371e-07, + "loss": 0.1281, + "step": 36816 + }, + { + "epoch": 0.9316749753270744, + "grad_norm": 4.833645343780518, + "learning_rate": 1.1706991538341094e-07, + "loss": 0.1497, + "step": 36817 + }, + { + "epoch": 0.9317002808917682, + "grad_norm": 3.864276885986328, + "learning_rate": 1.1698355267165673e-07, + "loss": 0.1357, + "step": 36818 + }, + { + "epoch": 0.9317255864564618, + "grad_norm": 3.7387187480926514, + "learning_rate": 1.1689722144956672e-07, + "loss": 0.1418, + "step": 36819 + }, + { + "epoch": 0.9317508920211555, + "grad_norm": 3.0669312477111816, + "learning_rate": 1.1681092171769882e-07, + "loss": 0.115, + "step": 36820 + }, + { + "epoch": 0.9317761975858492, + "grad_norm": 26.9178409576416, + "learning_rate": 1.1672465347660811e-07, + "loss": 0.2743, + "step": 36821 + }, + { + "epoch": 0.9318015031505428, + "grad_norm": 4.210508346557617, + "learning_rate": 1.1663841672685194e-07, + "loss": 0.1518, + "step": 36822 + }, + { + "epoch": 0.9318268087152365, + "grad_norm": 8.176261901855469, + "learning_rate": 1.1655221146898655e-07, + "loss": 0.1389, + "step": 36823 + }, + { + "epoch": 0.9318521142799302, + "grad_norm": 3.7274880409240723, + "learning_rate": 1.1646603770356757e-07, + "loss": 0.0873, + "step": 36824 + }, + { + "epoch": 0.9318774198446238, + "grad_norm": 6.217278957366943, + "learning_rate": 1.1637989543115014e-07, + "loss": 0.1575, + "step": 36825 + }, + { + "epoch": 0.9319027254093175, + "grad_norm": 3.9926319122314453, + "learning_rate": 1.1629378465229103e-07, + "loss": 0.0853, + "step": 36826 + }, + { + "epoch": 0.9319280309740112, + "grad_norm": 4.409275054931641, + "learning_rate": 1.1620770536754478e-07, + "loss": 0.1668, + "step": 36827 + }, + { + "epoch": 0.9319533365387048, + "grad_norm": 9.375476837158203, + "learning_rate": 1.1612165757746652e-07, + "loss": 0.1982, + "step": 36828 + }, + { + "epoch": 0.9319786421033985, + "grad_norm": 10.695216178894043, + "learning_rate": 1.160356412826108e-07, + "loss": 0.2469, + "step": 36829 + }, + { + "epoch": 0.9320039476680922, + "grad_norm": 4.587436676025391, + "learning_rate": 1.1594965648353274e-07, + "loss": 0.1443, + "step": 36830 + }, + { + "epoch": 0.9320292532327858, + "grad_norm": 4.46609354019165, + "learning_rate": 1.1586370318078688e-07, + "loss": 0.1581, + "step": 36831 + }, + { + "epoch": 0.9320545587974796, + "grad_norm": 6.403511047363281, + "learning_rate": 1.1577778137492834e-07, + "loss": 0.1238, + "step": 36832 + }, + { + "epoch": 0.9320798643621733, + "grad_norm": 2.5211341381073, + "learning_rate": 1.1569189106650947e-07, + "loss": 0.1144, + "step": 36833 + }, + { + "epoch": 0.9321051699268669, + "grad_norm": 4.40438175201416, + "learning_rate": 1.1560603225608536e-07, + "loss": 0.137, + "step": 36834 + }, + { + "epoch": 0.9321304754915606, + "grad_norm": 3.5511343479156494, + "learning_rate": 1.155202049442089e-07, + "loss": 0.0905, + "step": 36835 + }, + { + "epoch": 0.9321557810562543, + "grad_norm": 5.710447788238525, + "learning_rate": 1.1543440913143522e-07, + "loss": 0.1516, + "step": 36836 + }, + { + "epoch": 0.9321810866209479, + "grad_norm": 9.785654067993164, + "learning_rate": 1.1534864481831554e-07, + "loss": 0.159, + "step": 36837 + }, + { + "epoch": 0.9322063921856416, + "grad_norm": 4.019542694091797, + "learning_rate": 1.1526291200540385e-07, + "loss": 0.116, + "step": 36838 + }, + { + "epoch": 0.9322316977503353, + "grad_norm": 14.637378692626953, + "learning_rate": 1.1517721069325305e-07, + "loss": 0.2041, + "step": 36839 + }, + { + "epoch": 0.9322570033150289, + "grad_norm": 10.826424598693848, + "learning_rate": 1.1509154088241659e-07, + "loss": 0.1482, + "step": 36840 + }, + { + "epoch": 0.9322823088797226, + "grad_norm": 11.776863098144531, + "learning_rate": 1.1500590257344512e-07, + "loss": 0.1833, + "step": 36841 + }, + { + "epoch": 0.9323076144444163, + "grad_norm": 5.329712390899658, + "learning_rate": 1.1492029576689213e-07, + "loss": 0.1647, + "step": 36842 + }, + { + "epoch": 0.93233292000911, + "grad_norm": 6.685415744781494, + "learning_rate": 1.1483472046330934e-07, + "loss": 0.195, + "step": 36843 + }, + { + "epoch": 0.9323582255738037, + "grad_norm": 5.820882320404053, + "learning_rate": 1.1474917666324915e-07, + "loss": 0.1363, + "step": 36844 + }, + { + "epoch": 0.9323835311384974, + "grad_norm": 11.161126136779785, + "learning_rate": 1.1466366436726217e-07, + "loss": 0.245, + "step": 36845 + }, + { + "epoch": 0.9324088367031911, + "grad_norm": 9.739657402038574, + "learning_rate": 1.1457818357590134e-07, + "loss": 0.1437, + "step": 36846 + }, + { + "epoch": 0.9324341422678847, + "grad_norm": 3.0966413021087646, + "learning_rate": 1.1449273428971675e-07, + "loss": 0.0554, + "step": 36847 + }, + { + "epoch": 0.9324594478325784, + "grad_norm": 4.579409122467041, + "learning_rate": 1.1440731650925907e-07, + "loss": 0.1453, + "step": 36848 + }, + { + "epoch": 0.9324847533972721, + "grad_norm": 4.9695563316345215, + "learning_rate": 1.1432193023508065e-07, + "loss": 0.1753, + "step": 36849 + }, + { + "epoch": 0.9325100589619657, + "grad_norm": 8.102190971374512, + "learning_rate": 1.1423657546773104e-07, + "loss": 0.104, + "step": 36850 + }, + { + "epoch": 0.9325353645266594, + "grad_norm": 3.4329025745391846, + "learning_rate": 1.141512522077609e-07, + "loss": 0.1288, + "step": 36851 + }, + { + "epoch": 0.9325606700913531, + "grad_norm": 4.902495384216309, + "learning_rate": 1.1406596045571983e-07, + "loss": 0.1936, + "step": 36852 + }, + { + "epoch": 0.9325859756560467, + "grad_norm": 4.497364044189453, + "learning_rate": 1.1398070021215956e-07, + "loss": 0.1238, + "step": 36853 + }, + { + "epoch": 0.9326112812207404, + "grad_norm": 3.238845109939575, + "learning_rate": 1.1389547147762858e-07, + "loss": 0.1266, + "step": 36854 + }, + { + "epoch": 0.9326365867854342, + "grad_norm": 5.081393718719482, + "learning_rate": 1.13810274252677e-07, + "loss": 0.133, + "step": 36855 + }, + { + "epoch": 0.9326618923501278, + "grad_norm": 13.090917587280273, + "learning_rate": 1.1372510853785323e-07, + "loss": 0.2169, + "step": 36856 + }, + { + "epoch": 0.9326871979148215, + "grad_norm": 5.113921165466309, + "learning_rate": 1.13639974333708e-07, + "loss": 0.1922, + "step": 36857 + }, + { + "epoch": 0.9327125034795152, + "grad_norm": 4.415079116821289, + "learning_rate": 1.1355487164078971e-07, + "loss": 0.1212, + "step": 36858 + }, + { + "epoch": 0.9327378090442088, + "grad_norm": 2.6208930015563965, + "learning_rate": 1.1346980045964683e-07, + "loss": 0.1018, + "step": 36859 + }, + { + "epoch": 0.9327631146089025, + "grad_norm": 4.133749008178711, + "learning_rate": 1.1338476079082783e-07, + "loss": 0.0515, + "step": 36860 + }, + { + "epoch": 0.9327884201735962, + "grad_norm": 2.795703172683716, + "learning_rate": 1.1329975263488225e-07, + "loss": 0.1251, + "step": 36861 + }, + { + "epoch": 0.9328137257382898, + "grad_norm": 7.356332778930664, + "learning_rate": 1.1321477599235741e-07, + "loss": 0.215, + "step": 36862 + }, + { + "epoch": 0.9328390313029835, + "grad_norm": 4.737354278564453, + "learning_rate": 1.131298308638018e-07, + "loss": 0.1892, + "step": 36863 + }, + { + "epoch": 0.9328643368676772, + "grad_norm": 5.286291599273682, + "learning_rate": 1.1304491724976219e-07, + "loss": 0.1592, + "step": 36864 + }, + { + "epoch": 0.9328896424323708, + "grad_norm": 5.579028129577637, + "learning_rate": 1.1296003515078701e-07, + "loss": 0.1603, + "step": 36865 + }, + { + "epoch": 0.9329149479970645, + "grad_norm": 4.244559288024902, + "learning_rate": 1.1287518456742309e-07, + "loss": 0.1156, + "step": 36866 + }, + { + "epoch": 0.9329402535617582, + "grad_norm": 4.051600933074951, + "learning_rate": 1.1279036550021938e-07, + "loss": 0.0944, + "step": 36867 + }, + { + "epoch": 0.932965559126452, + "grad_norm": 7.808524131774902, + "learning_rate": 1.1270557794972048e-07, + "loss": 0.1794, + "step": 36868 + }, + { + "epoch": 0.9329908646911456, + "grad_norm": 4.518826007843018, + "learning_rate": 1.1262082191647428e-07, + "loss": 0.1466, + "step": 36869 + }, + { + "epoch": 0.9330161702558393, + "grad_norm": 7.2474470138549805, + "learning_rate": 1.12536097401027e-07, + "loss": 0.1981, + "step": 36870 + }, + { + "epoch": 0.933041475820533, + "grad_norm": 9.04666519165039, + "learning_rate": 1.1245140440392655e-07, + "loss": 0.1771, + "step": 36871 + }, + { + "epoch": 0.9330667813852266, + "grad_norm": 3.3199291229248047, + "learning_rate": 1.1236674292571691e-07, + "loss": 0.1228, + "step": 36872 + }, + { + "epoch": 0.9330920869499203, + "grad_norm": 7.179388523101807, + "learning_rate": 1.1228211296694491e-07, + "loss": 0.114, + "step": 36873 + }, + { + "epoch": 0.933117392514614, + "grad_norm": 2.8155720233917236, + "learning_rate": 1.1219751452815675e-07, + "loss": 0.117, + "step": 36874 + }, + { + "epoch": 0.9331426980793076, + "grad_norm": 5.8961615562438965, + "learning_rate": 1.12112947609897e-07, + "loss": 0.1628, + "step": 36875 + }, + { + "epoch": 0.9331680036440013, + "grad_norm": 4.434198379516602, + "learning_rate": 1.1202841221271243e-07, + "loss": 0.1419, + "step": 36876 + }, + { + "epoch": 0.933193309208695, + "grad_norm": 3.5675606727600098, + "learning_rate": 1.1194390833714708e-07, + "loss": 0.0775, + "step": 36877 + }, + { + "epoch": 0.9332186147733886, + "grad_norm": 4.35434103012085, + "learning_rate": 1.1185943598374605e-07, + "loss": 0.1457, + "step": 36878 + }, + { + "epoch": 0.9332439203380823, + "grad_norm": 3.48594069480896, + "learning_rate": 1.117749951530539e-07, + "loss": 0.0622, + "step": 36879 + }, + { + "epoch": 0.9332692259027761, + "grad_norm": 5.011514663696289, + "learning_rate": 1.1169058584561632e-07, + "loss": 0.104, + "step": 36880 + }, + { + "epoch": 0.9332945314674697, + "grad_norm": 5.746335029602051, + "learning_rate": 1.1160620806197675e-07, + "loss": 0.1019, + "step": 36881 + }, + { + "epoch": 0.9333198370321634, + "grad_norm": 3.1186304092407227, + "learning_rate": 1.1152186180267921e-07, + "loss": 0.0725, + "step": 36882 + }, + { + "epoch": 0.9333451425968571, + "grad_norm": 6.066704750061035, + "learning_rate": 1.114375470682677e-07, + "loss": 0.1649, + "step": 36883 + }, + { + "epoch": 0.9333704481615507, + "grad_norm": 4.039482593536377, + "learning_rate": 1.1135326385928624e-07, + "loss": 0.1293, + "step": 36884 + }, + { + "epoch": 0.9333957537262444, + "grad_norm": 9.727510452270508, + "learning_rate": 1.1126901217627828e-07, + "loss": 0.2616, + "step": 36885 + }, + { + "epoch": 0.9334210592909381, + "grad_norm": 4.3317670822143555, + "learning_rate": 1.1118479201978727e-07, + "loss": 0.0857, + "step": 36886 + }, + { + "epoch": 0.9334463648556317, + "grad_norm": 4.540738582611084, + "learning_rate": 1.1110060339035556e-07, + "loss": 0.1446, + "step": 36887 + }, + { + "epoch": 0.9334716704203254, + "grad_norm": 4.171886444091797, + "learning_rate": 1.1101644628852715e-07, + "loss": 0.1412, + "step": 36888 + }, + { + "epoch": 0.9334969759850191, + "grad_norm": 5.025121212005615, + "learning_rate": 1.1093232071484383e-07, + "loss": 0.1076, + "step": 36889 + }, + { + "epoch": 0.9335222815497127, + "grad_norm": 4.120611667633057, + "learning_rate": 1.1084822666984907e-07, + "loss": 0.1007, + "step": 36890 + }, + { + "epoch": 0.9335475871144064, + "grad_norm": 5.215434551239014, + "learning_rate": 1.1076416415408409e-07, + "loss": 0.1611, + "step": 36891 + }, + { + "epoch": 0.9335728926791002, + "grad_norm": 4.066797256469727, + "learning_rate": 1.1068013316809234e-07, + "loss": 0.1096, + "step": 36892 + }, + { + "epoch": 0.9335981982437939, + "grad_norm": 9.331060409545898, + "learning_rate": 1.1059613371241452e-07, + "loss": 0.1856, + "step": 36893 + }, + { + "epoch": 0.9336235038084875, + "grad_norm": 4.780418872833252, + "learning_rate": 1.1051216578759239e-07, + "loss": 0.1151, + "step": 36894 + }, + { + "epoch": 0.9336488093731812, + "grad_norm": 8.612988471984863, + "learning_rate": 1.1042822939416775e-07, + "loss": 0.1771, + "step": 36895 + }, + { + "epoch": 0.9336741149378749, + "grad_norm": 4.102281093597412, + "learning_rate": 1.1034432453268241e-07, + "loss": 0.1469, + "step": 36896 + }, + { + "epoch": 0.9336994205025685, + "grad_norm": 4.034310817718506, + "learning_rate": 1.1026045120367646e-07, + "loss": 0.1536, + "step": 36897 + }, + { + "epoch": 0.9337247260672622, + "grad_norm": 4.97728157043457, + "learning_rate": 1.1017660940769226e-07, + "loss": 0.155, + "step": 36898 + }, + { + "epoch": 0.9337500316319559, + "grad_norm": 3.9846913814544678, + "learning_rate": 1.1009279914526883e-07, + "loss": 0.0974, + "step": 36899 + }, + { + "epoch": 0.9337753371966495, + "grad_norm": 6.473102569580078, + "learning_rate": 1.1000902041694738e-07, + "loss": 0.1792, + "step": 36900 + }, + { + "epoch": 0.9338006427613432, + "grad_norm": 5.705472469329834, + "learning_rate": 1.0992527322326808e-07, + "loss": 0.1644, + "step": 36901 + }, + { + "epoch": 0.9338259483260369, + "grad_norm": 4.997537612915039, + "learning_rate": 1.0984155756477155e-07, + "loss": 0.1646, + "step": 36902 + }, + { + "epoch": 0.9338512538907305, + "grad_norm": 4.73169469833374, + "learning_rate": 1.0975787344199628e-07, + "loss": 0.1758, + "step": 36903 + }, + { + "epoch": 0.9338765594554242, + "grad_norm": 5.863646507263184, + "learning_rate": 1.0967422085548351e-07, + "loss": 0.141, + "step": 36904 + }, + { + "epoch": 0.933901865020118, + "grad_norm": 7.568219184875488, + "learning_rate": 1.0959059980577224e-07, + "loss": 0.2369, + "step": 36905 + }, + { + "epoch": 0.9339271705848116, + "grad_norm": 8.86223316192627, + "learning_rate": 1.0950701029340094e-07, + "loss": 0.2211, + "step": 36906 + }, + { + "epoch": 0.9339524761495053, + "grad_norm": 5.818072319030762, + "learning_rate": 1.0942345231890916e-07, + "loss": 0.18, + "step": 36907 + }, + { + "epoch": 0.933977781714199, + "grad_norm": 6.047184944152832, + "learning_rate": 1.0933992588283648e-07, + "loss": 0.187, + "step": 36908 + }, + { + "epoch": 0.9340030872788926, + "grad_norm": 3.075033187866211, + "learning_rate": 1.0925643098572025e-07, + "loss": 0.0728, + "step": 36909 + }, + { + "epoch": 0.9340283928435863, + "grad_norm": 4.520312786102295, + "learning_rate": 1.0917296762809947e-07, + "loss": 0.1503, + "step": 36910 + }, + { + "epoch": 0.93405369840828, + "grad_norm": 2.9113361835479736, + "learning_rate": 1.0908953581051373e-07, + "loss": 0.1088, + "step": 36911 + }, + { + "epoch": 0.9340790039729736, + "grad_norm": 5.132198810577393, + "learning_rate": 1.0900613553349815e-07, + "loss": 0.1442, + "step": 36912 + }, + { + "epoch": 0.9341043095376673, + "grad_norm": 7.88291597366333, + "learning_rate": 1.089227667975934e-07, + "loss": 0.1873, + "step": 36913 + }, + { + "epoch": 0.934129615102361, + "grad_norm": 4.000974655151367, + "learning_rate": 1.088394296033346e-07, + "loss": 0.1525, + "step": 36914 + }, + { + "epoch": 0.9341549206670546, + "grad_norm": 24.962360382080078, + "learning_rate": 1.087561239512619e-07, + "loss": 0.1303, + "step": 36915 + }, + { + "epoch": 0.9341802262317483, + "grad_norm": 5.057414531707764, + "learning_rate": 1.086728498419104e-07, + "loss": 0.2201, + "step": 36916 + }, + { + "epoch": 0.9342055317964421, + "grad_norm": 4.922884941101074, + "learning_rate": 1.0858960727581803e-07, + "loss": 0.1282, + "step": 36917 + }, + { + "epoch": 0.9342308373611358, + "grad_norm": 3.670011281967163, + "learning_rate": 1.08506396253521e-07, + "loss": 0.0921, + "step": 36918 + }, + { + "epoch": 0.9342561429258294, + "grad_norm": 5.236960411071777, + "learning_rate": 1.0842321677555667e-07, + "loss": 0.1633, + "step": 36919 + }, + { + "epoch": 0.9342814484905231, + "grad_norm": 3.5539512634277344, + "learning_rate": 1.0834006884246128e-07, + "loss": 0.1474, + "step": 36920 + }, + { + "epoch": 0.9343067540552168, + "grad_norm": 3.8665504455566406, + "learning_rate": 1.0825695245477108e-07, + "loss": 0.171, + "step": 36921 + }, + { + "epoch": 0.9343320596199104, + "grad_norm": 3.9554507732391357, + "learning_rate": 1.0817386761302118e-07, + "loss": 0.1315, + "step": 36922 + }, + { + "epoch": 0.9343573651846041, + "grad_norm": 5.615808963775635, + "learning_rate": 1.0809081431774837e-07, + "loss": 0.1544, + "step": 36923 + }, + { + "epoch": 0.9343826707492978, + "grad_norm": 3.275634527206421, + "learning_rate": 1.0800779256948779e-07, + "loss": 0.1061, + "step": 36924 + }, + { + "epoch": 0.9344079763139914, + "grad_norm": 10.310140609741211, + "learning_rate": 1.0792480236877568e-07, + "loss": 0.2095, + "step": 36925 + }, + { + "epoch": 0.9344332818786851, + "grad_norm": 7.740809917449951, + "learning_rate": 1.0784184371614548e-07, + "loss": 0.1795, + "step": 36926 + }, + { + "epoch": 0.9344585874433788, + "grad_norm": 7.346766471862793, + "learning_rate": 1.07758916612134e-07, + "loss": 0.1408, + "step": 36927 + }, + { + "epoch": 0.9344838930080724, + "grad_norm": 4.544950008392334, + "learning_rate": 1.0767602105727415e-07, + "loss": 0.1589, + "step": 36928 + }, + { + "epoch": 0.9345091985727662, + "grad_norm": 6.393130302429199, + "learning_rate": 1.0759315705210328e-07, + "loss": 0.1845, + "step": 36929 + }, + { + "epoch": 0.9345345041374599, + "grad_norm": 5.9749274253845215, + "learning_rate": 1.075103245971526e-07, + "loss": 0.2073, + "step": 36930 + }, + { + "epoch": 0.9345598097021535, + "grad_norm": 3.003068208694458, + "learning_rate": 1.0742752369295839e-07, + "loss": 0.1142, + "step": 36931 + }, + { + "epoch": 0.9345851152668472, + "grad_norm": 4.645507335662842, + "learning_rate": 1.0734475434005409e-07, + "loss": 0.1501, + "step": 36932 + }, + { + "epoch": 0.9346104208315409, + "grad_norm": 3.7196569442749023, + "learning_rate": 1.0726201653897372e-07, + "loss": 0.1667, + "step": 36933 + }, + { + "epoch": 0.9346357263962345, + "grad_norm": 2.4288272857666016, + "learning_rate": 1.0717931029024964e-07, + "loss": 0.0492, + "step": 36934 + }, + { + "epoch": 0.9346610319609282, + "grad_norm": 9.890948295593262, + "learning_rate": 1.070966355944164e-07, + "loss": 0.2405, + "step": 36935 + }, + { + "epoch": 0.9346863375256219, + "grad_norm": 7.868103504180908, + "learning_rate": 1.0701399245200695e-07, + "loss": 0.1778, + "step": 36936 + }, + { + "epoch": 0.9347116430903155, + "grad_norm": 5.047576904296875, + "learning_rate": 1.0693138086355359e-07, + "loss": 0.2002, + "step": 36937 + }, + { + "epoch": 0.9347369486550092, + "grad_norm": 7.877702236175537, + "learning_rate": 1.0684880082959038e-07, + "loss": 0.1787, + "step": 36938 + }, + { + "epoch": 0.9347622542197029, + "grad_norm": 3.26065993309021, + "learning_rate": 1.0676625235064852e-07, + "loss": 0.1452, + "step": 36939 + }, + { + "epoch": 0.9347875597843965, + "grad_norm": 20.49785804748535, + "learning_rate": 1.0668373542726096e-07, + "loss": 0.2191, + "step": 36940 + }, + { + "epoch": 0.9348128653490902, + "grad_norm": 5.955347061157227, + "learning_rate": 1.0660125005995947e-07, + "loss": 0.1145, + "step": 36941 + }, + { + "epoch": 0.934838170913784, + "grad_norm": 4.2972636222839355, + "learning_rate": 1.0651879624927697e-07, + "loss": 0.1765, + "step": 36942 + }, + { + "epoch": 0.9348634764784776, + "grad_norm": 4.439381122589111, + "learning_rate": 1.0643637399574413e-07, + "loss": 0.1012, + "step": 36943 + }, + { + "epoch": 0.9348887820431713, + "grad_norm": 6.512370586395264, + "learning_rate": 1.0635398329989277e-07, + "loss": 0.1609, + "step": 36944 + }, + { + "epoch": 0.934914087607865, + "grad_norm": 5.600375652313232, + "learning_rate": 1.0627162416225411e-07, + "loss": 0.1851, + "step": 36945 + }, + { + "epoch": 0.9349393931725587, + "grad_norm": 3.3399689197540283, + "learning_rate": 1.0618929658335997e-07, + "loss": 0.1016, + "step": 36946 + }, + { + "epoch": 0.9349646987372523, + "grad_norm": 4.902462005615234, + "learning_rate": 1.0610700056374046e-07, + "loss": 0.1462, + "step": 36947 + }, + { + "epoch": 0.934990004301946, + "grad_norm": 7.154478549957275, + "learning_rate": 1.0602473610392628e-07, + "loss": 0.1872, + "step": 36948 + }, + { + "epoch": 0.9350153098666397, + "grad_norm": 4.28365421295166, + "learning_rate": 1.0594250320444865e-07, + "loss": 0.1135, + "step": 36949 + }, + { + "epoch": 0.9350406154313333, + "grad_norm": 8.491976737976074, + "learning_rate": 1.0586030186583717e-07, + "loss": 0.1735, + "step": 36950 + }, + { + "epoch": 0.935065920996027, + "grad_norm": 4.971207141876221, + "learning_rate": 1.0577813208862253e-07, + "loss": 0.142, + "step": 36951 + }, + { + "epoch": 0.9350912265607207, + "grad_norm": 16.215377807617188, + "learning_rate": 1.056959938733343e-07, + "loss": 0.227, + "step": 36952 + }, + { + "epoch": 0.9351165321254143, + "grad_norm": 6.944485664367676, + "learning_rate": 1.0561388722050148e-07, + "loss": 0.2138, + "step": 36953 + }, + { + "epoch": 0.9351418376901081, + "grad_norm": 3.0296061038970947, + "learning_rate": 1.0553181213065534e-07, + "loss": 0.1524, + "step": 36954 + }, + { + "epoch": 0.9351671432548018, + "grad_norm": 5.114497661590576, + "learning_rate": 1.0544976860432377e-07, + "loss": 0.1931, + "step": 36955 + }, + { + "epoch": 0.9351924488194954, + "grad_norm": 5.259139537811279, + "learning_rate": 1.053677566420358e-07, + "loss": 0.2437, + "step": 36956 + }, + { + "epoch": 0.9352177543841891, + "grad_norm": 4.291841506958008, + "learning_rate": 1.0528577624432046e-07, + "loss": 0.172, + "step": 36957 + }, + { + "epoch": 0.9352430599488828, + "grad_norm": 3.7673518657684326, + "learning_rate": 1.0520382741170731e-07, + "loss": 0.1804, + "step": 36958 + }, + { + "epoch": 0.9352683655135764, + "grad_norm": 4.986557483673096, + "learning_rate": 1.0512191014472429e-07, + "loss": 0.1973, + "step": 36959 + }, + { + "epoch": 0.9352936710782701, + "grad_norm": 7.1226806640625, + "learning_rate": 1.0504002444389926e-07, + "loss": 0.1799, + "step": 36960 + }, + { + "epoch": 0.9353189766429638, + "grad_norm": 4.220097541809082, + "learning_rate": 1.0495817030976019e-07, + "loss": 0.1012, + "step": 36961 + }, + { + "epoch": 0.9353442822076574, + "grad_norm": 6.232185363769531, + "learning_rate": 1.0487634774283606e-07, + "loss": 0.1578, + "step": 36962 + }, + { + "epoch": 0.9353695877723511, + "grad_norm": 4.59188175201416, + "learning_rate": 1.0479455674365368e-07, + "loss": 0.1872, + "step": 36963 + }, + { + "epoch": 0.9353948933370448, + "grad_norm": 7.27177095413208, + "learning_rate": 1.0471279731274042e-07, + "loss": 0.1412, + "step": 36964 + }, + { + "epoch": 0.9354201989017384, + "grad_norm": 4.3629560470581055, + "learning_rate": 1.0463106945062307e-07, + "loss": 0.126, + "step": 36965 + }, + { + "epoch": 0.9354455044664322, + "grad_norm": 9.49609375, + "learning_rate": 1.0454937315783009e-07, + "loss": 0.1635, + "step": 36966 + }, + { + "epoch": 0.9354708100311259, + "grad_norm": 4.086686611175537, + "learning_rate": 1.0446770843488774e-07, + "loss": 0.1918, + "step": 36967 + }, + { + "epoch": 0.9354961155958195, + "grad_norm": 4.458535671234131, + "learning_rate": 1.0438607528232225e-07, + "loss": 0.1531, + "step": 36968 + }, + { + "epoch": 0.9355214211605132, + "grad_norm": 3.2450225353240967, + "learning_rate": 1.0430447370066044e-07, + "loss": 0.0777, + "step": 36969 + }, + { + "epoch": 0.9355467267252069, + "grad_norm": 9.306061744689941, + "learning_rate": 1.0422290369042798e-07, + "loss": 0.1541, + "step": 36970 + }, + { + "epoch": 0.9355720322899006, + "grad_norm": 3.865440607070923, + "learning_rate": 1.0414136525215168e-07, + "loss": 0.1106, + "step": 36971 + }, + { + "epoch": 0.9355973378545942, + "grad_norm": 3.248964786529541, + "learning_rate": 1.0405985838635668e-07, + "loss": 0.0974, + "step": 36972 + }, + { + "epoch": 0.9356226434192879, + "grad_norm": 4.558133602142334, + "learning_rate": 1.0397838309356978e-07, + "loss": 0.143, + "step": 36973 + }, + { + "epoch": 0.9356479489839816, + "grad_norm": 3.5928351879119873, + "learning_rate": 1.0389693937431444e-07, + "loss": 0.1133, + "step": 36974 + }, + { + "epoch": 0.9356732545486752, + "grad_norm": 4.9486517906188965, + "learning_rate": 1.0381552722911803e-07, + "loss": 0.1219, + "step": 36975 + }, + { + "epoch": 0.9356985601133689, + "grad_norm": 13.418517112731934, + "learning_rate": 1.0373414665850346e-07, + "loss": 0.1984, + "step": 36976 + }, + { + "epoch": 0.9357238656780626, + "grad_norm": 3.0425124168395996, + "learning_rate": 1.0365279766299807e-07, + "loss": 0.0768, + "step": 36977 + }, + { + "epoch": 0.9357491712427563, + "grad_norm": 5.019852638244629, + "learning_rate": 1.0357148024312368e-07, + "loss": 0.2343, + "step": 36978 + }, + { + "epoch": 0.93577447680745, + "grad_norm": 4.826659202575684, + "learning_rate": 1.0349019439940655e-07, + "loss": 0.1515, + "step": 36979 + }, + { + "epoch": 0.9357997823721437, + "grad_norm": 3.369436502456665, + "learning_rate": 1.0340894013236957e-07, + "loss": 0.1241, + "step": 36980 + }, + { + "epoch": 0.9358250879368373, + "grad_norm": 5.640555381774902, + "learning_rate": 1.0332771744253844e-07, + "loss": 0.1653, + "step": 36981 + }, + { + "epoch": 0.935850393501531, + "grad_norm": 4.0742716789245605, + "learning_rate": 1.0324652633043608e-07, + "loss": 0.1322, + "step": 36982 + }, + { + "epoch": 0.9358756990662247, + "grad_norm": 5.174886226654053, + "learning_rate": 1.031653667965854e-07, + "loss": 0.1469, + "step": 36983 + }, + { + "epoch": 0.9359010046309183, + "grad_norm": 3.764434337615967, + "learning_rate": 1.0308423884151043e-07, + "loss": 0.1254, + "step": 36984 + }, + { + "epoch": 0.935926310195612, + "grad_norm": 8.224163055419922, + "learning_rate": 1.0300314246573462e-07, + "loss": 0.1186, + "step": 36985 + }, + { + "epoch": 0.9359516157603057, + "grad_norm": 10.294916152954102, + "learning_rate": 1.029220776697809e-07, + "loss": 0.2303, + "step": 36986 + }, + { + "epoch": 0.9359769213249993, + "grad_norm": 6.421672344207764, + "learning_rate": 1.0284104445417109e-07, + "loss": 0.1721, + "step": 36987 + }, + { + "epoch": 0.936002226889693, + "grad_norm": 5.375850200653076, + "learning_rate": 1.0276004281942864e-07, + "loss": 0.1406, + "step": 36988 + }, + { + "epoch": 0.9360275324543867, + "grad_norm": 2.865039587020874, + "learning_rate": 1.0267907276607592e-07, + "loss": 0.1065, + "step": 36989 + }, + { + "epoch": 0.9360528380190803, + "grad_norm": 7.026327133178711, + "learning_rate": 1.0259813429463472e-07, + "loss": 0.1555, + "step": 36990 + }, + { + "epoch": 0.9360781435837741, + "grad_norm": 5.617032051086426, + "learning_rate": 1.0251722740562742e-07, + "loss": 0.1494, + "step": 36991 + }, + { + "epoch": 0.9361034491484678, + "grad_norm": 13.04893684387207, + "learning_rate": 1.0243635209957526e-07, + "loss": 0.208, + "step": 36992 + }, + { + "epoch": 0.9361287547131614, + "grad_norm": 8.618967056274414, + "learning_rate": 1.0235550837700004e-07, + "loss": 0.2375, + "step": 36993 + }, + { + "epoch": 0.9361540602778551, + "grad_norm": 3.725473403930664, + "learning_rate": 1.0227469623842357e-07, + "loss": 0.1367, + "step": 36994 + }, + { + "epoch": 0.9361793658425488, + "grad_norm": 4.158450126647949, + "learning_rate": 1.0219391568436599e-07, + "loss": 0.0936, + "step": 36995 + }, + { + "epoch": 0.9362046714072425, + "grad_norm": 7.742430210113525, + "learning_rate": 1.021131667153491e-07, + "loss": 0.2392, + "step": 36996 + }, + { + "epoch": 0.9362299769719361, + "grad_norm": 5.053892135620117, + "learning_rate": 1.0203244933189305e-07, + "loss": 0.1589, + "step": 36997 + }, + { + "epoch": 0.9362552825366298, + "grad_norm": 5.636527061462402, + "learning_rate": 1.0195176353451853e-07, + "loss": 0.131, + "step": 36998 + }, + { + "epoch": 0.9362805881013235, + "grad_norm": 2.83526873588562, + "learning_rate": 1.0187110932374678e-07, + "loss": 0.1356, + "step": 36999 + }, + { + "epoch": 0.9363058936660171, + "grad_norm": 3.227454900741577, + "learning_rate": 1.0179048670009628e-07, + "loss": 0.0583, + "step": 37000 + }, + { + "epoch": 0.9363311992307108, + "grad_norm": 4.3493123054504395, + "learning_rate": 1.0170989566408773e-07, + "loss": 0.1493, + "step": 37001 + }, + { + "epoch": 0.9363565047954046, + "grad_norm": 4.919332981109619, + "learning_rate": 1.0162933621624182e-07, + "loss": 0.1523, + "step": 37002 + }, + { + "epoch": 0.9363818103600982, + "grad_norm": 3.5008320808410645, + "learning_rate": 1.0154880835707592e-07, + "loss": 0.14, + "step": 37003 + }, + { + "epoch": 0.9364071159247919, + "grad_norm": 3.7841997146606445, + "learning_rate": 1.0146831208711183e-07, + "loss": 0.1216, + "step": 37004 + }, + { + "epoch": 0.9364324214894856, + "grad_norm": 4.099523067474365, + "learning_rate": 1.0138784740686636e-07, + "loss": 0.1598, + "step": 37005 + }, + { + "epoch": 0.9364577270541792, + "grad_norm": 4.590336322784424, + "learning_rate": 1.013074143168602e-07, + "loss": 0.1875, + "step": 37006 + }, + { + "epoch": 0.9364830326188729, + "grad_norm": 8.096944808959961, + "learning_rate": 1.0122701281761072e-07, + "loss": 0.2449, + "step": 37007 + }, + { + "epoch": 0.9365083381835666, + "grad_norm": 6.299679756164551, + "learning_rate": 1.0114664290963805e-07, + "loss": 0.165, + "step": 37008 + }, + { + "epoch": 0.9365336437482602, + "grad_norm": 5.706534385681152, + "learning_rate": 1.0106630459345846e-07, + "loss": 0.2043, + "step": 37009 + }, + { + "epoch": 0.9365589493129539, + "grad_norm": 3.014122247695923, + "learning_rate": 1.0098599786959096e-07, + "loss": 0.1442, + "step": 37010 + }, + { + "epoch": 0.9365842548776476, + "grad_norm": 3.8772809505462646, + "learning_rate": 1.0090572273855348e-07, + "loss": 0.0986, + "step": 37011 + }, + { + "epoch": 0.9366095604423412, + "grad_norm": 5.366689682006836, + "learning_rate": 1.0082547920086395e-07, + "loss": 0.1776, + "step": 37012 + }, + { + "epoch": 0.9366348660070349, + "grad_norm": 4.4405951499938965, + "learning_rate": 1.007452672570397e-07, + "loss": 0.14, + "step": 37013 + }, + { + "epoch": 0.9366601715717287, + "grad_norm": 11.045414924621582, + "learning_rate": 1.0066508690759814e-07, + "loss": 0.2056, + "step": 37014 + }, + { + "epoch": 0.9366854771364223, + "grad_norm": 5.5951948165893555, + "learning_rate": 1.0058493815305492e-07, + "loss": 0.1772, + "step": 37015 + }, + { + "epoch": 0.936710782701116, + "grad_norm": 3.348254442214966, + "learning_rate": 1.0050482099392911e-07, + "loss": 0.1374, + "step": 37016 + }, + { + "epoch": 0.9367360882658097, + "grad_norm": 13.381123542785645, + "learning_rate": 1.0042473543073639e-07, + "loss": 0.3201, + "step": 37017 + }, + { + "epoch": 0.9367613938305033, + "grad_norm": 5.319108486175537, + "learning_rate": 1.0034468146399301e-07, + "loss": 0.1202, + "step": 37018 + }, + { + "epoch": 0.936786699395197, + "grad_norm": 2.931643486022949, + "learning_rate": 1.0026465909421468e-07, + "loss": 0.0922, + "step": 37019 + }, + { + "epoch": 0.9368120049598907, + "grad_norm": 6.112781047821045, + "learning_rate": 1.0018466832191876e-07, + "loss": 0.0828, + "step": 37020 + }, + { + "epoch": 0.9368373105245844, + "grad_norm": 5.734492778778076, + "learning_rate": 1.0010470914762038e-07, + "loss": 0.1548, + "step": 37021 + }, + { + "epoch": 0.936862616089278, + "grad_norm": 9.58495807647705, + "learning_rate": 1.0002478157183582e-07, + "loss": 0.134, + "step": 37022 + }, + { + "epoch": 0.9368879216539717, + "grad_norm": 6.286552429199219, + "learning_rate": 9.994488559507964e-08, + "loss": 0.1443, + "step": 37023 + }, + { + "epoch": 0.9369132272186654, + "grad_norm": 9.096161842346191, + "learning_rate": 9.986502121786645e-08, + "loss": 0.2014, + "step": 37024 + }, + { + "epoch": 0.936938532783359, + "grad_norm": 10.649391174316406, + "learning_rate": 9.978518844071305e-08, + "loss": 0.2592, + "step": 37025 + }, + { + "epoch": 0.9369638383480527, + "grad_norm": 7.536654949188232, + "learning_rate": 9.970538726413404e-08, + "loss": 0.1688, + "step": 37026 + }, + { + "epoch": 0.9369891439127465, + "grad_norm": 3.468506097793579, + "learning_rate": 9.962561768864288e-08, + "loss": 0.1408, + "step": 37027 + }, + { + "epoch": 0.9370144494774401, + "grad_norm": 5.690089225769043, + "learning_rate": 9.954587971475416e-08, + "loss": 0.1181, + "step": 37028 + }, + { + "epoch": 0.9370397550421338, + "grad_norm": 3.6314029693603516, + "learning_rate": 9.946617334298303e-08, + "loss": 0.1276, + "step": 37029 + }, + { + "epoch": 0.9370650606068275, + "grad_norm": 3.585447072982788, + "learning_rate": 9.938649857384296e-08, + "loss": 0.1659, + "step": 37030 + }, + { + "epoch": 0.9370903661715211, + "grad_norm": 3.018420934677124, + "learning_rate": 9.930685540784746e-08, + "loss": 0.0789, + "step": 37031 + }, + { + "epoch": 0.9371156717362148, + "grad_norm": 6.32358455657959, + "learning_rate": 9.922724384551053e-08, + "loss": 0.1523, + "step": 37032 + }, + { + "epoch": 0.9371409773009085, + "grad_norm": 6.1829071044921875, + "learning_rate": 9.914766388734564e-08, + "loss": 0.1681, + "step": 37033 + }, + { + "epoch": 0.9371662828656021, + "grad_norm": 5.5232462882995605, + "learning_rate": 9.906811553386575e-08, + "loss": 0.1264, + "step": 37034 + }, + { + "epoch": 0.9371915884302958, + "grad_norm": 3.297062635421753, + "learning_rate": 9.89885987855843e-08, + "loss": 0.1194, + "step": 37035 + }, + { + "epoch": 0.9372168939949895, + "grad_norm": 3.720005512237549, + "learning_rate": 9.890911364301314e-08, + "loss": 0.1468, + "step": 37036 + }, + { + "epoch": 0.9372421995596831, + "grad_norm": 4.859439849853516, + "learning_rate": 9.882966010666572e-08, + "loss": 0.1227, + "step": 37037 + }, + { + "epoch": 0.9372675051243768, + "grad_norm": 11.813794136047363, + "learning_rate": 9.875023817705386e-08, + "loss": 0.2729, + "step": 37038 + }, + { + "epoch": 0.9372928106890706, + "grad_norm": 9.614870071411133, + "learning_rate": 9.867084785469105e-08, + "loss": 0.1883, + "step": 37039 + }, + { + "epoch": 0.9373181162537642, + "grad_norm": 15.4266357421875, + "learning_rate": 9.859148914008687e-08, + "loss": 0.0942, + "step": 37040 + }, + { + "epoch": 0.9373434218184579, + "grad_norm": 3.0710341930389404, + "learning_rate": 9.851216203375536e-08, + "loss": 0.1504, + "step": 37041 + }, + { + "epoch": 0.9373687273831516, + "grad_norm": 4.271521091461182, + "learning_rate": 9.843286653620609e-08, + "loss": 0.1418, + "step": 37042 + }, + { + "epoch": 0.9373940329478452, + "grad_norm": 5.940021991729736, + "learning_rate": 9.835360264795313e-08, + "loss": 0.1738, + "step": 37043 + }, + { + "epoch": 0.9374193385125389, + "grad_norm": 5.725025653839111, + "learning_rate": 9.827437036950493e-08, + "loss": 0.1305, + "step": 37044 + }, + { + "epoch": 0.9374446440772326, + "grad_norm": 3.141026496887207, + "learning_rate": 9.819516970137333e-08, + "loss": 0.1259, + "step": 37045 + }, + { + "epoch": 0.9374699496419263, + "grad_norm": 2.987302541732788, + "learning_rate": 9.811600064406956e-08, + "loss": 0.1154, + "step": 37046 + }, + { + "epoch": 0.9374952552066199, + "grad_norm": 5.000185012817383, + "learning_rate": 9.803686319810434e-08, + "loss": 0.1338, + "step": 37047 + }, + { + "epoch": 0.9375205607713136, + "grad_norm": 5.778439044952393, + "learning_rate": 9.795775736398728e-08, + "loss": 0.1797, + "step": 37048 + }, + { + "epoch": 0.9375458663360073, + "grad_norm": 6.764671325683594, + "learning_rate": 9.787868314222848e-08, + "loss": 0.1309, + "step": 37049 + }, + { + "epoch": 0.9375711719007009, + "grad_norm": 5.063014030456543, + "learning_rate": 9.779964053333813e-08, + "loss": 0.2002, + "step": 37050 + }, + { + "epoch": 0.9375964774653947, + "grad_norm": 10.859756469726562, + "learning_rate": 9.772062953782635e-08, + "loss": 0.3059, + "step": 37051 + }, + { + "epoch": 0.9376217830300884, + "grad_norm": 4.523056507110596, + "learning_rate": 9.764165015620274e-08, + "loss": 0.1682, + "step": 37052 + }, + { + "epoch": 0.937647088594782, + "grad_norm": 20.691299438476562, + "learning_rate": 9.756270238897581e-08, + "loss": 0.3474, + "step": 37053 + }, + { + "epoch": 0.9376723941594757, + "grad_norm": 2.2654500007629395, + "learning_rate": 9.748378623665511e-08, + "loss": 0.0681, + "step": 37054 + }, + { + "epoch": 0.9376976997241694, + "grad_norm": 7.102685928344727, + "learning_rate": 9.740490169974914e-08, + "loss": 0.1954, + "step": 37055 + }, + { + "epoch": 0.937723005288863, + "grad_norm": 8.858722686767578, + "learning_rate": 9.73260487787675e-08, + "loss": 0.1632, + "step": 37056 + }, + { + "epoch": 0.9377483108535567, + "grad_norm": 6.106456279754639, + "learning_rate": 9.72472274742181e-08, + "loss": 0.2345, + "step": 37057 + }, + { + "epoch": 0.9377736164182504, + "grad_norm": 7.717215061187744, + "learning_rate": 9.716843778660945e-08, + "loss": 0.247, + "step": 37058 + }, + { + "epoch": 0.937798921982944, + "grad_norm": 4.851327419281006, + "learning_rate": 9.708967971644889e-08, + "loss": 0.1942, + "step": 37059 + }, + { + "epoch": 0.9378242275476377, + "grad_norm": 4.415956497192383, + "learning_rate": 9.701095326424603e-08, + "loss": 0.1781, + "step": 37060 + }, + { + "epoch": 0.9378495331123314, + "grad_norm": 2.5350918769836426, + "learning_rate": 9.693225843050658e-08, + "loss": 0.0964, + "step": 37061 + }, + { + "epoch": 0.937874838677025, + "grad_norm": 5.140071392059326, + "learning_rate": 9.685359521573956e-08, + "loss": 0.1286, + "step": 37062 + }, + { + "epoch": 0.9379001442417187, + "grad_norm": 4.583980083465576, + "learning_rate": 9.677496362045124e-08, + "loss": 0.1391, + "step": 37063 + }, + { + "epoch": 0.9379254498064125, + "grad_norm": 3.362518310546875, + "learning_rate": 9.669636364514955e-08, + "loss": 0.0836, + "step": 37064 + }, + { + "epoch": 0.9379507553711061, + "grad_norm": 5.900263786315918, + "learning_rate": 9.66177952903402e-08, + "loss": 0.1275, + "step": 37065 + }, + { + "epoch": 0.9379760609357998, + "grad_norm": 5.722598075866699, + "learning_rate": 9.653925855653168e-08, + "loss": 0.1452, + "step": 37066 + }, + { + "epoch": 0.9380013665004935, + "grad_norm": 9.282523155212402, + "learning_rate": 9.646075344422856e-08, + "loss": 0.1584, + "step": 37067 + }, + { + "epoch": 0.9380266720651871, + "grad_norm": 3.5145468711853027, + "learning_rate": 9.638227995393823e-08, + "loss": 0.1264, + "step": 37068 + }, + { + "epoch": 0.9380519776298808, + "grad_norm": 5.095061302185059, + "learning_rate": 9.630383808616584e-08, + "loss": 0.1304, + "step": 37069 + }, + { + "epoch": 0.9380772831945745, + "grad_norm": 4.672349452972412, + "learning_rate": 9.622542784141875e-08, + "loss": 0.0721, + "step": 37070 + }, + { + "epoch": 0.9381025887592681, + "grad_norm": 5.568387508392334, + "learning_rate": 9.614704922020101e-08, + "loss": 0.1285, + "step": 37071 + }, + { + "epoch": 0.9381278943239618, + "grad_norm": 5.470483303070068, + "learning_rate": 9.60687022230189e-08, + "loss": 0.1583, + "step": 37072 + }, + { + "epoch": 0.9381531998886555, + "grad_norm": 2.646253824234009, + "learning_rate": 9.599038685037753e-08, + "loss": 0.0864, + "step": 37073 + }, + { + "epoch": 0.9381785054533492, + "grad_norm": 7.5217061042785645, + "learning_rate": 9.591210310278265e-08, + "loss": 0.1537, + "step": 37074 + }, + { + "epoch": 0.9382038110180428, + "grad_norm": 10.096858024597168, + "learning_rate": 9.583385098073716e-08, + "loss": 0.2315, + "step": 37075 + }, + { + "epoch": 0.9382291165827366, + "grad_norm": 15.304362297058105, + "learning_rate": 9.57556304847479e-08, + "loss": 0.3159, + "step": 37076 + }, + { + "epoch": 0.9382544221474303, + "grad_norm": 5.407783031463623, + "learning_rate": 9.567744161531778e-08, + "loss": 0.158, + "step": 37077 + }, + { + "epoch": 0.9382797277121239, + "grad_norm": 7.87017297744751, + "learning_rate": 9.55992843729514e-08, + "loss": 0.1089, + "step": 37078 + }, + { + "epoch": 0.9383050332768176, + "grad_norm": 5.811313152313232, + "learning_rate": 9.552115875815337e-08, + "loss": 0.1477, + "step": 37079 + }, + { + "epoch": 0.9383303388415113, + "grad_norm": 5.837364673614502, + "learning_rate": 9.544306477142718e-08, + "loss": 0.0956, + "step": 37080 + }, + { + "epoch": 0.9383556444062049, + "grad_norm": 3.5148162841796875, + "learning_rate": 9.536500241327628e-08, + "loss": 0.1365, + "step": 37081 + }, + { + "epoch": 0.9383809499708986, + "grad_norm": 2.768889904022217, + "learning_rate": 9.528697168420365e-08, + "loss": 0.1021, + "step": 37082 + }, + { + "epoch": 0.9384062555355923, + "grad_norm": 8.482073783874512, + "learning_rate": 9.520897258471384e-08, + "loss": 0.1857, + "step": 37083 + }, + { + "epoch": 0.9384315611002859, + "grad_norm": 6.191765308380127, + "learning_rate": 9.513100511530871e-08, + "loss": 0.1783, + "step": 37084 + }, + { + "epoch": 0.9384568666649796, + "grad_norm": 8.597373962402344, + "learning_rate": 9.505306927649116e-08, + "loss": 0.1756, + "step": 37085 + }, + { + "epoch": 0.9384821722296733, + "grad_norm": 9.22616958618164, + "learning_rate": 9.497516506876358e-08, + "loss": 0.1764, + "step": 37086 + }, + { + "epoch": 0.9385074777943669, + "grad_norm": 9.097260475158691, + "learning_rate": 9.489729249262947e-08, + "loss": 0.1641, + "step": 37087 + }, + { + "epoch": 0.9385327833590607, + "grad_norm": 4.540417194366455, + "learning_rate": 9.481945154859007e-08, + "loss": 0.1295, + "step": 37088 + }, + { + "epoch": 0.9385580889237544, + "grad_norm": 6.64668083190918, + "learning_rate": 9.474164223714777e-08, + "loss": 0.1699, + "step": 37089 + }, + { + "epoch": 0.938583394488448, + "grad_norm": 3.520542621612549, + "learning_rate": 9.466386455880328e-08, + "loss": 0.166, + "step": 37090 + }, + { + "epoch": 0.9386087000531417, + "grad_norm": 4.232426166534424, + "learning_rate": 9.458611851406008e-08, + "loss": 0.0956, + "step": 37091 + }, + { + "epoch": 0.9386340056178354, + "grad_norm": 4.560964107513428, + "learning_rate": 9.450840410341833e-08, + "loss": 0.1757, + "step": 37092 + }, + { + "epoch": 0.938659311182529, + "grad_norm": 15.33349895477295, + "learning_rate": 9.44307213273793e-08, + "loss": 0.1969, + "step": 37093 + }, + { + "epoch": 0.9386846167472227, + "grad_norm": 7.082851886749268, + "learning_rate": 9.435307018644369e-08, + "loss": 0.1509, + "step": 37094 + }, + { + "epoch": 0.9387099223119164, + "grad_norm": 4.848311901092529, + "learning_rate": 9.427545068111277e-08, + "loss": 0.2215, + "step": 37095 + }, + { + "epoch": 0.93873522787661, + "grad_norm": 39.38042068481445, + "learning_rate": 9.419786281188726e-08, + "loss": 0.2853, + "step": 37096 + }, + { + "epoch": 0.9387605334413037, + "grad_norm": 2.239318609237671, + "learning_rate": 9.41203065792673e-08, + "loss": 0.091, + "step": 37097 + }, + { + "epoch": 0.9387858390059974, + "grad_norm": 6.942234992980957, + "learning_rate": 9.404278198375194e-08, + "loss": 0.1434, + "step": 37098 + }, + { + "epoch": 0.9388111445706911, + "grad_norm": 5.2078166007995605, + "learning_rate": 9.396528902584301e-08, + "loss": 0.2021, + "step": 37099 + }, + { + "epoch": 0.9388364501353847, + "grad_norm": 3.6396901607513428, + "learning_rate": 9.388782770603899e-08, + "loss": 0.1161, + "step": 37100 + }, + { + "epoch": 0.9388617557000785, + "grad_norm": 14.223087310791016, + "learning_rate": 9.381039802484004e-08, + "loss": 0.1394, + "step": 37101 + }, + { + "epoch": 0.9388870612647722, + "grad_norm": 5.376776695251465, + "learning_rate": 9.373299998274521e-08, + "loss": 0.1878, + "step": 37102 + }, + { + "epoch": 0.9389123668294658, + "grad_norm": 4.3015899658203125, + "learning_rate": 9.365563358025354e-08, + "loss": 0.116, + "step": 37103 + }, + { + "epoch": 0.9389376723941595, + "grad_norm": 4.4250617027282715, + "learning_rate": 9.357829881786352e-08, + "loss": 0.1328, + "step": 37104 + }, + { + "epoch": 0.9389629779588532, + "grad_norm": 11.992083549499512, + "learning_rate": 9.35009956960753e-08, + "loss": 0.264, + "step": 37105 + }, + { + "epoch": 0.9389882835235468, + "grad_norm": 3.675008535385132, + "learning_rate": 9.34237242153857e-08, + "loss": 0.1246, + "step": 37106 + }, + { + "epoch": 0.9390135890882405, + "grad_norm": 10.40491008758545, + "learning_rate": 9.334648437629435e-08, + "loss": 0.2085, + "step": 37107 + }, + { + "epoch": 0.9390388946529342, + "grad_norm": 5.342727184295654, + "learning_rate": 9.326927617929859e-08, + "loss": 0.1122, + "step": 37108 + }, + { + "epoch": 0.9390642002176278, + "grad_norm": 3.533604145050049, + "learning_rate": 9.319209962489584e-08, + "loss": 0.1123, + "step": 37109 + }, + { + "epoch": 0.9390895057823215, + "grad_norm": 3.7987282276153564, + "learning_rate": 9.311495471358567e-08, + "loss": 0.0872, + "step": 37110 + }, + { + "epoch": 0.9391148113470152, + "grad_norm": 12.279499053955078, + "learning_rate": 9.30378414458638e-08, + "loss": 0.1602, + "step": 37111 + }, + { + "epoch": 0.9391401169117088, + "grad_norm": 6.528666973114014, + "learning_rate": 9.296075982222819e-08, + "loss": 0.136, + "step": 37112 + }, + { + "epoch": 0.9391654224764026, + "grad_norm": 3.85573410987854, + "learning_rate": 9.288370984317563e-08, + "loss": 0.1103, + "step": 37113 + }, + { + "epoch": 0.9391907280410963, + "grad_norm": 10.257601737976074, + "learning_rate": 9.280669150920407e-08, + "loss": 0.0725, + "step": 37114 + }, + { + "epoch": 0.9392160336057899, + "grad_norm": 7.058491230010986, + "learning_rate": 9.272970482080867e-08, + "loss": 0.15, + "step": 37115 + }, + { + "epoch": 0.9392413391704836, + "grad_norm": 2.4156219959259033, + "learning_rate": 9.265274977848681e-08, + "loss": 0.0847, + "step": 37116 + }, + { + "epoch": 0.9392666447351773, + "grad_norm": 1.9088274240493774, + "learning_rate": 9.257582638273422e-08, + "loss": 0.0718, + "step": 37117 + }, + { + "epoch": 0.9392919502998709, + "grad_norm": 8.234481811523438, + "learning_rate": 9.249893463404769e-08, + "loss": 0.2207, + "step": 37118 + }, + { + "epoch": 0.9393172558645646, + "grad_norm": 4.312513828277588, + "learning_rate": 9.24220745329224e-08, + "loss": 0.1598, + "step": 37119 + }, + { + "epoch": 0.9393425614292583, + "grad_norm": 2.9495651721954346, + "learning_rate": 9.234524607985462e-08, + "loss": 0.1292, + "step": 37120 + }, + { + "epoch": 0.9393678669939519, + "grad_norm": 2.2930290699005127, + "learning_rate": 9.226844927533896e-08, + "loss": 0.0389, + "step": 37121 + }, + { + "epoch": 0.9393931725586456, + "grad_norm": 4.825464725494385, + "learning_rate": 9.219168411987111e-08, + "loss": 0.1856, + "step": 37122 + }, + { + "epoch": 0.9394184781233393, + "grad_norm": 3.471968173980713, + "learning_rate": 9.211495061394682e-08, + "loss": 0.0915, + "step": 37123 + }, + { + "epoch": 0.939443783688033, + "grad_norm": 5.152712345123291, + "learning_rate": 9.203824875806011e-08, + "loss": 0.1326, + "step": 37124 + }, + { + "epoch": 0.9394690892527267, + "grad_norm": 4.337971210479736, + "learning_rate": 9.196157855270504e-08, + "loss": 0.1568, + "step": 37125 + }, + { + "epoch": 0.9394943948174204, + "grad_norm": 5.058638572692871, + "learning_rate": 9.188493999837734e-08, + "loss": 0.1913, + "step": 37126 + }, + { + "epoch": 0.9395197003821141, + "grad_norm": 10.37856674194336, + "learning_rate": 9.180833309557102e-08, + "loss": 0.1961, + "step": 37127 + }, + { + "epoch": 0.9395450059468077, + "grad_norm": 6.140896797180176, + "learning_rate": 9.173175784477905e-08, + "loss": 0.1747, + "step": 37128 + }, + { + "epoch": 0.9395703115115014, + "grad_norm": 3.2041501998901367, + "learning_rate": 9.165521424649604e-08, + "loss": 0.0989, + "step": 37129 + }, + { + "epoch": 0.9395956170761951, + "grad_norm": 5.673173904418945, + "learning_rate": 9.157870230121601e-08, + "loss": 0.142, + "step": 37130 + }, + { + "epoch": 0.9396209226408887, + "grad_norm": 11.879117965698242, + "learning_rate": 9.150222200943137e-08, + "loss": 0.2297, + "step": 37131 + }, + { + "epoch": 0.9396462282055824, + "grad_norm": 8.901496887207031, + "learning_rate": 9.142577337163672e-08, + "loss": 0.2293, + "step": 37132 + }, + { + "epoch": 0.9396715337702761, + "grad_norm": 2.4186782836914062, + "learning_rate": 9.134935638832331e-08, + "loss": 0.0661, + "step": 37133 + }, + { + "epoch": 0.9396968393349697, + "grad_norm": 5.680630683898926, + "learning_rate": 9.127297105998523e-08, + "loss": 0.167, + "step": 37134 + }, + { + "epoch": 0.9397221448996634, + "grad_norm": 4.632547378540039, + "learning_rate": 9.119661738711427e-08, + "loss": 0.0745, + "step": 37135 + }, + { + "epoch": 0.9397474504643571, + "grad_norm": 8.857975006103516, + "learning_rate": 9.112029537020395e-08, + "loss": 0.1634, + "step": 37136 + }, + { + "epoch": 0.9397727560290507, + "grad_norm": 6.953880786895752, + "learning_rate": 9.104400500974498e-08, + "loss": 0.1645, + "step": 37137 + }, + { + "epoch": 0.9397980615937445, + "grad_norm": 5.379568576812744, + "learning_rate": 9.096774630623084e-08, + "loss": 0.1154, + "step": 37138 + }, + { + "epoch": 0.9398233671584382, + "grad_norm": 12.084122657775879, + "learning_rate": 9.089151926015227e-08, + "loss": 0.4047, + "step": 37139 + }, + { + "epoch": 0.9398486727231318, + "grad_norm": 7.252776145935059, + "learning_rate": 9.081532387200054e-08, + "loss": 0.2259, + "step": 37140 + }, + { + "epoch": 0.9398739782878255, + "grad_norm": 8.325020790100098, + "learning_rate": 9.073916014226803e-08, + "loss": 0.1993, + "step": 37141 + }, + { + "epoch": 0.9398992838525192, + "grad_norm": 12.531853675842285, + "learning_rate": 9.066302807144545e-08, + "loss": 0.2052, + "step": 37142 + }, + { + "epoch": 0.9399245894172128, + "grad_norm": 3.627288579940796, + "learning_rate": 9.058692766002408e-08, + "loss": 0.1211, + "step": 37143 + }, + { + "epoch": 0.9399498949819065, + "grad_norm": 4.945983409881592, + "learning_rate": 9.051085890849409e-08, + "loss": 0.1331, + "step": 37144 + }, + { + "epoch": 0.9399752005466002, + "grad_norm": 6.886476516723633, + "learning_rate": 9.043482181734675e-08, + "loss": 0.1749, + "step": 37145 + }, + { + "epoch": 0.9400005061112938, + "grad_norm": 6.884150505065918, + "learning_rate": 9.035881638707167e-08, + "loss": 0.1631, + "step": 37146 + }, + { + "epoch": 0.9400258116759875, + "grad_norm": 11.440277099609375, + "learning_rate": 9.0282842618159e-08, + "loss": 0.1777, + "step": 37147 + }, + { + "epoch": 0.9400511172406812, + "grad_norm": 4.66591215133667, + "learning_rate": 9.020690051109893e-08, + "loss": 0.1612, + "step": 37148 + }, + { + "epoch": 0.940076422805375, + "grad_norm": 3.5182032585144043, + "learning_rate": 9.013099006638215e-08, + "loss": 0.1095, + "step": 37149 + }, + { + "epoch": 0.9401017283700686, + "grad_norm": 14.74867057800293, + "learning_rate": 9.005511128449662e-08, + "loss": 0.3438, + "step": 37150 + }, + { + "epoch": 0.9401270339347623, + "grad_norm": 5.937587261199951, + "learning_rate": 8.99792641659325e-08, + "loss": 0.1117, + "step": 37151 + }, + { + "epoch": 0.940152339499456, + "grad_norm": 7.161223411560059, + "learning_rate": 8.990344871117885e-08, + "loss": 0.1439, + "step": 37152 + }, + { + "epoch": 0.9401776450641496, + "grad_norm": 3.9262359142303467, + "learning_rate": 8.98276649207247e-08, + "loss": 0.1228, + "step": 37153 + }, + { + "epoch": 0.9402029506288433, + "grad_norm": 13.693113327026367, + "learning_rate": 8.975191279505802e-08, + "loss": 0.1467, + "step": 37154 + }, + { + "epoch": 0.940228256193537, + "grad_norm": 6.036190986633301, + "learning_rate": 8.96761923346684e-08, + "loss": 0.1013, + "step": 37155 + }, + { + "epoch": 0.9402535617582306, + "grad_norm": 5.424395561218262, + "learning_rate": 8.960050354004324e-08, + "loss": 0.0771, + "step": 37156 + }, + { + "epoch": 0.9402788673229243, + "grad_norm": 6.864121437072754, + "learning_rate": 8.952484641167158e-08, + "loss": 0.2091, + "step": 37157 + }, + { + "epoch": 0.940304172887618, + "grad_norm": 8.122802734375, + "learning_rate": 8.944922095004027e-08, + "loss": 0.1595, + "step": 37158 + }, + { + "epoch": 0.9403294784523116, + "grad_norm": 2.818164825439453, + "learning_rate": 8.93736271556378e-08, + "loss": 0.116, + "step": 37159 + }, + { + "epoch": 0.9403547840170053, + "grad_norm": 4.595949172973633, + "learning_rate": 8.9298065028951e-08, + "loss": 0.1291, + "step": 37160 + }, + { + "epoch": 0.940380089581699, + "grad_norm": 5.493037223815918, + "learning_rate": 8.922253457046782e-08, + "loss": 0.2177, + "step": 37161 + }, + { + "epoch": 0.9404053951463927, + "grad_norm": 3.508385419845581, + "learning_rate": 8.914703578067451e-08, + "loss": 0.1963, + "step": 37162 + }, + { + "epoch": 0.9404307007110864, + "grad_norm": 3.728882074356079, + "learning_rate": 8.907156866005961e-08, + "loss": 0.114, + "step": 37163 + }, + { + "epoch": 0.9404560062757801, + "grad_norm": 5.641571998596191, + "learning_rate": 8.899613320910716e-08, + "loss": 0.1472, + "step": 37164 + }, + { + "epoch": 0.9404813118404737, + "grad_norm": 5.491128921508789, + "learning_rate": 8.892072942830621e-08, + "loss": 0.1499, + "step": 37165 + }, + { + "epoch": 0.9405066174051674, + "grad_norm": 4.796045780181885, + "learning_rate": 8.884535731814192e-08, + "loss": 0.1581, + "step": 37166 + }, + { + "epoch": 0.9405319229698611, + "grad_norm": 8.291730880737305, + "learning_rate": 8.877001687910003e-08, + "loss": 0.1593, + "step": 37167 + }, + { + "epoch": 0.9405572285345547, + "grad_norm": 5.193310737609863, + "learning_rate": 8.86947081116668e-08, + "loss": 0.1421, + "step": 37168 + }, + { + "epoch": 0.9405825340992484, + "grad_norm": 4.389057636260986, + "learning_rate": 8.861943101632797e-08, + "loss": 0.1561, + "step": 37169 + }, + { + "epoch": 0.9406078396639421, + "grad_norm": 5.886246204376221, + "learning_rate": 8.85441855935687e-08, + "loss": 0.1879, + "step": 37170 + }, + { + "epoch": 0.9406331452286357, + "grad_norm": 3.5049362182617188, + "learning_rate": 8.84689718438747e-08, + "loss": 0.1206, + "step": 37171 + }, + { + "epoch": 0.9406584507933294, + "grad_norm": 3.800971031188965, + "learning_rate": 8.839378976773005e-08, + "loss": 0.0809, + "step": 37172 + }, + { + "epoch": 0.9406837563580231, + "grad_norm": 7.568766117095947, + "learning_rate": 8.831863936562046e-08, + "loss": 0.1914, + "step": 37173 + }, + { + "epoch": 0.9407090619227169, + "grad_norm": 3.9709737300872803, + "learning_rate": 8.824352063803054e-08, + "loss": 0.1303, + "step": 37174 + }, + { + "epoch": 0.9407343674874105, + "grad_norm": 16.086023330688477, + "learning_rate": 8.81684335854438e-08, + "loss": 0.2843, + "step": 37175 + }, + { + "epoch": 0.9407596730521042, + "grad_norm": 6.099217414855957, + "learning_rate": 8.809337820834651e-08, + "loss": 0.2186, + "step": 37176 + }, + { + "epoch": 0.9407849786167979, + "grad_norm": 6.208846092224121, + "learning_rate": 8.801835450722052e-08, + "loss": 0.1247, + "step": 37177 + }, + { + "epoch": 0.9408102841814915, + "grad_norm": 3.767951250076294, + "learning_rate": 8.794336248255042e-08, + "loss": 0.1408, + "step": 37178 + }, + { + "epoch": 0.9408355897461852, + "grad_norm": 5.1014533042907715, + "learning_rate": 8.786840213481973e-08, + "loss": 0.156, + "step": 37179 + }, + { + "epoch": 0.9408608953108789, + "grad_norm": 7.160466194152832, + "learning_rate": 8.77934734645125e-08, + "loss": 0.2182, + "step": 37180 + }, + { + "epoch": 0.9408862008755725, + "grad_norm": 6.834997177124023, + "learning_rate": 8.771857647211113e-08, + "loss": 0.2091, + "step": 37181 + }, + { + "epoch": 0.9409115064402662, + "grad_norm": 3.1156458854675293, + "learning_rate": 8.764371115809855e-08, + "loss": 0.1177, + "step": 37182 + }, + { + "epoch": 0.9409368120049599, + "grad_norm": 8.261033058166504, + "learning_rate": 8.756887752295829e-08, + "loss": 0.1405, + "step": 37183 + }, + { + "epoch": 0.9409621175696535, + "grad_norm": 14.218204498291016, + "learning_rate": 8.749407556717272e-08, + "loss": 0.2558, + "step": 37184 + }, + { + "epoch": 0.9409874231343472, + "grad_norm": 11.11892318725586, + "learning_rate": 8.741930529122367e-08, + "loss": 0.1436, + "step": 37185 + }, + { + "epoch": 0.941012728699041, + "grad_norm": 5.345327854156494, + "learning_rate": 8.73445666955941e-08, + "loss": 0.2063, + "step": 37186 + }, + { + "epoch": 0.9410380342637346, + "grad_norm": 2.575514793395996, + "learning_rate": 8.72698597807653e-08, + "loss": 0.1352, + "step": 37187 + }, + { + "epoch": 0.9410633398284283, + "grad_norm": 4.633614540100098, + "learning_rate": 8.719518454721964e-08, + "loss": 0.1254, + "step": 37188 + }, + { + "epoch": 0.941088645393122, + "grad_norm": 5.449425220489502, + "learning_rate": 8.712054099543787e-08, + "loss": 0.1222, + "step": 37189 + }, + { + "epoch": 0.9411139509578156, + "grad_norm": 4.961997032165527, + "learning_rate": 8.704592912590237e-08, + "loss": 0.1564, + "step": 37190 + }, + { + "epoch": 0.9411392565225093, + "grad_norm": 5.845376014709473, + "learning_rate": 8.697134893909332e-08, + "loss": 0.1546, + "step": 37191 + }, + { + "epoch": 0.941164562087203, + "grad_norm": 4.143818378448486, + "learning_rate": 8.689680043549253e-08, + "loss": 0.1573, + "step": 37192 + }, + { + "epoch": 0.9411898676518966, + "grad_norm": 3.0580224990844727, + "learning_rate": 8.682228361558076e-08, + "loss": 0.0913, + "step": 37193 + }, + { + "epoch": 0.9412151732165903, + "grad_norm": 9.16635513305664, + "learning_rate": 8.674779847983761e-08, + "loss": 0.2482, + "step": 37194 + }, + { + "epoch": 0.941240478781284, + "grad_norm": 7.014538288116455, + "learning_rate": 8.66733450287438e-08, + "loss": 0.1892, + "step": 37195 + }, + { + "epoch": 0.9412657843459776, + "grad_norm": 1.9745537042617798, + "learning_rate": 8.659892326278008e-08, + "loss": 0.0565, + "step": 37196 + }, + { + "epoch": 0.9412910899106713, + "grad_norm": 3.7733023166656494, + "learning_rate": 8.652453318242549e-08, + "loss": 0.1126, + "step": 37197 + }, + { + "epoch": 0.941316395475365, + "grad_norm": 5.430765151977539, + "learning_rate": 8.645017478816076e-08, + "loss": 0.1503, + "step": 37198 + }, + { + "epoch": 0.9413417010400587, + "grad_norm": 2.3695273399353027, + "learning_rate": 8.63758480804644e-08, + "loss": 0.1078, + "step": 37199 + }, + { + "epoch": 0.9413670066047524, + "grad_norm": 5.111281871795654, + "learning_rate": 8.630155305981602e-08, + "loss": 0.1071, + "step": 37200 + }, + { + "epoch": 0.9413923121694461, + "grad_norm": 4.5552897453308105, + "learning_rate": 8.622728972669525e-08, + "loss": 0.0949, + "step": 37201 + }, + { + "epoch": 0.9414176177341398, + "grad_norm": 4.343072891235352, + "learning_rate": 8.615305808158115e-08, + "loss": 0.1223, + "step": 37202 + }, + { + "epoch": 0.9414429232988334, + "grad_norm": 3.717935085296631, + "learning_rate": 8.607885812495165e-08, + "loss": 0.1076, + "step": 37203 + }, + { + "epoch": 0.9414682288635271, + "grad_norm": 3.7725436687469482, + "learning_rate": 8.600468985728472e-08, + "loss": 0.1203, + "step": 37204 + }, + { + "epoch": 0.9414935344282208, + "grad_norm": 3.4206149578094482, + "learning_rate": 8.593055327905996e-08, + "loss": 0.1285, + "step": 37205 + }, + { + "epoch": 0.9415188399929144, + "grad_norm": 15.304451942443848, + "learning_rate": 8.58564483907548e-08, + "loss": 0.1937, + "step": 37206 + }, + { + "epoch": 0.9415441455576081, + "grad_norm": 8.81816577911377, + "learning_rate": 8.578237519284827e-08, + "loss": 0.2571, + "step": 37207 + }, + { + "epoch": 0.9415694511223018, + "grad_norm": 4.572242259979248, + "learning_rate": 8.57083336858161e-08, + "loss": 0.2128, + "step": 37208 + }, + { + "epoch": 0.9415947566869954, + "grad_norm": 6.554450988769531, + "learning_rate": 8.563432387013736e-08, + "loss": 0.1774, + "step": 37209 + }, + { + "epoch": 0.9416200622516891, + "grad_norm": 4.725898265838623, + "learning_rate": 8.55603457462878e-08, + "loss": 0.1295, + "step": 37210 + }, + { + "epoch": 0.9416453678163829, + "grad_norm": 4.403651714324951, + "learning_rate": 8.548639931474645e-08, + "loss": 0.1371, + "step": 37211 + }, + { + "epoch": 0.9416706733810765, + "grad_norm": 2.8892271518707275, + "learning_rate": 8.54124845759885e-08, + "loss": 0.0663, + "step": 37212 + }, + { + "epoch": 0.9416959789457702, + "grad_norm": 5.902562141418457, + "learning_rate": 8.53386015304919e-08, + "loss": 0.1882, + "step": 37213 + }, + { + "epoch": 0.9417212845104639, + "grad_norm": 3.480832576751709, + "learning_rate": 8.526475017873181e-08, + "loss": 0.139, + "step": 37214 + }, + { + "epoch": 0.9417465900751575, + "grad_norm": 9.467035293579102, + "learning_rate": 8.51909305211851e-08, + "loss": 0.1376, + "step": 37215 + }, + { + "epoch": 0.9417718956398512, + "grad_norm": 5.105086326599121, + "learning_rate": 8.511714255832859e-08, + "loss": 0.1205, + "step": 37216 + }, + { + "epoch": 0.9417972012045449, + "grad_norm": 5.749993324279785, + "learning_rate": 8.504338629063691e-08, + "loss": 0.1486, + "step": 37217 + }, + { + "epoch": 0.9418225067692385, + "grad_norm": 7.079396724700928, + "learning_rate": 8.496966171858579e-08, + "loss": 0.1634, + "step": 37218 + }, + { + "epoch": 0.9418478123339322, + "grad_norm": 4.137166976928711, + "learning_rate": 8.48959688426515e-08, + "loss": 0.0949, + "step": 37219 + }, + { + "epoch": 0.9418731178986259, + "grad_norm": 7.543015956878662, + "learning_rate": 8.482230766330867e-08, + "loss": 0.1384, + "step": 37220 + }, + { + "epoch": 0.9418984234633195, + "grad_norm": 11.096858978271484, + "learning_rate": 8.474867818103305e-08, + "loss": 0.124, + "step": 37221 + }, + { + "epoch": 0.9419237290280132, + "grad_norm": 6.031565189361572, + "learning_rate": 8.467508039629812e-08, + "loss": 0.2415, + "step": 37222 + }, + { + "epoch": 0.941949034592707, + "grad_norm": 3.6154978275299072, + "learning_rate": 8.460151430957964e-08, + "loss": 0.1429, + "step": 37223 + }, + { + "epoch": 0.9419743401574006, + "grad_norm": 6.5557403564453125, + "learning_rate": 8.452797992135165e-08, + "loss": 0.1195, + "step": 37224 + }, + { + "epoch": 0.9419996457220943, + "grad_norm": 8.409016609191895, + "learning_rate": 8.445447723208877e-08, + "loss": 0.1808, + "step": 37225 + }, + { + "epoch": 0.942024951286788, + "grad_norm": 3.7755320072174072, + "learning_rate": 8.438100624226342e-08, + "loss": 0.1514, + "step": 37226 + }, + { + "epoch": 0.9420502568514817, + "grad_norm": 3.7728967666625977, + "learning_rate": 8.430756695235188e-08, + "loss": 0.1463, + "step": 37227 + }, + { + "epoch": 0.9420755624161753, + "grad_norm": 8.315147399902344, + "learning_rate": 8.423415936282597e-08, + "loss": 0.1923, + "step": 37228 + }, + { + "epoch": 0.942100867980869, + "grad_norm": 13.86034870147705, + "learning_rate": 8.416078347415924e-08, + "loss": 0.2253, + "step": 37229 + }, + { + "epoch": 0.9421261735455627, + "grad_norm": 3.161020278930664, + "learning_rate": 8.408743928682572e-08, + "loss": 0.122, + "step": 37230 + }, + { + "epoch": 0.9421514791102563, + "grad_norm": 3.976893663406372, + "learning_rate": 8.401412680129728e-08, + "loss": 0.1485, + "step": 37231 + }, + { + "epoch": 0.94217678467495, + "grad_norm": 4.147792339324951, + "learning_rate": 8.394084601804798e-08, + "loss": 0.1098, + "step": 37232 + }, + { + "epoch": 0.9422020902396437, + "grad_norm": 6.236659526824951, + "learning_rate": 8.38675969375491e-08, + "loss": 0.1871, + "step": 37233 + }, + { + "epoch": 0.9422273958043373, + "grad_norm": 3.8337903022766113, + "learning_rate": 8.379437956027414e-08, + "loss": 0.1386, + "step": 37234 + }, + { + "epoch": 0.942252701369031, + "grad_norm": 2.013624429702759, + "learning_rate": 8.372119388669442e-08, + "loss": 0.0762, + "step": 37235 + }, + { + "epoch": 0.9422780069337248, + "grad_norm": 5.093268871307373, + "learning_rate": 8.364803991728232e-08, + "loss": 0.1642, + "step": 37236 + }, + { + "epoch": 0.9423033124984184, + "grad_norm": 6.303818225860596, + "learning_rate": 8.357491765250914e-08, + "loss": 0.1484, + "step": 37237 + }, + { + "epoch": 0.9423286180631121, + "grad_norm": 3.6429989337921143, + "learning_rate": 8.350182709284726e-08, + "loss": 0.0914, + "step": 37238 + }, + { + "epoch": 0.9423539236278058, + "grad_norm": 2.993929386138916, + "learning_rate": 8.342876823876745e-08, + "loss": 0.1049, + "step": 37239 + }, + { + "epoch": 0.9423792291924994, + "grad_norm": 4.504434108734131, + "learning_rate": 8.335574109074095e-08, + "loss": 0.1102, + "step": 37240 + }, + { + "epoch": 0.9424045347571931, + "grad_norm": 6.047847270965576, + "learning_rate": 8.328274564923855e-08, + "loss": 0.1696, + "step": 37241 + }, + { + "epoch": 0.9424298403218868, + "grad_norm": 3.5361440181732178, + "learning_rate": 8.32097819147315e-08, + "loss": 0.0853, + "step": 37242 + }, + { + "epoch": 0.9424551458865804, + "grad_norm": 6.136765956878662, + "learning_rate": 8.313684988768944e-08, + "loss": 0.1246, + "step": 37243 + }, + { + "epoch": 0.9424804514512741, + "grad_norm": 9.012954711914062, + "learning_rate": 8.306394956858366e-08, + "loss": 0.1235, + "step": 37244 + }, + { + "epoch": 0.9425057570159678, + "grad_norm": 8.727962493896484, + "learning_rate": 8.299108095788321e-08, + "loss": 0.1485, + "step": 37245 + }, + { + "epoch": 0.9425310625806614, + "grad_norm": 10.967113494873047, + "learning_rate": 8.291824405605942e-08, + "loss": 0.1858, + "step": 37246 + }, + { + "epoch": 0.9425563681453552, + "grad_norm": 10.218833923339844, + "learning_rate": 8.284543886358132e-08, + "loss": 0.2555, + "step": 37247 + }, + { + "epoch": 0.9425816737100489, + "grad_norm": 5.1417975425720215, + "learning_rate": 8.2772665380918e-08, + "loss": 0.1253, + "step": 37248 + }, + { + "epoch": 0.9426069792747425, + "grad_norm": 6.251189708709717, + "learning_rate": 8.269992360853852e-08, + "loss": 0.1088, + "step": 37249 + }, + { + "epoch": 0.9426322848394362, + "grad_norm": 4.872110366821289, + "learning_rate": 8.26272135469136e-08, + "loss": 0.1726, + "step": 37250 + }, + { + "epoch": 0.9426575904041299, + "grad_norm": 15.669716835021973, + "learning_rate": 8.255453519651013e-08, + "loss": 0.2427, + "step": 37251 + }, + { + "epoch": 0.9426828959688236, + "grad_norm": 3.640735149383545, + "learning_rate": 8.248188855779882e-08, + "loss": 0.1235, + "step": 37252 + }, + { + "epoch": 0.9427082015335172, + "grad_norm": 2.301518201828003, + "learning_rate": 8.240927363124596e-08, + "loss": 0.0684, + "step": 37253 + }, + { + "epoch": 0.9427335070982109, + "grad_norm": 14.470123291015625, + "learning_rate": 8.233669041732173e-08, + "loss": 0.0884, + "step": 37254 + }, + { + "epoch": 0.9427588126629046, + "grad_norm": 5.017635345458984, + "learning_rate": 8.226413891649298e-08, + "loss": 0.1133, + "step": 37255 + }, + { + "epoch": 0.9427841182275982, + "grad_norm": 6.56639289855957, + "learning_rate": 8.219161912922824e-08, + "loss": 0.0901, + "step": 37256 + }, + { + "epoch": 0.9428094237922919, + "grad_norm": 5.358842372894287, + "learning_rate": 8.21191310559949e-08, + "loss": 0.0896, + "step": 37257 + }, + { + "epoch": 0.9428347293569856, + "grad_norm": 9.57335090637207, + "learning_rate": 8.204667469726036e-08, + "loss": 0.2491, + "step": 37258 + }, + { + "epoch": 0.9428600349216792, + "grad_norm": 2.802061080932617, + "learning_rate": 8.197425005349202e-08, + "loss": 0.141, + "step": 37259 + }, + { + "epoch": 0.942885340486373, + "grad_norm": 7.054155349731445, + "learning_rate": 8.190185712515731e-08, + "loss": 0.2005, + "step": 37260 + }, + { + "epoch": 0.9429106460510667, + "grad_norm": 4.197763919830322, + "learning_rate": 8.182949591272249e-08, + "loss": 0.1626, + "step": 37261 + }, + { + "epoch": 0.9429359516157603, + "grad_norm": 6.165414810180664, + "learning_rate": 8.175716641665388e-08, + "loss": 0.1808, + "step": 37262 + }, + { + "epoch": 0.942961257180454, + "grad_norm": 11.060144424438477, + "learning_rate": 8.168486863741943e-08, + "loss": 0.2345, + "step": 37263 + }, + { + "epoch": 0.9429865627451477, + "grad_norm": 7.292579174041748, + "learning_rate": 8.161260257548376e-08, + "loss": 0.1506, + "step": 37264 + }, + { + "epoch": 0.9430118683098413, + "grad_norm": 6.9163408279418945, + "learning_rate": 8.154036823131373e-08, + "loss": 0.2007, + "step": 37265 + }, + { + "epoch": 0.943037173874535, + "grad_norm": 2.9170982837677, + "learning_rate": 8.146816560537452e-08, + "loss": 0.1137, + "step": 37266 + }, + { + "epoch": 0.9430624794392287, + "grad_norm": 5.360167503356934, + "learning_rate": 8.139599469813242e-08, + "loss": 0.1679, + "step": 37267 + }, + { + "epoch": 0.9430877850039223, + "grad_norm": 3.8025102615356445, + "learning_rate": 8.132385551005261e-08, + "loss": 0.1373, + "step": 37268 + }, + { + "epoch": 0.943113090568616, + "grad_norm": 3.093580722808838, + "learning_rate": 8.125174804160085e-08, + "loss": 0.1135, + "step": 37269 + }, + { + "epoch": 0.9431383961333097, + "grad_norm": 8.747355461120605, + "learning_rate": 8.117967229324064e-08, + "loss": 0.1213, + "step": 37270 + }, + { + "epoch": 0.9431637016980033, + "grad_norm": 5.8652849197387695, + "learning_rate": 8.110762826543883e-08, + "loss": 0.1227, + "step": 37271 + }, + { + "epoch": 0.943189007262697, + "grad_norm": 3.889803886413574, + "learning_rate": 8.103561595865783e-08, + "loss": 0.1663, + "step": 37272 + }, + { + "epoch": 0.9432143128273908, + "grad_norm": 5.1392974853515625, + "learning_rate": 8.09636353733645e-08, + "loss": 0.1355, + "step": 37273 + }, + { + "epoch": 0.9432396183920844, + "grad_norm": 11.223586082458496, + "learning_rate": 8.08916865100201e-08, + "loss": 0.3212, + "step": 37274 + }, + { + "epoch": 0.9432649239567781, + "grad_norm": 3.1952764987945557, + "learning_rate": 8.081976936909152e-08, + "loss": 0.1523, + "step": 37275 + }, + { + "epoch": 0.9432902295214718, + "grad_norm": 5.679750442504883, + "learning_rate": 8.074788395104005e-08, + "loss": 0.0971, + "step": 37276 + }, + { + "epoch": 0.9433155350861655, + "grad_norm": 5.012497425079346, + "learning_rate": 8.067603025633142e-08, + "loss": 0.137, + "step": 37277 + }, + { + "epoch": 0.9433408406508591, + "grad_norm": 4.506557941436768, + "learning_rate": 8.060420828542747e-08, + "loss": 0.1784, + "step": 37278 + }, + { + "epoch": 0.9433661462155528, + "grad_norm": 5.719659328460693, + "learning_rate": 8.05324180387923e-08, + "loss": 0.1112, + "step": 37279 + }, + { + "epoch": 0.9433914517802465, + "grad_norm": 4.052612781524658, + "learning_rate": 8.046065951688775e-08, + "loss": 0.144, + "step": 37280 + }, + { + "epoch": 0.9434167573449401, + "grad_norm": 2.2908756732940674, + "learning_rate": 8.038893272017789e-08, + "loss": 0.0846, + "step": 37281 + }, + { + "epoch": 0.9434420629096338, + "grad_norm": 5.664912223815918, + "learning_rate": 8.031723764912458e-08, + "loss": 0.1329, + "step": 37282 + }, + { + "epoch": 0.9434673684743276, + "grad_norm": 9.485183715820312, + "learning_rate": 8.024557430419078e-08, + "loss": 0.2452, + "step": 37283 + }, + { + "epoch": 0.9434926740390212, + "grad_norm": 13.2627534866333, + "learning_rate": 8.017394268583722e-08, + "loss": 0.1919, + "step": 37284 + }, + { + "epoch": 0.9435179796037149, + "grad_norm": 6.698087215423584, + "learning_rate": 8.010234279452745e-08, + "loss": 0.1172, + "step": 37285 + }, + { + "epoch": 0.9435432851684086, + "grad_norm": 15.176607131958008, + "learning_rate": 8.003077463072217e-08, + "loss": 0.2664, + "step": 37286 + }, + { + "epoch": 0.9435685907331022, + "grad_norm": 5.419952392578125, + "learning_rate": 7.995923819488327e-08, + "loss": 0.1302, + "step": 37287 + }, + { + "epoch": 0.9435938962977959, + "grad_norm": 4.4821271896362305, + "learning_rate": 7.988773348747203e-08, + "loss": 0.1304, + "step": 37288 + }, + { + "epoch": 0.9436192018624896, + "grad_norm": 3.899183988571167, + "learning_rate": 7.981626050894919e-08, + "loss": 0.1473, + "step": 37289 + }, + { + "epoch": 0.9436445074271832, + "grad_norm": 2.821370840072632, + "learning_rate": 7.974481925977662e-08, + "loss": 0.0615, + "step": 37290 + }, + { + "epoch": 0.9436698129918769, + "grad_norm": 5.6994757652282715, + "learning_rate": 7.967340974041449e-08, + "loss": 0.1065, + "step": 37291 + }, + { + "epoch": 0.9436951185565706, + "grad_norm": 5.420080184936523, + "learning_rate": 7.960203195132354e-08, + "loss": 0.1549, + "step": 37292 + }, + { + "epoch": 0.9437204241212642, + "grad_norm": 5.708460330963135, + "learning_rate": 7.953068589296287e-08, + "loss": 0.1415, + "step": 37293 + }, + { + "epoch": 0.9437457296859579, + "grad_norm": 3.8722634315490723, + "learning_rate": 7.945937156579431e-08, + "loss": 0.1358, + "step": 37294 + }, + { + "epoch": 0.9437710352506516, + "grad_norm": 6.439319133758545, + "learning_rate": 7.938808897027694e-08, + "loss": 0.1831, + "step": 37295 + }, + { + "epoch": 0.9437963408153452, + "grad_norm": 3.2758896350860596, + "learning_rate": 7.931683810686986e-08, + "loss": 0.0997, + "step": 37296 + }, + { + "epoch": 0.943821646380039, + "grad_norm": 5.500715732574463, + "learning_rate": 7.924561897603377e-08, + "loss": 0.169, + "step": 37297 + }, + { + "epoch": 0.9438469519447327, + "grad_norm": 3.736751079559326, + "learning_rate": 7.917443157822725e-08, + "loss": 0.0982, + "step": 37298 + }, + { + "epoch": 0.9438722575094263, + "grad_norm": 12.737344741821289, + "learning_rate": 7.910327591390932e-08, + "loss": 0.2537, + "step": 37299 + }, + { + "epoch": 0.94389756307412, + "grad_norm": 5.897887229919434, + "learning_rate": 7.903215198353909e-08, + "loss": 0.233, + "step": 37300 + }, + { + "epoch": 0.9439228686388137, + "grad_norm": 3.806314706802368, + "learning_rate": 7.896105978757506e-08, + "loss": 0.134, + "step": 37301 + }, + { + "epoch": 0.9439481742035074, + "grad_norm": 4.966341018676758, + "learning_rate": 7.888999932647578e-08, + "loss": 0.1581, + "step": 37302 + }, + { + "epoch": 0.943973479768201, + "grad_norm": 4.32509183883667, + "learning_rate": 7.88189706006992e-08, + "loss": 0.128, + "step": 37303 + }, + { + "epoch": 0.9439987853328947, + "grad_norm": 6.411821365356445, + "learning_rate": 7.874797361070441e-08, + "loss": 0.1691, + "step": 37304 + }, + { + "epoch": 0.9440240908975884, + "grad_norm": 4.334702968597412, + "learning_rate": 7.867700835694825e-08, + "loss": 0.1537, + "step": 37305 + }, + { + "epoch": 0.944049396462282, + "grad_norm": 6.348780155181885, + "learning_rate": 7.860607483988813e-08, + "loss": 0.2572, + "step": 37306 + }, + { + "epoch": 0.9440747020269757, + "grad_norm": 3.7931714057922363, + "learning_rate": 7.853517305998204e-08, + "loss": 0.2054, + "step": 37307 + }, + { + "epoch": 0.9441000075916695, + "grad_norm": 4.851290225982666, + "learning_rate": 7.846430301768848e-08, + "loss": 0.1254, + "step": 37308 + }, + { + "epoch": 0.944125313156363, + "grad_norm": 9.8991117477417, + "learning_rate": 7.839346471346154e-08, + "loss": 0.3534, + "step": 37309 + }, + { + "epoch": 0.9441506187210568, + "grad_norm": 3.6824328899383545, + "learning_rate": 7.83226581477603e-08, + "loss": 0.1566, + "step": 37310 + }, + { + "epoch": 0.9441759242857505, + "grad_norm": 13.450672149658203, + "learning_rate": 7.82518833210405e-08, + "loss": 0.3112, + "step": 37311 + }, + { + "epoch": 0.9442012298504441, + "grad_norm": 4.323325157165527, + "learning_rate": 7.8181140233759e-08, + "loss": 0.116, + "step": 37312 + }, + { + "epoch": 0.9442265354151378, + "grad_norm": 4.288539886474609, + "learning_rate": 7.81104288863721e-08, + "loss": 0.1612, + "step": 37313 + }, + { + "epoch": 0.9442518409798315, + "grad_norm": 2.528665542602539, + "learning_rate": 7.8039749279335e-08, + "loss": 0.0885, + "step": 37314 + }, + { + "epoch": 0.9442771465445251, + "grad_norm": 4.3282856941223145, + "learning_rate": 7.796910141310454e-08, + "loss": 0.0879, + "step": 37315 + }, + { + "epoch": 0.9443024521092188, + "grad_norm": 3.3080179691314697, + "learning_rate": 7.789848528813537e-08, + "loss": 0.0768, + "step": 37316 + }, + { + "epoch": 0.9443277576739125, + "grad_norm": 5.08372688293457, + "learning_rate": 7.782790090488324e-08, + "loss": 0.1374, + "step": 37317 + }, + { + "epoch": 0.9443530632386061, + "grad_norm": 6.175062656402588, + "learning_rate": 7.775734826380332e-08, + "loss": 0.2001, + "step": 37318 + }, + { + "epoch": 0.9443783688032998, + "grad_norm": 3.846905469894409, + "learning_rate": 7.768682736535138e-08, + "loss": 0.1436, + "step": 37319 + }, + { + "epoch": 0.9444036743679936, + "grad_norm": 4.314736366271973, + "learning_rate": 7.761633820998037e-08, + "loss": 0.165, + "step": 37320 + }, + { + "epoch": 0.9444289799326872, + "grad_norm": 5.558509826660156, + "learning_rate": 7.754588079814663e-08, + "loss": 0.1576, + "step": 37321 + }, + { + "epoch": 0.9444542854973809, + "grad_norm": 4.229011058807373, + "learning_rate": 7.747545513030364e-08, + "loss": 0.1569, + "step": 37322 + }, + { + "epoch": 0.9444795910620746, + "grad_norm": 6.9977827072143555, + "learning_rate": 7.740506120690606e-08, + "loss": 0.1191, + "step": 37323 + }, + { + "epoch": 0.9445048966267682, + "grad_norm": 4.888495922088623, + "learning_rate": 7.73346990284074e-08, + "loss": 0.1814, + "step": 37324 + }, + { + "epoch": 0.9445302021914619, + "grad_norm": 18.34082794189453, + "learning_rate": 7.726436859526177e-08, + "loss": 0.1613, + "step": 37325 + }, + { + "epoch": 0.9445555077561556, + "grad_norm": 4.437432289123535, + "learning_rate": 7.71940699079221e-08, + "loss": 0.1655, + "step": 37326 + }, + { + "epoch": 0.9445808133208492, + "grad_norm": 4.258965492248535, + "learning_rate": 7.712380296684252e-08, + "loss": 0.2207, + "step": 37327 + }, + { + "epoch": 0.9446061188855429, + "grad_norm": 7.856846332550049, + "learning_rate": 7.705356777247541e-08, + "loss": 0.2345, + "step": 37328 + }, + { + "epoch": 0.9446314244502366, + "grad_norm": 2.565687894821167, + "learning_rate": 7.698336432527432e-08, + "loss": 0.0847, + "step": 37329 + }, + { + "epoch": 0.9446567300149303, + "grad_norm": 10.324042320251465, + "learning_rate": 7.691319262569219e-08, + "loss": 0.2034, + "step": 37330 + }, + { + "epoch": 0.9446820355796239, + "grad_norm": 4.125009536743164, + "learning_rate": 7.68430526741809e-08, + "loss": 0.1713, + "step": 37331 + }, + { + "epoch": 0.9447073411443176, + "grad_norm": 4.587517261505127, + "learning_rate": 7.677294447119232e-08, + "loss": 0.1762, + "step": 37332 + }, + { + "epoch": 0.9447326467090114, + "grad_norm": 9.405592918395996, + "learning_rate": 7.670286801717997e-08, + "loss": 0.2453, + "step": 37333 + }, + { + "epoch": 0.944757952273705, + "grad_norm": 3.3308446407318115, + "learning_rate": 7.663282331259459e-08, + "loss": 0.1173, + "step": 37334 + }, + { + "epoch": 0.9447832578383987, + "grad_norm": 12.384332656860352, + "learning_rate": 7.65628103578886e-08, + "loss": 0.2061, + "step": 37335 + }, + { + "epoch": 0.9448085634030924, + "grad_norm": 3.726284980773926, + "learning_rate": 7.649282915351275e-08, + "loss": 0.1513, + "step": 37336 + }, + { + "epoch": 0.944833868967786, + "grad_norm": 5.1571855545043945, + "learning_rate": 7.642287969991946e-08, + "loss": 0.137, + "step": 37337 + }, + { + "epoch": 0.9448591745324797, + "grad_norm": 7.017456531524658, + "learning_rate": 7.635296199755838e-08, + "loss": 0.1418, + "step": 37338 + }, + { + "epoch": 0.9448844800971734, + "grad_norm": 3.2944905757904053, + "learning_rate": 7.628307604688246e-08, + "loss": 0.1275, + "step": 37339 + }, + { + "epoch": 0.944909785661867, + "grad_norm": 3.5152065753936768, + "learning_rate": 7.621322184834024e-08, + "loss": 0.109, + "step": 37340 + }, + { + "epoch": 0.9449350912265607, + "grad_norm": 6.808591842651367, + "learning_rate": 7.614339940238303e-08, + "loss": 0.1863, + "step": 37341 + }, + { + "epoch": 0.9449603967912544, + "grad_norm": 9.020405769348145, + "learning_rate": 7.607360870946101e-08, + "loss": 0.2248, + "step": 37342 + }, + { + "epoch": 0.944985702355948, + "grad_norm": 6.315948963165283, + "learning_rate": 7.60038497700244e-08, + "loss": 0.1679, + "step": 37343 + }, + { + "epoch": 0.9450110079206417, + "grad_norm": 5.557557106018066, + "learning_rate": 7.593412258452337e-08, + "loss": 0.131, + "step": 37344 + }, + { + "epoch": 0.9450363134853355, + "grad_norm": 5.618635177612305, + "learning_rate": 7.586442715340758e-08, + "loss": 0.126, + "step": 37345 + }, + { + "epoch": 0.9450616190500291, + "grad_norm": 9.881349563598633, + "learning_rate": 7.579476347712556e-08, + "loss": 0.2548, + "step": 37346 + }, + { + "epoch": 0.9450869246147228, + "grad_norm": 10.340008735656738, + "learning_rate": 7.572513155612693e-08, + "loss": 0.2564, + "step": 37347 + }, + { + "epoch": 0.9451122301794165, + "grad_norm": 10.802129745483398, + "learning_rate": 7.565553139086135e-08, + "loss": 0.2309, + "step": 37348 + }, + { + "epoch": 0.9451375357441101, + "grad_norm": 3.481640100479126, + "learning_rate": 7.558596298177734e-08, + "loss": 0.1194, + "step": 37349 + }, + { + "epoch": 0.9451628413088038, + "grad_norm": 3.4006614685058594, + "learning_rate": 7.551642632932343e-08, + "loss": 0.1508, + "step": 37350 + }, + { + "epoch": 0.9451881468734975, + "grad_norm": 6.621260643005371, + "learning_rate": 7.544692143394705e-08, + "loss": 0.1511, + "step": 37351 + }, + { + "epoch": 0.9452134524381911, + "grad_norm": 6.373878479003906, + "learning_rate": 7.537744829609895e-08, + "loss": 0.1543, + "step": 37352 + }, + { + "epoch": 0.9452387580028848, + "grad_norm": 5.175182342529297, + "learning_rate": 7.530800691622487e-08, + "loss": 0.1202, + "step": 37353 + }, + { + "epoch": 0.9452640635675785, + "grad_norm": 2.5754401683807373, + "learning_rate": 7.523859729477334e-08, + "loss": 0.0697, + "step": 37354 + }, + { + "epoch": 0.9452893691322722, + "grad_norm": 8.83730697631836, + "learning_rate": 7.51692194321918e-08, + "loss": 0.1718, + "step": 37355 + }, + { + "epoch": 0.9453146746969658, + "grad_norm": 4.737055778503418, + "learning_rate": 7.50998733289282e-08, + "loss": 0.1258, + "step": 37356 + }, + { + "epoch": 0.9453399802616596, + "grad_norm": 8.575864791870117, + "learning_rate": 7.503055898543e-08, + "loss": 0.1191, + "step": 37357 + }, + { + "epoch": 0.9453652858263533, + "grad_norm": 8.99791145324707, + "learning_rate": 7.496127640214291e-08, + "loss": 0.1959, + "step": 37358 + }, + { + "epoch": 0.9453905913910469, + "grad_norm": 3.406872510910034, + "learning_rate": 7.489202557951436e-08, + "loss": 0.1329, + "step": 37359 + }, + { + "epoch": 0.9454158969557406, + "grad_norm": 5.440533638000488, + "learning_rate": 7.482280651799123e-08, + "loss": 0.1553, + "step": 37360 + }, + { + "epoch": 0.9454412025204343, + "grad_norm": 1.931339144706726, + "learning_rate": 7.475361921801983e-08, + "loss": 0.066, + "step": 37361 + }, + { + "epoch": 0.9454665080851279, + "grad_norm": 4.721404075622559, + "learning_rate": 7.468446368004589e-08, + "loss": 0.1087, + "step": 37362 + }, + { + "epoch": 0.9454918136498216, + "grad_norm": 8.636533737182617, + "learning_rate": 7.461533990451575e-08, + "loss": 0.1163, + "step": 37363 + }, + { + "epoch": 0.9455171192145153, + "grad_norm": 3.8738694190979004, + "learning_rate": 7.454624789187514e-08, + "loss": 0.0864, + "step": 37364 + }, + { + "epoch": 0.9455424247792089, + "grad_norm": 3.506150245666504, + "learning_rate": 7.447718764256928e-08, + "loss": 0.0796, + "step": 37365 + }, + { + "epoch": 0.9455677303439026, + "grad_norm": 8.349325180053711, + "learning_rate": 7.440815915704447e-08, + "loss": 0.1865, + "step": 37366 + }, + { + "epoch": 0.9455930359085963, + "grad_norm": 10.166679382324219, + "learning_rate": 7.433916243574423e-08, + "loss": 0.1997, + "step": 37367 + }, + { + "epoch": 0.9456183414732899, + "grad_norm": 4.47266960144043, + "learning_rate": 7.427019747911545e-08, + "loss": 0.1541, + "step": 37368 + }, + { + "epoch": 0.9456436470379836, + "grad_norm": 4.2268805503845215, + "learning_rate": 7.42012642876011e-08, + "loss": 0.1532, + "step": 37369 + }, + { + "epoch": 0.9456689526026774, + "grad_norm": 6.80722188949585, + "learning_rate": 7.413236286164749e-08, + "loss": 0.0629, + "step": 37370 + }, + { + "epoch": 0.945694258167371, + "grad_norm": 6.531473159790039, + "learning_rate": 7.406349320169703e-08, + "loss": 0.1344, + "step": 37371 + }, + { + "epoch": 0.9457195637320647, + "grad_norm": 5.770179271697998, + "learning_rate": 7.399465530819549e-08, + "loss": 0.1141, + "step": 37372 + }, + { + "epoch": 0.9457448692967584, + "grad_norm": 14.084403038024902, + "learning_rate": 7.392584918158586e-08, + "loss": 0.3653, + "step": 37373 + }, + { + "epoch": 0.945770174861452, + "grad_norm": 2.781561851501465, + "learning_rate": 7.385707482231163e-08, + "loss": 0.0831, + "step": 37374 + }, + { + "epoch": 0.9457954804261457, + "grad_norm": 3.3948333263397217, + "learning_rate": 7.37883322308175e-08, + "loss": 0.1095, + "step": 37375 + }, + { + "epoch": 0.9458207859908394, + "grad_norm": 5.207907676696777, + "learning_rate": 7.371962140754586e-08, + "loss": 0.1607, + "step": 37376 + }, + { + "epoch": 0.945846091555533, + "grad_norm": 9.299110412597656, + "learning_rate": 7.365094235294024e-08, + "loss": 0.2512, + "step": 37377 + }, + { + "epoch": 0.9458713971202267, + "grad_norm": 5.286705493927002, + "learning_rate": 7.358229506744308e-08, + "loss": 0.0772, + "step": 37378 + }, + { + "epoch": 0.9458967026849204, + "grad_norm": 8.695131301879883, + "learning_rate": 7.351367955149735e-08, + "loss": 0.1854, + "step": 37379 + }, + { + "epoch": 0.9459220082496141, + "grad_norm": 5.85994291305542, + "learning_rate": 7.344509580554548e-08, + "loss": 0.1354, + "step": 37380 + }, + { + "epoch": 0.9459473138143077, + "grad_norm": 5.220363616943359, + "learning_rate": 7.33765438300299e-08, + "loss": 0.2062, + "step": 37381 + }, + { + "epoch": 0.9459726193790015, + "grad_norm": 32.10984802246094, + "learning_rate": 7.330802362539191e-08, + "loss": 0.3819, + "step": 37382 + }, + { + "epoch": 0.9459979249436952, + "grad_norm": 4.0440568923950195, + "learning_rate": 7.323953519207505e-08, + "loss": 0.1642, + "step": 37383 + }, + { + "epoch": 0.9460232305083888, + "grad_norm": 6.847010612487793, + "learning_rate": 7.317107853051952e-08, + "loss": 0.1534, + "step": 37384 + }, + { + "epoch": 0.9460485360730825, + "grad_norm": 3.2992680072784424, + "learning_rate": 7.31026536411672e-08, + "loss": 0.0919, + "step": 37385 + }, + { + "epoch": 0.9460738416377762, + "grad_norm": 11.06777572631836, + "learning_rate": 7.303426052445883e-08, + "loss": 0.3116, + "step": 37386 + }, + { + "epoch": 0.9460991472024698, + "grad_norm": 6.080828666687012, + "learning_rate": 7.296589918083685e-08, + "loss": 0.1463, + "step": 37387 + }, + { + "epoch": 0.9461244527671635, + "grad_norm": 5.262448310852051, + "learning_rate": 7.28975696107409e-08, + "loss": 0.1726, + "step": 37388 + }, + { + "epoch": 0.9461497583318572, + "grad_norm": 4.880038738250732, + "learning_rate": 7.28292718146123e-08, + "loss": 0.1301, + "step": 37389 + }, + { + "epoch": 0.9461750638965508, + "grad_norm": 4.603328704833984, + "learning_rate": 7.27610057928907e-08, + "loss": 0.1459, + "step": 37390 + }, + { + "epoch": 0.9462003694612445, + "grad_norm": 5.71838903427124, + "learning_rate": 7.269277154601739e-08, + "loss": 0.1424, + "step": 37391 + }, + { + "epoch": 0.9462256750259382, + "grad_norm": 6.9368577003479, + "learning_rate": 7.262456907443206e-08, + "loss": 0.1679, + "step": 37392 + }, + { + "epoch": 0.9462509805906318, + "grad_norm": 14.14089298248291, + "learning_rate": 7.255639837857376e-08, + "loss": 0.2593, + "step": 37393 + }, + { + "epoch": 0.9462762861553256, + "grad_norm": 3.084888219833374, + "learning_rate": 7.248825945888271e-08, + "loss": 0.0753, + "step": 37394 + }, + { + "epoch": 0.9463015917200193, + "grad_norm": 7.297711372375488, + "learning_rate": 7.242015231579857e-08, + "loss": 0.201, + "step": 37395 + }, + { + "epoch": 0.9463268972847129, + "grad_norm": 5.115392208099365, + "learning_rate": 7.235207694975988e-08, + "loss": 0.1643, + "step": 37396 + }, + { + "epoch": 0.9463522028494066, + "grad_norm": 6.214675426483154, + "learning_rate": 7.228403336120626e-08, + "loss": 0.2245, + "step": 37397 + }, + { + "epoch": 0.9463775084141003, + "grad_norm": 3.9329099655151367, + "learning_rate": 7.221602155057627e-08, + "loss": 0.1782, + "step": 37398 + }, + { + "epoch": 0.9464028139787939, + "grad_norm": 9.584554672241211, + "learning_rate": 7.214804151830846e-08, + "loss": 0.1653, + "step": 37399 + }, + { + "epoch": 0.9464281195434876, + "grad_norm": 6.756616592407227, + "learning_rate": 7.208009326484133e-08, + "loss": 0.1713, + "step": 37400 + }, + { + "epoch": 0.9464534251081813, + "grad_norm": 8.597591400146484, + "learning_rate": 7.201217679061346e-08, + "loss": 0.2168, + "step": 37401 + }, + { + "epoch": 0.9464787306728749, + "grad_norm": 4.117628574371338, + "learning_rate": 7.194429209606169e-08, + "loss": 0.1503, + "step": 37402 + }, + { + "epoch": 0.9465040362375686, + "grad_norm": 3.7526023387908936, + "learning_rate": 7.187643918162457e-08, + "loss": 0.0902, + "step": 37403 + }, + { + "epoch": 0.9465293418022623, + "grad_norm": 4.648643493652344, + "learning_rate": 7.180861804774008e-08, + "loss": 0.2069, + "step": 37404 + }, + { + "epoch": 0.946554647366956, + "grad_norm": 4.492876052856445, + "learning_rate": 7.174082869484511e-08, + "loss": 0.189, + "step": 37405 + }, + { + "epoch": 0.9465799529316496, + "grad_norm": 3.117816209793091, + "learning_rate": 7.167307112337596e-08, + "loss": 0.1259, + "step": 37406 + }, + { + "epoch": 0.9466052584963434, + "grad_norm": 7.831998825073242, + "learning_rate": 7.160534533377117e-08, + "loss": 0.1696, + "step": 37407 + }, + { + "epoch": 0.9466305640610371, + "grad_norm": 7.917852878570557, + "learning_rate": 7.15376513264665e-08, + "loss": 0.1642, + "step": 37408 + }, + { + "epoch": 0.9466558696257307, + "grad_norm": 4.268033027648926, + "learning_rate": 7.146998910189884e-08, + "loss": 0.1284, + "step": 37409 + }, + { + "epoch": 0.9466811751904244, + "grad_norm": 5.256985664367676, + "learning_rate": 7.14023586605045e-08, + "loss": 0.135, + "step": 37410 + }, + { + "epoch": 0.9467064807551181, + "grad_norm": 3.2853386402130127, + "learning_rate": 7.133476000271921e-08, + "loss": 0.0979, + "step": 37411 + }, + { + "epoch": 0.9467317863198117, + "grad_norm": 6.081639766693115, + "learning_rate": 7.126719312897989e-08, + "loss": 0.1747, + "step": 37412 + }, + { + "epoch": 0.9467570918845054, + "grad_norm": 2.933806896209717, + "learning_rate": 7.119965803972062e-08, + "loss": 0.1161, + "step": 37413 + }, + { + "epoch": 0.9467823974491991, + "grad_norm": 4.702197074890137, + "learning_rate": 7.113215473537937e-08, + "loss": 0.1205, + "step": 37414 + }, + { + "epoch": 0.9468077030138927, + "grad_norm": 4.000575542449951, + "learning_rate": 7.106468321638859e-08, + "loss": 0.1868, + "step": 37415 + }, + { + "epoch": 0.9468330085785864, + "grad_norm": 6.106503486633301, + "learning_rate": 7.09972434831857e-08, + "loss": 0.1633, + "step": 37416 + }, + { + "epoch": 0.9468583141432801, + "grad_norm": 11.472960472106934, + "learning_rate": 7.092983553620425e-08, + "loss": 0.255, + "step": 37417 + }, + { + "epoch": 0.9468836197079737, + "grad_norm": 7.109921455383301, + "learning_rate": 7.086245937587943e-08, + "loss": 0.1559, + "step": 37418 + }, + { + "epoch": 0.9469089252726675, + "grad_norm": 6.216384410858154, + "learning_rate": 7.079511500264646e-08, + "loss": 0.1113, + "step": 37419 + }, + { + "epoch": 0.9469342308373612, + "grad_norm": 5.45478630065918, + "learning_rate": 7.072780241693833e-08, + "loss": 0.1947, + "step": 37420 + }, + { + "epoch": 0.9469595364020548, + "grad_norm": 4.406164646148682, + "learning_rate": 7.066052161918913e-08, + "loss": 0.1241, + "step": 37421 + }, + { + "epoch": 0.9469848419667485, + "grad_norm": 9.324590682983398, + "learning_rate": 7.059327260983406e-08, + "loss": 0.0965, + "step": 37422 + }, + { + "epoch": 0.9470101475314422, + "grad_norm": 4.681393623352051, + "learning_rate": 7.052605538930613e-08, + "loss": 0.1145, + "step": 37423 + }, + { + "epoch": 0.9470354530961358, + "grad_norm": 3.7603166103363037, + "learning_rate": 7.04588699580383e-08, + "loss": 0.082, + "step": 37424 + }, + { + "epoch": 0.9470607586608295, + "grad_norm": 6.535958766937256, + "learning_rate": 7.039171631646414e-08, + "loss": 0.1192, + "step": 37425 + }, + { + "epoch": 0.9470860642255232, + "grad_norm": 5.425017833709717, + "learning_rate": 7.032459446501716e-08, + "loss": 0.1342, + "step": 37426 + }, + { + "epoch": 0.9471113697902168, + "grad_norm": 3.3799326419830322, + "learning_rate": 7.025750440412982e-08, + "loss": 0.1015, + "step": 37427 + }, + { + "epoch": 0.9471366753549105, + "grad_norm": 4.396806716918945, + "learning_rate": 7.019044613423509e-08, + "loss": 0.1627, + "step": 37428 + }, + { + "epoch": 0.9471619809196042, + "grad_norm": 18.04641342163086, + "learning_rate": 7.01234196557643e-08, + "loss": 0.1278, + "step": 37429 + }, + { + "epoch": 0.947187286484298, + "grad_norm": 4.183449745178223, + "learning_rate": 7.005642496915155e-08, + "loss": 0.1105, + "step": 37430 + }, + { + "epoch": 0.9472125920489916, + "grad_norm": 7.6312947273254395, + "learning_rate": 6.998946207482759e-08, + "loss": 0.1205, + "step": 37431 + }, + { + "epoch": 0.9472378976136853, + "grad_norm": 3.3686609268188477, + "learning_rate": 6.992253097322433e-08, + "loss": 0.1583, + "step": 37432 + }, + { + "epoch": 0.947263203178379, + "grad_norm": 14.21839427947998, + "learning_rate": 6.985563166477305e-08, + "loss": 0.1605, + "step": 37433 + }, + { + "epoch": 0.9472885087430726, + "grad_norm": 4.47967529296875, + "learning_rate": 6.978876414990676e-08, + "loss": 0.1745, + "step": 37434 + }, + { + "epoch": 0.9473138143077663, + "grad_norm": 13.638660430908203, + "learning_rate": 6.972192842905511e-08, + "loss": 0.3516, + "step": 37435 + }, + { + "epoch": 0.94733911987246, + "grad_norm": 4.396803855895996, + "learning_rate": 6.965512450265e-08, + "loss": 0.1353, + "step": 37436 + }, + { + "epoch": 0.9473644254371536, + "grad_norm": 16.264076232910156, + "learning_rate": 6.95883523711216e-08, + "loss": 0.2505, + "step": 37437 + }, + { + "epoch": 0.9473897310018473, + "grad_norm": 8.602481842041016, + "learning_rate": 6.952161203490071e-08, + "loss": 0.1711, + "step": 37438 + }, + { + "epoch": 0.947415036566541, + "grad_norm": 5.829176902770996, + "learning_rate": 6.945490349441808e-08, + "loss": 0.2018, + "step": 37439 + }, + { + "epoch": 0.9474403421312346, + "grad_norm": 3.018737316131592, + "learning_rate": 6.938822675010281e-08, + "loss": 0.094, + "step": 37440 + }, + { + "epoch": 0.9474656476959283, + "grad_norm": 4.436282634735107, + "learning_rate": 6.93215818023868e-08, + "loss": 0.1798, + "step": 37441 + }, + { + "epoch": 0.947490953260622, + "grad_norm": 5.298758029937744, + "learning_rate": 6.9254968651698e-08, + "loss": 0.1386, + "step": 37442 + }, + { + "epoch": 0.9475162588253156, + "grad_norm": 8.934261322021484, + "learning_rate": 6.918838729846722e-08, + "loss": 0.2582, + "step": 37443 + }, + { + "epoch": 0.9475415643900094, + "grad_norm": 5.366423606872559, + "learning_rate": 6.912183774312297e-08, + "loss": 0.1716, + "step": 37444 + }, + { + "epoch": 0.9475668699547031, + "grad_norm": 3.8637571334838867, + "learning_rate": 6.905531998609549e-08, + "loss": 0.1423, + "step": 37445 + }, + { + "epoch": 0.9475921755193967, + "grad_norm": 2.74692964553833, + "learning_rate": 6.898883402781164e-08, + "loss": 0.1028, + "step": 37446 + }, + { + "epoch": 0.9476174810840904, + "grad_norm": 9.169804573059082, + "learning_rate": 6.892237986870276e-08, + "loss": 0.2164, + "step": 37447 + }, + { + "epoch": 0.9476427866487841, + "grad_norm": 16.481945037841797, + "learning_rate": 6.885595750919572e-08, + "loss": 0.3249, + "step": 37448 + }, + { + "epoch": 0.9476680922134777, + "grad_norm": 5.773839950561523, + "learning_rate": 6.878956694971905e-08, + "loss": 0.2151, + "step": 37449 + }, + { + "epoch": 0.9476933977781714, + "grad_norm": 6.951021671295166, + "learning_rate": 6.872320819070188e-08, + "loss": 0.1793, + "step": 37450 + }, + { + "epoch": 0.9477187033428651, + "grad_norm": 16.18701171875, + "learning_rate": 6.865688123257108e-08, + "loss": 0.2702, + "step": 37451 + }, + { + "epoch": 0.9477440089075587, + "grad_norm": 5.609018325805664, + "learning_rate": 6.859058607575465e-08, + "loss": 0.11, + "step": 37452 + }, + { + "epoch": 0.9477693144722524, + "grad_norm": 3.9540350437164307, + "learning_rate": 6.852432272068054e-08, + "loss": 0.1776, + "step": 37453 + }, + { + "epoch": 0.9477946200369461, + "grad_norm": 4.847995758056641, + "learning_rate": 6.845809116777568e-08, + "loss": 0.1705, + "step": 37454 + }, + { + "epoch": 0.9478199256016397, + "grad_norm": 2.8002941608428955, + "learning_rate": 6.839189141746749e-08, + "loss": 0.1039, + "step": 37455 + }, + { + "epoch": 0.9478452311663335, + "grad_norm": 11.08659553527832, + "learning_rate": 6.832572347018174e-08, + "loss": 0.1174, + "step": 37456 + }, + { + "epoch": 0.9478705367310272, + "grad_norm": 4.343128681182861, + "learning_rate": 6.825958732634696e-08, + "loss": 0.1713, + "step": 37457 + }, + { + "epoch": 0.9478958422957209, + "grad_norm": 15.245362281799316, + "learning_rate": 6.819348298638839e-08, + "loss": 0.1094, + "step": 37458 + }, + { + "epoch": 0.9479211478604145, + "grad_norm": 3.5903849601745605, + "learning_rate": 6.812741045073235e-08, + "loss": 0.1351, + "step": 37459 + }, + { + "epoch": 0.9479464534251082, + "grad_norm": 5.035039901733398, + "learning_rate": 6.80613697198057e-08, + "loss": 0.1319, + "step": 37460 + }, + { + "epoch": 0.9479717589898019, + "grad_norm": 4.392698764801025, + "learning_rate": 6.799536079403369e-08, + "loss": 0.107, + "step": 37461 + }, + { + "epoch": 0.9479970645544955, + "grad_norm": 6.705382823944092, + "learning_rate": 6.792938367384205e-08, + "loss": 0.2599, + "step": 37462 + }, + { + "epoch": 0.9480223701191892, + "grad_norm": 4.163565635681152, + "learning_rate": 6.786343835965659e-08, + "loss": 0.1129, + "step": 37463 + }, + { + "epoch": 0.9480476756838829, + "grad_norm": 6.8030619621276855, + "learning_rate": 6.779752485190195e-08, + "loss": 0.1111, + "step": 37464 + }, + { + "epoch": 0.9480729812485765, + "grad_norm": 5.470046043395996, + "learning_rate": 6.773164315100444e-08, + "loss": 0.1357, + "step": 37465 + }, + { + "epoch": 0.9480982868132702, + "grad_norm": 6.921432018280029, + "learning_rate": 6.766579325738765e-08, + "loss": 0.1813, + "step": 37466 + }, + { + "epoch": 0.948123592377964, + "grad_norm": 6.326786994934082, + "learning_rate": 6.759997517147676e-08, + "loss": 0.1433, + "step": 37467 + }, + { + "epoch": 0.9481488979426576, + "grad_norm": 5.746601104736328, + "learning_rate": 6.75341888936959e-08, + "loss": 0.1625, + "step": 37468 + }, + { + "epoch": 0.9481742035073513, + "grad_norm": 3.5745456218719482, + "learning_rate": 6.746843442446971e-08, + "loss": 0.1053, + "step": 37469 + }, + { + "epoch": 0.948199509072045, + "grad_norm": 6.282878398895264, + "learning_rate": 6.740271176422175e-08, + "loss": 0.152, + "step": 37470 + }, + { + "epoch": 0.9482248146367386, + "grad_norm": 9.964498519897461, + "learning_rate": 6.733702091337612e-08, + "loss": 0.2494, + "step": 37471 + }, + { + "epoch": 0.9482501202014323, + "grad_norm": 7.233495235443115, + "learning_rate": 6.72713618723575e-08, + "loss": 0.1853, + "step": 37472 + }, + { + "epoch": 0.948275425766126, + "grad_norm": 7.722934722900391, + "learning_rate": 6.72057346415872e-08, + "loss": 0.1487, + "step": 37473 + }, + { + "epoch": 0.9483007313308196, + "grad_norm": 5.749933242797852, + "learning_rate": 6.714013922148987e-08, + "loss": 0.1517, + "step": 37474 + }, + { + "epoch": 0.9483260368955133, + "grad_norm": 10.120835304260254, + "learning_rate": 6.707457561248854e-08, + "loss": 0.2047, + "step": 37475 + }, + { + "epoch": 0.948351342460207, + "grad_norm": 4.312355995178223, + "learning_rate": 6.700904381500561e-08, + "loss": 0.1698, + "step": 37476 + }, + { + "epoch": 0.9483766480249006, + "grad_norm": 2.7714123725891113, + "learning_rate": 6.694354382946356e-08, + "loss": 0.0865, + "step": 37477 + }, + { + "epoch": 0.9484019535895943, + "grad_norm": 5.123661518096924, + "learning_rate": 6.687807565628479e-08, + "loss": 0.1141, + "step": 37478 + }, + { + "epoch": 0.948427259154288, + "grad_norm": 8.185229301452637, + "learning_rate": 6.681263929589177e-08, + "loss": 0.239, + "step": 37479 + }, + { + "epoch": 0.9484525647189816, + "grad_norm": 3.5561745166778564, + "learning_rate": 6.674723474870692e-08, + "loss": 0.0812, + "step": 37480 + }, + { + "epoch": 0.9484778702836754, + "grad_norm": 4.2670063972473145, + "learning_rate": 6.668186201515047e-08, + "loss": 0.1153, + "step": 37481 + }, + { + "epoch": 0.9485031758483691, + "grad_norm": 3.9717905521392822, + "learning_rate": 6.661652109564598e-08, + "loss": 0.1457, + "step": 37482 + }, + { + "epoch": 0.9485284814130628, + "grad_norm": 3.468235492706299, + "learning_rate": 6.65512119906131e-08, + "loss": 0.1021, + "step": 37483 + }, + { + "epoch": 0.9485537869777564, + "grad_norm": 3.8061511516571045, + "learning_rate": 6.648593470047427e-08, + "loss": 0.0452, + "step": 37484 + }, + { + "epoch": 0.9485790925424501, + "grad_norm": 6.13254451751709, + "learning_rate": 6.642068922565026e-08, + "loss": 0.1374, + "step": 37485 + }, + { + "epoch": 0.9486043981071438, + "grad_norm": 5.80885648727417, + "learning_rate": 6.63554755665613e-08, + "loss": 0.1019, + "step": 37486 + }, + { + "epoch": 0.9486297036718374, + "grad_norm": 3.8979811668395996, + "learning_rate": 6.62902937236276e-08, + "loss": 0.1581, + "step": 37487 + }, + { + "epoch": 0.9486550092365311, + "grad_norm": 9.310646057128906, + "learning_rate": 6.622514369727051e-08, + "loss": 0.2572, + "step": 37488 + }, + { + "epoch": 0.9486803148012248, + "grad_norm": 13.914468765258789, + "learning_rate": 6.616002548790967e-08, + "loss": 0.2437, + "step": 37489 + }, + { + "epoch": 0.9487056203659184, + "grad_norm": 10.04433536529541, + "learning_rate": 6.609493909596476e-08, + "loss": 0.1409, + "step": 37490 + }, + { + "epoch": 0.9487309259306121, + "grad_norm": 4.637983322143555, + "learning_rate": 6.602988452185599e-08, + "loss": 0.1013, + "step": 37491 + }, + { + "epoch": 0.9487562314953059, + "grad_norm": 9.236706733703613, + "learning_rate": 6.596486176600303e-08, + "loss": 0.2337, + "step": 37492 + }, + { + "epoch": 0.9487815370599995, + "grad_norm": 11.544986724853516, + "learning_rate": 6.589987082882443e-08, + "loss": 0.185, + "step": 37493 + }, + { + "epoch": 0.9488068426246932, + "grad_norm": 4.616718769073486, + "learning_rate": 6.583491171073986e-08, + "loss": 0.1111, + "step": 37494 + }, + { + "epoch": 0.9488321481893869, + "grad_norm": 11.76126766204834, + "learning_rate": 6.576998441216842e-08, + "loss": 0.1737, + "step": 37495 + }, + { + "epoch": 0.9488574537540805, + "grad_norm": 3.2633674144744873, + "learning_rate": 6.570508893352812e-08, + "loss": 0.1108, + "step": 37496 + }, + { + "epoch": 0.9488827593187742, + "grad_norm": 4.021456718444824, + "learning_rate": 6.564022527523806e-08, + "loss": 0.1207, + "step": 37497 + }, + { + "epoch": 0.9489080648834679, + "grad_norm": 4.194071292877197, + "learning_rate": 6.557539343771624e-08, + "loss": 0.1579, + "step": 37498 + }, + { + "epoch": 0.9489333704481615, + "grad_norm": 6.614253044128418, + "learning_rate": 6.551059342138123e-08, + "loss": 0.1031, + "step": 37499 + }, + { + "epoch": 0.9489586760128552, + "grad_norm": 4.441030979156494, + "learning_rate": 6.544582522664989e-08, + "loss": 0.1225, + "step": 37500 + }, + { + "epoch": 0.9489839815775489, + "grad_norm": 4.41715145111084, + "learning_rate": 6.538108885394079e-08, + "loss": 0.1027, + "step": 37501 + }, + { + "epoch": 0.9490092871422425, + "grad_norm": 8.227867126464844, + "learning_rate": 6.531638430367082e-08, + "loss": 0.1334, + "step": 37502 + }, + { + "epoch": 0.9490345927069362, + "grad_norm": 5.203941345214844, + "learning_rate": 6.525171157625854e-08, + "loss": 0.1906, + "step": 37503 + }, + { + "epoch": 0.94905989827163, + "grad_norm": 5.76509952545166, + "learning_rate": 6.518707067211915e-08, + "loss": 0.1037, + "step": 37504 + }, + { + "epoch": 0.9490852038363236, + "grad_norm": 9.870916366577148, + "learning_rate": 6.512246159167068e-08, + "loss": 0.1925, + "step": 37505 + }, + { + "epoch": 0.9491105094010173, + "grad_norm": 5.529604434967041, + "learning_rate": 6.505788433532944e-08, + "loss": 0.166, + "step": 37506 + }, + { + "epoch": 0.949135814965711, + "grad_norm": 2.720123767852783, + "learning_rate": 6.499333890351234e-08, + "loss": 0.1058, + "step": 37507 + }, + { + "epoch": 0.9491611205304047, + "grad_norm": 3.543067216873169, + "learning_rate": 6.492882529663458e-08, + "loss": 0.0849, + "step": 37508 + }, + { + "epoch": 0.9491864260950983, + "grad_norm": 6.578405857086182, + "learning_rate": 6.486434351511306e-08, + "loss": 0.2002, + "step": 37509 + }, + { + "epoch": 0.949211731659792, + "grad_norm": 4.356404781341553, + "learning_rate": 6.4799893559363e-08, + "loss": 0.1652, + "step": 37510 + }, + { + "epoch": 0.9492370372244857, + "grad_norm": 3.019662857055664, + "learning_rate": 6.473547542980129e-08, + "loss": 0.0485, + "step": 37511 + }, + { + "epoch": 0.9492623427891793, + "grad_norm": 7.022240161895752, + "learning_rate": 6.467108912684206e-08, + "loss": 0.1417, + "step": 37512 + }, + { + "epoch": 0.949287648353873, + "grad_norm": 3.0580334663391113, + "learning_rate": 6.46067346509005e-08, + "loss": 0.0794, + "step": 37513 + }, + { + "epoch": 0.9493129539185667, + "grad_norm": 6.268765926361084, + "learning_rate": 6.454241200239241e-08, + "loss": 0.1904, + "step": 37514 + }, + { + "epoch": 0.9493382594832603, + "grad_norm": 2.5049140453338623, + "learning_rate": 6.447812118173247e-08, + "loss": 0.1088, + "step": 37515 + }, + { + "epoch": 0.949363565047954, + "grad_norm": 6.33327054977417, + "learning_rate": 6.441386218933476e-08, + "loss": 0.168, + "step": 37516 + }, + { + "epoch": 0.9493888706126478, + "grad_norm": 5.978454113006592, + "learning_rate": 6.434963502561398e-08, + "loss": 0.1332, + "step": 37517 + }, + { + "epoch": 0.9494141761773414, + "grad_norm": 5.233337879180908, + "learning_rate": 6.428543969098421e-08, + "loss": 0.2498, + "step": 37518 + }, + { + "epoch": 0.9494394817420351, + "grad_norm": 10.573862075805664, + "learning_rate": 6.422127618585961e-08, + "loss": 0.1995, + "step": 37519 + }, + { + "epoch": 0.9494647873067288, + "grad_norm": 8.056801795959473, + "learning_rate": 6.415714451065425e-08, + "loss": 0.1388, + "step": 37520 + }, + { + "epoch": 0.9494900928714224, + "grad_norm": 13.45886516571045, + "learning_rate": 6.409304466578115e-08, + "loss": 0.2906, + "step": 37521 + }, + { + "epoch": 0.9495153984361161, + "grad_norm": 14.721466064453125, + "learning_rate": 6.402897665165387e-08, + "loss": 0.132, + "step": 37522 + }, + { + "epoch": 0.9495407040008098, + "grad_norm": 3.546800374984741, + "learning_rate": 6.396494046868485e-08, + "loss": 0.0507, + "step": 37523 + }, + { + "epoch": 0.9495660095655034, + "grad_norm": 3.039590358734131, + "learning_rate": 6.390093611728876e-08, + "loss": 0.0797, + "step": 37524 + }, + { + "epoch": 0.9495913151301971, + "grad_norm": 5.174332141876221, + "learning_rate": 6.383696359787694e-08, + "loss": 0.1619, + "step": 37525 + }, + { + "epoch": 0.9496166206948908, + "grad_norm": 6.342062473297119, + "learning_rate": 6.37730229108624e-08, + "loss": 0.224, + "step": 37526 + }, + { + "epoch": 0.9496419262595844, + "grad_norm": 9.961116790771484, + "learning_rate": 6.370911405665702e-08, + "loss": 0.2297, + "step": 37527 + }, + { + "epoch": 0.9496672318242781, + "grad_norm": 6.082192897796631, + "learning_rate": 6.364523703567382e-08, + "loss": 0.1665, + "step": 37528 + }, + { + "epoch": 0.9496925373889719, + "grad_norm": 9.786169052124023, + "learning_rate": 6.358139184832412e-08, + "loss": 0.2422, + "step": 37529 + }, + { + "epoch": 0.9497178429536655, + "grad_norm": 5.803354740142822, + "learning_rate": 6.351757849501982e-08, + "loss": 0.2123, + "step": 37530 + }, + { + "epoch": 0.9497431485183592, + "grad_norm": 3.2381088733673096, + "learning_rate": 6.345379697617226e-08, + "loss": 0.1235, + "step": 37531 + }, + { + "epoch": 0.9497684540830529, + "grad_norm": 8.084271430969238, + "learning_rate": 6.339004729219278e-08, + "loss": 0.1861, + "step": 37532 + }, + { + "epoch": 0.9497937596477466, + "grad_norm": 2.271792411804199, + "learning_rate": 6.332632944349271e-08, + "loss": 0.0386, + "step": 37533 + }, + { + "epoch": 0.9498190652124402, + "grad_norm": 4.908891677856445, + "learning_rate": 6.326264343048339e-08, + "loss": 0.1123, + "step": 37534 + }, + { + "epoch": 0.9498443707771339, + "grad_norm": 3.671152353286743, + "learning_rate": 6.319898925357393e-08, + "loss": 0.0954, + "step": 37535 + }, + { + "epoch": 0.9498696763418276, + "grad_norm": 4.928508281707764, + "learning_rate": 6.313536691317679e-08, + "loss": 0.1093, + "step": 37536 + }, + { + "epoch": 0.9498949819065212, + "grad_norm": 6.802257061004639, + "learning_rate": 6.307177640970052e-08, + "loss": 0.1504, + "step": 37537 + }, + { + "epoch": 0.9499202874712149, + "grad_norm": 5.793110370635986, + "learning_rate": 6.300821774355703e-08, + "loss": 0.1819, + "step": 37538 + }, + { + "epoch": 0.9499455930359086, + "grad_norm": 4.241814136505127, + "learning_rate": 6.294469091515431e-08, + "loss": 0.1127, + "step": 37539 + }, + { + "epoch": 0.9499708986006022, + "grad_norm": 13.235848426818848, + "learning_rate": 6.288119592490372e-08, + "loss": 0.1606, + "step": 37540 + }, + { + "epoch": 0.949996204165296, + "grad_norm": 5.101268768310547, + "learning_rate": 6.281773277321324e-08, + "loss": 0.143, + "step": 37541 + }, + { + "epoch": 0.9500215097299897, + "grad_norm": 3.774815559387207, + "learning_rate": 6.27543014604931e-08, + "loss": 0.1099, + "step": 37542 + }, + { + "epoch": 0.9500468152946833, + "grad_norm": 10.690559387207031, + "learning_rate": 6.269090198715189e-08, + "loss": 0.1483, + "step": 37543 + }, + { + "epoch": 0.950072120859377, + "grad_norm": 6.733078479766846, + "learning_rate": 6.262753435359925e-08, + "loss": 0.1719, + "step": 37544 + }, + { + "epoch": 0.9500974264240707, + "grad_norm": 5.732152938842773, + "learning_rate": 6.256419856024209e-08, + "loss": 0.1564, + "step": 37545 + }, + { + "epoch": 0.9501227319887643, + "grad_norm": 8.911798477172852, + "learning_rate": 6.250089460749121e-08, + "loss": 0.224, + "step": 37546 + }, + { + "epoch": 0.950148037553458, + "grad_norm": 3.0320217609405518, + "learning_rate": 6.243762249575291e-08, + "loss": 0.0709, + "step": 37547 + }, + { + "epoch": 0.9501733431181517, + "grad_norm": 3.9402554035186768, + "learning_rate": 6.237438222543635e-08, + "loss": 0.1511, + "step": 37548 + }, + { + "epoch": 0.9501986486828453, + "grad_norm": 3.78725528717041, + "learning_rate": 6.231117379694839e-08, + "loss": 0.1017, + "step": 37549 + }, + { + "epoch": 0.950223954247539, + "grad_norm": 8.855463981628418, + "learning_rate": 6.224799721069708e-08, + "loss": 0.2627, + "step": 37550 + }, + { + "epoch": 0.9502492598122327, + "grad_norm": 2.197981119155884, + "learning_rate": 6.218485246709094e-08, + "loss": 0.0745, + "step": 37551 + }, + { + "epoch": 0.9502745653769263, + "grad_norm": 8.791529655456543, + "learning_rate": 6.212173956653522e-08, + "loss": 0.1956, + "step": 37552 + }, + { + "epoch": 0.95029987094162, + "grad_norm": 5.730572700500488, + "learning_rate": 6.205865850943848e-08, + "loss": 0.18, + "step": 37553 + }, + { + "epoch": 0.9503251765063138, + "grad_norm": 4.7606940269470215, + "learning_rate": 6.199560929620651e-08, + "loss": 0.1222, + "step": 37554 + }, + { + "epoch": 0.9503504820710074, + "grad_norm": 4.6201887130737305, + "learning_rate": 6.193259192724621e-08, + "loss": 0.1707, + "step": 37555 + }, + { + "epoch": 0.9503757876357011, + "grad_norm": 4.381333351135254, + "learning_rate": 6.186960640296446e-08, + "loss": 0.1111, + "step": 37556 + }, + { + "epoch": 0.9504010932003948, + "grad_norm": 7.3897385597229, + "learning_rate": 6.180665272376708e-08, + "loss": 0.2046, + "step": 37557 + }, + { + "epoch": 0.9504263987650885, + "grad_norm": 3.0917632579803467, + "learning_rate": 6.17437308900598e-08, + "loss": 0.1366, + "step": 37558 + }, + { + "epoch": 0.9504517043297821, + "grad_norm": 5.16024923324585, + "learning_rate": 6.168084090224902e-08, + "loss": 0.1199, + "step": 37559 + }, + { + "epoch": 0.9504770098944758, + "grad_norm": 4.288939952850342, + "learning_rate": 6.161798276073938e-08, + "loss": 0.1552, + "step": 37560 + }, + { + "epoch": 0.9505023154591695, + "grad_norm": 4.376124382019043, + "learning_rate": 6.155515646593724e-08, + "loss": 0.1291, + "step": 37561 + }, + { + "epoch": 0.9505276210238631, + "grad_norm": 6.677783012390137, + "learning_rate": 6.149236201824671e-08, + "loss": 0.1506, + "step": 37562 + }, + { + "epoch": 0.9505529265885568, + "grad_norm": 3.3583250045776367, + "learning_rate": 6.142959941807359e-08, + "loss": 0.1228, + "step": 37563 + }, + { + "epoch": 0.9505782321532505, + "grad_norm": 5.797924518585205, + "learning_rate": 6.136686866582254e-08, + "loss": 0.1044, + "step": 37564 + }, + { + "epoch": 0.9506035377179441, + "grad_norm": 4.579006195068359, + "learning_rate": 6.130416976189767e-08, + "loss": 0.1453, + "step": 37565 + }, + { + "epoch": 0.9506288432826379, + "grad_norm": 6.161749362945557, + "learning_rate": 6.124150270670315e-08, + "loss": 0.2202, + "step": 37566 + }, + { + "epoch": 0.9506541488473316, + "grad_norm": 7.610479354858398, + "learning_rate": 6.117886750064417e-08, + "loss": 0.1225, + "step": 37567 + }, + { + "epoch": 0.9506794544120252, + "grad_norm": 5.49050235748291, + "learning_rate": 6.111626414412319e-08, + "loss": 0.1245, + "step": 37568 + }, + { + "epoch": 0.9507047599767189, + "grad_norm": 3.7933387756347656, + "learning_rate": 6.105369263754601e-08, + "loss": 0.1665, + "step": 37569 + }, + { + "epoch": 0.9507300655414126, + "grad_norm": 3.1275992393493652, + "learning_rate": 6.099115298131341e-08, + "loss": 0.0891, + "step": 37570 + }, + { + "epoch": 0.9507553711061062, + "grad_norm": 6.759275913238525, + "learning_rate": 6.092864517583119e-08, + "loss": 0.2599, + "step": 37571 + }, + { + "epoch": 0.9507806766707999, + "grad_norm": 6.955217361450195, + "learning_rate": 6.086616922150068e-08, + "loss": 0.1965, + "step": 37572 + }, + { + "epoch": 0.9508059822354936, + "grad_norm": 5.275684833526611, + "learning_rate": 6.080372511872656e-08, + "loss": 0.1859, + "step": 37573 + }, + { + "epoch": 0.9508312878001872, + "grad_norm": 5.569386005401611, + "learning_rate": 6.074131286790907e-08, + "loss": 0.1531, + "step": 37574 + }, + { + "epoch": 0.9508565933648809, + "grad_norm": 3.2857534885406494, + "learning_rate": 6.067893246945289e-08, + "loss": 0.0856, + "step": 37575 + }, + { + "epoch": 0.9508818989295746, + "grad_norm": 9.126484870910645, + "learning_rate": 6.061658392375935e-08, + "loss": 0.1399, + "step": 37576 + }, + { + "epoch": 0.9509072044942682, + "grad_norm": 8.04617977142334, + "learning_rate": 6.055426723123036e-08, + "loss": 0.2014, + "step": 37577 + }, + { + "epoch": 0.950932510058962, + "grad_norm": 4.791679859161377, + "learning_rate": 6.049198239226895e-08, + "loss": 0.1315, + "step": 37578 + }, + { + "epoch": 0.9509578156236557, + "grad_norm": 5.774019241333008, + "learning_rate": 6.042972940727532e-08, + "loss": 0.1266, + "step": 37579 + }, + { + "epoch": 0.9509831211883493, + "grad_norm": 3.782076597213745, + "learning_rate": 6.036750827665139e-08, + "loss": 0.1084, + "step": 37580 + }, + { + "epoch": 0.951008426753043, + "grad_norm": 5.035853385925293, + "learning_rate": 6.03053190007985e-08, + "loss": 0.1664, + "step": 37581 + }, + { + "epoch": 0.9510337323177367, + "grad_norm": 4.072822570800781, + "learning_rate": 6.024316158011855e-08, + "loss": 0.1344, + "step": 37582 + }, + { + "epoch": 0.9510590378824303, + "grad_norm": 5.9387288093566895, + "learning_rate": 6.018103601501068e-08, + "loss": 0.1649, + "step": 37583 + }, + { + "epoch": 0.951084343447124, + "grad_norm": 6.158975601196289, + "learning_rate": 6.011894230587733e-08, + "loss": 0.1965, + "step": 37584 + }, + { + "epoch": 0.9511096490118177, + "grad_norm": 3.3973615169525146, + "learning_rate": 6.005688045311708e-08, + "loss": 0.1178, + "step": 37585 + }, + { + "epoch": 0.9511349545765114, + "grad_norm": 4.972121715545654, + "learning_rate": 5.999485045713182e-08, + "loss": 0.1634, + "step": 37586 + }, + { + "epoch": 0.951160260141205, + "grad_norm": 4.323721885681152, + "learning_rate": 5.993285231832013e-08, + "loss": 0.1648, + "step": 37587 + }, + { + "epoch": 0.9511855657058987, + "grad_norm": 4.795146465301514, + "learning_rate": 5.987088603708336e-08, + "loss": 0.12, + "step": 37588 + }, + { + "epoch": 0.9512108712705925, + "grad_norm": 5.834471702575684, + "learning_rate": 5.98089516138195e-08, + "loss": 0.1918, + "step": 37589 + }, + { + "epoch": 0.951236176835286, + "grad_norm": 5.105254650115967, + "learning_rate": 5.974704904892937e-08, + "loss": 0.1632, + "step": 37590 + }, + { + "epoch": 0.9512614823999798, + "grad_norm": 9.056013107299805, + "learning_rate": 5.968517834281151e-08, + "loss": 0.2453, + "step": 37591 + }, + { + "epoch": 0.9512867879646735, + "grad_norm": 9.089879035949707, + "learning_rate": 5.962333949586507e-08, + "loss": 0.2244, + "step": 37592 + }, + { + "epoch": 0.9513120935293671, + "grad_norm": 6.584412574768066, + "learning_rate": 5.9561532508488063e-08, + "loss": 0.1837, + "step": 37593 + }, + { + "epoch": 0.9513373990940608, + "grad_norm": 3.8375868797302246, + "learning_rate": 5.949975738108016e-08, + "loss": 0.1084, + "step": 37594 + }, + { + "epoch": 0.9513627046587545, + "grad_norm": 4.894440174102783, + "learning_rate": 5.9438014114039934e-08, + "loss": 0.1319, + "step": 37595 + }, + { + "epoch": 0.9513880102234481, + "grad_norm": 4.647106647491455, + "learning_rate": 5.9376302707764285e-08, + "loss": 0.1088, + "step": 37596 + }, + { + "epoch": 0.9514133157881418, + "grad_norm": 5.569558143615723, + "learning_rate": 5.9314623162651794e-08, + "loss": 0.1225, + "step": 37597 + }, + { + "epoch": 0.9514386213528355, + "grad_norm": 5.193242073059082, + "learning_rate": 5.925297547910047e-08, + "loss": 0.1233, + "step": 37598 + }, + { + "epoch": 0.9514639269175291, + "grad_norm": 4.949079513549805, + "learning_rate": 5.919135965750722e-08, + "loss": 0.1346, + "step": 37599 + }, + { + "epoch": 0.9514892324822228, + "grad_norm": 3.1818346977233887, + "learning_rate": 5.912977569827061e-08, + "loss": 0.1229, + "step": 37600 + }, + { + "epoch": 0.9515145380469165, + "grad_norm": 2.7872567176818848, + "learning_rate": 5.9068223601786434e-08, + "loss": 0.136, + "step": 37601 + }, + { + "epoch": 0.9515398436116101, + "grad_norm": 3.70788836479187, + "learning_rate": 5.9006703368452154e-08, + "loss": 0.0834, + "step": 37602 + }, + { + "epoch": 0.9515651491763039, + "grad_norm": 5.432796001434326, + "learning_rate": 5.894521499866468e-08, + "loss": 0.124, + "step": 37603 + }, + { + "epoch": 0.9515904547409976, + "grad_norm": 4.690231800079346, + "learning_rate": 5.888375849282091e-08, + "loss": 0.1027, + "step": 37604 + }, + { + "epoch": 0.9516157603056912, + "grad_norm": 11.373908996582031, + "learning_rate": 5.8822333851315525e-08, + "loss": 0.2115, + "step": 37605 + }, + { + "epoch": 0.9516410658703849, + "grad_norm": 10.015692710876465, + "learning_rate": 5.876094107454655e-08, + "loss": 0.1897, + "step": 37606 + }, + { + "epoch": 0.9516663714350786, + "grad_norm": 5.271923542022705, + "learning_rate": 5.869958016290922e-08, + "loss": 0.1597, + "step": 37607 + }, + { + "epoch": 0.9516916769997722, + "grad_norm": 5.520238399505615, + "learning_rate": 5.863825111679933e-08, + "loss": 0.1511, + "step": 37608 + }, + { + "epoch": 0.9517169825644659, + "grad_norm": 6.6901469230651855, + "learning_rate": 5.8576953936611556e-08, + "loss": 0.1888, + "step": 37609 + }, + { + "epoch": 0.9517422881291596, + "grad_norm": 6.297961235046387, + "learning_rate": 5.851568862274226e-08, + "loss": 0.1528, + "step": 37610 + }, + { + "epoch": 0.9517675936938533, + "grad_norm": 4.476709365844727, + "learning_rate": 5.845445517558557e-08, + "loss": 0.1139, + "step": 37611 + }, + { + "epoch": 0.9517928992585469, + "grad_norm": 2.7415053844451904, + "learning_rate": 5.839325359553727e-08, + "loss": 0.1053, + "step": 37612 + }, + { + "epoch": 0.9518182048232406, + "grad_norm": 3.245274305343628, + "learning_rate": 5.83320838829915e-08, + "loss": 0.1241, + "step": 37613 + }, + { + "epoch": 0.9518435103879344, + "grad_norm": 4.100139617919922, + "learning_rate": 5.827094603834349e-08, + "loss": 0.091, + "step": 37614 + }, + { + "epoch": 0.951868815952628, + "grad_norm": 4.419340133666992, + "learning_rate": 5.8209840061986267e-08, + "loss": 0.0687, + "step": 37615 + }, + { + "epoch": 0.9518941215173217, + "grad_norm": 4.636965274810791, + "learning_rate": 5.8148765954314515e-08, + "loss": 0.1483, + "step": 37616 + }, + { + "epoch": 0.9519194270820154, + "grad_norm": 5.273200035095215, + "learning_rate": 5.808772371572291e-08, + "loss": 0.2232, + "step": 37617 + }, + { + "epoch": 0.951944732646709, + "grad_norm": 4.629073143005371, + "learning_rate": 5.802671334660392e-08, + "loss": 0.1583, + "step": 37618 + }, + { + "epoch": 0.9519700382114027, + "grad_norm": 7.53897762298584, + "learning_rate": 5.7965734847351126e-08, + "loss": 0.26, + "step": 37619 + }, + { + "epoch": 0.9519953437760964, + "grad_norm": 3.212552785873413, + "learning_rate": 5.790478821835754e-08, + "loss": 0.0997, + "step": 37620 + }, + { + "epoch": 0.95202064934079, + "grad_norm": 2.7497811317443848, + "learning_rate": 5.784387346001785e-08, + "loss": 0.1335, + "step": 37621 + }, + { + "epoch": 0.9520459549054837, + "grad_norm": 2.6969985961914062, + "learning_rate": 5.7782990572722855e-08, + "loss": 0.1081, + "step": 37622 + }, + { + "epoch": 0.9520712604701774, + "grad_norm": 4.0864152908325195, + "learning_rate": 5.772213955686668e-08, + "loss": 0.123, + "step": 37623 + }, + { + "epoch": 0.952096566034871, + "grad_norm": 5.318413257598877, + "learning_rate": 5.766132041284012e-08, + "loss": 0.1362, + "step": 37624 + }, + { + "epoch": 0.9521218715995647, + "grad_norm": 7.362644195556641, + "learning_rate": 5.7600533141037306e-08, + "loss": 0.1621, + "step": 37625 + }, + { + "epoch": 0.9521471771642585, + "grad_norm": 4.0511884689331055, + "learning_rate": 5.753977774184905e-08, + "loss": 0.1679, + "step": 37626 + }, + { + "epoch": 0.952172482728952, + "grad_norm": 5.051211357116699, + "learning_rate": 5.747905421566724e-08, + "loss": 0.1318, + "step": 37627 + }, + { + "epoch": 0.9521977882936458, + "grad_norm": 7.684350967407227, + "learning_rate": 5.7418362562883246e-08, + "loss": 0.1864, + "step": 37628 + }, + { + "epoch": 0.9522230938583395, + "grad_norm": 15.832175254821777, + "learning_rate": 5.735770278388953e-08, + "loss": 0.2318, + "step": 37629 + }, + { + "epoch": 0.9522483994230331, + "grad_norm": 4.137441158294678, + "learning_rate": 5.729707487907632e-08, + "loss": 0.1953, + "step": 37630 + }, + { + "epoch": 0.9522737049877268, + "grad_norm": 7.060220718383789, + "learning_rate": 5.723647884883499e-08, + "loss": 0.1202, + "step": 37631 + }, + { + "epoch": 0.9522990105524205, + "grad_norm": 8.022089004516602, + "learning_rate": 5.7175914693555766e-08, + "loss": 0.2392, + "step": 37632 + }, + { + "epoch": 0.9523243161171141, + "grad_norm": 3.837509870529175, + "learning_rate": 5.711538241363002e-08, + "loss": 0.1522, + "step": 37633 + }, + { + "epoch": 0.9523496216818078, + "grad_norm": 8.590910911560059, + "learning_rate": 5.7054882009447977e-08, + "loss": 0.1986, + "step": 37634 + }, + { + "epoch": 0.9523749272465015, + "grad_norm": 4.67293643951416, + "learning_rate": 5.6994413481399334e-08, + "loss": 0.1507, + "step": 37635 + }, + { + "epoch": 0.9524002328111952, + "grad_norm": 3.3051233291625977, + "learning_rate": 5.6933976829873785e-08, + "loss": 0.1128, + "step": 37636 + }, + { + "epoch": 0.9524255383758888, + "grad_norm": 3.3879427909851074, + "learning_rate": 5.6873572055262116e-08, + "loss": 0.1301, + "step": 37637 + }, + { + "epoch": 0.9524508439405825, + "grad_norm": 13.922428131103516, + "learning_rate": 5.681319915795291e-08, + "loss": 0.2486, + "step": 37638 + }, + { + "epoch": 0.9524761495052763, + "grad_norm": 4.888288974761963, + "learning_rate": 5.6752858138335866e-08, + "loss": 0.1129, + "step": 37639 + }, + { + "epoch": 0.9525014550699699, + "grad_norm": 4.690986156463623, + "learning_rate": 5.6692548996800103e-08, + "loss": 0.1223, + "step": 37640 + }, + { + "epoch": 0.9525267606346636, + "grad_norm": 11.830554008483887, + "learning_rate": 5.663227173373476e-08, + "loss": 0.1712, + "step": 37641 + }, + { + "epoch": 0.9525520661993573, + "grad_norm": 3.1916520595550537, + "learning_rate": 5.657202634952841e-08, + "loss": 0.0781, + "step": 37642 + }, + { + "epoch": 0.9525773717640509, + "grad_norm": 6.590847015380859, + "learning_rate": 5.6511812844569084e-08, + "loss": 0.2325, + "step": 37643 + }, + { + "epoch": 0.9526026773287446, + "grad_norm": 6.515151023864746, + "learning_rate": 5.6451631219246464e-08, + "loss": 0.2011, + "step": 37644 + }, + { + "epoch": 0.9526279828934383, + "grad_norm": 6.675693988800049, + "learning_rate": 5.6391481473946906e-08, + "loss": 0.2129, + "step": 37645 + }, + { + "epoch": 0.9526532884581319, + "grad_norm": 3.3688058853149414, + "learning_rate": 5.6331363609059553e-08, + "loss": 0.1122, + "step": 37646 + }, + { + "epoch": 0.9526785940228256, + "grad_norm": 12.356402397155762, + "learning_rate": 5.627127762497131e-08, + "loss": 0.1661, + "step": 37647 + }, + { + "epoch": 0.9527038995875193, + "grad_norm": 4.159956932067871, + "learning_rate": 5.62112235220702e-08, + "loss": 0.1068, + "step": 37648 + }, + { + "epoch": 0.9527292051522129, + "grad_norm": 6.595759391784668, + "learning_rate": 5.615120130074314e-08, + "loss": 0.1869, + "step": 37649 + }, + { + "epoch": 0.9527545107169066, + "grad_norm": 6.331157207489014, + "learning_rate": 5.609121096137704e-08, + "loss": 0.1909, + "step": 37650 + }, + { + "epoch": 0.9527798162816004, + "grad_norm": 5.982957363128662, + "learning_rate": 5.603125250435937e-08, + "loss": 0.2075, + "step": 37651 + }, + { + "epoch": 0.952805121846294, + "grad_norm": 7.587192535400391, + "learning_rate": 5.597132593007648e-08, + "loss": 0.1919, + "step": 37652 + }, + { + "epoch": 0.9528304274109877, + "grad_norm": 7.51862907409668, + "learning_rate": 5.591143123891529e-08, + "loss": 0.2014, + "step": 37653 + }, + { + "epoch": 0.9528557329756814, + "grad_norm": 3.0597591400146484, + "learning_rate": 5.585156843126105e-08, + "loss": 0.1207, + "step": 37654 + }, + { + "epoch": 0.952881038540375, + "grad_norm": 2.855059862136841, + "learning_rate": 5.5791737507500664e-08, + "loss": 0.0779, + "step": 37655 + }, + { + "epoch": 0.9529063441050687, + "grad_norm": 5.1602654457092285, + "learning_rate": 5.5731938468019385e-08, + "loss": 0.2112, + "step": 37656 + }, + { + "epoch": 0.9529316496697624, + "grad_norm": 3.4889562129974365, + "learning_rate": 5.567217131320357e-08, + "loss": 0.1035, + "step": 37657 + }, + { + "epoch": 0.952956955234456, + "grad_norm": 2.8003952503204346, + "learning_rate": 5.561243604343791e-08, + "loss": 0.0833, + "step": 37658 + }, + { + "epoch": 0.9529822607991497, + "grad_norm": 4.671101093292236, + "learning_rate": 5.555273265910766e-08, + "loss": 0.1461, + "step": 37659 + }, + { + "epoch": 0.9530075663638434, + "grad_norm": 5.824029922485352, + "learning_rate": 5.549306116059861e-08, + "loss": 0.1787, + "step": 37660 + }, + { + "epoch": 0.9530328719285371, + "grad_norm": 3.672952890396118, + "learning_rate": 5.54334215482949e-08, + "loss": 0.1464, + "step": 37661 + }, + { + "epoch": 0.9530581774932307, + "grad_norm": 4.0144243240356445, + "learning_rate": 5.5373813822581224e-08, + "loss": 0.137, + "step": 37662 + }, + { + "epoch": 0.9530834830579245, + "grad_norm": 2.318687915802002, + "learning_rate": 5.5314237983842275e-08, + "loss": 0.079, + "step": 37663 + }, + { + "epoch": 0.9531087886226182, + "grad_norm": 3.3825936317443848, + "learning_rate": 5.525469403246164e-08, + "loss": 0.0549, + "step": 37664 + }, + { + "epoch": 0.9531340941873118, + "grad_norm": 4.172893524169922, + "learning_rate": 5.519518196882401e-08, + "loss": 0.0894, + "step": 37665 + }, + { + "epoch": 0.9531593997520055, + "grad_norm": 4.805696487426758, + "learning_rate": 5.513570179331296e-08, + "loss": 0.1898, + "step": 37666 + }, + { + "epoch": 0.9531847053166992, + "grad_norm": 6.259872913360596, + "learning_rate": 5.5076253506311517e-08, + "loss": 0.1496, + "step": 37667 + }, + { + "epoch": 0.9532100108813928, + "grad_norm": 3.4920475482940674, + "learning_rate": 5.501683710820383e-08, + "loss": 0.0942, + "step": 37668 + }, + { + "epoch": 0.9532353164460865, + "grad_norm": 3.061289072036743, + "learning_rate": 5.495745259937235e-08, + "loss": 0.1259, + "step": 37669 + }, + { + "epoch": 0.9532606220107802, + "grad_norm": 3.6197783946990967, + "learning_rate": 5.489809998020068e-08, + "loss": 0.1148, + "step": 37670 + }, + { + "epoch": 0.9532859275754738, + "grad_norm": 4.700381278991699, + "learning_rate": 5.4838779251071284e-08, + "loss": 0.1356, + "step": 37671 + }, + { + "epoch": 0.9533112331401675, + "grad_norm": 7.507326602935791, + "learning_rate": 5.477949041236663e-08, + "loss": 0.1255, + "step": 37672 + }, + { + "epoch": 0.9533365387048612, + "grad_norm": 6.214254379272461, + "learning_rate": 5.4720233464469194e-08, + "loss": 0.1463, + "step": 37673 + }, + { + "epoch": 0.9533618442695548, + "grad_norm": 4.841426849365234, + "learning_rate": 5.46610084077609e-08, + "loss": 0.1257, + "step": 37674 + }, + { + "epoch": 0.9533871498342485, + "grad_norm": 3.2204787731170654, + "learning_rate": 5.460181524262476e-08, + "loss": 0.1412, + "step": 37675 + }, + { + "epoch": 0.9534124553989423, + "grad_norm": 3.2626428604125977, + "learning_rate": 5.454265396944047e-08, + "loss": 0.117, + "step": 37676 + }, + { + "epoch": 0.9534377609636359, + "grad_norm": 3.4816672801971436, + "learning_rate": 5.448352458859107e-08, + "loss": 0.093, + "step": 37677 + }, + { + "epoch": 0.9534630665283296, + "grad_norm": 5.362685680389404, + "learning_rate": 5.4424427100457365e-08, + "loss": 0.1498, + "step": 37678 + }, + { + "epoch": 0.9534883720930233, + "grad_norm": 10.521429061889648, + "learning_rate": 5.436536150542127e-08, + "loss": 0.238, + "step": 37679 + }, + { + "epoch": 0.9535136776577169, + "grad_norm": 6.57725191116333, + "learning_rate": 5.430632780386247e-08, + "loss": 0.1601, + "step": 37680 + }, + { + "epoch": 0.9535389832224106, + "grad_norm": 4.20135498046875, + "learning_rate": 5.424732599616234e-08, + "loss": 0.124, + "step": 37681 + }, + { + "epoch": 0.9535642887871043, + "grad_norm": 22.817081451416016, + "learning_rate": 5.418835608270057e-08, + "loss": 0.1267, + "step": 37682 + }, + { + "epoch": 0.9535895943517979, + "grad_norm": 3.930332899093628, + "learning_rate": 5.412941806385907e-08, + "loss": 0.1717, + "step": 37683 + }, + { + "epoch": 0.9536148999164916, + "grad_norm": 5.828841209411621, + "learning_rate": 5.407051194001645e-08, + "loss": 0.1421, + "step": 37684 + }, + { + "epoch": 0.9536402054811853, + "grad_norm": 5.391782760620117, + "learning_rate": 5.4011637711552933e-08, + "loss": 0.1155, + "step": 37685 + }, + { + "epoch": 0.953665511045879, + "grad_norm": 3.7369437217712402, + "learning_rate": 5.3952795378848235e-08, + "loss": 0.1358, + "step": 37686 + }, + { + "epoch": 0.9536908166105726, + "grad_norm": 12.964919090270996, + "learning_rate": 5.38939849422826e-08, + "loss": 0.2615, + "step": 37687 + }, + { + "epoch": 0.9537161221752664, + "grad_norm": 3.983658790588379, + "learning_rate": 5.3835206402234054e-08, + "loss": 0.1296, + "step": 37688 + }, + { + "epoch": 0.9537414277399601, + "grad_norm": 4.48072624206543, + "learning_rate": 5.377645975908286e-08, + "loss": 0.1328, + "step": 37689 + }, + { + "epoch": 0.9537667333046537, + "grad_norm": 5.616884708404541, + "learning_rate": 5.371774501320648e-08, + "loss": 0.1758, + "step": 37690 + }, + { + "epoch": 0.9537920388693474, + "grad_norm": 5.439591884613037, + "learning_rate": 5.365906216498462e-08, + "loss": 0.223, + "step": 37691 + }, + { + "epoch": 0.9538173444340411, + "grad_norm": 6.744738578796387, + "learning_rate": 5.360041121479531e-08, + "loss": 0.2367, + "step": 37692 + }, + { + "epoch": 0.9538426499987347, + "grad_norm": 3.3299596309661865, + "learning_rate": 5.354179216301714e-08, + "loss": 0.0994, + "step": 37693 + }, + { + "epoch": 0.9538679555634284, + "grad_norm": 5.507163047790527, + "learning_rate": 5.3483205010027016e-08, + "loss": 0.2124, + "step": 37694 + }, + { + "epoch": 0.9538932611281221, + "grad_norm": 4.6512861251831055, + "learning_rate": 5.3424649756204096e-08, + "loss": 0.1761, + "step": 37695 + }, + { + "epoch": 0.9539185666928157, + "grad_norm": 6.97991418838501, + "learning_rate": 5.3366126401925845e-08, + "loss": 0.1644, + "step": 37696 + }, + { + "epoch": 0.9539438722575094, + "grad_norm": 4.507558822631836, + "learning_rate": 5.330763494756863e-08, + "loss": 0.1209, + "step": 37697 + }, + { + "epoch": 0.9539691778222031, + "grad_norm": 4.977748394012451, + "learning_rate": 5.3249175393510486e-08, + "loss": 0.1547, + "step": 37698 + }, + { + "epoch": 0.9539944833868967, + "grad_norm": 10.717647552490234, + "learning_rate": 5.319074774012889e-08, + "loss": 0.1539, + "step": 37699 + }, + { + "epoch": 0.9540197889515905, + "grad_norm": 5.5909504890441895, + "learning_rate": 5.313235198779909e-08, + "loss": 0.1134, + "step": 37700 + }, + { + "epoch": 0.9540450945162842, + "grad_norm": 4.211699962615967, + "learning_rate": 5.3073988136899125e-08, + "loss": 0.1407, + "step": 37701 + }, + { + "epoch": 0.9540704000809778, + "grad_norm": 3.537033796310425, + "learning_rate": 5.30156561878048e-08, + "loss": 0.0968, + "step": 37702 + }, + { + "epoch": 0.9540957056456715, + "grad_norm": 6.296747207641602, + "learning_rate": 5.2957356140891923e-08, + "loss": 0.2013, + "step": 37703 + }, + { + "epoch": 0.9541210112103652, + "grad_norm": 4.9123125076293945, + "learning_rate": 5.2899087996537426e-08, + "loss": 0.1621, + "step": 37704 + }, + { + "epoch": 0.9541463167750588, + "grad_norm": 4.516380786895752, + "learning_rate": 5.284085175511544e-08, + "loss": 0.1366, + "step": 37705 + }, + { + "epoch": 0.9541716223397525, + "grad_norm": 4.828528881072998, + "learning_rate": 5.278264741700401e-08, + "loss": 0.1626, + "step": 37706 + }, + { + "epoch": 0.9541969279044462, + "grad_norm": 7.7343573570251465, + "learning_rate": 5.2724474982575605e-08, + "loss": 0.0991, + "step": 37707 + }, + { + "epoch": 0.9542222334691398, + "grad_norm": 3.7161340713500977, + "learning_rate": 5.26663344522077e-08, + "loss": 0.0948, + "step": 37708 + }, + { + "epoch": 0.9542475390338335, + "grad_norm": 4.701322555541992, + "learning_rate": 5.2608225826273896e-08, + "loss": 0.1256, + "step": 37709 + }, + { + "epoch": 0.9542728445985272, + "grad_norm": 3.9796741008758545, + "learning_rate": 5.255014910515e-08, + "loss": 0.0995, + "step": 37710 + }, + { + "epoch": 0.9542981501632208, + "grad_norm": 10.380025863647461, + "learning_rate": 5.2492104289209035e-08, + "loss": 0.1249, + "step": 37711 + }, + { + "epoch": 0.9543234557279145, + "grad_norm": 9.908202171325684, + "learning_rate": 5.243409137882682e-08, + "loss": 0.2346, + "step": 37712 + }, + { + "epoch": 0.9543487612926083, + "grad_norm": 4.364556789398193, + "learning_rate": 5.23761103743764e-08, + "loss": 0.1058, + "step": 37713 + }, + { + "epoch": 0.954374066857302, + "grad_norm": 4.923343181610107, + "learning_rate": 5.231816127623246e-08, + "loss": 0.179, + "step": 37714 + }, + { + "epoch": 0.9543993724219956, + "grad_norm": 2.006519079208374, + "learning_rate": 5.2260244084768044e-08, + "loss": 0.1006, + "step": 37715 + }, + { + "epoch": 0.9544246779866893, + "grad_norm": 5.6842265129089355, + "learning_rate": 5.220235880035729e-08, + "loss": 0.1511, + "step": 37716 + }, + { + "epoch": 0.954449983551383, + "grad_norm": 5.215948104858398, + "learning_rate": 5.214450542337268e-08, + "loss": 0.1012, + "step": 37717 + }, + { + "epoch": 0.9544752891160766, + "grad_norm": 5.574271202087402, + "learning_rate": 5.208668395418837e-08, + "loss": 0.1369, + "step": 37718 + }, + { + "epoch": 0.9545005946807703, + "grad_norm": 18.27887725830078, + "learning_rate": 5.2028894393176265e-08, + "loss": 0.1819, + "step": 37719 + }, + { + "epoch": 0.954525900245464, + "grad_norm": 5.229467391967773, + "learning_rate": 5.197113674070997e-08, + "loss": 0.2208, + "step": 37720 + }, + { + "epoch": 0.9545512058101576, + "grad_norm": 4.997500896453857, + "learning_rate": 5.191341099716085e-08, + "loss": 0.1385, + "step": 37721 + }, + { + "epoch": 0.9545765113748513, + "grad_norm": 6.258721351623535, + "learning_rate": 5.185571716290194e-08, + "loss": 0.1461, + "step": 37722 + }, + { + "epoch": 0.954601816939545, + "grad_norm": 5.732175350189209, + "learning_rate": 5.179805523830461e-08, + "loss": 0.2665, + "step": 37723 + }, + { + "epoch": 0.9546271225042386, + "grad_norm": 13.956320762634277, + "learning_rate": 5.17404252237419e-08, + "loss": 0.216, + "step": 37724 + }, + { + "epoch": 0.9546524280689324, + "grad_norm": 7.455829620361328, + "learning_rate": 5.1682827119584053e-08, + "loss": 0.2604, + "step": 37725 + }, + { + "epoch": 0.9546777336336261, + "grad_norm": 5.973635196685791, + "learning_rate": 5.1625260926203566e-08, + "loss": 0.1646, + "step": 37726 + }, + { + "epoch": 0.9547030391983197, + "grad_norm": 5.072901725769043, + "learning_rate": 5.156772664397181e-08, + "loss": 0.107, + "step": 37727 + }, + { + "epoch": 0.9547283447630134, + "grad_norm": 2.2770791053771973, + "learning_rate": 5.151022427325847e-08, + "loss": 0.0445, + "step": 37728 + }, + { + "epoch": 0.9547536503277071, + "grad_norm": 11.379937171936035, + "learning_rate": 5.145275381443604e-08, + "loss": 0.1819, + "step": 37729 + }, + { + "epoch": 0.9547789558924007, + "grad_norm": 2.5739591121673584, + "learning_rate": 5.139531526787367e-08, + "loss": 0.0413, + "step": 37730 + }, + { + "epoch": 0.9548042614570944, + "grad_norm": 5.169358253479004, + "learning_rate": 5.1337908633942165e-08, + "loss": 0.192, + "step": 37731 + }, + { + "epoch": 0.9548295670217881, + "grad_norm": 4.535238742828369, + "learning_rate": 5.1280533913012355e-08, + "loss": 0.0784, + "step": 37732 + }, + { + "epoch": 0.9548548725864817, + "grad_norm": 15.575074195861816, + "learning_rate": 5.122319110545393e-08, + "loss": 0.2012, + "step": 37733 + }, + { + "epoch": 0.9548801781511754, + "grad_norm": 5.908292293548584, + "learning_rate": 5.11658802116366e-08, + "loss": 0.1245, + "step": 37734 + }, + { + "epoch": 0.9549054837158691, + "grad_norm": 3.8765015602111816, + "learning_rate": 5.110860123193007e-08, + "loss": 0.1112, + "step": 37735 + }, + { + "epoch": 0.9549307892805627, + "grad_norm": 4.784973621368408, + "learning_rate": 5.105135416670348e-08, + "loss": 0.137, + "step": 37736 + }, + { + "epoch": 0.9549560948452565, + "grad_norm": 7.390982627868652, + "learning_rate": 5.099413901632599e-08, + "loss": 0.0914, + "step": 37737 + }, + { + "epoch": 0.9549814004099502, + "grad_norm": 5.615260124206543, + "learning_rate": 5.09369557811662e-08, + "loss": 0.1917, + "step": 37738 + }, + { + "epoch": 0.9550067059746439, + "grad_norm": 6.541415691375732, + "learning_rate": 5.087980446159435e-08, + "loss": 0.1203, + "step": 37739 + }, + { + "epoch": 0.9550320115393375, + "grad_norm": 8.2045316696167, + "learning_rate": 5.082268505797738e-08, + "loss": 0.2676, + "step": 37740 + }, + { + "epoch": 0.9550573171040312, + "grad_norm": 2.3970744609832764, + "learning_rate": 5.0765597570685e-08, + "loss": 0.1191, + "step": 37741 + }, + { + "epoch": 0.9550826226687249, + "grad_norm": 4.095865726470947, + "learning_rate": 5.070854200008357e-08, + "loss": 0.13, + "step": 37742 + }, + { + "epoch": 0.9551079282334185, + "grad_norm": 21.29524803161621, + "learning_rate": 5.0651518346542806e-08, + "loss": 0.1446, + "step": 37743 + }, + { + "epoch": 0.9551332337981122, + "grad_norm": 22.179533004760742, + "learning_rate": 5.059452661042963e-08, + "loss": 0.1591, + "step": 37744 + }, + { + "epoch": 0.9551585393628059, + "grad_norm": 12.047842979431152, + "learning_rate": 5.053756679211208e-08, + "loss": 0.154, + "step": 37745 + }, + { + "epoch": 0.9551838449274995, + "grad_norm": 9.353398323059082, + "learning_rate": 5.0480638891956535e-08, + "loss": 0.1875, + "step": 37746 + }, + { + "epoch": 0.9552091504921932, + "grad_norm": 4.732033729553223, + "learning_rate": 5.042374291033103e-08, + "loss": 0.1968, + "step": 37747 + }, + { + "epoch": 0.955234456056887, + "grad_norm": 5.413846015930176, + "learning_rate": 5.036687884760194e-08, + "loss": 0.1023, + "step": 37748 + }, + { + "epoch": 0.9552597616215805, + "grad_norm": 6.349387168884277, + "learning_rate": 5.031004670413619e-08, + "loss": 0.1767, + "step": 37749 + }, + { + "epoch": 0.9552850671862743, + "grad_norm": 8.7855224609375, + "learning_rate": 5.0253246480300165e-08, + "loss": 0.1311, + "step": 37750 + }, + { + "epoch": 0.955310372750968, + "grad_norm": 3.8097357749938965, + "learning_rate": 5.019647817646078e-08, + "loss": 0.1421, + "step": 37751 + }, + { + "epoch": 0.9553356783156616, + "grad_norm": 6.620469570159912, + "learning_rate": 5.0139741792982754e-08, + "loss": 0.1886, + "step": 37752 + }, + { + "epoch": 0.9553609838803553, + "grad_norm": 6.122120380401611, + "learning_rate": 5.008303733023301e-08, + "loss": 0.2229, + "step": 37753 + }, + { + "epoch": 0.955386289445049, + "grad_norm": 4.908301830291748, + "learning_rate": 5.002636478857736e-08, + "loss": 0.1031, + "step": 37754 + }, + { + "epoch": 0.9554115950097426, + "grad_norm": 3.2256550788879395, + "learning_rate": 4.996972416838053e-08, + "loss": 0.1081, + "step": 37755 + }, + { + "epoch": 0.9554369005744363, + "grad_norm": 8.83407974243164, + "learning_rate": 4.9913115470008324e-08, + "loss": 0.2126, + "step": 37756 + }, + { + "epoch": 0.95546220613913, + "grad_norm": 2.785051107406616, + "learning_rate": 4.985653869382545e-08, + "loss": 0.124, + "step": 37757 + }, + { + "epoch": 0.9554875117038236, + "grad_norm": 4.092012405395508, + "learning_rate": 4.979999384019718e-08, + "loss": 0.0789, + "step": 37758 + }, + { + "epoch": 0.9555128172685173, + "grad_norm": 5.019845962524414, + "learning_rate": 4.974348090948822e-08, + "loss": 0.1813, + "step": 37759 + }, + { + "epoch": 0.955538122833211, + "grad_norm": 7.061376094818115, + "learning_rate": 4.9686999902062716e-08, + "loss": 0.164, + "step": 37760 + }, + { + "epoch": 0.9555634283979046, + "grad_norm": 4.344366550445557, + "learning_rate": 4.963055081828427e-08, + "loss": 0.1667, + "step": 37761 + }, + { + "epoch": 0.9555887339625984, + "grad_norm": 2.1660609245300293, + "learning_rate": 4.957413365851815e-08, + "loss": 0.0999, + "step": 37762 + }, + { + "epoch": 0.9556140395272921, + "grad_norm": 4.598745346069336, + "learning_rate": 4.95177484231274e-08, + "loss": 0.1403, + "step": 37763 + }, + { + "epoch": 0.9556393450919858, + "grad_norm": 3.6541855335235596, + "learning_rate": 4.9461395112475605e-08, + "loss": 0.1439, + "step": 37764 + }, + { + "epoch": 0.9556646506566794, + "grad_norm": 2.8162946701049805, + "learning_rate": 4.940507372692638e-08, + "loss": 0.1394, + "step": 37765 + }, + { + "epoch": 0.9556899562213731, + "grad_norm": 7.1189398765563965, + "learning_rate": 4.934878426684331e-08, + "loss": 0.1941, + "step": 37766 + }, + { + "epoch": 0.9557152617860668, + "grad_norm": 2.991032600402832, + "learning_rate": 4.929252673258889e-08, + "loss": 0.0704, + "step": 37767 + }, + { + "epoch": 0.9557405673507604, + "grad_norm": 7.657085418701172, + "learning_rate": 4.9236301124526157e-08, + "loss": 0.1986, + "step": 37768 + }, + { + "epoch": 0.9557658729154541, + "grad_norm": 4.730685710906982, + "learning_rate": 4.91801074430176e-08, + "loss": 0.1609, + "step": 37769 + }, + { + "epoch": 0.9557911784801478, + "grad_norm": 3.8317642211914062, + "learning_rate": 4.912394568842571e-08, + "loss": 0.0777, + "step": 37770 + }, + { + "epoch": 0.9558164840448414, + "grad_norm": 3.500340223312378, + "learning_rate": 4.906781586111187e-08, + "loss": 0.0984, + "step": 37771 + }, + { + "epoch": 0.9558417896095351, + "grad_norm": 4.821304798126221, + "learning_rate": 4.9011717961439664e-08, + "loss": 0.1509, + "step": 37772 + }, + { + "epoch": 0.9558670951742289, + "grad_norm": 3.790778160095215, + "learning_rate": 4.895565198976937e-08, + "loss": 0.1542, + "step": 37773 + }, + { + "epoch": 0.9558924007389225, + "grad_norm": 4.927961826324463, + "learning_rate": 4.889961794646347e-08, + "loss": 0.1337, + "step": 37774 + }, + { + "epoch": 0.9559177063036162, + "grad_norm": 8.685076713562012, + "learning_rate": 4.884361583188224e-08, + "loss": 0.226, + "step": 37775 + }, + { + "epoch": 0.9559430118683099, + "grad_norm": 4.170322895050049, + "learning_rate": 4.878764564638871e-08, + "loss": 0.139, + "step": 37776 + }, + { + "epoch": 0.9559683174330035, + "grad_norm": 5.38848352432251, + "learning_rate": 4.87317073903415e-08, + "loss": 0.1848, + "step": 37777 + }, + { + "epoch": 0.9559936229976972, + "grad_norm": 3.507333755493164, + "learning_rate": 4.867580106410308e-08, + "loss": 0.1378, + "step": 37778 + }, + { + "epoch": 0.9560189285623909, + "grad_norm": 7.23084020614624, + "learning_rate": 4.861992666803317e-08, + "loss": 0.0832, + "step": 37779 + }, + { + "epoch": 0.9560442341270845, + "grad_norm": 5.206038475036621, + "learning_rate": 4.856408420249259e-08, + "loss": 0.0805, + "step": 37780 + }, + { + "epoch": 0.9560695396917782, + "grad_norm": 2.5856637954711914, + "learning_rate": 4.8508273667841054e-08, + "loss": 0.068, + "step": 37781 + }, + { + "epoch": 0.9560948452564719, + "grad_norm": 4.649234771728516, + "learning_rate": 4.845249506443883e-08, + "loss": 0.1336, + "step": 37782 + }, + { + "epoch": 0.9561201508211655, + "grad_norm": 5.327627182006836, + "learning_rate": 4.839674839264508e-08, + "loss": 0.1489, + "step": 37783 + }, + { + "epoch": 0.9561454563858592, + "grad_norm": 4.825436115264893, + "learning_rate": 4.8341033652819506e-08, + "loss": 0.1098, + "step": 37784 + }, + { + "epoch": 0.956170761950553, + "grad_norm": 4.8297271728515625, + "learning_rate": 4.8285350845322375e-08, + "loss": 0.1857, + "step": 37785 + }, + { + "epoch": 0.9561960675152466, + "grad_norm": 3.396021604537964, + "learning_rate": 4.822969997051119e-08, + "loss": 0.1259, + "step": 37786 + }, + { + "epoch": 0.9562213730799403, + "grad_norm": 6.1986799240112305, + "learning_rate": 4.817408102874566e-08, + "loss": 0.1064, + "step": 37787 + }, + { + "epoch": 0.956246678644634, + "grad_norm": 4.778597354888916, + "learning_rate": 4.811849402038438e-08, + "loss": 0.1708, + "step": 37788 + }, + { + "epoch": 0.9562719842093277, + "grad_norm": 7.1548895835876465, + "learning_rate": 4.8062938945785956e-08, + "loss": 0.1729, + "step": 37789 + }, + { + "epoch": 0.9562972897740213, + "grad_norm": 3.521411180496216, + "learning_rate": 4.800741580530843e-08, + "loss": 0.1413, + "step": 37790 + }, + { + "epoch": 0.956322595338715, + "grad_norm": 8.14227294921875, + "learning_rate": 4.7951924599309864e-08, + "loss": 0.1727, + "step": 37791 + }, + { + "epoch": 0.9563479009034087, + "grad_norm": 4.637383937835693, + "learning_rate": 4.7896465328148287e-08, + "loss": 0.1568, + "step": 37792 + }, + { + "epoch": 0.9563732064681023, + "grad_norm": 3.537970781326294, + "learning_rate": 4.784103799218121e-08, + "loss": 0.1508, + "step": 37793 + }, + { + "epoch": 0.956398512032796, + "grad_norm": 6.7158379554748535, + "learning_rate": 4.77856425917661e-08, + "loss": 0.1837, + "step": 37794 + }, + { + "epoch": 0.9564238175974897, + "grad_norm": 18.456859588623047, + "learning_rate": 4.773027912726047e-08, + "loss": 0.2558, + "step": 37795 + }, + { + "epoch": 0.9564491231621833, + "grad_norm": 11.333843231201172, + "learning_rate": 4.7674947599020696e-08, + "loss": 0.1958, + "step": 37796 + }, + { + "epoch": 0.956474428726877, + "grad_norm": 5.924298286437988, + "learning_rate": 4.761964800740371e-08, + "loss": 0.1616, + "step": 37797 + }, + { + "epoch": 0.9564997342915708, + "grad_norm": 6.0311737060546875, + "learning_rate": 4.7564380352767005e-08, + "loss": 0.1947, + "step": 37798 + }, + { + "epoch": 0.9565250398562644, + "grad_norm": 3.5451242923736572, + "learning_rate": 4.750914463546641e-08, + "loss": 0.1368, + "step": 37799 + }, + { + "epoch": 0.9565503454209581, + "grad_norm": 8.762004852294922, + "learning_rate": 4.745394085585775e-08, + "loss": 0.1862, + "step": 37800 + }, + { + "epoch": 0.9565756509856518, + "grad_norm": 4.502600193023682, + "learning_rate": 4.739876901429741e-08, + "loss": 0.1239, + "step": 37801 + }, + { + "epoch": 0.9566009565503454, + "grad_norm": 3.5486114025115967, + "learning_rate": 4.734362911114121e-08, + "loss": 0.1021, + "step": 37802 + }, + { + "epoch": 0.9566262621150391, + "grad_norm": 15.305691719055176, + "learning_rate": 4.728852114674498e-08, + "loss": 0.1881, + "step": 37803 + }, + { + "epoch": 0.9566515676797328, + "grad_norm": 7.4556403160095215, + "learning_rate": 4.723344512146344e-08, + "loss": 0.1385, + "step": 37804 + }, + { + "epoch": 0.9566768732444264, + "grad_norm": 5.4578375816345215, + "learning_rate": 4.7178401035651854e-08, + "loss": 0.2242, + "step": 37805 + }, + { + "epoch": 0.9567021788091201, + "grad_norm": 11.035853385925293, + "learning_rate": 4.712338888966606e-08, + "loss": 0.2723, + "step": 37806 + }, + { + "epoch": 0.9567274843738138, + "grad_norm": 6.304753303527832, + "learning_rate": 4.70684086838602e-08, + "loss": 0.1619, + "step": 37807 + }, + { + "epoch": 0.9567527899385074, + "grad_norm": 5.741850852966309, + "learning_rate": 4.7013460418588456e-08, + "loss": 0.2519, + "step": 37808 + }, + { + "epoch": 0.9567780955032011, + "grad_norm": 4.056737422943115, + "learning_rate": 4.6958544094205526e-08, + "loss": 0.1183, + "step": 37809 + }, + { + "epoch": 0.9568034010678949, + "grad_norm": 19.577123641967773, + "learning_rate": 4.690365971106614e-08, + "loss": 0.2431, + "step": 37810 + }, + { + "epoch": 0.9568287066325885, + "grad_norm": 13.923232078552246, + "learning_rate": 4.684880726952334e-08, + "loss": 0.2019, + "step": 37811 + }, + { + "epoch": 0.9568540121972822, + "grad_norm": 4.130209445953369, + "learning_rate": 4.67939867699313e-08, + "loss": 0.1217, + "step": 37812 + }, + { + "epoch": 0.9568793177619759, + "grad_norm": 4.545739650726318, + "learning_rate": 4.673919821264361e-08, + "loss": 0.1216, + "step": 37813 + }, + { + "epoch": 0.9569046233266696, + "grad_norm": 7.778502464294434, + "learning_rate": 4.668444159801333e-08, + "loss": 0.1736, + "step": 37814 + }, + { + "epoch": 0.9569299288913632, + "grad_norm": 8.041938781738281, + "learning_rate": 4.662971692639351e-08, + "loss": 0.2124, + "step": 37815 + }, + { + "epoch": 0.9569552344560569, + "grad_norm": 5.6954450607299805, + "learning_rate": 4.6575024198137755e-08, + "loss": 0.1723, + "step": 37816 + }, + { + "epoch": 0.9569805400207506, + "grad_norm": 8.544377326965332, + "learning_rate": 4.6520363413598005e-08, + "loss": 0.1287, + "step": 37817 + }, + { + "epoch": 0.9570058455854442, + "grad_norm": 8.181512832641602, + "learning_rate": 4.6465734573126754e-08, + "loss": 0.2345, + "step": 37818 + }, + { + "epoch": 0.9570311511501379, + "grad_norm": 4.894401550292969, + "learning_rate": 4.64111376770765e-08, + "loss": 0.1215, + "step": 37819 + }, + { + "epoch": 0.9570564567148316, + "grad_norm": 2.272038698196411, + "learning_rate": 4.635657272579974e-08, + "loss": 0.0909, + "step": 37820 + }, + { + "epoch": 0.9570817622795252, + "grad_norm": 7.414729118347168, + "learning_rate": 4.630203971964786e-08, + "loss": 0.1725, + "step": 37821 + }, + { + "epoch": 0.957107067844219, + "grad_norm": 4.819272994995117, + "learning_rate": 4.62475386589728e-08, + "loss": 0.1579, + "step": 37822 + }, + { + "epoch": 0.9571323734089127, + "grad_norm": 8.289379119873047, + "learning_rate": 4.619306954412539e-08, + "loss": 0.2072, + "step": 37823 + }, + { + "epoch": 0.9571576789736063, + "grad_norm": 5.357310771942139, + "learning_rate": 4.613863237545812e-08, + "loss": 0.1447, + "step": 37824 + }, + { + "epoch": 0.9571829845383, + "grad_norm": 7.513788223266602, + "learning_rate": 4.608422715332128e-08, + "loss": 0.188, + "step": 37825 + }, + { + "epoch": 0.9572082901029937, + "grad_norm": 11.416107177734375, + "learning_rate": 4.6029853878065136e-08, + "loss": 0.1579, + "step": 37826 + }, + { + "epoch": 0.9572335956676873, + "grad_norm": 6.008600234985352, + "learning_rate": 4.597551255004107e-08, + "loss": 0.2107, + "step": 37827 + }, + { + "epoch": 0.957258901232381, + "grad_norm": 5.8421711921691895, + "learning_rate": 4.592120316959992e-08, + "loss": 0.144, + "step": 37828 + }, + { + "epoch": 0.9572842067970747, + "grad_norm": 4.103481292724609, + "learning_rate": 4.5866925737091395e-08, + "loss": 0.1385, + "step": 37829 + }, + { + "epoch": 0.9573095123617683, + "grad_norm": 4.722285747528076, + "learning_rate": 4.581268025286523e-08, + "loss": 0.1696, + "step": 37830 + }, + { + "epoch": 0.957334817926462, + "grad_norm": 5.219435214996338, + "learning_rate": 4.5758466717271134e-08, + "loss": 0.173, + "step": 37831 + }, + { + "epoch": 0.9573601234911557, + "grad_norm": 7.2192158699035645, + "learning_rate": 4.570428513065939e-08, + "loss": 0.2156, + "step": 37832 + }, + { + "epoch": 0.9573854290558493, + "grad_norm": 6.246987819671631, + "learning_rate": 4.565013549337916e-08, + "loss": 0.1228, + "step": 37833 + }, + { + "epoch": 0.957410734620543, + "grad_norm": 5.266973495483398, + "learning_rate": 4.5596017805780155e-08, + "loss": 0.1584, + "step": 37834 + }, + { + "epoch": 0.9574360401852368, + "grad_norm": 3.404865026473999, + "learning_rate": 4.554193206821045e-08, + "loss": 0.0932, + "step": 37835 + }, + { + "epoch": 0.9574613457499304, + "grad_norm": 2.9410364627838135, + "learning_rate": 4.548787828101919e-08, + "loss": 0.0866, + "step": 37836 + }, + { + "epoch": 0.9574866513146241, + "grad_norm": 10.895499229431152, + "learning_rate": 4.5433856444555e-08, + "loss": 0.2857, + "step": 37837 + }, + { + "epoch": 0.9575119568793178, + "grad_norm": 4.721608638763428, + "learning_rate": 4.537986655916649e-08, + "loss": 0.1461, + "step": 37838 + }, + { + "epoch": 0.9575372624440114, + "grad_norm": 6.610406875610352, + "learning_rate": 4.532590862520114e-08, + "loss": 0.2399, + "step": 37839 + }, + { + "epoch": 0.9575625680087051, + "grad_norm": 10.99914836883545, + "learning_rate": 4.527198264300758e-08, + "loss": 0.2004, + "step": 37840 + }, + { + "epoch": 0.9575878735733988, + "grad_norm": 3.009373903274536, + "learning_rate": 4.5218088612933306e-08, + "loss": 0.1106, + "step": 37841 + }, + { + "epoch": 0.9576131791380925, + "grad_norm": 3.462949514389038, + "learning_rate": 4.516422653532637e-08, + "loss": 0.136, + "step": 37842 + }, + { + "epoch": 0.9576384847027861, + "grad_norm": 4.160505771636963, + "learning_rate": 4.5110396410533165e-08, + "loss": 0.1153, + "step": 37843 + }, + { + "epoch": 0.9576637902674798, + "grad_norm": 3.992892026901245, + "learning_rate": 4.5056598238901185e-08, + "loss": 0.1111, + "step": 37844 + }, + { + "epoch": 0.9576890958321735, + "grad_norm": 2.974494695663452, + "learning_rate": 4.500283202077793e-08, + "loss": 0.096, + "step": 37845 + }, + { + "epoch": 0.9577144013968671, + "grad_norm": 2.304980754852295, + "learning_rate": 4.494909775650924e-08, + "loss": 0.0853, + "step": 37846 + }, + { + "epoch": 0.9577397069615609, + "grad_norm": 8.131896018981934, + "learning_rate": 4.4895395446442614e-08, + "loss": 0.1644, + "step": 37847 + }, + { + "epoch": 0.9577650125262546, + "grad_norm": 6.259434223175049, + "learning_rate": 4.4841725090923885e-08, + "loss": 0.1965, + "step": 37848 + }, + { + "epoch": 0.9577903180909482, + "grad_norm": 9.92743968963623, + "learning_rate": 4.478808669029888e-08, + "loss": 0.3005, + "step": 37849 + }, + { + "epoch": 0.9578156236556419, + "grad_norm": 5.699038505554199, + "learning_rate": 4.4734480244913445e-08, + "loss": 0.1898, + "step": 37850 + }, + { + "epoch": 0.9578409292203356, + "grad_norm": 4.483243465423584, + "learning_rate": 4.468090575511452e-08, + "loss": 0.1317, + "step": 37851 + }, + { + "epoch": 0.9578662347850292, + "grad_norm": 3.1317641735076904, + "learning_rate": 4.462736322124572e-08, + "loss": 0.112, + "step": 37852 + }, + { + "epoch": 0.9578915403497229, + "grad_norm": 3.129668712615967, + "learning_rate": 4.457385264365399e-08, + "loss": 0.0888, + "step": 37853 + }, + { + "epoch": 0.9579168459144166, + "grad_norm": 5.026355743408203, + "learning_rate": 4.4520374022682944e-08, + "loss": 0.1123, + "step": 37854 + }, + { + "epoch": 0.9579421514791102, + "grad_norm": 2.658993721008301, + "learning_rate": 4.446692735867897e-08, + "loss": 0.1061, + "step": 37855 + }, + { + "epoch": 0.9579674570438039, + "grad_norm": 16.019081115722656, + "learning_rate": 4.441351265198568e-08, + "loss": 0.2511, + "step": 37856 + }, + { + "epoch": 0.9579927626084976, + "grad_norm": 3.840296506881714, + "learning_rate": 4.43601299029478e-08, + "loss": 0.1119, + "step": 37857 + }, + { + "epoch": 0.9580180681731912, + "grad_norm": 9.59971809387207, + "learning_rate": 4.43067791119095e-08, + "loss": 0.1905, + "step": 37858 + }, + { + "epoch": 0.958043373737885, + "grad_norm": 3.3953044414520264, + "learning_rate": 4.425346027921551e-08, + "loss": 0.1731, + "step": 37859 + }, + { + "epoch": 0.9580686793025787, + "grad_norm": 4.226443290710449, + "learning_rate": 4.420017340520888e-08, + "loss": 0.1069, + "step": 37860 + }, + { + "epoch": 0.9580939848672723, + "grad_norm": 4.404171943664551, + "learning_rate": 4.414691849023323e-08, + "loss": 0.1611, + "step": 37861 + }, + { + "epoch": 0.958119290431966, + "grad_norm": 7.131566047668457, + "learning_rate": 4.4093695534632164e-08, + "loss": 0.1977, + "step": 37862 + }, + { + "epoch": 0.9581445959966597, + "grad_norm": 6.284949779510498, + "learning_rate": 4.404050453874931e-08, + "loss": 0.1849, + "step": 37863 + }, + { + "epoch": 0.9581699015613533, + "grad_norm": 4.5657958984375, + "learning_rate": 4.398734550292716e-08, + "loss": 0.0902, + "step": 37864 + }, + { + "epoch": 0.958195207126047, + "grad_norm": 5.540675163269043, + "learning_rate": 4.3934218427508776e-08, + "loss": 0.1357, + "step": 37865 + }, + { + "epoch": 0.9582205126907407, + "grad_norm": 14.36917495727539, + "learning_rate": 4.3881123312836674e-08, + "loss": 0.2495, + "step": 37866 + }, + { + "epoch": 0.9582458182554344, + "grad_norm": 4.391335487365723, + "learning_rate": 4.382806015925334e-08, + "loss": 0.093, + "step": 37867 + }, + { + "epoch": 0.958271123820128, + "grad_norm": 5.27211332321167, + "learning_rate": 4.377502896710073e-08, + "loss": 0.1543, + "step": 37868 + }, + { + "epoch": 0.9582964293848217, + "grad_norm": 2.545299530029297, + "learning_rate": 4.372202973672135e-08, + "loss": 0.1094, + "step": 37869 + }, + { + "epoch": 0.9583217349495154, + "grad_norm": 6.618412971496582, + "learning_rate": 4.366906246845659e-08, + "loss": 0.1064, + "step": 37870 + }, + { + "epoch": 0.958347040514209, + "grad_norm": 7.51400899887085, + "learning_rate": 4.3616127162647845e-08, + "loss": 0.1415, + "step": 37871 + }, + { + "epoch": 0.9583723460789028, + "grad_norm": 3.9604482650756836, + "learning_rate": 4.356322381963707e-08, + "loss": 0.1687, + "step": 37872 + }, + { + "epoch": 0.9583976516435965, + "grad_norm": 5.524856090545654, + "learning_rate": 4.351035243976509e-08, + "loss": 0.1177, + "step": 37873 + }, + { + "epoch": 0.9584229572082901, + "grad_norm": 4.541158676147461, + "learning_rate": 4.3457513023372754e-08, + "loss": 0.1976, + "step": 37874 + }, + { + "epoch": 0.9584482627729838, + "grad_norm": 4.398323059082031, + "learning_rate": 4.34047055708009e-08, + "loss": 0.1396, + "step": 37875 + }, + { + "epoch": 0.9584735683376775, + "grad_norm": 15.573105812072754, + "learning_rate": 4.335193008239036e-08, + "loss": 0.1761, + "step": 37876 + }, + { + "epoch": 0.9584988739023711, + "grad_norm": 4.427933692932129, + "learning_rate": 4.3299186558480866e-08, + "loss": 0.1081, + "step": 37877 + }, + { + "epoch": 0.9585241794670648, + "grad_norm": 4.918315887451172, + "learning_rate": 4.3246474999413256e-08, + "loss": 0.1585, + "step": 37878 + }, + { + "epoch": 0.9585494850317585, + "grad_norm": 8.776304244995117, + "learning_rate": 4.3193795405526705e-08, + "loss": 0.1221, + "step": 37879 + }, + { + "epoch": 0.9585747905964521, + "grad_norm": 3.306777000427246, + "learning_rate": 4.31411477771615e-08, + "loss": 0.1311, + "step": 37880 + }, + { + "epoch": 0.9586000961611458, + "grad_norm": 11.526695251464844, + "learning_rate": 4.30885321146568e-08, + "loss": 0.1549, + "step": 37881 + }, + { + "epoch": 0.9586254017258395, + "grad_norm": 4.302063465118408, + "learning_rate": 4.3035948418352344e-08, + "loss": 0.1566, + "step": 37882 + }, + { + "epoch": 0.9586507072905331, + "grad_norm": 3.969477891921997, + "learning_rate": 4.2983396688586755e-08, + "loss": 0.2299, + "step": 37883 + }, + { + "epoch": 0.9586760128552269, + "grad_norm": 3.7851600646972656, + "learning_rate": 4.293087692569975e-08, + "loss": 0.1366, + "step": 37884 + }, + { + "epoch": 0.9587013184199206, + "grad_norm": 4.503646373748779, + "learning_rate": 4.287838913002884e-08, + "loss": 0.1521, + "step": 37885 + }, + { + "epoch": 0.9587266239846142, + "grad_norm": 2.782564878463745, + "learning_rate": 4.2825933301913205e-08, + "loss": 0.1529, + "step": 37886 + }, + { + "epoch": 0.9587519295493079, + "grad_norm": 5.912932872772217, + "learning_rate": 4.27735094416909e-08, + "loss": 0.0783, + "step": 37887 + }, + { + "epoch": 0.9587772351140016, + "grad_norm": 9.053007125854492, + "learning_rate": 4.2721117549700544e-08, + "loss": 0.1462, + "step": 37888 + }, + { + "epoch": 0.9588025406786952, + "grad_norm": 6.006529331207275, + "learning_rate": 4.266875762627909e-08, + "loss": 0.1201, + "step": 37889 + }, + { + "epoch": 0.9588278462433889, + "grad_norm": 6.034771919250488, + "learning_rate": 4.261642967176516e-08, + "loss": 0.2118, + "step": 37890 + }, + { + "epoch": 0.9588531518080826, + "grad_norm": 2.9813904762268066, + "learning_rate": 4.25641336864957e-08, + "loss": 0.1035, + "step": 37891 + }, + { + "epoch": 0.9588784573727763, + "grad_norm": 3.814634323120117, + "learning_rate": 4.251186967080767e-08, + "loss": 0.143, + "step": 37892 + }, + { + "epoch": 0.9589037629374699, + "grad_norm": 12.2254638671875, + "learning_rate": 4.245963762503857e-08, + "loss": 0.1552, + "step": 37893 + }, + { + "epoch": 0.9589290685021636, + "grad_norm": 2.9863078594207764, + "learning_rate": 4.240743754952481e-08, + "loss": 0.1155, + "step": 37894 + }, + { + "epoch": 0.9589543740668574, + "grad_norm": 3.6056606769561768, + "learning_rate": 4.235526944460389e-08, + "loss": 0.1653, + "step": 37895 + }, + { + "epoch": 0.958979679631551, + "grad_norm": 19.291975021362305, + "learning_rate": 4.230313331061109e-08, + "loss": 0.2396, + "step": 37896 + }, + { + "epoch": 0.9590049851962447, + "grad_norm": 6.467098236083984, + "learning_rate": 4.225102914788337e-08, + "loss": 0.2142, + "step": 37897 + }, + { + "epoch": 0.9590302907609384, + "grad_norm": 3.7937307357788086, + "learning_rate": 4.2198956956756575e-08, + "loss": 0.132, + "step": 37898 + }, + { + "epoch": 0.959055596325632, + "grad_norm": 3.3030645847320557, + "learning_rate": 4.2146916737566545e-08, + "loss": 0.0825, + "step": 37899 + }, + { + "epoch": 0.9590809018903257, + "grad_norm": 3.361814022064209, + "learning_rate": 4.209490849064912e-08, + "loss": 0.1353, + "step": 37900 + }, + { + "epoch": 0.9591062074550194, + "grad_norm": 4.751034736633301, + "learning_rate": 4.204293221633904e-08, + "loss": 0.1526, + "step": 37901 + }, + { + "epoch": 0.959131513019713, + "grad_norm": 2.8269670009613037, + "learning_rate": 4.199098791497158e-08, + "loss": 0.1384, + "step": 37902 + }, + { + "epoch": 0.9591568185844067, + "grad_norm": 3.873695135116577, + "learning_rate": 4.1939075586882596e-08, + "loss": 0.1145, + "step": 37903 + }, + { + "epoch": 0.9591821241491004, + "grad_norm": 3.3362538814544678, + "learning_rate": 4.18871952324057e-08, + "loss": 0.1123, + "step": 37904 + }, + { + "epoch": 0.959207429713794, + "grad_norm": 3.6554737091064453, + "learning_rate": 4.183534685187618e-08, + "loss": 0.0909, + "step": 37905 + }, + { + "epoch": 0.9592327352784877, + "grad_norm": 11.36488151550293, + "learning_rate": 4.178353044562822e-08, + "loss": 0.2055, + "step": 37906 + }, + { + "epoch": 0.9592580408431814, + "grad_norm": 3.9719741344451904, + "learning_rate": 4.1731746013995986e-08, + "loss": 0.144, + "step": 37907 + }, + { + "epoch": 0.959283346407875, + "grad_norm": 3.498708963394165, + "learning_rate": 4.167999355731312e-08, + "loss": 0.0591, + "step": 37908 + }, + { + "epoch": 0.9593086519725688, + "grad_norm": 3.7713561058044434, + "learning_rate": 4.162827307591433e-08, + "loss": 0.1253, + "step": 37909 + }, + { + "epoch": 0.9593339575372625, + "grad_norm": 4.032783508300781, + "learning_rate": 4.157658457013214e-08, + "loss": 0.1222, + "step": 37910 + }, + { + "epoch": 0.9593592631019561, + "grad_norm": 3.086468458175659, + "learning_rate": 4.152492804030017e-08, + "loss": 0.1435, + "step": 37911 + }, + { + "epoch": 0.9593845686666498, + "grad_norm": 7.24245548248291, + "learning_rate": 4.147330348675149e-08, + "loss": 0.2222, + "step": 37912 + }, + { + "epoch": 0.9594098742313435, + "grad_norm": 7.255446434020996, + "learning_rate": 4.1421710909819727e-08, + "loss": 0.1382, + "step": 37913 + }, + { + "epoch": 0.9594351797960371, + "grad_norm": 5.023545742034912, + "learning_rate": 4.137015030983627e-08, + "loss": 0.169, + "step": 37914 + }, + { + "epoch": 0.9594604853607308, + "grad_norm": 11.608827590942383, + "learning_rate": 4.1318621687134186e-08, + "loss": 0.1515, + "step": 37915 + }, + { + "epoch": 0.9594857909254245, + "grad_norm": 12.092655181884766, + "learning_rate": 4.1267125042046555e-08, + "loss": 0.2737, + "step": 37916 + }, + { + "epoch": 0.9595110964901182, + "grad_norm": 2.8837649822235107, + "learning_rate": 4.1215660374904764e-08, + "loss": 0.0701, + "step": 37917 + }, + { + "epoch": 0.9595364020548118, + "grad_norm": 2.8770837783813477, + "learning_rate": 4.116422768604023e-08, + "loss": 0.0872, + "step": 37918 + }, + { + "epoch": 0.9595617076195055, + "grad_norm": 11.145020484924316, + "learning_rate": 4.1112826975786e-08, + "loss": 0.1895, + "step": 37919 + }, + { + "epoch": 0.9595870131841993, + "grad_norm": 3.540165901184082, + "learning_rate": 4.106145824447183e-08, + "loss": 0.1006, + "step": 37920 + }, + { + "epoch": 0.9596123187488929, + "grad_norm": 21.21949577331543, + "learning_rate": 4.1010121492430224e-08, + "loss": 0.3099, + "step": 37921 + }, + { + "epoch": 0.9596376243135866, + "grad_norm": 3.989591121673584, + "learning_rate": 4.0958816719992025e-08, + "loss": 0.1206, + "step": 37922 + }, + { + "epoch": 0.9596629298782803, + "grad_norm": 10.286636352539062, + "learning_rate": 4.090754392748808e-08, + "loss": 0.1213, + "step": 37923 + }, + { + "epoch": 0.9596882354429739, + "grad_norm": 3.382831573486328, + "learning_rate": 4.0856303115248685e-08, + "loss": 0.1294, + "step": 37924 + }, + { + "epoch": 0.9597135410076676, + "grad_norm": 3.936772346496582, + "learning_rate": 4.080509428360469e-08, + "loss": 0.1389, + "step": 37925 + }, + { + "epoch": 0.9597388465723613, + "grad_norm": 10.273126602172852, + "learning_rate": 4.075391743288637e-08, + "loss": 0.1479, + "step": 37926 + }, + { + "epoch": 0.9597641521370549, + "grad_norm": 4.558464527130127, + "learning_rate": 4.070277256342292e-08, + "loss": 0.0978, + "step": 37927 + }, + { + "epoch": 0.9597894577017486, + "grad_norm": 2.2619729042053223, + "learning_rate": 4.0651659675545184e-08, + "loss": 0.0801, + "step": 37928 + }, + { + "epoch": 0.9598147632664423, + "grad_norm": 3.4291632175445557, + "learning_rate": 4.0600578769582344e-08, + "loss": 0.1021, + "step": 37929 + }, + { + "epoch": 0.9598400688311359, + "grad_norm": 8.204917907714844, + "learning_rate": 4.0549529845864136e-08, + "loss": 0.1476, + "step": 37930 + }, + { + "epoch": 0.9598653743958296, + "grad_norm": 4.459813594818115, + "learning_rate": 4.049851290471918e-08, + "loss": 0.1245, + "step": 37931 + }, + { + "epoch": 0.9598906799605234, + "grad_norm": 3.422492027282715, + "learning_rate": 4.0447527946476664e-08, + "loss": 0.121, + "step": 37932 + }, + { + "epoch": 0.959915985525217, + "grad_norm": 4.106589317321777, + "learning_rate": 4.0396574971465764e-08, + "loss": 0.073, + "step": 37933 + }, + { + "epoch": 0.9599412910899107, + "grad_norm": 8.426506996154785, + "learning_rate": 4.0345653980014554e-08, + "loss": 0.1492, + "step": 37934 + }, + { + "epoch": 0.9599665966546044, + "grad_norm": 5.559628009796143, + "learning_rate": 4.029476497245166e-08, + "loss": 0.1721, + "step": 37935 + }, + { + "epoch": 0.959991902219298, + "grad_norm": 6.80040168762207, + "learning_rate": 4.024390794910571e-08, + "loss": 0.1751, + "step": 37936 + }, + { + "epoch": 0.9600172077839917, + "grad_norm": 4.256956100463867, + "learning_rate": 4.019308291030366e-08, + "loss": 0.133, + "step": 37937 + }, + { + "epoch": 0.9600425133486854, + "grad_norm": 3.878153085708618, + "learning_rate": 4.014228985637358e-08, + "loss": 0.146, + "step": 37938 + }, + { + "epoch": 0.960067818913379, + "grad_norm": 4.981681823730469, + "learning_rate": 4.009152878764355e-08, + "loss": 0.1178, + "step": 37939 + }, + { + "epoch": 0.9600931244780727, + "grad_norm": 6.57368803024292, + "learning_rate": 4.004079970444108e-08, + "loss": 0.1707, + "step": 37940 + }, + { + "epoch": 0.9601184300427664, + "grad_norm": 9.016585350036621, + "learning_rate": 3.9990102607092576e-08, + "loss": 0.142, + "step": 37941 + }, + { + "epoch": 0.9601437356074601, + "grad_norm": 4.006642818450928, + "learning_rate": 3.993943749592555e-08, + "loss": 0.1084, + "step": 37942 + }, + { + "epoch": 0.9601690411721537, + "grad_norm": 9.059532165527344, + "learning_rate": 3.9888804371265854e-08, + "loss": 0.1714, + "step": 37943 + }, + { + "epoch": 0.9601943467368474, + "grad_norm": 4.955368995666504, + "learning_rate": 3.983820323344101e-08, + "loss": 0.1363, + "step": 37944 + }, + { + "epoch": 0.9602196523015412, + "grad_norm": 3.3323910236358643, + "learning_rate": 3.978763408277686e-08, + "loss": 0.0938, + "step": 37945 + }, + { + "epoch": 0.9602449578662348, + "grad_norm": 8.573164939880371, + "learning_rate": 3.9737096919599264e-08, + "loss": 0.2335, + "step": 37946 + }, + { + "epoch": 0.9602702634309285, + "grad_norm": 6.273386001586914, + "learning_rate": 3.9686591744234615e-08, + "loss": 0.2181, + "step": 37947 + }, + { + "epoch": 0.9602955689956222, + "grad_norm": 3.6584060192108154, + "learning_rate": 3.9636118557009326e-08, + "loss": 0.1246, + "step": 37948 + }, + { + "epoch": 0.9603208745603158, + "grad_norm": 7.97464656829834, + "learning_rate": 3.9585677358247034e-08, + "loss": 0.2214, + "step": 37949 + }, + { + "epoch": 0.9603461801250095, + "grad_norm": 10.367066383361816, + "learning_rate": 3.953526814827413e-08, + "loss": 0.195, + "step": 37950 + }, + { + "epoch": 0.9603714856897032, + "grad_norm": 4.073040008544922, + "learning_rate": 3.9484890927415365e-08, + "loss": 0.0911, + "step": 37951 + }, + { + "epoch": 0.9603967912543968, + "grad_norm": 3.319176197052002, + "learning_rate": 3.943454569599603e-08, + "loss": 0.1345, + "step": 37952 + }, + { + "epoch": 0.9604220968190905, + "grad_norm": 11.147921562194824, + "learning_rate": 3.938423245434087e-08, + "loss": 0.309, + "step": 37953 + }, + { + "epoch": 0.9604474023837842, + "grad_norm": 3.6581451892852783, + "learning_rate": 3.933395120277406e-08, + "loss": 0.0995, + "step": 37954 + }, + { + "epoch": 0.9604727079484778, + "grad_norm": 3.642221450805664, + "learning_rate": 3.9283701941619234e-08, + "loss": 0.1227, + "step": 37955 + }, + { + "epoch": 0.9604980135131715, + "grad_norm": 9.263492584228516, + "learning_rate": 3.923348467120169e-08, + "loss": 0.1147, + "step": 37956 + }, + { + "epoch": 0.9605233190778653, + "grad_norm": 4.820321083068848, + "learning_rate": 3.918329939184451e-08, + "loss": 0.0913, + "step": 37957 + }, + { + "epoch": 0.9605486246425589, + "grad_norm": 8.489015579223633, + "learning_rate": 3.913314610387131e-08, + "loss": 0.2235, + "step": 37958 + }, + { + "epoch": 0.9605739302072526, + "grad_norm": 4.089942932128906, + "learning_rate": 3.908302480760517e-08, + "loss": 0.1313, + "step": 37959 + }, + { + "epoch": 0.9605992357719463, + "grad_norm": 10.41193675994873, + "learning_rate": 3.903293550337084e-08, + "loss": 0.1795, + "step": 37960 + }, + { + "epoch": 0.9606245413366399, + "grad_norm": 4.605476379394531, + "learning_rate": 3.898287819148972e-08, + "loss": 0.174, + "step": 37961 + }, + { + "epoch": 0.9606498469013336, + "grad_norm": 7.5839996337890625, + "learning_rate": 3.893285287228543e-08, + "loss": 0.1052, + "step": 37962 + }, + { + "epoch": 0.9606751524660273, + "grad_norm": 4.882610321044922, + "learning_rate": 3.8882859546079954e-08, + "loss": 0.1469, + "step": 37963 + }, + { + "epoch": 0.9607004580307209, + "grad_norm": 5.81542444229126, + "learning_rate": 3.8832898213196355e-08, + "loss": 0.1839, + "step": 37964 + }, + { + "epoch": 0.9607257635954146, + "grad_norm": 3.7915282249450684, + "learning_rate": 3.878296887395605e-08, + "loss": 0.2067, + "step": 37965 + }, + { + "epoch": 0.9607510691601083, + "grad_norm": 7.146949768066406, + "learning_rate": 3.873307152868211e-08, + "loss": 0.1645, + "step": 37966 + }, + { + "epoch": 0.9607763747248019, + "grad_norm": 12.975476264953613, + "learning_rate": 3.868320617769594e-08, + "loss": 0.206, + "step": 37967 + }, + { + "epoch": 0.9608016802894956, + "grad_norm": 5.930310249328613, + "learning_rate": 3.8633372821317846e-08, + "loss": 0.1861, + "step": 37968 + }, + { + "epoch": 0.9608269858541894, + "grad_norm": 7.765095233917236, + "learning_rate": 3.858357145987146e-08, + "loss": 0.1203, + "step": 37969 + }, + { + "epoch": 0.9608522914188831, + "grad_norm": 2.739877700805664, + "learning_rate": 3.8533802093675964e-08, + "loss": 0.0638, + "step": 37970 + }, + { + "epoch": 0.9608775969835767, + "grad_norm": 4.51439094543457, + "learning_rate": 3.8484064723053327e-08, + "loss": 0.1886, + "step": 37971 + }, + { + "epoch": 0.9609029025482704, + "grad_norm": 4.274729251861572, + "learning_rate": 3.843435934832385e-08, + "loss": 0.1298, + "step": 37972 + }, + { + "epoch": 0.9609282081129641, + "grad_norm": 5.2433857917785645, + "learning_rate": 3.838468596980838e-08, + "loss": 0.1564, + "step": 37973 + }, + { + "epoch": 0.9609535136776577, + "grad_norm": 10.033235549926758, + "learning_rate": 3.833504458782722e-08, + "loss": 0.1293, + "step": 37974 + }, + { + "epoch": 0.9609788192423514, + "grad_norm": 5.90169095993042, + "learning_rate": 3.8285435202700673e-08, + "loss": 0.1482, + "step": 37975 + }, + { + "epoch": 0.9610041248070451, + "grad_norm": 4.962122917175293, + "learning_rate": 3.8235857814747926e-08, + "loss": 0.1313, + "step": 37976 + }, + { + "epoch": 0.9610294303717387, + "grad_norm": 11.455453872680664, + "learning_rate": 3.8186312424289274e-08, + "loss": 0.2483, + "step": 37977 + }, + { + "epoch": 0.9610547359364324, + "grad_norm": 5.27068567276001, + "learning_rate": 3.813679903164391e-08, + "loss": 0.1837, + "step": 37978 + }, + { + "epoch": 0.9610800415011261, + "grad_norm": 5.274913311004639, + "learning_rate": 3.8087317637132135e-08, + "loss": 0.1546, + "step": 37979 + }, + { + "epoch": 0.9611053470658197, + "grad_norm": 7.248492240905762, + "learning_rate": 3.803786824107148e-08, + "loss": 0.1255, + "step": 37980 + }, + { + "epoch": 0.9611306526305134, + "grad_norm": 1.9620161056518555, + "learning_rate": 3.798845084378222e-08, + "loss": 0.0551, + "step": 37981 + }, + { + "epoch": 0.9611559581952072, + "grad_norm": 5.170680522918701, + "learning_rate": 3.7939065445581904e-08, + "loss": 0.1759, + "step": 37982 + }, + { + "epoch": 0.9611812637599008, + "grad_norm": 4.339008331298828, + "learning_rate": 3.788971204678971e-08, + "loss": 0.13, + "step": 37983 + }, + { + "epoch": 0.9612065693245945, + "grad_norm": 5.739804744720459, + "learning_rate": 3.784039064772371e-08, + "loss": 0.1524, + "step": 37984 + }, + { + "epoch": 0.9612318748892882, + "grad_norm": 22.314350128173828, + "learning_rate": 3.779110124870256e-08, + "loss": 0.1715, + "step": 37985 + }, + { + "epoch": 0.9612571804539818, + "grad_norm": 4.094298839569092, + "learning_rate": 3.774184385004265e-08, + "loss": 0.1241, + "step": 37986 + }, + { + "epoch": 0.9612824860186755, + "grad_norm": 6.278510093688965, + "learning_rate": 3.769261845206318e-08, + "loss": 0.1969, + "step": 37987 + }, + { + "epoch": 0.9613077915833692, + "grad_norm": 6.470500469207764, + "learning_rate": 3.764342505508056e-08, + "loss": 0.1273, + "step": 37988 + }, + { + "epoch": 0.9613330971480628, + "grad_norm": 4.848280429840088, + "learning_rate": 3.759426365941288e-08, + "loss": 0.1472, + "step": 37989 + }, + { + "epoch": 0.9613584027127565, + "grad_norm": 5.551631927490234, + "learning_rate": 3.7545134265375984e-08, + "loss": 0.1649, + "step": 37990 + }, + { + "epoch": 0.9613837082774502, + "grad_norm": 5.152592182159424, + "learning_rate": 3.7496036873287956e-08, + "loss": 0.1755, + "step": 37991 + }, + { + "epoch": 0.9614090138421438, + "grad_norm": 3.5289838314056396, + "learning_rate": 3.744697148346521e-08, + "loss": 0.1292, + "step": 37992 + }, + { + "epoch": 0.9614343194068375, + "grad_norm": 5.073554039001465, + "learning_rate": 3.739793809622305e-08, + "loss": 0.2278, + "step": 37993 + }, + { + "epoch": 0.9614596249715313, + "grad_norm": 4.362144947052002, + "learning_rate": 3.7348936711879006e-08, + "loss": 0.1397, + "step": 37994 + }, + { + "epoch": 0.961484930536225, + "grad_norm": 6.098268032073975, + "learning_rate": 3.729996733074836e-08, + "loss": 0.1492, + "step": 37995 + }, + { + "epoch": 0.9615102361009186, + "grad_norm": 4.058957099914551, + "learning_rate": 3.7251029953147e-08, + "loss": 0.1394, + "step": 37996 + }, + { + "epoch": 0.9615355416656123, + "grad_norm": 4.130673885345459, + "learning_rate": 3.720212457939132e-08, + "loss": 0.1218, + "step": 37997 + }, + { + "epoch": 0.961560847230306, + "grad_norm": 6.126090049743652, + "learning_rate": 3.715325120979496e-08, + "loss": 0.1618, + "step": 37998 + }, + { + "epoch": 0.9615861527949996, + "grad_norm": 9.290297508239746, + "learning_rate": 3.7104409844674894e-08, + "loss": 0.2511, + "step": 37999 + }, + { + "epoch": 0.9616114583596933, + "grad_norm": 5.757409572601318, + "learning_rate": 3.705560048434476e-08, + "loss": 0.1263, + "step": 38000 + }, + { + "epoch": 0.961636763924387, + "grad_norm": 5.6105804443359375, + "learning_rate": 3.700682312912041e-08, + "loss": 0.1621, + "step": 38001 + }, + { + "epoch": 0.9616620694890806, + "grad_norm": 7.5160603523254395, + "learning_rate": 3.695807777931548e-08, + "loss": 0.1842, + "step": 38002 + }, + { + "epoch": 0.9616873750537743, + "grad_norm": 4.448421955108643, + "learning_rate": 3.690936443524473e-08, + "loss": 0.1523, + "step": 38003 + }, + { + "epoch": 0.961712680618468, + "grad_norm": 4.74837589263916, + "learning_rate": 3.686068309722235e-08, + "loss": 0.1545, + "step": 38004 + }, + { + "epoch": 0.9617379861831616, + "grad_norm": 5.250904560089111, + "learning_rate": 3.681203376556197e-08, + "loss": 0.1284, + "step": 38005 + }, + { + "epoch": 0.9617632917478554, + "grad_norm": 6.537950038909912, + "learning_rate": 3.676341644057835e-08, + "loss": 0.1938, + "step": 38006 + }, + { + "epoch": 0.9617885973125491, + "grad_norm": 3.3496458530426025, + "learning_rate": 3.671483112258345e-08, + "loss": 0.1604, + "step": 38007 + }, + { + "epoch": 0.9618139028772427, + "grad_norm": 7.659634590148926, + "learning_rate": 3.6666277811891474e-08, + "loss": 0.1776, + "step": 38008 + }, + { + "epoch": 0.9618392084419364, + "grad_norm": 8.372589111328125, + "learning_rate": 3.6617756508815496e-08, + "loss": 0.1648, + "step": 38009 + }, + { + "epoch": 0.9618645140066301, + "grad_norm": 2.6211493015289307, + "learning_rate": 3.6569267213668604e-08, + "loss": 0.1269, + "step": 38010 + }, + { + "epoch": 0.9618898195713237, + "grad_norm": 6.728390216827393, + "learning_rate": 3.652080992676277e-08, + "loss": 0.2387, + "step": 38011 + }, + { + "epoch": 0.9619151251360174, + "grad_norm": 8.24479866027832, + "learning_rate": 3.647238464841163e-08, + "loss": 0.2121, + "step": 38012 + }, + { + "epoch": 0.9619404307007111, + "grad_norm": 3.489682674407959, + "learning_rate": 3.642399137892605e-08, + "loss": 0.1088, + "step": 38013 + }, + { + "epoch": 0.9619657362654047, + "grad_norm": 3.521817207336426, + "learning_rate": 3.637563011861911e-08, + "loss": 0.1319, + "step": 38014 + }, + { + "epoch": 0.9619910418300984, + "grad_norm": 19.55336570739746, + "learning_rate": 3.6327300867802785e-08, + "loss": 0.1907, + "step": 38015 + }, + { + "epoch": 0.9620163473947921, + "grad_norm": 11.574739456176758, + "learning_rate": 3.627900362678793e-08, + "loss": 0.2109, + "step": 38016 + }, + { + "epoch": 0.9620416529594857, + "grad_norm": 4.391880035400391, + "learning_rate": 3.6230738395886535e-08, + "loss": 0.1819, + "step": 38017 + }, + { + "epoch": 0.9620669585241794, + "grad_norm": 8.638280868530273, + "learning_rate": 3.618250517541e-08, + "loss": 0.1135, + "step": 38018 + }, + { + "epoch": 0.9620922640888732, + "grad_norm": 3.2634313106536865, + "learning_rate": 3.6134303965669194e-08, + "loss": 0.102, + "step": 38019 + }, + { + "epoch": 0.9621175696535669, + "grad_norm": 3.492558717727661, + "learning_rate": 3.608613476697442e-08, + "loss": 0.1458, + "step": 38020 + }, + { + "epoch": 0.9621428752182605, + "grad_norm": 4.1470489501953125, + "learning_rate": 3.603799757963711e-08, + "loss": 0.1432, + "step": 38021 + }, + { + "epoch": 0.9621681807829542, + "grad_norm": 4.312503337860107, + "learning_rate": 3.5989892403967554e-08, + "loss": 0.1214, + "step": 38022 + }, + { + "epoch": 0.9621934863476479, + "grad_norm": 3.513444662094116, + "learning_rate": 3.594181924027551e-08, + "loss": 0.1144, + "step": 38023 + }, + { + "epoch": 0.9622187919123415, + "grad_norm": 3.606783628463745, + "learning_rate": 3.589377808887185e-08, + "loss": 0.1339, + "step": 38024 + }, + { + "epoch": 0.9622440974770352, + "grad_norm": 9.02889633178711, + "learning_rate": 3.5845768950065196e-08, + "loss": 0.2446, + "step": 38025 + }, + { + "epoch": 0.9622694030417289, + "grad_norm": 8.346650123596191, + "learning_rate": 3.5797791824165874e-08, + "loss": 0.115, + "step": 38026 + }, + { + "epoch": 0.9622947086064225, + "grad_norm": 3.9771058559417725, + "learning_rate": 3.574984671148307e-08, + "loss": 0.1274, + "step": 38027 + }, + { + "epoch": 0.9623200141711162, + "grad_norm": 3.2993855476379395, + "learning_rate": 3.570193361232654e-08, + "loss": 0.0827, + "step": 38028 + }, + { + "epoch": 0.96234531973581, + "grad_norm": 1.6248078346252441, + "learning_rate": 3.5654052527004934e-08, + "loss": 0.0748, + "step": 38029 + }, + { + "epoch": 0.9623706253005035, + "grad_norm": 3.7937514781951904, + "learning_rate": 3.560620345582633e-08, + "loss": 0.1239, + "step": 38030 + }, + { + "epoch": 0.9623959308651973, + "grad_norm": 5.8299150466918945, + "learning_rate": 3.555838639909992e-08, + "loss": 0.137, + "step": 38031 + }, + { + "epoch": 0.962421236429891, + "grad_norm": 4.1458420753479, + "learning_rate": 3.551060135713436e-08, + "loss": 0.169, + "step": 38032 + }, + { + "epoch": 0.9624465419945846, + "grad_norm": 7.954248905181885, + "learning_rate": 3.5462848330237165e-08, + "loss": 0.2041, + "step": 38033 + }, + { + "epoch": 0.9624718475592783, + "grad_norm": 3.3437490463256836, + "learning_rate": 3.5415127318716436e-08, + "loss": 0.1195, + "step": 38034 + }, + { + "epoch": 0.962497153123972, + "grad_norm": 4.5936760902404785, + "learning_rate": 3.536743832288025e-08, + "loss": 0.1416, + "step": 38035 + }, + { + "epoch": 0.9625224586886656, + "grad_norm": 4.368743419647217, + "learning_rate": 3.5319781343036155e-08, + "loss": 0.1523, + "step": 38036 + }, + { + "epoch": 0.9625477642533593, + "grad_norm": 4.762401580810547, + "learning_rate": 3.527215637949166e-08, + "loss": 0.1015, + "step": 38037 + }, + { + "epoch": 0.962573069818053, + "grad_norm": 6.080382823944092, + "learning_rate": 3.522456343255265e-08, + "loss": 0.1069, + "step": 38038 + }, + { + "epoch": 0.9625983753827466, + "grad_norm": 3.0831713676452637, + "learning_rate": 3.51770025025272e-08, + "loss": 0.1165, + "step": 38039 + }, + { + "epoch": 0.9626236809474403, + "grad_norm": 7.258908748626709, + "learning_rate": 3.512947358972174e-08, + "loss": 0.1849, + "step": 38040 + }, + { + "epoch": 0.962648986512134, + "grad_norm": 5.488529682159424, + "learning_rate": 3.5081976694443246e-08, + "loss": 0.1231, + "step": 38041 + }, + { + "epoch": 0.9626742920768276, + "grad_norm": 3.9146599769592285, + "learning_rate": 3.503451181699702e-08, + "loss": 0.1253, + "step": 38042 + }, + { + "epoch": 0.9626995976415214, + "grad_norm": 5.480843544006348, + "learning_rate": 3.4987078957690045e-08, + "loss": 0.1674, + "step": 38043 + }, + { + "epoch": 0.9627249032062151, + "grad_norm": 3.554816246032715, + "learning_rate": 3.493967811682708e-08, + "loss": 0.1819, + "step": 38044 + }, + { + "epoch": 0.9627502087709088, + "grad_norm": 5.114062786102295, + "learning_rate": 3.489230929471565e-08, + "loss": 0.0936, + "step": 38045 + }, + { + "epoch": 0.9627755143356024, + "grad_norm": 6.579773902893066, + "learning_rate": 3.484497249165941e-08, + "loss": 0.1716, + "step": 38046 + }, + { + "epoch": 0.9628008199002961, + "grad_norm": 7.460580825805664, + "learning_rate": 3.479766770796422e-08, + "loss": 0.1829, + "step": 38047 + }, + { + "epoch": 0.9628261254649898, + "grad_norm": 6.340742588043213, + "learning_rate": 3.475039494393595e-08, + "loss": 0.1766, + "step": 38048 + }, + { + "epoch": 0.9628514310296834, + "grad_norm": 4.554211139678955, + "learning_rate": 3.470315419987824e-08, + "loss": 0.1944, + "step": 38049 + }, + { + "epoch": 0.9628767365943771, + "grad_norm": 3.4173789024353027, + "learning_rate": 3.465594547609641e-08, + "loss": 0.1175, + "step": 38050 + }, + { + "epoch": 0.9629020421590708, + "grad_norm": 8.128271102905273, + "learning_rate": 3.460876877289465e-08, + "loss": 0.148, + "step": 38051 + }, + { + "epoch": 0.9629273477237644, + "grad_norm": 2.807518720626831, + "learning_rate": 3.456162409057773e-08, + "loss": 0.1218, + "step": 38052 + }, + { + "epoch": 0.9629526532884581, + "grad_norm": 4.43570613861084, + "learning_rate": 3.451451142944873e-08, + "loss": 0.1868, + "step": 38053 + }, + { + "epoch": 0.9629779588531518, + "grad_norm": 4.033998012542725, + "learning_rate": 3.446743078981185e-08, + "loss": 0.1431, + "step": 38054 + }, + { + "epoch": 0.9630032644178455, + "grad_norm": 6.949943542480469, + "learning_rate": 3.44203821719713e-08, + "loss": 0.1388, + "step": 38055 + }, + { + "epoch": 0.9630285699825392, + "grad_norm": 5.892088413238525, + "learning_rate": 3.437336557623017e-08, + "loss": 0.1394, + "step": 38056 + }, + { + "epoch": 0.9630538755472329, + "grad_norm": 6.004270076751709, + "learning_rate": 3.432638100289099e-08, + "loss": 0.1485, + "step": 38057 + }, + { + "epoch": 0.9630791811119265, + "grad_norm": 3.5112733840942383, + "learning_rate": 3.427942845225796e-08, + "loss": 0.1565, + "step": 38058 + }, + { + "epoch": 0.9631044866766202, + "grad_norm": 4.083098888397217, + "learning_rate": 3.423250792463306e-08, + "loss": 0.1071, + "step": 38059 + }, + { + "epoch": 0.9631297922413139, + "grad_norm": 2.8707773685455322, + "learning_rate": 3.418561942031884e-08, + "loss": 0.0712, + "step": 38060 + }, + { + "epoch": 0.9631550978060075, + "grad_norm": 5.949357986450195, + "learning_rate": 3.4138762939617264e-08, + "loss": 0.1099, + "step": 38061 + }, + { + "epoch": 0.9631804033707012, + "grad_norm": 2.8238728046417236, + "learning_rate": 3.4091938482831986e-08, + "loss": 0.1092, + "step": 38062 + }, + { + "epoch": 0.9632057089353949, + "grad_norm": 3.7699198722839355, + "learning_rate": 3.404514605026388e-08, + "loss": 0.2121, + "step": 38063 + }, + { + "epoch": 0.9632310145000885, + "grad_norm": 10.71368408203125, + "learning_rate": 3.399838564221436e-08, + "loss": 0.3099, + "step": 38064 + }, + { + "epoch": 0.9632563200647822, + "grad_norm": 6.159701824188232, + "learning_rate": 3.3951657258985973e-08, + "loss": 0.17, + "step": 38065 + }, + { + "epoch": 0.963281625629476, + "grad_norm": 7.572762489318848, + "learning_rate": 3.390496090087958e-08, + "loss": 0.1969, + "step": 38066 + }, + { + "epoch": 0.9633069311941695, + "grad_norm": 3.2346057891845703, + "learning_rate": 3.385829656819606e-08, + "loss": 0.1261, + "step": 38067 + }, + { + "epoch": 0.9633322367588633, + "grad_norm": 4.5720391273498535, + "learning_rate": 3.381166426123683e-08, + "loss": 0.1626, + "step": 38068 + }, + { + "epoch": 0.963357542323557, + "grad_norm": 7.363215923309326, + "learning_rate": 3.3765063980302215e-08, + "loss": 0.2226, + "step": 38069 + }, + { + "epoch": 0.9633828478882506, + "grad_norm": 6.417357444763184, + "learning_rate": 3.371849572569308e-08, + "loss": 0.1941, + "step": 38070 + }, + { + "epoch": 0.9634081534529443, + "grad_norm": 13.8464994430542, + "learning_rate": 3.3671959497709184e-08, + "loss": 0.0894, + "step": 38071 + }, + { + "epoch": 0.963433459017638, + "grad_norm": 3.935591459274292, + "learning_rate": 3.3625455296651956e-08, + "loss": 0.1552, + "step": 38072 + }, + { + "epoch": 0.9634587645823317, + "grad_norm": 3.4300954341888428, + "learning_rate": 3.357898312281948e-08, + "loss": 0.1345, + "step": 38073 + }, + { + "epoch": 0.9634840701470253, + "grad_norm": 5.69536828994751, + "learning_rate": 3.353254297651265e-08, + "loss": 0.1497, + "step": 38074 + }, + { + "epoch": 0.963509375711719, + "grad_norm": 3.002760887145996, + "learning_rate": 3.348613485803065e-08, + "loss": 0.0909, + "step": 38075 + }, + { + "epoch": 0.9635346812764127, + "grad_norm": 3.232973575592041, + "learning_rate": 3.343975876767269e-08, + "loss": 0.0982, + "step": 38076 + }, + { + "epoch": 0.9635599868411063, + "grad_norm": 4.754269599914551, + "learning_rate": 3.339341470573743e-08, + "loss": 0.1089, + "step": 38077 + }, + { + "epoch": 0.9635852924058, + "grad_norm": 4.020423889160156, + "learning_rate": 3.334710267252461e-08, + "loss": 0.131, + "step": 38078 + }, + { + "epoch": 0.9636105979704938, + "grad_norm": 3.6834206581115723, + "learning_rate": 3.33008226683329e-08, + "loss": 0.1006, + "step": 38079 + }, + { + "epoch": 0.9636359035351874, + "grad_norm": 9.008773803710938, + "learning_rate": 3.3254574693459275e-08, + "loss": 0.1408, + "step": 38080 + }, + { + "epoch": 0.9636612090998811, + "grad_norm": 2.4450888633728027, + "learning_rate": 3.320835874820405e-08, + "loss": 0.1252, + "step": 38081 + }, + { + "epoch": 0.9636865146645748, + "grad_norm": 2.910310745239258, + "learning_rate": 3.316217483286366e-08, + "loss": 0.1197, + "step": 38082 + }, + { + "epoch": 0.9637118202292684, + "grad_norm": 3.8205957412719727, + "learning_rate": 3.3116022947736745e-08, + "loss": 0.1309, + "step": 38083 + }, + { + "epoch": 0.9637371257939621, + "grad_norm": 4.634854316711426, + "learning_rate": 3.3069903093120304e-08, + "loss": 0.1643, + "step": 38084 + }, + { + "epoch": 0.9637624313586558, + "grad_norm": 4.887446403503418, + "learning_rate": 3.302381526931242e-08, + "loss": 0.1462, + "step": 38085 + }, + { + "epoch": 0.9637877369233494, + "grad_norm": 5.688129901885986, + "learning_rate": 3.2977759476610085e-08, + "loss": 0.2292, + "step": 38086 + }, + { + "epoch": 0.9638130424880431, + "grad_norm": 9.69658374786377, + "learning_rate": 3.293173571530972e-08, + "loss": 0.1695, + "step": 38087 + }, + { + "epoch": 0.9638383480527368, + "grad_norm": 4.304329872131348, + "learning_rate": 3.288574398570887e-08, + "loss": 0.1729, + "step": 38088 + }, + { + "epoch": 0.9638636536174304, + "grad_norm": 6.489288806915283, + "learning_rate": 3.2839784288104525e-08, + "loss": 0.2544, + "step": 38089 + }, + { + "epoch": 0.9638889591821241, + "grad_norm": 5.209325313568115, + "learning_rate": 3.2793856622791995e-08, + "loss": 0.1768, + "step": 38090 + }, + { + "epoch": 0.9639142647468179, + "grad_norm": 2.1690022945404053, + "learning_rate": 3.274796099006772e-08, + "loss": 0.0989, + "step": 38091 + }, + { + "epoch": 0.9639395703115115, + "grad_norm": 3.019458055496216, + "learning_rate": 3.270209739022812e-08, + "loss": 0.0756, + "step": 38092 + }, + { + "epoch": 0.9639648758762052, + "grad_norm": 12.258441925048828, + "learning_rate": 3.265626582356851e-08, + "loss": 0.1967, + "step": 38093 + }, + { + "epoch": 0.9639901814408989, + "grad_norm": 2.593045949935913, + "learning_rate": 3.261046629038478e-08, + "loss": 0.1028, + "step": 38094 + }, + { + "epoch": 0.9640154870055925, + "grad_norm": 8.094396591186523, + "learning_rate": 3.256469879097224e-08, + "loss": 0.2263, + "step": 38095 + }, + { + "epoch": 0.9640407925702862, + "grad_norm": 5.683403491973877, + "learning_rate": 3.251896332562565e-08, + "loss": 0.1132, + "step": 38096 + }, + { + "epoch": 0.9640660981349799, + "grad_norm": 4.528862953186035, + "learning_rate": 3.247325989464089e-08, + "loss": 0.1602, + "step": 38097 + }, + { + "epoch": 0.9640914036996736, + "grad_norm": 3.670785665512085, + "learning_rate": 3.2427588498311626e-08, + "loss": 0.1054, + "step": 38098 + }, + { + "epoch": 0.9641167092643672, + "grad_norm": 2.954810619354248, + "learning_rate": 3.23819491369326e-08, + "loss": 0.1306, + "step": 38099 + }, + { + "epoch": 0.9641420148290609, + "grad_norm": 3.951430559158325, + "learning_rate": 3.233634181079859e-08, + "loss": 0.1091, + "step": 38100 + }, + { + "epoch": 0.9641673203937546, + "grad_norm": 6.325503826141357, + "learning_rate": 3.2290766520203246e-08, + "loss": 0.1349, + "step": 38101 + }, + { + "epoch": 0.9641926259584482, + "grad_norm": 2.780485153198242, + "learning_rate": 3.224522326544077e-08, + "loss": 0.1046, + "step": 38102 + }, + { + "epoch": 0.964217931523142, + "grad_norm": 6.176972389221191, + "learning_rate": 3.2199712046804835e-08, + "loss": 0.1778, + "step": 38103 + }, + { + "epoch": 0.9642432370878357, + "grad_norm": 4.378610610961914, + "learning_rate": 3.2154232864589074e-08, + "loss": 0.1618, + "step": 38104 + }, + { + "epoch": 0.9642685426525293, + "grad_norm": 8.763283729553223, + "learning_rate": 3.210878571908605e-08, + "loss": 0.0975, + "step": 38105 + }, + { + "epoch": 0.964293848217223, + "grad_norm": 3.92842435836792, + "learning_rate": 3.206337061058995e-08, + "loss": 0.1968, + "step": 38106 + }, + { + "epoch": 0.9643191537819167, + "grad_norm": 4.979751110076904, + "learning_rate": 3.201798753939278e-08, + "loss": 0.159, + "step": 38107 + }, + { + "epoch": 0.9643444593466103, + "grad_norm": 8.728399276733398, + "learning_rate": 3.197263650578708e-08, + "loss": 0.2719, + "step": 38108 + }, + { + "epoch": 0.964369764911304, + "grad_norm": 7.745785713195801, + "learning_rate": 3.19273175100665e-08, + "loss": 0.2259, + "step": 38109 + }, + { + "epoch": 0.9643950704759977, + "grad_norm": 10.037446022033691, + "learning_rate": 3.188203055252248e-08, + "loss": 0.3116, + "step": 38110 + }, + { + "epoch": 0.9644203760406913, + "grad_norm": 3.6528074741363525, + "learning_rate": 3.183677563344645e-08, + "loss": 0.1241, + "step": 38111 + }, + { + "epoch": 0.964445681605385, + "grad_norm": 5.140495300292969, + "learning_rate": 3.179155275313095e-08, + "loss": 0.1066, + "step": 38112 + }, + { + "epoch": 0.9644709871700787, + "grad_norm": 4.34577751159668, + "learning_rate": 3.174636191186797e-08, + "loss": 0.2005, + "step": 38113 + }, + { + "epoch": 0.9644962927347723, + "grad_norm": 1.8710038661956787, + "learning_rate": 3.1701203109948955e-08, + "loss": 0.0715, + "step": 38114 + }, + { + "epoch": 0.964521598299466, + "grad_norm": 6.300193786621094, + "learning_rate": 3.1656076347663654e-08, + "loss": 0.1425, + "step": 38115 + }, + { + "epoch": 0.9645469038641598, + "grad_norm": 9.666702270507812, + "learning_rate": 3.161098162530518e-08, + "loss": 0.191, + "step": 38116 + }, + { + "epoch": 0.9645722094288534, + "grad_norm": 4.293084144592285, + "learning_rate": 3.156591894316274e-08, + "loss": 0.0946, + "step": 38117 + }, + { + "epoch": 0.9645975149935471, + "grad_norm": 7.621293544769287, + "learning_rate": 3.152088830152722e-08, + "loss": 0.1928, + "step": 38118 + }, + { + "epoch": 0.9646228205582408, + "grad_norm": 3.6408603191375732, + "learning_rate": 3.147588970068949e-08, + "loss": 0.1359, + "step": 38119 + }, + { + "epoch": 0.9646481261229344, + "grad_norm": 6.127279281616211, + "learning_rate": 3.1430923140939874e-08, + "loss": 0.194, + "step": 38120 + }, + { + "epoch": 0.9646734316876281, + "grad_norm": 6.108932018280029, + "learning_rate": 3.1385988622567585e-08, + "loss": 0.1073, + "step": 38121 + }, + { + "epoch": 0.9646987372523218, + "grad_norm": 4.203232288360596, + "learning_rate": 3.1341086145862955e-08, + "loss": 0.0707, + "step": 38122 + }, + { + "epoch": 0.9647240428170155, + "grad_norm": 3.9061951637268066, + "learning_rate": 3.129621571111574e-08, + "loss": 0.1496, + "step": 38123 + }, + { + "epoch": 0.9647493483817091, + "grad_norm": 6.447951316833496, + "learning_rate": 3.1251377318614604e-08, + "loss": 0.194, + "step": 38124 + }, + { + "epoch": 0.9647746539464028, + "grad_norm": 4.341701507568359, + "learning_rate": 3.120657096864932e-08, + "loss": 0.1042, + "step": 38125 + }, + { + "epoch": 0.9647999595110965, + "grad_norm": 3.2430219650268555, + "learning_rate": 3.116179666150854e-08, + "loss": 0.1478, + "step": 38126 + }, + { + "epoch": 0.9648252650757901, + "grad_norm": 4.85471773147583, + "learning_rate": 3.1117054397480915e-08, + "loss": 0.121, + "step": 38127 + }, + { + "epoch": 0.9648505706404839, + "grad_norm": 8.41932487487793, + "learning_rate": 3.107234417685512e-08, + "loss": 0.1435, + "step": 38128 + }, + { + "epoch": 0.9648758762051776, + "grad_norm": 3.9249589443206787, + "learning_rate": 3.1027665999919245e-08, + "loss": 0.1717, + "step": 38129 + }, + { + "epoch": 0.9649011817698712, + "grad_norm": 6.316054344177246, + "learning_rate": 3.0983019866962504e-08, + "loss": 0.1677, + "step": 38130 + }, + { + "epoch": 0.9649264873345649, + "grad_norm": 5.623469829559326, + "learning_rate": 3.093840577827079e-08, + "loss": 0.1233, + "step": 38131 + }, + { + "epoch": 0.9649517928992586, + "grad_norm": 5.415688514709473, + "learning_rate": 3.0893823734133856e-08, + "loss": 0.1859, + "step": 38132 + }, + { + "epoch": 0.9649770984639522, + "grad_norm": 4.4897942543029785, + "learning_rate": 3.084927373483815e-08, + "loss": 0.1677, + "step": 38133 + }, + { + "epoch": 0.9650024040286459, + "grad_norm": 15.289542198181152, + "learning_rate": 3.080475578067121e-08, + "loss": 0.2264, + "step": 38134 + }, + { + "epoch": 0.9650277095933396, + "grad_norm": 3.8708977699279785, + "learning_rate": 3.076026987191949e-08, + "loss": 0.1056, + "step": 38135 + }, + { + "epoch": 0.9650530151580332, + "grad_norm": 3.6347944736480713, + "learning_rate": 3.071581600887108e-08, + "loss": 0.1518, + "step": 38136 + }, + { + "epoch": 0.9650783207227269, + "grad_norm": 6.080407619476318, + "learning_rate": 3.0671394191811865e-08, + "loss": 0.1248, + "step": 38137 + }, + { + "epoch": 0.9651036262874206, + "grad_norm": 7.781414985656738, + "learning_rate": 3.062700442102828e-08, + "loss": 0.1801, + "step": 38138 + }, + { + "epoch": 0.9651289318521142, + "grad_norm": 2.3356456756591797, + "learning_rate": 3.058264669680677e-08, + "loss": 0.1208, + "step": 38139 + }, + { + "epoch": 0.965154237416808, + "grad_norm": 2.5412299633026123, + "learning_rate": 3.053832101943377e-08, + "loss": 0.1082, + "step": 38140 + }, + { + "epoch": 0.9651795429815017, + "grad_norm": 5.245948314666748, + "learning_rate": 3.04940273891946e-08, + "loss": 0.0388, + "step": 38141 + }, + { + "epoch": 0.9652048485461953, + "grad_norm": 4.980357646942139, + "learning_rate": 3.04497658063746e-08, + "loss": 0.1532, + "step": 38142 + }, + { + "epoch": 0.965230154110889, + "grad_norm": 3.207486867904663, + "learning_rate": 3.0405536271260196e-08, + "loss": 0.0953, + "step": 38143 + }, + { + "epoch": 0.9652554596755827, + "grad_norm": 4.855388641357422, + "learning_rate": 3.036133878413561e-08, + "loss": 0.1192, + "step": 38144 + }, + { + "epoch": 0.9652807652402763, + "grad_norm": 3.036006450653076, + "learning_rate": 3.031717334528672e-08, + "loss": 0.1346, + "step": 38145 + }, + { + "epoch": 0.96530607080497, + "grad_norm": 6.151719570159912, + "learning_rate": 3.027303995499775e-08, + "loss": 0.1838, + "step": 38146 + }, + { + "epoch": 0.9653313763696637, + "grad_norm": 5.251904487609863, + "learning_rate": 3.0228938613554025e-08, + "loss": 0.1327, + "step": 38147 + }, + { + "epoch": 0.9653566819343574, + "grad_norm": 3.5792646408081055, + "learning_rate": 3.018486932123865e-08, + "loss": 0.1827, + "step": 38148 + }, + { + "epoch": 0.965381987499051, + "grad_norm": 3.9699020385742188, + "learning_rate": 3.014083207833696e-08, + "loss": 0.1392, + "step": 38149 + }, + { + "epoch": 0.9654072930637447, + "grad_norm": 16.454126358032227, + "learning_rate": 3.00968268851326e-08, + "loss": 0.2418, + "step": 38150 + }, + { + "epoch": 0.9654325986284384, + "grad_norm": 7.688071250915527, + "learning_rate": 3.005285374190925e-08, + "loss": 0.2052, + "step": 38151 + }, + { + "epoch": 0.965457904193132, + "grad_norm": 4.982391357421875, + "learning_rate": 3.000891264895056e-08, + "loss": 0.1423, + "step": 38152 + }, + { + "epoch": 0.9654832097578258, + "grad_norm": 3.116731882095337, + "learning_rate": 2.99650036065402e-08, + "loss": 0.153, + "step": 38153 + }, + { + "epoch": 0.9655085153225195, + "grad_norm": 8.332122802734375, + "learning_rate": 2.992112661496016e-08, + "loss": 0.1932, + "step": 38154 + }, + { + "epoch": 0.9655338208872131, + "grad_norm": 4.443688869476318, + "learning_rate": 2.987728167449522e-08, + "loss": 0.1474, + "step": 38155 + }, + { + "epoch": 0.9655591264519068, + "grad_norm": 5.932931423187256, + "learning_rate": 2.983346878542681e-08, + "loss": 0.0984, + "step": 38156 + }, + { + "epoch": 0.9655844320166005, + "grad_norm": 9.95048713684082, + "learning_rate": 2.978968794803805e-08, + "loss": 0.1698, + "step": 38157 + }, + { + "epoch": 0.9656097375812941, + "grad_norm": 6.702199935913086, + "learning_rate": 2.9745939162610372e-08, + "loss": 0.1639, + "step": 38158 + }, + { + "epoch": 0.9656350431459878, + "grad_norm": 5.341902732849121, + "learning_rate": 2.970222242942744e-08, + "loss": 0.1518, + "step": 38159 + }, + { + "epoch": 0.9656603487106815, + "grad_norm": 4.322556495666504, + "learning_rate": 2.9658537748770143e-08, + "loss": 0.1522, + "step": 38160 + }, + { + "epoch": 0.9656856542753751, + "grad_norm": 5.42531156539917, + "learning_rate": 2.9614885120919923e-08, + "loss": 0.1677, + "step": 38161 + }, + { + "epoch": 0.9657109598400688, + "grad_norm": 3.4735372066497803, + "learning_rate": 2.9571264546158773e-08, + "loss": 0.1617, + "step": 38162 + }, + { + "epoch": 0.9657362654047625, + "grad_norm": 6.194971561431885, + "learning_rate": 2.9527676024768693e-08, + "loss": 0.1872, + "step": 38163 + }, + { + "epoch": 0.9657615709694561, + "grad_norm": 4.989534854888916, + "learning_rate": 2.9484119557029456e-08, + "loss": 0.1215, + "step": 38164 + }, + { + "epoch": 0.9657868765341499, + "grad_norm": 3.9614741802215576, + "learning_rate": 2.9440595143222507e-08, + "loss": 0.13, + "step": 38165 + }, + { + "epoch": 0.9658121820988436, + "grad_norm": 4.423036575317383, + "learning_rate": 2.9397102783628727e-08, + "loss": 0.1404, + "step": 38166 + }, + { + "epoch": 0.9658374876635372, + "grad_norm": 3.872389078140259, + "learning_rate": 2.9353642478528455e-08, + "loss": 0.185, + "step": 38167 + }, + { + "epoch": 0.9658627932282309, + "grad_norm": 4.183481216430664, + "learning_rate": 2.9310214228202016e-08, + "loss": 0.1547, + "step": 38168 + }, + { + "epoch": 0.9658880987929246, + "grad_norm": 12.693591117858887, + "learning_rate": 2.9266818032929188e-08, + "loss": 0.1375, + "step": 38169 + }, + { + "epoch": 0.9659134043576182, + "grad_norm": 4.281714916229248, + "learning_rate": 2.9223453892990307e-08, + "loss": 0.1221, + "step": 38170 + }, + { + "epoch": 0.9659387099223119, + "grad_norm": 5.123476982116699, + "learning_rate": 2.9180121808664584e-08, + "loss": 0.1761, + "step": 38171 + }, + { + "epoch": 0.9659640154870056, + "grad_norm": 3.1935176849365234, + "learning_rate": 2.913682178023125e-08, + "loss": 0.0857, + "step": 38172 + }, + { + "epoch": 0.9659893210516993, + "grad_norm": 9.425765991210938, + "learning_rate": 2.9093553807970632e-08, + "loss": 0.1666, + "step": 38173 + }, + { + "epoch": 0.9660146266163929, + "grad_norm": 4.190865516662598, + "learning_rate": 2.9050317892160838e-08, + "loss": 0.167, + "step": 38174 + }, + { + "epoch": 0.9660399321810866, + "grad_norm": 6.801954746246338, + "learning_rate": 2.9007114033079985e-08, + "loss": 0.2186, + "step": 38175 + }, + { + "epoch": 0.9660652377457803, + "grad_norm": 5.114065170288086, + "learning_rate": 2.8963942231008402e-08, + "loss": 0.1339, + "step": 38176 + }, + { + "epoch": 0.966090543310474, + "grad_norm": 4.793718338012695, + "learning_rate": 2.8920802486223644e-08, + "loss": 0.1777, + "step": 38177 + }, + { + "epoch": 0.9661158488751677, + "grad_norm": 8.644591331481934, + "learning_rate": 2.887769479900382e-08, + "loss": 0.2517, + "step": 38178 + }, + { + "epoch": 0.9661411544398614, + "grad_norm": 4.143115043640137, + "learning_rate": 2.8834619169627044e-08, + "loss": 0.2175, + "step": 38179 + }, + { + "epoch": 0.966166460004555, + "grad_norm": 15.871955871582031, + "learning_rate": 2.879157559837087e-08, + "loss": 0.4732, + "step": 38180 + }, + { + "epoch": 0.9661917655692487, + "grad_norm": 6.986512184143066, + "learning_rate": 2.8748564085513407e-08, + "loss": 0.195, + "step": 38181 + }, + { + "epoch": 0.9662170711339424, + "grad_norm": 3.2214913368225098, + "learning_rate": 2.8705584631332218e-08, + "loss": 0.1117, + "step": 38182 + }, + { + "epoch": 0.966242376698636, + "grad_norm": 3.4184348583221436, + "learning_rate": 2.8662637236103185e-08, + "loss": 0.1008, + "step": 38183 + }, + { + "epoch": 0.9662676822633297, + "grad_norm": 9.034049034118652, + "learning_rate": 2.8619721900104423e-08, + "loss": 0.2378, + "step": 38184 + }, + { + "epoch": 0.9662929878280234, + "grad_norm": 5.7439775466918945, + "learning_rate": 2.8576838623611824e-08, + "loss": 0.1469, + "step": 38185 + }, + { + "epoch": 0.966318293392717, + "grad_norm": 6.310259819030762, + "learning_rate": 2.853398740690294e-08, + "loss": 0.1834, + "step": 38186 + }, + { + "epoch": 0.9663435989574107, + "grad_norm": 4.270153999328613, + "learning_rate": 2.8491168250253665e-08, + "loss": 0.1646, + "step": 38187 + }, + { + "epoch": 0.9663689045221044, + "grad_norm": 6.662341117858887, + "learning_rate": 2.8448381153939887e-08, + "loss": 0.1407, + "step": 38188 + }, + { + "epoch": 0.966394210086798, + "grad_norm": 3.8261029720306396, + "learning_rate": 2.8405626118237494e-08, + "loss": 0.1058, + "step": 38189 + }, + { + "epoch": 0.9664195156514918, + "grad_norm": 5.485196590423584, + "learning_rate": 2.836290314342238e-08, + "loss": 0.1394, + "step": 38190 + }, + { + "epoch": 0.9664448212161855, + "grad_norm": 20.429946899414062, + "learning_rate": 2.8320212229770438e-08, + "loss": 0.3526, + "step": 38191 + }, + { + "epoch": 0.9664701267808791, + "grad_norm": 11.710963249206543, + "learning_rate": 2.8277553377557e-08, + "loss": 0.2425, + "step": 38192 + }, + { + "epoch": 0.9664954323455728, + "grad_norm": 6.002894878387451, + "learning_rate": 2.823492658705629e-08, + "loss": 0.0748, + "step": 38193 + }, + { + "epoch": 0.9665207379102665, + "grad_norm": 4.850666522979736, + "learning_rate": 2.8192331858543643e-08, + "loss": 0.1252, + "step": 38194 + }, + { + "epoch": 0.9665460434749601, + "grad_norm": 3.7440450191497803, + "learning_rate": 2.8149769192294396e-08, + "loss": 0.1638, + "step": 38195 + }, + { + "epoch": 0.9665713490396538, + "grad_norm": 12.0222806930542, + "learning_rate": 2.810723858858222e-08, + "loss": 0.2851, + "step": 38196 + }, + { + "epoch": 0.9665966546043475, + "grad_norm": 4.78930139541626, + "learning_rate": 2.8064740047681337e-08, + "loss": 0.1607, + "step": 38197 + }, + { + "epoch": 0.9666219601690411, + "grad_norm": 8.529767036437988, + "learning_rate": 2.8022273569865978e-08, + "loss": 0.149, + "step": 38198 + }, + { + "epoch": 0.9666472657337348, + "grad_norm": 4.881191730499268, + "learning_rate": 2.797983915541036e-08, + "loss": 0.1262, + "step": 38199 + }, + { + "epoch": 0.9666725712984285, + "grad_norm": 3.7113993167877197, + "learning_rate": 2.7937436804588157e-08, + "loss": 0.1435, + "step": 38200 + }, + { + "epoch": 0.9666978768631223, + "grad_norm": 2.9538068771362305, + "learning_rate": 2.7895066517672488e-08, + "loss": 0.0827, + "step": 38201 + }, + { + "epoch": 0.9667231824278159, + "grad_norm": 16.671724319458008, + "learning_rate": 2.7852728294935903e-08, + "loss": 0.1711, + "step": 38202 + }, + { + "epoch": 0.9667484879925096, + "grad_norm": 3.9799296855926514, + "learning_rate": 2.7810422136653193e-08, + "loss": 0.1023, + "step": 38203 + }, + { + "epoch": 0.9667737935572033, + "grad_norm": 7.4665937423706055, + "learning_rate": 2.7768148043095245e-08, + "loss": 0.2394, + "step": 38204 + }, + { + "epoch": 0.9667990991218969, + "grad_norm": 5.901638984680176, + "learning_rate": 2.7725906014536286e-08, + "loss": 0.1657, + "step": 38205 + }, + { + "epoch": 0.9668244046865906, + "grad_norm": 6.867645740509033, + "learning_rate": 2.7683696051247767e-08, + "loss": 0.13, + "step": 38206 + }, + { + "epoch": 0.9668497102512843, + "grad_norm": 3.594839334487915, + "learning_rate": 2.7641518153502244e-08, + "loss": 0.1159, + "step": 38207 + }, + { + "epoch": 0.9668750158159779, + "grad_norm": 4.467522144317627, + "learning_rate": 2.7599372321571728e-08, + "loss": 0.1933, + "step": 38208 + }, + { + "epoch": 0.9669003213806716, + "grad_norm": 6.131505966186523, + "learning_rate": 2.7557258555728218e-08, + "loss": 0.2285, + "step": 38209 + }, + { + "epoch": 0.9669256269453653, + "grad_norm": 6.008495807647705, + "learning_rate": 2.7515176856242053e-08, + "loss": 0.1947, + "step": 38210 + }, + { + "epoch": 0.9669509325100589, + "grad_norm": 13.601373672485352, + "learning_rate": 2.7473127223386353e-08, + "loss": 0.2842, + "step": 38211 + }, + { + "epoch": 0.9669762380747526, + "grad_norm": 6.260858535766602, + "learning_rate": 2.74311096574309e-08, + "loss": 0.1795, + "step": 38212 + }, + { + "epoch": 0.9670015436394463, + "grad_norm": 4.085068225860596, + "learning_rate": 2.738912415864825e-08, + "loss": 0.0897, + "step": 38213 + }, + { + "epoch": 0.96702684920414, + "grad_norm": 2.6930577754974365, + "learning_rate": 2.734717072730708e-08, + "loss": 0.1252, + "step": 38214 + }, + { + "epoch": 0.9670521547688337, + "grad_norm": 10.506650924682617, + "learning_rate": 2.7305249363679398e-08, + "loss": 0.1142, + "step": 38215 + }, + { + "epoch": 0.9670774603335274, + "grad_norm": 3.0329525470733643, + "learning_rate": 2.726336006803554e-08, + "loss": 0.0903, + "step": 38216 + }, + { + "epoch": 0.967102765898221, + "grad_norm": 3.579145908355713, + "learning_rate": 2.722150284064473e-08, + "loss": 0.0848, + "step": 38217 + }, + { + "epoch": 0.9671280714629147, + "grad_norm": 4.569973468780518, + "learning_rate": 2.717967768177787e-08, + "loss": 0.1282, + "step": 38218 + }, + { + "epoch": 0.9671533770276084, + "grad_norm": 4.726062774658203, + "learning_rate": 2.713788459170419e-08, + "loss": 0.1632, + "step": 38219 + }, + { + "epoch": 0.967178682592302, + "grad_norm": 9.022453308105469, + "learning_rate": 2.709612357069291e-08, + "loss": 0.1457, + "step": 38220 + }, + { + "epoch": 0.9672039881569957, + "grad_norm": 8.34432601928711, + "learning_rate": 2.7054394619013824e-08, + "loss": 0.2686, + "step": 38221 + }, + { + "epoch": 0.9672292937216894, + "grad_norm": 11.655896186828613, + "learning_rate": 2.7012697736936155e-08, + "loss": 0.1977, + "step": 38222 + }, + { + "epoch": 0.967254599286383, + "grad_norm": 6.038003921508789, + "learning_rate": 2.697103292472858e-08, + "loss": 0.202, + "step": 38223 + }, + { + "epoch": 0.9672799048510767, + "grad_norm": 5.257443904876709, + "learning_rate": 2.6929400182659215e-08, + "loss": 0.2053, + "step": 38224 + }, + { + "epoch": 0.9673052104157704, + "grad_norm": 6.108861446380615, + "learning_rate": 2.688779951099785e-08, + "loss": 0.1848, + "step": 38225 + }, + { + "epoch": 0.9673305159804642, + "grad_norm": 6.050390243530273, + "learning_rate": 2.6846230910011483e-08, + "loss": 0.2045, + "step": 38226 + }, + { + "epoch": 0.9673558215451578, + "grad_norm": 3.4726457595825195, + "learning_rate": 2.68046943799688e-08, + "loss": 0.0854, + "step": 38227 + }, + { + "epoch": 0.9673811271098515, + "grad_norm": 16.365755081176758, + "learning_rate": 2.676318992113791e-08, + "loss": 0.2525, + "step": 38228 + }, + { + "epoch": 0.9674064326745452, + "grad_norm": 4.022130489349365, + "learning_rate": 2.6721717533785273e-08, + "loss": 0.0992, + "step": 38229 + }, + { + "epoch": 0.9674317382392388, + "grad_norm": 4.649309158325195, + "learning_rate": 2.6680277218180117e-08, + "loss": 0.1728, + "step": 38230 + }, + { + "epoch": 0.9674570438039325, + "grad_norm": 6.289511680603027, + "learning_rate": 2.663886897458834e-08, + "loss": 0.2021, + "step": 38231 + }, + { + "epoch": 0.9674823493686262, + "grad_norm": 6.570329666137695, + "learning_rate": 2.6597492803277503e-08, + "loss": 0.191, + "step": 38232 + }, + { + "epoch": 0.9675076549333198, + "grad_norm": 4.206923484802246, + "learning_rate": 2.6556148704513507e-08, + "loss": 0.1491, + "step": 38233 + }, + { + "epoch": 0.9675329604980135, + "grad_norm": 6.002188682556152, + "learning_rate": 2.651483667856447e-08, + "loss": 0.1548, + "step": 38234 + }, + { + "epoch": 0.9675582660627072, + "grad_norm": 3.6532201766967773, + "learning_rate": 2.647355672569629e-08, + "loss": 0.113, + "step": 38235 + }, + { + "epoch": 0.9675835716274008, + "grad_norm": 4.279749870300293, + "learning_rate": 2.643230884617487e-08, + "loss": 0.1788, + "step": 38236 + }, + { + "epoch": 0.9676088771920945, + "grad_norm": 2.996453046798706, + "learning_rate": 2.63910930402661e-08, + "loss": 0.0873, + "step": 38237 + }, + { + "epoch": 0.9676341827567883, + "grad_norm": 3.42917537689209, + "learning_rate": 2.6349909308236443e-08, + "loss": 0.1196, + "step": 38238 + }, + { + "epoch": 0.9676594883214819, + "grad_norm": 6.018601894378662, + "learning_rate": 2.630875765035068e-08, + "loss": 0.1587, + "step": 38239 + }, + { + "epoch": 0.9676847938861756, + "grad_norm": 5.981184959411621, + "learning_rate": 2.626763806687471e-08, + "loss": 0.1576, + "step": 38240 + }, + { + "epoch": 0.9677100994508693, + "grad_norm": 3.826301097869873, + "learning_rate": 2.6226550558073328e-08, + "loss": 0.1429, + "step": 38241 + }, + { + "epoch": 0.9677354050155629, + "grad_norm": 10.169622421264648, + "learning_rate": 2.6185495124211868e-08, + "loss": 0.0894, + "step": 38242 + }, + { + "epoch": 0.9677607105802566, + "grad_norm": 5.550145626068115, + "learning_rate": 2.6144471765555125e-08, + "loss": 0.1274, + "step": 38243 + }, + { + "epoch": 0.9677860161449503, + "grad_norm": 10.544233322143555, + "learning_rate": 2.6103480482367327e-08, + "loss": 0.2866, + "step": 38244 + }, + { + "epoch": 0.9678113217096439, + "grad_norm": 5.353479862213135, + "learning_rate": 2.606252127491271e-08, + "loss": 0.1479, + "step": 38245 + }, + { + "epoch": 0.9678366272743376, + "grad_norm": 5.329925537109375, + "learning_rate": 2.6021594143456064e-08, + "loss": 0.1784, + "step": 38246 + }, + { + "epoch": 0.9678619328390313, + "grad_norm": 4.766892910003662, + "learning_rate": 2.598069908826051e-08, + "loss": 0.0983, + "step": 38247 + }, + { + "epoch": 0.9678872384037249, + "grad_norm": 11.256223678588867, + "learning_rate": 2.5939836109590833e-08, + "loss": 0.2449, + "step": 38248 + }, + { + "epoch": 0.9679125439684186, + "grad_norm": 7.193628311157227, + "learning_rate": 2.5899005207709605e-08, + "loss": 0.146, + "step": 38249 + }, + { + "epoch": 0.9679378495331123, + "grad_norm": 4.12204122543335, + "learning_rate": 2.5858206382880503e-08, + "loss": 0.1371, + "step": 38250 + }, + { + "epoch": 0.9679631550978061, + "grad_norm": 5.571510314941406, + "learning_rate": 2.5817439635366094e-08, + "loss": 0.1399, + "step": 38251 + }, + { + "epoch": 0.9679884606624997, + "grad_norm": 4.9513678550720215, + "learning_rate": 2.5776704965430056e-08, + "loss": 0.21, + "step": 38252 + }, + { + "epoch": 0.9680137662271934, + "grad_norm": 6.667637348175049, + "learning_rate": 2.5736002373334957e-08, + "loss": 0.1511, + "step": 38253 + }, + { + "epoch": 0.9680390717918871, + "grad_norm": 6.943368911743164, + "learning_rate": 2.569533185934281e-08, + "loss": 0.1776, + "step": 38254 + }, + { + "epoch": 0.9680643773565807, + "grad_norm": 2.636495351791382, + "learning_rate": 2.5654693423716736e-08, + "loss": 0.1351, + "step": 38255 + }, + { + "epoch": 0.9680896829212744, + "grad_norm": 4.892099380493164, + "learning_rate": 2.561408706671764e-08, + "loss": 0.1335, + "step": 38256 + }, + { + "epoch": 0.9681149884859681, + "grad_norm": 5.635442733764648, + "learning_rate": 2.557351278860809e-08, + "loss": 0.1686, + "step": 38257 + }, + { + "epoch": 0.9681402940506617, + "grad_norm": 6.908605575561523, + "learning_rate": 2.5532970589650096e-08, + "loss": 0.1558, + "step": 38258 + }, + { + "epoch": 0.9681655996153554, + "grad_norm": 4.820685386657715, + "learning_rate": 2.549246047010401e-08, + "loss": 0.1738, + "step": 38259 + }, + { + "epoch": 0.9681909051800491, + "grad_norm": 15.790193557739258, + "learning_rate": 2.5451982430231836e-08, + "loss": 0.1536, + "step": 38260 + }, + { + "epoch": 0.9682162107447427, + "grad_norm": 15.677534103393555, + "learning_rate": 2.541153647029504e-08, + "loss": 0.3436, + "step": 38261 + }, + { + "epoch": 0.9682415163094364, + "grad_norm": 10.291287422180176, + "learning_rate": 2.537112259055341e-08, + "loss": 0.2002, + "step": 38262 + }, + { + "epoch": 0.9682668218741302, + "grad_norm": 5.5964674949646, + "learning_rate": 2.5330740791268404e-08, + "loss": 0.1587, + "step": 38263 + }, + { + "epoch": 0.9682921274388238, + "grad_norm": 3.171278476715088, + "learning_rate": 2.5290391072699262e-08, + "loss": 0.1337, + "step": 38264 + }, + { + "epoch": 0.9683174330035175, + "grad_norm": 15.013503074645996, + "learning_rate": 2.5250073435107992e-08, + "loss": 0.243, + "step": 38265 + }, + { + "epoch": 0.9683427385682112, + "grad_norm": 4.876464366912842, + "learning_rate": 2.520978787875328e-08, + "loss": 0.1573, + "step": 38266 + }, + { + "epoch": 0.9683680441329048, + "grad_norm": 3.006080389022827, + "learning_rate": 2.5169534403894912e-08, + "loss": 0.1507, + "step": 38267 + }, + { + "epoch": 0.9683933496975985, + "grad_norm": 9.034849166870117, + "learning_rate": 2.5129313010793245e-08, + "loss": 0.3303, + "step": 38268 + }, + { + "epoch": 0.9684186552622922, + "grad_norm": 5.152936935424805, + "learning_rate": 2.5089123699706952e-08, + "loss": 0.1584, + "step": 38269 + }, + { + "epoch": 0.9684439608269858, + "grad_norm": 5.195321083068848, + "learning_rate": 2.504896647089583e-08, + "loss": 0.146, + "step": 38270 + }, + { + "epoch": 0.9684692663916795, + "grad_norm": 3.0649826526641846, + "learning_rate": 2.500884132461856e-08, + "loss": 0.0963, + "step": 38271 + }, + { + "epoch": 0.9684945719563732, + "grad_norm": 4.309125900268555, + "learning_rate": 2.496874826113327e-08, + "loss": 0.0761, + "step": 38272 + }, + { + "epoch": 0.9685198775210668, + "grad_norm": 4.918266773223877, + "learning_rate": 2.4928687280699748e-08, + "loss": 0.2279, + "step": 38273 + }, + { + "epoch": 0.9685451830857605, + "grad_norm": 18.849138259887695, + "learning_rate": 2.4888658383575014e-08, + "loss": 0.2204, + "step": 38274 + }, + { + "epoch": 0.9685704886504543, + "grad_norm": 3.794203281402588, + "learning_rate": 2.4848661570018863e-08, + "loss": 0.1372, + "step": 38275 + }, + { + "epoch": 0.968595794215148, + "grad_norm": 4.547886848449707, + "learning_rate": 2.4808696840287193e-08, + "loss": 0.1066, + "step": 38276 + }, + { + "epoch": 0.9686210997798416, + "grad_norm": 3.8490347862243652, + "learning_rate": 2.476876419463925e-08, + "loss": 0.1567, + "step": 38277 + }, + { + "epoch": 0.9686464053445353, + "grad_norm": 6.992445945739746, + "learning_rate": 2.4728863633332046e-08, + "loss": 0.1998, + "step": 38278 + }, + { + "epoch": 0.968671710909229, + "grad_norm": 3.275252103805542, + "learning_rate": 2.4688995156623154e-08, + "loss": 0.1151, + "step": 38279 + }, + { + "epoch": 0.9686970164739226, + "grad_norm": 3.1114096641540527, + "learning_rate": 2.4649158764769033e-08, + "loss": 0.0695, + "step": 38280 + }, + { + "epoch": 0.9687223220386163, + "grad_norm": 7.099506855010986, + "learning_rate": 2.4609354458027255e-08, + "loss": 0.1635, + "step": 38281 + }, + { + "epoch": 0.96874762760331, + "grad_norm": 5.5468220710754395, + "learning_rate": 2.456958223665429e-08, + "loss": 0.1718, + "step": 38282 + }, + { + "epoch": 0.9687729331680036, + "grad_norm": 5.363811492919922, + "learning_rate": 2.4529842100906586e-08, + "loss": 0.2009, + "step": 38283 + }, + { + "epoch": 0.9687982387326973, + "grad_norm": 5.920866012573242, + "learning_rate": 2.4490134051040617e-08, + "loss": 0.1599, + "step": 38284 + }, + { + "epoch": 0.968823544297391, + "grad_norm": 7.600958824157715, + "learning_rate": 2.4450458087311724e-08, + "loss": 0.2138, + "step": 38285 + }, + { + "epoch": 0.9688488498620846, + "grad_norm": 4.501636981964111, + "learning_rate": 2.441081420997693e-08, + "loss": 0.1446, + "step": 38286 + }, + { + "epoch": 0.9688741554267783, + "grad_norm": 6.083978652954102, + "learning_rate": 2.4371202419290473e-08, + "loss": 0.1924, + "step": 38287 + }, + { + "epoch": 0.9688994609914721, + "grad_norm": 3.5031025409698486, + "learning_rate": 2.4331622715509375e-08, + "loss": 0.0637, + "step": 38288 + }, + { + "epoch": 0.9689247665561657, + "grad_norm": 3.45123553276062, + "learning_rate": 2.4292075098887868e-08, + "loss": 0.1492, + "step": 38289 + }, + { + "epoch": 0.9689500721208594, + "grad_norm": 2.858701705932617, + "learning_rate": 2.425255956968131e-08, + "loss": 0.1299, + "step": 38290 + }, + { + "epoch": 0.9689753776855531, + "grad_norm": 3.821478843688965, + "learning_rate": 2.4213076128143943e-08, + "loss": 0.1439, + "step": 38291 + }, + { + "epoch": 0.9690006832502467, + "grad_norm": 4.335210800170898, + "learning_rate": 2.4173624774531114e-08, + "loss": 0.1328, + "step": 38292 + }, + { + "epoch": 0.9690259888149404, + "grad_norm": 2.960240364074707, + "learning_rate": 2.4134205509097065e-08, + "loss": 0.1215, + "step": 38293 + }, + { + "epoch": 0.9690512943796341, + "grad_norm": 4.536462306976318, + "learning_rate": 2.409481833209604e-08, + "loss": 0.1415, + "step": 38294 + }, + { + "epoch": 0.9690765999443277, + "grad_norm": 3.4254884719848633, + "learning_rate": 2.4055463243781718e-08, + "loss": 0.1186, + "step": 38295 + }, + { + "epoch": 0.9691019055090214, + "grad_norm": 3.0249011516571045, + "learning_rate": 2.401614024440835e-08, + "loss": 0.0991, + "step": 38296 + }, + { + "epoch": 0.9691272110737151, + "grad_norm": 9.253668785095215, + "learning_rate": 2.3976849334229058e-08, + "loss": 0.1403, + "step": 38297 + }, + { + "epoch": 0.9691525166384087, + "grad_norm": 4.828395366668701, + "learning_rate": 2.3937590513497532e-08, + "loss": 0.1079, + "step": 38298 + }, + { + "epoch": 0.9691778222031024, + "grad_norm": 3.717088222503662, + "learning_rate": 2.3898363782466905e-08, + "loss": 0.1303, + "step": 38299 + }, + { + "epoch": 0.9692031277677962, + "grad_norm": 3.928081512451172, + "learning_rate": 2.3859169141390303e-08, + "loss": 0.1013, + "step": 38300 + }, + { + "epoch": 0.9692284333324899, + "grad_norm": 10.175615310668945, + "learning_rate": 2.3820006590520305e-08, + "loss": 0.2989, + "step": 38301 + }, + { + "epoch": 0.9692537388971835, + "grad_norm": 5.950262069702148, + "learning_rate": 2.3780876130108932e-08, + "loss": 0.0855, + "step": 38302 + }, + { + "epoch": 0.9692790444618772, + "grad_norm": 9.626947402954102, + "learning_rate": 2.3741777760409313e-08, + "loss": 0.1854, + "step": 38303 + }, + { + "epoch": 0.9693043500265709, + "grad_norm": 5.9166035652160645, + "learning_rate": 2.370271148167347e-08, + "loss": 0.1698, + "step": 38304 + }, + { + "epoch": 0.9693296555912645, + "grad_norm": 6.888040065765381, + "learning_rate": 2.3663677294152866e-08, + "loss": 0.157, + "step": 38305 + }, + { + "epoch": 0.9693549611559582, + "grad_norm": 10.225302696228027, + "learning_rate": 2.362467519810008e-08, + "loss": 0.2222, + "step": 38306 + }, + { + "epoch": 0.9693802667206519, + "grad_norm": 14.964529037475586, + "learning_rate": 2.3585705193765463e-08, + "loss": 0.2338, + "step": 38307 + }, + { + "epoch": 0.9694055722853455, + "grad_norm": 4.409080505371094, + "learning_rate": 2.354676728140104e-08, + "loss": 0.1595, + "step": 38308 + }, + { + "epoch": 0.9694308778500392, + "grad_norm": 4.635115623474121, + "learning_rate": 2.3507861461258273e-08, + "loss": 0.1504, + "step": 38309 + }, + { + "epoch": 0.9694561834147329, + "grad_norm": 5.731987476348877, + "learning_rate": 2.3468987733587524e-08, + "loss": 0.1368, + "step": 38310 + }, + { + "epoch": 0.9694814889794265, + "grad_norm": 2.7753288745880127, + "learning_rate": 2.343014609863914e-08, + "loss": 0.0909, + "step": 38311 + }, + { + "epoch": 0.9695067945441203, + "grad_norm": 4.676485061645508, + "learning_rate": 2.3391336556664036e-08, + "loss": 0.1978, + "step": 38312 + }, + { + "epoch": 0.969532100108814, + "grad_norm": 4.895843982696533, + "learning_rate": 2.3352559107913118e-08, + "loss": 0.1647, + "step": 38313 + }, + { + "epoch": 0.9695574056735076, + "grad_norm": 2.862966775894165, + "learning_rate": 2.3313813752635085e-08, + "loss": 0.1196, + "step": 38314 + }, + { + "epoch": 0.9695827112382013, + "grad_norm": 7.609120845794678, + "learning_rate": 2.3275100491080838e-08, + "loss": 0.2591, + "step": 38315 + }, + { + "epoch": 0.969608016802895, + "grad_norm": 4.499589920043945, + "learning_rate": 2.323641932350018e-08, + "loss": 0.1449, + "step": 38316 + }, + { + "epoch": 0.9696333223675886, + "grad_norm": 5.425610542297363, + "learning_rate": 2.319777025014125e-08, + "loss": 0.1686, + "step": 38317 + }, + { + "epoch": 0.9696586279322823, + "grad_norm": 5.282473087310791, + "learning_rate": 2.3159153271254954e-08, + "loss": 0.1737, + "step": 38318 + }, + { + "epoch": 0.969683933496976, + "grad_norm": 6.230623245239258, + "learning_rate": 2.3120568387088872e-08, + "loss": 0.1567, + "step": 38319 + }, + { + "epoch": 0.9697092390616696, + "grad_norm": 3.808715581893921, + "learning_rate": 2.3082015597892803e-08, + "loss": 0.1102, + "step": 38320 + }, + { + "epoch": 0.9697345446263633, + "grad_norm": 5.729091167449951, + "learning_rate": 2.3043494903914888e-08, + "loss": 0.1845, + "step": 38321 + }, + { + "epoch": 0.969759850191057, + "grad_norm": 6.6106791496276855, + "learning_rate": 2.300500630540381e-08, + "loss": 0.2252, + "step": 38322 + }, + { + "epoch": 0.9697851557557506, + "grad_norm": 18.291372299194336, + "learning_rate": 2.296654980260771e-08, + "loss": 0.2009, + "step": 38323 + }, + { + "epoch": 0.9698104613204444, + "grad_norm": 13.758907318115234, + "learning_rate": 2.2928125395774158e-08, + "loss": 0.1789, + "step": 38324 + }, + { + "epoch": 0.9698357668851381, + "grad_norm": 3.210207462310791, + "learning_rate": 2.2889733085151854e-08, + "loss": 0.1226, + "step": 38325 + }, + { + "epoch": 0.9698610724498317, + "grad_norm": 7.874840259552002, + "learning_rate": 2.2851372870987264e-08, + "loss": 0.3057, + "step": 38326 + }, + { + "epoch": 0.9698863780145254, + "grad_norm": 2.991856098175049, + "learning_rate": 2.2813044753529077e-08, + "loss": 0.1284, + "step": 38327 + }, + { + "epoch": 0.9699116835792191, + "grad_norm": 5.281030178070068, + "learning_rate": 2.2774748733023212e-08, + "loss": 0.151, + "step": 38328 + }, + { + "epoch": 0.9699369891439128, + "grad_norm": 4.525707244873047, + "learning_rate": 2.273648480971724e-08, + "loss": 0.1877, + "step": 38329 + }, + { + "epoch": 0.9699622947086064, + "grad_norm": 3.4489424228668213, + "learning_rate": 2.2698252983857637e-08, + "loss": 0.1246, + "step": 38330 + }, + { + "epoch": 0.9699876002733001, + "grad_norm": 2.55153751373291, + "learning_rate": 2.2660053255691982e-08, + "loss": 0.1226, + "step": 38331 + }, + { + "epoch": 0.9700129058379938, + "grad_norm": 8.956766128540039, + "learning_rate": 2.262188562546508e-08, + "loss": 0.1726, + "step": 38332 + }, + { + "epoch": 0.9700382114026874, + "grad_norm": 6.695979595184326, + "learning_rate": 2.258375009342395e-08, + "loss": 0.0997, + "step": 38333 + }, + { + "epoch": 0.9700635169673811, + "grad_norm": 2.8115956783294678, + "learning_rate": 2.2545646659813958e-08, + "loss": 0.1412, + "step": 38334 + }, + { + "epoch": 0.9700888225320748, + "grad_norm": 2.7475643157958984, + "learning_rate": 2.250757532488157e-08, + "loss": 0.1322, + "step": 38335 + }, + { + "epoch": 0.9701141280967684, + "grad_norm": 3.766188621520996, + "learning_rate": 2.246953608887159e-08, + "loss": 0.1624, + "step": 38336 + }, + { + "epoch": 0.9701394336614622, + "grad_norm": 3.8896524906158447, + "learning_rate": 2.243152895203049e-08, + "loss": 0.1127, + "step": 38337 + }, + { + "epoch": 0.9701647392261559, + "grad_norm": 2.587364673614502, + "learning_rate": 2.239355391460196e-08, + "loss": 0.1099, + "step": 38338 + }, + { + "epoch": 0.9701900447908495, + "grad_norm": 4.048499584197998, + "learning_rate": 2.2355610976831922e-08, + "loss": 0.1126, + "step": 38339 + }, + { + "epoch": 0.9702153503555432, + "grad_norm": 4.409086227416992, + "learning_rate": 2.231770013896406e-08, + "loss": 0.1199, + "step": 38340 + }, + { + "epoch": 0.9702406559202369, + "grad_norm": 4.184342861175537, + "learning_rate": 2.227982140124374e-08, + "loss": 0.1495, + "step": 38341 + }, + { + "epoch": 0.9702659614849305, + "grad_norm": 3.4454684257507324, + "learning_rate": 2.2241974763914653e-08, + "loss": 0.0878, + "step": 38342 + }, + { + "epoch": 0.9702912670496242, + "grad_norm": 4.3564958572387695, + "learning_rate": 2.2204160227221604e-08, + "loss": 0.121, + "step": 38343 + }, + { + "epoch": 0.9703165726143179, + "grad_norm": 7.309350967407227, + "learning_rate": 2.2166377791407734e-08, + "loss": 0.2071, + "step": 38344 + }, + { + "epoch": 0.9703418781790115, + "grad_norm": 10.481534004211426, + "learning_rate": 2.2128627456716733e-08, + "loss": 0.2097, + "step": 38345 + }, + { + "epoch": 0.9703671837437052, + "grad_norm": 4.177441120147705, + "learning_rate": 2.2090909223392853e-08, + "loss": 0.1567, + "step": 38346 + }, + { + "epoch": 0.9703924893083989, + "grad_norm": 8.52007007598877, + "learning_rate": 2.2053223091678122e-08, + "loss": 0.1019, + "step": 38347 + }, + { + "epoch": 0.9704177948730925, + "grad_norm": 7.920771598815918, + "learning_rate": 2.2015569061816787e-08, + "loss": 0.1408, + "step": 38348 + }, + { + "epoch": 0.9704431004377863, + "grad_norm": 3.6897993087768555, + "learning_rate": 2.1977947134050325e-08, + "loss": 0.1816, + "step": 38349 + }, + { + "epoch": 0.97046840600248, + "grad_norm": 7.362209796905518, + "learning_rate": 2.1940357308622984e-08, + "loss": 0.1873, + "step": 38350 + }, + { + "epoch": 0.9704937115671736, + "grad_norm": 2.474477767944336, + "learning_rate": 2.190279958577568e-08, + "loss": 0.0821, + "step": 38351 + }, + { + "epoch": 0.9705190171318673, + "grad_norm": 3.790825605392456, + "learning_rate": 2.1865273965751e-08, + "loss": 0.1038, + "step": 38352 + }, + { + "epoch": 0.970544322696561, + "grad_norm": 3.3455936908721924, + "learning_rate": 2.1827780448791525e-08, + "loss": 0.1224, + "step": 38353 + }, + { + "epoch": 0.9705696282612547, + "grad_norm": 6.242414474487305, + "learning_rate": 2.179031903513873e-08, + "loss": 0.085, + "step": 38354 + }, + { + "epoch": 0.9705949338259483, + "grad_norm": 2.9438939094543457, + "learning_rate": 2.1752889725034088e-08, + "loss": 0.0769, + "step": 38355 + }, + { + "epoch": 0.970620239390642, + "grad_norm": 5.084621906280518, + "learning_rate": 2.1715492518719073e-08, + "loss": 0.1442, + "step": 38356 + }, + { + "epoch": 0.9706455449553357, + "grad_norm": 7.440768718719482, + "learning_rate": 2.16781274164346e-08, + "loss": 0.1358, + "step": 38357 + }, + { + "epoch": 0.9706708505200293, + "grad_norm": 3.1111257076263428, + "learning_rate": 2.1640794418422152e-08, + "loss": 0.1052, + "step": 38358 + }, + { + "epoch": 0.970696156084723, + "grad_norm": 3.3541789054870605, + "learning_rate": 2.160349352492208e-08, + "loss": 0.1199, + "step": 38359 + }, + { + "epoch": 0.9707214616494168, + "grad_norm": 8.252695083618164, + "learning_rate": 2.1566224736174755e-08, + "loss": 0.1528, + "step": 38360 + }, + { + "epoch": 0.9707467672141104, + "grad_norm": 3.974536895751953, + "learning_rate": 2.1528988052421096e-08, + "loss": 0.0834, + "step": 38361 + }, + { + "epoch": 0.9707720727788041, + "grad_norm": 2.68422532081604, + "learning_rate": 2.1491783473900907e-08, + "loss": 0.0873, + "step": 38362 + }, + { + "epoch": 0.9707973783434978, + "grad_norm": 3.8227624893188477, + "learning_rate": 2.1454611000854e-08, + "loss": 0.1344, + "step": 38363 + }, + { + "epoch": 0.9708226839081914, + "grad_norm": 6.940603733062744, + "learning_rate": 2.141747063352073e-08, + "loss": 0.173, + "step": 38364 + }, + { + "epoch": 0.9708479894728851, + "grad_norm": 4.221693515777588, + "learning_rate": 2.1380362372139253e-08, + "loss": 0.1423, + "step": 38365 + }, + { + "epoch": 0.9708732950375788, + "grad_norm": 14.694419860839844, + "learning_rate": 2.1343286216950476e-08, + "loss": 0.1404, + "step": 38366 + }, + { + "epoch": 0.9708986006022724, + "grad_norm": 3.4477076530456543, + "learning_rate": 2.1306242168191994e-08, + "loss": 0.1814, + "step": 38367 + }, + { + "epoch": 0.9709239061669661, + "grad_norm": 5.950778484344482, + "learning_rate": 2.1269230226104166e-08, + "loss": 0.1813, + "step": 38368 + }, + { + "epoch": 0.9709492117316598, + "grad_norm": 6.598481178283691, + "learning_rate": 2.123225039092458e-08, + "loss": 0.1662, + "step": 38369 + }, + { + "epoch": 0.9709745172963534, + "grad_norm": 4.792257308959961, + "learning_rate": 2.1195302662891938e-08, + "loss": 0.1416, + "step": 38370 + }, + { + "epoch": 0.9709998228610471, + "grad_norm": 6.876309871673584, + "learning_rate": 2.1158387042244932e-08, + "loss": 0.1999, + "step": 38371 + }, + { + "epoch": 0.9710251284257408, + "grad_norm": 8.742378234863281, + "learning_rate": 2.112150352922171e-08, + "loss": 0.1158, + "step": 38372 + }, + { + "epoch": 0.9710504339904344, + "grad_norm": 4.929538726806641, + "learning_rate": 2.1084652124059302e-08, + "loss": 0.1469, + "step": 38373 + }, + { + "epoch": 0.9710757395551282, + "grad_norm": 8.741170883178711, + "learning_rate": 2.104783282699585e-08, + "loss": 0.1353, + "step": 38374 + }, + { + "epoch": 0.9711010451198219, + "grad_norm": 4.594151496887207, + "learning_rate": 2.1011045638268945e-08, + "loss": 0.1656, + "step": 38375 + }, + { + "epoch": 0.9711263506845155, + "grad_norm": 3.1157968044281006, + "learning_rate": 2.097429055811506e-08, + "loss": 0.0975, + "step": 38376 + }, + { + "epoch": 0.9711516562492092, + "grad_norm": 11.154439926147461, + "learning_rate": 2.0937567586772346e-08, + "loss": 0.1838, + "step": 38377 + }, + { + "epoch": 0.9711769618139029, + "grad_norm": 4.69141960144043, + "learning_rate": 2.0900876724477272e-08, + "loss": 0.1251, + "step": 38378 + }, + { + "epoch": 0.9712022673785966, + "grad_norm": 3.964277744293213, + "learning_rate": 2.086421797146576e-08, + "loss": 0.116, + "step": 38379 + }, + { + "epoch": 0.9712275729432902, + "grad_norm": 9.177511215209961, + "learning_rate": 2.082759132797485e-08, + "loss": 0.1832, + "step": 38380 + }, + { + "epoch": 0.9712528785079839, + "grad_norm": 3.761979818344116, + "learning_rate": 2.0790996794241013e-08, + "loss": 0.1372, + "step": 38381 + }, + { + "epoch": 0.9712781840726776, + "grad_norm": 8.071221351623535, + "learning_rate": 2.0754434370499065e-08, + "loss": 0.1316, + "step": 38382 + }, + { + "epoch": 0.9713034896373712, + "grad_norm": 2.6220884323120117, + "learning_rate": 2.0717904056986037e-08, + "loss": 0.1152, + "step": 38383 + }, + { + "epoch": 0.9713287952020649, + "grad_norm": 4.192617416381836, + "learning_rate": 2.0681405853937297e-08, + "loss": 0.1239, + "step": 38384 + }, + { + "epoch": 0.9713541007667587, + "grad_norm": 4.110406875610352, + "learning_rate": 2.0644939761587656e-08, + "loss": 0.125, + "step": 38385 + }, + { + "epoch": 0.9713794063314523, + "grad_norm": 5.760929584503174, + "learning_rate": 2.060850578017248e-08, + "loss": 0.2074, + "step": 38386 + }, + { + "epoch": 0.971404711896146, + "grad_norm": 14.30207347869873, + "learning_rate": 2.0572103909927142e-08, + "loss": 0.218, + "step": 38387 + }, + { + "epoch": 0.9714300174608397, + "grad_norm": 5.235904693603516, + "learning_rate": 2.053573415108534e-08, + "loss": 0.1735, + "step": 38388 + }, + { + "epoch": 0.9714553230255333, + "grad_norm": 6.705055236816406, + "learning_rate": 2.0499396503882997e-08, + "loss": 0.231, + "step": 38389 + }, + { + "epoch": 0.971480628590227, + "grad_norm": 5.12983512878418, + "learning_rate": 2.046309096855381e-08, + "loss": 0.1472, + "step": 38390 + }, + { + "epoch": 0.9715059341549207, + "grad_norm": 5.549776077270508, + "learning_rate": 2.042681754533149e-08, + "loss": 0.1791, + "step": 38391 + }, + { + "epoch": 0.9715312397196143, + "grad_norm": 7.595211982727051, + "learning_rate": 2.0390576234450287e-08, + "loss": 0.2361, + "step": 38392 + }, + { + "epoch": 0.971556545284308, + "grad_norm": 7.13500452041626, + "learning_rate": 2.0354367036144464e-08, + "loss": 0.1878, + "step": 38393 + }, + { + "epoch": 0.9715818508490017, + "grad_norm": 4.922685623168945, + "learning_rate": 2.0318189950646604e-08, + "loss": 0.1388, + "step": 38394 + }, + { + "epoch": 0.9716071564136953, + "grad_norm": 4.003973007202148, + "learning_rate": 2.0282044978190973e-08, + "loss": 0.1282, + "step": 38395 + }, + { + "epoch": 0.971632461978389, + "grad_norm": 6.445231914520264, + "learning_rate": 2.0245932119009604e-08, + "loss": 0.1363, + "step": 38396 + }, + { + "epoch": 0.9716577675430828, + "grad_norm": 9.71290397644043, + "learning_rate": 2.02098513733362e-08, + "loss": 0.171, + "step": 38397 + }, + { + "epoch": 0.9716830731077764, + "grad_norm": 8.643424034118652, + "learning_rate": 2.0173802741402793e-08, + "loss": 0.108, + "step": 38398 + }, + { + "epoch": 0.9717083786724701, + "grad_norm": 5.263587951660156, + "learning_rate": 2.0137786223442536e-08, + "loss": 0.1, + "step": 38399 + }, + { + "epoch": 0.9717336842371638, + "grad_norm": 4.7227020263671875, + "learning_rate": 2.010180181968746e-08, + "loss": 0.1951, + "step": 38400 + }, + { + "epoch": 0.9717589898018574, + "grad_norm": 8.418750762939453, + "learning_rate": 2.0065849530369608e-08, + "loss": 0.2326, + "step": 38401 + }, + { + "epoch": 0.9717842953665511, + "grad_norm": 6.075877666473389, + "learning_rate": 2.0029929355720458e-08, + "loss": 0.2148, + "step": 38402 + }, + { + "epoch": 0.9718096009312448, + "grad_norm": 5.737244606018066, + "learning_rate": 1.9994041295972046e-08, + "loss": 0.1433, + "step": 38403 + }, + { + "epoch": 0.9718349064959385, + "grad_norm": 4.714019298553467, + "learning_rate": 1.9958185351355853e-08, + "loss": 0.1055, + "step": 38404 + }, + { + "epoch": 0.9718602120606321, + "grad_norm": 30.333005905151367, + "learning_rate": 1.9922361522102252e-08, + "loss": 0.1976, + "step": 38405 + }, + { + "epoch": 0.9718855176253258, + "grad_norm": 3.7555811405181885, + "learning_rate": 1.9886569808443833e-08, + "loss": 0.1041, + "step": 38406 + }, + { + "epoch": 0.9719108231900195, + "grad_norm": 3.447521209716797, + "learning_rate": 1.9850810210610417e-08, + "loss": 0.1086, + "step": 38407 + }, + { + "epoch": 0.9719361287547131, + "grad_norm": 4.6336259841918945, + "learning_rate": 1.981508272883237e-08, + "loss": 0.1654, + "step": 38408 + }, + { + "epoch": 0.9719614343194068, + "grad_norm": 3.8030800819396973, + "learning_rate": 1.9779387363340618e-08, + "loss": 0.0961, + "step": 38409 + }, + { + "epoch": 0.9719867398841006, + "grad_norm": 4.086534023284912, + "learning_rate": 1.9743724114365538e-08, + "loss": 0.1255, + "step": 38410 + }, + { + "epoch": 0.9720120454487942, + "grad_norm": 7.224972724914551, + "learning_rate": 1.9708092982136383e-08, + "loss": 0.2307, + "step": 38411 + }, + { + "epoch": 0.9720373510134879, + "grad_norm": 11.815147399902344, + "learning_rate": 1.9672493966883534e-08, + "loss": 0.1219, + "step": 38412 + }, + { + "epoch": 0.9720626565781816, + "grad_norm": 11.780532836914062, + "learning_rate": 1.9636927068836242e-08, + "loss": 0.3078, + "step": 38413 + }, + { + "epoch": 0.9720879621428752, + "grad_norm": 3.7655582427978516, + "learning_rate": 1.9601392288223776e-08, + "loss": 0.1189, + "step": 38414 + }, + { + "epoch": 0.9721132677075689, + "grad_norm": 5.160643577575684, + "learning_rate": 1.9565889625275945e-08, + "loss": 0.1679, + "step": 38415 + }, + { + "epoch": 0.9721385732722626, + "grad_norm": 15.615799903869629, + "learning_rate": 1.9530419080220907e-08, + "loss": 0.1097, + "step": 38416 + }, + { + "epoch": 0.9721638788369562, + "grad_norm": 3.9138453006744385, + "learning_rate": 1.9494980653287922e-08, + "loss": 0.1575, + "step": 38417 + }, + { + "epoch": 0.9721891844016499, + "grad_norm": 3.549017906188965, + "learning_rate": 1.945957434470569e-08, + "loss": 0.1128, + "step": 38418 + }, + { + "epoch": 0.9722144899663436, + "grad_norm": 3.407644033432007, + "learning_rate": 1.942420015470181e-08, + "loss": 0.1428, + "step": 38419 + }, + { + "epoch": 0.9722397955310372, + "grad_norm": 4.171788692474365, + "learning_rate": 1.9388858083504436e-08, + "loss": 0.149, + "step": 38420 + }, + { + "epoch": 0.9722651010957309, + "grad_norm": 6.704408645629883, + "learning_rate": 1.9353548131342825e-08, + "loss": 0.1012, + "step": 38421 + }, + { + "epoch": 0.9722904066604247, + "grad_norm": 9.343999862670898, + "learning_rate": 1.931827029844291e-08, + "loss": 0.1635, + "step": 38422 + }, + { + "epoch": 0.9723157122251183, + "grad_norm": 4.442694187164307, + "learning_rate": 1.9283024585033394e-08, + "loss": 0.0982, + "step": 38423 + }, + { + "epoch": 0.972341017789812, + "grad_norm": 8.282257080078125, + "learning_rate": 1.9247810991340766e-08, + "loss": 0.1585, + "step": 38424 + }, + { + "epoch": 0.9723663233545057, + "grad_norm": 6.750572204589844, + "learning_rate": 1.9212629517593173e-08, + "loss": 0.1415, + "step": 38425 + }, + { + "epoch": 0.9723916289191993, + "grad_norm": 4.1940531730651855, + "learning_rate": 1.917748016401655e-08, + "loss": 0.1684, + "step": 38426 + }, + { + "epoch": 0.972416934483893, + "grad_norm": 10.12277603149414, + "learning_rate": 1.9142362930837378e-08, + "loss": 0.2364, + "step": 38427 + }, + { + "epoch": 0.9724422400485867, + "grad_norm": 3.4051713943481445, + "learning_rate": 1.9107277818283255e-08, + "loss": 0.1532, + "step": 38428 + }, + { + "epoch": 0.9724675456132804, + "grad_norm": 6.049253940582275, + "learning_rate": 1.9072224826579556e-08, + "loss": 0.1159, + "step": 38429 + }, + { + "epoch": 0.972492851177974, + "grad_norm": 4.010667324066162, + "learning_rate": 1.903720395595221e-08, + "loss": 0.1387, + "step": 38430 + }, + { + "epoch": 0.9725181567426677, + "grad_norm": 5.778667449951172, + "learning_rate": 1.9002215206627705e-08, + "loss": 0.173, + "step": 38431 + }, + { + "epoch": 0.9725434623073614, + "grad_norm": 21.165403366088867, + "learning_rate": 1.8967258578831416e-08, + "loss": 0.3025, + "step": 38432 + }, + { + "epoch": 0.972568767872055, + "grad_norm": 2.808537244796753, + "learning_rate": 1.8932334072788715e-08, + "loss": 0.0892, + "step": 38433 + }, + { + "epoch": 0.9725940734367488, + "grad_norm": 3.6226112842559814, + "learning_rate": 1.8897441688724983e-08, + "loss": 0.1132, + "step": 38434 + }, + { + "epoch": 0.9726193790014425, + "grad_norm": 3.186882495880127, + "learning_rate": 1.8862581426865033e-08, + "loss": 0.0505, + "step": 38435 + }, + { + "epoch": 0.9726446845661361, + "grad_norm": 5.644315242767334, + "learning_rate": 1.882775328743314e-08, + "loss": 0.1779, + "step": 38436 + }, + { + "epoch": 0.9726699901308298, + "grad_norm": 3.0585944652557373, + "learning_rate": 1.879295727065522e-08, + "loss": 0.1223, + "step": 38437 + }, + { + "epoch": 0.9726952956955235, + "grad_norm": 7.277299880981445, + "learning_rate": 1.8758193376754442e-08, + "loss": 0.1241, + "step": 38438 + }, + { + "epoch": 0.9727206012602171, + "grad_norm": 7.167825222015381, + "learning_rate": 1.8723461605956173e-08, + "loss": 0.1764, + "step": 38439 + }, + { + "epoch": 0.9727459068249108, + "grad_norm": 3.912191152572632, + "learning_rate": 1.8688761958483014e-08, + "loss": 0.1278, + "step": 38440 + }, + { + "epoch": 0.9727712123896045, + "grad_norm": 3.922919273376465, + "learning_rate": 1.8654094434559788e-08, + "loss": 0.1872, + "step": 38441 + }, + { + "epoch": 0.9727965179542981, + "grad_norm": 4.361785411834717, + "learning_rate": 1.8619459034409648e-08, + "loss": 0.1142, + "step": 38442 + }, + { + "epoch": 0.9728218235189918, + "grad_norm": 7.98361349105835, + "learning_rate": 1.858485575825575e-08, + "loss": 0.2142, + "step": 38443 + }, + { + "epoch": 0.9728471290836855, + "grad_norm": 15.93877124786377, + "learning_rate": 1.85502846063218e-08, + "loss": 0.3831, + "step": 38444 + }, + { + "epoch": 0.9728724346483791, + "grad_norm": 6.870024681091309, + "learning_rate": 1.8515745578830403e-08, + "loss": 0.1655, + "step": 38445 + }, + { + "epoch": 0.9728977402130728, + "grad_norm": 3.7061688899993896, + "learning_rate": 1.8481238676004152e-08, + "loss": 0.1124, + "step": 38446 + }, + { + "epoch": 0.9729230457777666, + "grad_norm": 3.7036993503570557, + "learning_rate": 1.8446763898066213e-08, + "loss": 0.104, + "step": 38447 + }, + { + "epoch": 0.9729483513424602, + "grad_norm": 2.9069230556488037, + "learning_rate": 1.8412321245238064e-08, + "loss": 0.1525, + "step": 38448 + }, + { + "epoch": 0.9729736569071539, + "grad_norm": 5.73596715927124, + "learning_rate": 1.837791071774231e-08, + "loss": 0.2112, + "step": 38449 + }, + { + "epoch": 0.9729989624718476, + "grad_norm": 7.478957653045654, + "learning_rate": 1.8343532315800437e-08, + "loss": 0.0987, + "step": 38450 + }, + { + "epoch": 0.9730242680365412, + "grad_norm": 4.779991626739502, + "learning_rate": 1.8309186039635052e-08, + "loss": 0.102, + "step": 38451 + }, + { + "epoch": 0.9730495736012349, + "grad_norm": 5.978338241577148, + "learning_rate": 1.8274871889467083e-08, + "loss": 0.1575, + "step": 38452 + }, + { + "epoch": 0.9730748791659286, + "grad_norm": 6.549750328063965, + "learning_rate": 1.8240589865517465e-08, + "loss": 0.1281, + "step": 38453 + }, + { + "epoch": 0.9731001847306222, + "grad_norm": 3.281038284301758, + "learning_rate": 1.8206339968007136e-08, + "loss": 0.0893, + "step": 38454 + }, + { + "epoch": 0.9731254902953159, + "grad_norm": 2.911775588989258, + "learning_rate": 1.8172122197158136e-08, + "loss": 0.134, + "step": 38455 + }, + { + "epoch": 0.9731507958600096, + "grad_norm": 4.0035247802734375, + "learning_rate": 1.8137936553190295e-08, + "loss": 0.1583, + "step": 38456 + }, + { + "epoch": 0.9731761014247033, + "grad_norm": 3.6511175632476807, + "learning_rate": 1.810378303632454e-08, + "loss": 0.0704, + "step": 38457 + }, + { + "epoch": 0.9732014069893969, + "grad_norm": 8.403247833251953, + "learning_rate": 1.8069661646780144e-08, + "loss": 0.2328, + "step": 38458 + }, + { + "epoch": 0.9732267125540907, + "grad_norm": 6.68558931350708, + "learning_rate": 1.8035572384778043e-08, + "loss": 0.1512, + "step": 38459 + }, + { + "epoch": 0.9732520181187844, + "grad_norm": 3.260953426361084, + "learning_rate": 1.8001515250537505e-08, + "loss": 0.1092, + "step": 38460 + }, + { + "epoch": 0.973277323683478, + "grad_norm": 3.4244513511657715, + "learning_rate": 1.7967490244278907e-08, + "loss": 0.1419, + "step": 38461 + }, + { + "epoch": 0.9733026292481717, + "grad_norm": 6.395376205444336, + "learning_rate": 1.793349736622152e-08, + "loss": 0.1935, + "step": 38462 + }, + { + "epoch": 0.9733279348128654, + "grad_norm": 8.607311248779297, + "learning_rate": 1.78995366165835e-08, + "loss": 0.2472, + "step": 38463 + }, + { + "epoch": 0.973353240377559, + "grad_norm": 12.019994735717773, + "learning_rate": 1.786560799558523e-08, + "loss": 0.1073, + "step": 38464 + }, + { + "epoch": 0.9733785459422527, + "grad_norm": 6.186763763427734, + "learning_rate": 1.7831711503444317e-08, + "loss": 0.1342, + "step": 38465 + }, + { + "epoch": 0.9734038515069464, + "grad_norm": 5.775769233703613, + "learning_rate": 1.7797847140380576e-08, + "loss": 0.1371, + "step": 38466 + }, + { + "epoch": 0.97342915707164, + "grad_norm": 4.388359546661377, + "learning_rate": 1.776401490661106e-08, + "loss": 0.1442, + "step": 38467 + }, + { + "epoch": 0.9734544626363337, + "grad_norm": 3.9089159965515137, + "learning_rate": 1.7730214802355595e-08, + "loss": 0.1361, + "step": 38468 + }, + { + "epoch": 0.9734797682010274, + "grad_norm": 12.674114227294922, + "learning_rate": 1.7696446827830672e-08, + "loss": 0.2215, + "step": 38469 + }, + { + "epoch": 0.973505073765721, + "grad_norm": 8.752694129943848, + "learning_rate": 1.766271098325445e-08, + "loss": 0.1891, + "step": 38470 + }, + { + "epoch": 0.9735303793304148, + "grad_norm": 3.377286672592163, + "learning_rate": 1.7629007268844533e-08, + "loss": 0.1238, + "step": 38471 + }, + { + "epoch": 0.9735556848951085, + "grad_norm": 3.2462759017944336, + "learning_rate": 1.7595335684818525e-08, + "loss": 0.083, + "step": 38472 + }, + { + "epoch": 0.9735809904598021, + "grad_norm": 3.4815094470977783, + "learning_rate": 1.7561696231393477e-08, + "loss": 0.1514, + "step": 38473 + }, + { + "epoch": 0.9736062960244958, + "grad_norm": 3.176318645477295, + "learning_rate": 1.7528088908786435e-08, + "loss": 0.0913, + "step": 38474 + }, + { + "epoch": 0.9736316015891895, + "grad_norm": 17.049823760986328, + "learning_rate": 1.7494513717213335e-08, + "loss": 0.3184, + "step": 38475 + }, + { + "epoch": 0.9736569071538831, + "grad_norm": 5.314426898956299, + "learning_rate": 1.7460970656891786e-08, + "loss": 0.1048, + "step": 38476 + }, + { + "epoch": 0.9736822127185768, + "grad_norm": 6.270893096923828, + "learning_rate": 1.742745972803772e-08, + "loss": 0.2104, + "step": 38477 + }, + { + "epoch": 0.9737075182832705, + "grad_norm": 4.011619567871094, + "learning_rate": 1.7393980930867084e-08, + "loss": 0.1543, + "step": 38478 + }, + { + "epoch": 0.9737328238479641, + "grad_norm": 4.949777126312256, + "learning_rate": 1.7360534265595253e-08, + "loss": 0.1573, + "step": 38479 + }, + { + "epoch": 0.9737581294126578, + "grad_norm": 3.269498109817505, + "learning_rate": 1.7327119732439278e-08, + "loss": 0.1628, + "step": 38480 + }, + { + "epoch": 0.9737834349773515, + "grad_norm": 5.629993915557861, + "learning_rate": 1.729373733161288e-08, + "loss": 0.1682, + "step": 38481 + }, + { + "epoch": 0.9738087405420452, + "grad_norm": 3.801435947418213, + "learning_rate": 1.726038706333366e-08, + "loss": 0.1436, + "step": 38482 + }, + { + "epoch": 0.9738340461067388, + "grad_norm": 3.234326124191284, + "learning_rate": 1.722706892781423e-08, + "loss": 0.1336, + "step": 38483 + }, + { + "epoch": 0.9738593516714326, + "grad_norm": 5.2744669914245605, + "learning_rate": 1.7193782925271073e-08, + "loss": 0.234, + "step": 38484 + }, + { + "epoch": 0.9738846572361263, + "grad_norm": 6.824779987335205, + "learning_rate": 1.7160529055917918e-08, + "loss": 0.1266, + "step": 38485 + }, + { + "epoch": 0.9739099628008199, + "grad_norm": 2.380591869354248, + "learning_rate": 1.7127307319970144e-08, + "loss": 0.0827, + "step": 38486 + }, + { + "epoch": 0.9739352683655136, + "grad_norm": 5.462947845458984, + "learning_rate": 1.7094117717640914e-08, + "loss": 0.1729, + "step": 38487 + }, + { + "epoch": 0.9739605739302073, + "grad_norm": 3.772690534591675, + "learning_rate": 1.706096024914561e-08, + "loss": 0.0959, + "step": 38488 + }, + { + "epoch": 0.9739858794949009, + "grad_norm": 5.685307025909424, + "learning_rate": 1.702783491469684e-08, + "loss": 0.1712, + "step": 38489 + }, + { + "epoch": 0.9740111850595946, + "grad_norm": 4.300131320953369, + "learning_rate": 1.6994741714508324e-08, + "loss": 0.1371, + "step": 38490 + }, + { + "epoch": 0.9740364906242883, + "grad_norm": 17.344179153442383, + "learning_rate": 1.6961680648794333e-08, + "loss": 0.2134, + "step": 38491 + }, + { + "epoch": 0.9740617961889819, + "grad_norm": 2.412181854248047, + "learning_rate": 1.6928651717767474e-08, + "loss": 0.1029, + "step": 38492 + }, + { + "epoch": 0.9740871017536756, + "grad_norm": 4.173418998718262, + "learning_rate": 1.6895654921640357e-08, + "loss": 0.1576, + "step": 38493 + }, + { + "epoch": 0.9741124073183693, + "grad_norm": 5.057982921600342, + "learning_rate": 1.68626902606267e-08, + "loss": 0.1691, + "step": 38494 + }, + { + "epoch": 0.974137712883063, + "grad_norm": 5.300832748413086, + "learning_rate": 1.6829757734938557e-08, + "loss": 0.1452, + "step": 38495 + }, + { + "epoch": 0.9741630184477567, + "grad_norm": 4.838281154632568, + "learning_rate": 1.6796857344788532e-08, + "loss": 0.1131, + "step": 38496 + }, + { + "epoch": 0.9741883240124504, + "grad_norm": 3.3417017459869385, + "learning_rate": 1.6763989090388123e-08, + "loss": 0.1115, + "step": 38497 + }, + { + "epoch": 0.974213629577144, + "grad_norm": 4.908645153045654, + "learning_rate": 1.6731152971949937e-08, + "loss": 0.1606, + "step": 38498 + }, + { + "epoch": 0.9742389351418377, + "grad_norm": 4.113522529602051, + "learning_rate": 1.669834898968603e-08, + "loss": 0.1561, + "step": 38499 + }, + { + "epoch": 0.9742642407065314, + "grad_norm": 5.316141128540039, + "learning_rate": 1.6665577143806788e-08, + "loss": 0.184, + "step": 38500 + }, + { + "epoch": 0.974289546271225, + "grad_norm": 6.520687103271484, + "learning_rate": 1.6632837434524817e-08, + "loss": 0.1936, + "step": 38501 + }, + { + "epoch": 0.9743148518359187, + "grad_norm": 9.26645278930664, + "learning_rate": 1.660012986205051e-08, + "loss": 0.1729, + "step": 38502 + }, + { + "epoch": 0.9743401574006124, + "grad_norm": 3.087479591369629, + "learning_rate": 1.6567454426595354e-08, + "loss": 0.1247, + "step": 38503 + }, + { + "epoch": 0.974365462965306, + "grad_norm": 27.15609359741211, + "learning_rate": 1.6534811128369188e-08, + "loss": 0.1896, + "step": 38504 + }, + { + "epoch": 0.9743907685299997, + "grad_norm": 7.1782636642456055, + "learning_rate": 1.650219996758351e-08, + "loss": 0.2836, + "step": 38505 + }, + { + "epoch": 0.9744160740946934, + "grad_norm": 7.057925701141357, + "learning_rate": 1.6469620944447595e-08, + "loss": 0.1534, + "step": 38506 + }, + { + "epoch": 0.9744413796593872, + "grad_norm": 5.6124162673950195, + "learning_rate": 1.6437074059172386e-08, + "loss": 0.2007, + "step": 38507 + }, + { + "epoch": 0.9744666852240808, + "grad_norm": 3.456655740737915, + "learning_rate": 1.640455931196716e-08, + "loss": 0.0937, + "step": 38508 + }, + { + "epoch": 0.9744919907887745, + "grad_norm": 9.305453300476074, + "learning_rate": 1.6372076703042306e-08, + "loss": 0.2351, + "step": 38509 + }, + { + "epoch": 0.9745172963534682, + "grad_norm": 9.686846733093262, + "learning_rate": 1.6339626232606543e-08, + "loss": 0.2447, + "step": 38510 + }, + { + "epoch": 0.9745426019181618, + "grad_norm": 3.1147282123565674, + "learning_rate": 1.6307207900869704e-08, + "loss": 0.1311, + "step": 38511 + }, + { + "epoch": 0.9745679074828555, + "grad_norm": 6.371681213378906, + "learning_rate": 1.6274821708040513e-08, + "loss": 0.1009, + "step": 38512 + }, + { + "epoch": 0.9745932130475492, + "grad_norm": 3.835089683532715, + "learning_rate": 1.6242467654328797e-08, + "loss": 0.1182, + "step": 38513 + }, + { + "epoch": 0.9746185186122428, + "grad_norm": 9.88306999206543, + "learning_rate": 1.6210145739941064e-08, + "loss": 0.109, + "step": 38514 + }, + { + "epoch": 0.9746438241769365, + "grad_norm": 18.533363342285156, + "learning_rate": 1.6177855965087696e-08, + "loss": 0.345, + "step": 38515 + }, + { + "epoch": 0.9746691297416302, + "grad_norm": 6.022028923034668, + "learning_rate": 1.6145598329976308e-08, + "loss": 0.1088, + "step": 38516 + }, + { + "epoch": 0.9746944353063238, + "grad_norm": 3.4483397006988525, + "learning_rate": 1.6113372834815066e-08, + "loss": 0.0909, + "step": 38517 + }, + { + "epoch": 0.9747197408710175, + "grad_norm": 2.502967119216919, + "learning_rate": 1.6081179479811025e-08, + "loss": 0.0543, + "step": 38518 + }, + { + "epoch": 0.9747450464357112, + "grad_norm": 3.7924835681915283, + "learning_rate": 1.604901826517291e-08, + "loss": 0.1884, + "step": 38519 + }, + { + "epoch": 0.9747703520004048, + "grad_norm": 2.3969874382019043, + "learning_rate": 1.601688919110722e-08, + "loss": 0.0924, + "step": 38520 + }, + { + "epoch": 0.9747956575650986, + "grad_norm": 4.300082683563232, + "learning_rate": 1.598479225782157e-08, + "loss": 0.1484, + "step": 38521 + }, + { + "epoch": 0.9748209631297923, + "grad_norm": 5.60748291015625, + "learning_rate": 1.595272746552301e-08, + "loss": 0.1592, + "step": 38522 + }, + { + "epoch": 0.9748462686944859, + "grad_norm": 8.069822311401367, + "learning_rate": 1.59206948144186e-08, + "loss": 0.2335, + "step": 38523 + }, + { + "epoch": 0.9748715742591796, + "grad_norm": 9.569068908691406, + "learning_rate": 1.5888694304713736e-08, + "loss": 0.2406, + "step": 38524 + }, + { + "epoch": 0.9748968798238733, + "grad_norm": 9.176494598388672, + "learning_rate": 1.5856725936616025e-08, + "loss": 0.2121, + "step": 38525 + }, + { + "epoch": 0.9749221853885669, + "grad_norm": 6.281101226806641, + "learning_rate": 1.582478971033141e-08, + "loss": 0.1615, + "step": 38526 + }, + { + "epoch": 0.9749474909532606, + "grad_norm": 3.9938712120056152, + "learning_rate": 1.579288562606529e-08, + "loss": 0.1384, + "step": 38527 + }, + { + "epoch": 0.9749727965179543, + "grad_norm": 3.951096534729004, + "learning_rate": 1.5761013684023606e-08, + "loss": 0.1142, + "step": 38528 + }, + { + "epoch": 0.9749981020826479, + "grad_norm": 4.200919151306152, + "learning_rate": 1.5729173884411754e-08, + "loss": 0.1087, + "step": 38529 + }, + { + "epoch": 0.9750234076473416, + "grad_norm": 4.829130172729492, + "learning_rate": 1.5697366227435674e-08, + "loss": 0.1468, + "step": 38530 + }, + { + "epoch": 0.9750487132120353, + "grad_norm": 4.523000240325928, + "learning_rate": 1.5665590713300204e-08, + "loss": 0.1321, + "step": 38531 + }, + { + "epoch": 0.9750740187767291, + "grad_norm": 9.078895568847656, + "learning_rate": 1.5633847342210185e-08, + "loss": 0.2013, + "step": 38532 + }, + { + "epoch": 0.9750993243414227, + "grad_norm": 8.621142387390137, + "learning_rate": 1.560213611437045e-08, + "loss": 0.2009, + "step": 38533 + }, + { + "epoch": 0.9751246299061164, + "grad_norm": 2.4415459632873535, + "learning_rate": 1.557045702998472e-08, + "loss": 0.073, + "step": 38534 + }, + { + "epoch": 0.9751499354708101, + "grad_norm": 3.6001179218292236, + "learning_rate": 1.5538810089258393e-08, + "loss": 0.1181, + "step": 38535 + }, + { + "epoch": 0.9751752410355037, + "grad_norm": 11.008964538574219, + "learning_rate": 1.5507195292395195e-08, + "loss": 0.1893, + "step": 38536 + }, + { + "epoch": 0.9752005466001974, + "grad_norm": 4.249380588531494, + "learning_rate": 1.5475612639598848e-08, + "loss": 0.171, + "step": 38537 + }, + { + "epoch": 0.9752258521648911, + "grad_norm": 9.695930480957031, + "learning_rate": 1.544406213107308e-08, + "loss": 0.1277, + "step": 38538 + }, + { + "epoch": 0.9752511577295847, + "grad_norm": 3.676649570465088, + "learning_rate": 1.541254376702106e-08, + "loss": 0.1143, + "step": 38539 + }, + { + "epoch": 0.9752764632942784, + "grad_norm": 3.065859794616699, + "learning_rate": 1.538105754764707e-08, + "loss": 0.1005, + "step": 38540 + }, + { + "epoch": 0.9753017688589721, + "grad_norm": 4.820611000061035, + "learning_rate": 1.534960347315262e-08, + "loss": 0.1499, + "step": 38541 + }, + { + "epoch": 0.9753270744236657, + "grad_norm": 6.824508190155029, + "learning_rate": 1.5318181543741984e-08, + "loss": 0.1802, + "step": 38542 + }, + { + "epoch": 0.9753523799883594, + "grad_norm": 8.210411071777344, + "learning_rate": 1.528679175961667e-08, + "loss": 0.179, + "step": 38543 + }, + { + "epoch": 0.9753776855530532, + "grad_norm": 2.5847065448760986, + "learning_rate": 1.525543412097985e-08, + "loss": 0.0848, + "step": 38544 + }, + { + "epoch": 0.9754029911177468, + "grad_norm": 4.998493671417236, + "learning_rate": 1.5224108628033584e-08, + "loss": 0.1335, + "step": 38545 + }, + { + "epoch": 0.9754282966824405, + "grad_norm": 4.143777370452881, + "learning_rate": 1.5192815280979377e-08, + "loss": 0.1263, + "step": 38546 + }, + { + "epoch": 0.9754536022471342, + "grad_norm": 2.896529197692871, + "learning_rate": 1.5161554080019848e-08, + "loss": 0.078, + "step": 38547 + }, + { + "epoch": 0.9754789078118278, + "grad_norm": 4.874638557434082, + "learning_rate": 1.513032502535594e-08, + "loss": 0.117, + "step": 38548 + }, + { + "epoch": 0.9755042133765215, + "grad_norm": 3.7211949825286865, + "learning_rate": 1.509912811718972e-08, + "loss": 0.1478, + "step": 38549 + }, + { + "epoch": 0.9755295189412152, + "grad_norm": 6.011648178100586, + "learning_rate": 1.5067963355721582e-08, + "loss": 0.1265, + "step": 38550 + }, + { + "epoch": 0.9755548245059088, + "grad_norm": 4.65442419052124, + "learning_rate": 1.503683074115303e-08, + "loss": 0.1243, + "step": 38551 + }, + { + "epoch": 0.9755801300706025, + "grad_norm": 5.377139568328857, + "learning_rate": 1.500573027368446e-08, + "loss": 0.1225, + "step": 38552 + }, + { + "epoch": 0.9756054356352962, + "grad_norm": 6.1632795333862305, + "learning_rate": 1.4974661953517378e-08, + "loss": 0.155, + "step": 38553 + }, + { + "epoch": 0.9756307411999898, + "grad_norm": 4.233085632324219, + "learning_rate": 1.4943625780850512e-08, + "loss": 0.1622, + "step": 38554 + }, + { + "epoch": 0.9756560467646835, + "grad_norm": 6.6093430519104, + "learning_rate": 1.491262175588537e-08, + "loss": 0.2013, + "step": 38555 + }, + { + "epoch": 0.9756813523293772, + "grad_norm": 10.204551696777344, + "learning_rate": 1.4881649878821236e-08, + "loss": 0.1686, + "step": 38556 + }, + { + "epoch": 0.975706657894071, + "grad_norm": 4.505552768707275, + "learning_rate": 1.4850710149858505e-08, + "loss": 0.1995, + "step": 38557 + }, + { + "epoch": 0.9757319634587646, + "grad_norm": 4.185281753540039, + "learning_rate": 1.4819802569195906e-08, + "loss": 0.1407, + "step": 38558 + }, + { + "epoch": 0.9757572690234583, + "grad_norm": 3.933598279953003, + "learning_rate": 1.4788927137033282e-08, + "loss": 0.1553, + "step": 38559 + }, + { + "epoch": 0.975782574588152, + "grad_norm": 5.865687847137451, + "learning_rate": 1.4758083853568805e-08, + "loss": 0.1673, + "step": 38560 + }, + { + "epoch": 0.9758078801528456, + "grad_norm": 2.743574619293213, + "learning_rate": 1.4727272719002873e-08, + "loss": 0.1133, + "step": 38561 + }, + { + "epoch": 0.9758331857175393, + "grad_norm": 5.313058853149414, + "learning_rate": 1.4696493733533101e-08, + "loss": 0.1517, + "step": 38562 + }, + { + "epoch": 0.975858491282233, + "grad_norm": 3.6440067291259766, + "learning_rate": 1.4665746897358778e-08, + "loss": 0.1335, + "step": 38563 + }, + { + "epoch": 0.9758837968469266, + "grad_norm": 3.413386344909668, + "learning_rate": 1.463503221067697e-08, + "loss": 0.1255, + "step": 38564 + }, + { + "epoch": 0.9759091024116203, + "grad_norm": 7.897712230682373, + "learning_rate": 1.4604349673686956e-08, + "loss": 0.189, + "step": 38565 + }, + { + "epoch": 0.975934407976314, + "grad_norm": 5.283727645874023, + "learning_rate": 1.4573699286586362e-08, + "loss": 0.1675, + "step": 38566 + }, + { + "epoch": 0.9759597135410076, + "grad_norm": 4.786362648010254, + "learning_rate": 1.454308104957225e-08, + "loss": 0.1634, + "step": 38567 + }, + { + "epoch": 0.9759850191057013, + "grad_norm": 6.725019931793213, + "learning_rate": 1.4512494962842238e-08, + "loss": 0.1244, + "step": 38568 + }, + { + "epoch": 0.9760103246703951, + "grad_norm": 4.0948286056518555, + "learning_rate": 1.4481941026593949e-08, + "loss": 0.085, + "step": 38569 + }, + { + "epoch": 0.9760356302350887, + "grad_norm": 3.1706631183624268, + "learning_rate": 1.4451419241024445e-08, + "loss": 0.1334, + "step": 38570 + }, + { + "epoch": 0.9760609357997824, + "grad_norm": 6.016030311584473, + "learning_rate": 1.4420929606330236e-08, + "loss": 0.1022, + "step": 38571 + }, + { + "epoch": 0.9760862413644761, + "grad_norm": 5.389466285705566, + "learning_rate": 1.4390472122708388e-08, + "loss": 0.1602, + "step": 38572 + }, + { + "epoch": 0.9761115469291697, + "grad_norm": 6.085590839385986, + "learning_rate": 1.4360046790354299e-08, + "loss": 0.1589, + "step": 38573 + }, + { + "epoch": 0.9761368524938634, + "grad_norm": 2.346559524536133, + "learning_rate": 1.4329653609465589e-08, + "loss": 0.076, + "step": 38574 + }, + { + "epoch": 0.9761621580585571, + "grad_norm": 4.845623970031738, + "learning_rate": 1.42992925802371e-08, + "loss": 0.1917, + "step": 38575 + }, + { + "epoch": 0.9761874636232507, + "grad_norm": 4.51082706451416, + "learning_rate": 1.4268963702865346e-08, + "loss": 0.1224, + "step": 38576 + }, + { + "epoch": 0.9762127691879444, + "grad_norm": 9.36451244354248, + "learning_rate": 1.4238666977545167e-08, + "loss": 0.1753, + "step": 38577 + }, + { + "epoch": 0.9762380747526381, + "grad_norm": 3.6667234897613525, + "learning_rate": 1.420840240447252e-08, + "loss": 0.1315, + "step": 38578 + }, + { + "epoch": 0.9762633803173317, + "grad_norm": 7.68732213973999, + "learning_rate": 1.4178169983842804e-08, + "loss": 0.1907, + "step": 38579 + }, + { + "epoch": 0.9762886858820254, + "grad_norm": 6.174713611602783, + "learning_rate": 1.4147969715850306e-08, + "loss": 0.168, + "step": 38580 + }, + { + "epoch": 0.9763139914467192, + "grad_norm": 2.995750904083252, + "learning_rate": 1.4117801600690429e-08, + "loss": 0.1289, + "step": 38581 + }, + { + "epoch": 0.9763392970114128, + "grad_norm": 3.8091208934783936, + "learning_rate": 1.4087665638557457e-08, + "loss": 0.1376, + "step": 38582 + }, + { + "epoch": 0.9763646025761065, + "grad_norm": 3.8229165077209473, + "learning_rate": 1.4057561829645128e-08, + "loss": 0.1058, + "step": 38583 + }, + { + "epoch": 0.9763899081408002, + "grad_norm": 4.755871295928955, + "learning_rate": 1.4027490174148839e-08, + "loss": 0.0828, + "step": 38584 + }, + { + "epoch": 0.9764152137054939, + "grad_norm": 8.143266677856445, + "learning_rate": 1.3997450672261215e-08, + "loss": 0.2166, + "step": 38585 + }, + { + "epoch": 0.9764405192701875, + "grad_norm": 7.4620442390441895, + "learning_rate": 1.3967443324176545e-08, + "loss": 0.1488, + "step": 38586 + }, + { + "epoch": 0.9764658248348812, + "grad_norm": 4.809340953826904, + "learning_rate": 1.393746813008856e-08, + "loss": 0.1118, + "step": 38587 + }, + { + "epoch": 0.9764911303995749, + "grad_norm": 3.121122121810913, + "learning_rate": 1.3907525090190444e-08, + "loss": 0.1143, + "step": 38588 + }, + { + "epoch": 0.9765164359642685, + "grad_norm": 6.179849147796631, + "learning_rate": 1.3877614204675372e-08, + "loss": 0.1612, + "step": 38589 + }, + { + "epoch": 0.9765417415289622, + "grad_norm": 3.7228221893310547, + "learning_rate": 1.3847735473735414e-08, + "loss": 0.1491, + "step": 38590 + }, + { + "epoch": 0.9765670470936559, + "grad_norm": 3.2073564529418945, + "learning_rate": 1.3817888897564302e-08, + "loss": 0.1237, + "step": 38591 + }, + { + "epoch": 0.9765923526583495, + "grad_norm": 4.037370204925537, + "learning_rate": 1.3788074476354663e-08, + "loss": 0.1456, + "step": 38592 + }, + { + "epoch": 0.9766176582230433, + "grad_norm": 7.639433860778809, + "learning_rate": 1.3758292210297453e-08, + "loss": 0.2661, + "step": 38593 + }, + { + "epoch": 0.976642963787737, + "grad_norm": 2.967811107635498, + "learning_rate": 1.372854209958585e-08, + "loss": 0.0767, + "step": 38594 + }, + { + "epoch": 0.9766682693524306, + "grad_norm": 3.519521474838257, + "learning_rate": 1.3698824144411371e-08, + "loss": 0.1297, + "step": 38595 + }, + { + "epoch": 0.9766935749171243, + "grad_norm": 3.6775095462799072, + "learning_rate": 1.3669138344965527e-08, + "loss": 0.113, + "step": 38596 + }, + { + "epoch": 0.976718880481818, + "grad_norm": 3.761824607849121, + "learning_rate": 1.3639484701439831e-08, + "loss": 0.1374, + "step": 38597 + }, + { + "epoch": 0.9767441860465116, + "grad_norm": 14.332207679748535, + "learning_rate": 1.3609863214025799e-08, + "loss": 0.3004, + "step": 38598 + }, + { + "epoch": 0.9767694916112053, + "grad_norm": 3.1646339893341064, + "learning_rate": 1.358027388291383e-08, + "loss": 0.1058, + "step": 38599 + }, + { + "epoch": 0.976794797175899, + "grad_norm": 7.7760233879089355, + "learning_rate": 1.3550716708295441e-08, + "loss": 0.2146, + "step": 38600 + }, + { + "epoch": 0.9768201027405926, + "grad_norm": 5.74631404876709, + "learning_rate": 1.3521191690361035e-08, + "loss": 0.138, + "step": 38601 + }, + { + "epoch": 0.9768454083052863, + "grad_norm": 9.565934181213379, + "learning_rate": 1.3491698829301015e-08, + "loss": 0.2664, + "step": 38602 + }, + { + "epoch": 0.97687071386998, + "grad_norm": 7.2524542808532715, + "learning_rate": 1.3462238125304672e-08, + "loss": 0.1959, + "step": 38603 + }, + { + "epoch": 0.9768960194346736, + "grad_norm": 3.9477717876434326, + "learning_rate": 1.3432809578563521e-08, + "loss": 0.1988, + "step": 38604 + }, + { + "epoch": 0.9769213249993673, + "grad_norm": 3.593374252319336, + "learning_rate": 1.3403413189266301e-08, + "loss": 0.0645, + "step": 38605 + }, + { + "epoch": 0.9769466305640611, + "grad_norm": 4.898210048675537, + "learning_rate": 1.3374048957603414e-08, + "loss": 0.123, + "step": 38606 + }, + { + "epoch": 0.9769719361287547, + "grad_norm": 11.446330070495605, + "learning_rate": 1.3344716883763042e-08, + "loss": 0.229, + "step": 38607 + }, + { + "epoch": 0.9769972416934484, + "grad_norm": 10.59363079071045, + "learning_rate": 1.3315416967935036e-08, + "loss": 0.2781, + "step": 38608 + }, + { + "epoch": 0.9770225472581421, + "grad_norm": 5.897510528564453, + "learning_rate": 1.3286149210308686e-08, + "loss": 0.123, + "step": 38609 + }, + { + "epoch": 0.9770478528228358, + "grad_norm": 4.435214042663574, + "learning_rate": 1.3256913611072175e-08, + "loss": 0.1311, + "step": 38610 + }, + { + "epoch": 0.9770731583875294, + "grad_norm": 5.364931106567383, + "learning_rate": 1.3227710170414242e-08, + "loss": 0.1176, + "step": 38611 + }, + { + "epoch": 0.9770984639522231, + "grad_norm": 4.080547332763672, + "learning_rate": 1.319853888852307e-08, + "loss": 0.1424, + "step": 38612 + }, + { + "epoch": 0.9771237695169168, + "grad_norm": 2.8981409072875977, + "learning_rate": 1.3169399765587398e-08, + "loss": 0.0974, + "step": 38613 + }, + { + "epoch": 0.9771490750816104, + "grad_norm": 4.048010349273682, + "learning_rate": 1.314029280179374e-08, + "loss": 0.1015, + "step": 38614 + }, + { + "epoch": 0.9771743806463041, + "grad_norm": 5.853267669677734, + "learning_rate": 1.3111217997331394e-08, + "loss": 0.1187, + "step": 38615 + }, + { + "epoch": 0.9771996862109978, + "grad_norm": 6.0834431648254395, + "learning_rate": 1.3082175352386872e-08, + "loss": 0.1173, + "step": 38616 + }, + { + "epoch": 0.9772249917756914, + "grad_norm": 5.396968841552734, + "learning_rate": 1.305316486714836e-08, + "loss": 0.194, + "step": 38617 + }, + { + "epoch": 0.9772502973403852, + "grad_norm": 7.594748020172119, + "learning_rate": 1.302418654180182e-08, + "loss": 0.2271, + "step": 38618 + }, + { + "epoch": 0.9772756029050789, + "grad_norm": 4.192966461181641, + "learning_rate": 1.2995240376534878e-08, + "loss": 0.1432, + "step": 38619 + }, + { + "epoch": 0.9773009084697725, + "grad_norm": 5.69727087020874, + "learning_rate": 1.2966326371533499e-08, + "loss": 0.1428, + "step": 38620 + }, + { + "epoch": 0.9773262140344662, + "grad_norm": 4.5834574699401855, + "learning_rate": 1.293744452698531e-08, + "loss": 0.1688, + "step": 38621 + }, + { + "epoch": 0.9773515195991599, + "grad_norm": 5.702670097351074, + "learning_rate": 1.2908594843075162e-08, + "loss": 0.121, + "step": 38622 + }, + { + "epoch": 0.9773768251638535, + "grad_norm": 6.049283027648926, + "learning_rate": 1.2879777319990128e-08, + "loss": 0.1737, + "step": 38623 + }, + { + "epoch": 0.9774021307285472, + "grad_norm": 3.892690420150757, + "learning_rate": 1.2850991957915616e-08, + "loss": 0.1449, + "step": 38624 + }, + { + "epoch": 0.9774274362932409, + "grad_norm": 3.1960911750793457, + "learning_rate": 1.2822238757037586e-08, + "loss": 0.0988, + "step": 38625 + }, + { + "epoch": 0.9774527418579345, + "grad_norm": 4.727715015411377, + "learning_rate": 1.2793517717541449e-08, + "loss": 0.1476, + "step": 38626 + }, + { + "epoch": 0.9774780474226282, + "grad_norm": 6.0225300788879395, + "learning_rate": 1.2764828839612054e-08, + "loss": 0.1086, + "step": 38627 + }, + { + "epoch": 0.9775033529873219, + "grad_norm": 2.8181021213531494, + "learning_rate": 1.2736172123434254e-08, + "loss": 0.113, + "step": 38628 + }, + { + "epoch": 0.9775286585520155, + "grad_norm": 8.988103866577148, + "learning_rate": 1.2707547569193456e-08, + "loss": 0.2676, + "step": 38629 + }, + { + "epoch": 0.9775539641167093, + "grad_norm": 8.819294929504395, + "learning_rate": 1.267895517707396e-08, + "loss": 0.164, + "step": 38630 + }, + { + "epoch": 0.977579269681403, + "grad_norm": 7.231213092803955, + "learning_rate": 1.2650394947260059e-08, + "loss": 0.1616, + "step": 38631 + }, + { + "epoch": 0.9776045752460966, + "grad_norm": 5.725639820098877, + "learning_rate": 1.262186687993605e-08, + "loss": 0.1133, + "step": 38632 + }, + { + "epoch": 0.9776298808107903, + "grad_norm": 2.1809329986572266, + "learning_rate": 1.2593370975286234e-08, + "loss": 0.1029, + "step": 38633 + }, + { + "epoch": 0.977655186375484, + "grad_norm": 5.188260555267334, + "learning_rate": 1.2564907233493794e-08, + "loss": 0.15, + "step": 38634 + }, + { + "epoch": 0.9776804919401777, + "grad_norm": 5.929401397705078, + "learning_rate": 1.2536475654742475e-08, + "loss": 0.1404, + "step": 38635 + }, + { + "epoch": 0.9777057975048713, + "grad_norm": 3.7937417030334473, + "learning_rate": 1.2508076239216015e-08, + "loss": 0.1505, + "step": 38636 + }, + { + "epoch": 0.977731103069565, + "grad_norm": 10.835010528564453, + "learning_rate": 1.2479708987096495e-08, + "loss": 0.178, + "step": 38637 + }, + { + "epoch": 0.9777564086342587, + "grad_norm": 3.6153993606567383, + "learning_rate": 1.2451373898568208e-08, + "loss": 0.0933, + "step": 38638 + }, + { + "epoch": 0.9777817141989523, + "grad_norm": 6.356688499450684, + "learning_rate": 1.2423070973813235e-08, + "loss": 0.193, + "step": 38639 + }, + { + "epoch": 0.977807019763646, + "grad_norm": 8.409768104553223, + "learning_rate": 1.2394800213013647e-08, + "loss": 0.2025, + "step": 38640 + }, + { + "epoch": 0.9778323253283397, + "grad_norm": 3.875969171524048, + "learning_rate": 1.2366561616352634e-08, + "loss": 0.1186, + "step": 38641 + }, + { + "epoch": 0.9778576308930333, + "grad_norm": 4.64092493057251, + "learning_rate": 1.2338355184012275e-08, + "loss": 0.1433, + "step": 38642 + }, + { + "epoch": 0.9778829364577271, + "grad_norm": 4.32244873046875, + "learning_rate": 1.231018091617353e-08, + "loss": 0.0936, + "step": 38643 + }, + { + "epoch": 0.9779082420224208, + "grad_norm": 3.494166851043701, + "learning_rate": 1.2282038813018482e-08, + "loss": 0.0916, + "step": 38644 + }, + { + "epoch": 0.9779335475871144, + "grad_norm": 10.143672943115234, + "learning_rate": 1.2253928874729204e-08, + "loss": 0.1138, + "step": 38645 + }, + { + "epoch": 0.9779588531518081, + "grad_norm": 5.483772277832031, + "learning_rate": 1.2225851101486108e-08, + "loss": 0.1606, + "step": 38646 + }, + { + "epoch": 0.9779841587165018, + "grad_norm": 5.738522052764893, + "learning_rate": 1.219780549347127e-08, + "loss": 0.147, + "step": 38647 + }, + { + "epoch": 0.9780094642811954, + "grad_norm": 7.692242622375488, + "learning_rate": 1.2169792050864549e-08, + "loss": 0.2885, + "step": 38648 + }, + { + "epoch": 0.9780347698458891, + "grad_norm": 5.008077621459961, + "learning_rate": 1.2141810773846908e-08, + "loss": 0.1, + "step": 38649 + }, + { + "epoch": 0.9780600754105828, + "grad_norm": 4.251691818237305, + "learning_rate": 1.2113861662599314e-08, + "loss": 0.1503, + "step": 38650 + }, + { + "epoch": 0.9780853809752764, + "grad_norm": 4.047379493713379, + "learning_rate": 1.2085944717301623e-08, + "loss": 0.1381, + "step": 38651 + }, + { + "epoch": 0.9781106865399701, + "grad_norm": 4.871676921844482, + "learning_rate": 1.2058059938133693e-08, + "loss": 0.1269, + "step": 38652 + }, + { + "epoch": 0.9781359921046638, + "grad_norm": 13.454212188720703, + "learning_rate": 1.2030207325275378e-08, + "loss": 0.1641, + "step": 38653 + }, + { + "epoch": 0.9781612976693574, + "grad_norm": 12.467480659484863, + "learning_rate": 1.2002386878906535e-08, + "loss": 0.2434, + "step": 38654 + }, + { + "epoch": 0.9781866032340512, + "grad_norm": 4.672738075256348, + "learning_rate": 1.1974598599206467e-08, + "loss": 0.0931, + "step": 38655 + }, + { + "epoch": 0.9782119087987449, + "grad_norm": 3.945802688598633, + "learning_rate": 1.194684248635447e-08, + "loss": 0.1927, + "step": 38656 + }, + { + "epoch": 0.9782372143634385, + "grad_norm": 19.369091033935547, + "learning_rate": 1.1919118540529294e-08, + "loss": 0.2732, + "step": 38657 + }, + { + "epoch": 0.9782625199281322, + "grad_norm": 3.6175992488861084, + "learning_rate": 1.1891426761909685e-08, + "loss": 0.0703, + "step": 38658 + }, + { + "epoch": 0.9782878254928259, + "grad_norm": 2.5814332962036133, + "learning_rate": 1.1863767150674944e-08, + "loss": 0.0894, + "step": 38659 + }, + { + "epoch": 0.9783131310575196, + "grad_norm": 3.6979963779449463, + "learning_rate": 1.1836139707002703e-08, + "loss": 0.1383, + "step": 38660 + }, + { + "epoch": 0.9783384366222132, + "grad_norm": 10.993025779724121, + "learning_rate": 1.1808544431071156e-08, + "loss": 0.2897, + "step": 38661 + }, + { + "epoch": 0.9783637421869069, + "grad_norm": 2.648456573486328, + "learning_rate": 1.1780981323058494e-08, + "loss": 0.1087, + "step": 38662 + }, + { + "epoch": 0.9783890477516006, + "grad_norm": 5.983787536621094, + "learning_rate": 1.1753450383142351e-08, + "loss": 0.1533, + "step": 38663 + }, + { + "epoch": 0.9784143533162942, + "grad_norm": 8.066361427307129, + "learning_rate": 1.1725951611500363e-08, + "loss": 0.2508, + "step": 38664 + }, + { + "epoch": 0.9784396588809879, + "grad_norm": 3.3996975421905518, + "learning_rate": 1.1698485008309612e-08, + "loss": 0.113, + "step": 38665 + }, + { + "epoch": 0.9784649644456817, + "grad_norm": 9.574400901794434, + "learning_rate": 1.1671050573747733e-08, + "loss": 0.202, + "step": 38666 + }, + { + "epoch": 0.9784902700103753, + "grad_norm": 4.654125213623047, + "learning_rate": 1.1643648307991252e-08, + "loss": 0.1402, + "step": 38667 + }, + { + "epoch": 0.978515575575069, + "grad_norm": 6.415265083312988, + "learning_rate": 1.1616278211217246e-08, + "loss": 0.1962, + "step": 38668 + }, + { + "epoch": 0.9785408811397627, + "grad_norm": 4.051852703094482, + "learning_rate": 1.158894028360169e-08, + "loss": 0.0776, + "step": 38669 + }, + { + "epoch": 0.9785661867044563, + "grad_norm": 7.15504789352417, + "learning_rate": 1.1561634525321107e-08, + "loss": 0.1688, + "step": 38670 + }, + { + "epoch": 0.97859149226915, + "grad_norm": 3.519552230834961, + "learning_rate": 1.1534360936552025e-08, + "loss": 0.1089, + "step": 38671 + }, + { + "epoch": 0.9786167978338437, + "grad_norm": 22.17568588256836, + "learning_rate": 1.15071195174693e-08, + "loss": 0.1761, + "step": 38672 + }, + { + "epoch": 0.9786421033985373, + "grad_norm": 3.8558175563812256, + "learning_rate": 1.1479910268249462e-08, + "loss": 0.13, + "step": 38673 + }, + { + "epoch": 0.978667408963231, + "grad_norm": 4.654116153717041, + "learning_rate": 1.1452733189067921e-08, + "loss": 0.1389, + "step": 38674 + }, + { + "epoch": 0.9786927145279247, + "grad_norm": 9.853852272033691, + "learning_rate": 1.1425588280099542e-08, + "loss": 0.2907, + "step": 38675 + }, + { + "epoch": 0.9787180200926183, + "grad_norm": 4.767348289489746, + "learning_rate": 1.1398475541519739e-08, + "loss": 0.1771, + "step": 38676 + }, + { + "epoch": 0.978743325657312, + "grad_norm": 23.233516693115234, + "learning_rate": 1.1371394973503368e-08, + "loss": 0.1331, + "step": 38677 + }, + { + "epoch": 0.9787686312220057, + "grad_norm": 6.87605094909668, + "learning_rate": 1.1344346576224186e-08, + "loss": 0.136, + "step": 38678 + }, + { + "epoch": 0.9787939367866993, + "grad_norm": 5.0839362144470215, + "learning_rate": 1.1317330349858158e-08, + "loss": 0.1625, + "step": 38679 + }, + { + "epoch": 0.9788192423513931, + "grad_norm": 6.738631725311279, + "learning_rate": 1.129034629457848e-08, + "loss": 0.199, + "step": 38680 + }, + { + "epoch": 0.9788445479160868, + "grad_norm": 3.5692503452301025, + "learning_rate": 1.1263394410559458e-08, + "loss": 0.1341, + "step": 38681 + }, + { + "epoch": 0.9788698534807804, + "grad_norm": 2.581752300262451, + "learning_rate": 1.1236474697974287e-08, + "loss": 0.1223, + "step": 38682 + }, + { + "epoch": 0.9788951590454741, + "grad_norm": 12.26230525970459, + "learning_rate": 1.1209587156997825e-08, + "loss": 0.1913, + "step": 38683 + }, + { + "epoch": 0.9789204646101678, + "grad_norm": 9.171477317810059, + "learning_rate": 1.1182731787802159e-08, + "loss": 0.2169, + "step": 38684 + }, + { + "epoch": 0.9789457701748615, + "grad_norm": 5.13419246673584, + "learning_rate": 1.1155908590561593e-08, + "loss": 0.1299, + "step": 38685 + }, + { + "epoch": 0.9789710757395551, + "grad_norm": 2.944340705871582, + "learning_rate": 1.112911756544821e-08, + "loss": 0.1269, + "step": 38686 + }, + { + "epoch": 0.9789963813042488, + "grad_norm": 13.966984748840332, + "learning_rate": 1.1102358712635208e-08, + "loss": 0.195, + "step": 38687 + }, + { + "epoch": 0.9790216868689425, + "grad_norm": 8.67480754852295, + "learning_rate": 1.1075632032295225e-08, + "loss": 0.1582, + "step": 38688 + }, + { + "epoch": 0.9790469924336361, + "grad_norm": 4.55618143081665, + "learning_rate": 1.1048937524600345e-08, + "loss": 0.1222, + "step": 38689 + }, + { + "epoch": 0.9790722979983298, + "grad_norm": 5.330175876617432, + "learning_rate": 1.1022275189723208e-08, + "loss": 0.1907, + "step": 38690 + }, + { + "epoch": 0.9790976035630236, + "grad_norm": 5.312991619110107, + "learning_rate": 1.0995645027834789e-08, + "loss": 0.1083, + "step": 38691 + }, + { + "epoch": 0.9791229091277172, + "grad_norm": 3.98923397064209, + "learning_rate": 1.0969047039107727e-08, + "loss": 0.1415, + "step": 38692 + }, + { + "epoch": 0.9791482146924109, + "grad_norm": 7.268927574157715, + "learning_rate": 1.0942481223713552e-08, + "loss": 0.1275, + "step": 38693 + }, + { + "epoch": 0.9791735202571046, + "grad_norm": 4.157569408416748, + "learning_rate": 1.0915947581822683e-08, + "loss": 0.1062, + "step": 38694 + }, + { + "epoch": 0.9791988258217982, + "grad_norm": 4.571986675262451, + "learning_rate": 1.0889446113607204e-08, + "loss": 0.1285, + "step": 38695 + }, + { + "epoch": 0.9792241313864919, + "grad_norm": 11.131044387817383, + "learning_rate": 1.086297681923809e-08, + "loss": 0.1529, + "step": 38696 + }, + { + "epoch": 0.9792494369511856, + "grad_norm": 5.386063098907471, + "learning_rate": 1.0836539698884651e-08, + "loss": 0.1516, + "step": 38697 + }, + { + "epoch": 0.9792747425158792, + "grad_norm": 9.96257209777832, + "learning_rate": 1.081013475271897e-08, + "loss": 0.2588, + "step": 38698 + }, + { + "epoch": 0.9793000480805729, + "grad_norm": 3.8093955516815186, + "learning_rate": 1.0783761980910357e-08, + "loss": 0.1134, + "step": 38699 + }, + { + "epoch": 0.9793253536452666, + "grad_norm": 4.536771297454834, + "learning_rate": 1.0757421383629229e-08, + "loss": 0.1681, + "step": 38700 + }, + { + "epoch": 0.9793506592099602, + "grad_norm": 3.545381546020508, + "learning_rate": 1.0731112961045453e-08, + "loss": 0.0832, + "step": 38701 + }, + { + "epoch": 0.9793759647746539, + "grad_norm": 3.331501007080078, + "learning_rate": 1.0704836713328892e-08, + "loss": 0.1223, + "step": 38702 + }, + { + "epoch": 0.9794012703393477, + "grad_norm": 2.777769088745117, + "learning_rate": 1.06785926406483e-08, + "loss": 0.0914, + "step": 38703 + }, + { + "epoch": 0.9794265759040413, + "grad_norm": 5.535048007965088, + "learning_rate": 1.0652380743173541e-08, + "loss": 0.1662, + "step": 38704 + }, + { + "epoch": 0.979451881468735, + "grad_norm": 3.9512522220611572, + "learning_rate": 1.0626201021073367e-08, + "loss": 0.1099, + "step": 38705 + }, + { + "epoch": 0.9794771870334287, + "grad_norm": 5.057662010192871, + "learning_rate": 1.0600053474517091e-08, + "loss": 0.1042, + "step": 38706 + }, + { + "epoch": 0.9795024925981223, + "grad_norm": 5.247759819030762, + "learning_rate": 1.057393810367291e-08, + "loss": 0.1797, + "step": 38707 + }, + { + "epoch": 0.979527798162816, + "grad_norm": 5.7704901695251465, + "learning_rate": 1.054785490870902e-08, + "loss": 0.1376, + "step": 38708 + }, + { + "epoch": 0.9795531037275097, + "grad_norm": 7.332580089569092, + "learning_rate": 1.052180388979418e-08, + "loss": 0.214, + "step": 38709 + }, + { + "epoch": 0.9795784092922033, + "grad_norm": 3.002659797668457, + "learning_rate": 1.049578504709603e-08, + "loss": 0.102, + "step": 38710 + }, + { + "epoch": 0.979603714856897, + "grad_norm": 2.8671767711639404, + "learning_rate": 1.0469798380782214e-08, + "loss": 0.0979, + "step": 38711 + }, + { + "epoch": 0.9796290204215907, + "grad_norm": 6.629672050476074, + "learning_rate": 1.0443843891020934e-08, + "loss": 0.2128, + "step": 38712 + }, + { + "epoch": 0.9796543259862844, + "grad_norm": 4.259743690490723, + "learning_rate": 1.0417921577979273e-08, + "loss": 0.1544, + "step": 38713 + }, + { + "epoch": 0.979679631550978, + "grad_norm": 5.773766040802002, + "learning_rate": 1.0392031441823769e-08, + "loss": 0.1381, + "step": 38714 + }, + { + "epoch": 0.9797049371156717, + "grad_norm": 4.740983963012695, + "learning_rate": 1.0366173482722064e-08, + "loss": 0.1228, + "step": 38715 + }, + { + "epoch": 0.9797302426803655, + "grad_norm": 3.9318349361419678, + "learning_rate": 1.0340347700841246e-08, + "loss": 0.1562, + "step": 38716 + }, + { + "epoch": 0.9797555482450591, + "grad_norm": 4.95854377746582, + "learning_rate": 1.031455409634674e-08, + "loss": 0.14, + "step": 38717 + }, + { + "epoch": 0.9797808538097528, + "grad_norm": 11.200409889221191, + "learning_rate": 1.0288792669405634e-08, + "loss": 0.3474, + "step": 38718 + }, + { + "epoch": 0.9798061593744465, + "grad_norm": 10.990367889404297, + "learning_rate": 1.026306342018446e-08, + "loss": 0.1658, + "step": 38719 + }, + { + "epoch": 0.9798314649391401, + "grad_norm": 4.952498912811279, + "learning_rate": 1.0237366348848088e-08, + "loss": 0.1519, + "step": 38720 + }, + { + "epoch": 0.9798567705038338, + "grad_norm": 5.801601886749268, + "learning_rate": 1.021170145556305e-08, + "loss": 0.1685, + "step": 38721 + }, + { + "epoch": 0.9798820760685275, + "grad_norm": 4.321067810058594, + "learning_rate": 1.0186068740494215e-08, + "loss": 0.1685, + "step": 38722 + }, + { + "epoch": 0.9799073816332211, + "grad_norm": 4.596715927124023, + "learning_rate": 1.016046820380756e-08, + "loss": 0.1187, + "step": 38723 + }, + { + "epoch": 0.9799326871979148, + "grad_norm": 3.8946213722229004, + "learning_rate": 1.0134899845667956e-08, + "loss": 0.1532, + "step": 38724 + }, + { + "epoch": 0.9799579927626085, + "grad_norm": 4.220209121704102, + "learning_rate": 1.0109363666240268e-08, + "loss": 0.118, + "step": 38725 + }, + { + "epoch": 0.9799832983273021, + "grad_norm": 7.286200523376465, + "learning_rate": 1.008385966568881e-08, + "loss": 0.1861, + "step": 38726 + }, + { + "epoch": 0.9800086038919958, + "grad_norm": 9.072500228881836, + "learning_rate": 1.0058387844178452e-08, + "loss": 0.2269, + "step": 38727 + }, + { + "epoch": 0.9800339094566896, + "grad_norm": 14.960824012756348, + "learning_rate": 1.003294820187295e-08, + "loss": 0.2013, + "step": 38728 + }, + { + "epoch": 0.9800592150213832, + "grad_norm": 10.699755668640137, + "learning_rate": 1.0007540738937172e-08, + "loss": 0.2015, + "step": 38729 + }, + { + "epoch": 0.9800845205860769, + "grad_norm": 8.873230934143066, + "learning_rate": 9.982165455534876e-09, + "loss": 0.2836, + "step": 38730 + }, + { + "epoch": 0.9801098261507706, + "grad_norm": 30.207992553710938, + "learning_rate": 9.95682235182871e-09, + "loss": 0.2438, + "step": 38731 + }, + { + "epoch": 0.9801351317154642, + "grad_norm": 7.04896354675293, + "learning_rate": 9.931511427982986e-09, + "loss": 0.1419, + "step": 38732 + }, + { + "epoch": 0.9801604372801579, + "grad_norm": 4.176636219024658, + "learning_rate": 9.90623268416091e-09, + "loss": 0.1681, + "step": 38733 + }, + { + "epoch": 0.9801857428448516, + "grad_norm": 4.769608974456787, + "learning_rate": 9.880986120525127e-09, + "loss": 0.1571, + "step": 38734 + }, + { + "epoch": 0.9802110484095452, + "grad_norm": 3.233931541442871, + "learning_rate": 9.85577173723884e-09, + "loss": 0.1113, + "step": 38735 + }, + { + "epoch": 0.9802363539742389, + "grad_norm": 3.9378061294555664, + "learning_rate": 9.830589534464141e-09, + "loss": 0.1254, + "step": 38736 + }, + { + "epoch": 0.9802616595389326, + "grad_norm": 5.5525407791137695, + "learning_rate": 9.805439512364234e-09, + "loss": 0.143, + "step": 38737 + }, + { + "epoch": 0.9802869651036263, + "grad_norm": 3.908830404281616, + "learning_rate": 9.780321671100656e-09, + "loss": 0.14, + "step": 38738 + }, + { + "epoch": 0.9803122706683199, + "grad_norm": 8.583985328674316, + "learning_rate": 9.755236010834945e-09, + "loss": 0.2577, + "step": 38739 + }, + { + "epoch": 0.9803375762330137, + "grad_norm": 6.102600574493408, + "learning_rate": 9.730182531729749e-09, + "loss": 0.1334, + "step": 38740 + }, + { + "epoch": 0.9803628817977074, + "grad_norm": 3.7776029109954834, + "learning_rate": 9.705161233946602e-09, + "loss": 0.1548, + "step": 38741 + }, + { + "epoch": 0.980388187362401, + "grad_norm": 9.359908103942871, + "learning_rate": 9.680172117646492e-09, + "loss": 0.2065, + "step": 38742 + }, + { + "epoch": 0.9804134929270947, + "grad_norm": 3.9165265560150146, + "learning_rate": 9.655215182990952e-09, + "loss": 0.1037, + "step": 38743 + }, + { + "epoch": 0.9804387984917884, + "grad_norm": 3.6875393390655518, + "learning_rate": 9.63029043014041e-09, + "loss": 0.0799, + "step": 38744 + }, + { + "epoch": 0.980464104056482, + "grad_norm": 4.242951393127441, + "learning_rate": 9.60539785925585e-09, + "loss": 0.1751, + "step": 38745 + }, + { + "epoch": 0.9804894096211757, + "grad_norm": 4.121160984039307, + "learning_rate": 9.580537470497142e-09, + "loss": 0.1506, + "step": 38746 + }, + { + "epoch": 0.9805147151858694, + "grad_norm": 4.844132900238037, + "learning_rate": 9.55570926402638e-09, + "loss": 0.2136, + "step": 38747 + }, + { + "epoch": 0.980540020750563, + "grad_norm": 4.128510475158691, + "learning_rate": 9.530913240001771e-09, + "loss": 0.1556, + "step": 38748 + }, + { + "epoch": 0.9805653263152567, + "grad_norm": 2.666799306869507, + "learning_rate": 9.506149398584297e-09, + "loss": 0.114, + "step": 38749 + }, + { + "epoch": 0.9805906318799504, + "grad_norm": 2.8706088066101074, + "learning_rate": 9.481417739933274e-09, + "loss": 0.1288, + "step": 38750 + }, + { + "epoch": 0.980615937444644, + "grad_norm": 5.192356109619141, + "learning_rate": 9.456718264208576e-09, + "loss": 0.0595, + "step": 38751 + }, + { + "epoch": 0.9806412430093377, + "grad_norm": 3.9522054195404053, + "learning_rate": 9.432050971568963e-09, + "loss": 0.1241, + "step": 38752 + }, + { + "epoch": 0.9806665485740315, + "grad_norm": 9.962986946105957, + "learning_rate": 9.407415862173752e-09, + "loss": 0.2537, + "step": 38753 + }, + { + "epoch": 0.9806918541387251, + "grad_norm": 10.155838966369629, + "learning_rate": 9.382812936181706e-09, + "loss": 0.2668, + "step": 38754 + }, + { + "epoch": 0.9807171597034188, + "grad_norm": 3.2476847171783447, + "learning_rate": 9.358242193751588e-09, + "loss": 0.123, + "step": 38755 + }, + { + "epoch": 0.9807424652681125, + "grad_norm": 18.606069564819336, + "learning_rate": 9.333703635041602e-09, + "loss": 0.1095, + "step": 38756 + }, + { + "epoch": 0.9807677708328061, + "grad_norm": 8.046257019042969, + "learning_rate": 9.309197260210512e-09, + "loss": 0.141, + "step": 38757 + }, + { + "epoch": 0.9807930763974998, + "grad_norm": 4.81264066696167, + "learning_rate": 9.284723069416523e-09, + "loss": 0.1585, + "step": 38758 + }, + { + "epoch": 0.9808183819621935, + "grad_norm": 8.836267471313477, + "learning_rate": 9.260281062816179e-09, + "loss": 0.1813, + "step": 38759 + }, + { + "epoch": 0.9808436875268871, + "grad_norm": 2.3591442108154297, + "learning_rate": 9.235871240568239e-09, + "loss": 0.0669, + "step": 38760 + }, + { + "epoch": 0.9808689930915808, + "grad_norm": 4.992241382598877, + "learning_rate": 9.211493602830356e-09, + "loss": 0.1899, + "step": 38761 + }, + { + "epoch": 0.9808942986562745, + "grad_norm": 3.3056769371032715, + "learning_rate": 9.187148149758518e-09, + "loss": 0.1365, + "step": 38762 + }, + { + "epoch": 0.9809196042209682, + "grad_norm": 5.798144817352295, + "learning_rate": 9.162834881510374e-09, + "loss": 0.1628, + "step": 38763 + }, + { + "epoch": 0.9809449097856618, + "grad_norm": 3.0000181198120117, + "learning_rate": 9.138553798243022e-09, + "loss": 0.1192, + "step": 38764 + }, + { + "epoch": 0.9809702153503556, + "grad_norm": 6.227313995361328, + "learning_rate": 9.114304900113002e-09, + "loss": 0.1542, + "step": 38765 + }, + { + "epoch": 0.9809955209150493, + "grad_norm": 8.530406951904297, + "learning_rate": 9.090088187275748e-09, + "loss": 0.2045, + "step": 38766 + }, + { + "epoch": 0.9810208264797429, + "grad_norm": 7.612508296966553, + "learning_rate": 9.065903659888908e-09, + "loss": 0.1287, + "step": 38767 + }, + { + "epoch": 0.9810461320444366, + "grad_norm": 9.966590881347656, + "learning_rate": 9.041751318107362e-09, + "loss": 0.1907, + "step": 38768 + }, + { + "epoch": 0.9810714376091303, + "grad_norm": 3.5393881797790527, + "learning_rate": 9.017631162087093e-09, + "loss": 0.1462, + "step": 38769 + }, + { + "epoch": 0.9810967431738239, + "grad_norm": 4.386508941650391, + "learning_rate": 8.993543191983534e-09, + "loss": 0.1012, + "step": 38770 + }, + { + "epoch": 0.9811220487385176, + "grad_norm": 7.265971660614014, + "learning_rate": 8.96948740795267e-09, + "loss": 0.1809, + "step": 38771 + }, + { + "epoch": 0.9811473543032113, + "grad_norm": 11.40768814086914, + "learning_rate": 8.945463810148825e-09, + "loss": 0.1452, + "step": 38772 + }, + { + "epoch": 0.9811726598679049, + "grad_norm": 2.99119234085083, + "learning_rate": 8.921472398727427e-09, + "loss": 0.1122, + "step": 38773 + }, + { + "epoch": 0.9811979654325986, + "grad_norm": 5.419045925140381, + "learning_rate": 8.897513173842797e-09, + "loss": 0.1316, + "step": 38774 + }, + { + "epoch": 0.9812232709972923, + "grad_norm": 5.349447250366211, + "learning_rate": 8.873586135649814e-09, + "loss": 0.1111, + "step": 38775 + }, + { + "epoch": 0.9812485765619859, + "grad_norm": 4.581453800201416, + "learning_rate": 8.849691284302798e-09, + "loss": 0.1531, + "step": 38776 + }, + { + "epoch": 0.9812738821266797, + "grad_norm": 3.936878204345703, + "learning_rate": 8.825828619955511e-09, + "loss": 0.1195, + "step": 38777 + }, + { + "epoch": 0.9812991876913734, + "grad_norm": 2.794041395187378, + "learning_rate": 8.80199814276228e-09, + "loss": 0.1244, + "step": 38778 + }, + { + "epoch": 0.981324493256067, + "grad_norm": 8.80711555480957, + "learning_rate": 8.778199852876313e-09, + "loss": 0.2155, + "step": 38779 + }, + { + "epoch": 0.9813497988207607, + "grad_norm": 9.943192481994629, + "learning_rate": 8.754433750451375e-09, + "loss": 0.0979, + "step": 38780 + }, + { + "epoch": 0.9813751043854544, + "grad_norm": 6.379847049713135, + "learning_rate": 8.730699835640677e-09, + "loss": 0.1347, + "step": 38781 + }, + { + "epoch": 0.981400409950148, + "grad_norm": 6.163477897644043, + "learning_rate": 8.706998108597432e-09, + "loss": 0.1414, + "step": 38782 + }, + { + "epoch": 0.9814257155148417, + "grad_norm": 9.183332443237305, + "learning_rate": 8.683328569474292e-09, + "loss": 0.1655, + "step": 38783 + }, + { + "epoch": 0.9814510210795354, + "grad_norm": 3.343676805496216, + "learning_rate": 8.65969121842336e-09, + "loss": 0.1459, + "step": 38784 + }, + { + "epoch": 0.981476326644229, + "grad_norm": 4.016905307769775, + "learning_rate": 8.636086055598404e-09, + "loss": 0.1048, + "step": 38785 + }, + { + "epoch": 0.9815016322089227, + "grad_norm": 5.933730602264404, + "learning_rate": 8.612513081149854e-09, + "loss": 0.2042, + "step": 38786 + }, + { + "epoch": 0.9815269377736164, + "grad_norm": 5.097705841064453, + "learning_rate": 8.588972295231479e-09, + "loss": 0.0869, + "step": 38787 + }, + { + "epoch": 0.9815522433383101, + "grad_norm": 3.8088293075561523, + "learning_rate": 8.565463697994269e-09, + "loss": 0.1085, + "step": 38788 + }, + { + "epoch": 0.9815775489030037, + "grad_norm": 5.604365825653076, + "learning_rate": 8.54198728958977e-09, + "loss": 0.1717, + "step": 38789 + }, + { + "epoch": 0.9816028544676975, + "grad_norm": 2.8225817680358887, + "learning_rate": 8.518543070169528e-09, + "loss": 0.0495, + "step": 38790 + }, + { + "epoch": 0.9816281600323912, + "grad_norm": 3.382051944732666, + "learning_rate": 8.495131039885085e-09, + "loss": 0.1357, + "step": 38791 + }, + { + "epoch": 0.9816534655970848, + "grad_norm": 5.759787082672119, + "learning_rate": 8.47175119888688e-09, + "loss": 0.1488, + "step": 38792 + }, + { + "epoch": 0.9816787711617785, + "grad_norm": 3.1827855110168457, + "learning_rate": 8.448403547325346e-09, + "loss": 0.0892, + "step": 38793 + }, + { + "epoch": 0.9817040767264722, + "grad_norm": 3.2369439601898193, + "learning_rate": 8.425088085352029e-09, + "loss": 0.0702, + "step": 38794 + }, + { + "epoch": 0.9817293822911658, + "grad_norm": 3.47841739654541, + "learning_rate": 8.40180481311681e-09, + "loss": 0.1358, + "step": 38795 + }, + { + "epoch": 0.9817546878558595, + "grad_norm": 11.18704891204834, + "learning_rate": 8.378553730769567e-09, + "loss": 0.1481, + "step": 38796 + }, + { + "epoch": 0.9817799934205532, + "grad_norm": 4.355898857116699, + "learning_rate": 8.355334838460737e-09, + "loss": 0.139, + "step": 38797 + }, + { + "epoch": 0.9818052989852468, + "grad_norm": 6.748242378234863, + "learning_rate": 8.332148136339646e-09, + "loss": 0.1796, + "step": 38798 + }, + { + "epoch": 0.9818306045499405, + "grad_norm": 4.766342639923096, + "learning_rate": 8.308993624556171e-09, + "loss": 0.162, + "step": 38799 + }, + { + "epoch": 0.9818559101146342, + "grad_norm": 3.1606080532073975, + "learning_rate": 8.28587130325964e-09, + "loss": 0.1346, + "step": 38800 + }, + { + "epoch": 0.9818812156793278, + "grad_norm": 4.863376617431641, + "learning_rate": 8.262781172598822e-09, + "loss": 0.1553, + "step": 38801 + }, + { + "epoch": 0.9819065212440216, + "grad_norm": 4.444521427154541, + "learning_rate": 8.239723232722485e-09, + "loss": 0.0753, + "step": 38802 + }, + { + "epoch": 0.9819318268087153, + "grad_norm": 6.665685176849365, + "learning_rate": 8.216697483779956e-09, + "loss": 0.2083, + "step": 38803 + }, + { + "epoch": 0.9819571323734089, + "grad_norm": 3.9780030250549316, + "learning_rate": 8.19370392591945e-09, + "loss": 0.1443, + "step": 38804 + }, + { + "epoch": 0.9819824379381026, + "grad_norm": 4.81458854675293, + "learning_rate": 8.17074255928918e-09, + "loss": 0.1143, + "step": 38805 + }, + { + "epoch": 0.9820077435027963, + "grad_norm": 15.352938652038574, + "learning_rate": 8.147813384037363e-09, + "loss": 0.1527, + "step": 38806 + }, + { + "epoch": 0.9820330490674899, + "grad_norm": 5.251458168029785, + "learning_rate": 8.124916400311655e-09, + "loss": 0.1387, + "step": 38807 + }, + { + "epoch": 0.9820583546321836, + "grad_norm": 7.453311443328857, + "learning_rate": 8.10205160825972e-09, + "loss": 0.1595, + "step": 38808 + }, + { + "epoch": 0.9820836601968773, + "grad_norm": 5.816947937011719, + "learning_rate": 8.07921900802866e-09, + "loss": 0.1832, + "step": 38809 + }, + { + "epoch": 0.9821089657615709, + "grad_norm": 2.5142531394958496, + "learning_rate": 8.056418599766691e-09, + "loss": 0.102, + "step": 38810 + }, + { + "epoch": 0.9821342713262646, + "grad_norm": 6.393086910247803, + "learning_rate": 8.033650383620362e-09, + "loss": 0.137, + "step": 38811 + }, + { + "epoch": 0.9821595768909583, + "grad_norm": 4.156373500823975, + "learning_rate": 8.010914359736222e-09, + "loss": 0.1216, + "step": 38812 + }, + { + "epoch": 0.982184882455652, + "grad_norm": 3.3515281677246094, + "learning_rate": 7.98821052826082e-09, + "loss": 0.1169, + "step": 38813 + }, + { + "epoch": 0.9822101880203457, + "grad_norm": 12.087228775024414, + "learning_rate": 7.965538889341262e-09, + "loss": 0.4363, + "step": 38814 + }, + { + "epoch": 0.9822354935850394, + "grad_norm": 9.946637153625488, + "learning_rate": 7.942899443123542e-09, + "loss": 0.1476, + "step": 38815 + }, + { + "epoch": 0.9822607991497331, + "grad_norm": 11.92762565612793, + "learning_rate": 7.920292189753098e-09, + "loss": 0.3732, + "step": 38816 + }, + { + "epoch": 0.9822861047144267, + "grad_norm": 22.165760040283203, + "learning_rate": 7.897717129375926e-09, + "loss": 0.3142, + "step": 38817 + }, + { + "epoch": 0.9823114102791204, + "grad_norm": 12.130331039428711, + "learning_rate": 7.875174262138574e-09, + "loss": 0.2539, + "step": 38818 + }, + { + "epoch": 0.9823367158438141, + "grad_norm": 5.272221088409424, + "learning_rate": 7.852663588184816e-09, + "loss": 0.1792, + "step": 38819 + }, + { + "epoch": 0.9823620214085077, + "grad_norm": 4.021372318267822, + "learning_rate": 7.830185107661204e-09, + "loss": 0.1242, + "step": 38820 + }, + { + "epoch": 0.9823873269732014, + "grad_norm": 7.701635360717773, + "learning_rate": 7.807738820711508e-09, + "loss": 0.2257, + "step": 38821 + }, + { + "epoch": 0.9824126325378951, + "grad_norm": 2.3652501106262207, + "learning_rate": 7.785324727481725e-09, + "loss": 0.0832, + "step": 38822 + }, + { + "epoch": 0.9824379381025887, + "grad_norm": 3.3225080966949463, + "learning_rate": 7.762942828115071e-09, + "loss": 0.1169, + "step": 38823 + }, + { + "epoch": 0.9824632436672824, + "grad_norm": 4.246592998504639, + "learning_rate": 7.74059312275699e-09, + "loss": 0.0831, + "step": 38824 + }, + { + "epoch": 0.9824885492319761, + "grad_norm": 5.579664707183838, + "learning_rate": 7.718275611551252e-09, + "loss": 0.1205, + "step": 38825 + }, + { + "epoch": 0.9825138547966697, + "grad_norm": 3.995622396469116, + "learning_rate": 7.695990294641632e-09, + "loss": 0.0917, + "step": 38826 + }, + { + "epoch": 0.9825391603613635, + "grad_norm": 4.147301197052002, + "learning_rate": 7.673737172171903e-09, + "loss": 0.082, + "step": 38827 + }, + { + "epoch": 0.9825644659260572, + "grad_norm": 3.131258726119995, + "learning_rate": 7.651516244285284e-09, + "loss": 0.0737, + "step": 38828 + }, + { + "epoch": 0.9825897714907508, + "grad_norm": 3.4913887977600098, + "learning_rate": 7.62932751112555e-09, + "loss": 0.0642, + "step": 38829 + }, + { + "epoch": 0.9826150770554445, + "grad_norm": 5.585435390472412, + "learning_rate": 7.60717097283592e-09, + "loss": 0.196, + "step": 38830 + }, + { + "epoch": 0.9826403826201382, + "grad_norm": 6.418731212615967, + "learning_rate": 7.585046629559056e-09, + "loss": 0.1105, + "step": 38831 + }, + { + "epoch": 0.9826656881848318, + "grad_norm": 12.503559112548828, + "learning_rate": 7.562954481437068e-09, + "loss": 0.0968, + "step": 38832 + }, + { + "epoch": 0.9826909937495255, + "grad_norm": 4.597120761871338, + "learning_rate": 7.540894528613174e-09, + "loss": 0.1836, + "step": 38833 + }, + { + "epoch": 0.9827162993142192, + "grad_norm": 6.150774002075195, + "learning_rate": 7.518866771229482e-09, + "loss": 0.2145, + "step": 38834 + }, + { + "epoch": 0.9827416048789128, + "grad_norm": 5.341279029846191, + "learning_rate": 7.496871209428103e-09, + "loss": 0.0999, + "step": 38835 + }, + { + "epoch": 0.9827669104436065, + "grad_norm": 4.562701225280762, + "learning_rate": 7.474907843350587e-09, + "loss": 0.1452, + "step": 38836 + }, + { + "epoch": 0.9827922160083002, + "grad_norm": 5.454101085662842, + "learning_rate": 7.45297667313849e-09, + "loss": 0.1385, + "step": 38837 + }, + { + "epoch": 0.9828175215729938, + "grad_norm": 3.485948085784912, + "learning_rate": 7.431077698933364e-09, + "loss": 0.0782, + "step": 38838 + }, + { + "epoch": 0.9828428271376876, + "grad_norm": 8.332210540771484, + "learning_rate": 7.409210920876764e-09, + "loss": 0.1232, + "step": 38839 + }, + { + "epoch": 0.9828681327023813, + "grad_norm": 4.53088903427124, + "learning_rate": 7.387376339109686e-09, + "loss": 0.1784, + "step": 38840 + }, + { + "epoch": 0.982893438267075, + "grad_norm": 3.719719886779785, + "learning_rate": 7.365573953772576e-09, + "loss": 0.1173, + "step": 38841 + }, + { + "epoch": 0.9829187438317686, + "grad_norm": 12.90876293182373, + "learning_rate": 7.343803765005875e-09, + "loss": 0.1896, + "step": 38842 + }, + { + "epoch": 0.9829440493964623, + "grad_norm": 4.8453288078308105, + "learning_rate": 7.3220657729505815e-09, + "loss": 0.099, + "step": 38843 + }, + { + "epoch": 0.982969354961156, + "grad_norm": 6.272747993469238, + "learning_rate": 7.300359977746585e-09, + "loss": 0.1113, + "step": 38844 + }, + { + "epoch": 0.9829946605258496, + "grad_norm": 8.905049324035645, + "learning_rate": 7.278686379533773e-09, + "loss": 0.2262, + "step": 38845 + }, + { + "epoch": 0.9830199660905433, + "grad_norm": 4.870438098907471, + "learning_rate": 7.257044978452032e-09, + "loss": 0.1029, + "step": 38846 + }, + { + "epoch": 0.983045271655237, + "grad_norm": 4.335472583770752, + "learning_rate": 7.235435774640698e-09, + "loss": 0.1144, + "step": 38847 + }, + { + "epoch": 0.9830705772199306, + "grad_norm": 2.8066959381103516, + "learning_rate": 7.2138587682396565e-09, + "loss": 0.1101, + "step": 38848 + }, + { + "epoch": 0.9830958827846243, + "grad_norm": 2.8826725482940674, + "learning_rate": 7.192313959387687e-09, + "loss": 0.1175, + "step": 38849 + }, + { + "epoch": 0.983121188349318, + "grad_norm": 7.660192012786865, + "learning_rate": 7.1708013482235665e-09, + "loss": 0.1966, + "step": 38850 + }, + { + "epoch": 0.9831464939140117, + "grad_norm": 5.673903465270996, + "learning_rate": 7.1493209348860724e-09, + "loss": 0.1433, + "step": 38851 + }, + { + "epoch": 0.9831717994787054, + "grad_norm": 3.5470759868621826, + "learning_rate": 7.127872719513984e-09, + "loss": 0.0948, + "step": 38852 + }, + { + "epoch": 0.9831971050433991, + "grad_norm": 4.704631805419922, + "learning_rate": 7.106456702245523e-09, + "loss": 0.1179, + "step": 38853 + }, + { + "epoch": 0.9832224106080927, + "grad_norm": 3.4974217414855957, + "learning_rate": 7.085072883218913e-09, + "loss": 0.1533, + "step": 38854 + }, + { + "epoch": 0.9832477161727864, + "grad_norm": 3.616873025894165, + "learning_rate": 7.063721262571821e-09, + "loss": 0.1217, + "step": 38855 + }, + { + "epoch": 0.9832730217374801, + "grad_norm": 4.77459192276001, + "learning_rate": 7.042401840441915e-09, + "loss": 0.0968, + "step": 38856 + }, + { + "epoch": 0.9832983273021737, + "grad_norm": 3.376703977584839, + "learning_rate": 7.021114616966862e-09, + "loss": 0.1082, + "step": 38857 + }, + { + "epoch": 0.9833236328668674, + "grad_norm": 10.091562271118164, + "learning_rate": 6.999859592283775e-09, + "loss": 0.1606, + "step": 38858 + }, + { + "epoch": 0.9833489384315611, + "grad_norm": 10.499099731445312, + "learning_rate": 6.978636766529767e-09, + "loss": 0.2524, + "step": 38859 + }, + { + "epoch": 0.9833742439962547, + "grad_norm": 4.070755481719971, + "learning_rate": 6.9574461398419506e-09, + "loss": 0.1296, + "step": 38860 + }, + { + "epoch": 0.9833995495609484, + "grad_norm": 6.4065704345703125, + "learning_rate": 6.936287712356881e-09, + "loss": 0.1338, + "step": 38861 + }, + { + "epoch": 0.9834248551256421, + "grad_norm": 4.468544960021973, + "learning_rate": 6.915161484211119e-09, + "loss": 0.1606, + "step": 38862 + }, + { + "epoch": 0.9834501606903358, + "grad_norm": 8.125306129455566, + "learning_rate": 6.8940674555401095e-09, + "loss": 0.2343, + "step": 38863 + }, + { + "epoch": 0.9834754662550295, + "grad_norm": 4.666525363922119, + "learning_rate": 6.873005626480966e-09, + "loss": 0.1849, + "step": 38864 + }, + { + "epoch": 0.9835007718197232, + "grad_norm": 5.926671981811523, + "learning_rate": 6.85197599716858e-09, + "loss": 0.1224, + "step": 38865 + }, + { + "epoch": 0.9835260773844169, + "grad_norm": 4.255768775939941, + "learning_rate": 6.830978567738955e-09, + "loss": 0.1504, + "step": 38866 + }, + { + "epoch": 0.9835513829491105, + "grad_norm": 4.786046504974365, + "learning_rate": 6.810013338328092e-09, + "loss": 0.1346, + "step": 38867 + }, + { + "epoch": 0.9835766885138042, + "grad_norm": 28.104677200317383, + "learning_rate": 6.789080309070329e-09, + "loss": 0.2881, + "step": 38868 + }, + { + "epoch": 0.9836019940784979, + "grad_norm": 6.586132049560547, + "learning_rate": 6.7681794801011115e-09, + "loss": 0.2126, + "step": 38869 + }, + { + "epoch": 0.9836272996431915, + "grad_norm": 7.964696884155273, + "learning_rate": 6.747310851554778e-09, + "loss": 0.2423, + "step": 38870 + }, + { + "epoch": 0.9836526052078852, + "grad_norm": 4.592220783233643, + "learning_rate": 6.726474423566776e-09, + "loss": 0.1512, + "step": 38871 + }, + { + "epoch": 0.9836779107725789, + "grad_norm": 10.730507850646973, + "learning_rate": 6.705670196270331e-09, + "loss": 0.237, + "step": 38872 + }, + { + "epoch": 0.9837032163372725, + "grad_norm": 3.0592761039733887, + "learning_rate": 6.684898169800335e-09, + "loss": 0.0705, + "step": 38873 + }, + { + "epoch": 0.9837285219019662, + "grad_norm": 5.052505970001221, + "learning_rate": 6.664158344290572e-09, + "loss": 0.1297, + "step": 38874 + }, + { + "epoch": 0.98375382746666, + "grad_norm": 7.427665710449219, + "learning_rate": 6.6434507198753775e-09, + "loss": 0.1249, + "step": 38875 + }, + { + "epoch": 0.9837791330313536, + "grad_norm": 4.937532901763916, + "learning_rate": 6.622775296687422e-09, + "loss": 0.1134, + "step": 38876 + }, + { + "epoch": 0.9838044385960473, + "grad_norm": 9.52932357788086, + "learning_rate": 6.602132074859935e-09, + "loss": 0.2084, + "step": 38877 + }, + { + "epoch": 0.983829744160741, + "grad_norm": 3.041895866394043, + "learning_rate": 6.5815210545266964e-09, + "loss": 0.1165, + "step": 38878 + }, + { + "epoch": 0.9838550497254346, + "grad_norm": 5.691634654998779, + "learning_rate": 6.560942235820378e-09, + "loss": 0.1431, + "step": 38879 + }, + { + "epoch": 0.9838803552901283, + "grad_norm": 5.074901580810547, + "learning_rate": 6.5403956188742066e-09, + "loss": 0.1141, + "step": 38880 + }, + { + "epoch": 0.983905660854822, + "grad_norm": 13.102278709411621, + "learning_rate": 6.51988120381919e-09, + "loss": 0.1647, + "step": 38881 + }, + { + "epoch": 0.9839309664195156, + "grad_norm": 2.6152851581573486, + "learning_rate": 6.499398990789663e-09, + "loss": 0.0847, + "step": 38882 + }, + { + "epoch": 0.9839562719842093, + "grad_norm": 3.0639142990112305, + "learning_rate": 6.478948979915523e-09, + "loss": 0.1095, + "step": 38883 + }, + { + "epoch": 0.983981577548903, + "grad_norm": 12.229002952575684, + "learning_rate": 6.458531171330551e-09, + "loss": 0.2328, + "step": 38884 + }, + { + "epoch": 0.9840068831135966, + "grad_norm": 6.750247955322266, + "learning_rate": 6.438145565165754e-09, + "loss": 0.161, + "step": 38885 + }, + { + "epoch": 0.9840321886782903, + "grad_norm": 8.268064498901367, + "learning_rate": 6.417792161552139e-09, + "loss": 0.1447, + "step": 38886 + }, + { + "epoch": 0.984057494242984, + "grad_norm": 3.2787158489227295, + "learning_rate": 6.397470960621266e-09, + "loss": 0.1542, + "step": 38887 + }, + { + "epoch": 0.9840827998076777, + "grad_norm": 3.7406089305877686, + "learning_rate": 6.377181962504142e-09, + "loss": 0.1391, + "step": 38888 + }, + { + "epoch": 0.9841081053723714, + "grad_norm": 5.103031635284424, + "learning_rate": 6.3569251673317735e-09, + "loss": 0.1272, + "step": 38889 + }, + { + "epoch": 0.9841334109370651, + "grad_norm": 4.828897953033447, + "learning_rate": 6.336700575235167e-09, + "loss": 0.1494, + "step": 38890 + }, + { + "epoch": 0.9841587165017588, + "grad_norm": 3.428273916244507, + "learning_rate": 6.316508186343662e-09, + "loss": 0.0826, + "step": 38891 + }, + { + "epoch": 0.9841840220664524, + "grad_norm": 5.5837626457214355, + "learning_rate": 6.296348000788266e-09, + "loss": 0.1406, + "step": 38892 + }, + { + "epoch": 0.9842093276311461, + "grad_norm": 7.6754374504089355, + "learning_rate": 6.276220018698875e-09, + "loss": 0.197, + "step": 38893 + }, + { + "epoch": 0.9842346331958398, + "grad_norm": 4.756626129150391, + "learning_rate": 6.256124240205386e-09, + "loss": 0.1628, + "step": 38894 + }, + { + "epoch": 0.9842599387605334, + "grad_norm": 4.452920913696289, + "learning_rate": 6.236060665437138e-09, + "loss": 0.0978, + "step": 38895 + }, + { + "epoch": 0.9842852443252271, + "grad_norm": 5.214486122131348, + "learning_rate": 6.216029294523473e-09, + "loss": 0.1191, + "step": 38896 + }, + { + "epoch": 0.9843105498899208, + "grad_norm": 9.204154968261719, + "learning_rate": 6.196030127594288e-09, + "loss": 0.169, + "step": 38897 + }, + { + "epoch": 0.9843358554546144, + "grad_norm": 3.275266170501709, + "learning_rate": 6.176063164777257e-09, + "loss": 0.099, + "step": 38898 + }, + { + "epoch": 0.9843611610193082, + "grad_norm": 4.243672847747803, + "learning_rate": 6.156128406202277e-09, + "loss": 0.1015, + "step": 38899 + }, + { + "epoch": 0.9843864665840019, + "grad_norm": 5.110998153686523, + "learning_rate": 6.1362258519975795e-09, + "loss": 0.1563, + "step": 38900 + }, + { + "epoch": 0.9844117721486955, + "grad_norm": 4.216565132141113, + "learning_rate": 6.116355502290839e-09, + "loss": 0.1086, + "step": 38901 + }, + { + "epoch": 0.9844370777133892, + "grad_norm": 7.3855390548706055, + "learning_rate": 6.096517357211395e-09, + "loss": 0.1607, + "step": 38902 + }, + { + "epoch": 0.9844623832780829, + "grad_norm": 7.408432960510254, + "learning_rate": 6.076711416886371e-09, + "loss": 0.2282, + "step": 38903 + }, + { + "epoch": 0.9844876888427765, + "grad_norm": 4.663399696350098, + "learning_rate": 6.056937681443442e-09, + "loss": 0.1754, + "step": 38904 + }, + { + "epoch": 0.9845129944074702, + "grad_norm": 5.444742679595947, + "learning_rate": 6.037196151010838e-09, + "loss": 0.1665, + "step": 38905 + }, + { + "epoch": 0.9845382999721639, + "grad_norm": 8.402846336364746, + "learning_rate": 6.017486825715124e-09, + "loss": 0.2498, + "step": 38906 + }, + { + "epoch": 0.9845636055368575, + "grad_norm": 10.72836685180664, + "learning_rate": 5.9978097056839764e-09, + "loss": 0.1427, + "step": 38907 + }, + { + "epoch": 0.9845889111015512, + "grad_norm": 14.494211196899414, + "learning_rate": 5.978164791043406e-09, + "loss": 0.2427, + "step": 38908 + }, + { + "epoch": 0.9846142166662449, + "grad_norm": 6.813766956329346, + "learning_rate": 5.958552081921088e-09, + "loss": 0.2108, + "step": 38909 + }, + { + "epoch": 0.9846395222309385, + "grad_norm": 3.809380054473877, + "learning_rate": 5.938971578443032e-09, + "loss": 0.1293, + "step": 38910 + }, + { + "epoch": 0.9846648277956322, + "grad_norm": 5.438185214996338, + "learning_rate": 5.919423280735803e-09, + "loss": 0.147, + "step": 38911 + }, + { + "epoch": 0.984690133360326, + "grad_norm": 10.001519203186035, + "learning_rate": 5.899907188925413e-09, + "loss": 0.1536, + "step": 38912 + }, + { + "epoch": 0.9847154389250196, + "grad_norm": 3.622159481048584, + "learning_rate": 5.880423303136762e-09, + "loss": 0.16, + "step": 38913 + }, + { + "epoch": 0.9847407444897133, + "grad_norm": 5.831821441650391, + "learning_rate": 5.860971623496969e-09, + "loss": 0.1378, + "step": 38914 + }, + { + "epoch": 0.984766050054407, + "grad_norm": 3.9104998111724854, + "learning_rate": 5.841552150130381e-09, + "loss": 0.1997, + "step": 38915 + }, + { + "epoch": 0.9847913556191007, + "grad_norm": 4.747304439544678, + "learning_rate": 5.822164883163006e-09, + "loss": 0.0754, + "step": 38916 + }, + { + "epoch": 0.9848166611837943, + "grad_norm": 6.407619953155518, + "learning_rate": 5.802809822719191e-09, + "loss": 0.2329, + "step": 38917 + }, + { + "epoch": 0.984841966748488, + "grad_norm": 2.7431747913360596, + "learning_rate": 5.78348696892439e-09, + "loss": 0.1113, + "step": 38918 + }, + { + "epoch": 0.9848672723131817, + "grad_norm": 5.459566593170166, + "learning_rate": 5.7641963219023936e-09, + "loss": 0.1827, + "step": 38919 + }, + { + "epoch": 0.9848925778778753, + "grad_norm": 3.450242519378662, + "learning_rate": 5.744937881778656e-09, + "loss": 0.1654, + "step": 38920 + }, + { + "epoch": 0.984917883442569, + "grad_norm": 15.642632484436035, + "learning_rate": 5.725711648676413e-09, + "loss": 0.1799, + "step": 38921 + }, + { + "epoch": 0.9849431890072627, + "grad_norm": 6.01675271987915, + "learning_rate": 5.706517622720009e-09, + "loss": 0.1697, + "step": 38922 + }, + { + "epoch": 0.9849684945719563, + "grad_norm": 4.345582962036133, + "learning_rate": 5.687355804033234e-09, + "loss": 0.0994, + "step": 38923 + }, + { + "epoch": 0.98499380013665, + "grad_norm": 5.9079999923706055, + "learning_rate": 5.668226192739879e-09, + "loss": 0.1172, + "step": 38924 + }, + { + "epoch": 0.9850191057013438, + "grad_norm": 7.0566630363464355, + "learning_rate": 5.649128788963176e-09, + "loss": 0.1265, + "step": 38925 + }, + { + "epoch": 0.9850444112660374, + "grad_norm": 6.940811634063721, + "learning_rate": 5.630063592826362e-09, + "loss": 0.1562, + "step": 38926 + }, + { + "epoch": 0.9850697168307311, + "grad_norm": 5.102452278137207, + "learning_rate": 5.6110306044521164e-09, + "loss": 0.1511, + "step": 38927 + }, + { + "epoch": 0.9850950223954248, + "grad_norm": 3.6864712238311768, + "learning_rate": 5.592029823963119e-09, + "loss": 0.117, + "step": 38928 + }, + { + "epoch": 0.9851203279601184, + "grad_norm": 4.606655120849609, + "learning_rate": 5.573061251482048e-09, + "loss": 0.1246, + "step": 38929 + }, + { + "epoch": 0.9851456335248121, + "grad_norm": 3.118680238723755, + "learning_rate": 5.554124887131585e-09, + "loss": 0.1047, + "step": 38930 + }, + { + "epoch": 0.9851709390895058, + "grad_norm": 6.511647701263428, + "learning_rate": 5.5352207310332975e-09, + "loss": 0.1611, + "step": 38931 + }, + { + "epoch": 0.9851962446541994, + "grad_norm": 6.857161998748779, + "learning_rate": 5.516348783309311e-09, + "loss": 0.2001, + "step": 38932 + }, + { + "epoch": 0.9852215502188931, + "grad_norm": 7.0929036140441895, + "learning_rate": 5.49750904408175e-09, + "loss": 0.1704, + "step": 38933 + }, + { + "epoch": 0.9852468557835868, + "grad_norm": 5.025158405303955, + "learning_rate": 5.4787015134710744e-09, + "loss": 0.1266, + "step": 38934 + }, + { + "epoch": 0.9852721613482804, + "grad_norm": 2.3013393878936768, + "learning_rate": 5.459926191599407e-09, + "loss": 0.0573, + "step": 38935 + }, + { + "epoch": 0.9852974669129742, + "grad_norm": 4.3119378089904785, + "learning_rate": 5.441183078587764e-09, + "loss": 0.1905, + "step": 38936 + }, + { + "epoch": 0.9853227724776679, + "grad_norm": 10.087326049804688, + "learning_rate": 5.422472174557158e-09, + "loss": 0.3162, + "step": 38937 + }, + { + "epoch": 0.9853480780423615, + "grad_norm": 8.819947242736816, + "learning_rate": 5.403793479627495e-09, + "loss": 0.1431, + "step": 38938 + }, + { + "epoch": 0.9853733836070552, + "grad_norm": 8.18994140625, + "learning_rate": 5.385146993919788e-09, + "loss": 0.1397, + "step": 38939 + }, + { + "epoch": 0.9853986891717489, + "grad_norm": 4.555871963500977, + "learning_rate": 5.366532717554496e-09, + "loss": 0.1204, + "step": 38940 + }, + { + "epoch": 0.9854239947364426, + "grad_norm": 8.266225814819336, + "learning_rate": 5.347950650650968e-09, + "loss": 0.2024, + "step": 38941 + }, + { + "epoch": 0.9854493003011362, + "grad_norm": 6.785050392150879, + "learning_rate": 5.329400793329664e-09, + "loss": 0.0871, + "step": 38942 + }, + { + "epoch": 0.9854746058658299, + "grad_norm": 5.416407585144043, + "learning_rate": 5.310883145709933e-09, + "loss": 0.1758, + "step": 38943 + }, + { + "epoch": 0.9854999114305236, + "grad_norm": 3.6424782276153564, + "learning_rate": 5.292397707911124e-09, + "loss": 0.1309, + "step": 38944 + }, + { + "epoch": 0.9855252169952172, + "grad_norm": 9.608732223510742, + "learning_rate": 5.2739444800525845e-09, + "loss": 0.1181, + "step": 38945 + }, + { + "epoch": 0.9855505225599109, + "grad_norm": 3.745051622390747, + "learning_rate": 5.2555234622536645e-09, + "loss": 0.1207, + "step": 38946 + }, + { + "epoch": 0.9855758281246046, + "grad_norm": 6.74080228805542, + "learning_rate": 5.2371346546320476e-09, + "loss": 0.1168, + "step": 38947 + }, + { + "epoch": 0.9856011336892982, + "grad_norm": 9.572175979614258, + "learning_rate": 5.2187780573076385e-09, + "loss": 0.1678, + "step": 38948 + }, + { + "epoch": 0.985626439253992, + "grad_norm": 2.932082176208496, + "learning_rate": 5.20045367039812e-09, + "loss": 0.1446, + "step": 38949 + }, + { + "epoch": 0.9856517448186857, + "grad_norm": 5.626569747924805, + "learning_rate": 5.1821614940217315e-09, + "loss": 0.1337, + "step": 38950 + }, + { + "epoch": 0.9856770503833793, + "grad_norm": 6.533888816833496, + "learning_rate": 5.1639015282961555e-09, + "loss": 0.116, + "step": 38951 + }, + { + "epoch": 0.985702355948073, + "grad_norm": 5.3451714515686035, + "learning_rate": 5.145673773339633e-09, + "loss": 0.2026, + "step": 38952 + }, + { + "epoch": 0.9857276615127667, + "grad_norm": 3.9681711196899414, + "learning_rate": 5.127478229269844e-09, + "loss": 0.1395, + "step": 38953 + }, + { + "epoch": 0.9857529670774603, + "grad_norm": 9.452112197875977, + "learning_rate": 5.109314896203921e-09, + "loss": 0.2231, + "step": 38954 + }, + { + "epoch": 0.985778272642154, + "grad_norm": 3.7474706172943115, + "learning_rate": 5.091183774258434e-09, + "loss": 0.1239, + "step": 38955 + }, + { + "epoch": 0.9858035782068477, + "grad_norm": 5.2176833152771, + "learning_rate": 5.0730848635510696e-09, + "loss": 0.1308, + "step": 38956 + }, + { + "epoch": 0.9858288837715413, + "grad_norm": 9.57707691192627, + "learning_rate": 5.055018164197844e-09, + "loss": 0.2718, + "step": 38957 + }, + { + "epoch": 0.985854189336235, + "grad_norm": 5.216099262237549, + "learning_rate": 5.036983676316442e-09, + "loss": 0.1838, + "step": 38958 + }, + { + "epoch": 0.9858794949009287, + "grad_norm": 5.508059978485107, + "learning_rate": 5.018981400021771e-09, + "loss": 0.1292, + "step": 38959 + }, + { + "epoch": 0.9859048004656223, + "grad_norm": 6.353137493133545, + "learning_rate": 5.001011335430961e-09, + "loss": 0.175, + "step": 38960 + }, + { + "epoch": 0.9859301060303161, + "grad_norm": 4.450414180755615, + "learning_rate": 4.983073482658918e-09, + "loss": 0.1934, + "step": 38961 + }, + { + "epoch": 0.9859554115950098, + "grad_norm": 10.073639869689941, + "learning_rate": 4.965167841822216e-09, + "loss": 0.1669, + "step": 38962 + }, + { + "epoch": 0.9859807171597034, + "grad_norm": 7.633833885192871, + "learning_rate": 4.9472944130363184e-09, + "loss": 0.1777, + "step": 38963 + }, + { + "epoch": 0.9860060227243971, + "grad_norm": 17.8847713470459, + "learning_rate": 4.929453196415579e-09, + "loss": 0.1851, + "step": 38964 + }, + { + "epoch": 0.9860313282890908, + "grad_norm": 3.0134987831115723, + "learning_rate": 4.9116441920760145e-09, + "loss": 0.1726, + "step": 38965 + }, + { + "epoch": 0.9860566338537844, + "grad_norm": 4.805772304534912, + "learning_rate": 4.89386740013198e-09, + "loss": 0.108, + "step": 38966 + }, + { + "epoch": 0.9860819394184781, + "grad_norm": 3.6405179500579834, + "learning_rate": 4.876122820698381e-09, + "loss": 0.1753, + "step": 38967 + }, + { + "epoch": 0.9861072449831718, + "grad_norm": 9.700736999511719, + "learning_rate": 4.8584104538895725e-09, + "loss": 0.2524, + "step": 38968 + }, + { + "epoch": 0.9861325505478655, + "grad_norm": 6.876688480377197, + "learning_rate": 4.840730299819351e-09, + "loss": 0.1628, + "step": 38969 + }, + { + "epoch": 0.9861578561125591, + "grad_norm": 5.725936412811279, + "learning_rate": 4.823082358602071e-09, + "loss": 0.1557, + "step": 38970 + }, + { + "epoch": 0.9861831616772528, + "grad_norm": 7.291684150695801, + "learning_rate": 4.8054666303520845e-09, + "loss": 0.1885, + "step": 38971 + }, + { + "epoch": 0.9862084672419466, + "grad_norm": 3.4761738777160645, + "learning_rate": 4.787883115182079e-09, + "loss": 0.1116, + "step": 38972 + }, + { + "epoch": 0.9862337728066402, + "grad_norm": 3.160320281982422, + "learning_rate": 4.770331813205853e-09, + "loss": 0.105, + "step": 38973 + }, + { + "epoch": 0.9862590783713339, + "grad_norm": 9.465213775634766, + "learning_rate": 4.752812724536648e-09, + "loss": 0.2045, + "step": 38974 + }, + { + "epoch": 0.9862843839360276, + "grad_norm": 5.943892955780029, + "learning_rate": 4.7353258492871535e-09, + "loss": 0.1426, + "step": 38975 + }, + { + "epoch": 0.9863096895007212, + "grad_norm": 16.133176803588867, + "learning_rate": 4.71787118757061e-09, + "loss": 0.2184, + "step": 38976 + }, + { + "epoch": 0.9863349950654149, + "grad_norm": 5.0945515632629395, + "learning_rate": 4.700448739499153e-09, + "loss": 0.1542, + "step": 38977 + }, + { + "epoch": 0.9863603006301086, + "grad_norm": 6.131264686584473, + "learning_rate": 4.683058505185467e-09, + "loss": 0.1118, + "step": 38978 + }, + { + "epoch": 0.9863856061948022, + "grad_norm": 6.908011436462402, + "learning_rate": 4.6657004847411315e-09, + "loss": 0.1751, + "step": 38979 + }, + { + "epoch": 0.9864109117594959, + "grad_norm": 8.472088813781738, + "learning_rate": 4.648374678278833e-09, + "loss": 0.174, + "step": 38980 + }, + { + "epoch": 0.9864362173241896, + "grad_norm": 5.023979663848877, + "learning_rate": 4.63108108591015e-09, + "loss": 0.0966, + "step": 38981 + }, + { + "epoch": 0.9864615228888832, + "grad_norm": 3.5005900859832764, + "learning_rate": 4.6138197077461035e-09, + "loss": 0.0726, + "step": 38982 + }, + { + "epoch": 0.9864868284535769, + "grad_norm": 6.355399131774902, + "learning_rate": 4.5965905438982715e-09, + "loss": 0.1617, + "step": 38983 + }, + { + "epoch": 0.9865121340182706, + "grad_norm": 10.11783218383789, + "learning_rate": 4.579393594478232e-09, + "loss": 0.1755, + "step": 38984 + }, + { + "epoch": 0.9865374395829642, + "grad_norm": 8.3991060256958, + "learning_rate": 4.562228859595896e-09, + "loss": 0.2624, + "step": 38985 + }, + { + "epoch": 0.986562745147658, + "grad_norm": 3.2224552631378174, + "learning_rate": 4.545096339362842e-09, + "loss": 0.115, + "step": 38986 + }, + { + "epoch": 0.9865880507123517, + "grad_norm": 9.20782470703125, + "learning_rate": 4.527996033889536e-09, + "loss": 0.2919, + "step": 38987 + }, + { + "epoch": 0.9866133562770453, + "grad_norm": 4.425808429718018, + "learning_rate": 4.510927943285337e-09, + "loss": 0.1443, + "step": 38988 + }, + { + "epoch": 0.986638661841739, + "grad_norm": 3.7891242504119873, + "learning_rate": 4.4938920676612655e-09, + "loss": 0.1497, + "step": 38989 + }, + { + "epoch": 0.9866639674064327, + "grad_norm": 4.447136402130127, + "learning_rate": 4.476888407126678e-09, + "loss": 0.1068, + "step": 38990 + }, + { + "epoch": 0.9866892729711263, + "grad_norm": 3.2261054515838623, + "learning_rate": 4.4599169617914885e-09, + "loss": 0.136, + "step": 38991 + }, + { + "epoch": 0.98671457853582, + "grad_norm": 3.7185375690460205, + "learning_rate": 4.4429777317656074e-09, + "loss": 0.1159, + "step": 38992 + }, + { + "epoch": 0.9867398841005137, + "grad_norm": 2.9307994842529297, + "learning_rate": 4.426070717156727e-09, + "loss": 0.0988, + "step": 38993 + }, + { + "epoch": 0.9867651896652074, + "grad_norm": 4.384567737579346, + "learning_rate": 4.409195918075315e-09, + "loss": 0.1515, + "step": 38994 + }, + { + "epoch": 0.986790495229901, + "grad_norm": 6.177577495574951, + "learning_rate": 4.392353334630173e-09, + "loss": 0.1537, + "step": 38995 + }, + { + "epoch": 0.9868158007945947, + "grad_norm": 5.2997541427612305, + "learning_rate": 4.375542966928992e-09, + "loss": 0.1796, + "step": 38996 + }, + { + "epoch": 0.9868411063592885, + "grad_norm": 4.43436861038208, + "learning_rate": 4.358764815080574e-09, + "loss": 0.1096, + "step": 38997 + }, + { + "epoch": 0.9868664119239821, + "grad_norm": 4.394326686859131, + "learning_rate": 4.342018879193721e-09, + "loss": 0.1713, + "step": 38998 + }, + { + "epoch": 0.9868917174886758, + "grad_norm": 5.254866600036621, + "learning_rate": 4.32530515937557e-09, + "loss": 0.1382, + "step": 38999 + }, + { + "epoch": 0.9869170230533695, + "grad_norm": 3.5411031246185303, + "learning_rate": 4.3086236557343674e-09, + "loss": 0.1165, + "step": 39000 + }, + { + "epoch": 0.9869423286180631, + "grad_norm": 8.657575607299805, + "learning_rate": 4.29197436837725e-09, + "loss": 0.212, + "step": 39001 + }, + { + "epoch": 0.9869676341827568, + "grad_norm": 3.2139456272125244, + "learning_rate": 4.275357297412463e-09, + "loss": 0.0923, + "step": 39002 + }, + { + "epoch": 0.9869929397474505, + "grad_norm": 5.09307861328125, + "learning_rate": 4.258772442946035e-09, + "loss": 0.1238, + "step": 39003 + }, + { + "epoch": 0.9870182453121441, + "grad_norm": 6.110108852386475, + "learning_rate": 4.242219805086212e-09, + "loss": 0.1744, + "step": 39004 + }, + { + "epoch": 0.9870435508768378, + "grad_norm": 6.077169418334961, + "learning_rate": 4.225699383938464e-09, + "loss": 0.1976, + "step": 39005 + }, + { + "epoch": 0.9870688564415315, + "grad_norm": 5.992949485778809, + "learning_rate": 4.2092111796099286e-09, + "loss": 0.1215, + "step": 39006 + }, + { + "epoch": 0.9870941620062251, + "grad_norm": 3.647022008895874, + "learning_rate": 4.192755192207187e-09, + "loss": 0.0967, + "step": 39007 + }, + { + "epoch": 0.9871194675709188, + "grad_norm": 5.205357551574707, + "learning_rate": 4.1763314218362664e-09, + "loss": 0.1634, + "step": 39008 + }, + { + "epoch": 0.9871447731356126, + "grad_norm": 2.280503034591675, + "learning_rate": 4.1599398686020805e-09, + "loss": 0.0581, + "step": 39009 + }, + { + "epoch": 0.9871700787003062, + "grad_norm": 4.1875996589660645, + "learning_rate": 4.1435805326117686e-09, + "loss": 0.1171, + "step": 39010 + }, + { + "epoch": 0.9871953842649999, + "grad_norm": 4.291406154632568, + "learning_rate": 4.127253413970244e-09, + "loss": 0.1517, + "step": 39011 + }, + { + "epoch": 0.9872206898296936, + "grad_norm": 4.049113750457764, + "learning_rate": 4.110958512782426e-09, + "loss": 0.1049, + "step": 39012 + }, + { + "epoch": 0.9872459953943872, + "grad_norm": 3.0217130184173584, + "learning_rate": 4.094695829153783e-09, + "loss": 0.1096, + "step": 39013 + }, + { + "epoch": 0.9872713009590809, + "grad_norm": 4.019109725952148, + "learning_rate": 4.078465363189232e-09, + "loss": 0.1311, + "step": 39014 + }, + { + "epoch": 0.9872966065237746, + "grad_norm": 6.6890974044799805, + "learning_rate": 4.06226711499369e-09, + "loss": 0.2087, + "step": 39015 + }, + { + "epoch": 0.9873219120884682, + "grad_norm": 5.916386604309082, + "learning_rate": 4.046101084670406e-09, + "loss": 0.2107, + "step": 39016 + }, + { + "epoch": 0.9873472176531619, + "grad_norm": 2.215298652648926, + "learning_rate": 4.029967272325408e-09, + "loss": 0.0705, + "step": 39017 + }, + { + "epoch": 0.9873725232178556, + "grad_norm": 3.7881462574005127, + "learning_rate": 4.013865678061391e-09, + "loss": 0.1582, + "step": 39018 + }, + { + "epoch": 0.9873978287825493, + "grad_norm": 5.117173194885254, + "learning_rate": 3.997796301982715e-09, + "loss": 0.2159, + "step": 39019 + }, + { + "epoch": 0.9874231343472429, + "grad_norm": 4.071386814117432, + "learning_rate": 3.981759144192632e-09, + "loss": 0.0975, + "step": 39020 + }, + { + "epoch": 0.9874484399119366, + "grad_norm": 6.402987957000732, + "learning_rate": 3.965754204795502e-09, + "loss": 0.2179, + "step": 39021 + }, + { + "epoch": 0.9874737454766304, + "grad_norm": 4.331881999969482, + "learning_rate": 3.949781483893466e-09, + "loss": 0.149, + "step": 39022 + }, + { + "epoch": 0.987499051041324, + "grad_norm": 3.7745373249053955, + "learning_rate": 3.933840981589776e-09, + "loss": 0.1359, + "step": 39023 + }, + { + "epoch": 0.9875243566060177, + "grad_norm": 3.0158674716949463, + "learning_rate": 3.9179326979876805e-09, + "loss": 0.0692, + "step": 39024 + }, + { + "epoch": 0.9875496621707114, + "grad_norm": 8.718847274780273, + "learning_rate": 3.9020566331893216e-09, + "loss": 0.1497, + "step": 39025 + }, + { + "epoch": 0.987574967735405, + "grad_norm": 2.65777850151062, + "learning_rate": 3.886212787297394e-09, + "loss": 0.0702, + "step": 39026 + }, + { + "epoch": 0.9876002733000987, + "grad_norm": 2.3605477809906006, + "learning_rate": 3.870401160414039e-09, + "loss": 0.1016, + "step": 39027 + }, + { + "epoch": 0.9876255788647924, + "grad_norm": 3.9682652950286865, + "learning_rate": 3.854621752640842e-09, + "loss": 0.151, + "step": 39028 + }, + { + "epoch": 0.987650884429486, + "grad_norm": 3.667344331741333, + "learning_rate": 3.838874564079942e-09, + "loss": 0.0963, + "step": 39029 + }, + { + "epoch": 0.9876761899941797, + "grad_norm": 4.793049335479736, + "learning_rate": 3.823159594832926e-09, + "loss": 0.1636, + "step": 39030 + }, + { + "epoch": 0.9877014955588734, + "grad_norm": 4.1035075187683105, + "learning_rate": 3.8074768450008235e-09, + "loss": 0.1473, + "step": 39031 + }, + { + "epoch": 0.987726801123567, + "grad_norm": 7.568853855133057, + "learning_rate": 3.7918263146852205e-09, + "loss": 0.1612, + "step": 39032 + }, + { + "epoch": 0.9877521066882607, + "grad_norm": 4.409943580627441, + "learning_rate": 3.776208003986592e-09, + "loss": 0.1174, + "step": 39033 + }, + { + "epoch": 0.9877774122529545, + "grad_norm": 2.948702812194824, + "learning_rate": 3.760621913005413e-09, + "loss": 0.0942, + "step": 39034 + }, + { + "epoch": 0.9878027178176481, + "grad_norm": 4.551581859588623, + "learning_rate": 3.7450680418432696e-09, + "loss": 0.2076, + "step": 39035 + }, + { + "epoch": 0.9878280233823418, + "grad_norm": 6.87798547744751, + "learning_rate": 3.729546390599525e-09, + "loss": 0.1637, + "step": 39036 + }, + { + "epoch": 0.9878533289470355, + "grad_norm": 4.74826717376709, + "learning_rate": 3.7140569593741018e-09, + "loss": 0.1413, + "step": 39037 + }, + { + "epoch": 0.9878786345117291, + "grad_norm": 7.712544918060303, + "learning_rate": 3.698599748268028e-09, + "loss": 0.1754, + "step": 39038 + }, + { + "epoch": 0.9879039400764228, + "grad_norm": 7.4056243896484375, + "learning_rate": 3.6831747573801146e-09, + "loss": 0.2202, + "step": 39039 + }, + { + "epoch": 0.9879292456411165, + "grad_norm": 13.244729042053223, + "learning_rate": 3.6677819868097264e-09, + "loss": 0.1996, + "step": 39040 + }, + { + "epoch": 0.9879545512058101, + "grad_norm": 6.843295574188232, + "learning_rate": 3.6524214366567836e-09, + "loss": 0.192, + "step": 39041 + }, + { + "epoch": 0.9879798567705038, + "grad_norm": 7.518611431121826, + "learning_rate": 3.6370931070200954e-09, + "loss": 0.2247, + "step": 39042 + }, + { + "epoch": 0.9880051623351975, + "grad_norm": 3.6851565837860107, + "learning_rate": 3.621796997997917e-09, + "loss": 0.1162, + "step": 39043 + }, + { + "epoch": 0.9880304678998912, + "grad_norm": 3.673403263092041, + "learning_rate": 3.606533109689614e-09, + "loss": 0.1531, + "step": 39044 + }, + { + "epoch": 0.9880557734645848, + "grad_norm": 3.708289384841919, + "learning_rate": 3.5913014421934402e-09, + "loss": 0.1467, + "step": 39045 + }, + { + "epoch": 0.9880810790292786, + "grad_norm": 3.3543541431427, + "learning_rate": 3.57610199560765e-09, + "loss": 0.1514, + "step": 39046 + }, + { + "epoch": 0.9881063845939723, + "grad_norm": 3.1752002239227295, + "learning_rate": 3.5609347700299447e-09, + "loss": 0.1039, + "step": 39047 + }, + { + "epoch": 0.9881316901586659, + "grad_norm": 5.796509265899658, + "learning_rate": 3.5457997655585773e-09, + "loss": 0.1522, + "step": 39048 + }, + { + "epoch": 0.9881569957233596, + "grad_norm": 3.6755685806274414, + "learning_rate": 3.5306969822906935e-09, + "loss": 0.1077, + "step": 39049 + }, + { + "epoch": 0.9881823012880533, + "grad_norm": 3.2334091663360596, + "learning_rate": 3.515626420323992e-09, + "loss": 0.1222, + "step": 39050 + }, + { + "epoch": 0.9882076068527469, + "grad_norm": 2.954590082168579, + "learning_rate": 3.500588079755618e-09, + "loss": 0.1017, + "step": 39051 + }, + { + "epoch": 0.9882329124174406, + "grad_norm": 3.6916165351867676, + "learning_rate": 3.4855819606827155e-09, + "loss": 0.145, + "step": 39052 + }, + { + "epoch": 0.9882582179821343, + "grad_norm": 4.973570823669434, + "learning_rate": 3.4706080632018746e-09, + "loss": 0.1494, + "step": 39053 + }, + { + "epoch": 0.9882835235468279, + "grad_norm": 5.373449802398682, + "learning_rate": 3.455666387409129e-09, + "loss": 0.1724, + "step": 39054 + }, + { + "epoch": 0.9883088291115216, + "grad_norm": 9.50635051727295, + "learning_rate": 3.4407569334016233e-09, + "loss": 0.2311, + "step": 39055 + }, + { + "epoch": 0.9883341346762153, + "grad_norm": 9.560423851013184, + "learning_rate": 3.425879701275392e-09, + "loss": 0.2078, + "step": 39056 + }, + { + "epoch": 0.9883594402409089, + "grad_norm": 13.300599098205566, + "learning_rate": 3.411034691126469e-09, + "loss": 0.3495, + "step": 39057 + }, + { + "epoch": 0.9883847458056026, + "grad_norm": 3.5555858612060547, + "learning_rate": 3.396221903049779e-09, + "loss": 0.1792, + "step": 39058 + }, + { + "epoch": 0.9884100513702964, + "grad_norm": 5.168686389923096, + "learning_rate": 3.3814413371419107e-09, + "loss": 0.0875, + "step": 39059 + }, + { + "epoch": 0.98843535693499, + "grad_norm": 2.459329605102539, + "learning_rate": 3.3666929934972336e-09, + "loss": 0.0768, + "step": 39060 + }, + { + "epoch": 0.9884606624996837, + "grad_norm": 8.803045272827148, + "learning_rate": 3.351976872211782e-09, + "loss": 0.2051, + "step": 39061 + }, + { + "epoch": 0.9884859680643774, + "grad_norm": 3.1743414402008057, + "learning_rate": 3.337292973379924e-09, + "loss": 0.1276, + "step": 39062 + }, + { + "epoch": 0.988511273629071, + "grad_norm": 3.3044419288635254, + "learning_rate": 3.32264129709603e-09, + "loss": 0.1219, + "step": 39063 + }, + { + "epoch": 0.9885365791937647, + "grad_norm": 20.801151275634766, + "learning_rate": 3.308021843455023e-09, + "loss": 0.1929, + "step": 39064 + }, + { + "epoch": 0.9885618847584584, + "grad_norm": 24.584482192993164, + "learning_rate": 3.293434612551272e-09, + "loss": 0.396, + "step": 39065 + }, + { + "epoch": 0.988587190323152, + "grad_norm": 7.799892425537109, + "learning_rate": 3.278879604479146e-09, + "loss": 0.1661, + "step": 39066 + }, + { + "epoch": 0.9886124958878457, + "grad_norm": 4.069115161895752, + "learning_rate": 3.264356819331349e-09, + "loss": 0.1624, + "step": 39067 + }, + { + "epoch": 0.9886378014525394, + "grad_norm": 27.785600662231445, + "learning_rate": 3.249866257202805e-09, + "loss": 0.2005, + "step": 39068 + }, + { + "epoch": 0.9886631070172331, + "grad_norm": 7.934000492095947, + "learning_rate": 3.2354079181862174e-09, + "loss": 0.2619, + "step": 39069 + }, + { + "epoch": 0.9886884125819267, + "grad_norm": 5.503217697143555, + "learning_rate": 3.220981802374845e-09, + "loss": 0.1611, + "step": 39070 + }, + { + "epoch": 0.9887137181466205, + "grad_norm": 8.207099914550781, + "learning_rate": 3.2065879098619466e-09, + "loss": 0.1713, + "step": 39071 + }, + { + "epoch": 0.9887390237113142, + "grad_norm": 6.447227954864502, + "learning_rate": 3.192226240740226e-09, + "loss": 0.1693, + "step": 39072 + }, + { + "epoch": 0.9887643292760078, + "grad_norm": 8.721980094909668, + "learning_rate": 3.1778967951023866e-09, + "loss": 0.2446, + "step": 39073 + }, + { + "epoch": 0.9887896348407015, + "grad_norm": 5.184284687042236, + "learning_rate": 3.1635995730411317e-09, + "loss": 0.1585, + "step": 39074 + }, + { + "epoch": 0.9888149404053952, + "grad_norm": 5.4920854568481445, + "learning_rate": 3.149334574648055e-09, + "loss": 0.1597, + "step": 39075 + }, + { + "epoch": 0.9888402459700888, + "grad_norm": 8.178502082824707, + "learning_rate": 3.1351018000153055e-09, + "loss": 0.2193, + "step": 39076 + }, + { + "epoch": 0.9888655515347825, + "grad_norm": 3.1892709732055664, + "learning_rate": 3.120901249235031e-09, + "loss": 0.1191, + "step": 39077 + }, + { + "epoch": 0.9888908570994762, + "grad_norm": 4.792553901672363, + "learning_rate": 3.1067329223982703e-09, + "loss": 0.1963, + "step": 39078 + }, + { + "epoch": 0.9889161626641698, + "grad_norm": 8.782678604125977, + "learning_rate": 3.092596819597171e-09, + "loss": 0.1714, + "step": 39079 + }, + { + "epoch": 0.9889414682288635, + "grad_norm": 4.724015712738037, + "learning_rate": 3.0784929409222175e-09, + "loss": 0.1686, + "step": 39080 + }, + { + "epoch": 0.9889667737935572, + "grad_norm": 6.00093412399292, + "learning_rate": 3.0644212864644475e-09, + "loss": 0.1274, + "step": 39081 + }, + { + "epoch": 0.9889920793582508, + "grad_norm": 4.9060378074646, + "learning_rate": 3.050381856314899e-09, + "loss": 0.0981, + "step": 39082 + }, + { + "epoch": 0.9890173849229446, + "grad_norm": 6.831164836883545, + "learning_rate": 3.0363746505635004e-09, + "loss": 0.1512, + "step": 39083 + }, + { + "epoch": 0.9890426904876383, + "grad_norm": 3.2643649578094482, + "learning_rate": 3.0223996693012903e-09, + "loss": 0.0893, + "step": 39084 + }, + { + "epoch": 0.9890679960523319, + "grad_norm": 3.276646375656128, + "learning_rate": 3.008456912618196e-09, + "loss": 0.0956, + "step": 39085 + }, + { + "epoch": 0.9890933016170256, + "grad_norm": 6.761198997497559, + "learning_rate": 2.9945463806041464e-09, + "loss": 0.2122, + "step": 39086 + }, + { + "epoch": 0.9891186071817193, + "grad_norm": 8.052780151367188, + "learning_rate": 2.9806680733485137e-09, + "loss": 0.1315, + "step": 39087 + }, + { + "epoch": 0.9891439127464129, + "grad_norm": 4.13696813583374, + "learning_rate": 2.9668219909412268e-09, + "loss": 0.1009, + "step": 39088 + }, + { + "epoch": 0.9891692183111066, + "grad_norm": 4.592081069946289, + "learning_rate": 2.9530081334716577e-09, + "loss": 0.0736, + "step": 39089 + }, + { + "epoch": 0.9891945238758003, + "grad_norm": 5.559194087982178, + "learning_rate": 2.93922650102807e-09, + "loss": 0.0833, + "step": 39090 + }, + { + "epoch": 0.9892198294404939, + "grad_norm": 2.916590929031372, + "learning_rate": 2.9254770936998355e-09, + "loss": 0.0877, + "step": 39091 + }, + { + "epoch": 0.9892451350051876, + "grad_norm": 9.943878173828125, + "learning_rate": 2.9117599115763284e-09, + "loss": 0.1723, + "step": 39092 + }, + { + "epoch": 0.9892704405698813, + "grad_norm": 4.923707485198975, + "learning_rate": 2.8980749547447006e-09, + "loss": 0.1624, + "step": 39093 + }, + { + "epoch": 0.9892957461345749, + "grad_norm": 12.495746612548828, + "learning_rate": 2.884422223294325e-09, + "loss": 0.2297, + "step": 39094 + }, + { + "epoch": 0.9893210516992686, + "grad_norm": 4.5648274421691895, + "learning_rate": 2.870801717312355e-09, + "loss": 0.1306, + "step": 39095 + }, + { + "epoch": 0.9893463572639624, + "grad_norm": 6.485311508178711, + "learning_rate": 2.8572134368870517e-09, + "loss": 0.1784, + "step": 39096 + }, + { + "epoch": 0.9893716628286561, + "grad_norm": 3.3428139686584473, + "learning_rate": 2.8436573821066793e-09, + "loss": 0.1063, + "step": 39097 + }, + { + "epoch": 0.9893969683933497, + "grad_norm": 11.58557415008545, + "learning_rate": 2.8301335530572793e-09, + "loss": 0.205, + "step": 39098 + }, + { + "epoch": 0.9894222739580434, + "grad_norm": 6.437488555908203, + "learning_rate": 2.816641949827115e-09, + "loss": 0.1198, + "step": 39099 + }, + { + "epoch": 0.9894475795227371, + "grad_norm": 12.498064994812012, + "learning_rate": 2.803182572502783e-09, + "loss": 0.3258, + "step": 39100 + }, + { + "epoch": 0.9894728850874307, + "grad_norm": 8.330145835876465, + "learning_rate": 2.789755421171436e-09, + "loss": 0.1883, + "step": 39101 + }, + { + "epoch": 0.9894981906521244, + "grad_norm": 3.5900769233703613, + "learning_rate": 2.776360495919117e-09, + "loss": 0.1999, + "step": 39102 + }, + { + "epoch": 0.9895234962168181, + "grad_norm": 3.1751010417938232, + "learning_rate": 2.762997796832423e-09, + "loss": 0.1241, + "step": 39103 + }, + { + "epoch": 0.9895488017815117, + "grad_norm": 8.584948539733887, + "learning_rate": 2.749667323997951e-09, + "loss": 0.1875, + "step": 39104 + }, + { + "epoch": 0.9895741073462054, + "grad_norm": 3.456387996673584, + "learning_rate": 2.7363690775011885e-09, + "loss": 0.1497, + "step": 39105 + }, + { + "epoch": 0.9895994129108991, + "grad_norm": 3.552814245223999, + "learning_rate": 2.7231030574281782e-09, + "loss": 0.0978, + "step": 39106 + }, + { + "epoch": 0.9896247184755927, + "grad_norm": 10.423896789550781, + "learning_rate": 2.709869263863851e-09, + "loss": 0.1904, + "step": 39107 + }, + { + "epoch": 0.9896500240402865, + "grad_norm": 5.156382083892822, + "learning_rate": 2.6966676968942508e-09, + "loss": 0.1388, + "step": 39108 + }, + { + "epoch": 0.9896753296049802, + "grad_norm": 5.484676837921143, + "learning_rate": 2.6834983566043084e-09, + "loss": 0.1391, + "step": 39109 + }, + { + "epoch": 0.9897006351696738, + "grad_norm": 9.402201652526855, + "learning_rate": 2.670361243078956e-09, + "loss": 0.2073, + "step": 39110 + }, + { + "epoch": 0.9897259407343675, + "grad_norm": 4.802804946899414, + "learning_rate": 2.657256356402571e-09, + "loss": 0.1947, + "step": 39111 + }, + { + "epoch": 0.9897512462990612, + "grad_norm": 15.484668731689453, + "learning_rate": 2.6441836966606406e-09, + "loss": 0.2709, + "step": 39112 + }, + { + "epoch": 0.9897765518637548, + "grad_norm": 3.9161767959594727, + "learning_rate": 2.631143263935876e-09, + "loss": 0.0821, + "step": 39113 + }, + { + "epoch": 0.9898018574284485, + "grad_norm": 4.086427688598633, + "learning_rate": 2.6181350583137643e-09, + "loss": 0.0968, + "step": 39114 + }, + { + "epoch": 0.9898271629931422, + "grad_norm": 5.769387722015381, + "learning_rate": 2.605159079878128e-09, + "loss": 0.1031, + "step": 39115 + }, + { + "epoch": 0.9898524685578358, + "grad_norm": 4.967153072357178, + "learning_rate": 2.592215328711678e-09, + "loss": 0.1569, + "step": 39116 + }, + { + "epoch": 0.9898777741225295, + "grad_norm": 6.368112564086914, + "learning_rate": 2.579303804898237e-09, + "loss": 0.1771, + "step": 39117 + }, + { + "epoch": 0.9899030796872232, + "grad_norm": 6.489384174346924, + "learning_rate": 2.5664245085221806e-09, + "loss": 0.1266, + "step": 39118 + }, + { + "epoch": 0.9899283852519168, + "grad_norm": 12.345243453979492, + "learning_rate": 2.5535774396645563e-09, + "loss": 0.2088, + "step": 39119 + }, + { + "epoch": 0.9899536908166106, + "grad_norm": 4.7794575691223145, + "learning_rate": 2.5407625984102956e-09, + "loss": 0.132, + "step": 39120 + }, + { + "epoch": 0.9899789963813043, + "grad_norm": 5.6603312492370605, + "learning_rate": 2.527979984840445e-09, + "loss": 0.1911, + "step": 39121 + }, + { + "epoch": 0.990004301945998, + "grad_norm": 7.815885066986084, + "learning_rate": 2.5152295990382715e-09, + "loss": 0.165, + "step": 39122 + }, + { + "epoch": 0.9900296075106916, + "grad_norm": 2.9230868816375732, + "learning_rate": 2.5025114410853756e-09, + "loss": 0.0607, + "step": 39123 + }, + { + "epoch": 0.9900549130753853, + "grad_norm": 8.728641510009766, + "learning_rate": 2.4898255110650247e-09, + "loss": 0.1633, + "step": 39124 + }, + { + "epoch": 0.990080218640079, + "grad_norm": 2.9367034435272217, + "learning_rate": 2.477171809057155e-09, + "loss": 0.121, + "step": 39125 + }, + { + "epoch": 0.9901055242047726, + "grad_norm": 5.490418910980225, + "learning_rate": 2.464550335145033e-09, + "loss": 0.1503, + "step": 39126 + }, + { + "epoch": 0.9901308297694663, + "grad_norm": 14.113697052001953, + "learning_rate": 2.4519610894097045e-09, + "loss": 0.3151, + "step": 39127 + }, + { + "epoch": 0.99015613533416, + "grad_norm": 6.238539695739746, + "learning_rate": 2.4394040719316613e-09, + "loss": 0.1691, + "step": 39128 + }, + { + "epoch": 0.9901814408988536, + "grad_norm": 3.579888105392456, + "learning_rate": 2.4268792827919495e-09, + "loss": 0.1478, + "step": 39129 + }, + { + "epoch": 0.9902067464635473, + "grad_norm": 4.361462116241455, + "learning_rate": 2.414386722072171e-09, + "loss": 0.156, + "step": 39130 + }, + { + "epoch": 0.990232052028241, + "grad_norm": 6.629157543182373, + "learning_rate": 2.4019263898522606e-09, + "loss": 0.1612, + "step": 39131 + }, + { + "epoch": 0.9902573575929347, + "grad_norm": 8.806351661682129, + "learning_rate": 2.3894982862127102e-09, + "loss": 0.202, + "step": 39132 + }, + { + "epoch": 0.9902826631576284, + "grad_norm": 3.580951452255249, + "learning_rate": 2.3771024112334563e-09, + "loss": 0.0918, + "step": 39133 + }, + { + "epoch": 0.9903079687223221, + "grad_norm": 5.179293155670166, + "learning_rate": 2.3647387649949894e-09, + "loss": 0.1817, + "step": 39134 + }, + { + "epoch": 0.9903332742870157, + "grad_norm": 3.7691116333007812, + "learning_rate": 2.3524073475761355e-09, + "loss": 0.1199, + "step": 39135 + }, + { + "epoch": 0.9903585798517094, + "grad_norm": 5.088191986083984, + "learning_rate": 2.340108159057386e-09, + "loss": 0.1731, + "step": 39136 + }, + { + "epoch": 0.9903838854164031, + "grad_norm": 5.153520584106445, + "learning_rate": 2.3278411995175664e-09, + "loss": 0.1219, + "step": 39137 + }, + { + "epoch": 0.9904091909810967, + "grad_norm": 6.807683944702148, + "learning_rate": 2.315606469036058e-09, + "loss": 0.1462, + "step": 39138 + }, + { + "epoch": 0.9904344965457904, + "grad_norm": 5.452246189117432, + "learning_rate": 2.303403967691131e-09, + "loss": 0.1665, + "step": 39139 + }, + { + "epoch": 0.9904598021104841, + "grad_norm": 5.997348785400391, + "learning_rate": 2.2912336955621674e-09, + "loss": 0.1326, + "step": 39140 + }, + { + "epoch": 0.9904851076751777, + "grad_norm": 5.372581958770752, + "learning_rate": 2.2790956527274366e-09, + "loss": 0.2172, + "step": 39141 + }, + { + "epoch": 0.9905104132398714, + "grad_norm": 7.079364776611328, + "learning_rate": 2.26698983926521e-09, + "loss": 0.1185, + "step": 39142 + }, + { + "epoch": 0.9905357188045651, + "grad_norm": 4.517431735992432, + "learning_rate": 2.2549162552537586e-09, + "loss": 0.1801, + "step": 39143 + }, + { + "epoch": 0.9905610243692587, + "grad_norm": 3.8723304271698, + "learning_rate": 2.242874900770242e-09, + "loss": 0.1254, + "step": 39144 + }, + { + "epoch": 0.9905863299339525, + "grad_norm": 3.9362998008728027, + "learning_rate": 2.230865775892932e-09, + "loss": 0.1359, + "step": 39145 + }, + { + "epoch": 0.9906116354986462, + "grad_norm": 5.576356887817383, + "learning_rate": 2.2188888806989882e-09, + "loss": 0.1228, + "step": 39146 + }, + { + "epoch": 0.9906369410633399, + "grad_norm": 6.370519638061523, + "learning_rate": 2.206944215266127e-09, + "loss": 0.1472, + "step": 39147 + }, + { + "epoch": 0.9906622466280335, + "grad_norm": 5.5411481857299805, + "learning_rate": 2.195031779670398e-09, + "loss": 0.1074, + "step": 39148 + }, + { + "epoch": 0.9906875521927272, + "grad_norm": 3.4432828426361084, + "learning_rate": 2.183151573989517e-09, + "loss": 0.1386, + "step": 39149 + }, + { + "epoch": 0.9907128577574209, + "grad_norm": 9.067610740661621, + "learning_rate": 2.1713035983000897e-09, + "loss": 0.2059, + "step": 39150 + }, + { + "epoch": 0.9907381633221145, + "grad_norm": 8.122553825378418, + "learning_rate": 2.159487852678166e-09, + "loss": 0.1679, + "step": 39151 + }, + { + "epoch": 0.9907634688868082, + "grad_norm": 8.595064163208008, + "learning_rate": 2.147704337199796e-09, + "loss": 0.089, + "step": 39152 + }, + { + "epoch": 0.9907887744515019, + "grad_norm": 4.6391448974609375, + "learning_rate": 2.135953051941586e-09, + "loss": 0.171, + "step": 39153 + }, + { + "epoch": 0.9908140800161955, + "grad_norm": 5.370449066162109, + "learning_rate": 2.1242339969784753e-09, + "loss": 0.1246, + "step": 39154 + }, + { + "epoch": 0.9908393855808892, + "grad_norm": 4.7047600746154785, + "learning_rate": 2.1125471723870692e-09, + "loss": 0.201, + "step": 39155 + }, + { + "epoch": 0.990864691145583, + "grad_norm": 8.100884437561035, + "learning_rate": 2.1008925782417533e-09, + "loss": 0.1412, + "step": 39156 + }, + { + "epoch": 0.9908899967102766, + "grad_norm": 5.963658809661865, + "learning_rate": 2.089270214618022e-09, + "loss": 0.1531, + "step": 39157 + }, + { + "epoch": 0.9909153022749703, + "grad_norm": 3.3093101978302, + "learning_rate": 2.077680081591371e-09, + "loss": 0.1237, + "step": 39158 + }, + { + "epoch": 0.990940607839664, + "grad_norm": 12.55023193359375, + "learning_rate": 2.06612217923563e-09, + "loss": 0.3585, + "step": 39159 + }, + { + "epoch": 0.9909659134043576, + "grad_norm": 3.729994773864746, + "learning_rate": 2.0545965076257392e-09, + "loss": 0.1259, + "step": 39160 + }, + { + "epoch": 0.9909912189690513, + "grad_norm": 4.500317096710205, + "learning_rate": 2.043103066836638e-09, + "loss": 0.0946, + "step": 39161 + }, + { + "epoch": 0.991016524533745, + "grad_norm": 4.573807239532471, + "learning_rate": 2.0316418569416017e-09, + "loss": 0.1611, + "step": 39162 + }, + { + "epoch": 0.9910418300984386, + "grad_norm": 6.971252918243408, + "learning_rate": 2.0202128780144605e-09, + "loss": 0.1353, + "step": 39163 + }, + { + "epoch": 0.9910671356631323, + "grad_norm": 4.464415073394775, + "learning_rate": 2.008816130129598e-09, + "loss": 0.1358, + "step": 39164 + }, + { + "epoch": 0.991092441227826, + "grad_norm": 5.790569305419922, + "learning_rate": 1.9974516133602904e-09, + "loss": 0.0756, + "step": 39165 + }, + { + "epoch": 0.9911177467925196, + "grad_norm": 7.1790032386779785, + "learning_rate": 1.986119327779257e-09, + "loss": 0.1728, + "step": 39166 + }, + { + "epoch": 0.9911430523572133, + "grad_norm": 2.421110153198242, + "learning_rate": 1.9748192734603267e-09, + "loss": 0.0853, + "step": 39167 + }, + { + "epoch": 0.991168357921907, + "grad_norm": 4.011758327484131, + "learning_rate": 1.96355145047622e-09, + "loss": 0.1511, + "step": 39168 + }, + { + "epoch": 0.9911936634866007, + "grad_norm": 6.1476359367370605, + "learning_rate": 1.952315858899101e-09, + "loss": 0.2258, + "step": 39169 + }, + { + "epoch": 0.9912189690512944, + "grad_norm": 4.278327465057373, + "learning_rate": 1.9411124988022446e-09, + "loss": 0.099, + "step": 39170 + }, + { + "epoch": 0.9912442746159881, + "grad_norm": 3.767591714859009, + "learning_rate": 1.92994137025726e-09, + "loss": 0.1582, + "step": 39171 + }, + { + "epoch": 0.9912695801806818, + "grad_norm": 11.773746490478516, + "learning_rate": 1.918802473336312e-09, + "loss": 0.2145, + "step": 39172 + }, + { + "epoch": 0.9912948857453754, + "grad_norm": 3.7406842708587646, + "learning_rate": 1.90769580811101e-09, + "loss": 0.1836, + "step": 39173 + }, + { + "epoch": 0.9913201913100691, + "grad_norm": 3.653778314590454, + "learning_rate": 1.8966213746535177e-09, + "loss": 0.0957, + "step": 39174 + }, + { + "epoch": 0.9913454968747628, + "grad_norm": 9.364357948303223, + "learning_rate": 1.8855791730348903e-09, + "loss": 0.121, + "step": 39175 + }, + { + "epoch": 0.9913708024394564, + "grad_norm": 2.635925769805908, + "learning_rate": 1.874569203326737e-09, + "loss": 0.1179, + "step": 39176 + }, + { + "epoch": 0.9913961080041501, + "grad_norm": 4.399812698364258, + "learning_rate": 1.863591465599557e-09, + "loss": 0.0968, + "step": 39177 + }, + { + "epoch": 0.9914214135688438, + "grad_norm": 4.515237331390381, + "learning_rate": 1.852645959924404e-09, + "loss": 0.2367, + "step": 39178 + }, + { + "epoch": 0.9914467191335374, + "grad_norm": 4.022437572479248, + "learning_rate": 1.8417326863717778e-09, + "loss": 0.118, + "step": 39179 + }, + { + "epoch": 0.9914720246982311, + "grad_norm": 8.737829208374023, + "learning_rate": 1.830851645012177e-09, + "loss": 0.1907, + "step": 39180 + }, + { + "epoch": 0.9914973302629249, + "grad_norm": 6.623841762542725, + "learning_rate": 1.8200028359155463e-09, + "loss": 0.1711, + "step": 39181 + }, + { + "epoch": 0.9915226358276185, + "grad_norm": 4.955554962158203, + "learning_rate": 1.8091862591523846e-09, + "loss": 0.1274, + "step": 39182 + }, + { + "epoch": 0.9915479413923122, + "grad_norm": 8.966127395629883, + "learning_rate": 1.7984019147915255e-09, + "loss": 0.2357, + "step": 39183 + }, + { + "epoch": 0.9915732469570059, + "grad_norm": 5.098960876464844, + "learning_rate": 1.7876498029034683e-09, + "loss": 0.0964, + "step": 39184 + }, + { + "epoch": 0.9915985525216995, + "grad_norm": 8.301213264465332, + "learning_rate": 1.7769299235564918e-09, + "loss": 0.2354, + "step": 39185 + }, + { + "epoch": 0.9916238580863932, + "grad_norm": 4.91368293762207, + "learning_rate": 1.7662422768210952e-09, + "loss": 0.1611, + "step": 39186 + }, + { + "epoch": 0.9916491636510869, + "grad_norm": 5.538754940032959, + "learning_rate": 1.755586862765002e-09, + "loss": 0.182, + "step": 39187 + }, + { + "epoch": 0.9916744692157805, + "grad_norm": 8.695988655090332, + "learning_rate": 1.744963681457601e-09, + "loss": 0.1541, + "step": 39188 + }, + { + "epoch": 0.9916997747804742, + "grad_norm": 3.766810655593872, + "learning_rate": 1.7343727329671711e-09, + "loss": 0.1157, + "step": 39189 + }, + { + "epoch": 0.9917250803451679, + "grad_norm": 6.527129650115967, + "learning_rate": 1.723814017361436e-09, + "loss": 0.1649, + "step": 39190 + }, + { + "epoch": 0.9917503859098615, + "grad_norm": 5.298675537109375, + "learning_rate": 1.7132875347097844e-09, + "loss": 0.1449, + "step": 39191 + }, + { + "epoch": 0.9917756914745552, + "grad_norm": 3.6735999584198, + "learning_rate": 1.7027932850793849e-09, + "loss": 0.1497, + "step": 39192 + }, + { + "epoch": 0.991800997039249, + "grad_norm": 2.987262010574341, + "learning_rate": 1.692331268537406e-09, + "loss": 0.0935, + "step": 39193 + }, + { + "epoch": 0.9918263026039426, + "grad_norm": 16.977092742919922, + "learning_rate": 1.6819014851521264e-09, + "loss": 0.147, + "step": 39194 + }, + { + "epoch": 0.9918516081686363, + "grad_norm": 4.148522853851318, + "learning_rate": 1.6715039349907148e-09, + "loss": 0.1475, + "step": 39195 + }, + { + "epoch": 0.99187691373333, + "grad_norm": 9.226499557495117, + "learning_rate": 1.6611386181197841e-09, + "loss": 0.1274, + "step": 39196 + }, + { + "epoch": 0.9919022192980237, + "grad_norm": 4.0598297119140625, + "learning_rate": 1.6508055346065032e-09, + "loss": 0.1433, + "step": 39197 + }, + { + "epoch": 0.9919275248627173, + "grad_norm": 4.038737773895264, + "learning_rate": 1.6405046845169304e-09, + "loss": 0.1489, + "step": 39198 + }, + { + "epoch": 0.991952830427411, + "grad_norm": 4.686069965362549, + "learning_rate": 1.630236067918789e-09, + "loss": 0.1469, + "step": 39199 + }, + { + "epoch": 0.9919781359921047, + "grad_norm": 3.879281997680664, + "learning_rate": 1.6199996848770271e-09, + "loss": 0.1575, + "step": 39200 + }, + { + "epoch": 0.9920034415567983, + "grad_norm": 2.7369368076324463, + "learning_rate": 1.6097955354582583e-09, + "loss": 0.0842, + "step": 39201 + }, + { + "epoch": 0.992028747121492, + "grad_norm": 8.4099760055542, + "learning_rate": 1.5996236197279857e-09, + "loss": 0.1517, + "step": 39202 + }, + { + "epoch": 0.9920540526861857, + "grad_norm": 4.287188529968262, + "learning_rate": 1.5894839377517123e-09, + "loss": 0.1657, + "step": 39203 + }, + { + "epoch": 0.9920793582508793, + "grad_norm": 7.339822292327881, + "learning_rate": 1.5793764895954965e-09, + "loss": 0.2171, + "step": 39204 + }, + { + "epoch": 0.992104663815573, + "grad_norm": 4.313869476318359, + "learning_rate": 1.5693012753242864e-09, + "loss": 0.1934, + "step": 39205 + }, + { + "epoch": 0.9921299693802668, + "grad_norm": 4.250769138336182, + "learning_rate": 1.559258295002475e-09, + "loss": 0.1722, + "step": 39206 + }, + { + "epoch": 0.9921552749449604, + "grad_norm": 3.4204277992248535, + "learning_rate": 1.5492475486950097e-09, + "loss": 0.0967, + "step": 39207 + }, + { + "epoch": 0.9921805805096541, + "grad_norm": 5.536380290985107, + "learning_rate": 1.5392690364673946e-09, + "loss": 0.0862, + "step": 39208 + }, + { + "epoch": 0.9922058860743478, + "grad_norm": 5.83007287979126, + "learning_rate": 1.5293227583829117e-09, + "loss": 0.0809, + "step": 39209 + }, + { + "epoch": 0.9922311916390414, + "grad_norm": 6.807114124298096, + "learning_rate": 1.5194087145059544e-09, + "loss": 0.1357, + "step": 39210 + }, + { + "epoch": 0.9922564972037351, + "grad_norm": 5.37578821182251, + "learning_rate": 1.5095269049003603e-09, + "loss": 0.1577, + "step": 39211 + }, + { + "epoch": 0.9922818027684288, + "grad_norm": 9.752574920654297, + "learning_rate": 1.4996773296305222e-09, + "loss": 0.2249, + "step": 39212 + }, + { + "epoch": 0.9923071083331224, + "grad_norm": 3.5060431957244873, + "learning_rate": 1.489859988759168e-09, + "loss": 0.1094, + "step": 39213 + }, + { + "epoch": 0.9923324138978161, + "grad_norm": 5.986598014831543, + "learning_rate": 1.4800748823495803e-09, + "loss": 0.1689, + "step": 39214 + }, + { + "epoch": 0.9923577194625098, + "grad_norm": 2.934335470199585, + "learning_rate": 1.4703220104655968e-09, + "loss": 0.1104, + "step": 39215 + }, + { + "epoch": 0.9923830250272034, + "grad_norm": 13.894928932189941, + "learning_rate": 1.4606013731699453e-09, + "loss": 0.1694, + "step": 39216 + }, + { + "epoch": 0.9924083305918971, + "grad_norm": 5.195491790771484, + "learning_rate": 1.4509129705247982e-09, + "loss": 0.1489, + "step": 39217 + }, + { + "epoch": 0.9924336361565909, + "grad_norm": 3.577226400375366, + "learning_rate": 1.441256802592883e-09, + "loss": 0.1921, + "step": 39218 + }, + { + "epoch": 0.9924589417212845, + "grad_norm": 7.78497314453125, + "learning_rate": 1.4316328694363723e-09, + "loss": 0.2021, + "step": 39219 + }, + { + "epoch": 0.9924842472859782, + "grad_norm": 4.609675884246826, + "learning_rate": 1.4220411711174387e-09, + "loss": 0.1952, + "step": 39220 + }, + { + "epoch": 0.9925095528506719, + "grad_norm": 7.4440717697143555, + "learning_rate": 1.4124817076982544e-09, + "loss": 0.22, + "step": 39221 + }, + { + "epoch": 0.9925348584153655, + "grad_norm": 6.769797325134277, + "learning_rate": 1.402954479239882e-09, + "loss": 0.1813, + "step": 39222 + }, + { + "epoch": 0.9925601639800592, + "grad_norm": 4.369388580322266, + "learning_rate": 1.3934594858044937e-09, + "loss": 0.1824, + "step": 39223 + }, + { + "epoch": 0.9925854695447529, + "grad_norm": 4.043220520019531, + "learning_rate": 1.3839967274525967e-09, + "loss": 0.1867, + "step": 39224 + }, + { + "epoch": 0.9926107751094466, + "grad_norm": 3.5610389709472656, + "learning_rate": 1.3745662042452534e-09, + "loss": 0.1349, + "step": 39225 + }, + { + "epoch": 0.9926360806741402, + "grad_norm": 8.145292282104492, + "learning_rate": 1.3651679162440812e-09, + "loss": 0.2211, + "step": 39226 + }, + { + "epoch": 0.9926613862388339, + "grad_norm": 8.004201889038086, + "learning_rate": 1.3558018635084769e-09, + "loss": 0.147, + "step": 39227 + }, + { + "epoch": 0.9926866918035276, + "grad_norm": 4.446318626403809, + "learning_rate": 1.3464680461000579e-09, + "loss": 0.1828, + "step": 39228 + }, + { + "epoch": 0.9927119973682212, + "grad_norm": 3.9495747089385986, + "learning_rate": 1.337166464078221e-09, + "loss": 0.1537, + "step": 39229 + }, + { + "epoch": 0.992737302932915, + "grad_norm": 2.94752836227417, + "learning_rate": 1.3278971175029187e-09, + "loss": 0.0863, + "step": 39230 + }, + { + "epoch": 0.9927626084976087, + "grad_norm": 12.447425842285156, + "learning_rate": 1.3186600064346577e-09, + "loss": 0.27, + "step": 39231 + }, + { + "epoch": 0.9927879140623023, + "grad_norm": 6.985748291015625, + "learning_rate": 1.30945513093228e-09, + "loss": 0.2601, + "step": 39232 + }, + { + "epoch": 0.992813219626996, + "grad_norm": 12.98993968963623, + "learning_rate": 1.3002824910551825e-09, + "loss": 0.1295, + "step": 39233 + }, + { + "epoch": 0.9928385251916897, + "grad_norm": 6.4590373039245605, + "learning_rate": 1.2911420868627622e-09, + "loss": 0.1667, + "step": 39234 + }, + { + "epoch": 0.9928638307563833, + "grad_norm": 5.2451887130737305, + "learning_rate": 1.282033918414416e-09, + "loss": 0.2032, + "step": 39235 + }, + { + "epoch": 0.992889136321077, + "grad_norm": 5.850520610809326, + "learning_rate": 1.2729579857678754e-09, + "loss": 0.0977, + "step": 39236 + }, + { + "epoch": 0.9929144418857707, + "grad_norm": 7.233242988586426, + "learning_rate": 1.2639142889825374e-09, + "loss": 0.2255, + "step": 39237 + }, + { + "epoch": 0.9929397474504643, + "grad_norm": 7.045579433441162, + "learning_rate": 1.2549028281161335e-09, + "loss": 0.2219, + "step": 39238 + }, + { + "epoch": 0.992965053015158, + "grad_norm": 4.590324401855469, + "learning_rate": 1.2459236032275058e-09, + "loss": 0.1616, + "step": 39239 + }, + { + "epoch": 0.9929903585798517, + "grad_norm": 24.129297256469727, + "learning_rate": 1.2369766143732754e-09, + "loss": 0.202, + "step": 39240 + }, + { + "epoch": 0.9930156641445453, + "grad_norm": 6.956103801727295, + "learning_rate": 1.2280618616128393e-09, + "loss": 0.1617, + "step": 39241 + }, + { + "epoch": 0.993040969709239, + "grad_norm": 8.339925765991211, + "learning_rate": 1.2191793450022638e-09, + "loss": 0.2271, + "step": 39242 + }, + { + "epoch": 0.9930662752739328, + "grad_norm": 4.096283912658691, + "learning_rate": 1.2103290645992804e-09, + "loss": 0.1061, + "step": 39243 + }, + { + "epoch": 0.9930915808386264, + "grad_norm": 7.577819347381592, + "learning_rate": 1.2015110204610658e-09, + "loss": 0.1164, + "step": 39244 + }, + { + "epoch": 0.9931168864033201, + "grad_norm": 8.644165992736816, + "learning_rate": 1.1927252126447963e-09, + "loss": 0.1944, + "step": 39245 + }, + { + "epoch": 0.9931421919680138, + "grad_norm": 3.200340509414673, + "learning_rate": 1.1839716412065383e-09, + "loss": 0.1131, + "step": 39246 + }, + { + "epoch": 0.9931674975327074, + "grad_norm": 27.581798553466797, + "learning_rate": 1.175250306202913e-09, + "loss": 0.2349, + "step": 39247 + }, + { + "epoch": 0.9931928030974011, + "grad_norm": 3.3232102394104004, + "learning_rate": 1.1665612076905419e-09, + "loss": 0.0969, + "step": 39248 + }, + { + "epoch": 0.9932181086620948, + "grad_norm": 3.232050657272339, + "learning_rate": 1.1579043457249363e-09, + "loss": 0.0917, + "step": 39249 + }, + { + "epoch": 0.9932434142267885, + "grad_norm": 3.919262170791626, + "learning_rate": 1.149279720362162e-09, + "loss": 0.1087, + "step": 39250 + }, + { + "epoch": 0.9932687197914821, + "grad_norm": 5.098623752593994, + "learning_rate": 1.1406873316577306e-09, + "loss": 0.1542, + "step": 39251 + }, + { + "epoch": 0.9932940253561758, + "grad_norm": 6.571133613586426, + "learning_rate": 1.132127179667708e-09, + "loss": 0.2214, + "step": 39252 + }, + { + "epoch": 0.9933193309208695, + "grad_norm": 3.460658311843872, + "learning_rate": 1.1235992644459404e-09, + "loss": 0.1091, + "step": 39253 + }, + { + "epoch": 0.9933446364855631, + "grad_norm": 2.9021472930908203, + "learning_rate": 1.1151035860484938e-09, + "loss": 0.0918, + "step": 39254 + }, + { + "epoch": 0.9933699420502569, + "grad_norm": 5.836973190307617, + "learning_rate": 1.1066401445303243e-09, + "loss": 0.165, + "step": 39255 + }, + { + "epoch": 0.9933952476149506, + "grad_norm": 7.475010395050049, + "learning_rate": 1.0982089399452778e-09, + "loss": 0.0753, + "step": 39256 + }, + { + "epoch": 0.9934205531796442, + "grad_norm": 3.748591423034668, + "learning_rate": 1.0898099723477551e-09, + "loss": 0.1088, + "step": 39257 + }, + { + "epoch": 0.9934458587443379, + "grad_norm": 4.234452247619629, + "learning_rate": 1.0814432417921572e-09, + "loss": 0.1236, + "step": 39258 + }, + { + "epoch": 0.9934711643090316, + "grad_norm": 10.192201614379883, + "learning_rate": 1.073108748332885e-09, + "loss": 0.2058, + "step": 39259 + }, + { + "epoch": 0.9934964698737252, + "grad_norm": 9.106266021728516, + "learning_rate": 1.0648064920232292e-09, + "loss": 0.1991, + "step": 39260 + }, + { + "epoch": 0.9935217754384189, + "grad_norm": 4.8778977394104, + "learning_rate": 1.0565364729164806e-09, + "loss": 0.0804, + "step": 39261 + }, + { + "epoch": 0.9935470810031126, + "grad_norm": 7.188226699829102, + "learning_rate": 1.048298691066485e-09, + "loss": 0.1482, + "step": 39262 + }, + { + "epoch": 0.9935723865678062, + "grad_norm": 3.3538289070129395, + "learning_rate": 1.0400931465259778e-09, + "loss": 0.1139, + "step": 39263 + }, + { + "epoch": 0.9935976921324999, + "grad_norm": 12.928901672363281, + "learning_rate": 1.03191983934825e-09, + "loss": 0.1166, + "step": 39264 + }, + { + "epoch": 0.9936229976971936, + "grad_norm": 6.258940696716309, + "learning_rate": 1.0237787695854817e-09, + "loss": 0.1493, + "step": 39265 + }, + { + "epoch": 0.9936483032618872, + "grad_norm": 5.318741798400879, + "learning_rate": 1.0156699372904088e-09, + "loss": 0.1121, + "step": 39266 + }, + { + "epoch": 0.993673608826581, + "grad_norm": 3.901738405227661, + "learning_rate": 1.0075933425157669e-09, + "loss": 0.12, + "step": 39267 + }, + { + "epoch": 0.9936989143912747, + "grad_norm": 6.079133987426758, + "learning_rate": 9.995489853131812e-10, + "loss": 0.1002, + "step": 39268 + }, + { + "epoch": 0.9937242199559683, + "grad_norm": 5.571850776672363, + "learning_rate": 9.915368657342771e-10, + "loss": 0.1166, + "step": 39269 + }, + { + "epoch": 0.993749525520662, + "grad_norm": 4.631552696228027, + "learning_rate": 9.835569838312353e-10, + "loss": 0.1672, + "step": 39270 + }, + { + "epoch": 0.9937748310853557, + "grad_norm": 5.317788600921631, + "learning_rate": 9.75609339655681e-10, + "loss": 0.0849, + "step": 39271 + }, + { + "epoch": 0.9938001366500493, + "grad_norm": 7.494784832000732, + "learning_rate": 9.676939332581291e-10, + "loss": 0.1754, + "step": 39272 + }, + { + "epoch": 0.993825442214743, + "grad_norm": 4.878782272338867, + "learning_rate": 9.598107646902056e-10, + "loss": 0.1511, + "step": 39273 + }, + { + "epoch": 0.9938507477794367, + "grad_norm": 5.8623881340026855, + "learning_rate": 9.5195983400298e-10, + "loss": 0.2333, + "step": 39274 + }, + { + "epoch": 0.9938760533441304, + "grad_norm": 6.507606506347656, + "learning_rate": 9.441411412458578e-10, + "loss": 0.1, + "step": 39275 + }, + { + "epoch": 0.993901358908824, + "grad_norm": 11.78506088256836, + "learning_rate": 9.363546864704641e-10, + "loss": 0.2482, + "step": 39276 + }, + { + "epoch": 0.9939266644735177, + "grad_norm": 8.280407905578613, + "learning_rate": 9.286004697267592e-10, + "loss": 0.1831, + "step": 39277 + }, + { + "epoch": 0.9939519700382115, + "grad_norm": 3.835886240005493, + "learning_rate": 9.208784910647029e-10, + "loss": 0.0922, + "step": 39278 + }, + { + "epoch": 0.993977275602905, + "grad_norm": 9.442253112792969, + "learning_rate": 9.131887505337001e-10, + "loss": 0.1816, + "step": 39279 + }, + { + "epoch": 0.9940025811675988, + "grad_norm": 6.122213363647461, + "learning_rate": 9.055312481837109e-10, + "loss": 0.1725, + "step": 39280 + }, + { + "epoch": 0.9940278867322925, + "grad_norm": 18.536165237426758, + "learning_rate": 8.979059840641402e-10, + "loss": 0.1676, + "step": 39281 + }, + { + "epoch": 0.9940531922969861, + "grad_norm": 4.849923610687256, + "learning_rate": 8.903129582238379e-10, + "loss": 0.1211, + "step": 39282 + }, + { + "epoch": 0.9940784978616798, + "grad_norm": 2.839815139770508, + "learning_rate": 8.827521707122088e-10, + "loss": 0.0557, + "step": 39283 + }, + { + "epoch": 0.9941038034263735, + "grad_norm": 6.087114334106445, + "learning_rate": 8.752236215775478e-10, + "loss": 0.0795, + "step": 39284 + }, + { + "epoch": 0.9941291089910671, + "grad_norm": 4.080814838409424, + "learning_rate": 8.677273108692597e-10, + "loss": 0.1175, + "step": 39285 + }, + { + "epoch": 0.9941544145557608, + "grad_norm": 7.673386096954346, + "learning_rate": 8.602632386345289e-10, + "loss": 0.1818, + "step": 39286 + }, + { + "epoch": 0.9941797201204545, + "grad_norm": 3.1352028846740723, + "learning_rate": 8.528314049222053e-10, + "loss": 0.088, + "step": 39287 + }, + { + "epoch": 0.9942050256851481, + "grad_norm": 3.4562876224517822, + "learning_rate": 8.454318097800285e-10, + "loss": 0.1134, + "step": 39288 + }, + { + "epoch": 0.9942303312498418, + "grad_norm": 4.1266188621521, + "learning_rate": 8.380644532557381e-10, + "loss": 0.151, + "step": 39289 + }, + { + "epoch": 0.9942556368145355, + "grad_norm": 4.964549541473389, + "learning_rate": 8.307293353970736e-10, + "loss": 0.1183, + "step": 39290 + }, + { + "epoch": 0.9942809423792291, + "grad_norm": 4.093120098114014, + "learning_rate": 8.234264562506645e-10, + "loss": 0.1855, + "step": 39291 + }, + { + "epoch": 0.9943062479439229, + "grad_norm": 6.613679885864258, + "learning_rate": 8.161558158642502e-10, + "loss": 0.164, + "step": 39292 + }, + { + "epoch": 0.9943315535086166, + "grad_norm": 5.284002780914307, + "learning_rate": 8.089174142850154e-10, + "loss": 0.202, + "step": 39293 + }, + { + "epoch": 0.9943568590733102, + "grad_norm": 3.8371052742004395, + "learning_rate": 8.017112515584791e-10, + "loss": 0.1317, + "step": 39294 + }, + { + "epoch": 0.9943821646380039, + "grad_norm": 3.3451054096221924, + "learning_rate": 7.94537327732381e-10, + "loss": 0.1104, + "step": 39295 + }, + { + "epoch": 0.9944074702026976, + "grad_norm": 3.322601079940796, + "learning_rate": 7.873956428516849e-10, + "loss": 0.1158, + "step": 39296 + }, + { + "epoch": 0.9944327757673912, + "grad_norm": 3.307368278503418, + "learning_rate": 7.802861969635755e-10, + "loss": 0.1402, + "step": 39297 + }, + { + "epoch": 0.9944580813320849, + "grad_norm": 7.954238414764404, + "learning_rate": 7.73208990113572e-10, + "loss": 0.2152, + "step": 39298 + }, + { + "epoch": 0.9944833868967786, + "grad_norm": 3.5166678428649902, + "learning_rate": 7.661640223471933e-10, + "loss": 0.1156, + "step": 39299 + }, + { + "epoch": 0.9945086924614723, + "grad_norm": 5.443237781524658, + "learning_rate": 7.591512937094036e-10, + "loss": 0.1051, + "step": 39300 + }, + { + "epoch": 0.9945339980261659, + "grad_norm": 8.044305801391602, + "learning_rate": 7.521708042462772e-10, + "loss": 0.1398, + "step": 39301 + }, + { + "epoch": 0.9945593035908596, + "grad_norm": 5.989782810211182, + "learning_rate": 7.45222554002778e-10, + "loss": 0.1273, + "step": 39302 + }, + { + "epoch": 0.9945846091555534, + "grad_norm": 11.174742698669434, + "learning_rate": 7.383065430227598e-10, + "loss": 0.261, + "step": 39303 + }, + { + "epoch": 0.994609914720247, + "grad_norm": 4.035539627075195, + "learning_rate": 7.31422771352297e-10, + "loss": 0.2257, + "step": 39304 + }, + { + "epoch": 0.9946352202849407, + "grad_norm": 3.737851619720459, + "learning_rate": 7.245712390341331e-10, + "loss": 0.1902, + "step": 39305 + }, + { + "epoch": 0.9946605258496344, + "grad_norm": 3.5518808364868164, + "learning_rate": 7.177519461137872e-10, + "loss": 0.1193, + "step": 39306 + }, + { + "epoch": 0.994685831414328, + "grad_norm": 3.3708553314208984, + "learning_rate": 7.10964892634558e-10, + "loss": 0.1247, + "step": 39307 + }, + { + "epoch": 0.9947111369790217, + "grad_norm": 10.07375431060791, + "learning_rate": 7.042100786402995e-10, + "loss": 0.1717, + "step": 39308 + }, + { + "epoch": 0.9947364425437154, + "grad_norm": 5.423050403594971, + "learning_rate": 6.974875041748652e-10, + "loss": 0.2005, + "step": 39309 + }, + { + "epoch": 0.994761748108409, + "grad_norm": 4.103301525115967, + "learning_rate": 6.907971692809989e-10, + "loss": 0.1202, + "step": 39310 + }, + { + "epoch": 0.9947870536731027, + "grad_norm": 3.56549334526062, + "learning_rate": 6.841390740025544e-10, + "loss": 0.1004, + "step": 39311 + }, + { + "epoch": 0.9948123592377964, + "grad_norm": 8.001188278198242, + "learning_rate": 6.775132183822753e-10, + "loss": 0.2153, + "step": 39312 + }, + { + "epoch": 0.99483766480249, + "grad_norm": 3.2343316078186035, + "learning_rate": 6.709196024623499e-10, + "loss": 0.1305, + "step": 39313 + }, + { + "epoch": 0.9948629703671837, + "grad_norm": 9.48051643371582, + "learning_rate": 6.643582262860771e-10, + "loss": 0.1833, + "step": 39314 + }, + { + "epoch": 0.9948882759318775, + "grad_norm": 3.4347434043884277, + "learning_rate": 6.578290898950901e-10, + "loss": 0.1384, + "step": 39315 + }, + { + "epoch": 0.994913581496571, + "grad_norm": 7.263946533203125, + "learning_rate": 6.513321933321326e-10, + "loss": 0.1461, + "step": 39316 + }, + { + "epoch": 0.9949388870612648, + "grad_norm": 7.519218444824219, + "learning_rate": 6.44867536638838e-10, + "loss": 0.1478, + "step": 39317 + }, + { + "epoch": 0.9949641926259585, + "grad_norm": 12.007719993591309, + "learning_rate": 6.384351198562844e-10, + "loss": 0.2252, + "step": 39318 + }, + { + "epoch": 0.9949894981906521, + "grad_norm": 16.591686248779297, + "learning_rate": 6.320349430272154e-10, + "loss": 0.2825, + "step": 39319 + }, + { + "epoch": 0.9950148037553458, + "grad_norm": 4.536084175109863, + "learning_rate": 6.256670061915992e-10, + "loss": 0.1396, + "step": 39320 + }, + { + "epoch": 0.9950401093200395, + "grad_norm": 4.946452617645264, + "learning_rate": 6.193313093916242e-10, + "loss": 0.1047, + "step": 39321 + }, + { + "epoch": 0.9950654148847331, + "grad_norm": 5.059864044189453, + "learning_rate": 6.130278526672584e-10, + "loss": 0.13, + "step": 39322 + }, + { + "epoch": 0.9950907204494268, + "grad_norm": 3.7912709712982178, + "learning_rate": 6.0675663605958e-10, + "loss": 0.0847, + "step": 39323 + }, + { + "epoch": 0.9951160260141205, + "grad_norm": 3.383288860321045, + "learning_rate": 6.005176596091122e-10, + "loss": 0.1119, + "step": 39324 + }, + { + "epoch": 0.9951413315788142, + "grad_norm": 5.069960594177246, + "learning_rate": 5.943109233558231e-10, + "loss": 0.1739, + "step": 39325 + }, + { + "epoch": 0.9951666371435078, + "grad_norm": 9.084432601928711, + "learning_rate": 5.881364273396806e-10, + "loss": 0.1345, + "step": 39326 + }, + { + "epoch": 0.9951919427082015, + "grad_norm": 5.696338653564453, + "learning_rate": 5.819941716012079e-10, + "loss": 0.1607, + "step": 39327 + }, + { + "epoch": 0.9952172482728953, + "grad_norm": 3.3491547107696533, + "learning_rate": 5.758841561787076e-10, + "loss": 0.1515, + "step": 39328 + }, + { + "epoch": 0.9952425538375889, + "grad_norm": 4.833774089813232, + "learning_rate": 5.698063811132582e-10, + "loss": 0.1819, + "step": 39329 + }, + { + "epoch": 0.9952678594022826, + "grad_norm": 10.13637638092041, + "learning_rate": 5.63760846442607e-10, + "loss": 0.0867, + "step": 39330 + }, + { + "epoch": 0.9952931649669763, + "grad_norm": 8.828544616699219, + "learning_rate": 5.57747552206167e-10, + "loss": 0.1283, + "step": 39331 + }, + { + "epoch": 0.9953184705316699, + "grad_norm": 11.571409225463867, + "learning_rate": 5.517664984433513e-10, + "loss": 0.286, + "step": 39332 + }, + { + "epoch": 0.9953437760963636, + "grad_norm": 5.371866703033447, + "learning_rate": 5.458176851919073e-10, + "loss": 0.1223, + "step": 39333 + }, + { + "epoch": 0.9953690816610573, + "grad_norm": 4.9177937507629395, + "learning_rate": 5.399011124906928e-10, + "loss": 0.1171, + "step": 39334 + }, + { + "epoch": 0.9953943872257509, + "grad_norm": 6.8248748779296875, + "learning_rate": 5.340167803774554e-10, + "loss": 0.2322, + "step": 39335 + }, + { + "epoch": 0.9954196927904446, + "grad_norm": 8.534928321838379, + "learning_rate": 5.281646888904979e-10, + "loss": 0.1048, + "step": 39336 + }, + { + "epoch": 0.9954449983551383, + "grad_norm": 2.0265772342681885, + "learning_rate": 5.223448380675678e-10, + "loss": 0.0931, + "step": 39337 + }, + { + "epoch": 0.9954703039198319, + "grad_norm": 15.629557609558105, + "learning_rate": 5.165572279458575e-10, + "loss": 0.2555, + "step": 39338 + }, + { + "epoch": 0.9954956094845256, + "grad_norm": 2.4555931091308594, + "learning_rate": 5.108018585631147e-10, + "loss": 0.0938, + "step": 39339 + }, + { + "epoch": 0.9955209150492194, + "grad_norm": 2.9477007389068604, + "learning_rate": 5.050787299559767e-10, + "loss": 0.117, + "step": 39340 + }, + { + "epoch": 0.995546220613913, + "grad_norm": 4.988849639892578, + "learning_rate": 4.99387842161636e-10, + "loss": 0.2074, + "step": 39341 + }, + { + "epoch": 0.9955715261786067, + "grad_norm": 7.513205528259277, + "learning_rate": 4.9372919521673e-10, + "loss": 0.2216, + "step": 39342 + }, + { + "epoch": 0.9955968317433004, + "grad_norm": 3.0408949851989746, + "learning_rate": 4.88102789157896e-10, + "loss": 0.0716, + "step": 39343 + }, + { + "epoch": 0.995622137307994, + "grad_norm": 5.406413555145264, + "learning_rate": 4.825086240217713e-10, + "loss": 0.1305, + "step": 39344 + }, + { + "epoch": 0.9956474428726877, + "grad_norm": 6.530621528625488, + "learning_rate": 4.76946699843328e-10, + "loss": 0.1999, + "step": 39345 + }, + { + "epoch": 0.9956727484373814, + "grad_norm": 4.287171840667725, + "learning_rate": 4.714170166592036e-10, + "loss": 0.1348, + "step": 39346 + }, + { + "epoch": 0.995698054002075, + "grad_norm": 7.461915493011475, + "learning_rate": 4.65919574504925e-10, + "loss": 0.1962, + "step": 39347 + }, + { + "epoch": 0.9957233595667687, + "grad_norm": 5.862635612487793, + "learning_rate": 4.604543734160194e-10, + "loss": 0.1371, + "step": 39348 + }, + { + "epoch": 0.9957486651314624, + "grad_norm": 2.066871404647827, + "learning_rate": 4.5502141342745887e-10, + "loss": 0.0775, + "step": 39349 + }, + { + "epoch": 0.995773970696156, + "grad_norm": 4.371971607208252, + "learning_rate": 4.4962069457477053e-10, + "loss": 0.1732, + "step": 39350 + }, + { + "epoch": 0.9957992762608497, + "grad_norm": 6.285384654998779, + "learning_rate": 4.442522168923713e-10, + "loss": 0.1727, + "step": 39351 + }, + { + "epoch": 0.9958245818255435, + "grad_norm": 4.069249629974365, + "learning_rate": 4.389159804152332e-10, + "loss": 0.0649, + "step": 39352 + }, + { + "epoch": 0.9958498873902372, + "grad_norm": 5.957603931427002, + "learning_rate": 4.3361198517721804e-10, + "loss": 0.1829, + "step": 39353 + }, + { + "epoch": 0.9958751929549308, + "grad_norm": 9.331297874450684, + "learning_rate": 4.283402312127427e-10, + "loss": 0.1896, + "step": 39354 + }, + { + "epoch": 0.9959004985196245, + "grad_norm": 5.452960968017578, + "learning_rate": 4.2310071855622414e-10, + "loss": 0.1528, + "step": 39355 + }, + { + "epoch": 0.9959258040843182, + "grad_norm": 7.069860458374023, + "learning_rate": 4.17893447240969e-10, + "loss": 0.1944, + "step": 39356 + }, + { + "epoch": 0.9959511096490118, + "grad_norm": 3.7024123668670654, + "learning_rate": 4.127184173008392e-10, + "loss": 0.1151, + "step": 39357 + }, + { + "epoch": 0.9959764152137055, + "grad_norm": 5.053539752960205, + "learning_rate": 4.0757562876914126e-10, + "loss": 0.1238, + "step": 39358 + }, + { + "epoch": 0.9960017207783992, + "grad_norm": 3.1798291206359863, + "learning_rate": 4.0246508167862684e-10, + "loss": 0.1123, + "step": 39359 + }, + { + "epoch": 0.9960270263430928, + "grad_norm": 3.0248236656188965, + "learning_rate": 3.973867760631578e-10, + "loss": 0.0917, + "step": 39360 + }, + { + "epoch": 0.9960523319077865, + "grad_norm": 3.4348795413970947, + "learning_rate": 3.923407119543754e-10, + "loss": 0.1224, + "step": 39361 + }, + { + "epoch": 0.9960776374724802, + "grad_norm": 5.08643913269043, + "learning_rate": 3.8732688938558637e-10, + "loss": 0.1284, + "step": 39362 + }, + { + "epoch": 0.9961029430371738, + "grad_norm": 12.558568954467773, + "learning_rate": 3.823453083889872e-10, + "loss": 0.231, + "step": 39363 + }, + { + "epoch": 0.9961282486018675, + "grad_norm": 3.30881404876709, + "learning_rate": 3.773959689967743e-10, + "loss": 0.1297, + "step": 39364 + }, + { + "epoch": 0.9961535541665613, + "grad_norm": 19.205303192138672, + "learning_rate": 3.724788712405891e-10, + "loss": 0.2135, + "step": 39365 + }, + { + "epoch": 0.9961788597312549, + "grad_norm": 10.715981483459473, + "learning_rate": 3.67594015152628e-10, + "loss": 0.2592, + "step": 39366 + }, + { + "epoch": 0.9962041652959486, + "grad_norm": 3.58093523979187, + "learning_rate": 3.627414007634222e-10, + "loss": 0.128, + "step": 39367 + }, + { + "epoch": 0.9962294708606423, + "grad_norm": 4.990739822387695, + "learning_rate": 3.5792102810516815e-10, + "loss": 0.1067, + "step": 39368 + }, + { + "epoch": 0.9962547764253359, + "grad_norm": 3.4640135765075684, + "learning_rate": 3.5313289720895204e-10, + "loss": 0.1366, + "step": 39369 + }, + { + "epoch": 0.9962800819900296, + "grad_norm": 24.216407775878906, + "learning_rate": 3.483770081053051e-10, + "loss": 0.218, + "step": 39370 + }, + { + "epoch": 0.9963053875547233, + "grad_norm": 5.529679298400879, + "learning_rate": 3.436533608247583e-10, + "loss": 0.1084, + "step": 39371 + }, + { + "epoch": 0.9963306931194169, + "grad_norm": 4.596628189086914, + "learning_rate": 3.389619553978429e-10, + "loss": 0.1486, + "step": 39372 + }, + { + "epoch": 0.9963559986841106, + "grad_norm": 2.393254041671753, + "learning_rate": 3.343027918550901e-10, + "loss": 0.1061, + "step": 39373 + }, + { + "epoch": 0.9963813042488043, + "grad_norm": 7.520087718963623, + "learning_rate": 3.2967587022647574e-10, + "loss": 0.1307, + "step": 39374 + }, + { + "epoch": 0.9964066098134979, + "grad_norm": 6.183558464050293, + "learning_rate": 3.25081190541976e-10, + "loss": 0.1377, + "step": 39375 + }, + { + "epoch": 0.9964319153781916, + "grad_norm": 4.003041744232178, + "learning_rate": 3.205187528304565e-10, + "loss": 0.1227, + "step": 39376 + }, + { + "epoch": 0.9964572209428854, + "grad_norm": 5.042220592498779, + "learning_rate": 3.159885571224486e-10, + "loss": 0.1296, + "step": 39377 + }, + { + "epoch": 0.9964825265075791, + "grad_norm": 2.875347375869751, + "learning_rate": 3.114906034462628e-10, + "loss": 0.0927, + "step": 39378 + }, + { + "epoch": 0.9965078320722727, + "grad_norm": 3.259921073913574, + "learning_rate": 3.0702489183132014e-10, + "loss": 0.1519, + "step": 39379 + }, + { + "epoch": 0.9965331376369664, + "grad_norm": 7.022061824798584, + "learning_rate": 3.025914223064863e-10, + "loss": 0.1879, + "step": 39380 + }, + { + "epoch": 0.9965584432016601, + "grad_norm": 3.8234100341796875, + "learning_rate": 2.981901949006272e-10, + "loss": 0.0751, + "step": 39381 + }, + { + "epoch": 0.9965837487663537, + "grad_norm": 7.06367826461792, + "learning_rate": 2.9382120964094317e-10, + "loss": 0.1323, + "step": 39382 + }, + { + "epoch": 0.9966090543310474, + "grad_norm": 4.296472549438477, + "learning_rate": 2.8948446655685525e-10, + "loss": 0.1086, + "step": 39383 + }, + { + "epoch": 0.9966343598957411, + "grad_norm": 4.13584566116333, + "learning_rate": 2.851799656755638e-10, + "loss": 0.1168, + "step": 39384 + }, + { + "epoch": 0.9966596654604347, + "grad_norm": 4.4141740798950195, + "learning_rate": 2.809077070253796e-10, + "loss": 0.1206, + "step": 39385 + }, + { + "epoch": 0.9966849710251284, + "grad_norm": 4.749998092651367, + "learning_rate": 2.766676906335031e-10, + "loss": 0.1085, + "step": 39386 + }, + { + "epoch": 0.9967102765898221, + "grad_norm": 5.823376655578613, + "learning_rate": 2.724599165276898e-10, + "loss": 0.1363, + "step": 39387 + }, + { + "epoch": 0.9967355821545157, + "grad_norm": 9.52708625793457, + "learning_rate": 2.6828438473458506e-10, + "loss": 0.0924, + "step": 39388 + }, + { + "epoch": 0.9967608877192095, + "grad_norm": 5.428701877593994, + "learning_rate": 2.6414109528083433e-10, + "loss": 0.1919, + "step": 39389 + }, + { + "epoch": 0.9967861932839032, + "grad_norm": 5.7294230461120605, + "learning_rate": 2.6003004819419306e-10, + "loss": 0.1621, + "step": 39390 + }, + { + "epoch": 0.9968114988485968, + "grad_norm": 2.7416458129882812, + "learning_rate": 2.5595124350019654e-10, + "loss": 0.1428, + "step": 39391 + }, + { + "epoch": 0.9968368044132905, + "grad_norm": 5.486457347869873, + "learning_rate": 2.519046812260451e-10, + "loss": 0.2264, + "step": 39392 + }, + { + "epoch": 0.9968621099779842, + "grad_norm": 3.9829728603363037, + "learning_rate": 2.478903613967187e-10, + "loss": 0.0773, + "step": 39393 + }, + { + "epoch": 0.9968874155426778, + "grad_norm": 3.5980029106140137, + "learning_rate": 2.439082840388629e-10, + "loss": 0.1644, + "step": 39394 + }, + { + "epoch": 0.9969127211073715, + "grad_norm": 4.944239616394043, + "learning_rate": 2.399584491785678e-10, + "loss": 0.0939, + "step": 39395 + }, + { + "epoch": 0.9969380266720652, + "grad_norm": 2.5507185459136963, + "learning_rate": 2.3604085684025836e-10, + "loss": 0.1072, + "step": 39396 + }, + { + "epoch": 0.9969633322367588, + "grad_norm": 4.455418109893799, + "learning_rate": 2.3215550704946964e-10, + "loss": 0.1328, + "step": 39397 + }, + { + "epoch": 0.9969886378014525, + "grad_norm": 5.473064422607422, + "learning_rate": 2.2830239983173685e-10, + "loss": 0.1446, + "step": 39398 + }, + { + "epoch": 0.9970139433661462, + "grad_norm": 6.215526103973389, + "learning_rate": 2.2448153521203997e-10, + "loss": 0.1411, + "step": 39399 + }, + { + "epoch": 0.9970392489308398, + "grad_norm": 6.708166599273682, + "learning_rate": 2.2069291321424878e-10, + "loss": 0.2228, + "step": 39400 + }, + { + "epoch": 0.9970645544955336, + "grad_norm": 3.427839517593384, + "learning_rate": 2.1693653386334336e-10, + "loss": 0.1507, + "step": 39401 + }, + { + "epoch": 0.9970898600602273, + "grad_norm": 3.223031520843506, + "learning_rate": 2.1321239718319343e-10, + "loss": 0.1188, + "step": 39402 + }, + { + "epoch": 0.997115165624921, + "grad_norm": 4.022861003875732, + "learning_rate": 2.095205031982239e-10, + "loss": 0.1408, + "step": 39403 + }, + { + "epoch": 0.9971404711896146, + "grad_norm": 3.9656448364257812, + "learning_rate": 2.0586085193174954e-10, + "loss": 0.1441, + "step": 39404 + }, + { + "epoch": 0.9971657767543083, + "grad_norm": 4.437704563140869, + "learning_rate": 2.0223344340764007e-10, + "loss": 0.1985, + "step": 39405 + }, + { + "epoch": 0.997191082319002, + "grad_norm": 10.34567928314209, + "learning_rate": 1.986382776497653e-10, + "loss": 0.2171, + "step": 39406 + }, + { + "epoch": 0.9972163878836956, + "grad_norm": 4.246662616729736, + "learning_rate": 1.9507535468032967e-10, + "loss": 0.1689, + "step": 39407 + }, + { + "epoch": 0.9972416934483893, + "grad_norm": 3.7332658767700195, + "learning_rate": 1.9154467452320302e-10, + "loss": 0.139, + "step": 39408 + }, + { + "epoch": 0.997266999013083, + "grad_norm": 4.5377702713012695, + "learning_rate": 1.8804623720058978e-10, + "loss": 0.1264, + "step": 39409 + }, + { + "epoch": 0.9972923045777766, + "grad_norm": 10.792780876159668, + "learning_rate": 1.8458004273524955e-10, + "loss": 0.1072, + "step": 39410 + }, + { + "epoch": 0.9973176101424703, + "grad_norm": 5.715993404388428, + "learning_rate": 1.8114609114938674e-10, + "loss": 0.0984, + "step": 39411 + }, + { + "epoch": 0.997342915707164, + "grad_norm": 4.896886825561523, + "learning_rate": 1.7774438246520587e-10, + "loss": 0.1446, + "step": 39412 + }, + { + "epoch": 0.9973682212718576, + "grad_norm": 3.0066869258880615, + "learning_rate": 1.7437491670491136e-10, + "loss": 0.0903, + "step": 39413 + }, + { + "epoch": 0.9973935268365514, + "grad_norm": 11.801602363586426, + "learning_rate": 1.7103769389015257e-10, + "loss": 0.1788, + "step": 39414 + }, + { + "epoch": 0.9974188324012451, + "grad_norm": 3.1615495681762695, + "learning_rate": 1.6773271404202375e-10, + "loss": 0.1381, + "step": 39415 + }, + { + "epoch": 0.9974441379659387, + "grad_norm": 11.423994064331055, + "learning_rate": 1.6445997718217423e-10, + "loss": 0.2479, + "step": 39416 + }, + { + "epoch": 0.9974694435306324, + "grad_norm": 9.01067066192627, + "learning_rate": 1.6121948333169824e-10, + "loss": 0.2366, + "step": 39417 + }, + { + "epoch": 0.9974947490953261, + "grad_norm": 2.987943172454834, + "learning_rate": 1.5801123251169004e-10, + "loss": 0.0947, + "step": 39418 + }, + { + "epoch": 0.9975200546600197, + "grad_norm": 7.326693534851074, + "learning_rate": 1.5483522474268875e-10, + "loss": 0.1193, + "step": 39419 + }, + { + "epoch": 0.9975453602247134, + "grad_norm": 3.1128287315368652, + "learning_rate": 1.516914600446784e-10, + "loss": 0.0698, + "step": 39420 + }, + { + "epoch": 0.9975706657894071, + "grad_norm": 5.437897682189941, + "learning_rate": 1.485799384387532e-10, + "loss": 0.1766, + "step": 39421 + }, + { + "epoch": 0.9975959713541007, + "grad_norm": 3.834843873977661, + "learning_rate": 1.4550065994434205e-10, + "loss": 0.0856, + "step": 39422 + }, + { + "epoch": 0.9976212769187944, + "grad_norm": 15.782334327697754, + "learning_rate": 1.4245362458198407e-10, + "loss": 0.4651, + "step": 39423 + }, + { + "epoch": 0.9976465824834881, + "grad_norm": 4.623784065246582, + "learning_rate": 1.3943883237055312e-10, + "loss": 0.16, + "step": 39424 + }, + { + "epoch": 0.9976718880481817, + "grad_norm": 8.82712173461914, + "learning_rate": 1.3645628332947803e-10, + "loss": 0.1797, + "step": 39425 + }, + { + "epoch": 0.9976971936128755, + "grad_norm": 10.43082046508789, + "learning_rate": 1.3350597747874284e-10, + "loss": 0.2763, + "step": 39426 + }, + { + "epoch": 0.9977224991775692, + "grad_norm": 5.5476460456848145, + "learning_rate": 1.3058791483722133e-10, + "loss": 0.1366, + "step": 39427 + }, + { + "epoch": 0.9977478047422629, + "grad_norm": 9.570043563842773, + "learning_rate": 1.277020954232322e-10, + "loss": 0.218, + "step": 39428 + }, + { + "epoch": 0.9977731103069565, + "grad_norm": 6.766509532928467, + "learning_rate": 1.2484851925564923e-10, + "loss": 0.1563, + "step": 39429 + }, + { + "epoch": 0.9977984158716502, + "grad_norm": 12.931002616882324, + "learning_rate": 1.220271863527911e-10, + "loss": 0.3075, + "step": 39430 + }, + { + "epoch": 0.9978237214363439, + "grad_norm": 3.655963659286499, + "learning_rate": 1.1923809673297648e-10, + "loss": 0.13, + "step": 39431 + }, + { + "epoch": 0.9978490270010375, + "grad_norm": 5.947310924530029, + "learning_rate": 1.1648125041452407e-10, + "loss": 0.2214, + "step": 39432 + }, + { + "epoch": 0.9978743325657312, + "grad_norm": 3.954619884490967, + "learning_rate": 1.1375664741464232e-10, + "loss": 0.0838, + "step": 39433 + }, + { + "epoch": 0.9978996381304249, + "grad_norm": 4.436159133911133, + "learning_rate": 1.1106428775053968e-10, + "loss": 0.2017, + "step": 39434 + }, + { + "epoch": 0.9979249436951185, + "grad_norm": 3.8196911811828613, + "learning_rate": 1.0840417144053483e-10, + "loss": 0.1148, + "step": 39435 + }, + { + "epoch": 0.9979502492598122, + "grad_norm": 9.61122989654541, + "learning_rate": 1.0577629850183624e-10, + "loss": 0.1897, + "step": 39436 + }, + { + "epoch": 0.997975554824506, + "grad_norm": 5.382273197174072, + "learning_rate": 1.0318066894998702e-10, + "loss": 0.1564, + "step": 39437 + }, + { + "epoch": 0.9980008603891996, + "grad_norm": 3.73714017868042, + "learning_rate": 1.0061728280330585e-10, + "loss": 0.1275, + "step": 39438 + }, + { + "epoch": 0.9980261659538933, + "grad_norm": 5.34977912902832, + "learning_rate": 9.808614007789097e-11, + "loss": 0.1816, + "step": 39439 + }, + { + "epoch": 0.998051471518587, + "grad_norm": 5.7111005783081055, + "learning_rate": 9.558724078928549e-11, + "loss": 0.137, + "step": 39440 + }, + { + "epoch": 0.9980767770832806, + "grad_norm": 6.482593536376953, + "learning_rate": 9.31205849546979e-11, + "loss": 0.2097, + "step": 39441 + }, + { + "epoch": 0.9981020826479743, + "grad_norm": 5.726070404052734, + "learning_rate": 9.068617258911616e-11, + "loss": 0.1825, + "step": 39442 + }, + { + "epoch": 0.998127388212668, + "grad_norm": 4.566473484039307, + "learning_rate": 8.828400370863854e-11, + "loss": 0.1205, + "step": 39443 + }, + { + "epoch": 0.9981526937773616, + "grad_norm": 3.78312087059021, + "learning_rate": 8.591407832880816e-11, + "loss": 0.1377, + "step": 39444 + }, + { + "epoch": 0.9981779993420553, + "grad_norm": 6.972850322723389, + "learning_rate": 8.357639646516813e-11, + "loss": 0.1817, + "step": 39445 + }, + { + "epoch": 0.998203304906749, + "grad_norm": 3.591179132461548, + "learning_rate": 8.127095813215136e-11, + "loss": 0.1076, + "step": 39446 + }, + { + "epoch": 0.9982286104714426, + "grad_norm": 9.980695724487305, + "learning_rate": 7.899776334530096e-11, + "loss": 0.1734, + "step": 39447 + }, + { + "epoch": 0.9982539160361363, + "grad_norm": 8.16496467590332, + "learning_rate": 7.675681211849473e-11, + "loss": 0.165, + "step": 39448 + }, + { + "epoch": 0.99827922160083, + "grad_norm": 5.313545227050781, + "learning_rate": 7.454810446727578e-11, + "loss": 0.161, + "step": 39449 + }, + { + "epoch": 0.9983045271655236, + "grad_norm": 3.7693727016448975, + "learning_rate": 7.237164040441169e-11, + "loss": 0.18, + "step": 39450 + }, + { + "epoch": 0.9983298327302174, + "grad_norm": 6.175013542175293, + "learning_rate": 7.022741994544557e-11, + "loss": 0.1155, + "step": 39451 + }, + { + "epoch": 0.9983551382949111, + "grad_norm": 5.122835636138916, + "learning_rate": 6.8115443103145e-11, + "loss": 0.1225, + "step": 39452 + }, + { + "epoch": 0.9983804438596048, + "grad_norm": 31.7905330657959, + "learning_rate": 6.603570989194285e-11, + "loss": 0.2721, + "step": 39453 + }, + { + "epoch": 0.9984057494242984, + "grad_norm": 4.006309509277344, + "learning_rate": 6.398822032405161e-11, + "loss": 0.1478, + "step": 39454 + }, + { + "epoch": 0.9984310549889921, + "grad_norm": 2.6272122859954834, + "learning_rate": 6.197297441390415e-11, + "loss": 0.0852, + "step": 39455 + }, + { + "epoch": 0.9984563605536858, + "grad_norm": 17.480546951293945, + "learning_rate": 5.998997217426805e-11, + "loss": 0.2031, + "step": 39456 + }, + { + "epoch": 0.9984816661183794, + "grad_norm": 3.6282458305358887, + "learning_rate": 5.803921361735576e-11, + "loss": 0.126, + "step": 39457 + }, + { + "epoch": 0.9985069716830731, + "grad_norm": 2.0674169063568115, + "learning_rate": 5.6120698756489956e-11, + "loss": 0.0675, + "step": 39458 + }, + { + "epoch": 0.9985322772477668, + "grad_norm": 19.255966186523438, + "learning_rate": 5.423442760332798e-11, + "loss": 0.1567, + "step": 39459 + }, + { + "epoch": 0.9985575828124604, + "grad_norm": 4.64605712890625, + "learning_rate": 5.238040017008228e-11, + "loss": 0.0968, + "step": 39460 + }, + { + "epoch": 0.9985828883771541, + "grad_norm": 2.8909244537353516, + "learning_rate": 5.055861646952043e-11, + "loss": 0.1053, + "step": 39461 + }, + { + "epoch": 0.9986081939418479, + "grad_norm": 3.808755874633789, + "learning_rate": 4.876907651218954e-11, + "loss": 0.1452, + "step": 39462 + }, + { + "epoch": 0.9986334995065415, + "grad_norm": 5.510989189147949, + "learning_rate": 4.701178031085718e-11, + "loss": 0.181, + "step": 39463 + }, + { + "epoch": 0.9986588050712352, + "grad_norm": 5.6481428146362305, + "learning_rate": 4.528672787607047e-11, + "loss": 0.2029, + "step": 39464 + }, + { + "epoch": 0.9986841106359289, + "grad_norm": 4.266093730926514, + "learning_rate": 4.359391921893163e-11, + "loss": 0.1442, + "step": 39465 + }, + { + "epoch": 0.9987094162006225, + "grad_norm": 4.72351598739624, + "learning_rate": 4.193335435109802e-11, + "loss": 0.1056, + "step": 39466 + }, + { + "epoch": 0.9987347217653162, + "grad_norm": 5.436206340789795, + "learning_rate": 4.0305033282561637e-11, + "loss": 0.1522, + "step": 39467 + }, + { + "epoch": 0.9987600273300099, + "grad_norm": 3.8472306728363037, + "learning_rate": 3.8708956023869595e-11, + "loss": 0.0993, + "step": 39468 + }, + { + "epoch": 0.9987853328947035, + "grad_norm": 8.588399887084961, + "learning_rate": 3.714512258556902e-11, + "loss": 0.1926, + "step": 39469 + }, + { + "epoch": 0.9988106384593972, + "grad_norm": 21.534404754638672, + "learning_rate": 3.561353297765191e-11, + "loss": 0.2768, + "step": 39470 + }, + { + "epoch": 0.9988359440240909, + "grad_norm": 4.464258193969727, + "learning_rate": 3.4114187210110285e-11, + "loss": 0.1107, + "step": 39471 + }, + { + "epoch": 0.9988612495887845, + "grad_norm": 6.806620121002197, + "learning_rate": 3.264708529238103e-11, + "loss": 0.2376, + "step": 39472 + }, + { + "epoch": 0.9988865551534782, + "grad_norm": 7.563985824584961, + "learning_rate": 3.121222723445616e-11, + "loss": 0.1864, + "step": 39473 + }, + { + "epoch": 0.998911860718172, + "grad_norm": 6.628334045410156, + "learning_rate": 2.9809613044662345e-11, + "loss": 0.1246, + "step": 39474 + }, + { + "epoch": 0.9989371662828656, + "grad_norm": 4.143768310546875, + "learning_rate": 2.843924273243648e-11, + "loss": 0.1199, + "step": 39475 + }, + { + "epoch": 0.9989624718475593, + "grad_norm": 10.666023254394531, + "learning_rate": 2.7101116307215458e-11, + "loss": 0.2222, + "step": 39476 + }, + { + "epoch": 0.998987777412253, + "grad_norm": 8.774330139160156, + "learning_rate": 2.5795233776770845e-11, + "loss": 0.2814, + "step": 39477 + }, + { + "epoch": 0.9990130829769466, + "grad_norm": 2.995098829269409, + "learning_rate": 2.4521595150539533e-11, + "loss": 0.1091, + "step": 39478 + }, + { + "epoch": 0.9990383885416403, + "grad_norm": 2.7856831550598145, + "learning_rate": 2.3280200435182866e-11, + "loss": 0.1306, + "step": 39479 + }, + { + "epoch": 0.999063694106334, + "grad_norm": 3.06009840965271, + "learning_rate": 2.207104964013773e-11, + "loss": 0.0976, + "step": 39480 + }, + { + "epoch": 0.9990889996710277, + "grad_norm": 4.952132701873779, + "learning_rate": 2.0894142772620586e-11, + "loss": 0.1182, + "step": 39481 + }, + { + "epoch": 0.9991143052357213, + "grad_norm": 7.977906227111816, + "learning_rate": 1.9749479840402984e-11, + "loss": 0.1852, + "step": 39482 + }, + { + "epoch": 0.999139610800415, + "grad_norm": 12.960599899291992, + "learning_rate": 1.8637060850701382e-11, + "loss": 0.3824, + "step": 39483 + }, + { + "epoch": 0.9991649163651087, + "grad_norm": 3.503293514251709, + "learning_rate": 1.7556885810732227e-11, + "loss": 0.1079, + "step": 39484 + }, + { + "epoch": 0.9991902219298023, + "grad_norm": 4.683947563171387, + "learning_rate": 1.6508954727711966e-11, + "loss": 0.1331, + "step": 39485 + }, + { + "epoch": 0.999215527494496, + "grad_norm": 6.488844871520996, + "learning_rate": 1.5493267607746833e-11, + "loss": 0.1815, + "step": 39486 + }, + { + "epoch": 0.9992408330591898, + "grad_norm": 6.319011688232422, + "learning_rate": 1.4509824458053268e-11, + "loss": 0.1596, + "step": 39487 + }, + { + "epoch": 0.9992661386238834, + "grad_norm": 9.899828910827637, + "learning_rate": 1.3558625284182391e-11, + "loss": 0.2132, + "step": 39488 + }, + { + "epoch": 0.9992914441885771, + "grad_norm": 6.533380508422852, + "learning_rate": 1.2639670093350653e-11, + "loss": 0.1457, + "step": 39489 + }, + { + "epoch": 0.9993167497532708, + "grad_norm": 3.994419574737549, + "learning_rate": 1.1752958891109167e-11, + "loss": 0.1631, + "step": 39490 + }, + { + "epoch": 0.9993420553179644, + "grad_norm": 3.4839096069335938, + "learning_rate": 1.0898491682453938e-11, + "loss": 0.1615, + "step": 39491 + }, + { + "epoch": 0.9993673608826581, + "grad_norm": 3.3043134212493896, + "learning_rate": 1.0076268473491191e-11, + "loss": 0.1574, + "step": 39492 + }, + { + "epoch": 0.9993926664473518, + "grad_norm": 4.585416793823242, + "learning_rate": 9.286289269772041e-12, + "loss": 0.1608, + "step": 39493 + }, + { + "epoch": 0.9994179720120454, + "grad_norm": 3.4138071537017822, + "learning_rate": 8.528554076292494e-12, + "loss": 0.1406, + "step": 39494 + }, + { + "epoch": 0.9994432775767391, + "grad_norm": 6.693256855010986, + "learning_rate": 7.803062897493441e-12, + "loss": 0.17, + "step": 39495 + }, + { + "epoch": 0.9994685831414328, + "grad_norm": 6.797619342803955, + "learning_rate": 7.109815738370885e-12, + "loss": 0.1574, + "step": 39496 + }, + { + "epoch": 0.9994938887061264, + "grad_norm": 6.944126129150391, + "learning_rate": 6.448812602810606e-12, + "loss": 0.166, + "step": 39497 + }, + { + "epoch": 0.9995191942708201, + "grad_norm": 6.831911563873291, + "learning_rate": 5.82005349636372e-12, + "loss": 0.1182, + "step": 39498 + }, + { + "epoch": 0.9995444998355139, + "grad_norm": 8.951163291931152, + "learning_rate": 5.223538421805785e-12, + "loss": 0.1916, + "step": 39499 + }, + { + "epoch": 0.9995698054002075, + "grad_norm": 3.579357624053955, + "learning_rate": 4.659267383577693e-12, + "loss": 0.1341, + "step": 39500 + }, + { + "epoch": 0.9995951109649012, + "grad_norm": 3.439085006713867, + "learning_rate": 4.127240385010112e-12, + "loss": 0.1006, + "step": 39501 + }, + { + "epoch": 0.9996204165295949, + "grad_norm": 7.198095798492432, + "learning_rate": 3.6274574305439346e-12, + "loss": 0.1785, + "step": 39502 + }, + { + "epoch": 0.9996457220942885, + "grad_norm": 4.468045234680176, + "learning_rate": 3.1599185218444963e-12, + "loss": 0.1303, + "step": 39503 + }, + { + "epoch": 0.9996710276589822, + "grad_norm": 4.244230270385742, + "learning_rate": 2.7246236627975765e-12, + "loss": 0.173, + "step": 39504 + }, + { + "epoch": 0.9996963332236759, + "grad_norm": 7.149433135986328, + "learning_rate": 2.3215728561787332e-12, + "loss": 0.134, + "step": 39505 + }, + { + "epoch": 0.9997216387883696, + "grad_norm": 3.3436269760131836, + "learning_rate": 1.950766104763524e-12, + "loss": 0.1388, + "step": 39506 + }, + { + "epoch": 0.9997469443530632, + "grad_norm": 6.921220779418945, + "learning_rate": 1.612203410772395e-12, + "loss": 0.142, + "step": 39507 + }, + { + "epoch": 0.9997722499177569, + "grad_norm": 4.691792964935303, + "learning_rate": 1.3058847758706805e-12, + "loss": 0.1613, + "step": 39508 + }, + { + "epoch": 0.9997975554824506, + "grad_norm": 7.617001056671143, + "learning_rate": 1.0318102028339383e-12, + "loss": 0.1654, + "step": 39509 + }, + { + "epoch": 0.9998228610471442, + "grad_norm": 2.9594085216522217, + "learning_rate": 7.899796927723913e-13, + "loss": 0.0998, + "step": 39510 + }, + { + "epoch": 0.999848166611838, + "grad_norm": 5.551313400268555, + "learning_rate": 5.803932479064855e-13, + "loss": 0.1434, + "step": 39511 + }, + { + "epoch": 0.9998734721765317, + "grad_norm": 4.562256813049316, + "learning_rate": 4.030508687913326e-13, + "loss": 0.0791, + "step": 39512 + }, + { + "epoch": 0.9998987777412253, + "grad_norm": 3.409564256668091, + "learning_rate": 2.5795255709226695e-13, + "loss": 0.1087, + "step": 39513 + }, + { + "epoch": 0.999924083305919, + "grad_norm": 4.941267013549805, + "learning_rate": 1.4509831391951167e-13, + "loss": 0.1486, + "step": 39514 + }, + { + "epoch": 0.9999493888706127, + "grad_norm": 3.367554187774658, + "learning_rate": 6.448813982817825e-14, + "loss": 0.0846, + "step": 39515 + }, + { + "epoch": 0.9999746944353063, + "grad_norm": 5.534474849700928, + "learning_rate": 1.6122034818266685e-14, + "loss": 0.1209, + "step": 39516 + }, + { + "epoch": 1.0, + "grad_norm": 5.023179531097412, + "learning_rate": 0.0, + "loss": 0.1587, + "step": 39517 + }, + { + "epoch": 1.0, + "step": 39517, + "total_flos": 5.561277359811723e+18, + "train_loss": 0.18354613499983982, + "train_runtime": 38782.0916, + "train_samples_per_second": 8.152, + "train_steps_per_second": 1.019 + } + ], + "logging_steps": 1, + "max_steps": 39517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 39517, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.561277359811723e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}