{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7039764359351989, "eval_steps": 500, "global_step": 239, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0029455081001472753, "grad_norm": 148176371712.0, "learning_rate": 1.0000000000000002e-06, "loss": 3.5829, "step": 1 }, { "epoch": 0.005891016200294551, "grad_norm": 521838526464.0, "learning_rate": 2.0000000000000003e-06, "loss": 3.7866, "step": 2 }, { "epoch": 0.008836524300441826, "grad_norm": 29312.015625, "learning_rate": 3e-06, "loss": 5.4788, "step": 3 }, { "epoch": 0.011782032400589101, "grad_norm": 4224.009765625, "learning_rate": 4.000000000000001e-06, "loss": 4.9963, "step": 4 }, { "epoch": 0.014727540500736377, "grad_norm": 6816.009765625, "learning_rate": 5e-06, "loss": 6.1696, "step": 5 }, { "epoch": 0.017673048600883652, "grad_norm": 37120.00390625, "learning_rate": 6e-06, "loss": 6.0103, "step": 6 }, { "epoch": 0.020618556701030927, "grad_norm": 16512.0078125, "learning_rate": 7.000000000000001e-06, "loss": 5.2747, "step": 7 }, { "epoch": 0.023564064801178203, "grad_norm": 4544.0126953125, "learning_rate": 8.000000000000001e-06, "loss": 5.0828, "step": 8 }, { "epoch": 0.026509572901325478, "grad_norm": 1360.0126953125, "learning_rate": 9e-06, "loss": 4.4745, "step": 9 }, { "epoch": 0.029455081001472753, "grad_norm": 2528.019287109375, "learning_rate": 1e-05, "loss": 6.0115, "step": 10 }, { "epoch": 0.03240058910162003, "grad_norm": 608.0706787109375, "learning_rate": 1.1000000000000001e-05, "loss": 5.1475, "step": 11 }, { "epoch": 0.035346097201767304, "grad_norm": 1560.0152587890625, "learning_rate": 1.2e-05, "loss": 5.2077, "step": 12 }, { "epoch": 0.03829160530191458, "grad_norm": 1152.024169921875, "learning_rate": 1.3000000000000001e-05, "loss": 4.9248, "step": 13 }, { "epoch": 0.041237113402061855, "grad_norm": 1096.0169677734375, "learning_rate": 1.4000000000000001e-05, "loss": 4.3356, "step": 14 }, { "epoch": 0.044182621502209134, "grad_norm": 2560.037841796875, "learning_rate": 1.5e-05, "loss": 5.9271, "step": 15 }, { "epoch": 0.047128129602356406, "grad_norm": 1312.0172119140625, "learning_rate": 1.6000000000000003e-05, "loss": 4.9467, "step": 16 }, { "epoch": 0.050073637702503684, "grad_norm": 748.0582275390625, "learning_rate": 1.7000000000000003e-05, "loss": 5.5781, "step": 17 }, { "epoch": 0.053019145802650956, "grad_norm": 608.0347290039062, "learning_rate": 1.8e-05, "loss": 4.9313, "step": 18 }, { "epoch": 0.055964653902798235, "grad_norm": 444.0478820800781, "learning_rate": 1.9e-05, "loss": 4.7802, "step": 19 }, { "epoch": 0.05891016200294551, "grad_norm": 400.05474853515625, "learning_rate": 2e-05, "loss": 4.9916, "step": 20 }, { "epoch": 0.061855670103092786, "grad_norm": 324.0529479980469, "learning_rate": 2.1e-05, "loss": 4.5852, "step": 21 }, { "epoch": 0.06480117820324006, "grad_norm": 552.0383911132812, "learning_rate": 2.2000000000000003e-05, "loss": 4.5658, "step": 22 }, { "epoch": 0.06774668630338733, "grad_norm": 780.0543823242188, "learning_rate": 2.3000000000000003e-05, "loss": 4.9509, "step": 23 }, { "epoch": 0.07069219440353461, "grad_norm": 1352.05712890625, "learning_rate": 2.4e-05, "loss": 4.9526, "step": 24 }, { "epoch": 0.07363770250368189, "grad_norm": 692.063720703125, "learning_rate": 2.5e-05, "loss": 5.1079, "step": 25 }, { "epoch": 0.07658321060382917, "grad_norm": 940.0375366210938, "learning_rate": 2.6000000000000002e-05, "loss": 4.3864, "step": 26 }, { "epoch": 0.07952871870397643, "grad_norm": 406.0652770996094, "learning_rate": 2.7000000000000002e-05, "loss": 4.7486, "step": 27 }, { "epoch": 0.08247422680412371, "grad_norm": 736.0764770507812, "learning_rate": 2.8000000000000003e-05, "loss": 5.4081, "step": 28 }, { "epoch": 0.08541973490427099, "grad_norm": 1240.040283203125, "learning_rate": 2.9e-05, "loss": 4.6019, "step": 29 }, { "epoch": 0.08836524300441827, "grad_norm": 1312.10302734375, "learning_rate": 3e-05, "loss": 5.481, "step": 30 }, { "epoch": 0.09131075110456553, "grad_norm": 466.0743103027344, "learning_rate": 3.1e-05, "loss": 4.4326, "step": 31 }, { "epoch": 0.09425625920471281, "grad_norm": 828.0822143554688, "learning_rate": 3.2000000000000005e-05, "loss": 5.0284, "step": 32 }, { "epoch": 0.09720176730486009, "grad_norm": 764.077880859375, "learning_rate": 3.3e-05, "loss": 4.9104, "step": 33 }, { "epoch": 0.10014727540500737, "grad_norm": 231.1255340576172, "learning_rate": 3.4000000000000007e-05, "loss": 3.9872, "step": 34 }, { "epoch": 0.10309278350515463, "grad_norm": 434.1127624511719, "learning_rate": 3.5e-05, "loss": 5.2373, "step": 35 }, { "epoch": 0.10603829160530191, "grad_norm": 253.16815185546875, "learning_rate": 3.6e-05, "loss": 4.8148, "step": 36 }, { "epoch": 0.10898379970544919, "grad_norm": 290.1834716796875, "learning_rate": 3.7e-05, "loss": 3.6852, "step": 37 }, { "epoch": 0.11192930780559647, "grad_norm": 584.0982666015625, "learning_rate": 3.8e-05, "loss": 3.8818, "step": 38 }, { "epoch": 0.11487481590574374, "grad_norm": 904.0762329101562, "learning_rate": 3.9000000000000006e-05, "loss": 4.2878, "step": 39 }, { "epoch": 0.11782032400589101, "grad_norm": 336.28033447265625, "learning_rate": 4e-05, "loss": 4.2046, "step": 40 }, { "epoch": 0.12076583210603829, "grad_norm": 704.1439208984375, "learning_rate": 4.1e-05, "loss": 4.4065, "step": 41 }, { "epoch": 0.12371134020618557, "grad_norm": 804.0842895507812, "learning_rate": 4.2e-05, "loss": 3.9044, "step": 42 }, { "epoch": 0.12665684830633284, "grad_norm": 752.0977172851562, "learning_rate": 4.3e-05, "loss": 3.5964, "step": 43 }, { "epoch": 0.12960235640648013, "grad_norm": 752.181884765625, "learning_rate": 4.4000000000000006e-05, "loss": 4.3488, "step": 44 }, { "epoch": 0.1325478645066274, "grad_norm": 664.2083129882812, "learning_rate": 4.5e-05, "loss": 4.6978, "step": 45 }, { "epoch": 0.13549337260677466, "grad_norm": 1984.197998046875, "learning_rate": 4.600000000000001e-05, "loss": 3.8173, "step": 46 }, { "epoch": 0.13843888070692195, "grad_norm": 1536.0904541015625, "learning_rate": 4.7e-05, "loss": 3.3992, "step": 47 }, { "epoch": 0.14138438880706922, "grad_norm": 616.1571044921875, "learning_rate": 4.8e-05, "loss": 3.5619, "step": 48 }, { "epoch": 0.14432989690721648, "grad_norm": 5408.072265625, "learning_rate": 4.9e-05, "loss": 4.3322, "step": 49 }, { "epoch": 0.14727540500736377, "grad_norm": 3408.068603515625, "learning_rate": 5e-05, "loss": 4.0344, "step": 50 }, { "epoch": 0.15022091310751104, "grad_norm": 724.0775146484375, "learning_rate": 5.1000000000000006e-05, "loss": 3.8171, "step": 51 }, { "epoch": 0.15316642120765833, "grad_norm": 1448.1661376953125, "learning_rate": 5.2000000000000004e-05, "loss": 2.6826, "step": 52 }, { "epoch": 0.1561119293078056, "grad_norm": 304.0506591796875, "learning_rate": 5.300000000000001e-05, "loss": 2.4422, "step": 53 }, { "epoch": 0.15905743740795286, "grad_norm": 396.0224914550781, "learning_rate": 5.4000000000000005e-05, "loss": 2.4475, "step": 54 }, { "epoch": 0.16200294550810015, "grad_norm": 684.0194702148438, "learning_rate": 5.500000000000001e-05, "loss": 2.2932, "step": 55 }, { "epoch": 0.16494845360824742, "grad_norm": 231.03273010253906, "learning_rate": 5.6000000000000006e-05, "loss": 2.5193, "step": 56 }, { "epoch": 0.16789396170839468, "grad_norm": 164.0357208251953, "learning_rate": 5.6999999999999996e-05, "loss": 1.9575, "step": 57 }, { "epoch": 0.17083946980854198, "grad_norm": 1600.0091552734375, "learning_rate": 5.8e-05, "loss": 2.5806, "step": 58 }, { "epoch": 0.17378497790868924, "grad_norm": 50.36320114135742, "learning_rate": 5.9e-05, "loss": 3.651, "step": 59 }, { "epoch": 0.17673048600883653, "grad_norm": 66.08223724365234, "learning_rate": 6e-05, "loss": 3.5998, "step": 60 }, { "epoch": 0.1796759941089838, "grad_norm": 20.88555335998535, "learning_rate": 6.1e-05, "loss": 3.2898, "step": 61 }, { "epoch": 0.18262150220913106, "grad_norm": 33.95121765136719, "learning_rate": 6.2e-05, "loss": 3.5322, "step": 62 }, { "epoch": 0.18556701030927836, "grad_norm": 18.569725036621094, "learning_rate": 6.3e-05, "loss": 2.7009, "step": 63 }, { "epoch": 0.18851251840942562, "grad_norm": 11.800430297851562, "learning_rate": 6.400000000000001e-05, "loss": 3.0148, "step": 64 }, { "epoch": 0.19145802650957292, "grad_norm": 11.598824501037598, "learning_rate": 6.500000000000001e-05, "loss": 2.5356, "step": 65 }, { "epoch": 0.19440353460972018, "grad_norm": 29.798603057861328, "learning_rate": 6.6e-05, "loss": 3.419, "step": 66 }, { "epoch": 0.19734904270986744, "grad_norm": 23.390228271484375, "learning_rate": 6.7e-05, "loss": 3.0823, "step": 67 }, { "epoch": 0.20029455081001474, "grad_norm": 30.04606056213379, "learning_rate": 6.800000000000001e-05, "loss": 2.6675, "step": 68 }, { "epoch": 0.203240058910162, "grad_norm": 59.424739837646484, "learning_rate": 6.9e-05, "loss": 2.6406, "step": 69 }, { "epoch": 0.20618556701030927, "grad_norm": 18.69099235534668, "learning_rate": 7e-05, "loss": 2.8912, "step": 70 }, { "epoch": 0.20913107511045656, "grad_norm": 11.040902137756348, "learning_rate": 7.1e-05, "loss": 2.2423, "step": 71 }, { "epoch": 0.21207658321060383, "grad_norm": 15.589529991149902, "learning_rate": 7.2e-05, "loss": 2.6387, "step": 72 }, { "epoch": 0.21502209131075112, "grad_norm": 8.346650123596191, "learning_rate": 7.3e-05, "loss": 2.7416, "step": 73 }, { "epoch": 0.21796759941089838, "grad_norm": 16.002195358276367, "learning_rate": 7.4e-05, "loss": 2.5981, "step": 74 }, { "epoch": 0.22091310751104565, "grad_norm": 12.781440734863281, "learning_rate": 7.500000000000001e-05, "loss": 2.8455, "step": 75 }, { "epoch": 0.22385861561119294, "grad_norm": 14.14976978302002, "learning_rate": 7.6e-05, "loss": 2.3376, "step": 76 }, { "epoch": 0.2268041237113402, "grad_norm": 14.174376487731934, "learning_rate": 7.7e-05, "loss": 2.2975, "step": 77 }, { "epoch": 0.22974963181148747, "grad_norm": 12.794411659240723, "learning_rate": 7.800000000000001e-05, "loss": 2.2424, "step": 78 }, { "epoch": 0.23269513991163476, "grad_norm": 8.186522483825684, "learning_rate": 7.900000000000001e-05, "loss": 1.9646, "step": 79 }, { "epoch": 0.23564064801178203, "grad_norm": 7.599482536315918, "learning_rate": 8e-05, "loss": 2.2699, "step": 80 }, { "epoch": 0.23858615611192932, "grad_norm": 13.385209083557129, "learning_rate": 8.1e-05, "loss": 2.9279, "step": 81 }, { "epoch": 0.24153166421207659, "grad_norm": 10.520708084106445, "learning_rate": 8.2e-05, "loss": 2.6627, "step": 82 }, { "epoch": 0.24447717231222385, "grad_norm": 10.881275177001953, "learning_rate": 8.3e-05, "loss": 2.6776, "step": 83 }, { "epoch": 0.24742268041237114, "grad_norm": 11.189691543579102, "learning_rate": 8.4e-05, "loss": 2.5101, "step": 84 }, { "epoch": 0.2503681885125184, "grad_norm": 14.185538291931152, "learning_rate": 8.5e-05, "loss": 2.4517, "step": 85 }, { "epoch": 0.2533136966126657, "grad_norm": 11.319371223449707, "learning_rate": 8.6e-05, "loss": 2.3935, "step": 86 }, { "epoch": 0.25625920471281294, "grad_norm": 11.8043212890625, "learning_rate": 8.7e-05, "loss": 2.3776, "step": 87 }, { "epoch": 0.25920471281296026, "grad_norm": 17.876741409301758, "learning_rate": 8.800000000000001e-05, "loss": 2.1173, "step": 88 }, { "epoch": 0.2621502209131075, "grad_norm": 9.818507194519043, "learning_rate": 8.900000000000001e-05, "loss": 2.4566, "step": 89 }, { "epoch": 0.2650957290132548, "grad_norm": 17.97085189819336, "learning_rate": 9e-05, "loss": 2.4889, "step": 90 }, { "epoch": 0.26804123711340205, "grad_norm": 13.525333404541016, "learning_rate": 9.1e-05, "loss": 2.6413, "step": 91 }, { "epoch": 0.2709867452135493, "grad_norm": 11.17566204071045, "learning_rate": 9.200000000000001e-05, "loss": 2.5934, "step": 92 }, { "epoch": 0.27393225331369664, "grad_norm": 10.861861228942871, "learning_rate": 9.300000000000001e-05, "loss": 2.0431, "step": 93 }, { "epoch": 0.2768777614138439, "grad_norm": 10.782200813293457, "learning_rate": 9.4e-05, "loss": 2.2952, "step": 94 }, { "epoch": 0.27982326951399117, "grad_norm": 10.202653884887695, "learning_rate": 9.5e-05, "loss": 1.9, "step": 95 }, { "epoch": 0.28276877761413843, "grad_norm": 11.118406295776367, "learning_rate": 9.6e-05, "loss": 1.8267, "step": 96 }, { "epoch": 0.2857142857142857, "grad_norm": 11.998210906982422, "learning_rate": 9.7e-05, "loss": 2.7094, "step": 97 }, { "epoch": 0.28865979381443296, "grad_norm": 11.203243255615234, "learning_rate": 9.8e-05, "loss": 2.6765, "step": 98 }, { "epoch": 0.2916053019145803, "grad_norm": 11.754383087158203, "learning_rate": 9.900000000000001e-05, "loss": 2.323, "step": 99 }, { "epoch": 0.29455081001472755, "grad_norm": 20.766103744506836, "learning_rate": 0.0001, "loss": 3.2395, "step": 100 }, { "epoch": 0.2974963181148748, "grad_norm": 19.38508415222168, "learning_rate": 9.999568045802217e-05, "loss": 3.1611, "step": 101 }, { "epoch": 0.3004418262150221, "grad_norm": 12.322184562683105, "learning_rate": 9.998272257842641e-05, "loss": 1.8749, "step": 102 }, { "epoch": 0.30338733431516934, "grad_norm": 8.819778442382812, "learning_rate": 9.996112860009688e-05, "loss": 2.1352, "step": 103 }, { "epoch": 0.30633284241531666, "grad_norm": 4.589802265167236, "learning_rate": 9.993090225407743e-05, "loss": 1.9195, "step": 104 }, { "epoch": 0.30927835051546393, "grad_norm": 3.1305477619171143, "learning_rate": 9.989204876292688e-05, "loss": 1.6522, "step": 105 }, { "epoch": 0.3122238586156112, "grad_norm": 2.7143571376800537, "learning_rate": 9.984457483981669e-05, "loss": 1.1763, "step": 106 }, { "epoch": 0.31516936671575846, "grad_norm": 2.8804931640625, "learning_rate": 9.978848868737098e-05, "loss": 1.7974, "step": 107 }, { "epoch": 0.3181148748159057, "grad_norm": 3.790278673171997, "learning_rate": 9.972379999624936e-05, "loss": 2.1482, "step": 108 }, { "epoch": 0.32106038291605304, "grad_norm": 4.674582004547119, "learning_rate": 9.96505199434725e-05, "loss": 2.5313, "step": 109 }, { "epoch": 0.3240058910162003, "grad_norm": 3.20375657081604, "learning_rate": 9.956866119049095e-05, "loss": 2.2333, "step": 110 }, { "epoch": 0.3269513991163476, "grad_norm": 3.65432071685791, "learning_rate": 9.947823788099753e-05, "loss": 2.3037, "step": 111 }, { "epoch": 0.32989690721649484, "grad_norm": 4.25070858001709, "learning_rate": 9.937926563848346e-05, "loss": 2.1421, "step": 112 }, { "epoch": 0.3328424153166421, "grad_norm": 3.4754691123962402, "learning_rate": 9.927176156353899e-05, "loss": 1.6731, "step": 113 }, { "epoch": 0.33578792341678937, "grad_norm": 4.312386989593506, "learning_rate": 9.91557442308987e-05, "loss": 2.2766, "step": 114 }, { "epoch": 0.3387334315169367, "grad_norm": 8.520076751708984, "learning_rate": 9.903123368623216e-05, "loss": 2.3844, "step": 115 }, { "epoch": 0.34167893961708395, "grad_norm": 9.280426979064941, "learning_rate": 9.889825144268029e-05, "loss": 2.5895, "step": 116 }, { "epoch": 0.3446244477172312, "grad_norm": 5.867470741271973, "learning_rate": 9.875682047713846e-05, "loss": 2.1818, "step": 117 }, { "epoch": 0.3475699558173785, "grad_norm": 4.29853630065918, "learning_rate": 9.860696522628639e-05, "loss": 2.1243, "step": 118 }, { "epoch": 0.35051546391752575, "grad_norm": 4.369655132293701, "learning_rate": 9.844871158236591e-05, "loss": 1.9835, "step": 119 }, { "epoch": 0.35346097201767307, "grad_norm": 15.677599906921387, "learning_rate": 9.828208688870735e-05, "loss": 2.3591, "step": 120 }, { "epoch": 0.35640648011782033, "grad_norm": 7.070474624633789, "learning_rate": 9.810711993500507e-05, "loss": 2.1378, "step": 121 }, { "epoch": 0.3593519882179676, "grad_norm": 4.5425004959106445, "learning_rate": 9.792384095234313e-05, "loss": 1.838, "step": 122 }, { "epoch": 0.36229749631811486, "grad_norm": 5.75803804397583, "learning_rate": 9.773228160797188e-05, "loss": 2.045, "step": 123 }, { "epoch": 0.36524300441826213, "grad_norm": 4.905186653137207, "learning_rate": 9.753247499983649e-05, "loss": 2.263, "step": 124 }, { "epoch": 0.36818851251840945, "grad_norm": 5.079438209533691, "learning_rate": 9.732445565085824e-05, "loss": 1.9375, "step": 125 }, { "epoch": 0.3711340206185567, "grad_norm": 5.083024978637695, "learning_rate": 9.71082595029695e-05, "loss": 1.9512, "step": 126 }, { "epoch": 0.374079528718704, "grad_norm": 5.701948642730713, "learning_rate": 9.688392391090373e-05, "loss": 2.0932, "step": 127 }, { "epoch": 0.37702503681885124, "grad_norm": 5.612240314483643, "learning_rate": 9.665148763574123e-05, "loss": 2.1218, "step": 128 }, { "epoch": 0.3799705449189985, "grad_norm": 5.693153381347656, "learning_rate": 9.64109908382119e-05, "loss": 2.4763, "step": 129 }, { "epoch": 0.38291605301914583, "grad_norm": 5.299254417419434, "learning_rate": 9.616247507175623e-05, "loss": 2.3062, "step": 130 }, { "epoch": 0.3858615611192931, "grad_norm": 5.711902618408203, "learning_rate": 9.590598327534564e-05, "loss": 2.4087, "step": 131 }, { "epoch": 0.38880706921944036, "grad_norm": 4.181498050689697, "learning_rate": 9.564155976606339e-05, "loss": 1.7175, "step": 132 }, { "epoch": 0.3917525773195876, "grad_norm": 5.31155252456665, "learning_rate": 9.536925023144742e-05, "loss": 1.6347, "step": 133 }, { "epoch": 0.3946980854197349, "grad_norm": 6.574872970581055, "learning_rate": 9.508910172159635e-05, "loss": 1.8686, "step": 134 }, { "epoch": 0.39764359351988215, "grad_norm": 6.77908992767334, "learning_rate": 9.480116264104011e-05, "loss": 2.1223, "step": 135 }, { "epoch": 0.4005891016200295, "grad_norm": 7.450313091278076, "learning_rate": 9.450548274037653e-05, "loss": 2.3148, "step": 136 }, { "epoch": 0.40353460972017674, "grad_norm": 5.135898113250732, "learning_rate": 9.420211310767533e-05, "loss": 1.386, "step": 137 }, { "epoch": 0.406480117820324, "grad_norm": 8.198874473571777, "learning_rate": 9.389110615965102e-05, "loss": 1.9208, "step": 138 }, { "epoch": 0.40942562592047127, "grad_norm": 10.153027534484863, "learning_rate": 9.35725156326063e-05, "loss": 2.2577, "step": 139 }, { "epoch": 0.41237113402061853, "grad_norm": 8.334872245788574, "learning_rate": 9.324639657314742e-05, "loss": 2.0401, "step": 140 }, { "epoch": 0.41531664212076586, "grad_norm": 6.7754950523376465, "learning_rate": 9.291280532867302e-05, "loss": 1.5202, "step": 141 }, { "epoch": 0.4182621502209131, "grad_norm": 9.433147430419922, "learning_rate": 9.257179953763845e-05, "loss": 2.1687, "step": 142 }, { "epoch": 0.4212076583210604, "grad_norm": 7.446643352508545, "learning_rate": 9.222343811959693e-05, "loss": 1.6996, "step": 143 }, { "epoch": 0.42415316642120765, "grad_norm": 7.048992156982422, "learning_rate": 9.186778126501916e-05, "loss": 1.5279, "step": 144 }, { "epoch": 0.4270986745213549, "grad_norm": 7.340887069702148, "learning_rate": 9.150489042489367e-05, "loss": 1.4406, "step": 145 }, { "epoch": 0.43004418262150224, "grad_norm": 11.633044242858887, "learning_rate": 9.113482830010918e-05, "loss": 1.5187, "step": 146 }, { "epoch": 0.4329896907216495, "grad_norm": 9.241588592529297, "learning_rate": 9.075765883062093e-05, "loss": 1.8564, "step": 147 }, { "epoch": 0.43593519882179677, "grad_norm": 8.745128631591797, "learning_rate": 9.037344718440322e-05, "loss": 1.5419, "step": 148 }, { "epoch": 0.43888070692194403, "grad_norm": 16.220924377441406, "learning_rate": 8.99822597461894e-05, "loss": 2.5006, "step": 149 }, { "epoch": 0.4418262150220913, "grad_norm": 15.44288444519043, "learning_rate": 8.958416410600187e-05, "loss": 2.0001, "step": 150 }, { "epoch": 0.44477172312223856, "grad_norm": 10.727198600769043, "learning_rate": 8.917922904747384e-05, "loss": 2.8376, "step": 151 }, { "epoch": 0.4477172312223859, "grad_norm": 5.658038139343262, "learning_rate": 8.876752453596462e-05, "loss": 1.7435, "step": 152 }, { "epoch": 0.45066273932253315, "grad_norm": 4.7606892585754395, "learning_rate": 8.834912170647101e-05, "loss": 1.3439, "step": 153 }, { "epoch": 0.4536082474226804, "grad_norm": 4.7208757400512695, "learning_rate": 8.792409285133642e-05, "loss": 1.8022, "step": 154 }, { "epoch": 0.4565537555228277, "grad_norm": 3.1722121238708496, "learning_rate": 8.749251140776016e-05, "loss": 0.7509, "step": 155 }, { "epoch": 0.45949926362297494, "grad_norm": 2.528482675552368, "learning_rate": 8.705445194510868e-05, "loss": 1.6068, "step": 156 }, { "epoch": 0.46244477172312226, "grad_norm": 2.8189425468444824, "learning_rate": 8.66099901520315e-05, "loss": 0.4364, "step": 157 }, { "epoch": 0.4653902798232695, "grad_norm": 2.734297752380371, "learning_rate": 8.615920282338355e-05, "loss": 1.6278, "step": 158 }, { "epoch": 0.4683357879234168, "grad_norm": 2.981930732727051, "learning_rate": 8.570216784695637e-05, "loss": 1.9541, "step": 159 }, { "epoch": 0.47128129602356406, "grad_norm": 3.195127248764038, "learning_rate": 8.52389641900206e-05, "loss": 2.0949, "step": 160 }, { "epoch": 0.4742268041237113, "grad_norm": 3.8657867908477783, "learning_rate": 8.476967188568188e-05, "loss": 1.8021, "step": 161 }, { "epoch": 0.47717231222385864, "grad_norm": 3.0095670223236084, "learning_rate": 8.429437201905254e-05, "loss": 1.8119, "step": 162 }, { "epoch": 0.4801178203240059, "grad_norm": 3.025339365005493, "learning_rate": 8.381314671324159e-05, "loss": 1.8117, "step": 163 }, { "epoch": 0.48306332842415317, "grad_norm": 4.113118648529053, "learning_rate": 8.332607911516545e-05, "loss": 1.7912, "step": 164 }, { "epoch": 0.48600883652430044, "grad_norm": 3.183894395828247, "learning_rate": 8.283325338118153e-05, "loss": 1.7364, "step": 165 }, { "epoch": 0.4889543446244477, "grad_norm": 5.081910610198975, "learning_rate": 8.233475466254765e-05, "loss": 2.0706, "step": 166 }, { "epoch": 0.49189985272459497, "grad_norm": 4.033633708953857, "learning_rate": 8.183066909070947e-05, "loss": 2.0481, "step": 167 }, { "epoch": 0.4948453608247423, "grad_norm": 4.505871772766113, "learning_rate": 8.132108376241849e-05, "loss": 2.044, "step": 168 }, { "epoch": 0.49779086892488955, "grad_norm": 6.440969944000244, "learning_rate": 8.08060867246834e-05, "loss": 2.209, "step": 169 }, { "epoch": 0.5007363770250368, "grad_norm": 6.235000133514404, "learning_rate": 8.028576695955711e-05, "loss": 2.0359, "step": 170 }, { "epoch": 0.5036818851251841, "grad_norm": 4.07951021194458, "learning_rate": 7.97602143687623e-05, "loss": 1.6721, "step": 171 }, { "epoch": 0.5066273932253313, "grad_norm": 4.1656413078308105, "learning_rate": 7.922951975815811e-05, "loss": 1.7503, "step": 172 }, { "epoch": 0.5095729013254786, "grad_norm": 3.7077293395996094, "learning_rate": 7.869377482205042e-05, "loss": 1.6881, "step": 173 }, { "epoch": 0.5125184094256259, "grad_norm": 4.08639669418335, "learning_rate": 7.815307212734888e-05, "loss": 1.4451, "step": 174 }, { "epoch": 0.5154639175257731, "grad_norm": 4.255699157714844, "learning_rate": 7.760750509757298e-05, "loss": 1.7914, "step": 175 }, { "epoch": 0.5184094256259205, "grad_norm": 4.119191646575928, "learning_rate": 7.705716799671019e-05, "loss": 1.8201, "step": 176 }, { "epoch": 0.5213549337260678, "grad_norm": 4.7409467697143555, "learning_rate": 7.650215591292888e-05, "loss": 1.8449, "step": 177 }, { "epoch": 0.524300441826215, "grad_norm": 5.348622798919678, "learning_rate": 7.594256474214882e-05, "loss": 1.6767, "step": 178 }, { "epoch": 0.5272459499263623, "grad_norm": 4.13510799407959, "learning_rate": 7.537849117147212e-05, "loss": 1.2513, "step": 179 }, { "epoch": 0.5301914580265096, "grad_norm": 4.210190296173096, "learning_rate": 7.481003266247744e-05, "loss": 1.5867, "step": 180 }, { "epoch": 0.5331369661266568, "grad_norm": 5.271762371063232, "learning_rate": 7.423728743438048e-05, "loss": 2.1488, "step": 181 }, { "epoch": 0.5360824742268041, "grad_norm": 5.800075054168701, "learning_rate": 7.366035444706347e-05, "loss": 1.8385, "step": 182 }, { "epoch": 0.5390279823269514, "grad_norm": 6.030768871307373, "learning_rate": 7.307933338397667e-05, "loss": 1.6634, "step": 183 }, { "epoch": 0.5419734904270986, "grad_norm": 5.144318580627441, "learning_rate": 7.249432463491498e-05, "loss": 1.4081, "step": 184 }, { "epoch": 0.5449189985272459, "grad_norm": 5.144754409790039, "learning_rate": 7.190542927867234e-05, "loss": 1.4496, "step": 185 }, { "epoch": 0.5478645066273933, "grad_norm": 7.374088764190674, "learning_rate": 7.131274906557725e-05, "loss": 1.94, "step": 186 }, { "epoch": 0.5508100147275405, "grad_norm": 6.2576003074646, "learning_rate": 7.071638639991207e-05, "loss": 1.701, "step": 187 }, { "epoch": 0.5537555228276878, "grad_norm": 5.392474174499512, "learning_rate": 7.011644432221958e-05, "loss": 1.474, "step": 188 }, { "epoch": 0.5567010309278351, "grad_norm": 7.743690013885498, "learning_rate": 6.95130264914993e-05, "loss": 2.1458, "step": 189 }, { "epoch": 0.5596465390279823, "grad_norm": 7.468958377838135, "learning_rate": 6.890623716729724e-05, "loss": 2.2593, "step": 190 }, { "epoch": 0.5625920471281296, "grad_norm": 6.144321918487549, "learning_rate": 6.82961811916917e-05, "loss": 1.3348, "step": 191 }, { "epoch": 0.5655375552282769, "grad_norm": 7.852849006652832, "learning_rate": 6.768296397117848e-05, "loss": 1.4751, "step": 192 }, { "epoch": 0.5684830633284241, "grad_norm": 7.6272172927856445, "learning_rate": 6.706669145845863e-05, "loss": 1.7606, "step": 193 }, { "epoch": 0.5714285714285714, "grad_norm": 13.624709129333496, "learning_rate": 6.644747013413168e-05, "loss": 1.9716, "step": 194 }, { "epoch": 0.5743740795287187, "grad_norm": 8.707653999328613, "learning_rate": 6.582540698829781e-05, "loss": 1.8245, "step": 195 }, { "epoch": 0.5773195876288659, "grad_norm": 6.5573039054870605, "learning_rate": 6.520060950207185e-05, "loss": 1.1733, "step": 196 }, { "epoch": 0.5802650957290133, "grad_norm": 8.433490753173828, "learning_rate": 6.457318562901256e-05, "loss": 1.2757, "step": 197 }, { "epoch": 0.5832106038291606, "grad_norm": 9.936591148376465, "learning_rate": 6.394324377647028e-05, "loss": 1.4713, "step": 198 }, { "epoch": 0.5861561119293078, "grad_norm": 12.52322769165039, "learning_rate": 6.331089278685599e-05, "loss": 1.6688, "step": 199 }, { "epoch": 0.5891016200294551, "grad_norm": 16.770870208740234, "learning_rate": 6.26762419188355e-05, "loss": 2.3924, "step": 200 }, { "epoch": 0.5920471281296024, "grad_norm": 3.7220280170440674, "learning_rate": 6.203940082845144e-05, "loss": 1.8733, "step": 201 }, { "epoch": 0.5949926362297496, "grad_norm": 3.919069290161133, "learning_rate": 6.140047955017671e-05, "loss": 0.7927, "step": 202 }, { "epoch": 0.5979381443298969, "grad_norm": 3.081045150756836, "learning_rate": 6.075958847790262e-05, "loss": 1.2878, "step": 203 }, { "epoch": 0.6008836524300442, "grad_norm": 2.82151198387146, "learning_rate": 6.011683834586473e-05, "loss": 1.2722, "step": 204 }, { "epoch": 0.6038291605301914, "grad_norm": 2.1925573348999023, "learning_rate": 5.947234020951015e-05, "loss": 0.7102, "step": 205 }, { "epoch": 0.6067746686303387, "grad_norm": 2.2746307849884033, "learning_rate": 5.882620542630901e-05, "loss": 0.8639, "step": 206 }, { "epoch": 0.6097201767304861, "grad_norm": 2.5907399654388428, "learning_rate": 5.8178545636514145e-05, "loss": 1.3898, "step": 207 }, { "epoch": 0.6126656848306333, "grad_norm": 3.3455467224121094, "learning_rate": 5.752947274387147e-05, "loss": 1.6453, "step": 208 }, { "epoch": 0.6156111929307806, "grad_norm": 3.1584908962249756, "learning_rate": 5.687909889628529e-05, "loss": 1.8621, "step": 209 }, { "epoch": 0.6185567010309279, "grad_norm": 3.1960365772247314, "learning_rate": 5.622753646644102e-05, "loss": 1.6358, "step": 210 }, { "epoch": 0.6215022091310751, "grad_norm": 3.073702573776245, "learning_rate": 5.557489803238933e-05, "loss": 1.5381, "step": 211 }, { "epoch": 0.6244477172312224, "grad_norm": 2.917039632797241, "learning_rate": 5.492129635809473e-05, "loss": 1.6452, "step": 212 }, { "epoch": 0.6273932253313697, "grad_norm": 3.823730707168579, "learning_rate": 5.426684437395196e-05, "loss": 1.7625, "step": 213 }, { "epoch": 0.6303387334315169, "grad_norm": 4.05581521987915, "learning_rate": 5.361165515727374e-05, "loss": 1.9468, "step": 214 }, { "epoch": 0.6332842415316642, "grad_norm": 3.9730498790740967, "learning_rate": 5.295584191275308e-05, "loss": 2.2487, "step": 215 }, { "epoch": 0.6362297496318114, "grad_norm": 3.2518630027770996, "learning_rate": 5.229951795290353e-05, "loss": 1.5587, "step": 216 }, { "epoch": 0.6391752577319587, "grad_norm": 4.549688816070557, "learning_rate": 5.164279667848094e-05, "loss": 1.9062, "step": 217 }, { "epoch": 0.6421207658321061, "grad_norm": 3.6768784523010254, "learning_rate": 5.0985791558889785e-05, "loss": 1.6801, "step": 218 }, { "epoch": 0.6450662739322534, "grad_norm": 3.8859946727752686, "learning_rate": 5.032861611257783e-05, "loss": 1.9869, "step": 219 }, { "epoch": 0.6480117820324006, "grad_norm": 4.19022798538208, "learning_rate": 4.967138388742218e-05, "loss": 2.0944, "step": 220 }, { "epoch": 0.6509572901325479, "grad_norm": 3.7361319065093994, "learning_rate": 4.901420844111021e-05, "loss": 1.3391, "step": 221 }, { "epoch": 0.6539027982326951, "grad_norm": 3.7620272636413574, "learning_rate": 4.835720332151907e-05, "loss": 1.2597, "step": 222 }, { "epoch": 0.6568483063328424, "grad_norm": 3.610649824142456, "learning_rate": 4.770048204709648e-05, "loss": 1.2885, "step": 223 }, { "epoch": 0.6597938144329897, "grad_norm": 4.286924839019775, "learning_rate": 4.7044158087246926e-05, "loss": 1.7704, "step": 224 }, { "epoch": 0.6627393225331369, "grad_norm": 4.295466423034668, "learning_rate": 4.6388344842726264e-05, "loss": 1.3672, "step": 225 }, { "epoch": 0.6656848306332842, "grad_norm": 5.368373870849609, "learning_rate": 4.5733155626048036e-05, "loss": 1.5249, "step": 226 }, { "epoch": 0.6686303387334315, "grad_norm": 5.0956950187683105, "learning_rate": 4.507870364190527e-05, "loss": 1.3973, "step": 227 }, { "epoch": 0.6715758468335787, "grad_norm": 4.604374408721924, "learning_rate": 4.4425101967610674e-05, "loss": 1.3965, "step": 228 }, { "epoch": 0.6745213549337261, "grad_norm": 6.400148391723633, "learning_rate": 4.377246353355899e-05, "loss": 2.1319, "step": 229 }, { "epoch": 0.6774668630338734, "grad_norm": 6.704805850982666, "learning_rate": 4.312090110371473e-05, "loss": 1.9594, "step": 230 }, { "epoch": 0.6804123711340206, "grad_norm": 4.956854343414307, "learning_rate": 4.247052725612852e-05, "loss": 1.3392, "step": 231 }, { "epoch": 0.6833578792341679, "grad_norm": 8.188490867614746, "learning_rate": 4.1821454363485866e-05, "loss": 1.7636, "step": 232 }, { "epoch": 0.6863033873343152, "grad_norm": 5.5076751708984375, "learning_rate": 4.1173794573690996e-05, "loss": 1.3066, "step": 233 }, { "epoch": 0.6892488954344624, "grad_norm": 6.480940341949463, "learning_rate": 4.052765979048986e-05, "loss": 1.8326, "step": 234 }, { "epoch": 0.6921944035346097, "grad_norm": 7.0487284660339355, "learning_rate": 3.988316165413528e-05, "loss": 1.7487, "step": 235 }, { "epoch": 0.695139911634757, "grad_norm": 6.207117557525635, "learning_rate": 3.924041152209739e-05, "loss": 1.4652, "step": 236 }, { "epoch": 0.6980854197349042, "grad_norm": 5.3427205085754395, "learning_rate": 3.859952044982329e-05, "loss": 1.3738, "step": 237 }, { "epoch": 0.7010309278350515, "grad_norm": 7.366813659667969, "learning_rate": 3.7960599171548574e-05, "loss": 1.4897, "step": 238 }, { "epoch": 0.7039764359351989, "grad_norm": 6.943472862243652, "learning_rate": 3.732375808116451e-05, "loss": 1.2526, "step": 239 } ], "logging_steps": 1, "max_steps": 339, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 239, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.287612097809613e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }