|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9985272459499264,
  "eval_steps": 500,
  "global_step": 339,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0029455081001472753,
      "grad_norm": 148176371712.0,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 3.5829,
      "step": 1
    },
    {
      "epoch": 0.005891016200294551,
      "grad_norm": 521838526464.0,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 3.7866,
      "step": 2
    },
    {
      "epoch": 0.008836524300441826,
      "grad_norm": 29312.015625,
      "learning_rate": 3e-06,
      "loss": 5.4788,
      "step": 3
    },
    {
      "epoch": 0.011782032400589101,
      "grad_norm": 4224.009765625,
      "learning_rate": 4.000000000000001e-06,
      "loss": 4.9963,
      "step": 4
    },
    {
      "epoch": 0.014727540500736377,
      "grad_norm": 6816.009765625,
      "learning_rate": 5e-06,
      "loss": 6.1696,
      "step": 5
    },
    {
      "epoch": 0.017673048600883652,
      "grad_norm": 37120.00390625,
      "learning_rate": 6e-06,
      "loss": 6.0103,
      "step": 6
    },
    {
      "epoch": 0.020618556701030927,
      "grad_norm": 16512.0078125,
      "learning_rate": 7.000000000000001e-06,
      "loss": 5.2747,
      "step": 7
    },
    {
      "epoch": 0.023564064801178203,
      "grad_norm": 4544.0126953125,
      "learning_rate": 8.000000000000001e-06,
      "loss": 5.0828,
      "step": 8
    },
    {
      "epoch": 0.026509572901325478,
      "grad_norm": 1360.0126953125,
      "learning_rate": 9e-06,
      "loss": 4.4745,
      "step": 9
    },
    {
      "epoch": 0.029455081001472753,
      "grad_norm": 2528.019287109375,
      "learning_rate": 1e-05,
      "loss": 6.0115,
      "step": 10
    },
    {
      "epoch": 0.03240058910162003,
      "grad_norm": 608.0706787109375,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 5.1475,
      "step": 11
    },
    {
      "epoch": 0.035346097201767304,
      "grad_norm": 1560.0152587890625,
      "learning_rate": 1.2e-05,
      "loss": 5.2077,
      "step": 12
    },
    {
      "epoch": 0.03829160530191458,
      "grad_norm": 1152.024169921875,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 4.9248,
      "step": 13
    },
    {
      "epoch": 0.041237113402061855,
      "grad_norm": 1096.0169677734375,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 4.3356,
      "step": 14
    },
    {
      "epoch": 0.044182621502209134,
      "grad_norm": 2560.037841796875,
      "learning_rate": 1.5e-05,
      "loss": 5.9271,
      "step": 15
    },
    {
      "epoch": 0.047128129602356406,
      "grad_norm": 1312.0172119140625,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 4.9467,
      "step": 16
    },
    {
      "epoch": 0.050073637702503684,
      "grad_norm": 748.0582275390625,
      "learning_rate": 1.7000000000000003e-05,
      "loss": 5.5781,
      "step": 17
    },
    {
      "epoch": 0.053019145802650956,
      "grad_norm": 608.0347290039062,
      "learning_rate": 1.8e-05,
      "loss": 4.9313,
      "step": 18
    },
    {
      "epoch": 0.055964653902798235,
      "grad_norm": 444.0478820800781,
      "learning_rate": 1.9e-05,
      "loss": 4.7802,
      "step": 19
    },
    {
      "epoch": 0.05891016200294551,
      "grad_norm": 400.05474853515625,
      "learning_rate": 2e-05,
      "loss": 4.9916,
      "step": 20
    },
    {
      "epoch": 0.061855670103092786,
      "grad_norm": 324.0529479980469,
      "learning_rate": 2.1e-05,
      "loss": 4.5852,
      "step": 21
    },
    {
      "epoch": 0.06480117820324006,
      "grad_norm": 552.0383911132812,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 4.5658,
      "step": 22
    },
    {
      "epoch": 0.06774668630338733,
      "grad_norm": 780.0543823242188,
      "learning_rate": 2.3000000000000003e-05,
      "loss": 4.9509,
      "step": 23
    },
    {
      "epoch": 0.07069219440353461,
      "grad_norm": 1352.05712890625,
      "learning_rate": 2.4e-05,
      "loss": 4.9526,
      "step": 24
    },
    {
      "epoch": 0.07363770250368189,
      "grad_norm": 692.063720703125,
      "learning_rate": 2.5e-05,
      "loss": 5.1079,
      "step": 25
    },
    {
      "epoch": 0.07658321060382917,
      "grad_norm": 940.0375366210938,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 4.3864,
      "step": 26
    },
    {
      "epoch": 0.07952871870397643,
      "grad_norm": 406.0652770996094,
      "learning_rate": 2.7000000000000002e-05,
      "loss": 4.7486,
      "step": 27
    },
    {
      "epoch": 0.08247422680412371,
      "grad_norm": 736.0764770507812,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 5.4081,
      "step": 28
    },
    {
      "epoch": 0.08541973490427099,
      "grad_norm": 1240.040283203125,
      "learning_rate": 2.9e-05,
      "loss": 4.6019,
      "step": 29
    },
    {
      "epoch": 0.08836524300441827,
      "grad_norm": 1312.10302734375,
      "learning_rate": 3e-05,
      "loss": 5.481,
      "step": 30
    },
    {
      "epoch": 0.09131075110456553,
      "grad_norm": 466.0743103027344,
      "learning_rate": 3.1e-05,
      "loss": 4.4326,
      "step": 31
    },
    {
      "epoch": 0.09425625920471281,
      "grad_norm": 828.0822143554688,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 5.0284,
      "step": 32
    },
    {
      "epoch": 0.09720176730486009,
      "grad_norm": 764.077880859375,
      "learning_rate": 3.3e-05,
      "loss": 4.9104,
      "step": 33
    },
    {
      "epoch": 0.10014727540500737,
      "grad_norm": 231.1255340576172,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 3.9872,
      "step": 34
    },
    {
      "epoch": 0.10309278350515463,
      "grad_norm": 434.1127624511719,
      "learning_rate": 3.5e-05,
      "loss": 5.2373,
      "step": 35
    },
    {
      "epoch": 0.10603829160530191,
      "grad_norm": 253.16815185546875,
      "learning_rate": 3.6e-05,
      "loss": 4.8148,
      "step": 36
    },
    {
      "epoch": 0.10898379970544919,
      "grad_norm": 290.1834716796875,
      "learning_rate": 3.7e-05,
      "loss": 3.6852,
      "step": 37
    },
    {
      "epoch": 0.11192930780559647,
      "grad_norm": 584.0982666015625,
      "learning_rate": 3.8e-05,
      "loss": 3.8818,
      "step": 38
    },
    {
      "epoch": 0.11487481590574374,
      "grad_norm": 904.0762329101562,
      "learning_rate": 3.9000000000000006e-05,
      "loss": 4.2878,
      "step": 39
    },
    {
      "epoch": 0.11782032400589101,
      "grad_norm": 336.28033447265625,
      "learning_rate": 4e-05,
      "loss": 4.2046,
      "step": 40
    },
    {
      "epoch": 0.12076583210603829,
      "grad_norm": 704.1439208984375,
      "learning_rate": 4.1e-05,
      "loss": 4.4065,
      "step": 41
    },
    {
      "epoch": 0.12371134020618557,
      "grad_norm": 804.0842895507812,
      "learning_rate": 4.2e-05,
      "loss": 3.9044,
      "step": 42
    },
    {
      "epoch": 0.12665684830633284,
      "grad_norm": 752.0977172851562,
      "learning_rate": 4.3e-05,
      "loss": 3.5964,
      "step": 43
    },
    {
      "epoch": 0.12960235640648013,
      "grad_norm": 752.181884765625,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 4.3488,
      "step": 44
    },
    {
      "epoch": 0.1325478645066274,
      "grad_norm": 664.2083129882812,
      "learning_rate": 4.5e-05,
      "loss": 4.6978,
      "step": 45
    },
    {
      "epoch": 0.13549337260677466,
      "grad_norm": 1984.197998046875,
      "learning_rate": 4.600000000000001e-05,
      "loss": 3.8173,
      "step": 46
    },
    {
      "epoch": 0.13843888070692195,
      "grad_norm": 1536.0904541015625,
      "learning_rate": 4.7e-05,
      "loss": 3.3992,
      "step": 47
    },
    {
      "epoch": 0.14138438880706922,
      "grad_norm": 616.1571044921875,
      "learning_rate": 4.8e-05,
      "loss": 3.5619,
      "step": 48
    },
    {
      "epoch": 0.14432989690721648,
      "grad_norm": 5408.072265625,
      "learning_rate": 4.9e-05,
      "loss": 4.3322,
      "step": 49
    },
    {
      "epoch": 0.14727540500736377,
      "grad_norm": 3408.068603515625,
      "learning_rate": 5e-05,
      "loss": 4.0344,
      "step": 50
    },
    {
      "epoch": 0.15022091310751104,
      "grad_norm": 724.0775146484375,
      "learning_rate": 5.1000000000000006e-05,
      "loss": 3.8171,
      "step": 51
    },
    {
      "epoch": 0.15316642120765833,
      "grad_norm": 1448.1661376953125,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 2.6826,
      "step": 52
    },
    {
      "epoch": 0.1561119293078056,
      "grad_norm": 304.0506591796875,
      "learning_rate": 5.300000000000001e-05,
      "loss": 2.4422,
      "step": 53
    },
    {
      "epoch": 0.15905743740795286,
      "grad_norm": 396.0224914550781,
      "learning_rate": 5.4000000000000005e-05,
      "loss": 2.4475,
      "step": 54
    },
    {
      "epoch": 0.16200294550810015,
      "grad_norm": 684.0194702148438,
      "learning_rate": 5.500000000000001e-05,
      "loss": 2.2932,
      "step": 55
    },
    {
      "epoch": 0.16494845360824742,
      "grad_norm": 231.03273010253906,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 2.5193,
      "step": 56
    },
    {
      "epoch": 0.16789396170839468,
      "grad_norm": 164.0357208251953,
      "learning_rate": 5.6999999999999996e-05,
      "loss": 1.9575,
      "step": 57
    },
    {
      "epoch": 0.17083946980854198,
      "grad_norm": 1600.0091552734375,
      "learning_rate": 5.8e-05,
      "loss": 2.5806,
      "step": 58
    },
    {
      "epoch": 0.17378497790868924,
      "grad_norm": 50.36320114135742,
      "learning_rate": 5.9e-05,
      "loss": 3.651,
      "step": 59
    },
    {
      "epoch": 0.17673048600883653,
      "grad_norm": 66.08223724365234,
      "learning_rate": 6e-05,
      "loss": 3.5998,
      "step": 60
    },
    {
      "epoch": 0.1796759941089838,
      "grad_norm": 20.88555335998535,
      "learning_rate": 6.1e-05,
      "loss": 3.2898,
      "step": 61
    },
    {
      "epoch": 0.18262150220913106,
      "grad_norm": 33.95121765136719,
      "learning_rate": 6.2e-05,
      "loss": 3.5322,
      "step": 62
    },
    {
      "epoch": 0.18556701030927836,
      "grad_norm": 18.569725036621094,
      "learning_rate": 6.3e-05,
      "loss": 2.7009,
      "step": 63
    },
    {
      "epoch": 0.18851251840942562,
      "grad_norm": 11.800430297851562,
      "learning_rate": 6.400000000000001e-05,
      "loss": 3.0148,
      "step": 64
    },
    {
      "epoch": 0.19145802650957292,
      "grad_norm": 11.598824501037598,
      "learning_rate": 6.500000000000001e-05,
      "loss": 2.5356,
      "step": 65
    },
    {
      "epoch": 0.19440353460972018,
      "grad_norm": 29.798603057861328,
      "learning_rate": 6.6e-05,
      "loss": 3.419,
      "step": 66
    },
    {
      "epoch": 0.19734904270986744,
      "grad_norm": 23.390228271484375,
      "learning_rate": 6.7e-05,
      "loss": 3.0823,
      "step": 67
    },
    {
      "epoch": 0.20029455081001474,
      "grad_norm": 30.04606056213379,
      "learning_rate": 6.800000000000001e-05,
      "loss": 2.6675,
      "step": 68
    },
    {
      "epoch": 0.203240058910162,
      "grad_norm": 59.424739837646484,
      "learning_rate": 6.9e-05,
      "loss": 2.6406,
      "step": 69
    },
    {
      "epoch": 0.20618556701030927,
      "grad_norm": 18.69099235534668,
      "learning_rate": 7e-05,
      "loss": 2.8912,
      "step": 70
    },
    {
      "epoch": 0.20913107511045656,
      "grad_norm": 11.040902137756348,
      "learning_rate": 7.1e-05,
      "loss": 2.2423,
      "step": 71
    },
    {
      "epoch": 0.21207658321060383,
      "grad_norm": 15.589529991149902,
      "learning_rate": 7.2e-05,
      "loss": 2.6387,
      "step": 72
    },
    {
      "epoch": 0.21502209131075112,
      "grad_norm": 8.346650123596191,
      "learning_rate": 7.3e-05,
      "loss": 2.7416,
      "step": 73
    },
    {
      "epoch": 0.21796759941089838,
      "grad_norm": 16.002195358276367,
      "learning_rate": 7.4e-05,
      "loss": 2.5981,
      "step": 74
    },
    {
      "epoch": 0.22091310751104565,
      "grad_norm": 12.781440734863281,
      "learning_rate": 7.500000000000001e-05,
      "loss": 2.8455,
      "step": 75
    },
    {
      "epoch": 0.22385861561119294,
      "grad_norm": 14.14976978302002,
      "learning_rate": 7.6e-05,
      "loss": 2.3376,
      "step": 76
    },
    {
      "epoch": 0.2268041237113402,
      "grad_norm": 14.174376487731934,
      "learning_rate": 7.7e-05,
      "loss": 2.2975,
      "step": 77
    },
    {
      "epoch": 0.22974963181148747,
      "grad_norm": 12.794411659240723,
      "learning_rate": 7.800000000000001e-05,
      "loss": 2.2424,
      "step": 78
    },
    {
      "epoch": 0.23269513991163476,
      "grad_norm": 8.186522483825684,
      "learning_rate": 7.900000000000001e-05,
      "loss": 1.9646,
      "step": 79
    },
    {
      "epoch": 0.23564064801178203,
      "grad_norm": 7.599482536315918,
      "learning_rate": 8e-05,
      "loss": 2.2699,
      "step": 80
    },
    {
      "epoch": 0.23858615611192932,
      "grad_norm": 13.385209083557129,
      "learning_rate": 8.1e-05,
      "loss": 2.9279,
      "step": 81
    },
    {
      "epoch": 0.24153166421207659,
      "grad_norm": 10.520708084106445,
      "learning_rate": 8.2e-05,
      "loss": 2.6627,
      "step": 82
    },
    {
      "epoch": 0.24447717231222385,
      "grad_norm": 10.881275177001953,
      "learning_rate": 8.3e-05,
      "loss": 2.6776,
      "step": 83
    },
    {
      "epoch": 0.24742268041237114,
      "grad_norm": 11.189691543579102,
      "learning_rate": 8.4e-05,
      "loss": 2.5101,
      "step": 84
    },
    {
      "epoch": 0.2503681885125184,
      "grad_norm": 14.185538291931152,
      "learning_rate": 8.5e-05,
      "loss": 2.4517,
      "step": 85
    },
    {
      "epoch": 0.2533136966126657,
      "grad_norm": 11.319371223449707,
      "learning_rate": 8.6e-05,
      "loss": 2.3935,
      "step": 86
    },
    {
      "epoch": 0.25625920471281294,
      "grad_norm": 11.8043212890625,
      "learning_rate": 8.7e-05,
      "loss": 2.3776,
      "step": 87
    },
    {
      "epoch": 0.25920471281296026,
      "grad_norm": 17.876741409301758,
      "learning_rate": 8.800000000000001e-05,
      "loss": 2.1173,
      "step": 88
    },
    {
      "epoch": 0.2621502209131075,
      "grad_norm": 9.818507194519043,
      "learning_rate": 8.900000000000001e-05,
      "loss": 2.4566,
      "step": 89
    },
    {
      "epoch": 0.2650957290132548,
      "grad_norm": 17.97085189819336,
      "learning_rate": 9e-05,
      "loss": 2.4889,
      "step": 90
    },
    {
      "epoch": 0.26804123711340205,
      "grad_norm": 13.525333404541016,
      "learning_rate": 9.1e-05,
      "loss": 2.6413,
      "step": 91
    },
    {
      "epoch": 0.2709867452135493,
      "grad_norm": 11.17566204071045,
      "learning_rate": 9.200000000000001e-05,
      "loss": 2.5934,
      "step": 92
    },
    {
      "epoch": 0.27393225331369664,
      "grad_norm": 10.861861228942871,
      "learning_rate": 9.300000000000001e-05,
      "loss": 2.0431,
      "step": 93
    },
    {
      "epoch": 0.2768777614138439,
      "grad_norm": 10.782200813293457,
      "learning_rate": 9.4e-05,
      "loss": 2.2952,
      "step": 94
    },
    {
      "epoch": 0.27982326951399117,
      "grad_norm": 10.202653884887695,
      "learning_rate": 9.5e-05,
      "loss": 1.9,
      "step": 95
    },
    {
      "epoch": 0.28276877761413843,
      "grad_norm": 11.118406295776367,
      "learning_rate": 9.6e-05,
      "loss": 1.8267,
      "step": 96
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 11.998210906982422,
      "learning_rate": 9.7e-05,
      "loss": 2.7094,
      "step": 97
    },
    {
      "epoch": 0.28865979381443296,
      "grad_norm": 11.203243255615234,
      "learning_rate": 9.8e-05,
      "loss": 2.6765,
      "step": 98
    },
    {
      "epoch": 0.2916053019145803,
      "grad_norm": 11.754383087158203,
      "learning_rate": 9.900000000000001e-05,
      "loss": 2.323,
      "step": 99
    },
    {
      "epoch": 0.29455081001472755,
      "grad_norm": 20.766103744506836,
      "learning_rate": 0.0001,
      "loss": 3.2395,
      "step": 100
    },
    {
      "epoch": 0.2974963181148748,
      "grad_norm": 19.38508415222168,
      "learning_rate": 9.999568045802217e-05,
      "loss": 3.1611,
      "step": 101
    },
    {
      "epoch": 0.3004418262150221,
      "grad_norm": 12.322184562683105,
      "learning_rate": 9.998272257842641e-05,
      "loss": 1.8749,
      "step": 102
    },
    {
      "epoch": 0.30338733431516934,
      "grad_norm": 8.819778442382812,
      "learning_rate": 9.996112860009688e-05,
      "loss": 2.1352,
      "step": 103
    },
    {
      "epoch": 0.30633284241531666,
      "grad_norm": 4.589802265167236,
      "learning_rate": 9.993090225407743e-05,
      "loss": 1.9195,
      "step": 104
    },
    {
      "epoch": 0.30927835051546393,
      "grad_norm": 3.1305477619171143,
      "learning_rate": 9.989204876292688e-05,
      "loss": 1.6522,
      "step": 105
    },
    {
      "epoch": 0.3122238586156112,
      "grad_norm": 2.7143571376800537,
      "learning_rate": 9.984457483981669e-05,
      "loss": 1.1763,
      "step": 106
    },
    {
      "epoch": 0.31516936671575846,
      "grad_norm": 2.8804931640625,
      "learning_rate": 9.978848868737098e-05,
      "loss": 1.7974,
      "step": 107
    },
    {
      "epoch": 0.3181148748159057,
      "grad_norm": 3.790278673171997,
      "learning_rate": 9.972379999624936e-05,
      "loss": 2.1482,
      "step": 108
    },
    {
      "epoch": 0.32106038291605304,
      "grad_norm": 4.674582004547119,
      "learning_rate": 9.96505199434725e-05,
      "loss": 2.5313,
      "step": 109
    },
    {
      "epoch": 0.3240058910162003,
      "grad_norm": 3.20375657081604,
      "learning_rate": 9.956866119049095e-05,
      "loss": 2.2333,
      "step": 110
    },
    {
      "epoch": 0.3269513991163476,
      "grad_norm": 3.65432071685791,
      "learning_rate": 9.947823788099753e-05,
      "loss": 2.3037,
      "step": 111
    },
    {
      "epoch": 0.32989690721649484,
      "grad_norm": 4.25070858001709,
      "learning_rate": 9.937926563848346e-05,
      "loss": 2.1421,
      "step": 112
    },
    {
      "epoch": 0.3328424153166421,
      "grad_norm": 3.4754691123962402,
      "learning_rate": 9.927176156353899e-05,
      "loss": 1.6731,
      "step": 113
    },
    {
      "epoch": 0.33578792341678937,
      "grad_norm": 4.312386989593506,
      "learning_rate": 9.91557442308987e-05,
      "loss": 2.2766,
      "step": 114
    },
    {
      "epoch": 0.3387334315169367,
      "grad_norm": 8.520076751708984,
      "learning_rate": 9.903123368623216e-05,
      "loss": 2.3844,
      "step": 115
    },
    {
      "epoch": 0.34167893961708395,
      "grad_norm": 9.280426979064941,
      "learning_rate": 9.889825144268029e-05,
      "loss": 2.5895,
      "step": 116
    },
    {
      "epoch": 0.3446244477172312,
      "grad_norm": 5.867470741271973,
      "learning_rate": 9.875682047713846e-05,
      "loss": 2.1818,
      "step": 117
    },
    {
      "epoch": 0.3475699558173785,
      "grad_norm": 4.29853630065918,
      "learning_rate": 9.860696522628639e-05,
      "loss": 2.1243,
      "step": 118
    },
    {
      "epoch": 0.35051546391752575,
      "grad_norm": 4.369655132293701,
      "learning_rate": 9.844871158236591e-05,
      "loss": 1.9835,
      "step": 119
    },
    {
      "epoch": 0.35346097201767307,
      "grad_norm": 15.677599906921387,
      "learning_rate": 9.828208688870735e-05,
      "loss": 2.3591,
      "step": 120
    },
    {
      "epoch": 0.35640648011782033,
      "grad_norm": 7.070474624633789,
      "learning_rate": 9.810711993500507e-05,
      "loss": 2.1378,
      "step": 121
    },
    {
      "epoch": 0.3593519882179676,
      "grad_norm": 4.5425004959106445,
      "learning_rate": 9.792384095234313e-05,
      "loss": 1.838,
      "step": 122
    },
    {
      "epoch": 0.36229749631811486,
      "grad_norm": 5.75803804397583,
      "learning_rate": 9.773228160797188e-05,
      "loss": 2.045,
      "step": 123
    },
    {
      "epoch": 0.36524300441826213,
      "grad_norm": 4.905186653137207,
      "learning_rate": 9.753247499983649e-05,
      "loss": 2.263,
      "step": 124
    },
    {
      "epoch": 0.36818851251840945,
      "grad_norm": 5.079438209533691,
      "learning_rate": 9.732445565085824e-05,
      "loss": 1.9375,
      "step": 125
    },
    {
      "epoch": 0.3711340206185567,
      "grad_norm": 5.083024978637695,
      "learning_rate": 9.71082595029695e-05,
      "loss": 1.9512,
      "step": 126
    },
    {
      "epoch": 0.374079528718704,
      "grad_norm": 5.701948642730713,
      "learning_rate": 9.688392391090373e-05,
      "loss": 2.0932,
      "step": 127
    },
    {
      "epoch": 0.37702503681885124,
      "grad_norm": 5.612240314483643,
      "learning_rate": 9.665148763574123e-05,
      "loss": 2.1218,
      "step": 128
    },
    {
      "epoch": 0.3799705449189985,
      "grad_norm": 5.693153381347656,
      "learning_rate": 9.64109908382119e-05,
      "loss": 2.4763,
      "step": 129
    },
    {
      "epoch": 0.38291605301914583,
      "grad_norm": 5.299254417419434,
      "learning_rate": 9.616247507175623e-05,
      "loss": 2.3062,
      "step": 130
    },
    {
      "epoch": 0.3858615611192931,
      "grad_norm": 5.711902618408203,
      "learning_rate": 9.590598327534564e-05,
      "loss": 2.4087,
      "step": 131
    },
    {
      "epoch": 0.38880706921944036,
      "grad_norm": 4.181498050689697,
      "learning_rate": 9.564155976606339e-05,
      "loss": 1.7175,
      "step": 132
    },
    {
      "epoch": 0.3917525773195876,
      "grad_norm": 5.31155252456665,
      "learning_rate": 9.536925023144742e-05,
      "loss": 1.6347,
      "step": 133
    },
    {
      "epoch": 0.3946980854197349,
      "grad_norm": 6.574872970581055,
      "learning_rate": 9.508910172159635e-05,
      "loss": 1.8686,
      "step": 134
    },
    {
      "epoch": 0.39764359351988215,
      "grad_norm": 6.77908992767334,
      "learning_rate": 9.480116264104011e-05,
      "loss": 2.1223,
      "step": 135
    },
    {
      "epoch": 0.4005891016200295,
      "grad_norm": 7.450313091278076,
      "learning_rate": 9.450548274037653e-05,
      "loss": 2.3148,
      "step": 136
    },
    {
      "epoch": 0.40353460972017674,
      "grad_norm": 5.135898113250732,
      "learning_rate": 9.420211310767533e-05,
      "loss": 1.386,
      "step": 137
    },
    {
      "epoch": 0.406480117820324,
      "grad_norm": 8.198874473571777,
      "learning_rate": 9.389110615965102e-05,
      "loss": 1.9208,
      "step": 138
    },
    {
      "epoch": 0.40942562592047127,
      "grad_norm": 10.153027534484863,
      "learning_rate": 9.35725156326063e-05,
      "loss": 2.2577,
      "step": 139
    },
    {
      "epoch": 0.41237113402061853,
      "grad_norm": 8.334872245788574,
      "learning_rate": 9.324639657314742e-05,
      "loss": 2.0401,
      "step": 140
    },
    {
      "epoch": 0.41531664212076586,
      "grad_norm": 6.7754950523376465,
      "learning_rate": 9.291280532867302e-05,
      "loss": 1.5202,
      "step": 141
    },
    {
      "epoch": 0.4182621502209131,
      "grad_norm": 9.433147430419922,
      "learning_rate": 9.257179953763845e-05,
      "loss": 2.1687,
      "step": 142
    },
    {
      "epoch": 0.4212076583210604,
      "grad_norm": 7.446643352508545,
      "learning_rate": 9.222343811959693e-05,
      "loss": 1.6996,
      "step": 143
    },
    {
      "epoch": 0.42415316642120765,
      "grad_norm": 7.048992156982422,
      "learning_rate": 9.186778126501916e-05,
      "loss": 1.5279,
      "step": 144
    },
    {
      "epoch": 0.4270986745213549,
      "grad_norm": 7.340887069702148,
      "learning_rate": 9.150489042489367e-05,
      "loss": 1.4406,
      "step": 145
    },
    {
      "epoch": 0.43004418262150224,
      "grad_norm": 11.633044242858887,
      "learning_rate": 9.113482830010918e-05,
      "loss": 1.5187,
      "step": 146
    },
    {
      "epoch": 0.4329896907216495,
      "grad_norm": 9.241588592529297,
      "learning_rate": 9.075765883062093e-05,
      "loss": 1.8564,
      "step": 147
    },
    {
      "epoch": 0.43593519882179677,
      "grad_norm": 8.745128631591797,
      "learning_rate": 9.037344718440322e-05,
      "loss": 1.5419,
      "step": 148
    },
    {
      "epoch": 0.43888070692194403,
      "grad_norm": 16.220924377441406,
      "learning_rate": 8.99822597461894e-05,
      "loss": 2.5006,
      "step": 149
    },
    {
      "epoch": 0.4418262150220913,
      "grad_norm": 15.44288444519043,
      "learning_rate": 8.958416410600187e-05,
      "loss": 2.0001,
      "step": 150
    },
    {
      "epoch": 0.44477172312223856,
      "grad_norm": 10.727198600769043,
      "learning_rate": 8.917922904747384e-05,
      "loss": 2.8376,
      "step": 151
    },
    {
      "epoch": 0.4477172312223859,
      "grad_norm": 5.658038139343262,
      "learning_rate": 8.876752453596462e-05,
      "loss": 1.7435,
      "step": 152
    },
    {
      "epoch": 0.45066273932253315,
      "grad_norm": 4.7606892585754395,
      "learning_rate": 8.834912170647101e-05,
      "loss": 1.3439,
      "step": 153
    },
    {
      "epoch": 0.4536082474226804,
      "grad_norm": 4.7208757400512695,
      "learning_rate": 8.792409285133642e-05,
      "loss": 1.8022,
      "step": 154
    },
    {
      "epoch": 0.4565537555228277,
      "grad_norm": 3.1722121238708496,
      "learning_rate": 8.749251140776016e-05,
      "loss": 0.7509,
      "step": 155
    },
    {
      "epoch": 0.45949926362297494,
      "grad_norm": 2.528482675552368,
      "learning_rate": 8.705445194510868e-05,
      "loss": 1.6068,
      "step": 156
    },
    {
      "epoch": 0.46244477172312226,
      "grad_norm": 2.8189425468444824,
      "learning_rate": 8.66099901520315e-05,
      "loss": 0.4364,
      "step": 157
    },
    {
      "epoch": 0.4653902798232695,
      "grad_norm": 2.734297752380371,
      "learning_rate": 8.615920282338355e-05,
      "loss": 1.6278,
      "step": 158
    },
    {
      "epoch": 0.4683357879234168,
      "grad_norm": 2.981930732727051,
      "learning_rate": 8.570216784695637e-05,
      "loss": 1.9541,
      "step": 159
    },
    {
      "epoch": 0.47128129602356406,
      "grad_norm": 3.195127248764038,
      "learning_rate": 8.52389641900206e-05,
      "loss": 2.0949,
      "step": 160
    },
    {
      "epoch": 0.4742268041237113,
      "grad_norm": 3.8657867908477783,
      "learning_rate": 8.476967188568188e-05,
      "loss": 1.8021,
      "step": 161
    },
    {
      "epoch": 0.47717231222385864,
      "grad_norm": 3.0095670223236084,
      "learning_rate": 8.429437201905254e-05,
      "loss": 1.8119,
      "step": 162
    },
    {
      "epoch": 0.4801178203240059,
      "grad_norm": 3.025339365005493,
      "learning_rate": 8.381314671324159e-05,
      "loss": 1.8117,
      "step": 163
    },
    {
      "epoch": 0.48306332842415317,
      "grad_norm": 4.113118648529053,
      "learning_rate": 8.332607911516545e-05,
      "loss": 1.7912,
      "step": 164
    },
    {
      "epoch": 0.48600883652430044,
      "grad_norm": 3.183894395828247,
      "learning_rate": 8.283325338118153e-05,
      "loss": 1.7364,
      "step": 165
    },
    {
      "epoch": 0.4889543446244477,
      "grad_norm": 5.081910610198975,
      "learning_rate": 8.233475466254765e-05,
      "loss": 2.0706,
      "step": 166
    },
    {
      "epoch": 0.49189985272459497,
      "grad_norm": 4.033633708953857,
      "learning_rate": 8.183066909070947e-05,
      "loss": 2.0481,
      "step": 167
    },
    {
      "epoch": 0.4948453608247423,
      "grad_norm": 4.505871772766113,
      "learning_rate": 8.132108376241849e-05,
      "loss": 2.044,
      "step": 168
    },
    {
      "epoch": 0.49779086892488955,
      "grad_norm": 6.440969944000244,
      "learning_rate": 8.08060867246834e-05,
      "loss": 2.209,
      "step": 169
    },
    {
      "epoch": 0.5007363770250368,
      "grad_norm": 6.235000133514404,
      "learning_rate": 8.028576695955711e-05,
      "loss": 2.0359,
      "step": 170
    },
    {
      "epoch": 0.5036818851251841,
      "grad_norm": 4.07951021194458,
      "learning_rate": 7.97602143687623e-05,
      "loss": 1.6721,
      "step": 171
    },
    {
      "epoch": 0.5066273932253313,
      "grad_norm": 4.1656413078308105,
      "learning_rate": 7.922951975815811e-05,
      "loss": 1.7503,
      "step": 172
    },
    {
      "epoch": 0.5095729013254786,
      "grad_norm": 3.7077293395996094,
      "learning_rate": 7.869377482205042e-05,
      "loss": 1.6881,
      "step": 173
    },
    {
      "epoch": 0.5125184094256259,
      "grad_norm": 4.08639669418335,
      "learning_rate": 7.815307212734888e-05,
      "loss": 1.4451,
      "step": 174
    },
    {
      "epoch": 0.5154639175257731,
      "grad_norm": 4.255699157714844,
      "learning_rate": 7.760750509757298e-05,
      "loss": 1.7914,
      "step": 175
    },
    {
      "epoch": 0.5184094256259205,
      "grad_norm": 4.119191646575928,
      "learning_rate": 7.705716799671019e-05,
      "loss": 1.8201,
      "step": 176
    },
    {
      "epoch": 0.5213549337260678,
      "grad_norm": 4.7409467697143555,
      "learning_rate": 7.650215591292888e-05,
      "loss": 1.8449,
      "step": 177
    },
    {
      "epoch": 0.524300441826215,
      "grad_norm": 5.348622798919678,
      "learning_rate": 7.594256474214882e-05,
      "loss": 1.6767,
      "step": 178
    },
    {
      "epoch": 0.5272459499263623,
      "grad_norm": 4.13510799407959,
      "learning_rate": 7.537849117147212e-05,
      "loss": 1.2513,
      "step": 179
    },
    {
      "epoch": 0.5301914580265096,
      "grad_norm": 4.210190296173096,
      "learning_rate": 7.481003266247744e-05,
      "loss": 1.5867,
      "step": 180
    },
    {
      "epoch": 0.5331369661266568,
      "grad_norm": 5.271762371063232,
      "learning_rate": 7.423728743438048e-05,
      "loss": 2.1488,
      "step": 181
    },
    {
      "epoch": 0.5360824742268041,
      "grad_norm": 5.800075054168701,
      "learning_rate": 7.366035444706347e-05,
      "loss": 1.8385,
      "step": 182
    },
    {
      "epoch": 0.5390279823269514,
      "grad_norm": 6.030768871307373,
      "learning_rate": 7.307933338397667e-05,
      "loss": 1.6634,
      "step": 183
    },
    {
      "epoch": 0.5419734904270986,
      "grad_norm": 5.144318580627441,
      "learning_rate": 7.249432463491498e-05,
      "loss": 1.4081,
      "step": 184
    },
    {
      "epoch": 0.5449189985272459,
      "grad_norm": 5.144754409790039,
      "learning_rate": 7.190542927867234e-05,
      "loss": 1.4496,
      "step": 185
    },
    {
      "epoch": 0.5478645066273933,
      "grad_norm": 7.374088764190674,
      "learning_rate": 7.131274906557725e-05,
      "loss": 1.94,
      "step": 186
    },
    {
      "epoch": 0.5508100147275405,
      "grad_norm": 6.2576003074646,
      "learning_rate": 7.071638639991207e-05,
      "loss": 1.701,
      "step": 187
    },
    {
      "epoch": 0.5537555228276878,
      "grad_norm": 5.392474174499512,
      "learning_rate": 7.011644432221958e-05,
      "loss": 1.474,
      "step": 188
    },
    {
      "epoch": 0.5567010309278351,
      "grad_norm": 7.743690013885498,
      "learning_rate": 6.95130264914993e-05,
      "loss": 2.1458,
      "step": 189
    },
    {
      "epoch": 0.5596465390279823,
      "grad_norm": 7.468958377838135,
      "learning_rate": 6.890623716729724e-05,
      "loss": 2.2593,
      "step": 190
    },
    {
      "epoch": 0.5625920471281296,
      "grad_norm": 6.144321918487549,
      "learning_rate": 6.82961811916917e-05,
      "loss": 1.3348,
      "step": 191
    },
    {
      "epoch": 0.5655375552282769,
      "grad_norm": 7.852849006652832,
      "learning_rate": 6.768296397117848e-05,
      "loss": 1.4751,
      "step": 192
    },
    {
      "epoch": 0.5684830633284241,
      "grad_norm": 7.6272172927856445,
      "learning_rate": 6.706669145845863e-05,
      "loss": 1.7606,
      "step": 193
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 13.624709129333496,
      "learning_rate": 6.644747013413168e-05,
      "loss": 1.9716,
      "step": 194
    },
    {
      "epoch": 0.5743740795287187,
      "grad_norm": 8.707653999328613,
      "learning_rate": 6.582540698829781e-05,
      "loss": 1.8245,
      "step": 195
    },
    {
      "epoch": 0.5773195876288659,
      "grad_norm": 6.5573039054870605,
      "learning_rate": 6.520060950207185e-05,
      "loss": 1.1733,
      "step": 196
    },
    {
      "epoch": 0.5802650957290133,
      "grad_norm": 8.433490753173828,
      "learning_rate": 6.457318562901256e-05,
      "loss": 1.2757,
      "step": 197
    },
    {
      "epoch": 0.5832106038291606,
      "grad_norm": 9.936591148376465,
      "learning_rate": 6.394324377647028e-05,
      "loss": 1.4713,
      "step": 198
    },
    {
      "epoch": 0.5861561119293078,
      "grad_norm": 12.52322769165039,
      "learning_rate": 6.331089278685599e-05,
      "loss": 1.6688,
      "step": 199
    },
    {
      "epoch": 0.5891016200294551,
      "grad_norm": 16.770870208740234,
      "learning_rate": 6.26762419188355e-05,
      "loss": 2.3924,
      "step": 200
    },
    {
      "epoch": 0.5920471281296024,
      "grad_norm": 3.7220280170440674,
      "learning_rate": 6.203940082845144e-05,
      "loss": 1.8733,
      "step": 201
    },
    {
      "epoch": 0.5949926362297496,
      "grad_norm": 3.919069290161133,
      "learning_rate": 6.140047955017671e-05,
      "loss": 0.7927,
      "step": 202
    },
    {
      "epoch": 0.5979381443298969,
      "grad_norm": 3.081045150756836,
      "learning_rate": 6.075958847790262e-05,
      "loss": 1.2878,
      "step": 203
    },
    {
      "epoch": 0.6008836524300442,
      "grad_norm": 2.82151198387146,
      "learning_rate": 6.011683834586473e-05,
      "loss": 1.2722,
      "step": 204
    },
    {
      "epoch": 0.6038291605301914,
      "grad_norm": 2.1925573348999023,
      "learning_rate": 5.947234020951015e-05,
      "loss": 0.7102,
      "step": 205
    },
    {
      "epoch": 0.6067746686303387,
      "grad_norm": 2.2746307849884033,
      "learning_rate": 5.882620542630901e-05,
      "loss": 0.8639,
      "step": 206
    },
    {
      "epoch": 0.6097201767304861,
      "grad_norm": 2.5907399654388428,
      "learning_rate": 5.8178545636514145e-05,
      "loss": 1.3898,
      "step": 207
    },
    {
      "epoch": 0.6126656848306333,
      "grad_norm": 3.3455467224121094,
      "learning_rate": 5.752947274387147e-05,
      "loss": 1.6453,
      "step": 208
    },
    {
      "epoch": 0.6156111929307806,
      "grad_norm": 3.1584908962249756,
      "learning_rate": 5.687909889628529e-05,
      "loss": 1.8621,
      "step": 209
    },
    {
      "epoch": 0.6185567010309279,
      "grad_norm": 3.1960365772247314,
      "learning_rate": 5.622753646644102e-05,
      "loss": 1.6358,
      "step": 210
    },
    {
      "epoch": 0.6215022091310751,
      "grad_norm": 3.073702573776245,
      "learning_rate": 5.557489803238933e-05,
      "loss": 1.5381,
      "step": 211
    },
    {
      "epoch": 0.6244477172312224,
      "grad_norm": 2.917039632797241,
      "learning_rate": 5.492129635809473e-05,
      "loss": 1.6452,
      "step": 212
    },
    {
      "epoch": 0.6273932253313697,
      "grad_norm": 3.823730707168579,
      "learning_rate": 5.426684437395196e-05,
      "loss": 1.7625,
      "step": 213
    },
    {
      "epoch": 0.6303387334315169,
      "grad_norm": 4.05581521987915,
      "learning_rate": 5.361165515727374e-05,
      "loss": 1.9468,
      "step": 214
    },
    {
      "epoch": 0.6332842415316642,
      "grad_norm": 3.9730498790740967,
      "learning_rate": 5.295584191275308e-05,
      "loss": 2.2487,
      "step": 215
    },
    {
      "epoch": 0.6362297496318114,
      "grad_norm": 3.2518630027770996,
      "learning_rate": 5.229951795290353e-05,
      "loss": 1.5587,
      "step": 216
    },
    {
      "epoch": 0.6391752577319587,
      "grad_norm": 4.549688816070557,
      "learning_rate": 5.164279667848094e-05,
      "loss": 1.9062,
      "step": 217
    },
    {
      "epoch": 0.6421207658321061,
      "grad_norm": 3.6768784523010254,
      "learning_rate": 5.0985791558889785e-05,
      "loss": 1.6801,
      "step": 218
    },
    {
      "epoch": 0.6450662739322534,
      "grad_norm": 3.8859946727752686,
      "learning_rate": 5.032861611257783e-05,
      "loss": 1.9869,
      "step": 219
    },
    {
      "epoch": 0.6480117820324006,
      "grad_norm": 4.19022798538208,
      "learning_rate": 4.967138388742218e-05,
      "loss": 2.0944,
      "step": 220
    },
    {
      "epoch": 0.6509572901325479,
      "grad_norm": 3.7361319065093994,
      "learning_rate": 4.901420844111021e-05,
      "loss": 1.3391,
      "step": 221
    },
    {
      "epoch": 0.6539027982326951,
      "grad_norm": 3.7620272636413574,
      "learning_rate": 4.835720332151907e-05,
      "loss": 1.2597,
      "step": 222
    },
    {
      "epoch": 0.6568483063328424,
      "grad_norm": 3.610649824142456,
      "learning_rate": 4.770048204709648e-05,
      "loss": 1.2885,
      "step": 223
    },
    {
      "epoch": 0.6597938144329897,
      "grad_norm": 4.286924839019775,
      "learning_rate": 4.7044158087246926e-05,
      "loss": 1.7704,
      "step": 224
    },
    {
      "epoch": 0.6627393225331369,
      "grad_norm": 4.295466423034668,
      "learning_rate": 4.6388344842726264e-05,
      "loss": 1.3672,
      "step": 225
    },
    {
      "epoch": 0.6656848306332842,
      "grad_norm": 5.368373870849609,
      "learning_rate": 4.5733155626048036e-05,
      "loss": 1.5249,
      "step": 226
    },
    {
      "epoch": 0.6686303387334315,
      "grad_norm": 5.0956950187683105,
      "learning_rate": 4.507870364190527e-05,
      "loss": 1.3973,
      "step": 227
    },
    {
      "epoch": 0.6715758468335787,
      "grad_norm": 4.604374408721924,
      "learning_rate": 4.4425101967610674e-05,
      "loss": 1.3965,
      "step": 228
    },
    {
      "epoch": 0.6745213549337261,
      "grad_norm": 6.400148391723633,
      "learning_rate": 4.377246353355899e-05,
      "loss": 2.1319,
      "step": 229
    },
    {
      "epoch": 0.6774668630338734,
      "grad_norm": 6.704805850982666,
      "learning_rate": 4.312090110371473e-05,
      "loss": 1.9594,
      "step": 230
    },
    {
      "epoch": 0.6804123711340206,
      "grad_norm": 4.956854343414307,
      "learning_rate": 4.247052725612852e-05,
      "loss": 1.3392,
      "step": 231
    },
    {
      "epoch": 0.6833578792341679,
      "grad_norm": 8.188490867614746,
      "learning_rate": 4.1821454363485866e-05,
      "loss": 1.7636,
      "step": 232
    },
    {
      "epoch": 0.6863033873343152,
      "grad_norm": 5.5076751708984375,
      "learning_rate": 4.1173794573690996e-05,
      "loss": 1.3066,
      "step": 233
    },
    {
      "epoch": 0.6892488954344624,
      "grad_norm": 6.480940341949463,
      "learning_rate": 4.052765979048986e-05,
      "loss": 1.8326,
      "step": 234
    },
    {
      "epoch": 0.6921944035346097,
      "grad_norm": 7.0487284660339355,
      "learning_rate": 3.988316165413528e-05,
      "loss": 1.7487,
      "step": 235
    },
    {
      "epoch": 0.695139911634757,
      "grad_norm": 6.207117557525635,
      "learning_rate": 3.924041152209739e-05,
      "loss": 1.4652,
      "step": 236
    },
    {
      "epoch": 0.6980854197349042,
      "grad_norm": 5.3427205085754395,
      "learning_rate": 3.859952044982329e-05,
      "loss": 1.3738,
      "step": 237
    },
    {
      "epoch": 0.7010309278350515,
      "grad_norm": 7.366813659667969,
      "learning_rate": 3.7960599171548574e-05,
      "loss": 1.4897,
      "step": 238
    },
    {
      "epoch": 0.7039764359351989,
      "grad_norm": 6.943472862243652,
      "learning_rate": 3.732375808116451e-05,
      "loss": 1.2526,
      "step": 239
    },
    {
      "epoch": 0.7069219440353461,
      "grad_norm": 6.768280506134033,
      "learning_rate": 3.668910721314402e-05,
      "loss": 1.215,
      "step": 240
    },
    {
      "epoch": 0.7098674521354934,
      "grad_norm": 8.1182279586792,
      "learning_rate": 3.605675622352973e-05,
      "loss": 1.8819,
      "step": 241
    },
    {
      "epoch": 0.7128129602356407,
      "grad_norm": 6.481826305389404,
      "learning_rate": 3.542681437098745e-05,
      "loss": 1.0718,
      "step": 242
    },
    {
      "epoch": 0.7157584683357879,
      "grad_norm": 6.8479485511779785,
      "learning_rate": 3.479939049792817e-05,
      "loss": 1.0558,
      "step": 243
    },
    {
      "epoch": 0.7187039764359352,
      "grad_norm": 7.331018924713135,
      "learning_rate": 3.417459301170219e-05,
      "loss": 1.2658,
      "step": 244
    },
    {
      "epoch": 0.7216494845360825,
      "grad_norm": 7.68209981918335,
      "learning_rate": 3.355252986586832e-05,
      "loss": 1.2073,
      "step": 245
    },
    {
      "epoch": 0.7245949926362297,
      "grad_norm": 11.420364379882812,
      "learning_rate": 3.293330854154136e-05,
      "loss": 1.9316,
      "step": 246
    },
    {
      "epoch": 0.727540500736377,
      "grad_norm": 7.791138172149658,
      "learning_rate": 3.2317036028821523e-05,
      "loss": 1.3763,
      "step": 247
    },
    {
      "epoch": 0.7304860088365243,
      "grad_norm": 8.121346473693848,
      "learning_rate": 3.1703818808308324e-05,
      "loss": 1.2789,
      "step": 248
    },
    {
      "epoch": 0.7334315169366715,
      "grad_norm": 11.62152099609375,
      "learning_rate": 3.109376283270277e-05,
      "loss": 1.9576,
      "step": 249
    },
    {
      "epoch": 0.7363770250368189,
      "grad_norm": 16.3736515045166,
      "learning_rate": 3.0486973508500727e-05,
      "loss": 1.8166,
      "step": 250
    },
    {
      "epoch": 0.7393225331369662,
      "grad_norm": 2.2238049507141113,
      "learning_rate": 2.988355567778043e-05,
      "loss": 1.8472,
      "step": 251
    },
    {
      "epoch": 0.7422680412371134,
      "grad_norm": 2.6677799224853516,
      "learning_rate": 2.9283613600087933e-05,
      "loss": 1.3083,
      "step": 252
    },
    {
      "epoch": 0.7452135493372607,
      "grad_norm": 2.822364568710327,
      "learning_rate": 2.8687250934422772e-05,
      "loss": 0.9437,
      "step": 253
    },
    {
      "epoch": 0.748159057437408,
      "grad_norm": 2.1095311641693115,
      "learning_rate": 2.8094570721327662e-05,
      "loss": 0.9748,
      "step": 254
    },
    {
      "epoch": 0.7511045655375552,
      "grad_norm": 1.7905985116958618,
      "learning_rate": 2.750567536508504e-05,
      "loss": 0.881,
      "step": 255
    },
    {
      "epoch": 0.7540500736377025,
      "grad_norm": 2.3329577445983887,
      "learning_rate": 2.6920666616023327e-05,
      "loss": 1.2698,
      "step": 256
    },
    {
      "epoch": 0.7569955817378498,
      "grad_norm": 1.4233232736587524,
      "learning_rate": 2.6339645552936536e-05,
      "loss": 0.6157,
      "step": 257
    },
    {
      "epoch": 0.759941089837997,
      "grad_norm": 2.3130457401275635,
      "learning_rate": 2.5762712565619528e-05,
      "loss": 1.1048,
      "step": 258
    },
    {
      "epoch": 0.7628865979381443,
      "grad_norm": 3.3101067543029785,
      "learning_rate": 2.5189967337522573e-05,
      "loss": 1.9625,
      "step": 259
    },
    {
      "epoch": 0.7658321060382917,
      "grad_norm": 3.263514995574951,
      "learning_rate": 2.46215088285279e-05,
      "loss": 1.9653,
      "step": 260
    },
    {
      "epoch": 0.7687776141384389,
      "grad_norm": 4.304958820343018,
      "learning_rate": 2.4057435257851175e-05,
      "loss": 1.7951,
      "step": 261
    },
    {
      "epoch": 0.7717231222385862,
      "grad_norm": 3.644676685333252,
      "learning_rate": 2.349784408707112e-05,
      "loss": 1.7257,
      "step": 262
    },
    {
      "epoch": 0.7746686303387335,
      "grad_norm": 3.691222906112671,
      "learning_rate": 2.2942832003289823e-05,
      "loss": 1.3952,
      "step": 263
    },
    {
      "epoch": 0.7776141384388807,
      "grad_norm": 3.36948823928833,
      "learning_rate": 2.2392494902427025e-05,
      "loss": 1.5119,
      "step": 264
    },
    {
      "epoch": 0.780559646539028,
      "grad_norm": 3.5477828979492188,
      "learning_rate": 2.1846927872651137e-05,
      "loss": 1.6785,
      "step": 265
    },
    {
      "epoch": 0.7835051546391752,
      "grad_norm": 3.44746470451355,
      "learning_rate": 2.1306225177949585e-05,
      "loss": 1.3705,
      "step": 266
    },
    {
      "epoch": 0.7864506627393225,
      "grad_norm": 5.451107501983643,
      "learning_rate": 2.07704802418419e-05,
      "loss": 2.1179,
      "step": 267
    },
    {
      "epoch": 0.7893961708394698,
      "grad_norm": 3.672868251800537,
      "learning_rate": 2.0239785631237705e-05,
      "loss": 1.6712,
      "step": 268
    },
    {
      "epoch": 0.792341678939617,
      "grad_norm": 3.347046375274658,
      "learning_rate": 1.9714233040442915e-05,
      "loss": 1.466,
      "step": 269
    },
    {
      "epoch": 0.7952871870397643,
      "grad_norm": 3.741647481918335,
      "learning_rate": 1.9193913275316626e-05,
      "loss": 1.5614,
      "step": 270
    },
    {
      "epoch": 0.7982326951399117,
      "grad_norm": 3.9401445388793945,
      "learning_rate": 1.8678916237581522e-05,
      "loss": 1.3664,
      "step": 271
    },
    {
      "epoch": 0.801178203240059,
      "grad_norm": 4.1288604736328125,
      "learning_rate": 1.816933090929055e-05,
      "loss": 1.5295,
      "step": 272
    },
    {
      "epoch": 0.8041237113402062,
      "grad_norm": 3.8962388038635254,
      "learning_rate": 1.7665245337452368e-05,
      "loss": 1.5394,
      "step": 273
    },
    {
      "epoch": 0.8070692194403535,
      "grad_norm": 3.2219879627227783,
      "learning_rate": 1.716674661881848e-05,
      "loss": 1.1269,
      "step": 274
    },
    {
      "epoch": 0.8100147275405007,
      "grad_norm": 3.5994858741760254,
      "learning_rate": 1.667392088483456e-05,
      "loss": 1.1873,
      "step": 275
    },
    {
      "epoch": 0.812960235640648,
      "grad_norm": 4.191354274749756,
      "learning_rate": 1.6186853286758397e-05,
      "loss": 1.6051,
      "step": 276
    },
    {
      "epoch": 0.8159057437407953,
      "grad_norm": 3.547515392303467,
      "learning_rate": 1.570562798094747e-05,
      "loss": 1.2364,
      "step": 277
    },
    {
      "epoch": 0.8188512518409425,
      "grad_norm": 7.393482208251953,
      "learning_rate": 1.5230328114318127e-05,
      "loss": 1.4475,
      "step": 278
    },
    {
      "epoch": 0.8217967599410898,
      "grad_norm": 3.847755193710327,
      "learning_rate": 1.4761035809979395e-05,
      "loss": 1.1197,
      "step": 279
    },
    {
      "epoch": 0.8247422680412371,
      "grad_norm": 4.247459888458252,
      "learning_rate": 1.4297832153043656e-05,
      "loss": 1.2487,
      "step": 280
    },
    {
      "epoch": 0.8276877761413843,
      "grad_norm": 5.425104141235352,
      "learning_rate": 1.3840797176616466e-05,
      "loss": 1.7611,
      "step": 281
    },
    {
      "epoch": 0.8306332842415317,
      "grad_norm": 4.583087921142578,
      "learning_rate": 1.3390009847968504e-05,
      "loss": 1.4049,
      "step": 282
    },
    {
      "epoch": 0.833578792341679,
      "grad_norm": 8.560991287231445,
      "learning_rate": 1.2945548054891321e-05,
      "loss": 1.2722,
      "step": 283
    },
    {
      "epoch": 0.8365243004418262,
      "grad_norm": 5.156883716583252,
      "learning_rate": 1.2507488592239847e-05,
      "loss": 1.3135,
      "step": 284
    },
    {
      "epoch": 0.8394698085419735,
      "grad_norm": 4.798750400543213,
      "learning_rate": 1.2075907148663579e-05,
      "loss": 1.0881,
      "step": 285
    },
    {
      "epoch": 0.8424153166421208,
      "grad_norm": 5.219260215759277,
      "learning_rate": 1.1650878293528994e-05,
      "loss": 1.3982,
      "step": 286
    },
    {
      "epoch": 0.845360824742268,
      "grad_norm": 5.036367893218994,
      "learning_rate": 1.1232475464035385e-05,
      "loss": 1.2723,
      "step": 287
    },
    {
      "epoch": 0.8483063328424153,
      "grad_norm": 5.566346645355225,
      "learning_rate": 1.0820770952526155e-05,
      "loss": 0.7823,
      "step": 288
    },
    {
      "epoch": 0.8512518409425626,
      "grad_norm": 4.645596504211426,
      "learning_rate": 1.0415835893998116e-05,
      "loss": 0.9811,
      "step": 289
    },
    {
      "epoch": 0.8541973490427098,
      "grad_norm": 5.099687099456787,
      "learning_rate": 1.0017740253810609e-05,
      "loss": 1.0945,
      "step": 290
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 5.225978374481201,
      "learning_rate": 9.62655281559679e-06,
      "loss": 1.0845,
      "step": 291
    },
    {
      "epoch": 0.8600883652430045,
      "grad_norm": 9.22032356262207,
      "learning_rate": 9.242341169379076e-06,
      "loss": 1.221,
      "step": 292
    },
    {
      "epoch": 0.8630338733431517,
      "grad_norm": 9.09154987335205,
      "learning_rate": 8.865171699890834e-06,
      "loss": 1.7676,
      "step": 293
    },
    {
      "epoch": 0.865979381443299,
      "grad_norm": 6.99169397354126,
      "learning_rate": 8.49510957510633e-06,
      "loss": 1.1186,
      "step": 294
    },
    {
      "epoch": 0.8689248895434463,
      "grad_norm": 7.78825044631958,
      "learning_rate": 8.132218734980852e-06,
      "loss": 1.1668,
      "step": 295
    },
    {
      "epoch": 0.8718703976435935,
      "grad_norm": 6.3908371925354,
      "learning_rate": 7.776561880403072e-06,
      "loss": 1.1461,
      "step": 296
    },
    {
      "epoch": 0.8748159057437408,
      "grad_norm": 8.278850555419922,
      "learning_rate": 7.4282004623615396e-06,
      "loss": 1.2128,
      "step": 297
    },
    {
      "epoch": 0.8777614138438881,
      "grad_norm": 8.332165718078613,
      "learning_rate": 7.0871946713269856e-06,
      "loss": 1.016,
      "step": 298
    },
    {
      "epoch": 0.8807069219440353,
      "grad_norm": 14.03607177734375,
      "learning_rate": 6.753603426852589e-06,
      "loss": 1.7378,
      "step": 299
    },
    {
      "epoch": 0.8836524300441826,
      "grad_norm": 15.087005615234375,
      "learning_rate": 6.427484367393699e-06,
      "loss": 1.6779,
      "step": 300
    },
    {
      "epoch": 0.8865979381443299,
      "grad_norm": 2.469926357269287,
      "learning_rate": 6.108893840348995e-06,
      "loss": 1.7045,
      "step": 301
    },
    {
      "epoch": 0.8895434462444771,
      "grad_norm": 2.2709763050079346,
      "learning_rate": 5.797886892324694e-06,
      "loss": 0.9064,
      "step": 302
    },
    {
      "epoch": 0.8924889543446245,
      "grad_norm": 2.1486294269561768,
      "learning_rate": 5.494517259623477e-06,
      "loss": 1.3705,
      "step": 303
    },
    {
      "epoch": 0.8954344624447718,
      "grad_norm": 1.1387767791748047,
      "learning_rate": 5.198837358959901e-06,
      "loss": 0.2727,
      "step": 304
    },
    {
      "epoch": 0.898379970544919,
      "grad_norm": 4.750706672668457,
      "learning_rate": 4.910898278403669e-06,
      "loss": 0.9473,
      "step": 305
    },
    {
      "epoch": 0.9013254786450663,
      "grad_norm": 3.402461051940918,
      "learning_rate": 4.630749768552589e-06,
      "loss": 1.3646,
      "step": 306
    },
    {
      "epoch": 0.9042709867452136,
      "grad_norm": 3.262354612350464,
      "learning_rate": 4.358440233936617e-06,
      "loss": 1.6704,
      "step": 307
    },
    {
      "epoch": 0.9072164948453608,
      "grad_norm": 2.7272191047668457,
      "learning_rate": 4.094016724654359e-06,
      "loss": 1.4253,
      "step": 308
    },
    {
      "epoch": 0.9101620029455081,
      "grad_norm": 3.3352503776550293,
      "learning_rate": 3.837524928243774e-06,
      "loss": 1.5388,
      "step": 309
    },
    {
      "epoch": 0.9131075110456554,
      "grad_norm": 2.7655768394470215,
      "learning_rate": 3.589009161788104e-06,
      "loss": 1.3632,
      "step": 310
    },
    {
      "epoch": 0.9160530191458026,
      "grad_norm": 3.8515467643737793,
      "learning_rate": 3.3485123642587658e-06,
      "loss": 1.822,
      "step": 311
    },
    {
      "epoch": 0.9189985272459499,
      "grad_norm": 3.5630061626434326,
      "learning_rate": 3.116076089096265e-06,
      "loss": 1.6991,
      "step": 312
    },
    {
      "epoch": 0.9219440353460973,
      "grad_norm": 4.128489017486572,
      "learning_rate": 2.8917404970305097e-06,
      "loss": 1.3073,
      "step": 313
    },
    {
      "epoch": 0.9248895434462445,
      "grad_norm": 4.167806148529053,
      "learning_rate": 2.675544349141779e-06,
      "loss": 1.73,
      "step": 314
    },
    {
      "epoch": 0.9278350515463918,
      "grad_norm": 4.222491264343262,
      "learning_rate": 2.4675250001635232e-06,
      "loss": 1.6474,
      "step": 315
    },
    {
      "epoch": 0.930780559646539,
      "grad_norm": 3.9301912784576416,
      "learning_rate": 2.2677183920281343e-06,
      "loss": 1.8446,
      "step": 316
    },
    {
      "epoch": 0.9337260677466863,
      "grad_norm": 10.473541259765625,
      "learning_rate": 2.076159047656889e-06,
      "loss": 1.7129,
      "step": 317
    },
    {
      "epoch": 0.9366715758468336,
      "grad_norm": 3.969463348388672,
      "learning_rate": 1.892880064994934e-06,
      "loss": 1.2407,
      "step": 318
    },
    {
      "epoch": 0.9396170839469808,
      "grad_norm": 4.088743686676025,
      "learning_rate": 1.7179131112926627e-06,
      "loss": 1.5257,
      "step": 319
    },
    {
      "epoch": 0.9425625920471281,
      "grad_norm": 3.984471559524536,
      "learning_rate": 1.551288417634106e-06,
      "loss": 1.2564,
      "step": 320
    },
    {
      "epoch": 0.9455081001472754,
      "grad_norm": 7.943843841552734,
      "learning_rate": 1.3930347737136196e-06,
      "loss": 1.8827,
      "step": 321
    },
    {
      "epoch": 0.9484536082474226,
      "grad_norm": 5.492068767547607,
      "learning_rate": 1.2431795228615372e-06,
      "loss": 1.7856,
      "step": 322
    },
    {
      "epoch": 0.9513991163475699,
      "grad_norm": 4.9044647216796875,
      "learning_rate": 1.101748557319715e-06,
      "loss": 1.4158,
      "step": 323
    },
    {
      "epoch": 0.9543446244477173,
      "grad_norm": 4.557461261749268,
      "learning_rate": 9.687663137678604e-07,
      "loss": 1.1651,
      "step": 324
    },
    {
      "epoch": 0.9572901325478645,
      "grad_norm": 5.330766201019287,
      "learning_rate": 8.442557691013043e-07,
      "loss": 1.3875,
      "step": 325
    },
    {
      "epoch": 0.9602356406480118,
      "grad_norm": 4.67563009262085,
      "learning_rate": 7.282384364610206e-07,
      "loss": 1.0563,
      "step": 326
    },
    {
      "epoch": 0.9631811487481591,
      "grad_norm": 5.674734592437744,
      "learning_rate": 6.207343615165561e-07,
      "loss": 1.1674,
      "step": 327
    },
    {
      "epoch": 0.9661266568483063,
      "grad_norm": 6.089367866516113,
      "learning_rate": 5.217621190024779e-07,
      "loss": 1.1991,
      "step": 328
    },
    {
      "epoch": 0.9690721649484536,
      "grad_norm": 5.158655643463135,
      "learning_rate": 4.3133880950905205e-07,
      "loss": 1.2847,
      "step": 329
    },
    {
      "epoch": 0.9720176730486009,
      "grad_norm": 5.49578332901001,
      "learning_rate": 3.494800565275125e-07,
      "loss": 1.1238,
      "step": 330
    },
    {
      "epoch": 0.9749631811487481,
      "grad_norm": 5.605154991149902,
      "learning_rate": 2.762000037506485e-07,
      "loss": 0.964,
      "step": 331
    },
    {
      "epoch": 0.9779086892488954,
      "grad_norm": 7.648482799530029,
      "learning_rate": 2.115113126290258e-07,
      "loss": 1.4449,
      "step": 332
    },
    {
      "epoch": 0.9808541973490427,
      "grad_norm": 8.961212158203125,
      "learning_rate": 1.554251601833201e-07,
      "loss": 1.5872,
      "step": 333
    },
    {
      "epoch": 0.9837997054491899,
      "grad_norm": 7.242101669311523,
      "learning_rate": 1.0795123707312283e-07,
      "loss": 1.0052,
      "step": 334
    },
    {
      "epoch": 0.9867452135493373,
      "grad_norm": 7.433496475219727,
      "learning_rate": 6.909774592258056e-08,
      "loss": 1.1363,
      "step": 335
    },
    {
      "epoch": 0.9896907216494846,
      "grad_norm": 7.9218854904174805,
      "learning_rate": 3.8871399903134265e-08,
      "loss": 1.1405,
      "step": 336
    },
    {
      "epoch": 0.9926362297496318,
      "grad_norm": 6.639596462249756,
      "learning_rate": 1.7277421573608232e-08,
      "loss": 1.1224,
      "step": 337
    },
    {
      "epoch": 0.9955817378497791,
      "grad_norm": 9.504895210266113,
      "learning_rate": 4.319541977831909e-09,
      "loss": 1.5028,
      "step": 338
    },
    {
      "epoch": 0.9985272459499264,
      "grad_norm": 8.52180290222168,
      "learning_rate": 0.0,
      "loss": 0.8636,
      "step": 339
    }
  ],
  "logging_steps": 1,
  "max_steps": 339,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 239,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.500002096893133e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}
|
|