diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7061 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.18851918182675087, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00018851918182675087, + "grad_norm": 1.6484375, + "learning_rate": 5e-06, + "loss": 2.7659, + "step": 1 + }, + { + "epoch": 0.00037703836365350174, + "grad_norm": 1.6640625, + "learning_rate": 1e-05, + "loss": 2.5842, + "step": 2 + }, + { + "epoch": 0.0005655575454802526, + "grad_norm": 1.609375, + "learning_rate": 1.5e-05, + "loss": 2.8169, + "step": 3 + }, + { + "epoch": 0.0007540767273070035, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 2.6938, + "step": 4 + }, + { + "epoch": 0.0009425959091337543, + "grad_norm": 1.546875, + "learning_rate": 2.5e-05, + "loss": 2.7862, + "step": 5 + }, + { + "epoch": 0.0011311150909605052, + "grad_norm": 1.671875, + "learning_rate": 3e-05, + "loss": 2.8844, + "step": 6 + }, + { + "epoch": 0.0013196342727872562, + "grad_norm": 1.5703125, + "learning_rate": 3.5000000000000004e-05, + "loss": 2.8254, + "step": 7 + }, + { + "epoch": 0.001508153454614007, + "grad_norm": 1.5625, + "learning_rate": 4e-05, + "loss": 2.7735, + "step": 8 + }, + { + "epoch": 0.001696672636440758, + "grad_norm": 1.609375, + "learning_rate": 4.4999999999999996e-05, + "loss": 2.8222, + "step": 9 + }, + { + "epoch": 0.0018851918182675087, + "grad_norm": 1.6328125, + "learning_rate": 5e-05, + "loss": 2.6943, + "step": 10 + }, + { + "epoch": 0.0020737110000942595, + "grad_norm": 1.5546875, + "learning_rate": 5.5e-05, + "loss": 2.6735, + "step": 11 + }, + { + "epoch": 0.0022622301819210104, + "grad_norm": 1.6640625, + "learning_rate": 6e-05, + "loss": 2.6482, + "step": 12 + }, + { + "epoch": 0.0024507493637477614, + "grad_norm": 1.4375, + "learning_rate": 6.500000000000001e-05, + "loss": 2.8788, + "step": 13 + }, + { + "epoch": 0.0026392685455745124, + "grad_norm": 1.6328125, + "learning_rate": 7.000000000000001e-05, + "loss": 2.7531, + "step": 14 + }, + { + "epoch": 0.002827787727401263, + "grad_norm": 1.53125, + "learning_rate": 7.5e-05, + "loss": 2.7911, + "step": 15 + }, + { + "epoch": 0.003016306909228014, + "grad_norm": 1.5703125, + "learning_rate": 8e-05, + "loss": 2.7358, + "step": 16 + }, + { + "epoch": 0.003204826091054765, + "grad_norm": 1.5859375, + "learning_rate": 8.5e-05, + "loss": 2.7272, + "step": 17 + }, + { + "epoch": 0.003393345272881516, + "grad_norm": 1.515625, + "learning_rate": 8.999999999999999e-05, + "loss": 2.7176, + "step": 18 + }, + { + "epoch": 0.0035818644547082664, + "grad_norm": 1.5, + "learning_rate": 9.5e-05, + "loss": 2.8573, + "step": 19 + }, + { + "epoch": 0.0037703836365350174, + "grad_norm": 1.5234375, + "learning_rate": 0.0001, + "loss": 2.7512, + "step": 20 + }, + { + "epoch": 0.003958902818361768, + "grad_norm": 1.296875, + "learning_rate": 0.000105, + "loss": 2.7962, + "step": 21 + }, + { + "epoch": 0.004147422000188519, + "grad_norm": 1.3671875, + "learning_rate": 0.00011, + "loss": 2.7, + "step": 22 + }, + { + "epoch": 0.00433594118201527, + "grad_norm": 1.296875, + "learning_rate": 0.000115, + "loss": 2.7128, + "step": 23 + }, + { + "epoch": 0.004524460363842021, + "grad_norm": 1.2890625, + "learning_rate": 0.00012, + "loss": 2.729, + "step": 24 + }, + { + "epoch": 0.004712979545668771, + "grad_norm": 1.25, + "learning_rate": 0.000125, + "loss": 2.698, + "step": 25 + }, + { + "epoch": 0.004901498727495523, + "grad_norm": 1.3046875, + "learning_rate": 0.00013000000000000002, + "loss": 2.7461, + "step": 26 + }, + { + "epoch": 0.005090017909322273, + "grad_norm": 1.1796875, + "learning_rate": 0.000135, + "loss": 2.7315, + "step": 27 + }, + { + "epoch": 0.005278537091149025, + "grad_norm": 1.15625, + "learning_rate": 0.00014000000000000001, + "loss": 2.7089, + "step": 28 + }, + { + "epoch": 0.005467056272975775, + "grad_norm": 1.1875, + "learning_rate": 0.000145, + "loss": 2.6724, + "step": 29 + }, + { + "epoch": 0.005655575454802526, + "grad_norm": 1.0859375, + "learning_rate": 0.00015, + "loss": 2.799, + "step": 30 + }, + { + "epoch": 0.005844094636629277, + "grad_norm": 1.0546875, + "learning_rate": 0.000155, + "loss": 2.7939, + "step": 31 + }, + { + "epoch": 0.006032613818456028, + "grad_norm": 1.078125, + "learning_rate": 0.00016, + "loss": 2.8004, + "step": 32 + }, + { + "epoch": 0.006221133000282778, + "grad_norm": 1.0390625, + "learning_rate": 0.000165, + "loss": 2.6322, + "step": 33 + }, + { + "epoch": 0.00640965218210953, + "grad_norm": 1.0, + "learning_rate": 0.00017, + "loss": 2.7095, + "step": 34 + }, + { + "epoch": 0.00659817136393628, + "grad_norm": 1.0234375, + "learning_rate": 0.000175, + "loss": 2.7111, + "step": 35 + }, + { + "epoch": 0.006786690545763032, + "grad_norm": 0.91796875, + "learning_rate": 0.00017999999999999998, + "loss": 2.7666, + "step": 36 + }, + { + "epoch": 0.006975209727589782, + "grad_norm": 0.921875, + "learning_rate": 0.000185, + "loss": 2.7779, + "step": 37 + }, + { + "epoch": 0.007163728909416533, + "grad_norm": 0.91015625, + "learning_rate": 0.00019, + "loss": 2.7684, + "step": 38 + }, + { + "epoch": 0.007352248091243284, + "grad_norm": 0.875, + "learning_rate": 0.00019500000000000002, + "loss": 2.8674, + "step": 39 + }, + { + "epoch": 0.007540767273070035, + "grad_norm": 0.8515625, + "learning_rate": 0.0002, + "loss": 2.7694, + "step": 40 + }, + { + "epoch": 0.007729286454896786, + "grad_norm": 0.85546875, + "learning_rate": 0.000205, + "loss": 2.6799, + "step": 41 + }, + { + "epoch": 0.007917805636723537, + "grad_norm": 0.8046875, + "learning_rate": 0.00021, + "loss": 2.6289, + "step": 42 + }, + { + "epoch": 0.008106324818550288, + "grad_norm": 0.859375, + "learning_rate": 0.000215, + "loss": 2.7937, + "step": 43 + }, + { + "epoch": 0.008294844000377038, + "grad_norm": 0.80859375, + "learning_rate": 0.00022, + "loss": 2.78, + "step": 44 + }, + { + "epoch": 0.00848336318220379, + "grad_norm": 0.765625, + "learning_rate": 0.00022500000000000002, + "loss": 2.6351, + "step": 45 + }, + { + "epoch": 0.00867188236403054, + "grad_norm": 0.7890625, + "learning_rate": 0.00023, + "loss": 2.8156, + "step": 46 + }, + { + "epoch": 0.00886040154585729, + "grad_norm": 0.7890625, + "learning_rate": 0.000235, + "loss": 2.8304, + "step": 47 + }, + { + "epoch": 0.009048920727684042, + "grad_norm": 0.76171875, + "learning_rate": 0.00024, + "loss": 2.7148, + "step": 48 + }, + { + "epoch": 0.009237439909510793, + "grad_norm": 0.75390625, + "learning_rate": 0.000245, + "loss": 2.7169, + "step": 49 + }, + { + "epoch": 0.009425959091337543, + "grad_norm": 0.76953125, + "learning_rate": 0.00025, + "loss": 2.8345, + "step": 50 + }, + { + "epoch": 0.009614478273164294, + "grad_norm": 0.73046875, + "learning_rate": 0.000255, + "loss": 2.8149, + "step": 51 + }, + { + "epoch": 0.009802997454991046, + "grad_norm": 0.7421875, + "learning_rate": 0.00026000000000000003, + "loss": 2.8182, + "step": 52 + }, + { + "epoch": 0.009991516636817797, + "grad_norm": 0.7578125, + "learning_rate": 0.00026500000000000004, + "loss": 2.8114, + "step": 53 + }, + { + "epoch": 0.010180035818644547, + "grad_norm": 0.71875, + "learning_rate": 0.00027, + "loss": 2.803, + "step": 54 + }, + { + "epoch": 0.010368555000471298, + "grad_norm": 0.73046875, + "learning_rate": 0.000275, + "loss": 2.7979, + "step": 55 + }, + { + "epoch": 0.01055707418229805, + "grad_norm": 0.73828125, + "learning_rate": 0.00028000000000000003, + "loss": 2.8062, + "step": 56 + }, + { + "epoch": 0.0107455933641248, + "grad_norm": 0.76171875, + "learning_rate": 0.000285, + "loss": 2.6728, + "step": 57 + }, + { + "epoch": 0.01093411254595155, + "grad_norm": 0.73046875, + "learning_rate": 0.00029, + "loss": 2.7547, + "step": 58 + }, + { + "epoch": 0.011122631727778302, + "grad_norm": 0.71875, + "learning_rate": 0.000295, + "loss": 2.6773, + "step": 59 + }, + { + "epoch": 0.011311150909605052, + "grad_norm": 0.7109375, + "learning_rate": 0.0003, + "loss": 2.7238, + "step": 60 + }, + { + "epoch": 0.011499670091431803, + "grad_norm": 0.77734375, + "learning_rate": 0.000305, + "loss": 2.6842, + "step": 61 + }, + { + "epoch": 0.011688189273258555, + "grad_norm": 0.703125, + "learning_rate": 0.00031, + "loss": 2.8449, + "step": 62 + }, + { + "epoch": 0.011876708455085304, + "grad_norm": 0.703125, + "learning_rate": 0.000315, + "loss": 2.6828, + "step": 63 + }, + { + "epoch": 0.012065227636912056, + "grad_norm": 0.6796875, + "learning_rate": 0.00032, + "loss": 2.7663, + "step": 64 + }, + { + "epoch": 0.012253746818738807, + "grad_norm": 0.69140625, + "learning_rate": 0.00032500000000000004, + "loss": 2.6127, + "step": 65 + }, + { + "epoch": 0.012442266000565557, + "grad_norm": 0.70703125, + "learning_rate": 0.00033, + "loss": 2.6333, + "step": 66 + }, + { + "epoch": 0.012630785182392308, + "grad_norm": 0.6796875, + "learning_rate": 0.000335, + "loss": 2.7669, + "step": 67 + }, + { + "epoch": 0.01281930436421906, + "grad_norm": 0.7265625, + "learning_rate": 0.00034, + "loss": 2.7363, + "step": 68 + }, + { + "epoch": 0.013007823546045811, + "grad_norm": 0.7109375, + "learning_rate": 0.000345, + "loss": 2.6626, + "step": 69 + }, + { + "epoch": 0.01319634272787256, + "grad_norm": 0.71875, + "learning_rate": 0.00035, + "loss": 2.7896, + "step": 70 + }, + { + "epoch": 0.013384861909699312, + "grad_norm": 0.69140625, + "learning_rate": 0.000355, + "loss": 2.7407, + "step": 71 + }, + { + "epoch": 0.013573381091526063, + "grad_norm": 0.6953125, + "learning_rate": 0.00035999999999999997, + "loss": 2.804, + "step": 72 + }, + { + "epoch": 0.013761900273352813, + "grad_norm": 0.69140625, + "learning_rate": 0.000365, + "loss": 2.781, + "step": 73 + }, + { + "epoch": 0.013950419455179565, + "grad_norm": 0.6875, + "learning_rate": 0.00037, + "loss": 2.5436, + "step": 74 + }, + { + "epoch": 0.014138938637006316, + "grad_norm": 0.66796875, + "learning_rate": 0.000375, + "loss": 2.7272, + "step": 75 + }, + { + "epoch": 0.014327457818833066, + "grad_norm": 0.68359375, + "learning_rate": 0.00038, + "loss": 2.6777, + "step": 76 + }, + { + "epoch": 0.014515977000659817, + "grad_norm": 0.7734375, + "learning_rate": 0.00038500000000000003, + "loss": 2.8211, + "step": 77 + }, + { + "epoch": 0.014704496182486568, + "grad_norm": 0.6953125, + "learning_rate": 0.00039000000000000005, + "loss": 2.7639, + "step": 78 + }, + { + "epoch": 0.014893015364313318, + "grad_norm": 0.6953125, + "learning_rate": 0.000395, + "loss": 2.6884, + "step": 79 + }, + { + "epoch": 0.01508153454614007, + "grad_norm": 0.76171875, + "learning_rate": 0.0004, + "loss": 2.6492, + "step": 80 + }, + { + "epoch": 0.015270053727966821, + "grad_norm": 0.6796875, + "learning_rate": 0.00040500000000000003, + "loss": 2.8072, + "step": 81 + }, + { + "epoch": 0.015458572909793572, + "grad_norm": 0.7109375, + "learning_rate": 0.00041, + "loss": 2.7446, + "step": 82 + }, + { + "epoch": 0.015647092091620324, + "grad_norm": 0.69140625, + "learning_rate": 0.000415, + "loss": 2.7554, + "step": 83 + }, + { + "epoch": 0.015835611273447073, + "grad_norm": 0.70703125, + "learning_rate": 0.00042, + "loss": 2.7212, + "step": 84 + }, + { + "epoch": 0.016024130455273823, + "grad_norm": 0.72265625, + "learning_rate": 0.000425, + "loss": 2.6933, + "step": 85 + }, + { + "epoch": 0.016212649637100576, + "grad_norm": 0.69140625, + "learning_rate": 0.00043, + "loss": 2.7461, + "step": 86 + }, + { + "epoch": 0.016401168818927326, + "grad_norm": 0.671875, + "learning_rate": 0.000435, + "loss": 2.7079, + "step": 87 + }, + { + "epoch": 0.016589688000754076, + "grad_norm": 0.66796875, + "learning_rate": 0.00044, + "loss": 2.8562, + "step": 88 + }, + { + "epoch": 0.01677820718258083, + "grad_norm": 0.671875, + "learning_rate": 0.00044500000000000003, + "loss": 2.6606, + "step": 89 + }, + { + "epoch": 0.01696672636440758, + "grad_norm": 0.69140625, + "learning_rate": 0.00045000000000000004, + "loss": 2.7817, + "step": 90 + }, + { + "epoch": 0.017155245546234328, + "grad_norm": 0.74609375, + "learning_rate": 0.000455, + "loss": 2.7714, + "step": 91 + }, + { + "epoch": 0.01734376472806108, + "grad_norm": 0.68359375, + "learning_rate": 0.00046, + "loss": 2.7217, + "step": 92 + }, + { + "epoch": 0.01753228390988783, + "grad_norm": 0.671875, + "learning_rate": 0.000465, + "loss": 2.6855, + "step": 93 + }, + { + "epoch": 0.01772080309171458, + "grad_norm": 0.734375, + "learning_rate": 0.00047, + "loss": 2.7111, + "step": 94 + }, + { + "epoch": 0.017909322273541334, + "grad_norm": 0.71484375, + "learning_rate": 0.000475, + "loss": 2.6868, + "step": 95 + }, + { + "epoch": 0.018097841455368083, + "grad_norm": 0.71484375, + "learning_rate": 0.00048, + "loss": 2.7355, + "step": 96 + }, + { + "epoch": 0.018286360637194833, + "grad_norm": 0.69921875, + "learning_rate": 0.00048499999999999997, + "loss": 2.7172, + "step": 97 + }, + { + "epoch": 0.018474879819021586, + "grad_norm": 0.6796875, + "learning_rate": 0.00049, + "loss": 2.8204, + "step": 98 + }, + { + "epoch": 0.018663399000848336, + "grad_norm": 0.6875, + "learning_rate": 0.000495, + "loss": 2.6965, + "step": 99 + }, + { + "epoch": 0.018851918182675086, + "grad_norm": 0.6875, + "learning_rate": 0.0005, + "loss": 2.7988, + "step": 100 + }, + { + "epoch": 0.01904043736450184, + "grad_norm": 0.72265625, + "learning_rate": 0.000505, + "loss": 2.7069, + "step": 101 + }, + { + "epoch": 0.01922895654632859, + "grad_norm": 0.6796875, + "learning_rate": 0.00051, + "loss": 2.6942, + "step": 102 + }, + { + "epoch": 0.019417475728155338, + "grad_norm": 0.66015625, + "learning_rate": 0.000515, + "loss": 2.7497, + "step": 103 + }, + { + "epoch": 0.01960599490998209, + "grad_norm": 0.6875, + "learning_rate": 0.0005200000000000001, + "loss": 2.6381, + "step": 104 + }, + { + "epoch": 0.01979451409180884, + "grad_norm": 0.6796875, + "learning_rate": 0.0005250000000000001, + "loss": 2.6969, + "step": 105 + }, + { + "epoch": 0.019983033273635594, + "grad_norm": 0.66015625, + "learning_rate": 0.0005300000000000001, + "loss": 2.7247, + "step": 106 + }, + { + "epoch": 0.020171552455462344, + "grad_norm": 0.72265625, + "learning_rate": 0.000535, + "loss": 2.828, + "step": 107 + }, + { + "epoch": 0.020360071637289093, + "grad_norm": 0.7265625, + "learning_rate": 0.00054, + "loss": 2.7309, + "step": 108 + }, + { + "epoch": 0.020548590819115847, + "grad_norm": 0.71484375, + "learning_rate": 0.000545, + "loss": 2.8354, + "step": 109 + }, + { + "epoch": 0.020737110000942596, + "grad_norm": 0.69140625, + "learning_rate": 0.00055, + "loss": 2.8101, + "step": 110 + }, + { + "epoch": 0.020925629182769346, + "grad_norm": 0.69921875, + "learning_rate": 0.000555, + "loss": 2.7837, + "step": 111 + }, + { + "epoch": 0.0211141483645961, + "grad_norm": 0.69140625, + "learning_rate": 0.0005600000000000001, + "loss": 2.6813, + "step": 112 + }, + { + "epoch": 0.02130266754642285, + "grad_norm": 0.65234375, + "learning_rate": 0.000565, + "loss": 2.7035, + "step": 113 + }, + { + "epoch": 0.0214911867282496, + "grad_norm": 0.75390625, + "learning_rate": 0.00057, + "loss": 2.6901, + "step": 114 + }, + { + "epoch": 0.02167970591007635, + "grad_norm": 0.75, + "learning_rate": 0.000575, + "loss": 2.7001, + "step": 115 + }, + { + "epoch": 0.0218682250919031, + "grad_norm": 0.70703125, + "learning_rate": 0.00058, + "loss": 2.7508, + "step": 116 + }, + { + "epoch": 0.02205674427372985, + "grad_norm": 0.65625, + "learning_rate": 0.000585, + "loss": 2.7348, + "step": 117 + }, + { + "epoch": 0.022245263455556604, + "grad_norm": 0.6796875, + "learning_rate": 0.00059, + "loss": 2.7434, + "step": 118 + }, + { + "epoch": 0.022433782637383354, + "grad_norm": 0.66015625, + "learning_rate": 0.0005949999999999999, + "loss": 2.6735, + "step": 119 + }, + { + "epoch": 0.022622301819210103, + "grad_norm": 0.703125, + "learning_rate": 0.0006, + "loss": 2.6258, + "step": 120 + }, + { + "epoch": 0.022810821001036857, + "grad_norm": 0.75, + "learning_rate": 0.000605, + "loss": 2.7676, + "step": 121 + }, + { + "epoch": 0.022999340182863606, + "grad_norm": 0.6875, + "learning_rate": 0.00061, + "loss": 2.7045, + "step": 122 + }, + { + "epoch": 0.023187859364690356, + "grad_norm": 0.66015625, + "learning_rate": 0.000615, + "loss": 2.6322, + "step": 123 + }, + { + "epoch": 0.02337637854651711, + "grad_norm": 0.7109375, + "learning_rate": 0.00062, + "loss": 2.6953, + "step": 124 + }, + { + "epoch": 0.02356489772834386, + "grad_norm": 0.71875, + "learning_rate": 0.000625, + "loss": 2.6045, + "step": 125 + }, + { + "epoch": 0.02375341691017061, + "grad_norm": 0.71484375, + "learning_rate": 0.00063, + "loss": 2.6551, + "step": 126 + }, + { + "epoch": 0.02394193609199736, + "grad_norm": 0.69921875, + "learning_rate": 0.000635, + "loss": 2.656, + "step": 127 + }, + { + "epoch": 0.02413045527382411, + "grad_norm": 0.81640625, + "learning_rate": 0.00064, + "loss": 2.791, + "step": 128 + }, + { + "epoch": 0.02431897445565086, + "grad_norm": 0.69921875, + "learning_rate": 0.0006450000000000001, + "loss": 2.6599, + "step": 129 + }, + { + "epoch": 0.024507493637477614, + "grad_norm": 0.7421875, + "learning_rate": 0.0006500000000000001, + "loss": 2.633, + "step": 130 + }, + { + "epoch": 0.024696012819304364, + "grad_norm": 0.74609375, + "learning_rate": 0.0006550000000000001, + "loss": 2.6002, + "step": 131 + }, + { + "epoch": 0.024884532001131113, + "grad_norm": 0.68359375, + "learning_rate": 0.00066, + "loss": 2.7593, + "step": 132 + }, + { + "epoch": 0.025073051182957867, + "grad_norm": 0.69921875, + "learning_rate": 0.000665, + "loss": 2.706, + "step": 133 + }, + { + "epoch": 0.025261570364784616, + "grad_norm": 0.7109375, + "learning_rate": 0.00067, + "loss": 2.7094, + "step": 134 + }, + { + "epoch": 0.02545008954661137, + "grad_norm": 0.796875, + "learning_rate": 0.000675, + "loss": 2.6961, + "step": 135 + }, + { + "epoch": 0.02563860872843812, + "grad_norm": 0.74609375, + "learning_rate": 0.00068, + "loss": 2.7805, + "step": 136 + }, + { + "epoch": 0.02582712791026487, + "grad_norm": 0.71875, + "learning_rate": 0.0006850000000000001, + "loss": 2.6559, + "step": 137 + }, + { + "epoch": 0.026015647092091622, + "grad_norm": 0.6796875, + "learning_rate": 0.00069, + "loss": 2.7455, + "step": 138 + }, + { + "epoch": 0.02620416627391837, + "grad_norm": 0.734375, + "learning_rate": 0.000695, + "loss": 2.7533, + "step": 139 + }, + { + "epoch": 0.02639268545574512, + "grad_norm": 0.75, + "learning_rate": 0.0007, + "loss": 2.7434, + "step": 140 + }, + { + "epoch": 0.026581204637571874, + "grad_norm": 0.6953125, + "learning_rate": 0.000705, + "loss": 2.7018, + "step": 141 + }, + { + "epoch": 0.026769723819398624, + "grad_norm": 0.70703125, + "learning_rate": 0.00071, + "loss": 2.6182, + "step": 142 + }, + { + "epoch": 0.026958243001225374, + "grad_norm": 0.71875, + "learning_rate": 0.000715, + "loss": 2.5742, + "step": 143 + }, + { + "epoch": 0.027146762183052127, + "grad_norm": 0.68359375, + "learning_rate": 0.0007199999999999999, + "loss": 2.6547, + "step": 144 + }, + { + "epoch": 0.027335281364878877, + "grad_norm": 0.74609375, + "learning_rate": 0.000725, + "loss": 2.7054, + "step": 145 + }, + { + "epoch": 0.027523800546705626, + "grad_norm": 0.75390625, + "learning_rate": 0.00073, + "loss": 2.5809, + "step": 146 + }, + { + "epoch": 0.02771231972853238, + "grad_norm": 0.71484375, + "learning_rate": 0.000735, + "loss": 2.6474, + "step": 147 + }, + { + "epoch": 0.02790083891035913, + "grad_norm": 0.71484375, + "learning_rate": 0.00074, + "loss": 2.7606, + "step": 148 + }, + { + "epoch": 0.02808935809218588, + "grad_norm": 0.7734375, + "learning_rate": 0.000745, + "loss": 2.6923, + "step": 149 + }, + { + "epoch": 0.028277877274012632, + "grad_norm": 0.7421875, + "learning_rate": 0.00075, + "loss": 2.782, + "step": 150 + }, + { + "epoch": 0.02846639645583938, + "grad_norm": 0.69140625, + "learning_rate": 0.000755, + "loss": 2.7369, + "step": 151 + }, + { + "epoch": 0.02865491563766613, + "grad_norm": 0.74609375, + "learning_rate": 0.00076, + "loss": 2.6287, + "step": 152 + }, + { + "epoch": 0.028843434819492884, + "grad_norm": 0.765625, + "learning_rate": 0.0007650000000000001, + "loss": 2.6649, + "step": 153 + }, + { + "epoch": 0.029031954001319634, + "grad_norm": 0.875, + "learning_rate": 0.0007700000000000001, + "loss": 2.7421, + "step": 154 + }, + { + "epoch": 0.029220473183146384, + "grad_norm": 0.734375, + "learning_rate": 0.0007750000000000001, + "loss": 2.5988, + "step": 155 + }, + { + "epoch": 0.029408992364973137, + "grad_norm": 0.734375, + "learning_rate": 0.0007800000000000001, + "loss": 2.6876, + "step": 156 + }, + { + "epoch": 0.029597511546799887, + "grad_norm": 0.796875, + "learning_rate": 0.000785, + "loss": 2.6846, + "step": 157 + }, + { + "epoch": 0.029786030728626636, + "grad_norm": 0.7734375, + "learning_rate": 0.00079, + "loss": 2.7869, + "step": 158 + }, + { + "epoch": 0.02997454991045339, + "grad_norm": 0.68359375, + "learning_rate": 0.000795, + "loss": 2.6972, + "step": 159 + }, + { + "epoch": 0.03016306909228014, + "grad_norm": 0.73828125, + "learning_rate": 0.0008, + "loss": 2.7664, + "step": 160 + }, + { + "epoch": 0.03035158827410689, + "grad_norm": 0.69140625, + "learning_rate": 0.000805, + "loss": 2.6554, + "step": 161 + }, + { + "epoch": 0.030540107455933642, + "grad_norm": 0.69140625, + "learning_rate": 0.0008100000000000001, + "loss": 2.662, + "step": 162 + }, + { + "epoch": 0.03072862663776039, + "grad_norm": 0.734375, + "learning_rate": 0.000815, + "loss": 2.622, + "step": 163 + }, + { + "epoch": 0.030917145819587145, + "grad_norm": 0.73828125, + "learning_rate": 0.00082, + "loss": 2.6071, + "step": 164 + }, + { + "epoch": 0.031105665001413894, + "grad_norm": 0.7421875, + "learning_rate": 0.000825, + "loss": 2.6724, + "step": 165 + }, + { + "epoch": 0.03129418418324065, + "grad_norm": 0.71875, + "learning_rate": 0.00083, + "loss": 2.5888, + "step": 166 + }, + { + "epoch": 0.031482703365067394, + "grad_norm": 0.75390625, + "learning_rate": 0.000835, + "loss": 2.7932, + "step": 167 + }, + { + "epoch": 0.03167122254689415, + "grad_norm": 0.734375, + "learning_rate": 0.00084, + "loss": 2.6234, + "step": 168 + }, + { + "epoch": 0.0318597417287209, + "grad_norm": 0.73828125, + "learning_rate": 0.0008449999999999999, + "loss": 2.6725, + "step": 169 + }, + { + "epoch": 0.032048260910547646, + "grad_norm": 0.8515625, + "learning_rate": 0.00085, + "loss": 2.6502, + "step": 170 + }, + { + "epoch": 0.0322367800923744, + "grad_norm": 0.7109375, + "learning_rate": 0.000855, + "loss": 2.7151, + "step": 171 + }, + { + "epoch": 0.03242529927420115, + "grad_norm": 0.84375, + "learning_rate": 0.00086, + "loss": 2.8332, + "step": 172 + }, + { + "epoch": 0.0326138184560279, + "grad_norm": 0.72265625, + "learning_rate": 0.000865, + "loss": 2.8183, + "step": 173 + }, + { + "epoch": 0.03280233763785465, + "grad_norm": 0.73046875, + "learning_rate": 0.00087, + "loss": 2.6777, + "step": 174 + }, + { + "epoch": 0.032990856819681405, + "grad_norm": 0.7421875, + "learning_rate": 0.000875, + "loss": 2.6281, + "step": 175 + }, + { + "epoch": 0.03317937600150815, + "grad_norm": 0.734375, + "learning_rate": 0.00088, + "loss": 2.7047, + "step": 176 + }, + { + "epoch": 0.033367895183334904, + "grad_norm": 0.73046875, + "learning_rate": 0.000885, + "loss": 2.6637, + "step": 177 + }, + { + "epoch": 0.03355641436516166, + "grad_norm": 0.7734375, + "learning_rate": 0.0008900000000000001, + "loss": 2.7817, + "step": 178 + }, + { + "epoch": 0.033744933546988404, + "grad_norm": 0.7265625, + "learning_rate": 0.0008950000000000001, + "loss": 2.6216, + "step": 179 + }, + { + "epoch": 0.03393345272881516, + "grad_norm": 0.734375, + "learning_rate": 0.0009000000000000001, + "loss": 2.6608, + "step": 180 + }, + { + "epoch": 0.03412197191064191, + "grad_norm": 0.6953125, + "learning_rate": 0.0009050000000000001, + "loss": 2.712, + "step": 181 + }, + { + "epoch": 0.034310491092468656, + "grad_norm": 0.71484375, + "learning_rate": 0.00091, + "loss": 2.6812, + "step": 182 + }, + { + "epoch": 0.03449901027429541, + "grad_norm": 0.73828125, + "learning_rate": 0.000915, + "loss": 2.6181, + "step": 183 + }, + { + "epoch": 0.03468752945612216, + "grad_norm": 0.77734375, + "learning_rate": 0.00092, + "loss": 2.5939, + "step": 184 + }, + { + "epoch": 0.03487604863794891, + "grad_norm": 0.75, + "learning_rate": 0.000925, + "loss": 2.6378, + "step": 185 + }, + { + "epoch": 0.03506456781977566, + "grad_norm": 0.78515625, + "learning_rate": 0.00093, + "loss": 2.658, + "step": 186 + }, + { + "epoch": 0.035253087001602415, + "grad_norm": 0.7578125, + "learning_rate": 0.0009350000000000001, + "loss": 2.6324, + "step": 187 + }, + { + "epoch": 0.03544160618342916, + "grad_norm": 0.73046875, + "learning_rate": 0.00094, + "loss": 2.7615, + "step": 188 + }, + { + "epoch": 0.035630125365255914, + "grad_norm": 0.7421875, + "learning_rate": 0.000945, + "loss": 2.8334, + "step": 189 + }, + { + "epoch": 0.03581864454708267, + "grad_norm": 0.7421875, + "learning_rate": 0.00095, + "loss": 2.8026, + "step": 190 + }, + { + "epoch": 0.036007163728909414, + "grad_norm": 0.71484375, + "learning_rate": 0.000955, + "loss": 2.6532, + "step": 191 + }, + { + "epoch": 0.03619568291073617, + "grad_norm": 0.73046875, + "learning_rate": 0.00096, + "loss": 2.5541, + "step": 192 + }, + { + "epoch": 0.03638420209256292, + "grad_norm": 0.703125, + "learning_rate": 0.000965, + "loss": 2.6375, + "step": 193 + }, + { + "epoch": 0.036572721274389666, + "grad_norm": 0.703125, + "learning_rate": 0.0009699999999999999, + "loss": 2.5705, + "step": 194 + }, + { + "epoch": 0.03676124045621642, + "grad_norm": 0.73828125, + "learning_rate": 0.000975, + "loss": 2.6405, + "step": 195 + }, + { + "epoch": 0.03694975963804317, + "grad_norm": 0.72265625, + "learning_rate": 0.00098, + "loss": 2.7821, + "step": 196 + }, + { + "epoch": 0.03713827881986992, + "grad_norm": 0.69140625, + "learning_rate": 0.000985, + "loss": 2.6889, + "step": 197 + }, + { + "epoch": 0.03732679800169667, + "grad_norm": 0.7578125, + "learning_rate": 0.00099, + "loss": 2.6658, + "step": 198 + }, + { + "epoch": 0.037515317183523425, + "grad_norm": 0.72265625, + "learning_rate": 0.000995, + "loss": 2.6969, + "step": 199 + }, + { + "epoch": 0.03770383636535017, + "grad_norm": 0.71484375, + "learning_rate": 0.001, + "loss": 2.5479, + "step": 200 + }, + { + "epoch": 0.037892355547176924, + "grad_norm": 0.78125, + "learning_rate": 0.0009998040752351098, + "loss": 2.7177, + "step": 201 + }, + { + "epoch": 0.03808087472900368, + "grad_norm": 0.76171875, + "learning_rate": 0.0009996081504702195, + "loss": 2.7224, + "step": 202 + }, + { + "epoch": 0.038269393910830424, + "grad_norm": 0.6796875, + "learning_rate": 0.0009994122257053293, + "loss": 2.6316, + "step": 203 + }, + { + "epoch": 0.03845791309265718, + "grad_norm": 0.76171875, + "learning_rate": 0.0009992163009404388, + "loss": 2.8178, + "step": 204 + }, + { + "epoch": 0.03864643227448393, + "grad_norm": 0.73046875, + "learning_rate": 0.0009990203761755486, + "loss": 2.7619, + "step": 205 + }, + { + "epoch": 0.038834951456310676, + "grad_norm": 0.671875, + "learning_rate": 0.0009988244514106584, + "loss": 2.5739, + "step": 206 + }, + { + "epoch": 0.03902347063813743, + "grad_norm": 0.76171875, + "learning_rate": 0.0009986285266457681, + "loss": 2.7797, + "step": 207 + }, + { + "epoch": 0.03921198981996418, + "grad_norm": 0.7734375, + "learning_rate": 0.0009984326018808779, + "loss": 2.695, + "step": 208 + }, + { + "epoch": 0.039400509001790936, + "grad_norm": 0.69921875, + "learning_rate": 0.0009982366771159876, + "loss": 2.7551, + "step": 209 + }, + { + "epoch": 0.03958902818361768, + "grad_norm": 0.73828125, + "learning_rate": 0.0009980407523510972, + "loss": 2.7898, + "step": 210 + }, + { + "epoch": 0.039777547365444435, + "grad_norm": 0.765625, + "learning_rate": 0.000997844827586207, + "loss": 2.6824, + "step": 211 + }, + { + "epoch": 0.03996606654727119, + "grad_norm": 0.703125, + "learning_rate": 0.0009976489028213167, + "loss": 2.8341, + "step": 212 + }, + { + "epoch": 0.040154585729097934, + "grad_norm": 0.671875, + "learning_rate": 0.0009974529780564262, + "loss": 2.6885, + "step": 213 + }, + { + "epoch": 0.04034310491092469, + "grad_norm": 0.70703125, + "learning_rate": 0.000997257053291536, + "loss": 2.5722, + "step": 214 + }, + { + "epoch": 0.04053162409275144, + "grad_norm": 0.6875, + "learning_rate": 0.0009970611285266457, + "loss": 2.7023, + "step": 215 + }, + { + "epoch": 0.04072014327457819, + "grad_norm": 0.734375, + "learning_rate": 0.0009968652037617555, + "loss": 2.6429, + "step": 216 + }, + { + "epoch": 0.04090866245640494, + "grad_norm": 0.76171875, + "learning_rate": 0.0009966692789968653, + "loss": 2.7053, + "step": 217 + }, + { + "epoch": 0.04109718163823169, + "grad_norm": 0.73046875, + "learning_rate": 0.000996473354231975, + "loss": 2.7841, + "step": 218 + }, + { + "epoch": 0.04128570082005844, + "grad_norm": 0.6875, + "learning_rate": 0.0009962774294670846, + "loss": 2.6687, + "step": 219 + }, + { + "epoch": 0.04147422000188519, + "grad_norm": 0.71875, + "learning_rate": 0.0009960815047021943, + "loss": 2.7893, + "step": 220 + }, + { + "epoch": 0.041662739183711946, + "grad_norm": 0.703125, + "learning_rate": 0.000995885579937304, + "loss": 2.5992, + "step": 221 + }, + { + "epoch": 0.04185125836553869, + "grad_norm": 0.71484375, + "learning_rate": 0.0009956896551724138, + "loss": 2.7238, + "step": 222 + }, + { + "epoch": 0.042039777547365445, + "grad_norm": 0.71484375, + "learning_rate": 0.0009954937304075236, + "loss": 2.7477, + "step": 223 + }, + { + "epoch": 0.0422282967291922, + "grad_norm": 0.73828125, + "learning_rate": 0.0009952978056426334, + "loss": 2.6079, + "step": 224 + }, + { + "epoch": 0.042416815911018944, + "grad_norm": 0.71875, + "learning_rate": 0.000995101880877743, + "loss": 2.6389, + "step": 225 + }, + { + "epoch": 0.0426053350928457, + "grad_norm": 0.703125, + "learning_rate": 0.0009949059561128527, + "loss": 2.6014, + "step": 226 + }, + { + "epoch": 0.04279385427467245, + "grad_norm": 0.69921875, + "learning_rate": 0.0009947100313479624, + "loss": 2.6708, + "step": 227 + }, + { + "epoch": 0.0429823734564992, + "grad_norm": 0.703125, + "learning_rate": 0.0009945141065830722, + "loss": 2.7032, + "step": 228 + }, + { + "epoch": 0.04317089263832595, + "grad_norm": 0.71484375, + "learning_rate": 0.0009943181818181817, + "loss": 2.7911, + "step": 229 + }, + { + "epoch": 0.0433594118201527, + "grad_norm": 0.734375, + "learning_rate": 0.0009941222570532915, + "loss": 2.5071, + "step": 230 + }, + { + "epoch": 0.04354793100197945, + "grad_norm": 0.734375, + "learning_rate": 0.0009939263322884012, + "loss": 2.695, + "step": 231 + }, + { + "epoch": 0.0437364501838062, + "grad_norm": 0.72265625, + "learning_rate": 0.000993730407523511, + "loss": 2.5969, + "step": 232 + }, + { + "epoch": 0.043924969365632956, + "grad_norm": 0.6875, + "learning_rate": 0.0009935344827586207, + "loss": 2.6602, + "step": 233 + }, + { + "epoch": 0.0441134885474597, + "grad_norm": 0.69140625, + "learning_rate": 0.0009933385579937305, + "loss": 2.6561, + "step": 234 + }, + { + "epoch": 0.044302007729286455, + "grad_norm": 0.734375, + "learning_rate": 0.00099314263322884, + "loss": 2.6442, + "step": 235 + }, + { + "epoch": 0.04449052691111321, + "grad_norm": 0.859375, + "learning_rate": 0.0009929467084639498, + "loss": 2.7465, + "step": 236 + }, + { + "epoch": 0.044679046092939954, + "grad_norm": 0.76171875, + "learning_rate": 0.0009927507836990596, + "loss": 2.7102, + "step": 237 + }, + { + "epoch": 0.04486756527476671, + "grad_norm": 0.69140625, + "learning_rate": 0.0009925548589341693, + "loss": 2.7074, + "step": 238 + }, + { + "epoch": 0.04505608445659346, + "grad_norm": 0.8125, + "learning_rate": 0.000992358934169279, + "loss": 2.6626, + "step": 239 + }, + { + "epoch": 0.04524460363842021, + "grad_norm": 0.671875, + "learning_rate": 0.0009921630094043888, + "loss": 2.5579, + "step": 240 + }, + { + "epoch": 0.04543312282024696, + "grad_norm": 0.73828125, + "learning_rate": 0.0009919670846394984, + "loss": 2.7225, + "step": 241 + }, + { + "epoch": 0.04562164200207371, + "grad_norm": 0.703125, + "learning_rate": 0.0009917711598746081, + "loss": 2.6952, + "step": 242 + }, + { + "epoch": 0.04581016118390046, + "grad_norm": 0.76171875, + "learning_rate": 0.000991575235109718, + "loss": 2.6886, + "step": 243 + }, + { + "epoch": 0.04599868036572721, + "grad_norm": 0.68359375, + "learning_rate": 0.0009913793103448277, + "loss": 2.6096, + "step": 244 + }, + { + "epoch": 0.046187199547553966, + "grad_norm": 0.75390625, + "learning_rate": 0.0009911833855799374, + "loss": 2.7612, + "step": 245 + }, + { + "epoch": 0.04637571872938071, + "grad_norm": 0.71484375, + "learning_rate": 0.0009909874608150472, + "loss": 2.6082, + "step": 246 + }, + { + "epoch": 0.046564237911207465, + "grad_norm": 0.7734375, + "learning_rate": 0.0009907915360501567, + "loss": 2.7621, + "step": 247 + }, + { + "epoch": 0.04675275709303422, + "grad_norm": 0.6953125, + "learning_rate": 0.0009905956112852665, + "loss": 2.6764, + "step": 248 + }, + { + "epoch": 0.046941276274860964, + "grad_norm": 0.71484375, + "learning_rate": 0.0009903996865203762, + "loss": 2.6527, + "step": 249 + }, + { + "epoch": 0.04712979545668772, + "grad_norm": 0.67578125, + "learning_rate": 0.0009902037617554858, + "loss": 2.5762, + "step": 250 + }, + { + "epoch": 0.04731831463851447, + "grad_norm": 0.7421875, + "learning_rate": 0.0009900078369905955, + "loss": 2.7241, + "step": 251 + }, + { + "epoch": 0.04750683382034122, + "grad_norm": 0.69921875, + "learning_rate": 0.0009898119122257053, + "loss": 2.6935, + "step": 252 + }, + { + "epoch": 0.04769535300216797, + "grad_norm": 0.75390625, + "learning_rate": 0.000989615987460815, + "loss": 2.776, + "step": 253 + }, + { + "epoch": 0.04788387218399472, + "grad_norm": 0.7109375, + "learning_rate": 0.0009894200626959248, + "loss": 2.7799, + "step": 254 + }, + { + "epoch": 0.04807239136582147, + "grad_norm": 0.69921875, + "learning_rate": 0.0009892241379310346, + "loss": 2.7589, + "step": 255 + }, + { + "epoch": 0.04826091054764822, + "grad_norm": 0.69921875, + "learning_rate": 0.0009890282131661443, + "loss": 2.646, + "step": 256 + }, + { + "epoch": 0.048449429729474976, + "grad_norm": 0.6953125, + "learning_rate": 0.0009888322884012539, + "loss": 2.7226, + "step": 257 + }, + { + "epoch": 0.04863794891130172, + "grad_norm": 0.66796875, + "learning_rate": 0.0009886363636363636, + "loss": 2.6825, + "step": 258 + }, + { + "epoch": 0.048826468093128475, + "grad_norm": 0.671875, + "learning_rate": 0.0009884404388714734, + "loss": 2.6494, + "step": 259 + }, + { + "epoch": 0.04901498727495523, + "grad_norm": 0.703125, + "learning_rate": 0.0009882445141065831, + "loss": 2.7586, + "step": 260 + }, + { + "epoch": 0.049203506456781974, + "grad_norm": 0.73046875, + "learning_rate": 0.000988048589341693, + "loss": 2.7986, + "step": 261 + }, + { + "epoch": 0.04939202563860873, + "grad_norm": 0.68359375, + "learning_rate": 0.0009878526645768027, + "loss": 2.624, + "step": 262 + }, + { + "epoch": 0.04958054482043548, + "grad_norm": 0.66015625, + "learning_rate": 0.0009876567398119122, + "loss": 2.4967, + "step": 263 + }, + { + "epoch": 0.04976906400226223, + "grad_norm": 0.69140625, + "learning_rate": 0.000987460815047022, + "loss": 2.5694, + "step": 264 + }, + { + "epoch": 0.04995758318408898, + "grad_norm": 0.71484375, + "learning_rate": 0.0009872648902821317, + "loss": 2.7369, + "step": 265 + }, + { + "epoch": 0.05014610236591573, + "grad_norm": 0.73828125, + "learning_rate": 0.0009870689655172413, + "loss": 2.641, + "step": 266 + }, + { + "epoch": 0.05033462154774248, + "grad_norm": 0.6953125, + "learning_rate": 0.000986873040752351, + "loss": 2.5988, + "step": 267 + }, + { + "epoch": 0.05052314072956923, + "grad_norm": 0.7734375, + "learning_rate": 0.0009866771159874608, + "loss": 2.6935, + "step": 268 + }, + { + "epoch": 0.050711659911395986, + "grad_norm": 0.71875, + "learning_rate": 0.0009864811912225705, + "loss": 2.6573, + "step": 269 + }, + { + "epoch": 0.05090017909322274, + "grad_norm": 0.71875, + "learning_rate": 0.0009862852664576803, + "loss": 2.5501, + "step": 270 + }, + { + "epoch": 0.051088698275049485, + "grad_norm": 0.8515625, + "learning_rate": 0.00098608934169279, + "loss": 2.7173, + "step": 271 + }, + { + "epoch": 0.05127721745687624, + "grad_norm": 0.765625, + "learning_rate": 0.0009858934169278996, + "loss": 2.7147, + "step": 272 + }, + { + "epoch": 0.05146573663870299, + "grad_norm": 0.69921875, + "learning_rate": 0.0009856974921630094, + "loss": 2.6823, + "step": 273 + }, + { + "epoch": 0.05165425582052974, + "grad_norm": 0.71875, + "learning_rate": 0.0009855015673981191, + "loss": 2.7399, + "step": 274 + }, + { + "epoch": 0.05184277500235649, + "grad_norm": 0.7734375, + "learning_rate": 0.0009853056426332289, + "loss": 2.8052, + "step": 275 + }, + { + "epoch": 0.052031294184183244, + "grad_norm": 0.78125, + "learning_rate": 0.0009851097178683386, + "loss": 2.6471, + "step": 276 + }, + { + "epoch": 0.05221981336600999, + "grad_norm": 0.703125, + "learning_rate": 0.0009849137931034484, + "loss": 2.5997, + "step": 277 + }, + { + "epoch": 0.05240833254783674, + "grad_norm": 0.671875, + "learning_rate": 0.000984717868338558, + "loss": 2.6933, + "step": 278 + }, + { + "epoch": 0.052596851729663496, + "grad_norm": 0.71484375, + "learning_rate": 0.0009845219435736677, + "loss": 2.7849, + "step": 279 + }, + { + "epoch": 0.05278537091149024, + "grad_norm": 0.76953125, + "learning_rate": 0.0009843260188087774, + "loss": 2.7277, + "step": 280 + }, + { + "epoch": 0.052973890093316996, + "grad_norm": 0.671875, + "learning_rate": 0.0009841300940438872, + "loss": 2.7328, + "step": 281 + }, + { + "epoch": 0.05316240927514375, + "grad_norm": 0.69921875, + "learning_rate": 0.000983934169278997, + "loss": 2.8041, + "step": 282 + }, + { + "epoch": 0.053350928456970495, + "grad_norm": 0.6796875, + "learning_rate": 0.0009837382445141067, + "loss": 2.6497, + "step": 283 + }, + { + "epoch": 0.05353944763879725, + "grad_norm": 0.7265625, + "learning_rate": 0.0009835423197492165, + "loss": 2.6852, + "step": 284 + }, + { + "epoch": 0.053727966820624, + "grad_norm": 0.71484375, + "learning_rate": 0.000983346394984326, + "loss": 2.6116, + "step": 285 + }, + { + "epoch": 0.05391648600245075, + "grad_norm": 0.72265625, + "learning_rate": 0.0009831504702194358, + "loss": 2.5864, + "step": 286 + }, + { + "epoch": 0.0541050051842775, + "grad_norm": 0.68359375, + "learning_rate": 0.0009829545454545455, + "loss": 2.6291, + "step": 287 + }, + { + "epoch": 0.054293524366104254, + "grad_norm": 0.69921875, + "learning_rate": 0.000982758620689655, + "loss": 2.672, + "step": 288 + }, + { + "epoch": 0.054482043547931, + "grad_norm": 0.75390625, + "learning_rate": 0.0009825626959247648, + "loss": 2.6036, + "step": 289 + }, + { + "epoch": 0.05467056272975775, + "grad_norm": 0.6484375, + "learning_rate": 0.0009823667711598746, + "loss": 2.4802, + "step": 290 + }, + { + "epoch": 0.054859081911584506, + "grad_norm": 0.7578125, + "learning_rate": 0.0009821708463949844, + "loss": 2.721, + "step": 291 + }, + { + "epoch": 0.05504760109341125, + "grad_norm": 0.703125, + "learning_rate": 0.0009819749216300941, + "loss": 2.6039, + "step": 292 + }, + { + "epoch": 0.055236120275238006, + "grad_norm": 0.69140625, + "learning_rate": 0.0009817789968652039, + "loss": 2.7125, + "step": 293 + }, + { + "epoch": 0.05542463945706476, + "grad_norm": 0.71875, + "learning_rate": 0.0009815830721003134, + "loss": 2.7176, + "step": 294 + }, + { + "epoch": 0.055613158638891505, + "grad_norm": 0.73828125, + "learning_rate": 0.0009813871473354232, + "loss": 2.7061, + "step": 295 + }, + { + "epoch": 0.05580167782071826, + "grad_norm": 0.69140625, + "learning_rate": 0.000981191222570533, + "loss": 2.7324, + "step": 296 + }, + { + "epoch": 0.05599019700254501, + "grad_norm": 0.69140625, + "learning_rate": 0.0009809952978056427, + "loss": 2.6318, + "step": 297 + }, + { + "epoch": 0.05617871618437176, + "grad_norm": 0.6796875, + "learning_rate": 0.0009807993730407524, + "loss": 2.637, + "step": 298 + }, + { + "epoch": 0.05636723536619851, + "grad_norm": 0.6875, + "learning_rate": 0.0009806034482758622, + "loss": 2.5645, + "step": 299 + }, + { + "epoch": 0.056555754548025264, + "grad_norm": 0.6953125, + "learning_rate": 0.0009804075235109717, + "loss": 2.7011, + "step": 300 + }, + { + "epoch": 0.05674427372985201, + "grad_norm": 0.6796875, + "learning_rate": 0.0009802115987460815, + "loss": 2.7293, + "step": 301 + }, + { + "epoch": 0.05693279291167876, + "grad_norm": 0.68359375, + "learning_rate": 0.0009800156739811913, + "loss": 2.5779, + "step": 302 + }, + { + "epoch": 0.057121312093505516, + "grad_norm": 0.73046875, + "learning_rate": 0.000979819749216301, + "loss": 2.7574, + "step": 303 + }, + { + "epoch": 0.05730983127533226, + "grad_norm": 0.69140625, + "learning_rate": 0.0009796238244514106, + "loss": 2.7168, + "step": 304 + }, + { + "epoch": 0.057498350457159016, + "grad_norm": 0.6953125, + "learning_rate": 0.0009794278996865203, + "loss": 2.6531, + "step": 305 + }, + { + "epoch": 0.05768686963898577, + "grad_norm": 0.67578125, + "learning_rate": 0.00097923197492163, + "loss": 2.6852, + "step": 306 + }, + { + "epoch": 0.057875388820812515, + "grad_norm": 0.69921875, + "learning_rate": 0.0009790360501567398, + "loss": 2.8098, + "step": 307 + }, + { + "epoch": 0.05806390800263927, + "grad_norm": 0.68359375, + "learning_rate": 0.0009788401253918496, + "loss": 2.5938, + "step": 308 + }, + { + "epoch": 0.05825242718446602, + "grad_norm": 0.69921875, + "learning_rate": 0.0009786442006269591, + "loss": 2.6858, + "step": 309 + }, + { + "epoch": 0.05844094636629277, + "grad_norm": 0.7265625, + "learning_rate": 0.000978448275862069, + "loss": 2.6455, + "step": 310 + }, + { + "epoch": 0.05862946554811952, + "grad_norm": 0.72265625, + "learning_rate": 0.0009782523510971787, + "loss": 2.7194, + "step": 311 + }, + { + "epoch": 0.058817984729946274, + "grad_norm": 0.71484375, + "learning_rate": 0.0009780564263322884, + "loss": 2.5933, + "step": 312 + }, + { + "epoch": 0.05900650391177302, + "grad_norm": 0.68359375, + "learning_rate": 0.0009778605015673982, + "loss": 2.7103, + "step": 313 + }, + { + "epoch": 0.05919502309359977, + "grad_norm": 0.73046875, + "learning_rate": 0.000977664576802508, + "loss": 2.7317, + "step": 314 + }, + { + "epoch": 0.059383542275426526, + "grad_norm": 0.796875, + "learning_rate": 0.0009774686520376177, + "loss": 2.6629, + "step": 315 + }, + { + "epoch": 0.05957206145725327, + "grad_norm": 0.71484375, + "learning_rate": 0.0009772727272727272, + "loss": 2.811, + "step": 316 + }, + { + "epoch": 0.059760580639080026, + "grad_norm": 0.71875, + "learning_rate": 0.000977076802507837, + "loss": 2.679, + "step": 317 + }, + { + "epoch": 0.05994909982090678, + "grad_norm": 0.70703125, + "learning_rate": 0.0009768808777429468, + "loss": 2.7421, + "step": 318 + }, + { + "epoch": 0.060137619002733525, + "grad_norm": 0.77734375, + "learning_rate": 0.0009766849529780565, + "loss": 2.7717, + "step": 319 + }, + { + "epoch": 0.06032613818456028, + "grad_norm": 0.74609375, + "learning_rate": 0.0009764890282131662, + "loss": 2.7456, + "step": 320 + }, + { + "epoch": 0.06051465736638703, + "grad_norm": 0.73046875, + "learning_rate": 0.0009762931034482759, + "loss": 2.6342, + "step": 321 + }, + { + "epoch": 0.06070317654821378, + "grad_norm": 0.80078125, + "learning_rate": 0.0009760971786833856, + "loss": 2.7088, + "step": 322 + }, + { + "epoch": 0.06089169573004053, + "grad_norm": 0.72265625, + "learning_rate": 0.0009759012539184952, + "loss": 2.651, + "step": 323 + }, + { + "epoch": 0.061080214911867284, + "grad_norm": 0.890625, + "learning_rate": 0.000975705329153605, + "loss": 2.7472, + "step": 324 + }, + { + "epoch": 0.06126873409369403, + "grad_norm": 0.7421875, + "learning_rate": 0.0009755094043887147, + "loss": 2.8417, + "step": 325 + }, + { + "epoch": 0.06145725327552078, + "grad_norm": 0.671875, + "learning_rate": 0.0009753134796238245, + "loss": 2.7298, + "step": 326 + }, + { + "epoch": 0.061645772457347536, + "grad_norm": 0.7109375, + "learning_rate": 0.0009751175548589341, + "loss": 2.4934, + "step": 327 + }, + { + "epoch": 0.06183429163917429, + "grad_norm": 0.76953125, + "learning_rate": 0.0009749216300940439, + "loss": 2.6646, + "step": 328 + }, + { + "epoch": 0.062022810821001036, + "grad_norm": 0.75390625, + "learning_rate": 0.0009747257053291537, + "loss": 2.6718, + "step": 329 + }, + { + "epoch": 0.06221133000282779, + "grad_norm": 0.66796875, + "learning_rate": 0.0009745297805642633, + "loss": 2.5689, + "step": 330 + }, + { + "epoch": 0.06239984918465454, + "grad_norm": 0.66796875, + "learning_rate": 0.0009743338557993731, + "loss": 2.5188, + "step": 331 + }, + { + "epoch": 0.0625883683664813, + "grad_norm": 0.7734375, + "learning_rate": 0.0009741379310344828, + "loss": 2.7064, + "step": 332 + }, + { + "epoch": 0.06277688754830804, + "grad_norm": 0.73046875, + "learning_rate": 0.0009739420062695925, + "loss": 2.726, + "step": 333 + }, + { + "epoch": 0.06296540673013479, + "grad_norm": 0.69140625, + "learning_rate": 0.0009737460815047022, + "loss": 2.7389, + "step": 334 + }, + { + "epoch": 0.06315392591196155, + "grad_norm": 0.75390625, + "learning_rate": 0.000973550156739812, + "loss": 2.8134, + "step": 335 + }, + { + "epoch": 0.0633424450937883, + "grad_norm": 0.73046875, + "learning_rate": 0.0009733542319749216, + "loss": 2.7394, + "step": 336 + }, + { + "epoch": 0.06353096427561504, + "grad_norm": 0.66015625, + "learning_rate": 0.0009731583072100314, + "loss": 2.6256, + "step": 337 + }, + { + "epoch": 0.0637194834574418, + "grad_norm": 0.76171875, + "learning_rate": 0.0009729623824451412, + "loss": 2.7413, + "step": 338 + }, + { + "epoch": 0.06390800263926855, + "grad_norm": 0.7109375, + "learning_rate": 0.0009727664576802508, + "loss": 2.7725, + "step": 339 + }, + { + "epoch": 0.06409652182109529, + "grad_norm": 0.69140625, + "learning_rate": 0.0009725705329153606, + "loss": 2.8092, + "step": 340 + }, + { + "epoch": 0.06428504100292205, + "grad_norm": 0.703125, + "learning_rate": 0.0009723746081504702, + "loss": 2.7276, + "step": 341 + }, + { + "epoch": 0.0644735601847488, + "grad_norm": 0.66015625, + "learning_rate": 0.0009721786833855799, + "loss": 2.5861, + "step": 342 + }, + { + "epoch": 0.06466207936657555, + "grad_norm": 0.75390625, + "learning_rate": 0.0009719827586206896, + "loss": 2.6467, + "step": 343 + }, + { + "epoch": 0.0648505985484023, + "grad_norm": 0.70703125, + "learning_rate": 0.0009717868338557994, + "loss": 2.7404, + "step": 344 + }, + { + "epoch": 0.06503911773022905, + "grad_norm": 0.6875, + "learning_rate": 0.000971590909090909, + "loss": 2.6333, + "step": 345 + }, + { + "epoch": 0.0652276369120558, + "grad_norm": 0.7265625, + "learning_rate": 0.0009713949843260188, + "loss": 2.6079, + "step": 346 + }, + { + "epoch": 0.06541615609388256, + "grad_norm": 0.6796875, + "learning_rate": 0.0009711990595611286, + "loss": 2.5708, + "step": 347 + }, + { + "epoch": 0.0656046752757093, + "grad_norm": 0.71484375, + "learning_rate": 0.0009710031347962382, + "loss": 2.6675, + "step": 348 + }, + { + "epoch": 0.06579319445753605, + "grad_norm": 0.734375, + "learning_rate": 0.000970807210031348, + "loss": 2.7782, + "step": 349 + }, + { + "epoch": 0.06598171363936281, + "grad_norm": 0.66015625, + "learning_rate": 0.0009706112852664577, + "loss": 2.6853, + "step": 350 + }, + { + "epoch": 0.06617023282118956, + "grad_norm": 0.70703125, + "learning_rate": 0.0009704153605015674, + "loss": 2.7684, + "step": 351 + }, + { + "epoch": 0.0663587520030163, + "grad_norm": 0.68359375, + "learning_rate": 0.0009702194357366771, + "loss": 2.5759, + "step": 352 + }, + { + "epoch": 0.06654727118484306, + "grad_norm": 0.73828125, + "learning_rate": 0.0009700235109717869, + "loss": 2.7151, + "step": 353 + }, + { + "epoch": 0.06673579036666981, + "grad_norm": 0.7265625, + "learning_rate": 0.0009698275862068966, + "loss": 2.6346, + "step": 354 + }, + { + "epoch": 0.06692430954849656, + "grad_norm": 0.70703125, + "learning_rate": 0.0009696316614420063, + "loss": 2.5878, + "step": 355 + }, + { + "epoch": 0.06711282873032332, + "grad_norm": 0.71875, + "learning_rate": 0.0009694357366771161, + "loss": 2.6841, + "step": 356 + }, + { + "epoch": 0.06730134791215006, + "grad_norm": 0.72265625, + "learning_rate": 0.0009692398119122258, + "loss": 2.5688, + "step": 357 + }, + { + "epoch": 0.06748986709397681, + "grad_norm": 0.6875, + "learning_rate": 0.0009690438871473355, + "loss": 2.5057, + "step": 358 + }, + { + "epoch": 0.06767838627580357, + "grad_norm": 0.74609375, + "learning_rate": 0.0009688479623824452, + "loss": 2.6444, + "step": 359 + }, + { + "epoch": 0.06786690545763031, + "grad_norm": 0.73046875, + "learning_rate": 0.0009686520376175549, + "loss": 2.6894, + "step": 360 + }, + { + "epoch": 0.06805542463945706, + "grad_norm": 0.65234375, + "learning_rate": 0.0009684561128526645, + "loss": 2.5921, + "step": 361 + }, + { + "epoch": 0.06824394382128382, + "grad_norm": 0.76171875, + "learning_rate": 0.0009682601880877743, + "loss": 2.7547, + "step": 362 + }, + { + "epoch": 0.06843246300311057, + "grad_norm": 0.8125, + "learning_rate": 0.000968064263322884, + "loss": 2.7235, + "step": 363 + }, + { + "epoch": 0.06862098218493731, + "grad_norm": 0.69921875, + "learning_rate": 0.0009678683385579937, + "loss": 2.6726, + "step": 364 + }, + { + "epoch": 0.06880950136676407, + "grad_norm": 0.77734375, + "learning_rate": 0.0009676724137931034, + "loss": 2.7688, + "step": 365 + }, + { + "epoch": 0.06899802054859082, + "grad_norm": 0.6953125, + "learning_rate": 0.0009674764890282132, + "loss": 2.6567, + "step": 366 + }, + { + "epoch": 0.06918653973041756, + "grad_norm": 0.69921875, + "learning_rate": 0.0009672805642633229, + "loss": 2.7241, + "step": 367 + }, + { + "epoch": 0.06937505891224433, + "grad_norm": 0.703125, + "learning_rate": 0.0009670846394984326, + "loss": 2.603, + "step": 368 + }, + { + "epoch": 0.06956357809407107, + "grad_norm": 0.6796875, + "learning_rate": 0.0009668887147335424, + "loss": 2.6863, + "step": 369 + }, + { + "epoch": 0.06975209727589782, + "grad_norm": 0.6796875, + "learning_rate": 0.000966692789968652, + "loss": 2.6655, + "step": 370 + }, + { + "epoch": 0.06994061645772458, + "grad_norm": 0.66015625, + "learning_rate": 0.0009664968652037618, + "loss": 2.5301, + "step": 371 + }, + { + "epoch": 0.07012913563955132, + "grad_norm": 0.71484375, + "learning_rate": 0.0009663009404388715, + "loss": 2.7405, + "step": 372 + }, + { + "epoch": 0.07031765482137807, + "grad_norm": 0.671875, + "learning_rate": 0.0009661050156739812, + "loss": 2.7326, + "step": 373 + }, + { + "epoch": 0.07050617400320483, + "grad_norm": 0.7265625, + "learning_rate": 0.000965909090909091, + "loss": 2.6685, + "step": 374 + }, + { + "epoch": 0.07069469318503158, + "grad_norm": 0.71484375, + "learning_rate": 0.0009657131661442007, + "loss": 2.6664, + "step": 375 + }, + { + "epoch": 0.07088321236685832, + "grad_norm": 0.7109375, + "learning_rate": 0.0009655172413793104, + "loss": 2.5892, + "step": 376 + }, + { + "epoch": 0.07107173154868508, + "grad_norm": 0.7109375, + "learning_rate": 0.0009653213166144201, + "loss": 2.6351, + "step": 377 + }, + { + "epoch": 0.07126025073051183, + "grad_norm": 0.75390625, + "learning_rate": 0.0009651253918495299, + "loss": 2.6587, + "step": 378 + }, + { + "epoch": 0.07144876991233857, + "grad_norm": 0.69140625, + "learning_rate": 0.0009649294670846394, + "loss": 2.7744, + "step": 379 + }, + { + "epoch": 0.07163728909416534, + "grad_norm": 0.69921875, + "learning_rate": 0.0009647335423197492, + "loss": 2.7516, + "step": 380 + }, + { + "epoch": 0.07182580827599208, + "grad_norm": 0.7734375, + "learning_rate": 0.0009645376175548589, + "loss": 2.6607, + "step": 381 + }, + { + "epoch": 0.07201432745781883, + "grad_norm": 0.73046875, + "learning_rate": 0.0009643416927899687, + "loss": 2.7513, + "step": 382 + }, + { + "epoch": 0.07220284663964559, + "grad_norm": 0.65625, + "learning_rate": 0.0009641457680250783, + "loss": 2.607, + "step": 383 + }, + { + "epoch": 0.07239136582147233, + "grad_norm": 0.6875, + "learning_rate": 0.0009639498432601881, + "loss": 2.5463, + "step": 384 + }, + { + "epoch": 0.07257988500329908, + "grad_norm": 0.7421875, + "learning_rate": 0.0009637539184952979, + "loss": 2.6368, + "step": 385 + }, + { + "epoch": 0.07276840418512584, + "grad_norm": 0.734375, + "learning_rate": 0.0009635579937304075, + "loss": 2.5846, + "step": 386 + }, + { + "epoch": 0.07295692336695259, + "grad_norm": 0.73046875, + "learning_rate": 0.0009633620689655173, + "loss": 2.7072, + "step": 387 + }, + { + "epoch": 0.07314544254877933, + "grad_norm": 0.6953125, + "learning_rate": 0.000963166144200627, + "loss": 2.6918, + "step": 388 + }, + { + "epoch": 0.07333396173060609, + "grad_norm": 0.69140625, + "learning_rate": 0.0009629702194357367, + "loss": 2.6682, + "step": 389 + }, + { + "epoch": 0.07352248091243284, + "grad_norm": 0.6953125, + "learning_rate": 0.0009627742946708464, + "loss": 2.6512, + "step": 390 + }, + { + "epoch": 0.07371100009425958, + "grad_norm": 0.796875, + "learning_rate": 0.0009625783699059562, + "loss": 2.718, + "step": 391 + }, + { + "epoch": 0.07389951927608635, + "grad_norm": 0.79296875, + "learning_rate": 0.0009623824451410658, + "loss": 2.7208, + "step": 392 + }, + { + "epoch": 0.07408803845791309, + "grad_norm": 0.734375, + "learning_rate": 0.0009621865203761756, + "loss": 2.7411, + "step": 393 + }, + { + "epoch": 0.07427655763973984, + "grad_norm": 0.69921875, + "learning_rate": 0.0009619905956112854, + "loss": 2.6763, + "step": 394 + }, + { + "epoch": 0.0744650768215666, + "grad_norm": 0.703125, + "learning_rate": 0.000961794670846395, + "loss": 2.6919, + "step": 395 + }, + { + "epoch": 0.07465359600339334, + "grad_norm": 0.7421875, + "learning_rate": 0.0009615987460815048, + "loss": 2.767, + "step": 396 + }, + { + "epoch": 0.07484211518522009, + "grad_norm": 0.66796875, + "learning_rate": 0.0009614028213166145, + "loss": 2.6868, + "step": 397 + }, + { + "epoch": 0.07503063436704685, + "grad_norm": 0.67578125, + "learning_rate": 0.0009612068965517241, + "loss": 2.6393, + "step": 398 + }, + { + "epoch": 0.0752191535488736, + "grad_norm": 0.7265625, + "learning_rate": 0.0009610109717868338, + "loss": 2.5917, + "step": 399 + }, + { + "epoch": 0.07540767273070034, + "grad_norm": 0.70703125, + "learning_rate": 0.0009608150470219436, + "loss": 2.709, + "step": 400 + }, + { + "epoch": 0.0755961919125271, + "grad_norm": 0.70703125, + "learning_rate": 0.0009606191222570532, + "loss": 2.6591, + "step": 401 + }, + { + "epoch": 0.07578471109435385, + "grad_norm": 0.70703125, + "learning_rate": 0.000960423197492163, + "loss": 2.7638, + "step": 402 + }, + { + "epoch": 0.0759732302761806, + "grad_norm": 0.67578125, + "learning_rate": 0.0009602272727272728, + "loss": 2.58, + "step": 403 + }, + { + "epoch": 0.07616174945800736, + "grad_norm": 0.7109375, + "learning_rate": 0.0009600313479623824, + "loss": 2.5257, + "step": 404 + }, + { + "epoch": 0.0763502686398341, + "grad_norm": 0.75390625, + "learning_rate": 0.0009598354231974922, + "loss": 2.6512, + "step": 405 + }, + { + "epoch": 0.07653878782166085, + "grad_norm": 0.67578125, + "learning_rate": 0.0009596394984326019, + "loss": 2.6432, + "step": 406 + }, + { + "epoch": 0.07672730700348761, + "grad_norm": 0.65625, + "learning_rate": 0.0009594435736677116, + "loss": 2.6028, + "step": 407 + }, + { + "epoch": 0.07691582618531435, + "grad_norm": 0.87890625, + "learning_rate": 0.0009592476489028213, + "loss": 2.707, + "step": 408 + }, + { + "epoch": 0.0771043453671411, + "grad_norm": 0.75390625, + "learning_rate": 0.0009590517241379311, + "loss": 2.5831, + "step": 409 + }, + { + "epoch": 0.07729286454896786, + "grad_norm": 0.72265625, + "learning_rate": 0.0009588557993730408, + "loss": 2.7447, + "step": 410 + }, + { + "epoch": 0.0774813837307946, + "grad_norm": 0.65234375, + "learning_rate": 0.0009586598746081505, + "loss": 2.6452, + "step": 411 + }, + { + "epoch": 0.07766990291262135, + "grad_norm": 0.75, + "learning_rate": 0.0009584639498432603, + "loss": 2.7138, + "step": 412 + }, + { + "epoch": 0.07785842209444811, + "grad_norm": 0.76953125, + "learning_rate": 0.00095826802507837, + "loss": 2.5726, + "step": 413 + }, + { + "epoch": 0.07804694127627486, + "grad_norm": 0.71484375, + "learning_rate": 0.0009580721003134797, + "loss": 2.7128, + "step": 414 + }, + { + "epoch": 0.0782354604581016, + "grad_norm": 0.6640625, + "learning_rate": 0.0009578761755485894, + "loss": 2.5482, + "step": 415 + }, + { + "epoch": 0.07842397963992837, + "grad_norm": 0.7421875, + "learning_rate": 0.0009576802507836991, + "loss": 2.768, + "step": 416 + }, + { + "epoch": 0.07861249882175511, + "grad_norm": 0.7265625, + "learning_rate": 0.0009574843260188087, + "loss": 2.774, + "step": 417 + }, + { + "epoch": 0.07880101800358187, + "grad_norm": 0.67578125, + "learning_rate": 0.0009572884012539185, + "loss": 2.7388, + "step": 418 + }, + { + "epoch": 0.07898953718540862, + "grad_norm": 0.63671875, + "learning_rate": 0.0009570924764890282, + "loss": 2.5905, + "step": 419 + }, + { + "epoch": 0.07917805636723536, + "grad_norm": 0.73046875, + "learning_rate": 0.0009568965517241379, + "loss": 2.7478, + "step": 420 + }, + { + "epoch": 0.07936657554906212, + "grad_norm": 0.6953125, + "learning_rate": 0.0009567006269592476, + "loss": 2.699, + "step": 421 + }, + { + "epoch": 0.07955509473088887, + "grad_norm": 0.66796875, + "learning_rate": 0.0009565047021943574, + "loss": 2.5614, + "step": 422 + }, + { + "epoch": 0.07974361391271562, + "grad_norm": 0.69921875, + "learning_rate": 0.0009563087774294671, + "loss": 2.6036, + "step": 423 + }, + { + "epoch": 0.07993213309454238, + "grad_norm": 0.69921875, + "learning_rate": 0.0009561128526645768, + "loss": 2.6515, + "step": 424 + }, + { + "epoch": 0.08012065227636912, + "grad_norm": 0.69921875, + "learning_rate": 0.0009559169278996866, + "loss": 2.7026, + "step": 425 + }, + { + "epoch": 0.08030917145819587, + "grad_norm": 0.68359375, + "learning_rate": 0.0009557210031347962, + "loss": 2.7497, + "step": 426 + }, + { + "epoch": 0.08049769064002263, + "grad_norm": 0.7109375, + "learning_rate": 0.000955525078369906, + "loss": 2.6129, + "step": 427 + }, + { + "epoch": 0.08068620982184938, + "grad_norm": 0.70703125, + "learning_rate": 0.0009553291536050157, + "loss": 2.6113, + "step": 428 + }, + { + "epoch": 0.08087472900367612, + "grad_norm": 0.734375, + "learning_rate": 0.0009551332288401254, + "loss": 2.4547, + "step": 429 + }, + { + "epoch": 0.08106324818550288, + "grad_norm": 0.671875, + "learning_rate": 0.0009549373040752351, + "loss": 2.6197, + "step": 430 + }, + { + "epoch": 0.08125176736732963, + "grad_norm": 0.66796875, + "learning_rate": 0.0009547413793103449, + "loss": 2.684, + "step": 431 + }, + { + "epoch": 0.08144028654915637, + "grad_norm": 0.71875, + "learning_rate": 0.0009545454545454546, + "loss": 2.6874, + "step": 432 + }, + { + "epoch": 0.08162880573098313, + "grad_norm": 0.734375, + "learning_rate": 0.0009543495297805643, + "loss": 2.6019, + "step": 433 + }, + { + "epoch": 0.08181732491280988, + "grad_norm": 0.6796875, + "learning_rate": 0.0009541536050156741, + "loss": 2.6309, + "step": 434 + }, + { + "epoch": 0.08200584409463663, + "grad_norm": 0.7734375, + "learning_rate": 0.0009539576802507836, + "loss": 2.6848, + "step": 435 + }, + { + "epoch": 0.08219436327646339, + "grad_norm": 0.72265625, + "learning_rate": 0.0009537617554858934, + "loss": 2.7124, + "step": 436 + }, + { + "epoch": 0.08238288245829013, + "grad_norm": 0.66015625, + "learning_rate": 0.0009535658307210031, + "loss": 2.5744, + "step": 437 + }, + { + "epoch": 0.08257140164011688, + "grad_norm": 0.75, + "learning_rate": 0.0009533699059561129, + "loss": 2.7689, + "step": 438 + }, + { + "epoch": 0.08275992082194364, + "grad_norm": 0.7265625, + "learning_rate": 0.0009531739811912225, + "loss": 2.7709, + "step": 439 + }, + { + "epoch": 0.08294844000377039, + "grad_norm": 0.71484375, + "learning_rate": 0.0009529780564263323, + "loss": 2.5495, + "step": 440 + }, + { + "epoch": 0.08313695918559713, + "grad_norm": 0.66796875, + "learning_rate": 0.0009527821316614421, + "loss": 2.696, + "step": 441 + }, + { + "epoch": 0.08332547836742389, + "grad_norm": 0.68359375, + "learning_rate": 0.0009525862068965517, + "loss": 2.6657, + "step": 442 + }, + { + "epoch": 0.08351399754925064, + "grad_norm": 0.67578125, + "learning_rate": 0.0009523902821316615, + "loss": 2.6998, + "step": 443 + }, + { + "epoch": 0.08370251673107738, + "grad_norm": 0.6875, + "learning_rate": 0.0009521943573667712, + "loss": 2.7154, + "step": 444 + }, + { + "epoch": 0.08389103591290414, + "grad_norm": 0.68359375, + "learning_rate": 0.0009519984326018809, + "loss": 2.6478, + "step": 445 + }, + { + "epoch": 0.08407955509473089, + "grad_norm": 0.671875, + "learning_rate": 0.0009518025078369906, + "loss": 2.6899, + "step": 446 + }, + { + "epoch": 0.08426807427655764, + "grad_norm": 0.6640625, + "learning_rate": 0.0009516065830721004, + "loss": 2.7137, + "step": 447 + }, + { + "epoch": 0.0844565934583844, + "grad_norm": 0.6875, + "learning_rate": 0.00095141065830721, + "loss": 2.6207, + "step": 448 + }, + { + "epoch": 0.08464511264021114, + "grad_norm": 0.73046875, + "learning_rate": 0.0009512147335423198, + "loss": 2.7149, + "step": 449 + }, + { + "epoch": 0.08483363182203789, + "grad_norm": 0.6796875, + "learning_rate": 0.0009510188087774296, + "loss": 2.7011, + "step": 450 + }, + { + "epoch": 0.08502215100386465, + "grad_norm": 0.6484375, + "learning_rate": 0.0009508228840125392, + "loss": 2.6496, + "step": 451 + }, + { + "epoch": 0.0852106701856914, + "grad_norm": 0.70703125, + "learning_rate": 0.000950626959247649, + "loss": 2.6714, + "step": 452 + }, + { + "epoch": 0.08539918936751814, + "grad_norm": 0.6484375, + "learning_rate": 0.0009504310344827587, + "loss": 2.6271, + "step": 453 + }, + { + "epoch": 0.0855877085493449, + "grad_norm": 0.7109375, + "learning_rate": 0.0009502351097178683, + "loss": 2.6513, + "step": 454 + }, + { + "epoch": 0.08577622773117165, + "grad_norm": 0.671875, + "learning_rate": 0.000950039184952978, + "loss": 2.6638, + "step": 455 + }, + { + "epoch": 0.0859647469129984, + "grad_norm": 0.6953125, + "learning_rate": 0.0009498432601880878, + "loss": 2.7398, + "step": 456 + }, + { + "epoch": 0.08615326609482515, + "grad_norm": 0.69921875, + "learning_rate": 0.0009496473354231974, + "loss": 2.7013, + "step": 457 + }, + { + "epoch": 0.0863417852766519, + "grad_norm": 0.69140625, + "learning_rate": 0.0009494514106583072, + "loss": 2.6336, + "step": 458 + }, + { + "epoch": 0.08653030445847865, + "grad_norm": 0.65625, + "learning_rate": 0.000949255485893417, + "loss": 2.5915, + "step": 459 + }, + { + "epoch": 0.0867188236403054, + "grad_norm": 0.6796875, + "learning_rate": 0.0009490595611285266, + "loss": 2.6545, + "step": 460 + }, + { + "epoch": 0.08690734282213215, + "grad_norm": 0.73046875, + "learning_rate": 0.0009488636363636364, + "loss": 2.6792, + "step": 461 + }, + { + "epoch": 0.0870958620039589, + "grad_norm": 0.66796875, + "learning_rate": 0.0009486677115987461, + "loss": 2.6238, + "step": 462 + }, + { + "epoch": 0.08728438118578566, + "grad_norm": 0.70703125, + "learning_rate": 0.0009484717868338558, + "loss": 2.6929, + "step": 463 + }, + { + "epoch": 0.0874729003676124, + "grad_norm": 0.66796875, + "learning_rate": 0.0009482758620689655, + "loss": 2.7269, + "step": 464 + }, + { + "epoch": 0.08766141954943915, + "grad_norm": 0.6875, + "learning_rate": 0.0009480799373040753, + "loss": 2.6728, + "step": 465 + }, + { + "epoch": 0.08784993873126591, + "grad_norm": 0.87890625, + "learning_rate": 0.000947884012539185, + "loss": 2.667, + "step": 466 + }, + { + "epoch": 0.08803845791309266, + "grad_norm": 0.75390625, + "learning_rate": 0.0009476880877742947, + "loss": 2.7706, + "step": 467 + }, + { + "epoch": 0.0882269770949194, + "grad_norm": 0.65234375, + "learning_rate": 0.0009474921630094045, + "loss": 2.7464, + "step": 468 + }, + { + "epoch": 0.08841549627674616, + "grad_norm": 0.76953125, + "learning_rate": 0.0009472962382445142, + "loss": 2.6004, + "step": 469 + }, + { + "epoch": 0.08860401545857291, + "grad_norm": 0.671875, + "learning_rate": 0.0009471003134796239, + "loss": 2.6237, + "step": 470 + }, + { + "epoch": 0.08879253464039966, + "grad_norm": 0.69140625, + "learning_rate": 0.0009469043887147336, + "loss": 2.6628, + "step": 471 + }, + { + "epoch": 0.08898105382222642, + "grad_norm": 0.734375, + "learning_rate": 0.0009467084639498434, + "loss": 2.7066, + "step": 472 + }, + { + "epoch": 0.08916957300405316, + "grad_norm": 0.73046875, + "learning_rate": 0.0009465125391849529, + "loss": 2.6655, + "step": 473 + }, + { + "epoch": 0.08935809218587991, + "grad_norm": 0.7265625, + "learning_rate": 0.0009463166144200627, + "loss": 2.6333, + "step": 474 + }, + { + "epoch": 0.08954661136770667, + "grad_norm": 0.72265625, + "learning_rate": 0.0009461206896551724, + "loss": 2.5766, + "step": 475 + }, + { + "epoch": 0.08973513054953342, + "grad_norm": 0.7734375, + "learning_rate": 0.0009459247648902821, + "loss": 2.7387, + "step": 476 + }, + { + "epoch": 0.08992364973136016, + "grad_norm": 0.7890625, + "learning_rate": 0.0009457288401253918, + "loss": 2.7342, + "step": 477 + }, + { + "epoch": 0.09011216891318692, + "grad_norm": 0.75, + "learning_rate": 0.0009455329153605016, + "loss": 2.6416, + "step": 478 + }, + { + "epoch": 0.09030068809501367, + "grad_norm": 0.70703125, + "learning_rate": 0.0009453369905956113, + "loss": 2.6143, + "step": 479 + }, + { + "epoch": 0.09048920727684041, + "grad_norm": 0.73046875, + "learning_rate": 0.000945141065830721, + "loss": 2.7185, + "step": 480 + }, + { + "epoch": 0.09067772645866717, + "grad_norm": 0.8359375, + "learning_rate": 0.0009449451410658308, + "loss": 2.6152, + "step": 481 + }, + { + "epoch": 0.09086624564049392, + "grad_norm": 0.6875, + "learning_rate": 0.0009447492163009404, + "loss": 2.6592, + "step": 482 + }, + { + "epoch": 0.09105476482232067, + "grad_norm": 0.6875, + "learning_rate": 0.0009445532915360502, + "loss": 2.5181, + "step": 483 + }, + { + "epoch": 0.09124328400414743, + "grad_norm": 0.6953125, + "learning_rate": 0.0009443573667711599, + "loss": 2.6332, + "step": 484 + }, + { + "epoch": 0.09143180318597417, + "grad_norm": 0.73828125, + "learning_rate": 0.0009441614420062696, + "loss": 2.521, + "step": 485 + }, + { + "epoch": 0.09162032236780092, + "grad_norm": 0.71484375, + "learning_rate": 0.0009439655172413793, + "loss": 2.6339, + "step": 486 + }, + { + "epoch": 0.09180884154962768, + "grad_norm": 0.70703125, + "learning_rate": 0.0009437695924764891, + "loss": 2.6627, + "step": 487 + }, + { + "epoch": 0.09199736073145443, + "grad_norm": 0.703125, + "learning_rate": 0.0009435736677115988, + "loss": 2.6227, + "step": 488 + }, + { + "epoch": 0.09218587991328117, + "grad_norm": 0.734375, + "learning_rate": 0.0009433777429467085, + "loss": 2.784, + "step": 489 + }, + { + "epoch": 0.09237439909510793, + "grad_norm": 2.09375, + "learning_rate": 0.0009431818181818183, + "loss": 2.5622, + "step": 490 + }, + { + "epoch": 0.09256291827693468, + "grad_norm": 0.7265625, + "learning_rate": 0.0009429858934169278, + "loss": 2.6712, + "step": 491 + }, + { + "epoch": 0.09275143745876142, + "grad_norm": 0.71875, + "learning_rate": 0.0009427899686520376, + "loss": 2.5781, + "step": 492 + }, + { + "epoch": 0.09293995664058818, + "grad_norm": 0.7734375, + "learning_rate": 0.0009425940438871473, + "loss": 2.6193, + "step": 493 + }, + { + "epoch": 0.09312847582241493, + "grad_norm": 0.703125, + "learning_rate": 0.0009423981191222571, + "loss": 2.716, + "step": 494 + }, + { + "epoch": 0.09331699500424168, + "grad_norm": 0.73828125, + "learning_rate": 0.0009422021943573667, + "loss": 2.745, + "step": 495 + }, + { + "epoch": 0.09350551418606844, + "grad_norm": 0.67578125, + "learning_rate": 0.0009420062695924765, + "loss": 2.5251, + "step": 496 + }, + { + "epoch": 0.09369403336789518, + "grad_norm": 0.69921875, + "learning_rate": 0.0009418103448275863, + "loss": 2.7023, + "step": 497 + }, + { + "epoch": 0.09388255254972193, + "grad_norm": 0.6953125, + "learning_rate": 0.0009416144200626959, + "loss": 2.7697, + "step": 498 + }, + { + "epoch": 0.09407107173154869, + "grad_norm": 0.72265625, + "learning_rate": 0.0009414184952978057, + "loss": 2.6253, + "step": 499 + }, + { + "epoch": 0.09425959091337544, + "grad_norm": 0.640625, + "learning_rate": 0.0009412225705329154, + "loss": 2.6668, + "step": 500 + }, + { + "epoch": 0.09425959091337544, + "eval_runtime": 58.5785, + "eval_samples_per_second": 17.481, + "eval_steps_per_second": 0.546, + "step": 500 + }, + { + "epoch": 0.09425959091337544, + "eval/hellaswag_acc": 0.37572196773551086, + "eval/hellaswag_acc_norm": 0.4714200358494324, + "eval_hellaswag_elapsed_time": 195.95180106163025, + "step": 500 + }, + { + "epoch": 0.09444811009520218, + "grad_norm": 0.67578125, + "learning_rate": 0.0009410266457680251, + "loss": 2.6645, + "step": 501 + }, + { + "epoch": 0.09463662927702894, + "grad_norm": 0.77734375, + "learning_rate": 0.0009408307210031348, + "loss": 2.7233, + "step": 502 + }, + { + "epoch": 0.09482514845885569, + "grad_norm": 0.68359375, + "learning_rate": 0.0009406347962382446, + "loss": 2.6959, + "step": 503 + }, + { + "epoch": 0.09501366764068243, + "grad_norm": 0.6796875, + "learning_rate": 0.0009404388714733542, + "loss": 2.747, + "step": 504 + }, + { + "epoch": 0.0952021868225092, + "grad_norm": 0.63671875, + "learning_rate": 0.000940242946708464, + "loss": 2.521, + "step": 505 + }, + { + "epoch": 0.09539070600433594, + "grad_norm": 0.69140625, + "learning_rate": 0.0009400470219435738, + "loss": 2.7368, + "step": 506 + }, + { + "epoch": 0.09557922518616269, + "grad_norm": 0.7109375, + "learning_rate": 0.0009398510971786834, + "loss": 2.6509, + "step": 507 + }, + { + "epoch": 0.09576774436798945, + "grad_norm": 0.73046875, + "learning_rate": 0.0009396551724137932, + "loss": 2.785, + "step": 508 + }, + { + "epoch": 0.09595626354981619, + "grad_norm": 0.66796875, + "learning_rate": 0.0009394592476489029, + "loss": 2.5647, + "step": 509 + }, + { + "epoch": 0.09614478273164294, + "grad_norm": 0.70703125, + "learning_rate": 0.0009392633228840125, + "loss": 2.6087, + "step": 510 + }, + { + "epoch": 0.0963333019134697, + "grad_norm": 0.67578125, + "learning_rate": 0.0009390673981191222, + "loss": 2.6032, + "step": 511 + }, + { + "epoch": 0.09652182109529645, + "grad_norm": 0.703125, + "learning_rate": 0.000938871473354232, + "loss": 2.6934, + "step": 512 + }, + { + "epoch": 0.09671034027712319, + "grad_norm": 0.6953125, + "learning_rate": 0.0009386755485893416, + "loss": 2.7077, + "step": 513 + }, + { + "epoch": 0.09689885945894995, + "grad_norm": 0.71875, + "learning_rate": 0.0009384796238244514, + "loss": 2.7372, + "step": 514 + }, + { + "epoch": 0.0970873786407767, + "grad_norm": 0.69921875, + "learning_rate": 0.0009382836990595611, + "loss": 2.5907, + "step": 515 + }, + { + "epoch": 0.09727589782260344, + "grad_norm": 0.6484375, + "learning_rate": 0.0009380877742946708, + "loss": 2.5623, + "step": 516 + }, + { + "epoch": 0.0974644170044302, + "grad_norm": 0.703125, + "learning_rate": 0.0009378918495297806, + "loss": 2.6949, + "step": 517 + }, + { + "epoch": 0.09765293618625695, + "grad_norm": 0.78515625, + "learning_rate": 0.0009376959247648903, + "loss": 2.6505, + "step": 518 + }, + { + "epoch": 0.0978414553680837, + "grad_norm": 0.72265625, + "learning_rate": 0.0009375, + "loss": 2.6902, + "step": 519 + }, + { + "epoch": 0.09802997454991046, + "grad_norm": 0.71484375, + "learning_rate": 0.0009373040752351097, + "loss": 2.6529, + "step": 520 + }, + { + "epoch": 0.0982184937317372, + "grad_norm": 0.69921875, + "learning_rate": 0.0009371081504702195, + "loss": 2.5571, + "step": 521 + }, + { + "epoch": 0.09840701291356395, + "grad_norm": 0.71875, + "learning_rate": 0.0009369122257053292, + "loss": 2.7718, + "step": 522 + }, + { + "epoch": 0.09859553209539071, + "grad_norm": 0.734375, + "learning_rate": 0.0009367163009404389, + "loss": 2.7112, + "step": 523 + }, + { + "epoch": 0.09878405127721746, + "grad_norm": 0.6875, + "learning_rate": 0.0009365203761755486, + "loss": 2.5318, + "step": 524 + }, + { + "epoch": 0.0989725704590442, + "grad_norm": 0.66796875, + "learning_rate": 0.0009363244514106584, + "loss": 2.6242, + "step": 525 + }, + { + "epoch": 0.09916108964087096, + "grad_norm": 0.6875, + "learning_rate": 0.0009361285266457681, + "loss": 2.6603, + "step": 526 + }, + { + "epoch": 0.09934960882269771, + "grad_norm": 0.7109375, + "learning_rate": 0.0009359326018808778, + "loss": 2.7204, + "step": 527 + }, + { + "epoch": 0.09953812800452445, + "grad_norm": 0.68359375, + "learning_rate": 0.0009357366771159876, + "loss": 2.6355, + "step": 528 + }, + { + "epoch": 0.09972664718635121, + "grad_norm": 0.69140625, + "learning_rate": 0.0009355407523510971, + "loss": 2.6126, + "step": 529 + }, + { + "epoch": 0.09991516636817796, + "grad_norm": 0.6484375, + "learning_rate": 0.0009353448275862069, + "loss": 2.5042, + "step": 530 + }, + { + "epoch": 0.1001036855500047, + "grad_norm": 0.6953125, + "learning_rate": 0.0009351489028213166, + "loss": 2.639, + "step": 531 + }, + { + "epoch": 0.10029220473183147, + "grad_norm": 0.69140625, + "learning_rate": 0.0009349529780564263, + "loss": 2.6981, + "step": 532 + }, + { + "epoch": 0.10048072391365821, + "grad_norm": 0.66796875, + "learning_rate": 0.000934757053291536, + "loss": 2.6578, + "step": 533 + }, + { + "epoch": 0.10066924309548496, + "grad_norm": 0.734375, + "learning_rate": 0.0009345611285266458, + "loss": 2.7651, + "step": 534 + }, + { + "epoch": 0.10085776227731172, + "grad_norm": 0.69140625, + "learning_rate": 0.0009343652037617555, + "loss": 2.6639, + "step": 535 + }, + { + "epoch": 0.10104628145913847, + "grad_norm": 0.70703125, + "learning_rate": 0.0009341692789968652, + "loss": 2.6911, + "step": 536 + }, + { + "epoch": 0.10123480064096523, + "grad_norm": 0.69921875, + "learning_rate": 0.000933973354231975, + "loss": 2.6213, + "step": 537 + }, + { + "epoch": 0.10142331982279197, + "grad_norm": 0.7109375, + "learning_rate": 0.0009337774294670846, + "loss": 2.6084, + "step": 538 + }, + { + "epoch": 0.10161183900461872, + "grad_norm": 0.72265625, + "learning_rate": 0.0009335815047021944, + "loss": 2.6893, + "step": 539 + }, + { + "epoch": 0.10180035818644548, + "grad_norm": 0.65625, + "learning_rate": 0.0009333855799373041, + "loss": 2.547, + "step": 540 + }, + { + "epoch": 0.10198887736827222, + "grad_norm": 0.72265625, + "learning_rate": 0.0009331896551724138, + "loss": 2.7084, + "step": 541 + }, + { + "epoch": 0.10217739655009897, + "grad_norm": 0.65625, + "learning_rate": 0.0009329937304075235, + "loss": 2.6611, + "step": 542 + }, + { + "epoch": 0.10236591573192573, + "grad_norm": 0.703125, + "learning_rate": 0.0009327978056426333, + "loss": 2.658, + "step": 543 + }, + { + "epoch": 0.10255443491375248, + "grad_norm": 0.70703125, + "learning_rate": 0.000932601880877743, + "loss": 2.7423, + "step": 544 + }, + { + "epoch": 0.10274295409557922, + "grad_norm": 0.63671875, + "learning_rate": 0.0009324059561128527, + "loss": 2.6237, + "step": 545 + }, + { + "epoch": 0.10293147327740598, + "grad_norm": 0.671875, + "learning_rate": 0.0009322100313479625, + "loss": 2.6846, + "step": 546 + }, + { + "epoch": 0.10311999245923273, + "grad_norm": 0.66015625, + "learning_rate": 0.0009320141065830722, + "loss": 2.5963, + "step": 547 + }, + { + "epoch": 0.10330851164105948, + "grad_norm": 0.7265625, + "learning_rate": 0.0009318181818181818, + "loss": 2.6334, + "step": 548 + }, + { + "epoch": 0.10349703082288624, + "grad_norm": 0.69921875, + "learning_rate": 0.0009316222570532915, + "loss": 2.6657, + "step": 549 + }, + { + "epoch": 0.10368555000471298, + "grad_norm": 0.671875, + "learning_rate": 0.0009314263322884013, + "loss": 2.7307, + "step": 550 + }, + { + "epoch": 0.10387406918653973, + "grad_norm": 0.67578125, + "learning_rate": 0.0009312304075235109, + "loss": 2.7773, + "step": 551 + }, + { + "epoch": 0.10406258836836649, + "grad_norm": 0.7109375, + "learning_rate": 0.0009310344827586207, + "loss": 2.7054, + "step": 552 + }, + { + "epoch": 0.10425110755019323, + "grad_norm": 0.68359375, + "learning_rate": 0.0009308385579937305, + "loss": 2.6325, + "step": 553 + }, + { + "epoch": 0.10443962673201998, + "grad_norm": 0.65625, + "learning_rate": 0.0009306426332288401, + "loss": 2.5011, + "step": 554 + }, + { + "epoch": 0.10462814591384674, + "grad_norm": 0.67578125, + "learning_rate": 0.0009304467084639499, + "loss": 2.6252, + "step": 555 + }, + { + "epoch": 0.10481666509567349, + "grad_norm": 0.7109375, + "learning_rate": 0.0009302507836990596, + "loss": 2.6176, + "step": 556 + }, + { + "epoch": 0.10500518427750023, + "grad_norm": 0.69921875, + "learning_rate": 0.0009300548589341693, + "loss": 2.699, + "step": 557 + }, + { + "epoch": 0.10519370345932699, + "grad_norm": 0.71875, + "learning_rate": 0.000929858934169279, + "loss": 2.5058, + "step": 558 + }, + { + "epoch": 0.10538222264115374, + "grad_norm": 0.72265625, + "learning_rate": 0.0009296630094043888, + "loss": 2.6034, + "step": 559 + }, + { + "epoch": 0.10557074182298049, + "grad_norm": 0.640625, + "learning_rate": 0.0009294670846394984, + "loss": 2.5785, + "step": 560 + }, + { + "epoch": 0.10575926100480725, + "grad_norm": 0.7578125, + "learning_rate": 0.0009292711598746082, + "loss": 2.7846, + "step": 561 + }, + { + "epoch": 0.10594778018663399, + "grad_norm": 0.86328125, + "learning_rate": 0.000929075235109718, + "loss": 2.7049, + "step": 562 + }, + { + "epoch": 0.10613629936846074, + "grad_norm": 0.703125, + "learning_rate": 0.0009288793103448276, + "loss": 2.72, + "step": 563 + }, + { + "epoch": 0.1063248185502875, + "grad_norm": 0.72265625, + "learning_rate": 0.0009286833855799374, + "loss": 2.7468, + "step": 564 + }, + { + "epoch": 0.10651333773211424, + "grad_norm": 0.703125, + "learning_rate": 0.0009284874608150471, + "loss": 2.706, + "step": 565 + }, + { + "epoch": 0.10670185691394099, + "grad_norm": 0.71484375, + "learning_rate": 0.0009282915360501567, + "loss": 2.6664, + "step": 566 + }, + { + "epoch": 0.10689037609576775, + "grad_norm": 0.734375, + "learning_rate": 0.0009280956112852664, + "loss": 2.6685, + "step": 567 + }, + { + "epoch": 0.1070788952775945, + "grad_norm": 0.76953125, + "learning_rate": 0.0009278996865203762, + "loss": 2.6295, + "step": 568 + }, + { + "epoch": 0.10726741445942124, + "grad_norm": 0.75, + "learning_rate": 0.0009277037617554858, + "loss": 2.7556, + "step": 569 + }, + { + "epoch": 0.107455933641248, + "grad_norm": 0.6875, + "learning_rate": 0.0009275078369905956, + "loss": 2.6027, + "step": 570 + }, + { + "epoch": 0.10764445282307475, + "grad_norm": 0.73828125, + "learning_rate": 0.0009273119122257053, + "loss": 2.622, + "step": 571 + }, + { + "epoch": 0.1078329720049015, + "grad_norm": 0.7421875, + "learning_rate": 0.000927115987460815, + "loss": 2.6136, + "step": 572 + }, + { + "epoch": 0.10802149118672826, + "grad_norm": 0.7421875, + "learning_rate": 0.0009269200626959248, + "loss": 2.6196, + "step": 573 + }, + { + "epoch": 0.108210010368555, + "grad_norm": 0.65234375, + "learning_rate": 0.0009267241379310345, + "loss": 2.6569, + "step": 574 + }, + { + "epoch": 0.10839852955038175, + "grad_norm": 0.69921875, + "learning_rate": 0.0009265282131661443, + "loss": 2.7018, + "step": 575 + }, + { + "epoch": 0.10858704873220851, + "grad_norm": 0.70703125, + "learning_rate": 0.0009263322884012539, + "loss": 2.5521, + "step": 576 + }, + { + "epoch": 0.10877556791403525, + "grad_norm": 0.67578125, + "learning_rate": 0.0009261363636363637, + "loss": 2.6091, + "step": 577 + }, + { + "epoch": 0.108964087095862, + "grad_norm": 0.6796875, + "learning_rate": 0.0009259404388714734, + "loss": 2.5969, + "step": 578 + }, + { + "epoch": 0.10915260627768876, + "grad_norm": 0.70703125, + "learning_rate": 0.0009257445141065831, + "loss": 2.6032, + "step": 579 + }, + { + "epoch": 0.1093411254595155, + "grad_norm": 0.671875, + "learning_rate": 0.0009255485893416928, + "loss": 2.6755, + "step": 580 + }, + { + "epoch": 0.10952964464134225, + "grad_norm": 0.67578125, + "learning_rate": 0.0009253526645768026, + "loss": 2.6504, + "step": 581 + }, + { + "epoch": 0.10971816382316901, + "grad_norm": 0.7109375, + "learning_rate": 0.0009251567398119123, + "loss": 2.7218, + "step": 582 + }, + { + "epoch": 0.10990668300499576, + "grad_norm": 0.66796875, + "learning_rate": 0.000924960815047022, + "loss": 2.6733, + "step": 583 + }, + { + "epoch": 0.1100952021868225, + "grad_norm": 0.703125, + "learning_rate": 0.0009247648902821318, + "loss": 2.7395, + "step": 584 + }, + { + "epoch": 0.11028372136864927, + "grad_norm": 0.6953125, + "learning_rate": 0.0009245689655172413, + "loss": 2.7391, + "step": 585 + }, + { + "epoch": 0.11047224055047601, + "grad_norm": 0.66015625, + "learning_rate": 0.0009243730407523511, + "loss": 2.5436, + "step": 586 + }, + { + "epoch": 0.11066075973230276, + "grad_norm": 0.6796875, + "learning_rate": 0.0009241771159874608, + "loss": 2.6671, + "step": 587 + }, + { + "epoch": 0.11084927891412952, + "grad_norm": 0.71484375, + "learning_rate": 0.0009239811912225705, + "loss": 2.7918, + "step": 588 + }, + { + "epoch": 0.11103779809595626, + "grad_norm": 0.65625, + "learning_rate": 0.0009237852664576802, + "loss": 2.6591, + "step": 589 + }, + { + "epoch": 0.11122631727778301, + "grad_norm": 0.6640625, + "learning_rate": 0.00092358934169279, + "loss": 2.5917, + "step": 590 + }, + { + "epoch": 0.11141483645960977, + "grad_norm": 0.69921875, + "learning_rate": 0.0009233934169278996, + "loss": 2.5555, + "step": 591 + }, + { + "epoch": 0.11160335564143652, + "grad_norm": 0.7421875, + "learning_rate": 0.0009231974921630094, + "loss": 2.7138, + "step": 592 + }, + { + "epoch": 0.11179187482326326, + "grad_norm": 0.7109375, + "learning_rate": 0.0009230015673981192, + "loss": 2.7638, + "step": 593 + }, + { + "epoch": 0.11198039400509002, + "grad_norm": 0.65234375, + "learning_rate": 0.0009228056426332288, + "loss": 2.6662, + "step": 594 + }, + { + "epoch": 0.11216891318691677, + "grad_norm": 0.65625, + "learning_rate": 0.0009226097178683386, + "loss": 2.7772, + "step": 595 + }, + { + "epoch": 0.11235743236874352, + "grad_norm": 0.78125, + "learning_rate": 0.0009224137931034483, + "loss": 2.7929, + "step": 596 + }, + { + "epoch": 0.11254595155057028, + "grad_norm": 0.75, + "learning_rate": 0.000922217868338558, + "loss": 2.7971, + "step": 597 + }, + { + "epoch": 0.11273447073239702, + "grad_norm": 0.6953125, + "learning_rate": 0.0009220219435736677, + "loss": 2.7352, + "step": 598 + }, + { + "epoch": 0.11292298991422377, + "grad_norm": 0.71875, + "learning_rate": 0.0009218260188087775, + "loss": 2.6516, + "step": 599 + }, + { + "epoch": 0.11311150909605053, + "grad_norm": 0.65625, + "learning_rate": 0.0009216300940438871, + "loss": 2.7391, + "step": 600 + }, + { + "epoch": 0.11330002827787727, + "grad_norm": 0.65234375, + "learning_rate": 0.0009214341692789969, + "loss": 2.6186, + "step": 601 + }, + { + "epoch": 0.11348854745970402, + "grad_norm": 0.66796875, + "learning_rate": 0.0009212382445141067, + "loss": 2.7238, + "step": 602 + }, + { + "epoch": 0.11367706664153078, + "grad_norm": 0.64453125, + "learning_rate": 0.0009210423197492164, + "loss": 2.5381, + "step": 603 + }, + { + "epoch": 0.11386558582335753, + "grad_norm": 0.83203125, + "learning_rate": 0.000920846394984326, + "loss": 2.6816, + "step": 604 + }, + { + "epoch": 0.11405410500518427, + "grad_norm": 0.6640625, + "learning_rate": 0.0009206504702194357, + "loss": 2.5738, + "step": 605 + }, + { + "epoch": 0.11424262418701103, + "grad_norm": 0.671875, + "learning_rate": 0.0009204545454545455, + "loss": 2.6701, + "step": 606 + }, + { + "epoch": 0.11443114336883778, + "grad_norm": 0.65234375, + "learning_rate": 0.0009202586206896551, + "loss": 2.668, + "step": 607 + }, + { + "epoch": 0.11461966255066453, + "grad_norm": 0.7734375, + "learning_rate": 0.0009200626959247649, + "loss": 2.6853, + "step": 608 + }, + { + "epoch": 0.11480818173249129, + "grad_norm": 0.71875, + "learning_rate": 0.0009198667711598747, + "loss": 2.6926, + "step": 609 + }, + { + "epoch": 0.11499670091431803, + "grad_norm": 0.6796875, + "learning_rate": 0.0009196708463949843, + "loss": 2.7387, + "step": 610 + }, + { + "epoch": 0.11518522009614478, + "grad_norm": 0.64453125, + "learning_rate": 0.0009194749216300941, + "loss": 2.613, + "step": 611 + }, + { + "epoch": 0.11537373927797154, + "grad_norm": 0.69921875, + "learning_rate": 0.0009192789968652038, + "loss": 2.5144, + "step": 612 + }, + { + "epoch": 0.11556225845979828, + "grad_norm": 0.765625, + "learning_rate": 0.0009190830721003135, + "loss": 2.7777, + "step": 613 + }, + { + "epoch": 0.11575077764162503, + "grad_norm": 0.7265625, + "learning_rate": 0.0009188871473354232, + "loss": 2.6103, + "step": 614 + }, + { + "epoch": 0.11593929682345179, + "grad_norm": 0.65625, + "learning_rate": 0.000918691222570533, + "loss": 2.6806, + "step": 615 + }, + { + "epoch": 0.11612781600527854, + "grad_norm": 0.73046875, + "learning_rate": 0.0009184952978056426, + "loss": 2.6122, + "step": 616 + }, + { + "epoch": 0.11631633518710528, + "grad_norm": 0.671875, + "learning_rate": 0.0009182993730407524, + "loss": 2.6198, + "step": 617 + }, + { + "epoch": 0.11650485436893204, + "grad_norm": 0.703125, + "learning_rate": 0.0009181034482758622, + "loss": 2.5324, + "step": 618 + }, + { + "epoch": 0.11669337355075879, + "grad_norm": 0.6328125, + "learning_rate": 0.0009179075235109718, + "loss": 2.6424, + "step": 619 + }, + { + "epoch": 0.11688189273258554, + "grad_norm": 0.67578125, + "learning_rate": 0.0009177115987460816, + "loss": 2.6544, + "step": 620 + }, + { + "epoch": 0.1170704119144123, + "grad_norm": 0.67578125, + "learning_rate": 0.0009175156739811913, + "loss": 2.6725, + "step": 621 + }, + { + "epoch": 0.11725893109623904, + "grad_norm": 0.6640625, + "learning_rate": 0.000917319749216301, + "loss": 2.6113, + "step": 622 + }, + { + "epoch": 0.11744745027806579, + "grad_norm": 0.7109375, + "learning_rate": 0.0009171238244514106, + "loss": 2.6232, + "step": 623 + }, + { + "epoch": 0.11763596945989255, + "grad_norm": 0.68359375, + "learning_rate": 0.0009169278996865204, + "loss": 2.5457, + "step": 624 + }, + { + "epoch": 0.1178244886417193, + "grad_norm": 0.65234375, + "learning_rate": 0.00091673197492163, + "loss": 2.5777, + "step": 625 + }, + { + "epoch": 0.11801300782354604, + "grad_norm": 0.6640625, + "learning_rate": 0.0009165360501567398, + "loss": 2.6668, + "step": 626 + }, + { + "epoch": 0.1182015270053728, + "grad_norm": 0.65234375, + "learning_rate": 0.0009163401253918495, + "loss": 2.7571, + "step": 627 + }, + { + "epoch": 0.11839004618719955, + "grad_norm": 0.671875, + "learning_rate": 0.0009161442006269592, + "loss": 2.6457, + "step": 628 + }, + { + "epoch": 0.11857856536902629, + "grad_norm": 0.66796875, + "learning_rate": 0.000915948275862069, + "loss": 2.5717, + "step": 629 + }, + { + "epoch": 0.11876708455085305, + "grad_norm": 0.6640625, + "learning_rate": 0.0009157523510971787, + "loss": 2.6518, + "step": 630 + }, + { + "epoch": 0.1189556037326798, + "grad_norm": 0.69921875, + "learning_rate": 0.0009155564263322885, + "loss": 2.6215, + "step": 631 + }, + { + "epoch": 0.11914412291450655, + "grad_norm": 0.66015625, + "learning_rate": 0.0009153605015673981, + "loss": 2.6829, + "step": 632 + }, + { + "epoch": 0.1193326420963333, + "grad_norm": 0.75, + "learning_rate": 0.0009151645768025079, + "loss": 2.6905, + "step": 633 + }, + { + "epoch": 0.11952116127816005, + "grad_norm": 0.71875, + "learning_rate": 0.0009149686520376176, + "loss": 2.5993, + "step": 634 + }, + { + "epoch": 0.1197096804599868, + "grad_norm": 0.6796875, + "learning_rate": 0.0009147727272727273, + "loss": 2.6115, + "step": 635 + }, + { + "epoch": 0.11989819964181356, + "grad_norm": 0.66796875, + "learning_rate": 0.000914576802507837, + "loss": 2.7553, + "step": 636 + }, + { + "epoch": 0.1200867188236403, + "grad_norm": 0.6640625, + "learning_rate": 0.0009143808777429468, + "loss": 2.685, + "step": 637 + }, + { + "epoch": 0.12027523800546705, + "grad_norm": 0.66796875, + "learning_rate": 0.0009141849529780565, + "loss": 2.6026, + "step": 638 + }, + { + "epoch": 0.12046375718729381, + "grad_norm": 0.73046875, + "learning_rate": 0.0009139890282131662, + "loss": 2.7128, + "step": 639 + }, + { + "epoch": 0.12065227636912056, + "grad_norm": 0.6796875, + "learning_rate": 0.000913793103448276, + "loss": 2.5892, + "step": 640 + }, + { + "epoch": 0.1208407955509473, + "grad_norm": 0.6953125, + "learning_rate": 0.0009135971786833855, + "loss": 2.6472, + "step": 641 + }, + { + "epoch": 0.12102931473277406, + "grad_norm": 0.6875, + "learning_rate": 0.0009134012539184953, + "loss": 2.6738, + "step": 642 + }, + { + "epoch": 0.12121783391460081, + "grad_norm": 0.65234375, + "learning_rate": 0.000913205329153605, + "loss": 2.6913, + "step": 643 + }, + { + "epoch": 0.12140635309642756, + "grad_norm": 0.67578125, + "learning_rate": 0.0009130094043887147, + "loss": 2.6355, + "step": 644 + }, + { + "epoch": 0.12159487227825432, + "grad_norm": 0.65234375, + "learning_rate": 0.0009128134796238244, + "loss": 2.5979, + "step": 645 + }, + { + "epoch": 0.12178339146008106, + "grad_norm": 0.6875, + "learning_rate": 0.0009126175548589342, + "loss": 2.7308, + "step": 646 + }, + { + "epoch": 0.12197191064190781, + "grad_norm": 0.69140625, + "learning_rate": 0.0009124216300940438, + "loss": 2.6809, + "step": 647 + }, + { + "epoch": 0.12216042982373457, + "grad_norm": 0.67578125, + "learning_rate": 0.0009122257053291536, + "loss": 2.7713, + "step": 648 + }, + { + "epoch": 0.12234894900556131, + "grad_norm": 0.63671875, + "learning_rate": 0.0009120297805642634, + "loss": 2.4518, + "step": 649 + }, + { + "epoch": 0.12253746818738806, + "grad_norm": 0.66015625, + "learning_rate": 0.000911833855799373, + "loss": 2.5884, + "step": 650 + }, + { + "epoch": 0.12272598736921482, + "grad_norm": 0.6875, + "learning_rate": 0.0009116379310344828, + "loss": 2.6641, + "step": 651 + }, + { + "epoch": 0.12291450655104157, + "grad_norm": 0.69140625, + "learning_rate": 0.0009114420062695925, + "loss": 2.5444, + "step": 652 + }, + { + "epoch": 0.12310302573286831, + "grad_norm": 0.69140625, + "learning_rate": 0.0009112460815047022, + "loss": 2.6773, + "step": 653 + }, + { + "epoch": 0.12329154491469507, + "grad_norm": 0.6796875, + "learning_rate": 0.0009110501567398119, + "loss": 2.6689, + "step": 654 + }, + { + "epoch": 0.12348006409652182, + "grad_norm": 0.6953125, + "learning_rate": 0.0009108542319749217, + "loss": 2.8371, + "step": 655 + }, + { + "epoch": 0.12366858327834858, + "grad_norm": 0.671875, + "learning_rate": 0.0009106583072100313, + "loss": 2.6491, + "step": 656 + }, + { + "epoch": 0.12385710246017533, + "grad_norm": 0.70703125, + "learning_rate": 0.0009104623824451411, + "loss": 2.7192, + "step": 657 + }, + { + "epoch": 0.12404562164200207, + "grad_norm": 0.69140625, + "learning_rate": 0.0009102664576802509, + "loss": 2.7254, + "step": 658 + }, + { + "epoch": 0.12423414082382883, + "grad_norm": 0.65625, + "learning_rate": 0.0009100705329153606, + "loss": 2.6403, + "step": 659 + }, + { + "epoch": 0.12442266000565558, + "grad_norm": 0.67578125, + "learning_rate": 0.0009098746081504702, + "loss": 2.6217, + "step": 660 + }, + { + "epoch": 0.12461117918748232, + "grad_norm": 0.7265625, + "learning_rate": 0.0009096786833855799, + "loss": 2.7402, + "step": 661 + }, + { + "epoch": 0.12479969836930908, + "grad_norm": 0.671875, + "learning_rate": 0.0009094827586206897, + "loss": 2.7237, + "step": 662 + }, + { + "epoch": 0.12498821755113583, + "grad_norm": 0.7109375, + "learning_rate": 0.0009092868338557993, + "loss": 2.5321, + "step": 663 + }, + { + "epoch": 0.1251767367329626, + "grad_norm": 0.69140625, + "learning_rate": 0.0009090909090909091, + "loss": 2.4766, + "step": 664 + }, + { + "epoch": 0.12536525591478934, + "grad_norm": 0.625, + "learning_rate": 0.0009088949843260188, + "loss": 2.5655, + "step": 665 + }, + { + "epoch": 0.12555377509661608, + "grad_norm": 0.6796875, + "learning_rate": 0.0009086990595611285, + "loss": 2.4495, + "step": 666 + }, + { + "epoch": 0.12574229427844283, + "grad_norm": 0.67578125, + "learning_rate": 0.0009085031347962383, + "loss": 2.7035, + "step": 667 + }, + { + "epoch": 0.12593081346026958, + "grad_norm": 0.671875, + "learning_rate": 0.000908307210031348, + "loss": 2.5528, + "step": 668 + }, + { + "epoch": 0.12611933264209632, + "grad_norm": 0.66015625, + "learning_rate": 0.0009081112852664577, + "loss": 2.5787, + "step": 669 + }, + { + "epoch": 0.1263078518239231, + "grad_norm": 0.6640625, + "learning_rate": 0.0009079153605015674, + "loss": 2.6167, + "step": 670 + }, + { + "epoch": 0.12649637100574984, + "grad_norm": 0.66015625, + "learning_rate": 0.0009077194357366772, + "loss": 2.7147, + "step": 671 + }, + { + "epoch": 0.1266848901875766, + "grad_norm": 0.7109375, + "learning_rate": 0.0009075235109717868, + "loss": 2.7819, + "step": 672 + }, + { + "epoch": 0.12687340936940333, + "grad_norm": 0.69921875, + "learning_rate": 0.0009073275862068966, + "loss": 2.5718, + "step": 673 + }, + { + "epoch": 0.12706192855123008, + "grad_norm": 0.70703125, + "learning_rate": 0.0009071316614420063, + "loss": 2.6887, + "step": 674 + }, + { + "epoch": 0.12725044773305683, + "grad_norm": 0.69921875, + "learning_rate": 0.000906935736677116, + "loss": 2.6037, + "step": 675 + }, + { + "epoch": 0.1274389669148836, + "grad_norm": 0.65234375, + "learning_rate": 0.0009067398119122258, + "loss": 2.6148, + "step": 676 + }, + { + "epoch": 0.12762748609671035, + "grad_norm": 0.796875, + "learning_rate": 0.0009065438871473355, + "loss": 2.6737, + "step": 677 + }, + { + "epoch": 0.1278160052785371, + "grad_norm": 0.75390625, + "learning_rate": 0.0009063479623824452, + "loss": 2.7679, + "step": 678 + }, + { + "epoch": 0.12800452446036384, + "grad_norm": 0.703125, + "learning_rate": 0.0009061520376175548, + "loss": 2.6948, + "step": 679 + }, + { + "epoch": 0.12819304364219059, + "grad_norm": 0.671875, + "learning_rate": 0.0009059561128526646, + "loss": 2.6185, + "step": 680 + }, + { + "epoch": 0.12838156282401733, + "grad_norm": 0.7578125, + "learning_rate": 0.0009057601880877742, + "loss": 2.7351, + "step": 681 + }, + { + "epoch": 0.1285700820058441, + "grad_norm": 0.6796875, + "learning_rate": 0.000905564263322884, + "loss": 2.6394, + "step": 682 + }, + { + "epoch": 0.12875860118767085, + "grad_norm": 0.6875, + "learning_rate": 0.0009053683385579937, + "loss": 2.7473, + "step": 683 + }, + { + "epoch": 0.1289471203694976, + "grad_norm": 0.68359375, + "learning_rate": 0.0009051724137931034, + "loss": 2.5965, + "step": 684 + }, + { + "epoch": 0.12913563955132434, + "grad_norm": 0.671875, + "learning_rate": 0.0009049764890282132, + "loss": 2.8092, + "step": 685 + }, + { + "epoch": 0.1293241587331511, + "grad_norm": 0.72265625, + "learning_rate": 0.0009047805642633229, + "loss": 2.58, + "step": 686 + }, + { + "epoch": 0.12951267791497784, + "grad_norm": 0.69921875, + "learning_rate": 0.0009045846394984327, + "loss": 2.6549, + "step": 687 + }, + { + "epoch": 0.1297011970968046, + "grad_norm": 0.66796875, + "learning_rate": 0.0009043887147335423, + "loss": 2.6374, + "step": 688 + }, + { + "epoch": 0.12988971627863136, + "grad_norm": 0.6875, + "learning_rate": 0.0009041927899686521, + "loss": 2.5968, + "step": 689 + }, + { + "epoch": 0.1300782354604581, + "grad_norm": 0.66015625, + "learning_rate": 0.0009039968652037618, + "loss": 2.6792, + "step": 690 + }, + { + "epoch": 0.13026675464228485, + "grad_norm": 0.72265625, + "learning_rate": 0.0009038009404388715, + "loss": 2.6631, + "step": 691 + }, + { + "epoch": 0.1304552738241116, + "grad_norm": 0.7578125, + "learning_rate": 0.0009036050156739812, + "loss": 2.7008, + "step": 692 + }, + { + "epoch": 0.13064379300593834, + "grad_norm": 0.6640625, + "learning_rate": 0.000903409090909091, + "loss": 2.5822, + "step": 693 + }, + { + "epoch": 0.13083231218776512, + "grad_norm": 0.65625, + "learning_rate": 0.0009032131661442007, + "loss": 2.5259, + "step": 694 + }, + { + "epoch": 0.13102083136959186, + "grad_norm": 0.6796875, + "learning_rate": 0.0009030172413793104, + "loss": 2.5717, + "step": 695 + }, + { + "epoch": 0.1312093505514186, + "grad_norm": 0.71875, + "learning_rate": 0.0009028213166144202, + "loss": 2.7658, + "step": 696 + }, + { + "epoch": 0.13139786973324535, + "grad_norm": 0.76953125, + "learning_rate": 0.0009026253918495298, + "loss": 2.6304, + "step": 697 + }, + { + "epoch": 0.1315863889150721, + "grad_norm": 0.6875, + "learning_rate": 0.0009024294670846395, + "loss": 2.6773, + "step": 698 + }, + { + "epoch": 0.13177490809689885, + "grad_norm": 0.75390625, + "learning_rate": 0.0009022335423197492, + "loss": 2.6761, + "step": 699 + }, + { + "epoch": 0.13196342727872562, + "grad_norm": 0.8984375, + "learning_rate": 0.0009020376175548589, + "loss": 2.6741, + "step": 700 + }, + { + "epoch": 0.13215194646055237, + "grad_norm": 0.67578125, + "learning_rate": 0.0009018416927899686, + "loss": 2.5493, + "step": 701 + }, + { + "epoch": 0.1323404656423791, + "grad_norm": 0.66015625, + "learning_rate": 0.0009016457680250784, + "loss": 2.6143, + "step": 702 + }, + { + "epoch": 0.13252898482420586, + "grad_norm": 0.76953125, + "learning_rate": 0.000901449843260188, + "loss": 2.6671, + "step": 703 + }, + { + "epoch": 0.1327175040060326, + "grad_norm": 0.69921875, + "learning_rate": 0.0009012539184952978, + "loss": 2.6108, + "step": 704 + }, + { + "epoch": 0.13290602318785935, + "grad_norm": 0.6953125, + "learning_rate": 0.0009010579937304076, + "loss": 2.6649, + "step": 705 + }, + { + "epoch": 0.13309454236968613, + "grad_norm": 0.66796875, + "learning_rate": 0.0009008620689655172, + "loss": 2.6925, + "step": 706 + }, + { + "epoch": 0.13328306155151287, + "grad_norm": 0.703125, + "learning_rate": 0.000900666144200627, + "loss": 2.6506, + "step": 707 + }, + { + "epoch": 0.13347158073333962, + "grad_norm": 0.8046875, + "learning_rate": 0.0009004702194357367, + "loss": 2.7378, + "step": 708 + }, + { + "epoch": 0.13366009991516636, + "grad_norm": 0.6953125, + "learning_rate": 0.0009002742946708464, + "loss": 2.6705, + "step": 709 + }, + { + "epoch": 0.1338486190969931, + "grad_norm": 0.71875, + "learning_rate": 0.0009000783699059561, + "loss": 2.7019, + "step": 710 + }, + { + "epoch": 0.13403713827881986, + "grad_norm": 0.65625, + "learning_rate": 0.0008998824451410659, + "loss": 2.6219, + "step": 711 + }, + { + "epoch": 0.13422565746064663, + "grad_norm": 0.7734375, + "learning_rate": 0.0008996865203761755, + "loss": 2.5641, + "step": 712 + }, + { + "epoch": 0.13441417664247338, + "grad_norm": 0.76171875, + "learning_rate": 0.0008994905956112853, + "loss": 2.6977, + "step": 713 + }, + { + "epoch": 0.13460269582430012, + "grad_norm": 0.68359375, + "learning_rate": 0.0008992946708463951, + "loss": 2.7332, + "step": 714 + }, + { + "epoch": 0.13479121500612687, + "grad_norm": 0.67578125, + "learning_rate": 0.0008990987460815048, + "loss": 2.5868, + "step": 715 + }, + { + "epoch": 0.13497973418795361, + "grad_norm": 0.67578125, + "learning_rate": 0.0008989028213166145, + "loss": 2.6311, + "step": 716 + }, + { + "epoch": 0.1351682533697804, + "grad_norm": 0.6640625, + "learning_rate": 0.0008987068965517241, + "loss": 2.6369, + "step": 717 + }, + { + "epoch": 0.13535677255160714, + "grad_norm": 0.69921875, + "learning_rate": 0.0008985109717868339, + "loss": 2.778, + "step": 718 + }, + { + "epoch": 0.13554529173343388, + "grad_norm": 0.7421875, + "learning_rate": 0.0008983150470219435, + "loss": 2.6355, + "step": 719 + }, + { + "epoch": 0.13573381091526063, + "grad_norm": 0.67578125, + "learning_rate": 0.0008981191222570533, + "loss": 2.6524, + "step": 720 + }, + { + "epoch": 0.13592233009708737, + "grad_norm": 0.6875, + "learning_rate": 0.000897923197492163, + "loss": 2.7543, + "step": 721 + }, + { + "epoch": 0.13611084927891412, + "grad_norm": 0.71875, + "learning_rate": 0.0008977272727272727, + "loss": 2.7062, + "step": 722 + }, + { + "epoch": 0.1362993684607409, + "grad_norm": 0.75390625, + "learning_rate": 0.0008975313479623825, + "loss": 2.7506, + "step": 723 + }, + { + "epoch": 0.13648788764256764, + "grad_norm": 0.6484375, + "learning_rate": 0.0008973354231974922, + "loss": 2.6015, + "step": 724 + }, + { + "epoch": 0.1366764068243944, + "grad_norm": 0.640625, + "learning_rate": 0.0008971394984326019, + "loss": 2.6391, + "step": 725 + }, + { + "epoch": 0.13686492600622113, + "grad_norm": 0.7421875, + "learning_rate": 0.0008969435736677116, + "loss": 2.7587, + "step": 726 + }, + { + "epoch": 0.13705344518804788, + "grad_norm": 0.6640625, + "learning_rate": 0.0008967476489028214, + "loss": 2.7175, + "step": 727 + }, + { + "epoch": 0.13724196436987462, + "grad_norm": 0.68359375, + "learning_rate": 0.000896551724137931, + "loss": 2.6185, + "step": 728 + }, + { + "epoch": 0.1374304835517014, + "grad_norm": 0.6796875, + "learning_rate": 0.0008963557993730408, + "loss": 2.5831, + "step": 729 + }, + { + "epoch": 0.13761900273352815, + "grad_norm": 0.64453125, + "learning_rate": 0.0008961598746081505, + "loss": 2.6364, + "step": 730 + }, + { + "epoch": 0.1378075219153549, + "grad_norm": 0.65625, + "learning_rate": 0.0008959639498432602, + "loss": 2.5928, + "step": 731 + }, + { + "epoch": 0.13799604109718164, + "grad_norm": 0.66796875, + "learning_rate": 0.00089576802507837, + "loss": 2.6144, + "step": 732 + }, + { + "epoch": 0.13818456027900838, + "grad_norm": 0.671875, + "learning_rate": 0.0008955721003134797, + "loss": 2.5887, + "step": 733 + }, + { + "epoch": 0.13837307946083513, + "grad_norm": 0.62109375, + "learning_rate": 0.0008953761755485894, + "loss": 2.5888, + "step": 734 + }, + { + "epoch": 0.1385615986426619, + "grad_norm": 0.63671875, + "learning_rate": 0.000895180250783699, + "loss": 2.5938, + "step": 735 + }, + { + "epoch": 0.13875011782448865, + "grad_norm": 0.640625, + "learning_rate": 0.0008949843260188088, + "loss": 2.5876, + "step": 736 + }, + { + "epoch": 0.1389386370063154, + "grad_norm": 0.62890625, + "learning_rate": 0.0008947884012539184, + "loss": 2.6677, + "step": 737 + }, + { + "epoch": 0.13912715618814214, + "grad_norm": 0.65234375, + "learning_rate": 0.0008945924764890282, + "loss": 2.5932, + "step": 738 + }, + { + "epoch": 0.1393156753699689, + "grad_norm": 0.68359375, + "learning_rate": 0.0008943965517241379, + "loss": 2.6358, + "step": 739 + }, + { + "epoch": 0.13950419455179563, + "grad_norm": 0.671875, + "learning_rate": 0.0008942006269592476, + "loss": 2.6529, + "step": 740 + }, + { + "epoch": 0.1396927137336224, + "grad_norm": 0.6796875, + "learning_rate": 0.0008940047021943573, + "loss": 2.6557, + "step": 741 + }, + { + "epoch": 0.13988123291544916, + "grad_norm": 0.6796875, + "learning_rate": 0.0008938087774294671, + "loss": 2.6333, + "step": 742 + }, + { + "epoch": 0.1400697520972759, + "grad_norm": 0.734375, + "learning_rate": 0.0008936128526645769, + "loss": 2.5974, + "step": 743 + }, + { + "epoch": 0.14025827127910265, + "grad_norm": 0.71875, + "learning_rate": 0.0008934169278996865, + "loss": 2.6484, + "step": 744 + }, + { + "epoch": 0.1404467904609294, + "grad_norm": 0.97265625, + "learning_rate": 0.0008932210031347963, + "loss": 2.617, + "step": 745 + }, + { + "epoch": 0.14063530964275614, + "grad_norm": 0.765625, + "learning_rate": 0.000893025078369906, + "loss": 2.6803, + "step": 746 + }, + { + "epoch": 0.14082382882458291, + "grad_norm": 0.6875, + "learning_rate": 0.0008928291536050157, + "loss": 2.6882, + "step": 747 + }, + { + "epoch": 0.14101234800640966, + "grad_norm": 0.79296875, + "learning_rate": 0.0008926332288401254, + "loss": 2.6814, + "step": 748 + }, + { + "epoch": 0.1412008671882364, + "grad_norm": 0.75, + "learning_rate": 0.0008924373040752352, + "loss": 2.7618, + "step": 749 + }, + { + "epoch": 0.14138938637006315, + "grad_norm": 0.69140625, + "learning_rate": 0.0008922413793103448, + "loss": 2.7598, + "step": 750 + }, + { + "epoch": 0.1415779055518899, + "grad_norm": 0.91015625, + "learning_rate": 0.0008920454545454546, + "loss": 2.6827, + "step": 751 + }, + { + "epoch": 0.14176642473371664, + "grad_norm": 0.7265625, + "learning_rate": 0.0008918495297805644, + "loss": 2.7531, + "step": 752 + }, + { + "epoch": 0.14195494391554342, + "grad_norm": 0.6484375, + "learning_rate": 0.000891653605015674, + "loss": 2.5641, + "step": 753 + }, + { + "epoch": 0.14214346309737017, + "grad_norm": 0.65625, + "learning_rate": 0.0008914576802507837, + "loss": 2.6333, + "step": 754 + }, + { + "epoch": 0.1423319822791969, + "grad_norm": 0.69140625, + "learning_rate": 0.0008912617554858934, + "loss": 2.7232, + "step": 755 + }, + { + "epoch": 0.14252050146102366, + "grad_norm": 0.65625, + "learning_rate": 0.0008910658307210031, + "loss": 2.6999, + "step": 756 + }, + { + "epoch": 0.1427090206428504, + "grad_norm": 0.6953125, + "learning_rate": 0.0008908699059561128, + "loss": 2.6267, + "step": 757 + }, + { + "epoch": 0.14289753982467715, + "grad_norm": 0.765625, + "learning_rate": 0.0008906739811912226, + "loss": 2.6947, + "step": 758 + }, + { + "epoch": 0.14308605900650392, + "grad_norm": 0.76953125, + "learning_rate": 0.0008904780564263322, + "loss": 2.6448, + "step": 759 + }, + { + "epoch": 0.14327457818833067, + "grad_norm": 0.703125, + "learning_rate": 0.000890282131661442, + "loss": 2.7433, + "step": 760 + }, + { + "epoch": 0.14346309737015742, + "grad_norm": 0.71875, + "learning_rate": 0.0008900862068965518, + "loss": 2.6372, + "step": 761 + }, + { + "epoch": 0.14365161655198416, + "grad_norm": 0.68359375, + "learning_rate": 0.0008898902821316614, + "loss": 2.7054, + "step": 762 + }, + { + "epoch": 0.1438401357338109, + "grad_norm": 0.65625, + "learning_rate": 0.0008896943573667712, + "loss": 2.6433, + "step": 763 + }, + { + "epoch": 0.14402865491563765, + "grad_norm": 0.671875, + "learning_rate": 0.0008894984326018809, + "loss": 2.6648, + "step": 764 + }, + { + "epoch": 0.14421717409746443, + "grad_norm": 0.7109375, + "learning_rate": 0.0008893025078369906, + "loss": 2.6961, + "step": 765 + }, + { + "epoch": 0.14440569327929118, + "grad_norm": 0.69921875, + "learning_rate": 0.0008891065830721003, + "loss": 2.5835, + "step": 766 + }, + { + "epoch": 0.14459421246111792, + "grad_norm": 0.67578125, + "learning_rate": 0.0008889106583072101, + "loss": 2.6922, + "step": 767 + }, + { + "epoch": 0.14478273164294467, + "grad_norm": 0.65625, + "learning_rate": 0.0008887147335423197, + "loss": 2.5899, + "step": 768 + }, + { + "epoch": 0.1449712508247714, + "grad_norm": 0.70703125, + "learning_rate": 0.0008885188087774295, + "loss": 2.6164, + "step": 769 + }, + { + "epoch": 0.14515977000659816, + "grad_norm": 0.6953125, + "learning_rate": 0.0008883228840125393, + "loss": 2.708, + "step": 770 + }, + { + "epoch": 0.14534828918842493, + "grad_norm": 0.69140625, + "learning_rate": 0.000888126959247649, + "loss": 2.59, + "step": 771 + }, + { + "epoch": 0.14553680837025168, + "grad_norm": 0.69140625, + "learning_rate": 0.0008879310344827587, + "loss": 2.5905, + "step": 772 + }, + { + "epoch": 0.14572532755207843, + "grad_norm": 0.66015625, + "learning_rate": 0.0008877351097178683, + "loss": 2.6069, + "step": 773 + }, + { + "epoch": 0.14591384673390517, + "grad_norm": 0.72265625, + "learning_rate": 0.0008875391849529781, + "loss": 2.695, + "step": 774 + }, + { + "epoch": 0.14610236591573192, + "grad_norm": 0.68359375, + "learning_rate": 0.0008873432601880877, + "loss": 2.7418, + "step": 775 + }, + { + "epoch": 0.14629088509755866, + "grad_norm": 0.66796875, + "learning_rate": 0.0008871473354231975, + "loss": 2.6499, + "step": 776 + }, + { + "epoch": 0.14647940427938544, + "grad_norm": 0.66015625, + "learning_rate": 0.0008869514106583072, + "loss": 2.6959, + "step": 777 + }, + { + "epoch": 0.14666792346121219, + "grad_norm": 0.6875, + "learning_rate": 0.0008867554858934169, + "loss": 2.6255, + "step": 778 + }, + { + "epoch": 0.14685644264303893, + "grad_norm": 0.6875, + "learning_rate": 0.0008865595611285267, + "loss": 2.6885, + "step": 779 + }, + { + "epoch": 0.14704496182486568, + "grad_norm": 0.671875, + "learning_rate": 0.0008863636363636364, + "loss": 2.6154, + "step": 780 + }, + { + "epoch": 0.14723348100669242, + "grad_norm": 0.63671875, + "learning_rate": 0.0008861677115987461, + "loss": 2.6715, + "step": 781 + }, + { + "epoch": 0.14742200018851917, + "grad_norm": 0.69140625, + "learning_rate": 0.0008859717868338558, + "loss": 2.6356, + "step": 782 + }, + { + "epoch": 0.14761051937034594, + "grad_norm": 0.6953125, + "learning_rate": 0.0008857758620689656, + "loss": 2.6181, + "step": 783 + }, + { + "epoch": 0.1477990385521727, + "grad_norm": 0.70703125, + "learning_rate": 0.0008855799373040752, + "loss": 2.6921, + "step": 784 + }, + { + "epoch": 0.14798755773399944, + "grad_norm": 0.62109375, + "learning_rate": 0.000885384012539185, + "loss": 2.5067, + "step": 785 + }, + { + "epoch": 0.14817607691582618, + "grad_norm": 0.7109375, + "learning_rate": 0.0008851880877742947, + "loss": 2.729, + "step": 786 + }, + { + "epoch": 0.14836459609765293, + "grad_norm": 0.72265625, + "learning_rate": 0.0008849921630094044, + "loss": 2.6272, + "step": 787 + }, + { + "epoch": 0.14855311527947967, + "grad_norm": 0.63671875, + "learning_rate": 0.0008847962382445142, + "loss": 2.5603, + "step": 788 + }, + { + "epoch": 0.14874163446130645, + "grad_norm": 0.671875, + "learning_rate": 0.0008846003134796239, + "loss": 2.7072, + "step": 789 + }, + { + "epoch": 0.1489301536431332, + "grad_norm": 0.6953125, + "learning_rate": 0.0008844043887147336, + "loss": 2.6431, + "step": 790 + }, + { + "epoch": 0.14911867282495994, + "grad_norm": 0.703125, + "learning_rate": 0.0008842084639498433, + "loss": 2.6058, + "step": 791 + }, + { + "epoch": 0.1493071920067867, + "grad_norm": 0.6484375, + "learning_rate": 0.000884012539184953, + "loss": 2.6369, + "step": 792 + }, + { + "epoch": 0.14949571118861343, + "grad_norm": 0.63671875, + "learning_rate": 0.0008838166144200626, + "loss": 2.5803, + "step": 793 + }, + { + "epoch": 0.14968423037044018, + "grad_norm": 0.65625, + "learning_rate": 0.0008836206896551724, + "loss": 2.6228, + "step": 794 + }, + { + "epoch": 0.14987274955226695, + "grad_norm": 0.65234375, + "learning_rate": 0.0008834247648902821, + "loss": 2.5875, + "step": 795 + }, + { + "epoch": 0.1500612687340937, + "grad_norm": 0.63671875, + "learning_rate": 0.0008832288401253918, + "loss": 2.6705, + "step": 796 + }, + { + "epoch": 0.15024978791592045, + "grad_norm": 0.63671875, + "learning_rate": 0.0008830329153605015, + "loss": 2.6356, + "step": 797 + }, + { + "epoch": 0.1504383070977472, + "grad_norm": 0.65625, + "learning_rate": 0.0008828369905956113, + "loss": 2.6354, + "step": 798 + }, + { + "epoch": 0.15062682627957394, + "grad_norm": 0.66796875, + "learning_rate": 0.0008826410658307211, + "loss": 2.7054, + "step": 799 + }, + { + "epoch": 0.15081534546140068, + "grad_norm": 0.671875, + "learning_rate": 0.0008824451410658307, + "loss": 2.5472, + "step": 800 + }, + { + "epoch": 0.15100386464322746, + "grad_norm": 0.64453125, + "learning_rate": 0.0008822492163009405, + "loss": 2.5917, + "step": 801 + }, + { + "epoch": 0.1511923838250542, + "grad_norm": 0.671875, + "learning_rate": 0.0008820532915360502, + "loss": 2.6797, + "step": 802 + }, + { + "epoch": 0.15138090300688095, + "grad_norm": 0.68359375, + "learning_rate": 0.0008818573667711599, + "loss": 2.6894, + "step": 803 + }, + { + "epoch": 0.1515694221887077, + "grad_norm": 0.75390625, + "learning_rate": 0.0008816614420062696, + "loss": 2.6542, + "step": 804 + }, + { + "epoch": 0.15175794137053444, + "grad_norm": 0.6640625, + "learning_rate": 0.0008814655172413794, + "loss": 2.7165, + "step": 805 + }, + { + "epoch": 0.1519464605523612, + "grad_norm": 0.6875, + "learning_rate": 0.000881269592476489, + "loss": 2.6396, + "step": 806 + }, + { + "epoch": 0.15213497973418796, + "grad_norm": 0.66015625, + "learning_rate": 0.0008810736677115988, + "loss": 2.5562, + "step": 807 + }, + { + "epoch": 0.1523234989160147, + "grad_norm": 0.6953125, + "learning_rate": 0.0008808777429467086, + "loss": 2.5097, + "step": 808 + }, + { + "epoch": 0.15251201809784146, + "grad_norm": 0.7109375, + "learning_rate": 0.0008806818181818182, + "loss": 2.6022, + "step": 809 + }, + { + "epoch": 0.1527005372796682, + "grad_norm": 0.67578125, + "learning_rate": 0.0008804858934169279, + "loss": 2.6144, + "step": 810 + }, + { + "epoch": 0.15288905646149495, + "grad_norm": 0.68359375, + "learning_rate": 0.0008802899686520376, + "loss": 2.7956, + "step": 811 + }, + { + "epoch": 0.1530775756433217, + "grad_norm": 0.66015625, + "learning_rate": 0.0008800940438871473, + "loss": 2.6763, + "step": 812 + }, + { + "epoch": 0.15326609482514847, + "grad_norm": 0.68359375, + "learning_rate": 0.000879898119122257, + "loss": 2.7261, + "step": 813 + }, + { + "epoch": 0.15345461400697522, + "grad_norm": 0.6484375, + "learning_rate": 0.0008797021943573668, + "loss": 2.5969, + "step": 814 + }, + { + "epoch": 0.15364313318880196, + "grad_norm": 0.68359375, + "learning_rate": 0.0008795062695924764, + "loss": 2.5595, + "step": 815 + }, + { + "epoch": 0.1538316523706287, + "grad_norm": 0.66796875, + "learning_rate": 0.0008793103448275862, + "loss": 2.4956, + "step": 816 + }, + { + "epoch": 0.15402017155245545, + "grad_norm": 0.69140625, + "learning_rate": 0.000879114420062696, + "loss": 2.6691, + "step": 817 + }, + { + "epoch": 0.1542086907342822, + "grad_norm": 0.65234375, + "learning_rate": 0.0008789184952978056, + "loss": 2.5205, + "step": 818 + }, + { + "epoch": 0.15439720991610897, + "grad_norm": 0.6875, + "learning_rate": 0.0008787225705329154, + "loss": 2.5277, + "step": 819 + }, + { + "epoch": 0.15458572909793572, + "grad_norm": 0.62890625, + "learning_rate": 0.0008785266457680251, + "loss": 2.6255, + "step": 820 + }, + { + "epoch": 0.15477424827976247, + "grad_norm": 0.68359375, + "learning_rate": 0.0008783307210031348, + "loss": 2.6624, + "step": 821 + }, + { + "epoch": 0.1549627674615892, + "grad_norm": 0.6171875, + "learning_rate": 0.0008781347962382445, + "loss": 2.6192, + "step": 822 + }, + { + "epoch": 0.15515128664341596, + "grad_norm": 0.671875, + "learning_rate": 0.0008779388714733543, + "loss": 2.6637, + "step": 823 + }, + { + "epoch": 0.1553398058252427, + "grad_norm": 0.703125, + "learning_rate": 0.000877742946708464, + "loss": 2.6116, + "step": 824 + }, + { + "epoch": 0.15552832500706948, + "grad_norm": 0.66015625, + "learning_rate": 0.0008775470219435737, + "loss": 2.7295, + "step": 825 + }, + { + "epoch": 0.15571684418889623, + "grad_norm": 0.69140625, + "learning_rate": 0.0008773510971786835, + "loss": 2.6604, + "step": 826 + }, + { + "epoch": 0.15590536337072297, + "grad_norm": 0.703125, + "learning_rate": 0.0008771551724137932, + "loss": 2.661, + "step": 827 + }, + { + "epoch": 0.15609388255254972, + "grad_norm": 0.6875, + "learning_rate": 0.0008769592476489029, + "loss": 2.6784, + "step": 828 + }, + { + "epoch": 0.15628240173437646, + "grad_norm": 0.69140625, + "learning_rate": 0.0008767633228840125, + "loss": 2.6322, + "step": 829 + }, + { + "epoch": 0.1564709209162032, + "grad_norm": 0.67578125, + "learning_rate": 0.0008765673981191223, + "loss": 2.6958, + "step": 830 + }, + { + "epoch": 0.15665944009802998, + "grad_norm": 0.640625, + "learning_rate": 0.0008763714733542319, + "loss": 2.614, + "step": 831 + }, + { + "epoch": 0.15684795927985673, + "grad_norm": 0.67578125, + "learning_rate": 0.0008761755485893417, + "loss": 2.61, + "step": 832 + }, + { + "epoch": 0.15703647846168348, + "grad_norm": 0.703125, + "learning_rate": 0.0008759796238244514, + "loss": 2.6638, + "step": 833 + }, + { + "epoch": 0.15722499764351022, + "grad_norm": 0.69921875, + "learning_rate": 0.0008757836990595611, + "loss": 2.5964, + "step": 834 + }, + { + "epoch": 0.15741351682533697, + "grad_norm": 0.6953125, + "learning_rate": 0.0008755877742946709, + "loss": 2.6957, + "step": 835 + }, + { + "epoch": 0.15760203600716374, + "grad_norm": 0.640625, + "learning_rate": 0.0008753918495297806, + "loss": 2.5888, + "step": 836 + }, + { + "epoch": 0.1577905551889905, + "grad_norm": 0.6640625, + "learning_rate": 0.0008751959247648903, + "loss": 2.644, + "step": 837 + }, + { + "epoch": 0.15797907437081724, + "grad_norm": 0.69921875, + "learning_rate": 0.000875, + "loss": 2.772, + "step": 838 + }, + { + "epoch": 0.15816759355264398, + "grad_norm": 0.70703125, + "learning_rate": 0.0008748040752351098, + "loss": 2.6483, + "step": 839 + }, + { + "epoch": 0.15835611273447073, + "grad_norm": 0.69140625, + "learning_rate": 0.0008746081504702194, + "loss": 2.5755, + "step": 840 + }, + { + "epoch": 0.15854463191629747, + "grad_norm": 0.71875, + "learning_rate": 0.0008744122257053292, + "loss": 2.6584, + "step": 841 + }, + { + "epoch": 0.15873315109812425, + "grad_norm": 0.6796875, + "learning_rate": 0.0008742163009404389, + "loss": 2.6448, + "step": 842 + }, + { + "epoch": 0.158921670279951, + "grad_norm": 0.6640625, + "learning_rate": 0.0008740203761755486, + "loss": 2.6187, + "step": 843 + }, + { + "epoch": 0.15911018946177774, + "grad_norm": 0.6953125, + "learning_rate": 0.0008738244514106584, + "loss": 2.7043, + "step": 844 + }, + { + "epoch": 0.1592987086436045, + "grad_norm": 0.6875, + "learning_rate": 0.0008736285266457681, + "loss": 2.5718, + "step": 845 + }, + { + "epoch": 0.15948722782543123, + "grad_norm": 0.66015625, + "learning_rate": 0.0008734326018808778, + "loss": 2.6218, + "step": 846 + }, + { + "epoch": 0.15967574700725798, + "grad_norm": 0.640625, + "learning_rate": 0.0008732366771159875, + "loss": 2.7162, + "step": 847 + }, + { + "epoch": 0.15986426618908475, + "grad_norm": 0.69140625, + "learning_rate": 0.0008730407523510972, + "loss": 2.7002, + "step": 848 + }, + { + "epoch": 0.1600527853709115, + "grad_norm": 0.984375, + "learning_rate": 0.0008728448275862068, + "loss": 2.5591, + "step": 849 + }, + { + "epoch": 0.16024130455273825, + "grad_norm": 0.70703125, + "learning_rate": 0.0008726489028213166, + "loss": 2.68, + "step": 850 + }, + { + "epoch": 0.160429823734565, + "grad_norm": 0.703125, + "learning_rate": 0.0008724529780564263, + "loss": 2.6753, + "step": 851 + }, + { + "epoch": 0.16061834291639174, + "grad_norm": 0.6953125, + "learning_rate": 0.000872257053291536, + "loss": 2.6676, + "step": 852 + }, + { + "epoch": 0.16080686209821848, + "grad_norm": 0.75390625, + "learning_rate": 0.0008720611285266457, + "loss": 2.6622, + "step": 853 + }, + { + "epoch": 0.16099538128004526, + "grad_norm": 0.76953125, + "learning_rate": 0.0008718652037617555, + "loss": 2.8021, + "step": 854 + }, + { + "epoch": 0.161183900461872, + "grad_norm": 0.66796875, + "learning_rate": 0.0008716692789968653, + "loss": 2.6823, + "step": 855 + }, + { + "epoch": 0.16137241964369875, + "grad_norm": 0.73828125, + "learning_rate": 0.0008714733542319749, + "loss": 2.6915, + "step": 856 + }, + { + "epoch": 0.1615609388255255, + "grad_norm": 0.7109375, + "learning_rate": 0.0008712774294670847, + "loss": 2.6373, + "step": 857 + }, + { + "epoch": 0.16174945800735224, + "grad_norm": 0.67578125, + "learning_rate": 0.0008710815047021944, + "loss": 2.6126, + "step": 858 + }, + { + "epoch": 0.161937977189179, + "grad_norm": 0.6953125, + "learning_rate": 0.0008708855799373041, + "loss": 2.544, + "step": 859 + }, + { + "epoch": 0.16212649637100576, + "grad_norm": 0.6796875, + "learning_rate": 0.0008706896551724138, + "loss": 2.5462, + "step": 860 + }, + { + "epoch": 0.1623150155528325, + "grad_norm": 0.68359375, + "learning_rate": 0.0008704937304075236, + "loss": 2.6703, + "step": 861 + }, + { + "epoch": 0.16250353473465926, + "grad_norm": 0.74609375, + "learning_rate": 0.0008702978056426332, + "loss": 2.606, + "step": 862 + }, + { + "epoch": 0.162692053916486, + "grad_norm": 0.71484375, + "learning_rate": 0.000870101880877743, + "loss": 2.5978, + "step": 863 + }, + { + "epoch": 0.16288057309831275, + "grad_norm": 0.734375, + "learning_rate": 0.0008699059561128528, + "loss": 2.6027, + "step": 864 + }, + { + "epoch": 0.1630690922801395, + "grad_norm": 0.65234375, + "learning_rate": 0.0008697100313479624, + "loss": 2.654, + "step": 865 + }, + { + "epoch": 0.16325761146196627, + "grad_norm": 0.640625, + "learning_rate": 0.0008695141065830722, + "loss": 2.4922, + "step": 866 + }, + { + "epoch": 0.16344613064379301, + "grad_norm": 0.6875, + "learning_rate": 0.0008693181818181818, + "loss": 2.54, + "step": 867 + }, + { + "epoch": 0.16363464982561976, + "grad_norm": 0.6640625, + "learning_rate": 0.0008691222570532915, + "loss": 2.6802, + "step": 868 + }, + { + "epoch": 0.1638231690074465, + "grad_norm": 0.76953125, + "learning_rate": 0.0008689263322884012, + "loss": 2.6451, + "step": 869 + }, + { + "epoch": 0.16401168818927325, + "grad_norm": 0.6484375, + "learning_rate": 0.000868730407523511, + "loss": 2.6301, + "step": 870 + }, + { + "epoch": 0.1642002073711, + "grad_norm": 0.65625, + "learning_rate": 0.0008685344827586206, + "loss": 2.6155, + "step": 871 + }, + { + "epoch": 0.16438872655292677, + "grad_norm": 0.65234375, + "learning_rate": 0.0008683385579937304, + "loss": 2.5815, + "step": 872 + }, + { + "epoch": 0.16457724573475352, + "grad_norm": 0.6640625, + "learning_rate": 0.0008681426332288402, + "loss": 2.6225, + "step": 873 + }, + { + "epoch": 0.16476576491658027, + "grad_norm": 0.671875, + "learning_rate": 0.0008679467084639498, + "loss": 2.6562, + "step": 874 + }, + { + "epoch": 0.164954284098407, + "grad_norm": 0.6796875, + "learning_rate": 0.0008677507836990596, + "loss": 2.6155, + "step": 875 + }, + { + "epoch": 0.16514280328023376, + "grad_norm": 0.69140625, + "learning_rate": 0.0008675548589341693, + "loss": 2.6055, + "step": 876 + }, + { + "epoch": 0.1653313224620605, + "grad_norm": 0.64453125, + "learning_rate": 0.000867358934169279, + "loss": 2.637, + "step": 877 + }, + { + "epoch": 0.16551984164388728, + "grad_norm": 0.69140625, + "learning_rate": 0.0008671630094043887, + "loss": 2.6913, + "step": 878 + }, + { + "epoch": 0.16570836082571402, + "grad_norm": 0.6484375, + "learning_rate": 0.0008669670846394985, + "loss": 2.5859, + "step": 879 + }, + { + "epoch": 0.16589688000754077, + "grad_norm": 0.68359375, + "learning_rate": 0.0008667711598746082, + "loss": 2.7039, + "step": 880 + }, + { + "epoch": 0.16608539918936752, + "grad_norm": 0.67578125, + "learning_rate": 0.0008665752351097179, + "loss": 2.662, + "step": 881 + }, + { + "epoch": 0.16627391837119426, + "grad_norm": 0.65625, + "learning_rate": 0.0008663793103448277, + "loss": 2.6888, + "step": 882 + }, + { + "epoch": 0.166462437553021, + "grad_norm": 0.625, + "learning_rate": 0.0008661833855799374, + "loss": 2.6346, + "step": 883 + }, + { + "epoch": 0.16665095673484778, + "grad_norm": 0.65625, + "learning_rate": 0.0008659874608150471, + "loss": 2.6536, + "step": 884 + }, + { + "epoch": 0.16683947591667453, + "grad_norm": 0.65625, + "learning_rate": 0.0008657915360501567, + "loss": 2.5995, + "step": 885 + }, + { + "epoch": 0.16702799509850128, + "grad_norm": 0.6484375, + "learning_rate": 0.0008655956112852665, + "loss": 2.7111, + "step": 886 + }, + { + "epoch": 0.16721651428032802, + "grad_norm": 0.71484375, + "learning_rate": 0.0008653996865203761, + "loss": 2.6629, + "step": 887 + }, + { + "epoch": 0.16740503346215477, + "grad_norm": 0.65625, + "learning_rate": 0.0008652037617554859, + "loss": 2.5553, + "step": 888 + }, + { + "epoch": 0.1675935526439815, + "grad_norm": 0.6796875, + "learning_rate": 0.0008650078369905956, + "loss": 2.7594, + "step": 889 + }, + { + "epoch": 0.1677820718258083, + "grad_norm": 0.69140625, + "learning_rate": 0.0008648119122257053, + "loss": 2.6729, + "step": 890 + }, + { + "epoch": 0.16797059100763503, + "grad_norm": 0.68359375, + "learning_rate": 0.000864615987460815, + "loss": 2.7915, + "step": 891 + }, + { + "epoch": 0.16815911018946178, + "grad_norm": 0.6640625, + "learning_rate": 0.0008644200626959248, + "loss": 2.5758, + "step": 892 + }, + { + "epoch": 0.16834762937128853, + "grad_norm": 0.6484375, + "learning_rate": 0.0008642241379310345, + "loss": 2.5708, + "step": 893 + }, + { + "epoch": 0.16853614855311527, + "grad_norm": 0.69140625, + "learning_rate": 0.0008640282131661442, + "loss": 2.6254, + "step": 894 + }, + { + "epoch": 0.16872466773494202, + "grad_norm": 0.6796875, + "learning_rate": 0.000863832288401254, + "loss": 2.5748, + "step": 895 + }, + { + "epoch": 0.1689131869167688, + "grad_norm": 0.6875, + "learning_rate": 0.0008636363636363636, + "loss": 2.7726, + "step": 896 + }, + { + "epoch": 0.16910170609859554, + "grad_norm": 0.68359375, + "learning_rate": 0.0008634404388714734, + "loss": 2.6658, + "step": 897 + }, + { + "epoch": 0.16929022528042229, + "grad_norm": 0.65234375, + "learning_rate": 0.0008632445141065831, + "loss": 2.6508, + "step": 898 + }, + { + "epoch": 0.16947874446224903, + "grad_norm": 0.7421875, + "learning_rate": 0.0008630485893416928, + "loss": 2.6266, + "step": 899 + }, + { + "epoch": 0.16966726364407578, + "grad_norm": 0.71875, + "learning_rate": 0.0008628526645768026, + "loss": 2.7459, + "step": 900 + }, + { + "epoch": 0.16985578282590252, + "grad_norm": 0.68359375, + "learning_rate": 0.0008626567398119123, + "loss": 2.587, + "step": 901 + }, + { + "epoch": 0.1700443020077293, + "grad_norm": 0.6484375, + "learning_rate": 0.000862460815047022, + "loss": 2.6079, + "step": 902 + }, + { + "epoch": 0.17023282118955604, + "grad_norm": 0.7109375, + "learning_rate": 0.0008622648902821317, + "loss": 2.7354, + "step": 903 + }, + { + "epoch": 0.1704213403713828, + "grad_norm": 0.76953125, + "learning_rate": 0.0008620689655172414, + "loss": 2.5455, + "step": 904 + }, + { + "epoch": 0.17060985955320954, + "grad_norm": 0.66015625, + "learning_rate": 0.000861873040752351, + "loss": 2.6302, + "step": 905 + }, + { + "epoch": 0.17079837873503628, + "grad_norm": 0.63671875, + "learning_rate": 0.0008616771159874608, + "loss": 2.4757, + "step": 906 + }, + { + "epoch": 0.17098689791686303, + "grad_norm": 0.75, + "learning_rate": 0.0008614811912225705, + "loss": 2.6884, + "step": 907 + }, + { + "epoch": 0.1711754170986898, + "grad_norm": 0.68359375, + "learning_rate": 0.0008612852664576803, + "loss": 2.5877, + "step": 908 + }, + { + "epoch": 0.17136393628051655, + "grad_norm": 0.70703125, + "learning_rate": 0.0008610893416927899, + "loss": 2.6323, + "step": 909 + }, + { + "epoch": 0.1715524554623433, + "grad_norm": 0.66796875, + "learning_rate": 0.0008608934169278997, + "loss": 2.657, + "step": 910 + }, + { + "epoch": 0.17174097464417004, + "grad_norm": 0.671875, + "learning_rate": 0.0008606974921630095, + "loss": 2.6591, + "step": 911 + }, + { + "epoch": 0.1719294938259968, + "grad_norm": 0.68359375, + "learning_rate": 0.0008605015673981191, + "loss": 2.6385, + "step": 912 + }, + { + "epoch": 0.17211801300782353, + "grad_norm": 1.859375, + "learning_rate": 0.0008603056426332289, + "loss": 2.7853, + "step": 913 + }, + { + "epoch": 0.1723065321896503, + "grad_norm": 0.70703125, + "learning_rate": 0.0008601097178683386, + "loss": 2.7299, + "step": 914 + }, + { + "epoch": 0.17249505137147705, + "grad_norm": 0.68359375, + "learning_rate": 0.0008599137931034483, + "loss": 2.7032, + "step": 915 + }, + { + "epoch": 0.1726835705533038, + "grad_norm": 0.68359375, + "learning_rate": 0.000859717868338558, + "loss": 2.6568, + "step": 916 + }, + { + "epoch": 0.17287208973513055, + "grad_norm": 0.6640625, + "learning_rate": 0.0008595219435736678, + "loss": 2.8574, + "step": 917 + }, + { + "epoch": 0.1730606089169573, + "grad_norm": 0.7109375, + "learning_rate": 0.0008593260188087774, + "loss": 2.7283, + "step": 918 + }, + { + "epoch": 0.17324912809878404, + "grad_norm": 0.72265625, + "learning_rate": 0.0008591300940438872, + "loss": 2.712, + "step": 919 + }, + { + "epoch": 0.1734376472806108, + "grad_norm": 0.67578125, + "learning_rate": 0.000858934169278997, + "loss": 2.5807, + "step": 920 + }, + { + "epoch": 0.17362616646243756, + "grad_norm": 0.67578125, + "learning_rate": 0.0008587382445141066, + "loss": 2.6998, + "step": 921 + }, + { + "epoch": 0.1738146856442643, + "grad_norm": 0.66796875, + "learning_rate": 0.0008585423197492164, + "loss": 2.5734, + "step": 922 + }, + { + "epoch": 0.17400320482609105, + "grad_norm": 0.6953125, + "learning_rate": 0.000858346394984326, + "loss": 2.6374, + "step": 923 + }, + { + "epoch": 0.1741917240079178, + "grad_norm": 0.640625, + "learning_rate": 0.0008581504702194357, + "loss": 2.5408, + "step": 924 + }, + { + "epoch": 0.17438024318974454, + "grad_norm": 0.71484375, + "learning_rate": 0.0008579545454545454, + "loss": 2.661, + "step": 925 + }, + { + "epoch": 0.17456876237157132, + "grad_norm": 0.68359375, + "learning_rate": 0.0008577586206896552, + "loss": 2.701, + "step": 926 + }, + { + "epoch": 0.17475728155339806, + "grad_norm": 0.69140625, + "learning_rate": 0.0008575626959247648, + "loss": 2.71, + "step": 927 + }, + { + "epoch": 0.1749458007352248, + "grad_norm": 0.671875, + "learning_rate": 0.0008573667711598746, + "loss": 2.6468, + "step": 928 + }, + { + "epoch": 0.17513431991705156, + "grad_norm": 0.64453125, + "learning_rate": 0.0008571708463949844, + "loss": 2.5039, + "step": 929 + }, + { + "epoch": 0.1753228390988783, + "grad_norm": 0.6875, + "learning_rate": 0.000856974921630094, + "loss": 2.6299, + "step": 930 + }, + { + "epoch": 0.17551135828070505, + "grad_norm": 0.7109375, + "learning_rate": 0.0008567789968652038, + "loss": 2.7097, + "step": 931 + }, + { + "epoch": 0.17569987746253182, + "grad_norm": 0.71875, + "learning_rate": 0.0008565830721003135, + "loss": 2.662, + "step": 932 + }, + { + "epoch": 0.17588839664435857, + "grad_norm": 0.6875, + "learning_rate": 0.0008563871473354232, + "loss": 2.6554, + "step": 933 + }, + { + "epoch": 0.17607691582618532, + "grad_norm": 0.69140625, + "learning_rate": 0.0008561912225705329, + "loss": 2.5157, + "step": 934 + }, + { + "epoch": 0.17626543500801206, + "grad_norm": 0.703125, + "learning_rate": 0.0008559952978056427, + "loss": 2.658, + "step": 935 + }, + { + "epoch": 0.1764539541898388, + "grad_norm": 0.6796875, + "learning_rate": 0.0008557993730407524, + "loss": 2.5697, + "step": 936 + }, + { + "epoch": 0.17664247337166555, + "grad_norm": 0.6953125, + "learning_rate": 0.0008556034482758621, + "loss": 2.6467, + "step": 937 + }, + { + "epoch": 0.17683099255349233, + "grad_norm": 0.671875, + "learning_rate": 0.0008554075235109719, + "loss": 2.7308, + "step": 938 + }, + { + "epoch": 0.17701951173531907, + "grad_norm": 0.703125, + "learning_rate": 0.0008552115987460816, + "loss": 2.5552, + "step": 939 + }, + { + "epoch": 0.17720803091714582, + "grad_norm": 0.70703125, + "learning_rate": 0.0008550156739811913, + "loss": 2.7391, + "step": 940 + }, + { + "epoch": 0.17739655009897257, + "grad_norm": 0.6328125, + "learning_rate": 0.000854819749216301, + "loss": 2.5914, + "step": 941 + }, + { + "epoch": 0.1775850692807993, + "grad_norm": 0.66796875, + "learning_rate": 0.0008546238244514107, + "loss": 2.6652, + "step": 942 + }, + { + "epoch": 0.17777358846262606, + "grad_norm": 0.67578125, + "learning_rate": 0.0008544278996865203, + "loss": 2.7119, + "step": 943 + }, + { + "epoch": 0.17796210764445283, + "grad_norm": 0.67578125, + "learning_rate": 0.0008542319749216301, + "loss": 2.7099, + "step": 944 + }, + { + "epoch": 0.17815062682627958, + "grad_norm": 0.69921875, + "learning_rate": 0.0008540360501567398, + "loss": 2.6452, + "step": 945 + }, + { + "epoch": 0.17833914600810633, + "grad_norm": 0.66796875, + "learning_rate": 0.0008538401253918495, + "loss": 2.5148, + "step": 946 + }, + { + "epoch": 0.17852766518993307, + "grad_norm": 0.69140625, + "learning_rate": 0.0008536442006269592, + "loss": 2.6379, + "step": 947 + }, + { + "epoch": 0.17871618437175982, + "grad_norm": 0.66796875, + "learning_rate": 0.000853448275862069, + "loss": 2.6155, + "step": 948 + }, + { + "epoch": 0.17890470355358656, + "grad_norm": 0.68359375, + "learning_rate": 0.0008532523510971787, + "loss": 2.5122, + "step": 949 + }, + { + "epoch": 0.17909322273541334, + "grad_norm": 0.66015625, + "learning_rate": 0.0008530564263322884, + "loss": 2.7232, + "step": 950 + }, + { + "epoch": 0.17928174191724008, + "grad_norm": 0.703125, + "learning_rate": 0.0008528605015673982, + "loss": 2.7196, + "step": 951 + }, + { + "epoch": 0.17947026109906683, + "grad_norm": 0.6484375, + "learning_rate": 0.0008526645768025078, + "loss": 2.5332, + "step": 952 + }, + { + "epoch": 0.17965878028089358, + "grad_norm": 0.6484375, + "learning_rate": 0.0008524686520376176, + "loss": 2.7149, + "step": 953 + }, + { + "epoch": 0.17984729946272032, + "grad_norm": 0.6640625, + "learning_rate": 0.0008522727272727273, + "loss": 2.7601, + "step": 954 + }, + { + "epoch": 0.1800358186445471, + "grad_norm": 0.671875, + "learning_rate": 0.000852076802507837, + "loss": 2.6729, + "step": 955 + }, + { + "epoch": 0.18022433782637384, + "grad_norm": 0.69140625, + "learning_rate": 0.0008518808777429467, + "loss": 2.5701, + "step": 956 + }, + { + "epoch": 0.1804128570082006, + "grad_norm": 0.7109375, + "learning_rate": 0.0008516849529780565, + "loss": 2.6001, + "step": 957 + }, + { + "epoch": 0.18060137619002734, + "grad_norm": 0.6796875, + "learning_rate": 0.0008514890282131662, + "loss": 2.7079, + "step": 958 + }, + { + "epoch": 0.18078989537185408, + "grad_norm": 0.7109375, + "learning_rate": 0.0008512931034482759, + "loss": 2.6712, + "step": 959 + }, + { + "epoch": 0.18097841455368083, + "grad_norm": 0.7265625, + "learning_rate": 0.0008510971786833856, + "loss": 2.722, + "step": 960 + }, + { + "epoch": 0.1811669337355076, + "grad_norm": 0.65625, + "learning_rate": 0.0008509012539184952, + "loss": 2.5951, + "step": 961 + }, + { + "epoch": 0.18135545291733435, + "grad_norm": 0.6484375, + "learning_rate": 0.000850705329153605, + "loss": 2.6927, + "step": 962 + }, + { + "epoch": 0.1815439720991611, + "grad_norm": 0.68359375, + "learning_rate": 0.0008505094043887147, + "loss": 2.6872, + "step": 963 + }, + { + "epoch": 0.18173249128098784, + "grad_norm": 0.671875, + "learning_rate": 0.0008503134796238245, + "loss": 2.498, + "step": 964 + }, + { + "epoch": 0.1819210104628146, + "grad_norm": 0.69140625, + "learning_rate": 0.0008501175548589341, + "loss": 2.7254, + "step": 965 + }, + { + "epoch": 0.18210952964464133, + "grad_norm": 0.67578125, + "learning_rate": 0.0008499216300940439, + "loss": 2.6044, + "step": 966 + }, + { + "epoch": 0.1822980488264681, + "grad_norm": 0.63671875, + "learning_rate": 0.0008497257053291537, + "loss": 2.6202, + "step": 967 + }, + { + "epoch": 0.18248656800829485, + "grad_norm": 0.7109375, + "learning_rate": 0.0008495297805642633, + "loss": 2.6671, + "step": 968 + }, + { + "epoch": 0.1826750871901216, + "grad_norm": 0.6875, + "learning_rate": 0.0008493338557993731, + "loss": 2.7925, + "step": 969 + }, + { + "epoch": 0.18286360637194835, + "grad_norm": 0.71875, + "learning_rate": 0.0008491379310344828, + "loss": 2.5893, + "step": 970 + }, + { + "epoch": 0.1830521255537751, + "grad_norm": 0.6328125, + "learning_rate": 0.0008489420062695925, + "loss": 2.6704, + "step": 971 + }, + { + "epoch": 0.18324064473560184, + "grad_norm": 0.68359375, + "learning_rate": 0.0008487460815047022, + "loss": 2.6642, + "step": 972 + }, + { + "epoch": 0.1834291639174286, + "grad_norm": 0.72265625, + "learning_rate": 0.000848550156739812, + "loss": 2.6579, + "step": 973 + }, + { + "epoch": 0.18361768309925536, + "grad_norm": 0.62890625, + "learning_rate": 0.0008483542319749216, + "loss": 2.47, + "step": 974 + }, + { + "epoch": 0.1838062022810821, + "grad_norm": 0.66796875, + "learning_rate": 0.0008481583072100314, + "loss": 2.5976, + "step": 975 + }, + { + "epoch": 0.18399472146290885, + "grad_norm": 0.67578125, + "learning_rate": 0.0008479623824451412, + "loss": 2.7708, + "step": 976 + }, + { + "epoch": 0.1841832406447356, + "grad_norm": 0.68359375, + "learning_rate": 0.0008477664576802508, + "loss": 2.6823, + "step": 977 + }, + { + "epoch": 0.18437175982656234, + "grad_norm": 0.640625, + "learning_rate": 0.0008475705329153606, + "loss": 2.6927, + "step": 978 + }, + { + "epoch": 0.18456027900838912, + "grad_norm": 0.69921875, + "learning_rate": 0.0008473746081504702, + "loss": 2.682, + "step": 979 + }, + { + "epoch": 0.18474879819021586, + "grad_norm": 0.6953125, + "learning_rate": 0.0008471786833855799, + "loss": 2.6509, + "step": 980 + }, + { + "epoch": 0.1849373173720426, + "grad_norm": 0.66015625, + "learning_rate": 0.0008469827586206896, + "loss": 2.7625, + "step": 981 + }, + { + "epoch": 0.18512583655386936, + "grad_norm": 0.6953125, + "learning_rate": 0.0008467868338557994, + "loss": 2.7144, + "step": 982 + }, + { + "epoch": 0.1853143557356961, + "grad_norm": 0.69140625, + "learning_rate": 0.000846590909090909, + "loss": 2.7065, + "step": 983 + }, + { + "epoch": 0.18550287491752285, + "grad_norm": 0.625, + "learning_rate": 0.0008463949843260188, + "loss": 2.5608, + "step": 984 + }, + { + "epoch": 0.18569139409934962, + "grad_norm": 0.63671875, + "learning_rate": 0.0008461990595611286, + "loss": 2.587, + "step": 985 + }, + { + "epoch": 0.18587991328117637, + "grad_norm": 0.69140625, + "learning_rate": 0.0008460031347962382, + "loss": 2.5719, + "step": 986 + }, + { + "epoch": 0.18606843246300311, + "grad_norm": 0.68359375, + "learning_rate": 0.000845807210031348, + "loss": 2.6672, + "step": 987 + }, + { + "epoch": 0.18625695164482986, + "grad_norm": 0.6640625, + "learning_rate": 0.0008456112852664577, + "loss": 2.6857, + "step": 988 + }, + { + "epoch": 0.1864454708266566, + "grad_norm": 0.6796875, + "learning_rate": 0.0008454153605015674, + "loss": 2.6708, + "step": 989 + }, + { + "epoch": 0.18663399000848335, + "grad_norm": 0.69921875, + "learning_rate": 0.0008452194357366771, + "loss": 2.6183, + "step": 990 + }, + { + "epoch": 0.18682250919031013, + "grad_norm": 0.66015625, + "learning_rate": 0.0008450235109717869, + "loss": 2.6452, + "step": 991 + }, + { + "epoch": 0.18701102837213687, + "grad_norm": 0.7421875, + "learning_rate": 0.0008448275862068966, + "loss": 2.704, + "step": 992 + }, + { + "epoch": 0.18719954755396362, + "grad_norm": 0.68359375, + "learning_rate": 0.0008446316614420063, + "loss": 2.674, + "step": 993 + }, + { + "epoch": 0.18738806673579037, + "grad_norm": 0.6953125, + "learning_rate": 0.000844435736677116, + "loss": 2.7137, + "step": 994 + }, + { + "epoch": 0.1875765859176171, + "grad_norm": 0.66015625, + "learning_rate": 0.0008442398119122258, + "loss": 2.7582, + "step": 995 + }, + { + "epoch": 0.18776510509944386, + "grad_norm": 0.69140625, + "learning_rate": 0.0008440438871473355, + "loss": 2.6009, + "step": 996 + }, + { + "epoch": 0.18795362428127063, + "grad_norm": 0.71875, + "learning_rate": 0.0008438479623824452, + "loss": 2.6595, + "step": 997 + }, + { + "epoch": 0.18814214346309738, + "grad_norm": 0.66015625, + "learning_rate": 0.0008436520376175549, + "loss": 2.552, + "step": 998 + }, + { + "epoch": 0.18833066264492412, + "grad_norm": 0.68359375, + "learning_rate": 0.0008434561128526645, + "loss": 2.5806, + "step": 999 + }, + { + "epoch": 0.18851918182675087, + "grad_norm": 0.66796875, + "learning_rate": 0.0008432601880877743, + "loss": 2.6469, + "step": 1000 + }, + { + "epoch": 0.18851918182675087, + "eval_runtime": 16.219, + "eval_samples_per_second": 63.136, + "eval_steps_per_second": 1.973, + "step": 1000 + }, + { + "epoch": 0.18851918182675087, + "eval/hellaswag_acc": 0.3743278231428002, + "eval/hellaswag_acc_norm": 0.4706233817964549, + "eval_hellaswag_elapsed_time": 116.27660393714905, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 5304, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.325965577388032e+18, + "train_batch_size": 12, + "trial_name": null, + "trial_params": null +}