{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9683794466403162, "eval_steps": 16, "global_step": 126, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 0.17573034763336182, "learning_rate": 2e-05, "loss": 2.0307, "step": 1 }, { "epoch": 0.02, "eval_loss": 2.0314464569091797, "eval_runtime": 589.9959, "eval_samples_per_second": 0.351, "eval_steps_per_second": 0.351, "step": 1 }, { "epoch": 0.03, "grad_norm": 0.1727771908044815, "learning_rate": 4e-05, "loss": 2.1265, "step": 2 }, { "epoch": 0.05, "grad_norm": 0.2174711972475052, "learning_rate": 6e-05, "loss": 2.0362, "step": 3 }, { "epoch": 0.06, "grad_norm": 0.15978679060935974, "learning_rate": 8e-05, "loss": 2.1557, "step": 4 }, { "epoch": 0.08, "grad_norm": 0.1518234759569168, "learning_rate": 0.0001, "loss": 1.9664, "step": 5 }, { "epoch": 0.09, "grad_norm": 0.1279090940952301, "learning_rate": 0.00012, "loss": 1.9584, "step": 6 }, { "epoch": 0.11, "grad_norm": 0.14364583790302277, "learning_rate": 0.00014, "loss": 2.0269, "step": 7 }, { "epoch": 0.13, "grad_norm": 0.18047624826431274, "learning_rate": 0.00016, "loss": 1.9699, "step": 8 }, { "epoch": 0.14, "grad_norm": 0.1313191056251526, "learning_rate": 0.00018, "loss": 1.93, "step": 9 }, { "epoch": 0.16, "grad_norm": 0.14072288572788239, "learning_rate": 0.0002, "loss": 1.9375, "step": 10 }, { "epoch": 0.17, "grad_norm": 0.13216689229011536, "learning_rate": 0.00019999871623526481, "loss": 2.0861, "step": 11 }, { "epoch": 0.19, "grad_norm": 0.12376871705055237, "learning_rate": 0.00019999486497402038, "loss": 2.0024, "step": 12 }, { "epoch": 0.21, "grad_norm": 0.11119924485683441, "learning_rate": 0.00019998844631514886, "loss": 2.0243, "step": 13 }, { "epoch": 0.22, "grad_norm": 0.11198648810386658, "learning_rate": 0.00019997946042345127, "loss": 1.9499, "step": 14 }, { "epoch": 0.24, "grad_norm": 0.11798805743455887, "learning_rate": 0.00019996790752964305, "loss": 2.0469, "step": 15 }, { "epoch": 0.25, "grad_norm": 0.12525062263011932, "learning_rate": 0.00019995378793034814, "loss": 2.0982, "step": 16 }, { "epoch": 0.25, "eval_loss": 1.9930143356323242, "eval_runtime": 587.3154, "eval_samples_per_second": 0.352, "eval_steps_per_second": 0.352, "step": 16 }, { "epoch": 0.27, "grad_norm": 0.1235467791557312, "learning_rate": 0.00019993710198809136, "loss": 2.0232, "step": 17 }, { "epoch": 0.28, "grad_norm": 0.10296319425106049, "learning_rate": 0.00019991785013128923, "loss": 1.996, "step": 18 }, { "epoch": 0.3, "grad_norm": 0.10613357275724411, "learning_rate": 0.00019989603285423889, "loss": 2.0909, "step": 19 }, { "epoch": 0.32, "grad_norm": 0.1147119328379631, "learning_rate": 0.00019987165071710527, "loss": 1.9893, "step": 20 }, { "epoch": 0.33, "grad_norm": 0.1092303916811943, "learning_rate": 0.00019984470434590703, "loss": 1.9745, "step": 21 }, { "epoch": 0.35, "grad_norm": 0.13837751746177673, "learning_rate": 0.0001998151944325001, "loss": 2.0544, "step": 22 }, { "epoch": 0.36, "grad_norm": 0.12572677433490753, "learning_rate": 0.00019978312173456027, "loss": 1.9235, "step": 23 }, { "epoch": 0.38, "grad_norm": 0.11084165424108505, "learning_rate": 0.00019974848707556345, "loss": 2.0651, "step": 24 }, { "epoch": 0.4, "grad_norm": 0.10701000690460205, "learning_rate": 0.00019971129134476473, "loss": 1.9667, "step": 25 }, { "epoch": 0.41, "grad_norm": 0.1185716763138771, "learning_rate": 0.00019967153549717553, "loss": 1.9993, "step": 26 }, { "epoch": 0.43, "grad_norm": 0.14391109347343445, "learning_rate": 0.0001996292205535389, "loss": 2.0942, "step": 27 }, { "epoch": 0.44, "grad_norm": 0.11472482979297638, "learning_rate": 0.00019958434760030346, "loss": 2.0754, "step": 28 }, { "epoch": 0.46, "grad_norm": 0.09692487865686417, "learning_rate": 0.00019953691778959557, "loss": 2.0101, "step": 29 }, { "epoch": 0.47, "grad_norm": 0.10973215848207474, "learning_rate": 0.00019948693233918952, "loss": 2.0088, "step": 30 }, { "epoch": 0.49, "grad_norm": 0.10801681876182556, "learning_rate": 0.00019943439253247656, "loss": 2.0177, "step": 31 }, { "epoch": 0.51, "grad_norm": 0.11581862717866898, "learning_rate": 0.00019937929971843165, "loss": 1.8967, "step": 32 }, { "epoch": 0.51, "eval_loss": 1.9818447828292847, "eval_runtime": 587.0079, "eval_samples_per_second": 0.353, "eval_steps_per_second": 0.353, "step": 32 }, { "epoch": 0.52, "grad_norm": 0.11895039677619934, "learning_rate": 0.0001993216553115791, "loss": 1.9273, "step": 33 }, { "epoch": 0.54, "grad_norm": 0.12135493755340576, "learning_rate": 0.00019926146079195594, "loss": 2.0167, "step": 34 }, { "epoch": 0.55, "grad_norm": 0.1046937108039856, "learning_rate": 0.0001991987177050743, "loss": 2.0068, "step": 35 }, { "epoch": 0.57, "grad_norm": 0.11556221544742584, "learning_rate": 0.00019913342766188138, "loss": 2.157, "step": 36 }, { "epoch": 0.58, "grad_norm": 0.10031617432832718, "learning_rate": 0.00019906559233871828, "loss": 2.0234, "step": 37 }, { "epoch": 0.6, "grad_norm": 0.11878479272127151, "learning_rate": 0.0001989952134772769, "loss": 1.9956, "step": 38 }, { "epoch": 0.62, "grad_norm": 0.10510522127151489, "learning_rate": 0.00019892229288455532, "loss": 2.1002, "step": 39 }, { "epoch": 0.63, "grad_norm": 0.09501214325428009, "learning_rate": 0.00019884683243281116, "loss": 1.8809, "step": 40 }, { "epoch": 0.65, "grad_norm": 0.09750509262084961, "learning_rate": 0.00019876883405951377, "loss": 2.1677, "step": 41 }, { "epoch": 0.66, "grad_norm": 0.10707078874111176, "learning_rate": 0.00019868829976729443, "loss": 1.9349, "step": 42 }, { "epoch": 0.68, "grad_norm": 0.11763511598110199, "learning_rate": 0.00019860523162389476, "loss": 2.0093, "step": 43 }, { "epoch": 0.7, "grad_norm": 0.09981624782085419, "learning_rate": 0.00019851963176211387, "loss": 2.003, "step": 44 }, { "epoch": 0.71, "grad_norm": 0.10791173577308655, "learning_rate": 0.00019843150237975344, "loss": 2.0147, "step": 45 }, { "epoch": 0.73, "grad_norm": 0.10654355585575104, "learning_rate": 0.00019834084573956128, "loss": 2.0423, "step": 46 }, { "epoch": 0.74, "grad_norm": 0.11113700270652771, "learning_rate": 0.00019824766416917338, "loss": 1.9396, "step": 47 }, { "epoch": 0.76, "grad_norm": 0.11194361746311188, "learning_rate": 0.00019815196006105402, "loss": 1.9726, "step": 48 }, { "epoch": 0.76, "eval_loss": 1.9757568836212158, "eval_runtime": 586.7896, "eval_samples_per_second": 0.353, "eval_steps_per_second": 0.353, "step": 48 }, { "epoch": 0.77, "grad_norm": 0.10718481242656708, "learning_rate": 0.0001980537358724344, "loss": 1.974, "step": 49 }, { "epoch": 0.79, "grad_norm": 0.1139945313334465, "learning_rate": 0.00019795299412524945, "loss": 2.0978, "step": 50 }, { "epoch": 0.81, "grad_norm": 0.11229316890239716, "learning_rate": 0.0001978497374060733, "loss": 2.0419, "step": 51 }, { "epoch": 0.82, "grad_norm": 0.10654540359973907, "learning_rate": 0.00019774396836605255, "loss": 1.9337, "step": 52 }, { "epoch": 0.84, "grad_norm": 0.09649579972028732, "learning_rate": 0.00019763568972083856, "loss": 2.0043, "step": 53 }, { "epoch": 0.85, "grad_norm": 0.10226842761039734, "learning_rate": 0.00019752490425051743, "loss": 1.9678, "step": 54 }, { "epoch": 0.87, "grad_norm": 0.10110396891832352, "learning_rate": 0.0001974116147995387, "loss": 2.0006, "step": 55 }, { "epoch": 0.89, "grad_norm": 0.10171829164028168, "learning_rate": 0.0001972958242766425, "loss": 1.9753, "step": 56 }, { "epoch": 0.9, "grad_norm": 0.1117730587720871, "learning_rate": 0.0001971775356547846, "loss": 1.9982, "step": 57 }, { "epoch": 0.92, "grad_norm": 0.15865837037563324, "learning_rate": 0.00019705675197106016, "loss": 2.0222, "step": 58 }, { "epoch": 0.93, "grad_norm": 0.10181812196969986, "learning_rate": 0.00019693347632662595, "loss": 2.0899, "step": 59 }, { "epoch": 0.95, "grad_norm": 0.13255544006824493, "learning_rate": 0.00019680771188662044, "loss": 2.0191, "step": 60 }, { "epoch": 0.96, "grad_norm": 0.8338626623153687, "learning_rate": 0.0001966794618800827, "loss": 2.0432, "step": 61 }, { "epoch": 0.98, "grad_norm": 0.11790451407432556, "learning_rate": 0.00019654872959986937, "loss": 2.0005, "step": 62 }, { "epoch": 1.0, "grad_norm": 0.12201216071844101, "learning_rate": 0.00019641551840257035, "loss": 1.9949, "step": 63 }, { "epoch": 1.01, "grad_norm": 0.10102586448192596, "learning_rate": 0.00019627983170842234, "loss": 2.0108, "step": 64 }, { "epoch": 1.01, "eval_loss": 1.9710921049118042, "eval_runtime": 588.7491, "eval_samples_per_second": 0.352, "eval_steps_per_second": 0.352, "step": 64 }, { "epoch": 1.0, "grad_norm": 0.14357174932956696, "learning_rate": 0.00019614167300122126, "loss": 1.7736, "step": 65 }, { "epoch": 1.02, "grad_norm": 0.0991039052605629, "learning_rate": 0.0001960010458282326, "loss": 1.9889, "step": 66 }, { "epoch": 1.04, "grad_norm": 0.10566934943199158, "learning_rate": 0.00019585795380010044, "loss": 1.8411, "step": 67 }, { "epoch": 1.05, "grad_norm": 0.12068266421556473, "learning_rate": 0.0001957124005907548, "loss": 1.9175, "step": 68 }, { "epoch": 1.07, "grad_norm": 0.10454852879047394, "learning_rate": 0.00019556438993731726, "loss": 1.9158, "step": 69 }, { "epoch": 1.08, "grad_norm": 0.11026794463396072, "learning_rate": 0.00019541392564000488, "loss": 1.9053, "step": 70 }, { "epoch": 1.1, "grad_norm": 0.11682567000389099, "learning_rate": 0.00019526101156203295, "loss": 1.9778, "step": 71 }, { "epoch": 1.11, "grad_norm": 0.11718857288360596, "learning_rate": 0.00019510565162951537, "loss": 1.9957, "step": 72 }, { "epoch": 1.13, "grad_norm": 0.11099609732627869, "learning_rate": 0.00019494784983136425, "loss": 1.8909, "step": 73 }, { "epoch": 1.15, "grad_norm": 0.11991846561431885, "learning_rate": 0.00019478761021918728, "loss": 1.9644, "step": 74 }, { "epoch": 1.16, "grad_norm": 0.12009608745574951, "learning_rate": 0.0001946249369071837, "loss": 2.0168, "step": 75 }, { "epoch": 1.18, "grad_norm": 0.1274980902671814, "learning_rate": 0.00019445983407203872, "loss": 1.7852, "step": 76 }, { "epoch": 1.19, "grad_norm": 0.12307930737733841, "learning_rate": 0.00019429230595281632, "loss": 1.9185, "step": 77 }, { "epoch": 1.21, "grad_norm": 0.1239650622010231, "learning_rate": 0.00019412235685085035, "loss": 1.9912, "step": 78 }, { "epoch": 1.23, "grad_norm": 0.12378279119729996, "learning_rate": 0.00019394999112963402, "loss": 1.8249, "step": 79 }, { "epoch": 1.24, "grad_norm": 0.1403149962425232, "learning_rate": 0.00019377521321470805, "loss": 2.01, "step": 80 }, { "epoch": 1.24, "eval_loss": 1.9759098291397095, "eval_runtime": 587.469, "eval_samples_per_second": 0.352, "eval_steps_per_second": 0.352, "step": 80 }, { "epoch": 1.26, "grad_norm": 0.12876343727111816, "learning_rate": 0.00019359802759354695, "loss": 1.9961, "step": 81 }, { "epoch": 1.27, "grad_norm": 0.1330948770046234, "learning_rate": 0.00019341843881544372, "loss": 1.9642, "step": 82 }, { "epoch": 1.29, "grad_norm": 0.13431289792060852, "learning_rate": 0.00019323645149139319, "loss": 1.9867, "step": 83 }, { "epoch": 1.3, "grad_norm": 0.13692668080329895, "learning_rate": 0.00019305207029397348, "loss": 1.9685, "step": 84 }, { "epoch": 1.32, "grad_norm": 0.13141922652721405, "learning_rate": 0.00019286529995722623, "loss": 1.9022, "step": 85 }, { "epoch": 1.34, "grad_norm": 0.13360707461833954, "learning_rate": 0.00019267614527653488, "loss": 1.9811, "step": 86 }, { "epoch": 1.35, "grad_norm": 0.1410575956106186, "learning_rate": 0.00019248461110850157, "loss": 2.0276, "step": 87 }, { "epoch": 1.37, "grad_norm": 0.1281205415725708, "learning_rate": 0.00019229070237082252, "loss": 1.8166, "step": 88 }, { "epoch": 1.38, "grad_norm": 0.31853872537612915, "learning_rate": 0.0001920944240421617, "loss": 1.9699, "step": 89 }, { "epoch": 1.4, "grad_norm": 0.1361798644065857, "learning_rate": 0.00019189578116202307, "loss": 1.9419, "step": 90 }, { "epoch": 1.42, "grad_norm": 0.1369122415781021, "learning_rate": 0.0001916947788306211, "loss": 2.0005, "step": 91 }, { "epoch": 1.43, "grad_norm": 0.14621175825595856, "learning_rate": 0.0001914914222087499, "loss": 1.904, "step": 92 }, { "epoch": 1.45, "grad_norm": 0.1483631581068039, "learning_rate": 0.0001912857165176507, "loss": 1.9783, "step": 93 }, { "epoch": 1.46, "grad_norm": 0.15535128116607666, "learning_rate": 0.00019107766703887764, "loss": 1.9242, "step": 94 }, { "epoch": 1.48, "grad_norm": 0.1497003585100174, "learning_rate": 0.0001908672791141625, "loss": 1.9094, "step": 95 }, { "epoch": 1.49, "grad_norm": 0.14517174661159515, "learning_rate": 0.00019065455814527718, "loss": 2.0594, "step": 96 }, { "epoch": 1.49, "eval_loss": 1.977068305015564, "eval_runtime": 587.2535, "eval_samples_per_second": 0.352, "eval_steps_per_second": 0.352, "step": 96 }, { "epoch": 1.51, "grad_norm": 0.149668887257576, "learning_rate": 0.0001904395095938953, "loss": 1.8582, "step": 97 }, { "epoch": 1.53, "grad_norm": 0.14095304906368256, "learning_rate": 0.00019022213898145176, "loss": 1.8937, "step": 98 }, { "epoch": 1.54, "grad_norm": 0.14488399028778076, "learning_rate": 0.00019000245188900111, "loss": 1.8799, "step": 99 }, { "epoch": 1.56, "grad_norm": 0.1409035623073578, "learning_rate": 0.00018978045395707418, "loss": 1.9641, "step": 100 }, { "epoch": 1.57, "grad_norm": 0.15016290545463562, "learning_rate": 0.0001895561508855333, "loss": 1.9878, "step": 101 }, { "epoch": 1.59, "grad_norm": 0.14171820878982544, "learning_rate": 0.00018932954843342591, "loss": 1.9328, "step": 102 }, { "epoch": 1.6, "grad_norm": 0.1357404738664627, "learning_rate": 0.0001891006524188368, "loss": 1.9264, "step": 103 }, { "epoch": 1.62, "grad_norm": 0.13309913873672485, "learning_rate": 0.00018886946871873856, "loss": 1.9505, "step": 104 }, { "epoch": 1.64, "grad_norm": 0.13994307816028595, "learning_rate": 0.00018863600326884082, "loss": 2.0543, "step": 105 }, { "epoch": 1.65, "grad_norm": 0.147433340549469, "learning_rate": 0.00018840026206343784, "loss": 2.0231, "step": 106 }, { "epoch": 1.67, "grad_norm": 0.13546781241893768, "learning_rate": 0.00018816225115525454, "loss": 1.8892, "step": 107 }, { "epoch": 1.68, "grad_norm": 0.13406571745872498, "learning_rate": 0.0001879219766552911, "loss": 1.8828, "step": 108 }, { "epoch": 1.7, "grad_norm": 0.14394783973693848, "learning_rate": 0.00018767944473266614, "loss": 1.938, "step": 109 }, { "epoch": 1.72, "grad_norm": 0.1423158198595047, "learning_rate": 0.00018743466161445823, "loss": 1.8268, "step": 110 }, { "epoch": 1.73, "grad_norm": 0.2636297345161438, "learning_rate": 0.00018718763358554607, "loss": 1.8625, "step": 111 }, { "epoch": 1.75, "grad_norm": 0.14624255895614624, "learning_rate": 0.0001869383669884471, "loss": 1.9392, "step": 112 }, { "epoch": 1.75, "eval_loss": 1.9756392240524292, "eval_runtime": 587.7368, "eval_samples_per_second": 0.352, "eval_steps_per_second": 0.352, "step": 112 }, { "epoch": 1.76, "grad_norm": 0.14748816192150116, "learning_rate": 0.0001866868682231547, "loss": 1.9757, "step": 113 }, { "epoch": 1.78, "grad_norm": 0.16464634239673615, "learning_rate": 0.00018643314374697378, "loss": 1.96, "step": 114 }, { "epoch": 1.79, "grad_norm": 0.1459386795759201, "learning_rate": 0.00018617720007435497, "loss": 1.9337, "step": 115 }, { "epoch": 1.81, "grad_norm": 0.1428644359111786, "learning_rate": 0.00018591904377672757, "loss": 1.7804, "step": 116 }, { "epoch": 1.83, "grad_norm": 0.14610743522644043, "learning_rate": 0.00018565868148233053, "loss": 1.9056, "step": 117 }, { "epoch": 1.84, "grad_norm": 0.14204370975494385, "learning_rate": 0.00018539611987604258, "loss": 1.97, "step": 118 }, { "epoch": 1.86, "grad_norm": 0.14234338700771332, "learning_rate": 0.00018513136569921023, "loss": 1.9025, "step": 119 }, { "epoch": 1.87, "grad_norm": 0.1486772745847702, "learning_rate": 0.00018486442574947511, "loss": 1.9135, "step": 120 }, { "epoch": 1.89, "grad_norm": 0.1450343132019043, "learning_rate": 0.000184595306880599, "loss": 1.8578, "step": 121 }, { "epoch": 1.91, "grad_norm": 0.140796959400177, "learning_rate": 0.00018432401600228823, "loss": 1.9178, "step": 122 }, { "epoch": 1.92, "grad_norm": 0.14083623886108398, "learning_rate": 0.00018405056008001603, "loss": 1.7776, "step": 123 }, { "epoch": 1.94, "grad_norm": 0.1653621792793274, "learning_rate": 0.00018377494613484378, "loss": 1.8844, "step": 124 }, { "epoch": 1.95, "grad_norm": 0.156838059425354, "learning_rate": 0.00018349718124324076, "loss": 1.9659, "step": 125 }, { "epoch": 1.97, "grad_norm": 0.14624595642089844, "learning_rate": 0.0001832172725369024, "loss": 1.9813, "step": 126 } ], "logging_steps": 1, "max_steps": 630, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 63, "total_flos": 7.12910688623788e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }