{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 200, "global_step": 491, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002036659877800407, "grad_norm": 42.73765709178892, "learning_rate": 4.0000000000000003e-07, "loss": 4.2137, "step": 1 }, { "epoch": 0.010183299389002037, "grad_norm": 43.823126465212304, "learning_rate": 2.0000000000000003e-06, "loss": 4.2378, "step": 5 }, { "epoch": 0.020366598778004074, "grad_norm": 42.78292182490666, "learning_rate": 4.000000000000001e-06, "loss": 4.1532, "step": 10 }, { "epoch": 0.03054989816700611, "grad_norm": 26.956865206535298, "learning_rate": 6e-06, "loss": 3.7244, "step": 15 }, { "epoch": 0.04073319755600815, "grad_norm": 12.021358068486181, "learning_rate": 8.000000000000001e-06, "loss": 3.0576, "step": 20 }, { "epoch": 0.05091649694501019, "grad_norm": 10.364947442111331, "learning_rate": 1e-05, "loss": 2.5815, "step": 25 }, { "epoch": 0.06109979633401222, "grad_norm": 4.530399924565578, "learning_rate": 1.2e-05, "loss": 2.2026, "step": 30 }, { "epoch": 0.07128309572301425, "grad_norm": 3.2260177605870783, "learning_rate": 1.4e-05, "loss": 1.9904, "step": 35 }, { "epoch": 0.0814663951120163, "grad_norm": 2.12005760250828, "learning_rate": 1.6000000000000003e-05, "loss": 1.8236, "step": 40 }, { "epoch": 0.09164969450101833, "grad_norm": 1.8468068964995024, "learning_rate": 1.8e-05, "loss": 1.6897, "step": 45 }, { "epoch": 0.10183299389002037, "grad_norm": 1.5429747637898503, "learning_rate": 2e-05, "loss": 1.6026, "step": 50 }, { "epoch": 0.1120162932790224, "grad_norm": 1.1896025193659396, "learning_rate": 1.9993657117550972e-05, "loss": 1.5431, "step": 55 }, { "epoch": 0.12219959266802444, "grad_norm": 1.0406079144844387, "learning_rate": 1.9974636516635436e-05, "loss": 1.4948, "step": 60 }, { "epoch": 0.13238289205702647, "grad_norm": 0.9826731697259301, "learning_rate": 1.994296232634054e-05, "loss": 1.4551, "step": 65 }, { "epoch": 0.1425661914460285, "grad_norm": 0.8350404902447894, "learning_rate": 1.9898674727799418e-05, "loss": 1.4548, "step": 70 }, { "epoch": 0.15274949083503056, "grad_norm": 0.8022790771564833, "learning_rate": 1.9841829903218377e-05, "loss": 1.45, "step": 75 }, { "epoch": 0.1629327902240326, "grad_norm": 0.8137274794499317, "learning_rate": 1.977249996460544e-05, "loss": 1.4206, "step": 80 }, { "epoch": 0.17311608961303462, "grad_norm": 0.6702892273400702, "learning_rate": 1.969077286229078e-05, "loss": 1.4068, "step": 85 }, { "epoch": 0.18329938900203666, "grad_norm": 0.6712397854919697, "learning_rate": 1.959675227335497e-05, "loss": 1.4027, "step": 90 }, { "epoch": 0.1934826883910387, "grad_norm": 0.6017420494268232, "learning_rate": 1.949055747010669e-05, "loss": 1.4036, "step": 95 }, { "epoch": 0.20366598778004075, "grad_norm": 0.5946907262324206, "learning_rate": 1.937232316877668e-05, "loss": 1.4021, "step": 100 }, { "epoch": 0.21384928716904278, "grad_norm": 0.584049807273413, "learning_rate": 1.9242199358619897e-05, "loss": 1.3773, "step": 105 }, { "epoch": 0.2240325865580448, "grad_norm": 0.5468171068137498, "learning_rate": 1.9100351111642666e-05, "loss": 1.3824, "step": 110 }, { "epoch": 0.23421588594704684, "grad_norm": 0.5631810496676463, "learning_rate": 1.894695837319623e-05, "loss": 1.3716, "step": 115 }, { "epoch": 0.24439918533604887, "grad_norm": 0.5579171191759417, "learning_rate": 1.8782215733702286e-05, "loss": 1.3722, "step": 120 }, { "epoch": 0.2545824847250509, "grad_norm": 0.5126218869264142, "learning_rate": 1.8606332181800165e-05, "loss": 1.3523, "step": 125 }, { "epoch": 0.26476578411405294, "grad_norm": 0.5007973223598261, "learning_rate": 1.841953083922875e-05, "loss": 1.3451, "step": 130 }, { "epoch": 0.27494908350305497, "grad_norm": 0.5048172409656928, "learning_rate": 1.8222048677779495e-05, "loss": 1.3534, "step": 135 }, { "epoch": 0.285132382892057, "grad_norm": 0.4725169108639936, "learning_rate": 1.8014136218679566e-05, "loss": 1.3473, "step": 140 }, { "epoch": 0.2953156822810591, "grad_norm": 0.48611682020845765, "learning_rate": 1.779605721478652e-05, "loss": 1.3373, "step": 145 }, { "epoch": 0.3054989816700611, "grad_norm": 0.45300491076813515, "learning_rate": 1.756808831599762e-05, "loss": 1.3513, "step": 150 }, { "epoch": 0.31568228105906315, "grad_norm": 0.4382778646436873, "learning_rate": 1.7330518718298263e-05, "loss": 1.3342, "step": 155 }, { "epoch": 0.3258655804480652, "grad_norm": 0.4427941208449195, "learning_rate": 1.7083649796894798e-05, "loss": 1.3162, "step": 160 }, { "epoch": 0.3360488798370672, "grad_norm": 0.416296697128989, "learning_rate": 1.6827794723896968e-05, "loss": 1.3468, "step": 165 }, { "epoch": 0.34623217922606925, "grad_norm": 0.43327582549066207, "learning_rate": 1.6563278071035182e-05, "loss": 1.3413, "step": 170 }, { "epoch": 0.3564154786150713, "grad_norm": 0.39987863733708096, "learning_rate": 1.6290435397916426e-05, "loss": 1.3275, "step": 175 }, { "epoch": 0.3665987780040733, "grad_norm": 0.4107034128984309, "learning_rate": 1.6009612826341226e-05, "loss": 1.3165, "step": 180 }, { "epoch": 0.37678207739307534, "grad_norm": 0.39958981244827124, "learning_rate": 1.5721166601221697e-05, "loss": 1.3238, "step": 185 }, { "epoch": 0.3869653767820774, "grad_norm": 0.40620805351711664, "learning_rate": 1.5425462638657597e-05, "loss": 1.3265, "step": 190 }, { "epoch": 0.3971486761710794, "grad_norm": 0.3930588952515658, "learning_rate": 1.5122876061743772e-05, "loss": 1.3346, "step": 195 }, { "epoch": 0.4073319755600815, "grad_norm": 0.3455411843725847, "learning_rate": 1.4813790724697832e-05, "loss": 1.2996, "step": 200 }, { "epoch": 0.4073319755600815, "eval_loss": 1.3280781507492065, "eval_runtime": 58.3849, "eval_samples_per_second": 238.195, "eval_steps_per_second": 3.734, "step": 200 }, { "epoch": 0.4175152749490835, "grad_norm": 0.38779728978307215, "learning_rate": 1.4498598725911693e-05, "loss": 1.3155, "step": 205 }, { "epoch": 0.42769857433808556, "grad_norm": 0.34758190751014123, "learning_rate": 1.4177699910544793e-05, "loss": 1.3226, "step": 210 }, { "epoch": 0.4378818737270876, "grad_norm": 0.347052132116147, "learning_rate": 1.3851501363289907e-05, "loss": 1.3042, "step": 215 }, { "epoch": 0.4480651731160896, "grad_norm": 0.35329848914934636, "learning_rate": 1.3520416891955101e-05, "loss": 1.3198, "step": 220 }, { "epoch": 0.45824847250509165, "grad_norm": 0.3689489097870586, "learning_rate": 1.3184866502516846e-05, "loss": 1.3295, "step": 225 }, { "epoch": 0.4684317718940937, "grad_norm": 0.3591427089290735, "learning_rate": 1.2845275866310325e-05, "loss": 1.3026, "step": 230 }, { "epoch": 0.4786150712830957, "grad_norm": 0.33192490822778736, "learning_rate": 1.2502075780032792e-05, "loss": 1.3211, "step": 235 }, { "epoch": 0.48879837067209775, "grad_norm": 0.3314637232326215, "learning_rate": 1.2155701619244997e-05, "loss": 1.3146, "step": 240 }, { "epoch": 0.4989816700610998, "grad_norm": 0.33428121648270825, "learning_rate": 1.1806592786063991e-05, "loss": 1.2953, "step": 245 }, { "epoch": 0.5091649694501018, "grad_norm": 0.33140728066789105, "learning_rate": 1.1455192151747931e-05, "loss": 1.3083, "step": 250 }, { "epoch": 0.5193482688391039, "grad_norm": 0.34894619572153845, "learning_rate": 1.1101945494880013e-05, "loss": 1.3001, "step": 255 }, { "epoch": 0.5295315682281059, "grad_norm": 0.2936066479920224, "learning_rate": 1.0747300935864245e-05, "loss": 1.299, "step": 260 }, { "epoch": 0.539714867617108, "grad_norm": 0.34761064821495535, "learning_rate": 1.0391708368450429e-05, "loss": 1.2987, "step": 265 }, { "epoch": 0.5498981670061099, "grad_norm": 0.32443077572814916, "learning_rate": 1.0035618889009535e-05, "loss": 1.3154, "step": 270 }, { "epoch": 0.560081466395112, "grad_norm": 0.3573334196924316, "learning_rate": 9.67948422428345e-06, "loss": 1.3045, "step": 275 }, { "epoch": 0.570264765784114, "grad_norm": 0.2929859629265363, "learning_rate": 9.323756158335054e-06, "loss": 1.2995, "step": 280 }, { "epoch": 0.5804480651731161, "grad_norm": 0.3725484551984738, "learning_rate": 8.968885959425567e-06, "loss": 1.3006, "step": 285 }, { "epoch": 0.5906313645621182, "grad_norm": 0.2977272122499357, "learning_rate": 8.615323807546258e-06, "loss": 1.2785, "step": 290 }, { "epoch": 0.6008146639511202, "grad_norm": 0.2934843742212814, "learning_rate": 8.263518223330698e-06, "loss": 1.301, "step": 295 }, { "epoch": 0.6109979633401222, "grad_norm": 0.2851617487482263, "learning_rate": 7.913915499071994e-06, "loss": 1.2957, "step": 300 }, { "epoch": 0.6211812627291242, "grad_norm": 0.3280712460992494, "learning_rate": 7.566959132566914e-06, "loss": 1.3021, "step": 305 }, { "epoch": 0.6313645621181263, "grad_norm": 0.28660010388812523, "learning_rate": 7.223089264505001e-06, "loss": 1.2809, "step": 310 }, { "epoch": 0.6415478615071283, "grad_norm": 0.29055149959882093, "learning_rate": 6.882742120116419e-06, "loss": 1.2958, "step": 315 }, { "epoch": 0.6517311608961304, "grad_norm": 0.2868148780704892, "learning_rate": 6.546349455786926e-06, "loss": 1.2931, "step": 320 }, { "epoch": 0.6619144602851323, "grad_norm": 0.3021008995725944, "learning_rate": 6.214338011341825e-06, "loss": 1.2855, "step": 325 }, { "epoch": 0.6720977596741344, "grad_norm": 0.2840488188778215, "learning_rate": 5.887128968693887e-06, "loss": 1.3091, "step": 330 }, { "epoch": 0.6822810590631364, "grad_norm": 0.2978772611216691, "learning_rate": 5.565137417541866e-06, "loss": 1.2826, "step": 335 }, { "epoch": 0.6924643584521385, "grad_norm": 0.28796550430490037, "learning_rate": 5.248771828797474e-06, "loss": 1.3008, "step": 340 }, { "epoch": 0.7026476578411406, "grad_norm": 0.2791511099905026, "learning_rate": 4.938433536408771e-06, "loss": 1.2848, "step": 345 }, { "epoch": 0.7128309572301426, "grad_norm": 0.282696156100298, "learning_rate": 4.634516228237372e-06, "loss": 1.291, "step": 350 }, { "epoch": 0.7230142566191446, "grad_norm": 0.27007483942462107, "learning_rate": 4.337405446635264e-06, "loss": 1.2821, "step": 355 }, { "epoch": 0.7331975560081466, "grad_norm": 0.2885536779707833, "learning_rate": 4.047478099354857e-06, "loss": 1.3193, "step": 360 }, { "epoch": 0.7433808553971487, "grad_norm": 0.3194820894463116, "learning_rate": 3.7651019814126656e-06, "loss": 1.2993, "step": 365 }, { "epoch": 0.7535641547861507, "grad_norm": 0.28463406586484574, "learning_rate": 3.4906353085131917e-06, "loss": 1.297, "step": 370 }, { "epoch": 0.7637474541751528, "grad_norm": 0.2887353591273086, "learning_rate": 3.224426262624908e-06, "loss": 1.2887, "step": 375 }, { "epoch": 0.7739307535641547, "grad_norm": 0.28334983310000655, "learning_rate": 2.9668125502848035e-06, "loss": 1.2951, "step": 380 }, { "epoch": 0.7841140529531568, "grad_norm": 0.25257521474138683, "learning_rate": 2.7181209741918093e-06, "loss": 1.2937, "step": 385 }, { "epoch": 0.7942973523421588, "grad_norm": 0.2871884940474984, "learning_rate": 2.478667018632562e-06, "loss": 1.2858, "step": 390 }, { "epoch": 0.8044806517311609, "grad_norm": 0.25758574412430923, "learning_rate": 2.2487544492654832e-06, "loss": 1.3027, "step": 395 }, { "epoch": 0.814663951120163, "grad_norm": 0.2840318008095242, "learning_rate": 2.0286749277707783e-06, "loss": 1.2961, "step": 400 }, { "epoch": 0.814663951120163, "eval_loss": 1.297537922859192, "eval_runtime": 58.3016, "eval_samples_per_second": 238.536, "eval_steps_per_second": 3.739, "step": 400 }, { "epoch": 0.824847250509165, "grad_norm": 0.3101944505956513, "learning_rate": 1.8187076418552974e-06, "loss": 1.3014, "step": 405 }, { "epoch": 0.835030549898167, "grad_norm": 0.2791695231780548, "learning_rate": 1.6191189510815942e-06, "loss": 1.2931, "step": 410 }, { "epoch": 0.845213849287169, "grad_norm": 0.28415893155261507, "learning_rate": 1.4301620489704072e-06, "loss": 1.2988, "step": 415 }, { "epoch": 0.8553971486761711, "grad_norm": 0.2822535369305096, "learning_rate": 1.2520766418053408e-06, "loss": 1.2905, "step": 420 }, { "epoch": 0.8655804480651731, "grad_norm": 0.28168468277135167, "learning_rate": 1.0850886445471055e-06, "loss": 1.2817, "step": 425 }, { "epoch": 0.8757637474541752, "grad_norm": 0.2655425105216882, "learning_rate": 9.294098942430996e-07, "loss": 1.2843, "step": 430 }, { "epoch": 0.8859470468431772, "grad_norm": 0.2741436677332839, "learning_rate": 7.852378812959227e-07, "loss": 1.2924, "step": 435 }, { "epoch": 0.8961303462321792, "grad_norm": 0.26895584158358093, "learning_rate": 6.527554989316898e-07, "loss": 1.2935, "step": 440 }, { "epoch": 0.9063136456211812, "grad_norm": 0.266002518787797, "learning_rate": 5.321308111859791e-07, "loss": 1.2857, "step": 445 }, { "epoch": 0.9164969450101833, "grad_norm": 0.2650609910106162, "learning_rate": 4.235168397017542e-07, "loss": 1.2664, "step": 450 }, { "epoch": 0.9266802443991853, "grad_norm": 0.2647562856371479, "learning_rate": 3.2705136960970554e-07, "loss": 1.2988, "step": 455 }, { "epoch": 0.9368635437881874, "grad_norm": 0.29894331322023837, "learning_rate": 2.4285677473727123e-07, "loss": 1.2744, "step": 460 }, { "epoch": 0.9470468431771895, "grad_norm": 0.2719080220558317, "learning_rate": 1.7103986236807312e-07, "loss": 1.2803, "step": 465 }, { "epoch": 0.9572301425661914, "grad_norm": 0.2791229463614563, "learning_rate": 1.1169173774871478e-07, "loss": 1.2906, "step": 470 }, { "epoch": 0.9674134419551935, "grad_norm": 0.2987546290973262, "learning_rate": 6.488768851480087e-08, "loss": 1.2884, "step": 475 }, { "epoch": 0.9775967413441955, "grad_norm": 0.30009885175708245, "learning_rate": 3.0687089182819264e-08, "loss": 1.3051, "step": 480 }, { "epoch": 0.9877800407331976, "grad_norm": 0.2873283513110056, "learning_rate": 9.13332582901716e-09, "loss": 1.286, "step": 485 }, { "epoch": 0.9979633401221996, "grad_norm": 0.272339006461075, "learning_rate": 2.5374105085518297e-10, "loss": 1.2873, "step": 490 }, { "epoch": 1.0, "step": 491, "total_flos": 154920007237632.0, "train_loss": 1.4646558688760047, "train_runtime": 2127.1747, "train_samples_per_second": 59.07, "train_steps_per_second": 0.231 } ], "logging_steps": 5, "max_steps": 491, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 154920007237632.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }