{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 99, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.030303030303030304, "grad_norm": 1.5466958284378052, "learning_rate": 2.0000000000000003e-06, "loss": 1.8566, "step": 1 }, { "epoch": 0.06060606060606061, "grad_norm": 1.5582852363586426, "learning_rate": 4.000000000000001e-06, "loss": 1.8976, "step": 2 }, { "epoch": 0.09090909090909091, "grad_norm": 1.524529218673706, "learning_rate": 6e-06, "loss": 1.7259, "step": 3 }, { "epoch": 0.12121212121212122, "grad_norm": 1.6974282264709473, "learning_rate": 8.000000000000001e-06, "loss": 1.814, "step": 4 }, { "epoch": 0.15151515151515152, "grad_norm": 1.4745838642120361, "learning_rate": 1e-05, "loss": 1.5887, "step": 5 }, { "epoch": 0.18181818181818182, "grad_norm": 1.522696852684021, "learning_rate": 1.2e-05, "loss": 1.692, "step": 6 }, { "epoch": 0.21212121212121213, "grad_norm": 1.4861280918121338, "learning_rate": 1.4e-05, "loss": 1.6883, "step": 7 }, { "epoch": 0.24242424242424243, "grad_norm": 1.441834807395935, "learning_rate": 1.6000000000000003e-05, "loss": 1.6722, "step": 8 }, { "epoch": 0.2727272727272727, "grad_norm": 1.633545160293579, "learning_rate": 1.8e-05, "loss": 1.6187, "step": 9 }, { "epoch": 0.30303030303030304, "grad_norm": 1.495406150817871, "learning_rate": 2e-05, "loss": 1.5615, "step": 10 }, { "epoch": 0.3333333333333333, "grad_norm": 1.5129448175430298, "learning_rate": 1.9993770622619784e-05, "loss": 1.6768, "step": 11 }, { "epoch": 0.36363636363636365, "grad_norm": 1.461745023727417, "learning_rate": 1.9975090251507637e-05, "loss": 1.7377, "step": 12 }, { "epoch": 0.3939393939393939, "grad_norm": 1.2962136268615723, "learning_rate": 1.9943982160079823e-05, "loss": 1.7039, "step": 13 }, { "epoch": 0.42424242424242425, "grad_norm": 1.5274053812026978, "learning_rate": 1.9900485105144544e-05, "loss": 1.8094, "step": 14 }, { "epoch": 0.45454545454545453, "grad_norm": 1.8526736497879028, "learning_rate": 1.9844653278615836e-05, "loss": 1.7952, "step": 15 }, { "epoch": 0.48484848484848486, "grad_norm": 1.5075675249099731, "learning_rate": 1.9776556239997146e-05, "loss": 1.7614, "step": 16 }, { "epoch": 0.5151515151515151, "grad_norm": 1.3172680139541626, "learning_rate": 1.9696278829718882e-05, "loss": 1.8214, "step": 17 }, { "epoch": 0.5454545454545454, "grad_norm": 1.2293428182601929, "learning_rate": 1.9603921063437795e-05, "loss": 1.688, "step": 18 }, { "epoch": 0.5757575757575758, "grad_norm": 1.6465842723846436, "learning_rate": 1.949959800742991e-05, "loss": 1.7641, "step": 19 }, { "epoch": 0.6060606060606061, "grad_norm": 1.2794415950775146, "learning_rate": 1.9383439635232296e-05, "loss": 1.7237, "step": 20 }, { "epoch": 0.6363636363636364, "grad_norm": 1.5035333633422852, "learning_rate": 1.9255590665712214e-05, "loss": 1.7502, "step": 21 }, { "epoch": 0.6666666666666666, "grad_norm": 1.6453092098236084, "learning_rate": 1.911621038276542e-05, "loss": 1.4906, "step": 22 }, { "epoch": 0.696969696969697, "grad_norm": 1.51984703540802, "learning_rate": 1.8965472436868288e-05, "loss": 1.5359, "step": 23 }, { "epoch": 0.7272727272727273, "grad_norm": 1.3503923416137695, "learning_rate": 1.8803564628730916e-05, "loss": 1.5969, "step": 24 }, { "epoch": 0.7575757575757576, "grad_norm": 1.3228968381881714, "learning_rate": 1.8630688675320844e-05, "loss": 1.7511, "step": 25 }, { "epoch": 0.7878787878787878, "grad_norm": 1.2232959270477295, "learning_rate": 1.8447059958548822e-05, "loss": 1.6303, "step": 26 }, { "epoch": 0.8181818181818182, "grad_norm": 1.2176752090454102, "learning_rate": 1.8252907256929777e-05, "loss": 1.6744, "step": 27 }, { "epoch": 0.8484848484848485, "grad_norm": 1.6348463296890259, "learning_rate": 1.804847246055326e-05, "loss": 1.7386, "step": 28 }, { "epoch": 0.8787878787878788, "grad_norm": 1.3479255437850952, "learning_rate": 1.7834010269718526e-05, "loss": 1.8378, "step": 29 }, { "epoch": 0.9090909090909091, "grad_norm": 1.4100639820098877, "learning_rate": 1.7609787877609678e-05, "loss": 1.6901, "step": 30 }, { "epoch": 0.9393939393939394, "grad_norm": 1.289868950843811, "learning_rate": 1.7376084637406222e-05, "loss": 1.7371, "step": 31 }, { "epoch": 0.9696969696969697, "grad_norm": 1.2278428077697754, "learning_rate": 1.7133191714243805e-05, "loss": 1.5087, "step": 32 }, { "epoch": 1.0, "grad_norm": 1.5690200328826904, "learning_rate": 1.6881411722458688e-05, "loss": 1.4644, "step": 33 }, { "epoch": 1.0303030303030303, "grad_norm": 1.7996729612350464, "learning_rate": 1.6621058348568008e-05, "loss": 1.2771, "step": 34 }, { "epoch": 1.0606060606060606, "grad_norm": 1.6773903369903564, "learning_rate": 1.6352455960455385e-05, "loss": 1.324, "step": 35 }, { "epoch": 1.0909090909090908, "grad_norm": 1.6019961833953857, "learning_rate": 1.607593920324899e-05, "loss": 1.2817, "step": 36 }, { "epoch": 1.121212121212121, "grad_norm": 1.3165000677108765, "learning_rate": 1.5791852582395334e-05, "loss": 1.1927, "step": 37 }, { "epoch": 1.1515151515151516, "grad_norm": 1.455589771270752, "learning_rate": 1.5500550034448415e-05, "loss": 1.2133, "step": 38 }, { "epoch": 1.1818181818181819, "grad_norm": 1.8257672786712646, "learning_rate": 1.5202394486108823e-05, "loss": 1.1531, "step": 39 }, { "epoch": 1.2121212121212122, "grad_norm": 1.873024821281433, "learning_rate": 1.4897757402062285e-05, "loss": 1.2731, "step": 40 }, { "epoch": 1.2424242424242424, "grad_norm": 1.7878319025039673, "learning_rate": 1.4587018322180906e-05, "loss": 1.2247, "step": 41 }, { "epoch": 1.2727272727272727, "grad_norm": 1.2891494035720825, "learning_rate": 1.4270564388663761e-05, "loss": 1.0763, "step": 42 }, { "epoch": 1.303030303030303, "grad_norm": 1.4569677114486694, "learning_rate": 1.3948789863705914e-05, "loss": 1.1027, "step": 43 }, { "epoch": 1.3333333333333333, "grad_norm": 1.5054666996002197, "learning_rate": 1.3622095638296827e-05, "loss": 1.1478, "step": 44 }, { "epoch": 1.3636363636363638, "grad_norm": 1.4277585744857788, "learning_rate": 1.32908887327601e-05, "loss": 1.1792, "step": 45 }, { "epoch": 1.393939393939394, "grad_norm": 1.3641875982284546, "learning_rate": 1.2955581789656844e-05, "loss": 1.0412, "step": 46 }, { "epoch": 1.4242424242424243, "grad_norm": 1.706855297088623, "learning_rate": 1.2616592559684408e-05, "loss": 1.1198, "step": 47 }, { "epoch": 1.4545454545454546, "grad_norm": 1.8636873960494995, "learning_rate": 1.2274343381211067e-05, "loss": 1.0971, "step": 48 }, { "epoch": 1.4848484848484849, "grad_norm": 1.6042084693908691, "learning_rate": 1.192926065409497e-05, "loss": 1.0331, "step": 49 }, { "epoch": 1.5151515151515151, "grad_norm": 1.6517517566680908, "learning_rate": 1.1581774308443042e-05, "loss": 1.1258, "step": 50 }, { "epoch": 1.5454545454545454, "grad_norm": 1.7735017538070679, "learning_rate": 1.1232317268971586e-05, "loss": 1.211, "step": 51 }, { "epoch": 1.5757575757575757, "grad_norm": 1.4876844882965088, "learning_rate": 1.088132491563602e-05, "loss": 1.0778, "step": 52 }, { "epoch": 1.606060606060606, "grad_norm": 1.4725524187088013, "learning_rate": 1.0529234541201631e-05, "loss": 1.1482, "step": 53 }, { "epoch": 1.6363636363636362, "grad_norm": 1.6460925340652466, "learning_rate": 1.0176484806431288e-05, "loss": 1.0857, "step": 54 }, { "epoch": 1.6666666666666665, "grad_norm": 1.6207877397537231, "learning_rate": 9.823515193568715e-06, "loss": 1.1458, "step": 55 }, { "epoch": 1.696969696969697, "grad_norm": 1.5556446313858032, "learning_rate": 9.470765458798369e-06, "loss": 0.9578, "step": 56 }, { "epoch": 1.7272727272727273, "grad_norm": 1.459346890449524, "learning_rate": 9.118675084363986e-06, "loss": 1.1004, "step": 57 }, { "epoch": 1.7575757575757576, "grad_norm": 1.6329230070114136, "learning_rate": 8.767682731028415e-06, "loss": 1.0838, "step": 58 }, { "epoch": 1.7878787878787878, "grad_norm": 1.4568471908569336, "learning_rate": 8.418225691556962e-06, "loss": 0.9716, "step": 59 }, { "epoch": 1.8181818181818183, "grad_norm": 1.6114221811294556, "learning_rate": 8.070739345905032e-06, "loss": 1.2118, "step": 60 }, { "epoch": 1.8484848484848486, "grad_norm": 1.4424853324890137, "learning_rate": 7.725656618788938e-06, "loss": 1.0933, "step": 61 }, { "epoch": 1.878787878787879, "grad_norm": 1.3055541515350342, "learning_rate": 7.383407440315595e-06, "loss": 1.054, "step": 62 }, { "epoch": 1.9090909090909092, "grad_norm": 1.608759880065918, "learning_rate": 7.044418210343161e-06, "loss": 1.2112, "step": 63 }, { "epoch": 1.9393939393939394, "grad_norm": 1.381705403327942, "learning_rate": 6.7091112672399e-06, "loss": 1.06, "step": 64 }, { "epoch": 1.9696969696969697, "grad_norm": 1.5174615383148193, "learning_rate": 6.3779043617031775e-06, "loss": 1.1465, "step": 65 }, { "epoch": 2.0, "grad_norm": 1.7489651441574097, "learning_rate": 6.051210136294089e-06, "loss": 0.8968, "step": 66 }, { "epoch": 2.0303030303030303, "grad_norm": 1.7633519172668457, "learning_rate": 5.729435611336239e-06, "loss": 0.8801, "step": 67 }, { "epoch": 2.0606060606060606, "grad_norm": 2.278275728225708, "learning_rate": 5.412981677819094e-06, "loss": 0.8784, "step": 68 }, { "epoch": 2.090909090909091, "grad_norm": 1.8437879085540771, "learning_rate": 5.1022425979377174e-06, "loss": 0.7264, "step": 69 }, { "epoch": 2.121212121212121, "grad_norm": 1.5237785577774048, "learning_rate": 4.797605513891179e-06, "loss": 0.758, "step": 70 }, { "epoch": 2.1515151515151514, "grad_norm": 1.7430484294891357, "learning_rate": 4.4994499655515865e-06, "loss": 0.8087, "step": 71 }, { "epoch": 2.1818181818181817, "grad_norm": 1.7282936573028564, "learning_rate": 4.208147417604665e-06, "loss": 0.7311, "step": 72 }, { "epoch": 2.212121212121212, "grad_norm": 1.907755970954895, "learning_rate": 3.924060796751012e-06, "loss": 0.7755, "step": 73 }, { "epoch": 2.242424242424242, "grad_norm": 2.1145212650299072, "learning_rate": 3.647544039544615e-06, "loss": 0.7668, "step": 74 }, { "epoch": 2.2727272727272725, "grad_norm": 2.2172389030456543, "learning_rate": 3.378941651431996e-06, "loss": 0.7701, "step": 75 }, { "epoch": 2.303030303030303, "grad_norm": 2.0913803577423096, "learning_rate": 3.1185882775413123e-06, "loss": 0.889, "step": 76 }, { "epoch": 2.3333333333333335, "grad_norm": 2.1478118896484375, "learning_rate": 2.8668082857562006e-06, "loss": 0.7411, "step": 77 }, { "epoch": 2.3636363636363638, "grad_norm": 2.0034427642822266, "learning_rate": 2.6239153625937786e-06, "loss": 0.7564, "step": 78 }, { "epoch": 2.393939393939394, "grad_norm": 1.6710702180862427, "learning_rate": 2.390212122390323e-06, "loss": 0.7695, "step": 79 }, { "epoch": 2.4242424242424243, "grad_norm": 1.7885438203811646, "learning_rate": 2.165989730281475e-06, "loss": 0.8017, "step": 80 }, { "epoch": 2.4545454545454546, "grad_norm": 1.3610081672668457, "learning_rate": 1.9515275394467446e-06, "loss": 0.7561, "step": 81 }, { "epoch": 2.484848484848485, "grad_norm": 1.645119547843933, "learning_rate": 1.7470927430702277e-06, "loss": 0.6155, "step": 82 }, { "epoch": 2.515151515151515, "grad_norm": 1.7851331233978271, "learning_rate": 1.5529400414511809e-06, "loss": 0.6894, "step": 83 }, { "epoch": 2.5454545454545454, "grad_norm": 1.4505351781845093, "learning_rate": 1.369311324679159e-06, "loss": 0.8904, "step": 84 }, { "epoch": 2.5757575757575757, "grad_norm": 1.663177728652954, "learning_rate": 1.196435371269089e-06, "loss": 0.8289, "step": 85 }, { "epoch": 2.606060606060606, "grad_norm": 1.7335422039031982, "learning_rate": 1.0345275631317165e-06, "loss": 0.7425, "step": 86 }, { "epoch": 2.6363636363636362, "grad_norm": 1.3842012882232666, "learning_rate": 8.837896172345827e-07, "loss": 0.7578, "step": 87 }, { "epoch": 2.6666666666666665, "grad_norm": 1.770256757736206, "learning_rate": 7.4440933428779e-07, "loss": 0.6974, "step": 88 }, { "epoch": 2.6969696969696972, "grad_norm": 1.5890138149261475, "learning_rate": 6.165603647677054e-07, "loss": 0.7797, "step": 89 }, { "epoch": 2.7272727272727275, "grad_norm": 1.6013965606689453, "learning_rate": 5.004019925700921e-07, "loss": 0.7053, "step": 90 }, { "epoch": 2.757575757575758, "grad_norm": 1.4727004766464233, "learning_rate": 3.960789365622075e-07, "loss": 0.7264, "step": 91 }, { "epoch": 2.787878787878788, "grad_norm": 1.6595274209976196, "learning_rate": 3.0372117028111825e-07, "loss": 0.7089, "step": 92 }, { "epoch": 2.8181818181818183, "grad_norm": 1.8651310205459595, "learning_rate": 2.2344376000285606e-07, "loss": 0.6721, "step": 93 }, { "epoch": 2.8484848484848486, "grad_norm": 1.7887459993362427, "learning_rate": 1.553467213841664e-07, "loss": 0.788, "step": 94 }, { "epoch": 2.878787878787879, "grad_norm": 1.454445719718933, "learning_rate": 9.951489485545696e-08, "loss": 0.6617, "step": 95 }, { "epoch": 2.909090909090909, "grad_norm": 1.662096619606018, "learning_rate": 5.6017839920180506e-08, "loss": 0.7603, "step": 96 }, { "epoch": 2.9393939393939394, "grad_norm": 1.6245455741882324, "learning_rate": 2.4909748492362162e-08, "loss": 0.7336, "step": 97 }, { "epoch": 2.9696969696969697, "grad_norm": 1.5725836753845215, "learning_rate": 6.229377380218005e-09, "loss": 0.7566, "step": 98 }, { "epoch": 3.0, "grad_norm": 1.6165506839752197, "learning_rate": 0.0, "loss": 0.6936, "step": 99 }, { "epoch": 3.0, "step": 99, "total_flos": 2.3659576070189875e+17, "train_loss": 1.195338128191052, "train_runtime": 1048.1424, "train_samples_per_second": 2.945, "train_steps_per_second": 0.094 } ], "logging_steps": 1, "max_steps": 99, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.3659576070189875e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }