{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999115122555526, "eval_steps": 500, "global_step": 5650, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008848774444739404, "grad_norm": 1.8227072749903463, "learning_rate": 5.882352941176471e-06, "loss": 1.434, "step": 50 }, { "epoch": 0.017697548889478807, "grad_norm": 1.593675125457607, "learning_rate": 1.1764705882352942e-05, "loss": 1.1202, "step": 100 }, { "epoch": 0.026546323334218212, "grad_norm": 1.5547518309505592, "learning_rate": 1.7647058823529414e-05, "loss": 1.0971, "step": 150 }, { "epoch": 0.035395097778957614, "grad_norm": 1.3952753982651918, "learning_rate": 1.9998521094455198e-05, "loss": 1.0463, "step": 200 }, { "epoch": 0.044243872223697016, "grad_norm": 1.6576864383381864, "learning_rate": 1.9989484922416503e-05, "loss": 1.0808, "step": 250 }, { "epoch": 0.053092646668436425, "grad_norm": 1.340907083636223, "learning_rate": 1.9972241607451552e-05, "loss": 1.0342, "step": 300 }, { "epoch": 0.061941421113175826, "grad_norm": 1.269578433993962, "learning_rate": 1.9946805316291817e-05, "loss": 1.0199, "step": 350 }, { "epoch": 0.07079019555791523, "grad_norm": 1.2537035911999723, "learning_rate": 1.9913196946839304e-05, "loss": 1.0137, "step": 400 }, { "epoch": 0.07963897000265463, "grad_norm": 1.1857124728289088, "learning_rate": 1.987144411099731e-05, "loss": 1.0133, "step": 450 }, { "epoch": 0.08848774444739403, "grad_norm": 1.2181869575632758, "learning_rate": 1.9821581111985072e-05, "loss": 1.0178, "step": 500 }, { "epoch": 0.09733651889213343, "grad_norm": 1.2987295471871965, "learning_rate": 1.9763648916154982e-05, "loss": 1.0127, "step": 550 }, { "epoch": 0.10618529333687285, "grad_norm": 1.2413868753158877, "learning_rate": 1.9697695119335547e-05, "loss": 0.9979, "step": 600 }, { "epoch": 0.11503406778161225, "grad_norm": 1.2626131743513744, "learning_rate": 1.9623773907727682e-05, "loss": 0.9965, "step": 650 }, { "epoch": 0.12388284222635165, "grad_norm": 1.3730231292537942, "learning_rate": 1.954194601338651e-05, "loss": 0.9942, "step": 700 }, { "epoch": 0.13273161667109104, "grad_norm": 1.2218007272454348, "learning_rate": 1.9452278664325227e-05, "loss": 1.0036, "step": 750 }, { "epoch": 0.14158039111583046, "grad_norm": 1.1454037410098823, "learning_rate": 1.9354845529282042e-05, "loss": 0.9868, "step": 800 }, { "epoch": 0.15042916556056987, "grad_norm": 1.199534918146064, "learning_rate": 1.9249726657195534e-05, "loss": 0.9972, "step": 850 }, { "epoch": 0.15927794000530926, "grad_norm": 1.2920998134175072, "learning_rate": 1.9137008411438213e-05, "loss": 1.0239, "step": 900 }, { "epoch": 0.16812671445004868, "grad_norm": 1.1321328064281995, "learning_rate": 1.901678339886223e-05, "loss": 0.9807, "step": 950 }, { "epoch": 0.17697548889478806, "grad_norm": 1.1146456739633037, "learning_rate": 1.8889150393715627e-05, "loss": 0.981, "step": 1000 }, { "epoch": 0.18582426333952748, "grad_norm": 1.161097648736237, "learning_rate": 1.8754214256491564e-05, "loss": 0.9826, "step": 1050 }, { "epoch": 0.19467303778426687, "grad_norm": 1.2010813609189326, "learning_rate": 1.8612085847777215e-05, "loss": 0.9846, "step": 1100 }, { "epoch": 0.20352181222900628, "grad_norm": 1.203856802982565, "learning_rate": 1.8462881937173144e-05, "loss": 0.9789, "step": 1150 }, { "epoch": 0.2123705866737457, "grad_norm": 1.1809801509975393, "learning_rate": 1.8306725107357933e-05, "loss": 0.9785, "step": 1200 }, { "epoch": 0.22121936111848509, 
"grad_norm": 1.1856255544481202, "learning_rate": 1.8143743653376944e-05, "loss": 0.9724, "step": 1250 }, { "epoch": 0.2300681355632245, "grad_norm": 1.2932019902094527, "learning_rate": 1.7974071477237887e-05, "loss": 0.9741, "step": 1300 }, { "epoch": 0.2389169100079639, "grad_norm": 1.1399596376970142, "learning_rate": 1.7797847977899873e-05, "loss": 0.9787, "step": 1350 }, { "epoch": 0.2477656844527033, "grad_norm": 1.1851681853908578, "learning_rate": 1.7615217936746246e-05, "loss": 0.9712, "step": 1400 }, { "epoch": 0.2566144588974427, "grad_norm": 1.212090367995841, "learning_rate": 1.742633139863538e-05, "loss": 0.9729, "step": 1450 }, { "epoch": 0.2654632333421821, "grad_norm": 1.0975454688592081, "learning_rate": 1.7231343548627085e-05, "loss": 0.9714, "step": 1500 }, { "epoch": 0.2743120077869215, "grad_norm": 1.0110033370546834, "learning_rate": 1.7030414584485938e-05, "loss": 0.9591, "step": 1550 }, { "epoch": 0.2831607822316609, "grad_norm": 1.0352711739713445, "learning_rate": 1.6823709585066308e-05, "loss": 0.9719, "step": 1600 }, { "epoch": 0.29200955667640033, "grad_norm": 1.1174206790465606, "learning_rate": 1.6611398374687172e-05, "loss": 0.9673, "step": 1650 }, { "epoch": 0.30085833112113974, "grad_norm": 1.1508488673423878, "learning_rate": 1.6393655383608132e-05, "loss": 0.9579, "step": 1700 }, { "epoch": 0.3097071055658791, "grad_norm": 1.1140112909261894, "learning_rate": 1.6170659504721365e-05, "loss": 0.9773, "step": 1750 }, { "epoch": 0.3185558800106185, "grad_norm": 1.078883305222083, "learning_rate": 1.594259394657707e-05, "loss": 0.963, "step": 1800 }, { "epoch": 0.32740465445535794, "grad_norm": 1.0741496670790676, "learning_rate": 1.570964608286336e-05, "loss": 0.9665, "step": 1850 }, { "epoch": 0.33625342890009735, "grad_norm": 1.0674741658785543, "learning_rate": 1.5472007298464117e-05, "loss": 0.9577, "step": 1900 }, { "epoch": 0.34510220334483677, "grad_norm": 1.1266524576573997, "learning_rate": 1.5229872832221336e-05, "loss": 0.9578, "step": 1950 }, { "epoch": 0.3539509777895761, "grad_norm": 1.0507368907995636, "learning_rate": 1.4983441616531152e-05, "loss": 0.9543, "step": 2000 }, { "epoch": 0.36279975223431554, "grad_norm": 1.5910538700413814, "learning_rate": 1.4732916113905336e-05, "loss": 0.9499, "step": 2050 }, { "epoch": 0.37164852667905496, "grad_norm": 1.1710473762069435, "learning_rate": 1.4478502150632503e-05, "loss": 0.9928, "step": 2100 }, { "epoch": 0.3804973011237944, "grad_norm": 1.1721776444324115, "learning_rate": 1.4220408747675714e-05, "loss": 0.9509, "step": 2150 }, { "epoch": 0.38934607556853373, "grad_norm": 1.1265584958834658, "learning_rate": 1.3958847948945428e-05, "loss": 0.9437, "step": 2200 }, { "epoch": 0.39819485001327315, "grad_norm": 1.1519073177115475, "learning_rate": 1.369403464708884e-05, "loss": 0.9445, "step": 2250 }, { "epoch": 0.40704362445801257, "grad_norm": 1.1172434119258432, "learning_rate": 1.3426186406938769e-05, "loss": 1.0387, "step": 2300 }, { "epoch": 0.415892398902752, "grad_norm": 1.138922531256483, "learning_rate": 1.315552328676714e-05, "loss": 0.9391, "step": 2350 }, { "epoch": 0.4247411733474914, "grad_norm": 1.089137186693905, "learning_rate": 1.2882267657489908e-05, "loss": 0.9457, "step": 2400 }, { "epoch": 0.43358994779223076, "grad_norm": 1.0358420925020666, "learning_rate": 1.2606644019971967e-05, "loss": 0.9972, "step": 2450 }, { "epoch": 0.44243872223697017, "grad_norm": 1.0748089642780165, "learning_rate": 1.2328878820582122e-05, "loss": 0.926, "step": 2500 }, { "epoch": 
0.4512874966817096, "grad_norm": 1.1178495139589024, "learning_rate": 1.204920026514971e-05, "loss": 0.9371, "step": 2550 }, { "epoch": 0.460136271126449, "grad_norm": 1.0570225052003097, "learning_rate": 1.1767838131475654e-05, "loss": 0.9299, "step": 2600 }, { "epoch": 0.46898504557118836, "grad_norm": 1.198704612437538, "learning_rate": 1.1485023580552039e-05, "loss": 0.9333, "step": 2650 }, { "epoch": 0.4778338200159278, "grad_norm": 1.2153247727284249, "learning_rate": 1.1200988966645286e-05, "loss": 0.9325, "step": 2700 }, { "epoch": 0.4866825944606672, "grad_norm": 1.0862037277462553, "learning_rate": 1.091596764639895e-05, "loss": 0.9341, "step": 2750 }, { "epoch": 0.4955313689054066, "grad_norm": 1.0724182576148855, "learning_rate": 1.0630193787112994e-05, "loss": 0.9063, "step": 2800 }, { "epoch": 0.504380143350146, "grad_norm": 1.0396985853342051, "learning_rate": 1.034390217435704e-05, "loss": 0.9293, "step": 2850 }, { "epoch": 0.5132289177948854, "grad_norm": 1.0749902902208996, "learning_rate": 1.005732801907567e-05, "loss": 0.9214, "step": 2900 }, { "epoch": 0.5220776922396249, "grad_norm": 1.085293805471844, "learning_rate": 9.770706764344235e-06, "loss": 1.0245, "step": 2950 }, { "epoch": 0.5309264666843642, "grad_norm": 1.1358464620386077, "learning_rate": 9.484273891933982e-06, "loss": 0.9297, "step": 3000 }, { "epoch": 0.5397752411291036, "grad_norm": 1.102492622904414, "learning_rate": 9.198264728845332e-06, "loss": 0.9157, "step": 3050 }, { "epoch": 0.548624015573843, "grad_norm": 1.1636978192620964, "learning_rate": 8.912914253968391e-06, "loss": 0.9236, "step": 3100 }, { "epoch": 0.5574727900185824, "grad_norm": 1.1333308119371828, "learning_rate": 8.628456905029383e-06, "loss": 0.9158, "step": 3150 }, { "epoch": 0.5663215644633218, "grad_norm": 1.117341944549429, "learning_rate": 8.345126385981737e-06, "loss": 0.9102, "step": 3200 }, { "epoch": 0.5751703389080612, "grad_norm": 1.0840357862773122, "learning_rate": 8.063155475000037e-06, "loss": 0.9546, "step": 3250 }, { "epoch": 0.5840191133528007, "grad_norm": 1.1791446775850642, "learning_rate": 7.782775833234522e-06, "loss": 0.924, "step": 3300 }, { "epoch": 0.5928678877975401, "grad_norm": 1.1948088039686837, "learning_rate": 7.504217814483364e-06, "loss": 0.9135, "step": 3350 }, { "epoch": 0.6017166622422795, "grad_norm": 1.107584300567853, "learning_rate": 7.227710275938987e-06, "loss": 0.9088, "step": 3400 }, { "epoch": 0.6105654366870189, "grad_norm": 1.167878367521536, "learning_rate": 6.953480390164001e-06, "loss": 0.9394, "step": 3450 }, { "epoch": 0.6194142111317582, "grad_norm": 1.0387264654842252, "learning_rate": 6.68175345845119e-06, "loss": 0.9022, "step": 3500 }, { "epoch": 0.6282629855764976, "grad_norm": 1.2650051247263, "learning_rate": 6.412752725720864e-06, "loss": 0.9135, "step": 3550 }, { "epoch": 0.637111760021237, "grad_norm": 1.20136444284978, "learning_rate": 6.146699197107715e-06, "loss": 0.9068, "step": 3600 }, { "epoch": 0.6459605344659765, "grad_norm": 1.0811422393549932, "learning_rate": 5.883811456387821e-06, "loss": 0.9082, "step": 3650 }, { "epoch": 0.6548093089107159, "grad_norm": 1.1656545210876348, "learning_rate": 5.6243054863949675e-06, "loss": 0.8898, "step": 3700 }, { "epoch": 0.6636580833554553, "grad_norm": 1.1852994908957295, "learning_rate": 5.368394491573876e-06, "loss": 0.9026, "step": 3750 }, { "epoch": 0.6725068578001947, "grad_norm": 1.161821140479422, "learning_rate": 5.116288722816087e-06, "loss": 0.8838, "step": 3800 }, { "epoch": 0.6813556322449341, 
"grad_norm": 1.2037332656822164, "learning_rate": 4.868195304722391e-06, "loss": 0.9025, "step": 3850 }, { "epoch": 0.6902044066896735, "grad_norm": 1.1242488366634837, "learning_rate": 4.6243180654337975e-06, "loss": 0.931, "step": 3900 }, { "epoch": 0.6990531811344128, "grad_norm": 1.1696361772435924, "learning_rate": 4.384857369170772e-06, "loss": 0.9338, "step": 3950 }, { "epoch": 0.7079019555791523, "grad_norm": 1.2264055771278481, "learning_rate": 4.1500099516183555e-06, "loss": 0.8993, "step": 4000 }, { "epoch": 0.7167507300238917, "grad_norm": 1.1225371954007977, "learning_rate": 3.919968758292425e-06, "loss": 0.9044, "step": 4050 }, { "epoch": 0.7255995044686311, "grad_norm": 1.1128045918827218, "learning_rate": 3.6949227860198712e-06, "loss": 0.8963, "step": 4100 }, { "epoch": 0.7344482789133705, "grad_norm": 1.1426952334649678, "learning_rate": 3.475056927662912e-06, "loss": 0.8955, "step": 4150 }, { "epoch": 0.7432970533581099, "grad_norm": 1.0853524038336615, "learning_rate": 3.2605518202151577e-06, "loss": 0.8973, "step": 4200 }, { "epoch": 0.7521458278028493, "grad_norm": 1.1322485210453683, "learning_rate": 3.0515836963942056e-06, "loss": 0.8944, "step": 4250 }, { "epoch": 0.7609946022475887, "grad_norm": 1.2182044740120312, "learning_rate": 2.8483242398526723e-06, "loss": 0.8872, "step": 4300 }, { "epoch": 0.7698433766923282, "grad_norm": 1.0357920295682677, "learning_rate": 2.650940444126654e-06, "loss": 0.8856, "step": 4350 }, { "epoch": 0.7786921511370675, "grad_norm": 0.984081860446035, "learning_rate": 2.4595944754374723e-06, "loss": 0.8818, "step": 4400 }, { "epoch": 0.7875409255818069, "grad_norm": 1.1062257149827126, "learning_rate": 2.27444353945945e-06, "loss": 0.8883, "step": 4450 }, { "epoch": 0.7963897000265463, "grad_norm": 1.1408547470520316, "learning_rate": 2.0956397521631666e-06, "loss": 0.8729, "step": 4500 }, { "epoch": 0.8052384744712857, "grad_norm": 1.2286350695548351, "learning_rate": 1.9233300148402767e-06, "loss": 0.8782, "step": 4550 }, { "epoch": 0.8140872489160251, "grad_norm": 1.1745122607432803, "learning_rate": 1.757655893412622e-06, "loss": 0.8763, "step": 4600 }, { "epoch": 0.8229360233607645, "grad_norm": 1.291525874104284, "learning_rate": 1.5987535021247668e-06, "loss": 0.8817, "step": 4650 }, { "epoch": 0.831784797805504, "grad_norm": 1.1484560802799162, "learning_rate": 1.4467533917154842e-06, "loss": 0.8914, "step": 4700 }, { "epoch": 0.8406335722502434, "grad_norm": 1.0916994218547142, "learning_rate": 1.3017804421601298e-06, "loss": 0.9154, "step": 4750 }, { "epoch": 0.8494823466949828, "grad_norm": 1.2124618779904544, "learning_rate": 1.1639537600719764e-06, "loss": 0.8821, "step": 4800 }, { "epoch": 0.8583311211397221, "grad_norm": 1.1869802843495634, "learning_rate": 1.0333865808468203e-06, "loss": 0.8821, "step": 4850 }, { "epoch": 0.8671798955844615, "grad_norm": 1.134603833533901, "learning_rate": 9.101861756312369e-07, "loss": 0.8799, "step": 4900 }, { "epoch": 0.8760286700292009, "grad_norm": 1.23950870941106, "learning_rate": 7.944537631909666e-07, "loss": 0.8874, "step": 4950 }, { "epoch": 0.8848774444739403, "grad_norm": 1.0710206299690663, "learning_rate": 6.862844267517643e-07, "loss": 0.9178, "step": 5000 }, { "epoch": 0.8937262189186798, "grad_norm": 1.1079780487199702, "learning_rate": 5.857670358811096e-07, "loss": 0.9139, "step": 5050 }, { "epoch": 0.9025749933634192, "grad_norm": 1.224157319904236, "learning_rate": 4.929841734749063e-07, "loss": 0.883, "step": 5100 }, { "epoch": 0.9114237678081586, "grad_norm": 
1.2973098944898664, "learning_rate": 4.0801206790916815e-07, "loss": 0.8748, "step": 5150 }, { "epoch": 0.920272542252898, "grad_norm": 1.2788191274563776, "learning_rate": 3.309205304124552e-07, "loss": 0.9109, "step": 5200 }, { "epoch": 0.9291213166976374, "grad_norm": 1.1179855803922911, "learning_rate": 2.6177289771049274e-07, "loss": 0.8944, "step": 5250 }, { "epoch": 0.9379700911423767, "grad_norm": 1.1595081612750888, "learning_rate": 2.0062597999009114e-07, "loss": 0.8857, "step": 5300 }, { "epoch": 0.9468188655871161, "grad_norm": 1.0486044192293602, "learning_rate": 1.4753001422514125e-07, "loss": 0.8827, "step": 5350 }, { "epoch": 0.9556676400318556, "grad_norm": 1.1477897530038004, "learning_rate": 1.0252862290301092e-07, "loss": 0.8769, "step": 5400 }, { "epoch": 0.964516414476595, "grad_norm": 1.1863014296064434, "learning_rate": 6.565877818526245e-08, "loss": 0.8754, "step": 5450 }, { "epoch": 0.9733651889213344, "grad_norm": 1.2034312267234064, "learning_rate": 3.6950771532126004e-08, "loss": 0.8723, "step": 5500 }, { "epoch": 0.9822139633660738, "grad_norm": 1.1269276845827294, "learning_rate": 1.6428188815703627e-08, "loss": 0.9178, "step": 5550 }, { "epoch": 0.9910627378108132, "grad_norm": 1.2360290358413641, "learning_rate": 4.1078909423253325e-09, "loss": 0.8848, "step": 5600 }, { "epoch": 0.9999115122555526, "grad_norm": 1.271675294761002, "learning_rate": 0.0, "loss": 0.8882, "step": 5650 }, { "epoch": 0.9999115122555526, "step": 5650, "total_flos": 3.3418410989715456e+16, "train_loss": 0.94595458984375, "train_runtime": 88600.3448, "train_samples_per_second": 0.51, "train_steps_per_second": 0.064 } ], "logging_steps": 50, "max_steps": 5650, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 3.3418410989715456e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }
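The dump above appears to be a Hugging Face Trainer `trainer_state.json` file. As a minimal sketch of how one might inspect it (the filename, the matplotlib plotting choice, and the variable names are assumptions for illustration, not part of the original run), the snippet below loads the `log_history` array and plots the logged training loss and learning rate against the global step.

```python
# Minimal sketch: load a Trainer state file like the one above and plot
# the per-step logs. Assumes the JSON is saved as "trainer_state.json".
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only per-step logging entries; the final summary entry
# (train_loss, train_runtime, ...) has no "loss" key and is skipped.
logs = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]
lrs = [entry["learning_rate"] for entry in logs]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True)
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("global step")
fig.tight_layout()
plt.show()
```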