{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.11943863839952225, "eval_steps": 100, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009953219866626853, "grad_norm": 1.912980556488037, "learning_rate": 9.995023390066686e-06, "loss": 1.8703, "step": 10 }, { "epoch": 0.0019906439733253707, "grad_norm": 1.866821050643921, "learning_rate": 9.990046780133374e-06, "loss": 1.8723, "step": 20 }, { "epoch": 0.002985965959988056, "grad_norm": 2.058809280395508, "learning_rate": 9.985070170200061e-06, "loss": 1.8097, "step": 30 }, { "epoch": 0.003981287946650741, "grad_norm": 1.459013819694519, "learning_rate": 9.980093560266747e-06, "loss": 1.7456, "step": 40 }, { "epoch": 0.004976609933313427, "grad_norm": 0.9095586538314819, "learning_rate": 9.975116950333434e-06, "loss": 1.7195, "step": 50 }, { "epoch": 0.005971931919976112, "grad_norm": 1.1065226793289185, "learning_rate": 9.970140340400121e-06, "loss": 1.6502, "step": 60 }, { "epoch": 0.0069672539066387975, "grad_norm": 0.8301252126693726, "learning_rate": 9.965163730466807e-06, "loss": 1.5699, "step": 70 }, { "epoch": 0.007962575893301483, "grad_norm": 1.0762828588485718, "learning_rate": 9.960187120533493e-06, "loss": 1.5072, "step": 80 }, { "epoch": 0.008957897879964169, "grad_norm": 1.0814900398254395, "learning_rate": 9.95521051060018e-06, "loss": 1.4369, "step": 90 }, { "epoch": 0.009953219866626855, "grad_norm": 1.3561326265335083, "learning_rate": 9.950233900666867e-06, "loss": 1.3467, "step": 100 }, { "epoch": 0.009953219866626855, "eval_loss": 1.2846794128417969, "eval_runtime": 147.6242, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.691, "step": 100 }, { "epoch": 0.010948541853289539, "grad_norm": 1.438547968864441, "learning_rate": 9.945257290733553e-06, "loss": 1.2222, "step": 110 }, { "epoch": 0.011943863839952225, "grad_norm": 1.402588963508606, "learning_rate": 9.94028068080024e-06, "loss": 1.1001, "step": 120 }, { "epoch": 0.012939185826614909, "grad_norm": 1.4357985258102417, "learning_rate": 9.935304070866926e-06, "loss": 0.9657, "step": 130 }, { "epoch": 0.013934507813277595, "grad_norm": 2.137953042984009, "learning_rate": 9.930327460933613e-06, "loss": 0.8211, "step": 140 }, { "epoch": 0.014929829799940281, "grad_norm": 1.374299168586731, "learning_rate": 9.925350851000299e-06, "loss": 0.7142, "step": 150 }, { "epoch": 0.015925151786602965, "grad_norm": 1.1510456800460815, "learning_rate": 9.920374241066986e-06, "loss": 0.656, "step": 160 }, { "epoch": 0.01692047377326565, "grad_norm": 1.0226788520812988, "learning_rate": 9.915397631133673e-06, "loss": 0.6212, "step": 170 }, { "epoch": 0.017915795759928337, "grad_norm": 0.9365411400794983, "learning_rate": 9.910421021200359e-06, "loss": 0.6069, "step": 180 }, { "epoch": 0.018911117746591023, "grad_norm": 0.6880003213882446, "learning_rate": 9.905444411267046e-06, "loss": 0.6128, "step": 190 }, { "epoch": 0.01990643973325371, "grad_norm": 1.1190361976623535, "learning_rate": 9.900467801333732e-06, "loss": 0.5426, "step": 200 }, { "epoch": 0.01990643973325371, "eval_loss": 0.5788590908050537, "eval_runtime": 147.511, "eval_samples_per_second": 1.376, "eval_steps_per_second": 0.691, "step": 200 }, { "epoch": 0.02090176171991639, "grad_norm": 1.184279441833496, "learning_rate": 9.895491191400419e-06, "loss": 0.5887, "step": 210 }, { "epoch": 0.021897083706579078, "grad_norm": 0.7627615928649902, "learning_rate": 9.890514581467106e-06, "loss": 0.5433, "step": 220 }, { "epoch": 0.022892405693241764, "grad_norm": 0.7858164310455322, "learning_rate": 9.885537971533792e-06, "loss": 0.5843, "step": 230 }, { "epoch": 0.02388772767990445, "grad_norm": 0.695697009563446, "learning_rate": 9.880561361600478e-06, "loss": 0.5365, "step": 240 }, { "epoch": 0.024883049666567136, "grad_norm": 0.8994197845458984, "learning_rate": 9.875584751667165e-06, "loss": 0.5662, "step": 250 }, { "epoch": 0.025878371653229818, "grad_norm": 0.8016309142112732, "learning_rate": 9.870608141733852e-06, "loss": 0.5592, "step": 260 }, { "epoch": 0.026873693639892504, "grad_norm": 0.8534384369850159, "learning_rate": 9.865631531800538e-06, "loss": 0.5248, "step": 270 }, { "epoch": 0.02786901562655519, "grad_norm": 0.9857029914855957, "learning_rate": 9.860654921867225e-06, "loss": 0.5294, "step": 280 }, { "epoch": 0.028864337613217876, "grad_norm": 0.7766090631484985, "learning_rate": 9.855678311933912e-06, "loss": 0.5198, "step": 290 }, { "epoch": 0.029859659599880562, "grad_norm": 0.6832401752471924, "learning_rate": 9.850701702000598e-06, "loss": 0.5844, "step": 300 }, { "epoch": 0.029859659599880562, "eval_loss": 0.536589503288269, "eval_runtime": 147.4968, "eval_samples_per_second": 1.376, "eval_steps_per_second": 0.692, "step": 300 }, { "epoch": 0.030854981586543248, "grad_norm": 0.7720848917961121, "learning_rate": 9.845725092067284e-06, "loss": 0.5365, "step": 310 }, { "epoch": 0.03185030357320593, "grad_norm": 0.7022100687026978, "learning_rate": 9.840748482133971e-06, "loss": 0.4841, "step": 320 }, { "epoch": 0.03284562555986862, "grad_norm": 1.0030310153961182, "learning_rate": 9.835771872200658e-06, "loss": 0.4635, "step": 330 }, { "epoch": 0.0338409475465313, "grad_norm": 0.8628882765769958, "learning_rate": 9.830795262267344e-06, "loss": 0.4932, "step": 340 }, { "epoch": 0.034836269533193985, "grad_norm": 0.7178316712379456, "learning_rate": 9.825818652334031e-06, "loss": 0.6057, "step": 350 }, { "epoch": 0.035831591519856675, "grad_norm": 0.9564626216888428, "learning_rate": 9.820842042400718e-06, "loss": 0.5371, "step": 360 }, { "epoch": 0.03682691350651936, "grad_norm": 0.7041760683059692, "learning_rate": 9.815865432467404e-06, "loss": 0.513, "step": 370 }, { "epoch": 0.037822235493182046, "grad_norm": 1.0203750133514404, "learning_rate": 9.81088882253409e-06, "loss": 0.5118, "step": 380 }, { "epoch": 0.03881755747984473, "grad_norm": 0.8765382170677185, "learning_rate": 9.805912212600777e-06, "loss": 0.4529, "step": 390 }, { "epoch": 0.03981287946650742, "grad_norm": 0.9951983690261841, "learning_rate": 9.800935602667464e-06, "loss": 0.5336, "step": 400 }, { "epoch": 0.03981287946650742, "eval_loss": 0.5151349306106567, "eval_runtime": 147.6615, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.691, "step": 400 }, { "epoch": 0.0408082014531701, "grad_norm": 0.7691435813903809, "learning_rate": 9.79595899273415e-06, "loss": 0.506, "step": 410 }, { "epoch": 0.04180352343983278, "grad_norm": 1.1955533027648926, "learning_rate": 9.790982382800837e-06, "loss": 0.4692, "step": 420 }, { "epoch": 0.04279884542649547, "grad_norm": 1.128085732460022, "learning_rate": 9.786005772867525e-06, "loss": 0.4608, "step": 430 }, { "epoch": 0.043794167413158155, "grad_norm": 0.5518949627876282, "learning_rate": 9.78102916293421e-06, "loss": 0.5006, "step": 440 }, { "epoch": 0.044789489399820845, "grad_norm": 0.7164484858512878, "learning_rate": 9.776052553000896e-06, "loss": 0.4996, "step": 450 }, { "epoch": 0.04578481138648353, "grad_norm": 0.5959630012512207, "learning_rate": 9.771075943067583e-06, "loss": 0.4843, "step": 460 }, { "epoch": 0.04678013337314621, "grad_norm": 0.743648111820221, "learning_rate": 9.76609933313427e-06, "loss": 0.4363, "step": 470 }, { "epoch": 0.0477754553598089, "grad_norm": 0.8757079243659973, "learning_rate": 9.761122723200956e-06, "loss": 0.4665, "step": 480 }, { "epoch": 0.04877077734647158, "grad_norm": 1.0122153759002686, "learning_rate": 9.756146113267643e-06, "loss": 0.492, "step": 490 }, { "epoch": 0.04976609933313427, "grad_norm": 0.6179729700088501, "learning_rate": 9.751169503334329e-06, "loss": 0.5022, "step": 500 }, { "epoch": 0.04976609933313427, "eval_loss": 0.4993921220302582, "eval_runtime": 147.7401, "eval_samples_per_second": 1.374, "eval_steps_per_second": 0.69, "step": 500 }, { "epoch": 0.050761421319796954, "grad_norm": 0.952812671661377, "learning_rate": 9.746192893401016e-06, "loss": 0.4901, "step": 510 }, { "epoch": 0.051756743306459636, "grad_norm": 0.6715916991233826, "learning_rate": 9.741216283467702e-06, "loss": 0.5055, "step": 520 }, { "epoch": 0.052752065293122326, "grad_norm": 0.674640953540802, "learning_rate": 9.736239673534389e-06, "loss": 0.4874, "step": 530 }, { "epoch": 0.05374738727978501, "grad_norm": 0.7867962718009949, "learning_rate": 9.731263063601075e-06, "loss": 0.4956, "step": 540 }, { "epoch": 0.0547427092664477, "grad_norm": 0.9035332202911377, "learning_rate": 9.726286453667762e-06, "loss": 0.499, "step": 550 }, { "epoch": 0.05573803125311038, "grad_norm": 0.7009295225143433, "learning_rate": 9.72130984373445e-06, "loss": 0.5034, "step": 560 }, { "epoch": 0.05673335323977307, "grad_norm": 0.7018862366676331, "learning_rate": 9.716333233801135e-06, "loss": 0.5137, "step": 570 }, { "epoch": 0.05772867522643575, "grad_norm": 0.7812825441360474, "learning_rate": 9.711356623867822e-06, "loss": 0.4724, "step": 580 }, { "epoch": 0.058723997213098435, "grad_norm": 0.6245225071907043, "learning_rate": 9.70638001393451e-06, "loss": 0.4446, "step": 590 }, { "epoch": 0.059719319199761124, "grad_norm": 0.9083976149559021, "learning_rate": 9.701403404001195e-06, "loss": 0.4884, "step": 600 }, { "epoch": 0.059719319199761124, "eval_loss": 0.4891846477985382, "eval_runtime": 147.5284, "eval_samples_per_second": 1.376, "eval_steps_per_second": 0.691, "step": 600 }, { "epoch": 0.06071464118642381, "grad_norm": 0.6195352673530579, "learning_rate": 9.69642679406788e-06, "loss": 0.5121, "step": 610 }, { "epoch": 0.061709963173086496, "grad_norm": 0.8068727254867554, "learning_rate": 9.691450184134568e-06, "loss": 0.4689, "step": 620 }, { "epoch": 0.06270528515974919, "grad_norm": 1.0427749156951904, "learning_rate": 9.686473574201255e-06, "loss": 0.4968, "step": 630 }, { "epoch": 0.06370060714641186, "grad_norm": 0.698349118232727, "learning_rate": 9.681496964267941e-06, "loss": 0.4691, "step": 640 }, { "epoch": 0.06469592913307455, "grad_norm": 0.9104384183883667, "learning_rate": 9.676520354334628e-06, "loss": 0.4775, "step": 650 }, { "epoch": 0.06569125111973724, "grad_norm": 0.8729726076126099, "learning_rate": 9.671543744401316e-06, "loss": 0.5201, "step": 660 }, { "epoch": 0.06668657310639992, "grad_norm": 0.9858236908912659, "learning_rate": 9.666567134468001e-06, "loss": 0.4268, "step": 670 }, { "epoch": 0.0676818950930626, "grad_norm": 2.322754383087158, "learning_rate": 9.661590524534687e-06, "loss": 0.4744, "step": 680 }, { "epoch": 0.0686772170797253, "grad_norm": 0.9327623248100281, "learning_rate": 9.656613914601374e-06, "loss": 0.4355, "step": 690 }, { "epoch": 0.06967253906638797, "grad_norm": 0.6949413418769836, "learning_rate": 9.651637304668062e-06, "loss": 0.465, "step": 700 }, { "epoch": 0.06967253906638797, "eval_loss": 0.4817120432853699, "eval_runtime": 147.5643, "eval_samples_per_second": 1.376, "eval_steps_per_second": 0.691, "step": 700 }, { "epoch": 0.07066786105305066, "grad_norm": 0.5208165049552917, "learning_rate": 9.646660694734747e-06, "loss": 0.4973, "step": 710 }, { "epoch": 0.07166318303971335, "grad_norm": 0.8434884548187256, "learning_rate": 9.641684084801434e-06, "loss": 0.4721, "step": 720 }, { "epoch": 0.07265850502637604, "grad_norm": 0.7161769866943359, "learning_rate": 9.636707474868122e-06, "loss": 0.498, "step": 730 }, { "epoch": 0.07365382701303871, "grad_norm": 0.7036088705062866, "learning_rate": 9.631730864934807e-06, "loss": 0.4672, "step": 740 }, { "epoch": 0.0746491489997014, "grad_norm": 0.9175013899803162, "learning_rate": 9.626754255001493e-06, "loss": 0.4781, "step": 750 }, { "epoch": 0.07564447098636409, "grad_norm": 0.678519606590271, "learning_rate": 9.62177764506818e-06, "loss": 0.4048, "step": 760 }, { "epoch": 0.07663979297302677, "grad_norm": 0.6295528411865234, "learning_rate": 9.616801035134868e-06, "loss": 0.449, "step": 770 }, { "epoch": 0.07763511495968946, "grad_norm": 0.5424385666847229, "learning_rate": 9.611824425201553e-06, "loss": 0.4394, "step": 780 }, { "epoch": 0.07863043694635215, "grad_norm": 0.508836030960083, "learning_rate": 9.60684781526824e-06, "loss": 0.4317, "step": 790 }, { "epoch": 0.07962575893301484, "grad_norm": 0.6004147529602051, "learning_rate": 9.601871205334926e-06, "loss": 0.4308, "step": 800 }, { "epoch": 0.07962575893301484, "eval_loss": 0.47557342052459717, "eval_runtime": 147.5812, "eval_samples_per_second": 1.376, "eval_steps_per_second": 0.691, "step": 800 }, { "epoch": 0.08062108091967751, "grad_norm": 0.5553786754608154, "learning_rate": 9.596894595401613e-06, "loss": 0.4376, "step": 810 }, { "epoch": 0.0816164029063402, "grad_norm": 0.7254445552825928, "learning_rate": 9.591917985468299e-06, "loss": 0.4884, "step": 820 }, { "epoch": 0.08261172489300289, "grad_norm": 0.7175013422966003, "learning_rate": 9.586941375534986e-06, "loss": 0.4167, "step": 830 }, { "epoch": 0.08360704687966557, "grad_norm": 0.6464620232582092, "learning_rate": 9.581964765601674e-06, "loss": 0.4622, "step": 840 }, { "epoch": 0.08460236886632826, "grad_norm": 0.6999176144599915, "learning_rate": 9.57698815566836e-06, "loss": 0.4708, "step": 850 }, { "epoch": 0.08559769085299095, "grad_norm": 0.7939727306365967, "learning_rate": 9.572011545735047e-06, "loss": 0.4633, "step": 860 }, { "epoch": 0.08659301283965362, "grad_norm": 0.473017156124115, "learning_rate": 9.567034935801732e-06, "loss": 0.4585, "step": 870 }, { "epoch": 0.08758833482631631, "grad_norm": 0.7265183329582214, "learning_rate": 9.56205832586842e-06, "loss": 0.4485, "step": 880 }, { "epoch": 0.088583656812979, "grad_norm": 0.539735734462738, "learning_rate": 9.557081715935105e-06, "loss": 0.475, "step": 890 }, { "epoch": 0.08957897879964169, "grad_norm": 0.7587076425552368, "learning_rate": 9.552105106001792e-06, "loss": 0.4347, "step": 900 }, { "epoch": 0.08957897879964169, "eval_loss": 0.4690374732017517, "eval_runtime": 147.5672, "eval_samples_per_second": 1.376, "eval_steps_per_second": 0.691, "step": 900 }, { "epoch": 0.09057430078630437, "grad_norm": 0.7549741864204407, "learning_rate": 9.547128496068478e-06, "loss": 0.4434, "step": 910 }, { "epoch": 0.09156962277296705, "grad_norm": 0.686689555644989, "learning_rate": 9.542151886135165e-06, "loss": 0.4052, "step": 920 }, { "epoch": 0.09256494475962974, "grad_norm": 1.02870512008667, "learning_rate": 9.537175276201853e-06, "loss": 0.4806, "step": 930 }, { "epoch": 0.09356026674629242, "grad_norm": 0.7680675983428955, "learning_rate": 9.532198666268538e-06, "loss": 0.4609, "step": 940 }, { "epoch": 0.09455558873295511, "grad_norm": 0.5478435754776001, "learning_rate": 9.527222056335224e-06, "loss": 0.4171, "step": 950 }, { "epoch": 0.0955509107196178, "grad_norm": 0.5974985361099243, "learning_rate": 9.522245446401913e-06, "loss": 0.4686, "step": 960 }, { "epoch": 0.09654623270628049, "grad_norm": 0.997151792049408, "learning_rate": 9.517268836468598e-06, "loss": 0.4676, "step": 970 }, { "epoch": 0.09754155469294316, "grad_norm": 0.6366075277328491, "learning_rate": 9.512292226535284e-06, "loss": 0.4467, "step": 980 }, { "epoch": 0.09853687667960585, "grad_norm": 0.5682553052902222, "learning_rate": 9.507315616601971e-06, "loss": 0.4772, "step": 990 }, { "epoch": 0.09953219866626854, "grad_norm": 0.5869882106781006, "learning_rate": 9.502339006668659e-06, "loss": 0.3976, "step": 1000 }, { "epoch": 0.09953219866626854, "eval_loss": 0.46156319975852966, "eval_runtime": 147.6656, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.691, "step": 1000 }, { "epoch": 0.10052752065293122, "grad_norm": 0.5758237838745117, "learning_rate": 9.497362396735344e-06, "loss": 0.4528, "step": 1010 }, { "epoch": 0.10152284263959391, "grad_norm": 0.700281023979187, "learning_rate": 9.492385786802032e-06, "loss": 0.4545, "step": 1020 }, { "epoch": 0.1025181646262566, "grad_norm": 1.1320914030075073, "learning_rate": 9.487409176868719e-06, "loss": 0.4331, "step": 1030 }, { "epoch": 0.10351348661291927, "grad_norm": 0.6469867825508118, "learning_rate": 9.482432566935405e-06, "loss": 0.3759, "step": 1040 }, { "epoch": 0.10450880859958196, "grad_norm": 0.9471383094787598, "learning_rate": 9.47745595700209e-06, "loss": 0.4041, "step": 1050 }, { "epoch": 0.10550413058624465, "grad_norm": 0.5729160904884338, "learning_rate": 9.472479347068777e-06, "loss": 0.4871, "step": 1060 }, { "epoch": 0.10649945257290734, "grad_norm": 0.642436683177948, "learning_rate": 9.467502737135465e-06, "loss": 0.3893, "step": 1070 }, { "epoch": 0.10749477455957002, "grad_norm": 0.95659339427948, "learning_rate": 9.46252612720215e-06, "loss": 0.4486, "step": 1080 }, { "epoch": 0.1084900965462327, "grad_norm": 0.6642667055130005, "learning_rate": 9.457549517268838e-06, "loss": 0.5168, "step": 1090 }, { "epoch": 0.1094854185328954, "grad_norm": 0.5805796980857849, "learning_rate": 9.452572907335525e-06, "loss": 0.4019, "step": 1100 }, { "epoch": 0.1094854185328954, "eval_loss": 0.4559178054332733, "eval_runtime": 147.5891, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.691, "step": 1100 }, { "epoch": 0.11048074051955807, "grad_norm": 0.7006909251213074, "learning_rate": 9.44759629740221e-06, "loss": 0.457, "step": 1110 }, { "epoch": 0.11147606250622076, "grad_norm": 1.1821540594100952, "learning_rate": 9.442619687468896e-06, "loss": 0.3484, "step": 1120 }, { "epoch": 0.11247138449288345, "grad_norm": 0.7232743501663208, "learning_rate": 9.437643077535584e-06, "loss": 0.417, "step": 1130 }, { "epoch": 0.11346670647954614, "grad_norm": 0.6104183197021484, "learning_rate": 9.43266646760227e-06, "loss": 0.4821, "step": 1140 }, { "epoch": 0.11446202846620881, "grad_norm": 0.5961386561393738, "learning_rate": 9.427689857668956e-06, "loss": 0.4834, "step": 1150 }, { "epoch": 0.1154573504528715, "grad_norm": 0.5530894994735718, "learning_rate": 9.422713247735644e-06, "loss": 0.443, "step": 1160 }, { "epoch": 0.1164526724395342, "grad_norm": 0.5148622393608093, "learning_rate": 9.41773663780233e-06, "loss": 0.4029, "step": 1170 }, { "epoch": 0.11744799442619687, "grad_norm": 0.6148583292961121, "learning_rate": 9.412760027869017e-06, "loss": 0.4308, "step": 1180 }, { "epoch": 0.11844331641285956, "grad_norm": 0.7840449213981628, "learning_rate": 9.407783417935702e-06, "loss": 0.499, "step": 1190 }, { "epoch": 0.11943863839952225, "grad_norm": 0.6757422089576721, "learning_rate": 9.40280680800239e-06, "loss": 0.4263, "step": 1200 }, { "epoch": 0.11943863839952225, "eval_loss": 0.4505193829536438, "eval_runtime": 147.6664, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.691, "step": 1200 } ], "logging_steps": 10, "max_steps": 20094, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.756627798568276e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }