{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2, "eval_steps": 50, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002, "grad_norm": 0.3986969590187073, "learning_rate": 0.0001, "loss": 2.7769, "step": 1 }, { "epoch": 0.002, "eval_loss": 3.0125324726104736, "eval_runtime": 4.8013, "eval_samples_per_second": 4.374, "eval_steps_per_second": 4.374, "step": 1 }, { "epoch": 0.004, "grad_norm": 0.5986809730529785, "learning_rate": 0.0002, "loss": 2.9521, "step": 2 }, { "epoch": 0.006, "grad_norm": 0.595142662525177, "learning_rate": 0.0003, "loss": 2.955, "step": 3 }, { "epoch": 0.008, "grad_norm": 0.7013932466506958, "learning_rate": 0.0004, "loss": 2.9037, "step": 4 }, { "epoch": 0.01, "grad_norm": 1.5847638845443726, "learning_rate": 0.0005, "loss": 2.9706, "step": 5 }, { "epoch": 0.012, "grad_norm": 1.6309813261032104, "learning_rate": 0.0006, "loss": 2.75, "step": 6 }, { "epoch": 0.014, "grad_norm": 1.3442208766937256, "learning_rate": 0.0007, "loss": 2.5161, "step": 7 }, { "epoch": 0.016, "grad_norm": 0.900488018989563, "learning_rate": 0.0008, "loss": 2.2906, "step": 8 }, { "epoch": 0.018, "grad_norm": 2.340869903564453, "learning_rate": 0.0009000000000000001, "loss": 2.6079, "step": 9 }, { "epoch": 0.02, "grad_norm": 2.987302303314209, "learning_rate": 0.001, "loss": 2.5506, "step": 10 }, { "epoch": 0.022, "grad_norm": 1.844685673713684, "learning_rate": 0.0009996954135095479, "loss": 2.7146, "step": 11 }, { "epoch": 0.024, "grad_norm": 0.9662850499153137, "learning_rate": 0.0009987820251299122, "loss": 2.6323, "step": 12 }, { "epoch": 0.026, "grad_norm": 3.0721042156219482, "learning_rate": 0.0009972609476841367, "loss": 2.1718, "step": 13 }, { "epoch": 0.028, "grad_norm": 1.0009405612945557, "learning_rate": 0.0009951340343707852, "loss": 2.6348, "step": 14 }, { "epoch": 0.03, "grad_norm": 14.435264587402344, "learning_rate": 0.000992403876506104, "loss": 2.5352, "step": 15 }, { "epoch": 0.032, "grad_norm": 5.060039520263672, "learning_rate": 0.0009890738003669028, "loss": 2.708, "step": 16 }, { "epoch": 0.034, "grad_norm": 1.6351608037948608, "learning_rate": 0.0009851478631379982, "loss": 2.3905, "step": 17 }, { "epoch": 0.036, "grad_norm": 2.9582386016845703, "learning_rate": 0.0009806308479691594, "loss": 2.5147, "step": 18 }, { "epoch": 0.038, "grad_norm": 1.8205921649932861, "learning_rate": 0.0009755282581475768, "loss": 2.766, "step": 19 }, { "epoch": 0.04, "grad_norm": 1.1158825159072876, "learning_rate": 0.0009698463103929542, "loss": 2.7895, "step": 20 }, { "epoch": 0.042, "grad_norm": 1.1689060926437378, "learning_rate": 0.0009635919272833937, "loss": 2.6373, "step": 21 }, { "epoch": 0.044, "grad_norm": 0.8205438256263733, "learning_rate": 0.0009567727288213005, "loss": 2.4038, "step": 22 }, { "epoch": 0.046, "grad_norm": 1.2794568538665771, "learning_rate": 0.0009493970231495835, "loss": 2.3676, "step": 23 }, { "epoch": 0.048, "grad_norm": 0.822256863117218, "learning_rate": 0.0009414737964294635, "loss": 2.327, "step": 24 }, { "epoch": 0.05, "grad_norm": 1.986864447593689, "learning_rate": 0.0009330127018922195, "loss": 2.4431, "step": 25 }, { "epoch": 0.052, "grad_norm": 3.7959301471710205, "learning_rate": 0.0009240240480782129, "loss": 2.6657, "step": 26 }, { "epoch": 0.054, "grad_norm": 2.489267587661743, "learning_rate": 0.0009145187862775209, "loss": 2.5005, "step": 27 }, { "epoch": 0.056, "grad_norm": 2.1583516597747803, "learning_rate": 0.0009045084971874737, "loss": 2.5402, "step": 28 }, { "epoch": 0.058, "grad_norm": 4.524465084075928, "learning_rate": 0.0008940053768033609, "loss": 2.2461, "step": 29 }, { "epoch": 0.06, "grad_norm": 1.3595800399780273, "learning_rate": 0.000883022221559489, "loss": 2.331, "step": 30 }, { "epoch": 0.062, "grad_norm": 0.9844056367874146, "learning_rate": 0.0008715724127386971, "loss": 2.3781, "step": 31 }, { "epoch": 0.064, "grad_norm": 1.117148518562317, "learning_rate": 0.0008596699001693256, "loss": 2.4258, "step": 32 }, { "epoch": 0.066, "grad_norm": 0.7900739312171936, "learning_rate": 0.0008473291852294987, "loss": 2.437, "step": 33 }, { "epoch": 0.068, "grad_norm": 0.8672456741333008, "learning_rate": 0.0008345653031794292, "loss": 2.8025, "step": 34 }, { "epoch": 0.07, "grad_norm": 0.816504716873169, "learning_rate": 0.0008213938048432696, "loss": 2.5078, "step": 35 }, { "epoch": 0.072, "grad_norm": 1.0574641227722168, "learning_rate": 0.0008078307376628291, "loss": 2.6408, "step": 36 }, { "epoch": 0.074, "grad_norm": 0.6753240823745728, "learning_rate": 0.0007938926261462366, "loss": 2.2858, "step": 37 }, { "epoch": 0.076, "grad_norm": 0.9166250824928284, "learning_rate": 0.0007795964517353734, "loss": 2.7091, "step": 38 }, { "epoch": 0.078, "grad_norm": 0.9022424221038818, "learning_rate": 0.0007649596321166025, "loss": 2.6459, "step": 39 }, { "epoch": 0.08, "grad_norm": 0.7723848223686218, "learning_rate": 0.00075, "loss": 2.4329, "step": 40 }, { "epoch": 0.082, "grad_norm": 0.8669672012329102, "learning_rate": 0.0007347357813929454, "loss": 2.3661, "step": 41 }, { "epoch": 0.084, "grad_norm": 0.9701873660087585, "learning_rate": 0.0007191855733945387, "loss": 2.6723, "step": 42 }, { "epoch": 0.086, "grad_norm": 0.8038893342018127, "learning_rate": 0.0007033683215379002, "loss": 2.7652, "step": 43 }, { "epoch": 0.088, "grad_norm": 0.6812747716903687, "learning_rate": 0.0006873032967079561, "loss": 2.4019, "step": 44 }, { "epoch": 0.09, "grad_norm": 0.8909493088722229, "learning_rate": 0.0006710100716628344, "loss": 2.349, "step": 45 }, { "epoch": 0.092, "grad_norm": 0.9887206554412842, "learning_rate": 0.0006545084971874737, "loss": 2.5577, "step": 46 }, { "epoch": 0.094, "grad_norm": 0.7749077081680298, "learning_rate": 0.0006378186779084996, "loss": 2.2903, "step": 47 }, { "epoch": 0.096, "grad_norm": 1.0913500785827637, "learning_rate": 0.0006209609477998338, "loss": 2.3697, "step": 48 }, { "epoch": 0.098, "grad_norm": 0.894119381904602, "learning_rate": 0.0006039558454088796, "loss": 2.5167, "step": 49 }, { "epoch": 0.1, "grad_norm": 1.159035325050354, "learning_rate": 0.0005868240888334653, "loss": 2.4637, "step": 50 }, { "epoch": 0.1, "eval_loss": 2.5838444232940674, "eval_runtime": 4.8707, "eval_samples_per_second": 4.311, "eval_steps_per_second": 4.311, "step": 50 }, { "epoch": 0.102, "grad_norm": 0.6844251751899719, "learning_rate": 0.0005695865504800327, "loss": 2.4118, "step": 51 }, { "epoch": 0.104, "grad_norm": 1.1709848642349243, "learning_rate": 0.0005522642316338268, "loss": 2.444, "step": 52 }, { "epoch": 0.106, "grad_norm": 0.9435467720031738, "learning_rate": 0.0005348782368720626, "loss": 2.5568, "step": 53 }, { "epoch": 0.108, "grad_norm": 1.0800719261169434, "learning_rate": 0.0005174497483512506, "loss": 2.5766, "step": 54 }, { "epoch": 0.11, "grad_norm": 1.001356840133667, "learning_rate": 0.0005, "loss": 2.2205, "step": 55 }, { "epoch": 0.112, "grad_norm": 1.4582829475402832, "learning_rate": 0.0004825502516487497, "loss": 2.7271, "step": 56 }, { "epoch": 0.114, "grad_norm": 0.8312236666679382, "learning_rate": 0.00046512176312793734, "loss": 2.3204, "step": 57 }, { "epoch": 0.116, "grad_norm": 1.2127161026000977, "learning_rate": 0.00044773576836617336, "loss": 2.0169, "step": 58 }, { "epoch": 0.118, "grad_norm": 1.6428215503692627, "learning_rate": 0.0004304134495199674, "loss": 2.4521, "step": 59 }, { "epoch": 0.12, "grad_norm": 1.7682443857192993, "learning_rate": 0.00041317591116653486, "loss": 2.6753, "step": 60 }, { "epoch": 0.122, "grad_norm": 1.0919681787490845, "learning_rate": 0.0003960441545911204, "loss": 2.4022, "step": 61 }, { "epoch": 0.124, "grad_norm": 2.5304136276245117, "learning_rate": 0.0003790390522001662, "loss": 2.4325, "step": 62 }, { "epoch": 0.126, "grad_norm": 1.1737953424453735, "learning_rate": 0.00036218132209150044, "loss": 2.2653, "step": 63 }, { "epoch": 0.128, "grad_norm": 0.7943472862243652, "learning_rate": 0.00034549150281252633, "loss": 2.6079, "step": 64 }, { "epoch": 0.13, "grad_norm": 1.3269349336624146, "learning_rate": 0.0003289899283371657, "loss": 2.3745, "step": 65 }, { "epoch": 0.132, "grad_norm": 0.8898394107818604, "learning_rate": 0.00031269670329204396, "loss": 2.3862, "step": 66 }, { "epoch": 0.134, "grad_norm": 0.8309778571128845, "learning_rate": 0.0002966316784621, "loss": 2.5131, "step": 67 }, { "epoch": 0.136, "grad_norm": 1.2103646993637085, "learning_rate": 0.00028081442660546124, "loss": 2.5138, "step": 68 }, { "epoch": 0.138, "grad_norm": 0.9281813502311707, "learning_rate": 0.00026526421860705474, "loss": 2.5798, "step": 69 }, { "epoch": 0.14, "grad_norm": 0.8275775909423828, "learning_rate": 0.0002500000000000001, "loss": 2.5348, "step": 70 }, { "epoch": 0.142, "grad_norm": 1.5009329319000244, "learning_rate": 0.0002350403678833976, "loss": 2.5156, "step": 71 }, { "epoch": 0.144, "grad_norm": 1.4796998500823975, "learning_rate": 0.00022040354826462666, "loss": 2.3567, "step": 72 }, { "epoch": 0.146, "grad_norm": 0.7437081933021545, "learning_rate": 0.00020610737385376348, "loss": 2.4399, "step": 73 }, { "epoch": 0.148, "grad_norm": 0.7033576369285583, "learning_rate": 0.00019216926233717085, "loss": 2.3149, "step": 74 }, { "epoch": 0.15, "grad_norm": 0.9651651978492737, "learning_rate": 0.0001786061951567303, "loss": 2.5816, "step": 75 }, { "epoch": 0.152, "grad_norm": 1.0059478282928467, "learning_rate": 0.00016543469682057105, "loss": 2.6395, "step": 76 }, { "epoch": 0.154, "grad_norm": 1.6795697212219238, "learning_rate": 0.00015267081477050133, "loss": 2.3551, "step": 77 }, { "epoch": 0.156, "grad_norm": 0.7962441444396973, "learning_rate": 0.00014033009983067452, "loss": 2.2151, "step": 78 }, { "epoch": 0.158, "grad_norm": 0.880089282989502, "learning_rate": 0.00012842758726130281, "loss": 2.4376, "step": 79 }, { "epoch": 0.16, "grad_norm": 1.0629572868347168, "learning_rate": 0.00011697777844051105, "loss": 2.6063, "step": 80 }, { "epoch": 0.162, "grad_norm": 0.8691402077674866, "learning_rate": 0.00010599462319663906, "loss": 2.4764, "step": 81 }, { "epoch": 0.164, "grad_norm": 0.8258126378059387, "learning_rate": 9.549150281252633e-05, "loss": 2.3996, "step": 82 }, { "epoch": 0.166, "grad_norm": 2.253006935119629, "learning_rate": 8.548121372247918e-05, "loss": 2.7106, "step": 83 }, { "epoch": 0.168, "grad_norm": 0.9351361393928528, "learning_rate": 7.597595192178702e-05, "loss": 2.3613, "step": 84 }, { "epoch": 0.17, "grad_norm": 0.8624694347381592, "learning_rate": 6.698729810778065e-05, "loss": 2.4328, "step": 85 }, { "epoch": 0.172, "grad_norm": 0.6949071884155273, "learning_rate": 5.852620357053651e-05, "loss": 2.4157, "step": 86 }, { "epoch": 0.174, "grad_norm": 0.7830259203910828, "learning_rate": 5.060297685041659e-05, "loss": 2.2797, "step": 87 }, { "epoch": 0.176, "grad_norm": 1.3727121353149414, "learning_rate": 4.322727117869951e-05, "loss": 2.6155, "step": 88 }, { "epoch": 0.178, "grad_norm": 0.6731472611427307, "learning_rate": 3.6408072716606344e-05, "loss": 2.4149, "step": 89 }, { "epoch": 0.18, "grad_norm": 0.846976101398468, "learning_rate": 3.0153689607045842e-05, "loss": 2.3137, "step": 90 }, { "epoch": 0.182, "grad_norm": 0.9294453859329224, "learning_rate": 2.4471741852423235e-05, "loss": 2.5798, "step": 91 }, { "epoch": 0.184, "grad_norm": 0.766918957233429, "learning_rate": 1.9369152030840554e-05, "loss": 2.6766, "step": 92 }, { "epoch": 0.186, "grad_norm": 1.3079534769058228, "learning_rate": 1.4852136862001764e-05, "loss": 2.6047, "step": 93 }, { "epoch": 0.188, "grad_norm": 1.1351994276046753, "learning_rate": 1.0926199633097156e-05, "loss": 2.6034, "step": 94 }, { "epoch": 0.19, "grad_norm": 0.8010856509208679, "learning_rate": 7.59612349389599e-06, "loss": 2.2994, "step": 95 }, { "epoch": 0.192, "grad_norm": 0.9184717535972595, "learning_rate": 4.865965629214819e-06, "loss": 2.5489, "step": 96 }, { "epoch": 0.194, "grad_norm": 0.9543655514717102, "learning_rate": 2.739052315863355e-06, "loss": 2.5186, "step": 97 }, { "epoch": 0.196, "grad_norm": 0.9216803908348083, "learning_rate": 1.2179748700879012e-06, "loss": 2.5627, "step": 98 }, { "epoch": 0.198, "grad_norm": 0.8810911178588867, "learning_rate": 3.0458649045211895e-07, "loss": 2.6527, "step": 99 }, { "epoch": 0.2, "grad_norm": 0.7426478266716003, "learning_rate": 0.0, "loss": 2.1737, "step": 100 }, { "epoch": 0.2, "eval_loss": 2.527949094772339, "eval_runtime": 4.9855, "eval_samples_per_second": 4.212, "eval_steps_per_second": 4.212, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.62874924204032e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }