{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.15748031496062992, "eval_steps": 500, "global_step": 120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013123359580052493, "grad_norm": 2.3873534202575684, "learning_rate": 5e-06, "loss": 2.0092, "step": 1 }, { "epoch": 0.0026246719160104987, "grad_norm": 2.7223856449127197, "learning_rate": 1e-05, "loss": 2.2132, "step": 2 }, { "epoch": 0.003937007874015748, "grad_norm": 1.9315420389175415, "learning_rate": 1.5e-05, "loss": 1.9733, "step": 3 }, { "epoch": 0.005249343832020997, "grad_norm": 2.3864612579345703, "learning_rate": 2e-05, "loss": 2.1109, "step": 4 }, { "epoch": 0.006561679790026247, "grad_norm": 2.958733558654785, "learning_rate": 2.5e-05, "loss": 1.9878, "step": 5 }, { "epoch": 0.007874015748031496, "grad_norm": 1.7133508920669556, "learning_rate": 3e-05, "loss": 2.0302, "step": 6 }, { "epoch": 0.009186351706036745, "grad_norm": 3.248690366744995, "learning_rate": 3.5e-05, "loss": 1.8898, "step": 7 }, { "epoch": 0.010498687664041995, "grad_norm": 2.4799916744232178, "learning_rate": 4e-05, "loss": 1.9307, "step": 8 }, { "epoch": 0.011811023622047244, "grad_norm": 1.676323413848877, "learning_rate": 4.5e-05, "loss": 1.7378, "step": 9 }, { "epoch": 0.013123359580052493, "grad_norm": 2.3488361835479736, "learning_rate": 5e-05, "loss": 1.6053, "step": 10 }, { "epoch": 0.014435695538057743, "grad_norm": 1.7687658071517944, "learning_rate": 4.9545454545454553e-05, "loss": 1.5975, "step": 11 }, { "epoch": 0.015748031496062992, "grad_norm": 1.8719062805175781, "learning_rate": 4.909090909090909e-05, "loss": 1.9477, "step": 12 }, { "epoch": 0.01706036745406824, "grad_norm": 2.5747101306915283, "learning_rate": 4.863636363636364e-05, "loss": 1.7535, "step": 13 }, { "epoch": 0.01837270341207349, "grad_norm": 2.137082576751709, "learning_rate": 4.8181818181818186e-05, "loss": 2.0586, "step": 14 }, { "epoch": 0.01968503937007874, "grad_norm": 2.6489369869232178, "learning_rate": 4.772727272727273e-05, "loss": 1.6699, "step": 15 }, { "epoch": 0.02099737532808399, "grad_norm": 2.265389919281006, "learning_rate": 4.7272727272727275e-05, "loss": 1.5817, "step": 16 }, { "epoch": 0.02230971128608924, "grad_norm": 1.5514482259750366, "learning_rate": 4.681818181818182e-05, "loss": 1.4608, "step": 17 }, { "epoch": 0.023622047244094488, "grad_norm": 2.0037670135498047, "learning_rate": 4.636363636363636e-05, "loss": 1.7752, "step": 18 }, { "epoch": 0.024934383202099737, "grad_norm": 1.9936821460723877, "learning_rate": 4.5909090909090914e-05, "loss": 1.853, "step": 19 }, { "epoch": 0.026246719160104987, "grad_norm": 1.5179853439331055, "learning_rate": 4.545454545454546e-05, "loss": 1.2913, "step": 20 }, { "epoch": 0.027559055118110236, "grad_norm": 1.905592679977417, "learning_rate": 4.5e-05, "loss": 1.9713, "step": 21 }, { "epoch": 0.028871391076115485, "grad_norm": 1.677475094795227, "learning_rate": 4.454545454545455e-05, "loss": 1.8309, "step": 22 }, { "epoch": 0.030183727034120734, "grad_norm": 1.7671284675598145, "learning_rate": 4.409090909090909e-05, "loss": 1.5843, "step": 23 }, { "epoch": 0.031496062992125984, "grad_norm": 1.7625699043273926, "learning_rate": 4.3636363636363636e-05, "loss": 1.6694, "step": 24 }, { "epoch": 0.03280839895013123, "grad_norm": 1.8689508438110352, "learning_rate": 4.318181818181819e-05, "loss": 1.466, "step": 25 }, { "epoch": 0.03412073490813648, "grad_norm": 1.8745778799057007, "learning_rate": 4.2727272727272724e-05, "loss": 1.6644, "step": 26 }, { "epoch": 0.03543307086614173, "grad_norm": 2.0465145111083984, "learning_rate": 4.2272727272727275e-05, "loss": 1.5525, "step": 27 }, { "epoch": 0.03674540682414698, "grad_norm": 1.6687242984771729, "learning_rate": 4.181818181818182e-05, "loss": 1.6084, "step": 28 }, { "epoch": 0.03805774278215223, "grad_norm": 2.1840498447418213, "learning_rate": 4.1363636363636364e-05, "loss": 1.6326, "step": 29 }, { "epoch": 0.03937007874015748, "grad_norm": 1.6122187376022339, "learning_rate": 4.0909090909090915e-05, "loss": 2.0658, "step": 30 }, { "epoch": 0.04068241469816273, "grad_norm": 1.9633705615997314, "learning_rate": 4.045454545454546e-05, "loss": 1.5645, "step": 31 }, { "epoch": 0.04199475065616798, "grad_norm": 1.70602285861969, "learning_rate": 4e-05, "loss": 1.7884, "step": 32 }, { "epoch": 0.04330708661417323, "grad_norm": 1.6108742952346802, "learning_rate": 3.954545454545455e-05, "loss": 1.7589, "step": 33 }, { "epoch": 0.04461942257217848, "grad_norm": 1.5919780731201172, "learning_rate": 3.909090909090909e-05, "loss": 1.812, "step": 34 }, { "epoch": 0.045931758530183726, "grad_norm": 1.897838830947876, "learning_rate": 3.8636363636363636e-05, "loss": 1.4926, "step": 35 }, { "epoch": 0.047244094488188976, "grad_norm": 1.629894495010376, "learning_rate": 3.818181818181819e-05, "loss": 1.3242, "step": 36 }, { "epoch": 0.048556430446194225, "grad_norm": 1.8950881958007812, "learning_rate": 3.7727272727272725e-05, "loss": 1.8765, "step": 37 }, { "epoch": 0.049868766404199474, "grad_norm": 2.0185630321502686, "learning_rate": 3.7272727272727276e-05, "loss": 1.8417, "step": 38 }, { "epoch": 0.051181102362204724, "grad_norm": 1.7326010465621948, "learning_rate": 3.681818181818182e-05, "loss": 1.8811, "step": 39 }, { "epoch": 0.05249343832020997, "grad_norm": 2.088695526123047, "learning_rate": 3.6363636363636364e-05, "loss": 1.5162, "step": 40 }, { "epoch": 0.05380577427821522, "grad_norm": 1.5483412742614746, "learning_rate": 3.590909090909091e-05, "loss": 1.5722, "step": 41 }, { "epoch": 0.05511811023622047, "grad_norm": 1.638238787651062, "learning_rate": 3.545454545454546e-05, "loss": 1.7526, "step": 42 }, { "epoch": 0.05643044619422572, "grad_norm": 1.4721009731292725, "learning_rate": 3.5e-05, "loss": 1.7936, "step": 43 }, { "epoch": 0.05774278215223097, "grad_norm": 1.747015118598938, "learning_rate": 3.454545454545455e-05, "loss": 1.8557, "step": 44 }, { "epoch": 0.05905511811023622, "grad_norm": 1.5715826749801636, "learning_rate": 3.409090909090909e-05, "loss": 2.0143, "step": 45 }, { "epoch": 0.06036745406824147, "grad_norm": 1.5074222087860107, "learning_rate": 3.3636363636363636e-05, "loss": 1.8241, "step": 46 }, { "epoch": 0.06167979002624672, "grad_norm": 1.5870237350463867, "learning_rate": 3.318181818181819e-05, "loss": 1.8897, "step": 47 }, { "epoch": 0.06299212598425197, "grad_norm": 1.7853312492370605, "learning_rate": 3.272727272727273e-05, "loss": 1.6319, "step": 48 }, { "epoch": 0.06430446194225722, "grad_norm": 1.7995538711547852, "learning_rate": 3.2272727272727276e-05, "loss": 1.543, "step": 49 }, { "epoch": 0.06561679790026247, "grad_norm": 1.7892729043960571, "learning_rate": 3.181818181818182e-05, "loss": 1.6126, "step": 50 }, { "epoch": 0.06692913385826772, "grad_norm": 1.7710286378860474, "learning_rate": 3.1363636363636365e-05, "loss": 1.2743, "step": 51 }, { "epoch": 0.06824146981627296, "grad_norm": 1.5249481201171875, "learning_rate": 3.090909090909091e-05, "loss": 1.8577, "step": 52 }, { "epoch": 0.06955380577427822, "grad_norm": 2.504638671875, "learning_rate": 3.0454545454545456e-05, "loss": 1.5559, "step": 53 }, { "epoch": 0.07086614173228346, "grad_norm": 1.6714133024215698, "learning_rate": 3e-05, "loss": 1.7014, "step": 54 }, { "epoch": 0.07217847769028872, "grad_norm": 1.778096079826355, "learning_rate": 2.954545454545455e-05, "loss": 1.9663, "step": 55 }, { "epoch": 0.07349081364829396, "grad_norm": 1.3447784185409546, "learning_rate": 2.909090909090909e-05, "loss": 1.584, "step": 56 }, { "epoch": 0.07480314960629922, "grad_norm": 1.3542073965072632, "learning_rate": 2.863636363636364e-05, "loss": 1.6182, "step": 57 }, { "epoch": 0.07611548556430446, "grad_norm": 1.4053988456726074, "learning_rate": 2.818181818181818e-05, "loss": 1.8773, "step": 58 }, { "epoch": 0.07742782152230972, "grad_norm": 1.7100603580474854, "learning_rate": 2.772727272727273e-05, "loss": 1.6747, "step": 59 }, { "epoch": 0.07874015748031496, "grad_norm": 1.3210084438323975, "learning_rate": 2.7272727272727273e-05, "loss": 1.6899, "step": 60 }, { "epoch": 0.08005249343832022, "grad_norm": 1.9091479778289795, "learning_rate": 2.681818181818182e-05, "loss": 1.3745, "step": 61 }, { "epoch": 0.08136482939632546, "grad_norm": 1.4597588777542114, "learning_rate": 2.636363636363636e-05, "loss": 1.8139, "step": 62 }, { "epoch": 0.08267716535433071, "grad_norm": 1.7286309003829956, "learning_rate": 2.590909090909091e-05, "loss": 1.8441, "step": 63 }, { "epoch": 0.08398950131233596, "grad_norm": 1.7880821228027344, "learning_rate": 2.5454545454545454e-05, "loss": 1.8993, "step": 64 }, { "epoch": 0.08530183727034121, "grad_norm": 1.3418382406234741, "learning_rate": 2.5e-05, "loss": 2.0127, "step": 65 }, { "epoch": 0.08661417322834646, "grad_norm": 1.3379124402999878, "learning_rate": 2.4545454545454545e-05, "loss": 1.5954, "step": 66 }, { "epoch": 0.08792650918635171, "grad_norm": 1.7308839559555054, "learning_rate": 2.4090909090909093e-05, "loss": 1.5228, "step": 67 }, { "epoch": 0.08923884514435695, "grad_norm": 1.5438733100891113, "learning_rate": 2.3636363636363637e-05, "loss": 1.5943, "step": 68 }, { "epoch": 0.09055118110236221, "grad_norm": 1.3583992719650269, "learning_rate": 2.318181818181818e-05, "loss": 1.254, "step": 69 }, { "epoch": 0.09186351706036745, "grad_norm": 2.1203346252441406, "learning_rate": 2.272727272727273e-05, "loss": 1.0785, "step": 70 }, { "epoch": 0.09317585301837271, "grad_norm": 1.419634222984314, "learning_rate": 2.2272727272727274e-05, "loss": 1.6355, "step": 71 }, { "epoch": 0.09448818897637795, "grad_norm": 1.657630205154419, "learning_rate": 2.1818181818181818e-05, "loss": 1.7307, "step": 72 }, { "epoch": 0.09580052493438321, "grad_norm": 1.7227027416229248, "learning_rate": 2.1363636363636362e-05, "loss": 1.6308, "step": 73 }, { "epoch": 0.09711286089238845, "grad_norm": 1.6593782901763916, "learning_rate": 2.090909090909091e-05, "loss": 1.6965, "step": 74 }, { "epoch": 0.0984251968503937, "grad_norm": 1.5448381900787354, "learning_rate": 2.0454545454545457e-05, "loss": 1.6164, "step": 75 }, { "epoch": 0.09973753280839895, "grad_norm": 1.8978919982910156, "learning_rate": 2e-05, "loss": 1.6052, "step": 76 }, { "epoch": 0.1010498687664042, "grad_norm": 1.7359306812286377, "learning_rate": 1.9545454545454546e-05, "loss": 1.3419, "step": 77 }, { "epoch": 0.10236220472440945, "grad_norm": 1.6190097332000732, "learning_rate": 1.9090909090909094e-05, "loss": 1.7516, "step": 78 }, { "epoch": 0.1036745406824147, "grad_norm": 1.660467505455017, "learning_rate": 1.8636363636363638e-05, "loss": 1.7958, "step": 79 }, { "epoch": 0.10498687664041995, "grad_norm": 1.5485517978668213, "learning_rate": 1.8181818181818182e-05, "loss": 1.5263, "step": 80 }, { "epoch": 0.1062992125984252, "grad_norm": 1.545188546180725, "learning_rate": 1.772727272727273e-05, "loss": 1.1843, "step": 81 }, { "epoch": 0.10761154855643044, "grad_norm": 1.9092739820480347, "learning_rate": 1.7272727272727274e-05, "loss": 1.2617, "step": 82 }, { "epoch": 0.1089238845144357, "grad_norm": 1.2646833658218384, "learning_rate": 1.6818181818181818e-05, "loss": 1.0567, "step": 83 }, { "epoch": 0.11023622047244094, "grad_norm": 1.636443018913269, "learning_rate": 1.6363636363636366e-05, "loss": 1.7907, "step": 84 }, { "epoch": 0.1115485564304462, "grad_norm": 1.9909443855285645, "learning_rate": 1.590909090909091e-05, "loss": 1.8247, "step": 85 }, { "epoch": 0.11286089238845144, "grad_norm": 1.3928022384643555, "learning_rate": 1.5454545454545454e-05, "loss": 2.0051, "step": 86 }, { "epoch": 0.1141732283464567, "grad_norm": 1.3174184560775757, "learning_rate": 1.5e-05, "loss": 1.1068, "step": 87 }, { "epoch": 0.11548556430446194, "grad_norm": 1.7415287494659424, "learning_rate": 1.4545454545454545e-05, "loss": 1.5977, "step": 88 }, { "epoch": 0.1167979002624672, "grad_norm": 1.4680395126342773, "learning_rate": 1.409090909090909e-05, "loss": 1.6774, "step": 89 }, { "epoch": 0.11811023622047244, "grad_norm": 1.1343920230865479, "learning_rate": 1.3636363636363637e-05, "loss": 1.2001, "step": 90 }, { "epoch": 0.1194225721784777, "grad_norm": 1.4262038469314575, "learning_rate": 1.318181818181818e-05, "loss": 1.6783, "step": 91 }, { "epoch": 0.12073490813648294, "grad_norm": 1.3531324863433838, "learning_rate": 1.2727272727272727e-05, "loss": 1.3457, "step": 92 }, { "epoch": 0.1220472440944882, "grad_norm": 1.4570822715759277, "learning_rate": 1.2272727272727273e-05, "loss": 1.4791, "step": 93 }, { "epoch": 0.12335958005249344, "grad_norm": 1.5847400426864624, "learning_rate": 1.1818181818181819e-05, "loss": 1.775, "step": 94 }, { "epoch": 0.12467191601049869, "grad_norm": 1.318939447402954, "learning_rate": 1.1363636363636365e-05, "loss": 1.2534, "step": 95 }, { "epoch": 0.12598425196850394, "grad_norm": 1.2848924398422241, "learning_rate": 1.0909090909090909e-05, "loss": 1.4489, "step": 96 }, { "epoch": 0.1272965879265092, "grad_norm": 1.366542100906372, "learning_rate": 1.0454545454545455e-05, "loss": 1.4036, "step": 97 }, { "epoch": 0.12860892388451445, "grad_norm": 1.3594622611999512, "learning_rate": 1e-05, "loss": 1.3951, "step": 98 }, { "epoch": 0.12992125984251968, "grad_norm": 1.5525848865509033, "learning_rate": 9.545454545454547e-06, "loss": 1.4648, "step": 99 }, { "epoch": 0.13123359580052493, "grad_norm": 1.0725868940353394, "learning_rate": 9.090909090909091e-06, "loss": 1.2162, "step": 100 }, { "epoch": 0.1325459317585302, "grad_norm": 1.3359230756759644, "learning_rate": 8.636363636363637e-06, "loss": 1.8726, "step": 101 }, { "epoch": 0.13385826771653545, "grad_norm": 1.2529302835464478, "learning_rate": 8.181818181818183e-06, "loss": 1.2787, "step": 102 }, { "epoch": 0.13517060367454067, "grad_norm": 1.0764676332473755, "learning_rate": 7.727272727272727e-06, "loss": 1.3241, "step": 103 }, { "epoch": 0.13648293963254593, "grad_norm": 1.7506639957427979, "learning_rate": 7.272727272727272e-06, "loss": 1.5734, "step": 104 }, { "epoch": 0.1377952755905512, "grad_norm": 1.609567403793335, "learning_rate": 6.818181818181818e-06, "loss": 1.6403, "step": 105 }, { "epoch": 0.13910761154855644, "grad_norm": 1.47870671749115, "learning_rate": 6.363636363636363e-06, "loss": 1.9961, "step": 106 }, { "epoch": 0.14041994750656167, "grad_norm": 2.0325543880462646, "learning_rate": 5.909090909090909e-06, "loss": 1.2603, "step": 107 }, { "epoch": 0.14173228346456693, "grad_norm": 1.988196849822998, "learning_rate": 5.4545454545454545e-06, "loss": 1.6179, "step": 108 }, { "epoch": 0.14304461942257218, "grad_norm": 1.5494691133499146, "learning_rate": 5e-06, "loss": 1.6994, "step": 109 }, { "epoch": 0.14435695538057744, "grad_norm": 1.5802687406539917, "learning_rate": 4.5454545454545455e-06, "loss": 1.843, "step": 110 }, { "epoch": 0.14566929133858267, "grad_norm": 1.7539070844650269, "learning_rate": 4.0909090909090915e-06, "loss": 1.6814, "step": 111 }, { "epoch": 0.14698162729658792, "grad_norm": 1.5182738304138184, "learning_rate": 3.636363636363636e-06, "loss": 1.7597, "step": 112 }, { "epoch": 0.14829396325459318, "grad_norm": 1.5883097648620605, "learning_rate": 3.1818181818181817e-06, "loss": 1.5302, "step": 113 }, { "epoch": 0.14960629921259844, "grad_norm": 1.4584324359893799, "learning_rate": 2.7272727272727272e-06, "loss": 1.652, "step": 114 }, { "epoch": 0.15091863517060367, "grad_norm": 1.9044572114944458, "learning_rate": 2.2727272727272728e-06, "loss": 1.7888, "step": 115 }, { "epoch": 0.15223097112860892, "grad_norm": 1.6256836652755737, "learning_rate": 1.818181818181818e-06, "loss": 1.6997, "step": 116 }, { "epoch": 0.15354330708661418, "grad_norm": 1.4854289293289185, "learning_rate": 1.3636363636363636e-06, "loss": 1.7064, "step": 117 }, { "epoch": 0.15485564304461943, "grad_norm": 1.5047718286514282, "learning_rate": 9.09090909090909e-07, "loss": 1.4211, "step": 118 }, { "epoch": 0.15616797900262466, "grad_norm": 1.6195610761642456, "learning_rate": 4.545454545454545e-07, "loss": 1.8672, "step": 119 }, { "epoch": 0.15748031496062992, "grad_norm": 1.5324839353561401, "learning_rate": 0.0, "loss": 1.6956, "step": 120 } ], "logging_steps": 1, "max_steps": 120, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.935734882582528e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }