{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.962025316455696,
  "eval_steps": 500,
  "global_step": 78,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0379746835443038,
      "grad_norm": 9.957548553322647,
      "learning_rate": 1.25e-06,
      "loss": 0.9713,
      "step": 1
    },
    {
      "epoch": 0.0759493670886076,
      "grad_norm": 9.985108674661252,
      "learning_rate": 2.5e-06,
      "loss": 0.991,
      "step": 2
    },
    {
      "epoch": 0.11392405063291139,
      "grad_norm": 9.706261106399902,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 1.0266,
      "step": 3
    },
    {
      "epoch": 0.1518987341772152,
      "grad_norm": 8.001810981524628,
      "learning_rate": 5e-06,
      "loss": 0.9713,
      "step": 4
    },
    {
      "epoch": 0.189873417721519,
      "grad_norm": 5.169780518606936,
      "learning_rate": 6.25e-06,
      "loss": 0.9267,
      "step": 5
    },
    {
      "epoch": 0.22784810126582278,
      "grad_norm": 3.998888986405131,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.8087,
      "step": 6
    },
    {
      "epoch": 0.26582278481012656,
      "grad_norm": 3.3221599230206063,
      "learning_rate": 8.750000000000001e-06,
      "loss": 0.7365,
      "step": 7
    },
    {
      "epoch": 0.3037974683544304,
      "grad_norm": 2.616871192982944,
      "learning_rate": 1e-05,
      "loss": 0.7156,
      "step": 8
    },
    {
      "epoch": 0.34177215189873417,
      "grad_norm": 2.346550661330392,
      "learning_rate": 9.994965332706574e-06,
      "loss": 0.687,
      "step": 9
    },
    {
      "epoch": 0.379746835443038,
      "grad_norm": 2.4394026660864716,
      "learning_rate": 9.979871469976197e-06,
      "loss": 0.6445,
      "step": 10
    },
    {
      "epoch": 0.4177215189873418,
      "grad_norm": 1.9546180950509056,
      "learning_rate": 9.954748808839675e-06,
      "loss": 0.634,
      "step": 11
    },
    {
      "epoch": 0.45569620253164556,
      "grad_norm": 1.6494467158995272,
      "learning_rate": 9.91964794299315e-06,
      "loss": 0.6408,
      "step": 12
    },
    {
      "epoch": 0.4936708860759494,
      "grad_norm": 1.593487584280316,
      "learning_rate": 9.874639560909118e-06,
      "loss": 0.5857,
      "step": 13
    },
    {
      "epoch": 0.5316455696202531,
      "grad_norm": 1.7418928363879251,
      "learning_rate": 9.819814303479268e-06,
      "loss": 0.6297,
      "step": 14
    },
    {
      "epoch": 0.569620253164557,
      "grad_norm": 1.481174396209577,
      "learning_rate": 9.755282581475769e-06,
      "loss": 0.6558,
      "step": 15
    },
    {
      "epoch": 0.6075949367088608,
      "grad_norm": 1.3303606157200163,
      "learning_rate": 9.681174353198687e-06,
      "loss": 0.6002,
      "step": 16
    },
    {
      "epoch": 0.6455696202531646,
      "grad_norm": 1.433820593402365,
      "learning_rate": 9.597638862757255e-06,
      "loss": 0.6206,
      "step": 17
    },
    {
      "epoch": 0.6835443037974683,
      "grad_norm": 1.2744328251297492,
      "learning_rate": 9.504844339512096e-06,
      "loss": 0.6227,
      "step": 18
    },
    {
      "epoch": 0.7215189873417721,
      "grad_norm": 1.3031604814024054,
      "learning_rate": 9.40297765928369e-06,
      "loss": 0.587,
      "step": 19
    },
    {
      "epoch": 0.759493670886076,
      "grad_norm": 1.4316949075465608,
      "learning_rate": 9.292243968009332e-06,
      "loss": 0.6471,
      "step": 20
    },
    {
      "epoch": 0.7974683544303798,
      "grad_norm": 1.162051301543542,
      "learning_rate": 9.172866268606514e-06,
      "loss": 0.5536,
      "step": 21
    },
    {
      "epoch": 0.8354430379746836,
      "grad_norm": 1.157069048811417,
      "learning_rate": 9.045084971874738e-06,
      "loss": 0.578,
      "step": 22
    },
    {
      "epoch": 0.8734177215189873,
      "grad_norm": 1.3103025654917138,
      "learning_rate": 8.90915741234015e-06,
      "loss": 0.591,
      "step": 23
    },
    {
      "epoch": 0.9113924050632911,
      "grad_norm": 1.380236644707809,
      "learning_rate": 8.765357330018056e-06,
      "loss": 0.6074,
      "step": 24
    },
    {
      "epoch": 0.9493670886075949,
      "grad_norm": 1.2486616195577804,
      "learning_rate": 8.613974319136959e-06,
      "loss": 0.6023,
      "step": 25
    },
    {
      "epoch": 0.9873417721518988,
      "grad_norm": 1.2816040526324866,
      "learning_rate": 8.455313244934324e-06,
      "loss": 0.5933,
      "step": 26
    },
    {
      "epoch": 1.0253164556962024,
      "grad_norm": 2.335290159380928,
      "learning_rate": 8.289693629698564e-06,
      "loss": 0.9482,
      "step": 27
    },
    {
      "epoch": 1.0632911392405062,
      "grad_norm": 1.097409197181704,
      "learning_rate": 8.117449009293668e-06,
      "loss": 0.4475,
      "step": 28
    },
    {
      "epoch": 1.1012658227848102,
      "grad_norm": 1.0749833654838596,
      "learning_rate": 7.938926261462366e-06,
      "loss": 0.479,
      "step": 29
    },
    {
      "epoch": 1.139240506329114,
      "grad_norm": 1.060278309114871,
      "learning_rate": 7.754484907260513e-06,
      "loss": 0.487,
      "step": 30
    },
    {
      "epoch": 1.1772151898734178,
      "grad_norm": 1.1027437567641056,
      "learning_rate": 7.564496387029532e-06,
      "loss": 0.4719,
      "step": 31
    },
    {
      "epoch": 1.2151898734177216,
      "grad_norm": 1.1228067655268963,
      "learning_rate": 7.369343312364994e-06,
      "loss": 0.439,
      "step": 32
    },
    {
      "epoch": 1.2531645569620253,
      "grad_norm": 1.2042550076536291,
      "learning_rate": 7.169418695587791e-06,
      "loss": 0.4851,
      "step": 33
    },
    {
      "epoch": 1.2911392405063291,
      "grad_norm": 1.071175489408689,
      "learning_rate": 6.965125158269619e-06,
      "loss": 0.4677,
      "step": 34
    },
    {
      "epoch": 1.3291139240506329,
      "grad_norm": 1.095264978758299,
      "learning_rate": 6.7568741204067145e-06,
      "loss": 0.4578,
      "step": 35
    },
    {
      "epoch": 1.3670886075949367,
      "grad_norm": 1.147169839461561,
      "learning_rate": 6.545084971874738e-06,
      "loss": 0.4479,
      "step": 36
    },
    {
      "epoch": 1.4050632911392404,
      "grad_norm": 1.2799965851045123,
      "learning_rate": 6.330184227833376e-06,
      "loss": 0.4731,
      "step": 37
    },
    {
      "epoch": 1.4430379746835442,
      "grad_norm": 1.177828457101423,
      "learning_rate": 6.112604669781572e-06,
      "loss": 0.4498,
      "step": 38
    },
    {
      "epoch": 1.481012658227848,
      "grad_norm": 1.1619484427029347,
      "learning_rate": 5.892784473993184e-06,
      "loss": 0.4736,
      "step": 39
    },
    {
      "epoch": 1.518987341772152,
      "grad_norm": 1.0345103815690855,
      "learning_rate": 5.671166329088278e-06,
      "loss": 0.435,
      "step": 40
    },
    {
      "epoch": 1.5569620253164556,
      "grad_norm": 1.1026479888157206,
      "learning_rate": 5.448196544517168e-06,
      "loss": 0.4889,
      "step": 41
    },
    {
      "epoch": 1.5949367088607596,
      "grad_norm": 0.9919388016336564,
      "learning_rate": 5.224324151752575e-06,
      "loss": 0.4153,
      "step": 42
    },
    {
      "epoch": 1.6329113924050633,
      "grad_norm": 1.0715771514703643,
      "learning_rate": 5e-06,
      "loss": 0.4665,
      "step": 43
    },
    {
      "epoch": 1.6708860759493671,
      "grad_norm": 1.1245062678426976,
      "learning_rate": 4.775675848247427e-06,
      "loss": 0.467,
      "step": 44
    },
    {
      "epoch": 1.7088607594936709,
      "grad_norm": 0.9773184362626276,
      "learning_rate": 4.551803455482833e-06,
      "loss": 0.4624,
      "step": 45
    },
    {
      "epoch": 1.7468354430379747,
      "grad_norm": 0.9253982190523925,
      "learning_rate": 4.3288336709117246e-06,
      "loss": 0.4029,
      "step": 46
    },
    {
      "epoch": 1.7848101265822784,
      "grad_norm": 1.0275614152119965,
      "learning_rate": 4.107215526006818e-06,
      "loss": 0.4661,
      "step": 47
    },
    {
      "epoch": 1.8227848101265822,
      "grad_norm": 1.0879971013287744,
      "learning_rate": 3.887395330218429e-06,
      "loss": 0.4822,
      "step": 48
    },
    {
      "epoch": 1.8607594936708862,
      "grad_norm": 0.9725180909825993,
      "learning_rate": 3.669815772166625e-06,
      "loss": 0.437,
      "step": 49
    },
    {
      "epoch": 1.8987341772151898,
      "grad_norm": 1.0660365226149333,
      "learning_rate": 3.4549150281252635e-06,
      "loss": 0.4685,
      "step": 50
    },
    {
      "epoch": 1.9367088607594938,
      "grad_norm": 0.9864954896447238,
      "learning_rate": 3.2431258795932863e-06,
      "loss": 0.4427,
      "step": 51
    },
    {
      "epoch": 1.9746835443037973,
      "grad_norm": 1.0251705463878462,
      "learning_rate": 3.0348748417303826e-06,
      "loss": 0.4103,
      "step": 52
    },
    {
      "epoch": 2.0126582278481013,
      "grad_norm": 1.7762263480297156,
      "learning_rate": 2.83058130441221e-06,
      "loss": 0.6331,
      "step": 53
    },
    {
      "epoch": 2.050632911392405,
      "grad_norm": 1.0454599192267253,
      "learning_rate": 2.6306566876350072e-06,
      "loss": 0.3539,
      "step": 54
    },
    {
      "epoch": 2.088607594936709,
      "grad_norm": 0.9998841150009384,
      "learning_rate": 2.43550361297047e-06,
      "loss": 0.3578,
      "step": 55
    },
    {
      "epoch": 2.1265822784810124,
      "grad_norm": 0.9964166404453179,
      "learning_rate": 2.245515092739488e-06,
      "loss": 0.3562,
      "step": 56
    },
    {
      "epoch": 2.1645569620253164,
      "grad_norm": 0.9459908925231907,
      "learning_rate": 2.061073738537635e-06,
      "loss": 0.3568,
      "step": 57
    },
    {
      "epoch": 2.2025316455696204,
      "grad_norm": 0.8966814279446472,
      "learning_rate": 1.8825509907063328e-06,
      "loss": 0.3325,
      "step": 58
    },
    {
      "epoch": 2.240506329113924,
      "grad_norm": 0.9244456697881348,
      "learning_rate": 1.7103063703014372e-06,
      "loss": 0.3113,
      "step": 59
    },
    {
      "epoch": 2.278481012658228,
      "grad_norm": 0.9483141643601266,
      "learning_rate": 1.544686755065677e-06,
      "loss": 0.3228,
      "step": 60
    },
    {
      "epoch": 2.3164556962025316,
      "grad_norm": 1.0064785741358837,
      "learning_rate": 1.3860256808630429e-06,
      "loss": 0.3656,
      "step": 61
    },
    {
      "epoch": 2.3544303797468356,
      "grad_norm": 0.9913312730806086,
      "learning_rate": 1.234642669981946e-06,
      "loss": 0.3416,
      "step": 62
    },
    {
      "epoch": 2.392405063291139,
      "grad_norm": 0.9569431038994634,
      "learning_rate": 1.0908425876598512e-06,
      "loss": 0.3091,
      "step": 63
    },
    {
      "epoch": 2.430379746835443,
      "grad_norm": 1.0791734978912138,
      "learning_rate": 9.549150281252633e-07,
      "loss": 0.3358,
      "step": 64
    },
    {
      "epoch": 2.4683544303797467,
      "grad_norm": 1.061245141407699,
      "learning_rate": 8.271337313934869e-07,
      "loss": 0.3524,
      "step": 65
    },
    {
      "epoch": 2.5063291139240507,
      "grad_norm": 1.0946626722996773,
      "learning_rate": 7.077560319906696e-07,
      "loss": 0.3412,
      "step": 66
    },
    {
      "epoch": 2.5443037974683547,
      "grad_norm": 0.9843396259475062,
      "learning_rate": 5.9702234071631e-07,
      "loss": 0.3056,
      "step": 67
    },
    {
      "epoch": 2.5822784810126582,
      "grad_norm": 0.9896788540648425,
      "learning_rate": 4.951556604879049e-07,
      "loss": 0.3231,
      "step": 68
    },
    {
      "epoch": 2.620253164556962,
      "grad_norm": 1.018520895923904,
      "learning_rate": 4.0236113724274716e-07,
      "loss": 0.3487,
      "step": 69
    },
    {
      "epoch": 2.6582278481012658,
      "grad_norm": 1.0112046345213885,
      "learning_rate": 3.18825646801314e-07,
      "loss": 0.3574,
      "step": 70
    },
    {
      "epoch": 2.6962025316455698,
      "grad_norm": 0.9805922789991742,
      "learning_rate": 2.447174185242324e-07,
      "loss": 0.3268,
      "step": 71
    },
    {
      "epoch": 2.7341772151898733,
      "grad_norm": 1.0121311757244564,
      "learning_rate": 1.801856965207338e-07,
      "loss": 0.3047,
      "step": 72
    },
    {
      "epoch": 2.7721518987341773,
      "grad_norm": 0.9808044853524287,
      "learning_rate": 1.253604390908819e-07,
      "loss": 0.344,
      "step": 73
    },
    {
      "epoch": 2.810126582278481,
      "grad_norm": 0.9448770611064761,
      "learning_rate": 8.035205700685167e-08,
      "loss": 0.3286,
      "step": 74
    },
    {
      "epoch": 2.848101265822785,
      "grad_norm": 0.938807423248215,
      "learning_rate": 4.52511911603265e-08,
      "loss": 0.3249,
      "step": 75
    },
    {
      "epoch": 2.8860759493670884,
      "grad_norm": 0.9467118042819199,
      "learning_rate": 2.012853002380466e-08,
      "loss": 0.3396,
      "step": 76
    },
    {
      "epoch": 2.9240506329113924,
      "grad_norm": 1.0059613466947979,
      "learning_rate": 5.034667293427053e-09,
      "loss": 0.3369,
      "step": 77
    },
    {
      "epoch": 2.962025316455696,
      "grad_norm": 0.999912605851653,
      "learning_rate": 0.0,
      "loss": 0.348,
      "step": 78
    },
    {
      "epoch": 2.962025316455696,
      "step": 78,
      "total_flos": 16384759316480.0,
      "train_loss": 0.5084553494667395,
      "train_runtime": 492.5209,
      "train_samples_per_second": 15.228,
      "train_steps_per_second": 0.158
    }
  ],
  "logging_steps": 1,
  "max_steps": 78,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 16384759316480.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}