|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.08526360665056132, |
|
"eval_steps": 25, |
|
"global_step": 75, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.001136848088674151, |
|
"grad_norm": 12.549942016601562, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 17.412, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.001136848088674151, |
|
"eval_loss": 2.1443395614624023, |
|
"eval_runtime": 66.3426, |
|
"eval_samples_per_second": 11.169, |
|
"eval_steps_per_second": 5.592, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.002273696177348302, |
|
"grad_norm": 19.546024322509766, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 20.1466, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.003410544266022453, |
|
"grad_norm": 13.155139923095703, |
|
"learning_rate": 0.0001, |
|
"loss": 18.9999, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.004547392354696604, |
|
"grad_norm": 10.267646789550781, |
|
"learning_rate": 9.99524110790929e-05, |
|
"loss": 17.0896, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.005684240443370754, |
|
"grad_norm": 8.46841812133789, |
|
"learning_rate": 9.980973490458728e-05, |
|
"loss": 18.8043, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.006821088532044906, |
|
"grad_norm": 9.922762870788574, |
|
"learning_rate": 9.957224306869053e-05, |
|
"loss": 13.901, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.007957936620719057, |
|
"grad_norm": 8.48350715637207, |
|
"learning_rate": 9.924038765061042e-05, |
|
"loss": 15.7619, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.009094784709393207, |
|
"grad_norm": 9.610814094543457, |
|
"learning_rate": 9.881480035599667e-05, |
|
"loss": 15.5471, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.010231632798067358, |
|
"grad_norm": 8.592899322509766, |
|
"learning_rate": 9.829629131445342e-05, |
|
"loss": 12.7241, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.011368480886741509, |
|
"grad_norm": 6.759888648986816, |
|
"learning_rate": 9.768584753741134e-05, |
|
"loss": 18.8752, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01250532897541566, |
|
"grad_norm": 7.15319299697876, |
|
"learning_rate": 9.698463103929542e-05, |
|
"loss": 15.7519, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.013642177064089812, |
|
"grad_norm": 8.924838066101074, |
|
"learning_rate": 9.619397662556435e-05, |
|
"loss": 10.8917, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.014779025152763962, |
|
"grad_norm": 6.7501749992370605, |
|
"learning_rate": 9.53153893518325e-05, |
|
"loss": 12.3339, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.015915873241438113, |
|
"grad_norm": 6.784092426300049, |
|
"learning_rate": 9.435054165891109e-05, |
|
"loss": 13.9071, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.017052721330112264, |
|
"grad_norm": 7.0436320304870605, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 15.3224, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.018189569418786414, |
|
"grad_norm": 6.630817890167236, |
|
"learning_rate": 9.21695722906443e-05, |
|
"loss": 14.9763, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.019326417507460565, |
|
"grad_norm": 8.37083911895752, |
|
"learning_rate": 9.09576022144496e-05, |
|
"loss": 12.5683, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.020463265596134716, |
|
"grad_norm": 22.17447853088379, |
|
"learning_rate": 8.966766701456177e-05, |
|
"loss": 14.6574, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.021600113684808867, |
|
"grad_norm": 8.09908390045166, |
|
"learning_rate": 8.83022221559489e-05, |
|
"loss": 13.1111, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.022736961773483017, |
|
"grad_norm": 10.496138572692871, |
|
"learning_rate": 8.68638668405062e-05, |
|
"loss": 16.7398, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.023873809862157168, |
|
"grad_norm": 43.967933654785156, |
|
"learning_rate": 8.535533905932738e-05, |
|
"loss": 14.1475, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.02501065795083132, |
|
"grad_norm": 7.2808451652526855, |
|
"learning_rate": 8.377951038078302e-05, |
|
"loss": 13.2917, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.026147506039505473, |
|
"grad_norm": 7.135393142700195, |
|
"learning_rate": 8.213938048432697e-05, |
|
"loss": 16.542, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.027284354128179623, |
|
"grad_norm": 8.165382385253906, |
|
"learning_rate": 8.043807145043604e-05, |
|
"loss": 15.5284, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.028421202216853774, |
|
"grad_norm": 8.087389945983887, |
|
"learning_rate": 7.86788218175523e-05, |
|
"loss": 15.624, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.028421202216853774, |
|
"eval_loss": 1.7818211317062378, |
|
"eval_runtime": 66.6436, |
|
"eval_samples_per_second": 11.119, |
|
"eval_steps_per_second": 5.567, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.029558050305527925, |
|
"grad_norm": 7.320021629333496, |
|
"learning_rate": 7.68649804173412e-05, |
|
"loss": 13.1999, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.030694898394202075, |
|
"grad_norm": 7.271773815155029, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 15.5507, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.031831746482876226, |
|
"grad_norm": 7.728046894073486, |
|
"learning_rate": 7.308743066175172e-05, |
|
"loss": 17.3175, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.03296859457155037, |
|
"grad_norm": 5.821549892425537, |
|
"learning_rate": 7.113091308703498e-05, |
|
"loss": 10.6975, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.03410544266022453, |
|
"grad_norm": 6.840242385864258, |
|
"learning_rate": 6.91341716182545e-05, |
|
"loss": 15.8851, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03524229074889868, |
|
"grad_norm": 12.806135177612305, |
|
"learning_rate": 6.710100716628344e-05, |
|
"loss": 13.5899, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.03637913883757283, |
|
"grad_norm": 6.994766712188721, |
|
"learning_rate": 6.503528997521366e-05, |
|
"loss": 15.6565, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.03751598692624698, |
|
"grad_norm": 7.654018402099609, |
|
"learning_rate": 6.294095225512603e-05, |
|
"loss": 15.5655, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.03865283501492113, |
|
"grad_norm": 8.867931365966797, |
|
"learning_rate": 6.0821980696905146e-05, |
|
"loss": 16.6892, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.039789683103595284, |
|
"grad_norm": 7.223085403442383, |
|
"learning_rate": 5.868240888334653e-05, |
|
"loss": 12.9233, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.04092653119226943, |
|
"grad_norm": 6.747748851776123, |
|
"learning_rate": 5.6526309611002594e-05, |
|
"loss": 13.6085, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.042063379280943586, |
|
"grad_norm": 6.9611616134643555, |
|
"learning_rate": 5.435778713738292e-05, |
|
"loss": 13.9404, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.04320022736961773, |
|
"grad_norm": 8.217069625854492, |
|
"learning_rate": 5.218096936826681e-05, |
|
"loss": 13.8488, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.04433707545829189, |
|
"grad_norm": 6.399670124053955, |
|
"learning_rate": 5e-05, |
|
"loss": 13.4894, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.045473923546966034, |
|
"grad_norm": 6.03984260559082, |
|
"learning_rate": 4.781903063173321e-05, |
|
"loss": 15.3474, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04661077163564019, |
|
"grad_norm": 7.631984710693359, |
|
"learning_rate": 4.564221286261709e-05, |
|
"loss": 12.2432, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.047747619724314336, |
|
"grad_norm": 6.203024864196777, |
|
"learning_rate": 4.347369038899744e-05, |
|
"loss": 13.4534, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.04888446781298849, |
|
"grad_norm": 6.871401309967041, |
|
"learning_rate": 4.131759111665349e-05, |
|
"loss": 14.3178, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.05002131590166264, |
|
"grad_norm": 7.451761722564697, |
|
"learning_rate": 3.917801930309486e-05, |
|
"loss": 14.6442, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.05115816399033679, |
|
"grad_norm": 7.06754732131958, |
|
"learning_rate": 3.705904774487396e-05, |
|
"loss": 12.303, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.052295012079010945, |
|
"grad_norm": 6.363272190093994, |
|
"learning_rate": 3.4964710024786354e-05, |
|
"loss": 18.1736, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.05343186016768509, |
|
"grad_norm": 7.741644382476807, |
|
"learning_rate": 3.289899283371657e-05, |
|
"loss": 13.9887, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.05456870825635925, |
|
"grad_norm": 7.256255149841309, |
|
"learning_rate": 3.086582838174551e-05, |
|
"loss": 18.5768, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.055705556345033394, |
|
"grad_norm": 8.02944564819336, |
|
"learning_rate": 2.886908691296504e-05, |
|
"loss": 13.2399, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.05684240443370755, |
|
"grad_norm": 6.896628379821777, |
|
"learning_rate": 2.6912569338248315e-05, |
|
"loss": 10.77, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05684240443370755, |
|
"eval_loss": 1.744654893875122, |
|
"eval_runtime": 66.449, |
|
"eval_samples_per_second": 11.151, |
|
"eval_steps_per_second": 5.583, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.057979252522381695, |
|
"grad_norm": 5.924437046051025, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 12.278, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.05911610061105585, |
|
"grad_norm": 6.9019951820373535, |
|
"learning_rate": 2.3135019582658802e-05, |
|
"loss": 12.2277, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.06025294869973, |
|
"grad_norm": 8.663328170776367, |
|
"learning_rate": 2.132117818244771e-05, |
|
"loss": 16.2151, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.06138979678840415, |
|
"grad_norm": 7.101144790649414, |
|
"learning_rate": 1.9561928549563968e-05, |
|
"loss": 12.9355, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.0625266448770783, |
|
"grad_norm": 6.247992992401123, |
|
"learning_rate": 1.7860619515673033e-05, |
|
"loss": 15.3862, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.06366349296575245, |
|
"grad_norm": 6.452046871185303, |
|
"learning_rate": 1.622048961921699e-05, |
|
"loss": 15.4312, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.0648003410544266, |
|
"grad_norm": 8.117711067199707, |
|
"learning_rate": 1.4644660940672627e-05, |
|
"loss": 15.8753, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.06593718914310075, |
|
"grad_norm": 7.468315124511719, |
|
"learning_rate": 1.3136133159493802e-05, |
|
"loss": 17.0208, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.06707403723177491, |
|
"grad_norm": 6.299598217010498, |
|
"learning_rate": 1.1697777844051105e-05, |
|
"loss": 14.5744, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.06821088532044906, |
|
"grad_norm": 8.590055465698242, |
|
"learning_rate": 1.0332332985438248e-05, |
|
"loss": 12.2129, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0693477334091232, |
|
"grad_norm": 20.570341110229492, |
|
"learning_rate": 9.042397785550405e-06, |
|
"loss": 16.3481, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.07048458149779736, |
|
"grad_norm": 6.487506866455078, |
|
"learning_rate": 7.830427709355725e-06, |
|
"loss": 13.0091, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.07162142958647151, |
|
"grad_norm": 7.227597236633301, |
|
"learning_rate": 6.698729810778065e-06, |
|
"loss": 19.0134, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.07275827767514566, |
|
"grad_norm": 9.197311401367188, |
|
"learning_rate": 5.649458341088915e-06, |
|
"loss": 16.6898, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.0738951257638198, |
|
"grad_norm": 8.438094139099121, |
|
"learning_rate": 4.684610648167503e-06, |
|
"loss": 8.918, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.07503197385249397, |
|
"grad_norm": 8.239605903625488, |
|
"learning_rate": 3.8060233744356633e-06, |
|
"loss": 15.4048, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.07616882194116811, |
|
"grad_norm": 6.532566547393799, |
|
"learning_rate": 3.0153689607045845e-06, |
|
"loss": 10.6503, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.07730567002984226, |
|
"grad_norm": 7.610565185546875, |
|
"learning_rate": 2.314152462588659e-06, |
|
"loss": 13.7247, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.07844251811851641, |
|
"grad_norm": 7.103012561798096, |
|
"learning_rate": 1.70370868554659e-06, |
|
"loss": 13.4282, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.07957936620719057, |
|
"grad_norm": 7.29534912109375, |
|
"learning_rate": 1.1851996440033319e-06, |
|
"loss": 14.6105, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08071621429586472, |
|
"grad_norm": 7.118172645568848, |
|
"learning_rate": 7.596123493895991e-07, |
|
"loss": 15.4684, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.08185306238453886, |
|
"grad_norm": 8.032720565795898, |
|
"learning_rate": 4.277569313094809e-07, |
|
"loss": 12.0972, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.08298991047321301, |
|
"grad_norm": 5.702446937561035, |
|
"learning_rate": 1.9026509541272275e-07, |
|
"loss": 14.8636, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.08412675856188717, |
|
"grad_norm": 6.32309627532959, |
|
"learning_rate": 4.7588920907110094e-08, |
|
"loss": 14.2163, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.08526360665056132, |
|
"grad_norm": 6.813048362731934, |
|
"learning_rate": 0.0, |
|
"loss": 11.5691, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.08526360665056132, |
|
"eval_loss": 1.739717721939087, |
|
"eval_runtime": 66.4587, |
|
"eval_samples_per_second": 11.15, |
|
"eval_steps_per_second": 5.582, |
|
"step": 75 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 75, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4573317650448384.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|