{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.31729243786356426,
"eval_steps": 25,
"global_step": 75,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004230565838180857,
"grad_norm": 5.8103156089782715,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.9107,
"step": 1
},
{
"epoch": 0.004230565838180857,
"eval_loss": 1.4842164516448975,
"eval_runtime": 22.7015,
"eval_samples_per_second": 8.766,
"eval_steps_per_second": 4.405,
"step": 1
},
{
"epoch": 0.008461131676361713,
"grad_norm": 9.743329048156738,
"learning_rate": 6.666666666666667e-05,
"loss": 1.3577,
"step": 2
},
{
"epoch": 0.01269169751454257,
"grad_norm": 7.520903587341309,
"learning_rate": 0.0001,
"loss": 1.0173,
"step": 3
},
{
"epoch": 0.016922263352723427,
"grad_norm": 3.6215078830718994,
"learning_rate": 9.99524110790929e-05,
"loss": 0.4168,
"step": 4
},
{
"epoch": 0.021152829190904283,
"grad_norm": 0.892546534538269,
"learning_rate": 9.980973490458728e-05,
"loss": 0.1434,
"step": 5
},
{
"epoch": 0.02538339502908514,
"grad_norm": 1.1964820623397827,
"learning_rate": 9.957224306869053e-05,
"loss": 0.2214,
"step": 6
},
{
"epoch": 0.029613960867265997,
"grad_norm": 1.6945499181747437,
"learning_rate": 9.924038765061042e-05,
"loss": 0.2125,
"step": 7
},
{
"epoch": 0.033844526705446853,
"grad_norm": 0.8636919260025024,
"learning_rate": 9.881480035599667e-05,
"loss": 0.1462,
"step": 8
},
{
"epoch": 0.03807509254362771,
"grad_norm": 0.6063279509544373,
"learning_rate": 9.829629131445342e-05,
"loss": 0.1616,
"step": 9
},
{
"epoch": 0.04230565838180857,
"grad_norm": 0.16028481721878052,
"learning_rate": 9.768584753741134e-05,
"loss": 0.1363,
"step": 10
},
{
"epoch": 0.046536224219989424,
"grad_norm": 0.20494475960731506,
"learning_rate": 9.698463103929542e-05,
"loss": 0.1488,
"step": 11
},
{
"epoch": 0.05076679005817028,
"grad_norm": 0.642350971698761,
"learning_rate": 9.619397662556435e-05,
"loss": 0.1523,
"step": 12
},
{
"epoch": 0.05499735589635114,
"grad_norm": 1.9442977905273438,
"learning_rate": 9.53153893518325e-05,
"loss": 0.1667,
"step": 13
},
{
"epoch": 0.059227921734531994,
"grad_norm": 0.6603432297706604,
"learning_rate": 9.435054165891109e-05,
"loss": 0.1418,
"step": 14
},
{
"epoch": 0.06345848757271286,
"grad_norm": 9.015190124511719,
"learning_rate": 9.330127018922194e-05,
"loss": 0.2035,
"step": 15
},
{
"epoch": 0.06768905341089371,
"grad_norm": 0.3438994884490967,
"learning_rate": 9.21695722906443e-05,
"loss": 0.1295,
"step": 16
},
{
"epoch": 0.07191961924907457,
"grad_norm": 0.527033269405365,
"learning_rate": 9.09576022144496e-05,
"loss": 0.1756,
"step": 17
},
{
"epoch": 0.07615018508725542,
"grad_norm": 0.20277735590934753,
"learning_rate": 8.966766701456177e-05,
"loss": 0.1264,
"step": 18
},
{
"epoch": 0.08038075092543628,
"grad_norm": 0.2575612962245941,
"learning_rate": 8.83022221559489e-05,
"loss": 0.1473,
"step": 19
},
{
"epoch": 0.08461131676361713,
"grad_norm": 0.5664454102516174,
"learning_rate": 8.68638668405062e-05,
"loss": 0.1472,
"step": 20
},
{
"epoch": 0.088841882601798,
"grad_norm": 2.2762813568115234,
"learning_rate": 8.535533905932738e-05,
"loss": 0.1818,
"step": 21
},
{
"epoch": 0.09307244843997885,
"grad_norm": 0.33865222334861755,
"learning_rate": 8.377951038078302e-05,
"loss": 0.1691,
"step": 22
},
{
"epoch": 0.09730301427815971,
"grad_norm": 0.20331673324108124,
"learning_rate": 8.213938048432697e-05,
"loss": 0.1298,
"step": 23
},
{
"epoch": 0.10153358011634056,
"grad_norm": 0.307666540145874,
"learning_rate": 8.043807145043604e-05,
"loss": 0.1255,
"step": 24
},
{
"epoch": 0.10576414595452142,
"grad_norm": 0.5439885258674622,
"learning_rate": 7.86788218175523e-05,
"loss": 0.1441,
"step": 25
},
{
"epoch": 0.10576414595452142,
"eval_loss": 0.16374452412128448,
"eval_runtime": 22.2386,
"eval_samples_per_second": 8.948,
"eval_steps_per_second": 4.497,
"step": 25
},
{
"epoch": 0.10999471179270227,
"grad_norm": 0.2562794089317322,
"learning_rate": 7.68649804173412e-05,
"loss": 0.1286,
"step": 26
},
{
"epoch": 0.11422527763088314,
"grad_norm": 1.385337471961975,
"learning_rate": 7.500000000000001e-05,
"loss": 0.1934,
"step": 27
},
{
"epoch": 0.11845584346906399,
"grad_norm": 0.28447943925857544,
"learning_rate": 7.308743066175172e-05,
"loss": 0.1475,
"step": 28
},
{
"epoch": 0.12268640930724485,
"grad_norm": 0.4681300222873688,
"learning_rate": 7.113091308703498e-05,
"loss": 0.1723,
"step": 29
},
{
"epoch": 0.12691697514542571,
"grad_norm": 0.33006489276885986,
"learning_rate": 6.91341716182545e-05,
"loss": 0.1467,
"step": 30
},
{
"epoch": 0.13114754098360656,
"grad_norm": 1.173805832862854,
"learning_rate": 6.710100716628344e-05,
"loss": 0.1347,
"step": 31
},
{
"epoch": 0.13537810682178741,
"grad_norm": 0.3175932466983795,
"learning_rate": 6.503528997521366e-05,
"loss": 0.154,
"step": 32
},
{
"epoch": 0.13960867265996826,
"grad_norm": 0.19901403784751892,
"learning_rate": 6.294095225512603e-05,
"loss": 0.1594,
"step": 33
},
{
"epoch": 0.14383923849814914,
"grad_norm": 1.1052857637405396,
"learning_rate": 6.0821980696905146e-05,
"loss": 0.2117,
"step": 34
},
{
"epoch": 0.14806980433633,
"grad_norm": 0.11121287196874619,
"learning_rate": 5.868240888334653e-05,
"loss": 0.1437,
"step": 35
},
{
"epoch": 0.15230037017451084,
"grad_norm": 0.18161903321743011,
"learning_rate": 5.6526309611002594e-05,
"loss": 0.1203,
"step": 36
},
{
"epoch": 0.1565309360126917,
"grad_norm": 0.1642102152109146,
"learning_rate": 5.435778713738292e-05,
"loss": 0.1832,
"step": 37
},
{
"epoch": 0.16076150185087257,
"grad_norm": 0.2714860439300537,
"learning_rate": 5.218096936826681e-05,
"loss": 0.1728,
"step": 38
},
{
"epoch": 0.16499206768905342,
"grad_norm": 0.3132612109184265,
"learning_rate": 5e-05,
"loss": 0.149,
"step": 39
},
{
"epoch": 0.16922263352723427,
"grad_norm": 0.17970287799835205,
"learning_rate": 4.781903063173321e-05,
"loss": 0.1759,
"step": 40
},
{
"epoch": 0.17345319936541512,
"grad_norm": 0.2791941463947296,
"learning_rate": 4.564221286261709e-05,
"loss": 0.1519,
"step": 41
},
{
"epoch": 0.177683765203596,
"grad_norm": 0.38692009449005127,
"learning_rate": 4.347369038899744e-05,
"loss": 0.1627,
"step": 42
},
{
"epoch": 0.18191433104177684,
"grad_norm": 0.1635783314704895,
"learning_rate": 4.131759111665349e-05,
"loss": 0.1533,
"step": 43
},
{
"epoch": 0.1861448968799577,
"grad_norm": 0.23903019726276398,
"learning_rate": 3.917801930309486e-05,
"loss": 0.1517,
"step": 44
},
{
"epoch": 0.19037546271813854,
"grad_norm": 0.551013708114624,
"learning_rate": 3.705904774487396e-05,
"loss": 0.1954,
"step": 45
},
{
"epoch": 0.19460602855631942,
"grad_norm": 0.722944438457489,
"learning_rate": 3.4964710024786354e-05,
"loss": 0.1574,
"step": 46
},
{
"epoch": 0.19883659439450027,
"grad_norm": 0.6895685791969299,
"learning_rate": 3.289899283371657e-05,
"loss": 0.1543,
"step": 47
},
{
"epoch": 0.20306716023268112,
"grad_norm": 4.492163181304932,
"learning_rate": 3.086582838174551e-05,
"loss": 0.252,
"step": 48
},
{
"epoch": 0.20729772607086197,
"grad_norm": 0.8367879986763,
"learning_rate": 2.886908691296504e-05,
"loss": 0.1803,
"step": 49
},
{
"epoch": 0.21152829190904285,
"grad_norm": 0.26023951172828674,
"learning_rate": 2.6912569338248315e-05,
"loss": 0.1944,
"step": 50
},
{
"epoch": 0.21152829190904285,
"eval_loss": 0.162020742893219,
"eval_runtime": 22.2041,
"eval_samples_per_second": 8.962,
"eval_steps_per_second": 4.504,
"step": 50
},
{
"epoch": 0.2157588577472237,
"grad_norm": 0.1405401974916458,
"learning_rate": 2.500000000000001e-05,
"loss": 0.1114,
"step": 51
},
{
"epoch": 0.21998942358540455,
"grad_norm": 0.10868234187364578,
"learning_rate": 2.3135019582658802e-05,
"loss": 0.1065,
"step": 52
},
{
"epoch": 0.2242199894235854,
"grad_norm": 0.11740752309560776,
"learning_rate": 2.132117818244771e-05,
"loss": 0.143,
"step": 53
},
{
"epoch": 0.22845055526176627,
"grad_norm": 0.16673767566680908,
"learning_rate": 1.9561928549563968e-05,
"loss": 0.1385,
"step": 54
},
{
"epoch": 0.23268112109994712,
"grad_norm": 0.1313553750514984,
"learning_rate": 1.7860619515673033e-05,
"loss": 0.1416,
"step": 55
},
{
"epoch": 0.23691168693812797,
"grad_norm": 0.22419168055057526,
"learning_rate": 1.622048961921699e-05,
"loss": 0.1657,
"step": 56
},
{
"epoch": 0.24114225277630882,
"grad_norm": 0.12620574235916138,
"learning_rate": 1.4644660940672627e-05,
"loss": 0.1182,
"step": 57
},
{
"epoch": 0.2453728186144897,
"grad_norm": 0.0984712690114975,
"learning_rate": 1.3136133159493802e-05,
"loss": 0.122,
"step": 58
},
{
"epoch": 0.24960338445267055,
"grad_norm": 0.12404455244541168,
"learning_rate": 1.1697777844051105e-05,
"loss": 0.1278,
"step": 59
},
{
"epoch": 0.25383395029085143,
"grad_norm": 0.1908160299062729,
"learning_rate": 1.0332332985438248e-05,
"loss": 0.1428,
"step": 60
},
{
"epoch": 0.25806451612903225,
"grad_norm": 0.17151938378810883,
"learning_rate": 9.042397785550405e-06,
"loss": 0.1438,
"step": 61
},
{
"epoch": 0.26229508196721313,
"grad_norm": 0.14736545085906982,
"learning_rate": 7.830427709355725e-06,
"loss": 0.141,
"step": 62
},
{
"epoch": 0.26652564780539395,
"grad_norm": 0.0722377598285675,
"learning_rate": 6.698729810778065e-06,
"loss": 0.1283,
"step": 63
},
{
"epoch": 0.27075621364357483,
"grad_norm": 0.14723365008831024,
"learning_rate": 5.649458341088915e-06,
"loss": 0.1441,
"step": 64
},
{
"epoch": 0.2749867794817557,
"grad_norm": 0.2239135503768921,
"learning_rate": 4.684610648167503e-06,
"loss": 0.1696,
"step": 65
},
{
"epoch": 0.2792173453199365,
"grad_norm": 0.12539656460285187,
"learning_rate": 3.8060233744356633e-06,
"loss": 0.1273,
"step": 66
},
{
"epoch": 0.2834479111581174,
"grad_norm": 0.14846888184547424,
"learning_rate": 3.0153689607045845e-06,
"loss": 0.1552,
"step": 67
},
{
"epoch": 0.2876784769962983,
"grad_norm": 0.1942921280860901,
"learning_rate": 2.314152462588659e-06,
"loss": 0.1507,
"step": 68
},
{
"epoch": 0.2919090428344791,
"grad_norm": 1.0786352157592773,
"learning_rate": 1.70370868554659e-06,
"loss": 0.1681,
"step": 69
},
{
"epoch": 0.29613960867266,
"grad_norm": 0.1428898423910141,
"learning_rate": 1.1851996440033319e-06,
"loss": 0.1282,
"step": 70
},
{
"epoch": 0.3003701745108408,
"grad_norm": 0.17222754657268524,
"learning_rate": 7.596123493895991e-07,
"loss": 0.1281,
"step": 71
},
{
"epoch": 0.3046007403490217,
"grad_norm": 0.12844932079315186,
"learning_rate": 4.277569313094809e-07,
"loss": 0.1478,
"step": 72
},
{
"epoch": 0.30883130618720256,
"grad_norm": 0.1278412789106369,
"learning_rate": 1.9026509541272275e-07,
"loss": 0.1369,
"step": 73
},
{
"epoch": 0.3130618720253834,
"grad_norm": 0.23847945034503937,
"learning_rate": 4.7588920907110094e-08,
"loss": 0.1768,
"step": 74
},
{
"epoch": 0.31729243786356426,
"grad_norm": 0.1610560566186905,
"learning_rate": 0.0,
"loss": 0.1589,
"step": 75
},
{
"epoch": 0.31729243786356426,
"eval_loss": 0.15878306329250336,
"eval_runtime": 22.1981,
"eval_samples_per_second": 8.965,
"eval_steps_per_second": 4.505,
"step": 75
}
],
"logging_steps": 1,
"max_steps": 75,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.119015678246912e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}