{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.985645933014354, "eval_steps": 500, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009569377990430622, "grad_norm": 2.0939094431170635, "learning_rate": 6.25e-07, "loss": 1.0265, "step": 1 }, { "epoch": 0.04784688995215311, "grad_norm": 1.2308584432431018, "learning_rate": 3.125e-06, "loss": 1.0462, "step": 5 }, { "epoch": 0.09569377990430622, "grad_norm": 0.8192528176324341, "learning_rate": 6.25e-06, "loss": 0.9416, "step": 10 }, { "epoch": 0.14354066985645933, "grad_norm": 0.5587998098246274, "learning_rate": 9.375000000000001e-06, "loss": 0.8065, "step": 15 }, { "epoch": 0.19138755980861244, "grad_norm": 0.3008165110686449, "learning_rate": 1.25e-05, "loss": 0.6851, "step": 20 }, { "epoch": 0.23923444976076555, "grad_norm": 0.20347128493031447, "learning_rate": 1.5625e-05, "loss": 0.6537, "step": 25 }, { "epoch": 0.28708133971291866, "grad_norm": 0.17128021850811456, "learning_rate": 1.8750000000000002e-05, "loss": 0.6333, "step": 30 }, { "epoch": 0.3349282296650718, "grad_norm": 0.16652191969621177, "learning_rate": 1.9994335583335336e-05, "loss": 0.6149, "step": 35 }, { "epoch": 0.3827751196172249, "grad_norm": 0.14198363233390676, "learning_rate": 1.9959742939952393e-05, "loss": 0.6082, "step": 40 }, { "epoch": 0.430622009569378, "grad_norm": 0.21713283105061185, "learning_rate": 1.9893813260530368e-05, "loss": 0.5895, "step": 45 }, { "epoch": 0.4784688995215311, "grad_norm": 0.13924953869885492, "learning_rate": 1.9796753984232357e-05, "loss": 0.5561, "step": 50 }, { "epoch": 0.5263157894736842, "grad_norm": 0.13684768549424628, "learning_rate": 1.9668870495450064e-05, "loss": 0.569, "step": 55 }, { "epoch": 0.5741626794258373, "grad_norm": 0.13894406081826732, "learning_rate": 1.9510565162951538e-05, "loss": 0.5712, "step": 60 }, { "epoch": 0.6220095693779905, "grad_norm": 0.14374947839898614, "learning_rate": 1.9322336073880143e-05, "loss": 0.5702, "step": 65 }, { "epoch": 0.6698564593301436, "grad_norm": 0.14479528353076987, "learning_rate": 1.9104775466588162e-05, "loss": 0.5664, "step": 70 }, { "epoch": 0.7177033492822966, "grad_norm": 0.14766483692101193, "learning_rate": 1.88585678672358e-05, "loss": 0.5477, "step": 75 }, { "epoch": 0.7655502392344498, "grad_norm": 0.15525222782036766, "learning_rate": 1.8584487936018663e-05, "loss": 0.5383, "step": 80 }, { "epoch": 0.8133971291866029, "grad_norm": 0.14420000030755972, "learning_rate": 1.8283398029800167e-05, "loss": 0.5509, "step": 85 }, { "epoch": 0.861244019138756, "grad_norm": 0.14021638409143689, "learning_rate": 1.795624548881781e-05, "loss": 0.5492, "step": 90 }, { "epoch": 0.9090909090909091, "grad_norm": 0.13897720896535773, "learning_rate": 1.7604059656000313e-05, "loss": 0.535, "step": 95 }, { "epoch": 0.9569377990430622, "grad_norm": 0.144999235947906, "learning_rate": 1.7227948638273918e-05, "loss": 0.558, "step": 100 }, { "epoch": 0.9952153110047847, "eval_loss": 0.5750399827957153, "eval_runtime": 7.9492, "eval_samples_per_second": 17.109, "eval_steps_per_second": 5.787, "step": 104 }, { "epoch": 1.0047846889952152, "grad_norm": 0.2609579886738764, "learning_rate": 1.682909582004807e-05, "loss": 0.5801, "step": 105 }, { "epoch": 1.0526315789473684, "grad_norm": 0.14024934432114472, "learning_rate": 1.6408756139850243e-05, "loss": 0.53, "step": 110 }, { "epoch": 1.1004784688995215, "grad_norm": 0.14458482486273788, "learning_rate": 
1.5968252141825038e-05, "loss": 0.5322, "step": 115 }, { "epoch": 1.1483253588516746, "grad_norm": 0.14629086236317015, "learning_rate": 1.5508969814521026e-05, "loss": 0.5226, "step": 120 }, { "epoch": 1.1961722488038278, "grad_norm": 0.15275249319420742, "learning_rate": 1.5032354230058004e-05, "loss": 0.5237, "step": 125 }, { "epoch": 1.244019138755981, "grad_norm": 0.15270769602744638, "learning_rate": 1.4539904997395468e-05, "loss": 0.5101, "step": 130 }, { "epoch": 1.291866028708134, "grad_norm": 0.14854577877982414, "learning_rate": 1.4033171544008053e-05, "loss": 0.5084, "step": 135 }, { "epoch": 1.339712918660287, "grad_norm": 0.14092662389201455, "learning_rate": 1.3513748240813429e-05, "loss": 0.518, "step": 140 }, { "epoch": 1.38755980861244, "grad_norm": 0.13607051872752182, "learning_rate": 1.2983269385691562e-05, "loss": 0.5184, "step": 145 }, { "epoch": 1.4354066985645932, "grad_norm": 0.14966143078522062, "learning_rate": 1.2443404061378941e-05, "loss": 0.5062, "step": 150 }, { "epoch": 1.4832535885167464, "grad_norm": 0.15381766051199638, "learning_rate": 1.1895850883916786e-05, "loss": 0.5049, "step": 155 }, { "epoch": 1.5311004784688995, "grad_norm": 0.1654965470842917, "learning_rate": 1.1342332658176556e-05, "loss": 0.5027, "step": 160 }, { "epoch": 1.5789473684210527, "grad_norm": 0.15124728351536673, "learning_rate": 1.0784590957278452e-05, "loss": 0.4928, "step": 165 }, { "epoch": 1.6267942583732058, "grad_norm": 0.1445164484047255, "learning_rate": 1.0224380642958052e-05, "loss": 0.4926, "step": 170 }, { "epoch": 1.674641148325359, "grad_norm": 0.14421144356382473, "learning_rate": 9.663464344122064e-06, "loss": 0.4953, "step": 175 }, { "epoch": 1.722488038277512, "grad_norm": 0.14068706673794015, "learning_rate": 9.103606910965666e-06, "loss": 0.5027, "step": 180 }, { "epoch": 1.7703349282296652, "grad_norm": 0.14082548483803073, "learning_rate": 8.546569862100876e-06, "loss": 0.4853, "step": 185 }, { "epoch": 1.8181818181818183, "grad_norm": 0.14468108210583894, "learning_rate": 7.994105842167274e-06, "loss": 0.498, "step": 190 }, { "epoch": 1.8660287081339713, "grad_norm": 0.14087947795424693, "learning_rate": 7.447953107363574e-06, "loss": 0.4763, "step": 195 }, { "epoch": 1.9138755980861244, "grad_norm": 0.13646275065164523, "learning_rate": 6.909830056250527e-06, "loss": 0.4549, "step": 200 }, { "epoch": 1.9617224880382775, "grad_norm": 0.13978124622450475, "learning_rate": 6.381429823033281e-06, "loss": 0.4876, "step": 205 }, { "epoch": 2.0, "eval_loss": 0.5740731954574585, "eval_runtime": 11.5294, "eval_samples_per_second": 11.796, "eval_steps_per_second": 3.99, "step": 209 }, { "epoch": 2.0095693779904304, "grad_norm": 0.1395628254838664, "learning_rate": 5.864414950334796e-06, "loss": 0.5258, "step": 210 }, { "epoch": 2.0574162679425836, "grad_norm": 0.1646035873905585, "learning_rate": 5.360412158221661e-06, "loss": 0.4763, "step": 215 }, { "epoch": 2.1052631578947367, "grad_norm": 0.1430640678690393, "learning_rate": 4.871007225940939e-06, "loss": 0.4613, "step": 220 }, { "epoch": 2.15311004784689, "grad_norm": 0.1461980161567961, "learning_rate": 4.397740002471973e-06, "loss": 0.4446, "step": 225 }, { "epoch": 2.200956937799043, "grad_norm": 0.14316992446211674, "learning_rate": 3.942099561591802e-06, "loss": 0.4654, "step": 230 }, { "epoch": 2.248803827751196, "grad_norm": 0.16075469053450084, "learning_rate": 3.505519516698165e-06, "loss": 0.4513, "step": 235 }, { "epoch": 2.2966507177033493, "grad_norm": 0.1555203656237632, "learning_rate": 
3.089373510131354e-06, "loss": 0.4645, "step": 240 }, { "epoch": 2.3444976076555024, "grad_norm": 0.14495139725052797, "learning_rate": 2.694970891187225e-06, "loss": 0.4397, "step": 245 }, { "epoch": 2.3923444976076556, "grad_norm": 0.14820564306015635, "learning_rate": 2.323552596419889e-06, "loss": 0.4573, "step": 250 }, { "epoch": 2.4401913875598087, "grad_norm": 0.14625430854110832, "learning_rate": 1.9762872451962214e-06, "loss": 0.4453, "step": 255 }, { "epoch": 2.488038277511962, "grad_norm": 0.1417828182794398, "learning_rate": 1.6542674627869738e-06, "loss": 0.454, "step": 260 }, { "epoch": 2.535885167464115, "grad_norm": 0.2347199249792442, "learning_rate": 1.3585064425634542e-06, "loss": 0.4469, "step": 265 }, { "epoch": 2.583732057416268, "grad_norm": 0.14511063554793874, "learning_rate": 1.0899347581163222e-06, "loss": 0.4483, "step": 270 }, { "epoch": 2.6315789473684212, "grad_norm": 0.14598006035506486, "learning_rate": 8.493974353268019e-07, "loss": 0.459, "step": 275 }, { "epoch": 2.679425837320574, "grad_norm": 0.15533898050231443, "learning_rate": 6.37651293602628e-07, "loss": 0.4493, "step": 280 }, { "epoch": 2.7272727272727275, "grad_norm": 0.14147230186128865, "learning_rate": 4.553625646441928e-07, "loss": 0.4344, "step": 285 }, { "epoch": 2.77511961722488, "grad_norm": 0.1738953363945195, "learning_rate": 3.0310479623313125e-07, "loss": 0.4513, "step": 290 }, { "epoch": 2.8229665071770333, "grad_norm": 0.15001004355811276, "learning_rate": 1.81357047638816e-07, "loss": 0.4474, "step": 295 }, { "epoch": 2.8708133971291865, "grad_norm": 0.15435665748487531, "learning_rate": 9.0502382320653e-08, "loss": 0.4543, "step": 300 }, { "epoch": 2.9186602870813396, "grad_norm": 0.14366671363851222, "learning_rate": 3.082666266872036e-08, "loss": 0.4558, "step": 305 }, { "epoch": 2.9665071770334928, "grad_norm": 0.1455740191931236, "learning_rate": 2.5176505749346937e-09, "loss": 0.4556, "step": 310 }, { "epoch": 2.985645933014354, "eval_loss": 0.5831425786018372, "eval_runtime": 13.8937, "eval_samples_per_second": 9.789, "eval_steps_per_second": 3.311, "step": 312 }, { "epoch": 2.985645933014354, "step": 312, "total_flos": 9149727375360.0, "train_loss": 0.5307981822735224, "train_runtime": 3368.0081, "train_samples_per_second": 1.115, "train_steps_per_second": 0.093 } ], "logging_steps": 5, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9149727375360.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }