{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.010608828903164967,
  "eval_steps": 500,
  "global_step": 22500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00023575175340366595,
      "grad_norm": 0.8459708094596863,
      "learning_rate": 0.0002999764248246596,
      "loss": 8.8499,
      "step": 500
    },
    {
      "epoch": 0.0004715035068073319,
      "grad_norm": 1.0758037567138672,
      "learning_rate": 0.00029995284964931925,
      "loss": 6.2672,
      "step": 1000
    },
    {
      "epoch": 0.0007072552602109978,
      "grad_norm": 0.9614197611808777,
      "learning_rate": 0.0002999292744739789,
      "loss": 5.7862,
      "step": 1500
    },
    {
      "epoch": 0.0009430070136146638,
      "grad_norm": 0.9660577774047852,
      "learning_rate": 0.0002999056992986385,
      "loss": 5.5715,
      "step": 2000
    },
    {
      "epoch": 0.0011787587670183297,
      "grad_norm": 1.3354485034942627,
      "learning_rate": 0.00029988212412329813,
      "loss": 5.4455,
      "step": 2500
    },
    {
      "epoch": 0.0014145105204219955,
      "grad_norm": 1.0552821159362793,
      "learning_rate": 0.00029985854894795774,
      "loss": 5.3649,
      "step": 3000
    },
    {
      "epoch": 0.0016502622738256616,
      "grad_norm": 1.069198489189148,
      "learning_rate": 0.0002998349737726174,
      "loss": 5.281,
      "step": 3500
    },
    {
      "epoch": 0.0018860140272293276,
      "grad_norm": 1.2014654874801636,
      "learning_rate": 0.00029981139859727707,
      "loss": 5.2006,
      "step": 4000
    },
    {
      "epoch": 0.0021217657806329934,
      "grad_norm": 1.0098108053207397,
      "learning_rate": 0.0002997878234219367,
      "loss": 5.1572,
      "step": 4500
    },
    {
      "epoch": 0.0023575175340366595,
      "grad_norm": 1.2696442604064941,
      "learning_rate": 0.0002997642482465963,
      "loss": 5.1337,
      "step": 5000
    },
    {
      "epoch": 0.0025932692874403255,
      "grad_norm": 1.1595019102096558,
      "learning_rate": 0.00029974067307125596,
      "loss": 5.0396,
      "step": 5500
    },
    {
      "epoch": 0.002829021040843991,
      "grad_norm": 1.01584792137146,
      "learning_rate": 0.00029971709789591557,
      "loss": 5.0324,
      "step": 6000
    },
    {
      "epoch": 0.003064772794247657,
      "grad_norm": 1.6431899070739746,
      "learning_rate": 0.00029969352272057523,
      "loss": 4.9911,
      "step": 6500
    },
    {
      "epoch": 0.003300524547651323,
      "grad_norm": 0.9707762002944946,
      "learning_rate": 0.00029966994754523484,
      "loss": 4.9459,
      "step": 7000
    },
    {
      "epoch": 0.003536276301054989,
      "grad_norm": 1.0742356777191162,
      "learning_rate": 0.00029964637236989445,
      "loss": 4.8719,
      "step": 7500
    },
    {
      "epoch": 0.0037720280544586552,
      "grad_norm": 1.077572226524353,
      "learning_rate": 0.0002996227971945541,
      "loss": 4.8526,
      "step": 8000
    },
    {
      "epoch": 0.004007779807862321,
      "grad_norm": 1.2099336385726929,
      "learning_rate": 0.0002995992220192138,
      "loss": 4.805,
      "step": 8500
    },
    {
      "epoch": 0.004243531561265987,
      "grad_norm": 1.2295851707458496,
      "learning_rate": 0.0002995756468438734,
      "loss": 4.7844,
      "step": 9000
    },
    {
      "epoch": 0.004479283314669653,
      "grad_norm": 1.419725775718689,
      "learning_rate": 0.000299552071668533,
      "loss": 4.7182,
      "step": 9500
    },
    {
      "epoch": 0.004715035068073319,
      "grad_norm": 1.2460483312606812,
      "learning_rate": 0.00029952849649319266,
      "loss": 4.7153,
      "step": 10000
    },
    {
      "epoch": 0.004950786821476985,
      "grad_norm": 1.3061468601226807,
      "learning_rate": 0.0002995049213178523,
      "loss": 4.6789,
      "step": 10500
    },
    {
      "epoch": 0.005186538574880651,
      "grad_norm": 1.0660468339920044,
      "learning_rate": 0.00029948134614251194,
      "loss": 4.6484,
      "step": 11000
    },
    {
      "epoch": 0.005422290328284317,
      "grad_norm": 1.0721254348754883,
      "learning_rate": 0.00029945777096717155,
      "loss": 4.6284,
      "step": 11500
    },
    {
      "epoch": 0.005658042081687982,
      "grad_norm": 1.1215749979019165,
      "learning_rate": 0.00029943419579183116,
      "loss": 4.5578,
      "step": 12000
    },
    {
      "epoch": 0.005893793835091648,
      "grad_norm": 0.9331501126289368,
      "learning_rate": 0.0002994106206164908,
      "loss": 4.5434,
      "step": 12500
    },
    {
      "epoch": 0.006129545588495314,
      "grad_norm": 1.6719545125961304,
      "learning_rate": 0.00029938704544115043,
      "loss": 4.5433,
      "step": 13000
    },
    {
      "epoch": 0.00636529734189898,
      "grad_norm": 1.087511658668518,
      "learning_rate": 0.0002993634702658101,
      "loss": 4.451,
      "step": 13500
    },
    {
      "epoch": 0.006601049095302646,
      "grad_norm": 0.9610065817832947,
      "learning_rate": 0.0002993398950904697,
      "loss": 4.4753,
      "step": 14000
    },
    {
      "epoch": 0.006836800848706312,
      "grad_norm": 1.6184645891189575,
      "learning_rate": 0.0002993163199151293,
      "loss": 4.4395,
      "step": 14500
    },
    {
      "epoch": 0.007072552602109978,
      "grad_norm": 1.266706109046936,
      "learning_rate": 0.000299292744739789,
      "loss": 4.4283,
      "step": 15000
    },
    {
      "epoch": 0.007308304355513644,
      "grad_norm": 1.0746177434921265,
      "learning_rate": 0.0002992691695644486,
      "loss": 4.3878,
      "step": 15500
    },
    {
      "epoch": 0.0075440561089173104,
      "grad_norm": 1.0867644548416138,
      "learning_rate": 0.00029924559438910826,
      "loss": 4.3745,
      "step": 16000
    },
    {
      "epoch": 0.007779807862320976,
      "grad_norm": 1.246843934059143,
      "learning_rate": 0.00029922201921376787,
      "loss": 4.3288,
      "step": 16500
    },
    {
      "epoch": 0.008015559615724642,
      "grad_norm": 1.013817310333252,
      "learning_rate": 0.00029919844403842753,
      "loss": 4.3222,
      "step": 17000
    },
    {
      "epoch": 0.008251311369128309,
      "grad_norm": 0.7903661727905273,
      "learning_rate": 0.00029917486886308714,
      "loss": 4.2851,
      "step": 17500
    },
    {
      "epoch": 0.008487063122531974,
      "grad_norm": 0.884263277053833,
      "learning_rate": 0.00029915129368774675,
      "loss": 4.2747,
      "step": 18000
    },
    {
      "epoch": 0.00872281487593564,
      "grad_norm": 0.901438295841217,
      "learning_rate": 0.0002991277185124064,
      "loss": 4.248,
      "step": 18500
    },
    {
      "epoch": 0.008958566629339306,
      "grad_norm": 1.007119059562683,
      "learning_rate": 0.000299104143337066,
      "loss": 4.2322,
      "step": 19000
    },
    {
      "epoch": 0.009194318382742971,
      "grad_norm": 0.9175025224685669,
      "learning_rate": 0.0002990805681617257,
      "loss": 4.224,
      "step": 19500
    },
    {
      "epoch": 0.009430070136146638,
      "grad_norm": 0.802945077419281,
      "learning_rate": 0.0002990569929863853,
      "loss": 4.1627,
      "step": 20000
    },
    {
      "epoch": 0.009665821889550303,
      "grad_norm": 0.9863154292106628,
      "learning_rate": 0.0002990334178110449,
      "loss": 4.159,
      "step": 20500
    },
    {
      "epoch": 0.00990157364295397,
      "grad_norm": 0.9913619160652161,
      "learning_rate": 0.0002990098426357046,
      "loss": 4.1331,
      "step": 21000
    },
    {
      "epoch": 0.010137325396357635,
      "grad_norm": 0.9557477831840515,
      "learning_rate": 0.00029898626746036424,
      "loss": 4.1369,
      "step": 21500
    },
    {
      "epoch": 0.010373077149761302,
      "grad_norm": 0.9752131104469299,
      "learning_rate": 0.00029896269228502385,
      "loss": 4.1304,
      "step": 22000
    },
    {
      "epoch": 0.010608828903164967,
      "grad_norm": 0.8786485195159912,
      "learning_rate": 0.00029893911710968346,
      "loss": 4.1096,
      "step": 22500
    }
  ],
  "logging_steps": 500,
  "max_steps": 6362625,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 2.4361403726168064e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}