|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.11379800853485064,
  "eval_steps": 50,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0005689900426742532,
      "eval_loss": 1.4702181816101074,
      "eval_runtime": 97.6278,
      "eval_samples_per_second": 7.58,
      "eval_steps_per_second": 3.79,
      "step": 1
    },
    {
      "epoch": 0.002844950213371266,
      "grad_norm": 0.15776292979717255,
      "learning_rate": 5e-05,
      "loss": 1.5423,
      "step": 5
    },
    {
      "epoch": 0.005689900426742532,
      "grad_norm": 0.3180772662162781,
      "learning_rate": 0.0001,
      "loss": 1.5495,
      "step": 10
    },
    {
      "epoch": 0.008534850640113799,
      "grad_norm": 0.2475329339504242,
      "learning_rate": 9.98292246503335e-05,
      "loss": 1.4748,
      "step": 15
    },
    {
      "epoch": 0.011379800853485065,
      "grad_norm": 0.23491647839546204,
      "learning_rate": 9.931806517013612e-05,
      "loss": 1.4243,
      "step": 20
    },
    {
      "epoch": 0.01422475106685633,
      "grad_norm": 0.156405970454216,
      "learning_rate": 9.847001329696653e-05,
      "loss": 1.349,
      "step": 25
    },
    {
      "epoch": 0.017069701280227598,
      "grad_norm": 0.1800297200679779,
      "learning_rate": 9.729086208503174e-05,
      "loss": 1.334,
      "step": 30
    },
    {
      "epoch": 0.01991465149359886,
      "grad_norm": 0.1841624230146408,
      "learning_rate": 9.578866633275288e-05,
      "loss": 1.3211,
      "step": 35
    },
    {
      "epoch": 0.02275960170697013,
      "grad_norm": 0.18861369788646698,
      "learning_rate": 9.397368756032445e-05,
      "loss": 1.3624,
      "step": 40
    },
    {
      "epoch": 0.025604551920341393,
      "grad_norm": 0.20811361074447632,
      "learning_rate": 9.185832391312644e-05,
      "loss": 1.2135,
      "step": 45
    },
    {
      "epoch": 0.02844950213371266,
      "grad_norm": 0.49080491065979004,
      "learning_rate": 8.945702546981969e-05,
      "loss": 1.0109,
      "step": 50
    },
    {
      "epoch": 0.02844950213371266,
      "eval_loss": 1.2867763042449951,
      "eval_runtime": 98.9605,
      "eval_samples_per_second": 7.478,
      "eval_steps_per_second": 3.739,
      "step": 50
    },
    {
      "epoch": 0.031294452347083924,
      "grad_norm": 0.28847643733024597,
      "learning_rate": 8.678619553365659e-05,
      "loss": 1.3137,
      "step": 55
    },
    {
      "epoch": 0.034139402560455195,
      "grad_norm": 0.1672595888376236,
      "learning_rate": 8.386407858128706e-05,
      "loss": 1.2642,
      "step": 60
    },
    {
      "epoch": 0.03698435277382646,
      "grad_norm": 0.20153048634529114,
      "learning_rate": 8.07106356344834e-05,
      "loss": 1.3451,
      "step": 65
    },
    {
      "epoch": 0.03982930298719772,
      "grad_norm": 0.18182030320167542,
      "learning_rate": 7.734740790612136e-05,
      "loss": 1.2577,
      "step": 70
    },
    {
      "epoch": 0.04267425320056899,
      "grad_norm": 0.18567194044589996,
      "learning_rate": 7.379736965185368e-05,
      "loss": 1.3163,
      "step": 75
    },
    {
      "epoch": 0.04551920341394026,
      "grad_norm": 0.19859260320663452,
      "learning_rate": 7.008477123264848e-05,
      "loss": 1.2875,
      "step": 80
    },
    {
      "epoch": 0.04836415362731152,
      "grad_norm": 0.19528040289878845,
      "learning_rate": 6.623497346023418e-05,
      "loss": 1.2905,
      "step": 85
    },
    {
      "epoch": 0.051209103840682786,
      "grad_norm": 0.2310878038406372,
      "learning_rate": 6.227427435703997e-05,
      "loss": 1.3571,
      "step": 90
    },
    {
      "epoch": 0.05405405405405406,
      "grad_norm": 0.27197933197021484,
      "learning_rate": 5.8229729514036705e-05,
      "loss": 1.1707,
      "step": 95
    },
    {
      "epoch": 0.05689900426742532,
      "grad_norm": 0.5292563438415527,
      "learning_rate": 5.4128967273616625e-05,
      "loss": 0.9571,
      "step": 100
    },
    {
      "epoch": 0.05689900426742532,
      "eval_loss": 1.2389965057373047,
      "eval_runtime": 98.9658,
      "eval_samples_per_second": 7.477,
      "eval_steps_per_second": 3.739,
      "step": 100
    },
    {
      "epoch": 0.059743954480796585,
      "grad_norm": 0.25443074107170105,
      "learning_rate": 5e-05,
      "loss": 1.2893,
      "step": 105
    },
    {
      "epoch": 0.06258890469416785,
      "grad_norm": 0.2228810042142868,
      "learning_rate": 4.5871032726383386e-05,
      "loss": 1.2617,
      "step": 110
    },
    {
      "epoch": 0.06543385490753911,
      "grad_norm": 0.27687934041023254,
      "learning_rate": 4.17702704859633e-05,
      "loss": 1.2612,
      "step": 115
    },
    {
      "epoch": 0.06827880512091039,
      "grad_norm": 0.21604830026626587,
      "learning_rate": 3.772572564296005e-05,
      "loss": 1.3271,
      "step": 120
    },
    {
      "epoch": 0.07112375533428165,
      "grad_norm": 0.22077901661396027,
      "learning_rate": 3.3765026539765834e-05,
      "loss": 1.2858,
      "step": 125
    },
    {
      "epoch": 0.07396870554765292,
      "grad_norm": 0.20174729824066162,
      "learning_rate": 2.991522876735154e-05,
      "loss": 1.2405,
      "step": 130
    },
    {
      "epoch": 0.07681365576102418,
      "grad_norm": 0.22198493778705597,
      "learning_rate": 2.6202630348146324e-05,
      "loss": 1.2382,
      "step": 135
    },
    {
      "epoch": 0.07965860597439545,
      "grad_norm": 0.2763294577598572,
      "learning_rate": 2.2652592093878666e-05,
      "loss": 1.3044,
      "step": 140
    },
    {
      "epoch": 0.08250355618776671,
      "grad_norm": 0.22474978864192963,
      "learning_rate": 1.928936436551661e-05,
      "loss": 1.1803,
      "step": 145
    },
    {
      "epoch": 0.08534850640113797,
      "grad_norm": 0.5697594881057739,
      "learning_rate": 1.6135921418712956e-05,
      "loss": 0.9433,
      "step": 150
    },
    {
      "epoch": 0.08534850640113797,
      "eval_loss": 1.222001314163208,
      "eval_runtime": 98.9441,
      "eval_samples_per_second": 7.479,
      "eval_steps_per_second": 3.739,
      "step": 150
    },
    {
      "epoch": 0.08819345661450925,
      "grad_norm": 0.22383780777454376,
      "learning_rate": 1.3213804466343421e-05,
      "loss": 1.2384,
      "step": 155
    },
    {
      "epoch": 0.09103840682788052,
      "grad_norm": 0.23186904191970825,
      "learning_rate": 1.0542974530180327e-05,
      "loss": 1.2749,
      "step": 160
    },
    {
      "epoch": 0.09388335704125178,
      "grad_norm": 0.2378578633069992,
      "learning_rate": 8.141676086873572e-06,
      "loss": 1.3229,
      "step": 165
    },
    {
      "epoch": 0.09672830725462304,
      "grad_norm": 0.2523314356803894,
      "learning_rate": 6.026312439675552e-06,
      "loss": 1.2946,
      "step": 170
    },
    {
      "epoch": 0.09957325746799431,
      "grad_norm": 0.22312402725219727,
      "learning_rate": 4.2113336672471245e-06,
      "loss": 1.2379,
      "step": 175
    },
    {
      "epoch": 0.10241820768136557,
      "grad_norm": 0.2241775393486023,
      "learning_rate": 2.7091379149682685e-06,
      "loss": 1.2604,
      "step": 180
    },
    {
      "epoch": 0.10526315789473684,
      "grad_norm": 0.2440503090620041,
      "learning_rate": 1.5299867030334814e-06,
      "loss": 1.2162,
      "step": 185
    },
    {
      "epoch": 0.10810810810810811,
      "grad_norm": 0.25313878059387207,
      "learning_rate": 6.819348298638839e-07,
      "loss": 1.2753,
      "step": 190
    },
    {
      "epoch": 0.11095305832147938,
      "grad_norm": 0.25487780570983887,
      "learning_rate": 1.7077534966650766e-07,
      "loss": 1.0624,
      "step": 195
    },
    {
      "epoch": 0.11379800853485064,
      "grad_norm": 0.5397162437438965,
      "learning_rate": 0.0,
      "loss": 0.9448,
      "step": 200
    },
    {
      "epoch": 0.11379800853485064,
      "eval_loss": 1.2195565700531006,
      "eval_runtime": 98.8875,
      "eval_samples_per_second": 7.483,
      "eval_steps_per_second": 3.742,
      "step": 200
    }
  ],
  "logging_steps": 5,
  "max_steps": 200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1865438228263731e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
|
|