{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.988458927359131,
  "eval_steps": 500,
  "global_step": 552,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05431093007467753,
      "grad_norm": 1.4870964288711548,
      "learning_rate": 0.00019997351589651408,
      "loss": 3.4965,
      "step": 10
    },
    {
      "epoch": 0.10862186014935506,
      "grad_norm": 1.784044861793518,
      "learning_rate": 0.00019967573081342103,
      "loss": 2.065,
      "step": 20
    },
    {
      "epoch": 0.1629327902240326,
      "grad_norm": 0.7305468916893005,
      "learning_rate": 0.00019904804439875633,
      "loss": 1.2421,
      "step": 30
    },
    {
      "epoch": 0.2172437202987101,
      "grad_norm": 0.6995559930801392,
      "learning_rate": 0.00019809253413499565,
      "loss": 1.093,
      "step": 40
    },
    {
      "epoch": 0.27155465037338766,
      "grad_norm": 0.6627448201179504,
      "learning_rate": 0.00019681236251822273,
      "loss": 1.0856,
      "step": 50
    },
    {
      "epoch": 0.3258655804480652,
      "grad_norm": 0.7160666584968567,
      "learning_rate": 0.00019521176659107142,
      "loss": 1.013,
      "step": 60
    },
    {
      "epoch": 0.3801765105227427,
      "grad_norm": 0.6306814551353455,
      "learning_rate": 0.0001932960439191915,
      "loss": 1.0374,
      "step": 70
    },
    {
      "epoch": 0.4344874405974202,
      "grad_norm": 0.7758208513259888,
      "learning_rate": 0.00019107153505765306,
      "loss": 0.9474,
      "step": 80
    },
    {
      "epoch": 0.48879837067209775,
      "grad_norm": 1.2394300699234009,
      "learning_rate": 0.000188545602565321,
      "loss": 0.9932,
      "step": 90
    },
    {
      "epoch": 0.5431093007467753,
      "grad_norm": 0.829031229019165,
      "learning_rate": 0.0001857266066366567,
      "loss": 0.9204,
      "step": 100
    },
    {
      "epoch": 0.5974202308214528,
      "grad_norm": 0.7629134654998779,
      "learning_rate": 0.0001826238774315995,
      "loss": 0.9457,
      "step": 110
    },
    {
      "epoch": 0.6517311608961304,
      "grad_norm": 0.8157823085784912,
      "learning_rate": 0.00017924768419510904,
      "loss": 0.8539,
      "step": 120
    },
    {
      "epoch": 0.7060420909708078,
      "grad_norm": 0.7475631237030029,
      "learning_rate": 0.0001756092012685749,
      "loss": 0.82,
      "step": 130
    },
    {
      "epoch": 0.7603530210454854,
      "grad_norm": 0.6592528223991394,
      "learning_rate": 0.000171720471105587,
      "loss": 0.8846,
      "step": 140
    },
    {
      "epoch": 0.814663951120163,
      "grad_norm": 0.6989027857780457,
      "learning_rate": 0.00016759436441447545,
      "loss": 0.8367,
      "step": 150
    },
    {
      "epoch": 0.8689748811948405,
      "grad_norm": 0.7253873348236084,
      "learning_rate": 0.00016324453755953773,
      "loss": 0.8068,
      "step": 160
    },
    {
      "epoch": 0.923285811269518,
      "grad_norm": 0.7640873193740845,
      "learning_rate": 0.00015868538736194427,
      "loss": 0.8169,
      "step": 170
    },
    {
      "epoch": 0.9775967413441955,
      "grad_norm": 0.7669989466667175,
      "learning_rate": 0.00015393200344991995,
      "loss": 0.8355,
      "step": 180
    },
    {
      "epoch": 1.0271554650373387,
      "grad_norm": 0.7532988786697388,
      "learning_rate": 0.0001490001183159105,
      "loss": 0.7339,
      "step": 190
    },
    {
      "epoch": 1.0814663951120163,
      "grad_norm": 0.7974510192871094,
      "learning_rate": 0.0001439060552460318,
      "loss": 0.8186,
      "step": 200
    },
    {
      "epoch": 1.1357773251866938,
      "grad_norm": 0.9017219543457031,
      "learning_rate": 0.0001386666742941419,
      "loss": 0.775,
      "step": 210
    },
    {
      "epoch": 1.1900882552613714,
      "grad_norm": 0.8205109238624573,
      "learning_rate": 0.00013329931647934883,
      "loss": 0.7421,
      "step": 220
    },
    {
      "epoch": 1.2443991853360488,
      "grad_norm": 0.866692066192627,
      "learning_rate": 0.0001278217463916453,
      "loss": 0.7113,
      "step": 230
    },
    {
      "epoch": 1.2987101154107263,
      "grad_norm": 0.8832337856292725,
      "learning_rate": 0.00012225209339563145,
      "loss": 0.7545,
      "step": 240
    },
    {
      "epoch": 1.353021045485404,
      "grad_norm": 1.0796443223953247,
      "learning_rate": 0.00011660879162692675,
      "loss": 0.7085,
      "step": 250
    },
    {
      "epoch": 1.4073319755600815,
      "grad_norm": 0.9231683015823364,
      "learning_rate": 0.00011091051897986678,
      "loss": 0.7168,
      "step": 260
    },
    {
      "epoch": 1.461642905634759,
      "grad_norm": 0.8881363272666931,
      "learning_rate": 0.00010517613528842097,
      "loss": 0.7606,
      "step": 270
    },
    {
      "epoch": 1.5159538357094364,
      "grad_norm": 0.8930597901344299,
      "learning_rate": 9.942461990493625e-05,
      "loss": 0.6926,
      "step": 280
    },
    {
      "epoch": 1.570264765784114,
      "grad_norm": 1.0270030498504639,
      "learning_rate": 9.367500888330545e-05,
      "loss": 0.7571,
      "step": 290
    },
    {
      "epoch": 1.6245756958587916,
      "grad_norm": 0.8959159255027771,
      "learning_rate": 8.79463319744677e-05,
      "loss": 0.7786,
      "step": 300
    },
    {
      "epoch": 1.6788866259334692,
      "grad_norm": 0.8595919013023376,
      "learning_rate": 8.225754964277018e-05,
      "loss": 0.6935,
      "step": 310
    },
    {
      "epoch": 1.7331975560081467,
      "grad_norm": 0.953175961971283,
      "learning_rate": 7.662749031165092e-05,
      "loss": 0.6901,
      "step": 320
    },
    {
      "epoch": 1.787508486082824,
      "grad_norm": 0.985431969165802,
      "learning_rate": 7.107478804634325e-05,
      "loss": 0.7101,
      "step": 330
    },
    {
      "epoch": 1.8418194161575017,
      "grad_norm": 1.0016827583312988,
      "learning_rate": 6.561782087985681e-05,
      "loss": 0.707,
      "step": 340
    },
    {
      "epoch": 1.8961303462321792,
      "grad_norm": 0.9732582569122314,
      "learning_rate": 6.02746499863599e-05,
      "loss": 0.7426,
      "step": 350
    },
    {
      "epoch": 1.9504412763068566,
      "grad_norm": 0.9253762364387512,
      "learning_rate": 5.506295990328385e-05,
      "loss": 0.7273,
      "step": 360
    },
    {
      "epoch": 2.0,
      "grad_norm": 2.792293071746826,
      "learning_rate": 5.000000000000002e-05,
      "loss": 0.7256,
      "step": 370
    },
    {
      "epoch": 2.0543109300746774,
      "grad_norm": 0.9254827499389648,
      "learning_rate": 4.510252738679136e-05,
      "loss": 0.6432,
      "step": 380
    },
    {
      "epoch": 2.108621860149355,
      "grad_norm": 1.0876941680908203,
      "learning_rate": 4.038675145307747e-05,
      "loss": 0.6256,
      "step": 390
    },
    {
      "epoch": 2.1629327902240325,
      "grad_norm": 0.916249692440033,
      "learning_rate": 3.5868280218455796e-05,
      "loss": 0.6442,
      "step": 400
    },
    {
      "epoch": 2.2172437202987103,
      "grad_norm": 0.9240853190422058,
      "learning_rate": 3.1562068674124344e-05,
      "loss": 0.5883,
      "step": 410
    },
    {
      "epoch": 2.2715546503733877,
      "grad_norm": 1.2008038759231567,
      "learning_rate": 2.7482369285662378e-05,
      "loss": 0.6987,
      "step": 420
    },
    {
      "epoch": 2.325865580448065,
      "grad_norm": 1.2723044157028198,
      "learning_rate": 2.364268482099218e-05,
      "loss": 0.708,
      "step": 430
    },
    {
      "epoch": 2.380176510522743,
      "grad_norm": 0.9695908427238464,
      "learning_rate": 2.0055723659649904e-05,
      "loss": 0.6782,
      "step": 440
    },
    {
      "epoch": 2.43448744059742,
      "grad_norm": 1.044391393661499,
      "learning_rate": 1.6733357731279377e-05,
      "loss": 0.5803,
      "step": 450
    },
    {
      "epoch": 2.4887983706720975,
      "grad_norm": 0.9964624643325806,
      "learning_rate": 1.368658322256311e-05,
      "loss": 0.6112,
      "step": 460
    },
    {
      "epoch": 2.5431093007467753,
      "grad_norm": 1.004639744758606,
      "learning_rate": 1.0925484182639467e-05,
      "loss": 0.6322,
      "step": 470
    },
    {
      "epoch": 2.5974202308214527,
      "grad_norm": 1.1456069946289062,
      "learning_rate": 8.45919914746337e-06,
      "loss": 0.5633,
      "step": 480
    },
    {
      "epoch": 2.6517311608961305,
      "grad_norm": 1.1862763166427612,
      "learning_rate": 6.2958908935752955e-06,
      "loss": 0.5859,
      "step": 490
    },
    {
      "epoch": 2.706042090970808,
      "grad_norm": 1.1233826875686646,
      "learning_rate": 4.442719421385922e-06,
      "loss": 0.6147,
      "step": 500
    },
    {
      "epoch": 2.7603530210454856,
      "grad_norm": 1.0159374475479126,
      "learning_rate": 2.905818257394799e-06,
      "loss": 0.5829,
      "step": 510
    },
    {
      "epoch": 2.814663951120163,
      "grad_norm": 1.053791880607605,
      "learning_rate": 1.6902741537767609e-06,
      "loss": 0.5938,
      "step": 520
    },
    {
      "epoch": 2.8689748811948403,
      "grad_norm": 1.0928566455841064,
      "learning_rate": 8.00110252525299e-07,
      "loss": 0.6136,
      "step": 530
    },
    {
      "epoch": 2.923285811269518,
      "grad_norm": 1.1599104404449463,
      "learning_rate": 2.382727698752474e-07,
      "loss": 0.6389,
      "step": 540
    },
    {
      "epoch": 2.9775967413441955,
      "grad_norm": 1.2020913362503052,
      "learning_rate": 6.621245075910665e-09,
      "loss": 0.6719,
      "step": 550
    }
  ],
  "logging_steps": 10,
  "max_steps": 552,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4322859040948224.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}