diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.8004843414607298, + "epoch": 0.8505146128020254, "eval_steps": 500, - "global_step": 14544, + "global_step": 15453, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -101815,6 +101815,6369 @@ "learning_rate": 6.566176945963464e-06, "loss": 0.744, "step": 14544 + }, + { + "epoch": 0.8005393802630855, + "grad_norm": 0.696674644947052, + "learning_rate": 6.565765287444097e-06, + "loss": 0.6822, + "step": 14545 + }, + { + "epoch": 0.8005944190654412, + "grad_norm": 0.711722195148468, + "learning_rate": 6.5653536171568574e-06, + "loss": 0.7724, + "step": 14546 + }, + { + "epoch": 0.8006494578677968, + "grad_norm": 0.791977047920227, + "learning_rate": 6.564941935104838e-06, + "loss": 0.7913, + "step": 14547 + }, + { + "epoch": 0.8007044966701524, + "grad_norm": 0.6904259920120239, + "learning_rate": 6.564530241291135e-06, + "loss": 0.7732, + "step": 14548 + }, + { + "epoch": 0.8007595354725081, + "grad_norm": 0.6089264750480652, + "learning_rate": 6.564118535718842e-06, + "loss": 0.6506, + "step": 14549 + }, + { + "epoch": 0.8008145742748638, + "grad_norm": 0.6502360105514526, + "learning_rate": 6.563706818391051e-06, + "loss": 0.6638, + "step": 14550 + }, + { + "epoch": 0.8008696130772195, + "grad_norm": 0.6249814033508301, + "learning_rate": 6.563295089310859e-06, + "loss": 0.7066, + "step": 14551 + }, + { + "epoch": 0.8009246518795751, + "grad_norm": 0.8013060688972473, + "learning_rate": 6.56288334848136e-06, + "loss": 0.7968, + "step": 14552 + }, + { + "epoch": 0.8009796906819308, + "grad_norm": 0.7289897799491882, + "learning_rate": 6.562471595905648e-06, + "loss": 0.752, + "step": 14553 + }, + { + "epoch": 0.8010347294842864, + "grad_norm": 0.6774812340736389, + "learning_rate": 6.5620598315868176e-06, + "loss": 0.8263, + "step": 14554 + }, + { + "epoch": 0.8010897682866421, + "grad_norm": 0.6756269931793213, + "learning_rate": 6.561648055527965e-06, + "loss": 0.8096, + "step": 14555 + }, + { + "epoch": 0.8011448070889977, + "grad_norm": 0.7138845324516296, + "learning_rate": 6.5612362677321815e-06, + "loss": 0.7513, + "step": 14556 + }, + { + "epoch": 0.8011998458913534, + "grad_norm": 0.6763927340507507, + "learning_rate": 6.5608244682025656e-06, + "loss": 0.7975, + "step": 14557 + }, + { + "epoch": 0.8012548846937091, + "grad_norm": 0.8147655129432678, + "learning_rate": 6.56041265694221e-06, + "loss": 0.8192, + "step": 14558 + }, + { + "epoch": 0.8013099234960648, + "grad_norm": 0.7272641658782959, + "learning_rate": 6.5600008339542095e-06, + "loss": 0.7829, + "step": 14559 + }, + { + "epoch": 0.8013649622984204, + "grad_norm": 0.7464525103569031, + "learning_rate": 6.559588999241661e-06, + "loss": 0.7596, + "step": 14560 + }, + { + "epoch": 0.801420001100776, + "grad_norm": 0.7236443758010864, + "learning_rate": 6.559177152807661e-06, + "loss": 0.8151, + "step": 14561 + }, + { + "epoch": 0.8014750399031317, + "grad_norm": 0.6752793192863464, + "learning_rate": 6.558765294655301e-06, + "loss": 0.7578, + "step": 14562 + }, + { + "epoch": 0.8015300787054873, + "grad_norm": 0.709994375705719, + "learning_rate": 6.558353424787678e-06, + "loss": 0.6847, + "step": 14563 + }, + { + "epoch": 0.801585117507843, + "grad_norm": 0.7082880139350891, + "learning_rate": 6.557941543207889e-06, + "loss": 0.7968, + "step": 14564 + }, + { + "epoch": 0.8016401563101987, + "grad_norm": 0.692663848400116, + "learning_rate": 6.557529649919028e-06, + "loss": 0.6625, + "step": 14565 + }, + { + "epoch": 0.8016951951125544, + "grad_norm": 0.8464102149009705, + "learning_rate": 6.557117744924191e-06, + "loss": 0.7383, + "step": 14566 + }, + { + "epoch": 0.80175023391491, + "grad_norm": 0.6129899024963379, + "learning_rate": 6.5567058282264735e-06, + "loss": 0.7007, + "step": 14567 + }, + { + "epoch": 0.8018052727172656, + "grad_norm": 0.6458886861801147, + "learning_rate": 6.556293899828973e-06, + "loss": 0.7019, + "step": 14568 + }, + { + "epoch": 0.8018603115196213, + "grad_norm": 0.6543694138526917, + "learning_rate": 6.555881959734783e-06, + "loss": 0.7254, + "step": 14569 + }, + { + "epoch": 0.801915350321977, + "grad_norm": 0.7678859829902649, + "learning_rate": 6.555470007947001e-06, + "loss": 0.7952, + "step": 14570 + }, + { + "epoch": 0.8019703891243326, + "grad_norm": 0.7121342420578003, + "learning_rate": 6.555058044468722e-06, + "loss": 0.7951, + "step": 14571 + }, + { + "epoch": 0.8020254279266883, + "grad_norm": 0.6496285200119019, + "learning_rate": 6.554646069303043e-06, + "loss": 0.696, + "step": 14572 + }, + { + "epoch": 0.802080466729044, + "grad_norm": 0.7206087112426758, + "learning_rate": 6.5542340824530614e-06, + "loss": 0.7599, + "step": 14573 + }, + { + "epoch": 0.8021355055313997, + "grad_norm": 0.7285301685333252, + "learning_rate": 6.553822083921872e-06, + "loss": 0.7805, + "step": 14574 + }, + { + "epoch": 0.8021905443337553, + "grad_norm": 0.7524350881576538, + "learning_rate": 6.553410073712572e-06, + "loss": 0.7388, + "step": 14575 + }, + { + "epoch": 0.8022455831361109, + "grad_norm": 0.7634537220001221, + "learning_rate": 6.552998051828256e-06, + "loss": 0.6969, + "step": 14576 + }, + { + "epoch": 0.8023006219384666, + "grad_norm": 0.6950779557228088, + "learning_rate": 6.552586018272024e-06, + "loss": 0.8533, + "step": 14577 + }, + { + "epoch": 0.8023556607408223, + "grad_norm": 0.694496214389801, + "learning_rate": 6.552173973046972e-06, + "loss": 0.766, + "step": 14578 + }, + { + "epoch": 0.8024106995431779, + "grad_norm": 0.8068329691886902, + "learning_rate": 6.5517619161561954e-06, + "loss": 0.7642, + "step": 14579 + }, + { + "epoch": 0.8024657383455336, + "grad_norm": 0.6933363080024719, + "learning_rate": 6.5513498476027905e-06, + "loss": 0.8721, + "step": 14580 + }, + { + "epoch": 0.8025207771478893, + "grad_norm": 0.7041658163070679, + "learning_rate": 6.550937767389857e-06, + "loss": 0.6654, + "step": 14581 + }, + { + "epoch": 0.802575815950245, + "grad_norm": 0.7080103754997253, + "learning_rate": 6.550525675520489e-06, + "loss": 0.6917, + "step": 14582 + }, + { + "epoch": 0.8026308547526005, + "grad_norm": 0.6644875407218933, + "learning_rate": 6.550113571997785e-06, + "loss": 0.7674, + "step": 14583 + }, + { + "epoch": 0.8026858935549562, + "grad_norm": 0.7660395503044128, + "learning_rate": 6.549701456824843e-06, + "loss": 0.792, + "step": 14584 + }, + { + "epoch": 0.8027409323573119, + "grad_norm": 0.6853451132774353, + "learning_rate": 6.549289330004759e-06, + "loss": 0.8038, + "step": 14585 + }, + { + "epoch": 0.8027959711596676, + "grad_norm": 0.7349985837936401, + "learning_rate": 6.548877191540632e-06, + "loss": 0.7658, + "step": 14586 + }, + { + "epoch": 0.8028510099620232, + "grad_norm": 0.7605637311935425, + "learning_rate": 6.548465041435557e-06, + "loss": 0.7691, + "step": 14587 + }, + { + "epoch": 0.8029060487643789, + "grad_norm": 0.7635177969932556, + "learning_rate": 6.548052879692635e-06, + "loss": 0.8337, + "step": 14588 + }, + { + "epoch": 0.8029610875667346, + "grad_norm": 0.6873355507850647, + "learning_rate": 6.5476407063149614e-06, + "loss": 0.64, + "step": 14589 + }, + { + "epoch": 0.8030161263690903, + "grad_norm": 0.7642813920974731, + "learning_rate": 6.547228521305635e-06, + "loss": 0.6961, + "step": 14590 + }, + { + "epoch": 0.8030711651714458, + "grad_norm": 0.6329793334007263, + "learning_rate": 6.546816324667752e-06, + "loss": 0.73, + "step": 14591 + }, + { + "epoch": 0.8031262039738015, + "grad_norm": 0.6932308673858643, + "learning_rate": 6.546404116404412e-06, + "loss": 0.7582, + "step": 14592 + }, + { + "epoch": 0.8031812427761572, + "grad_norm": 0.699260413646698, + "learning_rate": 6.545991896518713e-06, + "loss": 0.7219, + "step": 14593 + }, + { + "epoch": 0.8032362815785129, + "grad_norm": 0.6217201948165894, + "learning_rate": 6.545579665013754e-06, + "loss": 0.6237, + "step": 14594 + }, + { + "epoch": 0.8032913203808685, + "grad_norm": 0.7078647017478943, + "learning_rate": 6.545167421892629e-06, + "loss": 0.666, + "step": 14595 + }, + { + "epoch": 0.8033463591832242, + "grad_norm": 0.6955916881561279, + "learning_rate": 6.544755167158441e-06, + "loss": 0.737, + "step": 14596 + }, + { + "epoch": 0.8034013979855799, + "grad_norm": 0.8195130825042725, + "learning_rate": 6.544342900814287e-06, + "loss": 0.787, + "step": 14597 + }, + { + "epoch": 0.8034564367879355, + "grad_norm": 0.6160768270492554, + "learning_rate": 6.543930622863263e-06, + "loss": 0.6141, + "step": 14598 + }, + { + "epoch": 0.8035114755902911, + "grad_norm": 0.8483116030693054, + "learning_rate": 6.543518333308472e-06, + "loss": 0.7639, + "step": 14599 + }, + { + "epoch": 0.8035665143926468, + "grad_norm": 0.6937680244445801, + "learning_rate": 6.5431060321530105e-06, + "loss": 0.7484, + "step": 14600 + }, + { + "epoch": 0.8036215531950025, + "grad_norm": 0.6298720836639404, + "learning_rate": 6.542693719399975e-06, + "loss": 0.6357, + "step": 14601 + }, + { + "epoch": 0.8036765919973582, + "grad_norm": 0.6431903839111328, + "learning_rate": 6.54228139505247e-06, + "loss": 0.6749, + "step": 14602 + }, + { + "epoch": 0.8037316307997138, + "grad_norm": 0.8972636461257935, + "learning_rate": 6.541869059113588e-06, + "loss": 0.8907, + "step": 14603 + }, + { + "epoch": 0.8037866696020695, + "grad_norm": 0.7302204966545105, + "learning_rate": 6.5414567115864316e-06, + "loss": 0.7494, + "step": 14604 + }, + { + "epoch": 0.8038417084044251, + "grad_norm": 0.7784821391105652, + "learning_rate": 6.541044352474099e-06, + "loss": 0.6582, + "step": 14605 + }, + { + "epoch": 0.8038967472067807, + "grad_norm": 0.7257398366928101, + "learning_rate": 6.54063198177969e-06, + "loss": 0.7362, + "step": 14606 + }, + { + "epoch": 0.8039517860091364, + "grad_norm": 0.6745980381965637, + "learning_rate": 6.540219599506302e-06, + "loss": 0.6756, + "step": 14607 + }, + { + "epoch": 0.8040068248114921, + "grad_norm": 0.8664490580558777, + "learning_rate": 6.539807205657037e-06, + "loss": 0.6728, + "step": 14608 + }, + { + "epoch": 0.8040618636138478, + "grad_norm": 0.704233705997467, + "learning_rate": 6.5393948002349926e-06, + "loss": 0.7713, + "step": 14609 + }, + { + "epoch": 0.8041169024162034, + "grad_norm": 0.7709019780158997, + "learning_rate": 6.538982383243271e-06, + "loss": 0.8148, + "step": 14610 + }, + { + "epoch": 0.8041719412185591, + "grad_norm": 0.7056839466094971, + "learning_rate": 6.538569954684967e-06, + "loss": 0.7143, + "step": 14611 + }, + { + "epoch": 0.8042269800209148, + "grad_norm": 0.715506374835968, + "learning_rate": 6.538157514563184e-06, + "loss": 0.7932, + "step": 14612 + }, + { + "epoch": 0.8042820188232704, + "grad_norm": 0.8245391845703125, + "learning_rate": 6.537745062881021e-06, + "loss": 0.7569, + "step": 14613 + }, + { + "epoch": 0.804337057625626, + "grad_norm": 0.6912628412246704, + "learning_rate": 6.5373325996415794e-06, + "loss": 0.7174, + "step": 14614 + }, + { + "epoch": 0.8043920964279817, + "grad_norm": 0.6994870901107788, + "learning_rate": 6.536920124847955e-06, + "loss": 0.6174, + "step": 14615 + }, + { + "epoch": 0.8044471352303374, + "grad_norm": 0.6660363674163818, + "learning_rate": 6.536507638503251e-06, + "loss": 0.8065, + "step": 14616 + }, + { + "epoch": 0.8045021740326931, + "grad_norm": 0.6742863059043884, + "learning_rate": 6.536095140610567e-06, + "loss": 0.7984, + "step": 14617 + }, + { + "epoch": 0.8045572128350487, + "grad_norm": 0.6868259906768799, + "learning_rate": 6.535682631173005e-06, + "loss": 0.7907, + "step": 14618 + }, + { + "epoch": 0.8046122516374044, + "grad_norm": 0.7442048788070679, + "learning_rate": 6.5352701101936615e-06, + "loss": 0.7893, + "step": 14619 + }, + { + "epoch": 0.80466729043976, + "grad_norm": 0.7389286756515503, + "learning_rate": 6.534857577675639e-06, + "loss": 0.827, + "step": 14620 + }, + { + "epoch": 0.8047223292421157, + "grad_norm": 0.6679701209068298, + "learning_rate": 6.534445033622036e-06, + "loss": 0.6721, + "step": 14621 + }, + { + "epoch": 0.8047773680444713, + "grad_norm": 0.6372442841529846, + "learning_rate": 6.534032478035957e-06, + "loss": 0.7381, + "step": 14622 + }, + { + "epoch": 0.804832406846827, + "grad_norm": 0.7682638764381409, + "learning_rate": 6.533619910920501e-06, + "loss": 0.7003, + "step": 14623 + }, + { + "epoch": 0.8048874456491827, + "grad_norm": 0.6821291446685791, + "learning_rate": 6.533207332278767e-06, + "loss": 0.8164, + "step": 14624 + }, + { + "epoch": 0.8049424844515384, + "grad_norm": 0.6591019034385681, + "learning_rate": 6.532794742113858e-06, + "loss": 0.6772, + "step": 14625 + }, + { + "epoch": 0.804997523253894, + "grad_norm": 0.7331292033195496, + "learning_rate": 6.532382140428874e-06, + "loss": 0.7606, + "step": 14626 + }, + { + "epoch": 0.8050525620562496, + "grad_norm": 0.9654768705368042, + "learning_rate": 6.531969527226917e-06, + "loss": 0.9196, + "step": 14627 + }, + { + "epoch": 0.8051076008586053, + "grad_norm": 0.6320267915725708, + "learning_rate": 6.5315569025110844e-06, + "loss": 0.6982, + "step": 14628 + }, + { + "epoch": 0.805162639660961, + "grad_norm": 0.6921746134757996, + "learning_rate": 6.531144266284481e-06, + "loss": 0.7176, + "step": 14629 + }, + { + "epoch": 0.8052176784633166, + "grad_norm": 0.7233335375785828, + "learning_rate": 6.530731618550208e-06, + "loss": 0.8388, + "step": 14630 + }, + { + "epoch": 0.8052727172656723, + "grad_norm": 0.6576363444328308, + "learning_rate": 6.530318959311366e-06, + "loss": 0.7511, + "step": 14631 + }, + { + "epoch": 0.805327756068028, + "grad_norm": 0.6921162009239197, + "learning_rate": 6.529906288571055e-06, + "loss": 0.8161, + "step": 14632 + }, + { + "epoch": 0.8053827948703837, + "grad_norm": 0.7314246296882629, + "learning_rate": 6.529493606332379e-06, + "loss": 0.7824, + "step": 14633 + }, + { + "epoch": 0.8054378336727392, + "grad_norm": 0.6419001221656799, + "learning_rate": 6.529080912598438e-06, + "loss": 0.7593, + "step": 14634 + }, + { + "epoch": 0.8054928724750949, + "grad_norm": 0.9500213861465454, + "learning_rate": 6.528668207372335e-06, + "loss": 0.7429, + "step": 14635 + }, + { + "epoch": 0.8055479112774506, + "grad_norm": 0.7299035787582397, + "learning_rate": 6.52825549065717e-06, + "loss": 0.8064, + "step": 14636 + }, + { + "epoch": 0.8056029500798063, + "grad_norm": 0.6231887936592102, + "learning_rate": 6.527842762456046e-06, + "loss": 0.6177, + "step": 14637 + }, + { + "epoch": 0.8056579888821619, + "grad_norm": 0.6219315528869629, + "learning_rate": 6.527430022772066e-06, + "loss": 0.6781, + "step": 14638 + }, + { + "epoch": 0.8057130276845176, + "grad_norm": 0.696861982345581, + "learning_rate": 6.527017271608329e-06, + "loss": 0.7508, + "step": 14639 + }, + { + "epoch": 0.8057680664868733, + "grad_norm": 0.7849573493003845, + "learning_rate": 6.5266045089679394e-06, + "loss": 0.7347, + "step": 14640 + }, + { + "epoch": 0.805823105289229, + "grad_norm": 0.6350993514060974, + "learning_rate": 6.526191734853999e-06, + "loss": 0.6863, + "step": 14641 + }, + { + "epoch": 0.8058781440915845, + "grad_norm": 0.6293141841888428, + "learning_rate": 6.5257789492696115e-06, + "loss": 0.7288, + "step": 14642 + }, + { + "epoch": 0.8059331828939402, + "grad_norm": 0.7801508903503418, + "learning_rate": 6.525366152217876e-06, + "loss": 0.7592, + "step": 14643 + }, + { + "epoch": 0.8059882216962959, + "grad_norm": 0.7031479477882385, + "learning_rate": 6.5249533437018964e-06, + "loss": 0.8677, + "step": 14644 + }, + { + "epoch": 0.8060432604986516, + "grad_norm": 0.7052507996559143, + "learning_rate": 6.524540523724777e-06, + "loss": 0.7957, + "step": 14645 + }, + { + "epoch": 0.8060982993010072, + "grad_norm": 0.669743537902832, + "learning_rate": 6.524127692289619e-06, + "loss": 0.7163, + "step": 14646 + }, + { + "epoch": 0.8061533381033629, + "grad_norm": 0.7180876731872559, + "learning_rate": 6.523714849399525e-06, + "loss": 0.8814, + "step": 14647 + }, + { + "epoch": 0.8062083769057186, + "grad_norm": 0.6617746353149414, + "learning_rate": 6.523301995057597e-06, + "loss": 0.721, + "step": 14648 + }, + { + "epoch": 0.8062634157080741, + "grad_norm": 0.6464657783508301, + "learning_rate": 6.5228891292669404e-06, + "loss": 0.7334, + "step": 14649 + }, + { + "epoch": 0.8063184545104298, + "grad_norm": 0.7648638486862183, + "learning_rate": 6.522476252030658e-06, + "loss": 0.7701, + "step": 14650 + }, + { + "epoch": 0.8063734933127855, + "grad_norm": 0.7313019037246704, + "learning_rate": 6.522063363351851e-06, + "loss": 0.7912, + "step": 14651 + }, + { + "epoch": 0.8064285321151412, + "grad_norm": 0.6175631284713745, + "learning_rate": 6.5216504632336195e-06, + "loss": 0.7568, + "step": 14652 + }, + { + "epoch": 0.8064835709174968, + "grad_norm": 0.6935408711433411, + "learning_rate": 6.521237551679074e-06, + "loss": 0.7622, + "step": 14653 + }, + { + "epoch": 0.8065386097198525, + "grad_norm": 0.7232398390769958, + "learning_rate": 6.520824628691314e-06, + "loss": 0.7908, + "step": 14654 + }, + { + "epoch": 0.8065936485222082, + "grad_norm": 0.6642309427261353, + "learning_rate": 6.520411694273443e-06, + "loss": 0.7355, + "step": 14655 + }, + { + "epoch": 0.8066486873245639, + "grad_norm": 0.6679350137710571, + "learning_rate": 6.5199987484285635e-06, + "loss": 0.735, + "step": 14656 + }, + { + "epoch": 0.8067037261269194, + "grad_norm": 0.6861871480941772, + "learning_rate": 6.519585791159782e-06, + "loss": 0.6744, + "step": 14657 + }, + { + "epoch": 0.8067587649292751, + "grad_norm": 0.7689095735549927, + "learning_rate": 6.519172822470199e-06, + "loss": 0.6888, + "step": 14658 + }, + { + "epoch": 0.8068138037316308, + "grad_norm": 0.6604742407798767, + "learning_rate": 6.5187598423629206e-06, + "loss": 0.6943, + "step": 14659 + }, + { + "epoch": 0.8068688425339865, + "grad_norm": 0.6478890776634216, + "learning_rate": 6.518346850841049e-06, + "loss": 0.7161, + "step": 14660 + }, + { + "epoch": 0.8069238813363421, + "grad_norm": 0.6213741302490234, + "learning_rate": 6.517933847907689e-06, + "loss": 0.68, + "step": 14661 + }, + { + "epoch": 0.8069789201386978, + "grad_norm": 0.7663899660110474, + "learning_rate": 6.517520833565945e-06, + "loss": 0.7498, + "step": 14662 + }, + { + "epoch": 0.8070339589410535, + "grad_norm": 0.653498649597168, + "learning_rate": 6.517107807818921e-06, + "loss": 0.7433, + "step": 14663 + }, + { + "epoch": 0.8070889977434091, + "grad_norm": 0.7618738412857056, + "learning_rate": 6.51669477066972e-06, + "loss": 0.7499, + "step": 14664 + }, + { + "epoch": 0.8071440365457647, + "grad_norm": 0.5960344672203064, + "learning_rate": 6.516281722121447e-06, + "loss": 0.6005, + "step": 14665 + }, + { + "epoch": 0.8071990753481204, + "grad_norm": 0.6768549084663391, + "learning_rate": 6.5158686621772075e-06, + "loss": 0.6859, + "step": 14666 + }, + { + "epoch": 0.8072541141504761, + "grad_norm": 0.6475711464881897, + "learning_rate": 6.515455590840104e-06, + "loss": 0.7582, + "step": 14667 + }, + { + "epoch": 0.8073091529528318, + "grad_norm": 0.7188607454299927, + "learning_rate": 6.5150425081132414e-06, + "loss": 0.7241, + "step": 14668 + }, + { + "epoch": 0.8073641917551874, + "grad_norm": 0.6507582068443298, + "learning_rate": 6.514629413999727e-06, + "loss": 0.7659, + "step": 14669 + }, + { + "epoch": 0.807419230557543, + "grad_norm": 0.6676538586616516, + "learning_rate": 6.514216308502661e-06, + "loss": 0.7336, + "step": 14670 + }, + { + "epoch": 0.8074742693598987, + "grad_norm": 0.7141211628913879, + "learning_rate": 6.513803191625152e-06, + "loss": 0.8121, + "step": 14671 + }, + { + "epoch": 0.8075293081622544, + "grad_norm": 0.7497949600219727, + "learning_rate": 6.513390063370302e-06, + "loss": 0.7238, + "step": 14672 + }, + { + "epoch": 0.80758434696461, + "grad_norm": 0.671271562576294, + "learning_rate": 6.51297692374122e-06, + "loss": 0.7876, + "step": 14673 + }, + { + "epoch": 0.8076393857669657, + "grad_norm": 0.7081878781318665, + "learning_rate": 6.512563772741008e-06, + "loss": 0.6774, + "step": 14674 + }, + { + "epoch": 0.8076944245693214, + "grad_norm": 0.640925943851471, + "learning_rate": 6.512150610372769e-06, + "loss": 0.7094, + "step": 14675 + }, + { + "epoch": 0.8077494633716771, + "grad_norm": 0.6333619952201843, + "learning_rate": 6.511737436639611e-06, + "loss": 0.6439, + "step": 14676 + }, + { + "epoch": 0.8078045021740327, + "grad_norm": 0.7294490337371826, + "learning_rate": 6.511324251544642e-06, + "loss": 0.7786, + "step": 14677 + }, + { + "epoch": 0.8078595409763883, + "grad_norm": 0.6488819718360901, + "learning_rate": 6.510911055090963e-06, + "loss": 0.7495, + "step": 14678 + }, + { + "epoch": 0.807914579778744, + "grad_norm": 0.6535395383834839, + "learning_rate": 6.51049784728168e-06, + "loss": 0.6713, + "step": 14679 + }, + { + "epoch": 0.8079696185810997, + "grad_norm": 0.6795744895935059, + "learning_rate": 6.5100846281198995e-06, + "loss": 0.71, + "step": 14680 + }, + { + "epoch": 0.8080246573834553, + "grad_norm": 0.661171019077301, + "learning_rate": 6.509671397608728e-06, + "loss": 0.7009, + "step": 14681 + }, + { + "epoch": 0.808079696185811, + "grad_norm": 0.6474859118461609, + "learning_rate": 6.50925815575127e-06, + "loss": 0.7268, + "step": 14682 + }, + { + "epoch": 0.8081347349881667, + "grad_norm": 0.676891565322876, + "learning_rate": 6.508844902550633e-06, + "loss": 0.8748, + "step": 14683 + }, + { + "epoch": 0.8081897737905224, + "grad_norm": 0.9747083783149719, + "learning_rate": 6.50843163800992e-06, + "loss": 0.6817, + "step": 14684 + }, + { + "epoch": 0.808244812592878, + "grad_norm": 0.655274510383606, + "learning_rate": 6.50801836213224e-06, + "loss": 0.7675, + "step": 14685 + }, + { + "epoch": 0.8082998513952336, + "grad_norm": 0.6916972398757935, + "learning_rate": 6.507605074920697e-06, + "loss": 0.7862, + "step": 14686 + }, + { + "epoch": 0.8083548901975893, + "grad_norm": 0.7079103589057922, + "learning_rate": 6.5071917763783975e-06, + "loss": 0.671, + "step": 14687 + }, + { + "epoch": 0.808409928999945, + "grad_norm": 0.7460986375808716, + "learning_rate": 6.506778466508447e-06, + "loss": 0.7136, + "step": 14688 + }, + { + "epoch": 0.8084649678023006, + "grad_norm": 0.6531261801719666, + "learning_rate": 6.5063651453139555e-06, + "loss": 0.811, + "step": 14689 + }, + { + "epoch": 0.8085200066046563, + "grad_norm": 0.7160762548446655, + "learning_rate": 6.505951812798025e-06, + "loss": 0.8368, + "step": 14690 + }, + { + "epoch": 0.808575045407012, + "grad_norm": 0.7230852842330933, + "learning_rate": 6.505538468963763e-06, + "loss": 0.6908, + "step": 14691 + }, + { + "epoch": 0.8086300842093676, + "grad_norm": 0.6912978887557983, + "learning_rate": 6.505125113814278e-06, + "loss": 0.6716, + "step": 14692 + }, + { + "epoch": 0.8086851230117232, + "grad_norm": 0.6745109558105469, + "learning_rate": 6.504711747352677e-06, + "loss": 0.7119, + "step": 14693 + }, + { + "epoch": 0.8087401618140789, + "grad_norm": 0.678657054901123, + "learning_rate": 6.5042983695820624e-06, + "loss": 0.7548, + "step": 14694 + }, + { + "epoch": 0.8087952006164346, + "grad_norm": 0.7501665949821472, + "learning_rate": 6.503884980505546e-06, + "loss": 0.7493, + "step": 14695 + }, + { + "epoch": 0.8088502394187902, + "grad_norm": 0.6181747317314148, + "learning_rate": 6.503471580126232e-06, + "loss": 0.7217, + "step": 14696 + }, + { + "epoch": 0.8089052782211459, + "grad_norm": 0.6548559069633484, + "learning_rate": 6.5030581684472295e-06, + "loss": 0.7448, + "step": 14697 + }, + { + "epoch": 0.8089603170235016, + "grad_norm": 0.7716642022132874, + "learning_rate": 6.5026447454716426e-06, + "loss": 0.8794, + "step": 14698 + }, + { + "epoch": 0.8090153558258573, + "grad_norm": 0.861995279788971, + "learning_rate": 6.502231311202581e-06, + "loss": 0.7839, + "step": 14699 + }, + { + "epoch": 0.8090703946282128, + "grad_norm": 0.796821117401123, + "learning_rate": 6.501817865643149e-06, + "loss": 0.8541, + "step": 14700 + }, + { + "epoch": 0.8091254334305685, + "grad_norm": 0.6995296478271484, + "learning_rate": 6.501404408796457e-06, + "loss": 0.677, + "step": 14701 + }, + { + "epoch": 0.8091804722329242, + "grad_norm": 0.6681582927703857, + "learning_rate": 6.500990940665611e-06, + "loss": 0.7754, + "step": 14702 + }, + { + "epoch": 0.8092355110352799, + "grad_norm": 0.5945298671722412, + "learning_rate": 6.50057746125372e-06, + "loss": 0.6762, + "step": 14703 + }, + { + "epoch": 0.8092905498376355, + "grad_norm": 0.672554612159729, + "learning_rate": 6.500163970563889e-06, + "loss": 0.6967, + "step": 14704 + }, + { + "epoch": 0.8093455886399912, + "grad_norm": 0.6375272870063782, + "learning_rate": 6.499750468599227e-06, + "loss": 0.7291, + "step": 14705 + }, + { + "epoch": 0.8094006274423469, + "grad_norm": 0.6369407773017883, + "learning_rate": 6.499336955362844e-06, + "loss": 0.6939, + "step": 14706 + }, + { + "epoch": 0.8094556662447026, + "grad_norm": 0.6497664451599121, + "learning_rate": 6.498923430857844e-06, + "loss": 0.7207, + "step": 14707 + }, + { + "epoch": 0.8095107050470581, + "grad_norm": 0.7345920205116272, + "learning_rate": 6.498509895087337e-06, + "loss": 0.8373, + "step": 14708 + }, + { + "epoch": 0.8095657438494138, + "grad_norm": 0.6824862957000732, + "learning_rate": 6.4980963480544324e-06, + "loss": 0.7531, + "step": 14709 + }, + { + "epoch": 0.8096207826517695, + "grad_norm": 0.7067939639091492, + "learning_rate": 6.497682789762236e-06, + "loss": 0.6951, + "step": 14710 + }, + { + "epoch": 0.8096758214541252, + "grad_norm": 0.6856693625450134, + "learning_rate": 6.497269220213856e-06, + "loss": 0.7264, + "step": 14711 + }, + { + "epoch": 0.8097308602564808, + "grad_norm": 0.6881466507911682, + "learning_rate": 6.4968556394124e-06, + "loss": 0.7837, + "step": 14712 + }, + { + "epoch": 0.8097858990588365, + "grad_norm": 0.6211455464363098, + "learning_rate": 6.49644204736098e-06, + "loss": 0.7278, + "step": 14713 + }, + { + "epoch": 0.8098409378611922, + "grad_norm": 0.688604474067688, + "learning_rate": 6.496028444062701e-06, + "loss": 0.7786, + "step": 14714 + }, + { + "epoch": 0.8098959766635478, + "grad_norm": 0.6615015268325806, + "learning_rate": 6.495614829520673e-06, + "loss": 0.7014, + "step": 14715 + }, + { + "epoch": 0.8099510154659034, + "grad_norm": 0.712661623954773, + "learning_rate": 6.495201203738004e-06, + "loss": 0.6792, + "step": 14716 + }, + { + "epoch": 0.8100060542682591, + "grad_norm": 0.6737191677093506, + "learning_rate": 6.494787566717803e-06, + "loss": 0.7937, + "step": 14717 + }, + { + "epoch": 0.8100610930706148, + "grad_norm": 0.8007351160049438, + "learning_rate": 6.494373918463179e-06, + "loss": 0.8367, + "step": 14718 + }, + { + "epoch": 0.8101161318729705, + "grad_norm": 0.7500883936882019, + "learning_rate": 6.493960258977241e-06, + "loss": 0.8102, + "step": 14719 + }, + { + "epoch": 0.8101711706753261, + "grad_norm": 0.7605966925621033, + "learning_rate": 6.493546588263097e-06, + "loss": 0.8316, + "step": 14720 + }, + { + "epoch": 0.8102262094776818, + "grad_norm": 0.746762216091156, + "learning_rate": 6.493132906323858e-06, + "loss": 0.7765, + "step": 14721 + }, + { + "epoch": 0.8102812482800374, + "grad_norm": 0.6034676432609558, + "learning_rate": 6.49271921316263e-06, + "loss": 0.7109, + "step": 14722 + }, + { + "epoch": 0.8103362870823931, + "grad_norm": 0.6965274810791016, + "learning_rate": 6.492305508782525e-06, + "loss": 0.8156, + "step": 14723 + }, + { + "epoch": 0.8103913258847487, + "grad_norm": 0.6813820004463196, + "learning_rate": 6.4918917931866495e-06, + "loss": 0.7016, + "step": 14724 + }, + { + "epoch": 0.8104463646871044, + "grad_norm": 0.8055655360221863, + "learning_rate": 6.491478066378117e-06, + "loss": 0.7837, + "step": 14725 + }, + { + "epoch": 0.8105014034894601, + "grad_norm": 0.6131647229194641, + "learning_rate": 6.491064328360033e-06, + "loss": 0.6716, + "step": 14726 + }, + { + "epoch": 0.8105564422918158, + "grad_norm": 0.6845986247062683, + "learning_rate": 6.49065057913551e-06, + "loss": 0.8112, + "step": 14727 + }, + { + "epoch": 0.8106114810941714, + "grad_norm": 0.6867175698280334, + "learning_rate": 6.490236818707653e-06, + "loss": 0.7953, + "step": 14728 + }, + { + "epoch": 0.810666519896527, + "grad_norm": 0.7170011401176453, + "learning_rate": 6.489823047079578e-06, + "loss": 0.8108, + "step": 14729 + }, + { + "epoch": 0.8107215586988827, + "grad_norm": 0.6280927658081055, + "learning_rate": 6.489409264254393e-06, + "loss": 0.6807, + "step": 14730 + }, + { + "epoch": 0.8107765975012384, + "grad_norm": 0.8344630002975464, + "learning_rate": 6.488995470235204e-06, + "loss": 0.7555, + "step": 14731 + }, + { + "epoch": 0.810831636303594, + "grad_norm": 0.6674200296401978, + "learning_rate": 6.488581665025125e-06, + "loss": 0.5732, + "step": 14732 + }, + { + "epoch": 0.8108866751059497, + "grad_norm": 0.7843313217163086, + "learning_rate": 6.4881678486272646e-06, + "loss": 0.6689, + "step": 14733 + }, + { + "epoch": 0.8109417139083054, + "grad_norm": 0.6951878666877747, + "learning_rate": 6.487754021044732e-06, + "loss": 0.8005, + "step": 14734 + }, + { + "epoch": 0.810996752710661, + "grad_norm": 0.7773714065551758, + "learning_rate": 6.487340182280639e-06, + "loss": 0.8151, + "step": 14735 + }, + { + "epoch": 0.8110517915130167, + "grad_norm": 0.824998140335083, + "learning_rate": 6.486926332338095e-06, + "loss": 0.7947, + "step": 14736 + }, + { + "epoch": 0.8111068303153723, + "grad_norm": 0.6411730647087097, + "learning_rate": 6.486512471220212e-06, + "loss": 0.7272, + "step": 14737 + }, + { + "epoch": 0.811161869117728, + "grad_norm": 0.6758518815040588, + "learning_rate": 6.486098598930097e-06, + "loss": 0.6676, + "step": 14738 + }, + { + "epoch": 0.8112169079200836, + "grad_norm": 0.7147762179374695, + "learning_rate": 6.485684715470866e-06, + "loss": 0.7796, + "step": 14739 + }, + { + "epoch": 0.8112719467224393, + "grad_norm": 0.7641217112541199, + "learning_rate": 6.485270820845623e-06, + "loss": 0.7943, + "step": 14740 + }, + { + "epoch": 0.811326985524795, + "grad_norm": 0.6947311162948608, + "learning_rate": 6.484856915057482e-06, + "loss": 0.7791, + "step": 14741 + }, + { + "epoch": 0.8113820243271507, + "grad_norm": 0.6781480312347412, + "learning_rate": 6.4844429981095565e-06, + "loss": 0.7399, + "step": 14742 + }, + { + "epoch": 0.8114370631295063, + "grad_norm": 0.6716181039810181, + "learning_rate": 6.484029070004953e-06, + "loss": 0.8111, + "step": 14743 + }, + { + "epoch": 0.8114921019318619, + "grad_norm": 0.8642836213111877, + "learning_rate": 6.4836151307467854e-06, + "loss": 0.756, + "step": 14744 + }, + { + "epoch": 0.8115471407342176, + "grad_norm": 0.5997880101203918, + "learning_rate": 6.483201180338163e-06, + "loss": 0.6043, + "step": 14745 + }, + { + "epoch": 0.8116021795365733, + "grad_norm": 0.7397846579551697, + "learning_rate": 6.4827872187821985e-06, + "loss": 0.848, + "step": 14746 + }, + { + "epoch": 0.8116572183389289, + "grad_norm": 0.7586305141448975, + "learning_rate": 6.482373246082001e-06, + "loss": 0.802, + "step": 14747 + }, + { + "epoch": 0.8117122571412846, + "grad_norm": 0.705182671546936, + "learning_rate": 6.4819592622406825e-06, + "loss": 0.7484, + "step": 14748 + }, + { + "epoch": 0.8117672959436403, + "grad_norm": 0.7092768549919128, + "learning_rate": 6.481545267261357e-06, + "loss": 0.7031, + "step": 14749 + }, + { + "epoch": 0.811822334745996, + "grad_norm": 0.6800800561904907, + "learning_rate": 6.4811312611471325e-06, + "loss": 0.7253, + "step": 14750 + }, + { + "epoch": 0.8118773735483515, + "grad_norm": 0.6862359642982483, + "learning_rate": 6.4807172439011215e-06, + "loss": 0.818, + "step": 14751 + }, + { + "epoch": 0.8119324123507072, + "grad_norm": 0.6928552389144897, + "learning_rate": 6.480303215526436e-06, + "loss": 0.7459, + "step": 14752 + }, + { + "epoch": 0.8119874511530629, + "grad_norm": 0.6869228482246399, + "learning_rate": 6.479889176026189e-06, + "loss": 0.7024, + "step": 14753 + }, + { + "epoch": 0.8120424899554186, + "grad_norm": 0.7036190032958984, + "learning_rate": 6.479475125403489e-06, + "loss": 0.766, + "step": 14754 + }, + { + "epoch": 0.8120975287577742, + "grad_norm": 0.6574180722236633, + "learning_rate": 6.479061063661452e-06, + "loss": 0.7355, + "step": 14755 + }, + { + "epoch": 0.8121525675601299, + "grad_norm": 0.6424534916877747, + "learning_rate": 6.478646990803188e-06, + "loss": 0.6837, + "step": 14756 + }, + { + "epoch": 0.8122076063624856, + "grad_norm": 0.6922320127487183, + "learning_rate": 6.478232906831808e-06, + "loss": 0.7535, + "step": 14757 + }, + { + "epoch": 0.8122626451648413, + "grad_norm": 0.6424705386161804, + "learning_rate": 6.477818811750426e-06, + "loss": 0.691, + "step": 14758 + }, + { + "epoch": 0.8123176839671968, + "grad_norm": 0.6180749535560608, + "learning_rate": 6.4774047055621525e-06, + "loss": 0.6944, + "step": 14759 + }, + { + "epoch": 0.8123727227695525, + "grad_norm": 0.8718746900558472, + "learning_rate": 6.4769905882701e-06, + "loss": 0.89, + "step": 14760 + }, + { + "epoch": 0.8124277615719082, + "grad_norm": 0.6664311289787292, + "learning_rate": 6.476576459877384e-06, + "loss": 0.7144, + "step": 14761 + }, + { + "epoch": 0.8124828003742639, + "grad_norm": 0.6547374129295349, + "learning_rate": 6.476162320387112e-06, + "loss": 0.7292, + "step": 14762 + }, + { + "epoch": 0.8125378391766195, + "grad_norm": 0.7387503385543823, + "learning_rate": 6.475748169802401e-06, + "loss": 0.7388, + "step": 14763 + }, + { + "epoch": 0.8125928779789752, + "grad_norm": 0.6013749241828918, + "learning_rate": 6.475334008126361e-06, + "loss": 0.6853, + "step": 14764 + }, + { + "epoch": 0.8126479167813309, + "grad_norm": 0.6720583438873291, + "learning_rate": 6.474919835362105e-06, + "loss": 0.7392, + "step": 14765 + }, + { + "epoch": 0.8127029555836865, + "grad_norm": 0.6651661992073059, + "learning_rate": 6.474505651512748e-06, + "loss": 0.7586, + "step": 14766 + }, + { + "epoch": 0.8127579943860421, + "grad_norm": 0.7653207182884216, + "learning_rate": 6.474091456581401e-06, + "loss": 0.9182, + "step": 14767 + }, + { + "epoch": 0.8128130331883978, + "grad_norm": 0.6322795152664185, + "learning_rate": 6.473677250571176e-06, + "loss": 0.6954, + "step": 14768 + }, + { + "epoch": 0.8128680719907535, + "grad_norm": 0.7423616647720337, + "learning_rate": 6.4732630334851885e-06, + "loss": 0.748, + "step": 14769 + }, + { + "epoch": 0.8129231107931092, + "grad_norm": 0.5989160537719727, + "learning_rate": 6.472848805326549e-06, + "loss": 0.6571, + "step": 14770 + }, + { + "epoch": 0.8129781495954648, + "grad_norm": 0.695566713809967, + "learning_rate": 6.472434566098373e-06, + "loss": 0.6936, + "step": 14771 + }, + { + "epoch": 0.8130331883978205, + "grad_norm": 0.6993961930274963, + "learning_rate": 6.4720203158037734e-06, + "loss": 0.8283, + "step": 14772 + }, + { + "epoch": 0.8130882272001762, + "grad_norm": 0.6430020928382874, + "learning_rate": 6.471606054445861e-06, + "loss": 0.6882, + "step": 14773 + }, + { + "epoch": 0.8131432660025318, + "grad_norm": 0.6834734678268433, + "learning_rate": 6.471191782027754e-06, + "loss": 0.7519, + "step": 14774 + }, + { + "epoch": 0.8131983048048874, + "grad_norm": 0.679432213306427, + "learning_rate": 6.470777498552561e-06, + "loss": 0.7707, + "step": 14775 + }, + { + "epoch": 0.8132533436072431, + "grad_norm": 0.6929466128349304, + "learning_rate": 6.4703632040234e-06, + "loss": 0.7166, + "step": 14776 + }, + { + "epoch": 0.8133083824095988, + "grad_norm": 0.7033447623252869, + "learning_rate": 6.469948898443381e-06, + "loss": 0.7558, + "step": 14777 + }, + { + "epoch": 0.8133634212119544, + "grad_norm": 0.89338618516922, + "learning_rate": 6.469534581815621e-06, + "loss": 0.7829, + "step": 14778 + }, + { + "epoch": 0.8134184600143101, + "grad_norm": 0.7361789345741272, + "learning_rate": 6.469120254143233e-06, + "loss": 0.7885, + "step": 14779 + }, + { + "epoch": 0.8134734988166658, + "grad_norm": 0.7532172203063965, + "learning_rate": 6.468705915429329e-06, + "loss": 0.7791, + "step": 14780 + }, + { + "epoch": 0.8135285376190214, + "grad_norm": 0.7082527279853821, + "learning_rate": 6.468291565677025e-06, + "loss": 0.7809, + "step": 14781 + }, + { + "epoch": 0.813583576421377, + "grad_norm": 0.7854330539703369, + "learning_rate": 6.467877204889435e-06, + "loss": 0.8467, + "step": 14782 + }, + { + "epoch": 0.8136386152237327, + "grad_norm": 0.7649636268615723, + "learning_rate": 6.467462833069672e-06, + "loss": 0.7766, + "step": 14783 + }, + { + "epoch": 0.8136936540260884, + "grad_norm": 0.6293399930000305, + "learning_rate": 6.467048450220852e-06, + "loss": 0.7307, + "step": 14784 + }, + { + "epoch": 0.8137486928284441, + "grad_norm": 0.7131813764572144, + "learning_rate": 6.4666340563460874e-06, + "loss": 0.7614, + "step": 14785 + }, + { + "epoch": 0.8138037316307997, + "grad_norm": 0.6650925874710083, + "learning_rate": 6.466219651448496e-06, + "loss": 0.7576, + "step": 14786 + }, + { + "epoch": 0.8138587704331554, + "grad_norm": 0.8009011745452881, + "learning_rate": 6.4658052355311875e-06, + "loss": 0.7127, + "step": 14787 + }, + { + "epoch": 0.813913809235511, + "grad_norm": 1.009027123451233, + "learning_rate": 6.465390808597281e-06, + "loss": 0.7647, + "step": 14788 + }, + { + "epoch": 0.8139688480378667, + "grad_norm": 0.7495583891868591, + "learning_rate": 6.464976370649888e-06, + "loss": 0.7276, + "step": 14789 + }, + { + "epoch": 0.8140238868402223, + "grad_norm": 0.7181064486503601, + "learning_rate": 6.464561921692125e-06, + "loss": 0.687, + "step": 14790 + }, + { + "epoch": 0.814078925642578, + "grad_norm": 0.7480552196502686, + "learning_rate": 6.464147461727108e-06, + "loss": 0.7813, + "step": 14791 + }, + { + "epoch": 0.8141339644449337, + "grad_norm": 0.6699607968330383, + "learning_rate": 6.4637329907579506e-06, + "loss": 0.7364, + "step": 14792 + }, + { + "epoch": 0.8141890032472894, + "grad_norm": 0.7321322560310364, + "learning_rate": 6.463318508787767e-06, + "loss": 0.6799, + "step": 14793 + }, + { + "epoch": 0.814244042049645, + "grad_norm": 0.8992179036140442, + "learning_rate": 6.462904015819673e-06, + "loss": 0.7602, + "step": 14794 + }, + { + "epoch": 0.8142990808520006, + "grad_norm": 0.6949485540390015, + "learning_rate": 6.462489511856784e-06, + "loss": 0.6701, + "step": 14795 + }, + { + "epoch": 0.8143541196543563, + "grad_norm": 0.6367032527923584, + "learning_rate": 6.462074996902217e-06, + "loss": 0.7132, + "step": 14796 + }, + { + "epoch": 0.814409158456712, + "grad_norm": 0.6424476504325867, + "learning_rate": 6.461660470959084e-06, + "loss": 0.7111, + "step": 14797 + }, + { + "epoch": 0.8144641972590676, + "grad_norm": 0.6649259924888611, + "learning_rate": 6.4612459340305025e-06, + "loss": 0.6583, + "step": 14798 + }, + { + "epoch": 0.8145192360614233, + "grad_norm": 0.7781171798706055, + "learning_rate": 6.460831386119587e-06, + "loss": 0.8145, + "step": 14799 + }, + { + "epoch": 0.814574274863779, + "grad_norm": 0.7409094572067261, + "learning_rate": 6.460416827229455e-06, + "loss": 0.7559, + "step": 14800 + }, + { + "epoch": 0.8146293136661347, + "grad_norm": 1.2152613401412964, + "learning_rate": 6.46000225736322e-06, + "loss": 0.8263, + "step": 14801 + }, + { + "epoch": 0.8146843524684902, + "grad_norm": 0.7133356332778931, + "learning_rate": 6.459587676524e-06, + "loss": 0.7687, + "step": 14802 + }, + { + "epoch": 0.8147393912708459, + "grad_norm": 0.8576061129570007, + "learning_rate": 6.459173084714908e-06, + "loss": 0.8364, + "step": 14803 + }, + { + "epoch": 0.8147944300732016, + "grad_norm": 0.7701650857925415, + "learning_rate": 6.4587584819390634e-06, + "loss": 0.7768, + "step": 14804 + }, + { + "epoch": 0.8148494688755573, + "grad_norm": 0.6629199981689453, + "learning_rate": 6.45834386819958e-06, + "loss": 0.7338, + "step": 14805 + }, + { + "epoch": 0.8149045076779129, + "grad_norm": 0.6498340964317322, + "learning_rate": 6.457929243499574e-06, + "loss": 0.7241, + "step": 14806 + }, + { + "epoch": 0.8149595464802686, + "grad_norm": 0.7107635140419006, + "learning_rate": 6.457514607842164e-06, + "loss": 0.7999, + "step": 14807 + }, + { + "epoch": 0.8150145852826243, + "grad_norm": 0.8689384460449219, + "learning_rate": 6.457099961230462e-06, + "loss": 0.7882, + "step": 14808 + }, + { + "epoch": 0.81506962408498, + "grad_norm": 0.7050377726554871, + "learning_rate": 6.456685303667587e-06, + "loss": 0.8039, + "step": 14809 + }, + { + "epoch": 0.8151246628873355, + "grad_norm": 0.6171709895133972, + "learning_rate": 6.456270635156656e-06, + "loss": 0.6569, + "step": 14810 + }, + { + "epoch": 0.8151797016896912, + "grad_norm": 0.837285041809082, + "learning_rate": 6.455855955700785e-06, + "loss": 0.6529, + "step": 14811 + }, + { + "epoch": 0.8152347404920469, + "grad_norm": 0.7335891723632812, + "learning_rate": 6.45544126530309e-06, + "loss": 0.814, + "step": 14812 + }, + { + "epoch": 0.8152897792944026, + "grad_norm": 0.7217129468917847, + "learning_rate": 6.4550265639666864e-06, + "loss": 0.795, + "step": 14813 + }, + { + "epoch": 0.8153448180967582, + "grad_norm": 0.7292104959487915, + "learning_rate": 6.454611851694694e-06, + "loss": 0.7169, + "step": 14814 + }, + { + "epoch": 0.8153998568991139, + "grad_norm": 0.7190173864364624, + "learning_rate": 6.454197128490229e-06, + "loss": 0.8413, + "step": 14815 + }, + { + "epoch": 0.8154548957014696, + "grad_norm": 0.6679649949073792, + "learning_rate": 6.453782394356407e-06, + "loss": 0.6626, + "step": 14816 + }, + { + "epoch": 0.8155099345038253, + "grad_norm": 0.6829885244369507, + "learning_rate": 6.453367649296347e-06, + "loss": 0.6512, + "step": 14817 + }, + { + "epoch": 0.8155649733061808, + "grad_norm": 0.659461498260498, + "learning_rate": 6.452952893313163e-06, + "loss": 0.7271, + "step": 14818 + }, + { + "epoch": 0.8156200121085365, + "grad_norm": 0.6737749576568604, + "learning_rate": 6.452538126409975e-06, + "loss": 0.6882, + "step": 14819 + }, + { + "epoch": 0.8156750509108922, + "grad_norm": 0.7798036336898804, + "learning_rate": 6.452123348589899e-06, + "loss": 0.7214, + "step": 14820 + }, + { + "epoch": 0.8157300897132478, + "grad_norm": 0.6594774127006531, + "learning_rate": 6.451708559856051e-06, + "loss": 0.7611, + "step": 14821 + }, + { + "epoch": 0.8157851285156035, + "grad_norm": 0.6795164942741394, + "learning_rate": 6.451293760211552e-06, + "loss": 0.6825, + "step": 14822 + }, + { + "epoch": 0.8158401673179592, + "grad_norm": 0.8376501798629761, + "learning_rate": 6.450878949659517e-06, + "loss": 0.7898, + "step": 14823 + }, + { + "epoch": 0.8158952061203149, + "grad_norm": 0.6746712923049927, + "learning_rate": 6.450464128203064e-06, + "loss": 0.6771, + "step": 14824 + }, + { + "epoch": 0.8159502449226704, + "grad_norm": 0.7984384894371033, + "learning_rate": 6.450049295845311e-06, + "loss": 0.7326, + "step": 14825 + }, + { + "epoch": 0.8160052837250261, + "grad_norm": 0.8210996389389038, + "learning_rate": 6.449634452589376e-06, + "loss": 0.8194, + "step": 14826 + }, + { + "epoch": 0.8160603225273818, + "grad_norm": 0.7045891284942627, + "learning_rate": 6.449219598438376e-06, + "loss": 0.7683, + "step": 14827 + }, + { + "epoch": 0.8161153613297375, + "grad_norm": 0.7199337482452393, + "learning_rate": 6.448804733395431e-06, + "loss": 0.7125, + "step": 14828 + }, + { + "epoch": 0.8161704001320931, + "grad_norm": 0.8576976656913757, + "learning_rate": 6.448389857463655e-06, + "loss": 0.6744, + "step": 14829 + }, + { + "epoch": 0.8162254389344488, + "grad_norm": 0.6944701075553894, + "learning_rate": 6.4479749706461705e-06, + "loss": 0.7663, + "step": 14830 + }, + { + "epoch": 0.8162804777368045, + "grad_norm": 0.7436455488204956, + "learning_rate": 6.447560072946093e-06, + "loss": 0.7612, + "step": 14831 + }, + { + "epoch": 0.8163355165391601, + "grad_norm": 0.6023590564727783, + "learning_rate": 6.447145164366542e-06, + "loss": 0.7029, + "step": 14832 + }, + { + "epoch": 0.8163905553415157, + "grad_norm": 0.6720685362815857, + "learning_rate": 6.446730244910633e-06, + "loss": 0.7821, + "step": 14833 + }, + { + "epoch": 0.8164455941438714, + "grad_norm": 0.6359856128692627, + "learning_rate": 6.446315314581488e-06, + "loss": 0.7119, + "step": 14834 + }, + { + "epoch": 0.8165006329462271, + "grad_norm": 0.6796891689300537, + "learning_rate": 6.445900373382225e-06, + "loss": 0.7414, + "step": 14835 + }, + { + "epoch": 0.8165556717485828, + "grad_norm": 0.6865763068199158, + "learning_rate": 6.445485421315963e-06, + "loss": 0.7239, + "step": 14836 + }, + { + "epoch": 0.8166107105509384, + "grad_norm": 0.6696601510047913, + "learning_rate": 6.445070458385816e-06, + "loss": 0.6322, + "step": 14837 + }, + { + "epoch": 0.8166657493532941, + "grad_norm": 0.6800506711006165, + "learning_rate": 6.444655484594909e-06, + "loss": 0.7827, + "step": 14838 + }, + { + "epoch": 0.8167207881556497, + "grad_norm": 0.7590689063072205, + "learning_rate": 6.444240499946357e-06, + "loss": 0.7177, + "step": 14839 + }, + { + "epoch": 0.8167758269580054, + "grad_norm": 0.6692266464233398, + "learning_rate": 6.4438255044432805e-06, + "loss": 0.6631, + "step": 14840 + }, + { + "epoch": 0.816830865760361, + "grad_norm": 0.695164144039154, + "learning_rate": 6.443410498088798e-06, + "loss": 0.6953, + "step": 14841 + }, + { + "epoch": 0.8168859045627167, + "grad_norm": 0.6503697037696838, + "learning_rate": 6.442995480886028e-06, + "loss": 0.7868, + "step": 14842 + }, + { + "epoch": 0.8169409433650724, + "grad_norm": 0.6943323016166687, + "learning_rate": 6.442580452838091e-06, + "loss": 0.7464, + "step": 14843 + }, + { + "epoch": 0.8169959821674281, + "grad_norm": 0.7510622143745422, + "learning_rate": 6.442165413948105e-06, + "loss": 0.7984, + "step": 14844 + }, + { + "epoch": 0.8170510209697837, + "grad_norm": 0.6322263479232788, + "learning_rate": 6.441750364219189e-06, + "loss": 0.7693, + "step": 14845 + }, + { + "epoch": 0.8171060597721393, + "grad_norm": 0.681967556476593, + "learning_rate": 6.4413353036544646e-06, + "loss": 0.6781, + "step": 14846 + }, + { + "epoch": 0.817161098574495, + "grad_norm": 0.6799043416976929, + "learning_rate": 6.440920232257049e-06, + "loss": 0.7791, + "step": 14847 + }, + { + "epoch": 0.8172161373768507, + "grad_norm": 0.673652172088623, + "learning_rate": 6.440505150030064e-06, + "loss": 0.7099, + "step": 14848 + }, + { + "epoch": 0.8172711761792063, + "grad_norm": 0.755377471446991, + "learning_rate": 6.4400900569766255e-06, + "loss": 0.7292, + "step": 14849 + }, + { + "epoch": 0.817326214981562, + "grad_norm": 0.6099830269813538, + "learning_rate": 6.439674953099857e-06, + "loss": 0.7154, + "step": 14850 + }, + { + "epoch": 0.8173812537839177, + "grad_norm": 0.6330500841140747, + "learning_rate": 6.439259838402878e-06, + "loss": 0.6858, + "step": 14851 + }, + { + "epoch": 0.8174362925862734, + "grad_norm": 0.6727203726768494, + "learning_rate": 6.438844712888806e-06, + "loss": 0.7089, + "step": 14852 + }, + { + "epoch": 0.817491331388629, + "grad_norm": 0.7482651472091675, + "learning_rate": 6.438429576560763e-06, + "loss": 0.7065, + "step": 14853 + }, + { + "epoch": 0.8175463701909846, + "grad_norm": 0.6786343455314636, + "learning_rate": 6.438014429421868e-06, + "loss": 0.7049, + "step": 14854 + }, + { + "epoch": 0.8176014089933403, + "grad_norm": 0.6155980825424194, + "learning_rate": 6.437599271475241e-06, + "loss": 0.607, + "step": 14855 + }, + { + "epoch": 0.817656447795696, + "grad_norm": 0.6551154851913452, + "learning_rate": 6.437184102724003e-06, + "loss": 0.7022, + "step": 14856 + }, + { + "epoch": 0.8177114865980516, + "grad_norm": 0.6127358078956604, + "learning_rate": 6.436768923171273e-06, + "loss": 0.6827, + "step": 14857 + }, + { + "epoch": 0.8177665254004073, + "grad_norm": 0.6470245718955994, + "learning_rate": 6.436353732820175e-06, + "loss": 0.6877, + "step": 14858 + }, + { + "epoch": 0.817821564202763, + "grad_norm": 0.704667866230011, + "learning_rate": 6.435938531673825e-06, + "loss": 0.7223, + "step": 14859 + }, + { + "epoch": 0.8178766030051187, + "grad_norm": 0.6328873634338379, + "learning_rate": 6.435523319735345e-06, + "loss": 0.7181, + "step": 14860 + }, + { + "epoch": 0.8179316418074742, + "grad_norm": 0.6489065885543823, + "learning_rate": 6.435108097007856e-06, + "loss": 0.7597, + "step": 14861 + }, + { + "epoch": 0.8179866806098299, + "grad_norm": 0.6398639678955078, + "learning_rate": 6.43469286349448e-06, + "loss": 0.667, + "step": 14862 + }, + { + "epoch": 0.8180417194121856, + "grad_norm": 0.7615578770637512, + "learning_rate": 6.434277619198335e-06, + "loss": 0.8474, + "step": 14863 + }, + { + "epoch": 0.8180967582145412, + "grad_norm": 0.8604047894477844, + "learning_rate": 6.433862364122545e-06, + "loss": 0.7977, + "step": 14864 + }, + { + "epoch": 0.8181517970168969, + "grad_norm": 0.6157855987548828, + "learning_rate": 6.433447098270228e-06, + "loss": 0.6513, + "step": 14865 + }, + { + "epoch": 0.8182068358192526, + "grad_norm": 0.7052211761474609, + "learning_rate": 6.433031821644507e-06, + "loss": 0.7043, + "step": 14866 + }, + { + "epoch": 0.8182618746216083, + "grad_norm": 0.785987138748169, + "learning_rate": 6.432616534248503e-06, + "loss": 0.8722, + "step": 14867 + }, + { + "epoch": 0.8183169134239638, + "grad_norm": 0.7711461782455444, + "learning_rate": 6.432201236085336e-06, + "loss": 0.68, + "step": 14868 + }, + { + "epoch": 0.8183719522263195, + "grad_norm": 0.6299784183502197, + "learning_rate": 6.431785927158126e-06, + "loss": 0.7397, + "step": 14869 + }, + { + "epoch": 0.8184269910286752, + "grad_norm": 0.6292238235473633, + "learning_rate": 6.431370607469998e-06, + "loss": 0.7392, + "step": 14870 + }, + { + "epoch": 0.8184820298310309, + "grad_norm": 0.8696228861808777, + "learning_rate": 6.430955277024071e-06, + "loss": 0.884, + "step": 14871 + }, + { + "epoch": 0.8185370686333865, + "grad_norm": 0.6754364967346191, + "learning_rate": 6.430539935823469e-06, + "loss": 0.7122, + "step": 14872 + }, + { + "epoch": 0.8185921074357422, + "grad_norm": 0.6936547160148621, + "learning_rate": 6.4301245838713085e-06, + "loss": 0.7353, + "step": 14873 + }, + { + "epoch": 0.8186471462380979, + "grad_norm": 0.8840705156326294, + "learning_rate": 6.429709221170717e-06, + "loss": 0.7043, + "step": 14874 + }, + { + "epoch": 0.8187021850404536, + "grad_norm": 0.7349988222122192, + "learning_rate": 6.4292938477248135e-06, + "loss": 0.7861, + "step": 14875 + }, + { + "epoch": 0.8187572238428091, + "grad_norm": 0.697790801525116, + "learning_rate": 6.428878463536721e-06, + "loss": 0.8021, + "step": 14876 + }, + { + "epoch": 0.8188122626451648, + "grad_norm": 0.7873979806900024, + "learning_rate": 6.428463068609559e-06, + "loss": 0.7313, + "step": 14877 + }, + { + "epoch": 0.8188673014475205, + "grad_norm": 0.6542018055915833, + "learning_rate": 6.4280476629464505e-06, + "loss": 0.7811, + "step": 14878 + }, + { + "epoch": 0.8189223402498762, + "grad_norm": 0.7477063536643982, + "learning_rate": 6.427632246550519e-06, + "loss": 0.764, + "step": 14879 + }, + { + "epoch": 0.8189773790522318, + "grad_norm": 0.6456438302993774, + "learning_rate": 6.4272168194248855e-06, + "loss": 0.7517, + "step": 14880 + }, + { + "epoch": 0.8190324178545875, + "grad_norm": 0.699684202671051, + "learning_rate": 6.426801381572671e-06, + "loss": 0.7963, + "step": 14881 + }, + { + "epoch": 0.8190874566569432, + "grad_norm": 0.9158867001533508, + "learning_rate": 6.426385932997001e-06, + "loss": 0.8782, + "step": 14882 + }, + { + "epoch": 0.8191424954592988, + "grad_norm": 0.5998190641403198, + "learning_rate": 6.425970473700995e-06, + "loss": 0.6598, + "step": 14883 + }, + { + "epoch": 0.8191975342616544, + "grad_norm": 0.6674730777740479, + "learning_rate": 6.4255550036877775e-06, + "loss": 0.7232, + "step": 14884 + }, + { + "epoch": 0.8192525730640101, + "grad_norm": 0.6303582191467285, + "learning_rate": 6.42513952296047e-06, + "loss": 0.7614, + "step": 14885 + }, + { + "epoch": 0.8193076118663658, + "grad_norm": 0.6255910992622375, + "learning_rate": 6.424724031522195e-06, + "loss": 0.7052, + "step": 14886 + }, + { + "epoch": 0.8193626506687215, + "grad_norm": 0.6610854268074036, + "learning_rate": 6.424308529376075e-06, + "loss": 0.7403, + "step": 14887 + }, + { + "epoch": 0.8194176894710771, + "grad_norm": 0.6758664846420288, + "learning_rate": 6.4238930165252355e-06, + "loss": 0.7603, + "step": 14888 + }, + { + "epoch": 0.8194727282734328, + "grad_norm": 0.6897797584533691, + "learning_rate": 6.423477492972796e-06, + "loss": 0.7194, + "step": 14889 + }, + { + "epoch": 0.8195277670757884, + "grad_norm": 0.7007622718811035, + "learning_rate": 6.42306195872188e-06, + "loss": 0.7905, + "step": 14890 + }, + { + "epoch": 0.8195828058781441, + "grad_norm": 0.7482092976570129, + "learning_rate": 6.422646413775613e-06, + "loss": 0.7809, + "step": 14891 + }, + { + "epoch": 0.8196378446804997, + "grad_norm": 0.9551613926887512, + "learning_rate": 6.422230858137115e-06, + "loss": 0.8559, + "step": 14892 + }, + { + "epoch": 0.8196928834828554, + "grad_norm": 0.6831939220428467, + "learning_rate": 6.42181529180951e-06, + "loss": 0.7867, + "step": 14893 + }, + { + "epoch": 0.8197479222852111, + "grad_norm": 1.446377158164978, + "learning_rate": 6.421399714795923e-06, + "loss": 0.8745, + "step": 14894 + }, + { + "epoch": 0.8198029610875668, + "grad_norm": 0.6738638877868652, + "learning_rate": 6.420984127099475e-06, + "loss": 0.727, + "step": 14895 + }, + { + "epoch": 0.8198579998899224, + "grad_norm": 0.7388872504234314, + "learning_rate": 6.420568528723292e-06, + "loss": 0.7041, + "step": 14896 + }, + { + "epoch": 0.819913038692278, + "grad_norm": 0.6977630853652954, + "learning_rate": 6.420152919670495e-06, + "loss": 0.7944, + "step": 14897 + }, + { + "epoch": 0.8199680774946337, + "grad_norm": 0.6300190091133118, + "learning_rate": 6.41973729994421e-06, + "loss": 0.6879, + "step": 14898 + }, + { + "epoch": 0.8200231162969894, + "grad_norm": 0.6350599527359009, + "learning_rate": 6.419321669547559e-06, + "loss": 0.6725, + "step": 14899 + }, + { + "epoch": 0.820078155099345, + "grad_norm": 0.8604453206062317, + "learning_rate": 6.418906028483667e-06, + "loss": 0.7706, + "step": 14900 + }, + { + "epoch": 0.8201331939017007, + "grad_norm": 0.6574103236198425, + "learning_rate": 6.418490376755656e-06, + "loss": 0.7008, + "step": 14901 + }, + { + "epoch": 0.8201882327040564, + "grad_norm": 0.706132173538208, + "learning_rate": 6.418074714366651e-06, + "loss": 0.7608, + "step": 14902 + }, + { + "epoch": 0.8202432715064121, + "grad_norm": 1.155480146408081, + "learning_rate": 6.417659041319777e-06, + "loss": 0.6893, + "step": 14903 + }, + { + "epoch": 0.8202983103087677, + "grad_norm": 0.8497835397720337, + "learning_rate": 6.417243357618157e-06, + "loss": 0.6889, + "step": 14904 + }, + { + "epoch": 0.8203533491111233, + "grad_norm": 0.9319966435432434, + "learning_rate": 6.416827663264915e-06, + "loss": 0.8098, + "step": 14905 + }, + { + "epoch": 0.820408387913479, + "grad_norm": 0.744888186454773, + "learning_rate": 6.4164119582631745e-06, + "loss": 0.7871, + "step": 14906 + }, + { + "epoch": 0.8204634267158346, + "grad_norm": 0.6928347945213318, + "learning_rate": 6.415996242616063e-06, + "loss": 0.7693, + "step": 14907 + }, + { + "epoch": 0.8205184655181903, + "grad_norm": 0.7455456852912903, + "learning_rate": 6.415580516326701e-06, + "loss": 0.6475, + "step": 14908 + }, + { + "epoch": 0.820573504320546, + "grad_norm": 0.6823583245277405, + "learning_rate": 6.415164779398215e-06, + "loss": 0.7223, + "step": 14909 + }, + { + "epoch": 0.8206285431229017, + "grad_norm": 0.6989970207214355, + "learning_rate": 6.414749031833729e-06, + "loss": 0.8203, + "step": 14910 + }, + { + "epoch": 0.8206835819252573, + "grad_norm": 0.6026825308799744, + "learning_rate": 6.414333273636369e-06, + "loss": 0.6307, + "step": 14911 + }, + { + "epoch": 0.8207386207276129, + "grad_norm": 0.6102367639541626, + "learning_rate": 6.413917504809258e-06, + "loss": 0.7049, + "step": 14912 + }, + { + "epoch": 0.8207936595299686, + "grad_norm": 0.6658119559288025, + "learning_rate": 6.4135017253555225e-06, + "loss": 0.7541, + "step": 14913 + }, + { + "epoch": 0.8208486983323243, + "grad_norm": 0.7272284626960754, + "learning_rate": 6.413085935278286e-06, + "loss": 0.7581, + "step": 14914 + }, + { + "epoch": 0.8209037371346799, + "grad_norm": 0.7826990485191345, + "learning_rate": 6.412670134580674e-06, + "loss": 0.8121, + "step": 14915 + }, + { + "epoch": 0.8209587759370356, + "grad_norm": 0.5845723748207092, + "learning_rate": 6.412254323265811e-06, + "loss": 0.5921, + "step": 14916 + }, + { + "epoch": 0.8210138147393913, + "grad_norm": 0.655577540397644, + "learning_rate": 6.411838501336823e-06, + "loss": 0.7694, + "step": 14917 + }, + { + "epoch": 0.821068853541747, + "grad_norm": 0.6722497940063477, + "learning_rate": 6.4114226687968325e-06, + "loss": 0.6377, + "step": 14918 + }, + { + "epoch": 0.8211238923441025, + "grad_norm": 0.713169276714325, + "learning_rate": 6.41100682564897e-06, + "loss": 0.7328, + "step": 14919 + }, + { + "epoch": 0.8211789311464582, + "grad_norm": 0.6004113554954529, + "learning_rate": 6.410590971896357e-06, + "loss": 0.6564, + "step": 14920 + }, + { + "epoch": 0.8212339699488139, + "grad_norm": 0.6541520953178406, + "learning_rate": 6.410175107542119e-06, + "loss": 0.7063, + "step": 14921 + }, + { + "epoch": 0.8212890087511696, + "grad_norm": 0.7937784194946289, + "learning_rate": 6.409759232589383e-06, + "loss": 0.7516, + "step": 14922 + }, + { + "epoch": 0.8213440475535252, + "grad_norm": 0.7017408013343811, + "learning_rate": 6.409343347041274e-06, + "loss": 0.6846, + "step": 14923 + }, + { + "epoch": 0.8213990863558809, + "grad_norm": 0.6233413815498352, + "learning_rate": 6.408927450900917e-06, + "loss": 0.6655, + "step": 14924 + }, + { + "epoch": 0.8214541251582366, + "grad_norm": 0.93160480260849, + "learning_rate": 6.4085115441714396e-06, + "loss": 0.7461, + "step": 14925 + }, + { + "epoch": 0.8215091639605923, + "grad_norm": 0.6075658202171326, + "learning_rate": 6.4080956268559655e-06, + "loss": 0.705, + "step": 14926 + }, + { + "epoch": 0.8215642027629478, + "grad_norm": 0.6212051510810852, + "learning_rate": 6.407679698957623e-06, + "loss": 0.6943, + "step": 14927 + }, + { + "epoch": 0.8216192415653035, + "grad_norm": 0.8143971562385559, + "learning_rate": 6.407263760479536e-06, + "loss": 0.6918, + "step": 14928 + }, + { + "epoch": 0.8216742803676592, + "grad_norm": 0.6851963996887207, + "learning_rate": 6.406847811424831e-06, + "loss": 0.7849, + "step": 14929 + }, + { + "epoch": 0.8217293191700149, + "grad_norm": 0.7047909498214722, + "learning_rate": 6.406431851796633e-06, + "loss": 0.7364, + "step": 14930 + }, + { + "epoch": 0.8217843579723705, + "grad_norm": 0.7377674579620361, + "learning_rate": 6.406015881598071e-06, + "loss": 0.7413, + "step": 14931 + }, + { + "epoch": 0.8218393967747262, + "grad_norm": 0.7188243269920349, + "learning_rate": 6.405599900832271e-06, + "loss": 0.8051, + "step": 14932 + }, + { + "epoch": 0.8218944355770819, + "grad_norm": 0.7588842511177063, + "learning_rate": 6.4051839095023575e-06, + "loss": 0.7687, + "step": 14933 + }, + { + "epoch": 0.8219494743794376, + "grad_norm": 0.6396436095237732, + "learning_rate": 6.404767907611457e-06, + "loss": 0.7516, + "step": 14934 + }, + { + "epoch": 0.8220045131817931, + "grad_norm": 0.6896073818206787, + "learning_rate": 6.404351895162698e-06, + "loss": 0.7904, + "step": 14935 + }, + { + "epoch": 0.8220595519841488, + "grad_norm": 0.7475640773773193, + "learning_rate": 6.403935872159206e-06, + "loss": 0.8325, + "step": 14936 + }, + { + "epoch": 0.8221145907865045, + "grad_norm": 0.6456442475318909, + "learning_rate": 6.403519838604107e-06, + "loss": 0.7685, + "step": 14937 + }, + { + "epoch": 0.8221696295888602, + "grad_norm": 0.6446966528892517, + "learning_rate": 6.40310379450053e-06, + "loss": 0.731, + "step": 14938 + }, + { + "epoch": 0.8222246683912158, + "grad_norm": 0.7744176983833313, + "learning_rate": 6.4026877398515995e-06, + "loss": 0.7975, + "step": 14939 + }, + { + "epoch": 0.8222797071935715, + "grad_norm": 0.6441214680671692, + "learning_rate": 6.402271674660444e-06, + "loss": 0.7386, + "step": 14940 + }, + { + "epoch": 0.8223347459959272, + "grad_norm": 0.6788361072540283, + "learning_rate": 6.40185559893019e-06, + "loss": 0.7664, + "step": 14941 + }, + { + "epoch": 0.8223897847982828, + "grad_norm": 0.6565073132514954, + "learning_rate": 6.4014395126639624e-06, + "loss": 0.6716, + "step": 14942 + }, + { + "epoch": 0.8224448236006384, + "grad_norm": 0.6475300788879395, + "learning_rate": 6.401023415864893e-06, + "loss": 0.6887, + "step": 14943 + }, + { + "epoch": 0.8224998624029941, + "grad_norm": 0.7058338522911072, + "learning_rate": 6.400607308536107e-06, + "loss": 0.7248, + "step": 14944 + }, + { + "epoch": 0.8225549012053498, + "grad_norm": 0.7184485197067261, + "learning_rate": 6.4001911906807305e-06, + "loss": 0.693, + "step": 14945 + }, + { + "epoch": 0.8226099400077055, + "grad_norm": 0.6280504465103149, + "learning_rate": 6.399775062301891e-06, + "loss": 0.6776, + "step": 14946 + }, + { + "epoch": 0.8226649788100611, + "grad_norm": 0.6995168328285217, + "learning_rate": 6.399358923402716e-06, + "loss": 0.7536, + "step": 14947 + }, + { + "epoch": 0.8227200176124168, + "grad_norm": 0.7770118713378906, + "learning_rate": 6.398942773986337e-06, + "loss": 0.6966, + "step": 14948 + }, + { + "epoch": 0.8227750564147724, + "grad_norm": 0.6947488188743591, + "learning_rate": 6.398526614055876e-06, + "loss": 0.7317, + "step": 14949 + }, + { + "epoch": 0.822830095217128, + "grad_norm": 0.7234527468681335, + "learning_rate": 6.3981104436144645e-06, + "loss": 0.6495, + "step": 14950 + }, + { + "epoch": 0.8228851340194837, + "grad_norm": 0.6872434020042419, + "learning_rate": 6.3976942626652295e-06, + "loss": 0.651, + "step": 14951 + }, + { + "epoch": 0.8229401728218394, + "grad_norm": 0.6762012243270874, + "learning_rate": 6.397278071211298e-06, + "loss": 0.7115, + "step": 14952 + }, + { + "epoch": 0.8229952116241951, + "grad_norm": 0.7007278800010681, + "learning_rate": 6.396861869255799e-06, + "loss": 0.717, + "step": 14953 + }, + { + "epoch": 0.8230502504265507, + "grad_norm": 0.7403082251548767, + "learning_rate": 6.396445656801859e-06, + "loss": 0.846, + "step": 14954 + }, + { + "epoch": 0.8231052892289064, + "grad_norm": 0.688758373260498, + "learning_rate": 6.396029433852609e-06, + "loss": 0.7871, + "step": 14955 + }, + { + "epoch": 0.823160328031262, + "grad_norm": 0.7264360189437866, + "learning_rate": 6.395613200411173e-06, + "loss": 0.7803, + "step": 14956 + }, + { + "epoch": 0.8232153668336177, + "grad_norm": 0.6858585476875305, + "learning_rate": 6.395196956480683e-06, + "loss": 0.6595, + "step": 14957 + }, + { + "epoch": 0.8232704056359733, + "grad_norm": 0.7834211587905884, + "learning_rate": 6.394780702064266e-06, + "loss": 0.7689, + "step": 14958 + }, + { + "epoch": 0.823325444438329, + "grad_norm": 0.6933274865150452, + "learning_rate": 6.394364437165052e-06, + "loss": 0.758, + "step": 14959 + }, + { + "epoch": 0.8233804832406847, + "grad_norm": 0.7490070462226868, + "learning_rate": 6.3939481617861664e-06, + "loss": 0.8106, + "step": 14960 + }, + { + "epoch": 0.8234355220430404, + "grad_norm": 0.5586501955986023, + "learning_rate": 6.3935318759307405e-06, + "loss": 0.6207, + "step": 14961 + }, + { + "epoch": 0.823490560845396, + "grad_norm": 0.6999693512916565, + "learning_rate": 6.393115579601902e-06, + "loss": 0.7787, + "step": 14962 + }, + { + "epoch": 0.8235455996477516, + "grad_norm": 1.0214177370071411, + "learning_rate": 6.392699272802779e-06, + "loss": 0.6444, + "step": 14963 + }, + { + "epoch": 0.8236006384501073, + "grad_norm": 0.7808836698532104, + "learning_rate": 6.392282955536502e-06, + "loss": 0.7537, + "step": 14964 + }, + { + "epoch": 0.823655677252463, + "grad_norm": 0.6825253963470459, + "learning_rate": 6.391866627806198e-06, + "loss": 0.7346, + "step": 14965 + }, + { + "epoch": 0.8237107160548186, + "grad_norm": 0.6105558276176453, + "learning_rate": 6.391450289614998e-06, + "loss": 0.6631, + "step": 14966 + }, + { + "epoch": 0.8237657548571743, + "grad_norm": 0.721986711025238, + "learning_rate": 6.391033940966029e-06, + "loss": 0.8638, + "step": 14967 + }, + { + "epoch": 0.82382079365953, + "grad_norm": 0.6226428747177124, + "learning_rate": 6.390617581862421e-06, + "loss": 0.7291, + "step": 14968 + }, + { + "epoch": 0.8238758324618857, + "grad_norm": 0.7403777241706848, + "learning_rate": 6.390201212307305e-06, + "loss": 0.7417, + "step": 14969 + }, + { + "epoch": 0.8239308712642412, + "grad_norm": 0.7188371419906616, + "learning_rate": 6.389784832303808e-06, + "loss": 0.757, + "step": 14970 + }, + { + "epoch": 0.8239859100665969, + "grad_norm": 0.8741163611412048, + "learning_rate": 6.389368441855061e-06, + "loss": 0.7264, + "step": 14971 + }, + { + "epoch": 0.8240409488689526, + "grad_norm": 0.7092788219451904, + "learning_rate": 6.388952040964192e-06, + "loss": 0.731, + "step": 14972 + }, + { + "epoch": 0.8240959876713083, + "grad_norm": 0.9291765689849854, + "learning_rate": 6.388535629634331e-06, + "loss": 0.7964, + "step": 14973 + }, + { + "epoch": 0.8241510264736639, + "grad_norm": 0.6140535473823547, + "learning_rate": 6.388119207868608e-06, + "loss": 0.7099, + "step": 14974 + }, + { + "epoch": 0.8242060652760196, + "grad_norm": 0.654778778553009, + "learning_rate": 6.387702775670154e-06, + "loss": 0.6667, + "step": 14975 + }, + { + "epoch": 0.8242611040783753, + "grad_norm": 0.7221185564994812, + "learning_rate": 6.387286333042095e-06, + "loss": 0.7533, + "step": 14976 + }, + { + "epoch": 0.824316142880731, + "grad_norm": 0.6680133938789368, + "learning_rate": 6.386869879987565e-06, + "loss": 0.6404, + "step": 14977 + }, + { + "epoch": 0.8243711816830865, + "grad_norm": 0.7067292928695679, + "learning_rate": 6.386453416509691e-06, + "loss": 0.8493, + "step": 14978 + }, + { + "epoch": 0.8244262204854422, + "grad_norm": 0.6279785633087158, + "learning_rate": 6.386036942611605e-06, + "loss": 0.7465, + "step": 14979 + }, + { + "epoch": 0.8244812592877979, + "grad_norm": 0.7184332013130188, + "learning_rate": 6.385620458296438e-06, + "loss": 0.738, + "step": 14980 + }, + { + "epoch": 0.8245362980901536, + "grad_norm": 0.7318315505981445, + "learning_rate": 6.385203963567316e-06, + "loss": 0.7409, + "step": 14981 + }, + { + "epoch": 0.8245913368925092, + "grad_norm": 0.6848355531692505, + "learning_rate": 6.384787458427372e-06, + "loss": 0.7343, + "step": 14982 + }, + { + "epoch": 0.8246463756948649, + "grad_norm": 0.7097738981246948, + "learning_rate": 6.384370942879736e-06, + "loss": 0.817, + "step": 14983 + }, + { + "epoch": 0.8247014144972206, + "grad_norm": 0.6933857798576355, + "learning_rate": 6.38395441692754e-06, + "loss": 0.7356, + "step": 14984 + }, + { + "epoch": 0.8247564532995763, + "grad_norm": 0.6631865501403809, + "learning_rate": 6.383537880573913e-06, + "loss": 0.752, + "step": 14985 + }, + { + "epoch": 0.8248114921019318, + "grad_norm": 0.6564633846282959, + "learning_rate": 6.3831213338219855e-06, + "loss": 0.7755, + "step": 14986 + }, + { + "epoch": 0.8248665309042875, + "grad_norm": 0.6518037915229797, + "learning_rate": 6.382704776674887e-06, + "loss": 0.7185, + "step": 14987 + }, + { + "epoch": 0.8249215697066432, + "grad_norm": 0.7074370384216309, + "learning_rate": 6.382288209135752e-06, + "loss": 0.7632, + "step": 14988 + }, + { + "epoch": 0.8249766085089989, + "grad_norm": 0.7034205198287964, + "learning_rate": 6.381871631207707e-06, + "loss": 0.8234, + "step": 14989 + }, + { + "epoch": 0.8250316473113545, + "grad_norm": 0.7635502815246582, + "learning_rate": 6.381455042893884e-06, + "loss": 0.7847, + "step": 14990 + }, + { + "epoch": 0.8250866861137102, + "grad_norm": 0.7682950496673584, + "learning_rate": 6.381038444197416e-06, + "loss": 0.6815, + "step": 14991 + }, + { + "epoch": 0.8251417249160659, + "grad_norm": 0.7713856101036072, + "learning_rate": 6.380621835121432e-06, + "loss": 0.7437, + "step": 14992 + }, + { + "epoch": 0.8251967637184214, + "grad_norm": 0.7955800294876099, + "learning_rate": 6.380205215669064e-06, + "loss": 0.876, + "step": 14993 + }, + { + "epoch": 0.8252518025207771, + "grad_norm": 0.6979825496673584, + "learning_rate": 6.379788585843443e-06, + "loss": 0.7018, + "step": 14994 + }, + { + "epoch": 0.8253068413231328, + "grad_norm": 0.6413466930389404, + "learning_rate": 6.379371945647701e-06, + "loss": 0.7345, + "step": 14995 + }, + { + "epoch": 0.8253618801254885, + "grad_norm": 0.6284430027008057, + "learning_rate": 6.378955295084968e-06, + "loss": 0.6758, + "step": 14996 + }, + { + "epoch": 0.8254169189278441, + "grad_norm": 0.5943842530250549, + "learning_rate": 6.378538634158377e-06, + "loss": 0.6572, + "step": 14997 + }, + { + "epoch": 0.8254719577301998, + "grad_norm": 0.7123218774795532, + "learning_rate": 6.378121962871058e-06, + "loss": 0.6993, + "step": 14998 + }, + { + "epoch": 0.8255269965325555, + "grad_norm": 0.6608574390411377, + "learning_rate": 6.377705281226143e-06, + "loss": 0.7802, + "step": 14999 + }, + { + "epoch": 0.8255820353349111, + "grad_norm": 0.6387534141540527, + "learning_rate": 6.377288589226764e-06, + "loss": 0.6572, + "step": 15000 + }, + { + "epoch": 0.8256370741372667, + "grad_norm": 0.6593596935272217, + "learning_rate": 6.376871886876054e-06, + "loss": 0.665, + "step": 15001 + }, + { + "epoch": 0.8256921129396224, + "grad_norm": 0.7146610617637634, + "learning_rate": 6.376455174177141e-06, + "loss": 0.7278, + "step": 15002 + }, + { + "epoch": 0.8257471517419781, + "grad_norm": 0.6776326298713684, + "learning_rate": 6.376038451133161e-06, + "loss": 0.7679, + "step": 15003 + }, + { + "epoch": 0.8258021905443338, + "grad_norm": 0.7008724808692932, + "learning_rate": 6.375621717747244e-06, + "loss": 0.8749, + "step": 15004 + }, + { + "epoch": 0.8258572293466894, + "grad_norm": 0.6809947490692139, + "learning_rate": 6.375204974022522e-06, + "loss": 0.7248, + "step": 15005 + }, + { + "epoch": 0.8259122681490451, + "grad_norm": 0.6921886205673218, + "learning_rate": 6.374788219962127e-06, + "loss": 0.6685, + "step": 15006 + }, + { + "epoch": 0.8259673069514007, + "grad_norm": 0.6471500396728516, + "learning_rate": 6.374371455569192e-06, + "loss": 0.6856, + "step": 15007 + }, + { + "epoch": 0.8260223457537564, + "grad_norm": 0.673425555229187, + "learning_rate": 6.373954680846851e-06, + "loss": 0.7006, + "step": 15008 + }, + { + "epoch": 0.826077384556112, + "grad_norm": 0.710217297077179, + "learning_rate": 6.373537895798233e-06, + "loss": 0.7315, + "step": 15009 + }, + { + "epoch": 0.8261324233584677, + "grad_norm": 0.692030668258667, + "learning_rate": 6.3731211004264725e-06, + "loss": 0.6534, + "step": 15010 + }, + { + "epoch": 0.8261874621608234, + "grad_norm": 0.6370778679847717, + "learning_rate": 6.372704294734701e-06, + "loss": 0.7278, + "step": 15011 + }, + { + "epoch": 0.8262425009631791, + "grad_norm": 0.6571012139320374, + "learning_rate": 6.372287478726052e-06, + "loss": 0.6889, + "step": 15012 + }, + { + "epoch": 0.8262975397655347, + "grad_norm": 0.721810519695282, + "learning_rate": 6.371870652403657e-06, + "loss": 0.8572, + "step": 15013 + }, + { + "epoch": 0.8263525785678904, + "grad_norm": 0.6751163601875305, + "learning_rate": 6.371453815770647e-06, + "loss": 0.7646, + "step": 15014 + }, + { + "epoch": 0.826407617370246, + "grad_norm": 0.724319338798523, + "learning_rate": 6.371036968830161e-06, + "loss": 0.8433, + "step": 15015 + }, + { + "epoch": 0.8264626561726017, + "grad_norm": 0.6961913108825684, + "learning_rate": 6.370620111585326e-06, + "loss": 0.7069, + "step": 15016 + }, + { + "epoch": 0.8265176949749573, + "grad_norm": 0.649428129196167, + "learning_rate": 6.370203244039279e-06, + "loss": 0.7286, + "step": 15017 + }, + { + "epoch": 0.826572733777313, + "grad_norm": 0.6468552947044373, + "learning_rate": 6.369786366195149e-06, + "loss": 0.7006, + "step": 15018 + }, + { + "epoch": 0.8266277725796687, + "grad_norm": 0.6564732789993286, + "learning_rate": 6.369369478056072e-06, + "loss": 0.727, + "step": 15019 + }, + { + "epoch": 0.8266828113820244, + "grad_norm": 0.6573188900947571, + "learning_rate": 6.36895257962518e-06, + "loss": 0.6603, + "step": 15020 + }, + { + "epoch": 0.82673785018438, + "grad_norm": 0.747164785861969, + "learning_rate": 6.368535670905609e-06, + "loss": 0.7426, + "step": 15021 + }, + { + "epoch": 0.8267928889867356, + "grad_norm": 0.6366723775863647, + "learning_rate": 6.368118751900489e-06, + "loss": 0.6487, + "step": 15022 + }, + { + "epoch": 0.8268479277890913, + "grad_norm": 0.6517844200134277, + "learning_rate": 6.367701822612955e-06, + "loss": 0.7131, + "step": 15023 + }, + { + "epoch": 0.826902966591447, + "grad_norm": 0.774309515953064, + "learning_rate": 6.367284883046141e-06, + "loss": 0.7978, + "step": 15024 + }, + { + "epoch": 0.8269580053938026, + "grad_norm": 0.6302667856216431, + "learning_rate": 6.366867933203178e-06, + "loss": 0.7403, + "step": 15025 + }, + { + "epoch": 0.8270130441961583, + "grad_norm": 0.6881224513053894, + "learning_rate": 6.366450973087202e-06, + "loss": 0.7884, + "step": 15026 + }, + { + "epoch": 0.827068082998514, + "grad_norm": 0.6901270747184753, + "learning_rate": 6.366034002701346e-06, + "loss": 0.6596, + "step": 15027 + }, + { + "epoch": 0.8271231218008697, + "grad_norm": 0.7436091303825378, + "learning_rate": 6.365617022048745e-06, + "loss": 0.8141, + "step": 15028 + }, + { + "epoch": 0.8271781606032252, + "grad_norm": 0.6745834350585938, + "learning_rate": 6.365200031132531e-06, + "loss": 0.7738, + "step": 15029 + }, + { + "epoch": 0.8272331994055809, + "grad_norm": 0.6963297724723816, + "learning_rate": 6.364783029955839e-06, + "loss": 0.8649, + "step": 15030 + }, + { + "epoch": 0.8272882382079366, + "grad_norm": 0.6468135714530945, + "learning_rate": 6.364366018521803e-06, + "loss": 0.7403, + "step": 15031 + }, + { + "epoch": 0.8273432770102923, + "grad_norm": 0.6481515169143677, + "learning_rate": 6.363948996833559e-06, + "loss": 0.6268, + "step": 15032 + }, + { + "epoch": 0.8273983158126479, + "grad_norm": 0.6881366968154907, + "learning_rate": 6.3635319648942386e-06, + "loss": 0.6339, + "step": 15033 + }, + { + "epoch": 0.8274533546150036, + "grad_norm": 0.6858122944831848, + "learning_rate": 6.363114922706977e-06, + "loss": 0.7685, + "step": 15034 + }, + { + "epoch": 0.8275083934173593, + "grad_norm": 0.6630339026451111, + "learning_rate": 6.362697870274907e-06, + "loss": 0.7281, + "step": 15035 + }, + { + "epoch": 0.8275634322197148, + "grad_norm": 0.7198584079742432, + "learning_rate": 6.362280807601167e-06, + "loss": 0.7726, + "step": 15036 + }, + { + "epoch": 0.8276184710220705, + "grad_norm": 0.721622884273529, + "learning_rate": 6.361863734688888e-06, + "loss": 0.6471, + "step": 15037 + }, + { + "epoch": 0.8276735098244262, + "grad_norm": 0.6032352447509766, + "learning_rate": 6.3614466515412055e-06, + "loss": 0.6684, + "step": 15038 + }, + { + "epoch": 0.8277285486267819, + "grad_norm": 0.7568576335906982, + "learning_rate": 6.3610295581612535e-06, + "loss": 0.7089, + "step": 15039 + }, + { + "epoch": 0.8277835874291375, + "grad_norm": 0.7461723685264587, + "learning_rate": 6.360612454552168e-06, + "loss": 0.806, + "step": 15040 + }, + { + "epoch": 0.8278386262314932, + "grad_norm": 0.6606107354164124, + "learning_rate": 6.3601953407170855e-06, + "loss": 0.7276, + "step": 15041 + }, + { + "epoch": 0.8278936650338489, + "grad_norm": 0.7203792333602905, + "learning_rate": 6.3597782166591384e-06, + "loss": 0.844, + "step": 15042 + }, + { + "epoch": 0.8279487038362046, + "grad_norm": 0.7327194213867188, + "learning_rate": 6.35936108238146e-06, + "loss": 0.8289, + "step": 15043 + }, + { + "epoch": 0.8280037426385601, + "grad_norm": 0.6741734147071838, + "learning_rate": 6.358943937887189e-06, + "loss": 0.7022, + "step": 15044 + }, + { + "epoch": 0.8280587814409158, + "grad_norm": 0.795724630355835, + "learning_rate": 6.35852678317946e-06, + "loss": 0.7703, + "step": 15045 + }, + { + "epoch": 0.8281138202432715, + "grad_norm": 0.6476230621337891, + "learning_rate": 6.3581096182614055e-06, + "loss": 0.7471, + "step": 15046 + }, + { + "epoch": 0.8281688590456272, + "grad_norm": 0.658829391002655, + "learning_rate": 6.357692443136164e-06, + "loss": 0.7796, + "step": 15047 + }, + { + "epoch": 0.8282238978479828, + "grad_norm": 0.6755202412605286, + "learning_rate": 6.35727525780687e-06, + "loss": 0.8239, + "step": 15048 + }, + { + "epoch": 0.8282789366503385, + "grad_norm": 0.6518263220787048, + "learning_rate": 6.356858062276658e-06, + "loss": 0.7222, + "step": 15049 + }, + { + "epoch": 0.8283339754526942, + "grad_norm": 0.7006294131278992, + "learning_rate": 6.356440856548662e-06, + "loss": 0.7779, + "step": 15050 + }, + { + "epoch": 0.8283890142550498, + "grad_norm": 0.6771633625030518, + "learning_rate": 6.356023640626021e-06, + "loss": 0.7529, + "step": 15051 + }, + { + "epoch": 0.8284440530574054, + "grad_norm": 0.6893792152404785, + "learning_rate": 6.35560641451187e-06, + "loss": 0.834, + "step": 15052 + }, + { + "epoch": 0.8284990918597611, + "grad_norm": 0.7450309991836548, + "learning_rate": 6.355189178209343e-06, + "loss": 0.7017, + "step": 15053 + }, + { + "epoch": 0.8285541306621168, + "grad_norm": 0.7094436883926392, + "learning_rate": 6.3547719317215785e-06, + "loss": 0.7883, + "step": 15054 + }, + { + "epoch": 0.8286091694644725, + "grad_norm": 0.6926944255828857, + "learning_rate": 6.3543546750517085e-06, + "loss": 0.7309, + "step": 15055 + }, + { + "epoch": 0.8286642082668281, + "grad_norm": 0.7394436597824097, + "learning_rate": 6.3539374082028725e-06, + "loss": 0.8819, + "step": 15056 + }, + { + "epoch": 0.8287192470691838, + "grad_norm": 0.7663393616676331, + "learning_rate": 6.353520131178206e-06, + "loss": 0.7269, + "step": 15057 + }, + { + "epoch": 0.8287742858715395, + "grad_norm": 0.702627956867218, + "learning_rate": 6.353102843980844e-06, + "loss": 0.8205, + "step": 15058 + }, + { + "epoch": 0.8288293246738951, + "grad_norm": 0.6575393676757812, + "learning_rate": 6.352685546613924e-06, + "loss": 0.782, + "step": 15059 + }, + { + "epoch": 0.8288843634762507, + "grad_norm": 0.6844787001609802, + "learning_rate": 6.35226823908058e-06, + "loss": 0.7485, + "step": 15060 + }, + { + "epoch": 0.8289394022786064, + "grad_norm": 0.6018843054771423, + "learning_rate": 6.351850921383951e-06, + "loss": 0.6788, + "step": 15061 + }, + { + "epoch": 0.8289944410809621, + "grad_norm": 0.7418997883796692, + "learning_rate": 6.351433593527172e-06, + "loss": 0.6789, + "step": 15062 + }, + { + "epoch": 0.8290494798833178, + "grad_norm": 0.625535786151886, + "learning_rate": 6.351016255513379e-06, + "loss": 0.7405, + "step": 15063 + }, + { + "epoch": 0.8291045186856734, + "grad_norm": 0.678569495677948, + "learning_rate": 6.350598907345711e-06, + "loss": 0.7386, + "step": 15064 + }, + { + "epoch": 0.829159557488029, + "grad_norm": 0.8012919425964355, + "learning_rate": 6.350181549027302e-06, + "loss": 0.7703, + "step": 15065 + }, + { + "epoch": 0.8292145962903847, + "grad_norm": 0.6115431189537048, + "learning_rate": 6.3497641805612905e-06, + "loss": 0.7131, + "step": 15066 + }, + { + "epoch": 0.8292696350927404, + "grad_norm": 0.7392085194587708, + "learning_rate": 6.349346801950812e-06, + "loss": 0.7648, + "step": 15067 + }, + { + "epoch": 0.829324673895096, + "grad_norm": 0.597613513469696, + "learning_rate": 6.348929413199005e-06, + "loss": 0.6023, + "step": 15068 + }, + { + "epoch": 0.8293797126974517, + "grad_norm": 0.6418130397796631, + "learning_rate": 6.348512014309005e-06, + "loss": 0.7507, + "step": 15069 + }, + { + "epoch": 0.8294347514998074, + "grad_norm": 0.6351965665817261, + "learning_rate": 6.34809460528395e-06, + "loss": 0.722, + "step": 15070 + }, + { + "epoch": 0.8294897903021631, + "grad_norm": 0.6593570709228516, + "learning_rate": 6.347677186126977e-06, + "loss": 0.7032, + "step": 15071 + }, + { + "epoch": 0.8295448291045187, + "grad_norm": 0.8040562868118286, + "learning_rate": 6.3472597568412235e-06, + "loss": 0.6519, + "step": 15072 + }, + { + "epoch": 0.8295998679068743, + "grad_norm": 0.7043612599372864, + "learning_rate": 6.346842317429825e-06, + "loss": 0.7765, + "step": 15073 + }, + { + "epoch": 0.82965490670923, + "grad_norm": 0.6304612159729004, + "learning_rate": 6.346424867895922e-06, + "loss": 0.6763, + "step": 15074 + }, + { + "epoch": 0.8297099455115857, + "grad_norm": 0.6402591466903687, + "learning_rate": 6.346007408242647e-06, + "loss": 0.828, + "step": 15075 + }, + { + "epoch": 0.8297649843139413, + "grad_norm": 0.6908280849456787, + "learning_rate": 6.345589938473142e-06, + "loss": 0.855, + "step": 15076 + }, + { + "epoch": 0.829820023116297, + "grad_norm": 0.5829552412033081, + "learning_rate": 6.345172458590545e-06, + "loss": 0.6323, + "step": 15077 + }, + { + "epoch": 0.8298750619186527, + "grad_norm": 0.8221700191497803, + "learning_rate": 6.34475496859799e-06, + "loss": 0.7069, + "step": 15078 + }, + { + "epoch": 0.8299301007210083, + "grad_norm": 0.7065801024436951, + "learning_rate": 6.344337468498616e-06, + "loss": 0.692, + "step": 15079 + }, + { + "epoch": 0.829985139523364, + "grad_norm": 0.6199344396591187, + "learning_rate": 6.343919958295564e-06, + "loss": 0.682, + "step": 15080 + }, + { + "epoch": 0.8300401783257196, + "grad_norm": 0.8999378681182861, + "learning_rate": 6.343502437991968e-06, + "loss": 0.7924, + "step": 15081 + }, + { + "epoch": 0.8300952171280753, + "grad_norm": 0.639163076877594, + "learning_rate": 6.343084907590966e-06, + "loss": 0.6976, + "step": 15082 + }, + { + "epoch": 0.8301502559304309, + "grad_norm": 0.8266178965568542, + "learning_rate": 6.3426673670957e-06, + "loss": 0.6831, + "step": 15083 + }, + { + "epoch": 0.8302052947327866, + "grad_norm": 0.6245449781417847, + "learning_rate": 6.3422498165093034e-06, + "loss": 0.6917, + "step": 15084 + }, + { + "epoch": 0.8302603335351423, + "grad_norm": 0.7809823751449585, + "learning_rate": 6.341832255834918e-06, + "loss": 0.8424, + "step": 15085 + }, + { + "epoch": 0.830315372337498, + "grad_norm": 0.6803410053253174, + "learning_rate": 6.34141468507568e-06, + "loss": 0.8345, + "step": 15086 + }, + { + "epoch": 0.8303704111398535, + "grad_norm": 0.7445305585861206, + "learning_rate": 6.340997104234728e-06, + "loss": 0.8823, + "step": 15087 + }, + { + "epoch": 0.8304254499422092, + "grad_norm": 0.6992506384849548, + "learning_rate": 6.340579513315199e-06, + "loss": 0.7857, + "step": 15088 + }, + { + "epoch": 0.8304804887445649, + "grad_norm": 0.7050431966781616, + "learning_rate": 6.340161912320237e-06, + "loss": 0.7988, + "step": 15089 + }, + { + "epoch": 0.8305355275469206, + "grad_norm": 0.8718838095664978, + "learning_rate": 6.339744301252973e-06, + "loss": 0.9983, + "step": 15090 + }, + { + "epoch": 0.8305905663492762, + "grad_norm": 0.7317140698432922, + "learning_rate": 6.339326680116551e-06, + "loss": 0.6852, + "step": 15091 + }, + { + "epoch": 0.8306456051516319, + "grad_norm": 0.6975864768028259, + "learning_rate": 6.338909048914108e-06, + "loss": 0.7334, + "step": 15092 + }, + { + "epoch": 0.8307006439539876, + "grad_norm": 0.6615436673164368, + "learning_rate": 6.3384914076487834e-06, + "loss": 0.776, + "step": 15093 + }, + { + "epoch": 0.8307556827563433, + "grad_norm": 0.773273766040802, + "learning_rate": 6.338073756323717e-06, + "loss": 0.7868, + "step": 15094 + }, + { + "epoch": 0.8308107215586988, + "grad_norm": 0.6686182022094727, + "learning_rate": 6.337656094942045e-06, + "loss": 0.7487, + "step": 15095 + }, + { + "epoch": 0.8308657603610545, + "grad_norm": 0.8202255368232727, + "learning_rate": 6.337238423506909e-06, + "loss": 0.7748, + "step": 15096 + }, + { + "epoch": 0.8309207991634102, + "grad_norm": 0.6356936693191528, + "learning_rate": 6.336820742021445e-06, + "loss": 0.6539, + "step": 15097 + }, + { + "epoch": 0.8309758379657659, + "grad_norm": 0.6543401479721069, + "learning_rate": 6.3364030504887955e-06, + "loss": 0.7185, + "step": 15098 + }, + { + "epoch": 0.8310308767681215, + "grad_norm": 0.6499043107032776, + "learning_rate": 6.335985348912097e-06, + "loss": 0.7254, + "step": 15099 + }, + { + "epoch": 0.8310859155704772, + "grad_norm": 0.6983271241188049, + "learning_rate": 6.335567637294491e-06, + "loss": 0.784, + "step": 15100 + }, + { + "epoch": 0.8311409543728329, + "grad_norm": 0.7932507395744324, + "learning_rate": 6.335149915639117e-06, + "loss": 0.6708, + "step": 15101 + }, + { + "epoch": 0.8311959931751886, + "grad_norm": 0.6792518496513367, + "learning_rate": 6.334732183949112e-06, + "loss": 0.7365, + "step": 15102 + }, + { + "epoch": 0.8312510319775441, + "grad_norm": 0.6852229237556458, + "learning_rate": 6.334314442227618e-06, + "loss": 0.7283, + "step": 15103 + }, + { + "epoch": 0.8313060707798998, + "grad_norm": 0.6528468728065491, + "learning_rate": 6.333896690477774e-06, + "loss": 0.763, + "step": 15104 + }, + { + "epoch": 0.8313611095822555, + "grad_norm": 0.7215067148208618, + "learning_rate": 6.33347892870272e-06, + "loss": 0.7769, + "step": 15105 + }, + { + "epoch": 0.8314161483846112, + "grad_norm": 0.7171593308448792, + "learning_rate": 6.333061156905596e-06, + "loss": 0.6807, + "step": 15106 + }, + { + "epoch": 0.8314711871869668, + "grad_norm": 0.6781407594680786, + "learning_rate": 6.332643375089539e-06, + "loss": 0.6801, + "step": 15107 + }, + { + "epoch": 0.8315262259893225, + "grad_norm": 0.803057849407196, + "learning_rate": 6.332225583257693e-06, + "loss": 0.682, + "step": 15108 + }, + { + "epoch": 0.8315812647916782, + "grad_norm": 0.6467291712760925, + "learning_rate": 6.331807781413195e-06, + "loss": 0.6675, + "step": 15109 + }, + { + "epoch": 0.8316363035940338, + "grad_norm": 0.7285529971122742, + "learning_rate": 6.331389969559186e-06, + "loss": 0.7333, + "step": 15110 + }, + { + "epoch": 0.8316913423963894, + "grad_norm": 0.6569895148277283, + "learning_rate": 6.330972147698806e-06, + "loss": 0.7202, + "step": 15111 + }, + { + "epoch": 0.8317463811987451, + "grad_norm": 0.7848708033561707, + "learning_rate": 6.330554315835198e-06, + "loss": 0.7936, + "step": 15112 + }, + { + "epoch": 0.8318014200011008, + "grad_norm": 0.6699723601341248, + "learning_rate": 6.330136473971498e-06, + "loss": 0.7107, + "step": 15113 + }, + { + "epoch": 0.8318564588034565, + "grad_norm": 0.7443183660507202, + "learning_rate": 6.329718622110848e-06, + "loss": 0.8102, + "step": 15114 + }, + { + "epoch": 0.8319114976058121, + "grad_norm": 0.6073893904685974, + "learning_rate": 6.329300760256389e-06, + "loss": 0.7061, + "step": 15115 + }, + { + "epoch": 0.8319665364081678, + "grad_norm": 0.6192148923873901, + "learning_rate": 6.328882888411262e-06, + "loss": 0.6929, + "step": 15116 + }, + { + "epoch": 0.8320215752105234, + "grad_norm": 0.7347237467765808, + "learning_rate": 6.3284650065786065e-06, + "loss": 0.6705, + "step": 15117 + }, + { + "epoch": 0.8320766140128791, + "grad_norm": 0.6286477446556091, + "learning_rate": 6.328047114761564e-06, + "loss": 0.6494, + "step": 15118 + }, + { + "epoch": 0.8321316528152347, + "grad_norm": 0.6492440104484558, + "learning_rate": 6.327629212963275e-06, + "loss": 0.6618, + "step": 15119 + }, + { + "epoch": 0.8321866916175904, + "grad_norm": 0.6295114755630493, + "learning_rate": 6.3272113011868804e-06, + "loss": 0.786, + "step": 15120 + }, + { + "epoch": 0.8322417304199461, + "grad_norm": 0.6737865805625916, + "learning_rate": 6.3267933794355206e-06, + "loss": 0.7544, + "step": 15121 + }, + { + "epoch": 0.8322967692223017, + "grad_norm": 0.8025132417678833, + "learning_rate": 6.3263754477123374e-06, + "loss": 0.7736, + "step": 15122 + }, + { + "epoch": 0.8323518080246574, + "grad_norm": 0.6820534467697144, + "learning_rate": 6.32595750602047e-06, + "loss": 0.6616, + "step": 15123 + }, + { + "epoch": 0.832406846827013, + "grad_norm": 0.7022573351860046, + "learning_rate": 6.325539554363061e-06, + "loss": 0.8175, + "step": 15124 + }, + { + "epoch": 0.8324618856293687, + "grad_norm": 0.7034926414489746, + "learning_rate": 6.325121592743253e-06, + "loss": 0.7047, + "step": 15125 + }, + { + "epoch": 0.8325169244317243, + "grad_norm": 0.654296875, + "learning_rate": 6.3247036211641856e-06, + "loss": 0.6468, + "step": 15126 + }, + { + "epoch": 0.83257196323408, + "grad_norm": 0.647859513759613, + "learning_rate": 6.324285639628999e-06, + "loss": 0.694, + "step": 15127 + }, + { + "epoch": 0.8326270020364357, + "grad_norm": 1.0824226140975952, + "learning_rate": 6.323867648140837e-06, + "loss": 0.7226, + "step": 15128 + }, + { + "epoch": 0.8326820408387914, + "grad_norm": 0.8568648099899292, + "learning_rate": 6.323449646702839e-06, + "loss": 0.7524, + "step": 15129 + }, + { + "epoch": 0.832737079641147, + "grad_norm": 0.6550299525260925, + "learning_rate": 6.32303163531815e-06, + "loss": 0.7294, + "step": 15130 + }, + { + "epoch": 0.8327921184435026, + "grad_norm": 0.7722175121307373, + "learning_rate": 6.3226136139899075e-06, + "loss": 0.7864, + "step": 15131 + }, + { + "epoch": 0.8328471572458583, + "grad_norm": 0.6542928218841553, + "learning_rate": 6.322195582721256e-06, + "loss": 0.6614, + "step": 15132 + }, + { + "epoch": 0.832902196048214, + "grad_norm": 0.6617493629455566, + "learning_rate": 6.321777541515337e-06, + "loss": 0.7147, + "step": 15133 + }, + { + "epoch": 0.8329572348505696, + "grad_norm": 0.698868989944458, + "learning_rate": 6.321359490375291e-06, + "loss": 0.6894, + "step": 15134 + }, + { + "epoch": 0.8330122736529253, + "grad_norm": 0.8005796074867249, + "learning_rate": 6.3209414293042595e-06, + "loss": 0.7513, + "step": 15135 + }, + { + "epoch": 0.833067312455281, + "grad_norm": 0.7656713128089905, + "learning_rate": 6.320523358305387e-06, + "loss": 0.7387, + "step": 15136 + }, + { + "epoch": 0.8331223512576367, + "grad_norm": 0.7299987077713013, + "learning_rate": 6.320105277381815e-06, + "loss": 0.7868, + "step": 15137 + }, + { + "epoch": 0.8331773900599923, + "grad_norm": 0.782574474811554, + "learning_rate": 6.319687186536685e-06, + "loss": 0.8307, + "step": 15138 + }, + { + "epoch": 0.8332324288623479, + "grad_norm": 0.6786854863166809, + "learning_rate": 6.319269085773138e-06, + "loss": 0.7819, + "step": 15139 + }, + { + "epoch": 0.8332874676647036, + "grad_norm": 1.173049807548523, + "learning_rate": 6.318850975094318e-06, + "loss": 0.7623, + "step": 15140 + }, + { + "epoch": 0.8333425064670593, + "grad_norm": 0.8410226106643677, + "learning_rate": 6.318432854503368e-06, + "loss": 0.812, + "step": 15141 + }, + { + "epoch": 0.8333975452694149, + "grad_norm": 0.8525705337524414, + "learning_rate": 6.3180147240034304e-06, + "loss": 0.7585, + "step": 15142 + }, + { + "epoch": 0.8334525840717706, + "grad_norm": 0.6345195770263672, + "learning_rate": 6.317596583597645e-06, + "loss": 0.7446, + "step": 15143 + }, + { + "epoch": 0.8335076228741263, + "grad_norm": 0.7238603234291077, + "learning_rate": 6.317178433289157e-06, + "loss": 0.7461, + "step": 15144 + }, + { + "epoch": 0.833562661676482, + "grad_norm": 0.6187044382095337, + "learning_rate": 6.31676027308111e-06, + "loss": 0.7195, + "step": 15145 + }, + { + "epoch": 0.8336177004788375, + "grad_norm": 0.6813417077064514, + "learning_rate": 6.316342102976644e-06, + "loss": 0.772, + "step": 15146 + }, + { + "epoch": 0.8336727392811932, + "grad_norm": 0.665515124797821, + "learning_rate": 6.315923922978902e-06, + "loss": 0.7127, + "step": 15147 + }, + { + "epoch": 0.8337277780835489, + "grad_norm": 0.8104628920555115, + "learning_rate": 6.315505733091028e-06, + "loss": 0.7332, + "step": 15148 + }, + { + "epoch": 0.8337828168859046, + "grad_norm": 0.8447679281234741, + "learning_rate": 6.315087533316166e-06, + "loss": 0.6803, + "step": 15149 + }, + { + "epoch": 0.8338378556882602, + "grad_norm": 0.7588180303573608, + "learning_rate": 6.31466932365746e-06, + "loss": 0.8301, + "step": 15150 + }, + { + "epoch": 0.8338928944906159, + "grad_norm": 0.7697302103042603, + "learning_rate": 6.314251104118048e-06, + "loss": 0.7777, + "step": 15151 + }, + { + "epoch": 0.8339479332929716, + "grad_norm": 0.8361233472824097, + "learning_rate": 6.313832874701078e-06, + "loss": 0.7585, + "step": 15152 + }, + { + "epoch": 0.8340029720953273, + "grad_norm": 0.6954757571220398, + "learning_rate": 6.313414635409692e-06, + "loss": 0.759, + "step": 15153 + }, + { + "epoch": 0.8340580108976828, + "grad_norm": 0.72389155626297, + "learning_rate": 6.312996386247034e-06, + "loss": 0.6679, + "step": 15154 + }, + { + "epoch": 0.8341130497000385, + "grad_norm": 0.781382143497467, + "learning_rate": 6.312578127216245e-06, + "loss": 0.769, + "step": 15155 + }, + { + "epoch": 0.8341680885023942, + "grad_norm": 0.7186244130134583, + "learning_rate": 6.312159858320472e-06, + "loss": 0.7476, + "step": 15156 + }, + { + "epoch": 0.8342231273047499, + "grad_norm": 0.6909130215644836, + "learning_rate": 6.311741579562855e-06, + "loss": 0.749, + "step": 15157 + }, + { + "epoch": 0.8342781661071055, + "grad_norm": 0.7692446708679199, + "learning_rate": 6.31132329094654e-06, + "loss": 0.7141, + "step": 15158 + }, + { + "epoch": 0.8343332049094612, + "grad_norm": 0.6753776669502258, + "learning_rate": 6.310904992474669e-06, + "loss": 0.7259, + "step": 15159 + }, + { + "epoch": 0.8343882437118169, + "grad_norm": 0.7118550539016724, + "learning_rate": 6.3104866841503885e-06, + "loss": 0.8282, + "step": 15160 + }, + { + "epoch": 0.8344432825141725, + "grad_norm": 0.6651625037193298, + "learning_rate": 6.31006836597684e-06, + "loss": 0.7639, + "step": 15161 + }, + { + "epoch": 0.8344983213165281, + "grad_norm": 0.6745681762695312, + "learning_rate": 6.30965003795717e-06, + "loss": 0.5922, + "step": 15162 + }, + { + "epoch": 0.8345533601188838, + "grad_norm": 0.7344138622283936, + "learning_rate": 6.309231700094518e-06, + "loss": 0.7134, + "step": 15163 + }, + { + "epoch": 0.8346083989212395, + "grad_norm": 0.7628228664398193, + "learning_rate": 6.308813352392034e-06, + "loss": 0.7341, + "step": 15164 + }, + { + "epoch": 0.8346634377235951, + "grad_norm": 0.6599448919296265, + "learning_rate": 6.308394994852858e-06, + "loss": 0.6821, + "step": 15165 + }, + { + "epoch": 0.8347184765259508, + "grad_norm": 0.9132193922996521, + "learning_rate": 6.307976627480136e-06, + "loss": 0.7862, + "step": 15166 + }, + { + "epoch": 0.8347735153283065, + "grad_norm": 0.752200722694397, + "learning_rate": 6.307558250277011e-06, + "loss": 0.7942, + "step": 15167 + }, + { + "epoch": 0.8348285541306621, + "grad_norm": 0.6848111748695374, + "learning_rate": 6.307139863246628e-06, + "loss": 0.8161, + "step": 15168 + }, + { + "epoch": 0.8348835929330177, + "grad_norm": 0.7229306697845459, + "learning_rate": 6.306721466392132e-06, + "loss": 0.684, + "step": 15169 + }, + { + "epoch": 0.8349386317353734, + "grad_norm": 0.7294610142707825, + "learning_rate": 6.306303059716667e-06, + "loss": 0.7046, + "step": 15170 + }, + { + "epoch": 0.8349936705377291, + "grad_norm": 0.7153074741363525, + "learning_rate": 6.305884643223378e-06, + "loss": 0.7613, + "step": 15171 + }, + { + "epoch": 0.8350487093400848, + "grad_norm": 0.6200907826423645, + "learning_rate": 6.30546621691541e-06, + "loss": 0.642, + "step": 15172 + }, + { + "epoch": 0.8351037481424404, + "grad_norm": 0.6640743017196655, + "learning_rate": 6.305047780795907e-06, + "loss": 0.7201, + "step": 15173 + }, + { + "epoch": 0.8351587869447961, + "grad_norm": 0.6427313089370728, + "learning_rate": 6.3046293348680144e-06, + "loss": 0.764, + "step": 15174 + }, + { + "epoch": 0.8352138257471518, + "grad_norm": 0.6475403308868408, + "learning_rate": 6.3042108791348755e-06, + "loss": 0.6678, + "step": 15175 + }, + { + "epoch": 0.8352688645495074, + "grad_norm": 0.6376405358314514, + "learning_rate": 6.303792413599638e-06, + "loss": 0.6972, + "step": 15176 + }, + { + "epoch": 0.835323903351863, + "grad_norm": 0.6648433804512024, + "learning_rate": 6.303373938265447e-06, + "loss": 0.6531, + "step": 15177 + }, + { + "epoch": 0.8353789421542187, + "grad_norm": 0.6582038402557373, + "learning_rate": 6.302955453135446e-06, + "loss": 0.7703, + "step": 15178 + }, + { + "epoch": 0.8354339809565744, + "grad_norm": 0.6386045217514038, + "learning_rate": 6.30253695821278e-06, + "loss": 0.6821, + "step": 15179 + }, + { + "epoch": 0.8354890197589301, + "grad_norm": 0.7268567681312561, + "learning_rate": 6.302118453500594e-06, + "loss": 0.7434, + "step": 15180 + }, + { + "epoch": 0.8355440585612857, + "grad_norm": 0.8008975982666016, + "learning_rate": 6.301699939002035e-06, + "loss": 0.8537, + "step": 15181 + }, + { + "epoch": 0.8355990973636414, + "grad_norm": 0.6803351044654846, + "learning_rate": 6.301281414720247e-06, + "loss": 0.6741, + "step": 15182 + }, + { + "epoch": 0.835654136165997, + "grad_norm": 0.6567045450210571, + "learning_rate": 6.3008628806583785e-06, + "loss": 0.7033, + "step": 15183 + }, + { + "epoch": 0.8357091749683527, + "grad_norm": 0.7088850140571594, + "learning_rate": 6.3004443368195685e-06, + "loss": 0.699, + "step": 15184 + }, + { + "epoch": 0.8357642137707083, + "grad_norm": 0.664929986000061, + "learning_rate": 6.3000257832069715e-06, + "loss": 0.6875, + "step": 15185 + }, + { + "epoch": 0.835819252573064, + "grad_norm": 0.7132309079170227, + "learning_rate": 6.299607219823727e-06, + "loss": 0.8172, + "step": 15186 + }, + { + "epoch": 0.8358742913754197, + "grad_norm": 0.7312454581260681, + "learning_rate": 6.2991886466729815e-06, + "loss": 0.7277, + "step": 15187 + }, + { + "epoch": 0.8359293301777754, + "grad_norm": 0.6576625108718872, + "learning_rate": 6.298770063757882e-06, + "loss": 0.7134, + "step": 15188 + }, + { + "epoch": 0.835984368980131, + "grad_norm": 0.6840282678604126, + "learning_rate": 6.2983514710815756e-06, + "loss": 0.777, + "step": 15189 + }, + { + "epoch": 0.8360394077824866, + "grad_norm": 0.7194011211395264, + "learning_rate": 6.297932868647207e-06, + "loss": 0.783, + "step": 15190 + }, + { + "epoch": 0.8360944465848423, + "grad_norm": 0.6619371175765991, + "learning_rate": 6.297514256457922e-06, + "loss": 0.7809, + "step": 15191 + }, + { + "epoch": 0.836149485387198, + "grad_norm": 0.8256712555885315, + "learning_rate": 6.2970956345168666e-06, + "loss": 0.9086, + "step": 15192 + }, + { + "epoch": 0.8362045241895536, + "grad_norm": 0.6951783299446106, + "learning_rate": 6.296677002827188e-06, + "loss": 0.7489, + "step": 15193 + }, + { + "epoch": 0.8362595629919093, + "grad_norm": 0.8535193204879761, + "learning_rate": 6.296258361392033e-06, + "loss": 0.7744, + "step": 15194 + }, + { + "epoch": 0.836314601794265, + "grad_norm": 0.7569966912269592, + "learning_rate": 6.295839710214546e-06, + "loss": 0.7091, + "step": 15195 + }, + { + "epoch": 0.8363696405966207, + "grad_norm": 0.6435930728912354, + "learning_rate": 6.295421049297875e-06, + "loss": 0.6601, + "step": 15196 + }, + { + "epoch": 0.8364246793989762, + "grad_norm": 0.811500608921051, + "learning_rate": 6.295002378645166e-06, + "loss": 0.7304, + "step": 15197 + }, + { + "epoch": 0.8364797182013319, + "grad_norm": 0.7306826114654541, + "learning_rate": 6.294583698259566e-06, + "loss": 0.8471, + "step": 15198 + }, + { + "epoch": 0.8365347570036876, + "grad_norm": 0.6411521434783936, + "learning_rate": 6.294165008144222e-06, + "loss": 0.6572, + "step": 15199 + }, + { + "epoch": 0.8365897958060433, + "grad_norm": 0.6460714340209961, + "learning_rate": 6.293746308302278e-06, + "loss": 0.7514, + "step": 15200 + }, + { + "epoch": 0.8366448346083989, + "grad_norm": 0.9355582594871521, + "learning_rate": 6.2933275987368855e-06, + "loss": 0.8171, + "step": 15201 + }, + { + "epoch": 0.8366998734107546, + "grad_norm": 0.6221946477890015, + "learning_rate": 6.292908879451189e-06, + "loss": 0.7323, + "step": 15202 + }, + { + "epoch": 0.8367549122131103, + "grad_norm": 0.6820993423461914, + "learning_rate": 6.292490150448335e-06, + "loss": 0.8168, + "step": 15203 + }, + { + "epoch": 0.836809951015466, + "grad_norm": 0.6494680643081665, + "learning_rate": 6.29207141173147e-06, + "loss": 0.7926, + "step": 15204 + }, + { + "epoch": 0.8368649898178215, + "grad_norm": 0.7658956050872803, + "learning_rate": 6.291652663303744e-06, + "loss": 0.7304, + "step": 15205 + }, + { + "epoch": 0.8369200286201772, + "grad_norm": 0.6653497219085693, + "learning_rate": 6.2912339051683e-06, + "loss": 0.7284, + "step": 15206 + }, + { + "epoch": 0.8369750674225329, + "grad_norm": 0.6136276721954346, + "learning_rate": 6.290815137328289e-06, + "loss": 0.7313, + "step": 15207 + }, + { + "epoch": 0.8370301062248885, + "grad_norm": 0.7542527914047241, + "learning_rate": 6.2903963597868555e-06, + "loss": 0.7806, + "step": 15208 + }, + { + "epoch": 0.8370851450272442, + "grad_norm": 0.6994839906692505, + "learning_rate": 6.2899775725471505e-06, + "loss": 0.8132, + "step": 15209 + }, + { + "epoch": 0.8371401838295999, + "grad_norm": 0.6558997631072998, + "learning_rate": 6.289558775612319e-06, + "loss": 0.7188, + "step": 15210 + }, + { + "epoch": 0.8371952226319556, + "grad_norm": 0.7155564427375793, + "learning_rate": 6.289139968985507e-06, + "loss": 0.6584, + "step": 15211 + }, + { + "epoch": 0.8372502614343111, + "grad_norm": 0.7645565867424011, + "learning_rate": 6.288721152669865e-06, + "loss": 0.761, + "step": 15212 + }, + { + "epoch": 0.8373053002366668, + "grad_norm": 0.6507940292358398, + "learning_rate": 6.288302326668542e-06, + "loss": 0.7139, + "step": 15213 + }, + { + "epoch": 0.8373603390390225, + "grad_norm": 0.7598558664321899, + "learning_rate": 6.287883490984682e-06, + "loss": 0.7627, + "step": 15214 + }, + { + "epoch": 0.8374153778413782, + "grad_norm": 0.6542350649833679, + "learning_rate": 6.287464645621434e-06, + "loss": 0.7508, + "step": 15215 + }, + { + "epoch": 0.8374704166437338, + "grad_norm": 0.7530503869056702, + "learning_rate": 6.287045790581946e-06, + "loss": 0.8234, + "step": 15216 + }, + { + "epoch": 0.8375254554460895, + "grad_norm": 0.9945759773254395, + "learning_rate": 6.286626925869367e-06, + "loss": 0.7637, + "step": 15217 + }, + { + "epoch": 0.8375804942484452, + "grad_norm": 0.6644982695579529, + "learning_rate": 6.286208051486844e-06, + "loss": 0.7671, + "step": 15218 + }, + { + "epoch": 0.8376355330508009, + "grad_norm": 0.8195061683654785, + "learning_rate": 6.285789167437526e-06, + "loss": 0.662, + "step": 15219 + }, + { + "epoch": 0.8376905718531564, + "grad_norm": 0.6578626036643982, + "learning_rate": 6.2853702737245605e-06, + "loss": 0.7681, + "step": 15220 + }, + { + "epoch": 0.8377456106555121, + "grad_norm": 0.6632179021835327, + "learning_rate": 6.2849513703510955e-06, + "loss": 0.759, + "step": 15221 + }, + { + "epoch": 0.8378006494578678, + "grad_norm": 0.6822313070297241, + "learning_rate": 6.284532457320282e-06, + "loss": 0.7859, + "step": 15222 + }, + { + "epoch": 0.8378556882602235, + "grad_norm": 0.6448203921318054, + "learning_rate": 6.284113534635265e-06, + "loss": 0.7224, + "step": 15223 + }, + { + "epoch": 0.8379107270625791, + "grad_norm": 0.6147580146789551, + "learning_rate": 6.2836946022991926e-06, + "loss": 0.7389, + "step": 15224 + }, + { + "epoch": 0.8379657658649348, + "grad_norm": 0.7476562857627869, + "learning_rate": 6.283275660315219e-06, + "loss": 0.7535, + "step": 15225 + }, + { + "epoch": 0.8380208046672905, + "grad_norm": 0.7396713495254517, + "learning_rate": 6.282856708686488e-06, + "loss": 0.7621, + "step": 15226 + }, + { + "epoch": 0.8380758434696461, + "grad_norm": 0.7220024466514587, + "learning_rate": 6.282437747416148e-06, + "loss": 0.672, + "step": 15227 + }, + { + "epoch": 0.8381308822720017, + "grad_norm": 0.9414284229278564, + "learning_rate": 6.2820187765073495e-06, + "loss": 0.8791, + "step": 15228 + }, + { + "epoch": 0.8381859210743574, + "grad_norm": 0.6074691414833069, + "learning_rate": 6.281599795963241e-06, + "loss": 0.6771, + "step": 15229 + }, + { + "epoch": 0.8382409598767131, + "grad_norm": 0.7367346286773682, + "learning_rate": 6.281180805786973e-06, + "loss": 0.7869, + "step": 15230 + }, + { + "epoch": 0.8382959986790688, + "grad_norm": 0.711016833782196, + "learning_rate": 6.280761805981691e-06, + "loss": 0.7166, + "step": 15231 + }, + { + "epoch": 0.8383510374814244, + "grad_norm": 0.6464707255363464, + "learning_rate": 6.280342796550546e-06, + "loss": 0.6965, + "step": 15232 + }, + { + "epoch": 0.83840607628378, + "grad_norm": 0.7385185956954956, + "learning_rate": 6.279923777496688e-06, + "loss": 0.7031, + "step": 15233 + }, + { + "epoch": 0.8384611150861357, + "grad_norm": 0.6799347996711731, + "learning_rate": 6.2795047488232665e-06, + "loss": 0.6777, + "step": 15234 + }, + { + "epoch": 0.8385161538884914, + "grad_norm": 0.690740168094635, + "learning_rate": 6.279085710533429e-06, + "loss": 0.7675, + "step": 15235 + }, + { + "epoch": 0.838571192690847, + "grad_norm": 0.9359111189842224, + "learning_rate": 6.278666662630325e-06, + "loss": 0.7063, + "step": 15236 + }, + { + "epoch": 0.8386262314932027, + "grad_norm": 0.751430094242096, + "learning_rate": 6.2782476051171075e-06, + "loss": 0.7851, + "step": 15237 + }, + { + "epoch": 0.8386812702955584, + "grad_norm": 0.6865997314453125, + "learning_rate": 6.27782853799692e-06, + "loss": 0.7347, + "step": 15238 + }, + { + "epoch": 0.8387363090979141, + "grad_norm": 0.6713284850120544, + "learning_rate": 6.277409461272916e-06, + "loss": 0.7651, + "step": 15239 + }, + { + "epoch": 0.8387913479002697, + "grad_norm": 0.7481899857521057, + "learning_rate": 6.276990374948244e-06, + "loss": 0.7681, + "step": 15240 + }, + { + "epoch": 0.8388463867026253, + "grad_norm": 0.7126002311706543, + "learning_rate": 6.2765712790260554e-06, + "loss": 0.7772, + "step": 15241 + }, + { + "epoch": 0.838901425504981, + "grad_norm": 0.6616978645324707, + "learning_rate": 6.276152173509497e-06, + "loss": 0.7028, + "step": 15242 + }, + { + "epoch": 0.8389564643073367, + "grad_norm": 0.9032973051071167, + "learning_rate": 6.2757330584017225e-06, + "loss": 0.7646, + "step": 15243 + }, + { + "epoch": 0.8390115031096923, + "grad_norm": 0.6345590353012085, + "learning_rate": 6.275313933705879e-06, + "loss": 0.6692, + "step": 15244 + }, + { + "epoch": 0.839066541912048, + "grad_norm": 0.6989019513130188, + "learning_rate": 6.2748947994251175e-06, + "loss": 0.6916, + "step": 15245 + }, + { + "epoch": 0.8391215807144037, + "grad_norm": 0.7115045189857483, + "learning_rate": 6.2744756555625875e-06, + "loss": 0.6923, + "step": 15246 + }, + { + "epoch": 0.8391766195167594, + "grad_norm": 0.6989235281944275, + "learning_rate": 6.2740565021214406e-06, + "loss": 0.7057, + "step": 15247 + }, + { + "epoch": 0.839231658319115, + "grad_norm": 0.684779942035675, + "learning_rate": 6.273637339104824e-06, + "loss": 0.7777, + "step": 15248 + }, + { + "epoch": 0.8392866971214706, + "grad_norm": 0.6341322064399719, + "learning_rate": 6.2732181665158934e-06, + "loss": 0.7335, + "step": 15249 + }, + { + "epoch": 0.8393417359238263, + "grad_norm": 0.7232723832130432, + "learning_rate": 6.272798984357793e-06, + "loss": 0.8055, + "step": 15250 + }, + { + "epoch": 0.8393967747261819, + "grad_norm": 0.9725174307823181, + "learning_rate": 6.272379792633678e-06, + "loss": 0.6221, + "step": 15251 + }, + { + "epoch": 0.8394518135285376, + "grad_norm": 0.6602086424827576, + "learning_rate": 6.271960591346695e-06, + "loss": 0.8023, + "step": 15252 + }, + { + "epoch": 0.8395068523308933, + "grad_norm": 0.7092040777206421, + "learning_rate": 6.271541380499998e-06, + "loss": 0.8135, + "step": 15253 + }, + { + "epoch": 0.839561891133249, + "grad_norm": 0.5656731724739075, + "learning_rate": 6.271122160096736e-06, + "loss": 0.647, + "step": 15254 + }, + { + "epoch": 0.8396169299356046, + "grad_norm": 1.1831625699996948, + "learning_rate": 6.270702930140061e-06, + "loss": 0.8513, + "step": 15255 + }, + { + "epoch": 0.8396719687379602, + "grad_norm": 0.6398816704750061, + "learning_rate": 6.270283690633121e-06, + "loss": 0.6988, + "step": 15256 + }, + { + "epoch": 0.8397270075403159, + "grad_norm": 0.6856167316436768, + "learning_rate": 6.26986444157907e-06, + "loss": 0.7789, + "step": 15257 + }, + { + "epoch": 0.8397820463426716, + "grad_norm": 0.7355605363845825, + "learning_rate": 6.269445182981058e-06, + "loss": 0.6652, + "step": 15258 + }, + { + "epoch": 0.8398370851450272, + "grad_norm": 0.6691173315048218, + "learning_rate": 6.2690259148422364e-06, + "loss": 0.6807, + "step": 15259 + }, + { + "epoch": 0.8398921239473829, + "grad_norm": 0.6596276164054871, + "learning_rate": 6.268606637165754e-06, + "loss": 0.6947, + "step": 15260 + }, + { + "epoch": 0.8399471627497386, + "grad_norm": 0.7198327779769897, + "learning_rate": 6.268187349954766e-06, + "loss": 0.7981, + "step": 15261 + }, + { + "epoch": 0.8400022015520943, + "grad_norm": 0.7006517648696899, + "learning_rate": 6.267768053212419e-06, + "loss": 0.7756, + "step": 15262 + }, + { + "epoch": 0.8400572403544498, + "grad_norm": 0.769062340259552, + "learning_rate": 6.267348746941869e-06, + "loss": 0.8433, + "step": 15263 + }, + { + "epoch": 0.8401122791568055, + "grad_norm": 0.6317951679229736, + "learning_rate": 6.266929431146263e-06, + "loss": 0.6575, + "step": 15264 + }, + { + "epoch": 0.8401673179591612, + "grad_norm": 0.7127153873443604, + "learning_rate": 6.2665101058287554e-06, + "loss": 0.7745, + "step": 15265 + }, + { + "epoch": 0.8402223567615169, + "grad_norm": 0.6909182667732239, + "learning_rate": 6.266090770992497e-06, + "loss": 0.7567, + "step": 15266 + }, + { + "epoch": 0.8402773955638725, + "grad_norm": 0.7875083684921265, + "learning_rate": 6.2656714266406384e-06, + "loss": 0.7392, + "step": 15267 + }, + { + "epoch": 0.8403324343662282, + "grad_norm": 0.7068803906440735, + "learning_rate": 6.2652520727763326e-06, + "loss": 0.6723, + "step": 15268 + }, + { + "epoch": 0.8403874731685839, + "grad_norm": 0.6994038820266724, + "learning_rate": 6.264832709402731e-06, + "loss": 0.6989, + "step": 15269 + }, + { + "epoch": 0.8404425119709396, + "grad_norm": 0.714044988155365, + "learning_rate": 6.264413336522985e-06, + "loss": 0.7464, + "step": 15270 + }, + { + "epoch": 0.8404975507732951, + "grad_norm": 0.8202210068702698, + "learning_rate": 6.263993954140249e-06, + "loss": 0.7174, + "step": 15271 + }, + { + "epoch": 0.8405525895756508, + "grad_norm": 0.6762316823005676, + "learning_rate": 6.2635745622576694e-06, + "loss": 0.7416, + "step": 15272 + }, + { + "epoch": 0.8406076283780065, + "grad_norm": 0.7461959719657898, + "learning_rate": 6.263155160878405e-06, + "loss": 0.7835, + "step": 15273 + }, + { + "epoch": 0.8406626671803622, + "grad_norm": 0.6263054609298706, + "learning_rate": 6.262735750005602e-06, + "loss": 0.7034, + "step": 15274 + }, + { + "epoch": 0.8407177059827178, + "grad_norm": 0.7489733695983887, + "learning_rate": 6.2623163296424165e-06, + "loss": 0.7387, + "step": 15275 + }, + { + "epoch": 0.8407727447850735, + "grad_norm": 0.7841430306434631, + "learning_rate": 6.261896899791997e-06, + "loss": 0.8487, + "step": 15276 + }, + { + "epoch": 0.8408277835874292, + "grad_norm": 0.8390078544616699, + "learning_rate": 6.2614774604575e-06, + "loss": 0.8335, + "step": 15277 + }, + { + "epoch": 0.8408828223897848, + "grad_norm": 0.9100946187973022, + "learning_rate": 6.261058011642076e-06, + "loss": 0.6196, + "step": 15278 + }, + { + "epoch": 0.8409378611921404, + "grad_norm": 0.7001772522926331, + "learning_rate": 6.260638553348879e-06, + "loss": 0.6935, + "step": 15279 + }, + { + "epoch": 0.8409928999944961, + "grad_norm": 0.7877102494239807, + "learning_rate": 6.260219085581057e-06, + "loss": 0.7378, + "step": 15280 + }, + { + "epoch": 0.8410479387968518, + "grad_norm": 0.687240719795227, + "learning_rate": 6.259799608341768e-06, + "loss": 0.7224, + "step": 15281 + }, + { + "epoch": 0.8411029775992075, + "grad_norm": 0.7766143083572388, + "learning_rate": 6.2593801216341625e-06, + "loss": 0.7157, + "step": 15282 + }, + { + "epoch": 0.8411580164015631, + "grad_norm": 1.1593633890151978, + "learning_rate": 6.258960625461391e-06, + "loss": 0.8555, + "step": 15283 + }, + { + "epoch": 0.8412130552039188, + "grad_norm": 0.6179451942443848, + "learning_rate": 6.2585411198266085e-06, + "loss": 0.6715, + "step": 15284 + }, + { + "epoch": 0.8412680940062744, + "grad_norm": 0.6755460500717163, + "learning_rate": 6.258121604732971e-06, + "loss": 0.7475, + "step": 15285 + }, + { + "epoch": 0.8413231328086301, + "grad_norm": 0.6775393486022949, + "learning_rate": 6.257702080183627e-06, + "loss": 0.6594, + "step": 15286 + }, + { + "epoch": 0.8413781716109857, + "grad_norm": 0.6972197890281677, + "learning_rate": 6.25728254618173e-06, + "loss": 0.7865, + "step": 15287 + }, + { + "epoch": 0.8414332104133414, + "grad_norm": 0.6446948051452637, + "learning_rate": 6.256863002730433e-06, + "loss": 0.6874, + "step": 15288 + }, + { + "epoch": 0.8414882492156971, + "grad_norm": 0.7012035846710205, + "learning_rate": 6.256443449832892e-06, + "loss": 0.7465, + "step": 15289 + }, + { + "epoch": 0.8415432880180528, + "grad_norm": 0.698693573474884, + "learning_rate": 6.256023887492257e-06, + "loss": 0.8206, + "step": 15290 + }, + { + "epoch": 0.8415983268204084, + "grad_norm": 0.7083185315132141, + "learning_rate": 6.255604315711684e-06, + "loss": 0.8306, + "step": 15291 + }, + { + "epoch": 0.841653365622764, + "grad_norm": 0.6605321764945984, + "learning_rate": 6.255184734494324e-06, + "loss": 0.6742, + "step": 15292 + }, + { + "epoch": 0.8417084044251197, + "grad_norm": 0.681881844997406, + "learning_rate": 6.254765143843331e-06, + "loss": 0.7009, + "step": 15293 + }, + { + "epoch": 0.8417634432274753, + "grad_norm": 0.6995699405670166, + "learning_rate": 6.2543455437618605e-06, + "loss": 0.8069, + "step": 15294 + }, + { + "epoch": 0.841818482029831, + "grad_norm": 0.7004442811012268, + "learning_rate": 6.2539259342530644e-06, + "loss": 0.71, + "step": 15295 + }, + { + "epoch": 0.8418735208321867, + "grad_norm": 0.7816279530525208, + "learning_rate": 6.253506315320097e-06, + "loss": 0.7833, + "step": 15296 + }, + { + "epoch": 0.8419285596345424, + "grad_norm": 0.6875490546226501, + "learning_rate": 6.25308668696611e-06, + "loss": 0.7223, + "step": 15297 + }, + { + "epoch": 0.841983598436898, + "grad_norm": 0.7126815915107727, + "learning_rate": 6.252667049194261e-06, + "loss": 0.7934, + "step": 15298 + }, + { + "epoch": 0.8420386372392537, + "grad_norm": 0.8048780560493469, + "learning_rate": 6.252247402007701e-06, + "loss": 0.7775, + "step": 15299 + }, + { + "epoch": 0.8420936760416093, + "grad_norm": 0.6681318879127502, + "learning_rate": 6.251827745409583e-06, + "loss": 0.6516, + "step": 15300 + }, + { + "epoch": 0.842148714843965, + "grad_norm": 0.6467457413673401, + "learning_rate": 6.251408079403064e-06, + "loss": 0.7417, + "step": 15301 + }, + { + "epoch": 0.8422037536463206, + "grad_norm": 0.6815666556358337, + "learning_rate": 6.250988403991297e-06, + "loss": 0.7498, + "step": 15302 + }, + { + "epoch": 0.8422587924486763, + "grad_norm": 0.6596205234527588, + "learning_rate": 6.250568719177437e-06, + "loss": 0.762, + "step": 15303 + }, + { + "epoch": 0.842313831251032, + "grad_norm": 0.7564731240272522, + "learning_rate": 6.250149024964635e-06, + "loss": 0.7592, + "step": 15304 + }, + { + "epoch": 0.8423688700533877, + "grad_norm": 0.6755058169364929, + "learning_rate": 6.249729321356048e-06, + "loss": 0.6953, + "step": 15305 + }, + { + "epoch": 0.8424239088557433, + "grad_norm": 0.7423762083053589, + "learning_rate": 6.249309608354832e-06, + "loss": 0.7018, + "step": 15306 + }, + { + "epoch": 0.8424789476580989, + "grad_norm": 0.727678120136261, + "learning_rate": 6.248889885964138e-06, + "loss": 0.8159, + "step": 15307 + }, + { + "epoch": 0.8425339864604546, + "grad_norm": 1.0823713541030884, + "learning_rate": 6.248470154187123e-06, + "loss": 0.872, + "step": 15308 + }, + { + "epoch": 0.8425890252628103, + "grad_norm": 0.6428259015083313, + "learning_rate": 6.248050413026939e-06, + "loss": 0.683, + "step": 15309 + }, + { + "epoch": 0.8426440640651659, + "grad_norm": 0.6622119545936584, + "learning_rate": 6.247630662486743e-06, + "loss": 0.7891, + "step": 15310 + }, + { + "epoch": 0.8426991028675216, + "grad_norm": 1.2377631664276123, + "learning_rate": 6.247210902569689e-06, + "loss": 0.7675, + "step": 15311 + }, + { + "epoch": 0.8427541416698773, + "grad_norm": 0.7909934520721436, + "learning_rate": 6.246791133278931e-06, + "loss": 0.8688, + "step": 15312 + }, + { + "epoch": 0.842809180472233, + "grad_norm": 0.6541300415992737, + "learning_rate": 6.246371354617625e-06, + "loss": 0.6754, + "step": 15313 + }, + { + "epoch": 0.8428642192745885, + "grad_norm": 0.6664960384368896, + "learning_rate": 6.245951566588926e-06, + "loss": 0.6666, + "step": 15314 + }, + { + "epoch": 0.8429192580769442, + "grad_norm": 0.7288552522659302, + "learning_rate": 6.245531769195988e-06, + "loss": 0.8179, + "step": 15315 + }, + { + "epoch": 0.8429742968792999, + "grad_norm": 0.7044054865837097, + "learning_rate": 6.245111962441966e-06, + "loss": 0.7306, + "step": 15316 + }, + { + "epoch": 0.8430293356816556, + "grad_norm": 0.6108603477478027, + "learning_rate": 6.244692146330016e-06, + "loss": 0.6213, + "step": 15317 + }, + { + "epoch": 0.8430843744840112, + "grad_norm": 0.6381129622459412, + "learning_rate": 6.2442723208632935e-06, + "loss": 0.7709, + "step": 15318 + }, + { + "epoch": 0.8431394132863669, + "grad_norm": 0.7355496883392334, + "learning_rate": 6.243852486044955e-06, + "loss": 0.665, + "step": 15319 + }, + { + "epoch": 0.8431944520887226, + "grad_norm": 0.7450826168060303, + "learning_rate": 6.2434326418781525e-06, + "loss": 0.7551, + "step": 15320 + }, + { + "epoch": 0.8432494908910783, + "grad_norm": 0.6463751792907715, + "learning_rate": 6.243012788366043e-06, + "loss": 0.7956, + "step": 15321 + }, + { + "epoch": 0.8433045296934338, + "grad_norm": 0.6673271059989929, + "learning_rate": 6.242592925511782e-06, + "loss": 0.7148, + "step": 15322 + }, + { + "epoch": 0.8433595684957895, + "grad_norm": 0.7663269639015198, + "learning_rate": 6.242173053318526e-06, + "loss": 0.8594, + "step": 15323 + }, + { + "epoch": 0.8434146072981452, + "grad_norm": 0.8503594994544983, + "learning_rate": 6.2417531717894285e-06, + "loss": 0.7594, + "step": 15324 + }, + { + "epoch": 0.8434696461005009, + "grad_norm": 0.6903344988822937, + "learning_rate": 6.241333280927647e-06, + "loss": 0.7252, + "step": 15325 + }, + { + "epoch": 0.8435246849028565, + "grad_norm": 0.6472830772399902, + "learning_rate": 6.240913380736337e-06, + "loss": 0.7379, + "step": 15326 + }, + { + "epoch": 0.8435797237052122, + "grad_norm": 0.6442959308624268, + "learning_rate": 6.240493471218655e-06, + "loss": 0.7447, + "step": 15327 + }, + { + "epoch": 0.8436347625075679, + "grad_norm": 0.6387843489646912, + "learning_rate": 6.240073552377756e-06, + "loss": 0.7659, + "step": 15328 + }, + { + "epoch": 0.8436898013099235, + "grad_norm": 0.7017341256141663, + "learning_rate": 6.239653624216794e-06, + "loss": 0.6934, + "step": 15329 + }, + { + "epoch": 0.8437448401122791, + "grad_norm": 0.6204355359077454, + "learning_rate": 6.2392336867389294e-06, + "loss": 0.6553, + "step": 15330 + }, + { + "epoch": 0.8437998789146348, + "grad_norm": 0.6765483021736145, + "learning_rate": 6.238813739947315e-06, + "loss": 0.7492, + "step": 15331 + }, + { + "epoch": 0.8438549177169905, + "grad_norm": 0.7261079549789429, + "learning_rate": 6.238393783845109e-06, + "loss": 0.7373, + "step": 15332 + }, + { + "epoch": 0.8439099565193462, + "grad_norm": 0.7019803524017334, + "learning_rate": 6.237973818435466e-06, + "loss": 0.7742, + "step": 15333 + }, + { + "epoch": 0.8439649953217018, + "grad_norm": 0.7521516680717468, + "learning_rate": 6.237553843721545e-06, + "loss": 0.8808, + "step": 15334 + }, + { + "epoch": 0.8440200341240575, + "grad_norm": 0.6796375513076782, + "learning_rate": 6.237133859706499e-06, + "loss": 0.7759, + "step": 15335 + }, + { + "epoch": 0.8440750729264132, + "grad_norm": 0.6199387311935425, + "learning_rate": 6.236713866393487e-06, + "loss": 0.6203, + "step": 15336 + }, + { + "epoch": 0.8441301117287687, + "grad_norm": 0.6968052983283997, + "learning_rate": 6.236293863785663e-06, + "loss": 0.7645, + "step": 15337 + }, + { + "epoch": 0.8441851505311244, + "grad_norm": 0.757556676864624, + "learning_rate": 6.235873851886186e-06, + "loss": 0.8005, + "step": 15338 + }, + { + "epoch": 0.8442401893334801, + "grad_norm": 0.6558085680007935, + "learning_rate": 6.235453830698211e-06, + "loss": 0.796, + "step": 15339 + }, + { + "epoch": 0.8442952281358358, + "grad_norm": 0.6963368654251099, + "learning_rate": 6.235033800224898e-06, + "loss": 0.7077, + "step": 15340 + }, + { + "epoch": 0.8443502669381914, + "grad_norm": 0.6057709455490112, + "learning_rate": 6.234613760469399e-06, + "loss": 0.5443, + "step": 15341 + }, + { + "epoch": 0.8444053057405471, + "grad_norm": 0.7616491317749023, + "learning_rate": 6.234193711434875e-06, + "loss": 0.6764, + "step": 15342 + }, + { + "epoch": 0.8444603445429028, + "grad_norm": 0.7143368721008301, + "learning_rate": 6.233773653124482e-06, + "loss": 0.6647, + "step": 15343 + }, + { + "epoch": 0.8445153833452584, + "grad_norm": 0.8766696453094482, + "learning_rate": 6.233353585541375e-06, + "loss": 0.7112, + "step": 15344 + }, + { + "epoch": 0.844570422147614, + "grad_norm": 0.6184048652648926, + "learning_rate": 6.232933508688714e-06, + "loss": 0.6645, + "step": 15345 + }, + { + "epoch": 0.8446254609499697, + "grad_norm": 0.8119208812713623, + "learning_rate": 6.232513422569655e-06, + "loss": 0.6729, + "step": 15346 + }, + { + "epoch": 0.8446804997523254, + "grad_norm": 0.5964543223381042, + "learning_rate": 6.2320933271873544e-06, + "loss": 0.6931, + "step": 15347 + }, + { + "epoch": 0.8447355385546811, + "grad_norm": 0.696611225605011, + "learning_rate": 6.23167322254497e-06, + "loss": 0.8292, + "step": 15348 + }, + { + "epoch": 0.8447905773570367, + "grad_norm": 0.6196489930152893, + "learning_rate": 6.231253108645658e-06, + "loss": 0.6651, + "step": 15349 + }, + { + "epoch": 0.8448456161593924, + "grad_norm": 0.6222663521766663, + "learning_rate": 6.230832985492579e-06, + "loss": 0.6513, + "step": 15350 + }, + { + "epoch": 0.844900654961748, + "grad_norm": 0.6424199342727661, + "learning_rate": 6.230412853088889e-06, + "loss": 0.7005, + "step": 15351 + }, + { + "epoch": 0.8449556937641037, + "grad_norm": 0.6484132409095764, + "learning_rate": 6.229992711437745e-06, + "loss": 0.6931, + "step": 15352 + }, + { + "epoch": 0.8450107325664593, + "grad_norm": 0.7568885684013367, + "learning_rate": 6.229572560542303e-06, + "loss": 0.7036, + "step": 15353 + }, + { + "epoch": 0.845065771368815, + "grad_norm": 0.665937602519989, + "learning_rate": 6.229152400405724e-06, + "loss": 0.5498, + "step": 15354 + }, + { + "epoch": 0.8451208101711707, + "grad_norm": 0.6861961483955383, + "learning_rate": 6.228732231031165e-06, + "loss": 0.7622, + "step": 15355 + }, + { + "epoch": 0.8451758489735264, + "grad_norm": 0.6793088316917419, + "learning_rate": 6.2283120524217845e-06, + "loss": 0.758, + "step": 15356 + }, + { + "epoch": 0.845230887775882, + "grad_norm": 0.7460890412330627, + "learning_rate": 6.227891864580739e-06, + "loss": 0.6618, + "step": 15357 + }, + { + "epoch": 0.8452859265782376, + "grad_norm": 0.6434195041656494, + "learning_rate": 6.227471667511186e-06, + "loss": 0.7226, + "step": 15358 + }, + { + "epoch": 0.8453409653805933, + "grad_norm": 0.7655256986618042, + "learning_rate": 6.227051461216285e-06, + "loss": 0.8461, + "step": 15359 + }, + { + "epoch": 0.845396004182949, + "grad_norm": 0.6727028489112854, + "learning_rate": 6.226631245699193e-06, + "loss": 0.6765, + "step": 15360 + }, + { + "epoch": 0.8454510429853046, + "grad_norm": 0.6030625700950623, + "learning_rate": 6.226211020963069e-06, + "loss": 0.6548, + "step": 15361 + }, + { + "epoch": 0.8455060817876603, + "grad_norm": 0.6430317163467407, + "learning_rate": 6.225790787011071e-06, + "loss": 0.7564, + "step": 15362 + }, + { + "epoch": 0.845561120590016, + "grad_norm": 0.633975088596344, + "learning_rate": 6.225370543846359e-06, + "loss": 0.716, + "step": 15363 + }, + { + "epoch": 0.8456161593923717, + "grad_norm": 0.6722174286842346, + "learning_rate": 6.2249502914720895e-06, + "loss": 0.7266, + "step": 15364 + }, + { + "epoch": 0.8456711981947272, + "grad_norm": 0.724166214466095, + "learning_rate": 6.22453002989142e-06, + "loss": 0.788, + "step": 15365 + }, + { + "epoch": 0.8457262369970829, + "grad_norm": 0.6406343579292297, + "learning_rate": 6.224109759107512e-06, + "loss": 0.8086, + "step": 15366 + }, + { + "epoch": 0.8457812757994386, + "grad_norm": 0.7344949245452881, + "learning_rate": 6.223689479123523e-06, + "loss": 0.7838, + "step": 15367 + }, + { + "epoch": 0.8458363146017943, + "grad_norm": 0.8572549819946289, + "learning_rate": 6.22326918994261e-06, + "loss": 0.7427, + "step": 15368 + }, + { + "epoch": 0.8458913534041499, + "grad_norm": 0.662644624710083, + "learning_rate": 6.222848891567934e-06, + "loss": 0.7165, + "step": 15369 + }, + { + "epoch": 0.8459463922065056, + "grad_norm": 0.7139797210693359, + "learning_rate": 6.222428584002654e-06, + "loss": 0.8218, + "step": 15370 + }, + { + "epoch": 0.8460014310088613, + "grad_norm": 0.6846550107002258, + "learning_rate": 6.222008267249927e-06, + "loss": 0.6686, + "step": 15371 + }, + { + "epoch": 0.846056469811217, + "grad_norm": 0.6675787568092346, + "learning_rate": 6.221587941312914e-06, + "loss": 0.7151, + "step": 15372 + }, + { + "epoch": 0.8461115086135725, + "grad_norm": 0.626371443271637, + "learning_rate": 6.221167606194771e-06, + "loss": 0.7637, + "step": 15373 + }, + { + "epoch": 0.8461665474159282, + "grad_norm": 0.6768763065338135, + "learning_rate": 6.220747261898661e-06, + "loss": 0.7363, + "step": 15374 + }, + { + "epoch": 0.8462215862182839, + "grad_norm": 0.7771314978599548, + "learning_rate": 6.220326908427741e-06, + "loss": 0.7032, + "step": 15375 + }, + { + "epoch": 0.8462766250206396, + "grad_norm": 0.8215247392654419, + "learning_rate": 6.219906545785171e-06, + "loss": 0.8917, + "step": 15376 + }, + { + "epoch": 0.8463316638229952, + "grad_norm": 0.7277588248252869, + "learning_rate": 6.219486173974107e-06, + "loss": 0.7531, + "step": 15377 + }, + { + "epoch": 0.8463867026253509, + "grad_norm": 0.6487376093864441, + "learning_rate": 6.219065792997714e-06, + "loss": 0.7182, + "step": 15378 + }, + { + "epoch": 0.8464417414277066, + "grad_norm": 0.6960493326187134, + "learning_rate": 6.218645402859148e-06, + "loss": 0.8125, + "step": 15379 + }, + { + "epoch": 0.8464967802300621, + "grad_norm": 0.7183159589767456, + "learning_rate": 6.218225003561571e-06, + "loss": 0.6536, + "step": 15380 + }, + { + "epoch": 0.8465518190324178, + "grad_norm": 0.7001940011978149, + "learning_rate": 6.217804595108139e-06, + "loss": 0.8203, + "step": 15381 + }, + { + "epoch": 0.8466068578347735, + "grad_norm": 0.5986705422401428, + "learning_rate": 6.217384177502015e-06, + "loss": 0.6672, + "step": 15382 + }, + { + "epoch": 0.8466618966371292, + "grad_norm": 0.6191138029098511, + "learning_rate": 6.216963750746356e-06, + "loss": 0.6565, + "step": 15383 + }, + { + "epoch": 0.8467169354394848, + "grad_norm": 1.2927004098892212, + "learning_rate": 6.216543314844326e-06, + "loss": 0.7511, + "step": 15384 + }, + { + "epoch": 0.8467719742418405, + "grad_norm": 0.6715198159217834, + "learning_rate": 6.2161228697990785e-06, + "loss": 0.7712, + "step": 15385 + }, + { + "epoch": 0.8468270130441962, + "grad_norm": 0.7516033053398132, + "learning_rate": 6.215702415613778e-06, + "loss": 0.6595, + "step": 15386 + }, + { + "epoch": 0.8468820518465519, + "grad_norm": 0.6913008689880371, + "learning_rate": 6.215281952291585e-06, + "loss": 0.7262, + "step": 15387 + }, + { + "epoch": 0.8469370906489074, + "grad_norm": 0.7288102507591248, + "learning_rate": 6.214861479835657e-06, + "loss": 0.6628, + "step": 15388 + }, + { + "epoch": 0.8469921294512631, + "grad_norm": 0.7889914512634277, + "learning_rate": 6.214440998249155e-06, + "loss": 0.7744, + "step": 15389 + }, + { + "epoch": 0.8470471682536188, + "grad_norm": 0.7622396945953369, + "learning_rate": 6.21402050753524e-06, + "loss": 0.7818, + "step": 15390 + }, + { + "epoch": 0.8471022070559745, + "grad_norm": 0.6172721982002258, + "learning_rate": 6.213600007697072e-06, + "loss": 0.626, + "step": 15391 + }, + { + "epoch": 0.8471572458583301, + "grad_norm": 0.710991621017456, + "learning_rate": 6.213179498737812e-06, + "loss": 0.7313, + "step": 15392 + }, + { + "epoch": 0.8472122846606858, + "grad_norm": 0.660139262676239, + "learning_rate": 6.2127589806606195e-06, + "loss": 0.6479, + "step": 15393 + }, + { + "epoch": 0.8472673234630415, + "grad_norm": 0.6611735224723816, + "learning_rate": 6.2123384534686534e-06, + "loss": 0.7091, + "step": 15394 + }, + { + "epoch": 0.8473223622653971, + "grad_norm": 0.8392653465270996, + "learning_rate": 6.211917917165078e-06, + "loss": 0.8514, + "step": 15395 + }, + { + "epoch": 0.8473774010677527, + "grad_norm": 0.6202608942985535, + "learning_rate": 6.211497371753052e-06, + "loss": 0.7068, + "step": 15396 + }, + { + "epoch": 0.8474324398701084, + "grad_norm": 0.6785926818847656, + "learning_rate": 6.211076817235734e-06, + "loss": 0.7216, + "step": 15397 + }, + { + "epoch": 0.8474874786724641, + "grad_norm": 0.7234075665473938, + "learning_rate": 6.210656253616288e-06, + "loss": 0.7379, + "step": 15398 + }, + { + "epoch": 0.8475425174748198, + "grad_norm": 0.6223714351654053, + "learning_rate": 6.210235680897874e-06, + "loss": 0.758, + "step": 15399 + }, + { + "epoch": 0.8475975562771754, + "grad_norm": 0.7993804812431335, + "learning_rate": 6.209815099083651e-06, + "loss": 0.8174, + "step": 15400 + }, + { + "epoch": 0.8476525950795311, + "grad_norm": 0.7897897362709045, + "learning_rate": 6.209394508176783e-06, + "loss": 0.6833, + "step": 15401 + }, + { + "epoch": 0.8477076338818867, + "grad_norm": 0.6803291440010071, + "learning_rate": 6.208973908180429e-06, + "loss": 0.7977, + "step": 15402 + }, + { + "epoch": 0.8477626726842424, + "grad_norm": 0.6937161087989807, + "learning_rate": 6.208553299097751e-06, + "loss": 0.7118, + "step": 15403 + }, + { + "epoch": 0.847817711486598, + "grad_norm": 0.7939958572387695, + "learning_rate": 6.208132680931911e-06, + "loss": 0.794, + "step": 15404 + }, + { + "epoch": 0.8478727502889537, + "grad_norm": 0.7009061574935913, + "learning_rate": 6.207712053686068e-06, + "loss": 0.7534, + "step": 15405 + }, + { + "epoch": 0.8479277890913094, + "grad_norm": 0.6890555620193481, + "learning_rate": 6.207291417363384e-06, + "loss": 0.7638, + "step": 15406 + }, + { + "epoch": 0.8479828278936651, + "grad_norm": 0.677119255065918, + "learning_rate": 6.206870771967022e-06, + "loss": 0.6814, + "step": 15407 + }, + { + "epoch": 0.8480378666960207, + "grad_norm": 0.706792950630188, + "learning_rate": 6.2064501175001425e-06, + "loss": 0.7722, + "step": 15408 + }, + { + "epoch": 0.8480929054983763, + "grad_norm": 0.6590496897697449, + "learning_rate": 6.206029453965905e-06, + "loss": 0.772, + "step": 15409 + }, + { + "epoch": 0.848147944300732, + "grad_norm": 0.6821194887161255, + "learning_rate": 6.205608781367475e-06, + "loss": 0.7687, + "step": 15410 + }, + { + "epoch": 0.8482029831030877, + "grad_norm": 0.6030088663101196, + "learning_rate": 6.205188099708011e-06, + "loss": 0.6673, + "step": 15411 + }, + { + "epoch": 0.8482580219054433, + "grad_norm": 0.6877727508544922, + "learning_rate": 6.204767408990676e-06, + "loss": 0.756, + "step": 15412 + }, + { + "epoch": 0.848313060707799, + "grad_norm": 0.7107367515563965, + "learning_rate": 6.204346709218632e-06, + "loss": 0.7481, + "step": 15413 + }, + { + "epoch": 0.8483680995101547, + "grad_norm": 0.7213658094406128, + "learning_rate": 6.2039260003950395e-06, + "loss": 0.7135, + "step": 15414 + }, + { + "epoch": 0.8484231383125104, + "grad_norm": 0.7002324461936951, + "learning_rate": 6.203505282523063e-06, + "loss": 0.6768, + "step": 15415 + }, + { + "epoch": 0.848478177114866, + "grad_norm": 0.7483230829238892, + "learning_rate": 6.2030845556058614e-06, + "loss": 0.633, + "step": 15416 + }, + { + "epoch": 0.8485332159172216, + "grad_norm": 0.6701670289039612, + "learning_rate": 6.2026638196466e-06, + "loss": 0.7936, + "step": 15417 + }, + { + "epoch": 0.8485882547195773, + "grad_norm": 0.6940304636955261, + "learning_rate": 6.202243074648438e-06, + "loss": 0.7787, + "step": 15418 + }, + { + "epoch": 0.848643293521933, + "grad_norm": 0.5912098288536072, + "learning_rate": 6.20182232061454e-06, + "loss": 0.6458, + "step": 15419 + }, + { + "epoch": 0.8486983323242886, + "grad_norm": 0.6538116931915283, + "learning_rate": 6.201401557548066e-06, + "loss": 0.6986, + "step": 15420 + }, + { + "epoch": 0.8487533711266443, + "grad_norm": 1.0245170593261719, + "learning_rate": 6.20098078545218e-06, + "loss": 0.7111, + "step": 15421 + }, + { + "epoch": 0.848808409929, + "grad_norm": 0.6896708011627197, + "learning_rate": 6.200560004330043e-06, + "loss": 0.7921, + "step": 15422 + }, + { + "epoch": 0.8488634487313556, + "grad_norm": 0.6219936013221741, + "learning_rate": 6.2001392141848195e-06, + "loss": 0.7345, + "step": 15423 + }, + { + "epoch": 0.8489184875337112, + "grad_norm": 0.7418678998947144, + "learning_rate": 6.199718415019671e-06, + "loss": 0.8517, + "step": 15424 + }, + { + "epoch": 0.8489735263360669, + "grad_norm": 0.7002347111701965, + "learning_rate": 6.199297606837759e-06, + "loss": 0.7345, + "step": 15425 + }, + { + "epoch": 0.8490285651384226, + "grad_norm": 0.7004539966583252, + "learning_rate": 6.198876789642247e-06, + "loss": 0.7639, + "step": 15426 + }, + { + "epoch": 0.8490836039407782, + "grad_norm": 0.64945387840271, + "learning_rate": 6.1984559634362995e-06, + "loss": 0.7556, + "step": 15427 + }, + { + "epoch": 0.8491386427431339, + "grad_norm": 0.6660465598106384, + "learning_rate": 6.1980351282230764e-06, + "loss": 0.7342, + "step": 15428 + }, + { + "epoch": 0.8491936815454896, + "grad_norm": 0.6177669763565063, + "learning_rate": 6.197614284005743e-06, + "loss": 0.7092, + "step": 15429 + }, + { + "epoch": 0.8492487203478453, + "grad_norm": 0.7604618072509766, + "learning_rate": 6.197193430787462e-06, + "loss": 0.8271, + "step": 15430 + }, + { + "epoch": 0.8493037591502008, + "grad_norm": 0.6788204312324524, + "learning_rate": 6.196772568571394e-06, + "loss": 0.7817, + "step": 15431 + }, + { + "epoch": 0.8493587979525565, + "grad_norm": 0.6073753833770752, + "learning_rate": 6.196351697360704e-06, + "loss": 0.6479, + "step": 15432 + }, + { + "epoch": 0.8494138367549122, + "grad_norm": 0.6842348575592041, + "learning_rate": 6.195930817158555e-06, + "loss": 0.7956, + "step": 15433 + }, + { + "epoch": 0.8494688755572679, + "grad_norm": 0.7863163352012634, + "learning_rate": 6.19550992796811e-06, + "loss": 0.7441, + "step": 15434 + }, + { + "epoch": 0.8495239143596235, + "grad_norm": 0.7495602965354919, + "learning_rate": 6.195089029792532e-06, + "loss": 0.7854, + "step": 15435 + }, + { + "epoch": 0.8495789531619792, + "grad_norm": 0.6595779061317444, + "learning_rate": 6.194668122634986e-06, + "loss": 0.6705, + "step": 15436 + }, + { + "epoch": 0.8496339919643349, + "grad_norm": 0.7727940082550049, + "learning_rate": 6.194247206498633e-06, + "loss": 0.7269, + "step": 15437 + }, + { + "epoch": 0.8496890307666906, + "grad_norm": 0.7433161735534668, + "learning_rate": 6.193826281386639e-06, + "loss": 0.7747, + "step": 15438 + }, + { + "epoch": 0.8497440695690461, + "grad_norm": 0.7075695991516113, + "learning_rate": 6.193405347302165e-06, + "loss": 0.8423, + "step": 15439 + }, + { + "epoch": 0.8497991083714018, + "grad_norm": 0.8821007013320923, + "learning_rate": 6.192984404248377e-06, + "loss": 0.705, + "step": 15440 + }, + { + "epoch": 0.8498541471737575, + "grad_norm": 0.7283695936203003, + "learning_rate": 6.192563452228437e-06, + "loss": 0.7013, + "step": 15441 + }, + { + "epoch": 0.8499091859761132, + "grad_norm": 0.7810649275779724, + "learning_rate": 6.192142491245509e-06, + "loss": 0.8303, + "step": 15442 + }, + { + "epoch": 0.8499642247784688, + "grad_norm": 0.5930086374282837, + "learning_rate": 6.191721521302758e-06, + "loss": 0.7117, + "step": 15443 + }, + { + "epoch": 0.8500192635808245, + "grad_norm": 0.6570530533790588, + "learning_rate": 6.191300542403347e-06, + "loss": 0.7525, + "step": 15444 + }, + { + "epoch": 0.8500743023831802, + "grad_norm": 0.8024932146072388, + "learning_rate": 6.190879554550437e-06, + "loss": 0.8011, + "step": 15445 + }, + { + "epoch": 0.8501293411855358, + "grad_norm": 0.851327121257782, + "learning_rate": 6.190458557747199e-06, + "loss": 0.8117, + "step": 15446 + }, + { + "epoch": 0.8501843799878914, + "grad_norm": 0.816034197807312, + "learning_rate": 6.190037551996791e-06, + "loss": 0.6659, + "step": 15447 + }, + { + "epoch": 0.8502394187902471, + "grad_norm": 0.7001582980155945, + "learning_rate": 6.18961653730238e-06, + "loss": 0.7406, + "step": 15448 + }, + { + "epoch": 0.8502944575926028, + "grad_norm": 0.6798322200775146, + "learning_rate": 6.189195513667129e-06, + "loss": 0.7504, + "step": 15449 + }, + { + "epoch": 0.8503494963949585, + "grad_norm": 0.6565585136413574, + "learning_rate": 6.188774481094203e-06, + "loss": 0.6445, + "step": 15450 + }, + { + "epoch": 0.8504045351973141, + "grad_norm": 0.674721360206604, + "learning_rate": 6.188353439586767e-06, + "loss": 0.6718, + "step": 15451 + }, + { + "epoch": 0.8504595739996698, + "grad_norm": 0.7626152634620667, + "learning_rate": 6.187932389147984e-06, + "loss": 0.7273, + "step": 15452 + }, + { + "epoch": 0.8505146128020254, + "grad_norm": 0.6497740149497986, + "learning_rate": 6.18751132978102e-06, + "loss": 0.7619, + "step": 15453 } ], "logging_steps": 1, @@ -101834,7 +108197,7 @@ "attributes": {} } }, - "total_flos": 4.292023542595191e+19, + "total_flos": 4.56027501400739e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null