{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1495275725820306, "eval_steps": 150, "global_step": 272, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005497337227280536, "grad_norm": 3.5206565856933594, "learning_rate": 2e-05, "loss": 1.6048, "step": 1 }, { "epoch": 0.0005497337227280536, "eval_loss": 1.6324800252914429, "eval_runtime": 780.3054, "eval_samples_per_second": 7.853, "eval_steps_per_second": 1.963, "step": 1 }, { "epoch": 0.0010994674454561071, "grad_norm": 3.200124502182007, "learning_rate": 4e-05, "loss": 1.4684, "step": 2 }, { "epoch": 0.0016492011681841607, "grad_norm": 2.607300043106079, "learning_rate": 6e-05, "loss": 1.5521, "step": 3 }, { "epoch": 0.0021989348909122143, "grad_norm": 3.1669507026672363, "learning_rate": 8e-05, "loss": 1.3427, "step": 4 }, { "epoch": 0.002748668613640268, "grad_norm": 1.7303417921066284, "learning_rate": 0.0001, "loss": 1.1477, "step": 5 }, { "epoch": 0.0032984023363683214, "grad_norm": 1.311330795288086, "learning_rate": 0.00012, "loss": 1.1286, "step": 6 }, { "epoch": 0.003848136059096375, "grad_norm": 1.310185432434082, "learning_rate": 0.00014, "loss": 0.988, "step": 7 }, { "epoch": 0.0043978697818244285, "grad_norm": 1.3487765789031982, "learning_rate": 0.00016, "loss": 0.9608, "step": 8 }, { "epoch": 0.004947603504552482, "grad_norm": 1.1397814750671387, "learning_rate": 0.00018, "loss": 0.8497, "step": 9 }, { "epoch": 0.005497337227280536, "grad_norm": 1.165784239768982, "learning_rate": 0.0002, "loss": 0.9448, "step": 10 }, { "epoch": 0.00604707095000859, "grad_norm": 1.1118385791778564, "learning_rate": 0.00019999281110792807, "loss": 0.9032, "step": 11 }, { "epoch": 0.006596804672736643, "grad_norm": 0.9284569621086121, "learning_rate": 0.0001999712454653157, "loss": 0.7939, "step": 12 }, { "epoch": 0.007146538395464697, "grad_norm": 0.8805456161499023, "learning_rate": 0.00019993530617282436, "loss": 0.7904, "step": 13 }, { "epoch": 0.00769627211819275, "grad_norm": 1.137610912322998, "learning_rate": 0.00019988499839772804, "loss": 0.8537, "step": 14 }, { "epoch": 0.008246005840920804, "grad_norm": 0.9163579940795898, "learning_rate": 0.00019982032937316998, "loss": 0.8835, "step": 15 }, { "epoch": 0.008795739563648857, "grad_norm": 0.8768914341926575, "learning_rate": 0.000199741308397123, "loss": 0.8384, "step": 16 }, { "epoch": 0.009345473286376912, "grad_norm": 0.9132257103919983, "learning_rate": 0.0001996479468310524, "loss": 0.8275, "step": 17 }, { "epoch": 0.009895207009104965, "grad_norm": 0.8939024806022644, "learning_rate": 0.00019954025809828266, "loss": 0.8497, "step": 18 }, { "epoch": 0.010444940731833018, "grad_norm": 1.0300184488296509, "learning_rate": 0.0001994182576820673, "loss": 0.7272, "step": 19 }, { "epoch": 0.010994674454561072, "grad_norm": 0.7796829342842102, "learning_rate": 0.00019928196312336285, "loss": 0.7691, "step": 20 }, { "epoch": 0.011544408177289125, "grad_norm": 0.8311827182769775, "learning_rate": 0.00019913139401830674, "loss": 0.7215, "step": 21 }, { "epoch": 0.01209414190001718, "grad_norm": 0.8189563751220703, "learning_rate": 0.0001989665720153999, "loss": 0.7394, "step": 22 }, { "epoch": 0.012643875622745233, "grad_norm": 0.9236388802528381, "learning_rate": 0.0001987875208123941, "loss": 0.796, "step": 23 }, { "epoch": 0.013193609345473286, "grad_norm": 0.9475937485694885, "learning_rate": 0.00019859426615288488, "loss": 0.7981, "step": 24 }, { "epoch": 0.01374334306820134, "grad_norm": 0.9530321955680847, "learning_rate": 0.00019838683582260993, "loss": 0.7842, "step": 25 }, { "epoch": 0.014293076790929393, "grad_norm": 0.8120668530464172, "learning_rate": 0.00019816525964545448, "loss": 0.7178, "step": 26 }, { "epoch": 0.014842810513657448, "grad_norm": 0.7486498355865479, "learning_rate": 0.00019792956947916292, "loss": 0.7088, "step": 27 }, { "epoch": 0.0153925442363855, "grad_norm": 0.8260335326194763, "learning_rate": 0.00019767979921075866, "loss": 0.7598, "step": 28 }, { "epoch": 0.015942277959113554, "grad_norm": 0.7146168947219849, "learning_rate": 0.00019741598475167175, "loss": 0.7017, "step": 29 }, { "epoch": 0.01649201168184161, "grad_norm": 0.8072403073310852, "learning_rate": 0.0001971381640325756, "loss": 0.7983, "step": 30 }, { "epoch": 0.017041745404569663, "grad_norm": 0.8120503425598145, "learning_rate": 0.00019684637699793358, "loss": 0.7222, "step": 31 }, { "epoch": 0.017591479127297714, "grad_norm": 0.8462120294570923, "learning_rate": 0.00019654066560025567, "loss": 0.799, "step": 32 }, { "epoch": 0.01814121285002577, "grad_norm": 0.8152217268943787, "learning_rate": 0.00019622107379406667, "loss": 0.7261, "step": 33 }, { "epoch": 0.018690946572753823, "grad_norm": 0.8320972323417664, "learning_rate": 0.00019588764752958668, "loss": 0.8056, "step": 34 }, { "epoch": 0.019240680295481875, "grad_norm": 0.8976477384567261, "learning_rate": 0.0001955404347461243, "loss": 0.764, "step": 35 }, { "epoch": 0.01979041401820993, "grad_norm": 0.7760395407676697, "learning_rate": 0.000195179485365184, "loss": 0.7497, "step": 36 }, { "epoch": 0.020340147740937984, "grad_norm": 0.812126874923706, "learning_rate": 0.00019480485128328868, "loss": 0.6858, "step": 37 }, { "epoch": 0.020889881463666035, "grad_norm": 0.7526835203170776, "learning_rate": 0.00019441658636451794, "loss": 0.7634, "step": 38 }, { "epoch": 0.02143961518639409, "grad_norm": 0.8227682709693909, "learning_rate": 0.0001940147464327637, "loss": 0.8038, "step": 39 }, { "epoch": 0.021989348909122144, "grad_norm": 0.7662093043327332, "learning_rate": 0.000193599389263704, "loss": 0.7568, "step": 40 }, { "epoch": 0.0225390826318502, "grad_norm": 0.8277942538261414, "learning_rate": 0.000193170574576496, "loss": 0.703, "step": 41 }, { "epoch": 0.02308881635457825, "grad_norm": 0.8513416647911072, "learning_rate": 0.0001927283640251898, "loss": 0.7764, "step": 42 }, { "epoch": 0.023638550077306305, "grad_norm": 0.7439628839492798, "learning_rate": 0.00019227282118986394, "loss": 0.7399, "step": 43 }, { "epoch": 0.02418828380003436, "grad_norm": 0.7977442145347595, "learning_rate": 0.00019180401156748396, "loss": 0.7181, "step": 44 }, { "epoch": 0.02473801752276241, "grad_norm": 0.7685129642486572, "learning_rate": 0.0001913220025624854, "loss": 0.7703, "step": 45 }, { "epoch": 0.025287751245490465, "grad_norm": 0.7644825577735901, "learning_rate": 0.00019082686347708254, "loss": 0.7998, "step": 46 }, { "epoch": 0.02583748496821852, "grad_norm": 0.8463825583457947, "learning_rate": 0.00019031866550130438, "loss": 0.8169, "step": 47 }, { "epoch": 0.02638721869094657, "grad_norm": 0.7918380498886108, "learning_rate": 0.0001897974817027588, "loss": 0.7363, "step": 48 }, { "epoch": 0.026936952413674626, "grad_norm": 0.8936023712158203, "learning_rate": 0.00018926338701612738, "loss": 0.7442, "step": 49 }, { "epoch": 0.02748668613640268, "grad_norm": 0.7773414254188538, "learning_rate": 0.00018871645823239128, "loss": 0.7424, "step": 50 }, { "epoch": 0.028036419859130735, "grad_norm": 0.762144923210144, "learning_rate": 0.00018815677398779048, "loss": 0.6575, "step": 51 }, { "epoch": 0.028586153581858786, "grad_norm": 0.8487825989723206, "learning_rate": 0.00018758441475251754, "loss": 0.7957, "step": 52 }, { "epoch": 0.02913588730458684, "grad_norm": 0.773217499256134, "learning_rate": 0.0001869994628191478, "loss": 0.7037, "step": 53 }, { "epoch": 0.029685621027314896, "grad_norm": 0.7514662742614746, "learning_rate": 0.00018640200229080763, "loss": 0.7296, "step": 54 }, { "epoch": 0.030235354750042947, "grad_norm": 0.7621944546699524, "learning_rate": 0.00018579211906908215, "loss": 0.7017, "step": 55 }, { "epoch": 0.030785088472771, "grad_norm": 0.7579927444458008, "learning_rate": 0.00018516990084166442, "loss": 0.7727, "step": 56 }, { "epoch": 0.03133482219549905, "grad_norm": 0.7203779220581055, "learning_rate": 0.0001845354370697482, "loss": 0.7481, "step": 57 }, { "epoch": 0.03188455591822711, "grad_norm": 0.8017358183860779, "learning_rate": 0.000183888818975165, "loss": 0.7528, "step": 58 }, { "epoch": 0.03243428964095516, "grad_norm": 0.7234224677085876, "learning_rate": 0.00018323013952726875, "loss": 0.7355, "step": 59 }, { "epoch": 0.03298402336368322, "grad_norm": 0.8994983434677124, "learning_rate": 0.00018255949342956863, "loss": 0.7944, "step": 60 }, { "epoch": 0.03353375708641127, "grad_norm": 0.8010777235031128, "learning_rate": 0.00018187697710611298, "loss": 0.6973, "step": 61 }, { "epoch": 0.034083490809139326, "grad_norm": 0.7887091040611267, "learning_rate": 0.00018118268868762546, "loss": 0.7219, "step": 62 }, { "epoch": 0.034633224531867374, "grad_norm": 0.764696478843689, "learning_rate": 0.00018047672799739628, "loss": 0.6921, "step": 63 }, { "epoch": 0.03518295825459543, "grad_norm": 0.7982161045074463, "learning_rate": 0.0001797591965369296, "loss": 0.7621, "step": 64 }, { "epoch": 0.03573269197732348, "grad_norm": 0.8482686877250671, "learning_rate": 0.00017903019747134998, "loss": 0.7742, "step": 65 }, { "epoch": 0.03628242570005154, "grad_norm": 0.7955244779586792, "learning_rate": 0.00017828983561456941, "loss": 0.6882, "step": 66 }, { "epoch": 0.03683215942277959, "grad_norm": 0.8099084496498108, "learning_rate": 0.00017753821741421769, "loss": 0.6858, "step": 67 }, { "epoch": 0.03738189314550765, "grad_norm": 0.7965316772460938, "learning_rate": 0.00017677545093633713, "loss": 0.756, "step": 68 }, { "epoch": 0.0379316268682357, "grad_norm": 0.74765545129776, "learning_rate": 0.00017600164584984546, "loss": 0.7614, "step": 69 }, { "epoch": 0.03848136059096375, "grad_norm": 0.7999709248542786, "learning_rate": 0.00017521691341076774, "loss": 0.7523, "step": 70 }, { "epoch": 0.039031094313691804, "grad_norm": 0.754521906375885, "learning_rate": 0.00017442136644624015, "loss": 0.7073, "step": 71 }, { "epoch": 0.03958082803641986, "grad_norm": 0.7447445392608643, "learning_rate": 0.00017361511933828801, "loss": 0.7291, "step": 72 }, { "epoch": 0.04013056175914791, "grad_norm": 0.7177430391311646, "learning_rate": 0.00017279828800738017, "loss": 0.6392, "step": 73 }, { "epoch": 0.04068029548187597, "grad_norm": 0.7754899859428406, "learning_rate": 0.00017197098989576222, "loss": 0.7301, "step": 74 }, { "epoch": 0.04123002920460402, "grad_norm": 0.8475909233093262, "learning_rate": 0.00017113334395057087, "loss": 0.7958, "step": 75 }, { "epoch": 0.04177976292733207, "grad_norm": 0.7996639609336853, "learning_rate": 0.000170285470606732, "loss": 0.7139, "step": 76 }, { "epoch": 0.042329496650060125, "grad_norm": 0.8424475789070129, "learning_rate": 0.0001694274917696448, "loss": 0.6856, "step": 77 }, { "epoch": 0.04287923037278818, "grad_norm": 0.8987585306167603, "learning_rate": 0.00016855953079765448, "loss": 0.6983, "step": 78 }, { "epoch": 0.043428964095516234, "grad_norm": 0.8693024516105652, "learning_rate": 0.00016768171248431602, "loss": 0.7346, "step": 79 }, { "epoch": 0.04397869781824429, "grad_norm": 0.8143675923347473, "learning_rate": 0.0001667941630404517, "loss": 0.731, "step": 80 }, { "epoch": 0.044528431540972344, "grad_norm": 0.77469402551651, "learning_rate": 0.00016589701007600476, "loss": 0.64, "step": 81 }, { "epoch": 0.0450781652637004, "grad_norm": 0.8193095922470093, "learning_rate": 0.0001649903825816918, "loss": 0.678, "step": 82 }, { "epoch": 0.045627898986428446, "grad_norm": 0.8982542753219604, "learning_rate": 0.00016407441091045706, "loss": 0.7889, "step": 83 }, { "epoch": 0.0461776327091565, "grad_norm": 0.7841023206710815, "learning_rate": 0.0001631492267587301, "loss": 0.6793, "step": 84 }, { "epoch": 0.046727366431884555, "grad_norm": 0.7533169984817505, "learning_rate": 0.0001622149631474913, "loss": 0.6684, "step": 85 }, { "epoch": 0.04727710015461261, "grad_norm": 0.8421821594238281, "learning_rate": 0.00016127175440314596, "loss": 0.7266, "step": 86 }, { "epoch": 0.047826833877340665, "grad_norm": 0.7997047305107117, "learning_rate": 0.0001603197361382114, "loss": 0.6937, "step": 87 }, { "epoch": 0.04837656760006872, "grad_norm": 0.7438738346099854, "learning_rate": 0.0001593590452318187, "loss": 0.6882, "step": 88 }, { "epoch": 0.04892630132279677, "grad_norm": 0.8511331081390381, "learning_rate": 0.00015838981981003273, "loss": 0.7752, "step": 89 }, { "epoch": 0.04947603504552482, "grad_norm": 0.7980754375457764, "learning_rate": 0.00015741219922599253, "loss": 0.7558, "step": 90 }, { "epoch": 0.050025768768252876, "grad_norm": 0.8246051669120789, "learning_rate": 0.00015642632403987535, "loss": 0.7416, "step": 91 }, { "epoch": 0.05057550249098093, "grad_norm": 0.8296177983283997, "learning_rate": 0.00015543233599868742, "loss": 0.7666, "step": 92 }, { "epoch": 0.051125236213708986, "grad_norm": 0.8095186352729797, "learning_rate": 0.0001544303780158837, "loss": 0.8564, "step": 93 }, { "epoch": 0.05167496993643704, "grad_norm": 0.7858137488365173, "learning_rate": 0.0001534205941508202, "loss": 0.7686, "step": 94 }, { "epoch": 0.052224703659165095, "grad_norm": 0.7376622557640076, "learning_rate": 0.00015240312958804132, "loss": 0.7613, "step": 95 }, { "epoch": 0.05277443738189314, "grad_norm": 0.7384364008903503, "learning_rate": 0.00015137813061640563, "loss": 0.7409, "step": 96 }, { "epoch": 0.0533241711046212, "grad_norm": 0.7371678948402405, "learning_rate": 0.00015034574460805279, "loss": 0.7219, "step": 97 }, { "epoch": 0.05387390482734925, "grad_norm": 0.7421817779541016, "learning_rate": 0.00014930611999721457, "loss": 0.6619, "step": 98 }, { "epoch": 0.054423638550077306, "grad_norm": 0.8186623454093933, "learning_rate": 0.00014825940625887342, "loss": 0.7793, "step": 99 }, { "epoch": 0.05497337227280536, "grad_norm": 0.7645162343978882, "learning_rate": 0.00014720575388727132, "loss": 0.7038, "step": 100 }, { "epoch": 0.055523105995533416, "grad_norm": 0.8227371573448181, "learning_rate": 0.0001461453143742718, "loss": 0.7015, "step": 101 }, { "epoch": 0.05607283971826147, "grad_norm": 0.782328188419342, "learning_rate": 0.00014507824018757906, "loss": 0.7674, "step": 102 }, { "epoch": 0.05662257344098952, "grad_norm": 0.8299933075904846, "learning_rate": 0.0001440046847488163, "loss": 0.7426, "step": 103 }, { "epoch": 0.05717230716371757, "grad_norm": 0.8031938672065735, "learning_rate": 0.00014292480241146716, "loss": 0.6522, "step": 104 }, { "epoch": 0.05772204088644563, "grad_norm": 0.8313961625099182, "learning_rate": 0.00014183874843868313, "loss": 0.7357, "step": 105 }, { "epoch": 0.05827177460917368, "grad_norm": 0.8558992147445679, "learning_rate": 0.0001407466789809601, "loss": 0.713, "step": 106 }, { "epoch": 0.05882150833190174, "grad_norm": 0.8198270797729492, "learning_rate": 0.0001396487510536874, "loss": 0.7919, "step": 107 }, { "epoch": 0.05937124205462979, "grad_norm": 0.7915957570075989, "learning_rate": 0.00013854512251457247, "loss": 0.7017, "step": 108 }, { "epoch": 0.05992097577735784, "grad_norm": 0.7795407772064209, "learning_rate": 0.0001374359520409444, "loss": 0.7519, "step": 109 }, { "epoch": 0.060470709500085894, "grad_norm": 0.7501072287559509, "learning_rate": 0.0001363213991069397, "loss": 0.7338, "step": 110 }, { "epoch": 0.06102044322281395, "grad_norm": 0.7231119871139526, "learning_rate": 0.00013520162396057342, "loss": 0.6933, "step": 111 }, { "epoch": 0.061570176945542, "grad_norm": 0.7336615324020386, "learning_rate": 0.00013407678760069891, "loss": 0.6773, "step": 112 }, { "epoch": 0.06211991066827006, "grad_norm": 0.7293228507041931, "learning_rate": 0.00013294705175386003, "loss": 0.6965, "step": 113 }, { "epoch": 0.0626696443909981, "grad_norm": 0.7496985197067261, "learning_rate": 0.00013181257885103818, "loss": 0.6531, "step": 114 }, { "epoch": 0.06321937811372616, "grad_norm": 0.7169116735458374, "learning_rate": 0.00013067353200429857, "loss": 0.6802, "step": 115 }, { "epoch": 0.06376911183645421, "grad_norm": 0.7333677411079407, "learning_rate": 0.00012953007498333808, "loss": 0.7468, "step": 116 }, { "epoch": 0.06431884555918227, "grad_norm": 0.7618181705474854, "learning_rate": 0.00012838237219193896, "loss": 0.7403, "step": 117 }, { "epoch": 0.06486857928191032, "grad_norm": 0.8433738350868225, "learning_rate": 0.00012723058864433118, "loss": 0.7453, "step": 118 }, { "epoch": 0.06541831300463838, "grad_norm": 0.7351827621459961, "learning_rate": 0.00012607488994146704, "loss": 0.693, "step": 119 }, { "epoch": 0.06596804672736643, "grad_norm": 0.7342813014984131, "learning_rate": 0.00012491544224721136, "loss": 0.6954, "step": 120 }, { "epoch": 0.06651778045009449, "grad_norm": 0.7844246625900269, "learning_rate": 0.00012375241226445088, "loss": 0.6836, "step": 121 }, { "epoch": 0.06706751417282254, "grad_norm": 0.7780027389526367, "learning_rate": 0.00012258596721112608, "loss": 0.6995, "step": 122 }, { "epoch": 0.0676172478955506, "grad_norm": 0.7621326446533203, "learning_rate": 0.00012141627479618885, "loss": 0.7148, "step": 123 }, { "epoch": 0.06816698161827865, "grad_norm": 0.7723284959793091, "learning_rate": 0.00012024350319548976, "loss": 0.6942, "step": 124 }, { "epoch": 0.0687167153410067, "grad_norm": 0.778733491897583, "learning_rate": 0.00011906782102759808, "loss": 0.672, "step": 125 }, { "epoch": 0.06926644906373475, "grad_norm": 0.769509494304657, "learning_rate": 0.0001178893973295581, "loss": 0.6919, "step": 126 }, { "epoch": 0.0698161827864628, "grad_norm": 0.7939193248748779, "learning_rate": 0.00011670840153258547, "loss": 0.735, "step": 127 }, { "epoch": 0.07036591650919086, "grad_norm": 0.7830147743225098, "learning_rate": 0.00011552500343770658, "loss": 0.6858, "step": 128 }, { "epoch": 0.07091565023191891, "grad_norm": 0.8524277806282043, "learning_rate": 0.00011433937319134511, "loss": 0.7311, "step": 129 }, { "epoch": 0.07146538395464697, "grad_norm": 0.7894855737686157, "learning_rate": 0.00011315168126085857, "loss": 0.741, "step": 130 }, { "epoch": 0.07201511767737502, "grad_norm": 0.8021276593208313, "learning_rate": 0.00011196209841002909, "loss": 0.6682, "step": 131 }, { "epoch": 0.07256485140010308, "grad_norm": 0.8221745491027832, "learning_rate": 0.00011077079567451111, "loss": 0.6638, "step": 132 }, { "epoch": 0.07311458512283113, "grad_norm": 0.756178617477417, "learning_rate": 0.00010957794433724051, "loss": 0.6842, "step": 133 }, { "epoch": 0.07366431884555918, "grad_norm": 0.7691609859466553, "learning_rate": 0.00010838371590380765, "loss": 0.6807, "step": 134 }, { "epoch": 0.07421405256828724, "grad_norm": 0.7821537852287292, "learning_rate": 0.00010718828207779894, "loss": 0.7022, "step": 135 }, { "epoch": 0.0747637862910153, "grad_norm": 0.7772252559661865, "learning_rate": 0.0001059918147361094, "loss": 0.7267, "step": 136 }, { "epoch": 0.07531352001374335, "grad_norm": 0.7883121371269226, "learning_rate": 0.00010479448590423082, "loss": 0.743, "step": 137 }, { "epoch": 0.0758632537364714, "grad_norm": 0.7632911801338196, "learning_rate": 0.00010359646773151814, "loss": 0.7131, "step": 138 }, { "epoch": 0.07641298745919944, "grad_norm": 0.7260380387306213, "learning_rate": 0.00010239793246643819, "loss": 0.6593, "step": 139 }, { "epoch": 0.0769627211819275, "grad_norm": 0.7732318043708801, "learning_rate": 0.00010119905243180432, "loss": 0.6847, "step": 140 }, { "epoch": 0.07751245490465555, "grad_norm": 0.7376705408096313, "learning_rate": 0.0001, "loss": 0.6245, "step": 141 }, { "epoch": 0.07806218862738361, "grad_norm": 0.8448079228401184, "learning_rate": 9.880094756819572e-05, "loss": 0.7015, "step": 142 }, { "epoch": 0.07861192235011166, "grad_norm": 0.7295001149177551, "learning_rate": 9.760206753356184e-05, "loss": 0.6688, "step": 143 }, { "epoch": 0.07916165607283972, "grad_norm": 0.7648081183433533, "learning_rate": 9.64035322684819e-05, "loss": 0.7318, "step": 144 }, { "epoch": 0.07971138979556777, "grad_norm": 0.7527865767478943, "learning_rate": 9.520551409576919e-05, "loss": 0.7139, "step": 145 }, { "epoch": 0.08026112351829583, "grad_norm": 0.751616358757019, "learning_rate": 9.400818526389063e-05, "loss": 0.6713, "step": 146 }, { "epoch": 0.08081085724102388, "grad_norm": 0.7490566372871399, "learning_rate": 9.281171792220107e-05, "loss": 0.6595, "step": 147 }, { "epoch": 0.08136059096375194, "grad_norm": 0.7755638360977173, "learning_rate": 9.161628409619236e-05, "loss": 0.676, "step": 148 }, { "epoch": 0.08191032468647999, "grad_norm": 0.714603841304779, "learning_rate": 9.042205566275951e-05, "loss": 0.6564, "step": 149 }, { "epoch": 0.08246005840920805, "grad_norm": 0.738848090171814, "learning_rate": 8.92292043254889e-05, "loss": 0.644, "step": 150 }, { "epoch": 0.08246005840920805, "eval_loss": 0.683157205581665, "eval_runtime": 783.195, "eval_samples_per_second": 7.824, "eval_steps_per_second": 1.956, "step": 150 }, { "epoch": 0.0830097921319361, "grad_norm": 0.7367557287216187, "learning_rate": 8.803790158997095e-05, "loss": 0.6731, "step": 151 }, { "epoch": 0.08355952585466414, "grad_norm": 0.7581673860549927, "learning_rate": 8.684831873914145e-05, "loss": 0.6878, "step": 152 }, { "epoch": 0.0841092595773922, "grad_norm": 0.7462360858917236, "learning_rate": 8.566062680865494e-05, "loss": 0.6606, "step": 153 }, { "epoch": 0.08465899330012025, "grad_norm": 0.7873330116271973, "learning_rate": 8.447499656229344e-05, "loss": 0.746, "step": 154 }, { "epoch": 0.0852087270228483, "grad_norm": 0.7316953539848328, "learning_rate": 8.329159846741457e-05, "loss": 0.6985, "step": 155 }, { "epoch": 0.08575846074557636, "grad_norm": 0.7993731498718262, "learning_rate": 8.211060267044191e-05, "loss": 0.6972, "step": 156 }, { "epoch": 0.08630819446830441, "grad_norm": 0.7711489200592041, "learning_rate": 8.093217897240195e-05, "loss": 0.698, "step": 157 }, { "epoch": 0.08685792819103247, "grad_norm": 0.7824850678443909, "learning_rate": 7.975649680451024e-05, "loss": 0.7373, "step": 158 }, { "epoch": 0.08740766191376052, "grad_norm": 0.7459863424301147, "learning_rate": 7.858372520381119e-05, "loss": 0.707, "step": 159 }, { "epoch": 0.08795739563648858, "grad_norm": 0.7161909341812134, "learning_rate": 7.741403278887397e-05, "loss": 0.6568, "step": 160 }, { "epoch": 0.08850712935921663, "grad_norm": 0.7749399542808533, "learning_rate": 7.624758773554914e-05, "loss": 0.7019, "step": 161 }, { "epoch": 0.08905686308194469, "grad_norm": 0.7229331731796265, "learning_rate": 7.508455775278867e-05, "loss": 0.7358, "step": 162 }, { "epoch": 0.08960659680467274, "grad_norm": 0.7543591260910034, "learning_rate": 7.392511005853297e-05, "loss": 0.7449, "step": 163 }, { "epoch": 0.0901563305274008, "grad_norm": 0.7939632534980774, "learning_rate": 7.276941135566884e-05, "loss": 0.7764, "step": 164 }, { "epoch": 0.09070606425012884, "grad_norm": 0.7263380885124207, "learning_rate": 7.161762780806103e-05, "loss": 0.6929, "step": 165 }, { "epoch": 0.09125579797285689, "grad_norm": 0.7231655120849609, "learning_rate": 7.046992501666195e-05, "loss": 0.683, "step": 166 }, { "epoch": 0.09180553169558495, "grad_norm": 0.7676687240600586, "learning_rate": 6.932646799570144e-05, "loss": 0.6371, "step": 167 }, { "epoch": 0.092355265418313, "grad_norm": 0.7562560439109802, "learning_rate": 6.818742114896184e-05, "loss": 0.6995, "step": 168 }, { "epoch": 0.09290499914104106, "grad_norm": 0.7539961934089661, "learning_rate": 6.705294824614004e-05, "loss": 0.6689, "step": 169 }, { "epoch": 0.09345473286376911, "grad_norm": 0.7321000099182129, "learning_rate": 6.592321239930112e-05, "loss": 0.7316, "step": 170 }, { "epoch": 0.09400446658649717, "grad_norm": 0.7690988183021545, "learning_rate": 6.479837603942665e-05, "loss": 0.7074, "step": 171 }, { "epoch": 0.09455420030922522, "grad_norm": 0.7960511445999146, "learning_rate": 6.367860089306028e-05, "loss": 0.6801, "step": 172 }, { "epoch": 0.09510393403195327, "grad_norm": 0.7848622798919678, "learning_rate": 6.256404795905561e-05, "loss": 0.6983, "step": 173 }, { "epoch": 0.09565366775468133, "grad_norm": 0.7471621632575989, "learning_rate": 6.145487748542753e-05, "loss": 0.7292, "step": 174 }, { "epoch": 0.09620340147740938, "grad_norm": 0.8460478186607361, "learning_rate": 6.035124894631263e-05, "loss": 0.6947, "step": 175 }, { "epoch": 0.09675313520013744, "grad_norm": 0.740354597568512, "learning_rate": 5.925332101903994e-05, "loss": 0.7619, "step": 176 }, { "epoch": 0.09730286892286549, "grad_norm": 0.7667998671531677, "learning_rate": 5.816125156131691e-05, "loss": 0.6886, "step": 177 }, { "epoch": 0.09785260264559353, "grad_norm": 0.7397231459617615, "learning_rate": 5.707519758853288e-05, "loss": 0.7408, "step": 178 }, { "epoch": 0.09840233636832159, "grad_norm": 0.7253665924072266, "learning_rate": 5.5995315251183734e-05, "loss": 0.6718, "step": 179 }, { "epoch": 0.09895207009104964, "grad_norm": 0.6998276710510254, "learning_rate": 5.492175981242097e-05, "loss": 0.6089, "step": 180 }, { "epoch": 0.0995018038137777, "grad_norm": 0.7730736136436462, "learning_rate": 5.385468562572823e-05, "loss": 0.6997, "step": 181 }, { "epoch": 0.10005153753650575, "grad_norm": 0.7921561002731323, "learning_rate": 5.279424611272873e-05, "loss": 0.6635, "step": 182 }, { "epoch": 0.10060127125923381, "grad_norm": 0.7567150592803955, "learning_rate": 5.174059374112657e-05, "loss": 0.663, "step": 183 }, { "epoch": 0.10115100498196186, "grad_norm": 0.7531036138534546, "learning_rate": 5.0693880002785456e-05, "loss": 0.7741, "step": 184 }, { "epoch": 0.10170073870468992, "grad_norm": 0.7522636651992798, "learning_rate": 4.965425539194726e-05, "loss": 0.6726, "step": 185 }, { "epoch": 0.10225047242741797, "grad_norm": 0.7481539249420166, "learning_rate": 4.8621869383594406e-05, "loss": 0.6576, "step": 186 }, { "epoch": 0.10280020615014603, "grad_norm": 0.7241939902305603, "learning_rate": 4.759687041195874e-05, "loss": 0.7217, "step": 187 }, { "epoch": 0.10334993987287408, "grad_norm": 0.8103671073913574, "learning_rate": 4.657940584917983e-05, "loss": 0.7514, "step": 188 }, { "epoch": 0.10389967359560214, "grad_norm": 0.6993573307991028, "learning_rate": 4.556962198411631e-05, "loss": 0.7305, "step": 189 }, { "epoch": 0.10444940731833019, "grad_norm": 0.758790135383606, "learning_rate": 4.45676640013126e-05, "loss": 0.7046, "step": 190 }, { "epoch": 0.10499914104105824, "grad_norm": 0.7289198637008667, "learning_rate": 4.3573675960124684e-05, "loss": 0.6688, "step": 191 }, { "epoch": 0.10554887476378629, "grad_norm": 0.72430819272995, "learning_rate": 4.258780077400748e-05, "loss": 0.6795, "step": 192 }, { "epoch": 0.10609860848651434, "grad_norm": 0.746095597743988, "learning_rate": 4.161018018996727e-05, "loss": 0.6936, "step": 193 }, { "epoch": 0.1066483422092424, "grad_norm": 0.6850857138633728, "learning_rate": 4.064095476818133e-05, "loss": 0.6533, "step": 194 }, { "epoch": 0.10719807593197045, "grad_norm": 0.7202500700950623, "learning_rate": 3.968026386178867e-05, "loss": 0.6103, "step": 195 }, { "epoch": 0.1077478096546985, "grad_norm": 0.6932888627052307, "learning_rate": 3.87282455968541e-05, "loss": 0.6562, "step": 196 }, { "epoch": 0.10829754337742656, "grad_norm": 0.7929200530052185, "learning_rate": 3.778503685250873e-05, "loss": 0.7782, "step": 197 }, { "epoch": 0.10884727710015461, "grad_norm": 0.7463613152503967, "learning_rate": 3.685077324126992e-05, "loss": 0.6788, "step": 198 }, { "epoch": 0.10939701082288267, "grad_norm": 0.7823365330696106, "learning_rate": 3.592558908954295e-05, "loss": 0.6377, "step": 199 }, { "epoch": 0.10994674454561072, "grad_norm": 0.7536443471908569, "learning_rate": 3.500961741830821e-05, "loss": 0.6895, "step": 200 }, { "epoch": 0.11049647826833878, "grad_norm": 0.7127309441566467, "learning_rate": 3.410298992399524e-05, "loss": 0.6603, "step": 201 }, { "epoch": 0.11104621199106683, "grad_norm": 0.7310481667518616, "learning_rate": 3.3205836959548296e-05, "loss": 0.7008, "step": 202 }, { "epoch": 0.11159594571379489, "grad_norm": 0.714931309223175, "learning_rate": 3.231828751568401e-05, "loss": 0.6391, "step": 203 }, { "epoch": 0.11214567943652294, "grad_norm": 0.7373867630958557, "learning_rate": 3.144046920234553e-05, "loss": 0.6814, "step": 204 }, { "epoch": 0.11269541315925098, "grad_norm": 0.795272171497345, "learning_rate": 3.0572508230355246e-05, "loss": 0.7655, "step": 205 }, { "epoch": 0.11324514688197904, "grad_norm": 0.7341108918190002, "learning_rate": 2.971452939326802e-05, "loss": 0.6816, "step": 206 }, { "epoch": 0.11379488060470709, "grad_norm": 0.7305004000663757, "learning_rate": 2.8866656049429162e-05, "loss": 0.7035, "step": 207 }, { "epoch": 0.11434461432743515, "grad_norm": 0.707233190536499, "learning_rate": 2.8029010104237785e-05, "loss": 0.6513, "step": 208 }, { "epoch": 0.1148943480501632, "grad_norm": 0.7114582061767578, "learning_rate": 2.720171199261987e-05, "loss": 0.8152, "step": 209 }, { "epoch": 0.11544408177289125, "grad_norm": 0.7058811783790588, "learning_rate": 2.638488066171201e-05, "loss": 0.7069, "step": 210 }, { "epoch": 0.11599381549561931, "grad_norm": 0.7495465278625488, "learning_rate": 2.5578633553759878e-05, "loss": 0.7327, "step": 211 }, { "epoch": 0.11654354921834736, "grad_norm": 0.7546464204788208, "learning_rate": 2.4783086589232295e-05, "loss": 0.7122, "step": 212 }, { "epoch": 0.11709328294107542, "grad_norm": 0.8024098873138428, "learning_rate": 2.3998354150154555e-05, "loss": 0.7525, "step": 213 }, { "epoch": 0.11764301666380347, "grad_norm": 0.7234117984771729, "learning_rate": 2.3224549063662927e-05, "loss": 0.6774, "step": 214 }, { "epoch": 0.11819275038653153, "grad_norm": 0.8027140498161316, "learning_rate": 2.246178258578234e-05, "loss": 0.6236, "step": 215 }, { "epoch": 0.11874248410925958, "grad_norm": 0.7529194951057434, "learning_rate": 2.171016438543059e-05, "loss": 0.6198, "step": 216 }, { "epoch": 0.11929221783198764, "grad_norm": 0.7332431674003601, "learning_rate": 2.096980252865005e-05, "loss": 0.621, "step": 217 }, { "epoch": 0.11984195155471568, "grad_norm": 0.7334296703338623, "learning_rate": 2.0240803463070425e-05, "loss": 0.6224, "step": 218 }, { "epoch": 0.12039168527744373, "grad_norm": 0.7870594263076782, "learning_rate": 1.9523272002603742e-05, "loss": 0.7452, "step": 219 }, { "epoch": 0.12094141900017179, "grad_norm": 0.7538629174232483, "learning_rate": 1.8817311312374564e-05, "loss": 0.7277, "step": 220 }, { "epoch": 0.12149115272289984, "grad_norm": 0.7705982327461243, "learning_rate": 1.8123022893887065e-05, "loss": 0.7166, "step": 221 }, { "epoch": 0.1220408864456279, "grad_norm": 0.7688170075416565, "learning_rate": 1.744050657043137e-05, "loss": 0.78, "step": 222 }, { "epoch": 0.12259062016835595, "grad_norm": 0.790281355381012, "learning_rate": 1.6769860472731257e-05, "loss": 0.782, "step": 223 }, { "epoch": 0.123140353891084, "grad_norm": 0.741191029548645, "learning_rate": 1.6111181024835e-05, "loss": 0.6336, "step": 224 }, { "epoch": 0.12369008761381206, "grad_norm": 0.7377080917358398, "learning_rate": 1.5464562930251814e-05, "loss": 0.6631, "step": 225 }, { "epoch": 0.12423982133654012, "grad_norm": 0.813720703125, "learning_rate": 1.4830099158335563e-05, "loss": 0.7512, "step": 226 }, { "epoch": 0.12478955505926817, "grad_norm": 0.7634631991386414, "learning_rate": 1.4207880930917871e-05, "loss": 0.6863, "step": 227 }, { "epoch": 0.1253392887819962, "grad_norm": 0.7778201699256897, "learning_rate": 1.3597997709192378e-05, "loss": 0.6879, "step": 228 }, { "epoch": 0.12588902250472428, "grad_norm": 0.7663206458091736, "learning_rate": 1.3000537180852212e-05, "loss": 0.6894, "step": 229 }, { "epoch": 0.12643875622745232, "grad_norm": 0.7697199583053589, "learning_rate": 1.2415585247482498e-05, "loss": 0.6947, "step": 230 }, { "epoch": 0.1269884899501804, "grad_norm": 0.6958724856376648, "learning_rate": 1.1843226012209529e-05, "loss": 0.6224, "step": 231 }, { "epoch": 0.12753822367290843, "grad_norm": 0.7540673613548279, "learning_rate": 1.128354176760873e-05, "loss": 0.7011, "step": 232 }, { "epoch": 0.1280879573956365, "grad_norm": 0.6760576963424683, "learning_rate": 1.073661298387265e-05, "loss": 0.6481, "step": 233 }, { "epoch": 0.12863769111836454, "grad_norm": 0.7170032858848572, "learning_rate": 1.0202518297241237e-05, "loss": 0.6073, "step": 234 }, { "epoch": 0.1291874248410926, "grad_norm": 0.7394299507141113, "learning_rate": 9.681334498695648e-06, "loss": 0.6666, "step": 235 }, { "epoch": 0.12973715856382065, "grad_norm": 0.7589176297187805, "learning_rate": 9.173136522917457e-06, "loss": 0.7032, "step": 236 }, { "epoch": 0.13028689228654872, "grad_norm": 0.7487704753875732, "learning_rate": 8.677997437514629e-06, "loss": 0.6858, "step": 237 }, { "epoch": 0.13083662600927676, "grad_norm": 0.736187756061554, "learning_rate": 8.195988432516078e-06, "loss": 0.7002, "step": 238 }, { "epoch": 0.1313863597320048, "grad_norm": 0.7526095509529114, "learning_rate": 7.727178810136093e-06, "loss": 0.6738, "step": 239 }, { "epoch": 0.13193609345473287, "grad_norm": 0.7353979349136353, "learning_rate": 7.27163597481022e-06, "loss": 0.7334, "step": 240 }, { "epoch": 0.1324858271774609, "grad_norm": 0.7452487349510193, "learning_rate": 6.829425423504021e-06, "loss": 0.684, "step": 241 }, { "epoch": 0.13303556090018898, "grad_norm": 0.7281635403633118, "learning_rate": 6.4006107362960195e-06, "loss": 0.761, "step": 242 }, { "epoch": 0.13358529462291702, "grad_norm": 0.7512807846069336, "learning_rate": 5.985253567236304e-06, "loss": 0.7029, "step": 243 }, { "epoch": 0.13413502834564509, "grad_norm": 0.7333109974861145, "learning_rate": 5.583413635482082e-06, "loss": 0.6238, "step": 244 }, { "epoch": 0.13468476206837313, "grad_norm": 0.7307942509651184, "learning_rate": 5.19514871671134e-06, "loss": 0.6929, "step": 245 }, { "epoch": 0.1352344957911012, "grad_norm": 0.7078768014907837, "learning_rate": 4.82051463481602e-06, "loss": 0.6769, "step": 246 }, { "epoch": 0.13578422951382924, "grad_norm": 0.7309481501579285, "learning_rate": 4.45956525387573e-06, "loss": 0.7295, "step": 247 }, { "epoch": 0.1363339632365573, "grad_norm": 0.7615429759025574, "learning_rate": 4.112352470413328e-06, "loss": 0.6425, "step": 248 }, { "epoch": 0.13688369695928534, "grad_norm": 0.7425394654273987, "learning_rate": 3.778926205933342e-06, "loss": 0.6675, "step": 249 }, { "epoch": 0.1374334306820134, "grad_norm": 0.7740041017532349, "learning_rate": 3.459334399744374e-06, "loss": 0.74, "step": 250 }, { "epoch": 0.13798316440474145, "grad_norm": 0.7285709381103516, "learning_rate": 3.1536230020664417e-06, "loss": 0.6986, "step": 251 }, { "epoch": 0.1385328981274695, "grad_norm": 0.7289578318595886, "learning_rate": 2.861835967424409e-06, "loss": 0.6583, "step": 252 }, { "epoch": 0.13908263185019756, "grad_norm": 0.7507825493812561, "learning_rate": 2.5840152483282752e-06, "loss": 0.6601, "step": 253 }, { "epoch": 0.1396323655729256, "grad_norm": 0.7534707188606262, "learning_rate": 2.3202007892413447e-06, "loss": 0.7061, "step": 254 }, { "epoch": 0.14018209929565367, "grad_norm": 0.6799616813659668, "learning_rate": 2.0704305208370857e-06, "loss": 0.6317, "step": 255 }, { "epoch": 0.1407318330183817, "grad_norm": 0.7823193669319153, "learning_rate": 1.83474035454555e-06, "loss": 0.7433, "step": 256 }, { "epoch": 0.14128156674110978, "grad_norm": 0.698180615901947, "learning_rate": 1.6131641773900807e-06, "loss": 0.6891, "step": 257 }, { "epoch": 0.14183130046383782, "grad_norm": 0.7654922604560852, "learning_rate": 1.4057338471151427e-06, "loss": 0.7029, "step": 258 }, { "epoch": 0.1423810341865659, "grad_norm": 0.7790406942367554, "learning_rate": 1.212479187605897e-06, "loss": 0.7639, "step": 259 }, { "epoch": 0.14293076790929393, "grad_norm": 0.7204973697662354, "learning_rate": 1.0334279846001106e-06, "loss": 0.7016, "step": 260 }, { "epoch": 0.143480501632022, "grad_norm": 0.7769961953163147, "learning_rate": 8.686059816932602e-07, "loss": 0.7379, "step": 261 }, { "epoch": 0.14403023535475004, "grad_norm": 0.703819751739502, "learning_rate": 7.180368766371515e-07, "loss": 0.6579, "step": 262 }, { "epoch": 0.1445799690774781, "grad_norm": 0.7404126524925232, "learning_rate": 5.817423179327098e-07, "loss": 0.7114, "step": 263 }, { "epoch": 0.14512970280020615, "grad_norm": 0.7175320982933044, "learning_rate": 4.5974190171735874e-07, "loss": 0.6847, "step": 264 }, { "epoch": 0.1456794365229342, "grad_norm": 0.7125775814056396, "learning_rate": 3.520531689476192e-07, "loss": 0.7358, "step": 265 }, { "epoch": 0.14622917024566226, "grad_norm": 0.7841463685035706, "learning_rate": 2.586916028770259e-07, "loss": 0.6563, "step": 266 }, { "epoch": 0.1467789039683903, "grad_norm": 0.7318646311759949, "learning_rate": 1.7967062683001967e-07, "loss": 0.7004, "step": 267 }, { "epoch": 0.14732863769111837, "grad_norm": 0.7782835364341736, "learning_rate": 1.150016022719691e-07, "loss": 0.644, "step": 268 }, { "epoch": 0.1478783714138464, "grad_norm": 0.7568761706352234, "learning_rate": 6.469382717563255e-08, "loss": 0.6848, "step": 269 }, { "epoch": 0.14842810513657448, "grad_norm": 0.7343583703041077, "learning_rate": 2.8754534684316547e-08, "loss": 0.7193, "step": 270 }, { "epoch": 0.14897783885930252, "grad_norm": 0.7375150918960571, "learning_rate": 7.188892071929854e-09, "loss": 0.7181, "step": 271 }, { "epoch": 0.1495275725820306, "grad_norm": 0.6891016364097595, "learning_rate": 0.0, "loss": 0.6775, "step": 272 } ], "logging_steps": 1, "max_steps": 272, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.96090445723992e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }