{ "best_metric": 0.11646050214767456, "best_model_checkpoint": "./vit-indian-food/checkpoint-1500", "epoch": 7.853403141361256, "eval_steps": 50, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 7.558130264282227, "learning_rate": 0.00019947643979057592, "loss": 2.4369, "step": 5 }, { "epoch": 0.05, "grad_norm": 6.5493950843811035, "learning_rate": 0.00019895287958115185, "loss": 1.6214, "step": 10 }, { "epoch": 0.08, "grad_norm": 6.205695629119873, "learning_rate": 0.00019842931937172776, "loss": 1.197, "step": 15 }, { "epoch": 0.1, "grad_norm": 3.8880319595336914, "learning_rate": 0.00019790575916230367, "loss": 0.7391, "step": 20 }, { "epoch": 0.13, "grad_norm": 6.315922260284424, "learning_rate": 0.0001973821989528796, "loss": 0.6288, "step": 25 }, { "epoch": 0.16, "grad_norm": 4.008142471313477, "learning_rate": 0.0001968586387434555, "loss": 0.3582, "step": 30 }, { "epoch": 0.18, "grad_norm": 4.005300998687744, "learning_rate": 0.00019633507853403142, "loss": 0.3882, "step": 35 }, { "epoch": 0.21, "grad_norm": 3.9506969451904297, "learning_rate": 0.00019581151832460733, "loss": 0.39, "step": 40 }, { "epoch": 0.24, "grad_norm": 4.074863910675049, "learning_rate": 0.00019528795811518326, "loss": 0.3413, "step": 45 }, { "epoch": 0.26, "grad_norm": 4.617203712463379, "learning_rate": 0.00019476439790575917, "loss": 0.358, "step": 50 }, { "epoch": 0.26, "eval_accuracy": 0.9081364829396326, "eval_loss": 0.32936015725135803, "eval_precision": 0.9220515144455581, "eval_recall": 0.9081364829396326, "eval_runtime": 17.5763, "eval_samples_per_second": 43.354, "eval_steps_per_second": 2.731, "step": 50 }, { "epoch": 0.29, "grad_norm": 4.685304641723633, "learning_rate": 0.0001942408376963351, "loss": 0.3669, "step": 55 }, { "epoch": 0.31, "grad_norm": 3.119244337081909, "learning_rate": 0.000193717277486911, "loss": 0.1722, "step": 60 }, { "epoch": 0.34, "grad_norm": 6.17639684677124, "learning_rate": 0.00019319371727748692, "loss": 0.405, "step": 65 }, { "epoch": 0.37, "grad_norm": 4.263852119445801, "learning_rate": 0.00019267015706806283, "loss": 0.5013, "step": 70 }, { "epoch": 0.39, "grad_norm": 2.03020977973938, "learning_rate": 0.00019214659685863877, "loss": 0.2251, "step": 75 }, { "epoch": 0.42, "grad_norm": 5.0216193199157715, "learning_rate": 0.00019162303664921465, "loss": 0.3329, "step": 80 }, { "epoch": 0.45, "grad_norm": 7.692380428314209, "learning_rate": 0.00019109947643979058, "loss": 0.2191, "step": 85 }, { "epoch": 0.47, "grad_norm": 5.645913600921631, "learning_rate": 0.0001905759162303665, "loss": 0.3437, "step": 90 }, { "epoch": 0.5, "grad_norm": 5.066718578338623, "learning_rate": 0.00019005235602094243, "loss": 0.4461, "step": 95 }, { "epoch": 0.52, "grad_norm": 4.659124851226807, "learning_rate": 0.00018952879581151833, "loss": 0.5051, "step": 100 }, { "epoch": 0.52, "eval_accuracy": 0.9291338582677166, "eval_loss": 0.22470374405384064, "eval_precision": 0.9372539303361345, "eval_recall": 0.9291338582677166, "eval_runtime": 17.4785, "eval_samples_per_second": 43.597, "eval_steps_per_second": 2.746, "step": 100 }, { "epoch": 0.55, "grad_norm": 4.5200324058532715, "learning_rate": 0.00018900523560209424, "loss": 0.2615, "step": 105 }, { "epoch": 0.58, "grad_norm": 1.2165738344192505, "learning_rate": 0.00018848167539267018, "loss": 0.2415, "step": 110 }, { "epoch": 0.6, "grad_norm": 6.801146030426025, "learning_rate": 0.00018795811518324608, "loss": 0.3234, "step": 115 }, { "epoch": 0.63, "grad_norm": 0.8039401173591614, "learning_rate": 0.00018743455497382202, "loss": 0.1962, "step": 120 }, { "epoch": 0.65, "grad_norm": 4.639632701873779, "learning_rate": 0.0001869109947643979, "loss": 0.2755, "step": 125 }, { "epoch": 0.68, "grad_norm": 3.4670636653900146, "learning_rate": 0.00018638743455497384, "loss": 0.3463, "step": 130 }, { "epoch": 0.71, "grad_norm": 6.07156229019165, "learning_rate": 0.00018586387434554974, "loss": 0.3047, "step": 135 }, { "epoch": 0.73, "grad_norm": 6.347087860107422, "learning_rate": 0.00018534031413612568, "loss": 0.2364, "step": 140 }, { "epoch": 0.76, "grad_norm": 5.963706970214844, "learning_rate": 0.00018481675392670156, "loss": 0.4148, "step": 145 }, { "epoch": 0.79, "grad_norm": 4.037764072418213, "learning_rate": 0.0001842931937172775, "loss": 0.1361, "step": 150 }, { "epoch": 0.79, "eval_accuracy": 0.9409448818897638, "eval_loss": 0.21655863523483276, "eval_precision": 0.9454930944052194, "eval_recall": 0.9409448818897638, "eval_runtime": 17.4607, "eval_samples_per_second": 43.641, "eval_steps_per_second": 2.749, "step": 150 }, { "epoch": 0.81, "grad_norm": 4.717739582061768, "learning_rate": 0.0001837696335078534, "loss": 0.1183, "step": 155 }, { "epoch": 0.84, "grad_norm": 3.9427084922790527, "learning_rate": 0.00018324607329842934, "loss": 0.1138, "step": 160 }, { "epoch": 0.86, "grad_norm": 4.915616989135742, "learning_rate": 0.00018272251308900525, "loss": 0.2988, "step": 165 }, { "epoch": 0.89, "grad_norm": 6.3770432472229, "learning_rate": 0.00018219895287958115, "loss": 0.1744, "step": 170 }, { "epoch": 0.92, "grad_norm": 0.7859281301498413, "learning_rate": 0.00018167539267015706, "loss": 0.2486, "step": 175 }, { "epoch": 0.94, "grad_norm": 3.75736141204834, "learning_rate": 0.000181151832460733, "loss": 0.2637, "step": 180 }, { "epoch": 0.97, "grad_norm": 6.771958351135254, "learning_rate": 0.0001806282722513089, "loss": 0.2305, "step": 185 }, { "epoch": 0.99, "grad_norm": 4.951119899749756, "learning_rate": 0.0001801047120418848, "loss": 0.1108, "step": 190 }, { "epoch": 1.02, "grad_norm": 3.362152576446533, "learning_rate": 0.00017958115183246075, "loss": 0.133, "step": 195 }, { "epoch": 1.05, "grad_norm": 0.8346299529075623, "learning_rate": 0.00017905759162303666, "loss": 0.0611, "step": 200 }, { "epoch": 1.05, "eval_accuracy": 0.9488188976377953, "eval_loss": 0.18689152598381042, "eval_precision": 0.9533647964651322, "eval_recall": 0.9488188976377953, "eval_runtime": 17.6111, "eval_samples_per_second": 43.268, "eval_steps_per_second": 2.726, "step": 200 }, { "epoch": 1.07, "grad_norm": 0.5746695399284363, "learning_rate": 0.0001785340314136126, "loss": 0.0241, "step": 205 }, { "epoch": 1.1, "grad_norm": 2.0743372440338135, "learning_rate": 0.0001780104712041885, "loss": 0.0662, "step": 210 }, { "epoch": 1.13, "grad_norm": 1.6814557313919067, "learning_rate": 0.0001774869109947644, "loss": 0.0635, "step": 215 }, { "epoch": 1.15, "grad_norm": 2.118739128112793, "learning_rate": 0.00017696335078534032, "loss": 0.1024, "step": 220 }, { "epoch": 1.18, "grad_norm": 3.4306182861328125, "learning_rate": 0.00017643979057591625, "loss": 0.0201, "step": 225 }, { "epoch": 1.2, "grad_norm": 0.28945377469062805, "learning_rate": 0.00017591623036649216, "loss": 0.1151, "step": 230 }, { "epoch": 1.23, "grad_norm": 4.6208648681640625, "learning_rate": 0.00017539267015706807, "loss": 0.2324, "step": 235 }, { "epoch": 1.26, "grad_norm": 1.6085395812988281, "learning_rate": 0.00017486910994764398, "loss": 0.0429, "step": 240 }, { "epoch": 1.28, "grad_norm": 5.860565185546875, "learning_rate": 0.0001743455497382199, "loss": 0.0567, "step": 245 }, { "epoch": 1.31, "grad_norm": 5.421276092529297, "learning_rate": 0.00017382198952879582, "loss": 0.1037, "step": 250 }, { "epoch": 1.31, "eval_accuracy": 0.968503937007874, "eval_loss": 0.11793948709964752, "eval_precision": 0.9693355319509627, "eval_recall": 0.968503937007874, "eval_runtime": 17.5701, "eval_samples_per_second": 43.369, "eval_steps_per_second": 2.732, "step": 250 }, { "epoch": 1.34, "grad_norm": 3.9477956295013428, "learning_rate": 0.00017329842931937175, "loss": 0.0285, "step": 255 }, { "epoch": 1.36, "grad_norm": 7.515362739562988, "learning_rate": 0.00017277486910994763, "loss": 0.2195, "step": 260 }, { "epoch": 1.39, "grad_norm": 0.1676739752292633, "learning_rate": 0.00017225130890052357, "loss": 0.0172, "step": 265 }, { "epoch": 1.41, "grad_norm": 8.235359191894531, "learning_rate": 0.00017172774869109948, "loss": 0.1756, "step": 270 }, { "epoch": 1.44, "grad_norm": 7.9198832511901855, "learning_rate": 0.0001712041884816754, "loss": 0.0663, "step": 275 }, { "epoch": 1.47, "grad_norm": 6.388627529144287, "learning_rate": 0.00017068062827225132, "loss": 0.1467, "step": 280 }, { "epoch": 1.49, "grad_norm": 0.09213338047266006, "learning_rate": 0.00017015706806282723, "loss": 0.0684, "step": 285 }, { "epoch": 1.52, "grad_norm": 1.0691012144088745, "learning_rate": 0.00016963350785340316, "loss": 0.0954, "step": 290 }, { "epoch": 1.54, "grad_norm": 6.120029449462891, "learning_rate": 0.00016910994764397907, "loss": 0.0587, "step": 295 }, { "epoch": 1.57, "grad_norm": 0.0928138718008995, "learning_rate": 0.00016858638743455498, "loss": 0.0294, "step": 300 }, { "epoch": 1.57, "eval_accuracy": 0.963254593175853, "eval_loss": 0.11820019036531448, "eval_precision": 0.9645155573207865, "eval_recall": 0.963254593175853, "eval_runtime": 17.5014, "eval_samples_per_second": 43.539, "eval_steps_per_second": 2.743, "step": 300 }, { "epoch": 1.6, "grad_norm": 0.3131069242954254, "learning_rate": 0.0001680628272251309, "loss": 0.1184, "step": 305 }, { "epoch": 1.62, "grad_norm": 0.417850524187088, "learning_rate": 0.00016753926701570682, "loss": 0.0053, "step": 310 }, { "epoch": 1.65, "grad_norm": 5.371811389923096, "learning_rate": 0.00016701570680628273, "loss": 0.1099, "step": 315 }, { "epoch": 1.68, "grad_norm": 4.09129524230957, "learning_rate": 0.00016649214659685867, "loss": 0.2501, "step": 320 }, { "epoch": 1.7, "grad_norm": 0.6515682935714722, "learning_rate": 0.00016596858638743455, "loss": 0.0528, "step": 325 }, { "epoch": 1.73, "grad_norm": 0.5656059384346008, "learning_rate": 0.00016544502617801048, "loss": 0.1378, "step": 330 }, { "epoch": 1.75, "grad_norm": 0.10378353297710419, "learning_rate": 0.0001649214659685864, "loss": 0.0329, "step": 335 }, { "epoch": 1.78, "grad_norm": 2.387305736541748, "learning_rate": 0.00016439790575916233, "loss": 0.0353, "step": 340 }, { "epoch": 1.81, "grad_norm": 0.05628238245844841, "learning_rate": 0.0001638743455497382, "loss": 0.0138, "step": 345 }, { "epoch": 1.83, "grad_norm": 0.04053734615445137, "learning_rate": 0.00016335078534031414, "loss": 0.0082, "step": 350 }, { "epoch": 1.83, "eval_accuracy": 0.963254593175853, "eval_loss": 0.11838709563016891, "eval_precision": 0.9654537871036863, "eval_recall": 0.963254593175853, "eval_runtime": 17.4034, "eval_samples_per_second": 43.785, "eval_steps_per_second": 2.758, "step": 350 }, { "epoch": 1.86, "grad_norm": 2.287546396255493, "learning_rate": 0.00016282722513089005, "loss": 0.02, "step": 355 }, { "epoch": 1.88, "grad_norm": 4.561453342437744, "learning_rate": 0.00016230366492146599, "loss": 0.203, "step": 360 }, { "epoch": 1.91, "grad_norm": 4.0477294921875, "learning_rate": 0.0001617801047120419, "loss": 0.0646, "step": 365 }, { "epoch": 1.94, "grad_norm": 1.0380969047546387, "learning_rate": 0.0001612565445026178, "loss": 0.0848, "step": 370 }, { "epoch": 1.96, "grad_norm": 4.385131359100342, "learning_rate": 0.0001607329842931937, "loss": 0.166, "step": 375 }, { "epoch": 1.99, "grad_norm": 0.036621347069740295, "learning_rate": 0.00016020942408376964, "loss": 0.0871, "step": 380 }, { "epoch": 2.02, "grad_norm": 0.5929269194602966, "learning_rate": 0.00015968586387434555, "loss": 0.1271, "step": 385 }, { "epoch": 2.04, "grad_norm": 0.26594266295433044, "learning_rate": 0.00015916230366492146, "loss": 0.0114, "step": 390 }, { "epoch": 2.07, "grad_norm": 0.020196596160531044, "learning_rate": 0.0001586387434554974, "loss": 0.0081, "step": 395 }, { "epoch": 2.09, "grad_norm": 0.6509531140327454, "learning_rate": 0.0001581151832460733, "loss": 0.0206, "step": 400 }, { "epoch": 2.09, "eval_accuracy": 0.963254593175853, "eval_loss": 0.130891352891922, "eval_precision": 0.964741369994443, "eval_recall": 0.963254593175853, "eval_runtime": 17.4584, "eval_samples_per_second": 43.647, "eval_steps_per_second": 2.749, "step": 400 }, { "epoch": 2.12, "grad_norm": 0.04883525148034096, "learning_rate": 0.00015759162303664924, "loss": 0.047, "step": 405 }, { "epoch": 2.15, "grad_norm": 1.4584039449691772, "learning_rate": 0.00015706806282722515, "loss": 0.0393, "step": 410 }, { "epoch": 2.17, "grad_norm": 0.3986053466796875, "learning_rate": 0.00015654450261780105, "loss": 0.0606, "step": 415 }, { "epoch": 2.2, "grad_norm": 0.11985136568546295, "learning_rate": 0.00015602094240837696, "loss": 0.0379, "step": 420 }, { "epoch": 2.23, "grad_norm": 0.07443628460168839, "learning_rate": 0.0001554973821989529, "loss": 0.0157, "step": 425 }, { "epoch": 2.25, "grad_norm": 0.03219376876950264, "learning_rate": 0.0001549738219895288, "loss": 0.01, "step": 430 }, { "epoch": 2.28, "grad_norm": 0.17020490765571594, "learning_rate": 0.00015445026178010471, "loss": 0.0093, "step": 435 }, { "epoch": 2.3, "grad_norm": 2.6877171993255615, "learning_rate": 0.00015392670157068062, "loss": 0.021, "step": 440 }, { "epoch": 2.33, "grad_norm": 0.010094034485518932, "learning_rate": 0.00015340314136125656, "loss": 0.0083, "step": 445 }, { "epoch": 2.36, "grad_norm": 0.0077184755355119705, "learning_rate": 0.00015287958115183247, "loss": 0.0246, "step": 450 }, { "epoch": 2.36, "eval_accuracy": 0.9553805774278216, "eval_loss": 0.17534801363945007, "eval_precision": 0.9586171915743325, "eval_recall": 0.9553805774278216, "eval_runtime": 17.505, "eval_samples_per_second": 43.53, "eval_steps_per_second": 2.742, "step": 450 }, { "epoch": 2.38, "grad_norm": 5.202419757843018, "learning_rate": 0.0001523560209424084, "loss": 0.032, "step": 455 }, { "epoch": 2.41, "grad_norm": 0.018663793802261353, "learning_rate": 0.00015183246073298428, "loss": 0.0221, "step": 460 }, { "epoch": 2.43, "grad_norm": 1.0761280059814453, "learning_rate": 0.00015130890052356022, "loss": 0.0055, "step": 465 }, { "epoch": 2.46, "grad_norm": 0.8688761591911316, "learning_rate": 0.00015078534031413612, "loss": 0.0852, "step": 470 }, { "epoch": 2.49, "grad_norm": 0.06378360092639923, "learning_rate": 0.00015026178010471206, "loss": 0.0031, "step": 475 }, { "epoch": 2.51, "grad_norm": 7.236721992492676, "learning_rate": 0.00014973821989528797, "loss": 0.0106, "step": 480 }, { "epoch": 2.54, "grad_norm": 0.013875219039618969, "learning_rate": 0.00014921465968586388, "loss": 0.0032, "step": 485 }, { "epoch": 2.57, "grad_norm": 0.031233886256814003, "learning_rate": 0.0001486910994764398, "loss": 0.0047, "step": 490 }, { "epoch": 2.59, "grad_norm": 0.2528940439224243, "learning_rate": 0.00014816753926701572, "loss": 0.0081, "step": 495 }, { "epoch": 2.62, "grad_norm": 0.02138712629675865, "learning_rate": 0.00014764397905759163, "loss": 0.0161, "step": 500 }, { "epoch": 2.62, "eval_accuracy": 0.9514435695538058, "eval_loss": 0.16896109282970428, "eval_precision": 0.9537490891546931, "eval_recall": 0.9514435695538058, "eval_runtime": 17.5851, "eval_samples_per_second": 43.332, "eval_steps_per_second": 2.73, "step": 500 }, { "epoch": 2.64, "grad_norm": 5.179097652435303, "learning_rate": 0.00014712041884816754, "loss": 0.0271, "step": 505 }, { "epoch": 2.67, "grad_norm": 0.06704265624284744, "learning_rate": 0.00014659685863874347, "loss": 0.0072, "step": 510 }, { "epoch": 2.7, "grad_norm": 0.15809708833694458, "learning_rate": 0.00014607329842931938, "loss": 0.0027, "step": 515 }, { "epoch": 2.72, "grad_norm": 6.423127174377441, "learning_rate": 0.00014554973821989531, "loss": 0.0238, "step": 520 }, { "epoch": 2.75, "grad_norm": 0.010630112141370773, "learning_rate": 0.0001450261780104712, "loss": 0.0271, "step": 525 }, { "epoch": 2.77, "grad_norm": 2.8193652629852295, "learning_rate": 0.00014450261780104713, "loss": 0.1816, "step": 530 }, { "epoch": 2.8, "grad_norm": 0.004105111118406057, "learning_rate": 0.00014397905759162304, "loss": 0.1842, "step": 535 }, { "epoch": 2.83, "grad_norm": 0.20598988234996796, "learning_rate": 0.00014345549738219897, "loss": 0.0302, "step": 540 }, { "epoch": 2.85, "grad_norm": 0.7738168835639954, "learning_rate": 0.00014293193717277485, "loss": 0.0256, "step": 545 }, { "epoch": 2.88, "grad_norm": 0.07728663086891174, "learning_rate": 0.0001424083769633508, "loss": 0.0341, "step": 550 }, { "epoch": 2.88, "eval_accuracy": 0.9488188976377953, "eval_loss": 0.18862025439739227, "eval_precision": 0.9543073149026239, "eval_recall": 0.9488188976377953, "eval_runtime": 17.3957, "eval_samples_per_second": 43.804, "eval_steps_per_second": 2.759, "step": 550 }, { "epoch": 2.91, "grad_norm": 12.730446815490723, "learning_rate": 0.0001418848167539267, "loss": 0.0664, "step": 555 }, { "epoch": 2.93, "grad_norm": 8.560800552368164, "learning_rate": 0.00014136125654450263, "loss": 0.0518, "step": 560 }, { "epoch": 2.96, "grad_norm": 1.7465717792510986, "learning_rate": 0.00014083769633507854, "loss": 0.0236, "step": 565 }, { "epoch": 2.98, "grad_norm": 5.386238098144531, "learning_rate": 0.00014031413612565445, "loss": 0.0449, "step": 570 }, { "epoch": 3.01, "grad_norm": 0.008860424160957336, "learning_rate": 0.00013979057591623038, "loss": 0.0168, "step": 575 }, { "epoch": 3.04, "grad_norm": 0.0057926299050450325, "learning_rate": 0.0001392670157068063, "loss": 0.0191, "step": 580 }, { "epoch": 3.06, "grad_norm": 0.5266512036323547, "learning_rate": 0.00013874345549738223, "loss": 0.0023, "step": 585 }, { "epoch": 3.09, "grad_norm": 0.03807899355888367, "learning_rate": 0.0001382198952879581, "loss": 0.0259, "step": 590 }, { "epoch": 3.12, "grad_norm": 0.41590607166290283, "learning_rate": 0.00013769633507853404, "loss": 0.0224, "step": 595 }, { "epoch": 3.14, "grad_norm": 0.003112237202003598, "learning_rate": 0.00013717277486910995, "loss": 0.0022, "step": 600 }, { "epoch": 3.14, "eval_accuracy": 0.963254593175853, "eval_loss": 0.1624463051557541, "eval_precision": 0.9649964224541913, "eval_recall": 0.963254593175853, "eval_runtime": 17.4244, "eval_samples_per_second": 43.732, "eval_steps_per_second": 2.755, "step": 600 }, { "epoch": 3.17, "grad_norm": 0.013240986503660679, "learning_rate": 0.00013664921465968589, "loss": 0.0096, "step": 605 }, { "epoch": 3.19, "grad_norm": 0.13229414820671082, "learning_rate": 0.0001361256544502618, "loss": 0.0016, "step": 610 }, { "epoch": 3.22, "grad_norm": 0.2047564685344696, "learning_rate": 0.0001356020942408377, "loss": 0.0029, "step": 615 }, { "epoch": 3.25, "grad_norm": 5.075094699859619, "learning_rate": 0.0001350785340314136, "loss": 0.1541, "step": 620 }, { "epoch": 3.27, "grad_norm": 0.11219122260808945, "learning_rate": 0.00013455497382198955, "loss": 0.0016, "step": 625 }, { "epoch": 3.3, "grad_norm": 0.0553029403090477, "learning_rate": 0.00013403141361256545, "loss": 0.0016, "step": 630 }, { "epoch": 3.32, "grad_norm": 0.1759098470211029, "learning_rate": 0.00013350785340314136, "loss": 0.0616, "step": 635 }, { "epoch": 3.35, "grad_norm": 0.29212847352027893, "learning_rate": 0.00013298429319371727, "loss": 0.0074, "step": 640 }, { "epoch": 3.38, "grad_norm": 0.02130221202969551, "learning_rate": 0.0001324607329842932, "loss": 0.0013, "step": 645 }, { "epoch": 3.4, "grad_norm": 1.0668525695800781, "learning_rate": 0.0001319371727748691, "loss": 0.0044, "step": 650 }, { "epoch": 3.4, "eval_accuracy": 0.937007874015748, "eval_loss": 0.22285035252571106, "eval_precision": 0.9410975298976415, "eval_recall": 0.937007874015748, "eval_runtime": 17.5444, "eval_samples_per_second": 43.433, "eval_steps_per_second": 2.736, "step": 650 }, { "epoch": 3.43, "grad_norm": 0.27807894349098206, "learning_rate": 0.00013141361256544505, "loss": 0.0669, "step": 655 }, { "epoch": 3.46, "grad_norm": 0.17472220957279205, "learning_rate": 0.00013089005235602096, "loss": 0.0066, "step": 660 }, { "epoch": 3.48, "grad_norm": 0.07417164742946625, "learning_rate": 0.00013036649214659686, "loss": 0.0045, "step": 665 }, { "epoch": 3.51, "grad_norm": 0.007742606569081545, "learning_rate": 0.00012984293193717277, "loss": 0.0008, "step": 670 }, { "epoch": 3.53, "grad_norm": 0.00367438024841249, "learning_rate": 0.0001293193717277487, "loss": 0.003, "step": 675 }, { "epoch": 3.56, "grad_norm": 0.0222078375518322, "learning_rate": 0.00012879581151832462, "loss": 0.0017, "step": 680 }, { "epoch": 3.59, "grad_norm": 6.7436299324035645, "learning_rate": 0.00012827225130890052, "loss": 0.0179, "step": 685 }, { "epoch": 3.61, "grad_norm": 0.0018994753481820226, "learning_rate": 0.00012774869109947646, "loss": 0.0008, "step": 690 }, { "epoch": 3.64, "grad_norm": 0.005384970456361771, "learning_rate": 0.00012722513089005237, "loss": 0.0018, "step": 695 }, { "epoch": 3.66, "grad_norm": 0.0077237836085259914, "learning_rate": 0.00012670157068062827, "loss": 0.0012, "step": 700 }, { "epoch": 3.66, "eval_accuracy": 0.9501312335958005, "eval_loss": 0.17090748250484467, "eval_precision": 0.9525764956521452, "eval_recall": 0.9501312335958005, "eval_runtime": 17.5175, "eval_samples_per_second": 43.499, "eval_steps_per_second": 2.74, "step": 700 }, { "epoch": 3.69, "grad_norm": 0.016971301287412643, "learning_rate": 0.00012617801047120418, "loss": 0.0672, "step": 705 }, { "epoch": 3.72, "grad_norm": 0.37131184339523315, "learning_rate": 0.00012565445026178012, "loss": 0.0032, "step": 710 }, { "epoch": 3.74, "grad_norm": 0.020785262808203697, "learning_rate": 0.00012513089005235603, "loss": 0.0036, "step": 715 }, { "epoch": 3.77, "grad_norm": 0.003909711726009846, "learning_rate": 0.00012460732984293196, "loss": 0.0005, "step": 720 }, { "epoch": 3.8, "grad_norm": 0.32289111614227295, "learning_rate": 0.00012408376963350784, "loss": 0.0067, "step": 725 }, { "epoch": 3.82, "grad_norm": 0.011649747379124165, "learning_rate": 0.00012356020942408378, "loss": 0.0004, "step": 730 }, { "epoch": 3.85, "grad_norm": 0.009505286812782288, "learning_rate": 0.00012303664921465968, "loss": 0.0004, "step": 735 }, { "epoch": 3.87, "grad_norm": 0.016009971499443054, "learning_rate": 0.00012251308900523562, "loss": 0.0132, "step": 740 }, { "epoch": 3.9, "grad_norm": 0.07612348347902298, "learning_rate": 0.00012198952879581151, "loss": 0.0012, "step": 745 }, { "epoch": 3.93, "grad_norm": 0.05622846260666847, "learning_rate": 0.00012146596858638744, "loss": 0.0058, "step": 750 }, { "epoch": 3.93, "eval_accuracy": 0.958005249343832, "eval_loss": 0.17302347719669342, "eval_precision": 0.9602924010464745, "eval_recall": 0.958005249343832, "eval_runtime": 17.4895, "eval_samples_per_second": 43.569, "eval_steps_per_second": 2.744, "step": 750 }, { "epoch": 3.95, "grad_norm": 0.050630178302526474, "learning_rate": 0.00012094240837696336, "loss": 0.0008, "step": 755 }, { "epoch": 3.98, "grad_norm": 0.02956685610115528, "learning_rate": 0.00012041884816753928, "loss": 0.0017, "step": 760 }, { "epoch": 4.01, "grad_norm": 0.038056932389736176, "learning_rate": 0.0001198952879581152, "loss": 0.001, "step": 765 }, { "epoch": 4.03, "grad_norm": 0.006303295027464628, "learning_rate": 0.0001193717277486911, "loss": 0.0003, "step": 770 }, { "epoch": 4.06, "grad_norm": 0.019060682505369186, "learning_rate": 0.00011884816753926702, "loss": 0.0005, "step": 775 }, { "epoch": 4.08, "grad_norm": 0.9220014810562134, "learning_rate": 0.00011832460732984294, "loss": 0.0014, "step": 780 }, { "epoch": 4.11, "grad_norm": 0.0184442438185215, "learning_rate": 0.00011780104712041886, "loss": 0.0003, "step": 785 }, { "epoch": 4.14, "grad_norm": 0.003219619393348694, "learning_rate": 0.00011727748691099475, "loss": 0.0003, "step": 790 }, { "epoch": 4.16, "grad_norm": 0.006713965907692909, "learning_rate": 0.00011675392670157068, "loss": 0.0044, "step": 795 }, { "epoch": 4.19, "grad_norm": 0.0026614165399223566, "learning_rate": 0.0001162303664921466, "loss": 0.0003, "step": 800 }, { "epoch": 4.19, "eval_accuracy": 0.9488188976377953, "eval_loss": 0.20851939916610718, "eval_precision": 0.9513993632135537, "eval_recall": 0.9488188976377953, "eval_runtime": 17.4973, "eval_samples_per_second": 43.55, "eval_steps_per_second": 2.743, "step": 800 }, { "epoch": 4.21, "grad_norm": 0.2586454749107361, "learning_rate": 0.00011570680628272252, "loss": 0.001, "step": 805 }, { "epoch": 4.24, "grad_norm": 0.005604247096925974, "learning_rate": 0.00011518324607329844, "loss": 0.0027, "step": 810 }, { "epoch": 4.27, "grad_norm": 0.00321913487277925, "learning_rate": 0.00011465968586387435, "loss": 0.0002, "step": 815 }, { "epoch": 4.29, "grad_norm": 0.00729359732940793, "learning_rate": 0.00011413612565445027, "loss": 0.0068, "step": 820 }, { "epoch": 4.32, "grad_norm": 2.0007290840148926, "learning_rate": 0.00011361256544502619, "loss": 0.002, "step": 825 }, { "epoch": 4.35, "grad_norm": 0.0031692993361502886, "learning_rate": 0.00011308900523560211, "loss": 0.0005, "step": 830 }, { "epoch": 4.37, "grad_norm": 0.007637929171323776, "learning_rate": 0.00011256544502617801, "loss": 0.0002, "step": 835 }, { "epoch": 4.4, "grad_norm": 0.003380405716598034, "learning_rate": 0.00011204188481675393, "loss": 0.0002, "step": 840 }, { "epoch": 4.42, "grad_norm": 0.0019728606566786766, "learning_rate": 0.00011151832460732985, "loss": 0.0005, "step": 845 }, { "epoch": 4.45, "grad_norm": 0.0047022090293467045, "learning_rate": 0.00011099476439790577, "loss": 0.0003, "step": 850 }, { "epoch": 4.45, "eval_accuracy": 0.9698162729658792, "eval_loss": 0.1316651999950409, "eval_precision": 0.9708177710817824, "eval_recall": 0.9698162729658792, "eval_runtime": 17.4736, "eval_samples_per_second": 43.609, "eval_steps_per_second": 2.747, "step": 850 }, { "epoch": 4.48, "grad_norm": 0.003034041728824377, "learning_rate": 0.0001104712041884817, "loss": 0.0002, "step": 855 }, { "epoch": 4.5, "grad_norm": 0.0816386267542839, "learning_rate": 0.00010994764397905759, "loss": 0.0005, "step": 860 }, { "epoch": 4.53, "grad_norm": 0.010150614194571972, "learning_rate": 0.00010942408376963351, "loss": 0.0004, "step": 865 }, { "epoch": 4.55, "grad_norm": 0.003652422921732068, "learning_rate": 0.00010890052356020943, "loss": 0.0002, "step": 870 }, { "epoch": 4.58, "grad_norm": 0.0027291346341371536, "learning_rate": 0.00010837696335078535, "loss": 0.0002, "step": 875 }, { "epoch": 4.61, "grad_norm": 0.0021885058376938105, "learning_rate": 0.00010785340314136125, "loss": 0.0002, "step": 880 }, { "epoch": 4.63, "grad_norm": 0.0019014202989637852, "learning_rate": 0.00010732984293193717, "loss": 0.0002, "step": 885 }, { "epoch": 4.66, "grad_norm": 0.9188545942306519, "learning_rate": 0.00010680628272251309, "loss": 0.0032, "step": 890 }, { "epoch": 4.69, "grad_norm": 0.0015793447382748127, "learning_rate": 0.00010628272251308901, "loss": 0.0002, "step": 895 }, { "epoch": 4.71, "grad_norm": 0.0016003657365217805, "learning_rate": 0.00010575916230366492, "loss": 0.0002, "step": 900 }, { "epoch": 4.71, "eval_accuracy": 0.9698162729658792, "eval_loss": 0.12266764789819717, "eval_precision": 0.9709897987678096, "eval_recall": 0.9698162729658792, "eval_runtime": 17.6013, "eval_samples_per_second": 43.292, "eval_steps_per_second": 2.727, "step": 900 }, { "epoch": 4.74, "grad_norm": 0.001760563114657998, "learning_rate": 0.00010523560209424084, "loss": 0.0002, "step": 905 }, { "epoch": 4.76, "grad_norm": 0.14921420812606812, "learning_rate": 0.00010471204188481676, "loss": 0.0007, "step": 910 }, { "epoch": 4.79, "grad_norm": 0.008730238303542137, "learning_rate": 0.00010418848167539269, "loss": 0.0005, "step": 915 }, { "epoch": 4.82, "grad_norm": 0.0021068351343274117, "learning_rate": 0.0001036649214659686, "loss": 0.0004, "step": 920 }, { "epoch": 4.84, "grad_norm": 0.0018881463911384344, "learning_rate": 0.0001031413612565445, "loss": 0.0002, "step": 925 }, { "epoch": 4.87, "grad_norm": 0.010380023159086704, "learning_rate": 0.00010261780104712042, "loss": 0.0002, "step": 930 }, { "epoch": 4.9, "grad_norm": 0.005413604434579611, "learning_rate": 0.00010209424083769635, "loss": 0.0435, "step": 935 }, { "epoch": 4.92, "grad_norm": 0.013811178505420685, "learning_rate": 0.00010157068062827227, "loss": 0.0002, "step": 940 }, { "epoch": 4.95, "grad_norm": 0.0024414085783064365, "learning_rate": 0.00010104712041884816, "loss": 0.0014, "step": 945 }, { "epoch": 4.97, "grad_norm": 0.00615642499178648, "learning_rate": 0.00010052356020942408, "loss": 0.0004, "step": 950 }, { "epoch": 4.97, "eval_accuracy": 0.9711286089238845, "eval_loss": 0.12034053355455399, "eval_precision": 0.9732646926949845, "eval_recall": 0.9711286089238845, "eval_runtime": 17.6401, "eval_samples_per_second": 43.197, "eval_steps_per_second": 2.721, "step": 950 }, { "epoch": 5.0, "grad_norm": 1.5034066438674927, "learning_rate": 0.0001, "loss": 0.0035, "step": 955 }, { "epoch": 5.03, "grad_norm": 0.0023136490490287542, "learning_rate": 9.947643979057593e-05, "loss": 0.0002, "step": 960 }, { "epoch": 5.05, "grad_norm": 0.025553325191140175, "learning_rate": 9.895287958115183e-05, "loss": 0.0003, "step": 965 }, { "epoch": 5.08, "grad_norm": 0.0021953664254397154, "learning_rate": 9.842931937172776e-05, "loss": 0.0002, "step": 970 }, { "epoch": 5.1, "grad_norm": 0.0016761135775595903, "learning_rate": 9.790575916230366e-05, "loss": 0.0002, "step": 975 }, { "epoch": 5.13, "grad_norm": 0.001983917085453868, "learning_rate": 9.738219895287959e-05, "loss": 0.0002, "step": 980 }, { "epoch": 5.16, "grad_norm": 0.008785477839410305, "learning_rate": 9.68586387434555e-05, "loss": 0.0003, "step": 985 }, { "epoch": 5.18, "grad_norm": 0.00397130474448204, "learning_rate": 9.633507853403142e-05, "loss": 0.0002, "step": 990 }, { "epoch": 5.21, "grad_norm": 0.0015918458811938763, "learning_rate": 9.581151832460732e-05, "loss": 0.0003, "step": 995 }, { "epoch": 5.24, "grad_norm": 0.0018075347179546952, "learning_rate": 9.528795811518324e-05, "loss": 0.0002, "step": 1000 }, { "epoch": 5.24, "eval_accuracy": 0.958005249343832, "eval_loss": 0.19232842326164246, "eval_precision": 0.960493858579392, "eval_recall": 0.958005249343832, "eval_runtime": 17.4659, "eval_samples_per_second": 43.628, "eval_steps_per_second": 2.748, "step": 1000 }, { "epoch": 5.26, "grad_norm": 0.0014983582077547908, "learning_rate": 9.476439790575917e-05, "loss": 0.0003, "step": 1005 }, { "epoch": 5.29, "grad_norm": 0.004939761478453875, "learning_rate": 9.424083769633509e-05, "loss": 0.0025, "step": 1010 }, { "epoch": 5.31, "grad_norm": 0.003176321741193533, "learning_rate": 9.371727748691101e-05, "loss": 0.0002, "step": 1015 }, { "epoch": 5.34, "grad_norm": 0.008645892143249512, "learning_rate": 9.319371727748692e-05, "loss": 0.0002, "step": 1020 }, { "epoch": 5.37, "grad_norm": 0.009872050024569035, "learning_rate": 9.267015706806284e-05, "loss": 0.0002, "step": 1025 }, { "epoch": 5.39, "grad_norm": 0.0015197212342172861, "learning_rate": 9.214659685863875e-05, "loss": 0.0001, "step": 1030 }, { "epoch": 5.42, "grad_norm": 0.0035685019101947546, "learning_rate": 9.162303664921467e-05, "loss": 0.0002, "step": 1035 }, { "epoch": 5.45, "grad_norm": 0.0015960617456585169, "learning_rate": 9.109947643979058e-05, "loss": 0.0002, "step": 1040 }, { "epoch": 5.47, "grad_norm": 0.0019863061606884003, "learning_rate": 9.05759162303665e-05, "loss": 0.0036, "step": 1045 }, { "epoch": 5.5, "grad_norm": 0.0011158788111060858, "learning_rate": 9.00523560209424e-05, "loss": 0.0001, "step": 1050 }, { "epoch": 5.5, "eval_accuracy": 0.9711286089238845, "eval_loss": 0.11267632246017456, "eval_precision": 0.9717079331418645, "eval_recall": 0.9711286089238845, "eval_runtime": 17.5861, "eval_samples_per_second": 43.33, "eval_steps_per_second": 2.729, "step": 1050 }, { "epoch": 5.52, "grad_norm": 0.003642949042841792, "learning_rate": 8.952879581151833e-05, "loss": 0.0002, "step": 1055 }, { "epoch": 5.55, "grad_norm": 2.638040781021118, "learning_rate": 8.900523560209425e-05, "loss": 0.0021, "step": 1060 }, { "epoch": 5.58, "grad_norm": 5.575833320617676, "learning_rate": 8.848167539267016e-05, "loss": 0.0068, "step": 1065 }, { "epoch": 5.6, "grad_norm": 0.0017755662556737661, "learning_rate": 8.795811518324608e-05, "loss": 0.0001, "step": 1070 }, { "epoch": 5.63, "grad_norm": 0.0034077195450663567, "learning_rate": 8.743455497382199e-05, "loss": 0.0003, "step": 1075 }, { "epoch": 5.65, "grad_norm": 0.003383865812793374, "learning_rate": 8.691099476439791e-05, "loss": 0.0004, "step": 1080 }, { "epoch": 5.68, "grad_norm": 0.0037927927915006876, "learning_rate": 8.638743455497382e-05, "loss": 0.0025, "step": 1085 }, { "epoch": 5.71, "grad_norm": 0.008559320122003555, "learning_rate": 8.586387434554974e-05, "loss": 0.0002, "step": 1090 }, { "epoch": 5.73, "grad_norm": 0.0041591702029109, "learning_rate": 8.534031413612566e-05, "loss": 0.0002, "step": 1095 }, { "epoch": 5.76, "grad_norm": 0.0011583847226575017, "learning_rate": 8.481675392670158e-05, "loss": 0.0003, "step": 1100 }, { "epoch": 5.76, "eval_accuracy": 0.9711286089238845, "eval_loss": 0.13462768495082855, "eval_precision": 0.9722333613195212, "eval_recall": 0.9711286089238845, "eval_runtime": 17.6983, "eval_samples_per_second": 43.055, "eval_steps_per_second": 2.712, "step": 1100 }, { "epoch": 5.79, "grad_norm": 0.0030487922485917807, "learning_rate": 8.429319371727749e-05, "loss": 0.0002, "step": 1105 }, { "epoch": 5.81, "grad_norm": 0.003955108113586903, "learning_rate": 8.376963350785341e-05, "loss": 0.0002, "step": 1110 }, { "epoch": 5.84, "grad_norm": 0.002146199345588684, "learning_rate": 8.324607329842933e-05, "loss": 0.0001, "step": 1115 }, { "epoch": 5.86, "grad_norm": 0.0010180504759773612, "learning_rate": 8.272251308900524e-05, "loss": 0.0002, "step": 1120 }, { "epoch": 5.89, "grad_norm": 0.0008741291239857674, "learning_rate": 8.219895287958116e-05, "loss": 0.0001, "step": 1125 }, { "epoch": 5.92, "grad_norm": 0.0017273599514737725, "learning_rate": 8.167539267015707e-05, "loss": 0.0071, "step": 1130 }, { "epoch": 5.94, "grad_norm": 0.003460003063082695, "learning_rate": 8.115183246073299e-05, "loss": 0.0006, "step": 1135 }, { "epoch": 5.97, "grad_norm": 0.0010187524603679776, "learning_rate": 8.06282722513089e-05, "loss": 0.0001, "step": 1140 }, { "epoch": 5.99, "grad_norm": 0.0035252266097813845, "learning_rate": 8.010471204188482e-05, "loss": 0.0009, "step": 1145 }, { "epoch": 6.02, "grad_norm": 0.0013756396947428584, "learning_rate": 7.958115183246073e-05, "loss": 0.0001, "step": 1150 }, { "epoch": 6.02, "eval_accuracy": 0.9711286089238845, "eval_loss": 0.11907853186130524, "eval_precision": 0.9719758650287736, "eval_recall": 0.9711286089238845, "eval_runtime": 18.1879, "eval_samples_per_second": 41.896, "eval_steps_per_second": 2.639, "step": 1150 }, { "epoch": 6.05, "grad_norm": 0.0016478670295327902, "learning_rate": 7.905759162303665e-05, "loss": 0.0001, "step": 1155 }, { "epoch": 6.07, "grad_norm": 0.0019091927679255605, "learning_rate": 7.853403141361257e-05, "loss": 0.0001, "step": 1160 }, { "epoch": 6.1, "grad_norm": 0.00418177992105484, "learning_rate": 7.801047120418848e-05, "loss": 0.0001, "step": 1165 }, { "epoch": 6.13, "grad_norm": 0.0010454648872837424, "learning_rate": 7.74869109947644e-05, "loss": 0.0001, "step": 1170 }, { "epoch": 6.15, "grad_norm": 0.003557993331924081, "learning_rate": 7.696335078534031e-05, "loss": 0.0001, "step": 1175 }, { "epoch": 6.18, "grad_norm": 0.00358203100040555, "learning_rate": 7.643979057591623e-05, "loss": 0.0002, "step": 1180 }, { "epoch": 6.2, "grad_norm": 0.0017727952217683196, "learning_rate": 7.591623036649214e-05, "loss": 0.0002, "step": 1185 }, { "epoch": 6.23, "grad_norm": 0.056761130690574646, "learning_rate": 7.539267015706806e-05, "loss": 0.0003, "step": 1190 }, { "epoch": 6.26, "grad_norm": 0.0010181496618315578, "learning_rate": 7.486910994764398e-05, "loss": 0.0003, "step": 1195 }, { "epoch": 6.28, "grad_norm": 0.003052978776395321, "learning_rate": 7.43455497382199e-05, "loss": 0.0001, "step": 1200 }, { "epoch": 6.28, "eval_accuracy": 0.968503937007874, "eval_loss": 0.1217927485704422, "eval_precision": 0.9692342594033277, "eval_recall": 0.968503937007874, "eval_runtime": 17.611, "eval_samples_per_second": 43.268, "eval_steps_per_second": 2.726, "step": 1200 }, { "epoch": 6.31, "grad_norm": 0.0031509266700595617, "learning_rate": 7.382198952879581e-05, "loss": 0.0002, "step": 1205 }, { "epoch": 6.34, "grad_norm": 0.0011931936023756862, "learning_rate": 7.329842931937174e-05, "loss": 0.0002, "step": 1210 }, { "epoch": 6.36, "grad_norm": 0.0014170074136927724, "learning_rate": 7.277486910994766e-05, "loss": 0.0002, "step": 1215 }, { "epoch": 6.39, "grad_norm": 0.0008681151666678488, "learning_rate": 7.225130890052356e-05, "loss": 0.0002, "step": 1220 }, { "epoch": 6.41, "grad_norm": 0.0015088602667674422, "learning_rate": 7.172774869109949e-05, "loss": 0.0002, "step": 1225 }, { "epoch": 6.44, "grad_norm": 0.0011560139246284962, "learning_rate": 7.12041884816754e-05, "loss": 0.0002, "step": 1230 }, { "epoch": 6.47, "grad_norm": 0.0059287999756634235, "learning_rate": 7.068062827225132e-05, "loss": 0.0002, "step": 1235 }, { "epoch": 6.49, "grad_norm": 0.0017274218844249845, "learning_rate": 7.015706806282722e-05, "loss": 0.0002, "step": 1240 }, { "epoch": 6.52, "grad_norm": 0.000886244117282331, "learning_rate": 6.963350785340315e-05, "loss": 0.0001, "step": 1245 }, { "epoch": 6.54, "grad_norm": 0.0028559649363160133, "learning_rate": 6.910994764397905e-05, "loss": 0.0002, "step": 1250 }, { "epoch": 6.54, "eval_accuracy": 0.968503937007874, "eval_loss": 0.11844318360090256, "eval_precision": 0.9692177012971396, "eval_recall": 0.968503937007874, "eval_runtime": 17.5917, "eval_samples_per_second": 43.316, "eval_steps_per_second": 2.729, "step": 1250 }, { "epoch": 6.57, "grad_norm": 0.0011471702018752694, "learning_rate": 6.858638743455498e-05, "loss": 0.0001, "step": 1255 }, { "epoch": 6.6, "grad_norm": 0.0015491463709622622, "learning_rate": 6.80628272251309e-05, "loss": 0.0001, "step": 1260 }, { "epoch": 6.62, "grad_norm": 0.0011632316745817661, "learning_rate": 6.75392670157068e-05, "loss": 0.0001, "step": 1265 }, { "epoch": 6.65, "grad_norm": 0.006089572329074144, "learning_rate": 6.701570680628273e-05, "loss": 0.0001, "step": 1270 }, { "epoch": 6.68, "grad_norm": 0.0025600052904337645, "learning_rate": 6.649214659685863e-05, "loss": 0.0001, "step": 1275 }, { "epoch": 6.7, "grad_norm": 0.001310704043135047, "learning_rate": 6.596858638743456e-05, "loss": 0.0001, "step": 1280 }, { "epoch": 6.73, "grad_norm": 0.0050185080617666245, "learning_rate": 6.544502617801048e-05, "loss": 0.0001, "step": 1285 }, { "epoch": 6.75, "grad_norm": 0.0030103351455181837, "learning_rate": 6.492146596858639e-05, "loss": 0.0001, "step": 1290 }, { "epoch": 6.78, "grad_norm": 0.0009425992611795664, "learning_rate": 6.439790575916231e-05, "loss": 0.0001, "step": 1295 }, { "epoch": 6.81, "grad_norm": 0.0016329910140484571, "learning_rate": 6.387434554973823e-05, "loss": 0.0002, "step": 1300 }, { "epoch": 6.81, "eval_accuracy": 0.9698162729658792, "eval_loss": 0.11775030940771103, "eval_precision": 0.9703878401175062, "eval_recall": 0.9698162729658792, "eval_runtime": 17.4725, "eval_samples_per_second": 43.611, "eval_steps_per_second": 2.747, "step": 1300 }, { "epoch": 6.83, "grad_norm": 0.0049902000464499, "learning_rate": 6.335078534031414e-05, "loss": 0.0002, "step": 1305 }, { "epoch": 6.86, "grad_norm": 0.0012894049286842346, "learning_rate": 6.282722513089006e-05, "loss": 0.0001, "step": 1310 }, { "epoch": 6.88, "grad_norm": 0.0013533816672861576, "learning_rate": 6.230366492146598e-05, "loss": 0.0001, "step": 1315 }, { "epoch": 6.91, "grad_norm": 0.0015272346790879965, "learning_rate": 6.178010471204189e-05, "loss": 0.0001, "step": 1320 }, { "epoch": 6.94, "grad_norm": 0.0021596469450742006, "learning_rate": 6.125654450261781e-05, "loss": 0.0001, "step": 1325 }, { "epoch": 6.96, "grad_norm": 0.0010469523258507252, "learning_rate": 6.073298429319372e-05, "loss": 0.0001, "step": 1330 }, { "epoch": 6.99, "grad_norm": 0.004506214987486601, "learning_rate": 6.020942408376964e-05, "loss": 0.0001, "step": 1335 }, { "epoch": 7.02, "grad_norm": 0.002483614254742861, "learning_rate": 5.968586387434555e-05, "loss": 0.0001, "step": 1340 }, { "epoch": 7.04, "grad_norm": 0.0011110632913187146, "learning_rate": 5.916230366492147e-05, "loss": 0.0001, "step": 1345 }, { "epoch": 7.07, "grad_norm": 0.0009813542710617185, "learning_rate": 5.863874345549738e-05, "loss": 0.0001, "step": 1350 }, { "epoch": 7.07, "eval_accuracy": 0.9698162729658792, "eval_loss": 0.1171058714389801, "eval_precision": 0.9703878401175062, "eval_recall": 0.9698162729658792, "eval_runtime": 17.4755, "eval_samples_per_second": 43.604, "eval_steps_per_second": 2.747, "step": 1350 }, { "epoch": 7.09, "grad_norm": 0.0015625334344804287, "learning_rate": 5.81151832460733e-05, "loss": 0.0001, "step": 1355 }, { "epoch": 7.12, "grad_norm": 0.0011336584575474262, "learning_rate": 5.759162303664922e-05, "loss": 0.0001, "step": 1360 }, { "epoch": 7.15, "grad_norm": 0.002448306418955326, "learning_rate": 5.7068062827225135e-05, "loss": 0.0001, "step": 1365 }, { "epoch": 7.17, "grad_norm": 0.003220533486455679, "learning_rate": 5.654450261780106e-05, "loss": 0.0001, "step": 1370 }, { "epoch": 7.2, "grad_norm": 0.0035826002713292837, "learning_rate": 5.6020942408376965e-05, "loss": 0.0001, "step": 1375 }, { "epoch": 7.23, "grad_norm": 0.0011589195346459746, "learning_rate": 5.5497382198952887e-05, "loss": 0.0001, "step": 1380 }, { "epoch": 7.25, "grad_norm": 0.00346089294180274, "learning_rate": 5.4973821989528795e-05, "loss": 0.0001, "step": 1385 }, { "epoch": 7.28, "grad_norm": 0.0009584302315488458, "learning_rate": 5.4450261780104716e-05, "loss": 0.0001, "step": 1390 }, { "epoch": 7.3, "grad_norm": 0.0017620971193537116, "learning_rate": 5.3926701570680624e-05, "loss": 0.0001, "step": 1395 }, { "epoch": 7.33, "grad_norm": 0.0009925129124894738, "learning_rate": 5.3403141361256546e-05, "loss": 0.0001, "step": 1400 }, { "epoch": 7.33, "eval_accuracy": 0.9698162729658792, "eval_loss": 0.11688791215419769, "eval_precision": 0.9704864266182309, "eval_recall": 0.9698162729658792, "eval_runtime": 17.5351, "eval_samples_per_second": 43.456, "eval_steps_per_second": 2.737, "step": 1400 }, { "epoch": 7.36, "grad_norm": 0.0010989775182679296, "learning_rate": 5.287958115183246e-05, "loss": 0.0001, "step": 1405 }, { "epoch": 7.38, "grad_norm": 0.001141382148489356, "learning_rate": 5.235602094240838e-05, "loss": 0.0001, "step": 1410 }, { "epoch": 7.41, "grad_norm": 0.0018131214892491698, "learning_rate": 5.18324607329843e-05, "loss": 0.0001, "step": 1415 }, { "epoch": 7.43, "grad_norm": 0.0016311927465721965, "learning_rate": 5.130890052356021e-05, "loss": 0.0002, "step": 1420 }, { "epoch": 7.46, "grad_norm": 0.0019679146353155375, "learning_rate": 5.0785340314136134e-05, "loss": 0.0001, "step": 1425 }, { "epoch": 7.49, "grad_norm": 0.0030663602519780397, "learning_rate": 5.026178010471204e-05, "loss": 0.0001, "step": 1430 }, { "epoch": 7.51, "grad_norm": 0.0022476576268672943, "learning_rate": 4.973821989528796e-05, "loss": 0.0001, "step": 1435 }, { "epoch": 7.54, "grad_norm": 0.001257985015399754, "learning_rate": 4.921465968586388e-05, "loss": 0.0001, "step": 1440 }, { "epoch": 7.57, "grad_norm": 0.002487305086106062, "learning_rate": 4.869109947643979e-05, "loss": 0.0001, "step": 1445 }, { "epoch": 7.59, "grad_norm": 0.0023026170674711466, "learning_rate": 4.816753926701571e-05, "loss": 0.0001, "step": 1450 }, { "epoch": 7.59, "eval_accuracy": 0.9711286089238845, "eval_loss": 0.11671730875968933, "eval_precision": 0.9718133010833329, "eval_recall": 0.9711286089238845, "eval_runtime": 17.5661, "eval_samples_per_second": 43.379, "eval_steps_per_second": 2.733, "step": 1450 }, { "epoch": 7.62, "grad_norm": 0.0014070291072130203, "learning_rate": 4.764397905759162e-05, "loss": 0.0001, "step": 1455 }, { "epoch": 7.64, "grad_norm": 0.0011672518448904157, "learning_rate": 4.7120418848167544e-05, "loss": 0.0001, "step": 1460 }, { "epoch": 7.67, "grad_norm": 0.0015011669602245092, "learning_rate": 4.659685863874346e-05, "loss": 0.0001, "step": 1465 }, { "epoch": 7.7, "grad_norm": 0.001993841025978327, "learning_rate": 4.6073298429319374e-05, "loss": 0.0001, "step": 1470 }, { "epoch": 7.72, "grad_norm": 0.0008604762842878699, "learning_rate": 4.554973821989529e-05, "loss": 0.0001, "step": 1475 }, { "epoch": 7.75, "grad_norm": 0.002782499184831977, "learning_rate": 4.50261780104712e-05, "loss": 0.0001, "step": 1480 }, { "epoch": 7.77, "grad_norm": 0.002074967371299863, "learning_rate": 4.4502617801047125e-05, "loss": 0.0001, "step": 1485 }, { "epoch": 7.8, "grad_norm": 0.0031987845432013273, "learning_rate": 4.397905759162304e-05, "loss": 0.0001, "step": 1490 }, { "epoch": 7.83, "grad_norm": 0.0007084137760102749, "learning_rate": 4.3455497382198955e-05, "loss": 0.0001, "step": 1495 }, { "epoch": 7.85, "grad_norm": 0.0019736553076654673, "learning_rate": 4.293193717277487e-05, "loss": 0.0001, "step": 1500 }, { "epoch": 7.85, "eval_accuracy": 0.9711286089238845, "eval_loss": 0.11646050214767456, "eval_precision": 0.9718133010833329, "eval_recall": 0.9711286089238845, "eval_runtime": 17.4763, "eval_samples_per_second": 43.602, "eval_steps_per_second": 2.747, "step": 1500 } ], "logging_steps": 5, "max_steps": 1910, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 1.8551418818712515e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }