|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.8596001859600186, |
|
"eval_steps": 500, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004649000464900047, |
|
"grad_norm": 0.3293333649635315, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 1.2004, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.009298000929800094, |
|
"grad_norm": 0.3225855827331543, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 1.2289, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.01394700139470014, |
|
"grad_norm": 0.3228434920310974, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2513, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.018596001859600187, |
|
"grad_norm": 0.3422330915927887, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 1.2952, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.023245002324500233, |
|
"grad_norm": 0.35845088958740234, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 1.3072, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.02789400278940028, |
|
"grad_norm": 0.41468068957328796, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4565, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.032543003254300325, |
|
"grad_norm": 0.45462965965270996, |
|
"learning_rate": 0.00011666666666666668, |
|
"loss": 1.4221, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.037192003719200374, |
|
"grad_norm": 0.5441383719444275, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 1.4777, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.04184100418410042, |
|
"grad_norm": 0.7736104726791382, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 1.5793, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.046490004649000466, |
|
"grad_norm": 1.1128149032592773, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 1.7799, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05113900511390051, |
|
"grad_norm": 1.295057773590088, |
|
"learning_rate": 0.00018333333333333334, |
|
"loss": 1.6539, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.05578800557880056, |
|
"grad_norm": 2.0307557582855225, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6614, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.06043700604370061, |
|
"grad_norm": 1.489993691444397, |
|
"learning_rate": 0.0001999970498341241, |
|
"loss": 1.3891, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.06508600650860065, |
|
"grad_norm": 0.45855778455734253, |
|
"learning_rate": 0.0001999881995299069, |
|
"loss": 1.1051, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0697350069735007, |
|
"grad_norm": 0.464190274477005, |
|
"learning_rate": 0.0001999734496675677, |
|
"loss": 1.1558, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.07438400743840075, |
|
"grad_norm": 0.47878339886665344, |
|
"learning_rate": 0.00019995280121409636, |
|
"loss": 1.1533, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.07903300790330078, |
|
"grad_norm": 0.4506080448627472, |
|
"learning_rate": 0.00019992625552318972, |
|
"loss": 1.2074, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.08368200836820083, |
|
"grad_norm": 0.4147086441516876, |
|
"learning_rate": 0.00019989381433516316, |
|
"loss": 1.274, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.08833100883310088, |
|
"grad_norm": 0.3981817066669464, |
|
"learning_rate": 0.00019985547977683643, |
|
"loss": 1.2457, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.09298000929800093, |
|
"grad_norm": 0.3968053162097931, |
|
"learning_rate": 0.00019981125436139405, |
|
"loss": 1.1961, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09762900976290098, |
|
"grad_norm": 0.5115089416503906, |
|
"learning_rate": 0.00019976114098822073, |
|
"loss": 1.228, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.10227801022780102, |
|
"grad_norm": 0.5449497103691101, |
|
"learning_rate": 0.00019970514294271124, |
|
"loss": 1.1062, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.10692701069270107, |
|
"grad_norm": 0.5306552052497864, |
|
"learning_rate": 0.00019964326389605496, |
|
"loss": 1.0354, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.11157601115760112, |
|
"grad_norm": 0.5531755089759827, |
|
"learning_rate": 0.00019957550790499526, |
|
"loss": 1.0759, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.11622501162250116, |
|
"grad_norm": 0.9215659499168396, |
|
"learning_rate": 0.00019950187941156355, |
|
"loss": 1.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.12087401208740121, |
|
"grad_norm": 0.2715967297554016, |
|
"learning_rate": 0.00019942238324278803, |
|
"loss": 1.0803, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.12552301255230125, |
|
"grad_norm": 0.24064235389232635, |
|
"learning_rate": 0.00019933702461037716, |
|
"loss": 1.0892, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.1301720130172013, |
|
"grad_norm": 0.18895158171653748, |
|
"learning_rate": 0.00019924580911037827, |
|
"loss": 1.1303, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.13482101348210135, |
|
"grad_norm": 0.1712905615568161, |
|
"learning_rate": 0.00019914874272281032, |
|
"loss": 1.1635, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.1394700139470014, |
|
"grad_norm": 0.18927285075187683, |
|
"learning_rate": 0.00019904583181127206, |
|
"loss": 1.1706, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.14411901441190145, |
|
"grad_norm": 0.2120945006608963, |
|
"learning_rate": 0.0001989370831225248, |
|
"loss": 1.1774, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.1487680148768015, |
|
"grad_norm": 0.2811720371246338, |
|
"learning_rate": 0.00019882250378605015, |
|
"loss": 1.1867, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.15341701534170155, |
|
"grad_norm": 0.3283132016658783, |
|
"learning_rate": 0.00019870210131358253, |
|
"loss": 1.2136, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.15806601580660157, |
|
"grad_norm": 0.3853696286678314, |
|
"learning_rate": 0.0001985758835986167, |
|
"loss": 1.1155, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.16271501627150162, |
|
"grad_norm": 0.3992000222206116, |
|
"learning_rate": 0.0001984438589158903, |
|
"loss": 1.071, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.16736401673640167, |
|
"grad_norm": 0.3955259621143341, |
|
"learning_rate": 0.0001983060359208415, |
|
"loss": 0.9018, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.17201301720130172, |
|
"grad_norm": 0.44543391466140747, |
|
"learning_rate": 0.00019816242364904132, |
|
"loss": 0.8921, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.17666201766620176, |
|
"grad_norm": 0.5863587856292725, |
|
"learning_rate": 0.00019801303151560138, |
|
"loss": 1.0336, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.18131101813110181, |
|
"grad_norm": 0.13918030261993408, |
|
"learning_rate": 0.0001978578693145566, |
|
"loss": 1.1035, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.18596001859600186, |
|
"grad_norm": 0.14183400571346283, |
|
"learning_rate": 0.00019769694721822337, |
|
"loss": 1.103, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1906090190609019, |
|
"grad_norm": 0.11069459468126297, |
|
"learning_rate": 0.00019753027577653213, |
|
"loss": 1.11, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.19525801952580196, |
|
"grad_norm": 0.1001875028014183, |
|
"learning_rate": 0.00019735786591633633, |
|
"loss": 1.1537, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.199907019990702, |
|
"grad_norm": 0.137644425034523, |
|
"learning_rate": 0.0001971797289406956, |
|
"loss": 1.1678, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.20455602045560203, |
|
"grad_norm": 0.13967232406139374, |
|
"learning_rate": 0.00019699587652813503, |
|
"loss": 1.147, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.20920502092050208, |
|
"grad_norm": 0.19884178042411804, |
|
"learning_rate": 0.00019680632073187931, |
|
"loss": 1.1424, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.21385402138540213, |
|
"grad_norm": 0.22492916882038116, |
|
"learning_rate": 0.00019661107397906275, |
|
"loss": 1.061, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.21850302185030218, |
|
"grad_norm": 0.2543758153915405, |
|
"learning_rate": 0.00019641014906991437, |
|
"loss": 0.9985, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.22315202231520223, |
|
"grad_norm": 0.3426196873188019, |
|
"learning_rate": 0.00019620355917691884, |
|
"loss": 0.935, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.22780102278010228, |
|
"grad_norm": 0.39695194363594055, |
|
"learning_rate": 0.00019599131784395297, |
|
"loss": 0.8601, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.23245002324500233, |
|
"grad_norm": 0.762519359588623, |
|
"learning_rate": 0.00019577343898539748, |
|
"loss": 0.9861, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.23709902370990238, |
|
"grad_norm": 0.10567212849855423, |
|
"learning_rate": 0.00019554993688522524, |
|
"loss": 1.0594, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.24174802417480243, |
|
"grad_norm": 0.11329612880945206, |
|
"learning_rate": 0.00019532082619606436, |
|
"loss": 1.0659, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.24639702463970248, |
|
"grad_norm": 0.1084655150771141, |
|
"learning_rate": 0.00019508612193823793, |
|
"loss": 1.0896, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.2510460251046025, |
|
"grad_norm": 0.1181301698088646, |
|
"learning_rate": 0.00019484583949877908, |
|
"loss": 1.1282, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.2556950255695026, |
|
"grad_norm": 0.14401216804981232, |
|
"learning_rate": 0.00019459999463042237, |
|
"loss": 1.118, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.2603440260344026, |
|
"grad_norm": 0.1677497774362564, |
|
"learning_rate": 0.00019434860345057096, |
|
"loss": 1.1425, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.2649930264993027, |
|
"grad_norm": 0.20358914136886597, |
|
"learning_rate": 0.00019409168244023987, |
|
"loss": 1.1249, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.2696420269642027, |
|
"grad_norm": 0.26533111929893494, |
|
"learning_rate": 0.00019382924844297582, |
|
"loss": 1.1438, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.2742910274291027, |
|
"grad_norm": 0.33439338207244873, |
|
"learning_rate": 0.0001935613186637526, |
|
"loss": 1.0949, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.2789400278940028, |
|
"grad_norm": 0.4424538016319275, |
|
"learning_rate": 0.0001932879106678434, |
|
"loss": 1.0185, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2835890283589028, |
|
"grad_norm": 0.4415128827095032, |
|
"learning_rate": 0.00019300904237966906, |
|
"loss": 0.9373, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.2882380288238029, |
|
"grad_norm": 0.4046485424041748, |
|
"learning_rate": 0.00019272473208162313, |
|
"loss": 0.8202, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.2928870292887029, |
|
"grad_norm": 0.324368417263031, |
|
"learning_rate": 0.00019243499841287308, |
|
"loss": 0.9427, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.297536029753603, |
|
"grad_norm": 0.07730630785226822, |
|
"learning_rate": 0.00019213986036813863, |
|
"loss": 1.0637, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.302185030218503, |
|
"grad_norm": 0.08834154158830643, |
|
"learning_rate": 0.0001918393372964461, |
|
"loss": 1.0867, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.3068340306834031, |
|
"grad_norm": 0.09792964160442352, |
|
"learning_rate": 0.00019153344889986023, |
|
"loss": 1.0948, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.3114830311483031, |
|
"grad_norm": 0.08694823086261749, |
|
"learning_rate": 0.0001912222152321923, |
|
"loss": 1.1405, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.31613203161320313, |
|
"grad_norm": 0.12065356969833374, |
|
"learning_rate": 0.0001909056566976856, |
|
"loss": 1.1431, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.3207810320781032, |
|
"grad_norm": 0.1258689910173416, |
|
"learning_rate": 0.00019058379404967757, |
|
"loss": 1.1709, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.32543003254300323, |
|
"grad_norm": 0.14431503415107727, |
|
"learning_rate": 0.0001902566483892393, |
|
"loss": 1.0588, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.3300790330079033, |
|
"grad_norm": 0.1783875674009323, |
|
"learning_rate": 0.00018992424116379228, |
|
"loss": 1.0258, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.33472803347280333, |
|
"grad_norm": 0.23002314567565918, |
|
"learning_rate": 0.00018958659416570212, |
|
"loss": 0.9455, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.3393770339377034, |
|
"grad_norm": 0.305070161819458, |
|
"learning_rate": 0.00018924372953084997, |
|
"loss": 0.9194, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.34402603440260343, |
|
"grad_norm": 0.3687322437763214, |
|
"learning_rate": 0.0001888956697371813, |
|
"loss": 0.883, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.3486750348675035, |
|
"grad_norm": 0.6616573333740234, |
|
"learning_rate": 0.00018854243760323223, |
|
"loss": 0.872, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.35332403533240353, |
|
"grad_norm": 0.09560154378414154, |
|
"learning_rate": 0.0001881840562866336, |
|
"loss": 1.0362, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.3579730357973036, |
|
"grad_norm": 0.09787497669458389, |
|
"learning_rate": 0.00018782054928259277, |
|
"loss": 1.0836, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.36262203626220363, |
|
"grad_norm": 0.12214916199445724, |
|
"learning_rate": 0.0001874519404223533, |
|
"loss": 1.0993, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.36727103672710365, |
|
"grad_norm": 0.1271420419216156, |
|
"learning_rate": 0.00018707825387163248, |
|
"loss": 1.1327, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.3719200371920037, |
|
"grad_norm": 0.15113487839698792, |
|
"learning_rate": 0.00018669951412903725, |
|
"loss": 1.1284, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.37656903765690375, |
|
"grad_norm": 0.1705760657787323, |
|
"learning_rate": 0.00018631574602445792, |
|
"loss": 1.1973, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.3812180381218038, |
|
"grad_norm": 0.1751885861158371, |
|
"learning_rate": 0.0001859269747174404, |
|
"loss": 1.1302, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.38586703858670385, |
|
"grad_norm": 0.23309065401554108, |
|
"learning_rate": 0.00018553322569553682, |
|
"loss": 1.1091, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.3905160390516039, |
|
"grad_norm": 0.30925455689430237, |
|
"learning_rate": 0.0001851345247726344, |
|
"loss": 1.0555, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.39516503951650395, |
|
"grad_norm": 0.41445159912109375, |
|
"learning_rate": 0.00018473089808726336, |
|
"loss": 0.9417, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.399814039981404, |
|
"grad_norm": 0.3391374945640564, |
|
"learning_rate": 0.00018432237210088307, |
|
"loss": 0.9262, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.40446304044630405, |
|
"grad_norm": 0.43099090456962585, |
|
"learning_rate": 0.00018390897359614748, |
|
"loss": 0.77, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.40911204091120407, |
|
"grad_norm": 0.3982544243335724, |
|
"learning_rate": 0.00018349072967514896, |
|
"loss": 0.9355, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.41376104137610414, |
|
"grad_norm": 0.06400130689144135, |
|
"learning_rate": 0.00018306766775764196, |
|
"loss": 1.0571, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.41841004184100417, |
|
"grad_norm": 0.0744890421628952, |
|
"learning_rate": 0.00018263981557924483, |
|
"loss": 1.0519, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.42305904230590424, |
|
"grad_norm": 0.08555030077695847, |
|
"learning_rate": 0.00018220720118962205, |
|
"loss": 1.1044, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.42770804277080426, |
|
"grad_norm": 0.09533877670764923, |
|
"learning_rate": 0.00018176985295064487, |
|
"loss": 1.1114, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.43235704323570434, |
|
"grad_norm": 0.11658069491386414, |
|
"learning_rate": 0.00018132779953453226, |
|
"loss": 1.1064, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.43700604370060436, |
|
"grad_norm": 0.13404923677444458, |
|
"learning_rate": 0.00018088106992197091, |
|
"loss": 1.1292, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.44165504416550444, |
|
"grad_norm": 0.16842801868915558, |
|
"learning_rate": 0.00018042969340021546, |
|
"loss": 1.1256, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.44630404463040446, |
|
"grad_norm": 0.24980179965496063, |
|
"learning_rate": 0.00017997369956116845, |
|
"loss": 1.0524, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.4509530450953045, |
|
"grad_norm": 0.27264076471328735, |
|
"learning_rate": 0.00017951311829944014, |
|
"loss": 0.9696, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.45560204556020456, |
|
"grad_norm": 0.33832067251205444, |
|
"learning_rate": 0.00017904797981038874, |
|
"loss": 0.9045, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.4602510460251046, |
|
"grad_norm": 0.40327176451683044, |
|
"learning_rate": 0.00017857831458814098, |
|
"loss": 0.8434, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.46490004649000466, |
|
"grad_norm": 0.7074009776115417, |
|
"learning_rate": 0.00017810415342359257, |
|
"loss": 0.9095, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.4695490469549047, |
|
"grad_norm": 0.0739617720246315, |
|
"learning_rate": 0.00017762552740238998, |
|
"loss": 1.0309, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.47419804741980476, |
|
"grad_norm": 0.0909004807472229, |
|
"learning_rate": 0.00017714246790289214, |
|
"loss": 1.0933, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.4788470478847048, |
|
"grad_norm": 0.0961037203669548, |
|
"learning_rate": 0.0001766550065941136, |
|
"loss": 1.0684, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.48349604834960486, |
|
"grad_norm": 0.11135982722043991, |
|
"learning_rate": 0.00017616317543364804, |
|
"loss": 1.118, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.4881450488145049, |
|
"grad_norm": 0.11919623613357544, |
|
"learning_rate": 0.00017566700666557346, |
|
"loss": 1.1175, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.49279404927940496, |
|
"grad_norm": 0.1320277750492096, |
|
"learning_rate": 0.00017516653281833794, |
|
"loss": 1.113, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.497443049744305, |
|
"grad_norm": 0.15232138335704803, |
|
"learning_rate": 0.00017466178670262747, |
|
"loss": 1.1451, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.502092050209205, |
|
"grad_norm": 0.17556369304656982, |
|
"learning_rate": 0.00017415280140921463, |
|
"loss": 1.0489, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.506741050674105, |
|
"grad_norm": 0.2791363596916199, |
|
"learning_rate": 0.00017363961030678927, |
|
"loss": 1.0767, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.5113900511390052, |
|
"grad_norm": 0.39919206500053406, |
|
"learning_rate": 0.00017312224703977094, |
|
"loss": 0.999, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.5160390516039052, |
|
"grad_norm": 0.41332653164863586, |
|
"learning_rate": 0.00017260074552610306, |
|
"loss": 0.8625, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.5206880520688052, |
|
"grad_norm": 0.41465967893600464, |
|
"learning_rate": 0.00017207513995502939, |
|
"loss": 0.8294, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.5253370525337052, |
|
"grad_norm": 0.35807111859321594, |
|
"learning_rate": 0.00017154546478485264, |
|
"loss": 0.9328, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.5299860529986054, |
|
"grad_norm": 0.18978600203990936, |
|
"learning_rate": 0.0001710117547406753, |
|
"loss": 1.0327, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.5346350534635054, |
|
"grad_norm": 0.16808465123176575, |
|
"learning_rate": 0.00017047404481212314, |
|
"loss": 1.0977, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.5392840539284054, |
|
"grad_norm": 0.1754007339477539, |
|
"learning_rate": 0.0001699323702510513, |
|
"loss": 1.063, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.5439330543933054, |
|
"grad_norm": 0.15331235527992249, |
|
"learning_rate": 0.0001693867665692333, |
|
"loss": 1.1018, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.5485820548582054, |
|
"grad_norm": 0.13346055150032043, |
|
"learning_rate": 0.00016883726953603273, |
|
"loss": 1.1215, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.5532310553231056, |
|
"grad_norm": 0.14829131960868835, |
|
"learning_rate": 0.00016828391517605845, |
|
"loss": 1.1484, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.5578800557880056, |
|
"grad_norm": 0.14702975749969482, |
|
"learning_rate": 0.0001677267397668026, |
|
"loss": 1.0535, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.5625290562529056, |
|
"grad_norm": 0.21014554798603058, |
|
"learning_rate": 0.00016716577983626259, |
|
"loss": 1.1138, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.5671780567178056, |
|
"grad_norm": 0.2756856679916382, |
|
"learning_rate": 0.000166601072160546, |
|
"loss": 0.9879, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.5718270571827058, |
|
"grad_norm": 0.39934659004211426, |
|
"learning_rate": 0.0001660326537614599, |
|
"loss": 0.9133, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.5764760576476058, |
|
"grad_norm": 0.37866297364234924, |
|
"learning_rate": 0.0001654605619040835, |
|
"loss": 0.9116, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.5811250581125058, |
|
"grad_norm": 0.479396253824234, |
|
"learning_rate": 0.00016488483409432504, |
|
"loss": 0.8499, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.5857740585774058, |
|
"grad_norm": 0.0876205563545227, |
|
"learning_rate": 0.00016430550807646323, |
|
"loss": 1.0378, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.5904230590423059, |
|
"grad_norm": 0.09576641768217087, |
|
"learning_rate": 0.00016372262183067247, |
|
"loss": 1.0617, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.595072059507206, |
|
"grad_norm": 0.08960004895925522, |
|
"learning_rate": 0.00016313621357053306, |
|
"loss": 1.0774, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.599721059972106, |
|
"grad_norm": 0.0907701924443245, |
|
"learning_rate": 0.00016254632174052578, |
|
"loss": 1.0989, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.604370060437006, |
|
"grad_norm": 0.09422672539949417, |
|
"learning_rate": 0.00016195298501351177, |
|
"loss": 1.1659, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.609019060901906, |
|
"grad_norm": 0.10595980286598206, |
|
"learning_rate": 0.00016135624228819683, |
|
"loss": 1.1642, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.6136680613668062, |
|
"grad_norm": 0.12369013577699661, |
|
"learning_rate": 0.00016075613268658157, |
|
"loss": 1.1369, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.6183170618317062, |
|
"grad_norm": 0.16726712882518768, |
|
"learning_rate": 0.00016015269555139642, |
|
"loss": 1.0458, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.6229660622966062, |
|
"grad_norm": 0.21234917640686035, |
|
"learning_rate": 0.00015954597044352234, |
|
"loss": 1.0013, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.6276150627615062, |
|
"grad_norm": 0.28728505969047546, |
|
"learning_rate": 0.00015893599713939728, |
|
"loss": 0.9075, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.6322640632264063, |
|
"grad_norm": 0.4090898334980011, |
|
"learning_rate": 0.00015832281562840856, |
|
"loss": 0.9677, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.6369130636913064, |
|
"grad_norm": 0.3748779296875, |
|
"learning_rate": 0.000157706466110271, |
|
"loss": 0.7873, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.6415620641562064, |
|
"grad_norm": 0.31243249773979187, |
|
"learning_rate": 0.00015708698899239172, |
|
"loss": 0.8241, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.6462110646211064, |
|
"grad_norm": 0.08351978659629822, |
|
"learning_rate": 0.00015646442488722074, |
|
"loss": 1.0431, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.6508600650860065, |
|
"grad_norm": 0.08479636162519455, |
|
"learning_rate": 0.00015583881460958868, |
|
"loss": 1.0725, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.6555090655509066, |
|
"grad_norm": 0.09441729635000229, |
|
"learning_rate": 0.000155210199174031, |
|
"loss": 1.076, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.6601580660158066, |
|
"grad_norm": 0.10794027149677277, |
|
"learning_rate": 0.0001545786197920989, |
|
"loss": 1.1112, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.6648070664807066, |
|
"grad_norm": 0.11890177428722382, |
|
"learning_rate": 0.00015394411786965776, |
|
"loss": 1.1748, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.6694560669456067, |
|
"grad_norm": 0.13239571452140808, |
|
"learning_rate": 0.0001533067350041725, |
|
"loss": 1.1075, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.6741050674105067, |
|
"grad_norm": 0.1535806506872177, |
|
"learning_rate": 0.00015266651298198033, |
|
"loss": 1.1259, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.6787540678754068, |
|
"grad_norm": 0.19703824818134308, |
|
"learning_rate": 0.00015202349377555166, |
|
"loss": 1.0655, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.6834030683403068, |
|
"grad_norm": 0.2627493441104889, |
|
"learning_rate": 0.00015137771954073804, |
|
"loss": 0.9644, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.6880520688052069, |
|
"grad_norm": 0.3154362142086029, |
|
"learning_rate": 0.0001507292326140085, |
|
"loss": 0.8241, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.6927010692701069, |
|
"grad_norm": 0.3660978078842163, |
|
"learning_rate": 0.0001500780755096743, |
|
"loss": 0.9218, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.697350069735007, |
|
"grad_norm": 0.4281309247016907, |
|
"learning_rate": 0.00014942429091710141, |
|
"loss": 0.7471, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.701999070199907, |
|
"grad_norm": 0.058116402477025986, |
|
"learning_rate": 0.00014876792169791193, |
|
"loss": 1.0336, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.7066480706648071, |
|
"grad_norm": 0.06079982966184616, |
|
"learning_rate": 0.00014810901088317414, |
|
"loss": 1.0446, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.7112970711297071, |
|
"grad_norm": 0.06905698031187057, |
|
"learning_rate": 0.00014744760167058137, |
|
"loss": 1.0841, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.7159460715946072, |
|
"grad_norm": 0.07570036500692368, |
|
"learning_rate": 0.00014678373742162007, |
|
"loss": 1.0895, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.7205950720595072, |
|
"grad_norm": 0.08454253524541855, |
|
"learning_rate": 0.00014611746165872698, |
|
"loss": 1.1083, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.7252440725244073, |
|
"grad_norm": 0.10023923218250275, |
|
"learning_rate": 0.00014544881806243583, |
|
"loss": 1.0951, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.7298930729893073, |
|
"grad_norm": 0.12155482918024063, |
|
"learning_rate": 0.00014477785046851385, |
|
"loss": 1.092, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.7345420734542073, |
|
"grad_norm": 0.15077327191829681, |
|
"learning_rate": 0.00014410460286508762, |
|
"loss": 1.0582, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.7391910739191074, |
|
"grad_norm": 0.19081415235996246, |
|
"learning_rate": 0.00014342911938975948, |
|
"loss": 1.0055, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.7438400743840075, |
|
"grad_norm": 0.24974983930587769, |
|
"learning_rate": 0.0001427514443267139, |
|
"loss": 0.9043, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.7484890748489075, |
|
"grad_norm": 0.33478638529777527, |
|
"learning_rate": 0.00014207162210381404, |
|
"loss": 0.8285, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.7531380753138075, |
|
"grad_norm": 0.34924110770225525, |
|
"learning_rate": 0.0001413896972896894, |
|
"loss": 0.774, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.7577870757787076, |
|
"grad_norm": 0.2992526888847351, |
|
"learning_rate": 0.00014070571459081366, |
|
"loss": 0.8779, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.7624360762436077, |
|
"grad_norm": 0.07906091213226318, |
|
"learning_rate": 0.0001400197188485739, |
|
"loss": 1.047, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.7670850767085077, |
|
"grad_norm": 0.09800871461629868, |
|
"learning_rate": 0.00013933175503633068, |
|
"loss": 1.0439, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.7717340771734077, |
|
"grad_norm": 0.09857136756181717, |
|
"learning_rate": 0.00013864186825646995, |
|
"loss": 1.0522, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.7763830776383077, |
|
"grad_norm": 0.1110193282365799, |
|
"learning_rate": 0.00013795010373744582, |
|
"loss": 1.1126, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.7810320781032078, |
|
"grad_norm": 0.11846626549959183, |
|
"learning_rate": 0.00013725650683081556, |
|
"loss": 1.0925, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.7856810785681079, |
|
"grad_norm": 0.14214776456356049, |
|
"learning_rate": 0.00013656112300826646, |
|
"loss": 1.1323, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.7903300790330079, |
|
"grad_norm": 0.1606011986732483, |
|
"learning_rate": 0.00013586399785863454, |
|
"loss": 1.0505, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.7949790794979079, |
|
"grad_norm": 0.18738731741905212, |
|
"learning_rate": 0.000135165177084916, |
|
"loss": 1.0334, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.799628079962808, |
|
"grad_norm": 0.23303773999214172, |
|
"learning_rate": 0.0001344647065012709, |
|
"loss": 0.9471, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.8042770804277081, |
|
"grad_norm": 0.27448582649230957, |
|
"learning_rate": 0.00013376263203001938, |
|
"loss": 0.8672, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.8089260808926081, |
|
"grad_norm": 0.29609808325767517, |
|
"learning_rate": 0.0001330589996986315, |
|
"loss": 0.7936, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.8135750813575081, |
|
"grad_norm": 0.43915602564811707, |
|
"learning_rate": 0.00013235385563670934, |
|
"loss": 0.7688, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.8182240818224081, |
|
"grad_norm": 0.06484824419021606, |
|
"learning_rate": 0.00013164724607296285, |
|
"loss": 1.0403, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.8228730822873083, |
|
"grad_norm": 0.07361859828233719, |
|
"learning_rate": 0.00013093921733217916, |
|
"loss": 1.0539, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.8275220827522083, |
|
"grad_norm": 0.08026642352342606, |
|
"learning_rate": 0.00013022981583218565, |
|
"loss": 1.0596, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.8321710832171083, |
|
"grad_norm": 0.08283592760562897, |
|
"learning_rate": 0.0001295190880808067, |
|
"loss": 1.0435, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.8368200836820083, |
|
"grad_norm": 0.11680889129638672, |
|
"learning_rate": 0.00012880708067281477, |
|
"loss": 1.1464, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.8414690841469085, |
|
"grad_norm": 0.10784471035003662, |
|
"learning_rate": 0.00012809384028687553, |
|
"loss": 1.1004, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.8461180846118085, |
|
"grad_norm": 0.1224328875541687, |
|
"learning_rate": 0.00012737941368248792, |
|
"loss": 1.0699, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.8507670850767085, |
|
"grad_norm": 0.15732775628566742, |
|
"learning_rate": 0.0001266638476969183, |
|
"loss": 1.0579, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.8554160855416085, |
|
"grad_norm": 0.18987177312374115, |
|
"learning_rate": 0.00012594718924213008, |
|
"loss": 1.0212, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.8600650860065086, |
|
"grad_norm": 0.250615656375885, |
|
"learning_rate": 0.00012522948530170806, |
|
"loss": 0.9817, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.8647140864714087, |
|
"grad_norm": 0.28331658244132996, |
|
"learning_rate": 0.00012451078292777837, |
|
"loss": 0.8079, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.8693630869363087, |
|
"grad_norm": 0.3439493477344513, |
|
"learning_rate": 0.0001237911292379237, |
|
"loss": 0.7382, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.8740120874012087, |
|
"grad_norm": 0.35732191801071167, |
|
"learning_rate": 0.00012307057141209415, |
|
"loss": 0.9792, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.8786610878661087, |
|
"grad_norm": 0.10152771323919296, |
|
"learning_rate": 0.0001223491566895144, |
|
"loss": 1.0674, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.8833100883310089, |
|
"grad_norm": 0.11470180004835129, |
|
"learning_rate": 0.00012162693236558658, |
|
"loss": 1.0276, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.8879590887959089, |
|
"grad_norm": 0.1217978298664093, |
|
"learning_rate": 0.00012090394578878974, |
|
"loss": 1.0734, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.8926080892608089, |
|
"grad_norm": 0.11547485738992691, |
|
"learning_rate": 0.0001201802443575756, |
|
"loss": 1.0862, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.897257089725709, |
|
"grad_norm": 0.13204635679721832, |
|
"learning_rate": 0.00011945587551726116, |
|
"loss": 1.1112, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.901906090190609, |
|
"grad_norm": 0.12314517050981522, |
|
"learning_rate": 0.00011873088675691835, |
|
"loss": 1.1342, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.9065550906555091, |
|
"grad_norm": 0.14891035854816437, |
|
"learning_rate": 0.00011800532560626048, |
|
"loss": 1.0975, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.9112040911204091, |
|
"grad_norm": 0.17384392023086548, |
|
"learning_rate": 0.0001172792396325264, |
|
"loss": 1.0599, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.9158530915853091, |
|
"grad_norm": 0.22434796392917633, |
|
"learning_rate": 0.00011655267643736194, |
|
"loss": 1.0202, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.9205020920502092, |
|
"grad_norm": 0.26256677508354187, |
|
"learning_rate": 0.00011582568365369924, |
|
"loss": 0.9057, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.9251510925151093, |
|
"grad_norm": 0.30824849009513855, |
|
"learning_rate": 0.00011509830894263387, |
|
"loss": 0.8073, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.9298000929800093, |
|
"grad_norm": 0.4767857789993286, |
|
"learning_rate": 0.00011437059999030035, |
|
"loss": 0.806, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.9344490934449093, |
|
"grad_norm": 0.05803222209215164, |
|
"learning_rate": 0.00011364260450474575, |
|
"loss": 1.0481, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.9390980939098094, |
|
"grad_norm": 0.06460921466350555, |
|
"learning_rate": 0.00011291437021280205, |
|
"loss": 1.0617, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.9437470943747094, |
|
"grad_norm": 0.07504323869943619, |
|
"learning_rate": 0.0001121859448569572, |
|
"loss": 1.0995, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.9483960948396095, |
|
"grad_norm": 0.07197124511003494, |
|
"learning_rate": 0.00011145737619222516, |
|
"loss": 1.0629, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.9530450953045095, |
|
"grad_norm": 0.08467988669872284, |
|
"learning_rate": 0.0001107287119830151, |
|
"loss": 1.0961, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.9576940957694096, |
|
"grad_norm": 0.10283592343330383, |
|
"learning_rate": 0.00011000000000000002, |
|
"loss": 1.1351, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.9623430962343096, |
|
"grad_norm": 0.11667031794786453, |
|
"learning_rate": 0.00010927128801698494, |
|
"loss": 1.0459, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.9669920966992097, |
|
"grad_norm": 0.1426560878753662, |
|
"learning_rate": 0.00010854262380777486, |
|
"loss": 1.0367, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.9716410971641097, |
|
"grad_norm": 0.18659153580665588, |
|
"learning_rate": 0.00010781405514304284, |
|
"loss": 0.9566, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.9762900976290098, |
|
"grad_norm": 0.2517950236797333, |
|
"learning_rate": 0.000107085629787198, |
|
"loss": 0.9468, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.9809390980939098, |
|
"grad_norm": 0.27402400970458984, |
|
"learning_rate": 0.0001063573954952543, |
|
"loss": 0.8093, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.9855880985588099, |
|
"grad_norm": 0.339691698551178, |
|
"learning_rate": 0.0001056294000096997, |
|
"loss": 0.7326, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.9902370990237099, |
|
"grad_norm": 0.36932969093322754, |
|
"learning_rate": 0.00010490169105736613, |
|
"loss": 0.9435, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.99488609948861, |
|
"grad_norm": 0.09353512525558472, |
|
"learning_rate": 0.0001041743163463008, |
|
"loss": 1.1068, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.99953509995351, |
|
"grad_norm": 0.249754399061203, |
|
"learning_rate": 0.00010344732356263808, |
|
"loss": 0.8395, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.00418410041841, |
|
"grad_norm": 0.0985850915312767, |
|
"learning_rate": 0.00010272076036747365, |
|
"loss": 0.9939, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.00883310088331, |
|
"grad_norm": 0.06556614488363266, |
|
"learning_rate": 0.00010199467439373956, |
|
"loss": 1.0809, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.01348210134821, |
|
"grad_norm": 0.07044567912817001, |
|
"learning_rate": 0.00010126911324308168, |
|
"loss": 1.0733, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.0181311018131103, |
|
"grad_norm": 0.08298351615667343, |
|
"learning_rate": 0.00010054412448273886, |
|
"loss": 1.0562, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.0227801022780103, |
|
"grad_norm": 0.09130789339542389, |
|
"learning_rate": 9.981975564242443e-05, |
|
"loss": 1.0767, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.0274291027429103, |
|
"grad_norm": 0.10151571035385132, |
|
"learning_rate": 9.909605421121028e-05, |
|
"loss": 1.1066, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.0320781032078103, |
|
"grad_norm": 0.11376982927322388, |
|
"learning_rate": 9.837306763441345e-05, |
|
"loss": 1.0909, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.0367271036727104, |
|
"grad_norm": 0.14317406713962555, |
|
"learning_rate": 9.765084331048567e-05, |
|
"loss": 1.0539, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.0413761041376104, |
|
"grad_norm": 0.183615580201149, |
|
"learning_rate": 9.692942858790591e-05, |
|
"loss": 0.9804, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.0460251046025104, |
|
"grad_norm": 0.2267308384180069, |
|
"learning_rate": 9.620887076207632e-05, |
|
"loss": 0.9, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.0506741050674104, |
|
"grad_norm": 0.25844672322273254, |
|
"learning_rate": 9.548921707222163e-05, |
|
"loss": 0.7342, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.0553231055323105, |
|
"grad_norm": 0.35192665457725525, |
|
"learning_rate": 9.477051469829196e-05, |
|
"loss": 0.7048, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.0599721059972107, |
|
"grad_norm": 0.28442806005477905, |
|
"learning_rate": 9.405281075786995e-05, |
|
"loss": 0.7852, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.0646211064621107, |
|
"grad_norm": 0.06501670181751251, |
|
"learning_rate": 9.333615230308173e-05, |
|
"loss": 1.0592, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.0692701069270107, |
|
"grad_norm": 0.06888816505670547, |
|
"learning_rate": 9.26205863175121e-05, |
|
"loss": 1.0582, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.0739191073919108, |
|
"grad_norm": 0.07344524562358856, |
|
"learning_rate": 9.190615971312446e-05, |
|
"loss": 1.0434, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.0785681078568108, |
|
"grad_norm": 0.08665701746940613, |
|
"learning_rate": 9.119291932718525e-05, |
|
"loss": 1.0843, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.0832171083217108, |
|
"grad_norm": 0.09859387576580048, |
|
"learning_rate": 9.048091191919332e-05, |
|
"loss": 1.0871, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.0878661087866108, |
|
"grad_norm": 0.11464710533618927, |
|
"learning_rate": 8.97701841678144e-05, |
|
"loss": 1.0692, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.0925151092515109, |
|
"grad_norm": 0.14108416438102722, |
|
"learning_rate": 8.906078266782087e-05, |
|
"loss": 1.0655, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.0971641097164109, |
|
"grad_norm": 0.1800997108221054, |
|
"learning_rate": 8.835275392703721e-05, |
|
"loss": 1.0205, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.1018131101813111, |
|
"grad_norm": 0.2220492660999298, |
|
"learning_rate": 8.764614436329066e-05, |
|
"loss": 0.9219, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.1064621106462111, |
|
"grad_norm": 0.2564367353916168, |
|
"learning_rate": 8.694100030136849e-05, |
|
"loss": 0.7692, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"grad_norm": 0.2938039004802704, |
|
"learning_rate": 8.623736796998063e-05, |
|
"loss": 0.7331, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.1157601115760112, |
|
"grad_norm": 0.37189003825187683, |
|
"learning_rate": 8.553529349872916e-05, |
|
"loss": 0.5737, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.1204091120409112, |
|
"grad_norm": 0.21121616661548615, |
|
"learning_rate": 8.4834822915084e-05, |
|
"loss": 1.0279, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.1250581125058112, |
|
"grad_norm": 0.08440238237380981, |
|
"learning_rate": 8.413600214136548e-05, |
|
"loss": 1.039, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.1297071129707112, |
|
"grad_norm": 0.08833085745573044, |
|
"learning_rate": 8.343887699173356e-05, |
|
"loss": 1.065, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.1343561134356113, |
|
"grad_norm": 0.09478267282247543, |
|
"learning_rate": 8.274349316918446e-05, |
|
"loss": 1.0549, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.1390051139005113, |
|
"grad_norm": 0.1044965535402298, |
|
"learning_rate": 8.204989626255422e-05, |
|
"loss": 1.1372, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.1436541143654115, |
|
"grad_norm": 0.13615119457244873, |
|
"learning_rate": 8.135813174353008e-05, |
|
"loss": 1.1241, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.1483031148303116, |
|
"grad_norm": 0.1386057436466217, |
|
"learning_rate": 8.066824496366937e-05, |
|
"loss": 1.1088, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.1529521152952116, |
|
"grad_norm": 0.16772480309009552, |
|
"learning_rate": 7.998028115142617e-05, |
|
"loss": 1.0241, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.1576011157601116, |
|
"grad_norm": 0.2133951038122177, |
|
"learning_rate": 7.929428540918635e-05, |
|
"loss": 0.9239, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.1622501162250116, |
|
"grad_norm": 0.2830871343612671, |
|
"learning_rate": 7.86103027103106e-05, |
|
"loss": 0.8394, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.1668991166899116, |
|
"grad_norm": 0.29441362619400024, |
|
"learning_rate": 7.792837789618595e-05, |
|
"loss": 0.7265, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.1715481171548117, |
|
"grad_norm": 0.33453473448753357, |
|
"learning_rate": 7.724855567328613e-05, |
|
"loss": 0.6728, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.1761971176197117, |
|
"grad_norm": 0.3466179370880127, |
|
"learning_rate": 7.657088061024054e-05, |
|
"loss": 0.7098, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.1808461180846117, |
|
"grad_norm": 0.0637335404753685, |
|
"learning_rate": 7.58953971349124e-05, |
|
"loss": 1.0493, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.185495118549512, |
|
"grad_norm": 0.07033415138721466, |
|
"learning_rate": 7.522214953148618e-05, |
|
"loss": 1.04, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.190144119014412, |
|
"grad_norm": 0.07706312835216522, |
|
"learning_rate": 7.455118193756419e-05, |
|
"loss": 1.0632, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.194793119479312, |
|
"grad_norm": 0.09520713984966278, |
|
"learning_rate": 7.388253834127306e-05, |
|
"loss": 1.1191, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.199442119944212, |
|
"grad_norm": 0.10701565444469452, |
|
"learning_rate": 7.321626257837996e-05, |
|
"loss": 1.0896, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.204091120409112, |
|
"grad_norm": 0.11862656474113464, |
|
"learning_rate": 7.255239832941866e-05, |
|
"loss": 1.0893, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.208740120874012, |
|
"grad_norm": 0.1468852460384369, |
|
"learning_rate": 7.189098911682592e-05, |
|
"loss": 1.0755, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.213389121338912, |
|
"grad_norm": 0.18458393216133118, |
|
"learning_rate": 7.123207830208806e-05, |
|
"loss": 0.951, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.218038121803812, |
|
"grad_norm": 0.22184807062149048, |
|
"learning_rate": 7.05757090828986e-05, |
|
"loss": 0.8779, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.2226871222687121, |
|
"grad_norm": 0.3002167344093323, |
|
"learning_rate": 6.992192449032571e-05, |
|
"loss": 0.8529, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.2273361227336124, |
|
"grad_norm": 0.3212537169456482, |
|
"learning_rate": 6.927076738599152e-05, |
|
"loss": 0.6355, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.2319851231985124, |
|
"grad_norm": 0.3631160259246826, |
|
"learning_rate": 6.862228045926202e-05, |
|
"loss": 0.4947, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.2366341236634124, |
|
"grad_norm": 0.19289681315422058, |
|
"learning_rate": 6.797650622444836e-05, |
|
"loss": 0.9966, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.2412831241283124, |
|
"grad_norm": 0.07654353976249695, |
|
"learning_rate": 6.733348701801969e-05, |
|
"loss": 1.0453, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.2459321245932125, |
|
"grad_norm": 0.08304441720247269, |
|
"learning_rate": 6.669326499582755e-05, |
|
"loss": 1.0742, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.2505811250581125, |
|
"grad_norm": 0.0931682139635086, |
|
"learning_rate": 6.605588213034227e-05, |
|
"loss": 1.0973, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.2552301255230125, |
|
"grad_norm": 0.10273009538650513, |
|
"learning_rate": 6.542138020790116e-05, |
|
"loss": 1.0871, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.2598791259879125, |
|
"grad_norm": 0.1231117695569992, |
|
"learning_rate": 6.478980082596905e-05, |
|
"loss": 1.1266, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.2645281264528125, |
|
"grad_norm": 0.13809573650360107, |
|
"learning_rate": 6.416118539041135e-05, |
|
"loss": 1.0663, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.2691771269177128, |
|
"grad_norm": 0.17248062789440155, |
|
"learning_rate": 6.353557511277928e-05, |
|
"loss": 1.0427, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.2738261273826128, |
|
"grad_norm": 0.22152015566825867, |
|
"learning_rate": 6.291301100760829e-05, |
|
"loss": 0.9226, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.2784751278475128, |
|
"grad_norm": 0.2788388729095459, |
|
"learning_rate": 6.229353388972901e-05, |
|
"loss": 0.8773, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.2831241283124128, |
|
"grad_norm": 0.30847859382629395, |
|
"learning_rate": 6.167718437159147e-05, |
|
"loss": 0.7012, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.2877731287773129, |
|
"grad_norm": 0.3570963442325592, |
|
"learning_rate": 6.106400286060274e-05, |
|
"loss": 0.6076, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.292422129242213, |
|
"grad_norm": 0.3086980879306793, |
|
"learning_rate": 6.045402955647769e-05, |
|
"loss": 0.7205, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.297071129707113, |
|
"grad_norm": 0.0752180814743042, |
|
"learning_rate": 5.98473044486036e-05, |
|
"loss": 1.0282, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.301720130172013, |
|
"grad_norm": 0.07571510970592499, |
|
"learning_rate": 5.924386731341842e-05, |
|
"loss": 1.0712, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.306369130636913, |
|
"grad_norm": 0.08388427644968033, |
|
"learning_rate": 5.864375771180317e-05, |
|
"loss": 1.0425, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.3110181311018132, |
|
"grad_norm": 0.0906265452504158, |
|
"learning_rate": 5.804701498648828e-05, |
|
"loss": 1.0463, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.3156671315667132, |
|
"grad_norm": 0.10397284477949142, |
|
"learning_rate": 5.7453678259474234e-05, |
|
"loss": 1.0913, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.3203161320316132, |
|
"grad_norm": 0.12175474315881729, |
|
"learning_rate": 5.686378642946699e-05, |
|
"loss": 1.0896, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.3249651324965133, |
|
"grad_norm": 0.14317680895328522, |
|
"learning_rate": 5.627737816932754e-05, |
|
"loss": 1.0503, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.3296141329614133, |
|
"grad_norm": 0.19962027668952942, |
|
"learning_rate": 5.569449192353678e-05, |
|
"loss": 1.0168, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.3342631334263133, |
|
"grad_norm": 0.253571093082428, |
|
"learning_rate": 5.511516590567499e-05, |
|
"loss": 0.8714, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.3389121338912133, |
|
"grad_norm": 0.2986457049846649, |
|
"learning_rate": 5.453943809591654e-05, |
|
"loss": 0.7573, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.3435611343561136, |
|
"grad_norm": 0.37284067273139954, |
|
"learning_rate": 5.396734623854012e-05, |
|
"loss": 0.6708, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.3482101348210134, |
|
"grad_norm": 0.4108807444572449, |
|
"learning_rate": 5.3398927839453996e-05, |
|
"loss": 0.549, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.3528591352859136, |
|
"grad_norm": 0.21761813759803772, |
|
"learning_rate": 5.283422016373745e-05, |
|
"loss": 0.9454, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.3575081357508136, |
|
"grad_norm": 0.07017389684915543, |
|
"learning_rate": 5.227326023319743e-05, |
|
"loss": 1.06, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.3621571362157137, |
|
"grad_norm": 0.07673922181129456, |
|
"learning_rate": 5.17160848239416e-05, |
|
"loss": 1.061, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.3668061366806137, |
|
"grad_norm": 0.08843539655208588, |
|
"learning_rate": 5.1162730463967304e-05, |
|
"loss": 1.0668, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.3714551371455137, |
|
"grad_norm": 0.10018595308065414, |
|
"learning_rate": 5.061323343076672e-05, |
|
"loss": 1.0926, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.3761041376104137, |
|
"grad_norm": 0.11475757509469986, |
|
"learning_rate": 5.006762974894872e-05, |
|
"loss": 1.0878, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.3807531380753137, |
|
"grad_norm": 0.14695028960704803, |
|
"learning_rate": 4.9525955187876885e-05, |
|
"loss": 1.088, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.385402138540214, |
|
"grad_norm": 0.18600180745124817, |
|
"learning_rate": 4.898824525932471e-05, |
|
"loss": 1.0019, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.3900511390051138, |
|
"grad_norm": 0.21412943303585052, |
|
"learning_rate": 4.845453521514738e-05, |
|
"loss": 0.9161, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.394700139470014, |
|
"grad_norm": 0.2769903838634491, |
|
"learning_rate": 4.7924860044970615e-05, |
|
"loss": 0.8767, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.399349139934914, |
|
"grad_norm": 0.37584012746810913, |
|
"learning_rate": 4.739925447389698e-05, |
|
"loss": 0.8159, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.403998140399814, |
|
"grad_norm": 0.35033103823661804, |
|
"learning_rate": 4.687775296022908e-05, |
|
"loss": 0.5912, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.408647140864714, |
|
"grad_norm": 0.3620702624320984, |
|
"learning_rate": 4.6360389693210735e-05, |
|
"loss": 0.7819, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.4132961413296141, |
|
"grad_norm": 0.06753652542829514, |
|
"learning_rate": 4.5847198590785394e-05, |
|
"loss": 1.023, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.4179451417945141, |
|
"grad_norm": 0.07456682622432709, |
|
"learning_rate": 4.5338213297372534e-05, |
|
"loss": 1.0321, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.4225941422594142, |
|
"grad_norm": 0.08220840245485306, |
|
"learning_rate": 4.4833467181662086e-05, |
|
"loss": 1.0518, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.4272431427243144, |
|
"grad_norm": 0.09785965085029602, |
|
"learning_rate": 4.4332993334426576e-05, |
|
"loss": 1.0736, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.4318921431892142, |
|
"grad_norm": 0.10714032500982285, |
|
"learning_rate": 4.383682456635199e-05, |
|
"loss": 1.0766, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.4365411436541144, |
|
"grad_norm": 0.12374605983495712, |
|
"learning_rate": 4.3344993405886425e-05, |
|
"loss": 1.1013, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.4411901441190145, |
|
"grad_norm": 0.15557971596717834, |
|
"learning_rate": 4.285753209710786e-05, |
|
"loss": 1.0673, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.4458391445839145, |
|
"grad_norm": 0.1834760308265686, |
|
"learning_rate": 4.2374472597610044e-05, |
|
"loss": 0.9611, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 1.4504881450488145, |
|
"grad_norm": 0.24258282780647278, |
|
"learning_rate": 4.1895846576407424e-05, |
|
"loss": 0.9162, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.4551371455137145, |
|
"grad_norm": 0.31023555994033813, |
|
"learning_rate": 4.1421685411859046e-05, |
|
"loss": 0.8321, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 1.4597861459786146, |
|
"grad_norm": 0.3393552601337433, |
|
"learning_rate": 4.095202018961125e-05, |
|
"loss": 0.6979, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 1.4644351464435146, |
|
"grad_norm": 0.3873143792152405, |
|
"learning_rate": 4.048688170055989e-05, |
|
"loss": 0.5101, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.4690841469084148, |
|
"grad_norm": 0.20240376889705658, |
|
"learning_rate": 4.002630043883159e-05, |
|
"loss": 1.0254, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.4737331473733146, |
|
"grad_norm": 0.08021257072687149, |
|
"learning_rate": 3.9570306599784544e-05, |
|
"loss": 1.033, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.4783821478382149, |
|
"grad_norm": 0.079825259745121, |
|
"learning_rate": 3.911893007802913e-05, |
|
"loss": 1.0105, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.4830311483031149, |
|
"grad_norm": 0.09591138362884521, |
|
"learning_rate": 3.8672200465467765e-05, |
|
"loss": 1.0867, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 1.487680148768015, |
|
"grad_norm": 0.10192592442035675, |
|
"learning_rate": 3.8230147049355147e-05, |
|
"loss": 1.1238, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.492329149232915, |
|
"grad_norm": 0.11998689919710159, |
|
"learning_rate": 3.779279881037797e-05, |
|
"loss": 1.099, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.496978149697815, |
|
"grad_norm": 0.14268672466278076, |
|
"learning_rate": 3.7360184420755165e-05, |
|
"loss": 1.0832, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 1.501627150162715, |
|
"grad_norm": 0.1779240220785141, |
|
"learning_rate": 3.693233224235806e-05, |
|
"loss": 1.0322, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.506276150627615, |
|
"grad_norm": 0.23235994577407837, |
|
"learning_rate": 3.650927032485101e-05, |
|
"loss": 0.987, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.5109251510925152, |
|
"grad_norm": 0.2786919176578522, |
|
"learning_rate": 3.609102640385254e-05, |
|
"loss": 0.7974, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.515574151557415, |
|
"grad_norm": 0.3645856976509094, |
|
"learning_rate": 3.567762789911693e-05, |
|
"loss": 0.7208, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 1.5202231520223153, |
|
"grad_norm": 0.4197288751602173, |
|
"learning_rate": 3.526910191273665e-05, |
|
"loss": 0.5941, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 1.524872152487215, |
|
"grad_norm": 0.43531715869903564, |
|
"learning_rate": 3.486547522736562e-05, |
|
"loss": 0.8169, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 1.5295211529521153, |
|
"grad_norm": 0.07292664051055908, |
|
"learning_rate": 3.44667743044632e-05, |
|
"loss": 1.026, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 1.5341701534170153, |
|
"grad_norm": 0.07665824145078659, |
|
"learning_rate": 3.407302528255961e-05, |
|
"loss": 1.0711, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.5388191538819154, |
|
"grad_norm": 0.09035459905862808, |
|
"learning_rate": 3.36842539755421e-05, |
|
"loss": 1.058, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 1.5434681543468154, |
|
"grad_norm": 0.10302021354436874, |
|
"learning_rate": 3.3300485870962776e-05, |
|
"loss": 1.0717, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 1.5481171548117154, |
|
"grad_norm": 0.11944916844367981, |
|
"learning_rate": 3.292174612836757e-05, |
|
"loss": 1.1738, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.5527661552766157, |
|
"grad_norm": 0.1299082338809967, |
|
"learning_rate": 3.254805957764673e-05, |
|
"loss": 1.1078, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 1.5574151557415155, |
|
"grad_norm": 0.15316098928451538, |
|
"learning_rate": 3.217945071740724e-05, |
|
"loss": 1.0111, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.5620641562064157, |
|
"grad_norm": 0.19828684628009796, |
|
"learning_rate": 3.1815943713366404e-05, |
|
"loss": 1.0292, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.5667131566713157, |
|
"grad_norm": 0.2431751936674118, |
|
"learning_rate": 3.145756239676779e-05, |
|
"loss": 0.867, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 1.5713621571362157, |
|
"grad_norm": 0.3115411698818207, |
|
"learning_rate": 3.110433026281872e-05, |
|
"loss": 0.7307, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 1.5760111576011158, |
|
"grad_norm": 0.37397709488868713, |
|
"learning_rate": 3.075627046915003e-05, |
|
"loss": 0.6894, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 1.5806601580660158, |
|
"grad_norm": 0.4691689610481262, |
|
"learning_rate": 3.041340583429789e-05, |
|
"loss": 0.6568, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.5853091585309158, |
|
"grad_norm": 0.27498671412467957, |
|
"learning_rate": 3.0075758836207716e-05, |
|
"loss": 1.0342, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 1.5899581589958158, |
|
"grad_norm": 0.07807187736034393, |
|
"learning_rate": 2.9743351610760716e-05, |
|
"loss": 1.0501, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 1.594607159460716, |
|
"grad_norm": 0.08518065512180328, |
|
"learning_rate": 2.941620595032246e-05, |
|
"loss": 1.0657, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 1.5992561599256159, |
|
"grad_norm": 0.09096917510032654, |
|
"learning_rate": 2.9094343302314432e-05, |
|
"loss": 1.0534, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 1.6039051603905161, |
|
"grad_norm": 0.10106653720140457, |
|
"learning_rate": 2.8777784767807727e-05, |
|
"loss": 1.0949, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.6085541608554161, |
|
"grad_norm": 0.11770515143871307, |
|
"learning_rate": 2.846655110013978e-05, |
|
"loss": 1.1013, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 1.6132031613203162, |
|
"grad_norm": 0.14044031500816345, |
|
"learning_rate": 2.816066270355391e-05, |
|
"loss": 1.0657, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 1.6178521617852162, |
|
"grad_norm": 0.1705469787120819, |
|
"learning_rate": 2.78601396318614e-05, |
|
"loss": 1.0108, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.6225011622501162, |
|
"grad_norm": 0.23262286186218262, |
|
"learning_rate": 2.7565001587126922e-05, |
|
"loss": 0.9104, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 1.6271501627150162, |
|
"grad_norm": 0.3084715008735657, |
|
"learning_rate": 2.7275267918376912e-05, |
|
"loss": 0.8493, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.6317991631799162, |
|
"grad_norm": 0.34869176149368286, |
|
"learning_rate": 2.6990957620330954e-05, |
|
"loss": 0.7103, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.6364481636448165, |
|
"grad_norm": 0.40754231810569763, |
|
"learning_rate": 2.6712089332156633e-05, |
|
"loss": 0.606, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.6410971641097163, |
|
"grad_norm": 0.38754644989967346, |
|
"learning_rate": 2.6438681336247417e-05, |
|
"loss": 0.7465, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 1.6457461645746165, |
|
"grad_norm": 0.07341364026069641, |
|
"learning_rate": 2.6170751557024197e-05, |
|
"loss": 1.0565, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.6503951650395166, |
|
"grad_norm": 0.08013252913951874, |
|
"learning_rate": 2.5908317559760138e-05, |
|
"loss": 1.0146, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.6550441655044166, |
|
"grad_norm": 0.09281094372272491, |
|
"learning_rate": 2.5651396549429086e-05, |
|
"loss": 1.0903, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.6596931659693166, |
|
"grad_norm": 0.09770162403583527, |
|
"learning_rate": 2.540000536957765e-05, |
|
"loss": 1.1007, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.6643421664342166, |
|
"grad_norm": 0.11755497008562088, |
|
"learning_rate": 2.515416050122092e-05, |
|
"loss": 1.1027, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 1.6689911668991166, |
|
"grad_norm": 0.1314418464899063, |
|
"learning_rate": 2.4913878061762094e-05, |
|
"loss": 1.0532, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 1.6736401673640167, |
|
"grad_norm": 0.1723410040140152, |
|
"learning_rate": 2.4679173803935662e-05, |
|
"loss": 1.0447, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.678289167828917, |
|
"grad_norm": 0.22374863922595978, |
|
"learning_rate": 2.4450063114774784e-05, |
|
"loss": 1.0271, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 1.6829381682938167, |
|
"grad_norm": 0.2649782598018646, |
|
"learning_rate": 2.4226561014602522e-05, |
|
"loss": 0.8288, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.687587168758717, |
|
"grad_norm": 0.3294927477836609, |
|
"learning_rate": 2.400868215604706e-05, |
|
"loss": 0.7618, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.692236169223617, |
|
"grad_norm": 0.390989750623703, |
|
"learning_rate": 2.3796440823081167e-05, |
|
"loss": 0.6498, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.696885169688517, |
|
"grad_norm": 0.47515323758125305, |
|
"learning_rate": 2.358985093008566e-05, |
|
"loss": 0.6206, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.701534170153417, |
|
"grad_norm": 0.18342038989067078, |
|
"learning_rate": 2.3388926020937286e-05, |
|
"loss": 0.9768, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.706183170618317, |
|
"grad_norm": 0.08224090933799744, |
|
"learning_rate": 2.3193679268120718e-05, |
|
"loss": 1.0393, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.710832171083217, |
|
"grad_norm": 0.08616173267364502, |
|
"learning_rate": 2.3004123471865e-05, |
|
"loss": 1.0021, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.715481171548117, |
|
"grad_norm": 0.08873171359300613, |
|
"learning_rate": 2.2820271059304412e-05, |
|
"loss": 1.066, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.7201301720130173, |
|
"grad_norm": 0.10707499831914902, |
|
"learning_rate": 2.2642134083663678e-05, |
|
"loss": 1.1172, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.7247791724779171, |
|
"grad_norm": 0.1225440576672554, |
|
"learning_rate": 2.2469724223467866e-05, |
|
"loss": 1.0865, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.7294281729428174, |
|
"grad_norm": 0.1396765112876892, |
|
"learning_rate": 2.2303052781776664e-05, |
|
"loss": 1.0273, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.7340771734077174, |
|
"grad_norm": 0.17938072979450226, |
|
"learning_rate": 2.2142130685443382e-05, |
|
"loss": 1.1046, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.7387261738726174, |
|
"grad_norm": 0.2324485331773758, |
|
"learning_rate": 2.198696848439865e-05, |
|
"loss": 0.9628, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.7433751743375174, |
|
"grad_norm": 0.2933482527732849, |
|
"learning_rate": 2.1837576350958686e-05, |
|
"loss": 0.8228, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.7480241748024175, |
|
"grad_norm": 0.3423949182033539, |
|
"learning_rate": 2.169396407915849e-05, |
|
"loss": 0.7043, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.7526731752673175, |
|
"grad_norm": 0.38756224513053894, |
|
"learning_rate": 2.155614108410968e-05, |
|
"loss": 0.6579, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.7573221757322175, |
|
"grad_norm": 0.4097736179828644, |
|
"learning_rate": 2.142411640138332e-05, |
|
"loss": 0.7569, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.7619711761971177, |
|
"grad_norm": 0.07299873232841492, |
|
"learning_rate": 2.129789868641749e-05, |
|
"loss": 1.0571, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.7666201766620175, |
|
"grad_norm": 0.07895659655332565, |
|
"learning_rate": 2.1177496213949837e-05, |
|
"loss": 1.0487, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.7712691771269178, |
|
"grad_norm": 0.09031179547309875, |
|
"learning_rate": 2.1062916877475198e-05, |
|
"loss": 1.0911, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.7759181775918178, |
|
"grad_norm": 0.09883815795183182, |
|
"learning_rate": 2.0954168188727962e-05, |
|
"loss": 1.0619, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.7805671780567178, |
|
"grad_norm": 0.11488756537437439, |
|
"learning_rate": 2.0851257277189703e-05, |
|
"loss": 1.0788, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.7852161785216178, |
|
"grad_norm": 0.14409998059272766, |
|
"learning_rate": 2.0754190889621745e-05, |
|
"loss": 1.0902, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.7898651789865179, |
|
"grad_norm": 0.1650954782962799, |
|
"learning_rate": 2.0662975389622843e-05, |
|
"loss": 0.999, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.794514179451418, |
|
"grad_norm": 0.22679570317268372, |
|
"learning_rate": 2.0577616757212016e-05, |
|
"loss": 0.9647, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.799163179916318, |
|
"grad_norm": 0.284229040145874, |
|
"learning_rate": 2.0498120588436466e-05, |
|
"loss": 0.85, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.8038121803812182, |
|
"grad_norm": 0.32825493812561035, |
|
"learning_rate": 2.0424492095004746e-05, |
|
"loss": 0.7025, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.808461180846118, |
|
"grad_norm": 0.37925466895103455, |
|
"learning_rate": 2.0356736103945047e-05, |
|
"loss": 0.6595, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.8131101813110182, |
|
"grad_norm": 0.45006221532821655, |
|
"learning_rate": 2.029485705728876e-05, |
|
"loss": 0.4914, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.8177591817759182, |
|
"grad_norm": 0.2308957725763321, |
|
"learning_rate": 2.023885901177926e-05, |
|
"loss": 1.0114, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.8224081822408182, |
|
"grad_norm": 0.07487577944993973, |
|
"learning_rate": 2.0188745638605954e-05, |
|
"loss": 1.0175, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.8270571827057183, |
|
"grad_norm": 0.07909037917852402, |
|
"learning_rate": 2.014452022316358e-05, |
|
"loss": 1.0365, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.8317061831706183, |
|
"grad_norm": 0.09483642131090164, |
|
"learning_rate": 2.010618566483684e-05, |
|
"loss": 1.0723, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.8363551836355185, |
|
"grad_norm": 0.1127537190914154, |
|
"learning_rate": 2.00737444768103e-05, |
|
"loss": 1.0905, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.8410041841004183, |
|
"grad_norm": 0.12364522367715836, |
|
"learning_rate": 2.0047198785903658e-05, |
|
"loss": 1.1045, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.8456531845653186, |
|
"grad_norm": 0.14588101208209991, |
|
"learning_rate": 2.002655033243228e-05, |
|
"loss": 1.0753, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.8503021850302184, |
|
"grad_norm": 0.196581169962883, |
|
"learning_rate": 2.0011800470093105e-05, |
|
"loss": 1.0165, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.8549511854951186, |
|
"grad_norm": 0.24468840658664703, |
|
"learning_rate": 2.0002950165875934e-05, |
|
"loss": 0.9067, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.8596001859600186, |
|
"grad_norm": 0.32478219270706177, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8752, |
|
"step": 400 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.3538115420584673e+18, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|