diff --git "a/checkpoint-200/trainer_state.json" "b/checkpoint-200/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-200/trainer_state.json" @@ -0,0 +1,3233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.009958671513220137, + "eval_steps": 500, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 148.59375, + "epoch": 4.979335756610068e-05, + "grad_norm": 3.3132607025954948, + "kl": 0.0, + "learning_rate": 9.999751033212169e-07, + "loss": 0.0, + "reward": 0.7696280479431152, + "reward_std": 0.8762097358703613, + "rewards/format_reward": 0.3984375, + "rewards/iou_reward": 0.30087804794311523, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.06640625, + "rewards/think_target_format_reward": 0.00390625, + "step": 1 + }, + { + "completion_length": 137.65625, + "epoch": 9.958671513220136e-05, + "grad_norm": 5.237894667222982, + "kl": 0.000621795654296875, + "learning_rate": 9.99950206642434e-07, + "loss": 0.0, + "reward": 0.9414893984794617, + "reward_std": 0.8353227376937866, + "rewards/format_reward": 0.5234375, + "rewards/iou_reward": 0.3985206186771393, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.01953125, + "rewards/think_target_format_reward": 0.0, + "step": 2 + }, + { + "completion_length": 142.5546875, + "epoch": 0.00014938007269830206, + "grad_norm": 3.269844307577445, + "kl": 0.0008487701416015625, + "learning_rate": 9.999253099636507e-07, + "loss": 0.0, + "reward": 0.9207633435726166, + "reward_std": 0.8793700337409973, + "rewards/format_reward": 0.5, + "rewards/iou_reward": 0.3934195786714554, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.0234375, + "rewards/think_target_format_reward": 0.00390625, + "step": 3 + }, + { + "completion_length": 132.65625, + "epoch": 0.00019917343026440272, + "grad_norm": 2.6095199048954667, + "kl": 0.001575469970703125, + "learning_rate": 9.999004132848678e-07, + "loss": 0.0001, + "reward": 0.9318321347236633, + "reward_std": 0.8345852196216583, + "rewards/format_reward": 0.46875, + "rewards/iou_reward": 0.42792588472366333, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.03125, + "rewards/think_target_format_reward": 0.00390625, + "step": 4 + }, + { + "completion_length": 133.34375, + "epoch": 0.0002489667878305034, + "grad_norm": 3.800136869205371, + "kl": 0.00278472900390625, + "learning_rate": 9.998755166060848e-07, + "loss": 0.0001, + "reward": 1.2287306785583496, + "reward_std": 0.8828634321689606, + "rewards/format_reward": 0.671875, + "rewards/iou_reward": 0.5216993987560272, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.03125, + "rewards/think_target_format_reward": 0.00390625, + "step": 5 + }, + { + "completion_length": 136.1015625, + "epoch": 0.0002987601453966041, + "grad_norm": 2.683781511891632, + "kl": 0.0067596435546875, + "learning_rate": 9.998506199273017e-07, + "loss": 0.0003, + "reward": 1.317540168762207, + "reward_std": 0.8890805840492249, + "rewards/format_reward": 0.65625, + "rewards/iou_reward": 0.6183213889598846, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.03125, + "rewards/think_target_format_reward": 0.01171875, + "step": 6 + }, + { + "completion_length": 139.765625, + "epoch": 0.0003485535029627048, + "grad_norm": 3.8796082905560607, + "kl": 0.0079498291015625, + "learning_rate": 9.998257232485186e-07, + "loss": 0.0003, + "reward": 1.3720521926879883, + "reward_std": 0.7464183866977692, + "rewards/format_reward": 0.734375, + "rewards/iou_reward": 0.5829896926879883, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.05078125, + "rewards/think_target_format_reward": 0.00390625, + "step": 7 + }, + { + "completion_length": 142.6015625, + "epoch": 0.00039834686052880544, + "grad_norm": 8.728586903144173, + "kl": 0.0087890625, + "learning_rate": 9.998008265697355e-07, + "loss": 0.0004, + "reward": 1.4526811838150024, + "reward_std": 0.7429399788379669, + "rewards/format_reward": 0.7421875, + "rewards/iou_reward": 0.6714312136173248, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.0390625, + "rewards/think_target_format_reward": 0.0, + "step": 8 + }, + { + "completion_length": 122.9453125, + "epoch": 0.00044814021809490615, + "grad_norm": 9.98798451818712, + "kl": 0.01397705078125, + "learning_rate": 9.997759298909524e-07, + "loss": 0.0006, + "reward": 1.563430666923523, + "reward_std": 0.6167902648448944, + "rewards/format_reward": 0.8203125, + "rewards/iou_reward": 0.7040556371212006, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.03515625, + "rewards/think_target_format_reward": 0.00390625, + "step": 9 + }, + { + "completion_length": 124.9140625, + "epoch": 0.0004979335756610068, + "grad_norm": 2.599109669228597, + "kl": 0.019073486328125, + "learning_rate": 9.997510332121696e-07, + "loss": 0.0008, + "reward": 1.669438362121582, + "reward_std": 0.5043404847383499, + "rewards/format_reward": 0.9140625, + "rewards/iou_reward": 0.6811570525169373, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.07421875, + "rewards/think_target_format_reward": 0.0, + "step": 10 + }, + { + "completion_length": 119.625, + "epoch": 0.0005477269332271075, + "grad_norm": 3.2147058956290744, + "kl": 0.025634765625, + "learning_rate": 9.997261365333865e-07, + "loss": 0.001, + "reward": 1.7129460573196411, + "reward_std": 0.5375795662403107, + "rewards/format_reward": 0.9140625, + "rewards/iou_reward": 0.7285711169242859, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.046875, + "rewards/think_target_format_reward": 0.0234375, + "step": 11 + }, + { + "completion_length": 121.0078125, + "epoch": 0.0005975202907932082, + "grad_norm": 3.3499276130110807, + "kl": 0.02178955078125, + "learning_rate": 9.997012398546034e-07, + "loss": 0.0009, + "reward": 1.738543152809143, + "reward_std": 0.4616367518901825, + "rewards/format_reward": 0.9453125, + "rewards/iou_reward": 0.7424493730068207, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.0390625, + "rewards/think_target_format_reward": 0.01171875, + "step": 12 + }, + { + "completion_length": 126.6953125, + "epoch": 0.0006473136483593089, + "grad_norm": 7.742916412659206, + "kl": 0.0260009765625, + "learning_rate": 9.996763431758203e-07, + "loss": 0.001, + "reward": 1.818768560886383, + "reward_std": 0.48613953590393066, + "rewards/format_reward": 0.9140625, + "rewards/iou_reward": 0.7914247810840607, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.09375, + "rewards/think_target_format_reward": 0.01953125, + "step": 13 + }, + { + "completion_length": 111.9453125, + "epoch": 0.0006971070059254096, + "grad_norm": 3.88880926683341, + "kl": 0.03424072265625, + "learning_rate": 9.996514464970372e-07, + "loss": 0.0014, + "reward": 1.7196047902107239, + "reward_std": 0.44675369560718536, + "rewards/format_reward": 0.9453125, + "rewards/iou_reward": 0.6922610998153687, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.0625, + "rewards/think_target_format_reward": 0.01953125, + "step": 14 + }, + { + "completion_length": 110.765625, + "epoch": 0.0007469003634915102, + "grad_norm": 2.8338699195583583, + "kl": 0.0255126953125, + "learning_rate": 9.996265498182541e-07, + "loss": 0.001, + "reward": 1.7771166563034058, + "reward_std": 0.476392924785614, + "rewards/format_reward": 0.9375, + "rewards/iou_reward": 0.7771166265010834, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.046875, + "rewards/think_target_format_reward": 0.015625, + "step": 15 + }, + { + "completion_length": 115.7109375, + "epoch": 0.0007966937210576109, + "grad_norm": 3.0262708327443963, + "kl": 0.023681640625, + "learning_rate": 9.996016531394713e-07, + "loss": 0.0009, + "reward": 1.6927234530448914, + "reward_std": 0.5437607169151306, + "rewards/format_reward": 0.921875, + "rewards/iou_reward": 0.6692859530448914, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.0703125, + "rewards/think_target_format_reward": 0.03125, + "step": 16 + }, + { + "completion_length": 111.890625, + "epoch": 0.0008464870786237116, + "grad_norm": 2.124451554979982, + "kl": 0.0255126953125, + "learning_rate": 9.99576756460688e-07, + "loss": 0.001, + "reward": 1.8239760994911194, + "reward_std": 0.4948176443576813, + "rewards/format_reward": 0.9609375, + "rewards/iou_reward": 0.7575698494911194, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.078125, + "rewards/think_target_format_reward": 0.02734375, + "step": 17 + }, + { + "completion_length": 114.6171875, + "epoch": 0.0008962804361898123, + "grad_norm": 2.485451005324509, + "kl": 0.02728271484375, + "learning_rate": 9.995518597819051e-07, + "loss": 0.0011, + "reward": 1.8716576099395752, + "reward_std": 0.36720022559165955, + "rewards/format_reward": 0.9765625, + "rewards/iou_reward": 0.8091575801372528, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.05078125, + "rewards/think_target_format_reward": 0.03515625, + "step": 18 + }, + { + "completion_length": 115.75, + "epoch": 0.000946073793755913, + "grad_norm": 2.437280406223035, + "kl": 0.03216552734375, + "learning_rate": 9.99526963103122e-07, + "loss": 0.0013, + "reward": 1.9123786687850952, + "reward_std": 0.36016273498535156, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.8420661687850952, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.06640625, + "rewards/think_target_format_reward": 0.01953125, + "step": 19 + }, + { + "completion_length": 111.8984375, + "epoch": 0.0009958671513220136, + "grad_norm": 2.8994855593559983, + "kl": 0.0460205078125, + "learning_rate": 9.99502066424339e-07, + "loss": 0.0018, + "reward": 1.7710133790969849, + "reward_std": 0.4925228953361511, + "rewards/format_reward": 0.953125, + "rewards/iou_reward": 0.6850758194923401, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.0859375, + "rewards/think_target_format_reward": 0.046875, + "step": 20 + }, + { + "completion_length": 115.7265625, + "epoch": 0.0010456605088881143, + "grad_norm": 2.2957263509130827, + "kl": 0.0426025390625, + "learning_rate": 9.994771697455559e-07, + "loss": 0.0017, + "reward": 1.9904733300209045, + "reward_std": 0.39012354612350464, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8264108002185822, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.10546875, + "rewards/think_target_format_reward": 0.05859375, + "step": 21 + }, + { + "completion_length": 108.921875, + "epoch": 0.001095453866454215, + "grad_norm": 2.4431028306907416, + "kl": 0.060302734375, + "learning_rate": 9.994522730667728e-07, + "loss": 0.0024, + "reward": 1.9434444904327393, + "reward_std": 0.4659569561481476, + "rewards/format_reward": 0.953125, + "rewards/iou_reward": 0.7911007404327393, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.1171875, + "rewards/think_target_format_reward": 0.08203125, + "step": 22 + }, + { + "completion_length": 111.5703125, + "epoch": 0.0011452472240203156, + "grad_norm": 3.198312298946404, + "kl": 0.0518798828125, + "learning_rate": 9.994273763879897e-07, + "loss": 0.0021, + "reward": 1.9792124032974243, + "reward_std": 0.4927746653556824, + "rewards/format_reward": 0.9453125, + "rewards/iou_reward": 0.7878062725067139, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.14453125, + "rewards/think_target_format_reward": 0.1015625, + "step": 23 + }, + { + "completion_length": 123.5625, + "epoch": 0.0011950405815864165, + "grad_norm": 9.974432493223043, + "kl": 0.06201171875, + "learning_rate": 9.994024797092068e-07, + "loss": 0.0025, + "reward": 2.0949708223342896, + "reward_std": 0.5853737592697144, + "rewards/format_reward": 0.96875, + "rewards/iou_reward": 0.8059084117412567, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.19921875, + "rewards/think_target_format_reward": 0.12109375, + "step": 24 + }, + { + "completion_length": 123.125, + "epoch": 0.0012448339391525171, + "grad_norm": 2.5235759457207503, + "kl": 0.071044921875, + "learning_rate": 9.993775830304237e-07, + "loss": 0.0028, + "reward": 2.321195960044861, + "reward_std": 0.6965383887290955, + "rewards/format_reward": 0.96875, + "rewards/iou_reward": 0.8758834898471832, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.27734375, + "rewards/think_target_format_reward": 0.19921875, + "step": 25 + }, + { + "completion_length": 132.0625, + "epoch": 0.0012946272967186178, + "grad_norm": 2.170952524760718, + "kl": 0.0791015625, + "learning_rate": 9.993526863516407e-07, + "loss": 0.0032, + "reward": 2.5628193616867065, + "reward_std": 0.6395607888698578, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.9104756712913513, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.40625, + "rewards/think_target_format_reward": 0.25390625, + "step": 26 + }, + { + "completion_length": 134.3359375, + "epoch": 0.0013444206542847185, + "grad_norm": 2.3692294211216574, + "kl": 0.081298828125, + "learning_rate": 9.993277896728576e-07, + "loss": 0.0033, + "reward": 2.4406754970550537, + "reward_std": 0.6460122466087341, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8117692470550537, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.35546875, + "rewards/think_target_format_reward": 0.28125, + "step": 27 + }, + { + "completion_length": 135.2421875, + "epoch": 0.0013942140118508191, + "grad_norm": 8.6951865164392, + "kl": 0.1015625, + "learning_rate": 9.993028929940745e-07, + "loss": 0.0041, + "reward": 2.4812941551208496, + "reward_std": 0.6459673345088959, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.7820753157138824, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.41015625, + "rewards/think_target_format_reward": 0.3046875, + "step": 28 + }, + { + "completion_length": 139.0078125, + "epoch": 0.0014440073694169198, + "grad_norm": 2.9025335753348855, + "kl": 0.098876953125, + "learning_rate": 9.992779963152916e-07, + "loss": 0.004, + "reward": 2.9502846002578735, + "reward_std": 0.614486813545227, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9580970406532288, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.5703125, + "rewards/think_target_format_reward": 0.421875, + "step": 29 + }, + { + "completion_length": 158.953125, + "epoch": 0.0014938007269830204, + "grad_norm": 15.620897556804868, + "kl": 0.216552734375, + "learning_rate": 9.992530996365083e-07, + "loss": 0.0087, + "reward": 2.8008846044540405, + "reward_std": 0.5895302891731262, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.7735407948493958, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.5078125, + "rewards/think_target_format_reward": 0.52734375, + "step": 30 + }, + { + "completion_length": 136.2890625, + "epoch": 0.001543594084549121, + "grad_norm": 5.0158714714996355, + "kl": 0.199462890625, + "learning_rate": 9.992282029577255e-07, + "loss": 0.008, + "reward": 2.9739110469818115, + "reward_std": 0.6401561498641968, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8489110171794891, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.54296875, + "rewards/think_target_format_reward": 0.58203125, + "step": 31 + }, + { + "completion_length": 144.015625, + "epoch": 0.0015933874421152217, + "grad_norm": 2.6244281791614226, + "kl": 0.11572265625, + "learning_rate": 9.992033062789424e-07, + "loss": 0.0046, + "reward": 3.0174964666366577, + "reward_std": 0.574400782585144, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.8221839666366577, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.546875, + "rewards/think_target_format_reward": 0.6640625, + "step": 32 + }, + { + "completion_length": 146.0546875, + "epoch": 0.0016431807996813224, + "grad_norm": 1.8591560406893894, + "kl": 0.12939453125, + "learning_rate": 9.991784096001593e-07, + "loss": 0.0052, + "reward": 3.0510542392730713, + "reward_std": 0.6198671758174896, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8635542988777161, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.5234375, + "rewards/think_target_format_reward": 0.6640625, + "step": 33 + }, + { + "completion_length": 137.75, + "epoch": 0.0016929741572474233, + "grad_norm": 1.4522204123717166, + "kl": 0.104736328125, + "learning_rate": 9.991535129213762e-07, + "loss": 0.0042, + "reward": 3.230587959289551, + "reward_std": 0.5039547085762024, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.910275399684906, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.59375, + "rewards/think_target_format_reward": 0.734375, + "step": 34 + }, + { + "completion_length": 133.0390625, + "epoch": 0.001742767514813524, + "grad_norm": 1.6595676650518942, + "kl": 0.126708984375, + "learning_rate": 9.991286162425931e-07, + "loss": 0.0051, + "reward": 3.259363889694214, + "reward_std": 0.5665396153926849, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8492077589035034, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.6328125, + "rewards/think_target_format_reward": 0.77734375, + "step": 35 + }, + { + "completion_length": 138.2421875, + "epoch": 0.0017925608723796246, + "grad_norm": 2.1859408439265207, + "kl": 0.103515625, + "learning_rate": 9.9910371956381e-07, + "loss": 0.0041, + "reward": 3.230665445327759, + "reward_std": 0.5916065573692322, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8205091953277588, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.625, + "rewards/think_target_format_reward": 0.78515625, + "step": 36 + }, + { + "completion_length": 137.9375, + "epoch": 0.0018423542299457253, + "grad_norm": 1.4603007819232943, + "kl": 0.10546875, + "learning_rate": 9.990788228850272e-07, + "loss": 0.0042, + "reward": 3.1859707832336426, + "reward_std": 0.5895537436008453, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.8422207832336426, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.515625, + "rewards/think_target_format_reward": 0.84375, + "step": 37 + }, + { + "completion_length": 143.8203125, + "epoch": 0.001892147587511826, + "grad_norm": 1.4240990586917466, + "kl": 0.100830078125, + "learning_rate": 9.99053926206244e-07, + "loss": 0.004, + "reward": 3.3519304990768433, + "reward_std": 0.5754130780696869, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.9261493384838104, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.59765625, + "rewards/think_target_format_reward": 0.8359375, + "step": 38 + }, + { + "completion_length": 126.6015625, + "epoch": 0.0019419409450779266, + "grad_norm": 9.08931599188114, + "kl": 0.106201171875, + "learning_rate": 9.99029029527461e-07, + "loss": 0.0042, + "reward": 3.403307795524597, + "reward_std": 0.5115114450454712, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.9072139859199524, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.640625, + "rewards/think_target_format_reward": 0.87109375, + "step": 39 + }, + { + "completion_length": 123.921875, + "epoch": 0.0019917343026440272, + "grad_norm": 5.789908602951853, + "kl": 0.12109375, + "learning_rate": 9.99004132848678e-07, + "loss": 0.0048, + "reward": 3.3576027154922485, + "reward_std": 0.5787676572799683, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8107276558876038, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.66015625, + "rewards/think_target_format_reward": 0.88671875, + "step": 40 + }, + { + "completion_length": 117.171875, + "epoch": 0.002041527660210128, + "grad_norm": 1.9203096240152633, + "kl": 0.1513671875, + "learning_rate": 9.989792361698948e-07, + "loss": 0.0061, + "reward": 3.4251708984375, + "reward_std": 0.46290460228919983, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8861083090305328, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.6484375, + "rewards/think_target_format_reward": 0.890625, + "step": 41 + }, + { + "completion_length": 125.2890625, + "epoch": 0.0020913210177762286, + "grad_norm": 1.8533606805715384, + "kl": 0.1640625, + "learning_rate": 9.989543394911118e-07, + "loss": 0.0066, + "reward": 3.398453712463379, + "reward_std": 0.5093448460102081, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8476724028587341, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.671875, + "rewards/think_target_format_reward": 0.88671875, + "step": 42 + }, + { + "completion_length": 119.7109375, + "epoch": 0.0021411143753423292, + "grad_norm": 6.237670024884941, + "kl": 0.142578125, + "learning_rate": 9.98929442812329e-07, + "loss": 0.0057, + "reward": 3.536202549934387, + "reward_std": 0.4743429124355316, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9580776691436768, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.703125, + "rewards/think_target_format_reward": 0.875, + "step": 43 + }, + { + "completion_length": 125.2734375, + "epoch": 0.00219090773290843, + "grad_norm": 1.291619298716437, + "kl": 0.12255859375, + "learning_rate": 9.989045461335456e-07, + "loss": 0.0049, + "reward": 3.482519507408142, + "reward_std": 0.4058193862438202, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9083006978034973, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.6328125, + "rewards/think_target_format_reward": 0.94140625, + "step": 44 + }, + { + "completion_length": 124.0, + "epoch": 0.0022407010904745305, + "grad_norm": 1.5664305291523453, + "kl": 0.11669921875, + "learning_rate": 9.988796494547627e-07, + "loss": 0.0047, + "reward": 3.487349033355713, + "reward_std": 0.5048477649688721, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8857865929603577, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.66796875, + "rewards/think_target_format_reward": 0.94140625, + "step": 45 + }, + { + "completion_length": 126.671875, + "epoch": 0.002290494448040631, + "grad_norm": 1.5027287291702889, + "kl": 0.11767578125, + "learning_rate": 9.988547527759796e-07, + "loss": 0.0047, + "reward": 3.5357731580734253, + "reward_std": 0.48469752073287964, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8599919378757477, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.75, + "rewards/think_target_format_reward": 0.93359375, + "step": 46 + }, + { + "completion_length": 120.5625, + "epoch": 0.002340287805606732, + "grad_norm": 1.4032472517478942, + "kl": 0.1337890625, + "learning_rate": 9.988298560971966e-07, + "loss": 0.0054, + "reward": 3.4687615633010864, + "reward_std": 0.5174609124660492, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.800792783498764, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.7421875, + "rewards/think_target_format_reward": 0.93359375, + "step": 47 + }, + { + "completion_length": 129.0703125, + "epoch": 0.002390081163172833, + "grad_norm": 1.7454459440099364, + "kl": 0.1279296875, + "learning_rate": 9.988049594184135e-07, + "loss": 0.0051, + "reward": 3.513799548149109, + "reward_std": 0.4797051399946213, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8458308279514313, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.73828125, + "rewards/think_target_format_reward": 0.9375, + "step": 48 + }, + { + "completion_length": 130.265625, + "epoch": 0.0024398745207389336, + "grad_norm": 2.0849072318618322, + "kl": 0.12744140625, + "learning_rate": 9.987800627396304e-07, + "loss": 0.0051, + "reward": 3.4303088188171387, + "reward_std": 0.4839910864830017, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.7896837294101715, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.71484375, + "rewards/think_target_format_reward": 0.92578125, + "step": 49 + }, + { + "completion_length": 131.6796875, + "epoch": 0.0024896678783050343, + "grad_norm": 2.8133023610507224, + "kl": 0.14111328125, + "learning_rate": 9.987551660608473e-07, + "loss": 0.0056, + "reward": 3.6369577646255493, + "reward_std": 0.42703309655189514, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8986765444278717, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.76953125, + "rewards/think_target_format_reward": 0.96875, + "step": 50 + }, + { + "completion_length": 139.8515625, + "epoch": 0.002539461235871135, + "grad_norm": 4.426595193516742, + "kl": 0.138671875, + "learning_rate": 9.987302693820644e-07, + "loss": 0.0055, + "reward": 3.6227035522460938, + "reward_std": 0.43158045411109924, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.9000474214553833, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.7578125, + "rewards/think_target_format_reward": 0.97265625, + "step": 51 + }, + { + "completion_length": 143.046875, + "epoch": 0.0025892545934372356, + "grad_norm": 1.5158653155804376, + "kl": 0.1357421875, + "learning_rate": 9.987053727032814e-07, + "loss": 0.0054, + "reward": 3.5918326377868652, + "reward_std": 0.4459882974624634, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.8223012089729309, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.82421875, + "rewards/think_target_format_reward": 0.9609375, + "step": 52 + }, + { + "completion_length": 135.9296875, + "epoch": 0.0026390479510033362, + "grad_norm": 2.447354266148581, + "kl": 0.13818359375, + "learning_rate": 9.986804760244983e-07, + "loss": 0.0055, + "reward": 3.4752540588378906, + "reward_std": 0.5756521224975586, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8619728088378906, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.66796875, + "rewards/think_target_format_reward": 0.953125, + "step": 53 + }, + { + "completion_length": 138.171875, + "epoch": 0.002688841308569437, + "grad_norm": 1.725875350859902, + "kl": 0.12646484375, + "learning_rate": 9.986555793457152e-07, + "loss": 0.0051, + "reward": 3.666760802268982, + "reward_std": 0.44961297512054443, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.9206670522689819, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.80078125, + "rewards/think_target_format_reward": 0.953125, + "step": 54 + }, + { + "completion_length": 141.390625, + "epoch": 0.0027386346661355376, + "grad_norm": 2.269913400176499, + "kl": 0.1474609375, + "learning_rate": 9.986306826669321e-07, + "loss": 0.0059, + "reward": 3.6116433143615723, + "reward_std": 0.4918966293334961, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8499245643615723, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.80078125, + "rewards/think_target_format_reward": 0.96875, + "step": 55 + }, + { + "completion_length": 148.453125, + "epoch": 0.0027884280237016382, + "grad_norm": 1.8724566919299863, + "kl": 0.12890625, + "learning_rate": 9.986057859881492e-07, + "loss": 0.0052, + "reward": 3.549386143684387, + "reward_std": 0.5290462970733643, + "rewards/format_reward": 0.96875, + "rewards/iou_reward": 0.826729953289032, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.80859375, + "rewards/think_target_format_reward": 0.9453125, + "step": 56 + }, + { + "completion_length": 145.625, + "epoch": 0.002838221381267739, + "grad_norm": 1.6913923166650102, + "kl": 0.134765625, + "learning_rate": 9.985808893093662e-07, + "loss": 0.0054, + "reward": 3.518980026245117, + "reward_std": 0.46834084391593933, + "rewards/format_reward": 0.9765625, + "rewards/iou_reward": 0.7416362762451172, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.84765625, + "rewards/think_target_format_reward": 0.953125, + "step": 57 + }, + { + "completion_length": 129.7578125, + "epoch": 0.0028880147388338395, + "grad_norm": 5.767896952388487, + "kl": 0.1298828125, + "learning_rate": 9.98555992630583e-07, + "loss": 0.0052, + "reward": 3.6969090700149536, + "reward_std": 0.41619783639907837, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.903940349817276, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.83984375, + "rewards/think_target_format_reward": 0.9609375, + "step": 58 + }, + { + "completion_length": 140.9453125, + "epoch": 0.00293780809639994, + "grad_norm": 2.2932242102144045, + "kl": 0.1259765625, + "learning_rate": 9.985310959518e-07, + "loss": 0.005, + "reward": 3.6542489528656006, + "reward_std": 0.45224829018116, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8495614528656006, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.86328125, + "rewards/think_target_format_reward": 0.94921875, + "step": 59 + }, + { + "completion_length": 135.0, + "epoch": 0.002987601453966041, + "grad_norm": 2.381291333412653, + "kl": 0.12890625, + "learning_rate": 9.98506199273017e-07, + "loss": 0.0052, + "reward": 3.618720769882202, + "reward_std": 0.37515829503536224, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8101270198822021, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.86328125, + "rewards/think_target_format_reward": 0.9453125, + "step": 60 + }, + { + "completion_length": 141.6015625, + "epoch": 0.0030373948115321415, + "grad_norm": 2.412225306077241, + "kl": 0.12646484375, + "learning_rate": 9.984813025942338e-07, + "loss": 0.005, + "reward": 3.6837724447250366, + "reward_std": 0.34784846007823944, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8400223255157471, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.87890625, + "rewards/think_target_format_reward": 0.96484375, + "step": 61 + }, + { + "completion_length": 128.890625, + "epoch": 0.003087188169098242, + "grad_norm": 1.379645959786542, + "kl": 0.14306640625, + "learning_rate": 9.98456405915451e-07, + "loss": 0.0057, + "reward": 3.691563367843628, + "reward_std": 0.3355468511581421, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8400007784366608, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.90625, + "rewards/think_target_format_reward": 0.9453125, + "step": 62 + }, + { + "completion_length": 130.4140625, + "epoch": 0.003136981526664343, + "grad_norm": 1.3281961947796395, + "kl": 0.150390625, + "learning_rate": 9.984315092366677e-07, + "loss": 0.006, + "reward": 3.6392574310302734, + "reward_std": 0.3283068835735321, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.7720700204372406, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.90234375, + "rewards/think_target_format_reward": 0.97265625, + "step": 63 + }, + { + "completion_length": 139.671875, + "epoch": 0.0031867748842304435, + "grad_norm": 2.225980776446206, + "kl": 0.1494140625, + "learning_rate": 9.984066125578848e-07, + "loss": 0.006, + "reward": 3.7183239459991455, + "reward_std": 0.40951642394065857, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8823864161968231, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.8828125, + "rewards/think_target_format_reward": 0.9609375, + "step": 64 + }, + { + "completion_length": 129.1484375, + "epoch": 0.003236568241796544, + "grad_norm": 3.6999790145936426, + "kl": 0.14599609375, + "learning_rate": 9.983817158791017e-07, + "loss": 0.0058, + "reward": 3.8304333686828613, + "reward_std": 0.2774406671524048, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9280895292758942, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.91796875, + "rewards/think_target_format_reward": 0.984375, + "step": 65 + }, + { + "completion_length": 135.9453125, + "epoch": 0.003286361599362645, + "grad_norm": 1.2333692039230266, + "kl": 0.14892578125, + "learning_rate": 9.983568192003186e-07, + "loss": 0.006, + "reward": 3.6681270599365234, + "reward_std": 0.3672686517238617, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.7970332503318787, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.92578125, + "rewards/think_target_format_reward": 0.953125, + "step": 66 + }, + { + "completion_length": 131.109375, + "epoch": 0.003336154956928746, + "grad_norm": 1.1760926602992998, + "kl": 0.1416015625, + "learning_rate": 9.983319225215356e-07, + "loss": 0.0057, + "reward": 3.8153891563415527, + "reward_std": 0.16522850841283798, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8896079361438751, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9609375, + "rewards/think_target_format_reward": 0.96484375, + "step": 67 + }, + { + "completion_length": 129.5703125, + "epoch": 0.0033859483144948466, + "grad_norm": 1.4634015997053385, + "kl": 0.15625, + "learning_rate": 9.983070258427525e-07, + "loss": 0.0062, + "reward": 3.651693105697632, + "reward_std": 0.36964625120162964, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.761068195104599, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.94140625, + "rewards/think_target_format_reward": 0.95703125, + "step": 68 + }, + { + "completion_length": 133.359375, + "epoch": 0.0034357416720609472, + "grad_norm": 7.572647824857089, + "kl": 0.15478515625, + "learning_rate": 9.982821291639694e-07, + "loss": 0.0062, + "reward": 3.764968156814575, + "reward_std": 0.31832005083560944, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8665306270122528, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.94921875, + "rewards/think_target_format_reward": 0.95703125, + "step": 69 + }, + { + "completion_length": 135.6796875, + "epoch": 0.003485535029627048, + "grad_norm": 1.4025220540509198, + "kl": 0.140625, + "learning_rate": 9.982572324851865e-07, + "loss": 0.0056, + "reward": 3.668897271156311, + "reward_std": 0.33910442888736725, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.786084920167923, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.90625, + "rewards/think_target_format_reward": 0.9765625, + "step": 70 + }, + { + "completion_length": 134.609375, + "epoch": 0.0035353283871931485, + "grad_norm": 1.6969499028337114, + "kl": 0.15869140625, + "learning_rate": 9.982323358064034e-07, + "loss": 0.0063, + "reward": 3.77866530418396, + "reward_std": 0.2938871532678604, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8958528339862823, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9140625, + "rewards/think_target_format_reward": 0.96875, + "step": 71 + }, + { + "completion_length": 134.515625, + "epoch": 0.003585121744759249, + "grad_norm": 1.5049005936408864, + "kl": 0.12548828125, + "learning_rate": 9.982074391276204e-07, + "loss": 0.005, + "reward": 3.671709895133972, + "reward_std": 0.34661880135536194, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8201473951339722, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.8984375, + "rewards/think_target_format_reward": 0.953125, + "step": 72 + }, + { + "completion_length": 128.3984375, + "epoch": 0.00363491510232535, + "grad_norm": 1.6428583140368767, + "kl": 0.1279296875, + "learning_rate": 9.981825424488373e-07, + "loss": 0.0051, + "reward": 3.8222426176071167, + "reward_std": 0.29343181848526, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9589613378047943, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.91015625, + "rewards/think_target_format_reward": 0.953125, + "step": 73 + }, + { + "completion_length": 140.28125, + "epoch": 0.0036847084598914505, + "grad_norm": 2.5432636843555994, + "kl": 0.140625, + "learning_rate": 9.981576457700542e-07, + "loss": 0.0056, + "reward": 3.699527621269226, + "reward_std": 0.41473595798015594, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8948401212692261, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.87109375, + "rewards/think_target_format_reward": 0.93359375, + "step": 74 + }, + { + "completion_length": 134.5703125, + "epoch": 0.003734501817457551, + "grad_norm": 1.2816786393185755, + "kl": 0.1181640625, + "learning_rate": 9.981327490912711e-07, + "loss": 0.0047, + "reward": 3.846506953239441, + "reward_std": 0.29512619972229004, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9754130840301514, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.90625, + "rewards/think_target_format_reward": 0.96484375, + "step": 75 + }, + { + "completion_length": 137.8828125, + "epoch": 0.003784295175023652, + "grad_norm": 2.757181803423125, + "kl": 0.13525390625, + "learning_rate": 9.981078524124882e-07, + "loss": 0.0054, + "reward": 3.7217005491256714, + "reward_std": 0.46195192635059357, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8818567097187042, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.91796875, + "rewards/think_target_format_reward": 0.9296875, + "step": 76 + }, + { + "completion_length": 138.3515625, + "epoch": 0.0038340885325897525, + "grad_norm": 3.7038993506352798, + "kl": 0.1875, + "learning_rate": 9.98082955733705e-07, + "loss": 0.0075, + "reward": 3.766265630722046, + "reward_std": 0.31814898550510406, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8873593509197235, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.90625, + "rewards/think_target_format_reward": 0.97265625, + "step": 77 + }, + { + "completion_length": 138.1015625, + "epoch": 0.003883881890155853, + "grad_norm": 1.4925995902344449, + "kl": 0.142822265625, + "learning_rate": 9.98058059054922e-07, + "loss": 0.0057, + "reward": 3.7071441411972046, + "reward_std": 0.4433154761791229, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.9063630104064941, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.8828125, + "rewards/think_target_format_reward": 0.92578125, + "step": 78 + }, + { + "completion_length": 146.6953125, + "epoch": 0.003933675247721954, + "grad_norm": 1.4769031268556563, + "kl": 0.127197265625, + "learning_rate": 9.98033162376139e-07, + "loss": 0.0051, + "reward": 3.613555669784546, + "reward_std": 0.5199896395206451, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8401181995868683, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.85546875, + "rewards/think_target_format_reward": 0.92578125, + "step": 79 + }, + { + "completion_length": 135.8203125, + "epoch": 0.0039834686052880545, + "grad_norm": 1.5764457873289686, + "kl": 0.121337890625, + "learning_rate": 9.98008265697356e-07, + "loss": 0.0049, + "reward": 3.582049012184143, + "reward_std": 0.4074898660182953, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.7773613929748535, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.87109375, + "rewards/think_target_format_reward": 0.93359375, + "step": 80 + }, + { + "completion_length": 140.3125, + "epoch": 0.004033261962854156, + "grad_norm": 1.5850058390039068, + "kl": 0.113525390625, + "learning_rate": 9.979833690185728e-07, + "loss": 0.0045, + "reward": 3.525166392326355, + "reward_std": 0.4487512558698654, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.7868850529193878, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.8203125, + "rewards/think_target_format_reward": 0.91796875, + "step": 81 + }, + { + "completion_length": 134.640625, + "epoch": 0.004083055320420256, + "grad_norm": 3.136671800082412, + "kl": 0.118408203125, + "learning_rate": 9.979584723397897e-07, + "loss": 0.0047, + "reward": 3.58487069606781, + "reward_std": 0.5471959412097931, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.9090894460678101, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.8125, + "rewards/think_target_format_reward": 0.87890625, + "step": 82 + }, + { + "completion_length": 139.3671875, + "epoch": 0.004132848677986357, + "grad_norm": 1.4765065698169861, + "kl": 0.10791015625, + "learning_rate": 9.979335756610069e-07, + "loss": 0.0043, + "reward": 3.636987328529358, + "reward_std": 0.4565633535385132, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8987061977386475, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.85546875, + "rewards/think_target_format_reward": 0.8828125, + "step": 83 + }, + { + "completion_length": 136.6640625, + "epoch": 0.004182642035552457, + "grad_norm": 1.5954605987301622, + "kl": 0.138671875, + "learning_rate": 9.979086789822238e-07, + "loss": 0.0056, + "reward": 3.572832703590393, + "reward_std": 0.5160607248544693, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8736139237880707, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.7890625, + "rewards/think_target_format_reward": 0.91015625, + "step": 84 + }, + { + "completion_length": 136.0, + "epoch": 0.004232435393118558, + "grad_norm": 1.4195769721593605, + "kl": 0.11572265625, + "learning_rate": 9.978837823034407e-07, + "loss": 0.0046, + "reward": 3.527488589286804, + "reward_std": 0.41929225623607635, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.824363648891449, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.82421875, + "rewards/think_target_format_reward": 0.87890625, + "step": 85 + }, + { + "completion_length": 143.65625, + "epoch": 0.0042822287506846584, + "grad_norm": 1.6283641975077487, + "kl": 0.10693359375, + "learning_rate": 9.978588856246576e-07, + "loss": 0.0043, + "reward": 3.501909017562866, + "reward_std": 0.45429614186286926, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.7401902079582214, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.86328125, + "rewards/think_target_format_reward": 0.8984375, + "step": 86 + }, + { + "completion_length": 137.8359375, + "epoch": 0.0043320221082507595, + "grad_norm": 2.3047606913714307, + "kl": 0.11767578125, + "learning_rate": 9.978339889458745e-07, + "loss": 0.0047, + "reward": 3.652833104133606, + "reward_std": 0.48845815658569336, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.914551854133606, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.84765625, + "rewards/think_target_format_reward": 0.8984375, + "step": 87 + }, + { + "completion_length": 152.4921875, + "epoch": 0.00438181546581686, + "grad_norm": 1.7152268308154732, + "kl": 0.10546875, + "learning_rate": 9.978090922670915e-07, + "loss": 0.0042, + "reward": 3.537259340286255, + "reward_std": 0.5511800348758698, + "rewards/format_reward": 0.9765625, + "rewards/iou_reward": 0.7950718700885773, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.859375, + "rewards/think_target_format_reward": 0.90625, + "step": 88 + }, + { + "completion_length": 137.234375, + "epoch": 0.004431608823382961, + "grad_norm": 1.867379550273408, + "kl": 0.116943359375, + "learning_rate": 9.977841955883086e-07, + "loss": 0.0047, + "reward": 3.7130141258239746, + "reward_std": 0.43672071397304535, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.9044204652309418, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.90234375, + "rewards/think_target_format_reward": 0.921875, + "step": 89 + }, + { + "completion_length": 128.265625, + "epoch": 0.004481402180949061, + "grad_norm": 2.0532156841111977, + "kl": 0.1240234375, + "learning_rate": 9.977592989095253e-07, + "loss": 0.005, + "reward": 3.7684637308120728, + "reward_std": 0.3327006995677948, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9286200106143951, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.90625, + "rewards/think_target_format_reward": 0.93359375, + "step": 90 + }, + { + "completion_length": 134.4609375, + "epoch": 0.004531195538515162, + "grad_norm": 1.4463925074607025, + "kl": 0.14794921875, + "learning_rate": 9.977344022307424e-07, + "loss": 0.0059, + "reward": 3.664172410964966, + "reward_std": 0.4466496855020523, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.914172351360321, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.83203125, + "rewards/think_target_format_reward": 0.91796875, + "step": 91 + }, + { + "completion_length": 131.703125, + "epoch": 0.004580988896081262, + "grad_norm": 44.92633928818439, + "kl": 0.109619140625, + "learning_rate": 9.977095055519593e-07, + "loss": 0.0044, + "reward": 3.7392773628234863, + "reward_std": 0.3363420218229294, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8799023628234863, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.92578125, + "rewards/think_target_format_reward": 0.94140625, + "step": 92 + }, + { + "completion_length": 142.2265625, + "epoch": 0.0046307822536473635, + "grad_norm": 2.37895192708155, + "kl": 0.1357421875, + "learning_rate": 9.976846088731763e-07, + "loss": 0.0054, + "reward": 3.5543183088302612, + "reward_std": 0.42823314666748047, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.7652557790279388, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.87890625, + "rewards/think_target_format_reward": 0.91015625, + "step": 93 + }, + { + "completion_length": 135.1640625, + "epoch": 0.004680575611213464, + "grad_norm": 1.1536155221795477, + "kl": 0.11669921875, + "learning_rate": 9.976597121943932e-07, + "loss": 0.0047, + "reward": 3.757152557373047, + "reward_std": 0.3326938897371292, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.9016837179660797, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9140625, + "rewards/think_target_format_reward": 0.95703125, + "step": 94 + }, + { + "completion_length": 136.1015625, + "epoch": 0.004730368968779565, + "grad_norm": 1.8469892723664458, + "kl": 0.1201171875, + "learning_rate": 9.9763481551561e-07, + "loss": 0.0048, + "reward": 3.5813076496124268, + "reward_std": 0.4128554165363312, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.7883389294147491, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.875, + "rewards/think_target_format_reward": 0.92578125, + "step": 95 + }, + { + "completion_length": 133.40625, + "epoch": 0.004780162326345666, + "grad_norm": 2.048700217788109, + "kl": 0.1337890625, + "learning_rate": 9.97609918836827e-07, + "loss": 0.0054, + "reward": 3.6366097927093506, + "reward_std": 0.3528625965118408, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8084847033023834, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.87890625, + "rewards/think_target_format_reward": 0.94921875, + "step": 96 + }, + { + "completion_length": 130.109375, + "epoch": 0.004829955683911766, + "grad_norm": 1.6222046290895982, + "kl": 0.124267578125, + "learning_rate": 9.975850221580441e-07, + "loss": 0.005, + "reward": 3.759361982345581, + "reward_std": 0.34027788043022156, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8882682621479034, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9140625, + "rewards/think_target_format_reward": 0.95703125, + "step": 97 + }, + { + "completion_length": 132.109375, + "epoch": 0.004879749041477867, + "grad_norm": 2.167970299872686, + "kl": 0.11376953125, + "learning_rate": 9.97560125479261e-07, + "loss": 0.0045, + "reward": 3.686954617500305, + "reward_std": 0.34396885335445404, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8080483973026276, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.90234375, + "rewards/think_target_format_reward": 0.9765625, + "step": 98 + }, + { + "completion_length": 135.453125, + "epoch": 0.0049295423990439674, + "grad_norm": 1.9857587275243114, + "kl": 0.118896484375, + "learning_rate": 9.97535228800478e-07, + "loss": 0.0047, + "reward": 3.642647862434387, + "reward_std": 0.44180823862552643, + "rewards/format_reward": 0.9765625, + "rewards/iou_reward": 0.8379602134227753, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.87890625, + "rewards/think_target_format_reward": 0.94921875, + "step": 99 + }, + { + "completion_length": 140.671875, + "epoch": 0.0049793357566100685, + "grad_norm": 1.5592361899199478, + "kl": 0.124755859375, + "learning_rate": 9.97510332121695e-07, + "loss": 0.005, + "reward": 3.7475059032440186, + "reward_std": 0.32066014409065247, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9115684330463409, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.8828125, + "rewards/think_target_format_reward": 0.953125, + "step": 100 + }, + { + "completion_length": 136.9609375, + "epoch": 0.005029129114176169, + "grad_norm": 2.865506881295543, + "kl": 0.12353515625, + "learning_rate": 9.974854354429118e-07, + "loss": 0.0049, + "reward": 3.6237845420837402, + "reward_std": 0.29808564484119415, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.752690851688385, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9375, + "rewards/think_target_format_reward": 0.93359375, + "step": 101 + }, + { + "completion_length": 138.859375, + "epoch": 0.00507892247174227, + "grad_norm": 1.4874494326087124, + "kl": 0.12646484375, + "learning_rate": 9.974605387641287e-07, + "loss": 0.0051, + "reward": 3.614951491355896, + "reward_std": 0.41148778796195984, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.786826491355896, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.88671875, + "rewards/think_target_format_reward": 0.94140625, + "step": 102 + }, + { + "completion_length": 131.0234375, + "epoch": 0.00512871582930837, + "grad_norm": 1.4304786338477065, + "kl": 0.118896484375, + "learning_rate": 9.974356420853459e-07, + "loss": 0.0048, + "reward": 3.75937557220459, + "reward_std": 0.34399718046188354, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.9156256020069122, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.88671875, + "rewards/think_target_format_reward": 0.96484375, + "step": 103 + }, + { + "completion_length": 126.8359375, + "epoch": 0.005178509186874471, + "grad_norm": 2.8556590851602857, + "kl": 0.148681640625, + "learning_rate": 9.974107454065626e-07, + "loss": 0.0059, + "reward": 3.7888190746307373, + "reward_std": 0.23447177559137344, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8786627650260925, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9375, + "rewards/think_target_format_reward": 0.97265625, + "step": 104 + }, + { + "completion_length": 132.1328125, + "epoch": 0.005228302544440571, + "grad_norm": 2.2772279008023526, + "kl": 0.1416015625, + "learning_rate": 9.973858487277797e-07, + "loss": 0.0057, + "reward": 3.735332489013672, + "reward_std": 0.3226277828216553, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8564262986183167, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.93359375, + "rewards/think_target_format_reward": 0.9453125, + "step": 105 + }, + { + "completion_length": 128.40625, + "epoch": 0.0052780959020066725, + "grad_norm": 1.7578441033665, + "kl": 0.12939453125, + "learning_rate": 9.973609520489966e-07, + "loss": 0.0052, + "reward": 3.741284728050232, + "reward_std": 0.32741629332304, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8819097578525543, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.8984375, + "rewards/think_target_format_reward": 0.96875, + "step": 106 + }, + { + "completion_length": 129.6171875, + "epoch": 0.005327889259572773, + "grad_norm": 2.1486917085197, + "kl": 0.16015625, + "learning_rate": 9.973360553702135e-07, + "loss": 0.0064, + "reward": 3.7753995656967163, + "reward_std": 0.36066462099552155, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9199308156967163, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.91015625, + "rewards/think_target_format_reward": 0.9453125, + "step": 107 + }, + { + "completion_length": 127.2734375, + "epoch": 0.005377682617138874, + "grad_norm": 1.4934338351074141, + "kl": 0.120849609375, + "learning_rate": 9.973111586914307e-07, + "loss": 0.0048, + "reward": 3.8515725135803223, + "reward_std": 0.2768500745296478, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9453224837779999, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.93359375, + "rewards/think_target_format_reward": 0.97265625, + "step": 108 + }, + { + "completion_length": 126.9609375, + "epoch": 0.005427475974704974, + "grad_norm": 1.9083041804936929, + "kl": 0.12646484375, + "learning_rate": 9.972862620126474e-07, + "loss": 0.0051, + "reward": 3.798158288002014, + "reward_std": 0.31719064712524414, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9231583178043365, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9296875, + "rewards/think_target_format_reward": 0.9453125, + "step": 109 + }, + { + "completion_length": 135.03125, + "epoch": 0.005477269332271075, + "grad_norm": 2.167186114226301, + "kl": 0.12548828125, + "learning_rate": 9.972613653338645e-07, + "loss": 0.005, + "reward": 3.6624783277511597, + "reward_std": 0.49521274864673615, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8421658873558044, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.8671875, + "rewards/think_target_format_reward": 0.9609375, + "step": 110 + }, + { + "completion_length": 138.1015625, + "epoch": 0.005527062689837175, + "grad_norm": 1.753506599114052, + "kl": 0.1220703125, + "learning_rate": 9.972364686550814e-07, + "loss": 0.0049, + "reward": 3.7800755500793457, + "reward_std": 0.3457276225090027, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.9089819192886353, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9140625, + "rewards/think_target_format_reward": 0.96484375, + "step": 111 + }, + { + "completion_length": 135.296875, + "epoch": 0.0055768560474032764, + "grad_norm": 1.2704605683592256, + "kl": 0.135009765625, + "learning_rate": 9.972115719762983e-07, + "loss": 0.0054, + "reward": 3.7460256814956665, + "reward_std": 0.3799598515033722, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9100881218910217, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.875, + "rewards/think_target_format_reward": 0.9609375, + "step": 112 + }, + { + "completion_length": 129.5, + "epoch": 0.005626649404969377, + "grad_norm": 1.1805088666825172, + "kl": 0.11376953125, + "learning_rate": 9.971866752975152e-07, + "loss": 0.0045, + "reward": 3.8595980405807495, + "reward_std": 0.2444089949131012, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9377230405807495, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.94921875, + "rewards/think_target_format_reward": 0.97265625, + "step": 113 + }, + { + "completion_length": 136.1171875, + "epoch": 0.005676442762535478, + "grad_norm": 3.5080180878820064, + "kl": 0.1298828125, + "learning_rate": 9.971617786187322e-07, + "loss": 0.0052, + "reward": 3.671228289604187, + "reward_std": 0.37986910343170166, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.8157596290111542, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9140625, + "rewards/think_target_format_reward": 0.95703125, + "step": 114 + }, + { + "completion_length": 128.6484375, + "epoch": 0.005726236120101579, + "grad_norm": 2.0309422835381956, + "kl": 0.143310546875, + "learning_rate": 9.97136881939949e-07, + "loss": 0.0057, + "reward": 3.832314372062683, + "reward_std": 0.26949892193078995, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.9495019316673279, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9296875, + "rewards/think_target_format_reward": 0.9609375, + "step": 115 + }, + { + "completion_length": 134.96875, + "epoch": 0.005776029477667679, + "grad_norm": 1.3529954025089752, + "kl": 0.115234375, + "learning_rate": 9.971119852611662e-07, + "loss": 0.0046, + "reward": 3.603510856628418, + "reward_std": 0.4776967465877533, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.7910108864307404, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.87109375, + "rewards/think_target_format_reward": 0.94140625, + "step": 116 + }, + { + "completion_length": 129.8671875, + "epoch": 0.00582582283523378, + "grad_norm": 1.816119649138257, + "kl": 0.12939453125, + "learning_rate": 9.970870885823831e-07, + "loss": 0.0052, + "reward": 3.740101456642151, + "reward_std": 0.3371017426252365, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8924451172351837, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.92578125, + "rewards/think_target_format_reward": 0.921875, + "step": 117 + }, + { + "completion_length": 142.4296875, + "epoch": 0.00587561619279988, + "grad_norm": 1.9820961209003143, + "kl": 0.120361328125, + "learning_rate": 9.970621919036e-07, + "loss": 0.0048, + "reward": 3.7626017332077026, + "reward_std": 0.41295652091503143, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.9383830428123474, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.89453125, + "rewards/think_target_format_reward": 0.9453125, + "step": 118 + }, + { + "completion_length": 128.7734375, + "epoch": 0.0059254095503659815, + "grad_norm": 1.6361688102254675, + "kl": 0.111328125, + "learning_rate": 9.97037295224817e-07, + "loss": 0.0044, + "reward": 3.765626549720764, + "reward_std": 0.42581306397914886, + "rewards/format_reward": 0.9765625, + "rewards/iou_reward": 0.9335952699184418, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.921875, + "rewards/think_target_format_reward": 0.93359375, + "step": 119 + }, + { + "completion_length": 130.8046875, + "epoch": 0.005975202907932082, + "grad_norm": 1.8389573390999723, + "kl": 0.11767578125, + "learning_rate": 9.970123985460339e-07, + "loss": 0.0047, + "reward": 3.710259199142456, + "reward_std": 0.3429488092660904, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8430716693401337, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.921875, + "rewards/think_target_format_reward": 0.9453125, + "step": 120 + }, + { + "completion_length": 130.4765625, + "epoch": 0.006024996265498183, + "grad_norm": 3.9422244276160723, + "kl": 0.119140625, + "learning_rate": 9.969875018672508e-07, + "loss": 0.0048, + "reward": 3.678414463996887, + "reward_std": 0.3737645298242569, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.85028937458992, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.890625, + "rewards/think_target_format_reward": 0.9453125, + "step": 121 + }, + { + "completion_length": 134.796875, + "epoch": 0.006074789623064283, + "grad_norm": 1.5776160234332826, + "kl": 0.114990234375, + "learning_rate": 9.96962605188468e-07, + "loss": 0.0046, + "reward": 3.730717897415161, + "reward_std": 0.31151118874549866, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8596242666244507, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.92578125, + "rewards/think_target_format_reward": 0.953125, + "step": 122 + }, + { + "completion_length": 129.0859375, + "epoch": 0.006124582980630384, + "grad_norm": 1.5516066906213293, + "kl": 0.14013671875, + "learning_rate": 9.969377085096846e-07, + "loss": 0.0056, + "reward": 3.759946346282959, + "reward_std": 0.33592642843723297, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8927588164806366, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.91796875, + "rewards/think_target_format_reward": 0.94921875, + "step": 123 + }, + { + "completion_length": 131.46875, + "epoch": 0.006174376338196484, + "grad_norm": 1.33072505316342, + "kl": 0.16015625, + "learning_rate": 9.969128118309018e-07, + "loss": 0.0064, + "reward": 3.788348078727722, + "reward_std": 0.39808133244514465, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.9602230489253998, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9140625, + "rewards/think_target_format_reward": 0.9296875, + "step": 124 + }, + { + "completion_length": 129.2421875, + "epoch": 0.0062241696957625854, + "grad_norm": 1.6285417986294162, + "kl": 0.13134765625, + "learning_rate": 9.968879151521187e-07, + "loss": 0.0053, + "reward": 3.8626081943511963, + "reward_std": 0.22992850840091705, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9797957241535187, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9296875, + "rewards/think_target_format_reward": 0.953125, + "step": 125 + }, + { + "completion_length": 135.1640625, + "epoch": 0.006273963053328686, + "grad_norm": 2.1985940038883736, + "kl": 0.1103515625, + "learning_rate": 9.968630184733356e-07, + "loss": 0.0044, + "reward": 3.8274006843566895, + "reward_std": 0.28642556071281433, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.9563069343566895, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.91015625, + "rewards/think_target_format_reward": 0.96875, + "step": 126 + }, + { + "completion_length": 133.3671875, + "epoch": 0.006323756410894787, + "grad_norm": 2.81835670970775, + "kl": 0.1201171875, + "learning_rate": 9.968381217945525e-07, + "loss": 0.0048, + "reward": 3.7653896808624268, + "reward_std": 0.4158574640750885, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9450772702693939, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.87109375, + "rewards/think_target_format_reward": 0.94921875, + "step": 127 + }, + { + "completion_length": 131.1953125, + "epoch": 0.006373549768460887, + "grad_norm": 1.4975452301757977, + "kl": 0.1142578125, + "learning_rate": 9.968132251157694e-07, + "loss": 0.0046, + "reward": 3.6445605754852295, + "reward_std": 0.33991990983486176, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.7695604264736176, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9453125, + "rewards/think_target_format_reward": 0.9375, + "step": 128 + }, + { + "completion_length": 130.984375, + "epoch": 0.006423343126026988, + "grad_norm": 2.432891148577604, + "kl": 0.14208984375, + "learning_rate": 9.967883284369866e-07, + "loss": 0.0057, + "reward": 3.7626477479934692, + "reward_std": 0.38446202874183655, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.961866557598114, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.88671875, + "rewards/think_target_format_reward": 0.921875, + "step": 129 + }, + { + "completion_length": 138.1875, + "epoch": 0.006473136483593088, + "grad_norm": 1.768493204156978, + "kl": 0.1015625, + "learning_rate": 9.967634317582035e-07, + "loss": 0.0041, + "reward": 3.747402787208557, + "reward_std": 0.3363555520772934, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8724027872085571, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.91796875, + "rewards/think_target_format_reward": 0.96484375, + "step": 130 + }, + { + "completion_length": 129.1640625, + "epoch": 0.006522929841159189, + "grad_norm": 1.4668984574716555, + "kl": 0.11474609375, + "learning_rate": 9.967385350794204e-07, + "loss": 0.0046, + "reward": 3.791267156600952, + "reward_std": 0.30764734745025635, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9045484066009521, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.93359375, + "rewards/think_target_format_reward": 0.953125, + "step": 131 + }, + { + "completion_length": 133.1796875, + "epoch": 0.00657272319872529, + "grad_norm": 1.1479994426760773, + "kl": 0.1201171875, + "learning_rate": 9.967136384006373e-07, + "loss": 0.0048, + "reward": 3.8186784982681274, + "reward_std": 0.28686320781707764, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9397721886634827, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.91796875, + "rewards/think_target_format_reward": 0.9609375, + "step": 132 + }, + { + "completion_length": 133.5234375, + "epoch": 0.006622516556291391, + "grad_norm": 1.4606748968420713, + "kl": 0.10693359375, + "learning_rate": 9.966887417218542e-07, + "loss": 0.0043, + "reward": 3.8264963626861572, + "reward_std": 0.31041674315929413, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.951496422290802, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9375, + "rewards/think_target_format_reward": 0.9453125, + "step": 133 + }, + { + "completion_length": 130.046875, + "epoch": 0.006672309913857492, + "grad_norm": 1.505585519072124, + "kl": 0.11767578125, + "learning_rate": 9.966638450430712e-07, + "loss": 0.0047, + "reward": 3.7808796167373657, + "reward_std": 0.2776695638895035, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8863483667373657, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9375, + "rewards/think_target_format_reward": 0.95703125, + "step": 134 + }, + { + "completion_length": 132.7109375, + "epoch": 0.006722103271423592, + "grad_norm": 1.9010721060857094, + "kl": 0.122314453125, + "learning_rate": 9.966389483642883e-07, + "loss": 0.0049, + "reward": 3.761750102043152, + "reward_std": 0.2718608230352402, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8828437924385071, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.91796875, + "rewards/think_target_format_reward": 0.9609375, + "step": 135 + }, + { + "completion_length": 145.0625, + "epoch": 0.006771896628989693, + "grad_norm": 1.7360529332367947, + "kl": 0.107421875, + "learning_rate": 9.966140516855052e-07, + "loss": 0.0043, + "reward": 3.7078869342803955, + "reward_std": 0.44852541387081146, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.9344494342803955, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.8515625, + "rewards/think_target_format_reward": 0.9296875, + "step": 136 + }, + { + "completion_length": 132.953125, + "epoch": 0.006821689986555793, + "grad_norm": 1.4430066302434916, + "kl": 0.11083984375, + "learning_rate": 9.965891550067221e-07, + "loss": 0.0044, + "reward": 3.7096720933914185, + "reward_std": 0.3169776052236557, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8229533731937408, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.921875, + "rewards/think_target_format_reward": 0.96484375, + "step": 137 + }, + { + "completion_length": 132.7265625, + "epoch": 0.0068714833441218945, + "grad_norm": 1.605850878311132, + "kl": 0.1201171875, + "learning_rate": 9.96564258327939e-07, + "loss": 0.0048, + "reward": 3.8397518396377563, + "reward_std": 0.29040297865867615, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9569393694400787, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.94140625, + "rewards/think_target_format_reward": 0.94140625, + "step": 138 + }, + { + "completion_length": 136.5390625, + "epoch": 0.006921276701687995, + "grad_norm": 1.438621086051704, + "kl": 0.11083984375, + "learning_rate": 9.96539361649156e-07, + "loss": 0.0044, + "reward": 3.7994948625564575, + "reward_std": 0.27994580566883087, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8619949817657471, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.95703125, + "rewards/think_target_format_reward": 0.98046875, + "step": 139 + }, + { + "completion_length": 136.9921875, + "epoch": 0.006971070059254096, + "grad_norm": 1.742902448030199, + "kl": 0.1357421875, + "learning_rate": 9.965144649703729e-07, + "loss": 0.0054, + "reward": 3.7724233865737915, + "reward_std": 0.3635234981775284, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.909142255783081, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.90234375, + "rewards/think_target_format_reward": 0.96875, + "step": 140 + }, + { + "completion_length": 135.125, + "epoch": 0.007020863416820196, + "grad_norm": 1.5868682122567102, + "kl": 0.13671875, + "learning_rate": 9.964895682915898e-07, + "loss": 0.0055, + "reward": 3.84320604801178, + "reward_std": 0.2651433199644089, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9174248576164246, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.94921875, + "rewards/think_target_format_reward": 0.9765625, + "step": 141 + }, + { + "completion_length": 133.078125, + "epoch": 0.007070656774386297, + "grad_norm": 0.9538113656610722, + "kl": 0.11572265625, + "learning_rate": 9.964646716128067e-07, + "loss": 0.0046, + "reward": 3.9042047262191772, + "reward_std": 0.1541340947151184, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.9588921666145325, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.97265625, + "rewards/think_target_format_reward": 0.98046875, + "step": 142 + }, + { + "completion_length": 134.421875, + "epoch": 0.007120450131952397, + "grad_norm": 1.4326083857992864, + "kl": 0.12890625, + "learning_rate": 9.964397749340238e-07, + "loss": 0.0052, + "reward": 3.85475492477417, + "reward_std": 0.23000210523605347, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9328798949718475, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.94140625, + "rewards/think_target_format_reward": 0.98046875, + "step": 143 + }, + { + "completion_length": 133.0703125, + "epoch": 0.007170243489518498, + "grad_norm": 1.4970940504044847, + "kl": 0.1318359375, + "learning_rate": 9.964148782552408e-07, + "loss": 0.0053, + "reward": 3.8657045364379883, + "reward_std": 0.2182132601737976, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.9516420662403107, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9375, + "rewards/think_target_format_reward": 0.984375, + "step": 144 + }, + { + "completion_length": 142.1875, + "epoch": 0.007220036847084599, + "grad_norm": 1.5857593019554064, + "kl": 0.1259765625, + "learning_rate": 9.963899815764577e-07, + "loss": 0.005, + "reward": 3.8219841718673706, + "reward_std": 0.2619125619530678, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.896202951669693, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.96484375, + "rewards/think_target_format_reward": 0.96875, + "step": 145 + }, + { + "completion_length": 139.1796875, + "epoch": 0.0072698302046507, + "grad_norm": 2.691450415345861, + "kl": 0.1337890625, + "learning_rate": 9.963650848976746e-07, + "loss": 0.0053, + "reward": 3.734684467315674, + "reward_std": 0.3204428255558014, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8479657769203186, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9296875, + "rewards/think_target_format_reward": 0.95703125, + "step": 146 + }, + { + "completion_length": 136.203125, + "epoch": 0.0073196235622168, + "grad_norm": 1.495496256661318, + "kl": 0.14794921875, + "learning_rate": 9.963401882188915e-07, + "loss": 0.0059, + "reward": 3.838493585586548, + "reward_std": 0.2868060767650604, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9361498653888702, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.93359375, + "rewards/think_target_format_reward": 0.96875, + "step": 147 + }, + { + "completion_length": 133.3515625, + "epoch": 0.007369416919782901, + "grad_norm": 1.0983399124579554, + "kl": 0.124267578125, + "learning_rate": 9.963152915401084e-07, + "loss": 0.005, + "reward": 3.9058321714401245, + "reward_std": 0.14993900060653687, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9331759214401245, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.984375, + "rewards/think_target_format_reward": 0.98828125, + "step": 148 + }, + { + "completion_length": 136.140625, + "epoch": 0.007419210277349001, + "grad_norm": 1.3770544593695273, + "kl": 0.123046875, + "learning_rate": 9.962903948613256e-07, + "loss": 0.0049, + "reward": 3.777996063232422, + "reward_std": 0.3385941982269287, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8678397238254547, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9453125, + "rewards/think_target_format_reward": 0.96484375, + "step": 149 + }, + { + "completion_length": 131.6328125, + "epoch": 0.007469003634915102, + "grad_norm": 1.464594476189404, + "kl": 0.12548828125, + "learning_rate": 9.962654981825423e-07, + "loss": 0.005, + "reward": 3.787842631340027, + "reward_std": 0.28376615047454834, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8894051313400269, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9375, + "rewards/think_target_format_reward": 0.9609375, + "step": 150 + }, + { + "completion_length": 134.6953125, + "epoch": 0.007518796992481203, + "grad_norm": 1.2004489896966177, + "kl": 0.115966796875, + "learning_rate": 9.962406015037594e-07, + "loss": 0.0046, + "reward": 3.8803350925445557, + "reward_std": 0.17658080160617828, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9272100925445557, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.96484375, + "rewards/think_target_format_reward": 0.98828125, + "step": 151 + }, + { + "completion_length": 144.515625, + "epoch": 0.007568590350047304, + "grad_norm": 1.8184923458592253, + "kl": 0.1767578125, + "learning_rate": 9.962157048249763e-07, + "loss": 0.0071, + "reward": 3.7153056859970093, + "reward_std": 0.3971406817436218, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8754618465900421, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.91015625, + "rewards/think_target_format_reward": 0.9375, + "step": 152 + }, + { + "completion_length": 145.1484375, + "epoch": 0.007618383707613405, + "grad_norm": 1.740066860532123, + "kl": 0.115478515625, + "learning_rate": 9.961908081461932e-07, + "loss": 0.0046, + "reward": 3.5525866746902466, + "reward_std": 0.3999825417995453, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.736180454492569, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.875, + "rewards/think_target_format_reward": 0.94921875, + "step": 153 + }, + { + "completion_length": 133.25, + "epoch": 0.007668177065179505, + "grad_norm": 1.8977649396016614, + "kl": 0.12060546875, + "learning_rate": 9.961659114674104e-07, + "loss": 0.0048, + "reward": 3.74560284614563, + "reward_std": 0.28055620193481445, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8627903461456299, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.94140625, + "rewards/think_target_format_reward": 0.94921875, + "step": 154 + }, + { + "completion_length": 151.859375, + "epoch": 0.007717970422745606, + "grad_norm": 1.1725147495638082, + "kl": 0.111572265625, + "learning_rate": 9.96141014788627e-07, + "loss": 0.0045, + "reward": 3.6868364810943604, + "reward_std": 0.2801390290260315, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8508988916873932, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.87890625, + "rewards/think_target_format_reward": 0.95703125, + "step": 155 + }, + { + "completion_length": 135.9609375, + "epoch": 0.007767763780311706, + "grad_norm": 2.2677825780417793, + "kl": 0.14453125, + "learning_rate": 9.961161181098442e-07, + "loss": 0.0058, + "reward": 3.7312498092651367, + "reward_std": 0.3374701291322708, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8796873390674591, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.921875, + "rewards/think_target_format_reward": 0.9375, + "step": 156 + }, + { + "completion_length": 135.3203125, + "epoch": 0.007817557137877807, + "grad_norm": 2.836869723063664, + "kl": 0.111328125, + "learning_rate": 9.96091221431061e-07, + "loss": 0.0045, + "reward": 3.6421433687210083, + "reward_std": 0.3600689470767975, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8062058389186859, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.91015625, + "rewards/think_target_format_reward": 0.92578125, + "step": 157 + }, + { + "completion_length": 141.34375, + "epoch": 0.007867350495443909, + "grad_norm": 1.4724566571316633, + "kl": 0.119140625, + "learning_rate": 9.96066324752278e-07, + "loss": 0.0048, + "reward": 3.581104278564453, + "reward_std": 0.5320860892534256, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.8193855285644531, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.84765625, + "rewards/think_target_format_reward": 0.9296875, + "step": 158 + }, + { + "completion_length": 143.46875, + "epoch": 0.007917143853010008, + "grad_norm": 1.3122109117379412, + "kl": 0.12353515625, + "learning_rate": 9.96041428073495e-07, + "loss": 0.0049, + "reward": 3.741799831390381, + "reward_std": 0.43711815774440765, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.9410184919834137, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.87890625, + "rewards/think_target_format_reward": 0.9296875, + "step": 159 + }, + { + "completion_length": 139.2265625, + "epoch": 0.007966937210576109, + "grad_norm": 2.703237259652327, + "kl": 0.1328125, + "learning_rate": 9.960165313947119e-07, + "loss": 0.0053, + "reward": 3.7987393140792847, + "reward_std": 0.26848846673965454, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9237393438816071, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.92578125, + "rewards/think_target_format_reward": 0.94921875, + "step": 160 + }, + { + "completion_length": 134.234375, + "epoch": 0.00801673056814221, + "grad_norm": 3.9032622352853075, + "kl": 0.138671875, + "learning_rate": 9.959916347159288e-07, + "loss": 0.0055, + "reward": 3.7779178619384766, + "reward_std": 0.3339390158653259, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9497927129268646, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.92578125, + "rewards/think_target_format_reward": 0.90234375, + "step": 161 + }, + { + "completion_length": 135.1015625, + "epoch": 0.008066523925708311, + "grad_norm": 1.2031936098091933, + "kl": 0.11767578125, + "learning_rate": 9.95966738037146e-07, + "loss": 0.0047, + "reward": 3.7648441791534424, + "reward_std": 0.2988617271184921, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8937504291534424, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.91796875, + "rewards/think_target_format_reward": 0.953125, + "step": 162 + }, + { + "completion_length": 138.40625, + "epoch": 0.00811631728327441, + "grad_norm": 1.4061207126164985, + "kl": 0.10986328125, + "learning_rate": 9.959418413583628e-07, + "loss": 0.0044, + "reward": 3.777918815612793, + "reward_std": 0.33315446972846985, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8833875954151154, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9375, + "rewards/think_target_format_reward": 0.96484375, + "step": 163 + }, + { + "completion_length": 137.25, + "epoch": 0.008166110640840512, + "grad_norm": 1.4848358680233253, + "kl": 0.117431640625, + "learning_rate": 9.959169446795797e-07, + "loss": 0.0047, + "reward": 3.6809157133102417, + "reward_std": 0.33889006078243256, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8449782729148865, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.8984375, + "rewards/think_target_format_reward": 0.9375, + "step": 164 + }, + { + "completion_length": 133.1796875, + "epoch": 0.008215903998406613, + "grad_norm": 1.858780364596672, + "kl": 0.126953125, + "learning_rate": 9.958920480007967e-07, + "loss": 0.0051, + "reward": 3.739651918411255, + "reward_std": 0.3844549208879471, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.9271518290042877, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.89453125, + "rewards/think_target_format_reward": 0.92578125, + "step": 165 + }, + { + "completion_length": 144.765625, + "epoch": 0.008265697355972714, + "grad_norm": 1.4747436757816734, + "kl": 0.12939453125, + "learning_rate": 9.958671513220136e-07, + "loss": 0.0052, + "reward": 3.710367441177368, + "reward_std": 0.4435315430164337, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.9173987507820129, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.8984375, + "rewards/think_target_format_reward": 0.91015625, + "step": 166 + }, + { + "completion_length": 142.5859375, + "epoch": 0.008315490713538813, + "grad_norm": 2.5676869604785053, + "kl": 0.125732421875, + "learning_rate": 9.958422546432305e-07, + "loss": 0.005, + "reward": 3.7357089519500732, + "reward_std": 0.3510442525148392, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8997714817523956, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.890625, + "rewards/think_target_format_reward": 0.9453125, + "step": 167 + }, + { + "completion_length": 145.59375, + "epoch": 0.008365284071104914, + "grad_norm": 2.1425386245071385, + "kl": 0.128662109375, + "learning_rate": 9.958173579644476e-07, + "loss": 0.0051, + "reward": 3.687202215194702, + "reward_std": 0.511498898267746, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.9215771853923798, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.875, + "rewards/think_target_format_reward": 0.90625, + "step": 168 + }, + { + "completion_length": 143.2109375, + "epoch": 0.008415077428671015, + "grad_norm": 1.4016255030157003, + "kl": 0.1435546875, + "learning_rate": 9.957924612856643e-07, + "loss": 0.0057, + "reward": 3.7200942039489746, + "reward_std": 0.35561899840831757, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8802503943443298, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.91796875, + "rewards/think_target_format_reward": 0.921875, + "step": 169 + }, + { + "completion_length": 140.4921875, + "epoch": 0.008464870786237116, + "grad_norm": 2.8709280784312723, + "kl": 0.12548828125, + "learning_rate": 9.957675646068815e-07, + "loss": 0.005, + "reward": 3.7149710655212402, + "reward_std": 0.36778540909290314, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.8790336549282074, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.90625, + "rewards/think_target_format_reward": 0.9453125, + "step": 170 + }, + { + "completion_length": 142.171875, + "epoch": 0.008514664143803218, + "grad_norm": 1.226223825898933, + "kl": 0.134521484375, + "learning_rate": 9.957426679280984e-07, + "loss": 0.0054, + "reward": 3.8738365173339844, + "reward_std": 0.2439577430486679, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9910240471363068, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.953125, + "rewards/think_target_format_reward": 0.9296875, + "step": 171 + }, + { + "completion_length": 130.15625, + "epoch": 0.008564457501369317, + "grad_norm": 2.0908810708158905, + "kl": 0.1328125, + "learning_rate": 9.957177712493153e-07, + "loss": 0.0053, + "reward": 3.8390663862228394, + "reward_std": 0.2808324694633484, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9601600170135498, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.93359375, + "rewards/think_target_format_reward": 0.9453125, + "step": 172 + }, + { + "completion_length": 140.0703125, + "epoch": 0.008614250858935418, + "grad_norm": 1.4453383940404523, + "kl": 0.125244140625, + "learning_rate": 9.956928745705322e-07, + "loss": 0.005, + "reward": 3.7553484439849854, + "reward_std": 0.40174250304698944, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.9350358545780182, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.88671875, + "rewards/think_target_format_reward": 0.94140625, + "step": 173 + }, + { + "completion_length": 136.2890625, + "epoch": 0.008664044216501519, + "grad_norm": 1.713845227339911, + "kl": 0.13134765625, + "learning_rate": 9.956679778917491e-07, + "loss": 0.0053, + "reward": 3.7936216592788696, + "reward_std": 0.24137836694717407, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8678405284881592, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.96875, + "rewards/think_target_format_reward": 0.96484375, + "step": 174 + }, + { + "completion_length": 140.203125, + "epoch": 0.00871383757406762, + "grad_norm": 2.1609036886833946, + "kl": 0.13330078125, + "learning_rate": 9.95643081212966e-07, + "loss": 0.0053, + "reward": 3.7413917779922485, + "reward_std": 0.36172524839639664, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.9015480577945709, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.89453125, + "rewards/think_target_format_reward": 0.953125, + "step": 175 + }, + { + "completion_length": 131.078125, + "epoch": 0.00876363093163372, + "grad_norm": 1.1442404831048407, + "kl": 0.116455078125, + "learning_rate": 9.956181845341832e-07, + "loss": 0.0047, + "reward": 3.818881392478943, + "reward_std": 0.2061913162469864, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8931001722812653, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9609375, + "rewards/think_target_format_reward": 0.97265625, + "step": 176 + }, + { + "completion_length": 133.890625, + "epoch": 0.00881342428919982, + "grad_norm": 2.2929973062605526, + "kl": 0.11181640625, + "learning_rate": 9.955932878554e-07, + "loss": 0.0045, + "reward": 3.875520706176758, + "reward_std": 0.21005383133888245, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9497394263744354, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.94921875, + "rewards/think_target_format_reward": 0.9765625, + "step": 177 + }, + { + "completion_length": 133.6015625, + "epoch": 0.008863217646765922, + "grad_norm": 1.2448177543558887, + "kl": 0.13916015625, + "learning_rate": 9.95568391176617e-07, + "loss": 0.0056, + "reward": 3.764333486557007, + "reward_std": 0.29275651276111603, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8854272067546844, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.92578125, + "rewards/think_target_format_reward": 0.953125, + "step": 178 + }, + { + "completion_length": 130.359375, + "epoch": 0.008913011004332023, + "grad_norm": 2.9924606023171716, + "kl": 0.135009765625, + "learning_rate": 9.95543494497834e-07, + "loss": 0.0054, + "reward": 3.6208207607269287, + "reward_std": 0.1692044734954834, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.6872271299362183, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9765625, + "rewards/think_target_format_reward": 0.96484375, + "step": 179 + }, + { + "completion_length": 143.6484375, + "epoch": 0.008962804361898122, + "grad_norm": 1.765492997683097, + "kl": 0.122314453125, + "learning_rate": 9.955185978190508e-07, + "loss": 0.0049, + "reward": 3.6510881185531616, + "reward_std": 0.3637561723589897, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.7604632079601288, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9296875, + "rewards/think_target_format_reward": 0.9765625, + "step": 180 + }, + { + "completion_length": 131.0, + "epoch": 0.009012597719464223, + "grad_norm": 2.343454400266417, + "kl": 0.124755859375, + "learning_rate": 9.95493701140268e-07, + "loss": 0.005, + "reward": 3.7367568016052246, + "reward_std": 0.232781320810318, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.8305067718029022, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.94921875, + "rewards/think_target_format_reward": 0.96484375, + "step": 181 + }, + { + "completion_length": 132.5, + "epoch": 0.009062391077030324, + "grad_norm": 1.1111419661435378, + "kl": 0.125732421875, + "learning_rate": 9.954688044614849e-07, + "loss": 0.005, + "reward": 3.8547242879867554, + "reward_std": 0.21629764139652252, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9172243475914001, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.97265625, + "rewards/think_target_format_reward": 0.96484375, + "step": 182 + }, + { + "completion_length": 131.390625, + "epoch": 0.009112184434596425, + "grad_norm": 2.0370926559400986, + "kl": 0.11767578125, + "learning_rate": 9.954439077827018e-07, + "loss": 0.0047, + "reward": 3.624150037765503, + "reward_std": 0.3819653391838074, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.7608687877655029, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.92578125, + "rewards/think_target_format_reward": 0.953125, + "step": 183 + }, + { + "completion_length": 129.234375, + "epoch": 0.009161977792162525, + "grad_norm": 1.3964944991422275, + "kl": 0.1259765625, + "learning_rate": 9.954190111039187e-07, + "loss": 0.005, + "reward": 3.766968011856079, + "reward_std": 0.30075134336948395, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.8880617618560791, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.94140625, + "rewards/think_target_format_reward": 0.953125, + "step": 184 + }, + { + "completion_length": 130.1328125, + "epoch": 0.009211771149728626, + "grad_norm": 2.1227955246094914, + "kl": 0.1240234375, + "learning_rate": 9.953941144251356e-07, + "loss": 0.005, + "reward": 3.8269156217575073, + "reward_std": 0.35711514949798584, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.9636342823505402, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9296875, + "rewards/think_target_format_reward": 0.94921875, + "step": 185 + }, + { + "completion_length": 142.15625, + "epoch": 0.009261564507294727, + "grad_norm": 1.95409263644765, + "kl": 0.110107421875, + "learning_rate": 9.953692177463526e-07, + "loss": 0.0044, + "reward": 3.8330496549606323, + "reward_std": 0.2704509347677231, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9307057559490204, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.93359375, + "rewards/think_target_format_reward": 0.96875, + "step": 186 + }, + { + "completion_length": 134.2734375, + "epoch": 0.009311357864860828, + "grad_norm": 2.363007168066581, + "kl": 0.130615234375, + "learning_rate": 9.953443210675695e-07, + "loss": 0.0052, + "reward": 3.635684370994568, + "reward_std": 0.33139239251613617, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.7528719007968903, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9375, + "rewards/think_target_format_reward": 0.953125, + "step": 187 + }, + { + "completion_length": 131.1953125, + "epoch": 0.009361151222426927, + "grad_norm": 4.254913357245525, + "kl": 0.17529296875, + "learning_rate": 9.953194243887864e-07, + "loss": 0.007, + "reward": 3.803374409675598, + "reward_std": 0.29695306718349457, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9127494394779205, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.94140625, + "rewards/think_target_format_reward": 0.94921875, + "step": 188 + }, + { + "completion_length": 132.484375, + "epoch": 0.009410944579993029, + "grad_norm": 1.0352695919531547, + "kl": 0.126708984375, + "learning_rate": 9.952945277100035e-07, + "loss": 0.0051, + "reward": 3.8454381227493286, + "reward_std": 0.2677586227655411, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.9626255929470062, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.921875, + "rewards/think_target_format_reward": 0.96875, + "step": 189 + }, + { + "completion_length": 134.3515625, + "epoch": 0.00946073793755913, + "grad_norm": 2.0343529191628447, + "kl": 0.11474609375, + "learning_rate": 9.952696310312204e-07, + "loss": 0.0046, + "reward": 3.8033363819122314, + "reward_std": 0.3392862230539322, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9517738819122314, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.91796875, + "rewards/think_target_format_reward": 0.93359375, + "step": 190 + }, + { + "completion_length": 137.6171875, + "epoch": 0.00951053129512523, + "grad_norm": 1.2106439736675043, + "kl": 0.11865234375, + "learning_rate": 9.952447343524374e-07, + "loss": 0.0048, + "reward": 3.8473784923553467, + "reward_std": 0.2762718200683594, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9801909625530243, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.91796875, + "rewards/think_target_format_reward": 0.94921875, + "step": 191 + }, + { + "completion_length": 137.421875, + "epoch": 0.009560324652691332, + "grad_norm": 1.8432265310075955, + "kl": 0.113037109375, + "learning_rate": 9.952198376736543e-07, + "loss": 0.0045, + "reward": 3.8353395462036133, + "reward_std": 0.29184091091156006, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9525269865989685, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.921875, + "rewards/think_target_format_reward": 0.9609375, + "step": 192 + }, + { + "completion_length": 135.90625, + "epoch": 0.009610118010257431, + "grad_norm": 2.3584214680619557, + "kl": 0.12744140625, + "learning_rate": 9.951949409948712e-07, + "loss": 0.0051, + "reward": 3.755526304244995, + "reward_std": 0.34870287775993347, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8883388042449951, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9453125, + "rewards/think_target_format_reward": 0.921875, + "step": 193 + }, + { + "completion_length": 143.25, + "epoch": 0.009659911367823532, + "grad_norm": 1.2166183849660404, + "kl": 0.107666015625, + "learning_rate": 9.951700443160881e-07, + "loss": 0.0043, + "reward": 3.7504327297210693, + "reward_std": 0.3105862885713577, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9262140393257141, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.88671875, + "rewards/think_target_format_reward": 0.9375, + "step": 194 + }, + { + "completion_length": 135.28125, + "epoch": 0.009709704725389633, + "grad_norm": 1.3206251921762313, + "kl": 0.127685546875, + "learning_rate": 9.951451476373052e-07, + "loss": 0.0051, + "reward": 3.809424638748169, + "reward_std": 0.36549994349479675, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.961768388748169, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.92578125, + "rewards/think_target_format_reward": 0.921875, + "step": 195 + }, + { + "completion_length": 142.8203125, + "epoch": 0.009759498082955734, + "grad_norm": 2.060018593001608, + "kl": 0.13330078125, + "learning_rate": 9.95120250958522e-07, + "loss": 0.0053, + "reward": 3.794524669647217, + "reward_std": 0.3640502989292145, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.9781184792518616, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.90625, + "rewards/think_target_format_reward": 0.91015625, + "step": 196 + }, + { + "completion_length": 140.1953125, + "epoch": 0.009809291440521834, + "grad_norm": 1.4751437362350364, + "kl": 0.16259765625, + "learning_rate": 9.95095354279739e-07, + "loss": 0.0065, + "reward": 3.769395589828491, + "reward_std": 0.3139412999153137, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.925645649433136, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.91015625, + "rewards/think_target_format_reward": 0.93359375, + "step": 197 + }, + { + "completion_length": 143.3203125, + "epoch": 0.009859084798087935, + "grad_norm": 1.9968938821122522, + "kl": 0.12353515625, + "learning_rate": 9.95070457600956e-07, + "loss": 0.0049, + "reward": 3.7199814319610596, + "reward_std": 0.45447179675102234, + "rewards/format_reward": 0.984375, + "rewards/iou_reward": 0.8606063425540924, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9140625, + "rewards/think_target_format_reward": 0.9609375, + "step": 198 + }, + { + "completion_length": 134.859375, + "epoch": 0.009908878155654036, + "grad_norm": 1.4535570244442058, + "kl": 0.115966796875, + "learning_rate": 9.95045560922173e-07, + "loss": 0.0046, + "reward": 3.7775022983551025, + "reward_std": 0.20365791022777557, + "rewards/format_reward": 1.0, + "rewards/iou_reward": 0.8712522685527802, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.9453125, + "rewards/think_target_format_reward": 0.9609375, + "step": 199 + }, + { + "completion_length": 136.78125, + "epoch": 0.009958671513220137, + "grad_norm": 1.6921135631113906, + "kl": 0.112060546875, + "learning_rate": 9.950206642433898e-07, + "loss": 0.0045, + "reward": 3.80859375, + "reward_std": 0.24464495480060577, + "rewards/format_reward": 0.9921875, + "rewards/iou_reward": 0.9375, + "rewards/log_reward": 0.0, + "rewards/think_bbox_format_reward": 0.94140625, + "rewards/think_target_format_reward": 0.9375, + "step": 200 + } + ], + "logging_steps": 1.0, + "max_steps": 40166, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}