diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8058 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4008, + "eval_steps": 500, + "global_step": 501, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 288.625, + "epoch": 0.008, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 2e-05, + "loss": -0.055, + "reward": 0.748445987701416, + "reward_std": 0.48841265588998795, + "rewards/mrr_reward": 0.24103422835469246, + "rewards/rank_answer_foramt_reward": 0.4765625, + "rewards/rank_overall_format_reward": 0.875, + "rewards/rank_think_format_reward": 0.1860488811507821, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.5, + "epoch": 0.016, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 2e-05, + "loss": -0.0847, + "reward": 0.6804930120706558, + "reward_std": 0.38646717369556427, + "rewards/mrr_reward": 0.1652343738824129, + "rewards/rank_answer_foramt_reward": 0.421875, + "rewards/rank_overall_format_reward": 0.8671875, + "rewards/rank_think_format_reward": 0.27232725732028484, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.84375, + "epoch": 0.024, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 2e-05, + "loss": -0.0834, + "reward": 0.9339272081851959, + "reward_std": 0.5670942962169647, + "rewards/mrr_reward": 0.37167659401893616, + "rewards/rank_answer_foramt_reward": 0.5703125, + "rewards/rank_overall_format_reward": 0.8671875, + "rewards/rank_think_format_reward": 0.26628969237208366, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.671875, + "epoch": 0.032, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 2e-05, + "loss": -0.0804, + "reward": 1.024831861257553, + "reward_std": 0.5523797571659088, + "rewards/mrr_reward": 0.37087053433060646, + "rewards/rank_answer_foramt_reward": 0.623046875, + "rewards/rank_overall_format_reward": 0.90625, + "rewards/rank_think_format_reward": 0.45240409672260284, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.25, + "epoch": 0.04, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 2e-05, + "loss": -0.0456, + "reward": 0.9988610446453094, + "reward_std": 0.5176242738962173, + "rewards/mrr_reward": 0.4147135466337204, + "rewards/rank_answer_foramt_reward": 0.607421875, + "rewards/rank_overall_format_reward": 0.890625, + "rewards/rank_think_format_reward": 0.27209700644016266, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.34375, + "epoch": 0.048, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 2e-05, + "loss": -0.0798, + "reward": 0.7302423864603043, + "reward_std": 0.3661453425884247, + "rewards/mrr_reward": 0.182986113242805, + "rewards/rank_answer_foramt_reward": 0.48828125, + "rewards/rank_overall_format_reward": 0.90625, + "rewards/rank_think_format_reward": 0.2638210151344538, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.421875, + "epoch": 0.056, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 2e-05, + "loss": -0.0388, + "reward": 0.9191122651100159, + "reward_std": 0.5329124554991722, + "rewards/mrr_reward": 0.358767356723547, + "rewards/rank_answer_foramt_reward": 0.564453125, + "rewards/rank_overall_format_reward": 0.875, + "rewards/rank_think_format_reward": 0.2585616558790207, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.421875, + "epoch": 0.064, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 2e-05, + "loss": -0.0344, + "reward": 0.8672969937324524, + "reward_std": 0.48230237513780594, + "rewards/mrr_reward": 0.3153645843267441, + "rewards/rank_answer_foramt_reward": 0.525390625, + "rewards/rank_overall_format_reward": 0.859375, + "rewards/rank_think_format_reward": 0.2877568071708083, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.59375, + "epoch": 0.072, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 2e-05, + "loss": -0.0414, + "reward": 1.0287966132164001, + "reward_std": 0.5458935871720314, + "rewards/mrr_reward": 0.4440104216337204, + "rewards/rank_answer_foramt_reward": 0.560546875, + "rewards/rank_overall_format_reward": 0.875, + "rewards/rank_think_format_reward": 0.33653245121240616, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.8125, + "epoch": 0.08, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 2e-05, + "loss": -0.0409, + "reward": 1.157219260931015, + "reward_std": 0.38755715638399124, + "rewards/mrr_reward": 0.5087177604436874, + "rewards/rank_answer_foramt_reward": 0.8203125, + "rewards/rank_overall_format_reward": 0.859375, + "rewards/rank_think_format_reward": 0.2854683920741081, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.140625, + "epoch": 0.088, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 2e-05, + "loss": -0.0841, + "reward": 0.9862491339445114, + "reward_std": 0.5109377801418304, + "rewards/mrr_reward": 0.3429687526077032, + "rewards/rank_answer_foramt_reward": 0.611328125, + "rewards/rank_overall_format_reward": 0.921875, + "rewards/rank_think_format_reward": 0.41613128781318665, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.796875, + "epoch": 0.096, + "grad_norm": 0.03115917183458805, + "kl": 0.0, + "learning_rate": 1.9999999684172664e-05, + "loss": -0.0053, + "reward": 0.7712794989347458, + "reward_std": 0.48659200221300125, + "rewards/mrr_reward": 0.2405133955180645, + "rewards/rank_answer_foramt_reward": 0.400390625, + "rewards/rank_overall_format_reward": 0.90625, + "rewards/rank_think_format_reward": 0.3017413951456547, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.734375, + "epoch": 0.104, + "grad_norm": 0.03115917183458805, + "kl": -6.780028343200684e-06, + "learning_rate": 1.9999999684172664e-05, + "loss": -0.0281, + "reward": 0.8285368829965591, + "reward_std": 0.5136675909161568, + "rewards/mrr_reward": 0.27406374365091324, + "rewards/rank_answer_foramt_reward": 0.494140625, + "rewards/rank_overall_format_reward": 0.890625, + "rewards/rank_think_format_reward": 0.2954559400677681, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.71875, + "epoch": 0.112, + "grad_norm": 0.03210530802607536, + "kl": -6.556510925292969e-06, + "learning_rate": 1.9999998736690666e-05, + "loss": -0.0714, + "reward": 0.8715860396623611, + "reward_std": 0.49175362288951874, + "rewards/mrr_reward": 0.27080853283405304, + "rewards/rank_answer_foramt_reward": 0.5078125, + "rewards/rank_overall_format_reward": 0.9296875, + "rewards/rank_think_format_reward": 0.3830379396677017, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.921875, + "epoch": 0.12, + "grad_norm": 0.02840823121368885, + "kl": -7.599592208862305e-06, + "learning_rate": 1.999999715755407e-05, + "loss": -0.0276, + "reward": 0.8312882781028748, + "reward_std": 0.4621574282646179, + "rewards/mrr_reward": 0.2509300671517849, + "rewards/rank_answer_foramt_reward": 0.5234375, + "rewards/rank_overall_format_reward": 0.921875, + "rewards/rank_think_format_reward": 0.31334864534437656, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.390625, + "epoch": 0.128, + "grad_norm": 0.025292610749602318, + "kl": -5.260109901428223e-06, + "learning_rate": 1.9999994946762974e-05, + "loss": -0.0246, + "reward": 0.9414703845977783, + "reward_std": 0.5526015311479568, + "rewards/mrr_reward": 0.36406249925494194, + "rewards/rank_answer_foramt_reward": 0.482421875, + "rewards/rank_overall_format_reward": 0.859375, + "rewards/rank_think_format_reward": 0.4079238325357437, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.046875, + "epoch": 0.136, + "grad_norm": 0.03076692298054695, + "kl": -3.769993782043457e-06, + "learning_rate": 1.999999210431752e-05, + "loss": 0.0048, + "reward": 0.8599012494087219, + "reward_std": 0.49315596371889114, + "rewards/mrr_reward": 0.28183284401893616, + "rewards/rank_answer_foramt_reward": 0.53515625, + "rewards/rank_overall_format_reward": 0.8984375, + "rewards/rank_think_format_reward": 0.31812864542007446, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.5, + "epoch": 0.144, + "grad_norm": 0.029196709394454956, + "kl": 2.5480985641479492e-06, + "learning_rate": 1.9999988630217885e-05, + "loss": -0.0552, + "reward": 0.9807900786399841, + "reward_std": 0.5464348271489143, + "rewards/mrr_reward": 0.38268229365348816, + "rewards/rank_answer_foramt_reward": 0.623046875, + "rewards/rank_overall_format_reward": 0.8828125, + "rewards/rank_think_format_reward": 0.306588314473629, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.65625, + "epoch": 0.152, + "grad_norm": 0.03444257006049156, + "kl": 7.301568984985352e-07, + "learning_rate": 1.999998452446429e-05, + "loss": -0.0274, + "reward": 0.7828617691993713, + "reward_std": 0.38783423602581024, + "rewards/mrr_reward": 0.2496279813349247, + "rewards/rank_answer_foramt_reward": 0.48046875, + "rewards/rank_overall_format_reward": 0.8984375, + "rewards/rank_think_format_reward": 0.23695369437336922, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.46875, + "epoch": 0.16, + "grad_norm": 0.029183639213442802, + "kl": 8.150935173034668e-06, + "learning_rate": 1.9999979787056998e-05, + "loss": -0.0822, + "reward": 1.052710935473442, + "reward_std": 0.5361286401748657, + "rewards/mrr_reward": 0.420355923473835, + "rewards/rank_answer_foramt_reward": 0.650390625, + "rewards/rank_overall_format_reward": 0.890625, + "rewards/rank_think_format_reward": 0.3752116933465004, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.703125, + "epoch": 0.168, + "grad_norm": 0.028047969564795494, + "kl": 1.2278556823730469e-05, + "learning_rate": 1.9999974417996303e-05, + "loss": -0.0809, + "reward": 1.107970878481865, + "reward_std": 0.5233071744441986, + "rewards/mrr_reward": 0.43270088732242584, + "rewards/rank_answer_foramt_reward": 0.62109375, + "rewards/rank_overall_format_reward": 0.890625, + "rewards/rank_think_format_reward": 0.5345538482069969, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.203125, + "epoch": 0.176, + "grad_norm": 0.03128313645720482, + "kl": 1.9982457160949707e-05, + "learning_rate": 1.9999968417282542e-05, + "loss": -0.0821, + "reward": 0.8156533539295197, + "reward_std": 0.43370306491851807, + "rewards/mrr_reward": 0.25281498208642006, + "rewards/rank_answer_foramt_reward": 0.498046875, + "rewards/rank_overall_format_reward": 0.890625, + "rewards/rank_think_format_reward": 0.3168988637626171, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.640625, + "epoch": 0.184, + "grad_norm": 0.02993021160364151, + "kl": 2.4840235710144043e-05, + "learning_rate": 1.99999617849161e-05, + "loss": -0.078, + "reward": 1.129832923412323, + "reward_std": 0.5531031638383865, + "rewards/mrr_reward": 0.47254466265439987, + "rewards/rank_answer_foramt_reward": 0.705078125, + "rewards/rank_overall_format_reward": 0.8671875, + "rewards/rank_think_format_reward": 0.4195169061422348, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.828125, + "epoch": 0.192, + "grad_norm": 0.03055522032082081, + "kl": 4.00543212890625e-05, + "learning_rate": 1.9999954520897394e-05, + "loss": -0.0252, + "reward": 0.9006818234920502, + "reward_std": 0.5223089158535004, + "rewards/mrr_reward": 0.3359374925494194, + "rewards/rank_answer_foramt_reward": 0.525390625, + "rewards/rank_overall_format_reward": 0.9296875, + "rewards/rank_think_format_reward": 0.25626825354993343, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.796875, + "epoch": 0.2, + "grad_norm": 0.03174176439642906, + "kl": 3.859400749206543e-05, + "learning_rate": 1.999994662522688e-05, + "loss": -0.0891, + "reward": 0.9934758394956589, + "reward_std": 0.4950134977698326, + "rewards/mrr_reward": 0.3896019347012043, + "rewards/rank_answer_foramt_reward": 0.5703125, + "rewards/rank_overall_format_reward": 0.8984375, + "rewards/rank_think_format_reward": 0.3611707091331482, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.921875, + "epoch": 0.208, + "grad_norm": 0.03093676082789898, + "kl": 7.84844160079956e-05, + "learning_rate": 1.9999938097905064e-05, + "loss": -0.1107, + "reward": 0.7378821074962616, + "reward_std": 0.46790947765111923, + "rewards/mrr_reward": 0.2100694440305233, + "rewards/rank_answer_foramt_reward": 0.486328125, + "rewards/rank_overall_format_reward": 0.8828125, + "rewards/rank_think_format_reward": 0.23029159009456635, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.296875, + "epoch": 0.216, + "grad_norm": 0.03421920910477638, + "kl": 8.234381675720215e-05, + "learning_rate": 1.9999928938932473e-05, + "loss": -0.0799, + "reward": 0.9299369752407074, + "reward_std": 0.48957522213459015, + "rewards/mrr_reward": 0.3561817966401577, + "rewards/rank_answer_foramt_reward": 0.619140625, + "rewards/rank_overall_format_reward": 0.8984375, + "rewards/rank_think_format_reward": 0.22107390873134136, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.4375, + "epoch": 0.224, + "grad_norm": 0.03421920910477638, + "kl": 0.00018423795700073242, + "learning_rate": 1.9999928938932473e-05, + "loss": -0.0439, + "reward": 0.9257623851299286, + "reward_std": 0.5316600203514099, + "rewards/mrr_reward": 0.36640625819563866, + "rewards/rank_answer_foramt_reward": 0.59375, + "rewards/rank_overall_format_reward": 0.84375, + "rewards/rank_think_format_reward": 0.25751855596899986, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.609375, + "epoch": 0.232, + "grad_norm": 0.02869362011551857, + "kl": 0.0001027137041091919, + "learning_rate": 1.99999191483097e-05, + "loss": -0.0072, + "reward": 0.849075511097908, + "reward_std": 0.39980996400117874, + "rewards/mrr_reward": 0.31562500074505806, + "rewards/rank_answer_foramt_reward": 0.44921875, + "rewards/rank_overall_format_reward": 0.8984375, + "rewards/rank_think_format_reward": 0.26886044442653656, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.328125, + "epoch": 0.24, + "grad_norm": 0.028789300471544266, + "kl": 0.00010889768600463867, + "learning_rate": 1.999990872603735e-05, + "loss": -0.056, + "reward": 1.0844270288944244, + "reward_std": 0.4819156527519226, + "rewards/mrr_reward": 0.3760416656732559, + "rewards/rank_answer_foramt_reward": 0.625, + "rewards/rank_overall_format_reward": 0.953125, + "rewards/rank_think_format_reward": 0.5684971548616886, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.5, + "epoch": 0.248, + "grad_norm": 0.03265060856938362, + "kl": 0.00014287233352661133, + "learning_rate": 1.999989767211609e-05, + "loss": -0.0575, + "reward": 1.005102053284645, + "reward_std": 0.44889208674430847, + "rewards/mrr_reward": 0.3782986141741276, + "rewards/rank_answer_foramt_reward": 0.59375, + "rewards/rank_overall_format_reward": 0.953125, + "rewards/rank_think_format_reward": 0.35252929478883743, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.171875, + "epoch": 0.256, + "grad_norm": 0.027822472155094147, + "kl": 0.00015693902969360352, + "learning_rate": 1.9999885986546613e-05, + "loss": -0.0599, + "reward": 0.9155676811933517, + "reward_std": 0.4416455924510956, + "rewards/mrr_reward": 0.3112413324415684, + "rewards/rank_answer_foramt_reward": 0.607421875, + "rewards/rank_overall_format_reward": 0.9296875, + "rewards/rank_think_format_reward": 0.29418253153562546, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.421875, + "epoch": 0.264, + "grad_norm": 0.03131254017353058, + "kl": 0.00016450881958007812, + "learning_rate": 1.999987366932966e-05, + "loss": -0.0832, + "reward": 0.7057739198207855, + "reward_std": 0.4360158443450928, + "rewards/mrr_reward": 0.19101562350988388, + "rewards/rank_answer_foramt_reward": 0.439453125, + "rewards/rank_overall_format_reward": 0.8984375, + "rewards/rank_think_format_reward": 0.221982903778553, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.0, + "epoch": 0.272, + "grad_norm": 0.031528931111097336, + "kl": 0.00021272897720336914, + "learning_rate": 1.9999860720466007e-05, + "loss": 0.0174, + "reward": 0.8981269598007202, + "reward_std": 0.43555907905101776, + "rewards/mrr_reward": 0.33803943544626236, + "rewards/rank_answer_foramt_reward": 0.478515625, + "rewards/rank_overall_format_reward": 0.9140625, + "rewards/rank_think_format_reward": 0.304656695574522, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.171875, + "epoch": 0.28, + "grad_norm": 0.029030395671725273, + "kl": 0.00024181604385375977, + "learning_rate": 1.9999847139956477e-05, + "loss": 0.0231, + "reward": 1.1540020108222961, + "reward_std": 0.45568330585956573, + "rewards/mrr_reward": 0.4646453410387039, + "rewards/rank_answer_foramt_reward": 0.732421875, + "rewards/rank_overall_format_reward": 0.9296875, + "rewards/rank_think_format_reward": 0.4268500804901123, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.3125, + "epoch": 0.288, + "grad_norm": 0.03088027611374855, + "kl": 0.0003453493118286133, + "learning_rate": 1.9999832927801922e-05, + "loss": -0.0633, + "reward": 0.9360256493091583, + "reward_std": 0.4951440170407295, + "rewards/mrr_reward": 0.330078125, + "rewards/rank_answer_foramt_reward": 0.5703125, + "rewards/rank_overall_format_reward": 0.9453125, + "rewards/rank_think_format_reward": 0.32057957723736763, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.546875, + "epoch": 0.296, + "grad_norm": 0.033634252846241, + "kl": 0.00039967894554138184, + "learning_rate": 1.9999818084003243e-05, + "loss": -0.0662, + "reward": 0.989772766828537, + "reward_std": 0.4721188619732857, + "rewards/mrr_reward": 0.36536458879709244, + "rewards/rank_answer_foramt_reward": 0.62109375, + "rewards/rank_overall_format_reward": 0.9375, + "rewards/rank_think_format_reward": 0.33355215936899185, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.65625, + "epoch": 0.304, + "grad_norm": 0.03363899141550064, + "kl": 0.0003497004508972168, + "learning_rate": 1.999980260856137e-05, + "loss": -0.0616, + "reward": 1.0276555567979813, + "reward_std": 0.49545609951019287, + "rewards/mrr_reward": 0.4115767106413841, + "rewards/rank_answer_foramt_reward": 0.6171875, + "rewards/rank_overall_format_reward": 0.953125, + "rewards/rank_think_format_reward": 0.29659304022789, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.5, + "epoch": 0.312, + "grad_norm": 0.03134270757436752, + "kl": 0.00044792890548706055, + "learning_rate": 1.9999786501477298e-05, + "loss": -0.0625, + "reward": 0.9894662201404572, + "reward_std": 0.4404422789812088, + "rewards/mrr_reward": 0.3252232186496258, + "rewards/rank_answer_foramt_reward": 0.673828125, + "rewards/rank_overall_format_reward": 0.921875, + "rewards/rank_think_format_reward": 0.4171544536948204, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.5625, + "epoch": 0.32, + "grad_norm": 0.03180496767163277, + "kl": 0.0005990266799926758, + "learning_rate": 1.9999769762752024e-05, + "loss": -0.034, + "reward": 0.9837304353713989, + "reward_std": 0.5629166960716248, + "rewards/mrr_reward": 0.3815104216337204, + "rewards/rank_answer_foramt_reward": 0.5625, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.29365903325378895, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.5625, + "epoch": 0.328, + "grad_norm": 0.03363664820790291, + "kl": 0.0006002187728881836, + "learning_rate": 1.999975239238662e-05, + "loss": -0.0186, + "reward": 1.0903609842061996, + "reward_std": 0.6452237367630005, + "rewards/mrr_reward": 0.5052083358168602, + "rewards/rank_answer_foramt_reward": 0.60546875, + "rewards/rank_overall_format_reward": 0.9375, + "rewards/rank_think_format_reward": 0.2302209585905075, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.953125, + "epoch": 0.336, + "grad_norm": 0.02957266755402088, + "kl": 0.000903010368347168, + "learning_rate": 1.999973439038218e-05, + "loss": -0.0295, + "reward": 1.0027846843004227, + "reward_std": 0.4366834908723831, + "rewards/mrr_reward": 0.3347470201551914, + "rewards/rank_answer_foramt_reward": 0.6328125, + "rewards/rank_overall_format_reward": 0.953125, + "rewards/rank_think_format_reward": 0.43841899931430817, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.25, + "epoch": 0.344, + "grad_norm": 0.03136228397488594, + "kl": 0.000525057315826416, + "learning_rate": 1.9999715756739833e-05, + "loss": -0.0375, + "reward": 1.043868064880371, + "reward_std": 0.45916447043418884, + "rewards/mrr_reward": 0.38901908695697784, + "rewards/rank_answer_foramt_reward": 0.67578125, + "rewards/rank_overall_format_reward": 0.9296875, + "rewards/rank_think_format_reward": 0.37892188876867294, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.234375, + "epoch": 0.352, + "grad_norm": 0.031588468700647354, + "kl": 0.0005010366439819336, + "learning_rate": 1.9999696491460764e-05, + "loss": -0.0386, + "reward": 1.1304609179496765, + "reward_std": 0.41832099854946136, + "rewards/mrr_reward": 0.43151041865348816, + "rewards/rank_answer_foramt_reward": 0.75, + "rewards/rank_overall_format_reward": 0.9296875, + "rewards/rank_think_format_reward": 0.43834417313337326, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.390625, + "epoch": 0.36, + "grad_norm": 0.032946065068244934, + "kl": 0.0008701086044311523, + "learning_rate": 1.9999676594546187e-05, + "loss": -0.0529, + "reward": 1.1123294532299042, + "reward_std": 0.5354420319199562, + "rewards/mrr_reward": 0.46336185187101364, + "rewards/rank_answer_foramt_reward": 0.68359375, + "rewards/rank_overall_format_reward": 0.9296875, + "rewards/rank_think_format_reward": 0.35328710824251175, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.125, + "epoch": 0.368, + "grad_norm": 0.029322165995836258, + "kl": 0.0005701780319213867, + "learning_rate": 1.999965606599736e-05, + "loss": -0.0847, + "reward": 1.107217699289322, + "reward_std": 0.4067194238305092, + "rewards/mrr_reward": 0.3920392580330372, + "rewards/rank_answer_foramt_reward": 0.69140625, + "rewards/rank_overall_format_reward": 0.9453125, + "rewards/rank_think_format_reward": 0.5304885134100914, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.6875, + "epoch": 0.376, + "grad_norm": 0.0324987918138504, + "kl": 0.0009363889694213867, + "learning_rate": 1.999963490581558e-05, + "loss": -0.0823, + "reward": 0.7899350076913834, + "reward_std": 0.40959134697914124, + "rewards/mrr_reward": 0.21142169833183289, + "rewards/rank_answer_foramt_reward": 0.5703125, + "rewards/rank_overall_format_reward": 0.9375, + "rewards/rank_think_format_reward": 0.24525804258883, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.078125, + "epoch": 0.384, + "grad_norm": 0.03164631128311157, + "kl": 0.0010160207748413086, + "learning_rate": 1.9999613114002184e-05, + "loss": -0.0145, + "reward": 1.3335086703300476, + "reward_std": 0.3851042538881302, + "rewards/mrr_reward": 0.6021019294857979, + "rewards/rank_answer_foramt_reward": 0.787109375, + "rewards/rank_overall_format_reward": 0.9296875, + "rewards/rank_think_format_reward": 0.4995870888233185, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.875, + "epoch": 0.392, + "grad_norm": 0.03086424618959427, + "kl": 0.0010756254196166992, + "learning_rate": 1.9999590690558545e-05, + "loss": -0.0188, + "reward": 1.048632025718689, + "reward_std": 0.5718654319643974, + "rewards/mrr_reward": 0.40448908507823944, + "rewards/rank_answer_foramt_reward": 0.650390625, + "rewards/rank_overall_format_reward": 0.9375, + "rewards/rank_think_format_reward": 0.3640574663877487, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.6875, + "epoch": 0.4, + "grad_norm": 0.03133213520050049, + "kl": 0.0009970664978027344, + "learning_rate": 1.9999567635486086e-05, + "loss": -0.0647, + "reward": 1.1414598375558853, + "reward_std": 0.3979858383536339, + "rewards/mrr_reward": 0.4474826380610466, + "rewards/rank_answer_foramt_reward": 0.693359375, + "rewards/rank_overall_format_reward": 0.953125, + "rewards/rank_think_format_reward": 0.45647673308849335, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.578125, + "epoch": 0.408, + "grad_norm": 0.031969401985406876, + "kl": 0.0011034011840820312, + "learning_rate": 1.9999543948786258e-05, + "loss": -0.0894, + "reward": 1.0822753310203552, + "reward_std": 0.48015688359737396, + "rewards/mrr_reward": 0.4179687649011612, + "rewards/rank_answer_foramt_reward": 0.640625, + "rewards/rank_overall_format_reward": 0.953125, + "rewards/rank_think_format_reward": 0.419300127774477, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.890625, + "epoch": 0.416, + "grad_norm": 0.03478927165269852, + "kl": 0.0011467933654785156, + "learning_rate": 1.9999519630460554e-05, + "loss": -0.0107, + "reward": 1.0111391097307205, + "reward_std": 0.4753674492239952, + "rewards/mrr_reward": 0.4023437537252903, + "rewards/rank_answer_foramt_reward": 0.59375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.2588968575000763, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.703125, + "epoch": 0.424, + "grad_norm": 0.036083146929740906, + "kl": 0.0012824535369873047, + "learning_rate": 1.999949468051052e-05, + "loss": -0.0088, + "reward": 1.0059151947498322, + "reward_std": 0.45006410777568817, + "rewards/mrr_reward": 0.30140748247504234, + "rewards/rank_answer_foramt_reward": 0.599609375, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.566512443125248, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.4375, + "epoch": 0.432, + "grad_norm": 0.031547144055366516, + "kl": 0.0010249614715576172, + "learning_rate": 1.9999469098937726e-05, + "loss": -0.0604, + "reward": 1.0756309181451797, + "reward_std": 0.41381075978279114, + "rewards/mrr_reward": 0.3790246248245239, + "rewards/rank_answer_foramt_reward": 0.6875, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.4624905288219452, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.234375, + "epoch": 0.44, + "grad_norm": 0.03417390212416649, + "kl": 0.001626729965209961, + "learning_rate": 1.9999442885743785e-05, + "loss": -0.049, + "reward": 1.037936955690384, + "reward_std": 0.5517462939023972, + "rewards/mrr_reward": 0.4401041716337204, + "rewards/rank_answer_foramt_reward": 0.583984375, + "rewards/rank_overall_format_reward": 0.9375, + "rewards/rank_think_format_reward": 0.2901299186050892, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.46875, + "epoch": 0.448, + "grad_norm": 0.03233444690704346, + "kl": 0.0018727779388427734, + "learning_rate": 1.9999416040930354e-05, + "loss": -0.0382, + "reward": 0.9221065938472748, + "reward_std": 0.46301373839378357, + "rewards/mrr_reward": 0.289515133947134, + "rewards/rank_answer_foramt_reward": 0.611328125, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.344678096473217, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.15625, + "epoch": 0.456, + "grad_norm": 0.03557359054684639, + "kl": 0.0014951229095458984, + "learning_rate": 1.9999388564499135e-05, + "loss": -0.0302, + "reward": 1.0118384808301926, + "reward_std": 0.40575922280550003, + "rewards/mrr_reward": 0.3382130526006222, + "rewards/rank_answer_foramt_reward": 0.708984375, + "rewards/rank_overall_format_reward": 0.9453125, + "rewards/rank_think_format_reward": 0.38699227198958397, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.53125, + "epoch": 0.464, + "grad_norm": 0.03160971775650978, + "kl": 0.0015277862548828125, + "learning_rate": 1.999936045645186e-05, + "loss": -0.0462, + "reward": 1.2952305674552917, + "reward_std": 0.4332849085330963, + "rewards/mrr_reward": 0.5801215022802353, + "rewards/rank_answer_foramt_reward": 0.7578125, + "rewards/rank_overall_format_reward": 0.9375, + "rewards/rank_think_format_reward": 0.4716843515634537, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.84375, + "epoch": 0.472, + "grad_norm": 0.029569024220108986, + "kl": 0.0018584728240966797, + "learning_rate": 1.9999331716790303e-05, + "loss": -0.0289, + "reward": 1.2610860168933868, + "reward_std": 0.42367615550756454, + "rewards/mrr_reward": 0.489583320915699, + "rewards/rank_answer_foramt_reward": 0.802734375, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.5664023458957672, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.640625, + "epoch": 0.48, + "grad_norm": 0.031776271760463715, + "kl": 0.001566171646118164, + "learning_rate": 1.9999302345516278e-05, + "loss": -0.048, + "reward": 1.31424281001091, + "reward_std": 0.5264566540718079, + "rewards/mrr_reward": 0.5922247171401978, + "rewards/rank_answer_foramt_reward": 0.767578125, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.45941800996661186, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.859375, + "epoch": 0.488, + "grad_norm": 0.0326690673828125, + "kl": 0.003124237060546875, + "learning_rate": 1.9999272342631644e-05, + "loss": -0.0727, + "reward": 1.1519178003072739, + "reward_std": 0.41105426847934723, + "rewards/mrr_reward": 0.4376183748245239, + "rewards/rank_answer_foramt_reward": 0.771484375, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.43212172016501427, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.59375, + "epoch": 0.496, + "grad_norm": 0.032641906291246414, + "kl": 0.003032207489013672, + "learning_rate": 1.9999241708138296e-05, + "loss": -0.0419, + "reward": 1.0120218098163605, + "reward_std": 0.5163902416825294, + "rewards/mrr_reward": 0.37465277686715126, + "rewards/rank_answer_foramt_reward": 0.619140625, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.3513430394232273, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.953125, + "epoch": 0.504, + "grad_norm": 0.03364777937531471, + "kl": 0.0018007755279541016, + "learning_rate": 1.9999210442038164e-05, + "loss": 0.0066, + "reward": 1.251243233680725, + "reward_std": 0.46613559126853943, + "rewards/mrr_reward": 0.5524925589561462, + "rewards/rank_answer_foramt_reward": 0.80078125, + "rewards/rank_overall_format_reward": 0.9453125, + "rewards/rank_think_format_reward": 0.3713323250412941, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.546875, + "epoch": 0.512, + "grad_norm": 0.03263048082590103, + "kl": 0.002674579620361328, + "learning_rate": 1.9999178544333228e-05, + "loss": -0.0261, + "reward": 1.2319741547107697, + "reward_std": 0.40322718769311905, + "rewards/mrr_reward": 0.5384114533662796, + "rewards/rank_answer_foramt_reward": 0.7578125, + "rewards/rank_overall_format_reward": 0.9453125, + "rewards/rank_think_format_reward": 0.3985799662768841, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.5, + "epoch": 0.52, + "grad_norm": 0.04350940138101578, + "kl": 0.002279043197631836, + "learning_rate": 1.9999146015025503e-05, + "loss": -0.0436, + "reward": 1.2106666564941406, + "reward_std": 0.5054564848542213, + "rewards/mrr_reward": 0.5012090876698494, + "rewards/rank_answer_foramt_reward": 0.771484375, + "rewards/rank_overall_format_reward": 0.9375, + "rewards/rank_think_format_reward": 0.44088681042194366, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.046875, + "epoch": 0.528, + "grad_norm": 0.0323958545923233, + "kl": 0.0024566650390625, + "learning_rate": 1.999911285411704e-05, + "loss": -0.0498, + "reward": 0.9955967366695404, + "reward_std": 0.2997688129544258, + "rewards/mrr_reward": 0.2881200350821018, + "rewards/rank_answer_foramt_reward": 0.70703125, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.4680873528122902, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.515625, + "epoch": 0.536, + "grad_norm": 0.03429020941257477, + "kl": 0.0021696090698242188, + "learning_rate": 1.9999079061609933e-05, + "loss": -0.0498, + "reward": 1.1947258114814758, + "reward_std": 0.3867759630084038, + "rewards/mrr_reward": 0.4760354682803154, + "rewards/rank_answer_foramt_reward": 0.78515625, + "rewards/rank_overall_format_reward": 0.9296875, + "rewards/rank_think_format_reward": 0.46300553530454636, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.96875, + "epoch": 0.544, + "grad_norm": 0.04061827063560486, + "kl": 0.003525972366333008, + "learning_rate": 1.999904463750632e-05, + "loss": 0.0386, + "reward": 1.1350408345460892, + "reward_std": 0.4857459217309952, + "rewards/mrr_reward": 0.487152773886919, + "rewards/rank_answer_foramt_reward": 0.716796875, + "rewards/rank_overall_format_reward": 0.953125, + "rewards/rank_think_format_reward": 0.2933751530945301, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.46875, + "epoch": 0.552, + "grad_norm": 0.036203574389219284, + "kl": 0.0033969879150390625, + "learning_rate": 1.999900958180838e-05, + "loss": -0.0501, + "reward": 1.3717794716358185, + "reward_std": 0.34492378681898117, + "rewards/mrr_reward": 0.613802082836628, + "rewards/rank_answer_foramt_reward": 0.853515625, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.4668227881193161, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.59375, + "epoch": 0.56, + "grad_norm": 0.03513631597161293, + "kl": 0.0030303001403808594, + "learning_rate": 1.9998973894518318e-05, + "loss": -0.0498, + "reward": 1.38496533036232, + "reward_std": 0.565589427947998, + "rewards/mrr_reward": 0.6923362985253334, + "rewards/rank_answer_foramt_reward": 0.7890625, + "rewards/rank_overall_format_reward": 0.9375, + "rewards/rank_think_format_reward": 0.37231335788965225, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.578125, + "epoch": 0.568, + "grad_norm": 0.037784941494464874, + "kl": 0.005275249481201172, + "learning_rate": 1.999893757563839e-05, + "loss": -0.0125, + "reward": 1.1960534453392029, + "reward_std": 0.4160301834344864, + "rewards/mrr_reward": 0.48038194328546524, + "rewards/rank_answer_foramt_reward": 0.78515625, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.4147949740290642, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.625, + "epoch": 0.576, + "grad_norm": 0.040549177676439285, + "kl": 0.004588603973388672, + "learning_rate": 1.9998900625170897e-05, + "loss": -0.0169, + "reward": 1.429105520248413, + "reward_std": 0.4955332353711128, + "rewards/mrr_reward": 0.6686197817325592, + "rewards/rank_answer_foramt_reward": 0.873046875, + "rewards/rank_overall_format_reward": 0.921875, + "rewards/rank_think_format_reward": 0.5095802322030067, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.578125, + "epoch": 0.584, + "grad_norm": 0.03511694818735123, + "kl": 0.003040313720703125, + "learning_rate": 1.9998863043118163e-05, + "loss": -0.0441, + "reward": 0.9159589856863022, + "reward_std": 0.38665422797203064, + "rewards/mrr_reward": 0.22663691639900208, + "rewards/rank_answer_foramt_reward": 0.576171875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.5283078029751778, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.3125, + "epoch": 0.592, + "grad_norm": 0.03343343734741211, + "kl": 0.0038709640502929688, + "learning_rate": 1.999882482948257e-05, + "loss": -0.0629, + "reward": 1.190420851111412, + "reward_std": 0.47938936948776245, + "rewards/mrr_reward": 0.45703125, + "rewards/rank_answer_foramt_reward": 0.708984375, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.5446582287549973, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.015625, + "epoch": 0.6, + "grad_norm": 0.03709420561790466, + "kl": 0.003997325897216797, + "learning_rate": 1.999878598426653e-05, + "loss": -0.0283, + "reward": 1.1767661273479462, + "reward_std": 0.46029242873191833, + "rewards/mrr_reward": 0.4516245126724243, + "rewards/rank_answer_foramt_reward": 0.771484375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.4493520185351372, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.59375, + "epoch": 0.608, + "grad_norm": 0.03993833065032959, + "kl": 0.003428936004638672, + "learning_rate": 1.9998746507472493e-05, + "loss": -0.0233, + "reward": 1.3970292508602142, + "reward_std": 0.4703235626220703, + "rewards/mrr_reward": 0.6536458432674408, + "rewards/rank_answer_foramt_reward": 0.83203125, + "rewards/rank_overall_format_reward": 0.9453125, + "rewards/rank_think_format_reward": 0.47533298283815384, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.546875, + "epoch": 0.616, + "grad_norm": 0.03496386855840683, + "kl": 0.0029549598693847656, + "learning_rate": 1.999870639910296e-05, + "loss": -0.0372, + "reward": 1.3616310954093933, + "reward_std": 0.3801681473851204, + "rewards/mrr_reward": 0.5703992992639542, + "rewards/rank_answer_foramt_reward": 0.833984375, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.5949375629425049, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.90625, + "epoch": 0.624, + "grad_norm": 0.039075274020433426, + "kl": 0.005405902862548828, + "learning_rate": 1.9998665659160453e-05, + "loss": -0.0099, + "reward": 1.1831572949886322, + "reward_std": 0.48211684823036194, + "rewards/mrr_reward": 0.4921874850988388, + "rewards/rank_answer_foramt_reward": 0.689453125, + "rewards/rank_overall_format_reward": 0.9375, + "rewards/rank_think_format_reward": 0.46689455583691597, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.546875, + "epoch": 0.632, + "grad_norm": 0.03583746775984764, + "kl": 0.007263660430908203, + "learning_rate": 1.999862428764756e-05, + "loss": -0.057, + "reward": 1.1416280269622803, + "reward_std": 0.49653460085392, + "rewards/mrr_reward": 0.41783855855464935, + "rewards/rank_answer_foramt_reward": 0.689453125, + "rewards/rank_overall_format_reward": 0.953125, + "rewards/rank_think_format_reward": 0.5507232397794724, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.84375, + "epoch": 0.64, + "grad_norm": 0.03996375948190689, + "kl": 0.0040721893310546875, + "learning_rate": 1.9998582284566878e-05, + "loss": -0.0417, + "reward": 1.2116663455963135, + "reward_std": 0.4182490184903145, + "rewards/mrr_reward": 0.4896267279982567, + "rewards/rank_answer_foramt_reward": 0.755859375, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.4712018519639969, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.046875, + "epoch": 0.648, + "grad_norm": 0.03645450249314308, + "kl": 0.0038928985595703125, + "learning_rate": 1.999853964992107e-05, + "loss": -0.0315, + "reward": 1.1708963364362717, + "reward_std": 0.4123397395014763, + "rewards/mrr_reward": 0.4096788167953491, + "rewards/rank_answer_foramt_reward": 0.7734375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.5489069819450378, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.25, + "epoch": 0.656, + "grad_norm": 0.03626188263297081, + "kl": 0.004076957702636719, + "learning_rate": 1.9998496383712828e-05, + "loss": -0.0268, + "reward": 1.4341766834259033, + "reward_std": 0.3844763785600662, + "rewards/mrr_reward": 0.651041679084301, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.5000894367694855, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.765625, + "epoch": 0.664, + "grad_norm": 0.03594069182872772, + "kl": 0.006040096282958984, + "learning_rate": 1.999845248594489e-05, + "loss": -0.0422, + "reward": 1.225644826889038, + "reward_std": 0.45316731184720993, + "rewards/mrr_reward": 0.45325520634651184, + "rewards/rank_answer_foramt_reward": 0.798828125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.5495589375495911, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.078125, + "epoch": 0.672, + "grad_norm": 0.03165300935506821, + "kl": 0.0038886070251464844, + "learning_rate": 1.9998407956620017e-05, + "loss": -0.0321, + "reward": 1.4280948042869568, + "reward_std": 0.45007024705410004, + "rewards/mrr_reward": 0.6236979365348816, + "rewards/rank_answer_foramt_reward": 0.841796875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.6113943159580231, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.21875, + "epoch": 0.68, + "grad_norm": 0.03456708416342735, + "kl": 0.00469207763671875, + "learning_rate": 1.9998362795741027e-05, + "loss": -0.033, + "reward": 1.0892604291439056, + "reward_std": 0.34771857038140297, + "rewards/mrr_reward": 0.32194321043789387, + "rewards/rank_answer_foramt_reward": 0.78125, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.583016149699688, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 388.171875, + "epoch": 0.688, + "grad_norm": 0.03425449877977371, + "kl": 0.004755496978759766, + "learning_rate": 1.9998317003310775e-05, + "loss": -0.0464, + "reward": 1.2324982285499573, + "reward_std": 0.42087381333112717, + "rewards/mrr_reward": 0.4638020992279053, + "rewards/rank_answer_foramt_reward": 0.798828125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.5539913699030876, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.65625, + "epoch": 0.696, + "grad_norm": 0.04177427664399147, + "kl": 0.00435638427734375, + "learning_rate": 1.9998270579332154e-05, + "loss": -0.0672, + "reward": 1.3639829754829407, + "reward_std": 0.43886173516511917, + "rewards/mrr_reward": 0.6211123615503311, + "rewards/rank_answer_foramt_reward": 0.833984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.42495106160640717, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.25, + "epoch": 0.704, + "grad_norm": 0.03630625456571579, + "kl": 0.0060329437255859375, + "learning_rate": 1.9998223523808092e-05, + "loss": 0.0047, + "reward": 1.08887879550457, + "reward_std": 0.4105418995022774, + "rewards/mrr_reward": 0.3867373540997505, + "rewards/rank_answer_foramt_reward": 0.65234375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.4987950399518013, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.265625, + "epoch": 0.712, + "grad_norm": 0.033998582512140274, + "kl": 0.004588127136230469, + "learning_rate": 1.9998175836741564e-05, + "loss": -0.0166, + "reward": 1.3856353461742401, + "reward_std": 0.5018515959382057, + "rewards/mrr_reward": 0.5744357854127884, + "rewards/rank_answer_foramt_reward": 0.802734375, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.6866960972547531, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completion_length": 391.59375, + "epoch": 0.72, + "grad_norm": 0.03614083677530289, + "kl": 0.005847930908203125, + "learning_rate": 1.999812751813558e-05, + "loss": -0.0538, + "reward": 1.2505441904067993, + "reward_std": 0.39697666093707085, + "rewards/mrr_reward": 0.4752671793103218, + "rewards/rank_answer_foramt_reward": 0.814453125, + "rewards/rank_overall_format_reward": 0.953125, + "rewards/rank_think_format_reward": 0.5817459300160408, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 378.15625, + "epoch": 0.728, + "grad_norm": 0.03517737612128258, + "kl": 0.006634712219238281, + "learning_rate": 1.9998078567993197e-05, + "loss": -0.0462, + "reward": 1.4160068929195404, + "reward_std": 0.3253961279988289, + "rewards/mrr_reward": 0.6407738029956818, + "rewards/rank_answer_foramt_reward": 0.853515625, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.5191128998994827, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.6875, + "epoch": 0.736, + "grad_norm": 0.03939124941825867, + "kl": 0.006640434265136719, + "learning_rate": 1.9998028986317504e-05, + "loss": 0.0157, + "reward": 1.2075020372867584, + "reward_std": 0.48353683948516846, + "rewards/mrr_reward": 0.5126488097012043, + "rewards/rank_answer_foramt_reward": 0.748046875, + "rewards/rank_overall_format_reward": 0.9296875, + "rewards/rank_think_format_reward": 0.42788132280111313, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 395.0625, + "epoch": 0.744, + "grad_norm": 0.036580055952072144, + "kl": 0.0062274932861328125, + "learning_rate": 1.999797877311163e-05, + "loss": -0.0228, + "reward": 1.3180749416351318, + "reward_std": 0.3634856082499027, + "rewards/mrr_reward": 0.49606895446777344, + "rewards/rank_answer_foramt_reward": 0.830078125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.6842865198850632, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.203125, + "epoch": 0.752, + "grad_norm": 0.03729122877120972, + "kl": 0.0060596466064453125, + "learning_rate": 1.9997927928378753e-05, + "loss": -0.0667, + "reward": 1.5598929524421692, + "reward_std": 0.28483540937304497, + "rewards/mrr_reward": 0.7404017746448517, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.5653376057744026, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 414.5625, + "epoch": 0.76, + "grad_norm": 0.037984032183885574, + "kl": 0.004604816436767578, + "learning_rate": 1.999787645212208e-05, + "loss": -0.0151, + "reward": 1.3950347006320953, + "reward_std": 0.4405433312058449, + "rewards/mrr_reward": 0.5818328410387039, + "rewards/rank_answer_foramt_reward": 0.77734375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7025292068719864, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.5, + "epoch": 0.768, + "grad_norm": 0.03686251863837242, + "kl": 0.0077533721923828125, + "learning_rate": 1.999782434434486e-05, + "loss": 0.0062, + "reward": 1.0413940846920013, + "reward_std": 0.45301979780197144, + "rewards/mrr_reward": 0.3227802626788616, + "rewards/rank_answer_foramt_reward": 0.626953125, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.5819142758846283, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 393.125, + "epoch": 0.776, + "grad_norm": 0.035745490342378616, + "kl": 0.0069942474365234375, + "learning_rate": 1.999777160505039e-05, + "loss": -0.056, + "reward": 1.2480146288871765, + "reward_std": 0.38091614469885826, + "rewards/mrr_reward": 0.4531250074505806, + "rewards/rank_answer_foramt_reward": 0.806640625, + "rewards/rank_overall_format_reward": 0.953125, + "rewards/rank_think_format_reward": 0.6489906013011932, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.96875, + "epoch": 0.784, + "grad_norm": 0.03713075444102287, + "kl": 0.007597923278808594, + "learning_rate": 1.9997718234242e-05, + "loss": -0.0048, + "reward": 1.2264422178268433, + "reward_std": 0.4423217736184597, + "rewards/mrr_reward": 0.46302084624767303, + "rewards/rank_answer_foramt_reward": 0.744140625, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.5926948338747025, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.484375, + "epoch": 0.792, + "grad_norm": 0.03935292363166809, + "kl": 0.009016036987304688, + "learning_rate": 1.999766423192306e-05, + "loss": -0.0447, + "reward": 1.442174106836319, + "reward_std": 0.2968660295009613, + "rewards/mrr_reward": 0.6010602712631226, + "rewards/rank_answer_foramt_reward": 0.912109375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.6523452401161194, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.90625, + "epoch": 0.8, + "grad_norm": 0.040015578269958496, + "kl": 0.00775146484375, + "learning_rate": 1.9997609598096982e-05, + "loss": 0.0258, + "reward": 1.3165824115276337, + "reward_std": 0.4892076849937439, + "rewards/mrr_reward": 0.546875, + "rewards/rank_answer_foramt_reward": 0.78125, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.5902589708566666, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.125, + "epoch": 0.808, + "grad_norm": 0.037578992545604706, + "kl": 0.007415771484375, + "learning_rate": 1.9997554332767214e-05, + "loss": -0.0399, + "reward": 1.4174852073192596, + "reward_std": 0.41939637064933777, + "rewards/mrr_reward": 0.6032738238573074, + "rewards/rank_answer_foramt_reward": 0.873046875, + "rewards/rank_overall_format_reward": 0.953125, + "rewards/rank_think_format_reward": 0.6411352306604385, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completion_length": 385.046875, + "epoch": 0.816, + "grad_norm": 0.03692830353975296, + "kl": 0.007373809814453125, + "learning_rate": 1.9997498435937254e-05, + "loss": -0.0086, + "reward": 1.2665570676326752, + "reward_std": 0.4313989281654358, + "rewards/mrr_reward": 0.53735176846385, + "rewards/rank_answer_foramt_reward": 0.8203125, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.42846303433179855, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.96875, + "epoch": 0.824, + "grad_norm": 0.03523707389831543, + "kl": 0.007399559020996094, + "learning_rate": 1.9997441907610624e-05, + "loss": 0.056, + "reward": 1.2068978399038315, + "reward_std": 0.3156552240252495, + "rewards/mrr_reward": 0.43065476045012474, + "rewards/rank_answer_foramt_reward": 0.826171875, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.5573297291994095, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completion_length": 420.421875, + "epoch": 0.832, + "grad_norm": 0.03504414111375809, + "kl": 0.00534820556640625, + "learning_rate": 1.9997384747790903e-05, + "loss": -0.0279, + "reward": 1.2453699111938477, + "reward_std": 0.3804602213203907, + "rewards/mrr_reward": 0.44001737236976624, + "rewards/rank_answer_foramt_reward": 0.84765625, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.6162433475255966, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 413.515625, + "epoch": 0.84, + "grad_norm": 0.033324431627988815, + "kl": 0.0075550079345703125, + "learning_rate": 1.9997326956481693e-05, + "loss": -0.0007, + "reward": 1.3570080995559692, + "reward_std": 0.35840654745697975, + "rewards/mrr_reward": 0.5557477697730064, + "rewards/rank_answer_foramt_reward": 0.830078125, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.6370457410812378, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.03125, + "epoch": 0.848, + "grad_norm": 0.034413669258356094, + "kl": 0.005972862243652344, + "learning_rate": 1.999726853368665e-05, + "loss": -0.0128, + "reward": 1.4416370689868927, + "reward_std": 0.435688741505146, + "rewards/mrr_reward": 0.6221354156732559, + "rewards/rank_answer_foramt_reward": 0.828125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.6630256772041321, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.8125, + "epoch": 0.856, + "grad_norm": 0.0455174446105957, + "kl": 0.007664680480957031, + "learning_rate": 1.9997209479409464e-05, + "loss": 0.0547, + "reward": 1.3583179116249084, + "reward_std": 0.33279163390398026, + "rewards/mrr_reward": 0.5312686040997505, + "rewards/rank_answer_foramt_reward": 0.857421875, + "rewards/rank_overall_format_reward": 0.9375, + "rewards/rank_think_format_reward": 0.7112879157066345, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completion_length": 405.125, + "epoch": 0.864, + "grad_norm": 0.03769733011722565, + "kl": 0.007842063903808594, + "learning_rate": 1.9997149793653862e-05, + "loss": -0.0215, + "reward": 1.5695316791534424, + "reward_std": 0.4045773670077324, + "rewards/mrr_reward": 0.7216145843267441, + "rewards/rank_answer_foramt_reward": 0.875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7022580057382584, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.25, + "epoch": 0.872, + "grad_norm": 0.044221311807632446, + "kl": 0.009271621704101562, + "learning_rate": 1.9997089476423617e-05, + "loss": 0.0187, + "reward": 1.373361498117447, + "reward_std": 0.4413030967116356, + "rewards/mrr_reward": 0.5739335417747498, + "rewards/rank_answer_foramt_reward": 0.84375, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.6178214848041534, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.984375, + "epoch": 0.88, + "grad_norm": 0.03868413344025612, + "kl": 0.009458541870117188, + "learning_rate": 1.999702852772254e-05, + "loss": -0.0229, + "reward": 1.2501296997070312, + "reward_std": 0.34093382209539413, + "rewards/mrr_reward": 0.4071986749768257, + "rewards/rank_answer_foramt_reward": 0.84375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7183988690376282, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.921875, + "epoch": 0.888, + "grad_norm": 0.041314590722322464, + "kl": 0.00878143310546875, + "learning_rate": 1.9996966947554476e-05, + "loss": -0.0452, + "reward": 1.5108999907970428, + "reward_std": 0.38177699968218803, + "rewards/mrr_reward": 0.6616443544626236, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.6848299354314804, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completion_length": 390.09375, + "epoch": 0.896, + "grad_norm": 0.03469943255186081, + "kl": 0.008004188537597656, + "learning_rate": 1.9996904735923325e-05, + "loss": -0.024, + "reward": 1.4337850511074066, + "reward_std": 0.3383907675743103, + "rewards/mrr_reward": 0.604687511920929, + "rewards/rank_answer_foramt_reward": 0.884765625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.6432760506868362, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.625, + "epoch": 0.904, + "grad_norm": 0.03499722108244896, + "kl": 0.008179664611816406, + "learning_rate": 1.9996841892833e-05, + "loss": 0.009, + "reward": 1.4004603326320648, + "reward_std": 0.4079892486333847, + "rewards/mrr_reward": 0.597135417163372, + "rewards/rank_answer_foramt_reward": 0.828125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.6218177676200867, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.703125, + "epoch": 0.912, + "grad_norm": 0.03794240951538086, + "kl": 0.007323265075683594, + "learning_rate": 1.9996778418287486e-05, + "loss": -0.0338, + "reward": 1.2410678267478943, + "reward_std": 0.3803473189473152, + "rewards/mrr_reward": 0.4220486134290695, + "rewards/rank_answer_foramt_reward": 0.78125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7084388732910156, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.453125, + "epoch": 0.92, + "grad_norm": 0.03945469111204147, + "kl": 0.010195732116699219, + "learning_rate": 1.9996714312290784e-05, + "loss": -0.0435, + "reward": 1.2030570209026337, + "reward_std": 0.4484190344810486, + "rewards/mrr_reward": 0.38446180522441864, + "rewards/rank_answer_foramt_reward": 0.759765625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7364507168531418, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 420.8125, + "epoch": 0.928, + "grad_norm": 0.03646434098482132, + "kl": 0.008108139038085938, + "learning_rate": 1.9996649574846948e-05, + "loss": -0.0451, + "reward": 1.4321197271347046, + "reward_std": 0.3283313438296318, + "rewards/mrr_reward": 0.5520399212837219, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7587052434682846, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 420.859375, + "epoch": 0.936, + "grad_norm": 0.040539614856243134, + "kl": 0.009557723999023438, + "learning_rate": 1.9996584205960063e-05, + "loss": -0.0017, + "reward": 1.408842772245407, + "reward_std": 0.3340775966644287, + "rewards/mrr_reward": 0.533283744007349, + "rewards/rank_answer_foramt_reward": 0.88671875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7664903402328491, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.03125, + "epoch": 0.944, + "grad_norm": 0.03920678421854973, + "kl": 0.009622573852539062, + "learning_rate": 1.999651820563426e-05, + "loss": -0.0548, + "reward": 1.2891173958778381, + "reward_std": 0.3157798573374748, + "rewards/mrr_reward": 0.4310639798641205, + "rewards/rank_answer_foramt_reward": 0.83203125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7759429067373276, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 405.359375, + "epoch": 0.952, + "grad_norm": 0.09537135064601898, + "kl": 0.0254364013671875, + "learning_rate": 1.999645157387371e-05, + "loss": -0.0146, + "reward": 1.5277230143547058, + "reward_std": 0.3659735471010208, + "rewards/mrr_reward": 0.6688368320465088, + "rewards/rank_answer_foramt_reward": 0.927734375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.6905757486820221, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.421875, + "epoch": 0.96, + "grad_norm": 0.04111702740192413, + "kl": 0.008625030517578125, + "learning_rate": 1.9996384310682615e-05, + "loss": -0.0319, + "reward": 1.2409729659557343, + "reward_std": 0.3955169692635536, + "rewards/mrr_reward": 0.40638699010014534, + "rewards/rank_answer_foramt_reward": 0.814453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7224076837301254, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 397.828125, + "epoch": 0.968, + "grad_norm": 0.03895534947514534, + "kl": 0.008586883544921875, + "learning_rate": 1.999631641606523e-05, + "loss": -0.0318, + "reward": 1.2535248398780823, + "reward_std": 0.4473446235060692, + "rewards/mrr_reward": 0.46145833283662796, + "rewards/rank_answer_foramt_reward": 0.74609375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.6775450259447098, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.40625, + "epoch": 0.976, + "grad_norm": 0.0410754568874836, + "kl": 0.008921623229980469, + "learning_rate": 1.9996247890025845e-05, + "loss": 0.03, + "reward": 1.274911493062973, + "reward_std": 0.4455215707421303, + "rewards/mrr_reward": 0.4238405302166939, + "rewards/rank_answer_foramt_reward": 0.84375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7430653125047684, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 395.328125, + "epoch": 0.984, + "grad_norm": 0.038471318781375885, + "kl": 0.008017539978027344, + "learning_rate": 1.9996178732568784e-05, + "loss": -0.0218, + "reward": 1.1934560984373093, + "reward_std": 0.37501702457666397, + "rewards/mrr_reward": 0.38322172313928604, + "rewards/rank_answer_foramt_reward": 0.79296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.6700992956757545, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completion_length": 429.859375, + "epoch": 0.992, + "grad_norm": 0.03736767917871475, + "kl": 0.009922027587890625, + "learning_rate": 1.9996108943698412e-05, + "loss": -0.0288, + "reward": 1.4828196465969086, + "reward_std": 0.34185753017663956, + "rewards/mrr_reward": 0.6228484585881233, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 0.953125, + "rewards/rank_think_format_reward": 0.764176219701767, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 382.84375, + "epoch": 1.0, + "grad_norm": 0.03988206014037132, + "kl": 0.009454727172851562, + "learning_rate": 1.9996038523419148e-05, + "loss": -0.0202, + "reward": 1.2809572219848633, + "reward_std": 0.4308247435837984, + "rewards/mrr_reward": 0.4690104201436043, + "rewards/rank_answer_foramt_reward": 0.775390625, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7084915935993195, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 409.46875, + "epoch": 0.1008, + "grad_norm": 0.03588635101914406, + "kl": 0.007241249084472656, + "learning_rate": 1.7583619152887222e-05, + "loss": -0.0384, + "reward": 1.2436591684818268, + "reward_std": 0.4599437266588211, + "rewards/mrr_reward": 0.4638392850756645, + "rewards/rank_answer_foramt_reward": 0.765625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.6130905151367188, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.03125, + "epoch": 0.1016, + "grad_norm": 0.042928650975227356, + "kl": 0.009435653686523438, + "learning_rate": 1.754251380736104e-05, + "loss": 0.0143, + "reward": 1.1844342947006226, + "reward_std": 0.42420684546232224, + "rewards/mrr_reward": 0.36152033507823944, + "rewards/rank_answer_foramt_reward": 0.712890625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7807879894971848, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.09375, + "epoch": 0.1024, + "grad_norm": 0.03468465059995651, + "kl": 0.009843826293945312, + "learning_rate": 1.7501110696304598e-05, + "loss": 0.0222, + "reward": 1.2601959109306335, + "reward_std": 0.40002964437007904, + "rewards/mrr_reward": 0.4385974779725075, + "rewards/rank_answer_foramt_reward": 0.818359375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.694770097732544, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.75, + "epoch": 0.1032, + "grad_norm": 0.037175022065639496, + "kl": 0.00865936279296875, + "learning_rate": 1.7459411454241822e-05, + "loss": -0.0003, + "reward": 1.2285803258419037, + "reward_std": 0.305690661072731, + "rewards/mrr_reward": 0.4261036813259125, + "rewards/rank_answer_foramt_reward": 0.75, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.6895599216222763, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completion_length": 397.796875, + "epoch": 0.104, + "grad_norm": 0.04238922521471977, + "kl": 0.008852005004882812, + "learning_rate": 1.7417417727387392e-05, + "loss": -0.0508, + "reward": 1.2500934600830078, + "reward_std": 0.4739305451512337, + "rewards/mrr_reward": 0.4687500074505806, + "rewards/rank_answer_foramt_reward": 0.712890625, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.6938792169094086, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 414.546875, + "epoch": 0.1048, + "grad_norm": 0.039994798600673676, + "kl": 0.008302688598632812, + "learning_rate": 1.737513117358174e-05, + "loss": -0.0044, + "reward": 1.3667091131210327, + "reward_std": 0.4646128937602043, + "rewards/mrr_reward": 0.5651041865348816, + "rewards/rank_answer_foramt_reward": 0.751953125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.6849651783704758, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completion_length": 436.765625, + "epoch": 0.1056, + "grad_norm": 0.045952655375003815, + "kl": 0.007769584655761719, + "learning_rate": 1.7332553462225604e-05, + "loss": -0.0423, + "reward": 1.362520307302475, + "reward_std": 0.40486711636185646, + "rewards/mrr_reward": 0.5388020724058151, + "rewards/rank_answer_foramt_reward": 0.833984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.6621311977505684, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.953125, + "epoch": 0.1064, + "grad_norm": 0.03839990124106407, + "kl": 0.009153366088867188, + "learning_rate": 1.7289686274214116e-05, + "loss": -0.0437, + "reward": 1.3655548691749573, + "reward_std": 0.3156409915536642, + "rewards/mrr_reward": 0.5556609630584717, + "rewards/rank_answer_foramt_reward": 0.876953125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.6007082015275955, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completion_length": 413.703125, + "epoch": 0.1072, + "grad_norm": 0.04110146313905716, + "kl": 0.009965896606445312, + "learning_rate": 1.7246531301870467e-05, + "loss": 0.0237, + "reward": 1.2509280890226364, + "reward_std": 0.3098420277237892, + "rewards/mrr_reward": 0.42760417610406876, + "rewards/rank_answer_foramt_reward": 0.802734375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7078114748001099, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.90625, + "epoch": 0.108, + "grad_norm": 0.03654169663786888, + "kl": 0.011074066162109375, + "learning_rate": 1.720309024887907e-05, + "loss": -0.0004, + "reward": 1.2900923788547516, + "reward_std": 0.30175913497805595, + "rewards/mrr_reward": 0.4537264332175255, + "rewards/rank_answer_foramt_reward": 0.833984375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7238953113555908, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.65625, + "epoch": 0.1088, + "grad_norm": 0.03870633617043495, + "kl": 0.0076389312744140625, + "learning_rate": 1.7159364830218312e-05, + "loss": -0.0273, + "reward": 1.4046232402324677, + "reward_std": 0.1458736453205347, + "rewards/mrr_reward": 0.5324838608503342, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7541746199131012, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.453125, + "epoch": 0.1096, + "grad_norm": 0.03937549516558647, + "kl": 0.0086822509765625, + "learning_rate": 1.7115356772092858e-05, + "loss": -0.0347, + "reward": 1.4993912875652313, + "reward_std": 0.3712911829352379, + "rewards/mrr_reward": 0.6232638955116272, + "rewards/rank_answer_foramt_reward": 0.900390625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7701657563447952, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completion_length": 413.609375, + "epoch": 0.1104, + "grad_norm": 0.03570104390382767, + "kl": 0.008465766906738281, + "learning_rate": 1.7071067811865477e-05, + "loss": -0.0247, + "reward": 1.6328608393669128, + "reward_std": 0.45016467571258545, + "rewards/mrr_reward": 0.8177083283662796, + "rewards/rank_answer_foramt_reward": 0.869140625, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.6400808244943619, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.46875, + "epoch": 0.1112, + "grad_norm": 0.03384555131196976, + "kl": 0.00774383544921875, + "learning_rate": 1.7026499697988496e-05, + "loss": -0.0086, + "reward": 1.325139045715332, + "reward_std": 0.34500668570399284, + "rewards/mrr_reward": 0.46684029698371887, + "rewards/rank_answer_foramt_reward": 0.833984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7669208198785782, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.390625, + "epoch": 0.112, + "grad_norm": 0.03522227331995964, + "kl": 0.007678031921386719, + "learning_rate": 1.698165418993473e-05, + "loss": 0.0335, + "reward": 1.4752865731716156, + "reward_std": 0.341037068516016, + "rewards/mrr_reward": 0.6348958387970924, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.6892164349555969, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.140625, + "epoch": 0.1128, + "grad_norm": 0.03765201196074486, + "kl": 0.008716583251953125, + "learning_rate": 1.693653305812805e-05, + "loss": -0.0408, + "reward": 1.3769253194332123, + "reward_std": 0.3529956042766571, + "rewards/mrr_reward": 0.5243489742279053, + "rewards/rank_answer_foramt_reward": 0.875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7085645794868469, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.390625, + "epoch": 0.1136, + "grad_norm": 0.04156142473220825, + "kl": 0.01380157470703125, + "learning_rate": 1.6891138083873486e-05, + "loss": -0.0025, + "reward": 1.199166625738144, + "reward_std": 0.3892873339354992, + "rewards/mrr_reward": 0.3583643361926079, + "rewards/rank_answer_foramt_reward": 0.822265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7334325760602951, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.328125, + "epoch": 0.1144, + "grad_norm": 0.04161708801984787, + "kl": 0.008413314819335938, + "learning_rate": 1.684547105928689e-05, + "loss": 0.0184, + "reward": 1.47340127825737, + "reward_std": 0.5706894248723984, + "rewards/mrr_reward": 0.6829427182674408, + "rewards/rank_answer_foramt_reward": 0.79296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.6179851442575455, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.890625, + "epoch": 0.1152, + "grad_norm": 0.038589105010032654, + "kl": 0.007936477661132812, + "learning_rate": 1.6799533787224192e-05, + "loss": -0.033, + "reward": 1.3689055740833282, + "reward_std": 0.29699520394206047, + "rewards/mrr_reward": 0.5373697876930237, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.6291802376508713, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 429.03125, + "epoch": 0.116, + "grad_norm": 0.034039001911878586, + "kl": 0.006977081298828125, + "learning_rate": 1.6753328081210244e-05, + "loss": -0.0048, + "reward": 1.5173978507518768, + "reward_std": 0.41116751730442047, + "rewards/mrr_reward": 0.6428571343421936, + "rewards/rank_answer_foramt_reward": 0.84765625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8024669140577316, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.9375, + "epoch": 0.1168, + "grad_norm": 0.03563903272151947, + "kl": 0.008948326110839844, + "learning_rate": 1.6706855765367202e-05, + "loss": -0.0162, + "reward": 1.3468570411205292, + "reward_std": 0.36933452636003494, + "rewards/mrr_reward": 0.510937511920929, + "rewards/rank_answer_foramt_reward": 0.703125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8455893844366074, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.421875, + "epoch": 0.1176, + "grad_norm": 0.036863867193460464, + "kl": 0.008589744567871094, + "learning_rate": 1.666011867434252e-05, + "loss": -0.0109, + "reward": 1.2238706946372986, + "reward_std": 0.38623613119125366, + "rewards/mrr_reward": 0.3937686011195183, + "rewards/rank_answer_foramt_reward": 0.779296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7439763844013214, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.609375, + "epoch": 0.1184, + "grad_norm": 0.03305863216519356, + "kl": 0.007039070129394531, + "learning_rate": 1.661311865323652e-05, + "loss": -0.017, + "reward": 1.4561704695224762, + "reward_std": 0.2617410905659199, + "rewards/mrr_reward": 0.5868675485253334, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7123762518167496, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.46875, + "epoch": 0.1192, + "grad_norm": 0.03671317920088768, + "kl": 0.008609771728515625, + "learning_rate": 1.6565857557529567e-05, + "loss": -0.0152, + "reward": 1.1992796063423157, + "reward_std": 0.40606988221406937, + "rewards/mrr_reward": 0.3986979275941849, + "rewards/rank_answer_foramt_reward": 0.6875, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7619423866271973, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.0, + "epoch": 0.12, + "grad_norm": 0.035398129373788834, + "kl": 0.007327079772949219, + "learning_rate": 1.651833725300879e-05, + "loss": -0.0072, + "reward": 1.4916549921035767, + "reward_std": 0.3184054736047983, + "rewards/mrr_reward": 0.6293340772390366, + "rewards/rank_answer_foramt_reward": 0.83203125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7810622304677963, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.796875, + "epoch": 0.1208, + "grad_norm": 0.04079672694206238, + "kl": 0.009191513061523438, + "learning_rate": 1.6470559615694445e-05, + "loss": -0.0333, + "reward": 1.3449455797672272, + "reward_std": 0.35879848897457123, + "rewards/mrr_reward": 0.5183903872966766, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.6101813167333603, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.53125, + "epoch": 0.1216, + "grad_norm": 0.037399739027023315, + "kl": 0.0074825286865234375, + "learning_rate": 1.6422526531765846e-05, + "loss": -0.0135, + "reward": 1.627054363489151, + "reward_std": 0.3690681420266628, + "rewards/mrr_reward": 0.7510850876569748, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7306240200996399, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.328125, + "epoch": 0.1224, + "grad_norm": 0.0394161157310009, + "kl": 0.009618759155273438, + "learning_rate": 1.63742398974869e-05, + "loss": -0.0023, + "reward": 1.2834027111530304, + "reward_std": 0.41649453714489937, + "rewards/mrr_reward": 0.45035962387919426, + "rewards/rank_answer_foramt_reward": 0.818359375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7216386198997498, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.046875, + "epoch": 0.1232, + "grad_norm": 0.033092282712459564, + "kl": 0.009715080261230469, + "learning_rate": 1.6325701619131246e-05, + "loss": 0.0023, + "reward": 1.5148942172527313, + "reward_std": 0.4242171198129654, + "rewards/mrr_reward": 0.6458953246474266, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.7739546746015549, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.328125, + "epoch": 0.124, + "grad_norm": 0.03961697220802307, + "kl": 0.0072040557861328125, + "learning_rate": 1.6276913612907005e-05, + "loss": 0.0073, + "reward": 1.5414968132972717, + "reward_std": 0.338295828551054, + "rewards/mrr_reward": 0.6806175708770752, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.70833420753479, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.34375, + "epoch": 0.1248, + "grad_norm": 0.03811102360486984, + "kl": 0.008966445922851562, + "learning_rate": 1.6227877804881126e-05, + "loss": -0.0576, + "reward": 1.5272436439990997, + "reward_std": 0.25212423875927925, + "rewards/mrr_reward": 0.7075520902872086, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.5815699324011803, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 425.015625, + "epoch": 0.1256, + "grad_norm": 0.040453094989061356, + "kl": 0.009019851684570312, + "learning_rate": 1.6178596130903345e-05, + "loss": 0.0118, + "reward": 1.3084798157215118, + "reward_std": 0.41541341692209244, + "rewards/mrr_reward": 0.4686570018529892, + "rewards/rank_answer_foramt_reward": 0.806640625, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7617143541574478, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.859375, + "epoch": 0.1264, + "grad_norm": 0.03529253602027893, + "kl": 0.00853729248046875, + "learning_rate": 1.6129070536529767e-05, + "loss": -0.0249, + "reward": 1.3785496950149536, + "reward_std": 0.317409735172987, + "rewards/mrr_reward": 0.5057911723852158, + "rewards/rank_answer_foramt_reward": 0.876953125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7912070602178574, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 423.328125, + "epoch": 0.1272, + "grad_norm": 0.03864454850554466, + "kl": 0.008512496948242188, + "learning_rate": 1.6079302976946055e-05, + "loss": -0.0397, + "reward": 1.2989526093006134, + "reward_std": 0.4882466495037079, + "rewards/mrr_reward": 0.4919270947575569, + "rewards/rank_answer_foramt_reward": 0.765625, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7033442407846451, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.3125, + "epoch": 0.128, + "grad_norm": 0.03776967152953148, + "kl": 0.010316848754882812, + "learning_rate": 1.602929541689025e-05, + "loss": -0.0441, + "reward": 1.2715973854064941, + "reward_std": 0.31563179939985275, + "rewards/mrr_reward": 0.4436384178698063, + "rewards/rank_answer_foramt_reward": 0.845703125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.6788883656263351, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 392.609375, + "epoch": 0.1288, + "grad_norm": 0.04325239732861519, + "kl": 0.00868988037109375, + "learning_rate": 1.597904983057519e-05, + "loss": -0.0253, + "reward": 1.3210601806640625, + "reward_std": 0.34434930980205536, + "rewards/mrr_reward": 0.46409972012043, + "rewards/rank_answer_foramt_reward": 0.900390625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.704271599650383, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.640625, + "epoch": 0.1296, + "grad_norm": 0.033945418894290924, + "kl": 0.009050369262695312, + "learning_rate": 1.5928568201610593e-05, + "loss": -0.0045, + "reward": 1.0713335573673248, + "reward_std": 0.262711264193058, + "rewards/mrr_reward": 0.23516865447163582, + "rewards/rank_answer_foramt_reward": 0.80859375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7486767023801804, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.515625, + "epoch": 0.1304, + "grad_norm": 0.035303499549627304, + "kl": 0.008235931396484375, + "learning_rate": 1.5877852522924733e-05, + "loss": -0.0111, + "reward": 1.5817199647426605, + "reward_std": 0.4066779837012291, + "rewards/mrr_reward": 0.7029947862029076, + "rewards/rank_answer_foramt_reward": 0.861328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8092876970767975, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.078125, + "epoch": 0.1312, + "grad_norm": 0.03836703300476074, + "kl": 0.008309364318847656, + "learning_rate": 1.5826904796685763e-05, + "loss": -0.0341, + "reward": 1.3209553062915802, + "reward_std": 0.3643554821610451, + "rewards/mrr_reward": 0.4703125134110451, + "rewards/rank_answer_foramt_reward": 0.791015625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7866897433996201, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.046875, + "epoch": 0.132, + "grad_norm": 0.03883913904428482, + "kl": 0.011602401733398438, + "learning_rate": 1.5775727034222675e-05, + "loss": 0.0062, + "reward": 1.370899885892868, + "reward_std": 0.38270220160484314, + "rewards/mrr_reward": 0.506225224584341, + "rewards/rank_answer_foramt_reward": 0.859375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7842886447906494, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.375, + "epoch": 0.1328, + "grad_norm": 0.03487444296479225, + "kl": 0.00785064697265625, + "learning_rate": 1.572432125594591e-05, + "loss": -0.0483, + "reward": 1.5634401440620422, + "reward_std": 0.2945178374648094, + "rewards/mrr_reward": 0.6865327507257462, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7334667444229126, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 401.65625, + "epoch": 0.1336, + "grad_norm": 0.0383358933031559, + "kl": 0.009000778198242188, + "learning_rate": 1.567268949126757e-05, + "loss": -0.0308, + "reward": 1.2559186518192291, + "reward_std": 0.3751576766371727, + "rewards/mrr_reward": 0.42760416120290756, + "rewards/rank_answer_foramt_reward": 0.8203125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7053561210632324, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.25, + "epoch": 0.1344, + "grad_norm": 0.0366198867559433, + "kl": 0.008385658264160156, + "learning_rate": 1.5620833778521306e-05, + "loss": 0.015, + "reward": 1.3973772525787354, + "reward_std": 0.39591234177351, + "rewards/mrr_reward": 0.531770870089531, + "rewards/rank_answer_foramt_reward": 0.78125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8417995721101761, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.6875, + "epoch": 0.1352, + "grad_norm": 0.03712960332632065, + "kl": 0.008477210998535156, + "learning_rate": 1.556875616488188e-05, + "loss": -0.008, + "reward": 1.428687036037445, + "reward_std": 0.3864752873778343, + "rewards/mrr_reward": 0.5722842365503311, + "rewards/rank_answer_foramt_reward": 0.857421875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7377379089593887, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.203125, + "epoch": 0.136, + "grad_norm": 0.03674308583140373, + "kl": 0.012662887573242188, + "learning_rate": 1.5516458706284306e-05, + "loss": -0.045, + "reward": 1.4177999794483185, + "reward_std": 0.37393568456172943, + "rewards/mrr_reward": 0.5833333283662796, + "rewards/rank_answer_foramt_reward": 0.857421875, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.6947023347020149, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.71875, + "epoch": 0.1368, + "grad_norm": 0.03608371317386627, + "kl": 0.007844924926757812, + "learning_rate": 1.5463943467342694e-05, + "loss": -0.009, + "reward": 1.3503046333789825, + "reward_std": 0.31038716807961464, + "rewards/mrr_reward": 0.49075521528720856, + "rewards/rank_answer_foramt_reward": 0.859375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7453198581933975, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completion_length": 420.984375, + "epoch": 0.1376, + "grad_norm": 0.040419355034828186, + "kl": 0.010079383850097656, + "learning_rate": 1.541121252126876e-05, + "loss": -0.0363, + "reward": 1.244249552488327, + "reward_std": 0.31654617190361023, + "rewards/mrr_reward": 0.4160156324505806, + "rewards/rank_answer_foramt_reward": 0.763671875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7461278587579727, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.375, + "epoch": 0.1384, + "grad_norm": 0.03765353932976723, + "kl": 0.0077056884765625, + "learning_rate": 1.5358267949789968e-05, + "loss": 0.0198, + "reward": 1.4578497409820557, + "reward_std": 0.34172070026397705, + "rewards/mrr_reward": 0.5898437574505806, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7260240614414215, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completion_length": 401.8125, + "epoch": 0.1392, + "grad_norm": 0.041146595031023026, + "kl": 0.010126113891601562, + "learning_rate": 1.5305111843067343e-05, + "loss": -0.0176, + "reward": 1.3684653639793396, + "reward_std": 0.29850760102272034, + "rewards/mrr_reward": 0.5057725831866264, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7314079403877258, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.984375, + "epoch": 0.14, + "grad_norm": 0.03607525676488876, + "kl": 0.00901031494140625, + "learning_rate": 1.5251746299612959e-05, + "loss": -0.0444, + "reward": 1.1752477288246155, + "reward_std": 0.3523693382740021, + "rewards/mrr_reward": 0.36098091304302216, + "rewards/rank_answer_foramt_reward": 0.814453125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.6764592081308365, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.75, + "epoch": 0.1408, + "grad_norm": 0.04322541132569313, + "kl": 0.010807037353515625, + "learning_rate": 1.5198173426207095e-05, + "loss": -0.0348, + "reward": 1.3782255053520203, + "reward_std": 0.33319515362381935, + "rewards/mrr_reward": 0.5212177634239197, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.6868368983268738, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.5625, + "epoch": 0.1416, + "grad_norm": 0.039602383971214294, + "kl": 0.012025833129882812, + "learning_rate": 1.5144395337815066e-05, + "loss": -0.0754, + "reward": 1.3582023978233337, + "reward_std": 0.40261131525039673, + "rewards/mrr_reward": 0.503838062286377, + "rewards/rank_answer_foramt_reward": 0.859375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7296076565980911, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completion_length": 423.125, + "epoch": 0.1424, + "grad_norm": 0.041363585740327835, + "kl": 0.009540557861328125, + "learning_rate": 1.5090414157503715e-05, + "loss": -0.0197, + "reward": 1.323040246963501, + "reward_std": 0.3850451856851578, + "rewards/mrr_reward": 0.48712798207998276, + "rewards/rank_answer_foramt_reward": 0.873046875, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.6834579259157181, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 408.5, + "epoch": 0.1432, + "grad_norm": 0.03604589402675629, + "kl": 0.009143829345703125, + "learning_rate": 1.503623201635761e-05, + "loss": -0.0201, + "reward": 1.4883578717708588, + "reward_std": 0.38600361347198486, + "rewards/mrr_reward": 0.6411458402872086, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.6844964772462845, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.96875, + "epoch": 0.144, + "grad_norm": 0.03885720670223236, + "kl": 0.00914764404296875, + "learning_rate": 1.498185105339491e-05, + "loss": -0.0262, + "reward": 1.3653124272823334, + "reward_std": 0.4237579368054867, + "rewards/mrr_reward": 0.5257812589406967, + "rewards/rank_answer_foramt_reward": 0.857421875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.702236957848072, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.140625, + "epoch": 0.1448, + "grad_norm": 0.039864372462034225, + "kl": 0.007981300354003906, + "learning_rate": 1.4927273415482916e-05, + "loss": 0.0164, + "reward": 1.3653839826583862, + "reward_std": 0.39134908467531204, + "rewards/mrr_reward": 0.5079861059784889, + "rewards/rank_answer_foramt_reward": 0.822265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7759094536304474, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.46875, + "epoch": 0.1456, + "grad_norm": 0.04062679037451744, + "kl": 0.009542465209960938, + "learning_rate": 1.4872501257253325e-05, + "loss": -0.0132, + "reward": 1.255301147699356, + "reward_std": 0.38088975846767426, + "rewards/mrr_reward": 0.4399925619363785, + "rewards/rank_answer_foramt_reward": 0.794921875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.691335141658783, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 399.890625, + "epoch": 0.1464, + "grad_norm": 0.04299888014793396, + "kl": 0.011081695556640625, + "learning_rate": 1.4817536741017153e-05, + "loss": -0.0543, + "reward": 1.4246177673339844, + "reward_std": 0.296498604118824, + "rewards/mrr_reward": 0.582961305975914, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.6559426188468933, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completion_length": 429.40625, + "epoch": 0.1472, + "grad_norm": 0.05667012929916382, + "kl": 0.02362060546875, + "learning_rate": 1.4762382036679393e-05, + "loss": -0.0457, + "reward": 1.3302158117294312, + "reward_std": 0.46241550147533417, + "rewards/mrr_reward": 0.50021081417799, + "rewards/rank_answer_foramt_reward": 0.7578125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7651665955781937, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.65625, + "epoch": 0.148, + "grad_norm": 0.03618616238236427, + "kl": 0.00848388671875, + "learning_rate": 1.470703932165333e-05, + "loss": 0.0097, + "reward": 1.4576621353626251, + "reward_std": 0.32078187353909016, + "rewards/mrr_reward": 0.5706349164247513, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7973360121250153, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.015625, + "epoch": 0.1488, + "grad_norm": 0.03927586227655411, + "kl": 0.009456634521484375, + "learning_rate": 1.4651510780774585e-05, + "loss": -0.0331, + "reward": 1.5841456949710846, + "reward_std": 0.44154639542102814, + "rewards/mrr_reward": 0.703125, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.781087726354599, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.8125, + "epoch": 0.1496, + "grad_norm": 0.037521783262491226, + "kl": 0.009569168090820312, + "learning_rate": 1.4595798606214882e-05, + "loss": 0.0342, + "reward": 1.3032934665679932, + "reward_std": 0.37545448541641235, + "rewards/mrr_reward": 0.4639943093061447, + "rewards/rank_answer_foramt_reward": 0.798828125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7679399847984314, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.4375, + "epoch": 0.1504, + "grad_norm": 0.039995163679122925, + "kl": 0.00890350341796875, + "learning_rate": 1.4539904997395468e-05, + "loss": -0.0577, + "reward": 1.563814789056778, + "reward_std": 0.28440105356276035, + "rewards/mrr_reward": 0.6919271051883698, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7182557433843613, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.9375, + "epoch": 0.1512, + "grad_norm": 0.04071548208594322, + "kl": 0.008481979370117188, + "learning_rate": 1.4483832160900326e-05, + "loss": -0.0276, + "reward": 1.3721172213554382, + "reward_std": 0.33370155096054077, + "rewards/mrr_reward": 0.5183965861797333, + "rewards/rank_answer_foramt_reward": 0.845703125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7491414994001389, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.984375, + "epoch": 0.152, + "grad_norm": 0.041358765214681625, + "kl": 0.008540153503417969, + "learning_rate": 1.442758231038902e-05, + "loss": -0.0272, + "reward": 1.6237071752548218, + "reward_std": 0.3772216849029064, + "rewards/mrr_reward": 0.7747395932674408, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.6566131561994553, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.203125, + "epoch": 0.1528, + "grad_norm": 0.033840637654066086, + "kl": 0.009374618530273438, + "learning_rate": 1.437115766650933e-05, + "loss": 0.0117, + "reward": 1.2770065069198608, + "reward_std": 0.2644694857299328, + "rewards/mrr_reward": 0.4101128578186035, + "rewards/rank_answer_foramt_reward": 0.86328125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7792940139770508, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.0625, + "epoch": 0.1536, + "grad_norm": 0.038305025547742844, + "kl": 0.009326934814453125, + "learning_rate": 1.4314560456809592e-05, + "loss": -0.0217, + "reward": 1.2699792385101318, + "reward_std": 0.3280069828033447, + "rewards/mrr_reward": 0.422588050365448, + "rewards/rank_answer_foramt_reward": 0.8046875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7631644010543823, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.390625, + "epoch": 0.1544, + "grad_norm": 0.03981156274676323, + "kl": 0.00913238525390625, + "learning_rate": 1.4257792915650728e-05, + "loss": -0.047, + "reward": 1.3255890011787415, + "reward_std": 0.3319142311811447, + "rewards/mrr_reward": 0.49487847834825516, + "rewards/rank_answer_foramt_reward": 0.833984375, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.714570015668869, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completion_length": 420.25, + "epoch": 0.1552, + "grad_norm": 0.03656027093529701, + "kl": 0.009430885314941406, + "learning_rate": 1.4200857284118067e-05, + "loss": -0.0332, + "reward": 1.4800786972045898, + "reward_std": 0.26632819697260857, + "rewards/mrr_reward": 0.5991257503628731, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7340072840452194, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.0625, + "epoch": 0.156, + "grad_norm": 0.04333413392305374, + "kl": 0.0097503662109375, + "learning_rate": 1.4143755809932843e-05, + "loss": -0.0022, + "reward": 1.4679460525512695, + "reward_std": 0.4186020493507385, + "rewards/mrr_reward": 0.6158854365348816, + "rewards/rank_answer_foramt_reward": 0.876953125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7128610759973526, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.78125, + "epoch": 0.1568, + "grad_norm": 0.03645530715584755, + "kl": 0.009119987487792969, + "learning_rate": 1.4086490747363492e-05, + "loss": -0.0124, + "reward": 1.3291675448417664, + "reward_std": 0.34505781903862953, + "rewards/mrr_reward": 0.47128596901893616, + "rewards/rank_answer_foramt_reward": 0.818359375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8047191351652145, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.9375, + "epoch": 0.1576, + "grad_norm": 0.04249008744955063, + "kl": 0.010274887084960938, + "learning_rate": 1.4029064357136628e-05, + "loss": 0.0055, + "reward": 1.4444103240966797, + "reward_std": 0.4831986427307129, + "rewards/mrr_reward": 0.6312500089406967, + "rewards/rank_answer_foramt_reward": 0.822265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.6418563276529312, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.859375, + "epoch": 0.1584, + "grad_norm": 0.038807179778814316, + "kl": 0.01303863525390625, + "learning_rate": 1.3971478906347806e-05, + "loss": 0.0025, + "reward": 1.4374900162220001, + "reward_std": 0.39226941764354706, + "rewards/mrr_reward": 0.5747395902872086, + "rewards/rank_answer_foramt_reward": 0.84375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7706450521945953, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.796875, + "epoch": 0.1592, + "grad_norm": 0.03673255071043968, + "kl": 0.008836746215820312, + "learning_rate": 1.3913736668372027e-05, + "loss": -0.0084, + "reward": 1.6338341534137726, + "reward_std": 0.3351624459028244, + "rewards/mrr_reward": 0.767447903752327, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.6820531934499741, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.0, + "epoch": 0.16, + "grad_norm": 0.03517894819378853, + "kl": 0.010756492614746094, + "learning_rate": 1.3855839922773968e-05, + "loss": -0.0186, + "reward": 1.489585041999817, + "reward_std": 0.35270126909017563, + "rewards/mrr_reward": 0.6339409798383713, + "rewards/rank_answer_foramt_reward": 0.83203125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7764543294906616, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.625, + "epoch": 0.1608, + "grad_norm": 0.039749667048454285, + "kl": 0.009137153625488281, + "learning_rate": 1.3797790955218014e-05, + "loss": -0.0303, + "reward": 1.57523912191391, + "reward_std": 0.3249845299869776, + "rewards/mrr_reward": 0.6855654865503311, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7721523940563202, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.75, + "epoch": 0.1616, + "grad_norm": 0.03552306815981865, + "kl": 0.008211135864257812, + "learning_rate": 1.3739592057378005e-05, + "loss": -0.0296, + "reward": 1.6716056764125824, + "reward_std": 0.32526458986103535, + "rewards/mrr_reward": 0.7610677182674408, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8431902080774307, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.125, + "epoch": 0.1624, + "grad_norm": 0.037600353360176086, + "kl": 0.008890151977539062, + "learning_rate": 1.3681245526846782e-05, + "loss": -0.0236, + "reward": 1.3627182841300964, + "reward_std": 0.2870614193379879, + "rewards/mrr_reward": 0.486111119389534, + "rewards/rank_answer_foramt_reward": 0.84765625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8087291121482849, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completion_length": 423.375, + "epoch": 0.1632, + "grad_norm": 0.04189027473330498, + "kl": 0.01094818115234375, + "learning_rate": 1.3622753667045459e-05, + "loss": -0.0358, + "reward": 1.410872757434845, + "reward_std": 0.3953894004225731, + "rewards/mrr_reward": 0.5746279805898666, + "rewards/rank_answer_foramt_reward": 0.845703125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.703996941447258, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.59375, + "epoch": 0.164, + "grad_norm": 0.04231363534927368, + "kl": 0.010824203491210938, + "learning_rate": 1.3564118787132507e-05, + "loss": 0.0009, + "reward": 1.3419412970542908, + "reward_std": 0.34718624502420425, + "rewards/mrr_reward": 0.49730904027819633, + "rewards/rank_answer_foramt_reward": 0.818359375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7411323189735413, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.578125, + "epoch": 0.1648, + "grad_norm": 0.04113239422440529, + "kl": 0.01050567626953125, + "learning_rate": 1.350534320191259e-05, + "loss": -0.0414, + "reward": 1.3435261249542236, + "reward_std": 0.3302622064948082, + "rewards/mrr_reward": 0.48489584773778915, + "rewards/rank_answer_foramt_reward": 0.88671875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.715191051363945, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.46875, + "epoch": 0.1656, + "grad_norm": 0.03465278074145317, + "kl": 0.008320808410644531, + "learning_rate": 1.344642923174517e-05, + "loss": -0.0085, + "reward": 1.4250126481056213, + "reward_std": 0.3028215132653713, + "rewards/mrr_reward": 0.5315724313259125, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8245819211006165, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.15625, + "epoch": 0.1664, + "grad_norm": 0.040414392948150635, + "kl": 0.011005401611328125, + "learning_rate": 1.3387379202452917e-05, + "loss": -0.0043, + "reward": 1.2858222126960754, + "reward_std": 0.3545723408460617, + "rewards/mrr_reward": 0.4399677626788616, + "rewards/rank_answer_foramt_reward": 0.8203125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.758507713675499, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 423.015625, + "epoch": 0.1672, + "grad_norm": 0.03951350972056389, + "kl": 0.010406494140625, + "learning_rate": 1.3328195445229869e-05, + "loss": -0.0136, + "reward": 1.5512892603874207, + "reward_std": 0.23556600511074066, + "rewards/mrr_reward": 0.6481771022081375, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.773812785744667, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completion_length": 436.03125, + "epoch": 0.168, + "grad_norm": 0.03597855195403099, + "kl": 0.012434005737304688, + "learning_rate": 1.3268880296549424e-05, + "loss": -0.0475, + "reward": 1.5305464267730713, + "reward_std": 0.3849369138479233, + "rewards/mrr_reward": 0.6821614503860474, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.7075821757316589, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.484375, + "epoch": 0.1688, + "grad_norm": 0.04128649830818176, + "kl": 0.009695053100585938, + "learning_rate": 1.3209436098072095e-05, + "loss": -0.0153, + "reward": 1.5931483805179596, + "reward_std": 0.24104237789288163, + "rewards/mrr_reward": 0.6974144279956818, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7768451869487762, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.125, + "epoch": 0.1696, + "grad_norm": 0.03705138713121414, + "kl": 0.008800506591796875, + "learning_rate": 1.3149865196553049e-05, + "loss": 0.0028, + "reward": 1.374686360359192, + "reward_std": 0.2328730747103691, + "rewards/mrr_reward": 0.4906249921768904, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7883486449718475, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 420.34375, + "epoch": 0.1704, + "grad_norm": 0.03573732078075409, + "kl": 0.008937835693359375, + "learning_rate": 1.3090169943749475e-05, + "loss": -0.0051, + "reward": 1.4394998252391815, + "reward_std": 0.22585050389170647, + "rewards/mrr_reward": 0.524367555975914, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.814143717288971, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completion_length": 409.578125, + "epoch": 0.1712, + "grad_norm": 0.038891687989234924, + "kl": 0.010797500610351562, + "learning_rate": 1.3030352696327741e-05, + "loss": -0.0244, + "reward": 1.3804857730865479, + "reward_std": 0.35337623208761215, + "rewards/mrr_reward": 0.5573722943663597, + "rewards/rank_answer_foramt_reward": 0.833984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.6602989211678505, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.34375, + "epoch": 0.172, + "grad_norm": 0.03773610666394234, + "kl": 0.007359504699707031, + "learning_rate": 1.297041581577035e-05, + "loss": -0.0117, + "reward": 1.4981496632099152, + "reward_std": 0.23435491137206554, + "rewards/mrr_reward": 0.6250000149011612, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7513765692710876, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 436.203125, + "epoch": 0.1728, + "grad_norm": 0.039261989295482635, + "kl": 0.01084136962890625, + "learning_rate": 1.2910361668282718e-05, + "loss": -0.039, + "reward": 1.5580702126026154, + "reward_std": 0.28226844780147076, + "rewards/mrr_reward": 0.6630394235253334, + "rewards/rank_answer_foramt_reward": 0.94140625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7708081007003784, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.375, + "epoch": 0.1736, + "grad_norm": 0.05010940507054329, + "kl": 0.008663177490234375, + "learning_rate": 1.2850192624699762e-05, + "loss": -0.0192, + "reward": 1.4300898909568787, + "reward_std": 0.4626375511288643, + "rewards/mrr_reward": 0.5627356097102165, + "rewards/rank_answer_foramt_reward": 0.833984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7943617403507233, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.546875, + "epoch": 0.1744, + "grad_norm": 0.03629588708281517, + "kl": 0.010377883911132812, + "learning_rate": 1.2789911060392295e-05, + "loss": 0.0197, + "reward": 1.5270054042339325, + "reward_std": 0.2459590807557106, + "rewards/mrr_reward": 0.6557477712631226, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.6909556984901428, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completion_length": 429.796875, + "epoch": 0.1752, + "grad_norm": 0.04659593850374222, + "kl": 0.00952911376953125, + "learning_rate": 1.2729519355173254e-05, + "loss": -0.0118, + "reward": 1.3348519802093506, + "reward_std": 0.398675125092268, + "rewards/mrr_reward": 0.5026785731315613, + "rewards/rank_answer_foramt_reward": 0.806640625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7229092568159103, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.921875, + "epoch": 0.176, + "grad_norm": 0.043882764875888824, + "kl": 0.011045455932617188, + "learning_rate": 1.2669019893203758e-05, + "loss": -0.0301, + "reward": 1.497531145811081, + "reward_std": 0.27045511081814766, + "rewards/mrr_reward": 0.6290550753474236, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.6942455917596817, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.84375, + "epoch": 0.1768, + "grad_norm": 0.03796388581395149, + "kl": 0.010257720947265625, + "learning_rate": 1.2608415062898971e-05, + "loss": -0.0241, + "reward": 1.4202575087547302, + "reward_std": 0.3259681724011898, + "rewards/mrr_reward": 0.5604166686534882, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.6915156245231628, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.1875, + "epoch": 0.1776, + "grad_norm": 0.040519829839468, + "kl": 0.009119033813476562, + "learning_rate": 1.2547707256833823e-05, + "loss": -0.0025, + "reward": 1.271849274635315, + "reward_std": 0.2725183069705963, + "rewards/mrr_reward": 0.39179687947034836, + "rewards/rank_answer_foramt_reward": 0.88671875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7879189848899841, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.34375, + "epoch": 0.1784, + "grad_norm": 0.041910942643880844, + "kl": 0.008281707763671875, + "learning_rate": 1.2486898871648552e-05, + "loss": -0.013, + "reward": 1.3902939558029175, + "reward_std": 0.3279624804854393, + "rewards/mrr_reward": 0.520114079117775, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7462835609912872, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.484375, + "epoch": 0.1792, + "grad_norm": 0.03706391900777817, + "kl": 0.008459091186523438, + "learning_rate": 1.2425992307954075e-05, + "loss": 0.0018, + "reward": 1.4472178220748901, + "reward_std": 0.3175524137914181, + "rewards/mrr_reward": 0.5600880607962608, + "rewards/rank_answer_foramt_reward": 0.876953125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8113187104463577, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.234375, + "epoch": 0.18, + "grad_norm": 0.03729787841439247, + "kl": 0.009768486022949219, + "learning_rate": 1.236498997023725e-05, + "loss": -0.0125, + "reward": 1.4809614419937134, + "reward_std": 0.340122077614069, + "rewards/mrr_reward": 0.6090277582406998, + "rewards/rank_answer_foramt_reward": 0.900390625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7496449500322342, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 420.015625, + "epoch": 0.1808, + "grad_norm": 0.04090157151222229, + "kl": 0.009912490844726562, + "learning_rate": 1.2303894266765908e-05, + "loss": 0.0045, + "reward": 1.4084090292453766, + "reward_std": 0.24517753347754478, + "rewards/mrr_reward": 0.5429625511169434, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.731939822435379, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.5, + "epoch": 0.1816, + "grad_norm": 0.040676042437553406, + "kl": 0.00809478759765625, + "learning_rate": 1.2242707609493814e-05, + "loss": -0.0272, + "reward": 1.5291298627853394, + "reward_std": 0.3408605456352234, + "rewards/mrr_reward": 0.6497396007180214, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.760521873831749, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.40625, + "epoch": 0.1824, + "grad_norm": 0.04246947541832924, + "kl": 0.009426116943359375, + "learning_rate": 1.2181432413965428e-05, + "loss": 0.0024, + "reward": 1.4244692921638489, + "reward_std": 0.2590717747807503, + "rewards/mrr_reward": 0.5588541701436043, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7129197269678116, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.375, + "epoch": 0.1832, + "grad_norm": 0.04285717383027077, + "kl": 0.008266448974609375, + "learning_rate": 1.212007109922055e-05, + "loss": -0.0374, + "reward": 1.4698918163776398, + "reward_std": 0.33165768533945084, + "rewards/mrr_reward": 0.59691222012043, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7508614361286163, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completion_length": 399.59375, + "epoch": 0.184, + "grad_norm": 0.041689760982990265, + "kl": 0.01070404052734375, + "learning_rate": 1.2058626087698814e-05, + "loss": 0.0003, + "reward": 1.43734011054039, + "reward_std": 0.44176214933395386, + "rewards/mrr_reward": 0.6163132339715958, + "rewards/rank_answer_foramt_reward": 0.8046875, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.706710159778595, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.328125, + "epoch": 0.1848, + "grad_norm": 0.03540234640240669, + "kl": 0.007287025451660156, + "learning_rate": 1.1997099805144071e-05, + "loss": 0.0284, + "reward": 1.5855993926525116, + "reward_std": 0.3010600432753563, + "rewards/mrr_reward": 0.6768229231238365, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7948835492134094, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.171875, + "epoch": 0.1856, + "grad_norm": 0.03808577358722687, + "kl": 0.009876251220703125, + "learning_rate": 1.1935494680508606e-05, + "loss": -0.0193, + "reward": 1.5138767063617706, + "reward_std": 0.46021201461553574, + "rewards/mrr_reward": 0.67578125, + "rewards/rank_answer_foramt_reward": 0.818359375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7369488328695297, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 409.125, + "epoch": 0.1864, + "grad_norm": 0.03571336343884468, + "kl": 0.010625839233398438, + "learning_rate": 1.187381314585725e-05, + "loss": -0.0271, + "reward": 1.2613760828971863, + "reward_std": 0.2764936424791813, + "rewards/mrr_reward": 0.43248388171195984, + "rewards/rank_answer_foramt_reward": 0.84765625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.6641382277011871, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.34375, + "epoch": 0.1872, + "grad_norm": 0.03995465487241745, + "kl": 0.008295059204101562, + "learning_rate": 1.1812057636271374e-05, + "loss": -0.0114, + "reward": 1.4478269517421722, + "reward_std": 0.3669227175414562, + "rewards/mrr_reward": 0.5865017399191856, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7272636741399765, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completion_length": 436.84375, + "epoch": 0.188, + "grad_norm": 0.04256037250161171, + "kl": 0.00879669189453125, + "learning_rate": 1.1750230589752763e-05, + "loss": -0.0233, + "reward": 1.298891007900238, + "reward_std": 0.3018713817000389, + "rewards/mrr_reward": 0.4497953839600086, + "rewards/rank_answer_foramt_reward": 0.814453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7741887420415878, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.671875, + "epoch": 0.1888, + "grad_norm": 0.039598457515239716, + "kl": 0.009709358215332031, + "learning_rate": 1.1688334447127338e-05, + "loss": -0.0192, + "reward": 1.6007757186889648, + "reward_std": 0.4135289564728737, + "rewards/mrr_reward": 0.744140625, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.7384417653083801, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.78125, + "epoch": 0.1896, + "grad_norm": 0.036554981023073196, + "kl": 0.009065628051757812, + "learning_rate": 1.1626371651948839e-05, + "loss": 0.0093, + "reward": 1.5135816633701324, + "reward_std": 0.2605090048164129, + "rewards/mrr_reward": 0.640625, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7722761482000351, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completion_length": 423.78125, + "epoch": 0.1904, + "grad_norm": 0.04034959152340889, + "kl": 0.009461402893066406, + "learning_rate": 1.156434465040231e-05, + "loss": -0.0206, + "reward": 1.374279260635376, + "reward_std": 0.3329411558806896, + "rewards/mrr_reward": 0.5505580306053162, + "rewards/rank_answer_foramt_reward": 0.806640625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.705109030008316, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.9375, + "epoch": 0.1912, + "grad_norm": 0.0426294319331646, + "kl": 0.009059906005859375, + "learning_rate": 1.1502255891207572e-05, + "loss": 0.0192, + "reward": 1.5010081231594086, + "reward_std": 0.37140223383903503, + "rewards/mrr_reward": 0.6446304619312286, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.6985992938280106, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.609375, + "epoch": 0.192, + "grad_norm": 0.03961321711540222, + "kl": 0.0083465576171875, + "learning_rate": 1.1440107825522522e-05, + "loss": -0.0221, + "reward": 1.4463289082050323, + "reward_std": 0.2782457806169987, + "rewards/mrr_reward": 0.5504092425107956, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8067047744989395, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.828125, + "epoch": 0.1928, + "grad_norm": 0.04309277981519699, + "kl": 0.0095672607421875, + "learning_rate": 1.137790290684638e-05, + "loss": -0.024, + "reward": 1.3144385814666748, + "reward_std": 0.26049618795514107, + "rewards/mrr_reward": 0.4519035294651985, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.719211108982563, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.59375, + "epoch": 0.1936, + "grad_norm": 0.04275864362716675, + "kl": 0.009810447692871094, + "learning_rate": 1.1315643590922827e-05, + "loss": -0.0326, + "reward": 1.597327709197998, + "reward_std": 0.22876879945397377, + "rewards/mrr_reward": 0.7057291716337204, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.731110468506813, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.71875, + "epoch": 0.1944, + "grad_norm": 0.03837655857205391, + "kl": 0.011631011962890625, + "learning_rate": 1.1253332335643043e-05, + "loss": -0.0353, + "reward": 1.4636397659778595, + "reward_std": 0.2928088903427124, + "rewards/mrr_reward": 0.5801215246319771, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7417809367179871, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.1875, + "epoch": 0.1952, + "grad_norm": 0.039478182792663574, + "kl": 0.013881683349609375, + "learning_rate": 1.11909716009487e-05, + "loss": -0.0426, + "reward": 1.2862295508384705, + "reward_std": 0.2817831374704838, + "rewards/mrr_reward": 0.43656374514102936, + "rewards/rank_answer_foramt_reward": 0.818359375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7641979157924652, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.90625, + "epoch": 0.196, + "grad_norm": 0.04093409702181816, + "kl": 0.008157730102539062, + "learning_rate": 1.1128563848734817e-05, + "loss": -0.0431, + "reward": 1.3213240504264832, + "reward_std": 0.3792931139469147, + "rewards/mrr_reward": 0.4646267518401146, + "rewards/rank_answer_foramt_reward": 0.833984375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7776929587125778, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.609375, + "epoch": 0.1968, + "grad_norm": 0.038951508700847626, + "kl": 0.0119476318359375, + "learning_rate": 1.10661115427526e-05, + "loss": -0.0258, + "reward": 1.4929006099700928, + "reward_std": 0.19486585073173046, + "rewards/mrr_reward": 0.5957589447498322, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.781110942363739, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.84375, + "epoch": 0.1976, + "grad_norm": 0.040840230882167816, + "kl": 0.008116722106933594, + "learning_rate": 1.1003617148512149e-05, + "loss": 0.0099, + "reward": 1.4765380024909973, + "reward_std": 0.30231093615293503, + "rewards/mrr_reward": 0.5662698447704315, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8423726558685303, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.390625, + "epoch": 0.1984, + "grad_norm": 0.03562074154615402, + "kl": 0.010061264038085938, + "learning_rate": 1.0941083133185146e-05, + "loss": -0.0065, + "reward": 1.5500171780586243, + "reward_std": 0.37310022860765457, + "rewards/mrr_reward": 0.6682291775941849, + "rewards/rank_answer_foramt_reward": 0.88671875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7931784391403198, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.421875, + "epoch": 0.1992, + "grad_norm": 0.040016964077949524, + "kl": 0.007778167724609375, + "learning_rate": 1.0878511965507435e-05, + "loss": -0.0046, + "reward": 1.4200845062732697, + "reward_std": 0.2748546898365021, + "rewards/mrr_reward": 0.5281250104308128, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7712667435407639, + "step": 249 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.953125, + "epoch": 0.2, + "grad_norm": 0.0408429279923439, + "kl": 0.008358001708984375, + "learning_rate": 1.0815906115681579e-05, + "loss": -0.0003, + "reward": 1.4024586379528046, + "reward_std": 0.45451923459768295, + "rewards/mrr_reward": 0.5480654612183571, + "rewards/rank_answer_foramt_reward": 0.8203125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.792195051908493, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.390625, + "epoch": 0.2008, + "grad_norm": 0.044342394918203354, + "kl": 0.011260986328125, + "learning_rate": 1.0753268055279328e-05, + "loss": -0.0243, + "reward": 1.4417364001274109, + "reward_std": 0.32018742710351944, + "rewards/mrr_reward": 0.599479153752327, + "rewards/rank_answer_foramt_reward": 0.849609375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7026848942041397, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.015625, + "epoch": 0.2016, + "grad_norm": 0.0681619718670845, + "kl": 0.021749496459960938, + "learning_rate": 1.0690600257144062e-05, + "loss": -0.0105, + "reward": 1.3221397995948792, + "reward_std": 0.3863202631473541, + "rewards/mrr_reward": 0.4698536768555641, + "rewards/rank_answer_foramt_reward": 0.86328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.719403862953186, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 470.546875, + "epoch": 0.2024, + "grad_norm": 0.0338444858789444, + "kl": 0.009431838989257812, + "learning_rate": 1.0627905195293135e-05, + "loss": -0.0055, + "reward": 1.4945510029792786, + "reward_std": 0.2610199525952339, + "rewards/mrr_reward": 0.5938120037317276, + "rewards/rank_answer_foramt_reward": 0.927734375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8095900267362595, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.109375, + "epoch": 0.2032, + "grad_norm": 0.04314475134015083, + "kl": 0.008597373962402344, + "learning_rate": 1.0565185344820248e-05, + "loss": -0.0178, + "reward": 1.4714539349079132, + "reward_std": 0.224628996104002, + "rewards/mrr_reward": 0.5916666761040688, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7226623743772507, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completion_length": 408.21875, + "epoch": 0.204, + "grad_norm": 0.04039366543292999, + "kl": 0.01071929931640625, + "learning_rate": 1.0502443181797696e-05, + "loss": -0.0199, + "reward": 1.255561202764511, + "reward_std": 0.34654828906059265, + "rewards/mrr_reward": 0.43823785334825516, + "rewards/rank_answer_foramt_reward": 0.8203125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.6564248204231262, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.984375, + "epoch": 0.2048, + "grad_norm": 0.04161591827869415, + "kl": 0.008603096008300781, + "learning_rate": 1.043968118317865e-05, + "loss": 0.0094, + "reward": 1.4394341111183167, + "reward_std": 0.32009443640708923, + "rewards/mrr_reward": 0.5737165212631226, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7288551479578018, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.40625, + "epoch": 0.2056, + "grad_norm": 0.038173969835042953, + "kl": 0.010286331176757812, + "learning_rate": 1.0376901826699349e-05, + "loss": -0.0254, + "reward": 1.445276916027069, + "reward_std": 0.3517743721604347, + "rewards/mrr_reward": 0.6002604365348816, + "rewards/rank_answer_foramt_reward": 0.841796875, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7422964721918106, + "step": 257 + }, + { + "clip_ratio": 0.0, + "completion_length": 436.171875, + "epoch": 0.2064, + "grad_norm": 0.03894796594977379, + "kl": 0.010990142822265625, + "learning_rate": 1.0314107590781284e-05, + "loss": -0.0134, + "reward": 1.4257104396820068, + "reward_std": 0.23646708950400352, + "rewards/mrr_reward": 0.5429191589355469, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7512968927621841, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completion_length": 405.875, + "epoch": 0.2072, + "grad_norm": 0.0407368466258049, + "kl": 0.0094451904296875, + "learning_rate": 1.0251300954433377e-05, + "loss": -0.0281, + "reward": 1.5280113816261292, + "reward_std": 0.3434094376862049, + "rewards/mrr_reward": 0.6421006992459297, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7744212746620178, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.203125, + "epoch": 0.208, + "grad_norm": 0.03855285048484802, + "kl": 0.008718490600585938, + "learning_rate": 1.0188484397154083e-05, + "loss": 0.0004, + "reward": 1.5177774131298065, + "reward_std": 0.3898182809352875, + "rewards/mrr_reward": 0.6282552182674408, + "rewards/rank_answer_foramt_reward": 0.875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8205215632915497, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 409.46875, + "epoch": 0.2088, + "grad_norm": 0.0410795584321022, + "kl": 0.01009368896484375, + "learning_rate": 1.0125660398833528e-05, + "loss": -0.0357, + "reward": 1.5540205836296082, + "reward_std": 0.2195252738893032, + "rewards/mrr_reward": 0.6762276813387871, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.743962749838829, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.125, + "epoch": 0.2096, + "grad_norm": 0.03606094419956207, + "kl": 0.011699676513671875, + "learning_rate": 1.0062831439655591e-05, + "loss": -0.0107, + "reward": 1.5303250849246979, + "reward_std": 0.22126532718539238, + "rewards/mrr_reward": 0.6565104424953461, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.67721988260746, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.84375, + "epoch": 0.2104, + "grad_norm": 0.03741401433944702, + "kl": 0.008730888366699219, + "learning_rate": 1e-05, + "loss": -0.0197, + "reward": 1.541961818933487, + "reward_std": 0.27997246757149696, + "rewards/mrr_reward": 0.6421006917953491, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7893517762422562, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.796875, + "epoch": 0.2112, + "grad_norm": 0.03900214284658432, + "kl": 0.008241653442382812, + "learning_rate": 9.937168560344412e-06, + "loss": 0.0077, + "reward": 1.4794524312019348, + "reward_std": 0.35047149658203125, + "rewards/mrr_reward": 0.5804191678762436, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7868431955575943, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.171875, + "epoch": 0.212, + "grad_norm": 0.038283322006464005, + "kl": 0.00952911376953125, + "learning_rate": 9.874339601166474e-06, + "loss": 0.0138, + "reward": 1.506893515586853, + "reward_std": 0.2615400552749634, + "rewards/mrr_reward": 0.6185701861977577, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7739198356866837, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.453125, + "epoch": 0.2128, + "grad_norm": 0.03715907782316208, + "kl": 0.009618759155273438, + "learning_rate": 9.81151560284592e-06, + "loss": -0.016, + "reward": 1.5035657286643982, + "reward_std": 0.3300909399986267, + "rewards/mrr_reward": 0.6235367059707642, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7507388442754745, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.390625, + "epoch": 0.2136, + "grad_norm": 0.041197191923856735, + "kl": 0.010580062866210938, + "learning_rate": 9.748699045566626e-06, + "loss": -0.0366, + "reward": 1.576979547739029, + "reward_std": 0.439083069562912, + "rewards/mrr_reward": 0.7342447936534882, + "rewards/rank_answer_foramt_reward": 0.884765625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.6689759790897369, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.453125, + "epoch": 0.2144, + "grad_norm": 0.041197191923856735, + "kl": 0.009737014770507812, + "learning_rate": 9.748699045566626e-06, + "loss": 0.001, + "reward": 1.4734593629837036, + "reward_std": 0.26162197068333626, + "rewards/mrr_reward": 0.5788566395640373, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7656047195196152, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.875, + "epoch": 0.2152, + "grad_norm": 0.0444633774459362, + "kl": 0.008083343505859375, + "learning_rate": 9.685892409218718e-06, + "loss": -0.0102, + "reward": 1.4165308475494385, + "reward_std": 0.26300579868257046, + "rewards/mrr_reward": 0.5317460373044014, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7495253682136536, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.75, + "epoch": 0.216, + "grad_norm": 0.0361800491809845, + "kl": 0.009550094604492188, + "learning_rate": 9.623098173300655e-06, + "loss": -0.0378, + "reward": 1.4879783987998962, + "reward_std": 0.3136373609304428, + "rewards/mrr_reward": 0.6265625134110451, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7080072462558746, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.15625, + "epoch": 0.2168, + "grad_norm": 0.04007337614893913, + "kl": 0.010549545288085938, + "learning_rate": 9.560318816821354e-06, + "loss": -0.0364, + "reward": 1.2709717452526093, + "reward_std": 0.36226093024015427, + "rewards/mrr_reward": 0.4365575388073921, + "rewards/rank_answer_foramt_reward": 0.83203125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7043090015649796, + "step": 271 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.484375, + "epoch": 0.2176, + "grad_norm": 0.039333194494247437, + "kl": 0.009454727172851562, + "learning_rate": 9.497556818202306e-06, + "loss": -0.0502, + "reward": 1.3762290477752686, + "reward_std": 0.3271942213177681, + "rewards/mrr_reward": 0.49231771379709244, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7742221802473068, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 404.953125, + "epoch": 0.2184, + "grad_norm": 0.04287045821547508, + "kl": 0.014556884765625, + "learning_rate": 9.434814655179756e-06, + "loss": -0.0263, + "reward": 1.5546920597553253, + "reward_std": 0.38028147257864475, + "rewards/mrr_reward": 0.7030567973852158, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.6939939856529236, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.21875, + "epoch": 0.2192, + "grad_norm": 0.039541661739349365, + "kl": 0.010419845581054688, + "learning_rate": 9.372094804706867e-06, + "loss": -0.0369, + "reward": 1.7257789969444275, + "reward_std": 0.29538945853710175, + "rewards/mrr_reward": 0.8723958432674408, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.6289780139923096, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.21875, + "epoch": 0.22, + "grad_norm": 0.04606436938047409, + "kl": 0.014463424682617188, + "learning_rate": 9.309399742855943e-06, + "loss": 0.0001, + "reward": 1.4452278017997742, + "reward_std": 0.3192713689059019, + "rewards/mrr_reward": 0.576227679848671, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.75247423350811, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.078125, + "epoch": 0.2208, + "grad_norm": 0.037436868995428085, + "kl": 0.0102691650390625, + "learning_rate": 9.246731944720675e-06, + "loss": -0.0397, + "reward": 1.184128776192665, + "reward_std": 0.2542712949216366, + "rewards/mrr_reward": 0.3415798544883728, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.6723189651966095, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.421875, + "epoch": 0.2216, + "grad_norm": 0.04319589212536812, + "kl": 0.010812759399414062, + "learning_rate": 9.184093884318426e-06, + "loss": -0.0151, + "reward": 1.4168269038200378, + "reward_std": 0.38048839569091797, + "rewards/mrr_reward": 0.5684895813465118, + "rewards/rank_answer_foramt_reward": 0.849609375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7211094051599503, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.390625, + "epoch": 0.2224, + "grad_norm": 0.03658146783709526, + "kl": 0.008172988891601562, + "learning_rate": 9.121488034492569e-06, + "loss": -0.006, + "reward": 1.4866581559181213, + "reward_std": 0.3485083729028702, + "rewards/mrr_reward": 0.6006448417901993, + "rewards/rank_answer_foramt_reward": 0.884765625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8001230210065842, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.71875, + "epoch": 0.2232, + "grad_norm": 0.036550384014844894, + "kl": 0.009181976318359375, + "learning_rate": 9.058916866814857e-06, + "loss": -0.0197, + "reward": 1.2319900691509247, + "reward_std": 0.2513876333832741, + "rewards/mrr_reward": 0.3517051041126251, + "rewards/rank_answer_foramt_reward": 0.875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7925300300121307, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.859375, + "epoch": 0.224, + "grad_norm": 0.03881368413567543, + "kl": 0.009093284606933594, + "learning_rate": 8.996382851487851e-06, + "loss": -0.0384, + "reward": 1.3488417267799377, + "reward_std": 0.32528745383024216, + "rewards/mrr_reward": 0.4834573529660702, + "rewards/rank_answer_foramt_reward": 0.859375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7630017250776291, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.9375, + "epoch": 0.2248, + "grad_norm": 0.04073338583111763, + "kl": 0.010313034057617188, + "learning_rate": 8.933888457247402e-06, + "loss": -0.0459, + "reward": 1.402299851179123, + "reward_std": 0.40668466687202454, + "rewards/mrr_reward": 0.5348090305924416, + "rewards/rank_answer_foramt_reward": 0.859375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7693850100040436, + "step": 281 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.1875, + "epoch": 0.2256, + "grad_norm": 0.043222904205322266, + "kl": 0.0100250244140625, + "learning_rate": 8.871436151265183e-06, + "loss": 0.0008, + "reward": 1.3178912699222565, + "reward_std": 0.3209885358810425, + "rewards/mrr_reward": 0.4478670582175255, + "rewards/rank_answer_foramt_reward": 0.84765625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7887807190418243, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.234375, + "epoch": 0.2264, + "grad_norm": 0.036993607878685, + "kl": 0.008585929870605469, + "learning_rate": 8.809028399051302e-06, + "loss": 0.0047, + "reward": 1.469320148229599, + "reward_std": 0.3071388304233551, + "rewards/mrr_reward": 0.5837797746062279, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.8065022975206375, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.703125, + "epoch": 0.2272, + "grad_norm": 0.0379331074655056, + "kl": 0.010618209838867188, + "learning_rate": 8.746667664356957e-06, + "loss": 0.0039, + "reward": 1.2527941763401031, + "reward_std": 0.23029084131121635, + "rewards/mrr_reward": 0.3807787857949734, + "rewards/rank_answer_foramt_reward": 0.861328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7889550924301147, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.546875, + "epoch": 0.228, + "grad_norm": 0.03945260867476463, + "kl": 0.010408401489257812, + "learning_rate": 8.684356409077177e-06, + "loss": 0.0084, + "reward": 1.5159841179847717, + "reward_std": 0.34652554243803024, + "rewards/mrr_reward": 0.6295944899320602, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7895446866750717, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.234375, + "epoch": 0.2288, + "grad_norm": 0.03930336609482765, + "kl": 0.008619308471679688, + "learning_rate": 8.62209709315362e-06, + "loss": -0.0207, + "reward": 1.3798122704029083, + "reward_std": 0.29061378724873066, + "rewards/mrr_reward": 0.5122581720352173, + "rewards/rank_answer_foramt_reward": 0.86328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7656702399253845, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completion_length": 420.40625, + "epoch": 0.2296, + "grad_norm": 0.04059287905693054, + "kl": 0.011449813842773438, + "learning_rate": 8.559892174477478e-06, + "loss": -0.0303, + "reward": 1.4114105701446533, + "reward_std": 0.39132068306207657, + "rewards/mrr_reward": 0.583333358168602, + "rewards/rank_answer_foramt_reward": 0.8203125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.689012199640274, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completion_length": 429.828125, + "epoch": 0.2304, + "grad_norm": 0.03984786570072174, + "kl": 0.010134696960449219, + "learning_rate": 8.49774410879243e-06, + "loss": -0.0409, + "reward": 1.2429711520671844, + "reward_std": 0.28321645595133305, + "rewards/mrr_reward": 0.3541666753590107, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7831906080245972, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.078125, + "epoch": 0.2312, + "grad_norm": 0.0415896400809288, + "kl": 0.010349273681640625, + "learning_rate": 8.43565534959769e-06, + "loss": -0.042, + "reward": 1.3772334456443787, + "reward_std": 0.26923793181777, + "rewards/mrr_reward": 0.5224826335906982, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.646794430911541, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.359375, + "epoch": 0.232, + "grad_norm": 0.03732220083475113, + "kl": 0.008699417114257812, + "learning_rate": 8.373628348051165e-06, + "loss": -0.0092, + "reward": 1.5791191756725311, + "reward_std": 0.315543457865715, + "rewards/mrr_reward": 0.6815476268529892, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7472574412822723, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.890625, + "epoch": 0.2328, + "grad_norm": 0.04088292643427849, + "kl": 0.010980606079101562, + "learning_rate": 8.311665552872662e-06, + "loss": -0.0234, + "reward": 1.4555590748786926, + "reward_std": 0.2937028855085373, + "rewards/mrr_reward": 0.583984375, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7524634450674057, + "step": 291 + }, + { + "clip_ratio": 0.0, + "completion_length": 420.625, + "epoch": 0.2336, + "grad_norm": 0.044596508145332336, + "kl": 0.012334823608398438, + "learning_rate": 8.249769410247239e-06, + "loss": -0.04, + "reward": 1.431397259235382, + "reward_std": 0.36401835083961487, + "rewards/mrr_reward": 0.580512136220932, + "rewards/rank_answer_foramt_reward": 0.845703125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7561738044023514, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.03125, + "epoch": 0.2344, + "grad_norm": 0.04020597040653229, + "kl": 0.00943756103515625, + "learning_rate": 8.187942363728626e-06, + "loss": 0.0116, + "reward": 1.4588199257850647, + "reward_std": 0.235812745988369, + "rewards/mrr_reward": 0.5665984451770782, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7388575822114944, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.640625, + "epoch": 0.2352, + "grad_norm": 0.042804013937711716, + "kl": 0.009305953979492188, + "learning_rate": 8.126186854142752e-06, + "loss": 0.0242, + "reward": 1.3589671552181244, + "reward_std": 0.3509839344769716, + "rewards/mrr_reward": 0.4847656264901161, + "rewards/rank_answer_foramt_reward": 0.876953125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7955798357725143, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completion_length": 466.828125, + "epoch": 0.236, + "grad_norm": 0.03595392778515816, + "kl": 0.008063316345214844, + "learning_rate": 8.064505319491398e-06, + "loss": -0.0051, + "reward": 1.3917770087718964, + "reward_std": 0.23030859045684338, + "rewards/mrr_reward": 0.49776167050004005, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7911685407161713, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.515625, + "epoch": 0.2368, + "grad_norm": 0.042226508259773254, + "kl": 0.015703201293945312, + "learning_rate": 8.00290019485593e-06, + "loss": -0.0313, + "reward": 1.4788413643836975, + "reward_std": 0.32476067543029785, + "rewards/mrr_reward": 0.6268229335546494, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.699061393737793, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.03125, + "epoch": 0.2376, + "grad_norm": 0.03763006627559662, + "kl": 0.008462905883789062, + "learning_rate": 7.94137391230119e-06, + "loss": -0.0188, + "reward": 1.582606554031372, + "reward_std": 0.20448793843388557, + "rewards/mrr_reward": 0.6720920205116272, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7806191295385361, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.796875, + "epoch": 0.2384, + "grad_norm": 0.03964044153690338, + "kl": 0.009276390075683594, + "learning_rate": 7.879928900779457e-06, + "loss": 0.0007, + "reward": 1.4767478704452515, + "reward_std": 0.3392485938966274, + "rewards/mrr_reward": 0.6051401421427727, + "rewards/rank_answer_foramt_reward": 0.873046875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7760008126497269, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.5, + "epoch": 0.2392, + "grad_norm": 0.03888827934861183, + "kl": 0.010454177856445312, + "learning_rate": 7.818567586034578e-06, + "loss": -0.0233, + "reward": 1.5537761747837067, + "reward_std": 0.1316620425786823, + "rewards/mrr_reward": 0.6217633932828903, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8379528671503067, + "step": 299 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.625, + "epoch": 0.24, + "grad_norm": 0.038070328533649445, + "kl": 0.009571075439453125, + "learning_rate": 7.757292390506191e-06, + "loss": -0.0591, + "reward": 1.7321368753910065, + "reward_std": 0.1761476807296276, + "rewards/mrr_reward": 0.841796875, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7136247903108597, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.46875, + "epoch": 0.2408, + "grad_norm": 0.0418258011341095, + "kl": 0.010776519775390625, + "learning_rate": 7.696105733234099e-06, + "loss": -0.0165, + "reward": 1.5259293019771576, + "reward_std": 0.29256412014365196, + "rewards/mrr_reward": 0.6532242149114609, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.6875295341014862, + "step": 301 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.09375, + "epoch": 0.2416, + "grad_norm": 0.04463861510157585, + "kl": 0.008836746215820312, + "learning_rate": 7.635010029762755e-06, + "loss": -0.0098, + "reward": 1.4686961770057678, + "reward_std": 0.3223220370709896, + "rewards/mrr_reward": 0.606919676065445, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7012877017259598, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.75, + "epoch": 0.2424, + "grad_norm": 0.04330219700932503, + "kl": 0.011468887329101562, + "learning_rate": 7.574007692045928e-06, + "loss": 0.0058, + "reward": 1.4482861161231995, + "reward_std": 0.38629114255309105, + "rewards/mrr_reward": 0.5864769443869591, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.7541209012269974, + "step": 303 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.171875, + "epoch": 0.2432, + "grad_norm": 0.03801389038562775, + "kl": 0.011196136474609375, + "learning_rate": 7.513101128351454e-06, + "loss": -0.0036, + "reward": 1.5316137671470642, + "reward_std": 0.33731117844581604, + "rewards/mrr_reward": 0.672395870089531, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.658377930521965, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.75, + "epoch": 0.244, + "grad_norm": 0.03998512774705887, + "kl": 0.008950233459472656, + "learning_rate": 7.4522927431661805e-06, + "loss": -0.0292, + "reward": 1.5211172103881836, + "reward_std": 0.23774503916502, + "rewards/mrr_reward": 0.6258432418107986, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8027948141098022, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.40625, + "epoch": 0.2448, + "grad_norm": 0.043149009346961975, + "kl": 0.011903762817382812, + "learning_rate": 7.391584937101034e-06, + "loss": -0.0233, + "reward": 1.3914374113082886, + "reward_std": 0.23153280839323997, + "rewards/mrr_reward": 0.5059523731470108, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7399283051490784, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.65625, + "epoch": 0.2456, + "grad_norm": 0.04259219393134117, + "kl": 0.009889602661132812, + "learning_rate": 7.330980106796247e-06, + "loss": -0.0619, + "reward": 1.380819708108902, + "reward_std": 0.23608434945344925, + "rewards/mrr_reward": 0.5217137709259987, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7088199555873871, + "step": 307 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.53125, + "epoch": 0.2464, + "grad_norm": 0.0958850160241127, + "kl": 0.02011871337890625, + "learning_rate": 7.27048064482675e-06, + "loss": -0.0235, + "reward": 1.3718312680721283, + "reward_std": 0.2723774276673794, + "rewards/mrr_reward": 0.48489585518836975, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7872923165559769, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completion_length": 402.28125, + "epoch": 0.2472, + "grad_norm": 0.0465591698884964, + "kl": 0.016143798828125, + "learning_rate": 7.210088939607709e-06, + "loss": -0.0312, + "reward": 1.231211543083191, + "reward_std": 0.32072607427835464, + "rewards/mrr_reward": 0.3999132066965103, + "rewards/rank_answer_foramt_reward": 0.849609375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.6929138600826263, + "step": 309 + }, + { + "clip_ratio": 0.0, + "completion_length": 436.984375, + "epoch": 0.248, + "grad_norm": 0.038580480962991714, + "kl": 0.011264801025390625, + "learning_rate": 7.149807375300239e-06, + "loss": 0.0033, + "reward": 1.326162338256836, + "reward_std": 0.33862701058387756, + "rewards/mrr_reward": 0.47708334028720856, + "rewards/rank_answer_foramt_reward": 0.833984375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7624196261167526, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.703125, + "epoch": 0.2488, + "grad_norm": 0.037029314786195755, + "kl": 0.008462905883789062, + "learning_rate": 7.0896383317172845e-06, + "loss": -0.0385, + "reward": 1.2545623183250427, + "reward_std": 0.31125089153647423, + "rewards/mrr_reward": 0.3746279776096344, + "rewards/rank_answer_foramt_reward": 0.861328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8129519671201706, + "step": 311 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.453125, + "epoch": 0.2496, + "grad_norm": 0.041599493473768234, + "kl": 0.010118484497070312, + "learning_rate": 7.029584184229653e-06, + "loss": 0.026, + "reward": 1.4846043288707733, + "reward_std": 0.36305932328104973, + "rewards/mrr_reward": 0.5936383754014969, + "rewards/rank_answer_foramt_reward": 0.875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8327091336250305, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completion_length": 436.1875, + "epoch": 0.2504, + "grad_norm": 0.0423130989074707, + "kl": 0.010608673095703125, + "learning_rate": 6.969647303672262e-06, + "loss": -0.0047, + "reward": 1.3655261397361755, + "reward_std": 0.3140504229813814, + "rewards/mrr_reward": 0.48500124737620354, + "rewards/rank_answer_foramt_reward": 0.876953125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.791303962469101, + "step": 313 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.921875, + "epoch": 0.2512, + "grad_norm": 0.06699636578559875, + "kl": 0.0169677734375, + "learning_rate": 6.909830056250527e-06, + "loss": -0.0292, + "reward": 1.3997429311275482, + "reward_std": 0.3030847944319248, + "rewards/mrr_reward": 0.5588541775941849, + "rewards/rank_answer_foramt_reward": 0.861328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.686819538474083, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.65625, + "epoch": 0.252, + "grad_norm": 0.03891773149371147, + "kl": 0.011255264282226562, + "learning_rate": 6.850134803446955e-06, + "loss": -0.0275, + "reward": 1.4285947978496552, + "reward_std": 0.2862687110900879, + "rewards/mrr_reward": 0.5552455335855484, + "rewards/rank_answer_foramt_reward": 0.833984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.812528446316719, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.328125, + "epoch": 0.2528, + "grad_norm": 0.04100382700562477, + "kl": 0.009349822998046875, + "learning_rate": 6.790563901927907e-06, + "loss": -0.0514, + "reward": 1.50279700756073, + "reward_std": 0.32215361297130585, + "rewards/mrr_reward": 0.6073288694024086, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7760395705699921, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.90625, + "epoch": 0.2536, + "grad_norm": 0.0384552925825119, + "kl": 0.011285781860351562, + "learning_rate": 6.731119703450577e-06, + "loss": -0.0035, + "reward": 1.3444324433803558, + "reward_std": 0.31130388006567955, + "rewards/mrr_reward": 0.47606024146080017, + "rewards/rank_answer_foramt_reward": 0.8359375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7954932898283005, + "step": 317 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.578125, + "epoch": 0.2544, + "grad_norm": 0.03822173550724983, + "kl": 0.007521629333496094, + "learning_rate": 6.671804554770135e-06, + "loss": -0.0184, + "reward": 1.4845271408557892, + "reward_std": 0.24785233289003372, + "rewards/mrr_reward": 0.576884925365448, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8051183372735977, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.484375, + "epoch": 0.2552, + "grad_norm": 0.03790445998311043, + "kl": 0.012842178344726562, + "learning_rate": 6.612620797547087e-06, + "loss": -0.0038, + "reward": 1.250670075416565, + "reward_std": 0.29811690375208855, + "rewards/mrr_reward": 0.35522693768143654, + "rewards/rank_answer_foramt_reward": 0.873046875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8482294976711273, + "step": 319 + }, + { + "clip_ratio": 0.0, + "completion_length": 393.734375, + "epoch": 0.256, + "grad_norm": 0.04126419499516487, + "kl": 0.009257316589355469, + "learning_rate": 6.553570768254831e-06, + "loss": -0.017, + "reward": 1.694937378168106, + "reward_std": 0.23029077798128128, + "rewards/mrr_reward": 0.8385850638151169, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.6379755735397339, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.015625, + "epoch": 0.2568, + "grad_norm": 0.04230582341551781, + "kl": 0.014929771423339844, + "learning_rate": 6.494656798087412e-06, + "loss": -0.0211, + "reward": 1.477415293455124, + "reward_std": 0.2716873064637184, + "rewards/mrr_reward": 0.6104352548718452, + "rewards/rank_answer_foramt_reward": 0.94140625, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7092431783676147, + "step": 321 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.03125, + "epoch": 0.2576, + "grad_norm": 0.036968860775232315, + "kl": 0.008441925048828125, + "learning_rate": 6.435881212867494e-06, + "loss": -0.0312, + "reward": 1.4142729341983795, + "reward_std": 0.31983111053705215, + "rewards/mrr_reward": 0.5367559641599655, + "rewards/rank_answer_foramt_reward": 0.861328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.797814130783081, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completion_length": 423.375, + "epoch": 0.2584, + "grad_norm": 0.0385778546333313, + "kl": 0.010784149169921875, + "learning_rate": 6.377246332954544e-06, + "loss": -0.025, + "reward": 1.481526404619217, + "reward_std": 0.3511744774878025, + "rewards/mrr_reward": 0.5972842276096344, + "rewards/rank_answer_foramt_reward": 0.88671875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8084279000759125, + "step": 323 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.625, + "epoch": 0.2592, + "grad_norm": 0.04187595844268799, + "kl": 0.01099395751953125, + "learning_rate": 6.318754473153221e-06, + "loss": -0.0355, + "reward": 1.4383732378482819, + "reward_std": 0.298159871250391, + "rewards/mrr_reward": 0.5604166761040688, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7425056099891663, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.359375, + "epoch": 0.26, + "grad_norm": 0.04161127656698227, + "kl": 0.010236740112304688, + "learning_rate": 6.260407942621998e-06, + "loss": 0.0278, + "reward": 1.4653730988502502, + "reward_std": 0.28907064720988274, + "rewards/mrr_reward": 0.5929687470197678, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7608369886875153, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completion_length": 429.5, + "epoch": 0.2608, + "grad_norm": 0.044767290353775024, + "kl": 0.010900497436523438, + "learning_rate": 6.202209044781991e-06, + "loss": -0.0244, + "reward": 1.445456624031067, + "reward_std": 0.2606505677103996, + "rewards/mrr_reward": 0.5714161843061447, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7404041886329651, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completion_length": 425.4375, + "epoch": 0.2616, + "grad_norm": 0.042263783514499664, + "kl": 0.010478019714355469, + "learning_rate": 6.144160077226035e-06, + "loss": -0.0364, + "reward": 1.4664171934127808, + "reward_std": 0.37280726805329323, + "rewards/mrr_reward": 0.5984560996294022, + "rewards/rank_answer_foramt_reward": 0.861328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7766695767641068, + "step": 327 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.25, + "epoch": 0.2624, + "grad_norm": 0.0396074615418911, + "kl": 0.009977340698242188, + "learning_rate": 6.086263331627976e-06, + "loss": -0.0303, + "reward": 1.3097558319568634, + "reward_std": 0.3439793810248375, + "rewards/mrr_reward": 0.46181795187294483, + "rewards/rank_answer_foramt_reward": 0.81640625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7687272727489471, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.78125, + "epoch": 0.2632, + "grad_norm": 0.040687814354896545, + "kl": 0.008890151977539062, + "learning_rate": 6.028521093652195e-06, + "loss": -0.0267, + "reward": 1.4824483394622803, + "reward_std": 0.35101721435785294, + "rewards/mrr_reward": 0.6202008873224258, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7222457826137543, + "step": 329 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.96875, + "epoch": 0.264, + "grad_norm": 0.03991749510169029, + "kl": 0.009294509887695312, + "learning_rate": 5.970935642863375e-06, + "loss": -0.0049, + "reward": 1.4789340198040009, + "reward_std": 0.26942019537091255, + "rewards/mrr_reward": 0.5792286917567253, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7673952430486679, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.171875, + "epoch": 0.2648, + "grad_norm": 0.04425426945090294, + "kl": 0.009923934936523438, + "learning_rate": 5.913509252636511e-06, + "loss": -0.0369, + "reward": 1.4969634413719177, + "reward_std": 0.26327501237392426, + "rewards/mrr_reward": 0.6369791775941849, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.6626534163951874, + "step": 331 + }, + { + "clip_ratio": 0.0, + "completion_length": 423.5625, + "epoch": 0.2656, + "grad_norm": 0.04324210807681084, + "kl": 0.0114898681640625, + "learning_rate": 5.85624419006716e-06, + "loss": -0.0185, + "reward": 1.4559223651885986, + "reward_std": 0.3514735624194145, + "rewards/mrr_reward": 0.5601996332406998, + "rewards/rank_answer_foramt_reward": 0.873046875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8568893522024155, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completion_length": 408.640625, + "epoch": 0.2664, + "grad_norm": 0.04046209529042244, + "kl": 0.012134552001953125, + "learning_rate": 5.799142715881938e-06, + "loss": -0.0164, + "reward": 1.4584909677505493, + "reward_std": 0.25963789224624634, + "rewards/mrr_reward": 0.610590286552906, + "rewards/rank_answer_foramt_reward": 0.927734375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.6572864204645157, + "step": 333 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.265625, + "epoch": 0.2672, + "grad_norm": 0.03997616469860077, + "kl": 0.011236190795898438, + "learning_rate": 5.742207084349274e-06, + "loss": -0.0195, + "reward": 1.5415248572826385, + "reward_std": 0.33238353580236435, + "rewards/mrr_reward": 0.6676215380430222, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7810041755437851, + "step": 334 + }, + { + "clip_ratio": 0.0, + "completion_length": 401.546875, + "epoch": 0.268, + "grad_norm": 0.04179609194397926, + "kl": 0.009923934936523438, + "learning_rate": 5.685439543190409e-06, + "loss": -0.0119, + "reward": 1.8288907408714294, + "reward_std": 0.2181766740977764, + "rewards/mrr_reward": 0.9140625, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8210345953702927, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.015625, + "epoch": 0.2688, + "grad_norm": 0.04074598103761673, + "kl": 0.008538246154785156, + "learning_rate": 5.628842333490674e-06, + "loss": 0.0237, + "reward": 1.5049967169761658, + "reward_std": 0.26735008880496025, + "rewards/mrr_reward": 0.5956349298357964, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8181416988372803, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.046875, + "epoch": 0.2696, + "grad_norm": 0.038923174142837524, + "kl": 0.0103607177734375, + "learning_rate": 5.572417689610987e-06, + "loss": -0.0079, + "reward": 1.4763097763061523, + "reward_std": 0.22926313430070877, + "rewards/mrr_reward": 0.5806609615683556, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7687745988368988, + "step": 337 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.65625, + "epoch": 0.2704, + "grad_norm": 0.040242068469524384, + "kl": 0.009759902954101562, + "learning_rate": 5.516167839099679e-06, + "loss": -0.0346, + "reward": 1.583184838294983, + "reward_std": 0.36991823837161064, + "rewards/mrr_reward": 0.6937499940395355, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7772882282733917, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completion_length": 395.375, + "epoch": 0.2712, + "grad_norm": 0.04075354337692261, + "kl": 0.012285232543945312, + "learning_rate": 5.460095002604533e-06, + "loss": -0.0154, + "reward": 1.6092648804187775, + "reward_std": 0.32450181245803833, + "rewards/mrr_reward": 0.7244791835546494, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7729655653238297, + "step": 339 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.390625, + "epoch": 0.272, + "grad_norm": 0.045717377215623856, + "kl": 0.010404586791992188, + "learning_rate": 5.404201393785123e-06, + "loss": -0.0222, + "reward": 1.3845095038414001, + "reward_std": 0.49984729290008545, + "rewards/mrr_reward": 0.5492187440395355, + "rewards/rank_answer_foramt_reward": 0.78125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7577463984489441, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.515625, + "epoch": 0.2728, + "grad_norm": 0.03839201480150223, + "kl": 0.01171875, + "learning_rate": 5.348489219225417e-06, + "loss": -0.0114, + "reward": 1.4330370724201202, + "reward_std": 0.3987097330391407, + "rewards/mrr_reward": 0.5739769265055656, + "rewards/rank_answer_foramt_reward": 0.9140625, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.7203998863697052, + "step": 341 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.703125, + "epoch": 0.2736, + "grad_norm": 0.04262446239590645, + "kl": 0.010091781616210938, + "learning_rate": 5.292960678346674e-06, + "loss": -0.0048, + "reward": 1.3840036988258362, + "reward_std": 0.26258981972932816, + "rewards/mrr_reward": 0.5252604261040688, + "rewards/rank_answer_foramt_reward": 0.79296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8249083608388901, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.125, + "epoch": 0.2744, + "grad_norm": 0.03931298851966858, + "kl": 0.009639739990234375, + "learning_rate": 5.237617963320608e-06, + "loss": -0.012, + "reward": 1.4826070964336395, + "reward_std": 0.24309994652867317, + "rewards/mrr_reward": 0.5967881828546524, + "rewards/rank_answer_foramt_reward": 0.92578125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7663308084011078, + "step": 343 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.5625, + "epoch": 0.2752, + "grad_norm": 0.04591841250658035, + "kl": 0.01171875, + "learning_rate": 5.1824632589828465e-06, + "loss": -0.0147, + "reward": 1.4441474378108978, + "reward_std": 0.4614497572183609, + "rewards/mrr_reward": 0.5976562574505806, + "rewards/rank_answer_foramt_reward": 0.83203125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7409058511257172, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.4375, + "epoch": 0.276, + "grad_norm": 0.04385710135102272, + "kl": 0.00975799560546875, + "learning_rate": 5.127498742746675e-06, + "loss": 0.0096, + "reward": 1.4345026016235352, + "reward_std": 0.3426055870950222, + "rewards/mrr_reward": 0.5657985955476761, + "rewards/rank_answer_foramt_reward": 0.88671875, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7691548317670822, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completion_length": 408.859375, + "epoch": 0.2768, + "grad_norm": 0.04883267357945442, + "kl": 0.010219573974609375, + "learning_rate": 5.072726584517086e-06, + "loss": 0.001, + "reward": 1.6219241619110107, + "reward_std": 0.311983335763216, + "rewards/mrr_reward": 0.7708333432674408, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.6571878343820572, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completion_length": 423.03125, + "epoch": 0.2776, + "grad_norm": 0.038777440786361694, + "kl": 0.009169578552246094, + "learning_rate": 5.018148946605092e-06, + "loss": -0.0351, + "reward": 1.5609507262706757, + "reward_std": 0.23676074855029583, + "rewards/mrr_reward": 0.6690104454755783, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7594898641109467, + "step": 347 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.890625, + "epoch": 0.2784, + "grad_norm": 0.041409607976675034, + "kl": 0.011716842651367188, + "learning_rate": 4.9637679836423926e-06, + "loss": -0.0058, + "reward": 1.518358290195465, + "reward_std": 0.26389508321881294, + "rewards/mrr_reward": 0.6400359719991684, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7318951040506363, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.96875, + "epoch": 0.2792, + "grad_norm": 0.039285335689783096, + "kl": 0.008753776550292969, + "learning_rate": 4.909585842496287e-06, + "loss": 0.0012, + "reward": 1.3429620265960693, + "reward_std": 0.3432212918996811, + "rewards/mrr_reward": 0.45104166865348816, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8082574605941772, + "step": 349 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.765625, + "epoch": 0.28, + "grad_norm": 0.04197699576616287, + "kl": 0.00982666015625, + "learning_rate": 4.855604662184935e-06, + "loss": -0.0316, + "reward": 1.3227859288454056, + "reward_std": 0.27039322815835476, + "rewards/mrr_reward": 0.459523793309927, + "rewards/rank_answer_foramt_reward": 0.86328125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7761019766330719, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.890625, + "epoch": 0.2808, + "grad_norm": 0.04606630280613899, + "kl": 0.01332855224609375, + "learning_rate": 4.801826573792905e-06, + "loss": -0.0354, + "reward": 1.3492314517498016, + "reward_std": 0.3611329570412636, + "rewards/mrr_reward": 0.5093750059604645, + "rewards/rank_answer_foramt_reward": 0.833984375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7344726175069809, + "step": 351 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.640625, + "epoch": 0.2816, + "grad_norm": 0.037404146045446396, + "kl": 0.009065628051757812, + "learning_rate": 4.7482537003870425e-06, + "loss": -0.0019, + "reward": 1.7138548493385315, + "reward_std": 0.37241312861442566, + "rewards/mrr_reward": 0.8257812559604645, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7966005057096481, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.15625, + "epoch": 0.2824, + "grad_norm": 0.04106292501091957, + "kl": 0.01189422607421875, + "learning_rate": 4.694888156932657e-06, + "loss": 0.0021, + "reward": 1.4714123904705048, + "reward_std": 0.3619183078408241, + "rewards/mrr_reward": 0.6462239772081375, + "rewards/rank_answer_foramt_reward": 0.873046875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.6353365629911423, + "step": 353 + }, + { + "clip_ratio": 0.0, + "completion_length": 425.3125, + "epoch": 0.2832, + "grad_norm": 0.04472840949892998, + "kl": 0.01025390625, + "learning_rate": 4.641732050210032e-06, + "loss": -0.0384, + "reward": 1.3738009333610535, + "reward_std": 0.34098855778574944, + "rewards/mrr_reward": 0.5247395932674408, + "rewards/rank_answer_foramt_reward": 0.822265625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7662725001573563, + "step": 354 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.25, + "epoch": 0.284, + "grad_norm": 0.0422356016933918, + "kl": 0.007762908935546875, + "learning_rate": 4.588787478731242e-06, + "loss": -0.0179, + "reward": 1.5886476337909698, + "reward_std": 0.3262007385492325, + "rewards/mrr_reward": 0.7180555313825607, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7553452551364899, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.265625, + "epoch": 0.2848, + "grad_norm": 0.045158080756664276, + "kl": 0.014837265014648438, + "learning_rate": 4.53605653265731e-06, + "loss": -0.0355, + "reward": 1.6843328177928925, + "reward_std": 0.30529190599918365, + "rewards/mrr_reward": 0.8046875149011612, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7378572374582291, + "step": 356 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.46875, + "epoch": 0.2856, + "grad_norm": 0.04521140828728676, + "kl": 0.009514808654785156, + "learning_rate": 4.483541293715699e-06, + "loss": -0.0368, + "reward": 1.5743017494678497, + "reward_std": 0.2875717096030712, + "rewards/mrr_reward": 0.6670758798718452, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8331535160541534, + "step": 357 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.421875, + "epoch": 0.2864, + "grad_norm": 0.04222414642572403, + "kl": 0.008882522583007812, + "learning_rate": 4.4312438351181246e-06, + "loss": -0.0041, + "reward": 1.5822243094444275, + "reward_std": 0.3601609170436859, + "rewards/mrr_reward": 0.6888020932674408, + "rewards/rank_answer_foramt_reward": 0.8984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8167148977518082, + "step": 358 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.109375, + "epoch": 0.2872, + "grad_norm": 0.042955782264471054, + "kl": 0.008832931518554688, + "learning_rate": 4.379166221478697e-06, + "loss": -0.0191, + "reward": 1.5433863401412964, + "reward_std": 0.3501916863024235, + "rewards/mrr_reward": 0.639533743262291, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8229316174983978, + "step": 359 + }, + { + "clip_ratio": 0.0, + "completion_length": 404.6875, + "epoch": 0.288, + "grad_norm": 0.040022190660238266, + "kl": 0.009357452392578125, + "learning_rate": 4.3273105087324375e-06, + "loss": -0.0363, + "reward": 1.546007513999939, + "reward_std": 0.2631131783127785, + "rewards/mrr_reward": 0.6498883962631226, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.750668540596962, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 477.328125, + "epoch": 0.2888, + "grad_norm": 0.03617052733898163, + "kl": 0.008795738220214844, + "learning_rate": 4.275678744054094e-06, + "loss": 0.0144, + "reward": 1.3946971893310547, + "reward_std": 0.24523303098976612, + "rewards/mrr_reward": 0.4952690973877907, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8173362910747528, + "step": 361 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.203125, + "epoch": 0.2896, + "grad_norm": 0.04181591421365738, + "kl": 0.009317398071289062, + "learning_rate": 4.224272965777326e-06, + "loss": -0.0191, + "reward": 1.5229368805885315, + "reward_std": 0.19505535019561648, + "rewards/mrr_reward": 0.6376736015081406, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.793943926692009, + "step": 362 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.0625, + "epoch": 0.2904, + "grad_norm": 0.040446627885103226, + "kl": 0.009857177734375, + "learning_rate": 4.173095203314241e-06, + "loss": -0.0547, + "reward": 1.463654488325119, + "reward_std": 0.2678440436720848, + "rewards/mrr_reward": 0.6329861134290695, + "rewards/rank_answer_foramt_reward": 0.833984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.6831922978162766, + "step": 363 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.4375, + "epoch": 0.2912, + "grad_norm": 0.03971518948674202, + "kl": 0.009275436401367188, + "learning_rate": 4.12214747707527e-06, + "loss": -0.0236, + "reward": 1.4997781217098236, + "reward_std": 0.24033548682928085, + "rewards/mrr_reward": 0.6165364682674408, + "rewards/rank_answer_foramt_reward": 0.873046875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8034428507089615, + "step": 364 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.046875, + "epoch": 0.292, + "grad_norm": 0.04116431623697281, + "kl": 0.0095062255859375, + "learning_rate": 4.071431798389408e-06, + "loss": -0.0171, + "reward": 1.57802614569664, + "reward_std": 0.25258472189307213, + "rewards/mrr_reward": 0.6707217246294022, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7923757880926132, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.984375, + "epoch": 0.2928, + "grad_norm": 0.04119415953755379, + "kl": 0.008808135986328125, + "learning_rate": 4.020950169424815e-06, + "loss": 0.0292, + "reward": 1.5060677528381348, + "reward_std": 0.3413529172539711, + "rewards/mrr_reward": 0.6168836876749992, + "rewards/rank_answer_foramt_reward": 0.875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8351219594478607, + "step": 366 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.796875, + "epoch": 0.2936, + "grad_norm": 0.03977242112159729, + "kl": 0.010467529296875, + "learning_rate": 3.970704583109755e-06, + "loss": -0.0172, + "reward": 1.4002881050109863, + "reward_std": 0.30990614742040634, + "rewards/mrr_reward": 0.4957217499613762, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8465787619352341, + "step": 367 + }, + { + "clip_ratio": 0.0, + "completion_length": 436.875, + "epoch": 0.2944, + "grad_norm": 0.042127519845962524, + "kl": 0.010568618774414062, + "learning_rate": 3.920697023053949e-06, + "loss": -0.0185, + "reward": 1.4711587131023407, + "reward_std": 0.2235279567539692, + "rewards/mrr_reward": 0.5671316906809807, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7687725126743317, + "step": 368 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.140625, + "epoch": 0.2952, + "grad_norm": 0.037665415555238724, + "kl": 0.0077457427978515625, + "learning_rate": 3.8709294634702374e-06, + "loss": 0.009, + "reward": 1.6012870371341705, + "reward_std": 0.2355819009244442, + "rewards/mrr_reward": 0.69921875, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8038526326417923, + "step": 369 + }, + { + "clip_ratio": 0.0, + "completion_length": 423.578125, + "epoch": 0.296, + "grad_norm": 0.045537903904914856, + "kl": 0.012182235717773438, + "learning_rate": 3.821403869096658e-06, + "loss": -0.027, + "reward": 1.413929671049118, + "reward_std": 0.2634736839681864, + "rewards/mrr_reward": 0.5472656264901161, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7180513441562653, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.890625, + "epoch": 0.2968, + "grad_norm": 0.041050802916288376, + "kl": 0.008855819702148438, + "learning_rate": 3.772122195118877e-06, + "loss": -0.0169, + "reward": 1.451367437839508, + "reward_std": 0.3203510493040085, + "rewards/mrr_reward": 0.580543152987957, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7287050783634186, + "step": 371 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.421875, + "epoch": 0.2976, + "grad_norm": 0.04175710305571556, + "kl": 0.009654998779296875, + "learning_rate": 3.723086387092997e-06, + "loss": -0.0023, + "reward": 1.476247251033783, + "reward_std": 0.40557974576950073, + "rewards/mrr_reward": 0.5853484943509102, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8110213726758957, + "step": 372 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.28125, + "epoch": 0.2984, + "grad_norm": 0.044037140905857086, + "kl": 0.009381294250488281, + "learning_rate": 3.674298380868756e-06, + "loss": 0.0037, + "reward": 1.3548841178417206, + "reward_std": 0.3138626515865326, + "rewards/mrr_reward": 0.46049726754426956, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8196381330490112, + "step": 373 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.34375, + "epoch": 0.2992, + "grad_norm": 0.04055827111005783, + "kl": 0.007966995239257812, + "learning_rate": 3.625760102513103e-06, + "loss": 0.0133, + "reward": 1.4686636626720428, + "reward_std": 0.3094808869063854, + "rewards/mrr_reward": 0.5532738268375397, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8168772459030151, + "step": 374 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.171875, + "epoch": 0.3, + "grad_norm": 0.04194509610533714, + "kl": 0.009783744812011719, + "learning_rate": 3.5774734682341563e-06, + "loss": -0.0282, + "reward": 1.4523612558841705, + "reward_std": 0.36508404091000557, + "rewards/mrr_reward": 0.5867187529802322, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7403464317321777, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.109375, + "epoch": 0.3008, + "grad_norm": 0.03853485733270645, + "kl": 0.009164810180664062, + "learning_rate": 3.5294403843055604e-06, + "loss": 0.0198, + "reward": 1.4639207124710083, + "reward_std": 0.21186323463916779, + "rewards/mrr_reward": 0.5635416880249977, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7733431160449982, + "step": 376 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.765625, + "epoch": 0.3016, + "grad_norm": 0.04190933704376221, + "kl": 0.008619308471679688, + "learning_rate": 3.4816627469912147e-06, + "loss": -0.0263, + "reward": 1.4570399820804596, + "reward_std": 0.3633367531001568, + "rewards/mrr_reward": 0.5539248585700989, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8206967562437057, + "step": 377 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.890625, + "epoch": 0.3024, + "grad_norm": 0.04122132807970047, + "kl": 0.008838653564453125, + "learning_rate": 3.4341424424704373e-06, + "loss": -0.0555, + "reward": 1.4766654968261719, + "reward_std": 0.27532494999468327, + "rewards/mrr_reward": 0.6087425425648689, + "rewards/rank_answer_foramt_reward": 0.955078125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.6906161457300186, + "step": 378 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.640625, + "epoch": 0.3032, + "grad_norm": 0.044697657227516174, + "kl": 0.013956069946289062, + "learning_rate": 3.3868813467634833e-06, + "loss": -0.0113, + "reward": 1.4251343607902527, + "reward_std": 0.25692647136747837, + "rewards/mrr_reward": 0.5694444477558136, + "rewards/rank_answer_foramt_reward": 0.9140625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.6789370179176331, + "step": 379 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.40625, + "epoch": 0.304, + "grad_norm": 0.03943547606468201, + "kl": 0.008486747741699219, + "learning_rate": 3.3398813256574847e-06, + "loss": 0.0001, + "reward": 1.6643346548080444, + "reward_std": 0.26205284520983696, + "rewards/mrr_reward": 0.7689236104488373, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7758665382862091, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completion_length": 392.9375, + "epoch": 0.3048, + "grad_norm": 0.049449626356363297, + "kl": 0.011739730834960938, + "learning_rate": 3.2931442346328e-06, + "loss": -0.0534, + "reward": 1.339944213628769, + "reward_std": 0.2773168385028839, + "rewards/mrr_reward": 0.4835565537214279, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.6634734272956848, + "step": 381 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.765625, + "epoch": 0.3056, + "grad_norm": 0.040942005813121796, + "kl": 0.010448455810546875, + "learning_rate": 3.2466719187897555e-06, + "loss": -0.0245, + "reward": 1.5223515927791595, + "reward_std": 0.3074699230492115, + "rewards/mrr_reward": 0.6432291939854622, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.7733821123838425, + "step": 382 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.625, + "epoch": 0.3064, + "grad_norm": 0.038345228880643845, + "kl": 0.010009765625, + "learning_rate": 3.200466212775808e-06, + "loss": -0.0195, + "reward": 1.519100308418274, + "reward_std": 0.24653562158346176, + "rewards/mrr_reward": 0.6174231022596359, + "rewards/rank_answer_foramt_reward": 0.94140625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8065735548734665, + "step": 383 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.828125, + "epoch": 0.3072, + "grad_norm": 0.03888833522796631, + "kl": 0.009685516357421875, + "learning_rate": 3.1545289407131128e-06, + "loss": -0.002, + "reward": 1.4710081219673157, + "reward_std": 0.38252247869968414, + "rewards/mrr_reward": 0.5821180492639542, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7834498137235641, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.46875, + "epoch": 0.308, + "grad_norm": 0.038772933185100555, + "kl": 0.009763717651367188, + "learning_rate": 3.108861916126518e-06, + "loss": -0.0166, + "reward": 1.4420768022537231, + "reward_std": 0.32166776806116104, + "rewards/mrr_reward": 0.5522321313619614, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7668113112449646, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.96875, + "epoch": 0.3088, + "grad_norm": 0.04055608808994293, + "kl": 0.009151458740234375, + "learning_rate": 3.063466941871952e-06, + "loss": -0.0134, + "reward": 1.5163768231868744, + "reward_std": 0.3534121476113796, + "rewards/mrr_reward": 0.6155258119106293, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.798210859298706, + "step": 386 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.859375, + "epoch": 0.3096, + "grad_norm": 0.04640579968690872, + "kl": 0.008348464965820312, + "learning_rate": 3.0183458100652752e-06, + "loss": 0.025, + "reward": 1.5191034972667694, + "reward_std": 0.39003700762987137, + "rewards/mrr_reward": 0.6491505652666092, + "rewards/rank_answer_foramt_reward": 0.873046875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7709864974021912, + "step": 387 + }, + { + "clip_ratio": 0.0, + "completion_length": 425.609375, + "epoch": 0.3104, + "grad_norm": 0.03954498469829559, + "kl": 0.010484695434570312, + "learning_rate": 2.9735003020115095e-06, + "loss": -0.0515, + "reward": 1.6413687765598297, + "reward_std": 0.28913578018546104, + "rewards/mrr_reward": 0.76171875, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7339653223752975, + "step": 388 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.8125, + "epoch": 0.3112, + "grad_norm": 0.03751927241683006, + "kl": 0.010460853576660156, + "learning_rate": 2.9289321881345257e-06, + "loss": -0.0129, + "reward": 1.5514837503433228, + "reward_std": 0.15940535813570023, + "rewards/mrr_reward": 0.6467633992433548, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7689204066991806, + "step": 389 + }, + { + "clip_ratio": 0.0, + "completion_length": 401.859375, + "epoch": 0.312, + "grad_norm": 0.04677467420697212, + "kl": 0.013050079345703125, + "learning_rate": 2.884643227907147e-06, + "loss": 0.0117, + "reward": 1.3619890213012695, + "reward_std": 0.2909104973077774, + "rewards/mrr_reward": 0.5390625, + "rewards/rank_answer_foramt_reward": 0.787109375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7144197523593903, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.640625, + "epoch": 0.3128, + "grad_norm": 0.03747595474123955, + "kl": 0.009700775146484375, + "learning_rate": 2.840635169781688e-06, + "loss": -0.0023, + "reward": 1.5902496874332428, + "reward_std": 0.3390812985599041, + "rewards/mrr_reward": 0.7155134230852127, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7620441168546677, + "step": 391 + }, + { + "clip_ratio": 0.0, + "completion_length": 423.28125, + "epoch": 0.3136, + "grad_norm": 0.038146279752254486, + "kl": 0.009531021118164062, + "learning_rate": 2.796909751120931e-06, + "loss": -0.0248, + "reward": 1.51711967587471, + "reward_std": 0.387925885617733, + "rewards/mrr_reward": 0.6617807745933533, + "rewards/rank_answer_foramt_reward": 0.876953125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7227952480316162, + "step": 392 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.09375, + "epoch": 0.3144, + "grad_norm": 0.040637459605932236, + "kl": 0.009733200073242188, + "learning_rate": 2.7534686981295335e-06, + "loss": 0.0159, + "reward": 1.3827373087406158, + "reward_std": 0.4360157921910286, + "rewards/mrr_reward": 0.49673859030008316, + "rewards/rank_answer_foramt_reward": 0.86328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.821563258767128, + "step": 393 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.4375, + "epoch": 0.3152, + "grad_norm": 0.04179443418979645, + "kl": 0.009754180908203125, + "learning_rate": 2.7103137257858867e-06, + "loss": -0.0236, + "reward": 1.4687740206718445, + "reward_std": 0.3465902768075466, + "rewards/mrr_reward": 0.5991691499948502, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.717197373509407, + "step": 394 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.71875, + "epoch": 0.316, + "grad_norm": 0.046546537429094315, + "kl": 0.012380599975585938, + "learning_rate": 2.667446537774402e-06, + "loss": -0.0005, + "reward": 1.5630870461463928, + "reward_std": 0.3838147297501564, + "rewards/mrr_reward": 0.685695692896843, + "rewards/rank_answer_foramt_reward": 0.875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7915740013122559, + "step": 395 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.109375, + "epoch": 0.3168, + "grad_norm": 0.04081228747963905, + "kl": 0.009305953979492188, + "learning_rate": 2.624868826418262e-06, + "loss": -0.0092, + "reward": 1.4161024391651154, + "reward_std": 0.3121535889804363, + "rewards/mrr_reward": 0.5301339440047741, + "rewards/rank_answer_foramt_reward": 0.861328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.831237405538559, + "step": 396 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.828125, + "epoch": 0.3176, + "grad_norm": 0.04065534844994545, + "kl": 0.010629653930664062, + "learning_rate": 2.5825822726126095e-06, + "loss": 0.0077, + "reward": 1.4853081405162811, + "reward_std": 0.2746486961841583, + "rewards/mrr_reward": 0.5950706824660301, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7699548155069351, + "step": 397 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.984375, + "epoch": 0.3184, + "grad_norm": 0.04703785106539726, + "kl": 0.01222991943359375, + "learning_rate": 2.5405885457581793e-06, + "loss": -0.0213, + "reward": 1.2661742269992828, + "reward_std": 0.30840878933668137, + "rewards/mrr_reward": 0.3860801197588444, + "rewards/rank_answer_foramt_reward": 0.91015625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7567955106496811, + "step": 398 + }, + { + "clip_ratio": 0.0, + "completion_length": 423.421875, + "epoch": 0.3192, + "grad_norm": 0.03758614510297775, + "kl": 0.01016998291015625, + "learning_rate": 2.4988893036954045e-06, + "loss": -0.0215, + "reward": 1.606367141008377, + "reward_std": 0.12326683290302753, + "rewards/mrr_reward": 0.6976996511220932, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.782834529876709, + "step": 399 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.6875, + "epoch": 0.32, + "grad_norm": 0.040864165872335434, + "kl": 0.009775161743164062, + "learning_rate": 2.4574861926389615e-06, + "loss": -0.0104, + "reward": 1.3981690108776093, + "reward_std": 0.30249515548348427, + "rewards/mrr_reward": 0.5021701380610466, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.804991826415062, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.359375, + "epoch": 0.3208, + "grad_norm": 0.0384257435798645, + "kl": 0.008899688720703125, + "learning_rate": 2.4163808471127815e-06, + "loss": -0.0165, + "reward": 1.363112986087799, + "reward_std": 0.38970305398106575, + "rewards/mrr_reward": 0.4783792197704315, + "rewards/rank_answer_foramt_reward": 0.876953125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.804058238863945, + "step": 401 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.25, + "epoch": 0.3216, + "grad_norm": 0.040598995983600616, + "kl": 0.008358001708984375, + "learning_rate": 2.37557488988552e-06, + "loss": 0.0419, + "reward": 1.5591547191143036, + "reward_std": 0.2982029393315315, + "rewards/mrr_reward": 0.6777963787317276, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.7450015395879745, + "step": 402 + }, + { + "clip_ratio": 0.0, + "completion_length": 425.421875, + "epoch": 0.3224, + "grad_norm": 0.039519764482975006, + "kl": 0.012409210205078125, + "learning_rate": 2.335069931906503e-06, + "loss": -0.0157, + "reward": 1.4554656445980072, + "reward_std": 0.30417120456695557, + "rewards/mrr_reward": 0.569221243262291, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7715264558792114, + "step": 403 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.390625, + "epoch": 0.3232, + "grad_norm": 0.04290325567126274, + "kl": 0.010166168212890625, + "learning_rate": 2.2948675722421086e-06, + "loss": -0.0324, + "reward": 1.323316365480423, + "reward_std": 0.31126467883586884, + "rewards/mrr_reward": 0.4303385391831398, + "rewards/rank_answer_foramt_reward": 0.884765625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.821227639913559, + "step": 404 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.96875, + "epoch": 0.324, + "grad_norm": 0.039676424115896225, + "kl": 0.01012420654296875, + "learning_rate": 2.254969398012663e-06, + "loss": -0.0132, + "reward": 1.3026447296142578, + "reward_std": 0.30587131530046463, + "rewards/mrr_reward": 0.41312623769044876, + "rewards/rank_answer_foramt_reward": 0.873046875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8224635273218155, + "step": 405 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.046875, + "epoch": 0.3248, + "grad_norm": 0.04161365330219269, + "kl": 0.009571075439453125, + "learning_rate": 2.215376984329767e-06, + "loss": 0.0262, + "reward": 1.6110160648822784, + "reward_std": 0.355252580717206, + "rewards/mrr_reward": 0.717968761920929, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8175320029258728, + "step": 406 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.734375, + "epoch": 0.3256, + "grad_norm": 0.03859516605734825, + "kl": 0.00945281982421875, + "learning_rate": 2.1760918942341193e-06, + "loss": -0.0282, + "reward": 1.5105039477348328, + "reward_std": 0.3397863022983074, + "rewards/mrr_reward": 0.6239955350756645, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7977168411016464, + "step": 407 + }, + { + "clip_ratio": 0.0, + "completion_length": 399.21875, + "epoch": 0.3264, + "grad_norm": 0.044858209788799286, + "kl": 0.014190673828125, + "learning_rate": 2.1371156786338108e-06, + "loss": -0.0274, + "reward": 1.4385575950145721, + "reward_std": 0.47953635454177856, + "rewards/mrr_reward": 0.6119791567325592, + "rewards/rank_answer_foramt_reward": 0.822265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.6903298795223236, + "step": 408 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.65625, + "epoch": 0.3272, + "grad_norm": 0.03756219893693924, + "kl": 0.010061264038085938, + "learning_rate": 2.098449876243096e-06, + "loss": -0.0082, + "reward": 1.3541287928819656, + "reward_std": 0.2427298128604889, + "rewards/mrr_reward": 0.4596044272184372, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7946641892194748, + "step": 409 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.28125, + "epoch": 0.328, + "grad_norm": 0.04040595516562462, + "kl": 0.008250236511230469, + "learning_rate": 2.0600960135216463e-06, + "loss": -0.0227, + "reward": 1.6990008354187012, + "reward_std": 0.15870059095323086, + "rewards/mrr_reward": 0.7838541716337204, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.786843404173851, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.90625, + "epoch": 0.3288, + "grad_norm": 0.04700813814997673, + "kl": 0.0103759765625, + "learning_rate": 2.022055604614289e-06, + "loss": -0.0175, + "reward": 1.3733271956443787, + "reward_std": 0.27594383619725704, + "rewards/mrr_reward": 0.4928571507334709, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7794190347194672, + "step": 411 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.09375, + "epoch": 0.3296, + "grad_norm": 0.041563473641872406, + "kl": 0.009532928466796875, + "learning_rate": 1.984330151291233e-06, + "loss": 0.0048, + "reward": 1.6681818068027496, + "reward_std": 0.31681402772665024, + "rewards/mrr_reward": 0.76953125, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8091207593679428, + "step": 412 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.84375, + "epoch": 0.3304, + "grad_norm": 0.03638165071606636, + "kl": 0.009305953979492188, + "learning_rate": 1.9469211428887813e-06, + "loss": 0.0181, + "reward": 1.3885623514652252, + "reward_std": 0.34145616739988327, + "rewards/mrr_reward": 0.501946933567524, + "rewards/rank_answer_foramt_reward": 0.830078125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8644477128982544, + "step": 413 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.625, + "epoch": 0.3312, + "grad_norm": 0.03859129920601845, + "kl": 0.0097808837890625, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.0172, + "reward": 1.4659111499786377, + "reward_std": 0.2733108922839165, + "rewards/mrr_reward": 0.5613281279802322, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8388167470693588, + "step": 414 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.28125, + "epoch": 0.332, + "grad_norm": 0.042557183653116226, + "kl": 0.010099411010742188, + "learning_rate": 1.8730583556690607e-06, + "loss": 0.0313, + "reward": 1.3624907732009888, + "reward_std": 0.270840547978878, + "rewards/mrr_reward": 0.4820932298898697, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7225586175918579, + "step": 415 + }, + { + "clip_ratio": 0.0, + "completion_length": 429.703125, + "epoch": 0.3328, + "grad_norm": 0.04032871872186661, + "kl": 0.009693145751953125, + "learning_rate": 1.8366074928281608e-06, + "loss": -0.0172, + "reward": 1.355335295200348, + "reward_std": 0.16411831602454185, + "rewards/mrr_reward": 0.4381696507334709, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7929615378379822, + "step": 416 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.5, + "epoch": 0.3336, + "grad_norm": 0.03926324471831322, + "kl": 0.009528160095214844, + "learning_rate": 1.8004789067454763e-06, + "loss": -0.0313, + "reward": 1.3210996389389038, + "reward_std": 0.31574152782559395, + "rewards/mrr_reward": 0.4390067160129547, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7765243351459503, + "step": 417 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.484375, + "epoch": 0.3344, + "grad_norm": 0.0452018603682518, + "kl": 0.009485244750976562, + "learning_rate": 1.7646740237157256e-06, + "loss": 0.0053, + "reward": 1.3444588482379913, + "reward_std": 0.3464643657207489, + "rewards/mrr_reward": 0.4705481231212616, + "rewards/rank_answer_foramt_reward": 0.861328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7946986109018326, + "step": 418 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.1875, + "epoch": 0.3352, + "grad_norm": 0.04134754836559296, + "kl": 0.011098861694335938, + "learning_rate": 1.7291942572543806e-06, + "loss": -0.0201, + "reward": 1.5103633105754852, + "reward_std": 0.26311646308749914, + "rewards/mrr_reward": 0.6181113570928574, + "rewards/rank_answer_foramt_reward": 0.9140625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7897311300039291, + "step": 419 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.40625, + "epoch": 0.336, + "grad_norm": 0.04511724412441254, + "kl": 0.012248992919921875, + "learning_rate": 1.6940410080418723e-06, + "loss": -0.0424, + "reward": 1.3716591596603394, + "reward_std": 0.3640734553337097, + "rewards/mrr_reward": 0.490234375, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7823120504617691, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.796875, + "epoch": 0.3368, + "grad_norm": 0.04100240021944046, + "kl": 0.010448455810546875, + "learning_rate": 1.6592156638682887e-06, + "loss": -0.0274, + "reward": 1.425389677286148, + "reward_std": 0.3395872414112091, + "rewards/mrr_reward": 0.5344307869672775, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7682346403598785, + "step": 421 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.046875, + "epoch": 0.3376, + "grad_norm": 0.03941246122121811, + "kl": 0.009730339050292969, + "learning_rate": 1.6247195995785836e-06, + "loss": 0.0095, + "reward": 1.5374208092689514, + "reward_std": 0.165388286113739, + "rewards/mrr_reward": 0.6133246421813965, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8491195142269135, + "step": 422 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.0625, + "epoch": 0.3384, + "grad_norm": 0.036055538803339005, + "kl": 0.00992584228515625, + "learning_rate": 1.5905541770183096e-06, + "loss": -0.0127, + "reward": 1.5841495990753174, + "reward_std": 0.2556448373943567, + "rewards/mrr_reward": 0.6819444745779037, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7769235521554947, + "step": 423 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.09375, + "epoch": 0.3392, + "grad_norm": 0.04632318392395973, + "kl": 0.010486602783203125, + "learning_rate": 1.5567207449798517e-06, + "loss": 0.0142, + "reward": 1.3678037822246552, + "reward_std": 0.31665654107928276, + "rewards/mrr_reward": 0.4849950596690178, + "rewards/rank_answer_foramt_reward": 0.861328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8138497620820999, + "step": 424 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.328125, + "epoch": 0.34, + "grad_norm": 0.0403946228325367, + "kl": 0.0091400146484375, + "learning_rate": 1.52322063914917e-06, + "loss": -0.0281, + "reward": 1.3638777285814285, + "reward_std": 0.3191062808036804, + "rewards/mrr_reward": 0.527752973139286, + "rewards/rank_answer_foramt_reward": 0.873046875, + "rewards/rank_overall_format_reward": 0.953125, + "rewards/rank_think_format_reward": 0.7075394093990326, + "step": 425 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.140625, + "epoch": 0.3408, + "grad_norm": 0.04744080826640129, + "kl": 0.010467529296875, + "learning_rate": 1.490055182053083e-06, + "loss": 0.0045, + "reward": 1.5458801984786987, + "reward_std": 0.223420899361372, + "rewards/mrr_reward": 0.6529947966337204, + "rewards/rank_answer_foramt_reward": 0.955078125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7662600725889206, + "step": 426 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.625, + "epoch": 0.3416, + "grad_norm": 0.035729970782995224, + "kl": 0.009447097778320312, + "learning_rate": 1.4572256830070497e-06, + "loss": -0.0069, + "reward": 1.506346195936203, + "reward_std": 0.2625518664717674, + "rewards/mrr_reward": 0.6086123436689377, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8024365603923798, + "step": 427 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.375, + "epoch": 0.3424, + "grad_norm": 0.04041771963238716, + "kl": 0.009374618530273438, + "learning_rate": 1.4247334380634792e-06, + "loss": -0.0142, + "reward": 1.44622141122818, + "reward_std": 0.19066144712269306, + "rewards/mrr_reward": 0.5250868275761604, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8049887120723724, + "step": 428 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.3125, + "epoch": 0.3432, + "grad_norm": 0.03661702945828438, + "kl": 0.008138656616210938, + "learning_rate": 1.3925797299605649e-06, + "loss": -0.0252, + "reward": 1.3437042236328125, + "reward_std": 0.23780394345521927, + "rewards/mrr_reward": 0.44283854961395264, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8158335089683533, + "step": 429 + }, + { + "clip_ratio": 0.0, + "completion_length": 414.75, + "epoch": 0.344, + "grad_norm": 0.04520437493920326, + "kl": 0.012065887451171875, + "learning_rate": 1.3607658280716474e-06, + "loss": -0.0399, + "reward": 1.4789037704467773, + "reward_std": 0.3689150810241699, + "rewards/mrr_reward": 0.6351562589406967, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.6681386232376099, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completion_length": 429.0625, + "epoch": 0.3448, + "grad_norm": 0.0407538115978241, + "kl": 0.0075836181640625, + "learning_rate": 1.3292929883550998e-06, + "loss": 0.0148, + "reward": 1.4983892738819122, + "reward_std": 0.2416827231645584, + "rewards/mrr_reward": 0.5973958522081375, + "rewards/rank_answer_foramt_reward": 0.955078125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7908297926187515, + "step": 431 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.375, + "epoch": 0.3456, + "grad_norm": 0.038508400321006775, + "kl": 0.010146141052246094, + "learning_rate": 1.2981624533047432e-06, + "loss": -0.0291, + "reward": 1.6629530489444733, + "reward_std": 0.33460457250475883, + "rewards/mrr_reward": 0.7881510555744171, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7134150713682175, + "step": 432 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.96875, + "epoch": 0.3464, + "grad_norm": 0.043071672320365906, + "kl": 0.012836456298828125, + "learning_rate": 1.2673754519008008e-06, + "loss": 0.0302, + "reward": 1.4194163978099823, + "reward_std": 0.21476874873042107, + "rewards/mrr_reward": 0.5527033805847168, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7514027804136276, + "step": 433 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.375, + "epoch": 0.3472, + "grad_norm": 0.039470795542001724, + "kl": 0.00861358642578125, + "learning_rate": 1.2369331995613664e-06, + "loss": -0.0457, + "reward": 1.6773528754711151, + "reward_std": 0.2540069557726383, + "rewards/mrr_reward": 0.7979166507720947, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.735270619392395, + "step": 434 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.3125, + "epoch": 0.348, + "grad_norm": 0.042133934795856476, + "kl": 0.012884140014648438, + "learning_rate": 1.206836898094439e-06, + "loss": -0.0082, + "reward": 1.3745841085910797, + "reward_std": 0.25170470029115677, + "rewards/mrr_reward": 0.4813492000102997, + "rewards/rank_answer_foramt_reward": 0.927734375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7946629524230957, + "step": 435 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.609375, + "epoch": 0.3488, + "grad_norm": 0.0405392199754715, + "kl": 0.010301589965820312, + "learning_rate": 1.1770877356504684e-06, + "loss": -0.0233, + "reward": 1.5826008915901184, + "reward_std": 0.24600278958678246, + "rewards/mrr_reward": 0.6921875178813934, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7255659103393555, + "step": 436 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.578125, + "epoch": 0.3496, + "grad_norm": 0.04396117478609085, + "kl": 0.014806747436523438, + "learning_rate": 1.1476868866754488e-06, + "loss": -0.0133, + "reward": 1.5188599526882172, + "reward_std": 0.3319630119949579, + "rewards/mrr_reward": 0.651041679084301, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7567054778337479, + "step": 437 + }, + { + "clip_ratio": 0.0, + "completion_length": 425.515625, + "epoch": 0.3504, + "grad_norm": 0.04198921099305153, + "kl": 0.01055908203125, + "learning_rate": 1.1186355118645552e-06, + "loss": -0.0335, + "reward": 1.3222236633300781, + "reward_std": 0.3595678508281708, + "rewards/mrr_reward": 0.4773871749639511, + "rewards/rank_answer_foramt_reward": 0.86328125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7124541997909546, + "step": 438 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.59375, + "epoch": 0.3512, + "grad_norm": 0.036484286189079285, + "kl": 0.009668350219726562, + "learning_rate": 1.0899347581163222e-06, + "loss": -0.011, + "reward": 1.4655856788158417, + "reward_std": 0.2706412933766842, + "rewards/mrr_reward": 0.5767113268375397, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7853553593158722, + "step": 439 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.0, + "epoch": 0.352, + "grad_norm": 0.04056106507778168, + "kl": 0.009960174560546875, + "learning_rate": 1.0615857584873624e-06, + "loss": -0.0195, + "reward": 1.39870023727417, + "reward_std": 0.28301356360316277, + "rewards/mrr_reward": 0.5242187604308128, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7534593939781189, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.078125, + "epoch": 0.3528, + "grad_norm": 0.0444202646613121, + "kl": 0.010128021240234375, + "learning_rate": 1.0335896321476413e-06, + "loss": -0.0272, + "reward": 1.4179519712924957, + "reward_std": 0.2796483486890793, + "rewards/mrr_reward": 0.5222966149449348, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7961382865905762, + "step": 441 + }, + { + "clip_ratio": 0.0, + "completion_length": 425.71875, + "epoch": 0.3536, + "grad_norm": 0.040348101407289505, + "kl": 0.009181976318359375, + "learning_rate": 1.0059474843362893e-06, + "loss": -0.0111, + "reward": 1.5824732184410095, + "reward_std": 0.27756644412875175, + "rewards/mrr_reward": 0.6705729365348816, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8336466401815414, + "step": 442 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.796875, + "epoch": 0.3544, + "grad_norm": 0.04108656197786331, + "kl": 0.010869979858398438, + "learning_rate": 9.786604063179728e-07, + "loss": -0.0285, + "reward": 1.447850614786148, + "reward_std": 0.4478020519018173, + "rewards/mrr_reward": 0.6013020873069763, + "rewards/rank_answer_foramt_reward": 0.84765625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7332671880722046, + "step": 443 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.40625, + "epoch": 0.3552, + "grad_norm": 0.04063072428107262, + "kl": 0.009490966796875, + "learning_rate": 9.517294753398066e-07, + "loss": 0.0038, + "reward": 1.4069487750530243, + "reward_std": 0.24927948415279388, + "rewards/mrr_reward": 0.4890128970146179, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7952955961227417, + "step": 444 + }, + { + "clip_ratio": 0.0, + "completion_length": 429.546875, + "epoch": 0.356, + "grad_norm": 0.03845556825399399, + "kl": 0.008657455444335938, + "learning_rate": 9.251557545888312e-07, + "loss": -0.0288, + "reward": 1.4185177087783813, + "reward_std": 0.25927474722266197, + "rewards/mrr_reward": 0.5444196350872517, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7679224908351898, + "step": 445 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.671875, + "epoch": 0.3568, + "grad_norm": 0.0459010936319828, + "kl": 0.009598731994628906, + "learning_rate": 8.989402931500434e-07, + "loss": 0.0213, + "reward": 1.6813454329967499, + "reward_std": 0.3304239772260189, + "rewards/mrr_reward": 0.7842820137739182, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7769677639007568, + "step": 446 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.46875, + "epoch": 0.3576, + "grad_norm": 0.04193993657827377, + "kl": 0.010549545288085938, + "learning_rate": 8.730841259649725e-07, + "loss": 0.0062, + "reward": 1.5157469809055328, + "reward_std": 0.34639402106404305, + "rewards/mrr_reward": 0.6320250630378723, + "rewards/rank_answer_foramt_reward": 0.927734375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7502106577157974, + "step": 447 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.640625, + "epoch": 0.3584, + "grad_norm": 0.03967143967747688, + "kl": 0.010587692260742188, + "learning_rate": 8.475882737908248e-07, + "loss": -0.0067, + "reward": 1.677318662405014, + "reward_std": 0.26558706164360046, + "rewards/mrr_reward": 0.760627493262291, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8403518497943878, + "step": 448 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.765625, + "epoch": 0.3592, + "grad_norm": 0.04171387106180191, + "kl": 0.009777069091796875, + "learning_rate": 8.224537431601886e-07, + "loss": -0.0275, + "reward": 1.4102658927440643, + "reward_std": 0.29483452066779137, + "rewards/mrr_reward": 0.5575024783611298, + "rewards/rank_answer_foramt_reward": 0.849609375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.757959634065628, + "step": 449 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.625, + "epoch": 0.36, + "grad_norm": 0.047788720577955246, + "kl": 0.010875701904296875, + "learning_rate": 7.976815263412963e-07, + "loss": -0.0053, + "reward": 1.4586642682552338, + "reward_std": 0.28997623920440674, + "rewards/mrr_reward": 0.5601996779441833, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7929323315620422, + "step": 450 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.09375, + "epoch": 0.3608, + "grad_norm": 0.03860335424542427, + "kl": 0.007927894592285156, + "learning_rate": 7.732726012988512e-07, + "loss": 0.0076, + "reward": 1.3660823702812195, + "reward_std": 0.33029213547706604, + "rewards/mrr_reward": 0.4904638007283211, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7490926533937454, + "step": 451 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.15625, + "epoch": 0.3616, + "grad_norm": 0.03353331610560417, + "kl": 0.009148597717285156, + "learning_rate": 7.492279316554207e-07, + "loss": -0.0041, + "reward": 1.5296534895896912, + "reward_std": 0.16189000383019447, + "rewards/mrr_reward": 0.6202628910541534, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7967446148395538, + "step": 452 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.75, + "epoch": 0.3624, + "grad_norm": 0.04088251292705536, + "kl": 0.009881973266601562, + "learning_rate": 7.255484666533874e-07, + "loss": 0.0072, + "reward": 1.5006992816925049, + "reward_std": 0.28344327583909035, + "rewards/mrr_reward": 0.6140811145305634, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7433622479438782, + "step": 453 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.59375, + "epoch": 0.3632, + "grad_norm": 0.04271751642227173, + "kl": 0.010400772094726562, + "learning_rate": 7.022351411174866e-07, + "loss": -0.0219, + "reward": 1.3978624939918518, + "reward_std": 0.27385834977030754, + "rewards/mrr_reward": 0.5089161656796932, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7699484676122665, + "step": 454 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.171875, + "epoch": 0.364, + "grad_norm": 0.048336923122406006, + "kl": 0.01015472412109375, + "learning_rate": 6.792888754178906e-07, + "loss": -0.0419, + "reward": 1.5317293107509613, + "reward_std": 0.38713493943214417, + "rewards/mrr_reward": 0.6699218675494194, + "rewards/rank_answer_foramt_reward": 0.86328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7560686320066452, + "step": 455 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.578125, + "epoch": 0.3648, + "grad_norm": 0.03923417627811432, + "kl": 0.011358261108398438, + "learning_rate": 6.567105754338798e-07, + "loss": -0.0218, + "reward": 1.3666671514511108, + "reward_std": 0.2978068180382252, + "rewards/mrr_reward": 0.49135666340589523, + "rewards/rank_answer_foramt_reward": 0.876953125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7989402711391449, + "step": 456 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.859375, + "epoch": 0.3656, + "grad_norm": 0.04292134568095207, + "kl": 0.014402389526367188, + "learning_rate": 6.345011325180772e-07, + "loss": -0.0116, + "reward": 1.500946819782257, + "reward_std": 0.3475816771388054, + "rewards/mrr_reward": 0.613163448870182, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7879088073968887, + "step": 457 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.609375, + "epoch": 0.3664, + "grad_norm": 0.03790047764778137, + "kl": 0.008701324462890625, + "learning_rate": 6.126614234612593e-07, + "loss": -0.0025, + "reward": 1.5834512114524841, + "reward_std": 0.29408957809209824, + "rewards/mrr_reward": 0.6931423544883728, + "rewards/rank_answer_foramt_reward": 0.94140625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7564990818500519, + "step": 458 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.46875, + "epoch": 0.3672, + "grad_norm": 0.041111089289188385, + "kl": 0.01041412353515625, + "learning_rate": 5.911923104577455e-07, + "loss": 0.0538, + "reward": 1.5291092991828918, + "reward_std": 0.33945245295763016, + "rewards/mrr_reward": 0.6747395843267441, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.723764643073082, + "step": 459 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.40625, + "epoch": 0.368, + "grad_norm": 0.040803536772727966, + "kl": 0.00865936279296875, + "learning_rate": 5.700946410713548e-07, + "loss": 0.0111, + "reward": 1.4347401559352875, + "reward_std": 0.23255350813269615, + "rewards/mrr_reward": 0.5265811011195183, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8203562796115875, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.375, + "epoch": 0.3688, + "grad_norm": 0.03991984575986862, + "kl": 0.010066986083984375, + "learning_rate": 5.49369248201953e-07, + "loss": -0.0406, + "reward": 1.5644738674163818, + "reward_std": 0.28139273822307587, + "rewards/mrr_reward": 0.6668402999639511, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7630703374743462, + "step": 461 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.296875, + "epoch": 0.3696, + "grad_norm": 0.041922301054000854, + "kl": 0.009305953979492188, + "learning_rate": 5.290169500525577e-07, + "loss": 0.0128, + "reward": 1.3219963014125824, + "reward_std": 0.29185811802744865, + "rewards/mrr_reward": 0.44941097497940063, + "rewards/rank_answer_foramt_reward": 0.875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7848227918148041, + "step": 462 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.265625, + "epoch": 0.3704, + "grad_norm": 0.0457732267677784, + "kl": 0.009916305541992188, + "learning_rate": 5.090385500970551e-07, + "loss": -0.0337, + "reward": 1.5106676518917084, + "reward_std": 0.2413267381489277, + "rewards/mrr_reward": 0.6132688522338867, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7740776985883713, + "step": 463 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.75, + "epoch": 0.3712, + "grad_norm": 0.040093667805194855, + "kl": 0.00926971435546875, + "learning_rate": 4.894348370484648e-07, + "loss": -0.007, + "reward": 1.5942188501358032, + "reward_std": 0.2485157772898674, + "rewards/mrr_reward": 0.6885416582226753, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8343198895454407, + "step": 464 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.03125, + "epoch": 0.372, + "grad_norm": 0.04025663435459137, + "kl": 0.010089874267578125, + "learning_rate": 4.702065848278126e-07, + "loss": -0.0089, + "reward": 1.3519022762775421, + "reward_std": 0.35284218564629555, + "rewards/mrr_reward": 0.5015129074454308, + "rewards/rank_answer_foramt_reward": 0.859375, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.7488124072551727, + "step": 465 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.75, + "epoch": 0.3728, + "grad_norm": 0.03952139616012573, + "kl": 0.011041641235351562, + "learning_rate": 4.5135455253357053e-07, + "loss": -0.0105, + "reward": 1.3247076570987701, + "reward_std": 0.30543025955557823, + "rewards/mrr_reward": 0.45331722497940063, + "rewards/rank_answer_foramt_reward": 0.88671875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7616707235574722, + "step": 466 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.671875, + "epoch": 0.3736, + "grad_norm": 0.03882599249482155, + "kl": 0.011716842651367188, + "learning_rate": 4.3287948441169457e-07, + "loss": -0.0342, + "reward": 1.689410239458084, + "reward_std": 0.31131643801927567, + "rewards/mrr_reward": 0.8078124970197678, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7418206632137299, + "step": 467 + }, + { + "clip_ratio": 0.0, + "completion_length": 436.09375, + "epoch": 0.3744, + "grad_norm": 0.03910544514656067, + "kl": 0.01006317138671875, + "learning_rate": 4.1478210982624055e-07, + "loss": -0.02, + "reward": 1.40711310505867, + "reward_std": 0.2673242837190628, + "rewards/mrr_reward": 0.5138020887970924, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7695029377937317, + "step": 468 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.0, + "epoch": 0.3752, + "grad_norm": 0.039743226021528244, + "kl": 0.008121490478515625, + "learning_rate": 3.9706314323056936e-07, + "loss": -0.045, + "reward": 1.4817086458206177, + "reward_std": 0.26649700850248337, + "rewards/mrr_reward": 0.5803571343421936, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7860555499792099, + "step": 469 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.765625, + "epoch": 0.376, + "grad_norm": 0.03648286312818527, + "kl": 0.010162353515625, + "learning_rate": 3.7972328413914074e-07, + "loss": -0.0279, + "reward": 1.5091581344604492, + "reward_std": 0.3366273455321789, + "rewards/mrr_reward": 0.634523794054985, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7343911230564117, + "step": 470 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.46875, + "epoch": 0.3768, + "grad_norm": 0.0420987531542778, + "kl": 0.012674331665039062, + "learning_rate": 3.627632170999029e-07, + "loss": -0.0287, + "reward": 1.5445944368839264, + "reward_std": 0.2770478278398514, + "rewards/mrr_reward": 0.6619543731212616, + "rewards/rank_answer_foramt_reward": 0.94140625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7488852739334106, + "step": 471 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.5625, + "epoch": 0.3776, + "grad_norm": 0.033279478549957275, + "kl": 0.007472038269042969, + "learning_rate": 3.4618361166726123e-07, + "loss": -0.0299, + "reward": 1.5285615921020508, + "reward_std": 0.3049774765968323, + "rewards/mrr_reward": 0.6360863149166107, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8001734763383865, + "step": 472 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.984375, + "epoch": 0.3784, + "grad_norm": 0.042792558670043945, + "kl": 0.010000228881835938, + "learning_rate": 3.2998512237565005e-07, + "loss": 0.0096, + "reward": 1.6697318851947784, + "reward_std": 0.20043689012527466, + "rewards/mrr_reward": 0.7655381858348846, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7692774683237076, + "step": 473 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.984375, + "epoch": 0.3792, + "grad_norm": 0.040974345058202744, + "kl": 0.0115509033203125, + "learning_rate": 3.1416838871368925e-07, + "loss": -0.0247, + "reward": 1.4770236611366272, + "reward_std": 0.32821785286068916, + "rewards/mrr_reward": 0.6046874970197678, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7489113509654999, + "step": 474 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.125, + "epoch": 0.38, + "grad_norm": 0.03732066601514816, + "kl": 0.010282516479492188, + "learning_rate": 2.987340350989421e-07, + "loss": -0.029, + "reward": 1.5076874494552612, + "reward_std": 0.22487280704081059, + "rewards/mrr_reward": 0.5981832966208458, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7775574177503586, + "step": 475 + }, + { + "clip_ratio": 0.0, + "completion_length": 414.328125, + "epoch": 0.3808, + "grad_norm": 0.04424848407506943, + "kl": 0.009943008422851562, + "learning_rate": 2.836826708532603e-07, + "loss": -0.0106, + "reward": 1.4497572183609009, + "reward_std": 0.38473574817180634, + "rewards/mrr_reward": 0.592447929084301, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7072817832231522, + "step": 476 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.390625, + "epoch": 0.3816, + "grad_norm": 0.042454175651073456, + "kl": 0.009542465209960938, + "learning_rate": 2.6901489017873375e-07, + "loss": 0.0024, + "reward": 1.4682879745960236, + "reward_std": 0.3182600736618042, + "rewards/mrr_reward": 0.5997767746448517, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7138832211494446, + "step": 477 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.640625, + "epoch": 0.3824, + "grad_norm": 0.04164545610547066, + "kl": 0.010335922241210938, + "learning_rate": 2.547312721342277e-07, + "loss": 0.0132, + "reward": 1.4912968873977661, + "reward_std": 0.28201924264431, + "rewards/mrr_reward": 0.5958767384290695, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7973784357309341, + "step": 478 + }, + { + "clip_ratio": 0.0, + "completion_length": 429.328125, + "epoch": 0.3832, + "grad_norm": 0.04302007704973221, + "kl": 0.011838912963867188, + "learning_rate": 2.4083238061252565e-07, + "loss": -0.0429, + "reward": 1.3676048517227173, + "reward_std": 0.2852121964097023, + "rewards/mrr_reward": 0.5010788887739182, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7371641099452972, + "step": 479 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.34375, + "epoch": 0.384, + "grad_norm": 0.04239465668797493, + "kl": 0.010568618774414062, + "learning_rate": 2.273187643180652e-07, + "loss": 0.0114, + "reward": 1.5518296658992767, + "reward_std": 0.404549989849329, + "rewards/mrr_reward": 0.6595238298177719, + "rewards/rank_answer_foramt_reward": 0.88671875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8250507712364197, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.8125, + "epoch": 0.3848, + "grad_norm": 0.0668654814362526, + "kl": 0.021732330322265625, + "learning_rate": 2.1419095674527934e-07, + "loss": -0.0356, + "reward": 1.413476824760437, + "reward_std": 0.3368876613676548, + "rewards/mrr_reward": 0.5408854186534882, + "rewards/rank_answer_foramt_reward": 0.84765625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7965600192546844, + "step": 481 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.21875, + "epoch": 0.3856, + "grad_norm": 0.045830436050891876, + "kl": 0.011808395385742188, + "learning_rate": 2.014494761575314e-07, + "loss": -0.0255, + "reward": 1.5131396651268005, + "reward_std": 0.3415753096342087, + "rewards/mrr_reward": 0.6448970809578896, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7443193197250366, + "step": 482 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.328125, + "epoch": 0.3864, + "grad_norm": 0.03847775235772133, + "kl": 0.009500503540039062, + "learning_rate": 1.8909482556666026e-07, + "loss": -0.0099, + "reward": 1.3552064895629883, + "reward_std": 0.21233711391687393, + "rewards/mrr_reward": 0.4519283324480057, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7879875898361206, + "step": 483 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.21875, + "epoch": 0.3872, + "grad_norm": 0.04119429737329483, + "kl": 0.011724472045898438, + "learning_rate": 1.7712749271311392e-07, + "loss": -0.0224, + "reward": 1.6171193420886993, + "reward_std": 0.33994160592556, + "rewards/mrr_reward": 0.7380208224058151, + "rewards/rank_answer_foramt_reward": 0.9140625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.749872237443924, + "step": 484 + }, + { + "clip_ratio": 0.0, + "completion_length": 425.203125, + "epoch": 0.388, + "grad_norm": 0.043465420603752136, + "kl": 0.011735916137695312, + "learning_rate": 1.6554795004670389e-07, + "loss": -0.0467, + "reward": 1.3008288443088531, + "reward_std": 0.34553826972842216, + "rewards/mrr_reward": 0.4407738149166107, + "rewards/rank_answer_foramt_reward": 0.875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7390396296977997, + "step": 485 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.765625, + "epoch": 0.3888, + "grad_norm": 0.043878600001335144, + "kl": 0.010034561157226562, + "learning_rate": 1.543566547079467e-07, + "loss": -0.009, + "reward": 1.3604202717542648, + "reward_std": 0.2537879031151533, + "rewards/mrr_reward": 0.49144964292645454, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7387129366397858, + "step": 486 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.0625, + "epoch": 0.3896, + "grad_norm": 0.05306778848171234, + "kl": 0.0155029296875, + "learning_rate": 1.4355404851001953e-07, + "loss": -0.0855, + "reward": 1.5500088930130005, + "reward_std": 0.2238281350582838, + "rewards/mrr_reward": 0.7127604186534882, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.6211006790399551, + "step": 487 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.703125, + "epoch": 0.3904, + "grad_norm": 0.046678684651851654, + "kl": 0.009532928466796875, + "learning_rate": 1.3314055792131964e-07, + "loss": -0.009, + "reward": 1.4595491290092468, + "reward_std": 0.25613268837332726, + "rewards/mrr_reward": 0.5737351253628731, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7956129163503647, + "step": 488 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.25, + "epoch": 0.3912, + "grad_norm": 0.04260161146521568, + "kl": 0.009710311889648438, + "learning_rate": 1.231165940486234e-07, + "loss": -0.0066, + "reward": 1.4045831859111786, + "reward_std": 0.31474703550338745, + "rewards/mrr_reward": 0.4935515895485878, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8642172515392303, + "step": 489 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.96875, + "epoch": 0.392, + "grad_norm": 0.03883928805589676, + "kl": 0.008863449096679688, + "learning_rate": 1.134825526208605e-07, + "loss": -0.0364, + "reward": 1.4408005475997925, + "reward_std": 0.33699289709329605, + "rewards/mrr_reward": 0.5509920567274094, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7784203439950943, + "step": 490 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.984375, + "epoch": 0.3928, + "grad_norm": 0.0388728491961956, + "kl": 0.010242462158203125, + "learning_rate": 1.0423881397349067e-07, + "loss": -0.0279, + "reward": 1.3634454309940338, + "reward_std": 0.32560136914253235, + "rewards/mrr_reward": 0.5041666775941849, + "rewards/rank_answer_foramt_reward": 0.875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7366872876882553, + "step": 491 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.8125, + "epoch": 0.3936, + "grad_norm": 0.03954172879457474, + "kl": 0.008582115173339844, + "learning_rate": 9.538574303348813e-08, + "loss": -0.015, + "reward": 1.2776685655117035, + "reward_std": 0.2750714495778084, + "rewards/mrr_reward": 0.3992125578224659, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7596440017223358, + "step": 492 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.1875, + "epoch": 0.3944, + "grad_norm": 0.04806819558143616, + "kl": 0.0110321044921875, + "learning_rate": 8.692368930493522e-08, + "loss": -0.0395, + "reward": 1.3417856693267822, + "reward_std": 0.21769994869828224, + "rewards/mrr_reward": 0.48239708691835403, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7233483120799065, + "step": 493 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.09375, + "epoch": 0.3952, + "grad_norm": 0.042950328439474106, + "kl": 0.010545730590820312, + "learning_rate": 7.885298685522235e-08, + "loss": -0.0172, + "reward": 1.4324612021446228, + "reward_std": 0.3757603354752064, + "rewards/mrr_reward": 0.5950520783662796, + "rewards/rank_answer_foramt_reward": 0.79296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7524469792842865, + "step": 494 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.625, + "epoch": 0.396, + "grad_norm": 0.04140486195683479, + "kl": 0.010625839233398438, + "learning_rate": 7.117395430186414e-08, + "loss": -0.018, + "reward": 1.4777464866638184, + "reward_std": 0.2758827228099108, + "rewards/mrr_reward": 0.5884734690189362, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7865635007619858, + "step": 495 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.984375, + "epoch": 0.3968, + "grad_norm": 0.045392703264951706, + "kl": 0.009822845458984375, + "learning_rate": 6.388689479991606e-08, + "loss": -0.0167, + "reward": 1.2444620728492737, + "reward_std": 0.2947104088962078, + "rewards/mrr_reward": 0.3777901865541935, + "rewards/rank_answer_foramt_reward": 0.88671875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7551845461130142, + "step": 496 + }, + { + "clip_ratio": 0.0, + "completion_length": 476.234375, + "epoch": 0.3976, + "grad_norm": 0.03996572643518448, + "kl": 0.0064067840576171875, + "learning_rate": 5.699209603001077e-08, + "loss": 0.0273, + "reward": 1.4518903493881226, + "reward_std": 0.1597257237881422, + "rewards/mrr_reward": 0.5305245518684387, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8349859565496445, + "step": 497 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.203125, + "epoch": 0.3984, + "grad_norm": 0.04201935976743698, + "kl": 0.013742446899414062, + "learning_rate": 5.048983018699827e-08, + "loss": -0.0298, + "reward": 1.4749173521995544, + "reward_std": 0.45922574400901794, + "rewards/mrr_reward": 0.6170573085546494, + "rewards/rank_answer_foramt_reward": 0.833984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7734040170907974, + "step": 498 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.984375, + "epoch": 0.3992, + "grad_norm": 0.043673526495695114, + "kl": 0.011587142944335938, + "learning_rate": 4.438035396920004e-08, + "loss": -0.029, + "reward": 1.3924769461154938, + "reward_std": 0.3469556476920843, + "rewards/mrr_reward": 0.5519531145691872, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.6583698391914368, + "step": 499 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.71875, + "epoch": 0.4, + "grad_norm": 0.04227694869041443, + "kl": 0.01059722900390625, + "learning_rate": 3.866390856827495e-08, + "loss": -0.0028, + "reward": 1.4461922645568848, + "reward_std": 0.32431307435035706, + "rewards/mrr_reward": 0.5536830350756645, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.81004199385643, + "step": 500 + }, + { + "clip_ratio": 0.0, + "completion_length": 475.90625, + "epoch": 0.4008, + "grad_norm": 0.04245263710618019, + "kl": 0.009504318237304688, + "learning_rate": 3.3340719659701315e-08, + "loss": 0.0258, + "reward": 1.0929251462221146, + "reward_std": 0.21385096292942762, + "rewards/mrr_reward": 0.1986049171537161, + "rewards/rank_answer_foramt_reward": 0.849609375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8604518622159958, + "step": 501 + }, + { + "epoch": 0.4008, + "step": 501, + "total_flos": 0.0, + "train_loss": 5.142112260688089e-05, + "train_runtime": 276.4871, + "train_samples_per_second": 115.738, + "train_steps_per_second": 1.808 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}