{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 288.625, "epoch": 0.008, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.055, "reward": 0.748445987701416, "reward_std": 0.48841265588998795, "rewards/mrr_reward": 0.24103422835469246, "rewards/rank_answer_foramt_reward": 0.4765625, "rewards/rank_overall_format_reward": 0.875, "rewards/rank_think_format_reward": 0.1860488811507821, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 308.5, "epoch": 0.016, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0847, "reward": 0.6804930120706558, "reward_std": 0.38646717369556427, "rewards/mrr_reward": 0.1652343738824129, "rewards/rank_answer_foramt_reward": 0.421875, "rewards/rank_overall_format_reward": 0.8671875, "rewards/rank_think_format_reward": 0.27232725732028484, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 294.84375, "epoch": 0.024, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0834, "reward": 0.9339272081851959, "reward_std": 0.5670942962169647, "rewards/mrr_reward": 0.37167659401893616, "rewards/rank_answer_foramt_reward": 0.5703125, "rewards/rank_overall_format_reward": 0.8671875, "rewards/rank_think_format_reward": 0.26628969237208366, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 326.671875, "epoch": 0.032, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0804, "reward": 1.024831861257553, "reward_std": 0.5523797571659088, "rewards/mrr_reward": 0.37087053433060646, "rewards/rank_answer_foramt_reward": 0.623046875, "rewards/rank_overall_format_reward": 0.90625, "rewards/rank_think_format_reward": 0.45240409672260284, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 294.25, "epoch": 0.04, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0456, "reward": 0.9988610446453094, "reward_std": 0.5176242738962173, "rewards/mrr_reward": 0.4147135466337204, "rewards/rank_answer_foramt_reward": 0.607421875, "rewards/rank_overall_format_reward": 0.890625, "rewards/rank_think_format_reward": 0.27209700644016266, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 301.34375, "epoch": 0.048, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0798, "reward": 0.7302423864603043, "reward_std": 0.3661453425884247, "rewards/mrr_reward": 0.182986113242805, "rewards/rank_answer_foramt_reward": 0.48828125, "rewards/rank_overall_format_reward": 0.90625, "rewards/rank_think_format_reward": 0.2638210151344538, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 288.421875, "epoch": 0.056, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0388, "reward": 0.9191122651100159, "reward_std": 0.5329124554991722, "rewards/mrr_reward": 0.358767356723547, "rewards/rank_answer_foramt_reward": 0.564453125, "rewards/rank_overall_format_reward": 0.875, "rewards/rank_think_format_reward": 0.2585616558790207, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 296.421875, "epoch": 0.064, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0344, "reward": 0.8672969937324524, "reward_std": 0.48230237513780594, "rewards/mrr_reward": 0.3153645843267441, "rewards/rank_answer_foramt_reward": 0.525390625, "rewards/rank_overall_format_reward": 0.859375, "rewards/rank_think_format_reward": 0.2877568071708083, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 304.59375, "epoch": 0.072, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0414, "reward": 1.0287966132164001, "reward_std": 0.5458935871720314, "rewards/mrr_reward": 0.4440104216337204, "rewards/rank_answer_foramt_reward": 0.560546875, "rewards/rank_overall_format_reward": 0.875, "rewards/rank_think_format_reward": 0.33653245121240616, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 307.8125, "epoch": 0.08, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0409, "reward": 1.157219260931015, "reward_std": 0.38755715638399124, "rewards/mrr_reward": 0.5087177604436874, "rewards/rank_answer_foramt_reward": 0.8203125, "rewards/rank_overall_format_reward": 0.859375, "rewards/rank_think_format_reward": 0.2854683920741081, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 331.140625, "epoch": 0.088, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0841, "reward": 0.9862491339445114, "reward_std": 0.5109377801418304, "rewards/mrr_reward": 0.3429687526077032, "rewards/rank_answer_foramt_reward": 0.611328125, "rewards/rank_overall_format_reward": 0.921875, "rewards/rank_think_format_reward": 0.41613128781318665, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 292.796875, "epoch": 0.096, "grad_norm": 0.03115917183458805, "kl": 0.0, "learning_rate": 1.9999999684172664e-05, "loss": -0.0053, "reward": 0.7712794989347458, "reward_std": 0.48659200221300125, "rewards/mrr_reward": 0.2405133955180645, "rewards/rank_answer_foramt_reward": 0.400390625, "rewards/rank_overall_format_reward": 0.90625, "rewards/rank_think_format_reward": 0.3017413951456547, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 313.734375, "epoch": 0.104, "grad_norm": 0.03115917183458805, "kl": -6.780028343200684e-06, "learning_rate": 1.9999999684172664e-05, "loss": -0.0281, "reward": 0.8285368829965591, "reward_std": 0.5136675909161568, "rewards/mrr_reward": 0.27406374365091324, "rewards/rank_answer_foramt_reward": 0.494140625, "rewards/rank_overall_format_reward": 0.890625, "rewards/rank_think_format_reward": 0.2954559400677681, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 320.71875, "epoch": 0.112, "grad_norm": 0.03210530802607536, "kl": -6.556510925292969e-06, "learning_rate": 1.9999998736690666e-05, "loss": -0.0714, "reward": 0.8715860396623611, "reward_std": 0.49175362288951874, "rewards/mrr_reward": 0.27080853283405304, "rewards/rank_answer_foramt_reward": 0.5078125, "rewards/rank_overall_format_reward": 0.9296875, "rewards/rank_think_format_reward": 0.3830379396677017, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 300.921875, "epoch": 0.12, "grad_norm": 0.02840823121368885, "kl": -7.599592208862305e-06, "learning_rate": 1.999999715755407e-05, "loss": -0.0276, "reward": 0.8312882781028748, "reward_std": 0.4621574282646179, "rewards/mrr_reward": 0.2509300671517849, "rewards/rank_answer_foramt_reward": 0.5234375, "rewards/rank_overall_format_reward": 0.921875, "rewards/rank_think_format_reward": 0.31334864534437656, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 318.390625, "epoch": 0.128, "grad_norm": 0.025292610749602318, "kl": -5.260109901428223e-06, "learning_rate": 1.9999994946762974e-05, "loss": -0.0246, "reward": 0.9414703845977783, "reward_std": 0.5526015311479568, "rewards/mrr_reward": 0.36406249925494194, "rewards/rank_answer_foramt_reward": 0.482421875, "rewards/rank_overall_format_reward": 0.859375, "rewards/rank_think_format_reward": 0.4079238325357437, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 318.046875, "epoch": 0.136, "grad_norm": 0.03076692298054695, "kl": -3.769993782043457e-06, "learning_rate": 1.999999210431752e-05, "loss": 0.0048, "reward": 0.8599012494087219, "reward_std": 0.49315596371889114, "rewards/mrr_reward": 0.28183284401893616, "rewards/rank_answer_foramt_reward": 0.53515625, "rewards/rank_overall_format_reward": 0.8984375, "rewards/rank_think_format_reward": 0.31812864542007446, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 309.5, "epoch": 0.144, "grad_norm": 0.029196709394454956, "kl": 2.5480985641479492e-06, "learning_rate": 1.9999988630217885e-05, "loss": -0.0552, "reward": 0.9807900786399841, "reward_std": 0.5464348271489143, "rewards/mrr_reward": 0.38268229365348816, "rewards/rank_answer_foramt_reward": 0.623046875, "rewards/rank_overall_format_reward": 0.8828125, "rewards/rank_think_format_reward": 0.306588314473629, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 274.65625, "epoch": 0.152, "grad_norm": 0.03444257006049156, "kl": 7.301568984985352e-07, "learning_rate": 1.999998452446429e-05, "loss": -0.0274, "reward": 0.7828617691993713, "reward_std": 0.38783423602581024, "rewards/mrr_reward": 0.2496279813349247, "rewards/rank_answer_foramt_reward": 0.48046875, "rewards/rank_overall_format_reward": 0.8984375, "rewards/rank_think_format_reward": 0.23695369437336922, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 319.46875, "epoch": 0.16, "grad_norm": 0.029183639213442802, "kl": 8.150935173034668e-06, "learning_rate": 1.9999979787056998e-05, "loss": -0.0822, "reward": 1.052710935473442, "reward_std": 0.5361286401748657, "rewards/mrr_reward": 0.420355923473835, "rewards/rank_answer_foramt_reward": 0.650390625, "rewards/rank_overall_format_reward": 0.890625, "rewards/rank_think_format_reward": 0.3752116933465004, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 351.703125, "epoch": 0.168, "grad_norm": 0.028047969564795494, "kl": 1.2278556823730469e-05, "learning_rate": 1.9999974417996303e-05, "loss": -0.0809, "reward": 1.107970878481865, "reward_std": 0.5233071744441986, "rewards/mrr_reward": 0.43270088732242584, "rewards/rank_answer_foramt_reward": 0.62109375, "rewards/rank_overall_format_reward": 0.890625, "rewards/rank_think_format_reward": 0.5345538482069969, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 312.203125, "epoch": 0.176, "grad_norm": 0.03128313645720482, "kl": 1.9982457160949707e-05, "learning_rate": 1.9999968417282542e-05, "loss": -0.0821, "reward": 0.8156533539295197, "reward_std": 0.43370306491851807, "rewards/mrr_reward": 0.25281498208642006, "rewards/rank_answer_foramt_reward": 0.498046875, "rewards/rank_overall_format_reward": 0.890625, "rewards/rank_think_format_reward": 0.3168988637626171, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 326.640625, "epoch": 0.184, "grad_norm": 0.02993021160364151, "kl": 2.4840235710144043e-05, "learning_rate": 1.99999617849161e-05, "loss": -0.078, "reward": 1.129832923412323, "reward_std": 0.5531031638383865, "rewards/mrr_reward": 0.47254466265439987, "rewards/rank_answer_foramt_reward": 0.705078125, "rewards/rank_overall_format_reward": 0.8671875, "rewards/rank_think_format_reward": 0.4195169061422348, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 303.828125, "epoch": 0.192, "grad_norm": 0.03055522032082081, "kl": 4.00543212890625e-05, "learning_rate": 1.9999954520897394e-05, "loss": -0.0252, "reward": 0.9006818234920502, "reward_std": 0.5223089158535004, "rewards/mrr_reward": 0.3359374925494194, "rewards/rank_answer_foramt_reward": 0.525390625, "rewards/rank_overall_format_reward": 0.9296875, "rewards/rank_think_format_reward": 0.25626825354993343, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 295.796875, "epoch": 0.2, "grad_norm": 0.03174176439642906, "kl": 3.859400749206543e-05, "learning_rate": 1.999994662522688e-05, "loss": -0.0891, "reward": 0.9934758394956589, "reward_std": 0.4950134977698326, "rewards/mrr_reward": 0.3896019347012043, "rewards/rank_answer_foramt_reward": 0.5703125, "rewards/rank_overall_format_reward": 0.8984375, "rewards/rank_think_format_reward": 0.3611707091331482, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 285.921875, "epoch": 0.208, "grad_norm": 0.03093676082789898, "kl": 7.84844160079956e-05, "learning_rate": 1.9999938097905064e-05, "loss": -0.1107, "reward": 0.7378821074962616, "reward_std": 0.46790947765111923, "rewards/mrr_reward": 0.2100694440305233, "rewards/rank_answer_foramt_reward": 0.486328125, "rewards/rank_overall_format_reward": 0.8828125, "rewards/rank_think_format_reward": 0.23029159009456635, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 294.296875, "epoch": 0.216, "grad_norm": 0.03421920910477638, "kl": 8.234381675720215e-05, "learning_rate": 1.9999928938932473e-05, "loss": -0.0799, "reward": 0.9299369752407074, "reward_std": 0.48957522213459015, "rewards/mrr_reward": 0.3561817966401577, "rewards/rank_answer_foramt_reward": 0.619140625, "rewards/rank_overall_format_reward": 0.8984375, "rewards/rank_think_format_reward": 0.22107390873134136, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 312.4375, "epoch": 0.224, "grad_norm": 0.03421920910477638, "kl": 0.00018423795700073242, "learning_rate": 1.9999928938932473e-05, "loss": -0.0439, "reward": 0.9257623851299286, "reward_std": 0.5316600203514099, "rewards/mrr_reward": 0.36640625819563866, "rewards/rank_answer_foramt_reward": 0.59375, "rewards/rank_overall_format_reward": 0.84375, "rewards/rank_think_format_reward": 0.25751855596899986, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 311.609375, "epoch": 0.232, "grad_norm": 0.02869362011551857, "kl": 0.0001027137041091919, "learning_rate": 1.99999191483097e-05, "loss": -0.0072, "reward": 0.849075511097908, "reward_std": 0.39980996400117874, "rewards/mrr_reward": 0.31562500074505806, "rewards/rank_answer_foramt_reward": 0.44921875, "rewards/rank_overall_format_reward": 0.8984375, "rewards/rank_think_format_reward": 0.26886044442653656, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 341.328125, "epoch": 0.24, "grad_norm": 0.028789300471544266, "kl": 0.00010889768600463867, "learning_rate": 1.999990872603735e-05, "loss": -0.056, "reward": 1.0844270288944244, "reward_std": 0.4819156527519226, "rewards/mrr_reward": 0.3760416656732559, "rewards/rank_answer_foramt_reward": 0.625, "rewards/rank_overall_format_reward": 0.953125, "rewards/rank_think_format_reward": 0.5684971548616886, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 301.5, "epoch": 0.248, "grad_norm": 0.03265060856938362, "kl": 0.00014287233352661133, "learning_rate": 1.999989767211609e-05, "loss": -0.0575, "reward": 1.005102053284645, "reward_std": 0.44889208674430847, "rewards/mrr_reward": 0.3782986141741276, "rewards/rank_answer_foramt_reward": 0.59375, "rewards/rank_overall_format_reward": 0.953125, "rewards/rank_think_format_reward": 0.35252929478883743, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 315.171875, "epoch": 0.256, "grad_norm": 0.027822472155094147, "kl": 0.00015693902969360352, "learning_rate": 1.9999885986546613e-05, "loss": -0.0599, "reward": 0.9155676811933517, "reward_std": 0.4416455924510956, "rewards/mrr_reward": 0.3112413324415684, "rewards/rank_answer_foramt_reward": 0.607421875, "rewards/rank_overall_format_reward": 0.9296875, "rewards/rank_think_format_reward": 0.29418253153562546, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 314.421875, "epoch": 0.264, "grad_norm": 0.03131254017353058, "kl": 0.00016450881958007812, "learning_rate": 1.999987366932966e-05, "loss": -0.0832, "reward": 0.7057739198207855, "reward_std": 0.4360158443450928, "rewards/mrr_reward": 0.19101562350988388, "rewards/rank_answer_foramt_reward": 0.439453125, "rewards/rank_overall_format_reward": 0.8984375, "rewards/rank_think_format_reward": 0.221982903778553, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 306.0, "epoch": 0.272, "grad_norm": 0.031528931111097336, "kl": 0.00021272897720336914, "learning_rate": 1.9999860720466007e-05, "loss": 0.0174, "reward": 0.8981269598007202, "reward_std": 0.43555907905101776, "rewards/mrr_reward": 0.33803943544626236, "rewards/rank_answer_foramt_reward": 0.478515625, "rewards/rank_overall_format_reward": 0.9140625, "rewards/rank_think_format_reward": 0.304656695574522, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 331.171875, "epoch": 0.28, "grad_norm": 0.029030395671725273, "kl": 0.00024181604385375977, "learning_rate": 1.9999847139956477e-05, "loss": 0.0231, "reward": 1.1540020108222961, "reward_std": 0.45568330585956573, "rewards/mrr_reward": 0.4646453410387039, "rewards/rank_answer_foramt_reward": 0.732421875, "rewards/rank_overall_format_reward": 0.9296875, "rewards/rank_think_format_reward": 0.4268500804901123, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 317.3125, "epoch": 0.288, "grad_norm": 0.03088027611374855, "kl": 0.0003453493118286133, "learning_rate": 1.9999832927801922e-05, "loss": -0.0633, "reward": 0.9360256493091583, "reward_std": 0.4951440170407295, "rewards/mrr_reward": 0.330078125, "rewards/rank_answer_foramt_reward": 0.5703125, "rewards/rank_overall_format_reward": 0.9453125, "rewards/rank_think_format_reward": 0.32057957723736763, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 314.546875, "epoch": 0.296, "grad_norm": 0.033634252846241, "kl": 0.00039967894554138184, "learning_rate": 1.9999818084003243e-05, "loss": -0.0662, "reward": 0.989772766828537, "reward_std": 0.4721188619732857, "rewards/mrr_reward": 0.36536458879709244, "rewards/rank_answer_foramt_reward": 0.62109375, "rewards/rank_overall_format_reward": 0.9375, "rewards/rank_think_format_reward": 0.33355215936899185, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 287.65625, "epoch": 0.304, "grad_norm": 0.03363899141550064, "kl": 0.0003497004508972168, "learning_rate": 1.999980260856137e-05, "loss": -0.0616, "reward": 1.0276555567979813, "reward_std": 0.49545609951019287, "rewards/mrr_reward": 0.4115767106413841, "rewards/rank_answer_foramt_reward": 0.6171875, "rewards/rank_overall_format_reward": 0.953125, "rewards/rank_think_format_reward": 0.29659304022789, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 330.5, "epoch": 0.312, "grad_norm": 0.03134270757436752, "kl": 0.00044792890548706055, "learning_rate": 1.9999786501477298e-05, "loss": -0.0625, "reward": 0.9894662201404572, "reward_std": 0.4404422789812088, "rewards/mrr_reward": 0.3252232186496258, "rewards/rank_answer_foramt_reward": 0.673828125, "rewards/rank_overall_format_reward": 0.921875, "rewards/rank_think_format_reward": 0.4171544536948204, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 317.5625, "epoch": 0.32, "grad_norm": 0.03180496767163277, "kl": 0.0005990266799926758, "learning_rate": 1.9999769762752024e-05, "loss": -0.034, "reward": 0.9837304353713989, "reward_std": 0.5629166960716248, "rewards/mrr_reward": 0.3815104216337204, "rewards/rank_answer_foramt_reward": 0.5625, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.29365903325378895, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 293.5625, "epoch": 0.328, "grad_norm": 0.03363664820790291, "kl": 0.0006002187728881836, "learning_rate": 1.999975239238662e-05, "loss": -0.0186, "reward": 1.0903609842061996, "reward_std": 0.6452237367630005, "rewards/mrr_reward": 0.5052083358168602, "rewards/rank_answer_foramt_reward": 0.60546875, "rewards/rank_overall_format_reward": 0.9375, "rewards/rank_think_format_reward": 0.2302209585905075, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 335.953125, "epoch": 0.336, "grad_norm": 0.02957266755402088, "kl": 0.000903010368347168, "learning_rate": 1.999973439038218e-05, "loss": -0.0295, "reward": 1.0027846843004227, "reward_std": 0.4366834908723831, "rewards/mrr_reward": 0.3347470201551914, "rewards/rank_answer_foramt_reward": 0.6328125, "rewards/rank_overall_format_reward": 0.953125, "rewards/rank_think_format_reward": 0.43841899931430817, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 330.25, "epoch": 0.344, "grad_norm": 0.03136228397488594, "kl": 0.000525057315826416, "learning_rate": 1.9999715756739833e-05, "loss": -0.0375, "reward": 1.043868064880371, "reward_std": 0.45916447043418884, "rewards/mrr_reward": 0.38901908695697784, "rewards/rank_answer_foramt_reward": 0.67578125, "rewards/rank_overall_format_reward": 0.9296875, "rewards/rank_think_format_reward": 0.37892188876867294, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 328.234375, "epoch": 0.352, "grad_norm": 0.031588468700647354, "kl": 0.0005010366439819336, "learning_rate": 1.9999696491460764e-05, "loss": -0.0386, "reward": 1.1304609179496765, "reward_std": 0.41832099854946136, "rewards/mrr_reward": 0.43151041865348816, "rewards/rank_answer_foramt_reward": 0.75, "rewards/rank_overall_format_reward": 0.9296875, "rewards/rank_think_format_reward": 0.43834417313337326, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 337.390625, "epoch": 0.36, "grad_norm": 0.032946065068244934, "kl": 0.0008701086044311523, "learning_rate": 1.9999676594546187e-05, "loss": -0.0529, "reward": 1.1123294532299042, "reward_std": 0.5354420319199562, "rewards/mrr_reward": 0.46336185187101364, "rewards/rank_answer_foramt_reward": 0.68359375, "rewards/rank_overall_format_reward": 0.9296875, "rewards/rank_think_format_reward": 0.35328710824251175, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 351.125, "epoch": 0.368, "grad_norm": 0.029322165995836258, "kl": 0.0005701780319213867, "learning_rate": 1.999965606599736e-05, "loss": -0.0847, "reward": 1.107217699289322, "reward_std": 0.4067194238305092, "rewards/mrr_reward": 0.3920392580330372, "rewards/rank_answer_foramt_reward": 0.69140625, "rewards/rank_overall_format_reward": 0.9453125, "rewards/rank_think_format_reward": 0.5304885134100914, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 306.6875, "epoch": 0.376, "grad_norm": 0.0324987918138504, "kl": 0.0009363889694213867, "learning_rate": 1.999963490581558e-05, "loss": -0.0823, "reward": 0.7899350076913834, "reward_std": 0.40959134697914124, "rewards/mrr_reward": 0.21142169833183289, "rewards/rank_answer_foramt_reward": 0.5703125, "rewards/rank_overall_format_reward": 0.9375, "rewards/rank_think_format_reward": 0.24525804258883, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 338.078125, "epoch": 0.384, "grad_norm": 0.03164631128311157, "kl": 0.0010160207748413086, "learning_rate": 1.9999613114002184e-05, "loss": -0.0145, "reward": 1.3335086703300476, "reward_std": 0.3851042538881302, "rewards/mrr_reward": 0.6021019294857979, "rewards/rank_answer_foramt_reward": 0.787109375, "rewards/rank_overall_format_reward": 0.9296875, "rewards/rank_think_format_reward": 0.4995870888233185, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 328.875, "epoch": 0.392, "grad_norm": 0.03086424618959427, "kl": 0.0010756254196166992, "learning_rate": 1.9999590690558545e-05, "loss": -0.0188, "reward": 1.048632025718689, "reward_std": 0.5718654319643974, "rewards/mrr_reward": 0.40448908507823944, "rewards/rank_answer_foramt_reward": 0.650390625, "rewards/rank_overall_format_reward": 0.9375, "rewards/rank_think_format_reward": 0.3640574663877487, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 353.6875, "epoch": 0.4, "grad_norm": 0.03133213520050049, "kl": 0.0009970664978027344, "learning_rate": 1.9999567635486086e-05, "loss": -0.0647, "reward": 1.1414598375558853, "reward_std": 0.3979858383536339, "rewards/mrr_reward": 0.4474826380610466, "rewards/rank_answer_foramt_reward": 0.693359375, "rewards/rank_overall_format_reward": 0.953125, "rewards/rank_think_format_reward": 0.45647673308849335, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 326.578125, "epoch": 0.408, "grad_norm": 0.031969401985406876, "kl": 0.0011034011840820312, "learning_rate": 1.9999543948786258e-05, "loss": -0.0894, "reward": 1.0822753310203552, "reward_std": 0.48015688359737396, "rewards/mrr_reward": 0.4179687649011612, "rewards/rank_answer_foramt_reward": 0.640625, "rewards/rank_overall_format_reward": 0.953125, "rewards/rank_think_format_reward": 0.419300127774477, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 312.890625, "epoch": 0.416, "grad_norm": 0.03478927165269852, "kl": 0.0011467933654785156, "learning_rate": 1.9999519630460554e-05, "loss": -0.0107, "reward": 1.0111391097307205, "reward_std": 0.4753674492239952, "rewards/mrr_reward": 0.4023437537252903, "rewards/rank_answer_foramt_reward": 0.59375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.2588968575000763, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 350.703125, "epoch": 0.424, "grad_norm": 0.036083146929740906, "kl": 0.0012824535369873047, "learning_rate": 1.999949468051052e-05, "loss": -0.0088, "reward": 1.0059151947498322, "reward_std": 0.45006410777568817, "rewards/mrr_reward": 0.30140748247504234, "rewards/rank_answer_foramt_reward": 0.599609375, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.566512443125248, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 352.4375, "epoch": 0.432, "grad_norm": 0.031547144055366516, "kl": 0.0010249614715576172, "learning_rate": 1.9999469098937726e-05, "loss": -0.0604, "reward": 1.0756309181451797, "reward_std": 0.41381075978279114, "rewards/mrr_reward": 0.3790246248245239, "rewards/rank_answer_foramt_reward": 0.6875, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.4624905288219452, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 321.234375, "epoch": 0.44, "grad_norm": 0.03417390212416649, "kl": 0.001626729965209961, "learning_rate": 1.9999442885743785e-05, "loss": -0.049, "reward": 1.037936955690384, "reward_std": 0.5517462939023972, "rewards/mrr_reward": 0.4401041716337204, "rewards/rank_answer_foramt_reward": 0.583984375, "rewards/rank_overall_format_reward": 0.9375, "rewards/rank_think_format_reward": 0.2901299186050892, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 322.46875, "epoch": 0.448, "grad_norm": 0.03233444690704346, "kl": 0.0018727779388427734, "learning_rate": 1.9999416040930354e-05, "loss": -0.0382, "reward": 0.9221065938472748, "reward_std": 0.46301373839378357, "rewards/mrr_reward": 0.289515133947134, "rewards/rank_answer_foramt_reward": 0.611328125, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.344678096473217, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 339.15625, "epoch": 0.456, "grad_norm": 0.03557359054684639, "kl": 0.0014951229095458984, "learning_rate": 1.9999388564499135e-05, "loss": -0.0302, "reward": 1.0118384808301926, "reward_std": 0.40575922280550003, "rewards/mrr_reward": 0.3382130526006222, "rewards/rank_answer_foramt_reward": 0.708984375, "rewards/rank_overall_format_reward": 0.9453125, "rewards/rank_think_format_reward": 0.38699227198958397, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 348.53125, "epoch": 0.464, "grad_norm": 0.03160971775650978, "kl": 0.0015277862548828125, "learning_rate": 1.999936045645186e-05, "loss": -0.0462, "reward": 1.2952305674552917, "reward_std": 0.4332849085330963, "rewards/mrr_reward": 0.5801215022802353, "rewards/rank_answer_foramt_reward": 0.7578125, "rewards/rank_overall_format_reward": 0.9375, "rewards/rank_think_format_reward": 0.4716843515634537, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 374.84375, "epoch": 0.472, "grad_norm": 0.029569024220108986, "kl": 0.0018584728240966797, "learning_rate": 1.9999331716790303e-05, "loss": -0.0289, "reward": 1.2610860168933868, "reward_std": 0.42367615550756454, "rewards/mrr_reward": 0.489583320915699, "rewards/rank_answer_foramt_reward": 0.802734375, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.5664023458957672, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 351.640625, "epoch": 0.48, "grad_norm": 0.031776271760463715, "kl": 0.001566171646118164, "learning_rate": 1.9999302345516278e-05, "loss": -0.048, "reward": 1.31424281001091, "reward_std": 0.5264566540718079, "rewards/mrr_reward": 0.5922247171401978, "rewards/rank_answer_foramt_reward": 0.767578125, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.45941800996661186, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 335.859375, "epoch": 0.488, "grad_norm": 0.0326690673828125, "kl": 0.003124237060546875, "learning_rate": 1.9999272342631644e-05, "loss": -0.0727, "reward": 1.1519178003072739, "reward_std": 0.41105426847934723, "rewards/mrr_reward": 0.4376183748245239, "rewards/rank_answer_foramt_reward": 0.771484375, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.43212172016501427, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 316.59375, "epoch": 0.496, "grad_norm": 0.032641906291246414, "kl": 0.003032207489013672, "learning_rate": 1.9999241708138296e-05, "loss": -0.0419, "reward": 1.0120218098163605, "reward_std": 0.5163902416825294, "rewards/mrr_reward": 0.37465277686715126, "rewards/rank_answer_foramt_reward": 0.619140625, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.3513430394232273, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 351.953125, "epoch": 0.504, "grad_norm": 0.03364777937531471, "kl": 0.0018007755279541016, "learning_rate": 1.9999210442038164e-05, "loss": 0.0066, "reward": 1.251243233680725, "reward_std": 0.46613559126853943, "rewards/mrr_reward": 0.5524925589561462, "rewards/rank_answer_foramt_reward": 0.80078125, "rewards/rank_overall_format_reward": 0.9453125, "rewards/rank_think_format_reward": 0.3713323250412941, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 355.546875, "epoch": 0.512, "grad_norm": 0.03263048082590103, "kl": 0.002674579620361328, "learning_rate": 1.9999178544333228e-05, "loss": -0.0261, "reward": 1.2319741547107697, "reward_std": 0.40322718769311905, "rewards/mrr_reward": 0.5384114533662796, "rewards/rank_answer_foramt_reward": 0.7578125, "rewards/rank_overall_format_reward": 0.9453125, "rewards/rank_think_format_reward": 0.3985799662768841, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 361.5, "epoch": 0.52, "grad_norm": 0.04350940138101578, "kl": 0.002279043197631836, "learning_rate": 1.9999146015025503e-05, "loss": -0.0436, "reward": 1.2106666564941406, "reward_std": 0.5054564848542213, "rewards/mrr_reward": 0.5012090876698494, "rewards/rank_answer_foramt_reward": 0.771484375, "rewards/rank_overall_format_reward": 0.9375, "rewards/rank_think_format_reward": 0.44088681042194366, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 372.046875, "epoch": 0.528, "grad_norm": 0.0323958545923233, "kl": 0.0024566650390625, "learning_rate": 1.999911285411704e-05, "loss": -0.0498, "reward": 0.9955967366695404, "reward_std": 0.2997688129544258, "rewards/mrr_reward": 0.2881200350821018, "rewards/rank_answer_foramt_reward": 0.70703125, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.4680873528122902, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 368.515625, "epoch": 0.536, "grad_norm": 0.03429020941257477, "kl": 0.0021696090698242188, "learning_rate": 1.9999079061609933e-05, "loss": -0.0498, "reward": 1.1947258114814758, "reward_std": 0.3867759630084038, "rewards/mrr_reward": 0.4760354682803154, "rewards/rank_answer_foramt_reward": 0.78515625, "rewards/rank_overall_format_reward": 0.9296875, "rewards/rank_think_format_reward": 0.46300553530454636, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 360.96875, "epoch": 0.544, "grad_norm": 0.04061827063560486, "kl": 0.003525972366333008, "learning_rate": 1.999904463750632e-05, "loss": 0.0386, "reward": 1.1350408345460892, "reward_std": 0.4857459217309952, "rewards/mrr_reward": 0.487152773886919, "rewards/rank_answer_foramt_reward": 0.716796875, "rewards/rank_overall_format_reward": 0.953125, "rewards/rank_think_format_reward": 0.2933751530945301, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 351.46875, "epoch": 0.552, "grad_norm": 0.036203574389219284, "kl": 0.0033969879150390625, "learning_rate": 1.999900958180838e-05, "loss": -0.0501, "reward": 1.3717794716358185, "reward_std": 0.34492378681898117, "rewards/mrr_reward": 0.613802082836628, "rewards/rank_answer_foramt_reward": 0.853515625, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.4668227881193161, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 328.59375, "epoch": 0.56, "grad_norm": 0.03513631597161293, "kl": 0.0030303001403808594, "learning_rate": 1.9998973894518318e-05, "loss": -0.0498, "reward": 1.38496533036232, "reward_std": 0.565589427947998, "rewards/mrr_reward": 0.6923362985253334, "rewards/rank_answer_foramt_reward": 0.7890625, "rewards/rank_overall_format_reward": 0.9375, "rewards/rank_think_format_reward": 0.37231335788965225, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 352.578125, "epoch": 0.568, "grad_norm": 0.037784941494464874, "kl": 0.005275249481201172, "learning_rate": 1.999893757563839e-05, "loss": -0.0125, "reward": 1.1960534453392029, "reward_std": 0.4160301834344864, "rewards/mrr_reward": 0.48038194328546524, "rewards/rank_answer_foramt_reward": 0.78515625, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.4147949740290642, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 357.625, "epoch": 0.576, "grad_norm": 0.040549177676439285, "kl": 0.004588603973388672, "learning_rate": 1.9998900625170897e-05, "loss": -0.0169, "reward": 1.429105520248413, "reward_std": 0.4955332353711128, "rewards/mrr_reward": 0.6686197817325592, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_overall_format_reward": 0.921875, "rewards/rank_think_format_reward": 0.5095802322030067, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 364.578125, "epoch": 0.584, "grad_norm": 0.03511694818735123, "kl": 0.003040313720703125, "learning_rate": 1.9998863043118163e-05, "loss": -0.0441, "reward": 0.9159589856863022, "reward_std": 0.38665422797203064, "rewards/mrr_reward": 0.22663691639900208, "rewards/rank_answer_foramt_reward": 0.576171875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.5283078029751778, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 369.3125, "epoch": 0.592, "grad_norm": 0.03343343734741211, "kl": 0.0038709640502929688, "learning_rate": 1.999882482948257e-05, "loss": -0.0629, "reward": 1.190420851111412, "reward_std": 0.47938936948776245, "rewards/mrr_reward": 0.45703125, "rewards/rank_answer_foramt_reward": 0.708984375, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.5446582287549973, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 351.015625, "epoch": 0.6, "grad_norm": 0.03709420561790466, "kl": 0.003997325897216797, "learning_rate": 1.999878598426653e-05, "loss": -0.0283, "reward": 1.1767661273479462, "reward_std": 0.46029242873191833, "rewards/mrr_reward": 0.4516245126724243, "rewards/rank_answer_foramt_reward": 0.771484375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.4493520185351372, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 377.59375, "epoch": 0.608, "grad_norm": 0.03993833065032959, "kl": 0.003428936004638672, "learning_rate": 1.9998746507472493e-05, "loss": -0.0233, "reward": 1.3970292508602142, "reward_std": 0.4703235626220703, "rewards/mrr_reward": 0.6536458432674408, "rewards/rank_answer_foramt_reward": 0.83203125, "rewards/rank_overall_format_reward": 0.9453125, "rewards/rank_think_format_reward": 0.47533298283815384, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 379.546875, "epoch": 0.616, "grad_norm": 0.03496386855840683, "kl": 0.0029549598693847656, "learning_rate": 1.999870639910296e-05, "loss": -0.0372, "reward": 1.3616310954093933, "reward_std": 0.3801681473851204, "rewards/mrr_reward": 0.5703992992639542, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.5949375629425049, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 346.90625, "epoch": 0.624, "grad_norm": 0.039075274020433426, "kl": 0.005405902862548828, "learning_rate": 1.9998665659160453e-05, "loss": -0.0099, "reward": 1.1831572949886322, "reward_std": 0.48211684823036194, "rewards/mrr_reward": 0.4921874850988388, "rewards/rank_answer_foramt_reward": 0.689453125, "rewards/rank_overall_format_reward": 0.9375, "rewards/rank_think_format_reward": 0.46689455583691597, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 373.546875, "epoch": 0.632, "grad_norm": 0.03583746775984764, "kl": 0.007263660430908203, "learning_rate": 1.999862428764756e-05, "loss": -0.057, "reward": 1.1416280269622803, "reward_std": 0.49653460085392, "rewards/mrr_reward": 0.41783855855464935, "rewards/rank_answer_foramt_reward": 0.689453125, "rewards/rank_overall_format_reward": 0.953125, "rewards/rank_think_format_reward": 0.5507232397794724, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 367.84375, "epoch": 0.64, "grad_norm": 0.03996375948190689, "kl": 0.0040721893310546875, "learning_rate": 1.9998582284566878e-05, "loss": -0.0417, "reward": 1.2116663455963135, "reward_std": 0.4182490184903145, "rewards/mrr_reward": 0.4896267279982567, "rewards/rank_answer_foramt_reward": 0.755859375, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.4712018519639969, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 383.046875, "epoch": 0.648, "grad_norm": 0.03645450249314308, "kl": 0.0038928985595703125, "learning_rate": 1.999853964992107e-05, "loss": -0.0315, "reward": 1.1708963364362717, "reward_std": 0.4123397395014763, "rewards/mrr_reward": 0.4096788167953491, "rewards/rank_answer_foramt_reward": 0.7734375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.5489069819450378, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 338.25, "epoch": 0.656, "grad_norm": 0.03626188263297081, "kl": 0.004076957702636719, "learning_rate": 1.9998496383712828e-05, "loss": -0.0268, "reward": 1.4341766834259033, "reward_std": 0.3844763785600662, "rewards/mrr_reward": 0.651041679084301, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.5000894367694855, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 366.765625, "epoch": 0.664, "grad_norm": 0.03594069182872772, "kl": 0.006040096282958984, "learning_rate": 1.999845248594489e-05, "loss": -0.0422, "reward": 1.225644826889038, "reward_std": 0.45316731184720993, "rewards/mrr_reward": 0.45325520634651184, "rewards/rank_answer_foramt_reward": 0.798828125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.5495589375495911, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 368.078125, "epoch": 0.672, "grad_norm": 0.03165300935506821, "kl": 0.0038886070251464844, "learning_rate": 1.9998407956620017e-05, "loss": -0.0321, "reward": 1.4280948042869568, "reward_std": 0.45007024705410004, "rewards/mrr_reward": 0.6236979365348816, "rewards/rank_answer_foramt_reward": 0.841796875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.6113943159580231, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 381.21875, "epoch": 0.68, "grad_norm": 0.03456708416342735, "kl": 0.00469207763671875, "learning_rate": 1.9998362795741027e-05, "loss": -0.033, "reward": 1.0892604291439056, "reward_std": 0.34771857038140297, "rewards/mrr_reward": 0.32194321043789387, "rewards/rank_answer_foramt_reward": 0.78125, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.583016149699688, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 388.171875, "epoch": 0.688, "grad_norm": 0.03425449877977371, "kl": 0.004755496978759766, "learning_rate": 1.9998317003310775e-05, "loss": -0.0464, "reward": 1.2324982285499573, "reward_std": 0.42087381333112717, "rewards/mrr_reward": 0.4638020992279053, "rewards/rank_answer_foramt_reward": 0.798828125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.5539913699030876, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 368.65625, "epoch": 0.696, "grad_norm": 0.04177427664399147, "kl": 0.00435638427734375, "learning_rate": 1.9998270579332154e-05, "loss": -0.0672, "reward": 1.3639829754829407, "reward_std": 0.43886173516511917, "rewards/mrr_reward": 0.6211123615503311, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.42495106160640717, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 384.25, "epoch": 0.704, "grad_norm": 0.03630625456571579, "kl": 0.0060329437255859375, "learning_rate": 1.9998223523808092e-05, "loss": 0.0047, "reward": 1.08887879550457, "reward_std": 0.4105418995022774, "rewards/mrr_reward": 0.3867373540997505, "rewards/rank_answer_foramt_reward": 0.65234375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.4987950399518013, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 421.265625, "epoch": 0.712, "grad_norm": 0.033998582512140274, "kl": 0.004588127136230469, "learning_rate": 1.9998175836741564e-05, "loss": -0.0166, "reward": 1.3856353461742401, "reward_std": 0.5018515959382057, "rewards/mrr_reward": 0.5744357854127884, "rewards/rank_answer_foramt_reward": 0.802734375, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.6866960972547531, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 391.59375, "epoch": 0.72, "grad_norm": 0.03614083677530289, "kl": 0.005847930908203125, "learning_rate": 1.999812751813558e-05, "loss": -0.0538, "reward": 1.2505441904067993, "reward_std": 0.39697666093707085, "rewards/mrr_reward": 0.4752671793103218, "rewards/rank_answer_foramt_reward": 0.814453125, "rewards/rank_overall_format_reward": 0.953125, "rewards/rank_think_format_reward": 0.5817459300160408, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 378.15625, "epoch": 0.728, "grad_norm": 0.03517737612128258, "kl": 0.006634712219238281, "learning_rate": 1.9998078567993197e-05, "loss": -0.0462, "reward": 1.4160068929195404, "reward_std": 0.3253961279988289, "rewards/mrr_reward": 0.6407738029956818, "rewards/rank_answer_foramt_reward": 0.853515625, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.5191128998994827, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 376.6875, "epoch": 0.736, "grad_norm": 0.03939124941825867, "kl": 0.006640434265136719, "learning_rate": 1.9998028986317504e-05, "loss": 0.0157, "reward": 1.2075020372867584, "reward_std": 0.48353683948516846, "rewards/mrr_reward": 0.5126488097012043, "rewards/rank_answer_foramt_reward": 0.748046875, "rewards/rank_overall_format_reward": 0.9296875, "rewards/rank_think_format_reward": 0.42788132280111313, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 395.0625, "epoch": 0.744, "grad_norm": 0.036580055952072144, "kl": 0.0062274932861328125, "learning_rate": 1.999797877311163e-05, "loss": -0.0228, "reward": 1.3180749416351318, "reward_std": 0.3634856082499027, "rewards/mrr_reward": 0.49606895446777344, "rewards/rank_answer_foramt_reward": 0.830078125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.6842865198850632, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 357.203125, "epoch": 0.752, "grad_norm": 0.03729122877120972, "kl": 0.0060596466064453125, "learning_rate": 1.9997927928378753e-05, "loss": -0.0667, "reward": 1.5598929524421692, "reward_std": 0.28483540937304497, "rewards/mrr_reward": 0.7404017746448517, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.5653376057744026, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 414.5625, "epoch": 0.76, "grad_norm": 0.037984032183885574, "kl": 0.004604816436767578, "learning_rate": 1.999787645212208e-05, "loss": -0.0151, "reward": 1.3950347006320953, "reward_std": 0.4405433312058449, "rewards/mrr_reward": 0.5818328410387039, "rewards/rank_answer_foramt_reward": 0.77734375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7025292068719864, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 396.5, "epoch": 0.768, "grad_norm": 0.03686251863837242, "kl": 0.0077533721923828125, "learning_rate": 1.999782434434486e-05, "loss": 0.0062, "reward": 1.0413940846920013, "reward_std": 0.45301979780197144, "rewards/mrr_reward": 0.3227802626788616, "rewards/rank_answer_foramt_reward": 0.626953125, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.5819142758846283, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 393.125, "epoch": 0.776, "grad_norm": 0.035745490342378616, "kl": 0.0069942474365234375, "learning_rate": 1.999777160505039e-05, "loss": -0.056, "reward": 1.2480146288871765, "reward_std": 0.38091614469885826, "rewards/mrr_reward": 0.4531250074505806, "rewards/rank_answer_foramt_reward": 0.806640625, "rewards/rank_overall_format_reward": 0.953125, "rewards/rank_think_format_reward": 0.6489906013011932, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 403.96875, "epoch": 0.784, "grad_norm": 0.03713075444102287, "kl": 0.007597923278808594, "learning_rate": 1.9997718234242e-05, "loss": -0.0048, "reward": 1.2264422178268433, "reward_std": 0.4423217736184597, "rewards/mrr_reward": 0.46302084624767303, "rewards/rank_answer_foramt_reward": 0.744140625, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.5926948338747025, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 389.484375, "epoch": 0.792, "grad_norm": 0.03935292363166809, "kl": 0.009016036987304688, "learning_rate": 1.999766423192306e-05, "loss": -0.0447, "reward": 1.442174106836319, "reward_std": 0.2968660295009613, "rewards/mrr_reward": 0.6010602712631226, "rewards/rank_answer_foramt_reward": 0.912109375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.6523452401161194, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 403.90625, "epoch": 0.8, "grad_norm": 0.040015578269958496, "kl": 0.00775146484375, "learning_rate": 1.9997609598096982e-05, "loss": 0.0258, "reward": 1.3165824115276337, "reward_std": 0.4892076849937439, "rewards/mrr_reward": 0.546875, "rewards/rank_answer_foramt_reward": 0.78125, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.5902589708566666, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 411.125, "epoch": 0.808, "grad_norm": 0.037578992545604706, "kl": 0.007415771484375, "learning_rate": 1.9997554332767214e-05, "loss": -0.0399, "reward": 1.4174852073192596, "reward_std": 0.41939637064933777, "rewards/mrr_reward": 0.6032738238573074, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_overall_format_reward": 0.953125, "rewards/rank_think_format_reward": 0.6411352306604385, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 385.046875, "epoch": 0.816, "grad_norm": 0.03692830353975296, "kl": 0.007373809814453125, "learning_rate": 1.9997498435937254e-05, "loss": -0.0086, "reward": 1.2665570676326752, "reward_std": 0.4313989281654358, "rewards/mrr_reward": 0.53735176846385, "rewards/rank_answer_foramt_reward": 0.8203125, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.42846303433179855, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 411.96875, "epoch": 0.824, "grad_norm": 0.03523707389831543, "kl": 0.007399559020996094, "learning_rate": 1.9997441907610624e-05, "loss": 0.056, "reward": 1.2068978399038315, "reward_std": 0.3156552240252495, "rewards/mrr_reward": 0.43065476045012474, "rewards/rank_answer_foramt_reward": 0.826171875, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.5573297291994095, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 420.421875, "epoch": 0.832, "grad_norm": 0.03504414111375809, "kl": 0.00534820556640625, "learning_rate": 1.9997384747790903e-05, "loss": -0.0279, "reward": 1.2453699111938477, "reward_std": 0.3804602213203907, "rewards/mrr_reward": 0.44001737236976624, "rewards/rank_answer_foramt_reward": 0.84765625, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.6162433475255966, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 413.515625, "epoch": 0.84, "grad_norm": 0.033324431627988815, "kl": 0.0075550079345703125, "learning_rate": 1.9997326956481693e-05, "loss": -0.0007, "reward": 1.3570080995559692, "reward_std": 0.35840654745697975, "rewards/mrr_reward": 0.5557477697730064, "rewards/rank_answer_foramt_reward": 0.830078125, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.6370457410812378, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 418.03125, "epoch": 0.848, "grad_norm": 0.034413669258356094, "kl": 0.005972862243652344, "learning_rate": 1.999726853368665e-05, "loss": -0.0128, "reward": 1.4416370689868927, "reward_std": 0.435688741505146, "rewards/mrr_reward": 0.6221354156732559, "rewards/rank_answer_foramt_reward": 0.828125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.6630256772041321, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 439.8125, "epoch": 0.856, "grad_norm": 0.0455174446105957, "kl": 0.007664680480957031, "learning_rate": 1.9997209479409464e-05, "loss": 0.0547, "reward": 1.3583179116249084, "reward_std": 0.33279163390398026, "rewards/mrr_reward": 0.5312686040997505, "rewards/rank_answer_foramt_reward": 0.857421875, "rewards/rank_overall_format_reward": 0.9375, "rewards/rank_think_format_reward": 0.7112879157066345, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 405.125, "epoch": 0.864, "grad_norm": 0.03769733011722565, "kl": 0.007842063903808594, "learning_rate": 1.9997149793653862e-05, "loss": -0.0215, "reward": 1.5695316791534424, "reward_std": 0.4045773670077324, "rewards/mrr_reward": 0.7216145843267441, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7022580057382584, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 377.25, "epoch": 0.872, "grad_norm": 0.044221311807632446, "kl": 0.009271621704101562, "learning_rate": 1.9997089476423617e-05, "loss": 0.0187, "reward": 1.373361498117447, "reward_std": 0.4413030967116356, "rewards/mrr_reward": 0.5739335417747498, "rewards/rank_answer_foramt_reward": 0.84375, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.6178214848041534, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 417.984375, "epoch": 0.88, "grad_norm": 0.03868413344025612, "kl": 0.009458541870117188, "learning_rate": 1.999702852772254e-05, "loss": -0.0229, "reward": 1.2501296997070312, "reward_std": 0.34093382209539413, "rewards/mrr_reward": 0.4071986749768257, "rewards/rank_answer_foramt_reward": 0.84375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7183988690376282, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 411.921875, "epoch": 0.888, "grad_norm": 0.041314590722322464, "kl": 0.00878143310546875, "learning_rate": 1.9996966947554476e-05, "loss": -0.0452, "reward": 1.5108999907970428, "reward_std": 0.38177699968218803, "rewards/mrr_reward": 0.6616443544626236, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.6848299354314804, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 390.09375, "epoch": 0.896, "grad_norm": 0.03469943255186081, "kl": 0.008004188537597656, "learning_rate": 1.9996904735923325e-05, "loss": -0.024, "reward": 1.4337850511074066, "reward_std": 0.3383907675743103, "rewards/mrr_reward": 0.604687511920929, "rewards/rank_answer_foramt_reward": 0.884765625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.6432760506868362, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 421.625, "epoch": 0.904, "grad_norm": 0.03499722108244896, "kl": 0.008179664611816406, "learning_rate": 1.9996841892833e-05, "loss": 0.009, "reward": 1.4004603326320648, "reward_std": 0.4079892486333847, "rewards/mrr_reward": 0.597135417163372, "rewards/rank_answer_foramt_reward": 0.828125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.6218177676200867, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 412.703125, "epoch": 0.912, "grad_norm": 0.03794240951538086, "kl": 0.007323265075683594, "learning_rate": 1.9996778418287486e-05, "loss": -0.0338, "reward": 1.2410678267478943, "reward_std": 0.3803473189473152, "rewards/mrr_reward": 0.4220486134290695, "rewards/rank_answer_foramt_reward": 0.78125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7084388732910156, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 400.453125, "epoch": 0.92, "grad_norm": 0.03945469111204147, "kl": 0.010195732116699219, "learning_rate": 1.9996714312290784e-05, "loss": -0.0435, "reward": 1.2030570209026337, "reward_std": 0.4484190344810486, "rewards/mrr_reward": 0.38446180522441864, "rewards/rank_answer_foramt_reward": 0.759765625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7364507168531418, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 420.8125, "epoch": 0.928, "grad_norm": 0.03646434098482132, "kl": 0.008108139038085938, "learning_rate": 1.9996649574846948e-05, "loss": -0.0451, "reward": 1.4321197271347046, "reward_std": 0.3283313438296318, "rewards/mrr_reward": 0.5520399212837219, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7587052434682846, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 420.859375, "epoch": 0.936, "grad_norm": 0.040539614856243134, "kl": 0.009557723999023438, "learning_rate": 1.9996584205960063e-05, "loss": -0.0017, "reward": 1.408842772245407, "reward_std": 0.3340775966644287, "rewards/mrr_reward": 0.533283744007349, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7664903402328491, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 428.03125, "epoch": 0.944, "grad_norm": 0.03920678421854973, "kl": 0.009622573852539062, "learning_rate": 1.999651820563426e-05, "loss": -0.0548, "reward": 1.2891173958778381, "reward_std": 0.3157798573374748, "rewards/mrr_reward": 0.4310639798641205, "rewards/rank_answer_foramt_reward": 0.83203125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7759429067373276, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 405.359375, "epoch": 0.952, "grad_norm": 0.09537135064601898, "kl": 0.0254364013671875, "learning_rate": 1.999645157387371e-05, "loss": -0.0146, "reward": 1.5277230143547058, "reward_std": 0.3659735471010208, "rewards/mrr_reward": 0.6688368320465088, "rewards/rank_answer_foramt_reward": 0.927734375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.6905757486820221, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 412.421875, "epoch": 0.96, "grad_norm": 0.04111702740192413, "kl": 0.008625030517578125, "learning_rate": 1.9996384310682615e-05, "loss": -0.0319, "reward": 1.2409729659557343, "reward_std": 0.3955169692635536, "rewards/mrr_reward": 0.40638699010014534, "rewards/rank_answer_foramt_reward": 0.814453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7224076837301254, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 397.828125, "epoch": 0.968, "grad_norm": 0.03895534947514534, "kl": 0.008586883544921875, "learning_rate": 1.999631641606523e-05, "loss": -0.0318, "reward": 1.2535248398780823, "reward_std": 0.4473446235060692, "rewards/mrr_reward": 0.46145833283662796, "rewards/rank_answer_foramt_reward": 0.74609375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.6775450259447098, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 422.40625, "epoch": 0.976, "grad_norm": 0.0410754568874836, "kl": 0.008921623229980469, "learning_rate": 1.9996247890025845e-05, "loss": 0.03, "reward": 1.274911493062973, "reward_std": 0.4455215707421303, "rewards/mrr_reward": 0.4238405302166939, "rewards/rank_answer_foramt_reward": 0.84375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7430653125047684, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 395.328125, "epoch": 0.984, "grad_norm": 0.038471318781375885, "kl": 0.008017539978027344, "learning_rate": 1.9996178732568784e-05, "loss": -0.0218, "reward": 1.1934560984373093, "reward_std": 0.37501702457666397, "rewards/mrr_reward": 0.38322172313928604, "rewards/rank_answer_foramt_reward": 0.79296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.6700992956757545, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 429.859375, "epoch": 0.992, "grad_norm": 0.03736767917871475, "kl": 0.009922027587890625, "learning_rate": 1.9996108943698412e-05, "loss": -0.0288, "reward": 1.4828196465969086, "reward_std": 0.34185753017663956, "rewards/mrr_reward": 0.6228484585881233, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 0.953125, "rewards/rank_think_format_reward": 0.764176219701767, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 382.84375, "epoch": 1.0, "grad_norm": 0.03988206014037132, "kl": 0.009454727172851562, "learning_rate": 1.9996038523419148e-05, "loss": -0.0202, "reward": 1.2809572219848633, "reward_std": 0.4308247435837984, "rewards/mrr_reward": 0.4690104201436043, "rewards/rank_answer_foramt_reward": 0.775390625, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7084915935993195, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 409.46875, "epoch": 0.1008, "grad_norm": 0.03588635101914406, "kl": 0.007241249084472656, "learning_rate": 1.7583619152887222e-05, "loss": -0.0384, "reward": 1.2436591684818268, "reward_std": 0.4599437266588211, "rewards/mrr_reward": 0.4638392850756645, "rewards/rank_answer_foramt_reward": 0.765625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.6130905151367188, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 416.03125, "epoch": 0.1016, "grad_norm": 0.042928650975227356, "kl": 0.009435653686523438, "learning_rate": 1.754251380736104e-05, "loss": 0.0143, "reward": 1.1844342947006226, "reward_std": 0.42420684546232224, "rewards/mrr_reward": 0.36152033507823944, "rewards/rank_answer_foramt_reward": 0.712890625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7807879894971848, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 418.09375, "epoch": 0.1024, "grad_norm": 0.03468465059995651, "kl": 0.009843826293945312, "learning_rate": 1.7501110696304598e-05, "loss": 0.0222, "reward": 1.2601959109306335, "reward_std": 0.40002964437007904, "rewards/mrr_reward": 0.4385974779725075, "rewards/rank_answer_foramt_reward": 0.818359375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.694770097732544, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 418.75, "epoch": 0.1032, "grad_norm": 0.037175022065639496, "kl": 0.00865936279296875, "learning_rate": 1.7459411454241822e-05, "loss": -0.0003, "reward": 1.2285803258419037, "reward_std": 0.305690661072731, "rewards/mrr_reward": 0.4261036813259125, "rewards/rank_answer_foramt_reward": 0.75, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.6895599216222763, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 397.796875, "epoch": 0.104, "grad_norm": 0.04238922521471977, "kl": 0.008852005004882812, "learning_rate": 1.7417417727387392e-05, "loss": -0.0508, "reward": 1.2500934600830078, "reward_std": 0.4739305451512337, "rewards/mrr_reward": 0.4687500074505806, "rewards/rank_answer_foramt_reward": 0.712890625, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.6938792169094086, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 414.546875, "epoch": 0.1048, "grad_norm": 0.039994798600673676, "kl": 0.008302688598632812, "learning_rate": 1.737513117358174e-05, "loss": -0.0044, "reward": 1.3667091131210327, "reward_std": 0.4646128937602043, "rewards/mrr_reward": 0.5651041865348816, "rewards/rank_answer_foramt_reward": 0.751953125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.6849651783704758, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 436.765625, "epoch": 0.1056, "grad_norm": 0.045952655375003815, "kl": 0.007769584655761719, "learning_rate": 1.7332553462225604e-05, "loss": -0.0423, "reward": 1.362520307302475, "reward_std": 0.40486711636185646, "rewards/mrr_reward": 0.5388020724058151, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.6621311977505684, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 407.953125, "epoch": 0.1064, "grad_norm": 0.03839990124106407, "kl": 0.009153366088867188, "learning_rate": 1.7289686274214116e-05, "loss": -0.0437, "reward": 1.3655548691749573, "reward_std": 0.3156409915536642, "rewards/mrr_reward": 0.5556609630584717, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.6007082015275955, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 413.703125, "epoch": 0.1072, "grad_norm": 0.04110146313905716, "kl": 0.009965896606445312, "learning_rate": 1.7246531301870467e-05, "loss": 0.0237, "reward": 1.2509280890226364, "reward_std": 0.3098420277237892, "rewards/mrr_reward": 0.42760417610406876, "rewards/rank_answer_foramt_reward": 0.802734375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7078114748001099, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 418.90625, "epoch": 0.108, "grad_norm": 0.03654169663786888, "kl": 0.011074066162109375, "learning_rate": 1.720309024887907e-05, "loss": -0.0004, "reward": 1.2900923788547516, "reward_std": 0.30175913497805595, "rewards/mrr_reward": 0.4537264332175255, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7238953113555908, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 418.65625, "epoch": 0.1088, "grad_norm": 0.03870633617043495, "kl": 0.0076389312744140625, "learning_rate": 1.7159364830218312e-05, "loss": -0.0273, "reward": 1.4046232402324677, "reward_std": 0.1458736453205347, "rewards/mrr_reward": 0.5324838608503342, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7541746199131012, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 421.453125, "epoch": 0.1096, "grad_norm": 0.03937549516558647, "kl": 0.0086822509765625, "learning_rate": 1.7115356772092858e-05, "loss": -0.0347, "reward": 1.4993912875652313, "reward_std": 0.3712911829352379, "rewards/mrr_reward": 0.6232638955116272, "rewards/rank_answer_foramt_reward": 0.900390625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7701657563447952, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 413.609375, "epoch": 0.1104, "grad_norm": 0.03570104390382767, "kl": 0.008465766906738281, "learning_rate": 1.7071067811865477e-05, "loss": -0.0247, "reward": 1.6328608393669128, "reward_std": 0.45016467571258545, "rewards/mrr_reward": 0.8177083283662796, "rewards/rank_answer_foramt_reward": 0.869140625, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.6400808244943619, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 442.46875, "epoch": 0.1112, "grad_norm": 0.03384555131196976, "kl": 0.00774383544921875, "learning_rate": 1.7026499697988496e-05, "loss": -0.0086, "reward": 1.325139045715332, "reward_std": 0.34500668570399284, "rewards/mrr_reward": 0.46684029698371887, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7669208198785782, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 439.390625, "epoch": 0.112, "grad_norm": 0.03522227331995964, "kl": 0.007678031921386719, "learning_rate": 1.698165418993473e-05, "loss": 0.0335, "reward": 1.4752865731716156, "reward_std": 0.341037068516016, "rewards/mrr_reward": 0.6348958387970924, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.6892164349555969, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 411.140625, "epoch": 0.1128, "grad_norm": 0.03765201196074486, "kl": 0.008716583251953125, "learning_rate": 1.693653305812805e-05, "loss": -0.0408, "reward": 1.3769253194332123, "reward_std": 0.3529956042766571, "rewards/mrr_reward": 0.5243489742279053, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7085645794868469, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 441.390625, "epoch": 0.1136, "grad_norm": 0.04156142473220825, "kl": 0.01380157470703125, "learning_rate": 1.6891138083873486e-05, "loss": -0.0025, "reward": 1.199166625738144, "reward_std": 0.3892873339354992, "rewards/mrr_reward": 0.3583643361926079, "rewards/rank_answer_foramt_reward": 0.822265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7334325760602951, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 415.328125, "epoch": 0.1144, "grad_norm": 0.04161708801984787, "kl": 0.008413314819335938, "learning_rate": 1.684547105928689e-05, "loss": 0.0184, "reward": 1.47340127825737, "reward_std": 0.5706894248723984, "rewards/mrr_reward": 0.6829427182674408, "rewards/rank_answer_foramt_reward": 0.79296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.6179851442575455, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 410.890625, "epoch": 0.1152, "grad_norm": 0.038589105010032654, "kl": 0.007936477661132812, "learning_rate": 1.6799533787224192e-05, "loss": -0.033, "reward": 1.3689055740833282, "reward_std": 0.29699520394206047, "rewards/mrr_reward": 0.5373697876930237, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.6291802376508713, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 429.03125, "epoch": 0.116, "grad_norm": 0.034039001911878586, "kl": 0.006977081298828125, "learning_rate": 1.6753328081210244e-05, "loss": -0.0048, "reward": 1.5173978507518768, "reward_std": 0.41116751730442047, "rewards/mrr_reward": 0.6428571343421936, "rewards/rank_answer_foramt_reward": 0.84765625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8024669140577316, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 439.9375, "epoch": 0.1168, "grad_norm": 0.03563903272151947, "kl": 0.008948326110839844, "learning_rate": 1.6706855765367202e-05, "loss": -0.0162, "reward": 1.3468570411205292, "reward_std": 0.36933452636003494, "rewards/mrr_reward": 0.510937511920929, "rewards/rank_answer_foramt_reward": 0.703125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8455893844366074, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 410.421875, "epoch": 0.1176, "grad_norm": 0.036863867193460464, "kl": 0.008589744567871094, "learning_rate": 1.666011867434252e-05, "loss": -0.0109, "reward": 1.2238706946372986, "reward_std": 0.38623613119125366, "rewards/mrr_reward": 0.3937686011195183, "rewards/rank_answer_foramt_reward": 0.779296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7439763844013214, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 449.609375, "epoch": 0.1184, "grad_norm": 0.03305863216519356, "kl": 0.007039070129394531, "learning_rate": 1.661311865323652e-05, "loss": -0.017, "reward": 1.4561704695224762, "reward_std": 0.2617410905659199, "rewards/mrr_reward": 0.5868675485253334, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7123762518167496, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 432.46875, "epoch": 0.1192, "grad_norm": 0.03671317920088768, "kl": 0.008609771728515625, "learning_rate": 1.6565857557529567e-05, "loss": -0.0152, "reward": 1.1992796063423157, "reward_std": 0.40606988221406937, "rewards/mrr_reward": 0.3986979275941849, "rewards/rank_answer_foramt_reward": 0.6875, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7619423866271973, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 417.0, "epoch": 0.12, "grad_norm": 0.035398129373788834, "kl": 0.007327079772949219, "learning_rate": 1.651833725300879e-05, "loss": -0.0072, "reward": 1.4916549921035767, "reward_std": 0.3184054736047983, "rewards/mrr_reward": 0.6293340772390366, "rewards/rank_answer_foramt_reward": 0.83203125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7810622304677963, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 407.796875, "epoch": 0.1208, "grad_norm": 0.04079672694206238, "kl": 0.009191513061523438, "learning_rate": 1.6470559615694445e-05, "loss": -0.0333, "reward": 1.3449455797672272, "reward_std": 0.35879848897457123, "rewards/mrr_reward": 0.5183903872966766, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.6101813167333603, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 430.53125, "epoch": 0.1216, "grad_norm": 0.037399739027023315, "kl": 0.0074825286865234375, "learning_rate": 1.6422526531765846e-05, "loss": -0.0135, "reward": 1.627054363489151, "reward_std": 0.3690681420266628, "rewards/mrr_reward": 0.7510850876569748, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7306240200996399, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 424.328125, "epoch": 0.1224, "grad_norm": 0.0394161157310009, "kl": 0.009618759155273438, "learning_rate": 1.63742398974869e-05, "loss": -0.0023, "reward": 1.2834027111530304, "reward_std": 0.41649453714489937, "rewards/mrr_reward": 0.45035962387919426, "rewards/rank_answer_foramt_reward": 0.818359375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7216386198997498, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 453.046875, "epoch": 0.1232, "grad_norm": 0.033092282712459564, "kl": 0.009715080261230469, "learning_rate": 1.6325701619131246e-05, "loss": 0.0023, "reward": 1.5148942172527313, "reward_std": 0.4242171198129654, "rewards/mrr_reward": 0.6458953246474266, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.7739546746015549, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 434.328125, "epoch": 0.124, "grad_norm": 0.03961697220802307, "kl": 0.0072040557861328125, "learning_rate": 1.6276913612907005e-05, "loss": 0.0073, "reward": 1.5414968132972717, "reward_std": 0.338295828551054, "rewards/mrr_reward": 0.6806175708770752, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.70833420753479, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 377.34375, "epoch": 0.1248, "grad_norm": 0.03811102360486984, "kl": 0.008966445922851562, "learning_rate": 1.6227877804881126e-05, "loss": -0.0576, "reward": 1.5272436439990997, "reward_std": 0.25212423875927925, "rewards/mrr_reward": 0.7075520902872086, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.5815699324011803, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 425.015625, "epoch": 0.1256, "grad_norm": 0.040453094989061356, "kl": 0.009019851684570312, "learning_rate": 1.6178596130903345e-05, "loss": 0.0118, "reward": 1.3084798157215118, "reward_std": 0.41541341692209244, "rewards/mrr_reward": 0.4686570018529892, "rewards/rank_answer_foramt_reward": 0.806640625, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7617143541574478, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 426.859375, "epoch": 0.1264, "grad_norm": 0.03529253602027893, "kl": 0.00853729248046875, "learning_rate": 1.6129070536529767e-05, "loss": -0.0249, "reward": 1.3785496950149536, "reward_std": 0.317409735172987, "rewards/mrr_reward": 0.5057911723852158, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7912070602178574, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 423.328125, "epoch": 0.1272, "grad_norm": 0.03864454850554466, "kl": 0.008512496948242188, "learning_rate": 1.6079302976946055e-05, "loss": -0.0397, "reward": 1.2989526093006134, "reward_std": 0.4882466495037079, "rewards/mrr_reward": 0.4919270947575569, "rewards/rank_answer_foramt_reward": 0.765625, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7033442407846451, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 411.3125, "epoch": 0.128, "grad_norm": 0.03776967152953148, "kl": 0.010316848754882812, "learning_rate": 1.602929541689025e-05, "loss": -0.0441, "reward": 1.2715973854064941, "reward_std": 0.31563179939985275, "rewards/mrr_reward": 0.4436384178698063, "rewards/rank_answer_foramt_reward": 0.845703125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.6788883656263351, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 392.609375, "epoch": 0.1288, "grad_norm": 0.04325239732861519, "kl": 0.00868988037109375, "learning_rate": 1.597904983057519e-05, "loss": -0.0253, "reward": 1.3210601806640625, "reward_std": 0.34434930980205536, "rewards/mrr_reward": 0.46409972012043, "rewards/rank_answer_foramt_reward": 0.900390625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.704271599650383, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 430.640625, "epoch": 0.1296, "grad_norm": 0.033945418894290924, "kl": 0.009050369262695312, "learning_rate": 1.5928568201610593e-05, "loss": -0.0045, "reward": 1.0713335573673248, "reward_std": 0.262711264193058, "rewards/mrr_reward": 0.23516865447163582, "rewards/rank_answer_foramt_reward": 0.80859375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7486767023801804, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 433.515625, "epoch": 0.1304, "grad_norm": 0.035303499549627304, "kl": 0.008235931396484375, "learning_rate": 1.5877852522924733e-05, "loss": -0.0111, "reward": 1.5817199647426605, "reward_std": 0.4066779837012291, "rewards/mrr_reward": 0.7029947862029076, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8092876970767975, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 450.078125, "epoch": 0.1312, "grad_norm": 0.03836703300476074, "kl": 0.008309364318847656, "learning_rate": 1.5826904796685763e-05, "loss": -0.0341, "reward": 1.3209553062915802, "reward_std": 0.3643554821610451, "rewards/mrr_reward": 0.4703125134110451, "rewards/rank_answer_foramt_reward": 0.791015625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7866897433996201, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 427.046875, "epoch": 0.132, "grad_norm": 0.03883913904428482, "kl": 0.011602401733398438, "learning_rate": 1.5775727034222675e-05, "loss": 0.0062, "reward": 1.370899885892868, "reward_std": 0.38270220160484314, "rewards/mrr_reward": 0.506225224584341, "rewards/rank_answer_foramt_reward": 0.859375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7842886447906494, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 428.375, "epoch": 0.1328, "grad_norm": 0.03487444296479225, "kl": 0.00785064697265625, "learning_rate": 1.572432125594591e-05, "loss": -0.0483, "reward": 1.5634401440620422, "reward_std": 0.2945178374648094, "rewards/mrr_reward": 0.6865327507257462, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7334667444229126, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 401.65625, "epoch": 0.1336, "grad_norm": 0.0383358933031559, "kl": 0.009000778198242188, "learning_rate": 1.567268949126757e-05, "loss": -0.0308, "reward": 1.2559186518192291, "reward_std": 0.3751576766371727, "rewards/mrr_reward": 0.42760416120290756, "rewards/rank_answer_foramt_reward": 0.8203125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7053561210632324, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 444.25, "epoch": 0.1344, "grad_norm": 0.0366198867559433, "kl": 0.008385658264160156, "learning_rate": 1.5620833778521306e-05, "loss": 0.015, "reward": 1.3973772525787354, "reward_std": 0.39591234177351, "rewards/mrr_reward": 0.531770870089531, "rewards/rank_answer_foramt_reward": 0.78125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8417995721101761, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 412.6875, "epoch": 0.1352, "grad_norm": 0.03712960332632065, "kl": 0.008477210998535156, "learning_rate": 1.556875616488188e-05, "loss": -0.008, "reward": 1.428687036037445, "reward_std": 0.3864752873778343, "rewards/mrr_reward": 0.5722842365503311, "rewards/rank_answer_foramt_reward": 0.857421875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7377379089593887, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 396.203125, "epoch": 0.136, "grad_norm": 0.03674308583140373, "kl": 0.012662887573242188, "learning_rate": 1.5516458706284306e-05, "loss": -0.045, "reward": 1.4177999794483185, "reward_std": 0.37393568456172943, "rewards/mrr_reward": 0.5833333283662796, "rewards/rank_answer_foramt_reward": 0.857421875, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.6947023347020149, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 435.71875, "epoch": 0.1368, "grad_norm": 0.03608371317386627, "kl": 0.007844924926757812, "learning_rate": 1.5463943467342694e-05, "loss": -0.009, "reward": 1.3503046333789825, "reward_std": 0.31038716807961464, "rewards/mrr_reward": 0.49075521528720856, "rewards/rank_answer_foramt_reward": 0.859375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7453198581933975, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 420.984375, "epoch": 0.1376, "grad_norm": 0.040419355034828186, "kl": 0.010079383850097656, "learning_rate": 1.541121252126876e-05, "loss": -0.0363, "reward": 1.244249552488327, "reward_std": 0.31654617190361023, "rewards/mrr_reward": 0.4160156324505806, "rewards/rank_answer_foramt_reward": 0.763671875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7461278587579727, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 433.375, "epoch": 0.1384, "grad_norm": 0.03765353932976723, "kl": 0.0077056884765625, "learning_rate": 1.5358267949789968e-05, "loss": 0.0198, "reward": 1.4578497409820557, "reward_std": 0.34172070026397705, "rewards/mrr_reward": 0.5898437574505806, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7260240614414215, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 401.8125, "epoch": 0.1392, "grad_norm": 0.041146595031023026, "kl": 0.010126113891601562, "learning_rate": 1.5305111843067343e-05, "loss": -0.0176, "reward": 1.3684653639793396, "reward_std": 0.29850760102272034, "rewards/mrr_reward": 0.5057725831866264, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7314079403877258, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 411.984375, "epoch": 0.14, "grad_norm": 0.03607525676488876, "kl": 0.00901031494140625, "learning_rate": 1.5251746299612959e-05, "loss": -0.0444, "reward": 1.1752477288246155, "reward_std": 0.3523693382740021, "rewards/mrr_reward": 0.36098091304302216, "rewards/rank_answer_foramt_reward": 0.814453125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.6764592081308365, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 411.75, "epoch": 0.1408, "grad_norm": 0.04322541132569313, "kl": 0.010807037353515625, "learning_rate": 1.5198173426207095e-05, "loss": -0.0348, "reward": 1.3782255053520203, "reward_std": 0.33319515362381935, "rewards/mrr_reward": 0.5212177634239197, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.6868368983268738, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 433.5625, "epoch": 0.1416, "grad_norm": 0.039602383971214294, "kl": 0.012025833129882812, "learning_rate": 1.5144395337815066e-05, "loss": -0.0754, "reward": 1.3582023978233337, "reward_std": 0.40261131525039673, "rewards/mrr_reward": 0.503838062286377, "rewards/rank_answer_foramt_reward": 0.859375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7296076565980911, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 423.125, "epoch": 0.1424, "grad_norm": 0.041363585740327835, "kl": 0.009540557861328125, "learning_rate": 1.5090414157503715e-05, "loss": -0.0197, "reward": 1.323040246963501, "reward_std": 0.3850451856851578, "rewards/mrr_reward": 0.48712798207998276, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.6834579259157181, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 408.5, "epoch": 0.1432, "grad_norm": 0.03604589402675629, "kl": 0.009143829345703125, "learning_rate": 1.503623201635761e-05, "loss": -0.0201, "reward": 1.4883578717708588, "reward_std": 0.38600361347198486, "rewards/mrr_reward": 0.6411458402872086, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.6844964772462845, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 417.96875, "epoch": 0.144, "grad_norm": 0.03885720670223236, "kl": 0.00914764404296875, "learning_rate": 1.498185105339491e-05, "loss": -0.0262, "reward": 1.3653124272823334, "reward_std": 0.4237579368054867, "rewards/mrr_reward": 0.5257812589406967, "rewards/rank_answer_foramt_reward": 0.857421875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.702236957848072, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 412.140625, "epoch": 0.1448, "grad_norm": 0.039864372462034225, "kl": 0.007981300354003906, "learning_rate": 1.4927273415482916e-05, "loss": 0.0164, "reward": 1.3653839826583862, "reward_std": 0.39134908467531204, "rewards/mrr_reward": 0.5079861059784889, "rewards/rank_answer_foramt_reward": 0.822265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7759094536304474, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 403.46875, "epoch": 0.1456, "grad_norm": 0.04062679037451744, "kl": 0.009542465209960938, "learning_rate": 1.4872501257253325e-05, "loss": -0.0132, "reward": 1.255301147699356, "reward_std": 0.38088975846767426, "rewards/mrr_reward": 0.4399925619363785, "rewards/rank_answer_foramt_reward": 0.794921875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.691335141658783, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 399.890625, "epoch": 0.1464, "grad_norm": 0.04299888014793396, "kl": 0.011081695556640625, "learning_rate": 1.4817536741017153e-05, "loss": -0.0543, "reward": 1.4246177673339844, "reward_std": 0.296498604118824, "rewards/mrr_reward": 0.582961305975914, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.6559426188468933, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 429.40625, "epoch": 0.1472, "grad_norm": 0.05667012929916382, "kl": 0.02362060546875, "learning_rate": 1.4762382036679393e-05, "loss": -0.0457, "reward": 1.3302158117294312, "reward_std": 0.46241550147533417, "rewards/mrr_reward": 0.50021081417799, "rewards/rank_answer_foramt_reward": 0.7578125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7651665955781937, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 437.65625, "epoch": 0.148, "grad_norm": 0.03618616238236427, "kl": 0.00848388671875, "learning_rate": 1.470703932165333e-05, "loss": 0.0097, "reward": 1.4576621353626251, "reward_std": 0.32078187353909016, "rewards/mrr_reward": 0.5706349164247513, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7973360121250153, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 432.015625, "epoch": 0.1488, "grad_norm": 0.03927586227655411, "kl": 0.009456634521484375, "learning_rate": 1.4651510780774585e-05, "loss": -0.0331, "reward": 1.5841456949710846, "reward_std": 0.44154639542102814, "rewards/mrr_reward": 0.703125, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.781087726354599, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 440.8125, "epoch": 0.1496, "grad_norm": 0.037521783262491226, "kl": 0.009569168090820312, "learning_rate": 1.4595798606214882e-05, "loss": 0.0342, "reward": 1.3032934665679932, "reward_std": 0.37545448541641235, "rewards/mrr_reward": 0.4639943093061447, "rewards/rank_answer_foramt_reward": 0.798828125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7679399847984314, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 411.4375, "epoch": 0.1504, "grad_norm": 0.039995163679122925, "kl": 0.00890350341796875, "learning_rate": 1.4539904997395468e-05, "loss": -0.0577, "reward": 1.563814789056778, "reward_std": 0.28440105356276035, "rewards/mrr_reward": 0.6919271051883698, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7182557433843613, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 443.9375, "epoch": 0.1512, "grad_norm": 0.04071548208594322, "kl": 0.008481979370117188, "learning_rate": 1.4483832160900326e-05, "loss": -0.0276, "reward": 1.3721172213554382, "reward_std": 0.33370155096054077, "rewards/mrr_reward": 0.5183965861797333, "rewards/rank_answer_foramt_reward": 0.845703125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7491414994001389, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 412.984375, "epoch": 0.152, "grad_norm": 0.041358765214681625, "kl": 0.008540153503417969, "learning_rate": 1.442758231038902e-05, "loss": -0.0272, "reward": 1.6237071752548218, "reward_std": 0.3772216849029064, "rewards/mrr_reward": 0.7747395932674408, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.6566131561994553, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 430.203125, "epoch": 0.1528, "grad_norm": 0.033840637654066086, "kl": 0.009374618530273438, "learning_rate": 1.437115766650933e-05, "loss": 0.0117, "reward": 1.2770065069198608, "reward_std": 0.2644694857299328, "rewards/mrr_reward": 0.4101128578186035, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7792940139770508, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 422.0625, "epoch": 0.1536, "grad_norm": 0.038305025547742844, "kl": 0.009326934814453125, "learning_rate": 1.4314560456809592e-05, "loss": -0.0217, "reward": 1.2699792385101318, "reward_std": 0.3280069828033447, "rewards/mrr_reward": 0.422588050365448, "rewards/rank_answer_foramt_reward": 0.8046875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7631644010543823, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 426.390625, "epoch": 0.1544, "grad_norm": 0.03981156274676323, "kl": 0.00913238525390625, "learning_rate": 1.4257792915650728e-05, "loss": -0.047, "reward": 1.3255890011787415, "reward_std": 0.3319142311811447, "rewards/mrr_reward": 0.49487847834825516, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.714570015668869, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 420.25, "epoch": 0.1552, "grad_norm": 0.03656027093529701, "kl": 0.009430885314941406, "learning_rate": 1.4200857284118067e-05, "loss": -0.0332, "reward": 1.4800786972045898, "reward_std": 0.26632819697260857, "rewards/mrr_reward": 0.5991257503628731, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7340072840452194, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 406.0625, "epoch": 0.156, "grad_norm": 0.04333413392305374, "kl": 0.0097503662109375, "learning_rate": 1.4143755809932843e-05, "loss": -0.0022, "reward": 1.4679460525512695, "reward_std": 0.4186020493507385, "rewards/mrr_reward": 0.6158854365348816, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7128610759973526, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 422.78125, "epoch": 0.1568, "grad_norm": 0.03645530715584755, "kl": 0.009119987487792969, "learning_rate": 1.4086490747363492e-05, "loss": -0.0124, "reward": 1.3291675448417664, "reward_std": 0.34505781903862953, "rewards/mrr_reward": 0.47128596901893616, "rewards/rank_answer_foramt_reward": 0.818359375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8047191351652145, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 412.9375, "epoch": 0.1576, "grad_norm": 0.04249008744955063, "kl": 0.010274887084960938, "learning_rate": 1.4029064357136628e-05, "loss": 0.0055, "reward": 1.4444103240966797, "reward_std": 0.4831986427307129, "rewards/mrr_reward": 0.6312500089406967, "rewards/rank_answer_foramt_reward": 0.822265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.6418563276529312, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 411.859375, "epoch": 0.1584, "grad_norm": 0.038807179778814316, "kl": 0.01303863525390625, "learning_rate": 1.3971478906347806e-05, "loss": 0.0025, "reward": 1.4374900162220001, "reward_std": 0.39226941764354706, "rewards/mrr_reward": 0.5747395902872086, "rewards/rank_answer_foramt_reward": 0.84375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7706450521945953, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 434.796875, "epoch": 0.1592, "grad_norm": 0.03673255071043968, "kl": 0.008836746215820312, "learning_rate": 1.3913736668372027e-05, "loss": -0.0084, "reward": 1.6338341534137726, "reward_std": 0.3351624459028244, "rewards/mrr_reward": 0.767447903752327, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.6820531934499741, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 433.0, "epoch": 0.16, "grad_norm": 0.03517894819378853, "kl": 0.010756492614746094, "learning_rate": 1.3855839922773968e-05, "loss": -0.0186, "reward": 1.489585041999817, "reward_std": 0.35270126909017563, "rewards/mrr_reward": 0.6339409798383713, "rewards/rank_answer_foramt_reward": 0.83203125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7764543294906616, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 447.625, "epoch": 0.1608, "grad_norm": 0.039749667048454285, "kl": 0.009137153625488281, "learning_rate": 1.3797790955218014e-05, "loss": -0.0303, "reward": 1.57523912191391, "reward_std": 0.3249845299869776, "rewards/mrr_reward": 0.6855654865503311, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7721523940563202, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 427.75, "epoch": 0.1616, "grad_norm": 0.03552306815981865, "kl": 0.008211135864257812, "learning_rate": 1.3739592057378005e-05, "loss": -0.0296, "reward": 1.6716056764125824, "reward_std": 0.32526458986103535, "rewards/mrr_reward": 0.7610677182674408, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8431902080774307, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 432.125, "epoch": 0.1624, "grad_norm": 0.037600353360176086, "kl": 0.008890151977539062, "learning_rate": 1.3681245526846782e-05, "loss": -0.0236, "reward": 1.3627182841300964, "reward_std": 0.2870614193379879, "rewards/mrr_reward": 0.486111119389534, "rewards/rank_answer_foramt_reward": 0.84765625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8087291121482849, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 423.375, "epoch": 0.1632, "grad_norm": 0.04189027473330498, "kl": 0.01094818115234375, "learning_rate": 1.3622753667045459e-05, "loss": -0.0358, "reward": 1.410872757434845, "reward_std": 0.3953894004225731, "rewards/mrr_reward": 0.5746279805898666, "rewards/rank_answer_foramt_reward": 0.845703125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.703996941447258, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 431.59375, "epoch": 0.164, "grad_norm": 0.04231363534927368, "kl": 0.010824203491210938, "learning_rate": 1.3564118787132507e-05, "loss": 0.0009, "reward": 1.3419412970542908, "reward_std": 0.34718624502420425, "rewards/mrr_reward": 0.49730904027819633, "rewards/rank_answer_foramt_reward": 0.818359375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7411323189735413, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 407.578125, "epoch": 0.1648, "grad_norm": 0.04113239422440529, "kl": 0.01050567626953125, "learning_rate": 1.350534320191259e-05, "loss": -0.0414, "reward": 1.3435261249542236, "reward_std": 0.3302622064948082, "rewards/mrr_reward": 0.48489584773778915, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.715191051363945, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 440.46875, "epoch": 0.1656, "grad_norm": 0.03465278074145317, "kl": 0.008320808410644531, "learning_rate": 1.344642923174517e-05, "loss": -0.0085, "reward": 1.4250126481056213, "reward_std": 0.3028215132653713, "rewards/mrr_reward": 0.5315724313259125, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8245819211006165, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 428.15625, "epoch": 0.1664, "grad_norm": 0.040414392948150635, "kl": 0.011005401611328125, "learning_rate": 1.3387379202452917e-05, "loss": -0.0043, "reward": 1.2858222126960754, "reward_std": 0.3545723408460617, "rewards/mrr_reward": 0.4399677626788616, "rewards/rank_answer_foramt_reward": 0.8203125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.758507713675499, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 423.015625, "epoch": 0.1672, "grad_norm": 0.03951350972056389, "kl": 0.010406494140625, "learning_rate": 1.3328195445229869e-05, "loss": -0.0136, "reward": 1.5512892603874207, "reward_std": 0.23556600511074066, "rewards/mrr_reward": 0.6481771022081375, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.773812785744667, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 436.03125, "epoch": 0.168, "grad_norm": 0.03597855195403099, "kl": 0.012434005737304688, "learning_rate": 1.3268880296549424e-05, "loss": -0.0475, "reward": 1.5305464267730713, "reward_std": 0.3849369138479233, "rewards/mrr_reward": 0.6821614503860474, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.7075821757316589, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 419.484375, "epoch": 0.1688, "grad_norm": 0.04128649830818176, "kl": 0.009695053100585938, "learning_rate": 1.3209436098072095e-05, "loss": -0.0153, "reward": 1.5931483805179596, "reward_std": 0.24104237789288163, "rewards/mrr_reward": 0.6974144279956818, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7768451869487762, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 434.125, "epoch": 0.1696, "grad_norm": 0.03705138713121414, "kl": 0.008800506591796875, "learning_rate": 1.3149865196553049e-05, "loss": 0.0028, "reward": 1.374686360359192, "reward_std": 0.2328730747103691, "rewards/mrr_reward": 0.4906249921768904, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7883486449718475, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 420.34375, "epoch": 0.1704, "grad_norm": 0.03573732078075409, "kl": 0.008937835693359375, "learning_rate": 1.3090169943749475e-05, "loss": -0.0051, "reward": 1.4394998252391815, "reward_std": 0.22585050389170647, "rewards/mrr_reward": 0.524367555975914, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.814143717288971, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 409.578125, "epoch": 0.1712, "grad_norm": 0.038891687989234924, "kl": 0.010797500610351562, "learning_rate": 1.3030352696327741e-05, "loss": -0.0244, "reward": 1.3804857730865479, "reward_std": 0.35337623208761215, "rewards/mrr_reward": 0.5573722943663597, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.6602989211678505, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 445.34375, "epoch": 0.172, "grad_norm": 0.03773610666394234, "kl": 0.007359504699707031, "learning_rate": 1.297041581577035e-05, "loss": -0.0117, "reward": 1.4981496632099152, "reward_std": 0.23435491137206554, "rewards/mrr_reward": 0.6250000149011612, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7513765692710876, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 436.203125, "epoch": 0.1728, "grad_norm": 0.039261989295482635, "kl": 0.01084136962890625, "learning_rate": 1.2910361668282718e-05, "loss": -0.039, "reward": 1.5580702126026154, "reward_std": 0.28226844780147076, "rewards/mrr_reward": 0.6630394235253334, "rewards/rank_answer_foramt_reward": 0.94140625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7708081007003784, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 456.375, "epoch": 0.1736, "grad_norm": 0.05010940507054329, "kl": 0.008663177490234375, "learning_rate": 1.2850192624699762e-05, "loss": -0.0192, "reward": 1.4300898909568787, "reward_std": 0.4626375511288643, "rewards/mrr_reward": 0.5627356097102165, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7943617403507233, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 437.546875, "epoch": 0.1744, "grad_norm": 0.03629588708281517, "kl": 0.010377883911132812, "learning_rate": 1.2789911060392295e-05, "loss": 0.0197, "reward": 1.5270054042339325, "reward_std": 0.2459590807557106, "rewards/mrr_reward": 0.6557477712631226, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.6909556984901428, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 429.796875, "epoch": 0.1752, "grad_norm": 0.04659593850374222, "kl": 0.00952911376953125, "learning_rate": 1.2729519355173254e-05, "loss": -0.0118, "reward": 1.3348519802093506, "reward_std": 0.398675125092268, "rewards/mrr_reward": 0.5026785731315613, "rewards/rank_answer_foramt_reward": 0.806640625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7229092568159103, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 389.921875, "epoch": 0.176, "grad_norm": 0.043882764875888824, "kl": 0.011045455932617188, "learning_rate": 1.2669019893203758e-05, "loss": -0.0301, "reward": 1.497531145811081, "reward_std": 0.27045511081814766, "rewards/mrr_reward": 0.6290550753474236, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.6942455917596817, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 411.84375, "epoch": 0.1768, "grad_norm": 0.03796388581395149, "kl": 0.010257720947265625, "learning_rate": 1.2608415062898971e-05, "loss": -0.0241, "reward": 1.4202575087547302, "reward_std": 0.3259681724011898, "rewards/mrr_reward": 0.5604166686534882, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.6915156245231628, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 453.1875, "epoch": 0.1776, "grad_norm": 0.040519829839468, "kl": 0.009119033813476562, "learning_rate": 1.2547707256833823e-05, "loss": -0.0025, "reward": 1.271849274635315, "reward_std": 0.2725183069705963, "rewards/mrr_reward": 0.39179687947034836, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7879189848899841, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 424.34375, "epoch": 0.1784, "grad_norm": 0.041910942643880844, "kl": 0.008281707763671875, "learning_rate": 1.2486898871648552e-05, "loss": -0.013, "reward": 1.3902939558029175, "reward_std": 0.3279624804854393, "rewards/mrr_reward": 0.520114079117775, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7462835609912872, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 437.484375, "epoch": 0.1792, "grad_norm": 0.03706391900777817, "kl": 0.008459091186523438, "learning_rate": 1.2425992307954075e-05, "loss": 0.0018, "reward": 1.4472178220748901, "reward_std": 0.3175524137914181, "rewards/mrr_reward": 0.5600880607962608, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8113187104463577, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 439.234375, "epoch": 0.18, "grad_norm": 0.03729787841439247, "kl": 0.009768486022949219, "learning_rate": 1.236498997023725e-05, "loss": -0.0125, "reward": 1.4809614419937134, "reward_std": 0.340122077614069, "rewards/mrr_reward": 0.6090277582406998, "rewards/rank_answer_foramt_reward": 0.900390625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7496449500322342, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 420.015625, "epoch": 0.1808, "grad_norm": 0.04090157151222229, "kl": 0.009912490844726562, "learning_rate": 1.2303894266765908e-05, "loss": 0.0045, "reward": 1.4084090292453766, "reward_std": 0.24517753347754478, "rewards/mrr_reward": 0.5429625511169434, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.731939822435379, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 449.5, "epoch": 0.1816, "grad_norm": 0.040676042437553406, "kl": 0.00809478759765625, "learning_rate": 1.2242707609493814e-05, "loss": -0.0272, "reward": 1.5291298627853394, "reward_std": 0.3408605456352234, "rewards/mrr_reward": 0.6497396007180214, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.760521873831749, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 412.40625, "epoch": 0.1824, "grad_norm": 0.04246947541832924, "kl": 0.009426116943359375, "learning_rate": 1.2181432413965428e-05, "loss": 0.0024, "reward": 1.4244692921638489, "reward_std": 0.2590717747807503, "rewards/mrr_reward": 0.5588541701436043, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7129197269678116, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 424.375, "epoch": 0.1832, "grad_norm": 0.04285717383027077, "kl": 0.008266448974609375, "learning_rate": 1.212007109922055e-05, "loss": -0.0374, "reward": 1.4698918163776398, "reward_std": 0.33165768533945084, "rewards/mrr_reward": 0.59691222012043, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7508614361286163, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 399.59375, "epoch": 0.184, "grad_norm": 0.041689760982990265, "kl": 0.01070404052734375, "learning_rate": 1.2058626087698814e-05, "loss": 0.0003, "reward": 1.43734011054039, "reward_std": 0.44176214933395386, "rewards/mrr_reward": 0.6163132339715958, "rewards/rank_answer_foramt_reward": 0.8046875, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.706710159778595, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 462.328125, "epoch": 0.1848, "grad_norm": 0.03540234640240669, "kl": 0.007287025451660156, "learning_rate": 1.1997099805144071e-05, "loss": 0.0284, "reward": 1.5855993926525116, "reward_std": 0.3010600432753563, "rewards/mrr_reward": 0.6768229231238365, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7948835492134094, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 426.171875, "epoch": 0.1856, "grad_norm": 0.03808577358722687, "kl": 0.009876251220703125, "learning_rate": 1.1935494680508606e-05, "loss": -0.0193, "reward": 1.5138767063617706, "reward_std": 0.46021201461553574, "rewards/mrr_reward": 0.67578125, "rewards/rank_answer_foramt_reward": 0.818359375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7369488328695297, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 409.125, "epoch": 0.1864, "grad_norm": 0.03571336343884468, "kl": 0.010625839233398438, "learning_rate": 1.187381314585725e-05, "loss": -0.0271, "reward": 1.2613760828971863, "reward_std": 0.2764936424791813, "rewards/mrr_reward": 0.43248388171195984, "rewards/rank_answer_foramt_reward": 0.84765625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.6641382277011871, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 428.34375, "epoch": 0.1872, "grad_norm": 0.03995465487241745, "kl": 0.008295059204101562, "learning_rate": 1.1812057636271374e-05, "loss": -0.0114, "reward": 1.4478269517421722, "reward_std": 0.3669227175414562, "rewards/mrr_reward": 0.5865017399191856, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7272636741399765, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 436.84375, "epoch": 0.188, "grad_norm": 0.04256037250161171, "kl": 0.00879669189453125, "learning_rate": 1.1750230589752763e-05, "loss": -0.0233, "reward": 1.298891007900238, "reward_std": 0.3018713817000389, "rewards/mrr_reward": 0.4497953839600086, "rewards/rank_answer_foramt_reward": 0.814453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7741887420415878, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 434.671875, "epoch": 0.1888, "grad_norm": 0.039598457515239716, "kl": 0.009709358215332031, "learning_rate": 1.1688334447127338e-05, "loss": -0.0192, "reward": 1.6007757186889648, "reward_std": 0.4135289564728737, "rewards/mrr_reward": 0.744140625, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.7384417653083801, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 427.78125, "epoch": 0.1896, "grad_norm": 0.036554981023073196, "kl": 0.009065628051757812, "learning_rate": 1.1626371651948839e-05, "loss": 0.0093, "reward": 1.5135816633701324, "reward_std": 0.2605090048164129, "rewards/mrr_reward": 0.640625, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7722761482000351, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 423.78125, "epoch": 0.1904, "grad_norm": 0.04034959152340889, "kl": 0.009461402893066406, "learning_rate": 1.156434465040231e-05, "loss": -0.0206, "reward": 1.374279260635376, "reward_std": 0.3329411558806896, "rewards/mrr_reward": 0.5505580306053162, "rewards/rank_answer_foramt_reward": 0.806640625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.705109030008316, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 415.9375, "epoch": 0.1912, "grad_norm": 0.0426294319331646, "kl": 0.009059906005859375, "learning_rate": 1.1502255891207572e-05, "loss": 0.0192, "reward": 1.5010081231594086, "reward_std": 0.37140223383903503, "rewards/mrr_reward": 0.6446304619312286, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.6985992938280106, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 433.609375, "epoch": 0.192, "grad_norm": 0.03961321711540222, "kl": 0.0083465576171875, "learning_rate": 1.1440107825522522e-05, "loss": -0.0221, "reward": 1.4463289082050323, "reward_std": 0.2782457806169987, "rewards/mrr_reward": 0.5504092425107956, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8067047744989395, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 411.828125, "epoch": 0.1928, "grad_norm": 0.04309277981519699, "kl": 0.0095672607421875, "learning_rate": 1.137790290684638e-05, "loss": -0.024, "reward": 1.3144385814666748, "reward_std": 0.26049618795514107, "rewards/mrr_reward": 0.4519035294651985, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.719211108982563, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 426.59375, "epoch": 0.1936, "grad_norm": 0.04275864362716675, "kl": 0.009810447692871094, "learning_rate": 1.1315643590922827e-05, "loss": -0.0326, "reward": 1.597327709197998, "reward_std": 0.22876879945397377, "rewards/mrr_reward": 0.7057291716337204, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.731110468506813, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 439.71875, "epoch": 0.1944, "grad_norm": 0.03837655857205391, "kl": 0.011631011962890625, "learning_rate": 1.1253332335643043e-05, "loss": -0.0353, "reward": 1.4636397659778595, "reward_std": 0.2928088903427124, "rewards/mrr_reward": 0.5801215246319771, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7417809367179871, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 439.1875, "epoch": 0.1952, "grad_norm": 0.039478182792663574, "kl": 0.013881683349609375, "learning_rate": 1.11909716009487e-05, "loss": -0.0426, "reward": 1.2862295508384705, "reward_std": 0.2817831374704838, "rewards/mrr_reward": 0.43656374514102936, "rewards/rank_answer_foramt_reward": 0.818359375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7641979157924652, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 447.90625, "epoch": 0.196, "grad_norm": 0.04093409702181816, "kl": 0.008157730102539062, "learning_rate": 1.1128563848734817e-05, "loss": -0.0431, "reward": 1.3213240504264832, "reward_std": 0.3792931139469147, "rewards/mrr_reward": 0.4646267518401146, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7776929587125778, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 421.609375, "epoch": 0.1968, "grad_norm": 0.038951508700847626, "kl": 0.0119476318359375, "learning_rate": 1.10661115427526e-05, "loss": -0.0258, "reward": 1.4929006099700928, "reward_std": 0.19486585073173046, "rewards/mrr_reward": 0.5957589447498322, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.781110942363739, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 449.84375, "epoch": 0.1976, "grad_norm": 0.040840230882167816, "kl": 0.008116722106933594, "learning_rate": 1.1003617148512149e-05, "loss": 0.0099, "reward": 1.4765380024909973, "reward_std": 0.30231093615293503, "rewards/mrr_reward": 0.5662698447704315, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8423726558685303, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 431.390625, "epoch": 0.1984, "grad_norm": 0.03562074154615402, "kl": 0.010061264038085938, "learning_rate": 1.0941083133185146e-05, "loss": -0.0065, "reward": 1.5500171780586243, "reward_std": 0.37310022860765457, "rewards/mrr_reward": 0.6682291775941849, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7931784391403198, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 456.421875, "epoch": 0.1992, "grad_norm": 0.040016964077949524, "kl": 0.007778167724609375, "learning_rate": 1.0878511965507435e-05, "loss": -0.0046, "reward": 1.4200845062732697, "reward_std": 0.2748546898365021, "rewards/mrr_reward": 0.5281250104308128, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7712667435407639, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 431.953125, "epoch": 0.2, "grad_norm": 0.0408429279923439, "kl": 0.008358001708984375, "learning_rate": 1.0815906115681579e-05, "loss": -0.0003, "reward": 1.4024586379528046, "reward_std": 0.45451923459768295, "rewards/mrr_reward": 0.5480654612183571, "rewards/rank_answer_foramt_reward": 0.8203125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.792195051908493, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 389.390625, "epoch": 0.2008, "grad_norm": 0.044342394918203354, "kl": 0.011260986328125, "learning_rate": 1.0753268055279328e-05, "loss": -0.0243, "reward": 1.4417364001274109, "reward_std": 0.32018742710351944, "rewards/mrr_reward": 0.599479153752327, "rewards/rank_answer_foramt_reward": 0.849609375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7026848942041397, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 412.015625, "epoch": 0.2016, "grad_norm": 0.0681619718670845, "kl": 0.021749496459960938, "learning_rate": 1.0690600257144062e-05, "loss": -0.0105, "reward": 1.3221397995948792, "reward_std": 0.3863202631473541, "rewards/mrr_reward": 0.4698536768555641, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.719403862953186, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 470.546875, "epoch": 0.2024, "grad_norm": 0.0338444858789444, "kl": 0.009431838989257812, "learning_rate": 1.0627905195293135e-05, "loss": -0.0055, "reward": 1.4945510029792786, "reward_std": 0.2610199525952339, "rewards/mrr_reward": 0.5938120037317276, "rewards/rank_answer_foramt_reward": 0.927734375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8095900267362595, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 418.109375, "epoch": 0.2032, "grad_norm": 0.04314475134015083, "kl": 0.008597373962402344, "learning_rate": 1.0565185344820248e-05, "loss": -0.0178, "reward": 1.4714539349079132, "reward_std": 0.224628996104002, "rewards/mrr_reward": 0.5916666761040688, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7226623743772507, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 408.21875, "epoch": 0.204, "grad_norm": 0.04039366543292999, "kl": 0.01071929931640625, "learning_rate": 1.0502443181797696e-05, "loss": -0.0199, "reward": 1.255561202764511, "reward_std": 0.34654828906059265, "rewards/mrr_reward": 0.43823785334825516, "rewards/rank_answer_foramt_reward": 0.8203125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.6564248204231262, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 433.984375, "epoch": 0.2048, "grad_norm": 0.04161591827869415, "kl": 0.008603096008300781, "learning_rate": 1.043968118317865e-05, "loss": 0.0094, "reward": 1.4394341111183167, "reward_std": 0.32009443640708923, "rewards/mrr_reward": 0.5737165212631226, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7288551479578018, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 444.40625, "epoch": 0.2056, "grad_norm": 0.038173969835042953, "kl": 0.010286331176757812, "learning_rate": 1.0376901826699349e-05, "loss": -0.0254, "reward": 1.445276916027069, "reward_std": 0.3517743721604347, "rewards/mrr_reward": 0.6002604365348816, "rewards/rank_answer_foramt_reward": 0.841796875, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7422964721918106, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 436.171875, "epoch": 0.2064, "grad_norm": 0.03894796594977379, "kl": 0.010990142822265625, "learning_rate": 1.0314107590781284e-05, "loss": -0.0134, "reward": 1.4257104396820068, "reward_std": 0.23646708950400352, "rewards/mrr_reward": 0.5429191589355469, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7512968927621841, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 405.875, "epoch": 0.2072, "grad_norm": 0.0407368466258049, "kl": 0.0094451904296875, "learning_rate": 1.0251300954433377e-05, "loss": -0.0281, "reward": 1.5280113816261292, "reward_std": 0.3434094376862049, "rewards/mrr_reward": 0.6421006992459297, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7744212746620178, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 453.203125, "epoch": 0.208, "grad_norm": 0.03855285048484802, "kl": 0.008718490600585938, "learning_rate": 1.0188484397154083e-05, "loss": 0.0004, "reward": 1.5177774131298065, "reward_std": 0.3898182809352875, "rewards/mrr_reward": 0.6282552182674408, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8205215632915497, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 409.46875, "epoch": 0.2088, "grad_norm": 0.0410795584321022, "kl": 0.01009368896484375, "learning_rate": 1.0125660398833528e-05, "loss": -0.0357, "reward": 1.5540205836296082, "reward_std": 0.2195252738893032, "rewards/mrr_reward": 0.6762276813387871, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.743962749838829, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 418.125, "epoch": 0.2096, "grad_norm": 0.03606094419956207, "kl": 0.011699676513671875, "learning_rate": 1.0062831439655591e-05, "loss": -0.0107, "reward": 1.5303250849246979, "reward_std": 0.22126532718539238, "rewards/mrr_reward": 0.6565104424953461, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.67721988260746, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 445.84375, "epoch": 0.2104, "grad_norm": 0.03741401433944702, "kl": 0.008730888366699219, "learning_rate": 1e-05, "loss": -0.0197, "reward": 1.541961818933487, "reward_std": 0.27997246757149696, "rewards/mrr_reward": 0.6421006917953491, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7893517762422562, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 438.796875, "epoch": 0.2112, "grad_norm": 0.03900214284658432, "kl": 0.008241653442382812, "learning_rate": 9.937168560344412e-06, "loss": 0.0077, "reward": 1.4794524312019348, "reward_std": 0.35047149658203125, "rewards/mrr_reward": 0.5804191678762436, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7868431955575943, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 437.171875, "epoch": 0.212, "grad_norm": 0.038283322006464005, "kl": 0.00952911376953125, "learning_rate": 9.874339601166474e-06, "loss": 0.0138, "reward": 1.506893515586853, "reward_std": 0.2615400552749634, "rewards/mrr_reward": 0.6185701861977577, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7739198356866837, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 437.453125, "epoch": 0.2128, "grad_norm": 0.03715907782316208, "kl": 0.009618759155273438, "learning_rate": 9.81151560284592e-06, "loss": -0.016, "reward": 1.5035657286643982, "reward_std": 0.3300909399986267, "rewards/mrr_reward": 0.6235367059707642, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7507388442754745, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 424.390625, "epoch": 0.2136, "grad_norm": 0.041197191923856735, "kl": 0.010580062866210938, "learning_rate": 9.748699045566626e-06, "loss": -0.0366, "reward": 1.576979547739029, "reward_std": 0.439083069562912, "rewards/mrr_reward": 0.7342447936534882, "rewards/rank_answer_foramt_reward": 0.884765625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.6689759790897369, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 442.453125, "epoch": 0.2144, "grad_norm": 0.041197191923856735, "kl": 0.009737014770507812, "learning_rate": 9.748699045566626e-06, "loss": 0.001, "reward": 1.4734593629837036, "reward_std": 0.26162197068333626, "rewards/mrr_reward": 0.5788566395640373, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7656047195196152, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 424.875, "epoch": 0.2152, "grad_norm": 0.0444633774459362, "kl": 0.008083343505859375, "learning_rate": 9.685892409218718e-06, "loss": -0.0102, "reward": 1.4165308475494385, "reward_std": 0.26300579868257046, "rewards/mrr_reward": 0.5317460373044014, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7495253682136536, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 426.75, "epoch": 0.216, "grad_norm": 0.0361800491809845, "kl": 0.009550094604492188, "learning_rate": 9.623098173300655e-06, "loss": -0.0378, "reward": 1.4879783987998962, "reward_std": 0.3136373609304428, "rewards/mrr_reward": 0.6265625134110451, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7080072462558746, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 468.15625, "epoch": 0.2168, "grad_norm": 0.04007337614893913, "kl": 0.010549545288085938, "learning_rate": 9.560318816821354e-06, "loss": -0.0364, "reward": 1.2709717452526093, "reward_std": 0.36226093024015427, "rewards/mrr_reward": 0.4365575388073921, "rewards/rank_answer_foramt_reward": 0.83203125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7043090015649796, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 430.484375, "epoch": 0.2176, "grad_norm": 0.039333194494247437, "kl": 0.009454727172851562, "learning_rate": 9.497556818202306e-06, "loss": -0.0502, "reward": 1.3762290477752686, "reward_std": 0.3271942213177681, "rewards/mrr_reward": 0.49231771379709244, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7742221802473068, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 404.953125, "epoch": 0.2184, "grad_norm": 0.04287045821547508, "kl": 0.014556884765625, "learning_rate": 9.434814655179756e-06, "loss": -0.0263, "reward": 1.5546920597553253, "reward_std": 0.38028147257864475, "rewards/mrr_reward": 0.7030567973852158, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.6939939856529236, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 410.21875, "epoch": 0.2192, "grad_norm": 0.039541661739349365, "kl": 0.010419845581054688, "learning_rate": 9.372094804706867e-06, "loss": -0.0369, "reward": 1.7257789969444275, "reward_std": 0.29538945853710175, "rewards/mrr_reward": 0.8723958432674408, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.6289780139923096, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 415.21875, "epoch": 0.22, "grad_norm": 0.04606436938047409, "kl": 0.014463424682617188, "learning_rate": 9.309399742855943e-06, "loss": 0.0001, "reward": 1.4452278017997742, "reward_std": 0.3192713689059019, "rewards/mrr_reward": 0.576227679848671, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.75247423350811, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 417.078125, "epoch": 0.2208, "grad_norm": 0.037436868995428085, "kl": 0.0102691650390625, "learning_rate": 9.246731944720675e-06, "loss": -0.0397, "reward": 1.184128776192665, "reward_std": 0.2542712949216366, "rewards/mrr_reward": 0.3415798544883728, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.6723189651966095, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 431.421875, "epoch": 0.2216, "grad_norm": 0.04319589212536812, "kl": 0.010812759399414062, "learning_rate": 9.184093884318426e-06, "loss": -0.0151, "reward": 1.4168269038200378, "reward_std": 0.38048839569091797, "rewards/mrr_reward": 0.5684895813465118, "rewards/rank_answer_foramt_reward": 0.849609375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7211094051599503, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 442.390625, "epoch": 0.2224, "grad_norm": 0.03658146783709526, "kl": 0.008172988891601562, "learning_rate": 9.121488034492569e-06, "loss": -0.006, "reward": 1.4866581559181213, "reward_std": 0.3485083729028702, "rewards/mrr_reward": 0.6006448417901993, "rewards/rank_answer_foramt_reward": 0.884765625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8001230210065842, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 452.71875, "epoch": 0.2232, "grad_norm": 0.036550384014844894, "kl": 0.009181976318359375, "learning_rate": 9.058916866814857e-06, "loss": -0.0197, "reward": 1.2319900691509247, "reward_std": 0.2513876333832741, "rewards/mrr_reward": 0.3517051041126251, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7925300300121307, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 430.859375, "epoch": 0.224, "grad_norm": 0.03881368413567543, "kl": 0.009093284606933594, "learning_rate": 8.996382851487851e-06, "loss": -0.0384, "reward": 1.3488417267799377, "reward_std": 0.32528745383024216, "rewards/mrr_reward": 0.4834573529660702, "rewards/rank_answer_foramt_reward": 0.859375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7630017250776291, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 446.9375, "epoch": 0.2248, "grad_norm": 0.04073338583111763, "kl": 0.010313034057617188, "learning_rate": 8.933888457247402e-06, "loss": -0.0459, "reward": 1.402299851179123, "reward_std": 0.40668466687202454, "rewards/mrr_reward": 0.5348090305924416, "rewards/rank_answer_foramt_reward": 0.859375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7693850100040436, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 448.1875, "epoch": 0.2256, "grad_norm": 0.043222904205322266, "kl": 0.0100250244140625, "learning_rate": 8.871436151265183e-06, "loss": 0.0008, "reward": 1.3178912699222565, "reward_std": 0.3209885358810425, "rewards/mrr_reward": 0.4478670582175255, "rewards/rank_answer_foramt_reward": 0.84765625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7887807190418243, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 465.234375, "epoch": 0.2264, "grad_norm": 0.036993607878685, "kl": 0.008585929870605469, "learning_rate": 8.809028399051302e-06, "loss": 0.0047, "reward": 1.469320148229599, "reward_std": 0.3071388304233551, "rewards/mrr_reward": 0.5837797746062279, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.8065022975206375, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 426.703125, "epoch": 0.2272, "grad_norm": 0.0379331074655056, "kl": 0.010618209838867188, "learning_rate": 8.746667664356957e-06, "loss": 0.0039, "reward": 1.2527941763401031, "reward_std": 0.23029084131121635, "rewards/mrr_reward": 0.3807787857949734, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7889550924301147, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 449.546875, "epoch": 0.228, "grad_norm": 0.03945260867476463, "kl": 0.010408401489257812, "learning_rate": 8.684356409077177e-06, "loss": 0.0084, "reward": 1.5159841179847717, "reward_std": 0.34652554243803024, "rewards/mrr_reward": 0.6295944899320602, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7895446866750717, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 431.234375, "epoch": 0.2288, "grad_norm": 0.03930336609482765, "kl": 0.008619308471679688, "learning_rate": 8.62209709315362e-06, "loss": -0.0207, "reward": 1.3798122704029083, "reward_std": 0.29061378724873066, "rewards/mrr_reward": 0.5122581720352173, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7656702399253845, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 420.40625, "epoch": 0.2296, "grad_norm": 0.04059287905693054, "kl": 0.011449813842773438, "learning_rate": 8.559892174477478e-06, "loss": -0.0303, "reward": 1.4114105701446533, "reward_std": 0.39132068306207657, "rewards/mrr_reward": 0.583333358168602, "rewards/rank_answer_foramt_reward": 0.8203125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.689012199640274, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 429.828125, "epoch": 0.2304, "grad_norm": 0.03984786570072174, "kl": 0.010134696960449219, "learning_rate": 8.49774410879243e-06, "loss": -0.0409, "reward": 1.2429711520671844, "reward_std": 0.28321645595133305, "rewards/mrr_reward": 0.3541666753590107, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7831906080245972, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 417.078125, "epoch": 0.2312, "grad_norm": 0.0415896400809288, "kl": 0.010349273681640625, "learning_rate": 8.43565534959769e-06, "loss": -0.042, "reward": 1.3772334456443787, "reward_std": 0.26923793181777, "rewards/mrr_reward": 0.5224826335906982, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.646794430911541, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 454.359375, "epoch": 0.232, "grad_norm": 0.03732220083475113, "kl": 0.008699417114257812, "learning_rate": 8.373628348051165e-06, "loss": -0.0092, "reward": 1.5791191756725311, "reward_std": 0.315543457865715, "rewards/mrr_reward": 0.6815476268529892, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7472574412822723, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 440.890625, "epoch": 0.2328, "grad_norm": 0.04088292643427849, "kl": 0.010980606079101562, "learning_rate": 8.311665552872662e-06, "loss": -0.0234, "reward": 1.4555590748786926, "reward_std": 0.2937028855085373, "rewards/mrr_reward": 0.583984375, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7524634450674057, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 420.625, "epoch": 0.2336, "grad_norm": 0.044596508145332336, "kl": 0.012334823608398438, "learning_rate": 8.249769410247239e-06, "loss": -0.04, "reward": 1.431397259235382, "reward_std": 0.36401835083961487, "rewards/mrr_reward": 0.580512136220932, "rewards/rank_answer_foramt_reward": 0.845703125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7561738044023514, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 439.03125, "epoch": 0.2344, "grad_norm": 0.04020597040653229, "kl": 0.00943756103515625, "learning_rate": 8.187942363728626e-06, "loss": 0.0116, "reward": 1.4588199257850647, "reward_std": 0.235812745988369, "rewards/mrr_reward": 0.5665984451770782, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7388575822114944, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 450.640625, "epoch": 0.2352, "grad_norm": 0.042804013937711716, "kl": 0.009305953979492188, "learning_rate": 8.126186854142752e-06, "loss": 0.0242, "reward": 1.3589671552181244, "reward_std": 0.3509839344769716, "rewards/mrr_reward": 0.4847656264901161, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7955798357725143, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 466.828125, "epoch": 0.236, "grad_norm": 0.03595392778515816, "kl": 0.008063316345214844, "learning_rate": 8.064505319491398e-06, "loss": -0.0051, "reward": 1.3917770087718964, "reward_std": 0.23030859045684338, "rewards/mrr_reward": 0.49776167050004005, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7911685407161713, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 444.515625, "epoch": 0.2368, "grad_norm": 0.042226508259773254, "kl": 0.015703201293945312, "learning_rate": 8.00290019485593e-06, "loss": -0.0313, "reward": 1.4788413643836975, "reward_std": 0.32476067543029785, "rewards/mrr_reward": 0.6268229335546494, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.699061393737793, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 443.03125, "epoch": 0.2376, "grad_norm": 0.03763006627559662, "kl": 0.008462905883789062, "learning_rate": 7.94137391230119e-06, "loss": -0.0188, "reward": 1.582606554031372, "reward_std": 0.20448793843388557, "rewards/mrr_reward": 0.6720920205116272, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7806191295385361, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 424.796875, "epoch": 0.2384, "grad_norm": 0.03964044153690338, "kl": 0.009276390075683594, "learning_rate": 7.879928900779457e-06, "loss": 0.0007, "reward": 1.4767478704452515, "reward_std": 0.3392485938966274, "rewards/mrr_reward": 0.6051401421427727, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7760008126497269, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 447.5, "epoch": 0.2392, "grad_norm": 0.03888827934861183, "kl": 0.010454177856445312, "learning_rate": 7.818567586034578e-06, "loss": -0.0233, "reward": 1.5537761747837067, "reward_std": 0.1316620425786823, "rewards/mrr_reward": 0.6217633932828903, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8379528671503067, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 417.625, "epoch": 0.24, "grad_norm": 0.038070328533649445, "kl": 0.009571075439453125, "learning_rate": 7.757292390506191e-06, "loss": -0.0591, "reward": 1.7321368753910065, "reward_std": 0.1761476807296276, "rewards/mrr_reward": 0.841796875, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7136247903108597, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 419.46875, "epoch": 0.2408, "grad_norm": 0.0418258011341095, "kl": 0.010776519775390625, "learning_rate": 7.696105733234099e-06, "loss": -0.0165, "reward": 1.5259293019771576, "reward_std": 0.29256412014365196, "rewards/mrr_reward": 0.6532242149114609, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.6875295341014862, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 442.09375, "epoch": 0.2416, "grad_norm": 0.04463861510157585, "kl": 0.008836746215820312, "learning_rate": 7.635010029762755e-06, "loss": -0.0098, "reward": 1.4686961770057678, "reward_std": 0.3223220370709896, "rewards/mrr_reward": 0.606919676065445, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7012877017259598, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 427.75, "epoch": 0.2424, "grad_norm": 0.04330219700932503, "kl": 0.011468887329101562, "learning_rate": 7.574007692045928e-06, "loss": 0.0058, "reward": 1.4482861161231995, "reward_std": 0.38629114255309105, "rewards/mrr_reward": 0.5864769443869591, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.7541209012269974, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 434.171875, "epoch": 0.2432, "grad_norm": 0.03801389038562775, "kl": 0.011196136474609375, "learning_rate": 7.513101128351454e-06, "loss": -0.0036, "reward": 1.5316137671470642, "reward_std": 0.33731117844581604, "rewards/mrr_reward": 0.672395870089531, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.658377930521965, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 445.75, "epoch": 0.244, "grad_norm": 0.03998512774705887, "kl": 0.008950233459472656, "learning_rate": 7.4522927431661805e-06, "loss": -0.0292, "reward": 1.5211172103881836, "reward_std": 0.23774503916502, "rewards/mrr_reward": 0.6258432418107986, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8027948141098022, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 431.40625, "epoch": 0.2448, "grad_norm": 0.043149009346961975, "kl": 0.011903762817382812, "learning_rate": 7.391584937101034e-06, "loss": -0.0233, "reward": 1.3914374113082886, "reward_std": 0.23153280839323997, "rewards/mrr_reward": 0.5059523731470108, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7399283051490784, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 426.65625, "epoch": 0.2456, "grad_norm": 0.04259219393134117, "kl": 0.009889602661132812, "learning_rate": 7.330980106796247e-06, "loss": -0.0619, "reward": 1.380819708108902, "reward_std": 0.23608434945344925, "rewards/mrr_reward": 0.5217137709259987, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7088199555873871, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 454.53125, "epoch": 0.2464, "grad_norm": 0.0958850160241127, "kl": 0.02011871337890625, "learning_rate": 7.27048064482675e-06, "loss": -0.0235, "reward": 1.3718312680721283, "reward_std": 0.2723774276673794, "rewards/mrr_reward": 0.48489585518836975, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7872923165559769, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 402.28125, "epoch": 0.2472, "grad_norm": 0.0465591698884964, "kl": 0.016143798828125, "learning_rate": 7.210088939607709e-06, "loss": -0.0312, "reward": 1.231211543083191, "reward_std": 0.32072607427835464, "rewards/mrr_reward": 0.3999132066965103, "rewards/rank_answer_foramt_reward": 0.849609375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.6929138600826263, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 436.984375, "epoch": 0.248, "grad_norm": 0.038580480962991714, "kl": 0.011264801025390625, "learning_rate": 7.149807375300239e-06, "loss": 0.0033, "reward": 1.326162338256836, "reward_std": 0.33862701058387756, "rewards/mrr_reward": 0.47708334028720856, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7624196261167526, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 464.703125, "epoch": 0.2488, "grad_norm": 0.037029314786195755, "kl": 0.008462905883789062, "learning_rate": 7.0896383317172845e-06, "loss": -0.0385, "reward": 1.2545623183250427, "reward_std": 0.31125089153647423, "rewards/mrr_reward": 0.3746279776096344, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8129519671201706, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 433.453125, "epoch": 0.2496, "grad_norm": 0.041599493473768234, "kl": 0.010118484497070312, "learning_rate": 7.029584184229653e-06, "loss": 0.026, "reward": 1.4846043288707733, "reward_std": 0.36305932328104973, "rewards/mrr_reward": 0.5936383754014969, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8327091336250305, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 436.1875, "epoch": 0.2504, "grad_norm": 0.0423130989074707, "kl": 0.010608673095703125, "learning_rate": 6.969647303672262e-06, "loss": -0.0047, "reward": 1.3655261397361755, "reward_std": 0.3140504229813814, "rewards/mrr_reward": 0.48500124737620354, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.791303962469101, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 396.921875, "epoch": 0.2512, "grad_norm": 0.06699636578559875, "kl": 0.0169677734375, "learning_rate": 6.909830056250527e-06, "loss": -0.0292, "reward": 1.3997429311275482, "reward_std": 0.3030847944319248, "rewards/mrr_reward": 0.5588541775941849, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.686819538474083, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 439.65625, "epoch": 0.252, "grad_norm": 0.03891773149371147, "kl": 0.011255264282226562, "learning_rate": 6.850134803446955e-06, "loss": -0.0275, "reward": 1.4285947978496552, "reward_std": 0.2862687110900879, "rewards/mrr_reward": 0.5552455335855484, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.812528446316719, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 451.328125, "epoch": 0.2528, "grad_norm": 0.04100382700562477, "kl": 0.009349822998046875, "learning_rate": 6.790563901927907e-06, "loss": -0.0514, "reward": 1.50279700756073, "reward_std": 0.32215361297130585, "rewards/mrr_reward": 0.6073288694024086, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7760395705699921, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 437.90625, "epoch": 0.2536, "grad_norm": 0.0384552925825119, "kl": 0.011285781860351562, "learning_rate": 6.731119703450577e-06, "loss": -0.0035, "reward": 1.3444324433803558, "reward_std": 0.31130388006567955, "rewards/mrr_reward": 0.47606024146080017, "rewards/rank_answer_foramt_reward": 0.8359375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7954932898283005, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 458.578125, "epoch": 0.2544, "grad_norm": 0.03822173550724983, "kl": 0.007521629333496094, "learning_rate": 6.671804554770135e-06, "loss": -0.0184, "reward": 1.4845271408557892, "reward_std": 0.24785233289003372, "rewards/mrr_reward": 0.576884925365448, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8051183372735977, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 453.484375, "epoch": 0.2552, "grad_norm": 0.03790445998311043, "kl": 0.012842178344726562, "learning_rate": 6.612620797547087e-06, "loss": -0.0038, "reward": 1.250670075416565, "reward_std": 0.29811690375208855, "rewards/mrr_reward": 0.35522693768143654, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8482294976711273, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 393.734375, "epoch": 0.256, "grad_norm": 0.04126419499516487, "kl": 0.009257316589355469, "learning_rate": 6.553570768254831e-06, "loss": -0.017, "reward": 1.694937378168106, "reward_std": 0.23029077798128128, "rewards/mrr_reward": 0.8385850638151169, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.6379755735397339, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 443.015625, "epoch": 0.2568, "grad_norm": 0.04230582341551781, "kl": 0.014929771423339844, "learning_rate": 6.494656798087412e-06, "loss": -0.0211, "reward": 1.477415293455124, "reward_std": 0.2716873064637184, "rewards/mrr_reward": 0.6104352548718452, "rewards/rank_answer_foramt_reward": 0.94140625, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7092431783676147, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 434.03125, "epoch": 0.2576, "grad_norm": 0.036968860775232315, "kl": 0.008441925048828125, "learning_rate": 6.435881212867494e-06, "loss": -0.0312, "reward": 1.4142729341983795, "reward_std": 0.31983111053705215, "rewards/mrr_reward": 0.5367559641599655, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.797814130783081, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 423.375, "epoch": 0.2584, "grad_norm": 0.0385778546333313, "kl": 0.010784149169921875, "learning_rate": 6.377246332954544e-06, "loss": -0.025, "reward": 1.481526404619217, "reward_std": 0.3511744774878025, "rewards/mrr_reward": 0.5972842276096344, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8084279000759125, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 415.625, "epoch": 0.2592, "grad_norm": 0.04187595844268799, "kl": 0.01099395751953125, "learning_rate": 6.318754473153221e-06, "loss": -0.0355, "reward": 1.4383732378482819, "reward_std": 0.298159871250391, "rewards/mrr_reward": 0.5604166761040688, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7425056099891663, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 433.359375, "epoch": 0.26, "grad_norm": 0.04161127656698227, "kl": 0.010236740112304688, "learning_rate": 6.260407942621998e-06, "loss": 0.0278, "reward": 1.4653730988502502, "reward_std": 0.28907064720988274, "rewards/mrr_reward": 0.5929687470197678, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7608369886875153, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 429.5, "epoch": 0.2608, "grad_norm": 0.044767290353775024, "kl": 0.010900497436523438, "learning_rate": 6.202209044781991e-06, "loss": -0.0244, "reward": 1.445456624031067, "reward_std": 0.2606505677103996, "rewards/mrr_reward": 0.5714161843061447, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7404041886329651, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 425.4375, "epoch": 0.2616, "grad_norm": 0.042263783514499664, "kl": 0.010478019714355469, "learning_rate": 6.144160077226035e-06, "loss": -0.0364, "reward": 1.4664171934127808, "reward_std": 0.37280726805329323, "rewards/mrr_reward": 0.5984560996294022, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7766695767641068, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 437.25, "epoch": 0.2624, "grad_norm": 0.0396074615418911, "kl": 0.009977340698242188, "learning_rate": 6.086263331627976e-06, "loss": -0.0303, "reward": 1.3097558319568634, "reward_std": 0.3439793810248375, "rewards/mrr_reward": 0.46181795187294483, "rewards/rank_answer_foramt_reward": 0.81640625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7687272727489471, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 411.78125, "epoch": 0.2632, "grad_norm": 0.040687814354896545, "kl": 0.008890151977539062, "learning_rate": 6.028521093652195e-06, "loss": -0.0267, "reward": 1.4824483394622803, "reward_std": 0.35101721435785294, "rewards/mrr_reward": 0.6202008873224258, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7222457826137543, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 444.96875, "epoch": 0.264, "grad_norm": 0.03991749510169029, "kl": 0.009294509887695312, "learning_rate": 5.970935642863375e-06, "loss": -0.0049, "reward": 1.4789340198040009, "reward_std": 0.26942019537091255, "rewards/mrr_reward": 0.5792286917567253, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7673952430486679, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 411.171875, "epoch": 0.2648, "grad_norm": 0.04425426945090294, "kl": 0.009923934936523438, "learning_rate": 5.913509252636511e-06, "loss": -0.0369, "reward": 1.4969634413719177, "reward_std": 0.26327501237392426, "rewards/mrr_reward": 0.6369791775941849, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.6626534163951874, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 423.5625, "epoch": 0.2656, "grad_norm": 0.04324210807681084, "kl": 0.0114898681640625, "learning_rate": 5.85624419006716e-06, "loss": -0.0185, "reward": 1.4559223651885986, "reward_std": 0.3514735624194145, "rewards/mrr_reward": 0.5601996332406998, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8568893522024155, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 408.640625, "epoch": 0.2664, "grad_norm": 0.04046209529042244, "kl": 0.012134552001953125, "learning_rate": 5.799142715881938e-06, "loss": -0.0164, "reward": 1.4584909677505493, "reward_std": 0.25963789224624634, "rewards/mrr_reward": 0.610590286552906, "rewards/rank_answer_foramt_reward": 0.927734375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.6572864204645157, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 428.265625, "epoch": 0.2672, "grad_norm": 0.03997616469860077, "kl": 0.011236190795898438, "learning_rate": 5.742207084349274e-06, "loss": -0.0195, "reward": 1.5415248572826385, "reward_std": 0.33238353580236435, "rewards/mrr_reward": 0.6676215380430222, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7810041755437851, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 401.546875, "epoch": 0.268, "grad_norm": 0.04179609194397926, "kl": 0.009923934936523438, "learning_rate": 5.685439543190409e-06, "loss": -0.0119, "reward": 1.8288907408714294, "reward_std": 0.2181766740977764, "rewards/mrr_reward": 0.9140625, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8210345953702927, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 446.015625, "epoch": 0.2688, "grad_norm": 0.04074598103761673, "kl": 0.008538246154785156, "learning_rate": 5.628842333490674e-06, "loss": 0.0237, "reward": 1.5049967169761658, "reward_std": 0.26735008880496025, "rewards/mrr_reward": 0.5956349298357964, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8181416988372803, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 456.046875, "epoch": 0.2696, "grad_norm": 0.038923174142837524, "kl": 0.0103607177734375, "learning_rate": 5.572417689610987e-06, "loss": -0.0079, "reward": 1.4763097763061523, "reward_std": 0.22926313430070877, "rewards/mrr_reward": 0.5806609615683556, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7687745988368988, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 417.65625, "epoch": 0.2704, "grad_norm": 0.040242068469524384, "kl": 0.009759902954101562, "learning_rate": 5.516167839099679e-06, "loss": -0.0346, "reward": 1.583184838294983, "reward_std": 0.36991823837161064, "rewards/mrr_reward": 0.6937499940395355, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7772882282733917, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 395.375, "epoch": 0.2712, "grad_norm": 0.04075354337692261, "kl": 0.012285232543945312, "learning_rate": 5.460095002604533e-06, "loss": -0.0154, "reward": 1.6092648804187775, "reward_std": 0.32450181245803833, "rewards/mrr_reward": 0.7244791835546494, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7729655653238297, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 418.390625, "epoch": 0.272, "grad_norm": 0.045717377215623856, "kl": 0.010404586791992188, "learning_rate": 5.404201393785123e-06, "loss": -0.0222, "reward": 1.3845095038414001, "reward_std": 0.49984729290008545, "rewards/mrr_reward": 0.5492187440395355, "rewards/rank_answer_foramt_reward": 0.78125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7577463984489441, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 445.515625, "epoch": 0.2728, "grad_norm": 0.03839201480150223, "kl": 0.01171875, "learning_rate": 5.348489219225417e-06, "loss": -0.0114, "reward": 1.4330370724201202, "reward_std": 0.3987097330391407, "rewards/mrr_reward": 0.5739769265055656, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.7203998863697052, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 432.703125, "epoch": 0.2736, "grad_norm": 0.04262446239590645, "kl": 0.010091781616210938, "learning_rate": 5.292960678346674e-06, "loss": -0.0048, "reward": 1.3840036988258362, "reward_std": 0.26258981972932816, "rewards/mrr_reward": 0.5252604261040688, "rewards/rank_answer_foramt_reward": 0.79296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8249083608388901, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 428.125, "epoch": 0.2744, "grad_norm": 0.03931298851966858, "kl": 0.009639739990234375, "learning_rate": 5.237617963320608e-06, "loss": -0.012, "reward": 1.4826070964336395, "reward_std": 0.24309994652867317, "rewards/mrr_reward": 0.5967881828546524, "rewards/rank_answer_foramt_reward": 0.92578125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7663308084011078, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 418.5625, "epoch": 0.2752, "grad_norm": 0.04591841250658035, "kl": 0.01171875, "learning_rate": 5.1824632589828465e-06, "loss": -0.0147, "reward": 1.4441474378108978, "reward_std": 0.4614497572183609, "rewards/mrr_reward": 0.5976562574505806, "rewards/rank_answer_foramt_reward": 0.83203125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7409058511257172, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 444.4375, "epoch": 0.276, "grad_norm": 0.04385710135102272, "kl": 0.00975799560546875, "learning_rate": 5.127498742746675e-06, "loss": 0.0096, "reward": 1.4345026016235352, "reward_std": 0.3426055870950222, "rewards/mrr_reward": 0.5657985955476761, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7691548317670822, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 408.859375, "epoch": 0.2768, "grad_norm": 0.04883267357945442, "kl": 0.010219573974609375, "learning_rate": 5.072726584517086e-06, "loss": 0.001, "reward": 1.6219241619110107, "reward_std": 0.311983335763216, "rewards/mrr_reward": 0.7708333432674408, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.6571878343820572, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 423.03125, "epoch": 0.2776, "grad_norm": 0.038777440786361694, "kl": 0.009169578552246094, "learning_rate": 5.018148946605092e-06, "loss": -0.0351, "reward": 1.5609507262706757, "reward_std": 0.23676074855029583, "rewards/mrr_reward": 0.6690104454755783, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7594898641109467, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 432.890625, "epoch": 0.2784, "grad_norm": 0.041409607976675034, "kl": 0.011716842651367188, "learning_rate": 4.9637679836423926e-06, "loss": -0.0058, "reward": 1.518358290195465, "reward_std": 0.26389508321881294, "rewards/mrr_reward": 0.6400359719991684, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7318951040506363, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 452.96875, "epoch": 0.2792, "grad_norm": 0.039285335689783096, "kl": 0.008753776550292969, "learning_rate": 4.909585842496287e-06, "loss": 0.0012, "reward": 1.3429620265960693, "reward_std": 0.3432212918996811, "rewards/mrr_reward": 0.45104166865348816, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8082574605941772, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 445.765625, "epoch": 0.28, "grad_norm": 0.04197699576616287, "kl": 0.00982666015625, "learning_rate": 4.855604662184935e-06, "loss": -0.0316, "reward": 1.3227859288454056, "reward_std": 0.27039322815835476, "rewards/mrr_reward": 0.459523793309927, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7761019766330719, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 406.890625, "epoch": 0.2808, "grad_norm": 0.04606630280613899, "kl": 0.01332855224609375, "learning_rate": 4.801826573792905e-06, "loss": -0.0354, "reward": 1.3492314517498016, "reward_std": 0.3611329570412636, "rewards/mrr_reward": 0.5093750059604645, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7344726175069809, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 452.640625, "epoch": 0.2816, "grad_norm": 0.037404146045446396, "kl": 0.009065628051757812, "learning_rate": 4.7482537003870425e-06, "loss": -0.0019, "reward": 1.7138548493385315, "reward_std": 0.37241312861442566, "rewards/mrr_reward": 0.8257812559604645, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7966005057096481, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 439.15625, "epoch": 0.2824, "grad_norm": 0.04106292501091957, "kl": 0.01189422607421875, "learning_rate": 4.694888156932657e-06, "loss": 0.0021, "reward": 1.4714123904705048, "reward_std": 0.3619183078408241, "rewards/mrr_reward": 0.6462239772081375, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.6353365629911423, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 425.3125, "epoch": 0.2832, "grad_norm": 0.04472840949892998, "kl": 0.01025390625, "learning_rate": 4.641732050210032e-06, "loss": -0.0384, "reward": 1.3738009333610535, "reward_std": 0.34098855778574944, "rewards/mrr_reward": 0.5247395932674408, "rewards/rank_answer_foramt_reward": 0.822265625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7662725001573563, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 432.25, "epoch": 0.284, "grad_norm": 0.0422356016933918, "kl": 0.007762908935546875, "learning_rate": 4.588787478731242e-06, "loss": -0.0179, "reward": 1.5886476337909698, "reward_std": 0.3262007385492325, "rewards/mrr_reward": 0.7180555313825607, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7553452551364899, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 416.265625, "epoch": 0.2848, "grad_norm": 0.045158080756664276, "kl": 0.014837265014648438, "learning_rate": 4.53605653265731e-06, "loss": -0.0355, "reward": 1.6843328177928925, "reward_std": 0.30529190599918365, "rewards/mrr_reward": 0.8046875149011612, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7378572374582291, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 435.46875, "epoch": 0.2856, "grad_norm": 0.04521140828728676, "kl": 0.009514808654785156, "learning_rate": 4.483541293715699e-06, "loss": -0.0368, "reward": 1.5743017494678497, "reward_std": 0.2875717096030712, "rewards/mrr_reward": 0.6670758798718452, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8331535160541534, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 445.421875, "epoch": 0.2864, "grad_norm": 0.04222414642572403, "kl": 0.008882522583007812, "learning_rate": 4.4312438351181246e-06, "loss": -0.0041, "reward": 1.5822243094444275, "reward_std": 0.3601609170436859, "rewards/mrr_reward": 0.6888020932674408, "rewards/rank_answer_foramt_reward": 0.8984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8167148977518082, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 444.109375, "epoch": 0.2872, "grad_norm": 0.042955782264471054, "kl": 0.008832931518554688, "learning_rate": 4.379166221478697e-06, "loss": -0.0191, "reward": 1.5433863401412964, "reward_std": 0.3501916863024235, "rewards/mrr_reward": 0.639533743262291, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8229316174983978, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 404.6875, "epoch": 0.288, "grad_norm": 0.040022190660238266, "kl": 0.009357452392578125, "learning_rate": 4.3273105087324375e-06, "loss": -0.0363, "reward": 1.546007513999939, "reward_std": 0.2631131783127785, "rewards/mrr_reward": 0.6498883962631226, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.750668540596962, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 477.328125, "epoch": 0.2888, "grad_norm": 0.03617052733898163, "kl": 0.008795738220214844, "learning_rate": 4.275678744054094e-06, "loss": 0.0144, "reward": 1.3946971893310547, "reward_std": 0.24523303098976612, "rewards/mrr_reward": 0.4952690973877907, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8173362910747528, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 421.203125, "epoch": 0.2896, "grad_norm": 0.04181591421365738, "kl": 0.009317398071289062, "learning_rate": 4.224272965777326e-06, "loss": -0.0191, "reward": 1.5229368805885315, "reward_std": 0.19505535019561648, "rewards/mrr_reward": 0.6376736015081406, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.793943926692009, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 412.0625, "epoch": 0.2904, "grad_norm": 0.040446627885103226, "kl": 0.009857177734375, "learning_rate": 4.173095203314241e-06, "loss": -0.0547, "reward": 1.463654488325119, "reward_std": 0.2678440436720848, "rewards/mrr_reward": 0.6329861134290695, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.6831922978162766, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 450.4375, "epoch": 0.2912, "grad_norm": 0.03971518948674202, "kl": 0.009275436401367188, "learning_rate": 4.12214747707527e-06, "loss": -0.0236, "reward": 1.4997781217098236, "reward_std": 0.24033548682928085, "rewards/mrr_reward": 0.6165364682674408, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8034428507089615, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 431.046875, "epoch": 0.292, "grad_norm": 0.04116431623697281, "kl": 0.0095062255859375, "learning_rate": 4.071431798389408e-06, "loss": -0.0171, "reward": 1.57802614569664, "reward_std": 0.25258472189307213, "rewards/mrr_reward": 0.6707217246294022, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7923757880926132, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 461.984375, "epoch": 0.2928, "grad_norm": 0.04119415953755379, "kl": 0.008808135986328125, "learning_rate": 4.020950169424815e-06, "loss": 0.0292, "reward": 1.5060677528381348, "reward_std": 0.3413529172539711, "rewards/mrr_reward": 0.6168836876749992, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8351219594478607, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 441.796875, "epoch": 0.2936, "grad_norm": 0.03977242112159729, "kl": 0.010467529296875, "learning_rate": 3.970704583109755e-06, "loss": -0.0172, "reward": 1.4002881050109863, "reward_std": 0.30990614742040634, "rewards/mrr_reward": 0.4957217499613762, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8465787619352341, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 436.875, "epoch": 0.2944, "grad_norm": 0.042127519845962524, "kl": 0.010568618774414062, "learning_rate": 3.920697023053949e-06, "loss": -0.0185, "reward": 1.4711587131023407, "reward_std": 0.2235279567539692, "rewards/mrr_reward": 0.5671316906809807, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7687725126743317, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 451.140625, "epoch": 0.2952, "grad_norm": 0.037665415555238724, "kl": 0.0077457427978515625, "learning_rate": 3.8709294634702374e-06, "loss": 0.009, "reward": 1.6012870371341705, "reward_std": 0.2355819009244442, "rewards/mrr_reward": 0.69921875, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8038526326417923, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 423.578125, "epoch": 0.296, "grad_norm": 0.045537903904914856, "kl": 0.012182235717773438, "learning_rate": 3.821403869096658e-06, "loss": -0.027, "reward": 1.413929671049118, "reward_std": 0.2634736839681864, "rewards/mrr_reward": 0.5472656264901161, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7180513441562653, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 440.890625, "epoch": 0.2968, "grad_norm": 0.041050802916288376, "kl": 0.008855819702148438, "learning_rate": 3.772122195118877e-06, "loss": -0.0169, "reward": 1.451367437839508, "reward_std": 0.3203510493040085, "rewards/mrr_reward": 0.580543152987957, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7287050783634186, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 434.421875, "epoch": 0.2976, "grad_norm": 0.04175710305571556, "kl": 0.009654998779296875, "learning_rate": 3.723086387092997e-06, "loss": -0.0023, "reward": 1.476247251033783, "reward_std": 0.40557974576950073, "rewards/mrr_reward": 0.5853484943509102, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8110213726758957, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 445.28125, "epoch": 0.2984, "grad_norm": 0.044037140905857086, "kl": 0.009381294250488281, "learning_rate": 3.674298380868756e-06, "loss": 0.0037, "reward": 1.3548841178417206, "reward_std": 0.3138626515865326, "rewards/mrr_reward": 0.46049726754426956, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8196381330490112, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 462.34375, "epoch": 0.2992, "grad_norm": 0.04055827111005783, "kl": 0.007966995239257812, "learning_rate": 3.625760102513103e-06, "loss": 0.0133, "reward": 1.4686636626720428, "reward_std": 0.3094808869063854, "rewards/mrr_reward": 0.5532738268375397, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8168772459030151, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 427.171875, "epoch": 0.3, "grad_norm": 0.04194509610533714, "kl": 0.009783744812011719, "learning_rate": 3.5774734682341563e-06, "loss": -0.0282, "reward": 1.4523612558841705, "reward_std": 0.36508404091000557, "rewards/mrr_reward": 0.5867187529802322, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7403464317321777, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 443.109375, "epoch": 0.3008, "grad_norm": 0.03853485733270645, "kl": 0.009164810180664062, "learning_rate": 3.5294403843055604e-06, "loss": 0.0198, "reward": 1.4639207124710083, "reward_std": 0.21186323463916779, "rewards/mrr_reward": 0.5635416880249977, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7733431160449982, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 456.765625, "epoch": 0.3016, "grad_norm": 0.04190933704376221, "kl": 0.008619308471679688, "learning_rate": 3.4816627469912147e-06, "loss": -0.0263, "reward": 1.4570399820804596, "reward_std": 0.3633367531001568, "rewards/mrr_reward": 0.5539248585700989, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8206967562437057, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 422.890625, "epoch": 0.3024, "grad_norm": 0.04122132807970047, "kl": 0.008838653564453125, "learning_rate": 3.4341424424704373e-06, "loss": -0.0555, "reward": 1.4766654968261719, "reward_std": 0.27532494999468327, "rewards/mrr_reward": 0.6087425425648689, "rewards/rank_answer_foramt_reward": 0.955078125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.6906161457300186, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 416.640625, "epoch": 0.3032, "grad_norm": 0.044697657227516174, "kl": 0.013956069946289062, "learning_rate": 3.3868813467634833e-06, "loss": -0.0113, "reward": 1.4251343607902527, "reward_std": 0.25692647136747837, "rewards/mrr_reward": 0.5694444477558136, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.6789370179176331, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 458.40625, "epoch": 0.304, "grad_norm": 0.03943547606468201, "kl": 0.008486747741699219, "learning_rate": 3.3398813256574847e-06, "loss": 0.0001, "reward": 1.6643346548080444, "reward_std": 0.26205284520983696, "rewards/mrr_reward": 0.7689236104488373, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7758665382862091, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 392.9375, "epoch": 0.3048, "grad_norm": 0.049449626356363297, "kl": 0.011739730834960938, "learning_rate": 3.2931442346328e-06, "loss": -0.0534, "reward": 1.339944213628769, "reward_std": 0.2773168385028839, "rewards/mrr_reward": 0.4835565537214279, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.6634734272956848, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 460.765625, "epoch": 0.3056, "grad_norm": 0.040942005813121796, "kl": 0.010448455810546875, "learning_rate": 3.2466719187897555e-06, "loss": -0.0245, "reward": 1.5223515927791595, "reward_std": 0.3074699230492115, "rewards/mrr_reward": 0.6432291939854622, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.7733821123838425, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 450.625, "epoch": 0.3064, "grad_norm": 0.038345228880643845, "kl": 0.010009765625, "learning_rate": 3.200466212775808e-06, "loss": -0.0195, "reward": 1.519100308418274, "reward_std": 0.24653562158346176, "rewards/mrr_reward": 0.6174231022596359, "rewards/rank_answer_foramt_reward": 0.94140625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8065735548734665, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 426.828125, "epoch": 0.3072, "grad_norm": 0.03888833522796631, "kl": 0.009685516357421875, "learning_rate": 3.1545289407131128e-06, "loss": -0.002, "reward": 1.4710081219673157, "reward_std": 0.38252247869968414, "rewards/mrr_reward": 0.5821180492639542, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7834498137235641, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 432.46875, "epoch": 0.308, "grad_norm": 0.038772933185100555, "kl": 0.009763717651367188, "learning_rate": 3.108861916126518e-06, "loss": -0.0166, "reward": 1.4420768022537231, "reward_std": 0.32166776806116104, "rewards/mrr_reward": 0.5522321313619614, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7668113112449646, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 431.96875, "epoch": 0.3088, "grad_norm": 0.04055608808994293, "kl": 0.009151458740234375, "learning_rate": 3.063466941871952e-06, "loss": -0.0134, "reward": 1.5163768231868744, "reward_std": 0.3534121476113796, "rewards/mrr_reward": 0.6155258119106293, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.798210859298706, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 438.859375, "epoch": 0.3096, "grad_norm": 0.04640579968690872, "kl": 0.008348464965820312, "learning_rate": 3.0183458100652752e-06, "loss": 0.025, "reward": 1.5191034972667694, "reward_std": 0.39003700762987137, "rewards/mrr_reward": 0.6491505652666092, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7709864974021912, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 425.609375, "epoch": 0.3104, "grad_norm": 0.03954498469829559, "kl": 0.010484695434570312, "learning_rate": 2.9735003020115095e-06, "loss": -0.0515, "reward": 1.6413687765598297, "reward_std": 0.28913578018546104, "rewards/mrr_reward": 0.76171875, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7339653223752975, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 453.8125, "epoch": 0.3112, "grad_norm": 0.03751927241683006, "kl": 0.010460853576660156, "learning_rate": 2.9289321881345257e-06, "loss": -0.0129, "reward": 1.5514837503433228, "reward_std": 0.15940535813570023, "rewards/mrr_reward": 0.6467633992433548, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7689204066991806, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 401.859375, "epoch": 0.312, "grad_norm": 0.04677467420697212, "kl": 0.013050079345703125, "learning_rate": 2.884643227907147e-06, "loss": 0.0117, "reward": 1.3619890213012695, "reward_std": 0.2909104973077774, "rewards/mrr_reward": 0.5390625, "rewards/rank_answer_foramt_reward": 0.787109375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7144197523593903, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 435.640625, "epoch": 0.3128, "grad_norm": 0.03747595474123955, "kl": 0.009700775146484375, "learning_rate": 2.840635169781688e-06, "loss": -0.0023, "reward": 1.5902496874332428, "reward_std": 0.3390812985599041, "rewards/mrr_reward": 0.7155134230852127, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7620441168546677, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 423.28125, "epoch": 0.3136, "grad_norm": 0.038146279752254486, "kl": 0.009531021118164062, "learning_rate": 2.796909751120931e-06, "loss": -0.0248, "reward": 1.51711967587471, "reward_std": 0.387925885617733, "rewards/mrr_reward": 0.6617807745933533, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7227952480316162, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 451.09375, "epoch": 0.3144, "grad_norm": 0.040637459605932236, "kl": 0.009733200073242188, "learning_rate": 2.7534686981295335e-06, "loss": 0.0159, "reward": 1.3827373087406158, "reward_std": 0.4360157921910286, "rewards/mrr_reward": 0.49673859030008316, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.821563258767128, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 444.4375, "epoch": 0.3152, "grad_norm": 0.04179443418979645, "kl": 0.009754180908203125, "learning_rate": 2.7103137257858867e-06, "loss": -0.0236, "reward": 1.4687740206718445, "reward_std": 0.3465902768075466, "rewards/mrr_reward": 0.5991691499948502, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.717197373509407, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 427.71875, "epoch": 0.316, "grad_norm": 0.046546537429094315, "kl": 0.012380599975585938, "learning_rate": 2.667446537774402e-06, "loss": -0.0005, "reward": 1.5630870461463928, "reward_std": 0.3838147297501564, "rewards/mrr_reward": 0.685695692896843, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7915740013122559, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 450.109375, "epoch": 0.3168, "grad_norm": 0.04081228747963905, "kl": 0.009305953979492188, "learning_rate": 2.624868826418262e-06, "loss": -0.0092, "reward": 1.4161024391651154, "reward_std": 0.3121535889804363, "rewards/mrr_reward": 0.5301339440047741, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.831237405538559, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 443.828125, "epoch": 0.3176, "grad_norm": 0.04065534844994545, "kl": 0.010629653930664062, "learning_rate": 2.5825822726126095e-06, "loss": 0.0077, "reward": 1.4853081405162811, "reward_std": 0.2746486961841583, "rewards/mrr_reward": 0.5950706824660301, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7699548155069351, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 447.984375, "epoch": 0.3184, "grad_norm": 0.04703785106539726, "kl": 0.01222991943359375, "learning_rate": 2.5405885457581793e-06, "loss": -0.0213, "reward": 1.2661742269992828, "reward_std": 0.30840878933668137, "rewards/mrr_reward": 0.3860801197588444, "rewards/rank_answer_foramt_reward": 0.91015625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7567955106496811, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 423.421875, "epoch": 0.3192, "grad_norm": 0.03758614510297775, "kl": 0.01016998291015625, "learning_rate": 2.4988893036954045e-06, "loss": -0.0215, "reward": 1.606367141008377, "reward_std": 0.12326683290302753, "rewards/mrr_reward": 0.6976996511220932, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.782834529876709, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 447.6875, "epoch": 0.32, "grad_norm": 0.040864165872335434, "kl": 0.009775161743164062, "learning_rate": 2.4574861926389615e-06, "loss": -0.0104, "reward": 1.3981690108776093, "reward_std": 0.30249515548348427, "rewards/mrr_reward": 0.5021701380610466, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.804991826415062, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 468.359375, "epoch": 0.3208, "grad_norm": 0.0384257435798645, "kl": 0.008899688720703125, "learning_rate": 2.4163808471127815e-06, "loss": -0.0165, "reward": 1.363112986087799, "reward_std": 0.38970305398106575, "rewards/mrr_reward": 0.4783792197704315, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.804058238863945, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 444.25, "epoch": 0.3216, "grad_norm": 0.040598995983600616, "kl": 0.008358001708984375, "learning_rate": 2.37557488988552e-06, "loss": 0.0419, "reward": 1.5591547191143036, "reward_std": 0.2982029393315315, "rewards/mrr_reward": 0.6777963787317276, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.7450015395879745, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 425.421875, "epoch": 0.3224, "grad_norm": 0.039519764482975006, "kl": 0.012409210205078125, "learning_rate": 2.335069931906503e-06, "loss": -0.0157, "reward": 1.4554656445980072, "reward_std": 0.30417120456695557, "rewards/mrr_reward": 0.569221243262291, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7715264558792114, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 455.390625, "epoch": 0.3232, "grad_norm": 0.04290325567126274, "kl": 0.010166168212890625, "learning_rate": 2.2948675722421086e-06, "loss": -0.0324, "reward": 1.323316365480423, "reward_std": 0.31126467883586884, "rewards/mrr_reward": 0.4303385391831398, "rewards/rank_answer_foramt_reward": 0.884765625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.821227639913559, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 457.96875, "epoch": 0.324, "grad_norm": 0.039676424115896225, "kl": 0.01012420654296875, "learning_rate": 2.254969398012663e-06, "loss": -0.0132, "reward": 1.3026447296142578, "reward_std": 0.30587131530046463, "rewards/mrr_reward": 0.41312623769044876, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8224635273218155, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 460.046875, "epoch": 0.3248, "grad_norm": 0.04161365330219269, "kl": 0.009571075439453125, "learning_rate": 2.215376984329767e-06, "loss": 0.0262, "reward": 1.6110160648822784, "reward_std": 0.355252580717206, "rewards/mrr_reward": 0.717968761920929, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8175320029258728, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 448.734375, "epoch": 0.3256, "grad_norm": 0.03859516605734825, "kl": 0.00945281982421875, "learning_rate": 2.1760918942341193e-06, "loss": -0.0282, "reward": 1.5105039477348328, "reward_std": 0.3397863022983074, "rewards/mrr_reward": 0.6239955350756645, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7977168411016464, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 399.21875, "epoch": 0.3264, "grad_norm": 0.044858209788799286, "kl": 0.014190673828125, "learning_rate": 2.1371156786338108e-06, "loss": -0.0274, "reward": 1.4385575950145721, "reward_std": 0.47953635454177856, "rewards/mrr_reward": 0.6119791567325592, "rewards/rank_answer_foramt_reward": 0.822265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.6903298795223236, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 458.65625, "epoch": 0.3272, "grad_norm": 0.03756219893693924, "kl": 0.010061264038085938, "learning_rate": 2.098449876243096e-06, "loss": -0.0082, "reward": 1.3541287928819656, "reward_std": 0.2427298128604889, "rewards/mrr_reward": 0.4596044272184372, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7946641892194748, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 461.28125, "epoch": 0.328, "grad_norm": 0.04040595516562462, "kl": 0.008250236511230469, "learning_rate": 2.0600960135216463e-06, "loss": -0.0227, "reward": 1.6990008354187012, "reward_std": 0.15870059095323086, "rewards/mrr_reward": 0.7838541716337204, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.786843404173851, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 426.90625, "epoch": 0.3288, "grad_norm": 0.04700813814997673, "kl": 0.0103759765625, "learning_rate": 2.022055604614289e-06, "loss": -0.0175, "reward": 1.3733271956443787, "reward_std": 0.27594383619725704, "rewards/mrr_reward": 0.4928571507334709, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7794190347194672, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 451.09375, "epoch": 0.3296, "grad_norm": 0.041563473641872406, "kl": 0.009532928466796875, "learning_rate": 1.984330151291233e-06, "loss": 0.0048, "reward": 1.6681818068027496, "reward_std": 0.31681402772665024, "rewards/mrr_reward": 0.76953125, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8091207593679428, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 445.84375, "epoch": 0.3304, "grad_norm": 0.03638165071606636, "kl": 0.009305953979492188, "learning_rate": 1.9469211428887813e-06, "loss": 0.0181, "reward": 1.3885623514652252, "reward_std": 0.34145616739988327, "rewards/mrr_reward": 0.501946933567524, "rewards/rank_answer_foramt_reward": 0.830078125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8644477128982544, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 457.625, "epoch": 0.3312, "grad_norm": 0.03859129920601845, "kl": 0.0097808837890625, "learning_rate": 1.9098300562505266e-06, "loss": 0.0172, "reward": 1.4659111499786377, "reward_std": 0.2733108922839165, "rewards/mrr_reward": 0.5613281279802322, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8388167470693588, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 431.28125, "epoch": 0.332, "grad_norm": 0.042557183653116226, "kl": 0.010099411010742188, "learning_rate": 1.8730583556690607e-06, "loss": 0.0313, "reward": 1.3624907732009888, "reward_std": 0.270840547978878, "rewards/mrr_reward": 0.4820932298898697, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7225586175918579, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 429.703125, "epoch": 0.3328, "grad_norm": 0.04032871872186661, "kl": 0.009693145751953125, "learning_rate": 1.8366074928281608e-06, "loss": -0.0172, "reward": 1.355335295200348, "reward_std": 0.16411831602454185, "rewards/mrr_reward": 0.4381696507334709, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7929615378379822, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 428.5, "epoch": 0.3336, "grad_norm": 0.03926324471831322, "kl": 0.009528160095214844, "learning_rate": 1.8004789067454763e-06, "loss": -0.0313, "reward": 1.3210996389389038, "reward_std": 0.31574152782559395, "rewards/mrr_reward": 0.4390067160129547, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7765243351459503, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 451.484375, "epoch": 0.3344, "grad_norm": 0.0452018603682518, "kl": 0.009485244750976562, "learning_rate": 1.7646740237157256e-06, "loss": 0.0053, "reward": 1.3444588482379913, "reward_std": 0.3464643657207489, "rewards/mrr_reward": 0.4705481231212616, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7946986109018326, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 424.1875, "epoch": 0.3352, "grad_norm": 0.04134754836559296, "kl": 0.011098861694335938, "learning_rate": 1.7291942572543806e-06, "loss": -0.0201, "reward": 1.5103633105754852, "reward_std": 0.26311646308749914, "rewards/mrr_reward": 0.6181113570928574, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7897311300039291, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 421.40625, "epoch": 0.336, "grad_norm": 0.04511724412441254, "kl": 0.012248992919921875, "learning_rate": 1.6940410080418723e-06, "loss": -0.0424, "reward": 1.3716591596603394, "reward_std": 0.3640734553337097, "rewards/mrr_reward": 0.490234375, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7823120504617691, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 419.796875, "epoch": 0.3368, "grad_norm": 0.04100240021944046, "kl": 0.010448455810546875, "learning_rate": 1.6592156638682887e-06, "loss": -0.0274, "reward": 1.425389677286148, "reward_std": 0.3395872414112091, "rewards/mrr_reward": 0.5344307869672775, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7682346403598785, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 449.046875, "epoch": 0.3376, "grad_norm": 0.03941246122121811, "kl": 0.009730339050292969, "learning_rate": 1.6247195995785836e-06, "loss": 0.0095, "reward": 1.5374208092689514, "reward_std": 0.165388286113739, "rewards/mrr_reward": 0.6133246421813965, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8491195142269135, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 450.0625, "epoch": 0.3384, "grad_norm": 0.036055538803339005, "kl": 0.00992584228515625, "learning_rate": 1.5905541770183096e-06, "loss": -0.0127, "reward": 1.5841495990753174, "reward_std": 0.2556448373943567, "rewards/mrr_reward": 0.6819444745779037, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7769235521554947, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 427.09375, "epoch": 0.3392, "grad_norm": 0.04632318392395973, "kl": 0.010486602783203125, "learning_rate": 1.5567207449798517e-06, "loss": 0.0142, "reward": 1.3678037822246552, "reward_std": 0.31665654107928276, "rewards/mrr_reward": 0.4849950596690178, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8138497620820999, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 443.328125, "epoch": 0.34, "grad_norm": 0.0403946228325367, "kl": 0.0091400146484375, "learning_rate": 1.52322063914917e-06, "loss": -0.0281, "reward": 1.3638777285814285, "reward_std": 0.3191062808036804, "rewards/mrr_reward": 0.527752973139286, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_overall_format_reward": 0.953125, "rewards/rank_think_format_reward": 0.7075394093990326, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 406.140625, "epoch": 0.3408, "grad_norm": 0.04744080826640129, "kl": 0.010467529296875, "learning_rate": 1.490055182053083e-06, "loss": 0.0045, "reward": 1.5458801984786987, "reward_std": 0.223420899361372, "rewards/mrr_reward": 0.6529947966337204, "rewards/rank_answer_foramt_reward": 0.955078125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7662600725889206, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 433.625, "epoch": 0.3416, "grad_norm": 0.035729970782995224, "kl": 0.009447097778320312, "learning_rate": 1.4572256830070497e-06, "loss": -0.0069, "reward": 1.506346195936203, "reward_std": 0.2625518664717674, "rewards/mrr_reward": 0.6086123436689377, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8024365603923798, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 450.375, "epoch": 0.3424, "grad_norm": 0.04041771963238716, "kl": 0.009374618530273438, "learning_rate": 1.4247334380634792e-06, "loss": -0.0142, "reward": 1.44622141122818, "reward_std": 0.19066144712269306, "rewards/mrr_reward": 0.5250868275761604, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8049887120723724, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 463.3125, "epoch": 0.3432, "grad_norm": 0.03661702945828438, "kl": 0.008138656616210938, "learning_rate": 1.3925797299605649e-06, "loss": -0.0252, "reward": 1.3437042236328125, "reward_std": 0.23780394345521927, "rewards/mrr_reward": 0.44283854961395264, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8158335089683533, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 414.75, "epoch": 0.344, "grad_norm": 0.04520437493920326, "kl": 0.012065887451171875, "learning_rate": 1.3607658280716474e-06, "loss": -0.0399, "reward": 1.4789037704467773, "reward_std": 0.3689150810241699, "rewards/mrr_reward": 0.6351562589406967, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.6681386232376099, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 429.0625, "epoch": 0.3448, "grad_norm": 0.0407538115978241, "kl": 0.0075836181640625, "learning_rate": 1.3292929883550998e-06, "loss": 0.0148, "reward": 1.4983892738819122, "reward_std": 0.2416827231645584, "rewards/mrr_reward": 0.5973958522081375, "rewards/rank_answer_foramt_reward": 0.955078125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7908297926187515, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 439.375, "epoch": 0.3456, "grad_norm": 0.038508400321006775, "kl": 0.010146141052246094, "learning_rate": 1.2981624533047432e-06, "loss": -0.0291, "reward": 1.6629530489444733, "reward_std": 0.33460457250475883, "rewards/mrr_reward": 0.7881510555744171, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7134150713682175, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 453.96875, "epoch": 0.3464, "grad_norm": 0.043071672320365906, "kl": 0.012836456298828125, "learning_rate": 1.2673754519008008e-06, "loss": 0.0302, "reward": 1.4194163978099823, "reward_std": 0.21476874873042107, "rewards/mrr_reward": 0.5527033805847168, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7514027804136276, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 428.375, "epoch": 0.3472, "grad_norm": 0.039470795542001724, "kl": 0.00861358642578125, "learning_rate": 1.2369331995613664e-06, "loss": -0.0457, "reward": 1.6773528754711151, "reward_std": 0.2540069557726383, "rewards/mrr_reward": 0.7979166507720947, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.735270619392395, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 428.3125, "epoch": 0.348, "grad_norm": 0.042133934795856476, "kl": 0.012884140014648438, "learning_rate": 1.206836898094439e-06, "loss": -0.0082, "reward": 1.3745841085910797, "reward_std": 0.25170470029115677, "rewards/mrr_reward": 0.4813492000102997, "rewards/rank_answer_foramt_reward": 0.927734375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7946629524230957, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 437.609375, "epoch": 0.3488, "grad_norm": 0.0405392199754715, "kl": 0.010301589965820312, "learning_rate": 1.1770877356504684e-06, "loss": -0.0233, "reward": 1.5826008915901184, "reward_std": 0.24600278958678246, "rewards/mrr_reward": 0.6921875178813934, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7255659103393555, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 427.578125, "epoch": 0.3496, "grad_norm": 0.04396117478609085, "kl": 0.014806747436523438, "learning_rate": 1.1476868866754488e-06, "loss": -0.0133, "reward": 1.5188599526882172, "reward_std": 0.3319630119949579, "rewards/mrr_reward": 0.651041679084301, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7567054778337479, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 425.515625, "epoch": 0.3504, "grad_norm": 0.04198921099305153, "kl": 0.01055908203125, "learning_rate": 1.1186355118645552e-06, "loss": -0.0335, "reward": 1.3222236633300781, "reward_std": 0.3595678508281708, "rewards/mrr_reward": 0.4773871749639511, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7124541997909546, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 443.59375, "epoch": 0.3512, "grad_norm": 0.036484286189079285, "kl": 0.009668350219726562, "learning_rate": 1.0899347581163222e-06, "loss": -0.011, "reward": 1.4655856788158417, "reward_std": 0.2706412933766842, "rewards/mrr_reward": 0.5767113268375397, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7853553593158722, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 461.0, "epoch": 0.352, "grad_norm": 0.04056106507778168, "kl": 0.009960174560546875, "learning_rate": 1.0615857584873624e-06, "loss": -0.0195, "reward": 1.39870023727417, "reward_std": 0.28301356360316277, "rewards/mrr_reward": 0.5242187604308128, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7534593939781189, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 441.078125, "epoch": 0.3528, "grad_norm": 0.0444202646613121, "kl": 0.010128021240234375, "learning_rate": 1.0335896321476413e-06, "loss": -0.0272, "reward": 1.4179519712924957, "reward_std": 0.2796483486890793, "rewards/mrr_reward": 0.5222966149449348, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7961382865905762, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 425.71875, "epoch": 0.3536, "grad_norm": 0.040348101407289505, "kl": 0.009181976318359375, "learning_rate": 1.0059474843362893e-06, "loss": -0.0111, "reward": 1.5824732184410095, "reward_std": 0.27756644412875175, "rewards/mrr_reward": 0.6705729365348816, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8336466401815414, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 428.796875, "epoch": 0.3544, "grad_norm": 0.04108656197786331, "kl": 0.010869979858398438, "learning_rate": 9.786604063179728e-07, "loss": -0.0285, "reward": 1.447850614786148, "reward_std": 0.4478020519018173, "rewards/mrr_reward": 0.6013020873069763, "rewards/rank_answer_foramt_reward": 0.84765625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7332671880722046, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 462.40625, "epoch": 0.3552, "grad_norm": 0.04063072428107262, "kl": 0.009490966796875, "learning_rate": 9.517294753398066e-07, "loss": 0.0038, "reward": 1.4069487750530243, "reward_std": 0.24927948415279388, "rewards/mrr_reward": 0.4890128970146179, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7952955961227417, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 429.546875, "epoch": 0.356, "grad_norm": 0.03845556825399399, "kl": 0.008657455444335938, "learning_rate": 9.251557545888312e-07, "loss": -0.0288, "reward": 1.4185177087783813, "reward_std": 0.25927474722266197, "rewards/mrr_reward": 0.5444196350872517, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7679224908351898, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 432.671875, "epoch": 0.3568, "grad_norm": 0.0459010936319828, "kl": 0.009598731994628906, "learning_rate": 8.989402931500434e-07, "loss": 0.0213, "reward": 1.6813454329967499, "reward_std": 0.3304239772260189, "rewards/mrr_reward": 0.7842820137739182, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7769677639007568, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 421.46875, "epoch": 0.3576, "grad_norm": 0.04193993657827377, "kl": 0.010549545288085938, "learning_rate": 8.730841259649725e-07, "loss": 0.0062, "reward": 1.5157469809055328, "reward_std": 0.34639402106404305, "rewards/mrr_reward": 0.6320250630378723, "rewards/rank_answer_foramt_reward": 0.927734375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7502106577157974, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 439.640625, "epoch": 0.3584, "grad_norm": 0.03967143967747688, "kl": 0.010587692260742188, "learning_rate": 8.475882737908248e-07, "loss": -0.0067, "reward": 1.677318662405014, "reward_std": 0.26558706164360046, "rewards/mrr_reward": 0.760627493262291, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8403518497943878, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 443.765625, "epoch": 0.3592, "grad_norm": 0.04171387106180191, "kl": 0.009777069091796875, "learning_rate": 8.224537431601886e-07, "loss": -0.0275, "reward": 1.4102658927440643, "reward_std": 0.29483452066779137, "rewards/mrr_reward": 0.5575024783611298, "rewards/rank_answer_foramt_reward": 0.849609375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.757959634065628, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 421.625, "epoch": 0.36, "grad_norm": 0.047788720577955246, "kl": 0.010875701904296875, "learning_rate": 7.976815263412963e-07, "loss": -0.0053, "reward": 1.4586642682552338, "reward_std": 0.28997623920440674, "rewards/mrr_reward": 0.5601996779441833, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7929323315620422, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 449.09375, "epoch": 0.3608, "grad_norm": 0.03860335424542427, "kl": 0.007927894592285156, "learning_rate": 7.732726012988512e-07, "loss": 0.0076, "reward": 1.3660823702812195, "reward_std": 0.33029213547706604, "rewards/mrr_reward": 0.4904638007283211, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7490926533937454, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 450.15625, "epoch": 0.3616, "grad_norm": 0.03353331610560417, "kl": 0.009148597717285156, "learning_rate": 7.492279316554207e-07, "loss": -0.0041, "reward": 1.5296534895896912, "reward_std": 0.16189000383019447, "rewards/mrr_reward": 0.6202628910541534, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7967446148395538, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 437.75, "epoch": 0.3624, "grad_norm": 0.04088251292705536, "kl": 0.009881973266601562, "learning_rate": 7.255484666533874e-07, "loss": 0.0072, "reward": 1.5006992816925049, "reward_std": 0.28344327583909035, "rewards/mrr_reward": 0.6140811145305634, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7433622479438782, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 426.59375, "epoch": 0.3632, "grad_norm": 0.04271751642227173, "kl": 0.010400772094726562, "learning_rate": 7.022351411174866e-07, "loss": -0.0219, "reward": 1.3978624939918518, "reward_std": 0.27385834977030754, "rewards/mrr_reward": 0.5089161656796932, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7699484676122665, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 424.171875, "epoch": 0.364, "grad_norm": 0.048336923122406006, "kl": 0.01015472412109375, "learning_rate": 6.792888754178906e-07, "loss": -0.0419, "reward": 1.5317293107509613, "reward_std": 0.38713493943214417, "rewards/mrr_reward": 0.6699218675494194, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7560686320066452, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 444.578125, "epoch": 0.3648, "grad_norm": 0.03923417627811432, "kl": 0.011358261108398438, "learning_rate": 6.567105754338798e-07, "loss": -0.0218, "reward": 1.3666671514511108, "reward_std": 0.2978068180382252, "rewards/mrr_reward": 0.49135666340589523, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7989402711391449, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 427.859375, "epoch": 0.3656, "grad_norm": 0.04292134568095207, "kl": 0.014402389526367188, "learning_rate": 6.345011325180772e-07, "loss": -0.0116, "reward": 1.500946819782257, "reward_std": 0.3475816771388054, "rewards/mrr_reward": 0.613163448870182, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7879088073968887, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 452.609375, "epoch": 0.3664, "grad_norm": 0.03790047764778137, "kl": 0.008701324462890625, "learning_rate": 6.126614234612593e-07, "loss": -0.0025, "reward": 1.5834512114524841, "reward_std": 0.29408957809209824, "rewards/mrr_reward": 0.6931423544883728, "rewards/rank_answer_foramt_reward": 0.94140625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7564990818500519, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 437.46875, "epoch": 0.3672, "grad_norm": 0.041111089289188385, "kl": 0.01041412353515625, "learning_rate": 5.911923104577455e-07, "loss": 0.0538, "reward": 1.5291092991828918, "reward_std": 0.33945245295763016, "rewards/mrr_reward": 0.6747395843267441, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.723764643073082, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 447.40625, "epoch": 0.368, "grad_norm": 0.040803536772727966, "kl": 0.00865936279296875, "learning_rate": 5.700946410713548e-07, "loss": 0.0111, "reward": 1.4347401559352875, "reward_std": 0.23255350813269615, "rewards/mrr_reward": 0.5265811011195183, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8203562796115875, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 451.375, "epoch": 0.3688, "grad_norm": 0.03991984575986862, "kl": 0.010066986083984375, "learning_rate": 5.49369248201953e-07, "loss": -0.0406, "reward": 1.5644738674163818, "reward_std": 0.28139273822307587, "rewards/mrr_reward": 0.6668402999639511, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7630703374743462, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 441.296875, "epoch": 0.3696, "grad_norm": 0.041922301054000854, "kl": 0.009305953979492188, "learning_rate": 5.290169500525577e-07, "loss": 0.0128, "reward": 1.3219963014125824, "reward_std": 0.29185811802744865, "rewards/mrr_reward": 0.44941097497940063, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7848227918148041, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 416.265625, "epoch": 0.3704, "grad_norm": 0.0457732267677784, "kl": 0.009916305541992188, "learning_rate": 5.090385500970551e-07, "loss": -0.0337, "reward": 1.5106676518917084, "reward_std": 0.2413267381489277, "rewards/mrr_reward": 0.6132688522338867, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7740776985883713, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 442.75, "epoch": 0.3712, "grad_norm": 0.040093667805194855, "kl": 0.00926971435546875, "learning_rate": 4.894348370484648e-07, "loss": -0.007, "reward": 1.5942188501358032, "reward_std": 0.2485157772898674, "rewards/mrr_reward": 0.6885416582226753, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8343198895454407, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 446.03125, "epoch": 0.372, "grad_norm": 0.04025663435459137, "kl": 0.010089874267578125, "learning_rate": 4.702065848278126e-07, "loss": -0.0089, "reward": 1.3519022762775421, "reward_std": 0.35284218564629555, "rewards/mrr_reward": 0.5015129074454308, "rewards/rank_answer_foramt_reward": 0.859375, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.7488124072551727, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 452.75, "epoch": 0.3728, "grad_norm": 0.03952139616012573, "kl": 0.011041641235351562, "learning_rate": 4.5135455253357053e-07, "loss": -0.0105, "reward": 1.3247076570987701, "reward_std": 0.30543025955557823, "rewards/mrr_reward": 0.45331722497940063, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7616707235574722, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 427.671875, "epoch": 0.3736, "grad_norm": 0.03882599249482155, "kl": 0.011716842651367188, "learning_rate": 4.3287948441169457e-07, "loss": -0.0342, "reward": 1.689410239458084, "reward_std": 0.31131643801927567, "rewards/mrr_reward": 0.8078124970197678, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7418206632137299, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 436.09375, "epoch": 0.3744, "grad_norm": 0.03910544514656067, "kl": 0.01006317138671875, "learning_rate": 4.1478210982624055e-07, "loss": -0.02, "reward": 1.40711310505867, "reward_std": 0.2673242837190628, "rewards/mrr_reward": 0.5138020887970924, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7695029377937317, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 424.0, "epoch": 0.3752, "grad_norm": 0.039743226021528244, "kl": 0.008121490478515625, "learning_rate": 3.9706314323056936e-07, "loss": -0.045, "reward": 1.4817086458206177, "reward_std": 0.26649700850248337, "rewards/mrr_reward": 0.5803571343421936, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7860555499792099, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 430.765625, "epoch": 0.376, "grad_norm": 0.03648286312818527, "kl": 0.010162353515625, "learning_rate": 3.7972328413914074e-07, "loss": -0.0279, "reward": 1.5091581344604492, "reward_std": 0.3366273455321789, "rewards/mrr_reward": 0.634523794054985, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7343911230564117, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 424.46875, "epoch": 0.3768, "grad_norm": 0.0420987531542778, "kl": 0.012674331665039062, "learning_rate": 3.627632170999029e-07, "loss": -0.0287, "reward": 1.5445944368839264, "reward_std": 0.2770478278398514, "rewards/mrr_reward": 0.6619543731212616, "rewards/rank_answer_foramt_reward": 0.94140625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7488852739334106, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 451.5625, "epoch": 0.3776, "grad_norm": 0.033279478549957275, "kl": 0.007472038269042969, "learning_rate": 3.4618361166726123e-07, "loss": -0.0299, "reward": 1.5285615921020508, "reward_std": 0.3049774765968323, "rewards/mrr_reward": 0.6360863149166107, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8001734763383865, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 439.984375, "epoch": 0.3784, "grad_norm": 0.042792558670043945, "kl": 0.010000228881835938, "learning_rate": 3.2998512237565005e-07, "loss": 0.0096, "reward": 1.6697318851947784, "reward_std": 0.20043689012527466, "rewards/mrr_reward": 0.7655381858348846, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7692774683237076, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 415.984375, "epoch": 0.3792, "grad_norm": 0.040974345058202744, "kl": 0.0115509033203125, "learning_rate": 3.1416838871368925e-07, "loss": -0.0247, "reward": 1.4770236611366272, "reward_std": 0.32821785286068916, "rewards/mrr_reward": 0.6046874970197678, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7489113509654999, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 431.125, "epoch": 0.38, "grad_norm": 0.03732066601514816, "kl": 0.010282516479492188, "learning_rate": 2.987340350989421e-07, "loss": -0.029, "reward": 1.5076874494552612, "reward_std": 0.22487280704081059, "rewards/mrr_reward": 0.5981832966208458, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7775574177503586, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 414.328125, "epoch": 0.3808, "grad_norm": 0.04424848407506943, "kl": 0.009943008422851562, "learning_rate": 2.836826708532603e-07, "loss": -0.0106, "reward": 1.4497572183609009, "reward_std": 0.38473574817180634, "rewards/mrr_reward": 0.592447929084301, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7072817832231522, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 438.390625, "epoch": 0.3816, "grad_norm": 0.042454175651073456, "kl": 0.009542465209960938, "learning_rate": 2.6901489017873375e-07, "loss": 0.0024, "reward": 1.4682879745960236, "reward_std": 0.3182600736618042, "rewards/mrr_reward": 0.5997767746448517, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7138832211494446, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 442.640625, "epoch": 0.3824, "grad_norm": 0.04164545610547066, "kl": 0.010335922241210938, "learning_rate": 2.547312721342277e-07, "loss": 0.0132, "reward": 1.4912968873977661, "reward_std": 0.28201924264431, "rewards/mrr_reward": 0.5958767384290695, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7973784357309341, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 429.328125, "epoch": 0.3832, "grad_norm": 0.04302007704973221, "kl": 0.011838912963867188, "learning_rate": 2.4083238061252565e-07, "loss": -0.0429, "reward": 1.3676048517227173, "reward_std": 0.2852121964097023, "rewards/mrr_reward": 0.5010788887739182, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7371641099452972, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 419.34375, "epoch": 0.384, "grad_norm": 0.04239465668797493, "kl": 0.010568618774414062, "learning_rate": 2.273187643180652e-07, "loss": 0.0114, "reward": 1.5518296658992767, "reward_std": 0.404549989849329, "rewards/mrr_reward": 0.6595238298177719, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8250507712364197, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 437.8125, "epoch": 0.3848, "grad_norm": 0.0668654814362526, "kl": 0.021732330322265625, "learning_rate": 2.1419095674527934e-07, "loss": -0.0356, "reward": 1.413476824760437, "reward_std": 0.3368876613676548, "rewards/mrr_reward": 0.5408854186534882, "rewards/rank_answer_foramt_reward": 0.84765625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7965600192546844, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 443.21875, "epoch": 0.3856, "grad_norm": 0.045830436050891876, "kl": 0.011808395385742188, "learning_rate": 2.014494761575314e-07, "loss": -0.0255, "reward": 1.5131396651268005, "reward_std": 0.3415753096342087, "rewards/mrr_reward": 0.6448970809578896, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7443193197250366, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 448.328125, "epoch": 0.3864, "grad_norm": 0.03847775235772133, "kl": 0.009500503540039062, "learning_rate": 1.8909482556666026e-07, "loss": -0.0099, "reward": 1.3552064895629883, "reward_std": 0.21233711391687393, "rewards/mrr_reward": 0.4519283324480057, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7879875898361206, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 435.21875, "epoch": 0.3872, "grad_norm": 0.04119429737329483, "kl": 0.011724472045898438, "learning_rate": 1.7712749271311392e-07, "loss": -0.0224, "reward": 1.6171193420886993, "reward_std": 0.33994160592556, "rewards/mrr_reward": 0.7380208224058151, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.749872237443924, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 425.203125, "epoch": 0.388, "grad_norm": 0.043465420603752136, "kl": 0.011735916137695312, "learning_rate": 1.6554795004670389e-07, "loss": -0.0467, "reward": 1.3008288443088531, "reward_std": 0.34553826972842216, "rewards/mrr_reward": 0.4407738149166107, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7390396296977997, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 422.765625, "epoch": 0.3888, "grad_norm": 0.043878600001335144, "kl": 0.010034561157226562, "learning_rate": 1.543566547079467e-07, "loss": -0.009, "reward": 1.3604202717542648, "reward_std": 0.2537879031151533, "rewards/mrr_reward": 0.49144964292645454, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7387129366397858, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 400.0625, "epoch": 0.3896, "grad_norm": 0.05306778848171234, "kl": 0.0155029296875, "learning_rate": 1.4355404851001953e-07, "loss": -0.0855, "reward": 1.5500088930130005, "reward_std": 0.2238281350582838, "rewards/mrr_reward": 0.7127604186534882, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.6211006790399551, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 415.703125, "epoch": 0.3904, "grad_norm": 0.046678684651851654, "kl": 0.009532928466796875, "learning_rate": 1.3314055792131964e-07, "loss": -0.009, "reward": 1.4595491290092468, "reward_std": 0.25613268837332726, "rewards/mrr_reward": 0.5737351253628731, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7956129163503647, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 462.25, "epoch": 0.3912, "grad_norm": 0.04260161146521568, "kl": 0.009710311889648438, "learning_rate": 1.231165940486234e-07, "loss": -0.0066, "reward": 1.4045831859111786, "reward_std": 0.31474703550338745, "rewards/mrr_reward": 0.4935515895485878, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8642172515392303, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 435.96875, "epoch": 0.392, "grad_norm": 0.03883928805589676, "kl": 0.008863449096679688, "learning_rate": 1.134825526208605e-07, "loss": -0.0364, "reward": 1.4408005475997925, "reward_std": 0.33699289709329605, "rewards/mrr_reward": 0.5509920567274094, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7784203439950943, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 426.984375, "epoch": 0.3928, "grad_norm": 0.0388728491961956, "kl": 0.010242462158203125, "learning_rate": 1.0423881397349067e-07, "loss": -0.0279, "reward": 1.3634454309940338, "reward_std": 0.32560136914253235, "rewards/mrr_reward": 0.5041666775941849, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7366872876882553, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 435.8125, "epoch": 0.3936, "grad_norm": 0.03954172879457474, "kl": 0.008582115173339844, "learning_rate": 9.538574303348813e-08, "loss": -0.015, "reward": 1.2776685655117035, "reward_std": 0.2750714495778084, "rewards/mrr_reward": 0.3992125578224659, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7596440017223358, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 427.1875, "epoch": 0.3944, "grad_norm": 0.04806819558143616, "kl": 0.0110321044921875, "learning_rate": 8.692368930493522e-08, "loss": -0.0395, "reward": 1.3417856693267822, "reward_std": 0.21769994869828224, "rewards/mrr_reward": 0.48239708691835403, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7233483120799065, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 439.09375, "epoch": 0.3952, "grad_norm": 0.042950328439474106, "kl": 0.010545730590820312, "learning_rate": 7.885298685522235e-08, "loss": -0.0172, "reward": 1.4324612021446228, "reward_std": 0.3757603354752064, "rewards/mrr_reward": 0.5950520783662796, "rewards/rank_answer_foramt_reward": 0.79296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7524469792842865, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 421.625, "epoch": 0.396, "grad_norm": 0.04140486195683479, "kl": 0.010625839233398438, "learning_rate": 7.117395430186414e-08, "loss": -0.018, "reward": 1.4777464866638184, "reward_std": 0.2758827228099108, "rewards/mrr_reward": 0.5884734690189362, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7865635007619858, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 434.984375, "epoch": 0.3968, "grad_norm": 0.045392703264951706, "kl": 0.009822845458984375, "learning_rate": 6.388689479991606e-08, "loss": -0.0167, "reward": 1.2444620728492737, "reward_std": 0.2947104088962078, "rewards/mrr_reward": 0.3777901865541935, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7551845461130142, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 476.234375, "epoch": 0.3976, "grad_norm": 0.03996572643518448, "kl": 0.0064067840576171875, "learning_rate": 5.699209603001077e-08, "loss": 0.0273, "reward": 1.4518903493881226, "reward_std": 0.1597257237881422, "rewards/mrr_reward": 0.5305245518684387, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8349859565496445, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 438.203125, "epoch": 0.3984, "grad_norm": 0.04201935976743698, "kl": 0.013742446899414062, "learning_rate": 5.048983018699827e-08, "loss": -0.0298, "reward": 1.4749173521995544, "reward_std": 0.45922574400901794, "rewards/mrr_reward": 0.6170573085546494, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7734040170907974, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 407.984375, "epoch": 0.3992, "grad_norm": 0.043673526495695114, "kl": 0.011587142944335938, "learning_rate": 4.438035396920004e-08, "loss": -0.029, "reward": 1.3924769461154938, "reward_std": 0.3469556476920843, "rewards/mrr_reward": 0.5519531145691872, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.6583698391914368, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 443.71875, "epoch": 0.4, "grad_norm": 0.04227694869041443, "kl": 0.01059722900390625, "learning_rate": 3.866390856827495e-08, "loss": -0.0028, "reward": 1.4461922645568848, "reward_std": 0.32431307435035706, "rewards/mrr_reward": 0.5536830350756645, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.81004199385643, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 475.90625, "epoch": 0.4008, "grad_norm": 0.042455315589904785, "kl": 0.009504318237304688, "learning_rate": 1.0408293519785103e-05, "loss": 0.0258, "reward": 1.0929251462221146, "reward_std": 0.21385096292942762, "rewards/mrr_reward": 0.1986049171537161, "rewards/rank_answer_foramt_reward": 0.849609375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8604518622159958, "step": 501 }, { "clip_ratio": 0.0, "completion_length": 458.5, "epoch": 0.4016, "grad_norm": 0.03798682987689972, "kl": 0.008695602416992188, "learning_rate": 1.0376901826699349e-05, "loss": 0.0227, "reward": 1.2380023002624512, "reward_std": 0.1255473867058754, "rewards/mrr_reward": 0.30468748323619366, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8555702418088913, "step": 502 }, { "clip_ratio": 0.0, "completion_length": 461.65625, "epoch": 0.4024, "grad_norm": 0.037431634962558746, "kl": 0.0093994140625, "learning_rate": 1.0345506413744726e-05, "loss": -0.0127, "reward": 1.179205298423767, "reward_std": 0.12070270255208015, "rewards/mrr_reward": 0.2780816126614809, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8146621286869049, "step": 503 }, { "clip_ratio": 0.0, "completion_length": 498.25, "epoch": 0.4032, "grad_norm": 0.038511723279953, "kl": 0.009913444519042969, "learning_rate": 1.0314107590781284e-05, "loss": -0.0139, "reward": 1.0198038667440414, "reward_std": 0.16722827591001987, "rewards/mrr_reward": 0.12537202425301075, "rewards/rank_answer_foramt_reward": 0.859375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8510243445634842, "step": 504 }, { "clip_ratio": 0.0, "completion_length": 453.828125, "epoch": 0.404, "grad_norm": 0.039540309458971024, "kl": 0.009187698364257812, "learning_rate": 1.0282705667702734e-05, "loss": -0.0003, "reward": 1.3496000170707703, "reward_std": 0.24421941116452217, "rewards/mrr_reward": 0.4431547671556473, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8014911860227585, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 471.28125, "epoch": 0.4048, "grad_norm": 0.0396139994263649, "kl": 0.008319854736328125, "learning_rate": 1.0251300954433377e-05, "loss": -0.0217, "reward": 1.1254372000694275, "reward_std": 0.1360313263721764, "rewards/mrr_reward": 0.19592014141380787, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.838202714920044, "step": 506 }, { "clip_ratio": 0.0, "completion_length": 432.1875, "epoch": 0.4056, "grad_norm": 0.04718116670846939, "kl": 0.009462356567382812, "learning_rate": 1.0219893760925053e-05, "loss": -0.0408, "reward": 1.112642079591751, "reward_std": 0.19297415390610695, "rewards/mrr_reward": 0.23144841380417347, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7562213093042374, "step": 507 }, { "clip_ratio": 0.0, "completion_length": 483.390625, "epoch": 0.4064, "grad_norm": 0.035113196820020676, "kl": 0.008016586303710938, "learning_rate": 1.0188484397154083e-05, "loss": 0.005, "reward": 1.0684494376182556, "reward_std": 0.23044782131910324, "rewards/mrr_reward": 0.18250869028270245, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.829199954867363, "step": 508 }, { "clip_ratio": 0.0, "completion_length": 453.25, "epoch": 0.4072, "grad_norm": 0.04335900396108627, "kl": 0.009159088134765625, "learning_rate": 1.0157073173118207e-05, "loss": -0.0511, "reward": 1.1197124123573303, "reward_std": 0.21612372063100338, "rewards/mrr_reward": 0.22139137983322144, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8042160123586655, "step": 509 }, { "clip_ratio": 0.0, "completion_length": 478.984375, "epoch": 0.408, "grad_norm": 0.037360094487667084, "kl": 0.009061813354492188, "learning_rate": 1.0125660398833528e-05, "loss": -0.0163, "reward": 1.2794201076030731, "reward_std": 0.19604488089680672, "rewards/mrr_reward": 0.3479786738753319, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.86356520652771, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 448.234375, "epoch": 0.4088, "grad_norm": 0.03932580351829529, "kl": 0.008629798889160156, "learning_rate": 1.0094246384331444e-05, "loss": -0.025, "reward": 1.1660117506980896, "reward_std": 0.20460292138159275, "rewards/mrr_reward": 0.2748697977513075, "rewards/rank_answer_foramt_reward": 0.927734375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7726956307888031, "step": 511 }, { "clip_ratio": 0.0, "completion_length": 434.53125, "epoch": 0.4096, "grad_norm": 0.04939593747258186, "kl": 0.008899688720703125, "learning_rate": 1.0062831439655591e-05, "loss": 0.005, "reward": 1.1041904985904694, "reward_std": 0.15664683748036623, "rewards/mrr_reward": 0.19107143580913544, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8471054881811142, "step": 512 }, { "clip_ratio": 0.0, "completion_length": 457.34375, "epoch": 0.4104, "grad_norm": 0.03681405261158943, "kl": 0.007862091064453125, "learning_rate": 1.0031415874858796e-05, "loss": -0.0162, "reward": 1.1272632777690887, "reward_std": 0.15625868551433086, "rewards/mrr_reward": 0.20879836566746235, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.8496331721544266, "step": 513 }, { "clip_ratio": 0.0, "completion_length": 448.375, "epoch": 0.4112, "grad_norm": 0.03907299041748047, "kl": 0.008602142333984375, "learning_rate": 1e-05, "loss": -0.0077, "reward": 1.1561536490917206, "reward_std": 0.15583913400769234, "rewards/mrr_reward": 0.24771205335855484, "rewards/rank_answer_foramt_reward": 0.8984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8622283041477203, "step": 514 }, { "clip_ratio": 0.0, "completion_length": 458.21875, "epoch": 0.412, "grad_norm": 0.043046656996011734, "kl": 0.010164260864257812, "learning_rate": 9.968584125141206e-06, "loss": -0.0261, "reward": 1.0769396722316742, "reward_std": 0.2242715172469616, "rewards/mrr_reward": 0.20158729702234268, "rewards/rank_answer_foramt_reward": 0.84765625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8049266040325165, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 474.9375, "epoch": 0.4128, "grad_norm": 0.03716719150543213, "kl": 0.007334709167480469, "learning_rate": 9.937168560344412e-06, "loss": -0.0037, "reward": 1.192034512758255, "reward_std": 0.18218636699020863, "rewards/mrr_reward": 0.25520213693380356, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8388857245445251, "step": 516 }, { "clip_ratio": 0.0, "completion_length": 471.3125, "epoch": 0.4136, "grad_norm": 0.03868447244167328, "kl": 0.0097503662109375, "learning_rate": 9.905753615668561e-06, "loss": -0.0471, "reward": 1.096491515636444, "reward_std": 0.14279304654337466, "rewards/mrr_reward": 0.1803695447742939, "rewards/rank_answer_foramt_reward": 0.953125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8308144211769104, "step": 517 }, { "clip_ratio": 0.0, "completion_length": 452.125, "epoch": 0.4144, "grad_norm": 0.03858815133571625, "kl": 0.00782012939453125, "learning_rate": 9.874339601166474e-06, "loss": -0.0198, "reward": 1.2538862526416779, "reward_std": 0.2349272146821022, "rewards/mrr_reward": 0.35144468769431114, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8264680653810501, "step": 518 }, { "clip_ratio": 0.0, "completion_length": 476.1875, "epoch": 0.4152, "grad_norm": 0.04030821472406387, "kl": 0.007167816162109375, "learning_rate": 9.842926826881796e-06, "loss": -0.0117, "reward": 1.15766641497612, "reward_std": 0.20966706797480583, "rewards/mrr_reward": 0.251940730959177, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8149357289075851, "step": 519 }, { "clip_ratio": 0.0, "completion_length": 483.328125, "epoch": 0.416, "grad_norm": 0.03588611260056496, "kl": 0.007293701171875, "learning_rate": 9.81151560284592e-06, "loss": -0.0062, "reward": 1.1331438720226288, "reward_std": 0.18311648909002542, "rewards/mrr_reward": 0.21963665634393692, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8111724406480789, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 464.6875, "epoch": 0.4168, "grad_norm": 0.039640478789806366, "kl": 0.008254051208496094, "learning_rate": 9.78010623907495e-06, "loss": -0.0318, "reward": 1.0459920465946198, "reward_std": 0.08210169477388263, "rewards/mrr_reward": 0.11974826268851757, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8556275069713593, "step": 521 }, { "clip_ratio": 0.0, "completion_length": 450.234375, "epoch": 0.4176, "grad_norm": 0.041469115763902664, "kl": 0.008711814880371094, "learning_rate": 9.748699045566626e-06, "loss": -0.0294, "reward": 1.1578961312770844, "reward_std": 0.18251374177634716, "rewards/mrr_reward": 0.2508494593203068, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.818938821554184, "step": 522 }, { "clip_ratio": 0.0, "completion_length": 439.65625, "epoch": 0.4184, "grad_norm": 0.03994135931134224, "kl": 0.010793685913085938, "learning_rate": 9.717294332297269e-06, "loss": -0.0525, "reward": 1.147641807794571, "reward_std": 0.2086065262556076, "rewards/mrr_reward": 0.25408606603741646, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7702446281909943, "step": 523 }, { "clip_ratio": 0.0, "completion_length": 443.5, "epoch": 0.4192, "grad_norm": 0.04150420427322388, "kl": 0.008188247680664062, "learning_rate": 9.685892409218718e-06, "loss": 0.0358, "reward": 1.186688095331192, "reward_std": 0.18247143924236298, "rewards/mrr_reward": 0.29063739627599716, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.812961220741272, "step": 524 }, { "clip_ratio": 0.0, "completion_length": 450.0625, "epoch": 0.42, "grad_norm": 0.04004862159490585, "kl": 0.00971221923828125, "learning_rate": 9.654493586255279e-06, "loss": -0.0507, "reward": 1.134672150015831, "reward_std": 0.20101629197597504, "rewards/mrr_reward": 0.23550966568291187, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8067660629749298, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 455.03125, "epoch": 0.4208, "grad_norm": 0.041226062923669815, "kl": 0.008754730224609375, "learning_rate": 9.623098173300655e-06, "loss": 0.0024, "reward": 1.1712295413017273, "reward_std": 0.17994631733745337, "rewards/mrr_reward": 0.2511904891580343, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8231534212827682, "step": 526 }, { "clip_ratio": 0.0, "completion_length": 460.171875, "epoch": 0.4216, "grad_norm": 0.03775149956345558, "kl": 0.012889862060546875, "learning_rate": 9.5917064802149e-06, "loss": -0.034, "reward": 1.1483985781669617, "reward_std": 0.12394142523407936, "rewards/mrr_reward": 0.21857019513845444, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8528178334236145, "step": 527 }, { "clip_ratio": 0.0, "completion_length": 454.875, "epoch": 0.4224, "grad_norm": 0.03994860127568245, "kl": 0.008665084838867188, "learning_rate": 9.560318816821354e-06, "loss": -0.0014, "reward": 1.161629170179367, "reward_std": 0.11074419878423214, "rewards/mrr_reward": 0.23227927647531033, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8572273999452591, "step": 528 }, { "clip_ratio": 0.0, "completion_length": 464.328125, "epoch": 0.4232, "grad_norm": 0.037875980138778687, "kl": 0.008001327514648438, "learning_rate": 9.528935492903575e-06, "loss": -0.0433, "reward": 1.1144822239875793, "reward_std": 0.15936855971813202, "rewards/mrr_reward": 0.19868551194667816, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8024852275848389, "step": 529 }, { "clip_ratio": 0.0, "completion_length": 473.328125, "epoch": 0.424, "grad_norm": 0.038447290658950806, "kl": 0.008828163146972656, "learning_rate": 9.497556818202306e-06, "loss": -0.029, "reward": 1.1741737723350525, "reward_std": 0.09742028824985027, "rewards/mrr_reward": 0.25808530673384666, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8033694326877594, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 461.28125, "epoch": 0.4248, "grad_norm": 0.036154162138700485, "kl": 0.008661270141601562, "learning_rate": 9.466183102412397e-06, "loss": 0.0113, "reward": 1.1091251969337463, "reward_std": 0.19129342585802078, "rewards/mrr_reward": 0.19771825522184372, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8184796869754791, "step": 531 }, { "clip_ratio": 0.0, "completion_length": 477.328125, "epoch": 0.4256, "grad_norm": 0.037424515932798386, "kl": 0.0073680877685546875, "learning_rate": 9.434814655179756e-06, "loss": 0.0312, "reward": 1.1813635230064392, "reward_std": 0.1663509365171194, "rewards/mrr_reward": 0.2525049652904272, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8440196663141251, "step": 532 }, { "clip_ratio": 0.0, "completion_length": 440.5, "epoch": 0.4264, "grad_norm": 0.03965657949447632, "kl": 0.00954437255859375, "learning_rate": 9.403451786098295e-06, "loss": -0.0287, "reward": 1.1262851357460022, "reward_std": 0.24533293023705482, "rewards/mrr_reward": 0.2629836406558752, "rewards/rank_answer_foramt_reward": 0.84765625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.776221290230751, "step": 533 }, { "clip_ratio": 0.0, "completion_length": 452.9375, "epoch": 0.4272, "grad_norm": 0.043141767382621765, "kl": 0.01007843017578125, "learning_rate": 9.372094804706867e-06, "loss": 0.0115, "reward": 1.135538101196289, "reward_std": 0.17273031920194626, "rewards/mrr_reward": 0.24095982685685158, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8084994107484818, "step": 534 }, { "clip_ratio": 0.0, "completion_length": 449.828125, "epoch": 0.428, "grad_norm": 0.035875458270311356, "kl": 0.007419586181640625, "learning_rate": 9.340744020486223e-06, "loss": 0.0009, "reward": 1.1043555736541748, "reward_std": 0.09360181912779808, "rewards/mrr_reward": 0.17147817462682724, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8269011080265045, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 453.3125, "epoch": 0.4288, "grad_norm": 0.035530660301446915, "kl": 0.009695053100585938, "learning_rate": 9.309399742855943e-06, "loss": -0.0219, "reward": 1.0510781407356262, "reward_std": 0.10896967723965645, "rewards/mrr_reward": 0.1329303029924631, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8467190861701965, "step": 536 }, { "clip_ratio": 0.0, "completion_length": 454.71875, "epoch": 0.4296, "grad_norm": 0.040476907044649124, "kl": 0.008740425109863281, "learning_rate": 9.278062281171394e-06, "loss": -0.0039, "reward": 1.104703813791275, "reward_std": 0.21153176575899124, "rewards/mrr_reward": 0.20956101268529892, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8023973703384399, "step": 537 }, { "clip_ratio": 0.0, "completion_length": 464.265625, "epoch": 0.4304, "grad_norm": 0.04103953763842583, "kl": 0.007740974426269531, "learning_rate": 9.246731944720675e-06, "loss": -0.0205, "reward": 1.0625648498535156, "reward_std": 0.09056078270077705, "rewards/mrr_reward": 0.1506386436522007, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8005219399929047, "step": 538 }, { "clip_ratio": 0.0, "completion_length": 460.9375, "epoch": 0.4312, "grad_norm": 0.040170818567276, "kl": 0.007616996765136719, "learning_rate": 9.215409042721553e-06, "loss": -0.0218, "reward": 1.1552115380764008, "reward_std": 0.10027447901666164, "rewards/mrr_reward": 0.23503224551677704, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8294375985860825, "step": 539 }, { "clip_ratio": 0.0, "completion_length": 474.53125, "epoch": 0.432, "grad_norm": 0.042113546282052994, "kl": 0.00826263427734375, "learning_rate": 9.184093884318426e-06, "loss": 0.0013, "reward": 1.070628046989441, "reward_std": 0.11641362123191357, "rewards/mrr_reward": 0.14624256640672684, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8773399144411087, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 465.984375, "epoch": 0.4328, "grad_norm": 0.03919651731848717, "kl": 0.008460044860839844, "learning_rate": 9.152786778579266e-06, "loss": -0.0144, "reward": 1.205646112561226, "reward_std": 0.24700526893138885, "rewards/mrr_reward": 0.31289682909846306, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8283476531505585, "step": 541 }, { "clip_ratio": 0.0, "completion_length": 472.078125, "epoch": 0.4336, "grad_norm": 0.04152778908610344, "kl": 0.008693695068359375, "learning_rate": 9.121488034492569e-06, "loss": 0.006, "reward": 1.1138120293617249, "reward_std": 0.19620881974697113, "rewards/mrr_reward": 0.2173549085855484, "rewards/rank_answer_foramt_reward": 0.923828125, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.8239585608243942, "step": 542 }, { "clip_ratio": 0.0, "completion_length": 453.3125, "epoch": 0.4344, "grad_norm": 0.03831581026315689, "kl": 0.008653640747070312, "learning_rate": 9.090197960964301e-06, "loss": -0.0178, "reward": 1.043550044298172, "reward_std": 0.12762367445975542, "rewards/mrr_reward": 0.14636037312448025, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7812563478946686, "step": 543 }, { "clip_ratio": 0.0, "completion_length": 445.203125, "epoch": 0.4352, "grad_norm": 0.04002196714282036, "kl": 0.0078277587890625, "learning_rate": 9.058916866814857e-06, "loss": -0.0065, "reward": 1.2575414180755615, "reward_std": 0.13060189969837666, "rewards/mrr_reward": 0.3513020761311054, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8106326907873154, "step": 544 }, { "clip_ratio": 0.0, "completion_length": 480.171875, "epoch": 0.436, "grad_norm": 0.039209943264722824, "kl": 0.00780487060546875, "learning_rate": 9.027645060776008e-06, "loss": -0.0084, "reward": 1.1186776161193848, "reward_std": 0.14777595922350883, "rewards/mrr_reward": 0.22275545448064804, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8281967639923096, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 427.515625, "epoch": 0.4368, "grad_norm": 0.04363907501101494, "kl": 0.008817672729492188, "learning_rate": 8.996382851487851e-06, "loss": -0.0523, "reward": 1.1956347823143005, "reward_std": 0.21989241987466812, "rewards/mrr_reward": 0.30300718545913696, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7596193552017212, "step": 546 }, { "clip_ratio": 0.0, "completion_length": 468.03125, "epoch": 0.4376, "grad_norm": 0.03548242896795273, "kl": 0.0070171356201171875, "learning_rate": 8.965130547495777e-06, "loss": -0.0023, "reward": 1.1320178806781769, "reward_std": 0.11437905207276344, "rewards/mrr_reward": 0.20006200671195984, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8319211304187775, "step": 547 }, { "clip_ratio": 0.0, "completion_length": 482.28125, "epoch": 0.4384, "grad_norm": 0.03946579992771149, "kl": 0.008230209350585938, "learning_rate": 8.933888457247402e-06, "loss": 0.0214, "reward": 1.1079612374305725, "reward_std": 0.21605945192277431, "rewards/mrr_reward": 0.21192336827516556, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.8383130878210068, "step": 548 }, { "clip_ratio": 0.0, "completion_length": 468.796875, "epoch": 0.4392, "grad_norm": 0.03934666886925697, "kl": 0.007843017578125, "learning_rate": 8.902656889089548e-06, "loss": -0.0018, "reward": 1.1603901982307434, "reward_std": 0.18869752623140812, "rewards/mrr_reward": 0.23634053207933903, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8274939954280853, "step": 549 }, { "clip_ratio": 0.0, "completion_length": 439.234375, "epoch": 0.44, "grad_norm": 0.03816671669483185, "kl": 0.008211135864257812, "learning_rate": 8.871436151265183e-06, "loss": -0.0658, "reward": 1.1312854886054993, "reward_std": 0.1634649522602558, "rewards/mrr_reward": 0.22075272910296917, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.79434634745121, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 431.03125, "epoch": 0.4408, "grad_norm": 0.049642350524663925, "kl": 0.008757591247558594, "learning_rate": 8.840226551910387e-06, "loss": -0.0436, "reward": 1.0978001356124878, "reward_std": 0.1608146745711565, "rewards/mrr_reward": 0.18125620111823082, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8184214979410172, "step": 551 }, { "clip_ratio": 0.0, "completion_length": 469.171875, "epoch": 0.4416, "grad_norm": 0.03793758898973465, "kl": 0.008665084838867188, "learning_rate": 8.809028399051302e-06, "loss": 0.0105, "reward": 1.1738522052764893, "reward_std": 0.10969773586839437, "rewards/mrr_reward": 0.242367310449481, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8773688822984695, "step": 552 }, { "clip_ratio": 0.0, "completion_length": 442.40625, "epoch": 0.4424, "grad_norm": 0.039615653455257416, "kl": 0.008930206298828125, "learning_rate": 8.777842000601106e-06, "loss": 0.0215, "reward": 1.2303386330604553, "reward_std": 0.22473370283842087, "rewards/mrr_reward": 0.32853423431515694, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7678967267274857, "step": 553 }, { "clip_ratio": 0.0, "completion_length": 483.265625, "epoch": 0.4432, "grad_norm": 0.039541710168123245, "kl": 0.007976531982421875, "learning_rate": 8.746667664356957e-06, "loss": 0.0074, "reward": 1.1821573972702026, "reward_std": 0.13846352836117148, "rewards/mrr_reward": 0.2504030168056488, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8742790371179581, "step": 554 }, { "clip_ratio": 0.0, "completion_length": 493.140625, "epoch": 0.444, "grad_norm": 0.03958950564265251, "kl": 0.007397651672363281, "learning_rate": 8.715505697996972e-06, "loss": -0.0001, "reward": 1.1536555588245392, "reward_std": 0.1938557531684637, "rewards/mrr_reward": 0.23748759925365448, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8407195657491684, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 441.53125, "epoch": 0.4448, "grad_norm": 0.04035869613289833, "kl": 0.0110321044921875, "learning_rate": 8.684356409077177e-06, "loss": -0.0185, "reward": 1.1211222410202026, "reward_std": 0.15442631393671036, "rewards/mrr_reward": 0.20714285783469677, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8067436665296555, "step": 556 }, { "clip_ratio": 0.0, "completion_length": 469.671875, "epoch": 0.4456, "grad_norm": 0.04031984135508537, "kl": 0.007228851318359375, "learning_rate": 8.653220105028476e-06, "loss": -0.0004, "reward": 1.3500173687934875, "reward_std": 0.16067894361913204, "rewards/mrr_reward": 0.3977244533598423, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.885735884308815, "step": 557 }, { "clip_ratio": 0.0, "completion_length": 475.40625, "epoch": 0.4464, "grad_norm": 0.03806934878230095, "kl": 0.008632659912109375, "learning_rate": 8.62209709315362e-06, "loss": -0.028, "reward": 1.1859273612499237, "reward_std": 0.16438952274620533, "rewards/mrr_reward": 0.2831225246191025, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7904596626758575, "step": 558 }, { "clip_ratio": 0.0, "completion_length": 471.609375, "epoch": 0.4472, "grad_norm": 0.03851349279284477, "kl": 0.00989532470703125, "learning_rate": 8.590987680624174e-06, "loss": -0.0014, "reward": 1.124581664800644, "reward_std": 0.14083863236010075, "rewards/mrr_reward": 0.23160961642861366, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8114445358514786, "step": 559 }, { "clip_ratio": 0.0, "completion_length": 448.796875, "epoch": 0.448, "grad_norm": 0.03883276879787445, "kl": 0.009004592895507812, "learning_rate": 8.559892174477478e-06, "loss": -0.0232, "reward": 1.2451259791851044, "reward_std": 0.20462225936353207, "rewards/mrr_reward": 0.3293340802192688, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8591111749410629, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 451.71875, "epoch": 0.4488, "grad_norm": 0.041181642562150955, "kl": 0.008396148681640625, "learning_rate": 8.528810881613626e-06, "loss": -0.0414, "reward": 1.1230032444000244, "reward_std": 0.16564571298658848, "rewards/mrr_reward": 0.22324529103934765, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7968515604734421, "step": 561 }, { "clip_ratio": 0.0, "completion_length": 464.421875, "epoch": 0.4496, "grad_norm": 0.03920190408825874, "kl": 0.008289337158203125, "learning_rate": 8.49774410879243e-06, "loss": 0.0054, "reward": 1.0872045755386353, "reward_std": 0.12622703425586224, "rewards/mrr_reward": 0.16274181567132473, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8580428808927536, "step": 562 }, { "clip_ratio": 0.0, "completion_length": 481.296875, "epoch": 0.4504, "grad_norm": 0.03609956428408623, "kl": 0.007109642028808594, "learning_rate": 8.466692162630393e-06, "loss": 0.0235, "reward": 1.2699479758739471, "reward_std": 0.1954438053071499, "rewards/mrr_reward": 0.3503100275993347, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8766252249479294, "step": 563 }, { "clip_ratio": 0.0, "completion_length": 453.921875, "epoch": 0.4512, "grad_norm": 0.04106234386563301, "kl": 0.008481979370117188, "learning_rate": 8.43565534959769e-06, "loss": -0.0142, "reward": 1.0173709094524384, "reward_std": 0.1483047273941338, "rewards/mrr_reward": 0.12945188395678997, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8234761506319046, "step": 564 }, { "clip_ratio": 0.0, "completion_length": 446.90625, "epoch": 0.452, "grad_norm": 0.03835194930434227, "kl": 0.00774383544921875, "learning_rate": 8.404633976015136e-06, "loss": -0.0165, "reward": 1.1766445636749268, "reward_std": 0.12921269796788692, "rewards/mrr_reward": 0.2505580559372902, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8278071284294128, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 457.046875, "epoch": 0.4528, "grad_norm": 0.03958544135093689, "kl": 0.008856773376464844, "learning_rate": 8.373628348051165e-06, "loss": 0.0131, "reward": 1.1318785846233368, "reward_std": 0.11855714162811637, "rewards/mrr_reward": 0.22342510148882866, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8310142606496811, "step": 566 }, { "clip_ratio": 0.0, "completion_length": 452.546875, "epoch": 0.4536, "grad_norm": 0.04320928454399109, "kl": 0.008389472961425781, "learning_rate": 8.342638771718804e-06, "loss": 0.0107, "reward": 1.128523826599121, "reward_std": 0.16640270128846169, "rewards/mrr_reward": 0.21039807423949242, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8583710491657257, "step": 567 }, { "clip_ratio": 0.0, "completion_length": 466.953125, "epoch": 0.4544, "grad_norm": 0.04389907792210579, "kl": 0.009497642517089844, "learning_rate": 8.311665552872662e-06, "loss": -0.0266, "reward": 1.0887642204761505, "reward_std": 0.18486547190696, "rewards/mrr_reward": 0.18142360635101795, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.831548199057579, "step": 568 }, { "clip_ratio": 0.0, "completion_length": 435.40625, "epoch": 0.4552, "grad_norm": 0.04440563917160034, "kl": 0.008943557739257812, "learning_rate": 8.280708997205904e-06, "loss": -0.0072, "reward": 1.2277150750160217, "reward_std": 0.24825106747448444, "rewards/mrr_reward": 0.36322543025016785, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7173216342926025, "step": 569 }, { "clip_ratio": 0.0, "completion_length": 465.21875, "epoch": 0.456, "grad_norm": 0.0392410084605217, "kl": 0.009092330932617188, "learning_rate": 8.249769410247239e-06, "loss": 0.0086, "reward": 1.204152375459671, "reward_std": 0.21110957488417625, "rewards/mrr_reward": 0.2886160612106323, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8290398269891739, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 450.046875, "epoch": 0.4568, "grad_norm": 0.0392969585955143, "kl": 0.007068634033203125, "learning_rate": 8.218847097357898e-06, "loss": -0.0417, "reward": 1.1151992976665497, "reward_std": 0.12983771739527583, "rewards/mrr_reward": 0.17941468209028244, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.849382758140564, "step": 571 }, { "clip_ratio": 0.0, "completion_length": 441.65625, "epoch": 0.4576, "grad_norm": 0.037597186863422394, "kl": 0.007890701293945312, "learning_rate": 8.187942363728626e-06, "loss": -0.0264, "reward": 1.0076032876968384, "reward_std": 0.11738604307174683, "rewards/mrr_reward": 0.10858755186200142, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.827805757522583, "step": 572 }, { "clip_ratio": 0.0, "completion_length": 407.921875, "epoch": 0.4584, "grad_norm": 0.041158709675073624, "kl": 0.009695053100585938, "learning_rate": 8.157055514376667e-06, "loss": -0.0447, "reward": 1.2002838253974915, "reward_std": 0.1709437482059002, "rewards/mrr_reward": 0.31932043842971325, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.6910702735185623, "step": 573 }, { "clip_ratio": 0.0, "completion_length": 445.96875, "epoch": 0.4592, "grad_norm": 0.040003228932619095, "kl": 0.009104728698730469, "learning_rate": 8.126186854142752e-06, "loss": -0.0428, "reward": 1.1452887952327728, "reward_std": 0.1888815239071846, "rewards/mrr_reward": 0.24434524402022362, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8277881443500519, "step": 574 }, { "clip_ratio": 0.0, "completion_length": 478.0, "epoch": 0.46, "grad_norm": 0.03768211975693703, "kl": 0.006785392761230469, "learning_rate": 8.095336687688102e-06, "loss": 0.004, "reward": 1.2456986904144287, "reward_std": 0.1416485607624054, "rewards/mrr_reward": 0.31855158507823944, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8505522757768631, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 456.8125, "epoch": 0.4608, "grad_norm": 0.04362956061959267, "kl": 0.007818222045898438, "learning_rate": 8.064505319491398e-06, "loss": -0.0028, "reward": 1.2511990666389465, "reward_std": 0.19724944420158863, "rewards/mrr_reward": 0.3255952559411526, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8400160819292068, "step": 576 }, { "clip_ratio": 0.0, "completion_length": 447.796875, "epoch": 0.4616, "grad_norm": 0.040099550038576126, "kl": 0.009653091430664062, "learning_rate": 8.033693053845801e-06, "loss": -0.0472, "reward": 1.159771978855133, "reward_std": 0.1415103916078806, "rewards/mrr_reward": 0.22492559626698494, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.854352131485939, "step": 577 }, { "clip_ratio": 0.0, "completion_length": 449.546875, "epoch": 0.4624, "grad_norm": 0.038695842027664185, "kl": 0.008846282958984375, "learning_rate": 8.00290019485593e-06, "loss": -0.0166, "reward": 1.1117965877056122, "reward_std": 0.16431541368365288, "rewards/mrr_reward": 0.2048921138048172, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8028827607631683, "step": 578 }, { "clip_ratio": 0.0, "completion_length": 450.109375, "epoch": 0.4632, "grad_norm": 0.04034760221838951, "kl": 0.00905609130859375, "learning_rate": 7.972127046434878e-06, "loss": -0.014, "reward": 1.1206817924976349, "reward_std": 0.21675598435103893, "rewards/mrr_reward": 0.23121900483965874, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8183884024620056, "step": 579 }, { "clip_ratio": 0.0, "completion_length": 434.96875, "epoch": 0.464, "grad_norm": 0.042170315980911255, "kl": 0.010354995727539062, "learning_rate": 7.94137391230119e-06, "loss": -0.0259, "reward": 1.1011703163385391, "reward_std": 0.24708281457424164, "rewards/mrr_reward": 0.2212301604449749, "rewards/rank_answer_foramt_reward": 0.845703125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8207820504903793, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 446.421875, "epoch": 0.4648, "grad_norm": 0.03817920386791229, "kl": 0.008044242858886719, "learning_rate": 7.910641095975886e-06, "loss": 0.0116, "reward": 1.1385302245616913, "reward_std": 0.11444682255387306, "rewards/mrr_reward": 0.20651661977171898, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8516274094581604, "step": 581 }, { "clip_ratio": 0.0, "completion_length": 461.46875, "epoch": 0.4656, "grad_norm": 0.039664629846811295, "kl": 0.007870674133300781, "learning_rate": 7.879928900779457e-06, "loss": -0.0641, "reward": 1.178428053855896, "reward_std": 0.17734414339065552, "rewards/mrr_reward": 0.2622581832110882, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7762722671031952, "step": 582 }, { "clip_ratio": 0.0, "completion_length": 446.5625, "epoch": 0.4664, "grad_norm": 0.043783195316791534, "kl": 0.008320808410644531, "learning_rate": 7.84923762982887e-06, "loss": 0.0012, "reward": 1.0347242206335068, "reward_std": 0.17706408351659775, "rewards/mrr_reward": 0.13465401716530323, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8173290193080902, "step": 583 }, { "clip_ratio": 0.0, "completion_length": 483.40625, "epoch": 0.4672, "grad_norm": 0.040662799030542374, "kl": 0.008103370666503906, "learning_rate": 7.818567586034578e-06, "loss": -0.0064, "reward": 1.1952064037322998, "reward_std": 0.21328644640743732, "rewards/mrr_reward": 0.2867993488907814, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8425921499729156, "step": 584 }, { "clip_ratio": 0.0, "completion_length": 463.140625, "epoch": 0.468, "grad_norm": 0.038001302629709244, "kl": 0.00820159912109375, "learning_rate": 7.787919072097531e-06, "loss": -0.0179, "reward": 1.2342975735664368, "reward_std": 0.26813187077641487, "rewards/mrr_reward": 0.32374753057956696, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8217423409223557, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 474.84375, "epoch": 0.4688, "grad_norm": 0.038148414343595505, "kl": 0.008083343505859375, "learning_rate": 7.757292390506191e-06, "loss": -0.0098, "reward": 1.0755177438259125, "reward_std": 0.19383916724473238, "rewards/mrr_reward": 0.1725260429084301, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8359476029872894, "step": 586 }, { "clip_ratio": 0.0, "completion_length": 442.640625, "epoch": 0.4696, "grad_norm": 0.04210152477025986, "kl": 0.009115219116210938, "learning_rate": 7.726687843533539e-06, "loss": -0.0532, "reward": 1.108871042728424, "reward_std": 0.164753595367074, "rewards/mrr_reward": 0.1870349682867527, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8422707915306091, "step": 587 }, { "clip_ratio": 0.0, "completion_length": 463.953125, "epoch": 0.4704, "grad_norm": 0.04036114737391472, "kl": 0.00824737548828125, "learning_rate": 7.696105733234099e-06, "loss": -0.0084, "reward": 1.0524960458278656, "reward_std": 0.1158986147493124, "rewards/mrr_reward": 0.140625, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.819886177778244, "step": 588 }, { "clip_ratio": 0.0, "completion_length": 446.15625, "epoch": 0.4712, "grad_norm": 0.038603611290454865, "kl": 0.01171112060546875, "learning_rate": 7.66554636144095e-06, "loss": -0.0141, "reward": 1.1410838067531586, "reward_std": 0.1260488135740161, "rewards/mrr_reward": 0.22431176900863647, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8269250690937042, "step": 589 }, { "clip_ratio": 0.0, "completion_length": 474.984375, "epoch": 0.472, "grad_norm": 0.04096561297774315, "kl": 0.009233474731445312, "learning_rate": 7.635010029762755e-06, "loss": 0.0165, "reward": 1.2618905901908875, "reward_std": 0.1963303443044424, "rewards/mrr_reward": 0.337270587682724, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8507068455219269, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 487.328125, "epoch": 0.4728, "grad_norm": 0.03711889311671257, "kl": 0.009479522705078125, "learning_rate": 7.604497039580785e-06, "loss": -0.0077, "reward": 1.1018753945827484, "reward_std": 0.1098787821829319, "rewards/mrr_reward": 0.16659227386116982, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8830194175243378, "step": 591 }, { "clip_ratio": 0.0, "completion_length": 449.296875, "epoch": 0.4736, "grad_norm": 0.040535878390073776, "kl": 0.009859085083007812, "learning_rate": 7.574007692045928e-06, "loss": -0.0361, "reward": 1.1256814897060394, "reward_std": 0.16627157852053642, "rewards/mrr_reward": 0.2195622455328703, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8083157539367676, "step": 592 }, { "clip_ratio": 0.0, "completion_length": 464.390625, "epoch": 0.4744, "grad_norm": 0.04586748778820038, "kl": 0.012853622436523438, "learning_rate": 7.543542288075739e-06, "loss": 0.0211, "reward": 1.0531246066093445, "reward_std": 0.1633172556757927, "rewards/mrr_reward": 0.16449653171002865, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8353904634714127, "step": 593 }, { "clip_ratio": 0.0, "completion_length": 431.140625, "epoch": 0.4752, "grad_norm": 0.04041828587651253, "kl": 0.008060455322265625, "learning_rate": 7.513101128351454e-06, "loss": -0.0028, "reward": 1.0381239652633667, "reward_std": 0.11173910088837147, "rewards/mrr_reward": 0.1378658302128315, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8471954613924026, "step": 594 }, { "clip_ratio": 0.0, "completion_length": 453.515625, "epoch": 0.476, "grad_norm": 0.03903096914291382, "kl": 0.008426666259765625, "learning_rate": 7.482684513315031e-06, "loss": -0.0315, "reward": 1.2388408780097961, "reward_std": 0.23095028288662434, "rewards/mrr_reward": 0.34299975633621216, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7693574577569962, "step": 595 }, { "clip_ratio": 0.0, "completion_length": 448.03125, "epoch": 0.4768, "grad_norm": 0.037792522460222244, "kl": 0.008992195129394531, "learning_rate": 7.4522927431661805e-06, "loss": -0.0293, "reward": 1.1880051493644714, "reward_std": 0.13500093296170235, "rewards/mrr_reward": 0.2826884910464287, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8000244349241257, "step": 596 }, { "clip_ratio": 0.0, "completion_length": 495.265625, "epoch": 0.4776, "grad_norm": 0.03439981862902641, "kl": 0.007958412170410156, "learning_rate": 7.421926117859403e-06, "loss": -0.0139, "reward": 1.2536373734474182, "reward_std": 0.1599850282073021, "rewards/mrr_reward": 0.3125309981405735, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8655090630054474, "step": 597 }, { "clip_ratio": 0.0, "completion_length": 481.6875, "epoch": 0.4784, "grad_norm": 0.036926135420799255, "kl": 0.008787155151367188, "learning_rate": 7.391584937101034e-06, "loss": -0.0252, "reward": 1.2367544770240784, "reward_std": 0.1867055855691433, "rewards/mrr_reward": 0.32385294139385223, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8093369156122208, "step": 598 }, { "clip_ratio": 0.0, "completion_length": 474.90625, "epoch": 0.4792, "grad_norm": 0.040266942232847214, "kl": 0.007959365844726562, "learning_rate": 7.361269500346274e-06, "loss": -0.0266, "reward": 1.160109281539917, "reward_std": 0.21854684129357338, "rewards/mrr_reward": 0.23885169252753258, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8678614497184753, "step": 599 }, { "clip_ratio": 0.0, "completion_length": 420.34375, "epoch": 0.48, "grad_norm": 0.04247404634952545, "kl": 0.009807586669921875, "learning_rate": 7.330980106796247e-06, "loss": -0.0226, "reward": 1.2082679271697998, "reward_std": 0.14304961264133453, "rewards/mrr_reward": 0.28249626979231834, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.846384271979332, "step": 600 }, { "clip_ratio": 0.0, "completion_length": 448.75, "epoch": 0.4808, "grad_norm": 0.043656595051288605, "kl": 0.009093284606933594, "learning_rate": 7.300717055395039e-06, "loss": 0.0033, "reward": 1.2197607904672623, "reward_std": 0.13689721561968327, "rewards/mrr_reward": 0.311445914208889, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8149691671133041, "step": 601 }, { "clip_ratio": 0.0, "completion_length": 489.0, "epoch": 0.4816, "grad_norm": 0.035476408898830414, "kl": 0.007262229919433594, "learning_rate": 7.27048064482675e-06, "loss": -0.0232, "reward": 1.1573797166347504, "reward_std": 0.2298069056123495, "rewards/mrr_reward": 0.25371403992176056, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8223650008440018, "step": 602 }, { "clip_ratio": 0.0, "completion_length": 455.8125, "epoch": 0.4824, "grad_norm": 0.04022166132926941, "kl": 0.0081024169921875, "learning_rate": 7.240271173512545e-06, "loss": -0.0167, "reward": 1.2249160706996918, "reward_std": 0.26648064982146025, "rewards/mrr_reward": 0.31931424885988235, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8360446691513062, "step": 603 }, { "clip_ratio": 0.0, "completion_length": 460.328125, "epoch": 0.4832, "grad_norm": 0.03857105225324631, "kl": 0.008441925048828125, "learning_rate": 7.210088939607709e-06, "loss": -0.0502, "reward": 1.152012288570404, "reward_std": 0.14423850551247597, "rewards/mrr_reward": 0.24311136081814766, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8030732870101929, "step": 604 }, { "clip_ratio": 0.0, "completion_length": 459.71875, "epoch": 0.484, "grad_norm": 0.04305540770292282, "kl": 0.008272171020507812, "learning_rate": 7.179934240998707e-06, "loss": -0.0375, "reward": 1.118052452802658, "reward_std": 0.1358859408646822, "rewards/mrr_reward": 0.1941592302173376, "rewards/rank_answer_foramt_reward": 0.955078125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8445982933044434, "step": 605 }, { "clip_ratio": 0.0, "completion_length": 454.0625, "epoch": 0.4848, "grad_norm": 0.04774133116006851, "kl": 0.010158538818359375, "learning_rate": 7.149807375300239e-06, "loss": -0.0581, "reward": 1.0857676565647125, "reward_std": 0.2192548532038927, "rewards/mrr_reward": 0.19006076455116272, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8314506113529205, "step": 606 }, { "clip_ratio": 0.0, "completion_length": 449.078125, "epoch": 0.4856, "grad_norm": 0.03719841688871384, "kl": 0.009935379028320312, "learning_rate": 7.119708639852312e-06, "loss": -0.0208, "reward": 1.192948192358017, "reward_std": 0.148657638579607, "rewards/mrr_reward": 0.2748511843383312, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8465651720762253, "step": 607 }, { "clip_ratio": 0.0, "completion_length": 479.265625, "epoch": 0.4864, "grad_norm": 0.036122966557741165, "kl": 0.008540153503417969, "learning_rate": 7.0896383317172845e-06, "loss": -0.0174, "reward": 1.3368725180625916, "reward_std": 0.23823470249772072, "rewards/mrr_reward": 0.4140749163925648, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8237001299858093, "step": 608 }, { "clip_ratio": 0.0, "completion_length": 456.078125, "epoch": 0.4872, "grad_norm": 0.03762197121977806, "kl": 0.007107734680175781, "learning_rate": 7.059596747676963e-06, "loss": -0.017, "reward": 1.2796568274497986, "reward_std": 0.16884921677410603, "rewards/mrr_reward": 0.3945312574505806, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7642298191785812, "step": 609 }, { "clip_ratio": 0.0, "completion_length": 441.390625, "epoch": 0.488, "grad_norm": 0.0399199053645134, "kl": 0.008553504943847656, "learning_rate": 7.029584184229653e-06, "loss": -0.0136, "reward": 1.3700619339942932, "reward_std": 0.2029927484691143, "rewards/mrr_reward": 0.44716642796993256, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8376685678958893, "step": 610 }, { "clip_ratio": 0.0, "completion_length": 483.34375, "epoch": 0.4888, "grad_norm": 0.040925391018390656, "kl": 0.008733749389648438, "learning_rate": 6.99960093758724e-06, "loss": 0.015, "reward": 1.1895627975463867, "reward_std": 0.23921046033501625, "rewards/mrr_reward": 0.27715774811804295, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8761919885873795, "step": 611 }, { "clip_ratio": 0.0, "completion_length": 450.9375, "epoch": 0.4896, "grad_norm": 0.040661729872226715, "kl": 0.010442733764648438, "learning_rate": 6.969647303672262e-06, "loss": -0.0553, "reward": 1.24416384100914, "reward_std": 0.2634511739015579, "rewards/mrr_reward": 0.3377480283379555, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8463238179683685, "step": 612 }, { "clip_ratio": 0.0, "completion_length": 437.515625, "epoch": 0.4904, "grad_norm": 0.040657393634319305, "kl": 0.009714126586914062, "learning_rate": 6.9397235781149945e-06, "loss": -0.0143, "reward": 1.1388709545135498, "reward_std": 0.1958305425941944, "rewards/mrr_reward": 0.22525421902537346, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8310353606939316, "step": 613 }, { "clip_ratio": 0.0, "completion_length": 452.21875, "epoch": 0.4912, "grad_norm": 0.03989201411604881, "kl": 0.010036468505859375, "learning_rate": 6.909830056250527e-06, "loss": -0.0312, "reward": 1.1368741393089294, "reward_std": 0.12755226157605648, "rewards/mrr_reward": 0.2184709832072258, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7908523082733154, "step": 614 }, { "clip_ratio": 0.0, "completion_length": 463.96875, "epoch": 0.492, "grad_norm": 0.04114677384495735, "kl": 0.0092315673828125, "learning_rate": 6.879967033115853e-06, "loss": -0.0002, "reward": 1.3170638680458069, "reward_std": 0.21237649768590927, "rewards/mrr_reward": 0.3855530768632889, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.836431622505188, "step": 615 }, { "clip_ratio": 0.0, "completion_length": 455.046875, "epoch": 0.4928, "grad_norm": 0.0431981161236763, "kl": 0.009008407592773438, "learning_rate": 6.850134803446955e-06, "loss": -0.0007, "reward": 1.121709167957306, "reward_std": 0.17951259203255177, "rewards/mrr_reward": 0.19001736119389534, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8877614140510559, "step": 616 }, { "clip_ratio": 0.0, "completion_length": 462.21875, "epoch": 0.4936, "grad_norm": 0.035142622888088226, "kl": 0.007939338684082031, "learning_rate": 6.820333661675893e-06, "loss": -0.0185, "reward": 1.3334324955940247, "reward_std": 0.13348893821239471, "rewards/mrr_reward": 0.42053572088479996, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7800257503986359, "step": 617 }, { "clip_ratio": 0.0, "completion_length": 497.171875, "epoch": 0.4944, "grad_norm": 0.03749988600611687, "kl": 0.007336616516113281, "learning_rate": 6.790563901927907e-06, "loss": -0.0012, "reward": 1.1629119515419006, "reward_std": 0.23179220408201218, "rewards/mrr_reward": 0.25689484365284443, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8763656616210938, "step": 618 }, { "clip_ratio": 0.0, "completion_length": 498.59375, "epoch": 0.4952, "grad_norm": 0.044601455330848694, "kl": 0.007954597473144531, "learning_rate": 6.7608258180185085e-06, "loss": -0.0065, "reward": 1.171342521905899, "reward_std": 0.10423477459698915, "rewards/mrr_reward": 0.24530010670423508, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8335327506065369, "step": 619 }, { "clip_ratio": 0.0, "completion_length": 437.515625, "epoch": 0.496, "grad_norm": 0.04305371642112732, "kl": 0.010364532470703125, "learning_rate": 6.731119703450577e-06, "loss": -0.0619, "reward": 1.2109524309635162, "reward_std": 0.26471400633454323, "rewards/mrr_reward": 0.3258928619325161, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7933267802000046, "step": 620 }, { "clip_ratio": 0.0, "completion_length": 481.5625, "epoch": 0.4968, "grad_norm": 0.04213489592075348, "kl": 0.008825302124023438, "learning_rate": 6.701445851411472e-06, "loss": -0.0069, "reward": 1.136389434337616, "reward_std": 0.19171301275491714, "rewards/mrr_reward": 0.21873139590024948, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8784380555152893, "step": 621 }, { "clip_ratio": 0.0, "completion_length": 468.984375, "epoch": 0.4976, "grad_norm": 0.03531862050294876, "kl": 0.0073719024658203125, "learning_rate": 6.671804554770135e-06, "loss": -0.0387, "reward": 1.099222093820572, "reward_std": 0.15219473466277122, "rewards/mrr_reward": 0.18070436641573906, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8107306212186813, "step": 622 }, { "clip_ratio": 0.0, "completion_length": 443.53125, "epoch": 0.4984, "grad_norm": 0.03785242512822151, "kl": 0.009787559509277344, "learning_rate": 6.642196106074195e-06, "loss": -0.0231, "reward": 1.3554746210575104, "reward_std": 0.20645315200090408, "rewards/mrr_reward": 0.4374689795076847, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8111319541931152, "step": 623 }, { "clip_ratio": 0.0, "completion_length": 465.6875, "epoch": 0.4992, "grad_norm": 0.038349699229002, "kl": 0.00804901123046875, "learning_rate": 6.612620797547087e-06, "loss": 0.0146, "reward": 1.1561428606510162, "reward_std": 0.13428733311593533, "rewards/mrr_reward": 0.21650546044111252, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8825420886278152, "step": 624 }, { "clip_ratio": 0.0, "completion_length": 447.5, "epoch": 0.5, "grad_norm": 0.039120424538850784, "kl": 0.008810997009277344, "learning_rate": 6.583078921085167e-06, "loss": 0.0093, "reward": 1.2225628197193146, "reward_std": 0.19562321156263351, "rewards/mrr_reward": 0.3105902783572674, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8143343031406403, "step": 625 }, { "clip_ratio": 0.0, "completion_length": 467.0, "epoch": 0.5008, "grad_norm": 0.03858473151922226, "kl": 0.009241104125976562, "learning_rate": 6.553570768254831e-06, "loss": -0.0354, "reward": 1.1035878658294678, "reward_std": 0.1448194831609726, "rewards/mrr_reward": 0.18493304029107094, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8384897708892822, "step": 626 }, { "clip_ratio": 0.0, "completion_length": 452.734375, "epoch": 0.5016, "grad_norm": 0.04008089378476143, "kl": 0.009889602661132812, "learning_rate": 6.524096630289632e-06, "loss": -0.0247, "reward": 1.2089687287807465, "reward_std": 0.1602461007423699, "rewards/mrr_reward": 0.28939732909202576, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8568924069404602, "step": 627 }, { "clip_ratio": 0.0, "completion_length": 467.140625, "epoch": 0.5024, "grad_norm": 0.04044454172253609, "kl": 0.0072193145751953125, "learning_rate": 6.494656798087412e-06, "loss": -0.0287, "reward": 1.2421257197856903, "reward_std": 0.12500086054205894, "rewards/mrr_reward": 0.30170511454343796, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8575716465711594, "step": 628 }, { "clip_ratio": 0.0, "completion_length": 458.421875, "epoch": 0.5032, "grad_norm": 0.0375799760222435, "kl": 0.010451316833496094, "learning_rate": 6.465251562207431e-06, "loss": -0.0058, "reward": 1.1293274760246277, "reward_std": 0.15937496908009052, "rewards/mrr_reward": 0.20788690820336342, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8332597315311432, "step": 629 }, { "clip_ratio": 0.0, "completion_length": 471.046875, "epoch": 0.504, "grad_norm": 0.03754337131977081, "kl": 0.00838470458984375, "learning_rate": 6.435881212867494e-06, "loss": -0.0153, "reward": 1.145438402891159, "reward_std": 0.17192682810127735, "rewards/mrr_reward": 0.220145083963871, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8586065769195557, "step": 630 }, { "clip_ratio": 0.0, "completion_length": 439.125, "epoch": 0.5048, "grad_norm": 0.04335853457450867, "kl": 0.008279800415039062, "learning_rate": 6.406546039941095e-06, "loss": 0.0053, "reward": 1.215552568435669, "reward_std": 0.18727103993296623, "rewards/mrr_reward": 0.29786086082458496, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8297120332717896, "step": 631 }, { "clip_ratio": 0.0, "completion_length": 494.734375, "epoch": 0.5056, "grad_norm": 0.03874685615301132, "kl": 0.007633209228515625, "learning_rate": 6.377246332954544e-06, "loss": -0.01, "reward": 1.14056196808815, "reward_std": 0.1842045597732067, "rewards/mrr_reward": 0.2259982731193304, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8417176455259323, "step": 632 }, { "clip_ratio": 0.0, "completion_length": 482.3125, "epoch": 0.5064, "grad_norm": 0.04324718937277794, "kl": 0.008602142333984375, "learning_rate": 6.3479823810841235e-06, "loss": 0.0259, "reward": 1.1707454323768616, "reward_std": 0.20734077505767345, "rewards/mrr_reward": 0.24067460745573044, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8945683091878891, "step": 633 }, { "clip_ratio": 0.0, "completion_length": 461.453125, "epoch": 0.5072, "grad_norm": 0.03912781924009323, "kl": 0.009914398193359375, "learning_rate": 6.318754473153221e-06, "loss": 0.003, "reward": 1.152208298444748, "reward_std": 0.18202020972967148, "rewards/mrr_reward": 0.22909846529364586, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8793335855007172, "step": 634 }, { "clip_ratio": 0.0, "completion_length": 435.96875, "epoch": 0.508, "grad_norm": 0.03814735263586044, "kl": 0.009555816650390625, "learning_rate": 6.289562897629492e-06, "loss": -0.004, "reward": 1.2428525686264038, "reward_std": 0.10947381239384413, "rewards/mrr_reward": 0.3264322876930237, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7907028645277023, "step": 635 }, { "clip_ratio": 0.0, "completion_length": 454.8125, "epoch": 0.5088, "grad_norm": 0.04343482851982117, "kl": 0.008977890014648438, "learning_rate": 6.260407942621998e-06, "loss": 0.0267, "reward": 1.2960951030254364, "reward_std": 0.1967066526412964, "rewards/mrr_reward": 0.36600323021411896, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8887726813554764, "step": 636 }, { "clip_ratio": 0.0, "completion_length": 465.671875, "epoch": 0.5096, "grad_norm": 0.04106997326016426, "kl": 0.008977890014648438, "learning_rate": 6.231289895878375e-06, "loss": -0.0182, "reward": 1.2052837014198303, "reward_std": 0.17037715390324593, "rewards/mrr_reward": 0.2953869067132473, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8275755047798157, "step": 637 }, { "clip_ratio": 0.0, "completion_length": 477.484375, "epoch": 0.5104, "grad_norm": 0.03704027459025383, "kl": 0.009426116943359375, "learning_rate": 6.202209044781991e-06, "loss": -0.0061, "reward": 1.1235398948192596, "reward_std": 0.13441728707402945, "rewards/mrr_reward": 0.20093626528978348, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8328777849674225, "step": 638 }, { "clip_ratio": 0.0, "completion_length": 470.046875, "epoch": 0.5112, "grad_norm": 0.04103616252541542, "kl": 0.0093231201171875, "learning_rate": 6.173165676349103e-06, "loss": -0.0435, "reward": 1.1044175028800964, "reward_std": 0.17622700706124306, "rewards/mrr_reward": 0.19592014327645302, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.837006464600563, "step": 639 }, { "clip_ratio": 0.0, "completion_length": 473.984375, "epoch": 0.512, "grad_norm": 0.05446719005703926, "kl": 0.008823394775390625, "learning_rate": 6.144160077226035e-06, "loss": 0.0136, "reward": 1.2099950313568115, "reward_std": 0.19803481549024582, "rewards/mrr_reward": 0.28655754402279854, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.86860790848732, "step": 640 }, { "clip_ratio": 0.0, "completion_length": 462.46875, "epoch": 0.5128, "grad_norm": 0.041340798139572144, "kl": 0.009428024291992188, "learning_rate": 6.115192533686341e-06, "loss": -0.0448, "reward": 1.1741358637809753, "reward_std": 0.1973903514444828, "rewards/mrr_reward": 0.28611112385988235, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7749683260917664, "step": 641 }, { "clip_ratio": 0.0, "completion_length": 454.375, "epoch": 0.5136, "grad_norm": 0.037386320531368256, "kl": 0.009598731994628906, "learning_rate": 6.086263331627976e-06, "loss": -0.04, "reward": 1.1221703886985779, "reward_std": 0.21877944841980934, "rewards/mrr_reward": 0.22787078097462654, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7998424768447876, "step": 642 }, { "clip_ratio": 0.0, "completion_length": 478.875, "epoch": 0.5144, "grad_norm": 0.04460527002811432, "kl": 0.00873565673828125, "learning_rate": 6.05737275657049e-06, "loss": -0.025, "reward": 1.1142739951610565, "reward_std": 0.1813185941427946, "rewards/mrr_reward": 0.21093130111694336, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8155269175767899, "step": 643 }, { "clip_ratio": 0.0, "completion_length": 448.515625, "epoch": 0.5152, "grad_norm": 0.03829963132739067, "kl": 0.008128166198730469, "learning_rate": 6.028521093652195e-06, "loss": -0.0117, "reward": 1.140984207391739, "reward_std": 0.1788063794374466, "rewards/mrr_reward": 0.22751735523343086, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8032376319169998, "step": 644 }, { "clip_ratio": 0.0, "completion_length": 461.171875, "epoch": 0.516, "grad_norm": 0.040134068578481674, "kl": 0.011653900146484375, "learning_rate": 5.9997086276273545e-06, "loss": -0.0714, "reward": 1.1092098951339722, "reward_std": 0.15996074490249157, "rewards/mrr_reward": 0.19925596192479134, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8218892067670822, "step": 645 }, { "clip_ratio": 0.0, "completion_length": 446.796875, "epoch": 0.5168, "grad_norm": 0.043228864669799805, "kl": 0.012115478515625, "learning_rate": 5.970935642863375e-06, "loss": -0.041, "reward": 1.1475808322429657, "reward_std": 0.12733451835811138, "rewards/mrr_reward": 0.2023809589445591, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.885726198554039, "step": 646 }, { "clip_ratio": 0.0, "completion_length": 477.453125, "epoch": 0.5176, "grad_norm": 0.03866947069764137, "kl": 0.008696556091308594, "learning_rate": 5.942202423338001e-06, "loss": -0.0142, "reward": 1.1517845392227173, "reward_std": 0.2236475944519043, "rewards/mrr_reward": 0.26093750819563866, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7561768889427185, "step": 647 }, { "clip_ratio": 0.0, "completion_length": 465.234375, "epoch": 0.5184, "grad_norm": 0.038130611181259155, "kl": 0.0076313018798828125, "learning_rate": 5.913509252636511e-06, "loss": 0.0073, "reward": 1.214305818080902, "reward_std": 0.22642408311367035, "rewards/mrr_reward": 0.27718254178762436, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8807831108570099, "step": 648 }, { "clip_ratio": 0.0, "completion_length": 464.390625, "epoch": 0.5192, "grad_norm": 0.04136860743165016, "kl": 0.0090179443359375, "learning_rate": 5.884856413948913e-06, "loss": 0.0167, "reward": 1.1337697207927704, "reward_std": 0.1099173566326499, "rewards/mrr_reward": 0.1892671175301075, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8836133182048798, "step": 649 }, { "clip_ratio": 0.0, "completion_length": 481.046875, "epoch": 0.52, "grad_norm": 0.040968600660562515, "kl": 0.009687423706054688, "learning_rate": 5.85624419006716e-06, "loss": 0.0129, "reward": 1.14986552298069, "reward_std": 0.23535252176225185, "rewards/mrr_reward": 0.23812004178762436, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8820055425167084, "step": 650 }, { "clip_ratio": 0.0, "completion_length": 471.578125, "epoch": 0.5208, "grad_norm": 0.04172006994485855, "kl": 0.008195877075195312, "learning_rate": 5.8276728633823494e-06, "loss": 0.0156, "reward": 1.0877700746059418, "reward_std": 0.1796468086540699, "rewards/mrr_reward": 0.18993056192994118, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8183818161487579, "step": 651 }, { "clip_ratio": 0.0, "completion_length": 445.4375, "epoch": 0.5216, "grad_norm": 0.04257863759994507, "kl": 0.009547233581542969, "learning_rate": 5.799142715881938e-06, "loss": -0.0619, "reward": 1.0521957874298096, "reward_std": 0.22515824437141418, "rewards/mrr_reward": 0.17905506119132042, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.768927738070488, "step": 652 }, { "clip_ratio": 0.0, "completion_length": 459.34375, "epoch": 0.5224, "grad_norm": 0.040138933807611465, "kl": 0.009622573852539062, "learning_rate": 5.770654029146969e-06, "loss": 0.0044, "reward": 1.1919938325881958, "reward_std": 0.18187857419252396, "rewards/mrr_reward": 0.2777653820812702, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8250766545534134, "step": 653 }, { "clip_ratio": 0.0, "completion_length": 458.8125, "epoch": 0.5232, "grad_norm": 0.043729230761528015, "kl": 0.009851455688476562, "learning_rate": 5.742207084349274e-06, "loss": -0.0332, "reward": 1.179715782403946, "reward_std": 0.22262733057141304, "rewards/mrr_reward": 0.2926153317093849, "rewards/rank_answer_foramt_reward": 0.857421875, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8541986495256424, "step": 654 }, { "clip_ratio": 0.0, "completion_length": 484.390625, "epoch": 0.524, "grad_norm": 0.0415637381374836, "kl": 0.008465766906738281, "learning_rate": 5.713802162248718e-06, "loss": 0.0018, "reward": 1.2584901452064514, "reward_std": 0.11682092864066362, "rewards/mrr_reward": 0.3142175190150738, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8907290101051331, "step": 655 }, { "clip_ratio": 0.0, "completion_length": 464.875, "epoch": 0.5248, "grad_norm": 0.0408024825155735, "kl": 0.00860595703125, "learning_rate": 5.685439543190409e-06, "loss": -0.0104, "reward": 1.355711817741394, "reward_std": 0.22719109896570444, "rewards/mrr_reward": 0.44937377236783504, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7738226503133774, "step": 656 }, { "clip_ratio": 0.0, "completion_length": 466.328125, "epoch": 0.5256, "grad_norm": 0.04255751892924309, "kl": 0.009609222412109375, "learning_rate": 5.657119507101955e-06, "loss": 0.0158, "reward": 1.1442281603813171, "reward_std": 0.19760075956583023, "rewards/mrr_reward": 0.22868303954601288, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.842738464474678, "step": 657 }, { "clip_ratio": 0.0, "completion_length": 462.96875, "epoch": 0.5264, "grad_norm": 0.03842722252011299, "kl": 0.009244918823242188, "learning_rate": 5.628842333490674e-06, "loss": -0.0173, "reward": 1.1806485652923584, "reward_std": 0.14790054596960545, "rewards/mrr_reward": 0.24701761454343796, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8428566604852676, "step": 658 }, { "clip_ratio": 0.0, "completion_length": 455.046875, "epoch": 0.5272, "grad_norm": 0.04238361120223999, "kl": 0.0117950439453125, "learning_rate": 5.600608301440848e-06, "loss": -0.004, "reward": 1.201371192932129, "reward_std": 0.2469993531703949, "rewards/mrr_reward": 0.30833955481648445, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.831156387925148, "step": 659 }, { "clip_ratio": 0.0, "completion_length": 439.0625, "epoch": 0.528, "grad_norm": 0.055894963443279266, "kl": 0.009580612182617188, "learning_rate": 5.572417689610987e-06, "loss": -0.0304, "reward": 1.1189578771591187, "reward_std": 0.18920623883605003, "rewards/mrr_reward": 0.21641245111823082, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.803345337510109, "step": 660 }, { "clip_ratio": 0.0, "completion_length": 435.34375, "epoch": 0.5288, "grad_norm": 0.042352769523859024, "kl": 0.010715484619140625, "learning_rate": 5.544270776231038e-06, "loss": 0.0025, "reward": 1.1625032126903534, "reward_std": 0.19885125942528248, "rewards/mrr_reward": 0.26426712423563004, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8000523597002029, "step": 661 }, { "clip_ratio": 0.0, "completion_length": 456.859375, "epoch": 0.5296, "grad_norm": 0.03811383992433548, "kl": 0.008238792419433594, "learning_rate": 5.516167839099679e-06, "loss": -0.017, "reward": 1.3036585450172424, "reward_std": 0.1993249226361513, "rewards/mrr_reward": 0.3752976208925247, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.869855523109436, "step": 662 }, { "clip_ratio": 0.0, "completion_length": 446.65625, "epoch": 0.5304, "grad_norm": 0.042755480855703354, "kl": 0.011791229248046875, "learning_rate": 5.488109155581549e-06, "loss": 0.0111, "reward": 1.2058091461658478, "reward_std": 0.21377132274210453, "rewards/mrr_reward": 0.2779761999845505, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8467710912227631, "step": 663 }, { "clip_ratio": 0.0, "completion_length": 464.65625, "epoch": 0.5312, "grad_norm": 0.039466727524995804, "kl": 0.008899688720703125, "learning_rate": 5.460095002604533e-06, "loss": -0.0181, "reward": 1.189045011997223, "reward_std": 0.128420518245548, "rewards/mrr_reward": 0.26372147910296917, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8040104508399963, "step": 664 }, { "clip_ratio": 0.0, "completion_length": 467.546875, "epoch": 0.532, "grad_norm": 0.04358460754156113, "kl": 0.008716583251953125, "learning_rate": 5.432125656657004e-06, "loss": -0.0043, "reward": 1.1679949164390564, "reward_std": 0.18363160640001297, "rewards/mrr_reward": 0.2324280794709921, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8760666251182556, "step": 665 }, { "clip_ratio": 0.0, "completion_length": 439.328125, "epoch": 0.5328, "grad_norm": 0.044500019401311874, "kl": 0.009143829345703125, "learning_rate": 5.404201393785123e-06, "loss": -0.0744, "reward": 1.1327187716960907, "reward_std": 0.16905860230326653, "rewards/mrr_reward": 0.20547495409846306, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8371732831001282, "step": 666 }, { "clip_ratio": 0.0, "completion_length": 452.140625, "epoch": 0.5336, "grad_norm": 0.04244980216026306, "kl": 0.012605667114257812, "learning_rate": 5.376322489590085e-06, "loss": -0.0043, "reward": 1.1357988119125366, "reward_std": 0.16269230097532272, "rewards/mrr_reward": 0.23528027534484863, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.78548464179039, "step": 667 }, { "clip_ratio": 0.0, "completion_length": 479.359375, "epoch": 0.5344, "grad_norm": 0.04275548830628395, "kl": 0.008051872253417969, "learning_rate": 5.348489219225417e-06, "loss": 0.0322, "reward": 1.3845745623111725, "reward_std": 0.21123279444873333, "rewards/mrr_reward": 0.4523933492600918, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8658071458339691, "step": 668 }, { "clip_ratio": 0.0, "completion_length": 419.8125, "epoch": 0.5352, "grad_norm": 0.045763902366161346, "kl": 0.01207733154296875, "learning_rate": 5.3207018573942684e-06, "loss": -0.0302, "reward": 1.1802934408187866, "reward_std": 0.2740876227617264, "rewards/mrr_reward": 0.3093688115477562, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.7524467706680298, "step": 669 }, { "clip_ratio": 0.0, "completion_length": 484.5, "epoch": 0.536, "grad_norm": 0.04125389829277992, "kl": 0.009202957153320312, "learning_rate": 5.292960678346674e-06, "loss": 0.0166, "reward": 1.2574660181999207, "reward_std": 0.16352756042033434, "rewards/mrr_reward": 0.31458334624767303, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.894329383969307, "step": 670 }, { "clip_ratio": 0.0, "completion_length": 462.40625, "epoch": 0.5368, "grad_norm": 0.03898369520902634, "kl": 0.009569168090820312, "learning_rate": 5.2652659558768795e-06, "loss": -0.0111, "reward": 1.1695980429649353, "reward_std": 0.18792172148823738, "rewards/mrr_reward": 0.24601934850215912, "rewards/rank_answer_foramt_reward": 0.955078125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8514575362205505, "step": 671 }, { "clip_ratio": 0.0, "completion_length": 447.640625, "epoch": 0.5376, "grad_norm": 0.04127902537584305, "kl": 0.00872039794921875, "learning_rate": 5.237617963320608e-06, "loss": 0.0196, "reward": 1.2061417400836945, "reward_std": 0.2502102144062519, "rewards/mrr_reward": 0.28403398394584656, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8294219672679901, "step": 672 }, { "clip_ratio": 0.0, "completion_length": 458.109375, "epoch": 0.5384, "grad_norm": 0.03780240938067436, "kl": 0.009091377258300781, "learning_rate": 5.2100169735523906e-06, "loss": -0.0262, "reward": 1.200999915599823, "reward_std": 0.11425712890923023, "rewards/mrr_reward": 0.28924231603741646, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7765735238790512, "step": 673 }, { "clip_ratio": 0.0, "completion_length": 451.125, "epoch": 0.5392, "grad_norm": 0.04221047833561897, "kl": 0.01013946533203125, "learning_rate": 5.1824632589828465e-06, "loss": 0.0056, "reward": 1.0810420513153076, "reward_std": 0.15975480899214745, "rewards/mrr_reward": 0.17140376567840576, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9609375, "rewards/rank_think_format_reward": 0.8639013320207596, "step": 674 }, { "clip_ratio": 0.0, "completion_length": 451.640625, "epoch": 0.54, "grad_norm": 0.04080146923661232, "kl": 0.0093536376953125, "learning_rate": 5.154957091556021e-06, "loss": -0.0236, "reward": 1.0876788198947906, "reward_std": 0.17044154927134514, "rewards/mrr_reward": 0.17075273394584656, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8625483214855194, "step": 675 }, { "clip_ratio": 0.0, "completion_length": 428.921875, "epoch": 0.5408, "grad_norm": 0.039513736963272095, "kl": 0.008271217346191406, "learning_rate": 5.127498742746675e-06, "loss": 0.0053, "reward": 1.283906728029251, "reward_std": 0.22036350145936012, "rewards/mrr_reward": 0.3861917220056057, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7848014086484909, "step": 676 }, { "clip_ratio": 0.0, "completion_length": 461.671875, "epoch": 0.5416, "grad_norm": 0.04135413467884064, "kl": 0.008291244506835938, "learning_rate": 5.100088483557635e-06, "loss": -0.0174, "reward": 1.1332715302705765, "reward_std": 0.21648670360445976, "rewards/mrr_reward": 0.2384672686457634, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8345748037099838, "step": 677 }, { "clip_ratio": 0.0, "completion_length": 451.6875, "epoch": 0.5424, "grad_norm": 0.045175570994615555, "kl": 0.00896453857421875, "learning_rate": 5.072726584517086e-06, "loss": -0.0161, "reward": 1.0853496938943863, "reward_std": 0.20591749995946884, "rewards/mrr_reward": 0.19198289886116982, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.832172155380249, "step": 678 }, { "clip_ratio": 0.0, "completion_length": 467.796875, "epoch": 0.5432, "grad_norm": 0.03891496732831001, "kl": 0.007755279541015625, "learning_rate": 5.045413315675925e-06, "loss": -0.0063, "reward": 1.2105883359909058, "reward_std": 0.10096981842070818, "rewards/mrr_reward": 0.2832031324505806, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8102580308914185, "step": 679 }, { "clip_ratio": 0.0, "completion_length": 466.4375, "epoch": 0.544, "grad_norm": 0.03714423254132271, "kl": 0.008653640747070312, "learning_rate": 5.018148946605092e-06, "loss": -0.0243, "reward": 1.1117302775382996, "reward_std": 0.13797536864876747, "rewards/mrr_reward": 0.19155506789684296, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8372378200292587, "step": 680 }, { "clip_ratio": 0.0, "completion_length": 469.703125, "epoch": 0.5448, "grad_norm": 0.04121573269367218, "kl": 0.0098114013671875, "learning_rate": 4.9909337463929e-06, "loss": -0.0248, "reward": 1.0979313254356384, "reward_std": 0.14380038622766733, "rewards/mrr_reward": 0.17858382873237133, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8210577219724655, "step": 681 }, { "clip_ratio": 0.0, "completion_length": 451.140625, "epoch": 0.5456, "grad_norm": 0.04169347882270813, "kl": 0.011077880859375, "learning_rate": 4.9637679836423926e-06, "loss": -0.0243, "reward": 1.349208414554596, "reward_std": 0.14660646300762892, "rewards/mrr_reward": 0.4205109141767025, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8279066383838654, "step": 682 }, { "clip_ratio": 0.0, "completion_length": 462.53125, "epoch": 0.5464, "grad_norm": 0.039740189909935, "kl": 0.009710311889648438, "learning_rate": 4.936651926468673e-06, "loss": -0.0384, "reward": 1.0722960233688354, "reward_std": 0.1104632755741477, "rewards/mrr_reward": 0.16297122836112976, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.792638972401619, "step": 683 }, { "clip_ratio": 0.0, "completion_length": 463.609375, "epoch": 0.5472, "grad_norm": 0.04026377573609352, "kl": 0.008731842041015625, "learning_rate": 4.909585842496287e-06, "loss": -0.0021, "reward": 1.1576222777366638, "reward_std": 0.14097119309008121, "rewards/mrr_reward": 0.2248635906726122, "rewards/rank_answer_foramt_reward": 0.984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8421664088964462, "step": 684 }, { "clip_ratio": 0.0, "completion_length": 462.25, "epoch": 0.548, "grad_norm": 0.039616670459508896, "kl": 0.009366989135742188, "learning_rate": 4.882569998856549e-06, "loss": -0.018, "reward": 1.2478437423706055, "reward_std": 0.24337460100650787, "rewards/mrr_reward": 0.34682539850473404, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7928584069013596, "step": 685 }, { "clip_ratio": 0.0, "completion_length": 438.984375, "epoch": 0.5488, "grad_norm": 0.04094656556844711, "kl": 0.011662483215332031, "learning_rate": 4.855604662184935e-06, "loss": -0.0047, "reward": 1.0998060703277588, "reward_std": 0.1794710010290146, "rewards/mrr_reward": 0.23454860970377922, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7040233463048935, "step": 686 }, { "clip_ratio": 0.0, "completion_length": 448.640625, "epoch": 0.5496, "grad_norm": 0.04387884959578514, "kl": 0.01041412353515625, "learning_rate": 4.828690098618429e-06, "loss": -0.0005, "reward": 1.158925324678421, "reward_std": 0.23121210932731628, "rewards/mrr_reward": 0.2560763880610466, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8355152010917664, "step": 687 }, { "clip_ratio": 0.0, "completion_length": 436.953125, "epoch": 0.5504, "grad_norm": 0.18276724219322205, "kl": 0.04758262634277344, "learning_rate": 4.801826573792905e-06, "loss": -0.0495, "reward": 1.299856573343277, "reward_std": 0.10169149003922939, "rewards/mrr_reward": 0.3726624511182308, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8448353558778763, "step": 688 }, { "clip_ratio": 0.0, "completion_length": 472.3125, "epoch": 0.5512, "grad_norm": 0.035259928554296494, "kl": 0.008893013000488281, "learning_rate": 4.775014352840512e-06, "loss": -0.006, "reward": 1.169858604669571, "reward_std": 0.1330111986026168, "rewards/mrr_reward": 0.23686135932803154, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8409361243247986, "step": 689 }, { "clip_ratio": 0.0, "completion_length": 460.328125, "epoch": 0.552, "grad_norm": 0.036417923867702484, "kl": 0.007439613342285156, "learning_rate": 4.7482537003870425e-06, "loss": -0.0064, "reward": 1.116908699274063, "reward_std": 0.09379957616329193, "rewards/mrr_reward": 0.1877418179064989, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8430007994174957, "step": 690 }, { "clip_ratio": 0.0, "completion_length": 462.984375, "epoch": 0.5528, "grad_norm": 0.04526480659842491, "kl": 0.007946968078613281, "learning_rate": 4.721544880549337e-06, "loss": -0.0073, "reward": 1.3033201396465302, "reward_std": 0.2789052985608578, "rewards/mrr_reward": 0.38737599924206734, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.879103884100914, "step": 691 }, { "clip_ratio": 0.0, "completion_length": 464.171875, "epoch": 0.5536, "grad_norm": 0.04109110310673714, "kl": 0.00826263427734375, "learning_rate": 4.694888156932657e-06, "loss": -0.0182, "reward": 1.118406057357788, "reward_std": 0.16681309789419174, "rewards/mrr_reward": 0.20881697162985802, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8344555050134659, "step": 692 }, { "clip_ratio": 0.0, "completion_length": 467.109375, "epoch": 0.5544, "grad_norm": 0.037574220448732376, "kl": 0.009721755981445312, "learning_rate": 4.668283792628114e-06, "loss": 0.0062, "reward": 1.2277896106243134, "reward_std": 0.1619633361697197, "rewards/mrr_reward": 0.30946800857782364, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8355270475149155, "step": 693 }, { "clip_ratio": 0.0, "completion_length": 476.53125, "epoch": 0.5552, "grad_norm": 0.03914060443639755, "kl": 0.00797271728515625, "learning_rate": 4.641732050210032e-06, "loss": -0.0052, "reward": 1.2452102601528168, "reward_std": 0.2378566674888134, "rewards/mrr_reward": 0.3098648376762867, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8910206854343414, "step": 694 }, { "clip_ratio": 0.0, "completion_length": 450.03125, "epoch": 0.556, "grad_norm": 0.042982131242752075, "kl": 0.008350372314453125, "learning_rate": 4.6152331917333985e-06, "loss": 0.0454, "reward": 1.203509271144867, "reward_std": 0.11863551568239927, "rewards/mrr_reward": 0.2798239067196846, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8478744924068451, "step": 695 }, { "clip_ratio": 0.0, "completion_length": 494.53125, "epoch": 0.5568, "grad_norm": 0.036636147648096085, "kl": 0.010402679443359375, "learning_rate": 4.588787478731242e-06, "loss": -0.0137, "reward": 1.102398157119751, "reward_std": 0.20872093737125397, "rewards/mrr_reward": 0.1981026753783226, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8066954910755157, "step": 696 }, { "clip_ratio": 0.0, "completion_length": 463.609375, "epoch": 0.5576, "grad_norm": 0.04322382062673569, "kl": 0.007706642150878906, "learning_rate": 4.562395172212074e-06, "loss": 0.0143, "reward": 1.1307188272476196, "reward_std": 0.13696488551795483, "rewards/mrr_reward": 0.19231151044368744, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8944397121667862, "step": 697 }, { "clip_ratio": 0.0, "completion_length": 455.3125, "epoch": 0.5584, "grad_norm": 0.03792423754930496, "kl": 0.008268356323242188, "learning_rate": 4.53605653265731e-06, "loss": -0.0294, "reward": 1.1437303125858307, "reward_std": 0.16462960094213486, "rewards/mrr_reward": 0.23955854214727879, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8043674677610397, "step": 698 }, { "clip_ratio": 0.0, "completion_length": 463.0, "epoch": 0.5592, "grad_norm": 0.040437955409288406, "kl": 0.00787353515625, "learning_rate": 4.509771820018682e-06, "loss": 0.0165, "reward": 1.2735324501991272, "reward_std": 0.24658278934657574, "rewards/mrr_reward": 0.34225572273135185, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8767381012439728, "step": 699 }, { "clip_ratio": 0.0, "completion_length": 457.015625, "epoch": 0.56, "grad_norm": 0.039811957627534866, "kl": 0.009557723999023438, "learning_rate": 4.483541293715699e-06, "loss": 0.0038, "reward": 1.2039062678813934, "reward_std": 0.0991826388053596, "rewards/mrr_reward": 0.26781994476914406, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8717812895774841, "step": 700 }, { "clip_ratio": 0.0, "completion_length": 447.4375, "epoch": 0.5608, "grad_norm": 0.04115857928991318, "kl": 0.009393692016601562, "learning_rate": 4.457365212633058e-06, "loss": -0.0331, "reward": 1.0938493311405182, "reward_std": 0.12323368107900023, "rewards/mrr_reward": 0.17080233432352543, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8244557082653046, "step": 701 }, { "clip_ratio": 0.0, "completion_length": 457.375, "epoch": 0.5616, "grad_norm": 0.03895072266459465, "kl": 0.009398460388183594, "learning_rate": 4.4312438351181246e-06, "loss": 0.0034, "reward": 1.1528900265693665, "reward_std": 0.14880603551864624, "rewards/mrr_reward": 0.231932045891881, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8454690128564835, "step": 702 }, { "clip_ratio": 0.0, "completion_length": 460.40625, "epoch": 0.5624, "grad_norm": 0.039955999702215195, "kl": 0.0096435546875, "learning_rate": 4.405177418978331e-06, "loss": -0.0002, "reward": 1.150672048330307, "reward_std": 0.09315567277371883, "rewards/mrr_reward": 0.2145027294754982, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.872032955288887, "step": 703 }, { "clip_ratio": 0.0, "completion_length": 455.734375, "epoch": 0.5632, "grad_norm": 0.039580851793289185, "kl": 0.00884246826171875, "learning_rate": 4.379166221478697e-06, "loss": -0.0308, "reward": 1.1858681738376617, "reward_std": 0.07365155033767223, "rewards/mrr_reward": 0.26101810671389103, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8162476569414139, "step": 704 }, { "clip_ratio": 0.0, "completion_length": 464.46875, "epoch": 0.564, "grad_norm": 0.0397285558283329, "kl": 0.008618354797363281, "learning_rate": 4.353210499339231e-06, "loss": -0.0254, "reward": 1.1983648836612701, "reward_std": 0.17565249279141426, "rewards/mrr_reward": 0.25363964214921, "rewards/rank_answer_foramt_reward": 0.984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8862412869930267, "step": 705 }, { "clip_ratio": 0.0, "completion_length": 467.375, "epoch": 0.5648, "grad_norm": 0.038892053067684174, "kl": 0.008718490600585938, "learning_rate": 4.3273105087324375e-06, "loss": -0.0375, "reward": 1.2153197675943375, "reward_std": 0.2709946185350418, "rewards/mrr_reward": 0.33776042610406876, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7862237989902496, "step": 706 }, { "clip_ratio": 0.0, "completion_length": 462.875, "epoch": 0.5656, "grad_norm": 0.039017390459775925, "kl": 0.008515357971191406, "learning_rate": 4.301466505280763e-06, "loss": -0.0007, "reward": 1.138901799917221, "reward_std": 0.14515507780015469, "rewards/mrr_reward": 0.20719866082072258, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8526394814252853, "step": 707 }, { "clip_ratio": 0.0, "completion_length": 449.28125, "epoch": 0.5664, "grad_norm": 0.03923118859529495, "kl": 0.009359359741210938, "learning_rate": 4.275678744054094e-06, "loss": 0.0008, "reward": 1.1316049695014954, "reward_std": 0.1249928786419332, "rewards/mrr_reward": 0.20212053880095482, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8439630717039108, "step": 708 }, { "clip_ratio": 0.0, "completion_length": 441.453125, "epoch": 0.5672, "grad_norm": 0.04064570739865303, "kl": 0.009347915649414062, "learning_rate": 4.249947479567218e-06, "loss": -0.0284, "reward": 1.1335679292678833, "reward_std": 0.12174471095204353, "rewards/mrr_reward": 0.24037698283791542, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7613266557455063, "step": 709 }, { "clip_ratio": 0.0, "completion_length": 429.578125, "epoch": 0.568, "grad_norm": 0.0448368564248085, "kl": 0.011203765869140625, "learning_rate": 4.224272965777326e-06, "loss": 0.0051, "reward": 1.1839303076267242, "reward_std": 0.22850197553634644, "rewards/mrr_reward": 0.2795758992433548, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.843983381986618, "step": 710 }, { "clip_ratio": 0.0, "completion_length": 458.96875, "epoch": 0.5688, "grad_norm": 0.03913170099258423, "kl": 0.009366035461425781, "learning_rate": 4.1986554560815095e-06, "loss": -0.0368, "reward": 1.1309897899627686, "reward_std": 0.11143102683126926, "rewards/mrr_reward": 0.22149058431386948, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8185580670833588, "step": 711 }, { "clip_ratio": 0.0, "completion_length": 433.046875, "epoch": 0.5696, "grad_norm": 0.03881422057747841, "kl": 0.009281158447265625, "learning_rate": 4.173095203314241e-06, "loss": -0.0238, "reward": 1.0363554060459137, "reward_std": 0.12483402527868748, "rewards/mrr_reward": 0.1456535290926695, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7772215157747269, "step": 712 }, { "clip_ratio": 0.0, "completion_length": 454.859375, "epoch": 0.5704, "grad_norm": 0.04099239408969879, "kl": 0.008396148681640625, "learning_rate": 4.1475924597449025e-06, "loss": -0.0137, "reward": 1.1610506176948547, "reward_std": 0.22206255793571472, "rewards/mrr_reward": 0.259381216019392, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8241281360387802, "step": 713 }, { "clip_ratio": 0.0, "completion_length": 444.984375, "epoch": 0.5712, "grad_norm": 0.05151242017745972, "kl": 0.011735916137695312, "learning_rate": 4.12214747707527e-06, "loss": -0.009, "reward": 1.273858591914177, "reward_std": 0.23207318596541882, "rewards/mrr_reward": 0.37640749476850033, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8406425565481186, "step": 714 }, { "clip_ratio": 0.0, "completion_length": 438.3125, "epoch": 0.572, "grad_norm": 0.03908145800232887, "kl": 0.008737564086914062, "learning_rate": 4.096760506437057e-06, "loss": -0.0372, "reward": 1.262620210647583, "reward_std": 0.19197899289429188, "rewards/mrr_reward": 0.33578869700431824, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8300645500421524, "step": 715 }, { "clip_ratio": 0.0, "completion_length": 489.90625, "epoch": 0.5728, "grad_norm": 0.03428515046834946, "kl": 0.007670402526855469, "learning_rate": 4.071431798389408e-06, "loss": -0.0106, "reward": 1.1059209406375885, "reward_std": 0.17448805645108223, "rewards/mrr_reward": 0.18847966939210892, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8074686527252197, "step": 716 }, { "clip_ratio": 0.0, "completion_length": 424.265625, "epoch": 0.5736, "grad_norm": 0.04790130630135536, "kl": 0.009938240051269531, "learning_rate": 4.046161602916453e-06, "loss": -0.0354, "reward": 1.0969393253326416, "reward_std": 0.16678292863070965, "rewards/mrr_reward": 0.20392486453056335, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7627447545528412, "step": 717 }, { "clip_ratio": 0.0, "completion_length": 453.375, "epoch": 0.5744, "grad_norm": 0.043300457298755646, "kl": 0.012233734130859375, "learning_rate": 4.020950169424815e-06, "loss": -0.0072, "reward": 1.088065654039383, "reward_std": 0.1905173622071743, "rewards/mrr_reward": 0.18054936081171036, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8145024180412292, "step": 718 }, { "clip_ratio": 0.0, "completion_length": 486.21875, "epoch": 0.5752, "grad_norm": 0.03566781058907509, "kl": 0.007656097412109375, "learning_rate": 3.9957977467411615e-06, "loss": -0.0198, "reward": 1.3303083777427673, "reward_std": 0.1124997977167368, "rewards/mrr_reward": 0.3919084668159485, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8514484316110611, "step": 719 }, { "clip_ratio": 0.0, "completion_length": 453.0, "epoch": 0.576, "grad_norm": 0.03936339169740677, "kl": 0.007256507873535156, "learning_rate": 3.970704583109755e-06, "loss": -0.0184, "reward": 1.2179461419582367, "reward_std": 0.19140197336673737, "rewards/mrr_reward": 0.29061879590153694, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8452388942241669, "step": 720 }, { "clip_ratio": 0.0, "completion_length": 482.640625, "epoch": 0.5768, "grad_norm": 0.040819380432367325, "kl": 0.008708953857421875, "learning_rate": 3.945670926189987e-06, "loss": -0.0032, "reward": 1.363356113433838, "reward_std": 0.29930396378040314, "rewards/mrr_reward": 0.4476066455245018, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8511701226234436, "step": 721 }, { "clip_ratio": 0.0, "completion_length": 452.296875, "epoch": 0.5776, "grad_norm": 0.03900689631700516, "kl": 0.009435653686523438, "learning_rate": 3.920697023053949e-06, "loss": -0.0039, "reward": 1.103367805480957, "reward_std": 0.08733582124114037, "rewards/mrr_reward": 0.169921875, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8520613461732864, "step": 722 }, { "clip_ratio": 0.0, "completion_length": 454.796875, "epoch": 0.5784, "grad_norm": 0.04251600429415703, "kl": 0.007720947265625, "learning_rate": 3.895783120183975e-06, "loss": 0.0037, "reward": 1.167445570230484, "reward_std": 0.14570345729589462, "rewards/mrr_reward": 0.2338479682803154, "rewards/rank_answer_foramt_reward": 0.953125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8759585320949554, "step": 723 }, { "clip_ratio": 0.0, "completion_length": 465.875, "epoch": 0.5792, "grad_norm": 0.038369178771972656, "kl": 0.00865936279296875, "learning_rate": 3.8709294634702374e-06, "loss": 0.0008, "reward": 1.1614885032176971, "reward_std": 0.1151402248069644, "rewards/mrr_reward": 0.22297247499227524, "rewards/rank_answer_foramt_reward": 0.984375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.875237762928009, "step": 724 }, { "clip_ratio": 0.0, "completion_length": 446.75, "epoch": 0.58, "grad_norm": 0.04544536769390106, "kl": 0.009899139404296875, "learning_rate": 3.846136298208285e-06, "loss": -0.0184, "reward": 1.1653823256492615, "reward_std": 0.3885572552680969, "rewards/mrr_reward": 0.3096292167901993, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7826442420482635, "step": 725 }, { "clip_ratio": 0.0, "completion_length": 476.375, "epoch": 0.5808, "grad_norm": 0.04791020229458809, "kl": 0.007744789123535156, "learning_rate": 3.821403869096658e-06, "loss": -0.0082, "reward": 1.1828205287456512, "reward_std": 0.23169685155153275, "rewards/mrr_reward": 0.2780692037194967, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8256549388170242, "step": 726 }, { "clip_ratio": 0.0, "completion_length": 450.078125, "epoch": 0.5816, "grad_norm": 0.04281622916460037, "kl": 0.009065628051757812, "learning_rate": 3.7967324202344433e-06, "loss": -0.0319, "reward": 1.0675395727157593, "reward_std": 0.16223911754786968, "rewards/mrr_reward": 0.153949661180377, "rewards/rank_answer_foramt_reward": 0.94140625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8270478695631027, "step": 727 }, { "clip_ratio": 0.0, "completion_length": 470.640625, "epoch": 0.5824, "grad_norm": 0.03857819736003876, "kl": 0.007999420166015625, "learning_rate": 3.772122195118877e-06, "loss": -0.0071, "reward": 1.100866436958313, "reward_std": 0.09526841063052416, "rewards/mrr_reward": 0.1559399850666523, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8770851939916611, "step": 728 }, { "clip_ratio": 0.0, "completion_length": 456.203125, "epoch": 0.5832, "grad_norm": 0.04351005703210831, "kl": 0.00836944580078125, "learning_rate": 3.747573436642952e-06, "loss": 0.0079, "reward": 1.080918699502945, "reward_std": 0.07267094915732741, "rewards/mrr_reward": 0.13717758283019066, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.887165293097496, "step": 729 }, { "clip_ratio": 0.0, "completion_length": 497.921875, "epoch": 0.584, "grad_norm": 0.03764002025127411, "kl": 0.007304191589355469, "learning_rate": 3.723086387092997e-06, "loss": 0.0091, "reward": 1.173829346895218, "reward_std": 0.16954005137085915, "rewards/mrr_reward": 0.23038814216852188, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8862564116716385, "step": 730 }, { "clip_ratio": 0.0, "completion_length": 437.609375, "epoch": 0.5848, "grad_norm": 0.04356111213564873, "kl": 0.011903762817382812, "learning_rate": 3.6986612881463114e-06, "loss": -0.0154, "reward": 1.092040315270424, "reward_std": 0.22640430741012096, "rewards/mrr_reward": 0.2199590802192688, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7559514939785004, "step": 731 }, { "clip_ratio": 0.0, "completion_length": 439.296875, "epoch": 0.5856, "grad_norm": 0.04148973152041435, "kl": 0.008152008056640625, "learning_rate": 3.674298380868756e-06, "loss": -0.0105, "reward": 1.0901039838790894, "reward_std": 0.1490055676549673, "rewards/mrr_reward": 0.1813926100730896, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.82398322224617, "step": 732 }, { "clip_ratio": 0.0, "completion_length": 456.015625, "epoch": 0.5864, "grad_norm": 0.04152887687087059, "kl": 0.009691238403320312, "learning_rate": 3.649997905712396e-06, "loss": -0.0123, "reward": 1.1922140419483185, "reward_std": 0.1852929126471281, "rewards/mrr_reward": 0.2508494630455971, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8799635022878647, "step": 733 }, { "clip_ratio": 0.0, "completion_length": 458.015625, "epoch": 0.5872, "grad_norm": 0.03897524252533913, "kl": 0.010408401489257812, "learning_rate": 3.625760102513103e-06, "loss": -0.0037, "reward": 1.1527060270309448, "reward_std": 0.18498503416776657, "rewards/mrr_reward": 0.23213045671582222, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8599352687597275, "step": 734 }, { "clip_ratio": 0.0, "completion_length": 459.640625, "epoch": 0.588, "grad_norm": 0.0379464253783226, "kl": 0.008158683776855469, "learning_rate": 3.601585210488218e-06, "loss": -0.0026, "reward": 1.1971299052238464, "reward_std": 0.06598696298897266, "rewards/mrr_reward": 0.2490451280027628, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.9022810012102127, "step": 735 }, { "clip_ratio": 0.0, "completion_length": 449.484375, "epoch": 0.5888, "grad_norm": 0.19841361045837402, "kl": 0.03487586975097656, "learning_rate": 3.5774734682341563e-06, "loss": -0.0105, "reward": 1.2799700498580933, "reward_std": 0.21030810847878456, "rewards/mrr_reward": 0.36318204924464226, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8640826940536499, "step": 736 }, { "clip_ratio": 0.0, "completion_length": 477.125, "epoch": 0.5896, "grad_norm": 0.04086919128894806, "kl": 0.008722305297851562, "learning_rate": 3.5534251137240883e-06, "loss": -0.0162, "reward": 1.1706961393356323, "reward_std": 0.26971735805273056, "rewards/mrr_reward": 0.25814731419086456, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8746743649244308, "step": 737 }, { "clip_ratio": 0.0, "completion_length": 456.046875, "epoch": 0.5904, "grad_norm": 0.04114146903157234, "kl": 0.008604049682617188, "learning_rate": 3.5294403843055604e-06, "loss": 0.0248, "reward": 1.1644734144210815, "reward_std": 0.1057867594063282, "rewards/mrr_reward": 0.2209201343357563, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.9080804586410522, "step": 738 }, { "clip_ratio": 0.0, "completion_length": 458.828125, "epoch": 0.5912, "grad_norm": 0.04274991899728775, "kl": 0.009336471557617188, "learning_rate": 3.505519516698165e-06, "loss": -0.0115, "reward": 1.101497322320938, "reward_std": 0.1800498366355896, "rewards/mrr_reward": 0.18911830335855484, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8565816581249237, "step": 739 }, { "clip_ratio": 0.0, "completion_length": 456.671875, "epoch": 0.592, "grad_norm": 0.041656360030174255, "kl": 0.007590293884277344, "learning_rate": 3.4816627469912147e-06, "loss": -0.0137, "reward": 1.1390126645565033, "reward_std": 0.14937850926071405, "rewards/mrr_reward": 0.21043526753783226, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8548864126205444, "step": 740 }, { "clip_ratio": 0.0, "completion_length": 441.5, "epoch": 0.5928, "grad_norm": 0.0403144545853138, "kl": 0.009930610656738281, "learning_rate": 3.4578703106413903e-06, "loss": -0.032, "reward": 1.118723675608635, "reward_std": 0.194508895277977, "rewards/mrr_reward": 0.21798115223646164, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8096006661653519, "step": 741 }, { "clip_ratio": 0.0, "completion_length": 465.796875, "epoch": 0.5936, "grad_norm": 0.03830445185303688, "kl": 0.0073337554931640625, "learning_rate": 3.4341424424704373e-06, "loss": 0.0001, "reward": 1.1513462364673615, "reward_std": 0.14330180920660496, "rewards/mrr_reward": 0.24214409850537777, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8450015187263489, "step": 742 }, { "clip_ratio": 0.0, "completion_length": 460.875, "epoch": 0.5944, "grad_norm": 0.04006378352642059, "kl": 0.0078029632568359375, "learning_rate": 3.4104793766628307e-06, "loss": -0.0074, "reward": 1.165648490190506, "reward_std": 0.19168706238269806, "rewards/mrr_reward": 0.2604600749909878, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8699481040239334, "step": 743 }, { "clip_ratio": 0.0, "completion_length": 456.71875, "epoch": 0.5952, "grad_norm": 0.043256212025880814, "kl": 0.009433746337890625, "learning_rate": 3.3868813467634833e-06, "loss": -0.0223, "reward": 1.1679503321647644, "reward_std": 0.2485765404999256, "rewards/mrr_reward": 0.3030258007347584, "rewards/rank_answer_foramt_reward": 0.849609375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.78699891269207, "step": 744 }, { "clip_ratio": 0.0, "completion_length": 449.796875, "epoch": 0.596, "grad_norm": 0.04080253094434738, "kl": 0.008932113647460938, "learning_rate": 3.3633485856754143e-06, "loss": -0.005, "reward": 1.1107763051986694, "reward_std": 0.13455253094434738, "rewards/mrr_reward": 0.1878348346799612, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8378077745437622, "step": 745 }, { "clip_ratio": 0.0, "completion_length": 441.21875, "epoch": 0.5968, "grad_norm": 0.04848048463463783, "kl": 0.00908660888671875, "learning_rate": 3.3398813256574847e-06, "loss": -0.0434, "reward": 1.212068885564804, "reward_std": 0.2859114482998848, "rewards/mrr_reward": 0.3225632533431053, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7872682809829712, "step": 746 }, { "clip_ratio": 0.0, "completion_length": 463.984375, "epoch": 0.5976, "grad_norm": 0.038503892719745636, "kl": 0.008440017700195312, "learning_rate": 3.316479798322072e-06, "loss": -0.0188, "reward": 1.080864131450653, "reward_std": 0.11961814388632774, "rewards/mrr_reward": 0.1450644824653864, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8709126263856888, "step": 747 }, { "clip_ratio": 0.0, "completion_length": 438.59375, "epoch": 0.5984, "grad_norm": 0.04509103670716286, "kl": 0.009431838989257812, "learning_rate": 3.2931442346328e-06, "loss": 0.0045, "reward": 1.1249631643295288, "reward_std": 0.18062920682132244, "rewards/mrr_reward": 0.2126116044819355, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8193891793489456, "step": 748 }, { "clip_ratio": 0.0, "completion_length": 442.359375, "epoch": 0.5992, "grad_norm": 0.03829497843980789, "kl": 0.009325027465820312, "learning_rate": 3.2698748649022693e-06, "loss": -0.031, "reward": 1.3105561435222626, "reward_std": 0.24571607820689678, "rewards/mrr_reward": 0.4127728193998337, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7752430289983749, "step": 749 }, { "clip_ratio": 0.0, "completion_length": 430.796875, "epoch": 0.6, "grad_norm": 0.0403105802834034, "kl": 0.008958816528320312, "learning_rate": 3.2466719187897555e-06, "loss": -0.0318, "reward": 1.3205139636993408, "reward_std": 0.30093929544091225, "rewards/mrr_reward": 0.4177951477468014, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8058240860700607, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 442.578125, "epoch": 0.6008, "grad_norm": 0.04085452854633331, "kl": 0.008657455444335938, "learning_rate": 3.223535625298979e-06, "loss": 0.0034, "reward": 1.2626567482948303, "reward_std": 0.163003820925951, "rewards/mrr_reward": 0.35236856341362, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8014176934957504, "step": 751 }, { "clip_ratio": 0.0, "completion_length": 447.0, "epoch": 0.6016, "grad_norm": 0.04556753858923912, "kl": 0.01016998291015625, "learning_rate": 3.200466212775808e-06, "loss": 0.0427, "reward": 1.1225496679544449, "reward_std": 0.1800556741654873, "rewards/mrr_reward": 0.20434647798538208, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8664180636405945, "step": 752 }, { "clip_ratio": 0.0, "completion_length": 464.28125, "epoch": 0.6024, "grad_norm": 0.03804047778248787, "kl": 0.0096435546875, "learning_rate": 3.1774639089060364e-06, "loss": -0.017, "reward": 1.097492665052414, "reward_std": 0.14176994934678078, "rewards/mrr_reward": 0.1837735567241907, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8235331773757935, "step": 753 }, { "clip_ratio": 0.0, "completion_length": 456.125, "epoch": 0.6032, "grad_norm": 0.04436762258410454, "kl": 0.008335113525390625, "learning_rate": 3.1545289407131128e-06, "loss": -0.03, "reward": 1.258377730846405, "reward_std": 0.2980855964124203, "rewards/mrr_reward": 0.34546130523085594, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8640695363283157, "step": 754 }, { "clip_ratio": 0.0, "completion_length": 437.09375, "epoch": 0.604, "grad_norm": 0.04474704712629318, "kl": 0.01195526123046875, "learning_rate": 3.1316615345559188e-06, "loss": -0.0335, "reward": 1.1586785316467285, "reward_std": 0.217840775847435, "rewards/mrr_reward": 0.26437871530652046, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.817421168088913, "step": 755 }, { "clip_ratio": 0.0, "completion_length": 455.09375, "epoch": 0.6048, "grad_norm": 0.0435645654797554, "kl": 0.008356094360351562, "learning_rate": 3.108861916126518e-06, "loss": -0.0303, "reward": 1.2183335721492767, "reward_std": 0.18061750754714012, "rewards/mrr_reward": 0.3272135518491268, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7784887701272964, "step": 756 }, { "clip_ratio": 0.0, "completion_length": 450.734375, "epoch": 0.6056, "grad_norm": 0.041368041187524796, "kl": 0.009997367858886719, "learning_rate": 3.086130310447937e-06, "loss": -0.0175, "reward": 1.0250304490327835, "reward_std": 0.17961598467081785, "rewards/mrr_reward": 0.14366319216787815, "rewards/rank_answer_foramt_reward": 0.822265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8563565909862518, "step": 757 }, { "clip_ratio": 0.0, "completion_length": 467.0, "epoch": 0.6064, "grad_norm": 0.04095418006181717, "kl": 0.0072650909423828125, "learning_rate": 3.063466941871952e-06, "loss": -0.0173, "reward": 1.1504963636398315, "reward_std": 0.12690221052616835, "rewards/mrr_reward": 0.2360739130526781, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7983209490776062, "step": 758 }, { "clip_ratio": 0.0, "completion_length": 444.90625, "epoch": 0.6072, "grad_norm": 0.04432576522231102, "kl": 0.007794380187988281, "learning_rate": 3.040872034076857e-06, "loss": 0.0107, "reward": 1.1860106885433197, "reward_std": 0.13013709522783756, "rewards/mrr_reward": 0.2523561418056488, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8780841082334518, "step": 759 }, { "clip_ratio": 0.0, "completion_length": 456.203125, "epoch": 0.608, "grad_norm": 0.039851389825344086, "kl": 0.008897781372070312, "learning_rate": 3.0183458100652752e-06, "loss": -0.0125, "reward": 1.1152794659137726, "reward_std": 0.19540764205157757, "rewards/mrr_reward": 0.19703001156449318, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8724178522825241, "step": 760 }, { "clip_ratio": 0.0, "completion_length": 449.921875, "epoch": 0.6088, "grad_norm": 0.040054626762866974, "kl": 0.010467529296875, "learning_rate": 2.9958884921619368e-06, "loss": -0.0206, "reward": 1.1841309070587158, "reward_std": 0.16952326335012913, "rewards/mrr_reward": 0.2834821455180645, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8073635548353195, "step": 761 }, { "clip_ratio": 0.0, "completion_length": 449.390625, "epoch": 0.6096, "grad_norm": 0.043995246291160583, "kl": 0.01117706298828125, "learning_rate": 2.9735003020115095e-06, "loss": 0.005, "reward": 1.2519162595272064, "reward_std": 0.33760569244623184, "rewards/mrr_reward": 0.3831969276070595, "rewards/rank_answer_foramt_reward": 0.806640625, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8492796123027802, "step": 762 }, { "clip_ratio": 0.0, "completion_length": 478.546875, "epoch": 0.6104, "grad_norm": 0.04608379304409027, "kl": 0.009157180786132812, "learning_rate": 2.9511814605763855e-06, "loss": 0.0333, "reward": 1.1024558991193771, "reward_std": 0.21346063539385796, "rewards/mrr_reward": 0.21845859102904797, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.7666701674461365, "step": 763 }, { "clip_ratio": 0.0, "completion_length": 450.421875, "epoch": 0.6112, "grad_norm": 0.04478094354271889, "kl": 0.00982666015625, "learning_rate": 2.9289321881345257e-06, "loss": -0.0129, "reward": 1.0492956936359406, "reward_std": 0.13610859028995037, "rewards/mrr_reward": 0.1452070940285921, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.802162379026413, "step": 764 }, { "clip_ratio": 0.0, "completion_length": 456.203125, "epoch": 0.612, "grad_norm": 0.05722177028656006, "kl": 0.015047073364257812, "learning_rate": 2.9067527042772638e-06, "loss": -0.0044, "reward": 1.134572982788086, "reward_std": 0.20083492621779442, "rewards/mrr_reward": 0.2158978171646595, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8541764914989471, "step": 765 }, { "clip_ratio": 0.0, "completion_length": 430.390625, "epoch": 0.6128, "grad_norm": 0.0441712960600853, "kl": 0.011053085327148438, "learning_rate": 2.884643227907147e-06, "loss": -0.0469, "reward": 1.0365698337554932, "reward_std": 0.14963462762534618, "rewards/mrr_reward": 0.12454117089509964, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8457542955875397, "step": 766 }, { "clip_ratio": 0.0, "completion_length": 446.1875, "epoch": 0.6136, "grad_norm": 0.03963298723101616, "kl": 0.008359909057617188, "learning_rate": 2.8626039772357884e-06, "loss": 0.0217, "reward": 1.2510823607444763, "reward_std": 0.14392628520727158, "rewards/mrr_reward": 0.3217137847095728, "rewards/rank_answer_foramt_reward": 0.984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8397057503461838, "step": 767 }, { "clip_ratio": 0.0, "completion_length": 464.84375, "epoch": 0.6144, "grad_norm": 0.04018845781683922, "kl": 0.007063865661621094, "learning_rate": 2.840635169781688e-06, "loss": 0.004, "reward": 1.1201017796993256, "reward_std": 0.1913389079272747, "rewards/mrr_reward": 0.21623263508081436, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8347004055976868, "step": 768 }, { "clip_ratio": 0.0, "completion_length": 490.125, "epoch": 0.6152, "grad_norm": 0.040367186069488525, "kl": 0.008176803588867188, "learning_rate": 2.8187370223681134e-06, "loss": 0.0112, "reward": 1.131416529417038, "reward_std": 0.19026605784893036, "rewards/mrr_reward": 0.21690849214792252, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8493613749742508, "step": 769 }, { "clip_ratio": 0.0, "completion_length": 495.46875, "epoch": 0.616, "grad_norm": 0.04241256043314934, "kl": 0.008129119873046875, "learning_rate": 2.796909751120931e-06, "loss": 0.0415, "reward": 1.2324725687503815, "reward_std": 0.2082219198346138, "rewards/mrr_reward": 0.28964534401893616, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8707241117954254, "step": 770 }, { "clip_ratio": 0.0, "completion_length": 461.234375, "epoch": 0.6168, "grad_norm": 0.039128441363573074, "kl": 0.007460594177246094, "learning_rate": 2.7751535714665025e-06, "loss": 0.0041, "reward": 1.225624144077301, "reward_std": 0.14080518763512373, "rewards/mrr_reward": 0.296521570533514, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8564777374267578, "step": 771 }, { "clip_ratio": 0.0, "completion_length": 460.59375, "epoch": 0.6176, "grad_norm": 0.03881870210170746, "kl": 0.008975982666015625, "learning_rate": 2.7534686981295335e-06, "loss": -0.0376, "reward": 1.0894058048725128, "reward_std": 0.08322741370648146, "rewards/mrr_reward": 0.15754588693380356, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.845302164554596, "step": 772 }, { "clip_ratio": 0.0, "completion_length": 467.140625, "epoch": 0.6184, "grad_norm": 0.04274870082736015, "kl": 0.008192062377929688, "learning_rate": 2.7318553451309726e-06, "loss": 0.0279, "reward": 1.1450502276420593, "reward_std": 0.13760012108832598, "rewards/mrr_reward": 0.20704985596239567, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8639096468687057, "step": 773 }, { "clip_ratio": 0.0, "completion_length": 444.703125, "epoch": 0.6192, "grad_norm": 0.041456714272499084, "kl": 0.010091781616210938, "learning_rate": 2.7103137257858867e-06, "loss": -0.0241, "reward": 1.1570966690778732, "reward_std": 0.2309955172240734, "rewards/mrr_reward": 0.2663938533514738, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.7850367873907089, "step": 774 }, { "clip_ratio": 0.0, "completion_length": 432.28125, "epoch": 0.62, "grad_norm": 0.03964508697390556, "kl": 0.00922393798828125, "learning_rate": 2.6888440527013595e-06, "loss": -0.018, "reward": 1.3120769262313843, "reward_std": 0.2254694253206253, "rewards/mrr_reward": 0.39821428060531616, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8044368922710419, "step": 775 }, { "clip_ratio": 0.0, "completion_length": 460.28125, "epoch": 0.6208, "grad_norm": 0.03911551460623741, "kl": 0.008837699890136719, "learning_rate": 2.667446537774402e-06, "loss": -0.0125, "reward": 1.1185062527656555, "reward_std": 0.17317464342340827, "rewards/mrr_reward": 0.2100074477493763, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.854589119553566, "step": 776 }, { "clip_ratio": 0.0, "completion_length": 476.609375, "epoch": 0.6216, "grad_norm": 0.03766229748725891, "kl": 0.007938385009765625, "learning_rate": 2.646121392189841e-06, "loss": -0.0097, "reward": 1.310092717409134, "reward_std": 0.1681712232530117, "rewards/mrr_reward": 0.36098089441657066, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8760963976383209, "step": 777 }, { "clip_ratio": 0.0, "completion_length": 440.25, "epoch": 0.6224, "grad_norm": 0.03969811275601387, "kl": 0.008672714233398438, "learning_rate": 2.624868826418262e-06, "loss": -0.0219, "reward": 1.174126923084259, "reward_std": 0.22464322298765182, "rewards/mrr_reward": 0.2639323025941849, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8753529489040375, "step": 778 }, { "clip_ratio": 0.0, "completion_length": 465.4375, "epoch": 0.6232, "grad_norm": 0.04108897224068642, "kl": 0.008535385131835938, "learning_rate": 2.603689050213902e-06, "loss": 0.0067, "reward": 1.0609664767980576, "reward_std": 0.15589328296482563, "rewards/mrr_reward": 0.156274801120162, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8508649319410324, "step": 779 }, { "clip_ratio": 0.0, "completion_length": 445.1875, "epoch": 0.624, "grad_norm": 0.04128960147500038, "kl": 0.009108543395996094, "learning_rate": 2.5825822726126095e-06, "loss": -0.0168, "reward": 1.2373025119304657, "reward_std": 0.21737964265048504, "rewards/mrr_reward": 0.3343130014836788, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8320348858833313, "step": 780 }, { "clip_ratio": 0.0, "completion_length": 436.640625, "epoch": 0.6248, "grad_norm": 0.04342431575059891, "kl": 0.010232925415039062, "learning_rate": 2.561548701929749e-06, "loss": -0.0274, "reward": 1.1886743009090424, "reward_std": 0.1972416564822197, "rewards/mrr_reward": 0.27752356603741646, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8470001071691513, "step": 781 }, { "clip_ratio": 0.0, "completion_length": 448.578125, "epoch": 0.6256, "grad_norm": 0.04006101191043854, "kl": 0.010242462158203125, "learning_rate": 2.5405885457581793e-06, "loss": -0.0326, "reward": 1.24961519241333, "reward_std": 0.2031826265156269, "rewards/mrr_reward": 0.3277033753693104, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8639844059944153, "step": 782 }, { "clip_ratio": 0.0, "completion_length": 439.75, "epoch": 0.6264, "grad_norm": 0.040887244045734406, "kl": 0.00983428955078125, "learning_rate": 2.5197020109661775e-06, "loss": -0.0045, "reward": 1.0167758166790009, "reward_std": 0.1663502948358655, "rewards/mrr_reward": 0.12349330447614193, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8182446956634521, "step": 783 }, { "clip_ratio": 0.0, "completion_length": 467.375, "epoch": 0.6272, "grad_norm": 0.047615982592105865, "kl": 0.010096549987792969, "learning_rate": 2.4988893036954045e-06, "loss": 0.0403, "reward": 1.1557523906230927, "reward_std": 0.13815788738429546, "rewards/mrr_reward": 0.22870784625411034, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8795382529497147, "step": 784 }, { "clip_ratio": 0.0, "completion_length": 442.484375, "epoch": 0.628, "grad_norm": 0.04078188166022301, "kl": 0.008787155151367188, "learning_rate": 2.4781506293588876e-06, "loss": -0.0127, "reward": 1.0591696500778198, "reward_std": 0.15905814059078693, "rewards/mrr_reward": 0.14029017835855484, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8743270188570023, "step": 785 }, { "clip_ratio": 0.0, "completion_length": 433.828125, "epoch": 0.6288, "grad_norm": 0.0395292192697525, "kl": 0.009163856506347656, "learning_rate": 2.4574861926389615e-06, "loss": 0.0017, "reward": 1.1320269703865051, "reward_std": 0.11425493052229285, "rewards/mrr_reward": 0.1963045708835125, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.862866073846817, "step": 786 }, { "clip_ratio": 0.0, "completion_length": 455.953125, "epoch": 0.6296, "grad_norm": 0.050643905997276306, "kl": 0.00799560546875, "learning_rate": 2.436896197485282e-06, "loss": -0.0253, "reward": 1.1131080090999603, "reward_std": 0.17427906021475792, "rewards/mrr_reward": 0.19312996231019497, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8698435872793198, "step": 787 }, { "clip_ratio": 0.0, "completion_length": 430.484375, "epoch": 0.6304, "grad_norm": 0.04102494940161705, "kl": 0.00933074951171875, "learning_rate": 2.4163808471127815e-06, "loss": -0.0149, "reward": 1.060589388012886, "reward_std": 0.14299902319908142, "rewards/mrr_reward": 0.18007192388176918, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.7268284261226654, "step": 788 }, { "clip_ratio": 0.0, "completion_length": 449.515625, "epoch": 0.6312, "grad_norm": 0.04130149260163307, "kl": 0.008101463317871094, "learning_rate": 2.395940343999691e-06, "loss": -0.0096, "reward": 1.1629635095596313, "reward_std": 0.1361176036298275, "rewards/mrr_reward": 0.24042658135294914, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8717381805181503, "step": 789 }, { "clip_ratio": 0.0, "completion_length": 473.515625, "epoch": 0.632, "grad_norm": 0.04310329630970955, "kl": 0.0070438385009765625, "learning_rate": 2.37557488988552e-06, "loss": 0.0486, "reward": 1.1800865232944489, "reward_std": 0.16778289526700974, "rewards/mrr_reward": 0.24763764813542366, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8392744958400726, "step": 790 }, { "clip_ratio": 0.0, "completion_length": 454.71875, "epoch": 0.6328, "grad_norm": 0.22937795519828796, "kl": 0.05006980895996094, "learning_rate": 2.3552846857690847e-06, "loss": -0.0004, "reward": 1.1795984208583832, "reward_std": 0.18312821350991726, "rewards/mrr_reward": 0.26884301006793976, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8458021879196167, "step": 791 }, { "clip_ratio": 0.0, "completion_length": 449.046875, "epoch": 0.6336, "grad_norm": 0.04103982821106911, "kl": 0.0069122314453125, "learning_rate": 2.335069931906503e-06, "loss": -0.0263, "reward": 1.0953315794467926, "reward_std": 0.09945772588253021, "rewards/mrr_reward": 0.15505332499742508, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.862999752163887, "step": 792 }, { "clip_ratio": 0.0, "completion_length": 477.0625, "epoch": 0.6344, "grad_norm": 0.036689817905426025, "kl": 0.0096435546875, "learning_rate": 2.3149308278092343e-06, "loss": -0.0214, "reward": 1.1218132674694061, "reward_std": 0.20247724652290344, "rewards/mrr_reward": 0.22500619292259216, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8074406385421753, "step": 793 }, { "clip_ratio": 0.0, "completion_length": 440.4375, "epoch": 0.6352, "grad_norm": 0.039789628237485886, "kl": 0.007559776306152344, "learning_rate": 2.2948675722421086e-06, "loss": -0.026, "reward": 1.1090701520442963, "reward_std": 0.21167061291635036, "rewards/mrr_reward": 0.22198661416769028, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7564912438392639, "step": 794 }, { "clip_ratio": 0.0, "completion_length": 438.609375, "epoch": 0.636, "grad_norm": 0.03997084125876427, "kl": 0.008802413940429688, "learning_rate": 2.2748803632213556e-06, "loss": -0.0237, "reward": 1.1740768551826477, "reward_std": 0.14025119692087173, "rewards/mrr_reward": 0.24523189291357994, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8420254588127136, "step": 795 }, { "clip_ratio": 0.0, "completion_length": 458.984375, "epoch": 0.6368, "grad_norm": 0.04768791422247887, "kl": 0.01158905029296875, "learning_rate": 2.254969398012663e-06, "loss": 0.0419, "reward": 1.158288836479187, "reward_std": 0.16161417961120605, "rewards/mrr_reward": 0.241412453353405, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8604445457458496, "step": 796 }, { "clip_ratio": 0.0, "completion_length": 458.140625, "epoch": 0.6376, "grad_norm": 0.038609255105257034, "kl": 0.007890701293945312, "learning_rate": 2.2351348731292134e-06, "loss": -0.0108, "reward": 1.0548525154590607, "reward_std": 0.22406835108995438, "rewards/mrr_reward": 0.17250124365091324, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8026978820562363, "step": 797 }, { "clip_ratio": 0.0, "completion_length": 481.953125, "epoch": 0.6384, "grad_norm": 0.042043354362249374, "kl": 0.008205413818359375, "learning_rate": 2.215376984329767e-06, "loss": 0.0081, "reward": 1.1629877984523773, "reward_std": 0.14389540813863277, "rewards/mrr_reward": 0.22026289999485016, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8977576941251755, "step": 798 }, { "clip_ratio": 0.0, "completion_length": 470.8125, "epoch": 0.6392, "grad_norm": 0.04208019748330116, "kl": 0.0068149566650390625, "learning_rate": 2.195695926616702e-06, "loss": 0.0019, "reward": 1.1885976493358612, "reward_std": 0.19713237322866917, "rewards/mrr_reward": 0.254371277987957, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.9013012945652008, "step": 799 }, { "clip_ratio": 0.0, "completion_length": 415.109375, "epoch": 0.64, "grad_norm": 0.04990691691637039, "kl": 0.0174713134765625, "learning_rate": 2.1760918942341193e-06, "loss": -0.0155, "reward": 1.2124728560447693, "reward_std": 0.20276020467281342, "rewards/mrr_reward": 0.34215650893747807, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7349783107638359, "step": 800 }, { "clip_ratio": 0.0, "completion_length": 438.203125, "epoch": 0.6408, "grad_norm": 0.045392923057079315, "kl": 0.007555961608886719, "learning_rate": 2.1565650806658977e-06, "loss": -0.0492, "reward": 1.1203849911689758, "reward_std": 0.15124214440584183, "rewards/mrr_reward": 0.2108196932822466, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7972739636898041, "step": 801 }, { "clip_ratio": 0.0, "completion_length": 465.96875, "epoch": 0.6416, "grad_norm": 0.0389019213616848, "kl": 0.008981704711914062, "learning_rate": 2.1371156786338108e-06, "loss": -0.0184, "reward": 1.1323182135820389, "reward_std": 0.20558064430952072, "rewards/mrr_reward": 0.2269345335662365, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8471024334430695, "step": 802 }, { "clip_ratio": 0.0, "completion_length": 446.984375, "epoch": 0.6424, "grad_norm": 0.042265504598617554, "kl": 0.007932662963867188, "learning_rate": 2.117743880095601e-06, "loss": -0.0239, "reward": 1.2599590122699738, "reward_std": 0.2774098366498947, "rewards/mrr_reward": 0.35645462572574615, "rewards/rank_answer_foramt_reward": 0.927734375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8335951864719391, "step": 803 }, { "clip_ratio": 0.0, "completion_length": 468.375, "epoch": 0.6432, "grad_norm": 0.057718675583601, "kl": 0.01120758056640625, "learning_rate": 2.098449876243096e-06, "loss": 0.0413, "reward": 1.4104988873004913, "reward_std": 0.2141698058694601, "rewards/mrr_reward": 0.5004154220223427, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8144690990447998, "step": 804 }, { "clip_ratio": 0.0, "completion_length": 445.75, "epoch": 0.644, "grad_norm": 0.04124637320637703, "kl": 0.010328292846679688, "learning_rate": 2.0792338575003303e-06, "loss": -0.0279, "reward": 1.0802222043275833, "reward_std": 0.219939723610878, "rewards/mrr_reward": 0.20515252836048603, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7806323915719986, "step": 805 }, { "clip_ratio": 0.0, "completion_length": 457.265625, "epoch": 0.6448, "grad_norm": 0.04474443942308426, "kl": 0.008333206176757812, "learning_rate": 2.0600960135216463e-06, "loss": -0.015, "reward": 1.1868183612823486, "reward_std": 0.1631349828094244, "rewards/mrr_reward": 0.2634982690215111, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8662989884614944, "step": 806 }, { "clip_ratio": 0.0, "completion_length": 446.796875, "epoch": 0.6456, "grad_norm": 0.03788938745856285, "kl": 0.0070400238037109375, "learning_rate": 2.041036533189842e-06, "loss": -0.0001, "reward": 1.2256155908107758, "reward_std": 0.18802989460527897, "rewards/mrr_reward": 0.29671378806233406, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8441506326198578, "step": 807 }, { "clip_ratio": 0.0, "completion_length": 449.46875, "epoch": 0.6464, "grad_norm": 0.03982880339026451, "kl": 0.007785797119140625, "learning_rate": 2.022055604614289e-06, "loss": 0.007, "reward": 1.1301990747451782, "reward_std": 0.16357433795928955, "rewards/mrr_reward": 0.2091703936457634, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8534960001707077, "step": 808 }, { "clip_ratio": 0.0, "completion_length": 431.59375, "epoch": 0.6472, "grad_norm": 0.0403585247695446, "kl": 0.009008407592773438, "learning_rate": 2.0031534151290944e-06, "loss": 0.0021, "reward": 1.1528845131397247, "reward_std": 0.1795931002125144, "rewards/mrr_reward": 0.23609252274036407, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8406573981046677, "step": 809 }, { "clip_ratio": 0.0, "completion_length": 427.125, "epoch": 0.648, "grad_norm": 0.04252356290817261, "kl": 0.0116729736328125, "learning_rate": 1.984330151291233e-06, "loss": -0.0035, "reward": 1.1698086261749268, "reward_std": 0.22232595458626747, "rewards/mrr_reward": 0.2710999473929405, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7878125905990601, "step": 810 }, { "clip_ratio": 0.0, "completion_length": 466.40625, "epoch": 0.6488, "grad_norm": 0.04095650836825371, "kl": 0.0076580047607421875, "learning_rate": 1.965585998878724e-06, "loss": 0.0121, "reward": 1.2161291241645813, "reward_std": 0.26305726869031787, "rewards/mrr_reward": 0.3006200324743986, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8777853697538376, "step": 811 }, { "clip_ratio": 0.0, "completion_length": 445.703125, "epoch": 0.6496, "grad_norm": 0.04271842539310455, "kl": 0.008337020874023438, "learning_rate": 1.9469211428887813e-06, "loss": -0.0074, "reward": 1.3187182247638702, "reward_std": 0.18177231401205063, "rewards/mrr_reward": 0.3931051604449749, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8693408221006393, "step": 812 }, { "clip_ratio": 0.0, "completion_length": 440.859375, "epoch": 0.6504, "grad_norm": 0.04430151358246803, "kl": 0.009272575378417969, "learning_rate": 1.928335767535997e-06, "loss": -0.054, "reward": 1.2358072102069855, "reward_std": 0.18769995868206024, "rewards/mrr_reward": 0.3260292708873749, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8135431855916977, "step": 813 }, { "clip_ratio": 0.0, "completion_length": 449.84375, "epoch": 0.6512, "grad_norm": 0.037271056324243546, "kl": 0.0101318359375, "learning_rate": 1.9098300562505266e-06, "loss": -0.0295, "reward": 1.2297299802303314, "reward_std": 0.15305910632014275, "rewards/mrr_reward": 0.2948102727532387, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8604336082935333, "step": 814 }, { "clip_ratio": 0.0, "completion_length": 420.609375, "epoch": 0.652, "grad_norm": 0.04709629714488983, "kl": 0.009583473205566406, "learning_rate": 1.8914041916762648e-06, "loss": -0.0617, "reward": 1.0883388370275497, "reward_std": 0.1659074891358614, "rewards/mrr_reward": 0.21894222125411034, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7243789285421371, "step": 815 }, { "clip_ratio": 0.0, "completion_length": 438.09375, "epoch": 0.6528, "grad_norm": 0.041499897837638855, "kl": 0.0078277587890625, "learning_rate": 1.8730583556690607e-06, "loss": -0.0036, "reward": 1.0809594094753265, "reward_std": 0.1182434605434537, "rewards/mrr_reward": 0.15505952574312687, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8682572394609451, "step": 816 }, { "clip_ratio": 0.0, "completion_length": 441.953125, "epoch": 0.6536, "grad_norm": 0.041228439658880234, "kl": 0.00791168212890625, "learning_rate": 1.8547927292949053e-06, "loss": -0.0025, "reward": 1.2632147371768951, "reward_std": 0.2752368990331888, "rewards/mrr_reward": 0.35054564103484154, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8437888473272324, "step": 817 }, { "clip_ratio": 0.0, "completion_length": 461.96875, "epoch": 0.6544, "grad_norm": 0.039352841675281525, "kl": 0.007452964782714844, "learning_rate": 1.8366074928281608e-06, "loss": 0.0081, "reward": 1.285154014825821, "reward_std": 0.2392275594174862, "rewards/mrr_reward": 0.35860615968704224, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8643612116575241, "step": 818 }, { "clip_ratio": 0.0, "completion_length": 452.359375, "epoch": 0.6552, "grad_norm": 0.04071607068181038, "kl": 0.008332252502441406, "learning_rate": 1.818502825749764e-06, "loss": -0.0496, "reward": 1.1540860533714294, "reward_std": 0.1926177842542529, "rewards/mrr_reward": 0.24485987797379494, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8392150849103928, "step": 819 }, { "clip_ratio": 0.0, "completion_length": 467.328125, "epoch": 0.656, "grad_norm": 0.038751162588596344, "kl": 0.007927894592285156, "learning_rate": 1.8004789067454763e-06, "loss": -0.0364, "reward": 1.1840001344680786, "reward_std": 0.21113791782408953, "rewards/mrr_reward": 0.25868676230311394, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8547612130641937, "step": 820 }, { "clip_ratio": 0.0, "completion_length": 450.78125, "epoch": 0.6568, "grad_norm": 0.04042641073465347, "kl": 0.006926536560058594, "learning_rate": 1.7825359137040987e-06, "loss": 0.0111, "reward": 1.1455277800559998, "reward_std": 0.14714708551764488, "rewards/mrr_reward": 0.20182912051677704, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.9007083028554916, "step": 821 }, { "clip_ratio": 0.0, "completion_length": 439.046875, "epoch": 0.6576, "grad_norm": 0.04337405040860176, "kl": 0.013456344604492188, "learning_rate": 1.7646740237157256e-06, "loss": -0.0211, "reward": 1.1473829746246338, "reward_std": 0.1825929917395115, "rewards/mrr_reward": 0.23546627908945084, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8454151898622513, "step": 822 }, { "clip_ratio": 0.0, "completion_length": 437.40625, "epoch": 0.6584, "grad_norm": 0.03853907063603401, "kl": 0.008459091186523438, "learning_rate": 1.7468934130700044e-06, "loss": -0.0356, "reward": 1.3643218874931335, "reward_std": 0.2208605632185936, "rewards/mrr_reward": 0.44823288172483444, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8033709824085236, "step": 823 }, { "clip_ratio": 0.0, "completion_length": 443.28125, "epoch": 0.6592, "grad_norm": 0.04316363483667374, "kl": 0.009531974792480469, "learning_rate": 1.7291942572543806e-06, "loss": -0.0131, "reward": 1.1022516041994095, "reward_std": 0.2808724083006382, "rewards/mrr_reward": 0.24019718542695045, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7450985908508301, "step": 824 }, { "clip_ratio": 0.0, "completion_length": 445.109375, "epoch": 0.66, "grad_norm": 0.04379027336835861, "kl": 0.009677886962890625, "learning_rate": 1.7115767309523811e-06, "loss": -0.0232, "reward": 1.1667591333389282, "reward_std": 0.23061896860599518, "rewards/mrr_reward": 0.2626550104469061, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.79049052298069, "step": 825 }, { "clip_ratio": 0.0, "completion_length": 460.515625, "epoch": 0.6608, "grad_norm": 0.042437583208084106, "kl": 0.0114288330078125, "learning_rate": 1.6940410080418723e-06, "loss": 0.0054, "reward": 1.0727409720420837, "reward_std": 0.15770739316940308, "rewards/mrr_reward": 0.15853795036673546, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8269526362419128, "step": 826 }, { "clip_ratio": 0.0, "completion_length": 436.453125, "epoch": 0.6616, "grad_norm": 0.04106762260198593, "kl": 0.0107269287109375, "learning_rate": 1.6765872615933676e-06, "loss": -0.0267, "reward": 1.1445525884628296, "reward_std": 0.17659413255751133, "rewards/mrr_reward": 0.2239893414080143, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8794291019439697, "step": 827 }, { "clip_ratio": 0.0, "completion_length": 435.0, "epoch": 0.6624, "grad_norm": 0.04433257132768631, "kl": 0.008103370666503906, "learning_rate": 1.6592156638682887e-06, "loss": -0.0554, "reward": 1.1460805833339691, "reward_std": 0.14873161166906357, "rewards/mrr_reward": 0.253608625382185, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7747729271650314, "step": 828 }, { "clip_ratio": 0.0, "completion_length": 465.234375, "epoch": 0.6632, "grad_norm": 0.03736725449562073, "kl": 0.007901191711425781, "learning_rate": 1.6419263863172997e-06, "loss": -0.0225, "reward": 1.2986692488193512, "reward_std": 0.18618784938007593, "rewards/mrr_reward": 0.36312004551291466, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8506224751472473, "step": 829 }, { "clip_ratio": 0.0, "completion_length": 462.484375, "epoch": 0.664, "grad_norm": 0.03575273975729942, "kl": 0.007147789001464844, "learning_rate": 1.6247195995785836e-06, "loss": 0.0085, "reward": 1.231167882680893, "reward_std": 0.14090694999322295, "rewards/mrr_reward": 0.2981584817171097, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8683166652917862, "step": 830 }, { "clip_ratio": 0.0, "completion_length": 441.75, "epoch": 0.6648, "grad_norm": 0.04371047392487526, "kl": 0.008418083190917969, "learning_rate": 1.6075954734761844e-06, "loss": -0.0038, "reward": 1.1615868508815765, "reward_std": 0.243436086922884, "rewards/mrr_reward": 0.2448226734995842, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8132294416427612, "step": 831 }, { "clip_ratio": 0.0, "completion_length": 438.5, "epoch": 0.6656, "grad_norm": 0.04143482446670532, "kl": 0.00856781005859375, "learning_rate": 1.5905541770183096e-06, "loss": -0.0463, "reward": 1.1261744499206543, "reward_std": 0.13198063895106316, "rewards/mrr_reward": 0.1937314048409462, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.880272313952446, "step": 832 }, { "clip_ratio": 0.0, "completion_length": 469.09375, "epoch": 0.6664, "grad_norm": 0.04452883452177048, "kl": 0.010478973388671875, "learning_rate": 1.5735958783956795e-06, "loss": 0.0252, "reward": 1.1056948900222778, "reward_std": 0.13016505632549524, "rewards/mrr_reward": 0.17806300520896912, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.855927437543869, "step": 833 }, { "clip_ratio": 0.0, "completion_length": 443.671875, "epoch": 0.6672, "grad_norm": 0.04416726902127266, "kl": 0.008454322814941406, "learning_rate": 1.5567207449798517e-06, "loss": -0.0404, "reward": 1.2114092707633972, "reward_std": 0.27365200221538544, "rewards/mrr_reward": 0.30291419103741646, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.831140398979187, "step": 834 }, { "clip_ratio": 0.0, "completion_length": 452.296875, "epoch": 0.668, "grad_norm": 0.04146156832575798, "kl": 0.007966995239257812, "learning_rate": 1.5399289433215792e-06, "loss": -0.0037, "reward": 1.1683192551136017, "reward_std": 0.22274474799633026, "rewards/mrr_reward": 0.23329614847898483, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8959033340215683, "step": 835 }, { "clip_ratio": 0.0, "completion_length": 451.5625, "epoch": 0.6688, "grad_norm": 0.04751385748386383, "kl": 0.009054183959960938, "learning_rate": 1.52322063914917e-06, "loss": -0.0242, "reward": 1.154505506157875, "reward_std": 0.222128264605999, "rewards/mrr_reward": 0.2571180574595928, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7701369524002075, "step": 836 }, { "clip_ratio": 0.0, "completion_length": 465.96875, "epoch": 0.6696, "grad_norm": 0.04068367928266525, "kl": 0.007717132568359375, "learning_rate": 1.5065959973668355e-06, "loss": 0.0565, "reward": 1.2096376717090607, "reward_std": 0.14537774212658405, "rewards/mrr_reward": 0.287667416036129, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8641615360975266, "step": 837 }, { "clip_ratio": 0.0, "completion_length": 442.921875, "epoch": 0.6704, "grad_norm": 0.040934283286333084, "kl": 0.008641242980957031, "learning_rate": 1.490055182053083e-06, "loss": -0.0112, "reward": 1.1347018480300903, "reward_std": 0.19622957706451416, "rewards/mrr_reward": 0.2253534235060215, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8122418373823166, "step": 838 }, { "clip_ratio": 0.0, "completion_length": 450.0, "epoch": 0.6712, "grad_norm": 0.03677130863070488, "kl": 0.008676528930664062, "learning_rate": 1.4735983564590784e-06, "loss": -0.0226, "reward": 1.2674466371536255, "reward_std": 0.20840232260525227, "rewards/mrr_reward": 0.34420883283019066, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8465183228254318, "step": 839 }, { "clip_ratio": 0.0, "completion_length": 432.53125, "epoch": 0.672, "grad_norm": 0.0432620495557785, "kl": 0.010099411010742188, "learning_rate": 1.4572256830070497e-06, "loss": -0.0358, "reward": 1.1152960062026978, "reward_std": 0.11179288476705551, "rewards/mrr_reward": 0.1966145858168602, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8190391659736633, "step": 840 }, { "clip_ratio": 0.0, "completion_length": 470.375, "epoch": 0.6728, "grad_norm": 0.04272717610001564, "kl": 0.008855819702148438, "learning_rate": 1.4409373232886703e-06, "loss": -0.0453, "reward": 1.1043509542942047, "reward_std": 0.1713030431419611, "rewards/mrr_reward": 0.21338666044175625, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7702041119337082, "step": 841 }, { "clip_ratio": 0.0, "completion_length": 457.578125, "epoch": 0.6736, "grad_norm": 0.03976025804877281, "kl": 0.009042739868164062, "learning_rate": 1.4247334380634792e-06, "loss": -0.02, "reward": 1.1753461956977844, "reward_std": 0.23029477335512638, "rewards/mrr_reward": 0.25109126791357994, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8710848391056061, "step": 842 }, { "clip_ratio": 0.0, "completion_length": 451.921875, "epoch": 0.6744, "grad_norm": 0.03952572122216225, "kl": 0.0076618194580078125, "learning_rate": 1.408614187257279e-06, "loss": -0.0295, "reward": 1.2743225693702698, "reward_std": 0.2028570305556059, "rewards/mrr_reward": 0.3688492067158222, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8141710609197617, "step": 843 }, { "clip_ratio": 0.0, "completion_length": 458.671875, "epoch": 0.6752, "grad_norm": 0.04077652096748352, "kl": 0.009363174438476562, "learning_rate": 1.3925797299605649e-06, "loss": -0.0244, "reward": 1.102505773305893, "reward_std": 0.12630261853337288, "rewards/mrr_reward": 0.18967634066939354, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8169309347867966, "step": 844 }, { "clip_ratio": 0.0, "completion_length": 486.46875, "epoch": 0.676, "grad_norm": 0.0385010689496994, "kl": 0.010541915893554688, "learning_rate": 1.3766302244269624e-06, "loss": -0.0217, "reward": 1.0942464470863342, "reward_std": 0.20936249569058418, "rewards/mrr_reward": 0.19885292276740074, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8129228949546814, "step": 845 }, { "clip_ratio": 0.0, "completion_length": 461.953125, "epoch": 0.6768, "grad_norm": 0.04482361301779747, "kl": 0.008653640747070312, "learning_rate": 1.3607658280716474e-06, "loss": -0.0199, "reward": 1.0879277884960175, "reward_std": 0.13143850397318602, "rewards/mrr_reward": 0.17986730858683586, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8278702795505524, "step": 846 }, { "clip_ratio": 0.0, "completion_length": 431.09375, "epoch": 0.6776, "grad_norm": 0.038818296045064926, "kl": 0.008572578430175781, "learning_rate": 1.3449866974698123e-06, "loss": -0.0419, "reward": 1.0753368735313416, "reward_std": 0.172625370323658, "rewards/mrr_reward": 0.22671131044626236, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7161238044500351, "step": 847 }, { "clip_ratio": 0.0, "completion_length": 436.296875, "epoch": 0.6784, "grad_norm": 0.04571860656142235, "kl": 0.0086669921875, "learning_rate": 1.3292929883550998e-06, "loss": -0.0586, "reward": 1.1929639875888824, "reward_std": 0.1677282489836216, "rewards/mrr_reward": 0.2719742190092802, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8182217180728912, "step": 848 }, { "clip_ratio": 0.0, "completion_length": 441.484375, "epoch": 0.6792, "grad_norm": 0.05399928614497185, "kl": 0.01306915283203125, "learning_rate": 1.3136848556180893e-06, "loss": -0.0565, "reward": 1.1323251575231552, "reward_std": 0.15411379747092724, "rewards/mrr_reward": 0.21484375, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8505590111017227, "step": 849 }, { "clip_ratio": 0.0, "completion_length": 441.015625, "epoch": 0.68, "grad_norm": 0.03968013450503349, "kl": 0.008097648620605469, "learning_rate": 1.2981624533047432e-06, "loss": 0.0032, "reward": 1.1719480752944946, "reward_std": 0.1420932300388813, "rewards/mrr_reward": 0.22884425148367882, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8715621381998062, "step": 850 }, { "clip_ratio": 0.0, "completion_length": 474.9375, "epoch": 0.6808, "grad_norm": 0.03793917968869209, "kl": 0.010494232177734375, "learning_rate": 1.2827259346149123e-06, "loss": -0.0135, "reward": 1.1404308676719666, "reward_std": 0.1706097424030304, "rewards/mrr_reward": 0.21374628692865372, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8296191394329071, "step": 851 }, { "clip_ratio": 0.0, "completion_length": 461.015625, "epoch": 0.6816, "grad_norm": 0.03952965885400772, "kl": 0.007433891296386719, "learning_rate": 1.2673754519008008e-06, "loss": -0.0227, "reward": 1.2492049932479858, "reward_std": 0.18265685997903347, "rewards/mrr_reward": 0.3240761533379555, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8307643234729767, "step": 852 }, { "clip_ratio": 0.0, "completion_length": 458.40625, "epoch": 0.6824, "grad_norm": 0.035804543644189835, "kl": 0.007625579833984375, "learning_rate": 1.2521111566654732e-06, "loss": 0.0024, "reward": 1.2148383259773254, "reward_std": 0.1326262354850769, "rewards/mrr_reward": 0.28328993543982506, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.858029916882515, "step": 853 }, { "clip_ratio": 0.0, "completion_length": 464.65625, "epoch": 0.6832, "grad_norm": 0.037164557725191116, "kl": 0.008016586303710938, "learning_rate": 1.2369331995613664e-06, "loss": 0.0025, "reward": 1.085724025964737, "reward_std": 0.09625130379572511, "rewards/mrr_reward": 0.14905753917992115, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.865726962685585, "step": 854 }, { "clip_ratio": 0.0, "completion_length": 447.171875, "epoch": 0.684, "grad_norm": 0.040722329169511795, "kl": 0.009304046630859375, "learning_rate": 1.2218417303887842e-06, "loss": -0.0171, "reward": 1.1564219295978546, "reward_std": 0.1851737443357706, "rewards/mrr_reward": 0.2559833899140358, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.781335860490799, "step": 855 }, { "clip_ratio": 0.0, "completion_length": 434.421875, "epoch": 0.6848, "grad_norm": 0.045357346534729004, "kl": 0.008769035339355469, "learning_rate": 1.206836898094439e-06, "loss": -0.0129, "reward": 1.2376780807971954, "reward_std": 0.22706399112939835, "rewards/mrr_reward": 0.33642733469605446, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7857502400875092, "step": 856 }, { "clip_ratio": 0.0, "completion_length": 456.21875, "epoch": 0.6856, "grad_norm": 0.04047942906618118, "kl": 0.007975578308105469, "learning_rate": 1.1919188507699641e-06, "loss": 0.0062, "reward": 1.1433132886886597, "reward_std": 0.15857827477157116, "rewards/mrr_reward": 0.21695809438824654, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8481525033712387, "step": 857 }, { "clip_ratio": 0.0, "completion_length": 471.078125, "epoch": 0.6864, "grad_norm": 0.04048970341682434, "kl": 0.008747100830078125, "learning_rate": 1.1770877356504684e-06, "loss": 0.0148, "reward": 1.0512803494930267, "reward_std": 0.10148373059928417, "rewards/mrr_reward": 0.12783358246088028, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.866682767868042, "step": 858 }, { "clip_ratio": 0.0, "completion_length": 470.890625, "epoch": 0.6872, "grad_norm": 0.03905528783798218, "kl": 0.008471488952636719, "learning_rate": 1.1623436991130654e-06, "loss": -0.0104, "reward": 1.1480936110019684, "reward_std": 0.19399065151810646, "rewards/mrr_reward": 0.23568949103355408, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8586108982563019, "step": 859 }, { "clip_ratio": 0.0, "completion_length": 433.828125, "epoch": 0.688, "grad_norm": 0.04030711576342583, "kl": 0.008829116821289062, "learning_rate": 1.1476868866754488e-06, "loss": -0.0087, "reward": 1.0645442605018616, "reward_std": 0.1284739300608635, "rewards/mrr_reward": 0.14106522873044014, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8667805790901184, "step": 860 }, { "clip_ratio": 0.0, "completion_length": 476.5, "epoch": 0.6888, "grad_norm": 0.04082191362977028, "kl": 0.0068359375, "learning_rate": 1.1331174429944346e-06, "loss": -0.0112, "reward": 1.2571602165699005, "reward_std": 0.08795512840151787, "rewards/mrr_reward": 0.31218378245830536, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8635648190975189, "step": 861 }, { "clip_ratio": 0.0, "completion_length": 472.34375, "epoch": 0.6896, "grad_norm": 0.0414448082447052, "kl": 0.0075016021728515625, "learning_rate": 1.1186355118645552e-06, "loss": -0.0359, "reward": 1.0901014506816864, "reward_std": 0.06480870395898819, "rewards/mrr_reward": 0.14286334067583084, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8704183101654053, "step": 862 }, { "clip_ratio": 0.0, "completion_length": 464.90625, "epoch": 0.6904, "grad_norm": 0.04123280942440033, "kl": 0.008793830871582031, "learning_rate": 1.1042412362166221e-06, "loss": -0.0314, "reward": 1.2326207756996155, "reward_std": 0.23798086121678352, "rewards/mrr_reward": 0.3054501600563526, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8232797235250473, "step": 863 }, { "clip_ratio": 0.0, "completion_length": 453.84375, "epoch": 0.6912, "grad_norm": 0.0383603498339653, "kl": 0.009160995483398438, "learning_rate": 1.0899347581163222e-06, "loss": -0.0088, "reward": 1.1871033906936646, "reward_std": 0.21597011759877205, "rewards/mrr_reward": 0.28604910895228386, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8261706084012985, "step": 864 }, { "clip_ratio": 0.0, "completion_length": 459.484375, "epoch": 0.692, "grad_norm": 0.04392939805984497, "kl": 0.007921218872070312, "learning_rate": 1.0757162187628223e-06, "loss": -0.0154, "reward": 1.1239495277404785, "reward_std": 0.09795428067445755, "rewards/mrr_reward": 0.2013950888067484, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8112445026636124, "step": 865 }, { "clip_ratio": 0.0, "completion_length": 465.234375, "epoch": 0.6928, "grad_norm": 0.03897380828857422, "kl": 0.00806427001953125, "learning_rate": 1.0615857584873624e-06, "loss": -0.0143, "reward": 1.1538923382759094, "reward_std": 0.11023381073027849, "rewards/mrr_reward": 0.21333704888820648, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8579799234867096, "step": 866 }, { "clip_ratio": 0.0, "completion_length": 436.5, "epoch": 0.6936, "grad_norm": 0.04020663723349571, "kl": 0.008386611938476562, "learning_rate": 1.0475435167518843e-06, "loss": -0.0445, "reward": 1.1563519537448883, "reward_std": 0.17987770959734917, "rewards/mrr_reward": 0.23674975894391537, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8140168786048889, "step": 867 }, { "clip_ratio": 0.0, "completion_length": 482.765625, "epoch": 0.6944, "grad_norm": 0.04018275439739227, "kl": 0.007943153381347656, "learning_rate": 1.0335896321476413e-06, "loss": -0.0054, "reward": 1.1884158253669739, "reward_std": 0.09958555456250906, "rewards/mrr_reward": 0.258320938795805, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.875109925866127, "step": 868 }, { "clip_ratio": 0.0, "completion_length": 459.765625, "epoch": 0.6952, "grad_norm": 0.0402548648416996, "kl": 0.008069038391113281, "learning_rate": 1.0197242423938447e-06, "loss": -0.0367, "reward": 1.1144362390041351, "reward_std": 0.14857859443873167, "rewards/mrr_reward": 0.1893291138112545, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8599952906370163, "step": 869 }, { "clip_ratio": 0.0, "completion_length": 459.671875, "epoch": 0.696, "grad_norm": 0.03870267793536186, "kl": 0.009595870971679688, "learning_rate": 1.0059474843362893e-06, "loss": -0.0152, "reward": 1.1645599752664566, "reward_std": 0.16920059733092785, "rewards/mrr_reward": 0.2650669626891613, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8097206056118011, "step": 870 }, { "clip_ratio": 0.0, "completion_length": 475.046875, "epoch": 0.6968, "grad_norm": 0.04259733855724335, "kl": 0.009562492370605469, "learning_rate": 9.922594939460195e-07, "loss": 0.011, "reward": 1.1267297565937042, "reward_std": 0.2542701195925474, "rewards/mrr_reward": 0.24404142051935196, "rewards/rank_answer_foramt_reward": 0.869140625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8212972581386566, "step": 871 }, { "clip_ratio": 0.0, "completion_length": 422.90625, "epoch": 0.6976, "grad_norm": 0.043759021908044815, "kl": 0.011583328247070312, "learning_rate": 9.786604063179728e-07, "loss": 0.0017, "reward": 1.2579089105129242, "reward_std": 0.17472930811345577, "rewards/mrr_reward": 0.34670138359069824, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.831547275185585, "step": 872 }, { "clip_ratio": 0.0, "completion_length": 433.59375, "epoch": 0.6984, "grad_norm": 0.03983573243021965, "kl": 0.009865760803222656, "learning_rate": 9.651503556696519e-07, "loss": -0.0446, "reward": 1.1973416805267334, "reward_std": 0.17388677783310413, "rewards/mrr_reward": 0.2698722742497921, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8534820228815079, "step": 873 }, { "clip_ratio": 0.0, "completion_length": 438.375, "epoch": 0.6992, "grad_norm": 0.043607067316770554, "kl": 0.008998870849609375, "learning_rate": 9.517294753398066e-07, "loss": -0.0139, "reward": 1.294020414352417, "reward_std": 0.19010700285434723, "rewards/mrr_reward": 0.37508679926395416, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8471473008394241, "step": 874 }, { "clip_ratio": 0.0, "completion_length": 455.1875, "epoch": 0.7, "grad_norm": 0.04746263101696968, "kl": 0.009923934936523438, "learning_rate": 9.383978977871022e-07, "loss": -0.0106, "reward": 1.0680293440818787, "reward_std": 0.11477180011570454, "rewards/mrr_reward": 0.13857267424464226, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8712227642536163, "step": 875 }, { "clip_ratio": 0.0, "completion_length": 419.625, "epoch": 0.7008, "grad_norm": 0.04973239824175835, "kl": 0.009832382202148438, "learning_rate": 9.251557545888312e-07, "loss": -0.0656, "reward": 1.2519857585430145, "reward_std": 0.197287205606699, "rewards/mrr_reward": 0.34175965934991837, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7875577062368393, "step": 876 }, { "clip_ratio": 0.0, "completion_length": 485.8125, "epoch": 0.7016, "grad_norm": 0.039514243602752686, "kl": 0.006424903869628906, "learning_rate": 9.120031764395987e-07, "loss": 0.0036, "reward": 1.2768616378307343, "reward_std": 0.17542525753378868, "rewards/mrr_reward": 0.3354600667953491, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.880075678229332, "step": 877 }, { "clip_ratio": 0.0, "completion_length": 443.21875, "epoch": 0.7024, "grad_norm": 0.04365525767207146, "kl": 0.009319305419921875, "learning_rate": 8.989402931500434e-07, "loss": 0.0149, "reward": 1.0583529770374298, "reward_std": 0.1347158532589674, "rewards/mrr_reward": 0.1308221723884344, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8810119330883026, "step": 878 }, { "clip_ratio": 0.0, "completion_length": 452.421875, "epoch": 0.7032, "grad_norm": 0.03814084827899933, "kl": 0.00922393798828125, "learning_rate": 8.859672336455471e-07, "loss": -0.0151, "reward": 1.1374922394752502, "reward_std": 0.20830333605408669, "rewards/mrr_reward": 0.23531125485897064, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7963816821575165, "step": 879 }, { "clip_ratio": 0.0, "completion_length": 445.09375, "epoch": 0.704, "grad_norm": 0.03798083961009979, "kl": 0.009809494018554688, "learning_rate": 8.730841259649725e-07, "loss": -0.0172, "reward": 1.0921657383441925, "reward_std": 0.13509543286636472, "rewards/mrr_reward": 0.17431175522506237, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8165318071842194, "step": 880 }, { "clip_ratio": 0.0, "completion_length": 474.46875, "epoch": 0.7048, "grad_norm": 0.03497171774506569, "kl": 0.007781982421875, "learning_rate": 8.602910972593892e-07, "loss": -0.0358, "reward": 1.11690154671669, "reward_std": 0.14519068971276283, "rewards/mrr_reward": 0.19576513767242432, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7913223654031754, "step": 881 }, { "clip_ratio": 0.0, "completion_length": 442.71875, "epoch": 0.7056, "grad_norm": 0.042609184980392456, "kl": 0.008588790893554688, "learning_rate": 8.475882737908248e-07, "loss": -0.0342, "reward": 1.0906363278627396, "reward_std": 0.20800522714853287, "rewards/mrr_reward": 0.1928075421601534, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8046774417161942, "step": 882 }, { "clip_ratio": 0.0, "completion_length": 463.625, "epoch": 0.7064, "grad_norm": 0.040892452001571655, "kl": 0.0076351165771484375, "learning_rate": 8.349757809310211e-07, "loss": -0.0081, "reward": 1.2315025329589844, "reward_std": 0.21779580041766167, "rewards/mrr_reward": 0.29523809254169464, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8645085841417313, "step": 883 }, { "clip_ratio": 0.0, "completion_length": 486.3125, "epoch": 0.7072, "grad_norm": 0.03839423879981041, "kl": 0.008156776428222656, "learning_rate": 8.224537431601886e-07, "loss": 0.0002, "reward": 1.138191044330597, "reward_std": 0.1623014360666275, "rewards/mrr_reward": 0.20939361490309238, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8360219746828079, "step": 884 }, { "clip_ratio": 0.0, "completion_length": 454.09375, "epoch": 0.708, "grad_norm": 0.03817719221115112, "kl": 0.009266853332519531, "learning_rate": 8.100222840657879e-07, "loss": -0.0066, "reward": 1.2030390501022339, "reward_std": 0.11985693499445915, "rewards/mrr_reward": 0.2815476171672344, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8275544792413712, "step": 885 }, { "clip_ratio": 0.0, "completion_length": 460.859375, "epoch": 0.7088, "grad_norm": 0.04101807251572609, "kl": 0.009810447692871094, "learning_rate": 7.976815263412963e-07, "loss": -0.0044, "reward": 1.1409578323364258, "reward_std": 0.23399163782596588, "rewards/mrr_reward": 0.23911211267113686, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.836381271481514, "step": 886 }, { "clip_ratio": 0.0, "completion_length": 466.296875, "epoch": 0.7096, "grad_norm": 0.04037481173872948, "kl": 0.009243011474609375, "learning_rate": 7.854315917850163e-07, "loss": -0.0243, "reward": 1.1751604080200195, "reward_std": 0.2168668694794178, "rewards/mrr_reward": 0.271905992180109, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8269782513380051, "step": 887 }, { "clip_ratio": 0.0, "completion_length": 460.1875, "epoch": 0.7104, "grad_norm": 0.037997495383024216, "kl": 0.0094451904296875, "learning_rate": 7.732726012988512e-07, "loss": -0.0095, "reward": 1.1163204610347748, "reward_std": 0.14024154655635357, "rewards/mrr_reward": 0.1814298164099455, "rewards/rank_answer_foramt_reward": 0.984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8564394563436508, "step": 888 }, { "clip_ratio": 0.0, "completion_length": 439.890625, "epoch": 0.7112, "grad_norm": 0.03766101598739624, "kl": 0.008104324340820312, "learning_rate": 7.612046748871327e-07, "loss": -0.0091, "reward": 1.1998648047447205, "reward_std": 0.14241230860352516, "rewards/mrr_reward": 0.2643105238676071, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8486847132444382, "step": 889 }, { "clip_ratio": 0.0, "completion_length": 462.578125, "epoch": 0.712, "grad_norm": 0.03838858753442764, "kl": 0.00780487060546875, "learning_rate": 7.492279316554207e-07, "loss": -0.0223, "reward": 1.1085861921310425, "reward_std": 0.15880068391561508, "rewards/mrr_reward": 0.17414434999227524, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.886329397559166, "step": 890 }, { "clip_ratio": 0.0, "completion_length": 469.15625, "epoch": 0.7128, "grad_norm": 0.0412142239511013, "kl": 0.007672309875488281, "learning_rate": 7.373424898093339e-07, "loss": 0.0255, "reward": 1.2313543856143951, "reward_std": 0.20873188227415085, "rewards/mrr_reward": 0.3427393361926079, "rewards/rank_answer_foramt_reward": 0.96875, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7474602460861206, "step": 891 }, { "clip_ratio": 0.0, "completion_length": 456.296875, "epoch": 0.7136, "grad_norm": 0.04028737545013428, "kl": 0.0076961517333984375, "learning_rate": 7.255484666533874e-07, "loss": -0.0215, "reward": 1.1419077217578888, "reward_std": 0.1702859941869974, "rewards/mrr_reward": 0.20735986903309822, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8729787468910217, "step": 892 }, { "clip_ratio": 0.0, "completion_length": 463.609375, "epoch": 0.7144, "grad_norm": 0.041100095957517624, "kl": 0.009128570556640625, "learning_rate": 7.138459785898266e-07, "loss": -0.0366, "reward": 1.1841089129447937, "reward_std": 0.23279405757784843, "rewards/mrr_reward": 0.27274926379323006, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8183364868164062, "step": 893 }, { "clip_ratio": 0.0, "completion_length": 443.828125, "epoch": 0.7152, "grad_norm": 0.040628962218761444, "kl": 0.009103775024414062, "learning_rate": 7.022351411174866e-07, "loss": -0.0277, "reward": 1.2547601759433746, "reward_std": 0.1291979430243373, "rewards/mrr_reward": 0.3259424641728401, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8282708525657654, "step": 894 }, { "clip_ratio": 0.0, "completion_length": 440.53125, "epoch": 0.716, "grad_norm": 0.04295559599995613, "kl": 0.008701324462890625, "learning_rate": 6.907160688306425e-07, "loss": -0.0339, "reward": 1.2229861319065094, "reward_std": 0.18074735067784786, "rewards/mrr_reward": 0.29606895335018635, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8498553782701492, "step": 895 }, { "clip_ratio": 0.0, "completion_length": 452.40625, "epoch": 0.7168, "grad_norm": 0.04010513424873352, "kl": 0.009496688842773438, "learning_rate": 6.792888754178906e-07, "loss": -0.0085, "reward": 1.2736192345619202, "reward_std": 0.28126518800854683, "rewards/mrr_reward": 0.36719369515776634, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.81510329246521, "step": 896 }, { "clip_ratio": 0.0, "completion_length": 435.515625, "epoch": 0.7176, "grad_norm": 0.04235503822565079, "kl": 0.0098114013671875, "learning_rate": 6.679536736610137e-07, "loss": -0.0137, "reward": 1.257362738251686, "reward_std": 0.1873782053589821, "rewards/mrr_reward": 0.3338169790804386, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8552640378475189, "step": 897 }, { "clip_ratio": 0.0, "completion_length": 446.203125, "epoch": 0.7184, "grad_norm": 0.03878998011350632, "kl": 0.007755279541015625, "learning_rate": 6.567105754338798e-07, "loss": -0.0676, "reward": 1.2187067866325378, "reward_std": 0.20256269164383411, "rewards/mrr_reward": 0.28175223618745804, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8392561823129654, "step": 898 }, { "clip_ratio": 0.0, "completion_length": 428.609375, "epoch": 0.7192, "grad_norm": 0.04290932044386864, "kl": 0.011754989624023438, "learning_rate": 6.455596917013274e-07, "loss": -0.0499, "reward": 1.0937216877937317, "reward_std": 0.19892499037086964, "rewards/mrr_reward": 0.20499752275645733, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.7790410816669464, "step": 899 }, { "clip_ratio": 0.0, "completion_length": 458.78125, "epoch": 0.72, "grad_norm": 0.0432630330324173, "kl": 0.008266448974609375, "learning_rate": 6.345011325180772e-07, "loss": 0.0333, "reward": 1.2824538052082062, "reward_std": 0.11213483987376094, "rewards/mrr_reward": 0.340978417545557, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8881120085716248, "step": 900 }, { "clip_ratio": 0.0, "completion_length": 448.1875, "epoch": 0.7208, "grad_norm": 0.039642494171857834, "kl": 0.009405136108398438, "learning_rate": 6.235350070276447e-07, "loss": -0.0272, "reward": 1.1255181729793549, "reward_std": 0.17566965892910957, "rewards/mrr_reward": 0.22555803321301937, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8130893558263779, "step": 901 }, { "clip_ratio": 0.0, "completion_length": 445.921875, "epoch": 0.7216, "grad_norm": 0.03899867832660675, "kl": 0.008452415466308594, "learning_rate": 6.126614234612593e-07, "loss": -0.0113, "reward": 1.22465381026268, "reward_std": 0.1621338054537773, "rewards/mrr_reward": 0.28670016303658485, "rewards/rank_answer_foramt_reward": 0.984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8657210916280746, "step": 902 }, { "clip_ratio": 0.0, "completion_length": 456.484375, "epoch": 0.7224, "grad_norm": 0.04027952253818512, "kl": 0.008156776428222656, "learning_rate": 6.018804891368035e-07, "loss": -0.0082, "reward": 1.144799381494522, "reward_std": 0.16850159130990505, "rewards/mrr_reward": 0.21117932349443436, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8838390558958054, "step": 903 }, { "clip_ratio": 0.0, "completion_length": 456.234375, "epoch": 0.7232, "grad_norm": 0.04422492906451225, "kl": 0.008924484252929688, "learning_rate": 5.911923104577455e-07, "loss": -0.0543, "reward": 1.121463656425476, "reward_std": 0.17780250683426857, "rewards/mrr_reward": 0.20161830820143223, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8284256458282471, "step": 904 }, { "clip_ratio": 0.0, "completion_length": 429.859375, "epoch": 0.724, "grad_norm": 0.045736148953437805, "kl": 0.010265350341796875, "learning_rate": 5.805969929120947e-07, "loss": -0.0569, "reward": 1.1411092430353165, "reward_std": 0.2664986848831177, "rewards/mrr_reward": 0.29250372759997845, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.6750472635030746, "step": 905 }, { "clip_ratio": 0.0, "completion_length": 469.21875, "epoch": 0.7248, "grad_norm": 0.037118908017873764, "kl": 0.008056640625, "learning_rate": 5.700946410713548e-07, "loss": -0.0048, "reward": 1.160048007965088, "reward_std": 0.15992580354213715, "rewards/mrr_reward": 0.22242064028978348, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8686386346817017, "step": 906 }, { "clip_ratio": 0.0, "completion_length": 492.296875, "epoch": 0.7256, "grad_norm": 0.039864134043455124, "kl": 0.00611114501953125, "learning_rate": 5.596853585895034e-07, "loss": 0.0155, "reward": 1.2810848355293274, "reward_std": 0.17151801194995642, "rewards/mrr_reward": 0.33007192611694336, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8818571120500565, "step": 907 }, { "clip_ratio": 0.0, "completion_length": 458.03125, "epoch": 0.7264, "grad_norm": 0.04468508064746857, "kl": 0.008083343505859375, "learning_rate": 5.49369248201953e-07, "loss": -0.0198, "reward": 1.1350350379943848, "reward_std": 0.1694689802825451, "rewards/mrr_reward": 0.21646826341748238, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8382229655981064, "step": 908 }, { "clip_ratio": 0.0, "completion_length": 455.109375, "epoch": 0.7272, "grad_norm": 0.03844364359974861, "kl": 0.008733749389648438, "learning_rate": 5.391464117245471e-07, "loss": -0.0246, "reward": 1.1559688597917557, "reward_std": 0.1414298638701439, "rewards/mrr_reward": 0.2493179589509964, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8021143823862076, "step": 909 }, { "clip_ratio": 0.0, "completion_length": 491.109375, "epoch": 0.728, "grad_norm": 0.038503993302583694, "kl": 0.0072689056396484375, "learning_rate": 5.290169500525577e-07, "loss": 0.0071, "reward": 1.2781525254249573, "reward_std": 0.10281640524044633, "rewards/mrr_reward": 0.33461061120033264, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8865616470575333, "step": 910 }, { "clip_ratio": 0.0, "completion_length": 474.859375, "epoch": 0.7288, "grad_norm": 0.0404188446700573, "kl": 0.007802009582519531, "learning_rate": 5.189809631596798e-07, "loss": -0.055, "reward": 1.1833963990211487, "reward_std": 0.15963028743863106, "rewards/mrr_reward": 0.2542596831917763, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8643939197063446, "step": 911 }, { "clip_ratio": 0.0, "completion_length": 446.078125, "epoch": 0.7296, "grad_norm": 0.04015132039785385, "kl": 0.008340835571289062, "learning_rate": 5.090385500970551e-07, "loss": -0.0663, "reward": 1.073600858449936, "reward_std": 0.1477372208610177, "rewards/mrr_reward": 0.16947544738650322, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8237582743167877, "step": 912 }, { "clip_ratio": 0.0, "completion_length": 457.546875, "epoch": 0.7304, "grad_norm": 0.04625895991921425, "kl": 0.008310317993164062, "learning_rate": 4.99189808992282e-07, "loss": 0.0159, "reward": 1.0823292136192322, "reward_std": 0.15047525987029076, "rewards/mrr_reward": 0.1619233600795269, "rewards/rank_answer_foramt_reward": 0.900390625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8887177854776382, "step": 913 }, { "clip_ratio": 0.0, "completion_length": 442.515625, "epoch": 0.7312, "grad_norm": 0.04360828921198845, "kl": 0.008584976196289062, "learning_rate": 4.894348370484648e-07, "loss": -0.0237, "reward": 1.1279266774654388, "reward_std": 0.2273631989955902, "rewards/mrr_reward": 0.21179315820336342, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8855368793010712, "step": 914 }, { "clip_ratio": 0.0, "completion_length": 438.90625, "epoch": 0.732, "grad_norm": 0.04092933610081673, "kl": 0.009286880493164062, "learning_rate": 4.797737305432337e-07, "loss": -0.0416, "reward": 1.0745922327041626, "reward_std": 0.12707211077213287, "rewards/mrr_reward": 0.15930680558085442, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8556233495473862, "step": 915 }, { "clip_ratio": 0.0, "completion_length": 457.890625, "epoch": 0.7328, "grad_norm": 0.03805176913738251, "kl": 0.008405685424804688, "learning_rate": 4.702065848278126e-07, "loss": -0.0324, "reward": 1.2603042423725128, "reward_std": 0.11048845760524273, "rewards/mrr_reward": 0.3443204537034035, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7757083177566528, "step": 916 }, { "clip_ratio": 0.0, "completion_length": 452.5625, "epoch": 0.7336, "grad_norm": 0.04079756885766983, "kl": 0.007517814636230469, "learning_rate": 4.6073349432606554e-07, "loss": -0.0101, "reward": 1.1363862454891205, "reward_std": 0.15507613588124514, "rewards/mrr_reward": 0.21534598991274834, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.882827877998352, "step": 917 }, { "clip_ratio": 0.0, "completion_length": 431.25, "epoch": 0.7344, "grad_norm": 0.039096131920814514, "kl": 0.009703636169433594, "learning_rate": 4.5135455253357053e-07, "loss": -0.0168, "reward": 1.1451416015625, "reward_std": 0.16741209849715233, "rewards/mrr_reward": 0.2290364522486925, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.826857328414917, "step": 918 }, { "clip_ratio": 0.0, "completion_length": 459.890625, "epoch": 0.7352, "grad_norm": 0.04211005941033363, "kl": 0.010351181030273438, "learning_rate": 4.420698520166988e-07, "loss": 0.007, "reward": 1.2284757196903229, "reward_std": 0.2662056963890791, "rewards/mrr_reward": 0.29736483842134476, "rewards/rank_answer_foramt_reward": 0.927734375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.9016263037919998, "step": 919 }, { "clip_ratio": 0.0, "completion_length": 457.078125, "epoch": 0.736, "grad_norm": 0.04248746857047081, "kl": 0.008783340454101562, "learning_rate": 4.3287948441169457e-07, "loss": 0.0271, "reward": 1.0713848173618317, "reward_std": 0.11600453779101372, "rewards/mrr_reward": 0.14102182537317276, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8602972775697708, "step": 920 }, { "clip_ratio": 0.0, "completion_length": 444.796875, "epoch": 0.7368, "grad_norm": 0.0458797849714756, "kl": 0.009664535522460938, "learning_rate": 4.2378354042377776e-07, "loss": -0.0193, "reward": 1.2582030892372131, "reward_std": 0.29735782369971275, "rewards/mrr_reward": 0.3648375477641821, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8184964209794998, "step": 921 }, { "clip_ratio": 0.0, "completion_length": 469.171875, "epoch": 0.7376, "grad_norm": 0.06803036481142044, "kl": 0.02263164520263672, "learning_rate": 4.1478210982624055e-07, "loss": -0.034, "reward": 1.264964759349823, "reward_std": 0.1754690520465374, "rewards/mrr_reward": 0.34071801975369453, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8144194781780243, "step": 922 }, { "clip_ratio": 0.0, "completion_length": 464.34375, "epoch": 0.7384, "grad_norm": 0.03774275258183479, "kl": 0.00981903076171875, "learning_rate": 4.0587528145957235e-07, "loss": 0.0038, "reward": 1.077519729733467, "reward_std": 0.10510019911453128, "rewards/mrr_reward": 0.15089286118745804, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8704600483179092, "step": 923 }, { "clip_ratio": 0.0, "completion_length": 453.734375, "epoch": 0.7392, "grad_norm": 0.0418127216398716, "kl": 0.007354736328125, "learning_rate": 3.9706314323056936e-07, "loss": -0.0363, "reward": 1.0994101464748383, "reward_std": 0.16536758467555046, "rewards/mrr_reward": 0.18959572538733482, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8273257464170456, "step": 924 }, { "clip_ratio": 0.0, "completion_length": 419.3125, "epoch": 0.74, "grad_norm": 0.04003710672259331, "kl": 0.010486602783203125, "learning_rate": 3.883457821114811e-07, "loss": -0.026, "reward": 1.259395271539688, "reward_std": 0.185102803632617, "rewards/mrr_reward": 0.3478422686457634, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8169691562652588, "step": 925 }, { "clip_ratio": 0.0, "completion_length": 466.90625, "epoch": 0.7408, "grad_norm": 0.03963383659720421, "kl": 0.009263992309570312, "learning_rate": 3.7972328413914074e-07, "loss": -0.0154, "reward": 1.1112309098243713, "reward_std": 0.17668243870139122, "rewards/mrr_reward": 0.20306920632719994, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8555205762386322, "step": 926 }, { "clip_ratio": 0.0, "completion_length": 439.125, "epoch": 0.7416, "grad_norm": 0.043505482375621796, "kl": 0.01036834716796875, "learning_rate": 3.711957344141237e-07, "loss": -0.0355, "reward": 1.1022839844226837, "reward_std": 0.19026428647339344, "rewards/mrr_reward": 0.20350942388176918, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8192622512578964, "step": 927 }, { "clip_ratio": 0.0, "completion_length": 448.953125, "epoch": 0.7424, "grad_norm": 0.04294588789343834, "kl": 0.008907318115234375, "learning_rate": 3.627632170999029e-07, "loss": 0.0123, "reward": 1.0245526134967804, "reward_std": 0.16080267634242773, "rewards/mrr_reward": 0.14318576268851757, "rewards/rank_answer_foramt_reward": 0.84765625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8309648036956787, "step": 928 }, { "clip_ratio": 0.0, "completion_length": 439.78125, "epoch": 0.7432, "grad_norm": 0.039757587015628815, "kl": 0.007798194885253906, "learning_rate": 3.544258154220193e-07, "loss": -0.0165, "reward": 1.2130238115787506, "reward_std": 0.20079264417290688, "rewards/mrr_reward": 0.2949962895363569, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8229171335697174, "step": 929 }, { "clip_ratio": 0.0, "completion_length": 457.9375, "epoch": 0.744, "grad_norm": 0.039665572345256805, "kl": 0.0093231201171875, "learning_rate": 3.4618361166726123e-07, "loss": -0.0331, "reward": 1.18429334461689, "reward_std": 0.18769347667694092, "rewards/mrr_reward": 0.2760540656745434, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8342714756727219, "step": 930 }, { "clip_ratio": 0.0, "completion_length": 479.3125, "epoch": 0.7448, "grad_norm": 0.042533498257398605, "kl": 0.008765220642089844, "learning_rate": 3.380366871828522e-07, "loss": 0.0221, "reward": 1.19572052359581, "reward_std": 0.22476008161902428, "rewards/mrr_reward": 0.28604911267757416, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8600956052541733, "step": 931 }, { "clip_ratio": 0.0, "completion_length": 448.0625, "epoch": 0.7456, "grad_norm": 0.03890962526202202, "kl": 0.008768081665039062, "learning_rate": 3.2998512237565005e-07, "loss": -0.0272, "reward": 1.0621162056922913, "reward_std": 0.09464808227494359, "rewards/mrr_reward": 0.14957217499613762, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7926287055015564, "step": 932 }, { "clip_ratio": 0.0, "completion_length": 441.28125, "epoch": 0.7464, "grad_norm": 0.04664164409041405, "kl": 0.00838470458984375, "learning_rate": 3.2202899671134546e-07, "loss": -0.0119, "reward": 1.2424928843975067, "reward_std": 0.18272621743381023, "rewards/mrr_reward": 0.31674107909202576, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8267927318811417, "step": 933 }, { "clip_ratio": 0.0, "completion_length": 448.0625, "epoch": 0.7472, "grad_norm": 0.04111839085817337, "kl": 0.009455680847167969, "learning_rate": 3.1416838871368925e-07, "loss": -0.0532, "reward": 1.117896169424057, "reward_std": 0.13564826268702745, "rewards/mrr_reward": 0.20161830261349678, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8390995860099792, "step": 934 }, { "clip_ratio": 0.0, "completion_length": 427.875, "epoch": 0.748, "grad_norm": 0.046598147600889206, "kl": 0.009102821350097656, "learning_rate": 3.064033759637064e-07, "loss": -0.072, "reward": 1.1321559846401215, "reward_std": 0.20450343750417233, "rewards/mrr_reward": 0.2703249081969261, "rewards/rank_answer_foramt_reward": 0.912109375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.7073123753070831, "step": 935 }, { "clip_ratio": 0.0, "completion_length": 447.015625, "epoch": 0.7488, "grad_norm": 0.03951597958803177, "kl": 0.00994110107421875, "learning_rate": 2.987340350989421e-07, "loss": -0.0239, "reward": 1.126262903213501, "reward_std": 0.15488239750266075, "rewards/mrr_reward": 0.19883432984352112, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.845545768737793, "step": 936 }, { "clip_ratio": 0.0, "completion_length": 449.65625, "epoch": 0.7496, "grad_norm": 0.040472351014614105, "kl": 0.007419586181640625, "learning_rate": 2.911604418126901e-07, "loss": -0.0046, "reward": 1.1847195029258728, "reward_std": 0.22169340029358864, "rewards/mrr_reward": 0.2553013488650322, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8789184093475342, "step": 937 }, { "clip_ratio": 0.0, "completion_length": 460.3125, "epoch": 0.7504, "grad_norm": 0.03997602313756943, "kl": 0.008014678955078125, "learning_rate": 2.836826708532603e-07, "loss": 0.0314, "reward": 1.1391558349132538, "reward_std": 0.15339666418731213, "rewards/mrr_reward": 0.23206845670938492, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8034368455410004, "step": 938 }, { "clip_ratio": 0.0, "completion_length": 424.265625, "epoch": 0.7512, "grad_norm": 0.04147866368293762, "kl": 0.008632659912109375, "learning_rate": 2.7630079602323447e-07, "loss": -0.0225, "reward": 1.193027675151825, "reward_std": 0.21564476191997528, "rewards/mrr_reward": 0.2960689514875412, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8293846398591995, "step": 939 }, { "clip_ratio": 0.0, "completion_length": 447.234375, "epoch": 0.752, "grad_norm": 0.03809836506843567, "kl": 0.008844375610351562, "learning_rate": 2.6901489017873375e-07, "loss": -0.0142, "reward": 1.265418291091919, "reward_std": 0.16339326091110706, "rewards/mrr_reward": 0.3403521776199341, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8247148245573044, "step": 940 }, { "clip_ratio": 0.0, "completion_length": 455.59375, "epoch": 0.7528, "grad_norm": 0.04365180432796478, "kl": 0.012082099914550781, "learning_rate": 2.6182502522871135e-07, "loss": 0.0047, "reward": 1.286356657743454, "reward_std": 0.17650778219103813, "rewards/mrr_reward": 0.35570436902344227, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8846114873886108, "step": 941 }, { "clip_ratio": 0.0, "completion_length": 481.078125, "epoch": 0.7536, "grad_norm": 0.042372170835733414, "kl": 0.008172988891601562, "learning_rate": 2.547312721342277e-07, "loss": 0.0007, "reward": 1.1792892515659332, "reward_std": 0.17659651907160878, "rewards/mrr_reward": 0.2534412369132042, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8602875918149948, "step": 942 }, { "clip_ratio": 0.0, "completion_length": 434.21875, "epoch": 0.7544, "grad_norm": 0.044003140181303024, "kl": 0.009439468383789062, "learning_rate": 2.4773370090776625e-07, "loss": -0.0062, "reward": 1.177651286125183, "reward_std": 0.18516015633940697, "rewards/mrr_reward": 0.2531622014939785, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8424975126981735, "step": 943 }, { "clip_ratio": 0.0, "completion_length": 458.828125, "epoch": 0.7552, "grad_norm": 0.039039552211761475, "kl": 0.0077571868896484375, "learning_rate": 2.4083238061252565e-07, "loss": -0.0039, "reward": 1.0639915466308594, "reward_std": 0.08116667065769434, "rewards/mrr_reward": 0.13606150820851326, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8333936333656311, "step": 944 }, { "clip_ratio": 0.0, "completion_length": 450.015625, "epoch": 0.756, "grad_norm": 0.037782639265060425, "kl": 0.009856224060058594, "learning_rate": 2.3402737936175423e-07, "loss": -0.0108, "reward": 1.1415889263153076, "reward_std": 0.17221311293542385, "rewards/mrr_reward": 0.23467881605029106, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8048529326915741, "step": 945 }, { "clip_ratio": 0.0, "completion_length": 467.765625, "epoch": 0.7568, "grad_norm": 0.039081498980522156, "kl": 0.010256767272949219, "learning_rate": 2.273187643180652e-07, "loss": -0.0199, "reward": 1.0599176287651062, "reward_std": 0.1111216451972723, "rewards/mrr_reward": 0.13382937014102936, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8844529241323471, "step": 946 }, { "clip_ratio": 0.0, "completion_length": 433.46875, "epoch": 0.7576, "grad_norm": 0.04526249319314957, "kl": 0.009111404418945312, "learning_rate": 2.2070660169278168e-07, "loss": -0.052, "reward": 1.1703499853610992, "reward_std": 0.17350439634174109, "rewards/mrr_reward": 0.2586867641657591, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7899595201015472, "step": 947 }, { "clip_ratio": 0.0, "completion_length": 443.46875, "epoch": 0.7584, "grad_norm": 0.04180796444416046, "kl": 0.008915901184082031, "learning_rate": 2.1419095674527934e-07, "loss": -0.0295, "reward": 1.2294658422470093, "reward_std": 0.1340288333594799, "rewards/mrr_reward": 0.307868305593729, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.8376415371894836, "step": 948 }, { "clip_ratio": 0.0, "completion_length": 457.765625, "epoch": 0.7592, "grad_norm": 0.0391828715801239, "kl": 0.008802413940429688, "learning_rate": 2.077718937823414e-07, "loss": -0.0093, "reward": 1.2045442461967468, "reward_std": 0.10182449035346508, "rewards/mrr_reward": 0.2698102742433548, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8735426217317581, "step": 949 }, { "clip_ratio": 0.0, "completion_length": 459.984375, "epoch": 0.76, "grad_norm": 0.04419267922639847, "kl": 0.010101318359375, "learning_rate": 2.014494761575314e-07, "loss": -0.0648, "reward": 1.0402642339468002, "reward_std": 0.14703384041786194, "rewards/mrr_reward": 0.1415922623127699, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7779356688261032, "step": 950 }, { "clip_ratio": 0.0, "completion_length": 445.109375, "epoch": 0.7608, "grad_norm": 0.04218144342303276, "kl": 0.008372306823730469, "learning_rate": 1.9522376627055585e-07, "loss": -0.0113, "reward": 1.2481240332126617, "reward_std": 0.21777066215872765, "rewards/mrr_reward": 0.3340773805975914, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8264787942171097, "step": 951 }, { "clip_ratio": 0.0, "completion_length": 451.953125, "epoch": 0.7616, "grad_norm": 0.042666662484407425, "kl": 0.0077667236328125, "learning_rate": 1.8909482556666026e-07, "loss": -0.0216, "reward": 1.1155872642993927, "reward_std": 0.14810610190033913, "rewards/mrr_reward": 0.198753722012043, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8407832533121109, "step": 952 }, { "clip_ratio": 0.0, "completion_length": 462.171875, "epoch": 0.7624, "grad_norm": 0.04106247425079346, "kl": 0.008396148681640625, "learning_rate": 1.8306271453601198e-07, "loss": 0.0195, "reward": 1.224888414144516, "reward_std": 0.10434846580028534, "rewards/mrr_reward": 0.2887338884174824, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8446441739797592, "step": 953 }, { "clip_ratio": 0.0, "completion_length": 471.140625, "epoch": 0.7632, "grad_norm": 0.038032591342926025, "kl": 0.007456779479980469, "learning_rate": 1.7712749271311392e-07, "loss": 0.0057, "reward": 1.2025950849056244, "reward_std": 0.14355502650141716, "rewards/mrr_reward": 0.2678571380674839, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8813671916723251, "step": 954 }, { "clip_ratio": 0.0, "completion_length": 451.640625, "epoch": 0.764, "grad_norm": 0.0445358082652092, "kl": 0.007924079895019531, "learning_rate": 1.7128921867620828e-07, "loss": -0.0353, "reward": 1.316545844078064, "reward_std": 0.17137969937175512, "rewards/mrr_reward": 0.38145462423563004, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.86095330119133, "step": 955 }, { "clip_ratio": 0.0, "completion_length": 443.875, "epoch": 0.7648, "grad_norm": 0.03883486986160278, "kl": 0.008409500122070312, "learning_rate": 1.6554795004670389e-07, "loss": -0.0121, "reward": 1.2443694174289703, "reward_std": 0.20190820842981339, "rewards/mrr_reward": 0.3092943839728832, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8687168508768082, "step": 956 }, { "clip_ratio": 0.0, "completion_length": 487.625, "epoch": 0.7656, "grad_norm": 0.04018738493323326, "kl": 0.009251594543457031, "learning_rate": 1.5990374348860304e-07, "loss": -0.0126, "reward": 1.1161698698997498, "reward_std": 0.1159959128126502, "rewards/mrr_reward": 0.18919890373945236, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8304871916770935, "step": 957 }, { "clip_ratio": 0.0, "completion_length": 451.46875, "epoch": 0.7664, "grad_norm": 0.04235168918967247, "kl": 0.010198593139648438, "learning_rate": 1.543566547079467e-07, "loss": -0.0157, "reward": 1.2352637648582458, "reward_std": 0.29030829295516014, "rewards/mrr_reward": 0.3278707917779684, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.808269277215004, "step": 958 }, { "clip_ratio": 0.0, "completion_length": 454.6875, "epoch": 0.7672, "grad_norm": 0.03733037784695625, "kl": 0.007913589477539062, "learning_rate": 1.4890673845226133e-07, "loss": -0.0042, "reward": 1.1977432668209076, "reward_std": 0.19426253903657198, "rewards/mrr_reward": 0.25998884066939354, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.9080862104892731, "step": 959 }, { "clip_ratio": 0.0, "completion_length": 440.8125, "epoch": 0.768, "grad_norm": 0.048545051366090775, "kl": 0.009515762329101562, "learning_rate": 1.4355404851001953e-07, "loss": -0.0343, "reward": 1.0536463111639023, "reward_std": 0.1689282413572073, "rewards/mrr_reward": 0.15678943321108818, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8271231651306152, "step": 960 }, { "clip_ratio": 0.0, "completion_length": 439.4375, "epoch": 0.7688, "grad_norm": 0.039646901190280914, "kl": 0.009168624877929688, "learning_rate": 1.3829863771011253e-07, "loss": -0.0126, "reward": 1.2258817553520203, "reward_std": 0.20934798568487167, "rewards/mrr_reward": 0.2947792708873749, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8625383079051971, "step": 961 }, { "clip_ratio": 0.0, "completion_length": 450.390625, "epoch": 0.7696, "grad_norm": 0.04106530547142029, "kl": 0.008655548095703125, "learning_rate": 1.3314055792131964e-07, "loss": -0.0274, "reward": 1.222478300333023, "reward_std": 0.1886294735595584, "rewards/mrr_reward": 0.3115823529660702, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.787634402513504, "step": 962 }, { "clip_ratio": 0.0, "completion_length": 425.890625, "epoch": 0.7704, "grad_norm": 0.04176384583115578, "kl": 0.009407997131347656, "learning_rate": 1.280798600518085e-07, "loss": -0.0046, "reward": 1.0859091877937317, "reward_std": 0.10481879487633705, "rewards/mrr_reward": 0.17180059850215912, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7836976647377014, "step": 963 }, { "clip_ratio": 0.0, "completion_length": 446.484375, "epoch": 0.7712, "grad_norm": 0.04281945526599884, "kl": 0.0075130462646484375, "learning_rate": 1.231165940486234e-07, "loss": -0.0033, "reward": 1.126029521226883, "reward_std": 0.15459425188601017, "rewards/mrr_reward": 0.18929191306233406, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8659424334764481, "step": 964 }, { "clip_ratio": 0.0, "completion_length": 478.9375, "epoch": 0.772, "grad_norm": 0.03905851021409035, "kl": 0.008356094360351562, "learning_rate": 1.1825080889719565e-07, "loss": 0.0076, "reward": 1.13689586520195, "reward_std": 0.14984166249632835, "rewards/mrr_reward": 0.2044580951333046, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8880689293146133, "step": 965 }, { "clip_ratio": 0.0, "completion_length": 452.53125, "epoch": 0.7728, "grad_norm": 0.041591085493564606, "kl": 0.011320114135742188, "learning_rate": 1.134825526208605e-07, "loss": -0.0267, "reward": 1.0707438588142395, "reward_std": 0.21261652931571007, "rewards/mrr_reward": 0.1851748488843441, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.81635482609272, "step": 966 }, { "clip_ratio": 0.0, "completion_length": 448.984375, "epoch": 0.7736, "grad_norm": 0.04075038060545921, "kl": 0.0076580047607421875, "learning_rate": 1.0881187228038214e-07, "loss": -0.0047, "reward": 1.211523026227951, "reward_std": 0.18444243492558599, "rewards/mrr_reward": 0.28766740672290325, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8562028259038925, "step": 967 }, { "clip_ratio": 0.0, "completion_length": 450.734375, "epoch": 0.7744, "grad_norm": 0.039919495582580566, "kl": 0.008312225341796875, "learning_rate": 1.0423881397349067e-07, "loss": -0.0307, "reward": 1.1101964116096497, "reward_std": 0.14763948507606983, "rewards/mrr_reward": 0.19312375597655773, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8219766318798065, "step": 968 }, { "clip_ratio": 0.0, "completion_length": 459.90625, "epoch": 0.7752, "grad_norm": 0.04152054339647293, "kl": 0.010423660278320312, "learning_rate": 9.976342283442464e-08, "loss": -0.0013, "reward": 1.1466363668441772, "reward_std": 0.12986961100250483, "rewards/mrr_reward": 0.23280629888176918, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.804338127374649, "step": 969 }, { "clip_ratio": 0.0, "completion_length": 441.828125, "epoch": 0.776, "grad_norm": 0.03675472363829613, "kl": 0.008668899536132812, "learning_rate": 9.538574303348813e-08, "loss": 0.0016, "reward": 1.2197460532188416, "reward_std": 0.21070226281881332, "rewards/mrr_reward": 0.33571428433060646, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.7491966933012009, "step": 970 }, { "clip_ratio": 0.0, "completion_length": 468.984375, "epoch": 0.7768, "grad_norm": 0.03766244277358055, "kl": 0.010084152221679688, "learning_rate": 9.110581777661331e-08, "loss": -0.0202, "reward": 1.0216215252876282, "reward_std": 0.1345711536705494, "rewards/mrr_reward": 0.1375496033579111, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.8000995367765427, "step": 971 }, { "clip_ratio": 0.0, "completion_length": 452.078125, "epoch": 0.7776, "grad_norm": 0.04581380635499954, "kl": 0.009756088256835938, "learning_rate": 8.692368930493522e-08, "loss": -0.031, "reward": 1.1670778393745422, "reward_std": 0.1815691478550434, "rewards/mrr_reward": 0.261966772377491, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8111199587583542, "step": 972 }, { "clip_ratio": 0.0, "completion_length": 448.34375, "epoch": 0.7784, "grad_norm": 0.04068039730191231, "kl": 0.008086204528808594, "learning_rate": 8.283939889437209e-08, "loss": -0.0219, "reward": 1.129480630159378, "reward_std": 0.15650018118321896, "rewards/mrr_reward": 0.20040303468704224, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8564022034406662, "step": 973 }, { "clip_ratio": 0.0, "completion_length": 474.3125, "epoch": 0.7792, "grad_norm": 0.04014911130070686, "kl": 0.008867263793945312, "learning_rate": 7.885298685522235e-08, "loss": -0.0152, "reward": 1.3073887526988983, "reward_std": 0.25805073603987694, "rewards/mrr_reward": 0.4035032168030739, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8406094461679459, "step": 974 }, { "clip_ratio": 0.0, "completion_length": 459.53125, "epoch": 0.78, "grad_norm": 0.040287163108587265, "kl": 0.007943153381347656, "learning_rate": 7.496449253176274e-08, "loss": -0.0055, "reward": 1.145872712135315, "reward_std": 0.16490156902000308, "rewards/mrr_reward": 0.221788190305233, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8549434095621109, "step": 975 }, { "clip_ratio": 0.0, "completion_length": 451.71875, "epoch": 0.7808, "grad_norm": 0.04375848174095154, "kl": 0.009169578552246094, "learning_rate": 7.117395430186414e-08, "loss": 0.0074, "reward": 1.293823391199112, "reward_std": 0.25591621547937393, "rewards/mrr_reward": 0.3684089817106724, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8667858242988586, "step": 976 }, { "clip_ratio": 0.0, "completion_length": 456.984375, "epoch": 0.7816, "grad_norm": 0.04306996241211891, "kl": 0.008794784545898438, "learning_rate": 6.748140957660632e-08, "loss": 0.0091, "reward": 1.1962977647781372, "reward_std": 0.23732590302824974, "rewards/mrr_reward": 0.30388763919472694, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.809741660952568, "step": 977 }, { "clip_ratio": 0.0, "completion_length": 442.34375, "epoch": 0.7824, "grad_norm": 0.04417388513684273, "kl": 0.009141921997070312, "learning_rate": 6.388689479991606e-08, "loss": -0.0188, "reward": 1.076131820678711, "reward_std": 0.09198620077222586, "rewards/mrr_reward": 0.14461186155676842, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8501312732696533, "step": 978 }, { "clip_ratio": 0.0, "completion_length": 459.109375, "epoch": 0.7832, "grad_norm": 0.04082540050148964, "kl": 0.0095977783203125, "learning_rate": 6.039044544820404e-08, "loss": -0.011, "reward": 1.1028445363044739, "reward_std": 0.0961092640645802, "rewards/mrr_reward": 0.17588666267693043, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8245881348848343, "step": 979 }, { "clip_ratio": 0.0, "completion_length": 443.75, "epoch": 0.784, "grad_norm": 0.04861390218138695, "kl": 0.009130477905273438, "learning_rate": 5.699209603001077e-08, "loss": 0.0231, "reward": 1.1513446420431137, "reward_std": 0.2505143228918314, "rewards/mrr_reward": 0.26168156415224075, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_overall_format_reward": 0.96875, "rewards/rank_think_format_reward": 0.8521986305713654, "step": 980 }, { "clip_ratio": 0.0, "completion_length": 465.140625, "epoch": 0.7848, "grad_norm": 0.04346904158592224, "kl": 0.008517265319824219, "learning_rate": 5.369188008567672e-08, "loss": -0.007, "reward": 1.0626238584518433, "reward_std": 0.10511020570993423, "rewards/mrr_reward": 0.16136533580720425, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.854133203625679, "step": 981 }, { "clip_ratio": 0.0, "completion_length": 433.734375, "epoch": 0.7856, "grad_norm": 0.04609828442335129, "kl": 0.009782791137695312, "learning_rate": 5.048983018699827e-08, "loss": -0.0588, "reward": 1.2032456696033478, "reward_std": 0.15944865625351667, "rewards/mrr_reward": 0.3129526190459728, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.752545177936554, "step": 982 }, { "clip_ratio": 0.0, "completion_length": 446.140625, "epoch": 0.7864, "grad_norm": 0.043732285499572754, "kl": 0.0076694488525390625, "learning_rate": 4.7385977936916796e-08, "loss": -0.0008, "reward": 1.090537279844284, "reward_std": 0.1588072832673788, "rewards/mrr_reward": 0.1814856231212616, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8582175672054291, "step": 983 }, { "clip_ratio": 0.0, "completion_length": 449.296875, "epoch": 0.7872, "grad_norm": 0.03897866606712341, "kl": 0.008481025695800781, "learning_rate": 4.438035396920004e-08, "loss": -0.0183, "reward": 1.0922786891460419, "reward_std": 0.13948439992964268, "rewards/mrr_reward": 0.17337549850344658, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8333830386400223, "step": 984 }, { "clip_ratio": 0.0, "completion_length": 444.328125, "epoch": 0.788, "grad_norm": 0.042856015264987946, "kl": 0.0099334716796875, "learning_rate": 4.147298794814347e-08, "loss": -0.0059, "reward": 1.162740409374237, "reward_std": 0.20458003506064415, "rewards/mrr_reward": 0.2671751044690609, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8173497021198273, "step": 985 }, { "clip_ratio": 0.0, "completion_length": 452.453125, "epoch": 0.7888, "grad_norm": 0.04340024292469025, "kl": 0.008185386657714844, "learning_rate": 3.866390856827495e-08, "loss": -0.0132, "reward": 1.1505849063396454, "reward_std": 0.18550662696361542, "rewards/mrr_reward": 0.22609126195311546, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8561831563711166, "step": 986 }, { "clip_ratio": 0.0, "completion_length": 445.046875, "epoch": 0.7896, "grad_norm": 0.040927499532699585, "kl": 0.008787155151367188, "learning_rate": 3.595314355407609e-08, "loss": -0.0091, "reward": 1.0734205096960068, "reward_std": 0.1213362361304462, "rewards/mrr_reward": 0.155629962682724, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.8651677072048187, "step": 987 }, { "clip_ratio": 0.0, "completion_length": 458.15625, "epoch": 0.7904, "grad_norm": 0.04325427487492561, "kl": 0.011075973510742188, "learning_rate": 3.3340719659701315e-08, "loss": -0.0347, "reward": 1.2632518410682678, "reward_std": 0.20907128229737282, "rewards/mrr_reward": 0.34834448993206024, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.823227733373642, "step": 988 }, { "clip_ratio": 0.0, "completion_length": 464.0, "epoch": 0.7912, "grad_norm": 0.040987979620695114, "kl": 0.0085906982421875, "learning_rate": 3.082666266872036e-08, "loss": -0.0128, "reward": 1.2261515259742737, "reward_std": 0.20394627377390862, "rewards/mrr_reward": 0.2976128365844488, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8684409856796265, "step": 989 }, { "clip_ratio": 0.0, "completion_length": 460.5, "epoch": 0.792, "grad_norm": 0.04282965511083603, "kl": 0.007916450500488281, "learning_rate": 2.8410997393860663e-08, "loss": -0.0384, "reward": 1.1509665548801422, "reward_std": 0.20199062675237656, "rewards/mrr_reward": 0.22928448393940926, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.833991602063179, "step": 990 }, { "clip_ratio": 0.0, "completion_length": 466.625, "epoch": 0.7928, "grad_norm": 0.04164193570613861, "kl": 0.008025169372558594, "learning_rate": 2.6093747676763093e-08, "loss": -0.0101, "reward": 1.2387970685958862, "reward_std": 0.13890931848436594, "rewards/mrr_reward": 0.3080791234970093, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8340292572975159, "step": 991 }, { "clip_ratio": 0.0, "completion_length": 449.5, "epoch": 0.7936, "grad_norm": 0.04225363954901695, "kl": 0.008797645568847656, "learning_rate": 2.3874936387747738e-08, "loss": -0.0029, "reward": 1.1117708086967468, "reward_std": 0.164918664842844, "rewards/mrr_reward": 0.21209697611629963, "rewards/rank_answer_foramt_reward": 0.927734375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7985499054193497, "step": 992 }, { "clip_ratio": 0.0, "completion_length": 456.203125, "epoch": 0.7944, "grad_norm": 0.046599503606557846, "kl": 0.00894927978515625, "learning_rate": 2.175458542558517e-08, "loss": 0.0412, "reward": 1.156593143939972, "reward_std": 0.1569316927343607, "rewards/mrr_reward": 0.24654637277126312, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.7987329065799713, "step": 993 }, { "clip_ratio": 0.0, "completion_length": 453.171875, "epoch": 0.7952, "grad_norm": 0.040313415229320526, "kl": 0.010005950927734375, "learning_rate": 1.973271571728441e-08, "loss": -0.0121, "reward": 1.200623333454132, "reward_std": 0.16222818940877914, "rewards/mrr_reward": 0.27439235523343086, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8341041952371597, "step": 994 }, { "clip_ratio": 0.0, "completion_length": 446.078125, "epoch": 0.796, "grad_norm": 0.03999168425798416, "kl": 0.008646011352539062, "learning_rate": 1.7809347217881966e-08, "loss": 0.0166, "reward": 1.1675262749195099, "reward_std": 0.12778384890407324, "rewards/mrr_reward": 0.2326450925320387, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8466451019048691, "step": 995 }, { "clip_ratio": 0.0, "completion_length": 446.65625, "epoch": 0.7968, "grad_norm": 0.03779391199350357, "kl": 0.0085906982421875, "learning_rate": 1.5984498910249778e-08, "loss": 0.0062, "reward": 1.2637392580509186, "reward_std": 0.1601163186132908, "rewards/mrr_reward": 0.34081100672483444, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_overall_format_reward": 0.984375, "rewards/rank_think_format_reward": 0.826048955321312, "step": 996 }, { "clip_ratio": 0.0, "completion_length": 433.125, "epoch": 0.7976, "grad_norm": 0.04796071723103523, "kl": 0.008829116821289062, "learning_rate": 1.425818880490315e-08, "loss": -0.0411, "reward": 1.2064868211746216, "reward_std": 0.15722107328474522, "rewards/mrr_reward": 0.28802703507244587, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_overall_format_reward": 1.0, "rewards/rank_think_format_reward": 0.8242269903421402, "step": 997 }, { "clip_ratio": 0.0, "completion_length": 465.390625, "epoch": 0.7984, "grad_norm": 0.04089636355638504, "kl": 0.0083160400390625, "learning_rate": 1.2630433939825326e-08, "loss": -0.0002, "reward": 1.2470715939998627, "reward_std": 0.30819156393408775, "rewards/mrr_reward": 0.3308655694127083, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.852553591132164, "step": 998 }, { "clip_ratio": 0.0, "completion_length": 476.1875, "epoch": 0.7992, "grad_norm": 0.03799648582935333, "kl": 0.0072174072265625, "learning_rate": 1.1101250380300965e-08, "loss": -0.0031, "reward": 1.127793937921524, "reward_std": 0.11800427921116352, "rewards/mrr_reward": 0.20063864439725876, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_overall_format_reward": 0.9921875, "rewards/rank_think_format_reward": 0.8447176665067673, "step": 999 }, { "clip_ratio": 0.0, "completion_length": 441.25, "epoch": 0.8, "grad_norm": 0.04348551481962204, "kl": 0.008482933044433594, "learning_rate": 9.670653218752935e-09, "loss": -0.0049, "reward": 1.0936516225337982, "reward_std": 0.18971389904618263, "rewards/mrr_reward": 0.19807788357138634, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_overall_format_reward": 0.9765625, "rewards/rank_think_format_reward": 0.7939379811286926, "step": 1000 }, { "epoch": 0.8, "step": 1000, "total_flos": 0.0, "train_loss": -0.006724746519234032, "train_runtime": 117510.8339, "train_samples_per_second": 0.545, "train_steps_per_second": 0.009 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }