diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.4008, + "epoch": 0.8, "eval_steps": 500, - "global_step": 501, + "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -8012,9 +8012,9 @@ "clip_ratio": 0.0, "completion_length": 475.90625, "epoch": 0.4008, - "grad_norm": 0.04245263710618019, + "grad_norm": 0.042455315589904785, "kl": 0.009504318237304688, - "learning_rate": 3.3340719659701315e-08, + "learning_rate": 1.0408293519785103e-05, "loss": 0.0258, "reward": 1.0929251462221146, "reward_std": 0.21385096292942762, @@ -8025,17 +8025,8001 @@ "step": 501 }, { - "epoch": 0.4008, - "step": 501, + "clip_ratio": 0.0, + "completion_length": 458.5, + "epoch": 0.4016, + "grad_norm": 0.03798682987689972, + "kl": 0.008695602416992188, + "learning_rate": 1.0376901826699349e-05, + "loss": 0.0227, + "reward": 1.2380023002624512, + "reward_std": 0.1255473867058754, + "rewards/mrr_reward": 0.30468748323619366, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8555702418088913, + "step": 502 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.65625, + "epoch": 0.4024, + "grad_norm": 0.037431634962558746, + "kl": 0.0093994140625, + "learning_rate": 1.0345506413744726e-05, + "loss": -0.0127, + "reward": 1.179205298423767, + "reward_std": 0.12070270255208015, + "rewards/mrr_reward": 0.2780816126614809, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8146621286869049, + "step": 503 + }, + { + "clip_ratio": 0.0, + "completion_length": 498.25, + "epoch": 0.4032, + "grad_norm": 0.038511723279953, + "kl": 0.009913444519042969, + "learning_rate": 1.0314107590781284e-05, + "loss": -0.0139, + "reward": 1.0198038667440414, + "reward_std": 0.16722827591001987, + "rewards/mrr_reward": 0.12537202425301075, + "rewards/rank_answer_foramt_reward": 0.859375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8510243445634842, + "step": 504 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.828125, + "epoch": 0.404, + "grad_norm": 0.039540309458971024, + "kl": 0.009187698364257812, + "learning_rate": 1.0282705667702734e-05, + "loss": -0.0003, + "reward": 1.3496000170707703, + "reward_std": 0.24421941116452217, + "rewards/mrr_reward": 0.4431547671556473, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8014911860227585, + "step": 505 + }, + { + "clip_ratio": 0.0, + "completion_length": 471.28125, + "epoch": 0.4048, + "grad_norm": 0.0396139994263649, + "kl": 0.008319854736328125, + "learning_rate": 1.0251300954433377e-05, + "loss": -0.0217, + "reward": 1.1254372000694275, + "reward_std": 0.1360313263721764, + "rewards/mrr_reward": 0.19592014141380787, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.838202714920044, + "step": 506 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.1875, + "epoch": 0.4056, + "grad_norm": 0.04718116670846939, + "kl": 0.009462356567382812, + "learning_rate": 1.0219893760925053e-05, + "loss": -0.0408, + "reward": 1.112642079591751, + "reward_std": 0.19297415390610695, + "rewards/mrr_reward": 0.23144841380417347, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7562213093042374, + "step": 507 + }, + { + "clip_ratio": 0.0, + "completion_length": 483.390625, + "epoch": 0.4064, + "grad_norm": 0.035113196820020676, + "kl": 0.008016586303710938, + "learning_rate": 1.0188484397154083e-05, + "loss": 0.005, + "reward": 1.0684494376182556, + "reward_std": 0.23044782131910324, + "rewards/mrr_reward": 0.18250869028270245, + "rewards/rank_answer_foramt_reward": 0.88671875, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.829199954867363, + "step": 508 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.25, + "epoch": 0.4072, + "grad_norm": 0.04335900396108627, + "kl": 0.009159088134765625, + "learning_rate": 1.0157073173118207e-05, + "loss": -0.0511, + "reward": 1.1197124123573303, + "reward_std": 0.21612372063100338, + "rewards/mrr_reward": 0.22139137983322144, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8042160123586655, + "step": 509 + }, + { + "clip_ratio": 0.0, + "completion_length": 478.984375, + "epoch": 0.408, + "grad_norm": 0.037360094487667084, + "kl": 0.009061813354492188, + "learning_rate": 1.0125660398833528e-05, + "loss": -0.0163, + "reward": 1.2794201076030731, + "reward_std": 0.19604488089680672, + "rewards/mrr_reward": 0.3479786738753319, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.86356520652771, + "step": 510 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.234375, + "epoch": 0.4088, + "grad_norm": 0.03932580351829529, + "kl": 0.008629798889160156, + "learning_rate": 1.0094246384331444e-05, + "loss": -0.025, + "reward": 1.1660117506980896, + "reward_std": 0.20460292138159275, + "rewards/mrr_reward": 0.2748697977513075, + "rewards/rank_answer_foramt_reward": 0.927734375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7726956307888031, + "step": 511 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.53125, + "epoch": 0.4096, + "grad_norm": 0.04939593747258186, + "kl": 0.008899688720703125, + "learning_rate": 1.0062831439655591e-05, + "loss": 0.005, + "reward": 1.1041904985904694, + "reward_std": 0.15664683748036623, + "rewards/mrr_reward": 0.19107143580913544, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8471054881811142, + "step": 512 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.34375, + "epoch": 0.4104, + "grad_norm": 0.03681405261158943, + "kl": 0.007862091064453125, + "learning_rate": 1.0031415874858796e-05, + "loss": -0.0162, + "reward": 1.1272632777690887, + "reward_std": 0.15625868551433086, + "rewards/mrr_reward": 0.20879836566746235, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.8496331721544266, + "step": 513 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.375, + "epoch": 0.4112, + "grad_norm": 0.03907299041748047, + "kl": 0.008602142333984375, + "learning_rate": 1e-05, + "loss": -0.0077, + "reward": 1.1561536490917206, + "reward_std": 0.15583913400769234, + "rewards/mrr_reward": 0.24771205335855484, + "rewards/rank_answer_foramt_reward": 0.8984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8622283041477203, + "step": 514 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.21875, + "epoch": 0.412, + "grad_norm": 0.043046656996011734, + "kl": 0.010164260864257812, + "learning_rate": 9.968584125141206e-06, + "loss": -0.0261, + "reward": 1.0769396722316742, + "reward_std": 0.2242715172469616, + "rewards/mrr_reward": 0.20158729702234268, + "rewards/rank_answer_foramt_reward": 0.84765625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8049266040325165, + "step": 515 + }, + { + "clip_ratio": 0.0, + "completion_length": 474.9375, + "epoch": 0.4128, + "grad_norm": 0.03716719150543213, + "kl": 0.007334709167480469, + "learning_rate": 9.937168560344412e-06, + "loss": -0.0037, + "reward": 1.192034512758255, + "reward_std": 0.18218636699020863, + "rewards/mrr_reward": 0.25520213693380356, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8388857245445251, + "step": 516 + }, + { + "clip_ratio": 0.0, + "completion_length": 471.3125, + "epoch": 0.4136, + "grad_norm": 0.03868447244167328, + "kl": 0.0097503662109375, + "learning_rate": 9.905753615668561e-06, + "loss": -0.0471, + "reward": 1.096491515636444, + "reward_std": 0.14279304654337466, + "rewards/mrr_reward": 0.1803695447742939, + "rewards/rank_answer_foramt_reward": 0.953125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8308144211769104, + "step": 517 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.125, + "epoch": 0.4144, + "grad_norm": 0.03858815133571625, + "kl": 0.00782012939453125, + "learning_rate": 9.874339601166474e-06, + "loss": -0.0198, + "reward": 1.2538862526416779, + "reward_std": 0.2349272146821022, + "rewards/mrr_reward": 0.35144468769431114, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8264680653810501, + "step": 518 + }, + { + "clip_ratio": 0.0, + "completion_length": 476.1875, + "epoch": 0.4152, + "grad_norm": 0.04030821472406387, + "kl": 0.007167816162109375, + "learning_rate": 9.842926826881796e-06, + "loss": -0.0117, + "reward": 1.15766641497612, + "reward_std": 0.20966706797480583, + "rewards/mrr_reward": 0.251940730959177, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8149357289075851, + "step": 519 + }, + { + "clip_ratio": 0.0, + "completion_length": 483.328125, + "epoch": 0.416, + "grad_norm": 0.03588611260056496, + "kl": 0.007293701171875, + "learning_rate": 9.81151560284592e-06, + "loss": -0.0062, + "reward": 1.1331438720226288, + "reward_std": 0.18311648909002542, + "rewards/mrr_reward": 0.21963665634393692, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8111724406480789, + "step": 520 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.6875, + "epoch": 0.4168, + "grad_norm": 0.039640478789806366, + "kl": 0.008254051208496094, + "learning_rate": 9.78010623907495e-06, + "loss": -0.0318, + "reward": 1.0459920465946198, + "reward_std": 0.08210169477388263, + "rewards/mrr_reward": 0.11974826268851757, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8556275069713593, + "step": 521 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.234375, + "epoch": 0.4176, + "grad_norm": 0.041469115763902664, + "kl": 0.008711814880371094, + "learning_rate": 9.748699045566626e-06, + "loss": -0.0294, + "reward": 1.1578961312770844, + "reward_std": 0.18251374177634716, + "rewards/mrr_reward": 0.2508494593203068, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.818938821554184, + "step": 522 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.65625, + "epoch": 0.4184, + "grad_norm": 0.03994135931134224, + "kl": 0.010793685913085938, + "learning_rate": 9.717294332297269e-06, + "loss": -0.0525, + "reward": 1.147641807794571, + "reward_std": 0.2086065262556076, + "rewards/mrr_reward": 0.25408606603741646, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7702446281909943, + "step": 523 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.5, + "epoch": 0.4192, + "grad_norm": 0.04150420427322388, + "kl": 0.008188247680664062, + "learning_rate": 9.685892409218718e-06, + "loss": 0.0358, + "reward": 1.186688095331192, + "reward_std": 0.18247143924236298, + "rewards/mrr_reward": 0.29063739627599716, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.812961220741272, + "step": 524 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.0625, + "epoch": 0.42, + "grad_norm": 0.04004862159490585, + "kl": 0.00971221923828125, + "learning_rate": 9.654493586255279e-06, + "loss": -0.0507, + "reward": 1.134672150015831, + "reward_std": 0.20101629197597504, + "rewards/mrr_reward": 0.23550966568291187, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8067660629749298, + "step": 525 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.03125, + "epoch": 0.4208, + "grad_norm": 0.041226062923669815, + "kl": 0.008754730224609375, + "learning_rate": 9.623098173300655e-06, + "loss": 0.0024, + "reward": 1.1712295413017273, + "reward_std": 0.17994631733745337, + "rewards/mrr_reward": 0.2511904891580343, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8231534212827682, + "step": 526 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.171875, + "epoch": 0.4216, + "grad_norm": 0.03775149956345558, + "kl": 0.012889862060546875, + "learning_rate": 9.5917064802149e-06, + "loss": -0.034, + "reward": 1.1483985781669617, + "reward_std": 0.12394142523407936, + "rewards/mrr_reward": 0.21857019513845444, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8528178334236145, + "step": 527 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.875, + "epoch": 0.4224, + "grad_norm": 0.03994860127568245, + "kl": 0.008665084838867188, + "learning_rate": 9.560318816821354e-06, + "loss": -0.0014, + "reward": 1.161629170179367, + "reward_std": 0.11074419878423214, + "rewards/mrr_reward": 0.23227927647531033, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8572273999452591, + "step": 528 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.328125, + "epoch": 0.4232, + "grad_norm": 0.037875980138778687, + "kl": 0.008001327514648438, + "learning_rate": 9.528935492903575e-06, + "loss": -0.0433, + "reward": 1.1144822239875793, + "reward_std": 0.15936855971813202, + "rewards/mrr_reward": 0.19868551194667816, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8024852275848389, + "step": 529 + }, + { + "clip_ratio": 0.0, + "completion_length": 473.328125, + "epoch": 0.424, + "grad_norm": 0.038447290658950806, + "kl": 0.008828163146972656, + "learning_rate": 9.497556818202306e-06, + "loss": -0.029, + "reward": 1.1741737723350525, + "reward_std": 0.09742028824985027, + "rewards/mrr_reward": 0.25808530673384666, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8033694326877594, + "step": 530 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.28125, + "epoch": 0.4248, + "grad_norm": 0.036154162138700485, + "kl": 0.008661270141601562, + "learning_rate": 9.466183102412397e-06, + "loss": 0.0113, + "reward": 1.1091251969337463, + "reward_std": 0.19129342585802078, + "rewards/mrr_reward": 0.19771825522184372, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8184796869754791, + "step": 531 + }, + { + "clip_ratio": 0.0, + "completion_length": 477.328125, + "epoch": 0.4256, + "grad_norm": 0.037424515932798386, + "kl": 0.0073680877685546875, + "learning_rate": 9.434814655179756e-06, + "loss": 0.0312, + "reward": 1.1813635230064392, + "reward_std": 0.1663509365171194, + "rewards/mrr_reward": 0.2525049652904272, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8440196663141251, + "step": 532 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.5, + "epoch": 0.4264, + "grad_norm": 0.03965657949447632, + "kl": 0.00954437255859375, + "learning_rate": 9.403451786098295e-06, + "loss": -0.0287, + "reward": 1.1262851357460022, + "reward_std": 0.24533293023705482, + "rewards/mrr_reward": 0.2629836406558752, + "rewards/rank_answer_foramt_reward": 0.84765625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.776221290230751, + "step": 533 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.9375, + "epoch": 0.4272, + "grad_norm": 0.043141767382621765, + "kl": 0.01007843017578125, + "learning_rate": 9.372094804706867e-06, + "loss": 0.0115, + "reward": 1.135538101196289, + "reward_std": 0.17273031920194626, + "rewards/mrr_reward": 0.24095982685685158, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8084994107484818, + "step": 534 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.828125, + "epoch": 0.428, + "grad_norm": 0.035875458270311356, + "kl": 0.007419586181640625, + "learning_rate": 9.340744020486223e-06, + "loss": 0.0009, + "reward": 1.1043555736541748, + "reward_std": 0.09360181912779808, + "rewards/mrr_reward": 0.17147817462682724, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8269011080265045, + "step": 535 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.3125, + "epoch": 0.4288, + "grad_norm": 0.035530660301446915, + "kl": 0.009695053100585938, + "learning_rate": 9.309399742855943e-06, + "loss": -0.0219, + "reward": 1.0510781407356262, + "reward_std": 0.10896967723965645, + "rewards/mrr_reward": 0.1329303029924631, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8467190861701965, + "step": 536 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.71875, + "epoch": 0.4296, + "grad_norm": 0.040476907044649124, + "kl": 0.008740425109863281, + "learning_rate": 9.278062281171394e-06, + "loss": -0.0039, + "reward": 1.104703813791275, + "reward_std": 0.21153176575899124, + "rewards/mrr_reward": 0.20956101268529892, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8023973703384399, + "step": 537 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.265625, + "epoch": 0.4304, + "grad_norm": 0.04103953763842583, + "kl": 0.007740974426269531, + "learning_rate": 9.246731944720675e-06, + "loss": -0.0205, + "reward": 1.0625648498535156, + "reward_std": 0.09056078270077705, + "rewards/mrr_reward": 0.1506386436522007, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8005219399929047, + "step": 538 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.9375, + "epoch": 0.4312, + "grad_norm": 0.040170818567276, + "kl": 0.007616996765136719, + "learning_rate": 9.215409042721553e-06, + "loss": -0.0218, + "reward": 1.1552115380764008, + "reward_std": 0.10027447901666164, + "rewards/mrr_reward": 0.23503224551677704, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8294375985860825, + "step": 539 + }, + { + "clip_ratio": 0.0, + "completion_length": 474.53125, + "epoch": 0.432, + "grad_norm": 0.042113546282052994, + "kl": 0.00826263427734375, + "learning_rate": 9.184093884318426e-06, + "loss": 0.0013, + "reward": 1.070628046989441, + "reward_std": 0.11641362123191357, + "rewards/mrr_reward": 0.14624256640672684, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8773399144411087, + "step": 540 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.984375, + "epoch": 0.4328, + "grad_norm": 0.03919651731848717, + "kl": 0.008460044860839844, + "learning_rate": 9.152786778579266e-06, + "loss": -0.0144, + "reward": 1.205646112561226, + "reward_std": 0.24700526893138885, + "rewards/mrr_reward": 0.31289682909846306, + "rewards/rank_answer_foramt_reward": 0.876953125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8283476531505585, + "step": 541 + }, + { + "clip_ratio": 0.0, + "completion_length": 472.078125, + "epoch": 0.4336, + "grad_norm": 0.04152778908610344, + "kl": 0.008693695068359375, + "learning_rate": 9.121488034492569e-06, + "loss": 0.006, + "reward": 1.1138120293617249, + "reward_std": 0.19620881974697113, + "rewards/mrr_reward": 0.2173549085855484, + "rewards/rank_answer_foramt_reward": 0.923828125, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.8239585608243942, + "step": 542 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.3125, + "epoch": 0.4344, + "grad_norm": 0.03831581026315689, + "kl": 0.008653640747070312, + "learning_rate": 9.090197960964301e-06, + "loss": -0.0178, + "reward": 1.043550044298172, + "reward_std": 0.12762367445975542, + "rewards/mrr_reward": 0.14636037312448025, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7812563478946686, + "step": 543 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.203125, + "epoch": 0.4352, + "grad_norm": 0.04002196714282036, + "kl": 0.0078277587890625, + "learning_rate": 9.058916866814857e-06, + "loss": -0.0065, + "reward": 1.2575414180755615, + "reward_std": 0.13060189969837666, + "rewards/mrr_reward": 0.3513020761311054, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8106326907873154, + "step": 544 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.171875, + "epoch": 0.436, + "grad_norm": 0.039209943264722824, + "kl": 0.00780487060546875, + "learning_rate": 9.027645060776008e-06, + "loss": -0.0084, + "reward": 1.1186776161193848, + "reward_std": 0.14777595922350883, + "rewards/mrr_reward": 0.22275545448064804, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8281967639923096, + "step": 545 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.515625, + "epoch": 0.4368, + "grad_norm": 0.04363907501101494, + "kl": 0.008817672729492188, + "learning_rate": 8.996382851487851e-06, + "loss": -0.0523, + "reward": 1.1956347823143005, + "reward_std": 0.21989241987466812, + "rewards/mrr_reward": 0.30300718545913696, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7596193552017212, + "step": 546 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.03125, + "epoch": 0.4376, + "grad_norm": 0.03548242896795273, + "kl": 0.0070171356201171875, + "learning_rate": 8.965130547495777e-06, + "loss": -0.0023, + "reward": 1.1320178806781769, + "reward_std": 0.11437905207276344, + "rewards/mrr_reward": 0.20006200671195984, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8319211304187775, + "step": 547 + }, + { + "clip_ratio": 0.0, + "completion_length": 482.28125, + "epoch": 0.4384, + "grad_norm": 0.03946579992771149, + "kl": 0.008230209350585938, + "learning_rate": 8.933888457247402e-06, + "loss": 0.0214, + "reward": 1.1079612374305725, + "reward_std": 0.21605945192277431, + "rewards/mrr_reward": 0.21192336827516556, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.8383130878210068, + "step": 548 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.796875, + "epoch": 0.4392, + "grad_norm": 0.03934666886925697, + "kl": 0.007843017578125, + "learning_rate": 8.902656889089548e-06, + "loss": -0.0018, + "reward": 1.1603901982307434, + "reward_std": 0.18869752623140812, + "rewards/mrr_reward": 0.23634053207933903, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8274939954280853, + "step": 549 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.234375, + "epoch": 0.44, + "grad_norm": 0.03816671669483185, + "kl": 0.008211135864257812, + "learning_rate": 8.871436151265183e-06, + "loss": -0.0658, + "reward": 1.1312854886054993, + "reward_std": 0.1634649522602558, + "rewards/mrr_reward": 0.22075272910296917, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.79434634745121, + "step": 550 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.03125, + "epoch": 0.4408, + "grad_norm": 0.049642350524663925, + "kl": 0.008757591247558594, + "learning_rate": 8.840226551910387e-06, + "loss": -0.0436, + "reward": 1.0978001356124878, + "reward_std": 0.1608146745711565, + "rewards/mrr_reward": 0.18125620111823082, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8184214979410172, + "step": 551 + }, + { + "clip_ratio": 0.0, + "completion_length": 469.171875, + "epoch": 0.4416, + "grad_norm": 0.03793758898973465, + "kl": 0.008665084838867188, + "learning_rate": 8.809028399051302e-06, + "loss": 0.0105, + "reward": 1.1738522052764893, + "reward_std": 0.10969773586839437, + "rewards/mrr_reward": 0.242367310449481, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8773688822984695, + "step": 552 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.40625, + "epoch": 0.4424, + "grad_norm": 0.039615653455257416, + "kl": 0.008930206298828125, + "learning_rate": 8.777842000601106e-06, + "loss": 0.0215, + "reward": 1.2303386330604553, + "reward_std": 0.22473370283842087, + "rewards/mrr_reward": 0.32853423431515694, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7678967267274857, + "step": 553 + }, + { + "clip_ratio": 0.0, + "completion_length": 483.265625, + "epoch": 0.4432, + "grad_norm": 0.039541710168123245, + "kl": 0.007976531982421875, + "learning_rate": 8.746667664356957e-06, + "loss": 0.0074, + "reward": 1.1821573972702026, + "reward_std": 0.13846352836117148, + "rewards/mrr_reward": 0.2504030168056488, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8742790371179581, + "step": 554 + }, + { + "clip_ratio": 0.0, + "completion_length": 493.140625, + "epoch": 0.444, + "grad_norm": 0.03958950564265251, + "kl": 0.007397651672363281, + "learning_rate": 8.715505697996972e-06, + "loss": -0.0001, + "reward": 1.1536555588245392, + "reward_std": 0.1938557531684637, + "rewards/mrr_reward": 0.23748759925365448, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8407195657491684, + "step": 555 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.53125, + "epoch": 0.4448, + "grad_norm": 0.04035869613289833, + "kl": 0.0110321044921875, + "learning_rate": 8.684356409077177e-06, + "loss": -0.0185, + "reward": 1.1211222410202026, + "reward_std": 0.15442631393671036, + "rewards/mrr_reward": 0.20714285783469677, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8067436665296555, + "step": 556 + }, + { + "clip_ratio": 0.0, + "completion_length": 469.671875, + "epoch": 0.4456, + "grad_norm": 0.04031984135508537, + "kl": 0.007228851318359375, + "learning_rate": 8.653220105028476e-06, + "loss": -0.0004, + "reward": 1.3500173687934875, + "reward_std": 0.16067894361913204, + "rewards/mrr_reward": 0.3977244533598423, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.885735884308815, + "step": 557 + }, + { + "clip_ratio": 0.0, + "completion_length": 475.40625, + "epoch": 0.4464, + "grad_norm": 0.03806934878230095, + "kl": 0.008632659912109375, + "learning_rate": 8.62209709315362e-06, + "loss": -0.028, + "reward": 1.1859273612499237, + "reward_std": 0.16438952274620533, + "rewards/mrr_reward": 0.2831225246191025, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7904596626758575, + "step": 558 + }, + { + "clip_ratio": 0.0, + "completion_length": 471.609375, + "epoch": 0.4472, + "grad_norm": 0.03851349279284477, + "kl": 0.00989532470703125, + "learning_rate": 8.590987680624174e-06, + "loss": -0.0014, + "reward": 1.124581664800644, + "reward_std": 0.14083863236010075, + "rewards/mrr_reward": 0.23160961642861366, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8114445358514786, + "step": 559 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.796875, + "epoch": 0.448, + "grad_norm": 0.03883276879787445, + "kl": 0.009004592895507812, + "learning_rate": 8.559892174477478e-06, + "loss": -0.0232, + "reward": 1.2451259791851044, + "reward_std": 0.20462225936353207, + "rewards/mrr_reward": 0.3293340802192688, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8591111749410629, + "step": 560 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.71875, + "epoch": 0.4488, + "grad_norm": 0.041181642562150955, + "kl": 0.008396148681640625, + "learning_rate": 8.528810881613626e-06, + "loss": -0.0414, + "reward": 1.1230032444000244, + "reward_std": 0.16564571298658848, + "rewards/mrr_reward": 0.22324529103934765, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7968515604734421, + "step": 561 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.421875, + "epoch": 0.4496, + "grad_norm": 0.03920190408825874, + "kl": 0.008289337158203125, + "learning_rate": 8.49774410879243e-06, + "loss": 0.0054, + "reward": 1.0872045755386353, + "reward_std": 0.12622703425586224, + "rewards/mrr_reward": 0.16274181567132473, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8580428808927536, + "step": 562 + }, + { + "clip_ratio": 0.0, + "completion_length": 481.296875, + "epoch": 0.4504, + "grad_norm": 0.03609956428408623, + "kl": 0.007109642028808594, + "learning_rate": 8.466692162630393e-06, + "loss": 0.0235, + "reward": 1.2699479758739471, + "reward_std": 0.1954438053071499, + "rewards/mrr_reward": 0.3503100275993347, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8766252249479294, + "step": 563 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.921875, + "epoch": 0.4512, + "grad_norm": 0.04106234386563301, + "kl": 0.008481979370117188, + "learning_rate": 8.43565534959769e-06, + "loss": -0.0142, + "reward": 1.0173709094524384, + "reward_std": 0.1483047273941338, + "rewards/mrr_reward": 0.12945188395678997, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8234761506319046, + "step": 564 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.90625, + "epoch": 0.452, + "grad_norm": 0.03835194930434227, + "kl": 0.00774383544921875, + "learning_rate": 8.404633976015136e-06, + "loss": -0.0165, + "reward": 1.1766445636749268, + "reward_std": 0.12921269796788692, + "rewards/mrr_reward": 0.2505580559372902, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8278071284294128, + "step": 565 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.046875, + "epoch": 0.4528, + "grad_norm": 0.03958544135093689, + "kl": 0.008856773376464844, + "learning_rate": 8.373628348051165e-06, + "loss": 0.0131, + "reward": 1.1318785846233368, + "reward_std": 0.11855714162811637, + "rewards/mrr_reward": 0.22342510148882866, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8310142606496811, + "step": 566 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.546875, + "epoch": 0.4536, + "grad_norm": 0.04320928454399109, + "kl": 0.008389472961425781, + "learning_rate": 8.342638771718804e-06, + "loss": 0.0107, + "reward": 1.128523826599121, + "reward_std": 0.16640270128846169, + "rewards/mrr_reward": 0.21039807423949242, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8583710491657257, + "step": 567 + }, + { + "clip_ratio": 0.0, + "completion_length": 466.953125, + "epoch": 0.4544, + "grad_norm": 0.04389907792210579, + "kl": 0.009497642517089844, + "learning_rate": 8.311665552872662e-06, + "loss": -0.0266, + "reward": 1.0887642204761505, + "reward_std": 0.18486547190696, + "rewards/mrr_reward": 0.18142360635101795, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.831548199057579, + "step": 568 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.40625, + "epoch": 0.4552, + "grad_norm": 0.04440563917160034, + "kl": 0.008943557739257812, + "learning_rate": 8.280708997205904e-06, + "loss": -0.0072, + "reward": 1.2277150750160217, + "reward_std": 0.24825106747448444, + "rewards/mrr_reward": 0.36322543025016785, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7173216342926025, + "step": 569 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.21875, + "epoch": 0.456, + "grad_norm": 0.0392410084605217, + "kl": 0.009092330932617188, + "learning_rate": 8.249769410247239e-06, + "loss": 0.0086, + "reward": 1.204152375459671, + "reward_std": 0.21110957488417625, + "rewards/mrr_reward": 0.2886160612106323, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8290398269891739, + "step": 570 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.046875, + "epoch": 0.4568, + "grad_norm": 0.0392969585955143, + "kl": 0.007068634033203125, + "learning_rate": 8.218847097357898e-06, + "loss": -0.0417, + "reward": 1.1151992976665497, + "reward_std": 0.12983771739527583, + "rewards/mrr_reward": 0.17941468209028244, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.849382758140564, + "step": 571 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.65625, + "epoch": 0.4576, + "grad_norm": 0.037597186863422394, + "kl": 0.007890701293945312, + "learning_rate": 8.187942363728626e-06, + "loss": -0.0264, + "reward": 1.0076032876968384, + "reward_std": 0.11738604307174683, + "rewards/mrr_reward": 0.10858755186200142, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.827805757522583, + "step": 572 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.921875, + "epoch": 0.4584, + "grad_norm": 0.041158709675073624, + "kl": 0.009695053100585938, + "learning_rate": 8.157055514376667e-06, + "loss": -0.0447, + "reward": 1.2002838253974915, + "reward_std": 0.1709437482059002, + "rewards/mrr_reward": 0.31932043842971325, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.6910702735185623, + "step": 573 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.96875, + "epoch": 0.4592, + "grad_norm": 0.040003228932619095, + "kl": 0.009104728698730469, + "learning_rate": 8.126186854142752e-06, + "loss": -0.0428, + "reward": 1.1452887952327728, + "reward_std": 0.1888815239071846, + "rewards/mrr_reward": 0.24434524402022362, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8277881443500519, + "step": 574 + }, + { + "clip_ratio": 0.0, + "completion_length": 478.0, + "epoch": 0.46, + "grad_norm": 0.03768211975693703, + "kl": 0.006785392761230469, + "learning_rate": 8.095336687688102e-06, + "loss": 0.004, + "reward": 1.2456986904144287, + "reward_std": 0.1416485607624054, + "rewards/mrr_reward": 0.31855158507823944, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8505522757768631, + "step": 575 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.8125, + "epoch": 0.4608, + "grad_norm": 0.04362956061959267, + "kl": 0.007818222045898438, + "learning_rate": 8.064505319491398e-06, + "loss": -0.0028, + "reward": 1.2511990666389465, + "reward_std": 0.19724944420158863, + "rewards/mrr_reward": 0.3255952559411526, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8400160819292068, + "step": 576 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.796875, + "epoch": 0.4616, + "grad_norm": 0.040099550038576126, + "kl": 0.009653091430664062, + "learning_rate": 8.033693053845801e-06, + "loss": -0.0472, + "reward": 1.159771978855133, + "reward_std": 0.1415103916078806, + "rewards/mrr_reward": 0.22492559626698494, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.854352131485939, + "step": 577 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.546875, + "epoch": 0.4624, + "grad_norm": 0.038695842027664185, + "kl": 0.008846282958984375, + "learning_rate": 8.00290019485593e-06, + "loss": -0.0166, + "reward": 1.1117965877056122, + "reward_std": 0.16431541368365288, + "rewards/mrr_reward": 0.2048921138048172, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8028827607631683, + "step": 578 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.109375, + "epoch": 0.4632, + "grad_norm": 0.04034760221838951, + "kl": 0.00905609130859375, + "learning_rate": 7.972127046434878e-06, + "loss": -0.014, + "reward": 1.1206817924976349, + "reward_std": 0.21675598435103893, + "rewards/mrr_reward": 0.23121900483965874, + "rewards/rank_answer_foramt_reward": 0.876953125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8183884024620056, + "step": 579 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.96875, + "epoch": 0.464, + "grad_norm": 0.042170315980911255, + "kl": 0.010354995727539062, + "learning_rate": 7.94137391230119e-06, + "loss": -0.0259, + "reward": 1.1011703163385391, + "reward_std": 0.24708281457424164, + "rewards/mrr_reward": 0.2212301604449749, + "rewards/rank_answer_foramt_reward": 0.845703125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8207820504903793, + "step": 580 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.421875, + "epoch": 0.4648, + "grad_norm": 0.03817920386791229, + "kl": 0.008044242858886719, + "learning_rate": 7.910641095975886e-06, + "loss": 0.0116, + "reward": 1.1385302245616913, + "reward_std": 0.11444682255387306, + "rewards/mrr_reward": 0.20651661977171898, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8516274094581604, + "step": 581 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.46875, + "epoch": 0.4656, + "grad_norm": 0.039664629846811295, + "kl": 0.007870674133300781, + "learning_rate": 7.879928900779457e-06, + "loss": -0.0641, + "reward": 1.178428053855896, + "reward_std": 0.17734414339065552, + "rewards/mrr_reward": 0.2622581832110882, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7762722671031952, + "step": 582 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.5625, + "epoch": 0.4664, + "grad_norm": 0.043783195316791534, + "kl": 0.008320808410644531, + "learning_rate": 7.84923762982887e-06, + "loss": 0.0012, + "reward": 1.0347242206335068, + "reward_std": 0.17706408351659775, + "rewards/mrr_reward": 0.13465401716530323, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8173290193080902, + "step": 583 + }, + { + "clip_ratio": 0.0, + "completion_length": 483.40625, + "epoch": 0.4672, + "grad_norm": 0.040662799030542374, + "kl": 0.008103370666503906, + "learning_rate": 7.818567586034578e-06, + "loss": -0.0064, + "reward": 1.1952064037322998, + "reward_std": 0.21328644640743732, + "rewards/mrr_reward": 0.2867993488907814, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8425921499729156, + "step": 584 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.140625, + "epoch": 0.468, + "grad_norm": 0.038001302629709244, + "kl": 0.00820159912109375, + "learning_rate": 7.787919072097531e-06, + "loss": -0.0179, + "reward": 1.2342975735664368, + "reward_std": 0.26813187077641487, + "rewards/mrr_reward": 0.32374753057956696, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8217423409223557, + "step": 585 + }, + { + "clip_ratio": 0.0, + "completion_length": 474.84375, + "epoch": 0.4688, + "grad_norm": 0.038148414343595505, + "kl": 0.008083343505859375, + "learning_rate": 7.757292390506191e-06, + "loss": -0.0098, + "reward": 1.0755177438259125, + "reward_std": 0.19383916724473238, + "rewards/mrr_reward": 0.1725260429084301, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8359476029872894, + "step": 586 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.640625, + "epoch": 0.4696, + "grad_norm": 0.04210152477025986, + "kl": 0.009115219116210938, + "learning_rate": 7.726687843533539e-06, + "loss": -0.0532, + "reward": 1.108871042728424, + "reward_std": 0.164753595367074, + "rewards/mrr_reward": 0.1870349682867527, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8422707915306091, + "step": 587 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.953125, + "epoch": 0.4704, + "grad_norm": 0.04036114737391472, + "kl": 0.00824737548828125, + "learning_rate": 7.696105733234099e-06, + "loss": -0.0084, + "reward": 1.0524960458278656, + "reward_std": 0.1158986147493124, + "rewards/mrr_reward": 0.140625, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.819886177778244, + "step": 588 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.15625, + "epoch": 0.4712, + "grad_norm": 0.038603611290454865, + "kl": 0.01171112060546875, + "learning_rate": 7.66554636144095e-06, + "loss": -0.0141, + "reward": 1.1410838067531586, + "reward_std": 0.1260488135740161, + "rewards/mrr_reward": 0.22431176900863647, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8269250690937042, + "step": 589 + }, + { + "clip_ratio": 0.0, + "completion_length": 474.984375, + "epoch": 0.472, + "grad_norm": 0.04096561297774315, + "kl": 0.009233474731445312, + "learning_rate": 7.635010029762755e-06, + "loss": 0.0165, + "reward": 1.2618905901908875, + "reward_std": 0.1963303443044424, + "rewards/mrr_reward": 0.337270587682724, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8507068455219269, + "step": 590 + }, + { + "clip_ratio": 0.0, + "completion_length": 487.328125, + "epoch": 0.4728, + "grad_norm": 0.03711889311671257, + "kl": 0.009479522705078125, + "learning_rate": 7.604497039580785e-06, + "loss": -0.0077, + "reward": 1.1018753945827484, + "reward_std": 0.1098787821829319, + "rewards/mrr_reward": 0.16659227386116982, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8830194175243378, + "step": 591 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.296875, + "epoch": 0.4736, + "grad_norm": 0.040535878390073776, + "kl": 0.009859085083007812, + "learning_rate": 7.574007692045928e-06, + "loss": -0.0361, + "reward": 1.1256814897060394, + "reward_std": 0.16627157852053642, + "rewards/mrr_reward": 0.2195622455328703, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8083157539367676, + "step": 592 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.390625, + "epoch": 0.4744, + "grad_norm": 0.04586748778820038, + "kl": 0.012853622436523438, + "learning_rate": 7.543542288075739e-06, + "loss": 0.0211, + "reward": 1.0531246066093445, + "reward_std": 0.1633172556757927, + "rewards/mrr_reward": 0.16449653171002865, + "rewards/rank_answer_foramt_reward": 0.873046875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8353904634714127, + "step": 593 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.140625, + "epoch": 0.4752, + "grad_norm": 0.04041828587651253, + "kl": 0.008060455322265625, + "learning_rate": 7.513101128351454e-06, + "loss": -0.0028, + "reward": 1.0381239652633667, + "reward_std": 0.11173910088837147, + "rewards/mrr_reward": 0.1378658302128315, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8471954613924026, + "step": 594 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.515625, + "epoch": 0.476, + "grad_norm": 0.03903096914291382, + "kl": 0.008426666259765625, + "learning_rate": 7.482684513315031e-06, + "loss": -0.0315, + "reward": 1.2388408780097961, + "reward_std": 0.23095028288662434, + "rewards/mrr_reward": 0.34299975633621216, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7693574577569962, + "step": 595 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.03125, + "epoch": 0.4768, + "grad_norm": 0.037792522460222244, + "kl": 0.008992195129394531, + "learning_rate": 7.4522927431661805e-06, + "loss": -0.0293, + "reward": 1.1880051493644714, + "reward_std": 0.13500093296170235, + "rewards/mrr_reward": 0.2826884910464287, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8000244349241257, + "step": 596 + }, + { + "clip_ratio": 0.0, + "completion_length": 495.265625, + "epoch": 0.4776, + "grad_norm": 0.03439981862902641, + "kl": 0.007958412170410156, + "learning_rate": 7.421926117859403e-06, + "loss": -0.0139, + "reward": 1.2536373734474182, + "reward_std": 0.1599850282073021, + "rewards/mrr_reward": 0.3125309981405735, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8655090630054474, + "step": 597 + }, + { + "clip_ratio": 0.0, + "completion_length": 481.6875, + "epoch": 0.4784, + "grad_norm": 0.036926135420799255, + "kl": 0.008787155151367188, + "learning_rate": 7.391584937101034e-06, + "loss": -0.0252, + "reward": 1.2367544770240784, + "reward_std": 0.1867055855691433, + "rewards/mrr_reward": 0.32385294139385223, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8093369156122208, + "step": 598 + }, + { + "clip_ratio": 0.0, + "completion_length": 474.90625, + "epoch": 0.4792, + "grad_norm": 0.040266942232847214, + "kl": 0.007959365844726562, + "learning_rate": 7.361269500346274e-06, + "loss": -0.0266, + "reward": 1.160109281539917, + "reward_std": 0.21854684129357338, + "rewards/mrr_reward": 0.23885169252753258, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8678614497184753, + "step": 599 + }, + { + "clip_ratio": 0.0, + "completion_length": 420.34375, + "epoch": 0.48, + "grad_norm": 0.04247404634952545, + "kl": 0.009807586669921875, + "learning_rate": 7.330980106796247e-06, + "loss": -0.0226, + "reward": 1.2082679271697998, + "reward_std": 0.14304961264133453, + "rewards/mrr_reward": 0.28249626979231834, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.846384271979332, + "step": 600 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.75, + "epoch": 0.4808, + "grad_norm": 0.043656595051288605, + "kl": 0.009093284606933594, + "learning_rate": 7.300717055395039e-06, + "loss": 0.0033, + "reward": 1.2197607904672623, + "reward_std": 0.13689721561968327, + "rewards/mrr_reward": 0.311445914208889, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8149691671133041, + "step": 601 + }, + { + "clip_ratio": 0.0, + "completion_length": 489.0, + "epoch": 0.4816, + "grad_norm": 0.035476408898830414, + "kl": 0.007262229919433594, + "learning_rate": 7.27048064482675e-06, + "loss": -0.0232, + "reward": 1.1573797166347504, + "reward_std": 0.2298069056123495, + "rewards/mrr_reward": 0.25371403992176056, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8223650008440018, + "step": 602 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.8125, + "epoch": 0.4824, + "grad_norm": 0.04022166132926941, + "kl": 0.0081024169921875, + "learning_rate": 7.240271173512545e-06, + "loss": -0.0167, + "reward": 1.2249160706996918, + "reward_std": 0.26648064982146025, + "rewards/mrr_reward": 0.31931424885988235, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8360446691513062, + "step": 603 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.328125, + "epoch": 0.4832, + "grad_norm": 0.03857105225324631, + "kl": 0.008441925048828125, + "learning_rate": 7.210088939607709e-06, + "loss": -0.0502, + "reward": 1.152012288570404, + "reward_std": 0.14423850551247597, + "rewards/mrr_reward": 0.24311136081814766, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8030732870101929, + "step": 604 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.71875, + "epoch": 0.484, + "grad_norm": 0.04305540770292282, + "kl": 0.008272171020507812, + "learning_rate": 7.179934240998707e-06, + "loss": -0.0375, + "reward": 1.118052452802658, + "reward_std": 0.1358859408646822, + "rewards/mrr_reward": 0.1941592302173376, + "rewards/rank_answer_foramt_reward": 0.955078125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8445982933044434, + "step": 605 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.0625, + "epoch": 0.4848, + "grad_norm": 0.04774133116006851, + "kl": 0.010158538818359375, + "learning_rate": 7.149807375300239e-06, + "loss": -0.0581, + "reward": 1.0857676565647125, + "reward_std": 0.2192548532038927, + "rewards/mrr_reward": 0.19006076455116272, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8314506113529205, + "step": 606 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.078125, + "epoch": 0.4856, + "grad_norm": 0.03719841688871384, + "kl": 0.009935379028320312, + "learning_rate": 7.119708639852312e-06, + "loss": -0.0208, + "reward": 1.192948192358017, + "reward_std": 0.148657638579607, + "rewards/mrr_reward": 0.2748511843383312, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8465651720762253, + "step": 607 + }, + { + "clip_ratio": 0.0, + "completion_length": 479.265625, + "epoch": 0.4864, + "grad_norm": 0.036122966557741165, + "kl": 0.008540153503417969, + "learning_rate": 7.0896383317172845e-06, + "loss": -0.0174, + "reward": 1.3368725180625916, + "reward_std": 0.23823470249772072, + "rewards/mrr_reward": 0.4140749163925648, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8237001299858093, + "step": 608 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.078125, + "epoch": 0.4872, + "grad_norm": 0.03762197121977806, + "kl": 0.007107734680175781, + "learning_rate": 7.059596747676963e-06, + "loss": -0.017, + "reward": 1.2796568274497986, + "reward_std": 0.16884921677410603, + "rewards/mrr_reward": 0.3945312574505806, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7642298191785812, + "step": 609 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.390625, + "epoch": 0.488, + "grad_norm": 0.0399199053645134, + "kl": 0.008553504943847656, + "learning_rate": 7.029584184229653e-06, + "loss": -0.0136, + "reward": 1.3700619339942932, + "reward_std": 0.2029927484691143, + "rewards/mrr_reward": 0.44716642796993256, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8376685678958893, + "step": 610 + }, + { + "clip_ratio": 0.0, + "completion_length": 483.34375, + "epoch": 0.4888, + "grad_norm": 0.040925391018390656, + "kl": 0.008733749389648438, + "learning_rate": 6.99960093758724e-06, + "loss": 0.015, + "reward": 1.1895627975463867, + "reward_std": 0.23921046033501625, + "rewards/mrr_reward": 0.27715774811804295, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8761919885873795, + "step": 611 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.9375, + "epoch": 0.4896, + "grad_norm": 0.040661729872226715, + "kl": 0.010442733764648438, + "learning_rate": 6.969647303672262e-06, + "loss": -0.0553, + "reward": 1.24416384100914, + "reward_std": 0.2634511739015579, + "rewards/mrr_reward": 0.3377480283379555, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8463238179683685, + "step": 612 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.515625, + "epoch": 0.4904, + "grad_norm": 0.040657393634319305, + "kl": 0.009714126586914062, + "learning_rate": 6.9397235781149945e-06, + "loss": -0.0143, + "reward": 1.1388709545135498, + "reward_std": 0.1958305425941944, + "rewards/mrr_reward": 0.22525421902537346, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8310353606939316, + "step": 613 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.21875, + "epoch": 0.4912, + "grad_norm": 0.03989201411604881, + "kl": 0.010036468505859375, + "learning_rate": 6.909830056250527e-06, + "loss": -0.0312, + "reward": 1.1368741393089294, + "reward_std": 0.12755226157605648, + "rewards/mrr_reward": 0.2184709832072258, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7908523082733154, + "step": 614 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.96875, + "epoch": 0.492, + "grad_norm": 0.04114677384495735, + "kl": 0.0092315673828125, + "learning_rate": 6.879967033115853e-06, + "loss": -0.0002, + "reward": 1.3170638680458069, + "reward_std": 0.21237649768590927, + "rewards/mrr_reward": 0.3855530768632889, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.836431622505188, + "step": 615 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.046875, + "epoch": 0.4928, + "grad_norm": 0.0431981161236763, + "kl": 0.009008407592773438, + "learning_rate": 6.850134803446955e-06, + "loss": -0.0007, + "reward": 1.121709167957306, + "reward_std": 0.17951259203255177, + "rewards/mrr_reward": 0.19001736119389534, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8877614140510559, + "step": 616 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.21875, + "epoch": 0.4936, + "grad_norm": 0.035142622888088226, + "kl": 0.007939338684082031, + "learning_rate": 6.820333661675893e-06, + "loss": -0.0185, + "reward": 1.3334324955940247, + "reward_std": 0.13348893821239471, + "rewards/mrr_reward": 0.42053572088479996, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7800257503986359, + "step": 617 + }, + { + "clip_ratio": 0.0, + "completion_length": 497.171875, + "epoch": 0.4944, + "grad_norm": 0.03749988600611687, + "kl": 0.007336616516113281, + "learning_rate": 6.790563901927907e-06, + "loss": -0.0012, + "reward": 1.1629119515419006, + "reward_std": 0.23179220408201218, + "rewards/mrr_reward": 0.25689484365284443, + "rewards/rank_answer_foramt_reward": 0.876953125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8763656616210938, + "step": 618 + }, + { + "clip_ratio": 0.0, + "completion_length": 498.59375, + "epoch": 0.4952, + "grad_norm": 0.044601455330848694, + "kl": 0.007954597473144531, + "learning_rate": 6.7608258180185085e-06, + "loss": -0.0065, + "reward": 1.171342521905899, + "reward_std": 0.10423477459698915, + "rewards/mrr_reward": 0.24530010670423508, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8335327506065369, + "step": 619 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.515625, + "epoch": 0.496, + "grad_norm": 0.04305371642112732, + "kl": 0.010364532470703125, + "learning_rate": 6.731119703450577e-06, + "loss": -0.0619, + "reward": 1.2109524309635162, + "reward_std": 0.26471400633454323, + "rewards/mrr_reward": 0.3258928619325161, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7933267802000046, + "step": 620 + }, + { + "clip_ratio": 0.0, + "completion_length": 481.5625, + "epoch": 0.4968, + "grad_norm": 0.04213489592075348, + "kl": 0.008825302124023438, + "learning_rate": 6.701445851411472e-06, + "loss": -0.0069, + "reward": 1.136389434337616, + "reward_std": 0.19171301275491714, + "rewards/mrr_reward": 0.21873139590024948, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8784380555152893, + "step": 621 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.984375, + "epoch": 0.4976, + "grad_norm": 0.03531862050294876, + "kl": 0.0073719024658203125, + "learning_rate": 6.671804554770135e-06, + "loss": -0.0387, + "reward": 1.099222093820572, + "reward_std": 0.15219473466277122, + "rewards/mrr_reward": 0.18070436641573906, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8107306212186813, + "step": 622 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.53125, + "epoch": 0.4984, + "grad_norm": 0.03785242512822151, + "kl": 0.009787559509277344, + "learning_rate": 6.642196106074195e-06, + "loss": -0.0231, + "reward": 1.3554746210575104, + "reward_std": 0.20645315200090408, + "rewards/mrr_reward": 0.4374689795076847, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8111319541931152, + "step": 623 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.6875, + "epoch": 0.4992, + "grad_norm": 0.038349699229002, + "kl": 0.00804901123046875, + "learning_rate": 6.612620797547087e-06, + "loss": 0.0146, + "reward": 1.1561428606510162, + "reward_std": 0.13428733311593533, + "rewards/mrr_reward": 0.21650546044111252, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8825420886278152, + "step": 624 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.5, + "epoch": 0.5, + "grad_norm": 0.039120424538850784, + "kl": 0.008810997009277344, + "learning_rate": 6.583078921085167e-06, + "loss": 0.0093, + "reward": 1.2225628197193146, + "reward_std": 0.19562321156263351, + "rewards/mrr_reward": 0.3105902783572674, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8143343031406403, + "step": 625 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.0, + "epoch": 0.5008, + "grad_norm": 0.03858473151922226, + "kl": 0.009241104125976562, + "learning_rate": 6.553570768254831e-06, + "loss": -0.0354, + "reward": 1.1035878658294678, + "reward_std": 0.1448194831609726, + "rewards/mrr_reward": 0.18493304029107094, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8384897708892822, + "step": 626 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.734375, + "epoch": 0.5016, + "grad_norm": 0.04008089378476143, + "kl": 0.009889602661132812, + "learning_rate": 6.524096630289632e-06, + "loss": -0.0247, + "reward": 1.2089687287807465, + "reward_std": 0.1602461007423699, + "rewards/mrr_reward": 0.28939732909202576, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8568924069404602, + "step": 627 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.140625, + "epoch": 0.5024, + "grad_norm": 0.04044454172253609, + "kl": 0.0072193145751953125, + "learning_rate": 6.494656798087412e-06, + "loss": -0.0287, + "reward": 1.2421257197856903, + "reward_std": 0.12500086054205894, + "rewards/mrr_reward": 0.30170511454343796, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8575716465711594, + "step": 628 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.421875, + "epoch": 0.5032, + "grad_norm": 0.0375799760222435, + "kl": 0.010451316833496094, + "learning_rate": 6.465251562207431e-06, + "loss": -0.0058, + "reward": 1.1293274760246277, + "reward_std": 0.15937496908009052, + "rewards/mrr_reward": 0.20788690820336342, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8332597315311432, + "step": 629 + }, + { + "clip_ratio": 0.0, + "completion_length": 471.046875, + "epoch": 0.504, + "grad_norm": 0.03754337131977081, + "kl": 0.00838470458984375, + "learning_rate": 6.435881212867494e-06, + "loss": -0.0153, + "reward": 1.145438402891159, + "reward_std": 0.17192682810127735, + "rewards/mrr_reward": 0.220145083963871, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8586065769195557, + "step": 630 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.125, + "epoch": 0.5048, + "grad_norm": 0.04335853457450867, + "kl": 0.008279800415039062, + "learning_rate": 6.406546039941095e-06, + "loss": 0.0053, + "reward": 1.215552568435669, + "reward_std": 0.18727103993296623, + "rewards/mrr_reward": 0.29786086082458496, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8297120332717896, + "step": 631 + }, + { + "clip_ratio": 0.0, + "completion_length": 494.734375, + "epoch": 0.5056, + "grad_norm": 0.03874685615301132, + "kl": 0.007633209228515625, + "learning_rate": 6.377246332954544e-06, + "loss": -0.01, + "reward": 1.14056196808815, + "reward_std": 0.1842045597732067, + "rewards/mrr_reward": 0.2259982731193304, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8417176455259323, + "step": 632 + }, + { + "clip_ratio": 0.0, + "completion_length": 482.3125, + "epoch": 0.5064, + "grad_norm": 0.04324718937277794, + "kl": 0.008602142333984375, + "learning_rate": 6.3479823810841235e-06, + "loss": 0.0259, + "reward": 1.1707454323768616, + "reward_std": 0.20734077505767345, + "rewards/mrr_reward": 0.24067460745573044, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8945683091878891, + "step": 633 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.453125, + "epoch": 0.5072, + "grad_norm": 0.03912781924009323, + "kl": 0.009914398193359375, + "learning_rate": 6.318754473153221e-06, + "loss": 0.003, + "reward": 1.152208298444748, + "reward_std": 0.18202020972967148, + "rewards/mrr_reward": 0.22909846529364586, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8793335855007172, + "step": 634 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.96875, + "epoch": 0.508, + "grad_norm": 0.03814735263586044, + "kl": 0.009555816650390625, + "learning_rate": 6.289562897629492e-06, + "loss": -0.004, + "reward": 1.2428525686264038, + "reward_std": 0.10947381239384413, + "rewards/mrr_reward": 0.3264322876930237, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7907028645277023, + "step": 635 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.8125, + "epoch": 0.5088, + "grad_norm": 0.04343482851982117, + "kl": 0.008977890014648438, + "learning_rate": 6.260407942621998e-06, + "loss": 0.0267, + "reward": 1.2960951030254364, + "reward_std": 0.1967066526412964, + "rewards/mrr_reward": 0.36600323021411896, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8887726813554764, + "step": 636 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.671875, + "epoch": 0.5096, + "grad_norm": 0.04106997326016426, + "kl": 0.008977890014648438, + "learning_rate": 6.231289895878375e-06, + "loss": -0.0182, + "reward": 1.2052837014198303, + "reward_std": 0.17037715390324593, + "rewards/mrr_reward": 0.2953869067132473, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8275755047798157, + "step": 637 + }, + { + "clip_ratio": 0.0, + "completion_length": 477.484375, + "epoch": 0.5104, + "grad_norm": 0.03704027459025383, + "kl": 0.009426116943359375, + "learning_rate": 6.202209044781991e-06, + "loss": -0.0061, + "reward": 1.1235398948192596, + "reward_std": 0.13441728707402945, + "rewards/mrr_reward": 0.20093626528978348, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8328777849674225, + "step": 638 + }, + { + "clip_ratio": 0.0, + "completion_length": 470.046875, + "epoch": 0.5112, + "grad_norm": 0.04103616252541542, + "kl": 0.0093231201171875, + "learning_rate": 6.173165676349103e-06, + "loss": -0.0435, + "reward": 1.1044175028800964, + "reward_std": 0.17622700706124306, + "rewards/mrr_reward": 0.19592014327645302, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.837006464600563, + "step": 639 + }, + { + "clip_ratio": 0.0, + "completion_length": 473.984375, + "epoch": 0.512, + "grad_norm": 0.05446719005703926, + "kl": 0.008823394775390625, + "learning_rate": 6.144160077226035e-06, + "loss": 0.0136, + "reward": 1.2099950313568115, + "reward_std": 0.19803481549024582, + "rewards/mrr_reward": 0.28655754402279854, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.86860790848732, + "step": 640 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.46875, + "epoch": 0.5128, + "grad_norm": 0.041340798139572144, + "kl": 0.009428024291992188, + "learning_rate": 6.115192533686341e-06, + "loss": -0.0448, + "reward": 1.1741358637809753, + "reward_std": 0.1973903514444828, + "rewards/mrr_reward": 0.28611112385988235, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7749683260917664, + "step": 641 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.375, + "epoch": 0.5136, + "grad_norm": 0.037386320531368256, + "kl": 0.009598731994628906, + "learning_rate": 6.086263331627976e-06, + "loss": -0.04, + "reward": 1.1221703886985779, + "reward_std": 0.21877944841980934, + "rewards/mrr_reward": 0.22787078097462654, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7998424768447876, + "step": 642 + }, + { + "clip_ratio": 0.0, + "completion_length": 478.875, + "epoch": 0.5144, + "grad_norm": 0.04460527002811432, + "kl": 0.00873565673828125, + "learning_rate": 6.05737275657049e-06, + "loss": -0.025, + "reward": 1.1142739951610565, + "reward_std": 0.1813185941427946, + "rewards/mrr_reward": 0.21093130111694336, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8155269175767899, + "step": 643 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.515625, + "epoch": 0.5152, + "grad_norm": 0.03829963132739067, + "kl": 0.008128166198730469, + "learning_rate": 6.028521093652195e-06, + "loss": -0.0117, + "reward": 1.140984207391739, + "reward_std": 0.1788063794374466, + "rewards/mrr_reward": 0.22751735523343086, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8032376319169998, + "step": 644 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.171875, + "epoch": 0.516, + "grad_norm": 0.040134068578481674, + "kl": 0.011653900146484375, + "learning_rate": 5.9997086276273545e-06, + "loss": -0.0714, + "reward": 1.1092098951339722, + "reward_std": 0.15996074490249157, + "rewards/mrr_reward": 0.19925596192479134, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8218892067670822, + "step": 645 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.796875, + "epoch": 0.5168, + "grad_norm": 0.043228864669799805, + "kl": 0.012115478515625, + "learning_rate": 5.970935642863375e-06, + "loss": -0.041, + "reward": 1.1475808322429657, + "reward_std": 0.12733451835811138, + "rewards/mrr_reward": 0.2023809589445591, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.885726198554039, + "step": 646 + }, + { + "clip_ratio": 0.0, + "completion_length": 477.453125, + "epoch": 0.5176, + "grad_norm": 0.03866947069764137, + "kl": 0.008696556091308594, + "learning_rate": 5.942202423338001e-06, + "loss": -0.0142, + "reward": 1.1517845392227173, + "reward_std": 0.2236475944519043, + "rewards/mrr_reward": 0.26093750819563866, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7561768889427185, + "step": 647 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.234375, + "epoch": 0.5184, + "grad_norm": 0.038130611181259155, + "kl": 0.0076313018798828125, + "learning_rate": 5.913509252636511e-06, + "loss": 0.0073, + "reward": 1.214305818080902, + "reward_std": 0.22642408311367035, + "rewards/mrr_reward": 0.27718254178762436, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8807831108570099, + "step": 648 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.390625, + "epoch": 0.5192, + "grad_norm": 0.04136860743165016, + "kl": 0.0090179443359375, + "learning_rate": 5.884856413948913e-06, + "loss": 0.0167, + "reward": 1.1337697207927704, + "reward_std": 0.1099173566326499, + "rewards/mrr_reward": 0.1892671175301075, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8836133182048798, + "step": 649 + }, + { + "clip_ratio": 0.0, + "completion_length": 481.046875, + "epoch": 0.52, + "grad_norm": 0.040968600660562515, + "kl": 0.009687423706054688, + "learning_rate": 5.85624419006716e-06, + "loss": 0.0129, + "reward": 1.14986552298069, + "reward_std": 0.23535252176225185, + "rewards/mrr_reward": 0.23812004178762436, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8820055425167084, + "step": 650 + }, + { + "clip_ratio": 0.0, + "completion_length": 471.578125, + "epoch": 0.5208, + "grad_norm": 0.04172006994485855, + "kl": 0.008195877075195312, + "learning_rate": 5.8276728633823494e-06, + "loss": 0.0156, + "reward": 1.0877700746059418, + "reward_std": 0.1796468086540699, + "rewards/mrr_reward": 0.18993056192994118, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8183818161487579, + "step": 651 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.4375, + "epoch": 0.5216, + "grad_norm": 0.04257863759994507, + "kl": 0.009547233581542969, + "learning_rate": 5.799142715881938e-06, + "loss": -0.0619, + "reward": 1.0521957874298096, + "reward_std": 0.22515824437141418, + "rewards/mrr_reward": 0.17905506119132042, + "rewards/rank_answer_foramt_reward": 0.876953125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.768927738070488, + "step": 652 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.34375, + "epoch": 0.5224, + "grad_norm": 0.040138933807611465, + "kl": 0.009622573852539062, + "learning_rate": 5.770654029146969e-06, + "loss": 0.0044, + "reward": 1.1919938325881958, + "reward_std": 0.18187857419252396, + "rewards/mrr_reward": 0.2777653820812702, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8250766545534134, + "step": 653 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.8125, + "epoch": 0.5232, + "grad_norm": 0.043729230761528015, + "kl": 0.009851455688476562, + "learning_rate": 5.742207084349274e-06, + "loss": -0.0332, + "reward": 1.179715782403946, + "reward_std": 0.22262733057141304, + "rewards/mrr_reward": 0.2926153317093849, + "rewards/rank_answer_foramt_reward": 0.857421875, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8541986495256424, + "step": 654 + }, + { + "clip_ratio": 0.0, + "completion_length": 484.390625, + "epoch": 0.524, + "grad_norm": 0.0415637381374836, + "kl": 0.008465766906738281, + "learning_rate": 5.713802162248718e-06, + "loss": 0.0018, + "reward": 1.2584901452064514, + "reward_std": 0.11682092864066362, + "rewards/mrr_reward": 0.3142175190150738, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8907290101051331, + "step": 655 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.875, + "epoch": 0.5248, + "grad_norm": 0.0408024825155735, + "kl": 0.00860595703125, + "learning_rate": 5.685439543190409e-06, + "loss": -0.0104, + "reward": 1.355711817741394, + "reward_std": 0.22719109896570444, + "rewards/mrr_reward": 0.44937377236783504, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7738226503133774, + "step": 656 + }, + { + "clip_ratio": 0.0, + "completion_length": 466.328125, + "epoch": 0.5256, + "grad_norm": 0.04255751892924309, + "kl": 0.009609222412109375, + "learning_rate": 5.657119507101955e-06, + "loss": 0.0158, + "reward": 1.1442281603813171, + "reward_std": 0.19760075956583023, + "rewards/mrr_reward": 0.22868303954601288, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.842738464474678, + "step": 657 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.96875, + "epoch": 0.5264, + "grad_norm": 0.03842722252011299, + "kl": 0.009244918823242188, + "learning_rate": 5.628842333490674e-06, + "loss": -0.0173, + "reward": 1.1806485652923584, + "reward_std": 0.14790054596960545, + "rewards/mrr_reward": 0.24701761454343796, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8428566604852676, + "step": 658 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.046875, + "epoch": 0.5272, + "grad_norm": 0.04238361120223999, + "kl": 0.0117950439453125, + "learning_rate": 5.600608301440848e-06, + "loss": -0.004, + "reward": 1.201371192932129, + "reward_std": 0.2469993531703949, + "rewards/mrr_reward": 0.30833955481648445, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.831156387925148, + "step": 659 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.0625, + "epoch": 0.528, + "grad_norm": 0.055894963443279266, + "kl": 0.009580612182617188, + "learning_rate": 5.572417689610987e-06, + "loss": -0.0304, + "reward": 1.1189578771591187, + "reward_std": 0.18920623883605003, + "rewards/mrr_reward": 0.21641245111823082, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.803345337510109, + "step": 660 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.34375, + "epoch": 0.5288, + "grad_norm": 0.042352769523859024, + "kl": 0.010715484619140625, + "learning_rate": 5.544270776231038e-06, + "loss": 0.0025, + "reward": 1.1625032126903534, + "reward_std": 0.19885125942528248, + "rewards/mrr_reward": 0.26426712423563004, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8000523597002029, + "step": 661 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.859375, + "epoch": 0.5296, + "grad_norm": 0.03811383992433548, + "kl": 0.008238792419433594, + "learning_rate": 5.516167839099679e-06, + "loss": -0.017, + "reward": 1.3036585450172424, + "reward_std": 0.1993249226361513, + "rewards/mrr_reward": 0.3752976208925247, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.869855523109436, + "step": 662 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.65625, + "epoch": 0.5304, + "grad_norm": 0.042755480855703354, + "kl": 0.011791229248046875, + "learning_rate": 5.488109155581549e-06, + "loss": 0.0111, + "reward": 1.2058091461658478, + "reward_std": 0.21377132274210453, + "rewards/mrr_reward": 0.2779761999845505, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8467710912227631, + "step": 663 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.65625, + "epoch": 0.5312, + "grad_norm": 0.039466727524995804, + "kl": 0.008899688720703125, + "learning_rate": 5.460095002604533e-06, + "loss": -0.0181, + "reward": 1.189045011997223, + "reward_std": 0.128420518245548, + "rewards/mrr_reward": 0.26372147910296917, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8040104508399963, + "step": 664 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.546875, + "epoch": 0.532, + "grad_norm": 0.04358460754156113, + "kl": 0.008716583251953125, + "learning_rate": 5.432125656657004e-06, + "loss": -0.0043, + "reward": 1.1679949164390564, + "reward_std": 0.18363160640001297, + "rewards/mrr_reward": 0.2324280794709921, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8760666251182556, + "step": 665 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.328125, + "epoch": 0.5328, + "grad_norm": 0.044500019401311874, + "kl": 0.009143829345703125, + "learning_rate": 5.404201393785123e-06, + "loss": -0.0744, + "reward": 1.1327187716960907, + "reward_std": 0.16905860230326653, + "rewards/mrr_reward": 0.20547495409846306, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8371732831001282, + "step": 666 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.140625, + "epoch": 0.5336, + "grad_norm": 0.04244980216026306, + "kl": 0.012605667114257812, + "learning_rate": 5.376322489590085e-06, + "loss": -0.0043, + "reward": 1.1357988119125366, + "reward_std": 0.16269230097532272, + "rewards/mrr_reward": 0.23528027534484863, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.78548464179039, + "step": 667 + }, + { + "clip_ratio": 0.0, + "completion_length": 479.359375, + "epoch": 0.5344, + "grad_norm": 0.04275548830628395, + "kl": 0.008051872253417969, + "learning_rate": 5.348489219225417e-06, + "loss": 0.0322, + "reward": 1.3845745623111725, + "reward_std": 0.21123279444873333, + "rewards/mrr_reward": 0.4523933492600918, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8658071458339691, + "step": 668 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.8125, + "epoch": 0.5352, + "grad_norm": 0.045763902366161346, + "kl": 0.01207733154296875, + "learning_rate": 5.3207018573942684e-06, + "loss": -0.0302, + "reward": 1.1802934408187866, + "reward_std": 0.2740876227617264, + "rewards/mrr_reward": 0.3093688115477562, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.7524467706680298, + "step": 669 + }, + { + "clip_ratio": 0.0, + "completion_length": 484.5, + "epoch": 0.536, + "grad_norm": 0.04125389829277992, + "kl": 0.009202957153320312, + "learning_rate": 5.292960678346674e-06, + "loss": 0.0166, + "reward": 1.2574660181999207, + "reward_std": 0.16352756042033434, + "rewards/mrr_reward": 0.31458334624767303, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.894329383969307, + "step": 670 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.40625, + "epoch": 0.5368, + "grad_norm": 0.03898369520902634, + "kl": 0.009569168090820312, + "learning_rate": 5.2652659558768795e-06, + "loss": -0.0111, + "reward": 1.1695980429649353, + "reward_std": 0.18792172148823738, + "rewards/mrr_reward": 0.24601934850215912, + "rewards/rank_answer_foramt_reward": 0.955078125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8514575362205505, + "step": 671 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.640625, + "epoch": 0.5376, + "grad_norm": 0.04127902537584305, + "kl": 0.00872039794921875, + "learning_rate": 5.237617963320608e-06, + "loss": 0.0196, + "reward": 1.2061417400836945, + "reward_std": 0.2502102144062519, + "rewards/mrr_reward": 0.28403398394584656, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8294219672679901, + "step": 672 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.109375, + "epoch": 0.5384, + "grad_norm": 0.03780240938067436, + "kl": 0.009091377258300781, + "learning_rate": 5.2100169735523906e-06, + "loss": -0.0262, + "reward": 1.200999915599823, + "reward_std": 0.11425712890923023, + "rewards/mrr_reward": 0.28924231603741646, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7765735238790512, + "step": 673 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.125, + "epoch": 0.5392, + "grad_norm": 0.04221047833561897, + "kl": 0.01013946533203125, + "learning_rate": 5.1824632589828465e-06, + "loss": 0.0056, + "reward": 1.0810420513153076, + "reward_std": 0.15975480899214745, + "rewards/mrr_reward": 0.17140376567840576, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9609375, + "rewards/rank_think_format_reward": 0.8639013320207596, + "step": 674 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.640625, + "epoch": 0.54, + "grad_norm": 0.04080146923661232, + "kl": 0.0093536376953125, + "learning_rate": 5.154957091556021e-06, + "loss": -0.0236, + "reward": 1.0876788198947906, + "reward_std": 0.17044154927134514, + "rewards/mrr_reward": 0.17075273394584656, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8625483214855194, + "step": 675 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.921875, + "epoch": 0.5408, + "grad_norm": 0.039513736963272095, + "kl": 0.008271217346191406, + "learning_rate": 5.127498742746675e-06, + "loss": 0.0053, + "reward": 1.283906728029251, + "reward_std": 0.22036350145936012, + "rewards/mrr_reward": 0.3861917220056057, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7848014086484909, + "step": 676 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.671875, + "epoch": 0.5416, + "grad_norm": 0.04135413467884064, + "kl": 0.008291244506835938, + "learning_rate": 5.100088483557635e-06, + "loss": -0.0174, + "reward": 1.1332715302705765, + "reward_std": 0.21648670360445976, + "rewards/mrr_reward": 0.2384672686457634, + "rewards/rank_answer_foramt_reward": 0.876953125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8345748037099838, + "step": 677 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.6875, + "epoch": 0.5424, + "grad_norm": 0.045175570994615555, + "kl": 0.00896453857421875, + "learning_rate": 5.072726584517086e-06, + "loss": -0.0161, + "reward": 1.0853496938943863, + "reward_std": 0.20591749995946884, + "rewards/mrr_reward": 0.19198289886116982, + "rewards/rank_answer_foramt_reward": 0.875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.832172155380249, + "step": 678 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.796875, + "epoch": 0.5432, + "grad_norm": 0.03891496732831001, + "kl": 0.007755279541015625, + "learning_rate": 5.045413315675925e-06, + "loss": -0.0063, + "reward": 1.2105883359909058, + "reward_std": 0.10096981842070818, + "rewards/mrr_reward": 0.2832031324505806, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8102580308914185, + "step": 679 + }, + { + "clip_ratio": 0.0, + "completion_length": 466.4375, + "epoch": 0.544, + "grad_norm": 0.03714423254132271, + "kl": 0.008653640747070312, + "learning_rate": 5.018148946605092e-06, + "loss": -0.0243, + "reward": 1.1117302775382996, + "reward_std": 0.13797536864876747, + "rewards/mrr_reward": 0.19155506789684296, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8372378200292587, + "step": 680 + }, + { + "clip_ratio": 0.0, + "completion_length": 469.703125, + "epoch": 0.5448, + "grad_norm": 0.04121573269367218, + "kl": 0.0098114013671875, + "learning_rate": 4.9909337463929e-06, + "loss": -0.0248, + "reward": 1.0979313254356384, + "reward_std": 0.14380038622766733, + "rewards/mrr_reward": 0.17858382873237133, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8210577219724655, + "step": 681 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.140625, + "epoch": 0.5456, + "grad_norm": 0.04169347882270813, + "kl": 0.011077880859375, + "learning_rate": 4.9637679836423926e-06, + "loss": -0.0243, + "reward": 1.349208414554596, + "reward_std": 0.14660646300762892, + "rewards/mrr_reward": 0.4205109141767025, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8279066383838654, + "step": 682 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.53125, + "epoch": 0.5464, + "grad_norm": 0.039740189909935, + "kl": 0.009710311889648438, + "learning_rate": 4.936651926468673e-06, + "loss": -0.0384, + "reward": 1.0722960233688354, + "reward_std": 0.1104632755741477, + "rewards/mrr_reward": 0.16297122836112976, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.792638972401619, + "step": 683 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.609375, + "epoch": 0.5472, + "grad_norm": 0.04026377573609352, + "kl": 0.008731842041015625, + "learning_rate": 4.909585842496287e-06, + "loss": -0.0021, + "reward": 1.1576222777366638, + "reward_std": 0.14097119309008121, + "rewards/mrr_reward": 0.2248635906726122, + "rewards/rank_answer_foramt_reward": 0.984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8421664088964462, + "step": 684 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.25, + "epoch": 0.548, + "grad_norm": 0.039616670459508896, + "kl": 0.009366989135742188, + "learning_rate": 4.882569998856549e-06, + "loss": -0.018, + "reward": 1.2478437423706055, + "reward_std": 0.24337460100650787, + "rewards/mrr_reward": 0.34682539850473404, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7928584069013596, + "step": 685 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.984375, + "epoch": 0.5488, + "grad_norm": 0.04094656556844711, + "kl": 0.011662483215332031, + "learning_rate": 4.855604662184935e-06, + "loss": -0.0047, + "reward": 1.0998060703277588, + "reward_std": 0.1794710010290146, + "rewards/mrr_reward": 0.23454860970377922, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7040233463048935, + "step": 686 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.640625, + "epoch": 0.5496, + "grad_norm": 0.04387884959578514, + "kl": 0.01041412353515625, + "learning_rate": 4.828690098618429e-06, + "loss": -0.0005, + "reward": 1.158925324678421, + "reward_std": 0.23121210932731628, + "rewards/mrr_reward": 0.2560763880610466, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8355152010917664, + "step": 687 + }, + { + "clip_ratio": 0.0, + "completion_length": 436.953125, + "epoch": 0.5504, + "grad_norm": 0.18276724219322205, + "kl": 0.04758262634277344, + "learning_rate": 4.801826573792905e-06, + "loss": -0.0495, + "reward": 1.299856573343277, + "reward_std": 0.10169149003922939, + "rewards/mrr_reward": 0.3726624511182308, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8448353558778763, + "step": 688 + }, + { + "clip_ratio": 0.0, + "completion_length": 472.3125, + "epoch": 0.5512, + "grad_norm": 0.035259928554296494, + "kl": 0.008893013000488281, + "learning_rate": 4.775014352840512e-06, + "loss": -0.006, + "reward": 1.169858604669571, + "reward_std": 0.1330111986026168, + "rewards/mrr_reward": 0.23686135932803154, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8409361243247986, + "step": 689 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.328125, + "epoch": 0.552, + "grad_norm": 0.036417923867702484, + "kl": 0.007439613342285156, + "learning_rate": 4.7482537003870425e-06, + "loss": -0.0064, + "reward": 1.116908699274063, + "reward_std": 0.09379957616329193, + "rewards/mrr_reward": 0.1877418179064989, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8430007994174957, + "step": 690 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.984375, + "epoch": 0.5528, + "grad_norm": 0.04526480659842491, + "kl": 0.007946968078613281, + "learning_rate": 4.721544880549337e-06, + "loss": -0.0073, + "reward": 1.3033201396465302, + "reward_std": 0.2789052985608578, + "rewards/mrr_reward": 0.38737599924206734, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.879103884100914, + "step": 691 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.171875, + "epoch": 0.5536, + "grad_norm": 0.04109110310673714, + "kl": 0.00826263427734375, + "learning_rate": 4.694888156932657e-06, + "loss": -0.0182, + "reward": 1.118406057357788, + "reward_std": 0.16681309789419174, + "rewards/mrr_reward": 0.20881697162985802, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8344555050134659, + "step": 692 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.109375, + "epoch": 0.5544, + "grad_norm": 0.037574220448732376, + "kl": 0.009721755981445312, + "learning_rate": 4.668283792628114e-06, + "loss": 0.0062, + "reward": 1.2277896106243134, + "reward_std": 0.1619633361697197, + "rewards/mrr_reward": 0.30946800857782364, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8355270475149155, + "step": 693 + }, + { + "clip_ratio": 0.0, + "completion_length": 476.53125, + "epoch": 0.5552, + "grad_norm": 0.03914060443639755, + "kl": 0.00797271728515625, + "learning_rate": 4.641732050210032e-06, + "loss": -0.0052, + "reward": 1.2452102601528168, + "reward_std": 0.2378566674888134, + "rewards/mrr_reward": 0.3098648376762867, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8910206854343414, + "step": 694 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.03125, + "epoch": 0.556, + "grad_norm": 0.042982131242752075, + "kl": 0.008350372314453125, + "learning_rate": 4.6152331917333985e-06, + "loss": 0.0454, + "reward": 1.203509271144867, + "reward_std": 0.11863551568239927, + "rewards/mrr_reward": 0.2798239067196846, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8478744924068451, + "step": 695 + }, + { + "clip_ratio": 0.0, + "completion_length": 494.53125, + "epoch": 0.5568, + "grad_norm": 0.036636147648096085, + "kl": 0.010402679443359375, + "learning_rate": 4.588787478731242e-06, + "loss": -0.0137, + "reward": 1.102398157119751, + "reward_std": 0.20872093737125397, + "rewards/mrr_reward": 0.1981026753783226, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8066954910755157, + "step": 696 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.609375, + "epoch": 0.5576, + "grad_norm": 0.04322382062673569, + "kl": 0.007706642150878906, + "learning_rate": 4.562395172212074e-06, + "loss": 0.0143, + "reward": 1.1307188272476196, + "reward_std": 0.13696488551795483, + "rewards/mrr_reward": 0.19231151044368744, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8944397121667862, + "step": 697 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.3125, + "epoch": 0.5584, + "grad_norm": 0.03792423754930496, + "kl": 0.008268356323242188, + "learning_rate": 4.53605653265731e-06, + "loss": -0.0294, + "reward": 1.1437303125858307, + "reward_std": 0.16462960094213486, + "rewards/mrr_reward": 0.23955854214727879, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8043674677610397, + "step": 698 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.0, + "epoch": 0.5592, + "grad_norm": 0.040437955409288406, + "kl": 0.00787353515625, + "learning_rate": 4.509771820018682e-06, + "loss": 0.0165, + "reward": 1.2735324501991272, + "reward_std": 0.24658278934657574, + "rewards/mrr_reward": 0.34225572273135185, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8767381012439728, + "step": 699 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.015625, + "epoch": 0.56, + "grad_norm": 0.039811957627534866, + "kl": 0.009557723999023438, + "learning_rate": 4.483541293715699e-06, + "loss": 0.0038, + "reward": 1.2039062678813934, + "reward_std": 0.0991826388053596, + "rewards/mrr_reward": 0.26781994476914406, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8717812895774841, + "step": 700 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.4375, + "epoch": 0.5608, + "grad_norm": 0.04115857928991318, + "kl": 0.009393692016601562, + "learning_rate": 4.457365212633058e-06, + "loss": -0.0331, + "reward": 1.0938493311405182, + "reward_std": 0.12323368107900023, + "rewards/mrr_reward": 0.17080233432352543, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8244557082653046, + "step": 701 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.375, + "epoch": 0.5616, + "grad_norm": 0.03895072266459465, + "kl": 0.009398460388183594, + "learning_rate": 4.4312438351181246e-06, + "loss": 0.0034, + "reward": 1.1528900265693665, + "reward_std": 0.14880603551864624, + "rewards/mrr_reward": 0.231932045891881, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8454690128564835, + "step": 702 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.40625, + "epoch": 0.5624, + "grad_norm": 0.039955999702215195, + "kl": 0.0096435546875, + "learning_rate": 4.405177418978331e-06, + "loss": -0.0002, + "reward": 1.150672048330307, + "reward_std": 0.09315567277371883, + "rewards/mrr_reward": 0.2145027294754982, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.872032955288887, + "step": 703 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.734375, + "epoch": 0.5632, + "grad_norm": 0.039580851793289185, + "kl": 0.00884246826171875, + "learning_rate": 4.379166221478697e-06, + "loss": -0.0308, + "reward": 1.1858681738376617, + "reward_std": 0.07365155033767223, + "rewards/mrr_reward": 0.26101810671389103, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8162476569414139, + "step": 704 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.46875, + "epoch": 0.564, + "grad_norm": 0.0397285558283329, + "kl": 0.008618354797363281, + "learning_rate": 4.353210499339231e-06, + "loss": -0.0254, + "reward": 1.1983648836612701, + "reward_std": 0.17565249279141426, + "rewards/mrr_reward": 0.25363964214921, + "rewards/rank_answer_foramt_reward": 0.984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8862412869930267, + "step": 705 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.375, + "epoch": 0.5648, + "grad_norm": 0.038892053067684174, + "kl": 0.008718490600585938, + "learning_rate": 4.3273105087324375e-06, + "loss": -0.0375, + "reward": 1.2153197675943375, + "reward_std": 0.2709946185350418, + "rewards/mrr_reward": 0.33776042610406876, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7862237989902496, + "step": 706 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.875, + "epoch": 0.5656, + "grad_norm": 0.039017390459775925, + "kl": 0.008515357971191406, + "learning_rate": 4.301466505280763e-06, + "loss": -0.0007, + "reward": 1.138901799917221, + "reward_std": 0.14515507780015469, + "rewards/mrr_reward": 0.20719866082072258, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8526394814252853, + "step": 707 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.28125, + "epoch": 0.5664, + "grad_norm": 0.03923118859529495, + "kl": 0.009359359741210938, + "learning_rate": 4.275678744054094e-06, + "loss": 0.0008, + "reward": 1.1316049695014954, + "reward_std": 0.1249928786419332, + "rewards/mrr_reward": 0.20212053880095482, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8439630717039108, + "step": 708 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.453125, + "epoch": 0.5672, + "grad_norm": 0.04064570739865303, + "kl": 0.009347915649414062, + "learning_rate": 4.249947479567218e-06, + "loss": -0.0284, + "reward": 1.1335679292678833, + "reward_std": 0.12174471095204353, + "rewards/mrr_reward": 0.24037698283791542, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7613266557455063, + "step": 709 + }, + { + "clip_ratio": 0.0, + "completion_length": 429.578125, + "epoch": 0.568, + "grad_norm": 0.0448368564248085, + "kl": 0.011203765869140625, + "learning_rate": 4.224272965777326e-06, + "loss": 0.0051, + "reward": 1.1839303076267242, + "reward_std": 0.22850197553634644, + "rewards/mrr_reward": 0.2795758992433548, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.843983381986618, + "step": 710 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.96875, + "epoch": 0.5688, + "grad_norm": 0.03913170099258423, + "kl": 0.009366035461425781, + "learning_rate": 4.1986554560815095e-06, + "loss": -0.0368, + "reward": 1.1309897899627686, + "reward_std": 0.11143102683126926, + "rewards/mrr_reward": 0.22149058431386948, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8185580670833588, + "step": 711 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.046875, + "epoch": 0.5696, + "grad_norm": 0.03881422057747841, + "kl": 0.009281158447265625, + "learning_rate": 4.173095203314241e-06, + "loss": -0.0238, + "reward": 1.0363554060459137, + "reward_std": 0.12483402527868748, + "rewards/mrr_reward": 0.1456535290926695, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7772215157747269, + "step": 712 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.859375, + "epoch": 0.5704, + "grad_norm": 0.04099239408969879, + "kl": 0.008396148681640625, + "learning_rate": 4.1475924597449025e-06, + "loss": -0.0137, + "reward": 1.1610506176948547, + "reward_std": 0.22206255793571472, + "rewards/mrr_reward": 0.259381216019392, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8241281360387802, + "step": 713 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.984375, + "epoch": 0.5712, + "grad_norm": 0.05151242017745972, + "kl": 0.011735916137695312, + "learning_rate": 4.12214747707527e-06, + "loss": -0.009, + "reward": 1.273858591914177, + "reward_std": 0.23207318596541882, + "rewards/mrr_reward": 0.37640749476850033, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8406425565481186, + "step": 714 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.3125, + "epoch": 0.572, + "grad_norm": 0.03908145800232887, + "kl": 0.008737564086914062, + "learning_rate": 4.096760506437057e-06, + "loss": -0.0372, + "reward": 1.262620210647583, + "reward_std": 0.19197899289429188, + "rewards/mrr_reward": 0.33578869700431824, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8300645500421524, + "step": 715 + }, + { + "clip_ratio": 0.0, + "completion_length": 489.90625, + "epoch": 0.5728, + "grad_norm": 0.03428515046834946, + "kl": 0.007670402526855469, + "learning_rate": 4.071431798389408e-06, + "loss": -0.0106, + "reward": 1.1059209406375885, + "reward_std": 0.17448805645108223, + "rewards/mrr_reward": 0.18847966939210892, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8074686527252197, + "step": 716 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.265625, + "epoch": 0.5736, + "grad_norm": 0.04790130630135536, + "kl": 0.009938240051269531, + "learning_rate": 4.046161602916453e-06, + "loss": -0.0354, + "reward": 1.0969393253326416, + "reward_std": 0.16678292863070965, + "rewards/mrr_reward": 0.20392486453056335, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7627447545528412, + "step": 717 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.375, + "epoch": 0.5744, + "grad_norm": 0.043300457298755646, + "kl": 0.012233734130859375, + "learning_rate": 4.020950169424815e-06, + "loss": -0.0072, + "reward": 1.088065654039383, + "reward_std": 0.1905173622071743, + "rewards/mrr_reward": 0.18054936081171036, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8145024180412292, + "step": 718 + }, + { + "clip_ratio": 0.0, + "completion_length": 486.21875, + "epoch": 0.5752, + "grad_norm": 0.03566781058907509, + "kl": 0.007656097412109375, + "learning_rate": 3.9957977467411615e-06, + "loss": -0.0198, + "reward": 1.3303083777427673, + "reward_std": 0.1124997977167368, + "rewards/mrr_reward": 0.3919084668159485, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8514484316110611, + "step": 719 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.0, + "epoch": 0.576, + "grad_norm": 0.03936339169740677, + "kl": 0.007256507873535156, + "learning_rate": 3.970704583109755e-06, + "loss": -0.0184, + "reward": 1.2179461419582367, + "reward_std": 0.19140197336673737, + "rewards/mrr_reward": 0.29061879590153694, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8452388942241669, + "step": 720 + }, + { + "clip_ratio": 0.0, + "completion_length": 482.640625, + "epoch": 0.5768, + "grad_norm": 0.040819380432367325, + "kl": 0.008708953857421875, + "learning_rate": 3.945670926189987e-06, + "loss": -0.0032, + "reward": 1.363356113433838, + "reward_std": 0.29930396378040314, + "rewards/mrr_reward": 0.4476066455245018, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8511701226234436, + "step": 721 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.296875, + "epoch": 0.5776, + "grad_norm": 0.03900689631700516, + "kl": 0.009435653686523438, + "learning_rate": 3.920697023053949e-06, + "loss": -0.0039, + "reward": 1.103367805480957, + "reward_std": 0.08733582124114037, + "rewards/mrr_reward": 0.169921875, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8520613461732864, + "step": 722 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.796875, + "epoch": 0.5784, + "grad_norm": 0.04251600429415703, + "kl": 0.007720947265625, + "learning_rate": 3.895783120183975e-06, + "loss": 0.0037, + "reward": 1.167445570230484, + "reward_std": 0.14570345729589462, + "rewards/mrr_reward": 0.2338479682803154, + "rewards/rank_answer_foramt_reward": 0.953125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8759585320949554, + "step": 723 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.875, + "epoch": 0.5792, + "grad_norm": 0.038369178771972656, + "kl": 0.00865936279296875, + "learning_rate": 3.8709294634702374e-06, + "loss": 0.0008, + "reward": 1.1614885032176971, + "reward_std": 0.1151402248069644, + "rewards/mrr_reward": 0.22297247499227524, + "rewards/rank_answer_foramt_reward": 0.984375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.875237762928009, + "step": 724 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.75, + "epoch": 0.58, + "grad_norm": 0.04544536769390106, + "kl": 0.009899139404296875, + "learning_rate": 3.846136298208285e-06, + "loss": -0.0184, + "reward": 1.1653823256492615, + "reward_std": 0.3885572552680969, + "rewards/mrr_reward": 0.3096292167901993, + "rewards/rank_answer_foramt_reward": 0.833984375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7826442420482635, + "step": 725 + }, + { + "clip_ratio": 0.0, + "completion_length": 476.375, + "epoch": 0.5808, + "grad_norm": 0.04791020229458809, + "kl": 0.007744789123535156, + "learning_rate": 3.821403869096658e-06, + "loss": -0.0082, + "reward": 1.1828205287456512, + "reward_std": 0.23169685155153275, + "rewards/mrr_reward": 0.2780692037194967, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8256549388170242, + "step": 726 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.078125, + "epoch": 0.5816, + "grad_norm": 0.04281622916460037, + "kl": 0.009065628051757812, + "learning_rate": 3.7967324202344433e-06, + "loss": -0.0319, + "reward": 1.0675395727157593, + "reward_std": 0.16223911754786968, + "rewards/mrr_reward": 0.153949661180377, + "rewards/rank_answer_foramt_reward": 0.94140625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8270478695631027, + "step": 727 + }, + { + "clip_ratio": 0.0, + "completion_length": 470.640625, + "epoch": 0.5824, + "grad_norm": 0.03857819736003876, + "kl": 0.007999420166015625, + "learning_rate": 3.772122195118877e-06, + "loss": -0.0071, + "reward": 1.100866436958313, + "reward_std": 0.09526841063052416, + "rewards/mrr_reward": 0.1559399850666523, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8770851939916611, + "step": 728 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.203125, + "epoch": 0.5832, + "grad_norm": 0.04351005703210831, + "kl": 0.00836944580078125, + "learning_rate": 3.747573436642952e-06, + "loss": 0.0079, + "reward": 1.080918699502945, + "reward_std": 0.07267094915732741, + "rewards/mrr_reward": 0.13717758283019066, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.887165293097496, + "step": 729 + }, + { + "clip_ratio": 0.0, + "completion_length": 497.921875, + "epoch": 0.584, + "grad_norm": 0.03764002025127411, + "kl": 0.007304191589355469, + "learning_rate": 3.723086387092997e-06, + "loss": 0.0091, + "reward": 1.173829346895218, + "reward_std": 0.16954005137085915, + "rewards/mrr_reward": 0.23038814216852188, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8862564116716385, + "step": 730 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.609375, + "epoch": 0.5848, + "grad_norm": 0.04356111213564873, + "kl": 0.011903762817382812, + "learning_rate": 3.6986612881463114e-06, + "loss": -0.0154, + "reward": 1.092040315270424, + "reward_std": 0.22640430741012096, + "rewards/mrr_reward": 0.2199590802192688, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7559514939785004, + "step": 731 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.296875, + "epoch": 0.5856, + "grad_norm": 0.04148973152041435, + "kl": 0.008152008056640625, + "learning_rate": 3.674298380868756e-06, + "loss": -0.0105, + "reward": 1.0901039838790894, + "reward_std": 0.1490055676549673, + "rewards/mrr_reward": 0.1813926100730896, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.82398322224617, + "step": 732 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.015625, + "epoch": 0.5864, + "grad_norm": 0.04152887687087059, + "kl": 0.009691238403320312, + "learning_rate": 3.649997905712396e-06, + "loss": -0.0123, + "reward": 1.1922140419483185, + "reward_std": 0.1852929126471281, + "rewards/mrr_reward": 0.2508494630455971, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8799635022878647, + "step": 733 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.015625, + "epoch": 0.5872, + "grad_norm": 0.03897524252533913, + "kl": 0.010408401489257812, + "learning_rate": 3.625760102513103e-06, + "loss": -0.0037, + "reward": 1.1527060270309448, + "reward_std": 0.18498503416776657, + "rewards/mrr_reward": 0.23213045671582222, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8599352687597275, + "step": 734 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.640625, + "epoch": 0.588, + "grad_norm": 0.0379464253783226, + "kl": 0.008158683776855469, + "learning_rate": 3.601585210488218e-06, + "loss": -0.0026, + "reward": 1.1971299052238464, + "reward_std": 0.06598696298897266, + "rewards/mrr_reward": 0.2490451280027628, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.9022810012102127, + "step": 735 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.484375, + "epoch": 0.5888, + "grad_norm": 0.19841361045837402, + "kl": 0.03487586975097656, + "learning_rate": 3.5774734682341563e-06, + "loss": -0.0105, + "reward": 1.2799700498580933, + "reward_std": 0.21030810847878456, + "rewards/mrr_reward": 0.36318204924464226, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8640826940536499, + "step": 736 + }, + { + "clip_ratio": 0.0, + "completion_length": 477.125, + "epoch": 0.5896, + "grad_norm": 0.04086919128894806, + "kl": 0.008722305297851562, + "learning_rate": 3.5534251137240883e-06, + "loss": -0.0162, + "reward": 1.1706961393356323, + "reward_std": 0.26971735805273056, + "rewards/mrr_reward": 0.25814731419086456, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8746743649244308, + "step": 737 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.046875, + "epoch": 0.5904, + "grad_norm": 0.04114146903157234, + "kl": 0.008604049682617188, + "learning_rate": 3.5294403843055604e-06, + "loss": 0.0248, + "reward": 1.1644734144210815, + "reward_std": 0.1057867594063282, + "rewards/mrr_reward": 0.2209201343357563, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.9080804586410522, + "step": 738 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.828125, + "epoch": 0.5912, + "grad_norm": 0.04274991899728775, + "kl": 0.009336471557617188, + "learning_rate": 3.505519516698165e-06, + "loss": -0.0115, + "reward": 1.101497322320938, + "reward_std": 0.1800498366355896, + "rewards/mrr_reward": 0.18911830335855484, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8565816581249237, + "step": 739 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.671875, + "epoch": 0.592, + "grad_norm": 0.041656360030174255, + "kl": 0.007590293884277344, + "learning_rate": 3.4816627469912147e-06, + "loss": -0.0137, + "reward": 1.1390126645565033, + "reward_std": 0.14937850926071405, + "rewards/mrr_reward": 0.21043526753783226, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8548864126205444, + "step": 740 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.5, + "epoch": 0.5928, + "grad_norm": 0.0403144545853138, + "kl": 0.009930610656738281, + "learning_rate": 3.4578703106413903e-06, + "loss": -0.032, + "reward": 1.118723675608635, + "reward_std": 0.194508895277977, + "rewards/mrr_reward": 0.21798115223646164, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8096006661653519, + "step": 741 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.796875, + "epoch": 0.5936, + "grad_norm": 0.03830445185303688, + "kl": 0.0073337554931640625, + "learning_rate": 3.4341424424704373e-06, + "loss": 0.0001, + "reward": 1.1513462364673615, + "reward_std": 0.14330180920660496, + "rewards/mrr_reward": 0.24214409850537777, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8450015187263489, + "step": 742 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.875, + "epoch": 0.5944, + "grad_norm": 0.04006378352642059, + "kl": 0.0078029632568359375, + "learning_rate": 3.4104793766628307e-06, + "loss": -0.0074, + "reward": 1.165648490190506, + "reward_std": 0.19168706238269806, + "rewards/mrr_reward": 0.2604600749909878, + "rewards/rank_answer_foramt_reward": 0.873046875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8699481040239334, + "step": 743 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.71875, + "epoch": 0.5952, + "grad_norm": 0.043256212025880814, + "kl": 0.009433746337890625, + "learning_rate": 3.3868813467634833e-06, + "loss": -0.0223, + "reward": 1.1679503321647644, + "reward_std": 0.2485765404999256, + "rewards/mrr_reward": 0.3030258007347584, + "rewards/rank_answer_foramt_reward": 0.849609375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.78699891269207, + "step": 744 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.796875, + "epoch": 0.596, + "grad_norm": 0.04080253094434738, + "kl": 0.008932113647460938, + "learning_rate": 3.3633485856754143e-06, + "loss": -0.005, + "reward": 1.1107763051986694, + "reward_std": 0.13455253094434738, + "rewards/mrr_reward": 0.1878348346799612, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8378077745437622, + "step": 745 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.21875, + "epoch": 0.5968, + "grad_norm": 0.04848048463463783, + "kl": 0.00908660888671875, + "learning_rate": 3.3398813256574847e-06, + "loss": -0.0434, + "reward": 1.212068885564804, + "reward_std": 0.2859114482998848, + "rewards/mrr_reward": 0.3225632533431053, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7872682809829712, + "step": 746 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.984375, + "epoch": 0.5976, + "grad_norm": 0.038503892719745636, + "kl": 0.008440017700195312, + "learning_rate": 3.316479798322072e-06, + "loss": -0.0188, + "reward": 1.080864131450653, + "reward_std": 0.11961814388632774, + "rewards/mrr_reward": 0.1450644824653864, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8709126263856888, + "step": 747 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.59375, + "epoch": 0.5984, + "grad_norm": 0.04509103670716286, + "kl": 0.009431838989257812, + "learning_rate": 3.2931442346328e-06, + "loss": 0.0045, + "reward": 1.1249631643295288, + "reward_std": 0.18062920682132244, + "rewards/mrr_reward": 0.2126116044819355, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8193891793489456, + "step": 748 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.359375, + "epoch": 0.5992, + "grad_norm": 0.03829497843980789, + "kl": 0.009325027465820312, + "learning_rate": 3.2698748649022693e-06, + "loss": -0.031, + "reward": 1.3105561435222626, + "reward_std": 0.24571607820689678, + "rewards/mrr_reward": 0.4127728193998337, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7752430289983749, + "step": 749 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.796875, + "epoch": 0.6, + "grad_norm": 0.0403105802834034, + "kl": 0.008958816528320312, + "learning_rate": 3.2466719187897555e-06, + "loss": -0.0318, + "reward": 1.3205139636993408, + "reward_std": 0.30093929544091225, + "rewards/mrr_reward": 0.4177951477468014, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8058240860700607, + "step": 750 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.578125, + "epoch": 0.6008, + "grad_norm": 0.04085452854633331, + "kl": 0.008657455444335938, + "learning_rate": 3.223535625298979e-06, + "loss": 0.0034, + "reward": 1.2626567482948303, + "reward_std": 0.163003820925951, + "rewards/mrr_reward": 0.35236856341362, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8014176934957504, + "step": 751 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.0, + "epoch": 0.6016, + "grad_norm": 0.04556753858923912, + "kl": 0.01016998291015625, + "learning_rate": 3.200466212775808e-06, + "loss": 0.0427, + "reward": 1.1225496679544449, + "reward_std": 0.1800556741654873, + "rewards/mrr_reward": 0.20434647798538208, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8664180636405945, + "step": 752 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.28125, + "epoch": 0.6024, + "grad_norm": 0.03804047778248787, + "kl": 0.0096435546875, + "learning_rate": 3.1774639089060364e-06, + "loss": -0.017, + "reward": 1.097492665052414, + "reward_std": 0.14176994934678078, + "rewards/mrr_reward": 0.1837735567241907, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8235331773757935, + "step": 753 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.125, + "epoch": 0.6032, + "grad_norm": 0.04436762258410454, + "kl": 0.008335113525390625, + "learning_rate": 3.1545289407131128e-06, + "loss": -0.03, + "reward": 1.258377730846405, + "reward_std": 0.2980855964124203, + "rewards/mrr_reward": 0.34546130523085594, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8640695363283157, + "step": 754 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.09375, + "epoch": 0.604, + "grad_norm": 0.04474704712629318, + "kl": 0.01195526123046875, + "learning_rate": 3.1316615345559188e-06, + "loss": -0.0335, + "reward": 1.1586785316467285, + "reward_std": 0.217840775847435, + "rewards/mrr_reward": 0.26437871530652046, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.817421168088913, + "step": 755 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.09375, + "epoch": 0.6048, + "grad_norm": 0.0435645654797554, + "kl": 0.008356094360351562, + "learning_rate": 3.108861916126518e-06, + "loss": -0.0303, + "reward": 1.2183335721492767, + "reward_std": 0.18061750754714012, + "rewards/mrr_reward": 0.3272135518491268, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7784887701272964, + "step": 756 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.734375, + "epoch": 0.6056, + "grad_norm": 0.041368041187524796, + "kl": 0.009997367858886719, + "learning_rate": 3.086130310447937e-06, + "loss": -0.0175, + "reward": 1.0250304490327835, + "reward_std": 0.17961598467081785, + "rewards/mrr_reward": 0.14366319216787815, + "rewards/rank_answer_foramt_reward": 0.822265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8563565909862518, + "step": 757 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.0, + "epoch": 0.6064, + "grad_norm": 0.04095418006181717, + "kl": 0.0072650909423828125, + "learning_rate": 3.063466941871952e-06, + "loss": -0.0173, + "reward": 1.1504963636398315, + "reward_std": 0.12690221052616835, + "rewards/mrr_reward": 0.2360739130526781, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7983209490776062, + "step": 758 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.90625, + "epoch": 0.6072, + "grad_norm": 0.04432576522231102, + "kl": 0.007794380187988281, + "learning_rate": 3.040872034076857e-06, + "loss": 0.0107, + "reward": 1.1860106885433197, + "reward_std": 0.13013709522783756, + "rewards/mrr_reward": 0.2523561418056488, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8780841082334518, + "step": 759 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.203125, + "epoch": 0.608, + "grad_norm": 0.039851389825344086, + "kl": 0.008897781372070312, + "learning_rate": 3.0183458100652752e-06, + "loss": -0.0125, + "reward": 1.1152794659137726, + "reward_std": 0.19540764205157757, + "rewards/mrr_reward": 0.19703001156449318, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8724178522825241, + "step": 760 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.921875, + "epoch": 0.6088, + "grad_norm": 0.040054626762866974, + "kl": 0.010467529296875, + "learning_rate": 2.9958884921619368e-06, + "loss": -0.0206, + "reward": 1.1841309070587158, + "reward_std": 0.16952326335012913, + "rewards/mrr_reward": 0.2834821455180645, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8073635548353195, + "step": 761 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.390625, + "epoch": 0.6096, + "grad_norm": 0.043995246291160583, + "kl": 0.01117706298828125, + "learning_rate": 2.9735003020115095e-06, + "loss": 0.005, + "reward": 1.2519162595272064, + "reward_std": 0.33760569244623184, + "rewards/mrr_reward": 0.3831969276070595, + "rewards/rank_answer_foramt_reward": 0.806640625, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8492796123027802, + "step": 762 + }, + { + "clip_ratio": 0.0, + "completion_length": 478.546875, + "epoch": 0.6104, + "grad_norm": 0.04608379304409027, + "kl": 0.009157180786132812, + "learning_rate": 2.9511814605763855e-06, + "loss": 0.0333, + "reward": 1.1024558991193771, + "reward_std": 0.21346063539385796, + "rewards/mrr_reward": 0.21845859102904797, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.7666701674461365, + "step": 763 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.421875, + "epoch": 0.6112, + "grad_norm": 0.04478094354271889, + "kl": 0.00982666015625, + "learning_rate": 2.9289321881345257e-06, + "loss": -0.0129, + "reward": 1.0492956936359406, + "reward_std": 0.13610859028995037, + "rewards/mrr_reward": 0.1452070940285921, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.802162379026413, + "step": 764 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.203125, + "epoch": 0.612, + "grad_norm": 0.05722177028656006, + "kl": 0.015047073364257812, + "learning_rate": 2.9067527042772638e-06, + "loss": -0.0044, + "reward": 1.134572982788086, + "reward_std": 0.20083492621779442, + "rewards/mrr_reward": 0.2158978171646595, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8541764914989471, + "step": 765 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.390625, + "epoch": 0.6128, + "grad_norm": 0.0441712960600853, + "kl": 0.011053085327148438, + "learning_rate": 2.884643227907147e-06, + "loss": -0.0469, + "reward": 1.0365698337554932, + "reward_std": 0.14963462762534618, + "rewards/mrr_reward": 0.12454117089509964, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8457542955875397, + "step": 766 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.1875, + "epoch": 0.6136, + "grad_norm": 0.03963298723101616, + "kl": 0.008359909057617188, + "learning_rate": 2.8626039772357884e-06, + "loss": 0.0217, + "reward": 1.2510823607444763, + "reward_std": 0.14392628520727158, + "rewards/mrr_reward": 0.3217137847095728, + "rewards/rank_answer_foramt_reward": 0.984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8397057503461838, + "step": 767 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.84375, + "epoch": 0.6144, + "grad_norm": 0.04018845781683922, + "kl": 0.007063865661621094, + "learning_rate": 2.840635169781688e-06, + "loss": 0.004, + "reward": 1.1201017796993256, + "reward_std": 0.1913389079272747, + "rewards/mrr_reward": 0.21623263508081436, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8347004055976868, + "step": 768 + }, + { + "clip_ratio": 0.0, + "completion_length": 490.125, + "epoch": 0.6152, + "grad_norm": 0.040367186069488525, + "kl": 0.008176803588867188, + "learning_rate": 2.8187370223681134e-06, + "loss": 0.0112, + "reward": 1.131416529417038, + "reward_std": 0.19026605784893036, + "rewards/mrr_reward": 0.21690849214792252, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8493613749742508, + "step": 769 + }, + { + "clip_ratio": 0.0, + "completion_length": 495.46875, + "epoch": 0.616, + "grad_norm": 0.04241256043314934, + "kl": 0.008129119873046875, + "learning_rate": 2.796909751120931e-06, + "loss": 0.0415, + "reward": 1.2324725687503815, + "reward_std": 0.2082219198346138, + "rewards/mrr_reward": 0.28964534401893616, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8707241117954254, + "step": 770 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.234375, + "epoch": 0.6168, + "grad_norm": 0.039128441363573074, + "kl": 0.007460594177246094, + "learning_rate": 2.7751535714665025e-06, + "loss": 0.0041, + "reward": 1.225624144077301, + "reward_std": 0.14080518763512373, + "rewards/mrr_reward": 0.296521570533514, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8564777374267578, + "step": 771 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.59375, + "epoch": 0.6176, + "grad_norm": 0.03881870210170746, + "kl": 0.008975982666015625, + "learning_rate": 2.7534686981295335e-06, + "loss": -0.0376, + "reward": 1.0894058048725128, + "reward_std": 0.08322741370648146, + "rewards/mrr_reward": 0.15754588693380356, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.845302164554596, + "step": 772 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.140625, + "epoch": 0.6184, + "grad_norm": 0.04274870082736015, + "kl": 0.008192062377929688, + "learning_rate": 2.7318553451309726e-06, + "loss": 0.0279, + "reward": 1.1450502276420593, + "reward_std": 0.13760012108832598, + "rewards/mrr_reward": 0.20704985596239567, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8639096468687057, + "step": 773 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.703125, + "epoch": 0.6192, + "grad_norm": 0.041456714272499084, + "kl": 0.010091781616210938, + "learning_rate": 2.7103137257858867e-06, + "loss": -0.0241, + "reward": 1.1570966690778732, + "reward_std": 0.2309955172240734, + "rewards/mrr_reward": 0.2663938533514738, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.7850367873907089, + "step": 774 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.28125, + "epoch": 0.62, + "grad_norm": 0.03964508697390556, + "kl": 0.00922393798828125, + "learning_rate": 2.6888440527013595e-06, + "loss": -0.018, + "reward": 1.3120769262313843, + "reward_std": 0.2254694253206253, + "rewards/mrr_reward": 0.39821428060531616, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8044368922710419, + "step": 775 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.28125, + "epoch": 0.6208, + "grad_norm": 0.03911551460623741, + "kl": 0.008837699890136719, + "learning_rate": 2.667446537774402e-06, + "loss": -0.0125, + "reward": 1.1185062527656555, + "reward_std": 0.17317464342340827, + "rewards/mrr_reward": 0.2100074477493763, + "rewards/rank_answer_foramt_reward": 0.9140625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.854589119553566, + "step": 776 + }, + { + "clip_ratio": 0.0, + "completion_length": 476.609375, + "epoch": 0.6216, + "grad_norm": 0.03766229748725891, + "kl": 0.007938385009765625, + "learning_rate": 2.646121392189841e-06, + "loss": -0.0097, + "reward": 1.310092717409134, + "reward_std": 0.1681712232530117, + "rewards/mrr_reward": 0.36098089441657066, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8760963976383209, + "step": 777 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.25, + "epoch": 0.6224, + "grad_norm": 0.03969811275601387, + "kl": 0.008672714233398438, + "learning_rate": 2.624868826418262e-06, + "loss": -0.0219, + "reward": 1.174126923084259, + "reward_std": 0.22464322298765182, + "rewards/mrr_reward": 0.2639323025941849, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8753529489040375, + "step": 778 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.4375, + "epoch": 0.6232, + "grad_norm": 0.04108897224068642, + "kl": 0.008535385131835938, + "learning_rate": 2.603689050213902e-06, + "loss": 0.0067, + "reward": 1.0609664767980576, + "reward_std": 0.15589328296482563, + "rewards/mrr_reward": 0.156274801120162, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8508649319410324, + "step": 779 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.1875, + "epoch": 0.624, + "grad_norm": 0.04128960147500038, + "kl": 0.009108543395996094, + "learning_rate": 2.5825822726126095e-06, + "loss": -0.0168, + "reward": 1.2373025119304657, + "reward_std": 0.21737964265048504, + "rewards/mrr_reward": 0.3343130014836788, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8320348858833313, + "step": 780 + }, + { + "clip_ratio": 0.0, + "completion_length": 436.640625, + "epoch": 0.6248, + "grad_norm": 0.04342431575059891, + "kl": 0.010232925415039062, + "learning_rate": 2.561548701929749e-06, + "loss": -0.0274, + "reward": 1.1886743009090424, + "reward_std": 0.1972416564822197, + "rewards/mrr_reward": 0.27752356603741646, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8470001071691513, + "step": 781 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.578125, + "epoch": 0.6256, + "grad_norm": 0.04006101191043854, + "kl": 0.010242462158203125, + "learning_rate": 2.5405885457581793e-06, + "loss": -0.0326, + "reward": 1.24961519241333, + "reward_std": 0.2031826265156269, + "rewards/mrr_reward": 0.3277033753693104, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8639844059944153, + "step": 782 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.75, + "epoch": 0.6264, + "grad_norm": 0.040887244045734406, + "kl": 0.00983428955078125, + "learning_rate": 2.5197020109661775e-06, + "loss": -0.0045, + "reward": 1.0167758166790009, + "reward_std": 0.1663502948358655, + "rewards/mrr_reward": 0.12349330447614193, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8182446956634521, + "step": 783 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.375, + "epoch": 0.6272, + "grad_norm": 0.047615982592105865, + "kl": 0.010096549987792969, + "learning_rate": 2.4988893036954045e-06, + "loss": 0.0403, + "reward": 1.1557523906230927, + "reward_std": 0.13815788738429546, + "rewards/mrr_reward": 0.22870784625411034, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8795382529497147, + "step": 784 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.484375, + "epoch": 0.628, + "grad_norm": 0.04078188166022301, + "kl": 0.008787155151367188, + "learning_rate": 2.4781506293588876e-06, + "loss": -0.0127, + "reward": 1.0591696500778198, + "reward_std": 0.15905814059078693, + "rewards/mrr_reward": 0.14029017835855484, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8743270188570023, + "step": 785 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.828125, + "epoch": 0.6288, + "grad_norm": 0.0395292192697525, + "kl": 0.009163856506347656, + "learning_rate": 2.4574861926389615e-06, + "loss": 0.0017, + "reward": 1.1320269703865051, + "reward_std": 0.11425493052229285, + "rewards/mrr_reward": 0.1963045708835125, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.862866073846817, + "step": 786 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.953125, + "epoch": 0.6296, + "grad_norm": 0.050643905997276306, + "kl": 0.00799560546875, + "learning_rate": 2.436896197485282e-06, + "loss": -0.0253, + "reward": 1.1131080090999603, + "reward_std": 0.17427906021475792, + "rewards/mrr_reward": 0.19312996231019497, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8698435872793198, + "step": 787 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.484375, + "epoch": 0.6304, + "grad_norm": 0.04102494940161705, + "kl": 0.00933074951171875, + "learning_rate": 2.4163808471127815e-06, + "loss": -0.0149, + "reward": 1.060589388012886, + "reward_std": 0.14299902319908142, + "rewards/mrr_reward": 0.18007192388176918, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.7268284261226654, + "step": 788 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.515625, + "epoch": 0.6312, + "grad_norm": 0.04130149260163307, + "kl": 0.008101463317871094, + "learning_rate": 2.395940343999691e-06, + "loss": -0.0096, + "reward": 1.1629635095596313, + "reward_std": 0.1361176036298275, + "rewards/mrr_reward": 0.24042658135294914, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8717381805181503, + "step": 789 + }, + { + "clip_ratio": 0.0, + "completion_length": 473.515625, + "epoch": 0.632, + "grad_norm": 0.04310329630970955, + "kl": 0.0070438385009765625, + "learning_rate": 2.37557488988552e-06, + "loss": 0.0486, + "reward": 1.1800865232944489, + "reward_std": 0.16778289526700974, + "rewards/mrr_reward": 0.24763764813542366, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8392744958400726, + "step": 790 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.71875, + "epoch": 0.6328, + "grad_norm": 0.22937795519828796, + "kl": 0.05006980895996094, + "learning_rate": 2.3552846857690847e-06, + "loss": -0.0004, + "reward": 1.1795984208583832, + "reward_std": 0.18312821350991726, + "rewards/mrr_reward": 0.26884301006793976, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8458021879196167, + "step": 791 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.046875, + "epoch": 0.6336, + "grad_norm": 0.04103982821106911, + "kl": 0.0069122314453125, + "learning_rate": 2.335069931906503e-06, + "loss": -0.0263, + "reward": 1.0953315794467926, + "reward_std": 0.09945772588253021, + "rewards/mrr_reward": 0.15505332499742508, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.862999752163887, + "step": 792 + }, + { + "clip_ratio": 0.0, + "completion_length": 477.0625, + "epoch": 0.6344, + "grad_norm": 0.036689817905426025, + "kl": 0.0096435546875, + "learning_rate": 2.3149308278092343e-06, + "loss": -0.0214, + "reward": 1.1218132674694061, + "reward_std": 0.20247724652290344, + "rewards/mrr_reward": 0.22500619292259216, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8074406385421753, + "step": 793 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.4375, + "epoch": 0.6352, + "grad_norm": 0.039789628237485886, + "kl": 0.007559776306152344, + "learning_rate": 2.2948675722421086e-06, + "loss": -0.026, + "reward": 1.1090701520442963, + "reward_std": 0.21167061291635036, + "rewards/mrr_reward": 0.22198661416769028, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7564912438392639, + "step": 794 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.609375, + "epoch": 0.636, + "grad_norm": 0.03997084125876427, + "kl": 0.008802413940429688, + "learning_rate": 2.2748803632213556e-06, + "loss": -0.0237, + "reward": 1.1740768551826477, + "reward_std": 0.14025119692087173, + "rewards/mrr_reward": 0.24523189291357994, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8420254588127136, + "step": 795 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.984375, + "epoch": 0.6368, + "grad_norm": 0.04768791422247887, + "kl": 0.01158905029296875, + "learning_rate": 2.254969398012663e-06, + "loss": 0.0419, + "reward": 1.158288836479187, + "reward_std": 0.16161417961120605, + "rewards/mrr_reward": 0.241412453353405, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8604445457458496, + "step": 796 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.140625, + "epoch": 0.6376, + "grad_norm": 0.038609255105257034, + "kl": 0.007890701293945312, + "learning_rate": 2.2351348731292134e-06, + "loss": -0.0108, + "reward": 1.0548525154590607, + "reward_std": 0.22406835108995438, + "rewards/mrr_reward": 0.17250124365091324, + "rewards/rank_answer_foramt_reward": 0.88671875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8026978820562363, + "step": 797 + }, + { + "clip_ratio": 0.0, + "completion_length": 481.953125, + "epoch": 0.6384, + "grad_norm": 0.042043354362249374, + "kl": 0.008205413818359375, + "learning_rate": 2.215376984329767e-06, + "loss": 0.0081, + "reward": 1.1629877984523773, + "reward_std": 0.14389540813863277, + "rewards/mrr_reward": 0.22026289999485016, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8977576941251755, + "step": 798 + }, + { + "clip_ratio": 0.0, + "completion_length": 470.8125, + "epoch": 0.6392, + "grad_norm": 0.04208019748330116, + "kl": 0.0068149566650390625, + "learning_rate": 2.195695926616702e-06, + "loss": 0.0019, + "reward": 1.1885976493358612, + "reward_std": 0.19713237322866917, + "rewards/mrr_reward": 0.254371277987957, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.9013012945652008, + "step": 799 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.109375, + "epoch": 0.64, + "grad_norm": 0.04990691691637039, + "kl": 0.0174713134765625, + "learning_rate": 2.1760918942341193e-06, + "loss": -0.0155, + "reward": 1.2124728560447693, + "reward_std": 0.20276020467281342, + "rewards/mrr_reward": 0.34215650893747807, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7349783107638359, + "step": 800 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.203125, + "epoch": 0.6408, + "grad_norm": 0.045392923057079315, + "kl": 0.007555961608886719, + "learning_rate": 2.1565650806658977e-06, + "loss": -0.0492, + "reward": 1.1203849911689758, + "reward_std": 0.15124214440584183, + "rewards/mrr_reward": 0.2108196932822466, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7972739636898041, + "step": 801 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.96875, + "epoch": 0.6416, + "grad_norm": 0.0389019213616848, + "kl": 0.008981704711914062, + "learning_rate": 2.1371156786338108e-06, + "loss": -0.0184, + "reward": 1.1323182135820389, + "reward_std": 0.20558064430952072, + "rewards/mrr_reward": 0.2269345335662365, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8471024334430695, + "step": 802 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.984375, + "epoch": 0.6424, + "grad_norm": 0.042265504598617554, + "kl": 0.007932662963867188, + "learning_rate": 2.117743880095601e-06, + "loss": -0.0239, + "reward": 1.2599590122699738, + "reward_std": 0.2774098366498947, + "rewards/mrr_reward": 0.35645462572574615, + "rewards/rank_answer_foramt_reward": 0.927734375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8335951864719391, + "step": 803 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.375, + "epoch": 0.6432, + "grad_norm": 0.057718675583601, + "kl": 0.01120758056640625, + "learning_rate": 2.098449876243096e-06, + "loss": 0.0413, + "reward": 1.4104988873004913, + "reward_std": 0.2141698058694601, + "rewards/mrr_reward": 0.5004154220223427, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8144690990447998, + "step": 804 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.75, + "epoch": 0.644, + "grad_norm": 0.04124637320637703, + "kl": 0.010328292846679688, + "learning_rate": 2.0792338575003303e-06, + "loss": -0.0279, + "reward": 1.0802222043275833, + "reward_std": 0.219939723610878, + "rewards/mrr_reward": 0.20515252836048603, + "rewards/rank_answer_foramt_reward": 0.88671875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7806323915719986, + "step": 805 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.265625, + "epoch": 0.6448, + "grad_norm": 0.04474443942308426, + "kl": 0.008333206176757812, + "learning_rate": 2.0600960135216463e-06, + "loss": -0.015, + "reward": 1.1868183612823486, + "reward_std": 0.1631349828094244, + "rewards/mrr_reward": 0.2634982690215111, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8662989884614944, + "step": 806 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.796875, + "epoch": 0.6456, + "grad_norm": 0.03788938745856285, + "kl": 0.0070400238037109375, + "learning_rate": 2.041036533189842e-06, + "loss": -0.0001, + "reward": 1.2256155908107758, + "reward_std": 0.18802989460527897, + "rewards/mrr_reward": 0.29671378806233406, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8441506326198578, + "step": 807 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.46875, + "epoch": 0.6464, + "grad_norm": 0.03982880339026451, + "kl": 0.007785797119140625, + "learning_rate": 2.022055604614289e-06, + "loss": 0.007, + "reward": 1.1301990747451782, + "reward_std": 0.16357433795928955, + "rewards/mrr_reward": 0.2091703936457634, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8534960001707077, + "step": 808 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.59375, + "epoch": 0.6472, + "grad_norm": 0.0403585247695446, + "kl": 0.009008407592773438, + "learning_rate": 2.0031534151290944e-06, + "loss": 0.0021, + "reward": 1.1528845131397247, + "reward_std": 0.1795931002125144, + "rewards/mrr_reward": 0.23609252274036407, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8406573981046677, + "step": 809 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.125, + "epoch": 0.648, + "grad_norm": 0.04252356290817261, + "kl": 0.0116729736328125, + "learning_rate": 1.984330151291233e-06, + "loss": -0.0035, + "reward": 1.1698086261749268, + "reward_std": 0.22232595458626747, + "rewards/mrr_reward": 0.2710999473929405, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7878125905990601, + "step": 810 + }, + { + "clip_ratio": 0.0, + "completion_length": 466.40625, + "epoch": 0.6488, + "grad_norm": 0.04095650836825371, + "kl": 0.0076580047607421875, + "learning_rate": 1.965585998878724e-06, + "loss": 0.0121, + "reward": 1.2161291241645813, + "reward_std": 0.26305726869031787, + "rewards/mrr_reward": 0.3006200324743986, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8777853697538376, + "step": 811 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.703125, + "epoch": 0.6496, + "grad_norm": 0.04271842539310455, + "kl": 0.008337020874023438, + "learning_rate": 1.9469211428887813e-06, + "loss": -0.0074, + "reward": 1.3187182247638702, + "reward_std": 0.18177231401205063, + "rewards/mrr_reward": 0.3931051604449749, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8693408221006393, + "step": 812 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.859375, + "epoch": 0.6504, + "grad_norm": 0.04430151358246803, + "kl": 0.009272575378417969, + "learning_rate": 1.928335767535997e-06, + "loss": -0.054, + "reward": 1.2358072102069855, + "reward_std": 0.18769995868206024, + "rewards/mrr_reward": 0.3260292708873749, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8135431855916977, + "step": 813 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.84375, + "epoch": 0.6512, + "grad_norm": 0.037271056324243546, + "kl": 0.0101318359375, + "learning_rate": 1.9098300562505266e-06, + "loss": -0.0295, + "reward": 1.2297299802303314, + "reward_std": 0.15305910632014275, + "rewards/mrr_reward": 0.2948102727532387, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8604336082935333, + "step": 814 + }, + { + "clip_ratio": 0.0, + "completion_length": 420.609375, + "epoch": 0.652, + "grad_norm": 0.04709629714488983, + "kl": 0.009583473205566406, + "learning_rate": 1.8914041916762648e-06, + "loss": -0.0617, + "reward": 1.0883388370275497, + "reward_std": 0.1659074891358614, + "rewards/mrr_reward": 0.21894222125411034, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7243789285421371, + "step": 815 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.09375, + "epoch": 0.6528, + "grad_norm": 0.041499897837638855, + "kl": 0.0078277587890625, + "learning_rate": 1.8730583556690607e-06, + "loss": -0.0036, + "reward": 1.0809594094753265, + "reward_std": 0.1182434605434537, + "rewards/mrr_reward": 0.15505952574312687, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8682572394609451, + "step": 816 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.953125, + "epoch": 0.6536, + "grad_norm": 0.041228439658880234, + "kl": 0.00791168212890625, + "learning_rate": 1.8547927292949053e-06, + "loss": -0.0025, + "reward": 1.2632147371768951, + "reward_std": 0.2752368990331888, + "rewards/mrr_reward": 0.35054564103484154, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8437888473272324, + "step": 817 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.96875, + "epoch": 0.6544, + "grad_norm": 0.039352841675281525, + "kl": 0.007452964782714844, + "learning_rate": 1.8366074928281608e-06, + "loss": 0.0081, + "reward": 1.285154014825821, + "reward_std": 0.2392275594174862, + "rewards/mrr_reward": 0.35860615968704224, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8643612116575241, + "step": 818 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.359375, + "epoch": 0.6552, + "grad_norm": 0.04071607068181038, + "kl": 0.008332252502441406, + "learning_rate": 1.818502825749764e-06, + "loss": -0.0496, + "reward": 1.1540860533714294, + "reward_std": 0.1926177842542529, + "rewards/mrr_reward": 0.24485987797379494, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8392150849103928, + "step": 819 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.328125, + "epoch": 0.656, + "grad_norm": 0.038751162588596344, + "kl": 0.007927894592285156, + "learning_rate": 1.8004789067454763e-06, + "loss": -0.0364, + "reward": 1.1840001344680786, + "reward_std": 0.21113791782408953, + "rewards/mrr_reward": 0.25868676230311394, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8547612130641937, + "step": 820 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.78125, + "epoch": 0.6568, + "grad_norm": 0.04042641073465347, + "kl": 0.006926536560058594, + "learning_rate": 1.7825359137040987e-06, + "loss": 0.0111, + "reward": 1.1455277800559998, + "reward_std": 0.14714708551764488, + "rewards/mrr_reward": 0.20182912051677704, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.9007083028554916, + "step": 821 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.046875, + "epoch": 0.6576, + "grad_norm": 0.04337405040860176, + "kl": 0.013456344604492188, + "learning_rate": 1.7646740237157256e-06, + "loss": -0.0211, + "reward": 1.1473829746246338, + "reward_std": 0.1825929917395115, + "rewards/mrr_reward": 0.23546627908945084, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8454151898622513, + "step": 822 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.40625, + "epoch": 0.6584, + "grad_norm": 0.03853907063603401, + "kl": 0.008459091186523438, + "learning_rate": 1.7468934130700044e-06, + "loss": -0.0356, + "reward": 1.3643218874931335, + "reward_std": 0.2208605632185936, + "rewards/mrr_reward": 0.44823288172483444, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8033709824085236, + "step": 823 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.28125, + "epoch": 0.6592, + "grad_norm": 0.04316363483667374, + "kl": 0.009531974792480469, + "learning_rate": 1.7291942572543806e-06, + "loss": -0.0131, + "reward": 1.1022516041994095, + "reward_std": 0.2808724083006382, + "rewards/mrr_reward": 0.24019718542695045, + "rewards/rank_answer_foramt_reward": 0.875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7450985908508301, + "step": 824 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.109375, + "epoch": 0.66, + "grad_norm": 0.04379027336835861, + "kl": 0.009677886962890625, + "learning_rate": 1.7115767309523811e-06, + "loss": -0.0232, + "reward": 1.1667591333389282, + "reward_std": 0.23061896860599518, + "rewards/mrr_reward": 0.2626550104469061, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.79049052298069, + "step": 825 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.515625, + "epoch": 0.6608, + "grad_norm": 0.042437583208084106, + "kl": 0.0114288330078125, + "learning_rate": 1.6940410080418723e-06, + "loss": 0.0054, + "reward": 1.0727409720420837, + "reward_std": 0.15770739316940308, + "rewards/mrr_reward": 0.15853795036673546, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8269526362419128, + "step": 826 + }, + { + "clip_ratio": 0.0, + "completion_length": 436.453125, + "epoch": 0.6616, + "grad_norm": 0.04106762260198593, + "kl": 0.0107269287109375, + "learning_rate": 1.6765872615933676e-06, + "loss": -0.0267, + "reward": 1.1445525884628296, + "reward_std": 0.17659413255751133, + "rewards/mrr_reward": 0.2239893414080143, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8794291019439697, + "step": 827 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.0, + "epoch": 0.6624, + "grad_norm": 0.04433257132768631, + "kl": 0.008103370666503906, + "learning_rate": 1.6592156638682887e-06, + "loss": -0.0554, + "reward": 1.1460805833339691, + "reward_std": 0.14873161166906357, + "rewards/mrr_reward": 0.253608625382185, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7747729271650314, + "step": 828 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.234375, + "epoch": 0.6632, + "grad_norm": 0.03736725449562073, + "kl": 0.007901191711425781, + "learning_rate": 1.6419263863172997e-06, + "loss": -0.0225, + "reward": 1.2986692488193512, + "reward_std": 0.18618784938007593, + "rewards/mrr_reward": 0.36312004551291466, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8506224751472473, + "step": 829 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.484375, + "epoch": 0.664, + "grad_norm": 0.03575273975729942, + "kl": 0.007147789001464844, + "learning_rate": 1.6247195995785836e-06, + "loss": 0.0085, + "reward": 1.231167882680893, + "reward_std": 0.14090694999322295, + "rewards/mrr_reward": 0.2981584817171097, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8683166652917862, + "step": 830 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.75, + "epoch": 0.6648, + "grad_norm": 0.04371047392487526, + "kl": 0.008418083190917969, + "learning_rate": 1.6075954734761844e-06, + "loss": -0.0038, + "reward": 1.1615868508815765, + "reward_std": 0.243436086922884, + "rewards/mrr_reward": 0.2448226734995842, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8132294416427612, + "step": 831 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.5, + "epoch": 0.6656, + "grad_norm": 0.04143482446670532, + "kl": 0.00856781005859375, + "learning_rate": 1.5905541770183096e-06, + "loss": -0.0463, + "reward": 1.1261744499206543, + "reward_std": 0.13198063895106316, + "rewards/mrr_reward": 0.1937314048409462, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.880272313952446, + "step": 832 + }, + { + "clip_ratio": 0.0, + "completion_length": 469.09375, + "epoch": 0.6664, + "grad_norm": 0.04452883452177048, + "kl": 0.010478973388671875, + "learning_rate": 1.5735958783956795e-06, + "loss": 0.0252, + "reward": 1.1056948900222778, + "reward_std": 0.13016505632549524, + "rewards/mrr_reward": 0.17806300520896912, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.855927437543869, + "step": 833 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.671875, + "epoch": 0.6672, + "grad_norm": 0.04416726902127266, + "kl": 0.008454322814941406, + "learning_rate": 1.5567207449798517e-06, + "loss": -0.0404, + "reward": 1.2114092707633972, + "reward_std": 0.27365200221538544, + "rewards/mrr_reward": 0.30291419103741646, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.831140398979187, + "step": 834 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.296875, + "epoch": 0.668, + "grad_norm": 0.04146156832575798, + "kl": 0.007966995239257812, + "learning_rate": 1.5399289433215792e-06, + "loss": -0.0037, + "reward": 1.1683192551136017, + "reward_std": 0.22274474799633026, + "rewards/mrr_reward": 0.23329614847898483, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8959033340215683, + "step": 835 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.5625, + "epoch": 0.6688, + "grad_norm": 0.04751385748386383, + "kl": 0.009054183959960938, + "learning_rate": 1.52322063914917e-06, + "loss": -0.0242, + "reward": 1.154505506157875, + "reward_std": 0.222128264605999, + "rewards/mrr_reward": 0.2571180574595928, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7701369524002075, + "step": 836 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.96875, + "epoch": 0.6696, + "grad_norm": 0.04068367928266525, + "kl": 0.007717132568359375, + "learning_rate": 1.5065959973668355e-06, + "loss": 0.0565, + "reward": 1.2096376717090607, + "reward_std": 0.14537774212658405, + "rewards/mrr_reward": 0.287667416036129, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8641615360975266, + "step": 837 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.921875, + "epoch": 0.6704, + "grad_norm": 0.040934283286333084, + "kl": 0.008641242980957031, + "learning_rate": 1.490055182053083e-06, + "loss": -0.0112, + "reward": 1.1347018480300903, + "reward_std": 0.19622957706451416, + "rewards/mrr_reward": 0.2253534235060215, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8122418373823166, + "step": 838 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.0, + "epoch": 0.6712, + "grad_norm": 0.03677130863070488, + "kl": 0.008676528930664062, + "learning_rate": 1.4735983564590784e-06, + "loss": -0.0226, + "reward": 1.2674466371536255, + "reward_std": 0.20840232260525227, + "rewards/mrr_reward": 0.34420883283019066, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8465183228254318, + "step": 839 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.53125, + "epoch": 0.672, + "grad_norm": 0.0432620495557785, + "kl": 0.010099411010742188, + "learning_rate": 1.4572256830070497e-06, + "loss": -0.0358, + "reward": 1.1152960062026978, + "reward_std": 0.11179288476705551, + "rewards/mrr_reward": 0.1966145858168602, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8190391659736633, + "step": 840 + }, + { + "clip_ratio": 0.0, + "completion_length": 470.375, + "epoch": 0.6728, + "grad_norm": 0.04272717610001564, + "kl": 0.008855819702148438, + "learning_rate": 1.4409373232886703e-06, + "loss": -0.0453, + "reward": 1.1043509542942047, + "reward_std": 0.1713030431419611, + "rewards/mrr_reward": 0.21338666044175625, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7702041119337082, + "step": 841 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.578125, + "epoch": 0.6736, + "grad_norm": 0.03976025804877281, + "kl": 0.009042739868164062, + "learning_rate": 1.4247334380634792e-06, + "loss": -0.02, + "reward": 1.1753461956977844, + "reward_std": 0.23029477335512638, + "rewards/mrr_reward": 0.25109126791357994, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8710848391056061, + "step": 842 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.921875, + "epoch": 0.6744, + "grad_norm": 0.03952572122216225, + "kl": 0.0076618194580078125, + "learning_rate": 1.408614187257279e-06, + "loss": -0.0295, + "reward": 1.2743225693702698, + "reward_std": 0.2028570305556059, + "rewards/mrr_reward": 0.3688492067158222, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8141710609197617, + "step": 843 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.671875, + "epoch": 0.6752, + "grad_norm": 0.04077652096748352, + "kl": 0.009363174438476562, + "learning_rate": 1.3925797299605649e-06, + "loss": -0.0244, + "reward": 1.102505773305893, + "reward_std": 0.12630261853337288, + "rewards/mrr_reward": 0.18967634066939354, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8169309347867966, + "step": 844 + }, + { + "clip_ratio": 0.0, + "completion_length": 486.46875, + "epoch": 0.676, + "grad_norm": 0.0385010689496994, + "kl": 0.010541915893554688, + "learning_rate": 1.3766302244269624e-06, + "loss": -0.0217, + "reward": 1.0942464470863342, + "reward_std": 0.20936249569058418, + "rewards/mrr_reward": 0.19885292276740074, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8129228949546814, + "step": 845 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.953125, + "epoch": 0.6768, + "grad_norm": 0.04482361301779747, + "kl": 0.008653640747070312, + "learning_rate": 1.3607658280716474e-06, + "loss": -0.0199, + "reward": 1.0879277884960175, + "reward_std": 0.13143850397318602, + "rewards/mrr_reward": 0.17986730858683586, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8278702795505524, + "step": 846 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.09375, + "epoch": 0.6776, + "grad_norm": 0.038818296045064926, + "kl": 0.008572578430175781, + "learning_rate": 1.3449866974698123e-06, + "loss": -0.0419, + "reward": 1.0753368735313416, + "reward_std": 0.172625370323658, + "rewards/mrr_reward": 0.22671131044626236, + "rewards/rank_answer_foramt_reward": 0.86328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7161238044500351, + "step": 847 + }, + { + "clip_ratio": 0.0, + "completion_length": 436.296875, + "epoch": 0.6784, + "grad_norm": 0.04571860656142235, + "kl": 0.0086669921875, + "learning_rate": 1.3292929883550998e-06, + "loss": -0.0586, + "reward": 1.1929639875888824, + "reward_std": 0.1677282489836216, + "rewards/mrr_reward": 0.2719742190092802, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8182217180728912, + "step": 848 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.484375, + "epoch": 0.6792, + "grad_norm": 0.05399928614497185, + "kl": 0.01306915283203125, + "learning_rate": 1.3136848556180893e-06, + "loss": -0.0565, + "reward": 1.1323251575231552, + "reward_std": 0.15411379747092724, + "rewards/mrr_reward": 0.21484375, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8505590111017227, + "step": 849 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.015625, + "epoch": 0.68, + "grad_norm": 0.03968013450503349, + "kl": 0.008097648620605469, + "learning_rate": 1.2981624533047432e-06, + "loss": 0.0032, + "reward": 1.1719480752944946, + "reward_std": 0.1420932300388813, + "rewards/mrr_reward": 0.22884425148367882, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8715621381998062, + "step": 850 + }, + { + "clip_ratio": 0.0, + "completion_length": 474.9375, + "epoch": 0.6808, + "grad_norm": 0.03793917968869209, + "kl": 0.010494232177734375, + "learning_rate": 1.2827259346149123e-06, + "loss": -0.0135, + "reward": 1.1404308676719666, + "reward_std": 0.1706097424030304, + "rewards/mrr_reward": 0.21374628692865372, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8296191394329071, + "step": 851 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.015625, + "epoch": 0.6816, + "grad_norm": 0.03952965885400772, + "kl": 0.007433891296386719, + "learning_rate": 1.2673754519008008e-06, + "loss": -0.0227, + "reward": 1.2492049932479858, + "reward_std": 0.18265685997903347, + "rewards/mrr_reward": 0.3240761533379555, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8307643234729767, + "step": 852 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.40625, + "epoch": 0.6824, + "grad_norm": 0.035804543644189835, + "kl": 0.007625579833984375, + "learning_rate": 1.2521111566654732e-06, + "loss": 0.0024, + "reward": 1.2148383259773254, + "reward_std": 0.1326262354850769, + "rewards/mrr_reward": 0.28328993543982506, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.858029916882515, + "step": 853 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.65625, + "epoch": 0.6832, + "grad_norm": 0.037164557725191116, + "kl": 0.008016586303710938, + "learning_rate": 1.2369331995613664e-06, + "loss": 0.0025, + "reward": 1.085724025964737, + "reward_std": 0.09625130379572511, + "rewards/mrr_reward": 0.14905753917992115, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.865726962685585, + "step": 854 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.171875, + "epoch": 0.684, + "grad_norm": 0.040722329169511795, + "kl": 0.009304046630859375, + "learning_rate": 1.2218417303887842e-06, + "loss": -0.0171, + "reward": 1.1564219295978546, + "reward_std": 0.1851737443357706, + "rewards/mrr_reward": 0.2559833899140358, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.781335860490799, + "step": 855 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.421875, + "epoch": 0.6848, + "grad_norm": 0.045357346534729004, + "kl": 0.008769035339355469, + "learning_rate": 1.206836898094439e-06, + "loss": -0.0129, + "reward": 1.2376780807971954, + "reward_std": 0.22706399112939835, + "rewards/mrr_reward": 0.33642733469605446, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7857502400875092, + "step": 856 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.21875, + "epoch": 0.6856, + "grad_norm": 0.04047942906618118, + "kl": 0.007975578308105469, + "learning_rate": 1.1919188507699641e-06, + "loss": 0.0062, + "reward": 1.1433132886886597, + "reward_std": 0.15857827477157116, + "rewards/mrr_reward": 0.21695809438824654, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8481525033712387, + "step": 857 + }, + { + "clip_ratio": 0.0, + "completion_length": 471.078125, + "epoch": 0.6864, + "grad_norm": 0.04048970341682434, + "kl": 0.008747100830078125, + "learning_rate": 1.1770877356504684e-06, + "loss": 0.0148, + "reward": 1.0512803494930267, + "reward_std": 0.10148373059928417, + "rewards/mrr_reward": 0.12783358246088028, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.866682767868042, + "step": 858 + }, + { + "clip_ratio": 0.0, + "completion_length": 470.890625, + "epoch": 0.6872, + "grad_norm": 0.03905528783798218, + "kl": 0.008471488952636719, + "learning_rate": 1.1623436991130654e-06, + "loss": -0.0104, + "reward": 1.1480936110019684, + "reward_std": 0.19399065151810646, + "rewards/mrr_reward": 0.23568949103355408, + "rewards/rank_answer_foramt_reward": 0.9140625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8586108982563019, + "step": 859 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.828125, + "epoch": 0.688, + "grad_norm": 0.04030711576342583, + "kl": 0.008829116821289062, + "learning_rate": 1.1476868866754488e-06, + "loss": -0.0087, + "reward": 1.0645442605018616, + "reward_std": 0.1284739300608635, + "rewards/mrr_reward": 0.14106522873044014, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8667805790901184, + "step": 860 + }, + { + "clip_ratio": 0.0, + "completion_length": 476.5, + "epoch": 0.6888, + "grad_norm": 0.04082191362977028, + "kl": 0.0068359375, + "learning_rate": 1.1331174429944346e-06, + "loss": -0.0112, + "reward": 1.2571602165699005, + "reward_std": 0.08795512840151787, + "rewards/mrr_reward": 0.31218378245830536, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8635648190975189, + "step": 861 + }, + { + "clip_ratio": 0.0, + "completion_length": 472.34375, + "epoch": 0.6896, + "grad_norm": 0.0414448082447052, + "kl": 0.0075016021728515625, + "learning_rate": 1.1186355118645552e-06, + "loss": -0.0359, + "reward": 1.0901014506816864, + "reward_std": 0.06480870395898819, + "rewards/mrr_reward": 0.14286334067583084, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8704183101654053, + "step": 862 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.90625, + "epoch": 0.6904, + "grad_norm": 0.04123280942440033, + "kl": 0.008793830871582031, + "learning_rate": 1.1042412362166221e-06, + "loss": -0.0314, + "reward": 1.2326207756996155, + "reward_std": 0.23798086121678352, + "rewards/mrr_reward": 0.3054501600563526, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8232797235250473, + "step": 863 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.84375, + "epoch": 0.6912, + "grad_norm": 0.0383603498339653, + "kl": 0.009160995483398438, + "learning_rate": 1.0899347581163222e-06, + "loss": -0.0088, + "reward": 1.1871033906936646, + "reward_std": 0.21597011759877205, + "rewards/mrr_reward": 0.28604910895228386, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8261706084012985, + "step": 864 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.484375, + "epoch": 0.692, + "grad_norm": 0.04392939805984497, + "kl": 0.007921218872070312, + "learning_rate": 1.0757162187628223e-06, + "loss": -0.0154, + "reward": 1.1239495277404785, + "reward_std": 0.09795428067445755, + "rewards/mrr_reward": 0.2013950888067484, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8112445026636124, + "step": 865 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.234375, + "epoch": 0.6928, + "grad_norm": 0.03897380828857422, + "kl": 0.00806427001953125, + "learning_rate": 1.0615857584873624e-06, + "loss": -0.0143, + "reward": 1.1538923382759094, + "reward_std": 0.11023381073027849, + "rewards/mrr_reward": 0.21333704888820648, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8579799234867096, + "step": 866 + }, + { + "clip_ratio": 0.0, + "completion_length": 436.5, + "epoch": 0.6936, + "grad_norm": 0.04020663723349571, + "kl": 0.008386611938476562, + "learning_rate": 1.0475435167518843e-06, + "loss": -0.0445, + "reward": 1.1563519537448883, + "reward_std": 0.17987770959734917, + "rewards/mrr_reward": 0.23674975894391537, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8140168786048889, + "step": 867 + }, + { + "clip_ratio": 0.0, + "completion_length": 482.765625, + "epoch": 0.6944, + "grad_norm": 0.04018275439739227, + "kl": 0.007943153381347656, + "learning_rate": 1.0335896321476413e-06, + "loss": -0.0054, + "reward": 1.1884158253669739, + "reward_std": 0.09958555456250906, + "rewards/mrr_reward": 0.258320938795805, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.875109925866127, + "step": 868 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.765625, + "epoch": 0.6952, + "grad_norm": 0.0402548648416996, + "kl": 0.008069038391113281, + "learning_rate": 1.0197242423938447e-06, + "loss": -0.0367, + "reward": 1.1144362390041351, + "reward_std": 0.14857859443873167, + "rewards/mrr_reward": 0.1893291138112545, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8599952906370163, + "step": 869 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.671875, + "epoch": 0.696, + "grad_norm": 0.03870267793536186, + "kl": 0.009595870971679688, + "learning_rate": 1.0059474843362893e-06, + "loss": -0.0152, + "reward": 1.1645599752664566, + "reward_std": 0.16920059733092785, + "rewards/mrr_reward": 0.2650669626891613, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8097206056118011, + "step": 870 + }, + { + "clip_ratio": 0.0, + "completion_length": 475.046875, + "epoch": 0.6968, + "grad_norm": 0.04259733855724335, + "kl": 0.009562492370605469, + "learning_rate": 9.922594939460195e-07, + "loss": 0.011, + "reward": 1.1267297565937042, + "reward_std": 0.2542701195925474, + "rewards/mrr_reward": 0.24404142051935196, + "rewards/rank_answer_foramt_reward": 0.869140625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8212972581386566, + "step": 871 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.90625, + "epoch": 0.6976, + "grad_norm": 0.043759021908044815, + "kl": 0.011583328247070312, + "learning_rate": 9.786604063179728e-07, + "loss": 0.0017, + "reward": 1.2579089105129242, + "reward_std": 0.17472930811345577, + "rewards/mrr_reward": 0.34670138359069824, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.831547275185585, + "step": 872 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.59375, + "epoch": 0.6984, + "grad_norm": 0.03983573243021965, + "kl": 0.009865760803222656, + "learning_rate": 9.651503556696519e-07, + "loss": -0.0446, + "reward": 1.1973416805267334, + "reward_std": 0.17388677783310413, + "rewards/mrr_reward": 0.2698722742497921, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8534820228815079, + "step": 873 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.375, + "epoch": 0.6992, + "grad_norm": 0.043607067316770554, + "kl": 0.008998870849609375, + "learning_rate": 9.517294753398066e-07, + "loss": -0.0139, + "reward": 1.294020414352417, + "reward_std": 0.19010700285434723, + "rewards/mrr_reward": 0.37508679926395416, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8471473008394241, + "step": 874 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.1875, + "epoch": 0.7, + "grad_norm": 0.04746263101696968, + "kl": 0.009923934936523438, + "learning_rate": 9.383978977871022e-07, + "loss": -0.0106, + "reward": 1.0680293440818787, + "reward_std": 0.11477180011570454, + "rewards/mrr_reward": 0.13857267424464226, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8712227642536163, + "step": 875 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.625, + "epoch": 0.7008, + "grad_norm": 0.04973239824175835, + "kl": 0.009832382202148438, + "learning_rate": 9.251557545888312e-07, + "loss": -0.0656, + "reward": 1.2519857585430145, + "reward_std": 0.197287205606699, + "rewards/mrr_reward": 0.34175965934991837, + "rewards/rank_answer_foramt_reward": 0.970703125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7875577062368393, + "step": 876 + }, + { + "clip_ratio": 0.0, + "completion_length": 485.8125, + "epoch": 0.7016, + "grad_norm": 0.039514243602752686, + "kl": 0.006424903869628906, + "learning_rate": 9.120031764395987e-07, + "loss": 0.0036, + "reward": 1.2768616378307343, + "reward_std": 0.17542525753378868, + "rewards/mrr_reward": 0.3354600667953491, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.880075678229332, + "step": 877 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.21875, + "epoch": 0.7024, + "grad_norm": 0.04365525767207146, + "kl": 0.009319305419921875, + "learning_rate": 8.989402931500434e-07, + "loss": 0.0149, + "reward": 1.0583529770374298, + "reward_std": 0.1347158532589674, + "rewards/mrr_reward": 0.1308221723884344, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8810119330883026, + "step": 878 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.421875, + "epoch": 0.7032, + "grad_norm": 0.03814084827899933, + "kl": 0.00922393798828125, + "learning_rate": 8.859672336455471e-07, + "loss": -0.0151, + "reward": 1.1374922394752502, + "reward_std": 0.20830333605408669, + "rewards/mrr_reward": 0.23531125485897064, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7963816821575165, + "step": 879 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.09375, + "epoch": 0.704, + "grad_norm": 0.03798083961009979, + "kl": 0.009809494018554688, + "learning_rate": 8.730841259649725e-07, + "loss": -0.0172, + "reward": 1.0921657383441925, + "reward_std": 0.13509543286636472, + "rewards/mrr_reward": 0.17431175522506237, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8165318071842194, + "step": 880 + }, + { + "clip_ratio": 0.0, + "completion_length": 474.46875, + "epoch": 0.7048, + "grad_norm": 0.03497171774506569, + "kl": 0.007781982421875, + "learning_rate": 8.602910972593892e-07, + "loss": -0.0358, + "reward": 1.11690154671669, + "reward_std": 0.14519068971276283, + "rewards/mrr_reward": 0.19576513767242432, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7913223654031754, + "step": 881 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.71875, + "epoch": 0.7056, + "grad_norm": 0.042609184980392456, + "kl": 0.008588790893554688, + "learning_rate": 8.475882737908248e-07, + "loss": -0.0342, + "reward": 1.0906363278627396, + "reward_std": 0.20800522714853287, + "rewards/mrr_reward": 0.1928075421601534, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8046774417161942, + "step": 882 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.625, + "epoch": 0.7064, + "grad_norm": 0.040892452001571655, + "kl": 0.0076351165771484375, + "learning_rate": 8.349757809310211e-07, + "loss": -0.0081, + "reward": 1.2315025329589844, + "reward_std": 0.21779580041766167, + "rewards/mrr_reward": 0.29523809254169464, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8645085841417313, + "step": 883 + }, + { + "clip_ratio": 0.0, + "completion_length": 486.3125, + "epoch": 0.7072, + "grad_norm": 0.03839423879981041, + "kl": 0.008156776428222656, + "learning_rate": 8.224537431601886e-07, + "loss": 0.0002, + "reward": 1.138191044330597, + "reward_std": 0.1623014360666275, + "rewards/mrr_reward": 0.20939361490309238, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8360219746828079, + "step": 884 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.09375, + "epoch": 0.708, + "grad_norm": 0.03817719221115112, + "kl": 0.009266853332519531, + "learning_rate": 8.100222840657879e-07, + "loss": -0.0066, + "reward": 1.2030390501022339, + "reward_std": 0.11985693499445915, + "rewards/mrr_reward": 0.2815476171672344, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8275544792413712, + "step": 885 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.859375, + "epoch": 0.7088, + "grad_norm": 0.04101807251572609, + "kl": 0.009810447692871094, + "learning_rate": 7.976815263412963e-07, + "loss": -0.0044, + "reward": 1.1409578323364258, + "reward_std": 0.23399163782596588, + "rewards/mrr_reward": 0.23911211267113686, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.836381271481514, + "step": 886 + }, + { + "clip_ratio": 0.0, + "completion_length": 466.296875, + "epoch": 0.7096, + "grad_norm": 0.04037481173872948, + "kl": 0.009243011474609375, + "learning_rate": 7.854315917850163e-07, + "loss": -0.0243, + "reward": 1.1751604080200195, + "reward_std": 0.2168668694794178, + "rewards/mrr_reward": 0.271905992180109, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8269782513380051, + "step": 887 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.1875, + "epoch": 0.7104, + "grad_norm": 0.037997495383024216, + "kl": 0.0094451904296875, + "learning_rate": 7.732726012988512e-07, + "loss": -0.0095, + "reward": 1.1163204610347748, + "reward_std": 0.14024154655635357, + "rewards/mrr_reward": 0.1814298164099455, + "rewards/rank_answer_foramt_reward": 0.984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8564394563436508, + "step": 888 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.890625, + "epoch": 0.7112, + "grad_norm": 0.03766101598739624, + "kl": 0.008104324340820312, + "learning_rate": 7.612046748871327e-07, + "loss": -0.0091, + "reward": 1.1998648047447205, + "reward_std": 0.14241230860352516, + "rewards/mrr_reward": 0.2643105238676071, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8486847132444382, + "step": 889 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.578125, + "epoch": 0.712, + "grad_norm": 0.03838858753442764, + "kl": 0.00780487060546875, + "learning_rate": 7.492279316554207e-07, + "loss": -0.0223, + "reward": 1.1085861921310425, + "reward_std": 0.15880068391561508, + "rewards/mrr_reward": 0.17414434999227524, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.886329397559166, + "step": 890 + }, + { + "clip_ratio": 0.0, + "completion_length": 469.15625, + "epoch": 0.7128, + "grad_norm": 0.0412142239511013, + "kl": 0.007672309875488281, + "learning_rate": 7.373424898093339e-07, + "loss": 0.0255, + "reward": 1.2313543856143951, + "reward_std": 0.20873188227415085, + "rewards/mrr_reward": 0.3427393361926079, + "rewards/rank_answer_foramt_reward": 0.96875, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7474602460861206, + "step": 891 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.296875, + "epoch": 0.7136, + "grad_norm": 0.04028737545013428, + "kl": 0.0076961517333984375, + "learning_rate": 7.255484666533874e-07, + "loss": -0.0215, + "reward": 1.1419077217578888, + "reward_std": 0.1702859941869974, + "rewards/mrr_reward": 0.20735986903309822, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8729787468910217, + "step": 892 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.609375, + "epoch": 0.7144, + "grad_norm": 0.041100095957517624, + "kl": 0.009128570556640625, + "learning_rate": 7.138459785898266e-07, + "loss": -0.0366, + "reward": 1.1841089129447937, + "reward_std": 0.23279405757784843, + "rewards/mrr_reward": 0.27274926379323006, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8183364868164062, + "step": 893 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.828125, + "epoch": 0.7152, + "grad_norm": 0.040628962218761444, + "kl": 0.009103775024414062, + "learning_rate": 7.022351411174866e-07, + "loss": -0.0277, + "reward": 1.2547601759433746, + "reward_std": 0.1291979430243373, + "rewards/mrr_reward": 0.3259424641728401, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8282708525657654, + "step": 894 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.53125, + "epoch": 0.716, + "grad_norm": 0.04295559599995613, + "kl": 0.008701324462890625, + "learning_rate": 6.907160688306425e-07, + "loss": -0.0339, + "reward": 1.2229861319065094, + "reward_std": 0.18074735067784786, + "rewards/mrr_reward": 0.29606895335018635, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8498553782701492, + "step": 895 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.40625, + "epoch": 0.7168, + "grad_norm": 0.04010513424873352, + "kl": 0.009496688842773438, + "learning_rate": 6.792888754178906e-07, + "loss": -0.0085, + "reward": 1.2736192345619202, + "reward_std": 0.28126518800854683, + "rewards/mrr_reward": 0.36719369515776634, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.81510329246521, + "step": 896 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.515625, + "epoch": 0.7176, + "grad_norm": 0.04235503822565079, + "kl": 0.0098114013671875, + "learning_rate": 6.679536736610137e-07, + "loss": -0.0137, + "reward": 1.257362738251686, + "reward_std": 0.1873782053589821, + "rewards/mrr_reward": 0.3338169790804386, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8552640378475189, + "step": 897 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.203125, + "epoch": 0.7184, + "grad_norm": 0.03878998011350632, + "kl": 0.007755279541015625, + "learning_rate": 6.567105754338798e-07, + "loss": -0.0676, + "reward": 1.2187067866325378, + "reward_std": 0.20256269164383411, + "rewards/mrr_reward": 0.28175223618745804, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8392561823129654, + "step": 898 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.609375, + "epoch": 0.7192, + "grad_norm": 0.04290932044386864, + "kl": 0.011754989624023438, + "learning_rate": 6.455596917013274e-07, + "loss": -0.0499, + "reward": 1.0937216877937317, + "reward_std": 0.19892499037086964, + "rewards/mrr_reward": 0.20499752275645733, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.7790410816669464, + "step": 899 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.78125, + "epoch": 0.72, + "grad_norm": 0.0432630330324173, + "kl": 0.008266448974609375, + "learning_rate": 6.345011325180772e-07, + "loss": 0.0333, + "reward": 1.2824538052082062, + "reward_std": 0.11213483987376094, + "rewards/mrr_reward": 0.340978417545557, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8881120085716248, + "step": 900 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.1875, + "epoch": 0.7208, + "grad_norm": 0.039642494171857834, + "kl": 0.009405136108398438, + "learning_rate": 6.235350070276447e-07, + "loss": -0.0272, + "reward": 1.1255181729793549, + "reward_std": 0.17566965892910957, + "rewards/mrr_reward": 0.22555803321301937, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8130893558263779, + "step": 901 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.921875, + "epoch": 0.7216, + "grad_norm": 0.03899867832660675, + "kl": 0.008452415466308594, + "learning_rate": 6.126614234612593e-07, + "loss": -0.0113, + "reward": 1.22465381026268, + "reward_std": 0.1621338054537773, + "rewards/mrr_reward": 0.28670016303658485, + "rewards/rank_answer_foramt_reward": 0.984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8657210916280746, + "step": 902 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.484375, + "epoch": 0.7224, + "grad_norm": 0.04027952253818512, + "kl": 0.008156776428222656, + "learning_rate": 6.018804891368035e-07, + "loss": -0.0082, + "reward": 1.144799381494522, + "reward_std": 0.16850159130990505, + "rewards/mrr_reward": 0.21117932349443436, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8838390558958054, + "step": 903 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.234375, + "epoch": 0.7232, + "grad_norm": 0.04422492906451225, + "kl": 0.008924484252929688, + "learning_rate": 5.911923104577455e-07, + "loss": -0.0543, + "reward": 1.121463656425476, + "reward_std": 0.17780250683426857, + "rewards/mrr_reward": 0.20161830820143223, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8284256458282471, + "step": 904 + }, + { + "clip_ratio": 0.0, + "completion_length": 429.859375, + "epoch": 0.724, + "grad_norm": 0.045736148953437805, + "kl": 0.010265350341796875, + "learning_rate": 5.805969929120947e-07, + "loss": -0.0569, + "reward": 1.1411092430353165, + "reward_std": 0.2664986848831177, + "rewards/mrr_reward": 0.29250372759997845, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.6750472635030746, + "step": 905 + }, + { + "clip_ratio": 0.0, + "completion_length": 469.21875, + "epoch": 0.7248, + "grad_norm": 0.037118908017873764, + "kl": 0.008056640625, + "learning_rate": 5.700946410713548e-07, + "loss": -0.0048, + "reward": 1.160048007965088, + "reward_std": 0.15992580354213715, + "rewards/mrr_reward": 0.22242064028978348, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8686386346817017, + "step": 906 + }, + { + "clip_ratio": 0.0, + "completion_length": 492.296875, + "epoch": 0.7256, + "grad_norm": 0.039864134043455124, + "kl": 0.00611114501953125, + "learning_rate": 5.596853585895034e-07, + "loss": 0.0155, + "reward": 1.2810848355293274, + "reward_std": 0.17151801194995642, + "rewards/mrr_reward": 0.33007192611694336, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8818571120500565, + "step": 907 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.03125, + "epoch": 0.7264, + "grad_norm": 0.04468508064746857, + "kl": 0.008083343505859375, + "learning_rate": 5.49369248201953e-07, + "loss": -0.0198, + "reward": 1.1350350379943848, + "reward_std": 0.1694689802825451, + "rewards/mrr_reward": 0.21646826341748238, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8382229655981064, + "step": 908 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.109375, + "epoch": 0.7272, + "grad_norm": 0.03844364359974861, + "kl": 0.008733749389648438, + "learning_rate": 5.391464117245471e-07, + "loss": -0.0246, + "reward": 1.1559688597917557, + "reward_std": 0.1414298638701439, + "rewards/mrr_reward": 0.2493179589509964, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8021143823862076, + "step": 909 + }, + { + "clip_ratio": 0.0, + "completion_length": 491.109375, + "epoch": 0.728, + "grad_norm": 0.038503993302583694, + "kl": 0.0072689056396484375, + "learning_rate": 5.290169500525577e-07, + "loss": 0.0071, + "reward": 1.2781525254249573, + "reward_std": 0.10281640524044633, + "rewards/mrr_reward": 0.33461061120033264, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8865616470575333, + "step": 910 + }, + { + "clip_ratio": 0.0, + "completion_length": 474.859375, + "epoch": 0.7288, + "grad_norm": 0.0404188446700573, + "kl": 0.007802009582519531, + "learning_rate": 5.189809631596798e-07, + "loss": -0.055, + "reward": 1.1833963990211487, + "reward_std": 0.15963028743863106, + "rewards/mrr_reward": 0.2542596831917763, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8643939197063446, + "step": 911 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.078125, + "epoch": 0.7296, + "grad_norm": 0.04015132039785385, + "kl": 0.008340835571289062, + "learning_rate": 5.090385500970551e-07, + "loss": -0.0663, + "reward": 1.073600858449936, + "reward_std": 0.1477372208610177, + "rewards/mrr_reward": 0.16947544738650322, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8237582743167877, + "step": 912 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.546875, + "epoch": 0.7304, + "grad_norm": 0.04625895991921425, + "kl": 0.008310317993164062, + "learning_rate": 4.99189808992282e-07, + "loss": 0.0159, + "reward": 1.0823292136192322, + "reward_std": 0.15047525987029076, + "rewards/mrr_reward": 0.1619233600795269, + "rewards/rank_answer_foramt_reward": 0.900390625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8887177854776382, + "step": 913 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.515625, + "epoch": 0.7312, + "grad_norm": 0.04360828921198845, + "kl": 0.008584976196289062, + "learning_rate": 4.894348370484648e-07, + "loss": -0.0237, + "reward": 1.1279266774654388, + "reward_std": 0.2273631989955902, + "rewards/mrr_reward": 0.21179315820336342, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8855368793010712, + "step": 914 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.90625, + "epoch": 0.732, + "grad_norm": 0.04092933610081673, + "kl": 0.009286880493164062, + "learning_rate": 4.797737305432337e-07, + "loss": -0.0416, + "reward": 1.0745922327041626, + "reward_std": 0.12707211077213287, + "rewards/mrr_reward": 0.15930680558085442, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8556233495473862, + "step": 915 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.890625, + "epoch": 0.7328, + "grad_norm": 0.03805176913738251, + "kl": 0.008405685424804688, + "learning_rate": 4.702065848278126e-07, + "loss": -0.0324, + "reward": 1.2603042423725128, + "reward_std": 0.11048845760524273, + "rewards/mrr_reward": 0.3443204537034035, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7757083177566528, + "step": 916 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.5625, + "epoch": 0.7336, + "grad_norm": 0.04079756885766983, + "kl": 0.007517814636230469, + "learning_rate": 4.6073349432606554e-07, + "loss": -0.0101, + "reward": 1.1363862454891205, + "reward_std": 0.15507613588124514, + "rewards/mrr_reward": 0.21534598991274834, + "rewards/rank_answer_foramt_reward": 0.916015625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.882827877998352, + "step": 917 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.25, + "epoch": 0.7344, + "grad_norm": 0.039096131920814514, + "kl": 0.009703636169433594, + "learning_rate": 4.5135455253357053e-07, + "loss": -0.0168, + "reward": 1.1451416015625, + "reward_std": 0.16741209849715233, + "rewards/mrr_reward": 0.2290364522486925, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.826857328414917, + "step": 918 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.890625, + "epoch": 0.7352, + "grad_norm": 0.04211005941033363, + "kl": 0.010351181030273438, + "learning_rate": 4.420698520166988e-07, + "loss": 0.007, + "reward": 1.2284757196903229, + "reward_std": 0.2662056963890791, + "rewards/mrr_reward": 0.29736483842134476, + "rewards/rank_answer_foramt_reward": 0.927734375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.9016263037919998, + "step": 919 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.078125, + "epoch": 0.736, + "grad_norm": 0.04248746857047081, + "kl": 0.008783340454101562, + "learning_rate": 4.3287948441169457e-07, + "loss": 0.0271, + "reward": 1.0713848173618317, + "reward_std": 0.11600453779101372, + "rewards/mrr_reward": 0.14102182537317276, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8602972775697708, + "step": 920 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.796875, + "epoch": 0.7368, + "grad_norm": 0.0458797849714756, + "kl": 0.009664535522460938, + "learning_rate": 4.2378354042377776e-07, + "loss": -0.0193, + "reward": 1.2582030892372131, + "reward_std": 0.29735782369971275, + "rewards/mrr_reward": 0.3648375477641821, + "rewards/rank_answer_foramt_reward": 0.888671875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8184964209794998, + "step": 921 + }, + { + "clip_ratio": 0.0, + "completion_length": 469.171875, + "epoch": 0.7376, + "grad_norm": 0.06803036481142044, + "kl": 0.02263164520263672, + "learning_rate": 4.1478210982624055e-07, + "loss": -0.034, + "reward": 1.264964759349823, + "reward_std": 0.1754690520465374, + "rewards/mrr_reward": 0.34071801975369453, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8144194781780243, + "step": 922 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.34375, + "epoch": 0.7384, + "grad_norm": 0.03774275258183479, + "kl": 0.00981903076171875, + "learning_rate": 4.0587528145957235e-07, + "loss": 0.0038, + "reward": 1.077519729733467, + "reward_std": 0.10510019911453128, + "rewards/mrr_reward": 0.15089286118745804, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8704600483179092, + "step": 923 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.734375, + "epoch": 0.7392, + "grad_norm": 0.0418127216398716, + "kl": 0.007354736328125, + "learning_rate": 3.9706314323056936e-07, + "loss": -0.0363, + "reward": 1.0994101464748383, + "reward_std": 0.16536758467555046, + "rewards/mrr_reward": 0.18959572538733482, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8273257464170456, + "step": 924 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.3125, + "epoch": 0.74, + "grad_norm": 0.04003710672259331, + "kl": 0.010486602783203125, + "learning_rate": 3.883457821114811e-07, + "loss": -0.026, + "reward": 1.259395271539688, + "reward_std": 0.185102803632617, + "rewards/mrr_reward": 0.3478422686457634, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8169691562652588, + "step": 925 + }, + { + "clip_ratio": 0.0, + "completion_length": 466.90625, + "epoch": 0.7408, + "grad_norm": 0.03963383659720421, + "kl": 0.009263992309570312, + "learning_rate": 3.7972328413914074e-07, + "loss": -0.0154, + "reward": 1.1112309098243713, + "reward_std": 0.17668243870139122, + "rewards/mrr_reward": 0.20306920632719994, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8555205762386322, + "step": 926 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.125, + "epoch": 0.7416, + "grad_norm": 0.043505482375621796, + "kl": 0.01036834716796875, + "learning_rate": 3.711957344141237e-07, + "loss": -0.0355, + "reward": 1.1022839844226837, + "reward_std": 0.19026428647339344, + "rewards/mrr_reward": 0.20350942388176918, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8192622512578964, + "step": 927 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.953125, + "epoch": 0.7424, + "grad_norm": 0.04294588789343834, + "kl": 0.008907318115234375, + "learning_rate": 3.627632170999029e-07, + "loss": 0.0123, + "reward": 1.0245526134967804, + "reward_std": 0.16080267634242773, + "rewards/mrr_reward": 0.14318576268851757, + "rewards/rank_answer_foramt_reward": 0.84765625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8309648036956787, + "step": 928 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.78125, + "epoch": 0.7432, + "grad_norm": 0.039757587015628815, + "kl": 0.007798194885253906, + "learning_rate": 3.544258154220193e-07, + "loss": -0.0165, + "reward": 1.2130238115787506, + "reward_std": 0.20079264417290688, + "rewards/mrr_reward": 0.2949962895363569, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8229171335697174, + "step": 929 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.9375, + "epoch": 0.744, + "grad_norm": 0.039665572345256805, + "kl": 0.0093231201171875, + "learning_rate": 3.4618361166726123e-07, + "loss": -0.0331, + "reward": 1.18429334461689, + "reward_std": 0.18769347667694092, + "rewards/mrr_reward": 0.2760540656745434, + "rewards/rank_answer_foramt_reward": 0.91796875, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8342714756727219, + "step": 930 + }, + { + "clip_ratio": 0.0, + "completion_length": 479.3125, + "epoch": 0.7448, + "grad_norm": 0.042533498257398605, + "kl": 0.008765220642089844, + "learning_rate": 3.380366871828522e-07, + "loss": 0.0221, + "reward": 1.19572052359581, + "reward_std": 0.22476008161902428, + "rewards/mrr_reward": 0.28604911267757416, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8600956052541733, + "step": 931 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.0625, + "epoch": 0.7456, + "grad_norm": 0.03890962526202202, + "kl": 0.008768081665039062, + "learning_rate": 3.2998512237565005e-07, + "loss": -0.0272, + "reward": 1.0621162056922913, + "reward_std": 0.09464808227494359, + "rewards/mrr_reward": 0.14957217499613762, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7926287055015564, + "step": 932 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.28125, + "epoch": 0.7464, + "grad_norm": 0.04664164409041405, + "kl": 0.00838470458984375, + "learning_rate": 3.2202899671134546e-07, + "loss": -0.0119, + "reward": 1.2424928843975067, + "reward_std": 0.18272621743381023, + "rewards/mrr_reward": 0.31674107909202576, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8267927318811417, + "step": 933 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.0625, + "epoch": 0.7472, + "grad_norm": 0.04111839085817337, + "kl": 0.009455680847167969, + "learning_rate": 3.1416838871368925e-07, + "loss": -0.0532, + "reward": 1.117896169424057, + "reward_std": 0.13564826268702745, + "rewards/mrr_reward": 0.20161830261349678, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8390995860099792, + "step": 934 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.875, + "epoch": 0.748, + "grad_norm": 0.046598147600889206, + "kl": 0.009102821350097656, + "learning_rate": 3.064033759637064e-07, + "loss": -0.072, + "reward": 1.1321559846401215, + "reward_std": 0.20450343750417233, + "rewards/mrr_reward": 0.2703249081969261, + "rewards/rank_answer_foramt_reward": 0.912109375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.7073123753070831, + "step": 935 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.015625, + "epoch": 0.7488, + "grad_norm": 0.03951597958803177, + "kl": 0.00994110107421875, + "learning_rate": 2.987340350989421e-07, + "loss": -0.0239, + "reward": 1.126262903213501, + "reward_std": 0.15488239750266075, + "rewards/mrr_reward": 0.19883432984352112, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.845545768737793, + "step": 936 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.65625, + "epoch": 0.7496, + "grad_norm": 0.040472351014614105, + "kl": 0.007419586181640625, + "learning_rate": 2.911604418126901e-07, + "loss": -0.0046, + "reward": 1.1847195029258728, + "reward_std": 0.22169340029358864, + "rewards/mrr_reward": 0.2553013488650322, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8789184093475342, + "step": 937 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.3125, + "epoch": 0.7504, + "grad_norm": 0.03997602313756943, + "kl": 0.008014678955078125, + "learning_rate": 2.836826708532603e-07, + "loss": 0.0314, + "reward": 1.1391558349132538, + "reward_std": 0.15339666418731213, + "rewards/mrr_reward": 0.23206845670938492, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8034368455410004, + "step": 938 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.265625, + "epoch": 0.7512, + "grad_norm": 0.04147866368293762, + "kl": 0.008632659912109375, + "learning_rate": 2.7630079602323447e-07, + "loss": -0.0225, + "reward": 1.193027675151825, + "reward_std": 0.21564476191997528, + "rewards/mrr_reward": 0.2960689514875412, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8293846398591995, + "step": 939 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.234375, + "epoch": 0.752, + "grad_norm": 0.03809836506843567, + "kl": 0.008844375610351562, + "learning_rate": 2.6901489017873375e-07, + "loss": -0.0142, + "reward": 1.265418291091919, + "reward_std": 0.16339326091110706, + "rewards/mrr_reward": 0.3403521776199341, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8247148245573044, + "step": 940 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.59375, + "epoch": 0.7528, + "grad_norm": 0.04365180432796478, + "kl": 0.012082099914550781, + "learning_rate": 2.6182502522871135e-07, + "loss": 0.0047, + "reward": 1.286356657743454, + "reward_std": 0.17650778219103813, + "rewards/mrr_reward": 0.35570436902344227, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8846114873886108, + "step": 941 + }, + { + "clip_ratio": 0.0, + "completion_length": 481.078125, + "epoch": 0.7536, + "grad_norm": 0.042372170835733414, + "kl": 0.008172988891601562, + "learning_rate": 2.547312721342277e-07, + "loss": 0.0007, + "reward": 1.1792892515659332, + "reward_std": 0.17659651907160878, + "rewards/mrr_reward": 0.2534412369132042, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8602875918149948, + "step": 942 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.21875, + "epoch": 0.7544, + "grad_norm": 0.044003140181303024, + "kl": 0.009439468383789062, + "learning_rate": 2.4773370090776625e-07, + "loss": -0.0062, + "reward": 1.177651286125183, + "reward_std": 0.18516015633940697, + "rewards/mrr_reward": 0.2531622014939785, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8424975126981735, + "step": 943 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.828125, + "epoch": 0.7552, + "grad_norm": 0.039039552211761475, + "kl": 0.0077571868896484375, + "learning_rate": 2.4083238061252565e-07, + "loss": -0.0039, + "reward": 1.0639915466308594, + "reward_std": 0.08116667065769434, + "rewards/mrr_reward": 0.13606150820851326, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8333936333656311, + "step": 944 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.015625, + "epoch": 0.756, + "grad_norm": 0.037782639265060425, + "kl": 0.009856224060058594, + "learning_rate": 2.3402737936175423e-07, + "loss": -0.0108, + "reward": 1.1415889263153076, + "reward_std": 0.17221311293542385, + "rewards/mrr_reward": 0.23467881605029106, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8048529326915741, + "step": 945 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.765625, + "epoch": 0.7568, + "grad_norm": 0.039081498980522156, + "kl": 0.010256767272949219, + "learning_rate": 2.273187643180652e-07, + "loss": -0.0199, + "reward": 1.0599176287651062, + "reward_std": 0.1111216451972723, + "rewards/mrr_reward": 0.13382937014102936, + "rewards/rank_answer_foramt_reward": 0.9296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8844529241323471, + "step": 946 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.46875, + "epoch": 0.7576, + "grad_norm": 0.04526249319314957, + "kl": 0.009111404418945312, + "learning_rate": 2.2070660169278168e-07, + "loss": -0.052, + "reward": 1.1703499853610992, + "reward_std": 0.17350439634174109, + "rewards/mrr_reward": 0.2586867641657591, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7899595201015472, + "step": 947 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.46875, + "epoch": 0.7584, + "grad_norm": 0.04180796444416046, + "kl": 0.008915901184082031, + "learning_rate": 2.1419095674527934e-07, + "loss": -0.0295, + "reward": 1.2294658422470093, + "reward_std": 0.1340288333594799, + "rewards/mrr_reward": 0.307868305593729, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.8376415371894836, + "step": 948 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.765625, + "epoch": 0.7592, + "grad_norm": 0.0391828715801239, + "kl": 0.008802413940429688, + "learning_rate": 2.077718937823414e-07, + "loss": -0.0093, + "reward": 1.2045442461967468, + "reward_std": 0.10182449035346508, + "rewards/mrr_reward": 0.2698102742433548, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8735426217317581, + "step": 949 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.984375, + "epoch": 0.76, + "grad_norm": 0.04419267922639847, + "kl": 0.010101318359375, + "learning_rate": 2.014494761575314e-07, + "loss": -0.0648, + "reward": 1.0402642339468002, + "reward_std": 0.14703384041786194, + "rewards/mrr_reward": 0.1415922623127699, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7779356688261032, + "step": 950 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.109375, + "epoch": 0.7608, + "grad_norm": 0.04218144342303276, + "kl": 0.008372306823730469, + "learning_rate": 1.9522376627055585e-07, + "loss": -0.0113, + "reward": 1.2481240332126617, + "reward_std": 0.21777066215872765, + "rewards/mrr_reward": 0.3340773805975914, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8264787942171097, + "step": 951 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.953125, + "epoch": 0.7616, + "grad_norm": 0.042666662484407425, + "kl": 0.0077667236328125, + "learning_rate": 1.8909482556666026e-07, + "loss": -0.0216, + "reward": 1.1155872642993927, + "reward_std": 0.14810610190033913, + "rewards/mrr_reward": 0.198753722012043, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8407832533121109, + "step": 952 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.171875, + "epoch": 0.7624, + "grad_norm": 0.04106247425079346, + "kl": 0.008396148681640625, + "learning_rate": 1.8306271453601198e-07, + "loss": 0.0195, + "reward": 1.224888414144516, + "reward_std": 0.10434846580028534, + "rewards/mrr_reward": 0.2887338884174824, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8446441739797592, + "step": 953 + }, + { + "clip_ratio": 0.0, + "completion_length": 471.140625, + "epoch": 0.7632, + "grad_norm": 0.038032591342926025, + "kl": 0.007456779479980469, + "learning_rate": 1.7712749271311392e-07, + "loss": 0.0057, + "reward": 1.2025950849056244, + "reward_std": 0.14355502650141716, + "rewards/mrr_reward": 0.2678571380674839, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8813671916723251, + "step": 954 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.640625, + "epoch": 0.764, + "grad_norm": 0.0445358082652092, + "kl": 0.007924079895019531, + "learning_rate": 1.7128921867620828e-07, + "loss": -0.0353, + "reward": 1.316545844078064, + "reward_std": 0.17137969937175512, + "rewards/mrr_reward": 0.38145462423563004, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.86095330119133, + "step": 955 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.875, + "epoch": 0.7648, + "grad_norm": 0.03883486986160278, + "kl": 0.008409500122070312, + "learning_rate": 1.6554795004670389e-07, + "loss": -0.0121, + "reward": 1.2443694174289703, + "reward_std": 0.20190820842981339, + "rewards/mrr_reward": 0.3092943839728832, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8687168508768082, + "step": 956 + }, + { + "clip_ratio": 0.0, + "completion_length": 487.625, + "epoch": 0.7656, + "grad_norm": 0.04018738493323326, + "kl": 0.009251594543457031, + "learning_rate": 1.5990374348860304e-07, + "loss": -0.0126, + "reward": 1.1161698698997498, + "reward_std": 0.1159959128126502, + "rewards/mrr_reward": 0.18919890373945236, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8304871916770935, + "step": 957 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.46875, + "epoch": 0.7664, + "grad_norm": 0.04235168918967247, + "kl": 0.010198593139648438, + "learning_rate": 1.543566547079467e-07, + "loss": -0.0157, + "reward": 1.2352637648582458, + "reward_std": 0.29030829295516014, + "rewards/mrr_reward": 0.3278707917779684, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.808269277215004, + "step": 958 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.6875, + "epoch": 0.7672, + "grad_norm": 0.03733037784695625, + "kl": 0.007913589477539062, + "learning_rate": 1.4890673845226133e-07, + "loss": -0.0042, + "reward": 1.1977432668209076, + "reward_std": 0.19426253903657198, + "rewards/mrr_reward": 0.25998884066939354, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.9080862104892731, + "step": 959 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.8125, + "epoch": 0.768, + "grad_norm": 0.048545051366090775, + "kl": 0.009515762329101562, + "learning_rate": 1.4355404851001953e-07, + "loss": -0.0343, + "reward": 1.0536463111639023, + "reward_std": 0.1689282413572073, + "rewards/mrr_reward": 0.15678943321108818, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8271231651306152, + "step": 960 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.4375, + "epoch": 0.7688, + "grad_norm": 0.039646901190280914, + "kl": 0.009168624877929688, + "learning_rate": 1.3829863771011253e-07, + "loss": -0.0126, + "reward": 1.2258817553520203, + "reward_std": 0.20934798568487167, + "rewards/mrr_reward": 0.2947792708873749, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8625383079051971, + "step": 961 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.390625, + "epoch": 0.7696, + "grad_norm": 0.04106530547142029, + "kl": 0.008655548095703125, + "learning_rate": 1.3314055792131964e-07, + "loss": -0.0274, + "reward": 1.222478300333023, + "reward_std": 0.1886294735595584, + "rewards/mrr_reward": 0.3115823529660702, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.787634402513504, + "step": 962 + }, + { + "clip_ratio": 0.0, + "completion_length": 425.890625, + "epoch": 0.7704, + "grad_norm": 0.04176384583115578, + "kl": 0.009407997131347656, + "learning_rate": 1.280798600518085e-07, + "loss": -0.0046, + "reward": 1.0859091877937317, + "reward_std": 0.10481879487633705, + "rewards/mrr_reward": 0.17180059850215912, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7836976647377014, + "step": 963 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.484375, + "epoch": 0.7712, + "grad_norm": 0.04281945526599884, + "kl": 0.0075130462646484375, + "learning_rate": 1.231165940486234e-07, + "loss": -0.0033, + "reward": 1.126029521226883, + "reward_std": 0.15459425188601017, + "rewards/mrr_reward": 0.18929191306233406, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8659424334764481, + "step": 964 + }, + { + "clip_ratio": 0.0, + "completion_length": 478.9375, + "epoch": 0.772, + "grad_norm": 0.03905851021409035, + "kl": 0.008356094360351562, + "learning_rate": 1.1825080889719565e-07, + "loss": 0.0076, + "reward": 1.13689586520195, + "reward_std": 0.14984166249632835, + "rewards/mrr_reward": 0.2044580951333046, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8880689293146133, + "step": 965 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.53125, + "epoch": 0.7728, + "grad_norm": 0.041591085493564606, + "kl": 0.011320114135742188, + "learning_rate": 1.134825526208605e-07, + "loss": -0.0267, + "reward": 1.0707438588142395, + "reward_std": 0.21261652931571007, + "rewards/mrr_reward": 0.1851748488843441, + "rewards/rank_answer_foramt_reward": 0.890625, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.81635482609272, + "step": 966 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.984375, + "epoch": 0.7736, + "grad_norm": 0.04075038060545921, + "kl": 0.0076580047607421875, + "learning_rate": 1.0881187228038214e-07, + "loss": -0.0047, + "reward": 1.211523026227951, + "reward_std": 0.18444243492558599, + "rewards/mrr_reward": 0.28766740672290325, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8562028259038925, + "step": 967 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.734375, + "epoch": 0.7744, + "grad_norm": 0.039919495582580566, + "kl": 0.008312225341796875, + "learning_rate": 1.0423881397349067e-07, + "loss": -0.0307, + "reward": 1.1101964116096497, + "reward_std": 0.14763948507606983, + "rewards/mrr_reward": 0.19312375597655773, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8219766318798065, + "step": 968 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.90625, + "epoch": 0.7752, + "grad_norm": 0.04152054339647293, + "kl": 0.010423660278320312, + "learning_rate": 9.976342283442464e-08, + "loss": -0.0013, + "reward": 1.1466363668441772, + "reward_std": 0.12986961100250483, + "rewards/mrr_reward": 0.23280629888176918, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.804338127374649, + "step": 969 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.828125, + "epoch": 0.776, + "grad_norm": 0.03675472363829613, + "kl": 0.008668899536132812, + "learning_rate": 9.538574303348813e-08, + "loss": 0.0016, + "reward": 1.2197460532188416, + "reward_std": 0.21070226281881332, + "rewards/mrr_reward": 0.33571428433060646, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.7491966933012009, + "step": 970 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.984375, + "epoch": 0.7768, + "grad_norm": 0.03766244277358055, + "kl": 0.010084152221679688, + "learning_rate": 9.110581777661331e-08, + "loss": -0.0202, + "reward": 1.0216215252876282, + "reward_std": 0.1345711536705494, + "rewards/mrr_reward": 0.1375496033579111, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.8000995367765427, + "step": 971 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.078125, + "epoch": 0.7776, + "grad_norm": 0.04581380635499954, + "kl": 0.009756088256835938, + "learning_rate": 8.692368930493522e-08, + "loss": -0.031, + "reward": 1.1670778393745422, + "reward_std": 0.1815691478550434, + "rewards/mrr_reward": 0.261966772377491, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8111199587583542, + "step": 972 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.34375, + "epoch": 0.7784, + "grad_norm": 0.04068039730191231, + "kl": 0.008086204528808594, + "learning_rate": 8.283939889437209e-08, + "loss": -0.0219, + "reward": 1.129480630159378, + "reward_std": 0.15650018118321896, + "rewards/mrr_reward": 0.20040303468704224, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8564022034406662, + "step": 973 + }, + { + "clip_ratio": 0.0, + "completion_length": 474.3125, + "epoch": 0.7792, + "grad_norm": 0.04014911130070686, + "kl": 0.008867263793945312, + "learning_rate": 7.885298685522235e-08, + "loss": -0.0152, + "reward": 1.3073887526988983, + "reward_std": 0.25805073603987694, + "rewards/mrr_reward": 0.4035032168030739, + "rewards/rank_answer_foramt_reward": 0.9140625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8406094461679459, + "step": 974 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.53125, + "epoch": 0.78, + "grad_norm": 0.040287163108587265, + "kl": 0.007943153381347656, + "learning_rate": 7.496449253176274e-08, + "loss": -0.0055, + "reward": 1.145872712135315, + "reward_std": 0.16490156902000308, + "rewards/mrr_reward": 0.221788190305233, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8549434095621109, + "step": 975 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.71875, + "epoch": 0.7808, + "grad_norm": 0.04375848174095154, + "kl": 0.009169578552246094, + "learning_rate": 7.117395430186414e-08, + "loss": 0.0074, + "reward": 1.293823391199112, + "reward_std": 0.25591621547937393, + "rewards/mrr_reward": 0.3684089817106724, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8667858242988586, + "step": 976 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.984375, + "epoch": 0.7816, + "grad_norm": 0.04306996241211891, + "kl": 0.008794784545898438, + "learning_rate": 6.748140957660632e-08, + "loss": 0.0091, + "reward": 1.1962977647781372, + "reward_std": 0.23732590302824974, + "rewards/mrr_reward": 0.30388763919472694, + "rewards/rank_answer_foramt_reward": 0.90234375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.809741660952568, + "step": 977 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.34375, + "epoch": 0.7824, + "grad_norm": 0.04417388513684273, + "kl": 0.009141921997070312, + "learning_rate": 6.388689479991606e-08, + "loss": -0.0188, + "reward": 1.076131820678711, + "reward_std": 0.09198620077222586, + "rewards/mrr_reward": 0.14461186155676842, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8501312732696533, + "step": 978 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.109375, + "epoch": 0.7832, + "grad_norm": 0.04082540050148964, + "kl": 0.0095977783203125, + "learning_rate": 6.039044544820404e-08, + "loss": -0.011, + "reward": 1.1028445363044739, + "reward_std": 0.0961092640645802, + "rewards/mrr_reward": 0.17588666267693043, + "rewards/rank_answer_foramt_reward": 1.0, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8245881348848343, + "step": 979 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.75, + "epoch": 0.784, + "grad_norm": 0.04861390218138695, + "kl": 0.009130477905273438, + "learning_rate": 5.699209603001077e-08, + "loss": 0.0231, + "reward": 1.1513446420431137, + "reward_std": 0.2505143228918314, + "rewards/mrr_reward": 0.26168156415224075, + "rewards/rank_answer_foramt_reward": 0.875, + "rewards/rank_overall_format_reward": 0.96875, + "rewards/rank_think_format_reward": 0.8521986305713654, + "step": 980 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.140625, + "epoch": 0.7848, + "grad_norm": 0.04346904158592224, + "kl": 0.008517265319824219, + "learning_rate": 5.369188008567672e-08, + "loss": -0.007, + "reward": 1.0626238584518433, + "reward_std": 0.10511020570993423, + "rewards/mrr_reward": 0.16136533580720425, + "rewards/rank_answer_foramt_reward": 0.876953125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.854133203625679, + "step": 981 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.734375, + "epoch": 0.7856, + "grad_norm": 0.04609828442335129, + "kl": 0.009782791137695312, + "learning_rate": 5.048983018699827e-08, + "loss": -0.0588, + "reward": 1.2032456696033478, + "reward_std": 0.15944865625351667, + "rewards/mrr_reward": 0.3129526190459728, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.752545177936554, + "step": 982 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.140625, + "epoch": 0.7864, + "grad_norm": 0.043732285499572754, + "kl": 0.0076694488525390625, + "learning_rate": 4.7385977936916796e-08, + "loss": -0.0008, + "reward": 1.090537279844284, + "reward_std": 0.1588072832673788, + "rewards/mrr_reward": 0.1814856231212616, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8582175672054291, + "step": 983 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.296875, + "epoch": 0.7872, + "grad_norm": 0.03897866606712341, + "kl": 0.008481025695800781, + "learning_rate": 4.438035396920004e-08, + "loss": -0.0183, + "reward": 1.0922786891460419, + "reward_std": 0.13948439992964268, + "rewards/mrr_reward": 0.17337549850344658, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8333830386400223, + "step": 984 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.328125, + "epoch": 0.788, + "grad_norm": 0.042856015264987946, + "kl": 0.0099334716796875, + "learning_rate": 4.147298794814347e-08, + "loss": -0.0059, + "reward": 1.162740409374237, + "reward_std": 0.20458003506064415, + "rewards/mrr_reward": 0.2671751044690609, + "rewards/rank_answer_foramt_reward": 0.904296875, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8173497021198273, + "step": 985 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.453125, + "epoch": 0.7888, + "grad_norm": 0.04340024292469025, + "kl": 0.008185386657714844, + "learning_rate": 3.866390856827495e-08, + "loss": -0.0132, + "reward": 1.1505849063396454, + "reward_std": 0.18550662696361542, + "rewards/mrr_reward": 0.22609126195311546, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8561831563711166, + "step": 986 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.046875, + "epoch": 0.7896, + "grad_norm": 0.040927499532699585, + "kl": 0.008787155151367188, + "learning_rate": 3.595314355407609e-08, + "loss": -0.0091, + "reward": 1.0734205096960068, + "reward_std": 0.1213362361304462, + "rewards/mrr_reward": 0.155629962682724, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.8651677072048187, + "step": 987 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.15625, + "epoch": 0.7904, + "grad_norm": 0.04325427487492561, + "kl": 0.011075973510742188, + "learning_rate": 3.3340719659701315e-08, + "loss": -0.0347, + "reward": 1.2632518410682678, + "reward_std": 0.20907128229737282, + "rewards/mrr_reward": 0.34834448993206024, + "rewards/rank_answer_foramt_reward": 0.95703125, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.823227733373642, + "step": 988 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.0, + "epoch": 0.7912, + "grad_norm": 0.040987979620695114, + "kl": 0.0085906982421875, + "learning_rate": 3.082666266872036e-08, + "loss": -0.0128, + "reward": 1.2261515259742737, + "reward_std": 0.20394627377390862, + "rewards/mrr_reward": 0.2976128365844488, + "rewards/rank_answer_foramt_reward": 0.9453125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8684409856796265, + "step": 989 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.5, + "epoch": 0.792, + "grad_norm": 0.04282965511083603, + "kl": 0.007916450500488281, + "learning_rate": 2.8410997393860663e-08, + "loss": -0.0384, + "reward": 1.1509665548801422, + "reward_std": 0.20199062675237656, + "rewards/mrr_reward": 0.22928448393940926, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.833991602063179, + "step": 990 + }, + { + "clip_ratio": 0.0, + "completion_length": 466.625, + "epoch": 0.7928, + "grad_norm": 0.04164193570613861, + "kl": 0.008025169372558594, + "learning_rate": 2.6093747676763093e-08, + "loss": -0.0101, + "reward": 1.2387970685958862, + "reward_std": 0.13890931848436594, + "rewards/mrr_reward": 0.3080791234970093, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8340292572975159, + "step": 991 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.5, + "epoch": 0.7936, + "grad_norm": 0.04225363954901695, + "kl": 0.008797645568847656, + "learning_rate": 2.3874936387747738e-08, + "loss": -0.0029, + "reward": 1.1117708086967468, + "reward_std": 0.164918664842844, + "rewards/mrr_reward": 0.21209697611629963, + "rewards/rank_answer_foramt_reward": 0.927734375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7985499054193497, + "step": 992 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.203125, + "epoch": 0.7944, + "grad_norm": 0.046599503606557846, + "kl": 0.00894927978515625, + "learning_rate": 2.175458542558517e-08, + "loss": 0.0412, + "reward": 1.156593143939972, + "reward_std": 0.1569316927343607, + "rewards/mrr_reward": 0.24654637277126312, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.7987329065799713, + "step": 993 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.171875, + "epoch": 0.7952, + "grad_norm": 0.040313415229320526, + "kl": 0.010005950927734375, + "learning_rate": 1.973271571728441e-08, + "loss": -0.0121, + "reward": 1.200623333454132, + "reward_std": 0.16222818940877914, + "rewards/mrr_reward": 0.27439235523343086, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8341041952371597, + "step": 994 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.078125, + "epoch": 0.796, + "grad_norm": 0.03999168425798416, + "kl": 0.008646011352539062, + "learning_rate": 1.7809347217881966e-08, + "loss": 0.0166, + "reward": 1.1675262749195099, + "reward_std": 0.12778384890407324, + "rewards/mrr_reward": 0.2326450925320387, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8466451019048691, + "step": 995 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.65625, + "epoch": 0.7968, + "grad_norm": 0.03779391199350357, + "kl": 0.0085906982421875, + "learning_rate": 1.5984498910249778e-08, + "loss": 0.0062, + "reward": 1.2637392580509186, + "reward_std": 0.1601163186132908, + "rewards/mrr_reward": 0.34081100672483444, + "rewards/rank_answer_foramt_reward": 0.986328125, + "rewards/rank_overall_format_reward": 0.984375, + "rewards/rank_think_format_reward": 0.826048955321312, + "step": 996 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.125, + "epoch": 0.7976, + "grad_norm": 0.04796071723103523, + "kl": 0.008829116821289062, + "learning_rate": 1.425818880490315e-08, + "loss": -0.0411, + "reward": 1.2064868211746216, + "reward_std": 0.15722107328474522, + "rewards/mrr_reward": 0.28802703507244587, + "rewards/rank_answer_foramt_reward": 0.958984375, + "rewards/rank_overall_format_reward": 1.0, + "rewards/rank_think_format_reward": 0.8242269903421402, + "step": 997 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.390625, + "epoch": 0.7984, + "grad_norm": 0.04089636355638504, + "kl": 0.0083160400390625, + "learning_rate": 1.2630433939825326e-08, + "loss": -0.0002, + "reward": 1.2470715939998627, + "reward_std": 0.30819156393408775, + "rewards/mrr_reward": 0.3308655694127083, + "rewards/rank_answer_foramt_reward": 0.931640625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.852553591132164, + "step": 998 + }, + { + "clip_ratio": 0.0, + "completion_length": 476.1875, + "epoch": 0.7992, + "grad_norm": 0.03799648582935333, + "kl": 0.0072174072265625, + "learning_rate": 1.1101250380300965e-08, + "loss": -0.0031, + "reward": 1.127793937921524, + "reward_std": 0.11800427921116352, + "rewards/mrr_reward": 0.20063864439725876, + "rewards/rank_answer_foramt_reward": 0.97265625, + "rewards/rank_overall_format_reward": 0.9921875, + "rewards/rank_think_format_reward": 0.8447176665067673, + "step": 999 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.25, + "epoch": 0.8, + "grad_norm": 0.04348551481962204, + "kl": 0.008482933044433594, + "learning_rate": 9.670653218752935e-09, + "loss": -0.0049, + "reward": 1.0936516225337982, + "reward_std": 0.18971389904618263, + "rewards/mrr_reward": 0.19807788357138634, + "rewards/rank_answer_foramt_reward": 0.943359375, + "rewards/rank_overall_format_reward": 0.9765625, + "rewards/rank_think_format_reward": 0.7939379811286926, + "step": 1000 + }, + { + "epoch": 0.8, + "step": 1000, "total_flos": 0.0, - "train_loss": 5.142112260688089e-05, - "train_runtime": 276.4871, - "train_samples_per_second": 115.738, - "train_steps_per_second": 1.808 + "train_loss": -0.006724746519234032, + "train_runtime": 117510.8339, + "train_samples_per_second": 0.545, + "train_steps_per_second": 0.009 } ], "logging_steps": 1, - "max_steps": 500, + "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500,