| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 1931, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00518000518000518, |
| "grad_norm": 6.292460918426514, |
| "learning_rate": 9.278350515463919e-07, |
| "loss": 0.5994, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.01036001036001036, |
| "grad_norm": 3.469723701477051, |
| "learning_rate": 1.9587628865979384e-06, |
| "loss": 0.4581, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.01554001554001554, |
| "grad_norm": 1.8256299495697021, |
| "learning_rate": 2.9896907216494846e-06, |
| "loss": 0.254, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.02072002072002072, |
| "grad_norm": 0.6593835949897766, |
| "learning_rate": 4.020618556701032e-06, |
| "loss": 0.1497, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0259000259000259, |
| "grad_norm": 0.4585426449775696, |
| "learning_rate": 5.051546391752578e-06, |
| "loss": 0.1086, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.03108003108003108, |
| "grad_norm": 0.3679659068584442, |
| "learning_rate": 6.082474226804124e-06, |
| "loss": 0.087, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.03626003626003626, |
| "grad_norm": 0.3684946596622467, |
| "learning_rate": 7.113402061855671e-06, |
| "loss": 0.0775, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.04144004144004144, |
| "grad_norm": 0.328337162733078, |
| "learning_rate": 8.144329896907216e-06, |
| "loss": 0.0686, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.046620046620046623, |
| "grad_norm": 1.8907654285430908, |
| "learning_rate": 9.175257731958764e-06, |
| "loss": 0.0658, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.0518000518000518, |
| "grad_norm": 0.5488079786300659, |
| "learning_rate": 1.0206185567010309e-05, |
| "loss": 0.0631, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.05698005698005698, |
| "grad_norm": 0.3149188458919525, |
| "learning_rate": 1.1237113402061856e-05, |
| "loss": 0.0554, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.06216006216006216, |
| "grad_norm": 0.28675776720046997, |
| "learning_rate": 1.2268041237113405e-05, |
| "loss": 0.0487, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.06734006734006734, |
| "grad_norm": 0.20799441635608673, |
| "learning_rate": 1.3298969072164948e-05, |
| "loss": 0.0462, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.07252007252007252, |
| "grad_norm": 0.21834194660186768, |
| "learning_rate": 1.4329896907216495e-05, |
| "loss": 0.0437, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.0777000777000777, |
| "grad_norm": 0.20788688957691193, |
| "learning_rate": 1.5360824742268042e-05, |
| "loss": 0.0416, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.08288008288008288, |
| "grad_norm": 0.19446660578250885, |
| "learning_rate": 1.6391752577319588e-05, |
| "loss": 0.0395, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.08806008806008805, |
| "grad_norm": 0.23992358148097992, |
| "learning_rate": 1.7422680412371137e-05, |
| "loss": 0.0382, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.09324009324009325, |
| "grad_norm": 0.274311363697052, |
| "learning_rate": 1.8453608247422682e-05, |
| "loss": 0.0374, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.09842009842009843, |
| "grad_norm": 0.20221377909183502, |
| "learning_rate": 1.9484536082474227e-05, |
| "loss": 0.0362, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.1036001036001036, |
| "grad_norm": 0.1887311339378357, |
| "learning_rate": 1.9999591109366888e-05, |
| "loss": 0.0352, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.10878010878010878, |
| "grad_norm": 0.16539287567138672, |
| "learning_rate": 1.9996320184929093e-05, |
| "loss": 0.0345, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.11396011396011396, |
| "grad_norm": 0.19868427515029907, |
| "learning_rate": 1.9989779405991916e-05, |
| "loss": 0.0321, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.11914011914011914, |
| "grad_norm": 0.18100829422473907, |
| "learning_rate": 1.9979970912082214e-05, |
| "loss": 0.0325, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.12432012432012432, |
| "grad_norm": 0.16753119230270386, |
| "learning_rate": 1.9966897911615417e-05, |
| "loss": 0.0321, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.1295001295001295, |
| "grad_norm": 0.16504332423210144, |
| "learning_rate": 1.9950564680846042e-05, |
| "loss": 0.0302, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.13468013468013468, |
| "grad_norm": 0.16585540771484375, |
| "learning_rate": 1.993097656246892e-05, |
| "loss": 0.0299, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.13986013986013987, |
| "grad_norm": 0.1773216277360916, |
| "learning_rate": 1.9908139963871547e-05, |
| "loss": 0.0288, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.14504014504014504, |
| "grad_norm": 0.16576074063777924, |
| "learning_rate": 1.988206235503821e-05, |
| "loss": 0.0292, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.15022015022015023, |
| "grad_norm": 0.15523289144039154, |
| "learning_rate": 1.98527522661065e-05, |
| "loss": 0.0291, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.1554001554001554, |
| "grad_norm": 0.16800253093242645, |
| "learning_rate": 1.9820219284577052e-05, |
| "loss": 0.0284, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.16058016058016059, |
| "grad_norm": 0.15856459736824036, |
| "learning_rate": 1.9784474052177435e-05, |
| "loss": 0.0277, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.16576016576016575, |
| "grad_norm": 0.15079988539218903, |
| "learning_rate": 1.9745528261381156e-05, |
| "loss": 0.0277, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.17094017094017094, |
| "grad_norm": 0.18376828730106354, |
| "learning_rate": 1.970339465158301e-05, |
| "loss": 0.0272, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.1761201761201761, |
| "grad_norm": 0.1650630086660385, |
| "learning_rate": 1.9658087004931926e-05, |
| "loss": 0.0276, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.1813001813001813, |
| "grad_norm": 0.14251892268657684, |
| "learning_rate": 1.960962014182276e-05, |
| "loss": 0.0263, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.1864801864801865, |
| "grad_norm": 0.13589179515838623, |
| "learning_rate": 1.955800991604846e-05, |
| "loss": 0.0267, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.19166019166019166, |
| "grad_norm": 0.15240447223186493, |
| "learning_rate": 1.9503273209614183e-05, |
| "loss": 0.0259, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.19684019684019685, |
| "grad_norm": 0.13825932145118713, |
| "learning_rate": 1.9445427927215108e-05, |
| "loss": 0.0243, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.20202020202020202, |
| "grad_norm": 0.13044404983520508, |
| "learning_rate": 1.9384492990379703e-05, |
| "loss": 0.0254, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.2072002072002072, |
| "grad_norm": 0.15161941945552826, |
| "learning_rate": 1.9320488331280372e-05, |
| "loss": 0.024, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.21238021238021237, |
| "grad_norm": 0.12539660930633545, |
| "learning_rate": 1.9253434886213548e-05, |
| "loss": 0.0247, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.21756021756021757, |
| "grad_norm": 0.1758316159248352, |
| "learning_rate": 1.9183354588751274e-05, |
| "loss": 0.025, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.22274022274022273, |
| "grad_norm": 0.1396271139383316, |
| "learning_rate": 1.911027036256664e-05, |
| "loss": 0.0239, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.22792022792022792, |
| "grad_norm": 0.1349797546863556, |
| "learning_rate": 1.9034206113935297e-05, |
| "loss": 0.0237, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.2331002331002331, |
| "grad_norm": 0.13483276963233948, |
| "learning_rate": 1.8955186723915573e-05, |
| "loss": 0.024, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.23828023828023828, |
| "grad_norm": 0.1201694905757904, |
| "learning_rate": 1.887323804020975e-05, |
| "loss": 0.0226, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.24346024346024345, |
| "grad_norm": 0.1224222481250763, |
| "learning_rate": 1.878838686870911e-05, |
| "loss": 0.0242, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.24864024864024864, |
| "grad_norm": 0.11309316009283066, |
| "learning_rate": 1.8700660964725583e-05, |
| "loss": 0.0221, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.2538202538202538, |
| "grad_norm": 0.13578607141971588, |
| "learning_rate": 1.8610089023912828e-05, |
| "loss": 0.0237, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.259000259000259, |
| "grad_norm": 0.13230614364147186, |
| "learning_rate": 1.8516700672879706e-05, |
| "loss": 0.0224, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.2641802641802642, |
| "grad_norm": 0.12713994085788727, |
| "learning_rate": 1.8420526459499252e-05, |
| "loss": 0.023, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.26936026936026936, |
| "grad_norm": 0.1189827024936676, |
| "learning_rate": 1.8321597842916282e-05, |
| "loss": 0.0219, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.2745402745402745, |
| "grad_norm": 0.12820567190647125, |
| "learning_rate": 1.821994718325693e-05, |
| "loss": 0.0228, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.27972027972027974, |
| "grad_norm": 0.11525845527648926, |
| "learning_rate": 1.811560773104346e-05, |
| "loss": 0.0214, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.2849002849002849, |
| "grad_norm": 0.1343483179807663, |
| "learning_rate": 1.8008613616317823e-05, |
| "loss": 0.0218, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.29008029008029007, |
| "grad_norm": 0.11713062226772308, |
| "learning_rate": 1.7898999837477528e-05, |
| "loss": 0.0216, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.29526029526029524, |
| "grad_norm": 0.14427222311496735, |
| "learning_rate": 1.7786802249827454e-05, |
| "loss": 0.0212, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.30044030044030046, |
| "grad_norm": 0.11759313195943832, |
| "learning_rate": 1.7672057553851387e-05, |
| "loss": 0.0216, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.3056203056203056, |
| "grad_norm": 0.11268991976976395, |
| "learning_rate": 1.755480328320705e-05, |
| "loss": 0.0212, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.3108003108003108, |
| "grad_norm": 0.1134837344288826, |
| "learning_rate": 1.7435077792448666e-05, |
| "loss": 0.0214, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.315980315980316, |
| "grad_norm": 0.10657832026481628, |
| "learning_rate": 1.731292024448091e-05, |
| "loss": 0.0214, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.32116032116032117, |
| "grad_norm": 0.11426619440317154, |
| "learning_rate": 1.7188370597748553e-05, |
| "loss": 0.0211, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.32634032634032634, |
| "grad_norm": 0.11127694696187973, |
| "learning_rate": 1.706146959316576e-05, |
| "loss": 0.0194, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.3315203315203315, |
| "grad_norm": 0.11302390694618225, |
| "learning_rate": 1.6932258740789553e-05, |
| "loss": 0.02, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.3367003367003367, |
| "grad_norm": 0.1174977570772171, |
| "learning_rate": 1.6800780306241596e-05, |
| "loss": 0.0197, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.3418803418803419, |
| "grad_norm": 0.12497369199991226, |
| "learning_rate": 1.666707729688289e-05, |
| "loss": 0.0197, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.34706034706034705, |
| "grad_norm": 0.10607603937387466, |
| "learning_rate": 1.6531193447745776e-05, |
| "loss": 0.0197, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.3522403522403522, |
| "grad_norm": 0.1149929016828537, |
| "learning_rate": 1.6393173207228e-05, |
| "loss": 0.0199, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.35742035742035744, |
| "grad_norm": 0.11435063183307648, |
| "learning_rate": 1.6253061722553353e-05, |
| "loss": 0.0196, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.3626003626003626, |
| "grad_norm": 0.11646245419979095, |
| "learning_rate": 1.6110904825003754e-05, |
| "loss": 0.0199, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.36778036778036777, |
| "grad_norm": 0.6431416273117065, |
| "learning_rate": 1.596674901492758e-05, |
| "loss": 0.0238, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.372960372960373, |
| "grad_norm": 6.487484455108643, |
| "learning_rate": 1.5820641446529127e-05, |
| "loss": 0.0379, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.37814037814037815, |
| "grad_norm": 0.19220368564128876, |
| "learning_rate": 1.567262991244419e-05, |
| "loss": 0.0306, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.3833203833203833, |
| "grad_norm": 0.11730585992336273, |
| "learning_rate": 1.5522762828106822e-05, |
| "loss": 0.022, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.3885003885003885, |
| "grad_norm": 0.12174461036920547, |
| "learning_rate": 1.5371089215912363e-05, |
| "loss": 0.0215, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.3936803936803937, |
| "grad_norm": 0.1016201600432396, |
| "learning_rate": 1.5217658689181925e-05, |
| "loss": 0.0205, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.39886039886039887, |
| "grad_norm": 0.09577111154794693, |
| "learning_rate": 1.5062521435933586e-05, |
| "loss": 0.0198, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.40404040404040403, |
| "grad_norm": 0.08802168816328049, |
| "learning_rate": 1.4905728202465596e-05, |
| "loss": 0.0188, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.4092204092204092, |
| "grad_norm": 0.12182024121284485, |
| "learning_rate": 1.4747330276756986e-05, |
| "loss": 0.0195, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.4144004144004144, |
| "grad_norm": 0.09344793111085892, |
| "learning_rate": 1.4587379471690937e-05, |
| "loss": 0.0202, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.4195804195804196, |
| "grad_norm": 0.10629361122846603, |
| "learning_rate": 1.4425928108106519e-05, |
| "loss": 0.0186, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.42476042476042475, |
| "grad_norm": 0.09861776977777481, |
| "learning_rate": 1.4263028997684217e-05, |
| "loss": 0.0197, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.4299404299404299, |
| "grad_norm": 0.10772161930799484, |
| "learning_rate": 1.4098735425670931e-05, |
| "loss": 0.0193, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.43512043512043513, |
| "grad_norm": 0.09898128360509872, |
| "learning_rate": 1.393310113345006e-05, |
| "loss": 0.0194, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.4403004403004403, |
| "grad_norm": 0.09928678721189499, |
| "learning_rate": 1.3766180300962393e-05, |
| "loss": 0.0183, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.44548044548044546, |
| "grad_norm": 0.09649905562400818, |
| "learning_rate": 1.3598027528983517e-05, |
| "loss": 0.0179, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.4506604506604507, |
| "grad_norm": 0.1102428138256073, |
| "learning_rate": 1.34286978212636e-05, |
| "loss": 0.0193, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.45584045584045585, |
| "grad_norm": 0.10131724178791046, |
| "learning_rate": 1.325824656653534e-05, |
| "loss": 0.0193, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.461020461020461, |
| "grad_norm": 0.09785453975200653, |
| "learning_rate": 1.308672952039598e-05, |
| "loss": 0.0182, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.4662004662004662, |
| "grad_norm": 0.11056746542453766, |
| "learning_rate": 1.2914202787069345e-05, |
| "loss": 0.0183, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.4713804713804714, |
| "grad_norm": 0.10242997854948044, |
| "learning_rate": 1.2740722801053808e-05, |
| "loss": 0.0183, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.47656047656047656, |
| "grad_norm": 0.09493660181760788, |
| "learning_rate": 1.2566346308662248e-05, |
| "loss": 0.0187, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.48174048174048173, |
| "grad_norm": 0.09588351845741272, |
| "learning_rate": 1.239113034945999e-05, |
| "loss": 0.0174, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.4869204869204869, |
| "grad_norm": 0.09339374303817749, |
| "learning_rate": 1.2215132237606843e-05, |
| "loss": 0.0177, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.4921004921004921, |
| "grad_norm": 0.09003674238920212, |
| "learning_rate": 1.2038409543109295e-05, |
| "loss": 0.0176, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.4972804972804973, |
| "grad_norm": 0.09645362198352814, |
| "learning_rate": 1.186102007298904e-05, |
| "loss": 0.0185, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.5024605024605024, |
| "grad_norm": 0.09411321580410004, |
| "learning_rate": 1.168302185237395e-05, |
| "loss": 0.0175, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.5076405076405076, |
| "grad_norm": 0.0976066067814827, |
| "learning_rate": 1.1504473105517731e-05, |
| "loss": 0.017, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.5128205128205128, |
| "grad_norm": 0.09469664096832275, |
| "learning_rate": 1.1325432236754424e-05, |
| "loss": 0.0174, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.518000518000518, |
| "grad_norm": 0.09026806801557541, |
| "learning_rate": 1.1145957811394006e-05, |
| "loss": 0.0174, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.5231805231805232, |
| "grad_norm": 0.09828916192054749, |
| "learning_rate": 1.096610853656535e-05, |
| "loss": 0.0175, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.5283605283605284, |
| "grad_norm": 0.09940842539072037, |
| "learning_rate": 1.0785943242012763e-05, |
| "loss": 0.0167, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.5335405335405335, |
| "grad_norm": 0.07944466173648834, |
| "learning_rate": 1.0605520860852442e-05, |
| "loss": 0.0173, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.5387205387205387, |
| "grad_norm": 0.08528061211109161, |
| "learning_rate": 1.0424900410295115e-05, |
| "loss": 0.0169, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.5439005439005439, |
| "grad_norm": 0.10138797760009766, |
| "learning_rate": 1.0244140972341155e-05, |
| "loss": 0.0174, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.549080549080549, |
| "grad_norm": 0.10442786663770676, |
| "learning_rate": 1.0063301674454526e-05, |
| "loss": 0.0171, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.5542605542605542, |
| "grad_norm": 0.10265690833330154, |
| "learning_rate": 9.882441670221846e-06, |
| "loss": 0.0162, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.5594405594405595, |
| "grad_norm": 0.10706663131713867, |
| "learning_rate": 9.701620120002885e-06, |
| "loss": 0.0178, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.5646205646205646, |
| "grad_norm": 0.08483204990625381, |
| "learning_rate": 9.520896171578891e-06, |
| "loss": 0.0175, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.5698005698005698, |
| "grad_norm": 0.09182880818843842, |
| "learning_rate": 9.340328940805003e-06, |
| "loss": 0.0174, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.574980574980575, |
| "grad_norm": 0.09123273193836212, |
| "learning_rate": 9.159977492273086e-06, |
| "loss": 0.0166, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.5801605801605801, |
| "grad_norm": 0.09600038826465607, |
| "learning_rate": 8.9799008199914e-06, |
| "loss": 0.0166, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.5853405853405853, |
| "grad_norm": 0.07484059780836105, |
| "learning_rate": 8.800157828087275e-06, |
| "loss": 0.017, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.5905205905205905, |
| "grad_norm": 0.1001143753528595, |
| "learning_rate": 8.620807311539258e-06, |
| "loss": 0.017, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.5957005957005957, |
| "grad_norm": 0.09151753783226013, |
| "learning_rate": 8.441907936944933e-06, |
| "loss": 0.0172, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.6008806008806009, |
| "grad_norm": 0.08270428329706192, |
| "learning_rate": 8.263518223330698e-06, |
| "loss": 0.0164, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.6060606060606061, |
| "grad_norm": 0.07453130185604095, |
| "learning_rate": 8.085696523009907e-06, |
| "loss": 0.0164, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.6112406112406112, |
| "grad_norm": 0.08895987272262573, |
| "learning_rate": 7.908501002495445e-06, |
| "loss": 0.0169, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.6164206164206164, |
| "grad_norm": 0.08611361682415009, |
| "learning_rate": 7.731989623473144e-06, |
| "loss": 0.0155, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.6216006216006216, |
| "grad_norm": 0.09515902400016785, |
| "learning_rate": 7.556220123842173e-06, |
| "loss": 0.0169, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.6267806267806267, |
| "grad_norm": 0.08863533288240433, |
| "learning_rate": 7.38124999882863e-06, |
| "loss": 0.0167, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.631960631960632, |
| "grad_norm": 0.0799228847026825, |
| "learning_rate": 7.207136482178538e-06, |
| "loss": 0.0162, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.6371406371406372, |
| "grad_norm": 0.08676367253065109, |
| "learning_rate": 7.033936527436318e-06, |
| "loss": 0.017, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.6423206423206423, |
| "grad_norm": 0.08304847776889801, |
| "learning_rate": 6.861706789314993e-06, |
| "loss": 0.0158, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.6475006475006475, |
| "grad_norm": 0.0753399059176445, |
| "learning_rate": 6.6905036051640804e-06, |
| "loss": 0.016, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.6526806526806527, |
| "grad_norm": 0.09043081849813461, |
| "learning_rate": 6.520382976541313e-06, |
| "loss": 0.0159, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.6578606578606578, |
| "grad_norm": 0.08677306771278381, |
| "learning_rate": 6.351400550894224e-06, |
| "loss": 0.0158, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.663040663040663, |
| "grad_norm": 0.08940693736076355, |
| "learning_rate": 6.183611603357513e-06, |
| "loss": 0.0159, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.6682206682206682, |
| "grad_norm": 0.0751878097653389, |
| "learning_rate": 6.0170710186722605e-06, |
| "loss": 0.0161, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.6734006734006734, |
| "grad_norm": 0.0836741104722023, |
| "learning_rate": 5.851833273232788e-06, |
| "loss": 0.016, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.6785806785806786, |
| "grad_norm": 0.08590537309646606, |
| "learning_rate": 5.687952417267115e-06, |
| "loss": 0.0157, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.6837606837606838, |
| "grad_norm": 0.09049531072378159, |
| "learning_rate": 5.525482057156833e-06, |
| "loss": 0.0159, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.6889406889406889, |
| "grad_norm": 0.08685445785522461, |
| "learning_rate": 5.364475337902108e-06, |
| "loss": 0.0155, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.6941206941206941, |
| "grad_norm": 0.08676782995462418, |
| "learning_rate": 5.204984925737689e-06, |
| "loss": 0.0166, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.6993006993006993, |
| "grad_norm": 0.09176863729953766, |
| "learning_rate": 5.047062990905436e-06, |
| "loss": 0.016, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.7044807044807044, |
| "grad_norm": 0.08616431057453156, |
| "learning_rate": 4.890761190589157e-06, |
| "loss": 0.0156, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.7096607096607097, |
| "grad_norm": 0.07910116016864777, |
| "learning_rate": 4.736130652017228e-06, |
| "loss": 0.0154, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.7148407148407149, |
| "grad_norm": 0.07988451421260834, |
| "learning_rate": 4.5832219557385896e-06, |
| "loss": 0.0153, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.72002072002072, |
| "grad_norm": 0.07867439091205597, |
| "learning_rate": 4.432085119077536e-06, |
| "loss": 0.0153, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.7252007252007252, |
| "grad_norm": 0.08625340461730957, |
| "learning_rate": 4.2827695797727835e-06, |
| "loss": 0.0153, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.7303807303807304, |
| "grad_norm": 0.07801397144794464, |
| "learning_rate": 4.135324179806079e-06, |
| "loss": 0.0158, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.7355607355607355, |
| "grad_norm": 0.07337366789579391, |
| "learning_rate": 3.989797149425714e-06, |
| "loss": 0.0153, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.7407407407407407, |
| "grad_norm": 0.07763037085533142, |
| "learning_rate": 3.846236091370119e-06, |
| "loss": 0.0154, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.745920745920746, |
| "grad_norm": 0.08092360198497772, |
| "learning_rate": 3.704687965296746e-06, |
| "loss": 0.0162, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.7511007511007511, |
| "grad_norm": 0.08551878482103348, |
| "learning_rate": 3.5651990724212716e-06, |
| "loss": 0.0153, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.7562807562807563, |
| "grad_norm": 0.07818982750177383, |
| "learning_rate": 3.4278150403722222e-06, |
| "loss": 0.0155, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.7614607614607615, |
| "grad_norm": 0.08288519084453583, |
| "learning_rate": 3.292580808265897e-06, |
| "loss": 0.0154, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.7666407666407666, |
| "grad_norm": 0.08305075764656067, |
| "learning_rate": 3.1595406120065174e-06, |
| "loss": 0.0152, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.7718207718207718, |
| "grad_norm": 0.08365499973297119, |
| "learning_rate": 3.0287379698164245e-06, |
| "loss": 0.0148, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.777000777000777, |
| "grad_norm": 0.07930707186460495, |
| "learning_rate": 2.900215668000991e-06, |
| "loss": 0.0161, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.7821807821807821, |
| "grad_norm": 0.07458413392305374, |
| "learning_rate": 2.7740157469529915e-06, |
| "loss": 0.0159, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.7873607873607874, |
| "grad_norm": 0.08544889092445374, |
| "learning_rate": 2.6501794874009425e-06, |
| "loss": 0.0153, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.7925407925407926, |
| "grad_norm": 0.0796743705868721, |
| "learning_rate": 2.5287473969059174e-06, |
| "loss": 0.0156, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.7977207977207977, |
| "grad_norm": 0.089390829205513, |
| "learning_rate": 2.4097591966113155e-06, |
| "loss": 0.0149, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.8029008029008029, |
| "grad_norm": 0.0870826244354248, |
| "learning_rate": 2.2932538082498225e-06, |
| "loss": 0.0156, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.8080808080808081, |
| "grad_norm": 0.07120909541845322, |
| "learning_rate": 2.179269341411896e-06, |
| "loss": 0.0152, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.8132608132608132, |
| "grad_norm": 0.07515786588191986, |
| "learning_rate": 2.0678430810799e-06, |
| "loss": 0.0141, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.8184408184408184, |
| "grad_norm": 0.07680921256542206, |
| "learning_rate": 1.959011475431952e-06, |
| "loss": 0.0155, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.8236208236208237, |
| "grad_norm": 0.08255264908075333, |
| "learning_rate": 1.8528101239195394e-06, |
| "loss": 0.0153, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.8288008288008288, |
| "grad_norm": 0.08244433999061584, |
| "learning_rate": 1.7492737656227032e-06, |
| "loss": 0.0152, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.833980833980834, |
| "grad_norm": 0.06824195384979248, |
| "learning_rate": 1.6484362678867083e-06, |
| "loss": 0.015, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.8391608391608392, |
| "grad_norm": 0.07020522654056549, |
| "learning_rate": 1.5503306152438146e-06, |
| "loss": 0.015, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.8443408443408443, |
| "grad_norm": 0.08058454096317291, |
| "learning_rate": 1.4549888986238658e-06, |
| "loss": 0.0154, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.8495208495208495, |
| "grad_norm": 0.0823742225766182, |
| "learning_rate": 1.3624423048571434e-06, |
| "loss": 0.0146, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.8547008547008547, |
| "grad_norm": 0.07870230078697205, |
| "learning_rate": 1.2727211064729862e-06, |
| "loss": 0.0149, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.8598808598808598, |
| "grad_norm": 0.06677371263504028, |
| "learning_rate": 1.1858546517974511e-06, |
| "loss": 0.014, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.8650608650608651, |
| "grad_norm": 0.08251772075891495, |
| "learning_rate": 1.1018713553533279e-06, |
| "loss": 0.015, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.8702408702408703, |
| "grad_norm": 0.07941140979528427, |
| "learning_rate": 1.0207986885655664e-06, |
| "loss": 0.0151, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.8754208754208754, |
| "grad_norm": 0.07803778350353241, |
| "learning_rate": 9.426631707752243e-07, |
| "loss": 0.015, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.8806008806008806, |
| "grad_norm": 0.07801060378551483, |
| "learning_rate": 8.674903605648221e-07, |
| "loss": 0.0155, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.8857808857808858, |
| "grad_norm": 0.07815500348806381, |
| "learning_rate": 7.953048473980041e-07, |
| "loss": 0.0149, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.8909608909608909, |
| "grad_norm": 0.07662923634052277, |
| "learning_rate": 7.261302435761564e-07, |
| "loss": 0.0155, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.8961408961408961, |
| "grad_norm": 0.07826782763004303, |
| "learning_rate": 6.59989176514707e-07, |
| "loss": 0.0148, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.9013209013209014, |
| "grad_norm": 0.07217193394899368, |
| "learning_rate": 5.969032813415577e-07, |
| "loss": 0.0145, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.9065009065009065, |
| "grad_norm": 0.08113069832324982, |
| "learning_rate": 5.368931938201006e-07, |
| "loss": 0.015, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.9116809116809117, |
| "grad_norm": 0.07725197076797485, |
| "learning_rate": 4.799785435991577e-07, |
| "loss": 0.0148, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.9168609168609169, |
| "grad_norm": 0.07793201506137848, |
| "learning_rate": 4.261779477919892e-07, |
| "loss": 0.0151, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.922040922040922, |
| "grad_norm": 0.0730026513338089, |
| "learning_rate": 3.755090048865406e-07, |
| "loss": 0.0156, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.9272209272209272, |
| "grad_norm": 0.07092654705047607, |
| "learning_rate": 3.27988288988873e-07, |
| "loss": 0.0147, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.9324009324009324, |
| "grad_norm": 0.07984331995248795, |
| "learning_rate": 2.8363134440166806e-07, |
| "loss": 0.0151, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.9375809375809376, |
| "grad_norm": 0.07417917996644974, |
| "learning_rate": 2.424526805396088e-07, |
| "loss": 0.0148, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.9427609427609428, |
| "grad_norm": 0.07675167918205261, |
| "learning_rate": 2.0446576718325283e-07, |
| "loss": 0.0152, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.947940947940948, |
| "grad_norm": 0.07929011434316635, |
| "learning_rate": 1.6968303007300124e-07, |
| "loss": 0.0149, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.9531209531209531, |
| "grad_norm": 0.06900076568126678, |
| "learning_rate": 1.3811584684455648e-07, |
| "loss": 0.0153, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.9583009583009583, |
| "grad_norm": 0.07187589257955551, |
| "learning_rate": 1.0977454330723725e-07, |
| "loss": 0.0146, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.9634809634809635, |
| "grad_norm": 0.07358822226524353, |
| "learning_rate": 8.466839006634364e-08, |
| "loss": 0.0148, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.9686609686609686, |
| "grad_norm": 0.07493717968463898, |
| "learning_rate": 6.280559949068731e-08, |
| "loss": 0.0146, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.9738409738409738, |
| "grad_norm": 0.07487895339727402, |
| "learning_rate": 4.4193323026283655e-08, |
| "loss": 0.0148, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.9790209790209791, |
| "grad_norm": 0.07463818788528442, |
| "learning_rate": 2.8837648857066304e-08, |
| "loss": 0.0147, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.9842009842009842, |
| "grad_norm": 0.07078888267278671, |
| "learning_rate": 1.6743599913405796e-08, |
| "loss": 0.0148, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.9893809893809894, |
| "grad_norm": 0.08100683987140656, |
| "learning_rate": 7.91513222908602e-09, |
| "loss": 0.0152, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.9945609945609946, |
| "grad_norm": 0.07245540618896484, |
| "learning_rate": 2.3551336472582563e-09, |
| "loss": 0.0149, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.9997409997409997, |
| "grad_norm": 0.07577144354581833, |
| "learning_rate": 6.542287581123852e-11, |
| "loss": 0.0143, |
| "step": 1930 |
| }, |
| { |
| "epoch": 1.0, |
| "step": 1931, |
| "total_flos": 3.1334792111153218e+19, |
| "train_loss": 0.029184402332697614, |
| "train_runtime": 111018.586, |
| "train_samples_per_second": 8.903, |
| "train_steps_per_second": 0.017 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1931, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.1334792111153218e+19, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|