{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1931,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00518000518000518,
"grad_norm": 6.292460918426514,
"learning_rate": 9.278350515463919e-07,
"loss": 0.5994,
"step": 10
},
{
"epoch": 0.01036001036001036,
"grad_norm": 3.469723701477051,
"learning_rate": 1.9587628865979384e-06,
"loss": 0.4581,
"step": 20
},
{
"epoch": 0.01554001554001554,
"grad_norm": 1.8256299495697021,
"learning_rate": 2.9896907216494846e-06,
"loss": 0.254,
"step": 30
},
{
"epoch": 0.02072002072002072,
"grad_norm": 0.6593835949897766,
"learning_rate": 4.020618556701032e-06,
"loss": 0.1497,
"step": 40
},
{
"epoch": 0.0259000259000259,
"grad_norm": 0.4585426449775696,
"learning_rate": 5.051546391752578e-06,
"loss": 0.1086,
"step": 50
},
{
"epoch": 0.03108003108003108,
"grad_norm": 0.3679659068584442,
"learning_rate": 6.082474226804124e-06,
"loss": 0.087,
"step": 60
},
{
"epoch": 0.03626003626003626,
"grad_norm": 0.3684946596622467,
"learning_rate": 7.113402061855671e-06,
"loss": 0.0775,
"step": 70
},
{
"epoch": 0.04144004144004144,
"grad_norm": 0.328337162733078,
"learning_rate": 8.144329896907216e-06,
"loss": 0.0686,
"step": 80
},
{
"epoch": 0.046620046620046623,
"grad_norm": 1.8907654285430908,
"learning_rate": 9.175257731958764e-06,
"loss": 0.0658,
"step": 90
},
{
"epoch": 0.0518000518000518,
"grad_norm": 0.5488079786300659,
"learning_rate": 1.0206185567010309e-05,
"loss": 0.0631,
"step": 100
},
{
"epoch": 0.05698005698005698,
"grad_norm": 0.3149188458919525,
"learning_rate": 1.1237113402061856e-05,
"loss": 0.0554,
"step": 110
},
{
"epoch": 0.06216006216006216,
"grad_norm": 0.28675776720046997,
"learning_rate": 1.2268041237113405e-05,
"loss": 0.0487,
"step": 120
},
{
"epoch": 0.06734006734006734,
"grad_norm": 0.20799441635608673,
"learning_rate": 1.3298969072164948e-05,
"loss": 0.0462,
"step": 130
},
{
"epoch": 0.07252007252007252,
"grad_norm": 0.21834194660186768,
"learning_rate": 1.4329896907216495e-05,
"loss": 0.0437,
"step": 140
},
{
"epoch": 0.0777000777000777,
"grad_norm": 0.20788688957691193,
"learning_rate": 1.5360824742268042e-05,
"loss": 0.0416,
"step": 150
},
{
"epoch": 0.08288008288008288,
"grad_norm": 0.19446660578250885,
"learning_rate": 1.6391752577319588e-05,
"loss": 0.0395,
"step": 160
},
{
"epoch": 0.08806008806008805,
"grad_norm": 0.23992358148097992,
"learning_rate": 1.7422680412371137e-05,
"loss": 0.0382,
"step": 170
},
{
"epoch": 0.09324009324009325,
"grad_norm": 0.274311363697052,
"learning_rate": 1.8453608247422682e-05,
"loss": 0.0374,
"step": 180
},
{
"epoch": 0.09842009842009843,
"grad_norm": 0.20221377909183502,
"learning_rate": 1.9484536082474227e-05,
"loss": 0.0362,
"step": 190
},
{
"epoch": 0.1036001036001036,
"grad_norm": 0.1887311339378357,
"learning_rate": 1.9999591109366888e-05,
"loss": 0.0352,
"step": 200
},
{
"epoch": 0.10878010878010878,
"grad_norm": 0.16539287567138672,
"learning_rate": 1.9996320184929093e-05,
"loss": 0.0345,
"step": 210
},
{
"epoch": 0.11396011396011396,
"grad_norm": 0.19868427515029907,
"learning_rate": 1.9989779405991916e-05,
"loss": 0.0321,
"step": 220
},
{
"epoch": 0.11914011914011914,
"grad_norm": 0.18100829422473907,
"learning_rate": 1.9979970912082214e-05,
"loss": 0.0325,
"step": 230
},
{
"epoch": 0.12432012432012432,
"grad_norm": 0.16753119230270386,
"learning_rate": 1.9966897911615417e-05,
"loss": 0.0321,
"step": 240
},
{
"epoch": 0.1295001295001295,
"grad_norm": 0.16504332423210144,
"learning_rate": 1.9950564680846042e-05,
"loss": 0.0302,
"step": 250
},
{
"epoch": 0.13468013468013468,
"grad_norm": 0.16585540771484375,
"learning_rate": 1.993097656246892e-05,
"loss": 0.0299,
"step": 260
},
{
"epoch": 0.13986013986013987,
"grad_norm": 0.1773216277360916,
"learning_rate": 1.9908139963871547e-05,
"loss": 0.0288,
"step": 270
},
{
"epoch": 0.14504014504014504,
"grad_norm": 0.16576074063777924,
"learning_rate": 1.988206235503821e-05,
"loss": 0.0292,
"step": 280
},
{
"epoch": 0.15022015022015023,
"grad_norm": 0.15523289144039154,
"learning_rate": 1.98527522661065e-05,
"loss": 0.0291,
"step": 290
},
{
"epoch": 0.1554001554001554,
"grad_norm": 0.16800253093242645,
"learning_rate": 1.9820219284577052e-05,
"loss": 0.0284,
"step": 300
},
{
"epoch": 0.16058016058016059,
"grad_norm": 0.15856459736824036,
"learning_rate": 1.9784474052177435e-05,
"loss": 0.0277,
"step": 310
},
{
"epoch": 0.16576016576016575,
"grad_norm": 0.15079988539218903,
"learning_rate": 1.9745528261381156e-05,
"loss": 0.0277,
"step": 320
},
{
"epoch": 0.17094017094017094,
"grad_norm": 0.18376828730106354,
"learning_rate": 1.970339465158301e-05,
"loss": 0.0272,
"step": 330
},
{
"epoch": 0.1761201761201761,
"grad_norm": 0.1650630086660385,
"learning_rate": 1.9658087004931926e-05,
"loss": 0.0276,
"step": 340
},
{
"epoch": 0.1813001813001813,
"grad_norm": 0.14251892268657684,
"learning_rate": 1.960962014182276e-05,
"loss": 0.0263,
"step": 350
},
{
"epoch": 0.1864801864801865,
"grad_norm": 0.13589179515838623,
"learning_rate": 1.955800991604846e-05,
"loss": 0.0267,
"step": 360
},
{
"epoch": 0.19166019166019166,
"grad_norm": 0.15240447223186493,
"learning_rate": 1.9503273209614183e-05,
"loss": 0.0259,
"step": 370
},
{
"epoch": 0.19684019684019685,
"grad_norm": 0.13825932145118713,
"learning_rate": 1.9445427927215108e-05,
"loss": 0.0243,
"step": 380
},
{
"epoch": 0.20202020202020202,
"grad_norm": 0.13044404983520508,
"learning_rate": 1.9384492990379703e-05,
"loss": 0.0254,
"step": 390
},
{
"epoch": 0.2072002072002072,
"grad_norm": 0.15161941945552826,
"learning_rate": 1.9320488331280372e-05,
"loss": 0.024,
"step": 400
},
{
"epoch": 0.21238021238021237,
"grad_norm": 0.12539660930633545,
"learning_rate": 1.9253434886213548e-05,
"loss": 0.0247,
"step": 410
},
{
"epoch": 0.21756021756021757,
"grad_norm": 0.1758316159248352,
"learning_rate": 1.9183354588751274e-05,
"loss": 0.025,
"step": 420
},
{
"epoch": 0.22274022274022273,
"grad_norm": 0.1396271139383316,
"learning_rate": 1.911027036256664e-05,
"loss": 0.0239,
"step": 430
},
{
"epoch": 0.22792022792022792,
"grad_norm": 0.1349797546863556,
"learning_rate": 1.9034206113935297e-05,
"loss": 0.0237,
"step": 440
},
{
"epoch": 0.2331002331002331,
"grad_norm": 0.13483276963233948,
"learning_rate": 1.8955186723915573e-05,
"loss": 0.024,
"step": 450
},
{
"epoch": 0.23828023828023828,
"grad_norm": 0.1201694905757904,
"learning_rate": 1.887323804020975e-05,
"loss": 0.0226,
"step": 460
},
{
"epoch": 0.24346024346024345,
"grad_norm": 0.1224222481250763,
"learning_rate": 1.878838686870911e-05,
"loss": 0.0242,
"step": 470
},
{
"epoch": 0.24864024864024864,
"grad_norm": 0.11309316009283066,
"learning_rate": 1.8700660964725583e-05,
"loss": 0.0221,
"step": 480
},
{
"epoch": 0.2538202538202538,
"grad_norm": 0.13578607141971588,
"learning_rate": 1.8610089023912828e-05,
"loss": 0.0237,
"step": 490
},
{
"epoch": 0.259000259000259,
"grad_norm": 0.13230614364147186,
"learning_rate": 1.8516700672879706e-05,
"loss": 0.0224,
"step": 500
},
{
"epoch": 0.2641802641802642,
"grad_norm": 0.12713994085788727,
"learning_rate": 1.8420526459499252e-05,
"loss": 0.023,
"step": 510
},
{
"epoch": 0.26936026936026936,
"grad_norm": 0.1189827024936676,
"learning_rate": 1.8321597842916282e-05,
"loss": 0.0219,
"step": 520
},
{
"epoch": 0.2745402745402745,
"grad_norm": 0.12820567190647125,
"learning_rate": 1.821994718325693e-05,
"loss": 0.0228,
"step": 530
},
{
"epoch": 0.27972027972027974,
"grad_norm": 0.11525845527648926,
"learning_rate": 1.811560773104346e-05,
"loss": 0.0214,
"step": 540
},
{
"epoch": 0.2849002849002849,
"grad_norm": 0.1343483179807663,
"learning_rate": 1.8008613616317823e-05,
"loss": 0.0218,
"step": 550
},
{
"epoch": 0.29008029008029007,
"grad_norm": 0.11713062226772308,
"learning_rate": 1.7898999837477528e-05,
"loss": 0.0216,
"step": 560
},
{
"epoch": 0.29526029526029524,
"grad_norm": 0.14427222311496735,
"learning_rate": 1.7786802249827454e-05,
"loss": 0.0212,
"step": 570
},
{
"epoch": 0.30044030044030046,
"grad_norm": 0.11759313195943832,
"learning_rate": 1.7672057553851387e-05,
"loss": 0.0216,
"step": 580
},
{
"epoch": 0.3056203056203056,
"grad_norm": 0.11268991976976395,
"learning_rate": 1.755480328320705e-05,
"loss": 0.0212,
"step": 590
},
{
"epoch": 0.3108003108003108,
"grad_norm": 0.1134837344288826,
"learning_rate": 1.7435077792448666e-05,
"loss": 0.0214,
"step": 600
},
{
"epoch": 0.315980315980316,
"grad_norm": 0.10657832026481628,
"learning_rate": 1.731292024448091e-05,
"loss": 0.0214,
"step": 610
},
{
"epoch": 0.32116032116032117,
"grad_norm": 0.11426619440317154,
"learning_rate": 1.7188370597748553e-05,
"loss": 0.0211,
"step": 620
},
{
"epoch": 0.32634032634032634,
"grad_norm": 0.11127694696187973,
"learning_rate": 1.706146959316576e-05,
"loss": 0.0194,
"step": 630
},
{
"epoch": 0.3315203315203315,
"grad_norm": 0.11302390694618225,
"learning_rate": 1.6932258740789553e-05,
"loss": 0.02,
"step": 640
},
{
"epoch": 0.3367003367003367,
"grad_norm": 0.1174977570772171,
"learning_rate": 1.6800780306241596e-05,
"loss": 0.0197,
"step": 650
},
{
"epoch": 0.3418803418803419,
"grad_norm": 0.12497369199991226,
"learning_rate": 1.666707729688289e-05,
"loss": 0.0197,
"step": 660
},
{
"epoch": 0.34706034706034705,
"grad_norm": 0.10607603937387466,
"learning_rate": 1.6531193447745776e-05,
"loss": 0.0197,
"step": 670
},
{
"epoch": 0.3522403522403522,
"grad_norm": 0.1149929016828537,
"learning_rate": 1.6393173207228e-05,
"loss": 0.0199,
"step": 680
},
{
"epoch": 0.35742035742035744,
"grad_norm": 0.11435063183307648,
"learning_rate": 1.6253061722553353e-05,
"loss": 0.0196,
"step": 690
},
{
"epoch": 0.3626003626003626,
"grad_norm": 0.11646245419979095,
"learning_rate": 1.6110904825003754e-05,
"loss": 0.0199,
"step": 700
},
{
"epoch": 0.36778036778036777,
"grad_norm": 0.6431416273117065,
"learning_rate": 1.596674901492758e-05,
"loss": 0.0238,
"step": 710
},
{
"epoch": 0.372960372960373,
"grad_norm": 6.487484455108643,
"learning_rate": 1.5820641446529127e-05,
"loss": 0.0379,
"step": 720
},
{
"epoch": 0.37814037814037815,
"grad_norm": 0.19220368564128876,
"learning_rate": 1.567262991244419e-05,
"loss": 0.0306,
"step": 730
},
{
"epoch": 0.3833203833203833,
"grad_norm": 0.11730585992336273,
"learning_rate": 1.5522762828106822e-05,
"loss": 0.022,
"step": 740
},
{
"epoch": 0.3885003885003885,
"grad_norm": 0.12174461036920547,
"learning_rate": 1.5371089215912363e-05,
"loss": 0.0215,
"step": 750
},
{
"epoch": 0.3936803936803937,
"grad_norm": 0.1016201600432396,
"learning_rate": 1.5217658689181925e-05,
"loss": 0.0205,
"step": 760
},
{
"epoch": 0.39886039886039887,
"grad_norm": 0.09577111154794693,
"learning_rate": 1.5062521435933586e-05,
"loss": 0.0198,
"step": 770
},
{
"epoch": 0.40404040404040403,
"grad_norm": 0.08802168816328049,
"learning_rate": 1.4905728202465596e-05,
"loss": 0.0188,
"step": 780
},
{
"epoch": 0.4092204092204092,
"grad_norm": 0.12182024121284485,
"learning_rate": 1.4747330276756986e-05,
"loss": 0.0195,
"step": 790
},
{
"epoch": 0.4144004144004144,
"grad_norm": 0.09344793111085892,
"learning_rate": 1.4587379471690937e-05,
"loss": 0.0202,
"step": 800
},
{
"epoch": 0.4195804195804196,
"grad_norm": 0.10629361122846603,
"learning_rate": 1.4425928108106519e-05,
"loss": 0.0186,
"step": 810
},
{
"epoch": 0.42476042476042475,
"grad_norm": 0.09861776977777481,
"learning_rate": 1.4263028997684217e-05,
"loss": 0.0197,
"step": 820
},
{
"epoch": 0.4299404299404299,
"grad_norm": 0.10772161930799484,
"learning_rate": 1.4098735425670931e-05,
"loss": 0.0193,
"step": 830
},
{
"epoch": 0.43512043512043513,
"grad_norm": 0.09898128360509872,
"learning_rate": 1.393310113345006e-05,
"loss": 0.0194,
"step": 840
},
{
"epoch": 0.4403004403004403,
"grad_norm": 0.09928678721189499,
"learning_rate": 1.3766180300962393e-05,
"loss": 0.0183,
"step": 850
},
{
"epoch": 0.44548044548044546,
"grad_norm": 0.09649905562400818,
"learning_rate": 1.3598027528983517e-05,
"loss": 0.0179,
"step": 860
},
{
"epoch": 0.4506604506604507,
"grad_norm": 0.1102428138256073,
"learning_rate": 1.34286978212636e-05,
"loss": 0.0193,
"step": 870
},
{
"epoch": 0.45584045584045585,
"grad_norm": 0.10131724178791046,
"learning_rate": 1.325824656653534e-05,
"loss": 0.0193,
"step": 880
},
{
"epoch": 0.461020461020461,
"grad_norm": 0.09785453975200653,
"learning_rate": 1.308672952039598e-05,
"loss": 0.0182,
"step": 890
},
{
"epoch": 0.4662004662004662,
"grad_norm": 0.11056746542453766,
"learning_rate": 1.2914202787069345e-05,
"loss": 0.0183,
"step": 900
},
{
"epoch": 0.4713804713804714,
"grad_norm": 0.10242997854948044,
"learning_rate": 1.2740722801053808e-05,
"loss": 0.0183,
"step": 910
},
{
"epoch": 0.47656047656047656,
"grad_norm": 0.09493660181760788,
"learning_rate": 1.2566346308662248e-05,
"loss": 0.0187,
"step": 920
},
{
"epoch": 0.48174048174048173,
"grad_norm": 0.09588351845741272,
"learning_rate": 1.239113034945999e-05,
"loss": 0.0174,
"step": 930
},
{
"epoch": 0.4869204869204869,
"grad_norm": 0.09339374303817749,
"learning_rate": 1.2215132237606843e-05,
"loss": 0.0177,
"step": 940
},
{
"epoch": 0.4921004921004921,
"grad_norm": 0.09003674238920212,
"learning_rate": 1.2038409543109295e-05,
"loss": 0.0176,
"step": 950
},
{
"epoch": 0.4972804972804973,
"grad_norm": 0.09645362198352814,
"learning_rate": 1.186102007298904e-05,
"loss": 0.0185,
"step": 960
},
{
"epoch": 0.5024605024605024,
"grad_norm": 0.09411321580410004,
"learning_rate": 1.168302185237395e-05,
"loss": 0.0175,
"step": 970
},
{
"epoch": 0.5076405076405076,
"grad_norm": 0.0976066067814827,
"learning_rate": 1.1504473105517731e-05,
"loss": 0.017,
"step": 980
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.09469664096832275,
"learning_rate": 1.1325432236754424e-05,
"loss": 0.0174,
"step": 990
},
{
"epoch": 0.518000518000518,
"grad_norm": 0.09026806801557541,
"learning_rate": 1.1145957811394006e-05,
"loss": 0.0174,
"step": 1000
},
{
"epoch": 0.5231805231805232,
"grad_norm": 0.09828916192054749,
"learning_rate": 1.096610853656535e-05,
"loss": 0.0175,
"step": 1010
},
{
"epoch": 0.5283605283605284,
"grad_norm": 0.09940842539072037,
"learning_rate": 1.0785943242012763e-05,
"loss": 0.0167,
"step": 1020
},
{
"epoch": 0.5335405335405335,
"grad_norm": 0.07944466173648834,
"learning_rate": 1.0605520860852442e-05,
"loss": 0.0173,
"step": 1030
},
{
"epoch": 0.5387205387205387,
"grad_norm": 0.08528061211109161,
"learning_rate": 1.0424900410295115e-05,
"loss": 0.0169,
"step": 1040
},
{
"epoch": 0.5439005439005439,
"grad_norm": 0.10138797760009766,
"learning_rate": 1.0244140972341155e-05,
"loss": 0.0174,
"step": 1050
},
{
"epoch": 0.549080549080549,
"grad_norm": 0.10442786663770676,
"learning_rate": 1.0063301674454526e-05,
"loss": 0.0171,
"step": 1060
},
{
"epoch": 0.5542605542605542,
"grad_norm": 0.10265690833330154,
"learning_rate": 9.882441670221846e-06,
"loss": 0.0162,
"step": 1070
},
{
"epoch": 0.5594405594405595,
"grad_norm": 0.10706663131713867,
"learning_rate": 9.701620120002885e-06,
"loss": 0.0178,
"step": 1080
},
{
"epoch": 0.5646205646205646,
"grad_norm": 0.08483204990625381,
"learning_rate": 9.520896171578891e-06,
"loss": 0.0175,
"step": 1090
},
{
"epoch": 0.5698005698005698,
"grad_norm": 0.09182880818843842,
"learning_rate": 9.340328940805003e-06,
"loss": 0.0174,
"step": 1100
},
{
"epoch": 0.574980574980575,
"grad_norm": 0.09123273193836212,
"learning_rate": 9.159977492273086e-06,
"loss": 0.0166,
"step": 1110
},
{
"epoch": 0.5801605801605801,
"grad_norm": 0.09600038826465607,
"learning_rate": 8.9799008199914e-06,
"loss": 0.0166,
"step": 1120
},
{
"epoch": 0.5853405853405853,
"grad_norm": 0.07484059780836105,
"learning_rate": 8.800157828087275e-06,
"loss": 0.017,
"step": 1130
},
{
"epoch": 0.5905205905205905,
"grad_norm": 0.1001143753528595,
"learning_rate": 8.620807311539258e-06,
"loss": 0.017,
"step": 1140
},
{
"epoch": 0.5957005957005957,
"grad_norm": 0.09151753783226013,
"learning_rate": 8.441907936944933e-06,
"loss": 0.0172,
"step": 1150
},
{
"epoch": 0.6008806008806009,
"grad_norm": 0.08270428329706192,
"learning_rate": 8.263518223330698e-06,
"loss": 0.0164,
"step": 1160
},
{
"epoch": 0.6060606060606061,
"grad_norm": 0.07453130185604095,
"learning_rate": 8.085696523009907e-06,
"loss": 0.0164,
"step": 1170
},
{
"epoch": 0.6112406112406112,
"grad_norm": 0.08895987272262573,
"learning_rate": 7.908501002495445e-06,
"loss": 0.0169,
"step": 1180
},
{
"epoch": 0.6164206164206164,
"grad_norm": 0.08611361682415009,
"learning_rate": 7.731989623473144e-06,
"loss": 0.0155,
"step": 1190
},
{
"epoch": 0.6216006216006216,
"grad_norm": 0.09515902400016785,
"learning_rate": 7.556220123842173e-06,
"loss": 0.0169,
"step": 1200
},
{
"epoch": 0.6267806267806267,
"grad_norm": 0.08863533288240433,
"learning_rate": 7.38124999882863e-06,
"loss": 0.0167,
"step": 1210
},
{
"epoch": 0.631960631960632,
"grad_norm": 0.0799228847026825,
"learning_rate": 7.207136482178538e-06,
"loss": 0.0162,
"step": 1220
},
{
"epoch": 0.6371406371406372,
"grad_norm": 0.08676367253065109,
"learning_rate": 7.033936527436318e-06,
"loss": 0.017,
"step": 1230
},
{
"epoch": 0.6423206423206423,
"grad_norm": 0.08304847776889801,
"learning_rate": 6.861706789314993e-06,
"loss": 0.0158,
"step": 1240
},
{
"epoch": 0.6475006475006475,
"grad_norm": 0.0753399059176445,
"learning_rate": 6.6905036051640804e-06,
"loss": 0.016,
"step": 1250
},
{
"epoch": 0.6526806526806527,
"grad_norm": 0.09043081849813461,
"learning_rate": 6.520382976541313e-06,
"loss": 0.0159,
"step": 1260
},
{
"epoch": 0.6578606578606578,
"grad_norm": 0.08677306771278381,
"learning_rate": 6.351400550894224e-06,
"loss": 0.0158,
"step": 1270
},
{
"epoch": 0.663040663040663,
"grad_norm": 0.08940693736076355,
"learning_rate": 6.183611603357513e-06,
"loss": 0.0159,
"step": 1280
},
{
"epoch": 0.6682206682206682,
"grad_norm": 0.0751878097653389,
"learning_rate": 6.0170710186722605e-06,
"loss": 0.0161,
"step": 1290
},
{
"epoch": 0.6734006734006734,
"grad_norm": 0.0836741104722023,
"learning_rate": 5.851833273232788e-06,
"loss": 0.016,
"step": 1300
},
{
"epoch": 0.6785806785806786,
"grad_norm": 0.08590537309646606,
"learning_rate": 5.687952417267115e-06,
"loss": 0.0157,
"step": 1310
},
{
"epoch": 0.6837606837606838,
"grad_norm": 0.09049531072378159,
"learning_rate": 5.525482057156833e-06,
"loss": 0.0159,
"step": 1320
},
{
"epoch": 0.6889406889406889,
"grad_norm": 0.08685445785522461,
"learning_rate": 5.364475337902108e-06,
"loss": 0.0155,
"step": 1330
},
{
"epoch": 0.6941206941206941,
"grad_norm": 0.08676782995462418,
"learning_rate": 5.204984925737689e-06,
"loss": 0.0166,
"step": 1340
},
{
"epoch": 0.6993006993006993,
"grad_norm": 0.09176863729953766,
"learning_rate": 5.047062990905436e-06,
"loss": 0.016,
"step": 1350
},
{
"epoch": 0.7044807044807044,
"grad_norm": 0.08616431057453156,
"learning_rate": 4.890761190589157e-06,
"loss": 0.0156,
"step": 1360
},
{
"epoch": 0.7096607096607097,
"grad_norm": 0.07910116016864777,
"learning_rate": 4.736130652017228e-06,
"loss": 0.0154,
"step": 1370
},
{
"epoch": 0.7148407148407149,
"grad_norm": 0.07988451421260834,
"learning_rate": 4.5832219557385896e-06,
"loss": 0.0153,
"step": 1380
},
{
"epoch": 0.72002072002072,
"grad_norm": 0.07867439091205597,
"learning_rate": 4.432085119077536e-06,
"loss": 0.0153,
"step": 1390
},
{
"epoch": 0.7252007252007252,
"grad_norm": 0.08625340461730957,
"learning_rate": 4.2827695797727835e-06,
"loss": 0.0153,
"step": 1400
},
{
"epoch": 0.7303807303807304,
"grad_norm": 0.07801397144794464,
"learning_rate": 4.135324179806079e-06,
"loss": 0.0158,
"step": 1410
},
{
"epoch": 0.7355607355607355,
"grad_norm": 0.07337366789579391,
"learning_rate": 3.989797149425714e-06,
"loss": 0.0153,
"step": 1420
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.07763037085533142,
"learning_rate": 3.846236091370119e-06,
"loss": 0.0154,
"step": 1430
},
{
"epoch": 0.745920745920746,
"grad_norm": 0.08092360198497772,
"learning_rate": 3.704687965296746e-06,
"loss": 0.0162,
"step": 1440
},
{
"epoch": 0.7511007511007511,
"grad_norm": 0.08551878482103348,
"learning_rate": 3.5651990724212716e-06,
"loss": 0.0153,
"step": 1450
},
{
"epoch": 0.7562807562807563,
"grad_norm": 0.07818982750177383,
"learning_rate": 3.4278150403722222e-06,
"loss": 0.0155,
"step": 1460
},
{
"epoch": 0.7614607614607615,
"grad_norm": 0.08288519084453583,
"learning_rate": 3.292580808265897e-06,
"loss": 0.0154,
"step": 1470
},
{
"epoch": 0.7666407666407666,
"grad_norm": 0.08305075764656067,
"learning_rate": 3.1595406120065174e-06,
"loss": 0.0152,
"step": 1480
},
{
"epoch": 0.7718207718207718,
"grad_norm": 0.08365499973297119,
"learning_rate": 3.0287379698164245e-06,
"loss": 0.0148,
"step": 1490
},
{
"epoch": 0.777000777000777,
"grad_norm": 0.07930707186460495,
"learning_rate": 2.900215668000991e-06,
"loss": 0.0161,
"step": 1500
},
{
"epoch": 0.7821807821807821,
"grad_norm": 0.07458413392305374,
"learning_rate": 2.7740157469529915e-06,
"loss": 0.0159,
"step": 1510
},
{
"epoch": 0.7873607873607874,
"grad_norm": 0.08544889092445374,
"learning_rate": 2.6501794874009425e-06,
"loss": 0.0153,
"step": 1520
},
{
"epoch": 0.7925407925407926,
"grad_norm": 0.0796743705868721,
"learning_rate": 2.5287473969059174e-06,
"loss": 0.0156,
"step": 1530
},
{
"epoch": 0.7977207977207977,
"grad_norm": 0.089390829205513,
"learning_rate": 2.4097591966113155e-06,
"loss": 0.0149,
"step": 1540
},
{
"epoch": 0.8029008029008029,
"grad_norm": 0.0870826244354248,
"learning_rate": 2.2932538082498225e-06,
"loss": 0.0156,
"step": 1550
},
{
"epoch": 0.8080808080808081,
"grad_norm": 0.07120909541845322,
"learning_rate": 2.179269341411896e-06,
"loss": 0.0152,
"step": 1560
},
{
"epoch": 0.8132608132608132,
"grad_norm": 0.07515786588191986,
"learning_rate": 2.0678430810799e-06,
"loss": 0.0141,
"step": 1570
},
{
"epoch": 0.8184408184408184,
"grad_norm": 0.07680921256542206,
"learning_rate": 1.959011475431952e-06,
"loss": 0.0155,
"step": 1580
},
{
"epoch": 0.8236208236208237,
"grad_norm": 0.08255264908075333,
"learning_rate": 1.8528101239195394e-06,
"loss": 0.0153,
"step": 1590
},
{
"epoch": 0.8288008288008288,
"grad_norm": 0.08244433999061584,
"learning_rate": 1.7492737656227032e-06,
"loss": 0.0152,
"step": 1600
},
{
"epoch": 0.833980833980834,
"grad_norm": 0.06824195384979248,
"learning_rate": 1.6484362678867083e-06,
"loss": 0.015,
"step": 1610
},
{
"epoch": 0.8391608391608392,
"grad_norm": 0.07020522654056549,
"learning_rate": 1.5503306152438146e-06,
"loss": 0.015,
"step": 1620
},
{
"epoch": 0.8443408443408443,
"grad_norm": 0.08058454096317291,
"learning_rate": 1.4549888986238658e-06,
"loss": 0.0154,
"step": 1630
},
{
"epoch": 0.8495208495208495,
"grad_norm": 0.0823742225766182,
"learning_rate": 1.3624423048571434e-06,
"loss": 0.0146,
"step": 1640
},
{
"epoch": 0.8547008547008547,
"grad_norm": 0.07870230078697205,
"learning_rate": 1.2727211064729862e-06,
"loss": 0.0149,
"step": 1650
},
{
"epoch": 0.8598808598808598,
"grad_norm": 0.06677371263504028,
"learning_rate": 1.1858546517974511e-06,
"loss": 0.014,
"step": 1660
},
{
"epoch": 0.8650608650608651,
"grad_norm": 0.08251772075891495,
"learning_rate": 1.1018713553533279e-06,
"loss": 0.015,
"step": 1670
},
{
"epoch": 0.8702408702408703,
"grad_norm": 0.07941140979528427,
"learning_rate": 1.0207986885655664e-06,
"loss": 0.0151,
"step": 1680
},
{
"epoch": 0.8754208754208754,
"grad_norm": 0.07803778350353241,
"learning_rate": 9.426631707752243e-07,
"loss": 0.015,
"step": 1690
},
{
"epoch": 0.8806008806008806,
"grad_norm": 0.07801060378551483,
"learning_rate": 8.674903605648221e-07,
"loss": 0.0155,
"step": 1700
},
{
"epoch": 0.8857808857808858,
"grad_norm": 0.07815500348806381,
"learning_rate": 7.953048473980041e-07,
"loss": 0.0149,
"step": 1710
},
{
"epoch": 0.8909608909608909,
"grad_norm": 0.07662923634052277,
"learning_rate": 7.261302435761564e-07,
"loss": 0.0155,
"step": 1720
},
{
"epoch": 0.8961408961408961,
"grad_norm": 0.07826782763004303,
"learning_rate": 6.59989176514707e-07,
"loss": 0.0148,
"step": 1730
},
{
"epoch": 0.9013209013209014,
"grad_norm": 0.07217193394899368,
"learning_rate": 5.969032813415577e-07,
"loss": 0.0145,
"step": 1740
},
{
"epoch": 0.9065009065009065,
"grad_norm": 0.08113069832324982,
"learning_rate": 5.368931938201006e-07,
"loss": 0.015,
"step": 1750
},
{
"epoch": 0.9116809116809117,
"grad_norm": 0.07725197076797485,
"learning_rate": 4.799785435991577e-07,
"loss": 0.0148,
"step": 1760
},
{
"epoch": 0.9168609168609169,
"grad_norm": 0.07793201506137848,
"learning_rate": 4.261779477919892e-07,
"loss": 0.0151,
"step": 1770
},
{
"epoch": 0.922040922040922,
"grad_norm": 0.0730026513338089,
"learning_rate": 3.755090048865406e-07,
"loss": 0.0156,
"step": 1780
},
{
"epoch": 0.9272209272209272,
"grad_norm": 0.07092654705047607,
"learning_rate": 3.27988288988873e-07,
"loss": 0.0147,
"step": 1790
},
{
"epoch": 0.9324009324009324,
"grad_norm": 0.07984331995248795,
"learning_rate": 2.8363134440166806e-07,
"loss": 0.0151,
"step": 1800
},
{
"epoch": 0.9375809375809376,
"grad_norm": 0.07417917996644974,
"learning_rate": 2.424526805396088e-07,
"loss": 0.0148,
"step": 1810
},
{
"epoch": 0.9427609427609428,
"grad_norm": 0.07675167918205261,
"learning_rate": 2.0446576718325283e-07,
"loss": 0.0152,
"step": 1820
},
{
"epoch": 0.947940947940948,
"grad_norm": 0.07929011434316635,
"learning_rate": 1.6968303007300124e-07,
"loss": 0.0149,
"step": 1830
},
{
"epoch": 0.9531209531209531,
"grad_norm": 0.06900076568126678,
"learning_rate": 1.3811584684455648e-07,
"loss": 0.0153,
"step": 1840
},
{
"epoch": 0.9583009583009583,
"grad_norm": 0.07187589257955551,
"learning_rate": 1.0977454330723725e-07,
"loss": 0.0146,
"step": 1850
},
{
"epoch": 0.9634809634809635,
"grad_norm": 0.07358822226524353,
"learning_rate": 8.466839006634364e-08,
"loss": 0.0148,
"step": 1860
},
{
"epoch": 0.9686609686609686,
"grad_norm": 0.07493717968463898,
"learning_rate": 6.280559949068731e-08,
"loss": 0.0146,
"step": 1870
},
{
"epoch": 0.9738409738409738,
"grad_norm": 0.07487895339727402,
"learning_rate": 4.4193323026283655e-08,
"loss": 0.0148,
"step": 1880
},
{
"epoch": 0.9790209790209791,
"grad_norm": 0.07463818788528442,
"learning_rate": 2.8837648857066304e-08,
"loss": 0.0147,
"step": 1890
},
{
"epoch": 0.9842009842009842,
"grad_norm": 0.07078888267278671,
"learning_rate": 1.6743599913405796e-08,
"loss": 0.0148,
"step": 1900
},
{
"epoch": 0.9893809893809894,
"grad_norm": 0.08100683987140656,
"learning_rate": 7.91513222908602e-09,
"loss": 0.0152,
"step": 1910
},
{
"epoch": 0.9945609945609946,
"grad_norm": 0.07245540618896484,
"learning_rate": 2.3551336472582563e-09,
"loss": 0.0149,
"step": 1920
},
{
"epoch": 0.9997409997409997,
"grad_norm": 0.07577144354581833,
"learning_rate": 6.542287581123852e-11,
"loss": 0.0143,
"step": 1930
},
{
"epoch": 1.0,
"step": 1931,
"total_flos": 3.1334792111153218e+19,
"train_loss": 0.029184402332697614,
"train_runtime": 111018.586,
"train_samples_per_second": 8.903,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 1931,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.1334792111153218e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}