lawyer/trainer_state.json

1461 lines
37 KiB
JSON

{
"best_metric": 0.81632209,
"best_model_checkpoint": "/mnt/workspace/output/qwen2-7b-instruct/v0-20240906-160301/checkpoint-1300",
"epoch": 0.9782710939704637,
"eval_steps": 100,
"global_step": 1300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"acc": 0.56549656,
"epoch": 0.0007525162261311259,
"grad_norm": 2.546875,
"learning_rate": 1.25e-06,
"loss": 1.70594239,
"memory(GiB)": 16.58,
"step": 1,
"train_speed(iter/s)": 0.138978
},
{
"acc": 0.62718646,
"epoch": 0.007525162261311259,
"grad_norm": 1.6484375,
"learning_rate": 1.25e-05,
"loss": 1.38872963,
"memory(GiB)": 20.36,
"step": 10,
"train_speed(iter/s)": 0.177611
},
{
"acc": 0.65103383,
"epoch": 0.015050324522622519,
"grad_norm": 1.7578125,
"learning_rate": 2.5e-05,
"loss": 1.2758357,
"memory(GiB)": 20.36,
"step": 20,
"train_speed(iter/s)": 0.179256
},
{
"acc": 0.6864974,
"epoch": 0.022575486783933778,
"grad_norm": 0.703125,
"learning_rate": 3.7500000000000003e-05,
"loss": 1.08145266,
"memory(GiB)": 21.71,
"step": 30,
"train_speed(iter/s)": 0.180531
},
{
"acc": 0.70903621,
"epoch": 0.030100649045245037,
"grad_norm": 1.015625,
"learning_rate": 5e-05,
"loss": 0.98728647,
"memory(GiB)": 21.71,
"step": 40,
"train_speed(iter/s)": 0.17987
},
{
"acc": 0.72273793,
"epoch": 0.037625811306556296,
"grad_norm": 0.7265625,
"learning_rate": 4.9992563706868603e-05,
"loss": 0.91920843,
"memory(GiB)": 16.47,
"step": 50,
"train_speed(iter/s)": 0.180818
},
{
"acc": 0.71242127,
"epoch": 0.045150973567867556,
"grad_norm": 0.80859375,
"learning_rate": 4.997025925135086e-05,
"loss": 0.95391417,
"memory(GiB)": 16.47,
"step": 60,
"train_speed(iter/s)": 0.181081
},
{
"acc": 0.72270803,
"epoch": 0.052676135829178815,
"grad_norm": 0.78515625,
"learning_rate": 4.99330999024443e-05,
"loss": 0.92681618,
"memory(GiB)": 16.47,
"step": 70,
"train_speed(iter/s)": 0.181559
},
{
"acc": 0.71700387,
"epoch": 0.060201298090490074,
"grad_norm": 0.890625,
"learning_rate": 4.988110776637383e-05,
"loss": 0.92395344,
"memory(GiB)": 16.47,
"step": 80,
"train_speed(iter/s)": 0.182177
},
{
"acc": 0.72583823,
"epoch": 0.06772646035180134,
"grad_norm": 0.8125,
"learning_rate": 4.981431377344059e-05,
"loss": 0.9007925,
"memory(GiB)": 17.57,
"step": 90,
"train_speed(iter/s)": 0.182174
},
{
"acc": 0.72602968,
"epoch": 0.07525162261311259,
"grad_norm": 0.734375,
"learning_rate": 4.973275765962145e-05,
"loss": 0.91265221,
"memory(GiB)": 17.57,
"step": 100,
"train_speed(iter/s)": 0.182475
},
{
"epoch": 0.07525162261311259,
"eval_acc": 0.726610340283699,
"eval_loss": 0.9165924787521362,
"eval_runtime": 17.5418,
"eval_samples_per_second": 12.199,
"eval_steps_per_second": 12.199,
"step": 100
},
{
"acc": 0.73096576,
"epoch": 0.08277678487442386,
"grad_norm": 0.79296875,
"learning_rate": 4.963648794292992e-05,
"loss": 0.88530436,
"memory(GiB)": 17.57,
"step": 110,
"train_speed(iter/s)": 0.177125
},
{
"acc": 0.73312225,
"epoch": 0.09030194713573511,
"grad_norm": 0.7890625,
"learning_rate": 4.952556189455266e-05,
"loss": 0.87520084,
"memory(GiB)": 18.79,
"step": 120,
"train_speed(iter/s)": 0.177734
},
{
"acc": 0.7300118,
"epoch": 0.09782710939704638,
"grad_norm": 1.078125,
"learning_rate": 4.940004550477858e-05,
"loss": 0.89652071,
"memory(GiB)": 18.79,
"step": 130,
"train_speed(iter/s)": 0.178221
},
{
"acc": 0.72675986,
"epoch": 0.10535227165835763,
"grad_norm": 0.8359375,
"learning_rate": 4.9260013443741074e-05,
"loss": 0.88306637,
"memory(GiB)": 18.79,
"step": 140,
"train_speed(iter/s)": 0.178327
},
{
"acc": 0.7353128,
"epoch": 0.1128774339196689,
"grad_norm": 0.8203125,
"learning_rate": 4.9105549016996414e-05,
"loss": 0.87760782,
"memory(GiB)": 18.79,
"step": 150,
"train_speed(iter/s)": 0.178348
},
{
"acc": 0.73115945,
"epoch": 0.12040259618098015,
"grad_norm": 0.96484375,
"learning_rate": 4.893674411596507e-05,
"loss": 0.88493299,
"memory(GiB)": 18.79,
"step": 160,
"train_speed(iter/s)": 0.179132
},
{
"acc": 0.73725371,
"epoch": 0.1279277584422914,
"grad_norm": 1.078125,
"learning_rate": 4.875369916326512e-05,
"loss": 0.8738554,
"memory(GiB)": 18.79,
"step": 170,
"train_speed(iter/s)": 0.180388
},
{
"acc": 0.73314247,
"epoch": 0.13545292070360268,
"grad_norm": 0.8984375,
"learning_rate": 4.855652305297052e-05,
"loss": 0.87920074,
"memory(GiB)": 18.79,
"step": 180,
"train_speed(iter/s)": 0.181303
},
{
"acc": 0.73699288,
"epoch": 0.14297808296491393,
"grad_norm": 0.8828125,
"learning_rate": 4.834533308582965e-05,
"loss": 0.86675138,
"memory(GiB)": 18.79,
"step": 190,
"train_speed(iter/s)": 0.181264
},
{
"acc": 0.7354795,
"epoch": 0.15050324522622519,
"grad_norm": 0.796875,
"learning_rate": 4.8120254899482665e-05,
"loss": 0.85725517,
"memory(GiB)": 18.79,
"step": 200,
"train_speed(iter/s)": 0.182005
},
{
"epoch": 0.15050324522622519,
"eval_acc": 0.7330827067669173,
"eval_loss": 0.8884855508804321,
"eval_runtime": 17.6396,
"eval_samples_per_second": 12.132,
"eval_steps_per_second": 12.132,
"step": 200
},
{
"acc": 0.7260026,
"epoch": 0.15802840748753644,
"grad_norm": 1.0078125,
"learning_rate": 4.788142239371927e-05,
"loss": 0.88581524,
"memory(GiB)": 18.79,
"step": 210,
"train_speed(iter/s)": 0.179819
},
{
"acc": 0.73441463,
"epoch": 0.16555356974884772,
"grad_norm": 1.2109375,
"learning_rate": 4.762897765082124e-05,
"loss": 0.87213602,
"memory(GiB)": 18.79,
"step": 220,
"train_speed(iter/s)": 0.180055
},
{
"acc": 0.73429847,
"epoch": 0.17307873201015897,
"grad_norm": 0.859375,
"learning_rate": 4.7363070851037175e-05,
"loss": 0.85944548,
"memory(GiB)": 18.79,
"step": 230,
"train_speed(iter/s)": 0.179967
},
{
"acc": 0.73485651,
"epoch": 0.18060389427147022,
"grad_norm": 1.0703125,
"learning_rate": 4.708386018323979e-05,
"loss": 0.87450695,
"memory(GiB)": 18.79,
"step": 240,
"train_speed(iter/s)": 0.180087
},
{
"acc": 0.73376856,
"epoch": 0.18812905653278147,
"grad_norm": 1.1015625,
"learning_rate": 4.6791511750818784e-05,
"loss": 0.87443905,
"memory(GiB)": 18.79,
"step": 250,
"train_speed(iter/s)": 0.18021
},
{
"acc": 0.72789798,
"epoch": 0.19565421879409275,
"grad_norm": 1.03125,
"learning_rate": 4.6486199472865344e-05,
"loss": 0.89129868,
"memory(GiB)": 18.79,
"step": 260,
"train_speed(iter/s)": 0.180445
},
{
"acc": 0.73053126,
"epoch": 0.203179381055404,
"grad_norm": 1.28125,
"learning_rate": 4.6168104980707107e-05,
"loss": 0.85515051,
"memory(GiB)": 18.79,
"step": 270,
"train_speed(iter/s)": 0.180552
},
{
"acc": 0.74167385,
"epoch": 0.21070454331671526,
"grad_norm": 1.3046875,
"learning_rate": 4.583741750985505e-05,
"loss": 0.84416103,
"memory(GiB)": 18.79,
"step": 280,
"train_speed(iter/s)": 0.180675
},
{
"acc": 0.74637232,
"epoch": 0.21822970557802654,
"grad_norm": 1.03125,
"learning_rate": 4.5494333787426635e-05,
"loss": 0.82163248,
"memory(GiB)": 18.79,
"step": 290,
"train_speed(iter/s)": 0.180719
},
{
"acc": 0.73941607,
"epoch": 0.2257548678393378,
"grad_norm": 0.92578125,
"learning_rate": 4.5139057915112135e-05,
"loss": 0.87674456,
"memory(GiB)": 18.79,
"step": 300,
"train_speed(iter/s)": 0.180666
},
{
"epoch": 0.2257548678393378,
"eval_acc": 0.7357569180683667,
"eval_loss": 0.8706684112548828,
"eval_runtime": 17.5139,
"eval_samples_per_second": 12.219,
"eval_steps_per_second": 12.219,
"step": 300
},
{
"acc": 0.73469014,
"epoch": 0.23328003010064904,
"grad_norm": 1.0625,
"learning_rate": 4.477180124775388e-05,
"loss": 0.85682964,
"memory(GiB)": 18.79,
"step": 310,
"train_speed(iter/s)": 0.17887
},
{
"acc": 0.72672372,
"epoch": 0.2408051923619603,
"grad_norm": 1.0625,
"learning_rate": 4.43927822676105e-05,
"loss": 0.89327288,
"memory(GiB)": 18.79,
"step": 320,
"train_speed(iter/s)": 0.179041
},
{
"acc": 0.73567257,
"epoch": 0.24833035462327158,
"grad_norm": 1.3203125,
"learning_rate": 4.400222645438109e-05,
"loss": 0.85311451,
"memory(GiB)": 18.79,
"step": 330,
"train_speed(iter/s)": 0.179189
},
{
"acc": 0.7419868,
"epoch": 0.2558555168845828,
"grad_norm": 1.15625,
"learning_rate": 4.3600366151066575e-05,
"loss": 0.82233105,
"memory(GiB)": 18.79,
"step": 340,
"train_speed(iter/s)": 0.179204
},
{
"acc": 0.74832144,
"epoch": 0.2633806791458941,
"grad_norm": 1.0,
"learning_rate": 4.3187440425747994e-05,
"loss": 0.81996908,
"memory(GiB)": 18.79,
"step": 350,
"train_speed(iter/s)": 0.179326
},
{
"acc": 0.73445973,
"epoch": 0.27090584140720536,
"grad_norm": 1.2421875,
"learning_rate": 4.2763694929364166e-05,
"loss": 0.85199509,
"memory(GiB)": 18.79,
"step": 360,
"train_speed(iter/s)": 0.179521
},
{
"acc": 0.74308429,
"epoch": 0.2784310036685166,
"grad_norm": 1.1953125,
"learning_rate": 4.232938174957302e-05,
"loss": 0.83573999,
"memory(GiB)": 18.79,
"step": 370,
"train_speed(iter/s)": 0.179651
},
{
"acc": 0.7366713,
"epoch": 0.28595616592982787,
"grad_norm": 1.46875,
"learning_rate": 4.1884759260783816e-05,
"loss": 0.86369457,
"memory(GiB)": 18.79,
"step": 380,
"train_speed(iter/s)": 0.179604
},
{
"acc": 0.74553895,
"epoch": 0.29348132819113915,
"grad_norm": 1.21875,
"learning_rate": 4.143009197044932e-05,
"loss": 0.82978315,
"memory(GiB)": 19.6,
"step": 390,
"train_speed(iter/s)": 0.179568
},
{
"acc": 0.73931322,
"epoch": 0.30100649045245037,
"grad_norm": 1.265625,
"learning_rate": 4.0965650361709363e-05,
"loss": 0.85100927,
"memory(GiB)": 19.6,
"step": 400,
"train_speed(iter/s)": 0.17966
},
{
"epoch": 0.30100649045245037,
"eval_acc": 0.7413378807844353,
"eval_loss": 0.8540410399436951,
"eval_runtime": 17.4936,
"eval_samples_per_second": 12.233,
"eval_steps_per_second": 12.233,
"step": 400
},
{
"acc": 0.73913217,
"epoch": 0.30853165271376165,
"grad_norm": 1.0625,
"learning_rate": 4.0491710732479566e-05,
"loss": 0.83017845,
"memory(GiB)": 19.6,
"step": 410,
"train_speed(iter/s)": 0.178277
},
{
"acc": 0.73962994,
"epoch": 0.3160568149750729,
"grad_norm": 1.2265625,
"learning_rate": 4.000855503108069e-05,
"loss": 0.84346523,
"memory(GiB)": 19.6,
"step": 420,
"train_speed(iter/s)": 0.178475
},
{
"acc": 0.74960279,
"epoch": 0.32358197723638416,
"grad_norm": 1.3671875,
"learning_rate": 3.951647068850662e-05,
"loss": 0.79658604,
"memory(GiB)": 19.6,
"step": 430,
"train_speed(iter/s)": 0.17856
},
{
"acc": 0.74074645,
"epoch": 0.33110713949769544,
"grad_norm": 1.1796875,
"learning_rate": 3.901575044743072e-05,
"loss": 0.86585417,
"memory(GiB)": 19.6,
"step": 440,
"train_speed(iter/s)": 0.178589
},
{
"acc": 0.74740105,
"epoch": 0.33863230175900666,
"grad_norm": 1.0625,
"learning_rate": 3.8506692188052116e-05,
"loss": 0.8223978,
"memory(GiB)": 19.6,
"step": 450,
"train_speed(iter/s)": 0.178586
},
{
"acc": 0.75465593,
"epoch": 0.34615746402031794,
"grad_norm": 1.015625,
"learning_rate": 3.798959875088584e-05,
"loss": 0.80566111,
"memory(GiB)": 19.6,
"step": 460,
"train_speed(iter/s)": 0.178662
},
{
"acc": 0.74560366,
"epoch": 0.3536826262816292,
"grad_norm": 1.140625,
"learning_rate": 3.7464777756601905e-05,
"loss": 0.83098888,
"memory(GiB)": 19.6,
"step": 470,
"train_speed(iter/s)": 0.178743
},
{
"acc": 0.7393681,
"epoch": 0.36120778854294044,
"grad_norm": 1.671875,
"learning_rate": 3.693254142302071e-05,
"loss": 0.82851734,
"memory(GiB)": 19.6,
"step": 480,
"train_speed(iter/s)": 0.178887
},
{
"acc": 0.74919853,
"epoch": 0.3687329508042517,
"grad_norm": 1.15625,
"learning_rate": 3.639320637937357e-05,
"loss": 0.83529673,
"memory(GiB)": 19.6,
"step": 490,
"train_speed(iter/s)": 0.178857
},
{
"acc": 0.74647813,
"epoch": 0.37625811306556295,
"grad_norm": 1.1171875,
"learning_rate": 3.5847093477938956e-05,
"loss": 0.83272486,
"memory(GiB)": 19.6,
"step": 500,
"train_speed(iter/s)": 0.17888
},
{
"epoch": 0.37625811306556295,
"eval_acc": 0.7429656615766219,
"eval_loss": 0.8442411422729492,
"eval_runtime": 17.5096,
"eval_samples_per_second": 12.222,
"eval_steps_per_second": 12.222,
"step": 500
},
{
"acc": 0.73344955,
"epoch": 0.38378327532687423,
"grad_norm": 1.1875,
"learning_rate": 3.529452760316629e-05,
"loss": 0.85569019,
"memory(GiB)": 19.6,
"step": 510,
"train_speed(iter/s)": 0.177842
},
{
"acc": 0.73404479,
"epoch": 0.3913084375881855,
"grad_norm": 1.15625,
"learning_rate": 3.473583747840112e-05,
"loss": 0.87756214,
"memory(GiB)": 19.6,
"step": 520,
"train_speed(iter/s)": 0.178009
},
{
"acc": 0.74784298,
"epoch": 0.39883359984949673,
"grad_norm": 1.1953125,
"learning_rate": 3.4171355470326414e-05,
"loss": 0.81443148,
"memory(GiB)": 19.6,
"step": 530,
"train_speed(iter/s)": 0.177998
},
{
"acc": 0.73395772,
"epoch": 0.406358762110808,
"grad_norm": 1.1796875,
"learning_rate": 3.360141739123655e-05,
"loss": 0.87308407,
"memory(GiB)": 19.6,
"step": 540,
"train_speed(iter/s)": 0.178089
},
{
"acc": 0.74080787,
"epoch": 0.4138839243721193,
"grad_norm": 1.3046875,
"learning_rate": 3.302636229926135e-05,
"loss": 0.83921108,
"memory(GiB)": 19.6,
"step": 550,
"train_speed(iter/s)": 0.178188
},
{
"acc": 0.74777498,
"epoch": 0.4214090866334305,
"grad_norm": 1.125,
"learning_rate": 3.244653229665925e-05,
"loss": 0.80149298,
"memory(GiB)": 19.6,
"step": 560,
"train_speed(iter/s)": 0.178273
},
{
"acc": 0.74247198,
"epoch": 0.4289342488947418,
"grad_norm": 1.1796875,
"learning_rate": 3.1862272326299526e-05,
"loss": 0.84387398,
"memory(GiB)": 19.6,
"step": 570,
"train_speed(iter/s)": 0.178376
},
{
"acc": 0.73954334,
"epoch": 0.4364594111560531,
"grad_norm": 1.40625,
"learning_rate": 3.1273929966454535e-05,
"loss": 0.83296976,
"memory(GiB)": 19.6,
"step": 580,
"train_speed(iter/s)": 0.178516
},
{
"acc": 0.74422665,
"epoch": 0.4439845734173643,
"grad_norm": 1.265625,
"learning_rate": 3.0681855224024235e-05,
"loss": 0.8339119,
"memory(GiB)": 19.6,
"step": 590,
"train_speed(iter/s)": 0.178637
},
{
"acc": 0.73981266,
"epoch": 0.4515097356786756,
"grad_norm": 1.15625,
"learning_rate": 3.008640032631585e-05,
"loss": 0.83661747,
"memory(GiB)": 19.6,
"step": 600,
"train_speed(iter/s)": 0.178682
},
{
"epoch": 0.4515097356786756,
"eval_acc": 0.7431594450042632,
"eval_loss": 0.8382883667945862,
"eval_runtime": 17.5013,
"eval_samples_per_second": 12.228,
"eval_steps_per_second": 12.228,
"step": 600
},
{
"acc": 0.74818993,
"epoch": 0.4590348979399868,
"grad_norm": 1.3359375,
"learning_rate": 2.9487919511502653e-05,
"loss": 0.81010456,
"memory(GiB)": 19.6,
"step": 610,
"train_speed(iter/s)": 0.177788
},
{
"acc": 0.75267715,
"epoch": 0.4665600602012981,
"grad_norm": 1.3125,
"learning_rate": 2.888676881788645e-05,
"loss": 0.80105429,
"memory(GiB)": 19.6,
"step": 620,
"train_speed(iter/s)": 0.17777
},
{
"acc": 0.7422555,
"epoch": 0.47408522246260937,
"grad_norm": 1.1875,
"learning_rate": 2.8283305872089145e-05,
"loss": 0.8282918,
"memory(GiB)": 19.6,
"step": 630,
"train_speed(iter/s)": 0.177843
},
{
"acc": 0.74146862,
"epoch": 0.4816103847239206,
"grad_norm": 1.3046875,
"learning_rate": 2.767788967629944e-05,
"loss": 0.83825617,
"memory(GiB)": 19.6,
"step": 640,
"train_speed(iter/s)": 0.177979
},
{
"acc": 0.74322186,
"epoch": 0.4891355469852319,
"grad_norm": 1.2265625,
"learning_rate": 2.707088039470122e-05,
"loss": 0.82794752,
"memory(GiB)": 19.6,
"step": 650,
"train_speed(iter/s)": 0.178091
},
{
"acc": 0.74506578,
"epoch": 0.49666070924654315,
"grad_norm": 1.2421875,
"learning_rate": 2.64626391392106e-05,
"loss": 0.82765636,
"memory(GiB)": 19.6,
"step": 660,
"train_speed(iter/s)": 0.178174
},
{
"acc": 0.74580374,
"epoch": 0.5041858715078544,
"grad_norm": 1.078125,
"learning_rate": 2.5853527754649198e-05,
"loss": 0.82673864,
"memory(GiB)": 19.6,
"step": 670,
"train_speed(iter/s)": 0.178186
},
{
"acc": 0.74080172,
"epoch": 0.5117110337691656,
"grad_norm": 1.28125,
"learning_rate": 2.5243908603481453e-05,
"loss": 0.84204159,
"memory(GiB)": 19.6,
"step": 680,
"train_speed(iter/s)": 0.178249
},
{
"acc": 0.75322657,
"epoch": 0.5192361960304769,
"grad_norm": 1.1015625,
"learning_rate": 2.4634144350243894e-05,
"loss": 0.81114817,
"memory(GiB)": 19.6,
"step": 690,
"train_speed(iter/s)": 0.178267
},
{
"acc": 0.75419984,
"epoch": 0.5267613582917882,
"grad_norm": 1.1015625,
"learning_rate": 2.402459774579475e-05,
"loss": 0.79072003,
"memory(GiB)": 19.6,
"step": 700,
"train_speed(iter/s)": 0.178363
},
{
"epoch": 0.5267613582917882,
"eval_acc": 0.7448647391675064,
"eval_loss": 0.8292610049247742,
"eval_runtime": 17.5299,
"eval_samples_per_second": 12.208,
"eval_steps_per_second": 12.208,
"step": 700
},
{
"acc": 0.74434228,
"epoch": 0.5342865205530994,
"grad_norm": 1.2265625,
"learning_rate": 2.3415631411512283e-05,
"loss": 0.82065563,
"memory(GiB)": 19.6,
"step": 710,
"train_speed(iter/s)": 0.177614
},
{
"acc": 0.75825992,
"epoch": 0.5418116828144107,
"grad_norm": 1.234375,
"learning_rate": 2.2807607623569986e-05,
"loss": 0.78528976,
"memory(GiB)": 19.6,
"step": 720,
"train_speed(iter/s)": 0.17764
},
{
"acc": 0.73724399,
"epoch": 0.5493368450757219,
"grad_norm": 1.2265625,
"learning_rate": 2.2200888097417307e-05,
"loss": 0.83492756,
"memory(GiB)": 19.6,
"step": 730,
"train_speed(iter/s)": 0.177754
},
{
"acc": 0.74586935,
"epoch": 0.5568620073370332,
"grad_norm": 1.2734375,
"learning_rate": 2.159583377259384e-05,
"loss": 0.8267067,
"memory(GiB)": 19.6,
"step": 740,
"train_speed(iter/s)": 0.17778
},
{
"acc": 0.74503641,
"epoch": 0.5643871695983445,
"grad_norm": 1.2265625,
"learning_rate": 2.0992804598005174e-05,
"loss": 0.80896463,
"memory(GiB)": 19.6,
"step": 750,
"train_speed(iter/s)": 0.17788
},
{
"acc": 0.74723582,
"epoch": 0.5719123318596557,
"grad_norm": 1.3359375,
"learning_rate": 2.0392159317788028e-05,
"loss": 0.81317263,
"memory(GiB)": 19.6,
"step": 760,
"train_speed(iter/s)": 0.177963
},
{
"acc": 0.74401774,
"epoch": 0.579437494120967,
"grad_norm": 1.046875,
"learning_rate": 1.9794255257892125e-05,
"loss": 0.83063583,
"memory(GiB)": 19.6,
"step": 770,
"train_speed(iter/s)": 0.178029
},
{
"acc": 0.7539969,
"epoch": 0.5869626563822783,
"grad_norm": 1.2421875,
"learning_rate": 1.9199448113505802e-05,
"loss": 0.79329357,
"memory(GiB)": 19.6,
"step": 780,
"train_speed(iter/s)": 0.178108
},
{
"acc": 0.74454188,
"epoch": 0.5944878186435895,
"grad_norm": 1.2265625,
"learning_rate": 1.860809173745162e-05,
"loss": 0.83453617,
"memory(GiB)": 19.6,
"step": 790,
"train_speed(iter/s)": 0.178158
},
{
"acc": 0.74658017,
"epoch": 0.6020129809049007,
"grad_norm": 1.3359375,
"learning_rate": 1.802053792967819e-05,
"loss": 0.82045937,
"memory(GiB)": 19.6,
"step": 800,
"train_speed(iter/s)": 0.178154
},
{
"epoch": 0.6020129809049007,
"eval_acc": 0.7476164638400125,
"eval_loss": 0.8245773315429688,
"eval_runtime": 17.4656,
"eval_samples_per_second": 12.253,
"eval_steps_per_second": 12.253,
"step": 800
},
{
"acc": 0.74530048,
"epoch": 0.609538143166212,
"grad_norm": 1.609375,
"learning_rate": 1.743713622797311e-05,
"loss": 0.81913815,
"memory(GiB)": 19.6,
"step": 810,
"train_speed(iter/s)": 0.177468
},
{
"acc": 0.75947304,
"epoch": 0.6170633054275233,
"grad_norm": 1.2421875,
"learning_rate": 1.6858233700021754e-05,
"loss": 0.78629255,
"memory(GiB)": 19.6,
"step": 820,
"train_speed(iter/s)": 0.17748
},
{
"acc": 0.75599556,
"epoch": 0.6245884676888346,
"grad_norm": 1.34375,
"learning_rate": 1.628417473693552e-05,
"loss": 0.77964149,
"memory(GiB)": 19.6,
"step": 830,
"train_speed(iter/s)": 0.177495
},
{
"acc": 0.7474288,
"epoch": 0.6321136299501458,
"grad_norm": 1.359375,
"learning_rate": 1.571530084837234e-05,
"loss": 0.80840693,
"memory(GiB)": 19.6,
"step": 840,
"train_speed(iter/s)": 0.17758
},
{
"acc": 0.74778714,
"epoch": 0.639638792211457,
"grad_norm": 1.578125,
"learning_rate": 1.5151950459371417e-05,
"loss": 0.7930984,
"memory(GiB)": 19.6,
"step": 850,
"train_speed(iter/s)": 0.177866
},
{
"acc": 0.75338955,
"epoch": 0.6471639544727683,
"grad_norm": 1.328125,
"learning_rate": 1.4594458709023034e-05,
"loss": 0.78671389,
"memory(GiB)": 19.6,
"step": 860,
"train_speed(iter/s)": 0.178036
},
{
"acc": 0.76030412,
"epoch": 0.6546891167340796,
"grad_norm": 1.21875,
"learning_rate": 1.4043157251093097e-05,
"loss": 0.75907588,
"memory(GiB)": 19.6,
"step": 870,
"train_speed(iter/s)": 0.178081
},
{
"acc": 0.74864521,
"epoch": 0.6622142789953909,
"grad_norm": 1.1875,
"learning_rate": 1.3498374056721197e-05,
"loss": 0.82172766,
"memory(GiB)": 21.02,
"step": 880,
"train_speed(iter/s)": 0.178291
},
{
"acc": 0.75451798,
"epoch": 0.6697394412567022,
"grad_norm": 1.171875,
"learning_rate": 1.2960433219309453e-05,
"loss": 0.79079266,
"memory(GiB)": 21.02,
"step": 890,
"train_speed(iter/s)": 0.178342
},
{
"acc": 0.75273514,
"epoch": 0.6772646035180133,
"grad_norm": 1.4609375,
"learning_rate": 1.2429654761718206e-05,
"loss": 0.79605904,
"memory(GiB)": 21.02,
"step": 900,
"train_speed(iter/s)": 0.178448
},
{
"epoch": 0.6772646035180133,
"eval_acc": 0.7478102472676537,
"eval_loss": 0.8203556537628174,
"eval_runtime": 17.6833,
"eval_samples_per_second": 12.102,
"eval_steps_per_second": 12.102,
"step": 900
},
{
"acc": 0.75493579,
"epoch": 0.6847897657793246,
"grad_norm": 1.21875,
"learning_rate": 1.1906354445883342e-05,
"loss": 0.79625425,
"memory(GiB)": 21.02,
"step": 910,
"train_speed(iter/s)": 0.177828
},
{
"acc": 0.74433489,
"epoch": 0.6923149280406359,
"grad_norm": 1.3828125,
"learning_rate": 1.1390843584968398e-05,
"loss": 0.82548571,
"memory(GiB)": 21.02,
"step": 920,
"train_speed(iter/s)": 0.177868
},
{
"acc": 0.74827509,
"epoch": 0.6998400903019472,
"grad_norm": 1.21875,
"learning_rate": 1.08834288581633e-05,
"loss": 0.8204648,
"memory(GiB)": 21.02,
"step": 930,
"train_speed(iter/s)": 0.177889
},
{
"acc": 0.75613265,
"epoch": 0.7073652525632584,
"grad_norm": 1.21875,
"learning_rate": 1.0384412128239885e-05,
"loss": 0.77681332,
"memory(GiB)": 21.02,
"step": 940,
"train_speed(iter/s)": 0.177919
},
{
"acc": 0.75295358,
"epoch": 0.7148904148245696,
"grad_norm": 1.4609375,
"learning_rate": 9.894090261972639e-06,
"loss": 0.80203428,
"memory(GiB)": 21.02,
"step": 950,
"train_speed(iter/s)": 0.177945
},
{
"acc": 0.75752335,
"epoch": 0.7224155770858809,
"grad_norm": 1.4609375,
"learning_rate": 9.412754953531663e-06,
"loss": 0.77538629,
"memory(GiB)": 21.02,
"step": 960,
"train_speed(iter/s)": 0.177984
},
{
"acc": 0.75414553,
"epoch": 0.7299407393471922,
"grad_norm": 1.4765625,
"learning_rate": 8.940692550952806e-06,
"loss": 0.78667145,
"memory(GiB)": 21.02,
"step": 970,
"train_speed(iter/s)": 0.17801
},
{
"acc": 0.74900241,
"epoch": 0.7374659016085034,
"grad_norm": 1.3671875,
"learning_rate": 8.478183885788216e-06,
"loss": 0.80684509,
"memory(GiB)": 21.02,
"step": 980,
"train_speed(iter/s)": 0.178043
},
{
"acc": 0.75565805,
"epoch": 0.7449910638698147,
"grad_norm": 1.203125,
"learning_rate": 8.025504106038692e-06,
"loss": 0.80487137,
"memory(GiB)": 21.02,
"step": 990,
"train_speed(iter/s)": 0.17808
},
{
"acc": 0.74333644,
"epoch": 0.7525162261311259,
"grad_norm": 1.703125,
"learning_rate": 7.582922512467183e-06,
"loss": 0.81951437,
"memory(GiB)": 21.54,
"step": 1000,
"train_speed(iter/s)": 0.178069
},
{
"epoch": 0.7525162261311259,
"eval_acc": 0.7478490039531819,
"eval_loss": 0.8180410861968994,
"eval_runtime": 17.4781,
"eval_samples_per_second": 12.244,
"eval_steps_per_second": 12.244,
"step": 1000
},
{
"acc": 0.75819354,
"epoch": 0.7600413883924372,
"grad_norm": 1.4765625,
"learning_rate": 7.150702398390841e-06,
"loss": 0.78239226,
"memory(GiB)": 17.02,
"step": 1010,
"train_speed(iter/s)": 0.17756
},
{
"acc": 0.75477595,
"epoch": 0.7675665506537485,
"grad_norm": 1.6015625,
"learning_rate": 6.729100893046897e-06,
"loss": 0.77195005,
"memory(GiB)": 17.02,
"step": 1020,
"train_speed(iter/s)": 0.177596
},
{
"acc": 0.75055728,
"epoch": 0.7750917129150597,
"grad_norm": 1.1796875,
"learning_rate": 6.318368808625641e-06,
"loss": 0.79199243,
"memory(GiB)": 17.02,
"step": 1030,
"train_speed(iter/s)": 0.177596
},
{
"acc": 0.74901199,
"epoch": 0.782616875176371,
"grad_norm": 1.1875,
"learning_rate": 5.918750491061323e-06,
"loss": 0.8067667,
"memory(GiB)": 17.02,
"step": 1040,
"train_speed(iter/s)": 0.177656
},
{
"acc": 0.75432835,
"epoch": 0.7901420374376823,
"grad_norm": 1.34375,
"learning_rate": 5.530483674669948e-06,
"loss": 0.80282774,
"memory(GiB)": 17.02,
"step": 1050,
"train_speed(iter/s)": 0.177668
},
{
"acc": 0.75713019,
"epoch": 0.7976671996989935,
"grad_norm": 1.21875,
"learning_rate": 5.153799340720309e-06,
"loss": 0.78160086,
"memory(GiB)": 17.02,
"step": 1060,
"train_speed(iter/s)": 0.177697
},
{
"acc": 0.74733901,
"epoch": 0.8051923619603047,
"grad_norm": 1.5,
"learning_rate": 4.788921580022421e-06,
"loss": 0.80715675,
"memory(GiB)": 17.02,
"step": 1070,
"train_speed(iter/s)": 0.177759
},
{
"acc": 0.75728092,
"epoch": 0.812717524221616,
"grad_norm": 1.2734375,
"learning_rate": 4.436067459615145e-06,
"loss": 0.78215985,
"memory(GiB)": 17.02,
"step": 1080,
"train_speed(iter/s)": 0.177787
},
{
"acc": 0.75329976,
"epoch": 0.8202426864829273,
"grad_norm": 1.2578125,
"learning_rate": 4.095446893632235e-06,
"loss": 0.77923803,
"memory(GiB)": 17.02,
"step": 1090,
"train_speed(iter/s)": 0.177802
},
{
"acc": 0.74634914,
"epoch": 0.8277678487442386,
"grad_norm": 1.453125,
"learning_rate": 3.7672625184237034e-06,
"loss": 0.81888247,
"memory(GiB)": 17.02,
"step": 1100,
"train_speed(iter/s)": 0.177796
},
{
"epoch": 0.8277678487442386,
"eval_acc": 0.7483140841795209,
"eval_loss": 0.8171122670173645,
"eval_runtime": 17.5192,
"eval_samples_per_second": 12.215,
"eval_steps_per_second": 12.215,
"step": 1100
},
{
"acc": 0.7570456,
"epoch": 0.8352930110055498,
"grad_norm": 1.2421875,
"learning_rate": 3.4517095720067783e-06,
"loss": 0.78159804,
"memory(GiB)": 17.02,
"step": 1110,
"train_speed(iter/s)": 0.177278
},
{
"acc": 0.75265865,
"epoch": 0.842818173266861,
"grad_norm": 1.46875,
"learning_rate": 3.148975777918095e-06,
"loss": 0.78856063,
"memory(GiB)": 17.02,
"step": 1120,
"train_speed(iter/s)": 0.177277
},
{
"acc": 0.75768456,
"epoch": 0.8503433355281723,
"grad_norm": 1.2265625,
"learning_rate": 2.8592412335363472e-06,
"loss": 0.77000494,
"memory(GiB)": 17.02,
"step": 1130,
"train_speed(iter/s)": 0.177348
},
{
"acc": 0.75568204,
"epoch": 0.8578684977894836,
"grad_norm": 1.3828125,
"learning_rate": 2.5826783029417157e-06,
"loss": 0.78663826,
"memory(GiB)": 17.02,
"step": 1140,
"train_speed(iter/s)": 0.177395
},
{
"acc": 0.74540915,
"epoch": 0.8653936600507949,
"grad_norm": 1.484375,
"learning_rate": 2.3194515143758976e-06,
"loss": 0.82176847,
"memory(GiB)": 17.02,
"step": 1150,
"train_speed(iter/s)": 0.177432
},
{
"acc": 0.74521751,
"epoch": 0.8729188223121062,
"grad_norm": 1.2734375,
"learning_rate": 2.0697174623636794e-06,
"loss": 0.80748806,
"memory(GiB)": 17.02,
"step": 1160,
"train_speed(iter/s)": 0.177452
},
{
"acc": 0.74583597,
"epoch": 0.8804439845734173,
"grad_norm": 1.328125,
"learning_rate": 1.8336247145543079e-06,
"loss": 0.80123825,
"memory(GiB)": 17.02,
"step": 1170,
"train_speed(iter/s)": 0.177474
},
{
"acc": 0.75462961,
"epoch": 0.8879691468347286,
"grad_norm": 1.3828125,
"learning_rate": 1.6113137233380954e-06,
"loss": 0.78282847,
"memory(GiB)": 17.02,
"step": 1180,
"train_speed(iter/s)": 0.177502
},
{
"acc": 0.76421356,
"epoch": 0.8954943090960399,
"grad_norm": 1.40625,
"learning_rate": 1.4029167422908107e-06,
"loss": 0.77380075,
"memory(GiB)": 17.02,
"step": 1190,
"train_speed(iter/s)": 0.177542
},
{
"acc": 0.7530828,
"epoch": 0.9030194713573512,
"grad_norm": 1.21875,
"learning_rate": 1.2085577474955533e-06,
"loss": 0.78172884,
"memory(GiB)": 17.02,
"step": 1200,
"train_speed(iter/s)": 0.177574
},
{
"epoch": 0.9030194713573512,
"eval_acc": 0.7491279745756143,
"eval_loss": 0.8163484930992126,
"eval_runtime": 17.5046,
"eval_samples_per_second": 12.225,
"eval_steps_per_second": 12.225,
"step": 1200
},
{
"acc": 0.75201364,
"epoch": 0.9105446336186624,
"grad_norm": 1.4140625,
"learning_rate": 1.0283523637889592e-06,
"loss": 0.80502033,
"memory(GiB)": 17.02,
"step": 1210,
"train_speed(iter/s)": 0.17712
},
{
"acc": 0.74727998,
"epoch": 0.9180697958799736,
"grad_norm": 1.296875,
"learning_rate": 8.624077959756032e-07,
"loss": 0.79580932,
"memory(GiB)": 17.02,
"step": 1220,
"train_speed(iter/s)": 0.17714
},
{
"acc": 0.75626488,
"epoch": 0.9255949581412849,
"grad_norm": 1.1796875,
"learning_rate": 7.108227650514637e-07,
"loss": 0.80513897,
"memory(GiB)": 17.02,
"step": 1230,
"train_speed(iter/s)": 0.177153
},
{
"acc": 0.75931497,
"epoch": 0.9331201204025962,
"grad_norm": 1.078125,
"learning_rate": 5.736874494744887e-07,
"loss": 0.77948103,
"memory(GiB)": 17.02,
"step": 1240,
"train_speed(iter/s)": 0.177196
},
{
"acc": 0.74202876,
"epoch": 0.9406452826639075,
"grad_norm": 1.2734375,
"learning_rate": 4.5108343151710196e-07,
"loss": 0.82898502,
"memory(GiB)": 17.02,
"step": 1250,
"train_speed(iter/s)": 0.177234
},
{
"acc": 0.74493279,
"epoch": 0.9481704449252187,
"grad_norm": 1.3984375,
"learning_rate": 3.430836487326311e-07,
"loss": 0.83584261,
"memory(GiB)": 17.02,
"step": 1260,
"train_speed(iter/s)": 0.177249
},
{
"acc": 0.75014348,
"epoch": 0.9556956071865299,
"grad_norm": 1.4140625,
"learning_rate": 2.497523505645083e-07,
"loss": 0.80501699,
"memory(GiB)": 17.02,
"step": 1270,
"train_speed(iter/s)": 0.177308
},
{
"acc": 0.75359378,
"epoch": 0.9632207694478412,
"grad_norm": 1.359375,
"learning_rate": 1.7114506012405607e-07,
"loss": 0.80206327,
"memory(GiB)": 17.02,
"step": 1280,
"train_speed(iter/s)": 0.17736
},
{
"acc": 0.76561971,
"epoch": 0.9707459317091525,
"grad_norm": 1.2109375,
"learning_rate": 1.0730854115959532e-07,
"loss": 0.74447112,
"memory(GiB)": 17.02,
"step": 1290,
"train_speed(iter/s)": 0.177376
},
{
"acc": 0.74775319,
"epoch": 0.9782710939704637,
"grad_norm": 1.5625,
"learning_rate": 5.8280770236518456e-08,
"loss": 0.81718521,
"memory(GiB)": 17.02,
"step": 1300,
"train_speed(iter/s)": 0.177414
},
{
"epoch": 0.9782710939704637,
"eval_acc": 0.7494767847453686,
"eval_loss": 0.8163220882415771,
"eval_runtime": 17.5248,
"eval_samples_per_second": 12.211,
"eval_steps_per_second": 12.211,
"step": 1300
}
],
"logging_steps": 10,
"max_steps": 1328,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.720134513414052e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}