| { |
| "best_global_step": 175000, |
| "best_metric": 0.0006260189693421125, |
| "best_model_checkpoint": "/data/bozos/models/f8d245da3b0d0e66db4c97688fe67d8c31303d4f662c4b64e5da18eb8964c893/checkpoints/checkpoint-175000", |
| "epoch": 4.08, |
| "eval_steps": 5000, |
| "global_step": 255000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.016, |
| "grad_norm": 2.213568925857544, |
| "learning_rate": 0.00022758072642650628, |
| "loss": 1.732, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 0.6694766879081726, |
| "learning_rate": 0.00022757708507668012, |
| "loss": 0.4826, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 2.43005633354187, |
| "learning_rate": 0.00022757344372685396, |
| "loss": 0.1817, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 0.4847993850708008, |
| "learning_rate": 0.0002275698023770278, |
| "loss": 0.1264, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.162339448928833, |
| "learning_rate": 0.00022756616102720165, |
| "loss": 0.0816, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.08, |
| "eval_loss": 0.05317778140306473, |
| "eval_runtime": 27.3774, |
| "eval_samples_per_second": 36.526, |
| "eval_steps_per_second": 4.566, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 0.13133646547794342, |
| "learning_rate": 0.0002275625196773755, |
| "loss": 0.0531, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.112, |
| "grad_norm": 0.16350027918815613, |
| "learning_rate": 0.00022755887832754933, |
| "loss": 0.0471, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.128, |
| "grad_norm": 0.15041467547416687, |
| "learning_rate": 0.00022755523697772317, |
| "loss": 0.0303, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.144, |
| "grad_norm": 0.22431999444961548, |
| "learning_rate": 0.000227551595627897, |
| "loss": 0.0213, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.09702177345752716, |
| "learning_rate": 0.00022754795427807082, |
| "loss": 0.016, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_loss": 0.011971595697104931, |
| "eval_runtime": 27.2488, |
| "eval_samples_per_second": 36.699, |
| "eval_steps_per_second": 4.587, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.176, |
| "grad_norm": 0.22383971512317657, |
| "learning_rate": 0.0002275443129282447, |
| "loss": 0.0182, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.192, |
| "grad_norm": 0.10483340919017792, |
| "learning_rate": 0.0002275406715784185, |
| "loss": 0.0099, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.208, |
| "grad_norm": 0.06974471360445023, |
| "learning_rate": 0.00022753703022859238, |
| "loss": 0.0102, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.224, |
| "grad_norm": 0.08179257810115814, |
| "learning_rate": 0.0002275333888787662, |
| "loss": 0.0095, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.06491447985172272, |
| "learning_rate": 0.00022752974752894006, |
| "loss": 0.0108, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.24, |
| "eval_loss": 0.007058731280267239, |
| "eval_runtime": 27.4538, |
| "eval_samples_per_second": 36.425, |
| "eval_steps_per_second": 4.553, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.256, |
| "grad_norm": 0.07762598991394043, |
| "learning_rate": 0.00022752610617911387, |
| "loss": 0.0083, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.272, |
| "grad_norm": 0.09719238430261612, |
| "learning_rate": 0.00022752246482928774, |
| "loss": 0.0088, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.288, |
| "grad_norm": 0.0923658162355423, |
| "learning_rate": 0.00022751882347946155, |
| "loss": 0.0058, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.304, |
| "grad_norm": 0.07014696300029755, |
| "learning_rate": 0.00022751518212963542, |
| "loss": 0.0075, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.060413043946027756, |
| "learning_rate": 0.00022751154077980924, |
| "loss": 0.006, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_loss": 0.004977255128324032, |
| "eval_runtime": 27.1268, |
| "eval_samples_per_second": 36.864, |
| "eval_steps_per_second": 4.608, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.336, |
| "grad_norm": 0.07979925721883774, |
| "learning_rate": 0.0002275078994299831, |
| "loss": 0.0074, |
| "step": 21000 |
| }, |
| { |
| "epoch": 0.352, |
| "grad_norm": 0.07468965649604797, |
| "learning_rate": 0.00022750425808015692, |
| "loss": 0.0061, |
| "step": 22000 |
| }, |
| { |
| "epoch": 0.368, |
| "grad_norm": 0.11379896104335785, |
| "learning_rate": 0.00022750061673033079, |
| "loss": 0.0066, |
| "step": 23000 |
| }, |
| { |
| "epoch": 0.384, |
| "grad_norm": 0.14029712975025177, |
| "learning_rate": 0.0002274969753805046, |
| "loss": 0.0048, |
| "step": 24000 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.025649070739746094, |
| "learning_rate": 0.00022749333403067844, |
| "loss": 0.0062, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.4, |
| "eval_loss": 0.00377740734256804, |
| "eval_runtime": 27.2272, |
| "eval_samples_per_second": 36.728, |
| "eval_steps_per_second": 4.591, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.416, |
| "grad_norm": 0.07835651934146881, |
| "learning_rate": 0.00022748969268085228, |
| "loss": 0.0057, |
| "step": 26000 |
| }, |
| { |
| "epoch": 0.432, |
| "grad_norm": 0.037621937692165375, |
| "learning_rate": 0.00022748605133102612, |
| "loss": 0.0043, |
| "step": 27000 |
| }, |
| { |
| "epoch": 0.448, |
| "grad_norm": 0.05530184134840965, |
| "learning_rate": 0.00022748240998119996, |
| "loss": 0.0078, |
| "step": 28000 |
| }, |
| { |
| "epoch": 0.464, |
| "grad_norm": 0.2537539601325989, |
| "learning_rate": 0.0002274787686313738, |
| "loss": 0.004, |
| "step": 29000 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.08855901658535004, |
| "learning_rate": 0.00022747512728154765, |
| "loss": 0.0055, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.48, |
| "eval_loss": 0.00407881336286664, |
| "eval_runtime": 27.3314, |
| "eval_samples_per_second": 36.588, |
| "eval_steps_per_second": 4.573, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.496, |
| "grad_norm": 0.01860993541777134, |
| "learning_rate": 0.0002274714859317215, |
| "loss": 0.0044, |
| "step": 31000 |
| }, |
| { |
| "epoch": 0.512, |
| "grad_norm": 0.030549678951501846, |
| "learning_rate": 0.00022746784458189533, |
| "loss": 0.0062, |
| "step": 32000 |
| }, |
| { |
| "epoch": 0.528, |
| "grad_norm": 0.07974190264940262, |
| "learning_rate": 0.00022746420323206917, |
| "loss": 0.0044, |
| "step": 33000 |
| }, |
| { |
| "epoch": 0.544, |
| "grad_norm": 0.07146530598402023, |
| "learning_rate": 0.000227460561882243, |
| "loss": 0.0033, |
| "step": 34000 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.03786474093794823, |
| "learning_rate": 0.00022745692053241685, |
| "loss": 0.0064, |
| "step": 35000 |
| }, |
| { |
| "epoch": 0.56, |
| "eval_loss": 0.002913910197094083, |
| "eval_runtime": 27.2895, |
| "eval_samples_per_second": 36.644, |
| "eval_steps_per_second": 4.581, |
| "step": 35000 |
| }, |
| { |
| "epoch": 0.576, |
| "grad_norm": 1.5708693265914917, |
| "learning_rate": 0.0002274532791825907, |
| "loss": 0.0048, |
| "step": 36000 |
| }, |
| { |
| "epoch": 0.592, |
| "grad_norm": 0.04259568825364113, |
| "learning_rate": 0.0002274496378327645, |
| "loss": 0.0027, |
| "step": 37000 |
| }, |
| { |
| "epoch": 0.608, |
| "grad_norm": 0.029481125995516777, |
| "learning_rate": 0.00022744599648293838, |
| "loss": 0.0049, |
| "step": 38000 |
| }, |
| { |
| "epoch": 0.624, |
| "grad_norm": 0.3993789553642273, |
| "learning_rate": 0.0002274423551331122, |
| "loss": 0.0048, |
| "step": 39000 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.03810903802514076, |
| "learning_rate": 0.00022743871378328606, |
| "loss": 0.0027, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.64, |
| "eval_loss": 0.00209135003387928, |
| "eval_runtime": 27.0955, |
| "eval_samples_per_second": 36.906, |
| "eval_steps_per_second": 4.613, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.656, |
| "grad_norm": 0.027073705568909645, |
| "learning_rate": 0.00022743507243345987, |
| "loss": 0.0033, |
| "step": 41000 |
| }, |
| { |
| "epoch": 0.672, |
| "grad_norm": 0.04906334728002548, |
| "learning_rate": 0.00022743143108363374, |
| "loss": 0.0033, |
| "step": 42000 |
| }, |
| { |
| "epoch": 0.688, |
| "grad_norm": 0.05806988850235939, |
| "learning_rate": 0.00022742778973380755, |
| "loss": 0.0039, |
| "step": 43000 |
| }, |
| { |
| "epoch": 0.704, |
| "grad_norm": 0.022845715284347534, |
| "learning_rate": 0.00022742414838398142, |
| "loss": 0.0031, |
| "step": 44000 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 0.06443994492292404, |
| "learning_rate": 0.00022742050703415524, |
| "loss": 0.0027, |
| "step": 45000 |
| }, |
| { |
| "epoch": 0.72, |
| "eval_loss": 0.002537691965699196, |
| "eval_runtime": 27.3857, |
| "eval_samples_per_second": 36.515, |
| "eval_steps_per_second": 4.564, |
| "step": 45000 |
| }, |
| { |
| "epoch": 0.736, |
| "grad_norm": 0.1143941730260849, |
| "learning_rate": 0.0002274168656843291, |
| "loss": 0.0034, |
| "step": 46000 |
| }, |
| { |
| "epoch": 0.752, |
| "grad_norm": 0.04524613544344902, |
| "learning_rate": 0.00022741322433450292, |
| "loss": 0.0034, |
| "step": 47000 |
| }, |
| { |
| "epoch": 0.768, |
| "grad_norm": 0.027965977787971497, |
| "learning_rate": 0.00022740958298467676, |
| "loss": 0.0031, |
| "step": 48000 |
| }, |
| { |
| "epoch": 0.784, |
| "grad_norm": 0.033201005309820175, |
| "learning_rate": 0.0002274059416348506, |
| "loss": 0.0032, |
| "step": 49000 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.11329031735658646, |
| "learning_rate": 0.00022740230028502444, |
| "loss": 0.0034, |
| "step": 50000 |
| }, |
| { |
| "epoch": 0.8, |
| "eval_loss": 0.00293481582775712, |
| "eval_runtime": 27.0954, |
| "eval_samples_per_second": 36.907, |
| "eval_steps_per_second": 4.613, |
| "step": 50000 |
| }, |
| { |
| "epoch": 0.816, |
| "grad_norm": 0.08998037129640579, |
| "learning_rate": 0.00022739865893519828, |
| "loss": 0.003, |
| "step": 51000 |
| }, |
| { |
| "epoch": 0.832, |
| "grad_norm": 0.034506551921367645, |
| "learning_rate": 0.00022739501758537212, |
| "loss": 0.0039, |
| "step": 52000 |
| }, |
| { |
| "epoch": 0.848, |
| "grad_norm": 0.10205531865358353, |
| "learning_rate": 0.00022739137623554596, |
| "loss": 0.0031, |
| "step": 53000 |
| }, |
| { |
| "epoch": 0.864, |
| "grad_norm": 0.016757190227508545, |
| "learning_rate": 0.0002273877348857198, |
| "loss": 0.0024, |
| "step": 54000 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 0.038212958723306656, |
| "learning_rate": 0.00022738409353589365, |
| "loss": 0.0035, |
| "step": 55000 |
| }, |
| { |
| "epoch": 0.88, |
| "eval_loss": 0.003101126756519079, |
| "eval_runtime": 27.2079, |
| "eval_samples_per_second": 36.754, |
| "eval_steps_per_second": 4.594, |
| "step": 55000 |
| }, |
| { |
| "epoch": 0.896, |
| "grad_norm": 0.20447635650634766, |
| "learning_rate": 0.0002273804521860675, |
| "loss": 0.0029, |
| "step": 56000 |
| }, |
| { |
| "epoch": 0.912, |
| "grad_norm": 0.029786735773086548, |
| "learning_rate": 0.00022737681083624133, |
| "loss": 0.0028, |
| "step": 57000 |
| }, |
| { |
| "epoch": 0.928, |
| "grad_norm": 0.1717972755432129, |
| "learning_rate": 0.00022737316948641517, |
| "loss": 0.0035, |
| "step": 58000 |
| }, |
| { |
| "epoch": 0.944, |
| "grad_norm": 0.051670778542757034, |
| "learning_rate": 0.000227369528136589, |
| "loss": 0.0026, |
| "step": 59000 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 0.18315136432647705, |
| "learning_rate": 0.00022736588678676285, |
| "loss": 0.0021, |
| "step": 60000 |
| }, |
| { |
| "epoch": 0.96, |
| "eval_loss": 0.0022276523523032665, |
| "eval_runtime": 27.2912, |
| "eval_samples_per_second": 36.642, |
| "eval_steps_per_second": 4.58, |
| "step": 60000 |
| }, |
| { |
| "epoch": 0.976, |
| "grad_norm": 0.2137993574142456, |
| "learning_rate": 0.0002273622454369367, |
| "loss": 0.0031, |
| "step": 61000 |
| }, |
| { |
| "epoch": 0.992, |
| "grad_norm": 0.02584846317768097, |
| "learning_rate": 0.00022735860408711053, |
| "loss": 0.0033, |
| "step": 62000 |
| }, |
| { |
| "epoch": 1.008, |
| "grad_norm": 0.054690442979335785, |
| "learning_rate": 0.00022735496273728438, |
| "loss": 0.0024, |
| "step": 63000 |
| }, |
| { |
| "epoch": 1.024, |
| "grad_norm": 0.01702144928276539, |
| "learning_rate": 0.00022735132138745822, |
| "loss": 0.0032, |
| "step": 64000 |
| }, |
| { |
| "epoch": 1.04, |
| "grad_norm": 0.02373000793159008, |
| "learning_rate": 0.00022734768003763206, |
| "loss": 0.0032, |
| "step": 65000 |
| }, |
| { |
| "epoch": 1.04, |
| "eval_loss": 0.0020411296281963587, |
| "eval_runtime": 27.2492, |
| "eval_samples_per_second": 36.698, |
| "eval_steps_per_second": 4.587, |
| "step": 65000 |
| }, |
| { |
| "epoch": 1.056, |
| "grad_norm": 0.012987160123884678, |
| "learning_rate": 0.0002273440386878059, |
| "loss": 0.002, |
| "step": 66000 |
| }, |
| { |
| "epoch": 1.072, |
| "grad_norm": 0.029065946117043495, |
| "learning_rate": 0.0002273403973379797, |
| "loss": 0.0031, |
| "step": 67000 |
| }, |
| { |
| "epoch": 1.088, |
| "grad_norm": 0.17107020318508148, |
| "learning_rate": 0.00022733675598815358, |
| "loss": 0.0022, |
| "step": 68000 |
| }, |
| { |
| "epoch": 1.104, |
| "grad_norm": 0.019081389531493187, |
| "learning_rate": 0.0002273331146383274, |
| "loss": 0.0031, |
| "step": 69000 |
| }, |
| { |
| "epoch": 1.12, |
| "grad_norm": 0.008192900568246841, |
| "learning_rate": 0.00022732947328850126, |
| "loss": 0.0022, |
| "step": 70000 |
| }, |
| { |
| "epoch": 1.12, |
| "eval_loss": 0.0018265106482431293, |
| "eval_runtime": 27.5029, |
| "eval_samples_per_second": 36.36, |
| "eval_steps_per_second": 4.545, |
| "step": 70000 |
| }, |
| { |
| "epoch": 1.1360000000000001, |
| "grad_norm": 0.009845556691288948, |
| "learning_rate": 0.00022732583193867508, |
| "loss": 0.0033, |
| "step": 71000 |
| }, |
| { |
| "epoch": 1.152, |
| "grad_norm": 0.1637999713420868, |
| "learning_rate": 0.00022732219058884895, |
| "loss": 0.0024, |
| "step": 72000 |
| }, |
| { |
| "epoch": 1.168, |
| "grad_norm": 0.03215477615594864, |
| "learning_rate": 0.00022731854923902276, |
| "loss": 0.0026, |
| "step": 73000 |
| }, |
| { |
| "epoch": 1.184, |
| "grad_norm": 0.028977178037166595, |
| "learning_rate": 0.00022731490788919663, |
| "loss": 0.0023, |
| "step": 74000 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 0.057766951620578766, |
| "learning_rate": 0.00022731126653937044, |
| "loss": 0.0034, |
| "step": 75000 |
| }, |
| { |
| "epoch": 1.2, |
| "eval_loss": 0.0021767348516732454, |
| "eval_runtime": 27.526, |
| "eval_samples_per_second": 36.329, |
| "eval_steps_per_second": 4.541, |
| "step": 75000 |
| }, |
| { |
| "epoch": 1.216, |
| "grad_norm": 0.00946386530995369, |
| "learning_rate": 0.0002273076251895443, |
| "loss": 0.0021, |
| "step": 76000 |
| }, |
| { |
| "epoch": 1.232, |
| "grad_norm": 0.12553413212299347, |
| "learning_rate": 0.00022730398383971812, |
| "loss": 0.0019, |
| "step": 77000 |
| }, |
| { |
| "epoch": 1.248, |
| "grad_norm": 0.0369916595518589, |
| "learning_rate": 0.000227300342489892, |
| "loss": 0.003, |
| "step": 78000 |
| }, |
| { |
| "epoch": 1.264, |
| "grad_norm": 0.19732122123241425, |
| "learning_rate": 0.0002272967011400658, |
| "loss": 0.0024, |
| "step": 79000 |
| }, |
| { |
| "epoch": 1.28, |
| "grad_norm": 0.02471228875219822, |
| "learning_rate": 0.00022729305979023967, |
| "loss": 0.0024, |
| "step": 80000 |
| }, |
| { |
| "epoch": 1.28, |
| "eval_loss": 0.002593559678643942, |
| "eval_runtime": 27.2923, |
| "eval_samples_per_second": 36.64, |
| "eval_steps_per_second": 4.58, |
| "step": 80000 |
| }, |
| { |
| "epoch": 1.296, |
| "grad_norm": 0.5299795269966125, |
| "learning_rate": 0.0002272894184404135, |
| "loss": 0.0019, |
| "step": 81000 |
| }, |
| { |
| "epoch": 1.312, |
| "grad_norm": 0.03472663834691048, |
| "learning_rate": 0.00022728577709058736, |
| "loss": 0.003, |
| "step": 82000 |
| }, |
| { |
| "epoch": 1.328, |
| "grad_norm": 0.09357739239931107, |
| "learning_rate": 0.00022728213574076117, |
| "loss": 0.0022, |
| "step": 83000 |
| }, |
| { |
| "epoch": 1.3439999999999999, |
| "grad_norm": 0.01810472272336483, |
| "learning_rate": 0.00022727849439093504, |
| "loss": 0.0019, |
| "step": 84000 |
| }, |
| { |
| "epoch": 1.3599999999999999, |
| "grad_norm": 0.024920647963881493, |
| "learning_rate": 0.00022727485304110885, |
| "loss": 0.0021, |
| "step": 85000 |
| }, |
| { |
| "epoch": 1.3599999999999999, |
| "eval_loss": 0.0022232765331864357, |
| "eval_runtime": 27.3645, |
| "eval_samples_per_second": 36.544, |
| "eval_steps_per_second": 4.568, |
| "step": 85000 |
| }, |
| { |
| "epoch": 1.376, |
| "grad_norm": 0.03085111826658249, |
| "learning_rate": 0.00022727121169128272, |
| "loss": 0.0023, |
| "step": 86000 |
| }, |
| { |
| "epoch": 1.392, |
| "grad_norm": 0.010742255486547947, |
| "learning_rate": 0.00022726757034145654, |
| "loss": 0.0019, |
| "step": 87000 |
| }, |
| { |
| "epoch": 1.408, |
| "grad_norm": 0.03559265285730362, |
| "learning_rate": 0.00022726392899163038, |
| "loss": 0.0022, |
| "step": 88000 |
| }, |
| { |
| "epoch": 1.424, |
| "grad_norm": 0.0898701399564743, |
| "learning_rate": 0.00022726028764180422, |
| "loss": 0.0028, |
| "step": 89000 |
| }, |
| { |
| "epoch": 1.44, |
| "grad_norm": 0.026589710265398026, |
| "learning_rate": 0.00022725664629197806, |
| "loss": 0.0016, |
| "step": 90000 |
| }, |
| { |
| "epoch": 1.44, |
| "eval_loss": 0.00150102109182626, |
| "eval_runtime": 27.6944, |
| "eval_samples_per_second": 36.108, |
| "eval_steps_per_second": 4.514, |
| "step": 90000 |
| }, |
| { |
| "epoch": 1.456, |
| "grad_norm": 0.016303159296512604, |
| "learning_rate": 0.0002272530049421519, |
| "loss": 0.0024, |
| "step": 91000 |
| }, |
| { |
| "epoch": 1.472, |
| "grad_norm": 0.01823027804493904, |
| "learning_rate": 0.00022724936359232574, |
| "loss": 0.0018, |
| "step": 92000 |
| }, |
| { |
| "epoch": 1.488, |
| "grad_norm": 0.15236489474773407, |
| "learning_rate": 0.00022724572224249958, |
| "loss": 0.0024, |
| "step": 93000 |
| }, |
| { |
| "epoch": 1.504, |
| "grad_norm": 0.03902558609843254, |
| "learning_rate": 0.00022724208089267342, |
| "loss": 0.0021, |
| "step": 94000 |
| }, |
| { |
| "epoch": 1.52, |
| "grad_norm": 0.020767396315932274, |
| "learning_rate": 0.00022723843954284726, |
| "loss": 0.002, |
| "step": 95000 |
| }, |
| { |
| "epoch": 1.52, |
| "eval_loss": 0.001406910945661366, |
| "eval_runtime": 27.6364, |
| "eval_samples_per_second": 36.184, |
| "eval_steps_per_second": 4.523, |
| "step": 95000 |
| }, |
| { |
| "epoch": 1.536, |
| "grad_norm": 0.09269700944423676, |
| "learning_rate": 0.0002272347981930211, |
| "loss": 0.0023, |
| "step": 96000 |
| }, |
| { |
| "epoch": 1.552, |
| "grad_norm": 0.04058321192860603, |
| "learning_rate": 0.00022723115684319495, |
| "loss": 0.0019, |
| "step": 97000 |
| }, |
| { |
| "epoch": 1.568, |
| "grad_norm": 0.04894057661294937, |
| "learning_rate": 0.0002272275154933688, |
| "loss": 0.0018, |
| "step": 98000 |
| }, |
| { |
| "epoch": 1.584, |
| "grad_norm": 0.04043205827474594, |
| "learning_rate": 0.00022722387414354263, |
| "loss": 0.0022, |
| "step": 99000 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 0.1002797931432724, |
| "learning_rate": 0.00022722023279371647, |
| "loss": 0.002, |
| "step": 100000 |
| }, |
| { |
| "epoch": 1.6, |
| "eval_loss": 0.0017908032750710845, |
| "eval_runtime": 27.5843, |
| "eval_samples_per_second": 36.253, |
| "eval_steps_per_second": 4.532, |
| "step": 100000 |
| }, |
| { |
| "epoch": 1.616, |
| "grad_norm": 0.02161436155438423, |
| "learning_rate": 0.0002272165914438903, |
| "loss": 0.0018, |
| "step": 101000 |
| }, |
| { |
| "epoch": 1.6320000000000001, |
| "grad_norm": 0.010246573947370052, |
| "learning_rate": 0.00022721295009406415, |
| "loss": 0.002, |
| "step": 102000 |
| }, |
| { |
| "epoch": 1.6480000000000001, |
| "grad_norm": 0.06802576035261154, |
| "learning_rate": 0.000227209308744238, |
| "loss": 0.0015, |
| "step": 103000 |
| }, |
| { |
| "epoch": 1.6640000000000001, |
| "grad_norm": 0.013391965068876743, |
| "learning_rate": 0.00022720566739441183, |
| "loss": 0.0025, |
| "step": 104000 |
| }, |
| { |
| "epoch": 1.6800000000000002, |
| "grad_norm": 0.10946637392044067, |
| "learning_rate": 0.00022720202604458568, |
| "loss": 0.0018, |
| "step": 105000 |
| }, |
| { |
| "epoch": 1.6800000000000002, |
| "eval_loss": 0.0015562042826786637, |
| "eval_runtime": 27.7549, |
| "eval_samples_per_second": 36.03, |
| "eval_steps_per_second": 4.504, |
| "step": 105000 |
| }, |
| { |
| "epoch": 1.696, |
| "grad_norm": 0.028942033648490906, |
| "learning_rate": 0.00022719838469475952, |
| "loss": 0.002, |
| "step": 106000 |
| }, |
| { |
| "epoch": 1.712, |
| "grad_norm": 0.023039843887090683, |
| "learning_rate": 0.00022719474334493333, |
| "loss": 0.0014, |
| "step": 107000 |
| }, |
| { |
| "epoch": 1.728, |
| "grad_norm": 0.010488491505384445, |
| "learning_rate": 0.0002271911019951072, |
| "loss": 0.0016, |
| "step": 108000 |
| }, |
| { |
| "epoch": 1.744, |
| "grad_norm": 0.019485417753458023, |
| "learning_rate": 0.000227187460645281, |
| "loss": 0.002, |
| "step": 109000 |
| }, |
| { |
| "epoch": 1.76, |
| "grad_norm": 0.010597913525998592, |
| "learning_rate": 0.00022718381929545488, |
| "loss": 0.002, |
| "step": 110000 |
| }, |
| { |
| "epoch": 1.76, |
| "eval_loss": 0.000878525257576257, |
| "eval_runtime": 27.823, |
| "eval_samples_per_second": 35.941, |
| "eval_steps_per_second": 4.493, |
| "step": 110000 |
| }, |
| { |
| "epoch": 1.776, |
| "grad_norm": 0.02870281971991062, |
| "learning_rate": 0.0002271801779456287, |
| "loss": 0.0023, |
| "step": 111000 |
| }, |
| { |
| "epoch": 1.792, |
| "grad_norm": 0.041255537420511246, |
| "learning_rate": 0.00022717653659580256, |
| "loss": 0.0014, |
| "step": 112000 |
| }, |
| { |
| "epoch": 1.808, |
| "grad_norm": 0.04701690748333931, |
| "learning_rate": 0.00022717289524597638, |
| "loss": 0.0015, |
| "step": 113000 |
| }, |
| { |
| "epoch": 1.8239999999999998, |
| "grad_norm": 0.059342917054891586, |
| "learning_rate": 0.00022716925389615024, |
| "loss": 0.0028, |
| "step": 114000 |
| }, |
| { |
| "epoch": 1.8399999999999999, |
| "grad_norm": 0.040327928960323334, |
| "learning_rate": 0.00022716561254632406, |
| "loss": 0.0014, |
| "step": 115000 |
| }, |
| { |
| "epoch": 1.8399999999999999, |
| "eval_loss": 0.0016007705125957727, |
| "eval_runtime": 27.7178, |
| "eval_samples_per_second": 36.078, |
| "eval_steps_per_second": 4.51, |
| "step": 115000 |
| }, |
| { |
| "epoch": 1.8559999999999999, |
| "grad_norm": 0.018858684226870537, |
| "learning_rate": 0.00022716197119649793, |
| "loss": 0.002, |
| "step": 116000 |
| }, |
| { |
| "epoch": 1.8719999999999999, |
| "grad_norm": 0.026660999283194542, |
| "learning_rate": 0.00022715832984667174, |
| "loss": 0.0019, |
| "step": 117000 |
| }, |
| { |
| "epoch": 1.888, |
| "grad_norm": 0.08471547812223434, |
| "learning_rate": 0.0002271546884968456, |
| "loss": 0.0016, |
| "step": 118000 |
| }, |
| { |
| "epoch": 1.904, |
| "grad_norm": 0.03236541524529457, |
| "learning_rate": 0.00022715104714701942, |
| "loss": 0.0014, |
| "step": 119000 |
| }, |
| { |
| "epoch": 1.92, |
| "grad_norm": 0.015728328377008438, |
| "learning_rate": 0.0002271474057971933, |
| "loss": 0.0022, |
| "step": 120000 |
| }, |
| { |
| "epoch": 1.92, |
| "eval_loss": 0.0017823727102950215, |
| "eval_runtime": 27.6601, |
| "eval_samples_per_second": 36.153, |
| "eval_steps_per_second": 4.519, |
| "step": 120000 |
| }, |
| { |
| "epoch": 1.936, |
| "grad_norm": 0.2575147747993469, |
| "learning_rate": 0.0002271437644473671, |
| "loss": 0.0015, |
| "step": 121000 |
| }, |
| { |
| "epoch": 1.952, |
| "grad_norm": 0.03020591102540493, |
| "learning_rate": 0.00022714012309754097, |
| "loss": 0.0015, |
| "step": 122000 |
| }, |
| { |
| "epoch": 1.968, |
| "grad_norm": 0.011387010104954243, |
| "learning_rate": 0.0002271364817477148, |
| "loss": 0.0015, |
| "step": 123000 |
| }, |
| { |
| "epoch": 1.984, |
| "grad_norm": 0.033326998353004456, |
| "learning_rate": 0.00022713284039788866, |
| "loss": 0.0019, |
| "step": 124000 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.234897643327713, |
| "learning_rate": 0.00022712919904806247, |
| "loss": 0.0029, |
| "step": 125000 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.00910487212240696, |
| "eval_runtime": 27.7906, |
| "eval_samples_per_second": 35.983, |
| "eval_steps_per_second": 4.498, |
| "step": 125000 |
| }, |
| { |
| "epoch": 2.016, |
| "grad_norm": 0.05067163705825806, |
| "learning_rate": 0.0002271255576982363, |
| "loss": 0.0019, |
| "step": 126000 |
| }, |
| { |
| "epoch": 2.032, |
| "grad_norm": 0.015078851021826267, |
| "learning_rate": 0.00022712191634841015, |
| "loss": 0.0012, |
| "step": 127000 |
| }, |
| { |
| "epoch": 2.048, |
| "grad_norm": 0.03365013748407364, |
| "learning_rate": 0.000227118274998584, |
| "loss": 0.0018, |
| "step": 128000 |
| }, |
| { |
| "epoch": 2.064, |
| "grad_norm": 0.00802704505622387, |
| "learning_rate": 0.00022711463364875783, |
| "loss": 0.0013, |
| "step": 129000 |
| }, |
| { |
| "epoch": 2.08, |
| "grad_norm": 0.011523068882524967, |
| "learning_rate": 0.00022711099229893168, |
| "loss": 0.0021, |
| "step": 130000 |
| }, |
| { |
| "epoch": 2.08, |
| "eval_loss": 0.0009301243117079139, |
| "eval_runtime": 27.505, |
| "eval_samples_per_second": 36.357, |
| "eval_steps_per_second": 4.545, |
| "step": 130000 |
| }, |
| { |
| "epoch": 2.096, |
| "grad_norm": 0.012680677697062492, |
| "learning_rate": 0.00022710735094910552, |
| "loss": 0.0014, |
| "step": 131000 |
| }, |
| { |
| "epoch": 2.112, |
| "grad_norm": 0.0508689247071743, |
| "learning_rate": 0.00022710370959927936, |
| "loss": 0.002, |
| "step": 132000 |
| }, |
| { |
| "epoch": 2.128, |
| "grad_norm": 0.014830244705080986, |
| "learning_rate": 0.0002271000682494532, |
| "loss": 0.001, |
| "step": 133000 |
| }, |
| { |
| "epoch": 2.144, |
| "grad_norm": 0.028912167996168137, |
| "learning_rate": 0.00022709642689962704, |
| "loss": 0.0019, |
| "step": 134000 |
| }, |
| { |
| "epoch": 2.16, |
| "grad_norm": 0.06254349648952484, |
| "learning_rate": 0.00022709278554980088, |
| "loss": 0.0012, |
| "step": 135000 |
| }, |
| { |
| "epoch": 2.16, |
| "eval_loss": 0.0014802517835050821, |
| "eval_runtime": 27.695, |
| "eval_samples_per_second": 36.108, |
| "eval_steps_per_second": 4.513, |
| "step": 135000 |
| }, |
| { |
| "epoch": 2.176, |
| "grad_norm": 0.01877821609377861, |
| "learning_rate": 0.00022708914419997472, |
| "loss": 0.0015, |
| "step": 136000 |
| }, |
| { |
| "epoch": 2.192, |
| "grad_norm": 0.18786460161209106, |
| "learning_rate": 0.00022708550285014856, |
| "loss": 0.0018, |
| "step": 137000 |
| }, |
| { |
| "epoch": 2.208, |
| "grad_norm": 0.016280388459563255, |
| "learning_rate": 0.0002270818615003224, |
| "loss": 0.0015, |
| "step": 138000 |
| }, |
| { |
| "epoch": 2.224, |
| "grad_norm": 0.009028231725096703, |
| "learning_rate": 0.00022707822015049625, |
| "loss": 0.0022, |
| "step": 139000 |
| }, |
| { |
| "epoch": 2.24, |
| "grad_norm": 0.02473852038383484, |
| "learning_rate": 0.0002270745788006701, |
| "loss": 0.0011, |
| "step": 140000 |
| }, |
| { |
| "epoch": 2.24, |
| "eval_loss": 0.0011171329533681273, |
| "eval_runtime": 27.6717, |
| "eval_samples_per_second": 36.138, |
| "eval_steps_per_second": 4.517, |
| "step": 140000 |
| }, |
| { |
| "epoch": 2.2560000000000002, |
| "grad_norm": 0.015900999307632446, |
| "learning_rate": 0.00022707093745084393, |
| "loss": 0.0015, |
| "step": 141000 |
| }, |
| { |
| "epoch": 2.2720000000000002, |
| "grad_norm": 0.018436668440699577, |
| "learning_rate": 0.00022706729610101774, |
| "loss": 0.0015, |
| "step": 142000 |
| }, |
| { |
| "epoch": 2.288, |
| "grad_norm": 0.268839567899704, |
| "learning_rate": 0.0002270636547511916, |
| "loss": 0.0013, |
| "step": 143000 |
| }, |
| { |
| "epoch": 2.304, |
| "grad_norm": 0.024980826303362846, |
| "learning_rate": 0.00022706001340136542, |
| "loss": 0.0017, |
| "step": 144000 |
| }, |
| { |
| "epoch": 2.32, |
| "grad_norm": 0.025631515309214592, |
| "learning_rate": 0.00022705637205153926, |
| "loss": 0.0009, |
| "step": 145000 |
| }, |
| { |
| "epoch": 2.32, |
| "eval_loss": 0.0012023365125060081, |
| "eval_runtime": 27.5991, |
| "eval_samples_per_second": 36.233, |
| "eval_steps_per_second": 4.529, |
| "step": 145000 |
| }, |
| { |
| "epoch": 2.336, |
| "grad_norm": 0.010165953077375889, |
| "learning_rate": 0.0002270527307017131, |
| "loss": 0.0018, |
| "step": 146000 |
| }, |
| { |
| "epoch": 2.352, |
| "grad_norm": 0.012398986145853996, |
| "learning_rate": 0.00022704908935188695, |
| "loss": 0.001, |
| "step": 147000 |
| }, |
| { |
| "epoch": 2.368, |
| "grad_norm": 0.02246440201997757, |
| "learning_rate": 0.0002270454480020608, |
| "loss": 0.0025, |
| "step": 148000 |
| }, |
| { |
| "epoch": 2.384, |
| "grad_norm": 0.018412381410598755, |
| "learning_rate": 0.00022704180665223463, |
| "loss": 0.0008, |
| "step": 149000 |
| }, |
| { |
| "epoch": 2.4, |
| "grad_norm": 0.025599336251616478, |
| "learning_rate": 0.00022703816530240847, |
| "loss": 0.0025, |
| "step": 150000 |
| }, |
| { |
| "epoch": 2.4, |
| "eval_loss": 0.000995820271782577, |
| "eval_runtime": 27.7548, |
| "eval_samples_per_second": 36.03, |
| "eval_steps_per_second": 4.504, |
| "step": 150000 |
| }, |
| { |
| "epoch": 2.416, |
| "grad_norm": 0.03476562350988388, |
| "learning_rate": 0.0002270345239525823, |
| "loss": 0.0016, |
| "step": 151000 |
| }, |
| { |
| "epoch": 2.432, |
| "grad_norm": 0.002502072835341096, |
| "learning_rate": 0.00022703088260275615, |
| "loss": 0.001, |
| "step": 152000 |
| }, |
| { |
| "epoch": 2.448, |
| "grad_norm": 0.09545526653528214, |
| "learning_rate": 0.00022702724125293, |
| "loss": 0.0019, |
| "step": 153000 |
| }, |
| { |
| "epoch": 2.464, |
| "grad_norm": 0.026374874636530876, |
| "learning_rate": 0.00022702359990310383, |
| "loss": 0.0027, |
| "step": 154000 |
| }, |
| { |
| "epoch": 2.48, |
| "grad_norm": 0.02330603636801243, |
| "learning_rate": 0.00022701995855327768, |
| "loss": 0.0013, |
| "step": 155000 |
| }, |
| { |
| "epoch": 2.48, |
| "eval_loss": 0.0009146310039795935, |
| "eval_runtime": 27.6699, |
| "eval_samples_per_second": 36.14, |
| "eval_steps_per_second": 4.518, |
| "step": 155000 |
| }, |
| { |
| "epoch": 2.496, |
| "grad_norm": 0.042115718126297, |
| "learning_rate": 0.00022701631720345152, |
| "loss": 0.001, |
| "step": 156000 |
| }, |
| { |
| "epoch": 2.512, |
| "grad_norm": 0.006467332132160664, |
| "learning_rate": 0.00022701267585362536, |
| "loss": 0.0013, |
| "step": 157000 |
| }, |
| { |
| "epoch": 2.528, |
| "grad_norm": 0.039700523018836975, |
| "learning_rate": 0.0002270090345037992, |
| "loss": 0.0012, |
| "step": 158000 |
| }, |
| { |
| "epoch": 2.544, |
| "grad_norm": 0.006177098024636507, |
| "learning_rate": 0.00022700539315397304, |
| "loss": 0.0032, |
| "step": 159000 |
| }, |
| { |
| "epoch": 2.56, |
| "grad_norm": 0.016644610092043877, |
| "learning_rate": 0.00022700175180414688, |
| "loss": 0.0007, |
| "step": 160000 |
| }, |
| { |
| "epoch": 2.56, |
| "eval_loss": 0.0010344331385567784, |
| "eval_runtime": 27.8065, |
| "eval_samples_per_second": 35.963, |
| "eval_steps_per_second": 4.495, |
| "step": 160000 |
| }, |
| { |
| "epoch": 2.576, |
| "grad_norm": 0.01400495320558548, |
| "learning_rate": 0.00022699811045432072, |
| "loss": 0.0012, |
| "step": 161000 |
| }, |
| { |
| "epoch": 2.592, |
| "grad_norm": 0.016703518107533455, |
| "learning_rate": 0.00022699446910449456, |
| "loss": 0.0012, |
| "step": 162000 |
| }, |
| { |
| "epoch": 2.608, |
| "grad_norm": 0.006359017454087734, |
| "learning_rate": 0.0002269908277546684, |
| "loss": 0.0012, |
| "step": 163000 |
| }, |
| { |
| "epoch": 2.624, |
| "grad_norm": 0.01771441660821438, |
| "learning_rate": 0.00022698718640484222, |
| "loss": 0.0016, |
| "step": 164000 |
| }, |
| { |
| "epoch": 2.64, |
| "grad_norm": 0.01094936951994896, |
| "learning_rate": 0.0002269835450550161, |
| "loss": 0.0011, |
| "step": 165000 |
| }, |
| { |
| "epoch": 2.64, |
| "eval_loss": 0.0007599141681566834, |
| "eval_runtime": 27.7146, |
| "eval_samples_per_second": 36.082, |
| "eval_steps_per_second": 4.51, |
| "step": 165000 |
| }, |
| { |
| "epoch": 2.656, |
| "grad_norm": 0.09152177721261978, |
| "learning_rate": 0.0002269799037051899, |
| "loss": 0.0024, |
| "step": 166000 |
| }, |
| { |
| "epoch": 2.672, |
| "grad_norm": 0.012105804868042469, |
| "learning_rate": 0.00022697626235536377, |
| "loss": 0.0009, |
| "step": 167000 |
| }, |
| { |
| "epoch": 2.6879999999999997, |
| "grad_norm": 0.01530654076486826, |
| "learning_rate": 0.00022697262100553758, |
| "loss": 0.0011, |
| "step": 168000 |
| }, |
| { |
| "epoch": 2.7039999999999997, |
| "grad_norm": 0.031053414568305016, |
| "learning_rate": 0.00022696897965571145, |
| "loss": 0.0015, |
| "step": 169000 |
| }, |
| { |
| "epoch": 2.7199999999999998, |
| "grad_norm": 0.01557753887027502, |
| "learning_rate": 0.00022696533830588527, |
| "loss": 0.001, |
| "step": 170000 |
| }, |
| { |
| "epoch": 2.7199999999999998, |
| "eval_loss": 0.0008088626782409847, |
| "eval_runtime": 27.776, |
| "eval_samples_per_second": 36.002, |
| "eval_steps_per_second": 4.5, |
| "step": 170000 |
| }, |
| { |
| "epoch": 2.7359999999999998, |
| "grad_norm": 0.02831295132637024, |
| "learning_rate": 0.00022696169695605913, |
| "loss": 0.0014, |
| "step": 171000 |
| }, |
| { |
| "epoch": 2.752, |
| "grad_norm": 0.017672572284936905, |
| "learning_rate": 0.00022695805560623295, |
| "loss": 0.0011, |
| "step": 172000 |
| }, |
| { |
| "epoch": 2.768, |
| "grad_norm": 0.018164193257689476, |
| "learning_rate": 0.00022695441425640682, |
| "loss": 0.0019, |
| "step": 173000 |
| }, |
| { |
| "epoch": 2.784, |
| "grad_norm": 0.017383994534611702, |
| "learning_rate": 0.00022695077290658063, |
| "loss": 0.001, |
| "step": 174000 |
| }, |
| { |
| "epoch": 2.8, |
| "grad_norm": 0.006576849147677422, |
| "learning_rate": 0.0002269471315567545, |
| "loss": 0.0011, |
| "step": 175000 |
| }, |
| { |
| "epoch": 2.8, |
| "eval_loss": 0.0006260189693421125, |
| "eval_runtime": 27.3919, |
| "eval_samples_per_second": 36.507, |
| "eval_steps_per_second": 4.563, |
| "step": 175000 |
| }, |
| { |
| "epoch": 2.816, |
| "grad_norm": 0.019615883007645607, |
| "learning_rate": 0.0002269434902069283, |
| "loss": 0.0012, |
| "step": 176000 |
| }, |
| { |
| "epoch": 2.832, |
| "grad_norm": 0.03926165774464607, |
| "learning_rate": 0.00022693984885710218, |
| "loss": 0.0014, |
| "step": 177000 |
| }, |
| { |
| "epoch": 2.848, |
| "grad_norm": 0.021534917876124382, |
| "learning_rate": 0.000226936207507276, |
| "loss": 0.0012, |
| "step": 178000 |
| }, |
| { |
| "epoch": 2.864, |
| "grad_norm": 0.04047563299536705, |
| "learning_rate": 0.00022693256615744986, |
| "loss": 0.001, |
| "step": 179000 |
| }, |
| { |
| "epoch": 2.88, |
| "grad_norm": 0.04712160676717758, |
| "learning_rate": 0.00022692892480762368, |
| "loss": 0.0015, |
| "step": 180000 |
| }, |
| { |
| "epoch": 2.88, |
| "eval_loss": 0.0013630291214212775, |
| "eval_runtime": 27.4056, |
| "eval_samples_per_second": 36.489, |
| "eval_steps_per_second": 4.561, |
| "step": 180000 |
| }, |
| { |
| "epoch": 2.896, |
| "grad_norm": 0.21584591269493103, |
| "learning_rate": 0.00022692528345779754, |
| "loss": 0.0019, |
| "step": 181000 |
| }, |
| { |
| "epoch": 2.912, |
| "grad_norm": 0.015519549138844013, |
| "learning_rate": 0.00022692164210797136, |
| "loss": 0.0012, |
| "step": 182000 |
| }, |
| { |
| "epoch": 2.928, |
| "grad_norm": 0.0314391665160656, |
| "learning_rate": 0.00022691800075814523, |
| "loss": 0.0009, |
| "step": 183000 |
| }, |
| { |
| "epoch": 2.944, |
| "grad_norm": 0.16906876862049103, |
| "learning_rate": 0.00022691435940831904, |
| "loss": 0.0013, |
| "step": 184000 |
| }, |
| { |
| "epoch": 2.96, |
| "grad_norm": 0.04538990557193756, |
| "learning_rate": 0.00022691071805849288, |
| "loss": 0.001, |
| "step": 185000 |
| }, |
| { |
| "epoch": 2.96, |
| "eval_loss": 0.0014080323744565248, |
| "eval_runtime": 27.3828, |
| "eval_samples_per_second": 36.519, |
| "eval_steps_per_second": 4.565, |
| "step": 185000 |
| }, |
| { |
| "epoch": 2.976, |
| "grad_norm": 0.008023149333894253, |
| "learning_rate": 0.00022690707670866672, |
| "loss": 0.0013, |
| "step": 186000 |
| }, |
| { |
| "epoch": 2.992, |
| "grad_norm": 0.011926773004233837, |
| "learning_rate": 0.00022690343535884056, |
| "loss": 0.0012, |
| "step": 187000 |
| }, |
| { |
| "epoch": 3.008, |
| "grad_norm": 0.01701526716351509, |
| "learning_rate": 0.0002268997940090144, |
| "loss": 0.0011, |
| "step": 188000 |
| }, |
| { |
| "epoch": 3.024, |
| "grad_norm": 0.015581037849187851, |
| "learning_rate": 0.00022689615265918825, |
| "loss": 0.0013, |
| "step": 189000 |
| }, |
| { |
| "epoch": 3.04, |
| "grad_norm": 0.012046800926327705, |
| "learning_rate": 0.0002268925113093621, |
| "loss": 0.001, |
| "step": 190000 |
| }, |
| { |
| "epoch": 3.04, |
| "eval_loss": 0.0010119588114321232, |
| "eval_runtime": 27.6665, |
| "eval_samples_per_second": 36.145, |
| "eval_steps_per_second": 4.518, |
| "step": 190000 |
| }, |
| { |
| "epoch": 3.056, |
| "grad_norm": 0.009263888001441956, |
| "learning_rate": 0.00022688886995953593, |
| "loss": 0.001, |
| "step": 191000 |
| }, |
| { |
| "epoch": 3.072, |
| "grad_norm": 0.0538918599486351, |
| "learning_rate": 0.00022688522860970977, |
| "loss": 0.0012, |
| "step": 192000 |
| }, |
| { |
| "epoch": 3.088, |
| "grad_norm": 0.0521121546626091, |
| "learning_rate": 0.0002268815872598836, |
| "loss": 0.0017, |
| "step": 193000 |
| }, |
| { |
| "epoch": 3.104, |
| "grad_norm": 0.05000779777765274, |
| "learning_rate": 0.00022687794591005745, |
| "loss": 0.0008, |
| "step": 194000 |
| }, |
| { |
| "epoch": 3.12, |
| "grad_norm": 0.06467895954847336, |
| "learning_rate": 0.0002268743045602313, |
| "loss": 0.0011, |
| "step": 195000 |
| }, |
| { |
| "epoch": 3.12, |
| "eval_loss": 0.0008815609035082161, |
| "eval_runtime": 27.5652, |
| "eval_samples_per_second": 36.278, |
| "eval_steps_per_second": 4.535, |
| "step": 195000 |
| }, |
| { |
| "epoch": 3.136, |
| "grad_norm": 0.01422048918902874, |
| "learning_rate": 0.00022687066321040513, |
| "loss": 0.0011, |
| "step": 196000 |
| }, |
| { |
| "epoch": 3.152, |
| "grad_norm": 0.02482694387435913, |
| "learning_rate": 0.00022686702186057897, |
| "loss": 0.0011, |
| "step": 197000 |
| }, |
| { |
| "epoch": 3.168, |
| "grad_norm": 0.03517874330282211, |
| "learning_rate": 0.00022686338051075282, |
| "loss": 0.0017, |
| "step": 198000 |
| }, |
| { |
| "epoch": 3.184, |
| "grad_norm": 0.027310600504279137, |
| "learning_rate": 0.00022685973916092666, |
| "loss": 0.0008, |
| "step": 199000 |
| }, |
| { |
| "epoch": 3.2, |
| "grad_norm": 0.06521017849445343, |
| "learning_rate": 0.0002268560978111005, |
| "loss": 0.002, |
| "step": 200000 |
| }, |
| { |
| "epoch": 3.2, |
| "eval_loss": 0.00754576688632369, |
| "eval_runtime": 27.5143, |
| "eval_samples_per_second": 36.345, |
| "eval_steps_per_second": 4.543, |
| "step": 200000 |
| }, |
| { |
| "epoch": 3.216, |
| "grad_norm": 0.24959920346736908, |
| "learning_rate": 0.00022685245646127434, |
| "loss": 0.0008, |
| "step": 201000 |
| }, |
| { |
| "epoch": 3.232, |
| "grad_norm": 0.010456324554979801, |
| "learning_rate": 0.00022684881511144818, |
| "loss": 0.0011, |
| "step": 202000 |
| }, |
| { |
| "epoch": 3.248, |
| "grad_norm": 0.010797294788062572, |
| "learning_rate": 0.00022684517376162202, |
| "loss": 0.0011, |
| "step": 203000 |
| }, |
| { |
| "epoch": 3.2640000000000002, |
| "grad_norm": 0.04222773015499115, |
| "learning_rate": 0.00022684153241179584, |
| "loss": 0.001, |
| "step": 204000 |
| }, |
| { |
| "epoch": 3.2800000000000002, |
| "grad_norm": 0.03277302905917168, |
| "learning_rate": 0.0002268378910619697, |
| "loss": 0.0015, |
| "step": 205000 |
| }, |
| { |
| "epoch": 3.2800000000000002, |
| "eval_loss": 0.0007634469075128436, |
| "eval_runtime": 27.7742, |
| "eval_samples_per_second": 36.005, |
| "eval_steps_per_second": 4.501, |
| "step": 205000 |
| }, |
| { |
| "epoch": 3.296, |
| "grad_norm": 0.0069810631684958935, |
| "learning_rate": 0.00022683424971214352, |
| "loss": 0.001, |
| "step": 206000 |
| }, |
| { |
| "epoch": 3.312, |
| "grad_norm": 0.01147681474685669, |
| "learning_rate": 0.00022683060836231739, |
| "loss": 0.0009, |
| "step": 207000 |
| }, |
| { |
| "epoch": 3.328, |
| "grad_norm": 0.009766928851604462, |
| "learning_rate": 0.0002268269670124912, |
| "loss": 0.0019, |
| "step": 208000 |
| }, |
| { |
| "epoch": 3.344, |
| "grad_norm": 0.03460145741701126, |
| "learning_rate": 0.00022682332566266507, |
| "loss": 0.0008, |
| "step": 209000 |
| }, |
| { |
| "epoch": 3.36, |
| "grad_norm": 0.016247229650616646, |
| "learning_rate": 0.00022681968431283888, |
| "loss": 0.001, |
| "step": 210000 |
| }, |
| { |
| "epoch": 3.36, |
| "eval_loss": 0.0010268606711179018, |
| "eval_runtime": 26.9593, |
| "eval_samples_per_second": 37.093, |
| "eval_steps_per_second": 4.637, |
| "step": 210000 |
| }, |
| { |
| "epoch": 3.376, |
| "grad_norm": 0.012766228057444096, |
| "learning_rate": 0.00022681604296301275, |
| "loss": 0.0008, |
| "step": 211000 |
| }, |
| { |
| "epoch": 3.392, |
| "grad_norm": 0.005086794961243868, |
| "learning_rate": 0.00022681240161318656, |
| "loss": 0.0014, |
| "step": 212000 |
| }, |
| { |
| "epoch": 3.408, |
| "grad_norm": 0.028264038264751434, |
| "learning_rate": 0.00022680876026336043, |
| "loss": 0.0011, |
| "step": 213000 |
| }, |
| { |
| "epoch": 3.424, |
| "grad_norm": 0.05160939320921898, |
| "learning_rate": 0.00022680511891353425, |
| "loss": 0.0009, |
| "step": 214000 |
| }, |
| { |
| "epoch": 3.44, |
| "grad_norm": 0.02259020321071148, |
| "learning_rate": 0.00022680147756370811, |
| "loss": 0.0012, |
| "step": 215000 |
| }, |
| { |
| "epoch": 3.44, |
| "eval_loss": 0.0007602461846545339, |
| "eval_runtime": 26.8881, |
| "eval_samples_per_second": 37.191, |
| "eval_steps_per_second": 4.649, |
| "step": 215000 |
| }, |
| { |
| "epoch": 3.456, |
| "grad_norm": 0.03077981248497963, |
| "learning_rate": 0.00022679783621388193, |
| "loss": 0.0012, |
| "step": 216000 |
| }, |
| { |
| "epoch": 3.472, |
| "grad_norm": 0.027997983619570732, |
| "learning_rate": 0.0002267941948640558, |
| "loss": 0.0008, |
| "step": 217000 |
| }, |
| { |
| "epoch": 3.488, |
| "grad_norm": 0.009089149534702301, |
| "learning_rate": 0.0002267905535142296, |
| "loss": 0.0011, |
| "step": 218000 |
| }, |
| { |
| "epoch": 3.504, |
| "grad_norm": 0.09043902903795242, |
| "learning_rate": 0.00022678691216440348, |
| "loss": 0.0011, |
| "step": 219000 |
| }, |
| { |
| "epoch": 3.52, |
| "grad_norm": 0.06199198588728905, |
| "learning_rate": 0.0002267832708145773, |
| "loss": 0.0011, |
| "step": 220000 |
| }, |
| { |
| "epoch": 3.52, |
| "eval_loss": 0.001106478739529848, |
| "eval_runtime": 27.0055, |
| "eval_samples_per_second": 37.029, |
| "eval_steps_per_second": 4.629, |
| "step": 220000 |
| }, |
| { |
| "epoch": 3.536, |
| "grad_norm": 0.013115255162119865, |
| "learning_rate": 0.00022677962946475116, |
| "loss": 0.0015, |
| "step": 221000 |
| }, |
| { |
| "epoch": 3.552, |
| "grad_norm": 0.030206598341464996, |
| "learning_rate": 0.00022677598811492498, |
| "loss": 0.001, |
| "step": 222000 |
| }, |
| { |
| "epoch": 3.568, |
| "grad_norm": 0.014335270039737225, |
| "learning_rate": 0.00022677234676509882, |
| "loss": 0.0008, |
| "step": 223000 |
| }, |
| { |
| "epoch": 3.584, |
| "grad_norm": 0.04320364445447922, |
| "learning_rate": 0.00022676870541527266, |
| "loss": 0.001, |
| "step": 224000 |
| }, |
| { |
| "epoch": 3.6, |
| "grad_norm": 0.01011396199464798, |
| "learning_rate": 0.0002267650640654465, |
| "loss": 0.0014, |
| "step": 225000 |
| }, |
| { |
| "epoch": 3.6, |
| "eval_loss": 0.0008724904037080705, |
| "eval_runtime": 26.7598, |
| "eval_samples_per_second": 37.37, |
| "eval_steps_per_second": 4.671, |
| "step": 225000 |
| }, |
| { |
| "epoch": 3.616, |
| "grad_norm": 0.06343936175107956, |
| "learning_rate": 0.00022676142271562034, |
| "loss": 0.0009, |
| "step": 226000 |
| }, |
| { |
| "epoch": 3.632, |
| "grad_norm": 0.04553668946027756, |
| "learning_rate": 0.00022675778136579418, |
| "loss": 0.001, |
| "step": 227000 |
| }, |
| { |
| "epoch": 3.648, |
| "grad_norm": 0.0029150221962481737, |
| "learning_rate": 0.00022675414001596802, |
| "loss": 0.0018, |
| "step": 228000 |
| }, |
| { |
| "epoch": 3.664, |
| "grad_norm": 0.03533324971795082, |
| "learning_rate": 0.00022675049866614186, |
| "loss": 0.0017, |
| "step": 229000 |
| }, |
| { |
| "epoch": 3.68, |
| "grad_norm": 0.020134087651968002, |
| "learning_rate": 0.0002267468573163157, |
| "loss": 0.0013, |
| "step": 230000 |
| }, |
| { |
| "epoch": 3.68, |
| "eval_loss": 0.001037033973261714, |
| "eval_runtime": 27.1191, |
| "eval_samples_per_second": 36.874, |
| "eval_steps_per_second": 4.609, |
| "step": 230000 |
| }, |
| { |
| "epoch": 3.6959999999999997, |
| "grad_norm": 0.01976308599114418, |
| "learning_rate": 0.00022674321596648955, |
| "loss": 0.0009, |
| "step": 231000 |
| }, |
| { |
| "epoch": 3.7119999999999997, |
| "grad_norm": 0.05415629222989082, |
| "learning_rate": 0.00022673957461666339, |
| "loss": 0.0012, |
| "step": 232000 |
| }, |
| { |
| "epoch": 3.7279999999999998, |
| "grad_norm": 0.020477378740906715, |
| "learning_rate": 0.00022673593326683723, |
| "loss": 0.001, |
| "step": 233000 |
| }, |
| { |
| "epoch": 3.7439999999999998, |
| "grad_norm": 0.014153924770653248, |
| "learning_rate": 0.00022673229191701107, |
| "loss": 0.0017, |
| "step": 234000 |
| }, |
| { |
| "epoch": 3.76, |
| "grad_norm": 0.02030963823199272, |
| "learning_rate": 0.0002267286505671849, |
| "loss": 0.0007, |
| "step": 235000 |
| }, |
| { |
| "epoch": 3.76, |
| "eval_loss": 0.0007908450206741691, |
| "eval_runtime": 27.0159, |
| "eval_samples_per_second": 37.015, |
| "eval_steps_per_second": 4.627, |
| "step": 235000 |
| }, |
| { |
| "epoch": 3.776, |
| "grad_norm": 0.03953304514288902, |
| "learning_rate": 0.00022672500921735875, |
| "loss": 0.0008, |
| "step": 236000 |
| }, |
| { |
| "epoch": 3.792, |
| "grad_norm": 0.007172519341111183, |
| "learning_rate": 0.0002267213678675326, |
| "loss": 0.0016, |
| "step": 237000 |
| }, |
| { |
| "epoch": 3.808, |
| "grad_norm": 0.03694753348827362, |
| "learning_rate": 0.00022671772651770643, |
| "loss": 0.0008, |
| "step": 238000 |
| }, |
| { |
| "epoch": 3.824, |
| "grad_norm": 0.04899757727980614, |
| "learning_rate": 0.00022671408516788027, |
| "loss": 0.0011, |
| "step": 239000 |
| }, |
| { |
| "epoch": 3.84, |
| "grad_norm": 0.05499159172177315, |
| "learning_rate": 0.00022671044381805412, |
| "loss": 0.0013, |
| "step": 240000 |
| }, |
| { |
| "epoch": 3.84, |
| "eval_loss": 0.0008275896543636918, |
| "eval_runtime": 27.099, |
| "eval_samples_per_second": 36.902, |
| "eval_steps_per_second": 4.613, |
| "step": 240000 |
| }, |
| { |
| "epoch": 3.856, |
| "grad_norm": 0.02498927153646946, |
| "learning_rate": 0.00022670680246822796, |
| "loss": 0.0008, |
| "step": 241000 |
| }, |
| { |
| "epoch": 3.872, |
| "grad_norm": 0.02703891508281231, |
| "learning_rate": 0.00022670316111840177, |
| "loss": 0.0009, |
| "step": 242000 |
| }, |
| { |
| "epoch": 3.888, |
| "grad_norm": 0.010871395468711853, |
| "learning_rate": 0.00022669951976857564, |
| "loss": 0.0009, |
| "step": 243000 |
| }, |
| { |
| "epoch": 3.904, |
| "grad_norm": 0.006647611036896706, |
| "learning_rate": 0.00022669587841874945, |
| "loss": 0.0019, |
| "step": 244000 |
| }, |
| { |
| "epoch": 3.92, |
| "grad_norm": 0.11232209205627441, |
| "learning_rate": 0.00022669223706892332, |
| "loss": 0.0006, |
| "step": 245000 |
| }, |
| { |
| "epoch": 3.92, |
| "eval_loss": 0.004233696032315493, |
| "eval_runtime": 27.2042, |
| "eval_samples_per_second": 36.759, |
| "eval_steps_per_second": 4.595, |
| "step": 245000 |
| }, |
| { |
| "epoch": 3.936, |
| "grad_norm": 0.03585943579673767, |
| "learning_rate": 0.00022668859571909713, |
| "loss": 0.0012, |
| "step": 246000 |
| }, |
| { |
| "epoch": 3.952, |
| "grad_norm": 0.028422392904758453, |
| "learning_rate": 0.000226684954369271, |
| "loss": 0.0009, |
| "step": 247000 |
| }, |
| { |
| "epoch": 3.968, |
| "grad_norm": 0.029626131057739258, |
| "learning_rate": 0.00022668131301944482, |
| "loss": 0.0009, |
| "step": 248000 |
| }, |
| { |
| "epoch": 3.984, |
| "grad_norm": 0.01423815730959177, |
| "learning_rate": 0.00022667767166961866, |
| "loss": 0.0011, |
| "step": 249000 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.028744470328092575, |
| "learning_rate": 0.0002266740303197925, |
| "loss": 0.0012, |
| "step": 250000 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.0009512793621979654, |
| "eval_runtime": 27.0826, |
| "eval_samples_per_second": 36.924, |
| "eval_steps_per_second": 4.616, |
| "step": 250000 |
| }, |
| { |
| "epoch": 4.016, |
| "grad_norm": 0.05679468810558319, |
| "learning_rate": 0.00022667038896996634, |
| "loss": 0.0008, |
| "step": 251000 |
| }, |
| { |
| "epoch": 4.032, |
| "grad_norm": 0.01259209681302309, |
| "learning_rate": 0.00022666674762014018, |
| "loss": 0.0012, |
| "step": 252000 |
| }, |
| { |
| "epoch": 4.048, |
| "grad_norm": 0.02058994211256504, |
| "learning_rate": 0.00022666310627031402, |
| "loss": 0.0007, |
| "step": 253000 |
| }, |
| { |
| "epoch": 4.064, |
| "grad_norm": 0.028425488620996475, |
| "learning_rate": 0.00022665946492048786, |
| "loss": 0.0017, |
| "step": 254000 |
| }, |
| { |
| "epoch": 4.08, |
| "grad_norm": 0.035576559603214264, |
| "learning_rate": 0.0002266558235706617, |
| "loss": 0.0008, |
| "step": 255000 |
| }, |
| { |
| "epoch": 4.08, |
| "eval_loss": 0.0006453625974245369, |
| "eval_runtime": 27.5028, |
| "eval_samples_per_second": 36.36, |
| "eval_steps_per_second": 4.545, |
| "step": 255000 |
| } |
| ], |
| "logging_steps": 1000, |
| "max_steps": 62500000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1000, |
| "save_steps": 5000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.9094798804101104e+19, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|