beast33 committed
Commit baad276 · verified · 1 Parent(s): eeb6969

Training in progress, step 361, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:db1af205eff0d1c3981f58af318310bd969f0ccee8e313b79ebead6997c843b5
+ oid sha256:cc0496d758ec86c37d1971c49b2e7b5876067a9ce8635cb3d8c4520d38cd938b
  size 639691872
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8cf0977855e53e7a960c7caf7aa4b8b6575b7d1cea78d007041fe7b74b13d404
+ oid sha256:de404a495901928b91622cc6faceee097ce90f7963c9310ed4a92168b38a33d6
  size 325340244
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a30f2d89b79bfb2d9929f2dc3ffeef086eb1500788503382d980f99cbe057e80
+ oid sha256:0f761d844d5f040bacf1f99959e3a5fd3da1b16fd7877660f2d80d9193f6afa1
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:825ef3e3db682363455a0d008a860e7cc4412a53aa533791e6b37fda9dca6312
+ oid sha256:b857be7b95ff8324c4727de3c0f481a268cea8c6e2533b10d776846f18e23993
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": 1.556799054145813,
  "best_model_checkpoint": "miner_id_24/checkpoint-300",
- "epoch": 0.8310249307479224,
+ "epoch": 1.0,
  "eval_steps": 100,
- "global_step": 300,
+ "global_step": 361,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -2139,6 +2139,433 @@
  "eval_samples_per_second": 6.356,
  "eval_steps_per_second": 1.589,
  "step": 300
+ },
+ {
+ "epoch": 0.8337950138504155,
+ "grad_norm": 0.38641664385795593,
+ "learning_rate": 7.446405675168938e-06,
+ "loss": 1.6862,
+ "step": 301
+ },
+ {
+ "epoch": 0.8365650969529086,
+ "grad_norm": 0.386618435382843,
+ "learning_rate": 7.206354198753862e-06,
+ "loss": 1.6566,
+ "step": 302
+ },
+ {
+ "epoch": 0.8393351800554016,
+ "grad_norm": 0.4190298318862915,
+ "learning_rate": 6.969934901005809e-06,
+ "loss": 1.6447,
+ "step": 303
+ },
+ {
+ "epoch": 0.8421052631578947,
+ "grad_norm": 0.40949639678001404,
+ "learning_rate": 6.7371678483882264e-06,
+ "loss": 1.6331,
+ "step": 304
+ },
+ {
+ "epoch": 0.8448753462603878,
+ "grad_norm": 0.42585617303848267,
+ "learning_rate": 6.508072797374454e-06,
+ "loss": 1.6439,
+ "step": 305
+ },
+ {
+ "epoch": 0.8476454293628809,
+ "grad_norm": 0.41650789976119995,
+ "learning_rate": 6.282669192770896e-06,
+ "loss": 1.616,
+ "step": 306
+ },
+ {
+ "epoch": 0.850415512465374,
+ "grad_norm": 0.4084133803844452,
+ "learning_rate": 6.060976166066546e-06,
+ "loss": 1.587,
+ "step": 307
+ },
+ {
+ "epoch": 0.853185595567867,
+ "grad_norm": 0.4127242863178253,
+ "learning_rate": 5.8430125338092115e-06,
+ "loss": 1.6102,
+ "step": 308
+ },
+ {
+ "epoch": 0.8559556786703602,
+ "grad_norm": 0.42294085025787354,
+ "learning_rate": 5.628796796008434e-06,
+ "loss": 1.6542,
+ "step": 309
+ },
+ {
+ "epoch": 0.8587257617728532,
+ "grad_norm": 0.41091540455818176,
+ "learning_rate": 5.418347134565249e-06,
+ "loss": 1.5955,
+ "step": 310
+ },
+ {
+ "epoch": 0.8614958448753463,
+ "grad_norm": 0.41265037655830383,
+ "learning_rate": 5.211681411728969e-06,
+ "loss": 1.6711,
+ "step": 311
+ },
+ {
+ "epoch": 0.8642659279778393,
+ "grad_norm": 0.44204774498939514,
+ "learning_rate": 5.008817168581137e-06,
+ "loss": 1.5871,
+ "step": 312
+ },
+ {
+ "epoch": 0.8670360110803325,
+ "grad_norm": 0.4217774271965027,
+ "learning_rate": 4.809771623546627e-06,
+ "loss": 1.5632,
+ "step": 313
+ },
+ {
+ "epoch": 0.8698060941828255,
+ "grad_norm": 0.43789371848106384,
+ "learning_rate": 4.614561670932288e-06,
+ "loss": 1.5672,
+ "step": 314
+ },
+ {
+ "epoch": 0.8725761772853186,
+ "grad_norm": 0.44473928213119507,
+ "learning_rate": 4.423203879492943e-06,
+ "loss": 1.5175,
+ "step": 315
+ },
+ {
+ "epoch": 0.8753462603878116,
+ "grad_norm": 0.4420805871486664,
+ "learning_rate": 4.2357144910251e-06,
+ "loss": 1.5502,
+ "step": 316
+ },
+ {
+ "epoch": 0.8781163434903048,
+ "grad_norm": 0.448369562625885,
+ "learning_rate": 4.05210941898847e-06,
+ "loss": 1.5517,
+ "step": 317
+ },
+ {
+ "epoch": 0.8808864265927978,
+ "grad_norm": 0.44154343008995056,
+ "learning_rate": 3.872404247155193e-06,
+ "loss": 1.4863,
+ "step": 318
+ },
+ {
+ "epoch": 0.8836565096952909,
+ "grad_norm": 0.44638606905937195,
+ "learning_rate": 3.696614228287187e-06,
+ "loss": 1.5786,
+ "step": 319
+ },
+ {
+ "epoch": 0.8864265927977839,
+ "grad_norm": 0.43334829807281494,
+ "learning_rate": 3.5247542828415747e-06,
+ "loss": 1.5491,
+ "step": 320
+ },
+ {
+ "epoch": 0.889196675900277,
+ "grad_norm": 0.4540192484855652,
+ "learning_rate": 3.356838997704226e-06,
+ "loss": 1.5194,
+ "step": 321
+ },
+ {
+ "epoch": 0.8919667590027701,
+ "grad_norm": 0.4565938115119934,
+ "learning_rate": 3.1928826249516987e-06,
+ "loss": 1.5226,
+ "step": 322
+ },
+ {
+ "epoch": 0.8947368421052632,
+ "grad_norm": 0.4570363461971283,
+ "learning_rate": 3.0328990806415934e-06,
+ "loss": 1.5702,
+ "step": 323
+ },
+ {
+ "epoch": 0.8975069252077562,
+ "grad_norm": 0.4690124988555908,
+ "learning_rate": 2.8769019436313715e-06,
+ "loss": 1.5384,
+ "step": 324
+ },
+ {
+ "epoch": 0.9002770083102493,
+ "grad_norm": 0.47177836298942566,
+ "learning_rate": 2.7249044544258363e-06,
+ "loss": 1.5368,
+ "step": 325
+ },
+ {
+ "epoch": 0.9030470914127424,
+ "grad_norm": 0.4627552628517151,
+ "learning_rate": 2.576919514053355e-06,
+ "loss": 1.5475,
+ "step": 326
+ },
+ {
+ "epoch": 0.9058171745152355,
+ "grad_norm": 0.4692405164241791,
+ "learning_rate": 2.4329596829708144e-06,
+ "loss": 1.4843,
+ "step": 327
+ },
+ {
+ "epoch": 0.9085872576177285,
+ "grad_norm": 0.5105109214782715,
+ "learning_rate": 2.2930371799975594e-06,
+ "loss": 1.5008,
+ "step": 328
+ },
+ {
+ "epoch": 0.9113573407202216,
+ "grad_norm": 0.48938921093940735,
+ "learning_rate": 2.157163881278312e-06,
+ "loss": 1.5492,
+ "step": 329
+ },
+ {
+ "epoch": 0.9141274238227147,
+ "grad_norm": 0.485343337059021,
+ "learning_rate": 2.0253513192751373e-06,
+ "loss": 1.5183,
+ "step": 330
+ },
+ {
+ "epoch": 0.9168975069252078,
+ "grad_norm": 0.4986342489719391,
+ "learning_rate": 1.8976106817886196e-06,
+ "loss": 1.5177,
+ "step": 331
+ },
+ {
+ "epoch": 0.9196675900277008,
+ "grad_norm": 0.49586573243141174,
+ "learning_rate": 1.7739528110083004e-06,
+ "loss": 1.519,
+ "step": 332
+ },
+ {
+ "epoch": 0.9224376731301939,
+ "grad_norm": 0.5050438642501831,
+ "learning_rate": 1.6543882025923886e-06,
+ "loss": 1.4329,
+ "step": 333
+ },
+ {
+ "epoch": 0.925207756232687,
+ "grad_norm": 0.5025500655174255,
+ "learning_rate": 1.5389270047769578e-06,
+ "loss": 1.486,
+ "step": 334
+ },
+ {
+ "epoch": 0.9279778393351801,
+ "grad_norm": 0.50665682554245,
+ "learning_rate": 1.4275790175145908e-06,
+ "loss": 1.4943,
+ "step": 335
+ },
+ {
+ "epoch": 0.9307479224376731,
+ "grad_norm": 0.518688440322876,
+ "learning_rate": 1.3203536916425841e-06,
+ "loss": 1.526,
+ "step": 336
+ },
+ {
+ "epoch": 0.9335180055401662,
+ "grad_norm": 0.5425696969032288,
+ "learning_rate": 1.217260128080816e-06,
+ "loss": 1.513,
+ "step": 337
+ },
+ {
+ "epoch": 0.9362880886426593,
+ "grad_norm": 0.5442622303962708,
+ "learning_rate": 1.1183070770592441e-06,
+ "loss": 1.5735,
+ "step": 338
+ },
+ {
+ "epoch": 0.9390581717451524,
+ "grad_norm": 0.5330432057380676,
+ "learning_rate": 1.0235029373752758e-06,
+ "loss": 1.5291,
+ "step": 339
+ },
+ {
+ "epoch": 0.9418282548476454,
+ "grad_norm": 0.5193708539009094,
+ "learning_rate": 9.32855755680867e-07,
+ "loss": 1.418,
+ "step": 340
+ },
+ {
+ "epoch": 0.9445983379501385,
+ "grad_norm": 0.5180433392524719,
+ "learning_rate": 8.463732257995571e-07,
+ "loss": 1.495,
+ "step": 341
+ },
+ {
+ "epoch": 0.9473684210526315,
+ "grad_norm": 0.5407363772392273,
+ "learning_rate": 7.640626880734581e-07,
+ "loss": 1.5243,
+ "step": 342
+ },
+ {
+ "epoch": 0.9501385041551247,
+ "grad_norm": 0.5722284913063049,
+ "learning_rate": 6.859311287402081e-07,
+ "loss": 1.4204,
+ "step": 343
+ },
+ {
+ "epoch": 0.9529085872576177,
+ "grad_norm": 0.5504374504089355,
+ "learning_rate": 6.119851793400189e-07,
+ "loss": 1.4763,
+ "step": 344
+ },
+ {
+ "epoch": 0.9556786703601108,
+ "grad_norm": 0.5620555281639099,
+ "learning_rate": 5.422311161528193e-07,
+ "loss": 1.4551,
+ "step": 345
+ },
+ {
+ "epoch": 0.9584487534626038,
+ "grad_norm": 0.5651283860206604,
+ "learning_rate": 4.7667485966552685e-07,
+ "loss": 1.4817,
+ "step": 346
+ },
+ {
+ "epoch": 0.961218836565097,
+ "grad_norm": 0.6057314872741699,
+ "learning_rate": 4.153219740695435e-07,
+ "loss": 1.5466,
+ "step": 347
+ },
+ {
+ "epoch": 0.96398891966759,
+ "grad_norm": 0.6597415208816528,
+ "learning_rate": 3.5817766678850615e-07,
+ "loss": 1.5562,
+ "step": 348
+ },
+ {
+ "epoch": 0.9667590027700831,
+ "grad_norm": 0.7001429796218872,
+ "learning_rate": 3.052467880362675e-07,
+ "loss": 1.899,
+ "step": 349
+ },
+ {
+ "epoch": 0.9695290858725761,
+ "grad_norm": 0.8798688054084778,
+ "learning_rate": 2.5653383040524227e-07,
+ "loss": 1.96,
+ "step": 350
+ },
+ {
+ "epoch": 0.9722991689750693,
+ "grad_norm": 0.3499889671802521,
+ "learning_rate": 2.1204292848509555e-07,
+ "loss": 1.6123,
+ "step": 351
+ },
+ {
+ "epoch": 0.9750692520775623,
+ "grad_norm": 0.38690975308418274,
+ "learning_rate": 1.717778585118013e-07,
+ "loss": 1.5579,
+ "step": 352
+ },
+ {
+ "epoch": 0.9778393351800554,
+ "grad_norm": 0.41460373997688293,
+ "learning_rate": 1.3574203804713747e-07,
+ "loss": 1.585,
+ "step": 353
+ },
+ {
+ "epoch": 0.9806094182825484,
+ "grad_norm": 0.4353157877922058,
+ "learning_rate": 1.0393852568860719e-07,
+ "loss": 1.5611,
+ "step": 354
+ },
+ {
+ "epoch": 0.9833795013850416,
+ "grad_norm": 0.450083464384079,
+ "learning_rate": 7.637002080985168e-08,
+ "loss": 1.5568,
+ "step": 355
+ },
+ {
+ "epoch": 0.9861495844875346,
+ "grad_norm": 0.47905081510543823,
+ "learning_rate": 5.303886333151153e-08,
+ "loss": 1.5438,
+ "step": 356
+ },
+ {
+ "epoch": 0.9889196675900277,
+ "grad_norm": 0.4843379855155945,
+ "learning_rate": 3.394703352263551e-08,
+ "loss": 1.4646,
+ "step": 357
+ },
+ {
+ "epoch": 0.9916897506925207,
+ "grad_norm": 0.5109671950340271,
+ "learning_rate": 1.9096151832609375e-08,
+ "loss": 1.513,
+ "step": 358
+ },
+ {
+ "epoch": 0.9944598337950139,
+ "grad_norm": 0.5229476690292358,
+ "learning_rate": 8.487478753615997e-09,
+ "loss": 1.4675,
+ "step": 359
+ },
+ {
+ "epoch": 0.997229916897507,
+ "grad_norm": 0.5755655765533447,
+ "learning_rate": 2.1219147136264382e-09,
+ "loss": 1.5564,
+ "step": 360
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.7581580877304077,
+ "learning_rate": 0.0,
+ "loss": 1.8149,
+ "step": 361
  }
  ],
  "logging_steps": 1,
@@ -2162,12 +2589,12 @@
  "should_evaluate": false,
  "should_log": false,
  "should_save": true,
- "should_training_stop": false
+ "should_training_stop": true
  },
  "attributes": {}
  }
  },
- "total_flos": 4.281474924675072e+17,
+ "total_flos": 5.1502575448070554e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null