chtmp223 commited on
Commit
4868842
·
verified ·
1 Parent(s): 6f83649

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +8 -0
  2. all_results.json +8 -0
  3. config.json +40 -0
  4. generation_config.json +12 -0
  5. log.out +3 -0
  6. model-00001-of-00007.safetensors +3 -0
  7. model-00002-of-00007.safetensors +3 -0
  8. model-00003-of-00007.safetensors +3 -0
  9. model-00004-of-00007.safetensors +3 -0
  10. model-00005-of-00007.safetensors +3 -0
  11. model-00006-of-00007.safetensors +3 -0
  12. model-00007-of-00007.safetensors +3 -0
  13. model.safetensors.index.json +298 -0
  14. special_tokens_map.json +16 -0
  15. tokenizer.json +0 -0
  16. tokenizer_config.json +2062 -0
  17. train_results.json +8 -0
  18. trainer_state.json +0 -0
  19. training_args.bin +3 -0
  20. wandb/debug-cli.ctpham_umass_edu.log +0 -0
  21. wandb/debug-internal.log +0 -0
  22. wandb/debug.log +32 -0
  23. wandb/run-20241231_100054-t2idt8o9/files/conda-environment.yaml +233 -0
  24. wandb/run-20241231_100054-t2idt8o9/files/config.yaml +713 -0
  25. wandb/run-20241231_100054-t2idt8o9/files/output.log +4 -0
  26. wandb/run-20241231_100054-t2idt8o9/files/requirements.txt +244 -0
  27. wandb/run-20241231_100054-t2idt8o9/files/wandb-metadata.json +705 -0
  28. wandb/run-20241231_100054-t2idt8o9/files/wandb-summary.json +1 -0
  29. wandb/run-20241231_100054-t2idt8o9/logs/debug-internal.log +244 -0
  30. wandb/run-20241231_100054-t2idt8o9/logs/debug.log +31 -0
  31. wandb/run-20241231_100054-t2idt8o9/run-t2idt8o9.wandb +0 -0
  32. wandb/run-20250101_112144-t9wzg2aq/files/conda-environment.yaml +233 -0
  33. wandb/run-20250101_112144-t9wzg2aq/files/config.yaml +713 -0
  34. wandb/run-20250101_112144-t9wzg2aq/files/output.log +0 -0
  35. wandb/run-20250101_112144-t9wzg2aq/files/requirements.txt +244 -0
  36. wandb/run-20250101_112144-t9wzg2aq/files/wandb-metadata.json +705 -0
  37. wandb/run-20250101_112144-t9wzg2aq/files/wandb-summary.json +0 -0
  38. wandb/run-20250101_112144-t9wzg2aq/logs/debug-internal.log +0 -0
  39. wandb/run-20250101_112144-t9wzg2aq/logs/debug.log +31 -0
  40. wandb/run-20250101_112144-t9wzg2aq/run-t9wzg2aq.wandb +3 -0
  41. wandb/run-20250102_021927-pw8rud5e/files/conda-environment.yaml +233 -0
  42. wandb/run-20250102_021927-pw8rud5e/files/config.yaml +713 -0
  43. wandb/run-20250102_021927-pw8rud5e/files/output.log +293 -0
  44. wandb/run-20250102_021927-pw8rud5e/files/requirements.txt +244 -0
  45. wandb/run-20250102_021927-pw8rud5e/files/wandb-metadata.json +705 -0
  46. wandb/run-20250102_021927-pw8rud5e/files/wandb-summary.json +1 -0
  47. wandb/run-20250102_021927-pw8rud5e/logs/debug-internal.log +0 -0
  48. wandb/run-20250102_021927-pw8rud5e/logs/debug.log +31 -0
  49. wandb/run-20250102_021927-pw8rud5e/run-pw8rud5e.wandb +3 -0
  50. wandb/run-20250102_074844-1ecgrehs/files/conda-environment.yaml +233 -0
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ log.out filter=lfs diff=lfs merge=lfs -text
37
+ wandb/run-20250101_112144-t9wzg2aq/run-t9wzg2aq.wandb filter=lfs diff=lfs merge=lfs -text
38
+ wandb/run-20250102_021927-pw8rud5e/run-pw8rud5e.wandb filter=lfs diff=lfs merge=lfs -text
39
+ wandb/run-20250102_074844-1ecgrehs/run-1ecgrehs.wandb filter=lfs diff=lfs merge=lfs -text
40
+ wandb/run-20250103_132649-33elw7rf/run-33elw7rf.wandb filter=lfs diff=lfs merge=lfs -text
41
+ wandb/run-20250104_084255-r3b53lzn/run-r3b53lzn.wandb filter=lfs diff=lfs merge=lfs -text
42
+ wandb/run-20250117_195232-mv1zgbot/run-mv1zgbot.wandb filter=lfs diff=lfs merge=lfs -text
43
+ wandb/run-20250121_083313-ma4iz8vp/run-ma4iz8vp.wandb filter=lfs diff=lfs merge=lfs -text
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.0778410631950095,
3
+ "num_input_tokens_seen": 7732199424,
4
+ "train_loss": 0.036126901625228955,
5
+ "train_runtime": 9429.4679,
6
+ "train_samples_per_second": 0.782,
7
+ "train_steps_per_second": 0.391
8
+ }
config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 128000,
9
+ "eos_token_id": [
10
+ 128001,
11
+ 128008,
12
+ 128009
13
+ ],
14
+ "hidden_act": "silu",
15
+ "hidden_size": 4096,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 14336,
18
+ "max_position_embeddings": 131072,
19
+ "mlp_bias": false,
20
+ "model_type": "llama",
21
+ "num_attention_heads": 32,
22
+ "num_hidden_layers": 32,
23
+ "num_key_value_heads": 8,
24
+ "pad_token_id": 0,
25
+ "pretraining_tp": 1,
26
+ "rms_norm_eps": 1e-05,
27
+ "rope_scaling": {
28
+ "factor": 8.0,
29
+ "high_freq_factor": 4.0,
30
+ "low_freq_factor": 1.0,
31
+ "original_max_position_embeddings": 8192,
32
+ "rope_type": "llama3"
33
+ },
34
+ "rope_theta": 500000.0,
35
+ "tie_word_embeddings": false,
36
+ "torch_dtype": "float32",
37
+ "transformers_version": "4.44.2",
38
+ "use_cache": true,
39
+ "vocab_size": 128256
40
+ }
generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 128000,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 128001,
6
+ 128008,
7
+ 128009
8
+ ],
9
+ "temperature": 0.6,
10
+ "top_p": 0.9,
11
+ "transformers_version": "4.44.2"
12
+ }
log.out ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6963529e7017bead51a7c8c4ec43da1c11e51fc7e7bf359f1c8bf8031e2887b
3
+ size 11215004
model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b37b547ddb30f3ba8c469637a2b3b2856ede8d588143d3d4cfa00f049136250
3
+ size 4886466168
model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63d61bd8d47275a3c52e65bfbcdce67cbec860de33955ef431ad924a91d5f57f
3
+ size 4832007448
model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f539fcd92b48eb4d6aa603aada5453477cb33d75b497bf64b38ae096dac750e9
3
+ size 4999813112
model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8649934669fba2177b102627fde9f1eaceb4c38931d33c37b3f063130823e5c2
3
+ size 4999813128
model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cc2c14df798d8bfdc66be730382828c9d04953f542dedd59dd6aa95fcea672d
3
+ size 4832007496
model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3f0a55d8f39643270fa55bef347acf3df13c5b47cec0cf10ab2beaa5cf30216
3
+ size 4999813120
model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1531d157bbcfe2a426999ef6b0d7b9ee73a11372433f6f22d067e1a801caa6ea
3
+ size 2571158184
model.safetensors.index.json ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 32121044992
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00007-of-00007.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00007.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
15
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
16
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
17
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors",
18
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
19
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
20
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
21
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
22
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
23
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
24
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
25
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
26
+ "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors",
27
+ "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
28
+ "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
29
+ "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
30
+ "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
31
+ "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
32
+ "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
33
+ "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
34
+ "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
35
+ "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors",
36
+ "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
37
+ "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
38
+ "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
39
+ "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
40
+ "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
41
+ "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
42
+ "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
43
+ "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
44
+ "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors",
45
+ "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
46
+ "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
47
+ "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
48
+ "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
49
+ "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
50
+ "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
51
+ "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
52
+ "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
53
+ "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors",
54
+ "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
55
+ "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
56
+ "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
57
+ "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
58
+ "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
59
+ "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
60
+ "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
61
+ "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
62
+ "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors",
63
+ "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
64
+ "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
65
+ "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
66
+ "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
67
+ "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
68
+ "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
69
+ "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
70
+ "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
71
+ "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
72
+ "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
73
+ "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
74
+ "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
75
+ "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
76
+ "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
77
+ "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
78
+ "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
79
+ "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
80
+ "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors",
81
+ "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
82
+ "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
83
+ "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
84
+ "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
85
+ "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
86
+ "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
87
+ "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
88
+ "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
89
+ "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors",
90
+ "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
91
+ "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
92
+ "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
93
+ "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
94
+ "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
95
+ "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
96
+ "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
97
+ "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
98
+ "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors",
99
+ "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
100
+ "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
101
+ "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
102
+ "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
103
+ "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
104
+ "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
105
+ "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
106
+ "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
107
+ "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors",
108
+ "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
109
+ "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
110
+ "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
111
+ "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
112
+ "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
113
+ "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
114
+ "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
115
+ "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
116
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors",
117
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
118
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
119
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
120
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
121
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
122
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
123
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
124
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
125
+ "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors",
126
+ "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
127
+ "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
128
+ "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
129
+ "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
130
+ "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
131
+ "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
132
+ "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
133
+ "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
134
+ "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors",
135
+ "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
136
+ "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
137
+ "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
139
+ "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
140
+ "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
141
+ "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
142
+ "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
143
+ "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors",
144
+ "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
145
+ "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
146
+ "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
147
+ "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
148
+ "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
149
+ "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
150
+ "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
151
+ "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
152
+ "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors",
153
+ "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
154
+ "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
155
+ "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
156
+ "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
157
+ "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
158
+ "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
159
+ "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
160
+ "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
161
+ "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors",
162
+ "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
163
+ "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
164
+ "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
165
+ "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
166
+ "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
167
+ "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
168
+ "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
169
+ "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
170
+ "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors",
171
+ "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
172
+ "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
173
+ "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
174
+ "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
175
+ "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
176
+ "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
177
+ "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
178
+ "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
179
+ "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors",
180
+ "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
181
+ "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
182
+ "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
183
+ "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
184
+ "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
185
+ "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
186
+ "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
187
+ "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
188
+ "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors",
189
+ "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
190
+ "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
191
+ "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
192
+ "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
193
+ "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
194
+ "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
195
+ "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
196
+ "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
197
+ "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors",
198
+ "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
199
+ "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
200
+ "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
201
+ "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
202
+ "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
203
+ "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
204
+ "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
205
+ "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
206
+ "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors",
207
+ "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
208
+ "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
209
+ "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
210
+ "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
211
+ "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
212
+ "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
213
+ "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
214
+ "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
215
+ "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
216
+ "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
217
+ "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
218
+ "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
219
+ "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
220
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
221
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
222
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
223
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
224
+ "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors",
225
+ "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
226
+ "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
227
+ "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
228
+ "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
229
+ "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
230
+ "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
231
+ "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
232
+ "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
233
+ "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors",
234
+ "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
235
+ "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
236
+ "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
237
+ "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
238
+ "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
239
+ "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
240
+ "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
241
+ "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
242
+ "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
243
+ "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
244
+ "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
245
+ "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
246
+ "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
247
+ "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
248
+ "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
249
+ "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
250
+ "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
251
+ "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors",
252
+ "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
253
+ "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
254
+ "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
255
+ "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
256
+ "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
257
+ "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
258
+ "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
259
+ "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
260
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors",
261
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
262
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
263
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
264
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
265
+ "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
266
+ "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
267
+ "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
268
+ "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
269
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors",
270
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
271
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
272
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
273
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
274
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
275
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
276
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
277
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
278
+ "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors",
279
+ "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
280
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
281
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
282
+ "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
283
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
284
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
285
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
286
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
287
+ "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
288
+ "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
289
+ "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
290
+ "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
291
+ "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
292
+ "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
293
+ "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
294
+ "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
295
+ "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
296
+ "model.norm.weight": "model-00007-of-00007.safetensors"
297
+ }
298
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin_of_text|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|eot_id|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ }
16
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,2062 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "128000": {
4
+ "content": "<|begin_of_text|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "128001": {
12
+ "content": "<|end_of_text|>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "128002": {
20
+ "content": "<|reserved_special_token_0|>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "128003": {
28
+ "content": "<|reserved_special_token_1|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128004": {
36
+ "content": "<|finetune_right_pad_id|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "128005": {
44
+ "content": "<|reserved_special_token_2|>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "128006": {
52
+ "content": "<|start_header_id|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "128007": {
60
+ "content": "<|end_header_id|>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "128008": {
68
+ "content": "<|eom_id|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "128009": {
76
+ "content": "<|eot_id|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "128010": {
84
+ "content": "<|python_tag|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "128011": {
92
+ "content": "<|reserved_special_token_3|>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "128012": {
100
+ "content": "<|reserved_special_token_4|>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "128013": {
108
+ "content": "<|reserved_special_token_5|>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "128014": {
116
+ "content": "<|reserved_special_token_6|>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "128015": {
124
+ "content": "<|reserved_special_token_7|>",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "128016": {
132
+ "content": "<|reserved_special_token_8|>",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ },
139
+ "128017": {
140
+ "content": "<|reserved_special_token_9|>",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": true
146
+ },
147
+ "128018": {
148
+ "content": "<|reserved_special_token_10|>",
149
+ "lstrip": false,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": true
154
+ },
155
+ "128019": {
156
+ "content": "<|reserved_special_token_11|>",
157
+ "lstrip": false,
158
+ "normalized": false,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": true
162
+ },
163
+ "128020": {
164
+ "content": "<|reserved_special_token_12|>",
165
+ "lstrip": false,
166
+ "normalized": false,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": true
170
+ },
171
+ "128021": {
172
+ "content": "<|reserved_special_token_13|>",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": true
178
+ },
179
+ "128022": {
180
+ "content": "<|reserved_special_token_14|>",
181
+ "lstrip": false,
182
+ "normalized": false,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": true
186
+ },
187
+ "128023": {
188
+ "content": "<|reserved_special_token_15|>",
189
+ "lstrip": false,
190
+ "normalized": false,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": true
194
+ },
195
+ "128024": {
196
+ "content": "<|reserved_special_token_16|>",
197
+ "lstrip": false,
198
+ "normalized": false,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": true
202
+ },
203
+ "128025": {
204
+ "content": "<|reserved_special_token_17|>",
205
+ "lstrip": false,
206
+ "normalized": false,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": true
210
+ },
211
+ "128026": {
212
+ "content": "<|reserved_special_token_18|>",
213
+ "lstrip": false,
214
+ "normalized": false,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": true
218
+ },
219
+ "128027": {
220
+ "content": "<|reserved_special_token_19|>",
221
+ "lstrip": false,
222
+ "normalized": false,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": true
226
+ },
227
+ "128028": {
228
+ "content": "<|reserved_special_token_20|>",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": true
234
+ },
235
+ "128029": {
236
+ "content": "<|reserved_special_token_21|>",
237
+ "lstrip": false,
238
+ "normalized": false,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": true
242
+ },
243
+ "128030": {
244
+ "content": "<|reserved_special_token_22|>",
245
+ "lstrip": false,
246
+ "normalized": false,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": true
250
+ },
251
+ "128031": {
252
+ "content": "<|reserved_special_token_23|>",
253
+ "lstrip": false,
254
+ "normalized": false,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": true
258
+ },
259
+ "128032": {
260
+ "content": "<|reserved_special_token_24|>",
261
+ "lstrip": false,
262
+ "normalized": false,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": true
266
+ },
267
+ "128033": {
268
+ "content": "<|reserved_special_token_25|>",
269
+ "lstrip": false,
270
+ "normalized": false,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": true
274
+ },
275
+ "128034": {
276
+ "content": "<|reserved_special_token_26|>",
277
+ "lstrip": false,
278
+ "normalized": false,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": true
282
+ },
283
+ "128035": {
284
+ "content": "<|reserved_special_token_27|>",
285
+ "lstrip": false,
286
+ "normalized": false,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": true
290
+ },
291
+ "128036": {
292
+ "content": "<|reserved_special_token_28|>",
293
+ "lstrip": false,
294
+ "normalized": false,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": true
298
+ },
299
+ "128037": {
300
+ "content": "<|reserved_special_token_29|>",
301
+ "lstrip": false,
302
+ "normalized": false,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": true
306
+ },
307
+ "128038": {
308
+ "content": "<|reserved_special_token_30|>",
309
+ "lstrip": false,
310
+ "normalized": false,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": true
314
+ },
315
+ "128039": {
316
+ "content": "<|reserved_special_token_31|>",
317
+ "lstrip": false,
318
+ "normalized": false,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": true
322
+ },
323
+ "128040": {
324
+ "content": "<|reserved_special_token_32|>",
325
+ "lstrip": false,
326
+ "normalized": false,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": true
330
+ },
331
+ "128041": {
332
+ "content": "<|reserved_special_token_33|>",
333
+ "lstrip": false,
334
+ "normalized": false,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": true
338
+ },
339
+ "128042": {
340
+ "content": "<|reserved_special_token_34|>",
341
+ "lstrip": false,
342
+ "normalized": false,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": true
346
+ },
347
+ "128043": {
348
+ "content": "<|reserved_special_token_35|>",
349
+ "lstrip": false,
350
+ "normalized": false,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": true
354
+ },
355
+ "128044": {
356
+ "content": "<|reserved_special_token_36|>",
357
+ "lstrip": false,
358
+ "normalized": false,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": true
362
+ },
363
+ "128045": {
364
+ "content": "<|reserved_special_token_37|>",
365
+ "lstrip": false,
366
+ "normalized": false,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": true
370
+ },
371
+ "128046": {
372
+ "content": "<|reserved_special_token_38|>",
373
+ "lstrip": false,
374
+ "normalized": false,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": true
378
+ },
379
+ "128047": {
380
+ "content": "<|reserved_special_token_39|>",
381
+ "lstrip": false,
382
+ "normalized": false,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": true
386
+ },
387
+ "128048": {
388
+ "content": "<|reserved_special_token_40|>",
389
+ "lstrip": false,
390
+ "normalized": false,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": true
394
+ },
395
+ "128049": {
396
+ "content": "<|reserved_special_token_41|>",
397
+ "lstrip": false,
398
+ "normalized": false,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": true
402
+ },
403
+ "128050": {
404
+ "content": "<|reserved_special_token_42|>",
405
+ "lstrip": false,
406
+ "normalized": false,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": true
410
+ },
411
+ "128051": {
412
+ "content": "<|reserved_special_token_43|>",
413
+ "lstrip": false,
414
+ "normalized": false,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": true
418
+ },
419
+ "128052": {
420
+ "content": "<|reserved_special_token_44|>",
421
+ "lstrip": false,
422
+ "normalized": false,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": true
426
+ },
427
+ "128053": {
428
+ "content": "<|reserved_special_token_45|>",
429
+ "lstrip": false,
430
+ "normalized": false,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": true
434
+ },
435
+ "128054": {
436
+ "content": "<|reserved_special_token_46|>",
437
+ "lstrip": false,
438
+ "normalized": false,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": true
442
+ },
443
+ "128055": {
444
+ "content": "<|reserved_special_token_47|>",
445
+ "lstrip": false,
446
+ "normalized": false,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": true
450
+ },
451
+ "128056": {
452
+ "content": "<|reserved_special_token_48|>",
453
+ "lstrip": false,
454
+ "normalized": false,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": true
458
+ },
459
+ "128057": {
460
+ "content": "<|reserved_special_token_49|>",
461
+ "lstrip": false,
462
+ "normalized": false,
463
+ "rstrip": false,
464
+ "single_word": false,
465
+ "special": true
466
+ },
467
+ "128058": {
468
+ "content": "<|reserved_special_token_50|>",
469
+ "lstrip": false,
470
+ "normalized": false,
471
+ "rstrip": false,
472
+ "single_word": false,
473
+ "special": true
474
+ },
475
+ "128059": {
476
+ "content": "<|reserved_special_token_51|>",
477
+ "lstrip": false,
478
+ "normalized": false,
479
+ "rstrip": false,
480
+ "single_word": false,
481
+ "special": true
482
+ },
483
+ "128060": {
484
+ "content": "<|reserved_special_token_52|>",
485
+ "lstrip": false,
486
+ "normalized": false,
487
+ "rstrip": false,
488
+ "single_word": false,
489
+ "special": true
490
+ },
491
+ "128061": {
492
+ "content": "<|reserved_special_token_53|>",
493
+ "lstrip": false,
494
+ "normalized": false,
495
+ "rstrip": false,
496
+ "single_word": false,
497
+ "special": true
498
+ },
499
+ "128062": {
500
+ "content": "<|reserved_special_token_54|>",
501
+ "lstrip": false,
502
+ "normalized": false,
503
+ "rstrip": false,
504
+ "single_word": false,
505
+ "special": true
506
+ },
507
+ "128063": {
508
+ "content": "<|reserved_special_token_55|>",
509
+ "lstrip": false,
510
+ "normalized": false,
511
+ "rstrip": false,
512
+ "single_word": false,
513
+ "special": true
514
+ },
515
+ "128064": {
516
+ "content": "<|reserved_special_token_56|>",
517
+ "lstrip": false,
518
+ "normalized": false,
519
+ "rstrip": false,
520
+ "single_word": false,
521
+ "special": true
522
+ },
523
+ "128065": {
524
+ "content": "<|reserved_special_token_57|>",
525
+ "lstrip": false,
526
+ "normalized": false,
527
+ "rstrip": false,
528
+ "single_word": false,
529
+ "special": true
530
+ },
531
+ "128066": {
532
+ "content": "<|reserved_special_token_58|>",
533
+ "lstrip": false,
534
+ "normalized": false,
535
+ "rstrip": false,
536
+ "single_word": false,
537
+ "special": true
538
+ },
539
+ "128067": {
540
+ "content": "<|reserved_special_token_59|>",
541
+ "lstrip": false,
542
+ "normalized": false,
543
+ "rstrip": false,
544
+ "single_word": false,
545
+ "special": true
546
+ },
547
+ "128068": {
548
+ "content": "<|reserved_special_token_60|>",
549
+ "lstrip": false,
550
+ "normalized": false,
551
+ "rstrip": false,
552
+ "single_word": false,
553
+ "special": true
554
+ },
555
+ "128069": {
556
+ "content": "<|reserved_special_token_61|>",
557
+ "lstrip": false,
558
+ "normalized": false,
559
+ "rstrip": false,
560
+ "single_word": false,
561
+ "special": true
562
+ },
563
+ "128070": {
564
+ "content": "<|reserved_special_token_62|>",
565
+ "lstrip": false,
566
+ "normalized": false,
567
+ "rstrip": false,
568
+ "single_word": false,
569
+ "special": true
570
+ },
571
+ "128071": {
572
+ "content": "<|reserved_special_token_63|>",
573
+ "lstrip": false,
574
+ "normalized": false,
575
+ "rstrip": false,
576
+ "single_word": false,
577
+ "special": true
578
+ },
579
+ "128072": {
580
+ "content": "<|reserved_special_token_64|>",
581
+ "lstrip": false,
582
+ "normalized": false,
583
+ "rstrip": false,
584
+ "single_word": false,
585
+ "special": true
586
+ },
587
+ "128073": {
588
+ "content": "<|reserved_special_token_65|>",
589
+ "lstrip": false,
590
+ "normalized": false,
591
+ "rstrip": false,
592
+ "single_word": false,
593
+ "special": true
594
+ },
595
+ "128074": {
596
+ "content": "<|reserved_special_token_66|>",
597
+ "lstrip": false,
598
+ "normalized": false,
599
+ "rstrip": false,
600
+ "single_word": false,
601
+ "special": true
602
+ },
603
+ "128075": {
604
+ "content": "<|reserved_special_token_67|>",
605
+ "lstrip": false,
606
+ "normalized": false,
607
+ "rstrip": false,
608
+ "single_word": false,
609
+ "special": true
610
+ },
611
+ "128076": {
612
+ "content": "<|reserved_special_token_68|>",
613
+ "lstrip": false,
614
+ "normalized": false,
615
+ "rstrip": false,
616
+ "single_word": false,
617
+ "special": true
618
+ },
619
+ "128077": {
620
+ "content": "<|reserved_special_token_69|>",
621
+ "lstrip": false,
622
+ "normalized": false,
623
+ "rstrip": false,
624
+ "single_word": false,
625
+ "special": true
626
+ },
627
+ "128078": {
628
+ "content": "<|reserved_special_token_70|>",
629
+ "lstrip": false,
630
+ "normalized": false,
631
+ "rstrip": false,
632
+ "single_word": false,
633
+ "special": true
634
+ },
635
+ "128079": {
636
+ "content": "<|reserved_special_token_71|>",
637
+ "lstrip": false,
638
+ "normalized": false,
639
+ "rstrip": false,
640
+ "single_word": false,
641
+ "special": true
642
+ },
643
+ "128080": {
644
+ "content": "<|reserved_special_token_72|>",
645
+ "lstrip": false,
646
+ "normalized": false,
647
+ "rstrip": false,
648
+ "single_word": false,
649
+ "special": true
650
+ },
651
+ "128081": {
652
+ "content": "<|reserved_special_token_73|>",
653
+ "lstrip": false,
654
+ "normalized": false,
655
+ "rstrip": false,
656
+ "single_word": false,
657
+ "special": true
658
+ },
659
+ "128082": {
660
+ "content": "<|reserved_special_token_74|>",
661
+ "lstrip": false,
662
+ "normalized": false,
663
+ "rstrip": false,
664
+ "single_word": false,
665
+ "special": true
666
+ },
667
+ "128083": {
668
+ "content": "<|reserved_special_token_75|>",
669
+ "lstrip": false,
670
+ "normalized": false,
671
+ "rstrip": false,
672
+ "single_word": false,
673
+ "special": true
674
+ },
675
+ "128084": {
676
+ "content": "<|reserved_special_token_76|>",
677
+ "lstrip": false,
678
+ "normalized": false,
679
+ "rstrip": false,
680
+ "single_word": false,
681
+ "special": true
682
+ },
683
+ "128085": {
684
+ "content": "<|reserved_special_token_77|>",
685
+ "lstrip": false,
686
+ "normalized": false,
687
+ "rstrip": false,
688
+ "single_word": false,
689
+ "special": true
690
+ },
691
+ "128086": {
692
+ "content": "<|reserved_special_token_78|>",
693
+ "lstrip": false,
694
+ "normalized": false,
695
+ "rstrip": false,
696
+ "single_word": false,
697
+ "special": true
698
+ },
699
+ "128087": {
700
+ "content": "<|reserved_special_token_79|>",
701
+ "lstrip": false,
702
+ "normalized": false,
703
+ "rstrip": false,
704
+ "single_word": false,
705
+ "special": true
706
+ },
707
+ "128088": {
708
+ "content": "<|reserved_special_token_80|>",
709
+ "lstrip": false,
710
+ "normalized": false,
711
+ "rstrip": false,
712
+ "single_word": false,
713
+ "special": true
714
+ },
715
+ "128089": {
716
+ "content": "<|reserved_special_token_81|>",
717
+ "lstrip": false,
718
+ "normalized": false,
719
+ "rstrip": false,
720
+ "single_word": false,
721
+ "special": true
722
+ },
723
+ "128090": {
724
+ "content": "<|reserved_special_token_82|>",
725
+ "lstrip": false,
726
+ "normalized": false,
727
+ "rstrip": false,
728
+ "single_word": false,
729
+ "special": true
730
+ },
731
+ "128091": {
732
+ "content": "<|reserved_special_token_83|>",
733
+ "lstrip": false,
734
+ "normalized": false,
735
+ "rstrip": false,
736
+ "single_word": false,
737
+ "special": true
738
+ },
739
+ "128092": {
740
+ "content": "<|reserved_special_token_84|>",
741
+ "lstrip": false,
742
+ "normalized": false,
743
+ "rstrip": false,
744
+ "single_word": false,
745
+ "special": true
746
+ },
747
+ "128093": {
748
+ "content": "<|reserved_special_token_85|>",
749
+ "lstrip": false,
750
+ "normalized": false,
751
+ "rstrip": false,
752
+ "single_word": false,
753
+ "special": true
754
+ },
755
+ "128094": {
756
+ "content": "<|reserved_special_token_86|>",
757
+ "lstrip": false,
758
+ "normalized": false,
759
+ "rstrip": false,
760
+ "single_word": false,
761
+ "special": true
762
+ },
763
+ "128095": {
764
+ "content": "<|reserved_special_token_87|>",
765
+ "lstrip": false,
766
+ "normalized": false,
767
+ "rstrip": false,
768
+ "single_word": false,
769
+ "special": true
770
+ },
771
+ "128096": {
772
+ "content": "<|reserved_special_token_88|>",
773
+ "lstrip": false,
774
+ "normalized": false,
775
+ "rstrip": false,
776
+ "single_word": false,
777
+ "special": true
778
+ },
779
+ "128097": {
780
+ "content": "<|reserved_special_token_89|>",
781
+ "lstrip": false,
782
+ "normalized": false,
783
+ "rstrip": false,
784
+ "single_word": false,
785
+ "special": true
786
+ },
787
+ "128098": {
788
+ "content": "<|reserved_special_token_90|>",
789
+ "lstrip": false,
790
+ "normalized": false,
791
+ "rstrip": false,
792
+ "single_word": false,
793
+ "special": true
794
+ },
795
+ "128099": {
796
+ "content": "<|reserved_special_token_91|>",
797
+ "lstrip": false,
798
+ "normalized": false,
799
+ "rstrip": false,
800
+ "single_word": false,
801
+ "special": true
802
+ },
803
+ "128100": {
804
+ "content": "<|reserved_special_token_92|>",
805
+ "lstrip": false,
806
+ "normalized": false,
807
+ "rstrip": false,
808
+ "single_word": false,
809
+ "special": true
810
+ },
811
+ "128101": {
812
+ "content": "<|reserved_special_token_93|>",
813
+ "lstrip": false,
814
+ "normalized": false,
815
+ "rstrip": false,
816
+ "single_word": false,
817
+ "special": true
818
+ },
819
+ "128102": {
820
+ "content": "<|reserved_special_token_94|>",
821
+ "lstrip": false,
822
+ "normalized": false,
823
+ "rstrip": false,
824
+ "single_word": false,
825
+ "special": true
826
+ },
827
+ "128103": {
828
+ "content": "<|reserved_special_token_95|>",
829
+ "lstrip": false,
830
+ "normalized": false,
831
+ "rstrip": false,
832
+ "single_word": false,
833
+ "special": true
834
+ },
835
+ "128104": {
836
+ "content": "<|reserved_special_token_96|>",
837
+ "lstrip": false,
838
+ "normalized": false,
839
+ "rstrip": false,
840
+ "single_word": false,
841
+ "special": true
842
+ },
843
+ "128105": {
844
+ "content": "<|reserved_special_token_97|>",
845
+ "lstrip": false,
846
+ "normalized": false,
847
+ "rstrip": false,
848
+ "single_word": false,
849
+ "special": true
850
+ },
851
+ "128106": {
852
+ "content": "<|reserved_special_token_98|>",
853
+ "lstrip": false,
854
+ "normalized": false,
855
+ "rstrip": false,
856
+ "single_word": false,
857
+ "special": true
858
+ },
859
+ "128107": {
860
+ "content": "<|reserved_special_token_99|>",
861
+ "lstrip": false,
862
+ "normalized": false,
863
+ "rstrip": false,
864
+ "single_word": false,
865
+ "special": true
866
+ },
867
+ "128108": {
868
+ "content": "<|reserved_special_token_100|>",
869
+ "lstrip": false,
870
+ "normalized": false,
871
+ "rstrip": false,
872
+ "single_word": false,
873
+ "special": true
874
+ },
875
+ "128109": {
876
+ "content": "<|reserved_special_token_101|>",
877
+ "lstrip": false,
878
+ "normalized": false,
879
+ "rstrip": false,
880
+ "single_word": false,
881
+ "special": true
882
+ },
883
+ "128110": {
884
+ "content": "<|reserved_special_token_102|>",
885
+ "lstrip": false,
886
+ "normalized": false,
887
+ "rstrip": false,
888
+ "single_word": false,
889
+ "special": true
890
+ },
891
+ "128111": {
892
+ "content": "<|reserved_special_token_103|>",
893
+ "lstrip": false,
894
+ "normalized": false,
895
+ "rstrip": false,
896
+ "single_word": false,
897
+ "special": true
898
+ },
899
+ "128112": {
900
+ "content": "<|reserved_special_token_104|>",
901
+ "lstrip": false,
902
+ "normalized": false,
903
+ "rstrip": false,
904
+ "single_word": false,
905
+ "special": true
906
+ },
907
+ "128113": {
908
+ "content": "<|reserved_special_token_105|>",
909
+ "lstrip": false,
910
+ "normalized": false,
911
+ "rstrip": false,
912
+ "single_word": false,
913
+ "special": true
914
+ },
915
+ "128114": {
916
+ "content": "<|reserved_special_token_106|>",
917
+ "lstrip": false,
918
+ "normalized": false,
919
+ "rstrip": false,
920
+ "single_word": false,
921
+ "special": true
922
+ },
923
+ "128115": {
924
+ "content": "<|reserved_special_token_107|>",
925
+ "lstrip": false,
926
+ "normalized": false,
927
+ "rstrip": false,
928
+ "single_word": false,
929
+ "special": true
930
+ },
931
+ "128116": {
932
+ "content": "<|reserved_special_token_108|>",
933
+ "lstrip": false,
934
+ "normalized": false,
935
+ "rstrip": false,
936
+ "single_word": false,
937
+ "special": true
938
+ },
939
+ "128117": {
940
+ "content": "<|reserved_special_token_109|>",
941
+ "lstrip": false,
942
+ "normalized": false,
943
+ "rstrip": false,
944
+ "single_word": false,
945
+ "special": true
946
+ },
947
+ "128118": {
948
+ "content": "<|reserved_special_token_110|>",
949
+ "lstrip": false,
950
+ "normalized": false,
951
+ "rstrip": false,
952
+ "single_word": false,
953
+ "special": true
954
+ },
955
+ "128119": {
956
+ "content": "<|reserved_special_token_111|>",
957
+ "lstrip": false,
958
+ "normalized": false,
959
+ "rstrip": false,
960
+ "single_word": false,
961
+ "special": true
962
+ },
963
+ "128120": {
964
+ "content": "<|reserved_special_token_112|>",
965
+ "lstrip": false,
966
+ "normalized": false,
967
+ "rstrip": false,
968
+ "single_word": false,
969
+ "special": true
970
+ },
971
+ "128121": {
972
+ "content": "<|reserved_special_token_113|>",
973
+ "lstrip": false,
974
+ "normalized": false,
975
+ "rstrip": false,
976
+ "single_word": false,
977
+ "special": true
978
+ },
979
+ "128122": {
980
+ "content": "<|reserved_special_token_114|>",
981
+ "lstrip": false,
982
+ "normalized": false,
983
+ "rstrip": false,
984
+ "single_word": false,
985
+ "special": true
986
+ },
987
+ "128123": {
988
+ "content": "<|reserved_special_token_115|>",
989
+ "lstrip": false,
990
+ "normalized": false,
991
+ "rstrip": false,
992
+ "single_word": false,
993
+ "special": true
994
+ },
995
+ "128124": {
996
+ "content": "<|reserved_special_token_116|>",
997
+ "lstrip": false,
998
+ "normalized": false,
999
+ "rstrip": false,
1000
+ "single_word": false,
1001
+ "special": true
1002
+ },
1003
+ "128125": {
1004
+ "content": "<|reserved_special_token_117|>",
1005
+ "lstrip": false,
1006
+ "normalized": false,
1007
+ "rstrip": false,
1008
+ "single_word": false,
1009
+ "special": true
1010
+ },
1011
+ "128126": {
1012
+ "content": "<|reserved_special_token_118|>",
1013
+ "lstrip": false,
1014
+ "normalized": false,
1015
+ "rstrip": false,
1016
+ "single_word": false,
1017
+ "special": true
1018
+ },
1019
+ "128127": {
1020
+ "content": "<|reserved_special_token_119|>",
1021
+ "lstrip": false,
1022
+ "normalized": false,
1023
+ "rstrip": false,
1024
+ "single_word": false,
1025
+ "special": true
1026
+ },
1027
+ "128128": {
1028
+ "content": "<|reserved_special_token_120|>",
1029
+ "lstrip": false,
1030
+ "normalized": false,
1031
+ "rstrip": false,
1032
+ "single_word": false,
1033
+ "special": true
1034
+ },
1035
+ "128129": {
1036
+ "content": "<|reserved_special_token_121|>",
1037
+ "lstrip": false,
1038
+ "normalized": false,
1039
+ "rstrip": false,
1040
+ "single_word": false,
1041
+ "special": true
1042
+ },
1043
+ "128130": {
1044
+ "content": "<|reserved_special_token_122|>",
1045
+ "lstrip": false,
1046
+ "normalized": false,
1047
+ "rstrip": false,
1048
+ "single_word": false,
1049
+ "special": true
1050
+ },
1051
+ "128131": {
1052
+ "content": "<|reserved_special_token_123|>",
1053
+ "lstrip": false,
1054
+ "normalized": false,
1055
+ "rstrip": false,
1056
+ "single_word": false,
1057
+ "special": true
1058
+ },
1059
+ "128132": {
1060
+ "content": "<|reserved_special_token_124|>",
1061
+ "lstrip": false,
1062
+ "normalized": false,
1063
+ "rstrip": false,
1064
+ "single_word": false,
1065
+ "special": true
1066
+ },
1067
+ "128133": {
1068
+ "content": "<|reserved_special_token_125|>",
1069
+ "lstrip": false,
1070
+ "normalized": false,
1071
+ "rstrip": false,
1072
+ "single_word": false,
1073
+ "special": true
1074
+ },
1075
+ "128134": {
1076
+ "content": "<|reserved_special_token_126|>",
1077
+ "lstrip": false,
1078
+ "normalized": false,
1079
+ "rstrip": false,
1080
+ "single_word": false,
1081
+ "special": true
1082
+ },
1083
+ "128135": {
1084
+ "content": "<|reserved_special_token_127|>",
1085
+ "lstrip": false,
1086
+ "normalized": false,
1087
+ "rstrip": false,
1088
+ "single_word": false,
1089
+ "special": true
1090
+ },
1091
+ "128136": {
1092
+ "content": "<|reserved_special_token_128|>",
1093
+ "lstrip": false,
1094
+ "normalized": false,
1095
+ "rstrip": false,
1096
+ "single_word": false,
1097
+ "special": true
1098
+ },
1099
+ "128137": {
1100
+ "content": "<|reserved_special_token_129|>",
1101
+ "lstrip": false,
1102
+ "normalized": false,
1103
+ "rstrip": false,
1104
+ "single_word": false,
1105
+ "special": true
1106
+ },
1107
+ "128138": {
1108
+ "content": "<|reserved_special_token_130|>",
1109
+ "lstrip": false,
1110
+ "normalized": false,
1111
+ "rstrip": false,
1112
+ "single_word": false,
1113
+ "special": true
1114
+ },
1115
+ "128139": {
1116
+ "content": "<|reserved_special_token_131|>",
1117
+ "lstrip": false,
1118
+ "normalized": false,
1119
+ "rstrip": false,
1120
+ "single_word": false,
1121
+ "special": true
1122
+ },
1123
+ "128140": {
1124
+ "content": "<|reserved_special_token_132|>",
1125
+ "lstrip": false,
1126
+ "normalized": false,
1127
+ "rstrip": false,
1128
+ "single_word": false,
1129
+ "special": true
1130
+ },
1131
+ "128141": {
1132
+ "content": "<|reserved_special_token_133|>",
1133
+ "lstrip": false,
1134
+ "normalized": false,
1135
+ "rstrip": false,
1136
+ "single_word": false,
1137
+ "special": true
1138
+ },
1139
+ "128142": {
1140
+ "content": "<|reserved_special_token_134|>",
1141
+ "lstrip": false,
1142
+ "normalized": false,
1143
+ "rstrip": false,
1144
+ "single_word": false,
1145
+ "special": true
1146
+ },
1147
+ "128143": {
1148
+ "content": "<|reserved_special_token_135|>",
1149
+ "lstrip": false,
1150
+ "normalized": false,
1151
+ "rstrip": false,
1152
+ "single_word": false,
1153
+ "special": true
1154
+ },
1155
+ "128144": {
1156
+ "content": "<|reserved_special_token_136|>",
1157
+ "lstrip": false,
1158
+ "normalized": false,
1159
+ "rstrip": false,
1160
+ "single_word": false,
1161
+ "special": true
1162
+ },
1163
+ "128145": {
1164
+ "content": "<|reserved_special_token_137|>",
1165
+ "lstrip": false,
1166
+ "normalized": false,
1167
+ "rstrip": false,
1168
+ "single_word": false,
1169
+ "special": true
1170
+ },
1171
+ "128146": {
1172
+ "content": "<|reserved_special_token_138|>",
1173
+ "lstrip": false,
1174
+ "normalized": false,
1175
+ "rstrip": false,
1176
+ "single_word": false,
1177
+ "special": true
1178
+ },
1179
+ "128147": {
1180
+ "content": "<|reserved_special_token_139|>",
1181
+ "lstrip": false,
1182
+ "normalized": false,
1183
+ "rstrip": false,
1184
+ "single_word": false,
1185
+ "special": true
1186
+ },
1187
+ "128148": {
1188
+ "content": "<|reserved_special_token_140|>",
1189
+ "lstrip": false,
1190
+ "normalized": false,
1191
+ "rstrip": false,
1192
+ "single_word": false,
1193
+ "special": true
1194
+ },
1195
+ "128149": {
1196
+ "content": "<|reserved_special_token_141|>",
1197
+ "lstrip": false,
1198
+ "normalized": false,
1199
+ "rstrip": false,
1200
+ "single_word": false,
1201
+ "special": true
1202
+ },
1203
+ "128150": {
1204
+ "content": "<|reserved_special_token_142|>",
1205
+ "lstrip": false,
1206
+ "normalized": false,
1207
+ "rstrip": false,
1208
+ "single_word": false,
1209
+ "special": true
1210
+ },
1211
+ "128151": {
1212
+ "content": "<|reserved_special_token_143|>",
1213
+ "lstrip": false,
1214
+ "normalized": false,
1215
+ "rstrip": false,
1216
+ "single_word": false,
1217
+ "special": true
1218
+ },
1219
+ "128152": {
1220
+ "content": "<|reserved_special_token_144|>",
1221
+ "lstrip": false,
1222
+ "normalized": false,
1223
+ "rstrip": false,
1224
+ "single_word": false,
1225
+ "special": true
1226
+ },
1227
+ "128153": {
1228
+ "content": "<|reserved_special_token_145|>",
1229
+ "lstrip": false,
1230
+ "normalized": false,
1231
+ "rstrip": false,
1232
+ "single_word": false,
1233
+ "special": true
1234
+ },
1235
+ "128154": {
1236
+ "content": "<|reserved_special_token_146|>",
1237
+ "lstrip": false,
1238
+ "normalized": false,
1239
+ "rstrip": false,
1240
+ "single_word": false,
1241
+ "special": true
1242
+ },
1243
+ "128155": {
1244
+ "content": "<|reserved_special_token_147|>",
1245
+ "lstrip": false,
1246
+ "normalized": false,
1247
+ "rstrip": false,
1248
+ "single_word": false,
1249
+ "special": true
1250
+ },
1251
+ "128156": {
1252
+ "content": "<|reserved_special_token_148|>",
1253
+ "lstrip": false,
1254
+ "normalized": false,
1255
+ "rstrip": false,
1256
+ "single_word": false,
1257
+ "special": true
1258
+ },
1259
+ "128157": {
1260
+ "content": "<|reserved_special_token_149|>",
1261
+ "lstrip": false,
1262
+ "normalized": false,
1263
+ "rstrip": false,
1264
+ "single_word": false,
1265
+ "special": true
1266
+ },
1267
+ "128158": {
1268
+ "content": "<|reserved_special_token_150|>",
1269
+ "lstrip": false,
1270
+ "normalized": false,
1271
+ "rstrip": false,
1272
+ "single_word": false,
1273
+ "special": true
1274
+ },
1275
+ "128159": {
1276
+ "content": "<|reserved_special_token_151|>",
1277
+ "lstrip": false,
1278
+ "normalized": false,
1279
+ "rstrip": false,
1280
+ "single_word": false,
1281
+ "special": true
1282
+ },
1283
+ "128160": {
1284
+ "content": "<|reserved_special_token_152|>",
1285
+ "lstrip": false,
1286
+ "normalized": false,
1287
+ "rstrip": false,
1288
+ "single_word": false,
1289
+ "special": true
1290
+ },
1291
+ "128161": {
1292
+ "content": "<|reserved_special_token_153|>",
1293
+ "lstrip": false,
1294
+ "normalized": false,
1295
+ "rstrip": false,
1296
+ "single_word": false,
1297
+ "special": true
1298
+ },
1299
+ "128162": {
1300
+ "content": "<|reserved_special_token_154|>",
1301
+ "lstrip": false,
1302
+ "normalized": false,
1303
+ "rstrip": false,
1304
+ "single_word": false,
1305
+ "special": true
1306
+ },
1307
+ "128163": {
1308
+ "content": "<|reserved_special_token_155|>",
1309
+ "lstrip": false,
1310
+ "normalized": false,
1311
+ "rstrip": false,
1312
+ "single_word": false,
1313
+ "special": true
1314
+ },
1315
+ "128164": {
1316
+ "content": "<|reserved_special_token_156|>",
1317
+ "lstrip": false,
1318
+ "normalized": false,
1319
+ "rstrip": false,
1320
+ "single_word": false,
1321
+ "special": true
1322
+ },
1323
+ "128165": {
1324
+ "content": "<|reserved_special_token_157|>",
1325
+ "lstrip": false,
1326
+ "normalized": false,
1327
+ "rstrip": false,
1328
+ "single_word": false,
1329
+ "special": true
1330
+ },
1331
+ "128166": {
1332
+ "content": "<|reserved_special_token_158|>",
1333
+ "lstrip": false,
1334
+ "normalized": false,
1335
+ "rstrip": false,
1336
+ "single_word": false,
1337
+ "special": true
1338
+ },
1339
+ "128167": {
1340
+ "content": "<|reserved_special_token_159|>",
1341
+ "lstrip": false,
1342
+ "normalized": false,
1343
+ "rstrip": false,
1344
+ "single_word": false,
1345
+ "special": true
1346
+ },
1347
+ "128168": {
1348
+ "content": "<|reserved_special_token_160|>",
1349
+ "lstrip": false,
1350
+ "normalized": false,
1351
+ "rstrip": false,
1352
+ "single_word": false,
1353
+ "special": true
1354
+ },
1355
+ "128169": {
1356
+ "content": "<|reserved_special_token_161|>",
1357
+ "lstrip": false,
1358
+ "normalized": false,
1359
+ "rstrip": false,
1360
+ "single_word": false,
1361
+ "special": true
1362
+ },
1363
+ "128170": {
1364
+ "content": "<|reserved_special_token_162|>",
1365
+ "lstrip": false,
1366
+ "normalized": false,
1367
+ "rstrip": false,
1368
+ "single_word": false,
1369
+ "special": true
1370
+ },
1371
+ "128171": {
1372
+ "content": "<|reserved_special_token_163|>",
1373
+ "lstrip": false,
1374
+ "normalized": false,
1375
+ "rstrip": false,
1376
+ "single_word": false,
1377
+ "special": true
1378
+ },
1379
+ "128172": {
1380
+ "content": "<|reserved_special_token_164|>",
1381
+ "lstrip": false,
1382
+ "normalized": false,
1383
+ "rstrip": false,
1384
+ "single_word": false,
1385
+ "special": true
1386
+ },
1387
+ "128173": {
1388
+ "content": "<|reserved_special_token_165|>",
1389
+ "lstrip": false,
1390
+ "normalized": false,
1391
+ "rstrip": false,
1392
+ "single_word": false,
1393
+ "special": true
1394
+ },
1395
+ "128174": {
1396
+ "content": "<|reserved_special_token_166|>",
1397
+ "lstrip": false,
1398
+ "normalized": false,
1399
+ "rstrip": false,
1400
+ "single_word": false,
1401
+ "special": true
1402
+ },
1403
+ "128175": {
1404
+ "content": "<|reserved_special_token_167|>",
1405
+ "lstrip": false,
1406
+ "normalized": false,
1407
+ "rstrip": false,
1408
+ "single_word": false,
1409
+ "special": true
1410
+ },
1411
+ "128176": {
1412
+ "content": "<|reserved_special_token_168|>",
1413
+ "lstrip": false,
1414
+ "normalized": false,
1415
+ "rstrip": false,
1416
+ "single_word": false,
1417
+ "special": true
1418
+ },
1419
+ "128177": {
1420
+ "content": "<|reserved_special_token_169|>",
1421
+ "lstrip": false,
1422
+ "normalized": false,
1423
+ "rstrip": false,
1424
+ "single_word": false,
1425
+ "special": true
1426
+ },
1427
+ "128178": {
1428
+ "content": "<|reserved_special_token_170|>",
1429
+ "lstrip": false,
1430
+ "normalized": false,
1431
+ "rstrip": false,
1432
+ "single_word": false,
1433
+ "special": true
1434
+ },
1435
+ "128179": {
1436
+ "content": "<|reserved_special_token_171|>",
1437
+ "lstrip": false,
1438
+ "normalized": false,
1439
+ "rstrip": false,
1440
+ "single_word": false,
1441
+ "special": true
1442
+ },
1443
+ "128180": {
1444
+ "content": "<|reserved_special_token_172|>",
1445
+ "lstrip": false,
1446
+ "normalized": false,
1447
+ "rstrip": false,
1448
+ "single_word": false,
1449
+ "special": true
1450
+ },
1451
+ "128181": {
1452
+ "content": "<|reserved_special_token_173|>",
1453
+ "lstrip": false,
1454
+ "normalized": false,
1455
+ "rstrip": false,
1456
+ "single_word": false,
1457
+ "special": true
1458
+ },
1459
+ "128182": {
1460
+ "content": "<|reserved_special_token_174|>",
1461
+ "lstrip": false,
1462
+ "normalized": false,
1463
+ "rstrip": false,
1464
+ "single_word": false,
1465
+ "special": true
1466
+ },
1467
+ "128183": {
1468
+ "content": "<|reserved_special_token_175|>",
1469
+ "lstrip": false,
1470
+ "normalized": false,
1471
+ "rstrip": false,
1472
+ "single_word": false,
1473
+ "special": true
1474
+ },
1475
+ "128184": {
1476
+ "content": "<|reserved_special_token_176|>",
1477
+ "lstrip": false,
1478
+ "normalized": false,
1479
+ "rstrip": false,
1480
+ "single_word": false,
1481
+ "special": true
1482
+ },
1483
+ "128185": {
1484
+ "content": "<|reserved_special_token_177|>",
1485
+ "lstrip": false,
1486
+ "normalized": false,
1487
+ "rstrip": false,
1488
+ "single_word": false,
1489
+ "special": true
1490
+ },
1491
+ "128186": {
1492
+ "content": "<|reserved_special_token_178|>",
1493
+ "lstrip": false,
1494
+ "normalized": false,
1495
+ "rstrip": false,
1496
+ "single_word": false,
1497
+ "special": true
1498
+ },
1499
+ "128187": {
1500
+ "content": "<|reserved_special_token_179|>",
1501
+ "lstrip": false,
1502
+ "normalized": false,
1503
+ "rstrip": false,
1504
+ "single_word": false,
1505
+ "special": true
1506
+ },
1507
+ "128188": {
1508
+ "content": "<|reserved_special_token_180|>",
1509
+ "lstrip": false,
1510
+ "normalized": false,
1511
+ "rstrip": false,
1512
+ "single_word": false,
1513
+ "special": true
1514
+ },
1515
+ "128189": {
1516
+ "content": "<|reserved_special_token_181|>",
1517
+ "lstrip": false,
1518
+ "normalized": false,
1519
+ "rstrip": false,
1520
+ "single_word": false,
1521
+ "special": true
1522
+ },
1523
+ "128190": {
1524
+ "content": "<|reserved_special_token_182|>",
1525
+ "lstrip": false,
1526
+ "normalized": false,
1527
+ "rstrip": false,
1528
+ "single_word": false,
1529
+ "special": true
1530
+ },
1531
+ "128191": {
1532
+ "content": "<|reserved_special_token_183|>",
1533
+ "lstrip": false,
1534
+ "normalized": false,
1535
+ "rstrip": false,
1536
+ "single_word": false,
1537
+ "special": true
1538
+ },
1539
+ "128192": {
1540
+ "content": "<|reserved_special_token_184|>",
1541
+ "lstrip": false,
1542
+ "normalized": false,
1543
+ "rstrip": false,
1544
+ "single_word": false,
1545
+ "special": true
1546
+ },
1547
+ "128193": {
1548
+ "content": "<|reserved_special_token_185|>",
1549
+ "lstrip": false,
1550
+ "normalized": false,
1551
+ "rstrip": false,
1552
+ "single_word": false,
1553
+ "special": true
1554
+ },
1555
+ "128194": {
1556
+ "content": "<|reserved_special_token_186|>",
1557
+ "lstrip": false,
1558
+ "normalized": false,
1559
+ "rstrip": false,
1560
+ "single_word": false,
1561
+ "special": true
1562
+ },
1563
+ "128195": {
1564
+ "content": "<|reserved_special_token_187|>",
1565
+ "lstrip": false,
1566
+ "normalized": false,
1567
+ "rstrip": false,
1568
+ "single_word": false,
1569
+ "special": true
1570
+ },
1571
+ "128196": {
1572
+ "content": "<|reserved_special_token_188|>",
1573
+ "lstrip": false,
1574
+ "normalized": false,
1575
+ "rstrip": false,
1576
+ "single_word": false,
1577
+ "special": true
1578
+ },
1579
+ "128197": {
1580
+ "content": "<|reserved_special_token_189|>",
1581
+ "lstrip": false,
1582
+ "normalized": false,
1583
+ "rstrip": false,
1584
+ "single_word": false,
1585
+ "special": true
1586
+ },
1587
+ "128198": {
1588
+ "content": "<|reserved_special_token_190|>",
1589
+ "lstrip": false,
1590
+ "normalized": false,
1591
+ "rstrip": false,
1592
+ "single_word": false,
1593
+ "special": true
1594
+ },
1595
+ "128199": {
1596
+ "content": "<|reserved_special_token_191|>",
1597
+ "lstrip": false,
1598
+ "normalized": false,
1599
+ "rstrip": false,
1600
+ "single_word": false,
1601
+ "special": true
1602
+ },
1603
+ "128200": {
1604
+ "content": "<|reserved_special_token_192|>",
1605
+ "lstrip": false,
1606
+ "normalized": false,
1607
+ "rstrip": false,
1608
+ "single_word": false,
1609
+ "special": true
1610
+ },
1611
+ "128201": {
1612
+ "content": "<|reserved_special_token_193|>",
1613
+ "lstrip": false,
1614
+ "normalized": false,
1615
+ "rstrip": false,
1616
+ "single_word": false,
1617
+ "special": true
1618
+ },
1619
+ "128202": {
1620
+ "content": "<|reserved_special_token_194|>",
1621
+ "lstrip": false,
1622
+ "normalized": false,
1623
+ "rstrip": false,
1624
+ "single_word": false,
1625
+ "special": true
1626
+ },
1627
+ "128203": {
1628
+ "content": "<|reserved_special_token_195|>",
1629
+ "lstrip": false,
1630
+ "normalized": false,
1631
+ "rstrip": false,
1632
+ "single_word": false,
1633
+ "special": true
1634
+ },
1635
+ "128204": {
1636
+ "content": "<|reserved_special_token_196|>",
1637
+ "lstrip": false,
1638
+ "normalized": false,
1639
+ "rstrip": false,
1640
+ "single_word": false,
1641
+ "special": true
1642
+ },
1643
+ "128205": {
1644
+ "content": "<|reserved_special_token_197|>",
1645
+ "lstrip": false,
1646
+ "normalized": false,
1647
+ "rstrip": false,
1648
+ "single_word": false,
1649
+ "special": true
1650
+ },
1651
+ "128206": {
1652
+ "content": "<|reserved_special_token_198|>",
1653
+ "lstrip": false,
1654
+ "normalized": false,
1655
+ "rstrip": false,
1656
+ "single_word": false,
1657
+ "special": true
1658
+ },
1659
+ "128207": {
1660
+ "content": "<|reserved_special_token_199|>",
1661
+ "lstrip": false,
1662
+ "normalized": false,
1663
+ "rstrip": false,
1664
+ "single_word": false,
1665
+ "special": true
1666
+ },
1667
+ "128208": {
1668
+ "content": "<|reserved_special_token_200|>",
1669
+ "lstrip": false,
1670
+ "normalized": false,
1671
+ "rstrip": false,
1672
+ "single_word": false,
1673
+ "special": true
1674
+ },
1675
+ "128209": {
1676
+ "content": "<|reserved_special_token_201|>",
1677
+ "lstrip": false,
1678
+ "normalized": false,
1679
+ "rstrip": false,
1680
+ "single_word": false,
1681
+ "special": true
1682
+ },
1683
+ "128210": {
1684
+ "content": "<|reserved_special_token_202|>",
1685
+ "lstrip": false,
1686
+ "normalized": false,
1687
+ "rstrip": false,
1688
+ "single_word": false,
1689
+ "special": true
1690
+ },
1691
+ "128211": {
1692
+ "content": "<|reserved_special_token_203|>",
1693
+ "lstrip": false,
1694
+ "normalized": false,
1695
+ "rstrip": false,
1696
+ "single_word": false,
1697
+ "special": true
1698
+ },
1699
+ "128212": {
1700
+ "content": "<|reserved_special_token_204|>",
1701
+ "lstrip": false,
1702
+ "normalized": false,
1703
+ "rstrip": false,
1704
+ "single_word": false,
1705
+ "special": true
1706
+ },
1707
+ "128213": {
1708
+ "content": "<|reserved_special_token_205|>",
1709
+ "lstrip": false,
1710
+ "normalized": false,
1711
+ "rstrip": false,
1712
+ "single_word": false,
1713
+ "special": true
1714
+ },
1715
+ "128214": {
1716
+ "content": "<|reserved_special_token_206|>",
1717
+ "lstrip": false,
1718
+ "normalized": false,
1719
+ "rstrip": false,
1720
+ "single_word": false,
1721
+ "special": true
1722
+ },
1723
+ "128215": {
1724
+ "content": "<|reserved_special_token_207|>",
1725
+ "lstrip": false,
1726
+ "normalized": false,
1727
+ "rstrip": false,
1728
+ "single_word": false,
1729
+ "special": true
1730
+ },
1731
+ "128216": {
1732
+ "content": "<|reserved_special_token_208|>",
1733
+ "lstrip": false,
1734
+ "normalized": false,
1735
+ "rstrip": false,
1736
+ "single_word": false,
1737
+ "special": true
1738
+ },
1739
+ "128217": {
1740
+ "content": "<|reserved_special_token_209|>",
1741
+ "lstrip": false,
1742
+ "normalized": false,
1743
+ "rstrip": false,
1744
+ "single_word": false,
1745
+ "special": true
1746
+ },
1747
+ "128218": {
1748
+ "content": "<|reserved_special_token_210|>",
1749
+ "lstrip": false,
1750
+ "normalized": false,
1751
+ "rstrip": false,
1752
+ "single_word": false,
1753
+ "special": true
1754
+ },
1755
+ "128219": {
1756
+ "content": "<|reserved_special_token_211|>",
1757
+ "lstrip": false,
1758
+ "normalized": false,
1759
+ "rstrip": false,
1760
+ "single_word": false,
1761
+ "special": true
1762
+ },
1763
+ "128220": {
1764
+ "content": "<|reserved_special_token_212|>",
1765
+ "lstrip": false,
1766
+ "normalized": false,
1767
+ "rstrip": false,
1768
+ "single_word": false,
1769
+ "special": true
1770
+ },
1771
+ "128221": {
1772
+ "content": "<|reserved_special_token_213|>",
1773
+ "lstrip": false,
1774
+ "normalized": false,
1775
+ "rstrip": false,
1776
+ "single_word": false,
1777
+ "special": true
1778
+ },
1779
+ "128222": {
1780
+ "content": "<|reserved_special_token_214|>",
1781
+ "lstrip": false,
1782
+ "normalized": false,
1783
+ "rstrip": false,
1784
+ "single_word": false,
1785
+ "special": true
1786
+ },
1787
+ "128223": {
1788
+ "content": "<|reserved_special_token_215|>",
1789
+ "lstrip": false,
1790
+ "normalized": false,
1791
+ "rstrip": false,
1792
+ "single_word": false,
1793
+ "special": true
1794
+ },
1795
+ "128224": {
1796
+ "content": "<|reserved_special_token_216|>",
1797
+ "lstrip": false,
1798
+ "normalized": false,
1799
+ "rstrip": false,
1800
+ "single_word": false,
1801
+ "special": true
1802
+ },
1803
+ "128225": {
1804
+ "content": "<|reserved_special_token_217|>",
1805
+ "lstrip": false,
1806
+ "normalized": false,
1807
+ "rstrip": false,
1808
+ "single_word": false,
1809
+ "special": true
1810
+ },
1811
+ "128226": {
1812
+ "content": "<|reserved_special_token_218|>",
1813
+ "lstrip": false,
1814
+ "normalized": false,
1815
+ "rstrip": false,
1816
+ "single_word": false,
1817
+ "special": true
1818
+ },
1819
+ "128227": {
1820
+ "content": "<|reserved_special_token_219|>",
1821
+ "lstrip": false,
1822
+ "normalized": false,
1823
+ "rstrip": false,
1824
+ "single_word": false,
1825
+ "special": true
1826
+ },
1827
+ "128228": {
1828
+ "content": "<|reserved_special_token_220|>",
1829
+ "lstrip": false,
1830
+ "normalized": false,
1831
+ "rstrip": false,
1832
+ "single_word": false,
1833
+ "special": true
1834
+ },
1835
+ "128229": {
1836
+ "content": "<|reserved_special_token_221|>",
1837
+ "lstrip": false,
1838
+ "normalized": false,
1839
+ "rstrip": false,
1840
+ "single_word": false,
1841
+ "special": true
1842
+ },
1843
+ "128230": {
1844
+ "content": "<|reserved_special_token_222|>",
1845
+ "lstrip": false,
1846
+ "normalized": false,
1847
+ "rstrip": false,
1848
+ "single_word": false,
1849
+ "special": true
1850
+ },
1851
+ "128231": {
1852
+ "content": "<|reserved_special_token_223|>",
1853
+ "lstrip": false,
1854
+ "normalized": false,
1855
+ "rstrip": false,
1856
+ "single_word": false,
1857
+ "special": true
1858
+ },
1859
+ "128232": {
1860
+ "content": "<|reserved_special_token_224|>",
1861
+ "lstrip": false,
1862
+ "normalized": false,
1863
+ "rstrip": false,
1864
+ "single_word": false,
1865
+ "special": true
1866
+ },
1867
+ "128233": {
1868
+ "content": "<|reserved_special_token_225|>",
1869
+ "lstrip": false,
1870
+ "normalized": false,
1871
+ "rstrip": false,
1872
+ "single_word": false,
1873
+ "special": true
1874
+ },
1875
+ "128234": {
1876
+ "content": "<|reserved_special_token_226|>",
1877
+ "lstrip": false,
1878
+ "normalized": false,
1879
+ "rstrip": false,
1880
+ "single_word": false,
1881
+ "special": true
1882
+ },
1883
+ "128235": {
1884
+ "content": "<|reserved_special_token_227|>",
1885
+ "lstrip": false,
1886
+ "normalized": false,
1887
+ "rstrip": false,
1888
+ "single_word": false,
1889
+ "special": true
1890
+ },
1891
+ "128236": {
1892
+ "content": "<|reserved_special_token_228|>",
1893
+ "lstrip": false,
1894
+ "normalized": false,
1895
+ "rstrip": false,
1896
+ "single_word": false,
1897
+ "special": true
1898
+ },
1899
+ "128237": {
1900
+ "content": "<|reserved_special_token_229|>",
1901
+ "lstrip": false,
1902
+ "normalized": false,
1903
+ "rstrip": false,
1904
+ "single_word": false,
1905
+ "special": true
1906
+ },
1907
+ "128238": {
1908
+ "content": "<|reserved_special_token_230|>",
1909
+ "lstrip": false,
1910
+ "normalized": false,
1911
+ "rstrip": false,
1912
+ "single_word": false,
1913
+ "special": true
1914
+ },
1915
+ "128239": {
1916
+ "content": "<|reserved_special_token_231|>",
1917
+ "lstrip": false,
1918
+ "normalized": false,
1919
+ "rstrip": false,
1920
+ "single_word": false,
1921
+ "special": true
1922
+ },
1923
+ "128240": {
1924
+ "content": "<|reserved_special_token_232|>",
1925
+ "lstrip": false,
1926
+ "normalized": false,
1927
+ "rstrip": false,
1928
+ "single_word": false,
1929
+ "special": true
1930
+ },
1931
+ "128241": {
1932
+ "content": "<|reserved_special_token_233|>",
1933
+ "lstrip": false,
1934
+ "normalized": false,
1935
+ "rstrip": false,
1936
+ "single_word": false,
1937
+ "special": true
1938
+ },
1939
+ "128242": {
1940
+ "content": "<|reserved_special_token_234|>",
1941
+ "lstrip": false,
1942
+ "normalized": false,
1943
+ "rstrip": false,
1944
+ "single_word": false,
1945
+ "special": true
1946
+ },
1947
+ "128243": {
1948
+ "content": "<|reserved_special_token_235|>",
1949
+ "lstrip": false,
1950
+ "normalized": false,
1951
+ "rstrip": false,
1952
+ "single_word": false,
1953
+ "special": true
1954
+ },
1955
+ "128244": {
1956
+ "content": "<|reserved_special_token_236|>",
1957
+ "lstrip": false,
1958
+ "normalized": false,
1959
+ "rstrip": false,
1960
+ "single_word": false,
1961
+ "special": true
1962
+ },
1963
+ "128245": {
1964
+ "content": "<|reserved_special_token_237|>",
1965
+ "lstrip": false,
1966
+ "normalized": false,
1967
+ "rstrip": false,
1968
+ "single_word": false,
1969
+ "special": true
1970
+ },
1971
+ "128246": {
1972
+ "content": "<|reserved_special_token_238|>",
1973
+ "lstrip": false,
1974
+ "normalized": false,
1975
+ "rstrip": false,
1976
+ "single_word": false,
1977
+ "special": true
1978
+ },
1979
+ "128247": {
1980
+ "content": "<|reserved_special_token_239|>",
1981
+ "lstrip": false,
1982
+ "normalized": false,
1983
+ "rstrip": false,
1984
+ "single_word": false,
1985
+ "special": true
1986
+ },
1987
+ "128248": {
1988
+ "content": "<|reserved_special_token_240|>",
1989
+ "lstrip": false,
1990
+ "normalized": false,
1991
+ "rstrip": false,
1992
+ "single_word": false,
1993
+ "special": true
1994
+ },
1995
+ "128249": {
1996
+ "content": "<|reserved_special_token_241|>",
1997
+ "lstrip": false,
1998
+ "normalized": false,
1999
+ "rstrip": false,
2000
+ "single_word": false,
2001
+ "special": true
2002
+ },
2003
+ "128250": {
2004
+ "content": "<|reserved_special_token_242|>",
2005
+ "lstrip": false,
2006
+ "normalized": false,
2007
+ "rstrip": false,
2008
+ "single_word": false,
2009
+ "special": true
2010
+ },
2011
+ "128251": {
2012
+ "content": "<|reserved_special_token_243|>",
2013
+ "lstrip": false,
2014
+ "normalized": false,
2015
+ "rstrip": false,
2016
+ "single_word": false,
2017
+ "special": true
2018
+ },
2019
+ "128252": {
2020
+ "content": "<|reserved_special_token_244|>",
2021
+ "lstrip": false,
2022
+ "normalized": false,
2023
+ "rstrip": false,
2024
+ "single_word": false,
2025
+ "special": true
2026
+ },
2027
+ "128253": {
2028
+ "content": "<|reserved_special_token_245|>",
2029
+ "lstrip": false,
2030
+ "normalized": false,
2031
+ "rstrip": false,
2032
+ "single_word": false,
2033
+ "special": true
2034
+ },
2035
+ "128254": {
2036
+ "content": "<|reserved_special_token_246|>",
2037
+ "lstrip": false,
2038
+ "normalized": false,
2039
+ "rstrip": false,
2040
+ "single_word": false,
2041
+ "special": true
2042
+ },
2043
+ "128255": {
2044
+ "content": "<|reserved_special_token_247|>",
2045
+ "lstrip": false,
2046
+ "normalized": false,
2047
+ "rstrip": false,
2048
+ "single_word": false,
2049
+ "special": true
2050
+ }
2051
+ },
2052
+ "bos_token": "<|begin_of_text|>",
2053
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
2054
+ "clean_up_tokenization_spaces": true,
2055
+ "eos_token": "<|eot_id|>",
2056
+ "model_input_names": [
2057
+ "input_ids",
2058
+ "attention_mask"
2059
+ ],
2060
+ "model_max_length": 131072,
2061
+ "tokenizer_class": "PreTrainedTokenizerFast"
2062
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.0778410631950095,
3
+ "num_input_tokens_seen": 7732199424,
4
+ "train_loss": 0.036126901625228955,
5
+ "train_runtime": 9429.4679,
6
+ "train_samples_per_second": 0.782,
7
+ "train_steps_per_second": 0.391
8
+ }
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0c16558cb168f94a8e3ea814f2ed1f2dd4c0ffd2157ce5165c1d9f7a30dce59
3
+ size 5560
wandb/debug-cli.ctpham_umass_edu.log ADDED
File without changes
wandb/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/debug.log ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-01-21 08:33:13,169 INFO MainThread:285553 [wandb_setup.py:_flush():76] Current SDK version is 0.17.3
2
+ 2025-01-21 08:33:13,169 INFO MainThread:285553 [wandb_setup.py:_flush():76] Configure stats pid to 285553
3
+ 2025-01-21 08:33:13,169 INFO MainThread:285553 [wandb_setup.py:_flush():76] Loading settings from /home/ctpham_umass_edu/.config/wandb/settings
4
+ 2025-01-21 08:33:13,170 INFO MainThread:285553 [wandb_setup.py:_flush():76] Loading settings from /work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/prolong-final/wandb/settings
5
+ 2025-01-21 08:33:13,170 INFO MainThread:285553 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'root_dir': '/scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_', 'project': 'prolong', 'api_key': '***REDACTED***', 'mode': 'online'}
6
+ 2025-01-21 08:33:13,170 INFO MainThread:285553 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2025-01-21 08:33:13,170 INFO MainThread:285553 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'prolong-final/finetune.py', 'program_abspath': '/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/prolong-final/finetune.py', 'program': '/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/prolong-final/finetune.py'}
8
+ 2025-01-21 08:33:13,170 INFO MainThread:285553 [wandb_setup.py:_flush():76] Applying login settings: {}
9
+ 2025-01-21 08:33:13,170 INFO MainThread:285553 [wandb_init.py:_log_setup():520] Logging user logs to /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20250121_083313-ma4iz8vp/logs/debug.log
10
+ 2025-01-21 08:33:13,170 INFO MainThread:285553 [wandb_init.py:_log_setup():521] Logging internal logs to /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20250121_083313-ma4iz8vp/logs/debug-internal.log
11
+ 2025-01-21 08:33:13,170 INFO MainThread:285553 [wandb_init.py:init():560] calling init triggers
12
+ 2025-01-21 08:33:13,170 INFO MainThread:285553 [wandb_init.py:init():567] wandb.init called with sweep_config: {}
13
+ config: {}
14
+ 2025-01-21 08:33:13,170 INFO MainThread:285553 [wandb_init.py:init():610] starting backend
15
+ 2025-01-21 08:33:13,170 INFO MainThread:285553 [wandb_init.py:init():614] setting up manager
16
+ 2025-01-21 08:33:13,173 INFO MainThread:285553 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2025-01-21 08:33:13,176 INFO MainThread:285553 [wandb_init.py:init():622] backend started and connected
18
+ 2025-01-21 08:33:13,185 INFO MainThread:285553 [wandb_init.py:init():711] updated telemetry
19
+ 2025-01-21 08:33:13,225 INFO MainThread:285553 [wandb_init.py:init():744] communicating run to backend with 90.0 second timeout
20
+ 2025-01-21 08:33:13,495 INFO MainThread:285553 [wandb_run.py:_on_init():2402] communicating current version
21
+ 2025-01-21 08:33:13,562 INFO MainThread:285553 [wandb_run.py:_on_init():2411] got version response upgrade_message: "wandb version 0.19.4 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
22
+
23
+ 2025-01-21 08:33:13,562 INFO MainThread:285553 [wandb_init.py:init():795] starting run threads in backend
24
+ 2025-01-21 08:33:20,849 INFO MainThread:285553 [wandb_run.py:_console_start():2380] atexit reg
25
+ 2025-01-21 08:33:20,849 INFO MainThread:285553 [wandb_run.py:_redirect():2235] redirect: wrap_raw
26
+ 2025-01-21 08:33:20,849 INFO MainThread:285553 [wandb_run.py:_redirect():2300] Wrapping output streams.
27
+ 2025-01-21 08:33:20,850 INFO MainThread:285553 [wandb_run.py:_redirect():2325] Redirects installed.
28
+ 2025-01-21 08:33:20,851 INFO MainThread:285553 [wandb_init.py:init():838] run started, returning control to user process
29
+ 2025-01-21 08:33:20,853 INFO MainThread:285553 [wandb_run.py:_config_callback():1382] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 131072, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 500000.0, 'rope_scaling': {'factor': 8.0, 'low_freq_factor': 1.0, 'high_freq_factor': 4.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': 0, 'eos_token_id': [128001, 128008, 128009], 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '/datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/', 'transformers_version': '4.44.2', 'model_type': 'llama', 'output_dir': '/scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_', 'overwrite_output_dir': False, 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 1e-06, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.05, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/runs/Jan21_08-26-17_gpu021', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 200, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 1, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_', 'disable_tqdm': True, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': True, 'fsdp': ['auto_wrap', 'offload'], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': False, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': None, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'eval_use_gather_object': False, 'min_lr_ratio': 0.1, 'cuda_empty_cache': True, 'streaming_dataset': True, 'seq_parallel_size': 8}
30
+ 2025-01-21 08:33:20,857 INFO MainThread:285553 [wandb_config.py:__setitem__():151] config set model/num_parameters = 1003782656 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x72984c36eda0>>
31
+ 2025-01-21 08:33:20,857 INFO MainThread:285553 [wandb_run.py:_config_callback():1382] config_cb model/num_parameters 1003782656 None
32
+ 2025-01-21 11:11:55,795 WARNING MsgRouterThr:285553 [router.py:message_loop():77] message_loop has been closed
wandb/run-20241231_100054-t2idt8o9/files/conda-environment.yaml ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: /scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final
2
+ channels:
3
+ - conda-forge
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=conda_forge
6
+ - _openmp_mutex=4.5=2_gnu
7
+ - bzip2=1.0.8=h4bc722e_7
8
+ - ca-certificates=2024.12.14=hbcca054_0
9
+ - ld_impl_linux-64=2.43=h712a8e2_2
10
+ - libffi=3.4.2=h7f98852_5
11
+ - libgcc=14.2.0=h77fa898_1
12
+ - libgcc-ng=14.2.0=h69a702a_1
13
+ - libgomp=14.2.0=h77fa898_1
14
+ - liblzma=5.6.3=hb9d3cd8_1
15
+ - liblzma-devel=5.6.3=hb9d3cd8_1
16
+ - libnsl=2.0.1=hd590300_0
17
+ - libsqlite=3.47.2=hee588c1_0
18
+ - libuuid=2.38.1=h0b41bf4_0
19
+ - libzlib=1.3.1=hb9d3cd8_2
20
+ - ncurses=6.5=he02047a_1
21
+ - openssl=3.4.0=hb9d3cd8_0
22
+ - pip=24.3.1=pyh8b19718_2
23
+ - python=3.10.0=h543edf9_3_cpython
24
+ - readline=8.2=h8228510_1
25
+ - setuptools=75.6.0=pyhff2d567_1
26
+ - sqlite=3.47.2=h9eae976_0
27
+ - tk=8.6.13=noxft_h4845f30_101
28
+ - wheel=0.45.1=pyhd8ed1ab_1
29
+ - xz=5.6.3=hbcc6ac9_1
30
+ - xz-gpl-tools=5.6.3=hbcc6ac9_1
31
+ - xz-tools=5.6.3=hb9d3cd8_1
32
+ - pip:
33
+ - accelerate==0.32.1
34
+ - aiohappyeyeballs==2.4.3
35
+ - aiohttp==3.11.2
36
+ - aioprometheus==23.12.0
37
+ - aiosignal==1.3.1
38
+ - annotated-types==0.7.0
39
+ - anthropic==0.39.0
40
+ - anyio==4.6.2.post1
41
+ - argcomplete==3.5.1
42
+ - arrow==1.3.0
43
+ - async-timeout==5.0.1
44
+ - attrs==24.2.0
45
+ - azure-core==1.32.0
46
+ - azure-identity==1.19.0
47
+ - azure-storage-blob==12.24.0
48
+ - azure-storage-file-datalake==12.18.0
49
+ - backoff==2.2.1
50
+ - bcrypt==4.2.0
51
+ - blobfile==3.0.0
52
+ - boto3==1.35.63
53
+ - botocore==1.35.63
54
+ - brotli==1.1.0
55
+ - cachetools==5.5.0
56
+ - certifi==2024.8.30
57
+ - cffi==1.17.1
58
+ - charset-normalizer==3.4.0
59
+ - circuitbreaker==2.0.0
60
+ - click==8.1.7
61
+ - cloudpickle==3.1.0
62
+ - compressed-tensors==0.8.0
63
+ - contourpy==1.3.1
64
+ - cramjam==2.9.0
65
+ - cryptography==43.0.3
66
+ - cycler==0.12.1
67
+ - datasets==2.20.0
68
+ - debugpy==1.8.11
69
+ - dill==0.3.8
70
+ - diskcache==5.6.3
71
+ - distro==1.9.0
72
+ - docker-pycreds==0.4.0
73
+ - docstring-parser==0.16
74
+ - einops==0.8.0
75
+ - fastapi==0.115.5
76
+ - filelock==3.16.1
77
+ - flash-attn==2.6.1
78
+ - fonttools==4.55.0
79
+ - frozenlist==1.5.0
80
+ - fsspec==2024.5.0
81
+ - gguf==0.10.0
82
+ - gitdb==4.0.11
83
+ - gitpython==3.1.43
84
+ - google-api-core==2.23.0
85
+ - google-auth==2.36.0
86
+ - google-cloud-aiplatform==1.71.1
87
+ - google-cloud-bigquery==3.27.0
88
+ - google-cloud-core==2.4.1
89
+ - google-cloud-resource-manager==1.13.1
90
+ - google-cloud-storage==2.10.0
91
+ - google-crc32c==1.6.0
92
+ - google-resumable-media==2.7.2
93
+ - googleapis-common-protos==1.66.0
94
+ - gql==3.5.0
95
+ - graphql-core==3.2.5
96
+ - grpc-google-iam-v1==0.13.1
97
+ - grpcio==1.68.0
98
+ - grpcio-status==1.62.3
99
+ - h11==0.14.0
100
+ - httpcore==1.0.7
101
+ - httptools==0.6.4
102
+ - httpx==0.27.2
103
+ - huggingface-hub==0.26.2
104
+ - idna==3.10
105
+ - importlib-metadata==8.5.0
106
+ - interegular==0.3.3
107
+ - ipython==8.18.0
108
+ - isodate==0.7.2
109
+ - jedi==0.19.2
110
+ - jinja2==3.1.4
111
+ - jiter==0.7.1
112
+ - jmespath==1.0.1
113
+ - jsonschema==4.23.0
114
+ - jsonschema-specifications==2024.10.1
115
+ - kiwisolver==1.4.7
116
+ - lark==1.2.2
117
+ - llvmlite==0.43.0
118
+ - lm-format-enforcer==0.10.9
119
+ - lxml==5.3.0
120
+ - markdown-it-py==3.0.0
121
+ - markupsafe==3.0.2
122
+ - matplotlib==3.9.2
123
+ - mdurl==0.1.2
124
+ - mosaicml-cli==0.5.34
125
+ - mosaicml-streaming==0.8.1
126
+ - mpmath==1.3.0
127
+ - msal==1.31.1
128
+ - msal-extensions==1.2.0
129
+ - msgpack==1.1.0
130
+ - msgspec==0.18.6
131
+ - multidict==6.1.0
132
+ - multiprocess==0.70.16
133
+ - networkx==3.4.2
134
+ - ninja==1.11.1.1
135
+ - numba==0.60.0
136
+ - numpy==1.26.4
137
+ - nvidia-cublas-cu12==12.1.3.1
138
+ - nvidia-cuda-cupti-cu12==12.1.105
139
+ - nvidia-cuda-nvrtc-cu12==12.1.105
140
+ - nvidia-cuda-runtime-cu12==12.1.105
141
+ - nvidia-cudnn-cu12==9.1.0.70
142
+ - nvidia-cufft-cu12==11.0.2.54
143
+ - nvidia-curand-cu12==10.3.2.106
144
+ - nvidia-cusolver-cu12==11.4.5.107
145
+ - nvidia-cusparse-cu12==12.1.0.106
146
+ - nvidia-ml-py==12.560.30
147
+ - nvidia-nccl-cu12==2.20.5
148
+ - nvidia-nvjitlink-cu12==12.4.127
149
+ - nvidia-nvtx-cu12==12.1.105
150
+ - oci==2.138.1
151
+ - openai==1.54.5
152
+ - opencv-python-headless==4.10.0.84
153
+ - orjson==3.10.11
154
+ - outlines==0.0.46
155
+ - packaging==24.1
156
+ - pandas==2.2.1
157
+ - paramiko==3.5.0
158
+ - partial-json-parser==0.2.1.1.post4
159
+ - pillow==10.4.0
160
+ - portalocker==2.10.1
161
+ - prometheus-client==0.21.0
162
+ - prometheus-fastapi-instrumentator==7.0.0
163
+ - prompt-toolkit==3.0.36
164
+ - propcache==0.2.0
165
+ - proto-plus==1.25.0
166
+ - protobuf==4.25.3
167
+ - py-cpuinfo==9.0.0
168
+ - pyairports==2.1.1
169
+ - pyarrow==18.0.0
170
+ - pyarrow-hotfix==0.6
171
+ - pyasn1==0.6.1
172
+ - pyasn1-modules==0.4.1
173
+ - pycountry==24.6.1
174
+ - pycparser==2.22
175
+ - pycryptodomex==3.21.0
176
+ - pydantic==2.9.2
177
+ - pydantic-core==2.23.4
178
+ - pyjwt==2.10.0
179
+ - pynacl==1.5.0
180
+ - pyopenssl==24.2.1
181
+ - pyparsing==3.2.0
182
+ - python-dateutil==2.9.0
183
+ - python-dotenv==1.0.1
184
+ - python-snappy==0.7.3
185
+ - pytz==2024.2
186
+ - pyyaml==6.0.2
187
+ - quantile-python==1.1
188
+ - questionary==2.0.1
189
+ - ray==2.39.0
190
+ - referencing==0.35.1
191
+ - regex==2023.12.25
192
+ - requests==2.32.3
193
+ - rich==13.9.4
194
+ - rotary-emb==0.5.2
195
+ - rpds-py==0.21.0
196
+ - rsa==4.9
197
+ - ruamel-yaml==0.18.6
198
+ - ruamel-yaml-clib==0.2.12
199
+ - s3transfer==0.10.3
200
+ - safetensors==0.4.5
201
+ - sentencepiece==0.1.99
202
+ - sentry-sdk==2.18.0
203
+ - setproctitle==1.3.4
204
+ - shapely==2.0.6
205
+ - simple-parsing==0.1.6
206
+ - smmap==5.0.1
207
+ - sniffio==1.3.1
208
+ - starlette==0.41.3
209
+ - sympy==1.13.1
210
+ - tiktoken==0.7.0
211
+ - tokenizers==0.19.1
212
+ - torch==2.4.1
213
+ - torchvision==0.19.1
214
+ - tqdm==4.66.4
215
+ - transformers==4.44.2
216
+ - triton==3.0.0
217
+ - types-python-dateutil==2.9.0.20241003
218
+ - tzdata==2024.2
219
+ - urllib3==2.2.3
220
+ - uvicorn==0.32.0
221
+ - uvloop==0.21.0
222
+ - validators==0.34.0
223
+ - vertexai==1.71.1
224
+ - wandb==0.17.3
225
+ - watchfiles==0.24.0
226
+ - websockets==11.0.3
227
+ - xformers==0.0.28.post1
228
+ - xxhash==3.5.0
229
+ - yarl==1.17.2
230
+ - zipp==3.21.0
231
+ - zstandard==0.23.0
232
+ - zstd==1.5.5.1
233
+ prefix: /scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final
wandb/run-20241231_100054-t2idt8o9/files/config.yaml ADDED
@@ -0,0 +1,713 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ python_version: 3.10.0
7
+ cli_version: 0.17.3
8
+ framework: huggingface
9
+ huggingface_version: 4.44.2
10
+ is_jupyter_run: false
11
+ is_kaggle_kernel: false
12
+ start_time: 1735639254
13
+ t:
14
+ 1:
15
+ - 1
16
+ - 11
17
+ - 41
18
+ - 49
19
+ - 51
20
+ - 55
21
+ - 71
22
+ - 105
23
+ 2:
24
+ - 1
25
+ - 11
26
+ - 41
27
+ - 49
28
+ - 51
29
+ - 55
30
+ - 71
31
+ - 105
32
+ 3:
33
+ - 7
34
+ - 13
35
+ - 19
36
+ - 23
37
+ - 66
38
+ 4: 3.10.0
39
+ 5: 0.17.3
40
+ 6: 4.44.2
41
+ 8:
42
+ - 5
43
+ 9:
44
+ 1: transformers_trainer
45
+ 13: linux-x86_64
46
+ m:
47
+ - 1: train/global_step
48
+ 6:
49
+ - 3
50
+ - 1: train/loss
51
+ 5: 1
52
+ 6:
53
+ - 1
54
+ - 1: train/grad_norm
55
+ 5: 1
56
+ 6:
57
+ - 1
58
+ - 1: train/learning_rate
59
+ 5: 1
60
+ 6:
61
+ - 1
62
+ - 1: train/epoch
63
+ 5: 1
64
+ 6:
65
+ - 1
66
+ - 1: train/num_input_tokens_seen
67
+ 5: 1
68
+ 6:
69
+ - 1
70
+ vocab_size:
71
+ desc: null
72
+ value: 128256
73
+ max_position_embeddings:
74
+ desc: null
75
+ value: 131072
76
+ hidden_size:
77
+ desc: null
78
+ value: 4096
79
+ intermediate_size:
80
+ desc: null
81
+ value: 14336
82
+ num_hidden_layers:
83
+ desc: null
84
+ value: 32
85
+ num_attention_heads:
86
+ desc: null
87
+ value: 32
88
+ num_key_value_heads:
89
+ desc: null
90
+ value: 8
91
+ hidden_act:
92
+ desc: null
93
+ value: silu
94
+ initializer_range:
95
+ desc: null
96
+ value: 0.02
97
+ rms_norm_eps:
98
+ desc: null
99
+ value: 1.0e-05
100
+ pretraining_tp:
101
+ desc: null
102
+ value: 1
103
+ use_cache:
104
+ desc: null
105
+ value: true
106
+ rope_theta:
107
+ desc: null
108
+ value: 500000.0
109
+ rope_scaling:
110
+ desc: null
111
+ value:
112
+ factor: 8.0
113
+ low_freq_factor: 1.0
114
+ high_freq_factor: 4.0
115
+ original_max_position_embeddings: 8192
116
+ rope_type: llama3
117
+ attention_bias:
118
+ desc: null
119
+ value: false
120
+ attention_dropout:
121
+ desc: null
122
+ value: 0.0
123
+ mlp_bias:
124
+ desc: null
125
+ value: false
126
+ return_dict:
127
+ desc: null
128
+ value: true
129
+ output_hidden_states:
130
+ desc: null
131
+ value: false
132
+ output_attentions:
133
+ desc: null
134
+ value: false
135
+ torchscript:
136
+ desc: null
137
+ value: false
138
+ torch_dtype:
139
+ desc: null
140
+ value: bfloat16
141
+ use_bfloat16:
142
+ desc: null
143
+ value: false
144
+ tf_legacy_loss:
145
+ desc: null
146
+ value: false
147
+ pruned_heads:
148
+ desc: null
149
+ value: {}
150
+ tie_word_embeddings:
151
+ desc: null
152
+ value: false
153
+ chunk_size_feed_forward:
154
+ desc: null
155
+ value: 0
156
+ is_encoder_decoder:
157
+ desc: null
158
+ value: false
159
+ is_decoder:
160
+ desc: null
161
+ value: false
162
+ cross_attention_hidden_size:
163
+ desc: null
164
+ value: null
165
+ add_cross_attention:
166
+ desc: null
167
+ value: false
168
+ tie_encoder_decoder:
169
+ desc: null
170
+ value: false
171
+ max_length:
172
+ desc: null
173
+ value: 20
174
+ min_length:
175
+ desc: null
176
+ value: 0
177
+ do_sample:
178
+ desc: null
179
+ value: false
180
+ early_stopping:
181
+ desc: null
182
+ value: false
183
+ num_beams:
184
+ desc: null
185
+ value: 1
186
+ num_beam_groups:
187
+ desc: null
188
+ value: 1
189
+ diversity_penalty:
190
+ desc: null
191
+ value: 0.0
192
+ temperature:
193
+ desc: null
194
+ value: 1.0
195
+ top_k:
196
+ desc: null
197
+ value: 50
198
+ top_p:
199
+ desc: null
200
+ value: 1.0
201
+ typical_p:
202
+ desc: null
203
+ value: 1.0
204
+ repetition_penalty:
205
+ desc: null
206
+ value: 1.0
207
+ length_penalty:
208
+ desc: null
209
+ value: 1.0
210
+ no_repeat_ngram_size:
211
+ desc: null
212
+ value: 0
213
+ encoder_no_repeat_ngram_size:
214
+ desc: null
215
+ value: 0
216
+ bad_words_ids:
217
+ desc: null
218
+ value: null
219
+ num_return_sequences:
220
+ desc: null
221
+ value: 1
222
+ output_scores:
223
+ desc: null
224
+ value: false
225
+ return_dict_in_generate:
226
+ desc: null
227
+ value: false
228
+ forced_bos_token_id:
229
+ desc: null
230
+ value: null
231
+ forced_eos_token_id:
232
+ desc: null
233
+ value: null
234
+ remove_invalid_values:
235
+ desc: null
236
+ value: false
237
+ exponential_decay_length_penalty:
238
+ desc: null
239
+ value: null
240
+ suppress_tokens:
241
+ desc: null
242
+ value: null
243
+ begin_suppress_tokens:
244
+ desc: null
245
+ value: null
246
+ architectures:
247
+ desc: null
248
+ value:
249
+ - LlamaForCausalLM
250
+ finetuning_task:
251
+ desc: null
252
+ value: null
253
+ id2label:
254
+ desc: null
255
+ value:
256
+ '0': LABEL_0
257
+ '1': LABEL_1
258
+ label2id:
259
+ desc: null
260
+ value:
261
+ LABEL_0: 0
262
+ LABEL_1: 1
263
+ tokenizer_class:
264
+ desc: null
265
+ value: null
266
+ prefix:
267
+ desc: null
268
+ value: null
269
+ bos_token_id:
270
+ desc: null
271
+ value: 128000
272
+ pad_token_id:
273
+ desc: null
274
+ value: 0
275
+ eos_token_id:
276
+ desc: null
277
+ value:
278
+ - 128001
279
+ - 128008
280
+ - 128009
281
+ sep_token_id:
282
+ desc: null
283
+ value: null
284
+ decoder_start_token_id:
285
+ desc: null
286
+ value: null
287
+ task_specific_params:
288
+ desc: null
289
+ value: null
290
+ problem_type:
291
+ desc: null
292
+ value: null
293
+ _name_or_path:
294
+ desc: null
295
+ value: /datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/
296
+ transformers_version:
297
+ desc: null
298
+ value: 4.44.2
299
+ model_type:
300
+ desc: null
301
+ value: llama
302
+ output_dir:
303
+ desc: null
304
+ value: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_
305
+ overwrite_output_dir:
306
+ desc: null
307
+ value: false
308
+ do_train:
309
+ desc: null
310
+ value: true
311
+ do_eval:
312
+ desc: null
313
+ value: false
314
+ do_predict:
315
+ desc: null
316
+ value: false
317
+ eval_strategy:
318
+ desc: null
319
+ value: 'no'
320
+ prediction_loss_only:
321
+ desc: null
322
+ value: false
323
+ per_device_train_batch_size:
324
+ desc: null
325
+ value: 1
326
+ per_device_eval_batch_size:
327
+ desc: null
328
+ value: 8
329
+ per_gpu_train_batch_size:
330
+ desc: null
331
+ value: null
332
+ per_gpu_eval_batch_size:
333
+ desc: null
334
+ value: null
335
+ gradient_accumulation_steps:
336
+ desc: null
337
+ value: 2
338
+ eval_accumulation_steps:
339
+ desc: null
340
+ value: null
341
+ eval_delay:
342
+ desc: null
343
+ value: 0
344
+ torch_empty_cache_steps:
345
+ desc: null
346
+ value: null
347
+ learning_rate:
348
+ desc: null
349
+ value: 1.0e-06
350
+ weight_decay:
351
+ desc: null
352
+ value: 0.1
353
+ adam_beta1:
354
+ desc: null
355
+ value: 0.9
356
+ adam_beta2:
357
+ desc: null
358
+ value: 0.95
359
+ adam_epsilon:
360
+ desc: null
361
+ value: 1.0e-08
362
+ max_grad_norm:
363
+ desc: null
364
+ value: 1.0
365
+ num_train_epochs:
366
+ desc: null
367
+ value: 1.0
368
+ max_steps:
369
+ desc: null
370
+ value: -1
371
+ lr_scheduler_type:
372
+ desc: null
373
+ value: cosine
374
+ lr_scheduler_kwargs:
375
+ desc: null
376
+ value: {}
377
+ warmup_ratio:
378
+ desc: null
379
+ value: 0.05
380
+ warmup_steps:
381
+ desc: null
382
+ value: 0
383
+ log_level:
384
+ desc: null
385
+ value: info
386
+ log_level_replica:
387
+ desc: null
388
+ value: warning
389
+ log_on_each_node:
390
+ desc: null
391
+ value: true
392
+ logging_dir:
393
+ desc: null
394
+ value: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/runs/Dec31_10-00-19_gpu020
395
+ logging_strategy:
396
+ desc: null
397
+ value: steps
398
+ logging_first_step:
399
+ desc: null
400
+ value: false
401
+ logging_steps:
402
+ desc: null
403
+ value: 1.0
404
+ logging_nan_inf_filter:
405
+ desc: null
406
+ value: true
407
+ save_strategy:
408
+ desc: null
409
+ value: steps
410
+ save_steps:
411
+ desc: null
412
+ value: 200
413
+ save_total_limit:
414
+ desc: null
415
+ value: null
416
+ save_safetensors:
417
+ desc: null
418
+ value: true
419
+ save_on_each_node:
420
+ desc: null
421
+ value: false
422
+ save_only_model:
423
+ desc: null
424
+ value: false
425
+ restore_callback_states_from_checkpoint:
426
+ desc: null
427
+ value: false
428
+ no_cuda:
429
+ desc: null
430
+ value: false
431
+ use_cpu:
432
+ desc: null
433
+ value: false
434
+ use_mps_device:
435
+ desc: null
436
+ value: false
437
+ seed:
438
+ desc: null
439
+ value: 42
440
+ data_seed:
441
+ desc: null
442
+ value: null
443
+ jit_mode_eval:
444
+ desc: null
445
+ value: false
446
+ use_ipex:
447
+ desc: null
448
+ value: false
449
+ bf16:
450
+ desc: null
451
+ value: true
452
+ fp16:
453
+ desc: null
454
+ value: false
455
+ fp16_opt_level:
456
+ desc: null
457
+ value: O1
458
+ half_precision_backend:
459
+ desc: null
460
+ value: auto
461
+ bf16_full_eval:
462
+ desc: null
463
+ value: false
464
+ fp16_full_eval:
465
+ desc: null
466
+ value: false
467
+ tf32:
468
+ desc: null
469
+ value: null
470
+ local_rank:
471
+ desc: null
472
+ value: 0
473
+ ddp_backend:
474
+ desc: null
475
+ value: null
476
+ tpu_num_cores:
477
+ desc: null
478
+ value: null
479
+ tpu_metrics_debug:
480
+ desc: null
481
+ value: false
482
+ debug:
483
+ desc: null
484
+ value: []
485
+ dataloader_drop_last:
486
+ desc: null
487
+ value: false
488
+ eval_steps:
489
+ desc: null
490
+ value: null
491
+ dataloader_num_workers:
492
+ desc: null
493
+ value: 1
494
+ dataloader_prefetch_factor:
495
+ desc: null
496
+ value: null
497
+ past_index:
498
+ desc: null
499
+ value: -1
500
+ run_name:
501
+ desc: null
502
+ value: _llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_
503
+ disable_tqdm:
504
+ desc: null
505
+ value: true
506
+ remove_unused_columns:
507
+ desc: null
508
+ value: false
509
+ label_names:
510
+ desc: null
511
+ value: null
512
+ load_best_model_at_end:
513
+ desc: null
514
+ value: false
515
+ metric_for_best_model:
516
+ desc: null
517
+ value: null
518
+ greater_is_better:
519
+ desc: null
520
+ value: null
521
+ ignore_data_skip:
522
+ desc: null
523
+ value: false
524
+ fsdp:
525
+ desc: null
526
+ value:
527
+ - auto_wrap
528
+ - offload
529
+ fsdp_min_num_params:
530
+ desc: null
531
+ value: 0
532
+ fsdp_config:
533
+ desc: null
534
+ value:
535
+ min_num_params: 0
536
+ xla: false
537
+ xla_fsdp_v2: false
538
+ xla_fsdp_grad_ckpt: false
539
+ fsdp_transformer_layer_cls_to_wrap:
540
+ desc: null
541
+ value: null
542
+ accelerator_config:
543
+ desc: null
544
+ value:
545
+ split_batches: false
546
+ dispatch_batches: null
547
+ even_batches: true
548
+ use_seedable_sampler: true
549
+ non_blocking: false
550
+ gradient_accumulation_kwargs: null
551
+ deepspeed:
552
+ desc: null
553
+ value: null
554
+ label_smoothing_factor:
555
+ desc: null
556
+ value: 0.0
557
+ optim:
558
+ desc: null
559
+ value: adamw_torch
560
+ optim_args:
561
+ desc: null
562
+ value: null
563
+ adafactor:
564
+ desc: null
565
+ value: false
566
+ group_by_length:
567
+ desc: null
568
+ value: false
569
+ length_column_name:
570
+ desc: null
571
+ value: length
572
+ report_to:
573
+ desc: null
574
+ value:
575
+ - wandb
576
+ ddp_find_unused_parameters:
577
+ desc: null
578
+ value: false
579
+ ddp_bucket_cap_mb:
580
+ desc: null
581
+ value: null
582
+ ddp_broadcast_buffers:
583
+ desc: null
584
+ value: null
585
+ dataloader_pin_memory:
586
+ desc: null
587
+ value: true
588
+ dataloader_persistent_workers:
589
+ desc: null
590
+ value: false
591
+ skip_memory_metrics:
592
+ desc: null
593
+ value: true
594
+ use_legacy_prediction_loop:
595
+ desc: null
596
+ value: false
597
+ push_to_hub:
598
+ desc: null
599
+ value: false
600
+ resume_from_checkpoint:
601
+ desc: null
602
+ value: null
603
+ hub_model_id:
604
+ desc: null
605
+ value: null
606
+ hub_strategy:
607
+ desc: null
608
+ value: every_save
609
+ hub_token:
610
+ desc: null
611
+ value: <HUB_TOKEN>
612
+ hub_private_repo:
613
+ desc: null
614
+ value: false
615
+ hub_always_push:
616
+ desc: null
617
+ value: false
618
+ gradient_checkpointing:
619
+ desc: null
620
+ value: true
621
+ gradient_checkpointing_kwargs:
622
+ desc: null
623
+ value: null
624
+ include_inputs_for_metrics:
625
+ desc: null
626
+ value: false
627
+ eval_do_concat_batches:
628
+ desc: null
629
+ value: true
630
+ fp16_backend:
631
+ desc: null
632
+ value: auto
633
+ evaluation_strategy:
634
+ desc: null
635
+ value: null
636
+ push_to_hub_model_id:
637
+ desc: null
638
+ value: null
639
+ push_to_hub_organization:
640
+ desc: null
641
+ value: null
642
+ push_to_hub_token:
643
+ desc: null
644
+ value: <PUSH_TO_HUB_TOKEN>
645
+ mp_parameters:
646
+ desc: null
647
+ value: ''
648
+ auto_find_batch_size:
649
+ desc: null
650
+ value: false
651
+ full_determinism:
652
+ desc: null
653
+ value: false
654
+ torchdynamo:
655
+ desc: null
656
+ value: null
657
+ ray_scope:
658
+ desc: null
659
+ value: last
660
+ ddp_timeout:
661
+ desc: null
662
+ value: 1800
663
+ torch_compile:
664
+ desc: null
665
+ value: false
666
+ torch_compile_backend:
667
+ desc: null
668
+ value: null
669
+ torch_compile_mode:
670
+ desc: null
671
+ value: null
672
+ dispatch_batches:
673
+ desc: null
674
+ value: null
675
+ split_batches:
676
+ desc: null
677
+ value: null
678
+ include_tokens_per_second:
679
+ desc: null
680
+ value: false
681
+ include_num_input_tokens_seen:
682
+ desc: null
683
+ value: false
684
+ neftune_noise_alpha:
685
+ desc: null
686
+ value: null
687
+ optim_target_modules:
688
+ desc: null
689
+ value: null
690
+ batch_eval_metrics:
691
+ desc: null
692
+ value: false
693
+ eval_on_start:
694
+ desc: null
695
+ value: false
696
+ eval_use_gather_object:
697
+ desc: null
698
+ value: false
699
+ min_lr_ratio:
700
+ desc: null
701
+ value: 0.1
702
+ cuda_empty_cache:
703
+ desc: null
704
+ value: true
705
+ streaming_dataset:
706
+ desc: null
707
+ value: true
708
+ seq_parallel_size:
709
+ desc: null
710
+ value: 8
711
+ model/num_parameters:
712
+ desc: null
713
+ value: 1003782656
wandb/run-20241231_100054-t2idt8o9/files/output.log ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ /scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final/lib/python3.10/site-packages/torch/utils/checkpoint.py:1399: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
2
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
3
+ [INFO|trainer.py:175] 2024-12-31 10:01:50,057 >> {'loss': 1.4933, 'grad_norm': 33.300479888916016, 'learning_rate': 5.405405405405406e-09, 'epoch': 0.00027122321670735016, 'num_input_tokens_seen': 2097152, 'completed': '0.03% (1 / 3_687)', 'remaining time': '49:07:09', 'throughput': '2732.19', 'gpu_mem_free': '5581MB'}
4
+ [INFO|trainer.py:175] 2024-12-31 10:02:25,461 >> {'loss': 1.6295, 'grad_norm': 35.11330795288086, 'learning_rate': 1.0810810810810811e-08, 'epoch': 0.0005424464334147003, 'num_input_tokens_seen': 4194304, 'completed': '0.05% (2 / 3_687)', 'remaining time': '42:40:22', 'throughput': '7404.33', 'gpu_mem_free': '5581MB'}
wandb/run-20241231_100054-t2idt8o9/files/requirements.txt ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Brotli==1.1.0
2
+ GitPython==3.1.43
3
+ Jinja2==3.1.4
4
+ MarkupSafe==3.0.2
5
+ PyJWT==2.10.0
6
+ PyNaCl==1.5.0
7
+ PyYAML==6.0.2
8
+ Pygments==2.18.0
9
+ accelerate==0.32.1
10
+ aiohappyeyeballs==2.4.3
11
+ aiohttp==3.11.2
12
+ aioprometheus==23.12.0
13
+ aiosignal==1.3.1
14
+ annotated-types==0.7.0
15
+ anthropic==0.39.0
16
+ anyio==4.6.2.post1
17
+ argcomplete==3.5.1
18
+ arrow==1.3.0
19
+ asttokens==2.4.1
20
+ async-timeout==5.0.1
21
+ attrs==24.2.0
22
+ autocommand==2.2.2
23
+ azure-core==1.32.0
24
+ azure-identity==1.19.0
25
+ azure-storage-blob==12.24.0
26
+ azure-storage-file-datalake==12.18.0
27
+ backoff==2.2.1
28
+ backports.tarfile==1.2.0
29
+ bcrypt==4.2.0
30
+ blobfile==3.0.0
31
+ boto3==1.35.63
32
+ botocore==1.35.63
33
+ cachetools==5.5.0
34
+ certifi==2024.8.30
35
+ cffi==1.17.1
36
+ charset-normalizer==3.4.0
37
+ circuitbreaker==2.0.0
38
+ click==8.1.7
39
+ cloudpickle==3.1.0
40
+ comm==0.2.2
41
+ compressed-tensors==0.8.0
42
+ contourpy==1.3.1
43
+ cramjam==2.9.0
44
+ cryptography==43.0.3
45
+ cycler==0.12.1
46
+ datasets==2.20.0
47
+ datatools==0.1
48
+ debugpy==1.8.11
49
+ decorator==5.1.1
50
+ dill==0.3.8
51
+ diskcache==5.6.3
52
+ distro==1.9.0
53
+ docker-pycreds==0.4.0
54
+ docstring_parser==0.16
55
+ einops==0.8.0
56
+ exceptiongroup==1.2.2
57
+ executing==2.1.0
58
+ fastapi==0.115.5
59
+ filelock==3.16.1
60
+ flash-attn==2.6.1
61
+ fonttools==4.55.0
62
+ frozenlist==1.5.0
63
+ fsspec==2024.5.0
64
+ gguf==0.10.0
65
+ gitdb==4.0.11
66
+ google-api-core==2.23.0
67
+ google-auth==2.36.0
68
+ google-cloud-aiplatform==1.71.1
69
+ google-cloud-bigquery==3.27.0
70
+ google-cloud-core==2.4.1
71
+ google-cloud-resource-manager==1.13.1
72
+ google-cloud-storage==2.10.0
73
+ google-crc32c==1.6.0
74
+ google-resumable-media==2.7.2
75
+ googleapis-common-protos==1.66.0
76
+ gql==3.5.0
77
+ graphql-core==3.2.5
78
+ grpc-google-iam-v1==0.13.1
79
+ grpcio-status==1.62.3
80
+ grpcio==1.68.0
81
+ h11==0.14.0
82
+ httpcore==1.0.7
83
+ httptools==0.6.4
84
+ httpx==0.27.2
85
+ huggingface-hub==0.26.2
86
+ idna==3.10
87
+ importlib_metadata==8.0.0
88
+ importlib_metadata==8.5.0
89
+ inflect==7.3.1
90
+ interegular==0.3.3
91
+ ipykernel==6.29.5
92
+ ipython==8.18.0
93
+ isodate==0.7.2
94
+ jaraco.collections==5.1.0
95
+ jaraco.context==5.3.0
96
+ jaraco.functools==4.0.1
97
+ jaraco.text==3.12.1
98
+ jedi==0.19.2
99
+ jiter==0.7.1
100
+ jmespath==1.0.1
101
+ jsonschema-specifications==2024.10.1
102
+ jsonschema==4.23.0
103
+ jupyter_client==8.6.3
104
+ jupyter_core==5.7.2
105
+ kiwisolver==1.4.7
106
+ lark==1.2.2
107
+ llvmlite==0.43.0
108
+ lm-format-enforcer==0.10.9
109
+ lxml==5.3.0
110
+ markdown-it-py==3.0.0
111
+ matplotlib-inline==0.1.7
112
+ matplotlib==3.9.2
113
+ mdurl==0.1.2
114
+ more-itertools==10.3.0
115
+ mosaicml-cli==0.5.34
116
+ mosaicml-streaming==0.8.1
117
+ mpmath==1.3.0
118
+ msal-extensions==1.2.0
119
+ msal==1.31.1
120
+ msgpack==1.1.0
121
+ msgspec==0.18.6
122
+ multidict==6.1.0
123
+ multiprocess==0.70.16
124
+ nest-asyncio==1.6.0
125
+ networkx==3.4.2
126
+ ninja==1.11.1.1
127
+ numba==0.60.0
128
+ numpy==1.26.4
129
+ nvidia-cublas-cu12==12.1.3.1
130
+ nvidia-cuda-cupti-cu12==12.1.105
131
+ nvidia-cuda-nvrtc-cu12==12.1.105
132
+ nvidia-cuda-runtime-cu12==12.1.105
133
+ nvidia-cudnn-cu12==9.1.0.70
134
+ nvidia-cufft-cu12==11.0.2.54
135
+ nvidia-curand-cu12==10.3.2.106
136
+ nvidia-cusolver-cu12==11.4.5.107
137
+ nvidia-cusparse-cu12==12.1.0.106
138
+ nvidia-ml-py==12.560.30
139
+ nvidia-nccl-cu12==2.20.5
140
+ nvidia-nvjitlink-cu12==12.4.127
141
+ nvidia-nvtx-cu12==12.1.105
142
+ oci==2.138.1
143
+ openai==1.54.5
144
+ opencv-python-headless==4.10.0.84
145
+ orjson==3.10.11
146
+ outlines==0.0.46
147
+ packaging==24.1
148
+ packaging==24.2
149
+ pandas==2.2.1
150
+ paramiko==3.5.0
151
+ parso==0.8.4
152
+ partial-json-parser==0.2.1.1.post4
153
+ pexpect==4.9.0
154
+ pillow==10.4.0
155
+ pip==24.3.1
156
+ platformdirs==4.2.2
157
+ platformdirs==4.3.6
158
+ portalocker==2.10.1
159
+ prometheus-fastapi-instrumentator==7.0.0
160
+ prometheus_client==0.21.0
161
+ prompt-toolkit==3.0.36
162
+ propcache==0.2.0
163
+ proto-plus==1.25.0
164
+ protobuf==4.25.3
165
+ psutil==6.1.0
166
+ ptyprocess==0.7.0
167
+ pure_eval==0.2.3
168
+ py-cpuinfo==9.0.0
169
+ pyOpenSSL==24.2.1
170
+ pyairports==2.1.1
171
+ pyarrow-hotfix==0.6
172
+ pyarrow==18.0.0
173
+ pyasn1==0.6.1
174
+ pyasn1_modules==0.4.1
175
+ pycountry==24.6.1
176
+ pycparser==2.22
177
+ pycryptodomex==3.21.0
178
+ pydantic==2.9.2
179
+ pydantic_core==2.23.4
180
+ pyparsing==3.2.0
181
+ python-dateutil==2.9.0
182
+ python-dotenv==1.0.1
183
+ python-snappy==0.7.3
184
+ pytz==2024.2
185
+ pyzmq==26.2.0
186
+ quantile-python==1.1
187
+ questionary==2.0.1
188
+ ray==2.39.0
189
+ referencing==0.35.1
190
+ regex==2023.12.25
191
+ requests==2.32.3
192
+ rich==13.9.4
193
+ rotary-emb==0.5.2
194
+ rpds-py==0.21.0
195
+ rsa==4.9
196
+ ruamel.yaml.clib==0.2.12
197
+ ruamel.yaml==0.18.6
198
+ s3transfer==0.10.3
199
+ safetensors==0.4.5
200
+ sentencepiece==0.1.99
201
+ sentry-sdk==2.18.0
202
+ setproctitle==1.3.4
203
+ setuptools==75.6.0
204
+ shapely==2.0.6
205
+ simple-parsing==0.1.6
206
+ six==1.16.0
207
+ smmap==5.0.1
208
+ sniffio==1.3.1
209
+ stack-data==0.6.3
210
+ starlette==0.41.3
211
+ sympy==1.13.1
212
+ tiktoken==0.7.0
213
+ tokenizers==0.19.1
214
+ tomli==2.0.1
215
+ torch==2.4.1
216
+ torchvision==0.19.1
217
+ tornado==6.4.1
218
+ tqdm==4.66.4
219
+ traitlets==5.14.3
220
+ transformers==4.44.2
221
+ triton==3.0.0
222
+ typeguard==4.3.0
223
+ types-python-dateutil==2.9.0.20241003
224
+ typing_extensions==4.12.2
225
+ typing_extensions==4.12.2
226
+ tzdata==2024.2
227
+ urllib3==2.2.3
228
+ uvicorn==0.32.0
229
+ uvloop==0.21.0
230
+ validators==0.34.0
231
+ vertexai==1.71.1
232
+ wandb==0.17.3
233
+ watchfiles==0.24.0
234
+ wcwidth==0.2.13
235
+ websockets==11.0.3
236
+ wheel==0.43.0
237
+ wheel==0.45.1
238
+ xformers==0.0.28.post1
239
+ xxhash==3.5.0
240
+ yarl==1.17.2
241
+ zipp==3.19.2
242
+ zipp==3.21.0
243
+ zstandard==0.23.0
244
+ zstd==1.5.5.1
wandb/run-20241231_100054-t2idt8o9/files/wandb-metadata.json ADDED
@@ -0,0 +1,705 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-48-generic-x86_64-with-glibc2.39",
3
+ "python": "3.10.0",
4
+ "heartbeatAt": "2024-12-31T10:00:55.248811",
5
+ "startedAt": "2024-12-31T10:00:54.766527",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--model_family",
10
+ "llama",
11
+ "--apply_instruct_masks",
12
+ "--token_scaled_loss",
13
+ "--seq_parallel_size",
14
+ "8",
15
+ "--report_to",
16
+ "wandb",
17
+ "--do_train",
18
+ "--model_name_or_path",
19
+ "/datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/",
20
+ "--config_name",
21
+ "/datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/",
22
+ "--tokenizer_name",
23
+ "/datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/",
24
+ "--run_name",
25
+ "_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_",
26
+ "--output_dir",
27
+ "/scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_",
28
+ "--config_overrides_json",
29
+ "",
30
+ "--gradient_accumulation_steps",
31
+ "2",
32
+ "--per_device_train_batch_size",
33
+ "1",
34
+ "--bf16",
35
+ "--learning_rate",
36
+ "1e-6",
37
+ "--min_lr_ratio",
38
+ "0.1",
39
+ "--lr_scheduler_type",
40
+ "cosine",
41
+ "--max_grad_norm",
42
+ "1.0",
43
+ "--adam_beta1",
44
+ "0.9",
45
+ "--adam_beta2",
46
+ "0.95",
47
+ "--weight_decay",
48
+ "0.1",
49
+ "--warmup_ratio",
50
+ "0.05",
51
+ "--optim",
52
+ "adamw_torch",
53
+ "--logging_steps",
54
+ "1",
55
+ "--log_level",
56
+ "info",
57
+ "--save_steps",
58
+ "200",
59
+ "--dataloader_num_workers",
60
+ "1",
61
+ "--disable_tqdm",
62
+ "true",
63
+ "--use_fast_tokenizer",
64
+ "false",
65
+ "--remove_unused_columns",
66
+ "false",
67
+ "--ddp_find_unused_parameters",
68
+ "false",
69
+ "--fsdp",
70
+ "auto_wrap offload",
71
+ "--gradient_checkpointing",
72
+ "--tokenized_mds_train",
73
+ "/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/data/ft/bookclaim_balanced_pack_complete",
74
+ "--cuda_empty_cache",
75
+ "--num_train_epochs",
76
+ "1"
77
+ ],
78
+ "state": "running",
79
+ "program": "/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/prolong-final/finetune.py",
80
+ "codePathLocal": "finetune.py",
81
+ "codePath": "prolong-final/finetune.py",
82
+ "git": {
83
+ "remote": "https://github.com/chtmp223/BookGen-dev.git",
84
+ "commit": "0e796521430a0f767be7c4dadba5c2fcaee1f909"
85
+ },
86
+ "email": "[email protected]",
87
+ "root": "/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev",
88
+ "host": "gpu020",
89
+ "username": "ctpham_umass_edu",
90
+ "executable": "/scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final/bin/python3.10",
91
+ "cpu_count": 112,
92
+ "cpu_count_logical": 112,
93
+ "cpu_freq": {
94
+ "current": 957.8740624999999,
95
+ "min": 800.0,
96
+ "max": 3800.0
97
+ },
98
+ "cpu_freq_per_core": [
99
+ {
100
+ "current": 2463.059,
101
+ "min": 800.0,
102
+ "max": 3800.0
103
+ },
104
+ {
105
+ "current": 759.577,
106
+ "min": 800.0,
107
+ "max": 3800.0
108
+ },
109
+ {
110
+ "current": 2024.63,
111
+ "min": 800.0,
112
+ "max": 3800.0
113
+ },
114
+ {
115
+ "current": 800.0,
116
+ "min": 800.0,
117
+ "max": 3800.0
118
+ },
119
+ {
120
+ "current": 2400.0,
121
+ "min": 800.0,
122
+ "max": 3800.0
123
+ },
124
+ {
125
+ "current": 800.0,
126
+ "min": 800.0,
127
+ "max": 3800.0
128
+ },
129
+ {
130
+ "current": 1740.158,
131
+ "min": 800.0,
132
+ "max": 3800.0
133
+ },
134
+ {
135
+ "current": 800.0,
136
+ "min": 800.0,
137
+ "max": 3800.0
138
+ },
139
+ {
140
+ "current": 1100.0,
141
+ "min": 800.0,
142
+ "max": 3800.0
143
+ },
144
+ {
145
+ "current": 800.0,
146
+ "min": 800.0,
147
+ "max": 3800.0
148
+ },
149
+ {
150
+ "current": 2035.036,
151
+ "min": 800.0,
152
+ "max": 3800.0
153
+ },
154
+ {
155
+ "current": 800.0,
156
+ "min": 800.0,
157
+ "max": 3800.0
158
+ },
159
+ {
160
+ "current": 800.0,
161
+ "min": 800.0,
162
+ "max": 3800.0
163
+ },
164
+ {
165
+ "current": 800.0,
166
+ "min": 800.0,
167
+ "max": 3800.0
168
+ },
169
+ {
170
+ "current": 800.0,
171
+ "min": 800.0,
172
+ "max": 3800.0
173
+ },
174
+ {
175
+ "current": 800.0,
176
+ "min": 800.0,
177
+ "max": 3800.0
178
+ },
179
+ {
180
+ "current": 800.0,
181
+ "min": 800.0,
182
+ "max": 3800.0
183
+ },
184
+ {
185
+ "current": 800.0,
186
+ "min": 800.0,
187
+ "max": 3800.0
188
+ },
189
+ {
190
+ "current": 800.0,
191
+ "min": 800.0,
192
+ "max": 3800.0
193
+ },
194
+ {
195
+ "current": 800.0,
196
+ "min": 800.0,
197
+ "max": 3800.0
198
+ },
199
+ {
200
+ "current": 800.0,
201
+ "min": 800.0,
202
+ "max": 3800.0
203
+ },
204
+ {
205
+ "current": 800.0,
206
+ "min": 800.0,
207
+ "max": 3800.0
208
+ },
209
+ {
210
+ "current": 800.0,
211
+ "min": 800.0,
212
+ "max": 3800.0
213
+ },
214
+ {
215
+ "current": 800.0,
216
+ "min": 800.0,
217
+ "max": 3800.0
218
+ },
219
+ {
220
+ "current": 1400.0,
221
+ "min": 800.0,
222
+ "max": 3800.0
223
+ },
224
+ {
225
+ "current": 800.0,
226
+ "min": 800.0,
227
+ "max": 3800.0
228
+ },
229
+ {
230
+ "current": 1900.0,
231
+ "min": 800.0,
232
+ "max": 3800.0
233
+ },
234
+ {
235
+ "current": 800.0,
236
+ "min": 800.0,
237
+ "max": 3800.0
238
+ },
239
+ {
240
+ "current": 800.0,
241
+ "min": 800.0,
242
+ "max": 3800.0
243
+ },
244
+ {
245
+ "current": 800.0,
246
+ "min": 800.0,
247
+ "max": 3800.0
248
+ },
249
+ {
250
+ "current": 800.0,
251
+ "min": 800.0,
252
+ "max": 3800.0
253
+ },
254
+ {
255
+ "current": 800.0,
256
+ "min": 800.0,
257
+ "max": 3800.0
258
+ },
259
+ {
260
+ "current": 790.179,
261
+ "min": 800.0,
262
+ "max": 3800.0
263
+ },
264
+ {
265
+ "current": 800.0,
266
+ "min": 800.0,
267
+ "max": 3800.0
268
+ },
269
+ {
270
+ "current": 800.0,
271
+ "min": 800.0,
272
+ "max": 3800.0
273
+ },
274
+ {
275
+ "current": 800.0,
276
+ "min": 800.0,
277
+ "max": 3800.0
278
+ },
279
+ {
280
+ "current": 1800.0,
281
+ "min": 800.0,
282
+ "max": 3800.0
283
+ },
284
+ {
285
+ "current": 800.0,
286
+ "min": 800.0,
287
+ "max": 3800.0
288
+ },
289
+ {
290
+ "current": 800.0,
291
+ "min": 800.0,
292
+ "max": 3800.0
293
+ },
294
+ {
295
+ "current": 800.0,
296
+ "min": 800.0,
297
+ "max": 3800.0
298
+ },
299
+ {
300
+ "current": 1393.77,
301
+ "min": 800.0,
302
+ "max": 3800.0
303
+ },
304
+ {
305
+ "current": 800.0,
306
+ "min": 800.0,
307
+ "max": 3800.0
308
+ },
309
+ {
310
+ "current": 800.0,
311
+ "min": 800.0,
312
+ "max": 3800.0
313
+ },
314
+ {
315
+ "current": 800.0,
316
+ "min": 800.0,
317
+ "max": 3800.0
318
+ },
319
+ {
320
+ "current": 800.0,
321
+ "min": 800.0,
322
+ "max": 3800.0
323
+ },
324
+ {
325
+ "current": 800.0,
326
+ "min": 800.0,
327
+ "max": 3800.0
328
+ },
329
+ {
330
+ "current": 800.0,
331
+ "min": 800.0,
332
+ "max": 3800.0
333
+ },
334
+ {
335
+ "current": 800.0,
336
+ "min": 800.0,
337
+ "max": 3800.0
338
+ },
339
+ {
340
+ "current": 800.0,
341
+ "min": 800.0,
342
+ "max": 3800.0
343
+ },
344
+ {
345
+ "current": 3800.0,
346
+ "min": 800.0,
347
+ "max": 3800.0
348
+ },
349
+ {
350
+ "current": 1439.809,
351
+ "min": 800.0,
352
+ "max": 3800.0
353
+ },
354
+ {
355
+ "current": 800.0,
356
+ "min": 800.0,
357
+ "max": 3800.0
358
+ },
359
+ {
360
+ "current": 800.0,
361
+ "min": 800.0,
362
+ "max": 3800.0
363
+ },
364
+ {
365
+ "current": 800.0,
366
+ "min": 800.0,
367
+ "max": 3800.0
368
+ },
369
+ {
370
+ "current": 800.0,
371
+ "min": 800.0,
372
+ "max": 3800.0
373
+ },
374
+ {
375
+ "current": 800.0,
376
+ "min": 800.0,
377
+ "max": 3800.0
378
+ },
379
+ {
380
+ "current": 800.0,
381
+ "min": 800.0,
382
+ "max": 3800.0
383
+ },
384
+ {
385
+ "current": 3800.0,
386
+ "min": 800.0,
387
+ "max": 3800.0
388
+ },
389
+ {
390
+ "current": 800.0,
391
+ "min": 800.0,
392
+ "max": 3800.0
393
+ },
394
+ {
395
+ "current": 800.0,
396
+ "min": 800.0,
397
+ "max": 3800.0
398
+ },
399
+ {
400
+ "current": 800.0,
401
+ "min": 800.0,
402
+ "max": 3800.0
403
+ },
404
+ {
405
+ "current": 800.0,
406
+ "min": 800.0,
407
+ "max": 3800.0
408
+ },
409
+ {
410
+ "current": 800.0,
411
+ "min": 800.0,
412
+ "max": 3800.0
413
+ },
414
+ {
415
+ "current": 800.0,
416
+ "min": 800.0,
417
+ "max": 3800.0
418
+ },
419
+ {
420
+ "current": 800.0,
421
+ "min": 800.0,
422
+ "max": 3800.0
423
+ },
424
+ {
425
+ "current": 800.0,
426
+ "min": 800.0,
427
+ "max": 3800.0
428
+ },
429
+ {
430
+ "current": 800.0,
431
+ "min": 800.0,
432
+ "max": 3800.0
433
+ },
434
+ {
435
+ "current": 800.0,
436
+ "min": 800.0,
437
+ "max": 3800.0
438
+ },
439
+ {
440
+ "current": 800.0,
441
+ "min": 800.0,
442
+ "max": 3800.0
443
+ },
444
+ {
445
+ "current": 800.0,
446
+ "min": 800.0,
447
+ "max": 3800.0
448
+ },
449
+ {
450
+ "current": 782.956,
451
+ "min": 800.0,
452
+ "max": 3800.0
453
+ },
454
+ {
455
+ "current": 800.0,
456
+ "min": 800.0,
457
+ "max": 3800.0
458
+ },
459
+ {
460
+ "current": 783.716,
461
+ "min": 800.0,
462
+ "max": 3800.0
463
+ },
464
+ {
465
+ "current": 800.0,
466
+ "min": 800.0,
467
+ "max": 3800.0
468
+ },
469
+ {
470
+ "current": 783.035,
471
+ "min": 800.0,
472
+ "max": 3800.0
473
+ },
474
+ {
475
+ "current": 800.0,
476
+ "min": 800.0,
477
+ "max": 3800.0
478
+ },
479
+ {
480
+ "current": 800.0,
481
+ "min": 800.0,
482
+ "max": 3800.0
483
+ },
484
+ {
485
+ "current": 800.0,
486
+ "min": 800.0,
487
+ "max": 3800.0
488
+ },
489
+ {
490
+ "current": 784.273,
491
+ "min": 800.0,
492
+ "max": 3800.0
493
+ },
494
+ {
495
+ "current": 800.0,
496
+ "min": 800.0,
497
+ "max": 3800.0
498
+ },
499
+ {
500
+ "current": 800.0,
501
+ "min": 800.0,
502
+ "max": 3800.0
503
+ },
504
+ {
505
+ "current": 800.0,
506
+ "min": 800.0,
507
+ "max": 3800.0
508
+ },
509
+ {
510
+ "current": 800.0,
511
+ "min": 800.0,
512
+ "max": 3800.0
513
+ },
514
+ {
515
+ "current": 800.0,
516
+ "min": 800.0,
517
+ "max": 3800.0
518
+ },
519
+ {
520
+ "current": 800.0,
521
+ "min": 800.0,
522
+ "max": 3800.0
523
+ },
524
+ {
525
+ "current": 800.0,
526
+ "min": 800.0,
527
+ "max": 3800.0
528
+ },
529
+ {
530
+ "current": 800.0,
531
+ "min": 800.0,
532
+ "max": 3800.0
533
+ },
534
+ {
535
+ "current": 800.0,
536
+ "min": 800.0,
537
+ "max": 3800.0
538
+ },
539
+ {
540
+ "current": 800.0,
541
+ "min": 800.0,
542
+ "max": 3800.0
543
+ },
544
+ {
545
+ "current": 800.0,
546
+ "min": 800.0,
547
+ "max": 3800.0
548
+ },
549
+ {
550
+ "current": 800.0,
551
+ "min": 800.0,
552
+ "max": 3800.0
553
+ },
554
+ {
555
+ "current": 800.0,
556
+ "min": 800.0,
557
+ "max": 3800.0
558
+ },
559
+ {
560
+ "current": 800.0,
561
+ "min": 800.0,
562
+ "max": 3800.0
563
+ },
564
+ {
565
+ "current": 784.246,
566
+ "min": 800.0,
567
+ "max": 3800.0
568
+ },
569
+ {
570
+ "current": 800.0,
571
+ "min": 800.0,
572
+ "max": 3800.0
573
+ },
574
+ {
575
+ "current": 800.0,
576
+ "min": 800.0,
577
+ "max": 3800.0
578
+ },
579
+ {
580
+ "current": 800.0,
581
+ "min": 800.0,
582
+ "max": 3800.0
583
+ },
584
+ {
585
+ "current": 800.0,
586
+ "min": 800.0,
587
+ "max": 3800.0
588
+ },
589
+ {
590
+ "current": 800.0,
591
+ "min": 800.0,
592
+ "max": 3800.0
593
+ },
594
+ {
595
+ "current": 800.0,
596
+ "min": 800.0,
597
+ "max": 3800.0
598
+ },
599
+ {
600
+ "current": 800.0,
601
+ "min": 800.0,
602
+ "max": 3800.0
603
+ },
604
+ {
605
+ "current": 800.0,
606
+ "min": 800.0,
607
+ "max": 3800.0
608
+ },
609
+ {
610
+ "current": 800.0,
611
+ "min": 800.0,
612
+ "max": 3800.0
613
+ },
614
+ {
615
+ "current": 784.059,
616
+ "min": 800.0,
617
+ "max": 3800.0
618
+ },
619
+ {
620
+ "current": 800.0,
621
+ "min": 800.0,
622
+ "max": 3800.0
623
+ },
624
+ {
625
+ "current": 800.0,
626
+ "min": 800.0,
627
+ "max": 3800.0
628
+ },
629
+ {
630
+ "current": 790.704,
631
+ "min": 800.0,
632
+ "max": 3800.0
633
+ },
634
+ {
635
+ "current": 800.0,
636
+ "min": 800.0,
637
+ "max": 3800.0
638
+ },
639
+ {
640
+ "current": 800.0,
641
+ "min": 800.0,
642
+ "max": 3800.0
643
+ },
644
+ {
645
+ "current": 800.0,
646
+ "min": 800.0,
647
+ "max": 3800.0
648
+ },
649
+ {
650
+ "current": 800.0,
651
+ "min": 800.0,
652
+ "max": 3800.0
653
+ },
654
+ {
655
+ "current": 800.0,
656
+ "min": 800.0,
657
+ "max": 3800.0
658
+ }
659
+ ],
660
+ "disk": {
661
+ "/": {
662
+ "total": 438.487850189209,
663
+ "used": 18.201271057128906
664
+ }
665
+ },
666
+ "gpu": "NVIDIA A100-SXM4-80GB",
667
+ "gpu_count": 8,
668
+ "gpu_devices": [
669
+ {
670
+ "name": "NVIDIA A100-SXM4-80GB",
671
+ "memory_total": 85899345920
672
+ },
673
+ {
674
+ "name": "NVIDIA A100-SXM4-80GB",
675
+ "memory_total": 85899345920
676
+ },
677
+ {
678
+ "name": "NVIDIA A100-SXM4-80GB",
679
+ "memory_total": 85899345920
680
+ },
681
+ {
682
+ "name": "NVIDIA A100-SXM4-80GB",
683
+ "memory_total": 85899345920
684
+ },
685
+ {
686
+ "name": "NVIDIA A100-SXM4-80GB",
687
+ "memory_total": 85899345920
688
+ },
689
+ {
690
+ "name": "NVIDIA A100-SXM4-80GB",
691
+ "memory_total": 85899345920
692
+ },
693
+ {
694
+ "name": "NVIDIA A100-SXM4-80GB",
695
+ "memory_total": 85899345920
696
+ },
697
+ {
698
+ "name": "NVIDIA A100-SXM4-80GB",
699
+ "memory_total": 85899345920
700
+ }
701
+ ],
702
+ "memory": {
703
+ "total": 2015.3287239074707
704
+ }
705
+ }
wandb/run-20241231_100054-t2idt8o9/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/loss": 1.738, "train/grad_norm": 41.506248474121094, "train/learning_rate": 1.6216216216216218e-08, "train/epoch": 0.0008136696501220504, "train/num_input_tokens_seen": 6291456, "train/global_step": 3, "_timestamp": 1735639380.6717923, "_runtime": 125.88794016838074, "_step": 2}
wandb/run-20241231_100054-t2idt8o9/logs/debug-internal.log ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-12-31 10:00:54,781 INFO StreamThr :1511293 [internal.py:wandb_internal():85] W&B internal server running at pid: 1511293, started at: 2024-12-31 10:00:54.780041
2
+ 2024-12-31 10:00:54,783 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status
3
+ 2024-12-31 10:00:54,789 INFO WriterThread:1511293 [datastore.py:open_for_write():87] open: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20241231_100054-t2idt8o9/run-t2idt8o9.wandb
4
+ 2024-12-31 10:00:54,790 DEBUG SenderThread:1511293 [sender.py:send():379] send: header
5
+ 2024-12-31 10:00:54,825 DEBUG SenderThread:1511293 [sender.py:send():379] send: run
6
+ 2024-12-31 10:00:55,057 INFO SenderThread:1511293 [dir_watcher.py:__init__():211] watching files in: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20241231_100054-t2idt8o9/files
7
+ 2024-12-31 10:00:55,057 INFO SenderThread:1511293 [sender.py:_start_run_threads():1188] run started: t2idt8o9 with start time 1735639254.783852
8
+ 2024-12-31 10:00:55,080 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: check_version
9
+ 2024-12-31 10:00:55,081 DEBUG SenderThread:1511293 [sender.py:send_request():406] send_request: check_version
10
+ 2024-12-31 10:00:55,155 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: run_start
11
+ 2024-12-31 10:00:55,183 DEBUG HandlerThread:1511293 [system_info.py:__init__():26] System info init
12
+ 2024-12-31 10:00:55,183 DEBUG HandlerThread:1511293 [system_info.py:__init__():41] System info init done
13
+ 2024-12-31 10:00:55,183 INFO HandlerThread:1511293 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-12-31 10:00:55,183 INFO SystemMonitor:1511293 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-12-31 10:00:55,183 INFO HandlerThread:1511293 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-12-31 10:00:55,184 INFO SystemMonitor:1511293 [interfaces.py:start():188] Started cpu monitoring
17
+ 2024-12-31 10:00:55,185 INFO SystemMonitor:1511293 [interfaces.py:start():188] Started disk monitoring
18
+ 2024-12-31 10:00:55,186 INFO SystemMonitor:1511293 [interfaces.py:start():188] Started gpu monitoring
19
+ 2024-12-31 10:00:55,186 INFO SystemMonitor:1511293 [interfaces.py:start():188] Started memory monitoring
20
+ 2024-12-31 10:00:55,187 INFO SystemMonitor:1511293 [interfaces.py:start():188] Started network monitoring
21
+ 2024-12-31 10:00:55,248 DEBUG HandlerThread:1511293 [system_info.py:probe():152] Probing system
22
+ 2024-12-31 10:00:55,251 DEBUG HandlerThread:1511293 [system_info.py:_probe_git():137] Probing git
23
+ 2024-12-31 10:00:55,270 DEBUG HandlerThread:1511293 [system_info.py:_probe_git():145] Probing git done
24
+ 2024-12-31 10:00:55,270 DEBUG HandlerThread:1511293 [system_info.py:probe():200] Probing system done
25
+ 2024-12-31 10:00:55,271 DEBUG HandlerThread:1511293 [system_monitor.py:probe():223] {'os': 'Linux-6.8.0-48-generic-x86_64-with-glibc2.39', 'python': '3.10.0', 'heartbeatAt': '2024-12-31T10:00:55.248811', 'startedAt': '2024-12-31T10:00:54.766527', 'docker': None, 'cuda': None, 'args': ('--model_family', 'llama', '--apply_instruct_masks', '--token_scaled_loss', '--seq_parallel_size', '8', '--report_to', 'wandb', '--do_train', '--model_name_or_path', '/datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/', '--config_name', '/datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/', '--tokenizer_name', '/datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/', '--run_name', '_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_', '--output_dir', '/scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_', '--config_overrides_json', '', '--gradient_accumulation_steps', '2', '--per_device_train_batch_size', '1', '--bf16', '--learning_rate', '1e-6', '--min_lr_ratio', '0.1', '--lr_scheduler_type', 'cosine', '--max_grad_norm', '1.0', '--adam_beta1', '0.9', '--adam_beta2', '0.95', '--weight_decay', '0.1', '--warmup_ratio', '0.05', '--optim', 'adamw_torch', '--logging_steps', '1', '--log_level', 'info', '--save_steps', '200', '--dataloader_num_workers', '1', '--disable_tqdm', 'true', '--use_fast_tokenizer', 'false', '--remove_unused_columns', 'false', '--ddp_find_unused_parameters', 'false', '--fsdp', 'auto_wrap offload', '--gradient_checkpointing', '--tokenized_mds_train', '/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/data/ft/bookclaim_balanced_pack_complete', '--cuda_empty_cache', '--num_train_epochs', '1'), 'state': 'running', 'program': '/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/prolong-final/finetune.py', 'codePathLocal': 'finetune.py', 'codePath': 'prolong-final/finetune.py', 'git': {'remote': 'https://github.com/chtmp223/BookGen-dev.git', 'commit': '0e796521430a0f767be7c4dadba5c2fcaee1f909'}, 'email': '[email protected]', 'root': '/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev', 'host': 'gpu020', 'username': 'ctpham_umass_edu', 'executable': '/scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final/bin/python3.10', 'cpu_count': 112, 'cpu_count_logical': 112, 'cpu_freq': {'current': 957.8740624999999, 'min': 800.0, 'max': 3800.0}, 'cpu_freq_per_core': [{'current': 2463.059, 'min': 800.0, 'max': 3800.0}, {'current': 759.577, 'min': 800.0, 'max': 3800.0}, {'current': 2024.63, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 2400.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 1740.158, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 1100.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 2035.036, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 1400.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 1900.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 790.179, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 1800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 1393.77, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 3800.0, 'min': 800.0, 'max': 3800.0}, {'current': 1439.809, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 3800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 782.956, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 783.716, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 783.035, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 784.273, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 784.246, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 784.059, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 790.704, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}, {'current': 800.0, 'min': 800.0, 'max': 3800.0}], 'disk': {'/': {'total': 438.487850189209, 'used': 18.201271057128906}}, 'gpu': 'NVIDIA A100-SXM4-80GB', 'gpu_count': 8, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-80GB', 'memory_total': 85899345920}, {'name': 'NVIDIA A100-SXM4-80GB', 'memory_total': 85899345920}, {'name': 'NVIDIA A100-SXM4-80GB', 'memory_total': 85899345920}, {'name': 'NVIDIA A100-SXM4-80GB', 'memory_total': 85899345920}, {'name': 'NVIDIA A100-SXM4-80GB', 'memory_total': 85899345920}, {'name': 'NVIDIA A100-SXM4-80GB', 'memory_total': 85899345920}, {'name': 'NVIDIA A100-SXM4-80GB', 'memory_total': 85899345920}, {'name': 'NVIDIA A100-SXM4-80GB', 'memory_total': 85899345920}], 'memory': {'total': 2015.3287239074707}}
26
+ 2024-12-31 10:00:55,271 INFO HandlerThread:1511293 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-12-31 10:00:55,271 INFO HandlerThread:1511293 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-12-31 10:00:55,271 DEBUG HandlerThread:1511293 [system_info.py:_save_conda():209] Saving list of conda packages installed into the current environment
29
+ 2024-12-31 10:00:56,059 INFO Thread-12 :1511293 [dir_watcher.py:_on_file_created():271] file/dir created: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20241231_100054-t2idt8o9/files/conda-environment.yaml
30
+ 2024-12-31 10:01:01,448 DEBUG HandlerThread:1511293 [system_info.py:_save_conda():224] Saving conda packages done
31
+ 2024-12-31 10:01:01,452 INFO HandlerThread:1511293 [system_monitor.py:probe():229] Finished publishing system info
32
+ 2024-12-31 10:01:01,459 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
33
+ 2024-12-31 10:01:01,459 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: keepalive
34
+ 2024-12-31 10:01:01,460 DEBUG SenderThread:1511293 [sender.py:send():379] send: files
35
+ 2024-12-31 10:01:01,460 INFO SenderThread:1511293 [sender.py:_save_file():1454] saving file wandb-metadata.json with policy now
36
+ 2024-12-31 10:01:01,752 INFO wandb-upload_0:1511293 [upload_job.py:push():130] Uploaded file /tmp/tmp_luoc01bwandb/35p6d7uj-wandb-metadata.json
37
+ 2024-12-31 10:01:01,779 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: python_packages
38
+ 2024-12-31 10:01:01,780 DEBUG SenderThread:1511293 [sender.py:send_request():406] send_request: python_packages
39
+ 2024-12-31 10:01:01,781 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: stop_status
40
+ 2024-12-31 10:01:01,782 DEBUG SenderThread:1511293 [sender.py:send_request():406] send_request: stop_status
41
+ 2024-12-31 10:01:01,787 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
42
+ 2024-12-31 10:01:01,910 DEBUG SenderThread:1511293 [sender.py:send():379] send: telemetry
43
+ 2024-12-31 10:01:01,911 DEBUG SenderThread:1511293 [sender.py:send():379] send: config
44
+ 2024-12-31 10:01:01,911 DEBUG SenderThread:1511293 [sender.py:send():379] send: telemetry
45
+ 2024-12-31 10:01:01,911 DEBUG SenderThread:1511293 [sender.py:send():379] send: metric
46
+ 2024-12-31 10:01:01,911 DEBUG SenderThread:1511293 [sender.py:send():379] send: telemetry
47
+ 2024-12-31 10:01:01,911 DEBUG SenderThread:1511293 [sender.py:send():379] send: metric
48
+ 2024-12-31 10:01:01,912 WARNING SenderThread:1511293 [sender.py:send_metric():1405] Seen metric with glob (shouldn't happen)
49
+ 2024-12-31 10:01:01,912 DEBUG SenderThread:1511293 [sender.py:send():379] send: telemetry
50
+ 2024-12-31 10:01:01,912 DEBUG SenderThread:1511293 [sender.py:send():379] send: telemetry
51
+ 2024-12-31 10:01:01,912 DEBUG SenderThread:1511293 [sender.py:send():379] send: config
52
+ 2024-12-31 10:01:02,060 INFO Thread-12 :1511293 [dir_watcher.py:_on_file_modified():288] file/dir modified: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20241231_100054-t2idt8o9/files/conda-environment.yaml
53
+ 2024-12-31 10:01:02,061 INFO Thread-12 :1511293 [dir_watcher.py:_on_file_created():271] file/dir created: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20241231_100054-t2idt8o9/files/requirements.txt
54
+ 2024-12-31 10:01:02,061 INFO Thread-12 :1511293 [dir_watcher.py:_on_file_created():271] file/dir created: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20241231_100054-t2idt8o9/files/wandb-metadata.json
55
+ 2024-12-31 10:01:02,791 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
56
+ 2024-12-31 10:01:03,784 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
57
+ 2024-12-31 10:01:04,784 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
58
+ 2024-12-31 10:01:05,788 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
59
+ 2024-12-31 10:01:05,912 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
60
+ 2024-12-31 10:01:06,788 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
61
+ 2024-12-31 10:01:07,788 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
62
+ 2024-12-31 10:01:08,788 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
63
+ 2024-12-31 10:01:09,788 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
64
+ 2024-12-31 10:01:10,789 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
65
+ 2024-12-31 10:01:11,782 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
66
+ 2024-12-31 10:01:11,789 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
67
+ 2024-12-31 10:01:12,789 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
68
+ 2024-12-31 10:01:13,789 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
69
+ 2024-12-31 10:01:14,789 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
70
+ 2024-12-31 10:01:15,789 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
71
+ 2024-12-31 10:01:16,780 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: stop_status
72
+ 2024-12-31 10:01:16,781 DEBUG SenderThread:1511293 [sender.py:send_request():406] send_request: stop_status
73
+ 2024-12-31 10:01:16,821 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
74
+ 2024-12-31 10:01:16,948 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
75
+ 2024-12-31 10:01:17,790 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
76
+ 2024-12-31 10:01:18,790 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
77
+ 2024-12-31 10:01:19,790 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
78
+ 2024-12-31 10:01:20,790 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
79
+ 2024-12-31 10:01:21,790 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
80
+ 2024-12-31 10:01:22,078 INFO Thread-12 :1511293 [dir_watcher.py:_on_file_created():271] file/dir created: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20241231_100054-t2idt8o9/files/output.log
81
+ 2024-12-31 10:01:22,782 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
82
+ 2024-12-31 10:01:22,790 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
83
+ 2024-12-31 10:01:23,790 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
84
+ 2024-12-31 10:01:24,080 INFO Thread-12 :1511293 [dir_watcher.py:_on_file_modified():288] file/dir modified: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20241231_100054-t2idt8o9/files/output.log
85
+ 2024-12-31 10:01:24,790 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
86
+ 2024-12-31 10:01:25,791 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
87
+ 2024-12-31 10:01:26,790 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
88
+ 2024-12-31 10:01:27,788 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
89
+ 2024-12-31 10:01:27,796 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
90
+ 2024-12-31 10:01:28,083 INFO Thread-12 :1511293 [dir_watcher.py:_on_file_modified():288] file/dir modified: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20241231_100054-t2idt8o9/files/config.yaml
91
+ 2024-12-31 10:01:28,791 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
92
+ 2024-12-31 10:01:29,791 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
93
+ 2024-12-31 10:01:30,791 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
94
+ 2024-12-31 10:01:31,780 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: stop_status
95
+ 2024-12-31 10:01:31,781 DEBUG SenderThread:1511293 [sender.py:send_request():406] send_request: stop_status
96
+ 2024-12-31 10:01:31,820 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
97
+ 2024-12-31 10:01:32,792 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
98
+ 2024-12-31 10:01:33,791 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
99
+ 2024-12-31 10:01:33,879 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
100
+ 2024-12-31 10:01:34,791 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
101
+ 2024-12-31 10:01:35,792 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
102
+ 2024-12-31 10:01:36,791 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
103
+ 2024-12-31 10:01:37,792 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
104
+ 2024-12-31 10:01:38,792 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
105
+ 2024-12-31 10:01:39,782 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
106
+ 2024-12-31 10:01:39,792 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
107
+ 2024-12-31 10:01:40,792 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
108
+ 2024-12-31 10:01:41,794 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
109
+ 2024-12-31 10:01:42,797 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
110
+ 2024-12-31 10:01:43,802 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
111
+ 2024-12-31 10:01:44,790 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
112
+ 2024-12-31 10:01:44,799 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
113
+ 2024-12-31 10:01:45,803 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
114
+ 2024-12-31 10:01:46,782 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: stop_status
115
+ 2024-12-31 10:01:46,783 DEBUG SenderThread:1511293 [sender.py:send_request():406] send_request: stop_status
116
+ 2024-12-31 10:01:46,824 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
117
+ 2024-12-31 10:01:47,799 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
118
+ 2024-12-31 10:01:48,799 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
119
+ 2024-12-31 10:01:49,799 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
120
+ 2024-12-31 10:01:49,887 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
121
+ 2024-12-31 10:01:50,057 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: partial_history
122
+ 2024-12-31 10:01:50,058 DEBUG SenderThread:1511293 [sender.py:send():379] send: metric
123
+ 2024-12-31 10:01:50,059 DEBUG SenderThread:1511293 [sender.py:send():379] send: metric
124
+ 2024-12-31 10:01:50,059 DEBUG SenderThread:1511293 [sender.py:send():379] send: metric
125
+ 2024-12-31 10:01:50,059 DEBUG SenderThread:1511293 [sender.py:send():379] send: metric
126
+ 2024-12-31 10:01:50,059 DEBUG SenderThread:1511293 [sender.py:send():379] send: metric
127
+ 2024-12-31 10:01:50,059 DEBUG SenderThread:1511293 [sender.py:send():379] send: history
128
+ 2024-12-31 10:01:50,059 DEBUG SenderThread:1511293 [sender.py:send_request():406] send_request: summary_record
129
+ 2024-12-31 10:01:50,064 INFO SenderThread:1511293 [sender.py:_save_file():1454] saving file wandb-summary.json with policy end
130
+ 2024-12-31 10:01:50,110 INFO Thread-12 :1511293 [dir_watcher.py:_on_file_created():271] file/dir created: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20241231_100054-t2idt8o9/files/wandb-summary.json
131
+ 2024-12-31 10:01:50,800 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
132
+ 2024-12-31 10:01:51,799 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
133
+ 2024-12-31 10:01:52,112 INFO Thread-12 :1511293 [dir_watcher.py:_on_file_modified():288] file/dir modified: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20241231_100054-t2idt8o9/files/output.log
134
+ 2024-12-31 10:01:52,800 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
135
+ 2024-12-31 10:01:53,800 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
136
+ 2024-12-31 10:01:54,800 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
137
+ 2024-12-31 10:01:55,187 DEBUG SystemMonitor:1511293 [system_monitor.py:_start():172] Starting system metrics aggregation loop
138
+ 2024-12-31 10:01:55,190 DEBUG SenderThread:1511293 [sender.py:send():379] send: stats
139
+ 2024-12-31 10:01:55,192 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
140
+ 2024-12-31 10:01:55,800 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
141
+ 2024-12-31 10:01:56,800 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
142
+ 2024-12-31 10:01:57,800 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
143
+ 2024-12-31 10:01:58,800 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
144
+ 2024-12-31 10:01:59,800 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
145
+ 2024-12-31 10:02:00,790 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
146
+ 2024-12-31 10:02:00,800 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
147
+ 2024-12-31 10:02:01,119 INFO Thread-12 :1511293 [dir_watcher.py:_on_file_modified():288] file/dir modified: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20241231_100054-t2idt8o9/files/config.yaml
148
+ 2024-12-31 10:02:01,782 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: stop_status
149
+ 2024-12-31 10:02:01,783 DEBUG SenderThread:1511293 [sender.py:send_request():406] send_request: stop_status
150
+ 2024-12-31 10:02:01,823 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
151
+ 2024-12-31 10:02:02,801 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
152
+ 2024-12-31 10:02:03,801 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
153
+ 2024-12-31 10:02:04,965 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
154
+ 2024-12-31 10:02:05,876 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
155
+ 2024-12-31 10:02:06,784 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
156
+ 2024-12-31 10:02:06,876 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
157
+ 2024-12-31 10:02:07,876 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
158
+ 2024-12-31 10:02:08,876 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
159
+ 2024-12-31 10:02:09,876 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
160
+ 2024-12-31 10:02:10,876 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
161
+ 2024-12-31 10:02:11,784 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
162
+ 2024-12-31 10:02:11,876 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
163
+ 2024-12-31 10:02:12,876 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
164
+ 2024-12-31 10:02:13,877 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
165
+ 2024-12-31 10:02:14,876 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
166
+ 2024-12-31 10:02:15,877 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
167
+ 2024-12-31 10:02:16,783 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: stop_status
168
+ 2024-12-31 10:02:16,783 DEBUG SenderThread:1511293 [sender.py:send_request():406] send_request: stop_status
169
+ 2024-12-31 10:02:16,877 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
170
+ 2024-12-31 10:02:16,903 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
171
+ 2024-12-31 10:02:17,877 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
172
+ 2024-12-31 10:02:18,887 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
173
+ 2024-12-31 10:02:19,880 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
174
+ 2024-12-31 10:02:20,880 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
175
+ 2024-12-31 10:02:21,882 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
176
+ 2024-12-31 10:02:22,792 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
177
+ 2024-12-31 10:02:22,882 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
178
+ 2024-12-31 10:02:23,883 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
179
+ 2024-12-31 10:02:24,883 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
180
+ 2024-12-31 10:02:25,192 DEBUG SenderThread:1511293 [sender.py:send():379] send: stats
181
+ 2024-12-31 10:02:25,461 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: partial_history
182
+ 2024-12-31 10:02:25,463 DEBUG SenderThread:1511293 [sender.py:send():379] send: history
183
+ 2024-12-31 10:02:25,464 DEBUG SenderThread:1511293 [sender.py:send_request():406] send_request: summary_record
184
+ 2024-12-31 10:02:25,465 INFO SenderThread:1511293 [sender.py:_save_file():1454] saving file wandb-summary.json with policy end
185
+ 2024-12-31 10:02:25,883 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
186
+ 2024-12-31 10:02:26,145 INFO Thread-12 :1511293 [dir_watcher.py:_on_file_modified():288] file/dir modified: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20241231_100054-t2idt8o9/files/wandb-summary.json
187
+ 2024-12-31 10:02:26,146 INFO Thread-12 :1511293 [dir_watcher.py:_on_file_modified():288] file/dir modified: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20241231_100054-t2idt8o9/files/output.log
188
+ 2024-12-31 10:02:26,883 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
189
+ 2024-12-31 10:02:27,883 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
190
+ 2024-12-31 10:02:28,788 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
191
+ 2024-12-31 10:02:28,884 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
192
+ 2024-12-31 10:02:29,884 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
193
+ 2024-12-31 10:02:30,883 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
194
+ 2024-12-31 10:02:31,783 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: stop_status
195
+ 2024-12-31 10:02:31,783 DEBUG SenderThread:1511293 [sender.py:send_request():406] send_request: stop_status
196
+ 2024-12-31 10:02:31,883 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
197
+ 2024-12-31 10:02:32,884 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
198
+ 2024-12-31 10:02:33,884 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
199
+ 2024-12-31 10:02:33,938 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
200
+ 2024-12-31 10:02:34,884 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
201
+ 2024-12-31 10:02:35,884 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
202
+ 2024-12-31 10:02:36,884 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
203
+ 2024-12-31 10:02:37,884 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
204
+ 2024-12-31 10:02:38,884 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
205
+ 2024-12-31 10:02:39,787 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
206
+ 2024-12-31 10:02:40,032 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
207
+ 2024-12-31 10:02:41,031 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
208
+ 2024-12-31 10:02:42,031 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
209
+ 2024-12-31 10:02:43,032 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
210
+ 2024-12-31 10:02:44,032 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
211
+ 2024-12-31 10:02:44,787 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
212
+ 2024-12-31 10:02:45,032 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
213
+ 2024-12-31 10:02:46,032 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
214
+ 2024-12-31 10:02:46,783 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: stop_status
215
+ 2024-12-31 10:02:46,783 DEBUG SenderThread:1511293 [sender.py:send_request():406] send_request: stop_status
216
+ 2024-12-31 10:02:47,032 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
217
+ 2024-12-31 10:02:48,032 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
218
+ 2024-12-31 10:02:49,032 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
219
+ 2024-12-31 10:02:49,855 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
220
+ 2024-12-31 10:02:50,032 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
221
+ 2024-12-31 10:02:51,032 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
222
+ 2024-12-31 10:02:52,032 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
223
+ 2024-12-31 10:02:53,032 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
224
+ 2024-12-31 10:02:54,046 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
225
+ 2024-12-31 10:02:55,042 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
226
+ 2024-12-31 10:02:55,206 DEBUG SenderThread:1511293 [sender.py:send():379] send: stats
227
+ 2024-12-31 10:02:55,207 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
228
+ 2024-12-31 10:02:56,042 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
229
+ 2024-12-31 10:02:57,045 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
230
+ 2024-12-31 10:02:58,045 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
231
+ 2024-12-31 10:02:59,045 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
232
+ 2024-12-31 10:03:00,045 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
233
+ 2024-12-31 10:03:00,673 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: partial_history
234
+ 2024-12-31 10:03:00,676 DEBUG SenderThread:1511293 [sender.py:send():379] send: history
235
+ 2024-12-31 10:03:00,677 DEBUG SenderThread:1511293 [sender.py:send_request():406] send_request: summary_record
236
+ 2024-12-31 10:03:00,677 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: status_report
237
+ 2024-12-31 10:03:00,679 INFO SenderThread:1511293 [sender.py:_save_file():1454] saving file wandb-summary.json with policy end
238
+ 2024-12-31 10:03:01,103 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
239
+ 2024-12-31 10:03:01,182 INFO Thread-12 :1511293 [dir_watcher.py:_on_file_modified():288] file/dir modified: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20241231_100054-t2idt8o9/files/wandb-summary.json
240
+ 2024-12-31 10:03:01,783 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: stop_status
241
+ 2024-12-31 10:03:01,784 DEBUG SenderThread:1511293 [sender.py:send_request():406] send_request: stop_status
242
+ 2024-12-31 10:03:02,162 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
243
+ 2024-12-31 10:03:02,183 INFO Thread-12 :1511293 [dir_watcher.py:_on_file_modified():288] file/dir modified: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20241231_100054-t2idt8o9/files/output.log
244
+ 2024-12-31 10:03:03,162 DEBUG HandlerThread:1511293 [handler.py:handle_request():158] handle_request: internal_messages
wandb/run-20241231_100054-t2idt8o9/logs/debug.log ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-12-31 10:00:54,775 INFO MainThread:1510803 [wandb_setup.py:_flush():76] Current SDK version is 0.17.3
2
+ 2024-12-31 10:00:54,775 INFO MainThread:1510803 [wandb_setup.py:_flush():76] Configure stats pid to 1510803
3
+ 2024-12-31 10:00:54,775 INFO MainThread:1510803 [wandb_setup.py:_flush():76] Loading settings from /home/ctpham_umass_edu/.config/wandb/settings
4
+ 2024-12-31 10:00:54,775 INFO MainThread:1510803 [wandb_setup.py:_flush():76] Loading settings from /work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/prolong-final/wandb/settings
5
+ 2024-12-31 10:00:54,775 INFO MainThread:1510803 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'root_dir': '/scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_', 'project': 'prolong', 'api_key': '***REDACTED***', 'mode': 'online'}
6
+ 2024-12-31 10:00:54,775 INFO MainThread:1510803 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-12-31 10:00:54,775 INFO MainThread:1510803 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'prolong-final/finetune.py', 'program_abspath': '/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/prolong-final/finetune.py', 'program': '/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/prolong-final/finetune.py'}
8
+ 2024-12-31 10:00:54,775 INFO MainThread:1510803 [wandb_setup.py:_flush():76] Applying login settings: {}
9
+ 2024-12-31 10:00:54,775 INFO MainThread:1510803 [wandb_init.py:_log_setup():520] Logging user logs to /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20241231_100054-t2idt8o9/logs/debug.log
10
+ 2024-12-31 10:00:54,775 INFO MainThread:1510803 [wandb_init.py:_log_setup():521] Logging internal logs to /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20241231_100054-t2idt8o9/logs/debug-internal.log
11
+ 2024-12-31 10:00:54,776 INFO MainThread:1510803 [wandb_init.py:init():560] calling init triggers
12
+ 2024-12-31 10:00:54,776 INFO MainThread:1510803 [wandb_init.py:init():567] wandb.init called with sweep_config: {}
13
+ config: {}
14
+ 2024-12-31 10:00:54,776 INFO MainThread:1510803 [wandb_init.py:init():610] starting backend
15
+ 2024-12-31 10:00:54,776 INFO MainThread:1510803 [wandb_init.py:init():614] setting up manager
16
+ 2024-12-31 10:00:54,778 INFO MainThread:1510803 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-12-31 10:00:54,783 INFO MainThread:1510803 [wandb_init.py:init():622] backend started and connected
18
+ 2024-12-31 10:00:54,788 INFO MainThread:1510803 [wandb_init.py:init():711] updated telemetry
19
+ 2024-12-31 10:00:54,824 INFO MainThread:1510803 [wandb_init.py:init():744] communicating run to backend with 90.0 second timeout
20
+ 2024-12-31 10:00:55,080 INFO MainThread:1510803 [wandb_run.py:_on_init():2402] communicating current version
21
+ 2024-12-31 10:00:55,143 INFO MainThread:1510803 [wandb_run.py:_on_init():2411] got version response upgrade_message: "wandb version 0.19.1 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
22
+
23
+ 2024-12-31 10:00:55,144 INFO MainThread:1510803 [wandb_init.py:init():795] starting run threads in backend
24
+ 2024-12-31 10:01:01,781 INFO MainThread:1510803 [wandb_run.py:_console_start():2380] atexit reg
25
+ 2024-12-31 10:01:01,781 INFO MainThread:1510803 [wandb_run.py:_redirect():2235] redirect: wrap_raw
26
+ 2024-12-31 10:01:01,781 INFO MainThread:1510803 [wandb_run.py:_redirect():2300] Wrapping output streams.
27
+ 2024-12-31 10:01:01,781 INFO MainThread:1510803 [wandb_run.py:_redirect():2325] Redirects installed.
28
+ 2024-12-31 10:01:01,785 INFO MainThread:1510803 [wandb_init.py:init():838] run started, returning control to user process
29
+ 2024-12-31 10:01:01,787 INFO MainThread:1510803 [wandb_run.py:_config_callback():1382] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 131072, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 500000.0, 'rope_scaling': {'factor': 8.0, 'low_freq_factor': 1.0, 'high_freq_factor': 4.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': 0, 'eos_token_id': [128001, 128008, 128009], 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '/datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/', 'transformers_version': '4.44.2', 'model_type': 'llama', 'output_dir': '/scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_', 'overwrite_output_dir': False, 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 1e-06, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.05, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/runs/Dec31_10-00-19_gpu020', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 200, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 1, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_', 'disable_tqdm': True, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': ['auto_wrap', 'offload'], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': False, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': None, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'eval_use_gather_object': False, 'min_lr_ratio': 0.1, 'cuda_empty_cache': True, 'streaming_dataset': True, 'seq_parallel_size': 8}
30
+ 2024-12-31 10:01:01,790 INFO MainThread:1510803 [wandb_config.py:__setitem__():151] config set model/num_parameters = 1003782656 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x74daa2385f90>>
31
+ 2024-12-31 10:01:01,790 INFO MainThread:1510803 [wandb_run.py:_config_callback():1382] config_cb model/num_parameters 1003782656 None
wandb/run-20241231_100054-t2idt8o9/run-t2idt8o9.wandb ADDED
File without changes
wandb/run-20250101_112144-t9wzg2aq/files/conda-environment.yaml ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: /scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final
2
+ channels:
3
+ - conda-forge
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=conda_forge
6
+ - _openmp_mutex=4.5=2_gnu
7
+ - bzip2=1.0.8=h4bc722e_7
8
+ - ca-certificates=2024.12.14=hbcca054_0
9
+ - ld_impl_linux-64=2.43=h712a8e2_2
10
+ - libffi=3.4.2=h7f98852_5
11
+ - libgcc=14.2.0=h77fa898_1
12
+ - libgcc-ng=14.2.0=h69a702a_1
13
+ - libgomp=14.2.0=h77fa898_1
14
+ - liblzma=5.6.3=hb9d3cd8_1
15
+ - liblzma-devel=5.6.3=hb9d3cd8_1
16
+ - libnsl=2.0.1=hd590300_0
17
+ - libsqlite=3.47.2=hee588c1_0
18
+ - libuuid=2.38.1=h0b41bf4_0
19
+ - libzlib=1.3.1=hb9d3cd8_2
20
+ - ncurses=6.5=he02047a_1
21
+ - openssl=3.4.0=hb9d3cd8_0
22
+ - pip=24.3.1=pyh8b19718_2
23
+ - python=3.10.0=h543edf9_3_cpython
24
+ - readline=8.2=h8228510_1
25
+ - setuptools=75.6.0=pyhff2d567_1
26
+ - sqlite=3.47.2=h9eae976_0
27
+ - tk=8.6.13=noxft_h4845f30_101
28
+ - wheel=0.45.1=pyhd8ed1ab_1
29
+ - xz=5.6.3=hbcc6ac9_1
30
+ - xz-gpl-tools=5.6.3=hbcc6ac9_1
31
+ - xz-tools=5.6.3=hb9d3cd8_1
32
+ - pip:
33
+ - accelerate==0.32.1
34
+ - aiohappyeyeballs==2.4.3
35
+ - aiohttp==3.11.2
36
+ - aioprometheus==23.12.0
37
+ - aiosignal==1.3.1
38
+ - annotated-types==0.7.0
39
+ - anthropic==0.39.0
40
+ - anyio==4.6.2.post1
41
+ - argcomplete==3.5.1
42
+ - arrow==1.3.0
43
+ - async-timeout==5.0.1
44
+ - attrs==24.2.0
45
+ - azure-core==1.32.0
46
+ - azure-identity==1.19.0
47
+ - azure-storage-blob==12.24.0
48
+ - azure-storage-file-datalake==12.18.0
49
+ - backoff==2.2.1
50
+ - bcrypt==4.2.0
51
+ - blobfile==3.0.0
52
+ - boto3==1.35.63
53
+ - botocore==1.35.63
54
+ - brotli==1.1.0
55
+ - cachetools==5.5.0
56
+ - certifi==2024.8.30
57
+ - cffi==1.17.1
58
+ - charset-normalizer==3.4.0
59
+ - circuitbreaker==2.0.0
60
+ - click==8.1.7
61
+ - cloudpickle==3.1.0
62
+ - compressed-tensors==0.8.0
63
+ - contourpy==1.3.1
64
+ - cramjam==2.9.0
65
+ - cryptography==43.0.3
66
+ - cycler==0.12.1
67
+ - datasets==2.20.0
68
+ - debugpy==1.8.11
69
+ - dill==0.3.8
70
+ - diskcache==5.6.3
71
+ - distro==1.9.0
72
+ - docker-pycreds==0.4.0
73
+ - docstring-parser==0.16
74
+ - einops==0.8.0
75
+ - fastapi==0.115.5
76
+ - filelock==3.16.1
77
+ - flash-attn==2.6.1
78
+ - fonttools==4.55.0
79
+ - frozenlist==1.5.0
80
+ - fsspec==2024.5.0
81
+ - gguf==0.10.0
82
+ - gitdb==4.0.11
83
+ - gitpython==3.1.43
84
+ - google-api-core==2.23.0
85
+ - google-auth==2.36.0
86
+ - google-cloud-aiplatform==1.71.1
87
+ - google-cloud-bigquery==3.27.0
88
+ - google-cloud-core==2.4.1
89
+ - google-cloud-resource-manager==1.13.1
90
+ - google-cloud-storage==2.10.0
91
+ - google-crc32c==1.6.0
92
+ - google-resumable-media==2.7.2
93
+ - googleapis-common-protos==1.66.0
94
+ - gql==3.5.0
95
+ - graphql-core==3.2.5
96
+ - grpc-google-iam-v1==0.13.1
97
+ - grpcio==1.68.0
98
+ - grpcio-status==1.62.3
99
+ - h11==0.14.0
100
+ - httpcore==1.0.7
101
+ - httptools==0.6.4
102
+ - httpx==0.27.2
103
+ - huggingface-hub==0.26.2
104
+ - idna==3.10
105
+ - importlib-metadata==8.5.0
106
+ - interegular==0.3.3
107
+ - ipython==8.18.0
108
+ - isodate==0.7.2
109
+ - jedi==0.19.2
110
+ - jinja2==3.1.4
111
+ - jiter==0.7.1
112
+ - jmespath==1.0.1
113
+ - jsonschema==4.23.0
114
+ - jsonschema-specifications==2024.10.1
115
+ - kiwisolver==1.4.7
116
+ - lark==1.2.2
117
+ - llvmlite==0.43.0
118
+ - lm-format-enforcer==0.10.9
119
+ - lxml==5.3.0
120
+ - markdown-it-py==3.0.0
121
+ - markupsafe==3.0.2
122
+ - matplotlib==3.9.2
123
+ - mdurl==0.1.2
124
+ - mosaicml-cli==0.5.34
125
+ - mosaicml-streaming==0.8.1
126
+ - mpmath==1.3.0
127
+ - msal==1.31.1
128
+ - msal-extensions==1.2.0
129
+ - msgpack==1.1.0
130
+ - msgspec==0.18.6
131
+ - multidict==6.1.0
132
+ - multiprocess==0.70.16
133
+ - networkx==3.4.2
134
+ - ninja==1.11.1.1
135
+ - numba==0.60.0
136
+ - numpy==1.26.4
137
+ - nvidia-cublas-cu12==12.1.3.1
138
+ - nvidia-cuda-cupti-cu12==12.1.105
139
+ - nvidia-cuda-nvrtc-cu12==12.1.105
140
+ - nvidia-cuda-runtime-cu12==12.1.105
141
+ - nvidia-cudnn-cu12==9.1.0.70
142
+ - nvidia-cufft-cu12==11.0.2.54
143
+ - nvidia-curand-cu12==10.3.2.106
144
+ - nvidia-cusolver-cu12==11.4.5.107
145
+ - nvidia-cusparse-cu12==12.1.0.106
146
+ - nvidia-ml-py==12.560.30
147
+ - nvidia-nccl-cu12==2.20.5
148
+ - nvidia-nvjitlink-cu12==12.4.127
149
+ - nvidia-nvtx-cu12==12.1.105
150
+ - oci==2.138.1
151
+ - openai==1.54.5
152
+ - opencv-python-headless==4.10.0.84
153
+ - orjson==3.10.11
154
+ - outlines==0.0.46
155
+ - packaging==24.1
156
+ - pandas==2.2.1
157
+ - paramiko==3.5.0
158
+ - partial-json-parser==0.2.1.1.post4
159
+ - pillow==10.4.0
160
+ - portalocker==2.10.1
161
+ - prometheus-client==0.21.0
162
+ - prometheus-fastapi-instrumentator==7.0.0
163
+ - prompt-toolkit==3.0.36
164
+ - propcache==0.2.0
165
+ - proto-plus==1.25.0
166
+ - protobuf==4.25.3
167
+ - py-cpuinfo==9.0.0
168
+ - pyairports==2.1.1
169
+ - pyarrow==18.0.0
170
+ - pyarrow-hotfix==0.6
171
+ - pyasn1==0.6.1
172
+ - pyasn1-modules==0.4.1
173
+ - pycountry==24.6.1
174
+ - pycparser==2.22
175
+ - pycryptodomex==3.21.0
176
+ - pydantic==2.9.2
177
+ - pydantic-core==2.23.4
178
+ - pyjwt==2.10.0
179
+ - pynacl==1.5.0
180
+ - pyopenssl==24.2.1
181
+ - pyparsing==3.2.0
182
+ - python-dateutil==2.9.0
183
+ - python-dotenv==1.0.1
184
+ - python-snappy==0.7.3
185
+ - pytz==2024.2
186
+ - pyyaml==6.0.2
187
+ - quantile-python==1.1
188
+ - questionary==2.0.1
189
+ - ray==2.39.0
190
+ - referencing==0.35.1
191
+ - regex==2023.12.25
192
+ - requests==2.32.3
193
+ - rich==13.9.4
194
+ - rotary-emb==0.5.2
195
+ - rpds-py==0.21.0
196
+ - rsa==4.9
197
+ - ruamel-yaml==0.18.6
198
+ - ruamel-yaml-clib==0.2.12
199
+ - s3transfer==0.10.3
200
+ - safetensors==0.4.5
201
+ - sentencepiece==0.1.99
202
+ - sentry-sdk==2.18.0
203
+ - setproctitle==1.3.4
204
+ - shapely==2.0.6
205
+ - simple-parsing==0.1.6
206
+ - smmap==5.0.1
207
+ - sniffio==1.3.1
208
+ - starlette==0.41.3
209
+ - sympy==1.13.1
210
+ - tiktoken==0.7.0
211
+ - tokenizers==0.19.1
212
+ - torch==2.4.1
213
+ - torchvision==0.19.1
214
+ - tqdm==4.66.4
215
+ - transformers==4.44.2
216
+ - triton==3.0.0
217
+ - types-python-dateutil==2.9.0.20241003
218
+ - tzdata==2024.2
219
+ - urllib3==2.2.3
220
+ - uvicorn==0.32.0
221
+ - uvloop==0.21.0
222
+ - validators==0.34.0
223
+ - vertexai==1.71.1
224
+ - wandb==0.17.3
225
+ - watchfiles==0.24.0
226
+ - websockets==11.0.3
227
+ - xformers==0.0.28.post1
228
+ - xxhash==3.5.0
229
+ - yarl==1.17.2
230
+ - zipp==3.21.0
231
+ - zstandard==0.23.0
232
+ - zstd==1.5.5.1
233
+ prefix: /scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final
wandb/run-20250101_112144-t9wzg2aq/files/config.yaml ADDED
@@ -0,0 +1,713 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ python_version: 3.10.0
7
+ cli_version: 0.17.3
8
+ framework: huggingface
9
+ huggingface_version: 4.44.2
10
+ is_jupyter_run: false
11
+ is_kaggle_kernel: false
12
+ start_time: 1735730504
13
+ t:
14
+ 1:
15
+ - 1
16
+ - 11
17
+ - 41
18
+ - 49
19
+ - 51
20
+ - 55
21
+ - 71
22
+ - 105
23
+ 2:
24
+ - 1
25
+ - 11
26
+ - 41
27
+ - 49
28
+ - 51
29
+ - 55
30
+ - 71
31
+ - 105
32
+ 3:
33
+ - 7
34
+ - 13
35
+ - 19
36
+ - 23
37
+ - 66
38
+ 4: 3.10.0
39
+ 5: 0.17.3
40
+ 6: 4.44.2
41
+ 8:
42
+ - 5
43
+ 9:
44
+ 1: transformers_trainer
45
+ 13: linux-x86_64
46
+ m:
47
+ - 1: train/global_step
48
+ 6:
49
+ - 3
50
+ - 1: train/loss
51
+ 5: 1
52
+ 6:
53
+ - 1
54
+ - 1: train/grad_norm
55
+ 5: 1
56
+ 6:
57
+ - 1
58
+ - 1: train/learning_rate
59
+ 5: 1
60
+ 6:
61
+ - 1
62
+ - 1: train/epoch
63
+ 5: 1
64
+ 6:
65
+ - 1
66
+ - 1: train/num_input_tokens_seen
67
+ 5: 1
68
+ 6:
69
+ - 1
70
+ vocab_size:
71
+ desc: null
72
+ value: 128256
73
+ max_position_embeddings:
74
+ desc: null
75
+ value: 131072
76
+ hidden_size:
77
+ desc: null
78
+ value: 4096
79
+ intermediate_size:
80
+ desc: null
81
+ value: 14336
82
+ num_hidden_layers:
83
+ desc: null
84
+ value: 32
85
+ num_attention_heads:
86
+ desc: null
87
+ value: 32
88
+ num_key_value_heads:
89
+ desc: null
90
+ value: 8
91
+ hidden_act:
92
+ desc: null
93
+ value: silu
94
+ initializer_range:
95
+ desc: null
96
+ value: 0.02
97
+ rms_norm_eps:
98
+ desc: null
99
+ value: 1.0e-05
100
+ pretraining_tp:
101
+ desc: null
102
+ value: 1
103
+ use_cache:
104
+ desc: null
105
+ value: true
106
+ rope_theta:
107
+ desc: null
108
+ value: 500000.0
109
+ rope_scaling:
110
+ desc: null
111
+ value:
112
+ factor: 8.0
113
+ low_freq_factor: 1.0
114
+ high_freq_factor: 4.0
115
+ original_max_position_embeddings: 8192
116
+ rope_type: llama3
117
+ attention_bias:
118
+ desc: null
119
+ value: false
120
+ attention_dropout:
121
+ desc: null
122
+ value: 0.0
123
+ mlp_bias:
124
+ desc: null
125
+ value: false
126
+ return_dict:
127
+ desc: null
128
+ value: true
129
+ output_hidden_states:
130
+ desc: null
131
+ value: false
132
+ output_attentions:
133
+ desc: null
134
+ value: false
135
+ torchscript:
136
+ desc: null
137
+ value: false
138
+ torch_dtype:
139
+ desc: null
140
+ value: bfloat16
141
+ use_bfloat16:
142
+ desc: null
143
+ value: false
144
+ tf_legacy_loss:
145
+ desc: null
146
+ value: false
147
+ pruned_heads:
148
+ desc: null
149
+ value: {}
150
+ tie_word_embeddings:
151
+ desc: null
152
+ value: false
153
+ chunk_size_feed_forward:
154
+ desc: null
155
+ value: 0
156
+ is_encoder_decoder:
157
+ desc: null
158
+ value: false
159
+ is_decoder:
160
+ desc: null
161
+ value: false
162
+ cross_attention_hidden_size:
163
+ desc: null
164
+ value: null
165
+ add_cross_attention:
166
+ desc: null
167
+ value: false
168
+ tie_encoder_decoder:
169
+ desc: null
170
+ value: false
171
+ max_length:
172
+ desc: null
173
+ value: 20
174
+ min_length:
175
+ desc: null
176
+ value: 0
177
+ do_sample:
178
+ desc: null
179
+ value: false
180
+ early_stopping:
181
+ desc: null
182
+ value: false
183
+ num_beams:
184
+ desc: null
185
+ value: 1
186
+ num_beam_groups:
187
+ desc: null
188
+ value: 1
189
+ diversity_penalty:
190
+ desc: null
191
+ value: 0.0
192
+ temperature:
193
+ desc: null
194
+ value: 1.0
195
+ top_k:
196
+ desc: null
197
+ value: 50
198
+ top_p:
199
+ desc: null
200
+ value: 1.0
201
+ typical_p:
202
+ desc: null
203
+ value: 1.0
204
+ repetition_penalty:
205
+ desc: null
206
+ value: 1.0
207
+ length_penalty:
208
+ desc: null
209
+ value: 1.0
210
+ no_repeat_ngram_size:
211
+ desc: null
212
+ value: 0
213
+ encoder_no_repeat_ngram_size:
214
+ desc: null
215
+ value: 0
216
+ bad_words_ids:
217
+ desc: null
218
+ value: null
219
+ num_return_sequences:
220
+ desc: null
221
+ value: 1
222
+ output_scores:
223
+ desc: null
224
+ value: false
225
+ return_dict_in_generate:
226
+ desc: null
227
+ value: false
228
+ forced_bos_token_id:
229
+ desc: null
230
+ value: null
231
+ forced_eos_token_id:
232
+ desc: null
233
+ value: null
234
+ remove_invalid_values:
235
+ desc: null
236
+ value: false
237
+ exponential_decay_length_penalty:
238
+ desc: null
239
+ value: null
240
+ suppress_tokens:
241
+ desc: null
242
+ value: null
243
+ begin_suppress_tokens:
244
+ desc: null
245
+ value: null
246
+ architectures:
247
+ desc: null
248
+ value:
249
+ - LlamaForCausalLM
250
+ finetuning_task:
251
+ desc: null
252
+ value: null
253
+ id2label:
254
+ desc: null
255
+ value:
256
+ '0': LABEL_0
257
+ '1': LABEL_1
258
+ label2id:
259
+ desc: null
260
+ value:
261
+ LABEL_0: 0
262
+ LABEL_1: 1
263
+ tokenizer_class:
264
+ desc: null
265
+ value: null
266
+ prefix:
267
+ desc: null
268
+ value: null
269
+ bos_token_id:
270
+ desc: null
271
+ value: 128000
272
+ pad_token_id:
273
+ desc: null
274
+ value: 0
275
+ eos_token_id:
276
+ desc: null
277
+ value:
278
+ - 128001
279
+ - 128008
280
+ - 128009
281
+ sep_token_id:
282
+ desc: null
283
+ value: null
284
+ decoder_start_token_id:
285
+ desc: null
286
+ value: null
287
+ task_specific_params:
288
+ desc: null
289
+ value: null
290
+ problem_type:
291
+ desc: null
292
+ value: null
293
+ _name_or_path:
294
+ desc: null
295
+ value: /datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/
296
+ transformers_version:
297
+ desc: null
298
+ value: 4.44.2
299
+ model_type:
300
+ desc: null
301
+ value: llama
302
+ output_dir:
303
+ desc: null
304
+ value: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_
305
+ overwrite_output_dir:
306
+ desc: null
307
+ value: false
308
+ do_train:
309
+ desc: null
310
+ value: true
311
+ do_eval:
312
+ desc: null
313
+ value: false
314
+ do_predict:
315
+ desc: null
316
+ value: false
317
+ eval_strategy:
318
+ desc: null
319
+ value: 'no'
320
+ prediction_loss_only:
321
+ desc: null
322
+ value: false
323
+ per_device_train_batch_size:
324
+ desc: null
325
+ value: 1
326
+ per_device_eval_batch_size:
327
+ desc: null
328
+ value: 8
329
+ per_gpu_train_batch_size:
330
+ desc: null
331
+ value: null
332
+ per_gpu_eval_batch_size:
333
+ desc: null
334
+ value: null
335
+ gradient_accumulation_steps:
336
+ desc: null
337
+ value: 2
338
+ eval_accumulation_steps:
339
+ desc: null
340
+ value: null
341
+ eval_delay:
342
+ desc: null
343
+ value: 0
344
+ torch_empty_cache_steps:
345
+ desc: null
346
+ value: null
347
+ learning_rate:
348
+ desc: null
349
+ value: 1.0e-06
350
+ weight_decay:
351
+ desc: null
352
+ value: 0.1
353
+ adam_beta1:
354
+ desc: null
355
+ value: 0.9
356
+ adam_beta2:
357
+ desc: null
358
+ value: 0.95
359
+ adam_epsilon:
360
+ desc: null
361
+ value: 1.0e-08
362
+ max_grad_norm:
363
+ desc: null
364
+ value: 1.0
365
+ num_train_epochs:
366
+ desc: null
367
+ value: 1.0
368
+ max_steps:
369
+ desc: null
370
+ value: -1
371
+ lr_scheduler_type:
372
+ desc: null
373
+ value: cosine
374
+ lr_scheduler_kwargs:
375
+ desc: null
376
+ value: {}
377
+ warmup_ratio:
378
+ desc: null
379
+ value: 0.05
380
+ warmup_steps:
381
+ desc: null
382
+ value: 0
383
+ log_level:
384
+ desc: null
385
+ value: info
386
+ log_level_replica:
387
+ desc: null
388
+ value: warning
389
+ log_on_each_node:
390
+ desc: null
391
+ value: true
392
+ logging_dir:
393
+ desc: null
394
+ value: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/runs/Jan01_11-21-09_gpu017
395
+ logging_strategy:
396
+ desc: null
397
+ value: steps
398
+ logging_first_step:
399
+ desc: null
400
+ value: false
401
+ logging_steps:
402
+ desc: null
403
+ value: 1.0
404
+ logging_nan_inf_filter:
405
+ desc: null
406
+ value: true
407
+ save_strategy:
408
+ desc: null
409
+ value: steps
410
+ save_steps:
411
+ desc: null
412
+ value: 200
413
+ save_total_limit:
414
+ desc: null
415
+ value: null
416
+ save_safetensors:
417
+ desc: null
418
+ value: true
419
+ save_on_each_node:
420
+ desc: null
421
+ value: false
422
+ save_only_model:
423
+ desc: null
424
+ value: false
425
+ restore_callback_states_from_checkpoint:
426
+ desc: null
427
+ value: false
428
+ no_cuda:
429
+ desc: null
430
+ value: false
431
+ use_cpu:
432
+ desc: null
433
+ value: false
434
+ use_mps_device:
435
+ desc: null
436
+ value: false
437
+ seed:
438
+ desc: null
439
+ value: 42
440
+ data_seed:
441
+ desc: null
442
+ value: null
443
+ jit_mode_eval:
444
+ desc: null
445
+ value: false
446
+ use_ipex:
447
+ desc: null
448
+ value: false
449
+ bf16:
450
+ desc: null
451
+ value: true
452
+ fp16:
453
+ desc: null
454
+ value: false
455
+ fp16_opt_level:
456
+ desc: null
457
+ value: O1
458
+ half_precision_backend:
459
+ desc: null
460
+ value: auto
461
+ bf16_full_eval:
462
+ desc: null
463
+ value: false
464
+ fp16_full_eval:
465
+ desc: null
466
+ value: false
467
+ tf32:
468
+ desc: null
469
+ value: null
470
+ local_rank:
471
+ desc: null
472
+ value: 0
473
+ ddp_backend:
474
+ desc: null
475
+ value: null
476
+ tpu_num_cores:
477
+ desc: null
478
+ value: null
479
+ tpu_metrics_debug:
480
+ desc: null
481
+ value: false
482
+ debug:
483
+ desc: null
484
+ value: []
485
+ dataloader_drop_last:
486
+ desc: null
487
+ value: false
488
+ eval_steps:
489
+ desc: null
490
+ value: null
491
+ dataloader_num_workers:
492
+ desc: null
493
+ value: 1
494
+ dataloader_prefetch_factor:
495
+ desc: null
496
+ value: null
497
+ past_index:
498
+ desc: null
499
+ value: -1
500
+ run_name:
501
+ desc: null
502
+ value: _llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_
503
+ disable_tqdm:
504
+ desc: null
505
+ value: true
506
+ remove_unused_columns:
507
+ desc: null
508
+ value: false
509
+ label_names:
510
+ desc: null
511
+ value: null
512
+ load_best_model_at_end:
513
+ desc: null
514
+ value: false
515
+ metric_for_best_model:
516
+ desc: null
517
+ value: null
518
+ greater_is_better:
519
+ desc: null
520
+ value: null
521
+ ignore_data_skip:
522
+ desc: null
523
+ value: false
524
+ fsdp:
525
+ desc: null
526
+ value:
527
+ - auto_wrap
528
+ - offload
529
+ fsdp_min_num_params:
530
+ desc: null
531
+ value: 0
532
+ fsdp_config:
533
+ desc: null
534
+ value:
535
+ min_num_params: 0
536
+ xla: false
537
+ xla_fsdp_v2: false
538
+ xla_fsdp_grad_ckpt: false
539
+ fsdp_transformer_layer_cls_to_wrap:
540
+ desc: null
541
+ value: null
542
+ accelerator_config:
543
+ desc: null
544
+ value:
545
+ split_batches: false
546
+ dispatch_batches: null
547
+ even_batches: true
548
+ use_seedable_sampler: true
549
+ non_blocking: false
550
+ gradient_accumulation_kwargs: null
551
+ deepspeed:
552
+ desc: null
553
+ value: null
554
+ label_smoothing_factor:
555
+ desc: null
556
+ value: 0.0
557
+ optim:
558
+ desc: null
559
+ value: adamw_torch
560
+ optim_args:
561
+ desc: null
562
+ value: null
563
+ adafactor:
564
+ desc: null
565
+ value: false
566
+ group_by_length:
567
+ desc: null
568
+ value: false
569
+ length_column_name:
570
+ desc: null
571
+ value: length
572
+ report_to:
573
+ desc: null
574
+ value:
575
+ - wandb
576
+ ddp_find_unused_parameters:
577
+ desc: null
578
+ value: false
579
+ ddp_bucket_cap_mb:
580
+ desc: null
581
+ value: null
582
+ ddp_broadcast_buffers:
583
+ desc: null
584
+ value: null
585
+ dataloader_pin_memory:
586
+ desc: null
587
+ value: true
588
+ dataloader_persistent_workers:
589
+ desc: null
590
+ value: false
591
+ skip_memory_metrics:
592
+ desc: null
593
+ value: true
594
+ use_legacy_prediction_loop:
595
+ desc: null
596
+ value: false
597
+ push_to_hub:
598
+ desc: null
599
+ value: false
600
+ resume_from_checkpoint:
601
+ desc: null
602
+ value: null
603
+ hub_model_id:
604
+ desc: null
605
+ value: null
606
+ hub_strategy:
607
+ desc: null
608
+ value: every_save
609
+ hub_token:
610
+ desc: null
611
+ value: <HUB_TOKEN>
612
+ hub_private_repo:
613
+ desc: null
614
+ value: false
615
+ hub_always_push:
616
+ desc: null
617
+ value: false
618
+ gradient_checkpointing:
619
+ desc: null
620
+ value: true
621
+ gradient_checkpointing_kwargs:
622
+ desc: null
623
+ value: null
624
+ include_inputs_for_metrics:
625
+ desc: null
626
+ value: false
627
+ eval_do_concat_batches:
628
+ desc: null
629
+ value: true
630
+ fp16_backend:
631
+ desc: null
632
+ value: auto
633
+ evaluation_strategy:
634
+ desc: null
635
+ value: null
636
+ push_to_hub_model_id:
637
+ desc: null
638
+ value: null
639
+ push_to_hub_organization:
640
+ desc: null
641
+ value: null
642
+ push_to_hub_token:
643
+ desc: null
644
+ value: <PUSH_TO_HUB_TOKEN>
645
+ mp_parameters:
646
+ desc: null
647
+ value: ''
648
+ auto_find_batch_size:
649
+ desc: null
650
+ value: false
651
+ full_determinism:
652
+ desc: null
653
+ value: false
654
+ torchdynamo:
655
+ desc: null
656
+ value: null
657
+ ray_scope:
658
+ desc: null
659
+ value: last
660
+ ddp_timeout:
661
+ desc: null
662
+ value: 1800
663
+ torch_compile:
664
+ desc: null
665
+ value: false
666
+ torch_compile_backend:
667
+ desc: null
668
+ value: null
669
+ torch_compile_mode:
670
+ desc: null
671
+ value: null
672
+ dispatch_batches:
673
+ desc: null
674
+ value: null
675
+ split_batches:
676
+ desc: null
677
+ value: null
678
+ include_tokens_per_second:
679
+ desc: null
680
+ value: false
681
+ include_num_input_tokens_seen:
682
+ desc: null
683
+ value: false
684
+ neftune_noise_alpha:
685
+ desc: null
686
+ value: null
687
+ optim_target_modules:
688
+ desc: null
689
+ value: null
690
+ batch_eval_metrics:
691
+ desc: null
692
+ value: false
693
+ eval_on_start:
694
+ desc: null
695
+ value: false
696
+ eval_use_gather_object:
697
+ desc: null
698
+ value: false
699
+ min_lr_ratio:
700
+ desc: null
701
+ value: 0.1
702
+ cuda_empty_cache:
703
+ desc: null
704
+ value: true
705
+ streaming_dataset:
706
+ desc: null
707
+ value: true
708
+ seq_parallel_size:
709
+ desc: null
710
+ value: 8
711
+ model/num_parameters:
712
+ desc: null
713
+ value: 1003782656
wandb/run-20250101_112144-t9wzg2aq/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20250101_112144-t9wzg2aq/files/requirements.txt ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Brotli==1.1.0
2
+ GitPython==3.1.43
3
+ Jinja2==3.1.4
4
+ MarkupSafe==3.0.2
5
+ PyJWT==2.10.0
6
+ PyNaCl==1.5.0
7
+ PyYAML==6.0.2
8
+ Pygments==2.18.0
9
+ accelerate==0.32.1
10
+ aiohappyeyeballs==2.4.3
11
+ aiohttp==3.11.2
12
+ aioprometheus==23.12.0
13
+ aiosignal==1.3.1
14
+ annotated-types==0.7.0
15
+ anthropic==0.39.0
16
+ anyio==4.6.2.post1
17
+ argcomplete==3.5.1
18
+ arrow==1.3.0
19
+ asttokens==2.4.1
20
+ async-timeout==5.0.1
21
+ attrs==24.2.0
22
+ autocommand==2.2.2
23
+ azure-core==1.32.0
24
+ azure-identity==1.19.0
25
+ azure-storage-blob==12.24.0
26
+ azure-storage-file-datalake==12.18.0
27
+ backoff==2.2.1
28
+ backports.tarfile==1.2.0
29
+ bcrypt==4.2.0
30
+ blobfile==3.0.0
31
+ boto3==1.35.63
32
+ botocore==1.35.63
33
+ cachetools==5.5.0
34
+ certifi==2024.8.30
35
+ cffi==1.17.1
36
+ charset-normalizer==3.4.0
37
+ circuitbreaker==2.0.0
38
+ click==8.1.7
39
+ cloudpickle==3.1.0
40
+ comm==0.2.2
41
+ compressed-tensors==0.8.0
42
+ contourpy==1.3.1
43
+ cramjam==2.9.0
44
+ cryptography==43.0.3
45
+ cycler==0.12.1
46
+ datasets==2.20.0
47
+ datatools==0.1
48
+ debugpy==1.8.11
49
+ decorator==5.1.1
50
+ dill==0.3.8
51
+ diskcache==5.6.3
52
+ distro==1.9.0
53
+ docker-pycreds==0.4.0
54
+ docstring_parser==0.16
55
+ einops==0.8.0
56
+ exceptiongroup==1.2.2
57
+ executing==2.1.0
58
+ fastapi==0.115.5
59
+ filelock==3.16.1
60
+ flash-attn==2.6.1
61
+ fonttools==4.55.0
62
+ frozenlist==1.5.0
63
+ fsspec==2024.5.0
64
+ gguf==0.10.0
65
+ gitdb==4.0.11
66
+ google-api-core==2.23.0
67
+ google-auth==2.36.0
68
+ google-cloud-aiplatform==1.71.1
69
+ google-cloud-bigquery==3.27.0
70
+ google-cloud-core==2.4.1
71
+ google-cloud-resource-manager==1.13.1
72
+ google-cloud-storage==2.10.0
73
+ google-crc32c==1.6.0
74
+ google-resumable-media==2.7.2
75
+ googleapis-common-protos==1.66.0
76
+ gql==3.5.0
77
+ graphql-core==3.2.5
78
+ grpc-google-iam-v1==0.13.1
79
+ grpcio-status==1.62.3
80
+ grpcio==1.68.0
81
+ h11==0.14.0
82
+ httpcore==1.0.7
83
+ httptools==0.6.4
84
+ httpx==0.27.2
85
+ huggingface-hub==0.26.2
86
+ idna==3.10
87
+ importlib_metadata==8.0.0
88
+ importlib_metadata==8.5.0
89
+ inflect==7.3.1
90
+ interegular==0.3.3
91
+ ipykernel==6.29.5
92
+ ipython==8.18.0
93
+ isodate==0.7.2
94
+ jaraco.collections==5.1.0
95
+ jaraco.context==5.3.0
96
+ jaraco.functools==4.0.1
97
+ jaraco.text==3.12.1
98
+ jedi==0.19.2
99
+ jiter==0.7.1
100
+ jmespath==1.0.1
101
+ jsonschema-specifications==2024.10.1
102
+ jsonschema==4.23.0
103
+ jupyter_client==8.6.3
104
+ jupyter_core==5.7.2
105
+ kiwisolver==1.4.7
106
+ lark==1.2.2
107
+ llvmlite==0.43.0
108
+ lm-format-enforcer==0.10.9
109
+ lxml==5.3.0
110
+ markdown-it-py==3.0.0
111
+ matplotlib-inline==0.1.7
112
+ matplotlib==3.9.2
113
+ mdurl==0.1.2
114
+ more-itertools==10.3.0
115
+ mosaicml-cli==0.5.34
116
+ mosaicml-streaming==0.8.1
117
+ mpmath==1.3.0
118
+ msal-extensions==1.2.0
119
+ msal==1.31.1
120
+ msgpack==1.1.0
121
+ msgspec==0.18.6
122
+ multidict==6.1.0
123
+ multiprocess==0.70.16
124
+ nest-asyncio==1.6.0
125
+ networkx==3.4.2
126
+ ninja==1.11.1.1
127
+ numba==0.60.0
128
+ numpy==1.26.4
129
+ nvidia-cublas-cu12==12.1.3.1
130
+ nvidia-cuda-cupti-cu12==12.1.105
131
+ nvidia-cuda-nvrtc-cu12==12.1.105
132
+ nvidia-cuda-runtime-cu12==12.1.105
133
+ nvidia-cudnn-cu12==9.1.0.70
134
+ nvidia-cufft-cu12==11.0.2.54
135
+ nvidia-curand-cu12==10.3.2.106
136
+ nvidia-cusolver-cu12==11.4.5.107
137
+ nvidia-cusparse-cu12==12.1.0.106
138
+ nvidia-ml-py==12.560.30
139
+ nvidia-nccl-cu12==2.20.5
140
+ nvidia-nvjitlink-cu12==12.4.127
141
+ nvidia-nvtx-cu12==12.1.105
142
+ oci==2.138.1
143
+ openai==1.54.5
144
+ opencv-python-headless==4.10.0.84
145
+ orjson==3.10.11
146
+ outlines==0.0.46
147
+ packaging==24.1
148
+ packaging==24.2
149
+ pandas==2.2.1
150
+ paramiko==3.5.0
151
+ parso==0.8.4
152
+ partial-json-parser==0.2.1.1.post4
153
+ pexpect==4.9.0
154
+ pillow==10.4.0
155
+ pip==24.3.1
156
+ platformdirs==4.2.2
157
+ platformdirs==4.3.6
158
+ portalocker==2.10.1
159
+ prometheus-fastapi-instrumentator==7.0.0
160
+ prometheus_client==0.21.0
161
+ prompt-toolkit==3.0.36
162
+ propcache==0.2.0
163
+ proto-plus==1.25.0
164
+ protobuf==4.25.3
165
+ psutil==6.1.0
166
+ ptyprocess==0.7.0
167
+ pure_eval==0.2.3
168
+ py-cpuinfo==9.0.0
169
+ pyOpenSSL==24.2.1
170
+ pyairports==2.1.1
171
+ pyarrow-hotfix==0.6
172
+ pyarrow==18.0.0
173
+ pyasn1==0.6.1
174
+ pyasn1_modules==0.4.1
175
+ pycountry==24.6.1
176
+ pycparser==2.22
177
+ pycryptodomex==3.21.0
178
+ pydantic==2.9.2
179
+ pydantic_core==2.23.4
180
+ pyparsing==3.2.0
181
+ python-dateutil==2.9.0
182
+ python-dotenv==1.0.1
183
+ python-snappy==0.7.3
184
+ pytz==2024.2
185
+ pyzmq==26.2.0
186
+ quantile-python==1.1
187
+ questionary==2.0.1
188
+ ray==2.39.0
189
+ referencing==0.35.1
190
+ regex==2023.12.25
191
+ requests==2.32.3
192
+ rich==13.9.4
193
+ rotary-emb==0.5.2
194
+ rpds-py==0.21.0
195
+ rsa==4.9
196
+ ruamel.yaml.clib==0.2.12
197
+ ruamel.yaml==0.18.6
198
+ s3transfer==0.10.3
199
+ safetensors==0.4.5
200
+ sentencepiece==0.1.99
201
+ sentry-sdk==2.18.0
202
+ setproctitle==1.3.4
203
+ setuptools==75.6.0
204
+ shapely==2.0.6
205
+ simple-parsing==0.1.6
206
+ six==1.16.0
207
+ smmap==5.0.1
208
+ sniffio==1.3.1
209
+ stack-data==0.6.3
210
+ starlette==0.41.3
211
+ sympy==1.13.1
212
+ tiktoken==0.7.0
213
+ tokenizers==0.19.1
214
+ tomli==2.0.1
215
+ torch==2.4.1
216
+ torchvision==0.19.1
217
+ tornado==6.4.1
218
+ tqdm==4.66.4
219
+ traitlets==5.14.3
220
+ transformers==4.44.2
221
+ triton==3.0.0
222
+ typeguard==4.3.0
223
+ types-python-dateutil==2.9.0.20241003
224
+ typing_extensions==4.12.2
225
+ typing_extensions==4.12.2
226
+ tzdata==2024.2
227
+ urllib3==2.2.3
228
+ uvicorn==0.32.0
229
+ uvloop==0.21.0
230
+ validators==0.34.0
231
+ vertexai==1.71.1
232
+ wandb==0.17.3
233
+ watchfiles==0.24.0
234
+ wcwidth==0.2.13
235
+ websockets==11.0.3
236
+ wheel==0.43.0
237
+ wheel==0.45.1
238
+ xformers==0.0.28.post1
239
+ xxhash==3.5.0
240
+ yarl==1.17.2
241
+ zipp==3.19.2
242
+ zipp==3.21.0
243
+ zstandard==0.23.0
244
+ zstd==1.5.5.1
wandb/run-20250101_112144-t9wzg2aq/files/wandb-metadata.json ADDED
@@ -0,0 +1,705 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-48-generic-x86_64-with-glibc2.39",
3
+ "python": "3.10.0",
4
+ "heartbeatAt": "2025-01-01T11:21:45.060977",
5
+ "startedAt": "2025-01-01T11:21:44.520147",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--model_family",
10
+ "llama",
11
+ "--apply_instruct_masks",
12
+ "--token_scaled_loss",
13
+ "--seq_parallel_size",
14
+ "8",
15
+ "--report_to",
16
+ "wandb",
17
+ "--do_train",
18
+ "--model_name_or_path",
19
+ "/datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/",
20
+ "--config_name",
21
+ "/datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/",
22
+ "--tokenizer_name",
23
+ "/datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/",
24
+ "--run_name",
25
+ "_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_",
26
+ "--output_dir",
27
+ "/scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_",
28
+ "--config_overrides_json",
29
+ "",
30
+ "--gradient_accumulation_steps",
31
+ "2",
32
+ "--per_device_train_batch_size",
33
+ "1",
34
+ "--bf16",
35
+ "--learning_rate",
36
+ "1e-6",
37
+ "--min_lr_ratio",
38
+ "0.1",
39
+ "--lr_scheduler_type",
40
+ "cosine",
41
+ "--max_grad_norm",
42
+ "1.0",
43
+ "--adam_beta1",
44
+ "0.9",
45
+ "--adam_beta2",
46
+ "0.95",
47
+ "--weight_decay",
48
+ "0.1",
49
+ "--warmup_ratio",
50
+ "0.05",
51
+ "--optim",
52
+ "adamw_torch",
53
+ "--logging_steps",
54
+ "1",
55
+ "--log_level",
56
+ "info",
57
+ "--save_steps",
58
+ "200",
59
+ "--dataloader_num_workers",
60
+ "1",
61
+ "--disable_tqdm",
62
+ "true",
63
+ "--use_fast_tokenizer",
64
+ "false",
65
+ "--remove_unused_columns",
66
+ "false",
67
+ "--ddp_find_unused_parameters",
68
+ "false",
69
+ "--fsdp",
70
+ "auto_wrap offload",
71
+ "--gradient_checkpointing",
72
+ "--tokenized_mds_train",
73
+ "/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/data/ft/bookclaim_balanced_pack_complete",
74
+ "--cuda_empty_cache",
75
+ "--num_train_epochs",
76
+ "1"
77
+ ],
78
+ "state": "running",
79
+ "program": "/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/prolong-final/finetune.py",
80
+ "codePathLocal": "finetune.py",
81
+ "codePath": "prolong-final/finetune.py",
82
+ "git": {
83
+ "remote": "https://github.com/chtmp223/BookGen-dev.git",
84
+ "commit": "0e796521430a0f767be7c4dadba5c2fcaee1f909"
85
+ },
86
+ "email": "[email protected]",
87
+ "root": "/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev",
88
+ "host": "gpu017",
89
+ "username": "ctpham_umass_edu",
90
+ "executable": "/scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final/bin/python3.10",
91
+ "cpu_count": 112,
92
+ "cpu_count_logical": 112,
93
+ "cpu_freq": {
94
+ "current": 975.6290892857141,
95
+ "min": 800.0,
96
+ "max": 3800.0
97
+ },
98
+ "cpu_freq_per_core": [
99
+ {
100
+ "current": 762.481,
101
+ "min": 800.0,
102
+ "max": 3800.0
103
+ },
104
+ {
105
+ "current": 800.0,
106
+ "min": 800.0,
107
+ "max": 3800.0
108
+ },
109
+ {
110
+ "current": 800.0,
111
+ "min": 800.0,
112
+ "max": 3800.0
113
+ },
114
+ {
115
+ "current": 800.0,
116
+ "min": 800.0,
117
+ "max": 3800.0
118
+ },
119
+ {
120
+ "current": 800.0,
121
+ "min": 800.0,
122
+ "max": 3800.0
123
+ },
124
+ {
125
+ "current": 800.0,
126
+ "min": 800.0,
127
+ "max": 3800.0
128
+ },
129
+ {
130
+ "current": 2100.0,
131
+ "min": 800.0,
132
+ "max": 3800.0
133
+ },
134
+ {
135
+ "current": 785.34,
136
+ "min": 800.0,
137
+ "max": 3800.0
138
+ },
139
+ {
140
+ "current": 800.0,
141
+ "min": 800.0,
142
+ "max": 3800.0
143
+ },
144
+ {
145
+ "current": 800.0,
146
+ "min": 800.0,
147
+ "max": 3800.0
148
+ },
149
+ {
150
+ "current": 1300.0,
151
+ "min": 800.0,
152
+ "max": 3800.0
153
+ },
154
+ {
155
+ "current": 800.0,
156
+ "min": 800.0,
157
+ "max": 3800.0
158
+ },
159
+ {
160
+ "current": 800.0,
161
+ "min": 800.0,
162
+ "max": 3800.0
163
+ },
164
+ {
165
+ "current": 800.0,
166
+ "min": 800.0,
167
+ "max": 3800.0
168
+ },
169
+ {
170
+ "current": 800.0,
171
+ "min": 800.0,
172
+ "max": 3800.0
173
+ },
174
+ {
175
+ "current": 800.0,
176
+ "min": 800.0,
177
+ "max": 3800.0
178
+ },
179
+ {
180
+ "current": 1103.96,
181
+ "min": 800.0,
182
+ "max": 3800.0
183
+ },
184
+ {
185
+ "current": 800.0,
186
+ "min": 800.0,
187
+ "max": 3800.0
188
+ },
189
+ {
190
+ "current": 800.0,
191
+ "min": 800.0,
192
+ "max": 3800.0
193
+ },
194
+ {
195
+ "current": 800.0,
196
+ "min": 800.0,
197
+ "max": 3800.0
198
+ },
199
+ {
200
+ "current": 790.077,
201
+ "min": 800.0,
202
+ "max": 3800.0
203
+ },
204
+ {
205
+ "current": 800.0,
206
+ "min": 800.0,
207
+ "max": 3800.0
208
+ },
209
+ {
210
+ "current": 1587.993,
211
+ "min": 800.0,
212
+ "max": 3800.0
213
+ },
214
+ {
215
+ "current": 800.0,
216
+ "min": 800.0,
217
+ "max": 3800.0
218
+ },
219
+ {
220
+ "current": 800.0,
221
+ "min": 800.0,
222
+ "max": 3800.0
223
+ },
224
+ {
225
+ "current": 784.494,
226
+ "min": 800.0,
227
+ "max": 3800.0
228
+ },
229
+ {
230
+ "current": 800.0,
231
+ "min": 800.0,
232
+ "max": 3800.0
233
+ },
234
+ {
235
+ "current": 800.0,
236
+ "min": 800.0,
237
+ "max": 3800.0
238
+ },
239
+ {
240
+ "current": 800.0,
241
+ "min": 800.0,
242
+ "max": 3800.0
243
+ },
244
+ {
245
+ "current": 800.0,
246
+ "min": 800.0,
247
+ "max": 3800.0
248
+ },
249
+ {
250
+ "current": 800.0,
251
+ "min": 800.0,
252
+ "max": 3800.0
253
+ },
254
+ {
255
+ "current": 800.0,
256
+ "min": 800.0,
257
+ "max": 3800.0
258
+ },
259
+ {
260
+ "current": 800.0,
261
+ "min": 800.0,
262
+ "max": 3800.0
263
+ },
264
+ {
265
+ "current": 800.0,
266
+ "min": 800.0,
267
+ "max": 3800.0
268
+ },
269
+ {
270
+ "current": 2547.505,
271
+ "min": 800.0,
272
+ "max": 3800.0
273
+ },
274
+ {
275
+ "current": 800.0,
276
+ "min": 800.0,
277
+ "max": 3800.0
278
+ },
279
+ {
280
+ "current": 2400.0,
281
+ "min": 800.0,
282
+ "max": 3800.0
283
+ },
284
+ {
285
+ "current": 800.0,
286
+ "min": 800.0,
287
+ "max": 3800.0
288
+ },
289
+ {
290
+ "current": 800.0,
291
+ "min": 800.0,
292
+ "max": 3800.0
293
+ },
294
+ {
295
+ "current": 3800.0,
296
+ "min": 800.0,
297
+ "max": 3800.0
298
+ },
299
+ {
300
+ "current": 2000.0,
301
+ "min": 800.0,
302
+ "max": 3800.0
303
+ },
304
+ {
305
+ "current": 800.0,
306
+ "min": 800.0,
307
+ "max": 3800.0
308
+ },
309
+ {
310
+ "current": 2400.0,
311
+ "min": 800.0,
312
+ "max": 3800.0
313
+ },
314
+ {
315
+ "current": 784.425,
316
+ "min": 800.0,
317
+ "max": 3800.0
318
+ },
319
+ {
320
+ "current": 1997.889,
321
+ "min": 800.0,
322
+ "max": 3800.0
323
+ },
324
+ {
325
+ "current": 800.0,
326
+ "min": 800.0,
327
+ "max": 3800.0
328
+ },
329
+ {
330
+ "current": 800.0,
331
+ "min": 800.0,
332
+ "max": 3800.0
333
+ },
334
+ {
335
+ "current": 800.0,
336
+ "min": 800.0,
337
+ "max": 3800.0
338
+ },
339
+ {
340
+ "current": 800.0,
341
+ "min": 800.0,
342
+ "max": 3800.0
343
+ },
344
+ {
345
+ "current": 800.0,
346
+ "min": 800.0,
347
+ "max": 3800.0
348
+ },
349
+ {
350
+ "current": 800.0,
351
+ "min": 800.0,
352
+ "max": 3800.0
353
+ },
354
+ {
355
+ "current": 800.0,
356
+ "min": 800.0,
357
+ "max": 3800.0
358
+ },
359
+ {
360
+ "current": 800.0,
361
+ "min": 800.0,
362
+ "max": 3800.0
363
+ },
364
+ {
365
+ "current": 800.0,
366
+ "min": 800.0,
367
+ "max": 3800.0
368
+ },
369
+ {
370
+ "current": 800.0,
371
+ "min": 800.0,
372
+ "max": 3800.0
373
+ },
374
+ {
375
+ "current": 800.0,
376
+ "min": 800.0,
377
+ "max": 3800.0
378
+ },
379
+ {
380
+ "current": 800.0,
381
+ "min": 800.0,
382
+ "max": 3800.0
383
+ },
384
+ {
385
+ "current": 800.0,
386
+ "min": 800.0,
387
+ "max": 3800.0
388
+ },
389
+ {
390
+ "current": 800.0,
391
+ "min": 800.0,
392
+ "max": 3800.0
393
+ },
394
+ {
395
+ "current": 800.0,
396
+ "min": 800.0,
397
+ "max": 3800.0
398
+ },
399
+ {
400
+ "current": 800.0,
401
+ "min": 800.0,
402
+ "max": 3800.0
403
+ },
404
+ {
405
+ "current": 800.0,
406
+ "min": 800.0,
407
+ "max": 3800.0
408
+ },
409
+ {
410
+ "current": 800.0,
411
+ "min": 800.0,
412
+ "max": 3800.0
413
+ },
414
+ {
415
+ "current": 800.0,
416
+ "min": 800.0,
417
+ "max": 3800.0
418
+ },
419
+ {
420
+ "current": 800.0,
421
+ "min": 800.0,
422
+ "max": 3800.0
423
+ },
424
+ {
425
+ "current": 1300.0,
426
+ "min": 800.0,
427
+ "max": 3800.0
428
+ },
429
+ {
430
+ "current": 800.0,
431
+ "min": 800.0,
432
+ "max": 3800.0
433
+ },
434
+ {
435
+ "current": 800.0,
436
+ "min": 800.0,
437
+ "max": 3800.0
438
+ },
439
+ {
440
+ "current": 782.826,
441
+ "min": 800.0,
442
+ "max": 3800.0
443
+ },
444
+ {
445
+ "current": 1100.0,
446
+ "min": 800.0,
447
+ "max": 3800.0
448
+ },
449
+ {
450
+ "current": 800.0,
451
+ "min": 800.0,
452
+ "max": 3800.0
453
+ },
454
+ {
455
+ "current": 800.0,
456
+ "min": 800.0,
457
+ "max": 3800.0
458
+ },
459
+ {
460
+ "current": 800.0,
461
+ "min": 800.0,
462
+ "max": 3800.0
463
+ },
464
+ {
465
+ "current": 768.56,
466
+ "min": 800.0,
467
+ "max": 3800.0
468
+ },
469
+ {
470
+ "current": 800.0,
471
+ "min": 800.0,
472
+ "max": 3800.0
473
+ },
474
+ {
475
+ "current": 800.0,
476
+ "min": 800.0,
477
+ "max": 3800.0
478
+ },
479
+ {
480
+ "current": 800.0,
481
+ "min": 800.0,
482
+ "max": 3800.0
483
+ },
484
+ {
485
+ "current": 800.0,
486
+ "min": 800.0,
487
+ "max": 3800.0
488
+ },
489
+ {
490
+ "current": 800.0,
491
+ "min": 800.0,
492
+ "max": 3800.0
493
+ },
494
+ {
495
+ "current": 800.0,
496
+ "min": 800.0,
497
+ "max": 3800.0
498
+ },
499
+ {
500
+ "current": 800.0,
501
+ "min": 800.0,
502
+ "max": 3800.0
503
+ },
504
+ {
505
+ "current": 800.0,
506
+ "min": 800.0,
507
+ "max": 3800.0
508
+ },
509
+ {
510
+ "current": 800.0,
511
+ "min": 800.0,
512
+ "max": 3800.0
513
+ },
514
+ {
515
+ "current": 800.0,
516
+ "min": 800.0,
517
+ "max": 3800.0
518
+ },
519
+ {
520
+ "current": 800.0,
521
+ "min": 800.0,
522
+ "max": 3800.0
523
+ },
524
+ {
525
+ "current": 800.0,
526
+ "min": 800.0,
527
+ "max": 3800.0
528
+ },
529
+ {
530
+ "current": 800.0,
531
+ "min": 800.0,
532
+ "max": 3800.0
533
+ },
534
+ {
535
+ "current": 800.0,
536
+ "min": 800.0,
537
+ "max": 3800.0
538
+ },
539
+ {
540
+ "current": 800.0,
541
+ "min": 800.0,
542
+ "max": 3800.0
543
+ },
544
+ {
545
+ "current": 3800.0,
546
+ "min": 800.0,
547
+ "max": 3800.0
548
+ },
549
+ {
550
+ "current": 783.299,
551
+ "min": 800.0,
552
+ "max": 3800.0
553
+ },
554
+ {
555
+ "current": 800.0,
556
+ "min": 800.0,
557
+ "max": 3800.0
558
+ },
559
+ {
560
+ "current": 800.0,
561
+ "min": 800.0,
562
+ "max": 3800.0
563
+ },
564
+ {
565
+ "current": 800.0,
566
+ "min": 800.0,
567
+ "max": 3800.0
568
+ },
569
+ {
570
+ "current": 800.0,
571
+ "min": 800.0,
572
+ "max": 3800.0
573
+ },
574
+ {
575
+ "current": 800.0,
576
+ "min": 800.0,
577
+ "max": 3800.0
578
+ },
579
+ {
580
+ "current": 800.0,
581
+ "min": 800.0,
582
+ "max": 3800.0
583
+ },
584
+ {
585
+ "current": 800.0,
586
+ "min": 800.0,
587
+ "max": 3800.0
588
+ },
589
+ {
590
+ "current": 800.0,
591
+ "min": 800.0,
592
+ "max": 3800.0
593
+ },
594
+ {
595
+ "current": 800.0,
596
+ "min": 800.0,
597
+ "max": 3800.0
598
+ },
599
+ {
600
+ "current": 782.047,
601
+ "min": 800.0,
602
+ "max": 3800.0
603
+ },
604
+ {
605
+ "current": 800.0,
606
+ "min": 800.0,
607
+ "max": 3800.0
608
+ },
609
+ {
610
+ "current": 800.0,
611
+ "min": 800.0,
612
+ "max": 3800.0
613
+ },
614
+ {
615
+ "current": 800.0,
616
+ "min": 800.0,
617
+ "max": 3800.0
618
+ },
619
+ {
620
+ "current": 794.129,
621
+ "min": 800.0,
622
+ "max": 3800.0
623
+ },
624
+ {
625
+ "current": 800.0,
626
+ "min": 800.0,
627
+ "max": 3800.0
628
+ },
629
+ {
630
+ "current": 800.0,
631
+ "min": 800.0,
632
+ "max": 3800.0
633
+ },
634
+ {
635
+ "current": 800.0,
636
+ "min": 800.0,
637
+ "max": 3800.0
638
+ },
639
+ {
640
+ "current": 800.0,
641
+ "min": 800.0,
642
+ "max": 3800.0
643
+ },
644
+ {
645
+ "current": 800.0,
646
+ "min": 800.0,
647
+ "max": 3800.0
648
+ },
649
+ {
650
+ "current": 800.0,
651
+ "min": 800.0,
652
+ "max": 3800.0
653
+ },
654
+ {
655
+ "current": 800.0,
656
+ "min": 800.0,
657
+ "max": 3800.0
658
+ }
659
+ ],
660
+ "disk": {
661
+ "/": {
662
+ "total": 438.487850189209,
663
+ "used": 16.457809448242188
664
+ }
665
+ },
666
+ "gpu": "NVIDIA A100-SXM4-80GB",
667
+ "gpu_count": 8,
668
+ "gpu_devices": [
669
+ {
670
+ "name": "NVIDIA A100-SXM4-80GB",
671
+ "memory_total": 85899345920
672
+ },
673
+ {
674
+ "name": "NVIDIA A100-SXM4-80GB",
675
+ "memory_total": 85899345920
676
+ },
677
+ {
678
+ "name": "NVIDIA A100-SXM4-80GB",
679
+ "memory_total": 85899345920
680
+ },
681
+ {
682
+ "name": "NVIDIA A100-SXM4-80GB",
683
+ "memory_total": 85899345920
684
+ },
685
+ {
686
+ "name": "NVIDIA A100-SXM4-80GB",
687
+ "memory_total": 85899345920
688
+ },
689
+ {
690
+ "name": "NVIDIA A100-SXM4-80GB",
691
+ "memory_total": 85899345920
692
+ },
693
+ {
694
+ "name": "NVIDIA A100-SXM4-80GB",
695
+ "memory_total": 85899345920
696
+ },
697
+ {
698
+ "name": "NVIDIA A100-SXM4-80GB",
699
+ "memory_total": 85899345920
700
+ }
701
+ ],
702
+ "memory": {
703
+ "total": 2015.3286399841309
704
+ }
705
+ }
wandb/run-20250101_112144-t9wzg2aq/files/wandb-summary.json ADDED
File without changes
wandb/run-20250101_112144-t9wzg2aq/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20250101_112144-t9wzg2aq/logs/debug.log ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-01-01 11:21:44,529 INFO MainThread:2689681 [wandb_setup.py:_flush():76] Current SDK version is 0.17.3
2
+ 2025-01-01 11:21:44,529 INFO MainThread:2689681 [wandb_setup.py:_flush():76] Configure stats pid to 2689681
3
+ 2025-01-01 11:21:44,529 INFO MainThread:2689681 [wandb_setup.py:_flush():76] Loading settings from /home/ctpham_umass_edu/.config/wandb/settings
4
+ 2025-01-01 11:21:44,529 INFO MainThread:2689681 [wandb_setup.py:_flush():76] Loading settings from /work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/prolong-final/wandb/settings
5
+ 2025-01-01 11:21:44,530 INFO MainThread:2689681 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'root_dir': '/scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_', 'project': 'prolong', 'api_key': '***REDACTED***', 'mode': 'online'}
6
+ 2025-01-01 11:21:44,530 INFO MainThread:2689681 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2025-01-01 11:21:44,530 INFO MainThread:2689681 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'prolong-final/finetune.py', 'program_abspath': '/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/prolong-final/finetune.py', 'program': '/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/prolong-final/finetune.py'}
8
+ 2025-01-01 11:21:44,530 INFO MainThread:2689681 [wandb_setup.py:_flush():76] Applying login settings: {}
9
+ 2025-01-01 11:21:44,530 INFO MainThread:2689681 [wandb_init.py:_log_setup():520] Logging user logs to /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20250101_112144-t9wzg2aq/logs/debug.log
10
+ 2025-01-01 11:21:44,530 INFO MainThread:2689681 [wandb_init.py:_log_setup():521] Logging internal logs to /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20250101_112144-t9wzg2aq/logs/debug-internal.log
11
+ 2025-01-01 11:21:44,530 INFO MainThread:2689681 [wandb_init.py:init():560] calling init triggers
12
+ 2025-01-01 11:21:44,530 INFO MainThread:2689681 [wandb_init.py:init():567] wandb.init called with sweep_config: {}
13
+ config: {}
14
+ 2025-01-01 11:21:44,530 INFO MainThread:2689681 [wandb_init.py:init():610] starting backend
15
+ 2025-01-01 11:21:44,530 INFO MainThread:2689681 [wandb_init.py:init():614] setting up manager
16
+ 2025-01-01 11:21:44,533 INFO MainThread:2689681 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2025-01-01 11:21:44,536 INFO MainThread:2689681 [wandb_init.py:init():622] backend started and connected
18
+ 2025-01-01 11:21:44,544 INFO MainThread:2689681 [wandb_init.py:init():711] updated telemetry
19
+ 2025-01-01 11:21:44,575 INFO MainThread:2689681 [wandb_init.py:init():744] communicating run to backend with 90.0 second timeout
20
+ 2025-01-01 11:21:44,902 INFO MainThread:2689681 [wandb_run.py:_on_init():2402] communicating current version
21
+ 2025-01-01 11:21:44,965 INFO MainThread:2689681 [wandb_run.py:_on_init():2411] got version response upgrade_message: "wandb version 0.19.1 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
22
+
23
+ 2025-01-01 11:21:44,965 INFO MainThread:2689681 [wandb_init.py:init():795] starting run threads in backend
24
+ 2025-01-01 11:21:51,404 INFO MainThread:2689681 [wandb_run.py:_console_start():2380] atexit reg
25
+ 2025-01-01 11:21:51,404 INFO MainThread:2689681 [wandb_run.py:_redirect():2235] redirect: wrap_raw
26
+ 2025-01-01 11:21:51,404 INFO MainThread:2689681 [wandb_run.py:_redirect():2300] Wrapping output streams.
27
+ 2025-01-01 11:21:51,404 INFO MainThread:2689681 [wandb_run.py:_redirect():2325] Redirects installed.
28
+ 2025-01-01 11:21:51,407 INFO MainThread:2689681 [wandb_init.py:init():838] run started, returning control to user process
29
+ 2025-01-01 11:21:51,408 INFO MainThread:2689681 [wandb_run.py:_config_callback():1382] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 131072, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 500000.0, 'rope_scaling': {'factor': 8.0, 'low_freq_factor': 1.0, 'high_freq_factor': 4.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': 0, 'eos_token_id': [128001, 128008, 128009], 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '/datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/', 'transformers_version': '4.44.2', 'model_type': 'llama', 'output_dir': '/scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_', 'overwrite_output_dir': False, 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 1e-06, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.05, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/runs/Jan01_11-21-09_gpu017', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 200, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 1, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_', 'disable_tqdm': True, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': ['auto_wrap', 'offload'], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': False, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': None, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'eval_use_gather_object': False, 'min_lr_ratio': 0.1, 'cuda_empty_cache': True, 'streaming_dataset': True, 'seq_parallel_size': 8}
30
+ 2025-01-01 11:21:51,411 INFO MainThread:2689681 [wandb_config.py:__setitem__():151] config set model/num_parameters = 1003782656 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x73716eb5e260>>
31
+ 2025-01-01 11:21:51,411 INFO MainThread:2689681 [wandb_run.py:_config_callback():1382] config_cb model/num_parameters 1003782656 None
wandb/run-20250101_112144-t9wzg2aq/run-t9wzg2aq.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24d095c55a3634e35a51c48500ba917124bdc8e697787fb2642a5f1f31ace3f3
3
+ size 6852892
wandb/run-20250102_021927-pw8rud5e/files/conda-environment.yaml ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: /scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final
2
+ channels:
3
+ - conda-forge
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=conda_forge
6
+ - _openmp_mutex=4.5=2_gnu
7
+ - bzip2=1.0.8=h4bc722e_7
8
+ - ca-certificates=2024.12.14=hbcca054_0
9
+ - ld_impl_linux-64=2.43=h712a8e2_2
10
+ - libffi=3.4.2=h7f98852_5
11
+ - libgcc=14.2.0=h77fa898_1
12
+ - libgcc-ng=14.2.0=h69a702a_1
13
+ - libgomp=14.2.0=h77fa898_1
14
+ - liblzma=5.6.3=hb9d3cd8_1
15
+ - liblzma-devel=5.6.3=hb9d3cd8_1
16
+ - libnsl=2.0.1=hd590300_0
17
+ - libsqlite=3.47.2=hee588c1_0
18
+ - libuuid=2.38.1=h0b41bf4_0
19
+ - libzlib=1.3.1=hb9d3cd8_2
20
+ - ncurses=6.5=he02047a_1
21
+ - openssl=3.4.0=hb9d3cd8_0
22
+ - pip=24.3.1=pyh8b19718_2
23
+ - python=3.10.0=h543edf9_3_cpython
24
+ - readline=8.2=h8228510_1
25
+ - setuptools=75.6.0=pyhff2d567_1
26
+ - sqlite=3.47.2=h9eae976_0
27
+ - tk=8.6.13=noxft_h4845f30_101
28
+ - wheel=0.45.1=pyhd8ed1ab_1
29
+ - xz=5.6.3=hbcc6ac9_1
30
+ - xz-gpl-tools=5.6.3=hbcc6ac9_1
31
+ - xz-tools=5.6.3=hb9d3cd8_1
32
+ - pip:
33
+ - accelerate==0.32.1
34
+ - aiohappyeyeballs==2.4.3
35
+ - aiohttp==3.11.2
36
+ - aioprometheus==23.12.0
37
+ - aiosignal==1.3.1
38
+ - annotated-types==0.7.0
39
+ - anthropic==0.39.0
40
+ - anyio==4.6.2.post1
41
+ - argcomplete==3.5.1
42
+ - arrow==1.3.0
43
+ - async-timeout==5.0.1
44
+ - attrs==24.2.0
45
+ - azure-core==1.32.0
46
+ - azure-identity==1.19.0
47
+ - azure-storage-blob==12.24.0
48
+ - azure-storage-file-datalake==12.18.0
49
+ - backoff==2.2.1
50
+ - bcrypt==4.2.0
51
+ - blobfile==3.0.0
52
+ - boto3==1.35.63
53
+ - botocore==1.35.63
54
+ - brotli==1.1.0
55
+ - cachetools==5.5.0
56
+ - certifi==2024.8.30
57
+ - cffi==1.17.1
58
+ - charset-normalizer==3.4.0
59
+ - circuitbreaker==2.0.0
60
+ - click==8.1.7
61
+ - cloudpickle==3.1.0
62
+ - compressed-tensors==0.8.0
63
+ - contourpy==1.3.1
64
+ - cramjam==2.9.0
65
+ - cryptography==43.0.3
66
+ - cycler==0.12.1
67
+ - datasets==2.20.0
68
+ - debugpy==1.8.11
69
+ - dill==0.3.8
70
+ - diskcache==5.6.3
71
+ - distro==1.9.0
72
+ - docker-pycreds==0.4.0
73
+ - docstring-parser==0.16
74
+ - einops==0.8.0
75
+ - fastapi==0.115.5
76
+ - filelock==3.16.1
77
+ - flash-attn==2.6.1
78
+ - fonttools==4.55.0
79
+ - frozenlist==1.5.0
80
+ - fsspec==2024.5.0
81
+ - gguf==0.10.0
82
+ - gitdb==4.0.11
83
+ - gitpython==3.1.43
84
+ - google-api-core==2.23.0
85
+ - google-auth==2.36.0
86
+ - google-cloud-aiplatform==1.71.1
87
+ - google-cloud-bigquery==3.27.0
88
+ - google-cloud-core==2.4.1
89
+ - google-cloud-resource-manager==1.13.1
90
+ - google-cloud-storage==2.10.0
91
+ - google-crc32c==1.6.0
92
+ - google-resumable-media==2.7.2
93
+ - googleapis-common-protos==1.66.0
94
+ - gql==3.5.0
95
+ - graphql-core==3.2.5
96
+ - grpc-google-iam-v1==0.13.1
97
+ - grpcio==1.68.0
98
+ - grpcio-status==1.62.3
99
+ - h11==0.14.0
100
+ - httpcore==1.0.7
101
+ - httptools==0.6.4
102
+ - httpx==0.27.2
103
+ - huggingface-hub==0.26.2
104
+ - idna==3.10
105
+ - importlib-metadata==8.5.0
106
+ - interegular==0.3.3
107
+ - ipython==8.18.0
108
+ - isodate==0.7.2
109
+ - jedi==0.19.2
110
+ - jinja2==3.1.4
111
+ - jiter==0.7.1
112
+ - jmespath==1.0.1
113
+ - jsonschema==4.23.0
114
+ - jsonschema-specifications==2024.10.1
115
+ - kiwisolver==1.4.7
116
+ - lark==1.2.2
117
+ - llvmlite==0.43.0
118
+ - lm-format-enforcer==0.10.9
119
+ - lxml==5.3.0
120
+ - markdown-it-py==3.0.0
121
+ - markupsafe==3.0.2
122
+ - matplotlib==3.9.2
123
+ - mdurl==0.1.2
124
+ - mosaicml-cli==0.5.34
125
+ - mosaicml-streaming==0.8.1
126
+ - mpmath==1.3.0
127
+ - msal==1.31.1
128
+ - msal-extensions==1.2.0
129
+ - msgpack==1.1.0
130
+ - msgspec==0.18.6
131
+ - multidict==6.1.0
132
+ - multiprocess==0.70.16
133
+ - networkx==3.4.2
134
+ - ninja==1.11.1.1
135
+ - numba==0.60.0
136
+ - numpy==1.26.4
137
+ - nvidia-cublas-cu12==12.1.3.1
138
+ - nvidia-cuda-cupti-cu12==12.1.105
139
+ - nvidia-cuda-nvrtc-cu12==12.1.105
140
+ - nvidia-cuda-runtime-cu12==12.1.105
141
+ - nvidia-cudnn-cu12==9.1.0.70
142
+ - nvidia-cufft-cu12==11.0.2.54
143
+ - nvidia-curand-cu12==10.3.2.106
144
+ - nvidia-cusolver-cu12==11.4.5.107
145
+ - nvidia-cusparse-cu12==12.1.0.106
146
+ - nvidia-ml-py==12.560.30
147
+ - nvidia-nccl-cu12==2.20.5
148
+ - nvidia-nvjitlink-cu12==12.4.127
149
+ - nvidia-nvtx-cu12==12.1.105
150
+ - oci==2.138.1
151
+ - openai==1.54.5
152
+ - opencv-python-headless==4.10.0.84
153
+ - orjson==3.10.11
154
+ - outlines==0.0.46
155
+ - packaging==24.1
156
+ - pandas==2.2.1
157
+ - paramiko==3.5.0
158
+ - partial-json-parser==0.2.1.1.post4
159
+ - pillow==10.4.0
160
+ - portalocker==2.10.1
161
+ - prometheus-client==0.21.0
162
+ - prometheus-fastapi-instrumentator==7.0.0
163
+ - prompt-toolkit==3.0.36
164
+ - propcache==0.2.0
165
+ - proto-plus==1.25.0
166
+ - protobuf==4.25.3
167
+ - py-cpuinfo==9.0.0
168
+ - pyairports==2.1.1
169
+ - pyarrow==18.0.0
170
+ - pyarrow-hotfix==0.6
171
+ - pyasn1==0.6.1
172
+ - pyasn1-modules==0.4.1
173
+ - pycountry==24.6.1
174
+ - pycparser==2.22
175
+ - pycryptodomex==3.21.0
176
+ - pydantic==2.9.2
177
+ - pydantic-core==2.23.4
178
+ - pyjwt==2.10.0
179
+ - pynacl==1.5.0
180
+ - pyopenssl==24.2.1
181
+ - pyparsing==3.2.0
182
+ - python-dateutil==2.9.0
183
+ - python-dotenv==1.0.1
184
+ - python-snappy==0.7.3
185
+ - pytz==2024.2
186
+ - pyyaml==6.0.2
187
+ - quantile-python==1.1
188
+ - questionary==2.0.1
189
+ - ray==2.39.0
190
+ - referencing==0.35.1
191
+ - regex==2023.12.25
192
+ - requests==2.32.3
193
+ - rich==13.9.4
194
+ - rotary-emb==0.5.2
195
+ - rpds-py==0.21.0
196
+ - rsa==4.9
197
+ - ruamel-yaml==0.18.6
198
+ - ruamel-yaml-clib==0.2.12
199
+ - s3transfer==0.10.3
200
+ - safetensors==0.4.5
201
+ - sentencepiece==0.1.99
202
+ - sentry-sdk==2.18.0
203
+ - setproctitle==1.3.4
204
+ - shapely==2.0.6
205
+ - simple-parsing==0.1.6
206
+ - smmap==5.0.1
207
+ - sniffio==1.3.1
208
+ - starlette==0.41.3
209
+ - sympy==1.13.1
210
+ - tiktoken==0.7.0
211
+ - tokenizers==0.19.1
212
+ - torch==2.4.1
213
+ - torchvision==0.19.1
214
+ - tqdm==4.66.4
215
+ - transformers==4.44.2
216
+ - triton==3.0.0
217
+ - types-python-dateutil==2.9.0.20241003
218
+ - tzdata==2024.2
219
+ - urllib3==2.2.3
220
+ - uvicorn==0.32.0
221
+ - uvloop==0.21.0
222
+ - validators==0.34.0
223
+ - vertexai==1.71.1
224
+ - wandb==0.17.3
225
+ - watchfiles==0.24.0
226
+ - websockets==11.0.3
227
+ - xformers==0.0.28.post1
228
+ - xxhash==3.5.0
229
+ - yarl==1.17.2
230
+ - zipp==3.21.0
231
+ - zstandard==0.23.0
232
+ - zstd==1.5.5.1
233
+ prefix: /scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final
wandb/run-20250102_021927-pw8rud5e/files/config.yaml ADDED
@@ -0,0 +1,713 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ python_version: 3.10.0
7
+ cli_version: 0.17.3
8
+ framework: huggingface
9
+ huggingface_version: 4.44.2
10
+ is_jupyter_run: false
11
+ is_kaggle_kernel: false
12
+ start_time: 1735784367
13
+ t:
14
+ 1:
15
+ - 1
16
+ - 11
17
+ - 41
18
+ - 49
19
+ - 51
20
+ - 55
21
+ - 71
22
+ - 105
23
+ 2:
24
+ - 1
25
+ - 11
26
+ - 41
27
+ - 49
28
+ - 51
29
+ - 55
30
+ - 71
31
+ - 105
32
+ 3:
33
+ - 7
34
+ - 13
35
+ - 19
36
+ - 23
37
+ - 66
38
+ 4: 3.10.0
39
+ 5: 0.17.3
40
+ 6: 4.44.2
41
+ 8:
42
+ - 5
43
+ 9:
44
+ 1: transformers_trainer
45
+ 13: linux-x86_64
46
+ m:
47
+ - 1: train/global_step
48
+ 6:
49
+ - 3
50
+ - 1: train/loss
51
+ 5: 1
52
+ 6:
53
+ - 1
54
+ - 1: train/grad_norm
55
+ 5: 1
56
+ 6:
57
+ - 1
58
+ - 1: train/learning_rate
59
+ 5: 1
60
+ 6:
61
+ - 1
62
+ - 1: train/epoch
63
+ 5: 1
64
+ 6:
65
+ - 1
66
+ - 1: train/num_input_tokens_seen
67
+ 5: 1
68
+ 6:
69
+ - 1
70
+ vocab_size:
71
+ desc: null
72
+ value: 128256
73
+ max_position_embeddings:
74
+ desc: null
75
+ value: 131072
76
+ hidden_size:
77
+ desc: null
78
+ value: 4096
79
+ intermediate_size:
80
+ desc: null
81
+ value: 14336
82
+ num_hidden_layers:
83
+ desc: null
84
+ value: 32
85
+ num_attention_heads:
86
+ desc: null
87
+ value: 32
88
+ num_key_value_heads:
89
+ desc: null
90
+ value: 8
91
+ hidden_act:
92
+ desc: null
93
+ value: silu
94
+ initializer_range:
95
+ desc: null
96
+ value: 0.02
97
+ rms_norm_eps:
98
+ desc: null
99
+ value: 1.0e-05
100
+ pretraining_tp:
101
+ desc: null
102
+ value: 1
103
+ use_cache:
104
+ desc: null
105
+ value: true
106
+ rope_theta:
107
+ desc: null
108
+ value: 500000.0
109
+ rope_scaling:
110
+ desc: null
111
+ value:
112
+ factor: 8.0
113
+ low_freq_factor: 1.0
114
+ high_freq_factor: 4.0
115
+ original_max_position_embeddings: 8192
116
+ rope_type: llama3
117
+ attention_bias:
118
+ desc: null
119
+ value: false
120
+ attention_dropout:
121
+ desc: null
122
+ value: 0.0
123
+ mlp_bias:
124
+ desc: null
125
+ value: false
126
+ return_dict:
127
+ desc: null
128
+ value: true
129
+ output_hidden_states:
130
+ desc: null
131
+ value: false
132
+ output_attentions:
133
+ desc: null
134
+ value: false
135
+ torchscript:
136
+ desc: null
137
+ value: false
138
+ torch_dtype:
139
+ desc: null
140
+ value: bfloat16
141
+ use_bfloat16:
142
+ desc: null
143
+ value: false
144
+ tf_legacy_loss:
145
+ desc: null
146
+ value: false
147
+ pruned_heads:
148
+ desc: null
149
+ value: {}
150
+ tie_word_embeddings:
151
+ desc: null
152
+ value: false
153
+ chunk_size_feed_forward:
154
+ desc: null
155
+ value: 0
156
+ is_encoder_decoder:
157
+ desc: null
158
+ value: false
159
+ is_decoder:
160
+ desc: null
161
+ value: false
162
+ cross_attention_hidden_size:
163
+ desc: null
164
+ value: null
165
+ add_cross_attention:
166
+ desc: null
167
+ value: false
168
+ tie_encoder_decoder:
169
+ desc: null
170
+ value: false
171
+ max_length:
172
+ desc: null
173
+ value: 20
174
+ min_length:
175
+ desc: null
176
+ value: 0
177
+ do_sample:
178
+ desc: null
179
+ value: false
180
+ early_stopping:
181
+ desc: null
182
+ value: false
183
+ num_beams:
184
+ desc: null
185
+ value: 1
186
+ num_beam_groups:
187
+ desc: null
188
+ value: 1
189
+ diversity_penalty:
190
+ desc: null
191
+ value: 0.0
192
+ temperature:
193
+ desc: null
194
+ value: 1.0
195
+ top_k:
196
+ desc: null
197
+ value: 50
198
+ top_p:
199
+ desc: null
200
+ value: 1.0
201
+ typical_p:
202
+ desc: null
203
+ value: 1.0
204
+ repetition_penalty:
205
+ desc: null
206
+ value: 1.0
207
+ length_penalty:
208
+ desc: null
209
+ value: 1.0
210
+ no_repeat_ngram_size:
211
+ desc: null
212
+ value: 0
213
+ encoder_no_repeat_ngram_size:
214
+ desc: null
215
+ value: 0
216
+ bad_words_ids:
217
+ desc: null
218
+ value: null
219
+ num_return_sequences:
220
+ desc: null
221
+ value: 1
222
+ output_scores:
223
+ desc: null
224
+ value: false
225
+ return_dict_in_generate:
226
+ desc: null
227
+ value: false
228
+ forced_bos_token_id:
229
+ desc: null
230
+ value: null
231
+ forced_eos_token_id:
232
+ desc: null
233
+ value: null
234
+ remove_invalid_values:
235
+ desc: null
236
+ value: false
237
+ exponential_decay_length_penalty:
238
+ desc: null
239
+ value: null
240
+ suppress_tokens:
241
+ desc: null
242
+ value: null
243
+ begin_suppress_tokens:
244
+ desc: null
245
+ value: null
246
+ architectures:
247
+ desc: null
248
+ value:
249
+ - LlamaForCausalLM
250
+ finetuning_task:
251
+ desc: null
252
+ value: null
253
+ id2label:
254
+ desc: null
255
+ value:
256
+ '0': LABEL_0
257
+ '1': LABEL_1
258
+ label2id:
259
+ desc: null
260
+ value:
261
+ LABEL_0: 0
262
+ LABEL_1: 1
263
+ tokenizer_class:
264
+ desc: null
265
+ value: null
266
+ prefix:
267
+ desc: null
268
+ value: null
269
+ bos_token_id:
270
+ desc: null
271
+ value: 128000
272
+ pad_token_id:
273
+ desc: null
274
+ value: 0
275
+ eos_token_id:
276
+ desc: null
277
+ value:
278
+ - 128001
279
+ - 128008
280
+ - 128009
281
+ sep_token_id:
282
+ desc: null
283
+ value: null
284
+ decoder_start_token_id:
285
+ desc: null
286
+ value: null
287
+ task_specific_params:
288
+ desc: null
289
+ value: null
290
+ problem_type:
291
+ desc: null
292
+ value: null
293
+ _name_or_path:
294
+ desc: null
295
+ value: /datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/
296
+ transformers_version:
297
+ desc: null
298
+ value: 4.44.2
299
+ model_type:
300
+ desc: null
301
+ value: llama
302
+ output_dir:
303
+ desc: null
304
+ value: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_
305
+ overwrite_output_dir:
306
+ desc: null
307
+ value: false
308
+ do_train:
309
+ desc: null
310
+ value: true
311
+ do_eval:
312
+ desc: null
313
+ value: false
314
+ do_predict:
315
+ desc: null
316
+ value: false
317
+ eval_strategy:
318
+ desc: null
319
+ value: 'no'
320
+ prediction_loss_only:
321
+ desc: null
322
+ value: false
323
+ per_device_train_batch_size:
324
+ desc: null
325
+ value: 1
326
+ per_device_eval_batch_size:
327
+ desc: null
328
+ value: 8
329
+ per_gpu_train_batch_size:
330
+ desc: null
331
+ value: null
332
+ per_gpu_eval_batch_size:
333
+ desc: null
334
+ value: null
335
+ gradient_accumulation_steps:
336
+ desc: null
337
+ value: 2
338
+ eval_accumulation_steps:
339
+ desc: null
340
+ value: null
341
+ eval_delay:
342
+ desc: null
343
+ value: 0
344
+ torch_empty_cache_steps:
345
+ desc: null
346
+ value: null
347
+ learning_rate:
348
+ desc: null
349
+ value: 1.0e-06
350
+ weight_decay:
351
+ desc: null
352
+ value: 0.1
353
+ adam_beta1:
354
+ desc: null
355
+ value: 0.9
356
+ adam_beta2:
357
+ desc: null
358
+ value: 0.95
359
+ adam_epsilon:
360
+ desc: null
361
+ value: 1.0e-08
362
+ max_grad_norm:
363
+ desc: null
364
+ value: 1.0
365
+ num_train_epochs:
366
+ desc: null
367
+ value: 1.0
368
+ max_steps:
369
+ desc: null
370
+ value: -1
371
+ lr_scheduler_type:
372
+ desc: null
373
+ value: cosine
374
+ lr_scheduler_kwargs:
375
+ desc: null
376
+ value: {}
377
+ warmup_ratio:
378
+ desc: null
379
+ value: 0.05
380
+ warmup_steps:
381
+ desc: null
382
+ value: 0
383
+ log_level:
384
+ desc: null
385
+ value: info
386
+ log_level_replica:
387
+ desc: null
388
+ value: warning
389
+ log_on_each_node:
390
+ desc: null
391
+ value: true
392
+ logging_dir:
393
+ desc: null
394
+ value: /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/runs/Jan02_02-12-22_gpu020
395
+ logging_strategy:
396
+ desc: null
397
+ value: steps
398
+ logging_first_step:
399
+ desc: null
400
+ value: false
401
+ logging_steps:
402
+ desc: null
403
+ value: 1.0
404
+ logging_nan_inf_filter:
405
+ desc: null
406
+ value: true
407
+ save_strategy:
408
+ desc: null
409
+ value: steps
410
+ save_steps:
411
+ desc: null
412
+ value: 200
413
+ save_total_limit:
414
+ desc: null
415
+ value: null
416
+ save_safetensors:
417
+ desc: null
418
+ value: true
419
+ save_on_each_node:
420
+ desc: null
421
+ value: false
422
+ save_only_model:
423
+ desc: null
424
+ value: false
425
+ restore_callback_states_from_checkpoint:
426
+ desc: null
427
+ value: false
428
+ no_cuda:
429
+ desc: null
430
+ value: false
431
+ use_cpu:
432
+ desc: null
433
+ value: false
434
+ use_mps_device:
435
+ desc: null
436
+ value: false
437
+ seed:
438
+ desc: null
439
+ value: 42
440
+ data_seed:
441
+ desc: null
442
+ value: null
443
+ jit_mode_eval:
444
+ desc: null
445
+ value: false
446
+ use_ipex:
447
+ desc: null
448
+ value: false
449
+ bf16:
450
+ desc: null
451
+ value: true
452
+ fp16:
453
+ desc: null
454
+ value: false
455
+ fp16_opt_level:
456
+ desc: null
457
+ value: O1
458
+ half_precision_backend:
459
+ desc: null
460
+ value: auto
461
+ bf16_full_eval:
462
+ desc: null
463
+ value: false
464
+ fp16_full_eval:
465
+ desc: null
466
+ value: false
467
+ tf32:
468
+ desc: null
469
+ value: null
470
+ local_rank:
471
+ desc: null
472
+ value: 0
473
+ ddp_backend:
474
+ desc: null
475
+ value: null
476
+ tpu_num_cores:
477
+ desc: null
478
+ value: null
479
+ tpu_metrics_debug:
480
+ desc: null
481
+ value: false
482
+ debug:
483
+ desc: null
484
+ value: []
485
+ dataloader_drop_last:
486
+ desc: null
487
+ value: false
488
+ eval_steps:
489
+ desc: null
490
+ value: null
491
+ dataloader_num_workers:
492
+ desc: null
493
+ value: 1
494
+ dataloader_prefetch_factor:
495
+ desc: null
496
+ value: null
497
+ past_index:
498
+ desc: null
499
+ value: -1
500
+ run_name:
501
+ desc: null
502
+ value: _llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_
503
+ disable_tqdm:
504
+ desc: null
505
+ value: true
506
+ remove_unused_columns:
507
+ desc: null
508
+ value: false
509
+ label_names:
510
+ desc: null
511
+ value: null
512
+ load_best_model_at_end:
513
+ desc: null
514
+ value: false
515
+ metric_for_best_model:
516
+ desc: null
517
+ value: null
518
+ greater_is_better:
519
+ desc: null
520
+ value: null
521
+ ignore_data_skip:
522
+ desc: null
523
+ value: true
524
+ fsdp:
525
+ desc: null
526
+ value:
527
+ - auto_wrap
528
+ - offload
529
+ fsdp_min_num_params:
530
+ desc: null
531
+ value: 0
532
+ fsdp_config:
533
+ desc: null
534
+ value:
535
+ min_num_params: 0
536
+ xla: false
537
+ xla_fsdp_v2: false
538
+ xla_fsdp_grad_ckpt: false
539
+ fsdp_transformer_layer_cls_to_wrap:
540
+ desc: null
541
+ value: null
542
+ accelerator_config:
543
+ desc: null
544
+ value:
545
+ split_batches: false
546
+ dispatch_batches: null
547
+ even_batches: true
548
+ use_seedable_sampler: true
549
+ non_blocking: false
550
+ gradient_accumulation_kwargs: null
551
+ deepspeed:
552
+ desc: null
553
+ value: null
554
+ label_smoothing_factor:
555
+ desc: null
556
+ value: 0.0
557
+ optim:
558
+ desc: null
559
+ value: adamw_torch
560
+ optim_args:
561
+ desc: null
562
+ value: null
563
+ adafactor:
564
+ desc: null
565
+ value: false
566
+ group_by_length:
567
+ desc: null
568
+ value: false
569
+ length_column_name:
570
+ desc: null
571
+ value: length
572
+ report_to:
573
+ desc: null
574
+ value:
575
+ - wandb
576
+ ddp_find_unused_parameters:
577
+ desc: null
578
+ value: false
579
+ ddp_bucket_cap_mb:
580
+ desc: null
581
+ value: null
582
+ ddp_broadcast_buffers:
583
+ desc: null
584
+ value: null
585
+ dataloader_pin_memory:
586
+ desc: null
587
+ value: true
588
+ dataloader_persistent_workers:
589
+ desc: null
590
+ value: false
591
+ skip_memory_metrics:
592
+ desc: null
593
+ value: true
594
+ use_legacy_prediction_loop:
595
+ desc: null
596
+ value: false
597
+ push_to_hub:
598
+ desc: null
599
+ value: false
600
+ resume_from_checkpoint:
601
+ desc: null
602
+ value: null
603
+ hub_model_id:
604
+ desc: null
605
+ value: null
606
+ hub_strategy:
607
+ desc: null
608
+ value: every_save
609
+ hub_token:
610
+ desc: null
611
+ value: <HUB_TOKEN>
612
+ hub_private_repo:
613
+ desc: null
614
+ value: false
615
+ hub_always_push:
616
+ desc: null
617
+ value: false
618
+ gradient_checkpointing:
619
+ desc: null
620
+ value: true
621
+ gradient_checkpointing_kwargs:
622
+ desc: null
623
+ value: null
624
+ include_inputs_for_metrics:
625
+ desc: null
626
+ value: false
627
+ eval_do_concat_batches:
628
+ desc: null
629
+ value: true
630
+ fp16_backend:
631
+ desc: null
632
+ value: auto
633
+ evaluation_strategy:
634
+ desc: null
635
+ value: null
636
+ push_to_hub_model_id:
637
+ desc: null
638
+ value: null
639
+ push_to_hub_organization:
640
+ desc: null
641
+ value: null
642
+ push_to_hub_token:
643
+ desc: null
644
+ value: <PUSH_TO_HUB_TOKEN>
645
+ mp_parameters:
646
+ desc: null
647
+ value: ''
648
+ auto_find_batch_size:
649
+ desc: null
650
+ value: false
651
+ full_determinism:
652
+ desc: null
653
+ value: false
654
+ torchdynamo:
655
+ desc: null
656
+ value: null
657
+ ray_scope:
658
+ desc: null
659
+ value: last
660
+ ddp_timeout:
661
+ desc: null
662
+ value: 1800
663
+ torch_compile:
664
+ desc: null
665
+ value: false
666
+ torch_compile_backend:
667
+ desc: null
668
+ value: null
669
+ torch_compile_mode:
670
+ desc: null
671
+ value: null
672
+ dispatch_batches:
673
+ desc: null
674
+ value: null
675
+ split_batches:
676
+ desc: null
677
+ value: null
678
+ include_tokens_per_second:
679
+ desc: null
680
+ value: false
681
+ include_num_input_tokens_seen:
682
+ desc: null
683
+ value: false
684
+ neftune_noise_alpha:
685
+ desc: null
686
+ value: null
687
+ optim_target_modules:
688
+ desc: null
689
+ value: null
690
+ batch_eval_metrics:
691
+ desc: null
692
+ value: false
693
+ eval_on_start:
694
+ desc: null
695
+ value: false
696
+ eval_use_gather_object:
697
+ desc: null
698
+ value: false
699
+ min_lr_ratio:
700
+ desc: null
701
+ value: 0.1
702
+ cuda_empty_cache:
703
+ desc: null
704
+ value: true
705
+ streaming_dataset:
706
+ desc: null
707
+ value: true
708
+ seq_parallel_size:
709
+ desc: null
710
+ value: 8
711
+ model/num_parameters:
712
+ desc: null
713
+ value: 1003782656
wandb/run-20250102_021927-pw8rud5e/files/output.log ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final/lib/python3.10/site-packages/transformers/trainer.py:2833: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
2
+ checkpoint_rng_state = torch.load(rng_file)
3
+ /scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final/lib/python3.10/site-packages/torch/utils/checkpoint.py:1399: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
4
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
5
+ /scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final/lib/python3.10/site-packages/torch/utils/checkpoint.py:1399: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
6
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
7
+ [INFO|trainer.py:175] 2025-01-02 02:20:19,436 >> {'loss': 0.5452, 'grad_norm': 21.211387634277344, 'learning_rate': 8.847211348554382e-07, 'epoch': 0.00027122321670735016, 'num_input_tokens_seen': 2099249152, 'completed': '27.15% (1_001 / 3_687)', 'remaining time': '32:49:21', 'throughput': '2979.49', 'gpu_mem_free': '5581MB'}
8
+ [INFO|trainer.py:175] 2025-01-02 02:20:50,985 >> {'loss': 0.494, 'grad_norm': 15.134449005126953, 'learning_rate': 8.844511851055991e-07, 'epoch': 0.0005424464334147003, 'num_input_tokens_seen': 2101346304, 'completed': '27.18% (1_002 / 3_687)', 'remaining time': '28:10:14', 'throughput': '8308.84', 'gpu_mem_free': '5581MB'}
9
+ [INFO|trainer.py:175] 2025-01-02 02:21:21,666 >> {'loss': 0.5585, 'grad_norm': 15.855742454528809, 'learning_rate': 8.841809662021731e-07, 'epoch': 0.0008136696501220504, 'num_input_tokens_seen': 2103443456, 'completed': '27.20% (1_003 / 3_687)', 'remaining time': '26:23:53', 'throughput': '8544.34', 'gpu_mem_free': '5581MB'}
10
+ [INFO|trainer.py:175] 2025-01-02 02:21:52,330 >> {'loss': 0.9896, 'grad_norm': 25.325538635253906, 'learning_rate': 8.839104783626219e-07, 'epoch': 0.0010848928668294006, 'num_input_tokens_seen': 2105540608, 'completed': '27.23% (1_004 / 3_687)', 'remaining time': '25:30:16', 'throughput': '8548.91', 'gpu_mem_free': '5581MB'}
11
+ [INFO|trainer.py:175] 2025-01-02 02:22:22,067 >> {'loss': 0.423, 'grad_norm': 12.955252647399902, 'learning_rate': 8.836397218046239e-07, 'epoch': 0.0013561160835367507, 'num_input_tokens_seen': 2107637760, 'completed': '27.26% (1_005 / 3_687)', 'remaining time': '24:49:36', 'throughput': '8815.37', 'gpu_mem_free': '5581MB'}
12
+ [INFO|trainer.py:175] 2025-01-02 02:22:50,884 >> {'loss': 0.5536, 'grad_norm': 18.2934513092041, 'learning_rate': 8.83368696746074e-07, 'epoch': 0.0016273393002441008, 'num_input_tokens_seen': 2109734912, 'completed': '27.29% (1_006 / 3_687)', 'remaining time': '24:15:29', 'throughput': '9096.90', 'gpu_mem_free': '5581MB'}
13
+ [INFO|trainer.py:175] 2025-01-02 02:23:19,708 >> {'loss': 0.8425, 'grad_norm': 19.734365463256836, 'learning_rate': 8.830974034050824e-07, 'epoch': 0.001898562516951451, 'num_input_tokens_seen': 2111832064, 'completed': '27.31% (1_007 / 3_687)', 'remaining time': '23:51:00', 'throughput': '9094.73', 'gpu_mem_free': '5581MB'}
14
+ [INFO|trainer.py:175] 2025-01-02 02:23:48,416 >> {'loss': 0.2779, 'grad_norm': 9.506952285766602, 'learning_rate': 8.828258419999759e-07, 'epoch': 0.0021697857336588013, 'num_input_tokens_seen': 2113929216, 'completed': '27.34% (1_008 / 3_687)', 'remaining time': '23:31:53', 'throughput': '9131.29', 'gpu_mem_free': '5581MB'}
15
+ [INFO|trainer.py:175] 2025-01-02 02:24:20,107 >> {'loss': 0.3105, 'grad_norm': 10.620465278625488, 'learning_rate': 8.825540127492965e-07, 'epoch': 0.0024410089503661514, 'num_input_tokens_seen': 2116026368, 'completed': '27.37% (1_009 / 3_687)', 'remaining time': '23:31:43', 'throughput': '8271.85', 'gpu_mem_free': '5581MB'}
16
+ [INFO|trainer.py:175] 2025-01-02 02:24:51,346 >> {'loss': 0.775, 'grad_norm': 20.96041488647461, 'learning_rate': 8.822819158718026e-07, 'epoch': 0.0027122321670735015, 'num_input_tokens_seen': 2118123520, 'completed': '27.39% (1_010 / 3_687)', 'remaining time': '23:29:26', 'throughput': '8391.71', 'gpu_mem_free': '5581MB'}
17
+ [INFO|trainer.py:175] 2025-01-02 02:25:21,486 >> {'loss': 0.505, 'grad_norm': 15.075592994689941, 'learning_rate': 8.820095515864669e-07, 'epoch': 0.0029834553837808516, 'num_input_tokens_seen': 2120220672, 'completed': '27.42% (1_011 / 3_687)', 'remaining time': '23:23:02', 'throughput': '8697.53', 'gpu_mem_free': '5581MB'}
18
+ [INFO|trainer.py:175] 2025-01-02 02:25:51,490 >> {'loss': 0.6136, 'grad_norm': 19.47187042236328, 'learning_rate': 8.81736920112478e-07, 'epoch': 0.0032546786004882017, 'num_input_tokens_seen': 2122317824, 'completed': '27.45% (1_012 / 3_687)', 'remaining time': '23:17:07', 'throughput': '8736.74', 'gpu_mem_free': '5581MB'}
19
+ [INFO|trainer.py:175] 2025-01-02 02:26:26,199 >> {'loss': 0.4918, 'grad_norm': 17.397031784057617, 'learning_rate': 8.814640216692391e-07, 'epoch': 0.003525901817195552, 'num_input_tokens_seen': 2124414976, 'completed': '27.47% (1_013 / 3_687)', 'remaining time': '23:28:09', 'throughput': '7552.64', 'gpu_mem_free': '5581MB'}
20
+ [INFO|trainer.py:175] 2025-01-02 02:26:53,599 >> {'loss': 0.8976, 'grad_norm': 23.4683895111084, 'learning_rate': 8.81190856476369e-07, 'epoch': 0.003797125033902902, 'num_input_tokens_seen': 2126512128, 'completed': '27.50% (1_014 / 3_687)', 'remaining time': '23:14:16', 'throughput': '9567.54', 'gpu_mem_free': '5581MB'}
21
+ [INFO|trainer.py:175] 2025-01-02 02:27:22,811 >> {'loss': 0.6695, 'grad_norm': 21.26357078552246, 'learning_rate': 8.809174247537003e-07, 'epoch': 0.0040683482506102524, 'num_input_tokens_seen': 2128609280, 'completed': '27.53% (1_015 / 3_687)', 'remaining time': '23:07:33', 'throughput': '8973.69', 'gpu_mem_free': '5581MB'}
22
+ [INFO|trainer.py:175] 2025-01-02 02:27:54,975 >> {'loss': 0.2329, 'grad_norm': 9.867420196533203, 'learning_rate': 8.806437267212805e-07, 'epoch': 0.0043395714673176026, 'num_input_tokens_seen': 2130706432, 'completed': '27.56% (1_016 / 3_687)', 'remaining time': '23:09:50', 'throughput': '8150.26', 'gpu_mem_free': '5581MB'}
23
+ [INFO|trainer.py:175] 2025-01-02 02:28:28,662 >> {'loss': 0.8958, 'grad_norm': 26.58327293395996, 'learning_rate': 8.803697625993713e-07, 'epoch': 0.004610794684024953, 'num_input_tokens_seen': 2132803584, 'completed': '27.58% (1_017 / 3_687)', 'remaining time': '23:15:46', 'throughput': '7781.76', 'gpu_mem_free': '5581MB'}
24
+ [INFO|trainer.py:175] 2025-01-02 02:28:57,726 >> {'loss': 0.62, 'grad_norm': 15.50483226776123, 'learning_rate': 8.800955326084487e-07, 'epoch': 0.004882017900732303, 'num_input_tokens_seen': 2134900736, 'completed': '27.61% (1_018 / 3_687)', 'remaining time': '23:09:33', 'throughput': '9019.51', 'gpu_mem_free': '5581MB'}
25
+ [INFO|trainer.py:175] 2025-01-02 02:29:26,842 >> {'loss': 0.255, 'grad_norm': 9.97493839263916, 'learning_rate': 8.798210369692025e-07, 'epoch': 0.005153241117439653, 'num_input_tokens_seen': 2136997888, 'completed': '27.64% (1_019 / 3_687)', 'remaining time': '23:04:04', 'throughput': '9003.38', 'gpu_mem_free': '5581MB'}
26
+ [INFO|trainer.py:175] 2025-01-02 02:29:56,337 >> {'loss': 0.5253, 'grad_norm': 13.728446960449219, 'learning_rate': 8.795462759025364e-07, 'epoch': 0.005424464334147003, 'num_input_tokens_seen': 2139095040, 'completed': '27.66% (1_020 / 3_687)', 'remaining time': '22:59:56', 'throughput': '8887.74', 'gpu_mem_free': '5581MB'}
27
+ [INFO|trainer.py:175] 2025-01-02 02:30:28,143 >> {'loss': 0.3554, 'grad_norm': 13.042479515075684, 'learning_rate': 8.792712496295677e-07, 'epoch': 0.005695687550854353, 'num_input_tokens_seen': 2141192192, 'completed': '27.69% (1_021 / 3_687)', 'remaining time': '23:01:01', 'throughput': '8242.02', 'gpu_mem_free': '5581MB'}
28
+ [INFO|trainer.py:175] 2025-01-02 02:30:58,684 >> {'loss': 0.4398, 'grad_norm': 14.499848365783691, 'learning_rate': 8.789959583716268e-07, 'epoch': 0.005966910767561703, 'num_input_tokens_seen': 2143289344, 'completed': '27.72% (1_022 / 3_687)', 'remaining time': '22:59:25', 'throughput': '8583.25', 'gpu_mem_free': '5581MB'}
29
+ [INFO|trainer.py:175] 2025-01-02 02:31:28,795 >> {'loss': 0.3154, 'grad_norm': 12.071355819702148, 'learning_rate': 8.787204023502579e-07, 'epoch': 0.006238133984269053, 'num_input_tokens_seen': 2145386496, 'completed': '27.75% (1_023 / 3_687)', 'remaining time': '22:57:04', 'throughput': '8706.06', 'gpu_mem_free': '5581MB'}
30
+ [INFO|trainer.py:175] 2025-01-02 02:31:56,066 >> {'loss': 0.4772, 'grad_norm': 16.29543113708496, 'learning_rate': 8.78444581787218e-07, 'epoch': 0.006509357200976403, 'num_input_tokens_seen': 2147483648, 'completed': '27.77% (1_024 / 3_687)', 'remaining time': '22:49:38', 'throughput': '9612.56', 'gpu_mem_free': '5581MB'}
31
+ [INFO|trainer.py:175] 2025-01-02 02:32:28,517 >> {'loss': 0.5242, 'grad_norm': 14.60418701171875, 'learning_rate': 8.781684969044769e-07, 'epoch': 0.0067805804176837535, 'num_input_tokens_seen': 2149580800, 'completed': '27.80% (1_025 / 3_687)', 'remaining time': '22:51:56', 'throughput': '8078.20', 'gpu_mem_free': '5581MB'}
32
+ [INFO|trainer.py:175] 2025-01-02 02:33:00,452 >> {'loss': 0.6014, 'grad_norm': 16.060916900634766, 'learning_rate': 8.778921479242173e-07, 'epoch': 0.007051803634391104, 'num_input_tokens_seen': 2151677952, 'completed': '27.83% (1_026 / 3_687)', 'remaining time': '22:53:09', 'throughput': '8208.63', 'gpu_mem_free': '5581MB'}
33
+ [INFO|trainer.py:175] 2025-01-02 02:33:30,816 >> {'loss': 0.2425, 'grad_norm': 11.258378028869629, 'learning_rate': 8.776155350688342e-07, 'epoch': 0.007323026851098454, 'num_input_tokens_seen': 2153775104, 'completed': '27.85% (1_027 / 3_687)', 'remaining time': '22:51:39', 'throughput': '8633.45', 'gpu_mem_free': '5581MB'}
34
+ [INFO|trainer.py:175] 2025-01-02 02:34:00,846 >> {'loss': 0.3129, 'grad_norm': 13.625775337219238, 'learning_rate': 8.773386585609352e-07, 'epoch': 0.007594250067805804, 'num_input_tokens_seen': 2155872256, 'completed': '27.88% (1_028 / 3_687)', 'remaining time': '22:49:42', 'throughput': '8729.27', 'gpu_mem_free': '5581MB'}
35
+ [INFO|trainer.py:175] 2025-01-02 02:34:30,215 >> {'loss': 0.4769, 'grad_norm': 14.161125183105469, 'learning_rate': 8.770615186233398e-07, 'epoch': 0.007865473284513154, 'num_input_tokens_seen': 2157969408, 'completed': '27.91% (1_029 / 3_687)', 'remaining time': '22:46:50', 'throughput': '8926.03', 'gpu_mem_free': '5581MB'}
36
+ [INFO|trainer.py:175] 2025-01-02 02:35:00,692 >> {'loss': 0.4059, 'grad_norm': 14.752073287963867, 'learning_rate': 8.7678411547908e-07, 'epoch': 0.008136696501220505, 'num_input_tokens_seen': 2160066560, 'completed': '27.94% (1_030 / 3_687)', 'remaining time': '22:45:46', 'throughput': '8601.31', 'gpu_mem_free': '5581MB'}
37
+ [INFO|trainer.py:175] 2025-01-02 02:35:29,350 >> {'loss': 0.9232, 'grad_norm': 19.5262451171875, 'learning_rate': 8.76506449351399e-07, 'epoch': 0.008407919717927854, 'num_input_tokens_seen': 2162163712, 'completed': '27.96% (1_031 / 3_687)', 'remaining time': '22:42:08', 'throughput': '9147.26', 'gpu_mem_free': '5581MB'}
38
+ [INFO|trainer.py:175] 2025-01-02 02:36:01,590 >> {'loss': 0.3903, 'grad_norm': 23.627025604248047, 'learning_rate': 8.762285204637522e-07, 'epoch': 0.008679142934635205, 'num_input_tokens_seen': 2164260864, 'completed': '27.99% (1_032 / 3_687)', 'remaining time': '22:43:39', 'throughput': '8131.00', 'gpu_mem_free': '5581MB'}
39
+ [INFO|trainer.py:175] 2025-01-02 02:36:29,479 >> {'loss': 0.9677, 'grad_norm': 22.0424861907959, 'learning_rate': 8.75950329039806e-07, 'epoch': 0.008950366151342554, 'num_input_tokens_seen': 2166358016, 'completed': '28.02% (1_033 / 3_687)', 'remaining time': '22:39:12', 'throughput': '9399.68', 'gpu_mem_free': '5581MB'}
40
+ [INFO|trainer.py:175] 2025-01-02 02:37:00,579 >> {'loss': 0.4666, 'grad_norm': 16.031373977661133, 'learning_rate': 8.756718753034381e-07, 'epoch': 0.009221589368049905, 'num_input_tokens_seen': 2168455168, 'completed': '28.04% (1_034 / 3_687)', 'remaining time': '22:39:11', 'throughput': '8428.90', 'gpu_mem_free': '5581MB'}
41
+ [INFO|trainer.py:175] 2025-01-02 02:37:31,780 >> {'loss': 0.4239, 'grad_norm': 12.060325622558594, 'learning_rate': 8.75393159478738e-07, 'epoch': 0.009492812584757255, 'num_input_tokens_seen': 2170552320, 'completed': '28.07% (1_035 / 3_687)', 'remaining time': '22:39:15', 'throughput': '8401.79', 'gpu_mem_free': '5581MB'}
42
+ [INFO|trainer.py:175] 2025-01-02 02:38:02,110 >> {'loss': 0.5839, 'grad_norm': 17.0775089263916, 'learning_rate': 8.751141817900052e-07, 'epoch': 0.009764035801464606, 'num_input_tokens_seen': 2172649472, 'completed': '28.10% (1_036 / 3_687)', 'remaining time': '22:38:13', 'throughput': '8643.13', 'gpu_mem_free': '5581MB'}
43
+ [INFO|trainer.py:175] 2025-01-02 02:38:35,023 >> {'loss': 0.3156, 'grad_norm': 12.101827621459961, 'learning_rate': 8.748349424617504e-07, 'epoch': 0.010035259018171955, 'num_input_tokens_seen': 2174746624, 'completed': '28.13% (1_037 / 3_687)', 'remaining time': '22:40:18', 'throughput': '7964.89', 'gpu_mem_free': '5581MB'}
44
+ [INFO|trainer.py:175] 2025-01-02 02:39:06,013 >> {'loss': 0.7375, 'grad_norm': 19.348678588867188, 'learning_rate': 8.745554417186946e-07, 'epoch': 0.010306482234879306, 'num_input_tokens_seen': 2176843776, 'completed': '28.15% (1_038 / 3_687)', 'remaining time': '22:40:00', 'throughput': '8458.93', 'gpu_mem_free': '5581MB'}
45
+ [INFO|trainer.py:175] 2025-01-02 02:39:35,175 >> {'loss': 0.7549, 'grad_norm': 20.518571853637695, 'learning_rate': 8.742756797857698e-07, 'epoch': 0.010577705451586655, 'num_input_tokens_seen': 2178940928, 'completed': '28.18% (1_039 / 3_687)', 'remaining time': '22:37:38', 'throughput': '8989.30', 'gpu_mem_free': '5581MB'}
46
+ [INFO|trainer.py:175] 2025-01-02 02:40:07,171 >> {'loss': 0.5175, 'grad_norm': 29.709476470947266, 'learning_rate': 8.739956568881174e-07, 'epoch': 0.010848928668294006, 'num_input_tokens_seen': 2181038080, 'completed': '28.21% (1_040 / 3_687)', 'remaining time': '22:38:29', 'throughput': '8193.02', 'gpu_mem_free': '5581MB'}
47
+ [INFO|trainer.py:175] 2025-01-02 02:40:39,040 >> {'loss': 0.5272, 'grad_norm': 16.73153305053711, 'learning_rate': 8.737153732510894e-07, 'epoch': 0.011120151885001357, 'num_input_tokens_seen': 2183135232, 'completed': '28.23% (1_041 / 3_687)', 'remaining time': '22:39:08', 'throughput': '8225.66', 'gpu_mem_free': '5581MB'}
48
+ [INFO|trainer.py:175] 2025-01-02 02:41:08,249 >> {'loss': 0.4049, 'grad_norm': 11.584559440612793, 'learning_rate': 8.734348291002472e-07, 'epoch': 0.011391375101708706, 'num_input_tokens_seen': 2185232384, 'completed': '28.26% (1_042 / 3_687)', 'remaining time': '22:36:55', 'throughput': '8974.70', 'gpu_mem_free': '5581MB'}
49
+ [INFO|trainer.py:175] 2025-01-02 02:41:39,763 >> {'loss': 0.6874, 'grad_norm': 16.299226760864258, 'learning_rate': 8.731540246613621e-07, 'epoch': 0.011662598318416057, 'num_input_tokens_seen': 2187329536, 'completed': '28.29% (1_043 / 3_687)', 'remaining time': '22:37:10', 'throughput': '8318.21', 'gpu_mem_free': '5581MB'}
50
+ [INFO|trainer.py:175] 2025-01-02 02:42:10,729 >> {'loss': 0.2929, 'grad_norm': 12.125165939331055, 'learning_rate': 8.728729601604149e-07, 'epoch': 0.011933821535123406, 'num_input_tokens_seen': 2189426688, 'completed': '28.32% (1_044 / 3_687)', 'remaining time': '22:36:49', 'throughput': '8466.06', 'gpu_mem_free': '5581MB'}
51
+ [INFO|trainer.py:175] 2025-01-02 02:42:41,551 >> {'loss': 0.5507, 'grad_norm': 15.17818546295166, 'learning_rate': 8.725916358235956e-07, 'epoch': 0.012205044751830757, 'num_input_tokens_seen': 2191523840, 'completed': '28.34% (1_045 / 3_687)', 'remaining time': '22:36:19', 'throughput': '8504.81', 'gpu_mem_free': '5581MB'}
52
+ [INFO|trainer.py:175] 2025-01-02 02:43:08,687 >> {'loss': 0.7183, 'grad_norm': 15.40369987487793, 'learning_rate': 8.723100518773034e-07, 'epoch': 0.012476267968538107, 'num_input_tokens_seen': 2193620992, 'completed': '28.37% (1_046 / 3_687)', 'remaining time': '22:32:18', 'throughput': '9660.13', 'gpu_mem_free': '5581MB'}
53
+ [INFO|trainer.py:175] 2025-01-02 02:43:39,016 >> {'loss': 0.3474, 'grad_norm': 13.198980331420898, 'learning_rate': 8.720282085481463e-07, 'epoch': 0.012747491185245458, 'num_input_tokens_seen': 2195718144, 'completed': '28.40% (1_047 / 3_687)', 'remaining time': '22:31:25', 'throughput': '8643.50', 'gpu_mem_free': '5581MB'}
54
+ [INFO|trainer.py:175] 2025-01-02 02:44:10,472 >> {'loss': 0.4623, 'grad_norm': 13.224248886108398, 'learning_rate': 8.717461060629408e-07, 'epoch': 0.013018714401952807, 'num_input_tokens_seen': 2197815296, 'completed': '28.42% (1_048 / 3_687)', 'remaining time': '22:31:35', 'throughput': '8333.54', 'gpu_mem_free': '5581MB'}
55
+ [INFO|trainer.py:175] 2025-01-02 02:44:41,473 >> {'loss': 0.6044, 'grad_norm': 21.415082931518555, 'learning_rate': 8.714637446487127e-07, 'epoch': 0.013289937618660158, 'num_input_tokens_seen': 2199912448, 'completed': '28.45% (1_049 / 3_687)', 'remaining time': '22:31:19', 'throughput': '8456.40', 'gpu_mem_free': '5581MB'}
56
+ [INFO|trainer.py:175] 2025-01-02 02:45:11,575 >> {'loss': 0.4799, 'grad_norm': 17.107685089111328, 'learning_rate': 8.711811245326955e-07, 'epoch': 0.013561160835367507, 'num_input_tokens_seen': 2202009600, 'completed': '28.48% (1_050 / 3_687)', 'remaining time': '22:30:15', 'throughput': '8708.24', 'gpu_mem_free': '5581MB'}
57
+ [INFO|trainer.py:175] 2025-01-02 02:45:44,712 >> {'loss': 0.5492, 'grad_norm': 16.153474807739258, 'learning_rate': 8.70898245942331e-07, 'epoch': 0.013832384052074858, 'num_input_tokens_seen': 2204106752, 'completed': '28.51% (1_051 / 3_687)', 'remaining time': '22:31:49', 'throughput': '7910.73', 'gpu_mem_free': '5581MB'}
58
+ [INFO|trainer.py:175] 2025-01-02 02:46:13,943 >> {'loss': 0.6168, 'grad_norm': 14.7495756149292, 'learning_rate': 8.706151091052693e-07, 'epoch': 0.014103607268782207, 'num_input_tokens_seen': 2206203904, 'completed': '28.53% (1_052 / 3_687)', 'remaining time': '22:30:00', 'throughput': '8968.00', 'gpu_mem_free': '5581MB'}
59
+ [INFO|trainer.py:175] 2025-01-02 02:46:46,168 >> {'loss': 0.5481, 'grad_norm': 15.120250701904297, 'learning_rate': 8.703317142493681e-07, 'epoch': 0.014374830485489558, 'num_input_tokens_seen': 2208301056, 'completed': '28.56% (1_053 / 3_687)', 'remaining time': '22:30:43', 'throughput': '8134.84', 'gpu_mem_free': '5581MB'}
60
+ [INFO|trainer.py:175] 2025-01-02 02:47:14,235 >> {'loss': 0.6981, 'grad_norm': 18.195310592651367, 'learning_rate': 8.700480616026928e-07, 'epoch': 0.014646053702196907, 'num_input_tokens_seen': 2210398208, 'completed': '28.59% (1_054 / 3_687)', 'remaining time': '22:28:01', 'throughput': '9340.13', 'gpu_mem_free': '5581MB'}
61
+ [INFO|trainer.py:175] 2025-01-02 02:47:45,152 >> {'loss': 0.5543, 'grad_norm': 15.834187507629395, 'learning_rate': 8.697641513935164e-07, 'epoch': 0.014917276918904258, 'num_input_tokens_seen': 2212495360, 'completed': '28.61% (1_055 / 3_687)', 'remaining time': '22:27:40', 'throughput': '8478.84', 'gpu_mem_free': '5581MB'}
62
+ [INFO|trainer.py:175] 2025-01-02 02:48:14,541 >> {'loss': 0.5255, 'grad_norm': 13.692312240600586, 'learning_rate': 8.694799838503186e-07, 'epoch': 0.015188500135611608, 'num_input_tokens_seen': 2214592512, 'completed': '28.64% (1_056 / 3_687)', 'remaining time': '22:26:06', 'throughput': '8919.83', 'gpu_mem_free': '5581MB'}
63
+ [INFO|trainer.py:175] 2025-01-02 02:48:44,733 >> {'loss': 0.3443, 'grad_norm': 11.233868598937988, 'learning_rate': 8.691955592017872e-07, 'epoch': 0.015459723352318959, 'num_input_tokens_seen': 2216689664, 'completed': '28.67% (1_057 / 3_687)', 'remaining time': '22:25:12', 'throughput': '8682.44', 'gpu_mem_free': '5581MB'}
64
+ [INFO|trainer.py:175] 2025-01-02 02:49:10,235 >> {'loss': 0.7494, 'grad_norm': 20.017642974853516, 'learning_rate': 8.689108776768159e-07, 'epoch': 0.015730946569026308, 'num_input_tokens_seen': 2218786816, 'completed': '28.70% (1_058 / 3_687)', 'remaining time': '22:20:47', 'throughput': '10279.44', 'gpu_mem_free': '5581MB'}
65
+ [INFO|trainer.py:175] 2025-01-02 02:49:39,376 >> {'loss': 0.6063, 'grad_norm': 14.512331008911133, 'learning_rate': 8.686259395045056e-07, 'epoch': 0.01600216978573366, 'num_input_tokens_seen': 2220883968, 'completed': '28.72% (1_059 / 3_687)', 'remaining time': '22:19:11', 'throughput': '8995.75', 'gpu_mem_free': '5581MB'}
66
+ [INFO|trainer.py:175] 2025-01-02 02:50:11,041 >> {'loss': 0.4511, 'grad_norm': 15.613311767578125, 'learning_rate': 8.68340744914164e-07, 'epoch': 0.01627339300244101, 'num_input_tokens_seen': 2222981120, 'completed': '28.75% (1_060 / 3_687)', 'remaining time': '22:19:28', 'throughput': '8278.60', 'gpu_mem_free': '5581MB'}
67
+ [INFO|trainer.py:175] 2025-01-02 02:50:41,348 >> {'loss': 0.3019, 'grad_norm': 13.627348899841309, 'learning_rate': 8.680552941353045e-07, 'epoch': 0.01654461621914836, 'num_input_tokens_seen': 2225078272, 'completed': '28.78% (1_061 / 3_687)', 'remaining time': '22:18:45', 'throughput': '8649.63', 'gpu_mem_free': '5581MB'}
68
+ [INFO|trainer.py:175] 2025-01-02 02:51:11,986 >> {'loss': 0.701, 'grad_norm': 16.624229431152344, 'learning_rate': 8.677695873976473e-07, 'epoch': 0.016815839435855708, 'num_input_tokens_seen': 2227175424, 'completed': '28.80% (1_062 / 3_687)', 'remaining time': '22:18:17', 'throughput': '8556.23', 'gpu_mem_free': '5581MB'}
69
+ [INFO|trainer.py:175] 2025-01-02 02:51:42,246 >> {'loss': 0.4085, 'grad_norm': 11.855605125427246, 'learning_rate': 8.674836249311182e-07, 'epoch': 0.01708706265256306, 'num_input_tokens_seen': 2229272576, 'completed': '28.83% (1_063 / 3_687)', 'remaining time': '22:17:32', 'throughput': '8663.36', 'gpu_mem_free': '5581MB'}
70
+ [INFO|trainer.py:175] 2025-01-02 02:52:09,333 >> {'loss': 0.6829, 'grad_norm': 16.31574249267578, 'learning_rate': 8.671974069658488e-07, 'epoch': 0.01735828586927041, 'num_input_tokens_seen': 2231369728, 'completed': '28.86% (1_064 / 3_687)', 'remaining time': '22:14:38', 'throughput': '9677.36', 'gpu_mem_free': '5581MB'}
71
+ [INFO|trainer.py:175] 2025-01-02 02:52:37,866 >> {'loss': 0.603, 'grad_norm': 16.633039474487305, 'learning_rate': 8.669109337321767e-07, 'epoch': 0.01762950908597776, 'num_input_tokens_seen': 2233466880, 'completed': '28.89% (1_065 / 3_687)', 'remaining time': '22:12:47', 'throughput': '9187.40', 'gpu_mem_free': '5581MB'}
72
+ [INFO|trainer.py:175] 2025-01-02 02:53:06,622 >> {'loss': 0.4269, 'grad_norm': 12.263994216918945, 'learning_rate': 8.666242054606444e-07, 'epoch': 0.01790073230268511, 'num_input_tokens_seen': 2235564032, 'completed': '28.91% (1_066 / 3_687)', 'remaining time': '22:11:08', 'throughput': '9116.34', 'gpu_mem_free': '5581MB'}
73
+ [INFO|trainer.py:175] 2025-01-02 02:53:39,113 >> {'loss': 0.5071, 'grad_norm': 13.086897850036621, 'learning_rate': 8.66337222382e-07, 'epoch': 0.01817195551939246, 'num_input_tokens_seen': 2237661184, 'completed': '28.94% (1_067 / 3_687)', 'remaining time': '22:11:56', 'throughput': '8068.37', 'gpu_mem_free': '5581MB'}
74
+ [INFO|trainer.py:175] 2025-01-02 02:54:08,289 >> {'loss': 0.7171, 'grad_norm': 18.24944305419922, 'learning_rate': 8.660499847271965e-07, 'epoch': 0.01844317873609981, 'num_input_tokens_seen': 2239758336, 'completed': '28.97% (1_068 / 3_687)', 'remaining time': '22:10:35', 'throughput': '8984.57', 'gpu_mem_free': '5581MB'}
75
+ [INFO|trainer.py:175] 2025-01-02 02:54:39,778 >> {'loss': 0.3733, 'grad_norm': 12.880998611450195, 'learning_rate': 8.657624927273919e-07, 'epoch': 0.01871440195280716, 'num_input_tokens_seen': 2241855488, 'completed': '28.99% (1_069 / 3_687)', 'remaining time': '22:10:42', 'throughput': '8325.07', 'gpu_mem_free': '5581MB'}
76
+ [INFO|trainer.py:175] 2025-01-02 02:55:09,157 >> {'loss': 0.417, 'grad_norm': 11.316634178161621, 'learning_rate': 8.654747466139488e-07, 'epoch': 0.01898562516951451, 'num_input_tokens_seen': 2243952640, 'completed': '29.02% (1_070 / 3_687)', 'remaining time': '22:09:30', 'throughput': '8922.85', 'gpu_mem_free': '5581MB'}
77
+ [INFO|trainer.py:175] 2025-01-02 02:55:37,977 >> {'loss': 0.515, 'grad_norm': 16.127735137939453, 'learning_rate': 8.651867466184344e-07, 'epoch': 0.01925684838622186, 'num_input_tokens_seen': 2246049792, 'completed': '29.05% (1_071 / 3_687)', 'remaining time': '22:07:58', 'throughput': '9095.89', 'gpu_mem_free': '5581MB'}
78
+ [INFO|trainer.py:175] 2025-01-02 02:56:08,540 >> {'loss': 0.4598, 'grad_norm': 13.648701667785645, 'learning_rate': 8.6489849297262e-07, 'epoch': 0.01952807160292921, 'num_input_tokens_seen': 2248146944, 'completed': '29.08% (1_072 / 3_687)', 'remaining time': '22:07:32', 'throughput': '8577.02', 'gpu_mem_free': '5581MB'}
79
+ [INFO|trainer.py:175] 2025-01-02 02:56:38,440 >> {'loss': 0.4691, 'grad_norm': 18.77680778503418, 'learning_rate': 8.646099859084812e-07, 'epoch': 0.019799294819636562, 'num_input_tokens_seen': 2250244096, 'completed': '29.10% (1_073 / 3_687)', 'remaining time': '22:06:41', 'throughput': '8767.53', 'gpu_mem_free': '5581MB'}
80
+ [INFO|trainer.py:175] 2025-01-02 02:57:07,219 >> {'loss': 0.4609, 'grad_norm': 14.230118751525879, 'learning_rate': 8.643212256581978e-07, 'epoch': 0.02007051803634391, 'num_input_tokens_seen': 2252341248, 'completed': '29.13% (1_074 / 3_687)', 'remaining time': '22:05:12', 'throughput': '9108.71', 'gpu_mem_free': '5581MB'}
81
+ [INFO|trainer.py:175] 2025-01-02 02:57:36,902 >> {'loss': 0.4302, 'grad_norm': 20.32296371459961, 'learning_rate': 8.640322124541525e-07, 'epoch': 0.02034174125305126, 'num_input_tokens_seen': 2254438400, 'completed': '29.16% (1_075 / 3_687)', 'remaining time': '22:04:15', 'throughput': '8831.58', 'gpu_mem_free': '5581MB'}
82
+ [INFO|trainer.py:175] 2025-01-02 02:58:06,360 >> {'loss': 0.7439, 'grad_norm': 19.631397247314453, 'learning_rate': 8.637429465289324e-07, 'epoch': 0.02061296446975861, 'num_input_tokens_seen': 2256535552, 'completed': '29.18% (1_076 / 3_687)', 'remaining time': '22:03:12', 'throughput': '8898.82', 'gpu_mem_free': '5581MB'}
83
+ [INFO|trainer.py:175] 2025-01-02 02:58:34,622 >> {'loss': 0.9491, 'grad_norm': 20.95248794555664, 'learning_rate': 8.63453428115328e-07, 'epoch': 0.020884187686465962, 'num_input_tokens_seen': 2258632704, 'completed': '29.21% (1_077 / 3_687)', 'remaining time': '22:01:29', 'throughput': '9275.54', 'gpu_mem_free': '5581MB'}
84
+ [INFO|trainer.py:175] 2025-01-02 02:59:01,850 >> {'loss': 1.0095, 'grad_norm': 23.594327926635742, 'learning_rate': 8.631636574463321e-07, 'epoch': 0.02115541090317331, 'num_input_tokens_seen': 2260729856, 'completed': '29.24% (1_078 / 3_687)', 'remaining time': '21:59:13', 'throughput': '9627.86', 'gpu_mem_free': '5581MB'}
85
+ [INFO|trainer.py:175] 2025-01-02 02:59:31,093 >> {'loss': 0.4976, 'grad_norm': 13.66223430633545, 'learning_rate': 8.628736347551417e-07, 'epoch': 0.02142663411988066, 'num_input_tokens_seen': 2262827008, 'completed': '29.26% (1_079 / 3_687)', 'remaining time': '21:58:06', 'throughput': '8964.25', 'gpu_mem_free': '5581MB'}
86
+ [INFO|trainer.py:175] 2025-01-02 03:00:04,207 >> {'loss': 0.4426, 'grad_norm': 14.790290832519531, 'learning_rate': 8.625833602751559e-07, 'epoch': 0.021697857336588012, 'num_input_tokens_seen': 2264924160, 'completed': '29.29% (1_080 / 3_687)', 'remaining time': '21:59:07', 'throughput': '7916.41', 'gpu_mem_free': '5581MB'}
87
+ [INFO|trainer.py:175] 2025-01-02 03:00:33,068 >> {'loss': 0.9969, 'grad_norm': 22.70588493347168, 'learning_rate': 8.622928342399762e-07, 'epoch': 0.021969080553295363, 'num_input_tokens_seen': 2267021312, 'completed': '29.32% (1_081 / 3_687)', 'remaining time': '21:57:48', 'throughput': '9082.98', 'gpu_mem_free': '5581MB'}
88
+ [INFO|trainer.py:175] 2025-01-02 03:01:02,657 >> {'loss': 0.6167, 'grad_norm': 15.693807601928711, 'learning_rate': 8.620020568834072e-07, 'epoch': 0.022240303770002714, 'num_input_tokens_seen': 2269118464, 'completed': '29.35% (1_082 / 3_687)', 'remaining time': '21:56:54', 'throughput': '8859.53', 'gpu_mem_free': '5581MB'}
89
+ [INFO|trainer.py:175] 2025-01-02 03:01:31,264 >> {'loss': 0.5203, 'grad_norm': 14.573871612548828, 'learning_rate': 8.617110284394553e-07, 'epoch': 0.02251152698671006, 'num_input_tokens_seen': 2271215616, 'completed': '29.37% (1_083 / 3_687)', 'remaining time': '21:55:30', 'throughput': '9163.43', 'gpu_mem_free': '5581MB'}
90
+ [INFO|trainer.py:175] 2025-01-02 03:02:00,401 >> {'loss': 0.3329, 'grad_norm': 12.033546447753906, 'learning_rate': 8.614197491423293e-07, 'epoch': 0.022782750203417412, 'num_input_tokens_seen': 2273312768, 'completed': '29.40% (1_084 / 3_687)', 'remaining time': '21:54:23', 'throughput': '8997.07', 'gpu_mem_free': '5581MB'}
91
+ [INFO|trainer.py:175] 2025-01-02 03:02:29,023 >> {'loss': 0.8502, 'grad_norm': 21.395675659179688, 'learning_rate': 8.611282192264396e-07, 'epoch': 0.023053973420124763, 'num_input_tokens_seen': 2275409920, 'completed': '29.43% (1_085 / 3_687)', 'remaining time': '21:53:01', 'throughput': '9158.90', 'gpu_mem_free': '5581MB'}
92
+ [INFO|trainer.py:175] 2025-01-02 03:02:58,945 >> {'loss': 0.3252, 'grad_norm': 10.336832046508789, 'learning_rate': 8.608364389263984e-07, 'epoch': 0.023325196636832114, 'num_input_tokens_seen': 2277507072, 'completed': '29.45% (1_086 / 3_687)', 'remaining time': '21:52:20', 'throughput': '8760.86', 'gpu_mem_free': '5581MB'}
93
+ [INFO|trainer.py:175] 2025-01-02 03:03:31,179 >> {'loss': 0.4895, 'grad_norm': 14.438992500305176, 'learning_rate': 8.605444084770192e-07, 'epoch': 0.023596419853539462, 'num_input_tokens_seen': 2279604224, 'completed': '29.48% (1_087 / 3_687)', 'remaining time': '21:52:49', 'throughput': '8132.54', 'gpu_mem_free': '5581MB'}
94
+ [INFO|trainer.py:175] 2025-01-02 03:04:01,815 >> {'loss': 0.4126, 'grad_norm': 12.839303970336914, 'learning_rate': 8.602521281133173e-07, 'epoch': 0.023867643070246813, 'num_input_tokens_seen': 2281701376, 'completed': '29.51% (1_088 / 3_687)', 'remaining time': '21:52:28', 'throughput': '8556.86', 'gpu_mem_free': '5581MB'}
95
+ [INFO|trainer.py:175] 2025-01-02 03:04:32,077 >> {'loss': 0.298, 'grad_norm': 15.832900047302246, 'learning_rate': 8.599595980705085e-07, 'epoch': 0.024138866286954164, 'num_input_tokens_seen': 2283798528, 'completed': '29.54% (1_089 / 3_687)', 'remaining time': '21:51:57', 'throughput': '8662.42', 'gpu_mem_free': '5581MB'}
96
+ [INFO|trainer.py:175] 2025-01-02 03:05:04,652 >> {'loss': 0.5055, 'grad_norm': 16.126102447509766, 'learning_rate': 8.596668185840102e-07, 'epoch': 0.024410089503661515, 'num_input_tokens_seen': 2285895680, 'completed': '29.56% (1_090 / 3_687)', 'remaining time': '21:52:32', 'throughput': '8047.42', 'gpu_mem_free': '5581MB'}
97
+ [INFO|trainer.py:175] 2025-01-02 03:05:32,673 >> {'loss': 0.5144, 'grad_norm': 13.678728103637695, 'learning_rate': 8.593737898894398e-07, 'epoch': 0.024681312720368862, 'num_input_tokens_seen': 2287992832, 'completed': '29.59% (1_091 / 3_687)', 'remaining time': '21:50:56', 'throughput': '9355.21', 'gpu_mem_free': '5581MB'}
98
+ [INFO|trainer.py:175] 2025-01-02 03:05:59,558 >> {'loss': 0.5175, 'grad_norm': 13.563033103942871, 'learning_rate': 8.59080512222616e-07, 'epoch': 0.024952535937076213, 'num_input_tokens_seen': 2290089984, 'completed': '29.62% (1_092 / 3_687)', 'remaining time': '21:48:50', 'throughput': '9750.51', 'gpu_mem_free': '5581MB'}
99
+ [INFO|trainer.py:175] 2025-01-02 03:06:33,080 >> {'loss': 0.662, 'grad_norm': 22.787578582763672, 'learning_rate': 8.587869858195574e-07, 'epoch': 0.025223759153783564, 'num_input_tokens_seen': 2292187136, 'completed': '29.64% (1_093 / 3_687)', 'remaining time': '21:49:50', 'throughput': '7820.14', 'gpu_mem_free': '5581MB'}
100
+ [INFO|trainer.py:175] 2025-01-02 03:07:01,780 >> {'loss': 0.4915, 'grad_norm': 12.561062812805176, 'learning_rate': 8.584932109164826e-07, 'epoch': 0.025494982370490915, 'num_input_tokens_seen': 2294284288, 'completed': '29.67% (1_094 / 3_687)', 'remaining time': '21:48:36', 'throughput': '9133.75', 'gpu_mem_free': '5581MB'}
101
+ [INFO|trainer.py:175] 2025-01-02 03:07:35,254 >> {'loss': 0.3419, 'grad_norm': 10.324007034301758, 'learning_rate': 8.581991877498109e-07, 'epoch': 0.025766205587198263, 'num_input_tokens_seen': 2296381440, 'completed': '29.70% (1_095 / 3_687)', 'remaining time': '21:49:33', 'throughput': '7831.36', 'gpu_mem_free': '5581MB'}
102
+ [INFO|trainer.py:175] 2025-01-02 03:08:03,704 >> {'loss': 0.5506, 'grad_norm': 16.793432235717773, 'learning_rate': 8.579049165561607e-07, 'epoch': 0.026037428803905614, 'num_input_tokens_seen': 2298478592, 'completed': '29.73% (1_096 / 3_687)', 'remaining time': '21:48:12', 'throughput': '9214.26', 'gpu_mem_free': '5581MB'}
103
+ [INFO|trainer.py:175] 2025-01-02 03:08:31,947 >> {'loss': 0.6403, 'grad_norm': 18.08753776550293, 'learning_rate': 8.576103975723502e-07, 'epoch': 0.026308652020612965, 'num_input_tokens_seen': 2300575744, 'completed': '29.75% (1_097 / 3_687)', 'remaining time': '21:46:47', 'throughput': '9281.67', 'gpu_mem_free': '5581MB'}
104
+ [INFO|trainer.py:175] 2025-01-02 03:09:00,488 >> {'loss': 0.5685, 'grad_norm': 14.949502944946289, 'learning_rate': 8.573156310353974e-07, 'epoch': 0.026579875237320316, 'num_input_tokens_seen': 2302672896, 'completed': '29.78% (1_098 / 3_687)', 'remaining time': '21:45:31', 'throughput': '9184.78', 'gpu_mem_free': '5581MB'}
105
+ [INFO|trainer.py:175] 2025-01-02 03:09:27,358 >> {'loss': 0.5644, 'grad_norm': 14.36096477508545, 'learning_rate': 8.570206171825188e-07, 'epoch': 0.026851098454027666, 'num_input_tokens_seen': 2304770048, 'completed': '29.81% (1_099 / 3_687)', 'remaining time': '21:43:32', 'throughput': '9755.85', 'gpu_mem_free': '5581MB'}
106
+ [INFO|trainer.py:175] 2025-01-02 03:09:57,340 >> {'loss': 0.439, 'grad_norm': 13.551562309265137, 'learning_rate': 8.567253562511306e-07, 'epoch': 0.027122321670735014, 'num_input_tokens_seen': 2306867200, 'completed': '29.83% (1_100 / 3_687)', 'remaining time': '21:42:56', 'throughput': '8743.39', 'gpu_mem_free': '5581MB'}
107
+ [INFO|trainer.py:175] 2025-01-02 03:10:26,308 >> {'loss': 0.4937, 'grad_norm': 18.999597549438477, 'learning_rate': 8.564298484788472e-07, 'epoch': 0.027393544887442365, 'num_input_tokens_seen': 2308964352, 'completed': '29.86% (1_101 / 3_687)', 'remaining time': '21:41:54', 'throughput': '9049.65', 'gpu_mem_free': '5581MB'}
108
+ [INFO|trainer.py:175] 2025-01-02 03:10:56,945 >> {'loss': 0.2738, 'grad_norm': 10.107467651367188, 'learning_rate': 8.561340941034825e-07, 'epoch': 0.027664768104149716, 'num_input_tokens_seen': 2311061504, 'completed': '29.89% (1_102 / 3_687)', 'remaining time': '21:41:34', 'throughput': '8556.30', 'gpu_mem_free': '5581MB'}
109
+ [INFO|trainer.py:175] 2025-01-02 03:11:27,991 >> {'loss': 0.6454, 'grad_norm': 19.000905990600586, 'learning_rate': 8.55838093363048e-07, 'epoch': 0.027935991320857067, 'num_input_tokens_seen': 2313158656, 'completed': '29.92% (1_103 / 3_687)', 'remaining time': '21:41:25', 'throughput': '8443.69', 'gpu_mem_free': '5581MB'}
110
+ [INFO|trainer.py:175] 2025-01-02 03:11:59,402 >> {'loss': 0.6641, 'grad_norm': 31.94487190246582, 'learning_rate': 8.555418464957542e-07, 'epoch': 0.028207214537564414, 'num_input_tokens_seen': 2315255808, 'completed': '29.94% (1_104 / 3_687)', 'remaining time': '21:41:25', 'throughput': '8345.60', 'gpu_mem_free': '5581MB'}
111
+ [INFO|trainer.py:175] 2025-01-02 03:12:28,424 >> {'loss': 0.4975, 'grad_norm': 16.993053436279297, 'learning_rate': 8.552453537400089e-07, 'epoch': 0.028478437754271765, 'num_input_tokens_seen': 2317352960, 'completed': '29.97% (1_105 / 3_687)', 'remaining time': '21:40:25', 'throughput': '9032.67', 'gpu_mem_free': '5581MB'}
112
+ [INFO|trainer.py:175] 2025-01-02 03:13:00,207 >> {'loss': 0.7325, 'grad_norm': 21.718708038330078, 'learning_rate': 8.549486153344183e-07, 'epoch': 0.028749660970979116, 'num_input_tokens_seen': 2319450112, 'completed': '30.00% (1_106 / 3_687)', 'remaining time': '21:40:32', 'throughput': '8247.94', 'gpu_mem_free': '5581MB'}
113
+ [INFO|trainer.py:175] 2025-01-02 03:13:27,379 >> {'loss': 0.5915, 'grad_norm': 16.96457862854004, 'learning_rate': 8.546516315177863e-07, 'epoch': 0.029020884187686467, 'num_input_tokens_seen': 2321547264, 'completed': '30.02% (1_107 / 3_687)', 'remaining time': '21:38:48', 'throughput': '9647.76', 'gpu_mem_free': '5581MB'}
114
+ [INFO|trainer.py:175] 2025-01-02 03:13:58,299 >> {'loss': 0.5334, 'grad_norm': 15.942675590515137, 'learning_rate': 8.543544025291143e-07, 'epoch': 0.029292107404393815, 'num_input_tokens_seen': 2323644416, 'completed': '30.05% (1_108 / 3_687)', 'remaining time': '21:38:35', 'throughput': '8478.04', 'gpu_mem_free': '5581MB'}
115
+ [INFO|trainer.py:175] 2025-01-02 03:14:28,559 >> {'loss': 0.5069, 'grad_norm': 14.432857513427734, 'learning_rate': 8.540569286076004e-07, 'epoch': 0.029563330621101166, 'num_input_tokens_seen': 2325741568, 'completed': '30.08% (1_109 / 3_687)', 'remaining time': '21:38:06', 'throughput': '8663.14', 'gpu_mem_free': '5581MB'}
116
+ [INFO|trainer.py:175] 2025-01-02 03:14:59,622 >> {'loss': 0.6163, 'grad_norm': 16.693790435791016, 'learning_rate': 8.537592099926407e-07, 'epoch': 0.029834553837808517, 'num_input_tokens_seen': 2327838720, 'completed': '30.11% (1_110 / 3_687)', 'remaining time': '21:37:56', 'throughput': '8439.12', 'gpu_mem_free': '5581MB'}
117
+ [INFO|trainer.py:175] 2025-01-02 03:15:31,233 >> {'loss': 0.3806, 'grad_norm': 13.320590019226074, 'learning_rate': 8.534612469238278e-07, 'epoch': 0.030105777054515868, 'num_input_tokens_seen': 2329935872, 'completed': '30.13% (1_111 / 3_687)', 'remaining time': '21:37:58', 'throughput': '8292.62', 'gpu_mem_free': '5581MB'}
118
+ [INFO|trainer.py:175] 2025-01-02 03:16:01,388 >> {'loss': 0.6614, 'grad_norm': 20.88117790222168, 'learning_rate': 8.531630396409507e-07, 'epoch': 0.030377000271223215, 'num_input_tokens_seen': 2332033024, 'completed': '30.16% (1_112 / 3_687)', 'remaining time': '21:37:26', 'throughput': '8693.44', 'gpu_mem_free': '5581MB'}
119
+ [INFO|trainer.py:175] 2025-01-02 03:16:32,314 >> {'loss': 0.489, 'grad_norm': 12.577048301696777, 'learning_rate': 8.528645883839956e-07, 'epoch': 0.030648223487930566, 'num_input_tokens_seen': 2334130176, 'completed': '30.19% (1_113 / 3_687)', 'remaining time': '21:37:12', 'throughput': '8476.44', 'gpu_mem_free': '5581MB'}
120
+ [INFO|trainer.py:175] 2025-01-02 03:17:02,970 >> {'loss': 0.5687, 'grad_norm': 16.97111701965332, 'learning_rate': 8.525658933931448e-07, 'epoch': 0.030919446704637917, 'num_input_tokens_seen': 2336227328, 'completed': '30.21% (1_114 / 3_687)', 'remaining time': '21:36:51', 'throughput': '8551.13', 'gpu_mem_free': '5581MB'}
121
+ [INFO|trainer.py:175] 2025-01-02 03:17:29,609 >> {'loss': 0.556, 'grad_norm': 15.8145112991333, 'learning_rate': 8.522669549087762e-07, 'epoch': 0.031190669921345268, 'num_input_tokens_seen': 2338324480, 'completed': '30.24% (1_115 / 3_687)', 'remaining time': '21:35:00', 'throughput': '9840.45', 'gpu_mem_free': '5581MB'}
122
+ [INFO|trainer.py:175] 2025-01-02 03:17:59,834 >> {'loss': 0.7402, 'grad_norm': 16.058319091796875, 'learning_rate': 8.519677731714645e-07, 'epoch': 0.031461893138052616, 'num_input_tokens_seen': 2340421632, 'completed': '30.27% (1_116 / 3_687)', 'remaining time': '21:34:30', 'throughput': '8673.12', 'gpu_mem_free': '5581MB'}
123
+ [INFO|trainer.py:175] 2025-01-02 03:18:29,249 >> {'loss': 0.4213, 'grad_norm': 11.520198822021484, 'learning_rate': 8.516683484219797e-07, 'epoch': 0.03173311635475997, 'num_input_tokens_seen': 2342518784, 'completed': '30.30% (1_117 / 3_687)', 'remaining time': '21:33:42', 'throughput': '8911.81', 'gpu_mem_free': '5581MB'}
124
+ [INFO|trainer.py:175] 2025-01-02 03:18:59,584 >> {'loss': 0.5229, 'grad_norm': 18.839998245239258, 'learning_rate': 8.513686809012875e-07, 'epoch': 0.03200433957146732, 'num_input_tokens_seen': 2344615936, 'completed': '30.32% (1_118 / 3_687)', 'remaining time': '21:33:15', 'throughput': '8641.83', 'gpu_mem_free': '5581MB'}
125
+ [INFO|trainer.py:175] 2025-01-02 03:19:28,717 >> {'loss': 0.5792, 'grad_norm': 16.22575569152832, 'learning_rate': 8.510687708505489e-07, 'epoch': 0.032275562788174665, 'num_input_tokens_seen': 2346713088, 'completed': '30.35% (1_119 / 3_687)', 'remaining time': '21:32:22', 'throughput': '8998.20', 'gpu_mem_free': '5581MB'}
126
+ [INFO|trainer.py:175] 2025-01-02 03:19:59,019 >> {'loss': 0.5386, 'grad_norm': 15.674661636352539, 'learning_rate': 8.507686185111199e-07, 'epoch': 0.03254678600488202, 'num_input_tokens_seen': 2348810240, 'completed': '30.38% (1_120 / 3_687)', 'remaining time': '21:31:54', 'throughput': '8651.08', 'gpu_mem_free': '5581MB'}
127
+ [INFO|trainer.py:175] 2025-01-02 03:20:28,089 >> {'loss': 0.3716, 'grad_norm': 18.9416446685791, 'learning_rate': 8.504682241245516e-07, 'epoch': 0.03281800922158937, 'num_input_tokens_seen': 2350907392, 'completed': '30.40% (1_121 / 3_687)', 'remaining time': '21:31:00', 'throughput': '9017.52', 'gpu_mem_free': '5581MB'}
128
+ [INFO|trainer.py:175] 2025-01-02 03:20:58,384 >> {'loss': 0.2198, 'grad_norm': 10.184718132019043, 'learning_rate': 8.501675879325906e-07, 'epoch': 0.03308923243829672, 'num_input_tokens_seen': 2353004544, 'completed': '30.43% (1_122 / 3_687)', 'remaining time': '21:30:32', 'throughput': '8653.08', 'gpu_mem_free': '5581MB'}
129
+ [INFO|trainer.py:175] 2025-01-02 03:21:29,450 >> {'loss': 0.4381, 'grad_norm': 12.775616645812988, 'learning_rate': 8.498667101771769e-07, 'epoch': 0.03336045565500407, 'num_input_tokens_seen': 2355101696, 'completed': '30.46% (1_123 / 3_687)', 'remaining time': '21:30:20', 'throughput': '8438.43', 'gpu_mem_free': '5581MB'}
130
+ [INFO|trainer.py:175] 2025-01-02 03:22:02,910 >> {'loss': 0.6905, 'grad_norm': 19.200225830078125, 'learning_rate': 8.495655911004456e-07, 'epoch': 0.033631678871711417, 'num_input_tokens_seen': 2357198848, 'completed': '30.49% (1_124 / 3_687)', 'remaining time': '21:30:57', 'throughput': '7834.53', 'gpu_mem_free': '5581MB'}
131
+ [INFO|trainer.py:175] 2025-01-02 03:22:33,511 >> {'loss': 0.551, 'grad_norm': 16.312833786010742, 'learning_rate': 8.492642309447257e-07, 'epoch': 0.03390290208841877, 'num_input_tokens_seen': 2359296000, 'completed': '30.51% (1_125 / 3_687)', 'remaining time': '21:30:35', 'throughput': '8566.45', 'gpu_mem_free': '5581MB'}
132
+ [INFO|trainer.py:175] 2025-01-02 03:23:03,300 >> {'loss': 0.3892, 'grad_norm': 12.200955390930176, 'learning_rate': 8.489626299525409e-07, 'epoch': 0.03417412530512612, 'num_input_tokens_seen': 2361393152, 'completed': '30.54% (1_126 / 3_687)', 'remaining time': '21:29:56', 'throughput': '8799.94', 'gpu_mem_free': '5581MB'}
133
+ [INFO|trainer.py:175] 2025-01-02 03:23:32,934 >> {'loss': 0.4058, 'grad_norm': 12.41970157623291, 'learning_rate': 8.486607883666077e-07, 'epoch': 0.034445348521833466, 'num_input_tokens_seen': 2363490304, 'completed': '30.57% (1_127 / 3_687)', 'remaining time': '21:29:14', 'throughput': '8845.95', 'gpu_mem_free': '5581MB'}
134
+ [INFO|trainer.py:175] 2025-01-02 03:24:03,941 >> {'loss': 0.5063, 'grad_norm': 17.284793853759766, 'learning_rate': 8.483587064298372e-07, 'epoch': 0.03471657173854082, 'num_input_tokens_seen': 2365587456, 'completed': '30.59% (1_128 / 3_687)', 'remaining time': '21:28:59', 'throughput': '8454.38', 'gpu_mem_free': '5581MB'}
135
+ [INFO|trainer.py:175] 2025-01-02 03:24:36,817 >> {'loss': 0.3901, 'grad_norm': 12.992026329040527, 'learning_rate': 8.480563843853328e-07, 'epoch': 0.03498779495524817, 'num_input_tokens_seen': 2367684608, 'completed': '30.62% (1_129 / 3_687)', 'remaining time': '21:29:22', 'throughput': '7973.69', 'gpu_mem_free': '5581MB'}
136
+ [INFO|trainer.py:175] 2025-01-02 03:25:07,653 >> {'loss': 0.6989, 'grad_norm': 25.536312103271484, 'learning_rate': 8.477538224763923e-07, 'epoch': 0.03525901817195552, 'num_input_tokens_seen': 2369781760, 'completed': '30.65% (1_130 / 3_687)', 'remaining time': '21:29:03', 'throughput': '8501.30', 'gpu_mem_free': '5581MB'}
137
+ [INFO|trainer.py:175] 2025-01-02 03:25:38,462 >> {'loss': 0.4448, 'grad_norm': 14.612037658691406, 'learning_rate': 8.474510209465058e-07, 'epoch': 0.03553024138866287, 'num_input_tokens_seen': 2371878912, 'completed': '30.68% (1_131 / 3_687)', 'remaining time': '21:28:44', 'throughput': '8508.80', 'gpu_mem_free': '5581MB'}
138
+ [INFO|trainer.py:175] 2025-01-02 03:26:09,146 >> {'loss': 0.5952, 'grad_norm': 17.197641372680664, 'learning_rate': 8.471479800393565e-07, 'epoch': 0.03580146460537022, 'num_input_tokens_seen': 2373976064, 'completed': '30.70% (1_132 / 3_687)', 'remaining time': '21:28:22', 'throughput': '8543.34', 'gpu_mem_free': '5581MB'}
139
+ [INFO|trainer.py:175] 2025-01-02 03:26:39,555 >> {'loss': 0.5679, 'grad_norm': 15.149774551391602, 'learning_rate': 8.468446999988202e-07, 'epoch': 0.03607268782207757, 'num_input_tokens_seen': 2376073216, 'completed': '30.73% (1_133 / 3_687)', 'remaining time': '21:27:55', 'throughput': '8620.49', 'gpu_mem_free': '5581MB'}
140
+ [INFO|trainer.py:175] 2025-01-02 03:27:05,122 >> {'loss': 0.9363, 'grad_norm': 20.161657333374023, 'learning_rate': 8.465411810689653e-07, 'epoch': 0.03634391103878492, 'num_input_tokens_seen': 2378170368, 'completed': '30.76% (1_134 / 3_687)', 'remaining time': '21:25:55', 'throughput': '10253.43', 'gpu_mem_free': '5581MB'}
141
+ [INFO|trainer.py:175] 2025-01-02 03:27:35,906 >> {'loss': 0.4595, 'grad_norm': 13.918910026550293, 'learning_rate': 8.462374234940517e-07, 'epoch': 0.03661513425549227, 'num_input_tokens_seen': 2380267520, 'completed': '30.78% (1_135 / 3_687)', 'remaining time': '21:25:35', 'throughput': '8515.53', 'gpu_mem_free': '5581MB'}
142
+ [INFO|trainer.py:175] 2025-01-02 03:28:05,325 >> {'loss': 0.3769, 'grad_norm': 11.941801071166992, 'learning_rate': 8.459334275185325e-07, 'epoch': 0.03688635747219962, 'num_input_tokens_seen': 2382364672, 'completed': '30.81% (1_136 / 3_687)', 'remaining time': '21:24:50', 'throughput': '8910.76', 'gpu_mem_free': '5581MB'}
143
+ [INFO|trainer.py:175] 2025-01-02 03:28:33,468 >> {'loss': 1.0131, 'grad_norm': 23.785158157348633, 'learning_rate': 8.456291933870521e-07, 'epoch': 0.03715758068890697, 'num_input_tokens_seen': 2384461824, 'completed': '30.84% (1_137 / 3_687)', 'remaining time': '21:23:41', 'throughput': '9314.52', 'gpu_mem_free': '5581MB'}
144
+ [INFO|trainer.py:175] 2025-01-02 03:29:04,895 >> {'loss': 0.2438, 'grad_norm': 12.919363021850586, 'learning_rate': 8.453247213444463e-07, 'epoch': 0.03742880390561432, 'num_input_tokens_seen': 2386558976, 'completed': '30.87% (1_138 / 3_687)', 'remaining time': '21:23:33', 'throughput': '8341.47', 'gpu_mem_free': '5581MB'}
145
+ [INFO|trainer.py:175] 2025-01-02 03:29:35,415 >> {'loss': 0.4749, 'grad_norm': 14.640851020812988, 'learning_rate': 8.450200116357428e-07, 'epoch': 0.03770002712232167, 'num_input_tokens_seen': 2388656128, 'completed': '30.89% (1_139 / 3_687)', 'remaining time': '21:23:09', 'throughput': '8589.20', 'gpu_mem_free': '5581MB'}
146
+ [INFO|trainer.py:175] 2025-01-02 03:30:07,406 >> {'loss': 0.3616, 'grad_norm': 10.543968200683594, 'learning_rate': 8.4471506450616e-07, 'epoch': 0.03797125033902902, 'num_input_tokens_seen': 2390753280, 'completed': '30.92% (1_140 / 3_687)', 'remaining time': '21:23:11', 'throughput': '8194.45', 'gpu_mem_free': '5581MB'}
147
+ [INFO|trainer.py:175] 2025-01-02 03:30:36,514 >> {'loss': 0.4972, 'grad_norm': 15.393962860107422, 'learning_rate': 8.444098802011083e-07, 'epoch': 0.03824247355573637, 'num_input_tokens_seen': 2392850432, 'completed': '30.95% (1_141 / 3_687)', 'remaining time': '21:22:21', 'throughput': '9005.76', 'gpu_mem_free': '5581MB'}
148
+ [INFO|trainer.py:175] 2025-01-02 03:31:06,857 >> {'loss': 0.5945, 'grad_norm': 79.32931518554688, 'learning_rate': 8.441044589661881e-07, 'epoch': 0.03851369677244372, 'num_input_tokens_seen': 2394947584, 'completed': '30.97% (1_142 / 3_687)', 'remaining time': '21:21:52', 'throughput': '8639.46', 'gpu_mem_free': '5581MB'}
149
+ [INFO|trainer.py:175] 2025-01-02 03:31:36,973 >> {'loss': 0.5593, 'grad_norm': 13.988775253295898, 'learning_rate': 8.437988010471907e-07, 'epoch': 0.038784919989151075, 'num_input_tokens_seen': 2397044736, 'completed': '31.00% (1_143 / 3_687)', 'remaining time': '21:21:20', 'throughput': '8704.44', 'gpu_mem_free': '5581MB'}
150
+ [INFO|trainer.py:175] 2025-01-02 03:32:06,915 >> {'loss': 0.5196, 'grad_norm': 14.631237983703613, 'learning_rate': 8.434929066900982e-07, 'epoch': 0.03905614320585842, 'num_input_tokens_seen': 2399141888, 'completed': '31.03% (1_144 / 3_687)', 'remaining time': '21:20:45', 'throughput': '8755.03', 'gpu_mem_free': '5581MB'}
151
+ [INFO|trainer.py:175] 2025-01-02 03:32:34,404 >> {'loss': 0.8291, 'grad_norm': 23.943798065185547, 'learning_rate': 8.431867761410826e-07, 'epoch': 0.03932736642256577, 'num_input_tokens_seen': 2401239040, 'completed': '31.06% (1_145 / 3_687)', 'remaining time': '21:19:27', 'throughput': '9536.41', 'gpu_mem_free': '5581MB'}
152
+ [INFO|trainer.py:175] 2025-01-02 03:33:03,670 >> {'loss': 0.4124, 'grad_norm': 13.034646034240723, 'learning_rate': 8.42880409646506e-07, 'epoch': 0.039598589639273124, 'num_input_tokens_seen': 2403336192, 'completed': '31.08% (1_146 / 3_687)', 'remaining time': '21:18:41', 'throughput': '8957.03', 'gpu_mem_free': '5581MB'}
153
+ [INFO|trainer.py:175] 2025-01-02 03:33:33,571 >> {'loss': 0.4623, 'grad_norm': 16.411720275878906, 'learning_rate': 8.42573807452921e-07, 'epoch': 0.03986981285598047, 'num_input_tokens_seen': 2405433344, 'completed': '31.11% (1_147 / 3_687)', 'remaining time': '21:18:05', 'throughput': '8767.35', 'gpu_mem_free': '5581MB'}
154
+ [INFO|trainer.py:175] 2025-01-02 03:34:02,372 >> {'loss': 0.5727, 'grad_norm': 18.51169204711914, 'learning_rate': 8.422669698070687e-07, 'epoch': 0.04014103607268782, 'num_input_tokens_seen': 2407530496, 'completed': '31.14% (1_148 / 3_687)', 'remaining time': '21:17:11', 'throughput': '9101.85', 'gpu_mem_free': '5581MB'}
155
+ [INFO|trainer.py:175] 2025-01-02 03:34:31,786 >> {'loss': 0.2515, 'grad_norm': 10.572891235351562, 'learning_rate': 8.419598969558808e-07, 'epoch': 0.040412259289395173, 'num_input_tokens_seen': 2409627648, 'completed': '31.16% (1_149 / 3_687)', 'remaining time': '21:16:28', 'throughput': '8912.08', 'gpu_mem_free': '5581MB'}
156
+ [INFO|trainer.py:175] 2025-01-02 03:35:02,943 >> {'loss': 0.3125, 'grad_norm': 10.220544815063477, 'learning_rate': 8.416525891464776e-07, 'epoch': 0.04068348250610252, 'num_input_tokens_seen': 2411724800, 'completed': '31.19% (1_150 / 3_687)', 'remaining time': '21:16:15', 'throughput': '8413.78', 'gpu_mem_free': '5581MB'}
157
+ [INFO|trainer.py:175] 2025-01-02 03:35:35,053 >> {'loss': 0.429, 'grad_norm': 14.300618171691895, 'learning_rate': 8.413450466261691e-07, 'epoch': 0.040954705722809875, 'num_input_tokens_seen': 2413821952, 'completed': '31.22% (1_151 / 3_687)', 'remaining time': '21:16:17', 'throughput': '8163.95', 'gpu_mem_free': '5581MB'}
158
+ [INFO|trainer.py:175] 2025-01-02 03:36:04,990 >> {'loss': 0.3731, 'grad_norm': 14.614937782287598, 'learning_rate': 8.410372696424535e-07, 'epoch': 0.04122592893951722, 'num_input_tokens_seen': 2415919104, 'completed': '31.24% (1_152 / 3_687)', 'remaining time': '21:15:42', 'throughput': '8756.49', 'gpu_mem_free': '5581MB'}
159
+ [INFO|trainer.py:175] 2025-01-02 03:36:35,431 >> {'loss': 0.4024, 'grad_norm': 12.267914772033691, 'learning_rate': 8.40729258443018e-07, 'epoch': 0.04149715215622457, 'num_input_tokens_seen': 2418016256, 'completed': '31.27% (1_153 / 3_687)', 'remaining time': '21:15:16', 'throughput': '8611.58', 'gpu_mem_free': '5581MB'}
160
+ [INFO|trainer.py:175] 2025-01-02 03:37:04,253 >> {'loss': 0.3267, 'grad_norm': 12.720690727233887, 'learning_rate': 8.404210132757385e-07, 'epoch': 0.041768375372931925, 'num_input_tokens_seen': 2420113408, 'completed': '31.30% (1_154 / 3_687)', 'remaining time': '21:14:23', 'throughput': '9095.26', 'gpu_mem_free': '5581MB'}
161
+ [INFO|trainer.py:175] 2025-01-02 03:37:33,986 >> {'loss': 0.5473, 'grad_norm': 15.737774848937988, 'learning_rate': 8.401125343886787e-07, 'epoch': 0.04203959858963927, 'num_input_tokens_seen': 2422210560, 'completed': '31.33% (1_155 / 3_687)', 'remaining time': '21:13:46', 'throughput': '8816.37', 'gpu_mem_free': '5581MB'}
162
+ [INFO|trainer.py:175] 2025-01-02 03:38:04,141 >> {'loss': 0.3099, 'grad_norm': 9.714899063110352, 'learning_rate': 8.398038220300908e-07, 'epoch': 0.04231082180634662, 'num_input_tokens_seen': 2424307712, 'completed': '31.35% (1_156 / 3_687)', 'remaining time': '21:13:15', 'throughput': '8693.38', 'gpu_mem_free': '5581MB'}
163
+ [INFO|trainer.py:175] 2025-01-02 03:38:29,843 >> {'loss': 0.7204, 'grad_norm': 18.72726058959961, 'learning_rate': 8.39494876448415e-07, 'epoch': 0.042582045023053974, 'num_input_tokens_seen': 2426404864, 'completed': '31.38% (1_157 / 3_687)', 'remaining time': '21:11:33', 'throughput': '10199.13', 'gpu_mem_free': '5581MB'}
164
+ [INFO|trainer.py:175] 2025-01-02 03:38:59,084 >> {'loss': 0.8302, 'grad_norm': 19.449329376220703, 'learning_rate': 8.391856978922785e-07, 'epoch': 0.04285326823976132, 'num_input_tokens_seen': 2428502016, 'completed': '31.41% (1_158 / 3_687)', 'remaining time': '21:10:48', 'throughput': '8964.92', 'gpu_mem_free': '5581MB'}
165
+ [INFO|trainer.py:175] 2025-01-02 03:39:29,631 >> {'loss': 0.2353, 'grad_norm': 12.25051212310791, 'learning_rate': 8.38876286610497e-07, 'epoch': 0.043124491456468676, 'num_input_tokens_seen': 2430599168, 'completed': '31.43% (1_159 / 3_687)', 'remaining time': '21:10:24', 'throughput': '8581.93', 'gpu_mem_free': '5581MB'}
166
+ [INFO|trainer.py:175] 2025-01-02 03:39:59,479 >> {'loss': 0.6826, 'grad_norm': 17.40892791748047, 'learning_rate': 8.385666428520723e-07, 'epoch': 0.043395714673176024, 'num_input_tokens_seen': 2432696320, 'completed': '31.46% (1_160 / 3_687)', 'remaining time': '21:09:49', 'throughput': '8782.55', 'gpu_mem_free': '5581MB'}
167
+ [INFO|trainer.py:175] 2025-01-02 03:40:30,946 >> {'loss': 0.694, 'grad_norm': 20.962051391601562, 'learning_rate': 8.382567668661943e-07, 'epoch': 0.04366693788988337, 'num_input_tokens_seen': 2434793472, 'completed': '31.49% (1_161 / 3_687)', 'remaining time': '21:09:40', 'throughput': '8330.68', 'gpu_mem_free': '5581MB'}
168
+ [INFO|trainer.py:175] 2025-01-02 03:41:00,777 >> {'loss': 0.6455, 'grad_norm': 17.564403533935547, 'learning_rate': 8.379466589022393e-07, 'epoch': 0.043938161106590726, 'num_input_tokens_seen': 2436890624, 'completed': '31.52% (1_162 / 3_687)', 'remaining time': '21:09:04', 'throughput': '8787.74', 'gpu_mem_free': '5581MB'}
169
+ [INFO|trainer.py:175] 2025-01-02 03:41:30,832 >> {'loss': 0.5219, 'grad_norm': 16.586353302001953, 'learning_rate': 8.376363192097703e-07, 'epoch': 0.04420938432329807, 'num_input_tokens_seen': 2438987776, 'completed': '31.54% (1_163 / 3_687)', 'remaining time': '21:08:33', 'throughput': '8722.10', 'gpu_mem_free': '5581MB'}
170
+ [INFO|trainer.py:175] 2025-01-02 03:42:02,404 >> {'loss': 0.6066, 'grad_norm': 20.66847801208496, 'learning_rate': 8.37325748038537e-07, 'epoch': 0.04448060754000543, 'num_input_tokens_seen': 2441084928, 'completed': '31.57% (1_164 / 3_687)', 'remaining time': '21:08:24', 'throughput': '8303.05', 'gpu_mem_free': '5581MB'}
171
+ [INFO|trainer.py:175] 2025-01-02 03:42:31,402 >> {'loss': 0.4234, 'grad_norm': 17.191225051879883, 'learning_rate': 8.370149456384754e-07, 'epoch': 0.044751830756712775, 'num_input_tokens_seen': 2443182080, 'completed': '31.60% (1_165 / 3_687)', 'remaining time': '21:07:36', 'throughput': '9040.14', 'gpu_mem_free': '5581MB'}
172
+ [INFO|trainer.py:175] 2025-01-02 03:43:02,305 >> {'loss': 0.3087, 'grad_norm': 13.013045310974121, 'learning_rate': 8.36703912259707e-07, 'epoch': 0.04502305397342012, 'num_input_tokens_seen': 2445279232, 'completed': '31.62% (1_166 / 3_687)', 'remaining time': '21:07:17', 'throughput': '8482.82', 'gpu_mem_free': '5581MB'}
173
+ [INFO|trainer.py:175] 2025-01-02 03:43:31,344 >> {'loss': 0.7541, 'grad_norm': 22.083459854125977, 'learning_rate': 8.363926481525402e-07, 'epoch': 0.04529427719012748, 'num_input_tokens_seen': 2447376384, 'completed': '31.65% (1_167 / 3_687)', 'remaining time': '21:06:30', 'throughput': '9027.19', 'gpu_mem_free': '5581MB'}
174
+ [INFO|trainer.py:175] 2025-01-02 03:44:01,312 >> {'loss': 0.4852, 'grad_norm': 14.312165260314941, 'learning_rate': 8.360811535674682e-07, 'epoch': 0.045565500406834825, 'num_input_tokens_seen': 2449473536, 'completed': '31.68% (1_168 / 3_687)', 'remaining time': '21:05:57', 'throughput': '8747.58', 'gpu_mem_free': '5581MB'}
175
+ [INFO|trainer.py:175] 2025-01-02 03:44:32,103 >> {'loss': 0.7828, 'grad_norm': 20.598003387451172, 'learning_rate': 8.357694287551698e-07, 'epoch': 0.04583672362354217, 'num_input_tokens_seen': 2451570688, 'completed': '31.71% (1_169 / 3_687)', 'remaining time': '21:05:37', 'throughput': '8513.53', 'gpu_mem_free': '5581MB'}
176
+ [INFO|trainer.py:175] 2025-01-02 03:45:03,402 >> {'loss': 0.4868, 'grad_norm': 19.325916290283203, 'learning_rate': 8.354574739665096e-07, 'epoch': 0.04610794684024953, 'num_input_tokens_seen': 2453667840, 'completed': '31.73% (1_170 / 3_687)', 'remaining time': '21:05:23', 'throughput': '8375.58', 'gpu_mem_free': '5581MB'}
177
+ [INFO|trainer.py:175] 2025-01-02 03:45:37,007 >> {'loss': 0.3321, 'grad_norm': 12.34057903289795, 'learning_rate': 8.351452894525368e-07, 'epoch': 0.046379170056956874, 'num_input_tokens_seen': 2455764992, 'completed': '31.76% (1_171 / 3_687)', 'remaining time': '21:05:44', 'throughput': '7800.62', 'gpu_mem_free': '5581MB'}
178
+ [INFO|trainer.py:175] 2025-01-02 03:46:06,669 >> {'loss': 0.4123, 'grad_norm': 12.274550437927246, 'learning_rate': 8.348328754644855e-07, 'epoch': 0.04665039327366423, 'num_input_tokens_seen': 2457862144, 'completed': '31.79% (1_172 / 3_687)', 'remaining time': '21:05:06', 'throughput': '8837.83', 'gpu_mem_free': '5581MB'}
179
+ [INFO|trainer.py:175] 2025-01-02 03:46:35,978 >> {'loss': 0.5802, 'grad_norm': 14.192599296569824, 'learning_rate': 8.34520232253775e-07, 'epoch': 0.046921616490371576, 'num_input_tokens_seen': 2459959296, 'completed': '31.81% (1_173 / 3_687)', 'remaining time': '21:04:23', 'throughput': '8944.03', 'gpu_mem_free': '5581MB'}
180
+ [INFO|trainer.py:175] 2025-01-02 03:47:07,034 >> {'loss': 0.6071, 'grad_norm': 20.330703735351562, 'learning_rate': 8.342073600720082e-07, 'epoch': 0.047192839707078924, 'num_input_tokens_seen': 2462056448, 'completed': '31.84% (1_174 / 3_687)', 'remaining time': '21:04:06', 'throughput': '8441.11', 'gpu_mem_free': '5581MB'}
181
+ [INFO|trainer.py:175] 2025-01-02 03:47:43,424 >> {'loss': 0.4686, 'grad_norm': 16.746747970581055, 'learning_rate': 8.33894259170973e-07, 'epoch': 0.04746406292378628, 'num_input_tokens_seen': 2464153600, 'completed': '31.87% (1_175 / 3_687)', 'remaining time': '21:05:05', 'throughput': '7203.77', 'gpu_mem_free': '5581MB'}
182
+ [INFO|trainer.py:175] 2025-01-02 03:48:15,167 >> {'loss': 0.329, 'grad_norm': 13.190831184387207, 'learning_rate': 8.335809298026409e-07, 'epoch': 0.047735286140493625, 'num_input_tokens_seen': 2466250752, 'completed': '31.90% (1_176 / 3_687)', 'remaining time': '21:04:56', 'throughput': '8258.36', 'gpu_mem_free': '5581MB'}
183
+ [INFO|trainer.py:175] 2025-01-02 03:48:47,268 >> {'loss': 0.396, 'grad_norm': 14.298337936401367, 'learning_rate': 8.332673722191677e-07, 'epoch': 0.04800650935720097, 'num_input_tokens_seen': 2468347904, 'completed': '31.92% (1_177 / 3_687)', 'remaining time': '21:04:53', 'throughput': '8166.12', 'gpu_mem_free': '5581MB'}
184
+ [INFO|trainer.py:175] 2025-01-02 03:49:17,731 >> {'loss': 0.4024, 'grad_norm': 14.075898170471191, 'learning_rate': 8.329535866728922e-07, 'epoch': 0.04827773257390833, 'num_input_tokens_seen': 2470445056, 'completed': '31.95% (1_178 / 3_687)', 'remaining time': '21:04:26', 'throughput': '8605.22', 'gpu_mem_free': '5581MB'}
185
+ [INFO|trainer.py:175] 2025-01-02 03:49:49,409 >> {'loss': 0.6025, 'grad_norm': 15.5481595993042, 'learning_rate': 8.326395734163375e-07, 'epoch': 0.048548955790615675, 'num_input_tokens_seen': 2472542208, 'completed': '31.98% (1_179 / 3_687)', 'remaining time': '21:04:15', 'throughput': '8275.43', 'gpu_mem_free': '5581MB'}
186
+ [INFO|trainer.py:175] 2025-01-02 03:50:24,915 >> {'loss': 0.3926, 'grad_norm': 12.739563941955566, 'learning_rate': 8.323253327022094e-07, 'epoch': 0.04882017900732303, 'num_input_tokens_seen': 2474639360, 'completed': '32.00% (1_180 / 3_687)', 'remaining time': '21:04:59', 'throughput': '7383.12', 'gpu_mem_free': '5581MB'}
187
+ [INFO|trainer.py:175] 2025-01-02 03:50:58,557 >> {'loss': 0.3176, 'grad_norm': 10.208687782287598, 'learning_rate': 8.320108647833967e-07, 'epoch': 0.04909140222403038, 'num_input_tokens_seen': 2476736512, 'completed': '32.03% (1_181 / 3_687)', 'remaining time': '21:05:15', 'throughput': '7792.19', 'gpu_mem_free': '5581MB'}
188
+ [INFO|trainer.py:175] 2025-01-02 03:51:29,217 >> {'loss': 0.247, 'grad_norm': 9.83712387084961, 'learning_rate': 8.316961699129714e-07, 'epoch': 0.049362625440737724, 'num_input_tokens_seen': 2478833664, 'completed': '32.06% (1_182 / 3_687)', 'remaining time': '21:04:50', 'throughput': '8549.83', 'gpu_mem_free': '5581MB'}
189
+ [INFO|trainer.py:175] 2025-01-02 03:52:01,476 >> {'loss': 0.7097, 'grad_norm': 19.233394622802734, 'learning_rate': 8.313812483441879e-07, 'epoch': 0.04963384865744508, 'num_input_tokens_seen': 2480930816, 'completed': '32.09% (1_183 / 3_687)', 'remaining time': '21:04:46', 'throughput': '8126.22', 'gpu_mem_free': '5581MB'}
190
+ [INFO|trainer.py:175] 2025-01-02 03:52:30,419 >> {'loss': 0.33, 'grad_norm': 12.595343589782715, 'learning_rate': 8.310661003304829e-07, 'epoch': 0.049905071874152426, 'num_input_tokens_seen': 2483027968, 'completed': '32.11% (1_184 / 3_687)', 'remaining time': '21:03:57', 'throughput': '9057.21', 'gpu_mem_free': '5581MB'}
191
+ [INFO|trainer.py:175] 2025-01-02 03:52:59,321 >> {'loss': 0.4683, 'grad_norm': 14.636686325073242, 'learning_rate': 8.30750726125476e-07, 'epoch': 0.05017629509085978, 'num_input_tokens_seen': 2485125120, 'completed': '32.14% (1_185 / 3_687)', 'remaining time': '21:03:08', 'throughput': '9070.10', 'gpu_mem_free': '5581MB'}
192
+ [INFO|trainer.py:175] 2025-01-02 03:53:27,887 >> {'loss': 0.4655, 'grad_norm': 13.301069259643555, 'learning_rate': 8.304351259829678e-07, 'epoch': 0.05044751830756713, 'num_input_tokens_seen': 2487222272, 'completed': '32.17% (1_186 / 3_687)', 'remaining time': '21:02:15', 'throughput': '9177.00', 'gpu_mem_free': '5581MB'}
193
+ [INFO|trainer.py:175] 2025-01-02 03:53:56,613 >> {'loss': 0.3759, 'grad_norm': 13.354597091674805, 'learning_rate': 8.301193001569418e-07, 'epoch': 0.050718741524274476, 'num_input_tokens_seen': 2489319424, 'completed': '32.19% (1_187 / 3_687)', 'remaining time': '21:01:24', 'throughput': '9125.55', 'gpu_mem_free': '5581MB'}
194
+ [INFO|trainer.py:175] 2025-01-02 03:54:27,565 >> {'loss': 0.3448, 'grad_norm': 12.58713150024414, 'learning_rate': 8.298032489015623e-07, 'epoch': 0.05098996474098183, 'num_input_tokens_seen': 2491416576, 'completed': '32.22% (1_188 / 3_687)', 'remaining time': '21:01:02', 'throughput': '8469.51', 'gpu_mem_free': '5581MB'}
195
+ [INFO|trainer.py:175] 2025-01-02 03:54:56,377 >> {'loss': 0.586, 'grad_norm': 19.054805755615234, 'learning_rate': 8.294869724711752e-07, 'epoch': 0.05126118795768918, 'num_input_tokens_seen': 2493513728, 'completed': '32.25% (1_189 / 3_687)', 'remaining time': '21:00:13', 'throughput': '9098.19', 'gpu_mem_free': '5581MB'}
196
+ [INFO|trainer.py:175] 2025-01-02 03:55:27,239 >> {'loss': 0.3833, 'grad_norm': 11.936301231384277, 'learning_rate': 8.291704711203082e-07, 'epoch': 0.051532411174396525, 'num_input_tokens_seen': 2495610880, 'completed': '32.28% (1_190 / 3_687)', 'remaining time': '20:59:50', 'throughput': '8494.21', 'gpu_mem_free': '5581MB'}
197
+ [INFO|trainer.py:175] 2025-01-02 03:55:59,718 >> {'loss': 0.4227, 'grad_norm': 18.23284912109375, 'learning_rate': 8.288537451036691e-07, 'epoch': 0.05180363439110388, 'num_input_tokens_seen': 2497708032, 'completed': '32.30% (1_191 / 3_687)', 'remaining time': '20:59:49', 'throughput': '8071.06', 'gpu_mem_free': '5581MB'}
198
+ [INFO|trainer.py:175] 2025-01-02 03:56:31,070 >> {'loss': 0.3232, 'grad_norm': 14.435386657714844, 'learning_rate': 8.28536794676147e-07, 'epoch': 0.05207485760781123, 'num_input_tokens_seen': 2499805184, 'completed': '32.33% (1_192 / 3_687)', 'remaining time': '20:59:32', 'throughput': '8361.41', 'gpu_mem_free': '5581MB'}
199
+ [INFO|trainer.py:175] 2025-01-02 03:57:04,242 >> {'loss': 0.4033, 'grad_norm': 13.897747039794922, 'learning_rate': 8.282196200928119e-07, 'epoch': 0.05234608082451858, 'num_input_tokens_seen': 2501902336, 'completed': '32.36% (1_193 / 3_687)', 'remaining time': '20:59:39', 'throughput': '7902.56', 'gpu_mem_free': '5581MB'}
200
+ [INFO|trainer.py:175] 2025-01-02 03:57:34,381 >> {'loss': 0.3174, 'grad_norm': 11.174321174621582, 'learning_rate': 8.279022216089135e-07, 'epoch': 0.05261730404122593, 'num_input_tokens_seen': 2503999488, 'completed': '32.38% (1_194 / 3_687)', 'remaining time': '20:59:07', 'throughput': '8698.00', 'gpu_mem_free': '5581MB'}
201
+ [INFO|trainer.py:175] 2025-01-02 03:58:05,139 >> {'loss': 0.4879, 'grad_norm': 15.450333595275879, 'learning_rate': 8.275845994798821e-07, 'epoch': 0.05288852725793328, 'num_input_tokens_seen': 2506096640, 'completed': '32.41% (1_195 / 3_687)', 'remaining time': '20:58:42', 'throughput': '8522.65', 'gpu_mem_free': '5581MB'}
202
+ [INFO|trainer.py:175] 2025-01-02 03:58:34,400 >> {'loss': 0.3927, 'grad_norm': 12.975201606750488, 'learning_rate': 8.272667539613281e-07, 'epoch': 0.05315975047464063, 'num_input_tokens_seen': 2508193792, 'completed': '32.44% (1_196 / 3_687)', 'remaining time': '20:57:59', 'throughput': '8958.83', 'gpu_mem_free': '5581MB'}
203
+ [INFO|trainer.py:175] 2025-01-02 03:59:05,682 >> {'loss': 0.6859, 'grad_norm': 20.552017211914062, 'learning_rate': 8.26948685309041e-07, 'epoch': 0.05343097369134798, 'num_input_tokens_seen': 2510290944, 'completed': '32.47% (1_197 / 3_687)', 'remaining time': '20:57:41', 'throughput': '8380.07', 'gpu_mem_free': '5581MB'}
204
+ [INFO|trainer.py:175] 2025-01-02 03:59:37,398 >> {'loss': 0.4479, 'grad_norm': 14.239463806152344, 'learning_rate': 8.266303937789908e-07, 'epoch': 0.05370219690805533, 'num_input_tokens_seen': 2512388096, 'completed': '32.49% (1_198 / 3_687)', 'remaining time': '20:57:28', 'throughput': '8265.23', 'gpu_mem_free': '5581MB'}
205
+ [INFO|trainer.py:175] 2025-01-02 04:00:06,751 >> {'loss': 0.5931, 'grad_norm': 13.604122161865234, 'learning_rate': 8.263118796273263e-07, 'epoch': 0.05397342012476268, 'num_input_tokens_seen': 2514485248, 'completed': '32.52% (1_199 / 3_687)', 'remaining time': '20:56:46', 'throughput': '8930.89', 'gpu_mem_free': '5581MB'}
206
+ [INFO|trainer.py:175] 2025-01-02 04:00:38,358 >> {'loss': 0.4736, 'grad_norm': 22.870668411254883, 'learning_rate': 8.259931431103754e-07, 'epoch': 0.05424464334147003, 'num_input_tokens_seen': 2516582400, 'completed': '32.55% (1_200 / 3_687)', 'remaining time': '20:56:32', 'throughput': '8293.69', 'gpu_mem_free': '5581MB'}
207
+ /scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:689: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
208
+ warnings.warn(
209
+ [INFO|trainer.py:3503] 2025-01-02 04:01:02,359 >> Saving model checkpoint to /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/checkpoint-1200
210
+ [INFO|configuration_utils.py:472] 2025-01-02 04:01:02,365 >> Configuration saved in /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/checkpoint-1200/config.json
211
+ [INFO|configuration_utils.py:807] 2025-01-02 04:01:02,366 >> Configuration saved in /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/checkpoint-1200/generation_config.json
212
+ [INFO|modeling_utils.py:2807] 2025-01-02 04:02:02,595 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 7 checkpoint shards. You can find where each parameters has been saved in the index located at /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/checkpoint-1200/model.safetensors.index.json.
213
+ [INFO|tokenization_utils_base.py:2684] 2025-01-02 04:02:02,600 >> tokenizer config file saved in /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/checkpoint-1200/tokenizer_config.json
214
+ [INFO|tokenization_utils_base.py:2693] 2025-01-02 04:02:02,600 >> Special tokens file saved in /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/checkpoint-1200/special_tokens_map.json
215
+ /scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:689: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
216
+ warnings.warn(
217
+ [WARNING|trainer.py:868] 2025-01-02 04:05:41,083 >> Save streaming dataset state: {'epoch': 0, 'sample_in_epoch': 2400, 'num_canonical_nodes': 1, 'shuffle_seed': 42, 'initial_physical_nodes': 1}
218
+ 01/02/2025 04:05:41 - WARNING - streaming.base.dataset - Because `shuffle_block_size` was not specified, it will default to max(4_000_000 // num_canonical_nodes, 1 << 18) if num_canonical_nodes is not None, otherwise 262144. Prior to Streaming v0.7.0, `shuffle_block_size` defaulted to 262144.
219
+ /scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final/lib/python3.10/site-packages/torch/utils/checkpoint.py:1399: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
220
+ with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]
221
+ [INFO|trainer.py:175] 2025-01-02 04:06:13,247 >> {'loss': 0.5313, 'grad_norm': 15.544678688049316, 'learning_rate': 8.256741844846452e-07, 'epoch': 0.05451586655817738, 'num_input_tokens_seen': 2518679552, 'completed': '32.57% (1_201 / 3_687)', 'remaining time': '21:58:49', 'throughput': '782.78', 'gpu_mem_free': '5581MB'}
222
+ [INFO|trainer.py:175] 2025-01-02 04:06:47,174 >> {'loss': 0.3772, 'grad_norm': 12.727087020874023, 'learning_rate': 8.253550040068216e-07, 'epoch': 0.05478708977488473, 'num_input_tokens_seen': 2520776704, 'completed': '32.60% (1_202 / 3_687)', 'remaining time': '21:58:43', 'throughput': '7726.61', 'gpu_mem_free': '5581MB'}
223
+ [INFO|trainer.py:175] 2025-01-02 04:07:20,530 >> {'loss': 0.4334, 'grad_norm': 14.271738052368164, 'learning_rate': 8.250356019337688e-07, 'epoch': 0.05505831299159208, 'num_input_tokens_seen': 2522873856, 'completed': '32.63% (1_203 / 3_687)', 'remaining time': '21:58:29', 'throughput': '7859.06', 'gpu_mem_free': '5581MB'}
224
+ [INFO|trainer.py:175] 2025-01-02 04:07:48,568 >> {'loss': 0.939, 'grad_norm': 20.528383255004883, 'learning_rate': 8.247159785225295e-07, 'epoch': 0.05532953620829943, 'num_input_tokens_seen': 2524971008, 'completed': '32.66% (1_204 / 3_687)', 'remaining time': '21:57:11', 'throughput': '9349.75', 'gpu_mem_free': '5581MB'}
225
+ [INFO|trainer.py:175] 2025-01-02 04:08:16,023 >> {'loss': 0.7866, 'grad_norm': 21.59691619873047, 'learning_rate': 8.243961340303245e-07, 'epoch': 0.05560075942500678, 'num_input_tokens_seen': 2527068160, 'completed': '32.68% (1_205 / 3_687)', 'remaining time': '21:55:46', 'throughput': '9547.89', 'gpu_mem_free': '5581MB'}
226
+ [INFO|trainer.py:175] 2025-01-02 04:08:46,029 >> {'loss': 0.2154, 'grad_norm': 8.262271881103516, 'learning_rate': 8.240760687145521e-07, 'epoch': 0.055871982641714134, 'num_input_tokens_seen': 2529165312, 'completed': '32.71% (1_206 / 3_687)', 'remaining time': '21:54:53', 'throughput': '8736.56', 'gpu_mem_free': '5581MB'}
227
+ [INFO|trainer.py:175] 2025-01-02 04:09:15,965 >> {'loss': 0.6242, 'grad_norm': 16.18045425415039, 'learning_rate': 8.237557828327891e-07, 'epoch': 0.05614320585842148, 'num_input_tokens_seen': 2531262464, 'completed': '32.74% (1_207 / 3_687)', 'remaining time': '21:53:59', 'throughput': '8756.70', 'gpu_mem_free': '5581MB'}
228
+ [INFO|trainer.py:175] 2025-01-02 04:09:47,480 >> {'loss': 0.7028, 'grad_norm': 19.58281707763672, 'learning_rate': 8.234352766427894e-07, 'epoch': 0.05641442907512883, 'num_input_tokens_seen': 2533359616, 'completed': '32.76% (1_208 / 3_687)', 'remaining time': '21:53:24', 'throughput': '8318.13', 'gpu_mem_free': '5581MB'}
229
+ [INFO|trainer.py:175] 2025-01-02 04:10:16,664 >> {'loss': 0.7435, 'grad_norm': 19.143722534179688, 'learning_rate': 8.231145504024838e-07, 'epoch': 0.05668565229183618, 'num_input_tokens_seen': 2535456768, 'completed': '32.79% (1_209 / 3_687)', 'remaining time': '21:52:21', 'throughput': '8982.56', 'gpu_mem_free': '5581MB'}
230
+ [INFO|trainer.py:175] 2025-01-02 04:10:46,833 >> {'loss': 0.4029, 'grad_norm': 10.882222175598145, 'learning_rate': 8.22793604369981e-07, 'epoch': 0.05695687550854353, 'num_input_tokens_seen': 2537553920, 'completed': '32.82% (1_210 / 3_687)', 'remaining time': '21:51:30', 'throughput': '8689.04', 'gpu_mem_free': '5581MB'}
231
+ [INFO|trainer.py:175] 2025-01-02 04:11:17,357 >> {'loss': 0.3975, 'grad_norm': 13.754764556884766, 'learning_rate': 8.224724388035659e-07, 'epoch': 0.05722809872525088, 'num_input_tokens_seen': 2539651072, 'completed': '32.85% (1_211 / 3_687)', 'remaining time': '21:50:44', 'throughput': '8588.13', 'gpu_mem_free': '5581MB'}
232
+ [INFO|trainer.py:175] 2025-01-02 04:11:47,053 >> {'loss': 0.5442, 'grad_norm': 17.92790412902832, 'learning_rate': 8.221510539617003e-07, 'epoch': 0.05749932194195823, 'num_input_tokens_seen': 2541748224, 'completed': '32.87% (1_212 / 3_687)', 'remaining time': '21:49:48', 'throughput': '8827.71', 'gpu_mem_free': '5581MB'}
233
+ [INFO|trainer.py:175] 2025-01-02 04:12:17,947 >> {'loss': 0.3787, 'grad_norm': 12.144906997680664, 'learning_rate': 8.218294501030226e-07, 'epoch': 0.05777054515866558, 'num_input_tokens_seen': 2543845376, 'completed': '32.90% (1_213 / 3_687)', 'remaining time': '21:49:06', 'throughput': '8485.10', 'gpu_mem_free': '5581MB'}
234
+ [INFO|trainer.py:175] 2025-01-02 04:12:49,727 >> {'loss': 0.5476, 'grad_norm': 15.397769927978516, 'learning_rate': 8.215076274863476e-07, 'epoch': 0.058041768375372935, 'num_input_tokens_seen': 2545942528, 'completed': '32.93% (1_214 / 3_687)', 'remaining time': '21:48:35', 'throughput': '8248.90', 'gpu_mem_free': '5581MB'}
235
+ [INFO|trainer.py:175] 2025-01-02 04:13:21,159 >> {'loss': 0.4261, 'grad_norm': 13.692743301391602, 'learning_rate': 8.211855863706654e-07, 'epoch': 0.05831299159208028, 'num_input_tokens_seen': 2548039680, 'completed': '32.95% (1_215 / 3_687)', 'remaining time': '21:47:59', 'throughput': '8339.91', 'gpu_mem_free': '5581MB'}
236
+ [INFO|trainer.py:175] 2025-01-02 04:13:51,951 >> {'loss': 0.5572, 'grad_norm': 15.524927139282227, 'learning_rate': 8.208633270151426e-07, 'epoch': 0.05858421480878763, 'num_input_tokens_seen': 2550136832, 'completed': '32.98% (1_216 / 3_687)', 'remaining time': '21:47:17', 'throughput': '8513.34', 'gpu_mem_free': '5581MB'}
237
+ [INFO|trainer.py:175] 2025-01-02 04:14:22,788 >> {'loss': 0.6069, 'grad_norm': 21.096981048583984, 'learning_rate': 8.205408496791216e-07, 'epoch': 0.058855438025494984, 'num_input_tokens_seen': 2552233984, 'completed': '33.01% (1_217 / 3_687)', 'remaining time': '21:46:35', 'throughput': '8500.93', 'gpu_mem_free': '5581MB'}
238
+ [INFO|trainer.py:175] 2025-01-02 04:14:56,803 >> {'loss': 0.3785, 'grad_norm': 12.139968872070312, 'learning_rate': 8.202181546221193e-07, 'epoch': 0.05912666124220233, 'num_input_tokens_seen': 2554331136, 'completed': '33.03% (1_218 / 3_687)', 'remaining time': '21:46:29', 'throughput': '7706.67', 'gpu_mem_free': '5581MB'}
239
+ [INFO|trainer.py:175] 2025-01-02 04:15:24,489 >> {'loss': 0.7316, 'grad_norm': 18.615991592407227, 'learning_rate': 8.19895242103829e-07, 'epoch': 0.059397884458909686, 'num_input_tokens_seen': 2556428288, 'completed': '33.06% (1_219 / 3_687)', 'remaining time': '21:45:11', 'throughput': '9468.75', 'gpu_mem_free': '5581MB'}
240
+ [INFO|trainer.py:175] 2025-01-02 04:15:53,152 >> {'loss': 0.7, 'grad_norm': 20.739112854003906, 'learning_rate': 8.19572112384118e-07, 'epoch': 0.059669107675617034, 'num_input_tokens_seen': 2558525440, 'completed': '33.09% (1_220 / 3_687)', 'remaining time': '21:44:05', 'throughput': '9145.54', 'gpu_mem_free': '5581MB'}
241
+ [INFO|trainer.py:175] 2025-01-02 04:16:24,038 >> {'loss': 0.3302, 'grad_norm': 13.558570861816406, 'learning_rate': 8.192487657230288e-07, 'epoch': 0.05994033089232438, 'num_input_tokens_seen': 2560622592, 'completed': '33.12% (1_221 / 3_687)', 'remaining time': '21:43:24', 'throughput': '8487.67', 'gpu_mem_free': '5581MB'}
242
+ [INFO|trainer.py:175] 2025-01-02 04:16:57,408 >> {'loss': 0.3308, 'grad_norm': 12.751009941101074, 'learning_rate': 8.18925202380779e-07, 'epoch': 0.060211554109031735, 'num_input_tokens_seen': 2562719744, 'completed': '33.14% (1_222 / 3_687)', 'remaining time': '21:43:11', 'throughput': '7855.47', 'gpu_mem_free': '5581MB'}
243
+ [INFO|trainer.py:175] 2025-01-02 04:17:29,981 >> {'loss': 0.5112, 'grad_norm': 14.548094749450684, 'learning_rate': 8.186014226177594e-07, 'epoch': 0.06048277732573908, 'num_input_tokens_seen': 2564816896, 'completed': '33.17% (1_223 / 3_687)', 'remaining time': '21:42:48', 'throughput': '8048.06', 'gpu_mem_free': '5581MB'}
244
+ [INFO|trainer.py:175] 2025-01-02 04:17:59,436 >> {'loss': 0.5099, 'grad_norm': 13.776844024658203, 'learning_rate': 8.18277426694536e-07, 'epoch': 0.06075400054244643, 'num_input_tokens_seen': 2566914048, 'completed': '33.20% (1_224 / 3_687)', 'remaining time': '21:41:52', 'throughput': '8899.60', 'gpu_mem_free': '5581MB'}
245
+ [INFO|trainer.py:175] 2025-01-02 04:18:29,725 >> {'loss': 0.5184, 'grad_norm': 14.455738067626953, 'learning_rate': 8.179532148718483e-07, 'epoch': 0.061025223759153785, 'num_input_tokens_seen': 2569011200, 'completed': '33.22% (1_225 / 3_687)', 'remaining time': '21:41:04', 'throughput': '8654.78', 'gpu_mem_free': '5581MB'}
246
+ [INFO|trainer.py:175] 2025-01-02 04:19:00,435 >> {'loss': 0.6313, 'grad_norm': 16.7779598236084, 'learning_rate': 8.176287874106097e-07, 'epoch': 0.06129644697586113, 'num_input_tokens_seen': 2571108352, 'completed': '33.25% (1_226 / 3_687)', 'remaining time': '21:40:22', 'throughput': '8536.12', 'gpu_mem_free': '5581MB'}
247
+ [INFO|trainer.py:175] 2025-01-02 04:19:27,921 >> {'loss': 0.7062, 'grad_norm': 17.045129776000977, 'learning_rate': 8.173041445719069e-07, 'epoch': 0.06156767019256849, 'num_input_tokens_seen': 2573205504, 'completed': '33.28% (1_227 / 3_687)', 'remaining time': '21:39:04', 'throughput': '9537.51', 'gpu_mem_free': '5581MB'}
248
+ [INFO|trainer.py:175] 2025-01-02 04:19:59,897 >> {'loss': 0.8444, 'grad_norm': 23.766611099243164, 'learning_rate': 8.169792866170003e-07, 'epoch': 0.061838893409275834, 'num_input_tokens_seen': 2575302656, 'completed': '33.31% (1_228 / 3_687)', 'remaining time': '21:38:36', 'throughput': '8198.14', 'gpu_mem_free': '5581MB'}
249
+ [INFO|trainer.py:175] 2025-01-02 04:20:29,050 >> {'loss': 0.5219, 'grad_norm': 13.02985668182373, 'learning_rate': 8.166542138073232e-07, 'epoch': 0.06211011662598318, 'num_input_tokens_seen': 2577399808, 'completed': '33.33% (1_229 / 3_687)', 'remaining time': '21:37:37', 'throughput': '8991.95', 'gpu_mem_free': '5581MB'}
250
+ [INFO|trainer.py:175] 2025-01-02 04:21:01,220 >> {'loss': 0.6017, 'grad_norm': 15.076738357543945, 'learning_rate': 8.163289264044817e-07, 'epoch': 0.062381339842690536, 'num_input_tokens_seen': 2579496960, 'completed': '33.36% (1_230 / 3_687)', 'remaining time': '21:37:11', 'throughput': '8148.74', 'gpu_mem_free': '5581MB'}
251
+ [INFO|trainer.py:175] 2025-01-02 04:21:31,907 >> {'loss': 0.5749, 'grad_norm': 15.141392707824707, 'learning_rate': 8.160034246702548e-07, 'epoch': 0.06265256305939788, 'num_input_tokens_seen': 2581594112, 'completed': '33.39% (1_231 / 3_687)', 'remaining time': '21:36:28', 'throughput': '8542.41', 'gpu_mem_free': '5581MB'}
252
+ [INFO|trainer.py:175] 2025-01-02 04:22:02,871 >> {'loss': 0.4558, 'grad_norm': 13.555374145507812, 'learning_rate': 8.156777088665939e-07, 'epoch': 0.06292378627610523, 'num_input_tokens_seen': 2583691264, 'completed': '33.41% (1_232 / 3_687)', 'remaining time': '21:35:49', 'throughput': '8466.22', 'gpu_mem_free': '5581MB'}
253
+ [INFO|trainer.py:175] 2025-01-02 04:22:33,348 >> {'loss': 0.3044, 'grad_norm': 13.061405181884766, 'learning_rate': 8.153517792556226e-07, 'epoch': 0.06319500949281258, 'num_input_tokens_seen': 2585788416, 'completed': '33.44% (1_233 / 3_687)', 'remaining time': '21:35:05', 'throughput': '8601.22', 'gpu_mem_free': '5581MB'}
254
+ [INFO|trainer.py:175] 2025-01-02 04:23:01,379 >> {'loss': 0.6267, 'grad_norm': 17.51995277404785, 'learning_rate': 8.15025636099637e-07, 'epoch': 0.06346623270951994, 'num_input_tokens_seen': 2587885568, 'completed': '33.47% (1_234 / 3_687)', 'remaining time': '21:33:55', 'throughput': '9352.11', 'gpu_mem_free': '5581MB'}
255
+ [INFO|trainer.py:175] 2025-01-02 04:23:34,089 >> {'loss': 0.3617, 'grad_norm': 13.34430980682373, 'learning_rate': 8.146992796611042e-07, 'epoch': 0.06373745592622729, 'num_input_tokens_seen': 2589982720, 'completed': '33.50% (1_235 / 3_687)', 'remaining time': '21:33:35', 'throughput': '8014.23', 'gpu_mem_free': '5581MB'}
256
+ [INFO|trainer.py:175] 2025-01-02 04:24:00,555 >> {'loss': 0.8845, 'grad_norm': 18.68717384338379, 'learning_rate': 8.143727102026638e-07, 'epoch': 0.06400867914293464, 'num_input_tokens_seen': 2592079872, 'completed': '33.52% (1_236 / 3_687)', 'remaining time': '21:32:09', 'throughput': '9904.72', 'gpu_mem_free': '5581MB'}
257
+ [INFO|trainer.py:175] 2025-01-02 04:24:30,910 >> {'loss': 0.3102, 'grad_norm': 13.216950416564941, 'learning_rate': 8.140459279871264e-07, 'epoch': 0.06427990235964198, 'num_input_tokens_seen': 2594177024, 'completed': '33.55% (1_237 / 3_687)', 'remaining time': '21:31:24', 'throughput': '8636.22', 'gpu_mem_free': '5581MB'}
258
+ [INFO|trainer.py:175] 2025-01-02 04:25:00,875 >> {'loss': 0.7681, 'grad_norm': 22.348615646362305, 'learning_rate': 8.137189332774738e-07, 'epoch': 0.06455112557634933, 'num_input_tokens_seen': 2596274176, 'completed': '33.58% (1_238 / 3_687)', 'remaining time': '21:30:36', 'throughput': '8748.03', 'gpu_mem_free': '5581MB'}
259
+ [INFO|trainer.py:175] 2025-01-02 04:25:30,514 >> {'loss': 0.3414, 'grad_norm': 12.911705017089844, 'learning_rate': 8.133917263368589e-07, 'epoch': 0.06482234879305669, 'num_input_tokens_seen': 2598371328, 'completed': '33.60% (1_239 / 3_687)', 'remaining time': '21:29:44', 'throughput': '8844.61', 'gpu_mem_free': '5581MB'}
260
+ [INFO|trainer.py:175] 2025-01-02 04:26:01,069 >> {'loss': 0.4748, 'grad_norm': 19.478103637695312, 'learning_rate': 8.130643074286056e-07, 'epoch': 0.06509357200976404, 'num_input_tokens_seen': 2600468480, 'completed': '33.63% (1_240 / 3_687)', 'remaining time': '21:29:01', 'throughput': '8579.34', 'gpu_mem_free': '5581MB'}
261
+ [INFO|trainer.py:175] 2025-01-02 04:26:37,315 >> {'loss': 0.4883, 'grad_norm': 16.46529197692871, 'learning_rate': 8.127366768162077e-07, 'epoch': 0.06536479522647139, 'num_input_tokens_seen': 2602565632, 'completed': '33.66% (1_241 / 3_687)', 'remaining time': '21:29:17', 'throughput': '7232.36', 'gpu_mem_free': '5581MB'}
262
+ [INFO|trainer.py:175] 2025-01-02 04:27:09,210 >> {'loss': 0.4281, 'grad_norm': 13.854009628295898, 'learning_rate': 8.124088347633304e-07, 'epoch': 0.06563601844317873, 'num_input_tokens_seen': 2604662784, 'completed': '33.69% (1_242 / 3_687)', 'remaining time': '21:28:48', 'throughput': '8219.10', 'gpu_mem_free': '5581MB'}
263
+ [INFO|trainer.py:175] 2025-01-02 04:27:39,418 >> {'loss': 0.4065, 'grad_norm': 12.586260795593262, 'learning_rate': 8.120807815338083e-07, 'epoch': 0.06590724165988608, 'num_input_tokens_seen': 2606759936, 'completed': '33.71% (1_243 / 3_687)', 'remaining time': '21:28:02', 'throughput': '8677.89', 'gpu_mem_free': '5581MB'}
264
+ [INFO|trainer.py:175] 2025-01-02 04:28:12,754 >> {'loss': 0.3784, 'grad_norm': 12.778905868530273, 'learning_rate': 8.11752517391646e-07, 'epoch': 0.06617846487659344, 'num_input_tokens_seen': 2608857088, 'completed': '33.74% (1_244 / 3_687)', 'remaining time': '21:27:47', 'throughput': '7863.80', 'gpu_mem_free': '5581MB'}
265
+ [INFO|trainer.py:175] 2025-01-02 04:28:43,792 >> {'loss': 0.3571, 'grad_norm': 13.905147552490234, 'learning_rate': 8.114240426010183e-07, 'epoch': 0.06644968809330079, 'num_input_tokens_seen': 2610954240, 'completed': '33.77% (1_245 / 3_687)', 'remaining time': '21:27:10', 'throughput': '8445.80', 'gpu_mem_free': '5581MB'}
266
+ [INFO|trainer.py:175] 2025-01-02 04:29:13,042 >> {'loss': 0.6425, 'grad_norm': 18.284894943237305, 'learning_rate': 8.11095357426269e-07, 'epoch': 0.06672091131000814, 'num_input_tokens_seen': 2613051392, 'completed': '33.79% (1_246 / 3_687)', 'remaining time': '21:26:15', 'throughput': '8962.34', 'gpu_mem_free': '5581MB'}
267
+ [INFO|trainer.py:175] 2025-01-02 04:29:42,766 >> {'loss': 0.5305, 'grad_norm': 14.604440689086914, 'learning_rate': 8.107664621319113e-07, 'epoch': 0.06699213452671549, 'num_input_tokens_seen': 2615148544, 'completed': '33.82% (1_247 / 3_687)', 'remaining time': '21:25:24', 'throughput': '8819.15', 'gpu_mem_free': '5581MB'}
268
+ [INFO|trainer.py:175] 2025-01-02 04:30:12,192 >> {'loss': 0.4138, 'grad_norm': 14.840185165405273, 'learning_rate': 8.10437356982628e-07, 'epoch': 0.06726335774342283, 'num_input_tokens_seen': 2617245696, 'completed': '33.85% (1_248 / 3_687)', 'remaining time': '21:24:31', 'throughput': '8908.50', 'gpu_mem_free': '5581MB'}
269
+ [INFO|trainer.py:175] 2025-01-02 04:30:46,121 >> {'loss': 0.466, 'grad_norm': 13.899113655090332, 'learning_rate': 8.1010804224327e-07, 'epoch': 0.06753458096013018, 'num_input_tokens_seen': 2619342848, 'completed': '33.88% (1_249 / 3_687)', 'remaining time': '21:24:23', 'throughput': '7726.36', 'gpu_mem_free': '5581MB'}
270
+ [INFO|trainer.py:175] 2025-01-02 04:31:15,595 >> {'loss': 0.3711, 'grad_norm': 12.415678977966309, 'learning_rate': 8.097785181788574e-07, 'epoch': 0.06780580417683754, 'num_input_tokens_seen': 2621440000, 'completed': '33.90% (1_250 / 3_687)', 'remaining time': '21:23:30', 'throughput': '8894.03', 'gpu_mem_free': '5581MB'}
271
+ [INFO|trainer.py:175] 2025-01-02 04:31:46,618 >> {'loss': 0.3472, 'grad_norm': 12.70449161529541, 'learning_rate': 8.09448785054579e-07, 'epoch': 0.06807702739354489, 'num_input_tokens_seen': 2623537152, 'completed': '33.93% (1_251 / 3_687)', 'remaining time': '21:22:53', 'throughput': '8449.95', 'gpu_mem_free': '5581MB'}
272
+ [INFO|trainer.py:175] 2025-01-02 04:32:14,774 >> {'loss': 0.9622, 'grad_norm': 20.355985641479492, 'learning_rate': 8.091188431357908e-07, 'epoch': 0.06834825061025224, 'num_input_tokens_seen': 2625634304, 'completed': '33.96% (1_252 / 3_687)', 'remaining time': '21:21:48', 'throughput': '9310.32', 'gpu_mem_free': '5581MB'}
273
+ [INFO|trainer.py:175] 2025-01-02 04:32:44,245 >> {'loss': 0.91, 'grad_norm': 19.507715225219727, 'learning_rate': 8.087886926880181e-07, 'epoch': 0.06861947382695958, 'num_input_tokens_seen': 2627731456, 'completed': '33.98% (1_253 / 3_687)', 'remaining time': '21:20:56', 'throughput': '8895.08', 'gpu_mem_free': '5581MB'}
274
+ [INFO|trainer.py:175] 2025-01-02 04:33:17,264 >> {'loss': 0.4888, 'grad_norm': 13.477721214294434, 'learning_rate': 8.084583339769531e-07, 'epoch': 0.06889069704366693, 'num_input_tokens_seen': 2629828608, 'completed': '34.01% (1_254 / 3_687)', 'remaining time': '21:20:38', 'throughput': '7939.17', 'gpu_mem_free': '5581MB'}
275
+ [INFO|trainer.py:175] 2025-01-02 04:33:47,963 >> {'loss': 0.438, 'grad_norm': 15.250035285949707, 'learning_rate': 8.081277672684557e-07, 'epoch': 0.0691619202603743, 'num_input_tokens_seen': 2631925760, 'completed': '34.04% (1_255 / 3_687)', 'remaining time': '21:19:58', 'throughput': '8539.13', 'gpu_mem_free': '5581MB'}
276
+ [INFO|trainer.py:175] 2025-01-02 04:34:15,764 >> {'loss': 0.5853, 'grad_norm': 13.72596263885498, 'learning_rate': 8.077969928285541e-07, 'epoch': 0.06943314347708164, 'num_input_tokens_seen': 2634022912, 'completed': '34.07% (1_256 / 3_687)', 'remaining time': '21:18:51', 'throughput': '9429.42', 'gpu_mem_free': '5581MB'}
277
+ [INFO|trainer.py:175] 2025-01-02 04:34:46,720 >> {'loss': 0.456, 'grad_norm': 23.236064910888672, 'learning_rate': 8.074660109234424e-07, 'epoch': 0.06970436669378899, 'num_input_tokens_seen': 2636120064, 'completed': '34.09% (1_257 / 3_687)', 'remaining time': '21:18:14', 'throughput': '8468.28', 'gpu_mem_free': '5581MB'}
278
+ [INFO|trainer.py:175] 2025-01-02 04:35:16,377 >> {'loss': 0.5927, 'grad_norm': 19.698314666748047, 'learning_rate': 8.071348218194823e-07, 'epoch': 0.06997558991049634, 'num_input_tokens_seen': 2638217216, 'completed': '34.12% (1_258 / 3_687)', 'remaining time': '21:17:24', 'throughput': '8839.13', 'gpu_mem_free': '5581MB'}
279
+ [INFO|trainer.py:175] 2025-01-02 04:35:46,815 >> {'loss': 0.7272, 'grad_norm': 17.194185256958008, 'learning_rate': 8.068034257832026e-07, 'epoch': 0.07024681312720368, 'num_input_tokens_seen': 2640314368, 'completed': '34.15% (1_259 / 3_687)', 'remaining time': '21:16:42', 'throughput': '8612.51', 'gpu_mem_free': '5581MB'}
280
+ [INFO|trainer.py:175] 2025-01-02 04:36:15,182 >> {'loss': 0.4973, 'grad_norm': 14.79769229888916, 'learning_rate': 8.064718230812976e-07, 'epoch': 0.07051803634391104, 'num_input_tokens_seen': 2642411520, 'completed': '34.17% (1_260 / 3_687)', 'remaining time': '21:15:41', 'throughput': '9241.16', 'gpu_mem_free': '5581MB'}
281
+ [INFO|trainer.py:175] 2025-01-02 04:36:45,415 >> {'loss': 0.7414, 'grad_norm': 18.805007934570312, 'learning_rate': 8.06140013980629e-07, 'epoch': 0.07078925956061839, 'num_input_tokens_seen': 2644508672, 'completed': '34.20% (1_261 / 3_687)', 'remaining time': '21:14:57', 'throughput': '8670.76', 'gpu_mem_free': '5581MB'}
282
+ [INFO|trainer.py:175] 2025-01-02 04:37:16,834 >> {'loss': 0.3165, 'grad_norm': 10.652469635009766, 'learning_rate': 8.05807998748224e-07, 'epoch': 0.07106048277732574, 'num_input_tokens_seen': 2646605824, 'completed': '34.23% (1_262 / 3_687)', 'remaining time': '21:14:25', 'throughput': '8343.33', 'gpu_mem_free': '5581MB'}
283
+ [INFO|trainer.py:175] 2025-01-02 04:37:46,764 >> {'loss': 0.3287, 'grad_norm': 10.750865936279297, 'learning_rate': 8.05475777651276e-07, 'epoch': 0.07133170599403309, 'num_input_tokens_seen': 2648702976, 'completed': '34.26% (1_263 / 3_687)', 'remaining time': '21:13:38', 'throughput': '8758.68', 'gpu_mem_free': '5581MB'}
284
+ [INFO|trainer.py:175] 2025-01-02 04:38:15,609 >> {'loss': 0.6628, 'grad_norm': 17.767995834350586, 'learning_rate': 8.051433509571435e-07, 'epoch': 0.07160292921074043, 'num_input_tokens_seen': 2650800128, 'completed': '34.28% (1_264 / 3_687)', 'remaining time': '21:12:42', 'throughput': '9087.90', 'gpu_mem_free': '5581MB'}
285
+ [INFO|trainer.py:175] 2025-01-02 04:38:47,686 >> {'loss': 0.6028, 'grad_norm': 17.3983097076416, 'learning_rate': 8.04810718933351e-07, 'epoch': 0.0718741524274478, 'num_input_tokens_seen': 2652897280, 'completed': '34.31% (1_265 / 3_687)', 'remaining time': '21:12:16', 'throughput': '8172.54', 'gpu_mem_free': '5581MB'}
286
+ [INFO|trainer.py:175] 2025-01-02 04:39:17,452 >> {'loss': 0.3962, 'grad_norm': 12.742525100708008, 'learning_rate': 8.044778818475884e-07, 'epoch': 0.07214537564415514, 'num_input_tokens_seen': 2654994432, 'completed': '34.34% (1_266 / 3_687)', 'remaining time': '21:11:28', 'throughput': '8806.76', 'gpu_mem_free': '5581MB'}
287
+ [INFO|trainer.py:175] 2025-01-02 04:39:47,530 >> {'loss': 0.6197, 'grad_norm': 15.34299373626709, 'learning_rate': 8.0414483996771e-07, 'epoch': 0.07241659886086249, 'num_input_tokens_seen': 2657091584, 'completed': '34.36% (1_267 / 3_687)', 'remaining time': '21:10:44', 'throughput': '8715.37', 'gpu_mem_free': '5581MB'}
288
+ [INFO|trainer.py:175] 2025-01-02 04:40:19,517 >> {'loss': 0.8734, 'grad_norm': 18.275728225708008, 'learning_rate': 8.038115935617355e-07, 'epoch': 0.07268782207756984, 'num_input_tokens_seen': 2659188736, 'completed': '34.39% (1_268 / 3_687)', 'remaining time': '21:10:17', 'throughput': '8195.51', 'gpu_mem_free': '5581MB'}
289
+ [INFO|trainer.py:175] 2025-01-02 04:40:49,334 >> {'loss': 0.3667, 'grad_norm': 11.38978099822998, 'learning_rate': 8.034781428978484e-07, 'epoch': 0.07295904529427719, 'num_input_tokens_seen': 2661285888, 'completed': '34.42% (1_269 / 3_687)', 'remaining time': '21:09:30', 'throughput': '8791.66', 'gpu_mem_free': '5581MB'}
290
+ [INFO|trainer.py:175] 2025-01-02 04:41:19,011 >> {'loss': 0.2297, 'grad_norm': 17.417783737182617, 'learning_rate': 8.031444882443976e-07, 'epoch': 0.07323026851098453, 'num_input_tokens_seen': 2663383040, 'completed': '34.45% (1_270 / 3_687)', 'remaining time': '21:08:42', 'throughput': '8833.25', 'gpu_mem_free': '5581MB'}
291
+ [INFO|trainer.py:175] 2025-01-02 04:41:51,598 >> {'loss': 0.6265, 'grad_norm': 15.738312721252441, 'learning_rate': 8.028106298698957e-07, 'epoch': 0.0735014917276919, 'num_input_tokens_seen': 2665480192, 'completed': '34.47% (1_271 / 3_687)', 'remaining time': '21:08:20', 'throughput': '8044.26', 'gpu_mem_free': '5581MB'}
292
+ [INFO|trainer.py:175] 2025-01-02 04:42:22,177 >> {'loss': 0.3186, 'grad_norm': 14.270974159240723, 'learning_rate': 8.024765680430188e-07, 'epoch': 0.07377271494439924, 'num_input_tokens_seen': 2667577344, 'completed': '34.50% (1_272 / 3_687)', 'remaining time': '21:07:41', 'throughput': '8572.82', 'gpu_mem_free': '5581MB'}
293
+ [INFO|trainer.py:175] 2025-01-02 04:42:53,382 >> {'loss': 0.5579, 'grad_norm': 13.561492919921875, 'learning_rate': 8.021423030326075e-07, 'epoch': 0.07404393816110659, 'num_input_tokens_seen': 2669674496, 'completed': '34.53% (1_273 / 3_687)', 'remaining time': '21:07:07', 'throughput': '8400.70', 'gpu_mem_free': '5581MB'}
wandb/run-20250102_021927-pw8rud5e/files/requirements.txt ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Brotli==1.1.0
2
+ GitPython==3.1.43
3
+ Jinja2==3.1.4
4
+ MarkupSafe==3.0.2
5
+ PyJWT==2.10.0
6
+ PyNaCl==1.5.0
7
+ PyYAML==6.0.2
8
+ Pygments==2.18.0
9
+ accelerate==0.32.1
10
+ aiohappyeyeballs==2.4.3
11
+ aiohttp==3.11.2
12
+ aioprometheus==23.12.0
13
+ aiosignal==1.3.1
14
+ annotated-types==0.7.0
15
+ anthropic==0.39.0
16
+ anyio==4.6.2.post1
17
+ argcomplete==3.5.1
18
+ arrow==1.3.0
19
+ asttokens==2.4.1
20
+ async-timeout==5.0.1
21
+ attrs==24.2.0
22
+ autocommand==2.2.2
23
+ azure-core==1.32.0
24
+ azure-identity==1.19.0
25
+ azure-storage-blob==12.24.0
26
+ azure-storage-file-datalake==12.18.0
27
+ backoff==2.2.1
28
+ backports.tarfile==1.2.0
29
+ bcrypt==4.2.0
30
+ blobfile==3.0.0
31
+ boto3==1.35.63
32
+ botocore==1.35.63
33
+ cachetools==5.5.0
34
+ certifi==2024.8.30
35
+ cffi==1.17.1
36
+ charset-normalizer==3.4.0
37
+ circuitbreaker==2.0.0
38
+ click==8.1.7
39
+ cloudpickle==3.1.0
40
+ comm==0.2.2
41
+ compressed-tensors==0.8.0
42
+ contourpy==1.3.1
43
+ cramjam==2.9.0
44
+ cryptography==43.0.3
45
+ cycler==0.12.1
46
+ datasets==2.20.0
47
+ datatools==0.1
48
+ debugpy==1.8.11
49
+ decorator==5.1.1
50
+ dill==0.3.8
51
+ diskcache==5.6.3
52
+ distro==1.9.0
53
+ docker-pycreds==0.4.0
54
+ docstring_parser==0.16
55
+ einops==0.8.0
56
+ exceptiongroup==1.2.2
57
+ executing==2.1.0
58
+ fastapi==0.115.5
59
+ filelock==3.16.1
60
+ flash-attn==2.6.1
61
+ fonttools==4.55.0
62
+ frozenlist==1.5.0
63
+ fsspec==2024.5.0
64
+ gguf==0.10.0
65
+ gitdb==4.0.11
66
+ google-api-core==2.23.0
67
+ google-auth==2.36.0
68
+ google-cloud-aiplatform==1.71.1
69
+ google-cloud-bigquery==3.27.0
70
+ google-cloud-core==2.4.1
71
+ google-cloud-resource-manager==1.13.1
72
+ google-cloud-storage==2.10.0
73
+ google-crc32c==1.6.0
74
+ google-resumable-media==2.7.2
75
+ googleapis-common-protos==1.66.0
76
+ gql==3.5.0
77
+ graphql-core==3.2.5
78
+ grpc-google-iam-v1==0.13.1
79
+ grpcio-status==1.62.3
80
+ grpcio==1.68.0
81
+ h11==0.14.0
82
+ httpcore==1.0.7
83
+ httptools==0.6.4
84
+ httpx==0.27.2
85
+ huggingface-hub==0.26.2
86
+ idna==3.10
87
+ importlib_metadata==8.0.0
88
+ importlib_metadata==8.5.0
89
+ inflect==7.3.1
90
+ interegular==0.3.3
91
+ ipykernel==6.29.5
92
+ ipython==8.18.0
93
+ isodate==0.7.2
94
+ jaraco.collections==5.1.0
95
+ jaraco.context==5.3.0
96
+ jaraco.functools==4.0.1
97
+ jaraco.text==3.12.1
98
+ jedi==0.19.2
99
+ jiter==0.7.1
100
+ jmespath==1.0.1
101
+ jsonschema-specifications==2024.10.1
102
+ jsonschema==4.23.0
103
+ jupyter_client==8.6.3
104
+ jupyter_core==5.7.2
105
+ kiwisolver==1.4.7
106
+ lark==1.2.2
107
+ llvmlite==0.43.0
108
+ lm-format-enforcer==0.10.9
109
+ lxml==5.3.0
110
+ markdown-it-py==3.0.0
111
+ matplotlib-inline==0.1.7
112
+ matplotlib==3.9.2
113
+ mdurl==0.1.2
114
+ more-itertools==10.3.0
115
+ mosaicml-cli==0.5.34
116
+ mosaicml-streaming==0.8.1
117
+ mpmath==1.3.0
118
+ msal-extensions==1.2.0
119
+ msal==1.31.1
120
+ msgpack==1.1.0
121
+ msgspec==0.18.6
122
+ multidict==6.1.0
123
+ multiprocess==0.70.16
124
+ nest-asyncio==1.6.0
125
+ networkx==3.4.2
126
+ ninja==1.11.1.1
127
+ numba==0.60.0
128
+ numpy==1.26.4
129
+ nvidia-cublas-cu12==12.1.3.1
130
+ nvidia-cuda-cupti-cu12==12.1.105
131
+ nvidia-cuda-nvrtc-cu12==12.1.105
132
+ nvidia-cuda-runtime-cu12==12.1.105
133
+ nvidia-cudnn-cu12==9.1.0.70
134
+ nvidia-cufft-cu12==11.0.2.54
135
+ nvidia-curand-cu12==10.3.2.106
136
+ nvidia-cusolver-cu12==11.4.5.107
137
+ nvidia-cusparse-cu12==12.1.0.106
138
+ nvidia-ml-py==12.560.30
139
+ nvidia-nccl-cu12==2.20.5
140
+ nvidia-nvjitlink-cu12==12.4.127
141
+ nvidia-nvtx-cu12==12.1.105
142
+ oci==2.138.1
143
+ openai==1.54.5
144
+ opencv-python-headless==4.10.0.84
145
+ orjson==3.10.11
146
+ outlines==0.0.46
147
+ packaging==24.1
148
+ packaging==24.2
149
+ pandas==2.2.1
150
+ paramiko==3.5.0
151
+ parso==0.8.4
152
+ partial-json-parser==0.2.1.1.post4
153
+ pexpect==4.9.0
154
+ pillow==10.4.0
155
+ pip==24.3.1
156
+ platformdirs==4.2.2
157
+ platformdirs==4.3.6
158
+ portalocker==2.10.1
159
+ prometheus-fastapi-instrumentator==7.0.0
160
+ prometheus_client==0.21.0
161
+ prompt-toolkit==3.0.36
162
+ propcache==0.2.0
163
+ proto-plus==1.25.0
164
+ protobuf==4.25.3
165
+ psutil==6.1.0
166
+ ptyprocess==0.7.0
167
+ pure_eval==0.2.3
168
+ py-cpuinfo==9.0.0
169
+ pyOpenSSL==24.2.1
170
+ pyairports==2.1.1
171
+ pyarrow-hotfix==0.6
172
+ pyarrow==18.0.0
173
+ pyasn1==0.6.1
174
+ pyasn1_modules==0.4.1
175
+ pycountry==24.6.1
176
+ pycparser==2.22
177
+ pycryptodomex==3.21.0
178
+ pydantic==2.9.2
179
+ pydantic_core==2.23.4
180
+ pyparsing==3.2.0
181
+ python-dateutil==2.9.0
182
+ python-dotenv==1.0.1
183
+ python-snappy==0.7.3
184
+ pytz==2024.2
185
+ pyzmq==26.2.0
186
+ quantile-python==1.1
187
+ questionary==2.0.1
188
+ ray==2.39.0
189
+ referencing==0.35.1
190
+ regex==2023.12.25
191
+ requests==2.32.3
192
+ rich==13.9.4
193
+ rotary-emb==0.5.2
194
+ rpds-py==0.21.0
195
+ rsa==4.9
196
+ ruamel.yaml.clib==0.2.12
197
+ ruamel.yaml==0.18.6
198
+ s3transfer==0.10.3
199
+ safetensors==0.4.5
200
+ sentencepiece==0.1.99
201
+ sentry-sdk==2.18.0
202
+ setproctitle==1.3.4
203
+ setuptools==75.6.0
204
+ shapely==2.0.6
205
+ simple-parsing==0.1.6
206
+ six==1.16.0
207
+ smmap==5.0.1
208
+ sniffio==1.3.1
209
+ stack-data==0.6.3
210
+ starlette==0.41.3
211
+ sympy==1.13.1
212
+ tiktoken==0.7.0
213
+ tokenizers==0.19.1
214
+ tomli==2.0.1
215
+ torch==2.4.1
216
+ torchvision==0.19.1
217
+ tornado==6.4.1
218
+ tqdm==4.66.4
219
+ traitlets==5.14.3
220
+ transformers==4.44.2
221
+ triton==3.0.0
222
+ typeguard==4.3.0
223
+ types-python-dateutil==2.9.0.20241003
224
+ typing_extensions==4.12.2
225
+ typing_extensions==4.12.2
226
+ tzdata==2024.2
227
+ urllib3==2.2.3
228
+ uvicorn==0.32.0
229
+ uvloop==0.21.0
230
+ validators==0.34.0
231
+ vertexai==1.71.1
232
+ wandb==0.17.3
233
+ watchfiles==0.24.0
234
+ wcwidth==0.2.13
235
+ websockets==11.0.3
236
+ wheel==0.43.0
237
+ wheel==0.45.1
238
+ xformers==0.0.28.post1
239
+ xxhash==3.5.0
240
+ yarl==1.17.2
241
+ zipp==3.19.2
242
+ zipp==3.21.0
243
+ zstandard==0.23.0
244
+ zstd==1.5.5.1
wandb/run-20250102_021927-pw8rud5e/files/wandb-metadata.json ADDED
@@ -0,0 +1,705 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-48-generic-x86_64-with-glibc2.39",
3
+ "python": "3.10.0",
4
+ "heartbeatAt": "2025-01-02T02:19:28.293281",
5
+ "startedAt": "2025-01-02T02:19:27.822029",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--model_family",
10
+ "llama",
11
+ "--apply_instruct_masks",
12
+ "--token_scaled_loss",
13
+ "--seq_parallel_size",
14
+ "8",
15
+ "--report_to",
16
+ "wandb",
17
+ "--do_train",
18
+ "--model_name_or_path",
19
+ "/datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/",
20
+ "--config_name",
21
+ "/datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/",
22
+ "--tokenizer_name",
23
+ "/datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/",
24
+ "--run_name",
25
+ "_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_",
26
+ "--output_dir",
27
+ "/scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_",
28
+ "--config_overrides_json",
29
+ "",
30
+ "--gradient_accumulation_steps",
31
+ "2",
32
+ "--per_device_train_batch_size",
33
+ "1",
34
+ "--bf16",
35
+ "--learning_rate",
36
+ "1e-6",
37
+ "--min_lr_ratio",
38
+ "0.1",
39
+ "--lr_scheduler_type",
40
+ "cosine",
41
+ "--max_grad_norm",
42
+ "1.0",
43
+ "--adam_beta1",
44
+ "0.9",
45
+ "--adam_beta2",
46
+ "0.95",
47
+ "--weight_decay",
48
+ "0.1",
49
+ "--warmup_ratio",
50
+ "0.05",
51
+ "--optim",
52
+ "adamw_torch",
53
+ "--logging_steps",
54
+ "1",
55
+ "--log_level",
56
+ "info",
57
+ "--save_steps",
58
+ "200",
59
+ "--dataloader_num_workers",
60
+ "1",
61
+ "--disable_tqdm",
62
+ "true",
63
+ "--use_fast_tokenizer",
64
+ "false",
65
+ "--remove_unused_columns",
66
+ "false",
67
+ "--ddp_find_unused_parameters",
68
+ "false",
69
+ "--fsdp",
70
+ "auto_wrap offload",
71
+ "--gradient_checkpointing",
72
+ "--tokenized_mds_train",
73
+ "/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/data/ft/bookclaim_balanced_pack_complete",
74
+ "--cuda_empty_cache",
75
+ "--num_train_epochs",
76
+ "1"
77
+ ],
78
+ "state": "running",
79
+ "program": "/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/prolong-final/finetune.py",
80
+ "codePathLocal": "finetune.py",
81
+ "codePath": "prolong-final/finetune.py",
82
+ "git": {
83
+ "remote": "https://github.com/chtmp223/BookGen-dev.git",
84
+ "commit": "0e796521430a0f767be7c4dadba5c2fcaee1f909"
85
+ },
86
+ "email": "[email protected]",
87
+ "root": "/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev",
88
+ "host": "gpu020",
89
+ "username": "ctpham_umass_edu",
90
+ "executable": "/scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final/bin/python3.10",
91
+ "cpu_count": 112,
92
+ "cpu_count_logical": 112,
93
+ "cpu_freq": {
94
+ "current": 958.6112589285714,
95
+ "min": 800.0,
96
+ "max": 3800.0
97
+ },
98
+ "cpu_freq_per_core": [
99
+ {
100
+ "current": 784.548,
101
+ "min": 800.0,
102
+ "max": 3800.0
103
+ },
104
+ {
105
+ "current": 761.056,
106
+ "min": 800.0,
107
+ "max": 3800.0
108
+ },
109
+ {
110
+ "current": 800.0,
111
+ "min": 800.0,
112
+ "max": 3800.0
113
+ },
114
+ {
115
+ "current": 800.0,
116
+ "min": 800.0,
117
+ "max": 3800.0
118
+ },
119
+ {
120
+ "current": 800.0,
121
+ "min": 800.0,
122
+ "max": 3800.0
123
+ },
124
+ {
125
+ "current": 800.0,
126
+ "min": 800.0,
127
+ "max": 3800.0
128
+ },
129
+ {
130
+ "current": 800.0,
131
+ "min": 800.0,
132
+ "max": 3800.0
133
+ },
134
+ {
135
+ "current": 784.579,
136
+ "min": 800.0,
137
+ "max": 3800.0
138
+ },
139
+ {
140
+ "current": 800.0,
141
+ "min": 800.0,
142
+ "max": 3800.0
143
+ },
144
+ {
145
+ "current": 784.219,
146
+ "min": 800.0,
147
+ "max": 3800.0
148
+ },
149
+ {
150
+ "current": 800.0,
151
+ "min": 800.0,
152
+ "max": 3800.0
153
+ },
154
+ {
155
+ "current": 800.0,
156
+ "min": 800.0,
157
+ "max": 3800.0
158
+ },
159
+ {
160
+ "current": 800.0,
161
+ "min": 800.0,
162
+ "max": 3800.0
163
+ },
164
+ {
165
+ "current": 800.0,
166
+ "min": 800.0,
167
+ "max": 3800.0
168
+ },
169
+ {
170
+ "current": 800.0,
171
+ "min": 800.0,
172
+ "max": 3800.0
173
+ },
174
+ {
175
+ "current": 800.0,
176
+ "min": 800.0,
177
+ "max": 3800.0
178
+ },
179
+ {
180
+ "current": 800.0,
181
+ "min": 800.0,
182
+ "max": 3800.0
183
+ },
184
+ {
185
+ "current": 800.0,
186
+ "min": 800.0,
187
+ "max": 3800.0
188
+ },
189
+ {
190
+ "current": 800.0,
191
+ "min": 800.0,
192
+ "max": 3800.0
193
+ },
194
+ {
195
+ "current": 800.0,
196
+ "min": 800.0,
197
+ "max": 3800.0
198
+ },
199
+ {
200
+ "current": 800.0,
201
+ "min": 800.0,
202
+ "max": 3800.0
203
+ },
204
+ {
205
+ "current": 800.0,
206
+ "min": 800.0,
207
+ "max": 3800.0
208
+ },
209
+ {
210
+ "current": 800.0,
211
+ "min": 800.0,
212
+ "max": 3800.0
213
+ },
214
+ {
215
+ "current": 800.0,
216
+ "min": 800.0,
217
+ "max": 3800.0
218
+ },
219
+ {
220
+ "current": 1800.0,
221
+ "min": 800.0,
222
+ "max": 3800.0
223
+ },
224
+ {
225
+ "current": 800.0,
226
+ "min": 800.0,
227
+ "max": 3800.0
228
+ },
229
+ {
230
+ "current": 800.0,
231
+ "min": 800.0,
232
+ "max": 3800.0
233
+ },
234
+ {
235
+ "current": 800.0,
236
+ "min": 800.0,
237
+ "max": 3800.0
238
+ },
239
+ {
240
+ "current": 800.0,
241
+ "min": 800.0,
242
+ "max": 3800.0
243
+ },
244
+ {
245
+ "current": 800.0,
246
+ "min": 800.0,
247
+ "max": 3800.0
248
+ },
249
+ {
250
+ "current": 800.0,
251
+ "min": 800.0,
252
+ "max": 3800.0
253
+ },
254
+ {
255
+ "current": 800.0,
256
+ "min": 800.0,
257
+ "max": 3800.0
258
+ },
259
+ {
260
+ "current": 800.0,
261
+ "min": 800.0,
262
+ "max": 3800.0
263
+ },
264
+ {
265
+ "current": 800.0,
266
+ "min": 800.0,
267
+ "max": 3800.0
268
+ },
269
+ {
270
+ "current": 2400.0,
271
+ "min": 800.0,
272
+ "max": 3800.0
273
+ },
274
+ {
275
+ "current": 800.0,
276
+ "min": 800.0,
277
+ "max": 3800.0
278
+ },
279
+ {
280
+ "current": 1800.0,
281
+ "min": 800.0,
282
+ "max": 3800.0
283
+ },
284
+ {
285
+ "current": 800.0,
286
+ "min": 800.0,
287
+ "max": 3800.0
288
+ },
289
+ {
290
+ "current": 1500.0,
291
+ "min": 800.0,
292
+ "max": 3800.0
293
+ },
294
+ {
295
+ "current": 800.0,
296
+ "min": 800.0,
297
+ "max": 3800.0
298
+ },
299
+ {
300
+ "current": 2400.0,
301
+ "min": 800.0,
302
+ "max": 3800.0
303
+ },
304
+ {
305
+ "current": 800.0,
306
+ "min": 800.0,
307
+ "max": 3800.0
308
+ },
309
+ {
310
+ "current": 800.0,
311
+ "min": 800.0,
312
+ "max": 3800.0
313
+ },
314
+ {
315
+ "current": 800.0,
316
+ "min": 800.0,
317
+ "max": 3800.0
318
+ },
319
+ {
320
+ "current": 800.0,
321
+ "min": 800.0,
322
+ "max": 3800.0
323
+ },
324
+ {
325
+ "current": 800.0,
326
+ "min": 800.0,
327
+ "max": 3800.0
328
+ },
329
+ {
330
+ "current": 800.0,
331
+ "min": 800.0,
332
+ "max": 3800.0
333
+ },
334
+ {
335
+ "current": 800.0,
336
+ "min": 800.0,
337
+ "max": 3800.0
338
+ },
339
+ {
340
+ "current": 800.0,
341
+ "min": 800.0,
342
+ "max": 3800.0
343
+ },
344
+ {
345
+ "current": 3800.0,
346
+ "min": 800.0,
347
+ "max": 3800.0
348
+ },
349
+ {
350
+ "current": 2000.0,
351
+ "min": 800.0,
352
+ "max": 3800.0
353
+ },
354
+ {
355
+ "current": 786.524,
356
+ "min": 800.0,
357
+ "max": 3800.0
358
+ },
359
+ {
360
+ "current": 800.0,
361
+ "min": 800.0,
362
+ "max": 3800.0
363
+ },
364
+ {
365
+ "current": 800.0,
366
+ "min": 800.0,
367
+ "max": 3800.0
368
+ },
369
+ {
370
+ "current": 1994.764,
371
+ "min": 800.0,
372
+ "max": 3800.0
373
+ },
374
+ {
375
+ "current": 784.651,
376
+ "min": 800.0,
377
+ "max": 3800.0
378
+ },
379
+ {
380
+ "current": 800.0,
381
+ "min": 800.0,
382
+ "max": 3800.0
383
+ },
384
+ {
385
+ "current": 3800.0,
386
+ "min": 800.0,
387
+ "max": 3800.0
388
+ },
389
+ {
390
+ "current": 1717.12,
391
+ "min": 800.0,
392
+ "max": 3800.0
393
+ },
394
+ {
395
+ "current": 800.0,
396
+ "min": 800.0,
397
+ "max": 3800.0
398
+ },
399
+ {
400
+ "current": 1300.0,
401
+ "min": 800.0,
402
+ "max": 3800.0
403
+ },
404
+ {
405
+ "current": 800.0,
406
+ "min": 800.0,
407
+ "max": 3800.0
408
+ },
409
+ {
410
+ "current": 2005.181,
411
+ "min": 800.0,
412
+ "max": 3800.0
413
+ },
414
+ {
415
+ "current": 800.0,
416
+ "min": 800.0,
417
+ "max": 3800.0
418
+ },
419
+ {
420
+ "current": 800.0,
421
+ "min": 800.0,
422
+ "max": 3800.0
423
+ },
424
+ {
425
+ "current": 800.0,
426
+ "min": 800.0,
427
+ "max": 3800.0
428
+ },
429
+ {
430
+ "current": 800.0,
431
+ "min": 800.0,
432
+ "max": 3800.0
433
+ },
434
+ {
435
+ "current": 800.0,
436
+ "min": 800.0,
437
+ "max": 3800.0
438
+ },
439
+ {
440
+ "current": 800.0,
441
+ "min": 800.0,
442
+ "max": 3800.0
443
+ },
444
+ {
445
+ "current": 800.0,
446
+ "min": 800.0,
447
+ "max": 3800.0
448
+ },
449
+ {
450
+ "current": 800.0,
451
+ "min": 800.0,
452
+ "max": 3800.0
453
+ },
454
+ {
455
+ "current": 800.0,
456
+ "min": 800.0,
457
+ "max": 3800.0
458
+ },
459
+ {
460
+ "current": 800.0,
461
+ "min": 800.0,
462
+ "max": 3800.0
463
+ },
464
+ {
465
+ "current": 800.0,
466
+ "min": 800.0,
467
+ "max": 3800.0
468
+ },
469
+ {
470
+ "current": 800.0,
471
+ "min": 800.0,
472
+ "max": 3800.0
473
+ },
474
+ {
475
+ "current": 800.0,
476
+ "min": 800.0,
477
+ "max": 3800.0
478
+ },
479
+ {
480
+ "current": 800.0,
481
+ "min": 800.0,
482
+ "max": 3800.0
483
+ },
484
+ {
485
+ "current": 800.0,
486
+ "min": 800.0,
487
+ "max": 3800.0
488
+ },
489
+ {
490
+ "current": 800.0,
491
+ "min": 800.0,
492
+ "max": 3800.0
493
+ },
494
+ {
495
+ "current": 800.0,
496
+ "min": 800.0,
497
+ "max": 3800.0
498
+ },
499
+ {
500
+ "current": 783.387,
501
+ "min": 800.0,
502
+ "max": 3800.0
503
+ },
504
+ {
505
+ "current": 800.0,
506
+ "min": 800.0,
507
+ "max": 3800.0
508
+ },
509
+ {
510
+ "current": 782.915,
511
+ "min": 800.0,
512
+ "max": 3800.0
513
+ },
514
+ {
515
+ "current": 800.0,
516
+ "min": 800.0,
517
+ "max": 3800.0
518
+ },
519
+ {
520
+ "current": 800.0,
521
+ "min": 800.0,
522
+ "max": 3800.0
523
+ },
524
+ {
525
+ "current": 800.0,
526
+ "min": 800.0,
527
+ "max": 3800.0
528
+ },
529
+ {
530
+ "current": 800.0,
531
+ "min": 800.0,
532
+ "max": 3800.0
533
+ },
534
+ {
535
+ "current": 800.0,
536
+ "min": 800.0,
537
+ "max": 3800.0
538
+ },
539
+ {
540
+ "current": 800.0,
541
+ "min": 800.0,
542
+ "max": 3800.0
543
+ },
544
+ {
545
+ "current": 800.0,
546
+ "min": 800.0,
547
+ "max": 3800.0
548
+ },
549
+ {
550
+ "current": 800.0,
551
+ "min": 800.0,
552
+ "max": 3800.0
553
+ },
554
+ {
555
+ "current": 800.0,
556
+ "min": 800.0,
557
+ "max": 3800.0
558
+ },
559
+ {
560
+ "current": 800.0,
561
+ "min": 800.0,
562
+ "max": 3800.0
563
+ },
564
+ {
565
+ "current": 800.0,
566
+ "min": 800.0,
567
+ "max": 3800.0
568
+ },
569
+ {
570
+ "current": 800.0,
571
+ "min": 800.0,
572
+ "max": 3800.0
573
+ },
574
+ {
575
+ "current": 800.0,
576
+ "min": 800.0,
577
+ "max": 3800.0
578
+ },
579
+ {
580
+ "current": 800.0,
581
+ "min": 800.0,
582
+ "max": 3800.0
583
+ },
584
+ {
585
+ "current": 800.0,
586
+ "min": 800.0,
587
+ "max": 3800.0
588
+ },
589
+ {
590
+ "current": 800.0,
591
+ "min": 800.0,
592
+ "max": 3800.0
593
+ },
594
+ {
595
+ "current": 800.0,
596
+ "min": 800.0,
597
+ "max": 3800.0
598
+ },
599
+ {
600
+ "current": 800.0,
601
+ "min": 800.0,
602
+ "max": 3800.0
603
+ },
604
+ {
605
+ "current": 800.0,
606
+ "min": 800.0,
607
+ "max": 3800.0
608
+ },
609
+ {
610
+ "current": 800.0,
611
+ "min": 800.0,
612
+ "max": 3800.0
613
+ },
614
+ {
615
+ "current": 800.0,
616
+ "min": 800.0,
617
+ "max": 3800.0
618
+ },
619
+ {
620
+ "current": 800.0,
621
+ "min": 800.0,
622
+ "max": 3800.0
623
+ },
624
+ {
625
+ "current": 784.371,
626
+ "min": 800.0,
627
+ "max": 3800.0
628
+ },
629
+ {
630
+ "current": 800.0,
631
+ "min": 800.0,
632
+ "max": 3800.0
633
+ },
634
+ {
635
+ "current": 800.0,
636
+ "min": 800.0,
637
+ "max": 3800.0
638
+ },
639
+ {
640
+ "current": 800.0,
641
+ "min": 800.0,
642
+ "max": 3800.0
643
+ },
644
+ {
645
+ "current": 800.0,
646
+ "min": 800.0,
647
+ "max": 3800.0
648
+ },
649
+ {
650
+ "current": 800.0,
651
+ "min": 800.0,
652
+ "max": 3800.0
653
+ },
654
+ {
655
+ "current": 800.0,
656
+ "min": 800.0,
657
+ "max": 3800.0
658
+ }
659
+ ],
660
+ "disk": {
661
+ "/": {
662
+ "total": 438.487850189209,
663
+ "used": 18.224109649658203
664
+ }
665
+ },
666
+ "gpu": "NVIDIA A100-SXM4-80GB",
667
+ "gpu_count": 8,
668
+ "gpu_devices": [
669
+ {
670
+ "name": "NVIDIA A100-SXM4-80GB",
671
+ "memory_total": 85899345920
672
+ },
673
+ {
674
+ "name": "NVIDIA A100-SXM4-80GB",
675
+ "memory_total": 85899345920
676
+ },
677
+ {
678
+ "name": "NVIDIA A100-SXM4-80GB",
679
+ "memory_total": 85899345920
680
+ },
681
+ {
682
+ "name": "NVIDIA A100-SXM4-80GB",
683
+ "memory_total": 85899345920
684
+ },
685
+ {
686
+ "name": "NVIDIA A100-SXM4-80GB",
687
+ "memory_total": 85899345920
688
+ },
689
+ {
690
+ "name": "NVIDIA A100-SXM4-80GB",
691
+ "memory_total": 85899345920
692
+ },
693
+ {
694
+ "name": "NVIDIA A100-SXM4-80GB",
695
+ "memory_total": 85899345920
696
+ },
697
+ {
698
+ "name": "NVIDIA A100-SXM4-80GB",
699
+ "memory_total": 85899345920
700
+ }
701
+ ],
702
+ "memory": {
703
+ "total": 2015.3287239074707
704
+ }
705
+ }
wandb/run-20250102_021927-pw8rud5e/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/loss": 0.2278, "train/grad_norm": 8.256059646606445, "train/learning_rate": 8.018078351076653e-07, "train/epoch": 0.07431516137781394, "train/num_input_tokens_seen": 2671771648, "train/global_step": 1274, "_timestamp": 1735793002.997822, "_runtime": 8635.158229112625, "_step": 273}
wandb/run-20250102_021927-pw8rud5e/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20250102_021927-pw8rud5e/logs/debug.log ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-01-02 02:19:27,831 INFO MainThread:2085425 [wandb_setup.py:_flush():76] Current SDK version is 0.17.3
2
+ 2025-01-02 02:19:27,831 INFO MainThread:2085425 [wandb_setup.py:_flush():76] Configure stats pid to 2085425
3
+ 2025-01-02 02:19:27,831 INFO MainThread:2085425 [wandb_setup.py:_flush():76] Loading settings from /home/ctpham_umass_edu/.config/wandb/settings
4
+ 2025-01-02 02:19:27,832 INFO MainThread:2085425 [wandb_setup.py:_flush():76] Loading settings from /work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/prolong-final/wandb/settings
5
+ 2025-01-02 02:19:27,832 INFO MainThread:2085425 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'root_dir': '/scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_', 'project': 'prolong', 'api_key': '***REDACTED***', 'mode': 'online'}
6
+ 2025-01-02 02:19:27,832 INFO MainThread:2085425 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2025-01-02 02:19:27,832 INFO MainThread:2085425 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'prolong-final/finetune.py', 'program_abspath': '/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/prolong-final/finetune.py', 'program': '/work/pi_miyyer_umass_edu/ctpham/BookClaim-dev/prolong-final/finetune.py'}
8
+ 2025-01-02 02:19:27,832 INFO MainThread:2085425 [wandb_setup.py:_flush():76] Applying login settings: {}
9
+ 2025-01-02 02:19:27,832 INFO MainThread:2085425 [wandb_init.py:_log_setup():520] Logging user logs to /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20250102_021927-pw8rud5e/logs/debug.log
10
+ 2025-01-02 02:19:27,832 INFO MainThread:2085425 [wandb_init.py:_log_setup():521] Logging internal logs to /scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/wandb/run-20250102_021927-pw8rud5e/logs/debug-internal.log
11
+ 2025-01-02 02:19:27,832 INFO MainThread:2085425 [wandb_init.py:init():560] calling init triggers
12
+ 2025-01-02 02:19:27,832 INFO MainThread:2085425 [wandb_init.py:init():567] wandb.init called with sweep_config: {}
13
+ config: {}
14
+ 2025-01-02 02:19:27,832 INFO MainThread:2085425 [wandb_init.py:init():610] starting backend
15
+ 2025-01-02 02:19:27,832 INFO MainThread:2085425 [wandb_init.py:init():614] setting up manager
16
+ 2025-01-02 02:19:27,835 INFO MainThread:2085425 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2025-01-02 02:19:27,839 INFO MainThread:2085425 [wandb_init.py:init():622] backend started and connected
18
+ 2025-01-02 02:19:27,847 INFO MainThread:2085425 [wandb_init.py:init():711] updated telemetry
19
+ 2025-01-02 02:19:27,879 INFO MainThread:2085425 [wandb_init.py:init():744] communicating run to backend with 90.0 second timeout
20
+ 2025-01-02 02:19:28,136 INFO MainThread:2085425 [wandb_run.py:_on_init():2402] communicating current version
21
+ 2025-01-02 02:19:28,196 INFO MainThread:2085425 [wandb_run.py:_on_init():2411] got version response upgrade_message: "wandb version 0.19.1 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
22
+
23
+ 2025-01-02 02:19:28,196 INFO MainThread:2085425 [wandb_init.py:init():795] starting run threads in backend
24
+ 2025-01-02 02:19:35,037 INFO MainThread:2085425 [wandb_run.py:_console_start():2380] atexit reg
25
+ 2025-01-02 02:19:35,037 INFO MainThread:2085425 [wandb_run.py:_redirect():2235] redirect: wrap_raw
26
+ 2025-01-02 02:19:35,037 INFO MainThread:2085425 [wandb_run.py:_redirect():2300] Wrapping output streams.
27
+ 2025-01-02 02:19:35,037 INFO MainThread:2085425 [wandb_run.py:_redirect():2325] Redirects installed.
28
+ 2025-01-02 02:19:35,039 INFO MainThread:2085425 [wandb_init.py:init():838] run started, returning control to user process
29
+ 2025-01-02 02:19:35,041 INFO MainThread:2085425 [wandb_run.py:_config_callback():1382] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 131072, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 500000.0, 'rope_scaling': {'factor': 8.0, 'low_freq_factor': 1.0, 'high_freq_factor': 4.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': 0, 'eos_token_id': [128001, 128008, 128009], 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '/datasets/ai/llama3/meta-llama/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/', 'transformers_version': '4.44.2', 'model_type': 'llama', 'output_dir': '/scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_', 'overwrite_output_dir': False, 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 1e-06, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.05, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/scratch3/workspace/ctpham_umass_edu-ft/_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_/runs/Jan02_02-12-22_gpu020', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 200, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 1, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '_llama-3.1-8b-instruct_bsz-16_lr-1e-6_epochs-1_', 'disable_tqdm': True, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': True, 'fsdp': ['auto_wrap', 'offload'], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': False, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': None, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'eval_use_gather_object': False, 'min_lr_ratio': 0.1, 'cuda_empty_cache': True, 'streaming_dataset': True, 'seq_parallel_size': 8}
30
+ 2025-01-02 02:19:35,044 INFO MainThread:2085425 [wandb_config.py:__setitem__():151] config set model/num_parameters = 1003782656 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x78e4f41c3100>>
31
+ 2025-01-02 02:19:35,044 INFO MainThread:2085425 [wandb_run.py:_config_callback():1382] config_cb model/num_parameters 1003782656 None
wandb/run-20250102_021927-pw8rud5e/run-pw8rud5e.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b6cc9012c5f1b9992a864e01d9766956c97b137b6869526158e3838a3707f24
3
+ size 1737525
wandb/run-20250102_074844-1ecgrehs/files/conda-environment.yaml ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: /scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final
2
+ channels:
3
+ - conda-forge
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=conda_forge
6
+ - _openmp_mutex=4.5=2_gnu
7
+ - bzip2=1.0.8=h4bc722e_7
8
+ - ca-certificates=2024.12.14=hbcca054_0
9
+ - ld_impl_linux-64=2.43=h712a8e2_2
10
+ - libffi=3.4.2=h7f98852_5
11
+ - libgcc=14.2.0=h77fa898_1
12
+ - libgcc-ng=14.2.0=h69a702a_1
13
+ - libgomp=14.2.0=h77fa898_1
14
+ - liblzma=5.6.3=hb9d3cd8_1
15
+ - liblzma-devel=5.6.3=hb9d3cd8_1
16
+ - libnsl=2.0.1=hd590300_0
17
+ - libsqlite=3.47.2=hee588c1_0
18
+ - libuuid=2.38.1=h0b41bf4_0
19
+ - libzlib=1.3.1=hb9d3cd8_2
20
+ - ncurses=6.5=he02047a_1
21
+ - openssl=3.4.0=hb9d3cd8_0
22
+ - pip=24.3.1=pyh8b19718_2
23
+ - python=3.10.0=h543edf9_3_cpython
24
+ - readline=8.2=h8228510_1
25
+ - setuptools=75.6.0=pyhff2d567_1
26
+ - sqlite=3.47.2=h9eae976_0
27
+ - tk=8.6.13=noxft_h4845f30_101
28
+ - wheel=0.45.1=pyhd8ed1ab_1
29
+ - xz=5.6.3=hbcc6ac9_1
30
+ - xz-gpl-tools=5.6.3=hbcc6ac9_1
31
+ - xz-tools=5.6.3=hb9d3cd8_1
32
+ - pip:
33
+ - accelerate==0.32.1
34
+ - aiohappyeyeballs==2.4.3
35
+ - aiohttp==3.11.2
36
+ - aioprometheus==23.12.0
37
+ - aiosignal==1.3.1
38
+ - annotated-types==0.7.0
39
+ - anthropic==0.39.0
40
+ - anyio==4.6.2.post1
41
+ - argcomplete==3.5.1
42
+ - arrow==1.3.0
43
+ - async-timeout==5.0.1
44
+ - attrs==24.2.0
45
+ - azure-core==1.32.0
46
+ - azure-identity==1.19.0
47
+ - azure-storage-blob==12.24.0
48
+ - azure-storage-file-datalake==12.18.0
49
+ - backoff==2.2.1
50
+ - bcrypt==4.2.0
51
+ - blobfile==3.0.0
52
+ - boto3==1.35.63
53
+ - botocore==1.35.63
54
+ - brotli==1.1.0
55
+ - cachetools==5.5.0
56
+ - certifi==2024.8.30
57
+ - cffi==1.17.1
58
+ - charset-normalizer==3.4.0
59
+ - circuitbreaker==2.0.0
60
+ - click==8.1.7
61
+ - cloudpickle==3.1.0
62
+ - compressed-tensors==0.8.0
63
+ - contourpy==1.3.1
64
+ - cramjam==2.9.0
65
+ - cryptography==43.0.3
66
+ - cycler==0.12.1
67
+ - datasets==2.20.0
68
+ - debugpy==1.8.11
69
+ - dill==0.3.8
70
+ - diskcache==5.6.3
71
+ - distro==1.9.0
72
+ - docker-pycreds==0.4.0
73
+ - docstring-parser==0.16
74
+ - einops==0.8.0
75
+ - fastapi==0.115.5
76
+ - filelock==3.16.1
77
+ - flash-attn==2.6.1
78
+ - fonttools==4.55.0
79
+ - frozenlist==1.5.0
80
+ - fsspec==2024.5.0
81
+ - gguf==0.10.0
82
+ - gitdb==4.0.11
83
+ - gitpython==3.1.43
84
+ - google-api-core==2.23.0
85
+ - google-auth==2.36.0
86
+ - google-cloud-aiplatform==1.71.1
87
+ - google-cloud-bigquery==3.27.0
88
+ - google-cloud-core==2.4.1
89
+ - google-cloud-resource-manager==1.13.1
90
+ - google-cloud-storage==2.10.0
91
+ - google-crc32c==1.6.0
92
+ - google-resumable-media==2.7.2
93
+ - googleapis-common-protos==1.66.0
94
+ - gql==3.5.0
95
+ - graphql-core==3.2.5
96
+ - grpc-google-iam-v1==0.13.1
97
+ - grpcio==1.68.0
98
+ - grpcio-status==1.62.3
99
+ - h11==0.14.0
100
+ - httpcore==1.0.7
101
+ - httptools==0.6.4
102
+ - httpx==0.27.2
103
+ - huggingface-hub==0.26.2
104
+ - idna==3.10
105
+ - importlib-metadata==8.5.0
106
+ - interegular==0.3.3
107
+ - ipython==8.18.0
108
+ - isodate==0.7.2
109
+ - jedi==0.19.2
110
+ - jinja2==3.1.4
111
+ - jiter==0.7.1
112
+ - jmespath==1.0.1
113
+ - jsonschema==4.23.0
114
+ - jsonschema-specifications==2024.10.1
115
+ - kiwisolver==1.4.7
116
+ - lark==1.2.2
117
+ - llvmlite==0.43.0
118
+ - lm-format-enforcer==0.10.9
119
+ - lxml==5.3.0
120
+ - markdown-it-py==3.0.0
121
+ - markupsafe==3.0.2
122
+ - matplotlib==3.9.2
123
+ - mdurl==0.1.2
124
+ - mosaicml-cli==0.5.34
125
+ - mosaicml-streaming==0.8.1
126
+ - mpmath==1.3.0
127
+ - msal==1.31.1
128
+ - msal-extensions==1.2.0
129
+ - msgpack==1.1.0
130
+ - msgspec==0.18.6
131
+ - multidict==6.1.0
132
+ - multiprocess==0.70.16
133
+ - networkx==3.4.2
134
+ - ninja==1.11.1.1
135
+ - numba==0.60.0
136
+ - numpy==1.26.4
137
+ - nvidia-cublas-cu12==12.1.3.1
138
+ - nvidia-cuda-cupti-cu12==12.1.105
139
+ - nvidia-cuda-nvrtc-cu12==12.1.105
140
+ - nvidia-cuda-runtime-cu12==12.1.105
141
+ - nvidia-cudnn-cu12==9.1.0.70
142
+ - nvidia-cufft-cu12==11.0.2.54
143
+ - nvidia-curand-cu12==10.3.2.106
144
+ - nvidia-cusolver-cu12==11.4.5.107
145
+ - nvidia-cusparse-cu12==12.1.0.106
146
+ - nvidia-ml-py==12.560.30
147
+ - nvidia-nccl-cu12==2.20.5
148
+ - nvidia-nvjitlink-cu12==12.4.127
149
+ - nvidia-nvtx-cu12==12.1.105
150
+ - oci==2.138.1
151
+ - openai==1.54.5
152
+ - opencv-python-headless==4.10.0.84
153
+ - orjson==3.10.11
154
+ - outlines==0.0.46
155
+ - packaging==24.1
156
+ - pandas==2.2.1
157
+ - paramiko==3.5.0
158
+ - partial-json-parser==0.2.1.1.post4
159
+ - pillow==10.4.0
160
+ - portalocker==2.10.1
161
+ - prometheus-client==0.21.0
162
+ - prometheus-fastapi-instrumentator==7.0.0
163
+ - prompt-toolkit==3.0.36
164
+ - propcache==0.2.0
165
+ - proto-plus==1.25.0
166
+ - protobuf==4.25.3
167
+ - py-cpuinfo==9.0.0
168
+ - pyairports==2.1.1
169
+ - pyarrow==18.0.0
170
+ - pyarrow-hotfix==0.6
171
+ - pyasn1==0.6.1
172
+ - pyasn1-modules==0.4.1
173
+ - pycountry==24.6.1
174
+ - pycparser==2.22
175
+ - pycryptodomex==3.21.0
176
+ - pydantic==2.9.2
177
+ - pydantic-core==2.23.4
178
+ - pyjwt==2.10.0
179
+ - pynacl==1.5.0
180
+ - pyopenssl==24.2.1
181
+ - pyparsing==3.2.0
182
+ - python-dateutil==2.9.0
183
+ - python-dotenv==1.0.1
184
+ - python-snappy==0.7.3
185
+ - pytz==2024.2
186
+ - pyyaml==6.0.2
187
+ - quantile-python==1.1
188
+ - questionary==2.0.1
189
+ - ray==2.39.0
190
+ - referencing==0.35.1
191
+ - regex==2023.12.25
192
+ - requests==2.32.3
193
+ - rich==13.9.4
194
+ - rotary-emb==0.5.2
195
+ - rpds-py==0.21.0
196
+ - rsa==4.9
197
+ - ruamel-yaml==0.18.6
198
+ - ruamel-yaml-clib==0.2.12
199
+ - s3transfer==0.10.3
200
+ - safetensors==0.4.5
201
+ - sentencepiece==0.1.99
202
+ - sentry-sdk==2.18.0
203
+ - setproctitle==1.3.4
204
+ - shapely==2.0.6
205
+ - simple-parsing==0.1.6
206
+ - smmap==5.0.1
207
+ - sniffio==1.3.1
208
+ - starlette==0.41.3
209
+ - sympy==1.13.1
210
+ - tiktoken==0.7.0
211
+ - tokenizers==0.19.1
212
+ - torch==2.4.1
213
+ - torchvision==0.19.1
214
+ - tqdm==4.66.4
215
+ - transformers==4.44.2
216
+ - triton==3.0.0
217
+ - types-python-dateutil==2.9.0.20241003
218
+ - tzdata==2024.2
219
+ - urllib3==2.2.3
220
+ - uvicorn==0.32.0
221
+ - uvloop==0.21.0
222
+ - validators==0.34.0
223
+ - vertexai==1.71.1
224
+ - wandb==0.17.3
225
+ - watchfiles==0.24.0
226
+ - websockets==11.0.3
227
+ - xformers==0.0.28.post1
228
+ - xxhash==3.5.0
229
+ - yarl==1.17.2
230
+ - zipp==3.21.0
231
+ - zstandard==0.23.0
232
+ - zstd==1.5.5.1
233
+ prefix: /scratch3/workspace/ctpham_umass_edu-ft/envs/prolong-final