Triangle104 committed on
Commit acaa5b4 · verified · Parent: 3ab59a0

Update README.md

Files changed (1)
  1. README.md +80 -72
README.md CHANGED
@@ -27,79 +27,87 @@ This model was converted to GGUF format from [`EVA-UNIT-01/EVA-Qwen2.5-14B-v0.2`
 Refer to the [original model card](https://huggingface.co/EVA-UNIT-01/EVA-Qwen2.5-14B-v0.2) for more details on the model.
 
 ---
-
- - model.layers.33.self_attn.v_proj
- - model.layers.42.self_attn.v_proj
- - model.layers.29.self_attn.v_proj
- - model.layers.9.self_attn.v_proj
- - model.layers.14.self_attn.v_proj
- - model.layers.35.self_attn.v_proj
- - model.layers.38.self_attn.v_proj
- - model.layers.13.self_attn.v_proj
- - model.layers.30.self_attn.v_proj
- - model.layers.34.self_attn.v_proj
- - model.layers.5.self_attn.v_proj
- - model.layers.28.self_attn.v_proj
- - model.layers.37.self_attn.v_proj
- - model.layers.27.self_attn.v_proj
- - model.layers.11.self_attn.v_proj
-
- wandb_project: EVA-Qwen2.5-14B-SFFT-v0.2
- wandb_entity:
- wandb_watch:
- wandb_name: Unit-02
- wandb_log_model:
-
- gradient_accumulation_steps: 8
- micro_batch_size: 2
- num_epochs: 3
- optimizer: paged_ademamix_8bit
- lr_scheduler: cosine
- learning_rate: 0.00005
- max_grad_norm: 3
-
- train_on_inputs: false
- group_by_length: false
- bf16: auto
- fp16:
- tf32: false
-
- gradient_checkpointing: "unsloth"
- # gradient_checkpointing_kwargs:
- # use_reentrant: true
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
- logging_steps: 1
- xformers_attention:
- flash_attention: true
-
- warmup_steps: 20
- evals_per_epoch: 4
- saves_per_epoch: 4
- save_safetensors: true
- hub_model_id:
- hub_strategy:
- debug:
- deepspeed: deepspeed_configs/zero3_bf16.json
- weight_decay: 0.1
- # fsdp:
- # - full_shard
- # - auto_wrap
- # fsdp_config:
- # fsdp_limit_all_gathers: true
- # fsdp_sync_module_states: false
- # fsdp_offload_params: true
- # fsdp_cpu_ram_efficient_loading: true
- # fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
- # fsdp_transformer_layer_cls_to_wrap: Qwen2DecoderLayer
- # fsdp_activation_checkpointing: true
- # fsdp_state_dict_type: SHARDED_STATE_DICT # Changed from FULL_STATE_DICT
- # fsdp_sharding_strategy: FULL_SHARD
- # fsdp_forward_prefetch: false # Added
- # fsdp_backward_prefetch: "BACKWARD_PRE" # Added
- # fsdp_backward_prefetch_limit: 1 # Added
- # fsdp_mixed_precision: BF16 # Added
+ Model details:
+ -
+
+ An RP/storywriting specialist model, a full-parameter finetune of Qwen2.5-14B on a mixture of synthetic and natural data.
+
+ It uses the Celeste 70B 0.1 data mixture, greatly expanded to improve the
+ versatility, creativity and "flavor" of the resulting model.
+
+ Version notes for 0.2: Now using the refined dataset from 32B 0.2.
+ Major improvements in coherence, instruction following and
+ long-context comprehension over 14B v0.1.
+
+ Prompt format is ChatML.
+
+ Recommended sampler values:
+
+ Temperature: 0.8
+ Min-P: 0.05
+ Top-A: 0.3
+ Repetition Penalty: 1.03
+
+ Recommended SillyTavern presets (via CalamitousFelicitousness):
+
+ Context
+ Instruct and System Prompt
+
+ Training data:
+
+ Celeste 70B 0.1 data mixture minus the Opus Instruct subset. See that model's card for details.
+ Kalomaze's Opus_Instruct_25k dataset, filtered for refusals.
+ A subset (1k rows) of ChatGPT-4o-WritingPrompts by Gryphe
+ A subset (2k rows) of Sonnet3.5-Charcards-Roleplay by Gryphe
+ Synthstruct and SynthRP datasets by Epiculous
+ A subset from Dolphin-2.9.3, including a filtered version of not_samantha and a small subset of systemchat.
+
+ Training time and hardware:
+
+ 3 hours on 8xH100 SXM, provided by FeatherlessAI
+
+ The model was created by Kearm, Auri and Cahvay.
+
+ Special thanks:
+ to Cahvay for his work on investigating and reprocessing the
+ corrupted dataset, removing the single biggest source of data poisoning.
+ to FeatherlessAI for generously providing an 8xH100 SXM node for training this model
+ to Gryphe, Lemmy, Kalomaze, Nopm, Epiculous and CognitiveComputations for the data
+ and to Allura-org for support, feedback, beta-testing and doing quality control of EVA models.
 
 ---
 ## Use with llama.cpp
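For reference, the ChatML format named in the README above wraps every turn in `<|im_start|>` and `<|im_end|>` tokens. A minimal template (the system and user text are placeholders, not part of the model card):

```
<|im_start|>system
{system prompt}<|im_end|>
<|im_start|>user
{user message}<|im_end|>
<|im_start|>assistant
```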
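A minimal sketch of applying the recommended sampler values with the llama.cpp CLI, assuming a recent build and an illustrative GGUF filename; Top-A is not a built-in llama.cpp sampler, so only temperature, Min-P and repetition penalty are passed:

```bash
# Interactive chat with the recommended samplers (filename is an example)
./llama-cli -m EVA-Qwen2.5-14B-v0.2-Q8_0.gguf \
  --temp 0.8 --min-p 0.05 --repeat-penalty 1.03 \
  -cnv -p "You are a creative roleplay and storywriting assistant."
```

With `-cnv`, llama-cli runs in conversation mode and uses the ChatML template embedded in the GGUF metadata, so the prompt passed via `-p` acts as the system prompt.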