leafspark committed
Commit 9afce9c
Parent: bdd3422

readme: add new llama.cpp release info

Files changed (1): README.md (+11 -11)
README.md CHANGED
@@ -7,21 +7,20 @@ tags:
  - deepseek
  - gguf
  - bf16
- - chinese
- - english
  metrics:
  - accuracy
+ language:
+ - en
+ - zh
  ---

  # Deepseek-V2-Chat-GGUF

  Quantized from [https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat](https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat)

- Using llama.cpp fork: [https://github.com/fairydreaming/llama.cpp/tree/deepseek-v2](https://github.com/fairydreaming/llama.cpp/tree/deepseek-v2)
+ Using llama.cpp b3026 for quantization

- TODO: Make llamafile for Q2_K and Q4_K_M
-
- # Warning: This will not work unless you compile llama.cpp from the repo provided (and set metadata KV overrides)!
+ # Warning: This will not work unless you set metadata KV overrides, nor will it in LM Studio/similar wrapper apps!

  # How to use:
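The hunk above replaces the fairydreaming fork with stock llama.cpp b3026 for quantization. As a rough illustration of the step it refers to, a bf16 GGUF would be quantized with llama.cpp's `quantize` tool roughly as follows; this is only a sketch, and the file names are placeholders rather than the shard names actually shipped in this repo:

```sh
# Sketch of the quantization step, assuming a b3026-era llama.cpp build
# (binary still named `quantize`). File names are hypothetical placeholders.
./quantize \
  DeepSeek-V2-Chat-BF16.gguf \
  DeepSeek-V2-Chat-Q4_K_M.gguf \
  Q4_K_M
```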
@@ -79,27 +78,28 @@ quantize \
  # Quants:
  ```
  - bf16 [size: 439gb]
- - q8_0 (later, please use q4_k_m for now) [estimated size: 233.27gb]
+ - q8_0 [estimated size: 233.27gb]
  - q4_k_m [size: 132gb]
  - q2_k [size: 80gb]
  - iq2_xxs [size: 61.5gb]
  - iq3_xs (uploading) [size: 89.6gb]
- - iq1_m [size: 27.3gb]
+ - iq1_m (uploading) [size: 27.3gb]
+ - q3_k_m (uploading) [size: 92.6gb]
  ```

  Note: Use iMatrix quants only if you can fully offload to GPU, otherwise speed will be affected a lot.

- # Planned Quants (using importance matrix):
+ # Planned Quants (weighted/imatrix):
  ```
  - q5_k_m
  - q5_k_s
- - q3_k_m
  - q6_k
  - iq4_nl
  - iq4_xs
  - iq2_xs
  - iq2_s
  - iq2_m
+ - iq3_xxs
  - iq1_s (note: for fun only, this quant is likely useless)
  ```
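For the "Planned Quants (weighted/imatrix)" list in the hunk above, an importance matrix has to be computed first and then passed to `quantize`. A minimal sketch using llama.cpp's `imatrix` tool; the calibration file and output names are assumptions, not files from this repo:

```sh
# Sketch: derive an importance matrix from calibration text, then use it
# for a weighted quant. File names are hypothetical placeholders.
./imatrix -m DeepSeek-V2-Chat-BF16.gguf -f calibration.txt -o imatrix.dat
./quantize --imatrix imatrix.dat \
  DeepSeek-V2-Chat-BF16.gguf DeepSeek-V2-Chat-IQ2_XS.gguf IQ2_XS
```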
@@ -113,7 +113,7 @@ deepseek2.expert_shared_count=int:2
  deepseek2.expert_feed_forward_length=int:1536
  deepseek2.experts_weight_scale=int:16
  deepseek2.leading_dense_block_count=int:1
- rope.scaling.yarn_log_multiplier=float:0.0707
+ deepseek2.rope.scaling.yarn_log_multiplier=float:0.0707
  ```

  A precompiled AVX2 version is available at `llama.cpp-039896407afd40e54321d47c5063c46a52da3e01.zip` in the root of this repo.
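The KV list this hunk completes is what the earlier warning about metadata KV overrides points at: the entries already use llama.cpp's `key=type:value` override syntax, so they can be passed directly with repeated `--override-kv` flags at load time. A minimal sketch, assuming the b3026-era `main` binary and a placeholder model path; the keys and values are taken verbatim from the hunk above:

```sh
# Sketch: load a quant with the README's metadata KV overrides applied.
# Binary name and model path are assumptions; keys/values come from the list above.
./main -m DeepSeek-V2-Chat-Q4_K_M.gguf -p "Hello" \
  --override-kv deepseek2.expert_shared_count=int:2 \
  --override-kv deepseek2.expert_feed_forward_length=int:1536 \
  --override-kv deepseek2.experts_weight_scale=int:16 \
  --override-kv deepseek2.leading_dense_block_count=int:1 \
  --override-kv deepseek2.rope.scaling.yarn_log_multiplier=float:0.0707
```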