NyxKrage mo137 committed on
Commit
107ba5f
1 Parent(s): 9243531

Add support for EXL2 4 bit KV cache; switch from metric gigabytes (1e9 bytes) to JEDEC gigabytes (2^30 bytes) (#2)

Browse files

- Add support for EXL2 4 bit KV cache; switch from metric gigabytes (1e9 bytes) to JEDEC gigabytes (2^30 bytes) (c8c7129688460a9f9ecab851edacde8933618778)


Co-authored-by: Matt <[email protected]>

Files changed (1) hide show
  1. index.html +17 -17
index.html CHANGED
@@ -128,19 +128,16 @@
128
  return (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024
129
  }
130
 
131
- function kvCache(context=8192, model_config, fp8_cache=false) {
132
  const n_gqa = model_config["num_attention_heads"] / model_config["num_key_value_heads"]
133
  const n_embd_gqa = model_config["hidden_size"] / n_gqa
134
  const n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
135
  const size = 2 * n_elements
136
- if (fp8_cache) {
137
- return size
138
- }
139
- return size * 2
140
  }
141
 
142
- function contextSize(context=8192, model_config, bsz=512, fp8_cache=false) {
143
- return Number.parseFloat((inputBuffer(context, model_config, bsz) + kvCache(context, model_config, fp8_cache) + computeBuffer(context, model_config, bsz)).toFixed(2))
144
  }
145
 
146
  function modelSize(model_config, bpw=4.5) {
@@ -152,22 +149,22 @@
152
  const model_config = await modelConfig(document.getElementById("modelsearch").value)
153
  const context = parseInt(document.getElementById("contextsize").value)
154
  let bsz = 512
155
- let fp8_cache = false
156
  let bpw = 0
157
  if (format === "gguf") {
158
  bsz = parseInt(document.getElementById("batchsize").value)
159
  bpw = gguf_quants[document.getElementById("quantsize").innerText]
160
 
161
  } else if (format == "exl2") {
162
- fp8_cache = document.getElementById("fp8cache").checked
163
  bpw = Number.parseFloat(document.getElementById("bpw").value)
164
  }
165
 
166
  const model_size = modelSize(model_config, bpw)
167
- const context_size = contextSize(context, model_config, bsz, fp8_cache)
168
- const total_size = ((model_size + context_size) / 1e+9)
169
- document.getElementById("resultmodel").innerText = (model_size / 1e+9).toFixed(2)
170
- document.getElementById("resultcontext").innerText = (context_size / 1e+9).toFixed(2)
171
  const result_total_el = document.getElementById("resulttotal");
172
  result_total_el.innerText = total_size.toFixed(2)
173
 
@@ -401,13 +398,16 @@
401
  class="w-fit p-3 h-full flex items-center gap-2 justify-center rounded-md border-0 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
402
  >
403
  <label
404
- for="fp8cache"
405
  class="inline-block bg-white text-xs font-medium text-gray-900"
406
  >
407
- FP8 Cache
408
  </label>
409
- <input id="fp8cache" type="checkbox">
410
- </input>
 
 
 
411
  </div>
412
  </div>
413
  </div>
 
128
  return (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024
129
  }
130
 
131
+ function kvCache(context=8192, model_config, cache_bit=16) {
132
  const n_gqa = model_config["num_attention_heads"] / model_config["num_key_value_heads"]
133
  const n_embd_gqa = model_config["hidden_size"] / n_gqa
134
  const n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
135
  const size = 2 * n_elements
136
+ return size * (cache_bit / 8)
 
 
 
137
  }
138
 
139
+ function contextSize(context=8192, model_config, bsz=512, cache_bit=16) {
140
+ return Number.parseFloat((inputBuffer(context, model_config, bsz) + kvCache(context, model_config, cache_bit) + computeBuffer(context, model_config, bsz)).toFixed(2))
141
  }
142
 
143
  function modelSize(model_config, bpw=4.5) {
 
149
  const model_config = await modelConfig(document.getElementById("modelsearch").value)
150
  const context = parseInt(document.getElementById("contextsize").value)
151
  let bsz = 512
152
+ let cache_bit = 16
153
  let bpw = 0
154
  if (format === "gguf") {
155
  bsz = parseInt(document.getElementById("batchsize").value)
156
  bpw = gguf_quants[document.getElementById("quantsize").innerText]
157
 
158
  } else if (format == "exl2") {
159
+ cache_bit = Number.parseInt(document.getElementById("kvCache").value)
160
  bpw = Number.parseFloat(document.getElementById("bpw").value)
161
  }
162
 
163
  const model_size = modelSize(model_config, bpw)
164
+ const context_size = contextSize(context, model_config, bsz, cache_bit)
165
+ const total_size = ((model_size + context_size) / 2**30)
166
+ document.getElementById("resultmodel").innerText = (model_size / 2**30).toFixed(2)
167
+ document.getElementById("resultcontext").innerText = (context_size / 2**30).toFixed(2)
168
  const result_total_el = document.getElementById("resulttotal");
169
  result_total_el.innerText = total_size.toFixed(2)
170
 
 
398
  class="w-fit p-3 h-full flex items-center gap-2 justify-center rounded-md border-0 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
399
  >
400
  <label
401
+ for="kvCache"
402
  class="inline-block bg-white text-xs font-medium text-gray-900"
403
  >
404
+ KV Cache
405
  </label>
406
+ <select id="kvCache" name="kvCache">
407
+ <option value="16">16 bit</option>
408
+ <option value="8">8 bit</option>
409
+ <option value="4">4 bit</option>
410
+ </select>
411
  </div>
412
  </div>
413
  </div>