Upload new k-quant GGML quantised models.
Browse files
README.md
CHANGED
@@ -1,8 +1,6 @@
|
|
1 |
---
|
2 |
inference: false
|
3 |
license: other
|
4 |
-
datasets:
|
5 |
-
- jondurbin/airoboros-gpt4
|
6 |
---
|
7 |
|
8 |
<!-- header start -->
|
@@ -19,9 +17,9 @@ datasets:
|
|
19 |
</div>
|
20 |
<!-- header end -->
|
21 |
|
22 |
-
# Jon Durbin's Airoboros
|
23 |
|
24 |
-
These files are GGML format model files for [Jon Durbin's Airoboros
|
25 |
|
26 |
GGML files are for CPU + GPU inference using [llama.cpp](https://github.com/ggerganov/llama.cpp) and libraries and UIs which support this format, such as:
|
27 |
* [text-generation-webui](https://github.com/oobabooga/text-generation-webui)
|
@@ -32,45 +30,55 @@ GGML files are for CPU + GPU inference using [llama.cpp](https://github.com/gger
|
|
32 |
|
33 |
## Repositories available
|
34 |
|
35 |
-
* [4-bit GPTQ models for GPU inference](https://huggingface.co/TheBloke/airoboros-
|
36 |
-
* [4
|
37 |
* [Unquantised fp16 model in pytorch format, for GPU inference and for further conversions](https://huggingface.co/TheBloke/airoboros-7b-gpt4-fp16)
|
38 |
|
39 |
-
|
|
|
40 |
|
41 |
-
|
42 |
|
43 |
-
|
44 |
-
USER: prompt
|
45 |
-
ASSISTANT:
|
46 |
-
```
|
47 |
-
|
48 |
-
## Context length with GGML
|
49 |
|
50 |
-
|
51 |
|
52 |
-
|
53 |
|
54 |
-
|
55 |
|
56 |
-
|
57 |
|
58 |
-
|
59 |
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
-
|
63 |
-
|
64 |
-
I have quantised the GGML files in this repo with the latest version. Therefore you will require llama.cpp compiled on May 19th or later (commit `2d5db48` or later) to use them.
|
65 |
|
66 |
## Provided files
|
67 |
| Name | Quant method | Bits | Size | Max RAM required | Use case |
|
68 |
| ---- | ---- | ---- | ---- | ---- | ----- |
|
69 |
-
| airoboros-
|
70 |
-
| airoboros-
|
71 |
-
| airoboros-
|
72 |
-
| airoboros-
|
73 |
-
| airoboros-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
|
76 |
**Note**: the above RAM figures assume no GPU offloading. If layers are offloaded to the GPU, this will reduce RAM usage and use VRAM instead.
|
@@ -80,7 +88,7 @@ I have quantised the GGML files in this repo with the latest version. Therefore
|
|
80 |
I use the following command line; adjust for your tastes and needs:
|
81 |
|
82 |
```
|
83 |
-
./main -t 10 -ngl 32 -m airoboros-
|
84 |
```
|
85 |
Change `-t 10` to the number of physical CPU cores you have. For example if your system has 8 cores/16 threads, use `-t 8`.
|
86 |
|
@@ -112,19 +120,21 @@ Donaters will get priority support on any and all AI/LLM/model questions and req
|
|
112 |
* Patreon: https://patreon.com/TheBlokeAI
|
113 |
* Ko-Fi: https://ko-fi.com/TheBlokeAI
|
114 |
|
115 |
-
**
|
|
|
|
|
116 |
|
117 |
Thank you to all my generous patrons and donaters!
|
|
|
118 |
<!-- footer end -->
|
119 |
|
120 |
-
# Original model card: Jon Durbin's Airoboros
|
|
|
121 |
|
122 |
## Overview
|
123 |
|
124 |
This is a fine-tuned 7b parameter LlaMa model, using completely synthetic training data created by gpt4 via https://github.com/jondurbin/airoboros
|
125 |
|
126 |
-
The context size has been increased to 4096.
|
127 |
-
|
128 |
The dataset used to fine-tune this model is available [here](https://huggingface.co/airoboros-gpt4), with a specific focus on:
|
129 |
- trivia
|
130 |
- math/reasoning (although it still sucks)
|
@@ -253,7 +263,7 @@ Once exploited for logging and mining, today the sparsely populated Upper Penins
|
|
253 |
The Lower Peninsula is a hub for manufacturing, forestry, agriculture, services, and high-tech industry.
|
254 |
ENDINPUT
|
255 |
BEGININSTRUCTION
|
256 |
-
What is Michigan's capital?
|
257 |
Give me a JSON object with 3 facts from Foolo's article.
|
258 |
What are the states that border Michigan?
|
259 |
Reference?
|
@@ -499,12 +509,12 @@ void insert(char* key, char* value) {
|
|
499 |
perror("malloc failed");
|
500 |
exit(1);
|
501 |
}
|
502 |
-
|
503 |
strcpy(parent->key, key);
|
504 |
strcpy(parent->value, value);
|
505 |
-
|
506 |
parent->left = parent->right = NULL;
|
507 |
-
|
508 |
if (!tree) {
|
509 |
tree = parent;
|
510 |
} else {
|
@@ -520,7 +530,7 @@ char* search(char* key) {
|
|
520 |
}
|
521 |
current = current->right;
|
522 |
}
|
523 |
-
|
524 |
return NULL;
|
525 |
}
|
526 |
|
@@ -535,17 +545,17 @@ void print_tree() {
|
|
535 |
void* accept_client(void* args) {
|
536 |
int client_socket = *(int*)args;
|
537 |
char buffer[1024];
|
538 |
-
|
539 |
while (1) {
|
540 |
ssize_t bytes_received = recv(client_socket, buffer, sizeof(buffer), 0);
|
541 |
if (bytes_received <= 0) {
|
542 |
close(client_socket);
|
543 |
break;
|
544 |
}
|
545 |
-
|
546 |
char* input = buffer;
|
547 |
char* input_token;
|
548 |
-
|
549 |
while ((input_token = strtok(input, " \n")) != NULL) {
|
550 |
switch (input_token[0]) {
|
551 |
case 'G':
|
@@ -555,7 +565,7 @@ void* accept_client(void* args) {
|
|
555 |
}
|
556 |
printf("%s = %s\n", input_token, search(input_token));
|
557 |
break;
|
558 |
-
|
559 |
case 'S':
|
560 |
if (!strcmp(input_token, "SET")) {
|
561 |
char* key = strtok(NULL, " \n");
|
@@ -564,7 +574,7 @@ void* accept_client(void* args) {
|
|
564 |
perror("Invalid input");
|
565 |
exit(1);
|
566 |
}
|
567 |
-
|
568 |
insert(key, value);
|
569 |
printf("%s set %s to %s\n", input_token, key, value);
|
570 |
} else {
|
@@ -572,16 +582,16 @@ void* accept_client(void* args) {
|
|
572 |
exit(1);
|
573 |
}
|
574 |
break;
|
575 |
-
|
576 |
default:
|
577 |
perror("Invalid input");
|
578 |
exit(1);
|
579 |
}
|
580 |
-
|
581 |
input = strtok(NULL, " \n");
|
582 |
}
|
583 |
}
|
584 |
-
|
585 |
return NULL;
|
586 |
}
|
587 |
|
@@ -591,50 +601,50 @@ int main() {
|
|
591 |
perror("socket failed");
|
592 |
exit(1);
|
593 |
}
|
594 |
-
|
595 |
struct sockaddr_in server_addr;
|
596 |
memset(&server_addr, 0, sizeof(server_addr));
|
597 |
server_addr.sin_family = AF_INET;
|
598 |
server_addr.sin_port = htons(8080);
|
599 |
server_addr.sin_addr.s_addr = INADDR_ANY;
|
600 |
-
|
601 |
if (bind(server_socket, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) {
|
602 |
perror("bind failed");
|
603 |
exit(1);
|
604 |
}
|
605 |
-
|
606 |
if (listen(server_socket, 5) < 0) {
|
607 |
perror("listen failed");
|
608 |
exit(1);
|
609 |
}
|
610 |
-
|
611 |
pthread_t accept_thread;
|
612 |
pthread_create(&accept_thread, NULL, accept_client, &server_socket);
|
613 |
-
|
614 |
char* client_input;
|
615 |
int client_socket = accept(server_socket, (struct sockaddr*)NULL, NULL);
|
616 |
if (client_socket < 0) {
|
617 |
perror("accept failed");
|
618 |
exit(1);
|
619 |
}
|
620 |
-
|
621 |
while (1) {
|
622 |
sleep(1);
|
623 |
-
|
624 |
char buffer[1024];
|
625 |
ssize_t bytes_received = recv(client_socket, buffer, sizeof(buffer), 0);
|
626 |
if (bytes_received <= 0) {
|
627 |
close(client_socket);
|
628 |
break;
|
629 |
}
|
630 |
-
|
631 |
client_input = buffer;
|
632 |
parse_input(client_input);
|
633 |
}
|
634 |
-
|
635 |
close(client_socket);
|
636 |
pthread_join(accept_thread, NULL);
|
637 |
-
|
638 |
return 0;
|
639 |
}
|
640 |
```
|
|
|
1 |
---
|
2 |
inference: false
|
3 |
license: other
|
|
|
|
|
4 |
---
|
5 |
|
6 |
<!-- header start -->
|
|
|
17 |
</div>
|
18 |
<!-- header end -->
|
19 |
|
20 |
+
# Jon Durbin's Airoboros 7B GPT4 GGML
|
21 |
|
22 |
+
These files are GGML format model files for [Jon Durbin's Airoboros 7B GPT4](https://huggingface.co/jondurbin/airoboros-7b-gpt4).
|
23 |
|
24 |
GGML files are for CPU + GPU inference using [llama.cpp](https://github.com/ggerganov/llama.cpp) and libraries and UIs which support this format, such as:
|
25 |
* [text-generation-webui](https://github.com/oobabooga/text-generation-webui)
|
|
|
30 |
|
31 |
## Repositories available
|
32 |
|
33 |
+
* [4-bit GPTQ models for GPU inference](https://huggingface.co/TheBloke/airoboros-7B-gpt4-GPTQ)
|
34 |
+
* [2, 3, 4, 5, 6 and 8-bit GGML models for CPU+GPU inference](https://huggingface.co/TheBloke/airoboros-7B-gpt4-GGML)
|
35 |
* [Unquantised fp16 model in pytorch format, for GPU inference and for further conversions](https://huggingface.co/TheBloke/airoboros-7b-gpt4-fp16)
|
36 |
|
37 |
+
<!-- compatibility_ggml start -->
|
38 |
+
## Compatibility
|
39 |
|
40 |
+
### Original llama.cpp quant methods: `q4_0, q4_1, q5_0, q5_1, q8_0`
|
41 |
|
42 |
+
I have quantised these 'original' quantisation methods using an older version of llama.cpp so that they remain compatible with llama.cpp as of May 19th, commit `2d5db48`.
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
+
They should be compatible with all current UIs and libraries that use llama.cpp, such as those listed at the top of this README.
|
45 |
|
46 |
+
### New k-quant methods: `q2_K, q3_K_S, q3_K_M, q3_K_L, q4_K_S, q4_K_M, q5_K_S, q6_K`
|
47 |
|
48 |
+
These new quantisation methods are only compatible with llama.cpp as of June 6th, commit `2d43387`.
|
49 |
|
50 |
+
They will NOT be compatible with koboldcpp, text-generation-webui, and other UIs and libraries yet. Support is expected to come over the next few days.
|
51 |
|
52 |
+
## Explanation of the new k-quant methods
|
53 |
|
54 |
+
The new methods available are:
|
55 |
+
* GGML_TYPE_Q2_K - "type-1" 2-bit quantization in super-blocks containing 16 blocks, each block having 16 weight. Block scales and mins are quantized with 4 bits. This ends up effectively using 2.5625 bits per weight (bpw)
|
56 |
+
* GGML_TYPE_Q3_K - "type-0" 3-bit quantization in super-blocks containing 16 blocks, each block having 16 weights. Scales are quantized with 6 bits. This ends up using 3.4375 bpw.
|
57 |
+
* GGML_TYPE_Q4_K - "type-1" 4-bit quantization in super-blocks containing 8 blocks, each block having 32 weights. Scales and mins are quantized with 6 bits. This ends up using 4.5 bpw.
|
58 |
+
* GGML_TYPE_Q5_K - "type-1" 5-bit quantization. Same super-block structure as GGML_TYPE_Q4_K resulting in 5.5 bpw
|
59 |
+
* GGML_TYPE_Q6_K - "type-0" 6-bit quantization. Super-blocks with 16 blocks, each block having 16 weights. Scales are quantized with 8 bits. This ends up using 6.5625 bpw
|
60 |
+
* GGML_TYPE_Q8_K - "type-0" 8-bit quantization. Only used for quantizing intermediate results. The difference to the existing Q8_0 is that the block size is 256. All 2-6 bit dot products are implemented for this quantization type.
|
61 |
|
62 |
+
Refer to the Provided Files table below to see what files use which methods, and how.
|
63 |
+
<!-- compatibility_ggml end -->
|
|
|
64 |
|
65 |
## Provided files
|
66 |
| Name | Quant method | Bits | Size | Max RAM required | Use case |
|
67 |
| ---- | ---- | ---- | ---- | ---- | ----- |
|
68 |
+
| airoboros-7B.ggmlv3.q2_K.bin | q2_K | 2 | 2.80 GB | 5.30 GB | New k-quant method. Uses GGML_TYPE_Q4_K for the attention.vw and feed_forward.w2 tensors, GGML_TYPE_Q2_K for the other tensors. |
|
69 |
+
| airoboros-7B.ggmlv3.q3_K_L.bin | q3_K_L | 3 | 3.55 GB | 6.05 GB | New k-quant method. Uses GGML_TYPE_Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else GGML_TYPE_Q3_K |
|
70 |
+
| airoboros-7B.ggmlv3.q3_K_M.bin | q3_K_M | 3 | 3.23 GB | 5.73 GB | New k-quant method. Uses GGML_TYPE_Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else GGML_TYPE_Q3_K |
|
71 |
+
| airoboros-7B.ggmlv3.q3_K_S.bin | q3_K_S | 3 | 2.90 GB | 5.40 GB | New k-quant method. Uses GGML_TYPE_Q3_K for all tensors |
|
72 |
+
| airoboros-7B.ggmlv3.q4_K_M.bin | q4_K_M | 4 | 4.05 GB | 6.55 GB | New k-quant method. Uses GGML_TYPE_Q6_K for half of the attention.wv and feed_forward.w2 tensors, else GGML_TYPE_Q4_K |
|
73 |
+
| airoboros-7B.ggmlv3.q4_K_S.bin | q4_K_S | 4 | 3.79 GB | 6.29 GB | New k-quant method. Uses GGML_TYPE_Q4_K for all tensors |
|
74 |
+
| airoboros-7B.ggmlv3.q5_K_M.bin | q5_K_M | 5 | 4.77 GB | 7.27 GB | New k-quant method. Uses GGML_TYPE_Q6_K for half of the attention.wv and feed_forward.w2 tensors, else GGML_TYPE_Q5_K |
|
75 |
+
| airoboros-7B.ggmlv3.q5_K_S.bin | q5_K_S | 5 | 4.63 GB | 7.13 GB | New k-quant method. Uses GGML_TYPE_Q5_K for all tensors |
|
76 |
+
| airoboros-7B.ggmlv3.q6_K.bin | q6_K | 6 | 5.53 GB | 8.03 GB | New k-quant method. Uses GGML_TYPE_Q6_K - 6-bit quantization - for all tensors |
|
77 |
+
| airoboros-7b-gpt4.ggmlv3.q4_0.bin | q4_0 | 4 | 3.79 GB | 6.29 GB | Original llama.cpp quant method, 4-bit. |
|
78 |
+
| airoboros-7b-gpt4.ggmlv3.q4_1.bin | q4_1 | 4 | 4.21 GB | 6.71 GB | Original llama.cpp quant method, 4-bit. Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models. |
|
79 |
+
| airoboros-7b-gpt4.ggmlv3.q5_0.bin | q5_0 | 5 | 4.63 GB | 7.13 GB | Original llama.cpp quant method, 5-bit. Higher accuracy, higher resource usage and slower inference. |
|
80 |
+
| airoboros-7b-gpt4.ggmlv3.q5_1.bin | q5_1 | 5 | 5.06 GB | 7.56 GB | Original llama.cpp quant method, 5-bit. Even higher accuracy, resource usage and slower inference. |
|
81 |
+
| airoboros-7b-gpt4.ggmlv3.q8_0.bin | q8_0 | 8 | 7.16 GB | 9.66 GB | Original llama.cpp quant method, 8-bit. Almost indistinguishable from float16. High resource use and slow. Not recommended for most users. |
|
82 |
|
83 |
|
84 |
**Note**: the above RAM figures assume no GPU offloading. If layers are offloaded to the GPU, this will reduce RAM usage and use VRAM instead.
|
|
|
88 |
I use the following command line; adjust for your tastes and needs:
|
89 |
|
90 |
```
|
91 |
+
./main -t 10 -ngl 32 -m airoboros-7B.ggmlv3.q5_0.bin --color -c 2048 --temp 0.7 --repeat_penalty 1.1 -n -1 -p "### Instruction: Write a story about llamas\n### Response:"
|
92 |
```
|
93 |
Change `-t 10` to the number of physical CPU cores you have. For example if your system has 8 cores/16 threads, use `-t 8`.
|
94 |
|
|
|
120 |
* Patreon: https://patreon.com/TheBlokeAI
|
121 |
* Ko-Fi: https://ko-fi.com/TheBlokeAI
|
122 |
|
123 |
+
**Special thanks to**: Luke from CarbonQuill, Aemon Algiz, Dmitriy Samsonov.
|
124 |
+
|
125 |
+
**Patreon special mentions**: Ajan Kanaga, Kalila, Derek Yates, Sean Connelly, Luke, Nathan LeClaire, Trenton Dambrowitz, Mano Prime, David Flickinger, vamX, Nikolai Manek, senxiiz, Khalefa Al-Ahmad, Illia Dulskyi, trip7s trip, Jonathan Leane, Talal Aujan, Artur Olbinski, Cory Kujawski, Joseph William Delisle, Pyrater, Oscar Rangel, Lone Striker, Luke Pendergrass, Eugene Pentland, Johann-Peter Hartmann.
|
126 |
|
127 |
Thank you to all my generous patrons and donaters!
|
128 |
+
|
129 |
<!-- footer end -->
|
130 |
|
131 |
+
# Original model card: Jon Durbin's Airoboros 7B GPT4
|
132 |
+
|
133 |
|
134 |
## Overview
|
135 |
|
136 |
This is a fine-tuned 7b parameter LlaMa model, using completely synthetic training data created by gpt4 via https://github.com/jondurbin/airoboros
|
137 |
|
|
|
|
|
138 |
The dataset used to fine-tune this model is available [here](https://huggingface.co/airoboros-gpt4), with a specific focus on:
|
139 |
- trivia
|
140 |
- math/reasoning (although it still sucks)
|
|
|
263 |
The Lower Peninsula is a hub for manufacturing, forestry, agriculture, services, and high-tech industry.
|
264 |
ENDINPUT
|
265 |
BEGININSTRUCTION
|
266 |
+
What is Michigan's capital?
|
267 |
Give me a JSON object with 3 facts from Foolo's article.
|
268 |
What are the states that border Michigan?
|
269 |
Reference?
|
|
|
509 |
perror("malloc failed");
|
510 |
exit(1);
|
511 |
}
|
512 |
+
|
513 |
strcpy(parent->key, key);
|
514 |
strcpy(parent->value, value);
|
515 |
+
|
516 |
parent->left = parent->right = NULL;
|
517 |
+
|
518 |
if (!tree) {
|
519 |
tree = parent;
|
520 |
} else {
|
|
|
530 |
}
|
531 |
current = current->right;
|
532 |
}
|
533 |
+
|
534 |
return NULL;
|
535 |
}
|
536 |
|
|
|
545 |
void* accept_client(void* args) {
|
546 |
int client_socket = *(int*)args;
|
547 |
char buffer[1024];
|
548 |
+
|
549 |
while (1) {
|
550 |
ssize_t bytes_received = recv(client_socket, buffer, sizeof(buffer), 0);
|
551 |
if (bytes_received <= 0) {
|
552 |
close(client_socket);
|
553 |
break;
|
554 |
}
|
555 |
+
|
556 |
char* input = buffer;
|
557 |
char* input_token;
|
558 |
+
|
559 |
while ((input_token = strtok(input, " \n")) != NULL) {
|
560 |
switch (input_token[0]) {
|
561 |
case 'G':
|
|
|
565 |
}
|
566 |
printf("%s = %s\n", input_token, search(input_token));
|
567 |
break;
|
568 |
+
|
569 |
case 'S':
|
570 |
if (!strcmp(input_token, "SET")) {
|
571 |
char* key = strtok(NULL, " \n");
|
|
|
574 |
perror("Invalid input");
|
575 |
exit(1);
|
576 |
}
|
577 |
+
|
578 |
insert(key, value);
|
579 |
printf("%s set %s to %s\n", input_token, key, value);
|
580 |
} else {
|
|
|
582 |
exit(1);
|
583 |
}
|
584 |
break;
|
585 |
+
|
586 |
default:
|
587 |
perror("Invalid input");
|
588 |
exit(1);
|
589 |
}
|
590 |
+
|
591 |
input = strtok(NULL, " \n");
|
592 |
}
|
593 |
}
|
594 |
+
|
595 |
return NULL;
|
596 |
}
|
597 |
|
|
|
601 |
perror("socket failed");
|
602 |
exit(1);
|
603 |
}
|
604 |
+
|
605 |
struct sockaddr_in server_addr;
|
606 |
memset(&server_addr, 0, sizeof(server_addr));
|
607 |
server_addr.sin_family = AF_INET;
|
608 |
server_addr.sin_port = htons(8080);
|
609 |
server_addr.sin_addr.s_addr = INADDR_ANY;
|
610 |
+
|
611 |
if (bind(server_socket, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) {
|
612 |
perror("bind failed");
|
613 |
exit(1);
|
614 |
}
|
615 |
+
|
616 |
if (listen(server_socket, 5) < 0) {
|
617 |
perror("listen failed");
|
618 |
exit(1);
|
619 |
}
|
620 |
+
|
621 |
pthread_t accept_thread;
|
622 |
pthread_create(&accept_thread, NULL, accept_client, &server_socket);
|
623 |
+
|
624 |
char* client_input;
|
625 |
int client_socket = accept(server_socket, (struct sockaddr*)NULL, NULL);
|
626 |
if (client_socket < 0) {
|
627 |
perror("accept failed");
|
628 |
exit(1);
|
629 |
}
|
630 |
+
|
631 |
while (1) {
|
632 |
sleep(1);
|
633 |
+
|
634 |
char buffer[1024];
|
635 |
ssize_t bytes_received = recv(client_socket, buffer, sizeof(buffer), 0);
|
636 |
if (bytes_received <= 0) {
|
637 |
close(client_socket);
|
638 |
break;
|
639 |
}
|
640 |
+
|
641 |
client_input = buffer;
|
642 |
parse_input(client_input);
|
643 |
}
|
644 |
+
|
645 |
close(client_socket);
|
646 |
pthread_join(accept_thread, NULL);
|
647 |
+
|
648 |
return 0;
|
649 |
}
|
650 |
```
|