Spaces:
Running
on
A10G
Running
on
A10G
File size: 6,653 Bytes
08e5ef1 7edda8b 2bede7c 7edda8b 2bede7c 75b770e 08e5ef1 1fba392 925d15e 08e5ef1 2bede7c 69d19e7 925d15e 7686e09 2bede7c d9267f6 7c36326 d9267f6 5696fee f4651d4 9781999 d9267f6 75b770e 2124573 f4651d4 2124573 d9267f6 9781999 f4651d4 9781999 5696fee 9781999 b7ccecf 9781999 5696fee 9781999 2124573 5696fee 9781999 b7ccecf d9267f6 b7ccecf d9267f6 9781999 5696fee ef80b76 9781999 ef80b76 b7ccecf 9781999 b7ccecf 9781999 b7ccecf f4651d4 9781999 f4651d4 9781999 b7ccecf f4651d4 9781999 5696fee 9781999 b7ccecf 2124573 b7ccecf ef80b76 b7ccecf 9781999 5696fee 9781999 5696fee 9781999 5696fee 9781999 5696fee 9781999 5696fee 9781999 5696fee 9781999 2bede7c f4651d4 2bede7c 1fba392 7edda8b 1fba392 7edda8b f4651d4 7686e09 b416bb7 7edda8b 2124573 d9267f6 f4651d4 7cd57ad 87f5ccd d9267f6 2bede7c ec000c3 d2fb1de ec000c3 2bede7c 925d15e 2bede7c ec000c3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
import os
import shutil
import subprocess
import gradio as gr
from huggingface_hub import create_repo, HfApi
from huggingface_hub import snapshot_download
from huggingface_hub import whoami
from huggingface_hub import ModelCard
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from apscheduler.schedulers.background import BackgroundScheduler
from textwrap import dedent
# Architectures for which script_to_use() selects llama.cpp's "convert.py";
# any other architecture is converted with "convert-hf-to-gguf.py".
LLAMA_LIKE_ARCHS = ["MistralForCausalLM",]
# Hub token read from the environment (None when the variable is unset);
# restart_space() passes it to HfApi.restart_space.
HF_TOKEN = os.environ.get("HF_TOKEN")
def script_to_use(model_id, api):
    """Pick the llama.cpp conversion script for a Hub model.

    Args:
        model_id: Hub repo id of the model to inspect.
        api: Object exposing ``model_info(model_id)`` (e.g. an ``HfApi``).

    Returns:
        ``"convert.py"`` when the model's first declared architecture is in
        ``LLAMA_LIKE_ARCHS``, ``"convert-hf-to-gguf.py"`` for any other
        architecture, or ``None`` when the model has no config or declares
        no architectures.
    """
    info = api.model_info(model_id)
    if info.config is None:
        return None

    architectures = info.config.get("architectures")
    # An empty list is treated like a missing key instead of raising
    # IndexError on the [0] lookup below.
    if not architectures:
        return None

    return "convert.py" if architectures[0] in LLAMA_LIKE_ARCHS else "convert-hf-to-gguf.py"
def process_model(model_id, q_method, private_repo, oauth_token: gr.OAuthToken | None):
    """Download a Hub model, convert it to GGUF, quantize it and upload the result.

    Steps: download the model files into a local directory named after the
    model, convert them to an fp16 file with a llama.cpp conversion script,
    quantize that file with ``./llama.cpp/quantize``, create (or reuse) the
    repo ``<model_name>-<q_method>-GGUF`` and upload the quantized file
    together with a generated README.

    Args:
        model_id: Hub repo id of the source model (e.g. ``"org/model"``).
        q_method: Quantization type passed to the quantize binary.
        private_repo: Whether the destination repo is created as private.
        oauth_token: OAuth token of the logged-in user; ``None`` (or a token
            object whose ``token`` is ``None``) means the user is not logged in.

    Returns:
        A ``(message, image_path)`` tuple: a link to the new repo and
        ``"llama.png"`` on success, or ``"Error: ..."`` and ``"error.png"``
        when any step inside the pipeline fails.

    Raises:
        ValueError: If the user is not logged in, or if ``model_id`` does not
            yield a usable local directory name.
    """
    # The token object itself is None when nobody is logged in, so check it
    # before dereferencing `.token`.
    if oauth_token is None or oauth_token.token is None:
        raise ValueError("You must be logged in to use GGUF-my-repo")

    model_name = (model_id or "").split('/')[-1]
    # model_name doubles as the working directory that `finally` deletes;
    # refuse names that would point that cleanup at the cwd or its parent.
    if model_name in ("", ".", ".."):
        raise ValueError(f"Invalid model id: {model_id!r}")

    fp16 = f"{model_name}/{model_name.lower()}.fp16.bin"

    try:
        api = HfApi(token=oauth_token.token)

        # Prefer safetensors weights when the repo has any, else fall back to *.bin.
        has_safetensors = any(
            file.path.endswith(".safetensors")
            for file in api.list_repo_tree(
                repo_id=model_id,
                recursive=True,
            )
        )
        dl_pattern = ["*.md", "*.json", "*.model"]
        # append(), not `+=`: extending a list with a str adds one entry per
        # character (including a bare "*" that matches every file).
        dl_pattern.append("*.safetensors" if has_safetensors else "*.bin")

        api.snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
        print("Model downloaded successully!")

        conversion_script = script_to_use(model_id, api)
        if conversion_script is None:
            raise Exception(
                f"Could not determine the architecture of {model_id} "
                "(missing config or architectures)"
            )

        # Argument lists with the default shell=False keep the user-supplied
        # model id / quant method from being interpreted by a shell.
        fp16_conversion = [
            "python", f"llama.cpp/{conversion_script}", model_name,
            "--outtype", "f16", "--outfile", fp16,
        ]
        result = subprocess.run(fp16_conversion, capture_output=True)
        print(result)
        if result.returncode != 0:
            raise Exception(f"Error converting to fp16: {result.stderr}")
        print("Model converted to fp16 successully!")

        qtype = f"{model_name}/{model_name.lower()}.{q_method.upper()}.gguf"
        gguf_name = qtype.split("/")[-1]
        quantise_ggml = ["./llama.cpp/quantize", fp16, qtype, q_method]
        result = subprocess.run(quantise_ggml, capture_output=True)
        if result.returncode != 0:
            raise Exception(f"Error quantizing: {result.stderr}")
        print("Quantised successfully!")

        # Create empty repo
        new_repo_url = api.create_repo(repo_id=f"{model_name}-{q_method}-GGUF", exist_ok=True, private=private_repo)
        new_repo_id = new_repo_url.repo_id
        print("Repo created successfully!", new_repo_url)

        # Best effort: start from the source model's card, or from an empty
        # card when it cannot be loaded.
        try:
            card = ModelCard.load(model_id, token=oauth_token.token)
        except Exception:
            card = ModelCard("")
        if card.data.tags is None:
            card.data.tags = []
        card.data.tags.append("llama-cpp")
        card.data.tags.append("gguf-my-repo")
        # The trailing backslashes in the last snippet are escaped so they are
        # emitted literally instead of joining the lines of the f-string.
        card.text = dedent(
            f"""
            # {new_repo_id}
            This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
            Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
            ## Use with llama.cpp
            Install llama.cpp through brew.
            ```bash
            brew install ggerganov/ggerganov/llama.cpp
            ```
            Invoke the llama.cpp server or the CLI.
            CLI:
            ```bash
            llama-cli --hf-repo {new_repo_id} --model {gguf_name} -p "The meaning to life and the universe is"
            ```
            Server:
            ```bash
            llama-server --hf-repo {new_repo_id} --model {gguf_name} -c 2048
            ```
            Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
            ```
            git clone https://github.com/ggerganov/llama.cpp && \\
            cd llama.cpp && \\
            make && \\
            ./main -m {gguf_name} -n 128
            ```
            """
        )
        readme_path = os.path.join(model_name, "README-new.md")
        card.save(readme_path)

        api.upload_file(
            path_or_fileobj=qtype,
            path_in_repo=gguf_name,
            repo_id=new_repo_id,
        )
        api.upload_file(
            path_or_fileobj=readme_path,
            path_in_repo="README.md",
            repo_id=new_repo_id,
        )
        print("Uploaded successfully!")

        return (
            f'Find your repo <a href=\'{new_repo_url}\' target="_blank" style="text-decoration:underline">here</a>',
            "llama.png",
        )
    except Exception as e:
        # Report failures through the UI outputs instead of raising.
        return (f"Error: {e}", "error.png")
    finally:
        # Always remove the local working directory, whether or not the run succeeded.
        shutil.rmtree(model_name, ignore_errors=True)
        print("Folder cleaned up successfully!")
# Create Gradio interface
# Input widgets, created in the positional order process_model receives them.
model_search = HuggingfaceHubSearch(
    label="Hub Model ID",
    placeholder="Search for model id on Huggingface",
    search_type="model",
)
quant_dropdown = gr.Dropdown(
    ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
    label="Quantization Method",
    info="GGML quantisation type",
    value="Q4_K_M",
    filterable=False,
)
private_checkbox = gr.Checkbox(
    value=False,
    label="Private Repo",
    info="Create a private repo under your username.",
)

# Output widgets: the status message and the result image returned by process_model.
result_markdown = gr.Markdown(label="output")
result_image = gr.Image(show_label=False)

iface = gr.Interface(
    fn=process_model,
    inputs=[model_search, quant_dropdown, private_checkbox],
    outputs=[result_markdown, result_image],
    title="Create your own GGUF Quants, blazingly fast ⚡!",
    description="The space takes an HF repo as an input, quantises it and creates a Public repo containing the selected quant under your HF user namespace.",
)

# Wrap the interface in a Blocks page that also shows the login prompt and button.
with gr.Blocks() as demo:
    gr.Markdown("You must be logged in to use GGUF-my-repo.")
    gr.LoginButton(min_width=250)
    iface.render()
def restart_space():
    """Ask the Hub to restart the "ggml-org/gguf-my-repo" Space using HF_TOKEN."""
    hub = HfApi()
    hub.restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN)
# Schedule restart_space to run every 86400 seconds (24 hours).
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, trigger="interval", seconds=86400)
scheduler.start()

# Launch the interface
queued_demo = demo.queue(default_concurrency_limit=1, max_size=5)
queued_demo.launch(debug=True)