import os
from typing import Tuple, List
import gradio as gr
import spaces
from dataclasses import dataclass
from huggingface_hub import HfApi, CommitOperationAdd
from transformers import AutoProcessor
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot, wrap_hf_model_class
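# Minimal container for the URL of the repo created for the quantized model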
@dataclass
class CommitInfo:
repo_url: str
def parse_ignore_list(ignore_str: str) -> List[str]:
"""Parse comma-separated ignore list string into list"""
return [item.strip() for item in ignore_str.split(',') if item.strip()]
def create_quantized_model(
model_id: str,
work_dir: str,
ignore_list: List[str],
model_class_name: str
) -> Tuple[str, List[Tuple[str, Exception]]]:
"""Quantize model to FP8 and save to disk"""
errors = []
try:
        # Resolve the requested model class by name and wrap it for llmcompressor
        import transformers
        model_class = getattr(transformers, model_class_name)
        wrapped_model_class = wrap_hf_model_class(model_class)
        # Load the model (the GPU is allocated via the @spaces.GPU decorator on run)
model = wrapped_model_class.from_pretrained(
model_id,
device_map="auto",
torch_dtype="auto",
trust_remote_code=True,
_attn_implementation="eager"
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Configure quantization
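        # FP8_DYNAMIC: static per-channel FP8 weights plus dynamic per-token FP8
        # activations, so no calibration dataset is required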
recipe = QuantizationModifier(
targets="Linear",
scheme="FP8_DYNAMIC",
ignore=ignore_list,
)
# Apply quantization
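        # oneshot applies the recipe in a single post-training pass and writes the
        # compressed checkpoint to output_dir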
save_dir = os.path.join(work_dir, f"{model_id.split('/')[-1]}-FP8-dynamic")
oneshot(model=model, recipe=recipe, output_dir=save_dir)
processor.save_pretrained(save_dir)
return save_dir, errors
except Exception as e:
errors.append((model_id, e))
raise e
def push_to_hub(
api: HfApi,
model_id: str,
quantized_path: str,
token: str,
ignore_list: List[str],
model_class_name: str,
) -> CommitInfo:
"""Create new repository with quantized model"""
# Create new model repo name
original_owner = model_id.split('/')[0]
new_model_name = f"{model_id.split('/')[-1]}-fp8"
# Get the token owner's username
token_owner = api.whoami(token)["name"]
# Create the new repo under the token owner's account
target_repo = f"{token_owner}/{new_model_name}"
# Create model card content
model_card = f"""---
language:
- en
license: apache-2.0
tags:
- fp8
- quantized
- llmcompressor
base_model: {model_id}
quantization_config:
ignored_layers: {ignore_list}
model_class: {model_class_name}
---
# {new_model_name}
This is an FP8-quantized version of [{model_id}](https://huggingface.co/{model_id}) using [LLM Compressor](https://github.com/vllm-project/llm-compressor).
## Quantization Details
- Weights: FP8 with static per-channel quantization (PTQ)
- Activations: FP8 with dynamic per-token quantization
- Linear layers targeted for quantization
- Ignored layers: {ignore_list}
- Model class: {model_class_name}
## Usage
```python
from transformers import {model_class_name}, AutoProcessor
model = {model_class_name}.from_pretrained("{target_repo}")
processor = AutoProcessor.from_pretrained("{target_repo}")
```
"""
# Create new repository
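    # exist_ok=True makes re-runs idempotent if the target repo already exists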
api.create_repo(
repo_id=target_repo,
private=False,
exist_ok=True,
)
# Prepare operations for upload
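    # The README is uploaded from in-memory bytes; model files are added by local path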
operations = [
        CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=model_card.encode()),
]
# Add all files from quantized model
for root, _, files in os.walk(quantized_path):
for file in files:
file_path = os.path.join(root, file)
relative_path = os.path.relpath(file_path, quantized_path)
operations.append(
CommitOperationAdd(
path_in_repo=relative_path,
                    path_or_fileobj=file_path
)
)
# Upload files
api.create_commit(
repo_id=target_repo,
operations=operations,
commit_message=f"Add FP8 quantized version of {model_id}",
)
return CommitInfo(repo_url=f"https://huggingface.co/{target_repo}")
@spaces.GPU(duration=900) # 15 minutes timeout for large models
def run(
model_id: str,
token: str,
ignore_str: str,
model_class_name: str
) -> str:
"""Main function to handle quantization and model upload"""
    if not token or not model_id:
return """
### Invalid input 🐞
Please provide both a token and model_id.
"""
try:
# Parse ignore list
ignore_list = parse_ignore_list(ignore_str)
# Set up API with user's token
api = HfApi(token=token)
print("Processing model:", model_id)
print("Ignore list:", ignore_list)
print("Model class:", model_class_name)
# Create working directory
work_dir = "quantized_models"
os.makedirs(work_dir, exist_ok=True)
# Quantize model
quantized_path, errors = create_quantized_model(
model_id,
work_dir,
ignore_list,
model_class_name
)
# Upload quantized model to new repository
commit_info = push_to_hub(
api,
model_id,
quantized_path,
token,
ignore_list,
model_class_name
)
response = f"""
### Success πŸ”₯
Your model has been successfully quantized to FP8 and uploaded to a new repository:
[{commit_info.repo_url}]({commit_info.repo_url})
Configuration:
- Ignored layers: {ignore_list}
- Model class: {model_class_name}
You can use this model directly with the transformers library!
"""
if errors:
response += "\nWarnings during quantization:\n"
response += "\n".join(f"Warning for {filename}: {e}" for filename, e in errors)
return response
except Exception as e:
return f"""
### Error 😒
An error occurred during processing:
{str(e)}
"""
# Gradio Interface
DESCRIPTION = """
# Convert any model to FP8 using LLM Compressor
This space will quantize your model to FP8 format using LLM Compressor and create a new model repository under your account.
The steps are:
1. Paste your HuggingFace token (from hf.co/settings/tokens) - needs write access
2. Enter the model ID you want to quantize
3. (Optional) Customize ignored layers and model class
4. Click "Submit"
5. You'll get a link to your new quantized model repository on your profile! πŸš€
## Advanced Options:
- **Ignore List**: Comma-separated list of layer patterns to ignore during quantization. Examples:
- Llama: `lm_head`
- Phi3v: `re:.*lm_head,re:model.vision_embed_tokens.*`
- Llama Vision: `re:.*lm_head,re:multi_modal_projector.*,re:vision_model.*`
- **Model Class**: Specific model class from transformers (default: AutoModelForCausalLM). Examples:
- `AutoModelForCausalLM`
- `MllamaForConditionalGeneration`
- `LlavaForConditionalGeneration`
Note:
- Processing may take several minutes depending on the model size
- The quantized model will be created as a new public repository under your account
- Your token needs write access to create the new repository
"""
title = "FP8 Quantization with LLM Compressor"
with gr.Blocks(title=title) as demo:
gr.Markdown(DESCRIPTION)
with gr.Row():
with gr.Column():
model_id = gr.Text(
max_lines=1,
label="model_id",
placeholder="huggingface/model-name"
)
token = gr.Text(
max_lines=1,
label="your_hf_token (requires write access)",
placeholder="hf_..."
)
ignore_str = gr.Text(
max_lines=1,
label="ignore_list (comma-separated)",
placeholder="re:.*lm_head,re:vision_model.*",
value="re:.*lm_head"
)
model_class_name = gr.Text(
max_lines=1,
label="model_class_name (optional)",
placeholder="AutoModelForCausalLM",
value="AutoModelForCausalLM"
)
with gr.Row():
clean = gr.ClearButton()
submit = gr.Button("Submit", variant="primary")
with gr.Column():
output = gr.Markdown()
submit.click(
run,
inputs=[model_id, token, ignore_str, model_class_name],
outputs=output,
concurrency_limit=1
)
demo.queue(max_size=10).launch(show_api=True)