import gradio as gr
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name_or_path = "thesven/Llama3-8B-SFT-code_bagel-bnb-4bit"
# On ZeroGPU Spaces, any function that touches the GPU must be decorated
# with spaces.GPU; without it, CUDA calls inside the handler fail at runtime.
@spaces.GPU
def start(n):
    # BitsAndBytesConfig for loading the model in 4-bit precision
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
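    # nf4 ("normal float 4") packs the weights into a 4-bit type tuned for
    # normally distributed values; matmuls still run in bfloat16, so memory
    # drops to roughly a quarter of fp16 with modest quality loss.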
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        device_map="auto",
        trust_remote_code=True,
        quantization_config=bnb_config,
    )
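    # device_map="auto" lets Accelerate place the quantized weights on the
    # available GPU automatically, so the model never needs a manual .to("cuda").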
    # The checkpoint defines no pad token, so reuse the EOS token for padding.
    # (Assigning model.pad_token, as before, only sets an unused attribute.)
    model.config.pad_token_id = model.config.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token
    # Example response generation; the numeric input n sets the token budget
    # (the original hardcoded 50 and never used n).
    input_text = "Hello, how are you?"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
    output = model.generate(inputs=input_ids, max_new_tokens=int(n))
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text
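
# Wire the function into a minimal Gradio UI: a number box in (token budget),
# a text box out (the generated completion).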
demo = gr.Interface(fn=start, inputs=gr.Number(value=50, label="Max new tokens"), outputs=gr.Text())
demo.launch()
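
# A possible refinement (a sketch, not part of the original Space): load the
# tokenizer and model once at module level so the 8B checkpoint is not
# re-fetched and re-quantized on every click, keeping only tokenization and
# generate() inside the @spaces.GPU-decorated handler:
#
#   tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
#   model = AutoModelForCausalLM.from_pretrained(
#       model_name_or_path, device_map="auto", quantization_config=bnb_config
#   )
#
#   @spaces.GPU
#   def start(n):
#       ...  # tokenize the prompt, call model.generate, decode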