import gradio as gr
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

MODEL_NAME = "thesven/Llama3-8B-SFT-code_bagel-bnb-4bit"


@spaces.GPU
def start(prompt):
    # BitsAndBytesConfig for loading the model in 4-bit (NF4) precision
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        trust_remote_code=True,
        quantization_config=bnb_config,
    )
    # Llama 3 ships without a pad token; reuse the EOS token id so
    # generate() does not warn about a missing pad_token_id
    model.config.pad_token_id = model.config.eos_token_id

    # Tokenize the prompt and move it onto the model's device
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    output = model.generate(input_ids, max_new_tokens=50)
    return tokenizer.decode(output[0], skip_special_tokens=True)


demo = gr.Interface(
    fn=start,
    inputs=gr.Textbox(value="Hello, how are you?", label="Prompt"),
    outputs=gr.Text(label="Generated text"),
)
demo.launch()
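
# Note (a sketch, not part of the original script): loading the tokenizer and
# model inside `start` repeats the full download and 4-bit quantization on
# every request. ZeroGPU Spaces generally load the model once at module scope
# and keep only the GPU-bound inference inside the @spaces.GPU function,
# roughly:
#
#     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
#     model = AutoModelForCausalLM.from_pretrained(
#         MODEL_NAME, device_map="auto", quantization_config=bnb_config
#     )
#
#     @spaces.GPU
#     def start(prompt):
#         ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
#         out = model.generate(ids, max_new_tokens=50)
#         return tokenizer.decode(out[0], skip_special_tokens=True)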