import gradio as gr
import os
import spaces
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Read the Hugging Face token from the environment (needed for the gated Llama 3 weights)
HF_TOKEN = os.environ.get("HF_TOKEN", None)

DESCRIPTION = '''

<div>
<h1>Meta Llama3 8B</h1>
<p>This Space demonstrates the instruction-tuned model Meta Llama3 8B Chat. Meta Llama3 is the new open LLM and comes in two sizes: 8B and 70B. Feel free to play with it, or duplicate the Space to run it privately!</p>
<p>🔎 For more details about the Llama3 release and how to use the model with transformers, take a look at our blog post.</p>
<p>🦕 Looking for an even more powerful model? Check out the Hugging Chat integration for Meta Llama 3 70B.</p>
</div>

'''

LICENSE = """

---
Built with Meta Llama 3
"""

PLACEHOLDER = """

<div>
<h1>Meta Llama3</h1>
<p>Ask me anything...</p>
</div>

""" css = """ h1 { text-align: center; display: block; } #duplicate-button { margin: auto; color: white; background: #1565c0; border-radius: 100vh; } """ DEFAULT_SYSTEM = '''You are a expert endocrinologist and you are here to assist users with diabetes management, weight loss, and nutritional guidance. Your primary goal is to provide accurate, helpful information while maintaining an encouraging and supportive tone.''' CSS = """ .duplicate-button { margin: auto !important; color: white !important; background: black !important; border-radius: 100vh !important; } """ # Load the tokenizer and model tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto") # to("cuda:0") terminators = [ tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>") ] @spaces.GPU def stream_chat(message: str, history: list, system: str, temperature: float, max_new_tokens: int): conversation = [{"role": "system", "content": system or DEFAULT_SYSTEM}] for prompt, answer in history: conversation.extend([{"role": "user", "content": prompt}, {"role": "assistant", "content": answer}]) conversation.append({"role": "user", "content": message}) input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to( model.device ) streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True) generate_kwargs = dict( input_ids=input_ids, streamer=streamer, max_new_tokens=max_new_tokens, temperature=temperature, do_sample=True, ) if temperature == 0: generate_kwargs["do_sample"] = False t = Thread(target=model.generate, kwargs=generate_kwargs) t.start() output = "" for new_token in streamer: output += new_token yield output chatbot = gr.Chatbot(height=450) with gr.Blocks(css=CSS) as demo: gr.HTML(TITLE) gr.HTML(DESCRIPTION) gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button") gr.ChatInterface( fn=stream_chat, chatbot=chatbot, fill_height=True, additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False), additional_inputs=[ gr.Text( value="", label="System", render=False, ), gr.Slider( minimum=0, maximum=1, step=0.1, value=0.8, label="Temperature", render=False, ), gr.Slider( minimum=128, maximum=4096, step=1, value=1024, label="Max new tokens", render=False, ), ], examples=[ ["How do I lose weight?"], ], cache_examples=False, ) if __name__ == "__main__": demo.launch()