import gradio as gr
# def greet(name):
#     return "Hello " + name + "!!"
# iface = gr.Interface(fn=greet, inputs="text", outputs="text")
# iface.launch()

import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed

device = "cuda:0" if torch.cuda.is_available() else "cpu"

repo_id = "j2moreno/test-model"
model = AutoModelForCausalLM.from_pretrained(repo_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

SEED = 42

default_text = "Ask me about Leonardo Moreno"
title = "Who is Leonardo Moreno"

### This Space demonstrates the model [Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf)
### by Meta, a Llama 2 model with 70B parameters fine-tuned for chat instructions. This Space runs on Inference
### Endpoints using the text-generation-inference library. If you want to run your own service, you can also
### [deploy the model on Inference Endpoints](https://ui.endpoints.huggingface.co/).
description = """
This Space is... [placeholder]
"""
css = """.toast-wrap { display: none !important } """

examples = [
    ['Who is Leonardo Moreno?'],
    ["Describe Leonardo Moreno's professional background."],
    ['What projects has Leonardo Moreno worked on?'],
    ["What are Leonardo Moreno's core technical skills?"],
    ['How has Leonardo Moreno integrated AI in his work?'],
]

# def vote(data: gr.LikeData):
#     if data.liked:
#         print("You upvoted this response: " + data.value)
#     else:
#         print("You downvoted this response: " + data.value)


@spaces.GPU
def generate_response(message, chatbot, system_prompt=""):
    """Build a Llama-2-style [INST] prompt from the chat history and generate a reply."""
    set_seed(SEED)

    if system_prompt != "":
        input_prompt = f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n "
    else:
        input_prompt = "[INST] "

    # Replay the conversation history in the [INST] ... [/INST] format.
    for interaction in chatbot:
        input_prompt = input_prompt + str(interaction[0]) + " [/INST] " + str(interaction[1]) + " [INST] "

    input_prompt = input_prompt + str(message) + " [/INST] "
    print(input_prompt)

    # Tokenize the prompt and move the tensors to the same device as the model.
    tokenized_prompt = tokenizer(
        input_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128
    ).to(device)
    # print(tokenized_prompt)

    output_sequences = model.generate(**tokenized_prompt, max_length=1024, num_return_sequences=1)
    decoded_output = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
    # print(decoded_output)

    return decoded_output


chatbot_stream = gr.Chatbot()
chat_interface_stream = gr.ChatInterface(
    generate_response,
    title=title,
    description=description,
    textbox=gr.Textbox(),
    chatbot=chatbot_stream,
    css=css,
    examples=examples,
    # cache_examples=True,
    # additional_inputs=additional_inputs,
)

# Gradio Demo
with gr.Blocks() as demo:
    # streaming chatbot
    # chatbot_stream.like(vote, None, None)
    chat_interface_stream.render()

demo.queue(max_size=100).launch()