import gradio as gr
# def greet(name):
# return "Hello " + name + "!!"
# iface = gr.Interface(fn=greet, inputs="text", outputs="text")
# iface.launch()
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
device = "cuda:0" if torch.cuda.is_available() else "cpu"
repo_id = "j2moreno/test-model"
model = AutoModelForCausalLM.from_pretrained(repo_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
# Some causal LM tokenizers ship without a pad token; fall back to the EOS token so
# that tokenizer(..., padding=True) below does not raise.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
SEED = 42
default_text = "Ask me about Leonardo Moreno"
title = "Who is Leonardo Moreno"
### This Space demonstrates the model [Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) by Meta, a Llama 2 model with 70B parameters fine-tuned for chat instructions. This Space is running on Inference Endpoints using the text-generation-inference library. If you want to run your own service, you can also [deploy the model on Inference Endpoints](https://ui.endpoints.huggingface.co/).
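# The note above mentions deploying the model on Inference Endpoints; this app instead
# loads the model locally (see below), so the following is only a hypothetical sketch of
# querying such an endpoint with huggingface_hub (the endpoint URL is a placeholder).
# from huggingface_hub import InferenceClient
# client = InferenceClient("https://YOUR-ENDPOINT.endpoints.huggingface.cloud")
# print(client.text_generation("Who is Leonardo Moreno?", max_new_tokens=256))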
description = """
This Space is... [placeholder]
"""
css = """.toast-wrap { display: none !important } """
examples = [
    ["Who is Leonardo Moreno?"],
    ["Describe Leonardo Moreno's professional background."],
    ["What projects has Leonardo Moreno worked on?"],
    ["What are Leonardo Moreno's core technical skills?"],
    ["How has Leonardo Moreno integrated AI in his work?"],
]
# def vote(data: gr.LikeData):
# if data.liked:
# print("You upvoted this response: " + data.value)
# else:
# print("You downvoted this response: " + data.value)
@spaces.GPU
def generate_response(message, chatbot, system_prompt=""):
    set_seed(SEED)

    # Build a Llama-2-style chat prompt: optional system block, then alternating
    # user/assistant turns from the history, then the new user message.
    if system_prompt != "":
        input_prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n "
    else:
        input_prompt = "<s>[INST] "
    for interaction in chatbot:
        input_prompt = input_prompt + str(interaction[0]) + " [/INST] " + str(interaction[1]) + " </s><s>[INST] "
    input_prompt = input_prompt + str(message) + " [/INST] "
    print(input_prompt)

    # Tokenize on the same device as the model; the prompt is truncated to 128 tokens.
    tokenized_prompt = tokenizer(input_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    # print(tokenized_prompt)

    output_sequences = model.generate(**tokenized_prompt, max_length=1024, num_return_sequences=1)
    decoded_output = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
    # print(decoded_output)

    return decoded_output
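# Rough usage sketch (illustrative only, left commented out so the Space serves
# requests solely through the Gradio interface below): with an empty history and no
# system prompt, the function assembles "<s>[INST] Who is Leonardo Moreno? [/INST] "
# and returns the decoded generation.
# if __name__ == "__main__":
#     print(generate_response("Who is Leonardo Moreno?", []))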
chatbot_stream = gr.Chatbot()
chat_interface_stream = gr.ChatInterface(
    generate_response,
    title=title,
    description=description,
    textbox=gr.Textbox(),
    chatbot=chatbot_stream,
    css=css,
    examples=examples,
    # cache_examples=True,
    # additional_inputs=additional_inputs,
)
# Gradio Demo
with gr.Blocks() as demo:
    # streaming chatbot
    # chatbot_stream.like(vote, None, None)
    chat_interface_stream.render()

demo.queue(max_size=100).launch()