File size: 4,504 Bytes
e959087
 
d9d6c5f
 
e959087
d9d6c5f
 
 
936709d
e3c5eb3
d9d6c5f
 
 
 
6698b86
 
d9d6c5f
 
500f2bb
d9d6c5f
 
 
 
bb665f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
936709d
500f2bb
d9d6c5f
500f2bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb665f7
 
 
 
500f2bb
bb665f7
 
 
 
 
 
 
 
 
 
 
d9d6c5f
936709d
500f2bb
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import gradio as gr

# def greet(name):
#     return "Hello " + name + "!!"

# iface = gr.Interface(fn=greet, inputs="text", outputs="text")
# iface.launch()

# import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, pipeline, set_seed

# Run on the first CUDA device when one is available; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Hub repository from which both the model and its tokenizer are loaded.
repo_id = "j2moreno/test-model"

# Load the causal LM, then move it onto the selected device.
model = AutoModelForCausalLM.from_pretrained(repo_id)
model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Text-generation pipeline shared by every chat request.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# Seed passed to `set_seed` before each generation.
SEED = 42

# Text shown in the chat UI.
title = "Who is Leonardo Moreno"
default_text = "Ask me about Leonardo Moreno"

# NOTE(review): the paragraph below looks like leftover template text from a
# Llama-2-70b-chat-hf demo Space and presumably does not describe this app --
# confirm before reusing it as the description:
#   This Space demonstrates model
#   [Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf)
#   by Meta, a Llama 2 model with 70B parameters fine-tuned for chat
#   instructions. This space is running on Inference Endpoints using
#   text-generation-inference library. If you want to run your own service,
#   you can also [deploy the model on Inference Endpoints](https://ui.endpoints.huggingface.co/).
description = "\nThis Space is... [placeholder]\n"

# Custom CSS: hides elements carrying the `toast-wrap` class.
css = ".toast-wrap { display: none !important } "

# Example prompts; each one is wrapped in its own single-element list below.
_EXAMPLE_QUESTIONS = (
    "Who is Leonardo Moreno?",
    "Describe Leonardo Moreno's professional background.",
    "What projects has Leonardo Moreno worked on?",
    "What are Leonardo Moreno's core technical skills?",
    "How has Leonardo Moreno integrated AI in his work?",
)
examples = [[question] for question in _EXAMPLE_QUESTIONS]

# def vote(data: gr.LikeData):
#     if data.liked:
#         print("You upvoted this response: " + data.value)
#     else:
#         print("You downvoted this response: " + data.value)
        
# @spaces.GPU
def generate_response(message: str, history) -> str:
    """Generate the assistant's reply to a single chat message.

    The reply is sampled from the module-level ``pipe`` using a fixed system
    prompt followed by ``message``.

    Args:
        message: The latest user message.
        history: Earlier conversation turns as passed by ``gr.ChatInterface``.
            Currently unused: every reply is generated from the system prompt
            and ``message`` alone.

    Returns:
        Only the newly generated text. The rendered prompt (system prompt and
        user message) is not echoed back.
    """
    # Re-seed on every call so sampling starts from the same RNG state.
    set_seed(SEED)

    # Sampling configuration.
    temperature = 0.4
    top_p = 0.95
    top_k = 50
    max_new_tokens = 256

    message_template = [
        {
            "role": "system",
            "content": "You are a highly knowledgeable and friendly chatbot equipped with extensive information across various domains. Your goal is to understand and respond to user inquiries with accuracy and clarity. You're adept at providing detailed explanations, concise summaries, and insightful responses. Your interactions are always respectful, helpful, and focused on delivering the most relevant information to the user.",
        },
        {"role": "user", "content": message},
    ]

    # tokenize=False yields the formatted prompt as a string, which is the
    # input form handed to the pipeline call below.
    prompt = pipe.tokenizer.apply_chat_template(
        message_template, tokenize=False, add_generation_prompt=True
    )

    # return_full_text=False: the pipeline otherwise prepends the whole prompt
    # to "generated_text", which would show the system prompt in the chat.
    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=1.10,
        return_full_text=False,
    )

    response = outputs[0]["generated_text"]
    print(response)
    return response

# chatbot_stream = gr.Chatbot()
# Chat UI backed by `generate_response`. A custom chatbot component, example
# caching and additional inputs are intentionally left disabled:
#   chatbot=chatbot_stream, cache_examples=True, additional_inputs=additional_inputs
chat_interface_stream = gr.ChatInterface(
    fn=generate_response,
    textbox=gr.Textbox(),
    title=title,
    description=description,
    examples=examples,
    css=css,
)

# Gradio Demo 
demo = gr.Blocks()
with demo:
    # Mount the pre-built chat interface inside the outer Blocks container.
    # (Like-button hook is disabled: chatbot_stream.like(vote, None, None))
    chat_interface_stream.render()

if __name__ == "__main__":
    # Enable the request queue, then start the app with share=True.
    queued_demo = demo.queue()
    queued_demo.launch(share=True)


# messages = [
#     {
#         "role": "system",
#         "content": "You are a friendly chatbot who always responds in the style of a thug",
#     },
#     {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
# ]
# model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to("cuda")
# input_length = model_inputs.shape[1]
# generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=20)
# print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])