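"""Gradio chatbot that streams Llama-2 chat completions character by character.

The script builds a Llama-2 style [INST] / <<SYS>> prompt from the chat
history, generates a reply with transformers, and serves it through a
gr.ChatInterface with a like/dislike handler.
"""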
import time

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "NousResearch/Llama-2-7b-chat-hf"
# model_name = "facebook/opt-350m"  # smaller model, handy for quick local tests

# Load the tokenizer and model once at startup; prefer the GPU when available.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

def predict(message, chatbot, temperature=0.9, max_new_tokens=256, top_p=0.6, repetition_penalty=1.0):
    """Generate a reply to `message` and stream it back one character at a time."""
    system_message = (
        "\nYou are a helpful, respectful and honest Assistant. Always answer as "
        "helpfully as possible, while being safe. Your answers should not include "
        "any harmful, unethical, racist, sexist, toxic, dangerous, or illegal "
        "content. Your answers should be socially unbiased and positive in "
        "nature.\n\nIf a question does not make any sense, or is not factually "
        "coherent, explain why instead of answering something not correct. If you "
        "don't know the answer to a question, please don't share false information."
    )
    input_system = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n "

    # Fold the prior turns into the Llama-2 chat format. Start from the system
    # block so the first turn (with an empty history) still carries the system
    # prompt, and accumulate across turns instead of overwriting.
    input_history = input_system
    for user_turn, bot_turn in chatbot:
        input_history += str(user_turn) + " [/INST] " + str(bot_turn) + " </s><s> [INST] "

    input_prompt = input_history + str(message) + " [/INST] "
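    # With one prior exchange the assembled prompt looks roughly like:
    #   [INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n hi [/INST] hello! </s><s> [INST] {message} [/INST]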

    inputs = tokenizer.encode(input_prompt, return_tensors="pt").to(device)

    # Keep temperature strictly positive; sampling breaks down at exactly 0.
    temperature = max(float(temperature), 1e-2)
    top_p = float(top_p)

    generate_kwargs = dict(
        input_ids=inputs,
        do_sample=True,  # temperature/top_p have no effect under greedy decoding
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        repetition_penalty=repetition_penalty,
    )

    outputs = model.generate(**generate_kwargs)
    generated_full_text = tokenizer.decode(outputs[0])
    print("generated_full_text:", generated_full_text)

    # Keep only the newly generated answer: the text after the final [/INST],
    # trimmed at the end-of-sequence marker when one is present.
    generated_text = generated_full_text.split('[/INST]')[-1].strip()
    if '</s>' in generated_text:
        generated_text = generated_text.split('</s>')[0]

    # Break the answer into per-line "token" records for the streaming loop.
    token_list = [
        {"id": idx + 1, "text": line}
        for idx, line in enumerate(generated_text.split('\n'))
    ]

    # Stream the reply character by character so the UI shows a typing effect.
    partial_message = ""
    for i, token_entry in enumerate(token_list):
        token_text = token_entry.get('text')
        if token_text is None:
            gr.Warning(f"The key 'text' does not exist or is None in this token entry: {token_entry}")
            continue
        if i > 0:
            # Re-insert the newline that the split('\n') above removed.
            partial_message += '\n'
        for char in token_text:
            partial_message += char
            yield partial_message
            time.sleep(0.01)
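
# A minimal sketch of driving the generator by hand (hypothetical usage; the
# ChatInterface below calls predict the same way, with message and history):
#
#   for partial in predict("Hello there!", chatbot=[]):
#       print(partial)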

title = "TheBloke/Llama-2-7b-Chat-GPTQ model chatbot"

description = """
This is the TheBloke/Llama-2-7b-Chat-GPTQ model.
"""
css = """.toast-wrap { display: none !important } """
examples = [
    ['Hello there! How are you doing?'],
    ['Can you explain to me briefly what is Python programming language?'],
    ['Explain the plot of Cinderella in a sentence.'],
    ['How many hours does it take a man to eat a Helicopter?'],
    ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
]

def vote(data: gr.LikeData):
    if data.liked:
        print("You upvoted this response: " + data.value)
    else:
        print("You downvoted this response: " + data.value)


additional_inputs = [
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=256,
        minimum=0,
        maximum=4096,
        step=64,
        interactive=True,
        info="The maximum numbers of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.6,
        minimum=0.0,
        maximum=1,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    )
]

chatbot_stream = gr.Chatbot(avatar_images=('user.png', 'bot2.png'), bubble_full_width=False)
chat_interface_stream = gr.ChatInterface(
    predict,
    title=title,
    description=description,
    chatbot=chatbot_stream,
    css=css,
    examples=examples,
    cache_examples=False,
    additional_inputs=additional_inputs,
)

with gr.Blocks() as demo:

    with gr.Tab("Streaming"):
        chatbot_stream.like(vote, None, None)
        chat_interface_stream.render()

demo.queue(concurrency_count=75, max_size=100).launch(debug=True)