import gradio as gr
import spaces

# A hosted endpoint could be used instead of loading the model locally, e.g. via
# huggingface_hub's InferenceClient("HuggingFaceH4/zephyr-7b-beta"); see
# https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
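# Ukrainian instruction-tuned LoRA adapter, applied below on top of the Mistral-7B base model.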
lora_name = "robinhad/UAlpaca-2.0-Mistral-7B"

from peft import PeftModel, PeftConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from torch import bfloat16
model_name = "mistralai/Mistral-7B-v0.1"

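# 4-bit NF4 quantization with double quantization and bfloat16 compute keeps the
# 7B base model small enough for a single-GPU demo.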
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)
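# Load the tokenizer from the LoRA repo so it carries the adapter's chat template
# and special tokens (e.g. <|im_end|>) rather than the base-model defaults.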
tokenizer = AutoTokenizer.from_pretrained(lora_name, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config
)

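# Attach the LoRA adapter to the quantized base model, then move it to the GPU.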
model = PeftModel.from_pretrained(model, lora_name, torch_device="cpu")

model = model.to("cuda")

from transformers import StoppingCriteriaList, StopStringCriteria, TextIteratorStreamer
from threading import Thread

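# Stop generation once the model emits the ChatML end-of-turn marker.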
stop_criteria = StoppingCriteriaList([StopStringCriteria(tokenizer, stop_strings=["<|im_end|>"])])

# `respond` formats the conversation with the tokenizer's chat template;
# `@spaces.GPU` requests a GPU for the duration of each call (ZeroGPU Spaces).
@spaces.GPU
def respond(
    message,
    history: list[tuple[str, str]],
    max_tokens,
    temperature,
    top_p,
):
    # messages = [{"role": "system", "content": system_message}]
    messages = []

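    # Rebuild earlier turns in the role/content format expected by the chat template.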
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    # Format the conversation with the chat template and move the token ids to the GPU.
    tokenized = tokenizer.apply_chat_template(
        messages, return_tensors="pt", add_generation_prompt=True
    ).to("cuda")
    # Log the fully formatted prompt for debugging.
    print(tokenizer.batch_decode(tokenized)[0])

    # Stream tokens as they are generated; sampling is enabled so the temperature
    # and top-p sliders actually take effect.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
    generation_kwargs = dict(
        inputs=tokenized, streamer=streamer, max_new_tokens=max_tokens,
        do_sample=True, temperature=temperature, top_p=top_p, stopping_criteria=stop_criteria,
    )

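    # `generate` blocks, so it runs on a background thread while this function
    # consumes the streamer and yields partial text to the UI.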
    thread = Thread(target=model.generate, kwargs=generation_kwargs)

    thread.start()

    generated_text = ""

    for new_text in streamer:
        generated_text += new_text
        # Hide the end-of-turn marker from the text shown to the user.
        generated_text = generated_text.replace("<|im_end|>", "")
        yield generated_text



"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        #gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    description="""### Attribution: ELEKS supported this project through a grant dedicated to the memory of Oleksiy Skrypnyk""",
    title=f"Inference demo for '{lora_name}' (alpha) model, instruction-tuned for Ukrainian",
    examples=[
        ["Напиши історію про Івасика-Телесика"],  # "Write a story about Ivasyk-Telesyk"
        ["Яка найвища гора в Україні?"],  # "What is the highest mountain in Ukraine?"
        ["Як звали батька Тараса Григоровича Шевченка?"],  # "What was the name of Taras Shevchenko's father?"
        # ["Як можна заробити нелегально швидко гроші?"],  # "How can one earn money illegally and quickly?" (disabled)
        ["Яка з цих гір не знаходиться у Європі? Говерла, Монблан, Гран-Парадізо, Еверест"],  # "Which of these mountains is not in Europe? Hoverla, Mont Blanc, Gran Paradiso, Everest"
        ["Дай відповідь на питання\nЧому у качки жовті ноги?"],  # "Answer the question:\nWhy do ducks have yellow legs?"
    ],
)


if __name__ == "__main__":
    demo.launch()