import os
import time
import spaces
from threading import Thread
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import gradio as gr
# Hugging Face Hub repository id; used below to load both tokenizer and model.
MODEL = "weblab-GENIAC/Tanuki-8B-dpo-v1.0"
# Optional Hub access token read from the environment (None when unset).
# NOTE(review): nothing in the visible code passes this value on explicitly.
HF_TOKEN = os.environ.get("HF_TOKEN", None)

# Title text rendered through gr.HTML. It spans several lines, so it has to be
# a triple-quoted literal: a single-quoted string cannot contain raw newlines
# and would make the whole module fail to parse.
TITLE = """
Tanuki-8B-dpo-v1.0
"""

# Text rendered through gr.Markdown below the title (currently just a newline).
DESCRIPTION = """
"""

# Content shown by the Chatbot component while the conversation is empty.
PLACEHOLDER = """
Tanuki-8B
"""
# Custom stylesheet handed to gr.Blocks(css=...). It styles the
# "duplicate-button" class used by the DuplicateButton, centers h3 headings,
# and defines the "model-description" and "image-placeholder" classes.
CSS = """
.duplicate-button {
margin: auto !important;
color: white !important;
background: black !important;
border-radius: 100vh !important;
}
h3 {
text-align: center;
}
.model-description {
padding: 0.5em 1em;
margin: 2em 0;
border-top: solid 5px #5d627b;
box-shadow: 0 1px 1px rgba(0, 0, 0, 0.22);
border-radius: 5px;
}
.model-description p {
margin: 0;
padding: 0;
color: #5d627b;
}
.image-placeholder {
text-align: center;
display: flex;
flex-direction: column;
align-items: center;
}
.image-placeholder img {
width: 100%;
height: auto;
opacity: 0.55;
}
.image-placeholder h1 {
font-size: 28px;
margin-bottom: 2px;
opacity: 0.55;
}
"""
# Extra markup for the page <head>, handed to gr.Blocks(head=...).
# NOTE(review): this is currently just a newline; the gtag() calls in
# ANALYTICS_JS presumably rely on an analytics loader <script> tag being
# present here -- confirm whether that markup was lost.
ANALYTICS_HEAD = """
"""
# JavaScript function registered with demo.load(js=...): it initialises
# window.dataLayer and pushes the 'js' and 'config' gtag events.
ANALYTICS_JS = """
function() {
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-JLBL393020');
}
"""
# Load the tokenizer and the causal-LM weights once, at import time, so every
# chat request reuses the same objects.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# bfloat16 weights; device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(MODEL, device_map="auto", torch_dtype=torch.bfloat16)

# Dump the module structure to the log for debugging.
print(model)
def _build_conversation(message, history, system_prompt):
    """Return the chat-template message list for one request.

    The list starts with the system prompt, continues with alternating
    user/assistant turns taken from ``history`` (an iterable of
    ``(prompt, answer)`` pairs, or ``None``), and ends with the new user
    ``message``. A ``None`` prompt or answer is replaced by a single space.
    """
    conversation = [{"role": "system", "content": system_prompt}]
    for prompt, answer in history or []:
        conversation.append({"role": "user", "content": " " if prompt is None else prompt})
        conversation.append({"role": "assistant", "content": " " if answer is None else answer})
    conversation.append({"role": "user", "content": message})
    return conversation


@spaces.GPU()
def stream_chat(
    message: str,
    history: list,
    system_prompt: str,
    temperature: float = 0.3,
    max_new_tokens: int = 1024,
    top_p: float = 1.0,
    top_k: int = 20,
):
    """Generate a reply to ``message`` and stream it back incrementally.

    Args:
        message: The new user message.
        history: Earlier turns as ``(prompt, answer)`` pairs; ``None`` is
            treated as an empty history.
        system_prompt: Content of the leading system message.
        temperature: Sampling temperature. A value of 0 (or below) selects
            greedy decoding instead of sampling.
        max_new_tokens: Upper bound on the number of generated tokens.
        top_p: Nucleus-sampling threshold (only used when sampling).
        top_k: Top-k sampling cutoff (only used when sampling).

    Yields:
        The reply text accumulated so far, once per chunk produced by the
        streamer, so each yielded value extends the previous one.
    """
    print(f'message: {message}')
    print(f'history: {history}')

    conversation = _build_conversation(message, history, system_prompt)
    input_ids = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    streamer = TextIteratorStreamer(
        tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
    )

    do_sample = temperature > 0
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        streamer=streamer,
    )
    if do_sample:
        # Sampling knobs are only meaningful (and only valid) when sampling;
        # passing temperature=0 alongside do_sample=False triggers
        # generation-config warnings.
        generate_kwargs.update(temperature=temperature, top_p=top_p, top_k=top_k)

    # generate() runs in a worker thread so this generator can consume the
    # streamer concurrently. No torch.no_grad() wrapper here: grad mode is
    # thread-local, so a context entered in this thread would not reach the
    # worker, and generate() already disables gradient tracking itself.
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer
# Chat display component with placeholder content for the empty state.
# NOTE(review): nothing in the visible code renders this component or connects
# stream_chat to the UI -- confirm whether a chat interface definition is
# missing from this file.
chatbot = gr.Chatbot(placeholder=PLACEHOLDER, height=600)

# Top-level page: analytics head markup, custom CSS and the "soft" theme.
demo = gr.Blocks(head=ANALYTICS_HEAD, css=CSS, theme="soft")
with demo:
    # Run the analytics bootstrap script in the browser on page load.
    demo.load(None, js=ANALYTICS_JS)
    gr.HTML(TITLE)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_classes="duplicate-button",
    )
    gr.Markdown(DESCRIPTION)

# Start the web server only when executed as a script.
if __name__ == "__main__":
    demo.launch()