import os
from threading import Thread
from typing import Iterator
import gradio as gr
from gradio.themes.base import Base
from gradio.themes.utils import colors, sizes, fonts
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
# Markdown blurb describing the demo. The heading, the first sentence and the
# model link all name the same checkpoint size (1B).
DESCRIPTION = """\
# Llama 3.2 1B Instruct
Llama 3.2 1B is Meta's latest iteration of open LLMs.
This is a demo of [`meta-llama/Llama-3.2-1B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct), fine-tuned for instruction following.
For more details, please check [our post](https://huggingface.co/blog/llama32).
"""
# Generation limits: upper bound and default for the number of new tokens, and
# the prompt-length cap, which can be overridden through the environment.
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

# First CUDA device when one is available, CPU otherwise.
# NOTE(review): `device` is not referenced anywhere else in the visible file;
# model placement is driven by device_map="auto" below.
device = (
    torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
)

# NOTE(review): the repo id ends in "-GGUF" but no `gguf_file=` argument is
# passed to from_pretrained -- confirm the repo ships weights this call can load.
model_id = "ussipan/SipanGPT-0.3-Llama-3.2-1B-GGUF"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# Put the module in evaluation mode; this app only runs inference.
model.eval()
# Main Gradio inference function
def generate(
    message: str,
    chat_history: list[dict],
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[tuple[str, list[dict]]]:
    """Stream the model's reply to ``message`` given the prior conversation.

    Args:
        message: The new user message.
        chat_history: Previous turns as dicts (each is iterated with
            ``.items()``); a ``'metadata'`` key, if present, is dropped before
            the chat template is applied.
        max_new_tokens: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.
        top_k: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.

    Yields:
        ``("", conversation)`` after every streamed chunk. The first element is
        always the empty string; ``conversation`` is the cleaned history plus
        the user message and the assistant message accumulated so far.
    """
    # Copy every turn without its 'metadata' key so the caller's dicts are
    # never mutated.
    conversation = [
        {key: value for key, value in turn.items() if key != "metadata"}
        for turn in chat_history
    ]
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, return_tensors="pt"
    )
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        # Keep only the trailing MAX_INPUT_TOKEN_LENGTH tokens and tell the user.
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Se recortó la entrada de la conversación porque era más larga que {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        {"input_ids": input_ids},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
    )
    # Run generation on a separate thread so this generator can consume the
    # streamer while tokens are still being produced.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    # Placeholder assistant turn whose content grows with every chunk.
    conversation.append({"role": "assistant", "content": ""})
    outputs = []
    for text in streamer:
        outputs.append(text)
        conversation[-1]["content"] = "".join(outputs)
        yield "", conversation
# Implementing Gradio 5 features and building a ChatInterface UI yourself
# Text passed as the chatbot's `placeholder`; currently just a single newline.
PLACEHOLDER = "\n"
def handle_retry(history, retry_data: gr.RetryData):
    """Regenerate the reply for the message at ``retry_data.index``.

    Everything before that index is kept as history, and the ``'content'`` of
    the entry at that index is sent to ``generate`` again. The sampling
    settings are literals in this function.
    """
    cut = retry_data.index
    earlier_turns = history[:cut]
    prompt_to_repeat = history[cut]["content"]
    sampling = {
        "max_new_tokens": 1024,
        "temperature": 0.6,
        "top_p": 0.9,
        "top_k": 50,
        "repetition_penalty": 1.2,
    }
    yield from generate(prompt_to_repeat, chat_history=earlier_turns, **sampling)
def handle_like(data: gr.LikeData):
    """Print whether a response was up- or down-voted, followed by its value."""
    prefix = (
        "Votaste positivamente esta respuesta: "
        if data.liked
        else "Votaste negativamente esta respuesta: "
    )
    print(prefix, data.value)
def handle_undo(history, undo_data: gr.UndoData):
    """Roll the chat back to just before ``undo_data.index``.

    Returns:
        A pair ``(truncated_history, prompt)``: the history up to (not
        including) the undone entry, and that entry's ``'content'``.
    """
    position = undo_data.index
    return history[:position], history[position]["content"]
def chat_examples_fill(data: gr.SelectData):
    """Start a fresh conversation from the selected suggestion.

    The suggestion's ``'text'`` is sent to ``generate`` with an empty history.
    The sampling settings are literals in this function.
    """
    suggestion_text = data.value["text"]
    sampling = {
        "max_new_tokens": 1024,
        "temperature": 0.6,
        "top_p": 0.9,
        "top_k": 50,
        "repetition_penalty": 1.2,
    }
    yield from generate(suggestion_text, chat_history=[], **sampling)
class SipanGPTTheme(Base):
    """Gradio theme for the SipanGPT demo built on two custom green palettes.

    All constructor arguments are keyword-only and are forwarded unchanged to
    ``Base.__init__``; afterwards ``self.set`` overrides a selection of light-
    and dark-mode variables using ``*<palette>_<step>`` references.
    """

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.Color(
            name="custom_green",
            c50="#f0fde4",
            c100="#e1fbc8",
            c200="#c3f789",
            c300="#a5f34a",
            c400="#7dfa00",  # primary color
            c500="#5ef000",
            c600="#4cc700",
            c700="#39a000",
            c800="#2b7900",
            c900="#1d5200",
            c950="#102e00",
        ),
        secondary_hue: colors.Color | str = colors.Color(
            name="custom_secondary_green",
            c50="#edfce0",
            c100="#dbf9c1",
            c200="#b7f583",
            c300="#93f145",
            c400="#5fed00",  # secondary color
            c500="#4ed400",
            c600="#3fad00",
            c700="#308700",
            c800="#236100",
            c900="#153b00",
            c950="#0a1f00",
        ),
        neutral_hue: colors.Color | str = colors.gray,
        spacing_size: sizes.Size | str = sizes.spacing_md,
        radius_size: sizes.Size | str = sizes.radius_md,
        text_size: sizes.Size | str = sizes.text_md,
        font: fonts.Font | str | list[fonts.Font | str] = [
            fonts.GoogleFont("Exo 2"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        ],
        font_mono: fonts.Font | str | list[fonts.Font | str] = [
            fonts.GoogleFont("Fraunces"),
            "ui-monospace",
            "monospace",
        ],
    ):
        super().__init__(
            primary_hue=primary_hue,
            secondary_hue=secondary_hue,
            neutral_hue=neutral_hue,
            spacing_size=spacing_size,
            radius_size=radius_size,
            text_size=text_size,
            font=font,
            font_mono=font_mono,
        )
        self.set(
            # Light mode settings
            body_background_fill="*neutral_50",
            body_text_color="*neutral_900",
            color_accent_soft="*secondary_200",
            button_primary_background_fill="*primary_600",
            button_primary_background_fill_hover="*primary_500",
            button_primary_text_color="*neutral_50",
            block_title_text_color="*primary_600",
            input_background_fill="*neutral_200",
            input_border_color="*neutral_300",
            input_placeholder_color="*neutral_500",
            block_background_fill="*neutral_100",
            block_label_background_fill="*primary_100",
            block_label_text_color="*neutral_800",
            checkbox_background_color="*neutral_200",
            checkbox_border_color="*primary_500",
            loader_color="*primary_500",
            slider_color="*primary_500",
            # Dark mode settings
            body_background_fill_dark="*neutral_900",
            body_text_color_dark="*neutral_50",
            color_accent_soft_dark="*secondary_800",
            button_primary_background_fill_dark="*primary_700",
            button_primary_background_fill_hover_dark="*primary_600",
            button_primary_text_color_dark="*neutral_950",
            block_title_text_color_dark="*primary_400",
            input_background_fill_dark="*neutral_800",
            input_border_color_dark="*neutral_700",
            input_placeholder_color_dark="*neutral_400",
            # Palettes only define steps 50, 100..900 and 950 (see the Color
            # definitions above), so a "*neutral_850" reference cannot resolve.
            # Use the same step as the dark body background instead.
            block_background_fill_dark="*neutral_900",
            block_label_background_fill_dark="*primary_900",
            block_label_text_color_dark="*neutral_200",
            checkbox_background_color_dark="*neutral_800",
            checkbox_border_color_dark="*primary_600",
            loader_color_dark="*primary_400",
            slider_color_dark="*primary_600",
        )
# Theme usage
theme = SipanGPTTheme()

# NOTE(review): the nesting below is reconstructed -- the chatbot sits inside the
# Column, the textbox and accordion directly under Blocks. Confirm against the
# intended layout.
with gr.Blocks(theme=theme, fill_height=True) as demo:
    with gr.Column(elem_id="container", scale=1):
        chatbot = gr.Chatbot(
            label="SipánGPT 0.3 Llama 3.2",
            show_label=False,
            type="messages",
            scale=1,
            suggestions=[
                {"text": "Háblame del reglamento de estudiantes de la universidad"},
                {"text": "Qué becas ofrece la universidad"},
                {"text": "Hablame sobre el temario del examen de admisión para pregrado"},
                {"text": "Cuando se fundó la universidad?"},
            ],
            placeholder=PLACEHOLDER,
        )

    msg = gr.Textbox(submit_btn=True, show_label=False)

    # Sampling controls, collapsed by default.
    with gr.Accordion('Additional inputs', open=False):
        max_new_tokens = gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        )
        temperature = gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.6,
        )
        top_p = gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=0.9,
        )
        top_k = gr.Slider(
            label="Top-k",
            minimum=1,
            maximum=1000,
            step=1,
            value=50,
        )
        repetition_penalty = gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.2,
        )

    # Event wiring: submitting the textbox streams a reply; the chatbot's
    # retry / like / undo / suggestion events go to their handlers.
    msg.submit(
        fn=generate,
        inputs=[msg, chatbot, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
        outputs=[msg, chatbot],
    )
    chatbot.retry(fn=handle_retry, inputs=chatbot, outputs=[msg, chatbot])
    chatbot.like(fn=handle_like, inputs=None, outputs=None)
    chatbot.undo(fn=handle_undo, inputs=chatbot, outputs=[chatbot, msg])
    chatbot.suggestion_select(fn=chat_examples_fill, inputs=None, outputs=[msg, chatbot])

demo.launch()