Spaces: Running on Zero
File size: 4,490 Bytes
import json
import os
import subprocess
from threading import Thread

import torch
import spaces
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer

# Install FlashAttention at startup. Merge the parent environment so pip stays on
# PATH, and skip the CUDA build since the prebuilt wheel is used on ZeroGPU Spaces.
subprocess.run('pip install flash-attn --no-build-isolation',
               env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
CHAT_TEMPLATE = "Auto"
MODEL_NAME = MODEL_ID.split("/")[-1]
CONTEXT_LENGTH = 16000

# Set the interface variables directly
COLOR = "black"  # Default interface color
EMOJI = "🤖"  # Default emoji for the model
DESCRIPTION = f"This is the 4-bit BnB-quantized {MODEL_NAME} model, designed for testing its reasoning on general AI tasks."  # Default description
# LaTeX delimiters recognized by the chat renderer
latex_delimiters_set = [
    {"left": "\\(", "right": "\\)", "display": False},
    {"left": "\\begin{equation}", "right": "\\end{equation}", "display": True},
    {"left": "\\begin{align}", "right": "\\end{align}", "display": True},
    {"left": "\\begin{alignat}", "right": "\\end{alignat}", "display": True},
    {"left": "\\begin{gather}", "right": "\\end{gather}", "display": True},
    {"left": "\\begin{CD}", "right": "\\end{CD}", "display": True},
    {"left": "\\[", "right": "\\]", "display": True},
]
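# These delimiters are passed to gr.Chatbot(latex_delimiters=...) below so the chat
# window renders both inline math like \(E = mc^2\) and display environments.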
@spaces.GPU()
def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    # Format history with DeepSeek-R1's plain-text turn tags. The original ChatML
    # template is kept below for reference:
    # stop_tokens = ["<|endoftext|>", "<|im_end|>"]
    # instruction = '<|im_start|>system\n' + system_prompt + '\n<|im_end|>\n'
    # for user, assistant in history:
    #     instruction += f'<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n'
    # instruction += f'<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n'
    stop_tokens = ["<|endoftext|>", "<|im_end|>"]
    instruction = '<|System|>\n' + system_prompt + '\n'
    for user, assistant in history:
        instruction += f'<|User|>\n{user}\n<|Assistant|>\n{assistant}\n'
    # Open an assistant turn and a <think> tag so the model starts by reasoning
    instruction += f'<|User|>\n{message}\n<|Assistant|>\n<think>\n'
    print(instruction)  # log the full prompt for debugging

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
    input_ids, attention_mask = enc.input_ids, enc.attention_mask

    # Keep only the most recent CONTEXT_LENGTH tokens of the prompt
    if input_ids.shape[1] > CONTEXT_LENGTH:
        input_ids = input_ids[:, -CONTEXT_LENGTH:]
        attention_mask = attention_mask[:, -CONTEXT_LENGTH:]

    generate_kwargs = dict(
        input_ids=input_ids.to(device),
        attention_mask=attention_mask.to(device),
        streamer=streamer,
        do_sample=True,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        top_p=top_p,
    )

    # Generate on a background thread; stream tokens back to the UI as they arrive
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    outputs = []
    for new_token in streamer:
        outputs.append(new_token)
        if new_token in stop_tokens:
            break
        yield "".join(outputs)
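# A minimal sketch (assumption, not part of the original Space) of consuming the
# streaming generator directly, e.g. as a smoke test outside the Gradio UI.
# Arguments follow the signature above: message, history, system_prompt,
# temperature, max_new_tokens, top_k, repetition_penalty, top_p.
#
#     for partial in predict("What is 17 * 23?", [], "You are a helpful assistant.",
#                            0.6, 256, 40, 1.1, 0.95):
#         pass
#     print(partial)  # final accumulated answer, including the <think> block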
# Load model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 4-bit quantization via bitsandbytes, computing in bfloat16
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=quantization_config,
    attn_implementation="flash_attention_2",
)
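# Fallback sketch (assumption, not part of the original Space): if the flash-attn
# wheel fails to install, PyTorch's built-in scaled-dot-product attention can be
# used instead by swapping the attn_implementation argument:
#
#     model = AutoModelForCausalLM.from_pretrained(
#         MODEL_ID,
#         device_map="auto",
#         quantization_config=quantization_config,
#         attn_implementation="sdpa",
#     )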
# Create Gradio interface
gr.ChatInterface(
    predict,
    title=EMOJI + " " + MODEL_NAME,
    description=DESCRIPTION,
    # Wire the LaTeX delimiters defined above into the chat window
    chatbot=gr.Chatbot(latex_delimiters=latex_delimiters_set),
    additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
    additional_inputs=[
        gr.Textbox("You are a helpful assistant. First understand the user's request, then think and reply carefully.", label="System prompt"),
        gr.Slider(0, 1, 0.6, label="Temperature"),
        gr.Slider(0, 30000, 20000, label="Max new tokens"),
        gr.Slider(1, 80, 40, label="Top K sampling"),
        gr.Slider(0, 2, 1.1, label="Repetition penalty"),
        gr.Slider(0, 1, 0.95, label="Top P sampling"),
    ],
    # theme=gr.themes.Soft(primary_hue=COLOR),
).queue().launch()