nikravan committed on
Commit
23aa3a9
verified
1 Parent(s): 3b991b6

Update app.py

Files changed (1)
  1. app.py +23 -66
app.py CHANGED
@@ -1,59 +1,35 @@
-
-import json
-import subprocess
-from threading import Thread
-
 import torch
-import spaces
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
+from threading import Thread
 
-#subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
-
+# Model settings
 MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
 CHAT_TEMPLATE = "Auto"
 MODEL_NAME = MODEL_ID.split("/")[-1]
 CONTEXT_LENGTH = 16000
 
-# Setting values directly for the variables
-COLOR = "blue"  # Default interface color
-EMOJI = "🤖"  # Default emoji for the model
-DESCRIPTION = f"This is the {MODEL_NAME} model designed for testing thinking for general AI tasks."  # Default description
 
-latex_delimiters_set = [{
-    "left": "\\(",
-    "right": "\\)",
-    "display": False
-}, {
-    "left": "\\begin{equation}",
-    "right": "\\end{equation}",
-    "display": True
-}, {
-    "left": "\\begin{align}",
-    "right": "\\end{align}",
-    "display": True
-}, {
-    "left": "\\begin{alignat}",
-    "right": "\\end{alignat}",
-    "display": True
-}, {
-    "left": "\\begin{gather}",
-    "right": "\\end{gather}",
-    "display": True
-}, {
-    "left": "\\begin{CD}",
-    "right": "\\end{CD}",
-    "display": True
-}, {
-    "left": "\\[",
-    "right": "\\]",
-    "display": True
-}]
+COLOR = "blue"
+EMOJI = "🤖"
+DESCRIPTION = f"This is the {MODEL_NAME} model designed for testing thinking for general AI tasks."
+
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    quantization_config=quantization_config,
+)  # .to(device) removed: 4-bit bitsandbytes models cannot be moved; device_map="auto" already places them
 
 
-@spaces.GPU()
 def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
-    # Format history with a given chat template
+
     if CHAT_TEMPLATE == "Auto":
         stop_tokens = [tokenizer.eos_token_id]
         instruction = system_prompt + "\n\n"
@@ -74,18 +50,18 @@ def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
         instruction += f' {message} [/INST]'
     else:
         raise Exception("Incorrect chat template, select 'Auto', 'ChatML' or 'Mistral Instruct'")
-    print(instruction)
+
 
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
     enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
-    input_ids, attention_mask = enc.input_ids, enc.attention_mask
+    input_ids, attention_mask = enc.input_ids.to(device), enc.attention_mask.to(device)
 
     if input_ids.shape[1] > CONTEXT_LENGTH:
         input_ids = input_ids[:, -CONTEXT_LENGTH:]
         attention_mask = attention_mask[:, -CONTEXT_LENGTH:]
 
     generate_kwargs = dict(
-        input_ids=input_ids.to(device),
+        input_ids=input_ids,
         attention_mask=attention_mask,
         streamer=streamer,
         do_sample=True,
@@ -105,28 +81,10 @@ def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
         yield "".join(outputs)
 
 
-# Load model
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.bfloat16
-)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    device_map="auto",
-    quantization_config=quantization_config,
-    #attn_implementation="flash_attention_2",
-)
-
-# Create Gradio interface
 gr.ChatInterface(
     predict,
     title=EMOJI + " " + MODEL_NAME,
     description=DESCRIPTION,
-
-
-
     additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
     additional_inputs=[
         gr.Textbox("You are a code assistant.", label="System prompt"),
@@ -137,5 +95,4 @@ gr.ChatInterface(
         gr.Slider(0, 1, 0.95, label="Top P sampling"),
     ],
     theme=gr.themes.Soft(primary_hue=COLOR),
-).queue().launch()
-
+).queue().launch()
 
 
 
 
 
 
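Note on elided context: the hunks above skip the part of predict that formats the chat history into `instruction` (old lines 60-73 / new lines 36-49). Only the tail of the 'Mistral Instruct' branch survives as context (`instruction += f' {message} [/INST]'`), plus the error message naming the three templates. A minimal sketch of what that branch presumably does, written as a standalone helper; the loop body and the history format are assumptions, not part of this commit:

    def format_mistral_instruct(system_prompt, history, message):
        # Hypothetical reconstruction: only the final += line below is
        # visible in the diff context; the rest follows the conventional
        # [INST] ... [/INST] turn structure.
        instruction = f'<s>[INST] {system_prompt}\n'
        for user_msg, assistant_msg in history:
            instruction += f'{user_msg} [/INST] {assistant_msg}</s>[INST]'
        instruction += f' {message} [/INST]'
        return instruction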
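The streaming half of predict (new lines 68-80) is likewise elided; the surviving context shows `generate_kwargs` being assembled with a `streamer` and the function ending in `yield "".join(outputs)`. Those lines imply the standard TextIteratorStreamer pattern, sketched here as a self-contained generator (the helper name and argument plumbing are assumptions):

    from threading import Thread
    from transformers import TextIteratorStreamer

    def stream_reply(model, tokenizer, generate_kwargs):
        # model.generate runs on a worker thread while this generator
        # drains the streamer, yielding the progressively longer reply
        # that gr.ChatInterface re-renders.
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        generate_kwargs["streamer"] = streamer
        Thread(target=model.generate, kwargs=generate_kwargs).start()
        outputs = []
        for new_text in streamer:      # decoded text chunks as they arrive
            outputs.append(new_text)
            yield "".join(outputs)     # matches the diff's final context line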