beyoru committed on
Commit 31391ab · verified · 1 Parent(s): 9015f33

Update app.py

Files changed (1):
  app.py +81 -24
app.py CHANGED
@@ -1,23 +1,30 @@
- import subprocess
- from threading import Thread
-
+ import re
  import torch
- import spaces
+ from threading import Thread
  import gradio as gr
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

  MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
- MODEL_NAME = MODEL_ID.split("/")[-1]
  CONTEXT_LENGTH = 4096

- def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
-     stop_tokens = ["<|endoftext|>", "<|im_end|>", "|im_end|"]
-     instruction = '<|im_start|>system\n' + system_prompt + '\n<|im_end|>\n'
+ # Add special tokens for thinking process
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+ tokenizer.add_special_tokens({
+     "additional_special_tokens": ["<think>", "</think>"]
+ })
+
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
+ model.resize_token_embeddings(len(tokenizer))
+
+ def predict(message, history, show_thinking, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
+     stop_tokens = ["<|endoftext|>", "<|im_end|>", "|im_end|", "</think>"]
+     instruction = f'<|im_start|>system\n{system_prompt}\n<|im_end|>\n'

+     # Format chat history
      for user, assistant in history:
          instruction += f'<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n'
      instruction += f'<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n'
-
+
      streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
      enc = tokenizer(instruction, return_tensors="pt", truncation=True, max_length=CONTEXT_LENGTH)
      input_ids, attention_mask = enc.input_ids, enc.attention_mask
@@ -36,27 +43,77 @@ def predict(message, history, system_prompt, temperature, max_new_tokens, top_k,

      t = Thread(target=model.generate, kwargs=generate_kwargs)
      t.start()
-
+
      outputs = []
+     thinking_buffer = []
+     in_thinking = False
+     current_chunk = ""
+
      for new_token in streamer:
-         if new_token in stop_tokens:
-             break  # Stop generation but don't add the stop token
-         outputs.append(new_token)
-         yield "".join(outputs).replace("<|im_end|>", "")  # Ensure no leftover stop tokens
+         current_chunk += new_token
+
+         # Check for thinking tags
+         if "<think>" in current_chunk and not in_thinking:
+             in_thinking = True
+             pre, _, post = current_chunk.partition("<think>")
+             if pre:
+                 outputs.append(pre)
+                 yield _clean_output("".join(outputs), show_thinking)
+             current_chunk = post

- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
+         if "</think>" in current_chunk and in_thinking:
+             in_thinking = False
+             pre, _, post = current_chunk.partition("</think>")
+             thinking_buffer.append(pre)
+             if show_thinking:
+                 outputs.extend(thinking_buffer)
+             thinking_buffer = []
+             current_chunk = post
+
+         if in_thinking:
+             thinking_buffer.append(current_chunk)
+             if show_thinking:
+                 outputs.append(current_chunk)
+                 yield _clean_output("".join(outputs), show_thinking)
+             current_chunk = ""
+         else:
+             if current_chunk:
+                 outputs.append(current_chunk)
+                 yield _clean_output("".join(outputs), show_thinking)
+                 current_chunk = ""
+
+ def _clean_output(text: str, show_thinking: bool) -> str:
+     # Remove residual tags and format thinking content
+     text = re.sub(r'\s*<think>\s*', '\n\n*Thinking:* ', text)
+     text = re.sub(r'\s*</think>\s*', ' ', text)
+     text = re.sub(r'(\*Thinking:\*)(?! )', r'\1 ', text)
+     return text.strip()

+ # Create interface with toggle
  gr.ChatInterface(
      predict,
      additional_inputs=[
-         gr.Textbox("You are a helpful assistant. Format responses clearly using natural Markdown formatting where appropriate.",
-                    label="System prompt"),
-         gr.Slider(0, 1, 0.6, label="Temperature"),
-         gr.Slider(0, 4096, 512, label="Max new tokens"),
-         gr.Slider(1, 80, 40, label="Top K sampling"),
-         gr.Slider(0, 2, 1.1, label="Repetition penalty"),
-         gr.Slider(0, 1, 0.95, label="Top P sampling"),
+         gr.Checkbox(value=True, label="🔍 Show Thinking Process"),
+         gr.Textbox(
+             "You are an AI assistant. First analyze requests using <think> tags, then provide answers. "
+             "Put all reasoning between <think> and </think> tags.",
+             label="System Prompt"
+         ),
+         gr.Slider(0, 1, 0.6, label="🌡️ Temperature"),
+         gr.Slider(0, 4096, 512, label="📏 Max New Tokens"),
+         gr.Slider(1, 80, 40, label="🎛️ Top K"),
+         gr.Slider(0.1, 2.0, 1.1, label="🔄 Repetition Penalty"),
+         gr.Slider(0, 1, 0.95, label="🧮 Top P"),
      ],
-     css=".message { white-space: pre-wrap; }",  # Preserve newlines
+     css="""
+     .thinking {
+         color: #666;
+         font-style: italic;
+         border-left: 3px solid #ddd;
+         padding-left: 1em;
+         margin: 0.5em 0;
+     }
+     """,
+     title="DeepSeek AI Assistant with Reasoning",
+     description="Toggle the 'Show Thinking Process' checkbox to view/hide the model's internal reasoning"
  ).queue().launch()
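
For readers tracing the new streaming logic, here is a condensed, dependency-free sketch of the <think>-tag handling that predict implements above, driven by a hard-coded token list instead of a live TextIteratorStreamer. Two details are assumptions for illustration, not part of the commit: the fake stream delivers each tag as a single whole token, and literal tag markers are re-inserted into outputs so that _clean_output has something to rewrite into its *Thinking:* prefix.

```python
import re

def _clean_output(text: str) -> str:
    # Rewrite residual tags into a readable "*Thinking:*" prefix, as in the commit.
    text = re.sub(r'\s*<think>\s*', '\n\n*Thinking:* ', text)
    text = re.sub(r'\s*</think>\s*', ' ', text)
    return text.strip()

def split_stream(tokens, show_thinking: bool) -> str:
    # Same state machine as predict(): buffer the text between the tags,
    # and only merge it into the visible output when show_thinking is on.
    outputs, thinking_buffer = [], []
    in_thinking = False
    for new_token in tokens:
        chunk = new_token
        if "<think>" in chunk and not in_thinking:
            in_thinking = True
            pre, _, chunk = chunk.partition("<think>")
            if pre:
                outputs.append(pre)
        if "</think>" in chunk and in_thinking:
            in_thinking = False
            pre, _, chunk = chunk.partition("</think>")
            thinking_buffer.append(pre)
            if show_thinking:
                # Keep markers so _clean_output can format the section (an
                # addition in this sketch; the commit drops them earlier).
                outputs += ["<think>", *thinking_buffer, "</think>"]
            thinking_buffer = []
        if in_thinking:
            thinking_buffer.append(chunk)
        elif chunk:
            outputs.append(chunk)
    return _clean_output("".join(outputs))

tokens = ["Let me think. ", "<think>", "2 + 2 ", "equals 4. ", "</think>",
          "The answer ", "is 4."]
print(split_stream(tokens, show_thinking=True))
print(split_stream(tokens, show_thinking=False))
```

With the toggle on, this prints the answer with the reasoning inlined after a *Thinking:* marker; with it off, only "Let me think. The answer is 4." remains.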