Update app.py
app.py
CHANGED
@@ -11,7 +11,7 @@ from transformers import (
     Trainer,
     TrainingArguments,
     pipeline,
-    BitsAndBytesConfig,
+    BitsAndBytesConfig,
 )

 # PEFT (LoRA / QLoRA)
@@ -22,21 +22,34 @@ from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_tr
 ##############################################################################

 TEXT_PIPELINE = None
-COMPARISON_PIPELINE = None #
-NUM_EXAMPLES = 50  # We'll train on 50 lines
+COMPARISON_PIPELINE = None  # pipeline for the comparison model, if desired
+NUM_EXAMPLES = 50  # We'll train on 50 lines (or rows) for demonstration

 @spaces.GPU(duration=600)  # up to 10 min
 def finetune_small_subset():
     """
     1) Loads 'wuhp/myr1' in 4-bit quantization (QLoRA style),
     2) Adds LoRA adapters (trainable),
-    3) Trains on
+    3) Trains on a small subset of Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B,
     4) Saves LoRA adapter to 'finetuned_myr1',
     5) Reloads LoRA adapters for inference in a pipeline.
     """

-    # --- 1) Load
-
+    # --- 1) Load Magpie dataset ---
+    # You can load 'train' or 'validation' split depending on your preference
+    ds = load_dataset(
+        "Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B",
+        split="train"
+    )
+
+    # EXAMPLE: Filter for a single conversation_id
+    # (Alternatively, just do ds.select(range(...)) for a small random subset.)
+    # We'll demonstrate filtering for the first conversation_id:
+    unique_ids = list(set(ds["conversation_id"]))
+    single_id = unique_ids[0]
+    ds = ds.filter(lambda x: x["conversation_id"] == single_id)
+
+    # After filtering, still pick just up to NUM_EXAMPLES
     ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))

     # --- 2) Setup 4-bit quantization with BitsAndBytes ---
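Side note on the dataset hunk above: a minimal, standalone sketch of the same subset selection, handy for sanity-checking the data before a training run. It assumes the Hugging Face datasets package and takes the column names conversation_id, instruction and response from the code above; the print statements are illustrative only.

from datasets import load_dataset

ds = load_dataset(
    "Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B",
    split="train",
)
# Restrict to one conversation, then cap at 50 rows, mirroring finetune_small_subset().
unique_ids = list(set(ds["conversation_id"]))
ds = ds.filter(lambda x: x["conversation_id"] == unique_ids[0])
ds = ds.select(range(min(50, len(ds))))

print(len(ds), ds.column_names)      # how many rows actually survive the filter
print(ds[0]["instruction"][:80])     # peek at the first training example

Note that ds["conversation_id"] materializes the whole column, so building unique_ids scans the full ~250K-row split; fine for a one-off demo, but worth caching if the filter is re-run.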
@@ -83,9 +96,18 @@ def finetune_small_subset():

     # --- 4) Tokenize dataset ---
     def tokenize_fn(ex):
-
+        """
+        Example: combine instruction + response
+        into a single text. Adjust to your liking.
+        """
+        # For demonstration, let's do a short prompt style:
+        text = (
+            f"Instruction: {ex['instruction']}\n\n"
+            f"Response: {ex['response']}"
+        )
+        return tokenizer(text, truncation=True, max_length=512)

-    ds = ds.map(tokenize_fn, batched=
+    ds = ds.map(tokenize_fn, batched=False, remove_columns=ds.column_names)
     ds.set_format("torch")

     collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
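A quick sketch of why tokenize_fn above does not create labels itself: with mlm=False, DataCollatorForLanguageModeling copies input_ids into labels at batch time, masking padded positions with -100 so they are ignored by the causal-LM loss. The gpt2 tokenizer here is only a stand-in for the app's wuhp/myr1 tokenizer.

from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tok = AutoTokenizer.from_pretrained("gpt2")
tok.pad_token = tok.eos_token  # gpt2 ships without a pad token

collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)
features = [
    tok("Instruction: add 2 and 2\n\nResponse: 4"),
    tok("Instruction: say hi\n\nResponse: hi"),
]
batch = collator(features)

# labels mirror input_ids, with padded positions set to -100.
print(batch["input_ids"].shape, batch["labels"].shape)
print(batch["labels"][1])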
@@ -128,7 +150,6 @@ def finetune_small_subset():
     )
     base_model_2 = prepare_model_for_kbit_training(base_model_2)

-    # Instead of load_adapter(...), we use PeftModel.from_pretrained
     lora_model_2 = PeftModel.from_pretrained(
         base_model_2,
         "finetuned_myr1",
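For context on the PeftModel.from_pretrained call above: a minimal sketch of reloading the adapter saved in 'finetuned_myr1' for standalone inference. The 4-bit settings here are assumptions chosen to mirror the QLoRA setup elsewhere in this file, not necessarily the exact values used there.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel

bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
base = AutoModelForCausalLM.from_pretrained(
    "wuhp/myr1",
    quantization_config=bnb,
    device_map="auto",
)
tok = AutoTokenizer.from_pretrained("wuhp/myr1")

model = PeftModel.from_pretrained(base, "finetuned_myr1")  # attach the saved LoRA adapter
gen = pipeline("text-generation", model=model, tokenizer=tok)
print(gen("Instruction: say hi\n\nResponse:", max_new_tokens=30)[0]["generated_text"])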
@@ -137,7 +158,7 @@ def finetune_small_subset():
     global TEXT_PIPELINE
     TEXT_PIPELINE = pipeline("text-generation", model=lora_model_2, tokenizer=tokenizer)

-    return "Finetuning complete (QLoRA + LoRA). Model loaded for inference."
+    return "Finetuning complete (QLoRA + LoRA on Magpie dataset). Model loaded for inference."

 def ensure_pipeline():
     """
@@ -165,37 +186,6 @@ def ensure_pipeline():
     TEXT_PIPELINE = pipeline("text-generation", model=base_model, tokenizer=tokenizer)
     return TEXT_PIPELINE

-def ensure_comparison_pipeline():
-    """
-    Load the DeepSeek model pipeline if not already loaded.
-    Adjust config if you'd like to load in 4-bit, or just do standard fp16/bfloat16.
-    """
-    global COMPARISON_PIPELINE
-    if COMPARISON_PIPELINE is None:
-        # Example: standard load (no QLoRA).
-        # If you want 4-bit, you can set up BitsAndBytesConfig here similarly.
-        config = AutoConfig.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
-        tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
-
-        # If you want to use device_map="auto" for GPU usage:
-        # In many cases you might want to do:
-        # device_map="auto" or device_map=0 for single-GPU.
-        # For demonstration, let's keep it simple.
-        # If your environment supports accelerate, you can do device_map="auto".
-        model = AutoModelForCausalLM.from_pretrained(
-            "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
-            config=config,
-            device_map="auto"
-        )
-
-        COMPARISON_PIPELINE = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer
-        )
-
-    return COMPARISON_PIPELINE
-
 @spaces.GPU(duration=120)  # up to 2 min for text generation
 def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     """
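The ensure_pipeline() referenced above (and the removed ensure_comparison_pipeline()) both follow the same lazy-singleton pattern: keep one module-level pipeline and build it only on first use, so repeated Gradio callbacks don't reload the model. A minimal sketch of that pattern, with gpt2 as a stand-in model:

from transformers import pipeline

TEXT_PIPELINE = None

def ensure_pipeline():
    """Build the text-generation pipeline once, then reuse it."""
    global TEXT_PIPELINE
    if TEXT_PIPELINE is None:
        TEXT_PIPELINE = pipeline("text-generation", model="gpt2")
    return TEXT_PIPELINE

pipe = ensure_pipeline()   # first call loads the model
pipe = ensure_pipeline()   # second call reuses the cached pipeline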
@@ -212,45 +202,15 @@ def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     )
     return out[0]["generated_text"]

-
-def compare_models(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
-    """
-    Generates text side-by-side from the local myr1 pipeline (fine-tuned or base)
-    AND from the DeepSeek model.
-    Returns two strings.
-    """
-    # Ensure both pipelines are loaded:
-    local_pipe = ensure_pipeline()
-    comp_pipe = ensure_comparison_pipeline()
-
-    local_out = local_pipe(
-        prompt,
-        temperature=float(temperature),
-        top_p=float(top_p),
-        min_new_tokens=int(min_new_tokens),
-        max_new_tokens=int(max_new_tokens),
-        do_sample=True
-    )
-    local_text = local_out[0]["generated_text"]
-
-    comp_out = comp_pipe(
-        prompt,
-        temperature=float(temperature),
-        top_p=float(top_p),
-        min_new_tokens=int(min_new_tokens),
-        max_new_tokens=int(max_new_tokens),
-        do_sample=True
-    )
-    comp_text = comp_out[0]["generated_text"]
-
-    return local_text, comp_text
-
-# Build Gradio UI
+# (Optional) If you want to compare with another model, define it here:
+# def ensure_comparison_pipeline():
+#     ...
+
 with gr.Blocks() as demo:
-    gr.Markdown("## ZeroGPU QLoRA Example for wuhp/myr1")
-    gr.Markdown("Finetune or skip to use the base model. Then
+    gr.Markdown("## ZeroGPU QLoRA Example for wuhp/myr1 (Magpie dataset subset)")
+    gr.Markdown("Finetune or skip to use the base model. Then generate text below.")

-    finetune_btn = gr.Button("Finetune 4-bit (QLoRA) on
+    finetune_btn = gr.Button("Finetune 4-bit (QLoRA) on small subset of Magpie dataset (up to 10 min)")
     status_box = gr.Textbox(label="Finetune Status")
     finetune_btn.click(fn=finetune_small_subset, outputs=status_box)

@@ -259,11 +219,11 @@ with gr.Blocks() as demo:
     prompt_in = gr.Textbox(lines=3, label="Prompt")
     temperature = gr.Slider(0.0, 1.5, step=0.1, value=0.7, label="Temperature")
     top_p = gr.Slider(0.0, 1.0, step=0.05, value=0.9, label="Top-p")
-    min_tokens = gr.Slider(
-    max_tokens = gr.Slider(
+    min_tokens = gr.Slider(50, 1024, value=50, step=10, label="Min New Tokens")
+    max_tokens = gr.Slider(50, 1024, value=200, step=50, label="Max New Tokens")

-    output_box = gr.Textbox(label="
-    gen_btn = gr.Button("Generate
+    output_box = gr.Textbox(label="Generated Text", lines=12)
+    gen_btn = gr.Button("Generate")

     gen_btn.click(
         fn=predict,
@@ -271,16 +231,4 @@ with gr.Blocks() as demo:
         outputs=output_box
     )

-    gr.Markdown("### Compare myr1 vs DeepSeek-R1-Distill-Llama-8B side-by-side")
-
-    compare_btn = gr.Button("Compare (Side-by-side)")
-    out_local = gr.Textbox(label="myr1 Output", lines=10)
-    out_deepseek = gr.Textbox(label="DeepSeek Output", lines=10)
-
-    compare_btn.click(
-        fn=compare_models,
-        inputs=[prompt_in, temperature, top_p, min_tokens, max_tokens],
-        outputs=[out_local, out_deepseek]
-    )
-
 demo.launch()
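One point worth spelling out about the Blocks wiring above: gr.Button.click passes the current values of the components listed in inputs to the callback positionally, so the order of [prompt_in, temperature, top_p, min_tokens, max_tokens] has to match predict's signature. A minimal sketch of that contract with a stand-in callback:

import gradio as gr

def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
    # Stand-in for the real predict(); just echoes what it received.
    return f"{prompt!r} | T={temperature} top_p={top_p} tokens={min_new_tokens}-{max_new_tokens}"

with gr.Blocks() as demo:
    prompt_in = gr.Textbox(lines=3, label="Prompt")
    temperature = gr.Slider(0.0, 1.5, step=0.1, value=0.7, label="Temperature")
    top_p = gr.Slider(0.0, 1.0, step=0.05, value=0.9, label="Top-p")
    min_tokens = gr.Slider(50, 1024, value=50, step=10, label="Min New Tokens")
    max_tokens = gr.Slider(50, 1024, value=200, step=50, label="Max New Tokens")
    output_box = gr.Textbox(label="Generated Text", lines=12)
    gr.Button("Generate").click(
        fn=predict,
        inputs=[prompt_in, temperature, top_p, min_tokens, max_tokens],
        outputs=output_box,
    )

demo.launch()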