Update app.py
app.py (CHANGED)
@@ -21,41 +21,39 @@ from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
 # ZeroGPU + QLoRA Example
 ##############################################################################
 
-TEXT_PIPELINE = None
-COMPARISON_PIPELINE = None  #
-NUM_EXAMPLES = 50  # We'll train on 50 lines (or rows) for demonstration
+TEXT_PIPELINE = None        # Pipeline for wuhp/myr1 (fine-tuned or base)
+COMPARISON_PIPELINE = None  # Pipeline for the DeepSeek model
 
-
+NUM_EXAMPLES = 50  # We'll train on 50 rows for demonstration
+
+@spaces.GPU(duration=300)  # up to 5 min
 def finetune_small_subset():
     """
     1) Loads 'wuhp/myr1' in 4-bit quantization (QLoRA style),
     2) Adds LoRA adapters (trainable),
-    3) Trains on a small subset of Magpie
+    3) Trains on a small subset of the Magpie dataset,
     4) Saves LoRA adapter to 'finetuned_myr1',
     5) Reloads LoRA adapters for inference in a pipeline.
     """
 
-    # --- 1) Load Magpie dataset ---
-    # You can load 'train' or 'validation' split depending on your preference
+    # --- 1) Load a small subset of the Magpie dataset ---
     ds = load_dataset(
         "Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B",
         split="train"
     )
 
-    #
-    # (Alternatively, just do ds.select(range(...)) for a small random subset.)
-    # We'll demonstrate filtering for the first conversation_id:
+    # For demonstration, pick a single conversation_id
     unique_ids = list(set(ds["conversation_id"]))
     single_id = unique_ids[0]
    ds = ds.filter(lambda x: x["conversation_id"] == single_id)
 
-    #
+    # Then select only NUM_EXAMPLES from that subset
     ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))
 
     # --- 2) Setup 4-bit quantization with BitsAndBytes ---
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
-        bnb_4bit_compute_dtype=torch.bfloat16,  # or torch.float16
+        bnb_4bit_compute_dtype=torch.bfloat16,  # or torch.float16
         bnb_4bit_use_double_quant=True,
         bnb_4bit_quant_type="nf4",
     )
@@ -75,12 +73,12 @@ def finetune_small_subset():
         "wuhp/myr1",
         subfolder="myr1",
         config=config,
-        quantization_config=bnb_config,
+        quantization_config=bnb_config,  # <--- QLoRA 4-bit
         device_map="auto",
         trust_remote_code=True
     )
 
-    # Prepare the model for k-bit training
+    # Prepare the model for k-bit training
     base_model = prepare_model_for_kbit_training(base_model)
 
     # --- 3) Create LoRA config & wrap the base model in LoRA ---
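
The step referenced by the "# --- 3) Create LoRA config & wrap the base model in LoRA ---" comment sits outside the changed hunks. A minimal sketch of what it typically looks like with peft, using illustrative values for the rank, alpha, dropout, and target_modules (these are assumptions, not the repo's actual settings):

    lora_config = LoraConfig(
        r=16,                                  # assumed rank
        lora_alpha=32,                         # assumed scaling factor
        lora_dropout=0.05,                     # assumed dropout
        target_modules=["q_proj", "v_proj"],   # assumed attention projections
        task_type=TaskType.CAUSAL_LM,
    )
    lora_model = get_peft_model(base_model, lora_config)
    lora_model.print_trainable_parameters()   # only the adapter weights are trainable

After prepare_model_for_kbit_training plus get_peft_model, the 4-bit base weights stay frozen and only the LoRA adapters receive gradients.
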
@@ -97,10 +95,9 @@ def finetune_small_subset():
     # --- 4) Tokenize dataset ---
     def tokenize_fn(ex):
         """
-
-
+        Combine instruction + response into a single text.
+        You can adjust this to include more fields or different formatting.
         """
-        # For demonstration, let's do a short prompt style:
         text = (
             f"Instruction: {ex['instruction']}\n\n"
             f"Response: {ex['response']}"
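
The rest of tokenize_fn (the tokenizer call and return) lies outside this hunk. A sketch of how such a function is commonly completed for causal-LM fine-tuning; the max_length value, the labels handling, and the ds.map call are assumptions, not taken from this commit:

    def tokenize_fn(ex):
        text = (
            f"Instruction: {ex['instruction']}\n\n"
            f"Response: {ex['response']}"
        )
        tokens = tokenizer(text, truncation=True, max_length=512)  # assumed max_length
        tokens["labels"] = tokens["input_ids"].copy()              # causal LM: labels mirror inputs
        return tokens

    ds = ds.map(tokenize_fn, remove_columns=ds.column_names)
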
@@ -119,9 +116,9 @@ def finetune_small_subset():
         per_device_train_batch_size=1,
         gradient_accumulation_steps=2,
         logging_steps=5,
-        save_steps=999999,
+        save_steps=999999,  # effectively don't save mid-epoch
         save_total_limit=1,
-        fp16=False,
+        fp16=False,         # rely on bfloat16 from quantization
     )
 
     # Trainer
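
The TrainingArguments shown above are typically wired to a Trainer roughly as sketched below; the output_dir, epoch count, and collator (transformers.DataCollatorForLanguageModeling) are assumptions here, while the adapter save path comes from the docstring in the first hunk:

    training_args = TrainingArguments(
        output_dir="finetune_output",      # assumed checkpoint dir
        num_train_epochs=1,                # assumed
        per_device_train_batch_size=1,
        gradient_accumulation_steps=2,
        logging_steps=5,
        save_steps=999999,
        save_total_limit=1,
        fp16=False,
    )
    trainer = Trainer(
        model=lora_model,
        args=training_args,
        train_dataset=ds,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),  # assumed collator
    )
    trainer.train()
    lora_model.save_pretrained("finetuned_myr1")   # saves only the LoRA adapter weights
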
@@ -158,7 +155,8 @@ def finetune_small_subset():
     global TEXT_PIPELINE
     TEXT_PIPELINE = pipeline("text-generation", model=lora_model_2, tokenizer=tokenizer)
 
-    return "Finetuning complete
+    return "Finetuning complete. Model loaded for inference."
+
 
 def ensure_pipeline():
     """
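
Steps 4 and 5 of the docstring (save the adapter to 'finetuned_myr1', then reload it for inference) follow the standard peft reload pattern sketched here; the exact arguments used in this repo may differ:

    from peft import PeftModel

    base = AutoModelForCausalLM.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        quantization_config=bnb_config,   # same 4-bit config as above
        device_map="auto",
        trust_remote_code=True,
    )
    lora_model_2 = PeftModel.from_pretrained(base, "finetuned_myr1")  # attach the saved adapter
    TEXT_PIPELINE = pipeline("text-generation", model=lora_model_2, tokenizer=tokenizer)
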
@@ -186,10 +184,34 @@ def ensure_pipeline():
         TEXT_PIPELINE = pipeline("text-generation", model=base_model, tokenizer=tokenizer)
     return TEXT_PIPELINE
 
+
+def ensure_comparison_pipeline():
+    """
+    Load the DeepSeek model pipeline if not already loaded.
+    """
+    global COMPARISON_PIPELINE
+    if COMPARISON_PIPELINE is None:
+        # If you prefer 4-bit, you can define BitsAndBytesConfig here,
+        # but let's keep it simpler for demonstration (fp16 or bf16).
+        config = AutoConfig.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
+        tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
+        model = AutoModelForCausalLM.from_pretrained(
+            "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+            config=config,
+            device_map="auto"
+        )
+        COMPARISON_PIPELINE = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer
+        )
+    return COMPARISON_PIPELINE
+
+
 @spaces.GPU(duration=120)  # up to 2 min for text generation
 def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     """
-    Generates text from the
+    Generates text from the fine-tuned (LoRA) model if present, else the base model.
     """
     pipe = ensure_pipeline()
     out = pipe(
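
The comment inside ensure_comparison_pipeline notes that the DeepSeek model could also be loaded in 4-bit. A sketch of that variant, reusing the same BitsAndBytesConfig pattern as the training code (the dtype choice mirrors the config above and is an assumption here):

    comp_bnb = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    model = AutoModelForCausalLM.from_pretrained(
        "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
        quantization_config=comp_bnb,
        device_map="auto",
    )
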
@@ -202,19 +224,49 @@ def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     )
     return out[0]["generated_text"]
 
-# (Optional) If you want to compare with another model, define it here:
-# def ensure_comparison_pipeline():
-#     ...
 
+@spaces.GPU(duration=120)  # up to 2 min for text generation
+def compare_models(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
+    """
+    Generates text side-by-side from the local myr1 pipeline (fine-tuned or base)
+    AND from the DeepSeek model. Returns two strings.
+    """
+    local_pipe = ensure_pipeline()
+    comp_pipe = ensure_comparison_pipeline()
+
+    local_out = local_pipe(
+        prompt,
+        temperature=float(temperature),
+        top_p=float(top_p),
+        min_new_tokens=int(min_new_tokens),
+        max_new_tokens=int(max_new_tokens),
+        do_sample=True
+    )
+    local_text = local_out[0]["generated_text"]
+
+    comp_out = comp_pipe(
+        prompt,
+        temperature=float(temperature),
+        top_p=float(top_p),
+        min_new_tokens=int(min_new_tokens),
+        max_new_tokens=int(max_new_tokens),
+        do_sample=True
+    )
+    comp_text = comp_out[0]["generated_text"]
+
+    return local_text, comp_text
+
+
+# Build Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("
-    gr.Markdown("
+    gr.Markdown("# QLoRA Fine-tuning & Comparison Demo")
+    gr.Markdown("**Fine-tune wuhp/myr1** on a small subset of the Magpie dataset, then generate or compare output with the DeepSeek model.")
 
-    finetune_btn = gr.Button("Finetune 4-bit (QLoRA) on
+    finetune_btn = gr.Button("Finetune 4-bit (QLoRA) on Magpie subset (up to 5 min)")
     status_box = gr.Textbox(label="Finetune Status")
     finetune_btn.click(fn=finetune_small_subset, outputs=status_box)
 
-    gr.Markdown("### Generate with myr1 (fine-tuned if done
+    gr.Markdown("### Generate with myr1 (fine-tuned if done, else base)")
 
     prompt_in = gr.Textbox(lines=3, label="Prompt")
     temperature = gr.Slider(0.0, 1.5, step=0.1, value=0.7, label="Temperature")
@@ -222,8 +274,8 @@ with gr.Blocks() as demo:
     min_tokens = gr.Slider(50, 1024, value=50, step=10, label="Min New Tokens")
     max_tokens = gr.Slider(50, 1024, value=200, step=50, label="Max New Tokens")
 
-    output_box = gr.Textbox(label="
-    gen_btn = gr.Button("Generate")
+    output_box = gr.Textbox(label="myr1 Output", lines=8)
+    gen_btn = gr.Button("Generate with myr1")
 
     gen_btn.click(
         fn=predict,
@@ -231,4 +283,16 @@ with gr.Blocks() as demo:
         outputs=output_box
     )
 
+    gr.Markdown("### Compare myr1 vs DeepSeek side-by-side")
+
+    compare_btn = gr.Button("Compare")
+    out_local = gr.Textbox(label="myr1 Output", lines=8)
+    out_deepseek = gr.Textbox(label="DeepSeek Output", lines=8)
+
+    compare_btn.click(
+        fn=compare_models,
+        inputs=[prompt_in, temperature, top_p, min_tokens, max_tokens],
+        outputs=[out_local, out_deepseek]
+    )
+
 demo.launch()