Update app.py
app.py
CHANGED
@@ -1,22 +1,65 @@
@@ -24,41 +67,146 @@ def ensure_pipeline():
(Removed: the previous, inference-only version of app.py. It imported gradio and spaces, loaded "wuhp/myr1" (subfolder "myr1") via AutoConfig, AutoTokenizer and AutoModelForCausalLM with device_map="auto", exposed a single @spaces.GPU predict() behind a "Max New Tokens" slider capped at 256, and launched a minimal gr.Blocks UI ending in demo.launch(). The full updated file follows.)
import gradio as gr
import spaces
import torch
from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline
)

#############################################################
# ZeroGPU REQUIREMENT:
#   - No CUDA references at global scope.
#   - All GPU usage within @spaces.GPU(...) functions.
#############################################################

# We'll do a small subset of WikiText-2 for demonstration.
# Real finetuning on the entire dataset likely exceeds typical ZeroGPU time.
NUM_EXAMPLES = 1000  # or fewer to keep it quick

# We'll store the "inference pipeline" after training
TEXT_PIPELINE = None

@spaces.GPU(duration=300)  # up to 5 minutes for a mini-finetuning run
def finetune_small_subset():
    """
    1) Loads the model & tokenizer from 'wuhp/myr1'.
    2) Loads a small subset of WikiText-2 for language modeling.
    3) Runs a quick 1-epoch finetune.
    4) Saves model + tokenizer to 'finetuned_myr1'.
    5) Loads the newly trained model back into a text-generation pipeline.
    Returns a success message.
    """

    # -------------------------------
    # A) Load a small dataset
    # -------------------------------
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    # Keep only a subset so we don't exceed the time limit.
    ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))

    def format_and_tokenize(ex):
        # For standard LM, we just treat each line as text
        return tokenizer(ex["text"], truncation=True, max_length=512)

    # Note: 'tokenizer' is loaded below in section B; the map call in section C
    # runs only after it exists, so the closure above resolves correctly.
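    # Illustrative sketch of the mapper's output (token ids are made up): with
    # batched=True it receives lists of lines, so
    #   format_and_tokenize({"text": ["hello world"]})
    # returns roughly {"input_ids": [[101, 102]], "attention_mask": [[1, 1]]},
    # i.e. one token list per input line, truncated to max_length=512.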

    # -------------------------------
    # B) Load config, tokenizer, model from HF
    #    (trust_remote_code=True for the custom modeling_deepseek code)
    # -------------------------------
    config = AutoConfig.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        config=config,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
        trust_remote_code=True
    )

    # -------------------------------
    # C) Process dataset
    # -------------------------------
    ds = ds.map(format_and_tokenize, batched=True, remove_columns=["text"])
    ds.set_format("torch")
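    # After the map/set_format above, each row holds only tensor-valued model
    # inputs (input_ids, attention_mask); the raw "text" column was dropped via
    # remove_columns, so the Trainer sees nothing but model inputs.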

    # -------------------------------
    # D) Data Collator
    # -------------------------------
    collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )
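    # What this collator does for causal LM (mlm=False), as a hedged summary: it
    # pads each batch, copies input_ids into a "labels" field, and replaces the
    # labels at padding positions with -100 so the loss ignores them.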

    # -------------------------------
    # E) Training Arguments + Trainer
    # -------------------------------
    training_args = TrainingArguments(
        output_dir="finetuned_myr1",
        num_train_epochs=1,             # 1 epoch for demonstration
        per_device_train_batch_size=1,
        gradient_accumulation_steps=2,
        logging_steps=10,
        save_steps=999999,              # effectively "don't save mid-training"
        save_total_limit=1,
        fp16=torch.cuda.is_available(),
        # ZeroGPU ephemeral environment => no real advantage to push_to_hub
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds,
        data_collator=collator,
    )
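    # Rough budget check (assuming roughly NUM_EXAMPLES rows survive tokenization):
    # an effective batch of 1 x 2 = 2 examples per optimizer step means about 500
    # steps for the single epoch, which is meant to fit the 300 s @spaces.GPU window.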

    # -------------------------------
    # F) Train
    # -------------------------------
    trainer.train()

    # -------------------------------
    # G) Save local checkpoint
    # -------------------------------
    trainer.save_model("finetuned_myr1")
    tokenizer.save_pretrained("finetuned_myr1")

    # -------------------------------
    # H) Reload the newly finetuned model as a pipeline
    # -------------------------------
    # (We do this so we can do inference in the same GPU session.)
    # However, if the pipeline is used *after* this function returns,
    # we might need to re-load it in a separate function call.
    finetuned_model = AutoModelForCausalLM.from_pretrained(
        "finetuned_myr1",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
        trust_remote_code=True
    )
    global TEXT_PIPELINE
    TEXT_PIPELINE = pipeline(
        "text-generation",
        model=finetuned_model,
        tokenizer=tokenizer
    )
    return "Finetuning complete. Model reloaded for inference!"

def ensure_pipeline():
    """
    If TEXT_PIPELINE is None (e.g., we didn't finetune yet),
    let's just load the *original* model from wuhp/myr1
    so that 'predict' can still run.
    """
    global TEXT_PIPELINE
    if TEXT_PIPELINE is None:
        # Load the original model for inference
        TEXT_PIPELINE = pipeline(
            "text-generation",
            model="wuhp/myr1/myr1",  # subfolder syntax
            trust_remote_code=True
        )
    return TEXT_PIPELINE
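
# Hedged alternative to the "wuhp/myr1/myr1" path string above: if that form is
# not resolved by pipeline(), the subfolder can be passed explicitly, mirroring
# the loading code in finetune_small_subset(). This helper is only an
# illustrative sketch and is not called anywhere in the app.
def _build_base_pipeline_explicitly():
    base_model = AutoModelForCausalLM.from_pretrained(
        "wuhp/myr1", subfolder="myr1", trust_remote_code=True
    )
    base_tokenizer = AutoTokenizer.from_pretrained(
        "wuhp/myr1", subfolder="myr1", trust_remote_code=True
    )
    return pipeline("text-generation", model=base_model, tokenizer=base_tokenizer)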

@spaces.GPU(duration=120)  # up to 2 minutes to generate text
def predict(prompt, min_new_tokens=260, max_new_tokens=2600):
    """
    Generate text from the (possibly finetuned) model.
    We default max_new_tokens to 2,600, but allow up to 5,000 in the UI slider.
    We'll also ensure a minimum of 260 new tokens.
    """
    pipe = ensure_pipeline()  # load the model if not already loaded
    # Use pipeline generation params.
    # The pipeline will handle do_sample by default.
    # We set a large max_new_tokens, but be careful about timeouts.
    outputs = pipe(
        prompt,
        min_new_tokens=int(min_new_tokens),
        max_new_tokens=int(max_new_tokens),
        temperature=0.7,
        top_p=0.9
    )
    return outputs[0]["generated_text"]
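
# Illustrative use of predict() outside the UI (output text is made up): by
# default the text-generation pipeline keeps return_full_text=True, so the
# returned "generated_text" is the prompt followed by the new continuation.
#
#   text = predict("Once upon a time", min_new_tokens=260, max_new_tokens=300)
#   # -> "Once upon a time ..." plus roughly 260-300 newly generated tokens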

#############################################################
# Build a Gradio UI
#############################################################
with gr.Blocks() as demo:
    gr.Markdown("## ZeroGPU Finetuning & Long-Text Generation Demo")

    finetune_btn = gr.Button("Finetune on a small WikiText-2 subset (5 min limit)")
    finetune_status = gr.Textbox(label="Status")
    # When user clicks, we run 'finetune_small_subset'
    finetune_btn.click(fn=finetune_small_subset, outputs=finetune_status)

    gr.Markdown(
        "Once finetuning completes, or if you skip it, you can still do inference "
        "with either the new or original model."
    )

    prompt_in = gr.Textbox(label="Prompt", lines=3)
    min_tok_slider = gr.Slider(
        minimum=260, maximum=5000, value=260, step=10,
        label="Minimum New Tokens"
    )
    max_tok_slider = gr.Slider(
        minimum=260, maximum=5000, value=2600, step=50,
        label="Maximum New Tokens"
    )
    gen_btn = gr.Button("Generate")
    output_box = gr.Textbox(label="Generated Text", lines=12)

    gen_btn.click(
        fn=predict,
        inputs=[prompt_in, min_tok_slider, max_tok_slider],
        outputs=output_box
    )

demo.launch()