Spaces: wuhp (Running on Zero)

wuhp committed · verified
Commit f82c314 · 1 parent: b26485f

Update app.py

Files changed (1):
  app.py  +179 -31
app.py CHANGED
--- app.py (before this commit)

import gradio as gr
import spaces
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, pipeline

text_pipeline = None  # global var to hold our pipeline once loaded

@spaces.GPU(duration=120)  # request up to 120s GPU time to load the model
def load_model():
    """
    This function will run in a *child* process that has GPU allocated.
    We can safely do device_map="auto" or .to("cuda") here.
    """
    config = AutoConfig.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        config=config,
        torch_dtype="auto",   # triggers GPU usage
        device_map="auto",    # triggers GPU usage
        trust_remote_code=True
    )
    text_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return text_pipe

def ensure_pipeline():
    """
    If we've never loaded the pipeline, call load_model() now.
    If ZeroGPU has deallocated it, we might need to reload again.
    """
    global text_pipeline
    if text_pipeline is None:
        text_pipeline = load_model()  # <-- calls the GPU-wrapped function
    return text_pipeline

@spaces.GPU(duration=60)  # up to 60s for each generate call
def predict(prompt, max_new_tokens=64):
    """
    Called when the user clicks 'Generate'; ensures the model is loaded,
    then runs inference on GPU.
    """
    pipe = ensure_pipeline()
    outputs = pipe(prompt, max_new_tokens=int(max_new_tokens))
    return outputs[0]["generated_text"]

# Build the Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# ZeroGPU Inference Demo")
    prompt = gr.Textbox(label="Prompt")
    max_tok = gr.Slider(1, 256, value=64, step=1, label="Max New Tokens")
    output = gr.Textbox(label="Generated Text")

    generate_btn = gr.Button("Generate")
    generate_btn.click(fn=predict, inputs=[prompt, max_tok], outputs=output)

demo.launch()
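Both the old file above and the updated file below are built around the ZeroGPU constraint restated in the new header comment: nothing at module scope touches CUDA, and all GPU work happens inside functions decorated with @spaces.GPU, which run in a GPU-attached child process for at most the requested duration. A minimal standalone sketch of that pattern, not part of this commit (the duration and the toy function are illustrative only):

import spaces   # ZeroGPU helper package, preinstalled on Hugging Face Spaces
import torch

# Module scope stays CPU-only: no .to("cuda"), no device_map, no CUDA checks here.

@spaces.GPU(duration=60)      # GPU is attached only while this function runs
def gpu_task(prompt: str) -> str:
    # Inside the decorated function it is safe to query and use CUDA.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return f"[{device}] {prompt}"

print(gpu_task("hello"))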
 
+++ app.py (after this commit)

import gradio as gr
import spaces
import torch
from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline
)

#############################################################
# ZeroGPU REQUIREMENT:
#   - No CUDA references at global scope.
#   - All GPU usage within @spaces.GPU(...) functions.
#############################################################

# We'll use a small subset of WikiText-2 for demonstration.
# Real finetuning on the entire dataset likely exceeds typical ZeroGPU time.
NUM_EXAMPLES = 1000  # or fewer to keep it quick

# We'll store the inference pipeline here after training
TEXT_PIPELINE = None

@spaces.GPU(duration=300)  # up to 5 minutes for a mini-finetune
def finetune_small_subset():
    """
    1) Loads the model & tokenizer from 'wuhp/myr1'.
    2) Loads a small subset of WikiText-2 for language modeling.
    3) Runs a quick 1-epoch finetune.
    4) Saves model + tokenizer to 'finetuned_myr1'.
    5) Loads the newly trained model back into a text-generation pipeline.
    Returns a success message.
    """

    # -------------------------------
    # A) Load a small dataset
    # -------------------------------
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    # Keep only a subset so we don't exceed time.
    ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))

    def format_and_tokenize(ex):
        # For standard LM, we just treat each line as text
        return tokenizer(ex["text"], truncation=True, max_length=512)

    # (The tokenizer used above is defined below; it is bound by the time ds.map runs.)

    # -------------------------------
    # B) Load config, tokenizer, model from HF
    #    (trust_remote_code=True for custom modeling_deepseek)
    # -------------------------------
    config = AutoConfig.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        config=config,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
        trust_remote_code=True
    )

    # -------------------------------
    # C) Process dataset
    # -------------------------------
    ds = ds.map(format_and_tokenize, batched=True, remove_columns=["text"])
    ds.set_format("torch")

    # -------------------------------
    # D) Data collator
    # -------------------------------
    collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # -------------------------------
    # E) Training arguments + Trainer
    # -------------------------------
    training_args = TrainingArguments(
        output_dir="finetuned_myr1",
        num_train_epochs=1,               # 1 epoch for demonstration
        per_device_train_batch_size=1,
        gradient_accumulation_steps=2,
        logging_steps=10,
        save_steps=999999,                # effectively "don't save mid-training"
        save_total_limit=1,
        fp16=torch.cuda.is_available(),
        # ZeroGPU ephemeral environment => no real advantage to push_to_hub
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds,
        data_collator=collator,
    )

    # -------------------------------
    # F) Train
    # -------------------------------
    trainer.train()

    # -------------------------------
    # G) Save local checkpoint
    # -------------------------------
    trainer.save_model("finetuned_myr1")
    tokenizer.save_pretrained("finetuned_myr1")

    # -------------------------------
    # H) Reload the newly finetuned model as a pipeline
    # -------------------------------
    # (We do this so we can do inference in the same GPU session.
    # However, if the pipeline is used *after* this function returns,
    # we might need to re-load it in a separate function call.)
    finetuned_model = AutoModelForCausalLM.from_pretrained(
        "finetuned_myr1",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
        trust_remote_code=True
    )
    global TEXT_PIPELINE
    TEXT_PIPELINE = pipeline(
        "text-generation",
        model=finetuned_model,
        tokenizer=tokenizer
    )
    return "Finetuning complete. Model reloaded for inference!"

def ensure_pipeline():
    """
    If TEXT_PIPELINE is None (e.g., we haven't finetuned yet),
    load the *original* model from wuhp/myr1
    so that 'predict' can still run.
    """
    global TEXT_PIPELINE
    if TEXT_PIPELINE is None:
        # Load the original model for inference
        TEXT_PIPELINE = pipeline(
            "text-generation",
            model="wuhp/myr1/myr1",  # subfolder syntax
            trust_remote_code=True
        )
    return TEXT_PIPELINE

@spaces.GPU(duration=120)  # up to 2 minutes to generate text
def predict(prompt, min_new_tokens=260, max_new_tokens=2600):
    """
    Generate text from the (possibly finetuned) model.
    max_new_tokens defaults to 2,600, but the UI slider allows up to 5,000.
    We also enforce a minimum of 260 new tokens.
    """
    pipe = ensure_pipeline()  # load the model if it isn't loaded already
    # The pipeline handles do_sample by default.
    # We allow a large max_new_tokens, but be careful about timeouts.
    outputs = pipe(
        prompt,
        min_new_tokens=int(min_new_tokens),
        max_new_tokens=int(max_new_tokens),
        temperature=0.7,
        top_p=0.9
    )
    return outputs[0]["generated_text"]

#############################################################
# Build the Gradio UI
#############################################################
with gr.Blocks() as demo:
    gr.Markdown("## ZeroGPU Finetuning & Long-Text Generation Demo")

    finetune_btn = gr.Button("Finetune on a small WikiText-2 subset (5 min limit)")
    finetune_status = gr.Textbox(label="Status")
    # When the user clicks, run 'finetune_small_subset'
    finetune_btn.click(fn=finetune_small_subset, outputs=finetune_status)

    gr.Markdown(
        "Once finetuning completes, or if you skip it, you can still do inference "
        "with either the new or the original model."
    )

    prompt_in = gr.Textbox(label="Prompt", lines=3)
    min_tok_slider = gr.Slider(
        minimum=260, maximum=5000, value=260, step=10,
        label="Minimum New Tokens"
    )
    max_tok_slider = gr.Slider(
        minimum=260, maximum=5000, value=2600, step=50,
        label="Maximum New Tokens"
    )
    gen_btn = gr.Button("Generate")
    output_box = gr.Textbox(label="Generated Text", lines=12)

    gen_btn.click(
        fn=predict,
        inputs=[prompt_in, min_tok_slider, max_tok_slider],
        outputs=output_box
    )

demo.launch()
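One step in finetune_small_subset that is easy to misread is the data collator: with mlm=False, DataCollatorForLanguageModeling pads each batch and copies input_ids into labels (padding positions become -100), which is what Trainer expects for causal next-token training. A small self-contained sketch of that behavior, using the public gpt2 tokenizer purely as a stand-in for wuhp/myr1 (which requires trust_remote_code):

from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tok = AutoTokenizer.from_pretrained("gpt2")
tok.pad_token = tok.eos_token          # gpt2 ships without a pad token

collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)
features = [tok("hello world"), tok("a somewhat longer line of text")]
batch = collator(features)             # padded PyTorch tensors

# labels mirror input_ids, with padded positions masked out as -100
print(batch["input_ids"].shape, batch["labels"].shape)
print((batch["labels"] == -100).any())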