Spaces: wuhp · Running on Zero

wuhp committed · verified
Commit 1ce8e5a · 1 Parent(s): 451ee23

Update app.py

Files changed (1)
  1. app.py +15 -33
app.py CHANGED
@@ -15,12 +15,12 @@ from transformers import (
 )
 
 # PEFT (LoRA / QLoRA)
-from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
-
+from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training, PeftModel
 
 ##############################################################################
 # ZeroGPU + QLoRA Example
 ##############################################################################
+
 TEXT_PIPELINE = None
 NUM_EXAMPLES = 50  # We'll train on 50 lines of WikiText-2 for demonstration
 
@@ -38,16 +38,12 @@ def finetune_small_subset():
     ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
     ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))
 
-    # We'll define tokenize_fn after we have the tokenizer
-
     # --- 2) Setup 4-bit quantization with BitsAndBytes ---
-    # This is QLoRA approach: we load the base model in 4-bit
-    # and attach LoRA adapters for training.
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
-        bnb_4bit_compute_dtype=torch.bfloat16,  # or torch.float16 if preferred
+        bnb_4bit_compute_dtype=torch.bfloat16,  # or torch.float16 if you prefer
         bnb_4bit_use_double_quant=True,
-        bnb_4bit_quant_type="nf4",  # "nf4" is standard for QLoRA
+        bnb_4bit_quant_type="nf4",
     )
 
     config = AutoConfig.from_pretrained(
@@ -61,7 +57,6 @@ def finetune_small_subset():
         trust_remote_code=True
     )
 
-    # Load model in 4-bit
     base_model = AutoModelForCausalLM.from_pretrained(
         "wuhp/myr1",
         subfolder="myr1",
@@ -72,18 +67,16 @@ def finetune_small_subset():
     )
 
     # Prepare the model for k-bit training (QLoRA)
-    # This step disables dropout on some layers, sets up gradients for LN, etc.
     base_model = prepare_model_for_kbit_training(base_model)
 
-    # --- 3) Create LoRA config & wrap the base model in LoRA adapter ---
-    # For LLaMA-like models, "q_proj" and "v_proj" are typical. If your model is different,
-    # adjust target_modules accordingly (maybe "c_attn", "W_pack", "query_key_value", etc.)
+    # --- 3) Create LoRA config & wrap the base model in LoRA ---
+    # Adjust target_modules if your model uses different param names than "q_proj"/"v_proj".
     lora_config = LoraConfig(
         r=16,
         lora_alpha=32,
         lora_dropout=0.05,
         bias="none",
-        target_modules=["q_proj", "v_proj"],  # Adjust if your model uses different layer names
+        target_modules=["q_proj", "v_proj"],
         task_type=TaskType.CAUSAL_LM,
     )
     lora_model = get_peft_model(base_model, lora_config)
@@ -95,7 +88,6 @@ def finetune_small_subset():
     ds = ds.map(tokenize_fn, batched=True, remove_columns=["text"])
     ds.set_format("torch")
 
-    # Data collator
     collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
 
     # Training args
@@ -107,7 +99,7 @@ def finetune_small_subset():
         logging_steps=5,
         save_steps=999999,
         save_total_limit=1,
-        fp16=False,  # We'll rely on bnb_4bit/bfloat16 for the base model
+        fp16=False,  # rely on bfloat16 from quantization
     )
 
     # Trainer
@@ -121,13 +113,11 @@ def finetune_small_subset():
     # --- 5) Train ---
     trainer.train()
 
-    # Save LoRA adapter + tokenizer
-    # The 'save_model' would save only the LoRA adapter if using PEFT
+    # --- 6) Save LoRA adapter + tokenizer ---
     trainer.model.save_pretrained("finetuned_myr1")
     tokenizer.save_pretrained("finetuned_myr1")
 
-    # --- 6) Reload the base model in 4-bit, then merge or apply the LoRA adapter for inference
-    # We'll do the same approach, then load adapter from 'finetuned_myr1'
+    # --- 7) Reload the base model + LoRA adapter for inference
     base_model_2 = AutoModelForCausalLM.from_pretrained(
         "wuhp/myr1",
         subfolder="myr1",
@@ -138,17 +128,12 @@ def finetune_small_subset():
     )
     base_model_2 = prepare_model_for_kbit_training(base_model_2)
 
-    # Re-inject LoRA
-    # If your LoRA was saved in the same folder, you can do:
-    # from peft import PeftModel
-    # lora_model_2 = PeftModel.from_pretrained(base_model_2, "finetuned_myr1")
-    # or you can do get_peft_model and pass the weights, etc.
-
-    # But we can reuse 'get_peft_model' + load the LoRA weights
-    lora_model_2 = get_peft_model(base_model_2, lora_config)
-    lora_model_2.load_adapter("finetuned_myr1")
+    # Instead of load_adapter(...), we use PeftModel.from_pretrained
+    lora_model_2 = PeftModel.from_pretrained(
+        base_model_2,
+        "finetuned_myr1",
+    )
 
-    # Create pipeline
     global TEXT_PIPELINE
     TEXT_PIPELINE = pipeline("text-generation", model=lora_model_2, tokenizer=tokenizer)
 
@@ -162,7 +147,6 @@ def ensure_pipeline():
     """
     global TEXT_PIPELINE
     if TEXT_PIPELINE is None:
-        # Just load base model in 4-bit
         bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
             bnb_4bit_compute_dtype=torch.bfloat16,
@@ -182,7 +166,6 @@ def ensure_pipeline():
         TEXT_PIPELINE = pipeline("text-generation", model=base_model, tokenizer=tokenizer)
     return TEXT_PIPELINE
 
-
 @spaces.GPU(duration=120)  # up to 2 min for text generation
 def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     """
@@ -199,7 +182,6 @@ def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     )
     return out[0]["generated_text"]
 
-
# Build Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("## ZeroGPU QLoRA Example for wuhp/myr1")
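
For reference, the adapter-reload path that this commit switches to (PeftModel.from_pretrained instead of get_peft_model + load_adapter) amounts to roughly the sketch below. It is a minimal standalone version, assuming the adapter was saved to "finetuned_myr1" as in the diff; the tokenizer loading and the device_map="auto" placement are assumptions, since those lines sit outside the visible diff context.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel

# 4-bit (QLoRA-style) quantization config, matching the values used in the diff
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# Reload the base model in 4-bit (device_map="auto" is an assumption here)
base_model = AutoModelForCausalLM.from_pretrained(
    "wuhp/myr1",
    subfolder="myr1",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("wuhp/myr1", subfolder="myr1", trust_remote_code=True)

# Attach the saved LoRA adapter; this is what replaces the old
# get_peft_model(...) + load_adapter("finetuned_myr1") pair.
model = PeftModel.from_pretrained(base_model, "finetuned_myr1")

# Text-generation pipeline over the adapted model, as in the Space
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(generator("Hello", max_new_tokens=20)[0]["generated_text"])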