Hjgugugjhuhjggg committed on
Commit 4bb04f8 · verified · 1 Parent(s): 99136f3

Update app.py

Files changed (1): app.py +114 -32
app.py CHANGED
@@ -9,6 +9,8 @@ from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
     GenerationConfig,
+    StoppingCriteria,
+    StoppingCriteriaList
 )
 import boto3
 import uvicorn
@@ -43,7 +45,7 @@ class GenerateRequest(BaseModel):
     input_text: str = ""
     task_type: str
     temperature: float = 1.0
-    max_new_tokens: int = 3
+    max_new_tokens: int = 3
     stream: bool = True
     top_p: float = 1.0
     top_k: int = 50
@@ -92,33 +94,44 @@ class S3ModelLoader:
             )
 
             tokenizer = AutoTokenizer.from_pretrained(
-                s3_uri, config=config, local_files_only=False
+                s3_uri, config=config, local_files_only=False, padding_side="left"
             )
 
-            if tokenizer.eos_token_id is not None and \
-                    tokenizer.pad_token_id is None:
-                tokenizer.pad_token_id = config.pad_token_id \
-                    or tokenizer.eos_token_id
-            model_cache[model_name] = (model, tokenizer)
-            return model, tokenizer
+            eos_token_id = tokenizer.eos_token_id
+            pad_token_id = tokenizer.pad_token_id
+            eos_token = tokenizer.eos_token
+            pad_token = tokenizer.pad_token
+            padding = tokenizer.padding_side
+
+            if eos_token_id is not None and pad_token_id is None:
+                pad_token_id = config.pad_token_id or eos_token_id
+                tokenizer.pad_token_id = pad_token_id
+
+            model_cache[model_name] = (model, tokenizer, eos_token_id,
+                                       pad_token_id, eos_token, pad_token, padding)
+            return model, tokenizer, eos_token_id, pad_token_id, eos_token, pad_token, padding
         except (EnvironmentError, NoCredentialsError):
             try:
                 config = AutoConfig.from_pretrained(
                     model_name, token=HUGGINGFACE_HUB_TOKEN
                 )
                 tokenizer = AutoTokenizer.from_pretrained(
-                    model_name, config=config, token=HUGGINGFACE_HUB_TOKEN
+                    model_name, config=config, token=HUGGINGFACE_HUB_TOKEN, padding_side="left"
                 )
 
                 model = AutoModelForCausalLM.from_pretrained(
                     model_name, config=config, token=HUGGINGFACE_HUB_TOKEN
                 )
 
-                if tokenizer.eos_token_id is not None and \
-                        tokenizer.pad_token_id is None:
-                    tokenizer.pad_token_id = config.pad_token_id \
-                        or tokenizer.eos_token_id
+                eos_token_id = tokenizer.eos_token_id
+                pad_token_id = tokenizer.pad_token_id
+                eos_token = tokenizer.eos_token
+                pad_token = tokenizer.pad_token
+                padding = tokenizer.padding_side
+
+                if eos_token_id is not None and pad_token_id is None:
+                    pad_token_id = config.pad_token_id or eos_token_id
+                    tokenizer.pad_token_id = pad_token_id
 
                 model.save_pretrained(s3_uri)
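Reviewer note: the EOS-to-PAD fallback is now copy-pasted into each of the three load branches. A minimal sketch of factoring it into one helper; the name `resolve_pad_token_id` is hypothetical, not part of this commit:

```python
# Hypothetical helper (not in this commit): one home for the pad-token
# fallback that the diff repeats in every load branch.
def resolve_pad_token_id(tokenizer, config):
    # A tokenizer that defines an EOS token but no PAD token gets the
    # config's pad_token_id, or failing that the EOS id, as its PAD id.
    if tokenizer.eos_token_id is not None and tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = config.pad_token_id or tokenizer.eos_token_id
    return tokenizer.pad_token_id
```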
@@ -134,10 +147,22 @@ class S3ModelLoader:
                 )
 
                 tokenizer = AutoTokenizer.from_pretrained(
-                    s3_uri, config=config, local_files_only=False
+                    s3_uri, config=config, local_files_only=False, padding_side="left"
                 )
-                model_cache[model_name] = (model, tokenizer)
-                return model, tokenizer
+
+                eos_token_id = tokenizer.eos_token_id
+                pad_token_id = tokenizer.pad_token_id
+                eos_token = tokenizer.eos_token
+                pad_token = tokenizer.pad_token
+                padding = tokenizer.padding_side
+
+                if eos_token_id is not None and pad_token_id is None:
+                    pad_token_id = config.pad_token_id or eos_token_id
+                    tokenizer.pad_token_id = pad_token_id
+
+                model_cache[model_name] = (model, tokenizer, eos_token_id,
+                                           pad_token_id, eos_token, pad_token, padding)
+                return model, tokenizer, eos_token_id, pad_token_id, eos_token, pad_token, padding
             except Exception as e:
                 raise HTTPException(
                     status_code=500, detail=f"Error loading model: {e}"
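Reviewer note: `load_model_and_tokenizer` now returns a seven-element tuple that every caller must unpack positionally. A hedged sketch of a small container that would keep call sites readable; `LoadedModel` is hypothetical, not in the diff:

```python
# Hypothetical refactor (not in this commit): name the seven values
# that load_model_and_tokenizer() currently returns as a bare tuple.
from dataclasses import dataclass
from typing import Any, Optional

@dataclass
class LoadedModel:
    model: Any
    tokenizer: Any
    eos_token_id: Optional[int]
    pad_token_id: Optional[int]
    eos_token: Optional[str]
    pad_token: Optional[str]
    padding_side: str
```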
@@ -145,6 +170,31 @@ class S3ModelLoader:
 
 model_loader = S3ModelLoader(S3_BUCKET_NAME, s3_client)
 
+class StopOnSequencesCriteria(StoppingCriteria):
+    def __init__(self, stop_sequences, tokenizer):
+        self.stop_sequences = stop_sequences
+        self.tokenizer = tokenizer
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        decoded_text = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
+        for seq in self.stop_sequences:
+            if seq in decoded_text:
+                return True
+        return False
+
+async def generate_stream(model, tokenizer, input_text,
+                          generation_config, stop_sequences,
+                          device, pad_token_id, max_model_length,
+                          max_new_tokens):
+    async def stream():
+        async for token in stream_text(model, tokenizer, input_text,
+                                       generation_config, stop_sequences,
+                                       device, pad_token_id, max_model_length,
+                                       max_new_tokens):
+            yield token
+    return stream()
+
 @app.post("/generate")
 async def generate(request: GenerateRequest):
     try:
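Reviewer note: `StopOnSequencesCriteria.__call__` decodes the entire sequence on every generation step, so its cost grows quadratically with output length. A sketch of a cheaper variant that decodes only the generated tail; `prompt_len` is an assumption (e.g. `encoded_input["input_ids"].shape[1]`), not something the diff passes in:

```python
from transformers import StoppingCriteria

# Sketch (not the committed class): decode only the tokens generated
# after the prompt, instead of the full sequence, on each step.
class StopOnSequencesTailCriteria(StoppingCriteria):
    def __init__(self, stop_sequences, tokenizer, prompt_len):
        self.stop_sequences = stop_sequences
        self.tokenizer = tokenizer
        self.prompt_len = prompt_len  # number of prompt tokens to skip

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        tail = self.tokenizer.decode(
            input_ids[0][self.prompt_len:], skip_special_tokens=True
        )
        return any(seq in tail for seq in self.stop_sequences)
```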
@@ -161,7 +211,7 @@ async def generate(request: GenerateRequest):
         do_sample = request.do_sample
         stop_sequences = request.stop_sequences
 
-        model, tokenizer = await model_loader.load_model_and_tokenizer(model_name)
+        model, tokenizer, eos_token_id, pad_token_id, eos_token, pad_token, padding = await model_loader.load_model_and_tokenizer(model_name)
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model.to(device)
@@ -174,14 +224,30 @@ async def generate(request: GenerateRequest):
                 repetition_penalty=repetition_penalty,
                 do_sample=do_sample,
                 num_return_sequences=num_return_sequences,
+                pad_token_id=pad_token_id if pad_token_id is not None else None
             )
 
-            return StreamingResponse(
-                stream_text(model, tokenizer, input_text,
-                            generation_config, stop_sequences,
-                            device),
+            max_model_length = model.config.max_position_embeddings
+            input_text = input_text[:max_model_length]
+
+            streams = [
+                generate_stream(model, tokenizer, input_text,
+                                generation_config, stop_sequences,
+                                device, pad_token_id, max_model_length, max_new_tokens)
+                for _ in range(num_return_sequences)
+            ]
+
+            async def stream_response():
+                async for results in asyncio.as_completed(streams):
+                    async for chunk in await results:
+                        yield chunk
+
+            return StreamingResponse(
+                stream_response(),
                 media_type="text/plain"
             )
+
         else:
             return HTTPException(status_code=400, detail="Task type not text-to-text")
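Reviewer note: `stream_response` is unlikely to work as committed. Before Python 3.13, `asyncio.as_completed` returns a plain iterator, so `async for` over it raises `TypeError`; it also expects awaitables that resolve to values, while `streams` holds coroutines resolving to async generators. A minimal working alternative that drains the sequences one after another, using the names from the diff:

```python
# Sketch (not the committed code): each element of `streams` is a
# coroutine that resolves to an async generator, so await it first,
# then iterate the generator it returns.
async def stream_response():
    for stream in streams:
        async for chunk in await stream:
            yield chunk
```

Interleaving the sequences truly concurrently would need something like one task per stream feeding a shared `asyncio.Queue`.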
 
@@ -193,11 +259,13 @@ async def generate(request: GenerateRequest):
 
 async def stream_text(model, tokenizer, input_text,
                       generation_config, stop_sequences,
-                      device):
-
+                      device, pad_token_id, max_model_length, max_new_tokens):
+
     encoded_input = tokenizer(
         input_text, return_tensors="pt",
-        truncation=True
+        truncation=True,
+        padding="max_length",
+        max_length=max_model_length
     ).to(device)
 
     stop_regex = re.compile(r'[\.\?\!\n]+')
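Reviewer note: `padding="max_length"` with `max_length=max_model_length` pads every prompt to the full context window, so each `generate` call pays for a full-length forward pass and has no room left for new tokens when the prompt is already at the limit; the earlier `input_text[:max_model_length]` slice in `generate()` also truncates characters, not tokens. A sketch of token-level truncation that reserves head-room for generation (assuming head-room of `max_new_tokens` is the intent):

```python
# Sketch (not the committed code): truncate at the token level and keep
# room for the tokens generate() will append; no padding is needed for
# a single unbatched prompt.
encoded_input = tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    max_length=max_model_length - max_new_tokens,
).to(device)
```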
@@ -216,6 +284,8 @@ async def stream_text(model, tokenizer, input_text,
 
 
     output_text = ""
+    stop_criteria = StoppingCriteriaList([StopOnSequencesCriteria(stop_sequences, tokenizer)])
+
     while True:
         outputs = model.generate(
             **encoded_input,
@@ -228,8 +298,10 @@ async def stream_text(model, tokenizer, input_text,
             num_return_sequences=generation_config.num_return_sequences,
             output_scores=True,
             return_dict_in_generate=True,
+            pad_token_id=pad_token_id if pad_token_id is not None else None,
+            stopping_criteria=stop_criteria
         )
 
         new_text = tokenizer.decode(
             outputs.sequences[0][len(encoded_input["input_ids"][0]):],
             skip_special_tokens=True
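Reviewer note: here and in the `GenerationConfig` above, `pad_token_id=pad_token_id if pad_token_id is not None else None` is a no-op conditional; both branches pass the same value, so it reduces to:

```python
pad_token_id=pad_token_id
```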
@@ -243,22 +315,32 @@ async def stream_text(model, tokenizer, input_text,
             final_output = output_text[:stop_index]
 
             for text in final_output.split():
-                yield json.dumps({"text": text, "is_end": False}) + "\n"
+                yield json.dumps({"text": text, "is_end": False}) + "\n"
             yield json.dumps({"text": "", "is_end": True}) + "\n"
             break
         else:
-            for text in new_text.split():
-                yield json.dumps({"text": text, "is_end": False}) + "\n"
+            tokens = new_text.split()
+
+            for i in range(0, len(tokens), max_new_tokens):
+                chunk = tokens[i:i + max_new_tokens]
+                chunk_text = " ".join(chunk)
+                for text in chunk_text.split():
+                    yield json.dumps({"text": text, "is_end": False}) + "\n"
 
             if len(new_text) == 0:
                 for text in output_text.split():
-                    yield json.dumps({"text": text, "is_end": False}) + "\n"
+                    yield json.dumps({"text": text, "is_end": False}) + "\n"
                 yield json.dumps({"text": "", "is_end": True}) + "\n"
                 break
 
             encoded_input = tokenizer(
-                output_text, return_tensors="pt",
-                truncation=True
+                output_text, return_tensors="pt",
+                truncation=True,
+                padding="max_length",
+                max_length=max_model_length
             ).to(device)
             output_text = ""
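Reviewer note: with these changes the endpoint streams newline-delimited JSON objects of the form `{"text": ..., "is_end": ...}`. (Note also that the new chunking loop in `stream_text` splits `chunk_text` back into single words before yielding, so the `max_new_tokens` grouping has no effect on the wire format.) A hedged client sketch; the host, port, and the `model_name` field are assumptions, not shown in this diff:

```python
# Hypothetical client for the /generate endpoint (assumed to run on
# localhost:8000); reads the NDJSON stream emitted by stream_text().
import json
import requests

with requests.post(
    "http://localhost:8000/generate",
    json={
        "model_name": "gpt2",        # assumed field, not shown in the diff
        "input_text": "Hello",
        "task_type": "text-to-text",
    },
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if not line:
            continue
        event = json.loads(line)
        if event["is_end"]:
            break
        print(event["text"], end=" ")
```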