Hjgugugjhuhjggg committed on
Commit
40aabaa
·
verified ·
1 Parent(s): b7a38a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -23
app.py CHANGED
@@ -18,6 +18,7 @@ import json
18
  from huggingface_hub import login
19
  import base64
20
  from botocore.exceptions import NoCredentialsError
 
21
 
22
 
23
  AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
@@ -93,13 +94,20 @@ class S3ModelLoader:
93
  tokenizer = AutoTokenizer.from_pretrained(
94
  s3_uri, config=config, local_files_only=False
95
  )
 
 
 
 
 
 
 
 
 
 
96
 
97
- if tokenizer.eos_token_id is not None and \
98
- tokenizer.pad_token_id is None:
99
- tokenizer.pad_token_id = config.pad_token_id \
100
- or tokenizer.eos_token_id
101
- model_cache[model_name] = (model, tokenizer)
102
- return model, tokenizer
103
  except (EnvironmentError, NoCredentialsError):
104
  try:
105
  config = AutoConfig.from_pretrained(
@@ -112,12 +120,16 @@ class S3ModelLoader:
112
  model = AutoModelForCausalLM.from_pretrained(
113
  model_name, config=config, token=HUGGINGFACE_HUB_TOKEN
114
  )
 
 
 
 
 
 
115
 
116
-
117
- if tokenizer.eos_token_id is not None and \
118
- tokenizer.pad_token_id is None:
119
- tokenizer.pad_token_id = config.pad_token_id \
120
- or tokenizer.eos_token_id
121
 
122
 
123
  model.save_pretrained(s3_uri)
@@ -135,8 +147,20 @@ class S3ModelLoader:
135
  tokenizer = AutoTokenizer.from_pretrained(
136
  s3_uri, config=config, local_files_only=False
137
  )
138
- model_cache[model_name] = (model, tokenizer)
139
- return model, tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
140
  except Exception as e:
141
  raise HTTPException(
142
  status_code=500, detail=f"Error loading model: {e}"
@@ -160,7 +184,7 @@ async def generate(request: GenerateRequest):
160
  do_sample = request.do_sample
161
  stop_sequences = request.stop_sequences
162
 
163
- model, tokenizer = await model_loader.load_model_and_tokenizer(model_name)
164
  device = "cuda" if torch.cuda.is_available() else "cpu"
165
  model.to(device)
166
 
@@ -178,7 +202,7 @@ async def generate(request: GenerateRequest):
178
  return StreamingResponse(
179
  stream_text(model, tokenizer, input_text,
180
  generation_config, stop_sequences,
181
- device),
182
  media_type="text/plain"
183
  )
184
  else:
@@ -192,22 +216,28 @@ async def generate(request: GenerateRequest):
192
 
193
  async def stream_text(model, tokenizer, input_text,
194
  generation_config, stop_sequences,
195
- device):
196
 
197
  encoded_input = tokenizer(
198
  input_text, return_tensors="pt",
199
  truncation=True
200
  ).to(device)
201
-
202
-
 
203
  def find_stop(output_text, stop_sequences):
204
  for seq in stop_sequences:
205
- if seq in output_text:
206
- last_index = output_text.rfind(seq)
207
- return last_index + len(seq)
208
-
 
 
 
 
209
  return -1
210
 
 
211
  output_text = ""
212
  while True:
213
  outputs = model.generate(
@@ -221,6 +251,7 @@ async def stream_text(model, tokenizer, input_text,
221
  num_return_sequences=generation_config.num_return_sequences,
222
  output_scores=True,
223
  return_dict_in_generate=True,
 
224
  )
225
 
226
  new_text = tokenizer.decode(
@@ -251,7 +282,8 @@ async def stream_text(model, tokenizer, input_text,
251
 
252
  encoded_input = tokenizer(
253
  output_text, return_tensors="pt",
254
- truncation=True
 
255
  ).to(device)
256
  output_text = ""
257
 
 
18
  from huggingface_hub import login
19
  import base64
20
  from botocore.exceptions import NoCredentialsError
21
+ import re
22
 
23
 
24
  AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
 
94
  tokenizer = AutoTokenizer.from_pretrained(
95
  s3_uri, config=config, local_files_only=False
96
  )
97
+
98
+ eos_token_id = tokenizer.eos_token_id
99
+ pad_token_id = tokenizer.pad_token_id
100
+ eos_token = tokenizer.eos_token
101
+ pad_token = tokenizer.pad_token
102
+ padding = tokenizer.padding_side
103
+
104
+ if eos_token_id is not None and pad_token_id is None:
105
+ pad_token_id = config.pad_token_id or eos_token_id
106
+ tokenizer.pad_token_id = pad_token_id
107
 
108
+ model_cache[model_name] = (model, tokenizer,eos_token_id,
109
+ pad_token_id,eos_token,pad_token,padding)
110
+ return model, tokenizer,eos_token_id,pad_token_id,eos_token,pad_token,padding
 
 
 
111
  except (EnvironmentError, NoCredentialsError):
112
  try:
113
  config = AutoConfig.from_pretrained(
 
120
  model = AutoModelForCausalLM.from_pretrained(
121
  model_name, config=config, token=HUGGINGFACE_HUB_TOKEN
122
  )
123
+
124
+ eos_token_id = tokenizer.eos_token_id
125
+ pad_token_id = tokenizer.pad_token_id
126
+ eos_token = tokenizer.eos_token
127
+ pad_token = tokenizer.pad_token
128
+ padding = tokenizer.padding_side
129
 
130
+ if eos_token_id is not None and pad_token_id is None:
131
+ pad_token_id = config.pad_token_id or eos_token_id
132
+ tokenizer.pad_token_id = pad_token_id
 
 
133
 
134
 
135
  model.save_pretrained(s3_uri)
 
147
  tokenizer = AutoTokenizer.from_pretrained(
148
  s3_uri, config=config, local_files_only=False
149
  )
150
+
151
+ eos_token_id = tokenizer.eos_token_id
152
+ pad_token_id = tokenizer.pad_token_id
153
+ eos_token = tokenizer.eos_token
154
+ pad_token = tokenizer.pad_token
155
+ padding = tokenizer.padding_side
156
+
157
+ if eos_token_id is not None and pad_token_id is None:
158
+ pad_token_id = config.pad_token_id or eos_token_id
159
+ tokenizer.pad_token_id = pad_token_id
160
+
161
+ model_cache[model_name] = (model, tokenizer,eos_token_id,
162
+ pad_token_id,eos_token,pad_token,padding)
163
+ return model, tokenizer,eos_token_id,pad_token_id,eos_token,pad_token,padding
164
  except Exception as e:
165
  raise HTTPException(
166
  status_code=500, detail=f"Error loading model: {e}"
 
184
  do_sample = request.do_sample
185
  stop_sequences = request.stop_sequences
186
 
187
+ model, tokenizer, eos_token_id, pad_token_id, eos_token, pad_token, padding = await model_loader.load_model_and_tokenizer(model_name)
188
  device = "cuda" if torch.cuda.is_available() else "cpu"
189
  model.to(device)
190
 
 
202
  return StreamingResponse(
203
  stream_text(model, tokenizer, input_text,
204
  generation_config, stop_sequences,
205
+ device,pad_token_id),
206
  media_type="text/plain"
207
  )
208
  else:
 
216
 
217
  async def stream_text(model, tokenizer, input_text,
218
  generation_config, stop_sequences,
219
+ device,pad_token_id):
220
 
221
  encoded_input = tokenizer(
222
  input_text, return_tensors="pt",
223
  truncation=True
224
  ).to(device)
225
+
226
+ stop_regex = re.compile(r'[\.\?\!\n]+')
227
+
228
  def find_stop(output_text, stop_sequences):
229
  for seq in stop_sequences:
230
+ if seq in output_text:
231
+ last_index = output_text.rfind(seq)
232
+ return last_index + len(seq)
233
+
234
+ match = stop_regex.search(output_text)
235
+ if match:
236
+ return match.end()
237
+
238
  return -1
239
 
240
+
241
  output_text = ""
242
  while True:
243
  outputs = model.generate(
 
251
  num_return_sequences=generation_config.num_return_sequences,
252
  output_scores=True,
253
  return_dict_in_generate=True,
254
+ pad_token_id=pad_token_id
255
  )
256
 
257
  new_text = tokenizer.decode(
 
282
 
283
  encoded_input = tokenizer(
284
  output_text, return_tensors="pt",
285
+ truncation=True,
286
+ padding = "max_length" if pad_token_id is not None else False
287
  ).to(device)
288
  output_text = ""
289