Hjgugugjhuhjggg committed on
Commit e77c20c · verified · 1 Parent(s): 78f7e86

Update app.py

Files changed (1)
app.py +91 -113
app.py CHANGED
@@ -17,8 +17,8 @@ from transformers import pipeline
 import json
 from huggingface_hub import login
 import base64
-import io
-from PIL import Image
+from botocore.exceptions import NoCredentialsError
+
 
 AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
 AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
@@ -26,23 +26,17 @@ AWS_REGION = os.getenv("AWS_REGION")
 S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
 HUGGINGFACE_HUB_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")
 
+
 if HUGGINGFACE_HUB_TOKEN:
-    login(token=HUGGINGFACE_HUB_TOKEN, add_to_git_credential=False)
+    login(token=HUGGINGFACE_HUB_TOKEN,
+          add_to_git_credential=False)
 
-s3_client = boto3.client(
-    "s3",
-    aws_access_key_id=AWS_ACCESS_KEY_ID,
-    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
-    region_name=AWS_REGION,
-)
+s3_client = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY_ID,
+                         aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+                         region_name=AWS_REGION)
 
 app = FastAPI()
 
-# Global variables for tokenizer tokens
-EOS_TOKEN_ID = None
-PAD_TOKEN_ID = None
-
-
 class GenerateRequest(BaseModel):
     model_name: str
     input_text: str = ""
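The commit later catches `NoCredentialsError` as a cache miss in `S3ModelLoader`, so broken AWS configuration silently falls through to the Hub on every request. A fail-fast startup probe makes that visible; this is a sketch, not part of the commit, reusing the `s3_client` and `S3_BUCKET_NAME` defined above (`check_s3_access` is a hypothetical helper):

```python
from botocore.exceptions import ClientError, NoCredentialsError

def check_s3_access(s3_client, bucket_name):
    """Probe the bucket once at startup so credential problems surface early."""
    try:
        s3_client.head_bucket(Bucket=bucket_name)  # metadata-only request
        return True
    except (ClientError, NoCredentialsError):
        return False
```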
@@ -71,39 +65,37 @@ class GenerateRequest(BaseModel):
         raise ValueError(f"task_type must be one of: {valid_types}")
         return v
 
-
 class S3ModelLoader:
     def __init__(self, bucket_name, s3_client):
         self.bucket_name = bucket_name
         self.s3_client = s3_client
 
     def _get_s3_uri(self, model_name):
-        return f"s3://{self.bucket_name}/" \
+        return f"s3://{self.bucket_name}/" \
            f"{model_name.replace('/', '-')}"
 
     async def load_model_and_tokenizer(self, model_name):
-        global EOS_TOKEN_ID, PAD_TOKEN_ID
         s3_uri = self._get_s3_uri(model_name)
         try:
             config = AutoConfig.from_pretrained(
                 s3_uri, local_files_only=False
             )
+
             model = AutoModelForCausalLM.from_pretrained(
                 s3_uri, config=config, local_files_only=False
             )
+
             tokenizer = AutoTokenizer.from_pretrained(
                 s3_uri, config=config, local_files_only=False
             )
 
-            EOS_TOKEN_ID = tokenizer.eos_token_id
-            PAD_TOKEN_ID = tokenizer.pad_token_id
-
-            if EOS_TOKEN_ID is not None and PAD_TOKEN_ID is None:
-                PAD_TOKEN_ID = config.pad_token_id or EOS_TOKEN_ID
-                tokenizer.pad_token_id = PAD_TOKEN_ID
+            if tokenizer.eos_token_id is not None and \
+                    tokenizer.pad_token_id is None:
+                tokenizer.pad_token_id = config.pad_token_id \
+                    or tokenizer.eos_token_id
 
             return model, tokenizer
-        except EnvironmentError:
+        except (EnvironmentError, NoCredentialsError):
             try:
                 config = AutoConfig.from_pretrained(
                     model_name, token=HUGGINGFACE_HUB_TOKEN
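The inline pad-token fallback replaces the old `EOS_TOKEN_ID`/`PAD_TOKEN_ID` globals, which were shared mutable state across requests. The reason the fallback exists: many causal LMs define an EOS token but no pad token, and padding is needed once `generate()` pads batched inputs. A standalone sketch of the same logic (`gpt2` is just an illustrative model):

```python
from transformers import AutoConfig, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
config = AutoConfig.from_pretrained("gpt2")

print(tokenizer.pad_token_id)  # None -- GPT-2 ships without a pad token
print(tokenizer.eos_token_id)  # 50256

# The same fallback the loader applies: prefer the config's pad id, else EOS.
if tokenizer.eos_token_id is not None and tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = config.pad_token_id or tokenizer.eos_token_id

print(tokenizer.pad_token_id)  # 50256
```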
@@ -111,29 +103,42 @@ class S3ModelLoader:
             tokenizer = AutoTokenizer.from_pretrained(
                 model_name, config=config, token=HUGGINGFACE_HUB_TOKEN
             )
-
+
             model = AutoModelForCausalLM.from_pretrained(
                 model_name, config=config, token=HUGGINGFACE_HUB_TOKEN
             )
-            EOS_TOKEN_ID = tokenizer.eos_token_id
-            PAD_TOKEN_ID = tokenizer.pad_token_id
 
-            if EOS_TOKEN_ID is not None and PAD_TOKEN_ID is None:
-                PAD_TOKEN_ID = config.pad_token_id or EOS_TOKEN_ID
-                tokenizer.pad_token_id = PAD_TOKEN_ID
 
+            if tokenizer.eos_token_id is not None and \
+                    tokenizer.pad_token_id is None:
+                tokenizer.pad_token_id = config.pad_token_id \
+                    or tokenizer.eos_token_id
+
+
             model.save_pretrained(s3_uri)
             tokenizer.save_pretrained(s3_uri)
+
+
+            config = AutoConfig.from_pretrained(
+                s3_uri, local_files_only=False
+            )
+
+            model = AutoModelForCausalLM.from_pretrained(
+                s3_uri, config=config, local_files_only=False
+            )
+
+            tokenizer = AutoTokenizer.from_pretrained(
+                s3_uri, config=config, local_files_only=False
+            )
+
             return model, tokenizer
         except Exception as e:
             raise HTTPException(
                 status_code=500, detail=f"Error loading model: {e}"
             )
 
-
 model_loader = S3ModelLoader(S3_BUCKET_NAME, s3_client)
 
-
 @app.post("/generate")
 async def generate(request: GenerateRequest):
     try:
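The Hub-fallback branch now re-saves and immediately reloads the model from the S3 URI, so both paths return artifacts from the same source. One caveat worth flagging: as far as I know, `from_pretrained`/`save_pretrained` accept local paths and Hub repo ids but do not resolve `s3://` URIs themselves, so this round trip likely needs an explicit transfer step. A sketch of that pattern under those assumptions (`load_from_s3` is hypothetical, not part of the commit):

```python
import os
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_from_s3(s3_client, bucket, prefix, local_dir="/tmp/model-cache"):
    """Mirror the objects under an S3 prefix locally, then load from disk."""
    os.makedirs(local_dir, exist_ok=True)
    paginator = s3_client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            target = os.path.join(local_dir, os.path.basename(obj["Key"]))
            s3_client.download_file(bucket, obj["Key"], target)
    model = AutoModelForCausalLM.from_pretrained(local_dir)
    tokenizer = AutoTokenizer.from_pretrained(local_dir)
    return model, tokenizer
```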
@@ -141,7 +146,7 @@ async def generate(request: GenerateRequest):
         input_text = request.input_text
         task_type = request.task_type
         temperature = request.temperature
-        max_new_tokens = request.max_new_tokens
+        max_new_tokens = request.max_new_tokens  # This value will be used to constrain the output
         stream = request.stream
         top_p = request.top_p
         top_k = request.top_k
@@ -153,7 +158,7 @@ async def generate(request: GenerateRequest):
         model, tokenizer = await model_loader.load_model_and_tokenizer(model_name)
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model.to(device)
-
+
         if "text-to-text" == task_type:
             generation_config = GenerationConfig(
                 temperature=temperature,
@@ -166,21 +171,13 @@ async def generate(request: GenerateRequest):
             )
 
             return StreamingResponse(
-                stream_text(
-                    model,
-                    tokenizer,
-                    input_text,
-                    generation_config,
-                    stop_sequences,
-                    device,
-                    max_length=10,
-                ),
-                media_type="text/plain",
+                stream_text(model, tokenizer, input_text,
+                            generation_config, stop_sequences,
+                            device, max_length=10),
+                media_type="text/plain"
             )
         else:
-            raise HTTPException(
-                status_code=400, detail="Task type not text-to-text"
-            )
+            return HTTPException(status_code=400, detail="Task type not text-to-text")
 
     except Exception as e:
         raise HTTPException(
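Two observations on this hunk. First, the new `return HTTPException(...)` hands FastAPI an object to serialize into a 200 response; only `raise HTTPException(...)` (as the removed lines did) produces an actual 400. Second, the endpoint streams newline-delimited JSON, which a client consumes line by line; a sketch, assuming the server from this file on its default port and the fields defined in `GenerateRequest`:

```python
import json
import requests

payload = {
    "model_name": "gpt2",  # illustrative model
    "input_text": "Hello",
    "task_type": "text-to-text",
}
with requests.post("http://localhost:7860/generate",
                   json=payload, stream=True) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():  # one JSON object per line
        if not line:
            continue
        event = json.loads(line)
        if event["is_end"]:
            break
        print(event["text"], end=" ")
```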
@@ -188,11 +185,12 @@ async def generate(request: GenerateRequest):
         )
 
 
-async def stream_text(
-    model, tokenizer, input_text, generation_config, stop_sequences, device, max_length
-):
+async def stream_text(model, tokenizer, input_text,
+                      generation_config, stop_sequences,
+                      device, max_length):
     encoded_input = tokenizer(
-        input_text, return_tensors="pt", truncation=True, max_length=max_length
+        input_text, return_tensors="pt",
+        truncation=True, max_length=max_length
     ).to(device)
     input_length = encoded_input["input_ids"].shape[1]
     remaining_tokens = max_length - input_length
@@ -203,12 +201,14 @@ async def stream_text(
         generation_config.max_new_tokens = min(
             remaining_tokens, generation_config.max_new_tokens
         )
+
 
     def find_stop(output_text, stop_sequences):
         for seq in stop_sequences:
            if seq in output_text:
                last_index = output_text.rfind(seq)
                return last_index + len(seq)
+
        return -1
 
    output_text = ""
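`find_stop` returns the cut position just past the last occurrence of any stop sequence, or -1 when none matches, so the stop text itself is kept in the final output. Its contract as a couple of worked cases (assuming the definition above):

```python
text = "step one. step two. END trailing"

assert find_stop(text, ["END"]) == 23      # index just past "END"
assert text[:23].endswith("END")           # the stop sequence is retained
assert find_stop(text, ["MISSING"]) == -1  # no stop sequence present
```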
@@ -229,7 +229,7 @@ async def stream_text(
 
         new_text = tokenizer.decode(
             outputs.sequences[0][len(encoded_input["input_ids"][0]):],
-            skip_special_tokens=True,
+            skip_special_tokens=True
         )
 
         output_text += new_text
@@ -238,39 +238,33 @@ async def stream_text(
 
         if stop_index != -1:
             final_output = output_text[:stop_index]
-            chunked_output = [
-                final_output[i: i + 10] for i in range(0, len(final_output), 10)
-            ]
-
-            for chunk in chunked_output:
-                yield json.dumps({"text": chunk, "is_end": False}) + "\n"
 
+            for chunk in [final_output[i:i+10] for i in range(0,len(final_output),10)]:
+                for text in chunk.split():
+                    yield json.dumps({"text": text, "is_end": False}) + "\n"
             yield json.dumps({"text": "", "is_end": True}) + "\n"
             break
 
         else:
-            chunked_output = [
-                new_text[i: i + 10] for i in range(0, len(new_text), 10)
-            ]
+            for chunk in [new_text[i:i+10] for i in range(0, len(new_text), 10)]:
+                for text in chunk.split():
+                    yield json.dumps({"text": text, "is_end": False}) + "\n"
 
-            for chunk in chunked_output:
-                yield json.dumps({"text": chunk, "is_end": False}) + "\n"
 
         if len(output_text) >= generation_config.max_new_tokens:
-            chunked_output = [
-                output_text[i: i + 10] for i in range(0, len(output_text), 10)
-            ]
 
-            for chunk in chunked_output:
-                yield json.dumps({"text": chunk, "is_end": False}) + "\n"
+            for chunk in [output_text[i:i+10] for i in range(0, len(output_text), 10)]:
+                for text in chunk.split():
+                    yield json.dumps({"text": text, "is_end": False}) + "\n"
+
             yield json.dumps({"text": "", "is_end": True}) + "\n"
             break
 
         encoded_input = tokenizer(
-            output_text, return_tensors="pt", truncation=True, max_length=max_length
+            output_text, return_tensors="pt",
+            truncation=True, max_length=max_length
        ).to(device)
 
-
 @app.post("/generate-image")
 async def generate_image(request: GenerateRequest):
     try:
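The rewritten emit loops yield whitespace-split words, so the exact spacing and any newlines in `new_text` are not preserved on the wire, and words spanning a 10-character chunk boundary are cut in two; the receiving side can only approximate the original string (the client sketch above re-joins with single spaces). There is also a unit mismatch in the exit check: `len(output_text)` counts characters while `max_new_tokens` counts tokens. A minimal demonstration of the chunking loss:

```python
new_text = "Hello,\n  world of  streams"

emitted = []
for chunk in [new_text[i:i+10] for i in range(0, len(new_text), 10)]:
    for text in chunk.split():
        emitted.append(text)

print(emitted)            # ['Hello,', 'w', 'orld', 'of', 's', 'treams']
print(" ".join(emitted))  # 'Hello, w orld of s treams' -- != new_text
```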
@@ -278,27 +272,19 @@ async def generate_image(request: GenerateRequest):
         device = "cuda" if torch.cuda.is_available() else "cpu"
 
         image_generator = pipeline(
-            "text-to-image", model=validated_body.model_name, device=device
+            "text-to-image", model=validated_body.model_name,
+            device=device
         )
         image = image_generator(validated_body.input_text)[0]
-
-        async def stream_image():
-            buffered = io.BytesIO()
-            image.save(buffered, format="PNG")
-            image_bytes = buffered.getvalue()
-            image_base64 = base64.b64encode(image_bytes).decode("utf-8")
-            chunk_size = 1000
-            for i in range(0, len(image_base64), chunk_size):
-                chunk = image_base64[i: i + chunk_size]
-                yield json.dumps({"image": chunk, "is_end": False}) + "\n"
-
-            yield json.dumps({"image": "", "is_end": True}) + "\n"
-
-        return StreamingResponse(stream_image(), media_type="text/plain")
+
+        image_data = list(image.getdata())
+
+        return json.dumps({"image_data": image_data, "is_end": True})
 
     except Exception as e:
         raise HTTPException(
-            status_code=500, detail=f"Internal server error: {str(e)}"
+            status_code=500,
+            detail=f"Internal server error: {str(e)}"
         )
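Replacing the streamed base64 PNG with `list(image.getdata())` inlines every pixel as JSON numbers, typically far larger than the encoded PNG, and returning `json.dumps(...)` from a FastAPI handler gets JSON-encoded a second time, so clients receive a quoted string they must parse twice. (Separately, to my knowledge `pipeline()` in transformers does not register a "text-to-image" task, so this endpoint may fail before reaching this point.) A size sketch under stated assumptions — a synthetic 512x512 image, not code from the commit:

```python
import base64
import io
import json
from PIL import Image

image = Image.new("RGB", (512, 512), color=(128, 64, 32))

pixel_json = json.dumps({"image_data": list(image.getdata())})

buffered = io.BytesIO()
image.save(buffered, format="PNG")
png_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")

print(len(pixel_json))  # several million characters, even for this flat image
print(len(png_b64))     # a few thousand characters
```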
 
@@ -309,25 +295,22 @@ async def generate_text_to_speech(request: GenerateRequest):
         device = "cuda" if torch.cuda.is_available() else "cpu"
 
         audio_generator = pipeline(
-            "text-to-speech", model=validated_body.model_name, device=device
+            "text-to-speech", model=validated_body.model_name,
+            device=device
         )
         audio = audio_generator(validated_body.input_text)
-        audio_bytes = audio["audio"]
 
-        async def stream_audio():
-            audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
-            chunk_size = 1000
-            for i in range(0, len(audio_base64), chunk_size):
-                chunk = audio_base64[i: i + chunk_size]
-                yield json.dumps({"audio": chunk, "is_end": False}) + "\n"
-
-            yield json.dumps({"audio": "", "is_end": True}) + "\n"
-
-        return StreamingResponse(stream_audio(), media_type="text/plain")
+
+        audio_bytes = audio["audio"]
+
+        audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+
+        return json.dumps({"audio": audio_base64, "is_end": True})
 
     except Exception as e:
         raise HTTPException(
-            status_code=500, detail=f"Internal server error: {str(e)}"
+            status_code=500,
+            detail=f"Internal server error: {str(e)}"
         )
 
@@ -337,26 +320,21 @@ async def generate_video(request: GenerateRequest):
         validated_body = request
         device = "cuda" if torch.cuda.is_available() else "cpu"
         video_generator = pipeline(
-            "text-to-video", model=validated_body.model_name, device=device
+            "text-to-video", model=validated_body.model_name,
+            device=device
         )
         video = video_generator(validated_body.input_text)
-
-        async def stream_video():
-            video_base64 = base64.b64encode(video).decode("utf-8")
-            chunk_size = 1000
-            for i in range(0, len(video_base64), chunk_size):
-                chunk = video_base64[i: i + chunk_size]
-                yield json.dumps({"video": chunk, "is_end": False}) + "\n"
-
-            yield json.dumps({"video": "", "is_end": True}) + "\n"
-        return StreamingResponse(stream_video(), media_type="text/plain")
-
+
+
+        video_base64 = base64.b64encode(video).decode('utf-8')
+
+        return json.dumps({"video": video_base64, "is_end": True})
 
     except Exception as e:
         raise HTTPException(
-            status_code=500, detail=f"Internal server error: {str(e)}"
+            status_code=500,
+            detail=f"Internal server error: {str(e)}"
         )
 
-
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)
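The audio and video endpoints get the same single-JSON treatment, including the double-encoding caveat above. A transformers text-to-speech pipeline returns a dict with a NumPy float array plus a `sampling_rate`, which this response drops, so a client must know the dtype and rate out of band to reconstruct the waveform; and `base64.b64encode(video)` assumes the pipeline returned a bytes-like object ("text-to-video" is not, as far as I can tell, a registered transformers pipeline task). A hypothetical client-side decode under those assumptions:

```python
import base64
import json

import numpy as np

def decode_audio(response_body: str, sampling_rate: int = 16000):
    """Rebuild the waveform, assuming float32 samples and a known rate."""
    payload = json.loads(response_body)
    raw = base64.b64decode(payload["audio"])
    samples = np.frombuffer(raw, dtype=np.float32)  # dtype assumed, not sent
    return samples, sampling_rate
```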
 