Hjgugugjhuhjggg committed on
Commit
66c68f4
·
verified ·
1 Parent(s): d05ede6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -82
app.py CHANGED
@@ -10,29 +10,23 @@ from transformers import (
10
  GenerationConfig,
11
  StoppingCriteria,
12
  StoppingCriteriaList,
 
13
  )
14
- import boto3
15
  import uvicorn
16
  import asyncio
17
  import json
 
18
  from huggingface_hub import login
19
  from botocore.exceptions import NoCredentialsError
 
20
 
21
 
22
- AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
23
- AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
24
- AWS_REGION = os.getenv("AWS_REGION")
25
- S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
26
  HUGGINGFACE_HUB_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")
27
 
28
-
29
  if HUGGINGFACE_HUB_TOKEN:
30
  login(token=HUGGINGFACE_HUB_TOKEN,
31
  add_to_git_credential=False)
32
 
33
- s3_client = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY_ID,
34
- aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
35
- region_name=AWS_REGION)
36
 
37
  app = FastAPI()
38
 
@@ -66,67 +60,42 @@ class GenerateRequest(BaseModel):
66
 
67
  model_data = {} # Global dictionary to store model data
68
 
69
- class S3ModelLoader:
70
- def __init__(self, bucket_name, s3_client):
71
- self.bucket_name = bucket_name
72
- self.s3_client = s3_client
73
-
74
- def _get_s3_uri(self, model_name):
75
- return f"s3://{self.bucket_name}/" \
76
- f"{model_name.replace('/', '-')}"
77
-
78
- async def load_model_and_tokenizer(self, model_name):
79
- if model_name in model_data:
80
- return model_data[model_name]["model"], model_data[model_name]["tokenizer"]
81
-
82
- s3_uri = self._get_s3_uri(model_name)
83
- try:
84
-
85
- config = AutoConfig.from_pretrained(
86
- s3_uri, local_files_only=False
87
- )
88
-
89
- model = AutoModelForCausalLM.from_pretrained(
90
- s3_uri, config=config, local_files_only=False
91
- )
92
-
93
- tokenizer = AutoTokenizer.from_pretrained(
94
- s3_uri, config=config, local_files_only=False
95
- )
96
-
97
- if tokenizer.eos_token_id is not None and \
98
- tokenizer.pad_token_id is None:
99
- tokenizer.pad_token_id = config.pad_token_id \
100
- or tokenizer.eos_token_id
101
- model_data[model_name] = {"model":model, "tokenizer":tokenizer}
102
- return model, tokenizer
103
- except (EnvironmentError, NoCredentialsError):
104
- try:
105
- config = AutoConfig.from_pretrained(
106
- model_name, token=HUGGINGFACE_HUB_TOKEN
107
- )
108
- tokenizer = AutoTokenizer.from_pretrained(
109
- model_name, config=config, token=HUGGINGFACE_HUB_TOKEN
110
- )
111
-
112
- model = AutoModelForCausalLM.from_pretrained(
113
- model_name, config=config, token=HUGGINGFACE_HUB_TOKEN
114
- )
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
- if tokenizer.eos_token_id is not None and \
118
- tokenizer.pad_token_id is None:
119
- tokenizer.pad_token_id = config.pad_token_id \
120
- or tokenizer.eos_token_id
121
-
122
- model_data[model_name] = {"model":model, "tokenizer":tokenizer}
123
- return model, tokenizer
124
- except Exception as e:
125
- raise HTTPException(
126
- status_code=500, detail=f"Error loading model: {e}"
127
- )
128
 
129
- model_loader = S3ModelLoader(S3_BUCKET_NAME, s3_client)
130
 
131
  @app.post("/generate")
132
  async def generate(request: GenerateRequest):
@@ -144,8 +113,8 @@ async def generate(request: GenerateRequest):
144
  do_sample = request.do_sample
145
  stop_sequences = request.stop_sequences
146
 
147
- model, tokenizer = await model_loader.load_model_and_tokenizer(model_name)
148
- device = "cuda" if torch.cuda.is_available() else "cpu"
149
  model.to(device)
150
 
151
  if "text-to-text" == task_type:
@@ -231,18 +200,17 @@ async def stream_text(model, tokenizer, input_text,
231
 
232
  if len(new_text) == 0:
233
  if not stop_criteria(outputs.sequences, None):
234
- for text in output_text.split():
235
- yield json.dumps({"text": text, "is_end": False}) + "\n"
236
- yield json.dumps({"text": "", "is_end": True}) + "\n"
237
  break
238
 
239
  output_text += new_text
240
 
241
- for text in new_text.split():
242
- yield json.dumps({"text": text, "is_end": False}) + "\n"
243
 
244
  if stop_criteria(outputs.sequences, None):
245
- yield json.dumps({"text": "", "is_end": True}) + "\n"
246
  break
247
 
248
  encoded_input = tokenizer(
@@ -250,8 +218,12 @@ async def stream_text(model, tokenizer, input_text,
250
  truncation=True
251
  ).to(device)
252
  output_text = ""
253
-
254
 
 
 
 
 
 
255
  async def generate_text(model, tokenizer, input_text,
256
  generation_config, stop_sequences,
257
  device):
@@ -288,7 +260,7 @@ async def generate_text(model, tokenizer, input_text,
288
  async def generate_image(request: GenerateRequest):
289
  try:
290
  validated_body = request
291
- device = "cuda" if torch.cuda.is_available() else "cpu"
292
 
293
  if validated_body.model_name not in model_data:
294
  config = AutoConfig.from_pretrained(
@@ -306,7 +278,7 @@ async def generate_image(request: GenerateRequest):
306
 
307
  image_data = list(image.getdata())
308
 
309
- return json.dumps({"image_data": image_data, "is_end": True})
310
 
311
  except Exception as e:
312
  raise HTTPException(
@@ -319,7 +291,7 @@ async def generate_image(request: GenerateRequest):
319
  async def generate_text_to_speech(request: GenerateRequest):
320
  try:
321
  validated_body = request
322
- device = "cuda" if torch.cuda.is_available() else "cpu"
323
 
324
  if validated_body.model_name not in model_data:
325
  config = AutoConfig.from_pretrained(
@@ -341,7 +313,7 @@ async def generate_text_to_speech(request: GenerateRequest):
341
 
342
  audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
343
 
344
- return json.dumps({"audio": audio_base64, "is_end": True})
345
 
346
  except Exception as e:
347
  raise HTTPException(
@@ -354,7 +326,7 @@ async def generate_text_to_speech(request: GenerateRequest):
354
  async def generate_video(request: GenerateRequest):
355
  try:
356
  validated_body = request
357
- device = "cuda" if torch.cuda.is_available() else "cpu"
358
  if validated_body.model_name not in model_data:
359
  config = AutoConfig.from_pretrained(
360
  validated_body.model_name, token=HUGGINGFACE_HUB_TOKEN
@@ -373,7 +345,7 @@ async def generate_video(request: GenerateRequest):
373
 
374
  video_base64 = base64.b64encode(video).decode('utf-8')
375
 
376
- return json.dumps({"video": video_base64, "is_end": True})
377
 
378
  except Exception as e:
379
  raise HTTPException(
@@ -381,5 +353,27 @@ async def generate_video(request: GenerateRequest):
381
  detail=f"Internal server error: {str(e)}"
382
  )
383
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  if __name__ == "__main__":
385
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
10
  GenerationConfig,
11
  StoppingCriteria,
12
  StoppingCriteriaList,
13
+ pipeline
14
  )
 
15
  import uvicorn
16
  import asyncio
17
  import json
18
+ import base64
19
  from huggingface_hub import login
20
  from botocore.exceptions import NoCredentialsError
21
+ from functools import lru_cache
22
 
23
 
 
 
 
 
24
  HUGGINGFACE_HUB_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")
25
 
 
26
  if HUGGINGFACE_HUB_TOKEN:
27
  login(token=HUGGINGFACE_HUB_TOKEN,
28
  add_to_git_credential=False)
29
 
 
 
 
30
 
31
  app = FastAPI()
32
 
 
60
 
61
  model_data = {} # Global dictionary to store model data
62
 
63
+ model_load_lock = asyncio.Lock() # Lock to avoid race conditions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
async def _load_model_and_tokenizer(model_name):
    """Download a causal-LM model and its tokenizer from the Hugging Face Hub.

    BUGFIX: the previous version decorated this coroutine with
    @lru_cache(maxsize=None). lru_cache stores the *coroutine object*, not
    its awaited result, so a second call for the same model returned an
    already-awaited coroutine and raised
    "RuntimeError: cannot reuse already awaited coroutine". Result caching
    is already performed by the caller via the module-level `model_data`
    dict, so the decorator is simply removed (this also eliminates the
    unbounded-cache growth).

    Parameters:
        model_name: Hub repo id (e.g. "org/model").

    Returns:
        dict with keys "model" and "tokenizer".

    Raises:
        HTTPException(500): if any step of the download/initialisation fails.
    """
    try:
        config = AutoConfig.from_pretrained(
            model_name, token=HUGGINGFACE_HUB_TOKEN
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, config=config, token=HUGGINGFACE_HUB_TOKEN
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name, config=config, token=HUGGINGFACE_HUB_TOKEN
        )

        # Many causal LMs ship without a pad token; fall back to the model
        # config's pad id or, failing that, the EOS id so batching works.
        if tokenizer.eos_token_id is not None and \
                tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = config.pad_token_id \
                or tokenizer.eos_token_id

        return {"model": model, "tokenizer": tokenizer}
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error loading model: {e}"
        ) from e
89
 
90
async def load_model_and_tokenizer(model_name):
    """Return ``(model, tokenizer)`` for *model_name*, loading it on first use.

    All loads are serialised through ``model_load_lock`` so concurrent
    requests for the same model cannot trigger duplicate downloads; once
    loaded, the pair is served from the module-level ``model_data`` cache.
    """
    async with model_load_lock:
        bundle = model_data.get(model_name)
        if bundle is None:
            bundle = await _load_model_and_tokenizer(model_name)
            model_data[model_name] = bundle
        return bundle.get("model"), bundle.get("tokenizer")
 
 
 
98
 
 
99
 
100
  @app.post("/generate")
101
  async def generate(request: GenerateRequest):
 
113
  do_sample = request.do_sample
114
  stop_sequences = request.stop_sequences
115
 
116
+ model, tokenizer = await load_model_and_tokenizer(model_name)
117
+ device = "cpu" # Force CPU
118
  model.to(device)
119
 
120
  if "text-to-text" == task_type:
 
200
 
201
  if len(new_text) == 0:
202
  if not stop_criteria(outputs.sequences, None):
203
+ yield {"text": output_text, "is_end": False}
204
+
205
+ yield {"text": "", "is_end": True}
206
  break
207
 
208
  output_text += new_text
209
 
210
+ yield {"text": new_text, "is_end": False}
 
211
 
212
  if stop_criteria(outputs.sequences, None):
213
+ yield {"text": "", "is_end": True}
214
  break
215
 
216
  encoded_input = tokenizer(
 
218
  truncation=True
219
  ).to(device)
220
  output_text = ""
 
221
 
222
+
223
async def stream_json_responses(generator):
    """Serialise each item of an async generator as one JSON line (NDJSON)."""
    async for item in generator:
        yield f"{json.dumps(item)}\n"
226
+
227
  async def generate_text(model, tokenizer, input_text,
228
  generation_config, stop_sequences,
229
  device):
 
260
  async def generate_image(request: GenerateRequest):
261
  try:
262
  validated_body = request
263
+ device = "cpu" # Force CPU
264
 
265
  if validated_body.model_name not in model_data:
266
  config = AutoConfig.from_pretrained(
 
278
 
279
  image_data = list(image.getdata())
280
 
281
+ return JSONResponse({"image_data": image_data, "is_end": True})
282
 
283
  except Exception as e:
284
  raise HTTPException(
 
291
  async def generate_text_to_speech(request: GenerateRequest):
292
  try:
293
  validated_body = request
294
+ device = "cpu" # Force CPU
295
 
296
  if validated_body.model_name not in model_data:
297
  config = AutoConfig.from_pretrained(
 
313
 
314
  audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
315
 
316
+ return JSONResponse({"audio": audio_base64, "is_end": True})
317
 
318
  except Exception as e:
319
  raise HTTPException(
 
326
  async def generate_video(request: GenerateRequest):
327
  try:
328
  validated_body = request
329
+ device = "cpu" # Force CPU
330
  if validated_body.model_name not in model_data:
331
  config = AutoConfig.from_pretrained(
332
  validated_body.model_name, token=HUGGINGFACE_HUB_TOKEN
 
345
 
346
  video_base64 = base64.b64encode(video).decode('utf-8')
347
 
348
+ return JSONResponse({"video": video_base64, "is_end": True})
349
 
350
  except Exception as e:
351
  raise HTTPException(
 
353
  detail=f"Internal server error: {str(e)}"
354
  )
355
 
356
@app.on_event("startup")
async def startup_event():
    """Pre-load every model named by a ``MODEL_NAME_*`` environment variable.

    Failures are logged per model and do not abort application startup.
    """
    print("Loading models...")

    # Collect the distinct model names declared via MODEL_NAME_* env vars.
    models_to_load = {
        value
        for key, value in os.environ.items()
        if key.startswith("MODEL_NAME_")
    }

    for model_name in models_to_load:
        try:
            await load_model_and_tokenizer(model_name)
            print(f"Model {model_name} loaded")
        except Exception as e:
            print(f"Error loading model {model_name}: {e}")

    print("Models loaded.")
377
+
378
  if __name__ == "__main__":
379
  uvicorn.run(app, host="0.0.0.0", port=7860)