Hjgugugjhuhjggg committed on
Commit 7f43658 · verified · 1 Parent(s): e416837

Update app.py

Files changed (1):
  1. app.py +154 -109
app.py CHANGED
@@ -1,28 +1,21 @@
import os
- import logging
- import time
- from io import BytesIO
- from typing import Union
-
- from fastapi import FastAPI, HTTPException, Response, Request, UploadFile, File
from fastapi.responses import StreamingResponse
- from pydantic import BaseModel, ValidationError, field_validator
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
-     pipeline,
    GenerationConfig,
    StoppingCriteriaList
)
import boto3
- from huggingface_hub import hf_hub_download
- import soundfile as sf
- import numpy as np
- import torch
import uvicorn
-
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s")

AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
@@ -30,13 +23,17 @@ AWS_REGION = os.getenv("AWS_REGION")
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
HUGGINGFACE_HUB_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")

class GenerateRequest(BaseModel):
    model_name: str
    input_text: str = ""
    task_type: str
    temperature: float = 1.0
    max_new_tokens: int = 200
-     stream: bool = False
    top_p: float = 1.0
    top_k: int = 50
    repetition_penalty: float = 1.0
@@ -45,8 +42,6 @@ class GenerateRequest(BaseModel):
    chunk_delay: float = 0.0
    stop_sequences: list[str] = []

-     model_config = {"protected_namespaces": ()}
-
    @field_validator("model_name")
    def model_name_cannot_be_empty(cls, v):
        if not v:
@@ -71,127 +66,177 @@ class S3ModelLoader:
    async def load_model_and_tokenizer(self, model_name):
        s3_uri = self._get_s3_uri(model_name)
        try:
-             logging.info(f"Trying to load {model_name} from S3...")
-             config = AutoConfig.from_pretrained(s3_uri)
-             model = AutoModelForCausalLM.from_pretrained(s3_uri, config=config)
-             tokenizer = AutoTokenizer.from_pretrained(s3_uri, config=config)

            if tokenizer.eos_token_id is not None and tokenizer.pad_token_id is None:
                tokenizer.pad_token_id = config.pad_token_id or tokenizer.eos_token_id

-             logging.info(f"Loaded {model_name} from S3 successfully.")
            return model, tokenizer
        except EnvironmentError:
-             logging.info(f"Model {model_name} not found in S3. Downloading...")
            try:
                config = AutoConfig.from_pretrained(model_name)
                tokenizer = AutoTokenizer.from_pretrained(model_name, config=config)
-                 model = AutoModelForCausalLM.from_pretrained(model_name, config=config, token=HUGGINGFACE_HUB_TOKEN)

                if tokenizer.eos_token_id is not None and tokenizer.pad_token_id is None:
                    tokenizer.pad_token_id = config.pad_token_id or tokenizer.eos_token_id

-                 logging.info(f"Downloaded {model_name} successfully.")
-                 logging.info(f"Saving {model_name} to S3...")
                model.save_pretrained(s3_uri)
                tokenizer.save_pretrained(s3_uri)
-                 logging.info(f"Saved {model_name} to S3 successfully.")
                return model, tokenizer
            except Exception as e:
-                 logging.exception(f"Error downloading/uploading model: {e}")
                raise HTTPException(status_code=500, detail=f"Error loading model: {e}")

- app = FastAPI()
-
- s3_client = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY, region_name=AWS_REGION)
model_loader = S3ModelLoader(S3_BUCKET_NAME, s3_client)

@app.post("/generate")
- async def generate(request: Request, body: GenerateRequest):
    try:
-         validated_body = GenerateRequest(**body.model_dump())
-         model, tokenizer = await model_loader.load_model_and_tokenizer(validated_body.model_name)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)

-         if validated_body.task_type == "text-to-text":
-             generation_config = GenerationConfig(
-                 temperature=validated_body.temperature,
-                 max_new_tokens=validated_body.max_new_tokens,
-                 top_p=validated_body.top_p,
-                 top_k=validated_body.top_k,
-                 repetition_penalty=validated_body.repetition_penalty,
-                 do_sample=validated_body.do_sample,
-                 num_return_sequences=validated_body.num_return_sequences
-             )
-
-             async def stream_text():
-                 input_text = validated_body.input_text
-                 generated_text = ""
-                 max_length = model.config.max_position_embeddings
-
-                 while True:
-                     encoded_input = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
-                     input_length = encoded_input["input_ids"].shape[1]
-                     remaining_tokens = max_length - input_length
-
-                     if remaining_tokens <= 0:
-                         break
-
-                     generation_config.max_new_tokens = min(remaining_tokens, validated_body.max_new_tokens)
-
-                     stopping_criteria = StoppingCriteriaList(
-                         [lambda _, outputs: tokenizer.decode(outputs[0][-1], skip_special_tokens=True) in validated_body.stop_sequences] if validated_body.stop_sequences else []
-                     )
-
-                     output = model.generate(**encoded_input, generation_config=generation_config, stopping_criteria=stopping_criteria)
-                     chunk = tokenizer.decode(output[0], skip_special_tokens=True)
-                     generated_text += chunk
-                     yield chunk
-                     time.sleep(validated_body.chunk_delay)
-                     input_text = generated_text
-
-             if validated_body.stream:
-                 return StreamingResponse(stream_text(), media_type="text/plain")
-             else:
-                 generated_text = ""
-                 async for chunk in stream_text():
-                     generated_text += chunk
-                 return {"result": generated_text}
-
-         elif validated_body.task_type == "text-to-image":
-             generator = pipeline("text-to-image", model=model, tokenizer=tokenizer, device=device)
-             image = generator(validated_body.input_text)[0]
-             image_bytes = image.tobytes()
-             return Response(content=image_bytes, media_type="image/png")
-
-         elif validated_body.task_type == "text-to-speech":
-             generator = pipeline("text-to-speech", model=model, tokenizer=tokenizer, device=device)
-             audio = generator(validated_body.input_text)
-             audio_bytesio = BytesIO()
-             sf.write(audio_bytesio, audio["sampling_rate"], np.int16(audio["audio"]))
-             audio_bytes = audio_bytesio.getvalue()
-             return Response(content=audio_bytes, media_type="audio/wav")
-
-         elif validated_body.task_type == "text-to-video":
-             try:
-                 generator = pipeline("text-to-video", model=model, tokenizer=tokenizer, device=device)
-                 video = generator(validated_body.input_text)
-                 return Response(content=video, media_type="video/mp4")
-             except Exception as e:
-                 raise HTTPException(status_code=500, detail=f"Error in text-to-video generation: {e}")

-         else:
-             raise HTTPException(status_code=400, detail="Unsupported task type")

-     except HTTPException as e:
-         raise e
-     except ValidationError as e:
-         raise HTTPException(status_code=422, detail=e.errors())
    except Exception as e:
-         logging.exception(f"An unexpected error occurred: {e}")
-         raise HTTPException(status_code=500, detail="An unexpected error occurred.")


if __name__ == "__main__":
-     uvicorn.run(app, host="0.0.0.0", port=7860)

app.py (updated version):

import os
+ import torch
+ from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
+ from pydantic import BaseModel, field_validator
from transformers import (
    AutoConfig,
+     pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    StoppingCriteriaList
)
import boto3
import uvicorn
+ import asyncio
+ from io import BytesIO
+ import soundfile as sf  # assumption: soundfile is installed; used for WAV output in /generate-text-to-speech
 
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_REGION = os.getenv("AWS_REGION")
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
HUGGINGFACE_HUB_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")

+ s3_client = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY, region_name=AWS_REGION)
+
+ app = FastAPI()
+
class GenerateRequest(BaseModel):
    model_name: str
    input_text: str = ""
    task_type: str
    temperature: float = 1.0
    max_new_tokens: int = 200
+     stream: bool = True
    top_p: float = 1.0
    top_k: int = 50
    repetition_penalty: float = 1.0

    chunk_delay: float = 0.0
    stop_sequences: list[str] = []

    @field_validator("model_name")
    def model_name_cannot_be_empty(cls, v):
        if not v:
 
    async def load_model_and_tokenizer(self, model_name):
        s3_uri = self._get_s3_uri(model_name)
        try:
+             config = AutoConfig.from_pretrained(s3_uri, local_files_only=True)
+             model = AutoModelForCausalLM.from_pretrained(s3_uri, config=config, local_files_only=True)
+             tokenizer = AutoTokenizer.from_pretrained(s3_uri, config=config, local_files_only=True)

            if tokenizer.eos_token_id is not None and tokenizer.pad_token_id is None:
                tokenizer.pad_token_id = config.pad_token_id or tokenizer.eos_token_id

            return model, tokenizer
        except EnvironmentError:
            try:
                config = AutoConfig.from_pretrained(model_name)
                tokenizer = AutoTokenizer.from_pretrained(model_name, config=config)
+                 model = AutoModelForCausalLM.from_pretrained(model_name, config=config)

                if tokenizer.eos_token_id is not None and tokenizer.pad_token_id is None:
                    tokenizer.pad_token_id = config.pad_token_id or tokenizer.eos_token_id

                model.save_pretrained(s3_uri)
                tokenizer.save_pretrained(s3_uri)
                return model, tokenizer
            except Exception as e:
                raise HTTPException(status_code=500, detail=f"Error loading model: {e}")

model_loader = S3ModelLoader(S3_BUCKET_NAME, s3_client)
 
@app.post("/generate")
+ async def generate(request: GenerateRequest):
    try:
+         model_name = request.model_name
+         input_text = request.input_text
+         task_type = request.task_type
+         temperature = request.temperature
+         max_new_tokens = request.max_new_tokens
+         stream = request.stream
+         top_p = request.top_p
+         top_k = request.top_k
+         repetition_penalty = request.repetition_penalty
+         num_return_sequences = request.num_return_sequences
+         do_sample = request.do_sample
+         chunk_delay = request.chunk_delay
+         stop_sequences = request.stop_sequences
+
+         model, tokenizer = await model_loader.load_model_and_tokenizer(model_name)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)

+         generation_config = GenerationConfig(
+             temperature=temperature,
+             max_new_tokens=max_new_tokens,
+             top_p=top_p,
+             top_k=top_k,
+             repetition_penalty=repetition_penalty,
+             do_sample=do_sample,
+             num_return_sequences=num_return_sequences,
+         )
+
+         return StreamingResponse(
+             stream_text(model, tokenizer, input_text, generation_config, stop_sequences, device, chunk_delay),
+             media_type="text/plain"
+         )

+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+ async def stream_text(model, tokenizer, input_text, generation_config, stop_sequences, device, chunk_delay, max_length=2048):
+     encoded_input = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
+     input_length = encoded_input["input_ids"].shape[1]
+     remaining_tokens = max_length - input_length
+
+     if remaining_tokens <= 0:
+         yield ""
+         return
+
+     generation_config.max_new_tokens = min(remaining_tokens, generation_config.max_new_tokens)
+
+     def stop_criteria(input_ids, scores):
+         decoded_output = tokenizer.decode(int(input_ids[0][-1]), skip_special_tokens=True)
+         return decoded_output in stop_sequences
+
+     stopping_criteria = StoppingCriteriaList([stop_criteria])
+
+     output_text = ""
+     outputs = model.generate(
+         **encoded_input,
+         do_sample=generation_config.do_sample,
+         max_new_tokens=generation_config.max_new_tokens,
+         temperature=generation_config.temperature,
+         top_p=generation_config.top_p,
+         top_k=generation_config.top_k,
+         repetition_penalty=generation_config.repetition_penalty,
+         num_return_sequences=generation_config.num_return_sequences,
+         stopping_criteria=stopping_criteria,
+         output_scores=True,
+         return_dict_in_generate=True
+     )
+
+     for output in outputs.sequences:
+         for token_id in output:
+             token = tokenizer.decode(token_id, skip_special_tokens=True)
+             output_text += token  # accumulate so the stop-sequence check below sees the full text
+             yield token
+             await asyncio.sleep(chunk_delay)  # simulate the delay between tokens
+
+         if stop_sequences and any(stop in output_text for stop in stop_sequences):
+             yield output_text
+             return
+
+ @app.post("/generate-image")
+ async def generate_image(request: GenerateRequest):
+     try:
+         validated_body = request
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+
+         image_generator = pipeline("text-to-image", model=validated_body.model_name, device=device)
+         image = image_generator(validated_body.input_text)[0]
+
+         img_byte_arr = BytesIO()
+         image.save(img_byte_arr, format="PNG")
+         img_byte_arr.seek(0)
+
+         return StreamingResponse(img_byte_arr, media_type="image/png")
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+ @app.post("/generate-text-to-speech")
+ async def generate_text_to_speech(request: GenerateRequest):
+     try:
+         validated_body = request
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+
+         audio_generator = pipeline("text-to-speech", model=validated_body.model_name, device=device)
+         audio = audio_generator(validated_body.input_text)
+
+         # The TTS pipeline returns a dict with "audio" and "sampling_rate";
+         # serialize it to an in-memory WAV (assumes soundfile, imported above).
+         audio_byte_arr = BytesIO()
+         sf.write(audio_byte_arr, audio["audio"].squeeze(), audio["sampling_rate"], format="WAV")
+         audio_byte_arr.seek(0)
+
+         return StreamingResponse(audio_byte_arr, media_type="audio/wav")

    except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
 
+ @app.post("/generate-video")
+ async def generate_video(request: GenerateRequest):
+     try:
+         validated_body = request
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         video_generator = pipeline("text-to-video", model=validated_body.model_name, device=device)
+         video = video_generator(validated_body.input_text)[0]
+
+         video_byte_arr = BytesIO()
+         video.save(video_byte_arr)
+         video_byte_arr.seek(0)
+
+         return StreamingResponse(video_byte_arr, media_type="video/mp4")
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=7860)
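
For reference, a minimal client sketch against the updated API; this is not part of the commit. It assumes the server is running locally on port 7860, that the requests package is installed, and uses "gpt2" purely as a placeholder model name. The media endpoints (/generate-image, /generate-text-to-speech, /generate-video) accept the same request body.

import requests

payload = {
    "model_name": "gpt2",             # placeholder; any causal LM the loader can fetch
    "input_text": "Once upon a time",
    "task_type": "text-to-text",
    "max_new_tokens": 50,
}

# In this version /generate always streams text/plain, so read the body chunk by chunk.
with requests.post("http://localhost:7860/generate", json=payload, stream=True) as r:
    r.raise_for_status()
    for chunk in r.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)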