Hjgugugjhuhjggg committed on
Commit b5fcdec · verified · 1 Parent(s): 7c21718

Update app.py

Files changed (1): app.py (+77 -51)
app.py CHANGED
@@ -1,6 +1,6 @@
 import os
 import torch
-from fastapi import FastAPI, HTTPException
+from fastapi import FastAPI, HTTPException  # keep HTTPException: it is still raised in the handlers below
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel, field_validator
 from transformers import (
@@ -23,7 +23,9 @@ AWS_REGION = os.getenv("AWS_REGION")
 S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
 HUGGINGFACE_HUB_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")
 
-s3_client = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY, region_name=AWS_REGION)
+s3_client = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY_ID,
+                         aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+                         region_name=AWS_REGION)
 
 app = FastAPI()
 
@@ -88,7 +90,7 @@ class S3ModelLoader:
             return model, tokenizer
         except Exception as e:
             raise HTTPException(status_code=500, detail=f"Error loading model: {e}")
-
+
 model_loader = S3ModelLoader(S3_BUCKET_NAME, s3_client)
 
 @app.post("/generate")
@@ -123,31 +125,47 @@ async def generate(request: GenerateRequest):
         )
 
         return StreamingResponse(
-            stream_text(model, tokenizer, input_text, generation_config, stop_sequences, device, chunk_delay),
+            stream_text(model, tokenizer, input_text,
+                        generation_config, stop_sequences,
+                        device, chunk_delay),
             media_type="text/plain"
         )
-
+
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+        raise HTTPException(status_code=500,
+                            detail=f"Internal server error: {str(e)}")
+
 
-async def stream_text(model, tokenizer, input_text, generation_config, stop_sequences, device, chunk_delay, max_length=2048):
-    encoded_input = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
+async def stream_text(model, tokenizer, input_text,
+                      generation_config, stop_sequences,
+                      device, chunk_delay, max_length=2048):
+    encoded_input = tokenizer(input_text,
+                              return_tensors="pt",
+                              truncation=True,
+                              max_length=max_length).to(device)
     input_length = encoded_input["input_ids"].shape[1]
     remaining_tokens = max_length - input_length
 
     if remaining_tokens <= 0:
         yield ""
 
-    generation_config.max_new_tokens = min(remaining_tokens, generation_config.max_new_tokens)
-
-    def stop_criteria(input_ids, scores):
-        decoded_output = tokenizer.decode(int(input_ids[0][-1]), skip_special_tokens=True)
-        return decoded_output in stop_sequences
-
-    stopping_criteria = StoppingCriteriaList([stop_criteria])
+    generation_config.max_new_tokens = min(
+        remaining_tokens, generation_config.max_new_tokens
+    )
+
+    def find_stop(output_text, stop_sequences):
+        for seq in stop_sequences:
+            if seq in output_text:
+                last_index = output_text.rfind(seq)
+                return last_index + len(seq)
+        return -1
 
     output_text = ""
-    outputs = model.generate(
+
+    while True:
+        outputs = model.generate(
             **encoded_input,
             do_sample=generation_config.do_sample,
             max_new_tokens=generation_config.max_new_tokens,
@@ -156,42 +174,42 @@ async def stream_text(model, tokenizer, input_text, generation_config, stop_sequ
             top_k=generation_config.top_k,
             repetition_penalty=generation_config.repetition_penalty,
             num_return_sequences=generation_config.num_return_sequences,
-            stopping_criteria=stopping_criteria,
             output_scores=True,
-            return_dict_in_generate=True
-    )
-
-    for output in outputs.sequences:
-        for token_id in output:
-            token = tokenizer.decode(token_id, skip_special_tokens=True)
-            yield token
-            await asyncio.sleep(chunk_delay)  # Simulate the delay between tokens
+            return_dict_in_generate=True,
+        )
+
+        new_text = tokenizer.decode(outputs.sequences[0][len(encoded_input["input_ids"][0]):], skip_special_tokens=True)
+
+        output_text += new_text
+
+        yield new_text
+        await asyncio.sleep(chunk_delay)
 
-    if stop_sequences and any(stop in output_text for stop in stop_sequences):
-        yield output_text
-        return
+        stop_index = find_stop(output_text, stop_sequences)
+        if stop_index != -1:
+            yield output_text[:stop_index]
+            break
+
+        if len(output_text) >= generation_config.max_new_tokens:
+            break
 
-    outputs = model.generate(
-        **encoded_input,
-        do_sample=generation_config.do_sample,
-        max_new_tokens=generation_config.max_new_tokens,
-        temperature=generation_config.temperature,
-        top_p=generation_config.top_p,
-        top_k=generation_config.top_k,
-        repetition_penalty=generation_config.repetition_penalty,
-        num_return_sequences=generation_config.num_return_sequences,
-        stopping_criteria=stopping_criteria,
-        output_scores=True,
-        return_dict_in_generate=True
-    )
+        encoded_input = tokenizer(output_text,
+                                  return_tensors="pt",
+                                  truncation=True,
+                                  max_length=max_length).to(device)
 
 @app.post("/generate-image")
 async def generate_image(request: GenerateRequest):
     try:
         validated_body = request
         device = "cuda" if torch.cuda.is_available() else "cpu"
-
-        image_generator = pipeline("text-to-image", model=validated_body.model_name, device=device)
+
+        image_generator = pipeline("text-to-image",
+                                   model=validated_body.model_name,
+                                   device=device)
         image = image_generator(validated_body.input_text)[0]
 
         img_byte_arr = BytesIO()
@@ -199,17 +217,20 @@ async def generate_image(request: GenerateRequest):
         img_byte_arr.seek(0)
 
         return StreamingResponse(img_byte_arr, media_type="image/png")
-
+
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
-
+        raise HTTPException(status_code=500,
+                            detail=f"Internal server error: {str(e)}")
+
 @app.post("/generate-text-to-speech")
 async def generate_text_to_speech(request: GenerateRequest):
     try:
         validated_body = request
         device = "cuda" if torch.cuda.is_available() else "cpu"
 
-        audio_generator = pipeline("text-to-speech", model=validated_body.model_name, device=device)
+        audio_generator = pipeline("text-to-speech",
+                                   model=validated_body.model_name,
+                                   device=device)
         audio = audio_generator(validated_body.input_text)[0]
 
         audio_byte_arr = BytesIO()
@@ -219,24 +240,29 @@ async def generate_text_to_speech(request: GenerateRequest):
         return StreamingResponse(audio_byte_arr, media_type="audio/wav")
 
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+        raise HTTPException(status_code=500,
+                            detail=f"Internal server error: {str(e)}")
 
 @app.post("/generate-video")
 async def generate_video(request: GenerateRequest):
     try:
         validated_body = request
         device = "cuda" if torch.cuda.is_available() else "cpu"
-        video_generator = pipeline("text-to-video", model=validated_body.model_name, device=device)
+        video_generator = pipeline("text-to-video",
+                                   model=validated_body.model_name,
+                                   device=device)
         video = video_generator(validated_body.input_text)[0]
 
         video_byte_arr = BytesIO()
         video.save(video_byte_arr)
         video_byte_arr.seek(0)
 
-        return StreamingResponse(video_byte_arr, media_type="video/mp4")
-
+        return StreamingResponse(video_byte_arr,
+                                 media_type="video/mp4")
+
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+        raise HTTPException(status_code=500,
+                            detail=f"Internal server error: {str(e)}")
 
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)
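
A note on consuming the rewritten /generate endpoint: it returns a text/plain StreamingResponse, so clients should read the body incrementally instead of waiting for the full reply. A minimal client sketch, assuming the server runs locally on port 7860 (matching the uvicorn.run call above) and that GenerateRequest accepts at least model_name and input_text; the model id and prompt are placeholder examples:

import httpx

# Field names follow the GenerateRequest usage visible in this diff
# (model_name, input_text); the concrete values are made-up examples.
payload = {
    "model_name": "gpt2",
    "input_text": "Once upon a time",
}

with httpx.stream("POST", "http://localhost:7860/generate",
                  json=payload, timeout=None) as response:
    response.raise_for_status()
    for chunk in response.iter_text():
        # Print each streamed text chunk as soon as it arrives.
        print(chunk, end="", flush=True)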
 
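The new stream_text re-runs model.generate from scratch on every loop iteration and compares a character count (len(output_text)) against a token budget (generation_config.max_new_tokens), so each chunk gets progressively more expensive and the exit condition is only approximate; the yield "" branch on an over-long prompt also falls through instead of returning. If per-token streaming is the goal, transformers ships TextIteratorStreamer for exactly this. A minimal sketch under that assumption, not what this commit does; max_new_tokens=256 is an arbitrary example:

import asyncio
from threading import Thread

from transformers import TextIteratorStreamer

async def stream_text_with_streamer(model, tokenizer, input_text, chunk_delay=0.0):
    encoded_input = tokenizer(input_text, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True,
                                    skip_special_tokens=True)

    # generate() blocks, so run it on a worker thread; the streamer then
    # yields decoded text fragments as tokens are produced.
    thread = Thread(target=model.generate,
                    kwargs=dict(**encoded_input, max_new_tokens=256,
                                streamer=streamer))
    thread.start()

    for new_text in streamer:
        yield new_text
        await asyncio.sleep(chunk_delay)

    thread.join()

Stop sequences can then be layered on top with a StoppingCriteria subclass, which is what the removed stop_criteria closure was reaching for before this commit replaced it with the post-hoc find_stop scan.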
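
One caveat on the media endpoints: "text-to-speech" is a supported transformers pipeline task in recent releases, but "text-to-image" and "text-to-video" generally are not, so the pipeline(...) calls above can be expected to raise for those model types; diffusion checkpoints usually go through the diffusers library instead. A minimal text-to-image sketch under that assumption; the checkpoint id is only an example, not something this commit uses:

import torch
from diffusers import AutoPipelineForText2Image

device = "cuda" if torch.cuda.is_available() else "cpu"

# Example checkpoint; substitute any text-to-image diffusion model id.
pipe = AutoPipelineForText2Image.from_pretrained("stabilityai/sdxl-turbo").to(device)

# The pipeline returns an object with an .images list of PIL images.
image = pipe("a watercolor fox").images[0]
image.save("fox.png")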