aws_test

Sleeping

App Files Community

Hjgugugjhuhjggg committed on Dec 27, 2024

Commit

e6982de

verified ·

1 Parent(s): 277e316

Update app.py

Browse files

Files changed (1) hide show

app.py +115 -74

app.py CHANGED Viewed

@@ -17,6 +17,7 @@ import asyncio
 from io import BytesIO
 from transformers import pipeline
 import json
 AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
 AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
@@ -24,6 +25,11 @@ AWS_REGION = os.getenv("AWS_REGION")
 S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
 HUGGINGFACE_HUB_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")
 s3_client = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY_ID,
                          aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                          region_name=AWS_REGION)
@@ -53,7 +59,8 @@ class GenerateRequest(BaseModel):
     @field_validator("task_type")
     def task_type_must_be_valid(cls, v):
-        valid_types = ["text-to-text", "text-to-image", "text-to-speech", "text-to-video"]
         if v not in valid_types:
             raise ValueError(f"task_type must be one of: {valid_types}")
         return v
@@ -64,34 +71,51 @@ class S3ModelLoader:
         self.s3_client = s3_client
     def _get_s3_uri(self, model_name):
-        return f"s3://{self.bucket_name}/{model_name.replace('/', '-')}"
     async def load_model_and_tokenizer(self, model_name):
         s3_uri = self._get_s3_uri(model_name)
         try:
-            config = AutoConfig.from_pretrained(s3_uri, local_files_only=True)
-            model = AutoModelForSeq2SeqLM.from_pretrained(s3_uri, config=config, local_files_only=True)
-            tokenizer = AutoTokenizer.from_pretrained(s3_uri, config=config, local_files_only=True)
-            if tokenizer.eos_token_id is not None and tokenizer.pad_token_id is None:
-                tokenizer.pad_token_id = config.pad_token_id or tokenizer.eos_token_id
             return model, tokenizer
         except EnvironmentError:
             try:
                 config = AutoConfig.from_pretrained(model_name)
-                tokenizer = AutoTokenizer.from_pretrained(model_name, config=config)
-                model = AutoModelForSeq2SeqLM.from_pretrained(model_name, config=config)
-                if tokenizer.eos_token_id is not None and tokenizer.pad_token_id is None:
-                    tokenizer.pad_token_id = config.pad_token_id or tokenizer.eos_token_id
                 model.save_pretrained(s3_uri)
                 tokenizer.save_pretrained(s3_uri)
                 return model, tokenizer
             except Exception as e:
-                raise HTTPException(status_code=500, detail=f"Error loading model: {e}")
 model_loader = S3ModelLoader(S3_BUCKET_NAME, s3_client)
 @app.post("/generate")
@@ -111,7 +135,8 @@ async def generate(request: GenerateRequest):
         chunk_delay = request.chunk_delay
         stop_sequences = request.stop_sequences
-        model, tokenizer = await model_loader.load_model_and_tokenizer(model_name)
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model.to(device)
@@ -131,19 +156,20 @@ async def generate(request: GenerateRequest):
                          device, chunk_delay),
             media_type="text/plain"
         )
     except Exception as e:
-        raise HTTPException(status_code=500,
-                            detail=f"Internal server error: {str(e)}")
 async def stream_text(model, tokenizer, input_text,
                         generation_config, stop_sequences,
                         device, chunk_delay, max_length=2048):
-    encoded_input = tokenizer(input_text,
-                               return_tensors="pt",
-                               truncation=True,
-                               max_length=max_length).to(device)
     input_length = encoded_input["input_ids"].shape[1]
     remaining_tokens = max_length - input_length
@@ -153,7 +179,7 @@ async def stream_text(model, tokenizer, input_text,
     generation_config.max_new_tokens = min(
         remaining_tokens, generation_config.max_new_tokens
     )
     def find_stop(output_text, stop_sequences):
         for seq in stop_sequences:
             if seq in output_text:
@@ -161,9 +187,9 @@ async def stream_text(model, tokenizer, input_text,
                 return last_index + len(seq)
         return -1
     output_text = ""
     while True:
         outputs = model.generate(
             **encoded_input,
@@ -177,51 +203,50 @@ async def stream_text(model, tokenizer, input_text,
             output_scores=True,
             return_dict_in_generate=True,
         )
-        new_text = tokenizer.decode(outputs.sequences[0][len(encoded_input["input_ids"][0]):], skip_special_tokens=True)
         output_text += new_text
         stop_index = find_stop(output_text, stop_sequences)
         if stop_index != -1:
             final_output = output_text[:stop_index]
-            chunked_output = [final_output[i:i+10] for i in range(0, len(final_output), 10)]
             for chunk in chunked_output:
                 yield json.dumps({"text": chunk, "is_end": False}) + "\n"
                 await asyncio.sleep(chunk_delay)
             yield json.dumps({"text": "", "is_end": True}) + "\n"
             break
         else:
-            chunked_output = [new_text[i:i+10] for i in range(0, len(new_text), 10)]
             for chunk in chunked_output:
                 yield json.dumps({"text": chunk, "is_end": False}) + "\n"
                 await asyncio.sleep(chunk_delay)
         if len(output_text) >= generation_config.max_new_tokens:
-            chunked_output = [output_text[i:i+10] for i in range(0, len(output_text), 10)]
             for chunk in chunked_output:
                 yield json.dumps({"text": chunk, "is_end": False}) + "\n"
                 await asyncio.sleep(chunk_delay)
             yield json.dumps({"text": "", "is_end": True}) + "\n"
             break
-        encoded_input = tokenizer(output_text,
-                               return_tensors="pt",
-                               truncation=True,
-                               max_length=max_length).to(device)
 @app.post("/generate-image")
 async def generate_image(request: GenerateRequest):
@@ -229,62 +254,78 @@ async def generate_image(request: GenerateRequest):
         validated_body = request
         device = "cuda" if torch.cuda.is_available() else "cpu"
-        image_generator = pipeline("text-to-image",
-                                    model=validated_body.model_name,
-                                    device=device)
         image = image_generator(validated_body.input_text)[0]
         img_byte_arr = BytesIO()
         image.save(img_byte_arr, format="PNG")
         img_byte_arr.seek(0)
-        return StreamingResponse(img_byte_arr, media_type="image/png")
     except Exception as e:
-        raise HTTPException(status_code=500,
-                             detail=f"Internal server error: {str(e)}")
 @app.post("/generate-text-to-speech")
 async def generate_text_to_speech(request: GenerateRequest):
     try:
         validated_body = request
         device = "cuda" if torch.cuda.is_available() else "cpu"
-        audio_generator = pipeline("text-to-speech",
-                                    model=validated_body.model_name,
-                                    device=device)
         audio = audio_generator(validated_body.input_text)[0]
         audio_byte_arr = BytesIO()
         audio.save(audio_byte_arr)
         audio_byte_arr.seek(0)
-        return StreamingResponse(audio_byte_arr, media_type="audio/wav")
     except Exception as e:
-         raise HTTPException(status_code=500,
-                             detail=f"Internal server error: {str(e)}")
 @app.post("/generate-video")
 async def generate_video(request: GenerateRequest):
     try:
         validated_body = request
         device = "cuda" if torch.cuda.is_available() else "cpu"
-        video_generator = pipeline("text-to-video",
-                                    model=validated_body.model_name,
-                                    device=device)
         video = video_generator(validated_body.input_text)[0]
         video_byte_arr = BytesIO()
         video.save(video_byte_arr)
         video_byte_arr.seek(0)
-        return StreamingResponse(video_byte_arr,
-                                 media_type="video/mp4")
     except Exception as e:
-         raise HTTPException(status_code=500,
-                            detail=f"Internal server error: {str(e)}")
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)

 from io import BytesIO
 from transformers import pipeline
 import json
+from huggingface_hub import login
 AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
 AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
 S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
 HUGGINGFACE_HUB_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")
+if HUGGINGFACE_HUB_TOKEN:
+    login(token=HUGGINGFACE_HUB_TOKEN,
+          add_to_git_credential=False)
 s3_client = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY_ID,
                          aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                          region_name=AWS_REGION)
     @field_validator("task_type")
     def task_type_must_be_valid(cls, v):
+        valid_types = ["text-to-text", "text-to-image",
+                       "text-to-speech", "text-to-video"]
         if v not in valid_types:
             raise ValueError(f"task_type must be one of: {valid_types}")
         return v
         self.s3_client = s3_client
     def _get_s3_uri(self, model_name):
+        return f"s3://{self.bucket_name}/" \
+               f"{model_name.replace('/', '-')}"
     async def load_model_and_tokenizer(self, model_name):
         s3_uri = self._get_s3_uri(model_name)
         try:
+            config = AutoConfig.from_pretrained(
+                s3_uri, local_files_only=True
+            )
+            model = AutoModelForSeq2SeqLM.from_pretrained(
+                s3_uri, config=config, local_files_only=True
+            )
+            tokenizer = AutoTokenizer.from_pretrained(
+                s3_uri, config=config, local_files_only=True
+            )
+            if tokenizer.eos_token_id is not None and \
+               tokenizer.pad_token_id is None:
+                tokenizer.pad_token_id = config.pad_token_id \
+                                        or tokenizer.eos_token_id
             return model, tokenizer
         except EnvironmentError:
             try:
                 config = AutoConfig.from_pretrained(model_name)
+                tokenizer = AutoTokenizer.from_pretrained(
+                    model_name, config=config
+                )
+                model = AutoModelForSeq2SeqLM.from_pretrained(
+                    model_name, config=config
+                )
+                if tokenizer.eos_token_id is not None and \
+                   tokenizer.pad_token_id is None:
+                    tokenizer.pad_token_id = config.pad_token_id \
+                                            or tokenizer.eos_token_id
                 model.save_pretrained(s3_uri)
                 tokenizer.save_pretrained(s3_uri)
                 return model, tokenizer
             except Exception as e:
+                raise HTTPException(
+                    status_code=500, detail=f"Error loading model: {e}"
+                )
 model_loader = S3ModelLoader(S3_BUCKET_NAME, s3_client)
 @app.post("/generate")
         chunk_delay = request.chunk_delay
         stop_sequences = request.stop_sequences
+        model, tokenizer = await model_loader.\
+            load_model_and_tokenizer(model_name)
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model.to(device)
                          device, chunk_delay),
             media_type="text/plain"
         )
     except Exception as e:
+        raise HTTPException(
+            status_code=500, detail=f"Internal server error: {str(e)}"
+        )
 async def stream_text(model, tokenizer, input_text,
                         generation_config, stop_sequences,
                         device, chunk_delay, max_length=2048):
+    encoded_input = tokenizer(
+        input_text, return_tensors="pt",
+        truncation=True, max_length=max_length
+    ).to(device)
     input_length = encoded_input["input_ids"].shape[1]
     remaining_tokens = max_length - input_length
     generation_config.max_new_tokens = min(
         remaining_tokens, generation_config.max_new_tokens
     )
     def find_stop(output_text, stop_sequences):
         for seq in stop_sequences:
             if seq in output_text:
                 return last_index + len(seq)
         return -1
     output_text = ""
     while True:
         outputs = model.generate(
             **encoded_input,
             output_scores=True,
             return_dict_in_generate=True,
         )
+        new_text = tokenizer.decode(
+            outputs.sequences[0][len(encoded_input["input_ids"][0]):],
+            skip_special_tokens=True
+        )
         output_text += new_text
         stop_index = find_stop(output_text, stop_sequences)
         if stop_index != -1:
             final_output = output_text[:stop_index]
+            chunked_output = [final_output[i:i+10]
+                             for i in range(0, len(final_output), 10)]
             for chunk in chunked_output:
                 yield json.dumps({"text": chunk, "is_end": False}) + "\n"
                 await asyncio.sleep(chunk_delay)
             yield json.dumps({"text": "", "is_end": True}) + "\n"
             break
         else:
+            chunked_output = [new_text[i:i+10]
+                             for i in range(0, len(new_text), 10)]
             for chunk in chunked_output:
                 yield json.dumps({"text": chunk, "is_end": False}) + "\n"
                 await asyncio.sleep(chunk_delay)
         if len(output_text) >= generation_config.max_new_tokens:
+            chunked_output = [output_text[i:i+10]
+                             for i in range(0, len(output_text), 10)]
             for chunk in chunked_output:
                 yield json.dumps({"text": chunk, "is_end": False}) + "\n"
                 await asyncio.sleep(chunk_delay)
             yield json.dumps({"text": "", "is_end": True}) + "\n"
             break
+        encoded_input = tokenizer(
+            output_text, return_tensors="pt",
+            truncation=True, max_length=max_length
+        ).to(device)
 @app.post("/generate-image")
 async def generate_image(request: GenerateRequest):
         validated_body = request
         device = "cuda" if torch.cuda.is_available() else "cpu"
+        image_generator = pipeline(
+            "text-to-image", model=validated_body.model_name,
+            device=device
+        )
         image = image_generator(validated_body.input_text)[0]
         img_byte_arr = BytesIO()
         image.save(img_byte_arr, format="PNG")
         img_byte_arr.seek(0)
+        return StreamingResponse(
+            img_byte_arr, media_type="image/png"
+        )
     except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Internal server error: {str(e)}"
+        )
 @app.post("/generate-text-to-speech")
 async def generate_text_to_speech(request: GenerateRequest):
     try:
         validated_body = request
         device = "cuda" if torch.cuda.is_available() else "cpu"
+        audio_generator = pipeline(
+            "text-to-speech", model=validated_body.model_name,
+            device=device
+        )
         audio = audio_generator(validated_body.input_text)[0]
         audio_byte_arr = BytesIO()
         audio.save(audio_byte_arr)
         audio_byte_arr.seek(0)
+        return StreamingResponse(
+            audio_byte_arr, media_type="audio/wav"
+        )
     except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Internal server error: {str(e)}"
+        )
 @app.post("/generate-video")
 async def generate_video(request: GenerateRequest):
     try:
         validated_body = request
         device = "cuda" if torch.cuda.is_available() else "cpu"
+        video_generator = pipeline(
+            "text-to-video", model=validated_body.model_name,
+            device=device
+        )
         video = video_generator(validated_body.input_text)[0]
         video_byte_arr = BytesIO()
         video.save(video_byte_arr)
         video_byte_arr.seek(0)
+        return StreamingResponse(
+            video_byte_arr, media_type="video/mp4"
+        )
     except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Internal server error: {str(e)}"
+        )
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)