Hjgugugjhuhjggg committed
Commit d05ede6 · verified · 1 Parent(s): e079cb9

Update app.py

Files changed (1)
  1. app.py +263 -82
app.py CHANGED
@@ -3,15 +3,23 @@ import torch
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import StreamingResponse, JSONResponse
 from pydantic import BaseModel, field_validator
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StoppingCriteria, StoppingCriteriaList, pipeline
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    GenerationConfig,
+    StoppingCriteria,
+    StoppingCriteriaList,
+    pipeline,
+)
 import boto3
 import uvicorn
 import asyncio
 import json
 from huggingface_hub import login
 from botocore.exceptions import NoCredentialsError
-from functools import cached_property
 import base64
+

 AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
 AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
@@ -19,79 +25,107 @@ AWS_REGION = os.getenv("AWS_REGION")
 S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
 HUGGINGFACE_HUB_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")

+
 if HUGGINGFACE_HUB_TOKEN:
-    login(token=HUGGINGFACE_HUB_TOKEN,add_to_git_credential=False)
+    login(token=HUGGINGFACE_HUB_TOKEN,
+          add_to_git_credential=False)
+
+s3_client = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY_ID,
+                         aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+                         region_name=AWS_REGION)

-s3_client = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY_ID,aws_secret_access_key=AWS_SECRET_ACCESS_KEY,region_name=AWS_REGION)
 app = FastAPI()

 class GenerateRequest(BaseModel):
     model_name: str
     input_text: str = ""
     task_type: str
-    temperature: float = 0.01
-    max_new_tokens: int = 20
+    temperature: float = 1.0
+    max_new_tokens: int = 3
     stream: bool = True
     top_p: float = 1.0
-    top_k: int = 1
+    top_k: int = 50
     repetition_penalty: float = 1.0
     num_return_sequences: int = 1
-    do_sample: bool = False
+    do_sample: bool = True
     stop_sequences: list[str] = []
-    quantize: bool = False
-    use_onnx: bool = False
+
     @field_validator("model_name")
     def model_name_cannot_be_empty(cls, v):
         if not v:
             raise ValueError("model_name cannot be empty.")
         return v
+
     @field_validator("task_type")
     def task_type_must_be_valid(cls, v):
-        valid_types = ["text-to-text", "text-to-image","text-to-speech", "text-to-video"]
+        valid_types = ["text-to-text", "text-to-image",
+                       "text-to-speech", "text-to-video"]
         if v not in valid_types:
             raise ValueError(f"task_type must be one of: {valid_types}")
         return v
+
+model_data = {}  # Global dictionary to store model data
+
 class S3ModelLoader:
     def __init__(self, bucket_name, s3_client):
         self.bucket_name = bucket_name
         self.s3_client = s3_client
-        self.model_cache = {}
+
     def _get_s3_uri(self, model_name):
-        return f"s3://{self.bucket_name}/{model_name.replace('/', '-')}"
-    async def _load_model_and_tokenizer(self, model_name, quantize, use_onnx):
+        return f"s3://{self.bucket_name}/" \
+               f"{model_name.replace('/', '-')}"
+
+    async def load_model_and_tokenizer(self, model_name):
+        if model_name in model_data:
+            return model_data[model_name]["model"], model_data[model_name]["tokenizer"]
+
         s3_uri = self._get_s3_uri(model_name)
         try:
-            config = AutoConfig.from_pretrained(s3_uri, local_files_only=False)
-            model = AutoModelForCausalLM.from_pretrained(s3_uri, config=config, local_files_only=False).to(self.device)
-            tokenizer = AutoTokenizer.from_pretrained(s3_uri, config=config, local_files_only=False)
-            if tokenizer.eos_token_id is not None and tokenizer.pad_token_id is None:
-                tokenizer.pad_token_id = config.pad_token_id or tokenizer.eos_token_id
+
+            config = AutoConfig.from_pretrained(
+                s3_uri, local_files_only=False
+            )
+
+            model = AutoModelForCausalLM.from_pretrained(
+                s3_uri, config=config, local_files_only=False
+            )
+
+            tokenizer = AutoTokenizer.from_pretrained(
+                s3_uri, config=config, local_files_only=False
+            )
+
+            if tokenizer.eos_token_id is not None and \
+                    tokenizer.pad_token_id is None:
+                tokenizer.pad_token_id = config.pad_token_id \
+                    or tokenizer.eos_token_id
+            model_data[model_name] = {"model":model, "tokenizer":tokenizer}
             return model, tokenizer
         except (EnvironmentError, NoCredentialsError):
             try:
-                config = AutoConfig.from_pretrained(model_name, token=HUGGINGFACE_HUB_TOKEN)
-                tokenizer = AutoTokenizer.from_pretrained(model_name, config=config, token=HUGGINGFACE_HUB_TOKEN)
-                model = AutoModelForCausalLM.from_pretrained(model_name, config=config, token=HUGGINGFACE_HUB_TOKEN).to(self.device)
-                if tokenizer.eos_token_id is not None and tokenizer.pad_token_id is None:
-                    tokenizer.pad_token_id = config.pad_token_id or tokenizer.eos_token_id
+                config = AutoConfig.from_pretrained(
+                    model_name, token=HUGGINGFACE_HUB_TOKEN
+                )
+                tokenizer = AutoTokenizer.from_pretrained(
+                    model_name, config=config, token=HUGGINGFACE_HUB_TOKEN
+                )
+
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_name, config=config, token=HUGGINGFACE_HUB_TOKEN
+                )
+
+
+                if tokenizer.eos_token_id is not None and \
+                        tokenizer.pad_token_id is None:
+                    tokenizer.pad_token_id = config.pad_token_id \
+                        or tokenizer.eos_token_id
+
+                model_data[model_name] = {"model":model, "tokenizer":tokenizer}
                 return model, tokenizer
             except Exception as e:
-                raise HTTPException(status_code=500, detail=f"Error loading model: {e}")
-    @cached_property
-    def device(self):
-        return torch.device("cpu")
-    async def get_model_and_tokenizer(self, model_name, quantize, use_onnx):
-        key = f"{model_name}-{quantize}-{use_onnx}"
-        if key not in self.model_cache:
-            model, tokenizer = await self._load_model_and_tokenizer(model_name, quantize, use_onnx)
-            self.model_cache[key] = {"model":model, "tokenizer":tokenizer}
-        return self.model_cache[key]["model"], self.model_cache[key]["tokenizer"]
-    async def get_pipeline(self, model_name, task_type):
-        if model_name not in self.model_cache:
-            config = AutoConfig.from_pretrained(model_name, token=HUGGINGFACE_HUB_TOKEN)
-            model = pipeline(task_type, model=model_name,device=self.device, config=config)
-            self.model_cache[model_name] = {"model":model}
-        return self.model_cache[model_name]["model"]
+                raise HTTPException(
+                    status_code=500, detail=f"Error loading model: {e}"
+                )
+
 model_loader = S3ModelLoader(S3_BUCKET_NAME, s3_client)

 @app.post("/generate")
@@ -109,96 +143,243 @@ async def generate(request: GenerateRequest):
         num_return_sequences = request.num_return_sequences
         do_sample = request.do_sample
         stop_sequences = request.stop_sequences
-        quantize = request.quantize
-        use_onnx = request.use_onnx
-        model, tokenizer = await model_loader.get_model_and_tokenizer(model_name, quantize, use_onnx)
+
+        model, tokenizer = await model_loader.load_model_and_tokenizer(model_name)
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        model.to(device)
+
         if "text-to-text" == task_type:
-            generation_config = GenerationConfig(temperature=temperature,max_new_tokens=max_new_tokens,top_p=top_p,top_k=top_k,repetition_penalty=repetition_penalty,do_sample=do_sample,num_return_sequences=num_return_sequences,eos_token_id = tokenizer.eos_token_id)
+            generation_config = GenerationConfig(
+                temperature=temperature,
+                max_new_tokens=max_new_tokens,
+                top_p=top_p,
+                top_k=top_k,
+                repetition_penalty=repetition_penalty,
+                do_sample=do_sample,
+                num_return_sequences=num_return_sequences,
+                eos_token_id = tokenizer.eos_token_id
+            )
             if stream:
-                return StreamingResponse(stream_text(model, tokenizer, input_text,generation_config, stop_sequences),media_type="text/plain")
+                return StreamingResponse(
+                    stream_text(model, tokenizer, input_text,
+                                generation_config, stop_sequences,
+                                device),
+                    media_type="text/plain"
+                )
             else:
-                result = await generate_text(model, tokenizer, input_text,generation_config, stop_sequences)
+                result = await generate_text(model, tokenizer, input_text,
+                                             generation_config, stop_sequences,
+                                             device)
                 return JSONResponse({"text": result, "is_end": True})
         else:
             return HTTPException(status_code=400, detail="Task type not text-to-text")
+
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+        raise HTTPException(
+            status_code=500, detail=f"Internal server error: {str(e)}"
+        )
+
 class StopOnSequences(StoppingCriteria):
     def __init__(self, stop_sequences, tokenizer):
         self.stop_sequences = stop_sequences
         self.tokenizer = tokenizer
         self.stop_ids = [tokenizer.encode(seq, add_special_tokens=False) for seq in stop_sequences]
+
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+
         decoded_text = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
+
         for stop_sequence in self.stop_sequences:
             if stop_sequence in decoded_text:
                 return True
         return False
-async def stream_text(model, tokenizer, input_text,generation_config, stop_sequences):
-    encoded_input = tokenizer(input_text, return_tensors="pt",truncation=True).to(model_loader.device)
+
+async def stream_text(model, tokenizer, input_text,
+                      generation_config, stop_sequences,
+                      device):
+
+    encoded_input = tokenizer(
+        input_text, return_tensors="pt",
+        truncation=True
+    ).to(device)
+
     stop_criteria = StopOnSequences(stop_sequences, tokenizer)
     stopping_criteria = StoppingCriteriaList([stop_criteria])
-    async for token in _stream_text(model, encoded_input, tokenizer, generation_config, stop_criteria, stopping_criteria):
-        yield json.dumps({"text":token, "is_end": False}) + "\n"
-    yield json.dumps({"text":"", "is_end": True}) + "\n"
-async def _stream_text(model, encoded_input, tokenizer, generation_config, stop_criteria, stopping_criteria):
+
     output_text = ""
+
     while True:
-        outputs = await asyncio.to_thread(model.generate,**encoded_input,do_sample=generation_config.do_sample,max_new_tokens=generation_config.max_new_tokens,temperature=generation_config.temperature,top_p=generation_config.top_p,top_k=generation_config.top_k,repetition_penalty=generation_config.repetition_penalty,num_return_sequences=generation_config.num_return_sequences,output_scores=True,return_dict_in_generate=True,stopping_criteria=stopping_criteria)
-        new_text = tokenizer.decode(outputs.sequences[0][len(encoded_input["input_ids"][0]):],skip_special_tokens=True)
-        if len(new_text) == 0:
-            if not stop_criteria(outputs.sequences, None):
-                for token in output_text.split():
-                    yield token
-            break
-        output_text += new_text
-        for token in new_text.split():
-            yield token
-        if stop_criteria(outputs.sequences, None):
-            break
-        encoded_input = tokenizer(output_text, return_tensors="pt",truncation=True).to(model_loader.device)
-        output_text=""
-async def generate_text(model, tokenizer, input_text,generation_config, stop_sequences):
-    encoded_input = tokenizer(input_text, return_tensors="pt",truncation=True).to(model_loader.device)
+
+        outputs = await asyncio.to_thread(model.generate,
+                                          **encoded_input,
+                                          do_sample=generation_config.do_sample,
+                                          max_new_tokens=generation_config.max_new_tokens,
+                                          temperature=generation_config.temperature,
+                                          top_p=generation_config.top_p,
+                                          top_k=generation_config.top_k,
+                                          repetition_penalty=generation_config.repetition_penalty,
+                                          num_return_sequences=generation_config.num_return_sequences,
+                                          output_scores=True,
+                                          return_dict_in_generate=True,
+                                          stopping_criteria=stopping_criteria
+                                          )
+
+        new_text = tokenizer.decode(
+            outputs.sequences[0][len(encoded_input["input_ids"][0]):],
+            skip_special_tokens=True
+        )
+
+        if len(new_text) == 0:
+            if not stop_criteria(outputs.sequences, None):
+                for text in output_text.split():
+                    yield json.dumps({"text": text, "is_end": False}) + "\n"
+            yield json.dumps({"text": "", "is_end": True}) + "\n"
+            break
+
+        output_text += new_text
+
+        for text in new_text.split():
+            yield json.dumps({"text": text, "is_end": False}) + "\n"
+
+        if stop_criteria(outputs.sequences, None):
+            yield json.dumps({"text": "", "is_end": True}) + "\n"
+            break
+
+        encoded_input = tokenizer(
+            output_text, return_tensors="pt",
+            truncation=True
+        ).to(device)
+        output_text = ""
+
+
+async def generate_text(model, tokenizer, input_text,
+                        generation_config, stop_sequences,
+                        device):
+    encoded_input = tokenizer(
+        input_text, return_tensors="pt",
+        truncation=True
+    ).to(device)
+
     stop_criteria = StopOnSequences(stop_sequences, tokenizer)
     stopping_criteria = StoppingCriteriaList([stop_criteria])
-    outputs = await asyncio.to_thread(model.generate,**encoded_input,do_sample=generation_config.do_sample,max_new_tokens=generation_config.max_new_tokens,temperature=generation_config.temperature,top_p=generation_config.top_p,top_k=generation_config.top_k,repetition_penalty=generation_config.repetition_penalty,num_return_sequences=num_return_sequences,output_scores=True,return_dict_in_generate=True,stopping_criteria=stopping_criteria)
-    generated_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
+
+    outputs = await asyncio.to_thread(model.generate,
+                                      **encoded_input,
+                                      do_sample=generation_config.do_sample,
+                                      max_new_tokens=generation_config.max_new_tokens,
+                                      temperature=generation_config.temperature,
+                                      top_p=generation_config.top_p,
+                                      top_k=generation_config.top_k,
+                                      repetition_penalty=generation_config.repetition_penalty,
+                                      num_return_sequences=generation_config.num_return_sequences,
+                                      output_scores=True,
+                                      return_dict_in_generate=True,
+                                      stopping_criteria=stopping_criteria
+                                      )
+
+
+    generated_text = tokenizer.decode(
+        outputs.sequences[0], skip_special_tokens=True
+    )
+
     return generated_text
+
 @app.post("/generate-image")
 async def generate_image(request: GenerateRequest):
     try:
         validated_body = request
-        model = await model_loader.get_pipeline(validated_body.model_name, "text-to-image")
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        if validated_body.model_name not in model_data:
+            config = AutoConfig.from_pretrained(
+                validated_body.model_name, token=HUGGINGFACE_HUB_TOKEN
+            )
+            model = pipeline(
+                "text-to-image", model=validated_body.model_name,
+                device=device, config=config
+            )
+            model_data[validated_body.model_name] = {"model":model}
+        else:
+            model = model_data[validated_body.model_name]["model"]
+
         image = model(validated_body.input_text)[0]
+
         image_data = list(image.getdata())
+
         return json.dumps({"image_data": image_data, "is_end": True})
+
     except Exception as e:
-        raise HTTPException(status_code=500,detail=f"Internal server error: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Internal server error: {str(e)}"
+        )
+
+
 @app.post("/generate-text-to-speech")
 async def generate_text_to_speech(request: GenerateRequest):
     try:
         validated_body = request
-        audio_generator = await model_loader.get_pipeline(validated_body.model_name, "text-to-speech")
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        if validated_body.model_name not in model_data:
+            config = AutoConfig.from_pretrained(
+                validated_body.model_name, token=HUGGINGFACE_HUB_TOKEN
+            )
+
+            audio_generator = pipeline(
+                "text-to-speech", model=validated_body.model_name,
+                device=device, config=config
+            )
+            model_data[validated_body.model_name] = {"model":audio_generator}
+        else:
+            audio_generator = model_data[validated_body.model_name]["model"]
+
         audio = audio_generator(validated_body.input_text)
+
+
         audio_bytes = audio["audio"]
+
         audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+
         return json.dumps({"audio": audio_base64, "is_end": True})
+
     except Exception as e:
-        raise HTTPException(status_code=500,detail=f"Internal server error: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Internal server error: {str(e)}"
+        )
+
+
 @app.post("/generate-video")
 async def generate_video(request: GenerateRequest):
     try:
         validated_body = request
-        video_generator = await model_loader.get_pipeline(validated_body.model_name, "text-to-video")
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        if validated_body.model_name not in model_data:
+            config = AutoConfig.from_pretrained(
+                validated_body.model_name, token=HUGGINGFACE_HUB_TOKEN
+            )
+
+            video_generator = pipeline(
+                "text-to-video", model=validated_body.model_name,
+                device=device, config=config
+            )
+            model_data[validated_body.model_name] = {"model":video_generator}
+        else:
+            video_generator = model_data[validated_body.model_name]["model"]
+
         video = video_generator(validated_body.input_text)
+
+
         video_base64 = base64.b64encode(video).decode('utf-8')
+
         return json.dumps({"video": video_base64, "is_end": True})
+
     except Exception as e:
-        raise HTTPException(status_code=500,detail=f"Internal server error: {str(e)}")
-async def load_all_models():
-    pass
+        raise HTTPException(
+            status_code=500,
+            detail=f"Internal server error: {str(e)}"
+        )
+
 if __name__ == "__main__":
-    import asyncio
-    asyncio.run(load_all_models())
     uvicorn.run(app, host="0.0.0.0", port=7860)
 
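For reference, a minimal client sketch for the streaming /generate endpoint added above. This is an illustration, not part of the commit: it assumes the server is reachable at localhost:7860 (matching the uvicorn.run call) and uses the requests library; "gpt2" is only a placeholder model name.

import json
import requests

payload = {
    "model_name": "gpt2",          # placeholder model, not mandated by the app
    "input_text": "Hello, world",
    "task_type": "text-to-text",
    "stream": True,
}

# The endpoint streams newline-delimited JSON objects of the form
# {"text": "...", "is_end": false}, terminated by {"text": "", "is_end": true}.
with requests.post("http://localhost:7860/generate", json=payload, stream=True) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        if not line:
            continue
        chunk = json.loads(line)
        if chunk["is_end"]:
            break
        print(chunk["text"], end=" ", flush=True)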
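The StopOnSequences class kept by this commit follows the stock transformers StoppingCriteria pattern: decode everything generated so far and stop as soon as any stop string appears. A self-contained sketch of the same pattern, using gpt2 purely as an illustrative model:

import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          StoppingCriteria, StoppingCriteriaList)

class StopOnSubstrings(StoppingCriteria):
    # Stop generation once any of the given substrings appears in the
    # decoded output. Re-decoding each step is simple but quadratic;
    # fine for short generations like the ones above.
    def __init__(self, stops, tokenizer):
        self.stops = stops
        self.tokenizer = tokenizer

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        text = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
        return any(stop in text for stop in self.stops)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("The quick brown fox", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=30,
    pad_token_id=tokenizer.eos_token_id,  # gpt2 ships without a pad token
    stopping_criteria=StoppingCriteriaList([StopOnSubstrings(["."], tokenizer)]),
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))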
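Likewise, the eos_token_id/pad_token_id fallback in load_model_and_tokenizer is the usual idiom for tokenizers that define an EOS token but no PAD token; in isolation (gpt2 again only as an example):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # defines eos_token but no pad_token
if tok.pad_token_id is None and tok.eos_token_id is not None:
    tok.pad_token_id = tok.eos_token_id  # reuse EOS as PAD so generate() can pad
print(tok.pad_token_id == tok.eos_token_id)  # True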