"""FastAPI service that loads causal-LM checkpoints and serves generation requests.

Checkpoints are looked up in an S3 bucket first and fetched from the Hugging
Face Hub when the bucket has no copy; Hub downloads are then mirrored to S3.
"""

import asyncio
import logging
import os
import tempfile
import threading
from io import BytesIO

import boto3
import numpy as np
import requests
import soundfile as sf
import torch
import uvicorn
from fastapi import FastAPI, HTTPException, Response, Request
from fastapi.responses import StreamingResponse
from huggingface_hub import hf_hub_download
from pydantic import BaseModel
from tqdm import tqdm
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    GenerationConfig,
)

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Credentials and targets are read from the environment once, at import time.
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_REGION = os.getenv("AWS_REGION")
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
HUGGINGFACE_HUB_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")


class GenerateRequest(BaseModel):
    """Request body accepted by ``POST /generate``."""

    model_name: str
    input_text: str
    task_type: str
    temperature: float = 1.0
    # Used as the size of each generation round, not as a cap on the whole
    # response (see ``_stream_generated_text``).
    max_new_tokens: int = 200
    stream: bool = False
    top_p: float = 1.0
    top_k: int = 50
    repetition_penalty: float = 1.0
    num_return_sequences: int = 1
    do_sample: bool = True


class S3ModelLoader:
    """Load model/tokenizer pairs from S3, falling back to the Hugging Face Hub."""

    def __init__(self, bucket_name, s3_client):
        """Store the bucket name and the boto3 S3 client used for all transfers."""
        self.bucket_name = bucket_name
        self.s3_client = s3_client

    def _get_s3_prefix(self, model_name):
        """Return the key prefix for *model_name* ('/' replaced by '-')."""
        return model_name.replace("/", "-")

    def _get_s3_uri(self, model_name):
        """Return the ``s3://bucket/prefix`` URI for *model_name*."""
        return f"s3://{self.bucket_name}/{self._get_s3_prefix(model_name)}"

    def download_model_from_s3(self, model_name):
        """Try to load *model_name* from the bucket.

        Every object under the model's prefix is copied into a temporary
        directory, which is then handed to ``from_pretrained``. The same
        prefix is used by ``upload_model_to_s3`` so that uploads can be found
        again.

        Returns:
            ``(model, tokenizer)`` on success, ``(None, None)`` when the bucket
            holds no copy or anything goes wrong (the caller then falls back
            to the Hub).
        """
        prefix = f"{self._get_s3_prefix(model_name)}/"
        try:
            with tempfile.TemporaryDirectory() as local_dir:
                paginator = self.s3_client.get_paginator("list_objects_v2")
                found_any = False
                for page in paginator.paginate(Bucket=self.bucket_name, Prefix=prefix):
                    for obj in page.get("Contents", []):
                        relative_key = obj["Key"][len(prefix):]
                        parts = relative_key.split("/")
                        # Skip "directory" placeholder keys and anything that
                        # would escape the temporary directory.
                        if not relative_key or relative_key.endswith("/") or ".." in parts:
                            continue
                        target = os.path.join(local_dir, *parts)
                        os.makedirs(os.path.dirname(target), exist_ok=True)
                        self.s3_client.download_file(self.bucket_name, obj["Key"], target)
                        found_any = True
                if not found_any:
                    return None, None
                config = AutoConfig.from_pretrained(local_dir)
                model = AutoModelForCausalLM.from_pretrained(local_dir, config=config)
                tokenizer = AutoTokenizer.from_pretrained(local_dir)
                return model, tokenizer
        except Exception:
            # Deliberate best-effort: any failure means "not available in S3".
            logger.warning("Could not load %s from S3", model_name, exc_info=True)
            return None, None

    async def load_model_and_tokenizer(self, model_name):
        """Return ``(model, tokenizer)`` for *model_name*, preferring the S3 copy.

        Raises:
            HTTPException: status 500 when neither source yields a model.
        """
        try:
            model, tokenizer = self.download_model_from_s3(model_name)
            if model is None or tokenizer is None:
                model, tokenizer = await self.download_and_save_model_from_huggingface(model_name)
            return model, tokenizer
        except HTTPException:
            # Already carries a precise message; do not wrap it a second time.
            raise
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Error loading model: {e}") from e

    async def download_and_save_model_from_huggingface(self, model_name):
        """Fetch *model_name* from the Hub and mirror it to S3.

        The S3 upload is best-effort: a failure there is logged and the loaded
        model is still returned.

        Raises:
            HTTPException: status 500 when the Hub download/load fails.
        """
        try:
            # No custom progress-bar argument is passed: unknown keyword
            # arguments are forwarded to the model constructor and make the
            # load fail.
            model = AutoModelForCausalLM.from_pretrained(model_name, token=HUGGINGFACE_HUB_TOKEN)
            tokenizer = AutoTokenizer.from_pretrained(model_name, token=HUGGINGFACE_HUB_TOKEN)
        except Exception as e:
            raise HTTPException(
                status_code=500,
                detail=f"Error downloading model from Hugging Face: {e}",
            ) from e
        try:
            self.upload_model_to_s3(model_name, model, tokenizer)
        except HTTPException as e:
            logger.warning("Model %s loaded but not cached in S3: %s", model_name, e.detail)
        return model, tokenizer

    def upload_model_to_s3(self, model_name, model, tokenizer):
        """Serialize *model* and *tokenizer* locally and upload the files to S3.

        Raises:
            HTTPException: status 500 when saving or uploading fails.
        """
        prefix = self._get_s3_prefix(model_name)
        try:
            with tempfile.TemporaryDirectory() as local_dir:
                model.save_pretrained(local_dir)
                tokenizer.save_pretrained(local_dir)
                for root, _dirs, filenames in os.walk(local_dir):
                    for filename in filenames:
                        local_path = os.path.join(root, filename)
                        relative = os.path.relpath(local_path, local_dir).replace(os.sep, "/")
                        self.s3_client.upload_file(
                            local_path, self.bucket_name, f"{prefix}/{relative}"
                        )
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Error saving model to S3: {e}") from e


app = FastAPI()
s3_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_REGION,
)
model_loader = S3ModelLoader(S3_BUCKET_NAME, s3_client)


async def _stream_generated_text(model, tokenizer, device, body):
    """Yield newly generated text in rounds of at most ``body.max_new_tokens`` tokens.

    Generation continues round after round until the model stops on its own
    or the prompt plus generated text fills ``max_position_embeddings``.
    Only the continuation is yielded, never the prompt.
    """
    max_length = model.config.max_position_embeddings
    eos_token_id = tokenizer.eos_token_id
    text = body.input_text
    while True:
        inputs = tokenizer(text, return_tensors="pt").to(device)
        input_length = inputs.input_ids.shape[1]
        remaining_tokens = max_length - input_length
        if remaining_tokens <= 0:
            break
        round_tokens = min(body.max_new_tokens, remaining_tokens)
        # A fresh config per round, so no state leaks between rounds.
        generation_config = GenerationConfig(
            temperature=body.temperature,
            max_new_tokens=round_tokens,
            top_p=body.top_p,
            top_k=body.top_k,
            repetition_penalty=body.repetition_penalty,
            do_sample=body.do_sample,
            num_return_sequences=body.num_return_sequences,
        )
        output = model.generate(**inputs, generation_config=generation_config)
        # The output starts with the prompt tokens; keep only what is new.
        new_tokens = output[0][input_length:]
        chunk = tokenizer.decode(new_tokens, skip_special_tokens=True)
        if not chunk:
            # Nothing printable was produced; another round would not progress.
            break
        yield chunk
        stopped_early = new_tokens.shape[0] < round_tokens
        hit_eos = eos_token_id is not None and bool((new_tokens == eos_token_id).any())
        if stopped_early or hit_eos:
            break
        text += chunk


def _encode_png(image):
    """Return PNG bytes for *image*.

    NOTE(review): assumes the pipeline returns a PIL-style image exposing
    ``save``; anything else falls back to ``tobytes()`` as before — confirm.
    """
    if hasattr(image, "save"):
        buffer = BytesIO()
        image.save(buffer, format="PNG")
        return buffer.getvalue()
    return image.tobytes()


def _encode_wav(audio):
    """Return WAV bytes for a pipeline result with ``audio`` and ``sampling_rate`` keys."""
    # Squeeze a leading batch/channel axis of size 1.
    # NOTE(review): multi-channel output layout is not handled — confirm shape.
    samples = np.asarray(audio["audio"]).squeeze()
    buffer = BytesIO()
    # soundfile's order is (file, data, samplerate); a file-like target needs an
    # explicit format. PCM_16 lets soundfile scale float samples itself.
    sf.write(buffer, samples, audio["sampling_rate"], format="WAV", subtype="PCM_16")
    return buffer.getvalue()


@app.post("/generate")
async def generate(request: Request, body: GenerateRequest):
    """Run the task named by ``body.task_type`` on ``body.input_text``.

    Returns:
        A JSON ``{"result": ...}`` or a streaming plain-text response for
        ``text-to-text``; a binary ``Response`` for the media task types.

    Raises:
        HTTPException: 400 for an unknown task type, 500 for any failure.
    """
    try:
        model, tokenizer = await model_loader.load_model_and_tokenizer(body.model_name)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)

        if body.task_type == "text-to-text":
            chunks = _stream_generated_text(model, tokenizer, device, body)
            if body.stream:
                return StreamingResponse(chunks, media_type="text/plain")
            pieces = [chunk async for chunk in chunks]
            return {"result": "".join(pieces)}

        if body.task_type == "text-to-image":
            # NOTE(review): the model is always loaded as a causal LM; confirm
            # that this pipeline task is available for it.
            generator = pipeline("text-to-image", model=model, tokenizer=tokenizer, device=device)
            image = generator(body.input_text)[0]
            return Response(content=_encode_png(image), media_type="image/png")

        if body.task_type == "text-to-speech":
            generator = pipeline("text-to-speech", model=model, tokenizer=tokenizer, device=device)
            audio = generator(body.input_text)
            return Response(content=_encode_wav(audio), media_type="audio/wav")

        if body.task_type == "text-to-video":
            try:
                generator = pipeline("text-to-video", model=model, tokenizer=tokenizer, device=device)
                # NOTE(review): assumes the pipeline result is already MP4 bytes — confirm.
                video = generator(body.input_text)
                return Response(content=video, media_type="video/mp4")
            except Exception as e:
                raise HTTPException(
                    status_code=500,
                    detail=f"Error in text-to-video generation: {e}",
                ) from e

        raise HTTPException(status_code=400, detail="Unsupported task type")
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e


def download_all_models_in_background():
    """Fetch the Hub model listing and download/mirror each listed model.

    Runs in a worker thread, so failures are logged rather than raised: an
    ``HTTPException`` raised here would never reach a client.
    """
    models_url = "https://huggingface.co/api/models"
    try:
        response = requests.get(models_url, timeout=30)
        response.raise_for_status()
        models = response.json()
    except (requests.RequestException, ValueError):
        logger.exception("Error al obtener la lista de modelos.")
        return

    for model in models:
        model_name = model.get("id") if isinstance(model, dict) else None
        if not model_name:
            continue
        try:
            # The loader method is a coroutine function; it must be driven by
            # an event loop or it never executes.
            asyncio.run(model_loader.download_and_save_model_from_huggingface(model_name))
        except Exception:
            # One unloadable model must not stop the rest of the listing.
            logger.exception(
                "Error al descargar modelos en segundo plano. model_name=%s", model_name
            )


def run_in_background():
    """Start ``download_all_models_in_background`` on a daemon thread."""
    threading.Thread(target=download_all_models_in_background, daemon=True).start()


@app.on_event("startup")
async def startup_event():
    """Kick off the background model download when the application starts."""
    run_in_background()


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)