"""FastAPI service that loads a Hugging Face model (cached in S3) and runs it.

A single ``POST /generate`` endpoint dispatches on ``task_type`` to text
generation or to a ``transformers`` pipeline for image/audio/video output.
"""

import asyncio
import logging
import os
import tempfile
import time
from io import BytesIO

import boto3
import numpy as np
import soundfile as sf
import torch
import uvicorn
from botocore.exceptions import BotoCoreError, ClientError
from fastapi import FastAPI, HTTPException, Response, Request
from fastapi.responses import StreamingResponse
from huggingface_hub import hf_hub_download
from pydantic import BaseModel
from tqdm import tqdm
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    GenerationConfig,
)

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_REGION = os.getenv("AWS_REGION")
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
HUGGINGFACE_HUB_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")

# Task types the /generate endpoint dispatches on.
_SUPPORTED_TASK_TYPES = frozenset(
    {
        "text-to-text",
        "text-to-image",
        "text-to-speech",
        "text-to-audio",
        "text-to-video",
    }
)


class GenerateRequest(BaseModel):
    """Request body accepted by ``POST /generate``."""

    model_name: str  # Hub repo id; also used to derive the S3 cache prefix
    input_text: str  # prompt handed to the model / pipeline
    task_type: str  # one of _SUPPORTED_TASK_TYPES
    temperature: float = 1.0
    max_new_tokens: int = 200  # upper bound for EACH generated chunk
    stream: bool = False  # True -> StreamingResponse, False -> JSON body
    top_p: float = 1.0
    top_k: int = 50
    repetition_penalty: float = 1.0
    num_return_sequences: int = 1
    do_sample: bool = True
    chunk_delay: float = 0.0  # seconds to wait after each yielded chunk


class S3ModelLoader:
    """Load models from an S3 bucket, falling back to the Hugging Face Hub.

    ``transformers`` cannot read or write ``s3://`` locations directly, so
    files are staged through a temporary local directory and moved with the
    boto3 client supplied at construction time.
    """

    def __init__(self, bucket_name, s3_client):
        self.bucket_name = bucket_name
        self.s3_client = s3_client

    def _get_s3_uri(self, model_name):
        """Return the ``s3://`` URI under which ``model_name`` is cached."""
        return f"s3://{self.bucket_name}/{model_name.replace('/', '-')}"

    def _get_s3_prefix(self, model_name):
        """Return the object-key prefix matching :meth:`_get_s3_uri`."""
        return f"{model_name.replace('/', '-')}/"

    def _download_prefix(self, prefix, local_dir):
        """Copy every object under ``prefix`` into ``local_dir``.

        Returns ``True`` when at least one file was downloaded.
        """
        downloaded_any = False
        paginator = self.s3_client.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=self.bucket_name, Prefix=prefix):
            for entry in page.get("Contents", []):
                key = entry["Key"]
                parts = key[len(prefix):].split("/")
                # Skip "directory" placeholder keys and anything that could
                # escape the staging directory.
                if not parts[-1] or ".." in parts:
                    continue
                target = os.path.join(local_dir, *parts)
                os.makedirs(os.path.dirname(target), exist_ok=True)
                self.s3_client.download_file(self.bucket_name, key, target)
                downloaded_any = True
        return downloaded_any

    def _upload_dir(self, local_dir, prefix):
        """Upload every file below ``local_dir`` to the bucket under ``prefix``."""
        for root, _dirs, filenames in os.walk(local_dir):
            for filename in filenames:
                path = os.path.join(root, filename)
                relative = os.path.relpath(path, local_dir).replace(os.sep, "/")
                self.s3_client.upload_file(path, self.bucket_name, prefix + relative)

    def _try_load_from_s3(self, model_name):
        """Return ``(model, tokenizer)`` from the S3 cache, or ``None`` on a miss."""
        prefix = self._get_s3_prefix(model_name)
        try:
            logging.info(f"Trying to load {model_name} from S3...")
            # Weights are read into memory by from_pretrained, so the staging
            # directory can be discarded once loading has finished.
            with tempfile.TemporaryDirectory() as local_dir:
                if not self._download_prefix(prefix, local_dir):
                    return None
                config = AutoConfig.from_pretrained(local_dir)
                model = AutoModelForCausalLM.from_pretrained(local_dir, config=config)
                tokenizer = AutoTokenizer.from_pretrained(local_dir)
            logging.info(f"Loaded {model_name} from S3 successfully.")
            return model, tokenizer
        except (BotoCoreError, ClientError) as e:
            # Credentials/network/bucket problems: treat as a cache miss.
            logging.warning(f"S3 lookup for {model_name} failed: {e}")
        except EnvironmentError as e:
            # Incomplete or unreadable cached files: treat as a cache miss.
            logging.warning(f"Cached copy of {model_name} in S3 is unusable: {e}")
        return None

    async def load_model_and_tokenizer(self, model_name):
        """Return ``(model, tokenizer)`` for ``model_name``.

        The S3 cache is tried first. On a miss the model is downloaded from
        the Hugging Face Hub and then uploaded to S3 for later requests.

        Raises:
            HTTPException: status 500 when the Hub download or the S3 upload
                fails.

        NOTE(review): everything here is blocking work executed inside an
        ``async def``; the event loop is stalled while a model loads.
        """
        cached = self._try_load_from_s3(model_name)
        if cached is not None:
            return cached

        logging.info(f"Model {model_name} not found in S3. Downloading...")
        try:
            # The original passed ``_tqdm=...`` here; that is not a
            # from_pretrained argument and is forwarded to the model
            # constructor, which rejects it with a TypeError.
            model = AutoModelForCausalLM.from_pretrained(model_name, token=HUGGINGFACE_HUB_TOKEN)
            tokenizer = AutoTokenizer.from_pretrained(model_name, token=HUGGINGFACE_HUB_TOKEN)
            logging.info(f"Downloaded {model_name} successfully.")
            logging.info(f"Saving {model_name} to S3...")
            with tempfile.TemporaryDirectory() as staging_dir:
                model.save_pretrained(staging_dir)
                tokenizer.save_pretrained(staging_dir)
                self._upload_dir(staging_dir, self._get_s3_prefix(model_name))
            logging.info(f"Saved {model_name} to S3 successfully.")
            return model, tokenizer
        except Exception as e:
            logging.error(f"Error downloading/uploading model: {e}")
            raise HTTPException(status_code=500, detail=f"Error loading model: {e}") from e


app = FastAPI()
s3_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_REGION,
)
model_loader = S3ModelLoader(S3_BUCKET_NAME, s3_client)


async def _stream_text(model, tokenizer, device, body):
    """Yield generated text for ``body.input_text`` chunk by chunk.

    Each chunk is produced by one ``model.generate`` call limited to
    ``body.max_new_tokens`` new tokens. Generation continues from the tokens
    produced so far until the model emits its end-of-sequence token, returns
    fewer tokens than requested, or the context window
    (``model.config.max_position_embeddings``) is full.

    Joined together, the chunks equal the decoded full sequence: the first
    chunk contains the prompt followed by the first continuation, and later
    chunks contain only newly generated text.

    NOTE(review): ``model.generate`` is synchronous and runs on the event
    loop; consider moving it to a worker thread if concurrency matters.
    """
    generation_config = GenerationConfig(
        temperature=body.temperature,
        max_new_tokens=body.max_new_tokens,
        top_p=body.top_p,
        top_k=body.top_k,
        repetition_penalty=body.repetition_penalty,
        do_sample=body.do_sample,
        num_return_sequences=body.num_return_sequences,
    )
    max_length = model.config.max_position_embeddings
    encoded_input = tokenizer(
        body.input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    ).to(device)
    input_ids = encoded_input["input_ids"]
    eos_token_id = tokenizer.eos_token_id
    emitted_text = ""

    while True:
        input_length = input_ids.shape[1]
        step_budget = min(max_length - input_length, body.max_new_tokens)
        if step_budget <= 0:
            break
        generation_config.max_new_tokens = step_budget
        output = model.generate(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
            generation_config=generation_config,
        )
        new_token_ids = output[0, input_length:].tolist()

        # Decode the whole sequence and emit only the unseen suffix, so the
        # prompt and earlier chunks are never repeated.
        full_text = tokenizer.decode(output[0], skip_special_tokens=True)
        chunk = full_text[len(emitted_text):]
        emitted_text = full_text
        if chunk:
            yield chunk
            # Non-blocking pause; time.sleep would freeze the event loop.
            await asyncio.sleep(body.chunk_delay)

        reached_eos = eos_token_id is not None and eos_token_id in new_token_ids
        if reached_eos or len(new_token_ids) < step_budget:
            break
        # Continue from the first returned sequence only.
        input_ids = output[:1]


def _encode_png(image):
    """Return ``image`` encoded as PNG bytes.

    Assumes a PIL-style object exposing ``save(fp, format=...)`` — TODO
    confirm against the pipeline actually used.

    Raises:
        TypeError: when ``image`` has no ``save`` method.
    """
    if not hasattr(image, "save"):
        raise TypeError(f"Cannot encode {type(image).__name__} as PNG")
    buffer = BytesIO()
    image.save(buffer, format="PNG")
    return buffer.getvalue()


def _encode_wav(samples, sampling_rate):
    """Return ``samples`` encoded as a 16-bit PCM WAV file.

    Floating-point input is passed to soundfile unchanged in scale; with
    ``subtype="PCM_16"`` soundfile performs the conversion to 16-bit itself.
    """
    data = np.squeeze(np.asarray(samples))
    if np.issubdtype(data.dtype, np.floating):
        # soundfile does not accept float16.
        data = data.astype(np.float32)
    # soundfile expects (frames, channels); presumably a 2-D array with a
    # short first axis is (channels, frames) — verify against the model used.
    if data.ndim == 2 and data.shape[0] < data.shape[1]:
        data = data.T
    buffer = BytesIO()
    sf.write(buffer, data, sampling_rate, format="WAV", subtype="PCM_16")
    return buffer.getvalue()


@app.post("/generate")
async def generate(request: Request, body: GenerateRequest):
    """Run ``body.model_name`` on ``body.input_text`` for ``body.task_type``.

    Returns:
        * ``text-to-text``: a plain-text ``StreamingResponse`` when
          ``body.stream`` is true, otherwise ``{"result": <text>}``.
        * ``text-to-image``: a PNG ``Response``.
        * ``text-to-speech`` / ``text-to-audio``: a WAV ``Response``.
        * ``text-to-video``: a ``Response`` labelled ``video/mp4``.

    Raises:
        HTTPException: 400 for an unsupported task type, 500 for any failure
            while loading the model or generating output.
    """
    # Reject unknown tasks before paying for a model download.
    if body.task_type not in _SUPPORTED_TASK_TYPES:
        raise HTTPException(status_code=400, detail="Unsupported task type")

    try:
        model, tokenizer = await model_loader.load_model_and_tokenizer(body.model_name)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)

        if body.task_type == "text-to-text":
            chunks = _stream_text(model, tokenizer, device, body)
            if body.stream:
                return StreamingResponse(chunks, media_type="text/plain")
            return {"result": "".join([chunk async for chunk in chunks])}

        # NOTE(review): the model is always loaded with AutoModelForCausalLM;
        # confirm that is appropriate for the non-text tasks below.
        if body.task_type == "text-to-image":
            # NOTE(review): confirm "text-to-image" is a registered pipeline
            # task in the installed transformers version.
            generator = pipeline("text-to-image", model=model, tokenizer=tokenizer, device=device)
            image = generator(body.input_text)[0]
            return Response(content=_encode_png(image), media_type="image/png")

        if body.task_type in ("text-to-speech", "text-to-audio"):
            generator = pipeline(body.task_type, model=model, tokenizer=tokenizer, device=device)
            audio = generator(body.input_text)
            audio_bytes = _encode_wav(audio["audio"], audio["sampling_rate"])
            return Response(content=audio_bytes, media_type="audio/wav")

        # Only "text-to-video" remains after the checks above.
        try:
            # NOTE(review): confirm "text-to-video" is a registered pipeline
            # task and that its output is already MP4-encoded bytes.
            generator = pipeline("text-to-video", model=model, tokenizer=tokenizer, device=device)
            video = generator(body.input_text)
            return Response(content=video, media_type="video/mp4")
        except Exception as e:
            raise HTTPException(
                status_code=500,
                detail=f"Error in text-to-video generation: {e}",
            ) from e
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)