import os
import torch
import multiprocessing
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from dotenv import load_dotenv
from accelerate import Accelerator

# Load environment variables from a .env file (useful for local development)
load_dotenv()

# Initialize the FastAPI app; serve the interactive docs at the root path
app = FastAPI(
    description="Use the Llama-3.2-1B-Instruct model via the API",
    docs_url="/",
    redoc_url="/doc",
)

# Read the Hugging Face token from the environment
HF_TOKEN = os.getenv("HF_TOKEN")
MODEL = "meta-llama/Llama-3.2-1B-Instruct"

# Auto-select CPU or GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Let PyTorch use all available CPU cores when running on CPU
torch.set_num_threads(multiprocessing.cpu_count())

# Initialize Accelerator for managing device allocation
accelerator = Accelerator()

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    token=HF_TOKEN,
    # Use bfloat16 on CPU, float16 on GPU
    torch_dtype=torch.bfloat16 if device == "cpu" else torch.float16,
    device_map="auto",
)

# Prepare the model for the current device setup with accelerate
# (the tokenizer is passed through unchanged)
model, tokenizer = accelerator.prepare(model, tokenizer)


# Pydantic model describing the request body
class PromptRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 100
    temperature: float = 0.7


@app.post("/generate/")
async def generate_text(request: PromptRequest):
    # Tokenize the prompt and move the tensors to the model's device
    inputs = tokenizer(request.prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=request.max_new_tokens,
            temperature=request.temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": response}
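To try the endpoint, the script can be served with uvicorn and queried with a small client. The sketch below assumes the file above is saved as main.py and that the server runs on uvicorn's default port 8000; both names are assumptions for illustration, not part of the original script.

# Minimal client sketch (assumes: file saved as main.py, server started with
#   uvicorn main:app --host 0.0.0.0 --port 8000
# and the `requests` package installed).
import requests

resp = requests.post(
    "http://localhost:8000/generate/",  # assumed host/port, adjust as needed
    json={
        "prompt": "Write a haiku about GPUs.",
        "max_new_tokens": 64,
        "temperature": 0.7,
    },
    timeout=120,  # generation on CPU can be slow
)
resp.raise_for_status()
print(resp.json()["response"])

Because the endpoint decodes the full output sequence, the response includes the original prompt followed by the generated continuation.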