import os
import multiprocessing
from typing import List, Tuple

import torch
from accelerate import Accelerator
from dotenv import load_dotenv
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load environment variables from a .env file (useful for local development)
load_dotenv()

# Description shown on the docs page: Buy Me a Coffee badge plus usage instructions
html_content = """
Llama-3.2-1B-Instruct-API
Buy Me A Coffee

Please Chill Out! 😎

This API takes around 5.62 minutes to process a single request due to current hardware limitations.

Want Faster Responses? Help Me Out! 🚀

If you'd like to see this API running faster on high-performance A100 hardware, please consider buying me a coffee. ☕ Your support will go towards upgrading to Hugging Face Pro, which will allow me to run A100-powered spaces for everyone! 🙌

Instructions to Clone and Run Locally:

  1. Clone the Repository:
    git clone https://huggingface.co/spaces/xxparthparekhxx/llama-3.2-1B-FastApi
    cd llama-3.2-1B-FastApi
  2. Run the Docker container:
    docker build -t llama-api .
    docker run -p 7860:7860 llama-api
  3. Access the API locally:

    Open http://localhost:7860 to view the interactive API docs, then try a request as shown in the next step.
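  4. Try a request (for example with curl; the JSON fields mirror the /generate/ endpoint's parameters):
    curl -X POST http://localhost:7860/generate/ -H "Content-Type: application/json" -d '{"prompt": "Hello!", "max_new_tokens": 50}'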

""" # FastAPI app with embedded Buy Me a Coffee badge and instructions app = FastAPI( title="Llama-3.2-1B-Instruct-API", description= html_content, docs_url="/", # URL for Swagger docs redoc_url="/doc" # URL for ReDoc docs ) HF_TOKEN = os.getenv("HF_TOKEN") MODEL = "meta-llama/Llama-3.2-1B-Instruct" device = "cuda" if torch.cuda.is_available() else "cpu" print(f"Using device: {device}") torch.set_num_threads(multiprocessing.cpu_count()) accelerator = Accelerator() tokenizer = AutoTokenizer.from_pretrained(MODEL, token=HF_TOKEN, use_fast=True) model = AutoModelForCausalLM.from_pretrained( MODEL, token=HF_TOKEN, torch_dtype=torch.float16, device_map=device ) model, tokenizer = accelerator.prepare(model, tokenizer) # Pydantic models for request validation class PromptRequest(BaseModel): prompt: str max_new_tokens: int = 100 temperature: float = 0.7 class ChatRequest(BaseModel): message: str history: List[Tuple[str, str]] = [] max_new_tokens: int = 100 temperature: float = 0.7 system_prompt: str = "You are a helpful assistant." # Endpoints @app.post("/generate/") async def generate_text(request: PromptRequest): inputs = tokenizer(request.prompt, return_tensors="pt").to(device) with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=request.max_new_tokens, temperature=request.temperature, do_sample=False, pad_token_id=tokenizer.eos_token_id ) response = tokenizer.decode(outputs[0], skip_special_tokens=True) return {"response": response} @app.post("/chat/") async def chat(request: ChatRequest): conversation = [ {"role": "system", "content": request.system_prompt} ] for human, assistant in request.history: conversation.extend([ {"role": "user", "content": human}, {"role": "assistant", "content": assistant} ]) conversation.append({"role": "user", "content": request.message}) input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(device) with torch.no_grad(): outputs = model.generate( input_ids, max_new_tokens=request.max_new_tokens, temperature=request.temperature, do_sample=False, pad_token_id=tokenizer.eos_token_id ) response = tokenizer.decode(outputs[0], skip_special_tokens=True) # Extract only the assistant's response assistant_response = response.split("Assistant:")[-1].strip() return {"response": assistant_response}