import os
import torch
import multiprocessing
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from dotenv import load_dotenv
from accelerate import Accelerator
from typing import List, Tuple
# Load environment variables from a .env file (useful for local development)
load_dotenv()
# Markdown description rendered on the Swagger docs landing page (support pitch
# plus instructions for running the API locally)
app_description = """
# Llama-3.2-1B-Instruct-API

## Please Chill Out! 😎
This API takes around 5.62 minutes to process a single request due to current hardware limitations.

## Want Faster Responses? Help Me Out! 🚀
If you'd like to see this API running faster on high-performance A100 hardware, please consider buying me a coffee. ☕ Your support will go towards upgrading to Hugging Face Pro, which will allow me to run A100-powered Spaces for everyone! 🙌

## Instructions to Clone and Run Locally

1. Clone the repository:

        git clone https://huggingface.co/spaces/xxparthparekhxx/llama-3.2-1B-FastApi
        cd llama-3.2-1B-FastApi

2. Build and run the Docker container:

        docker build -t llama-api .
        docker run -p 7860:7860 llama-api

3. Access the API locally: open http://localhost:7860 in your browser to view the Swagger docs.
"""
# FastAPI app with the support pitch and instructions embedded in the docs page
app = FastAPI(
    title="Llama-3.2-1B-Instruct-API",
    description=app_description,
    docs_url="/",     # URL for Swagger docs
    redoc_url="/doc"  # URL for ReDoc docs
)
HF_TOKEN = os.getenv("HF_TOKEN")
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
torch.set_num_threads(multiprocessing.cpu_count())
accelerator = Accelerator()
tokenizer = AutoTokenizer.from_pretrained(MODEL, token=HF_TOKEN, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
MODEL,
token=HF_TOKEN,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,  # fp16 is slow and overflow-prone on CPU
device_map=device
)
# Only the model needs Accelerator preparation; the tokenizer is a plain Python object
model = accelerator.prepare(model)
# Pydantic models for request validation
class PromptRequest(BaseModel):
prompt: str
max_new_tokens: int = 100
temperature: float = 0.7
class ChatRequest(BaseModel):
message: str
history: List[Tuple[str, str]] = []
max_new_tokens: int = 100
temperature: float = 0.7
system_prompt: str = "You are a helpful assistant."
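# Note: `history` is a list of (user, assistant) turn pairs; in JSON it looks like
# [["Hi!", "Hello! How can I help?"]] (illustrative values, not part of the API).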
# Endpoints
@app.post("/generate/")
async def generate_text(request: PromptRequest):
inputs = tokenizer(request.prompt, return_tensors="pt").to(device)
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=request.max_new_tokens,
temperature=request.temperature,
            do_sample=True,  # sampling must be enabled for temperature to take effect
pad_token_id=tokenizer.eos_token_id
)
    # Decode only the newly generated tokens, skipping the echoed prompt
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
return {"response": response}
@app.post("/chat/")
async def chat(request: ChatRequest):
conversation = [
{"role": "system", "content": request.system_prompt}
]
for human, assistant in request.history:
conversation.extend([
{"role": "user", "content": human},
{"role": "assistant", "content": assistant}
])
conversation.append({"role": "user", "content": request.message})
    # add_generation_prompt appends the assistant header so the model answers
    # rather than continuing the user's turn
    input_ids = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, return_tensors="pt"
    ).to(device)
with torch.no_grad():
outputs = model.generate(
input_ids,
max_new_tokens=request.max_new_tokens,
temperature=request.temperature,
            do_sample=True,  # sampling must be enabled for temperature to take effect
pad_token_id=tokenizer.eos_token_id
)
    # Decode only the tokens generated after the prompt; splitting on "Assistant:"
    # is unreliable because the Llama 3 chat template emits special header tokens,
    # not a literal "Assistant:" prefix
    new_tokens = outputs[0][input_ids.shape[-1]:]
    assistant_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
return {"response": assistant_response}