from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import List
import torch

app = FastAPI(title="Language Model API")

# Model configuration
CHECKPOINT = "HuggingFaceTB/SmolLM2-135M-Instruct"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize model and tokenizer once at startup
try:
    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
    model = AutoModelForCausalLM.from_pretrained(CHECKPOINT).to(DEVICE)
except Exception as e:
    raise RuntimeError(f"Failed to load model: {str(e)}")
class ChatMessage(BaseModel):
    role: str
    content: str


class ChatRequest(BaseModel):
    messages: List[ChatMessage]
    max_new_tokens: int = 50
    temperature: float = 0.2
    top_p: float = 0.9
# Register the chat endpoint (the route path here is illustrative)
@app.post("/chat")
async def generate_response(request: ChatRequest):
    try:
        # Convert messages to the format expected by the chat template
        messages = [{"role": msg.role, "content": msg.content} for msg in request.messages]

        # Apply the model's chat template and append the assistant generation prompt
        input_text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer.encode(input_text, return_tensors="pt").to(DEVICE)

        # Generate response
        outputs = model.generate(
            inputs,
            max_new_tokens=request.max_new_tokens,
            temperature=request.temperature,
            top_p=request.top_p,
            do_sample=True,
        )

        # Decode only the newly generated tokens, not the echoed prompt
        response_text = tokenizer.decode(
            outputs[0][inputs.shape[-1]:], skip_special_tokens=True
        )
        return {"generated_text": response_text}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
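
Once the server is running (for example with python app.py), the endpoint can be exercised with a short client. The sketch below is a minimal example, not part of the service itself: it assumes the /chat route registered above and the default port 7860, and its request fields mirror the ChatRequest model.

# Minimal client sketch (assumes the /chat route and port 7860 from above)
import requests

payload = {
    "messages": [
        {"role": "user", "content": "What is the capital of France?"}
    ],
    "max_new_tokens": 50,
    "temperature": 0.2,
    "top_p": 0.9,
}

response = requests.post("http://localhost:7860/chat", json=payload)
response.raise_for_status()
print(response.json()["generated_text"])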