# # from fastapi import FastAPI, HTTPException
# # from pydantic import BaseModel
# # from transformers import AutoModelForCausalLM, AutoTokenizer
# # from typing import List
# # import torch
# #
# # app = FastAPI(title="Language Model API")
# #
# # # Model configuration
# # CHECKPOINT = "HuggingFaceTB/SmolLM2-135M-Instruct"
# # DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# #
# # # Initialize model and tokenizer
# # try:
# #     tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
# #     model = AutoModelForCausalLM.from_pretrained(CHECKPOINT).to(DEVICE)
# # except Exception as e:
# #     raise RuntimeError(f"Failed to load model: {str(e)}")
# #
# # class ChatMessage(BaseModel):
# #     role: str
# #     content: str
# #
# # class ChatRequest(BaseModel):
# #     messages: List[ChatMessage]
# #     max_new_tokens: int = 50
# #     temperature: float = 0.2
# #     top_p: float = 0.9
# #
# # @app.post("/generate")
# # async def generate_response(request: ChatRequest):
# #     try:
# #         # Convert messages to the format expected by the model
# #         messages = [{"role": msg.role, "content": msg.content} for msg in request.messages]
# #
# #         # Prepare input
# #         input_text = tokenizer.apply_chat_template(messages, tokenize=False)
# #         inputs = tokenizer.encode(input_text, return_tensors="pt").to(DEVICE)
# #
# #         # Generate response
# #         outputs = model.generate(
# #             inputs,
# #             max_new_tokens=request.max_new_tokens,
# #             temperature=request.temperature,
# #             top_p=request.top_p,
# #             do_sample=True
# #         )
# #
# #         # Decode and return response
# #         response_text = tokenizer.decode(outputs[0])
# #         return {
# #             "generated_text": response_text
# #         }
# #     except Exception as e:
# #         raise HTTPException(status_code=500, detail=str(e))
# #
# # if __name__ == "__main__":
# #     import uvicorn
# #     uvicorn.run(app, host="0.0.0.0", port=7860)
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List
import transformers
import torch

app = FastAPI(title="LLaMA API")

# Initialize the model and pipeline at startup
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

class Message(BaseModel):
    role: str
    content: str

class ChatRequest(BaseModel):
    messages: List[Message]
    max_new_tokens: int = 256

class ChatResponse(BaseModel):
    generated_text: str

@app.post("/generate", response_model=ChatResponse)
async def chat(request: ChatRequest):
    try:
        outputs = pipeline(
            [{"role": msg.role, "content": msg.content} for msg in request.messages],
            max_new_tokens=request.max_new_tokens,
        )
        # The chat-style pipeline returns the full conversation; the last
        # message is the newly generated assistant turn, so return its
        # content as the response string.
        generated_text = outputs[0]["generated_text"][-1]["content"]
        return ChatResponse(generated_text=generated_text)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Health check endpoint
@app.get("/")
async def health_check():
    return {"status": "healthy"}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
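# Example client call (illustrative sketch, not part of the deployed app):
# assumes the server above is running locally on port 8000 and that the
# `requests` package is installed; it exercises only the /generate endpoint
# and the ChatRequest/ChatResponse schema defined above.
#
# import requests
#
# response = requests.post(
#     "http://localhost:8000/generate",
#     json={
#         "messages": [{"role": "user", "content": "Who are you?"}],
#         "max_new_tokens": 64,
#     },
# )
# response.raise_for_status()
# print(response.json()["generated_text"])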