# # from fastapi import FastAPI, HTTPException
# # from pydantic import BaseModel
# # from transformers import AutoModelForCausalLM, AutoTokenizer
# # from typing import List
# # import torch

# # app = FastAPI(title="Language Model API")

# # # Model configuration
# # CHECKPOINT = "HuggingFaceTB/SmolLM2-135M-Instruct"
# # DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# # # Initialize model and tokenizer
# # try:
# #     tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
# #     model = AutoModelForCausalLM.from_pretrained(CHECKPOINT).to(DEVICE)
# # except Exception as e:
# #     raise RuntimeError(f"Failed to load model: {str(e)}")

# # class ChatMessage(BaseModel):
# #     role: str
# #     content: str

# # class ChatRequest(BaseModel):
# #     messages: List[ChatMessage]
# #     max_new_tokens: int = 50
# #     temperature: float = 0.2
# #     top_p: float = 0.9

# # @app.post("/generate")
# # async def generate_response(request: ChatRequest):
# #     try:
# #         # Convert messages to the format expected by the model
# #         messages = [{"role": msg.role, "content": msg.content} for msg in request.messages]

# #         # Prepare input
# #         input_text = tokenizer.apply_chat_template(messages, tokenize=False)
# #         inputs = tokenizer.encode(input_text, return_tensors="pt").to(DEVICE)

# #         # Generate response
# #         outputs = model.generate(
# #             inputs,
# #             max_new_tokens=request.max_new_tokens,
# #             temperature=request.temperature,
# #             top_p=request.top_p,
# #             do_sample=True
# #         )

# #         # Decode and return response
# #         response_text = tokenizer.decode(outputs[0])
# #         return {
# #             "generated_text": response_text
# #         }
# #     except Exception as e:
# #         raise HTTPException(status_code=500, detail=str(e))

# # if __name__ == "__main__":
# #     import uvicorn
# #     uvicorn.run(app, host="0.0.0.0", port=7860)


from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List
import transformers
import torch

app = FastAPI(title="LLaMA API")

# Initialize the model and pipeline at startup
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)


class Message(BaseModel):
    role: str
    content: str


class ChatRequest(BaseModel):
    messages: List[Message]
    max_new_tokens: int = 256


class ChatResponse(BaseModel):
    generated_text: str


@app.post("/generate", response_model=ChatResponse)
async def chat(request: ChatRequest):
    try:
        outputs = pipeline(
            [{"role": msg.role, "content": msg.content} for msg in request.messages],
            max_new_tokens=request.max_new_tokens,
        )
        # The chat pipeline returns the whole conversation; the last entry is the
        # newly generated assistant message, whose text lives under "content".
        generated_text = outputs[0]["generated_text"][-1]["content"]
        return ChatResponse(generated_text=generated_text)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


# Health check endpoint
@app.get("/")
async def health_check():
    return {"status": "healthy"}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
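

# Example client call (sketch): one way to exercise the /generate endpoint once
# the server is running on port 8000 as configured above. The host, the prompt,
# and the use of the `requests` library are illustrative assumptions, not part
# of the original app.
#
# import requests
#
# payload = {
#     "messages": [
#         {"role": "system", "content": "You are a helpful assistant."},
#         {"role": "user", "content": "Hello, who are you?"},
#     ],
#     "max_new_tokens": 128,
# }
# resp = requests.post("http://localhost:8000/generate", json=payload)
# resp.raise_for_status()
# print(resp.json()["generated_text"])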