# # from fastapi import FastAPI, HTTPException
# # from pydantic import BaseModel
# # from transformers import AutoModelForCausalLM, AutoTokenizer
# # from typing import List
# # import torch

# # app = FastAPI(title="Language Model API")

# # # Model configuration
# # CHECKPOINT = "HuggingFaceTB/SmolLM2-135M-Instruct"
# # DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# # # Initialize model and tokenizer
# # try:
# #     tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
# #     model = AutoModelForCausalLM.from_pretrained(CHECKPOINT).to(DEVICE)
# # except Exception as e:
# #     raise RuntimeError(f"Failed to load model: {str(e)}")

# # class ChatMessage(BaseModel):
# #     role: str
# #     content: str

# # class ChatRequest(BaseModel):
# #     messages: List[ChatMessage]
# #     max_new_tokens: int = 50
# #     temperature: float = 0.2
# #     top_p: float = 0.9

# # @app.post("/generate")
# # async def generate_response(request: ChatRequest):
# #     try:
# #         # Convert messages to the format expected by the model
# #         messages = [{"role": msg.role, "content": msg.content} for msg in request.messages]

# #         # Prepare input
# #         input_text = tokenizer.apply_chat_template(messages, tokenize=False)
# #         inputs = tokenizer.encode(input_text, return_tensors="pt").to(DEVICE)

# #         # Generate response
# #         outputs = model.generate(
# #             inputs,
# #             max_new_tokens=request.max_new_tokens,
# #             temperature=request.temperature,
# #             top_p=request.top_p,
# #             do_sample=True
# #         )

# #         # Decode and return response
# #         response_text = tokenizer.decode(outputs[0])
# #         return {
# #             "generated_text": response_text
# #         }
# #     except Exception as e:
# #         raise HTTPException(status_code=500, detail=str(e))

# # if __name__ == "__main__":
# #     import uvicorn
# #     uvicorn.run(app, host="0.0.0.0", port=7860)


from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List
import transformers
import torch

app = FastAPI(title="LLaMA API")

# Initialize the model and pipeline at startup
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)


class Message(BaseModel):
    role: str
    content: str


class ChatRequest(BaseModel):
    messages: List[Message]
    max_new_tokens: int = 256


class ChatResponse(BaseModel):
    generated_text: str


@app.post("/generate", response_model=ChatResponse)
async def chat(request: ChatRequest):
    try:
        outputs = pipeline(
            [{"role": msg.role, "content": msg.content} for msg in request.messages],
            max_new_tokens=request.max_new_tokens,
        )
        # The chat pipeline returns the whole conversation; the last entry is the
        # newly generated assistant message, whose text lives under "content".
        generated_text = outputs[0]["generated_text"][-1]["content"]
        return ChatResponse(generated_text=generated_text)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


# Health check endpoint
@app.get("/")
async def health_check():
    return {"status": "healthy"}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
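

# Example client call (sketch): one way to exercise the /generate endpoint once
# the server is running on port 8000 as configured above. The host, the prompt,
# and the use of the `requests` library are illustrative assumptions, not part
# of the original app.
#
# import requests
#
# payload = {
#     "messages": [
#         {"role": "system", "content": "You are a helpful assistant."},
#         {"role": "user", "content": "Hello, who are you?"},
#     ],
#     "max_new_tokens": 128,
# }
# resp = requests.post("http://localhost:8000/generate", json=payload)
# resp.raise_for_status()
# print(resp.json()["generated_text"])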