kiddobellamy
/

Llama_Vision

Video-Text-to-Text

text-generation

Inference Endpoints

Model card Files Files and versions Community

Llama_Vision / handler.py

kiddobellamy's picture

Update handler.py

1900674 verified about 2 months ago

1.54 kB

	import base64
	import io
	import logging

	import torch
	from PIL import Image
	from transformers import AutoProcessor, LlamaForCausalLM, MllamaForConditionalGeneration

	# Load the model and processor once at import time so that every call to
	# the request handler reuses the same objects instead of reloading them.
	model_id = "kiddobellamy/Llama_Vision"

	# The request handler passes image tensors to ``model.generate``; the
	# text-only ``LlamaForCausalLM`` class has no vision inputs, so the
	# multimodal conditional-generation class is loaded instead.
	# NOTE(review): assumes the checkpoint uses the Llama 3.2 Vision (Mllama)
	# architecture -- confirm against the repository's config.json.
	model = MllamaForConditionalGeneration.from_pretrained(
	    model_id,
	    torch_dtype=torch.bfloat16,
	    device_map="auto",
	)
	processor = AutoProcessor.from_pretrained(model_id)

	_DEFAULT_MAX_NEW_TOKENS = 50
	_MISSING_FIELDS_ERROR = 'Both "image" and "prompt" are required in inputs.'

	_logger = logging.getLogger(__name__)


	def _decode_image(image_base64):
	    """Decode a base64 payload (bare or a ``data:`` URI) into an RGB image."""
	    if isinstance(image_base64, str) and image_base64.startswith("data:"):
	        # Keep only the payload of "data:image/png;base64,<payload>".
	        image_base64 = image_base64.partition(",")[2]
	    image_bytes = base64.b64decode(image_base64)
	    with Image.open(io.BytesIO(image_bytes)) as raw_image:
	        # ``convert`` returns a new image, so the source can be closed.
	        return raw_image.convert("RGB")


	def handler(event, context):
	    """Generate text for a single image + prompt request.

	    Args:
	        event: Request payload; a mapping whose ``"inputs"`` entry is a
	            mapping with ``"image"`` (base64-encoded image, optionally as
	            a ``data:`` URI), ``"prompt"`` (text) and, optionally,
	            ``"max_new_tokens"`` (positive int, defaults to 50).
	        context: Not used by this function.

	    Returns:
	        ``{'generated_text': ...}`` on success, otherwise
	        ``{'error': ...}``. Exceptions are logged and reported in the
	        returned dict rather than raised.
	    """
	    try:
	        # A payload without mapping-style access cannot carry the required
	        # fields, so report it the same way as missing fields instead of
	        # surfacing an AttributeError message.
	        request_inputs = event.get("inputs") if hasattr(event, "get") else None
	        if not hasattr(request_inputs, "get"):
	            return {'error': _MISSING_FIELDS_ERROR}

	        image_base64 = request_inputs.get('image')
	        prompt = request_inputs.get('prompt', '')
	        if not image_base64 or not prompt:
	            return {'error': _MISSING_FIELDS_ERROR}

	        max_new_tokens = request_inputs.get(
	            'max_new_tokens', _DEFAULT_MAX_NEW_TOKENS
	        )
	        # ``bool`` is a subclass of ``int``; reject it explicitly.
	        if (
	            isinstance(max_new_tokens, bool)
	            or not isinstance(max_new_tokens, int)
	            or max_new_tokens < 1
	        ):
	            return {'error': '"max_new_tokens" must be a positive integer.'}

	        image = _decode_image(image_base64)

	        # Build a single-turn chat message with one image slot and the prompt.
	        messages = [
	            {
	                "role": "user",
	                "content": [
	                    {"type": "image"},
	                    {"type": "text", "text": prompt},
	                ],
	            },
	        ]
	        input_text = processor.apply_chat_template(
	            messages, add_generation_prompt=True
	        )

	        # The chat template already emits the special tokens, so do not let
	        # the tokenizer add them a second time.
	        model_inputs = processor(
	            image,
	            input_text,
	            add_special_tokens=False,
	            return_tensors="pt",
	        ).to(model.device)

	        output_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)
	        # NOTE(review): the whole output sequence is decoded, so the text
	        # presumably echoes the prompt -- confirm whether callers rely on
	        # that before trimming it.
	        generated_text = processor.decode(output_ids[0], skip_special_tokens=True)

	        return {'generated_text': generated_text}

	    except Exception as e:
	        # Top-level boundary: keep the traceback in the logs and make sure
	        # the reported error is never an empty (falsy) string.
	        _logger.exception("handler failed")
	        return {'error': str(e) or type(e).__name__}