import base64
import io

import torch
from PIL import Image
from transformers import AutoProcessor, AutoTokenizer, LlamaForCausalLM

# Load the model, tokenizer, and processor once at module import time so they
# are reused across handler invocations.
model_id = "kiddobellamy/Llama_Vision"

# Load the model
model = LlamaForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # use torch.float16 if bfloat16 is not supported
    device_map="auto",
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the processor if needed (for image preprocessing)
processor = AutoProcessor.from_pretrained(model_id)


def handler(event, context):
    try:
        # Parse inputs
        inputs = event.get("inputs", {})
        image_base64 = inputs.get("image")
        prompt = inputs.get("prompt", "")

        if not image_base64 or not prompt:
            return {"error": 'Both "image" and "prompt" are required in inputs.'}

        # Decode the base64-encoded image
        image_bytes = base64.b64decode(image_base64)
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

        # Preprocess the image (how it is consumed depends on the model)
        image_inputs = processor(images=image, return_tensors="pt").to(model.device)

        # Tokenize the prompt
        text_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Combine image and text inputs as required by the model.
        # Note: LlamaForCausalLM is a text-only class, so the pixel values are
        # left commented out; a vision-language checkpoint would need its own
        # conditional-generation class and would consume the image tensors here.
        model_inputs = {
            "input_ids": text_inputs["input_ids"],
            "attention_mask": text_inputs["attention_mask"],
            # "pixel_values": image_inputs["pixel_values"],
        }

        # Generate output and decode it back to text
        output_ids = model.generate(**model_inputs, max_new_tokens=50)
        generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Return the result
        return {"generated_text": generated_text}

    except Exception as e:
        return {"error": str(e)}
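
# ---------------------------------------------------------------------------
# Local usage sketch (not part of the original handler; the file name
# "example.jpg" and the event shape below are illustrative assumptions).
# It base64-encodes an image and invokes the handler the way a serverless
# runtime would, passing an event dict and a dummy context.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    with open("example.jpg", "rb") as f:  # hypothetical local test image
        encoded_image = base64.b64encode(f.read()).decode("utf-8")

    event = {
        "inputs": {
            "image": encoded_image,
            "prompt": "Describe this image.",
        }
    }
    print(handler(event, context=None))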