import torch
from transformers import MllamaForConditionalGeneration, AutoProcessor
from PIL import Image
import base64
import io

# Load model and processor globally
model_id = "kiddobellamy/Llama_Vision"

# Load the model. Assumes this checkpoint is a Llama 3.2 Vision (Mllama)
# model (requires transformers >= 4.45); a plain LlamaForCausalLM cannot
# consume image inputs.
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # or torch.bfloat16 where supported
    device_map="auto",
)

# Load the processor; it bundles the tokenizer and the image preprocessor,
# so no separate AutoTokenizer is needed.
processor = AutoProcessor.from_pretrained(model_id)

def handler(event, context):
    try:
        # Parse inputs
        inputs = event.get('inputs', {})
        image_base64 = inputs.get('image')
        prompt = inputs.get('prompt', '')

        if not image_base64 or not prompt:
            return {'error': 'Both "image" and "prompt" are required in inputs.'}

        # Decode the base64 image
        image_bytes = base64.b64decode(image_base64)
        image = Image.open(io.BytesIO(image_bytes)).convert('RGB')

        # Build a chat-style prompt that places the image before the text,
        # then let the processor tokenize the text and preprocess the pixels
        # together so the model receives matching input_ids and pixel_values.
        messages = [
            {"role": "user", "content": [
                {"type": "image"},
                {"type": "text", "text": prompt},
            ]}
        ]
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
        model_inputs = processor(image, input_text, add_special_tokens=False,
                                 return_tensors="pt").to(model.device)

        # Generate, then decode only the newly generated tokens
        # (generate() returns the prompt tokens followed by the completion).
        output_ids = model.generate(**model_inputs, max_new_tokens=50)
        new_tokens = output_ids[0][model_inputs['input_ids'].shape[-1]:]
        generated_text = processor.decode(new_tokens, skip_special_tokens=True)

        # Return the result
        return {'generated_text': generated_text}

    except Exception as e:
        return {'error': str(e)}
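
# Minimal local smoke test: a sketch assuming the script is run outside the
# serverless runtime, so the event/context are faked. The image path
# "example.jpg" is hypothetical; substitute any local image.
if __name__ == "__main__":
    with open("example.jpg", "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    event = {"inputs": {"image": encoded, "prompt": "Describe this image."}}
    print(handler(event, context=None))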