kiddobellamy committed on
Commit
8e761cc
1 Parent(s): 42c2dda

Update handler.py

Files changed (1)
  1. handler.py +53 -39
handler.py CHANGED
@@ -1,39 +1,53 @@
- import torch
- from transformers import AutoModelForVision2Seq, AutoTokenizer
-
- class ModelHandler:
-     def __init__(self):
-         self.model = None
-         self.tokenizer = None
-
-     def initialize(self, context):
-         """ Load the model and tokenizer """
-         model_dir = context.system_properties.get("model_dir")
-         self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
-         self.model = AutoModelForVision2Seq.from_pretrained(model_dir)
-         self.model.eval()
-
-     def preprocess(self, data):
-         """ Preprocess the input data before passing it to the model """
-         inputs = self.tokenizer(data, return_tensors="pt")
-         return inputs
-
-     def inference(self, inputs):
-         """ Run the forward pass of the model """
-         with torch.no_grad():
-             outputs = self.model(**inputs)
-         return outputs
-
-     def postprocess(self, outputs):
-         """ Post-process the output data from the model """
-         return outputs
-
- # This is required for the Hugging Face inference endpoints
- _handler = ModelHandler()
-
- def handle(data, context):
-     if not _handler.model:
-         _handler.initialize(context)
-     inputs = _handler.preprocess(data)
-     outputs = _handler.inference(inputs)
-     return _handler.postprocess(outputs)
+ import requests
+ import torch
+ from PIL import Image
+ from transformers import MllamaForConditionalGeneration, AutoProcessor
+
+ # Define the model ID and load the model and processor
+ model_id = "meta-llama/Llama-3.2-90B-Vision-Instruct"
+
+ def load_model():
+     """Loads the Llama 3.2-90B Vision-Instruct model and processor."""
+     model = MllamaForConditionalGeneration.from_pretrained(
+         model_id,
+         torch_dtype=torch.bfloat16,
+         device_map="auto",
+     )
+     processor = AutoProcessor.from_pretrained(model_id)
+     return model, processor
+
+ def process_image(url):
+     """Processes the image from the given URL."""
+     image = Image.open(requests.get(url, stream=True).raw)
+     return image
+
+ def generate_response(model, processor, image, prompt):
+     """Generates a text response based on the image and the prompt."""
+     messages = [
+         {"role": "user", "content": [
+             {"type": "image"},
+             {"type": "text", "text": prompt}
+         ]}
+     ]
+     input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
+     inputs = processor(image, input_text, return_tensors="pt").to(model.device)
+     output = model.generate(**inputs, max_new_tokens=30)
+     return processor.decode(output[0])
+
+ def main():
+     # Load model and processor
+     model, processor = load_model()
+
+     # Sample image URL
+     url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
+     image = process_image(url)
+
+     # Define a sample prompt
+     prompt = "If I had to write a haiku for this one, it would be:"
+
+     # Generate response
+     response = generate_response(model, processor, image, prompt)
+     print(response)
+
+ if __name__ == "__main__":
+     main()
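
Note on serving: the removed version of handler.py exposed the handle(data, context) entry point that its own comment says is required for Hugging Face inference endpoints, while the updated file is a standalone script with no serving entry point. If the model still needs to be served through a custom handler, one possible way to reuse the new Mllama loading and generation code is sketched below; the EndpointHandler wrapper, the payload shape ({"inputs": {"image_url": ..., "prompt": ...}}) and the generated_text key are assumptions for illustration, not part of this commit.

# A minimal sketch, not part of this commit: wraps the new Mllama code
# in an EndpointHandler class as used by Inference Endpoints custom handlers.
from typing import Any, Dict

import requests
import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

MODEL_ID = "meta-llama/Llama-3.2-90B-Vision-Instruct"

class EndpointHandler:
    def __init__(self, path: str = ""):
        # Load the model and processor once at startup, mirroring load_model() above.
        self.model = MllamaForConditionalGeneration.from_pretrained(
            path or MODEL_ID,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
        self.processor = AutoProcessor.from_pretrained(path or MODEL_ID)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
        # Assumed payload shape: {"inputs": {"image_url": "...", "prompt": "..."}}.
        payload = data.get("inputs", {})
        image = Image.open(requests.get(payload["image_url"], stream=True).raw)
        messages = [
            {"role": "user", "content": [
                {"type": "image"},
                {"type": "text", "text": payload["prompt"]},
            ]}
        ]
        text = self.processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = self.processor(image, text, return_tensors="pt").to(self.model.device)
        output = self.model.generate(**inputs, max_new_tokens=30)
        return {"generated_text": self.processor.decode(output[0])}

As in the updated script, the expensive model load happens once (here in __init__), so each request only runs image fetching, preprocessing, and generation.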