import gradio as gr
import requests
import torch
from PIL import Image
import spaces
from transformers import MllamaForConditionalGeneration, AutoProcessor
import os
from huggingface_hub import login
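# Authenticate with the Hugging Face Hub: the Llama 3.2 checkpoints are gated, so a
# valid access token (read here from a Space secret / environment variable) is needed
# to download the weights.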
huggingface_token = os.getenv("SECRET_ENV_VARIABLE")
login(huggingface_token)
# Load the Llama 3.2 Vision Model
def load_llama_model():
    model_id = "meta-llama/Llama-3.2-11B-Vision"

    # Load model and processor
    model = MllamaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        offload_folder="offload",
    )
    model.tie_weights()

    processor = AutoProcessor.from_pretrained(model_id)
    return model, processor
# Function to generate predictions for text and image
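# @spaces.GPU runs this function on a ZeroGPU worker, so a GPU is only held for the
# duration of each call. Loading the model inside the handler keeps this sketch simple,
# but it rebuilds the model on every request; caching it in a module-level variable
# would be a reasonable optimization.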
@spaces.GPU
def process_input(text, image=None):
    model, processor = load_llama_model()

    if image is not None:
        # If an image is uploaded, process it as a PIL Image object
        vision_input = image.convert("RGB").resize((224, 224))
        prompt = f"<|image|><|begin_of_text|>{text}"
        # Process image and text together (keyword arguments make the processor's
        # images/text parameters explicit)
        inputs = processor(images=vision_input, text=prompt, return_tensors="pt").to(model.device)
    else:
        # If no image is uploaded, just process the text
        prompt = f"<|begin_of_text|>{text}"
        inputs = processor(text=prompt, return_tensors="pt").to(model.device)

    # Generate output from the model
    outputs = model.generate(**inputs, max_new_tokens=50)

    # Decode the output to return readable text (the decoded string includes the prompt
    # as well as the newly generated tokens)
    decoded_output = processor.decode(outputs[0], skip_special_tokens=True)
    return decoded_output
def demo():
    # Define Gradio input and output components
    text_input = gr.Textbox(label="Text Input", placeholder="Enter text here", lines=5)
    image_input = gr.Image(label="Upload an Image", type="pil")
    output = gr.Textbox(label="Model Output", lines=3)

    # Add two examples for multimodal analysis
    examples = [
        ["The llama is ", "./examples/llama.png"],
        ["The cute hamster is wearing ", "./examples/hampster.png"]
    ]
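    # Note: the example rows above assume the image files exist at those paths in the
    # Space repository (an ./examples folder); adjust the paths if your layout differs.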
    # Define the interface layout
    interface = gr.Interface(
        fn=process_input,
        inputs=[text_input, image_input],
        outputs=output,
        examples=examples,
        title="Llama 3.2 Multimodal Text-Image Analyzer",
        description="Upload an image and/or provide text for analysis using the Llama 3.2 Vision Model. You can also try out the provided examples.",
    )

    # Launch the demo
    interface.launch()
# Run the demo
if __name__ == "__main__":
    demo()
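# This Space assumes a requirements.txt providing gradio, spaces, torch, accelerate,
# and a transformers version with Mllama support (4.45 or newer).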