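# Streamlit demo app for meta-llama/Llama-3.2-11B-Vision-Instruct.
# Assumed prerequisites (not pinned in this file; adjust as needed):
#   pip install streamlit torch "transformers>=4.45" accelerate pillow
# The meta-llama repo is gated, so your account needs approved access, and the
# access token must be available as HUGGINGFACE_TOKEN (e.g. in the Space's Secrets).
# Run locally with: streamlit run app.py  (assuming this file is saved as app.py).
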
import os
import streamlit as st
from huggingface_hub import login
from transformers import MllamaForConditionalGeneration, AutoProcessor
from PIL import Image
import torch

# Step 1: Log in to Hugging Face with the access token stored in secrets
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")  # Read the token from the environment
if huggingface_token:
    login(token=huggingface_token)  # Authenticate so the gated model can be downloaded
else:
    st.error("Hugging Face token not found. Please set HUGGINGFACE_TOKEN in the Secrets section.")
    st.stop()  # The gated model cannot be downloaded without a token

# Step 2: Load the model and processor
# Llama 3.2 Vision is a multimodal model, so it needs MllamaForConditionalGeneration
# and AutoProcessor rather than a text-only causal-LM class and tokenizer.
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"  # Adjust if needed
try:
    processor = AutoProcessor.from_pretrained(model_name)
    model = MllamaForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,  # keep the 11B weights at a manageable size
        device_map="auto",           # requires accelerate
    )
    st.success("Model loaded successfully!")
except Exception as e:
    st.error(f"Error loading model: {str(e)}")
    st.stop()  # The rest of the app cannot run without the model

# Step 3: Create a simple Streamlit app
def main():
    st.title("Llama 3.2 11B Vision Model")
    st.write("Upload an image and enter a prompt to generate output.")
    
    # Upload image
    image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
    prompt = st.text_area("Enter your prompt here:")
    
    if st.button("Generate Output"):
        if image_file and prompt:
            # Load image
            image = Image.open(image_file).convert("RGB")  # ensure a 3-channel image for the processor
            st.image(image, caption="Uploaded Image", use_column_width=True)
            
            try:
                # Build a chat-style message that pairs the uploaded image with the text prompt
                messages = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "image"},
                            {"type": "text", "text": prompt},
                        ],
                    }
                ]
                input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

                # The processor handles image preprocessing (resizing, normalization)
                # and tokenization of the templated prompt in one call
                inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to(model.device)

                # Perform inference
                with torch.no_grad():
                    model_output = model.generate(**inputs, max_new_tokens=256)

                # Decode only the newly generated tokens (generate() returns prompt + completion)
                generated_tokens = model_output[0][inputs["input_ids"].shape[-1]:]
                output_text = processor.decode(generated_tokens, skip_special_tokens=True)
                st.write("Generated Output:", output_text)
            except Exception as e:
                st.error(f"Error during prediction: {str(e)}")
        else:
            st.warning("Please upload an image and enter a prompt.")

if __name__ == "__main__":
    main()