import os

import streamlit as st
import torch
from huggingface_hub import login
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration
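# Authenticate with Hugging Face: the Llama 3.2 checkpoints are gated, so a
# valid access token is required to download the weights.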
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
if huggingface_token:
    login(token=huggingface_token)
else:
    st.error("Hugging Face token not found. Please set it in the Secrets section.")
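# Llama 3.2 Vision is a multimodal model: it pairs an AutoProcessor (joint
# image/text preprocessing) with MllamaForConditionalGeneration, rather than
# a plain tokenizer and a text-only causal LM.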
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"

@st.cache_resource
def load_model():
    # Cache the weights so Streamlit does not reload the 11B-parameter model
    # on every rerun of the script.
    processor = AutoProcessor.from_pretrained(model_name)
    model = MllamaForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,  # half precision; fp32 would need ~44 GB
        device_map="auto",  # place weights on GPU(s) when available
    )
    return processor, model

try:
    processor, model = load_model()
    st.success("Model loaded successfully!")
except Exception as e:
    st.error(f"Error loading model: {str(e)}")
def main():
    st.title("Llama 3.2 11B Vision Model")
    st.write("Upload an image and enter a prompt to generate output.")
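    # Collect the two inputs the model needs: an image and a text prompt.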
    image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
    prompt = st.text_area("Enter your prompt here:")
    if st.button("Generate Output"):
        if image_file and prompt:
            # The processor expects an RGB image, so normalize the upload.
            image = Image.open(image_file).convert("RGB")
            st.image(image, caption="Uploaded Image", use_container_width=True)
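            # Llama 3.2 Vision consumes a chat-formatted conversation in which
            # an image placeholder precedes the user's text; the processor then
            # pairs that placeholder with the actual PIL image when tensorizing.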
            try:
                messages = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "image"},
                            {"type": "text", "text": prompt},
                        ],
                    }
                ]
                input_text = processor.apply_chat_template(
                    messages, add_generation_prompt=True
                )
                inputs = processor(
                    image, input_text, add_special_tokens=False, return_tensors="pt"
                ).to(model.device)

                with torch.no_grad():
                    # max_new_tokens is a sensible default; raise it for longer answers.
                    model_output = model.generate(**inputs, max_new_tokens=256)

                output_text = processor.decode(model_output[0], skip_special_tokens=True)
                st.write("Generated Output:", output_text)
            except Exception as e:
                st.error(f"Error during prediction: {str(e)}")
        else:
            st.warning("Please upload an image and enter a prompt.")
if __name__ == "__main__":
    main()