import cv2
import gradio as gr
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

# Load Models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model 1: ViT-GPT2
model1 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)
feature_extractor1 = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer1 = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Model 2: FuseCap
processor2 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device)

# Model 3: BLIP Large
processor3 = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model3 = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)


# Frame Extraction and Captioning Logic
def process_video(video_path):
    """Sample every 20th frame of the video and caption it with all three models."""
    vidObj = cv2.VideoCapture(video_path)
    count = 0
    success = True
    frame_captions = {"Model 1": [], "Model 2": [], "Model 3": []}

    while success:
        success, frame = vidObj.read()
        if not success:
            break

        # Process every 20th frame
        if count % 20 == 0:
            # OpenCV returns BGR arrays; convert to RGB before building a PIL image
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Model 1: ViT-GPT2
            pixel_values = feature_extractor1(images=[image], return_tensors="pt").pixel_values.to(device)
            output_ids = model1.generate(pixel_values, max_length=16, num_beams=4)
            caption1 = tokenizer1.decode(output_ids[0], skip_special_tokens=True)
            frame_captions["Model 1"].append(caption1)

            # Model 2: FuseCap (conditional captioning with a text prompt)
            inputs = processor2(image, "a picture of ", return_tensors="pt").to(device)
            out2 = model2.generate(**inputs, num_beams=3)
            caption2 = processor2.decode(out2[0], skip_special_tokens=True)
            frame_captions["Model 2"].append(caption2)

            # Model 3: BLIP Large (unconditional captioning)
            inputs3 = processor3(image, return_tensors="pt").to(device)
            out3 = model3.generate(**inputs3)
            caption3 = processor3.decode(out3[0], skip_special_tokens=True)
            frame_captions["Model 3"].append(caption3)

        count += 1

    vidObj.release()
    return frame_captions


# Gradio Interface
def generate_captions(video):
    """Format the per-model captions as a markdown-style bullet list per model."""
    captions = process_video(video)
    result = ""
    for model_name, model_captions in captions.items():
        result += f"### {model_name}\n"
        result += "\n".join(f"- {caption}" for caption in model_captions)
        result += "\n\n"
    return result


with gr.Blocks() as demo:
    gr.Markdown("# Video Captioning with Multiple Models 🎥")
    gr.Markdown("Upload a video to generate captions for its frames using three different models.")
    video_input = gr.Video(label="Upload Video")
    output = gr.Textbox(label="Generated Captions", lines=20)
    submit_button = gr.Button("Generate Captions")
    submit_button.click(
        fn=generate_captions,
        inputs=video_input,
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()