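# Video-captioning Gradio app: extracts frames from an uploaded video and
# captions them with three models (ViT-GPT2, FuseCap, BLIP-large) for comparison.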
import cv2
import gradio as gr
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

# Load Models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model 1: ViT-GPT2
model1 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)
feature_extractor1 = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer1 = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Model 2: FuseCap
processor2 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device)

# Model 3: BLIP Large
processor3 = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model3 = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
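
# Note: loading all three models eagerly takes several GB of memory. A possible
# optimization (an untested assumption, not part of the original app) is half
# precision when running on CUDA:
#   if device.type == "cuda":
#       model1, model2, model3 = model1.half(), model2.half(), model3.half()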


# Frame Extraction and Captioning Logic
def process_video(video_path):
    vidObj = cv2.VideoCapture(video_path)
    count = 0
    frame_captions = {"Model 1": [], "Model 2": [], "Model 3": []}

    while True:
        success, frame = vidObj.read()
        if not success:
            break

        # Caption every 20th frame to keep runtime manageable
        if count % 20 == 0:
            # OpenCV decodes frames as BGR; convert to RGB before wrapping in PIL
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Model 1: ViT-GPT2
            pixel_values = feature_extractor1(images=[image], return_tensors="pt").pixel_values.to(device)
            output_ids = model1.generate(pixel_values, max_length=16, num_beams=4)
            caption1 = tokenizer1.decode(output_ids[0], skip_special_tokens=True)
            frame_captions["Model 1"].append(caption1)

            # Model 2: FuseCap
            inputs = processor2(image, "a picture of ", return_tensors="pt").to(device)
            out2 = model2.generate(**inputs, num_beams=3)
            caption2 = processor2.decode(out2[0], skip_special_tokens=True)
            frame_captions["Model 2"].append(caption2)

            # Model 3: BLIP Large
            inputs3 = processor3(image, return_tensors="pt").to(device)
            out3 = model3.generate(**inputs3)
            caption3 = processor3.decode(out3[0], skip_special_tokens=True)
            frame_captions["Model 3"].append(caption3)

        count += 1

    vidObj.release()
    return frame_captions
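
# Quick sanity check outside Gradio (assumes a local file "sample.mp4"; the
# filename is illustrative, not something the app ships with):
#   captions = process_video("sample.mp4")
#   for name, caps in captions.items():
#       print(name, caps[:2])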


# Gradio Interface
def generate_captions(video):
    captions = process_video(video)
    result = ""
    for model_name, model_captions in captions.items():
        result += f"### {model_name}\n"
        result += "\n".join(f"- {caption}" for caption in model_captions)
        result += "\n\n"
    return result
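
# The result string uses Markdown headings/bullets, but the Textbox output below
# shows it as plain text; swapping the output component for gr.Markdown would
# render it instead (a design alternative, not the original behavior).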



with gr.Blocks() as demo:
    gr.Markdown("# Video Captioning with Multiple Models 🎥")
    gr.Markdown("Upload a video to generate captions for its frames using three different models.")
    video_input = gr.Video(label="Upload Video")
    output = gr.Textbox(label="Generated Captions", lines=20)
    submit_button = gr.Button("Generate Captions")

    submit_button.click(
        fn=generate_captions,
        inputs=video_input,
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()
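
# Note: demo.launch(share=True) would also expose a temporary public URL
# (a standard Gradio option; whether to enable it is a deployment choice).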