import gradio as gr from transformers import LlavaProcessor, LlavaForConditionalGeneration, TextIteratorStreamer from threading import Thread import re import time from PIL import Image import torch import cv2 import spaces model_id = "llava-hf/llava-interleave-qwen-0.5b-hf" processor = LlavaProcessor.from_pretrained(model_id) model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16) model.to("cuda") def replace_video_with_images(text, frames): return text.replace("