from transformers import pipeline
import torch
from PIL import Image
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
from utils import prompt_frame_summarization, assistant_role, prompt_audio_summarization
import streamlit as st
from utils import timer
import numpy as np
import whisper
from utils import batch_generator, cosine_sim
from streamlit import session_state as sst
import onnxruntime


class SiameseNetwork(nn.Module):
    def __init__(self, model_name="vit_b_16"):
        super(SiameseNetwork, self).__init__()
        self.encoder = models.vit_b_16(weights="IMAGENET1K_V1")  # Pretrained ViT backbone
        self.encoder.heads = nn.Identity()  # Remove classification head
        self.fc = nn.Linear(768, 128)  # Reduce to a 128-d embedding

    def forward(self, video_frames1, video_frames2):
        """
        video_frames1: (B, num_frames, H, W, C)  # batch of videos (e.g. 50 frames each)
        video_frames2: (B, num_frames, H, W, C)
        """
        B, num_frames, H, W, C = video_frames1.shape

        # Flatten frames into the batch dimension and move channels first for the ViT
        video_frames1 = video_frames1.permute(0, 1, 4, 2, 3).reshape(B * num_frames, C, H, W)
        video_frames2 = video_frames2.permute(0, 1, 4, 2, 3).reshape(B * num_frames, C, H, W)

        # Extract frame-level embeddings
        emb1 = self.encoder(video_frames1)  # (B * num_frames, 768)
        emb2 = self.encoder(video_frames2)

        # Reshape back to (B, num_frames, 768) and average over frames
        # TODO: Change this to use an LSTM instead of averaging
        emb1 = emb1.reshape(B, num_frames, -1).mean(dim=1)  # (B, 768)
        emb2 = emb2.reshape(B, num_frames, -1).mean(dim=1)

        # Pass through the fully connected projection layer
        emb1 = self.fc(emb1)  # (B, 128)
        emb2 = self.fc(emb2)

        return emb1, emb2

    def inference(self, video_frames):
        """
        video_frames: (B, num_frames, H, W, C)
        """
        B, num_frames, H, W, C = video_frames.shape
        video_frames = video_frames.permute(0, 1, 4, 2, 3).reshape(B * num_frames, C, H, W)
        emb = self.encoder(video_frames)
        emb = emb.reshape(B, num_frames, -1).mean(dim=1)
        emb = self.fc(emb)
        return emb


@timer
def get_text_from_audio(audio_tensors):
    """Transcribe in-memory audio tensors with Whisper."""
    audio_tensors = audio_tensors.to(sst['device'])
    result = audio_transcriber_model.transcribe(audio_tensors)
    all_transcription_segments = result["text"]
    return all_transcription_segments


@timer
def summarize_from_text(raw_transcription):
    """Summarize a raw transcript with the BART summarization pipeline."""
    summary = text_summarizer(
        prompt_audio_summarization + raw_transcription,
        max_length=108,
        min_length=36,
        do_sample=False,
    )[0]['summary_text']
    return summary


def get_important_frames_ML(frame):
    """
    Classifies frames using a second ML model (not yet implemented).
    """
    # Implement your model's logic here
    # ...
    return None


def Vit_Summarize_Video(video_frames):
    """
    Summarizes video frames into a text sentence.
    """
    # Placeholders: a video-language model, its processor, and tokenizer must be
    # loaded here before this function can produce a summary.
    processor = None
    messages = None
    model = None
    tokenizer = None

    if video_frames is None or len(video_frames) == 0:
        return "Error: No video frames available."
    if processor is None or model is None or tokenizer is None:
        return "Error: Video summarization model is not loaded."

    # Ensure frames are properly formatted for the processor
    video_frames = [Image.fromarray(frame.astype("uint8")) for frame in video_frames]

    inputs = processor(messages, images=None, videos=[video_frames])
    inputs.update({
        "tokenizer": tokenizer,
        "max_new_tokens": 54,
        "decode_text": True,
    })

    summary_text = model.generate(**inputs)
    return summary_text


@timer
def rate_video_frames(video_frames):
    """
    Rates video frames by comparing their Siamese-network embeddings against a
    reference frame embedding, to judge whether the content is family-friendly.
""" inp_frames = np.array(video_frames, dtype = np.float32).reshape(len(video_frames)//5, 5, 224,224,3)# 20,5,224,224,3 inputs_dict = {"frames": inp_frames} video_frame_emb = video_rating_model.run(['emb'], inputs_dict)[0] overall_sim, count_upg = cosine_sim(emb1 = base_frame_emb, emb2 = torch.tensor(video_frame_emb), threshold=0.4 ) perc_of_upg = count_upg / (len(video_frames)//5) if perc_of_upg > 0.4: return f"Out of {len(video_frames)} important moments of this video, {count_upg*5} moments contain under or at least PG content. Hence this video is suitable for kids & family." else: return f"Out of {len(video_frames)} important moments of this video, {(len(video_frames)//5 - count_upg)*5} moments contain at least PG-13 content.Hence parental guidance is strongly suggested for this video." @st.cache_resource def load_models(): sst['device'] = 'cuda' if torch.cuda.is_available() else 'cpu' transcriber = whisper.load_model("base", device = sst['device']) summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device = sst['device']) base_frame_emb = torch.tensor( np.load('base_frame_medoid.npz')['arr'], dtype = torch.float32, device = sst['device'] ) session = onnxruntime.InferenceSession("video_rating_siamesev2.onnx", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'] ) return ( transcriber, summarizer, session, base_frame_emb ) audio_transcriber_model, text_summarizer, video_rating_model,base_frame_emb = load_models()