from transformers import pipeline
import torch
from PIL import Image
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
from utils import prompt_frame_summarization, assistant_role, prompt_audio_summarization
import streamlit as st
from utils import timer
import numpy as np
import whisper
from utils import batch_generator, cosine_sim
from streamlit import session_state as sst
import onnxruntime


class SiameseNetwork(nn.Module):
    def __init__(self, model_name="vit_b_16"):
        super(SiameseNetwork, self).__init__()
        self.encoder = models.vit_b_16(weights="IMAGENET1K_V1")  # Pretrained ViT backbone
        self.encoder.heads = nn.Identity()  # Remove classification head
        self.fc = nn.Linear(768, 128)  # Reduce to a 128-d embedding

    def forward(self, video_frames1, video_frames2):
        """
        video_frames1: (B, num_frames, H, W, C)  # batch of videos (e.g. 50 frames each)
        video_frames2: (B, num_frames, H, W, C)
        """
        B, num_frames, H, W, C = video_frames1.shape

        # Flatten frames into the batch dimension and move channels first for the ViT
        video_frames1 = video_frames1.permute(0, 1, 4, 2, 3).reshape(B * num_frames, C, H, W)
        video_frames2 = video_frames2.permute(0, 1, 4, 2, 3).reshape(B * num_frames, C, H, W)

        # Extract frame-level embeddings
        emb1 = self.encoder(video_frames1)  # (B * num_frames, 768)
        emb2 = self.encoder(video_frames2)

        # Reshape back to (B, num_frames, 768) and average over frames
        # TODO: Change this to use an LSTM instead of averaging
        emb1 = emb1.reshape(B, num_frames, -1).mean(dim=1)  # (B, 768)
        emb2 = emb2.reshape(B, num_frames, -1).mean(dim=1)

        # Pass through the fully connected projection layer
        emb1 = self.fc(emb1)  # (B, 128)
        emb2 = self.fc(emb2)

        return emb1, emb2

    def inference(self, video_frames):
        """
        video_frames: (B, num_frames, H, W, C)
        """
        B, num_frames, H, W, C = video_frames.shape
        video_frames = video_frames.permute(0, 1, 4, 2, 3).reshape(B * num_frames, C, H, W)
        emb = self.encoder(video_frames)
        emb = emb.reshape(B, num_frames, -1).mean(dim=1)
        emb = self.fc(emb)
        return emb


@timer
def get_text_from_audio(audio_tensors):
    """Transcribe in-memory audio tensors with Whisper."""
    audio_tensors = audio_tensors.to(sst['device'])
    result = audio_transcriber_model.transcribe(audio_tensors)
    all_transcription_segments = result["text"]
    return all_transcription_segments


@timer
def summarize_from_text(raw_transcription):
    """Summarize a raw transcript with the BART summarization pipeline."""
    summary = text_summarizer(
        prompt_audio_summarization + raw_transcription,
        max_length=108,
        min_length=36,
        do_sample=False,
    )[0]['summary_text']
    return summary


def get_important_frames_ML(frame):
    """
    Classifies frames using a second ML model (not yet implemented).
    """
    # Implement your model's logic here
    # ...
    return None


def Vit_Summarize_Video(video_frames):
    """
    Summarizes video frames into a text sentence.
    """
    # Placeholders: a video-language model, its processor, and tokenizer must be
    # loaded here before this function can produce a summary.
    processor = None
    messages = None
    model = None
    tokenizer = None

    if video_frames is None or len(video_frames) == 0:
        return "Error: No video frames available."
    if processor is None or model is None or tokenizer is None:
        return "Error: Video summarization model is not loaded."

    # Ensure frames are properly formatted for the processor
    video_frames = [Image.fromarray(frame.astype("uint8")) for frame in video_frames]

    inputs = processor(messages, images=None, videos=[video_frames])
    inputs.update({
        "tokenizer": tokenizer,
        "max_new_tokens": 54,
        "decode_text": True,
    })

    summary_text = model.generate(**inputs)
    return summary_text


@timer
def rate_video_frames(video_frames):
    """
    Rates video frames by comparing their Siamese-network embeddings against a
    reference frame embedding, to judge whether the content is family-friendly.
""" inp_frames = np.array(video_frames, dtype = np.float32).reshape(len(video_frames)//5, 5, 224,224,3)# 20,5,224,224,3 inputs_dict = {"frames": inp_frames} video_frame_emb = video_rating_model.run(['emb'], inputs_dict)[0] overall_sim, count_upg = cosine_sim(emb1 = base_frame_emb, emb2 = torch.tensor(video_frame_emb), threshold=0.4 ) perc_of_upg = count_upg / (len(video_frames)//5) if perc_of_upg > 0.4: return f"Out of {len(video_frames)} important moments of this video, {count_upg*5} moments contain under or at least PG content. Hence this video is suitable for kids & family." else: return f"Out of {len(video_frames)} important moments of this video, {(len(video_frames)//5 - count_upg)*5} moments contain at least PG-13 content.Hence parental guidance is strongly suggested for this video." @st.cache_resource def load_models(): sst['device'] = 'cuda' if torch.cuda.is_available() else 'cpu' transcriber = whisper.load_model("base", device = sst['device']) summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device = sst['device']) base_frame_emb = torch.tensor( np.load('base_frame_medoid.npz')['arr'], dtype = torch.float32, device = sst['device'] ) session = onnxruntime.InferenceSession("video_rating_siamesev2.onnx", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'] ) return ( transcriber, summarizer, session, base_frame_emb ) audio_transcriber_model, text_summarizer, video_rating_model,base_frame_emb = load_models()