import streamlit as st
import torch
from utils import (
    prompt_audio_summarization,
    timer,
    cosine_sim
)
from transformers import BartForConditionalGeneration, BartTokenizer
import numpy as np
import whisper
from streamlit import session_state as sst
import onnxruntime


@timer
def get_text_from_audio(audio_tensors) -> str:
    """Transcribe in-memory audio with the loaded Whisper model.

    Args:
        audio_tensors: Audio waveform as a tensor (must support ``.to(device)``).
            Presumably mono audio at Whisper's expected sample rate --
            TODO confirm against the caller.

    Returns:
        The ``"text"`` field of the result returned by ``transcribe``.
    """
    # Take the device from the model rather than from ``st.session_state``:
    # ``load_models`` is wrapped in ``st.cache_resource`` and therefore runs
    # once per process, so only the session that triggered the first load ever
    # gets ``sst['device']``; any later session would raise KeyError here.
    device = audio_transcriber_model.device
    result = audio_transcriber_model.transcribe(audio_tensors.to(device))
    return result["text"]

@timer
def summarize_from_text(raw_transcription):
    """Summarize a transcription with the loaded BART tokenizer/model pair.

    Args:
        raw_transcription: Transcribed text. It is appended to
            ``prompt_audio_summarization`` before tokenization.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens stripped.
    """
    tokenizer, model = text_summarizer

    # Use the model's own device instead of ``sst['device']``: that key is only
    # written inside the ``st.cache_resource``-cached ``load_models``, which
    # does not re-run for new sessions, so it can be missing from session state.
    inputs = tokenizer(
        prompt_audio_summarization + raw_transcription,
        return_tensors="pt",
        max_length=1024,  # longer inputs are truncated to this many tokens
        truncation=True,
    ).to(model.device)

    summary_ids = model.generate(
        **inputs,
        max_length=150,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

@timer
def rate_video_frames(video_frames):
    """Rate sampled video frames against the reference embedding.

    Frames are grouped into clips of 5, embedded by the ONNX rating model and
    compared with ``base_frame_emb`` through ``cosine_sim``. The share of clips
    counted by ``cosine_sim`` decides which message is returned.

    Args:
        video_frames: Sequence of frames convertible to a float32 array that
            reshapes to ``(len(video_frames) // 5, 5, 224, 224, 3)``. The
            length must be a non-zero multiple of 5.

    Returns:
        A human-readable sentence stating whether the video is suitable for
        kids & family or whether parental guidance is suggested.

    Raises:
        ValueError: If fewer than 5 frames are given. A frame count that is
            not a multiple of 5 also fails with ValueError from the reshape.
    """
    frames_per_clip = 5
    n_clips = len(video_frames) // frames_per_clip

    # Without at least one full clip the percentage below would divide by zero
    # (or the reshape would fail with an opaque message), so fail early.
    if n_clips == 0:
        raise ValueError(
            f"rate_video_frames needs at least {frames_per_clip} frames, "
            f"got {len(video_frames)}"
        )

    inp_frames = np.array(video_frames, dtype=np.float32).reshape(
        n_clips, frames_per_clip, 224, 224, 3
    )
    inputs_dict = {"frames": inp_frames}

    # The session returns a list with one entry per requested output name.
    video_frame_emb = video_rating_model.run(['emb'], inputs_dict)[0]

    # NOTE(review): base_frame_emb is created on the load-time device while this
    # tensor is created on CPU -- confirm cosine_sim reconciles the devices.
    overall_sim, count_upg = cosine_sim(emb1=base_frame_emb,
                                        emb2=torch.tensor(video_frame_emb),
                                        threshold=0.4
                                        )

    # Presumably count_upg is the number of clips rated "under PG" -- verify
    # against cosine_sim; here it is only used as a count of clips.
    perc_of_upg = count_upg / n_clips

    if perc_of_upg > 0.4:
        return f"Out of {len(video_frames)} important moments of this video, {count_upg*5} moments contain under or at least PG content. Hence this video is suitable for kids & family."
    else:
        return f"Out of {len(video_frames)} important moments of this video, {(n_clips - count_upg)*5} moments contain at least PG-13 content.Hence parental guidance is strongly suggested for this video."

@st.cache_resource
def load_models():
    """Load every model and artifact the app needs; cached by Streamlit.

    Returns:
        A 4-tuple ``(transcriber, (tokenizer, model), session, base_frame_emb)``:
        the Whisper "base" model, the BART tokenizer/model pair for
        "facebook/bart-large-cnn", the ONNX Runtime session for
        "video_rating_siamesev2.onnx", and the float32 tensor loaded from the
        ``'arr'`` entry of "base_frame_medoid.npz".
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Kept for code that reads the device from session state.
    # NOTE: st.cache_resource runs this body once and shares the result across
    # sessions, so only the session that triggers the first load gets this key.
    sst['device'] = device

    transcriber = whisper.load_model("base", device=device)

    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

    # np.load on an .npz returns an NpzFile that keeps the archive open; the
    # context manager closes it once the array has been copied into a tensor.
    with np.load('base_frame_medoid.npz') as archive:
        base_frame_emb = torch.tensor(
            archive['arr'],
            dtype=torch.float32,
            device=device,
        )

    # Providers are listed in priority order: CUDA first, then CPU.
    session = onnxruntime.InferenceSession(
        "video_rating_siamesev2.onnx",
        providers=['CUDAExecutionProvider', 'CPUExecutionProvider'],
    )

    return transcriber, (tokenizer, model), session, base_frame_emb

# Module-level handles used by the functions above; load_models() is wrapped in
# st.cache_resource, so the underlying objects are built once and then reused.
(
    audio_transcriber_model,
    text_summarizer,
    video_rating_model,
    base_frame_emb,
) = load_models()