AmithAdiraju1694 committed
Commit 271d9ed · verified · 1 Parent(s): 470188c

refactor_inference (#1)


- Modified inference order, moved transcription and summarization into separate functions, and replaced the regular PyTorch model with an ONNX one for faster inference (2beef55857c2f71271d604afca602cc2e7376e50). See the export sketch after the file list below.

Files changed (5)
  1. app.py +2 -1
  2. model_inference.py +30 -25
  3. pages.py +30 -24
  4. requirements.txt +1 -0
  5. video_rating_siamesev2.onnx +3 -0
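
The new video_rating_siamesev2.onnx file was presumably produced by exporting the repo's SiameseNetwork to ONNX. The commit itself does not include the export script; below is a minimal sketch of what such an export could look like, assuming the model's forward pass accepts float32 batches of shape (N, 5, 224, 224, 3) and returns the embedding. The input name "frames" and output name "emb" come from the model_inference.py diff; the dynamic batch axis is an assumption.

```python
# Hypothetical export sketch, not the author's actual script.
import torch
from model_inference import SiameseNetwork  # model class defined in this repo

model = SiameseNetwork()  # trained weights would be loaded here
model.eval()

dummy = torch.randn(1, 5, 224, 224, 3, dtype=torch.float32)  # one group of 5 frames
torch.onnx.export(
    model,
    dummy,
    "video_rating_siamesev2.onnx",
    input_names=["frames"],   # matches inputs_dict = {"frames": ...} below
    output_names=["emb"],     # matches video_rating_model.run(['emb'], ...)
    dynamic_axes={"frames": {0: "batch"}, "emb": {0: "batch"}},  # assumed
)
```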
app.py CHANGED
@@ -1,6 +1,7 @@
 import streamlit as st
 from streamlit import session_state as sst
 import asyncio
+import torch
 
 from pages import landing_page, model_inference_page
 
@@ -9,7 +10,7 @@ if "page" not in sst:
 
 
 def reset_sst():
     for key in list(sst.keys()):
-        if key != "page":
+        if key != "page" and key != 'device':
            sst.pop(key, None)
 
 
model_inference.py CHANGED
@@ -14,6 +14,8 @@ from utils import timer
 import numpy as np
 import whisper
 from utils import batch_generator, cosine_sim
+from streamlit import session_state as sst
+import onnxruntime
 
 
 
@@ -67,13 +69,19 @@ class SiameseNetwork(nn.Module):
 
 
 @timer
-def summarize_from_audio(audio_tensor):
-
+def get_text_from_audio(audio_tensors):
+    """Transcribe multiple audio tensors in parallel using Whisper's batch processing."""
     # Transcribe the in-memory audio
-    result = audio_transcriber_model.transcribe(audio_tensor)
+    audio_tensors = audio_tensors.to(sst['device'])
+    result = audio_transcriber_model.transcribe(audio_tensors)
     all_transcription_segments = result["text"]
+    return all_transcription_segments
+
+@timer
+def summarize_from_text(raw_transcription):
 
-    summary = text_summarizer(prompt_audio_summarization + all_transcription_segments,
+    summary = text_summarizer(prompt_audio_summarization + raw_transcription,
                               max_length=108,
                               min_length=36, do_sample=False)[0]['summary_text']
 
@@ -125,44 +133,41 @@ def rate_video_frames(video_frames):
     Classifies video frames into another category.
     """
 
-    tensor = torch.tensor(
-        np.array(video_frames),
-        dtype = torch.float32
-    ).reshape(len(video_frames)//5, 5, 224, 224, 3)  # 20,5,224,224,3
-    video_frame_emb = video_rating_model.inference(tensor)  # 20,128
+    inp_frames = np.array(video_frames, dtype = np.float32).reshape(len(video_frames)//5, 5, 224, 224, 3)  # 20,5,224,224,3
+    inputs_dict = {"frames": inp_frames}
+
+    video_frame_emb = video_rating_model.run(['emb'], inputs_dict)[0]
 
     overall_sim, count_upg = cosine_sim(emb1 = base_frame_emb,
-                                        emb2 = video_frame_emb,
+                                        emb2 = torch.tensor(video_frame_emb),
                                         threshold=0.4
                                         )
 
-    if count_upg / (len(video_frames)//5) > 0.5:
+    perc_of_upg = count_upg / (len(video_frames)//5)
+
+    if perc_of_upg > 0.4:
         return f"Out of {len(video_frames)} important moments of this video, {count_upg*5} moments contain under or at least PG content. Hence this video is suitable for kids & family."
     else:
         return f"Out of {len(video_frames)} important moments of this video, {(len(video_frames)//5 - count_upg)*5} moments contain at least PG-13 content. Hence parental guidance is strongly suggested for this video."
 
 @st.cache_resource
 def load_models():
-
-    transcriber = whisper.load_model("base")
-    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+    sst['device'] = 'cuda' if torch.cuda.is_available() else 'cpu'
+    transcriber = whisper.load_model("base", device = sst['device'])
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device = sst['device'])
 
     base_frame_emb = torch.tensor(
         np.load('base_frame_medoid.npz')['arr'],
-        dtype = torch.float32
+        dtype = torch.float32,
+        device = sst['device']
     )
 
-    video_rating_model = SiameseNetwork()
-    # video_rating_model.load_state_dict(
-    #     torch.load('/Users/amithadiraju/Desktop/Video_Summary_App/video_contrastive-siamese_v3.pt',
-    #         weights_only = True
-    #     )
-    # )
-    video_rating_model.eval()
-
+    session = onnxruntime.InferenceSession("video_rating_siamesev2.onnx",
+                                           providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
+                                           )
+
     return (
-        transcriber, summarizer, video_rating_model, base_frame_emb
+        transcriber, summarizer, session, base_frame_emb
    )
 
 audio_transcriber_model, text_summarizer, video_rating_model, base_frame_emb = load_models()
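
Since the rating model is now a plain onnxruntime session, it can be sanity-checked outside Streamlit. A minimal sketch using only names that appear in this diff; the expected (20, 128) output shape comes from the removed "# 20,128" comment:

```python
# Standalone smoke test for the exported rating model.
import numpy as np
import onnxruntime

session = onnxruntime.InferenceSession(
    "video_rating_siamesev2.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],  # CPU fallback
)

frames = np.random.rand(20, 5, 224, 224, 3).astype(np.float32)  # 100 frames in groups of 5
emb = session.run(["emb"], {"frames": frames})[0]
print(emb.shape)  # expected: (20, 128), per the old inline comment
```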
pages.py CHANGED
@@ -5,7 +5,7 @@ import time
 import pandas as pd
 from utils import navigate_to
 
-from model_inference import rate_video_frames,summarize_from_audio
+from model_inference import rate_video_frames,get_text_from_audio, summarize_from_text
 from utils import read_important_frames, extract_audio
 import numpy as np
 
@@ -61,28 +61,12 @@ async def landing_page():
 
 
 async def model_inference_page():
-
-    df = pd.DataFrame([('Video_Text_Summary', 'Video_Rating_Scale')])
-    sl_df = st.table(df)
-
-    # check if audio is present and it's non-empty
-    if "audio_transcript" in sst:
-
-        video_summary_text = summarize_from_audio(sst["audio_transcript"])
-
-        if len(video_summary_text) > 0:
-            pass
-        else:
-            video_summary_text = "Sorry, we couldn't find any audio data from your video, hence couldn't generate any summary"
-
-        print("Time taken to generate text summary from audio in seconds: ", summarize_from_audio.total_time)
-
 
     # check if frames are present and they are non-empty
     if "important_frames" in sst:
 
         important_frames = sst["important_frames"]
-        with st.spinner("Generating text summary for your video"):
+        with st.spinner("Generating Movie Scale rating for your video"):
             video_rating_scale = rate_video_frames(important_frames)
 
         if len(video_rating_scale) > 0:
@@ -90,13 +74,35 @@ async def model_inference_page():
         else:
             video_rating_scale = "Sorry, we couldn't find any images from your video, hence couldn't generate any summary"
 
-        print("Time taken to generate video rating in seconds: ", rate_video_frames.total_time)
+        st.toast("Done")
+        st.header("Movie Scale Rating of Your Video: ", divider = True)
+        st.write(video_rating_scale)
+        st.markdown("************************")
+
+
+    # check if audio is present and it's non-empty
+    if "audio_transcript" in sst:
+
+        with st.spinner("Extracting text from audio file"):
+            video_raw_text = get_text_from_audio(sst["audio_transcript"])
+        st.toast("Done")
+
+        with st.spinner("Summarizing text from entire transcript"):
+            video_summary_text = summarize_from_text(video_raw_text)
+        st.toast("Done")
+
+        if len(video_summary_text) > 0:
+            pass
+        else:
+            video_summary_text = "Sorry, we couldn't find any audio data from your video, hence couldn't generate any summary"
+
+        print("Time taken to get raw text from audio in seconds: ", get_text_from_audio.total_time)
+        print("Time taken to generate text summary from raw text in seconds: ", summarize_from_text.total_time)
+
+        st.header("Audio Transcript summary of your video: ", divider = True)
+        st.write(video_summary_text)
 
-    sl_df.add_rows(
-
-        [( video_summary_text, video_rating_scale ) ]
-
-    )
 
     st.button("Go Home",
               on_click = navigate_to,
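
pages.py reads get_text_from_audio.total_time and summarize_from_text.total_time right after calling the functions, so the @timer decorator from utils evidently records the last runtime as an attribute on the wrapped function. The decorator itself is not part of this commit; below is a minimal sketch of one plausible implementation, purely an assumption about how utils.timer behaves:

```python
# Hypothetical sketch of utils.timer; the repo's real implementation may differ.
import time
from functools import wraps

def timer(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        # expose the last call's duration, as read in pages.py
        wrapper.total_time = time.perf_counter() - start
        return result

    wrapper.total_time = 0.0  # defined even before the first call
    return wrapper
```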
requirements.txt CHANGED
@@ -23,6 +23,7 @@ PyYAML==6.0.2
 safetensors==0.4.5
 scipy==1.13.1
 sentencepiece==0.2.0
+onnxruntime-gpu==1.17.1
 smmap==5.0.2
 sniffio==1.3.1
 soundfile==0.13.1
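
onnxruntime-gpu ships the CUDAExecutionProvider requested in load_models(); when no compatible GPU or CUDA runtime is present, the session falls back to the CPUExecutionProvider listed after it. A quick way to confirm which providers a given machine actually offers:

```python
import onnxruntime

# Lists execution providers available in this build/environment,
# e.g. ['CUDAExecutionProvider', 'CPUExecutionProvider'] on a GPU box.
print(onnxruntime.get_available_providers())
```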
video_rating_siamesev2.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f073fb71728915d53cde75d316c3196c07bda3fe79af5acab6596bc397146b6
+size 344064697