Create app.py
app.py
ADDED
@@ -0,0 +1,123 @@
from pytubefix import YouTube
from pydub import AudioSegment
import whisper
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
import gradio as gr
import ast
from IPython.display import Audio, display

nltk.download('punkt')

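# Assumed dependencies (not pinned in the original): pytubefix, pydub,
# openai-whisper, pandas, nltk, gradio, ipython; pydub also requires a
# system-level ffmpeg install for audio decoding.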
model = whisper.load_model("base")
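# "base" favors speed over accuracy; larger Whisper checkpoints ("small",
# "medium", "large") can be substituted when transcript quality matters more.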

def extract_yt_audio(video_url):
    """
    Takes a YouTube URL or a local media file path (video_url) as a string.
    Extracts the audio track, saves it to 'audio.wav', and transcribes it.
    Returns Gradio updates for the transcript box, the chunking button, the
    raw Whisper segments, and the audio player.
    """
    if "youtube.com" in video_url or "youtu.be" in video_url:
        yt = YouTube(video_url)
        audio_stream = yt.streams.filter(only_audio=True).first()
        audio_file = audio_stream.download()
        sample = AudioSegment.from_file(audio_file)
    else:
        sample = AudioSegment.from_file(video_url)

    audio_path = 'audio.wav'
    # export first so the file exists before it is played back or transcribed
    sample.export(audio_path, format="wav")
    display(Audio(audio_path))  # only renders when run inside a notebook

    print("Transcription started \nTranscript:\n")
    result = model.transcribe(audio_path)
    print(result['text'], '\n')
    return (gr.update(visible=True, value=result['text']),
            gr.update(visible=True),
            result['segments'],
            gr.update(visible=True, value=audio_path))

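# Whisper segments are dicts shaped roughly like
# {'start': 0.0, 'end': 4.2, 'text': ' Hello there.'} (plus other keys);
# semantic_chunks() below relies only on 'start', 'end', and 'text'.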
def semantic_chunks(segs, max_chunk_length=15.0):
    """
    Takes the transcribed segments and a maximum chunk duration in seconds
    (default 15.0) and returns the audio grouped into semantic chunks as a
    Gradio DataFrame update.
    """
    # the segments arrive as a string from the hidden Textbox component,
    # so parse them back into a Python list first
    segs = ast.literal_eval(segs)

    chunks = []
    current_chunk = []
    chunk_start_time = None
    chunk_end_time = None
    chunk_duration = 0

    # iterate over segments and create chunks out of each segment
    for segment in segs:
        start = segment['start']
        end = segment['end']
        text = segment['text']

        # sentence-tokenize each segment to capture more semantic context
        sentences = sent_tokenize(text)

        # group sentences into chunks, keeping each chunk under max_chunk_length
        for sentence in sentences:
            # approximate each sentence's duration by splitting the segment
            # duration evenly across its sentences
            sentence_duration = (end - start) / len(sentences)

            # check whether adding the sentence would exceed max_chunk_length
            if chunk_duration + sentence_duration <= max_chunk_length:
                if not current_chunk:
                    chunk_start_time = start
                current_chunk.append(sentence)
                chunk_duration += sentence_duration
                chunk_end_time = end
            else:
                # the chunk would get too long, so finalize the current chunk
                chunks.append({
                    'chunk_id': len(chunks) + 1,
                    'chunk_length (secs)': chunk_duration,
                    'semantic_chunk': ' '.join(current_chunk),
                    'start_time (secs)': chunk_start_time,
                    'end_time (secs)': chunk_end_time
                })
                # start a new chunk with the current sentence
                current_chunk = [sentence]
                chunk_start_time = start
                chunk_end_time = end
                chunk_duration = sentence_duration

    # finalize the last chunk if it exists
    if current_chunk:
        chunks.append({
            'chunk_id': len(chunks) + 1,
            'chunk_length (secs)': chunk_duration,
            'semantic_chunk': ' '.join(current_chunk),
            'start_time (secs)': chunk_start_time,
            'end_time (secs)': chunk_end_time
        })

    return gr.update(visible=True, value=pd.DataFrame(chunks))

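# Note: routing segments through the hidden Textbox below stringifies the
# list, which is why semantic_chunks() needs ast.literal_eval; a gr.State
# component would pass the Python list through unchanged.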
def clear_all():
    """Reset the app: clear the URL and transcript, then hide the chunk
    button, chunk table, and audio player (order matches clear_btn's
    outputs list)."""
    return (gr.update(visible=True, value=""),
            gr.update(visible=True, value=""),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False))

with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Extract audio from video, get the transcript, and then get the semantic chunk information.
        """)
    input_url = gr.Textbox(label="Type in the URL or File Location of the Video", value='https://www.youtube.com/watch?v=ug5e4JfC3oo')
    segments = gr.Textbox(visible=False)
    submit_btn_1 = gr.Button("Get the Transcript", visible=True)
    audio = gr.Audio(visible=True, type="filepath", label='Play Audio')
    transcript = gr.Textbox(visible=True, label='Transcript')
    submit_btn_2 = gr.Button("Get the Semantically Chunked Segments", visible=False)
    chunks = gr.Dataframe(visible=False, label='Semantic Chunks')
    clear_btn = gr.Button("Clear")

    submit_btn_1.click(fn=extract_yt_audio, inputs=[input_url], outputs=[transcript, submit_btn_2, segments, audio])
    submit_btn_2.click(fn=semantic_chunks, inputs=[segments], outputs=[chunks])
    clear_btn.click(fn=clear_all, outputs=[input_url, transcript, submit_btn_2, chunks, audio])

demo.launch(debug=True)
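# debug=True keeps tracebacks visible in the console; adding share=True to
# launch() would also generate a temporary public Gradio link.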