Create app.py
app.py
ADDED
@@ -0,0 +1,123 @@
from pytubefix import YouTube
from pydub import AudioSegment
import whisper
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
import gradio as gr
import ast
from IPython.display import Audio, display

nltk.download('punkt')

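# Assumed dependencies (not pinned in the original): pytubefix, pydub,
# openai-whisper, pandas, nltk, gradio, ipython; pydub also requires a
# system-level ffmpeg install for audio decoding.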
model = whisper.load_model("base")
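# "base" favors speed over accuracy; larger Whisper checkpoints ("small",
# "medium", "large") can be substituted when transcript quality matters more.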

def extract_yt_audio(video_url):
    """
    Takes a YouTube URL or a local media file path (video_url) as a string.
    Extracts the audio track, saves it to 'audio.wav', and transcribes it.
    Returns Gradio updates for the transcript box, the chunking button, the
    raw Whisper segments, and the audio player.
    """
    if "youtube.com" in video_url or "youtu.be" in video_url:
        yt = YouTube(video_url)
        audio_stream = yt.streams.filter(only_audio=True).first()
        audio_file = audio_stream.download()
        sample = AudioSegment.from_file(audio_file)
    else:
        sample = AudioSegment.from_file(video_url)

    audio_path = 'audio.wav'
    # export first so the file exists before it is played back or transcribed
    sample.export(audio_path, format="wav")
    display(Audio(audio_path))  # only renders when run inside a notebook

    print("Transcription started \nTranscript:\n")
    result = model.transcribe(audio_path)
    print(result['text'], '\n')
    return (gr.update(visible=True, value=result['text']),
            gr.update(visible=True),
            result['segments'],
            gr.update(visible=True, value=audio_path))

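# Whisper segments are dicts shaped roughly like
# {'start': 0.0, 'end': 4.2, 'text': ' Hello there.'} (plus other keys);
# semantic_chunks() below relies only on 'start', 'end', and 'text'.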
def semantic_chunks(segs, max_chunk_length=15.0):
    """
    Takes the transcribed segments and a maximum chunk duration in seconds
    (default 15.0) and returns the audio grouped into semantic chunks as a
    Gradio DataFrame update.
    """
    # the segments arrive as a string from the hidden Textbox component,
    # so parse them back into a Python list first
    segs = ast.literal_eval(segs)

    chunks = []
    current_chunk = []
    chunk_start_time = None
    chunk_end_time = None
    chunk_duration = 0

    # iterate over segments and create chunks out of each segment
    for segment in segs:
        start = segment['start']
        end = segment['end']
        text = segment['text']

        # sentence-tokenize each segment to capture more semantic context
        sentences = sent_tokenize(text)

        # group sentences into chunks, keeping each chunk under max_chunk_length
        for sentence in sentences:
            # approximate each sentence's duration by splitting the segment
            # duration evenly across its sentences
            sentence_duration = (end - start) / len(sentences)

            # check whether adding the sentence would exceed max_chunk_length
            if chunk_duration + sentence_duration <= max_chunk_length:
                if not current_chunk:
                    chunk_start_time = start
                current_chunk.append(sentence)
                chunk_duration += sentence_duration
                chunk_end_time = end
            else:
                # the chunk would get too long, so finalize the current chunk
                chunks.append({
                    'chunk_id': len(chunks) + 1,
                    'chunk_length (secs)': chunk_duration,
                    'semantic_chunk': ' '.join(current_chunk),
                    'start_time (secs)': chunk_start_time,
                    'end_time (secs)': chunk_end_time
                })
                # start a new chunk with the current sentence
                current_chunk = [sentence]
                chunk_start_time = start
                chunk_end_time = end
                chunk_duration = sentence_duration

    # finalize the last chunk if it exists
    if current_chunk:
        chunks.append({
            'chunk_id': len(chunks) + 1,
            'chunk_length (secs)': chunk_duration,
            'semantic_chunk': ' '.join(current_chunk),
            'start_time (secs)': chunk_start_time,
            'end_time (secs)': chunk_end_time
        })

    return gr.update(visible=True, value=pd.DataFrame(chunks))

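# Note: routing segments through the hidden Textbox below stringifies the
# list, which is why semantic_chunks() needs ast.literal_eval; a gr.State
# component would pass the Python list through unchanged.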
def clear_all():
    """Reset the app: clear the URL and transcript, then hide the chunk
    button, chunk table, and audio player (order matches clear_btn's
    outputs list)."""
    return (gr.update(visible=True, value=""),
            gr.update(visible=True, value=""),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False))

with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Extract audio from video, get the transcript, and then get the semantic chunk information.
        """)
    input_url = gr.Textbox(label="Type in the URL or File Location of the Video", value='https://www.youtube.com/watch?v=ug5e4JfC3oo')
    segments = gr.Textbox(visible=False)
    submit_btn_1 = gr.Button("Get the Transcript", visible=True)
    audio = gr.Audio(visible=True, type="filepath", label='Play Audio')
    transcript = gr.Textbox(visible=True, label='Transcript')
    submit_btn_2 = gr.Button("Get the Semantically Chunked Segments", visible=False)
    chunks = gr.Dataframe(visible=False, label='Semantic Chunks')
    clear_btn = gr.Button("Clear")

    submit_btn_1.click(fn=extract_yt_audio, inputs=[input_url], outputs=[transcript, submit_btn_2, segments, audio])
    submit_btn_2.click(fn=semantic_chunks, inputs=[segments], outputs=[chunks])
    clear_btn.click(fn=clear_all, outputs=[input_url, transcript, submit_btn_2, chunks, audio])

demo.launch(debug=True)
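# debug=True keeps tracebacks visible in the console; adding share=True to
# launch() would also generate a temporary public Gradio link.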