whisper-sami-demo

Running on Zero

App Files Files Community

versae committed on May 15, 2024

Commit

6a3ae5e

verified ·

1 Parent(s): 1515c07

Add timestamps

Browse files

Files changed (1) hide show

app.py +19 -5

app.py CHANGED Viewed

@@ -28,8 +28,18 @@ def pipe(file):
     asr.model.config.forced_decoder_ids = asr.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
     return asr(file)
-def transcribe(file):
-    text = pipe(file)["text"]
     return text
@@ -42,13 +52,13 @@ def _return_yt_html_embed(yt_url):
     return HTML_str
-def yt_transcribe(yt_url):
     yt = pt.YouTube(yt_url)
     html_embed_str = _return_yt_html_embed(yt_url)
     stream = yt.streams.filter(only_audio=True)[0]
     stream.download(filename="audio.mp3")
-    text = pipe("audio.mp3")["text"]
     return html_embed_str, text
@@ -59,6 +69,7 @@ mf_transcribe = gr.Interface(
     fn=transcribe,
     inputs=[
         gr.components.Audio(sources=['upload', 'microphone'], type="filepath"),
     ],
     outputs="text",
     theme="huggingface",
@@ -73,7 +84,10 @@ mf_transcribe = gr.Interface(
 yt_transcribe = gr.Interface(
     fn=yt_transcribe,
-    inputs=[gr.components.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
     examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
     outputs=["html", "text"],
     theme="huggingface",

     asr.model.config.forced_decoder_ids = asr.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
     return asr(file)
def transcribe(file, return_timestamps=False):
    """Transcribe an audio file, optionally as timestamped lines.

    Args:
        file: Audio input handed unchanged to ``pipe``.
        return_timestamps: When falsy, return the ``"text"`` entry of the
            ``pipe`` result. When truthy, request chunks from ``pipe`` and
            return one line per chunk.

    Returns:
        Either the plain transcript, or newline-joined lines of the form
        ``[HH:MM:SS -> HH:MM:SS] <chunk text>``.
    """
    if return_timestamps:
        def _clock(seconds):
            # A chunk boundary may be None; show a placeholder instead of
            # failing to format it.
            if seconds is None:
                return "??:??:??"
            return time.strftime('%H:%M:%S', time.gmtime(seconds))

        segments = pipe(file, return_timestamps=True)["chunks"]
        return "\n".join(
            f"[{_clock(seg['timestamp'][0])} -> {_clock(seg['timestamp'][1])}] {seg['text']}"
            for seg in segments
        )

    return pipe(file)["text"]
     return HTML_str
def yt_transcribe(yt_url, return_timestamps=False):
    """Download the audio of a YouTube video and transcribe it.

    Args:
        yt_url: URL of the YouTube video.
        return_timestamps: Forwarded to ``transcribe``; when truthy the
            transcript is returned as timestamped lines.

    Returns:
        A ``(html_embed_str, text)`` tuple: the HTML produced by
        ``_return_yt_html_embed`` for the URL, and the transcript.
    """
    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)
    # Use the first audio-only stream and save it under a fixed name in the
    # working directory.
    stream = yt.streams.filter(only_audio=True)[0]
    stream.download(filename="audio.mp3")
    # Forward the flag: previously it was accepted but dropped, so the
    # "Return timestamps" checkbox had no effect for YouTube input.
    text = transcribe("audio.mp3", return_timestamps=return_timestamps)
    return html_embed_str, text
     fn=transcribe,
     inputs=[
         gr.components.Audio(sources=['upload', 'microphone'], type="filepath"),
+        gr.components.Checkbox(label="Return timestamps"),
     ],
     outputs="text",
     theme="huggingface",
 yt_transcribe = gr.Interface(
     fn=yt_transcribe,
+    inputs=[
+        gr.components.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
+        gr.components.Checkbox(label="Return timestamps"),
+    ],
     examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
     outputs=["html", "text"],
     theme="huggingface",