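# Gradio demo: speaker-diarized transcription with Whisper (ASR) and pyannote (speaker labels),
# combined through the ASRDiarizationPipeline imported below.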
import torch
import os
import gradio as gr
import pytube as pt
from asr_diarize import ASRDiarizationPipeline # TODO: speechbox import
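# ASR checkpoint and runtime configuration: use GPU 0 if available, otherwise fall back to CPU.
# HF_TOKEN is read from the environment so that gated models (e.g. the pyannote speaker-diarization
# checkpoint) can be downloaded.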
MODEL_NAME = "openai/whisper-tiny"
device = 0 if torch.cuda.is_available() else "cpu"
HF_TOKEN = os.environ.get("HF_TOKEN")
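# Instantiate the combined ASR + speaker-diarization pipeline.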
pipe = ASRDiarizationPipeline.from_pretrained(
    asr_model=MODEL_NAME,
    device=device,
    use_auth_token=HF_TOKEN,
)
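# Render a (start, end) timestamp tuple as a string, rounded to `ndigits` decimal places.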
def tuple_to_string(start_end_tuple, ndigits=1):
    return str((round(start_end_tuple[0], ndigits), round(start_end_tuple[1], ndigits)))
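# Join diarized segments into a readable transcript, one speaker turn per paragraph.
# Each segment is expected to be a dict like (assumed schema, inferred from the accesses below):
#   {"speaker": "SPEAKER_00", "timestamp": (0.0, 5.2), "text": " Hello there."}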
def format_as_transcription(raw_segments, with_timestamps=False):
    if with_timestamps:
        return "\n\n".join(
            [chunk["speaker"] + " " + tuple_to_string(chunk["timestamp"]) + chunk["text"] for chunk in raw_segments]
        )
    else:
        return "\n\n".join([chunk["speaker"] + chunk["text"] for chunk in raw_segments])
def transcribe(file_upload, with_timestamps):
    raw_segments = pipe(file_upload)
    transcription = format_as_transcription(raw_segments, with_timestamps=with_timestamps)
    return transcription
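# Build an HTML <iframe> embed for the given YouTube URL so the video can be shown alongside its transcript.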
def _return_yt_html_embed(yt_url):
    video_id = yt_url.split("?v=")[-1]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str
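# Download the audio track of a YouTube video with pytube, run it through the pipeline,
# and return both the video embed and the formatted transcript.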
def yt_transcribe(yt_url, with_timestamps):
    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)
    stream = yt.streams.filter(only_audio=True)[0]
    stream.download(filename="audio.mp3")
    text = pipe("audio.mp3")
    return html_embed_str, format_as_transcription(text, with_timestamps=with_timestamps)
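# Top-level Blocks container that will hold the two tabbed interfaces defined below.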
demo = gr.Blocks()
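# Tab 1: transcribe a locally uploaded audio file.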
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="upload", type="filepath", optional=True),
        gr.Checkbox(label="With timestamps?", value=True),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Whisper Demo: Transcribe Audio",
    description=(
        "Transcribe audio files with speaker diarization using 🤗 Speechbox. Demo uses the pre-trained checkpoint"
        " [Whisper Tiny](https://huggingface.co/openai/whisper-tiny) for the ASR transcriptions and"
        " [PyAnnote Audio](https://huggingface.co/pyannote/speaker-diarization) to label the speakers."
    ),
    examples=[
        ["./sample.wav", True],
    ],
    allow_flagging="never",
)
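# Tab 2: transcribe a YouTube video from its URL.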
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Checkbox(label="With timestamps?", value=True),
    ],
    outputs=["html", "text"],
    layout="horizontal",
    theme="huggingface",
    title="Whisper Speaker Diarization Demo: Transcribe YouTube",
    description=(
        "Transcribe YouTube videos with speaker diarization using 🤗 Speechbox. Demo uses the pre-trained checkpoint"
        " [Whisper Tiny](https://huggingface.co/openai/whisper-tiny) for the ASR transcriptions and"
        " [PyAnnote Audio](https://huggingface.co/pyannote/speaker-diarization) to label the speakers."
    ),
    examples=[
        ["https://www.youtube.com/watch?v=9dAWIPixYxc", True],
    ],
    allow_flagging="never",
)
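# Assemble the two interfaces into tabs and launch the app with request queueing enabled.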
with demo:
    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])

demo.launch(enable_queue=True, share=True)