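# app.py — Gradio Space that transcribes audio with speaker diarization,
# using 🤗 Speechbox (Whisper for ASR, pyannote.audio to label the speakers).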
import torch
import os
import gradio as gr
import pytube as pt
from speechbox import ASRDiarizationPipeline
MODEL_NAME = "openai/whisper-tiny"
device = 0 if torch.cuda.is_available() else "cpu"

# Access token for the gated pyannote speaker-diarization model on the Hub.
HF_TOKEN = os.environ.get("HF_TOKEN")

pipe = ASRDiarizationPipeline.from_pretrained(
    asr_model=MODEL_NAME,
    device=device,
    use_auth_token=HF_TOKEN,
)
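# The pipeline yields one dict per speaker turn, with "speaker", "timestamp"
# (a (start, end) tuple, in seconds) and "text" keys; the helpers below format
# those segments into a plain-text transcript.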
def tuple_to_string(start_end_tuple, ndigits=1):
    """Render a (start, end) timestamp tuple as a string, rounded to `ndigits`."""
    return str((round(start_end_tuple[0], ndigits), round(start_end_tuple[1], ndigits)))
def format_as_transcription(raw_segments, with_timestamps=False):
    """Join diarized segments into a transcript, one speaker turn per paragraph."""
    if with_timestamps:
        return "\n\n".join(
            chunk["speaker"] + " " + tuple_to_string(chunk["timestamp"]) + chunk["text"]
            for chunk in raw_segments
        )
    return "\n\n".join(chunk["speaker"] + chunk["text"] for chunk in raw_segments)
def transcribe(file_upload, with_timestamps):
    """Run the diarization pipeline on an uploaded audio file."""
    raw_segments = pipe(file_upload)
    return format_as_transcription(raw_segments, with_timestamps=with_timestamps)
def _return_yt_html_embed(yt_url):
    """Build an HTML embed for the given YouTube URL."""
    video_id = yt_url.split("?v=")[-1]
    return (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
def yt_transcribe(yt_url, with_timestamps):
    """Transcribe a YouTube video's audio track; return (embed HTML, transcript)."""
    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)
    # Download the first audio-only stream to a local file the pipeline can read.
    stream = yt.streams.filter(only_audio=True)[0]
    stream.download(filename="audio.mp3")
    text = pipe("audio.mp3")
    return html_embed_str, format_as_transcription(text, with_timestamps=with_timestamps)
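# UI: a tabbed Blocks demo with one Interface per input source
# (local file upload vs. YouTube URL).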
demo = gr.Blocks()
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="upload", type="filepath"),
        gr.Checkbox(label="With timestamps?", value=True),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Whisper Speaker Diarization: Transcribe Audio",
    description=(
        "Transcribe audio files with speaker diarization using 🤗 Speechbox. Demo uses the pre-trained checkpoint"
        " [Whisper Tiny](https://huggingface.co/openai/whisper-tiny) for the ASR transcriptions and"
        " [PyAnnote Audio](https://huggingface.co/pyannote/speaker-diarization) to label the speakers."
    ),
    examples=[
        ["./processed.wav", True],
        ["./processed.wav", False],
    ],
    allow_flagging="never",
)
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Checkbox(label="With timestamps?", value=True),
    ],
    outputs=["html", "text"],
    layout="horizontal",
    theme="huggingface",
    title="Whisper Speaker Diarization: Transcribe YouTube",
    description=(
        "Transcribe YouTube videos with speaker diarization using 🤗 Speechbox. Demo uses the pre-trained checkpoint"
        " [Whisper Tiny](https://huggingface.co/openai/whisper-tiny) for the ASR transcriptions and"
        " [PyAnnote Audio](https://huggingface.co/pyannote/speaker-diarization) to label the speakers."
    ),
    examples=[
        ["https://www.youtube.com/watch?v=9dAWIPixYxc", True],
        ["https://www.youtube.com/watch?v=9dAWIPixYxc", False],
    ],
    allow_flagging="never",
)
with demo:
    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])

demo.launch(enable_queue=True)