Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,849 Bytes
07ebfb1 5b11f8b 07ebfb1 6351056 ad6cbd0 07ebfb1 5b11f8b 769dbd6 84d6345 983c638 602514f 071a764 769dbd6 07ebfb1 6a3ae5e c1541fb 07ebfb1 6a3ae5e 07ebfb1 3d53725 07ebfb1 0c101e3 ad19622 07ebfb1 6a3ae5e ad19622 6a3ae5e ad19622 07ebfb1 459ad15 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
import os
import torch
import gradio as gr
import pytube as pt
import spaces
from transformers import pipeline
from huggingface_hub import model_info
MODEL_NAME = "NbAiLab/whisper-large-sme" #this always needs to stay in line 8 :D sorry for the hackiness
# Language code handed to the tokenizer when the decoder prompt is built.
lang = "fi"
# SHARE is treated as enabled when its first character is t/y/1 (case-insensitive).
# Slicing with [:1] instead of indexing with [0] keeps an empty SHARE value from
# raising IndexError; the tuple avoids "" matching as a substring of "ty1".
# A disabled flag collapses to None rather than False.
share = (os.environ.get("SHARE", "False")[:1].lower() in ("t", "y", "1")) or None
# Forwarded as `token=` to the transformers pipeline. Falls back to True when
# AUTH_TOKEN is unset or empty.
# NOTE(review): True presumably means "use locally stored credentials" - confirm.
auth_token = os.environ.get("AUTH_TOKEN") or True
# Prefer the first CUDA device when torch reports one, otherwise run on CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
@spaces.GPU(duration=120)
def pipe(file, return_timestamps=False):
    """Run speech recognition on ``file`` and return the raw pipeline result.

    A fresh ``automatic-speech-recognition`` pipeline for ``MODEL_NAME`` is
    built on every call, placed on the module-level ``device`` and
    authenticated with ``auth_token``.

    Args:
        file: Audio input forwarded unchanged to the pipeline.
        return_timestamps: When true, timestamps are requested from the
            pipeline and the decoder prompt is built without the
            "no timestamps" marker.

    Returns:
        Whatever the pipeline call returns for ``file``.
    """
    recognizer = pipeline(
        task="automatic-speech-recognition",
        model=MODEL_NAME,
        chunk_length_s=30,
        device=device,
        token=auth_token,
    )

    # Pin language and task through the decoder prompt; timestamps are
    # suppressed in the prompt unless the caller asked for them.
    prompt_ids = recognizer.tokenizer.get_decoder_prompt_ids(
        language=lang,
        task="transcribe",
        no_timestamps=not return_timestamps,
    )
    recognizer.model.config.forced_decoder_ids = prompt_ids

    result = recognizer(file, return_timestamps=return_timestamps)
    return result
def transcribe(file, return_timestamps=False):
    """Transcribe ``file`` to text, optionally one line per timestamped chunk.

    Args:
        file: Audio input forwarded to ``pipe``.
        return_timestamps: When false, return the plain ``"text"`` field of
            the result. When true, return one line per chunk in the form
            ``[HH:MM:SS -> HH:MM:SS] text``, joined with newlines.

    Returns:
        The transcription as a single string.
    """
    # Function-scope import: the timestamp branch used `time` without it ever
    # being imported (NameError), and the top-of-file import block must not
    # grow because MODEL_NAME has to stay on line 8.
    import time

    if not return_timestamps:
        return pipe(file)["text"]

    def _clock(seconds):
        # A chunk boundary can be None; render it as a placeholder.
        if seconds is None:
            return "??:??:??"
        return time.strftime('%H:%M:%S', time.gmtime(seconds))

    chunks = pipe(file, return_timestamps=True)["chunks"]
    return "\n".join(
        f"[{_clock(chunk['timestamp'][0])} -> {_clock(chunk['timestamp'][1])}] {chunk['text']}"
        for chunk in chunks
    )
def _return_yt_html_embed(yt_url):
video_id = yt_url.split("?v=")[-1]
HTML_str = (
f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
" </center>"
)
return HTML_str
def yt_transcribe(yt_url, return_timestamps=False):
    """Download the audio of a YouTube video and transcribe it.

    Args:
        yt_url: URL of the YouTube video.
        return_timestamps: Forwarded to ``transcribe``.

    Returns:
        A ``(html_embed, text)`` tuple: the embed snippet for the video and
        its transcription.
    """
    # Function-scope import keeps the top-of-file import block (and the line
    # number of MODEL_NAME) unchanged.
    import tempfile

    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)
    stream = yt.streams.filter(only_audio=True)[0]

    # Each request downloads into its own temporary directory. Writing to a
    # fixed "audio.mp3" in the working directory let concurrent requests
    # overwrite each other's audio and left the file behind afterwards.
    with tempfile.TemporaryDirectory() as workdir:
        audio_path = stream.download(output_path=workdir, filename="audio.mp3")
        text = transcribe(audio_path, return_timestamps=return_timestamps)

    return html_embed_str, text
# Top-level container that hosts the two tabs defined below.
demo = gr.Blocks()

# Tab 1: transcribe an uploaded file or a microphone recording.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.components.Audio(sources=['upload', 'microphone'], type="filepath"),
        # gr.components.Checkbox(label="Return timestamps"),
    ],
    outputs="text",
    theme="huggingface",
    title="Whisper Demo: Transcribe Audio",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the fine-tuned"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    allow_flagging="never",
)

# Tab 2: transcribe the audio track of a YouTube video.
# NOTE: this rebinds the module-level name `yt_transcribe` from the handler
# function to the Interface. `fn=` is evaluated before the assignment, so the
# Interface still wraps the original function.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.components.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        # gr.components.Checkbox(label="Return timestamps"),
    ],
    examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
    outputs=["html", "text"],
    theme="huggingface",
    title="Whisper Demo: Transcribe YouTube",
    description=(
        "Transcribe long-form YouTube videos with the click of a button! Demo uses the fine-tuned checkpoint:"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
        " arbitrary length."
    ),
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])

# Queueing is configured on the Blocks app before it is launched. The previous
# `demo.launch(...).queue()` called `.queue()` on launch()'s return value
# instead of on the app.
# NOTE(review): the module-level `share` flag is not consulted here; sharing is
# always requested - confirm that this is intended.
demo.queue().launch(share=True)
|