Spaces:
Sleeping
Sleeping
"""Røst speech-to-text demo.""" | |
import logging
import os
import warnings
from urllib.parse import quote

import gradio as gr
import numpy as np
import samplerate
import torch
from dotenv import load_dotenv
from punctfix import PunctFixer
from transformers import pipeline
# Configure root logging at INFO level; each record is rendered as
# "<timestamp> ⋅ <logger name> ⋅ <message>".
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s ⋅ %(name)s ⋅ %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(name="roest-asr-demo")

# Presumably pulls settings from a local .env file into the environment before
# the rest of the script reads it — confirm against the python-dotenv docs.
load_dotenv()

# Keep FutureWarning messages out of the output.
warnings.filterwarnings(action="ignore", category=FutureWarning)
# Inline SVG glyph (14px wide, drawn with the current text colour) that is
# interpolated into DESCRIPTION to show which button uploads an audio file.
icon = """
<svg xmlns="http://www.w3.org/2000/svg" width="14px" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" stroke-linecap="round"
stroke-linejoin="round" style="display: inline;">
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
<polyline points="17 8 12 3 7 8"/>
<line x1="12" y1="3" x2="12" y2="15"/>
</svg>
"""
# Title shown at the top of the demo page.
TITLE = "Røst tale-til-tekst demo"

# Pre-filled subject and body for the "contact us" mailto link.
#
# mailto URIs are not form-encoded: a "+" is a literal plus sign, so spaces must
# be written as "%20" and line breaks as "%0D%0A" (RFC 6068). Non-ASCII and
# reserved characters ("ø", ",", "!", "[", "]", ...) must be percent-encoded as
# UTF-8 as well, which also keeps the surrounding Markdown link syntax intact.
EMAIL_SUBJECT = quote("Røst tale-til-tekst demo", safe="")
EMAIL_BODY = quote(
    """Hej,
Jeg har lige prøvet jeres Røst tale-til-tekst demo, og jeg er imponeret!
Jeg kunne godt tænke mig at høre mere om jeres talegenkendelsesløsninger.
Min use case er [indsæt use case her].
Venlig hilsen,
[dit navn]""".replace("\n", "\r\n"),
    safe="",
)
# Markdown-formatted text passed to the interface as its description. It
# interpolates the inline SVG `icon` and builds a "contact us" mailto link whose
# subject and body are pre-filled from EMAIL_SUBJECT and EMAIL_BODY.
# NOTE(review): the mailto address reads "[email protected]", which looks like
# a redaction placeholder rather than a real address — confirm the recipient.
DESCRIPTION = f"""
This is a demo of the Danish speech recognition model
[Røst](https://huggingface.co/alexandrainst/roest-315m).
Press "Record" to record your
own voice. When you're done you can press "Stop" to stop recording and "Submit" to
send the audio to the model for transcription. You can also upload an audio file by
pressing the {icon} button.
_If you like what you see and are interested in integrating speech-to-text solutions
into your products, feel free to
[contact us](mailto:[email protected]?subject={EMAIL_SUBJECT}&body={EMAIL_BODY})._
"""
logger.info("Loading the ASR model...")

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Speech recognition pipeline. The Hub token is read from the environment; when
# the variable is unset, `True` is passed instead (presumably "use the locally
# stored login" — confirm against the transformers documentation).
transcriber = pipeline(
    model="alexandrainst/roest-315m",
    task="automatic-speech-recognition",
    token=os.environ.get("HUGGINGFACE_HUB_TOKEN", True),
    device=device,
)

logger.info("Loading the punctuation fixer model...")

# Danish punctuation model, placed on the same device as the ASR model.
transcription_fixer = PunctFixer(device=device, language="da")

logger.info("Models loaded, ready to transcribe audio.")
def transcribe_audio(sampling_rate_and_audio: tuple[int, np.ndarray] | None) -> str: | |
"""Transcribe the audio. | |
Args: | |
sampling_rate_and_audio: | |
A tuple with the sampling rate and the audio, or None if no audio was | |
provided. | |
Returns: | |
The transcription. | |
""" | |
if sampling_rate_and_audio is None: | |
return ( | |
"No audio was provided. Please record or upload an audio clip, and try " | |
"again." | |
) | |
sampling_rate, audio = sampling_rate_and_audio | |
if audio.ndim > 1: | |
audio = np.mean(audio, axis=1) | |
audio = samplerate.resample(audio, 16_000 / sampling_rate, "sinc_best") | |
logger.info(f"Transcribing audio clip of {len(audio) / 16_000:.2f} seconds...") | |
transcription = transcriber( | |
inputs=audio, generate_kwargs=dict(language="danish", task="transcribe") | |
) | |
if not isinstance(transcription, dict): | |
return "" | |
logger.info(f"Raw transcription is {transcription['text']!r}. Cleaning it up...") | |
cleaned_transcription = transcription_fixer.punctuate( | |
text=transcription["text"] | |
) | |
logger.info(f"Final transcription: {cleaned_transcription!r}") | |
return cleaned_transcription | |
# Assemble the web UI: one audio input (microphone or file upload) feeding
# `transcribe_audio`, with the result shown in a text box.
demo = gr.Interface(
    title=TITLE,
    description=DESCRIPTION,
    theme=gr.themes.Soft(primary_hue="orange"),
    css="p { font-size: 1.0rem; }",
    fn=transcribe_audio,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        show_label=False,
        # Clip length bounds (presumably in seconds — confirm in the Gradio docs).
        min_length=1,
        max_length=60,
    ),
    outputs="textbox",
    allow_flagging="never",
    # Remote example clips offered in the UI; example caching is switched off.
    examples=[
        "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/bornholmsk.wav",
        "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/soenderjysk.wav",
        "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/nordjysk.wav",
        "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/accent.wav",
    ],
    cache_examples=False,
)

# Start serving the interface.
demo.launch()