from transformers import pipeline
import gradio as gr
import numpy as np
import librosa

# Initialize the speech recognition pipeline
pipe = pipeline("automatic-speech-recognition", model="oyemade/w2v-bert-2.0-yoruba-CV17.0")

def transcribe(audio):
    if audio is None:
        return "No audio detected. Please try again."
    try:
        # Check whether the input is a file path (uploaded file) or a (sample_rate, array) tuple (microphone input)
        if isinstance(audio, str):
            # Load the audio file with librosa, resampling to the 16 kHz the model expects
            audio, sr = librosa.load(audio, sr=16000)
        elif isinstance(audio, tuple):  # Gradio audio components can return a (sample_rate, array) tuple
            sr, audio = audio
            # Convert integer PCM samples to float32 in [-1, 1] before resampling
            if np.issubdtype(audio.dtype, np.integer):
                audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
            if sr != 16000:
                audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        else:
            return "Invalid audio format. Please try again."

        # Check that the audio is not silent
        if np.max(np.abs(audio)) < 0.01:
            return "Audio is too quiet. Please speak louder or choose a different file and try again."

        text = pipe(audio)["text"]
        return text
    except Exception as e:
        return f"An error occurred: {str(e)}"

# Create the Gradio interface
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio Input"),
    ],
    outputs="text",
    title="Neoform AI: Yoruba Speech Recognition",
description="Realtime demo for Yoruba speech recognition using a fine-tuned Wav2Vec-Bert model. " | |
"You can either use your microphone or upload an MP3 file. " | |
"https://neoformai.com", | |
) | |
# Launch the interface | |
iface.launch() |
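
# --- Optional local sanity check (a minimal sketch, not part of the original app) ---
# To exercise transcribe() without the Gradio UI, it can be called directly on a local
# audio file path; "sample_yoruba.wav" below is a hypothetical file name, not one that
# ships with this Space.
#
#   print(transcribe("sample_yoruba.wav"))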