import gradio as gr
from transformers import pipeline
from faster_whisper import WhisperModel
import librosa

# Load the CTranslate2-converted Whisper model with faster-whisper (CPU, int8 quantization)
# Earlier transformers-based approach, kept for reference:
# pipe = pipeline("automatic-speech-recognition", model="navidved/persian-whisper-large-v3-ct2")
model = WhisperModel("navidved/persian-whisper-large-v3-ct2", device="cpu", compute_type="int8")
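
# If a GPU is available, faster-whisper can also run on CUDA (an assumption
# about the deployment environment, not used by this Space):
# model = WhisperModel("navidved/persian-whisper-large-v3-ct2", device="cuda", compute_type="float16")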

# Maximum accepted audio length, in seconds
MAX_AUDIO_LENGTH = 40

# Define the inference function
def transcribe_audio(audio):
    if audio is None:
        return "No audio file uploaded. Please try again."
    try:
        # Measure the clip length without resampling
        audio_data, sr = librosa.load(audio, sr=None)
        duration = librosa.get_duration(y=audio_data, sr=sr)
        # Reject clips longer than the allowed duration
        if duration > MAX_AUDIO_LENGTH:
            return f"Audio is too long. Please upload an audio file shorter than {MAX_AUDIO_LENGTH} seconds."
        # Perform transcription; faster-whisper returns a generator of segments,
        # not an object with a .text attribute, so join the segment texts
        segments, _ = model.transcribe(audio, vad_filter=True)
        result = " ".join(segment.text.strip() for segment in segments)
        return result
    except Exception as e:
        return f"Error during transcription: {str(e)}"

# Create a Gradio interface for uploading audio or using the microphone
with gr.Blocks() as interface:
    gr.Markdown("# Whisper Large V3 Speech Recognition")
    gr.Markdown("Upload an audio file or use your microphone to transcribe speech to text.")
    # Create the input and output components
    audio_input = gr.Audio(type="filepath", label="Input Audio")
    output_text = gr.Textbox(label="Transcription")
    # Add a button to trigger the transcription
    transcribe_button = gr.Button("Transcribe")
    # Bind the transcribe_audio function to the button click
    transcribe_button.click(fn=transcribe_audio, inputs=audio_input, outputs=output_text)
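    # The default gr.Audio component accepts both uploads and the microphone;
    # to pin that behavior explicitly (assuming Gradio 4.x), the input could be:
    # audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input Audio")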
# Launch the Gradio app
interface.launch()
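
# For temporary public access during local testing, Gradio can also create a
# share link (optional, not enabled here):
# interface.launch(share=True)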