"""Gradio app: Persian speech-to-text with a CTranslate2 Whisper large-v3 model."""

import gradio as gr
from transformers import pipeline
from faster_whisper import WhisperModel
import librosa

# Load the CTranslate2-converted Whisper model on CPU with int8 quantization.
model = WhisperModel(
    "navidved/persian-whisper-large-v3-ct2", device="cpu", compute_type="int8"
)

# Maximum accepted audio length in seconds; longer clips are rejected up front.
MAX_AUDIO_LENGTH = 40


def transcribe_audio(audio):
    """Transcribe an uploaded/recorded audio file to text.

    Args:
        audio: Filesystem path to the audio file (Gradio ``type="filepath"``),
            or ``None`` when nothing was recorded/uploaded.

    Returns:
        The transcription string, or a human-readable error message.
    """
    if audio is None:
        return "No audio file uploaded. Please try again."
    try:
        # Load only to measure duration; transcription reads the file itself.
        audio_data, sr = librosa.load(audio, sr=None)
        duration = librosa.get_duration(y=audio_data, sr=sr)

        # Reject clips longer than the allowed duration.
        if duration > MAX_AUDIO_LENGTH:
            return (
                f"Audio is too long. Please upload an audio file shorter than "
                f"{MAX_AUDIO_LENGTH} seconds."
            )

        # faster-whisper returns a *generator* of Segment objects — it has no
        # `.text` attribute itself, so the segments must be iterated and joined.
        segments, _ = model.transcribe(audio, vad_filter=True)
        return "".join(segment.text for segment in segments).strip()
    except Exception as e:
        return f"Error during transcription: {str(e)}"


# Build the UI: audio input (file upload or microphone) -> transcription text.
with gr.Blocks() as interface:
    gr.Markdown("# Whisper Large V3 Speech Recognition")
    gr.Markdown("Upload an audio file or use your microphone to transcribe speech to text.")

    audio_input = gr.Audio(type="filepath", label="Input Audio")
    output_text = gr.Textbox(label="Transcription")

    transcribe_button = gr.Button("Transcribe")
    transcribe_button.click(fn=transcribe_audio, inputs=audio_input, outputs=output_text)

# Launch the Gradio app.
interface.launch()