import gradio as gr
from transformers import pipeline  # NOTE(review): unused here — kept in case another part of the project relies on the import side effects; confirm and remove
from faster_whisper import WhisperModel
import librosa

# Load the ASR model once at module import time (CPU, int8-quantized weights).
model = WhisperModel(
    "navidved/faster-gooya-v1",
    device="cpu",
    compute_type="int8",
    local_files_only=False,
)

# Reject uploads longer than this many seconds to keep CPU inference snappy.
MAX_AUDIO_LENGTH = 30  # seconds


def transcribe_audio(audio):
    """Transcribe an uploaded/recorded audio file to Persian text.

    Parameters
    ----------
    audio : str | None
        Filesystem path to the audio file (Gradio `type="filepath"`),
        or None when nothing was uploaded.

    Returns
    -------
    str
        The concatenated transcription, or a human-readable error message.
        Errors are returned as strings (not raised) so Gradio can display
        them directly in the output textbox.
    """
    if audio is None:
        return "No audio file uploaded. Please try again."

    try:
        # Decode at native sample rate only to measure duration before
        # committing to a (comparatively expensive) model inference.
        audio_data, sr = librosa.load(audio, sr=None)
        duration = librosa.get_duration(y=audio_data, sr=sr)

        # Enforce the upload length limit.
        if duration > MAX_AUDIO_LENGTH:
            return (
                f"Audio is too long. Please upload an audio file shorter "
                f"than {MAX_AUDIO_LENGTH} seconds."
            )

        # Perform transcription; VAD filtering drops non-speech segments.
        segments, _ = model.transcribe(audio, vad_filter=True)
        # Join segment texts in one pass instead of quadratic `+=` in a loop.
        return "".join(seg.text for seg in segments)
    except Exception as e:
        # Top-level UI boundary: surface the failure to the user rather
        # than crashing the Gradio worker.
        return f"Error during transcription: {str(e)}"


# Build the Gradio UI: audio in (upload or microphone), transcription out.
with gr.Blocks() as interface:
    gr.Markdown("# Gooya v1 Persian Speech Recognition")
    gr.Markdown("Upload an audio file or use your microphone to transcribe speech to text.")

    # Input and output components.
    audio_input = gr.Audio(type="filepath", label="Input Audio")
    output_text = gr.Textbox(label="Transcription")

    # Button that triggers transcription on click.
    transcribe_button = gr.Button("Transcribe")
    transcribe_button.click(fn=transcribe_audio, inputs=audio_input, outputs=output_text)

# Launch the Gradio app.
interface.launch()