"""Gradio demo that transcribes Portuguese speech recorded from the microphone.

The recording is decoded with pydub, resampled to the rate the Wav2Vec2
checkpoint is fed at, and decoded greedily (argmax over the CTC logits).
"""
import logging
import os

import gradio as gr
import speech_recognition as sr
import torch
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

logger = logging.getLogger(__name__)

MODEL_ID = 'jonatasgrosman/wav2vec2-large-xlsr-53-portuguese'
# Sample rate (Hz) the audio is resampled to before being passed to the model.
TARGET_SAMPLE_RATE = 16000

# Load the pre-trained speech recognition model.
# NOTE: `tokenizer` is actually a Wav2Vec2Processor (feature extractor +
# tokenizer); the name is kept so existing references keep working.
tokenizer = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# Not used by recognize_speech(); kept because it is a module-level name that
# other code may reference.
recognizer = sr.Recognizer()


def recognize_speech(audio_path):
    """Transcribe the audio file at *audio_path* and return lower-cased text.

    Args:
        audio_path: Path to the recorded audio file, as handed over by the
            Gradio audio input (``type="filepath"``). May be ``None`` or empty
            when nothing was recorded.

    Returns:
        The lower-cased transcription, or a human-readable message when no
        audio was supplied or the audio could not be decoded/transcribed.
    """
    if not audio_path:
        return "No audio was recorded."

    logger.info("Transcribing %s", audio_path)

    try:
        clip = AudioSegment.from_file(audio_path)
    except (CouldntDecodeError, OSError) as err:
        logger.warning("Could not decode %s: %s", audio_path, err)
        return "Could not understand the audio."

    # Down-mix to mono: get_array_of_samples() interleaves the channels of a
    # multi-channel clip, which would otherwise be fed to the model as if it
    # were a single waveform.
    clip = clip.set_frame_rate(TARGET_SAMPLE_RATE).set_channels(1)
    samples = clip.get_array_of_samples()
    if not samples:
        return "Could not understand the audio."

    waveform = torch.FloatTensor(samples)

    try:
        inputs = tokenizer(
            waveform.numpy(),
            sampling_rate=TARGET_SAMPLE_RATE,
            return_tensors='pt',
            padding='longest',
        )
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            logits = model(inputs.input_values).logits
        tokens = torch.argmax(logits, dim=-1)
        transcriptions = tokenizer.batch_decode(tokens)
    except (RuntimeError, ValueError):
        # e.g. a clip too short for the model's convolutional front end.
        logger.exception("Transcription failed for %s", audio_path)
        return "Could not understand the audio."

    # batch_decode returns one string per batch item; exactly one clip was
    # submitted, so return that string rather than the list's repr.
    return transcriptions[0].lower()


# Create the Gradio interface with microphone input.
# NOTE(review): `gr.inputs.Audio` is the legacy namespace and is kept as-is
# because the right replacement depends on the installed Gradio version.
audio_recognizer_interface = gr.Interface(
    fn=recognize_speech,
    inputs=gr.inputs.Audio(
        source="microphone",
        type="filepath",
        label="Speak into the microphone...",
    ),
    outputs="text",
    title="Real-time Speech Recognition",
)

# Run the interface only when executed as a script, so importing this module
# does not start a web server.
if __name__ == "__main__":
    audio_recognizer_interface.launch()