import os

import gradio as gr
import librosa
import torch
import whisper
from transformers import Wav2Vec2Processor, Wav2Vec2Tokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Whisper model once at startup instead of on every request.
whisper_model = whisper.load_model("base")


def audio_to_text(audio):
    """Transcribe an uploaded audio file with Whisper."""
    audio = whisper.load_audio(audio)
    result = whisper_model.transcribe(audio)
    return result["text"]
    # Alternative path using the fine-tuned Wav2Vec2 model
    # (see the completed sketch after demo.launch()):
    # tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
    # logits = preprocess(audio)
    # predicted_ids = torch.argmax(logits, dim=-1)
    # transcriptions = tokenizer.decode(predicted_ids[0])
    # return transcriptions


def preprocess(audio):
    """Run the fine-tuned Wav2Vec2 model on an audio file and return its logits."""
    model_save_path = "model_save"
    model_name = "wav2vec2_osr_version_1"
    # Wav2Vec2 expects 16 kHz mono input.
    speech, rate = librosa.load(audio, sr=16000)
    model_path = os.path.join(model_save_path, model_name + ".pt")
    pipeline_path = os.path.join(model_save_path, model_name + "_vocab")
    # Read the Hugging Face token from the environment rather than hard-coding
    # a secret in the source (the original embedded a literal token here).
    access_token = os.environ.get("HF_TOKEN")
    # Note: newer transformers versions use `token=` instead of `use_auth_token=`.
    processor = Wav2Vec2Processor.from_pretrained(pipeline_path, use_auth_token=access_token)
    # The checkpoint is a full pickled model; load it onto the same device as the inputs.
    model = torch.load(model_path, map_location=device)
    model.eval()
    input_values = processor(speech, sampling_rate=rate, return_tensors="pt").input_values.to(device)
    with torch.no_grad():
        logits = model(input_values).logits
    return logits


demo = gr.Interface(
    fn=audio_to_text,
    # Gradio 3.x syntax; Gradio 4+ renamed `source` to `sources=["upload"]`.
    inputs=gr.Audio(source="upload", type="filepath"),
    examples=[["example.flac"]],
    outputs="text",
)

demo.launch()
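
# --- Hedged sketch: the commented-out Wav2Vec2 path above, completed ---
# A minimal sketch, not wired into the interface. It assumes the fine-tuned
# model shares the character vocabulary of "facebook/wav2vec2-base-960h" (as
# the original comments implied); if fine-tuning used a custom vocab, decode
# with the processor saved at pipeline_path instead. `wav2vec2_to_text` is a
# name introduced here for illustration. Since demo.launch() blocks, move this
# definition above the gr.Interface(...) call to actually serve it.
def wav2vec2_to_text(audio):
    tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
    logits = preprocess(audio)
    # Greedy CTC decoding: take the most likely token per frame; decode()
    # collapses repeated tokens and strips the CTC blank/padding tokens.
    predicted_ids = torch.argmax(logits, dim=-1)
    return tokenizer.decode(predicted_ids[0])

# Usage: swap it in as the interface callback, e.g.
#   gr.Interface(fn=wav2vec2_to_text,
#                inputs=gr.Audio(source="upload", type="filepath"),
#                outputs="text")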