Spaces:

Guhanselvam
/

Audio_recog

Runtime error

File size: 1,516 Bytes

ec8aa5d
b272e20
 
 
 
 
 
e59aa7b
b272e20
 
e59aa7b
 
 
b272e20
 
 
e59aa7b
b272e20
 
 
 
e59aa7b
b272e20
 
 
 
 
 
e59aa7b
b272e20
 
 
ec8aa5d
b272e20
ec8aa5d
b272e20
ec8aa5d
b272e20
ec8aa5d
b272e20

import numpy as np
import soundfile as sf
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import torch
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the Hugging Face Wav2Vec2 tokenizer and CTC model once, at import time.
# Both are created from the same checkpoint name and are used as module-level
# globals by predict_emotion().
# NOTE(review): the checkpoint name looks like a pretrained-only (not
# fine-tuned) model -- confirm it actually ships a tokenizer vocabulary and a
# trained CTC head before relying on the decoded output.
model_name = "facebook/wav2vec2-large-xlsr-53"
tokenizer, model = (
    Wav2Vec2Tokenizer.from_pretrained(model_name),
    Wav2Vec2ForCTC.from_pretrained(model_name),
)

def load_audio(file_path):
    """Read an audio file with soundfile and return only its sample data.

    Args:
        file_path: Location of the audio file, passed straight to ``sf.read``.

    Returns:
        The first element of the ``sf.read`` result (the decoded samples).
        The second element (the sample rate) is discarded, so callers that
        need the rate must obtain it separately.
    """
    return sf.read(file_path)[0]

def extract_mfcc_features(audio, sample_rate):
    """Compute 40 MFCCs for ``audio`` and collapse them into averaged features.

    Args:
        audio: Sample data handed to ``librosa.feature.mfcc`` as ``y``.
        sample_rate: Sampling rate handed to ``librosa.feature.mfcc`` as ``sr``.

    Returns:
        The mean of the transposed MFCC array along its first axis.
        Presumably one averaged value per coefficient (40 values) for mono
        input -- confirm against librosa's output layout.
    """
    coefficients = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    # Transpose first, then average over the leading axis of the result.
    transposed = coefficients.T
    return transposed.mean(axis=0)

def predict_emotion(file_path):
    """Run the module-level Wav2Vec2 CTC model on an audio file and decode it.

    The audio is loaded, reduced to a single channel, converted to float32 and
    resampled to 16 kHz (the rate declared to the tokenizer) before inference.

    Args:
        file_path: Path of the audio file to analyse.

    Returns:
        The string produced by ``tokenizer.decode`` for the arg-max token ids
        of the first (only) batch item. Note that nothing in this function
        maps that output to an emotion label, despite the function's name.
    """
    target_rate = 16000  # rate declared to the tokenizer below

    audio = load_audio(file_path)
    # load_audio() drops the file's sample rate, so read it from the file
    # header instead of assuming the recording is already at 16 kHz.
    source_rate = sf.info(file_path).samplerate

    # soundfile decodes to float64 by default; cast so the input tensor dtype
    # matches a model loaded with default (float32) weights.
    audio = np.asarray(audio, dtype=np.float32)
    if audio.ndim > 1:
        # Multi-channel files come back as (frames, channels); average the
        # channels so a single 1-D waveform is passed on.
        audio = audio.mean(axis=1)
    if source_rate != target_rate:
        audio = librosa.resample(audio, orig_sr=source_rate, target_sr=target_rate)

    # The MFCC features the original computed here were never used by the
    # prediction, so that dead computation has been removed.
    encoded_input = tokenizer(
        audio, sampling_rate=target_rate, return_tensors="pt", padding=True
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded_input).logits

    # Greedy decoding: pick the highest-scoring token id at every position.
    predicted_ids = torch.argmax(logits, dim=-1)

    return tokenizer.decode(predicted_ids[0])

# Script entry point: run a single prediction on a placeholder path and print it.
if __name__ == "__main__":
    # Replace the path below with your audio file path.
    emotion = predict_emotion("path_to_your_audio_file.wav")
    print(f'Predicted Emotion: {emotion}')