import numpy as np
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load a Wav2Vec2 checkpoint that ships with a CTC head and vocabulary.
# Note: "facebook/wav2vec2-large-xlsr-53" is pretrained-only (no tokenizer
# files, no fine-tuned CTC head), so loading a tokenizer from it fails;
# a fine-tuned checkpoint such as "facebook/wav2vec2-large-960h" is needed
# for decoding. Wav2Vec2Processor supersedes the deprecated Wav2Vec2Tokenizer.
model_name = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model.eval()

def load_audio(file_path, target_sr=16000):
    # Wav2Vec2 expects 16 kHz mono audio; librosa resamples on load,
    # so the sample rate is guaranteed rather than assumed.
    audio, _ = librosa.load(file_path, sr=target_sr, mono=True)
    return audio

def extract_mfcc_features(audio, sample_rate):
    # 40 MFCCs per frame, averaged over time into one fixed-length vector.
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    mfccs_scaled = np.mean(mfccs.T, axis=0)
    return mfccs_scaled

def predict_emotion(file_path):
    audio = load_audio(file_path)
    # MFCC summary vector: this is what an emotion classifier would consume
    # (see the training sketch below); it is not used by the CTC model.
    mfcc_features = extract_mfcc_features(audio, 16000)

    encoded_input = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**encoded_input).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    # NOTE: CTC decoding yields a text transcription, not an emotion label.
    # Actual emotion prediction requires a classifier trained on labeled
    # audio, as sketched below.
    return processor.batch_decode(predicted_ids)[0]

# Example usage
if __name__ == "__main__":
    file_name = "path_to_your_audio_file.wav"  # Replace with your audio file path
    emotion = predict_emotion(file_name)
    print(f'Predicted Emotion: {emotion}')
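
# ---------------------------------------------------------------------------
# Training sketch: the original script imports RandomForestClassifier and
# train_test_split but never uses them, which suggests the intended pipeline
# is to classify emotions from the MFCC vectors above. This is a minimal
# sketch of that pipeline, assuming a hypothetical directory layout of
# "dataset/<emotion_label>/*.wav"; the paths and label set are placeholders,
# not a real dataset.
# ---------------------------------------------------------------------------
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def build_mfcc_dataset(root_dir):
    # Assumed layout: one subdirectory per emotion, each containing .wav files.
    features, labels = [], []
    for label in sorted(os.listdir(root_dir)):
        label_dir = os.path.join(root_dir, label)
        if not os.path.isdir(label_dir):
            continue
        for fname in os.listdir(label_dir):
            if not fname.endswith(".wav"):
                continue
            audio = load_audio(os.path.join(label_dir, fname))
            features.append(extract_mfcc_features(audio, 16000))
            labels.append(label)
    return np.array(features), np.array(labels)

def train_emotion_classifier(root_dir):
    # Fit a random forest on the per-file MFCC summary vectors and report
    # accuracy on a stratified 20% hold-out split.
    X, y = build_mfcc_dataset(root_dir)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    clf = RandomForestClassifier(n_estimators=200, random_state=42)
    clf.fit(X_train, y_train)
    print(f"Held-out accuracy: {accuracy_score(y_test, clf.predict(X_test)):.3f}")
    return clf

# Usage (hypothetical paths):
#   clf = train_emotion_classifier("dataset/")
#   audio = load_audio("path_to_your_audio_file.wav")
#   print(clf.predict([extract_mfcc_features(audio, 16000)])[0])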