Spaces:

Guhanselvam
/

Audio_recog

Runtime error

App Files Community

Guhanselvam committed on Nov 13, 2024

Commit

b272e20

verified ·

1 Parent(s): e7102c8

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -63

app.py CHANGED Viewed

@@ -1,75 +1,42 @@
-import torch
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
-import sounddevice as sd
-import soundfile as sf
 import numpy as np
-import requests
-import webbrowser
-import os
-# Load pre-trained Wav2Vec2 model and tokenizer
-model_name = "facebook/wav2vec2-large-xlsr-53"  # Model name for audio processing
 tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
 model = Wav2Vec2ForCTC.from_pretrained(model_name)
-# Function to record audio
-def record_audio(duration=5, fs=16000):
-    print("Recording...")
-    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='float32')
-    sd.wait()  # Wait until recording is finished
-    print("Recording finished.")
-    return audio.flatten()
-# Function for emotion recognition
-def recognize_emotion(audio):
-    # Normalize audio if necessary (check your audio data properties if required)
-    input_values = tokenizer(audio, return_tensors='pt', padding='longest', sampling_rate=16000).input_values
-    # Get the logits (raw predictions) and take the argmax to get the predicted token IDs
-    with torch.no_grad():
-        logits = model(input_values).logits
-        predicted_ids = torch.argmax(logits, dim=-1)
-    # Decode the predicted IDs to text
-    transcription = tokenizer.decode(predicted_ids[0])
-    return transcription  # Return the detected text
-# Function to get Spotify playlist based on mood
-def get_playlist(mood):
-    url = "https://unsa-unofficial-spotify-api.p.rapidapi.com/search"
-    querystring = {"query": mood, "count": "10", "type": "playlists"}
-    headers = {
-        'x-rapidapi-key': "your-api-key",  # Replace with your actual API key
-        'x-rapidapi-host': "unsa-unofficial-spotify-api.p.rapidapi.com"
-    }
-    try:
-        response = requests.get(url, headers=headers, params=querystring)
-        response.raise_for_status()  # Raises error for bad responses
-        playlist_id = response.json()["Results"][0]["id"]  # Get the first playlist
-        return playlist_id
-    except requests.exceptions.RequestException as e:
-        print(f"Error fetching playlist data: {e}")
-        return None
-# Function to open the Spotify playlist in a web browser
-def open_playlist(playlist_id):
-    webbrowser.open(f'https://open.spotify.com/playlist/{playlist_id}')
-# Main function to record audio and recognize mood
-def main():
-    # Record audio
-    audio = record_audio()
-    # Recognize the mood/emotion from audio
-    emotion_text = recognize_emotion(audio)
-    print(f"Detected Emotion: {emotion_text}")
-    # Get Spotify playlist based on the detected emotion
-    playlist_id = get_playlist(emotion_text)
-    if playlist_id:
-        open_playlist(playlist_id)
 if __name__ == "__main__":
-    main()

 import numpy as np
+import soundfile as sf
+import librosa
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
+import torch
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+# Load Hugging Face's Wav2Vec2 model and tokenizer
+model_name = "facebook/wav2vec2-large-xlsr-53"
 tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
 model = Wav2Vec2ForCTC.from_pretrained(model_name)
+def load_audio(file_path):
+    audio, sample_rate = sf.read(file_path)
+    return audio
+def extract_mfcc_features(audio, sample_rate):
+    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
+    mfccs_scaled = np.mean(mfccs.T, axis=0)
+    return mfccs_scaled
+def predict_emotion(file_path):
+    audio = load_audio(file_path)
+    mfcc_features = extract_mfcc_features(audio, 16000)  # Adjust sample rate if needed
+    # Prepare the model input from the loaded audio (placeholder pipeline; the MFCC features computed above are not used)
+    encoded_input = tokenizer(audio, sampling_rate=16000, return_tensors="pt", padding=True)
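+    # Make sure to use the correct model inputs and outputs for emotion prediction: Wav2Vec2ForCTC returns CTC token logits, so the value decoded below is text, not an emotion label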
+    with torch.no_grad():
+        logits = model(**encoded_input).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    return tokenizer.decode(predicted_ids[0])
+# Example usage of the model
 if __name__ == "__main__":
+    file_name = "path_to_your_audio_file.wav"  # Replace with your audio file path
+    emotion = predict_emotion(file_name)
+    print(f'Predicted Emotion: {emotion}')