# emotion_recognizer/utils.py
import soundfile
import librosa
import numpy as np
import pickle
import os
from convert_wavs import convert_audio

AVAILABLE_EMOTIONS = {
"neutral",
"calm",
"happy",
"sad",
"angry",
"fear",
"disgust",
"ps", # pleasant surprised
"boredom"
}

def get_label(audio_config):
    """Returns a label describing which features are to be extracted,
    e.g.:
        audio_config = {'mfcc': True, 'chroma': True, 'contrast': False, 'tonnetz': False, 'mel': False}
        get_label(audio_config) -> 'mfcc-chroma'
    """
features = ["mfcc", "chroma", "mel", "contrast", "tonnetz"]
label = ""
for feature in features:
if audio_config[feature]:
label += f"{feature}-"
return label.rstrip("-")

def get_dropout_str(dropout, n_layers=3):
    """Builds a string representation of the dropout rates, one per layer."""
    if isinstance(dropout, list):
        return "_".join(str(d) for d in dropout)
    elif isinstance(dropout, float):
        return "_".join(str(dropout) for _ in range(n_layers))

def get_first_letters(emotions):
    """Returns the sorted, uppercased first letters of the emotion names."""
    return "".join(sorted([e[0].upper() for e in emotions]))

def extract_feature(file_name, **kwargs):
    """
    Extract features from `file_name`, which may be either a path to an
    audio file or a `(sample_rate, data)` tuple (e.g. as produced by a
    Gradio audio input).
    Features supported:
        - MFCC (mfcc)
        - Chroma (chroma)
        - MEL Spectrogram Frequency (mel)
        - Contrast (contrast)
        - Tonnetz (tonnetz)
    e.g.:
    `features = extract_feature(path, mel=True, mfcc=True)`
    """
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    contrast = kwargs.get("contrast")
    tonnetz = kwargs.get("tonnetz")
    if isinstance(file_name, str):
        # path input: make sure soundfile can read the file; otherwise
        # convert it to a 16000 Hz mono WAV using ffmpeg first
        try:
            with soundfile.SoundFile(file_name):
                pass
        except RuntimeError:
            basename = os.path.basename(file_name)
            dirname = os.path.dirname(file_name)
            name, _ = os.path.splitext(basename)
            new_basename = f"{name}_c.wav"
            new_filename = os.path.join(dirname, new_basename)
            v = convert_audio(file_name, new_filename)
            if v:
                raise NotImplementedError("Converting the audio file failed, make sure `ffmpeg` is installed on your machine and added to PATH.")
        else:
            new_filename = file_name
        with soundfile.SoundFile(new_filename) as sound_file:
            X = sound_file.read(dtype="float32")
            sample_rate = sound_file.samplerate
    else:
        # tuple input: unpack the (sample_rate, data) pair
        sample_rate, X = file_name
        X = X.astype("float32")
if chroma or contrast:
stft = np.abs(librosa.stft(X))
result = np.array([])
if mfcc:
mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
result = np.hstack((result, mfccs))
    if chroma:
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma))
    if mel:
        mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel))
    if contrast:
        contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, contrast))
    if tonnetz:
        tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
        result = np.hstack((result, tonnetz))
return result
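# Usage sketch (illustrative only; the 440 Hz tone below stands in for real
# audio passed as a Gradio-style (sample_rate, data) tuple):
#   sr = 16000
#   tone = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype("float32")
#   features = extract_feature((sr, tone), mfcc=True, chroma=True)
#   features.shape -> (52,)  # 40 MFCC coefficients + 12 chroma bins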

def get_best_estimators(classification):
    """
    Loads the estimators pickled in the `grid` folder.
    Note that if you want to use different or more estimators,
    you can fine-tune the parameters in the `grid_search.py` script
    and run it again (this may take hours).
    """
    filename = "grid/best_classifiers.pickle" if classification else "grid/best_regressors.pickle"
    with open(filename, "rb") as f:
        return pickle.load(f)

def get_audio_config(features_list):
    """
    Converts a list of features into a dictionary understood by the
    `data_extractor.AudioExtractor` class.
    """
audio_config = {'mfcc': False, 'chroma': False, 'mel': False, 'contrast': False, 'tonnetz': False}
for feature in features_list:
if feature not in audio_config:
raise TypeError(f"Feature passed: {feature} is not recognized.")
audio_config[feature] = True
return audio_config
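
if __name__ == "__main__":
    # Minimal smoke test (illustrative only): round-trip a feature list
    # through get_audio_config() and get_label().
    config = get_audio_config(["mfcc", "chroma"])
    print(config)             # {'mfcc': True, 'chroma': True, 'mel': False, ...}
    print(get_label(config))  # mfcc-chroma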