# emotion_recognizer/data_extractor.py
# Uploaded by chaitanya9 ("Upload data_extractor.py", commit 27bf1d6)
import numpy as np
import pandas as pd
import pickle
import tqdm
import os
from utils import get_label, extract_feature, get_first_letters
from collections import defaultdict
class AudioExtractor:
    """Featurize audio clips and hold the resulting train/test sets.

    After loading, the data is exposed through the attributes
    `{train,test}_audio_paths`, `{train,test}_emotions` and
    `{train,test}_features`. These attributes only exist once the
    corresponding partition has been loaded at least once.
    """

    def __init__(self, audio_config=None, verbose=1, features_folder_name="features", classification=True,
                 emotions=None, balance=True):
        """
        Params:
            audio_config (dict): the dictionary that indicates what features to extract from the audio file,
                default is {'mfcc': True, 'chroma': True, 'mel': True, 'contrast': False, 'tonnetz': False}
                (i.e mfcc, chroma and mel)
            verbose (bool/int): verbosity level, 0 for silence, 1 for info, default is 1
            features_folder_name (str): the folder to store output features extracted, default is "features".
            classification (bool): whether it is a classification or regression, default is True (i.e classification)
            emotions (list): list of emotions to be extracted, default (when None) is ['sad', 'neutral', 'happy']
            balance (bool): whether to balance dataset (both training and testing), default is True
        """
        self.audio_config = audio_config if audio_config else {'mfcc': True, 'chroma': True, 'mel': True, 'contrast': False, 'tonnetz': False}
        self.verbose = verbose
        self.features_folder_name = features_folder_name
        self.classification = classification
        # A fresh list per instance: a list literal in the signature would be
        # shared (and mutable) across every instance relying on the default.
        self.emotions = ['sad', 'neutral', 'happy'] if emotions is None else emotions
        self.balance = balance
        # input dimension (length of one feature vector), filled in on first load
        self.input_dimension = None

    def _load_data(self, desc_files, partition, shuffle):
        """Load `partition` from `desc_files`, then optionally balance and shuffle it.

        Raises:
            TypeError: if `partition` is neither "train" nor "test".
        """
        self.load_metadata_from_desc_file(desc_files, partition)
        # balancing the datasets ( both training or testing )
        if self.balance:
            if partition == "train":
                self.balance_training_data()
            elif partition == "test":
                self.balance_testing_data()
            else:
                raise TypeError("Invalid partition, must be either train/test")
        if shuffle:
            self.shuffle_data_by_partition(partition)

    def load_train_data(self, desc_files=None, shuffle=False):
        """Loads training data from the metadata files `desc_files`
        (default, when None, is ["train_speech.csv"])"""
        if desc_files is None:
            desc_files = ["train_speech.csv"]
        self._load_data(desc_files, "train", shuffle)

    def load_test_data(self, desc_files=None, shuffle=False):
        """Loads testing data from the metadata files `desc_files`
        (default, when None, is ["test_speech.csv"])"""
        if desc_files is None:
            desc_files = ["test_speech.csv"]
        self._load_data(desc_files, "test", shuffle)

    def shuffle_data_by_partition(self, partition):
        """Shuffle paths, emotions and features of `partition` with one shared permutation.

        Delegates to the module-level `shuffle_data` function.

        Raises:
            TypeError: if `partition` is neither "train" nor "test".
        """
        if partition == "train":
            self.train_audio_paths, self.train_emotions, self.train_features = shuffle_data(
                self.train_audio_paths, self.train_emotions, self.train_features)
        elif partition == "test":
            self.test_audio_paths, self.test_emotions, self.test_features = shuffle_data(
                self.test_audio_paths, self.test_emotions, self.test_features)
        else:
            raise TypeError("Invalid partition, must be either train/test")

    def load_metadata_from_desc_file(self, desc_files, partition):
        """Read metadata from a CSV file & Extract and loads features of audio files

        The CSV files are read for their 'path' and 'emotion' columns. Features are
        cached as a .npy file inside `features_folder_name`; the cache file name is
        built from the partition, the feature label, the first letters of the
        emotions and the number of samples, and an existing cache file is loaded
        instead of re-extracting.

        Params:
            desc_files (list): list of description files (csv files) to read from
            partition (str): whether is "train" or "test"
        Raises:
            TypeError: if `partition` is invalid, or if regression is requested with
                a number of emotions other than 3 or 5.
        """
        # fail fast: do not extract (and cache) features for a partition we cannot store
        if partition not in ("train", "test"):
            raise TypeError("Invalid partition, must be either train/test")
        # read every description file, then concatenate once
        frames = [pd.read_csv(desc_file) for desc_file in desc_files]
        if frames:
            df = pd.concat(frames, sort=False)
        else:
            # no description files: empty dataframe with the expected columns
            df = pd.DataFrame({'path': [], 'emotion': []})
        if self.verbose:
            print("[*] Loading audio file paths and its corresponding labels...")
        # get columns
        audio_paths, emotions = list(df['path']), list(df['emotion'])
        # if not classification, convert emotions to numbers
        if not self.classification:
            # so naive and need to be implemented
            # in a better way
            # NOTE: the mapping is picked from the *number* of emotions only
            if len(self.emotions) == 3:
                self.categories = {'sad': 1, 'neutral': 2, 'happy': 3}
            elif len(self.emotions) == 5:
                self.categories = {'angry': 1, 'sad': 2, 'neutral': 3, 'ps': 4, 'happy': 5}
            else:
                raise TypeError("Regression is only for either ['sad', 'neutral', 'happy'] or ['angry', 'sad', 'neutral', 'ps', 'happy']")
            emotions = [self.categories[e] for e in emotions]
        # make features folder if does not exist (also creates missing parent folders)
        os.makedirs(self.features_folder_name, exist_ok=True)
        # get label for features
        label = get_label(self.audio_config)
        # construct features file name
        n_samples = len(audio_paths)
        first_letters = get_first_letters(self.emotions)
        name = os.path.join(self.features_folder_name, f"{partition}_{label}_{first_letters}_{n_samples}.npy")
        if os.path.isfile(name):
            # if file already exists, just load then
            if self.verbose:
                print("[+] Feature file already exists, loading...")
            features = np.load(name)
        else:
            # file does not exist, extract those features and dump them into the file
            features = np.array([
                extract_feature(audio_file, **self.audio_config)
                for audio_file in tqdm.tqdm(audio_paths, f"Extracting features for {partition}")
            ])
            # save it
            np.save(name, features)
        # Record the per-sample feature length on both the extraction and the
        # cache path, so a cache hit no longer leaves input_dimension unset.
        if self.input_dimension is None and features.ndim >= 2:
            self.input_dimension = features.shape[1]
        self._store_partition_data(partition, audio_paths, emotions, features)

    def _store_partition_data(self, partition, audio_paths, emotions, features):
        """Set the `{partition}_*` attributes, or append to them if they already exist."""
        paths_attr = f"{partition}_audio_paths"
        emotions_attr = f"{partition}_emotions"
        features_attr = f"{partition}_features"
        if not hasattr(self, paths_attr):
            # first load of this partition
            setattr(self, paths_attr, audio_paths)
            setattr(self, emotions_attr, emotions)
            setattr(self, features_attr, features)
            return
        if self.verbose:
            # renders as "training" / "testing"
            print(f"[*] Adding additional {partition}ing samples")
        stored_paths = getattr(self, paths_attr)
        stored_paths += audio_paths
        setattr(self, paths_attr, stored_paths)
        stored_emotions = getattr(self, emotions_attr)
        stored_emotions += emotions
        setattr(self, emotions_attr, stored_emotions)
        setattr(self, features_attr, np.vstack((getattr(self, features_attr), features)))

    def _balance_data(self, partition):
        """Down-sample `partition` so every target label has the same number of samples.

        The target labels are `self.emotions` (classification) or the values of
        `self.categories` (regression). For each label the first `minimum` samples
        in their current order are kept, where `minimum` is the size of the
        smallest class. The result is grouped by label, in order of first
        appearance, and stored back as plain lists.

        If some target label has no sample at all, nothing is changed and
        `self.balance` is set to False.

        Raises:
            TypeError: if `partition` is neither "train" nor "test".
            KeyError: if a sample carries a label that is not a target label.
        """
        if partition not in ("train", "test"):
            raise TypeError("Invalid partition, must be either train/test")
        emotions = getattr(self, f"{partition}_emotions")
        features = getattr(self, f"{partition}_features")
        audio_paths = getattr(self, f"{partition}_audio_paths")
        if self.classification:
            labels = list(self.emotions)
        else:
            # regression, take actual numbers, not label emotion
            labels = list(self.categories.values())
        # count the samples of every target label in a single pass
        count = dict.fromkeys(labels, 0)
        for emotion in emotions:
            if emotion in count:
                count[emotion] += 1
        # get the minimum data samples to balance to
        minimum = min(count.values())
        if minimum == 0:
            # won't balance, otherwise 0 samples will be loaded
            print("[!] One class has 0 samples, setting balance to False")
            self.balance = False
            return
        if self.verbose:
            print("[*] Balancing the dataset to the minimum value:", minimum)
        kept = defaultdict(list)
        counter = dict.fromkeys(labels, 0)
        for emotion, feature, audio_path in zip(emotions, features, audio_paths):
            if counter[emotion] >= minimum:
                # minimum value exceeded
                continue
            counter[emotion] += 1
            kept[emotion].append((feature, audio_path))
        balanced_emotions, balanced_features, balanced_paths = [], [], []
        for emotion, features_audio_paths in kept.items():
            for feature, audio_path in features_audio_paths:
                balanced_emotions.append(emotion)
                balanced_features.append(feature)
                balanced_paths.append(audio_path)
        setattr(self, f"{partition}_emotions", balanced_emotions)
        setattr(self, f"{partition}_features", balanced_features)
        setattr(self, f"{partition}_audio_paths", balanced_paths)

    def balance_training_data(self):
        """Balance the training partition (see `_balance_data`)."""
        self._balance_data("train")

    def balance_testing_data(self):
        """Balance the testing partition (see `_balance_data`)."""
        self._balance_data("test")
def shuffle_data(audio_paths, emotions, features):
    """Reorder three parallel sequences with one shared random permutation.

    A single permutation of `len(audio_paths)` indices is drawn from
    `np.random` and applied to all three inputs, so entries that were
    aligned before the call stay aligned afterwards.

    Params:
        audio_paths (list): Paths to audio clips
        emotions (list): Emotions in each audio clip
        features (list): features audio clips
    Returns:
        tuple: (audio_paths, emotions, features) as three new lists
    """
    order = np.random.permutation(len(audio_paths))
    shuffled_paths = []
    for idx in order:
        shuffled_paths.append(audio_paths[idx])
    shuffled_emotions = []
    for idx in order:
        shuffled_emotions.append(emotions[idx])
    shuffled_features = []
    for idx in order:
        shuffled_features.append(features[idx])
    return shuffled_paths, shuffled_emotions, shuffled_features
def load_data(train_desc_files, test_desc_files, audio_config=None, classification=True, shuffle=True,
              balance=True, emotions=None):
    """Load the training and testing sets through a silent `AudioExtractor`.

    Params:
        train_desc_files (list): description (csv) files of the training set
        test_desc_files (list): description (csv) files of the testing set
        audio_config (dict): forwarded to `AudioExtractor`
        classification (bool): forwarded to `AudioExtractor`
        shuffle (bool): whether to shuffle each partition after loading, default is True
        balance (bool): forwarded to `AudioExtractor`
        emotions (list): emotions to load, default (when None) is ['sad', 'neutral', 'happy']
    Returns:
        dict: with keys "X_train", "X_test", "y_train", "y_test" (numpy arrays),
            "train_audio_paths", "test_audio_paths", and "balance" (the extractor's
            `balance` attribute as it stands after loading).
    """
    # Resolve the default here instead of using a list literal in the
    # signature, which would be one shared mutable object across calls.
    if emotions is None:
        emotions = ['sad', 'neutral', 'happy']
    # instantiate the class
    audiogen = AudioExtractor(audio_config=audio_config, classification=classification, emotions=emotions,
                              balance=balance, verbose=0)
    # Loads training data
    audiogen.load_train_data(train_desc_files, shuffle=shuffle)
    # Loads testing data
    audiogen.load_test_data(test_desc_files, shuffle=shuffle)
    # X_train, X_test, y_train, y_test
    return {
        "X_train": np.array(audiogen.train_features),
        "X_test": np.array(audiogen.test_features),
        "y_train": np.array(audiogen.train_emotions),
        "y_test": np.array(audiogen.test_emotions),
        "train_audio_paths": audiogen.train_audio_paths,
        "test_audio_paths": audiogen.test_audio_paths,
        "balance": audiogen.balance,
    }