# Hugging Face Spaces page header captured with this file (Space status: Sleeping); not part of the program.
import gradio as gr | |
import os | |
import numpy as np | |
import librosa | |
import tensorflow as tf | |
from tensorflow import keras | |
from tensorflow.keras import layers | |
from transformers import AutoFeatureExtractor | |
from sklearnex import patch_sklearn, unpatch_sklearn | |
patch_sklearn() | |
import xgboost as xgb | |
# Clips are cut to at most this many seconds (see MAX_SEQ_LENGTH below).
MAX_DURATION = 2
# Sampling rate is the number of samples of audio recorded every second.
SAMPLING_RATE = 16000
BATCH_SIZE = 2  # Batch-size for training and evaluating our model.
# Class names the classifier can return; a label's position is its integer id.
RAVDESS_CLASS_LABELS = (
    "angry",
    "calm",
    "disgust",
    "fear",
    "happy",
    "neutral",
    "sad",
    "surprise",
)
# Number of classes our dataset has. Derived from the label tuple so the count
# and the labels cannot drift apart (8 labels -> 8 classes).
NUM_CLASSES = len(RAVDESS_CLASS_LABELS)
HIDDEN_DIM = 768  # Dimension of our model output (768 in case of Wav2Vec 2.0 - Base).
# Maximum length of the input audio, expressed in samples.
MAX_SEQ_LENGTH = MAX_DURATION * SAMPLING_RATE
# Wav2Vec 2.0 results in an output frequency with a stride of about 20ms.
# NOTE(review): 99 is presumably the frame count produced for MAX_SEQ_LENGTH
# samples -- confirm against the encoder if MAX_DURATION changes.
MAX_FRAMES = 99
MAX_EPOCHS = 5  # Maximum number of training epochs.
MODEL_CHECKPOINT = "facebook/wav2vec2-base"
labels = RAVDESS_CLASS_LABELS
# Lookup tables between label names and stringified ids; they start empty and
# are filled by the enumerate loop over `labels` later in this file.
label2id, id2label = {}, {}
from transformers import TFWav2Vec2Model | |
def mean_pool(hidden_states, feature_lengths):
    """Average encoder frames over the time axis, ignoring padded frames.

    Args:
        hidden_states: Encoder output, indexed here as
            ``(batch, frames, hidden)``.
        feature_lengths: Number of valid (non-padded) frames per example,
            one entry per batch row.

    Returns:
        A ``(batch, hidden)`` tensor holding, for each example, the mean of
        its valid frames.
    """
    # Read the frame count from the tensor itself instead of relying on the
    # MAX_FRAMES / BATCH_SIZE constants, so any batch size and any clip length
    # pools correctly (a hard-coded target shape fails for other frame counts
    # and duplicates rows when the batch is smaller than BATCH_SIZE).
    num_frames = tf.shape(hidden_states)[1]
    attention_mask = tf.sequence_mask(
        feature_lengths, maxlen=num_frames, dtype=tf.dtypes.int64
    )
    # The reversed cumulative sum is non-zero at every position up to and
    # including the last valid frame, which yields a boolean "keep" mask.
    padding_mask = tf.cast(
        tf.reverse(tf.cumsum(tf.reverse(attention_mask, [-1]), -1), [-1]),
        dtype=tf.dtypes.bool,
    )
    # Zero out padded frames; the (batch, frames, 1) mask broadcasts across
    # the hidden axis.
    hidden_states = tf.where(
        tf.expand_dims(padding_mask, -1),
        hidden_states,
        tf.zeros_like(hidden_states),
    )
    valid_frame_counts = tf.math.reduce_sum(
        tf.cast(padding_mask, dtype=tf.dtypes.float32), axis=1
    )
    pooled_state = tf.math.reduce_sum(hidden_states, axis=1) / tf.reshape(
        valid_frame_counts, [-1, 1]
    )
    return pooled_state
class TFWav2Vec2ForAudioClassification(keras.Model):
    """Wav2Vec 2.0 encoder followed by pooling, dropout and flattening.

    Despite the name, no classification head is attached: ``call`` returns
    the pooled encoder representation.
    """

    def __init__(self, model_checkpoint):
        """Load the pretrained encoder and create the pooling/dropout layers.

        Args:
            model_checkpoint: Checkpoint identifier handed to
                ``TFWav2Vec2Model.from_pretrained``.
        """
        super().__init__()
        # Bare Wav2Vec 2.0 encoder (no classification head), converted from
        # the PyTorch weights, with SpecAugment switched off.
        self.wav2vec2 = TFWav2Vec2Model.from_pretrained(
            model_checkpoint, apply_spec_augment=False, from_pt=True
        )
        self.pooling = layers.GlobalAveragePooling1D()
        self.flat = layers.Flatten()
        self.intermediate_layer_dropout = layers.Dropout(0.5)

    def call(self, inputs):
        """Encode audio and pool the frame-level states into one vector.

        Args:
            inputs: Indexable pair; ``inputs[0]`` is fed to the encoder and
                ``inputs[1]`` is treated as the attention mask when it is a
                tensor.

        Returns:
            The pooled state after dropout and flattening.
        """
        encoder_output = self.wav2vec2(inputs[0])[0]
        mask = inputs[1]
        if not tf.is_tensor(mask):
            # No usable mask: plain average over every frame.
            pooled = self.pooling(encoder_output)
        else:
            # The last cumulative-sum entry is the count of unmasked samples.
            sample_counts = tf.cumsum(mask, -1)[:, -1]
            # Translate sample counts into encoder frame counts.
            frame_counts = self.wav2vec2.wav2vec2._get_feat_extract_output_lengths(
                sample_counts
            )
            pooled = mean_pool(encoder_output, frame_counts)
        dropped = self.intermediate_layer_dropout(pooled)
        return self.flat(dropped)
# Embedding network shared by every request; built once when the script starts.
wav2vec2_model = TFWav2Vec2ForAudioClassification(MODEL_CHECKPOINT)

# Fill the label <-> id lookup tables; ids are the stringified label positions.
for i, label in enumerate(labels):
    class_id = str(i)
    label2id[label] = class_id
    id2label[class_id] = label

# Preprocessor for the same checkpoint, configured to also return the
# attention mask alongside the input values.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    MODEL_CHECKPOINT,
    return_attention_mask=True,
)
# Constructor settings for the XGBoost classifier.
# NOTE(review): the objective is 'binary:logistic' although eight labels are
# defined; presumably load_model() below restores the trained configuration
# from the file -- confirm.
xgb_params = dict(
    objective='binary:logistic',
    predictor='cpu_predictor',
    disable_default_eval_metric='true',
)
model_xgb = xgb.XGBClassifier(**xgb_params)
# Load the trained model from 'xgb.json' (a relative path, so it is resolved
# against the current working directory).
model_xgb.load_model('xgb.json')
def greet(name):
    """Predict the emotion label for one audio clip.

    Args:
        name: Value delivered by the ``"audio"`` input. It is indexed as
            ``name[0]`` (sample rate) and ``name[1]`` (samples); this assumes
            Gradio's ``(sample_rate, data)`` numpy format -- confirm if the
            input component is reconfigured.

    Returns:
        The label string looked up in ``id2label`` for the predicted class.

    Raises:
        ValueError: If no audio was supplied or the clip contains no samples.
    """
    if name is None:
        raise ValueError("No audio provided.")
    sample_rate, samples = name[0], name[1]

    waveform = np.asarray(samples)
    if waveform.size == 0:
        raise ValueError("The audio clip is empty.")

    # librosa.resample needs floating-point input; integer PCM is scaled
    # into [-1, 1] on the way.
    if np.issubdtype(waveform.dtype, np.integer):
        full_scale = np.iinfo(waveform.dtype).max
        waveform = waveform.astype(np.float32) / full_scale
    else:
        waveform = waveform.astype(np.float32)

    # Down-mix multi-channel audio to mono.
    # Assumes a (samples, channels) layout -- TODO confirm.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # Bring the clip to the rate the feature extractor is told it has;
    # otherwise audio recorded at another rate is misinterpreted.
    target_rate = feature_extractor.sampling_rate
    if sample_rate != target_rate:
        waveform = librosa.resample(
            waveform, orig_sr=sample_rate, target_sr=target_rate
        )

    # Pad/truncate to exactly MAX_SEQ_LENGTH samples so the encoder always
    # sees the same input length, whatever the clip duration.
    features = feature_extractor(
        waveform,
        sampling_rate=target_rate,
        max_length=MAX_SEQ_LENGTH,
        truncation=True,
        padding="max_length",
        return_tensors="np",
    )
    # Select the arrays by key rather than by dict order, and keep them as
    # separate arrays so the attention mask is not upcast to float.
    embedding = wav2vec2_model.predict(
        [features["input_values"], features["attention_mask"]]
    )
    pred = model_xgb.predict(embedding)
    # int() normalises numpy integer/float class ids before the string lookup.
    return id2label[str(int(pred[0]))]
# Single-function UI: an audio input is passed to `greet`, whose returned
# string is shown as text.
iface = gr.Interface(
    fn=greet,
    inputs="audio",
    outputs="text",
)
# Start serving the interface.
iface.launch()