# Hugging Face Space: "AI Rapper" Gradio app.
# (The "Spaces: Build error" banner above this file was page-scrape residue,
# not source code; see the gr.Audio keyword fix below for the likely cause.)
import gradio as gr
import librosa
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import os
import json
def extract_features(file_path):
    """Load an audio file at its native sampling rate and return MFCCs.

    Returns an array shaped (time_steps, n_mfcc) with n_mfcc=13, or None
    when loading/feature extraction fails (best-effort: errors are printed,
    not raised, so the Gradio callback can report a friendly message).
    """
    try:
        signal, sample_rate = librosa.load(file_path, sr=None)  # sr=None keeps the original rate
        coeffs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None
    # Transpose so time is the leading axis: (time_steps, features).
    return coeffs.T
def create_model(input_shape, vocab_size):
    """Build and compile a small Embedding -> LSTM -> softmax token model.

    Args:
        input_shape: sequence length, used as the Embedding's input_length.
        vocab_size: number of distinct tokens (embedding rows and output units).
    Returns:
        A compiled Keras Sequential model (Adam lr=0.001, categorical
        crossentropy, accuracy metric).
    """
    # NOTE(review): the caller feeds 3-D integer batches
    # (samples, seq_length, n_mfcc), but Embedding expects 2-D (samples,
    # seq_length) integer input — confirm the data pipeline before shipping.
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=16, input_length=input_shape),
        LSTM(64),
        Dense(vocab_size, activation='softmax'),
    ])
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss=CategoricalCrossentropy(),
        metrics=['accuracy'],
    )
    return model
def prepare_data(mfccs_list):
    """Tokenize MFCC frames and build fixed-length training sequences.

    Args:
        mfccs_list: list of (time_steps, n_mfcc) arrays from extract_features.
    Returns:
        (dataX, dataY, vocab_size, label_encoder) — dataX is reshaped to
        (n_patterns, seq_length, n_mfcc); dataY is the one-hot encoding of
        the frame following each sequence.
    """
    all_mfccs = np.concatenate(mfccs_list, axis=0)
    label_encoder = LabelEncoder()
    # NOTE(review): fit_transform is handed a 2-D string array here, but
    # sklearn's LabelEncoder documents 1-D input only — confirm this runs
    # (and does what is intended) on the pinned sklearn version.
    integer_encoded = label_encoder.fit_transform(all_mfccs.reshape(-1, all_mfccs.shape[-1]).astype(str))
    integer_encoded = integer_encoded.reshape(all_mfccs.shape[0], all_mfccs.shape[1])
    vocab_size = len(label_encoder.classes_)
    # Creating the sequences: sliding window of 10 frames -> next frame.
    seq_length = 10
    dataX, dataY = [], []
    for i in range(0, len(integer_encoded) - seq_length, 1):
        seq_in = integer_encoded[i:i + seq_length]
        seq_out = integer_encoded[i + seq_length]
        dataX.append(seq_in)
        dataY.append(seq_out)
    n_patterns = len(dataX)
    # Reshape input to be [samples, time steps, features]
    dataX = np.array(dataX)
    dataX = np.reshape(dataX, (n_patterns, seq_length, all_mfccs.shape[-1]))
    dataY = np.array(dataY)
    # NOTE(review): each dataY row is a length-n_mfcc vector, not a scalar
    # class id; to_categorical will one-hot every element, yielding a 3-D
    # target — verify this matches the Dense(vocab_size) output shape.
    dataY = to_categorical(dataY, num_classes=vocab_size)
    return dataX, dataY, vocab_size, label_encoder
def train_model(model, dataX, dataY, epochs=10, batch_size=64):
    """Fit *model* on the prepared sequences (silent: verbose=0).

    Args:
        model: compiled Keras model (anything with a ``fit`` method).
        dataX: input sequences, (n_patterns, seq_length, n_features).
        dataY: one-hot targets.
        epochs: training epochs; default 10 preserves the original behaviour.
        batch_size: mini-batch size; default 64 preserves the original behaviour.
    """
    # Generalized: the epoch/batch constants were hard-coded; exposing them
    # with the old values as defaults keeps every existing caller working.
    model.fit(dataX, dataY, epochs=epochs, batch_size=batch_size, verbose=0)
def generate_rap(model, start_seq, label_encoder, seq_length, vocab_size, num_frames=50):
    """Autoregressively extend *start_seq* by *num_frames* predicted frames.

    Args:
        model: trained Keras model used for next-token prediction.
        start_seq: seed sequence of encoded MFCC frames.
        label_encoder: the LabelEncoder fitted in prepare_data.
        seq_length: unused in this body (kept for call compatibility).
        vocab_size: unused in this body (kept for call compatibility).
        num_frames: number of frames to append.
    Returns:
        The seed plus generated frames as a float array.
    """
    generated_seq = start_seq.copy()
    for _ in range(num_frames):
        # Reshape the input to be [samples, time_steps, features]
        x_input = np.reshape(generated_seq, (1, len(generated_seq), generated_seq[0].shape[0]))
        # Predict the next token
        predicted_probabilities = model.predict(x_input, verbose=0)[0]
        predicted_token = np.argmax(predicted_probabilities)
        # Add the new mfcc
        # NOTE(review): classes_ entries are stringified rows from
        # prepare_data; whitespace .split() may not invert that formatting
        # (brackets, commas) — confirm the round-trip actually parses.
        generated_seq = np.concatenate((generated_seq, [label_encoder.classes_[predicted_token].split()]), axis=0)
        # Re-cast the whole (growing) sequence to float each step.
        generated_seq = generated_seq.astype(float)
    return generated_seq
# Function to train the model on one upload and generate new frames.
def train_and_generate(file_path):
    """Gradio callback: train on the uploaded clip, then generate a sequence.

    Args:
        file_path: path to the uploaded audio file (.mp3 or .wav).
    Returns:
        A string for the gr.Textbox output — either the generated MFCC
        sequence rendered as text, or a human-readable error message.
    """
    # Fail fast on unsupported uploads.
    if not file_path.lower().endswith(('.mp3', '.wav')):
        return "Invalid file type"
    # Extract features and prepare data
    features = extract_features(file_path)
    if features is None:
        return "Error extracting audio features, check input"
    dataX, dataY, vocab_size, label_encoder = prepare_data([features])
    if len(dataX) == 0:
        # Robustness fix: a clip shorter than seq_length + 1 frames yields no
        # patterns, and np.random.randint(0, 0) below would raise ValueError.
        return "Audio too short to build training sequences"
    input_shape = dataX.shape[1]
    # Create and train the model
    model = create_model(input_shape, vocab_size)
    train_model(model, dataX, dataY)
    # Seed generation from a random training pattern.
    # Bug fix: randint's upper bound is already exclusive, so the original
    # len(dataX) - 1 could never select the last pattern.
    rand_index = np.random.randint(0, len(dataX))
    start_seq = dataX[rand_index]
    generated_mfcc_sequence = generate_rap(model, start_seq, label_encoder, input_shape, vocab_size)
    # Bug fix: the output component is a Textbox — return text, not an ndarray.
    return np.array2string(generated_mfcc_sequence)
# Gradio interface wiring.
# Bug fix (likely the Space's build error): Gradio 4.x removed the
# `source=` keyword on gr.Audio in favour of `sources=[...]`; the old
# keyword raises a TypeError when the app is constructed.
iface = gr.Interface(
    fn=train_and_generate,
    inputs=gr.Audio(sources=["upload"], type="filepath", label="Upload MP3 or WAV File"),
    outputs=gr.Textbox(label="Generated Rap"),
    title="AI Rapper",
    description="Upload a Rap song to train the model and generate a new rap verse",
)

if __name__ == "__main__":
    iface.launch()