# AI-rap-learner / app.py
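"""Gradio app that trains a small LSTM on the discretised MFCC frames of an
uploaded rap track, then autoregressively generates a new MFCC sequence."""
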
import gradio as gr
import librosa
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

def extract_features(file_path):
    try:
        audio, sr = librosa.load(file_path, sr=None)  # Load audio, keep the original sampling rate
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
        return mfccs.T  # Transpose to have (time_steps, features)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def create_model(vocab_size):
    model = Sequential()
    # Embed each integer frame token into a dense 16-dimensional space;
    # the (batch, seq_length) input shape is inferred on the first fit() call
    model.add(Embedding(input_dim=vocab_size, output_dim=16))
    model.add(LSTM(64))  # Simple LSTM with 64 units
    model.add(Dense(vocab_size, activation='softmax'))  # Output layer
    optimizer = Adam(learning_rate=0.001)  # Adam optimizer
    loss_function = CategoricalCrossentropy()  # Categorical crossentropy loss
    model.compile(optimizer=optimizer, loss=loss_function, metrics=['accuracy'])
    return model

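# Shape check (assuming a batch of token sequences shaped (batch, seq_length)):
#   Embedding -> (batch, seq_length, 16)
#   LSTM      -> (batch, 64), only the final hidden state is returned
#   Dense     -> (batch, vocab_size), a softmax over the frame vocabulary
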
def prepare_data(mfccs_list, seq_length=10):
    all_mfccs = np.concatenate(mfccs_list, axis=0)  # (time_steps, n_mfcc)
    # Round each MFCC frame and join it into a single string, so that
    # LabelEncoder receives the 1-D array it expects; each distinct frame
    # string becomes one token in the vocabulary
    frame_tokens = np.array([' '.join(f'{v:.1f}' for v in frame) for frame in all_mfccs])
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(frame_tokens)
    vocab_size = len(label_encoder.classes_)
    # Build sliding-window sequences: seq_length tokens in, one token out
    dataX, dataY = [], []
    for i in range(len(integer_encoded) - seq_length):
        dataX.append(integer_encoded[i:i + seq_length])
        dataY.append(integer_encoded[i + seq_length])
    # Integer token matrix of shape (n_patterns, seq_length) for the Embedding layer
    dataX = np.array(dataX)
    # One-hot targets of shape (n_patterns, vocab_size)
    dataY = to_categorical(np.array(dataY), num_classes=vocab_size)
    return dataX, dataY, vocab_size, label_encoder

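# Worked example of the shapes (hypothetical 100-frame clip, seq_length=10):
#   all_mfccs (100, 13) -> frame_tokens (100,) -> dataX (90, 10), dataY (90, vocab_size)
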
def train_model(model, dataX, dataY):
    model.fit(dataX, dataY, epochs=10, batch_size=64, verbose=0)

def generate_rap(model, start_seq, label_encoder, seq_length, num_frames=50):
    generated = list(start_seq)
    for _ in range(num_frames):
        # Feed the most recent seq_length tokens as a (1, seq_length) batch
        x_input = np.array(generated[-seq_length:]).reshape(1, seq_length)
        # Predict the next token and append the most likely one
        predicted_probabilities = model.predict(x_input, verbose=0)[0]
        generated.append(int(np.argmax(predicted_probabilities)))
    # Decode each token back into its (rounded) MFCC frame
    frames = [np.array(label_encoder.classes_[t].split(), dtype=float) for t in generated]
    return np.array(frames)

# Train the model on an uploaded file and generate a new sequence
def train_and_generate(file_path):
    # Check the file extension
    if not file_path.lower().endswith(('.mp3', '.wav')):
        return "Invalid file type"
    # Extract features and prepare data
    features = extract_features(file_path)
    if features is None:
        return "Error extracting audio features, check input"
    dataX, dataY, vocab_size, label_encoder = prepare_data([features])
    if len(dataX) == 0:
        return "Audio too short to build training sequences"
    seq_length = dataX.shape[1]
    # Create and train the model
    model = create_model(vocab_size)
    train_model(model, dataX, dataY)
    # Generate from a randomly chosen seed sequence
    start_seq = dataX[np.random.randint(0, len(dataX))]
    generated_mfcc_sequence = generate_rap(model, start_seq, label_encoder, seq_length)
    # The Textbox output shows the raw generated MFCC frame values
    return str(generated_mfcc_sequence)

# Gradio interface
# (note: Gradio 4.x renames gr.Audio's 'source' argument to sources=["upload"])
iface = gr.Interface(
    fn=train_and_generate,
    inputs=gr.Audio(source="upload", type="filepath", label="Upload MP3 or WAV File"),
    outputs=gr.Textbox(label="Generated Rap"),
    title="AI Rapper",
    description="Upload a rap song to train the model and generate a new rap verse",
)

if __name__ == "__main__":
    iface.launch()
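
# Possible extension (a sketch, not wired into the app): the generated frames
# could be synthesised back into a waveform with librosa's inverse transform,
#   audio = librosa.feature.inverse.mfcc_to_audio(generated_mfcc_sequence.T)
# where the transpose restores librosa's (n_mfcc, time_steps) layout and the
# original sample rate would need to be threaded through from extract_features.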