import gradio as gr
import librosa
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
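
# Pipeline overview:
#   1. extract_features - load the audio and compute MFCC frames.
#   2. prepare_data     - discretize each MFCC frame into a token and build
#                         (sequence, next-token) training pairs.
#   3. create_model     - Embedding -> LSTM -> softmax next-token predictor.
#   4. generate_rap     - autoregressively extend a seed sequence, then decode
#                         the tokens back into MFCC frames.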


def extract_features(file_path):
    try:
        audio, sr = librosa.load(file_path, sr=None)  # Load audio, keep the original sampling rate
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
        return mfccs.T  # Transpose to have (time_steps, features)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def create_model(vocab_size):
    model = Sequential()
    # Embed each discrete MFCC-frame token into a dense 16-dimensional vector.
    # (The Embedding layer expects integer token ids, so the data pipeline
    # must feed it token sequences, not raw float MFCC arrays.)
    model.add(Embedding(input_dim=vocab_size, output_dim=16))
    model.add(LSTM(64))  # Single LSTM layer with 64 units.
    model.add(Dense(vocab_size, activation='softmax'))  # Next-token distribution.
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss=CategoricalCrossentropy(),
        metrics=['accuracy'],
    )
    return model
    
def prepare_data(mfccs_list, seq_length=10):
    all_mfccs = np.concatenate(mfccs_list, axis=0)  # (n_frames, n_mfcc)
    # Discretize: round each frame's coefficients and join them into one
    # string per frame, so LabelEncoder (which only accepts 1-D input) can
    # map every distinct frame to an integer token id.
    frame_strings = np.array([' '.join(f'{v:.1f}' for v in frame) for frame in all_mfccs])
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(frame_strings)
    vocab_size = len(label_encoder.classes_)
    # Build (input sequence, next token) pairs with a sliding window.
    dataX, dataY = [], []
    for i in range(len(integer_encoded) - seq_length):
        dataX.append(integer_encoded[i:i + seq_length])
        dataY.append(integer_encoded[i + seq_length])
    dataX = np.array(dataX)  # (n_patterns, seq_length) integer token ids
    dataY = to_categorical(np.array(dataY), num_classes=vocab_size)
    return dataX, dataY, vocab_size, label_encoder

def train_model(model, dataX, dataY):
    # A short training run; 10 epochs is plenty for a single-song demo.
    model.fit(dataX, dataY, epochs=10, batch_size=64, verbose=0)

def generate_rap(model, start_seq, label_encoder, seq_length, num_frames=50):
    generated = list(start_seq)  # running list of token ids
    for _ in range(num_frames):
        # Feed the most recent seq_length tokens back into the model.
        x_input = np.array(generated[-seq_length:]).reshape(1, seq_length)
        predicted_probabilities = model.predict(x_input, verbose=0)[0]
        # Greedy decoding: always pick the most likely next token.
        generated.append(int(np.argmax(predicted_probabilities)))
    # Decode the token ids back into MFCC frames: each token string is a
    # space-separated list of coefficients, so split + float recovers them.
    decoded = np.array(
        [label_encoder.classes_[t].split() for t in generated],
        dtype=float,
    )
    return decoded


# Train the model on one uploaded track and generate a new MFCC sequence.
def train_and_generate(file_path):
    # Validate the upload before doing any work.
    if not file_path or not file_path.lower().endswith(('.mp3', '.wav')):
        return "Invalid file type; please upload an MP3 or WAV file."

    # Extract features and prepare the training data.
    features = extract_features(file_path)
    if features is None:
        return "Error extracting audio features; check the input file."
    dataX, dataY, vocab_size, label_encoder = prepare_data([features])
    if len(dataX) == 0:
        return "Audio is too short to build training sequences."
    seq_length = dataX.shape[1]
    # Create and train the model.
    model = create_model(vocab_size)
    train_model(model, dataX, dataY)
    # Generate from a randomly chosen training sequence as the seed.
    rand_index = np.random.randint(0, len(dataX))
    start_seq = dataX[rand_index]
    generated_mfcc_sequence = generate_rap(model, start_seq, label_encoder, seq_length)
    # The Textbox output expects a string, so render the array as text.
    return np.array2string(generated_mfcc_sequence, precision=1)

# Gradio Interface
iface = gr.Interface(
    fn=train_and_generate,
    # Gradio 4 renamed Audio's `source` argument to `sources` (a list).
    inputs=gr.Audio(sources=["upload"], type="filepath", label="Upload MP3 or WAV File"),
    outputs=gr.Textbox(label="Generated MFCC Sequence"),
    title="AI Rapper",
    description="Upload a rap track to train the model and generate a new sequence of MFCC frames",
)

if __name__ == "__main__":
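    # Gradio serves the app locally, at http://127.0.0.1:7860 by default.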
    iface.launch()