Spaces:
Build error
Build error
Create main.py
Browse files
main.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import librosa
|
3 |
+
import numpy as np
|
4 |
+
from tensorflow.keras.models import Sequential
|
5 |
+
from tensorflow.keras.layers import LSTM, Dense, Embedding
|
6 |
+
from tensorflow.keras.optimizers import Adam
|
7 |
+
from tensorflow.keras.losses import CategoricalCrossentropy
|
8 |
+
from sklearn.preprocessing import LabelEncoder
|
9 |
+
from tensorflow.keras.utils import to_categorical
|
10 |
+
import os
|
11 |
+
import json
|
12 |
+
|
13 |
+
|
14 |
+
def extract_features(file_path):
    """Load an audio file and return its MFCC feature matrix.

    Returns an array shaped (time_steps, n_mfcc) on success, or None if
    the file could not be read or processed.
    """
    try:
        signal, sample_rate = librosa.load(file_path, sr=None)  # keep the native sampling rate
        coeffs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)
    except Exception as err:
        print(f"Error processing {file_path}: {err}")
        return None
    # librosa returns (n_mfcc, time_steps); transpose so time is axis 0.
    return coeffs.T
|
22 |
+
|
23 |
+
def create_model(input_shape, vocab_size):
    """Build and compile the next-token prediction network.

    Architecture: Embedding -> LSTM(64) -> softmax over the vocabulary,
    compiled with Adam (lr=0.001) and categorical cross-entropy.
    """
    model = Sequential([
        # Embedding lifts integer tokens into a 16-dim learned space.
        Embedding(input_dim=vocab_size, output_dim=16, input_length=input_shape),
        LSTM(64),
        Dense(vocab_size, activation='softmax'),
    ])
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss=CategoricalCrossentropy(),
        metrics=['accuracy'],
    )
    return model
|
33 |
+
|
34 |
+
def prepare_data(mfccs_list, seq_length=10):
    """Tokenize MFCC frames and build (window, next-token) training pairs.

    Each MFCC frame (one row of coefficients) is serialized to a single
    space-joined string and label-encoded into one integer token; sliding
    windows of ``seq_length`` tokens are used to predict the next token.

    Args:
        mfccs_list: list of (time_steps, n_mfcc) arrays from extract_features.
        seq_length: window length of each training example (default 10,
            matching the original hard-coded value).

    Returns:
        dataX: int array (n_patterns, seq_length) of token windows,
            shaped for the Embedding input layer.
        dataY: one-hot array (n_patterns, vocab_size) of next tokens.
        vocab_size: number of distinct frame tokens.
        label_encoder: fitted LabelEncoder; classes_ are the frame strings.
    """
    all_mfccs = np.concatenate(mfccs_list, axis=0)
    # BUG FIX: LabelEncoder requires 1-D input — the original passed a 2-D
    # string array and raised ValueError. Encode one string per frame so each
    # frame becomes exactly one integer token.
    frame_strings = [" ".join(str(v) for v in frame) for frame in all_mfccs]
    label_encoder = LabelEncoder()
    tokens = label_encoder.fit_transform(frame_strings)
    vocab_size = len(label_encoder.classes_)

    # Build sliding windows: seq_length tokens in, the following token out.
    dataX, dataY = [], []
    for i in range(len(tokens) - seq_length):
        dataX.append(tokens[i:i + seq_length])
        dataY.append(tokens[i + seq_length])

    dataX = np.array(dataX, dtype=np.int64)
    dataY = to_categorical(np.array(dataY), num_classes=vocab_size)
    return dataX, dataY, vocab_size, label_encoder
|
55 |
+
|
56 |
+
def train_model(model, dataX, dataY):
    """Fit the model on the prepared sequences (10 epochs, batch 64, silent)."""
    epochs, batch_size = 10, 64
    model.fit(dataX, dataY, epochs=epochs, batch_size=batch_size, verbose=0)
|
58 |
+
|
59 |
+
def generate_rap(model, start_seq, label_encoder, seq_length, vocab_size, num_frames=50):
    """Autoregressively extend a token sequence and decode it to MFCC frames.

    Args:
        model: trained token-prediction model (Embedding/LSTM/softmax).
        start_seq: seed of integer frame tokens (length >= seq_length).
        label_encoder: fitted encoder whose classes_ are space-joined frames.
        seq_length: window length the model was trained on.
        vocab_size: kept for signature compatibility; not needed here.
        num_frames: how many new frames to generate.

    Returns:
        float array (len(start_seq) + num_frames, n_mfcc) of MFCC frames.
    """
    # BUG FIX: the original reshaped with generated_seq[0].shape (fails on
    # scalar tokens), appended a list of strings to a numeric array, and let
    # the input grow past the trained window length.
    tokens = [int(t) for t in np.asarray(start_seq).ravel()]
    for _ in range(num_frames):
        # Feed only the trailing window so the input length matches training.
        window = np.reshape(tokens[-seq_length:], (1, seq_length))
        probabilities = model.predict(window, verbose=0)[0]
        tokens.append(int(np.argmax(probabilities)))

    # Decode each token back into its float MFCC coefficients.
    frames = [
        [float(v) for v in label_encoder.classes_[tok].split()]
        for tok in tokens
    ]
    return np.array(frames, dtype=float)
|
72 |
+
|
73 |
+
|
74 |
+
# Function to train model and get results
|
75 |
+
# Function to train model and get results
def train_and_generate(file_path):
    """Train on one uploaded track and generate a new MFCC sequence.

    Args:
        file_path: path to the uploaded audio file (may be None if the
            user submitted without a file).

    Returns:
        The generated MFCC sequence on success, or an error string.
    """
    # gr.Audio hands us None when no file was provided.
    if not file_path:
        return "No file uploaded"
    # Check file extensions
    if not file_path.lower().endswith(('.mp3', '.wav')):
        return "Invalid file type"

    # Extract features and prepare data
    features = extract_features(file_path)
    if features is None:
        return "Error extracting audio features, check input"
    dataX, dataY, vocab_size, label_encoder = prepare_data([features])
    if len(dataX) == 0:
        return "Audio too short to build training sequences"
    input_shape = dataX.shape[1]

    # Create and train model
    model = create_model(input_shape, vocab_size)
    train_model(model, dataX, dataY)

    # BUG FIX: np.random.randint's upper bound is exclusive, so the old
    # `len(dataX) - 1` could never select the last window and raised
    # ValueError when exactly one window existed.
    rand_index = np.random.randint(0, len(dataX))
    start_seq = dataX[rand_index]
    generated_mfcc_sequence = generate_rap(model, start_seq, label_encoder, input_shape, vocab_size)
    return generated_mfcc_sequence
|
94 |
+
|
95 |
+
# Gradio Interface
|
96 |
+
# Gradio Interface
iface = gr.Interface(
    fn=train_and_generate,
    # BUG FIX: Gradio 4 replaced the `source` argument with `sources` (a
    # list); `source="upload"` raises TypeError at startup — the likely
    # cause of the Space's build error.
    inputs=gr.Audio(sources=["upload"], type="filepath", label="Upload MP3 or WAV File"),
    outputs=gr.Textbox(label="Generated Rap"),
    title="AI Rapper",
    description="Upload a Rap song to train the model and generate a new rap verse",
)

if __name__ == "__main__":
    iface.launch()
|