underwater45 committed · verified
Commit 2d978d2 · 1 Parent(s): fe8df42

Create main.py

Files changed (1): main.py +105 -0
main.py ADDED
@@ -0,0 +1,105 @@
+ import gradio as gr
+ import librosa
+ import numpy as np
+ from tensorflow.keras.models import Sequential
+ from tensorflow.keras.layers import LSTM, Dense, Embedding
+ from tensorflow.keras.optimizers import Adam
+ from tensorflow.keras.losses import CategoricalCrossentropy
+ from sklearn.preprocessing import LabelEncoder
+ from tensorflow.keras.utils import to_categorical
+
+
+ def extract_features(file_path):
+     try:
+         audio, sr = librosa.load(file_path, sr=None)  # Load audio, keep the original sampling rate
+         mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
+         return mfccs.T  # Transpose to (time_steps, features)
+     except Exception as e:
+         print(f"Error processing {file_path}: {e}")
+         return None
+
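+ # Scale note (an estimate, not part of the original commit): with librosa's
+ # default hop length of 512, ~3 minutes of audio at 22.05 kHz yields roughly
+ # 7-8k frames, so extract_features returns a matrix of about (7500, 13).
+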
+ def create_model(vocab_size):
+     model = Sequential()
+     # Embedding maps each integer frame token to a dense 16-dim vector
+     model.add(Embedding(input_dim=vocab_size, output_dim=16))
+     model.add(LSTM(64))  # Simple LSTM with 64 units
+     model.add(Dense(vocab_size, activation='softmax'))  # Distribution over the next token
+     optimizer = Adam(learning_rate=0.001)  # Adam optimizer
+     loss_function = CategoricalCrossentropy()  # Categorical cross-entropy loss
+     model.compile(optimizer=optimizer, loss=loss_function, metrics=['accuracy'])
+     return model
+
+ def prepare_data(mfccs_list, seq_length=10):
+     all_mfccs = np.concatenate(mfccs_list, axis=0)
+     # Treat each 13-dim MFCC frame as one discrete token: serialize the frame
+     # to a string so LabelEncoder can assign it a single integer id
+     frame_strings = np.array([' '.join(map(str, frame)) for frame in all_mfccs])
+     label_encoder = LabelEncoder()
+     integer_encoded = label_encoder.fit_transform(frame_strings)
+     vocab_size = len(label_encoder.classes_)
+     # Build sliding windows: seq_length tokens in, the next token out
+     dataX, dataY = [], []
+     for i in range(len(integer_encoded) - seq_length):
+         dataX.append(integer_encoded[i:i + seq_length])
+         dataY.append(integer_encoded[i + seq_length])
+     dataX = np.array(dataX)  # (samples, seq_length) integer token ids
+     dataY = to_categorical(np.array(dataY), num_classes=vocab_size)  # one-hot targets
+     return dataX, dataY, vocab_size, label_encoder
+
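+ # Caveat: serializing raw float frames makes nearly every frame unique, so
+ # vocab_size approaches the total frame count and the model mostly memorizes
+ # the song. Quantizing frames first (e.g. k-means) would give a real
+ # vocabulary; that refinement is a suggestion, not part of this commit.
+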
+ def train_model(model, dataX, dataY):
+     model.fit(dataX, dataY, epochs=10, batch_size=64, verbose=0)
+
+ def generate_rap(model, start_seq, label_encoder, seq_length, num_frames=50):
+     window = list(start_seq)  # rolling window of integer frame tokens
+     generated_frames = []
+     for _ in range(num_frames):
+         # Reshape the most recent seq_length tokens to (1, seq_length)
+         x_input = np.array(window[-seq_length:]).reshape(1, seq_length)
+         # Predict the next token
+         predicted_probabilities = model.predict(x_input, verbose=0)[0]
+         predicted_token = int(np.argmax(predicted_probabilities))
+         window.append(predicted_token)
+
+         # Decode the token back into a 13-dim MFCC frame
+         frame = np.array(label_encoder.classes_[predicted_token].split(), dtype=float)
+         generated_frames.append(frame)
+     return np.array(generated_frames)  # (num_frames, n_mfcc)
+
+
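+ # A hypothetical sampling variant of the decoding step above, as an
+ # alternative to greedy argmax (the 0.8 temperature is an arbitrary pick):
+ #     probs = predicted_probabilities ** (1 / 0.8)
+ #     probs /= probs.sum()
+ #     predicted_token = int(np.random.choice(len(probs), p=probs))
+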
+ # Train a model on the uploaded file, then generate new frames from it
+ def train_and_generate(file_path):
+     # Check the file extension
+     if not file_path or not file_path.lower().endswith(('.mp3', '.wav')):
+         return "Invalid file type"
+
+     # Extract features and prepare data
+     features = extract_features(file_path)
+     if features is None:
+         return "Error extracting audio features, check input"
+     dataX, dataY, vocab_size, label_encoder = prepare_data([features])
+     if len(dataX) == 0:
+         return "Audio too short to build training sequences"
+     seq_length = dataX.shape[1]
+     # Create and train the model
+     model = create_model(vocab_size)
+     train_model(model, dataX, dataY)
+     # Generate from a randomly chosen seed sequence
+     rand_index = np.random.randint(0, len(dataX))
+     start_seq = dataX[rand_index]
+     generated_mfcc_sequence = generate_rap(model, start_seq, label_encoder, seq_length)
+     return np.array2string(generated_mfcc_sequence, precision=2)
+
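+ # Note: the interface returns MFCC frames as text. To actually hear output,
+ # the frames would need to be inverted to a waveform, e.g. with
+ # librosa.feature.inverse.mfcc_to_audio(frames.T); that step is only a
+ # pointer here and is not wired into the demo.
+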
+ # Gradio interface
+ iface = gr.Interface(
+     fn=train_and_generate,
+     inputs=gr.Audio(source="upload", type="filepath", label="Upload MP3 or WAV File"),
+     outputs=gr.Textbox(label="Generated Rap"),
+     title="AI Rapper",
+     description="Upload a rap song to train the model and generate a new rap verse",
+ )
+
+ if __name__ == "__main__":
+     iface.launch()
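
To run this Space locally, assuming the usual layout with a requirements.txt listing gradio, librosa, tensorflow and scikit-learn next to main.py:

pip install gradio librosa tensorflow scikit-learn
python main.py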