Spaces:

azoodle
/

genre_classify

Running

App Files Community

azeus committed on Dec 8, 2024

Commit

eb6bacc

1 Parent(s): f3617b6

adapting to audio formats

Browse files

Files changed (1) hide show

app.py +71 -48

app.py CHANGED Viewed

@@ -4,6 +4,9 @@ import torch
 from transformers import Wav2Vec2Processor, Wav2Vec2Model
 import torchaudio
 import io
 # Initialize model and processor
@@ -14,40 +17,68 @@ def load_model():
     return processor, model
 # Audio processing function
 def process_audio(audio_file, processor, model):
-    # Read audio file
-    audio_bytes = audio_file.read()
-    waveform, sample_rate = torchaudio.load(io.BytesIO(audio_bytes))
-    # Resample if needed
-    if sample_rate != 16000:
-        resampler = torchaudio.transforms.Resample(sample_rate, 16000)
-        waveform = resampler(waveform)
-    # Convert to mono if stereo
-    if waveform.shape[0] > 1:
-        waveform = torch.mean(waveform, dim=0, keepdim=True)
-    # Process through Wav2Vec2
-    inputs = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
-    with torch.no_grad():
-        outputs = model(**inputs)
-    # Get features from last hidden states
-    features = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
-    return features
-# Simple genre classifier (we'll use a basic classifier for demonstration)
 class SimpleGenreClassifier:
     def __init__(self):
         self.genres = ["Rock", "Pop", "Hip Hop", "Classical", "Jazz"]
-        # Simulated learned weights (in real application, these would be trained)
         self.weights = np.random.randn(768, len(self.genres))
     def predict(self, features):
-        # Simple linear classification
         logits = np.dot(features, self.weights)
         probabilities = self.softmax(logits)
         return probabilities
@@ -72,7 +103,7 @@ except Exception as e:
     st.error(f"Error loading models: {str(e)}")
     st.stop()
-# Create two columns for layout
 col1, col2 = st.columns(2)
 with col1:
@@ -84,61 +115,53 @@ with col1:
         st.audio(audio_file)
         st.success("File uploaded successfully!")
         # Add classify button
         if st.button("Classify Genre"):
             try:
                 with st.spinner("Analyzing audio..."):
-                    # Extract features using Wav2Vec2
                     features = process_audio(audio_file, processor, wav2vec_model)
-                    # Get genre predictions
-                    probabilities = classifier.predict(features)
-                    # Show results
-                    st.write("### Genre Analysis Results:")
-                    for genre, prob in zip(classifier.genres, probabilities):
-                        # Create a progress bar for each genre
-                        st.write(f"{genre}:")
-                        st.progress(float(prob))
-                        st.write(f"{prob:.2%}")
-                    # Show top prediction
-                    top_genre = classifier.genres[np.argmax(probabilities)]
-                    st.write(f"**Predicted Genre:** {top_genre}")
             except Exception as e:
                 st.error(f"Error during analysis: {str(e)}")
 with col2:
-    # Display information about the model
     st.write("### About the Model:")
     st.write("""
     This classifier uses:
     - Facebook's Wav2Vec2 for audio feature extraction
     - Custom genre classification layer
-    - Pre-trained on speech recognition
     """)
     st.write("### Supported Genres:")
     for genre in classifier.genres:
         st.write(f"- {genre}")
-    # Add usage tips
     st.write("### Tips for best results:")
     st.write("- Upload clear, high-quality audio")
-    st.write("- Ideal length: 10-30 seconds")
     st.write("- Avoid audio with multiple overlapping genres")
     st.write("- Ensure minimal background noise")
-# Update requirements.txt
-if st.sidebar.checkbox("Show requirements.txt contents"):
-    st.sidebar.code("""
-    streamlit==1.31.0
-    torch==2.0.1
-    torchaudio==2.0.1
-    transformers==4.30.2
-    numpy==1.24.3
-    """)
 # Footer
 st.markdown("---")
 st.write("Made with ❤️ using Streamlit and Hugging Face Transformers")

 from transformers import Wav2Vec2Processor, Wav2Vec2Model
 import torchaudio
 import io
+from pydub import AudioSegment
+import tempfile
+import os
 # Initialize model and processor
     return processor, model
+def convert_audio_to_wav(audio_file):
+    """Convert uploaded audio to a temporary WAV file.
+
+    Args:
+        audio_file: File-like object exposing ``read()``; its entire
+            contents are decoded with pydub.
+
+    Returns:
+        Path of the temporary ``.wav`` file. The file is created with
+        ``delete=False``, so the caller is responsible for removing it.
+
+    Raises:
+        Exception: Whatever pydub raises when decoding or exporting fails.
+            No temporary file is left behind in either case.
+    """
+    # Read uploaded file
+    audio_bytes = audio_file.read()
+    # Decode before creating the temporary file, so an unsupported or
+    # corrupt upload fails without leaving an empty file on disk.
+    audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
+    # Only the path is needed here; pydub opens the file itself on export.
+    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
+        wav_path = temp_wav.name
+    try:
+        # export() returns the output file handle; close it explicitly so
+        # the data is flushed before the caller reads or deletes the file.
+        audio.export(wav_path, format='wav').close()
+    except Exception:
+        os.remove(wav_path)
+        raise
+    return wav_path
 # Audio processing function
 def process_audio(audio_file, processor, model):
+    """Extract a feature vector from an uploaded audio file.
+
+    The upload is converted to WAV, resampled to 16 kHz, mixed down to one
+    channel and truncated to 30 seconds before being passed to the model.
+
+    Args:
+        audio_file: Uploaded file-like object, handed to
+            convert_audio_to_wav.
+        processor: Callable that prepares the raw samples for the model
+            (presumably the Wav2Vec2Processor from load_model -- confirm).
+        model: Model whose output exposes ``last_hidden_state``.
+
+    Returns:
+        NumPy array holding the last hidden state averaged over dim 1, or
+        None when any step fails (the error is reported via st.error).
+    """
+    try:
+        # Convert audio to WAV format
+        wav_path = convert_audio_to_wav(audio_file)
+        try:
+            # Load the WAV file
+            waveform, sample_rate = torchaudio.load(wav_path)
+        finally:
+            # Clean up the temporary file even when loading fails, so
+            # failed uploads do not accumulate in the temp directory.
+            os.remove(wav_path)
+        # Resample if needed
+        if sample_rate != 16000:
+            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
+            waveform = resampler(waveform)
+        # Convert to mono if stereo
+        if waveform.shape[0] > 1:
+            waveform = torch.mean(waveform, dim=0, keepdim=True)
+        # Limit audio length to 30 seconds
+        max_length = 16000 * 30  # 30 seconds at 16kHz
+        if waveform.shape[1] > max_length:
+            waveform = waveform[:, :max_length]
+        # Process through Wav2Vec2. Drop only the channel axis so that a
+        # one-sample clip stays 1-D instead of collapsing to a scalar.
+        inputs = processor(waveform.squeeze(0).numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
+        with torch.no_grad():
+            outputs = model(**inputs)
+        # Get features from last hidden states
+        features = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
+        return features
+    except Exception as e:
+        st.error(f"Error processing audio: {str(e)}")
+        return None
+# Simple genre classifier
 class SimpleGenreClassifier:
     def __init__(self):
         self.genres = ["Rock", "Pop", "Hip Hop", "Classical", "Jazz"]
+        # Simulated learned weights
+        np.random.seed(42)  # For consistent results
         self.weights = np.random.randn(768, len(self.genres))
     def predict(self, features):
         logits = np.dot(features, self.weights)
         probabilities = self.softmax(logits)
         return probabilities
     st.error(f"Error loading models: {str(e)}")
     st.stop()
+# Create two columns
 col1, col2 = st.columns(2)
 with col1:
         st.audio(audio_file)
         st.success("File uploaded successfully!")
+        # Reset file pointer
+        audio_file.seek(0)
         # Add classify button
         if st.button("Classify Genre"):
             try:
                 with st.spinner("Analyzing audio..."):
+                    # Extract features
                     features = process_audio(audio_file, processor, wav2vec_model)
+                    if features is not None:
+                        # Get predictions
+                        probabilities = classifier.predict(features)
+                        # Show results
+                        st.write("### Genre Analysis Results:")
+                        for genre, prob in zip(classifier.genres, probabilities):
+                            st.write(f"{genre}:")
+                            st.progress(float(prob))
+                            st.write(f"{prob:.2%}")
+                        # Show top prediction
+                        top_genre = classifier.genres[np.argmax(probabilities)]
+                        st.write(f"**Predicted Genre:** {top_genre}")
             except Exception as e:
                 st.error(f"Error during analysis: {str(e)}")
 with col2:
     st.write("### About the Model:")
     st.write("""
     This classifier uses:
     - Facebook's Wav2Vec2 for audio feature extraction
     - Custom genre classification layer
+    - Handles MP3 and WAV formats
     """)
     st.write("### Supported Genres:")
     for genre in classifier.genres:
         st.write(f"- {genre}")
     st.write("### Tips for best results:")
     st.write("- Upload clear, high-quality audio")
+    st.write("- Best length: 10-30 seconds")
     st.write("- Avoid audio with multiple overlapping genres")
     st.write("- Ensure minimal background noise")
 # Footer
 st.markdown("---")
 st.write("Made with ❤️ using Streamlit and Hugging Face Transformers")