import os
import torch
import librosa
import numpy as np
import gradio as gr
from sonics import HFAudioClassifier

# Restructured model configurations for separate selectors
MODEL_TYPES = ["SpecTTTra-α", "SpecTTTra-β", "SpecTTTra-γ"]
DURATIONS = ["5s", "120s"]


# Mapping for model IDs
def get_model_id(model_type, duration):
    model_map = {
        "SpecTTTra-α-5s": "awsaf49/sonics-spectttra-alpha-5s",
        "SpecTTTra-β-5s": "awsaf49/sonics-spectttra-beta-5s",
        "SpecTTTra-γ-5s": "awsaf49/sonics-spectttra-gamma-5s",
        "SpecTTTra-α-120s": "awsaf49/sonics-spectttra-alpha-120s",
        "SpecTTTra-β-120s": "awsaf49/sonics-spectttra-beta-120s",
        "SpecTTTra-γ-120s": "awsaf49/sonics-spectttra-gamma-120s",
    }
    key = f"{model_type}-{duration}"
    return model_map[key]


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_cache = {}


def load_model(model_type, duration):
    """Load a model if not already cached."""
    model_key = f"{model_type}-{duration}"
    if model_key not in model_cache:
        model_id = get_model_id(model_type, duration)
        model = HFAudioClassifier.from_pretrained(model_id)
        model = model.to(device)
        model.eval()
        model_cache[model_key] = model
    return model_cache[model_key]


def process_audio(audio_path, model_type, duration):
    """Process an audio file and return real/fake probabilities."""
    try:
        model = load_model(model_type, duration)
        max_time = model.config.audio.max_time

        # Load and process audio
        audio, sr = librosa.load(audio_path, sr=16000)
        chunk_samples = int(max_time * sr)
        total_chunks = len(audio) // chunk_samples
        middle_chunk_idx = total_chunks // 2

        # Extract the middle chunk (pad with zeros if the clip is shorter)
        start = middle_chunk_idx * chunk_samples
        end = start + chunk_samples
        chunk = audio[start:end]

        if len(chunk) < chunk_samples:
            chunk = np.pad(chunk, (0, chunk_samples - len(chunk)))

        # Get prediction
        with torch.no_grad():
            chunk = torch.from_numpy(chunk).float().to(device)
            pred = model(chunk.unsqueeze(0))
            prob = torch.sigmoid(pred).cpu().numpy()[0]

        real_prob = 1 - prob
        fake_prob = prob

        # Return formatted results
        return {
            "Real": float(real_prob),
            "Fake": float(fake_prob),
        }
    except Exception as e:
        return {"Error": str(e)}


def predict(audio_file, model_type, duration):
    """Gradio interface function."""
    if audio_file is None:
        return {"Message": "Please upload an audio file"}
    return process_audio(audio_file, model_type, duration)
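
# Quick sanity check for the pipeline above (a sketch: it assumes the bundled
# example file exists and that model weights can be fetched from the Hugging
# Face Hub on first use; the printed numbers are illustrative only):
#
#     result = process_audio("example/real_song.mp3", "SpecTTTra-γ", "5s")
#     print(result)  # e.g. {"Real": 0.97, "Fake": 0.03}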
# Updated CSS with better color scheme for resource links
css = """
/* Custom CSS that works with Ocean theme */
.sonics-header {
    text-align: center;
    padding: 20px;
    margin-bottom: 20px;
    border-radius: 10px;
}
.sonics-logo {
    max-width: 150px;
    border-radius: 10px;
    box-shadow: 0 4px 8px rgba(0,0,0,0.3);
}
.sonics-title {
    font-size: 28px;
    margin-bottom: 10px;
}
.sonics-subtitle {
    margin-bottom: 15px;
}
.sonics-description {
    font-size: 16px;
    margin: 0;
}

/* Resource links styling */
.resource-links {
    display: flex;
    justify-content: center;
    flex-wrap: wrap;
    gap: 8px;
    margin-bottom: 25px;
}
.resource-link {
    background-color: #222222;
    color: #4aedd6;
    border: 1px solid #333333;
    padding: 8px 16px;
    border-radius: 20px;
    margin: 5px;
    text-decoration: none;
    display: inline-block;
    font-weight: 500;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.3);
    transition: all 0.2s ease;
}
.resource-link:hover {
    background-color: #333333;
    transform: translateY(-2px);
    box-shadow: 0 3px 6px rgba(0, 0, 0, 0.4);
}
.resource-link-icon {
    margin-right: 5px;
}

/* Footer styling */
.sonics-footer {
    text-align: center;
    margin-top: 30px;
    padding: 15px;
}

/* Selectors wrapper for side-by-side appearance */
.selectors-wrapper {
    display: flex;
    gap: 10px;
}
.selectors-wrapper > div {
    flex: 1;
}
"""

# Create Gradio interface
with gr.Blocks(css=css, theme=gr.themes.Ocean()) as demo:
    # Title and Logo
    gr.HTML(
        """
        <div class="sonics-header">
            <h1 class="sonics-title">SONICS: Synthetic Or Not - Identifying Counterfeit Songs</h1>

            <h3 class="sonics-subtitle">ICLR 2025 [Poster]</h3>

            <p class="sonics-description">
                Detect if a song is real or AI-generated with our state-of-the-art
                models. Simply upload an audio file to verify its authenticity!
            </p>
        </div>

""" ) # Resource Links - Updated with custom styling to match screenshot gr.HTML( """ """ ) # Main Interface with gr.Row(equal_height=True): with gr.Column(): audio_input = gr.Audio( label="Upload Audio File", type="filepath", elem_id="audio_input" ) # Add CSS class to create a wrapper for side-by-side dropdowns with gr.Row(elem_classes="selectors-wrapper"): model_dropdown = gr.Dropdown( choices=MODEL_TYPES, value="SpecTTTra-γ", label="Select Model", elem_id="model_dropdown" ) duration_dropdown = gr.Dropdown( choices=DURATIONS, value="5s", label="Select Duration", elem_id="duration_dropdown" ) submit_btn = gr.Button( "✨ Analyze Audio", elem_id="submit_btn", variant="primary" ) with gr.Column(): # Define output before using it in Examples output = gr.Label( label="Analysis Result", num_top_classes=2, elem_id="output" ) with gr.Accordion("How It Works", open=True): gr.Markdown(""" ### The SONICS classifier The SONICS classifier analyzes your audio to determine if it's an authentic song (human created) or generated by AI. Our models are trained on a diverse dataset of real and AI-generated songs from Suno and Udio. ### Models available: - **SpecTTTra-γ**: Optimized for speed - **SpecTTTra-β**: Balanced performance - **SpecTTTra-α**: Highest accuracy ### Duration variants: - **5s**: Analyzes a 5-second clip (faster) - **120s**: Analyzes up to 2 minutes (more accurate) """) # Add Examples section after output is defined with gr.Accordion("Example Audio Files", open=True): gr.Examples( examples=[ ["example/real_song.mp3", "SpecTTTra-γ", "5s"], ["example/fake_song.mp3", "SpecTTTra-γ", "5s"], ], inputs=[audio_input, model_dropdown, duration_dropdown], outputs=[output], fn=predict, cache_examples=True, ) # Footer gr.HTML( """ """ ) # Prediction handling submit_btn.click(fn=predict, inputs=[audio_input, model_dropdown, duration_dropdown], outputs=[output]) if __name__ == "__main__": demo.launch()