awsaf49 committed
Commit 3f50570 · 1 Parent(s): e0a0564

Initial Commit

.gitignore ADDED
@@ -0,0 +1,124 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # celery beat schedule file
+ celerybeat-schedule
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
README copy.md ADDED
@@ -0,0 +1,80 @@
+ # SONICS: Synthetic Or Not - Identifying Counterfeit Songs
+
+ This repository contains the official source code for our paper **SONICS: Synthetic Or Not - Identifying Counterfeit Songs**.
+
+
+ ## System Configuration
+
+ - Disk Space: 150GB
+ - GPU Memory: 48GB
+ - RAM: 32GB
+ - Python Version: 3.10
+ - OS: Ubuntu 20.04
+ - CUDA Version: 12.4
+
+ ## Installation
+
+ ```shell
+ python -m venv .venv
+ source .venv/bin/activate
+ pip install -r requirements.txt
+ ```
+
+ ## Dataset
+
+ [As a part of our submission, we are not providing our dataset. It will be published after the final decision.]
+
+ After downloading the dataset, the folder structure should look like the following:
+
+ ```
+ parentFolder
+ ├──sonics
+ ├──dataset
+ │   ├──real_songs
+ │   │   └──xxx.mp3
+ │   ├──fake_songs
+ │   │   └──yyy.mp3
+ │   ├──real_songs.csv
+ │   └──fake_songs.csv
+ ```
+
+ To split the dataset into train, val, and test sets, run the following from the parent folder:
+
+ ```shell
+ python data_split.py
+ ```
+
+ > **Note:** `real_songs.csv` and `fake_songs.csv` contain the metadata for the songs (filepath, duration, split, etc.), and the config file contains the path to the metadata.
+
+ > **Note:** Output files, including checkpoints and model predictions, will be saved in the `./output/<experiment_name>/` folder.
+
+ ## Training
+
+ Choose any of the configs from the `config` folder and run the following:
+
+ ```shell
+ python train.py --config <path to the config file>
+ ```
+
+ ## Testing
+
+ Choose any of the configs from the `config` folder and run the following:
+
+ ```shell
+ python test.py --config <path to the config file> --ckpt_path <path to the checkpoint file>
+ ```
+
+ ## Model Profiling
+
+ Choose any of the configs from the `config` folder and run the following:
+
+ ```shell
+ python model_profile.py --config <path to the config file> --batch_size 12
+ ```
+
+ ## Acknowledgement
+
+ We have utilized the code and models provided in the following repository:
+
+ - [Pytorch Image Models](https://github.com/huggingface/pytorch-image-models)
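For quick reference, a minimal inference sketch distilled from `app.py` in this commit: the Hub model ID and the 16 kHz sample rate are taken from that file, while `song.mp3` is a placeholder path and the clip is assumed to be at least `max_time` seconds long.

```python
import librosa
import torch

from sonics import HFAudioClassifier

# Model ID mirrors those listed in app.py; "song.mp3" is a placeholder.
model = HFAudioClassifier.from_pretrained("awsaf49/sonics-spectttra-alpha-5s")

audio, sr = librosa.load("song.mp3", sr=16000)
max_len = int(model.config.audio.max_time * sr)  # clip length expected by the model
chunk = torch.from_numpy(audio[:max_len]).float().unsqueeze(0)

with torch.no_grad():
    prob_fake = torch.sigmoid(model(chunk)).item()
print(f"P(fake) = {prob_fake:.3f}")
```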
app.py ADDED
@@ -0,0 +1,145 @@
+ import os
+ import math
+ import gradio as gr
+ import torch
+ import librosa
+ import pandas as pd
+ import numpy as np
+
+ from sonics import HFAudioClassifier
+
+
+ # Constants
+ MODEL_IDS = {
+     "SpecTTTra-α (5s)": "awsaf49/sonics-spectttra-alpha-5s",
+     "SpecTTTra-β (5s)": "awsaf49/sonics-spectttra-beta-5s",
+     "SpecTTTra-γ (5s)": "awsaf49/sonics-spectttra-gamma-5s",
+     "SpecTTTra-α (120s)": "awsaf49/sonics-spectttra-alpha-120s",
+     "SpecTTTra-β (120s)": "awsaf49/sonics-spectttra-beta-120s",
+     "SpecTTTra-γ (120s)": "awsaf49/sonics-spectttra-gamma-120s",
+ }
+
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model_cache = {}
+
+
+ def load_model(model_name):
+     """Load model if not already cached"""
+     if model_name not in model_cache:
+         model_id = MODEL_IDS[model_name]
+         model = HFAudioClassifier.from_pretrained(model_id)
+         model = model.to(device)
+         model.eval()
+         model_cache[model_name] = model
+     return model_cache[model_name]
+
+
+ def process_audio(audio_path, model_name):
+     """Process audio file and return prediction"""
+     try:
+         # Load model
+         model = load_model(model_name)
+
+         # Get max time from model config
+         max_time = model.config.audio.max_time
+
+         # Load and process audio
+         audio, sr = librosa.load(audio_path, sr=16000)
+         duration = len(audio) / sr
+
+         # Calculate chunk size and middle position
+         chunk_samples = int(max_time * sr)
+         total_chunks = len(audio) // chunk_samples
+         middle_chunk_idx = total_chunks // 2
+
+         # Extract middle chunk
+         start = middle_chunk_idx * chunk_samples
+         end = start + chunk_samples
+         chunk = audio[start:end]
+
+         # Pad if needed (shouldn't be necessary for middle chunk)
+         if len(chunk) < chunk_samples:
+             chunk = np.pad(chunk, (0, chunk_samples - len(chunk)))
+
+         # Convert to tensor and get prediction
+         with torch.no_grad():
+             chunk = torch.from_numpy(chunk).float().to(device)
+             pred = model(chunk.unsqueeze(0))
+             prob = torch.sigmoid(pred).cpu().numpy()[0]
+
+         # Get prediction
+         output = {"Real": 1 - prob, "Fake": prob}
+
+         return output
+
+     except Exception as e:
+         return {
+             "Duration": "Error",
+             "Prediction": f"Error: {str(e)}",
+             "Confidence": "N/A",
+         }
+
+
+ def predict(audio_file, model_name):
+     """Gradio interface function"""
+     if audio_file is None:
+         return {
+             "Duration": "No file",
+             "Prediction": "Please upload an audio file",
+             "Confidence": "N/A",
+         }
+
+     return process_audio(audio_file, model_name)
+
+
+ # Create Gradio interface
+ css = """
+ .heading {
+     text-align: center;
+     margin-bottom: 2rem;
+ }
+ .logo {
+     max-width: 250px;
+     margin: 0 auto;
+     display: block;
+ }
+ """
+
+ with gr.Blocks(css=css) as demo:
+     gr.HTML(
+         """
+         <div class="heading">
+             <img src="https://i.postimg.cc/3Jx3yZ5b/real-vs-fake-sonics-w-logo.jpg" class="logo">
+             <h1>SONICS: Synthetic Or Not - Identifying Counterfeit Songs</h1>
+             <h3><span style="color:red;"><b>ICLR 2025 [Poster]</b></span></h3>
+         </div>
+         """
+     )
+
+     with gr.Row():
+         with gr.Column():
+             audio_input = gr.Audio(label="Upload Audio", type="filepath")
+             model_dropdown = gr.Dropdown(
+                 choices=list(MODEL_IDS.keys()),
+                 value="SpecTTTra-γ (5s)",
+                 label="Select Model",
+             )
+             submit_btn = gr.Button("Predict")
+
+         with gr.Column():
+             output = gr.Label(label="Result", num_top_classes=2)
+
+     submit_btn.click(fn=predict, inputs=[audio_input, model_dropdown], outputs=[output])
+
+     gr.Markdown(
+         """
+         ## Resources
+         - 📄 [Paper](https://openreview.net/forum?id=PY7KSh29Z8)
+         - 🎵 [Dataset](https://huggingface.co/datasets/awsaf49/sonics)
+         - 🔬 [ArXiv](https://arxiv.org/abs/2408.14080)
+         - 💻 [GitHub](https://github.com/awsaf49/sonics)
+         """
+     )
+
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,24 @@
+ # Core libraries
+ torch>=2.4.0
+ torchaudio>=2.4.0
+
+ # Audio processing
+ librosa>=0.9.0
+
+ # Data processing
+ pandas>=1.3.0
+
+ # Visualization
+ matplotlib>=3.4.0
+ tqdm>=4.60.0
+
+ # ML utilities
+ scikit-learn>=1.0.0
+
+ # FLOPs counting
+ fvcore
+ timm>=1.0.7
+
+ # gradio
+ gradio>=4.0.0
+
sonics/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from sonics.utils.seed import set_seed
+ from sonics.utils.config import dict2cfg
+ from sonics.utils.dataset import get_dataloader
+ from sonics.utils.scheduler import get_scheduler
+ from sonics.models.hf_model import HFAudioClassifier
sonics/layers/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from sonics.layers.tokenizer import Tokenizer1D, STTokenizer
+ from sonics.layers.embedding import (
+     SinusoidPositionalEncoding,
+     LearnedPositionalEncoding,
+ )
+ from sonics.layers.transformer import Transformer
sonics/layers/augment.py ADDED
@@ -0,0 +1,244 @@
+ import math
+ from typing import Tuple
+ import torch
+ import torch.nn as nn
+ from torchaudio.transforms import SpecAugment
+ from torch import Tensor
+ from torchvision.transforms import functional as F
+
+
+ class AugmentLayer(nn.Module):
+     def __init__(self, cfg):
+         super().__init__()
+         self.cfg = cfg
+
+         # Initialize MixUp
+         self.mixup = MixUp(
+             alpha=cfg.augment.mixup_alpha,
+             num_classes=cfg.num_classes,
+             p=cfg.augment.mixup_p,
+             inplace=True,
+         )
+
+         # Initialize other augmentations
+         self.time_freq_mask = SpecAugment(
+             n_time_masks=cfg.augment.n_time_masks,
+             time_mask_param=cfg.augment.time_mask_param,
+             n_freq_masks=cfg.augment.n_freq_masks,
+             freq_mask_param=cfg.augment.freq_mask_param,
+             p=cfg.augment.time_freq_mask_p,
+             zero_masking=True,
+         )
+
+     def forward(self, spec, y=None):
+         # Apply MixUp or CutMix with RandomChoice
+         if y is not None:
+             # img = spec.unsqueeze(1)  # shape: (batch_size, 1, n_mels, n_frames)
+             spec, y = self.mixup(spec, y)
+             # spec = img.squeeze(1)  # shape: (batch_size, n_mels, n_frames)
+
+         # Apply TimeMasking and FrequencyMasking
+         spec = self.time_freq_mask(spec)
+         return spec, y
+
+
+ class MixUp(torch.nn.Module):
+     """Randomly apply MixUp to the provided batch and targets.
+     The class implements the data augmentations as described in the paper
+     `"mixup: Beyond Empirical Risk Minimization" <https://arxiv.org/abs/1710.09412>`_.
+
+     Args:
+         num_classes (int): number of classes used for one-hot encoding.
+         p (float): probability of the batch being transformed. Default value is 0.5.
+         alpha (float): hyperparameter of the Beta distribution used for mixup.
+             Default value is 1.0.
+         inplace (bool): boolean to make this transform inplace. Default set to False.
+     """
+
+     def __init__(
+         self,
+         num_classes: int,
+         p: float = 0.5,
+         alpha: float = 1.0,
+         inplace: bool = False,
+     ) -> None:
+         super().__init__()
+
+         if num_classes < 1:
+             raise ValueError(
+                 f"Please provide a valid positive value for the num_classes. Got num_classes={num_classes}"
+             )
+
+         if alpha <= 0:
+             raise ValueError("Alpha param can't be zero.")
+
+         self.num_classes = num_classes
+         self.p = p
+         self.alpha = alpha
+         self.inplace = inplace
+
+     def forward(self, batch: Tensor, target: Tensor) -> Tuple[Tensor, Tensor]:
+         """
+         Args:
+             batch (Tensor): Float tensor of size (B, C, H, W)
+             target (Tensor): Integer tensor of size (B, )
+
+         Returns:
+             Tensor: Randomly transformed batch.
+         """
+         if batch.ndim != 3 and batch.ndim != 2:
+             raise ValueError(
+                 f"Batch ndim should be 3 (b, f, t) or 2 (b, n). Got {batch.ndim}"
+             )
+         if target.ndim != 1:
+             raise ValueError(f"Target ndim should be 1. Got {target.ndim}")
+         if not batch.is_floating_point():
+             raise TypeError(f"Batch dtype should be a float tensor. Got {batch.dtype}.")
+         if target.dtype != torch.int64 and self.num_classes > 1:
+             raise TypeError(f"Target dtype should be torch.int64. Got {target.dtype}")
+
+         if not self.inplace:
+             batch = batch.clone()
+             target = target.clone()
+
+         if target.ndim == 1 and self.num_classes > 1:
+             target = torch.nn.functional.one_hot(target, num_classes=self.num_classes)
+
+         target = target.to(dtype=batch.dtype)
+
+         if torch.rand(1).item() >= self.p:
+             return batch, target
+
+         # It's faster to roll the batch by one instead of shuffling it to create image pairs
+         batch_rolled = batch.roll(1, 0)
+         target_rolled = target.roll(1, 0)
+
+         # Implemented as on mixup paper, page 3.
+         lambda_param = float(
+             torch._sample_dirichlet(torch.tensor([self.alpha, self.alpha]))[0]
+         )
+         batch_rolled.mul_(1.0 - lambda_param)
+         batch.mul_(lambda_param).add_(batch_rolled)
+
+         target_rolled.mul_(1.0 - lambda_param)
+         target.mul_(lambda_param).add_(target_rolled)
+
+         return batch, target
+
+     def __repr__(self) -> str:
+         s = (
+             f"{self.__class__.__name__}("
+             f"num_classes={self.num_classes}"
+             f", p={self.p}"
+             f", alpha={self.alpha}"
+             f", inplace={self.inplace}"
+             f")"
+         )
+         return s
+
+
+ # Todo: height of spec should be 1, adjust it for audio input (bs, n_samples)
+ class CutMix(torch.nn.Module):
+     """Randomly apply CutMix to the provided batch and targets.
+     The class implements the data augmentations as described in the paper
+     `"CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features"
+     <https://arxiv.org/abs/1905.04899>`_.
+
+     Args:
+         num_classes (int): number of classes used for one-hot encoding.
+         p (float): probability of the batch being transformed. Default value is 0.5.
+         alpha (float): hyperparameter of the Beta distribution used for cutmix.
+             Default value is 1.0.
+         inplace (bool): boolean to make this transform inplace. Default set to False.
+     """
+
+     def __init__(
+         self,
+         num_classes: int,
+         p: float = 0.5,
+         alpha: float = 1.0,
+         inplace: bool = False,
+     ) -> None:
+         super().__init__()
+         if num_classes < 1:
+             raise ValueError(
+                 "Please provide a valid positive value for the num_classes."
+             )
+         if alpha <= 0:
+             raise ValueError("Alpha param can't be zero.")
+
+         self.num_classes = num_classes
+         self.p = p
+         self.alpha = alpha
+         self.inplace = inplace
+
+     def forward(self, batch: Tensor, target: Tensor) -> Tuple[Tensor, Tensor]:
+         """
+         Args:
+             batch (Tensor): Float tensor of size (B, C, H, W)
+             target (Tensor): Integer tensor of size (B, )
+
+         Returns:
+             Tensor: Randomly transformed batch.
+         """
+         if batch.ndim != 4:
+             raise ValueError(f"Batch ndim should be 4. Got {batch.ndim}")
+         if target.ndim != 1:
+             raise ValueError(f"Target ndim should be 1. Got {target.ndim}")
+         if not batch.is_floating_point():
+             raise TypeError(f"Batch dtype should be a float tensor. Got {batch.dtype}.")
+         if target.dtype != torch.int64 and self.num_classes > 1:
+             raise TypeError(f"Target dtype should be torch.int64. Got {target.dtype}")
+
+         if not self.inplace:
+             batch = batch.clone()
+             target = target.clone()
+
+         if target.ndim == 1 and self.num_classes > 1:
+             target = torch.nn.functional.one_hot(target, num_classes=self.num_classes)
+
+         target = target.to(dtype=batch.dtype)
+
+         if torch.rand(1).item() >= self.p:
+             return batch, target
+
+         # It's faster to roll the batch by one instead of shuffling it to create image pairs
+         batch_rolled = batch.roll(1, 0)
+         target_rolled = target.roll(1, 0)
+
+         # Implemented as on cutmix paper, page 12 (with minor corrections on typos).
+         lambda_param = float(
+             torch._sample_dirichlet(torch.tensor([self.alpha, self.alpha]))[0]
+         )
+         _, H, W = F.get_dimensions(batch)
+
+         r_x = torch.randint(W, (1,))
+         r_y = torch.randint(H, (1,))
+
+         r = 0.5 * math.sqrt(1.0 - lambda_param)
+         r_w_half = int(r * W)
+         r_h_half = int(r * H)
+
+         x1 = int(torch.clamp(r_x - r_w_half, min=0))
+         y1 = int(torch.clamp(r_y - r_h_half, min=0))
+         x2 = int(torch.clamp(r_x + r_w_half, max=W))
+         y2 = int(torch.clamp(r_y + r_h_half, max=H))
+
+         batch[:, :, y1:y2, x1:x2] = batch_rolled[:, :, y1:y2, x1:x2]
+         lambda_param = float(1.0 - (x2 - x1) * (y2 - y1) / (W * H))
+
+         target_rolled.mul_(1.0 - lambda_param)
+         target.mul_(lambda_param).add_(target_rolled)
+
+         return batch, target
+
+     def __repr__(self) -> str:
+         s = (
+             f"{self.__class__.__name__}("
+             f"num_classes={self.num_classes}"
+             f", p={self.p}"
+             f", alpha={self.alpha}"
+             f", inplace={self.inplace}"
+             f")"
+         )
+         return s
sonics/layers/embedding.py ADDED
@@ -0,0 +1,33 @@
+ import torch
+ import torch.nn as nn
+
+
+ class SinusoidPositionalEncoding(nn.Module):
+     def __init__(self, token_dim, max_len=5000):
+         super(SinusoidPositionalEncoding, self).__init__()
+         pe = torch.zeros(max_len, token_dim)  # shape: (max_len, token_dim)
+         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(
+             1
+         )  # shape: (max_len, 1)
+         div_term = torch.exp(
+             torch.arange(0, token_dim, 2).float()
+             * (-torch.log(torch.tensor(10000.0)) / token_dim)
+         )  # shape: (token_dim // 2)
+         pe[:, 0::2] = torch.sin(position * div_term)  # shape: (max_len, token_dim // 2)
+         pe[:, 1::2] = torch.cos(position * div_term)  # shape: (max_len, token_dim // 2)
+         pe = pe.unsqueeze(0)  # shape: (1, max_len, token_dim)
+         self.register_buffer("pe", pe)
+
+     def forward(self, x):
+         x = x + self.pe[:, : x.size(1), :]  # shape: (batch_size, seq_len, token_dim)
+         return x
+
+
+ class LearnedPositionalEncoding(nn.Module):
+     def __init__(self, token_dim, num_tokens):
+         super(LearnedPositionalEncoding, self).__init__()
+         self.pe = nn.Parameter(torch.randn(1, num_tokens, token_dim) * 0.02)
+
+     def forward(self, x):
+         x = x + self.pe
+         return x
sonics/layers/feature.py ADDED
@@ -0,0 +1,146 @@
+ import torch
+ import numpy as np
+ import torch.nn as nn
+
+ try:
+     from torch.amp import autocast
+
+     torch_amp_new = True
+ except:
+     from torch.cuda.amp import autocast
+
+     torch_amp_new = False
+
+ from torchaudio.transforms import AmplitudeToDB, MelSpectrogram
+
+
+ class FeatureExtractor(nn.Module):
+     def __init__(
+         self,
+         cfg,
+     ):
+         """
+         Feature extraction module.
+
+         Args:
+             cfg: Configuration namespace with `audio` settings (e.g. sample_rate) and
+                 `melspec` settings (n_fft, hop_length, win_length, n_mels, f_min, f_max,
+                 power, top_db, norm).
+         """
+         super().__init__()
+
+         self.audio2melspec = MelSpectrogram(
+             n_fft=cfg.melspec.n_fft,
+             hop_length=cfg.melspec.hop_length,
+             win_length=cfg.melspec.win_length,
+             n_mels=cfg.melspec.n_mels,
+             sample_rate=cfg.audio.sample_rate,
+             f_min=cfg.melspec.f_min,
+             f_max=cfg.melspec.f_max,
+             power=cfg.melspec.power,
+         )
+         self.amplitude_to_db = AmplitudeToDB(top_db=cfg.melspec.top_db)
+
+         if cfg.melspec.norm == "mean_std":
+             self.normalizer = MeanStdNorm()
+         elif cfg.melspec.norm == "min_max":
+             self.normalizer = MinMaxNorm()
+         elif cfg.melspec.norm == "simple":
+             self.normalizer = SimpleNorm()
+         else:
+             self.normalizer = nn.Identity()
+
+     def forward(self, x):
+         """
+         Forward pass of the feature extractor.
+
+         Args:
+             x (torch.Tensor): Input audio data.
+
+         Returns:
+             torch.Tensor: Extracted features.
+         """
+
+         with (
+             autocast("cuda", enabled=False)
+             if torch_amp_new
+             else autocast(enabled=False)
+         ):
+             melspec = self.audio2melspec(x.float())
+             melspec = self.amplitude_to_db(melspec)
+             melspec = self.normalizer(melspec)
+
+         return melspec
+
+
+ class MinMaxNorm(nn.Module):
+     def __init__(self, eps=1e-6):
+         """
+         Module for performing min-max normalization on input data.
+
+         Args:
+             eps (float, optional): Small value to avoid division by zero. Defaults to 1e-6.
+         """
+         super().__init__()
+         self.eps = eps
+
+     def forward(self, X):
+         """
+         Forward pass of the min-max normalization module.
+
+         Args:
+             X (torch.Tensor): Input data.
+
+         Returns:
+             torch.Tensor: Normalized data.
+         """
+         min_ = torch.amax(X, dim=(1, 2), keepdim=True)
+         max_ = torch.amin(X, dim=(1, 2), keepdim=True)
+         return (X - min_) / (max_ - min_ + self.eps)
+
+
+ class SimpleNorm(nn.Module):
+     def __init__(self):
+         """
+         Module for performing simple normalization on input data.
+         """
+         super().__init__()
+
+     def forward(self, x):
+         """
+         Forward pass of the simple normalization module.
+
+         Args:
+             x (torch.Tensor): Input data.
+
+         Returns:
+             torch.Tensor: Normalized data.
+         """
+         return (x - 40) / 80
+
+
+ class MeanStdNorm(nn.Module):
+     def __init__(self, eps=1e-6):
+         """
+         Module for performing mean and standard deviation normalization on input data.
+
+         Args:
+             eps (float, optional): Small value to avoid division by zero. Defaults to 1e-6.
+         """
+         super().__init__()
+         self.eps = eps
+
+     def forward(self, X):
+         """
+         Forward pass of the mean and standard deviation normalization module.
+
+         Args:
+             X (torch.Tensor): Input data.
+
+         Returns:
+             torch.Tensor: Normalized data.
+         """
+         mean = X.mean((1, 2), keepdim=True)
+         std = X.reshape(X.size(0), -1).std(1, keepdim=True).unsqueeze(-1)
+         return (X - mean) / (std + self.eps)
sonics/layers/tokenizer.py ADDED
@@ -0,0 +1,117 @@
+ import math
+ import torch
+ import torch.nn as nn
+ from sonics.layers.embedding import (
+     SinusoidPositionalEncoding,
+     LearnedPositionalEncoding,
+ )
+
+
+ class STTokenizer(nn.Module):
+     def __init__(
+         self,
+         input_spec_dim,
+         input_temp_dim,
+         t_clip,
+         f_clip,
+         embed_dim,
+         pre_norm=False,
+         pe_learnable=False,
+     ):
+         super(STTokenizer, self).__init__()
+         self.input_spec_dim = input_spec_dim
+         self.input_temp_dim = input_temp_dim
+         self.t_clip = t_clip
+         self.f_clip = f_clip
+         self.embed_dim = embed_dim
+         self.pre_norm = pre_norm
+         self.pe_learnable = pe_learnable
+
+         self.num_temporal_tokens = math.floor(
+             (input_temp_dim - t_clip) / t_clip + 1
+         )  # floor((1280 - 5) / 5 + 1) = 256
+         self.num_spectral_tokens = math.floor(
+             (input_spec_dim - f_clip) / f_clip + 1
+         )  # floor((128 - 3) / 3 + 1) = 42
+         # L_out = floor((L_in + 2*p - d*(k - 1) - 1) / s + 1) (ref: PyTorch docs)
+         self.num_tokens = (
+             self.num_temporal_tokens + self.num_spectral_tokens
+         )  # 256 + 42 = 298
+         # For ViT, num_tokens = (1280 * 128)//(5 * 3) = 10922 :)
+
+         self.temporal_tokenizer = Tokenizer1D(
+             input_spec_dim,
+             embed_dim,
+             clip_size=t_clip,
+             num_clips=self.num_temporal_tokens,
+             pre_norm=pre_norm,
+             pe_learnable=pe_learnable,
+         )
+         self.spectral_tokenizer = Tokenizer1D(
+             input_temp_dim,
+             embed_dim,
+             clip_size=f_clip,
+             num_clips=self.num_spectral_tokens,
+             pre_norm=pre_norm,
+             pe_learnable=pe_learnable,
+         )
+
+     def forward(self, x):
+         # Temporal tokenization
+         temporal_input = x  # shape: (B, F, T)
+         temporal_tokens = self.temporal_tokenizer(
+             temporal_input
+         )  # shape: (B, T/t, dim)
+
+         # Spectral tokenization
+         spectral_input = x.permute(0, 2, 1)  # shape: (batch_size, T, F)
+         spectral_tokens = self.spectral_tokenizer(
+             spectral_input
+         )  # shape: (B, F/f, dim)
+
+         spectro_temporal_tokens = torch.cat(
+             (temporal_tokens, spectral_tokens), dim=1
+         )  # shape: (B, T/t + F/f, dim)
+         return spectro_temporal_tokens
+
+
+ class Tokenizer1D(nn.Module):
+     """Temporal/Spectral Tokenizer
+
+     Whisper uses a temporal tokenizer, but its time clip size is too small (stride=1), so the
+     complexity is very high. We use stride=clip_size to reduce complexity.
+     """
+
+     def __init__(
+         self,
+         input_dim,
+         token_dim,
+         clip_size,
+         num_clips,
+         pre_norm=False,
+         pe_learnable=False,
+     ):
+         super(Tokenizer1D, self).__init__()
+         self.conv1d = nn.Conv1d(
+             input_dim,
+             token_dim,
+             clip_size,
+             stride=clip_size,
+             bias=not pre_norm,  # disable bias if pre-norm is used (e.g. CLIP)
+         )
+         self.act = nn.GELU()
+         self.pos_encoder = (
+             SinusoidPositionalEncoding(token_dim)
+             if not pe_learnable
+             else LearnedPositionalEncoding(token_dim, num_clips)
+         )
+         self.norm_pre = nn.LayerNorm(token_dim, eps=1e-6) if pre_norm else nn.Identity()
+
+     def forward(self, x):
+         x = x  # (F, T)
+         x = self.conv1d(x)  # (F, T) -> (dim, T/t)
+         x = self.act(x)
+         x = x.transpose(1, 2)  # (dim, T/t) -> (T/t, dim)
+         x = self.pos_encoder(x)  # add position embeds
+         x = self.norm_pre(x)
+         return x
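To make the token-count comments in `STTokenizer` concrete, here is a small sketch of the arithmetic. The 128 mel bins, 1280 frames, and clip sizes 5 (temporal) and 3 (spectral) are the figures quoted in the in-code comments; in practice they come from the chosen config.

```python
import math

F, T = 128, 1280       # mel bins x time frames (values from the in-code comments)
t_clip, f_clip = 5, 3  # temporal and spectral clip sizes

temporal_tokens = math.floor((T - t_clip) / t_clip + 1)  # 256
spectral_tokens = math.floor((F - f_clip) / f_clip + 1)  # 42
print(temporal_tokens + spectral_tokens)   # 298 spectro-temporal tokens
print((T * F) // (t_clip * f_clip))        # 10922 patches for a comparable ViT
```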
sonics/layers/transformer.py ADDED
@@ -0,0 +1,176 @@
+ import torch.nn as nn
+ from typing import Optional
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torch.utils.checkpoint
+ from torch.jit import Final
+
+ from timm.layers import (
+     Mlp,
+     DropPath,
+     use_fused_attn,
+ )
+
+
+ class Attention(nn.Module):
+     fused_attn: Final[bool]
+
+     def __init__(
+         self,
+         dim: int,
+         num_heads: int = 8,
+         qkv_bias: bool = False,
+         qk_norm: bool = False,
+         attn_drop: float = 0.0,
+         proj_drop: float = 0.0,
+         norm_layer: nn.Module = nn.LayerNorm,
+     ) -> None:
+         super().__init__()
+         assert dim % num_heads == 0, "dim should be divisible by num_heads"
+         self.num_heads = num_heads
+         self.head_dim = dim // num_heads
+         self.scale = self.head_dim**-0.5
+         self.fused_attn = use_fused_attn()
+
+         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+         self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+         self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+         self.attn_drop = nn.Dropout(attn_drop)
+         self.proj = nn.Linear(dim, dim)
+         self.proj_drop = nn.Dropout(proj_drop)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         B, N, C = x.shape
+         qkv = (
+             self.qkv(x)
+             .reshape(B, N, 3, self.num_heads, self.head_dim)
+             .permute(2, 0, 3, 1, 4)
+         )
+         q, k, v = qkv.unbind(0)
+         q, k = self.q_norm(q), self.k_norm(k)
+
+         if self.fused_attn:
+             x = F.scaled_dot_product_attention(
+                 q,
+                 k,
+                 v,
+                 dropout_p=self.attn_drop.p if self.training else 0.0,
+             )
+         else:
+             q = q * self.scale
+             attn = q @ k.transpose(-2, -1)
+             attn = attn.softmax(dim=-1)
+             attn = self.attn_drop(attn)
+             x = attn @ v
+
+         x = x.transpose(1, 2).reshape(B, N, C)
+         x = self.proj(x)
+         x = self.proj_drop(x)
+         return x
+
+
+ class LayerScale(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         init_values: float = 1e-5,
+         inplace: bool = False,
+     ) -> None:
+         super().__init__()
+         self.inplace = inplace
+         self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         return x.mul_(self.gamma) if self.inplace else x * self.gamma
+
+
+ class TransformerBlock(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         num_heads: int,
+         mlp_ratio: float = 4.0,
+         qkv_bias: bool = False,
+         qk_norm: bool = False,
+         proj_drop: float = 0.0,
+         attn_drop: float = 0.0,
+         init_values: Optional[float] = None,
+         drop_path: float = 0.0,
+         act_layer: nn.Module = nn.GELU,
+         norm_layer: nn.Module = nn.LayerNorm,
+         mlp_layer: nn.Module = Mlp,
+     ) -> None:
+         super().__init__()
+         self.norm1 = norm_layer(dim)
+         self.attn = Attention(
+             dim,
+             num_heads=num_heads,
+             qkv_bias=qkv_bias,
+             qk_norm=qk_norm,
+             attn_drop=attn_drop,
+             proj_drop=proj_drop,
+             norm_layer=norm_layer,
+         )
+         self.ls1 = (
+             LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+         )
+         self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+         self.norm2 = norm_layer(dim)
+         self.mlp = mlp_layer(
+             in_features=dim,
+             hidden_features=int(dim * mlp_ratio),
+             act_layer=act_layer,
+             drop=proj_drop,
+         )
+         self.ls2 = (
+             LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+         )
+         self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x))))
+         x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
+         return x
+
+
+ class Transformer(nn.Module):
+     """
+     Transformer layer, taken from timm library
+     """
+
+     def __init__(
+         self,
+         embed_dim: int,
+         num_heads: int,
+         num_layers: int,
+         mlp_ratio: float = 4.0,
+         qkv_bias: bool = False,
+         qk_norm: bool = False,
+         proj_drop: float = 0.0,
+         attn_drop: float = 0.0,
+         drop_path: float = 0.0,
+     ):
+         super(Transformer, self).__init__()
+         self.blocks = nn.ModuleList(
+             [
+                 TransformerBlock(
+                     dim=embed_dim,
+                     num_heads=num_heads,
+                     mlp_ratio=mlp_ratio,
+                     qkv_bias=qkv_bias,
+                     qk_norm=qk_norm,
+                     proj_drop=proj_drop,
+                     attn_drop=attn_drop,
+                     drop_path=drop_path,
+                 )
+                 for _ in range(num_layers)
+             ]
+         )
+
+     def forward(self, x):
+         for block in self.blocks:
+             x = block(x)
+         return x
sonics/models/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from sonics.models.model import AudioClassifier
+ from sonics.models.spectttra import SpecTTTra
+ from sonics.models.vit import ViT
sonics/models/hf_model.py ADDED
@@ -0,0 +1,108 @@
+ import os
+ import json
+ import torch
+ import torch.nn as nn
+ from .model import AudioClassifier
+ from ..utils.config import dict2cfg, cfg2dict
+ from huggingface_hub import HfApi, create_repo, hf_hub_download
+
+
+ class HFAudioClassifier(AudioClassifier):
+     """Hugging Face compatible AudioClassifier model"""
+
+     def __init__(self, config):
+         # Accept either a plain dict (e.g. loaded from config.json) or an already-converted config
+         self.config = dict2cfg(config) if isinstance(config, dict) else config
+         super().__init__(self.config)
+
+     @classmethod
+     def from_pretrained(cls, model_id, cache_dir=None, map_location="cpu", strict=False):
+         # Check if model_id is a local path
+         is_local = os.path.exists(model_id)
+
+         if is_local:
+             # Load from local checkpoint
+             config_file = os.path.join(model_id, "config.json")
+             model_file = os.path.join(model_id, "pytorch_model.bin")
+         else:
+             # Download from HF Hub
+             config_file = hf_hub_download(repo_id=model_id, filename="config.json", cache_dir=cache_dir)
+             model_file = hf_hub_download(repo_id=model_id, filename="pytorch_model.bin", cache_dir=cache_dir)
+
+         # Read config
+         config = None
+         if os.path.exists(config_file):
+             with open(config_file, "r", encoding="utf-8") as f:
+                 config = json.load(f)
+
+         # Create model
+         model = cls(config)
+
+         # Load weights
+         if os.path.exists(model_file):
+             state_dict = torch.load(model_file, map_location=torch.device(map_location))
+             model.load_state_dict(state_dict, strict=strict)
+             model.eval()
+         else:
+             raise FileNotFoundError(f"Model weights not found at {model_file}")
+
+         return model
+
+     def push_to_hub(self, repo_id, token=None, commit_message=None, private=False):
+         """Push model and config to Hugging Face Hub.
+
+         Args:
+             repo_id (str): Repository ID on HuggingFace Hub (e.g., 'username/model-name')
+             token (str, optional): HuggingFace token. If None, will use token from ~/.huggingface/token
+             commit_message (str, optional): Commit message for the push
+             private (bool, optional): Whether to make the repository private
+         """
+
+         # Create repo if it doesn't exist
+         api = HfApi()
+         try:
+             create_repo(repo_id, private=private, token=token, exist_ok=True)
+         except Exception as e:
+             print(f"Repository creation failed: {e}")
+             return
+
+         # Save config
+         config = cfg2dict(self.config)
+         with open("config.json", "w", encoding="utf-8") as f:
+             json.dump(config, f, indent=2, sort_keys=True)
+
+         # Save model weights
+         torch.save(self.cpu().state_dict(), "pytorch_model.bin")
+         self.to(self.device if hasattr(self, 'device') else 'cuda' if torch.cuda.is_available() else 'cpu')  # restore device
+
+         # Push files to hub
+         files_to_push = ["config.json", "pytorch_model.bin"]
+         for file in files_to_push:
+             api.upload_file(
+                 path_or_fileobj=file,
+                 path_in_repo=file,
+                 repo_id=repo_id,
+                 token=token,
+                 commit_message=commit_message or f"Upload {file}"
+             )
+             os.remove(file)  # Clean up local files
+
+     def save_pretrained(self, save_directory: str, **kwargs):
+         """Save model weights and configuration to a directory.
+
+         Args:
+             save_directory (str): Directory to save files in
+             **kwargs: Additional arguments passed to save functions
+         """
+         os.makedirs(save_directory, exist_ok=True)
+
+         # Save config
+         config = cfg2dict(self.config)
+         config_file = os.path.join(save_directory, "config.json")
+         with open(config_file, "w", encoding="utf-8") as f:
+             json.dump(config, f, indent=2, sort_keys=True)
+
+         # Save model weights
+         model_file = os.path.join(save_directory, "pytorch_model.bin")
+         torch.save(self.cpu().state_dict(), model_file)
+         self.to(self.device if hasattr(self, 'device') else 'cuda' if torch.cuda.is_available() else 'cpu')  # restore device
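A brief sketch of how the loading and saving paths above fit together: the Hub model ID is one taken from `app.py`, and the local directory name is a placeholder.

```python
from sonics import HFAudioClassifier

# Download from the Hub, then round-trip through a local folder.
model = HFAudioClassifier.from_pretrained("awsaf49/sonics-spectttra-alpha-5s")
model.save_pretrained("./sonics-alpha-5s-local")   # writes config.json + pytorch_model.bin
reloaded = HFAudioClassifier.from_pretrained("./sonics-alpha-5s-local")
```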
sonics/models/model.py ADDED
@@ -0,0 +1,128 @@
+ from sonics.models.spectttra import SpecTTTra
+ from sonics.models.vit import ViT
+ from sonics.layers.feature import FeatureExtractor
+ from sonics.layers.augment import AugmentLayer
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import timm
+
+
+ def use_global_pool(model_name):
+     """
+     Check if the model requires global pooling or not.
+     """
+     no_global_pool = ["timm"]
+     return False if any(x in model_name for x in no_global_pool) else True
+
+
+ def get_embed_dim(model_name, encoder):
+     """
+     Get the embedding dimension of the encoder.
+     """
+     if "timm" in model_name:
+         return encoder.head_hidden_size
+     else:
+         return encoder.embed_dim
+
+
+ def use_init_weights(model_name):
+     """
+     Check if the model requires initialization of weights or not.
+     """
+     has_init_weights = ["timm"]
+     return False if any(x in model_name for x in has_init_weights) else True
+
+
+ class AudioClassifier(nn.Module):
+     def __init__(self, cfg):
+         super().__init__()
+
+         self.model_name = cfg.model.name
+         self.input_shape = cfg.model.input_shape
+         self.num_classes = cfg.num_classes
+         self.ft_extractor = FeatureExtractor(cfg)
+         self.augment = AugmentLayer(cfg)
+         self.encoder = self.get_encoder(cfg)
+         self.embed_dim = get_embed_dim(self.model_name, self.encoder)
+         self.classifier = nn.Linear(self.embed_dim, self.num_classes)
+         self.use_init_weights = getattr(cfg.model, "use_init_weights", True)
+
+         # Initialize weights
+         (
+             self.initialize_weights()
+             if self.use_init_weights and use_init_weights(self.model_name)
+             else None
+         )
+
+     def get_encoder(self, cfg):
+         if cfg.model.name == "SpecTTTra":
+             model = SpecTTTra(
+                 input_spec_dim=cfg.model.input_shape[0],
+                 input_temp_dim=cfg.model.input_shape[1],
+                 embed_dim=cfg.model.embed_dim,
+                 t_clip=cfg.model.t_clip,
+                 f_clip=cfg.model.f_clip,
+                 num_heads=cfg.model.num_heads,
+                 num_layers=cfg.model.num_layers,
+                 pre_norm=cfg.model.pre_norm,
+                 pe_learnable=cfg.model.pe_learnable,
+                 pos_drop_rate=getattr(cfg.model, "pos_drop_rate", 0.0),
+                 attn_drop_rate=getattr(cfg.model, "attn_drop_rate", 0.0),
+                 proj_drop_rate=getattr(cfg.model, "proj_drop_rate", 0.0),
+                 mlp_ratio=getattr(cfg.model, "mlp_ratio", 4.0),
+             )
+         elif cfg.model.name == "ViT":
+             model = ViT(
+                 image_size=cfg.model.input_shape,
+                 patch_size=cfg.model.patch_size,
+                 embed_dim=cfg.model.embed_dim,
+                 num_heads=cfg.model.num_heads,
+                 num_layers=cfg.model.num_layers,
+                 pe_learnable=cfg.model.pe_learnable,
+                 patch_norm=getattr(cfg.model, "patch_norm", False),
+                 pos_drop_rate=getattr(cfg.model, "pos_drop_rate", 0.0),
+                 attn_drop_rate=getattr(cfg.model, "attn_drop_rate", 0.0),
+                 proj_drop_rate=getattr(cfg.model, "proj_drop_rate", 0.0),
+                 mlp_ratio=getattr(cfg.model, "mlp_ratio", 4.0),
+             )
+         elif "timm" in cfg.model.name:
+             model_name = cfg.model.name.replace("timm-", "")
+             model = timm.create_model(
+                 model_name,
+                 pretrained=cfg.model.pretrained,
+                 in_chans=1,
+                 num_classes=0,
+             )
+         else:
+             raise ValueError(f"Model {cfg.model.name} not supported in V1.")
+         return model
+
+     def forward(self, audio, y=None):
+         spec = self.ft_extractor(audio)  # shape: (batch_size, n_mels, n_frames)
+         if self.training:
+             spec, y = self.augment(spec, y)
+         spec = spec.unsqueeze(1)  # shape: (batch_size, 1, n_mels, n_frames)
+         spec = F.interpolate(spec, size=tuple(self.input_shape), mode="bilinear")
+         features = self.encoder(spec)
+         embeds = features.mean(dim=1) if use_global_pool(self.model_name) else features
+         preds = self.classifier(embeds)
+         return preds if y is None else (preds, y)
+
+     def initialize_weights(self):
+         for name, module in self.named_modules():
+             if isinstance(module, nn.Linear):
+                 if name.startswith("classifier"):
+                     nn.init.zeros_(module.weight)
+                     nn.init.constant_(module.bias, 0.0)
+                 else:
+                     nn.init.xavier_uniform_(module.weight)
+                     if module.bias is not None:
+                         nn.init.normal_(module.bias, std=1e-6)
+             elif isinstance(module, nn.Conv2d) or isinstance(module, nn.Conv1d):
+                 nn.init.kaiming_normal_(
+                     module.weight, mode="fan_out", nonlinearity="relu"
+                 )
+                 if module.bias is not None:
+                     nn.init.zeros_(module.bias)
+             elif hasattr(module, "init_weights"):
+                 module.init_weights()
sonics/models/spectttra.py ADDED
@@ -0,0 +1,85 @@
+ import torch.nn as nn
+ from sonics.layers import Transformer
+ from sonics.layers.tokenizer import STTokenizer
+
+
+ class SpecTTTra(nn.Module):
+     def __init__(
+         self,
+         input_spec_dim,
+         input_temp_dim,
+         embed_dim,
+         t_clip,
+         f_clip,
+         num_heads,
+         num_layers,
+         pre_norm=False,
+         pe_learnable=False,
+         pos_drop_rate=0.0,
+         attn_drop_rate=0.0,
+         proj_drop_rate=0.0,
+         mlp_ratio=4.0,
+     ):
+         super(SpecTTTra, self).__init__()
+         self.input_spec_dim = input_spec_dim
+         self.input_temp_dim = input_temp_dim
+         self.embed_dim = embed_dim
+         self.t_clip = t_clip
+         self.f_clip = f_clip
+         self.num_heads = num_heads
+         self.num_layers = num_layers
+         self.pre_norm = (
+             pre_norm  # applied after tokenization before transformer (used in CLIP)
+         )
+         self.pe_learnable = pe_learnable  # learned positional encoding
+         self.pos_drop_rate = pos_drop_rate
+         self.attn_drop_rate = attn_drop_rate
+         self.proj_drop_rate = proj_drop_rate
+         self.mlp_ratio = mlp_ratio
+
+         self.st_tokenizer = STTokenizer(
+             input_spec_dim,
+             input_temp_dim,
+             t_clip,
+             f_clip,
+             embed_dim,
+             pre_norm=pre_norm,
+             pe_learnable=pe_learnable,
+         )
+         self.pos_drop = nn.Dropout(p=pos_drop_rate)
+         self.transformer = Transformer(
+             embed_dim,
+             num_heads,
+             num_layers,
+             attn_drop=self.attn_drop_rate,
+             proj_drop=self.proj_drop_rate,
+             mlp_ratio=self.mlp_ratio,
+         )
+
+     def forward(self, x):
+         # Squeeze the channel dimension if it exists
+         if x.dim() == 4:
+             x = x.squeeze(1)
+
+         # Spectro-temporal tokenization
+         spectro_temporal_tokens = self.st_tokenizer(x)
+
+         # Positional dropout
+         spectro_temporal_tokens = self.pos_drop(spectro_temporal_tokens)
+
+         # Transformer
+         output = self.transformer(spectro_temporal_tokens)  # shape: (B, T/t + F/f, dim)
+
+         return output
+
+
+ # Example usage:
+ input_spec_dim = 384
+ input_temp_dim = 128
+ embed_dim = 512
+ t_clip = 20  # This means t
+ f_clip = 10  # This means f
+ num_heads = 8
+ num_layers = 6
+ dim_feedforward = 512
+ num_classes = 10
sonics/models/vit.py ADDED
@@ -0,0 +1,101 @@
+ import torch
+ import torch.nn as nn
+ from sonics.layers import (
+     SinusoidPositionalEncoding,
+     LearnedPositionalEncoding,
+     Transformer,
+ )
+ from timm.layers import PatchEmbed
+
+
+ class ViT(nn.Module):
+     def __init__(
+         self,
+         image_size,
+         patch_size,
+         embed_dim,
+         num_heads,
+         num_layers,
+         pe_learnable=False,
+         patch_norm=False,
+         pos_drop_rate=0.0,
+         attn_drop_rate=0.0,
+         proj_drop_rate=0.0,
+         mlp_ratio=4.0,
+     ):
+         super().__init__()
+         assert (
+             image_size[0] % patch_size == 0 and image_size[1] % patch_size == 0
+         ), "Image dimensions must be divisible by patch size."
+
+         self.patch_size = patch_size
+         self.embed_dim = embed_dim
+         self.num_heads = num_heads
+         self.num_layers = num_layers
+         self.pe_learnable = pe_learnable
+         self.patch_norm = patch_norm
+         self.pos_drop_rate = pos_drop_rate
+         self.attn_drop_rate = attn_drop_rate
+         self.proj_drop_rate = proj_drop_rate
+         self.mlp_ratio = mlp_ratio
+
+         self.num_patches = (image_size[0] // patch_size) * (image_size[1] // patch_size)
+
+         # self.patch_conv = nn.Conv2d(
+         #     1, embed_dim, kernel_size=patch_size, stride=patch_size
+         # )  # Original ViT has 3 input channels
+         self.patch_encoder = PatchEmbed(
+             img_size=image_size,
+             patch_size=patch_size,
+             in_chans=1,
+             embed_dim=embed_dim,
+             norm_layer=nn.LayerNorm if patch_norm else None,
+         )
+         self.pos_encoder = (
+             SinusoidPositionalEncoding(embed_dim)
+             if not pe_learnable
+             else LearnedPositionalEncoding(embed_dim, self.num_patches)
+         )
+         self.pos_drop = nn.Dropout(p=pos_drop_rate)
+
+         self.transformer = Transformer(
+             embed_dim,
+             num_heads,
+             num_layers,
+             attn_drop=self.attn_drop_rate,
+             proj_drop=self.proj_drop_rate,
+             mlp_ratio=self.mlp_ratio,
+         )
+
+     def forward(self, x):
+         B = x.shape[0]
+         # x = x.unsqueeze(1)  # B x 1 x n_mels x n_frames # taken care of in the AudioClassifier
+         if x.dim() == 3:
+             x = x.unsqueeze(1)  # timm PatchEmbed expects 4D tensor
+
+         # Convolutional patch embedding
+         # patches = self.patch_conv(x)  # B x embed_dim x num_patches_h x num_patches_w
+         patches = self.patch_encoder(x)
+
+         # # Reshape patches
+         # patches = patches.permute(
+         #     0, 2, 3, 1
+         # ).contiguous()  # B x num_patches_h x num_patches_w x embed_dim
+         # patches = patches.view(B, -1, patches.size(-1))  # B x num_patches x embed_dim
+
+         # Add positional embeddings
+         embeddings = self.pos_encoder(patches)
+
+         # Positional dropout
+         embeddings = self.pos_drop(embeddings)
+
+         # Transformer encoding
+         output = self.transformer(embeddings)  # B x num_patches x embed_dim
+
+         return output
+
+
+ batch_size = 1
+ input_height = 128
+ input_width = 384 * 6 * 4
+ patch_size = 16
sonics/utils/config.py ADDED
@@ -0,0 +1,24 @@
+ from types import SimpleNamespace
+
+
+ def dict2cfg(d):
+     """
+     Converts a dictionary into a SimpleNamespace
+     """
+     for k, v in d.items():
+         if type(v) == dict:
+             d[k] = SimpleNamespace(**v)
+     c = SimpleNamespace(**d)
+     c.audio.max_len = int(c.audio.max_time * c.audio.sample_rate)
+     return c
+
+
+ def cfg2dict(cfg):
+     """
+     Converts a SimpleNamespace into a dictionary without modifying the original cfg.
+     """
+     d = vars(cfg).copy()  # Make a shallow copy of the cfg's __dict__
+     for k, v in d.items():
+         if isinstance(v, SimpleNamespace):
+             d[k] = cfg2dict(v)  # Recursively convert nested SimpleNamespace objects
+     return d
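A brief usage sketch of the two helpers above. The dict here is a hypothetical minimal config; `audio.max_time` and `audio.sample_rate` are the only keys `dict2cfg` itself requires, since it derives `audio.max_len` from them.

```python
from sonics.utils.config import dict2cfg, cfg2dict

cfg = dict2cfg({"num_classes": 1, "audio": {"max_time": 5, "sample_rate": 16000}})
print(cfg.audio.max_len)       # 80000 samples (5 s at 16 kHz)
print(cfg2dict(cfg)["audio"])  # back to a plain nested dict
```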
sonics/utils/dataset.py ADDED
@@ -0,0 +1,137 @@
+ from torch.utils.data import Dataset
+ from torch.utils.data import DataLoader
+ import numpy as np
+ import torch
+ import librosa
+
+
+ class AudioDataset(Dataset):
+     def __init__(
+         self,
+         filepaths,
+         labels,
+         skip_times=None,
+         num_classes=1,
+         normalize="std",
+         max_len=32000,
+         random_sampling=True,
+         train=False,
+         **kwargs
+     ):
+         super().__init__(**kwargs)
+         self.filepaths = filepaths
+         self.labels = labels
+         self.skip_times = skip_times
+         self.num_classes = num_classes
+         self.random_sampling = random_sampling
+         self.normalize = normalize
+         self.max_len = max_len
+         self.train = train
+         if not self.train:
+             assert (
+                 not self.random_sampling
+             ), "Ensure random_sampling is disabled for val"
+
+     def __len__(self):
+         return len(self.filepaths)
+
+     def crop_or_pad(self, audio, max_len, random_sampling=True):
+         audio_len = audio.shape[0]
+         if random_sampling:
+             diff_len = abs(max_len - audio_len)
+             if audio_len < max_len:
+                 pad1 = np.random.randint(0, diff_len)
+                 pad2 = diff_len - pad1
+                 audio = np.pad(audio, (pad1, pad2), mode="constant")
+             elif audio_len > max_len:
+                 idx = np.random.randint(0, diff_len)
+                 audio = audio[idx : (idx + max_len)]
+         else:
+             if audio_len < max_len:
+                 audio = np.pad(audio, (0, max_len - audio_len), mode="constant")
+             elif audio_len > max_len:
+                 # Crop from the beginning
+                 # audio = audio[:max_len]
+
+                 # Crop from 3/4 of the audio
+                 # eq: l = (3x + t + x) => idx = 3x = (l - t) / 4 * 3
+                 idx = int((audio_len - max_len) / 4 * 3)
+                 audio = audio[idx : (idx + max_len)]
+         return audio
+
+     def __getitem__(self, idx):
+         # Load audio
+         audio, sr = librosa.load(self.filepaths[idx], sr=None)
+         target = np.array([self.labels[idx]])
+
+         # Trim start of audio (torchaudio.transforms.vad)
+         if self.skip_times is not None:
+             skip_time = self.skip_times[idx]
+             audio = audio[int(skip_time * sr):]
+
+         # Ensure fixed length
+         audio = self.crop_or_pad(audio, self.max_len, self.random_sampling)
+
+         if self.normalize == "std":
+             audio /= np.maximum(np.std(audio), 1e-6)
+         elif self.normalize == "minmax":
+             audio -= np.min(audio)
+             audio /= np.maximum(np.max(audio), 1e-6)
+
+         audio = torch.from_numpy(audio).float()
+         target = torch.from_numpy(target).float().squeeze()
+         return {
+             "audio": audio,
+             "target": target,
+         }
+
+
+ def get_dataloader(
+     filepaths,
+     labels,
+     skip_times=None,
+     batch_size=8,
+     num_classes=1,
+     max_len=32000,
+     random_sampling=True,
+     normalize="std",
+     train=False,
+     # drop_last=False,
+     pin_memory=True,
+     worker_init_fn=None,
+     collate_fn=None,
+     num_workers=0,
+     distributed=False,
+ ):
+     dataset = AudioDataset(
+         filepaths,
+         labels,
+         skip_times=skip_times,
+         num_classes=num_classes,
+         max_len=max_len,
+         random_sampling=random_sampling,
+         normalize=normalize,
+         train=train,
+     )
+
+     if distributed:
+         # drop_last is set to True to validate properly
+         # Ref: https://discuss.pytorch.org/t/how-do-i-validate-with-pytorch-distributeddataparallel/172269/8
+         sampler = torch.utils.data.distributed.DistributedSampler(
+             dataset, shuffle=train, drop_last=not train
+         )
+     else:
+         sampler = None
+
+     dataloader = DataLoader(
+         dataset,
+         batch_size=batch_size,
+         shuffle=(sampler is None) and train,
+         # drop_last=drop_last,
+         num_workers=num_workers,
+         pin_memory=pin_memory,
+         worker_init_fn=worker_init_fn,
+         collate_fn=collate_fn,
+         sampler=sampler,
+     )
+     return dataloader
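A minimal sketch of `get_dataloader`, assuming the dataset layout from the README; the file paths and labels below are placeholders (0 = real, 1 = fake), and `max_len` mirrors `cfg.audio.max_len` for a 5 s clip at 16 kHz.

```python
from sonics import get_dataloader

filepaths = ["dataset/real_songs/xxx.mp3", "dataset/fake_songs/yyy.mp3"]  # placeholders
labels = [0, 1]

train_loader = get_dataloader(
    filepaths, labels, batch_size=2, max_len=5 * 16000,
    random_sampling=True, train=True,
)
batch = next(iter(train_loader))
print(batch["audio"].shape, batch["target"].shape)  # e.g. torch.Size([2, 80000]) torch.Size([2])
```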
sonics/utils/losses.py ADDED
@@ -0,0 +1,65 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+
+ class BCEWithLogitsLoss(nn.BCEWithLogitsLoss):
+     def __init__(self, label_smoothing=0.0, **kwargs):
+         super(BCEWithLogitsLoss, self).__init__(**kwargs)
+         self.label_smoothing = label_smoothing
+
+     def forward(self, input, target):
+         if self.label_smoothing:
+             target = target * (1.0 - self.label_smoothing) + 0.5 * self.label_smoothing
+         return super(BCEWithLogitsLoss, self).forward(input, target)
+
+
+ class SigmoidFocalLoss(nn.Module):
+     def __init__(self, alpha=1, gamma=2, label_smoothing=0.0, reduction="mean"):
+         """
+         Args:
+             alpha (float): Weighting factor in range (0,1) to balance positive vs negative examples.
+             gamma (float): Focusing parameter to reduce the relative loss for well-classified examples.
+             label_smoothing (float): Label smoothing factor to reduce the confidence of the true label.
+             reduction (str): Specifies the reduction to apply to the output: 'none' | 'mean' | 'sum'.
+                 'none': no reduction will be applied,
+                 'mean': the sum of the output will be divided by the number of elements in the output,
+                 'sum': the output will be summed.
+         """
+         super(SigmoidFocalLoss, self).__init__()
+         self.alpha = alpha
+         self.gamma = gamma
+         self.label_smoothing = label_smoothing
+         self.reduction = reduction
+
+     def forward(self, input, target):
+         """
+         Args:
+             input (Tensor): Predicted logits for each example.
+             target (Tensor): Ground truth binary labels (0 or 1) for each example.
+         """
+         if self.label_smoothing:
+             target = target * (1.0 - self.label_smoothing) + 0.5 * self.label_smoothing
+
+         p = torch.sigmoid(input)
+
+         ce_loss = F.binary_cross_entropy_with_logits(input, target, reduction="none")
+         p_t = p * target + (1 - p) * (1 - target)
+         loss = ce_loss * ((1 - p_t) ** self.gamma)
+
+         if self.alpha >= 0:
+             alpha_t = self.alpha * target + (1 - self.alpha) * (1 - target)
+             loss = alpha_t * loss
+
+         # Check reduction option and return loss accordingly
+         if self.reduction == "none":
+             pass
+         elif self.reduction == "mean":
+             loss = loss.mean()
+         elif self.reduction == "sum":
+             loss = loss.sum()
+         else:
+             raise ValueError(
+                 f"Invalid value for arg 'reduction': '{self.reduction}'\n Supported reduction modes: 'none', 'mean', 'sum'"
+             )
+         return loss
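A small usage sketch of `SigmoidFocalLoss` on a toy batch; the `alpha=0.25` value is only illustrative (the class default is 1), and the logits and targets are made up.

```python
import torch

criterion = SigmoidFocalLoss(alpha=0.25, gamma=2, label_smoothing=0.1)
logits = torch.tensor([2.0, -1.5, 0.3])   # raw model outputs
targets = torch.tensor([1.0, 0.0, 1.0])   # 1 = fake, 0 = real
print(criterion(logits, targets))         # scalar loss (reduction="mean")
```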
sonics/utils/metrics.py ADDED
@@ -0,0 +1,149 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn import metrics
+
+ np.seterr(divide="ignore", invalid="ignore")
+
+
+ class AverageMeter:
+     def __init__(self):
+         self.reset()
+
+     def reset(self):
+         self.val = 0
+         self.avg = 0
+         self.sum = 0
+         self.count = 0
+
+     def update(self, val, n=1):
+         self.val = val
+         self.sum += val * n
+         self.count += n
+         self.avg = self.sum / self.count
+
+
+ class F1Meter:
+     def __init__(self, average="binary"):
+         self.average = average
+         self.reset()
+
+     def update(self, y_true, y_pred):
+         self.y_true = np.concatenate([self.y_true, y_true])
+         self.y_pred = np.concatenate([self.y_pred, y_pred])
+         self.avg = metrics.f1_score(self.y_true, self.y_pred, average=self.average)
+
+     def reset(self):
+         self.y_true = np.array([])
+         self.y_pred = np.array([])
+
+
+ class SensitivityMeter:
+     def __init__(self, average="binary"):
+         self.average = average
+         self.reset()
+
+     def update(self, y_true, y_pred):
+         self.y_true = np.concatenate([self.y_true, y_true])
+         self.y_pred = np.concatenate([self.y_pred, y_pred])
+         self.avg = metrics.recall_score(
+             self.y_true, self.y_pred, pos_label=1, average=self.average
+         )
+
+     def reset(self):
+         self.y_true = np.array([])
+         self.y_pred = np.array([])
+
+
+ class SpecificityMeter:
+     def __init__(self, average="binary"):
+         self.average = average
+         self.reset()
+
+     def update(self, y_true, y_pred):
+         self.y_true = np.concatenate([self.y_true, y_true])
+         self.y_pred = np.concatenate([self.y_pred, y_pred])
+         self.avg = metrics.recall_score(
+             self.y_true, self.y_pred, pos_label=0, average=self.average
+         )
+
+     def reset(self):
+         self.y_true = np.array([])
+         self.y_pred = np.array([])
+
+
+ class AccuracyMeter:
+     def __init__(self):
+         self.reset()
+
+     def update(self, y_true, y_pred):
+         self.y_true = np.concatenate([self.y_true, y_true])
+         self.y_pred = np.concatenate([self.y_pred, y_pred])
+         self.avg = metrics.balanced_accuracy_score(self.y_true, self.y_pred)
+
+     def reset(self):
+         self.y_true = np.array([])
+         self.y_pred = np.array([])
+
+
+ def get_part_result(test_pred_df):
+     # Create `singer` column to store whether the singer is seen or unseen
+     test_pred_df["singer"] = test_pred_df.artist_overlap.map(
+         lambda x: "seen" if x else "unseen"
+     )
+
+     # Create `fake_type` column to store different types of fake songs
+     test_pred_df["fake_type"] = test_pred_df.label
+
+     # Create `length` column to store different duration-type songs
+     test_pred_df["length"] = test_pred_df["duration_part"] = test_pred_df[
+         "duration"
+     ].map(lambda t: "short" if t <= 60 else ("long" if t > 120 else "medium"))
+
+     # Initialize an empty DataFrame to store results
+     part_result_df = pd.DataFrame()
+
+     # Loop through the specified categories
+     for cat in ["algorithm", "singer", "fake_type", "length"]:
+         # Filter the dataframe based on the condition for each category
+         if cat in ["algorithm", "fake_type"]:
+             cat_df = test_pred_df.query("target == 1")
+         elif cat == "singer":
+             cat_df = test_pred_df.query("target == 0")
+         else:
+             cat_df = test_pred_df.copy()
+
+         # Compute metrics for each partition
+         for part in cat_df[cat].unique():
+             part_df = cat_df[cat_df[cat] == part]
+             y_true = part_df.y_true.values.astype(int)
+             y_pred = (part_df.y_pred.values > 0.5).astype(int)
+
+             # Compute TPR for `algorithm`/`fake_type`, TNR for `singer`, and F1 for `length`
+             score = (
+                 metrics.recall_score(
+                     y_true, y_pred, pos_label=1 if cat != "singer" else 0
+                 )
+                 if cat != "length"
+                 else metrics.f1_score(y_true, y_pred, average="macro")
+             )
+
+             # Create a DataFrame for the current result
+             result_df = pd.DataFrame(
+                 {
+                     "category": [cat],
+                     "partition": [part],
+                     "score": [score],
+                     "size": [len(part_df)],
+                 }
+             )
+
+             # Concatenate the result with the existing DataFrame
+             part_result_df = pd.concat([part_result_df, result_df], ignore_index=True)
+
+     # Create a dictionary with the results
+     result_dict = {
+         f"{row['category']}/{row['partition']}": row["score"]
+         for _, row in part_result_df.iterrows()
+     }
+
+     return part_result_df, result_dict
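A short sketch of how the meters above might be driven during validation (hypothetical arrays; `get_part_result` additionally expects a prediction DataFrame with `artist_overlap`, `label`, `duration`, `algorithm`, `target`, `y_true`, and `y_pred` columns):

```
import numpy as np
from sonics.utils.metrics import F1Meter, AccuracyMeter

f1 = F1Meter(average="binary")
acc = AccuracyMeter()

# Update once per validation batch; .avg always holds the running score
for y_true, y_pred in [(np.array([1, 0, 1]), np.array([1, 0, 0])),
                       (np.array([0, 1]), np.array([0, 1]))]:
    f1.update(y_true, y_pred)
    acc.update(y_true, y_pred)

print(f1.avg, acc.avg)  # F1 and balanced accuracy over everything seen so far
```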
sonics/utils/perf.py ADDED
@@ -0,0 +1,107 @@
+ import time
+ import torch
+ import pandas as pd
+ from fvcore.nn import FlopCountAnalysis, ActivationCountAnalysis
+
+
+ def profile_model(model, input_tensor, display=False):
+     flops = calculate_flops(model, input_tensor[0:1, ...])  # (1, n_mels, n_frames)
+     acts = calculate_activations(model, input_tensor[0:1, ...])  # (1, n_mels, n_frames)
+     params = calculate_params(model)
+     speed = calculate_speed(model, input_tensor[0:1, ...])  # (1, n_mels, n_frames)
+     memory = calculate_memory(model, input_tensor)  # (B, n_mels, n_frames)
+     profile_data = {
+         "Metric": [
+             "FLOPs (G)",
+             "Activations (M)",
+             "Params (M)",
+             "Memory (GB)",
+             "Speed (A/S)",
+         ],
+         "Value": [flops, acts, params, memory, speed],
+     }
+     profile_df = pd.DataFrame(profile_data).set_index("Metric").T
+     if display:
+         print(profile_df.to_markdown(index=False, tablefmt="grid"))
+     return profile_df
+
+
+ def calculate_speed(model, input_tensor, num_runs=100, warmup_runs=5):
+     model.eval()
+
+     if torch.cuda.is_available():
+         # Warm-up iterations
+         with torch.no_grad():
+             for _ in range(warmup_runs):
+                 _ = model(input_tensor)
+
+         # Create CUDA events for timing
+         start = torch.cuda.Event(enable_timing=True)
+         end = torch.cuda.Event(enable_timing=True)
+
+         # Actual timing
+         start.record()
+         with torch.no_grad():
+             for _ in range(num_runs):
+                 _ = model(input_tensor)
+         end.record()
+
+         # Synchronize to wait for the events to be recorded
+         torch.cuda.synchronize()
+
+         # Calculate elapsed time
+         elapsed_time = start.elapsed_time(end)  # in milliseconds
+         latency = elapsed_time / num_runs / 1000.0  # convert to seconds
+     else:
+         # Warm-up iterations
+         with torch.no_grad():
+             for _ in range(warmup_runs):
+                 _ = model(input_tensor)
+
+         # Actual timing
+         start = time.time()
+         with torch.no_grad():
+             for _ in range(num_runs):
+                 _ = model(input_tensor)
+         end = time.time()
+
+         # Calculate elapsed time
+         latency = (end - start) / num_runs
+
+     return 1.0 / latency
+
+
+ def calculate_flops(model, input_tensor):
+     """Calculate FLOPs in GigaFLOPs.
+     Models often report MACs as FLOPs, e.g. ConvNeXt, timm library.
+     Reference:
+     1. https://github.com/huggingface/pytorch-image-models/blob/main/benchmark.py#L206
+     2. https://github.com/facebookresearch/fvcore/issues/69
+     """
+     flops = FlopCountAnalysis(model, input_tensor).total()
+     return flops / 1e9  # in GigaFLOPs
+
+
+ def calculate_activations(model, input_tensor):
+     acts = ActivationCountAnalysis(model, input_tensor).total()
+     return acts / 1e6  # in Millions
+
+
+ def calculate_params(model):
+     return sum(p.numel() for p in model.parameters()) / 1e6  # in Millions
+
+
+ def calculate_memory(model, input_tensor):
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+         torch.cuda.reset_peak_memory_stats(device=None)
+         start_memory = torch.cuda.max_memory_allocated(device=None)
+         model.train()
+         _ = model(input_tensor)
+         end_memory = torch.cuda.max_memory_allocated(device=None)
+         torch.cuda.empty_cache()
+         torch.cuda.reset_peak_memory_stats(device=None)
+         memory = (end_memory - start_memory) / (1024**3)  # in GB
+     else:
+         memory = 0
+     return memory
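A minimal sketch of calling `profile_model` (the toy model and spectrogram shape below are placeholders; per the comments above, the real input is a `(batch, n_mels, n_frames)` batch fed to the trained detector, and `to_markdown` additionally requires the `tabulate` package):

```
import torch
import torch.nn as nn
from sonics.utils.perf import profile_model

# Toy stand-in model and a fake (batch, n_mels, n_frames) spectrogram batch
model = nn.Sequential(nn.Flatten(), nn.Linear(128 * 256, 1))
batch = torch.randn(8, 128, 256)

# Reports FLOPs, activations, params, peak memory, and throughput
profile_df = profile_model(model, batch, display=True)
```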
sonics/utils/scheduler.py ADDED
@@ -0,0 +1,95 @@
+ import math
+ from torch.optim.lr_scheduler import LambdaLR
+ from functools import partial
+
+
+ def get_scheduler(
+     optimizer,
+     start_lr,
+     max_lr,
+     min_lr,
+     warmup_epochs,
+     sustain_epochs,
+     total_epochs,
+     decay,
+     mode="cosine",
+ ):
+     def lr_lambda(epoch):
+         if epoch < warmup_epochs:
+             return (max_lr - start_lr) / warmup_epochs * epoch + start_lr
+
+         elif epoch < warmup_epochs + sustain_epochs:
+             return max_lr
+
+         elif mode == "exponential":
+             return (max_lr - min_lr) * decay ** (
+                 epoch - warmup_epochs - sustain_epochs
+             ) + min_lr
+
+         elif mode == "step":
+             return max_lr * decay ** ((epoch - warmup_epochs - sustain_epochs) // 2)
+
+         elif mode == "cosine":
+             decay_total_epochs = total_epochs - warmup_epochs - sustain_epochs + 3
+             decay_epoch_index = epoch - warmup_epochs - sustain_epochs
+             phase = math.pi * decay_epoch_index / decay_total_epochs
+             cosine_decay = 0.5 * (1 + math.cos(phase))
+             return (max_lr - min_lr) * cosine_decay + min_lr
+
+         else:
+             raise ValueError(
+                 f"Unsupported mode '{mode}'. Supported modes are 'exponential', 'step', 'cosine'."
+             )
+
+     return LambdaLR(optimizer, lr_lambda)
+
+
+ def _get_cosine_schedule_with_warmup_lr_lambda(
+     current_step: int,
+     *,
+     num_warmup_steps: int,
+     num_training_steps: int,
+     num_cycles: float,
+ ):
+     if current_step < num_warmup_steps:
+         return float(current_step) / float(max(1, num_warmup_steps))
+     progress = float(current_step - num_warmup_steps) / float(
+         max(1, num_training_steps - num_warmup_steps)
+     )
+     return max(
+         0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
+     )
+
+
+ def get_cosine_schedule_with_warmup(
+     optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1
+ ):
+     """
+     Create a schedule with a learning rate that decreases following the values of the cosine function between the
+     initial lr set in the optimizer and 0, after a warmup period during which it increases linearly between 0 and the
+     initial lr set in the optimizer.
+
+     Args:
+         optimizer ([`~torch.optim.Optimizer`]):
+             The optimizer for which to schedule the learning rate.
+         num_warmup_steps (`int`):
+             The number of steps for the warmup phase.
+         num_training_steps (`int`):
+             The total number of training steps.
+         num_cycles (`float`, *optional*, defaults to 0.5):
+             The number of waves in the cosine schedule (the default is to just decrease from the max value to 0
+             following a half-cosine).
+         last_epoch (`int`, *optional*, defaults to -1):
+             The index of the last epoch when resuming training.
+
+     Return:
+         `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+     """
+
+     lr_lambda = partial(
+         _get_cosine_schedule_with_warmup_lr_lambda,
+         num_warmup_steps=num_warmup_steps,
+         num_training_steps=num_training_steps,
+         num_cycles=num_cycles,
+     )
+     return LambdaLR(optimizer, lr_lambda, last_epoch)
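A minimal sketch of wiring the cosine warmup schedule into a training loop (the optimizer, learning rate, and step counts are illustrative). Note that `get_scheduler`'s `lr_lambda` returns absolute learning-rate values rather than multipliers, so that variant is naturally paired with an optimizer created with `lr=1.0`, since `LambdaLR` multiplies the base learning rate by the lambda's output.

```
import torch
from sonics.utils.scheduler import get_cosine_schedule_with_warmup

model = torch.nn.Linear(10, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=1000
)

for step in range(1000):
    # ... forward pass and loss.backward() would go here ...
    optimizer.step()
    scheduler.step()  # linear warmup for 100 steps, then cosine decay toward 0
    optimizer.zero_grad()
```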
sonics/utils/seed.py ADDED
@@ -0,0 +1,22 @@
+ import random
+ import numpy as np
+ import torch
+ import os
+
+
+ def set_seed(seed, cudnn=False):
+     os.environ["PYTHONHASHSEED"] = str(seed)
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed(seed)
+         torch.cuda.manual_seed_all(seed)
+     # May affect performance, ref: https://pytorch.org/docs/stable/notes/randomness.html
+     if torch.backends.cudnn.is_available() and cudnn:
+         torch.backends.cudnn.deterministic = True
+         torch.backends.cudnn.benchmark = False
+
+
+ def worker_init_fn(worker_id):
+     np.random.seed(np.random.get_state()[1][0] + worker_id)
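And a small sketch of how these two helpers are typically used together (the dataset below is a placeholder; `worker_init_fn` gives each DataLoader worker a distinct NumPy seed derived from the current global state):

```
import torch
from torch.utils.data import DataLoader, TensorDataset
from sonics.utils.seed import set_seed, worker_init_fn

set_seed(42, cudnn=True)  # seed Python, NumPy, and PyTorch (plus deterministic cuDNN)

dataset = TensorDataset(torch.randn(100, 8), torch.randint(0, 2, (100,)))
loader = DataLoader(dataset, batch_size=16, shuffle=True,
                    num_workers=2, worker_init_fn=worker_init_fn)
```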