Text-to-Audio / utils /audio.py
yuancwang
init
5548515
raw
history blame
2.08 kB
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
import numpy as np
from numpy import linalg as LA
import librosa
import soundfile as sf
import librosa.filters
def load_audio_torch(wave_file, fs):
    """Load an audio file as a mono, magnitude-scaled torch tensor.

    Args:
        wave_file (str): path to wave file
        fs (int or None): target sample rate handed to ``librosa.load``.
            ``None`` asks librosa to keep the file's native sample rate.

    Returns:
        audio (tensor): 1-D float tensor of samples, shape (T,). When the
            scaled samples contain NaN or Inf an empty list is returned
            instead, so callers can skip the file.
        sample_rate (int): sample rate of the returned audio. This is the
            rate reported by the loader, so it is a real number even when
            ``fs`` is ``None``.
    """
    audio, sample_rate = librosa.load(wave_file, sr=fs, mono=True)
    # audio: (T,)
    assert len(audio) > 2, "audio must contain more than 2 samples"

    # Check the audio type (for soundfile loading backbone) - float, 8bit or 16bit
    if np.issubdtype(audio.dtype, np.integer):
        max_mag = -np.iinfo(audio.dtype).min
    else:
        max_mag = max(np.amax(audio), -np.amin(audio))

    # Snap the observed peak to a fixed divisor; the thresholds presumably
    # correspond to 32-bit PCM, 16-bit PCM and already-normalized float data.
    if max_mag > 2**15:
        max_mag = 2**31 + 1
    elif max_mag > 1.01:
        max_mag = 2**15 + 1
    else:
        max_mag = 1.0

    # Normalize the audio
    audio = torch.FloatTensor(audio.astype(np.float32)) / max_mag

    # Corrupt files can decode to NaN/Inf; signal that with an empty result.
    if not torch.isfinite(audio).all():
        return [], sample_rate or fs or 48000

    # Resample the audio to our target samplerate. librosa.load normally
    # does this already when ``fs`` is given, so this is only a safety net.
    if fs is not None and fs != sample_rate:
        audio = torch.from_numpy(
            librosa.core.resample(audio.numpy(), orig_sr=sample_rate, target_sr=fs)
        )
        sample_rate = fs

    # Return the effective rate rather than the raw ``fs`` argument, which
    # would be ``None`` when the caller asked for the native sample rate.
    return audio, sample_rate
def _stft(y, cfg):
    """Run ``librosa.stft`` on ``y`` with the framing settings held in ``cfg``.

    Args:
        y: audio signal forwarded to ``librosa.stft``
        cfg: config object providing ``n_fft``, ``hop_size`` and ``win_size``

    Returns:
        The complex spectrogram produced by ``librosa.stft``.
    """
    # Translate the project's config field names into librosa's keyword names.
    framing = {
        "n_fft": cfg.n_fft,
        "hop_length": cfg.hop_size,
        "win_length": cfg.win_size,
    }
    return librosa.stft(y=y, **framing)
def energy(wav, cfg):
    """Compute a per-frame energy value as the L2 norm of the STFT magnitudes.

    Args:
        wav: audio signal passed to ``_stft``
        cfg: config object with the STFT settings read by ``_stft``

    Returns:
        1-D array holding one norm value per row of the transposed
        magnitude spectrogram (one per frame, given librosa's
        (freq, frames) STFT layout).
    """
    spectrogram = _stft(wav, cfg)
    # Transpose so that each row is one frame: [T, F]
    frame_major = np.abs(spectrogram).T
    frame_energy = LA.norm(frame_major, axis=1)
    return frame_energy
def get_energy_from_tacotron(audio, _stft):
    """Extract a mel spectrogram and energy using a Tacotron-style STFT object.

    Args:
        audio: sequence of samples convertible by ``torch.FloatTensor``
        _stft: object exposing ``mel_spectrogram(tensor)`` that returns a
            ``(mel, energy)`` pair

    Returns:
        mel: the mel output exactly as returned by ``_stft.mel_spectrogram``
        energy: float32 numpy array, with the leading dimension squeezed out
    """
    # Add a leading dimension of size 1 and limit samples to [-1, 1].
    waveform = torch.FloatTensor(audio).unsqueeze(0)
    waveform = torch.clip(waveform, -1, 1)
    waveform = torch.autograd.Variable(waveform, requires_grad=False)

    mel, frame_energy = _stft.mel_spectrogram(waveform)

    # Drop the leading dimension and hand the energy back as float32 numpy.
    frame_energy = frame_energy.squeeze(0).numpy().astype(np.float32)
    return mel, frame_energy