hieugiaosu committed · Commit 7596274 · 1 Parent(s): e9096c9
Add application file
- app.py +102 -0
- checkpoints/concat_emb.pth +3 -0
- network/__init__.py +1 -0
- network/layers/ISTFT_layer.py +69 -0
- network/layers/STFT_Layer.py +67 -0
- network/layers/__init__.py +3 -0
- network/layers/__pycache__/ISTFT_layer.cpython-311.pyc +0 -0
- network/layers/__pycache__/STFTLayer.cpython-311.pyc +0 -0
- network/layers/__pycache__/STFT_Layer.cpython-311.pyc +0 -0
- network/layers/__pycache__/__init__.cpython-311.pyc +0 -0
- network/layers/__pycache__/film_layer.cpython-311.pyc +0 -0
- network/layers/film_layer.py +21 -0
- network/models/TF_gridnet_with_condition.py +638 -0
- network/models/__init__.py +2 -0
- network/models/__pycache__/TF_gridnet.cpython-311.pyc +0 -0
- network/models/__pycache__/TF_gridnet_with_condition.cpython-311.pyc +0 -0
- network/models/__pycache__/TF_gridnet_with_condition_new.cpython-311.pyc +0 -0
- network/models/__pycache__/__init__.cpython-311.pyc +0 -0
- network/models/__pycache__/embedding_model.cpython-311.pyc +0 -0
- network/models/__pycache__/new_style.cpython-311.pyc +0 -0
- network/models/__pycache__/tfGrideNetLOTH.cpython-311.pyc +0 -0
- network/models/embedding_model.py +14 -0
- network/modules/__init__.py +0 -0
- network/modules/attention.py +197 -0
- network/modules/gate_module.py +16 -0
- network/modules/input_tranformation.py +92 -0
- network/modules/output_transformation.py +47 -0
- network/modules/sequence_embed.py +138 -0
- network/modules/split_modules.py +121 -0
- network/modules/tf_gridnet_modules/__init__.py +3 -0
- network/modules/tf_gridnet_modules/__pycache__/__init__.cpython-311.pyc +0 -0
- network/modules/tf_gridnet_modules/__pycache__/deconv.cpython-311.pyc +0 -0
- network/modules/tf_gridnet_modules/__pycache__/dimension_embedding.cpython-311.pyc +0 -0
- network/modules/tf_gridnet_modules/__pycache__/tf_gridnet_block.cpython-311.pyc +0 -0
- network/modules/tf_gridnet_modules/deconv.py +20 -0
- network/modules/tf_gridnet_modules/dimension_embedding.py +15 -0
- network/modules/tf_gridnet_modules/tf_gridnet_block.py +255 -0
- network/utils/__init__.py +2 -0
- network/utils/enum_declare.py +6 -0
- network/utils/error_message.py +7 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,102 @@
import gradio as gr
import torch
import torchaudio
from torch.cuda.amp import autocast

from network.models import FilterBandTFGridnet, ResemblyzerVoiceEncoder

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'
model = FilterBandTFGridnet(n_layers=5, conditional_dim=256*2)
emb = ResemblyzerVoiceEncoder(device=device)
mixed_voice_tool = None

def load_voice(voice_path):
    voice, rate = torchaudio.load(voice_path)
    if rate != 16000:
        # Resample to the 16 kHz rate the rest of the app assumes
        # (the original resampled to 8000 here, contradicting rate = 16000).
        voice = torchaudio.functional.resample(voice, rate, 16000)
        rate = 16000
    voice = voice.float()
    return voice, rate

def mix(voice1_path, voice2_path, snr=0):
    global mixed_voice_tool
    voice1, _ = load_voice(voice1_path)
    voice2, _ = load_voice(voice2_path)
    mix = torchaudio.functional.add_noise(voice1, voice2, torch.tensor([float(snr)])).float()
    mixed_voice_tool = mix
    return gr.Audio((16000, mix[0].numpy()), type='numpy')

# def seprate_from_file(mixed_voice, ref_voice):

def seprate(mixed_voice_path, clean_voice_path, drop_down):
    if drop_down == 'From mixing tool':
        mixed_voice = mixed_voice_tool
    else:
        mixed_voice, rate = load_voice(mixed_voice_path)
    clean_voice, rate = load_voice(clean_voice_path)
    if clean_voice.shape[-1] < 16000*4:
        # Tile short reference clips until they cover 4 seconds.
        n = 16000*4 // clean_voice.shape[-1] + 1
        clean_voice = torch.cat([clean_voice]*n, dim=-1)
    clean_voice = clean_voice[:, :16000*4]
    if not model:
        return None
    model.to(device)
    model.eval()
    e = emb(clean_voice)
    e_mix = emb(mixed_voice)
    e = torch.cat([e, e_mix], dim=1)
    mixed_voice = torchaudio.functional.resample(mixed_voice, rate, 8000)
    with autocast():
        with torch.no_grad():
            yHat = model(
                mixed_voice,
                e,
            )
    yHat = torchaudio.functional.resample(yHat, 8000, 16000).numpy().astype('float32')
    audio = gr.Audio((16000, yHat[0]), type='numpy')
    return audio

def load_checkpoint(filepath):
    checkpoint = torch.load(
        filepath,
        weights_only=True,
        map_location=device,
    )
    model.load_state_dict(checkpoint)

with gr.Blocks() as demo:
    load_checkpoint('checkpoints/concat_emb.pth')
    with gr.Row():
        snr = gr.Slider(label='SNR', minimum=-10, maximum=10, step=1, value=0)
    with gr.Row():
        with gr.Column(scale=1, min_width=200):
            voice1 = gr.Audio(label='speaker 1', type='filepath')
        with gr.Column(scale=1, min_width=200):
            voice2 = gr.Audio(label='speaker 2', type='filepath')
        with gr.Column(scale=1, min_width=200):
            with gr.Row():
                mixed_voice = gr.Audio(label='Mixed voice')
            with gr.Row():
                btn = gr.Button("Mix voices", size='sm')
    btn.click(mix, inputs=[voice1, voice2, snr], outputs=mixed_voice)
    with gr.Row():
        choose_mix_source = gr.Label('Extract target speaker voice from mixed voice')
    with gr.Row():
        drop_down = gr.Dropdown(['From mixing tool', 'Upload'], label='Choose mixed voice source')
    with gr.Row():
        with gr.Column(scale=1, min_width=200):
            with gr.Row():
                mixed_voice_path = gr.Audio(label='Mixed voice', type='filepath')
        with gr.Column(scale=1, min_width=200):
            with gr.Row():
                ref_voice_path = gr.Audio(label='reference voice', type='filepath')
        with gr.Column(scale=1, min_width=200):
            with gr.Row():
                sep_voice = gr.Audio(label="Separate Voice")
    with gr.Row():
        btn = gr.Button("Separate voices", size='sm')
    btn.click(seprate, inputs=[mixed_voice_path, ref_voice_path, drop_down], outputs=sep_voice)
demo.launch()
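Outside the Gradio UI, the same pipeline can be driven directly. A minimal sketch, assuming the repo's network package is importable, the LFS checkpoint has been pulled, and resemblyzer is installed; mixed.wav and reference.wav are hypothetical 16 kHz input files:

import torch
import torchaudio
from network.models import FilterBandTFGridnet, ResemblyzerVoiceEncoder

model = FilterBandTFGridnet(n_layers=5, conditional_dim=256*2)
model.load_state_dict(torch.load('checkpoints/concat_emb.pth',
                                 weights_only=True, map_location='cpu'))
model.eval()
enc = ResemblyzerVoiceEncoder(device='cpu')

mixed, _ = torchaudio.load('mixed.wav')          # hypothetical file names
ref, _ = torchaudio.load('reference.wav')
clue = torch.cat([enc(ref), enc(mixed)], dim=1)  # (1, 512), as in seprate()
with torch.no_grad():
    est = model(torchaudio.functional.resample(mixed, 16000, 8000), clue)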
checkpoints/concat_emb.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6c0d3e074f574a701ef9231d787611c47e86556b7ef7d63e1e0f6e4a4a6caa73
size 39815976
network/__init__.py
ADDED
@@ -0,0 +1 @@
__package__ = "network"
network/layers/ISTFT_layer.py
ADDED
@@ -0,0 +1,69 @@
import torch
import torch.nn as nn
from typing import Optional
from ..utils import ErrorMessageUtil
from einops import rearrange

class InverseSTFTLayer(nn.Module):
    def __init__(
        self,
        n_fft: int = 128,
        win_length: Optional[int] = None,
        hop_length: int = 64,
        window: str = "hann",
        center: bool = True,
        normalized: bool = False,
        onesided: bool = True,
    ):
        super().__init__()
        self.n_fft = n_fft
        self.win_length = win_length if win_length else n_fft
        self.hop_length = hop_length
        self.center = center
        self.normalized = normalized
        self.onesided = onesided
        self.window = getattr(torch, f"{window}_window")

    def forward(self, input, audio_length: int):
        """Inverse STFT forward function.
        Args:
            input: (Batch, Freq, Frames) or (Batch, Channels, Freq, Frames)
        Returns:
            output: (Batch, Nsamples) or (Batch, Channel, Nsamples)

        Notice:
            input is a complex tensor
        """
        assert input.dim() == 4 or input.dim() == 3, ErrorMessageUtil.only_support_batch_input
        batch_size = input.size(0)
        multi_channel = (input.dim() == 4)
        if multi_channel:
            input = rearrange(input, "b c f t -> (b c) f t")
        window = self.window(
            self.win_length,
            dtype=input.real.dtype,
            device=input.device
        )
        istft_kwargs = dict(
            n_fft=self.n_fft,
            win_length=self.n_fft,
            hop_length=self.hop_length,
            center=self.center,
            window=window,
            length=audio_length,
            return_complex=False
        )

        wave = torch.istft(input, **istft_kwargs)
        if multi_channel:
            wave = rearrange(wave, "(b c) l -> b c l", b=batch_size)
        return wave

class ComplexTensorLayer(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, input):  # was `def forward(seal, input)`: typo for self
        assert input.shape[1] == 2, ErrorMessageUtil.complex_format_convert
        real = input[:, 0]
        imag = input[:, 1]

        return torch.complex(real, imag)
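For orientation, a standalone sketch (plain torch, no repo imports) of the (real, imag) channel convention that ComplexTensorLayer converts before torch.istft; shapes and parameters here are illustrative:

import torch

spec2ch = torch.randn(4, 2, 65, 100)                # (batch, re/im, freq, frames)
spec = torch.complex(spec2ch[:, 0], spec2ch[:, 1])  # pack the pair into one complex tensor
wave = torch.istft(spec, n_fft=128, hop_length=64,
                   window=torch.hann_window(128))   # hann with hop = n_fft/2 satisfies NOLA
print(wave.shape)                                   # (4, 6336) = (frames - 1) * hop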
network/layers/STFT_Layer.py
ADDED
@@ -0,0 +1,67 @@
import torch
import torch.nn as nn
from typing import Optional
from ..utils import ErrorMessageUtil
from einops import rearrange

class STFTLayer(nn.Module):
    def __init__(
        self,
        n_fft: int = 128,
        win_length: Optional[int] = None,
        hop_length: int = 64,
        window: str = "hann",
        center: bool = True,
        normalized: bool = False,
        onesided: bool = True,
        pad_mode: str = "reflect"
    ):
        super().__init__()
        self.n_fft = n_fft
        self.win_length = win_length if win_length else n_fft
        self.hop_length = hop_length
        self.center = center
        self.normalized = normalized
        self.onesided = onesided
        self.pad_mode = pad_mode
        self.window = getattr(torch, f"{window}_window")

    def forward(self, input: torch.Tensor):
        """STFT forward function.
        Args:
            input: (Batch, Nsamples) or (Batch, Channel, Nsamples)
        Returns:
            output: (Batch, Freq, Frames) or (Batch, Channels, Freq, Frames)
        Notice:
            output is a complex tensor
        """
        assert input.dim() == 2 or input.dim() == 3, ErrorMessageUtil.only_support_batch_input
        batch_size = input.size(0)
        multi_channel = (input.dim() == 3)
        if multi_channel:
            input = rearrange(input, "b c l -> (b c) l")
        window = self.window(
            self.win_length,
            dtype=input.dtype,
            device=input.device
        )

        stft_kwargs = dict(
            n_fft=self.n_fft,
            win_length=self.n_fft,
            hop_length=self.hop_length,
            center=self.center,
            window=window,
            pad_mode=self.pad_mode,
            return_complex=True
        )

        # Zero-pad the window to n_fft so the effective win_length is always n_fft.
        n_pad_left = (self.n_fft - window.shape[0]) // 2
        n_pad_right = self.n_fft - window.shape[0] - n_pad_left
        stft_kwargs["window"] = torch.cat(
            [torch.zeros(n_pad_left, device=input.device), window, torch.zeros(n_pad_right, device=input.device)], 0
        )

        output = torch.stft(input, **stft_kwargs)
        if multi_channel:
            output = rearrange(output, "(b c) f t -> b c f t", b=batch_size)
        return output
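A quick round-trip sanity check for the defaults used above (n_fft=128, hop=64, hann window, center=True); a standalone sketch with illustrative shapes, not part of the repo:

import torch

x = torch.randn(2, 6400)
win = torch.hann_window(128)
spec = torch.stft(x, n_fft=128, hop_length=64, window=win,
                  center=True, pad_mode="reflect", return_complex=True)
y = torch.istft(spec, n_fft=128, hop_length=64, window=win, length=6400)
print(torch.allclose(x, y, atol=1e-5))  # True up to float error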
network/layers/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .STFT_Layer import STFTLayer
from .ISTFT_layer import *
from .film_layer import FiLMLayer
network/layers/__pycache__/ISTFT_layer.cpython-311.pyc
ADDED
Binary file (4.13 kB).
network/layers/__pycache__/STFTLayer.cpython-311.pyc
ADDED
Binary file (3.66 kB).
network/layers/__pycache__/STFT_Layer.cpython-311.pyc
ADDED
Binary file (3.63 kB).
network/layers/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (358 Bytes).
network/layers/__pycache__/film_layer.cpython-311.pyc
ADDED
Binary file (2.04 kB).
network/layers/film_layer.py
ADDED
@@ -0,0 +1,21 @@
import torch.nn as nn
from einops import rearrange

class FiLMLayer(nn.Module):
    def __init__(self, channels, conditional_dim=256, apply_dim=1):
        super().__init__()
        self.alpha = nn.Linear(conditional_dim, channels)
        self.beta = nn.Linear(conditional_dim, channels)
        self.apply_dim = apply_dim

    def forward(self, x, condition):
        alpha = self.alpha(condition)
        beta = self.beta(condition)
        input = x
        if self.apply_dim != 1:
            input = input.transpose(1, -1)
        alpha = rearrange(alpha, "b d -> b d" + " 1"*(x.dim()-alpha.dim()))
        beta = rearrange(beta, "b d -> b d" + " 1"*(x.dim()-beta.dim()))
        out = alpha*input + beta
        if self.apply_dim != 1:
            out = out.transpose(1, -1)
        return out
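FiLM applies a per-channel affine transform predicted from the condition vector. A shape sketch, assuming the repo's network package is importable; all sizes are illustrative:

import torch
from network.layers import FiLMLayer

film = FiLMLayer(channels=48, conditional_dim=256)
x = torch.randn(2, 48, 65, 100)  # (B, D, F, T) feature map
cond = torch.randn(2, 256)       # conditioning vector, e.g. a speaker embedding
out = film(x, cond)              # alpha(cond)*x + beta(cond), broadcast over F and T
print(out.shape)                 # torch.Size([2, 48, 65, 100])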
network/models/TF_gridnet_with_condition.py
ADDED
@@ -0,0 +1,638 @@
import torch  # added: torch.cat is used below, but only torch.nn was imported
import torch.nn as nn
from ..modules.tf_gridnet_modules import *
from ..modules.input_tranformation import STFTInput, RMSNormalizeInput
from ..modules.output_transformation import WaveGeneratorByISTFT, RMSDenormalizeOutput
from ..modules.convolution_module import SplitFeatureDeconv
from ..modules.attention import *
from .TF_gridnet import TF_Gridnet
from ..layers import FiLMLayer
from ..modules.gate_module import BandFilterGate
from einops import rearrange, repeat
import math

class TFGridFormer(nn.Module):
    def __init__(
        self,
        n_srcs=2,
        n_fft=128,
        hop_length=64,
        window="hann",
        n_audio_channel=1,
        n_layers=6,
        input_kernel_size_T=3,
        input_kernel_size_F=3,
        output_kernel_size_T=3,
        output_kernel_size_F=3,
        lstm_hidden_units=192,
        attn_n_head=4,
        qk_output_channel=4,
        emb_dim=48,
        emb_ks=4,
        emb_hs=1,
        activation="PReLU",
        eps=1.0e-5,
    ):
        super().__init__()
        self.ref_input_normalize = RMSNormalizeInput((1, 2), keepdim=True)
        self.mix_input_normalize = RMSNormalizeInput((1, 2), keepdim=True)
        self.stft = STFTInput(
            n_fft=n_fft,
            win_length=n_fft,
            hop_length=hop_length,
            window=window,
        )

        self.mix_dimension_embedding = DimensionEmbedding(
            audio_channel=n_audio_channel,
            emb_dim=emb_dim,
            kernel_size=(input_kernel_size_F, input_kernel_size_T),
            eps=eps
        )

        self.ref_dimension_embedding = DimensionEmbedding(
            audio_channel=n_audio_channel,
            emb_dim=emb_dim,
            kernel_size=(input_kernel_size_F, input_kernel_size_T),
            eps=eps
        )

        self.istft = WaveGeneratorByISTFT(
            n_fft=n_fft,
            win_length=n_fft,
            hop_length=hop_length,
            window=window
        )

        self.output_denormalize = RMSDenormalizeOutput()

        self.ref_encoder = TFGridnetBlock(
            emb_dim=emb_dim,
            kernel_size=emb_ks,
            emb_hop_size=emb_hs,
            hidden_channels=lstm_hidden_units,
            n_head=attn_n_head,
            qk_output_channel=qk_output_channel,
            activation=activation,
            eps=eps
        )

        mix_encode_layers = math.ceil(n_layers*2/3)
        mix_decode_layers = n_layers - mix_encode_layers

        self.mix_encoder = nn.Sequential(
            *[
                TFGridnetBlock(
                    emb_dim=emb_dim,
                    kernel_size=emb_ks,
                    emb_hop_size=emb_hs,
                    hidden_channels=lstm_hidden_units,
                    n_head=attn_n_head,
                    qk_output_channel=qk_output_channel,
                    activation=activation,
                    eps=eps
                ) for _ in range(mix_encode_layers)
            ]
        )

        self.split_layer = SplitFeatureDeconv(
            emb_dim=emb_dim,
            n_srcs=n_srcs,
            kernel_size_T=output_kernel_size_T,
            kernel_size_F=output_kernel_size_F,
            padding_F=output_kernel_size_F//2,
            padding_T=output_kernel_size_T//2
        )

        self.intra_frame_cross_att = IntraFrameCrossAttention(
            emb_dim=emb_dim,
            n_head=attn_n_head,
            qk_output_channel=attn_n_head*3,
            activation=activation,
            eps=eps
        )
        self.cross_frame_cross_att = CrossFrameCrossAttention(
            emb_dim=emb_dim,
            n_head=attn_n_head,
            qk_output_channel=qk_output_channel,
            activation=activation,
            eps=eps
        )

        self.mix_decoder = nn.Sequential(
            *[
                TFGridnetBlock(
                    emb_dim=emb_dim,
                    kernel_size=emb_ks,
                    emb_hop_size=emb_hs,
                    hidden_channels=lstm_hidden_units,
                    n_head=attn_n_head,
                    qk_output_channel=qk_output_channel,
                    activation=activation,
                    eps=eps
                ) for _ in range(mix_decode_layers)
            ]
        )

        self.deconv = TFGridnetDeconv(
            emb_dim=emb_dim,
            n_srcs=1,
            kernel_size_T=output_kernel_size_T,
            kernel_size_F=output_kernel_size_F,
            padding_F=output_kernel_size_F//2,
            padding_T=output_kernel_size_T//2
        )

        self.middle_deconv = TFGridnetDeconv(
            emb_dim=emb_dim,
            n_srcs=n_srcs,
            kernel_size_T=output_kernel_size_T,
            kernel_size_F=output_kernel_size_F,
            padding_F=output_kernel_size_F//2,
            padding_T=output_kernel_size_T//2
        )

    def forward(self, mix, ref, middle=False):
        audio_length = mix.shape[-1]

        x = mix
        c = ref

        if x.dim() == 2:
            x = x.unsqueeze(1)
        if c.dim() == 2:
            c = c.unsqueeze(1)
        x, std = self.mix_input_normalize(x)

        x = self.stft(x)

        c, _ = self.ref_input_normalize(c)

        c = self.stft(c)

        c = self.ref_dimension_embedding(c)

        c = self.ref_encoder(c)

        x = self.mix_dimension_embedding(x)

        x = self.mix_encoder(x)

        m = None
        if middle:
            m = self.middle_deconv(x)
            m = rearrange(m, "B C N F T -> B N C F T")
            m = self.istft(m, audio_length)
            m = self.output_denormalize(m, std)

        x = self.split_layer(x)

        x = self.intra_frame_cross_att(c, x)

        x = self.cross_frame_cross_att(c, x)

        x = self.mix_decoder(x)

        x = self.deconv(x)

        x = rearrange(x, "B C N F T -> B N C F T")  # because in istft, dim 1 holds the real and imaginary parts

        x = self.istft(x, audio_length)

        x = self.output_denormalize(x, std)
        if middle:
            return x[:, 0], m
        return x[:, 0]

class DoubleChannelTFGridNet(TF_Gridnet):
    def __init__(self,
                 # n_srcs=2,
                 n_fft=128,
                 hop_length=64,
                 window="hann",
                 n_audio_channel=1,
                 n_layers=6,
                 input_kernel_size_T=3,
                 input_kernel_size_F=3,
                 output_kernel_size_T=3,
                 output_kernel_size_F=3,
                 lstm_hidden_units=192,
                 attn_n_head=4,
                 qk_output_channel=4,
                 emb_dim=48,
                 emb_ks=4,
                 emb_hs=1,
                 activation="PReLU",
                 eps=0.00001):
        super().__init__(1, n_fft, hop_length, window, n_audio_channel*2, n_layers, input_kernel_size_T, input_kernel_size_F, output_kernel_size_T, output_kernel_size_F, lstm_hidden_units, attn_n_head, qk_output_channel, emb_dim, emb_ks, emb_hs, activation, eps)

    def forward(self, input, condition):
        x = input
        c = condition

        if x.dim() == 2:
            x = x.unsqueeze(1)
        if c.dim() == 2:
            c = c.unsqueeze(1)
        tc = c.shape[-1]
        tx = x.shape[-1]
        if tc >= tx:
            c = c[:, :, -tx:]
        else:
            # Tile the shorter reference along time until it covers the mixture.
            n = math.ceil(tx/tc)
            c = repeat(c, "b c t -> b c (t n)", n=n)
            c = c[:, :, -tx:]

        mix_with_clue = torch.cat([x, c], dim=1)
        o = super().forward(mix_with_clue)
        return o[:, 0]


class TargetSpeakerTF(nn.Module):
    def __init__(
        self,
        n_srcs=2,
        n_fft=128,
        hop_length=64,
        window="hann",
        n_audio_channel=1,
        n_layers=6,
        input_kernel_size_T=3,
        input_kernel_size_F=3,
        output_kernel_size_T=3,
        output_kernel_size_F=3,
        lstm_hidden_units=192,
        attn_n_head=4,
        qk_output_channel=4,
        emb_dim=48,
        emb_ks=4,
        emb_hs=1,
        activation="PReLU",
        eps=1.0e-5,
        conditional_dim=256
    ):
        super().__init__()

        self.input_normalize = RMSNormalizeInput((1, 2), keepdim=True)
        self.stft = STFTInput(
            n_fft=n_fft,
            win_length=n_fft,
            hop_length=hop_length,
            window=window,
        )

        self.istft = WaveGeneratorByISTFT(
            n_fft=n_fft,
            win_length=n_fft,
            hop_length=hop_length,
            window=window
        )

        self.output_denormalize = RMSDenormalizeOutput()

        self.dimension_embedding = DimensionEmbedding(
            audio_channel=n_audio_channel,
            emb_dim=emb_dim,
            kernel_size=(input_kernel_size_F, input_kernel_size_T),
            eps=eps
        )

        self.tf_gridnet_block = nn.ModuleList(
            [
                TFGridnetBlock(
                    emb_dim=emb_dim,
                    kernel_size=emb_ks,
                    emb_hop_size=emb_hs,
                    hidden_channels=lstm_hidden_units,
                    n_head=attn_n_head,
                    qk_output_channel=qk_output_channel,
                    activation=activation,
                    eps=eps
                ) for _ in range(n_layers)
            ]
        )

        self.film_layer = nn.ModuleList(
            [
                FiLMLayer(emb_dim, conditional_dim=conditional_dim, apply_dim=1)
                for _ in range(n_layers)
            ]
        )

        self.deconv = TFGridnetDeconv(
            emb_dim=emb_dim,
            n_srcs=n_srcs,
            kernel_size_T=output_kernel_size_T,
            kernel_size_F=output_kernel_size_F,
            padding_F=output_kernel_size_F//2,
            padding_T=output_kernel_size_T//2
        )

        self.n_layers = n_layers

    def forward(self, input, clue):
        audio_length = input.shape[-1]

        x = input

        if x.dim() == 2:
            x = x.unsqueeze(1)

        x, std = self.input_normalize(x)

        x = self.stft(x)

        x = self.dimension_embedding(x)
        for i in range(self.n_layers):
            x = self.tf_gridnet_block[i](x)
            x = self.film_layer[i](x, clue)

        x = self.deconv(x)

        x = rearrange(x, "B C N F T -> B N C F T")  # because in istft, dim 1 holds the real and imaginary parts

        x = self.istft(x, audio_length)

        x = self.output_denormalize(x, std)

        return x

class DoubleChannelTargetSpeakerTF(TargetSpeakerTF):
    def __init__(self,
                 # n_srcs=2,
                 n_fft=128,
                 hop_length=64,
                 window="hann",
                 n_audio_channel=1,
                 n_layers=6,
                 input_kernel_size_T=3,
                 input_kernel_size_F=3,
                 output_kernel_size_T=3,
                 output_kernel_size_F=3,
                 lstm_hidden_units=192,
                 attn_n_head=4,
                 qk_output_channel=4,
                 emb_dim=48,
                 emb_ks=4,
                 emb_hs=1,
                 activation="PReLU",
                 eps=0.00001,
                 conditional_dim=256
                 ):
        super().__init__(1, n_fft, hop_length, window, n_audio_channel*2, n_layers, input_kernel_size_T, input_kernel_size_F, output_kernel_size_T, output_kernel_size_F, lstm_hidden_units, attn_n_head, qk_output_channel, emb_dim, emb_ks, emb_hs, activation, eps, conditional_dim)

    def forward(self, input, reference, embedding):
        x = input
        c = reference

        if x.dim() == 2:
            x = x.unsqueeze(1)
        if c.dim() == 2:
            c = c.unsqueeze(1)
        tc = c.shape[-1]
        tx = x.shape[-1]
        if tc >= tx:
            c = c[:, :, -tx:]
        else:
            n = math.ceil(tx/tc)
            c = repeat(c, "b c t -> b c (t n)", n=n)
            c = c[:, :, -tx:]

        mix_with_clue = torch.cat([x, c], dim=1)
        o = super().forward(mix_with_clue, embedding)
        return o[:, 0]

class FilterBandTFGridnet(nn.Module):
    def __init__(
        self,
        # n_srcs=2,
        n_fft=128,
        hop_length=64,
        window="hann",
        n_audio_channel=1,
        n_layers=6,
        input_kernel_size_T=3,
        input_kernel_size_F=3,
        output_kernel_size_T=3,
        output_kernel_size_F=3,
        lstm_hidden_units=192,
        attn_n_head=4,
        qk_output_channel=4,
        emb_dim=48,
        emb_ks=4,
        emb_hs=1,
        activation="PReLU",
        eps=1.0e-5,
        conditional_dim=256
    ):
        super().__init__()
        n_freqs = n_fft//2 + 1
        self.input_normalize = RMSNormalizeInput((1, 2), keepdim=True)
        self.stft = STFTInput(
            n_fft=n_fft,
            win_length=n_fft,
            hop_length=hop_length,
            window=window,
        )

        self.istft = WaveGeneratorByISTFT(
            n_fft=n_fft,
            win_length=n_fft,
            hop_length=hop_length,
            window=window
        )

        self.output_denormalize = RMSDenormalizeOutput()

        self.dimension_embedding = DimensionEmbedding(
            audio_channel=n_audio_channel,
            emb_dim=emb_dim,
            kernel_size=(input_kernel_size_F, input_kernel_size_T),
            eps=eps
        )

        self.tf_gridnet_block = nn.ModuleList(
            [
                TFGridnetBlock(
                    emb_dim=emb_dim,
                    kernel_size=emb_ks,
                    emb_hop_size=emb_hs,
                    hidden_channels=lstm_hidden_units,
                    n_head=attn_n_head,
                    qk_output_channel=qk_output_channel,
                    activation=activation,
                    eps=eps
                ) for _ in range(n_layers)
            ]
        )

        self.filter_gen = nn.Linear(conditional_dim, emb_dim*n_freqs)
        self.bias_gen = nn.Linear(conditional_dim, emb_dim*n_freqs)

        self.gates = nn.ModuleList(
            [
                BandFilterGate(emb_dim, n_freqs)
                for _ in range(n_layers)
            ]
        )

        self.deconv = TFGridnetDeconv(
            emb_dim=emb_dim,
            n_srcs=1,
            kernel_size_T=output_kernel_size_T,
            kernel_size_F=output_kernel_size_F,
            padding_F=output_kernel_size_F//2,
            padding_T=output_kernel_size_T//2
        )

        self.n_layers = n_layers

    def forward(self, input, clue):
        audio_length = input.shape[-1]

        x = input

        if x.dim() == 2:
            x = x.unsqueeze(1)

        x, std = self.input_normalize(x)

        x = self.stft(x)

        x = self.dimension_embedding(x)

        n_freqs = x.shape[-2]
        f = self.filter_gen(clue)
        b = self.bias_gen(clue)
        f = rearrange(f, "b (d q) -> b d q 1", q=n_freqs)
        b = rearrange(b, "b (d q) -> b d q 1", q=n_freqs)

        for i in range(self.n_layers):
            x = self.tf_gridnet_block[i](x)
            x = self.gates[i](x, f, b)

        x = self.deconv(x)

        x = rearrange(x, "B C N F T -> B N C F T")  # because in istft, dim 1 holds the real and imaginary parts

        x = self.istft(x, audio_length)

        x = self.output_denormalize(x, std)

        return x[:, 0]

class FilterBandTFGridnetWithAttentionGate(nn.Module):
    def __init__(
        self,
        # n_srcs=2,
        n_fft=128,
        hop_length=64,
        window="hann",
        n_audio_channel=1,
        n_layers=6,
        input_kernel_size_T=3,
        input_kernel_size_F=3,
        output_kernel_size_T=3,
        output_kernel_size_F=3,
        lstm_hidden_units=192,
        attn_n_head=4,
        qk_output_channel=4,
        emb_dim=48,
        emb_ks=4,
        emb_hs=1,
        activation="PReLU",
        eps=1.0e-5,
        conditional_dim=256
    ):
        super().__init__()
        n_freqs = n_fft//2 + 1
        self.input_normalize = RMSNormalizeInput((1, 2), keepdim=True)
        self.stft = STFTInput(
            n_fft=n_fft,
            win_length=n_fft,
            hop_length=hop_length,
            window=window,
        )

        self.istft = WaveGeneratorByISTFT(
            n_fft=n_fft,
            win_length=n_fft,
            hop_length=hop_length,
            window=window
        )

        self.output_denormalize = RMSDenormalizeOutput()

        self.dimension_embedding = DimensionEmbedding(
            audio_channel=n_audio_channel,
            emb_dim=emb_dim,
            kernel_size=(input_kernel_size_F, input_kernel_size_T),
            eps=eps
        )

        self.tf_gridnet_block = nn.ModuleList(
            [
                TFGridnetBlock(
                    emb_dim=emb_dim,
                    kernel_size=emb_ks,
                    emb_hop_size=emb_hs,
                    hidden_channels=lstm_hidden_units,
                    n_head=attn_n_head,
                    qk_output_channel=qk_output_channel,
                    activation=activation,
                    eps=eps
                ) for _ in range(n_layers)
            ]
        )

        self.query_gen = nn.Linear(conditional_dim, emb_dim*n_freqs)

        self.attentions = nn.ModuleList(
            [
                CrossAttentionFilterV2(emb_dim)
                for _ in range(n_layers)
            ]
        )

        self.deconv = TFGridnetDeconv(
            emb_dim=emb_dim,
            n_srcs=1,
            kernel_size_T=output_kernel_size_T,
            kernel_size_F=output_kernel_size_F,
            padding_F=output_kernel_size_F//2,
            padding_T=output_kernel_size_T//2
        )

        self.n_layers = n_layers

    def forward(self, input, clue):
        audio_length = input.shape[-1]

        x = input

        if x.dim() == 2:
            x = x.unsqueeze(1)

        x, std = self.input_normalize(x)

        x = self.stft(x)

        x = self.dimension_embedding(x)

        n_freqs = x.shape[-2]

        q = self.query_gen(clue)
        q = rearrange(q, "b (d f) -> b f d", f=n_freqs)

        for i in range(self.n_layers):
            x = self.tf_gridnet_block[i](x)
            x = self.attentions[i](q, x)

        x = self.deconv(x)

        x = rearrange(x, "B C N F T -> B N C F T")  # because in istft, dim 1 holds the real and imaginary parts

        x = self.istft(x, audio_length)

        x = self.output_denormalize(x, std)

        return x[:, 0]
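FilterBandTFGridnet is the class app.py instantiates. A minimal forward-pass sketch, assuming the package imports resolve and using the app's hyperparameters (n_layers=5, conditional_dim=256*2); input sizes are illustrative:

import torch
from network.models import FilterBandTFGridnet

model = FilterBandTFGridnet(n_layers=5, conditional_dim=512)
model.eval()
mix = torch.randn(1, 32000)  # 4 s of 8 kHz audio, matching app.py's resampling
clue = torch.randn(1, 512)   # concatenated reference + mixture embeddings
with torch.no_grad():
    est = model(mix, clue)   # (1, 32000) estimated target speech
print(est.shape)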
network/models/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .TF_gridnet_with_condition import *
from .embedding_model import ResemblyzerVoiceEncoder
network/models/__pycache__/TF_gridnet.cpython-311.pyc
ADDED
Binary file (4 kB).
network/models/__pycache__/TF_gridnet_with_condition.cpython-311.pyc
ADDED
Binary file (22.4 kB).
network/models/__pycache__/TF_gridnet_with_condition_new.cpython-311.pyc
ADDED
Binary file (5.3 kB).
network/models/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (592 Bytes).
network/models/__pycache__/embedding_model.cpython-311.pyc
ADDED
Binary file (2.14 kB).
network/models/__pycache__/new_style.cpython-311.pyc
ADDED
Binary file (8.66 kB).
network/models/__pycache__/tfGrideNetLOTH.cpython-311.pyc
ADDED
Binary file (5.41 kB).
network/models/embedding_model.py
ADDED
@@ -0,0 +1,14 @@
import torch
from resemblyzer import VoiceEncoder

class ResemblyzerVoiceEncoder:
    def __init__(self, device) -> None:
        self.model = VoiceEncoder(device)

    def __call__(self, audio: torch.Tensor):
        if audio.ndimension() == 1:
            return torch.tensor(self.model.embed_utterance(audio.numpy())).float().cpu()
        else:
            e = torch.stack([torch.tensor(self.model.embed_utterance(audio[i, :].numpy())).float().cpu()
                             for i in range(audio.shape[0])])
            return e
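A usage sketch for this wrapper, assuming resemblyzer is installed; resemblyzer's d-vectors are 256-dimensional and its encoder expects 16 kHz audio:

import torch
from network.models import ResemblyzerVoiceEncoder

enc = ResemblyzerVoiceEncoder(device='cpu')
wav = torch.randn(1, 64000)  # (n_utterances, samples), 4 s at 16 kHz (illustrative)
emb = enc(wav)               # one 256-dim embedding per row
print(emb.shape)             # torch.Size([1, 256])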
network/modules/__init__.py
ADDED
File without changes
network/modules/attention.py
ADDED
@@ -0,0 +1,197 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from .tf_gridnet_modules import AllHeadPReLULayerNormalization4DC, LayerNormalization
from einops import rearrange, repeat
import math


class IntraFrameCrossAttention(nn.Module):
    def __init__(
        self,
        emb_dim=48,
        n_head=4,
        qk_output_channel=12,
        activation="PReLU",
        eps=1e-5
    ):
        super().__init__()
        assert emb_dim % n_head == 0
        E = qk_output_channel
        self.conv_Q = nn.Conv2d(emb_dim, n_head*E, 1)
        self.norm_Q = AllHeadPReLULayerNormalization4DC((n_head, E), eps=eps)

        self.conv_K = nn.Conv2d(emb_dim, n_head*E, 1)
        self.norm_K = AllHeadPReLULayerNormalization4DC((n_head, E), eps=eps)

        self.conv_V = nn.Conv2d(emb_dim, emb_dim, 1)
        self.norm_V = AllHeadPReLULayerNormalization4DC((n_head, emb_dim // n_head), eps=eps)

        self.concat_proj = nn.Sequential(
            nn.Conv2d(emb_dim, emb_dim, 1),
            getattr(nn, activation)(),
            LayerNormalization(emb_dim, dim=-3, total_dim=4, eps=eps),
        )
        self.emb_dim = emb_dim
        self.n_head = n_head

    def forward(self, q, kv):
        """
        args:
            q (torch.Tensor): query for cross attention, coming from the reference encoder
                [B D Q Tq]
            kv (torch.Tensor): key and value for cross attention, coming from the output of the feature split
                [B nSrc D Q Tkv]
        output:
            output (torch.Tensor): [B D Q Tkv]
        """

        B, D, freq, Tq = q.shape

        _, nSrc, _, _, Tkv = kv.shape
        if Tq >= Tkv:
            q = q[:, :, :, -Tkv:]
        else:
            r = math.ceil(Tkv/Tq)
            q = repeat(q, "B D Q T -> B D Q (T r)", r=r)
            q = q[:, :, :, -Tkv:]
        query = rearrange(q, "B D Q T -> B D T Q")
        kvInput = rearrange(kv, "B n D Q T -> B D T (n Q)")

        Q = self.norm_Q(self.conv_Q(query))    # [B, n_head, C, T, Q]
        K = self.norm_K(self.conv_K(kvInput))  # [B, n_head, C, T, Q*nSrc]
        V = self.norm_V(self.conv_V(kvInput))

        Q = rearrange(Q, "B H C T Q -> (B H T) Q C")
        K = rearrange(K, "B H C T Q -> (B H T) C Q").contiguous()
        _, n_head, channel, _, _ = V.shape
        V = rearrange(V, "B H C T Q -> (B H T) Q C")

        emb_dim = Q.shape[-1]
        qkT = torch.matmul(Q, K) / (emb_dim**0.5)
        qkT = F.softmax(qkT, dim=2)

        att = torch.matmul(qkT, V)
        att = rearrange(att, "(B H T) Q C -> B (H C) T Q", C=channel, Q=freq, H=n_head, B=B, T=Tkv)
        att = self.concat_proj(att)

        out = att + query
        out = rearrange(out, "B C T Q -> B C Q T")
        return out


class CrossFrameCrossAttention(nn.Module):
    def __init__(
        self,
        emb_dim=48,
        n_head=4,
        qk_output_channel=4,
        activation="PReLU",
        eps=1e-5
    ):
        super().__init__()
        assert emb_dim % n_head == 0
        E = qk_output_channel
        self.conv_Q = nn.Conv2d(emb_dim, n_head*E, 1)
        self.norm_Q = AllHeadPReLULayerNormalization4DC((n_head, E), eps=eps)

        self.conv_K = nn.Conv2d(emb_dim, n_head*E, 1)
        self.norm_K = AllHeadPReLULayerNormalization4DC((n_head, E), eps=eps)

        self.conv_V = nn.Conv2d(emb_dim, emb_dim, 1)
        self.norm_V = AllHeadPReLULayerNormalization4DC((n_head, emb_dim // n_head), eps=eps)

        self.concat_proj = nn.Sequential(
            nn.Conv2d(emb_dim, emb_dim, 1),
            getattr(nn, activation)(),
            LayerNormalization(emb_dim, dim=-3, total_dim=4, eps=eps),
        )
        self.emb_dim = emb_dim
        self.n_head = n_head

    def forward(self, q, kv):
        """
        args:
            q (torch.Tensor): query for cross attention, coming from the reference encoder
                [B D Q Tq]
            kv (torch.Tensor): key and value for cross attention, coming from the output of the feature split
                [B D Q Tkv]
        output:
            output (torch.Tensor): [B D Q Tkv]
        """
        Tq = q.shape[-1]
        Tkv = kv.shape[-1]
        if Tq >= Tkv:
            q = q[:, :, :, -Tkv:]
        else:
            r = math.ceil(Tkv/Tq)
            q = repeat(q, "B D Q T -> B D Q (T r)", r=r)
            q = q[:, :, :, -Tkv:]

        input = rearrange(q, "B C Q T -> B C T Q")
        kvInput = rearrange(kv, "B C Q T -> B C T Q")

        Q = self.norm_Q(self.conv_Q(input))  # [B, n_head, C, T, Q]
        K = self.norm_K(self.conv_K(kvInput))
        V = self.norm_V(self.conv_V(kvInput))

        Q = rearrange(Q, "B H C T Q -> (B H) T (C Q)")
        K = rearrange(K, "B H C T Q -> (B H) (C Q) T").contiguous()
        batch, n_head, channel, frame, freq = V.shape
        V = rearrange(V, "B H C T Q -> (B H) T (C Q)")
        emb_dim = Q.shape[-1]
        qkT = torch.matmul(Q, K) / (emb_dim**0.5)
        qkT = F.softmax(qkT, dim=2)
        att = torch.matmul(qkT, V)
        att = rearrange(att, "(B H) T (C Q) -> B (H C) T Q", C=channel, Q=freq, H=n_head, B=batch, T=frame)
        att = self.concat_proj(att)
        out = att + input
        out = rearrange(out, "B C T Q -> B C Q T")
        return out

class CrossAttentionFilter(nn.Module):
    def __init__(self, emb_dim=48) -> None:
        super().__init__()
        self.emb_dim = emb_dim

    def forward(self, q, k, v):
        """
        Args:
            q (torch.Tensor): from the previous layer, [B D F T]
            k (torch.Tensor): from the speaker embedding encoder, [B D]
            v (torch.Tensor): from the speaker embedding encoder, [B D]
        """

        B, D, _, T = q.shape

        q = rearrange(q, "B D F T -> (B T) F D")
        k = repeat(k, "B D -> (B T) D 1", T=T)
        v = repeat(v, "B D -> (B T) 1 D", T=T)

        qkT = torch.matmul(q, k)/(D**0.5)  # [(B T) F 1]
        qkT = F.softmax(qkT, dim=-1)
        att = torch.matmul(qkT, v)  # [(B T) F D]
        att = rearrange(att, "(B T) F D -> B D F T", B=B, T=T)
        return att

class CrossAttentionFilterV2(nn.Module):
    def __init__(self, emb_dim=48) -> None:
        super().__init__()
        self.emb_dim = emb_dim

    def forward(self, q, kv):
        """
        Args:
            q: torch.Tensor, [B F D] query for cross attention, coming from the reference encoder (speaker embedding)
            kv: torch.Tensor, [B D F T] key and value for cross attention, coming from the output of the previous layer (TF-GridNet)
        """

        B, D, _, T = kv.shape

        Q = repeat(q, "B F D -> (B T) F D", T=T)
        K = rearrange(kv, "B D F T -> (B T) D F")
        V = rearrange(kv, "B D F T -> (B T) F D")

        qkT = torch.matmul(Q, K)/(D**0.5)  # [(B T) F F]
        qkT = F.softmax(qkT, dim=-1)
        att = torch.matmul(qkT, V)  # [(B T) F D]
        att = rearrange(att, "(B T) F D -> B D F T", B=B, T=T)
        return att
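A shape sketch for CrossAttentionFilterV2, assuming the module imports resolve; per time frame it attends across frequency bins using the clue-derived queries:

import torch
from network.modules.attention import CrossAttentionFilterV2

att = CrossAttentionFilterV2(emb_dim=48)
q = torch.randn(2, 65, 48)        # (B, F, D) per-frequency queries from the clue
kv = torch.randn(2, 48, 65, 100)  # (B, D, F, T) TF-GridNet features
out = att(q, kv)                  # (B, D, F, T), softmax taken over frequency
print(out.shape)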
network/modules/gate_module.py
ADDED
@@ -0,0 +1,16 @@
import torch.nn as nn
import torch
import torch.nn.functional as F

class BandFilterGate(nn.Module):
    def __init__(self, emb_dim=48, n_freqs=65):
        super().__init__()
        self.alpha = nn.Parameter(torch.empty(1, emb_dim, n_freqs, 1).to(torch.float32))
        self.beta = nn.Parameter(torch.empty(1, emb_dim, n_freqs, 1).to(torch.float32))
        nn.init.xavier_normal_(self.alpha)
        nn.init.xavier_normal_(self.beta)

    def forward(self, input, filters, bias):
        f = F.sigmoid(self.alpha*filters)
        b = F.tanh(self.beta*bias)
        return f*input + b
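The gate modulates each (channel, band) pair with a learned scale applied to the clue-derived filter and bias. A shape sketch, assuming the module imports resolve:

import torch
from network.modules.gate_module import BandFilterGate

gate = BandFilterGate(emb_dim=48, n_freqs=65)
x = torch.randn(2, 48, 65, 100)  # (B, D, F, T) features
f = torch.randn(2, 48, 65, 1)    # per-band filter from the clue
b = torch.randn(2, 48, 65, 1)    # per-band bias from the clue
y = gate(x, f, b)                # sigmoid(alpha*f)*x + tanh(beta*b), same shape as x
print(y.shape)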
network/modules/input_tranformation.py
ADDED
@@ -0,0 +1,92 @@
import torch
import torch.nn as nn
from ..layers import STFTLayer
from ..utils import STFT_transform_type_enum
from typing import Iterable

class SimpleConv1DInput(nn.Module):
    def __init__(self, in_channels, out_channels, kernel, stride=1,
                 padding=0, dilation=1, groups=1, bias=True,
                 padding_mode='zeros', activation: str = 'ReLU'):
        super().__init__()
        activation = getattr(nn, activation)
        self.model = nn.Sequential(
            nn.Conv1d(in_channels, out_channels, kernel, stride, padding, dilation, groups, bias, padding_mode),
            activation()
        )

    def forward(self, input):
        return self.model(input)

class STFTInput(nn.Module):
    def __init__(
        self,
        n_fft: int = 128,
        win_length: int = None,
        hop_length: int = 64,
        window="hann",
        center: bool = True,
        normalized: bool = False,
        onesided: bool = True,
        spec_transform_type: str = None,
        spec_factor: float = 0.15,
        spec_abs_exponent: float = 0.5,
    ):
        super().__init__()
        self.stft = STFTLayer(
            n_fft,
            win_length,
            hop_length,
            window,
            center,
            normalized,
            onesided
        )

        self.spec_transform_type = spec_transform_type
        self.spec_factor = spec_factor
        self.spec_abs_exponent = spec_abs_exponent

        self.spec_transform = lambda spec: spec
        if self.spec_transform_type == STFT_transform_type_enum.exponent:
            self.spec_transform = lambda spec: spec.abs() ** self.spec_abs_exponent * torch.exp(1j * spec.angle())
        elif self.spec_transform_type == STFT_transform_type_enum.log:
            self.spec_transform = lambda spec: torch.log(1 + spec.abs()) * torch.exp(1j * spec.angle()) * self.spec_factor

    @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
    def forward(self, input):
        """
        Note that PyTorch's STFT does not support 16-bit float inputs, so this
        function is decorated with @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32).
        Args:
            input (torch.Tensor): signal [Batch, Nsamples] or [Batch, channel, Nsamples]
        Outputs:
            spectrum (torch.Tensor): float tensor representing the spectrum with two channels;
                the first channel is the real part and the second is the imaginary part:
                [Batch, 2, F, T] or [Batch, 2 * channel, F, T]
        """

        spectrum = self.stft(input.float())
        spectrum = self.spec_transform(spectrum)

        re = spectrum.real
        im = spectrum.imag

        if input.dim() == 2:
            re = re.unsqueeze(1)
            im = im.unsqueeze(1)

        if input.dtype in (torch.float16, torch.bfloat16):
            re = re.to(dtype=input.dtype)
            im = im.to(dtype=input.dtype)

        return torch.cat([re, im], dim=1)

class RMSNormalizeInput(nn.Module):
    def __init__(self, dim: Iterable[int], keepdim: bool = True) -> None:
        super().__init__()
        self.dim = dim
        self.keepdim = keepdim

    def forward(self, input):
        std = torch.std(input, dim=self.dim, keepdim=self.keepdim)
        output = input/std
        return output, std
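RMSNormalizeInput divides by the standard deviation over the given dims and returns it so the output can be rescaled later by RMSDenormalizeOutput. A small sketch, assuming the module imports resolve:

import torch
from network.modules.input_tranformation import RMSNormalizeInput

norm = RMSNormalizeInput(dim=(1, 2), keepdim=True)
x = torch.randn(2, 1, 32000) * 3.0
y, std = norm(x)    # y has unit std over (channel, time), per batch item
print(std.shape)    # torch.Size([2, 1, 1])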
network/modules/output_transformation.py
ADDED
@@ -0,0 +1,47 @@
import torch
import torch.nn as nn
from ..layers import InverseSTFTLayer, ComplexTensorLayer
from typing import Iterable, Optional

class WaveGeneratorByISTFT(nn.Module):
    def __init__(
        self,
        n_fft: int = 128,
        win_length: Optional[int] = None,
        hop_length: int = 64,
        window: str = "hann",
        center: bool = True,
        normalized: bool = False,
        onesided: bool = True
    ) -> None:
        super().__init__()
        self.istft = InverseSTFTLayer(
            n_fft,
            win_length,
            hop_length,
            window,
            center,
            normalized,
            onesided
        )

        self.float_to_complex = ComplexTensorLayer()

    @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
    def forward(self, input, length: int = None):
        x = input
        if input.dtype in (torch.float16, torch.bfloat16):
            x = input.float()
        if x.dtype in (torch.float32,):
            x = self.float_to_complex(x)

        wav = self.istft(x, length)
        wav = wav.to(dtype=input.dtype)

        return wav

class RMSDenormalizeOutput(nn.Module):
    def __init__(self) -> None:
        super().__init__()

    def forward(self, input, std):
        return input*std
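A shape sketch for WaveGeneratorByISTFT, assuming the module imports resolve; it accepts the two-channel real format produced by STFTInput:

import torch
from network.modules.output_transformation import WaveGeneratorByISTFT

istft = WaveGeneratorByISTFT(n_fft=128, hop_length=64)
spec = torch.randn(2, 2, 65, 101)  # (B, re/im, F, T) float spectrum
wav = istft(spec, length=6400)     # (2, 6400) waveform; (T-1)*hop = 6400
print(wav.shape)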
network/modules/sequence_embed.py
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch.nn as nn
|
2 |
+
import torch
|
3 |
+
import torch.nn.functional as F
|
4 |
+
from einops import rearrange
|
5 |
+
from .tf_gridnet_modules import CrossFrameSelfAttention
|
6 |
+
|
7 |
+
class SequenceEmbed(nn.Module):
|
8 |
+
def __init__(
|
9 |
+
self,
|
10 |
+
emb_dim: int = 48,
|
11 |
+
n_fft: int = 128,
|
12 |
+
hidden_size: int = 192,
|
13 |
+
kernel_T: int = 5,
|
14 |
+
kernel_F: int = 5,
|
15 |
+
):
|
16 |
+
super().__init__()
|
17 |
+
|
18 |
+
self.n_freqs = n_fft // 2 + 1
|
19 |
+
self.emb_dim = emb_dim
|
20 |
+
|
21 |
+
self.conv = nn.Sequential(
|
22 |
+
nn.Conv2d(emb_dim*2,emb_dim*2,(kernel_F,kernel_T),padding=(kernel_F//2,kernel_T//2),groups=emb_dim*2),
|
23 |
+
nn.PReLU(),
|
24 |
+
nn.Conv2d(emb_dim*2,emb_dim*2,1),
|
25 |
+
nn.PReLU(),
|
26 |
+
nn.Conv2d(emb_dim*2,emb_dim*2,(kernel_F,kernel_T),padding=(kernel_F//2,kernel_T//2),groups=emb_dim*2),
|
27 |
+
nn.PReLU(),
|
28 |
+
nn.Conv2d(emb_dim*2,emb_dim,1),
|
29 |
+
nn.PReLU(),
|
30 |
+
)
|
31 |
+
|
32 |
+
self.linear_pre = nn.Conv1d(emb_dim*self.n_freqs,hidden_size,1)
|
33 |
+
|
34 |
+
self.lstm = nn.LSTM(
|
35 |
+
hidden_size,hidden_size,1,batch_first=True,bidirectional=True
|
36 |
+
)
|
37 |
+
|
38 |
+
self.linear = nn.Linear(hidden_size*2,emb_dim*self.n_freqs)
|
39 |
+
|
40 |
+
self.filter_gen = nn.Conv1d(emb_dim,emb_dim,1)
|
41 |
+
self.bias_gen = nn.Conv1d(emb_dim,emb_dim,1)
|
42 |
+
def forward(self,x,ref):
|
43 |
+
"""
|
44 |
+
Args:
|
45 |
+
x: (B, D, F, T) input tensor from prevous layer
|
46 |
+
ref: (B, D, F, T) embedding tensor previous layer
|
47 |
+
"""
|
48 |
+
B, D, n_freq, T = x.shape
|
49 |
+
input = torch.cat([x,ref],dim=1)
|
50 |
+
input = self.conv(input)
|
51 |
+
input = rearrange(input,'B D F T -> B (D F) T')
|
52 |
+
input = self.linear_pre(input)
|
53 |
+
input = rearrange(input,'B C T -> B T C')
|
54 |
+
rnn , _ = self.lstm(input)
|
55 |
+
feature = rnn[:,0]+rnn[:,-1] # (B, 2*Hidden)
|
56 |
+
feature = self.linear(feature) # (B, D*F)
|
57 |
+
feature = rearrange(feature,'B (D F) -> B D F',D=D,F=n_freq)
|
58 |
+
f = self.filter_gen(feature)
|
59 |
+
b = self.bias_gen(feature)
|
60 |
+
|
61 |
+
return f.unsqueeze(-1), b.unsqueeze(-1)
|
62 |
+
|
63 |
+
class CrossFrameCrossAttention(CrossFrameSelfAttention):
|
64 |
+
def __init__(self, emb_dim=48, n_freqs=65, n_head=4, qk_output_channel=4, activation="PReLU", eps=0.00001):
|
65 |
+
super().__init__(emb_dim, n_freqs, n_head, qk_output_channel, activation, eps)
|
66 |
+
|
67 |
+
def forward(self, q, kv):
|
68 |
+
"""
|
69 |
+
Args:
|
70 |
+
q: (B, D, F, T) query tensor
|
71 |
+
kv: (B, D, F, T) key and value tensor
|
72 |
+
"""
|
73 |
+
|
74 |
+
input_q = rearrange(q,"B C Q T -> B C T Q")
|
75 |
+
input_kv = rearrange(kv,"B C Q T -> B C T Q")
|
76 |
+
|
77 |
+
Q = self.norm_Q(self.conv_Q(input_q))
|
78 |
+
K = self.norm_K(self.conv_K(input_kv))
|
79 |
+
V = self.norm_V(self.conv_V(input_kv))
|
80 |
+
Q = rearrange(Q, "B H C T Q -> (B H) T (C Q)")
|
81 |
+
K = rearrange(K, "B H C T Q -> (B H) (C Q) T").contiguous()
|
82 |
+
batch, n_head, channel, frame, freq = V.shape
|
83 |
+
V = rearrange(V, "B H C T Q -> (B H) T (C Q)")
|
84 |
+
emb_dim = Q.shape[-1]
|
85 |
+
qkT = torch.matmul(Q, K) / (emb_dim**0.5)
|
86 |
+
qkT = F.softmax(qkT,dim=2)
|
87 |
+
att = torch.matmul(qkT,V)
|
88 |
+
att = rearrange(att, "(B H) T (C Q) -> B (H C) T Q", C=channel, Q=freq, H = n_head, B = batch, T=frame)
|
89 |
+
att = self.concat_proj(att)
|
90 |
+
out = att + input_q
|
91 |
+
out = rearrange(out, "B C T Q -> B C Q T")
|
92 |
+
return out
|
93 |
+
|
+class MutualAttention(nn.Module):
+    def __init__(self, kernel_T=5, kernel_F=5, emb_dim=48, n_freqs=65, n_head=4, qk_output_channel=4, activation="PReLU", eps=0.00001):
+        super().__init__()
+
+        self.ref_att = CrossFrameCrossAttention(emb_dim, n_freqs, n_head, qk_output_channel, activation, eps)
+        self.tar_att = CrossFrameCrossAttention(emb_dim, n_freqs, n_head, qk_output_channel, activation, eps)
+
+        self.mt_conv = nn.Sequential(
+            nn.Conv2d(emb_dim,emb_dim,(kernel_F,kernel_T),padding=(kernel_F//2,kernel_T//2)),
+            nn.PReLU(),
+            nn.Conv2d(emb_dim,emb_dim,(kernel_F,kernel_T),padding=(kernel_F//2,kernel_T//2)),
+            nn.Sigmoid()
+        )
+
+        self.mr_conv = nn.Sequential(
+            nn.Conv2d(emb_dim,emb_dim,(kernel_F,kernel_T),padding=(kernel_F//2,kernel_T//2)),
+            nn.PReLU(),
+            nn.Conv2d(emb_dim,emb_dim,(kernel_F,kernel_T),padding=(kernel_F//2,kernel_T//2)),
+            nn.Sigmoid()
+        )
+
+        self.mtr_conv = nn.Sequential(
+            nn.Conv2d(emb_dim,emb_dim,(kernel_F,kernel_T),padding=(kernel_F//2,kernel_T//2)),
+            nn.PReLU(),
+            nn.Conv2d(emb_dim,emb_dim,(kernel_F,kernel_T),padding=(kernel_F//2,kernel_T//2)),
+            nn.PReLU()
+        )
+    def forward(self,tar,ref):
+        """
+        Args:
+            ref: (B, D, F, T) reference tensor
+            tar: (B, D, F, T) target tensor
+        """
+
+        mr = self.ref_att(ref,tar)  # reference attends to the target
+        mt = self.tar_att(tar,ref)  # target attends to the reference
+
+        mrt = mr + mt
+
+        mr = self.mr_conv(mr)
+        mt = self.mt_conv(mt)
+        mrt_o = self.mtr_conv(mrt)
+
+        o = mr*mt*mrt_o + mrt  # gated fusion with a residual path
+        return o
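MutualAttention runs cross-attention in both directions, then fuses the two views: sigmoid gates (mr_conv, mt_conv) and a PReLU branch over their sum multiply together, with the sum kept as a residual. A hedged usage sketch, assuming the import path from this commit:

import torch
from network.modules.attention import MutualAttention

mutual = MutualAttention(emb_dim=48, n_freqs=65)
tar = torch.rand(1, 48, 65, 50)   # target features (B, D, F, T)
ref = torch.rand(1, 48, 65, 50)   # reference features, same shape
fused = mutual(tar, ref)
assert fused.shape == tar.shape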
network/modules/split_modules.py
ADDED
@@ -0,0 +1,121 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+
+class DimensionDimAttention(nn.Module):
+    def __init__(
+        self,
+        emb_dim: int = 48,
+        kernel_size = (7,7),
+        dilation: int = 2,
+    ) -> None:
+        super().__init__()
+        self.emb_dim = emb_dim
+
+        self.attn = nn.Sequential(
+            nn.Conv2d(2*emb_dim,2*emb_dim,1),
+            nn.GELU(),
+            nn.Conv2d(2*emb_dim,2*emb_dim,kernel_size=kernel_size,groups=2*emb_dim,padding="same"),
+            nn.PReLU(),
+            nn.Conv2d(2*emb_dim,2*emb_dim,kernel_size=kernel_size,dilation=dilation,groups=2*emb_dim,padding="same"),
+            nn.PReLU(),
+            nn.Conv2d(2*emb_dim,emb_dim,1),
+            nn.Sigmoid()
+        )
+
+        self.transform = nn.Sequential(
+            nn.Conv2d(emb_dim,emb_dim,1),
+            nn.GELU(),
+            nn.Conv2d(emb_dim,emb_dim,kernel_size=kernel_size,groups=emb_dim,padding="same"),
+            nn.PReLU(),
+            nn.Conv2d(emb_dim,emb_dim,kernel_size=kernel_size,dilation=dilation,groups=emb_dim,padding="same"),
+            nn.PReLU(),
+            nn.Conv2d(emb_dim,emb_dim,1)
+        )
+
+    def forward(self,x,e):
+        """
+        Args:
+            x: (B, D, F, T) input tensor from the previous layer
+            e: (B, D, F) embedding after reshape
+        """
+
+        T = x.shape[-1]
+        emb = repeat(e, 'B D F -> B D F T', T=T)
+        att = torch.cat([x,emb],dim=1)
+
+        i = self.transform(x)
+        att = self.attn(att)
+        return i*att  # sigmoid attention map gates the transformed input
+
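DimensionDimAttention builds a sigmoid attention map from the concatenated features and embedding and multiplies it into a depthwise-transformed copy of the input, so shapes are preserved. A quick check, assuming the module path from this commit:

import torch
from network.modules.split_modules import DimensionDimAttention

d_att = DimensionDimAttention(emb_dim=48)
x = torch.rand(2, 48, 65, 100)    # (B, D, F, T) features
e = torch.rand(2, 48, 65)         # (B, D, F) embedding, repeated over T internally
y = d_att(x, e)
assert y.shape == x.shape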
+class FDAttention(nn.Module):
+    def __init__(
+        self
+    ) -> None:
+        super().__init__()
+    def forward(self,x,e):
+        """
+        Args:
+
+            x: (B, D, F, T) input tensor from the previous layer (for k and v)
+            e: (B, D, F) embedding after reshape (for q)
+        """
+
+        _,D,n_freq,T = x.shape
+        q = repeat(e, 'B D F -> B T (D F)', T=T)
+        k = rearrange(x, 'B D F T -> B (D F) T')
+        v = rearrange(x, 'B D F T -> B T (D F)')
+
+        q = self.positional_encoding(q)
+        qkT = torch.matmul(q,k)/((D*n_freq)**0.5)  # scaled dot-product scores over frames
+        qkT = F.softmax(qkT,dim=-1)
+        att = torch.matmul(qkT,v)
+
+        att = rearrange(att, 'B T (D F) -> B D F T', D=D, F=n_freq)
+        return att
+
+    def positional_encoding(self, x):
+        """
+        Args:
+            x: (B, T, D) input to add positional encoding
+        """
+        B, T, D = x.shape
+        pos = torch.arange(T, device=x.device).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, D, 2, device=x.device) * (-torch.log(torch.tensor(10000.0)) / D))
+
+        pos_enc = torch.zeros_like(x)
+        pos_enc[:, :, 0::2] = torch.sin(pos * div_term)  # standard sinusoidal encoding
+        pos_enc[:, :, 1::2] = torch.cos(pos * div_term)
+        return x + pos_enc
+
+class SplitModule(nn.Module):
+    def __init__(
+        self,
+        emb_dim: int = 48,
+        condition_dim: int = 256,
+        n_fft: int = 128,
+    ) -> None:
+        super().__init__()
+        self.emb_dim = emb_dim
+        self.condition_dim = condition_dim
+        n_freq = n_fft // 2 + 1
+        self.n_freqs = n_freq
+
+        self.alpha = nn.Parameter(torch.ones(1,emb_dim,self.n_freqs,dtype=torch.float32))  # was torch.empty, which leaves values undefined; overwritten when a checkpoint is loaded
+        self.beta = nn.Parameter(torch.ones(1,emb_dim,self.n_freqs,1,dtype=torch.float32))
+        self.d_att = DimensionDimAttention(emb_dim=emb_dim)
+
+        # self.f_att = FDAttention()
+    def forward(self,input,emb):
+        """
+        Args:
+            input: (B, D, F, T) input tensor
+            emb: (B, D, F) embedding after reshape
+        """
+        e = torch.tanh(emb*self.alpha)  # torch.tanh instead of the deprecated F.tanh
+
+        x = self.d_att(input,e)
+        # x = self.f_att(x,e)
+        x = x*torch.sigmoid(self.beta*emb.unsqueeze(-1))
+        return x
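SplitModule squashes the scaled embedding through tanh before the attention step and applies a second, sigmoid-gated scaling afterwards; n_freqs is derived as n_fft // 2 + 1. A hedged shape check (with the ones-initialized parameters above, the forward pass is well defined even before a checkpoint is loaded):

import torch
from network.modules.split_modules import SplitModule

split = SplitModule(emb_dim=48, n_fft=128)   # n_freqs = 128 // 2 + 1 = 65
feats = torch.rand(2, 48, 65, 100)           # (B, D, F, T)
cond = torch.rand(2, 48, 65)                 # (B, D, F) conditioning embedding
out = split(feats, cond)
assert out.shape == feats.shape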
network/modules/tf_gridnet_modules/__init__.py
ADDED
@@ -0,0 +1,3 @@
+from .dimension_embedding import DimensionEmbedding
+from .tf_gridnet_block import *
+from .deconv import *
network/modules/tf_gridnet_modules/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (369 Bytes)
network/modules/tf_gridnet_modules/__pycache__/deconv.cpython-311.pyc
ADDED
Binary file (1.6 kB)
network/modules/tf_gridnet_modules/__pycache__/dimension_embedding.cpython-311.pyc
ADDED
Binary file (1.63 kB)
network/modules/tf_gridnet_modules/__pycache__/tf_gridnet_block.cpython-311.pyc
ADDED
Binary file (15.7 kB)
network/modules/tf_gridnet_modules/deconv.py
ADDED
@@ -0,0 +1,20 @@
+import torch.nn as nn
+from einops import rearrange
+
+class TFGridnetDeconv(nn.Module):
+    def __init__(
+        self,
+        emb_dim = 48,
+        n_srcs = 2,
+        kernel_size_T = 3,
+        kernel_size_F = 3,
+        padding_T = 1,
+        padding_F = 1,
+    ) -> None:
+        super().__init__()
+        self.n_srcs = n_srcs
+        self.deconv = nn.ConvTranspose2d(emb_dim, n_srcs * 2, (kernel_size_F,kernel_size_T), padding=(padding_F,padding_T))
+    def forward(self,input):
+        output = self.deconv(input)
+        output = rearrange(output,"B (N C) F T -> B C N F T", C=self.n_srcs)  # (B, n_srcs, 2, F, T): a real/imaginary pair per source
+        return output
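With the default 3x3 kernel and padding 1, the transposed convolution keeps the (F, T) grid size, so the deconv only expands channels before they are regrouped per source. Sketch:

import torch
from network.modules.tf_gridnet_modules import TFGridnetDeconv

deconv = TFGridnetDeconv(emb_dim=48, n_srcs=2)
feats = torch.rand(1, 48, 65, 100)       # (B, emb_dim, F, T)
out = deconv(feats)
assert out.shape == (1, 2, 2, 65, 100)   # (B, n_srcs, real/imag, F, T)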
network/modules/tf_gridnet_modules/dimension_embedding.py
ADDED
@@ -0,0 +1,15 @@
+import torch.nn as nn
+from typing import Tuple
+class DimensionEmbedding(nn.Module):
+    def __init__(
+        self, audio_channel:int = 1,emb_dim:int = 48,
+        kernel_size: Tuple[int,int] = (3,3),
+        padding = "same",eps=1.0e-5
+    ) -> None:
+        super().__init__()
+        self.emb = nn.Sequential(
+            nn.Conv2d(2*audio_channel, emb_dim, kernel_size,padding=padding),  # 2*audio_channel: real and imaginary STFT planes
+            nn.GroupNorm(1,emb_dim,eps=eps)
+        )
+    def forward(self,input):
+        return self.emb(input)
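DimensionEmbedding expects the real and imaginary STFT planes stacked on the channel axis and lifts them to emb_dim channels. Sketch:

import torch
from network.modules.tf_gridnet_modules import DimensionEmbedding

embed = DimensionEmbedding(audio_channel=1, emb_dim=48)
spec = torch.rand(4, 2, 65, 100)     # (B, 2*audio_channel, F, T)
x = embed(spec)
assert x.shape == (4, 48, 65, 100)   # (B, emb_dim, F, T)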
network/modules/tf_gridnet_modules/tf_gridnet_block.py
ADDED
@@ -0,0 +1,255 @@
+import torch.nn as nn
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+import math
+
+if hasattr(torch, "bfloat16"):
+    HALF_PRECISION_DTYPES = (torch.float16, torch.bfloat16)
+else:
+    HALF_PRECISION_DTYPES = (torch.float16,)
+
+class IntraAndInterBandModule(nn.Module):
+    def __init__(
+        self, emb_dim:int = 48,
+        kernel_size:int = 4,
+        emb_hop_size:int = 1,
+        hidden_channels:int = 192,
+        eps = 1e-5
+    ) -> None:
+        super().__init__()
+        self.emb_dim = emb_dim
+        self.emb_hs = emb_hop_size
+        self.kernel_size = kernel_size
+        in_channels = emb_dim * kernel_size
+
+        self.intra_norm = nn.LayerNorm(emb_dim,eps=eps)
+
+        self.intra_lstm = nn.LSTM(
+            in_channels,hidden_channels,1,batch_first=True,bidirectional=True
+        )
+
+        if kernel_size == emb_hop_size:
+            self.intra_linear = nn.Linear(hidden_channels*2, in_channels)
+        else:
+            self.intra_linear = nn.ConvTranspose1d(hidden_channels*2, emb_dim,kernel_size ,emb_hop_size)
+
+        self.inter_norm = nn.LayerNorm(emb_dim, eps=eps)
+        self.inter_lstm = nn.LSTM(
+            in_channels,hidden_channels,1,batch_first=True,bidirectional=True
+        )
+
+        if kernel_size == emb_hop_size:
+            self.inter_linear = nn.Linear(hidden_channels*2, in_channels)
+        else:
+            self.inter_linear = nn.ConvTranspose1d(hidden_channels*2, emb_dim,kernel_size ,emb_hop_size)
+    def forward(self,x):
+        """
+        Args:
+            x (torch.Tensor): [B C Q T]
+        Output:
+            output (torch.Tensor): [B C Q T]
+        """
+        B, C, old_Q, old_T = x.shape
+
+        padding = self.kernel_size - self.emb_hs
+
+        T = (
+            math.ceil((old_T + 2 * padding - self.kernel_size) / self.emb_hs) * self.emb_hs
+            + self.kernel_size
+        )
+        Q = (
+            math.ceil((old_Q + 2 * padding - self.kernel_size) / self.emb_hs) * self.emb_hs
+            + self.kernel_size
+        )
+
+        input = rearrange(x, "B C Q T -> B T Q C")
+        input = F.pad(input, (0, 0, padding, Q - old_Q - padding, padding, T - old_T - padding))
+        intra_rnn = self.intra_norm(input)
+        if self.kernel_size == self.emb_hs:
+            intra_rnn = intra_rnn.view([B * T, -1, self.kernel_size * C])
+            intra_rnn, _ = self.intra_lstm(intra_rnn)
+            intra_rnn = self.intra_linear(intra_rnn)
+            intra_rnn = intra_rnn.view([B, T, Q, C])
+        else:
+            intra_rnn = rearrange(intra_rnn,"B T Q C -> (B T) C Q")
+            intra_rnn = F.unfold(
+                intra_rnn[...,None],(self.kernel_size,1),stride=(self.emb_hs,1)
+            )
+            intra_rnn = intra_rnn.transpose(1, 2)  # [BT, -1, C*I]
+            intra_rnn, _ = self.intra_lstm(intra_rnn)
+            intra_rnn = intra_rnn.transpose(1, 2)  # [BT, H, -1]
+            intra_rnn = self.intra_linear(intra_rnn)  # [BT, C, Q]
+            intra_rnn = intra_rnn.view([B, T, C, Q])
+            intra_rnn = intra_rnn.transpose(-2, -1)  # [B, T, Q, C]
+        intra_rnn = intra_rnn + input
+        inter_input = rearrange(intra_rnn, "B T Q C -> B Q T C")
+        inter_rnn = self.inter_norm(inter_input)
+        if self.kernel_size == self.emb_hs:
+            inter_rnn = inter_rnn.view([B * Q, -1, self.kernel_size * C])
+            inter_rnn, _ = self.inter_lstm(inter_rnn)
+            inter_rnn = self.inter_linear(inter_rnn)  # fixed: was self.inter_linear(intra_rnn)
+            inter_rnn = inter_rnn.view([B, Q, T, C])
+        else:
+            inter_rnn = rearrange(inter_rnn,"B Q T C -> (B Q) C T")
+            inter_rnn = F.unfold(
+                inter_rnn[...,None],(self.kernel_size,1),stride=(self.emb_hs,1)
+            )
+            inter_rnn = inter_rnn.transpose(1, 2)  # [BQ, -1, C*I]
+            inter_rnn,_ = self.inter_lstm(inter_rnn)
+            inter_rnn = inter_rnn.transpose(1, 2)  # [BQ, H, -1]
+            inter_rnn = self.inter_linear(inter_rnn)  # [BQ, C, T]
+            inter_rnn = inter_rnn.view([B, Q, C, T])
+            inter_rnn = inter_rnn.transpose(-2, -1)  # [B, Q, T, C]
+        inter_rnn = inter_rnn + inter_input
+
+        inter_rnn = rearrange(inter_rnn,"B Q T C -> B C Q T")
+        inter_rnn = inter_rnn[..., padding : padding + old_Q, padding : padding + old_T]
+
+        return inter_rnn
+
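The module zero-pads both axes so the unfold windows (kernel_size positions with hop emb_hs) tile the padded grid exactly, runs a BiLSTM within each band and then across frames, and finally crops the padding away, making the block shape-preserving. A quick check with the defaults:

import torch
from network.modules.tf_gridnet_modules import IntraAndInterBandModule

band = IntraAndInterBandModule(emb_dim=48, kernel_size=4, emb_hop_size=1)
x = torch.rand(1, 48, 65, 100)   # (B, C, Q, T): Q frequency bands, T frames
y = band(x)
assert y.shape == x.shape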
+class LayerNormalization(nn.Module):
+    def __init__(self, input_dim, dim=1, total_dim=4, eps=1e-5):
+        super().__init__()
+        self.dim = dim if dim >= 0 else total_dim + dim
+        param_size = [1 if ii != self.dim else input_dim for ii in range(total_dim)]
+        self.gamma = nn.Parameter(torch.Tensor(*param_size).to(torch.float32))
+        self.beta = nn.Parameter(torch.Tensor(*param_size).to(torch.float32))
+        nn.init.ones_(self.gamma)
+        nn.init.zeros_(self.beta)
+        self.eps = eps
+
+    @torch.cuda.amp.autocast(enabled=False)
+    def forward(self, x):
+        if x.ndim - 1 < self.dim:
+            raise ValueError(
+                f"Expect x to have {self.dim + 1} dimensions, but got {x.ndim}"
+            )
+        if x.dtype in HALF_PRECISION_DTYPES:
+            dtype = x.dtype
+            x = x.float()  # compute the statistics in fp32 even under autocast
+        else:
+            dtype = None
+        mu_ = x.mean(dim=self.dim, keepdim=True)
+        std_ = torch.sqrt(x.var(dim=self.dim, unbiased=False, keepdim=True) + self.eps)
+        x_hat = ((x - mu_) / std_) * self.gamma + self.beta
+        return x_hat.to(dtype=dtype) if dtype else x_hat
+
+class AllHeadPReLULayerNormalization4DC(nn.Module):
+    def __init__(self, input_dimension, eps=1e-5):
+        super().__init__()
+        assert len(input_dimension) == 2, input_dimension
+        H, E = input_dimension
+        param_size = [1, H, E, 1, 1]
+        self.gamma = nn.Parameter(torch.Tensor(*param_size).to(torch.float32))
+        self.beta = nn.Parameter(torch.Tensor(*param_size).to(torch.float32))
+        nn.init.ones_(self.gamma)
+        nn.init.zeros_(self.beta)
+        self.act = nn.PReLU(num_parameters=H, init=0.25)
+        self.eps = eps
+        self.H = H
+        self.E = E
+
+    def forward(self, x):
+        assert x.ndim == 4
+        B, _, T, F = x.shape
+        x = x.view([B, self.H, self.E, T, F])
+        x = self.act(x)  # [B,H,E,T,F]
+        stat_dim = (2,)
+        mu_ = x.mean(dim=stat_dim, keepdim=True)  # [B,H,1,T,1]
+        std_ = torch.sqrt(
+            x.var(dim=stat_dim, unbiased=False, keepdim=True) + self.eps
+        )  # [B,H,1,T,1]
+        x = ((x - mu_) / std_) * self.gamma + self.beta  # [B,H,E,T,F]
+        return x
+
+class CrossFrameSelfAttention(nn.Module):
+    def __init__(
+        self,
+        emb_dim = 48,
+        n_freqs = 65,
+        n_head=4,
+        qk_output_channel=4,
+        activation="PReLU",
+        eps = 1e-5
+
+    ):
+        super().__init__()
+        assert emb_dim % n_head == 0
+        E = qk_output_channel
+        self.conv_Q = nn.Conv2d(emb_dim,n_head*E,1)
+        self.norm_Q = AllHeadPReLULayerNormalization4DC((n_head, E), eps=eps)
+
+        self.conv_K = nn.Conv2d(emb_dim,n_head*E,1)
+        self.norm_K = AllHeadPReLULayerNormalization4DC((n_head, E), eps=eps)
+
+        self.conv_V = nn.Conv2d(emb_dim, emb_dim, 1)
+        self.norm_V = AllHeadPReLULayerNormalization4DC((n_head, emb_dim // n_head), eps=eps)
+
+        self.concat_proj = nn.Sequential(
+            nn.Conv2d(emb_dim,emb_dim,1),
+            getattr(nn,activation)(),
+            LayerNormalization(emb_dim, dim=-3, total_dim=4, eps=eps),
+        )
+        self.emb_dim = emb_dim
+        self.n_head = n_head
+    def forward(self,x):
+        """
+        Args:
+            x: (torch.Tensor) [B C Q T]
+        Output:
+            output: (torch.Tensor) [B C Q T]
+        """
+
+        input = rearrange(x,"B C Q T -> B C T Q")
+        Q = self.norm_Q(self.conv_Q(input))  # [B, n_head, C, T, Q]
+        K = self.norm_K(self.conv_K(input))
+        V = self.norm_V(self.conv_V(input))
+
+        Q = rearrange(Q, "B H C T Q -> (B H) T (C Q)")
+        K = rearrange(K, "B H C T Q -> (B H) (C Q) T").contiguous()
+        batch, n_head, channel, frame, freq = V.shape
+        V = rearrange(V, "B H C T Q -> (B H) T (C Q)")
+        emb_dim = Q.shape[-1]
+        qkT = torch.matmul(Q, K) / (emb_dim**0.5)
+        qkT = F.softmax(qkT,dim=2)
+        att = torch.matmul(qkT,V)
+        att = rearrange(att, "(B H) T (C Q) -> B (H C) T Q", C=channel, Q=freq, H=n_head, B=batch, T=frame)
+        att = self.concat_proj(att)
+        out = att + input
+        out = rearrange(out, "B C T Q -> B C Q T")
+        return out
+
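The attention folds the heads into the batch dimension and flattens (channel, frequency) into the token embedding, so each of the T frames is one token. Shape check:

import torch
from network.modules.tf_gridnet_modules import CrossFrameSelfAttention

attn = CrossFrameSelfAttention(emb_dim=48, n_freqs=65, n_head=4, qk_output_channel=4)
x = torch.rand(1, 48, 65, 100)   # (B, C, Q, T)
y = attn(x)
assert y.shape == x.shape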
+class TFGridnetBlock(nn.Module):
+    def __init__(
+        self,
+        emb_dim = 48,
+        kernel_size:int = 4,
+        emb_hop_size:int = 1,
+        n_freqs = 65,
+        hidden_channels:int = 192,
+        n_head=4,
+        qk_output_channel=4,
+        activation="PReLU",
+        eps = 1e-5
+    ):
+        super().__init__()
+        self.tf_grid_block = nn.Sequential(
+            IntraAndInterBandModule(
+                emb_dim=emb_dim,
+                kernel_size=kernel_size,
+                emb_hop_size=emb_hop_size,
+                hidden_channels=hidden_channels,
+                eps=eps
+            ),
+            CrossFrameSelfAttention(
+                emb_dim=emb_dim,
+                n_freqs=n_freqs,
+                n_head=n_head,
+                qk_output_channel=qk_output_channel,
+                activation=activation,
+                eps=eps
+            )
+        )
+    def forward(self,input):
+        return self.tf_grid_block(input)
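TFGridnetBlock chains the band module and the cross-frame attention, and both are shape-preserving, so a full model can stack several of these blocks back to back. Sketch:

import torch
from network.modules.tf_gridnet_modules import TFGridnetBlock

block = TFGridnetBlock(emb_dim=48, n_freqs=65)
x = torch.rand(1, 48, 65, 100)   # (B, C, Q, T)
y = block(x)
assert y.shape == x.shape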
network/utils/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from .error_message import ErrorMessageUtil
+from .enum_declare import *
network/utils/enum_declare.py
ADDED
@@ -0,0 +1,6 @@
+from dataclasses import dataclass
+
+@dataclass
+class STFT_transform_type_enum:
+    exponent = "exponent"
+    log = "log"
network/utils/error_message.py
ADDED
@@ -0,0 +1,7 @@
+from dataclasses import dataclass
+
+@dataclass
+class ErrorMessageUtil:
+    only_support_batch_input = "Only batch input is supported. For a single input, call .unsqueeze(0) first."
+    complex_format_convert = "the input must satisfy input.size(1) == 2"
+    two_input_in_the_same_shape = "the two inputs must have the same shape"
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+torch==2.0.1
+torchaudio==2.0.1
+numpy==1.23.5
+einops==0.7.0
+pandas==1.5.3
+scikit-learn==1.2.2
+resemblyzer
+gradio