Spaces:

hieupham14022003
/

wanna_hear_your_voice

Running

App Files Files Community

hieugiaosu commited on Sep 4, 2024

Commit

9160a00

1 Parent(s): 42e1680

update model

Browse files

Files changed (3) hide show

network/models/TF_gridnet.py +102 -0
network/models/__init__.py +1 -0
network/modules/convolution_module.py +20 -0

network/models/TF_gridnet.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import torch.nn as nn
+from ..modules.tf_gridnet_modules import *
+from ..modules.input_tranformation import STFTInput, RMSNormalizeInput
+from ..modules.output_transformation import WaveGeneratorByISTFT, RMSDenormalizeOutput
+class TF_Gridnet(nn.Module):
+    def __init__(
+            self,
+            n_srcs=2,
+            n_fft=128,
+            hop_length=64,
+            window="hann",
+            n_audio_channel=1,
+            n_layers=6,
+            input_kernel_size_T = 3,
+            input_kernel_size_F = 3,
+            output_kernel_size_T = 3,
+            output_kernel_size_F = 3,
+            lstm_hidden_units=192,
+            attn_n_head=4,
+            qk_output_channel=4,
+            emb_dim=48,
+            emb_ks=4,
+            emb_hs=1,
+            activation="PReLU",
+            eps=1.0e-5,
+            ):
+        super().__init__()
+        self.input_normalize = RMSNormalizeInput((1,2),keepdim=True)
+        self.stft = STFTInput(
+            n_fft=n_fft,
+            win_length=n_fft,
+            hop_length=hop_length,
+            window=window,
+        )
+        self.istft = WaveGeneratorByISTFT(
+            n_fft=n_fft,
+            win_length=n_fft,
+            hop_length=hop_length,
+            window=window
+        )
+        self.output_denormalize = RMSDenormalizeOutput()
+        self.dimension_embedding = DimensionEmbedding(
+            audio_channel=n_audio_channel,
+            emb_dim=emb_dim,
+            kernel_size=(input_kernel_size_F,input_kernel_size_T),
+            eps=eps
+        )
+        self.tf_gridnet_block = nn.Sequential(
+            *[
+                TFGridnetBlock(
+                    emb_dim=emb_dim,
+                    kernel_size=emb_ks,
+                    emb_hop_size=emb_hs,
+                    hidden_channels=lstm_hidden_units,
+                    n_head=attn_n_head,
+                    qk_output_channel=qk_output_channel,
+                    activation=activation,
+                    eps=eps
+                ) for _ in range(n_layers)
+            ]
+        )
+        self.deconv = TFGridnetDeconv(
+            emb_dim=emb_dim,
+            n_srcs=n_srcs,
+            kernel_size_T=output_kernel_size_T,
+            kernel_size_F=output_kernel_size_F,
+            padding_F=output_kernel_size_F//2,
+            padding_T=output_kernel_size_T//2
+            )
+    def forward(self,input):
+        audio_length = input.shape[-1]
+        x = input
+        if x.dim() == 2:
+            x = x.unsqueeze(1)
+        x, std = self.input_normalize(x)
+        x = self.stft(x)
+        x = self.dimension_embedding(x)
+        x = self.tf_gridnet_block(x)
+        x = self.deconv(x)
+        x = rearrange(x,"B C N F T -> B N C F T") #becasue in istft, the 1 dim is for real and im part
+        x = self.istft(x,audio_length)
+        x = self.output_denormalize(x,std)
+        return x

network/models/__init__.py CHANGED Viewed

@@ -1,2 +1,3 @@
 from .TF_gridnet_with_condition import *
 from .embedding_model import ResemblyzerVoiceEncoder

+from .TF_gridnet import *
 from .TF_gridnet_with_condition import *
 from .embedding_model import ResemblyzerVoiceEncoder

network/modules/convolution_module.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import torch.nn as nn
+from einops import rearrange
+class SplitFeatureDeconv(nn.Module):
+    def __init__(
+            self,
+            emb_dim = 48,
+            n_srcs = 2,
+            kernel_size_T = 3,
+            kernel_size_F = 3,
+            padding_T = 1,
+            padding_F = 1,
+            ) -> None:
+        super().__init__()
+        self.n_srcs = n_srcs
+        self.deconv = nn.ConvTranspose2d(emb_dim, n_srcs * emb_dim, (kernel_size_F,kernel_size_T), padding=(padding_F,padding_T))
+    def forward(self,input):
+        output = self.deconv(input)
+        output = rearrange(output,"B (N C) F T -> B C N F T", C=self.n_srcs)
+        return output