import torch
from torch import nn
from librosa.filters import mel as librosa_mel_fn
from stft import STFT

torch.manual_seed(1234)

# Constants for the dynamic range compression applied to the mel spectrograms:
# values are clamped to clip_val before the log and scaled by C.
clip_val = 1e-5
C = 1


class convolutional_module(nn.Module):
    """This class defines a 1d convolutional layer and its initialization for the system we are
    replicating"""
    def __init__(self, in_ch, out_ch, kernel_size=1, stride=1, padding=None, dilation=1, bias=True,
                 w_init_gain='linear'):
        # In PyTorch, models are defined as subclasses of torch.nn.Module.
        super(convolutional_module, self).__init__()
        if padding is None:
            assert kernel_size % 2 == 1
            padding = int(dilation * (kernel_size - 1) / 2)

        # Initialize the convolutional layer, an instance of torch.nn.Conv1d.
        # torch.nn.Conv1d internally calls torch.nn.functional.conv1d, which expects an input of
        # shape (minibatch x in_channels x input_w) and a weight of shape
        # (out_channels x (in_channels / groups) x kernel_w). We do not split into groups, so in
        # our case the input shape is (48 x 512 x 189) and the weight is (512 x 512 x 5).
        self.conv_layer = torch.nn.Conv1d(in_ch, out_ch, kernel_size=kernel_size, stride=stride,
                                          padding=padding, dilation=dilation, bias=bias)

        """Useful information of Xavier initialization in:
        https://prateekvjoshi.com/2016/03/29/understanding-xavier-initialization-in-deep-neural-networks/"""
        torch.nn.init.xavier_uniform_(self.conv_layer.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        conv_output = self.conv_layer(x)
        return conv_output
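
# Minimal usage sketch (not part of the model): the shapes below simply echo the ones mentioned
# in the comment above and are assumptions for illustration only, e.g. a batch of 48 sequences
# of 189 frames with 512 channels run through a kernel-size-5 convolution.
#
#   conv = convolutional_module(512, 512, kernel_size=5, w_init_gain='relu')
#   x = torch.randn(48, 512, 189)   # (batch, in_channels, width)
#   y = conv(x)                     # -> (48, 512, 189); the width is preserved by the padding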


class linear_module(torch.nn.Module):
    """This class defines a linear layer and its initialization method for the system we are
    replicating. This implements a linear transformation: y = xA^t + b"""
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super(linear_module, self).__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

        torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)
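
# Minimal usage sketch (illustrative dimensions only): a 128-dim projection of 512-dim features,
# initialized with the Xavier gain for tanh (torch.nn.init.calculate_gain('tanh') == 5/3).
#
#   proj = linear_module(512, 128, bias=False, w_init_gain='tanh')
#   out = proj(torch.randn(48, 189, 512))  # -> (48, 189, 128); nn.Linear acts on the last dim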


class location_layer(nn.Module):
    def __init__(self, attention_n_filters, attention_kernel_size, attention_dim):
        super(location_layer, self).__init__()
        padding = int((attention_kernel_size - 1) / 2)
        """We are being very restricting without training a bias"""
        """I think in_channels = 2 is k (number of vectors for every encoded stage position from prev.
        alignment)."""
        self.location_conv = convolutional_module(2, attention_n_filters, kernel_size=attention_kernel_size,
                                                  padding=padding, bias=False, stride=1, dilation=1)
        self.location_dense = linear_module(attention_n_filters, attention_dim, bias=False,
                                            w_init_gain='tanh')

    def forward(self, attention_weights_cat):
        processed_attention = self.location_conv(attention_weights_cat)
        processed_attention = processed_attention.transpose(1, 2)
        processed_attention = self.location_dense(processed_attention)
        return processed_attention
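
# Minimal usage sketch (illustrative shapes; the filter sizes are assumptions for the example):
#
#   loc = location_layer(attention_n_filters=32, attention_kernel_size=31, attention_dim=128)
#   weights_cat = torch.randn(48, 2, 189)   # (batch, [previous, cumulative] weights, encoder steps)
#   feats = loc(weights_cat)                # -> (48, 189, 128) location features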


class TacotronSTFT(nn.Module):
    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
                 n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
                 mel_fmax=8000.0):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        mel_basis = librosa_mel_fn(sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, 
                                   fmin=mel_fmin, fmax=mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)

    def spectral_de_normalize(self, magnitudes):
        """Inverts the log dynamic range compression applied in mel_spectrogram."""
        output = torch.exp(magnitudes) / C
        return output

    def mel_spectrogram(self, y):
        """Computes mel spectrograms from a batch of waveforms.
        PARAMS
        ------
        y: torch.FloatTensor with shape (B, T), values in the range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T_frames)
        """
        assert torch.min(y.data) >= -1
        assert torch.max(y.data) <= 1

        magnitudes, phases = self.stft_fn.transform(y)
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        # Dynamic range compression; spectral_de_normalize is its inverse.
        mel_output = torch.log(torch.clamp(mel_output, min=clip_val) * C)
        return mel_output
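
# Minimal end-to-end sketch (not part of the module): assumes stft.py from the replicated
# repository is importable; the one-second dummy waveform is purely illustrative.
if __name__ == "__main__":
    taco_stft = TacotronSTFT()
    wav = torch.rand(1, 22050) * 2 - 1    # (B, T) waveform in [-1, 1]
    mel = taco_stft.mel_spectrogram(wav)  # -> (1, 80, T_frames)
    print(mel.shape)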