import torch
from torch import nn
from torch.nn import functional as F

from nn_layers import convolutional_module

torch.manual_seed(1234)


class Encoder(nn.Module):
    """This is the encoder part of Tacotron 2. It consists of a stack of three 1-d convolutional
    layers, each followed by batch normalization, a ReLU activation and dropout, and a final
    bidirectional LSTM layer. This part encodes sequences of embedded input characters."""
    def __init__(self, encoder_params):
        super(Encoder, self).__init__()
        # Dropout with p=0.5 is applied after every convolutional layer, as specified in the
        # Tacotron 2 paper. It is applied functionally in forward()/inference(), so no module is
        # registered here.
        # self.dropout = nn.Dropout(0.5)
        # A stack of convolutional layers. For this model there are 3 conv1d layers. We build a
        # Python list and loop as many times as there are convolutional layers (three). In each
        # iteration we create an nn.Sequential container, which lets us group a block of
        # neural-network modules (here: convolution + batch normalization). The resulting list of
        # three identical blocks is then registered with nn.ModuleList, so its parameters are
        # tracked and it can be iterated over or indexed.
        # To see how the convolution output is computed:
        # https://pytorch.org/docs/stable/nn.html#conv1d
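        # For reference, a sketch of the conv1d length arithmetic (the standard PyTorch formula,
        # not project-specific code):
        #   L_out = floor((L_in + 2*padding - dilation*(kernel_size - 1) - 1) / stride + 1)
        # With the settings used below (stride 1, dilation 1, padding = (kernel_size - 1) / 2,
        # e.g. kernel_size 5 -> padding 2), the time dimension is preserved: L_out == L_in.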
        stack_of_convolutions = []
        for _ in range(encoder_params['encoder_convs']):
            conv_layer = nn.Sequential(
                convolutional_module(encoder_params['symbols_embedding_length'],
                                     encoder_params['symbols_embedding_length'],
                                     kernel_size=encoder_params['conv_kernel_size'],
                                     stride=encoder_params['conv_stride'],
                                     padding=int((encoder_params['conv_kernel_size'] - 1) / 2),
                                     dilation=encoder_params['conv_dilation'],
                                     w_init_gain=encoder_params['w_init_gain']),
                nn.BatchNorm1d(encoder_params['symbols_embedding_length']))
            stack_of_convolutions.append(conv_layer)
        self.stack_conv = nn.ModuleList(stack_of_convolutions)
        # The last part of the encoder is the bidirectional LSTM layer. As described in the
        # original Tacotron 2 paper, there is a single BiLSTM layer with 256 units per direction.
        # TODO: check whether the bidirectional LSTM layer can be added to the same container as
        # the convolutional stack.
        self.bi_lstm = nn.LSTM(encoder_params['symbols_embedding_length'],
                               int(encoder_params['symbols_embedding_length'] / 2), 1,
                               batch_first=True, bidirectional=True)

    def forward(self, input_sequences, input_lengths):
        # input_sequences: (batch, symbols_embedding_length, max_seq_length)
        for conv in self.stack_conv:
            input_sequences = F.dropout(F.relu(conv(input_sequences)), 0.5, self.training)
        # (batch, embedding, seq_length) -> (batch, seq_length, embedding) for the batch-first LSTM
        input_sequences = input_sequences.transpose(1, 2)
        # The convolutions use stride 1 and dilation 1 with padding (kernel_size - 1) / 2, so the
        # original sequence length is unchanged by the convolutional stack.
        # pack_padded_sequence expects the lengths on the CPU
        input_lengths = input_lengths.cpu().numpy()
        # Pack the padded batch into a PackedSequence of variable-length sequences before passing
        # it through the BiLSTM layer
        input_sequences = nn.utils.rnn.pack_padded_sequence(input_sequences, input_lengths, batch_first=True)
        # nn.LSTM accepts packed variable-length sequence tensors and also returns a packed
        # variable-length sequence. Once unpacked, the output dimension is
        # (seq_length, batch, num_directions * hidden_size), or
        # (batch, seq_length, num_directions * hidden_size) when batch_first is True.
        self.bi_lstm.flatten_parameters()
        outputs, _ = self.bi_lstm(input_sequences)
        # Pad the packed output back to a regular padded tensor
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
        return outputs  # [N, max_seq_length, symbols_embedding_length]

    def inference(self, x):
        # Same as forward, but without packing/padding. In eval mode self.training is False, so
        # the dropout calls below are no-ops.
        for conv in self.stack_conv:
            x = F.dropout(F.relu(conv(x)), 0.5, self.training)
        x = x.transpose(1, 2)
        self.bi_lstm.flatten_parameters()
        outputs, _ = self.bi_lstm(x)
        return outputs
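

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. The hyperparameter values below are
    # assumptions taken from the Tacotron 2 paper; adjust them to your own configuration and to
    # the signature of nn_layers.convolutional_module.
    encoder_params = {
        'encoder_convs': 3,                # number of conv1d blocks
        'symbols_embedding_length': 512,   # character-embedding size
        'conv_kernel_size': 5,
        'conv_stride': 1,
        'conv_dilation': 1,
        'w_init_gain': 'relu',
    }
    encoder = Encoder(encoder_params)
    encoder.eval()

    # Batch of 2 embedded character sequences, shape (batch, embedding_length, max_seq_length).
    # Lengths must be sorted in decreasing order for pack_padded_sequence's default settings.
    dummy_inputs = torch.randn(2, encoder_params['symbols_embedding_length'], 20)
    dummy_lengths = torch.tensor([20, 15])

    with torch.no_grad():
        encoded = encoder(dummy_inputs, dummy_lengths)
    print(encoded.shape)  # expected: torch.Size([2, 20, 512])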