# tacotron2-gst-en / Encoder.py
import torch
from torch import nn
from torch.nn import functional as F
from nn_layers import convolutional_module
torch.manual_seed(1234)
class Encoder(nn.Module):
"""This is the encoder part of tacotron2. It includes a stack of three 1d convolutional layers
followed by batch normalization and ReLU activations, and a bidirectional LSTM layer.
These part encodes sequences of input characters."""
def __init__(self, encoder_params):
super(Encoder, self).__init__()
        # Dropout with p=0.5 is applied after each convolutional layer, as specified in the Tacotron 2
        # paper; it is applied functionally in forward() rather than registered as a module here.
        # A stack of convolutional layers. For this model there are three Conv1d layers. We build a
        # Python list and loop once per convolutional layer. On each iteration we create an
        # nn.Sequential container that groups a block of modules (convolution + batch normalization).
        # The list is then registered with nn.ModuleList so its parameters are tracked by the model
        # (a ModuleList can be iterated over or indexed like a regular list).
        # To see how the convolution output is computed:
        # https://pytorch.org/docs/stable/nn.html#conv1d
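        # For reference, the linked docs give the Conv1d output length as
        #   L_out = floor((L_in + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + 1),
        # so with stride 1, dilation 1, and padding = (kernel_size - 1) // 2 (used below), the time
        # dimension of the input is preserved.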
stack_of_convolutions = []
for _ in range(encoder_params['encoder_convs']):
conv_layer = nn.Sequential(convolutional_module(encoder_params['symbols_embedding_length'],
encoder_params['symbols_embedding_length'],
kernel_size=encoder_params['conv_kernel_size'],
stride=encoder_params['conv_stride'],
padding=int((encoder_params['conv_kernel_size'] - 1) / 2),
dilation=encoder_params['conv_dilation'],
w_init_gain=encoder_params['w_init_gain']),
nn.BatchNorm1d(encoder_params['symbols_embedding_length']))
stack_of_convolutions.append(conv_layer)
self.stack_conv = nn.ModuleList(stack_of_convolutions)
        # The last part of the encoder is the bidirectional LSTM layer. As described in the original
        # Tacotron 2 paper, there is a single BiLSTM layer with 256 units per direction. The hidden
        # size is symbols_embedding_length / 2 per direction, so the concatenated forward and backward
        # outputs have the same dimensionality as the input embeddings (e.g. 512 -> 2 x 256).
        # TODO: check whether the bidirectional LSTM could be built into the same container as the
        # convolutional stack.
self.bi_lstm = nn.LSTM(encoder_params['symbols_embedding_length'],
int(encoder_params['symbols_embedding_length'] / 2), 1, batch_first=True,
bidirectional=True)
def forward(self, input_sequences, input_lengths):
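        """Encode a padded batch of embedded character sequences.

        input_sequences is expected to be channels-first, [N, E_length, Max_seq_length], as required
        by Conv1d; input_lengths holds the true (unpadded) length of each sequence in the batch.
        """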
for conv in self.stack_conv:
input_sequences = F.dropout(F.relu(conv(input_sequences)), 0.5, self.training)
input_sequences = input_sequences.transpose(1, 2)
        # Assuming conv_stride == 1 and conv_dilation == 1 (the Tacotron 2 defaults), the padding of
        # (kernel_size - 1) // 2 preserves the sequence length, so input_lengths is still valid here.
        # pack_padded_sequence expects the lengths on the CPU.
        input_lengths = input_lengths.cpu().numpy()
        # Pack the padded batch into a PackedSequence of variable-length sequences before passing it
        # through the BiLSTM layer (the batch is expected to be sorted by decreasing length).
input_sequences = nn.utils.rnn.pack_padded_sequence(input_sequences, input_lengths, batch_first=True)
        # nn.LSTM accepts packed variable-length sequences and also returns its output packed. Once
        # unpacked, the output has shape (seq_length, batch, num_directions * hidden_size), or
        # (batch, seq_length, num_directions * hidden_size) when batch_first=True.
        # Compact the LSTM weights into contiguous memory (avoids a performance warning when the
        # weights become fragmented, e.g. after moving the model or using DataParallel).
        self.bi_lstm.flatten_parameters()
        outputs, _ = self.bi_lstm(input_sequences)
        # Unpack (re-pad) the output back into a regular padded tensor.
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
return outputs # [N, Max_seq_length, E_length]
def inference(self, x):
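        """Encode a batch at inference time; no packing or length masking is applied here."""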
for conv in self.stack_conv:
x = F.dropout(F.relu(conv(x)), 0.5, self.training)
x = x.transpose(1, 2)
self.bi_lstm.flatten_parameters()
outputs, _ = self.bi_lstm(x)
return outputs
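

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). The hyperparameter values below are
# assumptions based on the typical Tacotron 2 configuration (512-dim character
# embeddings, three conv layers, kernel size 5); they are not read from this
# repository's actual hparams.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    encoder_params = {
        'encoder_convs': 3,
        'symbols_embedding_length': 512,
        'conv_kernel_size': 5,
        'conv_stride': 1,
        'conv_dilation': 1,
        'w_init_gain': 'relu',
    }
    encoder = Encoder(encoder_params)
    # A batch of 2 embedded character sequences: [N, E_length, Max_seq_length].
    dummy_inputs = torch.randn(2, 512, 40)
    # True (unpadded) lengths, sorted in decreasing order as pack_padded_sequence expects.
    dummy_lengths = torch.tensor([40, 32])
    encoded = encoder(dummy_inputs, dummy_lengths)
    print(encoded.shape)  # expected: torch.Size([2, 40, 512])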