# tacotron2-gst-en / Encoder.py
import torch
from torch import nn
from torch.nn import functional as F
from nn_layers import convolutional_module
torch.manual_seed(1234)
class Encoder(nn.Module):
"""This is the encoder part of tacotron2. It includes a stack of three 1d convolutional layers
followed by batch normalization and ReLU activations, and a bidirectional LSTM layer.
These part encodes sequences of input characters."""
def __init__(self, encoder_params):
super(Encoder, self).__init__()
        # Dropout with p=0.5 is applied after each convolutional layer, as specified in the Tacotron 2
        # paper; it is applied functionally in forward() rather than registered as a module here.
        # A stack of convolutional layers. For this model there are three Conv1d layers. We build a
        # Python list and loop once per convolutional layer. On each iteration we create an
        # nn.Sequential container that groups a block of modules (convolution + batch normalization).
        # The list is then registered with nn.ModuleList so its parameters are tracked by the model
        # (a ModuleList can be iterated over or indexed like a regular list).
        # To see how the convolution output is computed:
        # https://pytorch.org/docs/stable/nn.html#conv1d
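        # For reference, the linked docs give the Conv1d output length as
        #   L_out = floor((L_in + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + 1),
        # so with stride 1, dilation 1, and padding = (kernel_size - 1) // 2 (used below), the time
        # dimension of the input is preserved.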
stack_of_convolutions = []
for _ in range(encoder_params['encoder_convs']):
conv_layer = nn.Sequential(convolutional_module(encoder_params['symbols_embedding_length'],
encoder_params['symbols_embedding_length'],
kernel_size=encoder_params['conv_kernel_size'],
stride=encoder_params['conv_stride'],
padding=int((encoder_params['conv_kernel_size'] - 1) / 2),
dilation=encoder_params['conv_dilation'],
w_init_gain=encoder_params['w_init_gain']),
nn.BatchNorm1d(encoder_params['symbols_embedding_length']))
stack_of_convolutions.append(conv_layer)
self.stack_conv = nn.ModuleList(stack_of_convolutions)
        # The last part of the encoder is the bidirectional LSTM layer. As described in the original
        # Tacotron 2 paper, there is a single BiLSTM layer with 256 units per direction. The hidden
        # size is symbols_embedding_length / 2 per direction, so the concatenated forward and backward
        # outputs have the same dimensionality as the input embeddings (e.g. 512 -> 2 x 256).
        # TODO: check whether the bidirectional LSTM could be built into the same container as the
        # convolutional stack.
self.bi_lstm = nn.LSTM(encoder_params['symbols_embedding_length'],
int(encoder_params['symbols_embedding_length'] / 2), 1, batch_first=True,
bidirectional=True)
def forward(self, input_sequences, input_lengths):
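        """Encode a padded batch of embedded character sequences.

        input_sequences is expected to be channels-first, [N, E_length, Max_seq_length], as required
        by Conv1d; input_lengths holds the true (unpadded) length of each sequence in the batch.
        """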
for conv in self.stack_conv:
input_sequences = F.dropout(F.relu(conv(input_sequences)), 0.5, self.training)
input_sequences = input_sequences.transpose(1, 2)
        # Assuming conv_stride == 1 and conv_dilation == 1 (the Tacotron 2 defaults), the padding of
        # (kernel_size - 1) // 2 preserves the sequence length, so input_lengths is still valid here.
        # pack_padded_sequence expects the lengths on the CPU.
        input_lengths = input_lengths.cpu().numpy()
        # Pack the padded batch into a PackedSequence of variable-length sequences before passing it
        # through the BiLSTM layer (the batch is expected to be sorted by decreasing length).
input_sequences = nn.utils.rnn.pack_padded_sequence(input_sequences, input_lengths, batch_first=True)
        # nn.LSTM accepts packed variable-length sequences and also returns its output packed. Once
        # unpacked, the output has shape (seq_length, batch, num_directions * hidden_size), or
        # (batch, seq_length, num_directions * hidden_size) when batch_first=True.
        # Compact the LSTM weights into contiguous memory (avoids a performance warning when the
        # weights become fragmented, e.g. after moving the model or using DataParallel).
        self.bi_lstm.flatten_parameters()
        outputs, _ = self.bi_lstm(input_sequences)
        # Unpack (re-pad) the output back into a regular padded tensor.
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
return outputs # [N, Max_seq_length, E_length]
def inference(self, x):
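        """Encode a batch at inference time; no packing or length masking is applied here."""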
for conv in self.stack_conv:
x = F.dropout(F.relu(conv(x)), 0.5, self.training)
x = x.transpose(1, 2)
self.bi_lstm.flatten_parameters()
outputs, _ = self.bi_lstm(x)
return outputs
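

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). The hyperparameter values below are
# assumptions based on the typical Tacotron 2 configuration (512-dim character
# embeddings, three conv layers, kernel size 5); they are not read from this
# repository's actual hparams.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    encoder_params = {
        'encoder_convs': 3,
        'symbols_embedding_length': 512,
        'conv_kernel_size': 5,
        'conv_stride': 1,
        'conv_dilation': 1,
        'w_init_gain': 'relu',
    }
    encoder = Encoder(encoder_params)
    # A batch of 2 embedded character sequences: [N, E_length, Max_seq_length].
    dummy_inputs = torch.randn(2, 512, 40)
    # True (unpadded) lengths, sorted in decreasing order as pack_padded_sequence expects.
    dummy_lengths = torch.tensor([40, 32])
    encoded = encoder(dummy_inputs, dummy_lengths)
    print(encoded.shape)  # expected: torch.Size([2, 40, 512])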