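"""Data loading utilities for Tacotron-style training.

DataPreparation maps (wav_path, transcript) pairs to
(text_sequence, mel_spectrogram) pairs; DataCollate zero-pads a batch to
common lengths and builds the gate (stop-token) and GST prosody targets.
"""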
import random

import numpy as np
import torch
import torch.utils.data
from scipy.io.wavfile import read

import nn_layers
from text import text_to_sequence
from hyper_parameters import tacotron_params

torch.manual_seed(1234)


class DataPreparation(torch.utils.data.Dataset):
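    """Maps (wav_path, transcript) pairs to (text_sequence, mel_spectrogram)
    training pairs, computing mel spectrograms with TacotronSTFT on the fly."""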

    def __init__(self, audiopaths_and_text, tacotron_hyperparams):
        self.audiopaths_and_text = audiopaths_and_text
        self.audio_text_parameters = tacotron_hyperparams
        self.stft = nn_layers.TacotronSTFT(tacotron_hyperparams['filter_length'], tacotron_hyperparams['hop_length'],
                                           tacotron_hyperparams['win_length'], tacotron_hyperparams['n_mel_channels'],
                                           tacotron_hyperparams['sampling_rate'],
                                           tacotron_hyperparams['mel_fmin'], tacotron_hyperparams['mel_fmax'])
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)

    def load_audiowav_torch(self, audiopath, samp_rate):
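        # scipy.io.wavfile.read returns (sample_rate, data); 16-bit PCM files
        # yield an int16 array, converted to float32 below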
        sr, data = read(audiopath)
        assert samp_rate == sr, "Sample rate {} does not match the configured {}".format(sr, samp_rate)

        return torch.FloatTensor(data.astype(np.float32))

    def melspec_textSequence_pair(self, audiopath_and_text):
        wav_path, sentence = audiopath_and_text[0], audiopath_and_text[1]
        # wav to torch tensor
        wav_torch = self.load_audiowav_torch(wav_path, self.audio_text_parameters['sampling_rate'])
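        # scale by max_wav_value (e.g. 32768.0 for 16-bit audio) into [-1, 1]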
        wav_torch_norm = wav_torch / self.audio_text_parameters['max_wav_value']
        wav_torch_norm = wav_torch_norm.unsqueeze(0)
        mel_spec = self.stft.mel_spectrogram(wav_torch_norm)
        mel_spec = torch.squeeze(mel_spec, 0)
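        # mel_spec shape: (n_mel_channels, n_frames)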
        # text to torch integer tensor sequence
        sentence_sequence = torch.IntTensor(text_to_sequence(sentence, self.audio_text_parameters['text_cleaners']))

        return sentence_sequence, mel_spec

    def __getitem__(self, index):
        return self.melspec_textSequence_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)


class DataCollate:
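    """Zero-pads a batch of (text, mel) pairs to common lengths and builds
    the gate (stop-token) and GST prosody targets."""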

    def __init__(self, number_frames_step):
        self.number_frames_step = number_frames_step

    def __call__(self, batch):
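        # batch: list of (text_sequence, mel_spectrogram) pairs from
        # DataPreparation; sort by text length, longest first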
        inp_lengths, sorted_decreasing = torch.sort(torch.LongTensor([len(x[0]) for x in batch]),
                                                    dim=0, descending=True)
        max_length_in = inp_lengths[0]

        # pad the text sequences to the longest sentence in the batch
        sentences_padded = torch.LongTensor(len(batch), max_length_in)
        sentences_padded.zero_()
        for i in range(len(sorted_decreasing)):
            int_seq_sentence = batch[sorted_decreasing[i]][0]
            # copy the sequence into the leading slots; the rest stays zero
            sentences_padded[i, :int_seq_sentence.size(0)] = int_seq_sentence

        # number of mel channels (rows of every spectrogram)
        num_melfilters = batch[0][1].size(0)

        # longest mel spectrogram (in frames) in the batch
        # (note: this deviates from the original reference code)
        max_length_target = max([x[1].size(1) for x in batch])
        # round up so the length is a multiple of the decoder frames per step
        if max_length_target % self.number_frames_step != 0:
            max_length_target += self.number_frames_step - max_length_target % self.number_frames_step
            assert max_length_target % self.number_frames_step == 0

        # pad mel spectrograms into one 3D tensor: (batch, n_mel_channels, max_length_target)
        melspec_padded = torch.FloatTensor(len(batch), num_melfilters, max_length_target)
        melspec_padded.zero_()

        # GST: zero-padded prosody reference matrices, same shape as the mels
        prosody_padded = torch.FloatTensor(len(batch), num_melfilters, max_length_target)
        prosody_padded.zero_()

        gate_padded = torch.FloatTensor(len(batch), max_length_target)
        gate_padded.zero_()
        output_lengths = torch.LongTensor(len(batch))

        for j in range(len(sorted_decreasing)):
            melspec = batch[sorted_decreasing[j]][1]
            melspec_padded[j, :, :melspec.size(1)] = melspec

            # GST: fill the prosody reference with the same (padded) mel spectrogram
            prosody_padded[j, :, :melspec.size(1)] = melspec

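            # gate (stop-token) target: 1 from the last real frame onward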
            gate_padded[j, melspec.size(1) - 1:] = 1
            output_lengths[j] = melspec.size(1)

        return sentences_padded, inp_lengths, melspec_padded, gate_padded, output_lengths, prosody_padded
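

# --- Usage sketch ------------------------------------------------------------
# A minimal, hedged example of wiring DataPreparation and DataCollate into a
# torch.utils.data.DataLoader. The metadata file name and its
# "<wav_path>|<transcript>" line format, the batch size, and
# number_frames_step=1 are illustrative assumptions, not values taken from
# this repository.
if __name__ == '__main__':
    # hypothetical metadata file: one "<wav_path>|<transcript>" line per clip
    with open('metadata.csv', encoding='utf-8') as f:
        audiopaths_and_text = [line.strip().split('|') for line in f]

    dataset = DataPreparation(audiopaths_and_text, tacotron_params)
    collate_fn = DataCollate(number_frames_step=1)  # assumed decoder step size
    loader = torch.utils.data.DataLoader(
        dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

    sentences, inp_lengths, mels, gates, out_lengths, prosody = next(iter(loader))
    print(sentences.shape, mels.shape, gates.shape)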