File size: 2,109 Bytes
12da6cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from argparse import Namespace
from pathlib import Path
from typing import NamedTuple

from jax.numpy import ndarray


class FLAGS(Namespace):
    """Project-wide configuration constants.

    Every setting is a class attribute, so it can be read as
    ``FLAGS.<name>`` without creating an instance.
    """

    # ---- model / schedule settings ----
    duration_lstm_dim = 256
    vocab_size = 256
    duration_embed_dropout_rate = 0.5
    num_training_steps = 200_000
    postnet_dim = 512
    acoustic_decoder_dim = 512
    acoustic_encoder_dim = 256

    # ---- dataset limits ----
    # The phoneme sequence length must stay a multiple of 256; the check
    # below exists to prevent a compilation error on the Colab T4 GPU.
    max_phoneme_seq_len = 1 * 256
    assert max_phoneme_seq_len % 256 == 0
    max_wave_len = 3 * 64 * 1024  # presumably counted in samples -- TODO confirm

    # ---- phoneme inventory (Montreal Forced Aligner) ----
    # Special entries, in order: [sil], [sp], [spn], [word end].
    special_phonemes = ["sil", "sp", "spn", " "]
    sil_index = special_phonemes.index("sil")
    word_end_index = special_phonemes.index(" ")
    sp_index = sil_index  # "sp" is not used; it shares the index of "sil"
    # Regular phonemes, written as one flat list (ten entries per row).
    _normal_phonemes = [
        "a", "b", "c", "d", "e", "g", "h", "i", "k", "l",
        "m", "n", "o", "p", "q", "r", "s", "t", "u", "v",
        "x", "y", "à", "á", "â", "ã", "è", "é", "ê", "ì",
        "í", "ò", "ó", "ô", "õ", "ù", "ú", "ý", "ă", "đ",
        "ĩ", "ũ", "ơ", "ư", "ạ", "ả", "ấ", "ầ", "ẩ", "ẫ",
        "ậ", "ắ", "ằ", "ẳ", "ẵ", "ặ", "ẹ", "ẻ", "ẽ", "ế",
        "ề", "ể", "ễ", "ệ", "ỉ", "ị", "ọ", "ỏ", "ố", "ồ",
        "ổ", "ỗ", "ộ", "ớ", "ờ", "ở", "ỡ", "ợ", "ụ", "ủ",
        "ứ", "ừ", "ử", "ữ", "ự", "ỳ", "ỵ", "ỷ", "ỹ",
    ]

    # ---- signal processing ----
    mel_dim = 80
    n_fft = 1024
    sample_rate = 16_000
    fmin = 0.0
    fmax = 8_000

    # ---- optimisation ----
    batch_size = 64
    learning_rate = 1e-4
    duration_learning_rate = 1e-4
    max_grad_norm = 1.0
    weight_decay = 1e-4
    token_mask_prob = 0.1

    # ---- filesystem locations (checkpoints and training data) ----
    ckpt_dir = Path("assets/infore/nat")
    data_dir = Path("train_data")


class DurationInput(NamedTuple):
    """Immutable record with three array fields: phonemes, lengths, durations.

    Field order matters for positional construction and unpacking.
    """

    # NOTE(review): array shapes and dtypes are not fixed in this file --
    # confirm them against the code that builds these records.
    phonemes: ndarray  # presumably phoneme indices -- TODO confirm
    lengths: ndarray  # presumably per-item phoneme sequence lengths -- TODO confirm
    durations: ndarray  # presumably per-phoneme durations -- TODO confirm


class AcousticInput(NamedTuple):
    """Immutable record with six array fields.

    Fields, in positional order: phonemes, lengths, durations, wavs,
    wav_lengths, mels.
    """

    # NOTE(review): array shapes and dtypes are not fixed in this file --
    # confirm them against the code that builds these records.
    phonemes: ndarray  # presumably phoneme indices -- TODO confirm
    lengths: ndarray  # presumably per-item phoneme sequence lengths -- TODO confirm
    durations: ndarray  # presumably per-phoneme durations -- TODO confirm
    wavs: ndarray  # presumably raw waveforms -- TODO confirm
    wav_lengths: ndarray  # presumably per-item waveform lengths -- TODO confirm
    mels: ndarray  # presumably mel-spectrogram features -- TODO confirm