# File size: 2,109 Bytes
# Revision: 12da6cc
from argparse import Namespace
from pathlib import Path
from typing import NamedTuple
from jax.numpy import ndarray
class FLAGS(Namespace):
    """Configurations.

    Every setting is a plain class attribute, so it can be read as
    ``FLAGS.<name>`` without creating an instance.
    """

    # network dimensions and other top-level settings
    duration_lstm_dim = 256
    vocab_size = 256
    duration_embed_dropout_rate = 0.5
    num_training_steps = 200000
    postnet_dim = 512
    acoustic_decoder_dim = 512
    acoustic_encoder_dim = 256

    # dataset
    max_phoneme_seq_len = 1 * 256
    # Must stay a multiple of 256: prevents a compilation error on Colab T4 GPU.
    assert max_phoneme_seq_len % 256 == 0
    max_wave_len = 3 * 64 * 1024

    # Montreal Forced Aligner
    # Entries, in order: [sil], [sp], [spn], [word end]
    special_phonemes = ["sil", "sp", "spn", " "]
    sil_index = special_phonemes.index("sil")
    sp_index = sil_index  # "sp" is not used, so it shares the "sil" index
    word_end_index = special_phonemes.index(" ")
    # Whitespace-separated symbols; ``split()`` turns them into a list of
    # one string per symbol, in the order written here.
    _normal_phonemes = """
        a b c d e g h i k l
        m n o p q r s t u v
        x y à á â ã è é ê ì
        í ò ó ô õ ù ú ý ă đ
        ĩ ũ ơ ư ạ ả ấ ầ ẩ ẫ
        ậ ắ ằ ẳ ẵ ặ ẹ ẻ ẽ ế
        ề ể ễ ệ ỉ ị ọ ỏ ố ồ
        ổ ỗ ộ ớ ờ ở ỡ ợ ụ ủ
        ứ ừ ử ữ ự ỳ ỵ ỷ ỹ
    """.split()

    # dsp
    mel_dim = 80
    n_fft = 1024
    sample_rate = 16000
    fmin = 0.0
    fmax = 8000

    # training
    batch_size = 64
    learning_rate = 1e-4
    duration_learning_rate = 1e-4
    max_grad_norm = 1.0
    weight_decay = 1e-4
    token_mask_prob = 0.1

    # ckpt
    ckpt_dir = Path("assets/infore/nat")
    data_dir = Path("train_data")
class DurationInput(NamedTuple):
    """Immutable bundle of three arrays: phonemes, lengths and durations.

    NOTE(review): array shapes and dtypes are not visible in this file;
    confirm them against the code that builds these tuples.
    """

    # Field order is part of the interface (positional construction/unpacking).
    phonemes: ndarray
    lengths: ndarray  # presumably per-item valid lengths -- TODO confirm
    durations: ndarray
class AcousticInput(NamedTuple):
    """Immutable bundle of six arrays.

    Fields, in order: phonemes, lengths, durations, wavs, wav_lengths, mels.

    NOTE(review): array shapes and dtypes are not visible in this file;
    confirm them against the code that builds these tuples.
    """

    # Field order is part of the interface (positional construction/unpacking).
    phonemes: ndarray
    lengths: ndarray  # presumably valid lengths of `phonemes` -- TODO confirm
    durations: ndarray
    wavs: ndarray
    wav_lengths: ndarray  # presumably valid lengths of `wavs` -- TODO confirm
    mels: ndarray
|