|
from argparse import Namespace |
|
from pathlib import Path |
|
from typing import NamedTuple |
|
|
|
from jax.numpy import ndarray |
|
|
|
|
|
class FLAGS(Namespace):
    """Configurations.

    Every setting is a plain class attribute, so values are read as
    ``FLAGS.<name>`` without creating an instance.
    """

    # --- Model sizes -------------------------------------------------------
    duration_lstm_dim = 256
    vocab_size = 256
    duration_embed_dropout_rate = 0.5
    num_training_steps = 200_000
    postnet_dim = 512
    acoustic_decoder_dim = 512
    acoustic_encoder_dim = 256

    # --- Sequence length limits --------------------------------------------
    max_phoneme_seq_len = 256 * 1
    # The phoneme length limit must stay a multiple of 256.
    assert max_phoneme_seq_len % 256 == 0
    max_wave_len = 1024 * 64 * 3

    # --- Phoneme inventory -------------------------------------------------
    special_phonemes = ["sil", "sp", "spn", " "]
    sil_index = special_phonemes.index("sil")
    # NOTE(review): "sp" reuses the index of "sil" rather than its own
    # position in `special_phonemes` -- confirm this aliasing is intended.
    sp_index = sil_index
    word_end_index = special_phonemes.index(" ")
    # Leading `[]` keeps every row of symbols formatted identically.
    _normal_phonemes = (
        []
        + ["a", "b", "c", "d", "e", "g", "h", "i", "k", "l"]
        + ["m", "n", "o", "p", "q", "r", "s", "t", "u", "v"]
        + ["x", "y", "à", "á", "â", "ã", "è", "é", "ê", "ì"]
        + ["í", "ò", "ó", "ô", "õ", "ù", "ú", "ý", "ă", "đ"]
        + ["ĩ", "ũ", "ơ", "ư", "ạ", "ả", "ấ", "ầ", "ẩ", "ẫ"]
        + ["ậ", "ắ", "ằ", "ẳ", "ẵ", "ặ", "ẹ", "ẻ", "ẽ", "ế"]
        + ["ề", "ể", "ễ", "ệ", "ỉ", "ị", "ọ", "ỏ", "ố", "ồ"]
        + ["ổ", "ỗ", "ộ", "ớ", "ờ", "ở", "ỡ", "ợ", "ụ", "ủ"]
        + ["ứ", "ừ", "ử", "ữ", "ự", "ỳ", "ỵ", "ỷ", "ỹ"]
    )

    # --- Audio / mel-spectrogram settings ----------------------------------
    mel_dim = 80
    n_fft = 1024
    sample_rate = 16000
    fmin = 0.0
    fmax = 8000  # numerically half of `sample_rate`

    # --- Optimisation settings ---------------------------------------------
    batch_size = 64
    learning_rate = 1e-4
    duration_learning_rate = 1e-4
    max_grad_norm = 1.0
    weight_decay = 1e-4
    token_mask_prob = 0.1

    # --- Filesystem locations (relative paths) -----------------------------
    ckpt_dir = Path("assets/infore/nat")
    data_dir = Path("train_data")
|
|
|
|
|
class DurationInput(NamedTuple):
    """Immutable bundle of three arrays: phonemes, lengths and durations.

    Going by the class name this is the input record for the duration
    model; the consumer is not visible in this file.
    """

    # NOTE(review): array shapes and dtypes are not established here --
    # confirm against the data loader before relying on them.
    phonemes: ndarray
    lengths: ndarray
    durations: ndarray
|
|
|
|
|
class AcousticInput(NamedTuple):
    """Immutable bundle of six arrays describing text and audio together.

    Going by the class name this is the input record for the acoustic
    model; the consumer is not visible in this file.
    """

    # NOTE(review): array shapes and dtypes are not established here --
    # confirm against the data loader before relying on them.
    phonemes: ndarray
    lengths: ndarray
    durations: ndarray
    wavs: ndarray
    wav_lengths: ndarray
    mels: ndarray
|
|