File size: 3,742 Bytes
15ff819
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
{
  "args": null,
  "audio_embedding_dim": 2048,
  "audio_embedding_dropout": 0.0,
  "audio_max_length": 20.0,
  "audio_min_length": 2.0,
  "audio_pad_token": 2050,
  "audio_positional_embedding_dropout": 0.0,
  "audio_vocab_size": "2048",
  "batch_size": 100,
  "clipping_update_period": 1000,
  "codebook_weight": "[2,1,1,1]",
  "d_model": 2048,
  "dataset": "gigaspeech",
  "dataset_dir": "/data/scratch/pyp/datasets/gigaspeech_phn_enc_manifest/xl",
  "drop_long": 1,
  "dynamic_batching": 1,
  "early_stop_step": 3200,
  "early_stop_threshold": -1.0,
  "empty_token": 2048,
  "encodec_folder_name": "encodec_16khz_4codebooks",
  "encodec_sr": 50,
  "eog": 2049,
  "eos": 2051,
  "exp_dir": "/data/scratch/pyp/exp_pyp/VoiceCraft/gigaspeech/tts_enhanced_830M",
  "gradient_accumulation_steps": 24,
  "gradient_clip_val": 1.0,
  "load_model_from": null,
  "lr": 1e-05,
  "manifest_name": "manifest_large16khz_lessambi",
  "mask_len_max": 600,
  "mask_len_min": 1,
  "mask_sample_dist": "poisson1",
  "max_mask_portion": 0.9,
  "max_n_spans": 3,
  "max_num_tokens": 50000,
  "min_gap": 5,
  "n_codebooks": 4,
  "n_special": 4,
  "nhead": 16,
  "num_buckets": 10,
  "num_decoder_layers": 16,
  "num_epochs": 10,
  "num_steps": 500000,
  "num_workers": 8,
  "optimizer_name": "AdamW",
  "pad_x": 0,
  "phn2num": {
    "!": 17,
    "\"": 97,
    ",": 64,
    ".": 77,
    "1": 80,
    ":": 93,
    ";": 81,
    "<MUSIC>": 39,
    "<NOISE>": 52,
    "<OTHER>": 60,
    "<SIL>": 53,
    "?": 78,
    "_": 15,
    "a\u026a": 48,
    "a\u026a\u0259": 56,
    "a\u026a\u025a": 2,
    "a\u028a": 36,
    "b": 20,
    "d": 72,
    "d\u0292": 57,
    "e": 85,
    "e\u026a": 6,
    "f": 69,
    "h": 14,
    "i": 27,
    "i\u0259": 42,
    "i\u02d0": 68,
    "i\u02d0\u02d0": 51,
    "j": 67,
    "k": 41,
    "kh": 84,
    "l": 63,
    "m": 9,
    "n": 23,
    "n\u02b2": 8,
    "o": 86,
    "o\u028a": 25,
    "o\u02d0": 74,
    "o\u02d0\u0279": 40,
    "p": 34,
    "q": 96,
    "r": 79,
    "s": 66,
    "t": 73,
    "t\u0255": 87,
    "t\u0283": 75,
    "t\u02b0": 94,
    "u": 1,
    "u\u02d0": 47,
    "v": 31,
    "w": 19,
    "x": 4,
    "z": 22,
    "\u00a1": 98,
    "\u00ab": 88,
    "\u00bb": 89,
    "\u00bf": 95,
    "\u00e6": 32,
    "\u00e6\u00e6": 50,
    "\u00e7": 10,
    "\u00f0": 7,
    "\u014b": 58,
    "\u0250": 70,
    "\u0250\u0250": 71,
    "\u0251": 61,
    "\u0251\u02d0": 0,
    "\u0251\u02d0\u0279": 44,
    "\u0252": 83,
    "\u0254": 3,
    "\u0254\u026a": 13,
    "\u0254\u02d0": 29,
    "\u0254\u02d0\u0279": 33,
    "\u0259": 54,
    "\u0259l": 16,
    "\u0259\u028a": 90,
    "\u025a": 35,
    "\u025b": 18,
    "\u025b\u0279": 11,
    "\u025b\u02d0": 82,
    "\u025c\u02d0": 21,
    "\u0261": 49,
    "\u0261\u02b2": 37,
    "\u026a": 65,
    "\u026a\u0279": 76,
    "\u026a\u02d0": 100,
    "\u026c": 46,
    "\u026f": 91,
    "\u0279": 5,
    "\u027e": 24,
    "\u0283": 26,
    "\u028a": 43,
    "\u028a\u0279": 28,
    "\u028c": 38,
    "\u0292": 55,
    "\u0294": 59,
    "\u0303": 45,
    "\u0329": 12,
    "\u03b8": 30,
    "\u1d7b": 62,
    "\u2014": 99,
    "\u2026": 92
  },
  "phn_folder_name": "phonemes",
  "precision": "float16",
  "print_every_n_steps": 800,
  "pseudo_epoch_size": 3000,
  "reduce_lr_start_epoch": 4,
  "reduce_lr_start_step": 3000,
  "reduced_eog": 1,
  "resume": false,
  "seed": 1,
  "shuffle_mask_embedding": 0,
  "special_first": 0,
  "tb_write_every_n_steps": 100,
  "text_embedding_dropout": 0.0,
  "text_max_length": 400,
  "text_min_length": 10.0,
  "text_pad_token": 120,
  "text_positional_embedding_dropout": 0.0,
  "text_vocab_size": 120,
  "trm_dropout": 0.0,
  "val_every_n_steps": 3200,
  "val_max_num_tokens": 6000,
  "warmup_fraction": 0.1,
  "weight_decay": 0.0
}