hndrbrm commited on
Commit
0774298
·
1 Parent(s): 42aac44
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .idea/
README.md CHANGED
@@ -1,12 +1,148 @@
1
  ---
2
- title: F5 Tts Id Space
3
- emoji: 😻
 
 
 
 
 
 
 
 
4
  colorFrom: red
5
- colorTo: purple
 
 
 
 
6
  sdk: gradio
7
- sdk_version: 5.10.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  app_file: app.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  pinned: false
10
- ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ # Reference:
3
+ # https://huggingface.co/docs/hub/spaces-config-reference
4
+
5
+ # Display title for the Space.
6
+ title: F5-TTS-ID
7
+
8
+ # Space emoji (emoji-only character allowed).
9
+ emoji: ⛏
10
+
11
+ # Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray).
12
  colorFrom: red
13
+
14
+ # Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray).
15
+ colorTo: gray
16
+
17
+ # Can be either gradio, streamlit, docker, or static.
18
  sdk: gradio
19
+
20
+ # Any valid Python 3.x or 3.x.x version.
21
+ # Defaults to 3.10.
22
+ # python_version: string
23
+
24
+ # Specify the version of the selected SDK (Streamlit or Gradio).
25
+ # All versions of Gradio are supported.
26
+ # All versions of Streamlit from 0.79.0 are supported.
27
+ sdk_version: 5.9.1
28
+
29
+ # Specify the suggested hardware on which this Space must be run.
30
+ # Useful for Spaces that are meant to be duplicated by other users.
31
+ # Setting this value will not automatically assign a hardware to this Space.
32
+ # Value must be a valid hardware flavor. Current valid hardware flavors:
33
+ # * CPU: "cpu-basic", "cpu-upgrade"
34
+ # * GPU: "t4-small", "t4-medium", "l4x1", "l4x4", "a10g-small", "a10g-large", "a10g-largex2", "a10g-largex4","a100-large"
35
+ # * TPU: "v5e-1x1", "v5e-2x2", "v5e-2x4"
36
+ # suggested_hardware: string
37
+
38
+ # Specify the suggested permanent storage on which this Space must be run.
39
+ # Useful for Spaces that are meant to be duplicated by other users.
40
+ # Setting this value will not automatically assign a permanent storage to this Space.
41
+ # Value must be one of "small", "medium" or "large".
42
+ # suggested_storage: string
43
+
44
+ # Path to your main application file (which contains either gradio or streamlit Python code, or static html code).
45
+ # Path is relative to the root of the repository.
46
  app_file: app.py
47
+
48
+ # Port on which your application is running.
49
+ # Used only if sdk is docker.
50
+ # Default port is 7860.
51
+ # app_port: int
52
+
53
+ # For non-static Spaces, initial url to render. Needs to start with /.
54
+ # For static Spaces, use app_file instead.
55
+ # base_path: string
56
+
57
+ # Whether your Space is rendered inside a full-width (when true)
58
+ # or fixed-width column (ie. “container” CSS) inside the iframe.
59
+ # Defaults to true.
60
+ # fullWidth: boolean
61
+
62
+ # Can be either mini or default.
63
+ # If header is set to mini the space will be displayed full-screen with a mini floating header.
64
+ # header: string
65
+
66
+ # A short description of the Space.
67
+ # This will be displayed in the Space’s thumbnail.
68
+ short_description: F5-TTS for the Indonesian language
69
+
70
+ # HF model IDs (like openai-community/gpt2 or deepset/roberta-base-squad2) used in the Space.
71
+ # Will be parsed automatically from your code if not specified here.
72
+ # models : List[string]
73
+
74
+ # HF dataset IDs (like mozilla-foundation/common_voice_13_0 or oscar-corpus/OSCAR-2109) used in the Space.
75
+ # Will be parsed automatically from your code if not specified here.
76
+ # datasets : List[string]
77
+
78
+ # List of terms that describe your Space task or scope.
79
+ # tags : List[string]
80
+
81
+ # URL for defining a custom thumbnail for social sharing.
82
+ # thumbnail: string
83
+
84
+ # Whether the Space stays on top of your profile.
85
+ # Can be useful if you have a lot of Spaces so you and others can quickly see your best Space.
86
  pinned: false
 
87
 
88
+ # Whether a connected OAuth app is associated to this Space.
89
+ # See Adding a Sign-In with HF button to your Space for more details.
90
+ # hf_oauth : boolean
91
+
92
+ # Authorized scopes of the connected OAuth app.
93
+ # openid and profile are authorized by default and do not need this parameter.
94
+ # See Adding a Sign-In with HF button to your space for more details.
95
+ # hf_oauth_scopes : List[string]
96
+
97
+ # Duration of the OAuth token in minutes.
98
+ # Defaults to 480 minutes (8 hours).
99
+ # Maximum duration is 43200 minutes (30 days).
100
+ # See Adding a Sign-In with HF button to your space for more details.
101
+ # hf_oauth_expiration_minutes : int
102
+
103
+ # Whether the Space iframe can be embedded in other websites.
104
+ # Defaults to false, i.e. Spaces can be embedded.
105
+ # disable_embedding : boolean
106
+
107
+ # Set a custom startup duration timeout for your Space.
108
+ # This is the maximum time your Space is allowed to start before it times out and is flagged as unhealthy.
109
+ # Defaults to 30 minutes, but any valid duration (like 1h, 30m) is acceptable.
110
+ # startup_duration_timeout: string
111
+
112
+ # Set custom HTTP headers that will be added to all HTTP responses when serving your Space.
113
+ # For now, only the cross-origin-embedder-policy (COEP), cross-origin-opener-policy (COOP), and cross-origin-resource-policy (CORP) headers are allowed.
114
+ # These headers can be used to set up a cross-origin isolated environment and enable powerful features like SharedArrayBuffer,
115
+ # for example:
116
+ # custom_headers:
117
+ # cross-origin-embedder-policy: require-corp
118
+ # cross-origin-opener-policy: same-origin
119
+ # cross-origin-resource-policy: cross-origin
120
+ # Note: all headers and values must be lowercase.
121
+ # custom_headers : Dict[string, string]
122
+
123
+ # Specify a list of Hugging Face Hub models or other large files to be preloaded during the build time of your Space.
124
+ # This optimizes the startup time by having the files ready when your application starts.
125
+ # This is particularly useful for Spaces that rely on large models or datasets that would otherwise need to be downloaded at runtime.
126
+ #
127
+ # The format for each item is "repository_name" to download all files from a repository,
128
+ # or "repository_name file1,file2" for downloading specific files within that repository.
129
+ # You can also specify a specific commit to download using the format "repository_name file1,file2 commit_sha256".
130
+ #
131
+ # Example usage:
132
+ # preload_from_hub:
133
+ # - warp-ai/wuerstchen-prior text_encoder/model.safetensors,prior/diffusion_pytorch_model.safetensors
134
+ # - coqui/XTTS-v1
135
+ # - openai-community/gpt2 config.json 11c5a3d5811f50298f278a704980280950aedb10
136
+ #
137
+ # In this example, the Space will preload specific .safetensors files from warp-ai/wuerstchen-prior,
138
+ # the complete coqui/XTTS-v1 repository,
139
+ # and a specific revision of the config.json file in the openai-community/gpt2 repository
140
+ # from the Hugging Face Hub during build time.
141
+ #
142
+ # Files are saved in the default `huggingface_hub` disk cache `~/.cache/huggingface/hub`.
143
+ # If your application expects them elsewhere or you changed your `HF_HOME` variable, this pre-loading does not follow that at this time.
144
+ # preload_from_hub: List[string]
145
+
146
+ license: unknown
147
+
148
+ ---
app.py ADDED
@@ -0,0 +1,888 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ruff: noqa: E402
2
+ # Above allows ruff to ignore E402: module level import not at top of file
3
+
4
+ import json
5
+ import re
6
+ import tempfile
7
+ from collections import OrderedDict
8
+ from importlib.resources import files
9
+
10
+ import click
11
+ import gradio as gr
12
+ import numpy as np
13
+ import soundfile as sf
14
+ import torchaudio
15
+ from cached_path import cached_path
16
+ from transformers import AutoModelForCausalLM, AutoTokenizer
17
+
18
+ try:
19
+ import spaces
20
+
21
+ USING_SPACES = True
22
+ except ImportError:
23
+ USING_SPACES = False
24
+
25
+
26
def gpu_decorator(func):
    """Wrap ``func`` with ``spaces.GPU`` when the ``spaces`` package was imported.

    When ``USING_SPACES`` is false the function is returned unchanged, so the
    decorator is a no-op outside of that environment.
    """
    if not USING_SPACES:
        return func
    return spaces.GPU(func)
31
+
32
+
33
+ from f5_tts.model import DiT, UNetT
34
+ from f5_tts.infer.utils_infer import (
35
+ load_vocoder,
36
+ load_model,
37
+ preprocess_ref_audio_text,
38
+ infer_process,
39
+ remove_silence_for_generated_wav,
40
+ save_spectrogram,
41
+ )
42
+
43
+
44
+ DEFAULT_TTS_MODEL = "F5-TTS"
45
+ tts_model_choice = DEFAULT_TTS_MODEL
46
+
47
+ DEFAULT_TTS_MODEL_CFG = [
48
+ "hf://hndrbrm/f5_tts_id_model/model_id.safetensors",
49
+ "hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt",
50
+ json.dumps(dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)),
51
+ ]
52
+
53
+
54
+ # load models
55
+
56
+ vocoder = load_vocoder()
57
+
58
+
59
def load_f5tts(ckpt_path=str(cached_path("hf://hndrbrm/f5_tts_id_model/model_id.safetensors"))):
    """Build the F5-TTS ``DiT`` model from ``ckpt_path``.

    Args:
        ckpt_path: Local path of the checkpoint; the default is resolved
            through ``cached_path`` from the ``hf://`` URL above.

    Returns:
        Whatever ``load_model`` returns for the ``DiT`` architecture.
    """
    dit_settings = {
        "dim": 1024,
        "depth": 22,
        "heads": 16,
        "ff_mult": 2,
        "text_dim": 512,
        "conv_layers": 4,
    }
    return load_model(DiT, dit_settings, ckpt_path)
62
+
63
+
64
def load_e2tts(ckpt_path=None):
    """Build the E2-TTS ``UNetT`` model from ``ckpt_path``.

    Args:
        ckpt_path: Local path of the checkpoint. When ``None`` (the default)
            the official ``hf://SWivid/E2-TTS`` checkpoint is resolved through
            ``cached_path`` at call time.

    Returns:
        Whatever ``load_model`` returns for the ``UNetT`` architecture.
    """
    # Resolve the default lazily: a default-argument expression is evaluated
    # when the ``def`` statement runs, which would resolve the checkpoint at
    # import time even when E2-TTS is never loaded.
    if ckpt_path is None:
        ckpt_path = str(cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors"))
    E2TTS_model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
    return load_model(UNetT, E2TTS_model_cfg, ckpt_path)
67
+
68
+
69
def load_custom(ckpt_path: str, vocab_path="", model_cfg=None):
    """Build a ``DiT`` model from a user-supplied checkpoint and vocabulary.

    Args:
        ckpt_path: Checkpoint location; ``hf://`` URLs go through ``cached_path``.
        vocab_path: Vocabulary file location, resolved the same way.
        model_cfg: Architecture settings; ``None`` selects the F5-TTS defaults.

    Returns:
        Whatever ``load_model`` returns for the ``DiT`` architecture.
    """
    # Strip both inputs first, then resolve remote locations in the same order.
    checkpoint = ckpt_path.strip()
    vocabulary = vocab_path.strip()
    checkpoint, vocabulary = (
        str(cached_path(location)) if location.startswith("hf://") else location
        for location in (checkpoint, vocabulary)
    )
    settings = model_cfg
    if settings is None:
        settings = {
            "dim": 1024,
            "depth": 22,
            "heads": 16,
            "ff_mult": 2,
            "text_dim": 512,
            "conv_layers": 4,
        }
    return load_model(DiT, settings, checkpoint, vocab_file=vocabulary)
78
+
79
+
80
+ F5TTS_ema_model = load_f5tts()
81
+ E2TTS_ema_model = load_e2tts() if USING_SPACES else None
82
+ custom_ema_model, pre_custom_path = None, ""
83
+
84
+ chat_model_state = None
85
+ chat_tokenizer_state = None
86
+
87
+
88
@gpu_decorator
def generate_response(messages, model, tokenizer):
    """Generate one chat reply for ``messages`` with ``model`` and ``tokenizer``.

    The chat template is applied to the messages, up to 512 new tokens are
    generated, and only the newly generated tokens of the first sequence are
    decoded and returned.
    """
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    encoded = tokenizer([prompt], return_tensors="pt").to(model.device)

    sampling_options = {"max_new_tokens": 512, "temperature": 0.7, "top_p": 0.95}
    sequences = model.generate(**encoded, **sampling_options)

    # Drop the prompt tokens so that only the completion is decoded.
    completions = []
    for prompt_ids, full_ids in zip(encoded.input_ids, sequences):
        completions.append(full_ids[len(prompt_ids) :])

    decoded = tokenizer.batch_decode(completions, skip_special_tokens=True)
    return decoded[0]
109
+
110
+
111
@gpu_decorator
def infer(
    ref_audio_orig,
    ref_text,
    gen_text,
    model,
    remove_silence,
    cross_fade_duration=0.15,
    nfe_step=32,
    speed=1,
    show_info=gr.Info,
):
    """Synthesize ``gen_text`` in the voice of the reference audio.

    Args:
        ref_audio_orig: Reference audio handed to ``preprocess_ref_audio_text``.
        ref_text: Reference text handed to ``preprocess_ref_audio_text``.
        gen_text: Text to synthesize.
        model: ``"F5-TTS"``, ``"E2-TTS"``, or a list
            ``["Custom", ckpt_path, vocab_path, model_cfg]``.
        remove_silence: When truthy, post-process the audio with
            ``remove_silence_for_generated_wav``.
        cross_fade_duration: Forwarded to ``infer_process``.
        nfe_step: Forwarded to ``infer_process``.
        speed: Forwarded to ``infer_process``.
        show_info: Callable used for progress messages.

    Returns:
        ``((sample_rate, waveform), spectrogram_path, ref_text)``. When the
        reference audio or the text is missing, two ``gr.update()``
        placeholders and the unchanged ``ref_text`` are returned instead.

    Raises:
        ValueError: If ``model`` is not one of the supported choices.
        AssertionError: If a custom model is requested while ``USING_SPACES``.
    """
    import os  # local import: only needed to clean up the temporary wav file

    global E2TTS_ema_model, custom_ema_model, pre_custom_path

    if not ref_audio_orig:
        gr.Warning("Please provide reference audio.")
        return gr.update(), gr.update(), ref_text

    # Treat None like empty text instead of failing on ``None.strip()``.
    if not gen_text or not gen_text.strip():
        gr.Warning("Please enter text to generate.")
        return gr.update(), gr.update(), ref_text

    ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)

    if model == "F5-TTS":
        ema_model = F5TTS_ema_model
    elif model == "E2-TTS":
        # E2-TTS may not have been loaded at start-up; load it on first use.
        if E2TTS_ema_model is None:
            show_info("Loading E2-TTS model...")
            E2TTS_ema_model = load_e2tts()
        ema_model = E2TTS_ema_model
    elif isinstance(model, list) and model[0] == "Custom":
        assert not USING_SPACES, "Only official checkpoints allowed in Spaces."
        # Reload only when the requested checkpoint path changed.
        if pre_custom_path != model[1]:
            show_info("Loading Custom TTS model...")
            custom_ema_model = load_custom(model[1], vocab_path=model[2], model_cfg=model[3])
            pre_custom_path = model[1]
        ema_model = custom_ema_model
    else:
        # Previously this fell through and failed later with an unbound
        # ``ema_model``; fail early with a clear message instead.
        raise ValueError(f"Unknown TTS model choice: {model!r}")

    final_wave, final_sample_rate, combined_spectrogram = infer_process(
        ref_audio,
        ref_text,
        gen_text,
        ema_model,
        vocoder,
        cross_fade_duration=cross_fade_duration,
        nfe_step=nfe_step,
        speed=speed,
        show_info=show_info,
        progress=gr.Progress(),
    )

    # Remove silence
    if remove_silence:
        # Reserve a path, close the handle, then let the audio libraries work
        # on the file by name; the file is deleted once it has been reloaded.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            wav_path = f.name
        try:
            sf.write(wav_path, final_wave, final_sample_rate)
            # NOTE(review): presumably rewrites the file in place - confirm.
            remove_silence_for_generated_wav(wav_path)
            final_wave, _ = torchaudio.load(wav_path)
        finally:
            os.remove(wav_path)
        final_wave = final_wave.squeeze().cpu().numpy()

    # Save the spectrogram; the file is kept because its path is returned.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
        spectrogram_path = tmp_spectrogram.name
        save_spectrogram(combined_spectrogram, spectrogram_path)

    return (final_sample_rate, final_wave), spectrogram_path, ref_text
177
+
178
+
179
+ with gr.Blocks() as app_credits:
180
+ gr.Markdown("""
181
+ # Credits
182
+
183
+ * [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
184
+ * [RootingInLoad](https://github.com/RootingInLoad) for initial chunk generation and podcast app exploration
185
+ * [jpgallegoar](https://github.com/jpgallegoar) for multiple speech-type generation & voice chat
186
+ """)
187
+ with gr.Blocks() as app_tts:
188
+ gr.Markdown("# Batched TTS")
189
+ ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
190
+ gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
191
+ generate_btn = gr.Button("Synthesize", variant="primary")
192
+ with gr.Accordion("Advanced Settings", open=False):
193
+ ref_text_input = gr.Textbox(
194
+ label="Reference Text",
195
+ info="Leave blank to automatically transcribe the reference audio. If you enter text it will override automatic transcription.",
196
+ lines=2,
197
+ )
198
+ remove_silence = gr.Checkbox(
199
+ label="Remove Silences",
200
+ info="The model tends to produce silences, especially on longer audio. We can manually remove silences if needed. Note that this is an experimental feature and may produce strange results. This will also increase generation time.",
201
+ value=False,
202
+ )
203
+ speed_slider = gr.Slider(
204
+ label="Speed",
205
+ minimum=0.3,
206
+ maximum=2.0,
207
+ value=1.0,
208
+ step=0.1,
209
+ info="Adjust the speed of the audio.",
210
+ )
211
+ nfe_slider = gr.Slider(
212
+ label="NFE Steps",
213
+ minimum=4,
214
+ maximum=64,
215
+ value=32,
216
+ step=2,
217
+ info="Set the number of denoising steps.",
218
+ )
219
+ cross_fade_duration_slider = gr.Slider(
220
+ label="Cross-Fade Duration (s)",
221
+ minimum=0.0,
222
+ maximum=1.0,
223
+ value=0.15,
224
+ step=0.01,
225
+ info="Set the duration of the cross-fade between audio clips.",
226
+ )
227
+
228
+ audio_output = gr.Audio(label="Synthesized Audio")
229
+ spectrogram_output = gr.Image(label="Spectrogram")
230
+
231
@gpu_decorator
def basic_tts(
    ref_audio_input,
    ref_text_input,
    gen_text_input,
    remove_silence,
    cross_fade_duration_slider,
    nfe_slider,
    speed_slider,
):
    """Run ``infer`` with the current ``tts_model_choice`` and the UI values.

    Returns:
        ``(audio, spectrogram_path, ref_text)`` exactly as produced by ``infer``.
    """
    synthesis_options = {
        "cross_fade_duration": cross_fade_duration_slider,
        "nfe_step": nfe_slider,
        "speed": speed_slider,
    }
    result = infer(
        ref_audio_input,
        ref_text_input,
        gen_text_input,
        tts_model_choice,
        remove_silence,
        **synthesis_options,
    )
    audio_out, spectrogram_path, ref_text_out = result
    return audio_out, spectrogram_path, ref_text_out
252
+
253
+ generate_btn.click(
254
+ basic_tts,
255
+ inputs=[
256
+ ref_audio_input,
257
+ ref_text_input,
258
+ gen_text_input,
259
+ remove_silence,
260
+ cross_fade_duration_slider,
261
+ nfe_slider,
262
+ speed_slider,
263
+ ],
264
+ outputs=[audio_output, spectrogram_output, ref_text_input],
265
+ )
266
+
267
+
268
def parse_speechtypes_text(gen_text):
    """Split a script into ``{"style": ..., "text": ...}`` segments.

    A ``{name}`` marker switches the style for the text that follows it; text
    before the first marker uses the style ``"Regular"``. Whitespace-only text
    produces no segment.

    Args:
        gen_text: Script text; ``None`` or an empty string yields ``[]``.

    Returns:
        A list of dicts with keys ``"style"`` and ``"text"`` in script order.
    """
    if not gen_text:
        return []

    segments = []
    current_style = "Regular"

    # With one capture group, re.split alternates plain text (even positions)
    # and captured style names (odd positions).
    for position, token in enumerate(re.split(r"\{(.*?)\}", gen_text)):
        token = token.strip()
        if position % 2:
            current_style = token
        elif token:
            segments.append({"style": current_style, "text": token})

    return segments
291
+
292
+
293
+ with gr.Blocks() as app_multistyle:
294
+ # New section for multistyle generation
295
+ gr.Markdown(
296
+ """
297
+ # Multiple Speech-Type Generation
298
+
299
+ This section allows you to generate multiple speech types or multiple people's voices. Enter your text in the format shown below, and the system will generate speech using the appropriate type. If unspecified, the model will use the regular speech type. The current speech type will be used until the next speech type is specified.
300
+ """
301
+ )
302
+
303
+ with gr.Row():
304
+ gr.Markdown(
305
+ """
306
+ **Example Input:**
307
+ {Regular} Hello, I'd like to order a sandwich please.
308
+ {Surprised} What do you mean you're out of bread?
309
+ {Sad} I really wanted a sandwich though...
310
+ {Angry} You know what, darn you and your little shop!
311
+ {Whisper} I'll just go back home and cry now.
312
+ {Shouting} Why me?!
313
+ """
314
+ )
315
+
316
+ gr.Markdown(
317
+ """
318
+ **Example Input 2:**
319
+ {Speaker1_Happy} Hello, I'd like to order a sandwich please.
320
+ {Speaker2_Regular} Sorry, we're out of bread.
321
+ {Speaker1_Sad} I really wanted a sandwich though...
322
+ {Speaker2_Whisper} I'll give you the last one I was hiding.
323
+ """
324
+ )
325
+
326
+ gr.Markdown(
327
+ "Upload different audio clips for each speech type. The first speech type is mandatory. You can add additional speech types by clicking the 'Add Speech Type' button."
328
+ )
329
+
330
+ # Regular speech type (mandatory)
331
+ with gr.Row() as regular_row:
332
+ with gr.Column():
333
+ regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
334
+ regular_insert = gr.Button("Insert Label", variant="secondary")
335
+ regular_audio = gr.Audio(label="Regular Reference Audio", type="filepath")
336
+ regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=2)
337
+
338
+ # Regular speech type (max 100)
339
+ max_speech_types = 100
340
+ speech_type_rows = [regular_row]
341
+ speech_type_names = [regular_name]
342
+ speech_type_audios = [regular_audio]
343
+ speech_type_ref_texts = [regular_ref_text]
344
+ speech_type_delete_btns = [None]
345
+ speech_type_insert_btns = [regular_insert]
346
+
347
+ # Additional speech types (99 more)
348
+ for i in range(max_speech_types - 1):
349
+ with gr.Row(visible=False) as row:
350
+ with gr.Column():
351
+ name_input = gr.Textbox(label="Speech Type Name")
352
+ delete_btn = gr.Button("Delete Type", variant="secondary")
353
+ insert_btn = gr.Button("Insert Label", variant="secondary")
354
+ audio_input = gr.Audio(label="Reference Audio", type="filepath")
355
+ ref_text_input = gr.Textbox(label="Reference Text", lines=2)
356
+ speech_type_rows.append(row)
357
+ speech_type_names.append(name_input)
358
+ speech_type_audios.append(audio_input)
359
+ speech_type_ref_texts.append(ref_text_input)
360
+ speech_type_delete_btns.append(delete_btn)
361
+ speech_type_insert_btns.append(insert_btn)
362
+
363
+ # Button to add speech type
364
+ add_speech_type_btn = gr.Button("Add Speech Type")
365
+
366
+ # Keep track of autoincrement of speech types, no roll back
367
+ speech_type_count = 1
368
+
369
+ # Function to add a speech type
370
def add_speech_type_fn():
    """Reveal the next hidden speech-type row, if any remain.

    Returns one update per row; only the newly revealed row gets a
    ``visible=True`` update. ``speech_type_count`` is never decremented.
    """
    global speech_type_count
    row_updates = [gr.update() for _ in range(max_speech_types)]
    if speech_type_count >= max_speech_types:
        gr.Warning("Exhausted maximum number of speech types. Consider restart the app.")
        return row_updates
    row_updates[speech_type_count] = gr.update(visible=True)
    speech_type_count += 1
    return row_updates
379
+
380
+ add_speech_type_btn.click(add_speech_type_fn, outputs=speech_type_rows)
381
+
382
+ # Function to delete a speech type
383
def delete_speech_type_fn():
    """Hide a speech-type row and clear its name, audio and reference text."""
    hidden_row = gr.update(visible=False)
    return hidden_row, None, None, None
385
+
386
+ # Update delete button clicks
387
+ for i in range(1, len(speech_type_delete_btns)):
388
+ speech_type_delete_btns[i].click(
389
+ delete_speech_type_fn,
390
+ outputs=[speech_type_rows[i], speech_type_names[i], speech_type_audios[i], speech_type_ref_texts[i]],
391
+ )
392
+
393
+ # Text input for the prompt
394
+ gen_text_input_multistyle = gr.Textbox(
395
+ label="Text to Generate",
396
+ lines=10,
397
+ placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
398
+ )
399
+
400
def make_insert_speech_type_fn(index):
    """Return a callback that appends a ``{name} `` label to the script text.

    ``index`` is accepted but not used by the returned callback.
    """

    def insert_speech_type_fn(current_text, speech_type_name):
        # Falsy text counts as empty; a falsy name becomes the label "None".
        existing = current_text if current_text else ""
        label = speech_type_name if speech_type_name else "None"
        marker = f"{{{label}}} "
        return existing + marker

    return insert_speech_type_fn
408
+
409
+ for i, insert_btn in enumerate(speech_type_insert_btns):
410
+ insert_fn = make_insert_speech_type_fn(i)
411
+ insert_btn.click(
412
+ insert_fn,
413
+ inputs=[gen_text_input_multistyle, speech_type_names[i]],
414
+ outputs=gen_text_input_multistyle,
415
+ )
416
+
417
+ with gr.Accordion("Advanced Settings", open=False):
418
+ remove_silence_multistyle = gr.Checkbox(
419
+ label="Remove Silences",
420
+ value=True,
421
+ )
422
+
423
+ # Generate button
424
+ generate_multistyle_btn = gr.Button("Generate Multi-Style Speech", variant="primary")
425
+
426
+ # Output audio
427
+ audio_output_multistyle = gr.Audio(label="Synthesized Audio")
428
+
429
@gpu_decorator
def generate_multistyle_speech(
    gen_text,
    *args,
):
    """Synthesize a multi-style script segment by segment and join the audio.

    ``args`` holds, in order, ``max_speech_types`` names, the same number of
    reference audios, the same number of reference texts, and finally the
    remove-silence flag.

    Returns:
        A list whose first item is ``(sample_rate, audio)`` or ``None``,
        followed by one reference text per collected speech type.
    """
    names = args[:max_speech_types]
    audios = args[max_speech_types : 2 * max_speech_types]
    ref_texts = args[2 * max_speech_types : 3 * max_speech_types]
    remove_silence = args[3 * max_speech_types]

    # Rows lacking a name or audio get a positional placeholder key so every
    # row still contributes one entry.
    speech_types = OrderedDict()
    for row_idx, (name, audio, ref) in enumerate(zip(names, audios, ref_texts)):
        if name and audio:
            speech_types[name] = {"audio": audio, "ref_text": ref}
        else:
            speech_types[f"@{row_idx}@"] = {"audio": "", "ref_text": ""}

    def _current_ref_texts():
        return [speech_types[key]["ref_text"] for key in speech_types]

    audio_pieces = []
    for segment in parse_speechtypes_text(gen_text):
        style, text = segment["style"], segment["text"]

        if style in speech_types:
            current_style = style
        else:
            gr.Warning(f"Type {style} is not available, will use Regular as default.")
            current_style = "Regular"

        try:
            ref_audio = speech_types[current_style]["audio"]
        except KeyError:
            gr.Warning(f"Please provide reference audio for type {current_style}.")
            return [None] + _current_ref_texts()
        ref_text = speech_types[current_style].get("ref_text", "")

        # show_info=print no pull to top when generating
        audio_out, _, ref_text_out = infer(
            ref_audio, ref_text, text, tts_model_choice, remove_silence, 0, show_info=print
        )
        sr, audio_data = audio_out

        audio_pieces.append(audio_data)
        # Keep the reference text returned by infer for this style.
        speech_types[current_style]["ref_text"] = ref_text_out

    if not audio_pieces:
        gr.Warning("No audio generated.")
        return [None] + _current_ref_texts()

    final_audio_data = np.concatenate(audio_pieces)
    return [(sr, final_audio_data)] + _current_ref_texts()
491
+
492
+ generate_multistyle_btn.click(
493
+ generate_multistyle_speech,
494
+ inputs=[
495
+ gen_text_input_multistyle,
496
+ ]
497
+ + speech_type_names
498
+ + speech_type_audios
499
+ + speech_type_ref_texts
500
+ + [
501
+ remove_silence_multistyle,
502
+ ],
503
+ outputs=[audio_output_multistyle] + speech_type_ref_texts,
504
+ )
505
+
506
+ # Validation function to disable Generate button if speech types are missing
507
def validate_speech_types(gen_text, regular_name, *args):
    """Enable the generate button only when every style in the script is named.

    Args:
        gen_text: Script text to scan for ``{style}`` markers.
        regular_name: Name of the mandatory speech type.
        *args: Names of the remaining speech types; empty names are ignored.

    Returns:
        A ``gr.update`` whose ``interactive`` flag is true when no style used
        in the script is missing from the available names.
    """
    available = {name for name in (regular_name, *args) if name}
    requested = {segment["style"] for segment in parse_speechtypes_text(gen_text)}
    return gr.update(interactive=requested <= available)
531
+
532
+ gen_text_input_multistyle.change(
533
+ validate_speech_types,
534
+ inputs=[gen_text_input_multistyle, regular_name] + speech_type_names,
535
+ outputs=generate_multistyle_btn,
536
+ )
537
+
538
+
539
+ with gr.Blocks() as app_chat:
540
+ gr.Markdown(
541
+ """
542
+ # Voice Chat
543
+ Have a conversation with an AI using your reference voice!
544
+ 1. Upload a reference audio clip and optionally its transcript.
545
+ 2. Load the chat model.
546
+ 3. Record your message through your microphone.
547
+ 4. The AI will respond using the reference voice.
548
+ """
549
+ )
550
+
551
+ if not USING_SPACES:
552
+ load_chat_model_btn = gr.Button("Load Chat Model", variant="primary")
553
+
554
+ chat_interface_container = gr.Column(visible=False)
555
+
556
@gpu_decorator
def load_chat_model():
    """Load the chat model and tokenizer once, then swap button for chat UI.

    Returns:
        Two updates: the first with ``visible=False``, the second with
        ``visible=True``.
    """
    global chat_model_state, chat_tokenizer_state
    needs_loading = chat_model_state is None
    if needs_loading:
        gr.Info("Loading chat model...")
        checkpoint = "Qwen/Qwen2.5-3B-Instruct"
        chat_model_state = AutoModelForCausalLM.from_pretrained(
            checkpoint, torch_dtype="auto", device_map="auto"
        )
        chat_tokenizer_state = AutoTokenizer.from_pretrained(checkpoint)
        gr.Info("Chat model loaded.")

    hide_button = gr.update(visible=False)
    show_chat = gr.update(visible=True)
    return hide_button, show_chat
570
+
571
+ load_chat_model_btn.click(load_chat_model, outputs=[load_chat_model_btn, chat_interface_container])
572
+
573
+ else:
574
+ chat_interface_container = gr.Column()
575
+
576
+ if chat_model_state is None:
577
+ model_name = "Qwen/Qwen2.5-3B-Instruct"
578
+ chat_model_state = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
579
+ chat_tokenizer_state = AutoTokenizer.from_pretrained(model_name)
580
+
581
+ with chat_interface_container:
582
+ with gr.Row():
583
+ with gr.Column():
584
+ ref_audio_chat = gr.Audio(label="Reference Audio", type="filepath")
585
+ with gr.Column():
586
+ with gr.Accordion("Advanced Settings", open=False):
587
+ remove_silence_chat = gr.Checkbox(
588
+ label="Remove Silences",
589
+ value=True,
590
+ )
591
+ ref_text_chat = gr.Textbox(
592
+ label="Reference Text",
593
+ info="Optional: Leave blank to auto-transcribe",
594
+ lines=2,
595
+ )
596
+ system_prompt_chat = gr.Textbox(
597
+ label="System Prompt",
598
+ value="You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
599
+ lines=2,
600
+ )
601
+
602
+ chatbot_interface = gr.Chatbot(label="Conversation")
603
+
604
+ with gr.Row():
605
+ with gr.Column():
606
+ audio_input_chat = gr.Microphone(
607
+ label="Speak your message",
608
+ type="filepath",
609
+ )
610
+ audio_output_chat = gr.Audio(autoplay=True)
611
+ with gr.Column():
612
+ text_input_chat = gr.Textbox(
613
+ label="Type your message",
614
+ lines=1,
615
+ )
616
+ send_btn_chat = gr.Button("Send Message")
617
+ clear_btn_chat = gr.Button("Clear Conversation")
618
+
619
+ conversation_state = gr.State(
620
+ value=[
621
+ {
622
+ "role": "system",
623
+ "content": "You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
624
+ }
625
+ ]
626
+ )
627
+
628
+ # Modify process_audio_input to use model and tokenizer from state
629
@gpu_decorator
def process_audio_input(audio_path, text, history, conv_state):
    """Add the user's spoken or typed message and the model's reply.

    When ``audio_path`` is given, the message text is taken from the second
    value returned by ``preprocess_ref_audio_text``. ``history`` and
    ``conv_state`` are extended in place.

    Returns:
        ``(history, conv_state, "")``.
    """
    if not (audio_path or text.strip()):
        return history, conv_state, ""

    if audio_path:
        _, text = preprocess_ref_audio_text(audio_path, text)

    if not text.strip():
        return history, conv_state, ""

    user_turn = {"role": "user", "content": text}
    conv_state.append(user_turn)
    history.append((text, None))

    reply = generate_response(conv_state, chat_model_state, chat_tokenizer_state)

    assistant_turn = {"role": "assistant", "content": reply}
    conv_state.append(assistant_turn)
    # Replace the pending entry with the completed (message, reply) pair.
    history[-1] = (text, reply)

    return history, conv_state, ""
651
+
652
+ @gpu_decorator
653
def generate_audio_response(history, ref_audio, ref_text, remove_silence):
    """Generate TTS audio for the most recent assistant reply.

    Every call site in this file wires this handler to two outputs (the
    audio player and the reference-text box), so every exit path returns a
    pair. The early exits previously returned a bare ``None``, which did not
    match that arity.

    Args:
        history: Chatbot history, a list of ``(user, assistant)`` pairs.
        ref_audio: Reference audio used for voice cloning; falsy if unset.
        ref_text: Current contents of the reference-text box.
        remove_silence: Forwarded to ``infer``.

    Returns:
        ``(audio, ref_text)``. ``audio`` is ``None`` when there is nothing
        to speak, in which case ``ref_text`` is passed back unchanged.
    """
    if not history or not ref_audio:
        return None, ref_text

    _, last_ai_response = history[-1]
    if not last_ai_response:
        return None, ref_text

    audio_result, _, ref_text_out = infer(
        ref_audio,
        ref_text,
        last_ai_response,
        tts_model_choice,
        remove_silence,
        cross_fade_duration=0.15,
        speed=1.0,
        show_info=print,  # show_info=print no pull to top when generating
    )
    return audio_result, ref_text_out
673
+
674
def clear_conversation():
    """Return an empty chat history and a conversation holding only the default system prompt."""
    system_message = {
        "role": "system",
        "content": "You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
    }
    empty_history = []
    return empty_history, [system_message]
682
+
683
def update_system_prompt(new_prompt):
    """Start a fresh conversation seeded with ``new_prompt`` as the system message.

    Returns:
        ``(history, conv_state)``: an empty chat history and a one-element
        message list containing the new system prompt.
    """
    system_message = dict(role="system", content=new_prompt)
    return [], [system_message]
687
+
688
+ # Handle audio input
689
+ audio_input_chat.stop_recording(
690
+ process_audio_input,
691
+ inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
692
+ outputs=[chatbot_interface, conversation_state],
693
+ ).then(
694
+ generate_audio_response,
695
+ inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
696
+ outputs=[audio_output_chat, ref_text_chat],
697
+ ).then(
698
+ lambda: None,
699
+ None,
700
+ audio_input_chat,
701
+ )
702
+
703
+ # Handle text input
704
+ text_input_chat.submit(
705
+ process_audio_input,
706
+ inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
707
+ outputs=[chatbot_interface, conversation_state],
708
+ ).then(
709
+ generate_audio_response,
710
+ inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
711
+ outputs=[audio_output_chat, ref_text_chat],
712
+ ).then(
713
+ lambda: None,
714
+ None,
715
+ text_input_chat,
716
+ )
717
+
718
+ # Handle send button
719
+ send_btn_chat.click(
720
+ process_audio_input,
721
+ inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
722
+ outputs=[chatbot_interface, conversation_state],
723
+ ).then(
724
+ generate_audio_response,
725
+ inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
726
+ outputs=[audio_output_chat, ref_text_chat],
727
+ ).then(
728
+ lambda: None,
729
+ None,
730
+ text_input_chat,
731
+ )
732
+
733
+ # Handle clear button
734
+ clear_btn_chat.click(
735
+ clear_conversation,
736
+ outputs=[chatbot_interface, conversation_state],
737
+ )
738
+
739
+ # Handle system prompt change and reset conversation
740
+ system_prompt_chat.change(
741
+ update_system_prompt,
742
+ inputs=system_prompt_chat,
743
+ outputs=[chatbot_interface, conversation_state],
744
+ )
745
+
746
+
747
+ with gr.Blocks() as app:
748
+ gr.Markdown(
749
+ """
750
+ # E2/F5 TTS
751
+
752
+ This is a local web UI for F5 TTS with advanced batch processing support. This app supports the following TTS models:
753
+
754
+ * [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching)
755
+ * [E2 TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS)
756
+
757
+ The checkpoints currently support English and Chinese.
758
+
759
+ If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s with ✂ in the bottom right corner (otherwise might have non-optimal auto-trimmed result).
760
+
761
+ **NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
762
+ """
763
+ )
764
+
765
+ last_used_custom = files("f5_tts").joinpath("infer/.cache/last_used_custom_model_info.txt")
766
+
767
def load_last_used_custom():
    """Load the last saved custom-model settings, or fall back to the defaults.

    The cache file holds one value per line: checkpoint path, vocab path and
    model config. Callers index entries 0-2 and unpack exactly three values,
    so a cache with fewer than three lines (empty or truncated) is treated
    like a missing one instead of being returned short.

    Returns:
        The first three stripped lines of the cache file, or
        ``DEFAULT_TTS_MODEL_CFG`` when the cache is missing or incomplete.
    """
    try:
        with open(last_used_custom, "r", encoding="utf-8") as f:
            custom = [line.strip() for line in f]
    except FileNotFoundError:
        # Create the cache directory now so a later save can write the file.
        last_used_custom.parent.mkdir(parents=True, exist_ok=True)
        return DEFAULT_TTS_MODEL_CFG

    if len(custom) < 3:
        return DEFAULT_TTS_MODEL_CFG
    return custom[:3]
777
+
778
def switch_tts_model(new_choice):
    """Select the active TTS model and show or hide the custom-model fields.

    Args:
        new_choice: Value of the model radio button.

    Returns:
        Three ``gr.update`` objects, one each for the checkpoint, vocab and
        config dropdowns. They are hidden for built-in models; for "Custom"
        they are shown and filled with the last saved values.
    """
    global tts_model_choice

    if new_choice != "Custom":
        tts_model_choice = new_choice
        return tuple(gr.update(visible=False) for _ in range(3))

    # override in case webpage is refreshed
    ckpt_path, vocab_path, model_cfg = load_last_used_custom()
    tts_model_choice = ["Custom", ckpt_path, vocab_path, json.loads(model_cfg)]
    return tuple(
        gr.update(visible=True, value=saved)
        for saved in (ckpt_path, vocab_path, model_cfg)
    )
791
+
792
def set_custom_model(custom_ckpt_path, custom_vocab_path, custom_model_cfg):
    """Activate a custom TTS model and save its settings for the next session.

    Args:
        custom_ckpt_path: Checkpoint location string.
        custom_vocab_path: Vocab file location string.
        custom_model_cfg: Model config as a JSON string. It is parsed for the
            active model choice but written to the cache file verbatim.
    """
    global tts_model_choice

    parsed_cfg = json.loads(custom_model_cfg)
    tts_model_choice = ["Custom", custom_ckpt_path, custom_vocab_path, parsed_cfg]

    # One value per line, each newline-terminated.
    with open(last_used_custom, "w", encoding="utf-8") as f:
        f.write("\n".join((custom_ckpt_path, custom_vocab_path, custom_model_cfg)) + "\n")
797
+
798
+ with gr.Row():
799
+ if not USING_SPACES:
800
+ choose_tts_model = gr.Radio(
801
+ choices=[DEFAULT_TTS_MODEL, "E2-TTS", "Custom"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
802
+ )
803
+ else:
804
+ choose_tts_model = gr.Radio(
805
+ choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
806
+ )
807
+ custom_ckpt_path = gr.Dropdown(
808
+ choices=[DEFAULT_TTS_MODEL_CFG[0]],
809
+ value=load_last_used_custom()[0],
810
+ allow_custom_value=True,
811
+ label="Model: local_path | hf://user_id/repo_id/model_ckpt",
812
+ visible=False,
813
+ )
814
+ custom_vocab_path = gr.Dropdown(
815
+ choices=[DEFAULT_TTS_MODEL_CFG[1]],
816
+ value=load_last_used_custom()[1],
817
+ allow_custom_value=True,
818
+ label="Vocab: local_path | hf://user_id/repo_id/vocab_file",
819
+ visible=False,
820
+ )
821
+ custom_model_cfg = gr.Dropdown(
822
+ choices=[
823
+ DEFAULT_TTS_MODEL_CFG[2],
824
+ json.dumps(dict(dim=768, depth=18, heads=12, ff_mult=2, text_dim=512, conv_layers=4)),
825
+ ],
826
+ value=load_last_used_custom()[2],
827
+ allow_custom_value=True,
828
+ label="Config: in a dictionary form",
829
+ visible=False,
830
+ )
831
+
832
+ choose_tts_model.change(
833
+ switch_tts_model,
834
+ inputs=[choose_tts_model],
835
+ outputs=[custom_ckpt_path, custom_vocab_path, custom_model_cfg],
836
+ show_progress="hidden",
837
+ )
838
+ custom_ckpt_path.change(
839
+ set_custom_model,
840
+ inputs=[custom_ckpt_path, custom_vocab_path, custom_model_cfg],
841
+ show_progress="hidden",
842
+ )
843
+ custom_vocab_path.change(
844
+ set_custom_model,
845
+ inputs=[custom_ckpt_path, custom_vocab_path, custom_model_cfg],
846
+ show_progress="hidden",
847
+ )
848
+ custom_model_cfg.change(
849
+ set_custom_model,
850
+ inputs=[custom_ckpt_path, custom_vocab_path, custom_model_cfg],
851
+ show_progress="hidden",
852
+ )
853
+
854
+ gr.TabbedInterface(
855
+ [app_tts, app_multistyle, app_chat, app_credits],
856
+ ["Basic-TTS", "Multi-Speech", "Voice-Chat", "Credits"],
857
+ )
858
+
859
+
860
@click.command()
@click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
@click.option("--host", "-H", default=None, help="Host to run the app on")
@click.option(
    "--share",
    "-s",
    default=False,
    is_flag=True,
    help="Share the app via Gradio share link",
)
# An explicit on/off pair: the old `is_flag=True, default=True` declaration
# could not express "disable" consistently across click versions and
# contradicted its own help text.
@click.option("--api/--no-api", "-a", default=True, help="Allow API access")
@click.option(
    "--root_path",
    "-r",
    default=None,
    type=str,
    help='The root path (or "mount point") of the application, if it\'s not served from the root ("/") of the domain. Often used when the application is behind a reverse proxy that forwards requests to the application, e.g. set "/myapp" or full URL for application served at "https://example.com/myapp".',
)
def main(port, host, share, api, root_path):
    """Launch the Gradio web UI."""
    # `app` is only read here, so no `global` declaration is needed.
    print("Starting app...")
    app.queue(api_open=api).launch(
        server_name=host,
        server_port=port,
        share=share,
        show_api=api,
        root_path=root_path,
    )
882
+
883
+
884
if __name__ == "__main__":
    # On Spaces, launch with default settings and skip CLI parsing;
    # everywhere else go through the click command.
    if USING_SPACES:
        app.queue().launch()
    else:
        main()
f5_tts/infer/examples/vocab.txt ADDED
@@ -0,0 +1,2545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ !
3
+ "
4
+ #
5
+ $
6
+ %
7
+ &
8
+ '
9
+ (
10
+ )
11
+ *
12
+ +
13
+ ,
14
+ -
15
+ .
16
+ /
17
+ 0
18
+ 1
19
+ 2
20
+ 3
21
+ 4
22
+ 5
23
+ 6
24
+ 7
25
+ 8
26
+ 9
27
+ :
28
+ ;
29
+ =
30
+ >
31
+ ?
32
+ @
33
+ A
34
+ B
35
+ C
36
+ D
37
+ E
38
+ F
39
+ G
40
+ H
41
+ I
42
+ J
43
+ K
44
+ L
45
+ M
46
+ N
47
+ O
48
+ P
49
+ Q
50
+ R
51
+ S
52
+ T
53
+ U
54
+ V
55
+ W
56
+ X
57
+ Y
58
+ Z
59
+ [
60
+ \
61
+ ]
62
+ _
63
+ a
64
+ a1
65
+ ai1
66
+ ai2
67
+ ai3
68
+ ai4
69
+ an1
70
+ an3
71
+ an4
72
+ ang1
73
+ ang2
74
+ ang4
75
+ ao1
76
+ ao2
77
+ ao3
78
+ ao4
79
+ b
80
+ ba
81
+ ba1
82
+ ba2
83
+ ba3
84
+ ba4
85
+ bai1
86
+ bai2
87
+ bai3
88
+ bai4
89
+ ban1
90
+ ban2
91
+ ban3
92
+ ban4
93
+ bang1
94
+ bang2
95
+ bang3
96
+ bang4
97
+ bao1
98
+ bao2
99
+ bao3
100
+ bao4
101
+ bei
102
+ bei1
103
+ bei2
104
+ bei3
105
+ bei4
106
+ ben1
107
+ ben2
108
+ ben3
109
+ ben4
110
+ beng
111
+ beng1
112
+ beng2
113
+ beng3
114
+ beng4
115
+ bi1
116
+ bi2
117
+ bi3
118
+ bi4
119
+ bian1
120
+ bian2
121
+ bian3
122
+ bian4
123
+ biao1
124
+ biao2
125
+ biao3
126
+ bie1
127
+ bie2
128
+ bie3
129
+ bie4
130
+ bin1
131
+ bin4
132
+ bing1
133
+ bing2
134
+ bing3
135
+ bing4
136
+ bo
137
+ bo1
138
+ bo2
139
+ bo3
140
+ bo4
141
+ bu2
142
+ bu3
143
+ bu4
144
+ c
145
+ ca1
146
+ cai1
147
+ cai2
148
+ cai3
149
+ cai4
150
+ can1
151
+ can2
152
+ can3
153
+ can4
154
+ cang1
155
+ cang2
156
+ cao1
157
+ cao2
158
+ cao3
159
+ ce4
160
+ cen1
161
+ cen2
162
+ ceng1
163
+ ceng2
164
+ ceng4
165
+ cha1
166
+ cha2
167
+ cha3
168
+ cha4
169
+ chai1
170
+ chai2
171
+ chan1
172
+ chan2
173
+ chan3
174
+ chan4
175
+ chang1
176
+ chang2
177
+ chang3
178
+ chang4
179
+ chao1
180
+ chao2
181
+ chao3
182
+ che1
183
+ che2
184
+ che3
185
+ che4
186
+ chen1
187
+ chen2
188
+ chen3
189
+ chen4
190
+ cheng1
191
+ cheng2
192
+ cheng3
193
+ cheng4
194
+ chi1
195
+ chi2
196
+ chi3
197
+ chi4
198
+ chong1
199
+ chong2
200
+ chong3
201
+ chong4
202
+ chou1
203
+ chou2
204
+ chou3
205
+ chou4
206
+ chu1
207
+ chu2
208
+ chu3
209
+ chu4
210
+ chua1
211
+ chuai1
212
+ chuai2
213
+ chuai3
214
+ chuai4
215
+ chuan1
216
+ chuan2
217
+ chuan3
218
+ chuan4
219
+ chuang1
220
+ chuang2
221
+ chuang3
222
+ chuang4
223
+ chui1
224
+ chui2
225
+ chun1
226
+ chun2
227
+ chun3
228
+ chuo1
229
+ chuo4
230
+ ci1
231
+ ci2
232
+ ci3
233
+ ci4
234
+ cong1
235
+ cong2
236
+ cou4
237
+ cu1
238
+ cu4
239
+ cuan1
240
+ cuan2
241
+ cuan4
242
+ cui1
243
+ cui3
244
+ cui4
245
+ cun1
246
+ cun2
247
+ cun4
248
+ cuo1
249
+ cuo2
250
+ cuo4
251
+ d
252
+ da
253
+ da1
254
+ da2
255
+ da3
256
+ da4
257
+ dai1
258
+ dai2
259
+ dai3
260
+ dai4
261
+ dan1
262
+ dan2
263
+ dan3
264
+ dan4
265
+ dang1
266
+ dang2
267
+ dang3
268
+ dang4
269
+ dao1
270
+ dao2
271
+ dao3
272
+ dao4
273
+ de
274
+ de1
275
+ de2
276
+ dei3
277
+ den4
278
+ deng1
279
+ deng2
280
+ deng3
281
+ deng4
282
+ di1
283
+ di2
284
+ di3
285
+ di4
286
+ dia3
287
+ dian1
288
+ dian2
289
+ dian3
290
+ dian4
291
+ diao1
292
+ diao3
293
+ diao4
294
+ die1
295
+ die2
296
+ die4
297
+ ding1
298
+ ding2
299
+ ding3
300
+ ding4
301
+ diu1
302
+ dong1
303
+ dong3
304
+ dong4
305
+ dou1
306
+ dou2
307
+ dou3
308
+ dou4
309
+ du1
310
+ du2
311
+ du3
312
+ du4
313
+ duan1
314
+ duan2
315
+ duan3
316
+ duan4
317
+ dui1
318
+ dui4
319
+ dun1
320
+ dun3
321
+ dun4
322
+ duo1
323
+ duo2
324
+ duo3
325
+ duo4
326
+ e
327
+ e1
328
+ e2
329
+ e3
330
+ e4
331
+ ei2
332
+ en1
333
+ en4
334
+ er
335
+ er2
336
+ er3
337
+ er4
338
+ f
339
+ fa1
340
+ fa2
341
+ fa3
342
+ fa4
343
+ fan1
344
+ fan2
345
+ fan3
346
+ fan4
347
+ fang1
348
+ fang2
349
+ fang3
350
+ fang4
351
+ fei1
352
+ fei2
353
+ fei3
354
+ fei4
355
+ fen1
356
+ fen2
357
+ fen3
358
+ fen4
359
+ feng1
360
+ feng2
361
+ feng3
362
+ feng4
363
+ fo2
364
+ fou2
365
+ fou3
366
+ fu1
367
+ fu2
368
+ fu3
369
+ fu4
370
+ g
371
+ ga1
372
+ ga2
373
+ ga3
374
+ ga4
375
+ gai1
376
+ gai2
377
+ gai3
378
+ gai4
379
+ gan1
380
+ gan2
381
+ gan3
382
+ gan4
383
+ gang1
384
+ gang2
385
+ gang3
386
+ gang4
387
+ gao1
388
+ gao2
389
+ gao3
390
+ gao4
391
+ ge1
392
+ ge2
393
+ ge3
394
+ ge4
395
+ gei2
396
+ gei3
397
+ gen1
398
+ gen2
399
+ gen3
400
+ gen4
401
+ geng1
402
+ geng3
403
+ geng4
404
+ gong1
405
+ gong3
406
+ gong4
407
+ gou1
408
+ gou2
409
+ gou3
410
+ gou4
411
+ gu
412
+ gu1
413
+ gu2
414
+ gu3
415
+ gu4
416
+ gua1
417
+ gua2
418
+ gua3
419
+ gua4
420
+ guai1
421
+ guai2
422
+ guai3
423
+ guai4
424
+ guan1
425
+ guan2
426
+ guan3
427
+ guan4
428
+ guang1
429
+ guang2
430
+ guang3
431
+ guang4
432
+ gui1
433
+ gui2
434
+ gui3
435
+ gui4
436
+ gun3
437
+ gun4
438
+ guo1
439
+ guo2
440
+ guo3
441
+ guo4
442
+ h
443
+ ha1
444
+ ha2
445
+ ha3
446
+ hai1
447
+ hai2
448
+ hai3
449
+ hai4
450
+ han1
451
+ han2
452
+ han3
453
+ han4
454
+ hang1
455
+ hang2
456
+ hang4
457
+ hao1
458
+ hao2
459
+ hao3
460
+ hao4
461
+ he1
462
+ he2
463
+ he4
464
+ hei1
465
+ hen2
466
+ hen3
467
+ hen4
468
+ heng1
469
+ heng2
470
+ heng4
471
+ hong1
472
+ hong2
473
+ hong3
474
+ hong4
475
+ hou1
476
+ hou2
477
+ hou3
478
+ hou4
479
+ hu1
480
+ hu2
481
+ hu3
482
+ hu4
483
+ hua1
484
+ hua2
485
+ hua4
486
+ huai2
487
+ huai4
488
+ huan1
489
+ huan2
490
+ huan3
491
+ huan4
492
+ huang1
493
+ huang2
494
+ huang3
495
+ huang4
496
+ hui1
497
+ hui2
498
+ hui3
499
+ hui4
500
+ hun1
501
+ hun2
502
+ hun4
503
+ huo
504
+ huo1
505
+ huo2
506
+ huo3
507
+ huo4
508
+ i
509
+ j
510
+ ji1
511
+ ji2
512
+ ji3
513
+ ji4
514
+ jia
515
+ jia1
516
+ jia2
517
+ jia3
518
+ jia4
519
+ jian1
520
+ jian2
521
+ jian3
522
+ jian4
523
+ jiang1
524
+ jiang2
525
+ jiang3
526
+ jiang4
527
+ jiao1
528
+ jiao2
529
+ jiao3
530
+ jiao4
531
+ jie1
532
+ jie2
533
+ jie3
534
+ jie4
535
+ jin1
536
+ jin2
537
+ jin3
538
+ jin4
539
+ jing1
540
+ jing2
541
+ jing3
542
+ jing4
543
+ jiong3
544
+ jiu1
545
+ jiu2
546
+ jiu3
547
+ jiu4
548
+ ju1
549
+ ju2
550
+ ju3
551
+ ju4
552
+ juan1
553
+ juan2
554
+ juan3
555
+ juan4
556
+ jue1
557
+ jue2
558
+ jue4
559
+ jun1
560
+ jun4
561
+ k
562
+ ka1
563
+ ka2
564
+ ka3
565
+ kai1
566
+ kai2
567
+ kai3
568
+ kai4
569
+ kan1
570
+ kan2
571
+ kan3
572
+ kan4
573
+ kang1
574
+ kang2
575
+ kang4
576
+ kao1
577
+ kao2
578
+ kao3
579
+ kao4
580
+ ke1
581
+ ke2
582
+ ke3
583
+ ke4
584
+ ken3
585
+ keng1
586
+ kong1
587
+ kong3
588
+ kong4
589
+ kou1
590
+ kou2
591
+ kou3
592
+ kou4
593
+ ku1
594
+ ku2
595
+ ku3
596
+ ku4
597
+ kua1
598
+ kua3
599
+ kua4
600
+ kuai3
601
+ kuai4
602
+ kuan1
603
+ kuan2
604
+ kuan3
605
+ kuang1
606
+ kuang2
607
+ kuang4
608
+ kui1
609
+ kui2
610
+ kui3
611
+ kui4
612
+ kun1
613
+ kun3
614
+ kun4
615
+ kuo4
616
+ l
617
+ la
618
+ la1
619
+ la2
620
+ la3
621
+ la4
622
+ lai2
623
+ lai4
624
+ lan2
625
+ lan3
626
+ lan4
627
+ lang1
628
+ lang2
629
+ lang3
630
+ lang4
631
+ lao1
632
+ lao2
633
+ lao3
634
+ lao4
635
+ le
636
+ le1
637
+ le4
638
+ lei
639
+ lei1
640
+ lei2
641
+ lei3
642
+ lei4
643
+ leng1
644
+ leng2
645
+ leng3
646
+ leng4
647
+ li
648
+ li1
649
+ li2
650
+ li3
651
+ li4
652
+ lia3
653
+ lian2
654
+ lian3
655
+ lian4
656
+ liang2
657
+ liang3
658
+ liang4
659
+ liao1
660
+ liao2
661
+ liao3
662
+ liao4
663
+ lie1
664
+ lie2
665
+ lie3
666
+ lie4
667
+ lin1
668
+ lin2
669
+ lin3
670
+ lin4
671
+ ling2
672
+ ling3
673
+ ling4
674
+ liu1
675
+ liu2
676
+ liu3
677
+ liu4
678
+ long1
679
+ long2
680
+ long3
681
+ long4
682
+ lou1
683
+ lou2
684
+ lou3
685
+ lou4
686
+ lu1
687
+ lu2
688
+ lu3
689
+ lu4
690
+ luan2
691
+ luan3
692
+ luan4
693
+ lun1
694
+ lun2
695
+ lun4
696
+ luo1
697
+ luo2
698
+ luo3
699
+ luo4
700
+ lv2
701
+ lv3
702
+ lv4
703
+ lve3
704
+ lve4
705
+ m
706
+ ma
707
+ ma1
708
+ ma2
709
+ ma3
710
+ ma4
711
+ mai2
712
+ mai3
713
+ mai4
714
+ man1
715
+ man2
716
+ man3
717
+ man4
718
+ mang2
719
+ mang3
720
+ mao1
721
+ mao2
722
+ mao3
723
+ mao4
724
+ me
725
+ mei2
726
+ mei3
727
+ mei4
728
+ men
729
+ men1
730
+ men2
731
+ men4
732
+ meng
733
+ meng1
734
+ meng2
735
+ meng3
736
+ meng4
737
+ mi1
738
+ mi2
739
+ mi3
740
+ mi4
741
+ mian2
742
+ mian3
743
+ mian4
744
+ miao1
745
+ miao2
746
+ miao3
747
+ miao4
748
+ mie1
749
+ mie4
750
+ min2
751
+ min3
752
+ ming2
753
+ ming3
754
+ ming4
755
+ miu4
756
+ mo1
757
+ mo2
758
+ mo3
759
+ mo4
760
+ mou1
761
+ mou2
762
+ mou3
763
+ mu2
764
+ mu3
765
+ mu4
766
+ n
767
+ n2
768
+ na1
769
+ na2
770
+ na3
771
+ na4
772
+ nai2
773
+ nai3
774
+ nai4
775
+ nan1
776
+ nan2
777
+ nan3
778
+ nan4
779
+ nang1
780
+ nang2
781
+ nang3
782
+ nao1
783
+ nao2
784
+ nao3
785
+ nao4
786
+ ne
787
+ ne2
788
+ ne4
789
+ nei3
790
+ nei4
791
+ nen4
792
+ neng2
793
+ ni1
794
+ ni2
795
+ ni3
796
+ ni4
797
+ nian1
798
+ nian2
799
+ nian3
800
+ nian4
801
+ niang2
802
+ niang4
803
+ niao2
804
+ niao3
805
+ niao4
806
+ nie1
807
+ nie4
808
+ nin2
809
+ ning2
810
+ ning3
811
+ ning4
812
+ niu1
813
+ niu2
814
+ niu3
815
+ niu4
816
+ nong2
817
+ nong4
818
+ nou4
819
+ nu2
820
+ nu3
821
+ nu4
822
+ nuan3
823
+ nuo2
824
+ nuo4
825
+ nv2
826
+ nv3
827
+ nve4
828
+ o
829
+ o1
830
+ o2
831
+ ou1
832
+ ou2
833
+ ou3
834
+ ou4
835
+ p
836
+ pa1
837
+ pa2
838
+ pa4
839
+ pai1
840
+ pai2
841
+ pai3
842
+ pai4
843
+ pan1
844
+ pan2
845
+ pan4
846
+ pang1
847
+ pang2
848
+ pang4
849
+ pao1
850
+ pao2
851
+ pao3
852
+ pao4
853
+ pei1
854
+ pei2
855
+ pei4
856
+ pen1
857
+ pen2
858
+ pen4
859
+ peng1
860
+ peng2
861
+ peng3
862
+ peng4
863
+ pi1
864
+ pi2
865
+ pi3
866
+ pi4
867
+ pian1
868
+ pian2
869
+ pian4
870
+ piao1
871
+ piao2
872
+ piao3
873
+ piao4
874
+ pie1
875
+ pie2
876
+ pie3
877
+ pin1
878
+ pin2
879
+ pin3
880
+ pin4
881
+ ping1
882
+ ping2
883
+ po1
884
+ po2
885
+ po3
886
+ po4
887
+ pou1
888
+ pu1
889
+ pu2
890
+ pu3
891
+ pu4
892
+ q
893
+ qi1
894
+ qi2
895
+ qi3
896
+ qi4
897
+ qia1
898
+ qia3
899
+ qia4
900
+ qian1
901
+ qian2
902
+ qian3
903
+ qian4
904
+ qiang1
905
+ qiang2
906
+ qiang3
907
+ qiang4
908
+ qiao1
909
+ qiao2
910
+ qiao3
911
+ qiao4
912
+ qie1
913
+ qie2
914
+ qie3
915
+ qie4
916
+ qin1
917
+ qin2
918
+ qin3
919
+ qin4
920
+ qing1
921
+ qing2
922
+ qing3
923
+ qing4
924
+ qiong1
925
+ qiong2
926
+ qiu1
927
+ qiu2
928
+ qiu3
929
+ qu1
930
+ qu2
931
+ qu3
932
+ qu4
933
+ quan1
934
+ quan2
935
+ quan3
936
+ quan4
937
+ que1
938
+ que2
939
+ que4
940
+ qun2
941
+ r
942
+ ran2
943
+ ran3
944
+ rang1
945
+ rang2
946
+ rang3
947
+ rang4
948
+ rao2
949
+ rao3
950
+ rao4
951
+ re2
952
+ re3
953
+ re4
954
+ ren2
955
+ ren3
956
+ ren4
957
+ reng1
958
+ reng2
959
+ ri4
960
+ rong1
961
+ rong2
962
+ rong3
963
+ rou2
964
+ rou4
965
+ ru2
966
+ ru3
967
+ ru4
968
+ ruan2
969
+ ruan3
970
+ rui3
971
+ rui4
972
+ run4
973
+ ruo4
974
+ s
975
+ sa1
976
+ sa2
977
+ sa3
978
+ sa4
979
+ sai1
980
+ sai4
981
+ san1
982
+ san2
983
+ san3
984
+ san4
985
+ sang1
986
+ sang3
987
+ sang4
988
+ sao1
989
+ sao2
990
+ sao3
991
+ sao4
992
+ se4
993
+ sen1
994
+ seng1
995
+ sha1
996
+ sha2
997
+ sha3
998
+ sha4
999
+ shai1
1000
+ shai2
1001
+ shai3
1002
+ shai4
1003
+ shan1
1004
+ shan3
1005
+ shan4
1006
+ shang
1007
+ shang1
1008
+ shang3
1009
+ shang4
1010
+ shao1
1011
+ shao2
1012
+ shao3
1013
+ shao4
1014
+ she1
1015
+ she2
1016
+ she3
1017
+ she4
1018
+ shei2
1019
+ shen1
1020
+ shen2
1021
+ shen3
1022
+ shen4
1023
+ sheng1
1024
+ sheng2
1025
+ sheng3
1026
+ sheng4
1027
+ shi
1028
+ shi1
1029
+ shi2
1030
+ shi3
1031
+ shi4
1032
+ shou1
1033
+ shou2
1034
+ shou3
1035
+ shou4
1036
+ shu1
1037
+ shu2
1038
+ shu3
1039
+ shu4
1040
+ shua1
1041
+ shua2
1042
+ shua3
1043
+ shua4
1044
+ shuai1
1045
+ shuai3
1046
+ shuai4
1047
+ shuan1
1048
+ shuan4
1049
+ shuang1
1050
+ shuang3
1051
+ shui2
1052
+ shui3
1053
+ shui4
1054
+ shun3
1055
+ shun4
1056
+ shuo1
1057
+ shuo4
1058
+ si1
1059
+ si2
1060
+ si3
1061
+ si4
1062
+ song1
1063
+ song3
1064
+ song4
1065
+ sou1
1066
+ sou3
1067
+ sou4
1068
+ su1
1069
+ su2
1070
+ su4
1071
+ suan1
1072
+ suan4
1073
+ sui1
1074
+ sui2
1075
+ sui3
1076
+ sui4
1077
+ sun1
1078
+ sun3
1079
+ suo
1080
+ suo1
1081
+ suo2
1082
+ suo3
1083
+ t
1084
+ ta1
1085
+ ta2
1086
+ ta3
1087
+ ta4
1088
+ tai1
1089
+ tai2
1090
+ tai4
1091
+ tan1
1092
+ tan2
1093
+ tan3
1094
+ tan4
1095
+ tang1
1096
+ tang2
1097
+ tang3
1098
+ tang4
1099
+ tao1
1100
+ tao2
1101
+ tao3
1102
+ tao4
1103
+ te4
1104
+ teng2
1105
+ ti1
1106
+ ti2
1107
+ ti3
1108
+ ti4
1109
+ tian1
1110
+ tian2
1111
+ tian3
1112
+ tiao1
1113
+ tiao2
1114
+ tiao3
1115
+ tiao4
1116
+ tie1
1117
+ tie2
1118
+ tie3
1119
+ tie4
1120
+ ting1
1121
+ ting2
1122
+ ting3
1123
+ tong1
1124
+ tong2
1125
+ tong3
1126
+ tong4
1127
+ tou
1128
+ tou1
1129
+ tou2
1130
+ tou4
1131
+ tu1
1132
+ tu2
1133
+ tu3
1134
+ tu4
1135
+ tuan1
1136
+ tuan2
1137
+ tui1
1138
+ tui2
1139
+ tui3
1140
+ tui4
1141
+ tun1
1142
+ tun2
1143
+ tun4
1144
+ tuo1
1145
+ tuo2
1146
+ tuo3
1147
+ tuo4
1148
+ u
1149
+ v
1150
+ w
1151
+ wa
1152
+ wa1
1153
+ wa2
1154
+ wa3
1155
+ wa4
1156
+ wai1
1157
+ wai3
1158
+ wai4
1159
+ wan1
1160
+ wan2
1161
+ wan3
1162
+ wan4
1163
+ wang1
1164
+ wang2
1165
+ wang3
1166
+ wang4
1167
+ wei1
1168
+ wei2
1169
+ wei3
1170
+ wei4
1171
+ wen1
1172
+ wen2
1173
+ wen3
1174
+ wen4
1175
+ weng1
1176
+ weng4
1177
+ wo1
1178
+ wo2
1179
+ wo3
1180
+ wo4
1181
+ wu1
1182
+ wu2
1183
+ wu3
1184
+ wu4
1185
+ x
1186
+ xi1
1187
+ xi2
1188
+ xi3
1189
+ xi4
1190
+ xia1
1191
+ xia2
1192
+ xia4
1193
+ xian1
1194
+ xian2
1195
+ xian3
1196
+ xian4
1197
+ xiang1
1198
+ xiang2
1199
+ xiang3
1200
+ xiang4
1201
+ xiao1
1202
+ xiao2
1203
+ xiao3
1204
+ xiao4
1205
+ xie1
1206
+ xie2
1207
+ xie3
1208
+ xie4
1209
+ xin1
1210
+ xin2
1211
+ xin4
1212
+ xing1
1213
+ xing2
1214
+ xing3
1215
+ xing4
1216
+ xiong1
1217
+ xiong2
1218
+ xiu1
1219
+ xiu3
1220
+ xiu4
1221
+ xu
1222
+ xu1
1223
+ xu2
1224
+ xu3
1225
+ xu4
1226
+ xuan1
1227
+ xuan2
1228
+ xuan3
1229
+ xuan4
1230
+ xue1
1231
+ xue2
1232
+ xue3
1233
+ xue4
1234
+ xun1
1235
+ xun2
1236
+ xun4
1237
+ y
1238
+ ya
1239
+ ya1
1240
+ ya2
1241
+ ya3
1242
+ ya4
1243
+ yan1
1244
+ yan2
1245
+ yan3
1246
+ yan4
1247
+ yang1
1248
+ yang2
1249
+ yang3
1250
+ yang4
1251
+ yao1
1252
+ yao2
1253
+ yao3
1254
+ yao4
1255
+ ye1
1256
+ ye2
1257
+ ye3
1258
+ ye4
1259
+ yi
1260
+ yi1
1261
+ yi2
1262
+ yi3
1263
+ yi4
1264
+ yin1
1265
+ yin2
1266
+ yin3
1267
+ yin4
1268
+ ying1
1269
+ ying2
1270
+ ying3
1271
+ ying4
1272
+ yo1
1273
+ yong1
1274
+ yong2
1275
+ yong3
1276
+ yong4
1277
+ you1
1278
+ you2
1279
+ you3
1280
+ you4
1281
+ yu1
1282
+ yu2
1283
+ yu3
1284
+ yu4
1285
+ yuan1
1286
+ yuan2
1287
+ yuan3
1288
+ yuan4
1289
+ yue1
1290
+ yue4
1291
+ yun1
1292
+ yun2
1293
+ yun3
1294
+ yun4
1295
+ z
1296
+ za1
1297
+ za2
1298
+ za3
1299
+ zai1
1300
+ zai3
1301
+ zai4
1302
+ zan1
1303
+ zan2
1304
+ zan3
1305
+ zan4
1306
+ zang1
1307
+ zang4
1308
+ zao1
1309
+ zao2
1310
+ zao3
1311
+ zao4
1312
+ ze2
1313
+ ze4
1314
+ zei2
1315
+ zen3
1316
+ zeng1
1317
+ zeng4
1318
+ zha1
1319
+ zha2
1320
+ zha3
1321
+ zha4
1322
+ zhai1
1323
+ zhai2
1324
+ zhai3
1325
+ zhai4
1326
+ zhan1
1327
+ zhan2
1328
+ zhan3
1329
+ zhan4
1330
+ zhang1
1331
+ zhang2
1332
+ zhang3
1333
+ zhang4
1334
+ zhao1
1335
+ zhao2
1336
+ zhao3
1337
+ zhao4
1338
+ zhe
1339
+ zhe1
1340
+ zhe2
1341
+ zhe3
1342
+ zhe4
1343
+ zhen1
1344
+ zhen2
1345
+ zhen3
1346
+ zhen4
1347
+ zheng1
1348
+ zheng2
1349
+ zheng3
1350
+ zheng4
1351
+ zhi1
1352
+ zhi2
1353
+ zhi3
1354
+ zhi4
1355
+ zhong1
1356
+ zhong2
1357
+ zhong3
1358
+ zhong4
1359
+ zhou1
1360
+ zhou2
1361
+ zhou3
1362
+ zhou4
1363
+ zhu1
1364
+ zhu2
1365
+ zhu3
1366
+ zhu4
1367
+ zhua1
1368
+ zhua2
1369
+ zhua3
1370
+ zhuai1
1371
+ zhuai3
1372
+ zhuai4
1373
+ zhuan1
1374
+ zhuan2
1375
+ zhuan3
1376
+ zhuan4
1377
+ zhuang1
1378
+ zhuang4
1379
+ zhui1
1380
+ zhui4
1381
+ zhun1
1382
+ zhun2
1383
+ zhun3
1384
+ zhuo1
1385
+ zhuo2
1386
+ zi
1387
+ zi1
1388
+ zi2
1389
+ zi3
1390
+ zi4
1391
+ zong1
1392
+ zong2
1393
+ zong3
1394
+ zong4
1395
+ zou1
1396
+ zou2
1397
+ zou3
1398
+ zou4
1399
+ zu1
1400
+ zu2
1401
+ zu3
1402
+ zuan1
1403
+ zuan3
1404
+ zuan4
1405
+ zui2
1406
+ zui3
1407
+ zui4
1408
+ zun1
1409
+ zuo
1410
+ zuo1
1411
+ zuo2
1412
+ zuo3
1413
+ zuo4
1414
+ {
1415
+ ~
1416
+ ¡
1417
+ ¢
1418
+ £
1419
+ ¥
1420
+ §
1421
+ ¨
1422
+ ©
1423
+ «
1424
+ ®
1425
+ ¯
1426
+ °
1427
+ ±
1428
+ ²
1429
+ ³
1430
+ ´
1431
+ µ
1432
+ ·
1433
+ ¹
1434
+ º
1435
+ »
1436
+ ¼
1437
+ ½
1438
+ ¾
1439
+ ¿
1440
+ À
1441
+ Á
1442
+ Â
1443
+ Ã
1444
+ Ä
1445
+ Å
1446
+ Æ
1447
+ Ç
1448
+ È
1449
+ É
1450
+ Ê
1451
+ Í
1452
+ Î
1453
+ Ñ
1454
+ Ó
1455
+ Ö
1456
+ ×
1457
+ Ø
1458
+ Ú
1459
+ Ü
1460
+ Ý
1461
+ Þ
1462
+ ß
1463
+ à
1464
+ á
1465
+ â
1466
+ ã
1467
+ ä
1468
+ å
1469
+ æ
1470
+ ç
1471
+ è
1472
+ é
1473
+ ê
1474
+ ë
1475
+ ì
1476
+ í
1477
+ î
1478
+ ï
1479
+ ð
1480
+ ñ
1481
+ ò
1482
+ ó
1483
+ ô
1484
+ õ
1485
+ ö
1486
+ ø
1487
+ ù
1488
+ ú
1489
+ û
1490
+ ü
1491
+ ý
1492
+ Ā
1493
+ ā
1494
+ ă
1495
+ ą
1496
+ ć
1497
+ Č
1498
+ č
1499
+ Đ
1500
+ đ
1501
+ ē
1502
+ ė
1503
+ ę
1504
+ ě
1505
+ ĝ
1506
+ ğ
1507
+ ħ
1508
+ ī
1509
+ į
1510
+ İ
1511
+ ı
1512
+ Ł
1513
+ ł
1514
+ ń
1515
+ ņ
1516
+ ň
1517
+ ŋ
1518
+ Ō
1519
+ ō
1520
+ ő
1521
+ œ
1522
+ ř
1523
+ Ś
1524
+ ś
1525
+ Ş
1526
+ ş
1527
+ Š
1528
+ š
1529
+ Ť
1530
+ ť
1531
+ ũ
1532
+ ū
1533
+ ź
1534
+ Ż
1535
+ ż
1536
+ Ž
1537
+ ž
1538
+ ơ
1539
+ ư
1540
+ ǎ
1541
+ ǐ
1542
+ ǒ
1543
+ ǔ
1544
+ ǚ
1545
+ ș
1546
+ ț
1547
+ ɑ
1548
+ ɔ
1549
+ ɕ
1550
+ ə
1551
+ ɛ
1552
+ ɜ
1553
+ ɡ
1554
+ ɣ
1555
+ ɪ
1556
+ ɫ
1557
+ ɴ
1558
+ ɹ
1559
+ ɾ
1560
+ ʃ
1561
+ ʊ
1562
+ ʌ
1563
+ ʒ
1564
+ ʔ
1565
+ ʰ
1566
+ ʷ
1567
+ ʻ
1568
+ ʾ
1569
+ ʿ
1570
+ ˈ
1571
+ ː
1572
+ ˙
1573
+ ˜
1574
+ ˢ
1575
+ ́
1576
+ ̅
1577
+ Α
1578
+ Β
1579
+ Δ
1580
+ Ε
1581
+ Θ
1582
+ Κ
1583
+ Λ
1584
+ Μ
1585
+ Ξ
1586
+ Π
1587
+ Σ
1588
+ Τ
1589
+ Φ
1590
+ Χ
1591
+ Ψ
1592
+ Ω
1593
+ ά
1594
+ έ
1595
+ ή
1596
+ ί
1597
+ α
1598
+ β
1599
+ γ
1600
+ δ
1601
+ ε
1602
+ ζ
1603
+ η
1604
+ θ
1605
+ ι
1606
+ κ
1607
+ λ
1608
+ μ
1609
+ ν
1610
+ ξ
1611
+ ο
1612
+ π
1613
+ ρ
1614
+ ς
1615
+ σ
1616
+ τ
1617
+ υ
1618
+ φ
1619
+ χ
1620
+ ψ
1621
+ ω
1622
+ ϊ
1623
+ ό
1624
+ ύ
1625
+ ώ
1626
+ ϕ
1627
+ ϵ
1628
+ Ё
1629
+ А
1630
+ Б
1631
+ В
1632
+ Г
1633
+ Д
1634
+ Е
1635
+ Ж
1636
+ З
1637
+ И
1638
+ Й
1639
+ К
1640
+ Л
1641
+ М
1642
+ Н
1643
+ О
1644
+ П
1645
+ Р
1646
+ С
1647
+ Т
1648
+ У
1649
+ Ф
1650
+ Х
1651
+ Ц
1652
+ Ч
1653
+ Ш
1654
+ Щ
1655
+ Ы
1656
+ Ь
1657
+ Э
1658
+ Ю
1659
+ Я
1660
+ а
1661
+ б
1662
+ в
1663
+ г
1664
+ д
1665
+ е
1666
+ ж
1667
+ з
1668
+ и
1669
+ й
1670
+ к
1671
+ л
1672
+ м
1673
+ н
1674
+ о
1675
+ п
1676
+ р
1677
+ с
1678
+ т
1679
+ у
1680
+ ф
1681
+ х
1682
+ ц
1683
+ ч
1684
+ ш
1685
+ щ
1686
+ ъ
1687
+ ы
1688
+ ь
1689
+ э
1690
+ ю
1691
+ я
1692
+ ё
1693
+ і
1694
+ ְ
1695
+ ִ
1696
+ ֵ
1697
+ ֶ
1698
+ ַ
1699
+ ָ
1700
+ ֹ
1701
+ ּ
1702
+ ־
1703
+ ׁ
1704
+ א
1705
+ ב
1706
+ ג
1707
+ ד
1708
+ ה
1709
+ ו
1710
+ ז
1711
+ ח
1712
+ ט
1713
+ י
1714
+ כ
1715
+ ל
1716
+ ם
1717
+ מ
1718
+ ן
1719
+ נ
1720
+ ס
1721
+ ע
1722
+ פ
1723
+ ק
1724
+ ר
1725
+ ש
1726
+ ת
1727
+ أ
1728
+ ب
1729
+ ة
1730
+ ت
1731
+ ج
1732
+ ح
1733
+ د
1734
+ ر
1735
+ ز
1736
+ س
1737
+ ص
1738
+ ط
1739
+ ع
1740
+ ق
1741
+ ك
1742
+ ل
1743
+ م
1744
+ ن
1745
+ ه
1746
+ و
1747
+ ي
1748
+ َ
1749
+ ُ
1750
+ ِ
1751
+ ْ
1752
+
1753
+
1754
+
1755
+
1756
+
1757
+
1758
+
1759
+
1760
+
1761
+
1762
+
1763
+
1764
+
1765
+
1766
+
1767
+
1768
+
1769
+
1770
+
1771
+
1772
+
1773
+
1774
+
1775
+
1776
+
1777
+
1778
+
1779
+
1780
+
1781
+
1782
+
1783
+
1784
+
1785
+
1786
+
1787
+
1788
+
1789
+
1790
+
1791
+
1792
+
1793
+
1794
+
1795
+
1796
+
1797
+
1798
+
1799
+
1800
+ ế
1801
+
1802
+
1803
+
1804
+
1805
+
1806
+
1807
+
1808
+
1809
+
1810
+
1811
+
1812
+
1813
+
1814
+
1815
+
1816
+
1817
+
1818
+
1819
+
1820
+
1821
+
1822
+
1823
+
1824
+
1825
+
1826
+
1827
+
1828
+
1829
+
1830
+ ���
1831
+
1832
+
1833
+
1834
+
1835
+
1836
+
1837
+
1838
+
1839
+
1840
+
1841
+
1842
+
1843
+
1844
+
1845
+
1846
+
1847
+
1848
+
1849
+
1850
+
1851
+
1852
+
1853
+
1854
+
1855
+
1856
+
1857
+
1858
+
1859
+
1860
+
1861
+
1862
+
1863
+
1864
+
1865
+
1866
+
1867
+
1868
+
1869
+
1870
+
1871
+
1872
+
1873
+
1874
+
1875
+
1876
+
1877
+
1878
+
1879
+
1880
+
1881
+
1882
+
1883
+
1884
+
1885
+
1886
+
1887
+
1888
+
1889
+
1890
+
1891
+
1892
+
1893
+
1894
+
1895
+
1896
+
1897
+
1898
+
1899
+
1900
+
1901
+
1902
+
1903
+
1904
+
1905
+
1906
+
1907
+
1908
+
1909
+
1910
+
1911
+
1912
+
1913
+
1914
+
1915
+
1916
+
1917
+
1918
+
1919
+
1920
+
1921
+
1922
+
1923
+
1924
+
1925
+
1926
+
1927
+
1928
+
1929
+
1930
+
1931
+
1932
+
1933
+
1934
+
1935
+
1936
+
1937
+
1938
+
1939
+
1940
+
1941
+
1942
+
1943
+
1944
+
1945
+
1946
+
1947
+
1948
+
1949
+
1950
+
1951
+
1952
+
1953
+
1954
+
1955
+
1956
+
1957
+
1958
+
1959
+
1960
+
1961
+
1962
+
1963
+
1964
+
1965
+
1966
+
1967
+
1968
+
1969
+
1970
+
1971
+
1972
+
1973
+
1974
+
1975
+
1976
+
1977
+
1978
+
1979
+
1980
+
1981
+
1982
+
1983
+
1984
+
1985
+
1986
+
1987
+
1988
+
1989
+
1990
+
1991
+
1992
+
1993
+
1994
+
1995
+
1996
+
1997
+
1998
+
1999
+
2000
+
2001
+
2002
+
2003
+
2004
+
2005
+
2006
+
2007
+
2008
+
2009
+
2010
+
2011
+
2012
+
2013
+
2014
+
2015
+
2016
+
2017
+
2018
+
2019
+
2020
+
2021
+
2022
+
2023
+
2024
+
2025
+
2026
+
2027
+
2028
+
2029
+
2030
+
2031
+
2032
+
2033
+
2034
+
2035
+
2036
+
2037
+
2038
+
2039
+
2040
+
2041
+
2042
+
2043
+
2044
+
2045
+
2046
+
2047
+
2048
+
2049
+
2050
+
2051
+
2052
+
2053
+
2054
+
2055
+
2056
+
2057
+
2058
+
2059
+
2060
+
2061
+
2062
+
2063
+
2064
+
2065
+
2066
+
2067
+
2068
+
2069
+
2070
+
2071
+
2072
+
2073
+
2074
+
2075
+
2076
+
2077
+
2078
+
2079
+
2080
+
2081
+
2082
+
2083
+
2084
+
2085
+
2086
+
2087
+
2088
+
2089
+
2090
+
2091
+
2092
+
2093
+
2094
+
2095
+
2096
+
2097
+
2098
+
2099
+
2100
+
2101
+
2102
+
2103
+
2104
+
2105
+
2106
+
2107
+
2108
+
2109
+
2110
+
2111
+
2112
+
2113
+
2114
+
2115
+
2116
+
2117
+
2118
+
2119
+
2120
+
2121
+
2122
+
2123
+
2124
+
2125
+
2126
+
2127
+
2128
+
2129
+
2130
+
2131
+
2132
+
2133
+
2134
+
2135
+
2136
+
2137
+
2138
+
2139
+
2140
+
2141
+
2142
+
2143
+
2144
+
2145
+
2146
+
2147
+
2148
+
2149
+
2150
+
2151
+
2152
+
2153
+
2154
+
2155
+
2156
+
2157
+
2158
+
2159
+
2160
+
2161
+
2162
+
2163
+
2164
+
2165
+
2166
+
2167
+
2168
+
2169
+
2170
+
2171
+
2172
+
2173
+
2174
+
2175
+
2176
+
2177
+
2178
+
2179
+
2180
+
2181
+
2182
+
2183
+
2184
+
2185
+
2186
+
2187
+
2188
+
2189
+
2190
+
2191
+
2192
+
2193
+
2194
+
2195
+
2196
+
2197
+
2198
+
2199
+
2200
+
2201
+
2202
+
2203
+
2204
+
2205
+
2206
+
2207
+
2208
+
2209
+
2210
+
2211
+
2212
+
2213
+
2214
+
2215
+
2216
+
2217
+
2218
+
2219
+
2220
+
2221
+
2222
+
2223
+
2224
+
2225
+
2226
+
2227
+
2228
+
2229
+
2230
+
2231
+
2232
+
2233
+
2234
+
2235
+
2236
+
2237
+
2238
+
2239
+
2240
+
2241
+
2242
+
2243
+
2244
+
2245
+
2246
+
2247
+
2248
+
2249
+
2250
+
2251
+
2252
+
2253
+
2254
+
2255
+
2256
+
2257
+
2258
+
2259
+
2260
+
2261
+
2262
+
2263
+
2264
+
2265
+
2266
+
2267
+
2268
+
2269
+
2270
+
2271
+
2272
+
2273
+
2274
+
2275
+
2276
+
2277
+
2278
+
2279
+
2280
+
2281
+
2282
+
2283
+
2284
+
2285
+
2286
+
2287
+
2288
+
2289
+
2290
+
2291
+
2292
+
2293
+
2294
+
2295
+
2296
+
2297
+
2298
+
2299
+
2300
+
2301
+
2302
+
2303
+
2304
+
2305
+
2306
+
2307
+
2308
+
2309
+
2310
+
2311
+
2312
+
2313
+
2314
+
2315
+
2316
+
2317
+
2318
+
2319
+
2320
+
2321
+
2322
+
2323
+
2324
+
2325
+
2326
+
2327
+
2328
+
2329
+
2330
+
2331
+
2332
+
2333
+
2334
+
2335
+
2336
+
2337
+
2338
+
2339
+
2340
+
2341
+
2342
+
2343
+
2344
+
2345
+
2346
+
2347
+
2348
+
2349
+
2350
+
2351
+
2352
+
2353
+
2354
+
2355
+
2356
+
2357
+
2358
+
2359
+
2360
+
2361
+
2362
+
2363
+
2364
+
2365
+
2366
+
2367
+
2368
+
2369
+
2370
+
2371
+
2372
+
2373
+
2374
+
2375
+
2376
+
2377
+
2378
+
2379
+
2380
+
2381
+
2382
+
2383
+
2384
+
2385
+
2386
+
2387
+
2388
+
2389
+
2390
+
2391
+
2392
+
2393
+
2394
+
2395
+
2396
+
2397
+
2398
+
2399
+
2400
+
2401
+
2402
+
2403
+
2404
+
2405
+
2406
+
2407
+
2408
+
2409
+
2410
+
2411
+
2412
+
2413
+
2414
+
2415
+
2416
+
2417
+
2418
+
2419
+
2420
+
2421
+
2422
+
2423
+
2424
+
2425
+
2426
+
2427
+
2428
+
2429
+
2430
+
2431
+
2432
+
2433
+
2434
+
2435
+
2436
+
2437
+
2438
+
2439
+
2440
+
2441
+
2442
+
2443
+
2444
+
2445
+
2446
+
2447
+
2448
+
2449
+
2450
+
2451
+
2452
+
2453
+
2454
+
2455
+
2456
+
2457
+
2458
+
2459
+
2460
+
2461
+
2462
+
2463
+
2464
+
2465
+
2466
+
2467
+
2468
+
2469
+
2470
+
2471
+
2472
+
2473
+
2474
+
2475
+
2476
+
2477
+
2478
+
2479
+
2480
+
2481
+
2482
+
2483
+
2484
+
2485
+
2486
+
2487
+
2488
+
2489
+
2490
+
2491
+
2492
+
2493
+
2494
+
2495
+
2496
+
2497
+
2498
+
2499
+
2500
+
2501
+
2502
+
2503
+
2504
+
2505
+
2506
+
2507
+
2508
+
2509
+
2510
+
2511
+
2512
+
2513
+
2514
+
2515
+
2516
+
2517
+
2518
+
2519
+
2520
+
2521
+
2522
+
2523
+
2524
+
2525
+
2526
+
2527
+
2528
+
2529
+
2530
+
2531
+
2532
+
2533
+
2534
+
2535
+
2536
+
2537
+
2538
+
2539
+
2540
+
2541
+
2542
+
2543
+
2544
+
2545
+ 𠮶
f5_tts/infer/utils_infer.py ADDED
@@ -0,0 +1,547 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # A unified script for inference process
2
+ # Make adjustments inside functions, and consider both gradio and cli scripts if need to change func output format
3
+ import os
4
+ import sys
5
+
6
# Must be set before torch is imported. The variable name was previously misspelled
# ("PYTOCH_..."), so the MPS -> CPU fallback was never actually enabled.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # for MPS device compatibility
7
+ sys.path.append(f"{os.path.dirname(os.path.abspath(__file__))}/../../third_party/BigVGAN/")
8
+
9
+ import hashlib
10
+ import re
11
+ import tempfile
12
+ from importlib.resources import files
13
+
14
+ import matplotlib
15
+
16
+ matplotlib.use("Agg")
17
+
18
+ import matplotlib.pylab as plt
19
+ import numpy as np
20
+ import torch
21
+ import torchaudio
22
+ import tqdm
23
+ from huggingface_hub import snapshot_download, hf_hub_download
24
+ from pydub import AudioSegment, silence
25
+ from transformers import pipeline
26
+ from vocos import Vocos
27
+
28
+ from f5_tts.model import CFM
29
+ from f5_tts.model.utils import (
30
+ get_tokenizer,
31
+ convert_char_to_pinyin,
32
+ )
33
+
34
+ _ref_audio_cache = {}
35
+
36
+ device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
37
+
38
+ # -----------------------------------------
39
+
40
+ target_sample_rate = 24000
41
+ n_mel_channels = 100
42
+ hop_length = 256
43
+ win_length = 1024
44
+ n_fft = 1024
45
+ mel_spec_type = "vocos"
46
+ target_rms = 0.1
47
+ cross_fade_duration = 0.15
48
+ ode_method = "euler"
49
+ nfe_step = 32 # 16, 32
50
+ cfg_strength = 2.0
51
+ sway_sampling_coef = -1.0
52
+ speed = 1.0
53
+ fix_duration = None
54
+
55
+ # -----------------------------------------
56
+
57
+
58
+ # chunk text into smaller pieces
59
+
60
+
61
def chunk_text(text, max_chars=135):
    """
    Break the text into sentence-aligned chunks bounded by a UTF-8 byte budget.

    The text is cut at the punctuation boundaries matched by the split pattern.
    Consecutive pieces are packed greedily into the current chunk for as long as
    the chunk's UTF-8 byte length plus the next piece's byte length does not
    exceed ``max_chars``. A piece that does not fit starts a new chunk, even if
    it is itself longer than the budget.

    Args:
        text (str): The text to be split.
        max_chars (int): Budget per chunk, compared against UTF-8 byte lengths.

    Returns:
        List[str]: The whitespace-stripped chunks, in order.
    """

    def _byte_len(value):
        return len(value.encode("utf-8"))

    def _with_separator(piece):
        # A piece ending in a single-byte character gets a trailing space so the
        # following piece does not run into it; other endings are joined as-is.
        if piece and _byte_len(piece[-1]) == 1:
            return piece + " "
        return piece

    pieces = re.split(r"(?<=[;:,.!?])\s+|(?<=[;:,。!?])", text)

    result = []
    pending = ""
    for piece in pieces:
        if _byte_len(pending) + _byte_len(piece) > max_chars:
            # Budget exceeded: flush what has been gathered and start afresh.
            if pending:
                result.append(pending.strip())
            pending = _with_separator(piece)
            continue
        pending += _with_separator(piece)

    if pending:
        result.append(pending.strip())

    return result
89
+
90
+
91
+ # load vocoder
92
def load_vocoder(vocoder_name="vocos", is_local=False, local_path="", device=device, hf_cache_dir=None):
    """
    Build a vocoder, load its pretrained weights and return it in eval mode.

    Args:
        vocoder_name: Either "vocos" or "bigvgan".
        is_local: When true, read the vocoder files from ``local_path`` instead
            of downloading them from the Hugging Face Hub.
        local_path: Directory holding the vocoder files (used when ``is_local``).
        device: Device the vocoder is moved to.
        hf_cache_dir: Cache directory handed to the Hugging Face download calls.

    Returns:
        The vocoder, in eval mode, moved to ``device``.

    Raises:
        ImportError: If "bigvgan" is requested but the BigVGAN module cannot be
            imported (previously this only printed a hint and then failed on
            the unbound name ``bigvgan``).
        ValueError: If ``vocoder_name`` is not one of the supported names
            (previously this failed on the unbound local ``vocoder``).
    """
    if vocoder_name == "vocos":
        # vocoder = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(device)
        if is_local:
            print(f"Load vocos from local path {local_path}")
            config_path = f"{local_path}/config.yaml"
            model_path = f"{local_path}/pytorch_model.bin"
        else:
            print("Download Vocos from huggingface charactr/vocos-mel-24khz")
            repo_id = "charactr/vocos-mel-24khz"
            config_path = hf_hub_download(repo_id=repo_id, cache_dir=hf_cache_dir, filename="config.yaml")
            model_path = hf_hub_download(repo_id=repo_id, cache_dir=hf_cache_dir, filename="pytorch_model.bin")
        vocoder = Vocos.from_hparams(config_path)
        state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
        from vocos.feature_extractors import EncodecFeatures

        if isinstance(vocoder.feature_extractor, EncodecFeatures):
            # Carry over the encodec weights the freshly built model already holds,
            # under the key prefix the full state dict uses for them.
            encodec_parameters = {
                "feature_extractor.encodec." + key: value
                for key, value in vocoder.feature_extractor.encodec.state_dict().items()
            }
            state_dict.update(encodec_parameters)
        vocoder.load_state_dict(state_dict)
        vocoder = vocoder.eval().to(device)
    elif vocoder_name == "bigvgan":
        try:
            from third_party.BigVGAN import bigvgan
        except ImportError as err:
            raise ImportError(
                "You need to follow the README to init submodule and change the BigVGAN source code."
            ) from err
        if not is_local:
            # Local files can be downloaded from
            # https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x/tree/main
            local_path = snapshot_download(repo_id="nvidia/bigvgan_v2_24khz_100band_256x", cache_dir=hf_cache_dir)
        vocoder = bigvgan.BigVGAN.from_pretrained(local_path, use_cuda_kernel=False)

        vocoder.remove_weight_norm()
        vocoder = vocoder.eval().to(device)
    else:
        raise ValueError(f"Unsupported vocoder_name {vocoder_name!r}; expected 'vocos' or 'bigvgan'.")
    return vocoder
131
+
132
+
133
+ # load asr pipeline
134
+
135
+ asr_pipe = None
136
+
137
+
138
def initialize_asr_pipeline(device: str = device, dtype=None):
    """
    Create the Whisper speech-recognition pipeline and store it in the module-level ``asr_pipe``.

    Args:
        device: Device string passed to the pipeline.
        dtype: Torch dtype for the model. When ``None``, float16 is chosen for a
            CUDA device whose reported major version is at least 6 and whose
            device name does not end with "[ZLUDA]"; float32 otherwise.
    """
    global asr_pipe

    if dtype is None:
        half_precision_ok = (
            "cuda" in device
            and torch.cuda.get_device_properties(device).major >= 6
            and not torch.cuda.get_device_name().endswith("[ZLUDA]")
        )
        dtype = torch.float16 if half_precision_ok else torch.float32

    asr_pipe = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-large-v3-turbo",
        torch_dtype=dtype,
        device=device,
    )
154
+
155
+
156
+ # transcribe
157
+
158
+
159
def transcribe(ref_audio, language=None):
    """
    Transcribe the reference audio with the shared ASR pipeline.

    The pipeline is created on first use with the module-level default device.

    Args:
        ref_audio: Audio input handed to the ASR pipeline as-is.
        language: Optional language forwarded to generation; omitted when falsy.

    Returns:
        The recognised text with surrounding whitespace stripped.
    """
    global asr_pipe
    if asr_pipe is None:
        initialize_asr_pipeline(device=device)

    generate_kwargs = {"task": "transcribe"}
    if language:
        generate_kwargs["language"] = language

    result = asr_pipe(
        ref_audio,
        chunk_length_s=30,
        batch_size=128,
        generate_kwargs=generate_kwargs,
        return_timestamps=False,
    )
    return result["text"].strip()
170
+
171
+
172
+ # load model checkpoint for inference
173
+
174
+
175
def load_checkpoint(model, ckpt_path, device: str, dtype=None, use_ema=True):
    """
    Load weights from a checkpoint file into the model and move it to the device.

    Args:
        model: Module receiving the weights.
        ckpt_path: Checkpoint path; a ".safetensors" extension selects the
            safetensors loader, anything else goes through ``torch.load``.
        device: Device the checkpoint is mapped to and the model is moved to.
        dtype: Dtype the model is cast to before loading. When ``None``, float16
            is used on a CUDA device whose reported major version is at least 6
            and whose name does not end with "[ZLUDA]"; float32 otherwise.
        use_ema: When true, load the EMA weights (keys stripped of the
            "ema_model." prefix); otherwise load the plain model weights.

    Returns:
        The model with weights loaded, on ``device``.
    """
    if dtype is None:
        half_precision_ok = (
            "cuda" in device
            and torch.cuda.get_device_properties(device).major >= 6
            and not torch.cuda.get_device_name().endswith("[ZLUDA]")
        )
        dtype = torch.float16 if half_precision_ok else torch.float32
    model = model.to(dtype)

    is_safetensors = ckpt_path.split(".")[-1] == "safetensors"
    if is_safetensors:
        from safetensors.torch import load_file

        raw = load_file(ckpt_path, device=device)
    else:
        raw = torch.load(ckpt_path, map_location=device, weights_only=True)

    if use_ema:
        # A safetensors file is itself the flat EMA state; a torch checkpoint nests it.
        ema_entries = raw if is_safetensors else raw["ema_model_state_dict"]
        state_dict = {}
        for name, tensor in ema_entries.items():
            if name in ("initted", "step"):
                # EMA bookkeeping entries, not model weights.
                continue
            state_dict[name.replace("ema_model.", "")] = tensor
        del ema_entries

        # patch for backward compatibility, 305e3ea
        for legacy_key in ("mel_spec.mel_stft.mel_scale.fb", "mel_spec.mel_stft.spectrogram.window"):
            state_dict.pop(legacy_key, None)
    else:
        state_dict = raw if is_safetensors else raw["model_state_dict"]

    model.load_state_dict(state_dict)

    # Drop the references to the loaded tensors before clearing the CUDA cache.
    del raw, state_dict
    torch.cuda.empty_cache()

    return model.to(device)
218
+
219
+
220
+ # load model for inference
221
+
222
+
223
def load_model(
    model_cls,
    model_cfg,
    ckpt_path,
    mel_spec_type=mel_spec_type,
    vocab_file="",
    ode_method=ode_method,
    use_ema=True,
    device=device,
):
    """
    Build a CFM model around the given backbone class and load its checkpoint.

    Args:
        model_cls: Backbone class, instantiated with ``model_cfg`` plus the
            vocabulary size and the mel dimension.
        model_cfg: Keyword arguments for ``model_cls``.
        ckpt_path: Checkpoint to load.
        mel_spec_type: Mel spectrogram type; "bigvgan" forces float32 weights.
        vocab_file: Vocabulary file; an empty string selects the vocab.txt
            bundled under the ``f5_tts`` package.
        ode_method: ODE solver method passed to the CFM model.
        use_ema: Forwarded to ``load_checkpoint``.
        device: Device for the model.

    Returns:
        The model with checkpoint weights loaded.
    """
    if vocab_file == "":
        vocab_file = str(files("f5_tts").joinpath("infer/examples/vocab.txt"))
    tokenizer = "custom"

    print("\nvocab : ", vocab_file)
    print("token : ", tokenizer)
    print("model : ", ckpt_path, "\n")

    vocab_char_map, vocab_size = get_tokenizer(vocab_file, tokenizer)

    backbone = model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels)
    mel_spec_kwargs = {
        "n_fft": n_fft,
        "hop_length": hop_length,
        "win_length": win_length,
        "n_mel_channels": n_mel_channels,
        "target_sample_rate": target_sample_rate,
        "mel_spec_type": mel_spec_type,
    }
    cfm = CFM(
        transformer=backbone,
        mel_spec_kwargs=mel_spec_kwargs,
        odeint_kwargs={"method": ode_method},
        vocab_char_map=vocab_char_map,
    ).to(device)

    # bigvgan checkpoints are always loaded in float32; otherwise load_checkpoint decides.
    dtype = torch.float32 if mel_spec_type == "bigvgan" else None
    return load_checkpoint(cfm, ckpt_path, device, dtype=dtype, use_ema=use_ema)
262
+
263
+
264
def remove_silence_edges(audio, silence_threshold=-42):
    """
    Trim silence from both ends of an audio segment.

    Args:
        audio: Segment to trim (sliced and iterated in millisecond steps).
        silence_threshold: dBFS level; a millisecond slice louder than this
            counts as non-silent.

    Returns:
        The segment with leading and trailing silence removed.
    """
    # Leading edge: drop everything before the first non-silent position.
    leading_ms = silence.detect_leading_silence(audio, silence_threshold=silence_threshold)
    audio = audio[leading_ms:]

    # Trailing edge: walk backwards one millisecond slice at a time, shortening
    # the kept duration until a slice above the threshold is found.
    kept_seconds = audio.duration_seconds
    for slice_ms in reversed(audio):
        if slice_ms.dBFS > silence_threshold:
            break
        kept_seconds -= 0.001

    return audio[: int(kept_seconds * 1000)]
278
+
279
+
280
+ # preprocess reference audio and text
281
+
282
+
283
def _clip_on_silence(aseg, min_silence_len, silence_thresh, pass_label, show_info):
    # Rebuild the audio from its non-silent segments, stopping once the result is
    # already longer than 6 s and the next segment would push it past 15 s.
    clipped = AudioSegment.silent(duration=0)
    segments = silence.split_on_silence(
        aseg, min_silence_len=min_silence_len, silence_thresh=silence_thresh, keep_silence=1000, seek_step=10
    )
    for segment in segments:
        if len(clipped) > 6000 and len(clipped + segment) > 15000:
            show_info(f"Audio is over 15s, clipping short. ({pass_label})")
            break
        clipped += segment
    return clipped


def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_info=print, device=device):
    """
    Normalise the reference audio to a temporary wav file and finalise the reference text.

    The audio is optionally clipped to at most 15 s (preferring cuts at silences),
    stripped of leading/trailing silence, padded with 50 ms of silence and exported
    as a wav file. When no reference text is given, the exported audio is
    transcribed; transcriptions are cached by the MD5 of the exported file.

    Args:
        ref_audio_orig: Source audio, in any form ``AudioSegment.from_file`` accepts.
        ref_text: Reference transcript; blank means "transcribe the audio".
        clip_short: Whether to clip audio longer than 15 s.
        show_info: Callable receiving progress messages.
        device: Accepted for interface compatibility; not used by this function.

    Returns:
        Tuple ``(ref_audio, ref_text)``: path of the exported wav file (the file is
        not deleted by this function) and the text, guaranteed to end with ". "
        or "。".
    """
    show_info("Converting audio...")
    # Decode before creating the temp file so a failed decode leaves no stray file behind.
    aseg = AudioSegment.from_file(ref_audio_orig)

    if clip_short:
        # 1. try to find long silence for clipping
        clipped = _clip_on_silence(aseg, 1000, -50, 1, show_info)

        # 2. try to find short silence for clipping if 1. failed
        if len(clipped) > 15000:
            clipped = _clip_on_silence(aseg, 100, -40, 2, show_info)

        aseg = clipped

        # 3. if no proper silence found for clipping
        if len(aseg) > 15000:
            aseg = aseg[:15000]
            show_info("Audio is over 15s, clipping short. (3)")

    aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)

    # Reserve a path, then export after the handle is closed: writing to a file
    # that is still held open by NamedTemporaryFile fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        ref_audio = tmp.name
    aseg.export(ref_audio, format="wav")

    # Compute a hash of the reference audio file
    with open(ref_audio, "rb") as audio_file:
        audio_hash = hashlib.md5(audio_file.read()).hexdigest()

    if not ref_text.strip():
        global _ref_audio_cache
        if audio_hash in _ref_audio_cache:
            # Use cached asr transcription
            show_info("Using cached reference text...")
            ref_text = _ref_audio_cache[audio_hash]
        else:
            show_info("No reference text provided, transcribing reference audio...")
            ref_text = transcribe(ref_audio)
            # Cache the transcribed text (not caching custom ref_text, enabling users to do manual tweak)
            _ref_audio_cache[audio_hash] = ref_text
    else:
        show_info("Using custom reference text...")

    # Ensure ref_text ends with a proper sentence-ending punctuation
    if not ref_text.endswith(". ") and not ref_text.endswith("。"):
        if ref_text.endswith("."):
            ref_text += " "
        else:
            ref_text += ". "

    print("\nref_text ", ref_text)

    return ref_audio, ref_text
352
+
353
+
354
+ # infer process: chunk text -> infer batches [i.e. infer_batch_process()]
355
+
356
+
357
def infer_process(
    ref_audio,
    ref_text,
    gen_text,
    model_obj,
    vocoder,
    mel_spec_type=mel_spec_type,
    show_info=print,
    progress=tqdm,
    target_rms=target_rms,
    cross_fade_duration=cross_fade_duration,
    nfe_step=nfe_step,
    cfg_strength=cfg_strength,
    sway_sampling_coef=sway_sampling_coef,
    speed=speed,
    fix_duration=fix_duration,
    device=device,
):
    """
    Split the text to generate into batches and run batched inference on them.

    The chunk budget is derived from the reference: bytes of reference text per
    second of reference audio, multiplied by the seconds remaining up to 25.

    Args:
        ref_audio: Path of the reference audio, loaded with ``torchaudio.load``.
        ref_text: Transcript of the reference audio.
        gen_text: Text to synthesise.
        show_info: Callable receiving progress messages.
        Remaining arguments are forwarded unchanged to ``infer_batch_process``.

    Returns:
        Whatever ``infer_batch_process`` returns.
    """
    audio, sr = torchaudio.load(ref_audio)

    # Split the input text into batches
    ref_seconds = audio.shape[-1] / sr
    max_chars = int(len(ref_text.encode("utf-8")) / ref_seconds * (25 - ref_seconds))
    gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
    for index, batch_text in enumerate(gen_text_batches):
        print(f"gen_text {index}", batch_text)
    print("\n")

    show_info(f"Generating audio in {len(gen_text_batches)} batches...")
    forwarded = dict(
        mel_spec_type=mel_spec_type,
        progress=progress,
        target_rms=target_rms,
        cross_fade_duration=cross_fade_duration,
        nfe_step=nfe_step,
        cfg_strength=cfg_strength,
        sway_sampling_coef=sway_sampling_coef,
        speed=speed,
        fix_duration=fix_duration,
        device=device,
    )
    return infer_batch_process((audio, sr), ref_text, gen_text_batches, model_obj, vocoder, **forwarded)
401
+
402
+
403
+ # infer batches
404
+
405
+
406
def infer_batch_process(
    ref_audio,
    ref_text,
    gen_text_batches,
    model_obj,
    vocoder,
    mel_spec_type="vocos",
    progress=tqdm,
    target_rms=0.1,
    cross_fade_duration=0.15,
    nfe_step=32,
    cfg_strength=2.0,
    sway_sampling_coef=-1,
    speed=1,
    fix_duration=None,
    device=None,
):
    """
    Generate audio for each text batch and join the results into one waveform.

    Args:
        ref_audio: Pair ``(waveform tensor, sample rate)`` of the reference audio.
        ref_text: Transcript of the reference audio.
        gen_text_batches: Text chunks to synthesise, one model call per chunk.
        model_obj: Model providing ``sample(...)``.
        vocoder: Vocoder turning mel spectrograms into waveforms.
        mel_spec_type: "vocos" (uses ``vocoder.decode``) or "bigvgan" (calls the vocoder).
        progress: Object whose ``tqdm`` attribute wraps the batch iterable.
        target_rms: Reference audio quieter than this is scaled up to it, and the
            generated audio is scaled back down by the same ratio.
        cross_fade_duration: Seconds of linear cross-fade between consecutive
            chunks; ``<= 0`` concatenates without fading.
        nfe_step, cfg_strength, sway_sampling_coef: Forwarded to ``model_obj.sample``.
        speed: Divides the estimated duration of the generated part.
        fix_duration: When set, total duration in seconds used for every chunk.
        device: Device the reference audio is moved to.

    Returns:
        Tuple ``(final_wave, target_sample_rate, combined_spectrogram)``.
    """
    audio, sr = ref_audio
    if audio.shape[0] > 1:
        # Down-mix multi-channel reference audio by averaging the channels.
        audio = torch.mean(audio, dim=0, keepdim=True)

    rms = torch.sqrt(torch.mean(torch.square(audio)))
    if rms < target_rms:
        audio = audio * target_rms / rms
    if sr != target_sample_rate:
        audio = torchaudio.transforms.Resample(sr, target_sample_rate)(audio)
    audio = audio.to(device)

    # Separate reference and generated text when the reference ends in a single-byte character.
    if len(ref_text[-1].encode("utf-8")) == 1:
        ref_text = ref_text + " "

    generated_waves = []
    spectrograms = []
    for gen_text in progress.tqdm(gen_text_batches):
        # Prepare the text
        final_text_list = convert_char_to_pinyin([ref_text + gen_text])

        ref_audio_len = audio.shape[-1] // hop_length
        if fix_duration is None:
            # Estimate the generated length from the reference frames-per-byte ratio.
            ref_text_len = len(ref_text.encode("utf-8"))
            gen_text_len = len(gen_text.encode("utf-8"))
            duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)
        else:
            duration = int(fix_duration * target_sample_rate / hop_length)

        # inference
        with torch.inference_mode():
            generated, _ = model_obj.sample(
                cond=audio,
                text=final_text_list,
                duration=duration,
                steps=nfe_step,
                cfg_strength=cfg_strength,
                sway_sampling_coef=sway_sampling_coef,
            )

            # Keep only the frames after the reference part.
            generated = generated.to(torch.float32)[:, ref_audio_len:, :]
            generated_mel_spec = generated.permute(0, 2, 1)
            if mel_spec_type == "vocos":
                generated_wave = vocoder.decode(generated_mel_spec)
            elif mel_spec_type == "bigvgan":
                generated_wave = vocoder(generated_mel_spec)
            if rms < target_rms:
                # Undo the gain that was applied to the reference audio.
                generated_wave = generated_wave * rms / target_rms

            # wav -> numpy
            generated_waves.append(generated_wave.squeeze().cpu().numpy())
            spectrograms.append(generated_mel_spec[0].cpu().numpy())

    # Combine all generated waves with cross-fading
    if cross_fade_duration <= 0:
        # Simply concatenate
        final_wave = np.concatenate(generated_waves)
    else:
        requested_overlap = int(cross_fade_duration * target_sample_rate)
        final_wave = generated_waves[0]
        for next_wave in generated_waves[1:]:
            # The overlap can never exceed either of the two waves being joined.
            overlap = min(requested_overlap, len(final_wave), len(next_wave))

            if overlap <= 0:
                # No overlap possible, concatenate
                final_wave = np.concatenate([final_wave, next_wave])
                continue

            # Linear fade-out of the tail blended with a linear fade-in of the head.
            fade_out = np.linspace(1, 0, overlap)
            fade_in = np.linspace(0, 1, overlap)
            blended = final_wave[-overlap:] * fade_out + next_wave[:overlap] * fade_in

            final_wave = np.concatenate([final_wave[:-overlap], blended, next_wave[overlap:]])

    # Create a combined spectrogram
    combined_spectrogram = np.concatenate(spectrograms, axis=1)

    return final_wave, target_sample_rate, combined_spectrogram
522
+
523
+
524
+ # remove silence from generated wav
525
+
526
+
527
def remove_silence_for_generated_wav(filename):
    """
    Rewrite the audio file in place with its long silences shortened.

    The audio is split wherever at least 1000 ms stays below -50 dBFS, keeping
    500 ms of silence around each piece, and the pieces are joined back together
    and exported to the same path as wav.

    Args:
        filename: Path of the audio file to read and overwrite.
    """
    original = AudioSegment.from_file(filename)
    pieces = silence.split_on_silence(
        original, min_silence_len=1000, silence_thresh=-50, keep_silence=500, seek_step=10
    )
    # Start from an empty segment and append every non-silent piece in order.
    joined = sum(pieces, AudioSegment.silent(duration=0))
    joined.export(filename, format="wav")
537
+
538
+
539
+ # save spectrogram
540
+
541
+
542
def save_spectrogram(spectrogram, path):
    """
    Render a spectrogram as an image with a colorbar and save it to ``path``.

    Args:
        spectrogram: 2-D array-like drawn with ``imshow`` (origin at the bottom).
        path: Destination passed to ``savefig``.
    """
    fig = plt.figure(figsize=(12, 4))
    try:
        plt.imshow(spectrogram, origin="lower", aspect="auto")
        plt.colorbar()
        plt.savefig(path)
    finally:
        # Always release the figure, even when drawing or saving fails, so
        # repeated calls do not accumulate open figures.
        plt.close(fig)
f5_tts/model/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from f5_tts.model.cfm import CFM
2
+
3
+ from f5_tts.model.backbones.unett import UNetT
4
+ from f5_tts.model.backbones.dit import DiT
5
+ from f5_tts.model.backbones.mmdit import MMDiT
6
+
7
+ from f5_tts.model.trainer import Trainer
8
+
9
+
10
+ __all__ = ["CFM", "UNetT", "DiT", "MMDiT", "Trainer"]
f5_tts/model/backbones/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Backbones quick introduction
2
+
3
+
4
+ ### unett.py
5
+ - flat unet transformer
6
+ - structure same as in e2-tts & voicebox paper except using rotary pos emb
7
+ - update: allow possible abs pos emb & convnextv2 blocks for embedded text before concat
8
+
9
+ ### dit.py
10
+ - adaln-zero dit
11
+ - embedded timestep as condition
12
+ - concatenated noised_input + masked_cond + embedded_text, followed by a linear input projection
13
+ - possible abs pos emb & convnextv2 blocks for embedded text before concat
14
+ - possible long skip connection (first layer to last layer)
15
+
16
+ ### mmdit.py
17
+ - sd3 structure
18
+ - timestep as condition
19
+ - left stream: text is embedded, then an abs pos emb is applied
20
+ - right stream: masked_cond & noised_input are concatenated and given the same conv pos emb as unett
f5_tts/model/backbones/dit.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import torch
13
+ from torch import nn
14
+ import torch.nn.functional as F
15
+
16
+ from x_transformers.x_transformers import RotaryEmbedding
17
+
18
+ from f5_tts.model.modules import (
19
+ TimestepEmbedding,
20
+ ConvNeXtV2Block,
21
+ ConvPositionEmbedding,
22
+ DiTBlock,
23
+ AdaLayerNormZero_Final,
24
+ precompute_freqs_cis,
25
+ get_pos_embed_indices,
26
+ )
27
+
28
+
29
+ # Text embedding
30
+
31
+
32
class TextEmbedding(nn.Module):
    """
    Embed text token ids and fit them to the audio sequence length.

    Token ids are shifted by one so that index 0 can serve as the filler token.
    With ``conv_layers > 0`` a precomputed positional embedding is added and the
    result is passed through a stack of ConvNeXtV2 blocks.
    """

    def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult=2):
        super().__init__()
        self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim)  # use 0 as filler token

        self.extra_modeling = conv_layers > 0
        if self.extra_modeling:
            self.precompute_max_pos = 4096  # ~44s of 24khz audio
            self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
            conv_blocks = [ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
            self.text_blocks = nn.Sequential(*conv_blocks)

    def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
        # Shift ids so 0 is free for the filler token (batch padding of -1 becomes 0,
        # see list_str_to_idx()), then curtail to at most seq_len tokens.
        tokens = (text + 1)[:, :seq_len]
        batch, text_len = tokens.shape[0], tokens.shape[1]
        # Right-pad with the filler token up to seq_len.
        tokens = F.pad(tokens, (0, seq_len - text_len), value=0)

        if drop_text:  # cfg for text
            tokens = torch.zeros_like(tokens)

        embedded = self.text_embed(tokens)  # b n -> b n d

        if not self.extra_modeling:
            return embedded

        # sinus pos emb, every sequence in the batch starting at position 0
        batch_start = torch.zeros((batch,), dtype=torch.long)
        pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
        embedded = embedded + self.freqs_cis[pos_idx]

        # convnextv2 blocks
        return self.text_blocks(embedded)
70
+
71
+
72
+ # noised input audio and context mixing embedding
73
+
74
+
75
class InputEmbedding(nn.Module):
    """
    Mix noised audio, conditioning audio and text embeddings into one sequence.

    The three inputs are concatenated along the last dimension, projected to
    ``out_dim`` with a linear layer, and a convolutional position embedding of
    the projection is added back onto it.
    """

    def __init__(self, mel_dim, text_dim, out_dim):
        super().__init__()
        # Input width: noised audio (mel_dim) + cond audio (mel_dim) + text (text_dim).
        self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)

    def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False):  # noqa: F722
        if drop_audio_cond:  # cfg for cond audio
            cond = torch.zeros_like(cond)

        fused = torch.cat((x, cond, text_embed), dim=-1)
        projected = self.proj(fused)
        return self.conv_pos_embed(projected) + projected
88
+
89
+
90
+ # Transformer backbone using DiT blocks
91
+
92
+
93
class DiT(nn.Module):
    """
    Transformer backbone built from DiT blocks.

    The forward pass embeds the time step, the text and the (noised + conditioning)
    audio, runs the mixed sequence through ``depth`` DiT blocks with rotary
    position embeddings, applies a final time-conditioned norm and projects back
    to ``mel_dim``.
    """

    def __init__(
        self,
        *,
        dim,
        depth=8,
        heads=8,
        dim_head=64,
        dropout=0.1,
        ff_mult=4,
        mel_dim=100,
        text_num_embeds=256,
        text_dim=None,
        conv_layers=0,
        long_skip_connection=False,
        checkpoint_activations=False,
    ):
        super().__init__()

        self.time_embed = TimestepEmbedding(dim)
        # Text embeddings default to the same width as the mel features.
        if text_dim is None:
            text_dim = mel_dim
        self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers)
        self.input_embed = InputEmbedding(mel_dim, text_dim, dim)

        self.rotary_embed = RotaryEmbedding(dim_head)

        self.dim = dim
        self.depth = depth

        self.transformer_blocks = nn.ModuleList(
            [DiTBlock(dim=dim, heads=heads, dim_head=dim_head, ff_mult=ff_mult, dropout=dropout) for _ in range(depth)]
        )
        # Optional skip from the block-stack input to its output: both are
        # concatenated (hence dim * 2) and projected back to dim.
        self.long_skip_connection = nn.Linear(dim * 2, dim, bias=False) if long_skip_connection else None

        self.norm_out = AdaLayerNormZero_Final(dim)  # final modulation
        self.proj_out = nn.Linear(dim, mel_dim)

        self.checkpoint_activations = checkpoint_activations

    def ckpt_wrapper(self, module):
        """Wrap ``module`` in a plain function taking positional inputs, for use with checkpointing."""
        # https://github.com/chuanyangjin/fast-DiT/blob/main/models.py
        def ckpt_forward(*inputs):
            outputs = module(*inputs)
            return outputs

        return ckpt_forward

    def forward(
        self,
        x: float["b n d"],  # noised input audio  # noqa: F722
        cond: float["b n d"],  # masked cond audio  # noqa: F722
        text: int["b nt"],  # text  # noqa: F722
        time: float["b"] | float[""],  # time step  # noqa: F821 F722
        drop_audio_cond,  # cfg for cond audio
        drop_text,  # cfg for text
        mask: bool["b n"] | None = None,  # noqa: F722
    ):
        """
        Run the backbone on one batch.

        ``time`` may be a scalar tensor, in which case it is repeated for every
        batch element. ``drop_audio_cond`` and ``drop_text`` zero out the
        conditioning audio and the text tokens respectively. Returns the output
        of the final projection to ``mel_dim``.
        """
        batch, seq_len = x.shape[0], x.shape[1]
        if time.ndim == 0:
            time = time.repeat(batch)

        # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
        t = self.time_embed(time)
        text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
        x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)

        rope = self.rotary_embed.forward_from_seq_len(seq_len)

        # Remember the block-stack input for the optional long skip connection.
        if self.long_skip_connection is not None:
            residual = x

        for block in self.transformer_blocks:
            if self.checkpoint_activations:
                # Checkpointed call: inputs are passed positionally through the wrapper.
                x = torch.utils.checkpoint.checkpoint(self.ckpt_wrapper(block), x, t, mask, rope)
            else:
                x = block(x, t, mask=mask, rope=rope)

        if self.long_skip_connection is not None:
            x = self.long_skip_connection(torch.cat((x, residual), dim=-1))

        x = self.norm_out(x, t)
        output = self.proj_out(x)

        return output
f5_tts/model/backbones/mmdit.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import torch
13
+ from torch import nn
14
+
15
+ from x_transformers.x_transformers import RotaryEmbedding
16
+
17
+ from f5_tts.model.modules import (
18
+ TimestepEmbedding,
19
+ ConvPositionEmbedding,
20
+ MMDiTBlock,
21
+ AdaLayerNormZero_Final,
22
+ precompute_freqs_cis,
23
+ get_pos_embed_indices,
24
+ )
25
+
26
+
27
+ # text embedding
28
+
29
+
30
class TextEmbedding(nn.Module):
    """Embed text token ids and add a precomputed positional table to them."""

    def __init__(self, out_dim, text_num_embeds):
        super().__init__()
        # one extra row: index 0 is reserved as the filler token
        self.text_embed = nn.Embedding(text_num_embeds + 1, out_dim)

        self.precompute_max_pos = 1024
        pos_table = precompute_freqs_cis(out_dim, self.precompute_max_pos)
        self.register_buffer("freqs_cis", pos_table, persistent=False)

    def forward(self, text: int["b nt"], drop_text=False) -> int["b nt d"]:  # noqa: F722
        # shift ids by one so that 0 stays free for the filler token
        token_ids = text + 1
        if drop_text:
            token_ids = torch.zeros_like(token_ids)
        embedded = self.text_embed(token_ids)

        # sinus pos emb: every batch row starts counting at position 0
        n_batch, n_tokens = embedded.shape[0], embedded.shape[1]
        starts = torch.zeros((n_batch,), dtype=torch.long)
        pos_idx = get_pos_embed_indices(starts, n_tokens, max_pos=self.precompute_max_pos)

        return embedded + self.freqs_cis[pos_idx]
53
+
54
+
55
+ # noised input & masked cond audio embedding
56
+
57
+
58
class AudioEmbedding(nn.Module):
    """Project concatenated noised and conditioning audio, then add a conv position embedding."""

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.linear = nn.Linear(2 * in_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(out_dim)

    def forward(self, x: float["b n d"], cond: float["b n d"], drop_audio_cond=False):  # noqa: F722
        # cfg for cond audio: replace the conditioning with zeros when dropped
        audio_cond = torch.zeros_like(cond) if drop_audio_cond else cond
        projected = self.linear(torch.cat((x, audio_cond), dim=-1))
        return self.conv_pos_embed(projected) + projected
71
+
72
+
73
+ # Transformer backbone using MM-DiT blocks
74
+
75
+
76
class MMDiT(nn.Module):
    """Transformer backbone that stacks MM-DiT blocks over a text stream and an audio stream."""

    def __init__(
        self,
        *,
        dim,
        depth=8,
        heads=8,
        dim_head=64,
        dropout=0.1,
        ff_mult=4,
        text_num_embeds=256,
        mel_dim=100,
    ):
        super().__init__()

        self.time_embed = TimestepEmbedding(dim)
        self.text_embed = TextEmbedding(dim, text_num_embeds)
        self.audio_embed = AudioEmbedding(mel_dim, dim)

        self.rotary_embed = RotaryEmbedding(dim_head)

        self.dim = dim
        self.depth = depth

        # only the final block is built with context_pre_only=True
        last_layer = depth - 1
        blocks = []
        for layer_idx in range(depth):
            blocks.append(
                MMDiTBlock(
                    dim=dim,
                    heads=heads,
                    dim_head=dim_head,
                    dropout=dropout,
                    ff_mult=ff_mult,
                    context_pre_only=layer_idx == last_layer,
                )
            )
        self.transformer_blocks = nn.ModuleList(blocks)

        self.norm_out = AdaLayerNormZero_Final(dim)  # final modulation
        self.proj_out = nn.Linear(dim, mel_dim)

    def forward(
        self,
        x: float["b n d"],  # noised input audio  # noqa: F722
        cond: float["b n d"],  # masked cond audio  # noqa: F722
        text: int["b nt"],  # text  # noqa: F722
        time: float["b"] | float[""],  # time step  # noqa: F821 F722
        drop_audio_cond,  # cfg for cond audio
        drop_text,  # cfg for text
        mask: bool["b n"] | None = None,  # noqa: F722
    ):
        """Embed time, text and audio, run all blocks, and project the audio stream."""
        n_batch = x.shape[0]
        if time.ndim == 0:
            # a 0-dim time step is shared by the whole batch
            time = time.repeat(n_batch)

        # t: conditioning (time), c: context (text), x: noised input audio mixed with cond audio
        t = self.time_embed(time)
        c = self.text_embed(text, drop_text=drop_text)
        x = self.audio_embed(x, cond, drop_audio_cond=drop_audio_cond)

        # rotary tables are built separately for the audio and the text lengths
        rope_audio = self.rotary_embed.forward_from_seq_len(x.shape[1])
        rope_text = self.rotary_embed.forward_from_seq_len(text.shape[1])

        for layer in self.transformer_blocks:
            c, x = layer(x, c, t, mask=mask, rope=rope_audio, c_rope=rope_text)

        return self.proj_out(self.norm_out(x, t))
f5_tts/model/backbones/unett.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+ from typing import Literal
12
+
13
+ import torch
14
+ from torch import nn
15
+ import torch.nn.functional as F
16
+
17
+ from x_transformers import RMSNorm
18
+ from x_transformers.x_transformers import RotaryEmbedding
19
+
20
+ from f5_tts.model.modules import (
21
+ TimestepEmbedding,
22
+ ConvNeXtV2Block,
23
+ ConvPositionEmbedding,
24
+ Attention,
25
+ AttnProcessor,
26
+ FeedForward,
27
+ precompute_freqs_cis,
28
+ get_pos_embed_indices,
29
+ )
30
+
31
+
32
+ # Text embedding
33
+
34
+
35
class TextEmbedding(nn.Module):
    """Embed text ids padded or cut to the audio length, with optional ConvNeXtV2 refinement."""

    def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult=2):
        super().__init__()
        # one extra row: index 0 is used as the filler token
        self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim)

        self.extra_modeling = conv_layers > 0
        if self.extra_modeling:
            self.precompute_max_pos = 4096  # ~44s of 24khz audio
            pos_table = precompute_freqs_cis(text_dim, self.precompute_max_pos)
            self.register_buffer("freqs_cis", pos_table, persistent=False)
            conv_stack = [ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
            self.text_blocks = nn.Sequential(*conv_stack)

    def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
        # shift ids so 0 becomes the filler token; batch padding of -1 (see list_str_to_idx()) maps to 0
        token_ids = (text + 1)[:, :seq_len]  # curtail if character tokens are more than the mel spec tokens
        n_batch, n_tokens = token_ids.shape[0], token_ids.shape[1]
        token_ids = F.pad(token_ids, (0, seq_len - n_tokens), value=0)

        if drop_text:  # cfg for text
            token_ids = torch.zeros_like(token_ids)

        embedded = self.text_embed(token_ids)  # b n -> b n d

        if not self.extra_modeling:
            return embedded

        # sinus pos emb, every row starting at position 0
        starts = torch.zeros((n_batch,), dtype=torch.long)
        pos_idx = get_pos_embed_indices(starts, seq_len, max_pos=self.precompute_max_pos)
        embedded = embedded + self.freqs_cis[pos_idx]

        # convnextv2 blocks
        return self.text_blocks(embedded)
73
+
74
+
75
+ # noised input audio and context mixing embedding
76
+
77
+
78
class InputEmbedding(nn.Module):
    """Mix noised audio, conditioning audio and text embeddings into one projected sequence."""

    def __init__(self, mel_dim, text_dim, out_dim):
        super().__init__()
        self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)

    def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False):  # noqa: F722
        # cfg for cond audio: feed zeros instead of the conditioning when dropped
        audio_cond = torch.zeros_like(cond) if drop_audio_cond else cond

        mixed = self.proj(torch.cat((x, audio_cond, text_embed), dim=-1))
        return self.conv_pos_embed(mixed) + mixed
91
+
92
+
93
+ # Flat UNet Transformer backbone
94
+
95
+
96
class UNetT(nn.Module):
    """Flat UNet-style transformer: first-half layer inputs are fed back into the second half as skips."""

    def __init__(
        self,
        *,
        dim,
        depth=8,
        heads=8,
        dim_head=64,
        dropout=0.1,
        ff_mult=4,
        mel_dim=100,
        text_num_embeds=256,
        text_dim=None,
        conv_layers=0,
        skip_connect_type: Literal["add", "concat", "none"] = "concat",
    ):
        super().__init__()
        assert depth % 2 == 0, "UNet-Transformer's depth should be even."

        self.time_embed = TimestepEmbedding(dim)
        # text embeddings default to the mel dimension
        text_dim = mel_dim if text_dim is None else text_dim
        self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers)
        self.input_embed = InputEmbedding(mel_dim, text_dim, dim)

        self.rotary_embed = RotaryEmbedding(dim_head)

        # transformer layers & skip connections

        self.dim = dim
        self.skip_connect_type = skip_connect_type
        self.depth = depth

        half_depth = depth // 2
        concat_skips = skip_connect_type == "concat"

        self.layers = nn.ModuleList([])
        for layer_idx in range(depth):
            attn_norm = RMSNorm(dim)
            attn = Attention(
                processor=AttnProcessor(),
                dim=dim,
                heads=heads,
                dim_head=dim_head,
                dropout=dropout,
            )

            ff_norm = RMSNorm(dim)
            ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")

            # only second-half layers with "concat" skips need a projection back to dim
            if concat_skips and layer_idx >= half_depth:
                skip_proj = nn.Linear(dim * 2, dim, bias=False)
            else:
                skip_proj = None

            self.layers.append(nn.ModuleList([skip_proj, attn_norm, attn, ff_norm, ff]))

        self.norm_out = RMSNorm(dim)
        self.proj_out = nn.Linear(dim, mel_dim)

    def forward(
        self,
        x: float["b n d"],  # noised input audio  # noqa: F722
        cond: float["b n d"],  # masked cond audio  # noqa: F722
        text: int["b nt"],  # text  # noqa: F722
        time: float["b"] | float[""],  # time step  # noqa: F821 F722
        drop_audio_cond,  # cfg for cond audio
        drop_text,  # cfg for text
        mask: bool["b n"] | None = None,  # noqa: F722
    ):
        """Embed the inputs, run the skip-connected layer stack and project the audio positions."""
        n_batch, seq_len = x.shape[0], x.shape[1]
        if time.ndim == 0:
            time = time.repeat(n_batch)

        # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
        t = self.time_embed(time)
        text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
        x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)

        # put time t in front of input x, [b n d] -> [b n+1 d]
        x = torch.cat([t.unsqueeze(1), x], dim=1)  # pack t to x
        if mask is not None:
            # the extra leading position is always marked as valid
            mask = F.pad(mask, (1, 0), value=1)

        rope = self.rotary_embed.forward_from_seq_len(seq_len + 1)

        # flat unet transformer
        skip_mode = self.skip_connect_type
        half_depth = self.depth // 2
        pending = []
        for layer_idx, (maybe_skip_proj, attn_norm, attn, ff_norm, ff) in enumerate(self.layers):
            if layer_idx < half_depth:
                # first half: remember the layer input for the mirrored layer
                pending.append(x)
            else:
                # second half: consume the most recently stored input
                skip = pending.pop()
                if skip_mode == "concat":
                    x = maybe_skip_proj(torch.cat((x, skip), dim=-1))
                elif skip_mode == "add":
                    x = x + skip

            # attention and feedforward blocks
            x = attn(attn_norm(x), rope=rope, mask=mask) + x
            x = ff(ff_norm(x)) + x

        assert len(pending) == 0

        x = self.norm_out(x)[:, 1:, :]  # unpack t from x

        return self.proj_out(x)
f5_tts/model/cfm.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from random import random
13
+ from typing import Callable
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+ from torch import nn
18
+ from torch.nn.utils.rnn import pad_sequence
19
+ from torchdiffeq import odeint
20
+
21
+ from f5_tts.model.modules import MelSpec
22
+ from f5_tts.model.utils import (
23
+ default,
24
+ exists,
25
+ lens_to_mask,
26
+ list_str_to_idx,
27
+ list_str_to_tensor,
28
+ mask_from_frac_lengths,
29
+ )
30
+
31
+
32
class CFM(nn.Module):
    """Conditional flow-matching wrapper around a transformer backbone.

    ``forward`` computes the flow-matching training loss on a randomly masked
    span of the input, and ``sample`` integrates the predicted flow with
    ``odeint`` to generate mel frames (optionally passed through a vocoder).
    """

    def __init__(
        self,
        transformer: nn.Module,
        sigma=0.0,
        odeint_kwargs: dict = dict(
            # atol = 1e-5,
            # rtol = 1e-5,
            method="euler"  # 'midpoint'
        ),
        audio_drop_prob=0.3,
        cond_drop_prob=0.2,
        num_channels=None,
        mel_spec_module: nn.Module | None = None,
        mel_spec_kwargs: dict = dict(),
        frac_lengths_mask: tuple[float, float] = (0.7, 1.0),
        vocab_char_map: dict[str, int] | None = None,
    ):
        super().__init__()

        self.frac_lengths_mask = frac_lengths_mask

        # mel spec: build the default MelSpec only when no module was supplied
        self.mel_spec = mel_spec_module if exists(mel_spec_module) else MelSpec(**mel_spec_kwargs)
        num_channels = default(num_channels, self.mel_spec.n_mel_channels)
        self.num_channels = num_channels

        # classifier-free guidance
        self.audio_drop_prob = audio_drop_prob
        self.cond_drop_prob = cond_drop_prob

        # transformer
        self.transformer = transformer
        dim = transformer.dim
        self.dim = dim

        # conditional flow related
        self.sigma = sigma

        # sampling related; copied so instances never share the default dict object
        self.odeint_kwargs = dict(odeint_kwargs)

        # vocab map for tokenization
        self.vocab_char_map = vocab_char_map

    @property
    def device(self):
        """Device of the first parameter of this module."""
        return next(self.parameters()).device

    def _text_to_tensor(self, text, batch, device):
        """Convert a list of strings to an index tensor on ``device``; tensors pass through unchanged."""
        if not isinstance(text, list):
            return text
        if exists(self.vocab_char_map):
            text = list_str_to_idx(text, self.vocab_char_map).to(device)
        else:
            text = list_str_to_tensor(text).to(device)
        assert text.shape[0] == batch
        return text

    @torch.no_grad()
    def sample(
        self,
        cond: float["b n d"] | float["b nw"],  # noqa: F722
        text: int["b nt"] | list[str],  # noqa: F722
        duration: int | int["b"],  # noqa: F821
        *,
        lens: int["b"] | None = None,  # noqa: F821
        steps=32,
        cfg_strength=1.0,
        sway_sampling_coef=None,
        seed: int | None = None,
        max_duration=4096,
        vocoder: Callable[[float["b d n"]], float["b nw"]] | None = None,  # noqa: F722
        no_ref_audio=False,
        duplicate_test=False,
        t_inter=0.1,
        edit_mask=None,
    ):
        """Generate audio frames by integrating the predicted flow from noise.

        Args:
            cond: conditioning audio, either mel frames or a raw wave (2-D input
                is converted with ``self.mel_spec``).
            text: token indices or a list of strings to tokenize.
            duration: target length per sample, clamped to ``max_duration`` and
                forced to be at least ``lens + 1``.
            lens: optional lengths of the conditioning audio; defaults to the
                full conditioning length.
            steps: number of integration steps.
            cfg_strength: classifier-free guidance strength; values below 1e-5
                skip the unconditional pass.
            sway_sampling_coef: optional coefficient reshaping the time grid.
            seed: optional seed re-applied before drawing each sample's noise.
            max_duration: upper bound for ``duration``.
            vocoder: optional callable applied to the permuted output.
            no_ref_audio: zero the conditioning audio before it is fed to the model.
            duplicate_test: start from a blend of noise and shifted conditioning
                at ``t_inter`` for inner time step observation.
            t_inter: start time used when ``duplicate_test`` is set.
            edit_mask: optional mask AND-ed with the conditioning mask.

        Returns:
            ``(out, trajectory)`` where ``trajectory`` is what ``odeint`` returned
            and ``out`` is its last state with conditioned positions replaced by
            ``cond`` (and vocoded when ``vocoder`` is given).
        """
        self.eval()

        # raw wave
        if cond.ndim == 2:
            cond = self.mel_spec(cond)
            cond = cond.permute(0, 2, 1)
            assert cond.shape[-1] == self.num_channels

        cond = cond.to(next(self.parameters()).dtype)

        batch, cond_seq_len, device = *cond.shape[:2], cond.device
        if not exists(lens):
            lens = torch.full((batch,), cond_seq_len, device=device, dtype=torch.long)

        # text
        text = self._text_to_tensor(text, batch, device)

        if exists(text):
            text_lens = (text != -1).sum(dim=-1)
            lens = torch.maximum(text_lens, lens)  # make sure lengths are at least those of the text characters

        # duration
        cond_mask = lens_to_mask(lens)
        if edit_mask is not None:
            cond_mask = cond_mask & edit_mask

        if isinstance(duration, int):
            duration = torch.full((batch,), duration, device=device, dtype=torch.long)

        duration = torch.maximum(lens + 1, duration)  # just add one token so something is generated
        duration = duration.clamp(max=max_duration)
        max_duration = duration.amax()

        # duplicate test corner for inner time step observation
        if duplicate_test:
            test_cond = F.pad(cond, (0, 0, cond_seq_len, max_duration - 2 * cond_seq_len), value=0.0)

        cond = F.pad(cond, (0, 0, 0, max_duration - cond_seq_len), value=0.0)

        # test for no ref audio: must happen before step_cond is derived from cond,
        # otherwise the model would still be conditioned on the reference audio
        if no_ref_audio:
            cond = torch.zeros_like(cond)

        cond_mask = F.pad(cond_mask, (0, max_duration - cond_mask.shape[-1]), value=False)
        cond_mask = cond_mask.unsqueeze(-1)
        step_cond = torch.where(
            cond_mask, cond, torch.zeros_like(cond)
        )  # allow direct control (cut cond audio) with lens passed in

        # single inference needs no mask currently, which saves memory and time
        mask = lens_to_mask(duration) if batch > 1 else None

        # neural ode

        def fn(t, x):
            # at each step, conditioning is fixed (step_cond computed once above)

            # predict flow
            pred = self.transformer(
                x=x, cond=step_cond, text=text, time=t, mask=mask, drop_audio_cond=False, drop_text=False
            )
            if cfg_strength < 1e-5:
                return pred

            null_pred = self.transformer(
                x=x, cond=step_cond, text=text, time=t, mask=mask, drop_audio_cond=True, drop_text=True
            )
            return pred + (pred - null_pred) * cfg_strength

        # noise input
        # to make sure batch inference result is same with different batch size, and for sure single inference
        # still some difference maybe due to convolutional layers
        y0 = []
        for dur in duration:
            if exists(seed):
                torch.manual_seed(seed)
            y0.append(torch.randn(dur, self.num_channels, device=self.device, dtype=step_cond.dtype))
        y0 = pad_sequence(y0, padding_value=0, batch_first=True)

        t_start = 0

        # duplicate test corner for inner time step observation
        if duplicate_test:
            t_start = t_inter
            y0 = (1 - t_start) * y0 + t_start * test_cond
            steps = int(steps * (1 - t_start))

        t = torch.linspace(t_start, 1, steps + 1, device=self.device, dtype=step_cond.dtype)
        if sway_sampling_coef is not None:
            t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)

        trajectory = odeint(fn, y0, t, **self.odeint_kwargs)

        sampled = trajectory[-1]
        out = sampled
        out = torch.where(cond_mask, cond, out)

        if exists(vocoder):
            out = out.permute(0, 2, 1)
            out = vocoder(out)

        return out, trajectory

    def forward(
        self,
        inp: float["b n d"] | float["b nw"],  # mel or raw wave  # noqa: F722
        text: int["b nt"] | list[str],  # noqa: F722
        *,
        lens: int["b"] | None = None,  # noqa: F821
        noise_scheduler: str | None = None,
    ):
        """Compute the flow-matching loss for one batch.

        Args:
            inp: mel frames, or a raw wave (2-D input is converted with ``self.mel_spec``).
            text: token indices or a list of strings to tokenize.
            lens: optional valid lengths; defaults to the full sequence length.
            noise_scheduler: currently unused (see the TODO below).

        Returns:
            ``(loss, cond, pred)``: the mean squared error between the predicted
            and target flow inside the random span, the masked conditioning
            input, and the transformer prediction.
        """
        # handle raw wave
        if inp.ndim == 2:
            inp = self.mel_spec(inp)
            inp = inp.permute(0, 2, 1)
            assert inp.shape[-1] == self.num_channels

        batch, seq_len, dtype, device = *inp.shape[:2], inp.dtype, self.device

        # handle text as string
        text = self._text_to_tensor(text, batch, device)

        # lens and mask
        if not exists(lens):
            lens = torch.full((batch,), seq_len, device=device)

        mask = lens_to_mask(lens, length=seq_len)  # useless here, as collate_fn will pad to max length in batch

        # get a random span to mask out for training conditionally
        frac_lengths = torch.zeros((batch,), device=self.device).float().uniform_(*self.frac_lengths_mask)
        rand_span_mask = mask_from_frac_lengths(lens, frac_lengths)

        if exists(mask):
            rand_span_mask &= mask

        # mel is x1
        x1 = inp

        # x0 is gaussian noise
        x0 = torch.randn_like(x1)

        # time step
        time = torch.rand((batch,), dtype=dtype, device=self.device)
        # TODO. noise_scheduler

        # sample xt (φ_t(x) in the paper)
        t = time.unsqueeze(-1).unsqueeze(-1)
        φ = (1 - t) * x0 + t * x1
        flow = x1 - x0

        # only predict what is within the random mask span for infilling
        cond = torch.where(rand_span_mask[..., None], torch.zeros_like(x1), x1)

        # transformer and cfg training with a drop rate
        drop_audio_cond = random() < self.audio_drop_prob  # p_drop in voicebox paper
        if random() < self.cond_drop_prob:  # p_uncond in voicebox paper
            drop_audio_cond = True
            drop_text = True
        else:
            drop_text = False

        # if want rigorously mask out padding, record in collate_fn in dataset.py, and pass in here
        # adding mask will use more memory, thus also need to adjust batchsampler with scaled down threshold for long sequences
        pred = self.transformer(
            x=φ, cond=cond, text=text, time=time, drop_audio_cond=drop_audio_cond, drop_text=drop_text
        )

        # flow matching loss, restricted to the randomly masked span
        loss = F.mse_loss(pred, flow, reduction="none")
        loss = loss[rand_span_mask]

        return loss.mean(), cond, pred
f5_tts/model/dataset.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+ from importlib.resources import files
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import torchaudio
8
+ from datasets import Dataset as Dataset_
9
+ from datasets import load_from_disk
10
+ from torch import nn
11
+ from torch.utils.data import Dataset, Sampler
12
+ from tqdm import tqdm
13
+
14
+ from f5_tts.model.modules import MelSpec
15
+ from f5_tts.model.utils import default
16
+
17
+
18
class HFDataset(Dataset):
    """Dataset over rows holding decoded audio and text, yielding mel spectrograms.

    Each row is read as ``row["audio"]["array"]``, ``row["audio"]["sampling_rate"]``
    and ``row["text"]``.
    """

    def __init__(
        self,
        hf_dataset: Dataset,
        target_sample_rate=24_000,
        n_mel_channels=100,
        hop_length=256,
        n_fft=1024,
        win_length=1024,
        mel_spec_type="vocos",
    ):
        self.data = hf_dataset
        self.target_sample_rate = target_sample_rate
        self.hop_length = hop_length

        self.mel_spectrogram = MelSpec(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            n_mel_channels=n_mel_channels,
            target_sample_rate=target_sample_rate,
            mel_spec_type=mel_spec_type,
        )

    def get_frame_len(self, index):
        """Return the number of hop-sized frames of row ``index`` at the target sample rate."""
        row = self.data[index]
        audio = row["audio"]["array"]
        sample_rate = row["audio"]["sampling_rate"]
        return audio.shape[-1] / sample_rate * self.target_sample_rate / self.hop_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``{"mel_spec", "text"}`` for ``index`` or the next row with a usable duration.

        Rows shorter than 0.3 s or longer than 30 s are skipped, wrapping around
        the dataset. The search is an explicit bounded loop rather than
        recursion, so long runs of unusable rows cannot exhaust the call stack.

        Raises:
            RuntimeError: if no row in the dataset has a duration in [0.3, 30] s.
        """
        total = len(self.data)
        candidate = index
        for _ in range(max(total, 1)):
            row = self.data[candidate]
            audio = row["audio"]["array"]
            sample_rate = row["audio"]["sampling_rate"]
            duration = audio.shape[-1] / sample_rate

            if 0.3 <= duration <= 30:
                break  # valid

            candidate = (candidate + 1) % total
        else:
            raise RuntimeError("HFDataset contains no audio with a duration between 0.3 and 30 seconds")

        audio_tensor = torch.from_numpy(audio).float()

        # resample if necessary
        if sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, self.target_sample_rate)
            audio_tensor = resampler(audio_tensor)

        audio_tensor = audio_tensor.unsqueeze(0)  # 't -> 1 t'

        mel_spec = self.mel_spectrogram(audio_tensor)

        mel_spec = mel_spec.squeeze(0)  # '1 d t -> d t'

        text = row["text"]

        return dict(
            mel_spec=mel_spec,
            text=text,
        )
81
+
82
+
83
+ class CustomDataset(Dataset):
84
+ def __init__(
85
+ self,
86
+ custom_dataset: Dataset,
87
+ durations=None,
88
+ target_sample_rate=24_000,
89
+ hop_length=256,
90
+ n_mel_channels=100,
91
+ n_fft=1024,
92
+ win_length=1024,
93
+ mel_spec_type="vocos",
94
+ preprocessed_mel=False,
95
+ mel_spec_module: nn.Module | None = None,
96
+ ):
97
+ self.data = custom_dataset
98
+ self.durations = durations
99
+ self.target_sample_rate = target_sample_rate
100
+ self.hop_length = hop_length
101
+ self.n_fft = n_fft
102
+ self.win_length = win_length
103
+ self.mel_spec_type = mel_spec_type
104
+ self.preprocessed_mel = preprocessed_mel
105
+
106
+ if not preprocessed_mel:
107
+ self.mel_spectrogram = default(
108
+ mel_spec_module,
109
+ MelSpec(
110
+ n_fft=n_fft,
111
+ hop_length=hop_length,
112
+ win_length=win_length,
113
+ n_mel_channels=n_mel_channels,
114
+ target_sample_rate=target_sample_rate,
115
+ mel_spec_type=mel_spec_type,
116
+ ),
117
+ )
118
+
119
+ def get_frame_len(self, index):
120
+ if (
121
+ self.durations is not None
122
+ ): # Please make sure the separately provided durations are correct, otherwise 99.99% OOM
123
+ return self.durations[index] * self.target_sample_rate / self.hop_length
124
+ return self.data[index]["duration"] * self.target_sample_rate / self.hop_length
125
+
126
+ def __len__(self):
127
+ return len(self.data)
128
+
129
+ def __getitem__(self, index):
130
+ while True:
131
+ row = self.data[index]
132
+ audio_path = row["audio_path"]
133
+ text = row["text"]
134
+ duration = row["duration"]
135
+
136
+ # filter by given length
137
+ if 0.3 <= duration <= 30:
138
+ break # valid
139
+
140
+ index = (index + 1) % len(self.data)
141
+
142
+ if self.preprocessed_mel:
143
+ mel_spec = torch.tensor(row["mel_spec"])
144
+ else:
145
+ audio, source_sample_rate = torchaudio.load(audio_path)
146
+
147
+ # make sure mono input
148
+ if audio.shape[0] > 1:
149
+ audio = torch.mean(audio, dim=0, keepdim=True)
150
+
151
+ # resample if necessary
152
+ if source_sample_rate != self.target_sample_rate:
153
+ resampler = torchaudio.transforms.Resample(source_sample_rate, self.target_sample_rate)
154
+ audio = resampler(audio)
155
+
156
+ # to mel spectrogram
157
+ mel_spec = self.mel_spectrogram(audio)
158
+ mel_spec = mel_spec.squeeze(0) # '1 d t -> d t'
159
+
160
+ return {
161
+ "mel_spec": mel_spec,
162
+ "text": text,
163
+ }
164
+
165
+
166
+ # Dynamic Batch Sampler
167
class DynamicBatchSampler(Sampler[list[int]]):
    """Extension of Sampler that will do the following:
    1.  Change the batch size (essentially number of sequences)
        in a batch to ensure that the total number of frames are less
        than a certain threshold.
    2.  Make sure the padding efficiency in the batch is high.
    """

    def __init__(
        self, sampler: Sampler[int], frames_threshold: int, max_samples=0, random_seed=None, drop_last: bool = False
    ):
        self.sampler = sampler
        self.frames_threshold = frames_threshold
        self.max_samples = max_samples

        data_source = self.sampler.data_source

        # pair every sampled index with its frame length, then order by length
        indices = [
            (idx, data_source.get_frame_len(idx))
            for idx in tqdm(
                self.sampler, desc="Sorting with sampler... if slow, check whether dataset is provided with duration"
            )
        ]
        indices.sort(key=lambda pair: pair[1])

        batches = []
        current, current_frames = [], 0
        for idx, frame_len in tqdm(
            indices, desc=f"Creating dynamic batches with {frames_threshold} audio frames per gpu"
        ):
            within_frames = current_frames + frame_len <= self.frames_threshold
            within_count = max_samples == 0 or len(current) < max_samples
            if within_frames and within_count:
                current.append(idx)
                current_frames += frame_len
                continue

            # the running batch is full: flush it and start over
            if len(current) > 0:
                batches.append(current)
            if frame_len <= self.frames_threshold:
                current, current_frames = [idx], frame_len
            else:
                # a single item above the threshold is left out entirely
                current, current_frames = [], 0

        if not drop_last and len(current) > 0:
            batches.append(current)

        del indices

        # if want to have different batches between epochs, may just set a seed and log it in ckpt
        # cuz during multi-gpu training, although the batch on per gpu not change between epochs, the formed general minibatch is different
        # e.g. for epoch n, use (random_seed + n)
        random.seed(random_seed)
        random.shuffle(batches)

        self.batches = batches

    def __iter__(self):
        return iter(self.batches)

    def __len__(self):
        return len(self.batches)
227
+
228
+
229
+ # Load dataset
230
+
231
+
232
def load_dataset(
    dataset_name: str,
    tokenizer: str = "pinyin",
    dataset_type: str = "CustomDataset",
    audio_type: str = "raw",
    mel_spec_module: nn.Module | None = None,
    mel_spec_kwargs: dict = dict(),
) -> CustomDataset | HFDataset:
    """Build the training dataset.

    dataset_type    - "CustomDataset" if you want to use tokenizer name and default data path to load for train_dataset
                    - "CustomDatasetPath" if you just want to pass the full path to a preprocessed dataset without relying on tokenizer
                    - "HFDataset" to load ``{pre}/{pre}`` from the Hugging Face hub, where ``dataset_name`` is ``"{pre}_{post}"``
    audio_type      - "raw" or "mel"; only consulted for "CustomDataset"

    Raises:
        ValueError: if ``dataset_type`` or ``audio_type`` is not one of the supported values.
    """

    print("Loading dataset ...")

    if dataset_type == "CustomDataset":
        rel_data_path = str(files("f5_tts").joinpath(f"../../data/{dataset_name}_{tokenizer}"))
        if audio_type == "raw":
            try:
                train_dataset = load_from_disk(f"{rel_data_path}/raw")
            except Exception:
                # fall back to the single arrow file when the directory cannot be loaded
                train_dataset = Dataset_.from_file(f"{rel_data_path}/raw.arrow")
            preprocessed_mel = False
        elif audio_type == "mel":
            train_dataset = Dataset_.from_file(f"{rel_data_path}/mel.arrow")
            preprocessed_mel = True
        else:
            raise ValueError(f"Unsupported audio_type: {audio_type!r} (expected 'raw' or 'mel')")
        with open(f"{rel_data_path}/duration.json", "r", encoding="utf-8") as f:
            data_dict = json.load(f)
        durations = data_dict["duration"]
        train_dataset = CustomDataset(
            train_dataset,
            durations=durations,
            preprocessed_mel=preprocessed_mel,
            mel_spec_module=mel_spec_module,
            **mel_spec_kwargs,
        )

    elif dataset_type == "CustomDatasetPath":
        try:
            train_dataset = load_from_disk(f"{dataset_name}/raw")
        except Exception:
            # fall back to the single arrow file when the directory cannot be loaded
            train_dataset = Dataset_.from_file(f"{dataset_name}/raw.arrow")

        with open(f"{dataset_name}/duration.json", "r", encoding="utf-8") as f:
            data_dict = json.load(f)
        durations = data_dict["duration"]
        # this branch always loads raw audio, so mels are never preprocessed here
        train_dataset = CustomDataset(
            train_dataset,
            durations=durations,
            preprocessed_mel=False,
            mel_spec_module=mel_spec_module,
            **mel_spec_kwargs,
        )

    elif dataset_type == "HFDataset":
        # imported under an alias: this function shadows datasets.load_dataset
        from datasets import load_dataset as hf_load_dataset

        print(
            "Should manually modify the path of huggingface dataset to your need.\n"
            + "May also the corresponding script cuz different dataset may have different format."
        )
        pre, post = dataset_name.split("_")
        train_dataset = HFDataset(
            hf_load_dataset(
                f"{pre}/{pre}", split=f"train.{post}", cache_dir=str(files("f5_tts").joinpath("../../data"))
            ),
        )

    else:
        raise ValueError(
            f"Unsupported dataset_type: {dataset_type!r} "
            "(expected 'CustomDataset', 'CustomDatasetPath' or 'HFDataset')"
        )

    return train_dataset
293
+
294
+
295
+ # collation
296
+
297
+
298
def collate_fn(batch):
    """Right-pad mel spectrograms to the longest one in the batch and collect the texts.

    Returns a dict with the stacked padded mels, the original mel lengths,
    the list of texts and the text lengths.
    """
    specs = [sample["mel_spec"].squeeze(0) for sample in batch]
    mel_lengths = torch.LongTensor([spec.shape[-1] for spec in specs])
    longest = mel_lengths.amax()

    # TODO. maybe records mask for attention here
    padded = [F.pad(spec, (0, longest - spec.size(-1)), value=0) for spec in specs]

    texts = [sample["text"] for sample in batch]

    return {
        "mel": torch.stack(padded),
        "mel_lengths": mel_lengths,
        "text": texts,
        "text_lengths": torch.LongTensor([len(entry) for entry in texts]),
    }
f5_tts/model/modules.py ADDED
@@ -0,0 +1,658 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import math
13
+ from typing import Optional
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+ import torchaudio
18
+ from librosa.filters import mel as librosa_mel_fn
19
+ from torch import nn
20
+ from x_transformers.x_transformers import apply_rotary_pos_emb
21
+
22
+
23
+ # raw wav to mel spec
24
+
25
+
26
# Per-configuration caches, keyed by the parameter/device string built in
# get_bigvgan_mel_spectrogram, so the filterbank and window are built once.
mel_basis_cache = {}
hann_window_cache = {}


def get_bigvgan_mel_spectrogram(
    waveform,
    n_fft=1024,
    n_mel_channels=100,
    target_sample_rate=24000,
    hop_length=256,
    win_length=1024,
    fmin=0,
    fmax=None,
    center=False,
):  # Copy from https://github.com/NVIDIA/BigVGAN/tree/main
    """Compute a log-mel spectrogram the way BigVGAN does.

    The waveform is reflect-padded by ``(n_fft - hop_length) // 2`` on both
    sides, run through ``torch.stft``, reduced to magnitudes, projected with
    a librosa mel filterbank and finally log-compressed after clamping at 1e-5.

    Args:
        waveform: audio tensor; assumed to be ``(b, nw)`` given the
            unsqueeze/pad/squeeze sequence below -- TODO confirm with callers.
        n_fft, hop_length, win_length, center: STFT settings.
        n_mel_channels, target_sample_rate, fmin, fmax: mel filterbank settings.

    Returns:
        The log-mel tensor ``log(clamp(mel_basis @ |STFT|, min=1e-5))``.
    """
    device = waveform.device
    cache_key = "_".join(
        str(part)
        for part in (n_fft, n_mel_channels, target_sample_rate, hop_length, win_length, fmin, fmax, device)
    )

    # Build the mel filterbank and Hann window only on the first use of a configuration.
    if cache_key not in mel_basis_cache:
        filterbank = librosa_mel_fn(
            sr=target_sample_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=fmin, fmax=fmax
        )
        mel_basis_cache[cache_key] = torch.from_numpy(filterbank).float().to(device)  # TODO: why they need .float()?
        hann_window_cache[cache_key] = torch.hann_window(win_length).to(device)

    mel_basis, hann_window = mel_basis_cache[cache_key], hann_window_cache[cache_key]

    pad = (n_fft - hop_length) // 2
    waveform = F.pad(waveform.unsqueeze(1), (pad, pad), mode="reflect").squeeze(1)

    stft_out = torch.stft(
        waveform,
        n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=hann_window,
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    # Magnitude with a small epsilon inside the sqrt, as in the BigVGAN reference.
    magnitude = torch.sqrt(torch.view_as_real(stft_out).pow(2).sum(-1) + 1e-9)

    mel_spec = torch.matmul(mel_basis, magnitude)
    return torch.log(torch.clamp(mel_spec, min=1e-5))
73
+
74
+
75
def get_vocos_mel_spectrogram(
    waveform,
    n_fft=1024,
    n_mel_channels=100,
    target_sample_rate=24000,
    hop_length=256,
    win_length=1024,
):
    """Compute a log-mel spectrogram with torchaudio's ``MelSpectrogram``.

    A fresh transform (magnitude spectrogram, ``power=1``, centered, no
    normalization) is built on the waveform's device for every call.

    Args:
        waveform: ``(b, nw)`` or ``(b, 1, nw)`` tensor; the singleton channel
            dim of the 3-D form is squeezed away.
        n_fft, win_length, hop_length: STFT settings.
        n_mel_channels, target_sample_rate: mel filterbank settings.

    Returns:
        ``log(clamp(mel, min=1e-5))`` of the transform output.

    Raises:
        AssertionError: if the waveform is not 2-D after the optional squeeze.
    """
    transform_kwargs = dict(
        sample_rate=target_sample_rate,
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        n_mels=n_mel_channels,
        power=1,
        center=True,
        normalized=False,
        norm=None,
    )
    mel_stft = torchaudio.transforms.MelSpectrogram(**transform_kwargs).to(waveform.device)

    if waveform.dim() == 3:
        waveform = waveform.squeeze(1)  # 'b 1 nw -> b nw'

    assert waveform.dim() == 2

    return mel_stft(waveform).clamp(min=1e-5).log()
102
+
103
+
104
class MelSpec(nn.Module):
    """Module wrapper that converts raw waveforms into log-mel spectrograms.

    The computation is delegated to ``get_vocos_mel_spectrogram`` or
    ``get_bigvgan_mel_spectrogram`` depending on ``mel_spec_type``; the STFT
    and mel settings stored here are forwarded on every call.

    Args:
        n_fft, hop_length, win_length: STFT settings.
        n_mel_channels: number of mel bins.
        target_sample_rate: sample rate passed to the extractor.
        mel_spec_type: ``"vocos"`` or ``"bigvgan"``.

    Raises:
        AssertionError: if ``mel_spec_type`` is not one of the supported backends.
    """

    def __init__(
        self,
        n_fft=1024,
        hop_length=256,
        win_length=1024,
        n_mel_channels=100,
        target_sample_rate=24_000,
        mel_spec_type="vocos",
    ):
        super().__init__()
        # Raised explicitly (rather than via `assert cond, print(...)`) so the
        # exception carries the message and the check survives `python -O`.
        # AssertionError is kept so existing callers see the same exception type.
        if mel_spec_type not in ("vocos", "bigvgan"):
            raise AssertionError("We only support two extract mel backend: vocos or bigvgan")

        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.n_mel_channels = n_mel_channels
        self.target_sample_rate = target_sample_rate

        if mel_spec_type == "vocos":
            self.extractor = get_vocos_mel_spectrogram
        else:  # "bigvgan", guaranteed by the check above
            self.extractor = get_bigvgan_mel_spectrogram

        # Non-persistent buffer used only to know which device the module lives on.
        self.register_buffer("dummy", torch.tensor(0), persistent=False)

    def forward(self, wav):
        """Return the log-mel spectrogram of ``wav``, moving the module to its device first if needed."""
        if self.dummy.device != wav.device:
            self.to(wav.device)

        mel = self.extractor(
            waveform=wav,
            n_fft=self.n_fft,
            n_mel_channels=self.n_mel_channels,
            target_sample_rate=self.target_sample_rate,
            hop_length=self.hop_length,
            win_length=self.win_length,
        )

        return mel
144
+
145
+
146
+ # sinusoidal position embedding
147
+
148
+
149
class SinusPositionEmbedding(nn.Module):
    """Parameter-free sinusoidal embedding of scalar positions / timesteps."""

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x, scale=1000):
        """Embed ``x`` into ``2 * (dim // 2)`` features.

        ``x`` is multiplied by ``scale`` and by ``dim // 2`` geometrically
        spaced frequencies (from 1 down to 1/10000); the sines of the
        resulting angles are concatenated with their cosines on the last dim.
        """
        n_freqs = self.dim // 2
        log_step = math.log(10000) / (n_freqs - 1)
        freqs = torch.exp(torch.arange(n_freqs, device=x.device).float() * -log_step)
        angles = scale * x.unsqueeze(1) * freqs.unsqueeze(0)
        return torch.cat((angles.sin(), angles.cos()), dim=-1)
162
+
163
+
164
+ # convolutional position embedding
165
+
166
+
167
class ConvPositionEmbedding(nn.Module):
    """Positional signal from two stacked grouped 1-D convolutions with Mish activations.

    ``kernel_size`` must be odd so that ``padding=kernel_size // 2`` keeps the
    sequence length unchanged.
    """

    def __init__(self, dim, kernel_size=31, groups=16):
        super().__init__()
        assert kernel_size % 2 != 0
        same_padding = kernel_size // 2
        layers = [
            nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=same_padding),
            nn.Mish(),
            nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=same_padding),
            nn.Mish(),
        ]
        self.conv1d = nn.Sequential(*layers)

    def forward(self, x: float["b n d"], mask: bool["b n"] | None = None):  # noqa: F722
        """Apply the conv stack along the sequence axis.

        When ``mask`` is given, positions where it is False are zeroed both
        before the convolutions and in the returned tensor.
        """
        keep = None if mask is None else mask[..., None]

        if keep is not None:
            x = x.masked_fill(~keep, 0.0)

        # Conv1d wants channels first: b n d -> b d n, and back afterwards.
        out = self.conv1d(x.permute(0, 2, 1)).permute(0, 2, 1)

        if keep is not None:
            out = out.masked_fill(~keep, 0.0)

        return out
191
+
192
+
193
+ # rotary positional embedding related
194
+
195
+
196
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, theta_rescale_factor=1.0):
    """Precompute rotary-embedding cos/sin tables for positions ``0 .. end - 1``.

    Returns a tensor with ``end`` rows holding the ``dim // 2`` cosines
    followed by the ``dim // 2`` sines of ``position * inverse_frequency``.
    """
    # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
    # has some connection to NTK literature
    # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
    # https://github.com/lucidrains/rotary-embedding-torch/blob/main/rotary_embedding_torch/rotary_embedding_torch.py
    theta *= theta_rescale_factor ** (dim / (dim - 2))
    exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
    inv_freqs = 1.0 / (theta**exponents)
    positions = torch.arange(end, device=inv_freqs.device)  # type: ignore
    angles = torch.outer(positions, inv_freqs).float()  # type: ignore
    real_part = torch.cos(angles)
    imag_part = torch.sin(angles)
    return torch.cat([real_part, imag_part], dim=-1)
208
+
209
+
210
def get_pos_embed_indices(start, length, max_pos, scale=1.0):
    """Build per-row position indices ``start + floor(k * scale)`` for ``k`` in ``0 .. length - 1``.

    ``start`` is a 1-D tensor of per-row start offsets; ``scale`` may be a
    scalar or a per-row tensor. Indices that would reach ``max_pos`` or
    beyond are replaced by ``max_pos - 1``.
    """
    # length = length if isinstance(length, int) else length.max()
    scale = scale * torch.ones_like(start, dtype=torch.float32)  # in case scale is a scalar
    steps = torch.arange(length, device=start.device, dtype=torch.float32)
    offsets = (steps.unsqueeze(0) * scale.unsqueeze(1)).long()
    pos = start.unsqueeze(1) + offsets
    # avoid extra long error.
    return torch.where(pos < max_pos, pos, max_pos - 1)
220
+
221
+
222
+ # Global Response Normalization layer (Instance Normalization ?)
223
+
224
+
225
class GRN(nn.Module):
    """Global Response Normalization with learnable ``gamma`` / ``beta`` (both start at zero).

    With the zero initialisation the layer is an identity mapping.
    """

    def __init__(self, dim):
        super().__init__()
        self.gamma = nn.Parameter(torch.zeros(1, 1, dim))
        self.beta = nn.Parameter(torch.zeros(1, 1, dim))

    def forward(self, x):
        """Return ``gamma * (x * N(x)) + beta + x``.

        ``N(x)`` is the L2 norm over dim 1 divided by its mean over the last dim.
        """
        global_norm = torch.norm(x, p=2, dim=1, keepdim=True)
        relative_norm = global_norm / (global_norm.mean(dim=-1, keepdim=True) + 1e-6)
        return self.gamma * (x * relative_norm) + self.beta + x
235
+
236
+
237
+ # ConvNeXt-V2 Block https://github.com/facebookresearch/ConvNeXt-V2/blob/main/models/convnextv2.py
238
+ # ref: https://github.com/bfs18/e2_tts/blob/main/rfwave/modules.py#L108
239
+
240
+
241
class ConvNeXtV2Block(nn.Module):
    """Residual ConvNeXt-V2 style block for ``b n d`` sequences.

    Pipeline: depthwise conv (kernel 7) -> LayerNorm -> Linear up ->
    GELU -> GRN -> Linear down, added back onto the input.
    """

    def __init__(
        self,
        dim: int,
        intermediate_dim: int,
        dilation: int = 1,
    ):
        super().__init__()
        kernel_size = 7
        # "same" padding for a dilated kernel so the sequence length is preserved
        same_padding = (dilation * (kernel_size - 1)) // 2
        self.dwconv = nn.Conv1d(
            dim, dim, kernel_size=kernel_size, padding=same_padding, groups=dim, dilation=dilation
        )  # depthwise conv
        self.norm = nn.LayerNorm(dim, eps=1e-6)
        self.pwconv1 = nn.Linear(dim, intermediate_dim)  # pointwise/1x1 convs, implemented with linear layers
        self.act = nn.GELU()
        self.grn = GRN(intermediate_dim)
        self.pwconv2 = nn.Linear(intermediate_dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the block to ``x`` and return ``x + f(x)``."""
        # Conv1d is channels-first: b n d -> b d n for the conv, then back to b n d.
        hidden = self.dwconv(x.transpose(1, 2)).transpose(1, 2)
        hidden = self.pwconv1(self.norm(hidden))
        hidden = self.grn(self.act(hidden))
        hidden = self.pwconv2(hidden)
        return x + hidden
270
+
271
+
272
+ # AdaLayerNormZero
273
+ # return with modulated x for attn input, and params for later mlp modulation
274
+
275
+
276
class AdaLayerNormZero(nn.Module):
    """Adaptive LayerNorm producing attention-input modulation plus later MLP modulation params."""

    def __init__(self, dim):
        super().__init__()

        self.silu = nn.SiLU()
        self.linear = nn.Linear(dim, dim * 6)

        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)

    def forward(self, x, emb=None):
        """Modulate ``x`` with parameters derived from the conditioning ``emb``.

        ``emb`` is projected to six chunks along dim 1. The first two shift
        and scale the normalized ``x``; the remaining four are returned
        untouched for the caller.

        Returns:
            ``(modulated_x, gate_msa, shift_mlp, scale_mlp, gate_mlp)``.
        """
        modulation = self.linear(self.silu(emb))
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = modulation.chunk(6, dim=1)

        # broadcast the per-sample (b, d) modulation over the sequence axis
        modulated = self.norm(x) * (1 + scale_msa.unsqueeze(1)) + shift_msa.unsqueeze(1)
        return modulated, gate_msa, shift_mlp, scale_mlp, gate_mlp
291
+
292
+
293
+ # AdaLayerNormZero for final layer
294
+ # return only with modulated x for attn input, cuz no more mlp modulation
295
+
296
+
297
class AdaLayerNormZero_Final(nn.Module):
    """Adaptive LayerNorm variant that returns only the modulated input."""

    def __init__(self, dim):
        super().__init__()

        self.silu = nn.SiLU()
        self.linear = nn.Linear(dim, dim * 2)

        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)

    def forward(self, x, emb):
        """Return ``norm(x) * (1 + scale) + shift``.

        ``scale`` and ``shift`` are the two chunks (in that order) of the
        projected conditioning ``emb``.
        """
        scale, shift = self.linear(self.silu(emb)).chunk(2, dim=1)

        # (b, d) -> (b, 1, d) so the modulation broadcasts over the sequence axis
        return self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
312
+
313
+
314
+ # FeedForward
315
+
316
+
317
class FeedForward(nn.Module):
    """Two-layer MLP: Linear -> GELU -> Dropout -> Linear.

    Args:
        dim: input feature size.
        dim_out: output feature size; defaults to ``dim``.
        mult: hidden size multiplier (hidden size is ``int(dim * mult)``).
        dropout: dropout probability applied after the activation.
        approximate: forwarded to ``nn.GELU``.
    """

    def __init__(self, dim, dim_out=None, mult=4, dropout=0.0, approximate: str = "none"):
        super().__init__()
        hidden_dim = int(dim * mult)
        if dim_out is None:
            dim_out = dim

        gelu = nn.GELU(approximate=approximate)
        self.ff = nn.Sequential(
            nn.Sequential(nn.Linear(dim, hidden_dim), gelu),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim_out),
        )

    def forward(self, x):
        """Run ``x`` through the MLP."""
        return self.ff(x)
329
+
330
+
331
+ # Attention with possible joint part
332
+ # modified from diffusers/src/diffusers/models/attention_processor.py
333
+
334
+
335
class Attention(nn.Module):
    """Holds the attention projections and delegates the computation to a processor.

    Args:
        processor: callable invoked as ``processor(attn, x, ...)`` that performs
            the actual attention using this module's projection layers.
        dim: feature size of the input ``x``.
        heads: number of attention heads.
        dim_head: size of each head (inner size is ``dim_head * heads``).
        dropout: dropout probability of the output projection.
        context_dim: if not None, key/value projections for a context
            sequence are created as well (joint attention).
        context_pre_only: with ``context_dim`` set, a non-None value also
            creates the context query projection, and a falsy non-None value
            additionally creates the context output projection.

    Raises:
        ImportError: if ``torch.nn.functional.scaled_dot_product_attention`` is unavailable.
    """

    def __init__(
        self,
        processor: JointAttnProcessor | AttnProcessor,
        dim: int,
        heads: int = 8,
        dim_head: int = 64,
        dropout: float = 0.0,
        context_dim: Optional[int] = None,  # if not None -> joint attention
        context_pre_only=None,
    ):
        super().__init__()

        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("Attention requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

        self.processor = processor

        self.dim = dim
        self.heads = heads
        self.inner_dim = dim_head * heads
        self.dropout = dropout

        self.context_dim = context_dim
        self.context_pre_only = context_pre_only

        self.to_q = nn.Linear(dim, self.inner_dim)
        self.to_k = nn.Linear(dim, self.inner_dim)
        self.to_v = nn.Linear(dim, self.inner_dim)

        # context (e.g. text) projections only exist for joint attention
        if self.context_dim is not None:
            self.to_k_c = nn.Linear(context_dim, self.inner_dim)
            self.to_v_c = nn.Linear(context_dim, self.inner_dim)
            if self.context_pre_only is not None:
                self.to_q_c = nn.Linear(context_dim, self.inner_dim)

        # index 0: output projection, index 1: dropout (processors address them by index)
        self.to_out = nn.ModuleList([])
        self.to_out.append(nn.Linear(self.inner_dim, dim))
        self.to_out.append(nn.Dropout(dropout))

        if self.context_pre_only is not None and not self.context_pre_only:
            self.to_out_c = nn.Linear(self.inner_dim, dim)

    def forward(
        self,
        x: float["b n d"],  # noised input x  # noqa: F722
        c: float["b n d"] = None,  # context c  # noqa: F722
        mask: bool["b n"] | None = None,  # noqa: F722
        rope=None,  # rotary position embedding for x
        c_rope=None,  # rotary position embedding for c
    ) -> torch.Tensor:
        """Run the processor; ``c`` and ``c_rope`` are forwarded only when a context ``c`` is given."""
        if c is not None:
            return self.processor(self, x, c=c, mask=mask, rope=rope, c_rope=c_rope)
        else:
            return self.processor(self, x, mask=mask, rope=rope)
390
+
391
+
392
+ # Attention processor
393
+
394
+
395
class AttnProcessor:
    """Self-attention processor: projects ``x`` with the layers of ``attn`` and runs SDPA."""

    def __init__(self):
        pass

    def __call__(
        self,
        attn: Attention,
        x: float["b n d"],  # noised input x  # noqa: F722
        mask: bool["b n"] | None = None,  # noqa: F722
        rope=None,  # rotary position embedding
    ) -> torch.FloatTensor:
        """Compute multi-head self-attention over ``x``.

        Args:
            attn: module providing ``to_q`` / ``to_k`` / ``to_v``, ``heads`` and ``to_out``.
            x: input sequence.
            mask: optional boolean mask; False positions are excluded as keys
                and zeroed in the output.
            rope: optional ``(freqs, xpos_scale)`` pair applied to queries and keys.

        Returns:
            The attended sequence after output projection and dropout.
        """
        batch_size = x.shape[0]

        # `sample` projections.
        query, key, value = attn.to_q(x), attn.to_k(x), attn.to_v(x)

        # apply rotary position embedding
        if rope is not None:
            freqs, xpos_scale = rope
            if xpos_scale is None:
                q_xpos_scale, k_xpos_scale = 1.0, 1.0
            else:
                q_xpos_scale, k_xpos_scale = xpos_scale, xpos_scale**-1.0

            query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
            key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)

        # attention: (b, n, h * d_h) -> (b, h, n, d_h)
        heads = attn.heads
        head_dim = key.shape[-1] // heads

        def split_heads(t):
            return t.view(batch_size, -1, heads, head_dim).transpose(1, 2)

        query, key, value = split_heads(query), split_heads(key), split_heads(value)

        # mask. e.g. inference got a batch with different target durations, mask out the padding
        attn_mask = None
        if mask is not None:
            attn_mask = mask.unsqueeze(1).unsqueeze(1)  # 'b n -> b 1 1 n'
            attn_mask = attn_mask.expand(batch_size, heads, query.shape[-2], key.shape[-2])

        out = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
        out = out.transpose(1, 2).reshape(batch_size, -1, heads * head_dim)
        out = out.to(query.dtype)

        # linear proj, then dropout
        out = attn.to_out[1](attn.to_out[0](out))

        if mask is not None:
            out = out.masked_fill(~mask.unsqueeze(-1), 0.0)

        return out
450
+
451
+
452
+ # Joint Attention processor for MM-DiT
453
+ # modified from diffusers/src/diffusers/models/attention_processor.py
454
+
455
+
456
class JointAttnProcessor:
    """Joint attention processor: ``x`` and the context ``c`` attend over their concatenation."""

    def __init__(self):
        pass

    def __call__(
        self,
        attn: Attention,
        x: float["b n d"],  # noised input x  # noqa: F722
        c: float["b nt d"] = None,  # context c, here text  # noqa: F722
        mask: bool["b n"] | None = None,  # noqa: F722
        rope=None,  # rotary position embedding for x
        c_rope=None,  # rotary position embedding for c
    ) -> torch.FloatTensor:
        """Compute joint multi-head attention over ``x`` and ``c``.

        Args:
            attn: module providing the ``x`` projections (``to_q`` ...), the
                context projections (``to_q_c`` ...), ``heads``, ``to_out``,
                ``context_pre_only`` and, unless that flag is truthy, ``to_out_c``.
            x: noised input sequence.
            c: context sequence (text).
            mask: optional boolean mask for ``x``; the context is never masked.
            rope / c_rope: optional ``(freqs, xpos_scale)`` pairs for ``x`` / ``c``.

        Returns:
            ``(x_out, c_out)``: the two parts of the attended sequence.
        """
        x_len = x.shape[1]
        batch_size = c.shape[0]

        # `sample` projections.
        query, key, value = attn.to_q(x), attn.to_k(x), attn.to_v(x)

        # `context` projections.
        c_query, c_key, c_value = attn.to_q_c(c), attn.to_k_c(c), attn.to_v_c(c)

        # apply rope for context and noised input independently
        if rope is not None:
            freqs, xpos_scale = rope
            if xpos_scale is None:
                q_xpos_scale, k_xpos_scale = 1.0, 1.0
            else:
                q_xpos_scale, k_xpos_scale = xpos_scale, xpos_scale**-1.0
            query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
            key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
        if c_rope is not None:
            freqs, xpos_scale = c_rope
            if xpos_scale is None:
                q_xpos_scale, k_xpos_scale = 1.0, 1.0
            else:
                q_xpos_scale, k_xpos_scale = xpos_scale, xpos_scale**-1.0
            c_query = apply_rotary_pos_emb(c_query, freqs, q_xpos_scale)
            c_key = apply_rotary_pos_emb(c_key, freqs, k_xpos_scale)

        # attention over the concatenated [x ; c] sequence
        query = torch.cat([query, c_query], dim=1)
        key = torch.cat([key, c_key], dim=1)
        value = torch.cat([value, c_value], dim=1)

        heads = attn.heads
        head_dim = key.shape[-1] // heads

        def split_heads(t):
            return t.view(batch_size, -1, heads, head_dim).transpose(1, 2)

        query, key, value = split_heads(query), split_heads(key), split_heads(value)

        # mask. e.g. inference got a batch with different target durations, mask out the padding
        attn_mask = None
        if mask is not None:
            attn_mask = F.pad(mask, (0, c.shape[1]), value=True)  # no mask for c (text)
            attn_mask = attn_mask.unsqueeze(1).unsqueeze(1)  # 'b n -> b 1 1 n'
            attn_mask = attn_mask.expand(batch_size, heads, query.shape[-2], key.shape[-2])

        joint = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
        joint = joint.transpose(1, 2).reshape(batch_size, -1, heads * head_dim)
        joint = joint.to(query.dtype)

        # Split the attention outputs.
        x, c = joint[:, :x_len], joint[:, x_len:]

        # linear proj, then dropout
        x = attn.to_out[1](attn.to_out[0](x))
        if not attn.context_pre_only:
            c = attn.to_out_c(c)

        if mask is not None:
            x = x.masked_fill(~mask.unsqueeze(-1), 0.0)
            # c = c.masked_fill(~mask, 0.)  # no mask for c (text)

        return x, c
537
+
538
+
539
+ # DiT Block
540
+
541
+
542
class DiTBlock(nn.Module):
    """Transformer block with adaptive-LayerNorm conditioning on a time embedding.

    Both the attention and the feed-forward branch are residual, and each is
    gated by parameters produced from the time embedding.
    """

    def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1):
        super().__init__()

        self.attn_norm = AdaLayerNormZero(dim)
        self.attn = Attention(
            processor=AttnProcessor(),
            dim=dim,
            heads=heads,
            dim_head=dim_head,
            dropout=dropout,
        )

        self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")

    def forward(self, x, t, mask=None, rope=None):  # x: noised input, t: time embedding
        """Apply gated attention then gated feed-forward to ``x``, conditioned on ``t``."""
        # pre-norm & modulation for attention input
        attn_in, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t)

        # attention, gated and added back onto the residual stream
        x = x + gate_msa.unsqueeze(1) * self.attn(x=attn_in, mask=mask, rope=rope)

        # feed-forward on the modulated, normalized stream, also gated
        ff_in = self.ff_norm(x) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
        x = x + gate_mlp.unsqueeze(1) * self.ff(ff_in)

        return x
573
+
574
+
575
+ # MMDiT Block https://arxiv.org/abs/2403.03206
576
+
577
+
578
class MMDiTBlock(nn.Module):
    r"""
    modified from diffusers/src/diffusers/models/attention.py

    notes.
    _c: context related. text, cond, etc. (left part in sd3 fig2.b)
    _x: noised input related. (right part)
    context_pre_only: last layer only do prenorm + modulation cuz no more ffn
    """

    def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1, context_pre_only=False):
        super().__init__()

        self.context_pre_only = context_pre_only

        # the last layer needs no MLP modulation params for the context stream
        if context_pre_only:
            self.attn_norm_c = AdaLayerNormZero_Final(dim)
        else:
            self.attn_norm_c = AdaLayerNormZero(dim)
        self.attn_norm_x = AdaLayerNormZero(dim)
        self.attn = Attention(
            processor=JointAttnProcessor(),
            dim=dim,
            heads=heads,
            dim_head=dim_head,
            dropout=dropout,
            context_dim=dim,
            context_pre_only=context_pre_only,
        )

        if context_pre_only:
            self.ff_norm_c = None
            self.ff_c = None
        else:
            self.ff_norm_c = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
            self.ff_c = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
        self.ff_norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff_x = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")

    def forward(self, x, c, t, mask=None, rope=None, c_rope=None):  # x: noised input, c: context, t: time embedding
        """Run one joint-attention block.

        Returns ``(c, x)`` -- context first; ``c`` is None when ``context_pre_only`` is set.
        """
        is_last = self.context_pre_only

        # pre-norm & modulation for attention input
        if is_last:
            norm_c = self.attn_norm_c(c, t)
        else:
            norm_c, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.attn_norm_c(c, emb=t)
        norm_x, x_gate_msa, x_shift_mlp, x_scale_mlp, x_gate_mlp = self.attn_norm_x(x, emb=t)

        # attention
        x_attn_output, c_attn_output = self.attn(x=norm_x, c=norm_c, mask=mask, rope=rope, c_rope=c_rope)

        # process attention output for context c
        if is_last:
            c = None
        else:  # if not last layer
            c = c + c_gate_msa.unsqueeze(1) * c_attn_output

            c_ff_in = self.ff_norm_c(c) * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
            c = c + c_gate_mlp.unsqueeze(1) * self.ff_c(c_ff_in)

        # process attention output for input x
        x = x + x_gate_msa.unsqueeze(1) * x_attn_output

        x_ff_in = self.ff_norm_x(x) * (1 + x_scale_mlp[:, None]) + x_shift_mlp[:, None]
        x = x + x_gate_mlp.unsqueeze(1) * self.ff_x(x_ff_in)

        return c, x
643
+
644
+
645
+ # time step conditioning embedding
646
+
647
+
648
class TimestepEmbedding(nn.Module):
    """Sinusoidal timestep features followed by a two-layer SiLU MLP."""

    def __init__(self, dim, freq_embed_dim=256):
        super().__init__()
        self.time_embed = SinusPositionEmbedding(freq_embed_dim)
        self.time_mlp = nn.Sequential(nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim))

    def forward(self, timestep: float["b"]):  # noqa: F821
        """Embed a batch of timesteps into ``dim`` features."""
        # cast the sinusoidal features back to the dtype of the incoming timesteps
        freq_features = self.time_embed(timestep).to(timestep.dtype)
        return self.time_mlp(freq_features)  # b d
f5_tts/model/trainer.py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import gc
4
+ import os
5
+
6
+ import torch
7
+ import torchaudio
8
+ import wandb
9
+ from accelerate import Accelerator
10
+ from accelerate.utils import DistributedDataParallelKwargs
11
+ from ema_pytorch import EMA
12
+ from torch.optim import AdamW
13
+ from torch.optim.lr_scheduler import LinearLR, SequentialLR
14
+ from torch.utils.data import DataLoader, Dataset, SequentialSampler
15
+ from tqdm import tqdm
16
+
17
+ from f5_tts.model import CFM
18
+ from f5_tts.model.dataset import DynamicBatchSampler, collate_fn
19
+ from f5_tts.model.utils import default, exists
20
+
21
+ # trainer
22
+
23
+
24
+ class Trainer:
25
+ def __init__(
26
+ self,
27
+ model: CFM,
28
+ epochs,
29
+ learning_rate,
30
+ num_warmup_updates=20000,
31
+ save_per_updates=1000,
32
+ checkpoint_path=None,
33
+ batch_size=32,
34
+ batch_size_type: str = "sample",
35
+ max_samples=32,
36
+ grad_accumulation_steps=1,
37
+ max_grad_norm=1.0,
38
+ noise_scheduler: str | None = None,
39
+ duration_predictor: torch.nn.Module | None = None,
40
+ logger: str | None = "wandb", # "wandb" | "tensorboard" | None
41
+ wandb_project="test_e2-tts",
42
+ wandb_run_name="test_run",
43
+ wandb_resume_id: str = None,
44
+ log_samples: bool = False,
45
+ last_per_steps=None,
46
+ accelerate_kwargs: dict = dict(),
47
+ ema_kwargs: dict = dict(),
48
+ bnb_optimizer: bool = False,
49
+ mel_spec_type: str = "vocos", # "vocos" | "bigvgan"
50
+ is_local_vocoder: bool = False, # use local path vocoder
51
+ local_vocoder_path: str = "", # local vocoder path
52
+ ):
53
+ ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
54
+
55
+ if logger == "wandb" and not wandb.api.api_key:
56
+ logger = None
57
+ print(f"Using logger: {logger}")
58
+ self.log_samples = log_samples
59
+
60
+ self.accelerator = Accelerator(
61
+ log_with=logger if logger == "wandb" else None,
62
+ kwargs_handlers=[ddp_kwargs],
63
+ gradient_accumulation_steps=grad_accumulation_steps,
64
+ **accelerate_kwargs,
65
+ )
66
+
67
+ self.logger = logger
68
+ if self.logger == "wandb":
69
+ if exists(wandb_resume_id):
70
+ init_kwargs = {"wandb": {"resume": "allow", "name": wandb_run_name, "id": wandb_resume_id}}
71
+ else:
72
+ init_kwargs = {"wandb": {"resume": "allow", "name": wandb_run_name}}
73
+
74
+ self.accelerator.init_trackers(
75
+ project_name=wandb_project,
76
+ init_kwargs=init_kwargs,
77
+ config={
78
+ "epochs": epochs,
79
+ "learning_rate": learning_rate,
80
+ "num_warmup_updates": num_warmup_updates,
81
+ "batch_size": batch_size,
82
+ "batch_size_type": batch_size_type,
83
+ "max_samples": max_samples,
84
+ "grad_accumulation_steps": grad_accumulation_steps,
85
+ "max_grad_norm": max_grad_norm,
86
+ "gpus": self.accelerator.num_processes,
87
+ "noise_scheduler": noise_scheduler,
88
+ },
89
+ )
90
+
91
+ elif self.logger == "tensorboard":
92
+ from torch.utils.tensorboard import SummaryWriter
93
+
94
+ self.writer = SummaryWriter(log_dir=f"runs/{wandb_run_name}")
95
+
96
+ self.model = model
97
+
98
+ if self.is_main:
99
+ self.ema_model = EMA(model, include_online_model=False, **ema_kwargs)
100
+ self.ema_model.to(self.accelerator.device)
101
+
102
+ self.epochs = epochs
103
+ self.num_warmup_updates = num_warmup_updates
104
+ self.save_per_updates = save_per_updates
105
+ self.last_per_steps = default(last_per_steps, save_per_updates * grad_accumulation_steps)
106
+ self.checkpoint_path = default(checkpoint_path, "ckpts/test_e2-tts")
107
+
108
+ self.batch_size = batch_size
109
+ self.batch_size_type = batch_size_type
110
+ self.max_samples = max_samples
111
+ self.grad_accumulation_steps = grad_accumulation_steps
112
+ self.max_grad_norm = max_grad_norm
113
+
114
+ # mel vocoder config
115
+ self.vocoder_name = mel_spec_type
116
+ self.is_local_vocoder = is_local_vocoder
117
+ self.local_vocoder_path = local_vocoder_path
118
+
119
+ self.noise_scheduler = noise_scheduler
120
+
121
+ self.duration_predictor = duration_predictor
122
+
123
+ if bnb_optimizer:
124
+ import bitsandbytes as bnb
125
+
126
+ self.optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=learning_rate)
127
+ else:
128
+ self.optimizer = AdamW(model.parameters(), lr=learning_rate)
129
+ self.model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
130
+
131
+ @property
132
+ def is_main(self):
133
+ return self.accelerator.is_main_process
134
+
135
+ def save_checkpoint(self, step, last=False):
136
+ self.accelerator.wait_for_everyone()
137
+ if self.is_main:
138
+ checkpoint = dict(
139
+ model_state_dict=self.accelerator.unwrap_model(self.model).state_dict(),
140
+ optimizer_state_dict=self.accelerator.unwrap_model(self.optimizer).state_dict(),
141
+ ema_model_state_dict=self.ema_model.state_dict(),
142
+ scheduler_state_dict=self.scheduler.state_dict(),
143
+ step=step,
144
+ )
145
+ if not os.path.exists(self.checkpoint_path):
146
+ os.makedirs(self.checkpoint_path)
147
+ if last:
148
+ self.accelerator.save(checkpoint, f"{self.checkpoint_path}/model_last.pt")
149
+ print(f"Saved last checkpoint at step {step}")
150
+ else:
151
+ self.accelerator.save(checkpoint, f"{self.checkpoint_path}/model_{step}.pt")
152
+
153
+ def load_checkpoint(self):
154
+ if (
155
+ not exists(self.checkpoint_path)
156
+ or not os.path.exists(self.checkpoint_path)
157
+ or not any(filename.endswith(".pt") for filename in os.listdir(self.checkpoint_path))
158
+ ):
159
+ return 0
160
+
161
+ self.accelerator.wait_for_everyone()
162
+ if "model_last.pt" in os.listdir(self.checkpoint_path):
163
+ latest_checkpoint = "model_last.pt"
164
+ else:
165
+ latest_checkpoint = sorted(
166
+ [f for f in os.listdir(self.checkpoint_path) if f.endswith(".pt")],
167
+ key=lambda x: int("".join(filter(str.isdigit, x))),
168
+ )[-1]
169
+ # checkpoint = torch.load(f"{self.checkpoint_path}/{latest_checkpoint}", map_location=self.accelerator.device) # rather use accelerator.load_state ಥ_ಥ
170
+ checkpoint = torch.load(f"{self.checkpoint_path}/{latest_checkpoint}", weights_only=True, map_location="cpu")
171
+
172
+ # patch for backward compatibility, 305e3ea
173
+ for key in ["ema_model.mel_spec.mel_stft.mel_scale.fb", "ema_model.mel_spec.mel_stft.spectrogram.window"]:
174
+ if key in checkpoint["ema_model_state_dict"]:
175
+ del checkpoint["ema_model_state_dict"][key]
176
+
177
+ if self.is_main:
178
+ self.ema_model.load_state_dict(checkpoint["ema_model_state_dict"])
179
+
180
+ if "step" in checkpoint:
181
+ # patch for backward compatibility, 305e3ea
182
+ for key in ["mel_spec.mel_stft.mel_scale.fb", "mel_spec.mel_stft.spectrogram.window"]:
183
+ if key in checkpoint["model_state_dict"]:
184
+ del checkpoint["model_state_dict"][key]
185
+
186
+ self.accelerator.unwrap_model(self.model).load_state_dict(checkpoint["model_state_dict"])
187
+ self.accelerator.unwrap_model(self.optimizer).load_state_dict(checkpoint["optimizer_state_dict"])
188
+ if self.scheduler:
189
+ self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
190
+ step = checkpoint["step"]
191
+ else:
192
+ checkpoint["model_state_dict"] = {
193
+ k.replace("ema_model.", ""): v
194
+ for k, v in checkpoint["ema_model_state_dict"].items()
195
+ if k not in ["initted", "step"]
196
+ }
197
+ self.accelerator.unwrap_model(self.model).load_state_dict(checkpoint["model_state_dict"])
198
+ step = 0
199
+
200
+ del checkpoint
201
+ gc.collect()
202
+ return step
203
+
204
def train(self, train_dataset: Dataset, num_workers=16, resumable_with_seed: int = None):
    """Run the training loop over ``train_dataset`` for ``self.epochs`` epochs.

    Builds the dataloader and the warmup/decay LR schedule, restores state through
    ``self.load_checkpoint()``, then iterates batches: forward/backward, optional
    gradient clipping, optimizer and scheduler step, EMA update on the main process,
    logging, and periodic checkpointing (optionally dumping generated audio samples).

    Args:
        train_dataset: dataset handed to ``DataLoader``; each batch must provide the
            ``"text"``, ``"mel"`` and ``"mel_lengths"`` entries read in the loop.
        num_workers: number of ``DataLoader`` workers.
        resumable_with_seed: when not ``None``, seeds the shuffling generator
            ("sample" mode) or the ``DynamicBatchSampler`` ("frame" mode) and enables
            skipping the already-consumed batches of the resumed epoch.

    Raises:
        ValueError: if ``self.batch_size_type`` is neither ``"sample"`` nor ``"frame"``.
    """
    if self.log_samples:
        # inference helpers are only imported when sample logging is requested
        from f5_tts.infer.utils_infer import cfg_strength, load_vocoder, nfe_step, sway_sampling_coef

        vocoder = load_vocoder(
            vocoder_name=self.vocoder_name, is_local=self.is_local_vocoder, local_path=self.local_vocoder_path
        )
        target_sample_rate = self.accelerator.unwrap_model(self.model).mel_spec.target_sample_rate
        log_samples_path = f"{self.checkpoint_path}/samples"
        os.makedirs(log_samples_path, exist_ok=True)

    # a seeded generator makes the shuffle order reproducible across resumed runs
    if exists(resumable_with_seed):
        generator = torch.Generator()
        generator.manual_seed(resumable_with_seed)
    else:
        generator = None

    if self.batch_size_type == "sample":
        # fixed number of samples per batch, shuffled by the DataLoader itself
        train_dataloader = DataLoader(
            train_dataset,
            collate_fn=collate_fn,
            num_workers=num_workers,
            pin_memory=True,
            persistent_workers=True,
            batch_size=self.batch_size,
            shuffle=True,
            generator=generator,
        )
    elif self.batch_size_type == "frame":
        # batches are composed by DynamicBatchSampler from self.batch_size / self.max_samples
        self.accelerator.even_batches = False
        sampler = SequentialSampler(train_dataset)
        batch_sampler = DynamicBatchSampler(
            sampler, self.batch_size, max_samples=self.max_samples, random_seed=resumable_with_seed, drop_last=False
        )
        train_dataloader = DataLoader(
            train_dataset,
            collate_fn=collate_fn,
            num_workers=num_workers,
            pin_memory=True,
            persistent_workers=True,
            batch_sampler=batch_sampler,
        )
    else:
        raise ValueError(f"batch_size_type must be either 'sample' or 'frame', but received {self.batch_size_type}")

    #  accelerator.prepare() dispatches batches to devices;
    #  which means the length of dataloader calculated before, should consider the number of devices
    warmup_steps = (
        self.num_warmup_updates * self.accelerator.num_processes
    )  # consider a fixed warmup steps while using accelerate multi-gpu ddp
    # otherwise by default with split_batches=False, warmup steps change with num_processes
    # NOTE: true division, so total_steps (and decay_steps) may be a float
    total_steps = len(train_dataloader) * self.epochs / self.grad_accumulation_steps
    decay_steps = total_steps - warmup_steps
    # linear ramp-up followed by linear decay, chained at the warmup boundary
    warmup_scheduler = LinearLR(self.optimizer, start_factor=1e-8, end_factor=1.0, total_iters=warmup_steps)
    decay_scheduler = LinearLR(self.optimizer, start_factor=1.0, end_factor=1e-8, total_iters=decay_steps)
    self.scheduler = SequentialLR(
        self.optimizer, schedulers=[warmup_scheduler, decay_scheduler], milestones=[warmup_steps]
    )
    train_dataloader, self.scheduler = self.accelerator.prepare(
        train_dataloader, self.scheduler
    )  # actual steps = 1 gpu steps / gpus
    # the scheduler must exist before this call: load_checkpoint restores its state when present
    start_step = self.load_checkpoint()
    global_step = start_step

    if exists(resumable_with_seed):
        # translate the restored step count into (whole epochs done, batches done in the current epoch)
        orig_epoch_step = len(train_dataloader)
        skipped_epoch = int(start_step // orig_epoch_step)
        skipped_batch = start_step % orig_epoch_step
        skipped_dataloader = self.accelerator.skip_first_batches(train_dataloader, num_batches=skipped_batch)
    else:
        skipped_epoch = 0

    for epoch in range(skipped_epoch, self.epochs):
        self.model.train()
        # only the first epoch after a resume uses the partially-skipped dataloader
        if exists(resumable_with_seed) and epoch == skipped_epoch:
            progress_bar = tqdm(
                skipped_dataloader,
                desc=f"Epoch {epoch+1}/{self.epochs}",
                unit="step",
                disable=not self.accelerator.is_local_main_process,
                initial=skipped_batch,
                total=orig_epoch_step,
            )
        else:
            progress_bar = tqdm(
                train_dataloader,
                desc=f"Epoch {epoch+1}/{self.epochs}",
                unit="step",
                disable=not self.accelerator.is_local_main_process,
            )

        for batch in progress_bar:
            with self.accelerator.accumulate(self.model):
                text_inputs = batch["text"]
                # swap the last two axes of the collated mel before feeding the model
                mel_spec = batch["mel"].permute(0, 2, 1)
                mel_lengths = batch["mel_lengths"]

                # TODO. add duration predictor training
                if self.duration_predictor is not None and self.accelerator.is_local_main_process:
                    # the duration loss is only logged here; it is not back-propagated
                    dur_loss = self.duration_predictor(mel_spec, lens=batch.get("durations"))
                    self.accelerator.log({"duration loss": dur_loss.item()}, step=global_step)

                loss, cond, pred = self.model(
                    mel_spec, text=text_inputs, lens=mel_lengths, noise_scheduler=self.noise_scheduler
                )
                self.accelerator.backward(loss)

                # clip only on iterations where gradients are actually synchronised
                if self.max_grad_norm > 0 and self.accelerator.sync_gradients:
                    self.accelerator.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

                self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()

            if self.is_main and self.accelerator.sync_gradients:
                self.ema_model.update()

            # global_step counts processed batches, not optimizer updates
            global_step += 1

            if self.accelerator.is_local_main_process:
                self.accelerator.log({"loss": loss.item(), "lr": self.scheduler.get_last_lr()[0]}, step=global_step)
                if self.logger == "tensorboard":
                    self.writer.add_scalar("loss", loss.item(), global_step)
                    self.writer.add_scalar("lr", self.scheduler.get_last_lr()[0], global_step)

            progress_bar.set_postfix(step=str(global_step), loss=loss.item())

            # save_per_updates is scaled by grad_accumulation_steps because global_step counts batches
            if global_step % (self.save_per_updates * self.grad_accumulation_steps) == 0:
                self.save_checkpoint(global_step)

                if self.log_samples and self.accelerator.is_local_main_process:
                    # use the first item of the batch as reference and ask the model to
                    # continue it with the same text repeated once
                    ref_audio_len = mel_lengths[0]
                    infer_text = [
                        text_inputs[0] + ([" "] if isinstance(text_inputs[0], list) else " ") + text_inputs[0]
                    ]
                    with torch.inference_mode():
                        generated, _ = self.accelerator.unwrap_model(self.model).sample(
                            cond=mel_spec[0][:ref_audio_len].unsqueeze(0),
                            text=infer_text,
                            duration=ref_audio_len * 2,
                            steps=nfe_step,
                            cfg_strength=cfg_strength,
                            sway_sampling_coef=sway_sampling_coef,
                        )
                        generated = generated.to(torch.float32)
                        # keep only the frames after the reference prompt
                        gen_mel_spec = generated[:, ref_audio_len:, :].permute(0, 2, 1).to(self.accelerator.device)
                        ref_mel_spec = batch["mel"][0].unsqueeze(0)
                        # NOTE(review): gen_audio/ref_audio stay unbound for any other vocoder_name,
                        # so the torchaudio.save calls below would fail -- confirm only these two are allowed
                        if self.vocoder_name == "vocos":
                            gen_audio = vocoder.decode(gen_mel_spec).cpu()
                            ref_audio = vocoder.decode(ref_mel_spec).cpu()
                        elif self.vocoder_name == "bigvgan":
                            gen_audio = vocoder(gen_mel_spec).squeeze(0).cpu()
                            ref_audio = vocoder(ref_mel_spec).squeeze(0).cpu()

                    torchaudio.save(f"{log_samples_path}/step_{global_step}_gen.wav", gen_audio, target_sample_rate)
                    torchaudio.save(f"{log_samples_path}/step_{global_step}_ref.wav", ref_audio, target_sample_rate)

            # separate, rolling "last" checkpoint on its own cadence
            if global_step % self.last_per_steps == 0:
                self.save_checkpoint(global_step, last=True)

    # always leave a final "last" checkpoint once all epochs are done
    self.save_checkpoint(global_step, last=True)

    self.accelerator.end_training()
f5_tts/model/utils.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import random
5
+ from collections import defaultdict
6
+ from importlib.resources import files
7
+
8
+ import torch
9
+ from torch.nn.utils.rnn import pad_sequence
10
+
11
+ import jieba
12
+ from pypinyin import lazy_pinyin, Style
13
+
14
+
15
+ # seed everything
16
+
17
+
18
def seed_everything(seed=0):
    """Apply one seed to Python's ``random``, ``PYTHONHASHSEED`` and torch (CPU and CUDA).

    Also sets the cuDNN ``deterministic`` flag and clears the ``benchmark`` flag.

    Args:
        seed: value passed to every seeding call (default 0).
    """
    # cuDNN flags
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Python-level randomness; the environment variable is stored as a string
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)

    # torch generators: CPU, current CUDA device, all CUDA devices
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)
26
+
27
+
28
+ # helpers
29
+
30
+
31
def exists(v):
    """Return True for any value other than ``None`` (falsy values such as 0 or "" count as existing)."""
    if v is None:
        return False
    return True
33
+
34
+
35
def default(v, d):
    """Return ``v`` unless it is ``None``, in which case return the fallback ``d``."""
    if v is None:
        return d
    return v
37
+
38
+
39
+ # tensor helpers
40
+
41
+
42
def lens_to_mask(t: int["b"], length: int | None = None) -> bool["b n"]:  # noqa: F722 F821
    """Turn per-row lengths into a boolean mask.

    Row ``i`` of the result is True at positions ``0 .. t[i] - 1`` and False afterwards.

    Args:
        t: tensor of lengths, one per row.
        length: number of mask columns; defaults to the largest value in ``t``.
    """
    if length is None:
        length = t.amax()

    positions = torch.arange(length, device=t.device)
    # compare a (1, n) row of positions against a (b, 1) column of lengths
    return positions.unsqueeze(0) < t.unsqueeze(1)
48
+
49
+
50
def mask_from_start_end_indices(seq_len: int["b"], start: int["b"], end: int["b"]):  # noqa: F722 F821
    """Build a boolean mask that is True on the half-open span ``[start[i], end[i])`` of each row.

    Args:
        seq_len: per-row sequence lengths; only the maximum is used, as the mask width.
        start: per-row inclusive start index.
        end: per-row exclusive end index.
    """
    width = seq_len.max().item()
    positions = torch.arange(width, device=start.device).long().unsqueeze(0)
    at_or_after_start = positions >= start.unsqueeze(1)
    before_end = positions < end.unsqueeze(1)
    return at_or_after_start & before_end
56
+
57
+
58
def mask_from_frac_lengths(seq_len: int["b"], frac_lengths: float["b"]):  # noqa: F722 F821
    """Mask a randomly placed span covering a given fraction of each sequence.

    For every row the span length is ``frac_lengths[i] * seq_len[i]`` (truncated), and
    its start is drawn uniformly so that the span fits inside the sequence.

    Args:
        seq_len: per-row sequence lengths.
        frac_lengths: per-row fraction of the sequence to mask.
    """
    span = (frac_lengths * seq_len).long()
    # one uniform draw per row picks where the span begins
    offset_frac = torch.rand_like(frac_lengths)
    start = ((seq_len - span) * offset_frac).long().clamp(min=0)
    return mask_from_start_end_indices(seq_len, start, start + span)
67
+
68
+
69
def maybe_masked_mean(t: float["b n d"], mask: bool["b n"] = None) -> float["b d"]:  # noqa: F722
    """Average ``t`` over its sequence axis (dim 1), optionally ignoring masked-out positions.

    Args:
        t: tensor of shape (b, n, d).
        mask: optional boolean tensor of shape (b, n); True marks positions to include.
            When omitted, a plain mean over dim 1 is returned.

    Returns:
        Tensor of shape (b, d). Rows whose mask is entirely False yield zeros, because
        the divisor is clamped to at least 1.
    """
    if mask is None:
        return t.mean(dim=1)

    t = torch.where(mask[:, :, None], t, torch.tensor(0.0, device=t.device))
    num = t.sum(dim=1)
    # keepdim gives a (b, 1) count so the division broadcasts per batch row;
    # a bare (b,) count would be aligned with the feature axis of the (b, d) sum instead
    den = mask.float().sum(dim=1, keepdim=True)

    return num / den.clamp(min=1.0)
78
+
79
+
80
+ # simple utf-8 tokenizer, since paper went character based
81
def list_str_to_tensor(text: list[str], padding_value=-1) -> int["b nt"]:  # noqa: F722
    """Encode each string as its UTF-8 byte values and pad the batch to a common length.

    Args:
        text: batch of strings.
        padding_value: value used to right-pad shorter rows (default -1).

    Returns:
        Integer (int64) tensor of shape (batch, longest byte length).
    """
    # explicit dtype: torch.tensor([]) for an empty string would otherwise be float32
    # and could turn the whole padded batch into floats
    list_tensors = [torch.tensor(list(t.encode("UTF-8")), dtype=torch.long) for t in text]  # ByT5 style
    return pad_sequence(list_tensors, padding_value=padding_value, batch_first=True)
85
+
86
+
87
+ # char tokenizer, based on custom dataset's extracted .txt file
88
def list_str_to_idx(
    text: list[str] | list[list[str]],
    vocab_char_map: dict[str, int],  # {char: idx}
    padding_value=-1,
) -> int["b nt"]:  # noqa: F722
    """Map every token of every text to its vocabulary index and pad the batch.

    Args:
        text: batch of strings (iterated character by character) or of token lists.
        vocab_char_map: token -> index mapping; unknown tokens fall back to index 0.
        padding_value: value used to right-pad shorter rows (default -1).

    Returns:
        Integer (int64) tensor of shape (batch, longest token count).
    """
    # explicit dtype: an empty text would otherwise produce a float32 tensor
    list_idx_tensors = [
        torch.tensor([vocab_char_map.get(c, 0) for c in t], dtype=torch.long) for t in text
    ]  # pinyin or char style
    return pad_sequence(list_idx_tensors, padding_value=padding_value, batch_first=True)
96
+
97
+
98
+ # Get tokenizer
99
+
100
+
101
def _load_vocab_file(vocab_path):
    """Read a vocab file (one token per line) into a ``{token: line_index}`` dict."""
    with open(vocab_path, "r", encoding="utf-8") as f:
        # strip only the line terminator: tokens such as " " must survive, and the last
        # token must not lose a character when the file has no trailing newline
        return {line.rstrip("\n"): i for i, line in enumerate(f)}


def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
    """
    tokenizer   - "pinyin" do g2p for only chinese characters, need .txt vocab_file
                - "char" for char-wise tokenizer, need .txt vocab_file
                - "byte" for utf-8 tokenizer
                - "custom" if you're directly passing in a path to the vocab.txt you want to use
    vocab_size  - if use "pinyin", all available pinyin types, common alphabets (also those with accent) and symbols
                - if use "char", derived from unfiltered character & symbol counts of custom dataset
                - if use "byte", set to 256 (unicode byte range)

    Args:
        dataset_name: dataset name used to locate ``data/{dataset_name}_{tokenizer}/vocab.txt``
            for "pinyin"/"char"; the path of the vocab file itself for "custom"; unused for "byte".
        tokenizer: one of "pinyin", "char", "byte", "custom".

    Returns:
        ``(vocab_char_map, vocab_size)``; ``vocab_char_map`` is ``None`` for "byte".

    Raises:
        ValueError: if ``tokenizer`` is not one of the supported names.
    """
    if tokenizer in ["pinyin", "char"]:
        tokenizer_path = os.path.join(files("f5_tts").joinpath("../../data"), f"{dataset_name}_{tokenizer}/vocab.txt")
        vocab_char_map = _load_vocab_file(tokenizer_path)
        vocab_size = len(vocab_char_map)
        assert vocab_char_map[" "] == 0, "make sure space is of idx 0 in vocab.txt, cuz 0 is used for unknown char"

    elif tokenizer == "byte":
        vocab_char_map = None
        vocab_size = 256

    elif tokenizer == "custom":
        vocab_char_map = _load_vocab_file(dataset_name)
        vocab_size = len(vocab_char_map)

    else:
        # previously fell through to the return and failed with UnboundLocalError
        raise ValueError(
            f"tokenizer must be one of 'pinyin', 'char', 'byte' or 'custom', but received {tokenizer!r}"
        )

    return vocab_char_map, vocab_size
132
+
133
+
134
+ # convert char to pinyin
135
+
136
+ jieba.initialize()
137
+ print("Word segmentation module jieba initialized.\n")
138
+
139
+
140
def convert_char_to_pinyin(text_list, polyphone=True):
    """Convert each text into a flat token list, replacing Chinese characters with pinyin.

    Every text is segmented with ``jieba.cut``. Segments made only of single-byte
    characters are emitted character by character; characters accepted by ``is_chinese``
    are replaced by their ``lazy_pinyin`` reading (``Style.TONE3``) preceded by a ``" "``
    token; anything else is passed through unchanged.

    Args:
        text_list: iterable of strings to convert.
        polyphone: when True, segments made purely of 3-byte UTF-8 characters are handed
            to ``lazy_pinyin`` as a whole (presumably so multi-character context can
            disambiguate readings); when False they are converted one character at a time.

    Returns:
        A list containing one token list per input text.
    """
    final_text_list = []
    # normalise a few punctuation marks to their ASCII counterparts
    custom_trans = str.maketrans(
        {";": ",", "“": '"', "”": '"', "‘": "'", "’": "'"}
    )  # add custom trans here, to address oov

    def is_chinese(c):
        return (
            "\u3100" <= c <= "\u9fff"  # common chinese characters
        )

    for text in text_list:
        char_list = []
        text = text.translate(custom_trans)
        for seg in jieba.cut(text):
            # byte length == char count means every char is 1 byte in UTF-8;
            # byte length == 3 * char count means every char is 3 bytes
            seg_byte_len = len(bytes(seg, "UTF-8"))
            if seg_byte_len == len(seg):  # if pure alphabets and symbols
                # separate a multi-character segment from the previous token with a space,
                # unless the previous token is already a space, colon or quote
                if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
                    char_list.append(" ")
                char_list.extend(seg)
            elif polyphone and seg_byte_len == 3 * len(seg):  # if pure east asian characters
                seg_ = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
                # NOTE(review): indexing seg_ by character position assumes lazy_pinyin
                # returns exactly one item per input character -- confirm for segments
                # containing 3-byte characters that are not Chinese
                for i, c in enumerate(seg):
                    if is_chinese(c):
                        char_list.append(" ")
                    char_list.append(seg_[i])
            else:  # if mixed characters, alphabets and symbols
                for c in seg:
                    if ord(c) < 256:
                        char_list.extend(c)
                    elif is_chinese(c):
                        char_list.append(" ")
                        char_list.extend(lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True))
                    else:
                        char_list.append(c)
        final_text_list.append(char_list)

    return final_text_list
178
+
179
+
180
+ # filter func for dirty data with many repetitions
181
+
182
+
183
def repetition_found(text, length=2, tolerance=10):
    """Report whether any substring of ``length`` characters occurs more than ``tolerance`` times.

    Occurrences are counted over all overlapping windows of ``text``.

    Args:
        text: string to inspect.
        length: window size in characters (default 2).
        tolerance: highest count that is still accepted (default 10).

    Returns:
        True if some window's count exceeds ``tolerance``, otherwise False.
    """
    window_count = len(text) - length + 1
    seen = defaultdict(int)
    for start in range(window_count):
        seen[text[start : start + length]] += 1
    return any(n > tolerance for n in seen.values())
f5_tts_id_colab.ipynb ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "language_info": {
14
+ "name": "python"
15
+ },
16
+ "accelerator": "GPU"
17
+ },
18
+ "cells": [
19
+ {
20
+ "cell_type": "code",
21
+ "metadata": {
22
+ "colab": {
23
+ "base_uri": "https://localhost:8080/"
24
+ },
25
+ "id": "n1wPWmjJd--t",
26
+ "outputId": "a862ae5a-3d6e-4134-d352-54b02dd28685",
27
+ "jupyter": {
28
+ "is_executing": true
29
+ }
30
+ },
31
+ "source": [
32
+ "!nvidia-smi"
33
+ ],
34
+ "outputs": [],
35
+ "execution_count": null
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "source": [
40
+ "!git clone https://huggingface.co/spaces/hndrbrm/f5_tts_id_space\n",
41
+ "%cd /content/f5_tts_id_space\n",
42
+ "!pip install -r requirements.txt\n",
43
+ "!python app.py --share"
44
+ ],
45
+ "metadata": {
46
+ "colab": {
47
+ "base_uri": "https://localhost:8080/"
48
+ },
49
+ "id": "w5huO7B9etpd",
50
+ "outputId": "1603dca4-81b8-4177-f0a2-50299c21402b"
51
+ },
52
+ "execution_count": 3,
53
+ "outputs": [
54
+ {
55
+ "output_type": "stream",
56
+ "name": "stdout",
57
+ "text": [
58
+ "Cloning into 'f5_tts_id_space'...\n",
59
+ "remote: Enumerating objects: 27, done.\u001B[K\n",
60
+ "remote: Counting objects: 100% (23/23), done.\u001B[K\n",
61
+ "remote: Compressing objects: 100% (21/21), done.\u001B[K\n",
62
+ "remote: Total 27 (delta 2), reused 0 (delta 0), pack-reused 4 (from 1)\u001B[K\n",
63
+ "Unpacking objects: 100% (27/27), 44.75 KiB | 2.80 MiB/s, done.\n",
64
+ "/content/f5_tts_id_space\n",
65
+ "Requirement already satisfied: accelerate>=0.33.0 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 1)) (1.2.1)\n",
66
+ "Requirement already satisfied: cached_path in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 2)) (1.6.6)\n",
67
+ "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 3)) (3.2.0)\n",
68
+ "Requirement already satisfied: ema_pytorch>=0.5.2 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 4)) (0.7.7)\n",
69
+ "Requirement already satisfied: gradio>=3.45.2 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 5)) (5.10.0)\n",
70
+ "Requirement already satisfied: jieba in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 6)) (0.42.1)\n",
71
+ "Requirement already satisfied: librosa in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 7)) (0.10.2.post1)\n",
72
+ "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 8)) (3.8.0)\n",
73
+ "Requirement already satisfied: numpy<=1.26.4 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 9)) (1.26.4)\n",
74
+ "Requirement already satisfied: pypinyin in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 10)) (0.53.0)\n",
75
+ "Requirement already satisfied: soundfile in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 11)) (0.12.1)\n",
76
+ "Requirement already satisfied: torchaudio>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 12)) (2.5.1+cu121)\n",
77
+ "Requirement already satisfied: torchdiffeq in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 13)) (0.2.5)\n",
78
+ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 14)) (4.47.1)\n",
79
+ "Requirement already satisfied: vocos in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 15)) (0.1.0)\n",
80
+ "Requirement already satisfied: wandb in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 16)) (0.19.1)\n",
81
+ "Requirement already satisfied: x_transformers>=1.31.14 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 17)) (1.44.4)\n",
82
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.33.0->-r requirements.txt (line 1)) (24.2)\n",
83
+ "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.33.0->-r requirements.txt (line 1)) (5.9.5)\n",
84
+ "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.33.0->-r requirements.txt (line 1)) (6.0.2)\n",
85
+ "Requirement already satisfied: torch>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.33.0->-r requirements.txt (line 1)) (2.5.1+cu121)\n",
86
+ "Requirement already satisfied: huggingface-hub>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.33.0->-r requirements.txt (line 1)) (0.26.5)\n",
87
+ "Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.33.0->-r requirements.txt (line 1)) (0.4.5)\n",
88
+ "Requirement already satisfied: requests<3.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from cached_path->-r requirements.txt (line 2)) (2.32.3)\n",
89
+ "Requirement already satisfied: rich<14.0,>=12.1 in /usr/local/lib/python3.10/dist-packages (from cached_path->-r requirements.txt (line 2)) (13.9.4)\n",
90
+ "Requirement already satisfied: filelock<4.0,>=3.4 in /usr/local/lib/python3.10/dist-packages (from cached_path->-r requirements.txt (line 2)) (3.16.1)\n",
91
+ "Requirement already satisfied: boto3<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from cached_path->-r requirements.txt (line 2)) (1.35.93)\n",
92
+ "Requirement already satisfied: google-cloud-storage<3.0,>=1.32.0 in /usr/local/lib/python3.10/dist-packages (from cached_path->-r requirements.txt (line 2)) (2.19.0)\n",
93
+ "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets->-r requirements.txt (line 3)) (17.0.0)\n",
94
+ "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets->-r requirements.txt (line 3)) (0.3.8)\n",
95
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets->-r requirements.txt (line 3)) (2.2.2)\n",
96
+ "Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.10/dist-packages (from datasets->-r requirements.txt (line 3)) (4.67.1)\n",
97
+ "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets->-r requirements.txt (line 3)) (3.5.0)\n",
98
+ "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.10/dist-packages (from datasets->-r requirements.txt (line 3)) (0.70.16)\n",
99
+ "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets->-r requirements.txt (line 3)) (2024.9.0)\n",
100
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets->-r requirements.txt (line 3)) (3.11.10)\n",
101
+ "Requirement already satisfied: aiofiles<24.0,>=22.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (23.2.1)\n",
102
+ "Requirement already satisfied: anyio<5.0,>=3.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (3.7.1)\n",
103
+ "Requirement already satisfied: fastapi<1.0,>=0.115.2 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (0.115.6)\n",
104
+ "Requirement already satisfied: ffmpy in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (0.5.0)\n",
105
+ "Requirement already satisfied: gradio-client==1.5.3 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (1.5.3)\n",
106
+ "Requirement already satisfied: httpx>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (0.28.1)\n",
107
+ "Requirement already satisfied: jinja2<4.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (3.1.4)\n",
108
+ "Requirement already satisfied: markupsafe~=2.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (2.1.5)\n",
109
+ "Requirement already satisfied: orjson~=3.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (3.10.12)\n",
110
+ "Requirement already satisfied: pillow<12.0,>=8.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (11.0.0)\n",
111
+ "Requirement already satisfied: pydantic>=2.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (2.10.3)\n",
112
+ "Requirement already satisfied: pydub in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (0.25.1)\n",
113
+ "Requirement already satisfied: python-multipart>=0.0.18 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (0.0.20)\n",
114
+ "Requirement already satisfied: ruff>=0.2.2 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (0.8.6)\n",
115
+ "Requirement already satisfied: safehttpx<0.2.0,>=0.1.6 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (0.1.6)\n",
116
+ "Requirement already satisfied: semantic-version~=2.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (2.10.0)\n",
117
+ "Requirement already satisfied: starlette<1.0,>=0.40.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (0.41.3)\n",
118
+ "Requirement already satisfied: tomlkit<0.14.0,>=0.12.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (0.13.2)\n",
119
+ "Requirement already satisfied: typer<1.0,>=0.12 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (0.15.1)\n",
120
+ "Requirement already satisfied: typing-extensions~=4.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (4.12.2)\n",
121
+ "Requirement already satisfied: uvicorn>=0.14.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->-r requirements.txt (line 5)) (0.34.0)\n",
122
+ "Requirement already satisfied: websockets<15.0,>=10.0 in /usr/local/lib/python3.10/dist-packages (from gradio-client==1.5.3->gradio>=3.45.2->-r requirements.txt (line 5)) (14.1)\n",
123
+ "Requirement already satisfied: audioread>=2.1.9 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements.txt (line 7)) (3.0.1)\n",
124
+ "Requirement already satisfied: scipy>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements.txt (line 7)) (1.13.1)\n",
125
+ "Requirement already satisfied: scikit-learn>=0.20.0 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements.txt (line 7)) (1.6.0)\n",
126
+ "Requirement already satisfied: joblib>=0.14 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements.txt (line 7)) (1.4.2)\n",
127
+ "Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements.txt (line 7)) (4.4.2)\n",
128
+ "Requirement already satisfied: numba>=0.51.0 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements.txt (line 7)) (0.60.0)\n",
129
+ "Requirement already satisfied: pooch>=1.1 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements.txt (line 7)) (1.8.2)\n",
130
+ "Requirement already satisfied: soxr>=0.3.2 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements.txt (line 7)) (0.5.0.post1)\n",
131
+ "Requirement already satisfied: lazy-loader>=0.1 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements.txt (line 7)) (0.4)\n",
132
+ "Requirement already satisfied: msgpack>=1.0 in /usr/local/lib/python3.10/dist-packages (from librosa->-r requirements.txt (line 7)) (1.1.0)\n",
133
+ "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->-r requirements.txt (line 8)) (1.3.1)\n",
134
+ "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->-r requirements.txt (line 8)) (0.12.1)\n",
135
+ "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->-r requirements.txt (line 8)) (4.55.3)\n",
136
+ "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->-r requirements.txt (line 8)) (1.4.7)\n",
137
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->-r requirements.txt (line 8)) (3.2.0)\n",
138
+ "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->-r requirements.txt (line 8)) (2.8.2)\n",
139
+ "Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.10/dist-packages (from soundfile->-r requirements.txt (line 11)) (1.17.1)\n",
140
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate>=0.33.0->-r requirements.txt (line 1)) (3.4.2)\n",
141
+ "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate>=0.33.0->-r requirements.txt (line 1)) (1.13.1)\n",
142
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch>=1.10.0->accelerate>=0.33.0->-r requirements.txt (line 1)) (1.3.0)\n",
143
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers->-r requirements.txt (line 14)) (2024.11.6)\n",
144
+ "Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.10/dist-packages (from transformers->-r requirements.txt (line 14)) (0.21.0)\n",
145
+ "Requirement already satisfied: einops in /usr/local/lib/python3.10/dist-packages (from vocos->-r requirements.txt (line 15)) (0.8.0)\n",
146
+ "Requirement already satisfied: encodec==0.1.1 in /usr/local/lib/python3.10/dist-packages (from vocos->-r requirements.txt (line 15)) (0.1.1)\n",
147
+ "Requirement already satisfied: click!=8.0.0,>=7.1 in /usr/local/lib/python3.10/dist-packages (from wandb->-r requirements.txt (line 16)) (8.1.7)\n",
148
+ "Requirement already satisfied: docker-pycreds>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from wandb->-r requirements.txt (line 16)) (0.4.0)\n",
149
+ "Requirement already satisfied: gitpython!=3.1.29,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from wandb->-r requirements.txt (line 16)) (3.1.43)\n",
150
+ "Requirement already satisfied: platformdirs in /usr/local/lib/python3.10/dist-packages (from wandb->-r requirements.txt (line 16)) (4.3.6)\n",
151
+ "Requirement already satisfied: protobuf!=4.21.0,!=5.28.0,<6,>=3.19.0 in /usr/local/lib/python3.10/dist-packages (from wandb->-r requirements.txt (line 16)) (4.25.5)\n",
152
+ "Requirement already satisfied: sentry-sdk>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from wandb->-r requirements.txt (line 16)) (2.19.2)\n",
153
+ "Requirement already satisfied: setproctitle in /usr/local/lib/python3.10/dist-packages (from wandb->-r requirements.txt (line 16)) (1.3.4)\n",
154
+ "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from wandb->-r requirements.txt (line 16)) (75.1.0)\n",
155
+ "Requirement already satisfied: einx>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from x_transformers>=1.31.14->-r requirements.txt (line 17)) (0.3.0)\n",
156
+ "Requirement already satisfied: loguru in /usr/local/lib/python3.10/dist-packages (from x_transformers>=1.31.14->-r requirements.txt (line 17)) (0.7.3)\n",
157
+ "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->gradio>=3.45.2->-r requirements.txt (line 5)) (3.10)\n",
158
+ "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->gradio>=3.45.2->-r requirements.txt (line 5)) (1.3.1)\n",
159
+ "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->gradio>=3.45.2->-r requirements.txt (line 5)) (1.2.2)\n",
160
+ "Requirement already satisfied: botocore<1.36.0,>=1.35.93 in /usr/local/lib/python3.10/dist-packages (from boto3<2.0,>=1.0->cached_path->-r requirements.txt (line 2)) (1.35.93)\n",
161
+ "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from boto3<2.0,>=1.0->cached_path->-r requirements.txt (line 2)) (1.0.1)\n",
162
+ "Requirement already satisfied: s3transfer<0.11.0,>=0.10.0 in /usr/local/lib/python3.10/dist-packages (from boto3<2.0,>=1.0->cached_path->-r requirements.txt (line 2)) (0.10.4)\n",
163
+ "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.0->soundfile->-r requirements.txt (line 11)) (2.22)\n",
164
+ "Requirement already satisfied: six>=1.4.0 in /usr/local/lib/python3.10/dist-packages (from docker-pycreds>=0.4.0->wandb->-r requirements.txt (line 16)) (1.17.0)\n",
165
+ "Requirement already satisfied: frozendict in /usr/local/lib/python3.10/dist-packages (from einx>=0.3.0->x_transformers>=1.31.14->-r requirements.txt (line 17)) (2.4.6)\n",
166
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->-r requirements.txt (line 3)) (2.4.4)\n",
167
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->-r requirements.txt (line 3)) (1.3.2)\n",
168
+ "Requirement already satisfied: async-timeout<6.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->-r requirements.txt (line 3)) (4.0.3)\n",
169
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->-r requirements.txt (line 3)) (24.3.0)\n",
170
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->-r requirements.txt (line 3)) (1.5.0)\n",
171
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->-r requirements.txt (line 3)) (6.1.0)\n",
172
+ "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->-r requirements.txt (line 3)) (0.2.1)\n",
173
+ "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->-r requirements.txt (line 3)) (1.18.3)\n",
174
+ "Requirement already satisfied: gitdb<5,>=4.0.1 in /usr/local/lib/python3.10/dist-packages (from gitpython!=3.1.29,>=1.0.0->wandb->-r requirements.txt (line 16)) (4.0.11)\n",
175
+ "Requirement already satisfied: google-auth<3.0dev,>=2.26.1 in /usr/local/lib/python3.10/dist-packages (from google-cloud-storage<3.0,>=1.32.0->cached_path->-r requirements.txt (line 2)) (2.27.0)\n",
176
+ "Requirement already satisfied: google-api-core<3.0.0dev,>=2.15.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-storage<3.0,>=1.32.0->cached_path->-r requirements.txt (line 2)) (2.19.2)\n",
177
+ "Requirement already satisfied: google-cloud-core<3.0dev,>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-storage<3.0,>=1.32.0->cached_path->-r requirements.txt (line 2)) (2.4.1)\n",
178
+ "Requirement already satisfied: google-resumable-media>=2.7.2 in /usr/local/lib/python3.10/dist-packages (from google-cloud-storage<3.0,>=1.32.0->cached_path->-r requirements.txt (line 2)) (2.7.2)\n",
179
+ "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-storage<3.0,>=1.32.0->cached_path->-r requirements.txt (line 2)) (1.6.0)\n",
180
+ "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx>=0.24.1->gradio>=3.45.2->-r requirements.txt (line 5)) (2024.12.14)\n",
181
+ "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx>=0.24.1->gradio>=3.45.2->-r requirements.txt (line 5)) (1.0.7)\n",
182
+ "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx>=0.24.1->gradio>=3.45.2->-r requirements.txt (line 5)) (0.14.0)\n",
183
+ "Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba>=0.51.0->librosa->-r requirements.txt (line 7)) (0.43.0)\n",
184
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets->-r requirements.txt (line 3)) (2024.2)\n",
185
+ "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets->-r requirements.txt (line 3)) (2024.2)\n",
186
+ "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic>=2.0->gradio>=3.45.2->-r requirements.txt (line 5)) (0.7.0)\n",
187
+ "Requirement already satisfied: pydantic-core==2.27.1 in /usr/local/lib/python3.10/dist-packages (from pydantic>=2.0->gradio>=3.45.2->-r requirements.txt (line 5)) (2.27.1)\n",
188
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.0->cached_path->-r requirements.txt (line 2)) (3.4.0)\n",
189
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.0->cached_path->-r requirements.txt (line 2)) (2.2.3)\n",
190
+ "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich<14.0,>=12.1->cached_path->-r requirements.txt (line 2)) (3.0.0)\n",
191
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich<14.0,>=12.1->cached_path->-r requirements.txt (line 2)) (2.18.0)\n",
192
+ "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.20.0->librosa->-r requirements.txt (line 7)) (3.5.0)\n",
193
+ "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0,>=0.12->gradio>=3.45.2->-r requirements.txt (line 5)) (1.5.4)\n",
194
+ "Requirement already satisfied: smmap<6,>=3.0.1 in /usr/local/lib/python3.10/dist-packages (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb->-r requirements.txt (line 16)) (5.0.1)\n",
195
+ "Requirement already satisfied: googleapis-common-protos<2.0.dev0,>=1.56.2 in /usr/local/lib/python3.10/dist-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage<3.0,>=1.32.0->cached_path->-r requirements.txt (line 2)) (1.66.0)\n",
196
+ "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.3 in /usr/local/lib/python3.10/dist-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage<3.0,>=1.32.0->cached_path->-r requirements.txt (line 2)) (1.25.0)\n",
197
+ "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage<3.0,>=1.32.0->cached_path->-r requirements.txt (line 2)) (5.5.0)\n",
198
+ "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage<3.0,>=1.32.0->cached_path->-r requirements.txt (line 2)) (0.4.1)\n",
199
+ "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage<3.0,>=1.32.0->cached_path->-r requirements.txt (line 2)) (4.9)\n",
200
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich<14.0,>=12.1->cached_path->-r requirements.txt (line 2)) (0.1.2)\n",
201
+ "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3.0dev,>=2.26.1->google-cloud-storage<3.0,>=1.32.0->cached_path->-r requirements.txt (line 2)) (0.6.1)\n",
202
+ "Building prefix dict from the default dictionary ...\n",
203
+ "Loading model from cache /tmp/jieba.cache\n",
204
+ "Loading model cost 0.732 seconds.\n",
205
+ "Prefix dict has been built successfully.\n",
206
+ "Word segmentation module jieba initialized.\n",
207
+ "\n",
208
+ "2025-01-07 16:24:35.598424: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
209
+ "2025-01-07 16:24:35.621175: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
210
+ "2025-01-07 16:24:35.628006: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
211
+ "2025-01-07 16:24:35.644501: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
212
+ "To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
213
+ "2025-01-07 16:24:36.902693: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
214
+ "Download Vocos from huggingface charactr/vocos-mel-24khz\n",
215
+ "\n",
216
+ "vocab : /content/f5_tts_id_space/f5_tts/infer/examples/vocab.txt\n",
217
+ "token : custom\n",
218
+ "model : /root/.cache/huggingface/hub/models--hndrbrm--f5_tts_id_model/snapshots/9e24ec2d125c99a0b0c63dabdfeeff913af0be87/model_id.safetensors \n",
219
+ "\n",
220
+ "/usr/local/lib/python3.10/dist-packages/gradio/components/chatbot.py:249: UserWarning: You have not specified a value for the `type` parameter. Defaulting to the 'tuples' format for chatbot messages, but this is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style dictionaries with 'role' and 'content' keys.\n",
221
+ " warnings.warn(\n",
222
+ "Startingg app...\n",
223
+ "* Running on local URL: http://127.0.0.1:7860\n",
224
+ "* Running on public URL: https://5096edd1da979fde33.gradio.live\n",
225
+ "\n",
226
+ "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n",
227
+ "\n",
228
+ "ref_text panas matahari siang ini, sebenarnya bisa membuat cucian basah di jemuran, kering dalam sekejap. \n",
229
+ "gen_text 0 panas matahari siang ini, sebenarnya bisa membuat cucian basah di jemuran, kering dalam sekejap.\n",
230
+ "\n",
231
+ "\n",
232
+ "/usr/local/lib/python3.10/dist-packages/gradio/processing_utils.py:738: UserWarning: Trying to convert audio automatically from float32 to 16-bit int format.\n",
233
+ " warnings.warn(warning.format(data.dtype))\n",
234
+ "Keyboard interruption in main thread... closing server.\n",
235
+ "\n",
236
+ "Aborted!\n",
237
+ "Killing tunnel 127.0.0.1:7860 <> https://5096edd1da979fde33.gradio.live\n"
238
+ ]
239
+ }
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "code",
244
+ "source": [],
245
+ "metadata": {
246
+ "id": "pvxfC6bhiFGe"
247
+ },
248
+ "execution_count": null,
249
+ "outputs": []
250
+ }
251
+ ]
252
+ }
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate>=0.33.0
2
+ cached_path
3
+ datasets
4
+ ema_pytorch>=0.5.2
5
+ gradio>=3.45.2
6
+ jieba
7
+ librosa
8
+ matplotlib
9
+ numpy<=1.26.4
10
+ pypinyin
11
+ soundfile
12
+ torchaudio>=2.0.0
13
+ torchdiffeq
14
+ transformers
15
+ vocos
16
+ wandb
17
+ x_transformers>=1.31.14