firstpixel committed on
Commit
9f9c96f
·
1 Parent(s): 6194263

Adds the chunked F5-TTS agent, organizes the model files under pt-br/, and adjusts the README.

Browse files
AgentF5TTSChunk.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import time
4
+ import logging
5
+ import subprocess
6
+ from f5_tts.api import F5TTS # Correct import for F5-TTS API
7
+
8
+
9
+
10
# Configure the root logger at INFO level when this module is loaded, so the
# agent's progress messages (logging.info / logging.error) are emitted.
+ logging.basicConfig(level=logging.INFO)
11
+
12
+
13
class AgentF5TTS:
    """
    Line-by-line text-to-speech agent built on the F5-TTS API.

    Every non-empty line of an input text file is synthesized into its own
    temporary WAV chunk; the chunks are then concatenated with FFmpeg into a
    single output file, which can optionally be converted to MP3 as well.
    """

    # Matches "[speaker:<name>, emotion:<name>]". Whitespace around the comma
    # is optional so "[speaker:bob,emotion:sad]" is accepted too.
    _TAG_PATTERN = re.compile(r"\[speaker:(.*?)\s*,\s*emotion:(.*?)\]")
    # Removes the whole "[speaker:...]" tag (and trailing spaces) from a line.
    _TAG_STRIP_PATTERN = re.compile(r"\[speaker:.*?\]\s*")

    def __init__(self, ckpt_file, vocoder_name="vocos", delay=0, device="mps"):
        """
        Initialize the F5-TTS Agent.

        :param ckpt_file: Path to the safetensors model checkpoint.
        :param vocoder_name: Name of the vocoder to use ("vocos" or "bigvgan").
        :param delay: Delay in seconds between audio generations.
        :param device: Device to use ("cpu", "cuda", "mps").
        """
        self.model = F5TTS(ckpt_file=ckpt_file, vocoder_name=vocoder_name, device=device)
        self.delay = delay  # Delay in seconds

    def generate_emotion_speech(self, text_file, output_audio_file, speaker_emotion_refs, convert_to_mp3=False):
        """
        Generate speech where each line may select a speaker/emotion voice.

        A line may start with a ``[speaker:<name>, emotion:<name>]`` tag; the
        tag picks the reference audio and is removed from the spoken text.
        Lines without a tag use ("speaker1", "neutral"). Lines whose reference
        audio is missing, or that contain no text besides the tag, are skipped.

        :param text_file: Path to the input text file.
        :param output_audio_file: Path to save the combined audio output.
        :param speaker_emotion_refs: Dictionary mapping (speaker, emotion) tuples to reference audio paths.
        :param convert_to_mp3: Boolean flag to convert the output to MP3.
        :return: None. Problems are reported through ``logging``.
        """
        lines = self._read_lines(text_file)
        if not lines:
            return

        self._ensure_output_dir(output_audio_file)
        temp_files = []

        for line_number, raw_line in enumerate(lines, start=1):
            speaker, emotion = self._determine_speaker_emotion(raw_line)
            gen_text = self._TAG_STRIP_PATTERN.sub("", raw_line)

            ref_audio = speaker_emotion_refs.get((speaker, emotion))
            if not ref_audio or not os.path.exists(ref_audio):
                logging.error(f"Reference audio not found for speaker '{speaker}', emotion '{emotion}'.")
                continue

            if not gen_text.strip():
                # A line holding only a tag has nothing to synthesize.
                logging.warning(f"Line {line_number} has no text after removing the speaker tag; skipping.")
                continue

            temp_file = f"{output_audio_file}_line{line_number}.wav"
            logging.info(
                f"Generating speech for line {line_number}: '{gen_text}' with speaker '{speaker}', emotion '{emotion}'"
            )
            if self._synthesize_chunk(line_number, gen_text, ref_audio, temp_file, remove_silence=True):
                temp_files.append(temp_file)
                # Guarded so a None/negative delay cannot raise after a successful synthesis.
                if self.delay and self.delay > 0:
                    time.sleep(self.delay)

        self._combine_audio_files(temp_files, output_audio_file, convert_to_mp3)

    def generate_speech(self, text_file, output_audio_file, ref_audio, convert_to_mp3=False):
        """
        Generate speech for every line using one shared reference audio.

        Unlike ``generate_emotion_speech``, this method does not parse speaker
        tags and does not pause for ``self.delay`` between lines.

        :param text_file: Path to the input text file.
        :param output_audio_file: Path to save the combined audio output.
        :param ref_audio: Path to the reference audio used for every line.
        :param convert_to_mp3: Boolean flag to convert the output to MP3.
        :return: None. Problems are reported through ``logging``.
        """
        lines = self._read_lines(text_file)
        if not lines:
            return

        self._ensure_output_dir(output_audio_file)

        # The reference audio is the same for every line, so validate it once
        # instead of logging the same error for each line.
        if not ref_audio or not os.path.exists(ref_audio):
            logging.error("Reference audio not found for speaker.")
            return

        temp_files = []
        for line_number, gen_text in enumerate(lines, start=1):
            temp_file = f"{output_audio_file}_line{line_number}.wav"
            logging.info(f"Generating speech for line {line_number}: '{gen_text}'")
            if self._synthesize_chunk(line_number, gen_text, ref_audio, temp_file):
                temp_files.append(temp_file)

        self._combine_audio_files(temp_files, output_audio_file, convert_to_mp3)

    @staticmethod
    def _read_lines(text_file):
        """Return the stripped, non-empty lines of ``text_file``; log and return [] if missing or empty."""
        try:
            with open(text_file, "r", encoding="utf-8") as file:
                lines = [line.strip() for line in file if line.strip()]
        except FileNotFoundError:
            logging.error(f"Text file not found: {text_file}")
            return []

        if not lines:
            logging.error("Input text file is empty.")
        return lines

    @staticmethod
    def _ensure_output_dir(output_audio_file):
        """Create the directory that will hold ``output_audio_file``, if the path names one."""
        output_dir = os.path.dirname(output_audio_file)
        # A bare filename has an empty dirname, and os.makedirs("") raises.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    def _synthesize_chunk(self, line_number, gen_text, ref_audio, chunk_path, **infer_options):
        """Synthesize one line into ``chunk_path``; return True on success, False (after logging) on error."""
        try:
            self.model.infer(
                ref_file=ref_audio,
                ref_text="",  # Empty reference text (no transcript is supplied)
                gen_text=gen_text,
                file_wave=chunk_path,
                **infer_options,
            )
        except Exception as e:
            # Deliberately broad: one failing line must not abort the whole
            # batch, and the model may raise arbitrary exception types.
            logging.error(f"Error generating speech for line {line_number}: {e}")
            return False
        return True

    def _determine_speaker_emotion(self, text):
        """
        Extract speaker and emotion from the text using regex.
        Default to "speaker1" and "neutral" if not specified.

        :param text: One input line, optionally containing a ``[speaker:..., emotion:...]`` tag.
        :return: Tuple ``(speaker, emotion)`` with surrounding whitespace removed.
        """
        speaker, emotion = "speaker1", "neutral"  # Default values

        match = self._TAG_PATTERN.search(text)
        if match:
            speaker = match.group(1).strip()
            emotion = match.group(2).strip()

        logging.info(f"Determined speaker: '{speaker}', emotion: '{emotion}'")
        return speaker, emotion

    def _combine_audio_files(self, temp_files, output_audio_file, convert_to_mp3):
        """
        Combine multiple audio files into a single file using FFmpeg.

        On success the temporary chunk files are deleted. On failure they are
        left in place and the error is logged. The FFmpeg concat list file is
        always removed.

        :param temp_files: Paths of the chunk files, in playback order.
        :param output_audio_file: Path of the combined output file.
        :param convert_to_mp3: If true, also write an MP3 next to the output
            (same path with the extension replaced by ".mp3").
        """
        if not temp_files:
            logging.error("No audio files to combine.")
            return

        # Imported here so this block does not depend on a file-level import.
        import tempfile

        # A unique list file avoids collisions between concurrent runs that a
        # fixed "file_list.txt" in the working directory would cause.
        with tempfile.NamedTemporaryFile(
            "w", encoding="utf-8", prefix="f5tts_concat_", suffix=".txt", delete=False
        ) as f:
            list_file = f.name
            for temp in temp_files:
                # Absolute paths: FFmpeg resolves relative entries against the
                # list file's directory, which is now the temp directory.
                # A literal single quote must be written as '\'' in the list.
                entry = os.path.abspath(temp).replace("'", "'\\''")
                f.write(f"file '{entry}'\n")

        try:
            subprocess.run(
                ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", output_audio_file],
                check=True,
            )
            if convert_to_mp3:
                # Swap only the extension; str.replace would also rewrite any
                # ".wav" occurring elsewhere in the path.
                mp3_output = os.path.splitext(output_audio_file)[0] + ".mp3"
                if mp3_output == output_audio_file:
                    logging.warning(f"Output is already an .mp3 path, skipping MP3 conversion: {output_audio_file}")
                else:
                    subprocess.run(
                        ["ffmpeg", "-y", "-i", output_audio_file, "-codec:a", "libmp3lame", "-qscale:a", "2", mp3_output],
                        check=True,
                    )
                    logging.info(f"Converted to MP3: {mp3_output}")
            for temp in temp_files:
                os.remove(temp)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError also covers a missing ffmpeg executable.
            logging.error(f"Error combining audio files: {e}")
        finally:
            try:
                os.remove(list_file)
            except OSError as e:
                logging.warning(f"Could not remove temporary list file {list_file}: {e}")
157
+
158
+
159
# Example usage. Remove everything from this line down when importing this
# module into other agents, and make sure to adjust the paths to your files.
if __name__ == "__main__":
    model_path = "./F5-TTS/ckpts/pt-br/model_last.safetensors"

    # Reference audio per (speaker, emotion) pair. Lines tagged with a pair
    # that is not listed here are skipped by generate_emotion_speech.
    speaker_emotion_refs = {
        ("speaker1", "happy"): "ref_audios/speaker1_happy.wav",
        ("speaker1", "sad"): "ref_audios/speaker1_sad.wav",
        ("speaker1", "angry"): "ref_audios/speaker1_angry.wav",
    }
    agent = AgentF5TTS(ckpt_file=model_path, vocoder_name="vocos", delay=6)

    # Multi-voice run: each input line may carry a [speaker:..., emotion:...] tag.
    agent.generate_emotion_speech(
        text_file="input_text.txt",
        output_audio_file="output/final_output_emo.wav",
        speaker_emotion_refs=speaker_emotion_refs,
        convert_to_mp3=True,
    )

    # Single-voice run: every line uses the same reference audio.
    agent.generate_speech(
        text_file="input_text2.txt",
        output_audio_file="output/final_output.wav",
        ref_audio="ref_audios/refaudio.mp3",
        convert_to_mp3=True,
    )
187
+
188
+
189
+
190
+
README.md CHANGED
Binary files a/README.md and b/README.md differ
 
pt-br DELETED
File without changes
model_200000.pt → pt-br/model_200000.pt RENAMED
File without changes
model_last.pt → pt-br/model_last.pt RENAMED
File without changes
model_last.safetensors → pt-br/model_last.safetensors RENAMED
File without changes