Spaces:

pupunpu
/

voice-clone-app

Sleeping

App Files Community

hengjie yang committed 20 days ago

Commit

a03d4c1

1 Parent(s): 4ecc033

Complete overhaul of audio processing and embedding extraction

Browse files

Files changed (1) hide show

src/deploy/voice_clone.py +100 -47

src/deploy/voice_clone.py CHANGED Viewed

@@ -41,6 +41,40 @@ class VoiceCloneSystem:
         print("模型加载完成！")
     def extract_speaker_embedding(
         self,
         audio_paths: List[Union[str, Path]]
@@ -57,33 +91,42 @@ class VoiceCloneSystem:
         embeddings = []
         for audio_path in audio_paths:
-            # 加载音频
-            waveform, sr = torchaudio.load(str(audio_path))
-            # 重采样到16kHz
-            if sr != 16000:
-                waveform = torchaudio.functional.resample(waveform, sr, 16000)
-            # 确保音频是单声道
-            if waveform.shape[0] > 1:
-                waveform = torch.mean(waveform, dim=0, keepdim=True)
-            # 提取特征
-            with torch.no_grad():
-                embedding = self.speaker_encoder.encode_batch(waveform.to(self.device))
-                # 调整维度：从 [1, 1, 1, 512] 转换为 [1, 512]
-                embedding = embedding.squeeze()  # 移除所有维度为1的维度
-                if embedding.dim() == 1:
-                    embedding = embedding.unsqueeze(0)  # 确保是 [1, 512]
-                embeddings.append(embedding)
         # 计算平均特征
-        mean_embedding = torch.mean(torch.stack(embeddings), dim=0)
         if mean_embedding.dim() == 1:
-            mean_embedding = mean_embedding.unsqueeze(0)  # 确保是 [1, 512]
-        # 打印维度信息以便调试
         print(f"Final embedding shape: {mean_embedding.shape}")
         return mean_embedding
     def generate_speech(
@@ -101,21 +144,26 @@ class VoiceCloneSystem:
         Returns:
             生成的语音波形
         """
-        # 处理输入文本
-        inputs = self.processor(text=text, return_tensors="pt")
-        # 确保说话人特征维度正确
-        if speaker_embedding.dim() != 2 or speaker_embedding.size(1) != 512:
-            raise ValueError(f"Speaker embedding should have shape [1, 512], but got {speaker_embedding.shape}")
-        # 生成语音
-        speech = self.tts_model.generate_speech(
-            inputs["input_ids"].to(self.device),
-            speaker_embedding.to(self.device),
-            vocoder=self.vocoder
-        )
-        return speech
     def clone_voice(
         self,
@@ -140,6 +188,7 @@ class VoiceCloneSystem:
             speech = self.generate_speech(text, speaker_embedding)
             return speech
         except Exception as e:
             print(f"Error in clone_voice: {str(e)}")
             raise
@@ -158,13 +207,17 @@ class VoiceCloneSystem:
             output_path: 输出文件路径
             sample_rate: 采样率
         """
-        # 确保输出目录存在
-        output_path = Path(output_path)
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        # 保存音频
-        torchaudio.save(
-            str(output_path),
-            waveform.unsqueeze(0).cpu(),
-            sample_rate
-        )

         print("模型加载完成！")
+    def process_audio(self, waveform: torch.Tensor, sr: int) -> torch.Tensor:
+        """
+        处理音频：重采样和转换为单声道
+        Args:
+            waveform: 输入音频波形
+            sr: 采样率
+        Returns:
+            处理后的音频波形
+        """
+        # 重采样到16kHz
+        if sr != 16000:
+            waveform = torchaudio.functional.resample(waveform, sr, 16000)
+        # 确保音频是单声道
+        if waveform.shape[0] > 1:
+            waveform = torch.mean(waveform, dim=0, keepdim=True)
+        # 标准化音频长度（3秒）
+        target_length = 16000 * 3
+        current_length = waveform.shape[1]
+        if current_length > target_length:
+            # 如果太长，截取中间部分
+            start = (current_length - target_length) // 2
+            waveform = waveform[:, start:start + target_length]
+        elif current_length < target_length:
+            # 如果太短，用0填充
+            padding = torch.zeros(1, target_length - current_length)
+            waveform = torch.cat([waveform, padding], dim=1)
+        return waveform
     def extract_speaker_embedding(
         self,
         audio_paths: List[Union[str, Path]]
         embeddings = []
         for audio_path in audio_paths:
+            try:
+                # 加载音频
+                waveform, sr = torchaudio.load(str(audio_path))
+                # 处理音频
+                waveform = self.process_audio(waveform, sr)
+                # 提取特征
+                with torch.no_grad():
+                    # 确保输入维度正确 [batch, time]
+                    if waveform.dim() == 2:
+                        waveform = waveform.squeeze(0)
+                    # 提取特征并处理维度
+                    embedding = self.speaker_encoder.encode_batch(waveform.unsqueeze(0).to(self.device))
+                    embedding = embedding.squeeze()  # 移除所有维度为1的维度
+                    # 打印中间结果
+                    print(f"Raw embedding shape: {embedding.shape}")
+                    embeddings.append(embedding)
+            except Exception as e:
+                print(f"Error processing audio {audio_path}: {str(e)}")
+                raise
         # 计算平均特征
+        mean_embedding = torch.stack(embeddings).mean(dim=0)
+        # 确保最终维度正确 [1, 512]
         if mean_embedding.dim() == 1:
+            mean_embedding = mean_embedding.unsqueeze(0)
+        # 打印最终维度
         print(f"Final embedding shape: {mean_embedding.shape}")
         return mean_embedding
     def generate_speech(
         Returns:
             生成的语音波形
         """
+        try:
+            # 处理输入文本
+            inputs = self.processor(text=text, return_tensors="pt")
+            # 确保说话人特征维度正确
+            if speaker_embedding.dim() != 2 or speaker_embedding.size(1) != 512:
+                raise ValueError(f"Speaker embedding should have shape [1, 512], but got {speaker_embedding.shape}")
+            # 生成语音
+            speech = self.tts_model.generate_speech(
+                inputs["input_ids"].to(self.device),
+                speaker_embedding.to(self.device),
+                vocoder=self.vocoder
+            )
+            return speech
+        except Exception as e:
+            print(f"Error in generate_speech: {str(e)}")
+            raise
     def clone_voice(
         self,
             speech = self.generate_speech(text, speaker_embedding)
             return speech
         except Exception as e:
             print(f"Error in clone_voice: {str(e)}")
             raise
             output_path: 输出文件路径
             sample_rate: 采样率
         """
+        try:
+            # 确保输出目录存在
+            output_path = Path(output_path)
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+            # 保存音频
+            torchaudio.save(
+                str(output_path),
+                waveform.unsqueeze(0).cpu(),
+                sample_rate
+            )
+        except Exception as e:
+            print(f"Error saving audio: {str(e)}")
+            raise