import os
import time

import torch
import torchaudio

# Download the unidic dictionary required by MeCab (Japanese text processing)
os.system('python -m unidic download')

# Accept the Coqui CPML license non-interactively
os.environ["COQUI_TOS_AGREED"] = "1"

import langid
import gradio as gr

from huggingface_hub import hf_hub_download
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir

HF_TOKEN = os.environ.get("HF_TOKEN")

# Authenticate and download the fine-tuned model from the Hugging Face Hub
repo_id = "Blakus/Pedro_Lab_XTTS"
local_dir = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2")
os.makedirs(local_dir, exist_ok=True)

files_to_download = ["config.json", "model.pth", "vocab.json"]
for file_name in files_to_download:
    print(f"Downloading {file_name} from {repo_id}")
    hf_hub_download(repo_id=repo_id, filename=file_name, local_dir=local_dir, token=HF_TOKEN)

# Load the configuration and the model
config_path = os.path.join(local_dir, "config.json")
checkpoint_path = os.path.join(local_dir, "model.pth")
vocab_path = os.path.join(local_dir, "vocab.json")

config = XttsConfig()
config.load_json(config_path)

model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_path=checkpoint_path, vocab_path=vocab_path, eval=True, use_deepspeed=False)

print("Model loaded on CPU")

# Global state and helpers kept from the original demo
DEVICE_ASSERT_DETECTED = 0
DEVICE_ASSERT_PROMPT = None
DEVICE_ASSERT_LANG = None
supported_languages = config.languages


# Inference function using the defaults from the model's config file
def predict(prompt, language, audio_file_pth, mic_file_path, use_mic):
    try:
        if use_mic:
            if mic_file_path is None:
                return None, None, "Seleccionaste el micrófono, pero no se grabó ningún audio."
            speaker_wav = mic_file_path
        else:
            speaker_wav = audio_file_pth

        if len(prompt) < 2 or len(prompt) > 200:
            return None, None, "El texto debe tener entre 2 y 200 caracteres."
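        # Hedged sketch: `langid` is imported above but never used, presumably
        # intended for the language-mismatch warning from the stock XTTS demo.
        # langid.classify() returns a (lang_code, score) pair; XTTS expects
        # "zh-cn" rather than "zh" for Chinese.
        language_predicted = langid.classify(prompt)[0].strip()
        if language_predicted == "zh":
            language_predicted = "zh-cn"
        if language_predicted != language:
            print(f"Aviso: idioma detectado '{language_predicted}' distinto del seleccionado '{language}'")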
        # Take generation parameters directly from the config, with fallbacks
        temperature = getattr(config, "temperature", 0.75)
        repetition_penalty = getattr(config, "repetition_penalty", 5.0)
        gpt_cond_len = getattr(config, "gpt_cond_len", 30)
        gpt_cond_chunk_len = getattr(config, "gpt_cond_chunk_len", 4)
        max_ref_length = getattr(config, "max_ref_len", 60)

        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
            audio_path=speaker_wav,
            gpt_cond_len=gpt_cond_len,
            gpt_cond_chunk_len=gpt_cond_chunk_len,
            max_ref_length=max_ref_length,
        )

        # Time the inference manually
        start_time = time.time()
        out = model.inference(
            prompt,
            language,
            gpt_cond_latent,
            speaker_embedding,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
        )
        inference_time = time.time() - start_time

        torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)

        # Compute metrics from the measured wall-clock time:
        # real-time factor = generation time / audio duration (lower is faster)
        audio_length = len(out["wav"]) / 24000  # audio duration in seconds
        real_time_factor = inference_time / audio_length

        metrics_text = f"Tiempo de generación: {inference_time:.2f} segundos\n"
        metrics_text += f"Factor de tiempo real: {real_time_factor:.2f}"

        return gr.make_waveform("output.wav"), "output.wav", metrics_text

    except Exception as e:
        print(f"Error detallado: {str(e)}")
        return None, None, f"Error: {str(e)}"


# Gradio interface without manual sliders; generation parameters come from the config
with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.Markdown("# Sintetizador de Voz XTTS")
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Texto a sintetizar",
                placeholder="Escribe aquí el texto que quieres convertir a voz...",
            )
            language = gr.Dropdown(label="Idioma", choices=supported_languages, value="es")
            audio_file = gr.Audio(label="Audio de referencia", type="filepath")
            use_mic = gr.Checkbox(label="Usar micrófono")
            mic_file = gr.Audio(label="Grabar con micrófono", source="microphone", type="filepath", visible=False)
            use_mic.change(fn=lambda x: gr.update(visible=x), inputs=[use_mic], outputs=[mic_file])
            generate_button = gr.Button("Generar voz")
        with gr.Column():
            output_audio = gr.Audio(label="Audio generado")
            # gr.make_waveform produces a video file, so expose it as a Video component
            waveform = gr.Video(label="Forma de onda")
            metrics = gr.Textbox(label="Métricas")

    generate_button.click(
        predict,
        inputs=[input_text, language, audio_file, mic_file, use_mic],
        outputs=[waveform, output_audio, metrics],
    )

demo.launch(debug=True)
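
# Hypothetical smoke test: bypass the UI and call predict() directly. The
# reference clip name "referencia.wav" is an assumption; any short WAV works.
# Run this instead of demo.launch() when debugging the model path:
#
#     waveform_video, wav_path, metrics = predict(
#         "Hola, esto es una prueba de síntesis.", "es", "referencia.wav", None, False
#     )
#     print(metrics)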