Spaces:

tobiccino
/

tts

Sleeping

File size: 4,546 Bytes

8c70653

from dataclasses import dataclass, field

from TTS.vocoder.configs.shared_configs import BaseVocoderConfig
from TTS.vocoder.models.wavernn import WavernnArgs


@dataclass
class WavernnConfig(BaseVocoderConfig):
    """Defines parameters for Wavernn vocoder.
    Example:

        >>> from TTS.vocoder.configs import WavernnConfig
        >>> config = WavernnConfig()

    Args:
        model (str):
            Model name used for selecting the right model at initialization. Defaults to `wavernn`.
        mode (str):
            Output mode of the WaveRNN vocoder. `mold` for Mixture of Logistic Distribution, `gauss` for a single
            Gaussian Distribution and `bits` for quantized bits as the model's output.
        mulaw (bool):
            enable / disable the use of Mulaw quantization for training. Only applicable if `mode == 'bits'`. Defaults
            to `True`.
        generator_model (str):
            One of the generators from TTS.vocoder.models.*`. Every other non-GAN vocoder model is
            considered as a generator too. Defaults to `WaveRNN`.
        wavernn_model_params (dict):
            kwargs for the WaveRNN model. Defaults to
            `{
                "rnn_dims": 512,
                "fc_dims": 512,
                "compute_dims": 128,
                "res_out_dims": 128,
                "num_res_blocks": 10,
                "use_aux_net": True,
                "use_upsample_net": True,
                "upsample_factors": [4, 8, 8]
            }`
        batched (bool):
            enable / disable the batched inference. It speeds up the inference by splitting the input into segments and
            processing the segments in a batch. Then it merges the outputs with a certain overlap and smoothing. If
            you set it False, without CUDA, it is too slow to be practical. Defaults to True.
        target_samples (int):
            Size of the segments in batched mode. Defaults to 11000.
        overlap_sampels (int):
            Size of the overlap between consecutive segments. Defaults to 550.
        batch_size (int):
            Batch size used at training. Larger values use more memory. Defaults to 256.
        seq_len (int):
            Audio segment length used at training. Larger values use more memory. Defaults to 1280.

        use_noise_augment (bool):
            enable / disable random noise added to the input waveform. The noise is added after computing the
            features. Defaults to True.
        use_cache (bool):
            enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
            not large enough. Defaults to True.
        mixed_precision (bool):
            enable / disable mixed precision training. Default is True.
        eval_split_size (int):
            Number of samples used for evalutaion. Defaults to 50.
        num_epochs_before_test (int):
            Number of epochs waited to run the next evalution. Since inference takes some time, it is better to
            wait some number of epochs not ot waste training time. Defaults to 10.
        grad_clip (float):
            Gradient clipping threshold. If <= 0.0, no clipping is applied. Defaults to 4.0
        lr (float):
            Initila leraning rate. Defaults to 1e-4.
        lr_scheduler (str):
            One of the learning rate schedulers from `torch.optim.scheduler.*`. Defaults to `MultiStepLR`.
        lr_scheduler_params (dict):
            kwargs for the scheduler. Defaults to `{"gamma": 0.5, "milestones": [200000, 400000, 600000]}`
    """

    model: str = "wavernn"

    # Model specific params
    model_args: WavernnArgs = field(default_factory=WavernnArgs)
    target_loss: str = "loss"

    # Inference
    batched: bool = True
    target_samples: int = 11000
    overlap_samples: int = 550

    # Training - overrides
    epochs: int = 10000
    batch_size: int = 256
    seq_len: int = 1280
    use_noise_augment: bool = False
    use_cache: bool = True
    mixed_precision: bool = True
    eval_split_size: int = 50
    num_epochs_before_test: int = (
        10  # number of epochs to wait until the next test run (synthesizing a full audio clip).
    )

    # optimizer overrides
    grad_clip: float = 4.0
    lr: float = 1e-4  # Initial learning rate.
    lr_scheduler: str = "MultiStepLR"  # one of the schedulers from https:#pytorch.org/docs/stable/optim.html
    lr_scheduler_params: dict = field(default_factory=lambda: {"gamma": 0.5, "milestones": [200000, 400000, 600000]})