Spaces:

softcatala
/

comparativa-tts-catala

Running

App Files Files Community

ccoreilly committed on Apr 29, 2023

Commit

7f0efc6

•

1 Parent(s): 58b3ffd

Afegeix models

Browse files

Files changed (13) hide show

Dockerfile +2 -2
models/bsc/best_model.pth +3 -0
models/bsc/config.json +262 -0
models/bsc/speaker_map.json +10 -0
models/bsc/speakers.pth +3 -0
models/collectivat/catotron-ona-TTS-API-entry.json +10 -0
models/collectivat/fast-speech_best_model.pth +3 -0
models/collectivat/fast-speech_config.json +213 -0
models/collectivat/ljspeech--hifigan_v2_config.json +158 -0
models/collectivat/ljspeech--hifigan_v2_model_file.pth +3 -0
models/piper/MODEL_CARD +15 -0
models/piper/ca-upc_ona-x-low.onnx +3 -0
models/piper/ca-upc_ona-x-low.onnx.json +409 -0

Dockerfile CHANGED Viewed

@@ -12,6 +12,7 @@ RUN cd espeak-ng && \
 COPY requirements.txt .
 COPY app.py .
 RUN pip install -r requirements.txt
@@ -20,7 +21,6 @@ RUN mkdir -p cache && chmod 777 cache
 ENV NUMBA_CACHE_DIR=./cache
 ENV MPLCONFIGDIR=./cache
 EXPOSE 7860
-CMD python app.py

 COPY requirements.txt .
 COPY app.py .
+COPY models .
 RUN pip install -r requirements.txt
 ENV NUMBA_CACHE_DIR=./cache
 ENV MPLCONFIGDIR=./cache
 EXPOSE 7860
+CMD ["python", "app.py"]

models/bsc/best_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b15fa7d2052bada1cf421e49d2d03b00e95b49fcd0e42b7af1d92da2880cdecc
+size 1038659133

models/bsc/config.json ADDED Viewed

	@@ -0,0 +1,262 @@

+{
+    "output_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/experiments_from_previous",
+    "logger_uri": null,
+    "run_name": "multispeaker_vits_ca_1e4_1e4_32",
+    "project_name": null,
+    "run_description": "\ud83d\udc38Coqui trainer run.",
+    "print_step": 25,
+    "plot_step": 100,
+    "model_param_stats": false,
+    "wandb_entity": null,
+    "dashboard_logger": "tensorboard",
+    "log_model_step": 1000,
+    "save_step": 1000,
+    "save_n_checkpoints": 5,
+    "save_checkpoints": true,
+    "save_all_best": true,
+    "save_best_after": 10000,
+    "target_loss": null,
+    "print_eval": true,
+    "test_delay_epochs": -1,
+    "run_eval": true,
+    "run_eval_steps": null,
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp://localhost:54321",
+    "mixed_precision": false,
+    "epochs": 1000,
+    "batch_size": 16,
+    "eval_batch_size": 8,
+    "grad_clip": [
+        1000.0,
+        1000.0
+    ],
+    "scheduler_after_epoch": true,
+    "lr": 0.001,
+    "optimizer": "AdamW",
+    "optimizer_params": {
+        "betas": [
+            0.8,
+            0.99
+        ],
+        "eps": 1e-09,
+        "weight_decay": 0.01
+    },
+    "lr_scheduler": "",
+    "lr_scheduler_params": null,
+    "use_grad_scaler": false,
+    "cudnn_enable": true,
+    "cudnn_deterministic": false,
+    "cudnn_benchmark": false,
+    "training_seed": 54321,
+    "model": "vits",
+    "num_loader_workers": 4,
+    "num_eval_loader_workers": 4,
+    "use_noise_augment": false,
+    "audio": {
+        "fft_size": 1024,
+        "sample_rate": 22050,
+        "win_length": 1024,
+        "hop_length": 256,
+        "num_mels": 80,
+        "mel_fmin": 0,
+        "mel_fmax": null
+    },
+    "use_phonemes": true,
+    "phonemizer": "espeak",
+    "phoneme_language": "ca",
+    "compute_input_seq_cache": true,
+    "text_cleaner": "multilingual_cleaners",
+    "enable_eos_bos_chars": false,
+    "test_sentences_file": "",
+    "phoneme_cache_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/phoneme_cache",
+    "characters": {
+        "characters_class": "TTS.tts.utils.text.characters.IPAPhonemes",
+        "vocab_dict": null,
+        "pad": "<PAD>",
+        "eos": "<EOS>",
+        "bos": "<BOS>",
+        "blank": "<BLNK>",
+        "characters": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b",
+        "punctuations": "!'(),-.:;? ",
+        "phonemes": null,
+        "is_unique": false,
+        "is_sorted": true
+    },
+    "add_blank": true,
+    "batch_group_size": 5,
+    "loss_masking": null,
+    "min_audio_len": 1,
+    "max_audio_len": Infinity,
+    "min_text_len": 1,
+    "max_text_len": 325,
+    "compute_f0": false,
+    "compute_linear_spec": true,
+    "precompute_num_workers": 0,
+    "start_by_longest": false,
+    "datasets": [
+        {
+            "formatter": "vctk_old",
+            "dataset_name": "vctk_old",
+            "path": "/gpfs/scratch/bsc88/bsc88474/data/multispeaker_ca",
+            "meta_file_train": "",
+            "ignored_speakers": [
+                "uri",
+                "09796",
+                "05450"
+            ],
+            "language": "ca",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        }
+    ],
+    "test_sentences": [
+        [
+            "Per exemple, dels nostres bancs que inverteixen en armament de les nostres empreses."
+        ],
+        [
+            "Preguntin-se si aix\u00f2 era necessari."
+        ],
+        [
+            "La suposada ocultaci\u00f3 dels informes que advertien de risc s\u00edsmic."
+        ],
+        [
+            "\u00c9s de 633 milions d'euros quan es far\u00e0 la publicaci\u00f3 detallada."
+        ]
+    ],
+    "eval_split_max_size": null,
+    "eval_split_size": 0.01,
+    "use_speaker_weighted_sampler": false,
+    "speaker_weighted_sampler_alpha": 1.0,
+    "use_language_weighted_sampler": false,
+    "language_weighted_sampler_alpha": 1.0,
+    "use_length_weighted_sampler": false,
+    "length_weighted_sampler_alpha": 1.0,
+    "model_args": {
+        "num_chars": 131,
+        "out_channels": 513,
+        "spec_segment_size": 32,
+        "hidden_channels": 192,
+        "hidden_channels_ffn_text_encoder": 768,
+        "num_heads_text_encoder": 2,
+        "num_layers_text_encoder": 6,
+        "kernel_size_text_encoder": 3,
+        "dropout_p_text_encoder": 0.1,
+        "dropout_p_duration_predictor": 0.5,
+        "kernel_size_posterior_encoder": 5,
+        "dilation_rate_posterior_encoder": 1,
+        "num_layers_posterior_encoder": 16,
+        "kernel_size_flow": 5,
+        "dilation_rate_flow": 1,
+        "num_layers_flow": 4,
+        "resblock_type_decoder": "1",
+        "resblock_kernel_sizes_decoder": [
+            3,
+            7,
+            11
+        ],
+        "resblock_dilation_sizes_decoder": [
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ]
+        ],
+        "upsample_rates_decoder": [
+            8,
+            8,
+            2,
+            2
+        ],
+        "upsample_initial_channel_decoder": 512,
+        "upsample_kernel_sizes_decoder": [
+            16,
+            16,
+            4,
+            4
+        ],
+        "periods_multi_period_discriminator": [
+            2,
+            3,
+            5,
+            7,
+            11
+        ],
+        "use_sdp": true,
+        "noise_scale": 1.0,
+        "inference_noise_scale": 0.667,
+        "length_scale": 1.0,
+        "noise_scale_dp": 1.0,
+        "inference_noise_scale_dp": 1.0,
+        "max_inference_len": null,
+        "init_discriminator": true,
+        "use_spectral_norm_disriminator": false,
+        "use_speaker_embedding": true,
+        "num_speakers": 257,
+        "speakers_file": "/home/user/app/speakers.pth",
+        "d_vector_file": null,
+        "speaker_embedding_channels": 256,
+        "use_d_vector_file": false,
+        "d_vector_dim": 0,
+        "detach_dp_input": true,
+        "use_language_embedding": false,
+        "embedded_language_dim": 4,
+        "num_languages": 0,
+        "language_ids_file": null,
+        "use_speaker_encoder_as_loss": false,
+        "speaker_encoder_config_path": "",
+        "speaker_encoder_model_path": "",
+        "condition_dp_on_speaker": true,
+        "freeze_encoder": false,
+        "freeze_DP": false,
+        "freeze_PE": false,
+        "freeze_flow_decoder": false,
+        "freeze_waveform_decoder": false,
+        "encoder_sample_rate": null,
+        "interpolate_z": true,
+        "reinit_DP": false,
+        "reinit_text_encoder": false
+    },
+    "lr_gen": 0.0001,
+    "lr_disc": 0.0001,
+    "lr_scheduler_gen": "ExponentialLR",
+    "lr_scheduler_gen_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "lr_scheduler_disc": "ExponentialLR",
+    "lr_scheduler_disc_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "kl_loss_alpha": 1.0,
+    "disc_loss_alpha": 1.0,
+    "gen_loss_alpha": 1.0,
+    "feat_loss_alpha": 1.0,
+    "mel_loss_alpha": 45.0,
+    "dur_loss_alpha": 1.0,
+    "speaker_encoder_loss_alpha": 1.0,
+    "return_wav": true,
+    "use_weighted_sampler": false,
+    "weighted_sampler_attrs": null,
+    "weighted_sampler_multipliers": null,
+    "r": 1,
+    "num_speakers": 257,
+    "use_speaker_embedding": true,
+    "speakers_file": "/home/user/app/speakers.pth",
+    "speaker_embedding_channels": 256,
+    "language_ids_file": null,
+    "use_language_embedding": false,
+    "use_d_vector_file": false,
+    "d_vector_file": null,
+    "d_vector_dim": 0
+}

models/bsc/speaker_map.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "f_cen_05": "05739",
+    "f_cen_81": "8162d651b6211f06f655a69cd7fdd383d6b4287e9ba132b9898ef9ac8687349e777626333d23bed93f9264aae965efb14ed650cb64fd0ad90494aff903eaef11",
+    "f_occ_31": "31535cb2ece4710d08fdbeefb6f8f75ed093fee4cf8573bd601d960f8c6156f0fd0a85712761691e86e31160b993ee0eacb10c4c8aed000cc394cf7c7d207a7e",
+    "f_occ_de": "dee065b956b99b10db4763759d64c41791af1a7e77f1864f90a2b0847a12633dcf9bc108db7eaf73cc8d0e750f5c37383a56cd77cc2276d3960104c6bebe6346",
+    "f_sep_31": "31e6f3a011661320b2e59b6f8be43f6db2243e9feabc2b9787c1413788e13eb0e5810bed983bf7ff66e46417d183a91ed50b3b9be9d89e4f51aada72293b9881",
+    "m_cen_08": "08935",
+    "m_occ_44": "30b1f81c579755895581259d79a8a5a3ca45b908b0bd14ad1c6418f39aa1e2f47cb4749c69b5440cdb92e3bafb772e19e7bc2b16d196b061addd173a1309e491",
+    "m_val_89": "896256329fbeb5b8116349c31d8a39a7d36d5f970d48558e1db5417d611e240e4dbf473f6e49137f7aa6116394b7deabb0bbec4a014896cdc9484ee91458117d"
+}

models/bsc/speakers.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6dacda0b8dd3e111c5072f8f33c08b4a29b92ac79aaf22ceca912d01e7deb905
+size 30191

models/collectivat/catotron-ona-TTS-API-entry.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "voice": "ona-fast-hifigan",
+    "lang": "ca",
+    "model_type": "coqui",
+    "tts_config_path": "fast-speech_config.json",
+    "tts_model_path": "fast-speech_best_model.pth",
+    "vocoder_config_path": "ljspeech--hifigan_v2_config.json",
+    "vocoder_model_path": "ljspeech--hifigan_v2_model_file.pth",
+    "load": true
+}

models/collectivat/fast-speech_best_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3a5aefb9f49f6172e34b816e1de8f5234012f0a9a05747973f6610e40869983f
+size 457921637

models/collectivat/fast-speech_config.json ADDED Viewed

	@@ -0,0 +1,213 @@

+{
+    "output_path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron",
+    "logger_uri": null,
+    "run_name": "fast_pitch_ljspeech",
+    "project_name": null,
+    "run_description": "\ud83d\udc38Coqui trainer run.",
+    "print_step": 50,
+    "plot_step": 100,
+    "model_param_stats": false,
+    "wandb_entity": null,
+    "dashboard_logger": "tensorboard",
+    "log_model_step": null,
+    "save_step": 10000,
+    "save_n_checkpoints": 5,
+    "save_checkpoints": true,
+    "save_all_best": false,
+    "save_best_after": 1000,
+    "target_loss": null,
+    "print_eval": false,
+    "test_delay_epochs": -1,
+    "run_eval": true,
+    "run_eval_steps": null,
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp://localhost:54321",
+    "mixed_precision": false,
+    "epochs": 1000,
+    "batch_size": 16,
+    "eval_batch_size": 16,
+    "grad_clip": 5.0,
+    "scheduler_after_epoch": true,
+    "lr": 0.0001,
+    "optimizer": "Adam",
+    "optimizer_params": {
+        "betas": [
+            0.9,
+            0.998
+        ],
+        "weight_decay": 1e-06
+    },
+    "lr_scheduler": "NoamLR",
+    "lr_scheduler_params": {
+        "warmup_steps": 4000
+    },
+    "use_grad_scaler": false,
+    "cudnn_enable": true,
+    "cudnn_deterministic": false,
+    "cudnn_benchmark": false,
+    "training_seed": 54321,
+    "model": "fast_pitch",
+    "num_loader_workers": 8,
+    "num_eval_loader_workers": 4,
+    "use_noise_augment": false,
+    "audio": {
+        "fft_size": 1024,
+        "win_length": 1024,
+        "hop_length": 256,
+        "frame_shift_ms": null,
+        "frame_length_ms": null,
+        "stft_pad_mode": "reflect",
+        "sample_rate": 22050,
+        "resample": false,
+        "preemphasis": 0.0,
+        "ref_level_db": 20,
+        "do_sound_norm": false,
+        "log_func": "np.log",
+        "do_trim_silence": true,
+        "trim_db": 60.0,
+        "do_rms_norm": false,
+        "db_level": null,
+        "power": 1.5,
+        "griffin_lim_iters": 60,
+        "num_mels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": 8000,
+        "spec_gain": 1.0,
+        "do_amp_to_db_linear": true,
+        "do_amp_to_db_mel": true,
+        "pitch_fmax": 640.0,
+        "pitch_fmin": 0.0,
+        "signal_norm": false,
+        "min_level_db": -100,
+        "symmetric_norm": true,
+        "max_norm": 4.0,
+        "clip_norm": true,
+        "stats_path": null
+    },
+    "use_phonemes": false,
+    "phonemizer": null,
+    "phoneme_language": "ca-es",
+    "compute_input_seq_cache": true,
+    "text_cleaner": "multilingual_cleaners",
+    "enable_eos_bos_chars": false,
+    "test_sentences_file": "",
+    "phoneme_cache_path": null,
+    "characters": {
+        "characters_class": "TTS.tts.utils.text.characters.Graphemes",
+        "vocab_dict": null,
+        "pad": "_",
+        "eos": "*",
+        "bos": "^",
+        "blank": null,
+        "characters": "A\u00c0\u00c1BC\u00c7DE\u00c9\u00c8FGHI\u00cd\u00cfJKLMNO\u00d3\u00d2PQRSTU\u00dc\u00daVWXYZa\u00e0\u00e1bc\u00e7de\u00e9\u00e8fghi\u00ed\u00efjklmno\u00f3\u00f2pqrstu\u00fc\u00favwxyz",
+        "punctuations": "!'(),-.:;?\u00b7 ",
+        "phonemes": "",
+        "is_unique": true,
+        "is_sorted": true
+    },
+    "add_blank": false,
+    "batch_group_size": 0,
+    "loss_masking": null,
+    "min_audio_len": 1,
+    "max_audio_len": Infinity,
+    "min_text_len": 1,
+    "max_text_len": Infinity,
+    "compute_f0": true,
+    "compute_linear_spec": false,
+    "precompute_num_workers": 4,
+    "start_by_longest": false,
+    "datasets": [
+        {
+            "name": "custom_turkish",
+            "path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/upc_ona",
+            "meta_file_train": "upc_ona_train.txt",
+            "ignored_speakers": null,
+            "language": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "name": "custom_turkish",
+            "path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/upc_ona",
+            "meta_file_train": "upc_ona_val.txt",
+            "ignored_speakers": null,
+            "language": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        }
+    ],
+    "test_sentences": [
+        "Hola Barcelona!",
+        "Escriviu al text."
+    ],
+    "eval_split_max_size": null,
+    "eval_split_size": 0.01,
+    "use_speaker_weighted_sampler": false,
+    "speaker_weighted_sampler_alpha": 1.0,
+    "use_language_weighted_sampler": false,
+    "language_weighted_sampler_alpha": 1.0,
+    "use_length_weighted_sampler": false,
+    "length_weighted_sampler_alpha": 1.0,
+    "base_model": "forward_tts",
+    "model_args": {
+        "num_chars": 89,
+        "out_channels": 80,
+        "hidden_channels": 384,
+        "use_aligner": true,
+        "use_pitch": true,
+        "pitch_predictor_hidden_channels": 256,
+        "pitch_predictor_kernel_size": 3,
+        "pitch_predictor_dropout_p": 0.1,
+        "pitch_embedding_kernel_size": 3,
+        "duration_predictor_hidden_channels": 256,
+        "duration_predictor_kernel_size": 3,
+        "duration_predictor_dropout_p": 0.1,
+        "positional_encoding": true,
+        "poisitonal_encoding_use_scale": true,
+        "length_scale": 1,
+        "encoder_type": "fftransformer",
+        "encoder_params": {
+            "hidden_channels_ffn": 1024,
+            "num_heads": 1,
+            "num_layers": 6,
+            "dropout_p": 0.1
+        },
+        "decoder_type": "fftransformer",
+        "decoder_params": {
+            "hidden_channels_ffn": 1024,
+            "num_heads": 1,
+            "num_layers": 6,
+            "dropout_p": 0.1
+        },
+        "detach_duration_predictor": false,
+        "max_duration": 75,
+        "num_speakers": 1,
+        "use_speaker_embedding": false,
+        "speakers_file": null,
+        "use_d_vector_file": false,
+        "d_vector_dim": null,
+        "d_vector_file": null
+    },
+    "num_speakers": 0,
+    "speakers_file": null,
+    "use_speaker_embedding": false,
+    "use_d_vector_file": false,
+    "d_vector_file": false,
+    "d_vector_dim": 0,
+    "spec_loss_type": "mse",
+    "duration_loss_type": "mse",
+    "use_ssim_loss": true,
+    "ssim_loss_alpha": 1.0,
+    "spec_loss_alpha": 1.0,
+    "aligner_loss_alpha": 1.0,
+    "pitch_loss_alpha": 0.1,
+    "dur_loss_alpha": 0.1,
+    "binary_align_loss_alpha": 0.1,
+    "binary_loss_warmup_epochs": 150,
+    "min_seq_len": 13,
+    "max_seq_len": 500000,
+    "r": 1,
+    "f0_cache_path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/f0_cache",
+    "restore_path": "/home/twbgmy/.local/share/tts/tts_models--en--ljspeech--fast_pitch/model_file.pth",
+    "github_branch": "* dev"
+}

models/collectivat/ljspeech--hifigan_v2_config.json ADDED Viewed

	@@ -0,0 +1,158 @@

+{
+    "run_name": "hifigan",
+    "run_description": "universal hifigan trained on LibriTTS with no spectrogram normalization and using log() for scaling instead of log10()",
+    // AUDIO PARAMETERS
+    "audio":{
+        "fft_size": 1024,         // number of stft frequency levels. Size of the linear spectrogram frame.
+        "win_length": 1024,      // stft window length in samples.
+        "hop_length": 256,       // stft window hop-length in samples.
+        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null,  // stft window hop-length in ms. If null, 'hop_length' is used.
+        // Audio processing parameters
+        "sample_rate": 22050,   // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
+        "preemphasis": 0.0,     // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
+        "ref_level_db": 20,     // reference level db, theoretically 20db is the sound of air.
+        "log_func": "np.log",
+        // Silence trimming
+        "do_trim_silence": false,// enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+        "trim_db": 60,          // threshold for trimming silence. Set this according to your dataset.
+        // MelSpectrogram parameters
+        "num_mels": 80,         // size of the mel spec frame.
+        "mel_fmin": 0.0,        // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+        "mel_fmax": 8000.0,     // maximum freq level for mel-spec. Tune for dataset!!
+        "spec_gain": 1.0,         // scalar value applied after log transform of spectrogram.
+        // Normalization parameters
+        "signal_norm": false,    // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
+        "min_level_db": -100,   // lower bound for normalization
+        "symmetric_norm": true, // move normalization to range [-1, 1]
+        "max_norm": 4.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+        "clip_norm": true,      // clip normalized values into the range.
+        "stats_path": null    // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
+    },
+    // DISTRIBUTED TRAINING
+    "distributed":{
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54324"
+    },
+    // MODEL PARAMETERS
+    "use_pqmf": false,
+    // LOSS PARAMETERS
+    "use_stft_loss": false,
+    "use_subband_stft_loss": false,
+    "use_mse_gan_loss": true,
+    "use_hinge_gan_loss": false,
+    "use_feat_match_loss": true,  // use only with melgan discriminators
+    "use_l1_spec_loss": true,
+    // loss weights
+    "stft_loss_weight": 0,
+    "subband_stft_loss_weight": 0,
+    "mse_G_loss_weight": 1,
+    "hinge_G_loss_weight": 0,
+    "feat_match_loss_weight": 10,
+    "l1_spec_loss_weight": 45,
+    // multiscale stft loss parameters
+    // "stft_loss_params": {
+    //     "n_ffts": [1024, 2048, 512],
+    //     "hop_lengths": [120, 240, 50],
+    //     "win_lengths": [600, 1200, 240]
+    // },
+    "l1_spec_loss_params": {
+        "use_mel": true,
+        "sample_rate": 16000,
+        "n_fft": 1024,
+        "hop_length": 256,
+        "win_length": 1024,
+        "n_mels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": null
+    },
+    "target_loss": "avg_G_loss",  // loss value to pick the best model to save after each epoch
+    // DISCRIMINATOR
+    "discriminator_model": "hifigan_discriminator",
+    //"discriminator_model_params":{
+    //    "peroids": [2, 3, 5, 7, 11],
+    //    "base_channels": 16,
+    //    "max_channels":512,
+    //    "downsample_factors":[4, 4, 4]
+    //},
+    "steps_to_start_discriminator": 0,      // steps required to start GAN training.
+    // GENERATOR
+    "generator_model": "hifigan_generator",
+    "generator_model_params": {
+        "resblock_type": "1",
+        "upsample_factors": [8,8,2,2],
+        "upsample_kernel_sizes": [16,16,4,4],
+        "upsample_initial_channel": 128,
+        "resblock_kernel_sizes": [3,7,11],
+        "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]]
+    },
+    // DATASET
+    "data_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/vo_voice_quality_transformation/",
+    "feature_path": null,
+    // "feature_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/tacotron-DCA/",
+    "seq_len": 8192,
+    "pad_short": 2000,
+    "conv_pad": 0,
+    "use_noise_augment": false,
+    "use_cache": true,
+    "reinit_layers": [],    // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
+    // TRAINING
+    "batch_size": 16,       // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
+    // VALIDATION
+    "run_eval": true,
+    "test_delay_epochs": 10,  //Until attention is aligned, testing only wastes computation time.
+    "test_sentences_file": null,  // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
+    // OPTIMIZER
+    "epochs": 10000,                // total number of epochs to train.
+    "wd": 0.0,                // Weight decay weight.
+    "gen_clip_grad": -1,      // Generator gradient clipping threshold. Apply gradient clipping if > 0
+    "disc_clip_grad": -1,     // Discriminator gradient clipping threshold.
+    // "lr_scheduler_gen": "ExponentialLR",   // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+    // "lr_scheduler_gen_params": {
+    //    "gamma": 0.999,
+        // "last_epoch": -1
+    // },
+    // "lr_scheduler_disc": "ExponentialLR",   // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+    // "lr_scheduler_disc_params": {
+    	//   "gamma": 0.999,
+        // "last_epoch": -1
+    // },
+    "lr_gen": 0.00001,                  // Initial learning rate. If Noam decay is active, maximum learning rate.
+    "lr_disc": 0.00001,
+    // TENSORBOARD and LOGGING
+    "print_step": 25,       // Number of steps to log training on console.
+    "print_eval": false,     // If True, it prints loss values for each step in eval run.
+    "save_step": 25000,      // Number of training steps expected to plot training stats on TB and save model checkpoints.
+    "checkpoint": true,     // If true, it saves checkpoints per "save_step"
+    "tb_model_param_stats": false,     // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+    // DATA LOADING
+    "num_loader_workers": 8,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_val_loader_workers": 4,    // number of evaluation data loader processes.
+    "eval_split_size": 10,
+    // PATHS
+    "output_path": "/home/erogol/gdrive/Trainings/sam/"
+}

models/collectivat/ljspeech--hifigan_v2_model_file.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4047e93886faa1aba11948efa71f59dcb0ec9117e286660e59b91892ef98d129
+size 3794153

models/piper/MODEL_CARD ADDED Viewed

	@@ -0,0 +1,15 @@

+# Model card for upc_ona (x-low)
+* Language: ca (Catalan)
+* Speakers: 1
+* Quality: x-low
+* Samplerate: 16,000Hz
+## Dataset
+* URL: https://collectivat.cat/asr#upc-festcat-tts-corpora
+* License: CC BY-SA 3.0 ES
+## Training
+Trained from scratch.

models/piper/ca-upc_ona-x-low.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:13661d26423e0c791823823a5971f4e1aaf644a62e65e0e94d299c0e70560e14
+size 20628813

models/piper/ca-upc_ona-x-low.onnx.json ADDED Viewed

	@@ -0,0 +1,409 @@

+{
+    "audio": {
+        "sample_rate": 16000
+    },
+    "espeak": {
+        "voice": "ca"
+    },
+    "inference": {
+        "noise_scale": 0.667,
+        "length_scale": 1,
+        "noise_w": 0.8
+    },
+    "phoneme_map": {},
+    "phoneme_id_map": {
+        "_": [
+            0
+        ],
+        "^": [
+            1
+        ],
+        "$": [
+            2
+        ],
+        " ": [
+            3
+        ],
+        "!": [
+            4
+        ],
+        "'": [
+            5
+        ],
+        "(": [
+            6
+        ],
+        ")": [
+            7
+        ],
+        ",": [
+            8
+        ],
+        "-": [
+            9
+        ],
+        ".": [
+            10
+        ],
+        ":": [
+            11
+        ],
+        ";": [
+            12
+        ],
+        "?": [
+            13
+        ],
+        "a": [
+            14
+        ],
+        "b": [
+            15
+        ],
+        "c": [
+            16
+        ],
+        "d": [
+            17
+        ],
+        "e": [
+            18
+        ],
+        "f": [
+            19
+        ],
+        "h": [
+            20
+        ],
+        "i": [
+            21
+        ],
+        "j": [
+            22
+        ],
+        "k": [
+            23
+        ],
+        "l": [
+            24
+        ],
+        "m": [
+            25
+        ],
+        "n": [
+            26
+        ],
+        "o": [
+            27
+        ],
+        "p": [
+            28
+        ],
+        "q": [
+            29
+        ],
+        "r": [
+            30
+        ],
+        "s": [
+            31
+        ],
+        "t": [
+            32
+        ],
+        "u": [
+            33
+        ],
+        "v": [
+            34
+        ],
+        "w": [
+            35
+        ],
+        "x": [
+            36
+        ],
+        "y": [
+            37
+        ],
+        "z": [
+            38
+        ],
+        "æ": [
+            39
+        ],
+        "ç": [
+            40
+        ],
+        "ð": [
+            41
+        ],
+        "ø": [
+            42
+        ],
+        "ħ": [
+            43
+        ],
+        "ŋ": [
+            44
+        ],
+        "œ": [
+            45
+        ],
+        "ǀ": [
+            46
+        ],
+        "ǁ": [
+            47
+        ],
+        "ǂ": [
+            48
+        ],
+        "ǃ": [
+            49
+        ],
+        "ɐ": [
+            50
+        ],
+        "ɑ": [
+            51
+        ],
+        "ɒ": [
+            52
+        ],
+        "ɓ": [
+            53
+        ],
+        "ɔ": [
+            54
+        ],
+        "ɕ": [
+            55
+        ],
+        "ɖ": [
+            56
+        ],
+        "ɗ": [
+            57
+        ],
+        "ɘ": [
+            58
+        ],
+        "ə": [
+            59
+        ],
+        "ɚ": [
+            60
+        ],
+        "ɛ": [
+            61
+        ],
+        "ɜ": [
+            62
+        ],
+        "ɞ": [
+            63
+        ],
+        "ɟ": [
+            64
+        ],
+        "ɠ": [
+            65
+        ],
+        "ɡ": [
+            66
+        ],
+        "ɢ": [
+            67
+        ],
+        "ɣ": [
+            68
+        ],
+        "ɤ": [
+            69
+        ],
+        "ɥ": [
+            70
+        ],
+        "ɦ": [
+            71
+        ],
+        "ɧ": [
+            72
+        ],
+        "ɨ": [
+            73
+        ],
+        "ɪ": [
+            74
+        ],
+        "ɫ": [
+            75
+        ],
+        "ɬ": [
+            76
+        ],
+        "ɭ": [
+            77
+        ],
+        "ɮ": [
+            78
+        ],
+        "ɯ": [
+            79
+        ],
+        "ɰ": [
+            80
+        ],
+        "ɱ": [
+            81
+        ],
+        "ɲ": [
+            82
+        ],
+        "ɳ": [
+            83
+        ],
+        "ɴ": [
+            84
+        ],
+        "ɵ": [
+            85
+        ],
+        "ɶ": [
+            86
+        ],
+        "ɸ": [
+            87
+        ],
+        "ɹ": [
+            88
+        ],
+        "ɺ": [
+            89
+        ],
+        "ɻ": [
+            90
+        ],
+        "ɽ": [
+            91
+        ],
+        "ɾ": [
+            92
+        ],
+        "ʀ": [
+            93
+        ],
+        "ʁ": [
+            94
+        ],
+        "ʂ": [
+            95
+        ],
+        "ʃ": [
+            96
+        ],
+        "ʄ": [
+            97
+        ],
+        "ʈ": [
+            98
+        ],
+        "ʉ": [
+            99
+        ],
+        "ʊ": [
+            100
+        ],
+        "ʋ": [
+            101
+        ],
+        "ʌ": [
+            102
+        ],
+        "ʍ": [
+            103
+        ],
+        "ʎ": [
+            104
+        ],
+        "ʏ": [
+            105
+        ],
+        "ʐ": [
+            106
+        ],
+        "ʑ": [
+            107
+        ],
+        "ʒ": [
+            108
+        ],
+        "ʔ": [
+            109
+        ],
+        "ʕ": [
+            110
+        ],
+        "ʘ": [
+            111
+        ],
+        "ʙ": [
+            112
+        ],
+        "ʛ": [
+            113
+        ],
+        "ʜ": [
+            114
+        ],
+        "ʝ": [
+            115
+        ],
+        "ʟ": [
+            116
+        ],
+        "ʡ": [
+            117
+        ],
+        "ʢ": [
+            118
+        ],
+        "ʲ": [
+            119
+        ],
+        "ˈ": [
+            120
+        ],
+        "ˌ": [
+            121
+        ],
+        "ː": [
+            122
+        ],
+        "ˑ": [
+            123
+        ],
+        "˞": [
+            124
+        ],
+        "β": [
+            125
+        ],
+        "θ": [
+            126
+        ],
+        "χ": [
+            127
+        ],
+        "ᵻ": [
+            128
+        ],
+        "ⱱ": [
+            129
+        ]
+    },
+    "num_symbols": 130,
+    "num_speakers": 1,
+    "speaker_id_map": {}
+}