upload
This view is limited to 50 files because the commit contains too many changes.
- .DS_Store +0 -0
- LICENSE +20 -0
- README.md +116 -12
- Untitled.ipynb +24 -0
- app.py +7 -0
- assets/.DS_Store +0 -0
- assets/hifigan/config.json +38 -0
- assets/infore/.DS_Store +0 -0
- assets/infore/lexicon.txt +0 -0
- assets/transcript.txt +26 -0
- notebooks/align_text_audio_infore_mfa.ipynb +193 -0
- notebooks/denoise_infore_dataset.ipynb +138 -0
- scripts/download_aligned_infore_dataset.py +45 -0
- scripts/quick_start.sh +12 -0
- setup.cfg +14 -0
- setup.py +43 -0
- tests/test_nat_acoustic.py +18 -0
- tests/test_nat_duration.py +15 -0
- vietTTS.egg-info/PKG-INFO +11 -0
- vietTTS.egg-info/SOURCES.txt +10 -0
- vietTTS.egg-info/dependency_links.txt +1 -0
- vietTTS.egg-info/requires.txt +12 -0
- vietTTS.egg-info/top_level.txt +1 -0
- vietTTS/__init__.py +0 -0
- vietTTS/__pycache__/__init__.cpython-39.pyc +0 -0
- vietTTS/__pycache__/synthesizer.cpython-39.pyc +0 -0
- vietTTS/hifigan/__pycache__/config.cpython-39.pyc +0 -0
- vietTTS/hifigan/__pycache__/mel2wave.cpython-39.pyc +0 -0
- vietTTS/hifigan/__pycache__/model.cpython-39.pyc +0 -0
- vietTTS/hifigan/config.py +6 -0
- vietTTS/hifigan/convert_torch_model_to_haiku.py +83 -0
- vietTTS/hifigan/create_mel.py +241 -0
- vietTTS/hifigan/data_loader.py +0 -0
- vietTTS/hifigan/mel2wave.py +41 -0
- vietTTS/hifigan/model.py +125 -0
- vietTTS/hifigan/torch_model.py +414 -0
- vietTTS/hifigan/trainer.py +0 -0
- vietTTS/nat/__init__.py +0 -0
- vietTTS/nat/__pycache__/__init__.cpython-39.pyc +0 -0
- vietTTS/nat/__pycache__/config.cpython-39.pyc +0 -0
- vietTTS/nat/__pycache__/data_loader.cpython-39.pyc +0 -0
- vietTTS/nat/__pycache__/model.cpython-39.pyc +0 -0
- vietTTS/nat/__pycache__/text2mel.cpython-39.pyc +0 -0
- vietTTS/nat/acoustic_tpu_trainer.py +189 -0
- vietTTS/nat/acoustic_trainer.py +181 -0
- vietTTS/nat/config.py +74 -0
- vietTTS/nat/data_loader.py +156 -0
- vietTTS/nat/dsp.py +128 -0
- vietTTS/nat/duration_trainer.py +142 -0
- vietTTS/nat/gta.py +82 -0
.DS_Store
ADDED
Binary file (6.15 kB).
LICENSE
ADDED
@@ -0,0 +1,20 @@
Copyright (c) 2021 ntt123

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
README.md
CHANGED
@@ -1,12 +1,116 @@
[the 12 removed lines are not rendered in this view]

A Vietnamese TTS
================

Duration model + Acoustic model + HiFiGAN vocoder for a Vietnamese text-to-speech application.

Online demo at https://huggingface.co/spaces/ntt123/vietTTS.

A synthesized audio clip: [clip.wav](assets/infore/clip.wav). A colab notebook: [notebook](https://colab.research.google.com/drive/1oczrWOQOr1Y_qLdgis1twSlNZlfPVXoY?usp=sharing).

🔔 Check out the experimental `multi-speaker` branch (`git checkout multi-speaker`) for multi-speaker support. 🔔

Install
-------

```sh
git clone https://github.com/NTT123/vietTTS.git
cd vietTTS
pip3 install -e .
```

Quick start using pretrained models
-----------------------------------

```sh
bash ./scripts/quick_start.sh
```

Download InfoRe dataset
-----------------------

```sh
python ./scripts/download_aligned_infore_dataset.py
```

**Note**: this is a denoised and aligned version of the original dataset, donated by the InfoRe Technology company (see [here](https://www.facebook.com/groups/j2team.community/permalink/1010834009248719/)). You can download the original dataset (**InfoRe Technology 1**) [here](https://github.com/TensorSpeech/TensorFlowASR/blob/main/README.md#vietnamese).

See `notebooks/denoise_infore_dataset.ipynb` for instructions on how to denoise the dataset. We use the Montreal Forced Aligner (MFA) to align transcripts and speech (TextGrid files).
See `notebooks/align_text_audio_infore_mfa.ipynb` for instructions on how to create the TextGrid files.

Train duration model
--------------------

```sh
python -m vietTTS.nat.duration_trainer
```

Train acoustic model
--------------------

```sh
python -m vietTTS.nat.acoustic_trainer
```

Train HiFiGAN vocoder
---------------------

We use the original implementation from the HiFiGAN authors at https://github.com/jik876/hifi-gan. Use the config file at `assets/hifigan/config.json` to train your model.

```sh
git clone https://github.com/jik876/hifi-gan.git

# create dataset in hifi-gan format
ln -sf `pwd`/train_data hifi-gan/data
cd hifi-gan/data
ls -1 *.TextGrid | sed -e 's/\.TextGrid$//' > files.txt
cd ..
head -n 100 data/files.txt > val_files.txt
tail -n +101 data/files.txt > train_files.txt
rm data/files.txt

# training
python train.py \
  --config ../assets/hifigan/config.json \
  --input_wavs_dir=data \
  --input_training_file=train_files.txt \
  --input_validation_file=val_files.txt
```

Finetune on Ground-Truth Aligned (GTA) melspectrograms:

```sh
cd /path/to/vietTTS # go to vietTTS directory
python -m vietTTS.nat.zero_silence_segments -o train_data # zero all [sil, sp, spn] segments
python -m vietTTS.nat.gta -o /path/to/hifi-gan/ft_dataset # create GTA melspectrograms at the hifi-gan/ft_dataset directory

# turn on finetuning
cd /path/to/hifi-gan
python train.py \
  --fine_tuning True \
  --config ../assets/hifigan/config.json \
  --input_wavs_dir=data \
  --input_training_file=train_files.txt \
  --input_validation_file=val_files.txt
```

Then, use the following command to convert the PyTorch model to Haiku format:

```sh
cd ..
python -m vietTTS.hifigan.convert_torch_model_to_haiku \
  --config-file=assets/hifigan/config.json \
  --checkpoint-file=hifi-gan/cp_hifigan/g_[latest_checkpoint]
```

Synthesize speech
-----------------

```sh
python -m vietTTS.synthesizer \
  --lexicon-file=train_data/lexicon.txt \
  --text="hôm qua em tới trường" \
  --output=clip.wav
```
Untitled.ipynb
ADDED
@@ -0,0 +1,24 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b036f793-1443-4341-932c-d112386937ea",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": ""
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
app.py
ADDED
@@ -0,0 +1,7 @@
import gradio as gr

def greet(name):
    return "Hello " + name + "!!"

iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()
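Note: `app.py` above is only the placeholder Gradio app for the Hugging Face Space; it does not call the TTS models. Below is a minimal sketch of how it could be wired to the synthesizer. The `text2mel` function name is an assumption based on the `text2mel.cpython-39.pyc` file elsewhere in this commit; its signature is not confirmed by the diff.

```python
# Hypothetical TTS-wired app.py (a sketch, NOT part of this commit).
import gradio as gr

from vietTTS.hifigan.mel2wave import mel2wave   # present in this commit
from vietTTS.nat.text2mel import text2mel       # assumed API

def tts(text):
    mel = text2mel(text)   # text -> mel spectrogram (assumed signature)
    wav = mel2wave(mel)    # mel -> waveform, see vietTTS/hifigan/mel2wave.py
    return (16000, wav)    # Gradio audio output: (sample_rate, samples)

iface = gr.Interface(fn=tts, inputs="text", outputs="audio")
iface.launch()
```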
assets/.DS_Store
ADDED
Binary file (6.15 kB).
assets/hifigan/config.json
ADDED
@@ -0,0 +1,38 @@
{
    "resblock": "1",
    "num_gpus": 0,
    "batch_size": 16,
    "learning_rate": 0.0002,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999,
    "seed": 1234,

    "upsample_rates": [8,8,2,2],
    "upsample_kernel_sizes": [16,16,4,4],
    "upsample_initial_channel": 512,
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
    "resblock_initial_channel": 256,

    "segment_size": 8192,
    "num_mels": 80,
    "num_freq": 1025,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,

    "sampling_rate": 16000,

    "fmin": 0,
    "fmax": 8000,
    "fmax_for_loss": null,

    "num_workers": 4,

    "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "world_size": 1
    }
}
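Note: this config couples the vocoder to the mel front end. The product of `upsample_rates` must equal `hop_size`, so each mel frame expands into exactly one hop of waveform. A quick arithmetic check:

```python
# Consistency check for assets/hifigan/config.json.
upsample_rates = [8, 8, 2, 2]
hop_size = 256
sampling_rate = 16000

total = 1
for r in upsample_rates:
    total *= r
assert total == hop_size         # 8 * 8 * 2 * 2 == 256

print(sampling_rate / hop_size)  # 62.5 mel frames per second of audio
```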
assets/infore/.DS_Store
ADDED
Binary file (6.15 kB).
assets/infore/lexicon.txt
ADDED
The diff for this file is too large to render.
assets/transcript.txt
ADDED
@@ -0,0 +1,26 @@
Trăm năm trong cõi người ta,
Chữ tài chữ mệnh khéo là ghét nhau.
Trải qua một cuộc bể dâu,
Những điều trông thấy mà đau đớn lòng.
Lạ gì bỉ sắc tư phong,
Trời xanh quen thói má hồng đánh ghen.
Cảo thơm lần giở trước đèn,
Phong tình cổ lục còn truyền sử xanh.
Rằng: Năm Gia tĩnh triều Minh,
Bốn phương phẳng lặng hai kinh chữ vàng.
Có nhà viên ngoại họ Vương,
Gia tư nghỉ cũng thường thường bậc trung.
Một trai con thứ rốt lòng,
Vương Quan là chữ nối dòng nho gia.
Đầu lòng hai ả tố nga,
Thúy Kiều là chị em là Thúy Vân.
Mai cốt cách tuyết tinh thần,
Mỗi người một vẻ mười phân vẹn mười.
Vân xem trang trọng khác vời,
Khuôn trăng đầy đặn nét ngài nở nang.
Hoa cười ngọc thốt đoan trang,
Mây thua nước tóc tuyết nhường màu da.
Kiều càng sắc sảo mặn mà,
So bề tài sắc lại là phần hơn.
Làn thu thủy nét xuân sơn,
Hoa ghen thua thắm liễu hờn kém xanh.
notebooks/align_text_audio_infore_mfa.ipynb
ADDED
@@ -0,0 +1,193 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Align text and audio using Montreal Forced Aligner (MFA)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "IPkicKwU8IWj"
   },
   "outputs": [],
   "source": [
    "%%capture\n",
    "!apt update -y\n",
    "!pip install -U pip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "G6Z-aDd08hfk"
   },
   "outputs": [],
   "source": [
    "%%capture\n",
    "%%bash\n",
    "data_root=\"./infore_16k_denoised\"\n",
    "mkdir -p $data_root\n",
    "cd $data_root\n",
    "wget https://huggingface.co/datasets/ntt123/infore/resolve/main/infore_16k_denoised.zip -O infore.zip\n",
    "unzip infore.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "VWwgAePDXy4m"
   },
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "txt_files = sorted(Path(\"./infore_16k_denoised\").glob(\"*.txt\"))\n",
    "f = open(\"/content/words.txt\", \"w\", encoding=\"utf-8\")\n",
    "for txt_file in txt_files:\n",
    "  wav_file = txt_file.with_suffix(\".wav\")\n",
    "  if not wav_file.exists():\n",
    "    continue\n",
    "  line = open(txt_file, \"r\", encoding=\"utf-8\").read()\n",
    "  for word in line.strip().lower().split():\n",
    "    f.write(word)\n",
    "    f.write(\"\\n\")\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "FktjNXbDkBLh"
   },
   "outputs": [],
   "source": [
    "black_list = (\n",
    "  []\n",
    "  + [\"q\", \"adn\", \"h\", \"stress\", \"b\", \"k\", \"mark\", \"gas\", \"cs\", \"test\", \"l\", \"hiv\"]\n",
    "  + [\"v\", \"d\", \"c\", \"p\", \"martin\", \"visa\", \"euro\", \"laser\", \"x\", \"real\", \"shop\"]\n",
    "  + [\"studio\", \"kelvin\", \"đt\", \"pop\", \"rock\", \"gara\", \"karaoke\", \"đicr\", \"đigiúp\"]\n",
    "  + [\"khmer\", \"ii\", \"s\", \"tr\", \"xhcn\", \"casino\", \"guitar\", \"sex\", \"oxi\", \"radio\"]\n",
    "  + [\"qúy\", \"asean\", \"hlv\", \"ts\", \"video\", \"virus\", \"usd\", \"robot\", \"ph\", \"album\"]\n",
    "  + [\"s\", \"kg\", \"km\", \"g\", \"tr\", \"đ\", \"ak\", \"d\", \"m\", \"n\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "b3nMwfzK_g0B"
   },
   "outputs": [],
   "source": [
    "ws = open(\"/content/words.txt\").readlines()\n",
    "f = open(\"/content/lexicon.txt\", \"w\")\n",
    "for w in sorted(set(ws)):\n",
    "  w = w.strip()\n",
    "\n",
    "  # this is a hack to match phoneme set in the vietTTS repo\n",
    "  p = list(w)\n",
    "  p = \" \".join(p)\n",
    "  if w in black_list:\n",
    "    continue\n",
    "  else:\n",
    "    f.write(f\"{w}\\t{p}\\n\")\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "WuWZKTNRt1eM"
   },
   "outputs": [],
   "source": [
    "%%writefile install_mfa.sh\n",
    "#!/bin/bash\n",
    "\n",
    "## a script to install Montreal Forced Aligner (MFA)\n",
    "\n",
    "root_dir=${1:-/tmp/mfa}\n",
    "mkdir -p $root_dir\n",
    "cd $root_dir\n",
    "\n",
    "# download miniconda3\n",
    "wget -q --show-progress https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh\n",
    "bash Miniconda3-latest-Linux-x86_64.sh -b -p $root_dir/miniconda3 -f\n",
    "\n",
    "# install MFA\n",
    "$root_dir/miniconda3/bin/conda create -n aligner -c conda-forge montreal-forced-aligner=2.0.0rc7 -y\n",
    "\n",
    "echo -e \"\\n======== DONE ==========\"\n",
    "echo -e \"\\nTo activate MFA, run: source $root_dir/miniconda3/bin/activate aligner\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "osR7KJCNXJYq"
   },
   "outputs": [],
   "source": [
    "# download and install mfa\n",
    "INSTALL_DIR = \"/tmp/mfa\"  # path to install directory\n",
    "!bash ./install_mfa.sh {INSTALL_DIR}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "hxbXwJZlXLPz",
    "outputId": "d3e40ec5-68a7-40ec-d070-137736d7a956"
   },
   "outputs": [],
   "source": [
    "!source {INSTALL_DIR}/miniconda3/bin/activate aligner; \\\n",
    "mfa train --clean -t ./temp -o ./infore_mfa.zip ./infore_16k_denoised lexicon.txt ./infore_textgrid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "8Z65_BtXagn1"
   },
   "outputs": [],
   "source": [
    "# copy to train directory\n",
    "!mkdir -p train_data\n",
    "!cp ./infore_16k_denoised/*.wav ./train_data\n",
    "!cp ./infore_textgrid/*.TextGrid ./train_data"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "align-text-audio | InfoRe using MFA v2rc7.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
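Note: the lexicon cell above uses a deliberate shortcut. Each character of a word is treated as its own "phoneme", so MFA trains a grapheme-level aligner and no hand-built Vietnamese phone set is needed. (The original cell also had a missing comma between "hlv" and "ts" in the black list, which Python would silently concatenate into "hlvts"; it is fixed above.) A one-line illustration of the entry format the cell writes:

```python
# One lexicon entry as produced by the notebook: word<TAB>graphemes.
word = "trường"
print(f"{word}\t{' '.join(list(word))}")  # trường	t r ư ờ n g
```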
notebooks/denoise_infore_dataset.ipynb
ADDED
@@ -0,0 +1,138 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "qjubHCEzYtG8"
   },
   "source": [
    "### Step 1. Download InfoRE dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "zCBJzJi6BE_o"
   },
   "outputs": [],
   "source": [
    "%%capture\n",
    "%%bash\n",
    "mkdir -p /content/data\n",
    "cd /content/data\n",
    "wget https://huggingface.co/datasets/ntt123/infore/resolve/main/infore_16k.zip\n",
    "# unzip -P BroughtToYouByInfoRe 25hours.zip\n",
    "unzip infore_16k.zip"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "6C47hb9nYzmB"
   },
   "source": [
    "### Step 2. Normalize audio clip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Hp9TK8PbBcQM"
   },
   "outputs": [],
   "source": [
    "%%capture\n",
    "!sudo apt install -y sox\n",
    "!pip install soundfile librosa\n",
    "!pip install onnxruntime==1.11.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "FW45D8xM9Mcc",
    "outputId": "8d7ea7a9-ea5a-48ca-88fe-37dd4ed55d9b"
   },
   "outputs": [],
   "source": [
    "!mkdir -p /content/infore_16k\n",
    "from pathlib import Path\n",
    "import os\n",
    "from tqdm.cli import tqdm\n",
    "\n",
    "wavs = sorted(Path(\"/content/data/InfoRe\").glob(\"*.wav\"))\n",
    "for path in tqdm(wavs):\n",
    "  out = Path(\"/content/infore_16k\") / path.name\n",
    "  cmd = f\"sox {path} -c 1 -e signed-integer -b 16 -r 16k --norm=-3 {out}\"\n",
    "  os.system(cmd)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "kooiBrsQY5sQ"
   },
   "source": [
    "### Step 3. Denoise using DNS-Challenge's baseline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "hsXeNkZ3Xacj"
   },
   "outputs": [],
   "source": [
    "!git clone https://github.com/microsoft/DNS-Challenge\n",
    "%cd DNS-Challenge/NSNet2-baseline/\n",
    "!git checkout -f 8b87a33b2892f147b5c7ad39ea978453730db269\n",
    "!python run_nsnet2.py -i /content/infore_16k/ -o /content/infore_16k_denoised -m ./nsnet2-20ms-baseline.onnx"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "T5JtZwKgZI4r"
   },
   "source": [
    "### Step 4. Zip the denoised dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "eeFggV0uYop_"
   },
   "outputs": [],
   "source": [
    "%cd /content\n",
    "!cp /content/data/InfoRe/*.txt ./infore_16k_denoised\n",
    "!cd ./infore_16k_denoised; zip -r ../infore_16k_denoised.zip ."
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "prepare_infore_dataset.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
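Note: the sox command in Step 2 downmixes to one channel, converts to 16-bit signed PCM, resamples to 16 kHz, and peak-normalizes to -3 dBFS. A small sanity check on an output file (a sketch; the file name is hypothetical, and `soundfile` is the package installed earlier in the notebook):

```python
import soundfile as sf

# "0001.wav" is a hypothetical example name from /content/infore_16k.
data, sr = sf.read("/content/infore_16k/0001.wav")
assert sr == 16000              # from `-r 16k`
assert data.ndim == 1           # mono, from `-c 1`
assert abs(data).max() <= 0.71  # about -3 dBFS, from `--norm=-3`
```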
scripts/download_aligned_infore_dataset.py
ADDED
@@ -0,0 +1,45 @@
"""
A script to download the InfoRE dataset and textgrid files.
"""
import shutil
from pathlib import Path

import pooch
from pooch import Unzip
from tqdm.cli import tqdm


def download_infore_data():
    """download infore wav files"""
    files = pooch.retrieve(
        url="https://huggingface.co/datasets/ntt123/infore/resolve/main/infore_16k_denoised.zip",
        known_hash="2445527b345fb0b1816ce3c8f09bae419d6bbe251f16d6c74d8dd95ef9fb0737",
        processor=Unzip(),
        progressbar=True,
    )
    data_dir = Path(sorted(files)[0]).parent
    return data_dir


def download_textgrid():
    """download textgrid files"""
    files = pooch.retrieve(
        url="https://huggingface.co/datasets/ntt123/infore/resolve/main/infore_tg.zip",
        known_hash="26e4f53025220097ea95dc266657de8d65104b0a17a6ffba778fc016c8dd36d7",
        processor=Unzip(),
        progressbar=True,
    )
    data_dir = Path(sorted(files)[0]).parent
    return data_dir


DATA_ROOT = Path("./train_data")
DATA_ROOT.mkdir(parents=True, exist_ok=True)
wav_dir = download_infore_data()
tg_dir = download_textgrid()

for path in tqdm(tg_dir.glob("*.TextGrid")):
    wav_name = path.with_suffix(".wav").name
    wav_src = wav_dir / wav_name
    shutil.copy(path, DATA_ROOT)
    shutil.copy(wav_src, DATA_ROOT)
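Note: `pooch.retrieve` verifies each download against its SHA-256 `known_hash`, caches the archive, and (with the `Unzip` processor) returns the extracted member paths; the loop then copies each TextGrid together with the wav sharing its stem. A quick sketch to confirm the pairing after the script runs:

```python
from pathlib import Path

train_data = Path("./train_data")
textgrids = sorted(train_data.glob("*.TextGrid"))
missing = [p.name for p in textgrids if not p.with_suffix(".wav").exists()]
print(f"{len(textgrids)} TextGrid files, {len(missing)} without a matching wav")
```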
scripts/quick_start.sh
ADDED
@@ -0,0 +1,12 @@
if [ ! -f assets/infore/hifigan/g_01140000 ]; then
  echo "Downloading models..."
  mkdir -p assets/infore/{nat,hifigan}
  wget https://huggingface.co/ntt123/viettts_infore_16k/resolve/main/duration_latest_ckpt.pickle -O assets/infore/nat/duration_latest_ckpt.pickle
  wget https://huggingface.co/ntt123/viettts_infore_16k/resolve/main/acoustic_latest_ckpt.pickle -O assets/infore/nat/acoustic_latest_ckpt.pickle
  wget https://huggingface.co/ntt123/viettts_infore_16k/resolve/main/g_01140000 -O assets/infore/hifigan/g_01140000
  python3 -m vietTTS.hifigan.convert_torch_model_to_haiku --config-file=assets/hifigan/config.json --checkpoint-file=assets/infore/hifigan/g_01140000
fi

echo "Generate audio clip"
text=`cat assets/transcript.txt`
python3 -m vietTTS.synthesizer --text "$text" --output assets/infore/clip.wav --lexicon-file assets/infore/lexicon.txt --silence-duration 0.2
setup.cfg
ADDED
@@ -0,0 +1,14 @@
[pep8]
max-line-length = 120
indent-size = 2

[pycodestyle]
max-line-length = 120

[yapf]
based_on_style = pep8
column_limit = 120

[tool:pytest]
testpaths=
    tests
setup.py
ADDED
@@ -0,0 +1,43 @@
from setuptools import setup

__version__ = "0.4.1"
url = "https://github.com/ntt123/vietTTS"

install_requires = [
    "dm-haiku",
    "einops",
    "fire",
    "gdown",
    "jax",
    "jaxlib",
    "librosa",
    "optax",
    "tabulate",
    "textgrid @ git+https://github.com/kylebgorman/textgrid.git",
    "tqdm",
    "matplotlib",
]
setup_requires = []
tests_require = []

setup(
    name="vietTTS",
    version=__version__,
    description="A vietnamese text-to-speech library.",
    author="ntt123",
    url=url,
    keywords=[
        "text-to-speech",
        "tts",
        "deep-learning",
        "dm-haiku",
        "jax",
        "vietnamese",
        "speech-synthesis",
    ],
    install_requires=install_requires,
    setup_requires=setup_requires,
    tests_require=tests_require,
    packages=["vietTTS"],
    python_requires=">=3.7",
)
tests/test_nat_acoustic.py
ADDED
@@ -0,0 +1,18 @@
import haiku
import haiku as hk
import jax.numpy as jnp
import jax.random
from vietTTS.nat.config import FLAGS
from vietTTS.nat.model import AcousticModel


@hk.testing.transform_and_run
def test_duration():
    net = AcousticModel()
    token = jnp.zeros((2, 10), dtype=jnp.int32)
    lengths = jnp.zeros((2,), dtype=jnp.int32)
    durations = jnp.zeros((2, 10), dtype=jnp.float32)
    mel = jnp.zeros((2, 20, 160), dtype=jnp.float32)
    o1, o2 = net(token, mel, lengths, durations)
    assert o1.shape == (2, 20, 160)
    assert o2.shape == (2, 20, 160)
tests/test_nat_duration.py
ADDED
@@ -0,0 +1,15 @@
import haiku
import haiku as hk
import jax.numpy as jnp
import jax.random
from vietTTS.nat.config import FLAGS
from vietTTS.nat.model import DurationModel


@hk.testing.transform_and_run
def test_duration():
    net = DurationModel()
    p = jnp.zeros((2, 10), dtype=jnp.int32)
    l = jnp.zeros((2,), dtype=jnp.int32)
    o = net(p, l)
    assert o.shape == (2, 10, 1)
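Note: both tests use `hk.testing.transform_and_run`, which wraps the function body in a Haiku transform and runs init and apply with a test RNG, so modules can be constructed and called directly inside an ordinary pytest function. A minimal self-contained sketch of the same pattern:

```python
import haiku as hk
import jax.numpy as jnp


@hk.testing.transform_and_run
def test_linear_shape():
    # The decorator supplies the hk.transform / init / apply plumbing.
    layer = hk.Linear(4)
    y = layer(jnp.zeros((2, 3)))
    assert y.shape == (2, 4)
```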
vietTTS.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,11 @@
Metadata-Version: 1.2
Name: vietTTS
Version: 0.4.1
Summary: A vietnamese text-to-speech library.
Home-page: https://github.com/ntt123/vietTTS
Author: ntt123
License: UNKNOWN
Description: UNKNOWN
Keywords: text-to-speech,tts,deep-learning,dm-haiku,jax,vietnamese,speech-synthesis
Platform: UNKNOWN
Requires-Python: >=3.7
vietTTS.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,10 @@
README.md
setup.cfg
setup.py
vietTTS/__init__.py
vietTTS/synthesizer.py
vietTTS.egg-info/PKG-INFO
vietTTS.egg-info/SOURCES.txt
vietTTS.egg-info/dependency_links.txt
vietTTS.egg-info/requires.txt
vietTTS.egg-info/top_level.txt
vietTTS.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
[a single blank line]
vietTTS.egg-info/requires.txt
ADDED
@@ -0,0 +1,12 @@
dm-haiku
einops
fire
gdown
jax
jaxlib
librosa
optax
tabulate
textgrid@ git+https://github.com/kylebgorman/textgrid.git
tqdm
matplotlib
vietTTS.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
vietTTS
vietTTS/__init__.py
ADDED
File without changes
vietTTS/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (154 Bytes).
vietTTS/__pycache__/synthesizer.cpython-39.pyc
ADDED
Binary file (1.39 kB).
vietTTS/hifigan/__pycache__/config.cpython-39.pyc
ADDED
Binary file (437 Bytes).
vietTTS/hifigan/__pycache__/mel2wave.cpython-39.pyc
ADDED
Binary file (1.51 kB).
vietTTS/hifigan/__pycache__/model.cpython-39.pyc
ADDED
Binary file (3.8 kB).
vietTTS/hifigan/config.py
ADDED
@@ -0,0 +1,6 @@
from pathlib import Path
from typing import NamedTuple


class FLAGS:
    ckpt_dir = Path("./assets/infore/hifigan")
vietTTS/hifigan/convert_torch_model_to_haiku.py
ADDED
@@ -0,0 +1,83 @@
import argparse
import json
import os
import pickle

import numpy as np
import torch

from .config import FLAGS
from .torch_model import Generator


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def load_checkpoint(filepath, device):
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    checkpoint_dict = torch.load(filepath, map_location=device)
    print("Complete.")
    return checkpoint_dict


def convert_to_haiku(a, h, device):
    generator = Generator(h).to(device)
    state_dict_g = load_checkpoint(a.checkpoint_file, device)
    generator.load_state_dict(state_dict_g["generator"])
    generator.eval()
    generator.remove_weight_norm()
    hk_map = {}
    for a, b in generator.state_dict().items():
        print(a, b.shape)
        if a.startswith("conv_pre"):
            a = "generator/~/conv1_d"
        elif a.startswith("conv_post"):
            a = "generator/~/conv1_d_1"
        elif a.startswith("ups."):
            ii = a.split(".")[1]
            a = f"generator/~/ups_{ii}"
        elif a.startswith("resblocks."):
            _, x, y, z, _ = a.split(".")
            ver = h.resblock
            a = f"generator/~/res_block{ver}_{x}/~/{y}_{z}"
        print(a, b.shape)
        if a not in hk_map:
            hk_map[a] = {}
        if len(b.shape) == 1:
            hk_map[a]["b"] = b.numpy()
        else:
            if "ups" in a:
                hk_map[a]["w"] = np.rot90(b.numpy(), k=1, axes=(0, 2))
            elif "conv" in a:
                hk_map[a]["w"] = np.swapaxes(b.numpy(), 0, 2)
            else:
                hk_map[a]["w"] = b.numpy()

    FLAGS.ckpt_dir.mkdir(parents=True, exist_ok=True)
    with open(FLAGS.ckpt_dir / "hk_hifi.pickle", "wb") as f:
        pickle.dump(hk_map, f)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint-file", required=True)
    parser.add_argument("--config-file", required=True)
    a = parser.parse_args()

    config_file = a.config_file
    with open(config_file) as f:
        data = f.read()

    json_config = json.loads(data)
    h = AttrDict(json_config)

    device = torch.device("cpu")
    convert_to_haiku(a, h, device)


if __name__ == "__main__":
    main()
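Note: the core of the converter is the weight-layout change. PyTorch `Conv1d` stores weights as `(out_channels, in_channels, kernel)` while Haiku's `Conv1D` expects `(kernel, in_channels, out_channels)`, which is exactly `np.swapaxes(w, 0, 2)`; for the transposed convolutions, `np.rot90(w, k=1, axes=(0, 2))` additionally reverses the kernel axis, absorbing the kernel-flip convention difference between the two frameworks. A small shape check (channel sizes here are arbitrary):

```python
import numpy as np

out_ch, in_ch, k = 512, 80, 7

w_torch = np.zeros((out_ch, in_ch, k))  # PyTorch Conv1d layout
w_haiku = np.swapaxes(w_torch, 0, 2)    # Haiku Conv1D layout
assert w_haiku.shape == (k, in_ch, out_ch)

w = np.arange(2 * 3 * 4).reshape(2, 3, 4)
w_rot = np.rot90(w, k=1, axes=(0, 2))   # swaps axes 0 and 2 and flips one
assert w_rot.shape == (4, 3, 2)
```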
vietTTS/hifigan/create_mel.py
ADDED
@@ -0,0 +1,241 @@
import math
import os
import random
import torch
import torch.utils.data
import numpy as np
from librosa.util import normalize
from scipy.io.wavfile import read
from librosa.filters import mel as librosa_mel_fn

MAX_WAV_VALUE = 32768.0


def load_wav(full_path):
    sampling_rate, data = read(full_path)
    return data, sampling_rate


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)


def dynamic_range_decompression(x, C=1):
    return np.exp(x) / C


def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression_torch(x, C=1):
    return torch.exp(x) / C


def spectral_normalize_torch(magnitudes):
    output = dynamic_range_compression_torch(magnitudes)
    return output


def spectral_de_normalize_torch(magnitudes):
    output = dynamic_range_decompression_torch(magnitudes)
    return output


mel_basis = {}
hann_window = {}


def mel_spectrogram(
    y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
):
    if torch.min(y) < -1.0:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.0:
        print("max value is ", torch.max(y))

    global mel_basis, hann_window
    if fmax not in mel_basis:
        mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
        mel_basis[str(fmax) + "_" + str(y.device)] = (
            torch.from_numpy(mel).float().to(y.device)
        )
        hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)

    y = torch.nn.functional.pad(
        y.unsqueeze(1),
        (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
        mode="reflect",
    )
    y = y.squeeze(1)

    spec = torch.stft(
        y,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window[str(y.device)],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
    )

    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

    spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec)
    spec = spectral_normalize_torch(spec)

    return spec


def get_dataset_filelist(a):
    with open(a.input_training_file, "r", encoding="utf-8") as fi:
        training_files = [
            os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav")
            for x in fi.read().split("\n")
            if len(x) > 0
        ]

    with open(a.input_validation_file, "r", encoding="utf-8") as fi:
        validation_files = [
            os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav")
            for x in fi.read().split("\n")
            if len(x) > 0
        ]
    return training_files, validation_files


class MelDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        training_files,
        segment_size,
        n_fft,
        num_mels,
        hop_size,
        win_size,
        sampling_rate,
        fmin,
        fmax,
        split=True,
        shuffle=True,
        n_cache_reuse=1,
        device=None,
        fmax_loss=None,
        fine_tuning=False,
        base_mels_path=None,
    ):
        self.audio_files = training_files
        random.seed(1234)
        if shuffle:
            random.shuffle(self.audio_files)
        self.segment_size = segment_size
        self.sampling_rate = sampling_rate
        self.split = split
        self.n_fft = n_fft
        self.num_mels = num_mels
        self.hop_size = hop_size
        self.win_size = win_size
        self.fmin = fmin
        self.fmax = fmax
        self.fmax_loss = fmax_loss
        self.cached_wav = None
        self.n_cache_reuse = n_cache_reuse
        self._cache_ref_count = 0
        self.device = device
        self.fine_tuning = fine_tuning
        self.base_mels_path = base_mels_path

    def __getitem__(self, index):
        filename = self.audio_files[index]
        if self._cache_ref_count == 0:
            audio, sampling_rate = load_wav(filename)
            audio = audio / MAX_WAV_VALUE
            if not self.fine_tuning:
                audio = normalize(audio) * 0.95
            self.cached_wav = audio
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    "{} SR doesn't match target {} SR".format(
                        sampling_rate, self.sampling_rate
                    )
                )
            self._cache_ref_count = self.n_cache_reuse
        else:
            audio = self.cached_wav
            self._cache_ref_count -= 1

        audio = torch.FloatTensor(audio)
        audio = audio.unsqueeze(0)

        if not self.fine_tuning:
            if self.split:
                if audio.size(1) >= self.segment_size:
                    max_audio_start = audio.size(1) - self.segment_size
                    audio_start = random.randint(0, max_audio_start)
                    audio = audio[:, audio_start : audio_start + self.segment_size]
                else:
                    audio = torch.nn.functional.pad(
                        audio, (0, self.segment_size - audio.size(1)), "constant"
                    )

            mel = mel_spectrogram(
                audio,
                self.n_fft,
                self.num_mels,
                self.sampling_rate,
                self.hop_size,
                self.win_size,
                self.fmin,
                self.fmax,
                center=False,
            )
        else:
            mel = np.load(
                os.path.join(
                    self.base_mels_path,
                    os.path.splitext(os.path.split(filename)[-1])[0] + ".npy",
                )
            )
            mel = torch.from_numpy(mel)

            if len(mel.shape) < 3:
                mel = mel.unsqueeze(0)

            if self.split:
                frames_per_seg = math.ceil(self.segment_size / self.hop_size)

                if audio.size(1) >= self.segment_size:
                    mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
                    mel = mel[:, :, mel_start : mel_start + frames_per_seg]
                    audio = audio[
                        :,
                        mel_start
                        * self.hop_size : (mel_start + frames_per_seg)
                        * self.hop_size,
                    ]
                else:
                    mel = torch.nn.functional.pad(
                        mel, (0, frames_per_seg - mel.size(2)), "constant"
                    )
                    audio = torch.nn.functional.pad(
                        audio, (0, self.segment_size - audio.size(1)), "constant"
                    )

        mel_loss = mel_spectrogram(
            audio,
            self.n_fft,
            self.num_mels,
            self.sampling_rate,
            self.hop_size,
            self.win_size,
            self.fmin,
            self.fmax_loss,
            center=False,
        )

        return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())

    def __len__(self):
        return len(self.audio_files)
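Note: `create_mel.py` mirrors HiFi-GAN's original `meldataset.py`. Two version caveats worth flagging rather than silently editing the commit: `librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)` relies on the positional signature of librosa < 0.10 (newer releases require keyword arguments), and `torch.stft` without `return_complex` is an error in PyTorch >= 2.0. A usage sketch of `mel_spectrogram` with the values from `assets/hifigan/config.json`:

```python
import torch
# from vietTTS.hifigan.create_mel import mel_spectrogram

y = torch.zeros(1, 16000)  # one second of silence, shape (batch, samples)
mel = mel_spectrogram(
    y, n_fft=1024, num_mels=80, sampling_rate=16000,
    hop_size=256, win_size=1024, fmin=0, fmax=8000, center=False,
)
print(mel.shape)  # torch.Size([1, 80, 62]): 80 mel bins, ~62 frames per second
```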
vietTTS/hifigan/data_loader.py
ADDED
File without changes
vietTTS/hifigan/mel2wave.py
ADDED
@@ -0,0 +1,41 @@
import json
import os
import pickle

import haiku as hk
import jax
import jax.numpy as jnp
import numpy as np

from .config import FLAGS
from .model import Generator


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def mel2wave(mel):
    config_file = "assets/hifigan/config.json"
    MAX_WAV_VALUE = 32768.0
    with open(config_file) as f:
        data = f.read()
    json_config = json.loads(data)
    h = AttrDict(json_config)

    @hk.transform_with_state
    def forward(x):
        net = Generator(h)
        return net(x)

    rng = next(hk.PRNGSequence(42))

    with open(FLAGS.ckpt_dir / "hk_hifi.pickle", "rb") as f:
        params = pickle.load(f)
    aux = {}
    wav, aux = forward.apply(params, aux, rng, mel)
    wav = jnp.squeeze(wav)
    audio = jax.device_get(wav)
    return audio
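Note: `mel2wave` re-creates the Generator inside `hk.transform_with_state`, loads the pickled parameters written by the converter, and applies the network. Haiku's `Conv1D` is channels-last, so the mel input should be shaped `(batch, frames, num_mels)`. A hedged usage sketch (dummy input; writing the wav with `soundfile` is an assumption, not part of the commit):

```python
import numpy as np
import soundfile as sf
# from vietTTS.hifigan.mel2wave import mel2wave

mel = np.zeros((1, 100, 80), dtype=np.float32)  # 100 frames, 80 mel bins
wav = mel2wave(mel)
print(wav.shape)  # (25600,): 100 frames * hop_size 256 samples
sf.write("out.wav", np.asarray(wav), 16000)
```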
vietTTS/hifigan/model.py
ADDED
@@ -0,0 +1,125 @@
import haiku as hk
import jax
import jax.numpy as jnp

LRELU_SLOPE = 0.1


def get_padding(kernel_size, dilation=1):
    p = int((kernel_size * dilation - dilation) / 2)
    return ((p, p),)


class ResBlock1(hk.Module):
    def __init__(
        self, h, channels, kernel_size=3, dilation=(1, 3, 5), name="resblock1"
    ):
        super().__init__(name=name)

        self.h = h
        self.convs1 = [
            hk.Conv1D(
                channels,
                kernel_size,
                1,
                rate=dilation[i],
                padding=get_padding(kernel_size, dilation[i]),
                name=f"convs1_{i}",
            )
            for i in range(3)
        ]

        self.convs2 = [
            hk.Conv1D(
                channels,
                kernel_size,
                1,
                rate=1,
                padding=get_padding(kernel_size, 1),
                name=f"convs2_{i}",
            )
            for i in range(3)
        ]

    def __call__(self, x):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = jax.nn.leaky_relu(x, LRELU_SLOPE)
            xt = c1(xt)
            xt = jax.nn.leaky_relu(xt, LRELU_SLOPE)
            xt = c2(xt)
            x = xt + x
        return x


class ResBlock2(hk.Module):
    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), name="ResBlock2"):
        super().__init__(name=name)
        self.h = h
        self.convs = [
            hk.Conv1D(
                channels,
                kernel_size,
                1,
                rate=dilation[i],
                padding=get_padding(kernel_size, dilation[i]),
            )
            for i in range(2)
        ]

    def __call__(self, x):
        for c in self.convs:
            xt = jax.nn.leaky_relu(x, LRELU_SLOPE)
            xt = c(xt)
            x = xt + x
        return x


class Generator(hk.Module):
    def __init__(self, h):
        super().__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        self.conv_pre = hk.Conv1D(h.upsample_initial_channel, 7, 1, padding=((3, 3),))
        resblock = ResBlock1 if h.resblock == "1" else ResBlock2
        self.ups = []
        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
            self.ups.append(
                hk.Conv1DTranspose(
                    h.upsample_initial_channel // (2 ** (i + 1)),
                    kernel_shape=k,
                    stride=u,
                    padding="SAME",
                    name=f"ups_{i}",
                )
            )

        self.resblocks = []

        for i in range(len(self.ups)):
            ch = h.upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(
                zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)
            ):
                self.resblocks.append(
                    resblock(h, ch, k, d, name=f"res_block1_{len(self.resblocks)}")
                )
        self.conv_post = hk.Conv1D(1, 7, 1, padding=((3, 3),))

    def __call__(self, x):
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = jax.nn.leaky_relu(x, LRELU_SLOPE)

            x = self.ups[i](x)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = jax.nn.leaky_relu(x)  # default pytorch value
        x = self.conv_post(x)
        x = jnp.tanh(x)
        return x
vietTTS/hifigan/torch_model.py
ADDED
@@ -0,0 +1,414 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm

# from utils import init_weights, get_padding

LRELU_SLOPE = 0.1


def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


class ResBlock1(torch.nn.Module):
    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        self.h = h
        self.convs1 = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[0],
                        padding=get_padding(kernel_size, dilation[0]),
                    )
                ),
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[1],
                        padding=get_padding(kernel_size, dilation[1]),
                    )
                ),
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[2],
                        padding=get_padding(kernel_size, dilation[2]),
                    )
                ),
            ]
        )
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=1,
                        padding=get_padding(kernel_size, 1),
                    )
                ),
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=1,
                        padding=get_padding(kernel_size, 1),
                    )
                ),
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=1,
                        padding=get_padding(kernel_size, 1),
                    )
                ),
            ]
        )
        self.convs2.apply(init_weights)

    def forward(self, x):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            xt = c2(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)


class ResBlock2(torch.nn.Module):
    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()
        self.h = h
        self.convs = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[0],
                        padding=get_padding(kernel_size, dilation[0]),
                    )
                ),
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[1],
                        padding=get_padding(kernel_size, dilation[1]),
                    )
                ),
            ]
        )
        self.convs.apply(init_weights)

    def forward(self, x):
        for c in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs:
            remove_weight_norm(l)


class Generator(torch.nn.Module):
    def __init__(self, h):
        super(Generator, self).__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        self.conv_pre = weight_norm(
            Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3)
        )
        resblock = ResBlock1 if h.resblock == "1" else ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        h.upsample_initial_channel // (2**i),
                        h.upsample_initial_channel // (2 ** (i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = h.upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(
                zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)
            ):
                self.resblocks.append(resblock(h, ch, k, d))

        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)

    def forward(self, x):
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[i](x)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        print("Removing weight norm...")
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)


class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        self.convs = nn.ModuleList(
            [
                norm_f(
                    Conv2d(
                        1,
                        32,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(5, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        32,
                        128,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(5, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        128,
                        512,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(5, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        512,
                        1024,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(5, 1), 0),
                    )
                ),
                norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
            ]
        )
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiPeriodDiscriminator(torch.nn.Module):
    def __init__(self):
        super(MultiPeriodDiscriminator, self).__init__()
        self.discriminators = nn.ModuleList(
            [
                DiscriminatorP(2),
                DiscriminatorP(3),
                DiscriminatorP(5),
                DiscriminatorP(7),
                DiscriminatorP(11),
            ]
        )

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(1, 128, 15, 1, padding=7)),
                norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
                norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
                norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
                norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
                norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        fmap = []
        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)

[the diff rendering is truncated here; lines 344-414 of torch_model.py are not shown]
|
344 |
+
fmap.append(x)
|
345 |
+
x = torch.flatten(x, 1, -1)
|
346 |
+
|
347 |
+
return x, fmap
|
348 |
+
|
349 |
+
|
350 |
+
class MultiScaleDiscriminator(torch.nn.Module):
|
351 |
+
def __init__(self):
|
352 |
+
super(MultiScaleDiscriminator, self).__init__()
|
353 |
+
self.discriminators = nn.ModuleList(
|
354 |
+
[
|
355 |
+
DiscriminatorS(use_spectral_norm=True),
|
356 |
+
DiscriminatorS(),
|
357 |
+
DiscriminatorS(),
|
358 |
+
]
|
359 |
+
)
|
360 |
+
self.meanpools = nn.ModuleList(
|
361 |
+
[AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)]
|
362 |
+
)
|
363 |
+
|
364 |
+
def forward(self, y, y_hat):
|
365 |
+
y_d_rs = []
|
366 |
+
y_d_gs = []
|
367 |
+
fmap_rs = []
|
368 |
+
fmap_gs = []
|
369 |
+
for i, d in enumerate(self.discriminators):
|
370 |
+
if i != 0:
|
371 |
+
y = self.meanpools[i - 1](y)
|
372 |
+
y_hat = self.meanpools[i - 1](y_hat)
|
373 |
+
y_d_r, fmap_r = d(y)
|
374 |
+
y_d_g, fmap_g = d(y_hat)
|
375 |
+
y_d_rs.append(y_d_r)
|
376 |
+
fmap_rs.append(fmap_r)
|
377 |
+
y_d_gs.append(y_d_g)
|
378 |
+
fmap_gs.append(fmap_g)
|
379 |
+
|
380 |
+
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
381 |
+
|
382 |
+
|
383 |
+
def feature_loss(fmap_r, fmap_g):
|
384 |
+
loss = 0
|
385 |
+
for dr, dg in zip(fmap_r, fmap_g):
|
386 |
+
for rl, gl in zip(dr, dg):
|
387 |
+
loss += torch.mean(torch.abs(rl - gl))
|
388 |
+
|
389 |
+
return loss * 2
|
390 |
+
|
391 |
+
|
392 |
+
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
|
393 |
+
loss = 0
|
394 |
+
r_losses = []
|
395 |
+
g_losses = []
|
396 |
+
for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
|
397 |
+
r_loss = torch.mean((1 - dr) ** 2)
|
398 |
+
g_loss = torch.mean(dg**2)
|
399 |
+
loss += r_loss + g_loss
|
400 |
+
r_losses.append(r_loss.item())
|
401 |
+
g_losses.append(g_loss.item())
|
402 |
+
|
403 |
+
return loss, r_losses, g_losses
|
404 |
+
|
405 |
+
|
406 |
+
def generator_loss(disc_outputs):
|
407 |
+
loss = 0
|
408 |
+
gen_losses = []
|
409 |
+
for dg in disc_outputs:
|
410 |
+
l = torch.mean((1 - dg) ** 2)
|
411 |
+
gen_losses.append(l)
|
412 |
+
loss += l
|
413 |
+
|
414 |
+
return loss, gen_losses
|
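A minimal sketch of how the modules above combine in a HiFi-GAN loss computation; the waveform tensors and their shapes are illustrative assumptions, not part of the upload:

import torch

mpd = MultiPeriodDiscriminator()

y = torch.randn(2, 1, 8192)      # real waveform batch (batch, 1, samples)
y_hat = torch.randn(2, 1, 8192)  # generator output of the same length

# discriminator side: push real scores toward 1, fake scores toward 0
y_d_rs, y_d_gs, _, _ = mpd(y, y_hat.detach())
loss_disc, _, _ = discriminator_loss(y_d_rs, y_d_gs)

# generator side: adversarial term plus feature matching on intermediate maps
y_d_rs, y_d_gs, fmap_rs, fmap_gs = mpd(y, y_hat)
loss_gen, _ = generator_loss(y_d_gs)
loss_fm = feature_loss(fmap_rs, fmap_gs)

The same pattern applies to MultiScaleDiscriminator; the full objective also adds an L1 mel-spectrogram loss.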
vietTTS/hifigan/trainer.py
ADDED
File without changes

vietTTS/nat/__init__.py
ADDED
File without changes

vietTTS/nat/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (158 Bytes)

vietTTS/nat/__pycache__/config.cpython-39.pyc
ADDED
Binary file (2.43 kB)

vietTTS/nat/__pycache__/data_loader.cpython-39.pyc
ADDED
Binary file (4.25 kB)

vietTTS/nat/__pycache__/model.cpython-39.pyc
ADDED
Binary file (6.93 kB)

vietTTS/nat/__pycache__/text2mel.cpython-39.pyc
ADDED
Binary file (4.01 kB)
vietTTS/nat/acoustic_tpu_trainer.py
ADDED
@@ -0,0 +1,189 @@
import os
import pickle
from functools import partial
from typing import Deque

import fire
import jax
import jax.numpy as jnp
import jax.tools.colab_tpu
import matplotlib.pyplot as plt
import optax
from tqdm.auto import tqdm

from .acoustic_trainer import initial_state, loss_vag, val_loss_fn
from .config import FLAGS
from .data_loader import load_textgrid_wav
from .dsp import MelFilter
from .utils import print_flags


def setup_colab_tpu():
    jax.tools.colab_tpu.setup_tpu()


def train(
    batch_size: int = 32,
    steps_per_update: int = 10,
    learning_rate: float = 1024e-6,
):
    """Train acoustic model on multiple cores (TPU)."""
    lr_schedule = optax.exponential_decay(learning_rate, 50_000, 0.5, staircase=True)

    optimizer = optax.chain(
        optax.clip_by_global_norm(1.0),
        optax.adamw(lr_schedule, weight_decay=FLAGS.weight_decay),
    )

    def update_step(prev_state, inputs):
        params, aux, rng, optim_state = prev_state
        rng, new_rng = jax.random.split(rng)
        (loss, new_aux), grads = loss_vag(params, aux, rng, inputs)
        grads = jax.lax.pmean(grads, axis_name="i")
        updates, new_optim_state = optimizer.update(grads, optim_state, params)
        new_params = optax.apply_updates(params, updates)
        next_state = (new_params, new_aux, new_rng, new_optim_state)
        return next_state, loss

    @partial(jax.pmap, axis_name="i")
    def update(params, aux, rng, optim_state, inputs):
        states, losses = jax.lax.scan(
            update_step, (params, aux, rng, optim_state), inputs
        )
        return states, jnp.mean(losses)

    print(jax.devices())
    num_devices = jax.device_count()
    train_data_iter = load_textgrid_wav(
        FLAGS.data_dir,
        FLAGS.max_phoneme_seq_len,
        batch_size * num_devices * steps_per_update,
        FLAGS.max_wave_len,
        "train",
    )
    val_data_iter = load_textgrid_wav(
        FLAGS.data_dir,
        FLAGS.max_phoneme_seq_len,
        batch_size,
        FLAGS.max_wave_len,
        "val",
    )
    melfilter = MelFilter(
        FLAGS.sample_rate,
        FLAGS.n_fft,
        FLAGS.mel_dim,
        FLAGS.fmin,
        FLAGS.fmax,
    )
    batch = next(train_data_iter)
    batch = jax.tree_map(lambda x: x[:1], batch)
    batch = batch._replace(mels=melfilter(batch.wavs.astype(jnp.float32) / (2**15)))
    params, aux, rng, optim_state = initial_state(optimizer, batch)
    losses = Deque(maxlen=1000)
    val_losses = Deque(maxlen=100)

    last_step = -steps_per_update

    # loading latest checkpoint
    ckpt_fn = FLAGS.ckpt_dir / "acoustic_latest_ckpt.pickle"
    if ckpt_fn.exists():
        print("Resuming from latest checkpoint at", ckpt_fn)
        with open(ckpt_fn, "rb") as f:
            dic = pickle.load(f)
        last_step, params, aux, rng, optim_state = (
            dic["step"],
            dic["params"],
            dic["aux"],
            dic["rng"],
            dic["optim_state"],
        )

    tr = tqdm(
        range(
            last_step + steps_per_update, FLAGS.num_training_steps + 1, steps_per_update
        ),
        desc="training",
        total=FLAGS.num_training_steps // steps_per_update + 1,
        initial=last_step // steps_per_update + 1,
    )

    params, aux, rng, optim_state = jax.device_put_replicated(
        (params, aux, rng, optim_state), jax.devices()
    )

    def batch_reshape(batch):
        return jax.tree_map(
            lambda x: jnp.reshape(x, (num_devices, steps_per_update, -1) + x.shape[1:]),
            batch,
        )

    for step in tr:
        batch = next(train_data_iter)
        batch = batch_reshape(batch)
        (params, aux, rng, optim_state), loss = update(
            params, aux, rng, optim_state, batch
        )
        losses.append(loss)

        if step % 10 == 0:
            val_batch = next(val_data_iter)
            val_loss, val_aux, predicted_mel, gt_mel = val_loss_fn(
                *jax.tree_map(lambda x: x[0], (params, aux, rng)), val_batch
            )
            val_losses.append(val_loss)
            attn = jax.device_get(val_aux["acoustic_model"]["attn"])
            predicted_mel = jax.device_get(predicted_mel[0])
            gt_mel = jax.device_get(gt_mel[0])

        if step % 1000 == 0:
            loss = jnp.mean(sum(losses)).item() / len(losses)
            val_loss = sum(val_losses).item() / len(val_losses)
            tr.write(f"step {step} train loss {loss:.3f} val loss {val_loss:.3f}")

            # saving predicted mels
            plt.figure(figsize=(10, 10))
            plt.subplot(3, 1, 1)
            plt.imshow(predicted_mel.T, origin="lower", aspect="auto")
            plt.subplot(3, 1, 2)
            plt.imshow(gt_mel.T, origin="lower", aspect="auto")
            plt.subplot(3, 1, 3)
            plt.imshow(attn.T, origin="lower", aspect="auto")
            plt.tight_layout()
            plt.savefig(FLAGS.ckpt_dir / f"mel_{step:06d}.png")
            plt.close()

            # saving checkpoint
            with open(ckpt_fn, "wb") as f:
                params_, aux_, rng_, optim_state_ = jax.tree_map(
                    lambda x: x[0], (params, aux, rng, optim_state)
                )
                pickle.dump(
                    {
                        "step": step,
                        "params": params_,
                        "aux": aux_,
                        "rng": rng_,
                        "optim_state": optim_state_,
                    },
                    f,
                )


if __name__ == "__main__":
    # we don't use these flags.
    del FLAGS.batch_size
    del FLAGS.learning_rate
    del FLAGS.duration_learning_rate
    del FLAGS.duration_lstm_dim
    del FLAGS.duration_embed_dropout_rate

    print_flags(FLAGS.__dict__)

    if "COLAB_TPU_ADDR" in os.environ:
        setup_colab_tpu()

    if not FLAGS.ckpt_dir.exists():
        print("Create checkpoint dir at", FLAGS.ckpt_dir)
        FLAGS.ckpt_dir.mkdir(parents=True, exist_ok=True)

    fire.Fire(train)
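A minimal shape sketch of the data layout `batch_reshape` hands to `update`: jax.pmap maps axis 0 over devices while jax.lax.scan consumes axis 1 one optimizer step at a time inside each device. The sizes below are assumed examples:

import numpy as np

num_devices, steps_per_update, batch_size = 8, 10, 32  # assumed example sizes
phonemes = np.zeros((batch_size * num_devices * steps_per_update, 256), np.int32)
reshaped = phonemes.reshape(num_devices, steps_per_update, -1, *phonemes.shape[1:])
assert reshaped.shape == (8, 10, 32, 256)  # (devices, scan steps, per-device batch, ...)

Folding steps_per_update steps into one pmap call amortizes host-device dispatch overhead, which matters on TPUs.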
vietTTS/nat/acoustic_trainer.py
ADDED
@@ -0,0 +1,181 @@
import pickle
from functools import partial
from typing import Deque

import haiku as hk
import jax
import jax.numpy as jnp
import matplotlib.pyplot as plt
import optax
from tqdm.auto import tqdm

from .config import FLAGS, AcousticInput
from .data_loader import load_textgrid_wav
from .dsp import MelFilter
from .model import AcousticModel
from .utils import print_flags


@hk.transform_with_state
def net(x):
    return AcousticModel(is_training=True)(x)


@hk.transform_with_state
def val_net(x):
    return AcousticModel(is_training=False)(x)


def loss_fn(params, aux, rng, inputs: AcousticInput, is_training=True):
    """Compute the masked L1+L2 reconstruction loss over valid mel frames."""
    melfilter = MelFilter(
        FLAGS.sample_rate, FLAGS.n_fft, FLAGS.mel_dim, FLAGS.fmin, FLAGS.fmax
    )
    wavs = inputs.wavs.astype(jnp.float32) / (2**15)
    mels = melfilter(wavs)
    B, L, D = mels.shape
    go_frame = jnp.zeros((B, 1, D), dtype=jnp.float32)
    inp_mels = jnp.concatenate((go_frame, mels[:, :-1, :]), axis=1)
    n_frames = inputs.durations * FLAGS.sample_rate / (FLAGS.n_fft // 4)
    inputs = inputs._replace(mels=inp_mels, durations=n_frames)
    model = net if is_training else val_net
    (mel1_hat, mel2_hat), new_aux = model.apply(params, aux, rng, inputs)
    loss1 = (jnp.square(mel1_hat - mels) + jnp.square(mel2_hat - mels)) / 2
    loss2 = (jnp.abs(mel1_hat - mels) + jnp.abs(mel2_hat - mels)) / 2
    loss = jnp.mean((loss1 + loss2) / 2, axis=-1)
    num_frames = (inputs.wav_lengths // (FLAGS.n_fft // 4))[:, None]
    mask = jnp.arange(0, L)[None, :] < num_frames
    loss = jnp.sum(loss * mask) / jnp.sum(mask)
    return (loss, new_aux) if is_training else (loss, new_aux, mel2_hat, mels)


train_loss_fn = partial(loss_fn, is_training=True)
val_loss_fn = jax.jit(partial(loss_fn, is_training=False))

loss_vag = jax.value_and_grad(train_loss_fn, has_aux=True)


def initial_state(optimizer, batch):
    rng = jax.random.PRNGKey(42)
    params, aux = hk.transform_with_state(lambda x: AcousticModel(True)(x)).init(
        rng, batch
    )
    optim_state = optimizer.init(params)
    return params, aux, rng, optim_state


def train():
    optimizer = optax.chain(
        optax.clip_by_global_norm(1.0),
        optax.adamw(FLAGS.learning_rate, weight_decay=FLAGS.weight_decay),
    )

    @jax.jit
    def update(params, aux, rng, optim_state, inputs):
        rng, new_rng = jax.random.split(rng)
        (loss, new_aux), grads = loss_vag(params, aux, rng, inputs)
        updates, new_optim_state = optimizer.update(grads, optim_state, params)
        # optax.apply_updates takes (params, updates), in that order
        new_params = optax.apply_updates(params, updates)
        return loss, (new_params, new_aux, new_rng, new_optim_state)

    train_data_iter = load_textgrid_wav(
        FLAGS.data_dir,
        FLAGS.max_phoneme_seq_len,
        FLAGS.batch_size,
        FLAGS.max_wave_len,
        "train",
    )
    val_data_iter = load_textgrid_wav(
        FLAGS.data_dir,
        FLAGS.max_phoneme_seq_len,
        FLAGS.batch_size,
        FLAGS.max_wave_len,
        "val",
    )
    melfilter = MelFilter(
        FLAGS.sample_rate, FLAGS.n_fft, FLAGS.mel_dim, FLAGS.fmin, FLAGS.fmax
    )
    batch = next(train_data_iter)
    batch = batch._replace(mels=melfilter(batch.wavs.astype(jnp.float32) / (2**15)))
    params, aux, rng, optim_state = initial_state(optimizer, batch)
    losses = Deque(maxlen=1000)
    val_losses = Deque(maxlen=100)

    last_step = -1

    # loading latest checkpoint
    ckpt_fn = FLAGS.ckpt_dir / "acoustic_latest_ckpt.pickle"
    if ckpt_fn.exists():
        print("Resuming from latest checkpoint at", ckpt_fn)
        with open(ckpt_fn, "rb") as f:
            dic = pickle.load(f)
        last_step, params, aux, rng, optim_state = (
            dic["step"],
            dic["params"],
            dic["aux"],
            dic["rng"],
            dic["optim_state"],
        )

    tr = tqdm(
        range(last_step + 1, FLAGS.num_training_steps + 1),
        desc="training",
        total=FLAGS.num_training_steps + 1,
        initial=last_step + 1,
    )
    for step in tr:
        batch = next(train_data_iter)
        loss, (params, aux, rng, optim_state) = update(
            params, aux, rng, optim_state, batch
        )
        losses.append(loss)

        if step % 10 == 0:
            val_batch = next(val_data_iter)
            val_loss, val_aux, predicted_mel, gt_mel = val_loss_fn(
                params, aux, rng, val_batch
            )
            val_losses.append(val_loss)
            attn = jax.device_get(val_aux["acoustic_model"]["attn"])
            predicted_mel = jax.device_get(predicted_mel[0])
            gt_mel = jax.device_get(gt_mel[0])

        if step % 1000 == 0:
            loss = sum(losses).item() / len(losses)
            val_loss = sum(val_losses).item() / len(val_losses)
            tr.write(f"step {step} train loss {loss:.3f} val loss {val_loss:.3f}")

            # saving predicted mels
            plt.figure(figsize=(10, 10))
            plt.subplot(3, 1, 1)
            plt.imshow(predicted_mel.T, origin="lower", aspect="auto")
            plt.subplot(3, 1, 2)
            plt.imshow(gt_mel.T, origin="lower", aspect="auto")
            plt.subplot(3, 1, 3)
            plt.imshow(attn.T, origin="lower", aspect="auto")
            plt.tight_layout()
            plt.savefig(FLAGS.ckpt_dir / f"mel_{step:06d}.png")
            plt.close()

            # saving checkpoint
            with open(ckpt_fn, "wb") as f:
                pickle.dump(
                    {
                        "step": step,
                        "params": params,
                        "aux": aux,
                        "rng": rng,
                        "optim_state": optim_state,
                    },
                    f,
                )


if __name__ == "__main__":
    print_flags(FLAGS.__dict__)
    if not FLAGS.ckpt_dir.exists():
        print("Create checkpoint dir at", FLAGS.ckpt_dir)
        FLAGS.ckpt_dir.mkdir(parents=True, exist_ok=True)
    train()
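A small sketch of the teacher-forcing input built in `loss_fn` above: at frame t the decoder conditions on the ground-truth frame t-1, with an all-zero "go" frame at t=0. The shapes below are assumed examples:

import jax.numpy as jnp

mels = jnp.ones((2, 5, 80))              # (batch, frames, mel_dim)
go = jnp.zeros((2, 1, 80))               # all-zero "go" frame
inp = jnp.concatenate((go, mels[:, :-1, :]), axis=1)
assert inp.shape == mels.shape           # same length, shifted right by one frame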
vietTTS/nat/config.py
ADDED
@@ -0,0 +1,74 @@
from argparse import Namespace
from pathlib import Path
from typing import NamedTuple

from jax.numpy import ndarray


class FLAGS(Namespace):
    """Configurations"""

    duration_lstm_dim = 256
    vocab_size = 256
    duration_embed_dropout_rate = 0.5
    num_training_steps = 200_000
    postnet_dim = 512
    acoustic_decoder_dim = 512
    acoustic_encoder_dim = 256

    # dataset
    max_phoneme_seq_len = 256 * 1
    assert max_phoneme_seq_len % 256 == 0  # prevent compilation error on Colab T4 GPU
    max_wave_len = 1024 * 64 * 3

    # Montreal Forced Aligner
    special_phonemes = ["sil", "sp", "spn", " "]  # [sil], [sp], [spn], [word end]
    sil_index = special_phonemes.index("sil")
    sp_index = sil_index  # no use of "sp"
    word_end_index = special_phonemes.index(" ")
    _normal_phonemes = (
        []
        + ["a", "b", "c", "d", "e", "g", "h", "i", "k", "l"]
        + ["m", "n", "o", "p", "q", "r", "s", "t", "u", "v"]
        + ["x", "y", "à", "á", "â", "ã", "è", "é", "ê", "ì"]
        + ["í", "ò", "ó", "ô", "õ", "ù", "ú", "ý", "ă", "đ"]
        + ["ĩ", "ũ", "ơ", "ư", "ạ", "ả", "ấ", "ầ", "ẩ", "ẫ"]
        + ["ậ", "ắ", "ằ", "ẳ", "ẵ", "ặ", "ẹ", "ẻ", "ẽ", "ế"]
        + ["ề", "ể", "ễ", "ệ", "ỉ", "ị", "ọ", "ỏ", "ố", "ồ"]
        + ["ổ", "ỗ", "ộ", "ớ", "ờ", "ở", "ỡ", "ợ", "ụ", "ủ"]
        + ["ứ", "ừ", "ử", "ữ", "ự", "ỳ", "ỵ", "ỷ", "ỹ"]
    )

    # dsp
    mel_dim = 80
    n_fft = 1024
    sample_rate = 16000
    fmin = 0.0
    fmax = 8000

    # training
    batch_size = 64
    learning_rate = 1e-4
    duration_learning_rate = 1e-4
    max_grad_norm = 1.0
    weight_decay = 1e-4
    token_mask_prob = 0.1

    # ckpt
    ckpt_dir = Path("assets/infore/nat")
    data_dir = Path("train_data")


class DurationInput(NamedTuple):
    phonemes: ndarray
    lengths: ndarray
    durations: ndarray


class AcousticInput(NamedTuple):
    phonemes: ndarray
    lengths: ndarray
    durations: ndarray
    wavs: ndarray
    wav_lengths: ndarray
    mels: ndarray
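A quick sketch of the phoneme indexing implied by this config: the special phonemes occupy the first indices and the Vietnamese characters follow, which is exactly what `load_phonemes_set` in `data_loader.py` relies on:

phonemes = FLAGS.special_phonemes + FLAGS._normal_phonemes
assert phonemes.index("sil") == FLAGS.sil_index == 0
assert phonemes.index(" ") == FLAGS.word_end_index == 3
assert phonemes.index("a") == len(FLAGS.special_phonemes)  # first normal phoneme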
vietTTS/nat/data_loader.py
ADDED
@@ -0,0 +1,156 @@
import random
from pathlib import Path

import numpy as np
import textgrid
from scipy.io import wavfile

from .config import FLAGS, AcousticInput, DurationInput


def load_phonemes_set():
    S = FLAGS.special_phonemes + FLAGS._normal_phonemes
    return S


def pad_seq(s, maxlen, value=0):
    assert maxlen >= len(s)
    return tuple(s) + (value,) * (maxlen - len(s))


def is_in_word(phone, word):
    def time_in_word(time, word):
        return (word.minTime - 1e-3) < time and (word.maxTime + 1e-3) > time

    return time_in_word(phone.minTime, word) and time_in_word(phone.maxTime, word)


def load_textgrid(fn: Path):
    """load textgrid file"""
    tg = textgrid.TextGrid.fromFile(str(fn.resolve()))
    data = []
    words = list(tg[0])
    widx = 0
    assert tg[1][0].minTime == 0, "The first phoneme has to start at time 0"
    for p in tg[1]:
        if p not in words[widx]:
            widx = widx + 1
            if len(words[widx - 1].mark) > 0:
                data.append((FLAGS.special_phonemes[FLAGS.word_end_index], 0.0))
            if widx >= len(words):
                break
        assert p in words[widx], "mismatched word vs phoneme"
        mark = p.mark.strip().lower()
        if len(mark) == 0:
            mark = "sil"
        data.append((mark, p.duration()))
    return data


def textgrid_data_loader(data_dir: Path, seq_len: int, batch_size: int, mode: str):
    """load all textgrid files in the directory"""
    tg_files = sorted(data_dir.glob("*.TextGrid"))
    random.Random(42).shuffle(tg_files)
    L = len(tg_files) * 95 // 100
    assert mode in ["train", "val"]
    phonemes = load_phonemes_set()
    if mode == "train":
        tg_files = tg_files[:L]
    if mode == "val":
        tg_files = tg_files[L:]

    data = []
    for fn in tg_files:
        ps, ds = zip(*load_textgrid(fn))
        ps = [phonemes.index(p) for p in ps]
        l = len(ps)
        ps = pad_seq(ps, seq_len, 0)
        ds = pad_seq(ds, seq_len, 0)
        data.append((ps, ds, l))

    batch = []
    while True:
        random.shuffle(data)
        for e in data:
            batch.append(e)
            if len(batch) == batch_size:
                ps, ds, lengths = zip(*batch)
                ps = np.array(ps, dtype=np.int32)
                ds = np.array(ds, dtype=np.float32)
                lengths = np.array(lengths, dtype=np.int32)
                yield DurationInput(ps, lengths, ds)
                batch = []


def load_textgrid_wav(
    data_dir: Path, token_seq_len: int, batch_size, pad_wav_len, mode: str
):
    """load wav and textgrid files to memory."""
    tg_files = sorted(data_dir.glob("*.TextGrid"))
    random.Random(42).shuffle(tg_files)
    L = len(tg_files) * 95 // 100
    assert mode in ["train", "val", "gta"]
    phonemes = load_phonemes_set()
    if mode == "gta":
        tg_files = tg_files  # all files
    elif mode == "train":
        tg_files = tg_files[:L]
    elif mode == "val":
        tg_files = tg_files[L:]

    data = []
    for fn in tg_files:
        ps, ds = zip(*load_textgrid(fn))
        ps = [phonemes.index(p) for p in ps]
        l = len(ps)
        ps = pad_seq(ps, token_seq_len, 0)
        ds = pad_seq(ds, token_seq_len, 0)

        wav_file = data_dir / f"{fn.stem}.wav"
        sr, y = wavfile.read(wav_file)
        y = np.copy(y)
        start_time = 0
        for i, (phone_idx, duration) in enumerate(zip(ps, ds)):
            # use distinct names for the sample indices so that ``l``,
            # the phoneme sequence length computed above, is not clobbered
            left = int(start_time * sr)
            end_time = start_time + duration
            right = int(end_time * sr)
            if i == len(ps) - 1:
                right = len(y)
            if phone_idx < len(FLAGS.special_phonemes):
                y[left:right] = 0  # silence non-speech segments
            start_time = end_time

        if len(y) > pad_wav_len:
            y = y[:pad_wav_len]

        # # normalize to match hifigan preprocessing
        # y = y.astype(np.float32)
        # y = y / np.max(np.abs(y))
        # y = y * 0.95
        # y = y * (2 ** 15)
        # y = y.astype(np.int16)

        wav_length = len(y)
        y = np.pad(y, (0, pad_wav_len - len(y)))
        data.append((fn.stem, ps, ds, l, y, wav_length))

    batch = []
    while True:
        random.shuffle(data)
        for idx, e in enumerate(data):
            batch.append(e)
            if len(batch) == batch_size or (mode == "gta" and idx == len(data) - 1):
                names, ps, ds, lengths, wavs, wav_lengths = zip(*batch)
                ps = np.array(ps, dtype=np.int32)
                ds = np.array(ds, dtype=np.float32)
                lengths = np.array(lengths, dtype=np.int32)
                wavs = np.array(wavs, dtype=np.int16)
                wav_lengths = np.array(wav_lengths, dtype=np.int32)
                if mode == "gta":
                    yield names, AcousticInput(ps, lengths, ds, wavs, wav_lengths, None)
                else:
                    yield AcousticInput(ps, lengths, ds, wavs, wav_lengths, None)
                batch = []
        if mode == "gta":
            assert len(batch) == 0
            break
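A minimal usage sketch, assuming TextGrid/wav pairs exist in FLAGS.data_dir; the generator yields fixed-shape, zero-padded batches indefinitely (except in "gta" mode, where it stops after one pass):

it = load_textgrid_wav(FLAGS.data_dir, 256, 4, 1024 * 64 * 3, "train")
batch = next(it)
print(batch.phonemes.shape)   # (4, 256)    int32 phoneme ids, zero padded
print(batch.wavs.shape)       # (4, 196608) int16 samples, zero padded
print(batch.mels)             # None; mels are computed later from the wavs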
vietTTS/nat/dsp.py
ADDED
@@ -0,0 +1,128 @@
from functools import partial
from typing import Optional

import jax
import jax.numpy as jnp
import librosa
from einops import rearrange
from jax.numpy import ndarray


def rolling_window(a: ndarray, window: int, hop_length: int):
    """return a stack of overlap subsequence of an array.
    ``return jnp.stack( [a[0:10], a[5:15], a[10:20],...], axis=0)``
    Source: https://github.com/google/jax/issues/3171
    Args:
      a (ndarray): input array of shape `[L, ...]`
      window (int): length of each subarray (window).
      hop_length (int): distance between neighbouring windows.
    """

    idx = (
        jnp.arange(window)[:, None]
        + jnp.arange((len(a) - window) // hop_length + 1)[None, :] * hop_length
    )
    return a[idx]


@partial(jax.jit, static_argnums=[1, 2, 3, 4, 5, 6])
def stft(
    y: ndarray,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: str = "hann",
    center: bool = True,
    pad_mode: str = "reflect",
):
    """A jax reimplementation of ``librosa.stft`` function."""

    if win_length is None:
        win_length = n_fft

    if hop_length is None:
        hop_length = win_length // 4

    if window == "hann":
        fft_window = jnp.hanning(win_length + 1)[:-1]
    else:
        raise RuntimeError(f"{window} window function is not supported!")

    pad_len = (n_fft - win_length) // 2
    fft_window = jnp.pad(fft_window, (pad_len, pad_len), mode="constant")
    fft_window = fft_window[:, None]
    if center:
        y = jnp.pad(y, int(n_fft // 2), mode=pad_mode)

    # jax does not support ``np.lib.stride_tricks.as_strided`` function
    # see https://github.com/google/jax/issues/3171 for comments.
    y_frames = rolling_window(y, n_fft, hop_length) * fft_window
    stft_matrix = jnp.fft.fft(y_frames, axis=0)
    d = int(1 + n_fft // 2)
    return stft_matrix[:d]


@partial(jax.jit, static_argnums=[1, 2, 3, 4, 5, 6])
def batched_stft(
    y: ndarray,
    n_fft: int,
    hop_length: int,
    win_length: int,
    window: str,
    center: bool = True,
    pad_mode: str = "reflect",
):
    """Batched version of ``stft`` function.
    TN => FTN
    """

    assert len(y.shape) >= 2
    if window == "hann":
        fft_window = jnp.hanning(win_length + 1)[:-1]
    else:
        raise RuntimeError(f"{window} window function is not supported!")
    pad_len = (n_fft - win_length) // 2
    if pad_len > 0:
        fft_window = jnp.pad(fft_window, (pad_len, pad_len), mode="constant")
        win_length = n_fft
    if center:
        pad_width = ((n_fft // 2, n_fft // 2),) + ((0, 0),) * (len(y.shape) - 1)
        y = jnp.pad(y, pad_width, mode=pad_mode)

    # jax does not support ``np.lib.stride_tricks.as_strided`` function
    # see https://github.com/google/jax/issues/3171 for comments.
    y_frames = rolling_window(y, n_fft, hop_length)
    fft_window = jnp.reshape(fft_window, (-1,) + (1,) * (len(y.shape)))
    y_frames = y_frames * fft_window
    stft_matrix = jnp.fft.fft(y_frames, axis=0)
    d = int(1 + n_fft // 2)
    return stft_matrix[:d]


class MelFilter:
    """Convert waveform to mel spectrogram."""

    def __init__(self, sample_rate: int, n_fft: int, n_mels: int, fmin=0.0, fmax=8000):
        self.melfb = jax.device_put(
            librosa.filters.mel(
                sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax
            )
        )
        self.n_fft = n_fft

    def __call__(self, y: ndarray) -> ndarray:
        hop_length = self.n_fft // 4
        window_length = self.n_fft
        assert len(y.shape) == 2
        y = rearrange(y, "n s -> s n")
        p = (self.n_fft - hop_length) // 2
        y = jnp.pad(y, ((p, p), (0, 0)), mode="reflect")
        spec = batched_stft(
            y, self.n_fft, hop_length, window_length, "hann", False, "reflect"
        )
        mag = jnp.sqrt(jnp.square(spec.real) + jnp.square(spec.imag) + 1e-9)
        mel = jnp.einsum("ms,sfn->nfm", self.melfb, mag)
        cond = jnp.log(jnp.clip(mel, a_min=1e-5, a_max=None))
        return cond
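A minimal sketch of `MelFilter` on a dummy batch; with n_fft=1024 the hop length is n_fft // 4 = 256 samples, so a 16000-sample (one second) waveform maps to roughly 62 log-mel frames:

import jax.numpy as jnp

melfilter = MelFilter(sample_rate=16000, n_fft=1024, n_mels=80)
wav = jnp.zeros((2, 16000), dtype=jnp.float32)  # (batch, samples), scaled to [-1, 1]
mel = melfilter(wav)
print(mel.shape)  # (2, 62, 80): (batch, frames, n_mels) log-mel features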
vietTTS/nat/duration_trainer.py
ADDED
@@ -0,0 +1,142 @@
from functools import partial
from typing import Deque

import haiku as hk
import jax
import jax.numpy as jnp
import matplotlib.pyplot as plt
import numpy as np
import optax
from tqdm.auto import tqdm

from .config import FLAGS, DurationInput
from .data_loader import textgrid_data_loader
from .model import DurationModel
from .utils import load_latest_ckpt, print_flags, save_ckpt


def loss_fn(params, aux, rng, x: DurationInput, is_training=True):
    """return the l1 loss"""

    @hk.transform_with_state
    def net(x):
        return DurationModel(is_training=is_training)(x)

    if is_training:
        # randomly mask tokens with [WORD END] token
        # during training to avoid overfitting
        m_rng, rng = jax.random.split(rng, 2)
        m = jax.random.bernoulli(m_rng, FLAGS.token_mask_prob, x.phonemes.shape)
        x = x._replace(phonemes=jnp.where(m, FLAGS.word_end_index, x.phonemes))
    durations, aux = net.apply(params, aux, rng, x)
    mask = jnp.arange(0, x.phonemes.shape[1])[None, :] < x.lengths[:, None]
    # do NOT predict the [WORD END] token
    mask = jnp.where(x.phonemes == FLAGS.word_end_index, False, mask)
    masked_loss = jnp.abs(durations - x.durations) * mask
    loss = jnp.sum(masked_loss) / jnp.sum(mask)
    return loss, aux


forward_fn = jax.jit(
    hk.transform_with_state(lambda x: DurationModel(is_training=False)(x)).apply
)


def predict_duration(params, aux, rng, x: DurationInput):
    d, _ = forward_fn(params, aux, rng, x)
    return d, x.durations


val_loss_fn = jax.jit(partial(loss_fn, is_training=False))

loss_vag = jax.value_and_grad(loss_fn, has_aux=True)

optimizer = optax.chain(
    optax.clip_by_global_norm(FLAGS.max_grad_norm),
    optax.adamw(FLAGS.duration_learning_rate, weight_decay=FLAGS.weight_decay),
)


@jax.jit
def update(params, aux, rng, optim_state, inputs: DurationInput):
    rng, new_rng = jax.random.split(rng)
    (loss, new_aux), grads = loss_vag(params, aux, rng, inputs)
    updates, new_optim_state = optimizer.update(grads, optim_state, params)
    new_params = optax.apply_updates(params, updates)
    return loss, (new_params, new_aux, new_rng, new_optim_state)


def initial_state(batch):
    rng = jax.random.PRNGKey(42)
    params, aux = hk.transform_with_state(lambda x: DurationModel(True)(x)).init(
        rng, batch
    )
    optim_state = optimizer.init(params)
    return params, aux, rng, optim_state


def plot_val_duration(step: int, batch, params, aux, rng):
    fn = FLAGS.ckpt_dir / f"duration_{step:06d}.png"
    predicted_dur, gt_dur = predict_duration(params, aux, rng, batch)
    L = batch.lengths[0]
    x = np.arange(0, L) * 3
    plt.plot(predicted_dur[0, :L])
    plt.plot(gt_dur[0, :L])
    plt.legend(["predicted", "gt"])
    plt.title("Phoneme durations")
    plt.savefig(fn)
    plt.close()


def train():
    train_data_iter = textgrid_data_loader(
        FLAGS.data_dir, FLAGS.max_phoneme_seq_len, FLAGS.batch_size, mode="train"
    )
    val_data_iter = textgrid_data_loader(
        FLAGS.data_dir, FLAGS.max_phoneme_seq_len, FLAGS.batch_size, mode="val"
    )
    losses = Deque(maxlen=1000)
    val_losses = Deque(maxlen=100)
    latest_ckpt = load_latest_ckpt(FLAGS.ckpt_dir)
    if latest_ckpt is not None:
        last_step, params, aux, rng, optim_state = latest_ckpt
    else:
        last_step = -1
        print("Generate random initial states...")
        params, aux, rng, optim_state = initial_state(next(train_data_iter))

    tr = tqdm(
        range(last_step + 1, 1 + FLAGS.num_training_steps),
        total=1 + FLAGS.num_training_steps,
        initial=last_step + 1,
        ncols=80,
        desc="training",
    )
    for step in tr:
        batch = next(train_data_iter)
        loss, (params, aux, rng, optim_state) = update(
            params, aux, rng, optim_state, batch
        )
        losses.append(loss)

        if step % 10 == 0:
            val_loss, _ = val_loss_fn(params, aux, rng, next(val_data_iter))
            val_losses.append(val_loss)

        if step % 1000 == 0:
            loss = sum(losses).item() / len(losses)
            val_loss = sum(val_losses).item() / len(val_losses)
            plot_val_duration(step, next(val_data_iter), params, aux, rng)
            tr.write(
                f" {step:>6d}/{FLAGS.num_training_steps:>6d} | train loss {loss:.5f} | val loss {val_loss:.5f}"
            )
            save_ckpt(step, params, aux, rng, optim_state, ckpt_dir=FLAGS.ckpt_dir)


if __name__ == "__main__":
    print_flags(FLAGS.__dict__)
    if not FLAGS.ckpt_dir.exists():
        print("Create checkpoint dir at", FLAGS.ckpt_dir)
        FLAGS.ckpt_dir.mkdir(parents=True, exist_ok=True)
    train()
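A small sketch of the token-masking regularizer used in `loss_fn` above: during training each phoneme is independently replaced by the [word end] token with probability FLAGS.token_mask_prob (the token ids below are assumed examples):

import jax
import jax.numpy as jnp

rng = jax.random.PRNGKey(0)
phonemes = jnp.arange(10)[None, :]                                    # (1, 10) ids
m = jax.random.bernoulli(rng, FLAGS.token_mask_prob, phonemes.shape)  # True => mask
masked = jnp.where(m, FLAGS.word_end_index, phonemes)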
vietTTS/nat/gta.py
ADDED
@@ -0,0 +1,82 @@
import pickle
from argparse import ArgumentParser
from pathlib import Path

import haiku as hk
import jax
import jax.numpy as jnp
import numpy as np
from tqdm.auto import tqdm

from .config import FLAGS, AcousticInput
from .data_loader import load_textgrid_wav
from .dsp import MelFilter
from .model import AcousticModel


@hk.transform_with_state
def net(x):
    return AcousticModel(is_training=True)(x)


@hk.transform_with_state
def val_net(x):
    return AcousticModel(is_training=False)(x)


def forward_fn_(params, aux, rng, inputs: AcousticInput):
    melfilter = MelFilter(
        FLAGS.sample_rate, FLAGS.n_fft, FLAGS.mel_dim, FLAGS.fmin, FLAGS.fmax
    )
    mels = melfilter(inputs.wavs.astype(jnp.float32) / (2**15))
    B, L, D = mels.shape
    inp_mels = jnp.concatenate(
        (jnp.zeros((B, 1, D), dtype=jnp.float32), mels[:, :-1, :]), axis=1
    )
    n_frames = inputs.durations * FLAGS.sample_rate / (FLAGS.n_fft // 4)
    inputs = inputs._replace(mels=inp_mels, durations=n_frames)
    (mel1_hat, mel2_hat), new_aux = val_net.apply(params, aux, rng, inputs)
    return mel2_hat


forward_fn = jax.jit(forward_fn_)


def generate_gta(out_dir: Path):
    out_dir.mkdir(parents=True, exist_ok=True)
    data_iter = load_textgrid_wav(
        FLAGS.data_dir,
        FLAGS.max_phoneme_seq_len,
        FLAGS.batch_size,
        FLAGS.max_wave_len,
        "gta",
    )
    ckpt_fn = FLAGS.ckpt_dir / "acoustic_latest_ckpt.pickle"
    print("Resuming from latest checkpoint at", ckpt_fn)
    with open(ckpt_fn, "rb") as f:
        dic = pickle.load(f)
    _, params, aux, rng, _ = (
        dic["step"],
        dic["params"],
        dic["aux"],
        dic["rng"],
        dic["optim_state"],
    )

    tr = tqdm(data_iter)
    for names, batch in tr:
        lengths = batch.wav_lengths
        predicted_mel = forward_fn(params, aux, rng, batch)
        mel = jax.device_get(predicted_mel)
        for idx, fn in enumerate(names):
            file = out_dir / f"{fn}.npy"
            tr.write(f"saving to file {file}")
            l = lengths[idx] // (FLAGS.n_fft // 4)
            np.save(file, mel[idx, :l].T)


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("-o", "--output-dir", type=Path, default="gta")
    generate_gta(parser.parse_args().output_dir)
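A likely invocation, assuming the package is installed and `acoustic_latest_ckpt.pickle` exists under FLAGS.ckpt_dir; it writes one ground-truth-aligned mel per utterance ({name}.npy), which is typically used to fine-tune the HiFi-GAN vocoder on the acoustic model's own outputs:

python -m vietTTS.nat.gta --output-dir gta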